Example #1
    def load_resources(self):
        super().load_resources()
        logger.info('Load the corpus')
        with open(preprocess(res.train_path, force=self.repreprocess), 'rb') as p_file:
            self.train = pickle.load(p_file)
        with open(preprocess(res.test_path, force=self.repreprocess), 'rb') as p_file:
            self.test = pickle.load(p_file)
        self.train.truncate(self.train_truncate)
        self.test.truncate(self.test_truncate)
        self.train.filter_label(self.train_only_labels)
        self.test.filter_label(self.test_only_labels)
        if self.only_uid is not None:
            self.test.filter_uid(self.only_uid)

        logger.info('Load the lexicons')
        self.bing_liu_lexicon = read_bing_liu(res.bing_liu_lexicon_path)
        self.nrc_emotion_lexicon = read_nrc_emotion(res.nrc_emotion_lexicon_path)
        self.nrc_hashtag_unigram_lexicon = read_nrc_hashtag_unigram(res.nrc_hashtag_unigram_lexicon_path)
        self.nrc_hashtag_bigram_lexicon = read_nrc_hashtag_bigram(res.nrc_hashtag_bigram_lexicon_path)
        self.nrc_hashtag_pair_lexicon = read_nrc_hashtag_pair(res.nrc_hashtag_pair_lexicon_path)
        self.nrc_sentiment140_unigram_lexicon = read_nrc_hashtag_unigram(res.nrc_sentiment140_unigram_lexicon_path)
        self.nrc_sentiment140_bigram_lexicon = read_nrc_hashtag_bigram(res.nrc_sentiment140_bigram_lexicon_path)
        self.nrc_sentiment140_pair_lexicon = read_nrc_hashtag_pair(res.nrc_sentiment140_pair_lexicon_path)
        self.nrc_hashtag_sentimenthashtags_lexicon = read_nrc_hashtag_sentimenthashtags(res.nrc_hashtag_sentimenthashtags_lexicon_path)

        self.mpqa_lexicon = read_mpqa(res.mpqa_lexicon_path)

        logger.info('Load carnegie clusters')
        self.carnegie_clusters = read_carnegie_clusters(res.carnegie_clusters_path)
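The read_* lexicon helpers are defined elsewhere in the project and are not shown in this example. As an illustration only, here is a minimal sketch of what a Bing Liu-style reader could look like, assuming the lexicon is distributed as the usual pair of plain-text word lists (one word per line, ';' comment lines); the project's actual read_bing_liu may work differently:

import os


def read_bing_liu_sketch(lexicon_dir):
    """Hypothetical reader: map each word to 'positive' or 'negative'.

    Assumes two plain-text files (one word per line), which is how the
    Bing Liu opinion lexicon is usually distributed.
    """
    lexicon = {}
    for polarity, fname in [('positive', 'positive-words.txt'),
                            ('negative', 'negative-words.txt')]:
        with open(os.path.join(lexicon_dir, fname), encoding='latin-1') as f:
            for line in f:
                word = line.strip()
                if word and not word.startswith(';'):  # ';' lines are comments
                    lexicon[word] = polarity
    return lexicon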
Example #2
    def load_resources(self):
        super().load_resources()
        logger.info('Load the corpus')
        with open(preprocess(res.train_path, force=self.repreprocess), 'rb') as p_file:
            self.train = pickle.load(p_file)
        with open(preprocess(res.test_path, force=self.repreprocess), 'rb') as p_file:
            self.test = pickle.load(p_file)
        self.train.truncate(self.train_truncate)
        self.test.truncate(self.test_truncate)
        self.train.filter_label(self.train_only_labels)
        self.test.filter_label(self.test_only_labels)
        if self.only_uid is not None:
            self.test.filter_uid(self.only_uid)
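The truncate, filter_label and filter_uid calls above are methods of the pickled reader.Dataset object, whose code is not part of this listing. Judging only from how they are used (keep the first N examples, keep only some labels, keep only some user ids), a hypothetical stand-in could look roughly like this; the attribute names other than data, target and labels are assumptions:

class DatasetSketch:
    """Hypothetical stand-in for reader.Dataset, for illustration only."""

    def __init__(self, data, target, labels, uids):
        self.data = data      # list of dicts such as {'tok': [...]}
        self.target = target  # label id per example, aligned with data
        self.labels = labels  # label names, position == label id
        self.uids = uids      # assumed: one user/document id per example

    def truncate(self, n):
        # Keep only the first n examples; do nothing when n is None.
        if n is not None:
            self._select(range(min(n, len(self.data))))

    def filter_label(self, only_labels):
        # Keep only examples whose label name appears in only_labels.
        if only_labels:
            self._select(i for i, t in enumerate(self.target)
                         if self.labels[t] in only_labels)

    def filter_uid(self, only_uid):
        # Keep only examples whose uid appears in only_uid.
        self._select(i for i, u in enumerate(self.uids) if u in only_uid)

    def _select(self, indexes):
        idx = list(indexes)
        self.data = [self.data[i] for i in idx]
        self.target = [self.target[i] for i in idx]
        self.uids = [self.uids[i] for i in idx]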
Example #3
#                 fpath = os.path.join(path, fname)
#                 if sys.version_info < (3,):
#                     f = open(fpath)
#                 else:
#                     f = open(fpath, encoding='latin-1')
#                 texts.append(f.read())
#                 f.close()
#                 labels.append(label_id)

import resources as res
from base import preprocess
from reader import Dataset  # We need this import because we're loading a Dataset with pickle
import pickle
import numpy as np
from keras.preprocessing.text import Tokenizer

with open(preprocess(res.train_path, force=False), 'rb') as p_file:
    train = pickle.load(p_file)
texts = [d['tok'] for d in train.data]
labels_index = dict([(name, nid) for nid, name in enumerate(train.labels)])
labels = np.array(train.target)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
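After fitting the tokenizer, the usual next steps (the same ones Example #5 performs) are to pad the integer sequences to a fixed length and to one-hot encode the targets. A short sketch of that continuation; MAX_SEQUENCE_LENGTH is an assumed constant here, analogous to MAX_NB_WORDS above:

from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

MAX_SEQUENCE_LENGTH = 1000  # assumed cap on sequence length

# Pad (or cut) every sequence to the same length so they stack into a 2D tensor.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# Turn the integer class ids into one-hot rows.
categorical_labels = to_categorical(labels)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', categorical_labels.shape)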
Example #4
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging
import logger_config

logger = logging.getLogger(__name__)

from base import preprocess
import resources as res

preprocess(res.train_path, force=True)
preprocess(res.test_path, force=True)
preprocess(res.dev_path, force=True)
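base.preprocess itself is not shown in any of these examples. From the way it is called (its return value is opened in 'rb' mode and unpickled into a Dataset, and force=True re-runs the work), it presumably preprocesses a corpus once and caches the result as a pickle, returning the path of that pickle. A rough sketch under those assumptions, with build_dataset standing in for whatever tokenization and tagging the real code does:

import os
import pickle


def preprocess_sketch(corpus_path, force=False):
    """Hypothetical version of base.preprocess: return the path of a
    pickled, preprocessed Dataset, rebuilding it only when needed."""
    pickled_path = corpus_path + '.pkl'
    if force or not os.path.exists(pickled_path):
        dataset = build_dataset(corpus_path)  # hypothetical helper: tokenize, tag, ...
        with open(pickled_path, 'wb') as p_file:
            pickle.dump(dataset, p_file)
    return pickled_path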
Example #5
    def load_resources(self):
        super().load_resources()
        logger.info('Load the corpus')
        with open(preprocess(res.train_path, force=self.repreprocess),
                  'rb') as p_file:
            self.train = pickle.load(p_file)
        with open(preprocess(res.test_path, force=self.repreprocess),
                  'rb') as p_file:
            self.test = pickle.load(p_file)

        with open(preprocess(res.dev_path, force=self.repreprocess),
                  'rb') as p_file:
            self.dev = pickle.load(p_file)

        self.train.truncate(self.train_truncate)
        self.test.truncate(self.test_truncate)
        self.train.filter_label(self.train_only_labels)
        self.test.filter_label(self.test_only_labels)
        self.dev.filter_label(self.dev_only_labels)
        if self.only_uid is not None:
            self.test.filter_uid(self.only_uid)

        self.texts = [d['tok'] for d in self.train.data]
        self.labels_index = dict([
            (name, nid) for nid, name in enumerate(self.train.labels)
        ])
        self.labels = to_categorical(self.train.target)
        logger.info('Found %s texts', len(self.texts))

        logger.info('Vectorize the text samples into a 2D integer tensor')
        self.tokenizer = Tokenizer(nb_words=self.max_nb_words)
        self.tokenizer.fit_on_texts(self.texts)
        self.sequences = self.tokenizer.texts_to_sequences(self.texts)
        self.train = None

        self.dev_texts = [d['tok'] for d in self.dev.data]
        self.dev_sequences = self.tokenizer.texts_to_sequences(self.dev_texts)
        self.dev_data = pad_sequences(self.dev_sequences,
                                      maxlen=self.max_sequence_length)
        self.dev_labels = to_categorical(self.dev.target)
        self.dev_texts = None

        self.test_texts = [d['tok'] for d in self.test.data]
        self.test_sequences = self.tokenizer.texts_to_sequences(
            self.test_texts)
        self.test_data = pad_sequences(self.test_sequences,
                                       maxlen=self.max_sequence_length)
        self.test_labels = to_categorical(self.test.target)
        self.test_texts = None

        self.word_index = self.tokenizer.word_index
        logger.info('Found %s unique tokens.', len(self.word_index))

        self.train_data = pad_sequences(self.sequences,
                                        maxlen=self.max_sequence_length)

        logger.info('Shape of data tensor: %s', self.train_data.shape)
        logger.info('Shape of label tensor: %s', self.labels.shape)
        logger.info('label index: %s', self.labels_index)

        if self.embedding is None:
            self.load_trainable_embedding()
        else:
            self.load_fixed_embedding()
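load_trainable_embedding and load_fixed_embedding are not included in this snippet. In the standard Keras pretrained-embedding recipe, which this class appears to follow, the fixed variant builds an embedding matrix from pretrained vectors and wraps it in a non-trainable Embedding layer. A sketch of that idea, written as a free-standing helper; the gensim-style embedding object and the helper's name are assumptions:

import numpy as np
from keras.layers import Embedding


def build_fixed_embedding_layer(word_index, embedding, max_sequence_length):
    """Hypothetical helper: a frozen Embedding layer initialized from
    pretrained vectors (embedding behaves like a gensim KeyedVectors)."""
    embedding_dim = embedding.vector_size
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        if word in embedding:
            embedding_matrix[i] = embedding[word]  # rows for unknown words stay zero
    return Embedding(len(word_index) + 1,
                     embedding_dim,
                     weights=[embedding_matrix],
                     input_length=max_sequence_length,
                     trainable=False)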