def load_resources(self):
    super().load_resources()
    logger.info('Load the corpus')
    with open(preprocess(res.train_path, force=self.repreprocess), 'rb') as p_file:
        self.train = pickle.load(p_file)
    with open(preprocess(res.test_path, force=self.repreprocess), 'rb') as p_file:
        self.test = pickle.load(p_file)
    self.train.truncate(self.train_truncate)
    self.test.truncate(self.test_truncate)
    self.train.filter_label(self.train_only_labels)
    self.test.filter_label(self.test_only_labels)
    if self.only_uid is not None:
        self.test.filter_uid(self.only_uid)

    logger.info('Load the lexicons')
    self.bing_liu_lexicon = read_bing_liu(res.bing_liu_lexicon_path)
    self.nrc_emotion_lexicon = read_nrc_emotion(res.nrc_emotion_lexicon_path)
    self.nrc_hashtag_unigram_lexicon = read_nrc_hashtag_unigram(res.nrc_hashtag_unigram_lexicon_path)
    self.nrc_hashtag_bigram_lexicon = read_nrc_hashtag_bigram(res.nrc_hashtag_bigram_lexicon_path)
    self.nrc_hashtag_pair_lexicon = read_nrc_hashtag_pair(res.nrc_hashtag_pair_lexicon_path)
    self.nrc_sentiment140_unigram_lexicon = read_nrc_hashtag_unigram(res.nrc_sentiment140_unigram_lexicon_path)
    self.nrc_sentiment140_bigram_lexicon = read_nrc_hashtag_bigram(res.nrc_sentiment140_bigram_lexicon_path)
    self.nrc_sentiment140_pair_lexicon = read_nrc_hashtag_pair(res.nrc_sentiment140_pair_lexicon_path)
    self.nrc_hashtag_sentimenthashtags_lexicon = read_nrc_hashtag_sentimenthashtags(res.nrc_hashtag_sentimenthashtags_lexicon_path)
    self.mpqa_lexicon = read_mpqa(res.mpqa_lexicon_path)

    logger.info('Load carnegie clusters')
    self.carnegie_clusters = read_carnegie_clusters(res.carnegie_clusters_path)
def load_resources(self):
    """Load the pickled train/test corpora and apply the configured truncation and filtering."""
    super().load_resources()
    logger.info('Load the corpus')
    with open(preprocess(res.train_path, force=self.repreprocess), 'rb') as p_file:
        self.train = pickle.load(p_file)
    with open(preprocess(res.test_path, force=self.repreprocess), 'rb') as p_file:
        self.test = pickle.load(p_file)
    self.train.truncate(self.train_truncate)
    self.test.truncate(self.test_truncate)
    self.train.filter_label(self.train_only_labels)
    self.test.filter_label(self.test_only_labels)
    if self.only_uid is not None:
        self.test.filter_uid(self.only_uid)
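# The two load_resources variants above rely on a Dataset object (from reader) that
# exposes `data`, `labels`, `target` and the truncate/filter_label/filter_uid helpers.
# The class below is only a sketch of that assumed interface, inferred from how it is
# used here; it is not the project's actual reader.Dataset implementation.
class Dataset:
    def __init__(self, data, labels, target, uids):
        self.data = data        # list of dicts, e.g. {'tok': 'tokenized text', ...}
        self.labels = labels    # label names, e.g. ['negative', 'neutral', 'positive']
        self.target = target    # per-example label ids, parallel to data
        self.uids = uids        # per-example ids, parallel to data

    def truncate(self, n):
        # keep only the first n examples (assumed no-op when n is None)
        if n is not None:
            self._keep(range(min(n, len(self.data))))

    def filter_label(self, only_labels):
        # keep examples whose label name is listed in only_labels (assumed no-op when falsy)
        if only_labels:
            self._keep(i for i, t in enumerate(self.target) if self.labels[t] in only_labels)

    def filter_uid(self, only_uid):
        # keep examples whose uid is listed in only_uid
        self._keep(i for i, u in enumerate(self.uids) if u in only_uid)

    def _keep(self, indexes):
        indexes = list(indexes)
        self.data = [self.data[i] for i in indexes]
        self.target = [self.target[i] for i in indexes]
        self.uids = [self.uids[i] for i in indexes]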
# fpath = os.path.join(path, fname)
# if sys.version_info < (3,):
#     f = open(fpath)
# else:
#     f = open(fpath, encoding='latin-1')
# texts.append(f.read())
# f.close()
# labels.append(label_id)

import pickle

import numpy as np
from keras.preprocessing.text import Tokenizer

import resources as res
from base import preprocess
from reader import Dataset  # we need this import because we're loading a Dataset with pickle

with open(preprocess(res.train_path, force=False), 'rb') as p_file:
    train = pickle.load(p_file)
texts = [d['tok'] for d in train.data]
labels_index = dict([(name, nid) for nid, name in enumerate(train.labels)])
labels = np.array(train.target)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
# (MAX_NB_WORDS, the vocabulary size cap, is assumed to be defined earlier in the script)
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Force re-preprocessing of the train, test and dev corpora."""
import logging

import logger_config  # imported for its side effects (logging configuration)
logger = logging.getLogger(__name__)

from base import preprocess
import resources as res

preprocess(res.train_path, force=True)
preprocess(res.test_path, force=True)
preprocess(res.dev_path, force=True)
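# From the way preprocess is used in these files -- its return value is opened in
# binary mode and unpickled as a Dataset, and force=True triggers a re-run -- base.preprocess
# roughly follows the contract sketched below. The caching scheme and the build_dataset
# helper are assumptions for illustration, not the actual implementation.
import os
import pickle

def build_dataset(raw_path):
    # placeholder for the real corpus reading/tokenization step
    raise NotImplementedError

def preprocess(raw_path, force=False):
    """Build (or reuse) a pickled Dataset for raw_path and return the pickle's path."""
    pickled_path = raw_path + '.pkl'        # hypothetical naming scheme
    if force or not os.path.exists(pickled_path):
        dataset = build_dataset(raw_path)
        with open(pickled_path, 'wb') as p_file:
            pickle.dump(dataset, p_file)
    return pickled_path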
def load_resources(self):
    """Load the corpora, vectorize them with a Keras tokenizer and load the word embedding."""
    super().load_resources()
    logger.info('Load the corpus')
    with open(preprocess(res.train_path, force=self.repreprocess), 'rb') as p_file:
        self.train = pickle.load(p_file)
    with open(preprocess(res.test_path, force=self.repreprocess), 'rb') as p_file:
        self.test = pickle.load(p_file)
    with open(preprocess(res.dev_path, force=self.repreprocess), 'rb') as p_file:
        self.dev = pickle.load(p_file)
    self.train.truncate(self.train_truncate)
    self.test.truncate(self.test_truncate)
    self.train.filter_label(self.train_only_labels)
    self.test.filter_label(self.test_only_labels)
    self.dev.filter_label(self.dev_only_labels)
    if self.only_uid is not None:
        self.test.filter_uid(self.only_uid)

    self.texts = [d['tok'] for d in self.train.data]
    self.labels_index = dict([(name, nid)
                              for nid, name in enumerate(self.train.labels)])
    self.labels = to_categorical(self.train.target)
    logger.info('Found %s texts', len(self.texts))

    logger.info('Vectorize the text samples into a 2D integer tensor')
    self.tokenizer = Tokenizer(nb_words=self.max_nb_words)
    self.tokenizer.fit_on_texts(self.texts)
    self.sequences = self.tokenizer.texts_to_sequences(self.texts)
    self.train = None  # free the raw training corpus once it has been vectorized

    self.dev_texts = [d['tok'] for d in self.dev.data]
    self.dev_sequences = self.tokenizer.texts_to_sequences(self.dev_texts)
    self.dev_data = pad_sequences(self.dev_sequences, maxlen=self.max_sequence_length)
    self.dev_labels = to_categorical(self.dev.target)
    self.dev_texts = None

    self.test_texts = [d['tok'] for d in self.test.data]
    self.test_sequences = self.tokenizer.texts_to_sequences(self.test_texts)
    self.test_data = pad_sequences(self.test_sequences, maxlen=self.max_sequence_length)
    self.test_labels = to_categorical(self.test.target)
    self.test_texts = None

    self.word_index = self.tokenizer.word_index
    logger.info('Found %s unique tokens.', len(self.word_index))

    self.train_data = pad_sequences(self.sequences, maxlen=self.max_sequence_length)
    logger.info('Shape of data tensor: %s', self.train_data.shape)
    logger.info('Shape of label tensor: %s', self.labels.shape)
    logger.info('label index: %s', self.labels_index)

    if self.embedding is None:
        self.load_trainable_embedding()
    else:
        self.load_fixed_embedding()
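# A minimal, self-contained sketch of the tokenize / pad / one-hot pipeline that
# load_resources applies to the corpora. The sample texts, the 20000-word cap and the
# maxlen of 30 are placeholder values, not the project's settings.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

texts = ['good movie', 'bad movie', 'great plot twist']  # placeholder tokenized texts
target = [1, 0, 1]                                       # placeholder label ids

tokenizer = Tokenizer(nb_words=20000)              # cap the vocabulary size
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)    # lists of word indices
data = pad_sequences(sequences, maxlen=30)         # 2D integer tensor, shape (3, 30)
labels = to_categorical(target)                    # one-hot labels, shape (3, 2)
print(data.shape, labels.shape)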