def analyze(texts):
    """Analyses the emotional content of the texts given."""
    encoding_to_label = {
        0: 'anger',
        1: 'disgust',
        2: 'fear',
        3: 'joy',
        4: 'sadness',
        5: 'surprise',
    }

    word_index, seqs = tokenize(texts, MAX_NB_WORDS)
    padded = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict(padded, batch_size=128, verbose=1)

    analyses = []
    for pred in predictions:
        analysis = {}
        for i, prob in enumerate(pred):
            label = encoding_to_label[i]
            percentage = round(prob * 100, 3)
            analysis[label] = percentage
        analyses.append(analysis)

    return analyses
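# A quick usage sketch for analyze(); illustrative only. The module name
# `emotion_model` is hypothetical, and it is assumed to expose analyze() with
# the trained Keras `model` and tokenizer state already loaded at import time.
from emotion_model import analyze  # hypothetical module name

results = analyze([
    "I can't believe we actually won!",
    "Stuck in traffic again, this is infuriating.",
])
for analysis in results:
    # Each result maps the six emotion labels to percentages,
    # e.g. {'anger': 1.2, 'disgust': 0.4, ..., 'joy': 91.0}
    top = max(analysis, key=analysis.get)
    print(top, analysis[top])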
def preprocess_raw(self):
    ''' Tokenize/BPE encode the raw text '''
    def is_xml(filename):
        ''' Determine if a file is XML formatted '''
        return filename.endswith('.sgm') or filename.endswith('.xml')

    def filter_lines(in_file, basename):
        ''' Scan the file for any lines that should be filtered out '''
        filtered = set()
        xml = is_xml(basename)
        for i, line in enumerate(in_file):
            if not self.preprocess_raw_line(line, xml=xml):
                filtered.add(i)

        return filtered

    def merge(basename, in_file, out_file, filtered=None):
        ''' Preprocess the passed-in file and write it to the designated file '''
        filtered = filtered or set()
        xml = is_xml(basename)
        for i, line in enumerate(in_file):
            if i in filtered:
                continue

            processed_line = self.preprocess_raw_line(line, xml=xml)
            out_file.write(processed_line + '\n')

    # First, clean up any incomplete preprocessing files
    for path in glob.glob(
            os.path.join(self.preprocess_directory, '*.incomplete')):
        # glob already returns the joined path, so remove it directly
        os.remove(path)

    bpe_code_path = os.path.join(self.preprocess_directory, 'bpe.32000')
    if not os.path.exists(bpe_code_path):
        for split, file_pairs in type(self).RAW_SPLITS.items():
            for pair in file_pairs:
                # First determine which lines must be skipped in both files,
                # since the two files form a line-aligned parallel corpus.
                filtered = set()
                for filename, lang in zip(pair, type(self).LANGUAGE_PAIR):
                    in_path = os.path.join(self.preprocess_directory, filename)
                    with ExitStack() as stack:
                        in_file = stack.enter_context(Open(in_path, 'rt'))
                        filtered.update(
                            filter_lines(in_file, os.path.basename(filename)))

                for filename, lang in zip(pair, type(self).LANGUAGE_PAIR):
                    basename = os.path.basename(filename)
                    in_path = os.path.join(self.preprocess_directory, filename)
                    split_path = os.path.join(self.preprocess_directory,
                                              f'{split}.{lang}')
                    if os.path.exists(split_path):
                        continue

                    with ExitStack() as stack:
                        out_path = f'{split_path}.incomplete'
                        in_file = stack.enter_context(Open(in_path, 'rt'))
                        out_file = stack.enter_context(Open(out_path, 'at'))
                        merge(basename, in_file, out_file, filtered)

        word_counts = Counter()
        for split in type(self).RAW_SPLITS:
            for lang in type(self).LANGUAGE_PAIR:
                try:
                    split_path = os.path.join(self.preprocess_directory,
                                              f'{split}.{lang}')
                    os.rename(f'{split_path}.incomplete', split_path)
                except FileNotFoundError:
                    # This can happen if the preprocessing is interrupted
                    pass

                tokenized_path = os.path.join(self.preprocess_directory,
                                              f'{split}.tok.{lang}')
                word_counts.update(
                    preprocess.tokenize(split_path, tokenized_path,
                                        self.preprocess_buffer_size))

        print('Learning BPE')
        preprocess.learn_bpe(bpe_code_path, word_counts.items())

    vocab_path = os.path.join(self.preprocess_directory, 'vocab.bpe.32000')
    if not os.path.exists(vocab_path):
        vocab = set()
        for split in type(self).RAW_SPLITS:
            for lang in type(self).LANGUAGE_PAIR:
                in_path = os.path.join(self.preprocess_directory,
                                       f'{split}.tok.{lang}')
                bpe_path = os.path.join(self.preprocess_directory,
                                        f'{split}.tok.bpe.32000.{lang}')
                vocab.update(
                    preprocess.apply_bpe(bpe_code_path, in_path, bpe_path,
                                         self.preprocess_buffer_size))

        incomplete_vocab_path = f'{vocab_path}.incomplete'
        with Open(incomplete_vocab_path, 'wt') as vocab_file:
            vocab_file.writelines('\n'.join(sorted(vocab)))
        os.rename(incomplete_vocab_path, vocab_path)
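# For context: preprocess_raw() relies on class attributes that are not shown
# in this snippet. A minimal sketch of what they could look like, using
# illustrative file names (not the repository's actual corpus list):
class TranslationDataset:  # hypothetical enclosing dataset class
    # Each raw split maps to (source, target) file pairs under
    # self.preprocess_directory; the two files in a pair are line-aligned,
    # which is why filter_lines() builds one shared set of skipped line
    # indices that merge() then applies to both sides of the pair.
    RAW_SPLITS = {
        'train': [('train.raw.en', 'train.raw.de')],
        'valid': [('valid.raw.en', 'valid.raw.de')],
        'test': [('test.raw.en', 'test.raw.de')],
    }
    LANGUAGE_PAIR = ('en', 'de')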
from keras.layers import Embedding, Dense, Dropout, Activation, LSTM
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

from data.preprocess import clean, tokenize, remove_common_words, undersample

EMBEDDINGS_DIM = 25
MAX_SEQUENCE_LENGTH = 280  # Max tweet length
MAX_NB_WORDS = 30000

df = clean('./Jan9-2012-tweets-clean.txt')
df = undersample(df, 'joy', 3000)

X = df['tweet']
y = df['emotion']

word_index, sequences = tokenize(X, MAX_NB_WORDS)
word_index, sequences = remove_common_words(word_index, sequences, 100)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

label_encoder = LabelEncoder()
label_encoder.fit(y)
print('Labels:', label_encoder.classes_)
print('Label encodings:', label_encoder.transform(label_encoder.classes_))
labels = to_categorical(label_encoder.transform(y))

print("Shape of data: {}.".format(data.shape))

embeddings_index = {}
with open('./glove.twitter.27B/glove.twitter.27B.{}d.txt'.format(
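# A small standalone check (toy labels only, not the dataset's) of what the
# LabelEncoder / to_categorical step above produces:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

toy = ['joy', 'anger', 'joy', 'sadness']
enc = LabelEncoder().fit(toy)
print(enc.classes_)                        # ['anger' 'joy' 'sadness']
print(to_categorical(enc.transform(toy)))
# [[0. 1. 0.]
#  [1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]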