class Model(object):

    def __init__(self):
        self.__tokenizer = Tokenizer()

    def __tweet2idx(self, tweet, w2idx):
        # map tokens to ids, falling back to the <UNK> id for unseen tokens
        return np.array([
            w2idx[token] if token in w2idx else w2idx['<UNK>']
            for token in tweet
        ])

    def __convert_format(self, corpus, classes, w2idx, max_len,
                         only_predict: bool = False):
        dataset = []
        for tweet in corpus:
            dataset.append((tweet.get_text(), tweet.get_gold_label()))
        x_data, y_data = zip(*dataset)
        x_data = [self.__tokenizer.get_only_tokens(tweet) for tweet in x_data]

        # make it possible to get predictions for unlabeled tweets
        if not only_predict:
            y_data = [classes[label] for label in y_data]
            # class id to one-hot vector
            y_data = [np.eye(len(classes))[label] for label in y_data]
            y_data = np.array(y_data)
        else:
            y_data = None

        # to np array
        x_data = np.array([self.__tweet2idx(tweet, w2idx) for tweet in x_data])
        # padding
        x_data = pad_sequences(x_data, max_len)
        return x_data, y_data

    def __create_vocab(self, corpus):
        vocab = {}
        for tweet in corpus:
            for token in self.__tokenizer.get_only_tokens(tweet.get_text()):
                if token in vocab:
                    vocab[token] += 1
                else:
                    vocab[token] = 1
        # add <UNK> token to map unseen words to; use a high count so that it
        # does not get filtered out by min_count
        vocab['<UNK>'] = 100
        return vocab

    def __get_word_embeddings(self, vecs, vocab, min_count):
        dim = vecs.vector_size
        embeddings = {}
        for word in vecs._w2idx.keys():
            embeddings[word] = vecs[word]
        # add random embeddings for words that occur in the training data but
        # not in the pretrained w2v embeddings
        for word in vocab:
            if word not in embeddings and vocab[word] >= min_count:
                embeddings[word] = np.random.uniform(-0.25, 0.25, dim)

        vocab_size = len(embeddings)
        word_idx_map = dict()
        # row 0 is reserved for the padding index and stays all zeros
        W = np.zeros(shape=(vocab_size + 1, dim), dtype='float32')
        i = 1
        for word in embeddings:
            W[i] = embeddings[word]
            word_idx_map[word] = i
            i += 1
        return embeddings, W, word_idx_map

    def train(self, train_corpus, classes, architecture, params,
              num_epochs: int, max_len: int, embedding_file, file_type,
              min_count: int, save_dir, dev_corpus=None):
        """ Trains a model.

        Args:
            train_corpus   : Corpus containing the Tweets for training
            classes        : Dictionary mapping classification classes to ids
            architecture   : String, one of LSTM, BiLSTM, CNN, LSTM+ATT, BiLSTM+ATT
            params         : Dictionary mapping model parameters to values
            num_epochs     : int, number of training iterations
            max_len        : int, maximum sequence length
            embedding_file : String, name/path of the pretrained word embedding file
            file_type      : String, format of the embedding file (word2vec text or binary)
            min_count      : int, minimum number of occurrences a word needs to be kept
            save_dir       : String, directory to save the model and weights to
            dev_corpus     : (optional) Corpus containing the Tweets for development;
                             if None, a 90/10 validation split of the training data is used

        Files that are written:
            'attention_model.h5' : only if architecture is LSTM+ATT or BiLSTM+ATT
            'model.json'         : model architecture
            'vocab.p'            : vocab of the training data
            'max_sequence_len.p' : maximum sequence length
            'word_idx_map.p'     : word to id mapping
            'classes.p'          : classes to id mapping
        """
        # restrict gpu memory consumption
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        set_session(sess)

        params['max_len'] = max_len

        print('Creating vocab...')
        vocab = self.__create_vocab(train_corpus)
        print('vocab finished')

        # create word vectors and the embedding matrix W
        print('Loading embeddings...')
        # filter the embedding file with the vocab
        vecs = WordVecs(embedding_file, file_type, vocab)
        print('finished loading')

        print('Creating wordvecs, W and w2idx map...')
        embeddings, W, word_idx_map = self.__get_word_embeddings(
            vecs, vocab, min_count)
        print('wordvecs, W, w2idx map finished')

        # convert train corpus to x_train, y_train
        print('Converting train corpus...')
        x_train, y_train = self.__convert_format(train_corpus, classes,
                                                 word_idx_map, max_len)
        # convert dev corpus to x_dev, y_dev
        if dev_corpus:
            print('Converting dev corpus...')
            x_dev, y_dev = self.__convert_format(dev_corpus, classes,
                                                 word_idx_map, max_len)
        print('converting finished')

        # create the neural network
        output_dim = len(classes)
        vocab_size = len(embeddings)
        embedding_dim = vecs.vector_size
        print('Creating nn...')
        if architecture == 'LSTM' or architecture == 'LSTM+ATT':
            nn = LSTM_Model(vocab_size, embedding_dim, output_dim, W, params)
        elif architecture == 'BiLSTM' or architecture == 'BiLSTM+ATT':
            nn = BiLSTM_Model(vocab_size, embedding_dim, output_dim, W, params)
        elif architecture == 'CNN':
            nn = CNN_Model(vocab_size, embedding_dim, output_dim, W, params)
        else:
            print('Unknown architecture: ' + architecture)
            return
        print('nn finished')

        # checkpointing: keep the best weights and stop early if val_acc stalls
        filepath = save_dir + "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
        callbacks = [
            ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                            save_best_only=True, mode='auto'),
            EarlyStopping(monitor='val_acc', patience=3, mode='max')
        ]

        # train
        if dev_corpus:
            nn.model.fit(x_train, y_train,
                         validation_data=[x_dev, y_dev],
                         epochs=num_epochs, verbose=1, callbacks=callbacks)
        else:
            nn.model.fit(x_train, y_train,
                         validation_split=0.1,
                         epochs=num_epochs, verbose=1, callbacks=callbacks)
        print('Finished training ' + architecture)

        # serialize the attention model
        if architecture == 'LSTM+ATT' or architecture == 'BiLSTM+ATT':
            attention_model = nn.attention_model
            attention_model.save(save_dir + 'attention_model.h5')

        # serialize the model architecture to JSON
        model_json = nn.model.to_json()
        with open(save_dir + "model.json", "w") as json_file:
            json_file.write(model_json)

        # serialize vocab, word to id mapping, max_len and classes
        pickle.dump(vocab, open(save_dir + "vocab.p", "wb"))
        pickle.dump(max_len, open(save_dir + "max_sequence_len.p", "wb"))
        pickle.dump(word_idx_map, open(save_dir + "word_idx_map.p", "wb"))
        pickle.dump(classes, open(save_dir + "classes.p", "wb"))

    def get_word_attention(self, saved_dir, test_corpus):
        """ Gets the attention score of every token of every Tweet in the
        test_corpus.

        Args:
            saved_dir   : String, directory containing the saved attention model
                          (assumed to be called "attention_model.h5"), the maximum
                          sequence length for padding (assumed to be called
                          "max_sequence_len.p"), the token to id mapping (assumed
                          to be called "word_idx_map.p") and the classes to id
                          mapping (assumed to be called "classes.p")
            test_corpus : Corpus containing the Tweets for which attentions
                          should be calculated

        Returns:
            list of lists of tuples containing the attention value for each token.
        """
        classes = pickle.load(open(saved_dir + "classes.p", "rb"))
        max_len = pickle.load(open(saved_dir + "max_sequence_len.p", "rb"))
        word_idx_map = pickle.load(open(saved_dir + "word_idx_map.p", "rb"))
        inv_word_idx_map = {v: k for k, v in word_idx_map.items()}

        # convert test data into the input format
        x_test, _ = self.__convert_format(test_corpus, classes, word_idx_map,
                                          max_len)

        # load model
        attention_model = load_model(saved_dir + 'attention_model.h5')
        print("Attention model loaded from disk")
        # compile model
        attention_model.compile(optimizer='adam',
                                loss='categorical_crossentropy',
                                metrics=['accuracy'])

        # get attentions; token id 0 is padding and is skipped
        word_attentions = []
        attentions = attention_model.predict(x_test, verbose=1)
        for importances, tweet in zip(attentions, x_test):
            temp = []
            for importance, token in zip(importances, tweet):
                if token != 0:
                    temp.append((inv_word_idx_map[token], importance))
            word_attentions.append(temp)
        return word_attentions

    def predict(self, saved_dir, path_weights, test_corpus):
        """ Gets a prediction for every Tweet in the test_corpus and writes it
        into the Tweet data structure as well as to file.

        Args:
            saved_dir    : String, directory containing the model architecture
                           (assumed to be called "model.json"), the maximum
                           sequence length for padding (assumed to be called
                           "max_sequence_len.p"), the token to id mapping
                           (assumed to be called "word_idx_map.p") and the
                           classes to id mapping (assumed to be called
                           "classes.p")
            path_weights : String, path of the file where the model weights are
                           stored
            test_corpus  : Corpus containing the Tweets for which predictions
                           should be made

        Writes the file 'predictions.csv', containing the prediction of the
        model as well as the tweet itself, into saved_dir.
""" classes = pickle.load(open(save_dir + "classes.p", "rb")) inv_classes = {v: k for k, v in classes.items()} max_len = pickle.load(open(save_dir + "max_sequence_len.p", "rb")) word_idx_map = pickle.load(open(save_dir + "word_idx_map.p", "rb")) inv_word_idx_map = {v: k for k, v in word_idx_map.items()} # convert test data into input format x_test, _ = self.__convert_format(test_corpus, classes, word_idx_map, max_len, True) # load model architecture json_file = open(saved_dir + 'model.json', 'r') loaded_model = json_file.read() json_file.close() model = model_from_json(loaded_model) # load weights model.load_weights(path_weights) print("Loaded model from disk") model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) # get predictions predictions = model.predict(x_test, verbose=1) # one hot vector to class predictions = [ np.argmax(prediction, axis=-1) for prediction in predictions ] true_labels = [np.argmax(label, axis=-1) for label in y_test] # write predictions to file i = 0 with open(saved_dir + 'predictions.csv', 'w') as out: out.write("tpredicted_label\ttweet\n") for prediction in predictions: pred_label = inv_classes[prediction] text = test_corpus.get_ith(i).get_text() out.write(pred_label + "\t" + text + "\n") i += 1 # write predictions in tweets of test corpus # order of predictions should be the same as oder of tweets in test_corpus for i in range(len(predictions)): test_corpus.get_ith(i).set_pred_label(inv_classes[predictions[i]]) return test_corpus def test(self, save_dir, path_weights, test_corpus): """ Tests predictions of the models for the Tweets in test_corpus. Writes predictions into the Tweet data structure as well as to file. Args: saved_dir : String, path/name to file where weights of the model, architecture of the model (assumed to be called model.json) the file containg the max_sequence_length (for padding; assumed to be called "max_sequence_len.p"), token to id mapping (assumed to be called word_idx_map.p), classes to id mapping (assumed to be called classes.p) are stored path_weights : String, path/name to file where model weights are stored test_corpus : Corpus, containg the Tweets for which attentions should be calculated Writes file 'predictions.csv' containg the predition of the model as well as the tweet itself into the saved_dir. 
""" classes = pickle.load(open(save_dir + "classes.p", "rb")) inv_classes = {v: k for k, v in classes.items()} max_len = pickle.load(open(save_dir + "max_sequence_len.p", "rb")) word_idx_map = pickle.load(open(save_dir + "word_idx_map.p", "rb")) inv_word_idx_map = {v: k for k, v in word_idx_map.items()} # convert test data into input format x_test, y_test = self.__convert_format(test_corpus, classes, word_idx_map, max_len) # load model architecture json_file = open(save_dir + 'model.json', 'r') loaded_model = json_file.read() json_file.close() model = model_from_json(loaded_model) # load weights model.load_weights(path_weights) print("Loaded model from disk") model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) # evaluate model & print accuracy score = model.evaluate(x_test, y_test, verbose=0) print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100)) # get predictions predictions = model.predict(x_test, verbose=1) # one hot vector to class predictions = [ np.argmax(prediction, axis=-1) for prediction in predictions ] true_labels = [np.argmax(label, axis=-1) for label in y_test] # write predictions to file i = 0 with open(save_dir + 'predictions.csv', 'w') as out: out.write("true_label\tpredicted_label\ttweet\n") for label, prediction in zip(true_labels, predictions): pred_label = inv_classes[prediction] true_label = inv_classes[label] text = test_corpus.get_ith(i).get_text() out.write(true_label + "\t" + pred_label + "\t" + text + "\n") i += 1 # write predictions in tweets of test corpus # order of predictions should be the same as oder of tweets in test_corpus for i in range(len(predictions)): test_corpus.get_ith(i).set_pred_label(inv_classes[predictions[i]]) return test_corpus