def load_data(args):
    """Load the BBC news sentences and build the module-level vocabulary.

    Rebinds the module globals ``word_dict`` and ``word_embed``, prints the
    dictionary, and serialises it to ``../datasets/word_dict`` with ``dump``.

    :param args: object exposing ``embedding_file``, which is forwarded to
        ``util.words2embedding``.
    :return: all loaded sentences joined by single spaces.
    """
    global word_dict, word_embed

    sentences = list(du.load_sent('../datasets/bbcnews.txt'))  # BBC_news

    word_dict = util.build_dict(sentences)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    print('word_dict:', word_dict)

    # Persist the dictionary so it can be reloaded without rebuilding it.
    with open('../datasets/word_dict', 'wb') as dict_file:
        dump(word_dict, dict_file)

    return ' '.join(sentences)
def test1(args):
    """Smoke-test dictionary/embedding construction and vertex hashing.

    Loads the BBC news sentences, builds the dictionaries and the word
    embedding matrix, stores the embedding dimensions on ``args`` and prints
    the result of ``words2word`` for every vertex returned by
    ``sp.hash_vertex``.

    :param args: object exposing ``embedding_file``; ``word_vocab_size`` and
        ``word_embed_size`` are written onto it.
    """
    sentences = list(du.load_sent('../datasets/bbcnews.txt'))
    logging.info(f'docs: {len(sentences)}')

    logging.info("building dictionary...")
    word_dict, _char_dict = util.build_dict(sentences)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)

    # Unpacking into exactly two names also checks the matrix is 2-D.
    vocab_size, embed_size = word_embed.shape
    args.word_vocab_size = vocab_size
    args.word_embed_size = embed_size
    logging.info(f'docs: {word_embed.shape}')
    print(word_dict)

    # NOTE(review): only the first loaded item is used; if it is a plain
    # string, this join puts a space between every character -- confirm.
    first_doc = ' '.join(sentences[0])

    for vertex in sp.hash_vertex(first_doc):
        print(words2word(vertex[0], word_embed, word_dict))
from event_chain import *
import data_utilities as du
import utilities as util
from pprint import pprint
import time

####### test 3 compare ######
# Module-level setup: these statements run at import time and create the
# globals that test3 reads.
docs = du.load_sent('../datasets/bbcsample1.txt')
word_dict = util.build_dict(docs)
print('models already downloaded')
# NOTE(review): Predictor is not imported by name here; presumably it comes
# in through the wildcard import from event_chain -- confirm.
srl_predictor = Predictor.from_path(
    '../pretrained/srl-model-2018.05.25.tar.gz')


def test3():
    """Build an EventChainBuilder and walk the module-level sentences.

    Uses the module-level ``docs`` and ``word_dict``. The visible loop body
    only prints a progress line for each sentence.
    """
    global word_dict, srl_predictor
    # docs = du.load_sent('../datasets/bbcsample1.txt')
    # wd = util.build_dict(docs)
    # pprint(wd)
    # print(docs)
    print('using ecb **********')
    # Start timestamp; not read again in the code visible here.
    start = time.time()
    ecb = EventChainBuilder(word_dict)
    for i, sent in enumerate(docs):
        print('processing sentence', i)
def main(args):
    """Train the fake/true document classifier and report dev/test metrics.

    Steps: load the fake and true splits, pick the recurrent layer class,
    load the dictionaries, build the embedding matrices, compile the Theano
    functions, evaluate once before training, then train for
    ``args.epoches`` epochs. After epoch 7, the dev set is evaluated every
    100 updates, and the test set is re-evaluated whenever both dev F1 and
    dev accuracy improve.

    :param args: configuration object. Reads ``debug``, ``rnn_type``,
        ``embedding_file``, ``batch_size``, ``epoches``, ``max_char``,
        ``max_sent`` and ``max_word``. Writes ``rnn``, ``word_vocab_size``,
        ``word_embed_size``, ``char_vocab_size`` and ``char_embed_size``.
    :return: None
    """
    logging.info("loading data...")
    fake_train, fake_dev, fake_test = du.load_fake(doc_ling=False, sent_ling=False)
    true_train, true_dev, true_test = du.load_true(doc_ling=False, sent_ling=False)
    if args.debug:
        # NOTE(review): true_train is flattened batch-by-batch further down,
        # but here it becomes a slice of the first batch only -- confirm the
        # flattening still yields examples in debug mode.
        true_train = true_train[0][:100]
        fake_train = fake_train[:10]
        true_dev = true_dev[:100]
        fake_dev = fake_dev[:10]
        true_test = true_test[:100]
        fake_test = fake_test[:10]

    if args.rnn_type == 'gru':
        args.rnn = lasagne.layers.GRULayer
    elif args.rnn_type == 'lstm':
        args.rnn = lasagne.layers.LSTMLayer
    else:
        args.rnn = lasagne.layers.RecurrentLayer

    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(
        None, max_words=0, dict_file=["word_dict", "char_dict"])

    logging.info("creating embedding matrix...")
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    char_embed = util.char2embedding(char_dict, 30)
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape
    (args.char_vocab_size, args.char_embed_size) = char_embed.shape

    logging.info("compiling Theano function...")
    att_fn, eval_fn, train_fn, params = create_theano_function(word_embed, char_embed, values=None)

    logging.info("batching examples...")
    dev_examples = mb.vec_minibatch(fake_dev + true_dev, word_dict, char_dict, args, False,
                                    char=False, sent_ling=False, doc_ling=False)
    test_examples = mb.vec_minibatch(fake_test + true_test, word_dict, char_dict, args, False,
                                     char=False, sent_ling=False, doc_ling=False)

    # Flatten the batched true training data into one list of examples.
    true_train = [example for true_batch in true_train for example in true_batch]
    train_examples = mb.doc_minibatch(fake_train + true_train, args.batch_size)
    # train_examples = mb.train_doc_minibatch(fake_train, true_train, args)

    logging.info("checking network...")
    dev_acc = evals.eval_vec_batch(eval_fn, dev_examples, char=False, sent_ling=False, doc_ling=False)
    print('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
    test_acc = evals.eval_vec_batch(eval_fn, test_examples, char=False, sent_ling=False, doc_ling=False)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)

    prev_fsc = 0
    stop_count = 0
    best_fsc = 0
    best_acc = 0
    # Defined up front so the final report cannot hit an unbound name when
    # no dev improvement is ever recorded (short runs never reach the
    # evaluation branch below).
    record = 'Best dev accuracy: no improvement recorded during training'

    logging.info("training %d examples" % len(train_examples))
    start_time = time.time()
    n_updates = 0
    for epoch in range(args.epoches):
        np.random.shuffle(train_examples)
        # if epoch > 3:
        #     logging.info("compiling Theano function again...")
        #     args.learning_rate *= 0.9
        #     att_fn, eval_fn, train_fn, params = create_theano_function(
        #         word_embed, char_embed, values=[x.get_value() for x in params])
        for batch_x, _ in train_examples:
            batch_x, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x), word_dict, char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, _ = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_y = np.array(list(batch_y))
            train_loss = train_fn(batch_rnn, batch_word_mask, batch_sent_mask, batch_y)
            n_updates += 1

            # Dev evaluation runs every 100 updates, and only after epoch 7.
            if n_updates % 100 == 0 and epoch > 7:
                logging.info(
                    'Epoch = %d, loss = %.2f, elapsed time = %.2f (s)' %
                    (epoch, train_loss, time.time() - start_time))
                # dev_acc = evals.eval_batch(eval_fn, dev_examples, word_dict, char_dict, args)
                dev_acc = evals.eval_vec_batch(eval_fn, dev_examples, char=False,
                                               sent_ling=False, doc_ling=False)
                logging.info('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)

                # A new best needs BOTH F1 (index 3) and accuracy (index 0)
                # to improve.
                if dev_acc[3] > best_fsc and dev_acc[0] > best_acc:
                    best_fsc = dev_acc[3]
                    best_acc = dev_acc[0]
                    logging.info(
                        'Best dev f1: epoch = %d, n_udpates = %d, f1 = %.2f %%' %
                        (epoch, n_updates, dev_acc[3]))
                    record = 'Best dev accuracy: epoch = %d, n_udpates = %d ' % \
                        (epoch, n_updates) + ' Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc
                    test_acc = evals.eval_vec_batch(eval_fn, test_examples, char=False,
                                                    sent_ling=False, doc_ling=False)
                    print(
                        'Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
                    # util.save_params('char_not_params_%.2f' % test_acc[3], params,
                    #                  epoch=epoch, n_updates=n_updates)

                # Count consecutive dev evaluations where F1 dropped.
                if prev_fsc > dev_acc[3]:
                    stop_count += 1
                else:
                    stop_count = 0
                if stop_count == 6:
                    # NOTE(review): this only reports the condition; training
                    # keeps running. Confirm whether a real early stop is
                    # wanted.
                    print("stopped")
                prev_fsc = dev_acc[3]

    print(record)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
    return
def predict_multilingual(input_path: str, output_path: str, resources_path: str, lang: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <BABELSynset>" format (e.g. "d000.s000.t000 bn:01234567n").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    Implementation notes: every resource file (model, tokenizer,
    lemma2synsets, babelnet2wordnet) is resolved relative to
    ``resources_path``. The gold key file is expected next to the input, at
    ``input_path`` with "data.xml" replaced by "gold.key.txt"; it is read
    both for parsing and for the micro-averaged F1 printed at the end.
    NOTE(review): ``lang`` is accepted but not used by this implementation.

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :param lang: the language of the dataset specified in input_path, specify which model to load on which dataset
    :return: None
    """
    # load the model
    model_path = os.path.join(resources_path, 'SensEmbed_BiLSTM_ATT_MultiTask_model.h5')
    model = load_model(model_path, custom_objects={'SeqSelfAttention': SeqSelfAttention})
    logging.info(f'{model._name} is loaded.')

    # load tokenizer, fetch our vocabulary size
    tokenizer_path = os.path.join(resources_path, 'multilingual_tokenizer.pkl')
    tokenizer = load_pickle(tokenizer_path)
    word_tokens = [word for word in tokenizer.word_index if 'bn:' not in word]
    sense_tokens = [word for word in tokenizer.word_index if 'bn:' in word]
    vocabulary_size = len(word_tokens)
    output_size = vocabulary_size + len(sense_tokens)

    batch_size = 8  # hard coded; as this was the one worked on Colab Google

    # Parse the testing dataset; the gold key path is derived once and
    # reused for scoring below.
    gold_path = input_path.replace("data.xml", "gold.key.txt")
    gold_dict = build_dict(gold_path)
    data_x, mask_x = parse_test(input_path, tokenizer=tokenizer,
                                gold_dict=gold_dict, batch_size=batch_size)

    # Getting the model predictions
    predictions = []
    batches = test_generator(np.array(data_x), batch_size, output_size,
                             use_elmo=False, mask_builder=np.array(mask_x),
                             tokenizer=tokenizer, use_bert=False)
    for batch_x, batch_mask in tqdm(batches, desc="Predicting Senses"):
        # Output Shape (batch_size, max_len_per_batch, output_vocab_size)
        batch_pred = model.predict_on_batch([batch_x, batch_mask])
        # Only the first model output is decoded.
        y_hat = np.argmax(batch_pred[0], axis=-1)
        predictions.extend(y_hat)

    # load lemma2synsets -- resolved from resources_path like every other
    # resource, rather than from the current working directory
    lemma2synsets_file_path = os.path.join(resources_path, 'lemma2synsets4.0.xx.wn.ALL.txt')
    lemma_synsets = get_lemma2synsets(lemma2synsets_file_path)

    # load wordnet 2 babelnet synsets' mapping
    bn2wn_path = os.path.join(resources_path, "babelnet2wordnet.tsv")
    _, wordnet_babelnet_ = build_bn2wn_dict(bn2wn_path)

    # Save predictions to a file
    id_bn_list = []  # stands for predictions in {word_id babelnet_sense}
    _predictions = []
    for i, sentence in enumerate(tqdm(data_x, desc="Preparing models' predictions")):
        for j, word in enumerate(sentence):
            if len(mask_x[i][j]) != 2:
                continue  # not an instance to disambiguate

            prediction = predictions[i][j]
            prediction_sense_ = tokenizer.index_word.get(prediction, '<OOV>')
            # NOTE(review): the model output is kept only when the token
            # contains BOTH 'wn:' and 'bn:'; confirm sense tokens carry both.
            if 'wn:' not in prediction_sense_ or 'bn:' not in prediction_sense_:
                # Fallback Strategy
                prediction_sense = predict_multilingual_sense(word=word,
                                                              word2idx=tokenizer.word_index,
                                                              lemma_synsets=lemma_synsets,
                                                              wordnet_babelnet=wordnet_babelnet_)
            else:
                prediction_sense = prediction_sense_[prediction_sense_.find('bn:'):]

            word_id = mask_x[i][j][1]
            if word_id is None:
                continue
            # bn is never None after this line, so only word_id is checked.
            bn = prediction_sense if prediction_sense is not None else '<OOV>'
            id_bn_list.append(f'{word_id}\t{bn}')
            _predictions.append(bn)

    # Writing model predictions
    with open(output_path, encoding='utf-8', mode="w+") as output_file:
        for id_bn in tqdm(id_bn_list, desc="Writing model predictions"):
            output_file.write(f'{id_bn}\n')

    # Fetching the ground truth of the data (second whitespace-separated
    # field of every non-blank line)
    with open(gold_path, encoding='utf-8', mode='r') as ground_truth_file:
        ground_truth = [line.split()[1]
                        for line in ground_truth_file.read().splitlines()
                        if line.strip()]

    # Compute F1_Score
    _, _, f1score, _ = precision_recall_fscore_support(ground_truth, _predictions, average='micro')
    print(f'{model._name} F1_score: {f1score}')