예제 #1
0
def load_data(args):
    """Load the BBC news sentences and publish the vocabulary as globals.

    Side effects:
        * rebinds the module globals ``word_dict`` and ``word_embed``;
        * prints the dictionary;
        * writes the dictionary to ``../datasets/word_dict`` via ``dump``
          (presumably ``pickle.dump`` -- confirm against the file's imports).

    :param args: object exposing ``embedding_file``, forwarded to
        ``util.words2embedding``.
    :return: every loaded sentence joined into one space-separated string.
    """
    global word_dict, word_embed

    sentences = list(du.load_sent('../datasets/bbcnews.txt'))  # BBC_news

    word_dict = util.build_dict(sentences)
    # 100 is passed as the second argument of words2embedding
    # (presumably the embedding dimension -- confirm in util).
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    print('word_dict:', word_dict)

    with open('../datasets/word_dict', 'wb') as dict_file:
        dump(word_dict, dict_file)

    return ' '.join(sentences)
예제 #2
0
def test1(args):
    """Smoke-test the dictionary/embedding pipeline on the BBC news file.

    Loads the sentences, builds the word/char dictionaries and the word
    embedding matrix, stores the matrix shape on ``args``
    (``word_vocab_size``, ``word_embed_size``), then hashes the vertices of
    the first document with ``sp.hash_vertex`` and prints
    ``words2word(...)`` for the first item of every vertex.

    :param args: object exposing ``embedding_file``; it is mutated as
        described above.
    """
    sentences = list(du.load_sent('../datasets/bbcnews.txt'))
    logging.info('docs: {}'.format(len(sentences)))

    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(sentences)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)

    embed_shape = word_embed.shape
    args.word_vocab_size, args.word_embed_size = embed_shape
    logging.info('docs: {}'.format(embed_shape))
    print(word_dict)

    # Only the first loaded entry is analysed.
    first_doc = ' '.join(sentences[0])
    for vertex in sp.hash_vertex(first_doc):
        print(words2word(vertex[0], word_embed, word_dict))
예제 #3
0
from event_chain import *
import data_utilities as du
import utilities as util
from pprint import pprint

import time
####### test 3 compare ######
# Module-level setup shared by test3(). Everything below runs at import time.
# Load the sample input and build the word dictionary from it.
docs = du.load_sent('../datasets/bbcsample1.txt')
word_dict = util.build_dict(docs)

# NOTE(review): this message is printed unconditionally, before the model
# archive is opened; nothing here checks that the archive actually exists.
print('models already downloaded')
# `Predictor` is not imported by name in the visible imports; presumably it is
# provided by `from event_chain import *` -- TODO confirm.
srl_predictor = Predictor.from_path(
    '../pretrained/srl-model-2018.05.25.tar.gz')

def test3():
    """Walk over the module-level ``docs`` with an ``EventChainBuilder``.

    As visible here, the function only announces the run, records a start
    timestamp, constructs the builder from the global ``word_dict`` and
    prints the index of each sentence; neither the timestamp nor the builder
    is used afterwards.
    NOTE(review): the body looks truncated in this view -- confirm against
    the full file.
    """
    global word_dict, srl_predictor

    print('using ecb **********')
    started_at = time.time()
    chain_builder = EventChainBuilder(word_dict)

    for sent_idx, _sentence in enumerate(docs):
        print('processing sentence', sent_idx)
예제 #4
0
def main(args):
    """Train the fake/true document classifier and report dev/test scores.

    Pipeline: load the fake and true splits, pick the recurrent layer class,
    load the dictionaries, build the embedding matrices, compile the Theano
    functions, evaluate once before training, then train for
    ``args.epoches`` epochs. After epoch 7, the dev set is evaluated every
    100 updates and the test set is re-scored whenever both dev F1 and dev
    accuracy improve.

    Attributes read from ``args``: ``debug``, ``rnn_type``,
    ``embedding_file``, ``batch_size``, ``epoches``, ``max_char``,
    ``max_sent``, ``max_word``.
    Attributes written to ``args``: ``rnn``, ``word_vocab_size``,
    ``word_embed_size``, ``char_vocab_size``, ``char_embed_size``.

    :param args: configuration namespace (see the attribute lists above).
    :return: None; results are printed and logged.
    """
    logging.info("loading data...")
    fake_train, fake_dev, fake_test = du.load_fake(doc_ling=False,
                                                   sent_ling=False)
    true_train, true_dev, true_test = du.load_true(doc_ling=False,
                                                   sent_ling=False)
    if args.debug:
        # true_train is a list of batches (it is flattened further down).
        # Keep that nesting here: the previous `true_train[0][:100]` produced
        # a flat list of examples, which the flattening step then took apart
        # element by element.
        true_train = [true_train[0][:100]]
        fake_train = fake_train[:10]
        true_dev = true_dev[:100]
        fake_dev = fake_dev[:10]
        true_test = true_test[:100]
        fake_test = fake_test[:10]

    if args.rnn_type == 'gru':
        args.rnn = lasagne.layers.GRULayer
    elif args.rnn_type == 'lstm':
        args.rnn = lasagne.layers.LSTMLayer
    else:
        args.rnn = lasagne.layers.RecurrentLayer

    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(
        None, max_words=0, dict_file=["word_dict", "char_dict"])
    logging.info("creating embedding matrix...")
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    char_embed = util.char2embedding(char_dict, 30)
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape
    (args.char_vocab_size, args.char_embed_size) = char_embed.shape
    logging.info("compiling Theano function...")
    att_fn, eval_fn, train_fn, params = create_theano_function(word_embed,
                                                               char_embed,
                                                               values=None)

    def _vec_batches(examples):
        # Dev and test sets are vectorised with identical settings.
        return mb.vec_minibatch(examples,
                                word_dict,
                                char_dict,
                                args,
                                False,
                                char=False,
                                sent_ling=False,
                                doc_ling=False)

    def _evaluate(examples):
        # Returns a 4-tuple that is formatted below as (A, P, R, F).
        return evals.eval_vec_batch(eval_fn,
                                    examples,
                                    char=False,
                                    sent_ling=False,
                                    doc_ling=False)

    logging.info("batching examples...")
    dev_examples = _vec_batches(fake_dev + true_dev)
    test_examples = _vec_batches(fake_test + true_test)

    # Flatten the list of true batches into a single list of examples.
    true_train = [example
                  for true_batch in true_train
                  for example in true_batch]
    train_examples = mb.doc_minibatch(fake_train + true_train, args.batch_size)

    logging.info("checking network...")
    dev_acc = _evaluate(dev_examples)
    print('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
    test_acc = _evaluate(test_examples)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)

    prev_fsc = 0
    stop_count = 0
    best_fsc = 0
    best_acc = 0
    # Initialised up front: it is only assigned when a new best dev score is
    # seen, and the final report must not fail when that never happens
    # (e.g. fewer than 9 epochs).
    record = None
    logging.info("training %d examples" % len(train_examples))
    start_time = time.time()
    n_updates = 0
    for epoch in range(args.epoches):
        np.random.shuffle(train_examples)
        for batch_x, _ in train_examples:
            batch_x, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, _ = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_y = np.array(list(batch_y))
            train_loss = train_fn(batch_rnn, batch_word_mask, batch_sent_mask,
                                  batch_y)
            n_updates += 1

            # Periodic evaluation only starts after epoch 7.
            if n_updates % 100 != 0 or epoch <= 7:
                continue

            logging.info(
                'Epoch = %d, loss = %.2f, elapsed time = %.2f (s)' %
                (epoch, train_loss, time.time() - start_time))
            dev_acc = _evaluate(dev_examples)
            logging.info('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)

            # A new best requires BOTH F1 (index 3) and accuracy (index 0)
            # to improve.
            if dev_acc[3] > best_fsc and dev_acc[0] > best_acc:
                best_fsc = dev_acc[3]
                best_acc = dev_acc[0]
                logging.info(
                    'Best dev f1: epoch = %d, n_udpates = %d, f1 = %.2f %%'
                    % (epoch, n_updates, dev_acc[3]))
                record = 'Best dev accuracy: epoch = %d, n_udpates = %d ' % \
                         (epoch, n_updates) + ' Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc
                test_acc = _evaluate(test_examples)
                print(
                    'Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f'
                    % test_acc)

            if prev_fsc > dev_acc[3]:
                stop_count += 1
            else:
                stop_count = 0
            # NOTE(review): this only reports six consecutive drops in dev
            # F1; training is not interrupted. Confirm whether a real early
            # stop was intended.
            if stop_count == 6:
                print("stopped")
            prev_fsc = dev_acc[3]

    if record is not None:
        print(record)
    else:
        logging.info("no dev improvement was recorded during training")
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
    return
def predict_multilingual(input_path: str, output_path: str, resources_path: str, lang: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <BABELSynset>" format (e.g. "d000.s000.t000 bn:01234567n").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    Besides writing the predictions, this implementation also reads the gold
    key file that sits next to ``input_path`` ("data.xml" replaced by
    "gold.key.txt") and prints the micro-averaged F1 score against it.

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :param lang: the language of the dataset specified in input_path, specify which model to load on which dataset
        (not read by the current implementation).
    :return: None
    """
    # load the model
    model_path = os.path.join(resources_path, 'SensEmbed_BiLSTM_ATT_MultiTask_model.h5')
    model = load_model(model_path, custom_objects={'SeqSelfAttention': SeqSelfAttention})
    logging.info(f'{model._name} is loaded.')

    # load tokenizer, fetch our vocabulary size
    tokenizer_path = os.path.join(resources_path, 'multilingual_tokenizer.pkl')
    tokenizer = load_pickle(tokenizer_path)

    # Tokens containing 'bn:' are treated as sense labels, the rest as words.
    word_tokens = [word for word in tokenizer.word_index if 'bn:' not in word]
    sense_tokens = [word for word in tokenizer.word_index if 'bn:' in word]
    vocabulary_size = len(word_tokens)
    output_size = vocabulary_size + len(sense_tokens)

    batch_size = 8  # hard coded; as this was the one worked on Colab Google

    # Parse the testing dataset
    gold_key_path = input_path.replace("data.xml", "gold.key.txt")
    gold_dict = build_dict(gold_key_path)
    data_x, mask_x = parse_test(input_path, tokenizer=tokenizer, gold_dict=gold_dict, batch_size=batch_size)

    # Getting the model predictions
    predictions = []
    for batch_x, batch_mask in tqdm(test_generator(np.array(data_x), batch_size, output_size,
                                                   use_elmo=False, mask_builder=np.array(mask_x),
                                                   tokenizer=tokenizer, use_bert=False),
                                    desc="Predicting Senses"):
        # Output Shape (batch_size, max_len_per_batch, output_vocab_size)
        batch_pred = model.predict_on_batch([batch_x, batch_mask])
        y_hat = np.argmax(batch_pred[0], axis=-1)
        predictions.extend(y_hat)

    # load lemma2synsets
    # Resolved against resources_path (not the current working directory) so
    # the function works no matter where it is launched from.
    lemma2synsets_file_path = os.path.join(resources_path, 'lemma2synsets4.0.xx.wn.ALL.txt')
    lemma_synsets = get_lemma2synsets(lemma2synsets_file_path)

    # load wordnet 2 babelnet synsets' mapping
    bn2wn_path = os.path.join(resources_path, "babelnet2wordnet.tsv")
    _, wordnet_babelnet_ = build_bn2wn_dict(bn2wn_path)

    # Collect "<word_id>\t<babelnet_sense>" lines and the bare senses
    id_bn_list = []
    # stands for predictions in {word_id babelnet_sense}
    _predictions = []
    for i, sentence in enumerate(tqdm(data_x, desc="Preparing models' predictions")):
        for j, word in enumerate(sentence):
            if len(mask_x[i][j]) != 2:  # Only 2-element mask entries are instances
                continue
            prediction = predictions[i][j]
            prediction_sense_ = tokenizer.index_word.get(prediction, '<OOV>')
            # NOTE(review): with `or`, the model's own prediction is kept only
            # when the token contains BOTH 'wn:' and 'bn:'. Confirm this is
            # the intended condition for the fallback.
            if 'wn:' not in prediction_sense_ or 'bn:' not in prediction_sense_:
                # Fallback Strategy
                prediction_sense = predict_multilingual_sense(word=word, word2idx=tokenizer.word_index,
                                                              lemma_synsets=lemma_synsets,
                                                              wordnet_babelnet=wordnet_babelnet_)
            else:
                prediction_sense = prediction_sense_[prediction_sense_.find('bn:'):]
            word_id = mask_x[i][j][1]
            if word_id is None:
                continue
            bn = prediction_sense if prediction_sense is not None else '<OOV>'
            id_bn_list.append(f'{word_id}\t{bn}')
            _predictions.append(bn)

    # Writing model predictions
    with open(output_path, encoding='utf-8', mode="w+") as output_file:
        for id_bn in tqdm(id_bn_list, desc="Writing model predictions"):
            output_file.write(f'{id_bn}\n')

    # Fetching the ground truth of the data (second whitespace-separated
    # field of every non-blank line of the gold key file)
    with open(gold_key_path, encoding='utf-8', mode='r') as ground_truth_file:
        ground_truth = [line.split()[1]
                        for line in ground_truth_file.read().splitlines()
                        if line.strip()]

    # Compute F1_Score
    _, _, f1score, _ = precision_recall_fscore_support(ground_truth, _predictions, average='micro')
    print(f'{model._name} F1_score: {f1score}')