示例#1
0
def evaluate(sess, model):
    """Score the model on the test and validation sets with ``conlleval``.

    The network is run one sentence at a time.  Predicted label ids, gold
    label ids and word ids are mapped back to strings and handed to
    ``conlleval``, which writes its report under ``folder``.

    All decoded sequences are built as real lists (not ``map`` objects) so
    the function behaves the same on Python 2 and Python 3, where ``map``
    returns a one-shot iterator.

    Args:
        sess: session object; its ``run`` method evaluates the prediction op.
        model: forwarded unchanged to ``inference`` to obtain the scores.

    Returns:
        Tuple ``(res_test, res_valid)`` with the ``conlleval`` results for the
        test and validation sets.
    """
    Yhat = inference(model)
    # Index of the largest score along axis 1.
    Ypredict = tf.argmax(Yhat, axis=1, output_type=tf.int32)

    def predict_labels(sentences):
        # One session run per sentence; the sentence is fed as a batch of one.
        return [
            [idx2label[idx]
             for idx in sess.run(Ypredict, feed_dict={x_input: [sentence]})]
            for sentence in sentences
        ]

    def decode(sequences, table):
        # Map every index of every sequence through the lookup table.
        return [[table[idx] for idx in sequence] for sequence in sequences]

    # Test set.
    predictions_test = predict_labels(test_lex)
    groundtruth_test = decode(test_y, idx2label)
    words_test = decode(test_lex, idx2word)

    # Validation set.
    predictions_valid = predict_labels(valid_lex)
    groundtruth_valid = decode(valid_y, idx2label)
    words_valid = decode(valid_lex, idx2word)

    # Compute the scores with conlleval.
    res_test = conlleval(predictions_test, groundtruth_test, words_test,
                         folder + "current.test.txt")
    res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid,
                          folder + "current.valid.txt")

    return res_test, res_valid
示例#2
0
def validate(model_filename):
    """Evaluate a saved model on the validation set and log its scores.

    Args:
        model_filename: passed to ``Process.load`` to restore the model.

    Returns:
        The metrics mapping produced by ``conlleval``; its ``'precision'``,
        ``'recall'`` and ``'f1'`` entries are the ones logged here.
    """
    word_model = CustomEmbedding()
    valid_set, indexes = word_model.valid_set, word_model.indexes

    # Only the index -> string tables are needed for evaluation.
    idx2w, idx2la = indexes['idx2w'], indexes['idx2la']
    valid_x, valid_label = valid_set

    log("Processing word indexes... ")

    words_val = [[idx2w[idx] for idx in sentence] for sentence in valid_x]
    groundtruth_val = [[idx2la[idx] for idx in labels]
                       for labels in valid_label]

    log("Done processing word indexes!")

    process = Process()
    process.load(model_filename)
    predword_val = process.validate(valid_set)

    metrics = conlleval(predword_val, groundtruth_val, words_val, 'diff.txt')

    log('Precision = {}, Recall = {}, F1 = {}'.format(metrics['precision'],
                                                      metrics['recall'],
                                                      metrics['f1']))
    return metrics
示例#3
0
        # View each sentence as a batch ..
        sent = sent[np.newaxis, :]
        if sent.shape[1] > 1:  # ignore 1 word sentences
            model.train_on_batch(sent, label)

    from metrics.accuracy import conlleval

    labels_pred_val = []
    bar = progressbar.ProgressBar(max_value=len(val_x))
    for n_batch, sent in bar(enumerate(val_x)):
        label = val_label[n_batch]
        label = np.eye(n_classes)[label][np.newaxis, :]
        sent = sent[np.newaxis, :]

        pred = model.predict_on_batch(sent)
        pred = np.argmax(pred, -1)[0]
        labels_pred_val.append(pred)

    labels_pred_val = [list(map(lambda x: idx2la[x], y)) \
                       for y in labels_pred_val]
    con_dict = conlleval(labels_pred_val, labels_val, words_val, 'measure.txt')

    print('Precision = {}, Recall = {}, F1 = {}'.format(
        con_dict['r'], con_dict['p'], con_dict['f1']))

    # model.fit(x=train_x, y=train_label, steps_per_epoch=1,callbacks=[checkpointer, tensorboard])

    model.save_weights(path + "MUSIC_LSTM-" + str(i) + ".h5")
    model_json = model.to_json()
    with open(path + "model_embed_lstm.json", "w") as jf:
        jf.write(model_json)
示例#4
0
        # evaluation // back into the real world : idx -> words
        # For every (sentence, feature) pair: build the context windows of
        # size s['win'], classify them together with the feature, and map the
        # predicted ids to label strings.
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32'), f) ) for x, f in zip (test_lex, test_feat)]

        # Gold labels and surface words, decoded from their indices.
        ground_truth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

        # Same decoding for the validation split.
        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32'), f)) for x, f in zip (valid_lex, valid_feat)]

        ground_truth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        # The conlleval output files are written under model_folder.
        res_test  = conlleval(predictions_test, ground_truth_test, words_test, model_folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, ground_truth_valid, words_valid, model_folder + '/current.valid.txt')

        # NOTE(review): the model is kept when the *test* F1 improves, i.e.
        # model selection uses the test set rather than the validation set --
        # confirm this is intentional.
        if res_test['f1'] > best_f1_test:
            rnn.save(model_folder)

            best_f1_test, best_f1_test_val = res_test['f1'], res_valid['f1']

            if s['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20

            # Remember the scores of the best epoch in the settings dict.
            s['vf1'], s['vp'], s['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            s['tf1'], s['tp'], s['tr'] = res_test['f1'],  res_test['p'],  res_test['r']

            # Index of the best epoch so far.
            s['be'] = e
示例#5
0
def main():
    settings = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': 1,
        'decay': False,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50
    }

    folder = os.path.basename(__file__).split('.')[0]

    if not os.path.exists(folder):
        os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(settings['fold'])
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex,  test_ne,  test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # instantiate the model
    numpy.random.seed(settings['seed'])
    random.seed(settings['seed'])

    if LOAD:
        print "Loading model from %s..." % folder

        rnn = ElmanRNNModel.load(folder)
    else:
        rnn = ElmanRNNModel(
            hidden_dims=settings['nhidden'],
            num_classes=nclasses,
            vocab_size=vocsize,
            embed_dims=settings['emb_dimension'],
            context_size=settings['win']
        )

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    settings['current_lr'] = settings['lr']
    for e in xrange(settings['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], settings['seed'])
        settings['current_epoch'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], settings['win'])

            words = map(
                lambda x: numpy.asarray(x).astype('int32'),
                minibatch(cwords, settings['bs'])
            )

            labels = train_y[i]

            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, settings['current_lr'])
                rnn.normalize()

            if settings['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [
            map(lambda x: idx2label[x],
                rnn.classify(numpy.asarray(contextwin(x, settings['win'])).astype('int32')))
            for x in test_lex
        ]

        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y ]

        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [
            map(
                lambda idx: idx2label[idx],
                rnn.classify(
                    numpy.asarray(contextwin(x, settings['win'])).astype('int32'))
            )
            for x in valid_lex
        ]

        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]

        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            if settings['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20
            settings['vf1'], settings['vp'], settings['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            settings['tf1'], settings['tp'], settings['tr'] = res_test['f1'],  res_test['p'],  res_test['r']
            settings['be'] = e
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ''

        # learning rate decay if no improvement in 10 epochs
        if settings['decay'] and abs(settings['be'] - settings['current_epoch']) >= 10:
            settings['current_lr'] *= 0.5

        if settings['current_lr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', e, 'valid F1', settings['vf1'], 'best test F1', settings['tf1'], 'with the model', folder
    def train(self):
        """Evaluate the saved Keras model on the development set.

        Despite its name, the training loop of this method is commented out.
        What runs is: prepare the data through ``data_helper``, load the model
        stored at ``filepath_model``, predict slot labels for every
        development sentence (one sentence per batch), score them with
        ``conlleval`` and, when F1 is above zero, write the architecture and
        weights to ``model_architecture.json`` / ``best_model_weights.h5``.

        The reported loss is the mean over the batches that were actually
        scored (sentences longer than one token).
        """
        # Prepare data
        sentence_train, slot_train, sentence_dev, slot_dev, vocab_sentence,\
            vocab_slot = data_helper.prepare_data(
                                    "data",
                                    sentence_training_file,
                                    slot_training_file,
                                    sentence_developing_file,
                                    slot_developing_file,
                                    from_vocabulary_size=2000,
                                    to_vocabulary_size=2000,
                                    tokenizer=None)
        sentence_developing, slot_developing = data_helper.read_data(
            sentence_dev, slot_dev, max_size=None)
        sentence_training, slot_training = data_helper.read_data(
            sentence_train, slot_train, max_size=None)

        ## TODO:
        #sentence_training, slot_training = sentence_training[:1000],\
        #    slot_training[:1000]

        # Dictionaries
        w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
            vocab_sentence)
        w2id_slot, id2w_slot = data_helper.initialize_vocabulary(vocab_slot)

        # For conlleval script: decode ids back to unicode strings.
        # words_train / labels_train are only referenced by the disabled
        # training code below.
        words_train = [
            [id2w_sentence[idx].decode('utf8') for idx in w]
            for w in sentence_training
        ]
        labels_train = [
            [id2w_slot[idx].decode('utf8') for idx in y]
            for y in slot_training
        ]
        words_val = [
            [id2w_sentence[idx].decode('utf8') for idx in w]
            for w in sentence_developing
        ]
        labels_val = [
            [id2w_slot[idx].decode('utf8') for idx in y]
            for y in slot_developing
        ]

        # Define model
        n_vocab = len(w2id_sentence)
        n_classes = len(w2id_slot)

        #model = Sequential()
        #model.add(Embedding(n_vocab,100))
        #model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
        #model.add(Dropout(0.25))
        #model.add(GRU(100,return_sequences=True))
        #model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
        #model.compile('rmsprop', 'categorical_crossentropy')

        ## Training
        ##n_epochs = 30
        #n_epochs = 1

        train_f_scores = []
        val_f_scores = []
        best_val_f1 = 0

        #print("Training =>")
        #train_pred_label = []
        #avgLoss = 0

        #for i in range(n_epochs):
        #    print("Training epoch {}".format(i))

        #    bar = progressbar.ProgressBar(max_value=len(sentence_training))
        #    for n_batch, sent in bar(enumerate(sentence_training)):
        #        label = slot_training[n_batch]
        #        # Make labels one hot
        #        label = np.eye(n_classes)[label][np.newaxis, :]
        #        # View each sentence as a batch
        #        sent = sent[np.newaxis, :]

        #        if sent.shape[1] > 1: #ignore 1 word sentences
        #            loss = model.train_on_batch(sent, label)
        #            avgLoss += loss

        #        pred = model.predict_on_batch(sent)
        #        pred = np.argmax(pred, -1)[0]
        #        train_pred_label.append(pred)

        #    avgLoss = avgLoss/n_batch

        #    predword_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        #                      for y in train_pred_label]
        #    con_dict = conlleval(predword_train, labels_train,
        #                         words_train, 'measure.txt')
        #    train_f_scores.append(con_dict['f1'])
        #    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
        #        avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))
        #    # Save model
        #    model.save(filepath_model)
        #    gc.collect()

        print("Validating =>")
        from keras.models import load_model
        model = load_model(filepath_model)

        labels_pred_val = []
        avgLoss = 0
        n_scored = 0  # batches that contributed to avgLoss

        bar = progressbar.ProgressBar(max_value=len(sentence_developing))
        for n_batch, sent in bar(enumerate(sentence_developing)):
            # One-hot gold labels, shaped as a batch of one sentence.
            label = slot_developing[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]

            if sent.shape[1] > 1:  #some bug in keras
                avgLoss += model.test_on_batch(sent, label)
                n_scored += 1

            pred = model.predict_on_batch(sent)
            # Most probable class per token of the single sentence.
            pred = np.argmax(pred, -1)[0]
            labels_pred_val.append(pred)

        # Average over the batches that were scored.  Dividing by the last
        # enumerate index was off by one and failed for a single sentence.
        avgLoss = avgLoss / n_scored if n_scored else 0.0
        gc.collect()

        predword_val = [
            [id2w_slot[idx].decode('utf8') for idx in y]
            for y in labels_pred_val
        ]
        con_dict = conlleval(predword_val, labels_val, words_val,
                             'measure.txt')
        val_f_scores.append(con_dict['f1'])
        # Values follow the label order: precision ('p'), then recall ('r').
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

        if con_dict['f1'] > best_val_f1:
            best_val_f1 = con_dict['f1']
            with open('model_architecture.json', 'w') as outf:
                outf.write(model.to_json())
            model.save_weights('best_model_weights.h5', overwrite=True)
            print("Best validation F1 score = {}".format(best_val_f1))
        print()
示例#7
0
def play_with_spelling():
    """Play with spelling mistakes"""
    print CONF
    np.random.seed(CONF['seed'])
    random.seed(CONF['seed'])
    print "Calculate output"
    session_files = get_session_files(number_of_files=CONF['number_of_files'], random_seed=CONF['seed'])
    sentences = get_sentences(session_files)
    print len(sentences)
    labels2idx = char2idx = get_char_to_idx(sentences)

    print "Prepare train, validation and test sets"
    train_valid_sentences, test_sentences = train_test_split(sentences, test_size=0.15, random_state=CONF['seed'])
    train_sentences, valid_sentences = train_test_split(train_valid_sentences, test_size=0.2, random_state=CONF['seed'])
    print len(train_valid_sentences), len(test_sentences)
    test_lex, test_y = create_tests(test_sentences, CONF['error_probability'], labels2idx, char2idx)
    valid_lex, valid_y = create_tests(valid_sentences, CONF['error_probability'], labels2idx, char2idx)
    train_lex = []
    train_y = []
    for error_probability in (CONF['error_probability'], CONF['error_probability'] / 10, CONF['error_probability'] / 100, 0):
        _train_idxes, _train_labels_idxes = create_tests(train_sentences, error_probability, labels2idx, char2idx)
        train_lex.extend(_train_idxes)
        train_y.extend(_train_labels_idxes)
#     train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_idxes, train_valid_labels_idxes, test_size=0.2, random_state=CONF['seed'])
    print len(train_lex), len(valid_lex), len(train_y), len(valid_y)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems()) # Reverse the dictionary
    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
    windowed_test_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in test_lex]
    windowed_valid_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in valid_lex]

    words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]
    groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
    words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex for item in sublist))
    nclasses = 1 + len(set(item for _y in (train_y, test_y, valid_y) for sublist in _y for item in sublist))
    nsentences = len(train_lex)

    words_lex = []
    for i in xrange(nsentences):
        cwords = contextwin(train_lex[i], CONF['win'])
        words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, CONF['batch_size'])]
        words_lex.append(words)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(nh=CONF['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=CONF['emb_dimension'],
                        cs=CONF['win'],)

    # train with early stopping on validation set
    best_f1 = -np.inf
    CONF['current_learning_rate'] = CONF['learning_rate']
    print "Start training"
    start_time = print_time = time.time()
    for epoch in xrange(CONF['nepochs']):
        # shuffle
        shuffle([words_lex, train_y], CONF['seed'])
        CONF['ce'] = epoch
        tic = time.time()
        percentage_of_sentences_to_train = (epoch + 1) / CONF['nepochs']
        numer_of_sentences_to_train = int(nsentences * percentage_of_sentences_to_train)
        print "starting an epoch, numer_of_sentences_to_train =", numer_of_sentences_to_train
        test_size = int(len(windowed_test_lex) * percentage_of_sentences_to_train)
        print "test_size", test_size
        validation_size = int(len(windowed_valid_lex) * percentage_of_sentences_to_train)
        print "validation_size", validation_size
        for _ in xrange(30): # Trauma!
            print "_", _
            for i in xrange(numer_of_sentences_to_train):
                words = words_lex[i]
                labels = train_y[i]
                for word_batch, label_last_word in zip(words, labels):
                    rnn.train(word_batch, label_last_word, CONF['current_learning_rate'])
                    rnn.normalize()
                if CONF['verbose'] and time.time() - print_time > 30:
                    print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / numer_of_sentences_to_train), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                    print_time = time.time()            

        # evaluation // back into the real world : idx -> words
        if CONF['verbose']:
            print "Classify test"
        predictions_test = [[idx2label[x] for x in rnn.classify(windowed_test_lex_item)]
                            for windowed_test_lex_item in windowed_test_lex[:test_size]]

        if CONF['verbose']:
            print "Classify validation"
        predictions_valid = [[idx2label[x] for x in rnn.classify(windowed_valid_lex_item)]
                             for windowed_valid_lex_item in windowed_valid_lex[:validation_size]]
        # evaluation // compute the accuracy using conlleval.pl
        if CONF['verbose']:
            print "Evaluate test and validation"
        res_test = conlleval(predictions_test, groundtruth_test[:test_size], words_test[:test_size], folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid[:validation_size], words_valid[:validation_size], folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            CONF['vf1'], CONF['vp'], CONF['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            CONF['tf1'], CONF['tp'], CONF['tr'] = res_test['f1'], res_test['p'], res_test['r']
            CONF['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20
#             rnn.load(folder)

        # learning rate decay if no improvement in 10 epochs
        if CONF['decay'] and abs(CONF['be'] - CONF['ce']) >= 10:
            CONF['current_learning_rate'] *= 0.5
        if CONF['current_learning_rate'] < 1e-5:
            break

    print 'BEST RESULT: epoch', CONF['be'], 'valid F1', best_f1, 'best test F1', CONF['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
示例#8
0
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = { # 'fold': 3, # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True, # decay on the learning rate if improvement stops
        'win': 15, # number of characters in the context window
        'bs': 5, # number of back-propagation through time steps
        'nhidden': 100, # number of hidden units
        'seed': 345,
        'emb_dimension': 30, # dimension of character embedding
        'nepochs': 10}
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    print "Calculate output"
    session_files = get_session_files(number_of_files=number_of_files, random_seed=conf['seed']) # Limit the scope To speed things up...
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(np.fromiter((char2idx[char] for char in sentence_out), dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex, train_valid_y, test_size=0.2, random_state=42)
    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems()) # Reverse the dictionary

#     vocsize = 1 + len(set(reduce(\
#                                  lambda x, y: list(x)+list(y),\
#                                  train_lex+valid_lex+test_lex)))
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex for item in sublist))
    nclasses = 2  #len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)
    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(nh=conf['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=conf['emb_dimension'],
                        cs=conf['win'],)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
        words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, 'best test F1', conf['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
示例#9
0
        ground_truth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [
            map(
                lambda x: idx2label[x],
                rnn.classify(
                    numpy.asarray(contextwin(x, s['win'])).astype('int32'), f))
            for x, f in zip(valid_lex, valid_feat)
        ]

        ground_truth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, ground_truth_test, words_test,
                             model_folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, ground_truth_valid,
                              words_valid, model_folder + '/current.valid.txt')

        if res_test['f1'] > best_f1_test:
            rnn.save(model_folder)

            best_f1_test, best_f1_test_val = res_test['f1'], res_valid['f1']

            if s['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid[
                    'f1'], 'best test F1', res_test['f1'], ' ' * 20

            s['vf1'], s['vp'], s['vr'] = res_valid['f1'], res_valid[
                'p'], res_valid['r']
            s['tf1'], s['tp'], s['tr'] = res_test['f1'], res_test[
        sent = sent[np.newaxis, :]
        #print(sent)
        if sent.shape[1] > 1:  #some bug in keras
            loss = model.train_on_batch(sent, label)
            avgLoss += loss

        pred = model.predict_on_batch(sent)
        pred = np.argmax(pred, -1)[0]
        #print(pred)
        train_pred_label.append(pred)

    avgLoss = avgLoss / n_batch
    predword_train = [
        list(map(lambda x: idx2la[x], y)) for y in train_pred_label
    ]
    con_dict = conlleval(predword_train, trainY, trainY, 'r.txt')
    train_f_scores.append(con_dict['f1'])
    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
        avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))

    print("Validating =>")

    val_pred_label = []
    avgLoss = 0

    bar = progressbar.ProgressBar(max_value=len(val_x))
    for n_batch, sent in bar(enumerate(val_x)):
        label = val_label[n_batch]
        label = np.eye(n_classes)[label][np.newaxis, :]
        sent = sent[np.newaxis, :]
示例#11
0
    def train(self):
        """Train the Conv1D + GRU slot tagger and validate it every epoch.

        Data comes from ``data_helper.read_data`` on the paths stored on
        ``self``.  Each sentence is fed as a batch of one.  After the
        training pass the model is saved to ``model_file``; after the
        validation pass, if validation F1 improved, the architecture and
        weights are written to ``model_architecture.json`` and
        ``best_model_weights.h5``.

        Per-epoch state (collected predictions, running loss) is reset at the
        start of every epoch so predictions stay aligned with the gold labels
        when ``n_epochs`` is greater than one.  Reported losses are means over
        the batches that were actually trained or scored.
        """
        sentence_developing, slot_developing = data_helper.read_data(
            self.sentence_dev, self.slot_dev, max_size=None)
        sentence_training, slot_training = data_helper.read_data(
            self.sentence_train, self.slot_train, max_size=None)

        # Make toy data; comment this block to train on the full dataset
        #n_toy = 1000
        #sentence_training, slot_training = sentence_training[:n_toy],\
        #    slot_training[:n_toy]
        #sentence_developing, slot_developing = sentence_developing[:round(n_toy/2)],\
        #    slot_developing[:round(n_toy/2)]

        # Dictionaries
        w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
            self.vocab_sentence)
        w2id_slot, id2w_slot = data_helper.initialize_vocabulary(
            self.vocab_slot)

        # For conlleval script: decode ids back to unicode strings.
        words_train = [
            [id2w_sentence[idx].decode('utf8') for idx in w]
            for w in sentence_training
        ]
        labels_train = [
            [id2w_slot[idx].decode('utf8') for idx in y]
            for y in slot_training
        ]
        words_val = [
            [id2w_sentence[idx].decode('utf8') for idx in w]
            for w in sentence_developing
        ]
        labels_val = [
            [id2w_slot[idx].decode('utf8') for idx in y]
            for y in slot_developing
        ]

        # Define model
        n_vocab = len(w2id_sentence)
        n_classes = len(w2id_slot)

        model = Sequential()
        model.add(Embedding(n_vocab, 100))
        model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
        model.add(Dropout(0.25))
        model.add(GRU(100, return_sequences=True))
        model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
        model.compile('rmsprop', 'categorical_crossentropy')

        # Training
        #n_epochs = 30
        n_epochs = 1

        train_f_scores = []
        val_f_scores = []
        best_val_f1 = 0

        print("Training =>")

        for i in range(n_epochs):
            print("Training epoch {}".format(i))

            # Reset per-epoch state.  Carrying predictions over from earlier
            # epochs would make the list longer than labels_train and
            # misalign the conlleval comparison.
            train_pred_label = []
            avgLoss = 0
            n_trained = 0  # batches that contributed to avgLoss

            bar = progressbar.ProgressBar(max_value=len(sentence_training))
            for n_batch, sent in bar(enumerate(sentence_training)):
                label = slot_training[n_batch]
                # Make labels one hot
                label = np.eye(n_classes)[label][np.newaxis, :]
                # View each sentence as a batch
                sent = sent[np.newaxis, :]

                if sent.shape[1] > 1:  #ignore 1 word sentences
                    avgLoss += model.train_on_batch(sent, label)
                    n_trained += 1

                pred = model.predict_on_batch(sent)
                # Most probable class per token of the single sentence.
                pred = np.argmax(pred, -1)[0]
                train_pred_label.append(pred)

            # Mean over trained batches (the last enumerate index is one
            # short of the count and is zero for a single sentence).
            avgLoss = avgLoss / n_trained if n_trained else 0.0

            predword_train = [
                [id2w_slot[idx].decode('utf8') for idx in y]
                for y in train_pred_label
            ]
            con_dict = conlleval(predword_train, labels_train, words_train,
                                 'measure.txt')
            train_f_scores.append(con_dict['f1'])
            # Values follow the label order: precision, then recall.
            print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
                avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))
            # Save model
            model.save(model_file)

            print("Validating =>")

            labels_pred_val = []
            avgLoss = 0
            n_scored = 0  # batches that contributed to avgLoss

            bar = progressbar.ProgressBar(max_value=len(sentence_developing))
            for n_batch, sent in bar(enumerate(sentence_developing)):
                label = slot_developing[n_batch]
                label = np.eye(n_classes)[label][np.newaxis, :]
                sent = sent[np.newaxis, :]

                if sent.shape[1] > 1:  #some bug in keras
                    avgLoss += model.test_on_batch(sent, label)
                    n_scored += 1

                pred = model.predict_on_batch(sent)
                pred = np.argmax(pred, -1)[0]
                labels_pred_val.append(pred)

            avgLoss = avgLoss / n_scored if n_scored else 0.0

            predword_val = [
                [id2w_slot[idx].decode('utf8') for idx in y]
                for y in labels_pred_val
            ]
            con_dict = conlleval(predword_val, labels_val, words_val,
                                 'measure.txt')
            val_f_scores.append(con_dict['f1'])
            print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
                avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

            # Keep the architecture and weights of the best validation epoch.
            if con_dict['f1'] > best_val_f1:
                best_val_f1 = con_dict['f1']
                with open('model_architecture.json', 'w') as outf:
                    outf.write(model.to_json())
                model.save_weights('best_model_weights.h5', overwrite=True)
                print("Best validation F1 score = {}".format(best_val_f1))
            print()

            # Prevent from tensorflow bugs: BaseSession.__del__
            gc.collect()
示例#12
0
def train(model=None, re_train=False):
    """Train the tagging model and save it whenever validation F1 improves.

    Args:
        model: Model handed to ``Process`` as-is when ``re_train`` is not
            ``False``. It is replaced by a freshly built network otherwise.
        re_train: When equal to ``False`` a new network is defined and
            compiled here; any other value trains the ``model`` passed in.

    The function returns nothing. After the last epoch (or after ctrl + c)
    ``clean()`` is invoked; on ctrl + c the interpreter then exits via
    ``sys.exit()``.
    """
    # re_train is forwarded so the embedding helper can skip rebuilding the
    # embeddings when an existing model is being re-trained.
    embedding = CustomEmbedding(re_train=re_train)

    train_set = embedding.train_set
    valid_set = embedding.valid_set
    indexes = embedding.indexes

    w2idx = indexes['w2idx']
    la2idx = indexes['la2idx']
    idx2w = indexes['idx2w']
    idx2la = indexes['idx2la']

    n_classes = len(idx2la)
    n_vocab = len(idx2w)

    train_x, train_label = train_set
    valid_x, valid_label = valid_set

    log("Processing word indexes... ")

    # Map the validation indexes back to words / labels for conlleval.
    words_val = [[idx2w[token] for token in sentence] for sentence in valid_x]
    groundtruth_val = [[idx2la[tag] for tag in tags] for tags in valid_label]

    log("Done processing word indexes!")

    if re_train == False:
        # Define the model: embedding -> conv -> dropout -> BiLSTM ->
        # self-attention -> per-step softmax -> CRF.
        layers = [
            embedding.EmbeddingLayer(),
            Conv1D(128, 5, padding="same", activation='relu'),
            Dropout(Config.DROPOUT),
            Bidirectional(
                LSTM(units=Config.EMBEDDING_SIZE,
                     dropout=Config.DROPOUT,
                     recurrent_dropout=Config.DROPOUT,
                     kernel_initializer=he_normal(),
                     return_sequences=True)),
            SeqSelfAttention(attention_activation='sigmoid'),
            TimeDistributed(Dense(n_classes, activation='softmax')),
            CRF(n_classes, sparse_target=False, learn_mode='join'),
        ]

        model = Sequential()
        for layer in layers:
            model.add(layer)

        model.compile(Config.OPTIMIZER,
                      Config.LOSS,
                      metrics=[crf_viterbi_accuracy])

    process = Process(model)

    def tidy_saved_models():
        # Shared by the normal and the interrupted exit paths.
        log('Cleaning /trained_model folder...')
        clean()
        log('Removed all other saved models, kept the best model only!')

    max_f1 = 0

    try:
        for epoch in range(1, Config.N_EPOCHS + 1):
            epoch_title = "Epoch " + str(epoch)
            log(epoch_title, display=False)
            highlight('violet', epoch_title)

            partition(80)

            log("Training ")
            process.train(train_set)

            log("Validating ")
            predword_val = process.validate(valid_set)

            # Score the predictions against the ground truth.
            metrics = conlleval(predword_val, groundtruth_val, words_val,
                                'diff.txt')

            log('Precision = {}, Recall = {}, F1 = {}'.format(
                metrics['precision'], metrics['recall'], metrics['f1']))

            if not metrics['f1'] > max_f1:
                continue

            # New best score: save the model under a name carrying the score.
            max_f1 = metrics['f1']
            model_name = '_'.join(
                ['trained_model', str(Config.FILE_PATTERN), str(max_f1)])
            process.save(model_name)
            log("New model saved!", display=False)

        best_message = 'Best validation F1 score : ' + str(max_f1)
        highlight('white', best_message)
        log(best_message, display=False)

        tidy_saved_models()

    except KeyboardInterrupt:
        # ctrl + c: clean up the saved models, then leave.
        log("\nTraining interrupted with ctrl + c ...")
        tidy_saved_models()

        sys.exit()
示例#13
0
        if sent.shape[1] > 1:  # ignore 1 word sentences
            model.train_on_batch(sent, label)

    from metrics.accuracy import conlleval

    labels_pred_val = []
    bar = progressbar.ProgressBar(max_value=len(train_encoded[851:]))
    for n_batch, sent in bar(enumerate(train_encoded[851:])):
        label = lbl_encoded[n_batch]
        label = np.eye(n_classes)[label][np.newaxis, :]
        sent = sent[np.newaxis, :]

        pred = model.predict_on_batch(sent)
        pred = np.argmax(pred, -1)[0]
        labels_pred_val.append(pred)

    labels_pred_val = [list(map(lambda x: idx2la[x], y)) \
                       for y in labels_pred_val]
    con_dict = conlleval(labels_pred_val, labels_train[851:],
                         words_train[851:], 'measure.txt')

    print('Precision = {}, Recall = {}, F1 = {}'.format(
        con_dict['r'], con_dict['p'], con_dict['f1']))

    # model.fit(x=train_x, y=train_label, steps_per_epoch=1,callbacks=[checkpointer, tensorboard])

    model.save_weights(path + "music_LSTM-" + str(i) + ".h5")
    model_json = model.to_json()
    with open(path + "model_embed_lstm.json", "w") as jf:
        jf.write(model_json)
示例#14
0
def run(params):
    """Train and evaluate the Elman attention tagger configured by ``params``.

    Keys read from ``params``: ``dataset`` (``'atis'``, ``'ner'``,
    ``'chunk'`` or ``'pos'``), ``fold`` (atis only), ``WVFolder``,
    ``WVModel`` (``emb_dimension``), ``seed``, ``nhidden``, ``attention``,
    ``h_win_left``, ``h_win_right``, ``lvrg``, ``nepochs``, ``dropRate``,
    ``verbose`` and ``JSONOutputFile``.

    Keys written to ``params``: ``measure``, ``WVFile``, ``WVVocabFile``,
    ``WVModel['emb_dimension']``, ``WVModel['vocab_size']``, ``results`` and
    ``running_time``. The updated ``params`` dict is serialised as JSON to
    ``params['JSONOutputFile']`` and echoed to stdout.

    Per-epoch evaluation files are written by ``conlleval`` into a folder
    named after this script.

    Raises:
        ValueError: if ``params['dataset']`` is not a supported dataset.
    """
    start_time = time.time()

    # Output folder for the conlleval files. It must stay distinct from the
    # word-vector folder(s) handled below, hence the separate `wv_folder`.
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)
    rhoList = numpy.array([100, 50]).astype(
        numpy.int32
    )  # 100,90,80,70,60,50,0 # combining forward and backward layers

    # load the dataset
    eval_options = []
    params['measure'] = 'F1score'
    dataset = params['dataset']
    if dataset == 'atis':
        train_set, valid_set, test_set, dic = loadData.atisfold(params['fold'])
    elif dataset == 'ner':
        train_set, valid_set, test_set, dic = loadData.ner()
    elif dataset == 'chunk':
        train_set, valid_set, test_set, dic = loadData.chunk()
    elif dataset == 'pos':
        train_set, valid_set, test_set, dic = loadData.pos()
        eval_options = ['-r']
        params['measure'] = 'Accuracy'
    else:
        raise ValueError('unknown dataset: %r' % (dataset,))

    # Reverse the lookup tables: index -> label / word.
    idx2label = {idx: label for label, idx in dic['labels2idx'].items()}
    idx2word = {idx: word for word, idx in dic['words2idx'].items()}

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    wv = None
    if params['WVFolder'] != 'random':
        if '[' in params['WVFolder'] and ']' in params['WVFolder']:
            # "[a,b]" -> {'a', 'b'}: several embedding folders whose vectors
            # are concatenated per word.
            # NOTE(review): eval() on a parameter string is unsafe if params
            # can come from untrusted input; consider parsing it explicitly.
            folderSet = set(
                eval(params['WVFolder'].replace('[', '[\'').replace(
                    ']', '\']').replace(',', '\',\'')))
            print(folderSet)
            emb_dim = params['WVModel']['emb_dimension']
            wv = numpy.zeros((vocsize + 1, emb_dim * len(folderSet)))

            for modelIndex, wv_folder in enumerate(folderSet):
                params['WVFile'] = wv_folder + '/' + 'words' + str(
                    emb_dim) + '.npy'
                params['WVVocabFile'] = wv_folder + '/' + 'words' + str(
                    emb_dim) + '.vocab'
                # load word vector
                wvnp = np.load(params['WVFile'])

                # load vocab
                with open(params['WVVocabFile']) as f:
                    vocab = [line.strip() for line in f if len(line) > 0]
                wi = {word: row for row, word in enumerate(vocab)}

                # One shared random vector for every word missing from this
                # embedding model.
                random_v = math.sqrt(
                    6.0 / numpy.sum(emb_dim)) * numpy.random.uniform(
                        -1.0, 1.0, (emb_dim))
                # Column slice of `wv` owned by this embedding model.
                lo = emb_dim * modelIndex
                hi = emb_dim * (modelIndex + 1)
                miss = 0  # the number of missing words in pre-trained word embeddings
                for i in range(0, vocsize):
                    word = idx2word[i]
                    if word in wi:
                        wv[i][lo:hi] = wvnp[wi[word]]
                    else:
                        wv[i][lo:hi] = random_v
                        miss += 1
                print("missing words rate : ", miss, '/', vocsize)
                params['WVModel']['vocab_size'] = len(vocab)

            params['WVModel']['emb_dimension'] *= len(folderSet)
        else:
            wv_folder = params['WVFolder']
            params['WVFile'] = wv_folder + '/' + 'words' + str(
                params['WVModel']['emb_dimension']) + '.npy'
            params['WVVocabFile'] = wv_folder + '/' + 'words' + str(
                params['WVModel']['emb_dimension']) + '.vocab'

            # load word vector; the real dimension comes from the file
            wvnp = np.load(params['WVFile'])
            params['WVModel']['emb_dimension'] = len(wvnp[0])
            emb_dim = params['WVModel']['emb_dimension']

            # load vocab
            with open(params['WVVocabFile']) as f:
                vocab = [line.strip() for line in f if len(line) > 0]
            wi = {word: row for row, word in enumerate(vocab)}
            wv = numpy.zeros((vocsize + 1, emb_dim))
            random_v = math.sqrt(
                6.0 / numpy.sum(emb_dim)) * numpy.random.uniform(
                    -1.0, 1.0, (emb_dim))

            miss = 0  # the number of missing words in pre-trained word embeddings
            for i in range(0, vocsize):
                word = idx2word[i]
                if word in wi:
                    wv[i] = wvnp[wi[word]]
                else:
                    wv[i] = random_v
                    miss += 1
            print("missing words rate : ", miss, '/', vocsize)
            params['WVModel']['vocab_size'] = len(vocab)

    print(json.dumps(params, sort_keys=True, indent=4, separators=(',', ': ')))

    # One result slot per rho value, keyed e.g. "100%_forward".
    rhoSuffix = "%_forward"
    rho_keys = [str(rho) + rhoSuffix for rho in rhoList]
    best_valid = {key: -numpy.inf for key in rho_keys}
    best_test = {key: -numpy.inf for key in rho_keys}
    validMeasureList = {key: [] for key in rho_keys}
    testMeasureList = {key: []
                       for key in rho_keys}  # this is used for drawing line chart.

    # instanciate the model
    numpy.random.seed(params['seed'])
    random.seed(params['seed'])
    rnn = elman_attention.model(nh=params['nhidden'],
                                nc=nclasses,
                                ne=vocsize,
                                de=params['WVModel']['emb_dimension'],
                                attention=params['attention'],
                                h_win=(params['h_win_left'],
                                       params['h_win_right']),
                                lvrg=params['lvrg'],
                                wv=wv)

    def predict_labels(sentences):
        # For every sentence: one decoded label sequence per rho value.
        # Real lists are built (not map objects) so they can be re-read.
        return [[[idx2label[idx] for idx in rho_prediction]
                 for rho_prediction in rnn.classify(
                     numpy.asarray(contextwin(x)).astype('int32'),
                     params['dropRate'], 0, rhoList)] for x in sentences]

    # The test and validation sets are never shuffled, so their decoded
    # ground truth and words are computed once.
    groundtruth_test = [[idx2label[idx] for idx in y] for y in test_y]
    words_test = [[idx2word[idx] for idx in w] for w in test_lex]
    groundtruth_valid = [[idx2label[idx] for idx in y] for y in valid_y]
    words_valid = [[idx2word[idx] for idx in w] for w in valid_lex]

    # train
    for e in range(params['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], params['seed'])

        tic = time.time()
        for i in range(nsentences):
            cwords = contextwin(train_lex[i])
            labels = train_y[i]

            rnn.train(cwords, labels, params['dropRate'], 1)

            # rnn.normalize()
            if params['verbose']:
                sys.stdout.write(
                    ('\r[learning] epoch %i >> %2.2f%%' %
                     (e, (i + 1) * 100. / nsentences) +
                     ('  average speed in %.2f (min) <<' %
                      ((time.time() - tic) / 60 / (i + 1) * nsentences)) +
                     (' completed in %.2f (sec) <<' % ((time.time() - tic)))))
                sys.stdout.flush()

        print('start test', time.time() / 60)

        print('start pred train', time.time() / 60)
        predictions_train = predict_labels(train_lex)
        predictions_test = predict_labels(test_lex)
        predictions_valid = predict_labels(valid_lex)

        # The training set was shuffled above, so decode it every epoch.
        groundtruth_train = [[idx2label[idx] for idx in y] for y in train_y]
        words_train = [[idx2word[idx] for idx in w] for w in train_lex]

        for i_rho, key in enumerate(rho_keys):
            ptrain = [p[i_rho] for p in predictions_train]
            ptest = [p[i_rho] for p in predictions_test]
            pvalid = [p[i_rho] for p in predictions_valid]

            res_train = conlleval(
                ptrain, groundtruth_train, words_train, folder +
                '/current.train.txt' + str(i_rho) + str(params['seed']),
                eval_options)
            res_test = conlleval(
                ptest, groundtruth_test, words_test, folder +
                '/current.test.txt' + str(i_rho) + str(params['seed']),
                eval_options)
            res_valid = conlleval(
                pvalid, groundtruth_valid, words_valid, folder +
                '/current.valid.txt' + str(i_rho) + str(params['seed']),
                eval_options)

            # NOTE(review): the value labelled 'best test' is this epoch's
            # test measure, not the best one so far.
            print('                                     epoch', e, ' rhoList ',
                  i_rho, '  train p', res_train['p'], 'valid p',
                  res_valid['p'], '  train r', res_train['r'], 'valid r',
                  res_valid['r'], '  train ', params['measure'],
                  res_train['measure'], 'valid ', params['measure'],
                  res_valid['measure'], 'best test ', params['measure'],
                  res_test['measure'], ' ' * 20)

            validMeasureList[key].append(res_valid['measure'])
            testMeasureList[key].append(res_test['measure'])

            # Model selection is done on the validation measure; the test
            # measure of that same epoch is recorded alongside it.
            if res_valid['measure'] > best_valid[key]:
                best_valid[key] = res_valid['measure']
                best_test[key] = res_test['measure']

        for i_rho, key in enumerate(
                rho_keys):  # this is used for drawing line chart.
            print(i_rho, params['dataset'], end=' ')
            for v in testMeasureList[key]:
                print(v, end=' ')
            print('')

        for rho, key in zip(rhoList, rho_keys):
            print('current best results', rho, ' ', best_valid[key], '/',
                  best_test[key])

    end_time = time.time()

    params['results'] = {}
    params['results']['best_valid_' + params['measure']] = best_valid
    params['results']['best_test_' + params['measure']] = best_test
    params['results']['valid_' + params['measure'] +
                      'ListBasedOnEpochs'] = validMeasureList
    params['results']['test_' + params['measure'] +
                      'ListBasedOnEpochs'] = testMeasureList
    params['running_time'] = {}
    params['running_time']['start'] = time.strftime(
        "%Y-%m-%d %H:%M:%S", time.localtime(start_time))
    params['running_time']['end'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                  time.localtime(end_time))
    params['running_time']['duration'] = end_time - start_time

    # json.dump() returns None, so serialise once with dumps() and reuse the
    # text for both the file and the console echo.
    res = json.dumps(params,
                     sort_keys=True,
                     indent=4,
                     separators=(',', ': '))
    with open(params['JSONOutputFile'], 'w') as outputFile:
        outputFile.write(res)
    print(res)
示例#15
0
        sent = sent[np.newaxis, :]

        # if sent.shape[1] > 1:  # some bug in keras
        #     loss = model.train_on_batch(sent, label)
        #     avgLoss += loss

        pred = model.predict_on_batch(sent)
        pred = np.argmax(pred, -1)[0]
        train_pred_label.append(pred)

    avgLoss = avgLoss / n_batch

    predword_train = [
        list(map(lambda x: idx2la[x], y)) for y in train_pred_label
    ]
    con_dict = conlleval(predword_train, groundtruth_train, words_train,
                         'r.txt')
    train_f_scores.append(con_dict['f1'])
    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
        avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))

    print("Validating =>")

    val_pred_label = []
    avgLoss = 0

    bar = progressbar.ProgressBar(max_value=len(val_x))
    for n_batch, sent in bar(enumerate(val_x)):
        label = val_label[n_batch]
        label = np.eye(n_classes)[label][np.newaxis, :]
        sent = sent[np.newaxis, :]
示例#16
0
        label = train_label[n_batch]
        label = np.eye(n_classes)[label][np.newaxis,:]
        sent = sent[np.newaxis,:]
        
        if sent.shape[1] > 1: #some bug in keras
            loss = model.train_on_batch(sent, label)
            avgLoss += loss

        pred = model.predict_on_batch(sent)
        pred = np.argmax(pred,-1)[0]
        train_pred_label.append(pred)

    avgLoss = avgLoss/n_batch
    
    predword_train = [ list(map(lambda x: idx2la[x], y)) for y in train_pred_label]
    con_dict = conlleval(predword_train, groundtruth_train, words_train, 'r.txt')
    train_f_scores.append(con_dict['f1'])
    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))
    
    
    print("Validating =>")
    
    val_pred_label = []
    avgLoss = 0
    
    bar = progressbar.ProgressBar(max_value=len(val_x))
    for n_batch, sent in bar(enumerate(val_x)):
        label = val_label[n_batch]
        label = np.eye(n_classes)[label][np.newaxis,:]
        sent = sent[np.newaxis,:]
        
示例#17
0
def prepare_data():
    """Prepare the data"""
    conf = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': True,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 300,  # dimension of word embedding
        'nepochs': 50
    }
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    session_files = get_session_files(
        number_of_files=None,
        random_seed=conf['seed'])  # Limit the scope To speed things up...
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []
    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]
    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(
            np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
        #         token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(
            np.fromiter(
                (words2idx.get(token, unknown) for token in token_list),
                dtype=np.int32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex,
                                                              train_valid_y,
                                                              test_size=0.2,
                                                              random_state=42)

    idx2label = dict(
        (k, v) for v, k in LABELS2IDX.iteritems())  # Reverse the dictionary
    idx2word = dict(
        (k, v) for v, k in words2idx.iteritems())  # Reverse the dictionary

    vocsize = len(idx2word)

    nclasses = len({label for labels in labels_idxes for label in labels})
    # nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))

    nsentences = len(train_lex)
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME,
                                             binary=True)  # C binary format

    print "Calculate word embeddings"
    embeddings = 0.2 * np.random.uniform(
        -1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(
            theano.config.floatX
        )  # add one for PADDING at the end @UndefinedVariable
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx]  # Keep it random
        embeddings[idx] = embedding

    del word2vec  # It is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                    nc=nclasses,
                    ne=vocsize,
                    de=conf['emb_dimension'],
                    cs=conf['win'],
                    embeddings=embeddings)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [
                np.asarray(x).astype(np.int32)
                for x in minibatch(cwords, conf['bs'])
            ]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])


#                 rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (
                    epoch, (i + 1) * 100. / nsentences
                ), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid,
                              words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid[
                'f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid[
                'p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test[
                'p'], res_test['r']
            conf['be'] = epoch
            subprocess.call([
                'mv', folder + '/current.test.txt', folder + '/best.test.txt'
            ])
            subprocess.call([
                'mv', folder + '/current.valid.txt', folder + '/best.valid.txt'
            ])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid[
                'f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', epoch, 'valid F1', res_valid[
        'f1'], 'best test F1', res_test['f1'], 'with the model', folder
示例#18
0
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = {  # 'fold': 3, # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 15,  # number of characters in the context window
        'bs': 5,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 30,  # dimension of character embedding
        'nepochs': 10
    }
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    print "Calculate output"
    session_files = get_session_files(
        number_of_files=number_of_files,
        random_seed=conf['seed'])  # Limit the scope To speed things up...
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(
            np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(
            np.fromiter((char2idx[char] for char in sentence_out),
                        dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex,
                                                              train_valid_y,
                                                              test_size=0.2,
                                                              random_state=42)
    print "Some more prep"
    idx2label = dict(
        (k, v) for v, k in labels2idx.iteritems())  # Reverse the dictionary
    idx2word = dict(
        (k, v) for v, k in char2idx.iteritems())  # Reverse the dictionary

    #     vocsize = 1 + len(set(reduce(\
    #                                  lambda x, y: list(x)+list(y),\
    #                                  train_lex+valid_lex+test_lex)))
    vocsize = 1 + len(
        set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex
            for item in sublist))
    nclasses = 2  #len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)
    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(
        nh=conf['nhidden'],
        nc=nclasses,
        ne=vocsize,
        de=conf['emb_dimension'],
        cs=conf['win'],
    )

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [
                np.asarray(x).astype(np.int32)
                for x in minibatch(cwords, conf['bs'])
            ]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (
                    epoch, (i + 1) * 100. / nsentences
                ), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid,
                              words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid[
                'f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid[
                'p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test[
                'p'], res_test['r']
            conf['be'] = epoch
            subprocess.call([
                'mv', folder + '/current.test.txt', folder + '/best.test.txt'
            ])
            subprocess.call([
                'mv', folder + '/current.valid.txt', folder + '/best.valid.txt'
            ])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid[
                'f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf[
        'be'], 'valid F1', best_f1, 'best test F1', conf[
            'tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
示例#19
0
def prepare_data():
    """Prepare the data"""
    conf = {'fold': 3, # 5 folds 0,1,2,3,4
            'lr': 0.0627142536696559,
            'verbose': True,
            'decay': True, # decay on the learning rate if improvement stops
            'win': 7, # number of words in the context window
            'bs': 9, # number of back-propagation through time steps
            'nhidden': 100, # number of hidden units
            'seed': 345,
            'emb_dimension': 300, # dimension of word embedding
            'nepochs': 50}
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    session_files = get_session_files(number_of_files=None, random_seed=conf['seed']) # Limit the scope To speed things up...
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []
    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]
    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
#         token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(np.fromiter((words2idx.get(token, unknown) for token in token_list), dtype=np.int32))



    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    idx2label = dict((k, v) for v, k in LABELS2IDX.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in words2idx.iteritems()) # Reverse the dictionary

    vocsize = len(idx2word)

    nclasses = len({label for labels in labels_idxes for label in labels})
    # nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))

    nsentences = len(train_lex)
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME, binary=True) # C binary format

    print "Calculate word embeddings"
    embeddings = 0.2 * np.random.uniform(-1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(theano.config.floatX) # add one for PADDING at the end @UndefinedVariable
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx] # Keep it random
        embeddings[idx] = embedding

    del word2vec # It is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                nc=nclasses,
                ne=vocsize,
                de=conf['emb_dimension'],
                cs=conf['win'],
                embeddings=embeddings)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
#                 rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
        words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], 'with the model', folder