def filter_low_segment_documents(self, documents):
    """Drop documents whose average segment length is below the threshold.

    Each document is unpacked with ``zip(*sample)`` into exactly two
    columns (sentences and ground truths). The ground-truth column is handed
    to ``compute_avg_seg_len`` and the document is kept only when the result
    is at least ``MIN_TRAIN_AVG_SEGMENT_LENGTH``. A one-line summary with the
    number of dropped documents is printed.

    Args:
        documents: iterable of documents, each an iterable of
            ``(sentence, ground_truth)`` pairs.

    Returns:
        A new list holding the kept documents in their original order.
    """
    new_documents = []
    skipped = 0
    for sample in documents:
        # Transpose the pairs; only the ground-truth column is needed here.
        _, ground_truths = zip(*sample)
        if compute_avg_seg_len(ground_truths) >= MIN_TRAIN_AVG_SEGMENT_LENGTH:
            new_documents.append(sample)
        else:
            skipped += 1
    # Single-argument print call: same output on Python 2 and Python 3.
    print("Skipped %d documents due to MIN_TRAIN_AVG_SEGMENT_LENGTH=%d" %
          (skipped, MIN_TRAIN_AVG_SEGMENT_LENGTH))
    return new_documents
def custom_fit(X_train, Y_train, X_test, Y_test, model, batch_size, epochs=10):
    """Train ``model`` on (left, mid, right) context batches.

    Prints corpus statistics, optionally prints per-class label frequencies,
    then runs ``model.train_on_batch`` on every batch produced by
    ``batch_gen_consecutive_context_segments_from_big_seq`` for each epoch,
    updating the progress bar after every batch.

    Module-level names read here: ``SCALE_LOSS_FUN``,
    ``LOAD_SAVED_MODEL_AND_CONTINUE_TRAIN``, ``saved_model_epoch_done``,
    ``ONE_SIDE_CONTEXT_SIZE``.

    NOTE: this file defines ``custom_fit`` a second time further down; the
    later definition replaces this one when the module is loaded.

    Args:
        X_train: array of documents; ``X_train.shape[0]`` is the document
            count and each document exposes ``shape[0]`` as its sentence
            count.
        Y_train: per-document label arrays aligned with ``X_train``.
        X_test: not used by this function.
        Y_test: not used by this function.
        model: object exposing ``train_on_batch(inputs, targets)`` returning
            three values, treated here as (loss, accuracy, recall).
        batch_size: batch size forwarded to the batch generator.
        epochs: index of the last epoch, exclusive (default 10).

    Returns:
        None.
    """
    # Print train stats.
    total_documents = X_train.shape[0]
    total_sentences = sum(doc.shape[0] for doc in X_train)
    print("X-wiki TRAIN stats: Total %d sentences in %d documents" % (
        total_sentences, total_documents))

    # Computed and printed only; it is not passed to train_on_batch below.
    class_weight = None
    if SCALE_LOSS_FUN:
        # Documents differ in sentence count, so flatten every label array
        # and count once over the concatenation. Counting per document and
        # summing positionally gives wrong totals when some document lacks
        # one of the classes.
        all_labels = np.concatenate([np.ravel(Yi) for Yi in Y_train])
        classes, counts = np.unique(all_labels, return_counts=True)
        class_weight = dict(
            zip(classes.tolist(), counts / float(counts.sum())))
        print(class_weight)

    train_avg_seg_len = np.mean(
        [helper.compute_avg_seg_len(Yi) for Yi in Y_train], axis=0)
    print(">> Train AVG_SEGMENT_LENGTH: %s" % (train_avg_seg_len,))

    print('Train...')
    start_epoch = 0
    if LOAD_SAVED_MODEL_AND_CONTINUE_TRAIN:
        # Resume after the last finished epoch. The epoch count is zero
        # indexed here, while the count in the saved file is 1 indexed.
        start_epoch = saved_model_epoch_done

    for epoch in range(start_epoch, epochs):
        # Per-batch metrics for this epoch (collected, not reported here).
        mean_tr_acc, mean_tr_loss, mean_tr_rec = [], [], []
        batches = batch_gen_consecutive_context_segments_from_big_seq(
            "train", X_train, Y_train, batch_size, ONE_SIDE_CONTEXT_SIZE)
        for batch_count, (batch_X_left, batch_X_mid, batch_X_right,
                          batch_Y_mid) in enumerate(batches):
            try:
                # Convert the labels to a 2-class categorical target.
                batch_Y_mid = to_categorical(batch_Y_mid, nb_classes=2)

                start = time.time()
                tr_loss, tr_acc, tr_rec = model.train_on_batch(
                    [batch_X_left, batch_X_mid, batch_X_right], batch_Y_mid)
                speed = time.time() - start

                mean_tr_acc.append(tr_acc)
                mean_tr_loss.append(tr_loss)
                mean_tr_rec.append(tr_rec)
                progbar.prog_bar(True,
                                 total_sentences,
                                 epochs,
                                 batch_size,
                                 epoch,
                                 batch_count,
                                 speed=speed,
                                 data={
                                     'Loss': tr_loss,
                                     'Acc': tr_acc,
                                     'Rec': tr_rec
                                 })

            except (KeyboardInterrupt, SystemExit):
                # A tuple is required to catch both exception types; the
                # form "except A, B:" only catches A and binds it to B.
                # NOTE(review): the prompt mentions exiting, but control
                # simply falls through to the next batch afterwards.
                print("")
                print("########################################################")
                print("######  Pausing execution. Press ENTER to continue #####")
                print("########################################################")
                out = raw_input(
                    'Enter "pdb" to get prompt or ENTER to exit.> ')
                if out == "pdb":
                    pdb.set_trace()
            except Exception as e:
                # Best effort: report the failed batch and keep training.
                print(e)
                print(">>>>> Is it intentional ?")
# Example no. 3
# 0
def custom_fit(X_train, Y_train, X_test, Y_test, model, batch_size, epochs=10):
    """Train ``model`` document by document, evaluating after every epoch.

    For each epoch, every training document is split into batches by
    ``batch_gen_sentences_without_context`` and fed to
    ``model.train_on_batch``. After the epoch the model is optionally saved,
    the mean training metrics are printed and ``testing_on_data`` is run on
    several datasets. Once all epochs are done, the model is evaluated
    batch-wise on ``X_test`` / ``Y_test`` and the mean metrics are printed.

    Module-level names read here: ``SCALE_LOSS_FUN``,
    ``LOAD_SAVED_MODEL_AND_CONTINUE_TRAIN``, ``saved_model_epoch_done``,
    ``SAVE_MODEL_AFTER_EACH_EPOCH``, ``TRAINABLE_EMBEDDINGS`` and the
    evaluation sets ``X_cli``/``Y_cli``, ``X_fic``/``Y_fic``,
    ``X_wikitest``/``Y_wikitest``.

    Args:
        X_train: array of documents; ``X_train.shape[0]`` is the document
            count and each document exposes ``shape[0]`` as its sentence
            count.
        Y_train: per-document label arrays aligned with ``X_train``.
        X_test: development documents (array with a ``shape`` attribute).
        Y_test: labels aligned with ``X_test``.
        model: object exposing ``train_on_batch``, ``test_on_batch`` (each
            returning three values, treated here as loss, accuracy, recall)
            and ``save``.
        batch_size: batch size forwarded to the batch generator.
        epochs: index of the last epoch, exclusive (default 10).

    Returns:
        None.
    """
    # Print train stats.
    total_documents = X_train.shape[0]
    total_sentences = sum(doc.shape[0] for doc in X_train)
    print("X-wiki TRAIN stats: Total %d sentences in %d documents" % (
        total_sentences, total_documents))

    # Computed and printed only; it is not passed to train_on_batch below.
    class_weight = None
    if SCALE_LOSS_FUN:
        # Documents differ in sentence count, so flatten every label array
        # and count once over the concatenation. Counting per document and
        # summing positionally gives wrong totals when some document lacks
        # one of the classes.
        all_labels = np.concatenate([np.ravel(Yi) for Yi in Y_train])
        classes, counts = np.unique(all_labels, return_counts=True)
        class_weight = dict(
            zip(classes.tolist(), counts / float(counts.sum())))
        print(class_weight)

    train_avg_seg_len = np.mean(
        [helper.compute_avg_seg_len(Yi) for Yi in Y_train], axis=0)
    print(">> Train AVG_SEGMENT_LENGTH: %s" % (train_avg_seg_len,))

    print('Train...')
    start_epoch = 0
    if LOAD_SAVED_MODEL_AND_CONTINUE_TRAIN:
        # Resume after the last finished epoch. The epoch count is zero
        # indexed here, while the count in the saved file is 1 indexed.
        start_epoch = saved_model_epoch_done

    for epoch in range(start_epoch, epochs):
        mean_tr_acc, mean_tr_loss, mean_tr_rec = [], [], []
        batch_count = 0  # Running batch index across all documents.
        for X, Y in zip(X_train, Y_train):
            for batch_X, batch_Y in batch_gen_sentences_without_context(
                    X, Y, batch_size, fixed_size=False):
                # Convert to output as 2 classes.
                batch_Y = to_categorical(batch_Y, nb_classes=2)

                start = time.time()
                tr_loss, tr_acc, tr_rec = model.train_on_batch([batch_X],
                                                               batch_Y)
                speed = time.time() - start

                mean_tr_acc.append(tr_acc)
                mean_tr_loss.append(tr_loss)
                mean_tr_rec.append(tr_rec)
                progbar.prog_bar(True,
                                 total_sentences,
                                 epochs,
                                 batch_size,
                                 epoch,
                                 batch_count,
                                 speed=speed,
                                 data={
                                     'Loss': tr_loss,
                                     'Acc': tr_acc,
                                     'Rec': tr_rec
                                 })
                batch_count += 1

        progbar.end()
        if SAVE_MODEL_AFTER_EACH_EPOCH:
            # File name carries the 1-indexed epoch number.
            model.save("model_trainable_%s_epoc_%d.h5" %
                       (str(TRAINABLE_EMBEDDINGS), epoch + 1))

        print(">> Epoch: %d/%d" % (epoch + 1, epochs))
        print('accuracy training = {}'.format(np.mean(mean_tr_acc)))
        print('recall training = {}'.format(np.mean(mean_tr_rec)))
        print('loss training = {}'.format(np.mean(mean_tr_loss)))

        testing_on_data("Wikipedia(DEVELOPMENT)",
                        X_test,
                        Y_test,
                        model,
                        batch_size,
                        summary_only=True)
        testing_on_data("Clinical",
                        X_cli,
                        Y_cli,
                        model,
                        batch_size,
                        summary_only=True)
        testing_on_data("Fiction",
                        X_fic,
                        Y_fic,
                        model,
                        batch_size,
                        summary_only=True)
        testing_on_data("Wikipedia(BENCHMARK)",
                        X_wikitest,
                        Y_wikitest,
                        model,
                        batch_size,
                        summary_only=True)

        print('___________________________________')

    # Testing
    print("####################################################################")
    print(">> (TEST) >> Testing, X: %s Y: %s" % (X_test.shape, Y_test.shape))
    mean_te_acc, mean_te_loss, mean_te_rec = [], [], []
    for X, Y in zip(X_test, Y_test):
        for batch_X, batch_Y in batch_gen_sentences_without_context(
                X, Y, batch_size, fixed_size=False):
            # Training feeds 2-class categorical targets, so the evaluation
            # targets must be encoded the same way.
            batch_Y = to_categorical(batch_Y, nb_classes=2)
            te_loss, te_acc, te_rec = model.test_on_batch([batch_X], batch_Y)
            mean_te_acc.append(te_acc)
            mean_te_loss.append(te_loss)
            mean_te_rec.append(te_rec)

    print('accuracy testing = {}'.format(np.mean(mean_te_acc)))
    print('recall testing = {}'.format(np.mean(mean_te_rec)))
    print('loss testing = {}'.format(np.mean(mean_te_loss)))