# print_iter_count += 1 except KeyboardInterrupt, SystemExit: print "" print "########################################################" print "###### Pausing execution. Press ENTER to continue #####" print "########################################################" out = raw_input( 'Enter "pdb" to get prompt or ENTER to exit.> ') if out == "pdb": pdb.set_trace() except Exception as e: print e print ">>>>> Is it intentional ?" progbar.end() if SAVE_MODEL_AFTER_EACH_EPOCH: model.save("model_trainable_%s_epoc_%d.h5" % (str(TRAINABLE_EMBEDDINGS), epoch + 1)) print ">> Epoch: %d/%d" % (epoch + 1, epochs) print('accuracy training = {}'.format(np.mean(mean_tr_acc))) print('recall training = {}'.format(np.mean(mean_tr_rec))) print('loss training = {}'.format(np.mean(mean_tr_loss))) testing_on_data("Wikipedia(DEVELOPMENT)", X_test, Y_test, model, batch_size, summary_only=True)
def get_input(sample_type, shuffle_documents, pad, trained_sent2vec_model=None):
    """Load the samples for `sample_type` and convert them to vector form.

    Args:
        sample_type: Integer selector for the data source (1-7); the branch
            comments below describe what each value loads.
        shuffle_documents: When truthy, X and Y are shuffled together with
            `unison_shuffled_copies`.
        pad: When truthy, X and Y are post-padded with 0.0 through
            `pad_sequences`.
        trained_sent2vec_model: Optional, already built sentence-vector model.
            When it is falsy a new `CustomSent2vec()` is created.

    Returns:
        A tuple `(sample_type, X, Y, model)`:
          * `sample_type` is the value handed back by the loader,
          * `X` has one entry per sample (the converted sentences),
          * `Y` has the matching ground-truth array per sample,
          * `model` is the sentence-vector model that did the conversion.
        For an unknown `sample_type` a note is printed and `None` is returned,
        so callers that unpack the result must check for `None` first.
    """
    start = time.time()
    data_handler = DataHandler()

    print("===========================================")
    if sample_type == 1:
        # NOT SURE ABOUT THIS TYPE! (original author's remark)
        # Each sample is a set of INPUT_VECTOR_LENGTH consecutive sentences.
        # No document information is captured.
        sample_type, samples = data_handler.get_samples()
    elif sample_type == 2:
        ld = load_data.LoadData()
        sample_type, samples = ld.load_wikipedia_sequence()
    elif sample_type == 3:
        # The original condition was `sample_type in (2, 3)`, but type 2 is
        # already consumed by the branch above, so only type 3 ever got here.
        # type2 : each sample is a document (a set of sentences resulting in a
        #         sequence), i.e. (NUM_DOCUMENTS, NUM_SENTENCES, SENTENCE)
        # type3 : same as type2, but the samples are merged to drop the
        #         sequence information and treated as plain sentence
        #         classification, i.e. (TOTAL_NUM_SENTENCES, SENTENCE).
        #         That processing is done in cnn_clssifier.py itself.
        sample_type, samples = data_handler.get_sequence_samples(sample_type)
    elif sample_type == 4:
        # type4: Clinical sequence of multiple samples
        # X.shape = (MULTIPLE_SAMPLES, TOTAL_SENTENCES)
        # Y.shape = (MULTIPLE_SAMPLES, TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_clinical_sequence()
    elif sample_type == 5:
        # type5: Biography sequence of a single sample
        # X.shape = (1, TOTAL_SENTENCES)
        # Y.shape = (TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_biography_sequence()
    elif sample_type == 6:
        # type6: Fiction sequence of multiple documents
        # X.shape = (NO_OF_BOOKS, TOTAL_SENTENCES)
        # Y.shape = (NO_OF_BOOKS, TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_fiction_sequence()
    elif sample_type == 7:
        # type7: Wiki sequence of multiple samples. The data format is just
        # like the clinical sequence, as each line is a sentence.
        # X.shape = (MULTIPLE_DOCUMENTS, TOTAL_SENTENCES)
        # Y.shape = (MULTIPLE_DOCUMENTS, TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_wikipedia_sequence()
    else:
        print("NOTE: INVALID SAMPLE_TYPE!")
        return None

    del data_handler
    print("Samples Loading took %s seconds" % (time.time() - start))

    # Re-use the caller's model when given; otherwise build the default one.
    # (The original file also lists TFIDF(samples), MeanWord2vec() and
    # TFIDFweightedMeanWord2vec(samples) as commented-out alternatives.)
    model = trained_sent2vec_model
    if not trained_sent2vec_model:
        model = CustomSent2vec()

    X, Y = [], []
    _total_samples, _start_time = len(samples), time.time()
    print(len(samples))

    # Debug switch: when enabled, the first 300 documents are dumped as
    # "wiki_save/wiki_test_<n>.ref" files instead of being vectorised.
    CREATE_WIKI_TEST_SET = False

    for _idx, sample in enumerate(samples):
        # Each sample is a document: a list of (sentence, groundTruth) tuples.
        sentences, groundTruths = zip(*sample)  # Unpack a sample

        if CREATE_WIKI_TEST_SET:
            wiki_prefix = "wiki_save/wiki_test"
            if _idx >= 300:
                break
            with open(wiki_prefix + "_" + str(_idx + 1) + ".ref", "a") as f:
                for (_s, _g) in sample:
                    if _g:
                        # A truthy ground truth gets a separator line before
                        # the sentence.
                        f.write("==========\r\n")
                    f.write(_s + "\r\n")
                f.write("==========\r\n")
        else:
            # Traditional code path: vectorise the document.
            if not _idx % 50:
                progbar.simple_update("Converting doc to martices",
                                      _idx + 1,
                                      _total_samples,
                                      time_elapsed=(time.time() - _start_time))

            if sample_type == 1:
                # Correct groundtruth sync problem here
                sentences, groundTruths = model.convert_sample_to_vec(
                    sentences, groundTruths)
            elif sample_type in (2, 3, 4, 5, 6, 7):
                sentences, groundTruths = model.convert_sequence_sample_to_vec(
                    sentences, groundTruths)
            else:
                # NOTE(review): execution continues after this message, so
                # the unconverted sentences are appended below - confirm
                # whether the sample should be skipped instead.
                print("Wrong Sample TYPE")

            if sentences is None:
                # The converter signalled that this document is unusable.
                continue

            X.append(sentences)  # X[0].shape = matrix([[1,2,3,4.....]])
            Y.append(np.asarray(groundTruths))  # Y[0] = [1, 0, 0, ..., 1]

    progbar.simple_update("Creating a standalone matrix for samples...", -1, -1)
    X, Y = np.asarray(X), np.asarray(Y)
    progbar.end()

    print("Total samples: %d" % len(X))

    if shuffle_documents:
        # X and Y have to be shuffled in unison to stay aligned.
        X, Y = unison_shuffled_copies(X, Y)
        print("SHUFFLE: Shuffled input document order! (X: %s , Y: %s )"
              % (X.shape, Y.shape))

    if sample_type == 2 and not pad:
        print("NOTE: Sample type2 requires PADDING!")

    if pad:
        # NOTE: the original code derived a `max_len` from STATIC_PAD /
        # AVERAGE_WORDS here but never passed it to pad_sequences, so the
        # call has always relied on pad_sequences' default length. That dead
        # assignment was removed; behaviour is unchanged. The original author
        # flagged this padding as "messy" - check once before relying on it.
        doc_lengths = [len(doc) for doc in X]
        print("Padding sequences. Doc-lengths: Mean=%d, Std=%d"
              % (np.mean(doc_lengths), np.std(doc_lengths)))
        X = pad_sequences(X, padding="post", truncating="post",
                          value=0.0, dtype=np.float32)
        Y = pad_sequences(Y, padding="post", truncating="post",
                          value=0.0, dtype=np.float32)
        print("Size of new X(after padding): %s" % (X.shape,))

    return sample_type, X, Y, model
def custom_fit(X_train, Y_train, X_test, Y_test, model, batch_size, epochs=10):
    """Train `model` batch by batch on context windows cut from the documents.

    NOTE: another `custom_fit` with the same signature is defined later in
    this file; being defined later, that one is the binding callers get.

    Args:
        X_train: Sequence of documents; `X_train.shape[0]` is used as the
            document count and `doc.shape[0]` as the per-document sentence
            count.
        Y_train: Ground-truth labels, one entry per document in `X_train`.
        X_test, Y_test: Held-out data passed to `testing_on_data`.
        model: Object exposing `train_on_batch` (returning loss, accuracy and
            recall) and `save`.
        batch_size: Batch size given to the batch generator and to
            `testing_on_data`.
        epochs: Upper bound (exclusive) of the epoch range.

    Besides its arguments, the function reads module-level names defined
    elsewhere: SCALE_LOSS_FUN, LOAD_SAVED_MODEL_AND_CONTINUE_TRAIN,
    saved_model_epoch_done, ONE_SIDE_CONTEXT_SIZE, SAVE_MODEL_AFTER_EACH_EPOCH,
    TRAINABLE_EMBEDDINGS and the X_cli/Y_cli, X_bio/Y_bio, X_fic/Y_fic data.

    Returns:
        None. Progress and metrics are printed; the model is trained in place.
    """
    # Print Train stats
    total_documents = X_train.shape[0]
    total_sentences = sum(doc.shape[0] for doc in X_train)
    print("X-wiki TRAIN stats: Total %d sentences in %d documents"
          % (total_sentences, total_documents))

    class_weight = None
    if SCALE_LOSS_FUN:
        # Documents hold different numbers of sentences, so the labels are
        # flattened and concatenated before counting. The previous version
        # summed per-document np.unique counts, which misaligns (or silently
        # broadcasts) whenever a document lacks one of the classes.
        all_labels = np.concatenate([np.ravel(doc_Y) for doc_Y in Y_train])
        classes, counts = np.unique(all_labels, return_counts=True)
        class_weight = dict(zip(classes.tolist(),
                                counts / float(counts.sum())))
        # NOTE(review): class_weight is only printed here; it is not handed
        # to train_on_batch.
        print(class_weight)

    train_avg_seg_len = np.mean(
        [helper.compute_avg_seg_len(Yi) for Yi in Y_train], axis=0)
    print(">> Train AVG_SEGMENT_LENGTH: %s" % (train_avg_seg_len,))

    print('Train...')
    start_epoch = 0
    if LOAD_SAVED_MODEL_AND_CONTINUE_TRAIN:
        # Continue from the last epoch that was completed before saving.
        # The epoch count is zero indexed in TRAIN, while the count in the
        # saved file is 1 indexed.
        start_epoch = saved_model_epoch_done

    for epoch in range(start_epoch, epochs):
        mean_tr_acc, mean_tr_loss, mean_tr_rec = [], [], []

        batches = batch_gen_consecutive_context_segments_from_big_seq(
            X_train, Y_train, batch_size, ONE_SIDE_CONTEXT_SIZE)
        for batch_count, (batch_X_left, batch_X_mid, batch_X_right,
                          batch_Y_mid) in enumerate(batches):
            try:
                start = time.time()
                tr_loss, tr_acc, tr_rec = model.train_on_batch(
                    [batch_X_left, batch_X_mid, batch_X_right], batch_Y_mid)
                speed = time.time() - start

                mean_tr_acc.append(tr_acc)
                mean_tr_loss.append(tr_loss)
                mean_tr_rec.append(tr_rec)
                progbar.prog_bar(True, total_sentences, epochs, batch_size,
                                 epoch, batch_count, speed=speed,
                                 data={'Loss': tr_loss,
                                       'Acc': tr_acc,
                                       'Rec': tr_rec})

                # Print test results after every 100 batch trains
                if batch_count and batch_count % 100 == 0:
                    testing_on_data("Wikipedia", X_test, Y_test, model,
                                    batch_size, summary_only=True)
                    testing_on_data("Clinical", X_cli, Y_cli, model,
                                    batch_size)
                    testing_on_data("Biography", X_bio, Y_bio, model,
                                    batch_size)
                    testing_on_data("Fiction", X_fic, Y_fic, model,
                                    batch_size, summary_only=True)
            except (KeyboardInterrupt, SystemExit):
                # The tuple is required: the old spelling
                # `except KeyboardInterrupt, SystemExit:` caught only
                # KeyboardInterrupt and bound it to the name SystemExit.
                # NOTE(review): the prompt text mentions "exit", but after
                # this handler the batch loop simply carries on.
                print("")
                print("########################################################")
                print("######  Pausing execution. Press ENTER to continue #####")
                print("########################################################")
                out = raw_input(
                    'Enter "pdb" to get prompt or ENTER to exit.> ')
                if out == "pdb":
                    pdb.set_trace()
        progbar.end()

        if SAVE_MODEL_AFTER_EACH_EPOCH:
            model.save("model_trainable_%s_epoc_%d.h5"
                       % (str(TRAINABLE_EMBEDDINGS), epoch + 1))

        print(">> Epoch: %d/%d" % (epoch + 1, epochs))
        print('accuracy training = {}'.format(np.mean(mean_tr_acc)))
        print('recall training = {}'.format(np.mean(mean_tr_rec)))
        print('loss training = {}'.format(np.mean(mean_tr_loss)))

        testing_on_data("Wikipedia", X_test, Y_test, model, batch_size,
                        summary_only=True)
        testing_on_data("Clinical", X_cli, Y_cli, model, batch_size)
        testing_on_data("Biography", X_bio, Y_bio, model, batch_size)
        testing_on_data("Fiction", X_fic, Y_fic, model, batch_size,
                        summary_only=True)

        print('___________________________________')
def custom_fit(X_train, Y_train, X_test, Y_test, model, batch_size, epochs=10):
    """Train `model` on context-free sentence batches, then score the test set.

    NOTE: an earlier `custom_fit` with the same signature exists above in this
    file; this later definition is the one that stays bound to the name.

    Args:
        X_train: Sequence of documents; `X_train.shape[0]` is used as the
            document count and `doc.shape[0]` as the per-document sentence
            count.
        Y_train: Ground-truth labels, one entry per document in `X_train`.
        X_test, Y_test: Held-out documents/labels, used both for the
            per-epoch `testing_on_data` report and for the final test loop.
        model: Object exposing `train_on_batch` / `test_on_batch` (each
            returning loss, accuracy and recall) and `save`.
        batch_size: Batch size given to the batch generator and to
            `testing_on_data`.
        epochs: Upper bound (exclusive) of the epoch range.

    Besides its arguments, the function reads module-level names defined
    elsewhere: SCALE_LOSS_FUN, LOAD_SAVED_MODEL_AND_CONTINUE_TRAIN,
    saved_model_epoch_done, SAVE_MODEL_AFTER_EACH_EPOCH, TRAINABLE_EMBEDDINGS
    and the X_cli/Y_cli, X_fic/Y_fic, X_wikitest/Y_wikitest data.

    Returns:
        None. Progress and metrics are printed; the model is trained in place.
    """
    # Print Train stats
    total_documents = X_train.shape[0]
    total_sentences = sum(doc.shape[0] for doc in X_train)
    print("X-wiki TRAIN stats: Total %d sentences in %d documents"
          % (total_sentences, total_documents))

    class_weight = None
    if SCALE_LOSS_FUN:
        # Documents hold different numbers of sentences, so the labels are
        # flattened and concatenated before counting. The previous version
        # summed per-document np.unique counts, which misaligns (or silently
        # broadcasts) whenever a document lacks one of the classes.
        all_labels = np.concatenate([np.ravel(doc_Y) for doc_Y in Y_train])
        classes, counts = np.unique(all_labels, return_counts=True)
        class_weight = dict(zip(classes.tolist(),
                                counts / float(counts.sum())))
        # NOTE(review): class_weight is only printed here; it is not handed
        # to train_on_batch.
        print(class_weight)

    train_avg_seg_len = np.mean(
        [helper.compute_avg_seg_len(Yi) for Yi in Y_train], axis=0)
    print(">> Train AVG_SEGMENT_LENGTH: %s" % (train_avg_seg_len,))

    print('Train...')
    start_epoch = 0
    if LOAD_SAVED_MODEL_AND_CONTINUE_TRAIN:
        # Continue from the last epoch that was completed before saving.
        # The epoch count is zero indexed in TRAIN, while the count in the
        # saved file is 1 indexed.
        start_epoch = saved_model_epoch_done

    for epoch in range(start_epoch, epochs):
        mean_tr_acc, mean_tr_loss, mean_tr_rec = [], [], []
        batch_count = 0  # batches seen so far in this epoch

        for X, Y in zip(X_train, Y_train):
            for batch_X, batch_Y in batch_gen_sentences_without_context(
                    X, Y, batch_size, fixed_size=False):
                # Convert the labels to a 2-class categorical target.
                batch_Y = to_categorical(batch_Y, nb_classes=2)

                start = time.time()
                tr_loss, tr_acc, tr_rec = model.train_on_batch([batch_X],
                                                               batch_Y)
                speed = time.time() - start

                mean_tr_acc.append(tr_acc)
                mean_tr_loss.append(tr_loss)
                mean_tr_rec.append(tr_rec)
                progbar.prog_bar(True, total_sentences, epochs, batch_size,
                                 epoch, batch_count, speed=speed,
                                 data={'Loss': tr_loss,
                                       'Acc': tr_acc,
                                       'Rec': tr_rec})
                batch_count += 1
        progbar.end()

        if SAVE_MODEL_AFTER_EACH_EPOCH:
            model.save("model_trainable_%s_epoc_%d.h5"
                       % (str(TRAINABLE_EMBEDDINGS), epoch + 1))

        print(">> Epoch: %d/%d" % (epoch + 1, epochs))
        print('accuracy training = {}'.format(np.mean(mean_tr_acc)))
        print('recall training = {}'.format(np.mean(mean_tr_rec)))
        print('loss training = {}'.format(np.mean(mean_tr_loss)))

        testing_on_data("Wikipedia(DEVELOPMENT)", X_test, Y_test, model,
                        batch_size, summary_only=True)
        testing_on_data("Clinical", X_cli, Y_cli, model, batch_size,
                        summary_only=True)
        # The Biography evaluation is disabled in the original code.
        testing_on_data("Fiction", X_fic, Y_fic, model, batch_size,
                        summary_only=True)
        testing_on_data("Wikipedia(BENCHMARK)", X_wikitest, Y_wikitest, model,
                        batch_size, summary_only=True)

        print('___________________________________')

    # Testing
    print("####################################################################")
    print(">> (TEST) >> Testing, X: %s Y: %s" % (X_test.shape, Y_test.shape))
    mean_te_acc, mean_te_loss, mean_te_rec = [], [], []
    for X, Y in zip(X_test, Y_test):
        for batch_X, batch_Y in batch_gen_sentences_without_context(
                X, Y, batch_size, fixed_size=False):
            # Encode the targets exactly as in training; the earlier version
            # passed the raw labels here, which disagrees with the 2-class
            # categorical targets the model is trained on.
            batch_Y = to_categorical(batch_Y, nb_classes=2)
            te_loss, te_acc, te_rec = model.test_on_batch([batch_X], batch_Y)
            mean_te_acc.append(te_acc)
            mean_te_loss.append(te_loss)
            mean_te_rec.append(te_rec)

    print('accuracy testing = {}'.format(np.mean(mean_te_acc)))
    print('recall testing = {}'.format(np.mean(mean_te_rec)))
    print('loss testing = {}'.format(np.mean(mean_te_loss)))