def get_dataTrain(source, source_labels, limit):
    n = int(limit / 2)
    # Read the labels and flatten them into one token list.
    labels = read_text_file(source_labels, n)
    labels = (' '.join(labels)).split()
    # Collect up to n positive and n negative indices for a balanced set.
    positive_labels_indices = [i for i, j in enumerate(labels) if j == '1']
    positive_labels_indices = positive_labels_indices[:n]
    print("positives: ", len(positive_labels_indices))
    negative_labels_indices = [i for i, j in enumerate(labels) if j == '0']
    negative_labels_indices = negative_labels_indices[:n]
    print("negatives: ", len(negative_labels_indices))
    labels_indices = positive_labels_indices + negative_labels_indices
    labels = [labels[i] for i in labels_indices]
    labels = list(map(int, labels))
    cat_labels = convert_to_categorical(labels)
    # Read the documents and keep only the sentences matching the selected labels.
    docs = read_text_file(source, n)
    docs = nltk.sent_tokenize(' '.join(docs))
    docs = [docs[i] for i in labels_indices]
    #print("num of data", len(docs))
    '''
    data = form_pairs(docs, labels)
    np.random.shuffle(data)
    docs, labels = extract_labels_data(data)
    #ref_labels_indices = [i for i, j in enumerate(labels) if j == [1.0, 0.0]]
    #ref_docs = [docs[i] for i in ref_labels_indices]
    ref_labels = [1 if pr[0] >= pr[1] else 0 for pr in labels]
    '''
    return docs, cat_labels, labels
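# A minimal usage sketch for the balanced loader above. The file paths are
# hypothetical; read_text_file, convert_to_categorical, and nltk are assumed
# to be available as in the rest of this module:
#
#   docs, cat_labels, labels = get_dataTrain(
#       "data/train_sents.txt", "data/train_labels.txt", limit=40000)
#   # docs: list of sentences, cat_labels: one-hot pairs, labels: 0/1 ints
#   assert len(docs) == len(labels)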
def prepare_dataTrain(source, source_labels, limit):
    labels = read_text_file(source_labels, limit)
    labels = (' '.join(labels)).split()
    # Keep up to 20,000 examples of each class for a balanced training set.
    positive_labels_indices = [i for i, j in enumerate(labels) if j == '1']
    positive_labels_indices = positive_labels_indices[:20000]
    print("positives: ", len(positive_labels_indices))
    negative_labels_indices = [i for i, j in enumerate(labels) if j == '0']
    negative_labels_indices = negative_labels_indices[:20000]
    print("negatives: ", len(negative_labels_indices))
    '''
    labels_indices = zip(positive_labels_indices, negative_labels_indices)
    labels_indices = list(itertools.chain(*labels_indices))
    labels = [labels[i] for i in labels_indices]
    labels = list(map(int, labels))
    #print(labels[:100])
    '''
    labels_indices = positive_labels_indices + negative_labels_indices
    labels = [labels[i] for i in labels_indices]
    labels = list(map(int, labels))
    docs = read_text_file(source, limit)
    docs = nltk.sent_tokenize(' '.join(docs))
    docs = [docs[i] for i in labels_indices]
    print("num of data", len(docs))
    # Pair each sentence with its label, shuffle, then split back out.
    data = form_pairs(docs, labels)
    np.random.shuffle(data)
    docs, labels = extract_labels_data(data)
    return docs, labels
def get_data(source1, source2, src_fused):
    # Read the two source sentences and the reference fusion; num_examples is
    # the module-level example count set by model_wrapper.
    sents1 = read_text_file(source1, num_examples)
    sents2 = read_text_file(source2, num_examples)
    fused = read_text_file(src_fused, num_examples)
    #sents1, sents2, fused = shuffle_data(sents1, sents2, fused, num_examples)
    return sents1, sents2, fused
def prepare_data(source, source_labels, limit):
    docs = read_text_file(source, limit)
    docs = nltk.sent_tokenize(' '.join(docs))
    docs = docs[:20000]
    labels = read_text_file(source_labels, limit)
    labels = (' '.join(labels)).split()
    labels = labels[:20000]
    labels = list(map(int, labels))
    return docs, labels
def get_data(source, source_labels, limit):
    n = int(limit / 2)
    docs = read_text_file(source, n)
    docs = nltk.sent_tokenize(' '.join(docs))
    docs = docs[:limit]
    labels = read_text_file(source_labels, n)
    labels = (' '.join(labels)).split()
    labels = labels[:limit]
    labels = list(map(int, labels))
    cat_labels = convert_to_categorical(labels)
    return docs, cat_labels, labels
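# Hedged example of the loader above (paths hypothetical):
#
#   docs, cat_labels, labels = get_data(
#       "data/eval_sents.txt", "data/eval_labels.txt", limit=2000)
#   # cat_labels is the one-hot form fed to the estimator; labels keeps the
#   # raw 0/1 ints for metric computation.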
def run_tuning(source_doc, source_summ, dest, desc):
    doc = read_text_file(source_doc)
    summ = read_text_file(source_summ)
    print("Tuning " + desc + " doc")
    tuned_doc = ""
    for i in range(len(doc)):
        art_sents = nltk.sent_tokenize(doc[i])
        abs_sents = summ[i]
        tuned_doc += summ_sents_extractor(art_sents, abs_sents) + "\n"
        if i % 500 == 0:
            prog = int(i / len(doc) * 100)
            print(prog, "% ...")
    write_to_file(dest, tuned_doc, "w")
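# Example invocation of run_tuning (a sketch; the corpus paths are hypothetical):
#
#   run_tuning("data/train_docs.txt", "data/train_summ.txt",
#              "data/train_docs_tuned.txt", "training")
#
# For each article it keeps only the sentences summ_sents_extractor selects
# against the paired abstract, writing one tuned article per line to dest.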
def prepare_data():
    #shuffle the positive pairs (up to 20,000) and form triples with the label
    sents1 = read_text_file(SOURCE_SENTS1)
    sents2 = read_text_file(SOURCE_SENTS2)
    sents1, sents2 = shuffle_data(sents1, sents2, 20000)
    training_data = form_triples(1, sents1, sents2)
    #randomly select up to 20,000 of the negative pairs and shuffle
    _sents1 = read_text_file(_SOURCE_SENTS1)
    _sents2 = read_text_file(_SOURCE_SENTS2)
    _sents1, _sents2 = shuffle_data(_sents1, _sents2, 20000)
    neg = form_triples(0, _sents1, _sents2)
    training_data.extend(neg)         #combine positive and negative examples
    np.random.shuffle(training_data)  #shuffle the mix
    return training_data
def prepare_data(source1, source2, _source1, _source2):
    #shuffle the positive pairs (up to 25,000) and form triples with the label
    sents1 = read_text_file(source1)
    sents2 = read_text_file(source2)
    sents1, sents2 = shuffle_data(sents1, sents2, 25000)  #test split: 2000p, 3400n
    data = form_triples(1, sents1, sents2)
    #mix with an equal-sized sample of negative pairs
    _sents1 = read_text_file(_source1)
    _sents2 = read_text_file(_source2)
    _sents1, _sents2 = shuffle_data(_sents1, _sents2, 25000)
    neg = form_triples(0, _sents1, _sents2)
    data.extend(neg)         #combine positive and negative examples
    np.random.shuffle(data)  #shuffle the mix
    return data
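# Both prepare_data variants return a shuffled list of labeled triples. A
# hedged sketch of how the result can be unpacked (form_triples is assumed to
# yield (label, sent1, sent2) tuples, as its call sites suggest):
#
#   data = prepare_data(SOURCE_SENTS1, SOURCE_SENTS2,
#                       _SOURCE_SENTS1, _SOURCE_SENTS2)
#   labels = [t[0] for t in data]          # 1 = positive pair, 0 = negative pair
#   pairs  = [(t[1], t[2]) for t in data]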
def load_processed_embeddings(sess):
    try:
        # Restore the saved embedding graph and pull out the embedding matrix.
        saver = tf.train.import_meta_graph('LOG_DIR_300/embeddings/model.ckpt.meta')
        saver.restore(sess, 'LOG_DIR_300/embeddings/model.ckpt')
        #graph = tf.get_default_graph()
        word_embeddings = sess.run('embed:0')
        #word_embeddings = graph.get_tensor_by_name('embed:0')
        #print_tensors_in_checkpoint_file(file_name='LOG_DIR/model.ckpt', tensor_name='', all_tensors=False)
    except Exception as e:
        # Fall back to training GloVe embeddings from scratch.
        print("Error: ", e)
        vocab, word_embeddings = run_glove(sess, "self")
    else:
        vocab = read_text_file('LOG_DIR_300/embeddings/metadata.tsv')
        print("Embeddings loaded")
    return vocab, word_embeddings
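# Sketch of loading the pretrained embeddings inside a TF1 session (the
# checkpoint paths are the ones hard-coded above):
#
#   with tf.Session() as sess:
#       vocab, word_embeddings = load_processed_embeddings(sess)
#       print(len(vocab), word_embeddings.shape)
#
# If the checkpoint is missing, the except branch retrains GloVe via
# run_glove(sess, "self") instead of failing.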
def main():
    lines = read_text_file(TRAIN)
    print("Processing...")
    docs = ""
    summ = ""
    lines_procd = 0
    for l in range(len(lines)):
        abstract, article = get_abs_art(lines[l])
        docs += article + "\n"
        summ += abstract + "\n"
        lines_procd += 1
        if (lines_procd % 500) == 0:
            print(lines_procd, "...")
    print("writing to file")
    write_to_file(TRAIN_DOC, docs, "w")
    write_to_file(TRAIN_SUM, summ, "w")
    print("end")
def main():
    lines = read_text_file(TESTING)
    print("Processing...")
    error_log = []
    num_of_processed_lines = 0
    reset_file(FILES)  #reset file
    for l in range(len(lines)):
        abstract, article = get_abs_art(lines[l])
        abs_sents = nltk.sent_tokenize(abstract)
        art_sents = nltk.sent_tokenize(article)
        try:
            fusion_pairs_extractor(art_sents, abs_sents, str(l))
        except BaseException as e:
            print("Error in line: ", l, " ", str(e))
            error_log.append(l)
        else:
            num_of_processed_lines += 1
            if num_of_processed_lines % 500 == 0:
                print(num_of_processed_lines, "...")
    save_info(num_of_processed_lines, error_log)
    #writeToFile()  #temporarily commented out
def model_wrapper(n_examples, mood, source1, source2, sourceFused, sys, ref):
    #parameters
    global num_examples, state, seq_len, inc_prob, num_batches
    state = mood
    num_examples = n_examples
    if state == "Training":
        num_batches = int(num_examples / BATCH_SIZE)
        epochs = 1200  #1000
        steps = num_batches * epochs
        num_epochs = None
        inc_prob = 1.0 / steps
    else:
        num_epochs = 1
    sess = tf.InteractiveSession()

    #initialize estimator
    run_config = tf.estimator.RunConfig(save_summary_steps=num_batches,
                                        save_checkpoints_steps=num_batches * 3)
    estimator = tf.estimator.Estimator(model_fn=rbmE_gruD,
                                       model_dir=MODEL_DIR,
                                       config=run_config,
                                       params=params)

    #get data
    s1, s2, fused = get_data(source1, source2, sourceFused)

    #get RBM concatenated hidden states and wrap them with <s>/</s> embeddings
    encoder_embd, _ = get_conc_hidden_states(s1, s2)  #tensor num_examples*seq_len*embd_dim
    sos = tf.constant(0.5, shape=[num_examples, 1, embd_dim])
    eos = tf.constant(1.0, shape=[num_examples, 1, embd_dim])
    encoder_embd = tf.concat([sos, encoder_embd, eos], axis=1)
    encoder_embd = sess.run(encoder_embd)

    write_to_file(ref, fused, "w")  #write reference fusions to file
    reset_file(sys)  #reset system fusions for new predictions

    '''prepare for training, eval or testing'''
    if state != "Infering":
        #get ground-truth decoder vectors: num_examples*seq_len*embd_dim
        sos_fused = preProc(fused)
        sos_id, _ = lookUp_batch_embeddings(DECODER, sos_fused, extra_pad=True)
        sos_id_eos, ids = postProcDecoding(sos_id)
        dec_inp = ids2words(sos_id_eos)
        _, decoder_embd = lookUp_batch_embeddings(DECODER, dec_inp)
        #mask padded or unknown words, then map them to the last vocab entry
        weights = sess.run(tf.to_float(tf.not_equal(ids, -1)))
        ids[ids == -1] = vocab_size - 1

    if state == "Infering":
        seq_len = 15
        inp_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": np.array(encoder_embd)},
            batch_size=BATCH_SIZE,
            num_epochs=num_epochs,
            shuffle=False)
    else:
        inp_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": np.array(encoder_embd),
               "ids": np.array(ids),
               "weights": np.array(weights)},
            y=np.array(decoder_embd),
            batch_size=BATCH_SIZE,
            num_epochs=num_epochs,
            shuffle=False)

    #set up logging: render the "predictions" tensor as words, and track the
    #learning rate
    tensors_to_log = {"pred": "predictions"}
    lr = {"learning_rate": "learning_rate"}
    print_predictions = tf.train.LoggingTensorHook(tensors_to_log,
                                                   every_n_iter=1,
                                                   formatter=id2words)
    print_lr = tf.train.LoggingTensorHook(lr, every_n_iter=1000)

    '''run model'''
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    tf.reset_default_graph()  #reset graph before importing saved checkpoints
    if state == "Training":
        estimator.train(input_fn=inp_fn,
                        hooks=[print_predictions, print_lr],
                        steps=steps)
    elif state == "Infering":
        infer_res = list(estimator.predict(input_fn=inp_fn))
        infer_ids = [i["ids"] for i in infer_res]
        pos = [i["pos"] for i in infer_res]
        #keep, for each example, the id column at its predicted position
        sl = [inh[:, p] for inh, p in zip(infer_ids, pos)]
        id2words(sl)
    else:
        eval_results = estimator.evaluate(input_fn=inp_fn,
                                          hooks=[print_predictions])
        print(eval_results)
    coord.request_stop()
    coord.join(threads)

    #BLEU evaluation
    hyp = read_text_file(sys)
    bleu = bleuPerSent(fused, hyp)
    print("Bleu score: ", bleu)
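# Hedged driver sketch for the fusion model above. MODEL_DIR, BATCH_SIZE, and
# the data paths are assumptions standing in for this repo's configuration:
#
#   model_wrapper(500, "Training",
#                 "Fusion/sents1.txt", "Fusion/sents2.txt", "Fusion/fused.txt",
#                 sys="Fusion/system_fusions.txt", ref="Fusion/ref_fusions.txt")
#   # rerun with mood="Infering" (seq_len is forced to 15) to decode, or any
#   # other mood to evaluate against the same files.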
#trained RBM
MODEL_PATH1 = 'LOG_DIR_300/RBM_model/Sent1/'
#MODEL_PATH2 = 'LOG_DIR_300/RBM_model/Sent2/'
MODEL = 'LOG_DIR_300/RBM_model/Evaluating/'
BATCH_SIZE = 50
DECODER = "LOG_DIR_300/Fusion/Ground_truth/"
NUM_UNITS = 200

#initialize
GO = "sttt "    #"<s> "
START = 0
STOP = " stte"  #" </s>"
END = 1
vocab = read_text_file('LOG_DIR_300/embeddings/metadata.tsv')
vocab_size = len(vocab)
UNK = -1
embd_dim = 300
seq_len = 10
num_examples = 100
state = "Training"
count = 0
probs = 0.0
num_batches = 1
params = {"batch_size": BATCH_SIZE}


def main():
    tf.reset_default_graph()  #start clean
def model_wrapper(n_examples, mood, source, labels, sys, ref):
    global num_examples, state, num_batches
    state = mood
    num_examples = n_examples
    if state == "Training":
        num_batches = int(num_examples / BATCH_SIZE)
        epochs = 38  #50
        steps = num_batches * epochs
        num_epochs = None
    else:
        num_epochs = 1
    sess = tf.InteractiveSession()

    #initialize estimator
    run_config = tf.estimator.RunConfig(save_summary_steps=num_batches)
    estimator = tf.estimator.Estimator(model_fn=rbmE_Class,
                                       model_dir=MODEL_DIR,
                                       config=run_config,
                                       params=params)

    #get data
    doc, labels, ref_labels = prepData(source, labels, num_examples)
    #write_to_file(ref, sumries, "w")  #write reference extracted sents to file
    reset_file(sys)

    #get RBM pretrained sentence states
    pre_embd = sess.run(get_sent_states(doc, RBM_MODEL))  #tensor num_examples*seq_len*embd_dim

    '''prepare for training, eval or testing'''
    if state == "Infering":
        inp_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": np.array(pre_embd)},
            batch_size=BATCH_SIZE,
            num_epochs=num_epochs,
            shuffle=False)
    else:
        inp_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": np.array(pre_embd)},
            y=np.array(labels),
            batch_size=BATCH_SIZE,
            num_epochs=num_epochs,
            shuffle=False)

    #log the values in the "predictions" tensor, rendered as class predictions
    tensors_to_log = {"pred": "predictions"}
    print_predictions = tf.train.LoggingTensorHook(tensors_to_log,
                                                   every_n_iter=1,
                                                   formatter=logits2preds)

    '''run model'''
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    tf.reset_default_graph()  #reset graph before importing saved checkpoints
    if state == "Training":
        estimator.train(input_fn=inp_fn, hooks=[print_predictions], steps=steps)
    else:
        eval_results = estimator.evaluate(input_fn=inp_fn, hooks=[print_predictions])
        print(eval_results)
    coord.request_stop()
    coord.join(threads)

    #metrics evaluation
    preds = read_text_file(sys)
    preds = list(map(int, preds))
    get_metrics(preds, ref_labels, state, num_examples)
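# Matching driver sketch for the classifier wrapper (paths hypothetical;
# prepData and RBM_MODEL come from this module):
#
#   model_wrapper(20000, "Training", "data/doc_sents.txt", "data/labels.txt",
#                 sys="Class/predictions.txt", ref="Class/ref_labels.txt")
#
# After training, rerun with any non-"Training" mood to evaluate: predictions
# written to `sys` are read back and scored by get_metrics against ref_labels.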