def main(): print args assert args.embedding, "Pre-trained word embeddings required." embedding_layer = myio.create_embedding_layer(args.embedding) max_len = args.max_len if args.train: train_x, train_y = myio.read_annotations(args.train) train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x] if args.dev: dev_x, dev_y = myio.read_annotations(args.dev) dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x] if args.test: test_x, test_y = myio.read_annotations(args.test) test_x = [embedding_layer.map_to_ids(x)[:max_len] for x in test_x] if args.train: model = Model(args=args, embedding_layer=embedding_layer, nclasses=len(train_y[0])) model.ready() #debug_func2 = theano.function( # inputs = [ model.x, model.z ], # outputs = model.generator.logpz # ) #theano.printing.debugprint(debug_func2) #return model.train((train_x, train_y), (dev_x, dev_y) if args.dev else None, (test_x, test_y) if args.test else None)
def __init__(self, model_path, corpus_path, emb_path):
    """Restore a trained model and compile its Theano scoring function.

    model_path  -- path of the saved model parameters
    corpus_path -- corpus used to rebuild the vocabulary and IDF weights
    emb_path    -- pre-trained word embedding file
    """
    raw_corpus = myio.read_corpus(corpus_path)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=10,
        cut_off=1,
        embs=load_embedding_iterator(emb_path),
    )
    weights = myio.create_idf_weights(corpus_path, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)))

    model = Model(args=None, embedding_layer=embedding_layer, weights=weights)
    model.set_model(model.load_model(model_path))
    # Inference only: disable dropout.
    model.dropout.set_value(0.0)
    say("model initialized\n")

    self.model = model
    # Maps (title id matrix, body id matrix) -> similarity scores.
    self.score_func = theano.function(
        inputs=[model.idts, model.idbs],
        outputs=model.scores,
        on_unused_input='ignore',
    )
    say("scoring function compiled\n")
def __init__(self, model_path, corpus_path, emb_path):
    """Build the vocabulary from *corpus_path*, restore the trained model
    saved at *model_path*, and compile the Theano scoring function."""
    corpus = myio.read_corpus(corpus_path)
    emb_iter = load_embedding_iterator(emb_path)
    embedding_layer = myio.create_embedding_layer(corpus,
                                                  n_d=10,
                                                  cut_off=1,
                                                  embs=emb_iter)
    idf_weights = myio.create_idf_weights(corpus_path, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(corpus)))

    model = Model(args=None,
                  embedding_layer=embedding_layer,
                  weights=idf_weights)
    saved = model.load_model(model_path)
    model.set_model(saved)
    model.dropout.set_value(0.0)  # evaluation mode: no dropout
    say("model initialized\n")

    self.model = model
    self.score_func = theano.function(inputs=[model.idts, model.idbs],
                                      outputs=model.scores,
                                      on_unused_input='ignore')
    say("scoring function compiled\n")
def main(args):
    """Build the embedding layer and evaluation batches, then train the
    similarity model when a training set is given."""
    raw_corpus = myio.read_corpus(args.corpus)
    pretrained = (load_embedding_iterator(args.embeddings)
                  if args.embeddings else None)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus, n_d=args.hidden_dim, embs=pretrained)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(
        embedding_layer.n_V, len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    pad_left = not args.average
    if args.dev:
        dev_raw = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_raw, padding_id,
                                       pad_left=pad_left, merge=args.merge)
    if args.test:
        test_raw = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_raw, padding_id,
                                        pad_left=pad_left, merge=args.merge)

    if args.train:
        t0 = time.time()
        train = myio.read_annotations(args.train)
        train_batches = myio.create_batches(ids_corpus, train,
                                            args.batch_size, padding_id,
                                            pad_left=pad_left,
                                            merge=args.merge)
        say("{} to create batches\n".format(time.time() - t0))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(b[0].ravel()) for b in train_batches),
            sum(len(b[1].ravel()) for b in train_batches)))
        # The pre-built batches were only needed for the statistics above.
        train_batches = None

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()
        if args.load_pretrain:
            # warm-start the encoder from a pre-trained network
            model.encoder.load_pretrained_parameters(args)
        model.train(ids_corpus, train,
                    (dev, dev_raw) if args.dev else None,
                    (test, test_raw) if args.test else None)
def main(args):
    """Prepare corpus, heldout and annotation batches, then train the
    auto-encoding model."""
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=load_embedding_iterator(args.embeddings) if args.embeddings else None,
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))

    vocab = embedding_layer.vocab_map
    padding_id = vocab["<padding>"]
    bos_id = vocab["<s>"]
    eos_id = vocab["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        # Split the corpus: ids listed in the heldout file are excluded
        # from training and batched separately (auto-encode mode).
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict((qid, ids_corpus[qid]) for qid in heldout_ids
                              if qid in ids_corpus)
        train_corpus = dict((qid, ids_corpus[qid]) for qid in ids_corpus
                            if qid not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [], args.batch_size,
                                      padding_id, bos_id, eos_id,
                                      auto_encode=True)
        heldout = [myio.create_one_batch(b1, t2, padding_id)
                   for t1, b1, t2 in heldout]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        t0 = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno:
            train = []
        train_batches = myio.create_batches(ids_corpus, train,
                                            args.batch_size,
                                            model.padding_id, model.bos_id,
                                            model.eos_id, auto_encode=True)
        say("{} to create batches\n".format(time.time() - t0))
        model.ready()
        model.train(ids_corpus if not args.heldout else train_corpus,
                    train,
                    dev if args.dev else None,
                    test if args.test else None,
                    heldout if args.heldout else None)
def main(args):
    """Build batches from the corpus and train the model.  Dev/test
    evaluation and pre-trained parameter loading are currently disabled."""
    raw_corpus = myio.read_corpus(args.corpus)
    print("raw corpus:", args.corpus, "len:", len(raw_corpus))
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=None,  # pre-trained embeddings deliberately not loaded here
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    myio.say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                      len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.train:
        t_start = time.time()
        train = myio.read_annotations(args.train)
        print("training data:", args.train, "len:", len(train))
        train_batches = myio.create_batches(ids_corpus, train,
                                            args.batch_size, padding_id,
                                            pad_left=not args.average)
        myio.say("{:.2f} secs to create {} batches of size {}\n".format(
            (time.time() - t_start), len(train_batches), args.batch_size))
        myio.say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(b[0].ravel()) + len(b[1].ravel()) for b in train_batches),
            sum(len(b[2].ravel()) for b in train_batches)))

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()
        model.train(
            ids_corpus,
            train,
            dev=None,   # dev evaluation disabled
            test=None,  # test evaluation disabled
        )
def main(args):
    """End-to-end driver: embed the corpus, build evaluation batches, and
    train the model when training annotations are supplied."""
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        embs=load_embedding_iterator(args.embeddings) if args.embeddings else None,
    )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    weights = None
    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev_raw = myio.read_annotations(args.dev, K_neg=-1, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_raw, padding_id,
                                       pad_left=not args.average,
                                       merge=args.merge)
    if args.test:
        test_raw = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_raw, padding_id,
                                        pad_left=not args.average,
                                        merge=args.merge)

    if not args.train:
        return

    started = time.time()
    train = myio.read_annotations(args.train)
    train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                                        padding_id,
                                        pad_left=not args.average,
                                        merge=args.merge)
    say("{} to create batches\n".format(time.time() - started))
    say("{} batches, {} tokens in total, {} triples in total\n".format(
        len(train_batches),
        sum(len(batch[0].ravel()) for batch in train_batches),
        sum(len(batch[1].ravel()) for batch in train_batches)))
    train_batches = None  # only needed for the statistics above

    model = Model(args, embedding_layer, weights=weights)
    model.ready()
    if args.load_pretrain:
        # initialize the encoder from a pre-trained network
        model.encoder.load_pretrained_parameters(args)
    model.train(ids_corpus, train,
                (dev, dev_raw) if args.dev else None,
                (test, test_raw) if args.test else None)
def main(): print args embedding_layer = None if args.embedding: assert args.embedding, "Pre-trained word embeddings required." embedding_layer = myio.create_embedding_layer(args.embedding) max_len = args.max_len if args.train: train_x, train_y = myio.read_annotations(args.train) train_words = set([word for x in train_x for word in x]) embedding_layer = EmbeddingLayer(n_d=args.hidden_dimension, vocab=["<unk>", "<padding>"] + list(train_words), oov="<unk>", fix_init_embs=False) train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x] if args.dev: dev_x, dev_y = myio.read_annotations(args.dev) dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x] if args.load_rationale: rationale_data = myio.read_rationales(args.load_rationale) for x in rationale_data: x["xids"] = embedding_layer.map_to_ids(x["x"]) if args.train: model = Model(args=args, embedding_layer=embedding_layer, nclasses=len(train_y[0])) model.ready() #debug_func2 = theano.function( # inputs = [ model.x, model.z ], # outputs = model.generator.logpz # ) #theano.printing.debugprint(debug_func2) #return model.train( (train_x, train_y), (dev_x, dev_y) if args.dev else None, None, #(test_x, test_y), rationale_data if args.load_rationale else None)
def main(): print args assert args.embedding, "Pre-trained word embeddings required." embedding_layer = myio.create_embedding_layer( args.embedding ) max_len = args.max_len if args.train: train_x, train_y = myio.read_annotations(args.train) train_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in train_x ] if args.dev: dev_x, dev_y = myio.read_annotations(args.dev) dev_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in dev_x ] if args.load_rationale: rationale_data = myio.read_rationales(args.load_rationale) for x in rationale_data: x["xids"] = embedding_layer.map_to_ids(x["x"]) if args.train: model = Model( args = args, embedding_layer = embedding_layer, nclasses = len(train_y[0]) ) model.ready() #debug_func2 = theano.function( # inputs = [ model.x, model.z ], # outputs = model.generator.logpz # ) #theano.printing.debugprint(debug_func2) #return model.train( (train_x, train_y), (dev_x, dev_y) if args.dev else None, None, #(test_x, test_y), rationale_data if args.load_rationale else None )
def main():
    """Dispatch to pre-training, training, dev evaluation, or testing."""
    assert args.embedding, "Pre-trained word embeddings required."

    vocab = myio.get_vocab(args)
    embedding_layer = myio.create_embedding_layer(args, args.embedding, vocab,
                                                  args.embedding_dim, '<unk>')
    # Positional embedding table: one vector of width 30 per input position.
    position_emb_layer = myio.create_posit_embedding_layer(args.inp_len, 30)

    model = Model(args=args,
                  embedding_layer=embedding_layer,
                  embedding_layer_posit=position_emb_layer,
                  nclasses=args.nclasses)

    if args.train:
        if args.pretrain:
            model.ready_pretrain()
            model.pretrain()
        else:
            if args.load_model_pretrain:
                model.load_model_pretrain(
                    args.save_model + 'pretrain/' + args.load_model,
                    inference=False)
            else:
                model.ready()
            model.train()
    elif args.dev:
        model.load_model(args.save_model + args.load_model)
        model.dev_full()
    elif args.test:
        model.load_model(args.save_model + args.load_model, True)
        model.test()
def main(): print args assert args.embedding, "Pre-trained word embeddings required." embedding_layer = myio.create_embedding_layer( args.embedding ) max_len = args.max_len if args.train: train_x, train_y = myio.read_annotations(args.train) train_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in train_x ] if args.dev: dev_x, dev_y = myio.read_annotations(args.dev) dev_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in dev_x ] if args.load_rationale: rationale_data = myio.read_rationales(args.load_rationale) for x in rationale_data: x["xids"] = embedding_layer.map_to_ids(x["x"]) if args.train: model = Model( args = args, embedding_layer = embedding_layer, nclasses = len(train_y[0]) ) model.ready() #debug_func2 = theano.function( # inputs = [ model.x, model.z ], # outputs = model.generator.logpz # ) #theano.printing.debugprint(debug_func2) #return model.train( (train_x, train_y), (dev_x, dev_y) if args.dev else None, None, #(test_x, test_y), rationale_data if args.load_rationale else None ) if args.load_model and args.dev and not args.train: model = Model( args = None, embedding_layer = embedding_layer, nclasses = -1 ) model.load_model(args.load_model) say("model loaded successfully.\n") # compile an evaluation function eval_func = theano.function( inputs = [ model.x, model.y ], outputs = [ model.z, model.encoder.obj, model.encoder.loss, model.encoder.pred_diff ], updates = model.generator.sample_updates ) # compile a predictor function pred_func = theano.function( inputs = [ model.x ], outputs = [ model.z, model.encoder.preds ], updates = model.generator.sample_updates ) # batching data padding_id = embedding_layer.vocab_map["<padding>"] dev_batches_x, dev_batches_y = myio.create_batches( dev_x, dev_y, args.batch, padding_id ) # disable dropout model.dropout.set_value(0.0) dev_obj, dev_loss, dev_diff, dev_p1 = model.evaluate_data( dev_batches_x, dev_batches_y, eval_func, sampling=True) say("{} {} {} {}\n".format(dev_obj, dev_loss, dev_diff, dev_p1))
def main(): print args set_default_rng_seed(args.seed) assert args.embedding, "Pre-trained word embeddings required." embedding_layer = myio.create_embedding_layer(args.embedding) max_len = args.max_len if args.train: train_x, train_y = myio.read_annotations(args.train) if args.debug: len_ = len(train_x) * args.debug len_ = int(len_) train_x = train_x[:len_] train_y = train_y[:len_] print 'train size: ', len(train_x) #, train_x[0], len(train_x[0]) #exit() train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x] if args.dev: dev_x, dev_y = myio.read_annotations(args.dev) if args.debug: len_ = len(dev_x) * args.debug len_ = int(len_) dev_x = dev_x[:len_] dev_x = dev_y[:len_] print 'train size: ', len(train_x) dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x] if args.load_rationale: rationale_data = myio.read_rationales(args.load_rationale) for x in rationale_data: x["xids"] = embedding_layer.map_to_ids(x["x"]) #print 'in main: ', args.seed if args.train: model = Model(args=args, embedding_layer=embedding_layer, nclasses=len(train_y[0])) if args.load_model: model.load_model(args.load_model, seed=args.seed, select_all=args.select_all) say("model loaded successfully.\n") else: model.ready() #say(" ready time nedded {} \n".format(time.time()-start_ready_time)) #debug_func2 = theano.function( # inputs = [ model.x, model.z ], # outputs = model.generator.logpz # ) #theano.printing.debugprint(debug_func2) #return model.train( (train_x, train_y), (dev_x, dev_y) if args.dev else None, None, #(test_x, test_y), rationale_data if args.load_rationale else None, trained_max_epochs=args.trained_max_epochs) if args.load_model and not args.dev and not args.train: model = Model(args=args, embedding_layer=embedding_layer, nclasses=-1) model.load_model(args.load_model, seed=args.seed, select_all=args.select_all) say("model loaded successfully.\n") sample_generator = theano.function( inputs=[model.x], outputs=model.z, #updates = model.generator.sample_updates ) 
sample_encoder = theano.function( inputs=[model.x, model.y, model.z], outputs=[ model.encoder.obj, model.encoder.loss, model.encoder.pred_diff ], #updates = model.generator.sample_updates ) # compile an evaluation function eval_func = theano.function( inputs=[model.x, model.y], outputs=[ model.z, model.encoder.obj, model.encoder.loss, model.encoder.pred_diff ], #updates = model.generator.sample_updates ) debug_func_enc = theano.function( inputs=[model.x, model.y], outputs=[ model.z, model.encoder.obj, model.encoder.loss, model.encoder.pred_diff ], #updates = model.generator.sample_updates ) debug_func_gen = theano.function( inputs=[model.x, model.y], outputs=[ model.z, model.encoder.obj, model.encoder.loss, model.encoder.pred_diff ], #updates = model.generator.sample_updates ) # compile a predictor function pred_func = theano.function( inputs=[model.x], outputs=[model.z, model.encoder.preds], #updates = model.generator.sample_updates ) # batching data padding_id = embedding_layer.vocab_map["<padding>"] if rationale_data is not None: valid_batches_x, valid_batches_y = myio.create_batches( [u["xids"] for u in rationale_data], [u["y"] for u in rationale_data], args.batch, padding_id, sort=False) # disable dropout model.dropout.set_value(0.0) if rationale_data is not None: #model.dropout.set_value(0.0) start_rational_time = time.time() r_mse, r_p1, r_prec1, r_prec2, gen_time, enc_time, prec_cal_time = model.evaluate_rationale( rationale_data, valid_batches_x, valid_batches_y, sample_generator, sample_encoder, eval_func) #valid_batches_y, eval_func) #model.dropout.set_value(dropout_prob) #say(("\ttest rationale mser={:.4f} p[1]r={:.2f} prec1={:.4f}" + # " prec2={:.4f} generator time={:.4f} encoder time={:.4f} total test time={:.4f}\n").format( # r_mse, # r_p1, # r_prec1, # r_prec2, # gen_time, # enc_time, # time.time() - start_rational_time #)) data = str('%.5f' % r_mse) + "\t" + str( '%4.2f' % r_p1) + "\t" + str('%4.4f' % r_prec1) + "\t" + str( '%4.4f' % r_prec2) + 
"\t" + str('%4.2f' % gen_time) + "\t" + str( '%4.2f' % enc_time) + "\t" + str( '%4.2f' % prec_cal_time) + "\t" + str( '%4.2f' % (time.time() - start_rational_time) ) + "\t" + str(args.sparsity) + "\t" + str( args.coherent) + "\t" + str( args.max_epochs) + "\t" + str( args.cur_epoch) with open(args.graph_data_path, 'a') as g_f: print 'writning to file: ', data g_f.write(data + "\n")
def main():
    """Train the model and/or evaluate a saved model on the dev set."""
    print(args)
    assert args.embedding, "Pre-trained word embeddings required."

    embedding_layer = myio.create_embedding_layer(args.embedding)
    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_x = [embedding_layer.map_to_ids(tokens)[:max_len]
                   for tokens in train_x]
    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [embedding_layer.map_to_ids(tokens)[:max_len]
                 for tokens in dev_x]
    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for record in rationale_data:
            record["xids"] = embedding_layer.map_to_ids(record["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        model.ready()
        model.train((train_x, train_y),
                    (dev_x, dev_y) if args.dev else None,
                    None,  # no test set
                    rationale_data if args.load_rationale else None)

    if args.load_model and args.dev and not args.train:
        # Evaluation-only: restore the saved model and score the dev data.
        model = Model(args=None, embedding_layer=embedding_layer, nclasses=-1)
        model.load_model(args.load_model)
        say("model loaded successfully.\n")

        # z is bound to the generator's sampled mask via `givens`.
        eval_func = theano.function(
            inputs=[model.x, model.y],
            outputs=[model.z, model.generator.obj, model.generator.loss,
                     model.encoder.pred_diff],
            givens={model.z: model.generator.z_pred},
        )
        pred_func = theano.function(
            inputs=[model.x],
            outputs=[model.z, model.encoder.preds],
            givens={model.z: model.generator.z_pred},
        )

        padding_id = embedding_layer.vocab_map["<padding>"]
        dev_batches_x, dev_batches_y = myio.create_batches(
            dev_x, dev_y, args.batch, padding_id)

        model.dropout.set_value(0.0)  # evaluation: no dropout
        dev_obj, dev_loss, dev_diff, dev_p1 = model.evaluate_data(
            dev_batches_x, dev_batches_y, eval_func, sampling=True)
        say("{} {} {} {}\n".format(dev_obj, dev_loss, dev_diff, dev_p1))
def main(args):
    """Drive data preparation and training for the auto-encoder model,
    optionally holding out part of the corpus for evaluation."""
    raw_corpus = myio.read_corpus(args.corpus)
    embs = None
    if args.embeddings:
        embs = load_embedding_iterator(args.embeddings)
    embedding_layer = myio.create_embedding_layer(raw_corpus,
                                                  n_d=args.hidden_dim,
                                                  cut_off=args.cut_off,
                                                  embs=embs)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))

    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev_anno = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev_anno, padding_id)
    if args.test:
        test_anno = myio.read_annotations(args.test, K_neg=20,
                                          prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test_anno, padding_id)

    if args.heldout:
        # Ids listed in the heldout file are excluded from training and
        # batched separately in auto-encode mode.
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = {i: ids_corpus[i] for i in heldout_ids
                          if i in ids_corpus}
        train_corpus = {i: ids_corpus[i] for i in ids_corpus
                        if i not in heldout_corpus}
        heldout = myio.create_batches(heldout_corpus, [], args.batch_size,
                                      padding_id, bos_id, eos_id,
                                      auto_encode=True)
        heldout = [myio.create_one_batch(b1, t2, padding_id)
                   for t1, b1, t2 in heldout]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        timer = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno:
            train = []
        train_batches = myio.create_batches(ids_corpus, train,
                                            args.batch_size,
                                            model.padding_id,
                                            model.bos_id, model.eos_id,
                                            auto_encode=True)
        say("{} to create batches\n".format(time.time() - timer))
        model.ready()
        model.train(ids_corpus if not args.heldout else train_corpus,
                    train,
                    dev if args.dev else None,
                    test if args.test else None,
                    heldout if args.heldout else None)
def main():
    """Train the document/highlight model and/or evaluate a saved one on
    the dev set."""
    assert args.embedding, "Pre-trained word embeddings required."

    # Separate embedding layers for documents (x) and highlights (y),
    # both initialized from the same pre-trained file.
    embedding_layer = myio.create_embedding_layer(args.embedding)
    embedding_layer_y = myio.create_embedding_layer(args.embedding)

    max_len_x = args.sentence_length * args.max_sentences
    max_len_y = args.sentence_length_hl * args.max_sentences_hl

    if args.train:
        train_x, train_y = myio.read_docs(args.train)
        train_x = [embedding_layer.map_to_ids(doc)[:max_len_x]
                   for doc in train_x]
        train_y = [embedding_layer_y.map_to_ids(doc)[:max_len_y]
                   for doc in train_y]

    if args.dev:
        dev_x, dev_y = myio.read_docs(args.dev)
        dev_x = [embedding_layer.map_to_ids(doc)[:max_len_x]
                 for doc in dev_x]
        dev_y = [embedding_layer_y.map_to_ids(doc)[:max_len_y]
                 for doc in dev_y]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for doc in rationale_data:
            doc["xids"] = embedding_layer.map_to_ids(doc["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      embedding_layer_y=embedding_layer_y,
                      nclasses=len(train_y[0]))
        model.ready()
        model.train((train_x, train_y),
                    (dev_x, dev_y) if args.dev else None,
                    None,  # no test set
                    rationale_data if args.load_rationale else None)

    if args.load_model and args.dev and not args.train:
        # Evaluation-only: restore a saved model and score the dev data.
        model = Model(args=None, embedding_layer=embedding_layer, nclasses=-1)
        model.load_model(args.load_model)
        say("model loaded successfully.\n")

        # compile an evaluation function
        eval_func = theano.function(
            inputs=[model.x, model.y],
            outputs=[model.z, model.encoder.obj, model.encoder.loss,
                     model.encoder.pred_diff],
            updates=model.generator.sample_updates)
        # compile a predictor function
        pred_func = theano.function(
            inputs=[model.x],
            outputs=[model.z, model.encoder.preds],
            updates=model.generator.sample_updates)

        padding_id = embedding_layer.vocab_map["<padding>"]
        dev_batches_x, dev_batches_y = myio.create_batches(
            dev_x, dev_y, args.batch, padding_id)

        model.dropout.set_value(0.0)  # evaluation: disable dropout
        dev_obj, dev_loss, dev_diff, dev_p1 = model.evaluate_data(
            dev_batches_x, dev_batches_y, eval_func, sampling=True)
        say("{} {} {} {}\n".format(dev_obj, dev_loss, dev_diff, dev_p1))
def main():
    """Dispatch: baseline evaluation, (pre-)training, dev, or test."""
    assert args.embedding, "Pre-trained word embeddings required."

    vocab = myio.get_vocab(args)
    embedding_layer = myio.create_embedding_layer(args, args.embedding, vocab)
    model = Model(args=args,
                  embedding_layer=embedding_layer,
                  nclasses=args.nclasses)

    if args.dev_baseline:
        # Collect binary masks (bm) and rationale annotations (rx) from
        # the pre-built dev batch files, then score the baseline.
        rx_ls, bm_ls = [], []
        for i in xrange(args.num_files_dev):
            (batches_x, _, _, batches_bm,
             batches_sha, batches_rx) = myio.load_batches(
                args.batch_dir + args.source + 'dev', i)
            for j in xrange(len(batches_x)):
                rx_ls.append(batches_rx[j])
                bm_ls.append(batches_bm[j])
        myio.eval_baseline(args, bm_ls, rx_ls, 'dev')
    elif args.test_baseline:
        # Same as above for the test split; note the batch files here
        # carry four fields rather than six.
        rx_ls, bm_ls = [], []
        for i in xrange(args.num_files_test):
            batches_x, batches_bm, batches_sha, batches_rx = \
                myio.load_batches(args.batch_dir + args.source + 'test', i)
            for j in xrange(len(batches_x)):
                rx_ls.append(batches_rx[j])
                bm_ls.append(batches_bm[j])
        myio.eval_baseline(args, bm_ls, rx_ls, 'test')
    elif args.train:
        if args.pretrain:
            model.ready_pretrain()
            model.pretrain()
        else:
            if args.load_model_pretrain:
                model.load_model_pretrain(args.save_model + 'pretrain/' +
                                          args.load_model)
            else:
                model.ready()
            model.train()
    elif args.dev:
        if args.pretrain:
            model.load_model_pretrain(args.save_model + 'pretrain/' +
                                      args.load_model)
            model.dev()
        else:
            model.load_model(args.save_model + args.load_model)
            model.dev_full()
    elif args.test:
        model.load_model(args.save_model + args.load_model, True)
        model.test()
def main(args):
    """Prepare the corpus (optionally with translations and generated
    questions), then train, evaluate, and/or analyze the model."""
    raw_corpus = myio.read_corpus(args.corpus,
                                  args.translations or None,
                                  args.translatable_ids or None,
                                  args.generated_questions_train or None)
    generated_questions_eval = myio.read_generated_questions(
        args.generated_questions)

    embs = (load_embedding_iterator(args.embeddings)
            if args.embeddings else None)
    if args.trainable_embeddings == 1:
        # allow gradient updates to the initial embedding vectors
        embedding_layer = myio.create_embedding_layer(
            raw_corpus, n_d=args.hidden_dim, cut_off=args.cut_off,
            embs=embs, fix_init_embs=False)
    else:
        embedding_layer = myio.create_embedding_layer(
            raw_corpus, n_d=args.hidden_dim, cut_off=args.cut_off, embs=embs)

    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer,
                                 max_len=args.max_seq_len,
                                 generated_questions=generated_questions_eval)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        # dev pool size is configurable here (not the full pool)
        dev = myio.read_annotations(args.dev, K_neg=args.dev_pool_size,
                                    prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id,
                                       pad_left=not args.average)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=-1, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id,
                                        pad_left=not args.average)

    if args.train:
        begin = time.time()
        train = myio.read_annotations(
            args.train, training_data_percent=args.training_data_percent)
        train_batches = myio.create_batches(
            ids_corpus, train, args.batch_size, padding_id,
            pad_left=not args.average, include_generated_questions=True)
        say("{} to create batches\n".format(time.time() - begin))
        say("{} batches, {} tokens in total, {} triples in total\n".format(
            len(train_batches),
            sum(len(b[0].ravel()) + len(b[1].ravel())
                for b in train_batches),
            sum(len(b[2].ravel()) for b in train_batches)))
        train_batches = None  # only needed for the statistics above

        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)
        model.ready()

        if args.do_train == 1:
            if args.load_pretrain:
                # set parameters using a pre-trained network
                model.load_pretrained_parameters(args)
            model.train(ids_corpus, train,
                        dev if args.dev else None,
                        test if args.test else None)

        if args.do_evaluate == 1:
            # average predictions over repeated runs (currently one run)
            model.load_pretrained_parameters(args)
            for i in range(1):
                r = model.just_eval(dev if args.dev else None,
                                    test if args.test else None)

        if len(args.analyze_file.strip()) > 0:
            # analyze the results against the given file
            model.load_pretrained_parameters(args)
            file_name = args.analyze_file.strip()
            model.analyze(file_name, embedding_layer, dev)