def test(args):
    # Restore the vocabularies and hyper-parameters saved during training.
    source_vocab = Vocab.load(args.model_path + SRC_VOCAB_NAME)
    target_vocab = Vocab.load(args.model_path + TAR_VOCAB_NAME)
    vocab_size, hidden_size, maxout_hidden_size, embed_size = Backup.load(
        args.model_path + HPARAM_NAME)
    att_encdec = ABED(vocab_size, hidden_size, maxout_hidden_size, embed_size)
    if args.use_gpu:
        att_encdec.to_gpu()
    serializers.load_hdf5(args.model_path + str(args.epochs) + '.attencdec',
                          att_encdec)

    with open(args.output + str(args.epochs), 'w') as fp:
        source_gen = word_list(args.source)
        target_gen = word_list(args.target)
        batch_gen = batch(sort(source_gen, target_gen, 100 * args.minibatch),
                          args.minibatch)
        for source_batch, target_batch in batch_gen:
            source_batch = fill_batch_end(source_batch)
            target_batch = fill_batch_end(target_batch)
            if args.beam_search:
                hyp_batch = forward_beam(source_batch, None, source_vocab,
                                         target_vocab, att_encdec, False,
                                         args.limit, args.beam_size)
            else:
                hyp_batch = forward(source_batch, None, source_vocab,
                                    target_vocab, att_encdec, False,
                                    args.limit)
            for i, hyp in enumerate(hyp_batch):
                # Truncate each hypothesis at its first end-of-sentence token.
                hyp.append(END)
                hyp = hyp[:hyp.index(END)]
                show(source_batch[i], target_batch[i], hyp, "TEST")
                fwrite(source_batch[i], target_batch[i], hyp, fp)
def explore_vocab(self, config, export_name, threshold=None, keep_ratio=None):
    vocab = Vocab()
    vocab.load_file(config[0])
    vocab = vocab.high_freq(threshold, keep_ratio).order()
    self._multi_plot(config, export_name, 'Vocab', 'Log(x+1) Max Ratio',
                     self._explore_vocab, vocab)
def train(args):
    source_vocab = Vocab(args.source, args.vocab)
    target_vocab = Vocab(args.target, args.vocab)
    att_encdec = ABED(args.vocab, args.hidden_size, args.maxout_hidden_size,
                      args.embed_size)
    if args.use_gpu:
        att_encdec.to_gpu()
    if args.source_validation:
        if not os.path.exists(PLOT_DIR):
            os.mkdir(PLOT_DIR)
        fp_loss = open(PLOT_DIR + "loss", "w")
        fp_loss_val = open(PLOT_DIR + "loss_val", "w")

    opt = optimizers.AdaDelta(args.rho, args.eps)
    opt.setup(att_encdec)
    opt.add_hook(optimizer.WeightDecay(DECAY_COEFF))
    opt.add_hook(optimizer.GradientClipping(CLIP_THR))

    for epoch in xrange(args.epochs):
        print "--- epoch: %s/%s ---" % (epoch + 1, args.epochs)
        source_gen = word_list(args.source)
        target_gen = word_list(args.target)
        batch_gen = batch(sort(source_gen, target_gen, 100 * args.minibatch),
                          args.minibatch)
        n = 0
        total_loss = 0.0
        for source_batch, target_batch in batch_gen:
            n += len(source_batch)
            source_batch = fill_batch_end(source_batch)
            target_batch = fill_batch_end(target_batch)
            hyp_batch, loss = forward(source_batch, target_batch, source_vocab,
                                      target_vocab, att_encdec, True, 0)
            total_loss += loss.data * len(source_batch)
            closed_test(source_batch, target_batch, hyp_batch)
            # Clear gradients accumulated from the previous minibatch
            # before backpropagating the new loss.
            att_encdec.zerograds()
            loss.backward()
            opt.update()
            print "[n=%s]" % (n)
        print "[total=%s]" % (n)

        # Save a snapshot of the model for this epoch.
        prefix = args.model_path + '%s' % (epoch + 1)
        serializers.save_hdf5(prefix + '.attencdec', att_encdec)
        if args.source_validation:
            total_loss_val, n_val = validation_test(args, att_encdec,
                                                    source_vocab, target_vocab)
            fp_loss.write("\t".join([str(epoch), str(total_loss / n) + "\n"]))
            fp_loss_val.write("\t".join([str(epoch),
                                         str(total_loss_val / n_val) + "\n"]))
            fp_loss.flush()
            fp_loss_val.flush()
        hyp_params = att_encdec.get_hyper_params()
        Backup.dump(hyp_params, args.model_path + HPARAM_NAME)
        source_vocab.save(args.model_path + SRC_VOCAB_NAME)
        target_vocab.save(args.model_path + TAR_VOCAB_NAME)

    # Save the hyper-parameters and vocabularies once more after training.
    hyp_params = att_encdec.get_hyper_params()
    Backup.dump(hyp_params, args.model_path + HPARAM_NAME)
    source_vocab.save(args.model_path + SRC_VOCAB_NAME)
    target_vocab.save(args.model_path + TAR_VOCAB_NAME)
    if args.source_validation:
        fp_loss.close()
        fp_loss_val.close()
def test_score(model_name, parameter):
    label2id = {'Other': 0, 'Cause-Effect(e1,e2)': 1, 'Cause-Effect(e2,e1)': 2,
                'Component-Whole(e1,e2)': 3, 'Component-Whole(e2,e1)': 4,
                'Content-Container(e1,e2)': 5, 'Content-Container(e2,e1)': 6,
                'Entity-Destination(e1,e2)': 7, 'Entity-Destination(e2,e1)': 8,
                'Entity-Origin(e1,e2)': 9, 'Entity-Origin(e2,e1)': 10,
                'Instrument-Agency(e2,e1)': 11, 'Instrument-Agency(e1,e2)': 12,
                'Member-Collection(e1,e2)': 13, 'Member-Collection(e2,e1)': 14,
                'Message-Topic(e1,e2)': 15, 'Message-Topic(e2,e1)': 16,
                'Product-Producer(e1,e2)': 17, 'Product-Producer(e2,e1)': 18}
    # Load the pretrained vocabulary.
    vocab_file = 'dataset/vocab/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    resoult_file = open('resoult/resoult.txt', 'w')
    # Load the preprocessed test set.
    test_batch = DataLoader('dataset/sem/test_file.json', parameter, vocab,
                            train=False)
    test_emb_file = './dataset/vocab/embedding.npy'
    test_emb_matrix = np.load(test_emb_file)
    parameter['vocab_size'] = vocab.size
    # Load the trained model.
    trainer = GCNTrainer(parameter, test_emb_matrix)
    trainer.load(model_name)

    id2label = dict([(v, k) for k, v in label2id.items()])
    predictions = []
    for i, batch in enumerate(test_batch):
        preds = trainer.predict(batch)
        predictions += preds
    predictions = [id2label[p] for p in predictions]

    # Write the predictions to file; SemEval-2010 Task 8 test sentences
    # are numbered 8001-10717.
    counter = 8001
    for resoult in predictions:
        if counter == 10718:
            break
        resoult_file.writelines(str(counter) + "\t" + resoult + '\n')
        counter += 1
    resoult_file.close()

    # Score the predictions with the official scorer.
    os.system('perl ./resoult/semeval2010_task8_scorer-v1.2.pl '
              './resoult/resoult.txt ./resoult/test_key.txt > ./resoult/score.txt')
    # Pull two summary lines (indices 143 and 147) out of the scorer report.
    f = open("resoult/score.txt")
    resoult1 = ""
    line = f.readline()
    i = 0
    while line:
        if i == 143:
            resoult1 = line
        if i == 147:
            break
        line = f.readline()
        i += 1
    f.close()
    return line, resoult1
def start_train(self):
    # Load the pretrained vocabulary and word embeddings.
    vocab_file = 'dataset/vocab/vocab.pkl'
    self.vocab = Vocab(vocab_file, load=True)
    self.parameter['vocab_size'] = self.vocab.size
    emb_file = './dataset/vocab/embedding.npy'
    self.emb_matrix = np.load(emb_file)
    # Load the training set.
    self.train_batch = DataLoader('dataset/sem/train_file.json', self.parameter,
                                  self.vocab, train=True)
    self.trainer = GCNTrainer(self.parameter, emb_matrix=self.emb_matrix)
    self.current_lr = self.parameter['lr']
    # Start a fresh log file stamped with the current time.
    self.log_name = "log/" + str(time.strftime("%d_%I_%M")) + ".log"
    log = open(self.log_name, 'w+')
    log.write(str(self.parameter))
    log.close()