Example #1
def test(args):
    source_vocab = Vocab.load(args.model_path + SRC_VOCAB_NAME)
    target_vocab = Vocab.load(args.model_path + TAR_VOCAB_NAME)
    vocab_size, hidden_size, maxout_hidden_size, embed_size = Backup.load(
        args.model_path + HPARAM_NAME)

    att_encdec = ABED(vocab_size, hidden_size, maxout_hidden_size, embed_size)
    if args.use_gpu:
        att_encdec.to_gpu()
    serializers.load_hdf5(args.model_path + str(args.epochs) + '.attencdec',
                          att_encdec)

    with open(args.output + str(args.epochs), 'w') as fp:
        source_gen = word_list(args.source)
        target_gen = word_list(args.target)
        batch_gen = batch(sort(source_gen, target_gen, 100 * args.minibatch),
                          args.minibatch)
        for source_batch, target_batch in batch_gen:
            source_batch = fill_batch_end(source_batch)
            target_batch = fill_batch_end(target_batch)
            if args.beam_search:
                hyp_batch = forward_beam(source_batch, None, source_vocab,
                                         target_vocab, att_encdec, False,
                                         args.limit, args.beam_size)
            else:
                hyp_batch = forward(source_batch, None, source_vocab,
                                    target_vocab, att_encdec, False,
                                    args.limit)
            for i, hyp in enumerate(hyp_batch):
                hyp.append(END)
                hyp = hyp[:hyp.index(END)]
                show(source_batch[i], target_batch[i], hyp, "TEST")
                fwrite(source_batch[i], target_batch[i], hyp, fp)
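
For reference, the flags test() expects can be inferred from the attributes it reads off args. Below is a minimal driver sketch under that assumption; the project's real argument parser and default values may differ, so treat every default here as a placeholder.

# Hypothetical driver for test(); the flag names mirror the args.* attributes used above.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default='model/')
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--output', default='hyp_')
    parser.add_argument('--source', required=True)
    parser.add_argument('--target', required=True)
    parser.add_argument('--minibatch', type=int, default=64)
    parser.add_argument('--limit', type=int, default=50)
    parser.add_argument('--beam_search', action='store_true')
    parser.add_argument('--beam_size', type=int, default=5)
    parser.add_argument('--use_gpu', action='store_true')
    test(parser.parse_args())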
Example #3
    def explore_vocab(self,
                      config,
                      export_name,
                      threshold=None,
                      keep_ratio=None):
        vocab = Vocab()
        vocab.load_file(config[0])
        vocab = vocab.high_freq(threshold, keep_ratio).order()

        self._multi_plot(config, export_name, 'Vocab', 'Log(x+1) Max Ratio',
                         self._explore_vocab, vocab)
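
The filtering step in the middle can also be exercised on its own. A minimal sketch, assuming Vocab.load_file, high_freq, and order behave as they are used above; the file path and the threshold value are placeholders.

# Standalone sketch of the vocabulary filtering used in explore_vocab().
vocab = Vocab()
vocab.load_file('dataset/vocab/train_vocab.txt')  # placeholder path
vocab = vocab.high_freq(5, None).order()          # filter by frequency (placeholder threshold), then order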
Example #5
def test_score(model_name, parameter):

    label2id = {'Other': 0, 'Cause-Effect(e1,e2)': 1, 'Cause-Effect(e2,e1)': 2, 'Component-Whole(e1,e2)': 3,
                'Component-Whole(e2,e1)': 4, 'Content-Container(e1,e2)': 5, 'Content-Container(e2,e1)': 6, 
                'Entity-Destination(e1,e2)': 7, 'Entity-Destination(e2,e1)': 8, 'Entity-Origin(e1,e2)': 9, 
                'Entity-Origin(e2,e1)': 10, 'Instrument-Agency(e2,e1)': 11, 'Instrument-Agency(e1,e2)': 12, 
                'Member-Collection(e1,e2)': 13, 'Member-Collection(e2,e1)': 14, 'Message-Topic(e1,e2)': 15, 
                'Message-Topic(e2,e1)': 16, 'Product-Producer(e1,e2)': 17, 'Product-Producer(e2,e1)': 18}

    # Load the pretrained vocabulary
    vocab_file = 'dataset/vocab/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    resoult_file = open('resoult/resoult.txt', 'w')

    # Load the preprocessed test set
    test_batch = DataLoader('dataset/sem/test_file.json', parameter, vocab, train=False)
    test_emb_file = './dataset/vocab/embedding.npy'
    test_emb_matrix = np.load(test_emb_file)
    parameter['vocab_size'] = vocab.size

    # Load the model
    trainer = GCNTrainer(parameter,test_emb_matrix)
    trainer.load(model_name)
    id2label = dict([(v,k) for k,v in label2id.items()])
    predictions = []
    for i, batch in enumerate(test_batch):
        preds = trainer.predict(batch)
        predictions += preds
    predictions = [id2label[p] for p in predictions]

    # Write the predictions to a file (test sentence ids run from 8001 to 10717)
    counter = 8001
    for resoult in predictions:
        if counter == 10718:
            break
        resoult_file.writelines(str(counter) + "\t" + resoult + '\n')
        counter += 1
    resoult_file.close()

    # Compute the score with the official SemEval-2010 Task 8 scorer
    os.system('perl ./resoult/semeval2010_task8_scorer-v1.2.pl ./resoult/resoult.txt ./resoult/test_key.txt > ./resoult/score.txt')
    f = open("resoult/score.txt")

    resoult1 = ""
    line = f.readline()
    i = 0
    while line:
        if i == 143:
            resoult1 = line
        if i == 147:
            break
        line = f.readline()
        i += 1
    f.close()
    return line, resoult1
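
The trailing while-loop simply pulls two fixed lines out of the scorer report. A more direct sketch of the same extraction, assuming score.txt has already been produced by the perl command above and has at least 148 lines:

# Equivalent extraction of the two lines the while-loop above returns.
with open('resoult/score.txt') as f:
    lines = f.readlines()
resoult1 = lines[143]  # the line stored when i == 143
line = lines[147]      # the line held when the loop breaks at i == 147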
Example #6
    def start_train(self):
        # Load the pretrained vocabulary
        vocab_file = 'dataset/vocab/vocab.pkl'
        self.vocab = Vocab(vocab_file, load=True)
        self.parameter['vocab_size'] = self.vocab.size
        emb_file = './dataset/vocab/embedding.npy'
        self.emb_matrix = np.load(emb_file)

        # Load the training set
        self.train_batch = DataLoader('dataset/sem/train_file.json',
                                      self.parameter,
                                      self.vocab,
                                      train=True)
        self.trainer = GCNTrainer(self.parameter, emb_matrix=self.emb_matrix)
        self.current_lr = self.parameter['lr']
        self.log_name = "log/" + str(time.strftime("%d_%I_%M")) + ".log"
        log = open(self.log_name, 'w+')
        log.write(str(self.parameter))
        log.close()
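
The log name encodes only day of month, 12-hour-clock hour, and minute. A quick sketch of the value the same expression produces:

import time
# Same expression as in start_train(); resolves to e.g. "log/07_03_42.log"
# (%d = day of month, %I = hour on a 12-hour clock, %M = minute).
print("log/" + str(time.strftime("%d_%I_%M")) + ".log")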
Example #7
def train(args):
    source_vocab = Vocab(args.source, args.vocab)
    target_vocab = Vocab(args.target, args.vocab)
    att_encdec = ABED(args.vocab, args.hidden_size, args.maxout_hidden_size,
                      args.embed_size)
    if args.use_gpu:
        att_encdec.to_gpu()
    if args.source_validation:
        if not os.path.exists(PLOT_DIR):
            os.mkdir(PLOT_DIR)
        fp_loss = open(PLOT_DIR + "loss", "w")
        fp_loss_val = open(PLOT_DIR + "loss_val", "w")

    opt = optimizers.AdaDelta(args.rho, args.eps)
    opt.setup(att_encdec)
    opt.add_hook(optimizer.WeightDecay(DECAY_COEFF))
    opt.add_hook(optimizer.GradientClipping(CLIP_THR))
    for epoch in xrange(args.epochs):
        print "--- epoch: %s/%s ---" % (epoch + 1, args.epochs)
        source_gen = word_list(args.source)
        target_gen = word_list(args.target)
        batch_gen = batch(sort(source_gen, target_gen, 100 * args.minibatch),
                          args.minibatch)
        n = 0
        total_loss = 0.0
        for source_batch, target_batch in batch_gen:
            n += len(source_batch)
            source_batch = fill_batch_end(source_batch)
            target_batch = fill_batch_end(target_batch)
            hyp_batch, loss = forward(source_batch, target_batch, source_vocab,
                                      target_vocab, att_encdec, True, 0)
            total_loss += loss.data * len(source_batch)
            closed_test(source_batch, target_batch, hyp_batch)

            loss.backward()
            opt.update()
            print "[n=%s]" % (n)
        print "[total=%s]" % (n)
        prefix = args.model_path + '%s' % (epoch + 1)
        serializers.save_hdf5(prefix + '.attencdec', att_encdec)
        if args.source_validation:
            total_loss_val, n_val = validation_test(args, att_encdec,
                                                    source_vocab, target_vocab)
            fp_loss.write("\t".join([str(epoch), str(total_loss / n) + "\n"]))
            fp_loss_val.write("\t".join(
                [str(epoch), str(total_loss_val / n_val) + "\n"]))
            fp_loss.flush()
            fp_loss_val.flush()
        hyp_params = att_encdec.get_hyper_params()
        Backup.dump(hyp_params, args.model_path + HPARAM_NAME)
        source_vocab.save(args.model_path + SRC_VOCAB_NAME)
        target_vocab.save(args.model_path + TAR_VOCAB_NAME)
    hyp_params = att_encdec.get_hyper_params()
    Backup.dump(hyp_params, args.model_path + HPARAM_NAME)
    source_vocab.save(args.model_path + SRC_VOCAB_NAME)
    target_vocab.save(args.model_path + TAR_VOCAB_NAME)
    if args.source_validation:
        fp_loss.close()
        fp_loss_val.close()
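
The two files written when args.source_validation is set contain one tab-separated (epoch, loss) row per epoch, so they can be plotted directly. A sketch, assuming PLOT_DIR resolves to 'plot/' and matplotlib is available:

# Sketch: plot the training/validation curves that train() writes out.
import matplotlib.pyplot as plt

def read_curve(path):
    epochs, losses = [], []
    with open(path) as fp:
        for row in fp:
            epoch, loss = row.split('\t')
            epochs.append(int(epoch))
            losses.append(float(loss))
    return epochs, losses

plt.plot(*read_curve('plot/loss'), label='train')
plt.plot(*read_curve('plot/loss_val'), label='validation')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.savefig('plot/loss_curve.png')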