def evaluate(
    model_path,
    corpus_path,
    pairs_path,
    batch_size=100,
):
    """Evaluate a trained retrieval model on (code, NL) pairs.

    Loads a serialized model, embeds both sides of every pair, builds a
    cosine-similarity matrix between NL and code representations, and
    reports mean rank, MRR, and success@{1,5,10}.

    Args:
        model_path: path to a torch.load-able model checkpoint.
        corpus_path: path to the corpus file backing the pair ids.
        pairs_path: path to the file of (code, NL) id pairs.
        batch_size: number of pairs embedded per batch.

    Returns:
        dict with "mrr" and "success@{1,5,10}" entries.
    """
    model = torch.load(model_path)
    model = model.cuda()
    model.eval()

    corpus = Corpus([tuple([corpus_path, os.path.dirname(corpus_path)])])
    pairs_batch_loader = FileLoader(
        [tuple([pairs_path, os.path.dirname(pairs_path)])], batch_size)

    code = []
    nl = []

    for data in tqdm.tqdm(pairs_batch_loader):
        # list(...) so the mapped result can be indexed below: on
        # Python 3, map() returns a one-shot iterator, not a sequence.
        data = list(map(corpus.get, data))
        batch = (make_batch(model.embedding_layer, data[0][0]),
                 make_batch(model.embedding_layer, data[1][0]))
        batch = [x.cuda() for x in batch]
        # volatile=True: legacy (pre-0.4) PyTorch inference mode
        batch = (Variable(batch[0],
                          volatile=True), Variable(batch[1], volatile=True))

        # embed code and NL
        repr_left = model(batch[0])
        repr_right = model(batch[1])
        # accumulate for evaluation
        code.extend(repr_left.cpu().data.numpy())
        nl.extend(repr_right.cpu().data.numpy())

    code = np.array(code)
    nl = np.array(nl)

    # rows are NL queries, columns are code candidates
    sim_mat = cosine_similarity(nl, code)
    ans_locs = location_of_correct(sim_mat)

    summary = {}
    mr = np.mean(ans_locs)
    mrr = get_mrr(ans_locs)
    summary["mrr"] = mrr

    cutoffs = [1, 5, 10]
    fracs = [get_fraction_correct_at(ans_locs, c) for c in cutoffs]

    print("Num obs: {}".format(code.shape[0]))
    print("Mean Rank: {}".format(mr))
    print("MRR: {}".format(mrr))

    for c, f in zip(cutoffs, fracs):
        print("Fraction Correct@{}: {}".format(c, f))
        summary["success@{}".format(c)] = f
    return summary
# --- 示例#2 (scraped example separator; original marker "示例#2" / score "0") ---
def train(iter_cnt, model, corpus, args, optimizer):
    """Run one training epoch over positive/negative pair files.

    Args:
        iter_cnt: global iteration counter carried across epochs.
        model: encoder exposing .embedding_layer, .compute_loss and
            .compute_similarity.
        corpus: id -> token-sequence lookup consumed by pad_iter.
        args: namespace with train, run_path, batch_size, use_content, cuda.
        optimizer: optimizer stepping the encoder parameters.

    Returns:
        The updated iteration counter.
    """
    train_writer = FileWriter(args.run_path + "/train", flush_secs=5)

    pos_file_path = "{}.pos.txt".format(args.train)
    neg_file_path = "{}.neg.txt".format(args.train)
    pos_batch_loader = FileLoader(
        [tuple([pos_file_path, os.path.dirname(args.train)])], args.batch_size)
    neg_batch_loader = FileLoader(
        [tuple([neg_file_path, os.path.dirname(args.train)])], args.batch_size)

    use_content = bool(args.use_content)

    embedding_layer = model.embedding_layer

    criterion = model.compute_loss

    start = time.time()
    tot_loss = 0.0
    tot_cnt = 0

    for batch, labels in tqdm(
            pad_iter(corpus,
                     embedding_layer,
                     pos_batch_loader,
                     neg_batch_loader,
                     use_content,
                     pad_left=False)):
        iter_cnt += 1
        model.zero_grad()
        labels = labels.type(torch.LongTensor)
        if args.use_content:
            # flatten the nested per-side tensors into one flat list
            batch = [y for x in batch for y in x]
        if args.cuda:
            batch = [x.cuda() for x in batch]
            labels = labels.cuda()
        # list(...) so the result stays indexable: Python 3 map() is a
        # one-shot iterator and batch[0] would raise TypeError.
        batch = list(map(Variable, batch))
        labels = Variable(labels)
        if not use_content:
            repr_left = model(batch[0])
            repr_right = model(batch[1])
        else:
            # sum the two component representations for each side
            repr_left = model(batch[0]) + model(batch[1])
            repr_right = model(batch[2]) + model(batch[3])
        output = model.compute_similarity(repr_left, repr_right)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        # loss.data[0]: legacy (pre-0.4) PyTorch scalar access
        tot_loss += loss.data[0] * output.size(0)
        tot_cnt += output.size(0)
        if iter_cnt % 100 == 0:
            outputManager.say("\r" + " " * 50)
            outputManager.say("\r{} loss: {:.4f}  eps: {:.0f} ".format(
                iter_cnt, tot_loss / tot_cnt, tot_cnt / (time.time() - start)))
            s = summary.scalar('loss', tot_loss / tot_cnt)
            train_writer.add_summary(s, iter_cnt)

    outputManager.say("\n")
    train_writer.close()

    return iter_cnt
# --- 示例#3 (scraped example separator; original marker "示例#3" / score "0") ---
def evaluate(iter_cnt, filepath, model, corpus, args, logging=True):
    """Evaluate the model on a pos/neg pair file and report AUC metrics.

    Args:
        iter_cnt: global iteration counter (used as the summary step).
        filepath: base path; "<filepath>.pos.txt" / ".neg.txt" are loaded.
        model: encoder exposing embedding_layer, compute_similarity,
            compute_loss and a .criterion string.
        corpus: id -> token-sequence lookup.
        args: namespace with eval, run_path, batch_size, cuda,
            eval_use_content.
        logging: when True, write AUC summaries to the valid writer.

    Returns:
        AUC restricted to fpr < 0.05.
    """
    if logging:
        valid_writer = FileWriter(args.run_path + "/valid", flush_secs=5)

    pos_file_path = "{}.pos.txt".format(filepath)
    neg_file_path = "{}.neg.txt".format(filepath)
    pos_batch_loader = FileLoader(
        [tuple([pos_file_path, os.path.dirname(args.eval)])], args.batch_size)
    neg_batch_loader = FileLoader(
        [tuple([neg_file_path, os.path.dirname(args.eval)])], args.batch_size)

    batchify = lambda bch: make_batch(model.embedding_layer, bch)
    model.eval()
    auc_meter = AUCMeter()
    # scores[0]: negative-pair scores, scores[1]: positive-pair scores
    scores = [np.asarray([], dtype='float32') for i in range(2)]
    for loader_id, loader in tqdm(
            enumerate((neg_batch_loader, pos_batch_loader))):
        for data in tqdm(loader):
            # list(...) so the mapped result can be indexed: Python 3
            # map() returns a one-shot iterator, not a sequence.
            data = list(map(corpus.get, data))
            if not args.eval_use_content:
                batch = (batchify(data[0][0]), batchify(data[1][0]))
            else:
                batch = (list(map(batchify, data[0])),
                         list(map(batchify, data[1])))
                new_batch = []
                for x in batch:
                    for y in x:
                        new_batch.append(y)
                batch = new_batch
            # loader_id doubles as the class label (0 = neg, 1 = pos)
            labels = torch.ones(batch[0].size(1)).type(
                torch.LongTensor) * loader_id
            if args.cuda:
                batch = [x.cuda() for x in batch]
                labels = labels.cuda()
            # volatile=True: legacy (pre-0.4) PyTorch inference mode
            if not args.eval_use_content:
                batch = (Variable(batch[0], volatile=True),
                         Variable(batch[1], volatile=True))
            else:
                batch = (Variable(batch[0], volatile=True),
                         Variable(batch[1], volatile=True),
                         Variable(batch[2], volatile=True),
                         Variable(batch[3], volatile=True))
            labels = Variable(labels)
            if not args.eval_use_content:
                repr_left = model(batch[0])
                repr_right = model(batch[1])
            else:
                repr_left = model(batch[0]) + model(batch[1])
                repr_right = model(batch[2]) + model(batch[3])
            output = model.compute_similarity(repr_left, repr_right)

            if model.criterion.startswith('classification'):
                assert output.size(1) == 2
                output = nn.functional.log_softmax(output)
                # negative log-likelihood of the true class as the score
                current_scores = -output[:, loader_id].data.cpu().squeeze(
                ).numpy()
                output = output[:, 1]
            else:
                assert output.size(1) == 1
                current_scores = output.data.cpu().squeeze().numpy()
            auc_meter.add(output.data, labels.data)
            scores[loader_id] = np.append(scores[loader_id], current_scores)

    auc_score = auc_meter.value()
    auc10_score = auc_meter.value(0.1)
    auc05_score = auc_meter.value(0.05)
    auc02_score = auc_meter.value(0.02)
    auc01_score = auc_meter.value(0.01)
    if model.criterion.startswith('classification'):
        avg_score = (scores[1].mean() + scores[0].mean()) * 0.5
    else:
        avg_score = scores[1].mean() - scores[0].mean()
    outputManager.say(
        "\r[{}] auc(.01): {:.3f}  auc(.02): {:.3f}  auc(.05): {:.3f}"
        "  auc(.1): {:.3f}  auc: {:.3f}"
        "  scores: {:.2f} ({:.2f} {:.2f})\n".format(
            os.path.basename(filepath).split('.')[0], auc01_score, auc02_score,
            auc05_score, auc10_score, auc_score, avg_score, scores[1].mean(),
            scores[0].mean()))

    if logging:
        # one scalar per AUC variant, all logged at the same step
        for tag, val in (('auc', auc_score), ('auc (fpr<0.1)', auc10_score),
                         ('auc (fpr<0.05)', auc05_score),
                         ('auc (fpr<0.02)', auc02_score),
                         ('auc (fpr<0.01)', auc01_score)):
            valid_writer.add_summary(summary.scalar(tag, val), iter_cnt)
        valid_writer.close()

    return auc05_score
        # NOTE(review): orphaned line from the scrape — no enclosing scope
        # defines `fb` or `json_data`; kept commented out for reference:
        # fb.write(json_data + '\n')


if __name__ == "__main__":
    parse = argparse.ArgumentParser()
    parse.add_argument('--dataset',
                       default='ENZYMES',
                       type=str,
                       help='dataset')
    args = parse.parse_args()
    config_file = osp.join(osp.dirname(osp.abspath(__file__)), 'config',
                           '%s.ini' % args.dataset)
    config = Config(config_file)
    set_seed(config.seed)

    G_data = FileLoader(args.dataset, config).load_data()
    training_process_data_file = generate_result_file_name(
        args.dataset, config)
    check_dir(training_process_data_file)

    train_fb = open(training_process_data_file, 'a+', encoding='utf-8')
    for fold_idx in range(config.fold):
        G_data.use_fold_data(fold_idx)
        train_graphs, test_graphs = G_data.train_graphs, G_data.test_graphs
        print('start training ------> fold', fold_idx + 1)
        print('train sample number: {}   test sample number: {}'.format(
            len(train_graphs), len(test_graphs)))
        app_run(train_graphs, test_graphs, args.dataset, fold_idx, train_fb,
                config)
        print()
# --- 示例#5 (scraped example separator; original marker "示例#5" / score "0") ---
    model = Net(config, convolution_method)
    trainer = Trainer(config, model, G_data)
    trainer.train(acc_file, fold_idx)


if __name__ == "__main__":
    parse = argparse.ArgumentParser()
    parse.add_argument('--dataset', default='NCI109', type=str, help='dataset')
    parse.add_argument('--convolution',
                       default='GCN',
                       type=str,
                       help='GCN, GAT or GraphSage')
    args = parse.parse_args()
    config_file = osp.join(osp.dirname(osp.abspath(__file__)), 'config',
                           '%s.ini' % args.dataset)
    config = Config(config_file)
    set_seed(config.seed)

    G_data = FileLoader(args.dataset, config).load_data()
    acc_file = osp.join(osp.dirname(osp.abspath(__file__)), 'result',
                        args.convolution, args.dataset + '_result.txt')
    check_dir(acc_file)
    for fold_idx in range(config.fold):
        print('start training ------> fold', fold_idx + 1)
        start = time.time()
        app_run(config, G_data, fold_idx, acc_file, args.convolution)
        print('Total time cost in this fold: {:.2f}s'.format(time.time() -
                                                             start))
        print()

    calculate_final_result(args.dataset, acc_file, args.convolution)
# --- 示例#6 (scraped example separator; original marker "示例#6" / score "0") ---
def train(iter_cnt, model, domain_d, corpus, args, optimizer_encoder,
          optimizer_domain_d):
    """One epoch of task training with an adversarial domain discriminator.

    The encoder trains on the similarity task while a discriminator sees
    natural-language inputs from two domains; the combined objective is
    loss_task - lambda_d * loss_domain.

    Args:
        iter_cnt: global iteration counter carried across epochs.
        model: encoder exposing embedding_layer, compute_similarity,
            compute_loss.
        domain_d: domain discriminator with its own compute_loss.
        corpus: id -> token-sequence lookup consumed by cross_pad_iter.
        args: namespace with train, cross_train, run_path, batch_size,
            use_content, cuda, lambda_d.
        optimizer_encoder: optimizer for the encoder parameters.
        optimizer_domain_d: optimizer for the discriminator parameters.

    Returns:
        The updated iteration counter.
    """
    train_writer = FileWriter(args.run_path + "/train", flush_secs=5)

    pos_file_path = "{}.pos.txt".format(args.train)
    neg_file_path = "{}.neg.txt".format(args.train)

    # for adversarial training just use natural language portions of inputs
    train_corpus_path = os.path.dirname(args.train) + "/nl.tsv.gz"
    cross_train_corpus_path = os.path.dirname(args.cross_train) + "/nl.tsv.gz"

    use_content = bool(args.use_content)

    pos_batch_loader = FileLoader(
        [tuple([pos_file_path, os.path.dirname(args.train)])], args.batch_size)
    neg_batch_loader = FileLoader(
        [tuple([neg_file_path, os.path.dirname(args.train)])], args.batch_size)
    cross_loader = TwoDomainLoader(
        [tuple([train_corpus_path,
                os.path.dirname(train_corpus_path)])], [
                    tuple([
                        cross_train_corpus_path,
                        os.path.dirname(cross_train_corpus_path)
                    ])
                ], args.batch_size * 2)

    embedding_layer = model.embedding_layer

    criterion1 = model.compute_loss
    criterion2 = domain_d.compute_loss

    start = time.time()
    task_loss = 0.0
    task_cnt = 0
    domain_loss = 0.0
    dom_cnt = 0
    total_loss = 0.0
    total_cnt = 0

    for batch, labels, domain_batch, domain_labels in tqdm(
            cross_pad_iter(corpus,
                           embedding_layer,
                           pos_batch_loader,
                           neg_batch_loader,
                           cross_loader,
                           use_content,
                           pad_left=False)):
        iter_cnt += 1

        if args.use_content:
            # flatten the nested per-side tensors into one flat list
            batch = [y for x in batch for y in x]
            domain_batch = list(domain_batch)

        if args.cuda:
            batch = [x.cuda() for x in batch]
            labels = labels.cuda()
            if not use_content:
                domain_batch = domain_batch.cuda()
            else:
                domain_batch = [x.cuda() for x in domain_batch]
            domain_labels = domain_labels.cuda()
        # list(...) so the mapped results stay indexable: Python 3 map()
        # is a one-shot iterator and batch[0] would raise TypeError.
        batch = list(map(Variable, batch))
        labels = Variable(labels)
        if not use_content:
            domain_batch = Variable(domain_batch)
        else:
            domain_batch = list(map(Variable, domain_batch))
        domain_labels = Variable(domain_labels)

        model.zero_grad()
        domain_d.zero_grad()

        if not use_content:
            repr_left = model(batch[0])
            repr_right = model(batch[1])
        else:
            # sum the two component representations for each side
            repr_left = model(batch[0]) + model(batch[1])
            repr_right = model(batch[2]) + model(batch[3])
        output = model.compute_similarity(repr_left, repr_right)
        loss1 = criterion1(output, labels)
        # loss.data[0]: legacy (pre-0.4) PyTorch scalar access
        task_loss += loss1.data[0] * output.size(0)
        task_cnt += output.size(0)

        if not use_content:
            domain_output = domain_d(model(domain_batch))
        else:
            domain_output = domain_d(model(domain_batch[0])) + domain_d(
                model(domain_batch[1]))
        loss2 = criterion2(domain_output, domain_labels)
        domain_loss += loss2.data[0] * domain_output.size(0)
        dom_cnt += domain_output.size(0)

        # adversarial objective: task loss minus weighted domain loss
        loss = loss1 - args.lambda_d * loss2
        total_loss += loss.data[0]
        total_cnt += 1
        loss.backward()
        optimizer_encoder.step()
        optimizer_domain_d.step()

        if iter_cnt % 100 == 0:
            outputManager.say("\r" + " " * 50)
            outputManager.say(
                "\r{} tot_loss: {:.4f} task_loss: {:.4f} domain_loss: {:.4f} eps: {:.0f} "
                .format(iter_cnt, total_loss / total_cnt, task_loss / task_cnt,
                        domain_loss / dom_cnt,
                        (task_cnt + dom_cnt) / (time.time() - start)))

            s = summary.scalar('total_loss', total_loss / total_cnt)
            train_writer.add_summary(s, iter_cnt)
            s = summary.scalar('domain_loss', domain_loss / dom_cnt)
            train_writer.add_summary(s, iter_cnt)
            s = summary.scalar('task_loss', task_loss / task_cnt)
            train_writer.add_summary(s, iter_cnt)

    outputManager.say("\n")
    train_writer.close()

    return iter_cnt