Example #1
    def walk_from_graph(self):
        def node_generator():
            if self.train_files is None:
                while True:
                    for nodes in self.graph.node_batch_iter(self.batch_size):
                        yield nodes
            else:
                nodes = []
                while True:
                    for filename in self.train_files:
                        with io.open(filename) as inf:
                            for line in inf:
                                node = int(line.strip('\n\t'))
                                nodes.append(node)
                                if len(nodes) == self.batch_size:
                                    yield nodes
                                    nodes = []
                    # flush any leftover partial batch after a full pass over the files
                    if len(nodes):
                        yield nodes
                        nodes = []

        if "alias" in self.graph.node_feat and "events" in self.graph.node_feat:
            log.info("Deepwalk using alias sample")
        for nodes in node_generator():
            if "alias" in self.graph.node_feat and "events" in self.graph.node_feat:
                walks = deepwalk_sample(self.graph, nodes, self.walk_len,
                                        "alias", "events")
            else:
                walks = deepwalk_sample(self.graph, nodes, self.walk_len)
            yield walks
Example #2
File: train.py Project: zhangwanzhi/PGL
def main(args):
    if not args.use_cuda:
        paddle.set_device("cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    graph = load(args.dataset)

    model = SkipGramModel(graph.num_nodes,
                          args.embed_size,
                          args.neg_num,
                          sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    train_steps = int(graph.num_nodes / args.batch_size) * args.epoch
    scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=train_steps,
        end_lr=0.0001)

    optim = Adam(learning_rate=scheduler, parameters=model.parameters())

    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(train_ds,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.sample_workers,
                             collate_fn=collate_fn)

    for epoch in tqdm.tqdm(range(args.epoch)):
        train_loss = train(model, data_loader, optim)
        log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
    paddle.save(model.state_dict(), "model.pdparams")
Example #3
    def mlp(self, features, name):
        h = features
        dim = features.shape[-1]
        dim_list = [dim * 2, dim]
        for i in range(2):
            h = L.fc(h,
                     size=dim_list[i],
                     name="%s_fc_%s" % (name, i),
                     act=None)
            if self.args.norm_type == "layer_norm":
                log.info("norm_type is %s" % self.args.norm_type)
                h = L.layer_norm(
                    h,
                    begin_norm_axis=1,
                    param_attr=F.ParamAttr(
                        name="norm_scale_%s_%s" % (name, i),
                        initializer=F.initializer.Constant(1.0)),
                    bias_attr=F.ParamAttr(
                        name="norm_bias_%s_%s" % (name, i),
                        initializer=F.initializer.Constant(0.0)),
                )
            else:
                log.info("using batch_norm")
                h = L.batch_norm(h)
            h = pgl.layers.graph_norm(self.graph_wrapper, h)
            h = L.relu(h)
        return h
Example #4
def load_link_prediction_train_data(config, str2id, term_file, terms,
                                    item_distribution):
    train_data = []
    neg_samples = []
    with io.open(config.train_data, encoding=config.encoding) as f:
        for idx, line in enumerate(f):
            if idx % 100000 == 0:
                log.info("%s readed %s lines" % (config.train_data, idx))
            slots = []
            for col_idx, col in enumerate(line.strip("\n").split("\t")):
                s = col[:config.max_seqlen]
                if s not in str2id:
                    str2id[s] = len(str2id)
                    term_file.write(str(col_idx) + "\t" + col + "\n")
                    item_distribution.append(0)
                slots.append(str2id[s])

            src = slots[0]
            dst = slots[1]
            neg_samples.append(slots[2:])
            train_data.append((src, dst))
    train_data = np.array(train_data, dtype="int64")
    np.save(os.path.join(config.graph_work_path, "train_data.npy"), train_data)
    if len(neg_samples) != 0:
        np.save(os.path.join(config.graph_work_path, "neg_samples.npy"),
                np.array(neg_samples))
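The str2id bookkeeping above is a common vocabulary-building pattern: each previously unseen (truncated) column gets the next free integer id, and a counter slot is appended for it. A minimal standalone sketch of the same idea with a plain dict (no term file, negative samples, or PGL involved):

def build_vocab(lines, max_seqlen=16):
    """Assign incremental integer ids to distinct (truncated) columns."""
    str2id = {}
    encoded = []
    for line in lines:
        slots = []
        for col in line.strip("\n").split("\t"):
            s = col[:max_seqlen]
            if s not in str2id:
                str2id[s] = len(str2id)  # next free id
            slots.append(str2id[s])
        encoded.append(slots)
    return str2id, encoded

# build_vocab(["a\tb\tc", "a\td"]) -> ({'a': 0, 'b': 1, 'c': 2, 'd': 3}, [[0, 1, 2], [0, 3]])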
Example #5
def split_10_cv(dataset, args):
    """10 folds cross validation
    """
    dataset.shuffle()
    X = np.array([0] * len(dataset))
    y = X
    kf = KFold(n_splits=10, shuffle=False)

    i = 1
    test_acc = []
    for train_index, test_index in kf.split(X, y):
        train_val_dataset = Subset(dataset, train_index)
        test_dataset = Subset(dataset, test_index)
        train_val_index_range = list(range(0, len(train_val_dataset)))
        num_val = int(len(train_val_dataset) / 9)
        val_dataset = Subset(train_val_dataset,
                             train_val_index_range[:num_val])
        train_dataset = Subset(train_val_dataset,
                               train_val_index_range[num_val:])

        log.info("######%d fold of 10-fold cross validation######" % i)
        i += 1
        test_acc_ = main(args, train_dataset, val_dataset, test_dataset)
        test_acc.append(test_acc_)

    mean_acc = sum(test_acc) / len(test_acc)
    return mean_acc, test_acc
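For reference, the index bookkeeping above (10 test folds, with roughly one ninth of each remaining portion held out for validation) can be reproduced with scikit-learn and NumPy alone; a minimal sketch that skips the PGL Subset wrapper and the actual training call:

import numpy as np
from sklearn.model_selection import KFold

def ten_fold_indices(num_samples):
    """Yield (train, val, test) index arrays for 10-fold cross validation."""
    kf = KFold(n_splits=10, shuffle=False)
    for train_val_index, test_index in kf.split(np.zeros(num_samples)):
        num_val = len(train_val_index) // 9  # one ninth of the non-test data
        yield train_val_index[num_val:], train_val_index[:num_val], test_index

for fold, (train_idx, val_idx, test_idx) in enumerate(ten_fold_indices(100), 1):
    print("fold %d: %d train / %d val / %d test" %
          (fold, len(train_idx), len(val_idx), len(test_idx)))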
Example #6
def test(model, data_loader, log_per_step=1000, threshold=0.3):
    model.eval()
    total_loss = 0.
    total_sample = 0
    bce_loss = paddle.nn.BCEWithLogitsLoss()
    test_probs_vals, test_labels_vals, test_topk_vals = [], [], []

    for batch, (node, labels) in enumerate(data_loader):
        num_samples = len(node)
        node = paddle.to_tensor(node)
        labels = paddle.to_tensor(labels)
        logits = model(node)
        probs = paddle.nn.functional.sigmoid(logits)
        loss = bce_loss(logits, labels)

        topk = labels.sum(-1)
        test_probs_vals.append(probs.numpy())
        test_labels_vals.append(labels.numpy())
        test_topk_vals.append(topk.numpy())

        total_loss += loss.numpy()[0] * num_samples
        total_sample += num_samples

    test_probs_array = np.concatenate(test_probs_vals)
    test_labels_array = np.concatenate(test_labels_vals)
    test_topk_array = np.concatenate(test_topk_vals)
    test_macro_f1 = topk_f1_score(test_labels_array, test_probs_array,
                                  test_topk_array, "macro", threshold)
    test_micro_f1 = topk_f1_score(test_labels_array, test_probs_array,
                                  test_topk_array, "micro", threshold)
    test_loss_val = total_loss / total_sample
    log.info("\t\tTest Loss: %f " % test_loss_val +
             "Test Macro F1: %f " % test_macro_f1 +
             "Test Micro F1: %f " % test_micro_f1)
    return test_loss_val, test_macro_f1, test_micro_f1
Example #7
    def multi_m2v_node_generate(self):
        """multi_m2v_node_generate"""
        n_type_list = self.first_node_type.split(';')
        num_n_type = len(n_type_list)
        node_types = np.unique(self.graph.node_types).tolist()

        node_generators = {}
        for n_type in node_types:
            node_generators[n_type] = \
                    self.graph.node_batch_iter(self.batch_size, n_type=n_type)

        cc = 0
        while True:
            idx = cc % num_n_type
            n_type = n_type_list[idx]
            try:
                nodes = next(node_generators[n_type])
            except StopIteration as e:
                log.info("node type of %s iteration finished in one epoch" %
                         (n_type))
                node_generators[n_type] = \
                        self.graph.node_batch_iter(self.batch_size, n_type=n_type)
                break
            yield (nodes, idx)
            cc += 1
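The loop above cycles round-robin over one batch iterator per node type and re-creates an iterator once it is exhausted. The same control flow can be sketched with plain Python generators; the names below are illustrative, not PGL APIs:

def round_robin_batches(factories):
    """Cycle over several iterator factories; stop the epoch when one runs out."""
    iterators = {name: factory() for name, factory in factories.items()}
    names = list(factories)
    cc = 0
    while True:
        name = names[cc % len(names)]
        try:
            batch = next(iterators[name])
        except StopIteration:
            iterators[name] = factories[name]()  # refill for the next epoch
            break
        yield batch, name
        cc += 1

gen = round_robin_batches({"user": lambda: iter([[1, 2], [3]]),
                           "item": lambda: iter([[7, 8]])})
print(list(gen))  # [([1, 2], 'user'), ([7, 8], 'item'), ([3], 'user')]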
Example #8
def train_prog(exe, program, loss, node2vec_pyreader, args, train_steps):
    step = 0
    node2vec_pyreader.start()

    profiler.start_profiler("All")
    while True:
        try:
            begin_time = time.time()
            loss_val = exe.run(program, fetch_list=[loss])
            log.info("step %s: loss %.5f speed: %.5f s/step" %
                     (step, np.mean(loss_val), time.time() - begin_time))
            step += 1
        except F.core.EOFException:
            node2vec_pyreader.reset()

        if step % args.steps_per_save == 0 or step == train_steps:
            profiler.stop_profiler("total", "/tmp/profile")
            model_save_dir = args.save_path
            model_path = os.path.join(model_save_dir, str(step))
            if not os.path.exists(model_save_dir):
                os.makedirs(model_save_dir)
            #fleet.save_persistables(exe, model_path)
            F.io.save_params(exe, dirname=model_path, main_program=program)
        if step == train_steps:
            break
Example #9
def run_predict(py_reader,
                exe,
                program,
                model_dict,
                log_per_step=1,
                args=None):

    id2str = io.open(os.path.join(args.graph_work_path, "terms.txt"),
                     encoding=args.encoding).readlines()

    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
    if not os.path.exists(args.output_path):
        os.mkdir(args.output_path)

    fout = io.open("%s/part-%s" % (args.output_path, trainer_id),
                   "w",
                   encoding="utf8")
    batch = 0

    for batch_feed_dict in py_reader():
        batch += 1
        batch_usr_feat, _, _, batch_src_real_index, _ = exe.run(
            program, feed=batch_feed_dict, fetch_list=model_dict.outputs)

        if batch % log_per_step == 0:
            log.info("Predict %s finished" % batch)

        for ufs, sri in zip(batch_usr_feat, batch_src_real_index):
            if args.input_type == "text":
                sri = id2str[int(sri)].strip("\n")
            line = "{}\t{}\n".format(sri, tostr(ufs))
            fout.write(line)

    fout.close()
Example #10
File: gpu_train.py Project: Yelrose/PGL
def train(train_exe, exe, program, loss, node2vec_pyreader, args, train_steps):
    """ train
    """
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    step = 0
    while True:
        try:
            begin_time = time.time()
            loss_val, = train_exe.run(fetch_list=[loss])
            log.info("step %s: loss %.5f speed: %.5f s/step" %
                     (step, np.mean(loss_val), time.time() - begin_time))
            step += 1
        except F.core.EOFException:
            node2vec_pyreader.reset()

        if (step % args.steps_per_save == 0 or
                step == train_steps) and trainer_id == 0:

            model_save_dir = args.output_path
            model_path = os.path.join(model_save_dir, str(step))
            if not os.path.exists(model_save_dir):
                os.makedirs(model_save_dir)
            F.io.save_params(exe, model_path, program)

        if step == train_steps:
            break
Example #11
def dump_node_feat(config):
    log.info("Dump node feat starting...")
    id2str = [
        line.strip("\n").split("\t")[-1]
        for line in io.open(os.path.join(config.graph_work_path, "terms.txt"),
                            encoding=config.encoding)
    ]
    if "tiny" in config.ernie_name:
        tokenizer = ErnieTinyTokenizer.from_pretrained(config.ernie_name)
        #tokenizer.vocab = tokenizer.sp_model.vocab
        term_ids = [
            partial(term2id, tokenizer=tokenizer,
                    max_seqlen=config.max_seqlen)(s) for s in id2str
        ]
    else:
        tokenizer = ErnieTokenizer.from_pretrained(config.ernie_name)
        pool = multiprocessing.Pool()
        term_ids = pool.map(
            partial(term2id, tokenizer=tokenizer,
                    max_seqlen=config.max_seqlen), id2str)
        pool.terminate()
    node_feat_path = os.path.join(config.graph_work_path, "node_feat")
    if not os.path.exists(node_feat_path):
        os.makedirs(node_feat_path)
    np.save(os.path.join(config.graph_work_path, "node_feat", "term_ids.npy"),
            np.array(term_ids, np.uint16))
    log.info("Dump node feat done.")
Example #12
File: train.py Project: Yelrose/PGL
def train(dataloader, model, feature, criterion, optim, log_per_step=100):
    model.train()

    batch = 0
    total_loss = 0.
    total_acc = 0.
    total_sample = 0

    for g, sample_index, index, label in dataloader:
        batch += 1
        num_samples = len(index)

        g.tensor()
        sample_index = paddle.to_tensor(sample_index)
        index = paddle.to_tensor(index)
        label = paddle.to_tensor(label)

        feat = paddle.gather(feature, sample_index)
        pred = model(g, feat)
        pred = paddle.gather(pred, index)
        loss = criterion(pred, label)
        loss.backward()
        acc = paddle.metric.accuracy(input=pred, label=label, k=1)
        optim.step()
        optim.clear_grad()

        total_loss += loss.numpy() * num_samples
        total_acc += acc.numpy() * num_samples
        total_sample += num_samples

        if batch % log_per_step == 0:
            log.info("Batch %s %s-Loss %s %s-Acc %s" %
                     (batch, "train", loss.numpy(), "train", acc.numpy()))

    return total_loss / total_sample, total_acc / total_sample
Example #13
File: utils.py Project: WenjinW/PGL
def save_model(output_path, model, steps, opt, lr_scheduler, max_ckpt=2):
    if paddle.distributed.get_rank() == 0:
        output_dir = os.path.join(output_path, "model_%d" % steps)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        paddle.save(model.state_dict(),
                    os.path.join(output_dir, "ckpt.pdparams"))
        #         paddle.save({ "global_step": steps }, os.path.join(output_dir, "step"))
        #         paddle.save(opt.state_dict(), os.path.join(output_dir, "opt.pdparams"))
        #         paddle.save(lr_scheduler.state_dict(), os.path.join(output_dir, "lr_scheduler.pdparams"))

        log.info("save model %s" % output_dir)

        ckpt_paths = glob.glob(os.path.join(output_path, "model_*"))
        if len(ckpt_paths) > max_ckpt:

            def version(x):
                x = int(x.split("_")[-1])
                return x

            rm_ckpt_paths = sorted(ckpt_paths, key=version,
                                   reverse=True)[max_ckpt:]
            for ckpt_dir in rm_ckpt_paths:
                if os.path.exists(ckpt_dir):
                    shutil.rmtree(ckpt_dir)
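The rotation at the end (keep only the max_ckpt newest model_<step> directories) does not depend on Paddle; a standalone sketch using only the standard library:

import glob
import os
import shutil

def prune_checkpoints(output_path, max_ckpt=2):
    """Delete all but the `max_ckpt` most recent model_<step> directories."""
    ckpt_paths = glob.glob(os.path.join(output_path, "model_*"))
    newest_first = sorted(ckpt_paths, key=lambda p: int(p.split("_")[-1]), reverse=True)
    for ckpt_dir in newest_first[max_ckpt:]:  # everything older than the newest max_ckpt
        if os.path.isdir(ckpt_dir):
            shutil.rmtree(ckpt_dir)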
Example #14
File: train.py Project: zgsxwsdxg/PGL
def run_epoch(batch_iter,
              exe,
              program,
              prefix,
              model_loss,
              model_acc,
              epoch,
              log_per_step=100):
    batch = 0
    total_loss = 0.
    total_acc = 0.
    total_sample = 0
    start = time.time()
    for batch_feed_dict in batch_iter():
        batch += 1
        batch_loss, batch_acc = exe.run(program,
                                        fetch_list=[model_loss, model_acc],
                                        feed=batch_feed_dict)

        if batch % log_per_step == 0:
            log.info("Batch %s %s-Loss %s %s-Acc %s" %
                     (batch, prefix, batch_loss, prefix, batch_acc))

        num_samples = len(batch_feed_dict["node_index"])
        total_loss += batch_loss * num_samples
        total_acc += batch_acc * num_samples
        total_sample += num_samples
    end = time.time()

    log.info(
        "%s Epoch %s Loss %.5lf Acc %.5lf Speed(per batch) %.5lf sec" %
        (prefix, epoch, total_loss / total_sample, total_acc / total_sample,
         (end - start) / batch))
Example #15
def main(config):
    # Select Model
    model = Model.factory(config)

    # Build Train Edges
    data = TrainData(config.graph_path)

    # Build Train Data
    train_iter = GraphGenerator(
        graph_wrappers=model.graph_wrappers,
        batch_size=config.batch_size,
        data=data,
        samples=config.samples,
        num_workers=config.sample_workers,
        feed_name_list=[var.name for var in model.feed_list],
        use_pyreader=config.use_pyreader,
        phase="train",
        graph_data_path=config.graph_path,
        shuffle=True)

    log.info("build graph reader done.")

    learner = Learner.factory(config.learner_type)
    learner.build(model, train_iter, config)

    learner.start()
    learner.stop()
Example #16
def test_gen_speed(gen_func):
    cur_time = time.time()
    for idx, _ in enumerate(gen_func()):
        log.info("iter %s: %s s" % (idx, time.time() - cur_time))
        cur_time = time.time()
        if idx == 100:
            break
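One way to exercise this helper is with a dummy generator; the sketch below assumes the module-level log used throughout these examples is configured:

import time

def dummy_gen(n=200, delay=0.01):
    """Stand-in for a real sampling generator."""
    for i in range(n):
        time.sleep(delay)
        yield i

test_gen_speed(dummy_gen)  # logs the latency of each of the first 100 iterations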
Example #17
def main(args):
    import logging
    log.setLevel(logging.DEBUG)
    log.info("start")

    num_devices = len(F.cuda_places())
    model = DeepwalkModel(args.num_nodes, args.hidden_size, args.neg_num,
                          False, False, 1.)
    pyreader = model.pyreader
    loss = model.forward()

    train_steps = int(args.num_nodes * args.epoch / args.batch_size /
                      num_devices)
    optimization(args.lr * num_devices, loss, train_steps, args.optimizer)

    place = F.CUDAPlace(0)
    exe = F.Executor(place)
    exe.run(F.default_startup_program())

    graph = build_graph(args.num_nodes, args.edge_path)
    gen_func = build_gen_func(args, graph)

    pyreader.decorate_tensor_provider(gen_func)
    pyreader.start()

    train_prog = F.default_main_program()

    if args.warm_start_from_dir is not None:
        F.io.load_params(exe, args.warm_start_from_dir, train_prog)

    train_exe = get_parallel_exe(train_prog, loss)
    train(train_exe, exe, train_prog, loss, pyreader, args, train_steps)
Example #18
File: conv.py Project: WenjinW/PGL
    def __init__(self, config):
        super(GNNVirt, self).__init__()
        log.info("gnn_type is %s" % self.__class__.__name__)
        self.config = config

        self.atom_encoder = getattr(ME, self.config.atom_enc_type, ME.AtomEncoder)(
                self.config.emb_dim)

        self.virtualnode_embedding = self.create_parameter(
            shape=[1, self.config.emb_dim],
            dtype='float32',
            default_initializer=nn.initializer.Constant(value=0.0))

        self.convs = paddle.nn.LayerList()
        self.batch_norms = paddle.nn.LayerList()
        self.mlp_virtualnode_list = paddle.nn.LayerList()

        for layer in range(self.config.num_layers):
            self.convs.append(getattr(L, self.config.layer_type)(self.config))
            self.batch_norms.append(L.batch_norm_1d(self.config.emb_dim))

        for layer in range(self.config.num_layers - 1):
            self.mlp_virtualnode_list.append(
                    nn.Sequential(L.Linear(self.config.emb_dim, self.config.emb_dim), 
                        L.batch_norm_1d(self.config.emb_dim), 
                        nn.Swish(),
                        L.Linear(self.config.emb_dim, self.config.emb_dim), 
                        L.batch_norm_1d(self.config.emb_dim), 
                        nn.Swish())
                    )

        self.pool = gnn.GraphPool(pool_type="sum")
Example #19
def train_prog(exe, program, model, pyreader, args):
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    start = time.time()
    batch = 0
    total_loss = 0.
    total_acc = 0.
    total_sample = 0
    for epoch_idx in range(args.num_epoch):
        for step, batch_feed_dict in enumerate(pyreader()):
            try:
                cpu_time = time.time()
                batch += 1
                batch_loss, batch_acc = exe.run(
                    program,
                    feed=batch_feed_dict,
                    fetch_list=[model.loss, model.acc])

                end = time.time()
                if batch % args.log_per_step == 0:
                    log.info(
                        "Batch %s Loss %s Acc %s \t Speed(per batch) %.5lf/%.5lf sec"
                        % (batch, np.mean(batch_loss), np.mean(batch_acc),
                           (end - start) / batch, (end - cpu_time)))

                if step % args.steps_per_save == 0:
                    save_path = args.save_path
                    if trainer_id == 0:
                        model_path = os.path.join(save_path, "%s" % step)
                        fleet.save_persistables(exe, model_path)
            except Exception as e:
                log.info("Pyreader train error")
                log.exception(e)
Example #20
File: train.py Project: weihua916/PGL
def main(args):
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    graph = load(args.dataset)

    model = SkipGramModel(
        graph.num_nodes,
        args.embed_size,
        args.neg_num,
        sparse=not args.use_cuda)
    model = paddle.DataParallel(model)

    optim = Adam(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        weight_decay=args.weight_decay)

    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(
        train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.sample_workers,
        collate_fn=collate_fn)

    for epoch in tqdm.tqdm(range(args.epoch)):
        train_loss = train(model, data_loader, optim)
        log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
Example #21
File: test.py Project: WenjinW/PGL
def infer(config, output_path):
    model = getattr(M, config.model_type)(config)

    log.info("infer model from %s" % config.infer_from)
    model.set_state_dict(paddle.load(config.infer_from))

    log.info("loading data")
    ds = getattr(DS, config.dataset_type)(config)

    split_idx = ds.get_idx_split()
    test_ds = DS.Subset(ds, split_idx['test'], mode='test')
    log.info("Test exapmles: %s" % len(test_ds))

    test_loader = Dataloader(test_ds,
                             batch_size=config.valid_batch_size,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=DS.CollateFn(config))

    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    # ---------------- test ----------------------- #
    log.info("testing ...")
    pred_dict = evaluate(model, test_loader)

    test_output_path = os.path.join(config.output_dir, config.task_name)
    make_dir(test_output_path)
    test_output_file = os.path.join(test_output_path, "test_pred.npz")

    log.info("saving test result to %s" % test_output_file)
    np.savez_compressed(test_output_file,
                        pred_dict['y_pred'].astype(np.float32))
Example #22
File: dump_graph.py Project: GaoFengs/PGL
def dump_node_feat(args):
    log.info("Dump node feat starting...")
    id2str = np.load(os.path.join(args.outpath, "id2str.npy"), mmap_mode="r")
    pool = multiprocessing.Pool()
    tokenizer = FullTokenizer(args.vocab_file)
    term_ids = pool.map(partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen), id2str)
    np.save(os.path.join(args.outpath, "term_ids.npy"), np.array(term_ids))
    log.info("Dump node feat done.")
    pool.terminate()
Example #23
File: dist_graph.py Project: WenjinW/PGL
    def load_edges(self):
        for etype, file_or_dir in self.etype2files.items():
            file_list = [f for f in helper.get_files(file_or_dir)]
            filepath = ";".join(file_list)
            log.info("load edges of type %s from %s" % (etype, filepath))
            self._client.load_edge_file(etype, filepath, False)
            if self.symmetry:
                r_etype = helper.get_inverse_etype(etype)
                self._client.load_edge_file(r_etype, filepath, True)
Example #24
def run(dataset,
        feature,
        exe,
        program,
        loss,
        acc,
        phase="train",
        log_per_step=5,
        cpu_num=1):

    batch = 0
    total_loss = 0.
    total_acc = 0.
    total_sample = 0

    feed_list = []
    step = 0
    for g, sample_index, index, label in dataset:
        feed_dict = {
            "num_nodes": np.array(
                [g.num_nodes], dtype="int32"),
            "edges": g.edges.astype("int32"),
            "sample_index": sample_index.astype("int32"),
            "index": index.astype("int32"),
            "label": label.astype("int64").reshape(-1),
            "feature": feature[sample_index].astype("float32")
        }

        if len(feed_list) < cpu_num:
            feed_list.append(feed_dict)
        batch += 1

        if len(feed_list) == cpu_num:
            batch_index = []
            for feed_dict in feed_list:
                batch_index.append(feed_dict["index"])

            if len(feed_list) == 1:
                feed_list = feed_list[0]

            batch_loss, batch_acc = exe.run(program,
                                            feed=feed_list,
                                            fetch_list=[loss.name, acc.name])
            step += 1
            feed_list = []

            if step % log_per_step == 0:
                log.info("Batch %s %s-Loss %s %s-Acc %s" %
                         (batch, phase, np.mean(batch_loss), phase,
                          np.mean(batch_acc)))

            for n, index in enumerate(batch_index):
                total_acc += batch_acc[n] * len(index)
                total_loss += batch_loss[n] * len(index)
                total_sample += len(index)

    return total_loss / total_sample, total_acc / total_sample
Example #25
def random_split(dataset_size, split_ratio=0.9, seed=0, shuffle=True):
    """random splitter"""
    np.random.seed(seed)
    indices = list(range(dataset_size))
    np.random.shuffle(indices)
    split = int(split_ratio * dataset_size)
    train_idx, valid_idx = indices[:split], indices[split:]
    log.info("train_set : test_set == %d : %d" %
             (len(train_idx), len(valid_idx)))
    return train_idx, valid_idx
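Usage is straightforward; for example, a 90/10 split of 1000 samples:

train_idx, valid_idx = random_split(1000, split_ratio=0.9, seed=42)
assert len(train_idx) == 900 and len(valid_idx) == 100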
Example #26
def test(args):
    graph = build_graph(args.num_nodes, args.edge_path)
    gen_func = build_gen_func(args, graph)

    start = time.time()
    num = 10
    for idx, _ in enumerate(gen_func()):
        if idx % num == num - 1:
            log.info("%s" % (1.0 * (time.time() - start) / num))
            start = time.time()
Example #27
File: mol_encoder.py Project: WenjinW/PGL
    def __init__(self, emb_dim):
        super(BondEncoder, self).__init__()
        log.info("bond encoder type is %s" % self.__class__.__name__)

        self.bond_embedding_list = nn.LayerList()

        for i, dim in enumerate(full_bond_feature_dims):
            weight_attr = nn.initializer.XavierUniform()
            emb = paddle.nn.Embedding(dim, emb_dim, weight_attr=weight_attr)
            self.bond_embedding_list.append(emb)
Example #28
def dump_id2str_map(args):
    log.info("Dump id2str map starting...")
    id2str = np.array([
        line.strip("\n")
        for line in open(os.path.join(args.outpath, "terms.txt"),
                         "r",
                         encoding=args.encoding)
    ])
    np.save(os.path.join(args.outpath, "id2str.npy"), id2str)
    log.info("Dump id2str map done.")
Example #29
File: conv.py Project: WenjinW/PGL
    def __init__(self, config):
        super(JuncGNNVirt, self).__init__()
        log.info("gnn_type is %s" % self.__class__.__name__)
        self.config = config
        self.num_layers = config.num_layers
        self.drop_ratio = config.drop_ratio
        self.JK = config.JK
        self.residual = config.residual
        self.emb_dim = config.emb_dim
        self.gnn_type = config.gnn_type
        self.layer_type = config.layer_type

        if self.num_layers < 2:
            raise ValueError("Number of GNN layers must be greater than 1.")

        self.atom_encoder = getattr(ME, self.config.atom_enc_type, ME.AtomEncoder)(
                self.emb_dim)

        self.junc_embed = paddle.nn.Embedding(6000, self.emb_dim)

        ### set the initial virtual node embedding to 0.
        #  self.virtualnode_embedding = paddle.nn.Embedding(1, emb_dim)
        #  torch.nn.init.constant_(self.virtualnode_embedding.weight.data, 0)
        self.virtualnode_embedding = self.create_parameter(
            shape=[1, self.emb_dim],
            dtype='float32',
            default_initializer=nn.initializer.Constant(value=0.0))

        ### List of GNNs
        self.convs = nn.LayerList()
        ### batch norms applied to node embeddings
        self.batch_norms = nn.LayerList()

        ### List of MLPs to transform virtual node at every layer
        self.mlp_virtualnode_list = nn.LayerList()

        self.junc_convs = nn.LayerList()

        for layer in range(self.num_layers):
            self.convs.append(getattr(L, self.layer_type)(self.config))
            self.junc_convs.append(gnn.GINConv(self.emb_dim, self.emb_dim))

            self.batch_norms.append(L.batch_norm_1d(self.emb_dim))

        for layer in range(self.num_layers - 1):
            self.mlp_virtualnode_list.append(
                    nn.Sequential(L.Linear(self.emb_dim, self.emb_dim), 
                        L.batch_norm_1d(self.emb_dim), 
                        nn.Swish(),
                        L.Linear(self.emb_dim, self.emb_dim), 
                        L.batch_norm_1d(self.emb_dim), 
                        nn.Swish())
                    )

        self.pool = gnn.GraphPool(pool_type="sum")
Example #30
File: train.py Project: Yelrose/PGL
def main(args, config):
    dataset = load(args.dataset, args.feature_pre_normalize)

    graph = dataset.graph
    train_index = dataset.train_index
    train_label = dataset.train_label

    val_index = dataset.val_index
    val_label = dataset.val_label

    test_index = dataset.test_index
    test_label = dataset.test_label
    GraphModel = getattr(model, config.model_name)
    criterion = paddle.nn.loss.CrossEntropyLoss()

    dur = []

    best_test = []

    for run in range(args.runs):
        cal_val_acc = []
        cal_test_acc = []
        cal_val_loss = []
        cal_test_loss = []

        gnn_model = GraphModel(input_size=graph.node_feat["words"].shape[1],
                               num_class=dataset.num_classes,
                               **config)

        optim = Adam(learning_rate=config.learning_rate,
                     parameters=gnn_model.parameters(),
                     weight_decay=config.weight_decay)

        for epoch in tqdm.tqdm(range(args.epoch)):
            train_loss, train_acc = train(train_index, train_label, gnn_model,
                                          graph, criterion, optim)
            val_loss, val_acc = eval(val_index, val_label, gnn_model, graph,
                                     criterion)
            cal_val_acc.append(val_acc.numpy())
            cal_val_loss.append(val_loss.numpy())

            test_loss, test_acc = eval(test_index, test_label, gnn_model,
                                       graph, criterion)
            cal_test_acc.append(test_acc.numpy())
            cal_test_loss.append(test_loss.numpy())

        log.info(
            "Runs %s: Model: %s Best Test Accuracy: %f" %
            (run, config.model_name, cal_test_acc[np.argmin(cal_val_loss)]))

        best_test.append(cal_test_acc[np.argmin(cal_val_loss)])

    log.info("Dataset: %s Best Test Accuracy: %f ( stddev: %f )" %
             (args.dataset, np.mean(best_test), np.std(best_test)))