Example #1
0
def train(config):
    """Train the ERNIESage link-prediction model with propeller.

    Args:
        config: a dict-like config object providing graph paths, batch /
            sampling settings, the pretrained ERNIE name and epoch count.

    Side effects: launches ``propeller.train.train_and_eval`` and mutates
    ``config`` (fills in ``cls_id`` and, if absent, ``ernie_config``).
    """
    # Build the training data pipeline over the preprocessed graph.
    data = TrainData(config.graph_work_path)
    train_iter = BatchGraphGenerator(graph_wrappers=[1],
                                     batch_size=config.batch_size,
                                     data=data,
                                     samples=config.samples,
                                     num_workers=config.sample_workers,
                                     feed_name_list=None,
                                     use_pyreader=False,
                                     phase="train",
                                     graph_data_path=config.graph_work_path,
                                     shuffle=True,
                                     neg_type=config.neg_type)
    # The dev dataset reuses the same generator but without the epoch repeat.
    train_ds = Dataset.from_generator_func(train_iter).repeat(config.epochs)
    dev_ds = Dataset.from_generator_func(train_iter)

    ernie_cfg_dict, ernie_param_path = PretrainedModelLoader.from_pretrained(
        config.ernie_name)

    # BUG FIX: the original `else` branch assigned `ernie_param_path`
    # instead of `warm_start_from`, which left `warm_start_from` undefined
    # (NameError below) whenever the config supplied its own path.
    if "warm_start_from" not in config:
        warm_start_from = ernie_param_path
    else:
        warm_start_from = config.warm_start_from

    if "ernie_config" not in config:
        config.ernie_config = ernie_cfg_dict

    # Warm-start only variables whose parameter file exists on disk.
    ws = propeller.WarmStartSetting(predicate_fn=lambda v: os.path.exists(
        os.path.join(warm_start_from, v.name)),
                                    from_dir=warm_start_from)

    # Propeller expects named datasets with a dynamic (-1) batch dimension.
    train_ds.name = "train"
    train_ds.data_shapes = [[-1] + list(shape[1:])
                            for shape in train_ds.data_shapes]
    dev_ds.name = "dev"
    dev_ds.data_shapes = [[-1] + list(shape[1:])
                          for shape in dev_ds.data_shapes]

    tokenizer = load_tokenizer(config.ernie_name)
    config.cls_id = tokenizer.cls_id

    propeller.train.train_and_eval(
        model_class_or_model_fn=ERNIESageLinkPredictModel,
        params=config,
        run_config=config,
        train_dataset=train_ds,
        eval_dataset={"eval": dev_ds},
        warm_start_setting=ws,
    )
Example #2
0
    def bb_to_segments(filename):
        """Stream (token, segment) pairs from a record file as padded-free
        segment batches.

        Each document's lines are stacked into ``[token_ids, segment_ids]``
        pairs and buffered until the total token count exceeds
        ``max_input_seqlen`` (a closure variable), at which point the buffer
        is emitted as a 1-tuple. Any remainder is flushed per document.
        """
        ds = Dataset.from_record_file(filename).map(_parse_gz)

        def gen():
            buf, size = [], 0
            # BUG FIX: the original used `while 1: next(iterator)` and let
            # StopIteration terminate the generator — under PEP 479
            # (Python 3.7+) that raises RuntimeError instead. A plain for
            # loop ends cleanly. (The unused outer `iterable = iter(ds)`
            # was also dropped.)
            for doc, doc_seg in ds:
                for line, line_seg in zip(doc, doc_seg):
                    if len(line) == 0:
                        continue
                    line = np.array(line)
                    line_seg = np.array(line_seg)
                    size += len(line)
                    buf.append(np.stack([line, line_seg]).transpose())
                    if size > max_input_seqlen:
                        yield buf,
                        buf, size = [], 0
                # Flush whatever is left of the current document.
                if len(buf) != 0:
                    yield buf,
                    buf, size = [], 0

        return Dataset.from_generator_func(gen)
Example #3
0
def infer(args):
    """Predict soft MGF features for the test split and save them as .npy."""
    log.info("loading data")
    dataset = GraphPropPredDataset(name=args.dataset_name)
    # Propagate dataset-derived settings onto the shared args namespace.
    args.num_class = dataset.num_tasks
    args.eval_metric = dataset.eval_metric
    args.task_type = dataset.task_type

    mol_test = MolDataset(args, dataset, mode="test")
    collate = MgfCollateFn(args, mode="test")
    loader = Dataloader(mol_test,
                        batch_size=args.batch_size,
                        num_workers=1,
                        collate_fn=collate)
    test_loader = PDataset.from_generator_func(loader)

    learner = propeller.Learner(MgfModel, args, args.model_config)

    # Collect the per-batch soft fingerprints emitted by the model.
    feature_batches = [
        soft_mgf
        for soft_mgf in learner.predict(test_loader,
                                        ckpt_path=args.model_path_for_infer,
                                        split_batch=True)
    ]

    features = np.concatenate(feature_batches)
    log.info("saving features")
    out_path = "dataset/%s/soft_mgf_feat.npy" % (
        args.dataset_name.replace("-", "_"))
    np.save(out_path, features)
Example #4
0
def interleave(ds1, ds2):
    """Alternate elements of *ds1* and *ds2*; the longer tail is kept."""

    def _merged():
        for left, right in six.moves.zip_longest(iter(ds1), iter(ds2)):
            # zip_longest pads the shorter stream with None — skip pads.
            if left is not None:
                yield left
            if right is not None:
                yield right

    return Dataset.from_generator_func(_merged)
Example #5
0
def predict(config):
    """Run ERNIESage inference over all graph nodes and dump embeddings.

    Writes one ``<term>\\t<embedding>`` line per node to
    ``<model_dir>/part-<trainer_id>``.

    Args:
        config: dict-like config with graph paths, batch settings, the
            pretrained ERNIE name and the output model directory.
    """
    # Build predict data: one entry per node in the preprocessed graph.
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))

    num_nodes = int(
        np.load(os.path.join(config.graph_work_path, "num_nodes.npy")))
    data = PredictData(num_nodes)
    predict_iter = BatchGraphGenerator(graph_wrappers=[1],
                                       batch_size=config.infer_batch_size,
                                       data=data,
                                       samples=config.samples,
                                       num_workers=config.sample_workers,
                                       feed_name_list=None,
                                       use_pyreader=False,
                                       phase="predict",
                                       graph_data_path=config.graph_work_path,
                                       shuffle=False,
                                       neg_type=config.neg_type)
    predict_ds = Dataset.from_generator_func(predict_iter)

    predict_ds.name = "predict"
    # Propeller expects a dynamic (-1) batch dimension.
    predict_ds.data_shapes = [[-1] + list(shape[1:])
                              for shape in predict_ds.data_shapes]

    tokenizer = load_tokenizer(config.ernie_name)
    config.cls_id = tokenizer.cls_id

    ernie_cfg_dict, ernie_param_path = PretrainedModelLoader.from_pretrained(
        config.ernie_name)
    config.ernie_config = ernie_cfg_dict

    est = propeller.Learner(ERNIESageLinkPredictModel, config, config)

    # FIX: the vocabulary file handle was never closed (readlines on an
    # anonymous open), and the output file was only closed on the happy
    # path; context managers now close both even if prediction raises.
    with io.open(os.path.join(config.graph_work_path, "terms.txt"),
                 encoding=config.encoding) as terms_file:
        id2str = terms_file.readlines()

    if "infer_model" in config:
        predict_result_iter = est.predict(predict_ds,
                                          ckpt_path=config["infer_model"])
    else:
        predict_result_iter = est.predict(predict_ds, ckpt=-1)

    with io.open("%s/part-%s" % (config.model_dir, trainer_id),
                 "w",
                 encoding="utf8") as fout:
        for user_feat, user_real_index in predict_result_iter:
            # Map the model's node index back to its original term string.
            sri = id2str[int(user_real_index)].strip("\n")
            line = "{}\t{}\n".format(sri, tostr(user_feat))
            fout.write(line)
0
def train(args, pretrained_model_config=None):
    """Train an MGF model via propeller's train_and_eval loop.

    Args:
        args: run configuration (dataset name, batch/loader settings,
            warm-start dir, metric name, output dir, ...). Mutated in
            place with dataset-derived fields.
        pretrained_model_config: model params forwarded to propeller.
    """
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    # Propagate dataset-derived settings onto the shared args namespace.
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    train_ds = MolDataset(args, raw_dataset)

    # Evaluate once per epoch worth of steps.
    args.eval_steps = math.ceil(len(train_ds) / args.batch_size)
    log.info("Total %s steps (eval_steps) every epoch." % (args.eval_steps))

    collate = MgfCollateFn(args)
    base_loader = Dataloader(train_ds,
                             batch_size=args.batch_size,
                             num_workers=args.num_workers,
                             shuffle=args.shuffle,
                             stream_shuffle_size=args.shuffle_size,
                             collate_fn=collate)

    # Evaluation reuses the single-pass loader; training cycles it for
    # args.epochs epochs.
    eval_train_loader = PDataset.from_generator_func(base_loader)
    train_loader = PDataset.from_generator_func(
        multi_epoch_dataloader(base_loader, args.epochs))

    ws = None
    if args.warm_start_from is not None:
        def _is_restorable_param(v):
            # Warm-start only real parameters whose file exists on disk.
            if not isinstance(v, F.framework.Parameter):
                return False
            return os.path.exists(os.path.join(args.warm_start_from, v.name))

        ws = propeller.WarmStartSetting(predicate_fn=_is_restorable_param,
                                        from_dir=args.warm_start_from)

    def cmp_fn(old, new):
        # A strictly lower metric value than the previous best wins.
        if new['eval'][args.metrics] < old['eval'][args.metrics]:
            log.info("best %s eval result: %s" % (args.metrics, new['eval']))
            return True
        return False

    # NOTE(review): when a log id is given, only id 5 persists the best
    # model — presumably one designated worker; confirm with callers.
    save_best_model = True if args.log_id is None else int(args.log_id) == 5
    best_exporter = propeller.exporter.BestResultExporter(
        args.output_dir, (cmp_fn, save_best_model))

    propeller.train.train_and_eval(
        model_class_or_model_fn=MgfModel,
        params=pretrained_model_config,
        run_config=args,
        train_dataset=train_loader,
        eval_dataset={"eval": eval_train_loader},
        warm_start_setting=ws,
        exporters=[best_exporter],
    )