示例#1
0
def infer(args):
    """Extract features for the test split and save them as a ``.npy`` file.

    Loads the dataset named by ``args.dataset_name``, copies its task
    metadata (``num_tasks``, ``eval_metric``, ``task_type``) onto ``args``,
    runs ``MgfModel`` prediction from the checkpoint at
    ``args.model_path_for_infer`` and writes the concatenated outputs to
    ``dataset/<dataset_name with '-' replaced by '_'>/soft_mgf_feat.npy``.

    Args:
        args: Run configuration. Read here: ``dataset_name``, ``batch_size``,
            ``model_config`` and ``model_path_for_infer``. Mutated here:
            ``num_class``, ``eval_metric`` and ``task_type``.
    """
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)

    # Publish dataset metadata on ``args`` before the dataset, collate
    # function and learner are built from it.
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    test_split = MolDataset(args, raw_dataset, mode="test")
    collate = MgfCollateFn(args, mode="test")

    batch_loader = Dataloader(
        test_split,
        batch_size=args.batch_size,
        num_workers=1,
        collate_fn=collate,
    )
    predict_input = PDataset.from_generator_func(batch_loader)

    learner = propeller.Learner(MgfModel, args, args.model_config)

    # Drain the prediction iterator completely before concatenating.
    predictions = list(
        learner.predict(
            predict_input,
            ckpt_path=args.model_path_for_infer,
            split_batch=True,
        ))
    features = np.concatenate(predictions)

    log.info("saving features")
    dataset_dir = args.dataset_name.replace("-", "_")
    np.save("dataset/%s/soft_mgf_feat.npy" % (dataset_dir), features)
示例#2
0
def predict(config):
    """Run inference over all nodes and write one feature line per result.

    For every ``(user_feat, user_real_index)`` pair yielded by the learner,
    a line ``"<term>\\t<tostr(user_feat)>\\n"`` is written to
    ``<config.model_dir>/part-<trainer_id>``, where ``<term>`` is the line of
    ``terms.txt`` at position ``user_real_index`` and ``trainer_id`` comes
    from the ``PADDLE_TRAINER_ID`` environment variable (default ``0``).

    Both the vocabulary file and the output file are handled through
    context managers so they are closed even when prediction raises.

    Args:
        config: Run configuration. Read here: ``graph_work_path``,
            ``infer_batch_size``, ``samples``, ``sample_workers``,
            ``neg_type``, ``ernie_name``, ``encoding``, ``model_dir`` and,
            optionally, the ``"infer_model"`` key. Mutated here: ``cls_id``
            and ``ernie_config``.
    """
    # Build prediction data; the trainer id selects this process's output part.
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))

    num_nodes = int(
        np.load(os.path.join(config.graph_work_path, "num_nodes.npy")))
    data = PredictData(num_nodes)
    predict_iter = BatchGraphGenerator(graph_wrappers=[1],
                                       batch_size=config.infer_batch_size,
                                       data=data,
                                       samples=config.samples,
                                       num_workers=config.sample_workers,
                                       feed_name_list=None,
                                       use_pyreader=False,
                                       phase="predict",
                                       graph_data_path=config.graph_work_path,
                                       shuffle=False,
                                       neg_type=config.neg_type)
    predict_ds = Dataset.from_generator_func(predict_iter)

    predict_ds.name = "predict"
    # Replace the leading dimension of every shape with -1.
    predict_ds.data_shapes = [[-1] + list(shape[1:])
                              for shape in predict_ds.data_shapes]

    tokenizer = load_tokenizer(config.ernie_name)
    config.cls_id = tokenizer.cls_id

    # Only the config dict is used here; the parameter path is discarded.
    ernie_cfg_dict, _ = PretrainedModelLoader.from_pretrained(
        config.ernie_name)
    config.ernie_config = ernie_cfg_dict

    est = propeller.Learner(ERNIESageLinkPredictModel, config, config)

    with io.open(os.path.join(config.graph_work_path, "terms.txt"),
                 encoding=config.encoding) as fin:
        id2str = fin.readlines()

    with io.open("%s/part-%s" % (config.model_dir, trainer_id),
                 "w",
                 encoding="utf8") as fout:
        if "infer_model" in config:
            predict_result_iter = est.predict(predict_ds,
                                              ckpt_path=config["infer_model"])
        else:
            # NOTE(review): ckpt=-1 presumably selects the latest checkpoint
            # in the model directory -- confirm against propeller.
            predict_result_iter = est.predict(predict_ds, ckpt=-1)

        for user_feat, user_real_index in predict_result_iter:
            sri = id2str[int(user_real_index)].strip("\n")
            fout.write("{}\t{}\n".format(sri, tostr(user_feat)))
示例#3
0
        feature_column = propeller.data.FeatureColumns([
            propeller.data.TextColumn('title',
                                      unk_id=unk_id,
                                      vocab_dict=tokenizer.vocab,
                                      tokenizer=tokenizer.tokenize),
            propeller.data.TextColumn('comment',
                                      unk_id=unk_id,
                                      vocab_dict=tokenizer.vocab,
                                      tokenizer=tokenizer.tokenize),
        ])

        def map_fn(seg_a, seg_b):
            """Truncate a segment pair and build the model inputs from it.

            Both segments are passed through ``tokenizer.truncate`` with
            ``seqlen=args.max_seqlen`` and then handed to
            ``tokenizer.build_for_ernie``; its two results are returned as a
            ``(sentence, segments)`` tuple.
            """
            short_a, short_b = tokenizer.truncate(
                seg_a, seg_b, seqlen=args.max_seqlen)
            built_sentence, built_segments = tokenizer.build_for_ernie(
                short_a, short_b)
            return built_sentence, built_segments


        predict_ds = feature_column.build_dataset_from_stdin('predict') \
                               .map(map_fn) \
                               .padded_batch(hparams.batch_size) \

        predict_ds.data_shapes = shapes[:-1]
        predict_ds.data_types = types[:-1]

        est = propeller.Learner(model_fn, run_config, hparams)
        for res, in est.predict(predict_ds, ckpt=-1):
            print('%d\t%.5f\t%.5f\t%.5f' %
                  (np.argmax(res), res[0], res[1], res[2]))
示例#4
0
文件: pretrain.py 项目: leo038/ERNIE
                                     args.data_dir,
                                     vocab=tokenizer.vocab,
                                     hparams=hparams,
                                     args=args)

    seq_shape = [-1, args.max_seqlen]
    ints_shape = [
        -1,
    ]
    shapes = (seq_shape, seq_shape, ints_shape, [-1, 2], ints_shape)
    types = ('int64', 'int64', 'int64', 'int64', 'int64')

    train_ds.data_shapes = shapes
    train_ds.data_types = types
    ws = None

    #varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$')
    varname_to_warmstart = re.compile(r'.*')
    if args.from_pretrained is not None:
        warm_start_dir = os.path.join(args.from_pretrained, 'params')
        ws = propeller.WarmStartSetting(
            predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.
            path.exists(os.path.join(warm_start_dir, v.name)),
            from_dir=warm_start_dir)

    ernie_learner = propeller.Learner(ernie_pretrain_model_fn,
                                      run_config,
                                      params=hparams,
                                      warm_start_setting=ws)
    ernie_learner.train(train_ds)
示例#5
0
            },
            warm_start_setting=ws,
            exporters=[best_exporter])

        for k in best_exporter._best['dev'].keys():
            if 'loss' in k:
                continue
            dev_v = best_exporter._best['dev'][k]
            test_v = best_exporter._best['test'][k]
            print('dev_%s\t%.5f\ntest_%s\t%.5f' % (k, dev_v, k, test_v))
    else:
        predict_ds = make_sequence_label_dataset_from_stdin(
            name='pred',
            tokenizer=tokenizer,
            batch_size=hparams.batch_size,
            max_seqlen=args.max_seqlen)

        shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1])
        types = ('int64', 'int64', 'int64')

        predict_ds.data_shapes = shapes
        predict_ds.data_types = types

        rev_label_map = {i: v for i, v in enumerate(label_list)}
        learner = propeller.Learner(SequenceLabelErnieModel, run_config,
                                    hparams)
        for pred, _ in learner.predict(predict_ds, ckpt=-1):
            pred_str = ' '.join(
                [rev_label_map[idx] for idx in np.argmax(pred, 1).tolist()])
            print(pred_str)
示例#6
0
            sentence, segments = utils.data.build_2_pair(
                seg_a,
                seg_b,
                max_seqlen=args.max_seqlen,
                cls_id=cls_id,
                sep_id=sep_id)
            return sentence, segments, qid

        def after(sentence, segments, qid):
            """Pass all three fields through ``utils.data.expand_dims``.

            Returns the three results as a ``(sentence, segments, qid)``
            tuple.
            """
            expanded = utils.data.expand_dims(sentence, segments, qid)
            # Unpack to exactly three values, as the pipeline expects a triple.
            out_sentence, out_segments, out_qid = expanded
            return out_sentence, out_segments, out_qid

        predict_ds = feature_column.build_dataset_from_stdin('predict') \
                               .map(before) \
                               .padded_batch(hparams.batch_size, (0, 0, 0)) \
                               .map(after)

        predict_ds.data_shapes = shapes[:-1]
        predict_ds.data_types = types[:-1]

        est = propeller.Learner(RankingErnieModel, run_config, hparams)
        for qid, res in est.predict(predict_ds, ckpt=-1):
            print('%d\t%d\t%.5f\t%.5f' %
                  (qid[0], np.argmax(res), res[0], res[1]))

        #for i in predict_ds:
        #    sen = i[0]
        #    for ss in np.squeeze(sen):
        #        print(' '.join(map(str, ss)))
示例#7
0
                                      vocab_dict=vocab,
                                      tokenizer=tokenizer_func),
            propeller.data.LabelColumn('label'),
        ])

        def before(seg_a):
            """Build the ``(sentence, segments)`` pair for a single segment.

            Delegates to ``utils.data.build_1_pair`` with the enclosing
            scope's ``args.max_seqlen``, ``cls_id`` and ``sep_id``.
            """
            built_sentence, built_segments = utils.data.build_1_pair(
                seg_a,
                max_seqlen=args.max_seqlen,
                cls_id=cls_id,
                sep_id=sep_id,
            )
            return built_sentence, built_segments

        def after(sentence, segments):
            """Pass both fields through ``utils.data.expand_dims``.

            Returns the two results as a ``(sentence, segments)`` tuple.
            """
            expanded = utils.data.expand_dims(sentence, segments)
            # Unpack to exactly two values, as the pipeline expects a pair.
            out_sentence, out_segments = expanded
            return out_sentence, out_segments
        predict_ds = feature_column.build_dataset_from_stdin('predict') \
                               .map(before) \
                               .padded_batch(hparams.batch_size, (0, 0)) \
                               .map(after)
        shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1])
        types = ('int64', 'int64')

        predict_ds.data_shapes = shapes
        predict_ds.data_types = types
        finetuned_model = propeller.Learner(ClassificationErnieModel,
                                            run_config, hparams)
        for logits, in finetuned_model.predict(
                predict_ds, ckpt=-1):  # ckpt=-1 means last step
            print(np.argmax(logits))