def infer(args):
    """Predict soft MGF features for the test split and save them to disk.

    NOTE(review): relies on project classes (GraphPropPredDataset, MolDataset,
    MgfCollateFn, MgfModel, propeller.Learner) whose contracts are not visible
    in this file — behavior documented here is what the calls literally show.
    """
    log.info("loading data")
    dataset = GraphPropPredDataset(name=args.dataset_name)
    # Propagate dataset metadata onto the shared args namespace so the model
    # and evaluator can read it.
    args.num_class = dataset.num_tasks
    args.eval_metric = dataset.eval_metric
    args.task_type = dataset.task_type

    test_dataset = MolDataset(args, dataset, mode="test")
    collate = MgfCollateFn(args, mode="test")
    loader = Dataloader(
        test_dataset,
        batch_size=args.batch_size,
        num_workers=1,
        collate_fn=collate)
    loader = PDataset.from_generator_func(loader)

    learner = propeller.Learner(MgfModel, args, args.model_config)
    # Gather every per-batch prediction, then stack into a single array.
    batches = [
        soft_mgf
        for soft_mgf in learner.predict(
            loader, ckpt_path=args.model_path_for_infer, split_batch=True)
    ]
    mgf = np.concatenate(batches)

    log.info("saving features")
    np.save(
        "dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")),
        mgf)
def predict(config):
    """Predict node embeddings with ERNIESage and write them to a part file.

    Writes one "<term>\\t<embedding>" line per predicted node into
    ``<model_dir>/part-<trainer_id>``.

    Fix over the original: both file handles are now managed with ``with``
    blocks, so they are closed even if prediction raises (the original never
    closed the ``terms.txt`` handle at all, and leaked ``fout`` on error).
    """
    # Build Train Data
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    num_nodes = int(
        np.load(os.path.join(config.graph_work_path, "num_nodes.npy")))

    data = PredictData(num_nodes)
    predict_iter = BatchGraphGenerator(
        graph_wrappers=[1],
        batch_size=config.infer_batch_size,
        data=data,
        samples=config.samples,
        num_workers=config.sample_workers,
        feed_name_list=None,
        use_pyreader=False,
        phase="predict",
        graph_data_path=config.graph_work_path,
        shuffle=False,
        neg_type=config.neg_type)
    predict_ds = Dataset.from_generator_func(predict_iter)
    predict_ds.name = "predict"
    # Force the leading (batch) dimension of every feed to be dynamic.
    predict_ds.data_shapes = [[-1] + list(shape[1:])
                              for shape in predict_ds.data_shapes]

    tokenizer = load_tokenizer(config.ernie_name)
    config.cls_id = tokenizer.cls_id
    ernie_cfg_dict, ernie_param_path = PretrainedModelLoader.from_pretrained(
        config.ernie_name)
    config.ernie_config = ernie_cfg_dict

    est = propeller.Learner(ERNIESageLinkPredictModel, config, config)

    # Index -> term mapping; close the handle deterministically.
    with io.open(
            os.path.join(config.graph_work_path, "terms.txt"),
            encoding=config.encoding) as f:
        id2str = f.readlines()

    # Prefer an explicitly configured checkpoint; otherwise the latest one.
    if "infer_model" in config:
        predict_result_iter = est.predict(
            predict_ds, ckpt_path=config["infer_model"])
    else:
        predict_result_iter = est.predict(predict_ds, ckpt=-1)

    # Each prediction yields (embedding, original row index); output is
    # sharded per trainer.
    with io.open(
            "%s/part-%s" % (config.model_dir, trainer_id),
            "w",
            encoding="utf8") as fout:
        for user_feat, user_real_index in predict_result_iter:
            sri = id2str[int(user_real_index)].strip("\n")
            line = "{}\t{}\n".format(sri, tostr(user_feat))
            fout.write(line)
# Two text inputs ('title', 'comment'), both tokenized with the shared ERNIE
# tokenizer and vocabulary.
def _text_column(field_name):
    # Small factory to avoid repeating identical TextColumn arguments.
    return propeller.data.TextColumn(
        field_name,
        unk_id=unk_id,
        vocab_dict=tokenizer.vocab,
        tokenizer=tokenizer.tokenize)


feature_column = propeller.data.FeatureColumns(
    [_text_column('title'), _text_column('comment')])


def map_fn(seg_a, seg_b):
    # Truncate the pair to fit max_seqlen, then pack into ERNIE input ids
    # and segment ids.
    truncated_a, truncated_b = tokenizer.truncate(
        seg_a, seg_b, seqlen=args.max_seqlen)
    sentence, segments = tokenizer.build_for_ernie(truncated_a, truncated_b)
    return sentence, segments


predict_ds = (feature_column.build_dataset_from_stdin('predict')
              .map(map_fn)
              .padded_batch(hparams.batch_size))

# Drop the last slot of the upstream shapes/types (presumably the label
# column, which prediction input does not carry).
predict_ds.data_shapes = shapes[:-1]
predict_ds.data_types = types[:-1]

est = propeller.Learner(model_fn, run_config, hparams)

# Emit "argmax<TAB>p0<TAB>p1<TAB>p2" for every input read from stdin.
for res, in est.predict(predict_ds, ckpt=-1):
    print('%d\t%.5f\t%.5f\t%.5f' % (np.argmax(res), res[0], res[1], res[2]))
    # (continuation) closing arguments of a dataset-builder call that opens
    # above this chunk — TODO confirm against the full file.
    args.data_dir, vocab=tokenizer.vocab, hparams=hparams, args=args)

# Feed layout taken from the tuples below: two [-1, max_seqlen] sequences,
# a flat column, a [-1, 2] column, and another flat column — all int64.
seq_shape = [-1, args.max_seqlen]
ints_shape = [
    -1,
]
shapes = (seq_shape, seq_shape, ints_shape, [-1, 2], ints_shape)
types = ('int64', 'int64', 'int64', 'int64', 'int64')
train_ds.data_shapes = shapes
train_ds.data_types = types

ws = None
# Alternative, narrower warm-start filter kept for reference:
#varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$')
varname_to_warmstart = re.compile(r'.*')  # currently matches every variable
if args.from_pretrained is not None:
    warm_start_dir = os.path.join(args.from_pretrained, 'params')
    # Warm-start a variable only if its name matches AND its parameter file
    # actually exists under the pretrained 'params' directory.
    ws = propeller.WarmStartSetting(
        predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.
        exists(os.path.join(warm_start_dir, v.name)),
        from_dir=warm_start_dir)

ernie_learner = propeller.Learner(
    ernie_pretrain_model_fn,
    run_config,
    params=hparams,
    warm_start_setting=ws)
ernie_learner.train(train_ds)
    # (continuation) closing arguments of a train/eval call that opens above
    # this chunk — TODO confirm against the full file.
    }, warm_start_setting=ws, exporters=[best_exporter])
    # Report the best dev/test metrics captured by the exporter, skipping
    # loss entries.
    for k in best_exporter._best['dev'].keys():
        if 'loss' in k:
            continue
        dev_v = best_exporter._best['dev'][k]
        test_v = best_exporter._best['test'][k]
        print('dev_%s\t%.5f\ntest_%s\t%.5f' % (k, dev_v, k, test_v))
else:
    # Prediction branch: read tokenized sequences from stdin and print the
    # predicted label sequence for each input.
    predict_ds = make_sequence_label_dataset_from_stdin(
        name='pred',
        tokenizer=tokenizer,
        batch_size=hparams.batch_size,
        max_seqlen=args.max_seqlen)

    shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1])
    types = ('int64', 'int64', 'int64')
    predict_ds.data_shapes = shapes
    predict_ds.data_types = types

    # Map class indices back to their label strings for printing.
    rev_label_map = {i: v for i, v in enumerate(label_list)}

    learner = propeller.Learner(SequenceLabelErnieModel, run_config, hparams)
    # ckpt=-1 selects the last checkpoint (same convention as the sibling
    # predict scripts in this file).
    for pred, _ in learner.predict(predict_ds, ckpt=-1):
        pred_str = ' '.join(
            [rev_label_map[idx] for idx in np.argmax(pred, 1).tolist()])
        print(pred_str)
    # (continuation) body of a `before(...)` mapper whose def line is above
    # this chunk — TODO confirm against the full file. Packs a tokenized pair
    # into ERNIE inputs plus the query id.
    sentence, segments = utils.data.build_2_pair(
        seg_a, seg_b, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
    return sentence, segments, qid

def after(sentence, segments, qid):
    # Presumably adds a trailing unit dimension to each feed (name suggests
    # expand_dims) — TODO confirm against utils.data.
    sentence, segments, qid = utils.data.expand_dims(
        sentence, segments, qid)
    return sentence, segments, qid

# Read inputs from stdin, map to ERNIE feeds, pad each batch (pad value 0
# for all three feeds), then apply `after` post-batching.
predict_ds = feature_column.build_dataset_from_stdin('predict') \
    .map(before) \
    .padded_batch(hparams.batch_size, (0, 0, 0)) \
    .map(after)
# Drop the last slot of the upstream shapes/types (presumably the label
# column) — TODO confirm where `shapes`/`types` are defined.
predict_ds.data_shapes = shapes[:-1]
predict_ds.data_types = types[:-1]

est = propeller.Learner(RankingErnieModel, run_config, hparams)
# Emit "qid<TAB>argmax<TAB>p0<TAB>p1" per input.
for qid, res in est.predict(predict_ds, ckpt=-1):
    print('%d\t%d\t%.5f\t%.5f' % (qid[0], np.argmax(res), res[0], res[1]))
#for i in predict_ds:
#    sen = i[0]
#    for ss in np.squeeze(sen):
#        print(' '.join(map(str, ss)))
    # (continuation) closing entries of a FeatureColumns([...]) list that
    # opens above this chunk — TODO confirm against the full file.
    vocab_dict=vocab,
    tokenizer=tokenizer_func),
    propeller.data.LabelColumn('label'),
])

def before(seg_a):
    # Pack a single tokenized segment into ERNIE inputs (ids + segment ids),
    # truncated to max_seqlen with CLS/SEP markers.
    sentence, segments = utils.data.build_1_pair(
        seg_a, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
    return sentence, segments

def after(sentence, segments):
    # Presumably adds a trailing unit dimension to each feed (name suggests
    # expand_dims) — TODO confirm against utils.data.
    sentence, segments = utils.data.expand_dims(sentence, segments)
    return sentence, segments

# Read inputs from stdin, map to ERNIE feeds, pad each batch (pad value 0
# for both feeds), then apply `after` post-batching.
predict_ds = feature_column.build_dataset_from_stdin('predict') \
    .map(before) \
    .padded_batch(hparams.batch_size, (0, 0)) \
    .map(after)

# Matches the [batch, seqlen, 1] layout produced by `after` above.
shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1])
types = ('int64', 'int64')
predict_ds.data_shapes = shapes
predict_ds.data_types = types

finetuned_model = propeller.Learner(ClassificationErnieModel, run_config,
                                    hparams)
# Print the argmax class index for every input line.
for logits, in finetuned_model.predict(
        predict_ds, ckpt=-1):  # ckpt=-1 means last step
    print(np.argmax(logits))