Example #1
File: pretrain.py  Project: leo038/ERNIE
    def bb_to_segments(filename):
        ds = Dataset.from_record_file(filename).map(_parse_gz)

        def gen():
            buf, size = [], 0
            iterator = iter(ds)
            while True:
                try:
                    doc, doc_seg = next(iterator)
                except StopIteration:
                    # record file exhausted; don't leak StopIteration out of a generator (PEP 479)
                    break
                for line, line_seg in zip(doc, doc_seg):
                    #line = np.array(sp_model.SampleEncodeAsIds(line, -1, 0.1), dtype=np.int64) # 0.1 means large variance on sentence piece result
                    if len(line) == 0:
                        continue
                    line = np.array(line)
                    line_seg = np.array(line_seg)
                    size += len(line)
                    buf.append(np.stack([line, line_seg]).transpose())
                    if size > max_input_seqlen:
                        yield buf,
                        buf, size = [], 0
                if len(buf) != 0:
                    yield buf,
                    buf, size = [], 0

        return Dataset.from_generator_func(gen)
Example #2
def train(config):
    # Build Train Data
    data = TrainData(config.graph_work_path)
    train_iter = BatchGraphGenerator(graph_wrappers=[1],
                                     batch_size=config.batch_size,
                                     data=data,
                                     samples=config.samples,
                                     num_workers=config.sample_workers,
                                     feed_name_list=None,
                                     use_pyreader=False,
                                     phase="train",
                                     graph_data_path=config.graph_work_path,
                                     shuffle=True,
                                     neg_type=config.neg_type)
    train_ds = Dataset.from_generator_func(train_iter).repeat(config.epochs)
    dev_ds = Dataset.from_generator_func(train_iter)

    ernie_cfg_dict, ernie_param_path = PretrainedModelLoader.from_pretrained(
        config.ernie_name)

    if "warm_start_from" not in config:
        warm_start_from = ernie_param_path
    else:
        ernie_param_path = config.ernie_param_path

    if "ernie_config" not in config:
        config.ernie_config = ernie_cfg_dict

    ws = propeller.WarmStartSetting(
        predicate_fn=lambda v: os.path.exists(
            os.path.join(warm_start_from, v.name)),
        from_dir=warm_start_from)

    train_ds.name = "train"
    train_ds.data_shapes = [[-1] + list(shape[1:])
                            for shape in train_ds.data_shapes]
    dev_ds.name = "dev"
    dev_ds.data_shapes = [[-1] + list(shape[1:])
                          for shape in dev_ds.data_shapes]

    tokenizer = load_tokenizer(config.ernie_name)
    config.cls_id = tokenizer.cls_id

    propeller.train.train_and_eval(
        model_class_or_model_fn=ERNIESageLinkPredictModel,
        params=config,
        run_config=config,
        train_dataset=train_ds,
        eval_dataset={"eval": dev_ds},
        warm_start_setting=ws,
    )
Example #3
def infer(args):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    test_ds = MolDataset(args, raw_dataset, mode="test")

    fn = MgfCollateFn(args, mode="test")

    test_loader = Dataloader(test_ds,
                             batch_size=args.batch_size,
                             num_workers=1,
                             collate_fn=fn)
    test_loader = PDataset.from_generator_func(test_loader)

    est = propeller.Learner(MgfModel, args, args.model_config)

    mgf_list = []
    for soft_mgf in est.predict(test_loader,
                                ckpt_path=args.model_path_for_infer,
                                split_batch=True):
        mgf_list.append(soft_mgf)

    mgf = np.concatenate(mgf_list)
    log.info("saving features")
    np.save(
        "dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")),
        mgf)
Example #4
def interleave(ds1, ds2):
    """Alternate elements from ds1 and ds2; once the shorter dataset is
    exhausted, keep yielding the rest of the longer one."""
    def gen():
        for i, j in six.moves.zip_longest(iter(ds1), iter(ds2)):
            if i is not None:
                yield i
            if j is not None:
                yield j

    return Dataset.from_generator_func(gen)
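
A minimal usage sketch for interleave (not from the original project): the toy generators, their element type, and the printing loop are assumptions for illustration; only Dataset.from_generator_func and iter(ds) are taken from the examples on this page, and the import path mirrors the propeller.data.Dataset usage in Example #6.

import numpy as np
from propeller.data import Dataset  # assumed import path, per propeller.data.Dataset in Example #6


def evens():  # hypothetical toy generator
    for i in range(0, 6, 2):
        yield np.int64(i)


def odds():  # hypothetical toy generator
    for i in range(1, 6, 2):
        yield np.int64(i)


mixed = interleave(Dataset.from_generator_func(evens),
                   Dataset.from_generator_func(odds))
for x in iter(mixed):
    print(x)  # expected order: 0, 1, 2, 3, 4, 5 (zip_longest alternates the two streams)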
Example #5
def predict(config):
    # Build Predict Data
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))

    num_nodes = int(
        np.load(os.path.join(config.graph_work_path, "num_nodes.npy")))
    data = PredictData(num_nodes)
    predict_iter = BatchGraphGenerator(graph_wrappers=[1],
                                       batch_size=config.infer_batch_size,
                                       data=data,
                                       samples=config.samples,
                                       num_workers=config.sample_workers,
                                       feed_name_list=None,
                                       use_pyreader=False,
                                       phase="predict",
                                       graph_data_path=config.graph_work_path,
                                       shuffle=False,
                                       neg_type=config.neg_type)
    predict_ds = Dataset.from_generator_func(predict_iter)

    predict_ds.name = "predict"
    predict_ds.data_shapes = [[-1] + list(shape[1:])
                              for shape in predict_ds.data_shapes]

    tokenizer = load_tokenizer(config.ernie_name)
    config.cls_id = tokenizer.cls_id

    ernie_cfg_dict, ernie_param_path = PretrainedModelLoader.from_pretrained(
        config.ernie_name)
    config.ernie_config = ernie_cfg_dict

    est = propeller.Learner(ERNIESageLinkPredictModel, config, config)

    id2str = io.open(os.path.join(config.graph_work_path, "terms.txt"),
                     encoding=config.encoding).readlines()
    fout = io.open("%s/part-%s" % (config.model_dir, trainer_id),
                   "w",
                   encoding="utf8")

    if "infer_model" in config:
        predict_result_iter = est.predict(predict_ds,
                                          ckpt_path=config["infer_model"])
    else:
        predict_result_iter = est.predict(predict_ds, ckpt=-1)

    for user_feat, user_real_index in predict_result_iter:
        sri = id2str[int(user_real_index)].strip("\n")
        line = "{}\t{}\n".format(sri, tostr(user_feat))
        fout.write(line)

    fout.close()
Example #6
File: pretrain.py  Project: leo038/ERNIE
def make_pretrain_dataset(name, dir, vocab, hparams, args):
    gz_files = glob(dir)
    if not gz_files:
        raise ValueError('train data not found in %s' % dir)

    log.info('read from %s' % '\n'.join(gz_files))
    max_input_seqlen = args.max_seqlen
    max_pretrain_seqlen = lambda: (max_input_seqlen if r.random() > 0.15 else
                                   r.randint(1, max_input_seqlen))  # short sentence rate

    def _parse_gz(record_str):  # function that takes python_str as input
        ex = propeller.data.example_pb2.SequenceExample()
        ex.ParseFromString(record_str)
        doc = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['txt'].feature
        ]
        doc_seg = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['segs'].feature
        ]
        return doc, doc_seg

    def bb_to_segments(filename):
        ds = Dataset.from_record_file(filename).map(_parse_gz)

        def gen():
            buf, size = [], 0
            iterator = iter(ds)
            while True:
                try:
                    doc, doc_seg = next(iterator)
                except StopIteration:
                    # record file exhausted; don't leak StopIteration out of a generator (PEP 479)
                    break
                for line, line_seg in zip(doc, doc_seg):
                    #line = np.array(sp_model.SampleEncodeAsIds(line, -1, 0.1), dtype=np.int64) # 0.1 means large variance on sentence piece result
                    if len(line) == 0:
                        continue
                    line = np.array(line)
                    line_seg = np.array(line_seg)
                    size += len(line)
                    buf.append(np.stack([line, line_seg]).transpose())
                    if size > max_input_seqlen:
                        yield buf,
                        buf, size = [], 0
                if len(buf) != 0:
                    yield buf,
                    buf, size = [], 0

        return Dataset.from_generator_func(gen)

    def sample_negative(dataset):
        def gen():
            iterator = iter(dataset)
            while True:
                chunk_a, = next(iterator)
                #chunk_b, = next(iterator)

                seqlen = max_pretrain_seqlen()
                seqlen_a = r.randint(1, seqlen)
                seqlen_b = seqlen - seqlen_a
                len_a = list(accumulate([len(c) for c in chunk_a]))
                buf_a = [c for c, l in zip(chunk_a, len_a)
                         if l < seqlen_a]  #always take the first one
                buf_b = [
                    c for c, l in zip(chunk_a, len_a) if seqlen_a <= l < seqlen
                ]

                if r.random() < 0.5:  #pos or neg
                    label = np.int64(1)
                else:
                    label = np.int64(0)
                    buf_a, buf_b = buf_b, buf_a

                if not (len(buf_a) and len(buf_b)):
                    continue
                a = np.concatenate(buf_a)
                b = np.concatenate(buf_b)
                #log.debug(a)
                #log.debug(b)
                sample, seg_info, token_type = build_pair(
                    a, b, args.max_seqlen,
                    vocab)  #negative sample might exceed max seqlen
                yield sample, seg_info, token_type, label

        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def after(sentence, seg_info, segments, label):
        batch_size, seqlen = sentence.shape
        sentence, mask_pos, mlm_label = apply_mask(sentence, seg_info,
                                                   args.mask_rate,
                                                   hparams.vocab_size, vocab)

        ra = r.random()
        if ra < args.check:
            print('***')
            print('\n'.join([
                str(j) + '\t' + '|'.join(map(str, i))
                for i, j in zip(sentence.tolist(), label)
            ]))
            print('***')
            print('\n'.join(['|'.join(map(str, i))
                             for i in seg_info.tolist()]))
            print('***')
            print('|'.join(map(str, mlm_label.tolist())))
            print('***')

        return sentence, segments, mlm_label, mask_pos, label

    # pretrain pipeline
    dataset = Dataset.from_list(gz_files)
    if propeller.train.distribution.status.mode == propeller.train.distribution.DistributionMode.NCCL:
        log.info('Apply sharding in distribution env')
        dataset = dataset.shard(
            propeller.train.distribution.status.num_replica,
            propeller.train.distribution.status.replica_id)
    dataset = dataset.repeat().shuffle(buffer_size=len(gz_files))

    dataset = dataset.interleave(map_fn=bb_to_segments,
                                 cycle_length=len(gz_files),
                                 block_length=1)
    dataset = dataset.shuffle(
        buffer_size=1000)  #must shuffle to ensure negative sample randomness
    dataset = sample_negative(dataset)
    dataset = dataset.padded_batch(hparams.batch_size, (0, 0, 0, 0)).map(after)
    dataset.name = name
    return dataset
Example #7
def train(args, pretrained_model_config=None):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    train_ds = MolDataset(args, raw_dataset)

    args.eval_steps = math.ceil(len(train_ds) / args.batch_size)
    log.info("Total %s steps (eval_steps) every epoch." % (args.eval_steps))

    fn = MgfCollateFn(args)

    train_loader = Dataloader(train_ds,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=args.shuffle,
                              stream_shuffle_size=args.shuffle_size,
                              collate_fn=fn)

    # for evaluating
    eval_train_loader = train_loader
    eval_train_loader = PDataset.from_generator_func(eval_train_loader)

    train_loader = multi_epoch_dataloader(train_loader, args.epochs)
    train_loader = PDataset.from_generator_func(train_loader)

    if args.warm_start_from is not None:
        # warm start setting
        def _fn(v):
            if not isinstance(v, F.framework.Parameter):
                return False
            if os.path.exists(os.path.join(args.warm_start_from, v.name)):
                return True
            else:
                return False

        ws = propeller.WarmStartSetting(predicate_fn=_fn,
                                        from_dir=args.warm_start_from)
    else:
        ws = None

    def cmp_fn(old, new):
        if old['eval'][args.metrics] - new['eval'][args.metrics] > 0:
            log.info("best %s eval result: %s" % (args.metrics, new['eval']))
            return True
        else:
            return False

    if args.log_id is not None:
        save_best_model = int(args.log_id) == 5
    else:
        save_best_model = True
    best_exporter = propeller.exporter.BestResultExporter(
        args.output_dir, (cmp_fn, save_best_model))

    eval_datasets = {"eval": eval_train_loader}

    propeller.train.train_and_eval(
        model_class_or_model_fn=MgfModel,
        params=pretrained_model_config,
        run_config=args,
        train_dataset=train_loader,
        eval_dataset=eval_datasets,
        warm_start_setting=ws,
        exporters=[best_exporter],
    )