Example #1
def save(args):
    """Save inference model."""
    gpu_id = 0
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    model.save_inference_model(args.inference_model_path)
    return
Example #2
def evaluate(args):
    """Evaluation main function."""
    if args.is_distributed:
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    # setup task and model
    task = tasks.create_task(args)
    model = models.create_model(args, place)

    # setup dataset
    eval_generator = task.get_data_loader(
        model,
        input_file=args.eval_file,
        num_part=model.topo.data_info.size,
        part_id=model.topo.data_info.rank,
        phase=phase
    )
    if model.topo.pp_info.size != 1:
        raise ValueError("Cannot support pipeline in evaluation now!")
    if model.topo.world.size > dev_count:
        raise ValueError("Cannot support evaluation on multiple nodes now!")

    evaluate_dataset(
        task,
        model,
        eval_generator,
        args,
        dev_count,
        gpu_id,
        training_step=0,
        tag="test"
    )
    return
Example #3
def infer(args):
    """Inference main function."""
    if args.is_distributed:
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)

    # setup dataset
    infer_generator = task.get_data_loader(model,
                                           input_file=args.infer_file,
                                           num_part=model.topo.data_info.size,
                                           part_id=model.topo.data_info.rank,
                                           phase=phase,
                                           is_infer=True)
    if model.topo.pp_info.size != 1:
        raise ValueError("Cannot support pipeline in inference now!")
    if model.topo.sharding_info.size != 1:
        raise ValueError("Cannot support sharding in inference now!")
    if model.topo.world.size > dev_count:
        raise ValueError("Cannot support evaluation on multiple nodes now!")

    # run inference
    timer = Timer()
    timer.start()
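    # collect predictions keyed by data_id so they can be merged and re-ordered later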
    infer_out = {}
    step = 0  # keep step defined even when the generator yields no data
    for step, data in enumerate(infer_generator(), 1):
        predictions = task.infer_step(model, data)
        for pred in predictions:
            infer_out[pred["data_id"]] = pred
        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            print(f"\tstep: {step}, time: {time_cost:.3f}, "
                  f"queue size: {infer_generator.queue.size()}, "
                  f"speed: {step / time_cost:.3f} steps/s")

    time_cost = timer.pass_time
    print(f"[infer] steps: {step} time cost: {time_cost}, "
          f"speed: {step / time_cost} steps/s")

    if args.is_distributed:
        # merge inference outputs in distributed mode.
        part_file = os.path.join(args.save_path,
                                 f"inference_output.part_{gpu_id}")
        with open(part_file, "w") as fp:
            json.dump(infer_out, fp, ensure_ascii=False, indent=2)
        part_finish_file = os.path.join(
            args.save_path, f"inference_output.part_{gpu_id}.finish")
        with open(part_finish_file, "w"):
            pass

    # Only run on master GPU in each node
    if gpu_id != 0:
        return

    if args.is_distributed:
        part_files = f"inference_output.part_*.finish"
        while True:
            ret = subprocess.getoutput(
                f"find {args.save_path} -maxdepth 1 -name {part_files}")
            num_completed = len(ret.split("\n"))
            if num_completed != dev_count:
                time.sleep(1)
                continue
            infer_out = {}
            for dev_id in range(dev_count):
                part_file = os.path.join(args.save_path,
                                         f"inference_output.part_{dev_id}")
                with open(part_file, "r") as fp:
                    part_infer_out = json.load(fp)
                    for data_id in part_infer_out:
                        infer_out[data_id] = part_infer_out[data_id]
            break
        subprocess.getoutput(
            "rm " + os.path.join(args.save_path, "inference_output.part*"))

    # save inference outputs
    inference_output = os.path.join(args.save_path, "inference_output.txt")
    with open(inference_output, "w") as f:
        for data_id in sorted(infer_out.keys(), key=lambda x: int(x)):
            f.write("\t".join(
                map(str, [
                    infer_out[data_id][name]
                    for name in args.output_name.split(",")
                ])) + "\n")
    print(f"save inference result into: {inference_output}")

    return
Example #4
def train(args):
    """The main function of training."""
    if args.is_distributed:
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
    else:
        dev_count = 1
        gpu_id = 0
    place = fluid.CUDAPlace(gpu_id)

    # setup task and model
    task = tasks.create_task(args)
    model = models.create_model(args, place)

    # only the data-parallel rank 0 worker saves checkpoints
    global need_save
    need_save = model.topo.dp_info.rank == 0

    # setup datasets
    train_generator = task.get_data_loader(
        model,
        input_file=args.train_file,
        num_epochs=args.num_epochs,
        num_part=model.topo.data_info.size,
        part_id=model.topo.data_info.rank,
        phase="train"
    )
    if model.topo.pp_info.size == 1:
        # partition validation data across data-parallel replicas;
        # GPUs in the same model-parallel group read the same partition
        assert model.topo.mp_info.size <= dev_count and dev_count % model.topo.mp_info.size == 0
        valid_num_part = dev_count // model.topo.mp_info.size
        valid_part_id = gpu_id // model.topo.mp_info.size
    else:
        raise ValueError("Cannot support pipeline in training now!")
    print("# part in validation:", valid_num_part)
    print("part id in validation:", valid_part_id)
    valid_tags = []
    valid_generators = []
    for valid_file in args.valid_file.split(","):
        if ":" in valid_file:
            valid_tag, valid_file = valid_file.split(":")
        else:
            valid_tag = "valid"
        valid_tags.append(valid_tag)
        valid_generators.append(task.get_data_loader(
            model,
            input_file=valid_file,
            num_part=valid_num_part,
            part_id=valid_part_id,
            phase="distributed_valid" if args.is_distributed else "valid"
        ))

    # maintain best metric (init)
    best_metric = -1e10
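    # a leading "-" in args.eval_metric marks a lower-is-better metric; scale flips the comparison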
    if args.eval_metric.startswith("-"):
        scale = -1.0
        eval_metric = args.eval_metric[1:]
    else:
        scale = 1.0
        eval_metric = args.eval_metric

    # start training
    timer = Timer()
    timer.start()
    print("Training is start.")
    for step, data in enumerate(train_generator(), args.start_step + 1):
        outputs = task.train_step(model, data)
        timer.pause()

        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            current_epoch, current_file_index, total_file = task.reader.get_train_progress()
            current_lr = outputs.pop('scheduled_lr')
            print(f"[train][{current_epoch}] progress: {current_file_index}/{total_file} "
                  f"step: {step}, time: {time_cost:.3f}, "
                  f"queue size: {train_generator.queue.size()}, "
                  f"speed: {args.log_steps / time_cost:.3f} steps/s")
            print(f"\tcurrent lr: {current_lr:.7f}")
            metrics = task.get_metrics(outputs)
            print("\t" + ", ".join(f"{k}: {v:.4f}" for k, v in metrics.items()))
            timer.reset()

        if step % args.validation_steps == 0:
            for valid_tag, valid_generator in zip(valid_tags, valid_generators):
                eval_metrics = evaluate(task, model, valid_generator, args, dev_count, gpu_id, step, tag=valid_tag)
                if valid_tag == "valid":
                    valid_metrics = eval_metrics

            # save the latest model when periodic saving is disabled
            if args.save_steps <= 0:
                save_model(model, args.save_path, "lastest", dev_count, gpu_id, args)
            # maintain best metric (update)
            if valid_metrics[eval_metric] * scale > best_metric:
                best_metric = valid_metrics[eval_metric] * scale
                print(f"Get better valid metric: {eval_metric} = {valid_metrics[eval_metric]}")
                # save best model (with best evaluation metric)
                save_model(model, args.save_path, "best", dev_count, gpu_id, args)

        if args.save_steps > 0 and step % args.save_steps == 0:
            save_model(model, args.save_path, f"step_{step}", dev_count, gpu_id, args)

        timer.start()
    print("Training is completed.")

    return
Example #5
def infer(args):
    """Inference main function."""
    if args.is_distributed:
        fleet.init(is_collective=True)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    infer_generator = task.get_data_loader(model,
                                           input_file=args.infer_file,
                                           num_part=trainers_num,
                                           part_id=trainer_id,
                                           phase=phase,
                                           is_infer=True)

    # run inference
    timer = Timer()
    timer.start()
    infer_out = {}
    step = 0
    for step, data in enumerate(infer_generator(), 1):
        predictions = task.infer_step(model, data)
        for pred in predictions:
            infer_out[pred["data_id"]] = pred
        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            print(f"\tstep: {step}, time: {time_cost:.3f}, "
                  f"queue size: {infer_generator.queue.size()}, "
                  f"speed: {step / time_cost:.3f} steps/s")

    time_cost = timer.pass_time
    print(f"[infer] steps: {step} time cost: {time_cost}, "
          f"speed: {step / time_cost} steps/s")

    if args.is_distributed:
        # merge inference outputs in distributed mode.
        part_file = os.path.join(args.save_path,
                                 f"inference_output.part_{gpu_id}")
        with open(part_file, "w") as fp:
            json.dump(infer_out, fp, ensure_ascii=False, indent=2)
        part_finish_file = os.path.join(
            args.save_path, f"inference_output.part_{gpu_id}.finish")
        with open(part_finish_file, "w"):
            pass

    # Only run on master GPU in each node
    if gpu_id != 0:
        return

    if args.is_distributed:
        part_files = f"inference_output.part_*.finish"
        while True:
            ret = subprocess.getoutput(
                f"find {args.save_path} -maxdepth 1 -name {part_files}")
            num_completed = len(ret.split("\n"))
            if num_completed != dev_count:
                time.sleep(1)
                continue
            infer_out = {}
            for dev_id in range(dev_count):
                part_file = os.path.join(args.save_path,
                                         f"inference_output.part_{dev_id}")
                with open(part_file, "r") as fp:
                    part_infer_out = json.load(fp)
                    for data_id in part_infer_out:
                        infer_out[data_id] = part_infer_out[data_id]
            break
        subprocess.getoutput(
            "rm " + os.path.join(args.save_path, "inference_output.part*"))

    # save inference outputs
    inference_output = os.path.join(args.save_path, args.save_name)
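    # stack the predicted embeddings in data_id order into a single numpy array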
    save_array = []
    for i in range(len(infer_out)):
        save_array.append(infer_out[str(i)]["emb"])
    np_array = np.array(save_array)
    np.save(inference_output, np_array)

    return
Example #6
def infer_dst(args):
    """Inference main function."""
    if args.is_distributed:
        fleet.init(is_collective=True)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    # task.debug()

    schema = get_schema(args.dataset)
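    # serialized empty dialogue state, used as prev_ds before the first turn of each dialogue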
    empty_ds_seq = "<ds/> " + " ".join(flatten_ds({}, schema)) + " </ds>"

    # record original order and init status
    output_order = []
    # {"dial_id": {"prev_ds": "", "turns": [{"utts": utts, "turn_idx": turn_idx}], "cur_idx": 0}}
    dial_status = defaultdict(dict)
    with open(args.infer_file, "r") as fin:
        next(fin)  # skip the first (header) line
        for line in fin:
            dial_id, turn_idx, utts = line.strip().split("\t")
            output_order.append(f"{dial_id}-{turn_idx}")
            if dial_id not in dial_status:
                dial_status[dial_id]["prev_ds"] = empty_ds_seq
                dial_status[dial_id]["turns"] = []
                dial_status[dial_id]["cur_idx"] = 0
            dial_status[dial_id]["turns"].append({
                "utts": utts,
                "turn_idx": turn_idx
            })
    dial_ids = list(dial_status.keys())

    # batch inference
    outputs = {}
    timer = Timer()
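    # process dialogues in batches of args.dial_batch_size; each loop iteration
    # advances every active dialogue by one turn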
    while len(dial_ids) > 0:
        timer.start()
        cur_dial_ids = dial_ids[:args.dial_batch_size]
        logger.info(f"Sampled dialogue ids: {cur_dial_ids}")

        # 1st: basic generation
        basic_inputs = {}
        for cur_dial_id in cur_dial_ids:
            cur_idx = dial_status[cur_dial_id]["cur_idx"]
            cur_dial_turn = dial_status[cur_dial_id]["turns"][cur_idx]
            cur_utts = cur_dial_turn["utts"]
            prev_ds = dial_status[cur_dial_id]["prev_ds"]
            src = f"<gen/> {cur_utts} [SEP] {prev_ds} </gen>\x010"
            basic_inputs[f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"] = src
        basic_outputs = generate(basic_inputs, model, task)

        # 2nd: amending generation
        amending_inputs = {}
        for cur_dial_id in cur_dial_ids:
            cur_idx = dial_status[cur_dial_id]["cur_idx"]
            cur_dial_turn = dial_status[cur_dial_id]["turns"][cur_idx]
            cur_utts = cur_dial_turn["utts"]
            basic_ds = basic_outputs[
                f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"]
            src = f"<amend/> {cur_utts} [SEP] {basic_ds} </amend>\x010"
            amending_inputs[f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"] = src
        amending_outputs = generate(amending_inputs, model, task)

        outputs.update(amending_outputs)
        time_cost_infer = timer.pass_time
        logger.info(f"Time cost: {time_cost_infer}")

        # debug info
        for dial_turn_tag in basic_inputs:
            logger.debug(f"[basic input]: {basic_inputs[dial_turn_tag]}")
            logger.debug(f"[basic output]: {basic_outputs[dial_turn_tag]}")
            logger.debug(f"[amending input]: {amending_inputs[dial_turn_tag]}")
            logger.debug(
                f"[amending output]: {amending_outputs[dial_turn_tag]}")

        # update dial_status
        for dial_turn_tag in amending_outputs:
            dial_id, _ = dial_turn_tag.split("-")
            dial_status[dial_id]["cur_idx"] += 1
            if dial_status[dial_id]["cur_idx"] >= len(
                    dial_status[dial_id]["turns"]):
                dial_ids.remove(dial_id)
            else:
                dial_status[dial_id]["prev_ds"] = outputs[dial_turn_tag]
        timer.reset()

    # reorder and output
    if gpu_id == 0:
        pred_seqs = []
        pred_labels = []
        for dial_turn_tag in output_order:
            pred_seqs.append(outputs[dial_turn_tag])
            pred_label = parse_ds(outputs[dial_turn_tag], schema)
            pred_labels.append(pred_label)

        out_seq_file = os.path.join(args.save_path, "inference_output.txt")
        out_label_file = os.path.join(args.save_path, "inference_labels.json")
        with open(out_seq_file, "w") as fout_seq, open(out_label_file,
                                                       "w") as fout_label:
            fout_seq.write("\n".join(pred_seqs))
            json.dump(pred_labels, fout_label, indent=2)
        logger.info(f"Save inference sequences to `{out_seq_file}`")
        logger.info(f"Save inference labels to `{out_label_file}`")