Example #1
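Evaluation entry point for a trained bi-encoder patent model: command-line overrides are merged into the saved training args, the weights are restored, and the run is dispatched to either test evaluation or corpus generation.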
import argparse
import json
import os

import torch


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--no_cuda", action='store_true')
    parser.add_argument("--task",
                        type=str,
                        default="test",
                        choices=["test", "corpus"])
    parser.add_argument("--name", type=str, default="bi_patent")
    parser.add_argument("--threshold", type=float, default=0.5)
    parser.add_argument("--batch_size", type=int, default=32)
    main_args = vars(parser.parse_known_args()[0])
    # honor --no_cuda (the flag was parsed but never used in the original)
    use_cuda = torch.cuda.is_available() and not main_args["no_cuda"]
    device = torch.device("cuda" if use_cuda else "cpu")

    output_dir = os.path.join("./output", main_args["name"])
    with open(os.path.join(output_dir, "args.json")) as f:
        args = json.load(f)
        args.update(main_args)
    print(args)
    model, tokenizer = create_model(args, device)
    model.load_state_dict(
        torch.load(os.path.join(output_dir, "model.bin"),
                   map_location=torch.device('cpu')))
    model.to(device)
    model.eval()
    with torch.no_grad():
        if args["task"] == "test":
            one_stage_test(model, device, args["max_length"], tokenizer,
                           args["dataset"], output_dir)
        else:
            generate_corpus(model, device, args["max_length"], tokenizer,
                            args["dataset"], output_dir)
Example #2
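Single-sentence inference with a TensorFlow 1.x NER model: the saved config and tag maps are loaded, the checkpointed model is restored in a session, and one Chinese sentence is tagged.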
import os
import pickle

import tensorflow as tf


def main(_):
    config_file = os.path.join(FLAGS.output, 'config.json')
    log_file = os.path.join(FLAGS.output, 'model.log')

    config = load_config(config_file)
    config['init_checkpoint'] = FLAGS.init_checkpoint
    logger = get_logger(log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    map_file = os.path.join(FLAGS.output, 'maps.pkl')
    with open(map_file, "rb") as f:
        tag_to_id, id_to_tag = pickle.load(f)

    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model,
                             os.path.join(FLAGS.output, 'checkpoint'), config,
                             logger)
        text = "中国你好成都"  # sample input containing the locations 中国 and 成都
        result = model.evaluate_line(sess,
                                     input_from_line(text, FLAGS.max_seq_len,
                                                     tag_to_id),
                                     id_to_tag,
                                     export=True)
        print(result)
Example #3
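Cross-validation loop for a Keras model: each fold trains a fresh model with early stopping, then records the evaluation metrics, per-class F1 scores, and training curves.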
import os

from keras.callbacks import EarlyStopping  # or tensorflow.keras.callbacks
from sklearn.metrics import f1_score

import tools  # project-local helpers: create_model, show_plot, write_file


def run(bs, path, lr, ks, num_layer):
    # the fold arrays (training_data, training_label, validation_data,
    # validation_label, validation_cate_label) are module-level globals
    fold = 1
    for X_train, Y_train, X_val, Y_val, val_cat in zip(training_data,
                                                       training_label,
                                                       validation_data,
                                                       validation_label,
                                                       validation_cate_label):
        print("Fold " + str(fold))
        model = tools.create_model(lr, bs, ks, num_layer)
        inner_path = os.path.join(path, "fold_" + str(fold))
        os.makedirs(inner_path, exist_ok=True)

        early_stop = EarlyStopping(patience=20)
        history = model.fit(x=X_train,
                            y=Y_train,
                            epochs=80,
                            validation_data=(X_val, Y_val),
                            callbacks=[early_stop],
                            batch_size=bs,
                            verbose=0)
        evaluation = model.evaluate(x=X_val, y=Y_val)
        # predict_classes is the legacy Sequential API (removed in the Keras
        # bundled with TF >= 2.6); newer code uses np.argmax(model.predict(...))
        validation_prediction = model.predict_classes(X_val, batch_size=bs)
        score = f1_score(val_cat, validation_prediction, average=None)

        tools.show_plot(inner_path, history)
        tools.write_file(inner_path + "/readme.txt", evaluation, score, model)
        fold = fold + 1
        del model
Example #4
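Training entry point for a PyTorch model: it seeds the RNGs, prepares the output directory, persists the run arguments, and launches training on TSV data.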
import json
import os

import torch


def main():
    setup_seed(20)
    args = get_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args["output_dir"] = os.path.join("./output", args["domain"], args["name"])
    os.makedirs(args["output_dir"], exist_ok=True)
    logger = create_logger()
    with open(os.path.join(args["output_dir"], "args.json"), mode="w") as f:
        json.dump(args, f, ensure_ascii=False, indent=2)
    logger.info(args)
    model, tokenizer = create_model(args, device)
    model = model.to(device)
    train_list = get_tsv_data(
        os.path.join("./data", args["domain"], "train.tsv"))
    val_list = get_tsv_data(os.path.join("./data", args["domain"], "test.tsv"))
    # start training
    train(model, device, train_list, val_list, args, tokenizer, logger)
Example #5
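Restores a trained retrieval bundle from disk: the saved arguments, precomputed corpus vectors and labels, the label-to-answer map, and the model weights.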
import json
import os
import pickle

import torch


def load_model(model_dir):
    # NOTE: relies on a module-level `device` and the project-local
    # create_model helper
    with open(os.path.join(model_dir, "args.json"), mode="r") as f:
        args = json.load(f)
    with open(os.path.join(model_dir, "total.pkl"), mode="rb") as f:
        data = pickle.load(f)
    with open(os.path.join(model_dir, "label2answer.json"), mode="r") as f:
        label2answer = json.load(f)
    vecs, labels = data["vecs"], data["labels"]
    # model
    model, tokenizer = create_model(args, device)
    model.load_state_dict(
        torch.load(os.path.join(model_dir, "model.bin"), map_location=device)
    )
    model = model.eval()
    return {
        "record_id": args["record_id"],
        "name": args["name"],
        "model": model,
        "tokenizer": tokenizer,
        "vecs": vecs,
        "labels": labels,
        "label2answer": label2answer
    }
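A minimal usage sketch (the model directory below is hypothetical, and a module-level device must already be defined, as noted in the function):

device = torch.device("cpu")
bundle = load_model("./output/faq/bi_encoder_1")  # hypothetical directory
model, tokenizer = bundle["model"], bundle["tokenizer"]
print(bundle["name"], len(bundle["labels"]), len(bundle["label2answer"]))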
Example #6
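Grid-search driver: it writes the Cartesian product of the hyperparameter grids to a manifest file, then builds and trains one model per configuration, counting configurations whose model cannot be built.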
# base_path and the hyperparameter grids (kernel_size, num_layers,
# batch_size, learning_rate) are assumed to be defined earlier in the script
os.makedirs(base_path, exist_ok=True)
index = 1
with open(os.path.join(base_path, "Cartesian_product.txt"), "w+") as writer:
    for ks, num_layer, bs, lr in itertools.product(kernel_size, num_layers,
                                                   batch_size, learning_rate):
        writer.write(str(index) + ":" + "\n")
        writer.write(
            'ks={}, num_layer={}, bs={}, lr={}'.format(ks, num_layer, bs, lr) +
            "\n")
        index = index + 1
    del index

index = 1
overflow_model = 0  # counts configurations whose model could not be built
for ks, num_layer, bs, lr in itertools.product(kernel_size, num_layers,
                                               batch_size, learning_rate):
    print("Index " + str(index))
    path = os.path.join(base_path, str(index))
    os.makedirs(path, exist_ok=True)

    model = tools.create_model(lr, bs, ks, num_layer)
    if model:
        # only touch the model once we know it was actually built
        print(model.summary())
        run(bs, path, lr, ks, num_layer)
    else:
        overflow_model = overflow_model + 1
    del model
    index = index + 1

print("Training finished! Overflow model: ", overflow_model)
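For reference, the module-level grids this script iterates over might look like the following (names taken from the code above, values purely illustrative):

base_path = "./grid_search"
kernel_size = [3, 5, 7]
num_layers = [1, 2]
batch_size = [16, 32]
learning_rate = [1e-3, 1e-4]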
Example #7
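End-to-end training for a response-retrieval bi-encoder with in-batch candidate scoring (cross-entropy over cnt candidates per query), a one-shot CPU fallback on RuntimeError, and export of precomputed corpus vectors.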
import json
import os
import pickle
import traceback
from shutil import copyfile

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AdamW


def main(args):
    args["output_dir"] = "./output/{}/{}_{}".format(args["domain"],
                                                    args["name"],
                                                    args["record_id"])
    data_path = os.path.join(args["output_dir"], "data.tsv")
    # create the output directory before attaching a file logger to it
    first_run = not os.path.exists(args["output_dir"])
    os.makedirs(args["output_dir"], exist_ok=True)
    logger = create_logger(
        log_path=os.path.join(args["output_dir"], "train.log"))
    if first_run:
        try:
            copyfile(args["data_path"], data_path)
        except IOError:
            traceback.print_exc()
            logger.info("No source file")
            return -1
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with open(os.path.join(args["output_dir"], "args.json"), mode="w") as f:
        json.dump(args, f, ensure_ascii=False, indent=2)
    logger.info(args)
    key = False  # flips to True once a training attempt succeeds
    try_count = 2  # one attempt on the current device, one CPU fallback
    model, tokenizer = create_model(args, device)
    data_list = get_tsv_data(data_path)
    # start training
    train_dataset = EncoderDataset(data_list, tokenizer, args["max_length"],
                                   args["cnt"])
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args["batch_size"],
                                  shuffle=True,
                                  num_workers=args["num_workers"],
                                  collate_fn=train_dataset.collate_fn)
    while not key and try_count > 0:
        try:
            model = model.to(device)
            optimizer = AdamW(model.parameters(),
                              lr=args["lr"],
                              correct_bias=True)
            logger.info('starting training, each epoch step {}'.format(
                len(train_dataloader)))
            for _ in range(1, args["epochs"] + 1):
                model.train()
                for batch in train_dataloader:
                    queries, q_masks, responses, r_masks = batch
                    queries, q_masks = queries.to(device), q_masks.to(device)
                    responses, r_masks = responses.to(device), r_masks.to(
                        device)
                    # outputs: (batch_size, cnt) scores; the positive
                    # response for each query sits at index 0
                    outputs = model(queries, q_masks, responses, r_masks)
                    bsz = outputs.size(0)
                    if args["sim"] == "cosine" and args["cosine_scale"] > 1:
                        outputs = outputs * args["cosine_scale"]
                    labels = torch.zeros(bsz, dtype=torch.long, device=device)
                    loss = F.cross_entropy(outputs, labels, reduction="mean")
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args["max_grad_norm"])
                    optimizer.step()
                    optimizer.zero_grad()
                    model.zero_grad()
            model.eval()
            model_path = os.path.join(args["output_dir"], "model.bin")
            torch.save(model.state_dict(), model_path)
            logger.info('training finished')
            # generate the precomputed corpus files
            doc_vecs, doc_labels = create_data_vec(data_list, model, tokenizer,
                                                   args["max_length"], device,
                                                   "response", True)
            saved_data = {"vecs": doc_vecs, "labels": doc_labels}
            with open(os.path.join(args["output_dir"], "total.pkl"),
                      mode="wb") as f:
                pickle.dump(saved_data, f)
            logger.info("finished generating corpus vecs at " +
                        os.path.join(args["output_dir"], "total.pkl"))
            key = True
        except RuntimeError:
            # typically CUDA out-of-memory; retry once on CPU
            traceback.print_exc()
            device = torch.device("cpu")
            try_count -= 1
    return 0 if key else -1
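For reference, a hypothetical args dict covering every key the function reads; all values are illustrative placeholders, and cnt is inferred to be the number of candidate responses scored per query:

args = {
    "domain": "faq",
    "name": "bi_encoder",
    "record_id": 1,
    "data_path": "./data/faq/data.tsv",
    "max_length": 64,
    "cnt": 5,
    "batch_size": 32,
    "num_workers": 2,
    "lr": 2e-5,
    "epochs": 3,
    "sim": "cosine",
    "cosine_scale": 20,
    "max_grad_norm": 1.0,
}
main(args)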
Example #8
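Full TensorFlow 1.x NER training loop: it loads and indexes the datasets, batches them, then trains for up to 100 epochs, saving a checkpoint whenever dev-set evaluation improves.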
import os
import pickle

import numpy as np
import tensorflow as tf


def train():
    tf.io.gfile.makedirs(FLAGS.output)  # mkdir raises if the directory exists
    log_path = os.path.join(FLAGS.output, 'model.log')
    logger = get_logger(log_path)
    # load data sets
    train_sentences = load_sentences(os.path.join(FLAGS.data, "train.txt"),
                                     FLAGS.zeros)
    dev_sentences = load_sentences(os.path.join(FLAGS.data, "dev.txt"),
                                   FLAGS.zeros)
    test_sentences = load_sentences(os.path.join(FLAGS.data, "test.txt"),
                                    FLAGS.zeros)
    # create maps if not exist
    map_file = os.path.join(FLAGS.output, 'maps.pkl')
    if not os.path.isfile(map_file):
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(map_file, "wb") as f:
            pickle.dump([tag_to_id, id_to_tag], f)
    else:
        with open(map_file, "rb") as f:
            tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, FLAGS.max_seq_len, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, FLAGS.max_seq_len, tag_to_id)
    test_data = prepare_dataset(test_sentences, FLAGS.max_seq_len, tag_to_id)
    logger.info("%i / %i / %i sentences in train / dev / test." %
                (len(train_data), len(dev_data), len(test_data)))
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, FLAGS.batch_size)
    test_manager = BatchManager(test_data, FLAGS.batch_size)
    # make path for store log and model if not exist
    config_file = os.path.join(FLAGS.output, 'config.json')
    if os.path.isfile(config_file):
        config = load_config(config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, config_file)
    print_config(config, logger)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model,
                             os.path.join(FLAGS.output, 'checkpoint'), config,
                             logger)

        logger.info("start training")
        loss = []
        for _ in range(100):  # up to 100 epochs
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)

                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess,
                           model,
                           os.path.join(FLAGS.output, 'checkpoint'),
                           logger,
                           global_steps=step)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)