Example #1
def test_save_features():
    from lib.handler import get_dataset
    from my_py_toolkit.file.file_toolkit import writejson
    import lib.config as cf
    # Build the train split and dump its features in human-readable form
    # to the path configured in lib.config.
    dataset = get_dataset("train", cf.mode)
    writejson(dataset.convert_all_features4human_visual(),
              cf.path_save_feature)
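
Every example on this page persists its output through writejson from my_py_toolkit.file.file_toolkit. A minimal sketch of what such a helper could look like, assuming it simply dumps UTF-8 JSON to the given path (hypothetical body; the real toolkit function may differ):

import json
import os

def writejson(obj, path):
    # Hypothetical sketch, not the actual my_py_toolkit implementation:
    # ensure the parent directory exists, then dump UTF-8 JSON.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)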
Example #2
import os

from tqdm import tqdm

# Helper imports; readjson is assumed to live beside writejson in
# my_py_toolkit (the exact module path may differ in the original repo).
from my_py_toolkit.file.file_toolkit import readjson, writejson


def generate_unlabel_data(origin_data_path,
                          target_path="./generate_data/unlabel_data",
                          file_prefix="unlabel_data",
                          data_num_each_file=100):
    """Shards unlabeled contexts into JSON files of at most
    data_num_each_file records each."""
    if not os.path.exists(target_path):
        os.makedirs(target_path)
    origin_data = readjson(origin_data_path)
    file_index = 1
    unlabel_data = []
    # Each value in origin_data is a list of context strings; wrap every
    # string as an unlabeled record with an empty question.
    for key in tqdm(list(origin_data)):
        for val in origin_data[key]:
            unlabel_data.append({"context": val, "question": "", "answer": [val]})
            if len(unlabel_data) >= data_num_each_file:
                save_path = os.path.join(
                    target_path, f"{file_prefix}_{file_index}.json")
                writejson(unlabel_data, save_path)
                unlabel_data = []
                file_index += 1
    # Flush the remainder that did not fill a complete shard.
    if unlabel_data:
        save_path = os.path.join(target_path,
                                 f"{file_prefix}_{file_index}.json")
        writejson(unlabel_data, save_path)
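
A usage sketch, assuming the origin file maps keys to lists of context strings; the input path below is hypothetical:

# Shards ./data/contexts.json into ./generate_data/unlabel_data/
# unlabel_data_1.json, unlabel_data_2.json, ... with 100 records each.
generate_unlabel_data("./data/contexts.json")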
Example #3
def generate_date(dir_name, target_path, data_max_len):
    # get_file_paths, process_one_doc, and writejson come from the
    # surrounding module (my_py_toolkit helpers plus repo-local code).
    # target_path names the output JSON file, so only its parent
    # directory needs to exist.
    parent_dir = os.path.split(target_path)[0]
    if parent_dir and not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    files = get_file_paths(dir_name, ["docx"])
    gene_data = {}
    for file in tqdm(files):
        # Skip paths containing '~' or '$', e.g. Word lock/temp files
        # such as "~$report.docx".
        if re.search("[~$]", file):
            continue
        gene_data[file] = process_one_doc(file, data_max_len)

    writejson(gene_data, target_path)
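
A usage sketch; the directory, output file, and length cap below are hypothetical:

# Collect text from every .docx under ./docs (capping each document's
# data at 512 units) into one JSON mapping file path -> extracted data.
generate_date("./docs", "./generate_data/docx_data.json", 512)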
Example #4
def generate_train_data_from_labeled_data(
        data_dir="./generate_data/labeled_data",
        target_path="./generate_data/labeled_data.json"):
    """Merges labeled shards and rewrites each answer as character offsets."""
    parent_dir = os.path.dirname(target_path)
    if parent_dir and not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    files = get_file_paths(data_dir)
    labeled_data = []
    for file in files:
        for data in readjson(file):
            # "answer" appears to hold [text_before_answer, answer_span];
            # records missing either part are skipped.
            answer_text = data.get("answer")
            if len(answer_text) < 2:
                continue
            # Start offset = length of the preceding text; end offset =
            # start + length of the answer span itself.
            answer_idx = [
                str(len(answer_text[0])),
                str(len(answer_text[0] + answer_text[1]))
            ]
            data["answer"] = ",".join(answer_idx)
            labeled_data.append(data)

    writejson(labeled_data, target_path)
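
The offset conversion is easiest to see on a made-up record where the answer "Paris" follows the 15-character prefix "The capital is ":

record = {"answer": ["The capital is ", "Paris"]}
start = len(record["answer"][0])                       # 15
end = len(record["answer"][0] + record["answer"][1])   # 20
assert ",".join([str(start), str(end)]) == "15,20"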
Example #5
def classify(model, dataset):
    """Splits the dataset into low- and high-loss records under the model
    and writes each group to its configured JSON path."""
    less_loss_data = []
    high_loss_data = []
    print("start classify:")
    model.eval()
    with torch.no_grad():
        # NOTE: data_szie is the dataset's own attribute name (sic).
        for i in tqdm(range(0, dataset.data_szie, dataset.batch_size)):
            logger.info(f"Classify index : {i}")
            Cwid, Qwid, answer, ids = dataset[i]
            Cwid, Qwid = Cwid.to(device), Qwid.to(device)
            # Gold start/end positions of the answer span.
            y1 = answer[:, 0].view(-1).to(device)
            y2 = answer[:, 1].view(-1).to(device)
            p1, p2 = model(Cwid, Qwid)
            # Per-sample (unreduced) loss over the start and end pointers.
            loss1 = F.nll_loss(torch.log(p1), y1, reduction='none')
            loss2 = F.nll_loss(torch.log(p2), y2, reduction='none')
            loss = (loss1 + loss2) / 2
            origin_data = dataset.get_origin_data(ids, "whole")
            for index, l in enumerate(loss):
                # 10e10 keeps every finite loss on the "low" side; only
                # inf/nan-scale losses are routed to high_loss_data.
                if l < 10e10:
                    less_loss_data.append(origin_data[index])
                else:
                    high_loss_data.append(origin_data[index])

    writejson(less_loss_data, config.less_loss_path)
    writejson(high_loss_data, config.high_loss_path)
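
A hedged driver sketch; build_model, the checkpoint path, and the dataset variable are all hypothetical stand-ins for the repo's actual setup:

model = build_model()                    # hypothetical model factory
model.load_state_dict(torch.load("model.pt", map_location=device))
model.to(device)
classify(model, dataset)
# Low-loss records land in config.less_loss_path, the rest in
# config.high_loss_path, both as JSON.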
Example #6
def record_features(dataset):
    # Dump a human-readable view of all features when the config flag is set.
    if config.is_save_features:
        writejson(dataset.convert_all_features4human_visual(),
                  config.path_save_feature)
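
record_features reads two attributes from the shared config module; a minimal sketch of how lib.config might define them (both values are hypothetical):

# In lib/config.py:
is_save_features = True
path_save_feature = "./generate_data/features_human_visual.json"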
Example #7
def save_example(examples, path):
    # Serialize each example via its to_dict() and write the list as JSON.
    data = [exam.to_dict() for exam in examples]
    writejson(data, path)
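
A usage sketch with a hypothetical example class; the real type only needs a to_dict() method:

from dataclasses import dataclass, asdict

@dataclass
class QAExample:
    context: str
    question: str
    answer: str

    def to_dict(self):
        return asdict(self)

save_example([QAExample("some context", "a question?", "an answer")],
             "./generate_data/examples.json")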