Example #1
def make_etr_dataset_v1(args):
    """ETRI 데이터 셋 가져오는 함수
    1. 문서 길이 512이하 Filtering
    2. 중복 Context 제거, Question 최대 4개
    3. ans_start 위치로 3000개 샘플링
    """
    etr_dataset_path = p.join(args.path.train_data_dir, "etr_dataset_v1")

    if p.exists(etr_dataset_path):
        raise FileExistsError(f"{etr_dataset_path} already exists!")

    etr_dataset = get_etr_dataset(args)

    # (1) Document length: 512 is the minimum length in KLUE MRC
    etr_dataset = filtering_by_doc_len(etr_dataset, doc_len=512)

    # (2) Remove duplicate contexts: at most 4 questions per context
    etr_dataset = filtering_by_dup_question(etr_dataset, dup_limit=4)

    # (3) Sample 3,000 examples with ETR answer_start weights
    etr_dataset = sampling_by_ans_start_weights(etr_dataset, sample=3000)

    # (4) Save only the ETR_DATASET
    etr_datasets = DatasetDict({"train": etr_dataset})
    etr_datasets.save_to_disk(etr_dataset_path)

    print(f"{etr_dataset_path}에 저장되었습니다!")
Example #2
def make_kor_dataset_v1(args):
    """KorQuad Dataset V1
    1. 문서 길이 512이하 Filtering
    2. Context당 Question 최대 4개
    3. ans_start 위치로 8000개 샘플링
    """

    kor_dataset_path = p.join(args.path.train_data_dir, "kor_dataset")

    if p.exists(kor_dataset_path):
        raise FileExistsError(f"{kor_dataset_path} already exists!")

    kor_dataset = load_dataset("squad_kor_v1")

    kor_dataset = concatenate_datasets([
        kor_dataset["train"].flatten_indices(),
        kor_dataset["validation"].flatten_indices()
    ])

    # (1) Document length: 512 is the minimum length in KLUE MRC
    kor_dataset = filtering_by_doc_len(kor_dataset, doc_len=512)

    # (2) Remove duplicate contexts: at most 4 questions per context
    kor_dataset = filtering_by_dup_question(kor_dataset, dup_limit=4)

    # (3) Sample 8,000 examples (2x) with KOR answer_start weights
    kor_dataset = sampling_by_ans_start_weights(kor_dataset, sample=8000)

    # (4) Save only the KOR_DATASET
    kor_datasets = DatasetDict({"train": kor_dataset})
    kor_datasets.save_to_disk(kor_dataset_path)

    print(f"{kor_dataset_path}에 저장되었습니다!")
Example #3
    def test_push_streaming_dataset_dict_to_hub(self):
        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        local_ds = DatasetDict({"train": ds})
        with tempfile.TemporaryDirectory() as tmp:
            local_ds.save_to_disk(tmp)
            local_ds = load_dataset(tmp, streaming=True)

            ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
            try:
                local_ds.push_to_hub(ds_name, token=self._token)
                hub_ds = load_dataset(ds_name, download_mode="force_redownload")

                self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
                self.assertListEqual(list(local_ds["train"].features.keys()), list(hub_ds["train"].features.keys()))
                self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)
            finally:
                self.cleanup_repo(ds_name)
Example #4
def createDataset(config):
    """
    build dataset from the h5 file
    also filter out rare *individual ATU*
    """
    df = pd.read_hdf(config.data.h5_file, key=config.data.h5_key)
    atu = df.loc[df.groupby("atu")["atu"].filter(
        lambda g: len(g) >= config["datamodules"]["atu_filter_no"]).index]
    atu = atu[["text", "atu", "desc", "label"]]

    dataset = Dataset.from_pandas(atu)
    tokenizer = AutoTokenizer.from_pretrained(config["module"]["arch"])

    def tokenize(instance):
        return tokenizer(instance["text"],
                         max_length=config["module"]["seq_len"],
                         truncation="longest_first",
                         padding="max_length")

    dataset = (dataset
               .shuffle(seed=config.seed)
               .map(tokenize, batched=True))

    # split by cls (stratified)
    sub_ds = {"train": [], "test": []}
    for cls in np.unique(dataset["label"]):
        cls_ds = dataset.filter(lambda d: d['label'] == int(cls))
        cls_ds = cls_ds.train_test_split(test_size=config.data.test_ratio,
                                         seed=config.seed)
        sub_ds["train"].append(cls_ds["train"])
        sub_ds["test"].append(cls_ds["test"])

    dataset = DatasetDict(
        {split: concatenate_datasets(ds)
         for split, ds in sub_ds.items()})
    dataset.save_to_disk(config.data.cached_dir)
    return dataset
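
# createDataset mixes attribute access (config.data.h5_file) and key access
# (config["module"]["arch"]), which an OmegaConf/Hydra-style config supports.
# The config below is a minimal sketch with made-up paths and values, shown
# only to illustrate the expected structure.
from omegaconf import OmegaConf

config = OmegaConf.create({
    "seed": 42,
    "data": {
        "h5_file": "data/atu.h5",          # hypothetical file and key
        "h5_key": "stories",
        "test_ratio": 0.2,
        "cached_dir": "data/atu_cached",
    },
    "datamodules": {"atu_filter_no": 5},   # drop ATU types with fewer than 5 rows
    "module": {"arch": "bert-base-multilingual-cased", "seq_len": 256},
})

dataset = createDataset(config)
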
def build_datasets(
        data_args: DataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        cache_dir=None,
        skip_train=False,
        skip_eval=False) -> Tuple[Dataset, Dataset]:
    if skip_eval and skip_train:
        logger.warning("Both `skip_train` and `skip_eval` are set to True")

    json_path = data_args.data_json
    data_dir = data_args.load_data_from
    add_line_breaks = data_args.add_line_breaks
    break_token = data_args.line_break_token
    train_data, eval_data = None, None
    dataset = DatasetDict()

    if add_line_breaks:
        tokenizer.add_special_tokens(dict(additional_special_tokens=[break_token]))

    if json_path is not None:
        logger.info("Preprocessing new dataset from {}".format(json_path))
        eval_split = data_args.eval_split
        save_dir = data_args.save_data_to

        dataset = load_dataset('json', data_files=[json_path], cache_dir=cache_dir)
        if eval_split < 1:
            dataset = dataset["train"].train_test_split(test_size=eval_split, shuffle=False)

        if save_dir is None:
            # Spend less time on preprocessing
            if skip_train:
                del dataset["train"]
            if skip_eval and "test" in dataset:
                del dataset["test"]

        if not data_args.skip_text_clean:
            normalize = partial(normalize_text, add_line_breaks=add_line_breaks, brk=break_token)
            dataset = dataset.map(normalize, input_columns='text')

        proc_kwargs = dict(
            batched=True,
            batch_size=data_args.tokenizer_batch_size,
            remove_columns=["text", "title"])

        if "train" in dataset:
            proc_train = create_preprocess_fn(
                tokenizer, data_args.max_source_length, data_args.max_target_length)
            dataset["train"] = dataset["train"].map(proc_train, **proc_kwargs)

        if "test" in dataset:
            proc_eval = create_preprocess_fn(
                tokenizer, data_args.max_source_length, data_args.val_max_target_length)
            dataset["test"] = dataset["test"].map(proc_eval, **proc_kwargs)

        dataset.set_format(type="torch",
                           columns=["input_ids", "attention_mask", "decoder_input_ids",
                                    "decoder_attention_mask", "labels"])

        save_dir = data_args.save_data_to
        if save_dir is not None:
            if not os.path.exists(save_dir):
                os.mkdir(save_dir)
            logger.info("Saving preprocessed dataset to {}".format(save_dir))
            dataset.save_to_disk(save_dir)

    elif data_dir is not None:
        logger.info("Loading preprocessed dataset from {}".format(data_dir))
        if skip_train:
            eval_data = load_from_disk(os.path.join(data_dir, "test"))
        elif skip_eval:
            train_data = load_from_disk(os.path.join(data_dir, "train"))
        else:
            dataset = load_from_disk(data_dir)
    else:
        raise AttributeError("You must provide either `--data_json` or `--load_data_from` argument.")

    if "train" in dataset:
        train_data = dataset["train"]
    if "test" in dataset:
        eval_data = dataset["test"]
    return train_data, eval_data
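
# Usage sketch (assumption): DataTrainingArguments is a dataclass defined
# elsewhere in the project and is normally filled in by HfArgumentParser;
# the checkpoint name below is hypothetical.
from transformers import AutoTokenizer, HfArgumentParser

parser = HfArgumentParser(DataTrainingArguments)
(data_args,) = parser.parse_args_into_dataclasses()

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
train_data, eval_data = build_datasets(data_args, tokenizer, skip_eval=False)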