def make_etr_dataset_v1(args):
    """Build the ETRI dataset (v1).

    1. Filter documents by length against the 512-character threshold.
    2. Deduplicate contexts, keeping at most 4 questions per context.
    3. Sample 3,000 examples weighted by answer_start position.
    """
    etr_dataset_path = p.join(args.path.train_data_dir, "etr_dataset_v1")

    if p.exists(etr_dataset_path):
        raise FileExistsError(f"{etr_dataset_path} already exists!")

    etr_dataset = get_etr_dataset(args)

    # (1) Document length: 512 is the minimum length in KLUE MRC
    etr_dataset = filtering_by_doc_len(etr_dataset, doc_len=512)

    # (2) Deduplicate contexts: at most 4 questions per context
    etr_dataset = filtering_by_dup_question(etr_dataset, dup_limit=4)

    # (3) Sample 3,000 examples weighted by answer_start
    etr_dataset = sampling_by_ans_start_weights(etr_dataset, sample=3000)

    # (4) Save only the ETR dataset
    etr_datasets = DatasetDict({"train": etr_dataset})
    etr_datasets.save_to_disk(etr_dataset_path)

    print(f"Saved to {etr_dataset_path}!")
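The helpers called above (filtering_by_doc_len, filtering_by_dup_question, sampling_by_ans_start_weights) are not shown in this snippet. A minimal sketch of how the first two could be written with datasets.Dataset.filter follows; it assumes "context" and "question" columns and a single-process filter, and is a hypothetical illustration, not the project's own implementation:

def filtering_by_doc_len(dataset, doc_len=512):
    # Keep only examples whose context is at least `doc_len` characters long
    # (assumed direction; the project code may filter the other way).
    return dataset.filter(lambda ex: len(ex["context"]) >= doc_len)


def filtering_by_dup_question(dataset, dup_limit=4):
    # Keep at most `dup_limit` questions per context. The counter is stateful,
    # so this sketch assumes the default single-process filter.
    counts = {}

    def keep(example):
        counts[example["context"]] = counts.get(example["context"], 0) + 1
        return counts[example["context"]] <= dup_limit

    return dataset.filter(keep)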
def test_push_dataset_dict_to_hub_custom_features(self):
    features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)

    local_ds = DatasetDict({"test": ds})

    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name, token=self._token)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(local_ds["test"].features.keys()), list(hub_ds["test"].features.keys()))
        self.assertDictEqual(local_ds["test"].features, hub_ds["test"].features)
    finally:
        self._api.delete_repo(
            ds_name.split("/")[1],
            organization=ds_name.split("/")[0],
            token=self._token,
            repo_type="dataset",
        )
def load_data(
    self,
    data: Any,
    columns: List[str] = ["input_ids", "attention_mask", "labels"],
) -> 'datasets.Dataset':
    file, input, target = data

    data_files = {}
    stage = self._running_stage.value
    data_files[stage] = str(file)

    # FLASH_TESTING is set in the CI to run faster.
    if flash._IS_TESTING:
        try:
            dataset_dict = DatasetDict({
                stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
            })
        except Exception:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        dataset_dict = load_dataset(self.filetype, data_files=data_files)

    dataset_dict = dataset_dict.map(partial(self._tokenize_fn, input=input, target=target), batched=True)
    dataset_dict.set_format(columns=columns)
    return dataset_dict[stage]
def test_push_dataset_dict_to_hub_multiple_files(self):
    ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})

    local_ds = DatasetDict({"train": ds})

    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name, token=self._token, shard_size=500 << 5)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(local_ds["train"].features.keys()), list(hub_ds["train"].features.keys()))
        self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

        # Ensure that there are two files on the repository that have the correct name
        files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
        self.assertListEqual(
            files,
            [".gitattributes", "data/train-00000-of-00002.parquet", "data/train-00001-of-00002.parquet"],
        )
    finally:
        self._api.delete_repo(
            ds_name.split("/")[1],
            organization=ds_name.split("/")[0],
            token=self._token,
            repo_type="dataset",
        )
def load_data(
    self,
    file: str,
    use_full: bool = True,
    columns: List[str] = ["input_ids", "attention_mask", "labels"],
) -> 'datasets.Dataset':
    data_files = {}
    stage = self._running_stage.value
    data_files[stage] = str(file)

    # FLASH_TESTING is set in the CI to run faster.
    if use_full and os.getenv("FLASH_TESTING", "0") == "0":
        dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        # used for debugging. Avoid processing the entire dataset  # noqa E265
        try:
            dataset_dict = DatasetDict({
                stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
            })
        except AssertionError:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)

    dataset_dict = dataset_dict.map(self._tokenize_fn_wrapped, batched=True)
    dataset_dict.set_format(columns=columns)
    return dataset_dict[stage]
def __init__(self, pretrained, prepared_dir, classifier_dir):
    """If ``pretrained`` is None, the classifier is disabled."""
    self.pretrained = pretrained
    self.classifier_dir = classifier_dir
    self.prepared_dir = prepared_dir
    self.datasets = DatasetDict({
        'train': read_dataset_from_csv(prepared_dir + '/train.csv'),
        'test': read_dataset_from_csv(prepared_dir + '/test.csv'),
        'validation': read_dataset_from_csv(prepared_dir + '/validation.csv'),
    })
    self.metric = load_metric("seqeval")
    self.label_list = self.datasets["train"].features["tag"].feature.names
    check_folder(self.classifier_dir)
    if pretrained:
        self.model = AutoModelForTokenClassification.from_pretrained(
            self.pretrained, num_labels=len(self.label_list))
        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained)
        self.data_collator = DataCollatorForTokenClassification(self.tokenizer)
def test_push_dataset_dict_to_hub_no_token(self):
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})

    local_ds = DatasetDict({"train": ds})

    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(local_ds["train"].features.keys()), list(hub_ds["train"].features.keys()))
        self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

        # Ensure that there is a single file on the repository that has the correct name
        files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
        self.assertTrue(
            all(
                fnmatch.fnmatch(file, expected_file)
                for file, expected_file in zip(
                    files, [".gitattributes", "data/train-00000-of-00001-*.parquet", "dataset_infos.json"]
                )
            )
        )
    finally:
        self.cleanup_repo(ds_name)
def test_push_dataset_dict_to_hub_multiple_files(self):
    ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})

    local_ds = DatasetDict({"train": ds})

    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name, token=self._token, max_shard_size="16KB")
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(local_ds["train"].features.keys()), list(hub_ds["train"].features.keys()))
        self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

        # Ensure that there are two files on the repository that have the correct name
        files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
        self.assertTrue(
            all(
                fnmatch.fnmatch(file, expected_file)
                for file, expected_file in zip(
                    files,
                    [
                        ".gitattributes",
                        "data/train-00000-of-00002-*.parquet",
                        "data/train-00001-of-00002-*.parquet",
                        "dataset_infos.json",
                    ],
                )
            )
        )
    finally:
        self.cleanup_repo(ds_name)
def test_push_dataset_dict_to_hub_name_without_namespace(self):
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})

    local_ds = DatasetDict({"train": ds})

    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(local_ds["train"].features.keys()), list(hub_ds["train"].features.keys()))
        self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

        # Ensure that there is a single file on the repository that has the correct name
        files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
        self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet", "dataset_infos.json"])
    finally:
        self._api.delete_repo(ds_name.split("/")[1], organization=ds_name.split("/")[0], repo_type="dataset")
def make_kor_dataset_v1(args):
    """Build the KorQuAD v1 dataset.

    1. Filter documents by length against the 512-character threshold.
    2. Keep at most 4 questions per context.
    3. Sample 8,000 examples weighted by answer_start position.
    """
    kor_dataset_path = p.join(args.path.train_data_dir, "kor_dataset")

    if p.exists(kor_dataset_path):
        raise FileExistsError(f"{kor_dataset_path} already exists!")

    kor_dataset = load_dataset("squad_kor_v1")
    kor_dataset = concatenate_datasets([
        kor_dataset["train"].flatten_indices(),
        kor_dataset["validation"].flatten_indices(),
    ])

    # (1) Document length: 512 is the minimum length in KLUE MRC
    kor_dataset = filtering_by_doc_len(kor_dataset, doc_len=512)

    # (2) Deduplicate contexts: at most 4 questions per context
    kor_dataset = filtering_by_dup_question(kor_dataset, dup_limit=4)

    # (3) answer_start-weighted sampling, using 2x (8,000 examples)
    kor_dataset = sampling_by_ans_start_weights(kor_dataset, sample=8000)

    # (4) Save only the KOR dataset
    kor_datasets = DatasetDict({"train": kor_dataset})
    kor_datasets.save_to_disk(kor_dataset_path)

    print(f"Saved to {kor_dataset_path}!")
def check_model(self, model):
    raw_datasets = DatasetDict()
    raw_datasets["eval"] = load_dataset("superb", "ks", split="validation")
    raw_datasets = raw_datasets.cast_column("audio", datasets.features.Audio(sampling_rate=16000))
    sample = raw_datasets["eval"][0]

    out = model(sample["audio"]["array"].reshape(1, 16000))
    self.assertEqual(np.argmax(out.logits), 11)
def load_data(self, data: str, dataset: Optional[Any] = None) -> "datasets.Dataset":
    stage = self._running_stage.value
    file_path = data

    path = Path(file_path)
    with open(path, "rb") as f:
        squad_v_2_dict = json.load(f)

    ids = []
    titles = []
    contexts = []
    questions = []
    answers = []
    for topic in squad_v_2_dict["data"]:
        title = topic["title"]
        for comprehension in topic["paragraphs"]:
            context = comprehension["context"]
            for qa in comprehension["qas"]:
                question = qa["question"]
                id = qa["id"]
                _answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                _answers = [answer["text"] for answer in qa["answers"]]

                ids.append(id)
                titles.append(title)
                contexts.append(context)
                questions.append(question)
                answers.append(dict(text=_answers, answer_start=_answer_starts))

    dataset_dict = DatasetDict({
        stage: Dataset.from_dict({
            "id": ids,
            "title": titles,
            "context": contexts,
            "question": questions,
            "answer": answers,
        })
    })

    column_names = dataset_dict[stage].column_names

    dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True, remove_columns=column_names)
    return dataset_dict[stage]
def test_push_dataset_dict_to_hub_datasets_with_different_features(self):
    ds_train = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
    ds_test = Dataset.from_dict({"x": [True, False, True], "y": ["a", "b", "c"]})

    local_ds = DatasetDict({"train": ds_train, "test": ds_test})

    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        with self.assertRaises(ValueError):
            local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token)
    except AssertionError:
        self.cleanup_repo(ds_name)
        raise
def load_data(self, data: Any, columns: List[str] = None) -> "datasets.Dataset":
    stage = self._running_stage.value
    dataset_dict = DatasetDict({stage: Dataset.from_dict(data)})

    column_names = dataset_dict[stage].column_names

    dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True, remove_columns=column_names)
    return dataset_dict[stage]
def test_push_dataset_dict_to_hub_custom_splits(self):
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})

    local_ds = DatasetDict({"random": ds})

    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name, token=self._token)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(local_ds["random"].features.keys()), list(hub_ds["random"].features.keys()))
        self.assertDictEqual(local_ds["random"].features, hub_ds["random"].features)
    finally:
        self.cleanup_repo(ds_name)
def get_correct_dataset_TUWS(wrong_key=False):
    labeled, unlabeled = correct_examples()
    train_dic = Dataset.from_dict({
        'sentence': labeled['sentence'] + unlabeled['sentence'],
        'label': labeled['label'] + unlabeled['label'],
    })
    if wrong_key is False:
        return DatasetDict({'train': train_dic})
    else:
        return DatasetDict({'training_Data': train_dic})
def build_dataloader(location, shuffle_dataset, sampling_fraction, config, collate_fn, tokenizer,
                     continuous_iter=True, world_size=1, num_workers=1):
    size_dicts = {128: 64 * 8, 256: 32 * 8, 512: 16 * 8, 768: 8 * 8, 1024: 8 * 8}
    # TODO: num workers based on dataset size, only top 16 datasets get 2 workers,
    #       next 16 get 1 worker and the rest are done in the main process
    single_node = world_size == 1
    try:
        train_dataset = Dataset.load_from_disk(location)
        train_dataset = TokenizerDataset(config, tokenizer, char_to_id,
                                         dict(padding="max_length", truncation=True, return_tensors="pt",
                                              max_length=config.tokenizer_length),
                                         train_dataset)
        if num_workers > 0:
            train_loader = DataLoader(train_dataset,
                                      sampler=None if single_node else DistributedSampler(train_dataset, shuffle=shuffle_dataset),
                                      batch_size=8 * 8, collate_fn=None,
                                      prefetch_factor=4 if num_workers > 0 else None,
                                      num_workers=(2 * num_workers) if single_node else num_workers)
        else:
            train_loader = DataLoader(train_dataset,
                                      sampler=None if single_node else DistributedSampler(train_dataset, shuffle=shuffle_dataset),
                                      batch_size=8 * 8, collate_fn=None,
                                      num_workers=(2 * num_workers) if single_node else num_workers)
        train_loader = custom_batching_fn(train_loader, size_dicts, continuous_iter)
    except:
        train_dataset = DatasetDict.load_from_disk(location)
        train_dataset = {k: v for k, v in train_dataset.items() if len(v) >= world_size}
        train_dataset_sampling_proba = {k: len(v) ** sampling_fraction for k, v in train_dataset.items()}
        lsum = sum(train_dataset_sampling_proba.values())
        train_dataset_sampling_proba = {k: v / lsum for k, v in train_dataset_sampling_proba.items()}
        train_dataset = {k: TokenizerDataset(config, tokenizer, char_to_id,
                                             dict(padding="max_length", truncation=True, return_tensors="pt",
                                                  max_length=config.tokenizer_length),
                                             v)
                         for k, v in train_dataset.items()}
        # for v in train_dataset.values():
        #     v.training = False
        if num_workers > 0:
            train_loader = {k: DataLoader(v,
                                          sampler=None if single_node else DistributedSampler(v, shuffle=shuffle_dataset),
                                          batch_size=8 * 8, collate_fn=collate_fn, prefetch_factor=2,
                                          num_workers=(2 * num_workers) if single_node else num_workers)
                            for k, v in train_dataset.items()}
        else:
            train_loader = {k: DataLoader(v,
                                          sampler=None if single_node else DistributedSampler(v, shuffle=shuffle_dataset),
                                          batch_size=8 * 8, collate_fn=collate_fn,
                                          num_workers=(2 * num_workers) if single_node else num_workers)
                            for k, v in train_dataset.items()}
        train_loader = {k: custom_batching_fn(dataloader, size_dicts, continuous_iter)
                        for k, dataloader in train_loader.items()}
        train_loader = datadict_iterator(train_loader, train_dataset_sampling_proba)
    return train_loader
def create_vocabulary_from_data(datasets: DatasetDict):
    # Given training and test labels, create a vocabulary
    def extract_all_chars(batch):
        all_text = " ".join(batch["target_text"])
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    vocabs = datasets.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=datasets["train"].column_names,
    )

    # take the union of all unique characters in each dataset
    vocab_set = functools.reduce(
        lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]),
        vocabs.values(),
    )

    vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}

    # replace white space with the delimiter token
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]

    # add unk and pad tokens
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    return vocab_dict
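Downstream, a vocab_dict like this is typically serialized to JSON and used to build a CTC tokenizer. A plausible usage sketch, where the vocab.json path, the raw_datasets variable, and the special-token choices are assumptions rather than part of the original snippet:

import json
from transformers import Wav2Vec2CTCTokenizer

# raw_datasets is assumed to be a DatasetDict with a "train" split and a "target_text" column.
vocab_dict = create_vocabulary_from_data(raw_datasets)

# Persist the character vocabulary so the tokenizer can load it.
with open("vocab.json", "w") as vocab_file:
    json.dump(vocab_dict, vocab_file)

tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)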
def concate(dataset_name, data, cache_dir):
    if dataset_name in dataset_types:
        all_datasets_downloaded = [
            load_dataset(dataset_name, sub_dataset, cache_dir=cache_dir)
            for sub_dataset in dataset_types[dataset_name]
        ]
        combined_datasets = [
            concatenate_datasets(list(sub_dataset.values()))
            for sub_dataset in all_datasets_downloaded
        ]
        data = concatenate_datasets(combined_datasets)
        return DatasetDict({"train": data})

    data = concatenate_datasets(list(load_dataset(dataset_name, cache_dir=cache_dir).values()))
    return DatasetDict({"train": data})
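concate looks up a module-level dataset_types mapping (dataset name to list of configuration names). A hedged usage sketch, assuming that mapping contains an entry for wikitext whose raw configurations share an identical schema (a single "text" column):

# Assumes the module's dataset_types includes:
#   {"wikitext": ["wikitext-2-raw-v1", "wikitext-103-raw-v1"]}
merged = concate("wikitext", data=None, cache_dir="./hf_cache")
print(merged)  # DatasetDict({'train': <all splits of both configurations concatenated>})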
def save_data(train_df, val_df):
    train_f = Features({
        'answers': Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        }, length=-1, id=None),
        'context': Value(dtype='string', id=None),
        'id': Value(dtype='string', id=None),
        'question': Value(dtype='string', id=None),
        'question_type': Value(dtype='int32', id=None)
    })

    train_datasets = DatasetDict({
        'train': Dataset.from_pandas(train_df, features=train_f),
        'validation': Dataset.from_pandas(val_df, features=train_f)
    })

    with open("../../data/question_type.pkl", "wb") as file:
        pickle.dump(train_datasets, file)
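The matching load step is just the inverse pickle call; a short sketch reusing the path above:

import pickle

with open("../../data/question_type.pkl", "rb") as f:
    train_datasets = pickle.load(f)

print(train_datasets["train"].features)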
def load_data(
    self,
    data: Tuple[str, Union[str, List[str]], Union[str, List[str]]],
    dataset: Optional[Any] = None,
    columns: Union[List[str], Tuple[str]] = ("input_ids", "attention_mask", "labels"),
) -> Union[Sequence[Mapping[str, Any]]]:
    csv_file, input, target = data

    data_files = {}
    stage = self.running_stage.value
    data_files[stage] = str(csv_file)

    # FLASH_TESTING is set in the CI to run faster.
    if flash._IS_TESTING and not torch.cuda.is_available():
        try:
            dataset_dict = DatasetDict({
                stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
            })
        except Exception:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        dataset_dict = load_dataset(self.filetype, data_files=data_files)

    if self.training:
        labels = list(sorted(list(set(dataset_dict[stage][target]))))
        dataset.num_classes = len(labels)
        self.set_state(LabelsState(labels))

    labels = self.get_state(LabelsState)

    # convert labels to ids
    # if not self.predicting:
    if labels is not None:
        labels = labels.labels
        label_to_class_mapping = {v: k for k, v in enumerate(labels)}
        dataset_dict = dataset_dict.map(partial(self._transform_label, label_to_class_mapping, target))

    dataset_dict = dataset_dict.map(partial(self._tokenize_fn, input=input), batched=True)

    # Hugging Face models expect the target to be named ``labels``.
    if not self.predicting and target != "labels":
        dataset_dict.rename_column_(target, "labels")

    dataset_dict.set_format("torch", columns=columns)
    return dataset_dict[stage]
def get_dataset_cotrain(wrong_key=False):
    labeled, _ = correct_examples()
    unlabeled = Dataset.from_dict({'sentence': ['moon what??.', 'I am people']})
    if wrong_key is False:
        return DatasetDict({
            'labeled1': labeled,
            'labeled2': labeled,
            'unlabeled': unlabeled
        })
    else:
        return DatasetDict({
            'labeled1': labeled,
            'labeled2': labeled,
            'unlabels': unlabeled
        })
def load_eval_data(self, force_reload=False, save_datasets=True) -> None:
    eval_save_dir = self.save_dir / "eval"
    try:
        if force_reload:
            raise Exception()
        self.datasets["eval"] = DatasetDict.load_from_disk(eval_save_dir)
        print("Evaluation data loaded from disk.")
    except:
        print("Regenerating evaluation data.")
        eval_df_dict = self._parse_eval_data(self.eval_dir)
        self.datasets["eval"] = DatasetDict({
            "far": Dataset.from_pandas(eval_df_dict["far"]),
            "obj": Dataset.from_pandas(eval_df_dict["obj"]),
        })
        if save_datasets:
            print(f"Saving evaluation dataset to {eval_save_dir}")
            self.datasets["eval"].save_to_disk(eval_save_dir)
def split_relabel_jigsaw_severetoxic(dataset):
    dataset = dataset.rename_column("severe_toxic", "labels")
    train_val = dataset['train'].train_test_split(test_size=0.25)
    dataset = DatasetDict({
        'train': train_val['train'],
        'test': dataset['test'],
        'validation': train_val['test']
    })
    return dataset
def split_relabel_jigsaw_identityhate(dataset):
    dataset = dataset.rename_column("identity_hate", "labels")
    train_val = dataset['train'].train_test_split(test_size=0.25)
    dataset = DatasetDict({
        'train': train_val['train'],
        'test': dataset['test'],
        'validation': train_val['test']
    })
    return dataset
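Both jigsaw helpers follow the same pattern: rename one label column to "labels", then carve a validation split out of train. A hedged usage sketch, assuming the Jigsaw toxicity data has been downloaded manually (the data_dir path is an assumption):

from datasets import load_dataset

# jigsaw_toxicity_pred requires the manually downloaded Kaggle CSVs in data_dir.
jigsaw = load_dataset("jigsaw_toxicity_pred", data_dir="./jigsaw_data")

severe = split_relabel_jigsaw_severetoxic(jigsaw)
print(severe)  # DatasetDict with train/test/validation splits and a "labels" column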
def load_data(self, data: Any, columns: List[str] = None) -> "datasets.Dataset":
    if columns is None:
        columns = ["input_ids", "attention_mask", "labels"]

    if self.filetype == "json":
        file, input, target, field = data
    else:
        file, input, target = data

    data_files = {}
    stage = self._running_stage.value
    data_files[stage] = str(file)

    # FLASH_TESTING is set in the CI to run faster.
    if flash._IS_TESTING:
        try:
            if self.filetype == "json" and field is not None:
                dataset_dict = DatasetDict({
                    stage: load_dataset(self.filetype, data_files=data_files, split=[f"{stage}[:20]"], field=field)[0]
                })
            else:
                dataset_dict = DatasetDict({
                    stage: load_dataset(self.filetype, data_files=data_files, split=[f"{stage}[:20]"])[0]
                })
        except Exception:
            if self.filetype == "json" and field is not None:
                dataset_dict = load_dataset(self.filetype, data_files=data_files, field=field)
            else:
                dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        if self.filetype == "json" and field is not None:
            dataset_dict = load_dataset(self.filetype, data_files=data_files, field=field)
        else:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)

    dataset_dict = dataset_dict.map(partial(self._tokenize_fn, input=input, target=target), batched=True)
    dataset_dict.set_format(columns=columns)
    return dataset_dict[stage]
def createDataset(config):
    """Build the dataset from the h5 file and filter out rare *individual ATU* types."""
    df = pd.read_hdf(config.data.h5_file, key=config.data.h5_key)
    atu = df.loc[df.groupby("atu")["atu"].filter(
        lambda g: len(g) >= config["datamodules"]["atu_filter_no"]).index]
    atu = atu[["text", "atu", "desc", "label"]]
    dataset = Dataset.from_pandas(atu)

    tokenizer = AutoTokenizer.from_pretrained(config["module"]["arch"])

    def tokenize(instance):
        return tokenizer(instance["text"],
                         max_length=config["module"]["seq_len"],
                         truncation="longest_first",
                         padding="max_length")

    dataset = dataset. \
        shuffle(seed=config.seed). \
        map(tokenize, batched=True)

    # split by cls (stratified)
    sub_ds = {"train": [], "test": []}
    for cls in np.unique(dataset["label"]):
        cls_ds = dataset. \
            filter(lambda d: d['label'] == int(cls))
        cls_ds = cls_ds.train_test_split(test_size=config.data.test_ratio, seed=config.seed)
        sub_ds["train"].append(cls_ds["train"])
        sub_ds["test"].append(cls_ds["test"])

    dataset = DatasetDict({split: concatenate_datasets(ds) for split, ds in sub_ds.items()})
    dataset.save_to_disk(config.data.cached_dir)
    return dataset
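Once createDataset has run, the tokenized, stratified splits can be reloaded without re-tokenizing; a short sketch where the path is an assumption standing in for config.data.cached_dir:

from datasets import DatasetDict

cached_dir = "./atu_dataset_cache"  # assumed; mirrors config.data.cached_dir
dataset = DatasetDict.load_from_disk(cached_dir)
print(dataset["train"].column_names)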
def run_sparse_retrieval(datasets, training_args):
    #### retrieval process ####
    retriever = SparseRetrieval(tokenize_fn=tokenize,
                                data_path="./data",
                                context_path="wikipedia_documents.json"
                                # context_path="all_wikipedia_documents.json"
                                )

    # sparse embedding retrieval
    # retriever.get_sparse_embedding()
    # df = retriever.retrieve(datasets['validation'])

    # bm25 retrieval
    # retriever.get_embedding_BM25()
    # df = retriever.retrieve_BM25(query_or_dataset=datasets['validation'], topk=10)

    # elastic search retrieval
    # retriever.get_elastic_search()
    df = retriever.retrieve_ES(query_or_dataset=datasets['validation'], topk=10)

    # faiss retrieval
    # df = retriever.retrieve_faiss(dataset['validation'])

    if training_args.do_predict:
        # The test data has no answers, so the dataset only contains id, question, and context.
        f = Features({
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })
    elif training_args.do_eval:
        # The train data has answers, so the dataset contains id, question, context, and answers.
        f = Features({
            'answers': Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            }, length=-1, id=None),
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
def load_data(self,
              filepath: str,
              dataset: AutoDataset,
              columns: Union[List[str], Tuple[str]] = ("input_ids", "attention_mask", "labels"),
              use_full: bool = True):
    data_files = {}
    stage = dataset.running_stage.value
    data_files[stage] = str(filepath)

    # FLASH_TESTING is set in the CI to run faster.
    if use_full and os.getenv("FLASH_TESTING", "0") == "0":
        dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        # used for debugging. Avoid processing the entire dataset  # noqa E265
        dataset_dict = DatasetDict({
            stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
        })

    dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True)

    # convert labels to ids
    if not self.predicting:
        dataset_dict = dataset_dict.map(self._transform_label)

    dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True)

    # Hugging Face models expect the target to be named ``labels``.
    if not self.predicting and self.target != "labels":
        dataset_dict.rename_column_(self.target, "labels")

    dataset_dict.set_format("torch", columns=columns)

    if not self.predicting:
        dataset.num_classes = len(self.label_to_class_mapping)

    return dataset_dict[stage]
def run_sparse_retrieval(datasets, training_args, inf_args):
    #### retrieval process ####
    if inf_args.retrieval is None:
        retriever = SparseRetrieval_BM25PLUS(tokenize_fn=tokenize,
                                             data_path="./data",
                                             context_path="wikipedia_documents.json")
    elif inf_args.retrieval.lower() == "sparse":
        retriever = SparseRetrieval(tokenize_fn=tokenize,
                                    data_path="./data",
                                    context_path="wikipedia_documents.json")
    # elif inf_args.retrieval.lower() == "bm25":
    #     retriever = SparseRetrieval_BM25(tokenize_fn=tokenize,
    #                                      data_path="./data",
    #                                      context_path="wikipedia_documents.json")

    retriever.get_sparse_embedding()
    df = retriever.retrieve(datasets['validation'], inf_args.k)
    # faiss retrieval

    # The test data has no answers, so the dataset only contains id, question, and context.
    if training_args.do_predict:
        f = Features({
            'contexts': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })
    # The train data has answers, so the dataset contains id, question, context, and answers.
    elif training_args.do_eval:
        f = Features({
            'answers': Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            }, length=-1, id=None),
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets