def load_datasets(lang="es", random_state=2021, preprocessing_args={}):
    """
    Load emotion recognition datasets
    """
    train_df = load_df(paths[lang]["train"])
    test_df = load_df(paths[lang]["test"])
    train_df, dev_df = train_test_split(train_df,
                                        stratify=train_df["label"],
                                        random_state=random_state)

    for df in [train_df, dev_df, test_df]:
        for label, idx in label2id.items():
            df.loc[df["label"] == label, "label"] = idx
        df["label"] = df["label"].astype(int)

    preprocess = lambda x: preprocess_tweet(x, lang=lang, **preprocessing_args)

    train_df.loc[:, "text"] = train_df["text"].apply(preprocess)
    dev_df.loc[:, "text"] = dev_df["text"].apply(preprocess)
    test_df.loc[:, "text"] = test_df["text"].apply(preprocess)

    features = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes=len(id2label),
                            names=[id2label[k] for k in sorted(id2label.keys())])
    })

    train_dataset = Dataset.from_pandas(train_df, features=features)
    dev_dataset = Dataset.from_pandas(dev_df, features=features)
    test_dataset = Dataset.from_pandas(test_df, features=features)

    return train_dataset, dev_dataset, test_dataset
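A minimal, self-contained sketch of the label-encoding pattern used by load_datasets above. The toy rows, the toy emotion names, and the `toy_*` variables are illustrative assumptions; the real function relies on module-level `paths`, `label2id`, and `id2label` objects that are not shown here.

import pandas as pd
from datasets import ClassLabel, Dataset, Features, Value

# Hypothetical stand-ins for the module-level id2label / label2id mappings.
toy_id2label = {0: "others", 1: "joy", 2: "sadness", 3: "anger"}
toy_label2id = {v: k for k, v in toy_id2label.items()}

toy_df = pd.DataFrame({
    "text": ["so happy today", "this is terrible"],
    "label": ["joy", "sadness"],
})
# Map string labels to integer ids before handing the frame to Dataset.from_pandas.
toy_df["label"] = toy_df["label"].map(toy_label2id).astype(int)

toy_features = Features({
    "text": Value("string"),
    "label": ClassLabel(num_classes=len(toy_id2label),
                        names=[toy_id2label[k] for k in sorted(toy_id2label)]),
})
toy_dataset = Dataset.from_pandas(toy_df, features=toy_features)
print(toy_dataset.features["label"].names)  # ['others', 'joy', 'sadness', 'anger']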
def concatenate_datasets_with_ratio(args, train_dataset):
    concatenate_list = []

    for sub_dataset_name, ratio in zip(args.data.sub_datasets.split(","),
                                       args.data.sub_datasets_ratio.split(",")):
        ratio = float(ratio)
        sub_dataset_path = p.join(args.path.train_data_dir, sub_dataset_name)
        assert p.exists(sub_dataset_path), f"{sub_dataset_name} does not exist."

        sub_dataset = load_from_disk(sub_dataset_path)
        sub_dataset_len = int(len(sub_dataset["train"]) * ratio)
        print(f"ADD SUB DATASET {sub_dataset_name}, LENGTH: {sub_dataset_len}")

        # sub dataset must have the same features: ['id', 'title', 'context', 'question', 'answers']
        features = sub_dataset["train"].features
        new_sub_dataset = sub_dataset["train"].select(range(sub_dataset_len))
        new_sub_dataset = Dataset.from_pandas(new_sub_dataset.to_pandas(), features=features)
        concatenate_list.append(new_sub_dataset.flatten_indices())

    train_dataset = Dataset.from_pandas(train_dataset.to_pandas(), features=features)
    train_dataset = concatenate_datasets([train_dataset.flatten_indices()] + concatenate_list)

    return train_dataset
def save_data(train_df, val_df):
    train_f = Features({
        'answers': Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        }, length=-1, id=None),
        'context': Value(dtype='string', id=None),
        'id': Value(dtype='string', id=None),
        'question': Value(dtype='string', id=None),
        'question_type': Value(dtype='int32', id=None)
    })

    train_datasets = DatasetDict({
        'train': Dataset.from_pandas(train_df, features=train_f),
        'validation': Dataset.from_pandas(val_df, features=train_f)
    })

    # Use a context manager so the file handle is closed even on error.
    with open("../../data/question_type.pkl", "wb") as file:
        pickle.dump(train_datasets, file)
def load_domain_split_dataset(self, data_dir, logger=None):
    """
    Loads break dataset with domain split.
    Train - on text. val + test - on DB + images.
    :param data_dir: The path of the directory where the preprocessed dataset should be saved to or loaded from.
    :param logger: A logger for logging events.
    :return: The loaded dataset.
    """
    current_dir = Path()
    dir_path = current_dir / "data" / "break_data" / "preprocessed"
    file_name = "dataset_preprocessed_domain_split.pkl"
    if not (dir_path / file_name).is_file():
        if logger:
            logger.info('Creating domain split dataset...')
        text_domain_dataset_prefixes = ('COMQA', 'CWQ', 'DROP', 'HOTP')
        image_domain_dataset_prefixes = ('CLEVR', 'NLVR2')
        DB_domain_dataset_prefixes = ('ACADEMIC', 'ATIS', 'GEO', 'SPIDER')
        image_plus_DB = image_domain_dataset_prefixes + DB_domain_dataset_prefixes

        # Note: DataFrame.append was removed in pandas 2.0; this code assumes pandas < 2.0.
        train_filtered = pd.DataFrame()
        validation_filtered = pd.DataFrame()
        test_filtered = pd.DataFrame()
        for i, example in enumerate(self.dataset_logical['train']):
            if example['question_id'].startswith(text_domain_dataset_prefixes):
                train_filtered = train_filtered.append(example, ignore_index=True)
        for i, example in enumerate(self.dataset_logical['validation']):
            if example['question_id'].startswith(image_plus_DB):
                validation_filtered = validation_filtered.append(example, ignore_index=True)
        for i, example in enumerate(self.dataset_logical['test']):
            if example['question_id'].startswith(image_plus_DB):
                test_filtered = test_filtered.append(example, ignore_index=True)

        # TODO delete this?
        # train_dataset = self.dataset_logical['train'].filter(
        #     lambda example: example['question_id'].startswith(text_domain_dataset_prefixes))
        # validation_dataset = self.dataset_logical['validation'].filter(
        #     lambda example: example['question_id'].startswith(image_plus_DB))
        # test_dataset = self.dataset_logical['test'].filter(
        #     lambda example: example['question_id'].startswith(image_plus_DB))
        # train_filtered_ds = Dataset.from_pandas(train_filtered)

        to_save = {
            'train': Dataset.from_pandas(train_filtered),
            'validation': Dataset.from_pandas(validation_filtered),
            'test': Dataset.from_pandas(test_filtered)
        }
        save_obj(dir_path, to_save, file_name)

    dataset = load_obj(dir_path, file_name)
    return dataset
def clean_datasets():
    config = read_config()

    if config['kaggle']:
        trainset, testset = get_datasets(
            "../input/commonlitreadabilityprize/train.csv",
            discard=["url_legal", "license"])
        trainset = trainset.rename(columns={'target': 'labels', 'excerpt': 'text'})
        testset = testset.rename(columns={'target': 'labels', 'excerpt': 'text'})
    else:
        trainset, testset = get_datasets(config['dataset']['filename'],
                                         discard=config['dataset']['discard'])

    trainset = Dataset.from_pandas(trainset)
    testset = Dataset.from_pandas(testset)

    return trainset, testset
def load_hf_dataset(data, tokenizer, args):
    if isinstance(data, str):
        dataset = load_dataset(
            "csv",
            data_files=data,
            delimiter="\t",
            download_mode="force_redownload"
            if args.reprocess_input_data else "reuse_dataset_if_exists",
        )
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        lambda x: preprocess_batch_for_hf_dataset(x, tokenizer=tokenizer, args=args),
        batched=True,
    )

    dataset.set_format(type="pt", columns=["input_ids", "attention_mask"])

    if isinstance(data, str):
        # This is not necessarily a train dataset. The datasets library insists on calling it train.
        return dataset["train"]
    else:
        return dataset
def load_dataset(dataset: str = "ChnSentiCorp", split: str = "train"):
    df = pd.read_csv(f"/data/{dataset}_{split}.tsv", sep="\t")
    ds = Dataset.from_pandas(df)
    # Assigning to ds.features in place does not update the underlying schema;
    # cast the label column to a proper ClassLabel instead.
    ds = ds.cast_column("label", ClassLabel(num_classes=2, names=["pos", "neg"]))
    return ds
def test_from_hf_datasets_multilabel():
    TEST_HF_DATASET_DATA_MULTILABEL = Dataset.from_pandas(TEST_DATA_FRAME_DATA_MULTILABEL)
    dm = TextClassificationData.from_hf_datasets(
        "sentence",
        ["lab1", "lab2"],
        train_hf_dataset=TEST_HF_DATASET_DATA_MULTILABEL,
        val_hf_dataset=TEST_HF_DATASET_DATA_MULTILABEL,
        test_hf_dataset=TEST_HF_DATASET_DATA_MULTILABEL,
        predict_hf_dataset=TEST_HF_DATASET_DATA_MULTILABEL,
        batch_size=1,
    )
    assert dm.multi_label

    batch = next(iter(dm.train_dataloader()))
    assert all([label in [0, 1] for label in batch[DataKeys.TARGET][0]])
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.val_dataloader()))
    assert all([label in [0, 1] for label in batch[DataKeys.TARGET][0]])
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.test_dataloader()))
    assert all([label in [0, 1] for label in batch[DataKeys.TARGET][0]])
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.predict_dataloader()))
    assert isinstance(batch[DataKeys.INPUT][0], str)
def build_dataset(df, tokenizer, batch_size):
    features = Features({
        'id': Value('uint64'),
        'context': Value('string'),
        'text': Value('string'),
    })

    dataset = Dataset.from_pandas(df, features=features)
    dataset = dataset.map(
        lambda x: tokenizer(x["text"], x["context"],
                            padding="longest", truncation='longest_first'),
        batched=True,
        batch_size=batch_size,
    )

    def format_dataset(dataset):
        dataset.set_format(
            type='torch',
            columns=['input_ids', 'token_type_ids', 'attention_mask'])
        return dataset

    dataset = format_dataset(dataset)
    return dataset
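A hypothetical usage sketch for build_dataset above; the toy DataFrame and the "bert-base-uncased" checkpoint are assumptions for illustration, not part of the original code.

import pandas as pd
from transformers import AutoTokenizer

toy_df = pd.DataFrame({
    "id": [0, 1],
    "context": ["The sky is blue.", "Grass is green."],
    "text": ["What color is the sky?", "What color is grass?"],
})
toy_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
toy_dataset = build_dataset(toy_df, toy_tokenizer, batch_size=2)
print(toy_dataset[0]["input_ids"])  # a torch tensor, thanks to set_format(type='torch')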
def getDataset(config):
    """
    build dataset from the h5 file
    """
    atu = pd.read_hdf(config.data.h5_file, key=config.data.h5_key)
    atu = atu[["text", "atu", "desc"]]
    dataset = Dataset.from_pandas(atu)

    tokenizer = AutoTokenizer.from_pretrained(config["module"]["arch"])

    def tokenize(instance):
        return tokenizer(instance["text"],
                         max_length=config["module"]["seq_len"],
                         truncation=True,
                         padding=True)

    dataset = dataset.shuffle(seed=config.seed).map(tokenize, batched=True)
    dataset.set_format(
        type="numpy",
        columns=['input_ids', 'attention_mask', "atu", "desc", "text"])

    return dataset
def parse_test_key(test: str, key: str):
    tweets = []
    file1 = open(test, 'r')
    file2 = open(key, 'r')
    # Skip the header line of each file.
    file1.readline()
    file2.readline()
    while True:
        line1 = file1.readline()
        line2 = file2.readline()
        if not line1 or not line2:
            break
        split_line1 = line1.split('\t')
        split_line2 = line2.split('\t')
        if split_line2[1].strip() == 'oth':
            tweets.append([split_line1[0], detweetify(split_line1[1]), 4])
        elif split_line2[1].strip() == 'grp':
            tweets.append([split_line1[0], detweetify(split_line1[1]), 3])
        elif split_line2[1].strip() == 'ind':
            tweets.append([split_line1[0], detweetify(split_line1[1]), 2])
        elif split_line2[1].strip() == 'prof':
            tweets.append([split_line1[0], detweetify(split_line1[1]), 1])
        else:
            tweets.append([split_line1[0], detweetify(split_line1[1]), 0])
    return Dataset.from_pandas(
        pd.DataFrame(tweets, columns=['id', 'tweet', 'labels']))
def test_from_hf_datasets():
    TEST_HF_DATASET_DATA = Dataset.from_pandas(TEST_DATA_FRAME_DATA)
    dm = TextClassificationData.from_hf_datasets(
        "sentence",
        "lab1",
        train_hf_dataset=TEST_HF_DATASET_DATA,
        val_hf_dataset=TEST_HF_DATASET_DATA,
        test_hf_dataset=TEST_HF_DATASET_DATA,
        predict_hf_dataset=TEST_HF_DATASET_DATA,
        batch_size=1,
    )

    batch = next(iter(dm.train_dataloader()))
    assert batch[DataKeys.TARGET].item() in [0, 1]
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.val_dataloader()))
    assert batch[DataKeys.TARGET].item() in [0, 1]
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.test_dataloader()))
    assert batch[DataKeys.TARGET].item() in [0, 1]
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.predict_dataloader()))
    assert isinstance(batch[DataKeys.INPUT][0], str)
def load_hf_dataset(data, tokenizer, args, multi_label):
    if isinstance(data, str):
        dataset = load_dataset("csv", data_files=data, delimiter="\t")
    else:
        dataset = HFDataset.from_pandas(data)

    if args.labels_map and not args.regression:
        dataset = dataset.map(
            lambda x: map_labels_to_numeric(x, multi_label, args))

    dataset = dataset.map(
        lambda x: preprocess_batch_for_hf_dataset(
            x, tokenizer=tokenizer, max_seq_length=args.max_seq_length),
        batched=True,
    )

    if args.model_type in ["bert", "xlnet", "albert", "layoutlm"]:
        dataset.set_format(
            type="pt",
            columns=["input_ids", "token_type_ids", "attention_mask", "labels"])
    else:
        dataset.set_format(
            type="pt",
            columns=["input_ids", "attention_mask", "labels"])

    if isinstance(data, str):
        # This is not necessarily a train dataset. The datasets library insists on calling it train.
        return dataset["train"]
    else:
        return dataset
def load_datasets(preprocess_args={}):
    """
    Return train, dev, test datasets
    """
    train_files = glob(os.path.join(tass_dir, "train/*.tsv"))
    dev_files = glob(os.path.join(tass_dir, "dev/*.tsv"))
    test_files = glob(os.path.join(tass_dir, "test1.1/*.tsv"))

    train_dfs = {get_lang(file): load_df(file) for file in train_files}
    dev_dfs = {get_lang(file): load_df(file) for file in dev_files}
    test_dfs = {get_lang(file): load_df(file, test=True) for file in test_files}

    train_df = pd.concat(train_dfs.values())
    dev_df = pd.concat(dev_dfs.values())
    test_df = pd.concat(test_dfs.values())

    print(len(train_df), len(dev_df), len(test_df))

    """
    Tokenize tweets
    """
    preprocess_with_args = lambda x: preprocess_tweet(x, **preprocess_args)

    train_df["text"] = train_df["text"].apply(preprocess_with_args)
    dev_df["text"] = dev_df["text"].apply(preprocess_with_args)
    test_df["text"] = test_df["text"].apply(preprocess_with_args)

    features = Features({
        'text': Value('string'),
        'lang': Value('string'),
        'label': ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
    })

    columns = ["text", "lang", "label"]

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
def gen_fake_data():
    # 16 identical sentence pairs joined by the <SIMSEP> separator
    # ("今天是星期四" / "今天是周四", i.e. "Today is Thursday" in two phrasings).
    _data = {'text': ['今天是星期四<SIMSEP>今天是周四'] * 16}
    _df = pd.DataFrame(_data)
    _dataset = Dataset.from_pandas(_df)
    return _dataset
def read_txt(txt_path):
    # Each line of the txt file is expected to look like "<path>|<sentence>".
    data = pd.read_csv(txt_path, delimiter='\n', header=None, names=['path', 'sentence'])
    # '|' is a regex metacharacter, so disable regex matching for the containment check.
    has_pipe = data['path'].str.contains('|', regex=False)
    data[['path', 'sentence']] = data.loc[has_pipe, 'path'].str.split('|', expand=True)
    data = Dataset.from_pandas(data)
    return data
def get_etr_dataset(args):
    etr_path = p.join(args.path.train_data_dir, "etr_qa_dataset.json")

    if not p.exists(etr_path):
        raise FileNotFoundError(
            f"Please rename the ETRI dataset file to {etr_path} and place the data there.")

    with open(etr_path, "r") as f:
        etr_dict = json.load(f)

    # print(etr_dict["data"][0])

    new_dataset = defaultdict(list)

    cnt = 0
    for datas in etr_dict["data"]:
        title = datas["title"]
        context = datas["paragraphs"][0]["context"]

        for questions in datas["paragraphs"][0]["qas"]:
            question = questions["question"]
            answers = {
                "answer_start": [questions["answers"][0]["answer_start"]],
                "text": [questions["answers"][0]["text"]],
            }

            new_dataset["id"].append(f"etr-custom-{cnt}")
            new_dataset["title"].append(title)
            new_dataset["context"].append(context)
            new_dataset["question"].append(question)
            new_dataset["answers"].append(answers)
            cnt += 1

    f = Features({
        "answers": Sequence(
            feature={
                "text": Value(dtype="string", id=None),
                "answer_start": Value(dtype="int32", id=None)
            },
            length=-1,
            id=None,
        ),
        "id": Value(dtype="string", id=None),
        "context": Value(dtype="string", id=None),
        "question": Value(dtype="string", id=None),
        "title": Value(dtype="string", id=None),
    })

    df = pd.DataFrame(new_dataset)
    etr_dataset = Dataset.from_pandas(df, features=f)

    return etr_dataset
def load_length_split_dataset(self, data_dir, logger=None):
    """
    Loads break dataset with length split based on number of operators.
    Train - below the operator threshold. val + test - at or above the threshold.
    :param data_dir: The path of the directory where the preprocessed dataset should be saved to or loaded from.
    :param logger: A logger for logging events.
    :return: The loaded dataset.
    """
    # TODO datadir required in signature?
    current_dir = Path()
    dir_path = current_dir / "data" / "break_data" / "preprocessed"
    file_name = "dataset_preprocessed_length_split.pkl"
    if not (dir_path / file_name).is_file():
        if logger:
            logger.info('Creating length split dataset...')
        threshold_amount_ops = 4

        # Note: DataFrame.append was removed in pandas 2.0; this code assumes pandas < 2.0.
        train_filtered = pd.DataFrame()
        validation_filtered = pd.DataFrame()
        test_filtered = pd.DataFrame()
        for i, example in enumerate(self.dataset_logical['train']):
            if example['operators'].count(',') < threshold_amount_ops:
                train_filtered = train_filtered.append(example, ignore_index=True)
        for i, example in enumerate(self.dataset_logical['validation']):
            if example['operators'].count(',') >= threshold_amount_ops:
                validation_filtered = validation_filtered.append(example, ignore_index=True)
        for i, example in enumerate(self.dataset_logical['test']):
            if example['operators'].count(',') >= threshold_amount_ops:
                test_filtered = test_filtered.append(example, ignore_index=True)

        to_save = {
            'train': Dataset.from_pandas(train_filtered),
            'validation': Dataset.from_pandas(validation_filtered),
            'test': Dataset.from_pandas(test_filtered)
        }
        save_obj(dir_path, to_save, file_name)

    dataset = load_obj(dir_path, file_name)
    return dataset
def update_metadata(token, commit_sha):
    """
    Update the metadata for the Transformers repo.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        repo = Repository(tmp_dir,
                          clone_from="huggingface/transformers-metadata",
                          repo_type="dataset",
                          use_auth_token=token)

        frameworks_table = get_frameworks_table()
        frameworks_dataset = Dataset.from_pandas(frameworks_table)
        frameworks_dataset.to_json(os.path.join(tmp_dir, "frameworks.json"))

        tags_dataset = Dataset.from_json(os.path.join(tmp_dir, "pipeline_tags.json"))
        table = {
            tags_dataset[i]["model_class"]: (tags_dataset[i]["pipeline_tag"],
                                             tags_dataset[i]["auto_class"])
            for i in range(len(tags_dataset))
        }
        table = update_pipeline_and_auto_class_table(table)

        # Sort the model classes to avoid nondeterministic updates creating false update commits.
        model_classes = sorted(list(table.keys()))
        tags_table = pd.DataFrame({
            "model_class": model_classes,
            "pipeline_tag": [table[m][0] for m in model_classes],
            "auto_class": [table[m][1] for m in model_classes],
        })
        tags_dataset = Dataset.from_pandas(tags_table)
        tags_dataset.to_json(os.path.join(tmp_dir, "pipeline_tags.json"))

        if repo.is_repo_clean():
            print("Nothing to commit!")
        else:
            if commit_sha is not None:
                commit_message = (
                    f"Update with commit {commit_sha}\n\nSee: "
                    f"https://github.com/huggingface/transformers/commit/{commit_sha}")
            else:
                commit_message = "Update"
            repo.push_to_hub(commit_message)
def load_custom_dataset_commonvoice_format(path, split, path_column='path'):
    # TODO: add support for multiple splits together, e.g. train+validation
    dataset_path = Path(path) / (split + '.tsv')
    df = pd.read_csv(dataset_path, sep='\t')
    # Resolve audio paths relative to the dataset directory.
    # Series.iteritems() was removed in pandas 2.0; items() is the supported spelling.
    df[path_column] = [
        str((Path(path) / p).absolute()) for _, p in df[path_column].items()
    ]
    return Dataset.from_pandas(df)
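A hypothetical usage sketch for load_custom_dataset_commonvoice_format above; the temporary directory and the tiny TSV contents are illustrative assumptions.

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp_dir:
    # Write a minimal CommonVoice-style TSV with the expected header.
    (Path(tmp_dir) / "train.tsv").write_text(
        "path\tsentence\nclip_0001.mp3\thello world\nclip_0002.mp3\tgoodbye\n")
    ds = load_custom_dataset_commonvoice_format(tmp_dir, "train")
    print(ds["path"])  # audio paths resolved to absolute paths under tmp_dir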
def init_dataset(self, X, y):
    dataset = Dataset.from_pandas(
        pd.DataFrame({
            'text': X,
            'label': [self.idx_to_label.index(lbl_str) for lbl_str in y]
        }))
    return dataset.map(self._tokenize_function, batched=True)
def make_negative_dataset(args, bm25, queries, answers, contexts, name, num=16):
    total = []
    scores, indices = bm25.get_relevant_doc_bulk(queries, topk=num * 2)

    answers, indices = np.array(answers, dtype="object"), np.array(indices)
    contexts = np.array(contexts, dtype="object")

    for idx, query in enumerate(queries):
        label = idx % num
        answer = answers[idx]

        context_list = contexts[indices[idx]]
        check_in = np.argwhere(context_list == answer)

        if check_in.shape[0] == 0:
            context_list[label] = answer
            context_list = context_list[:num]
        else:
            context_list[check_in[0][0]] = context_list[num]
            context_list[label] = answer
            context_list = context_list[:num]

        if idx % 100 == 0:
            print("query: ", query)
            print("answer: ", answer)
            print("negative:", context_list)
            print("label:", label)

        tmp = {
            "query": query,
            "negative_samples": context_list,
            "label": label
        }
        total.append(tmp)

    df = pd.DataFrame(total)
    f = Features({
        "query": Value(dtype="string", id=None),
        "negative_samples": Sequence(feature=Value(dtype="string", id=None),
                                     length=-1, id=None),
        "label": Value(dtype="int32", id=None),
    })

    dataset = Dataset.from_pandas(df, features=f)
    dataset.save_to_disk(os.path.join(args.path.train_data_dir, name))
def parse_training(file_location: str):
    tweets = []
    file = open(file_location, 'r')
    lines = file.readlines()
    # Skip the header line; each remaining line is "<id>\t<tweet>\t<label>".
    for line in lines[1:]:
        split_line = line.split('\t')
        tweets.append([split_line[0],
                       detweetify(split_line[1]),
                       int(split_line[2].strip() == 'OFF')])
    return Dataset.from_pandas(
        pd.DataFrame(tweets, columns=['id', 'tweet', 'labels']))
def load_eval_data(self, force_reload=False, save_datasets=True) -> None:
    eval_save_dir = self.save_dir / "eval"
    try:
        if force_reload:
            raise Exception()
        self.datasets["eval"] = DatasetDict.load_from_disk(eval_save_dir)
        print("Evaluation data loaded from disk.")
    except Exception:
        print("Regenerating evaluation data.")
        eval_df_dict = self._parse_eval_data(self.eval_dir)
        self.datasets["eval"] = DatasetDict({
            "far": Dataset.from_pandas(eval_df_dict["far"]),
            "obj": Dataset.from_pandas(eval_df_dict["obj"]),
        })
        if save_datasets:
            print(f"Saving evaluation dataset to {eval_save_dir}")
            self.datasets["eval"].save_to_disk(eval_save_dir)
def generate_faiss_index_dataset(data, ctx_encoder_name, args, device):
    """
    Adapted from Huggingface example script at
    https://github.com/huggingface/transformers/blob/master/examples/research_projects/rag/use_own_knowledge_dataset.py
    """
    import faiss

    if isinstance(data, str):
        dataset = load_dataset("csv",
                               data_files=data,
                               delimiter="\t",
                               column_names=["title", "text"])
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        partial(split_documents,
                split_text_n=args.split_text_n,
                split_text_character=args.split_text_character),
        batched=True,
        num_proc=args.process_count,
    )

    ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(ctx_encoder_name)

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32"))
    })  # optional, save as float32 instead of float64 to save space

    dataset = dataset.map(
        partial(embed,
                ctx_encoder=ctx_encoder,
                ctx_tokenizer=ctx_tokenizer,
                device=device),
        batched=True,
        batch_size=args.rag_embed_batch_size,
        features=new_features,
    )

    if isinstance(data, str):
        dataset = dataset["train"]

    if args.save_knowledge_dataset:
        output_dataset_directory = os.path.join(args.output_dir, "knowledge_dataset")
        os.makedirs(output_dataset_directory, exist_ok=True)
        dataset.save_to_disk(output_dataset_directory)

    index = faiss.IndexHNSWFlat(args.faiss_d, args.faiss_m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
def load_data(
    self,
    data_frame: pd.DataFrame,
    input_key: str,
    target_keys: Optional[Union[str, List[str]]] = None,
    target_formatter: Optional[TargetFormatter] = None,
) -> Dataset:
    return super().load_data(Dataset.from_pandas(data_frame),
                             input_key,
                             target_keys,
                             target_formatter=target_formatter)
def load_dt_data(data):
    data_df = pd.DataFrame(data, columns=['path', 'sentence'])
    data = Dataset.from_pandas(data_df)

    CHARS_TO_IGNORE = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\$\©\~\)\(\§\'\d]'

    def remove_special_characters(batch):
        batch["sentence"] = re.sub(CHARS_TO_IGNORE, '', batch["sentence"]).lower() + " "
        return batch

    def dt_speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        batch["speech"] = speech_array[0].numpy()
        batch["sampling_rate"] = sampling_rate
        batch["target_text"] = batch["sentence"]
        return batch

    def resample(batch):
        batch["speech"] = librosa.resample(np.asarray(batch["speech"]), 22_050, 16_000)
        batch["sampling_rate"] = 16_000
        return batch

    # print(data)
    data = data.map(remove_special_characters)
    data = data.map(dt_speech_file_to_array_fn, remove_columns=data.column_names)
    data = data.map(resample, num_proc=4)

    # `processor` is assumed to be defined at module level (e.g. via create_processor()).
    # processor = create_processor()

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
        batch["input_values"] = processor(
            batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    data = data.map(prepare_dataset,
                    remove_columns=data.column_names,
                    batch_size=4,
                    num_proc=4,
                    batched=True)

    return data
def load_datasets(seed=2021, preprocessing_args={}):
    """
    Return train, dev, test datasets
    """
    train_df = load_df(os.path.join(semeval_dir, "train.csv"))
    test_df = load_df(os.path.join(semeval_dir, "test.csv"))
    # Pass the seed so the train/dev split is reproducible.
    train_df, dev_df = train_test_split(train_df, test_size=0.2, random_state=seed)

    print(len(train_df), len(dev_df), len(test_df))

    """
    Tokenize tweets
    """
    en_preprocess = lambda x: preprocess_tweet(x, lang="en", **preprocessing_args)

    train_df["text"] = train_df["text"].apply(en_preprocess)
    dev_df["text"] = dev_df["text"].apply(en_preprocess)
    test_df["text"] = test_df["text"].apply(en_preprocess)

    features = Features({
        'id': Value('int64'),
        'text': Value('string'),
        'label': ClassLabel(num_classes=3, names=["NEG", "NEU", "POS"])
    })

    columns = ["text", "id", "label"]

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
def run_sparse_retrieval(datasets, training_args):
    #### retrieval process ####
    retriever = SparseRetrieval(tokenize_fn=tokenize,
                                data_path="./data",
                                context_path="wikipedia_documents.json"
                                # context_path="all_wikipedia_documents.json"
                                )

    # sparse embedding retrieval
    # retriever.get_sparse_embedding()
    # df = retriever.retrieve(datasets['validation'])

    # bm25 retrieval
    # retriever.get_embedding_BM25()
    # df = retriever.retrieve_BM25(query_or_dataset=datasets['validation'], topk=10)

    # elastic search retrieval
    # retriever.get_elastic_search()
    df = retriever.retrieve_ES(query_or_dataset=datasets['validation'], topk=10)

    # faiss retrieval
    # df = retriever.retrieve_faiss(dataset['validation'])

    if training_args.do_predict:
        # The test data has no answers, so the dataset consists only of id, question, and context.
        f = Features({
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })
    elif training_args.do_eval:
        # The train data has answers, so the dataset consists of id, question, context, and answers.
        f = Features({
            'answers': Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            }, length=-1, id=None),
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
def load_hf_dataset(data, encoder_tokenizer, decoder_tokenizer, args):
    if isinstance(data, str):
        dataset = load_dataset(
            "csv",
            data_files=data,
            delimiter="\t",
            download_mode="force_redownload"
            if args.reprocess_input_data else "reuse_dataset_if_exists",
            cache_dir=args.dataset_cache_dir,
        )
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        lambda x: preprocess_batch_for_hf_dataset(
            x,
            encoder_tokenizer=encoder_tokenizer,
            decoder_tokenizer=decoder_tokenizer,
            args=args,
        ),
        batched=True,
    )

    if args.model_type == "bart":
        column_names = [
            "source_ids",
            "source_mask",
            "target_ids",
        ]
    elif args.model_type == "mbart":
        column_names = [
            "input_ids",
            "attention_mask",
            "decoder_input_ids",
            "labels",
        ]
    else:
        column_names = [
            "input_ids",
            "attention_mask",
            "decoder_input_ids",
        ]

    dataset.set_format(type="pt", columns=column_names)

    if isinstance(data, str):
        # This is not necessarily a train dataset. The datasets library insists on calling it train.
        return dataset["train"]
    else:
        return dataset