def _setup_datasets(url, top_n=-1, local_cache_path=".data"):
    FILE_NAME = "cnndm.tar.gz"
    maybe_download(url, FILE_NAME, local_cache_path)
    dataset_tar = os.path.join(local_cache_path, FILE_NAME)
    extracted_files = extract_archive(dataset_tar)
    for fname in extracted_files:
        if fname.endswith("train.txt.src"):
            train_source_file = fname
        if fname.endswith("train.txt.tgt.tagged"):
            train_target_file = fname
        if fname.endswith("test.txt.src"):
            test_source_file = fname
        if fname.endswith("test.txt.tgt.tagged"):
            test_target_file = fname
    return (
        SummarizationDataset(
            train_source_file,
            train_target_file,
            [_clean, tokenize.sent_tokenize],
            [_clean, _remove_ttags, _target_sentence_tokenization],
            nltk.word_tokenize,
            top_n,
        ),
        SummarizationDataset(
            test_source_file,
            test_target_file,
            [_clean, tokenize.sent_tokenize],
            [_clean, _remove_ttags, _target_sentence_tokenization],
            nltk.word_tokenize,
            top_n,
        ),
    )
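# Usage sketch (assumption, not part of the original module): build small
# train/test datasets from a CNN/DailyMail mirror. CNNDM_URL below is a
# hypothetical placeholder for whatever URL hosts cnndm.tar.gz; top_n=100
# keeps a smoke run fast.
def _example_setup_datasets():
    CNNDM_URL = "https://example.com/cnndm.tar.gz"  # hypothetical mirror URL
    train_dataset, test_dataset = _setup_datasets(
        CNNDM_URL, top_n=100, local_cache_path=".data"
    )
    return train_dataset, test_dataset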
def get_generator(
    local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None
):
    """
    Downloads and extracts the dataset files and then returns a sequential
    batch generator that yields pandas dataframes.

    Args:
        local_cache_path (str, optional): Path to store the data.
            Defaults to ".".
        file_split (str, optional): The subset to load.
            One of: {"train", "dev_matched", "dev_mismatched"}
            Defaults to "train".
        block_size (int, optional): Size of partition in bytes.
        batch_size (int, optional): Batch size.
        num_batches (int, optional): Number of batches to generate.

    Returns:
        Generator[pd.DataFrame, None, None]: Sequential batch generator that
            yields pandas dataframes.
    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)

    loader = DaskJSONLoader(
        os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size
    )

    return loader.get_sequential_batches(batch_size=int(batch_size), num_batches=num_batches)
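# Usage sketch (assumption): stream two sequential batches from the training
# split and report their shapes. block_size and batch_size are in bytes,
# mirroring the DaskJSONLoader partitioning above.
def _example_get_generator():
    batches = get_generator(file_split="train", block_size=1e6, batch_size=1e6, num_batches=2)
    for batch_df in batches:
        print(batch_df.shape)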
def test_maybe_download():
    # TODO: Change this url when repo goes public.
    file_url = "https://raw.githubusercontent.com/Microsoft/Recommenders/master/LICENSE"
    filepath = "license.txt"
    assert not os.path.exists(filepath)
    filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
    assert os.path.exists(filepath)
    os.remove(filepath)
    with pytest.raises(IOError):
        filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
def load_train_test_dfs(local_cache_path="./", test_fraction=0.5, random_seed=None):
    """
    Get the training and testing data frames based on test_fraction.

    Args:
        local_cache_path (str): Path to store the data. If the data file
            doesn't exist in this path, it's downloaded.
        test_fraction (float, optional): Fraction of data to use for testing.
            Since this is a small dataset, the default testing fraction is
            set to 0.5.
        random_seed (float, optional): Random seed used to shuffle the data.

    Returns:
        tuple: (train_pandas_df, test_pandas_df), each data frame contains
            two columns
            "sentence": sentences in strings.
            "labels": list of entity labels of the words in the sentence.
    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    data_file = os.path.join(local_cache_path, file_name)

    with open(data_file, "r", encoding="utf8") as file:
        text = file.read()

    sentence_list, labels_list = preprocess_conll(text)

    if random_seed:
        random.seed(random_seed)
    sentence_and_labels = list(zip(sentence_list, labels_list))
    random.shuffle(sentence_and_labels)
    sentence_list[:], labels_list[:] = zip(*sentence_and_labels)

    sentence_count = len(sentence_list)
    test_sentence_count = round(sentence_count * test_fraction)
    test_sentence_list = sentence_list[:test_sentence_count]
    test_labels_list = labels_list[:test_sentence_count]
    train_sentence_list = sentence_list[test_sentence_count:]
    train_labels_list = labels_list[test_sentence_count:]

    train_df = pd.DataFrame({"sentence": train_sentence_list, "labels": train_labels_list})
    test_df = pd.DataFrame({"sentence": test_sentence_list, "labels": test_labels_list})

    return (train_df, test_df)
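# Usage sketch (assumption): fixing the seed makes the shuffle, and therefore
# the train/test split, reproducible across runs.
def _example_load_train_test_dfs():
    train_df, test_df = load_train_test_dfs(test_fraction=0.3, random_seed=42)
    print(train_df.shape, test_df.shape)
    print(train_df.loc[0, "sentence"], train_df.loc[0, "labels"])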
def download_snli(dest_path):
    """
    Download the SNLI dataset.

    Args:
        dest_path (str): file path where SNLI dataset should be downloaded

    Returns:
        str: file path where SNLI dataset is downloaded
    """
    dirs, file = os.path.split(dest_path)
    # Return the downloaded path so the function matches its docstring.
    return maybe_download(SNLI_URL, file, work_directory=dirs)
def load_pandas_df(local_cache_path=".", num_rows=None):
    """Downloads and extracts the dataset files.

    Args:
        local_cache_path (str, optional): Path to store the data.
            Defaults to ".".
        num_rows (int): Number of rows to load. If None, all data is loaded.

    Returns:
        pd.DataFrame: pandas DataFrame containing the loaded dataset.
    """
    zip_file = URL.split("/")[-1]
    maybe_download(URL, zip_file, local_cache_path)

    zip_file_path = os.path.join(local_cache_path, zip_file)
    csv_file_path = os.path.join(local_cache_path, zip_file.replace(".zip", ""))

    if not os.path.exists(csv_file_path):
        extract_zip(file_path=zip_file_path, dest_path=local_cache_path)

    return pd.read_csv(csv_file_path, nrows=num_rows)
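# Usage sketch (assumption): cache into a throwaway directory and read only the
# first 1000 rows; num_rows maps directly onto pandas.read_csv(nrows=...).
def _example_load_pandas_df():
    from tempfile import TemporaryDirectory

    cache_dir = TemporaryDirectory()
    df = load_pandas_df(local_cache_path=cache_dir.name, num_rows=1000)
    print(df.head())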
def __init__(self, path) -> None:
    maybe_download(
        "https://drive.switch.ch/index.php/s/YoyW9S8yml7wVhN/download?path=%2F&files=data_train.csv",
        "data_train.csv",
        path,
    )
    # Join the path explicitly and avoid shadowing the conventional pandas
    # alias "pd" with a DataFrame.
    df = pandas.read_csv(os.path.join(path, "data_train.csv"))
    train = df.values.tolist()
    source = [item[0] for item in train]
    summary = [item[1] for item in train]
    self.train_source, self.test_source, self.train_summary, self.test_summary = train_test_split(
        source, summary, train_size=0.95, test_size=0.005, random_state=123
    )
    self.reset()
    self.no_bertscore()
    self.add_bertscore()
def load_pandas_df(local_cache_path=".", file_split="train"):
    """Downloads and extracts the dataset files.

    Args:
        local_cache_path (str, optional): Path to store the data.
            Defaults to ".".
        file_split (str, optional): The subset to load.
            One of: {"train", "dev_matched", "dev_mismatched"}
            Defaults to "train".

    Returns:
        pd.DataFrame: pandas DataFrame containing the specified MultiNLI subset.
    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)

    return pd.read_json(os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True)
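# Usage sketch (assumption): load the matched dev set. Column names follow the
# raw MultiNLI jsonl schema, so "gold_label" is assumed to be present.
def _example_load_multinli_dev():
    dev_df = load_pandas_df(local_cache_path=".", file_split="dev_matched")
    print(dev_df["gold_label"].value_counts())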
def download_file_and_extract(local_cache_path: str = ".", file_split: str = "train") -> None:
    """Download and extract the dataset files.

    Args:
        local_cache_path (str, optional): Directory to cache files to.
            Defaults to the current working directory (".").
        file_split (str, optional): The subset to download.
            Defaults to "train".

    Returns:
        None
    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)
def download_msrpc(download_dir):
    """Downloads Windows Installer for Microsoft Paraphrase Corpus.

    Args:
        download_dir (str): File path for the downloaded file

    Returns:
        str: file_path to the downloaded dataset.
    """
    url = (
        "https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B"
        "-3604ED519838/MSRParaphraseCorpus.msi"
    )
    return maybe_download(url, work_directory=download_dir)
def load_pandas_df(local_cache_path=TemporaryDirectory().name):
    """
    Downloads and extracts the dataset files.

    Args:
        local_cache_path (str, optional): The local file path to save the
            raw file. Defaults to TemporaryDirectory().name.

    Returns:
        tuple: (train_df, test_df), pandas DataFrames containing the training
            and testing subsets.
    """
    zipped_file = URL.split("/")[-1]
    maybe_download(URL, zipped_file, local_cache_path)
    zipped_file_path = os.path.join(local_cache_path, zipped_file)

    with tarfile.open(zipped_file_path, "r:gz") as tar:
        tar.extractall(path=local_cache_path)

    train_csv_file_path = os.path.join(local_cache_path, "hindi-train.csv")
    test_csv_file_path = os.path.join(local_cache_path, "hindi-test.csv")

    train_df = pd.read_csv(train_csv_file_path, sep="\t", encoding="utf-8", header=None)
    test_df = pd.read_csv(test_csv_file_path, sep="\t", encoding="utf-8", header=None)

    train_df = train_df.fillna("")
    test_df = test_df.fillna("")

    return (train_df, test_df)
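# Usage sketch (assumption): the CSVs ship without a header row, so the returned
# frames use positional integer column names (0, 1, ...).
def _example_load_hindi_df():
    train_df, test_df = load_pandas_df()
    print(train_df.shape, test_df.shape)
    print(train_df.head())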
def download_tsv_files_and_extract(local_cache_path: str = ".") -> None:
    """Download and extract the dataset files in tsv format from NYU Jiant,
    which hosts both the original and the tsv formatted data.

    Args:
        local_cache_path (str, optional): Directory to cache files to.
            Defaults to the current working directory (".").

    Returns:
        None
    """
    try:
        folder_name = "MNLI"
        file_name = f"{folder_name}.zip"
        maybe_download(URL_JIANT_MNLI_TSV, file_name, local_cache_path)

        if not os.path.exists(os.path.join(local_cache_path, folder_name)):
            extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)

        # Clean up zip download
        if os.path.exists(os.path.join(local_cache_path, file_name)):
            os.remove(os.path.join(local_cache_path, file_name))
    except IOError:
        raise

    print("Downloaded file to: ", os.path.join(local_cache_path, folder_name))
def _download_glove_vectors(download_dir, file_name="glove.840B.300d.zip"):
    """
    Downloads GloVe word vectors trained on the Common Crawl corpus. You can
    directly download the vectors from here:
    http://nlp.stanford.edu/data/glove.840B.300d.zip

    Args:
        download_dir (str): File path to download the file
        file_name (str): File name given by default but can be changed by the user.

    Returns:
        str: file_path to the downloaded vectors.
    """
    return maybe_download(GLOVE_URL, filename=file_name, work_directory=download_dir)
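# Usage sketch (assumption): GloVe ships as a zip around one large text file
# with "token v1 ... v300" per line; this unpacks it and checks the first
# vector's dimensionality using only the standard library.
def _example_read_glove():
    import zipfile

    vec_zip = _download_glove_vectors(".data")
    with zipfile.ZipFile(vec_zip) as zf:
        zf.extractall(".data")
    with open(os.path.join(".data", "glove.840B.300d.txt"), encoding="utf-8") as f:
        token, *values = f.readline().rstrip().split(" ")
        print(token, len(values))  # expect 300 dimensions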
def _download_sts(dirpath):
    """Download and extract data from
    http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz

    Args:
        dirpath (str): Path to data directory.

    Returns:
        str: Path to extracted STS Benchmark data.
    """
    filepath = maybe_download(STS_URL, work_directory=dirpath)
    extracted_path = _extract_sts(filepath, target_dirpath=dirpath, tmode="r:gz")
    print("Data downloaded to {}".format(extracted_path))
    return extracted_path
def _download_word2vec_vectors(
    download_dir, file_name="GoogleNews-vectors-negative300.bin.gz"
):
    """
    Downloads pretrained word vectors trained on GoogleNews corpus. You can
    directly download the vectors from here:
    https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

    Args:
        download_dir (str): File path to download the file
        file_name (str): File name given by default but can be changed by the user.

    Returns:
        str: file_path to the downloaded vectors.
    """
    return maybe_download(WORD2VEC_URL, filename=file_name, work_directory=download_dir)
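# Usage sketch (assumption): gensim is not a dependency of this module, but its
# KeyedVectors.load_word2vec_format reads the gzipped binary file directly;
# limit= caps the vocabulary so the example stays memory-friendly.
def _example_read_word2vec():
    from gensim.models import KeyedVectors

    bin_path = _download_word2vec_vectors(".data")
    kv = KeyedVectors.load_word2vec_format(bin_path, binary=True, limit=50000)
    print(kv.most_similar("king", topn=3))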
def _download_fasttext_vectors(download_dir, file_name="wiki.simple.zip"):
    """
    Downloads pre-trained word vectors for English, trained on Wikipedia using
    fastText. You can directly download the vectors from here:
    https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip

    For the full version of pre-trained word vectors, change the url for
    FASTTEXT_EN_URL to
    https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
    in __init__.py

    Args:
        download_dir (str): File path to download the file
        file_name (str): File name given by default but can be changed by the user.

    Returns:
        str: file_path to the downloaded vectors.
    """
    return maybe_download(FASTTEXT_EN_URL, filename=file_name, work_directory=download_dir)
def load_pandas_df(local_cache_path=".", squad_version="v1.1", file_split="train"):
    """Loads the SQuAD dataset in pandas data frame.

    Args:
        local_cache_path (str, optional): Path to load the data from. If the
            file doesn't exist, download it first. Defaults to the current
            directory.
        squad_version (str, optional): Version of the SQuAD dataset, accepted
            values are: "v1.1" and "v2.0". Defaults to "v1.1".
        file_split (str, optional): Dataset split to load, accepted values
            are: "train" and "dev". Defaults to "train".

    Returns:
        pd.DataFrame: pandas DataFrame with columns "doc_text",
            "question_text", "answer_start", "answer_text", "qa_id", and
            "is_impossible".
    """
    if file_split not in ["train", "dev"]:
        raise ValueError("file_split should be either train or dev")

    URL = URL_DICT[squad_version][file_split]
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)
    file_path = os.path.join(local_cache_path, file_name)

    with open(file_path, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]

    paragraph_text_list = []
    question_text_list = []
    answer_start_list = []
    answer_text_list = []
    qa_id_list = []
    is_impossible_list = []

    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                answer_offset = None
                is_impossible = False

                if squad_version == "v2.0":
                    is_impossible = qa["is_impossible"]

                if file_split == "train":
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                    else:
                        orig_answer_text = ""
                else:
                    if not is_impossible:
                        orig_answer_text = []
                        answer_offset = []
                        for answer in qa["answers"]:
                            orig_answer_text.append(answer["text"])
                            answer_offset.append(answer["answer_start"])
                    else:
                        orig_answer_text = ""

                paragraph_text_list.append(paragraph_text)
                question_text_list.append(question_text)
                answer_start_list.append(answer_offset)
                answer_text_list.append(orig_answer_text)
                qa_id_list.append(qas_id)
                is_impossible_list.append(is_impossible)

    output_df = pd.DataFrame(
        {
            "doc_text": paragraph_text_list,
            "question_text": question_text_list,
            "answer_start": answer_start_list,
            "answer_text": answer_text_list,
            "qa_id": qa_id_list,
            "is_impossible": is_impossible_list,
        }
    )

    return output_df
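# Usage sketch: for the dev split, answer_start/answer_text hold lists (one
# entry per annotated answer), while is_impossible is only meaningful for
# v2.0, where it marks unanswerable questions.
def _example_load_squad():
    squad_df = load_pandas_df(squad_version="v2.0", file_split="dev")
    print(squad_df["is_impossible"].mean())
    print(squad_df.iloc[0][["question_text", "answer_text"]])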
def SwissSummarizationDataset(top_n=-1, validation=False):
    """Load the Swiss text summarization dataset (data_train.csv hosted on
    SWITCHdrive) and split it into train/test, and optionally validation,
    SummarizationDataset objects."""
    URLS = [
        "https://drive.switch.ch/index.php/s/YoyW9S8yml7wVhN/download?path=%2F&files=data_train.csv",
        "https://drive.switch.ch/index.php/s/YoyW9S8yml7wVhN/download?path=%2F&files=data_test.csv",
    ]
    LOCAL_CACHE_PATH = ".data"
    FILE_NAME = "data_train.csv"

    maybe_download(URLS[0], FILE_NAME, LOCAL_CACHE_PATH)
    dataset_path = os.path.join(LOCAL_CACHE_PATH, FILE_NAME)

    train = pandas.read_csv(dataset_path).values.tolist()
    if top_n != -1:
        train = train[0:top_n]
    source = [item[0] for item in train]
    summary = [item[1] for item in train]

    train_source, test_source, train_summary, test_summary = train_test_split(
        source, summary, train_size=0.95, test_size=0.05, random_state=123
    )

    if validation:
        train_source, validation_source, train_summary, validation_summary = train_test_split(
            train_source, train_summary, train_size=0.9, test_size=0.1, random_state=123
        )
        splits = [
            (train_source, train_summary),
            (validation_source, validation_summary),
            (test_source, test_summary),
        ]
    else:
        splits = [(train_source, train_summary), (test_source, test_summary)]

    def _to_dataset(split_source, split_summary):
        return SummarizationDataset(
            source_file=None,
            source=split_source,
            target=split_summary,
            source_preprocessing=[tokenize.sent_tokenize],
            target_preprocessing=[tokenize.sent_tokenize],
            top_n=top_n,
        )

    return tuple(_to_dataset(s, t) for s, t in splits)
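# Usage sketch: with validation=True the function returns three datasets,
# otherwise two, so the unpacking pattern depends on the flag.
def _example_swiss_dataset():
    train_ds, validation_ds, test_ds = SwissSummarizationDataset(top_n=1000, validation=True)
    train_only_ds, test_only_ds = SwissSummarizationDataset(top_n=1000, validation=False)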
def load_pandas_df(local_cache_path=".", file_split="dev", language="zh"):
    """Downloads and extracts the dataset files.

    Utilities information can be found `on this link
    <https://www.nyu.edu/projects/bowman/xnli/>`_.

    Args:
        local_cache_path (str, optional): Path to store the data.
            Defaults to ".".
        file_split (str, optional): The subset to load.
            One of: {"train", "dev", "test"}
            Defaults to "dev".
        language (str, optional): language subset to read.
            One of: {"en", "fr", "es", "de", "el", "bg", "ru", "tr", "ar",
            "vi", "th", "zh", "hi", "sw", "ur"}
            Defaults to "zh" (Chinese).

    Returns:
        pd.DataFrame: pandas DataFrame containing the specified XNLI subset.
    """
    if file_split in ("dev", "test"):
        url = URL_XNLI
        sentence_1_index = 6
        sentence_2_index = 7
        label_index = 1

        zip_file_name = url.split("/")[-1]
        folder_name = ".".join(zip_file_name.split(".")[:-1])
        file_name = folder_name + "/" + ".".join(["xnli", file_split, "tsv"])
    elif file_split == "train":
        url = URL_XNLI_MT
        sentence_1_index = 0
        sentence_2_index = 1
        label_index = 2

        zip_file_name = url.split("/")[-1]
        folder_name = ".".join(zip_file_name.split(".")[:-1])
        file_name = folder_name + "/multinli/" + ".".join(
            ["multinli", file_split, language, "tsv"]
        )
    else:
        # Guard against an unknown split, which would otherwise raise an
        # UnboundLocalError further down.
        raise ValueError('file_split should be one of: {"train", "dev", "test"}')

    maybe_download(url, zip_file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, folder_name)):
        extract_zip(os.path.join(local_cache_path, zip_file_name), local_cache_path)

    with open(os.path.join(local_cache_path, file_name), "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    line_list = [line.split("\t") for line in lines]
    # Remove the column name row
    line_list.pop(0)
    if file_split != "train":
        line_list = [line for line in line_list if line[0] == language]

    valid_lines = [
        bool(line[sentence_1_index] and line[sentence_2_index]) for line in line_list
    ]
    total_line_count = len(line_list)
    line_list = [line for line, valid in zip(line_list, valid_lines) if valid]
    valid_line_count = len(line_list)

    if valid_line_count != total_line_count:
        print("{} invalid lines removed.".format(total_line_count - valid_line_count))

    label_list = [convert_to_unicode(line[label_index]) for line in line_list]
    old_contradict_label = convert_to_unicode("contradictory")
    new_contradict_label = convert_to_unicode("contradiction")
    label_list = [
        new_contradict_label if label == old_contradict_label else label
        for label in label_list
    ]
    text_list = [
        (
            convert_to_unicode(line[sentence_1_index]),
            convert_to_unicode(line[sentence_2_index]),
        )
        for line in line_list
    ]

    df = pd.DataFrame({"text": text_list, "label": label_list})

    return df
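# Usage sketch: load the French dev subset; "text" holds (premise, hypothesis)
# tuples and "label" one of {"entailment", "neutral", "contradiction"} after
# the relabeling above.
def _example_load_xnli():
    xnli_df = load_pandas_df(file_split="dev", language="fr")
    print(xnli_df["label"].value_counts())
    print(xnli_df.loc[0, "text"])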