def load_winer(split="train", shuffle=True, inc_outside=True, merge_entities=True):
    """Load the WiNER dataset.

    Args:
        split(str): Which split of the data to collect, one of
            ["train", "test"].
        shuffle(bool): Should the data be shuffled with random.shuffle?
        inc_outside(bool): Should outside tokens be included?
        merge_entities(bool): Should adjacent entity tokens be merged?

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels.

    Raises:
        ValueError: If split is not a recognised split name.
    """
    path = check_cache_and_download("winer")

    # Map each supported split to its file; unknown splits fail fast.
    split_files = {"train": "train.txt", "test": "test.txt"}
    if split not in split_files:
        raise ValueError(
            f"Split argument {split} is not one of train, test or evaluate")

    data_path = os.path.join(path, split_files[split])
    X, Y = _load_data_spacy(data_path,
                            inc_outside=inc_outside,
                            merge_entities=merge_entities)

    if shuffle:
        paired = list(zip(X, Y))
        random.shuffle(paired)
        X, Y = zip(*paired)

    return X, Y
def load_conll(split="train", shuffle=True, inc_outside=True, dataset: str = "conll"):
    """Load the conll dataset

    Args:
        split(str): Which split of the data to collect, one of
            ["train", "test", "evaluate"].
        shuffle(bool): Should the data be shuffled with random.shuffle?
        inc_outside(bool): Should outside characters be included?
        dataset(str): Which dataset to load. This defaults to "conll" and
            should only be altered for test purposes in which case it
            should be set to "test_conll".

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels.

    Raises:
        KeyError: If split is not a recognised split name.
    """
    path = check_cache_and_download(dataset)

    # Renamed from `map` so the builtin is not shadowed.
    split_files = {"train": "eng.train",
                   "test": "eng.testa",
                   "evaluate": "eng.testb"}
    try:
        # Keep only the split lookup inside the try: a KeyError raised by
        # _load_data_spacy must not be misreported as a bad split argument.
        filename = split_files[split]
    except KeyError:
        raise KeyError(
            f"Split argument {split} is not one of train, test or evaluate")

    data_path = os.path.join(path, filename)
    X, Y = _load_data_spacy(data_path, inc_outside=inc_outside)

    if shuffle:
        data = list(zip(X, Y))
        random.shuffle(data)
        X, Y = zip(*data)

    return X, Y
def load_hoc(split="train", shuffle=True):
    """Load the HoC dataset.

    Args:
        split(str): Which split of the data to collect, one of
            ["train", "test"].
        shuffle(bool): Should the data be shuffled with random.shuffle?

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels.

    Raises:
        ValueError: If split is not a recognised split name.
    """
    path = check_cache_and_download("hoc")

    # Guard clause replaces the if/elif/else chain of the original.
    filenames = {"train": "train.tsv", "test": "test.tsv"}
    if split not in filenames:
        raise ValueError(f"Split argument {split} is not one of train or test")

    X, Y = load_split(os.path.join(path, filenames[split]))

    if shuffle:
        pairs = list(zip(X, Y))
        random.shuffle(pairs)
        X, Y = zip(*pairs)

    return X, Y
def load_conll(split="train", shuffle=True, inc_outside=True):
    """Load the CoNLL dataset.

    Args:
        split(str): Which split of the data to collect, one of
            ["train", "test", "evaluate"].
        shuffle(bool): Should the data be shuffled with random.shuffle?
        inc_outside(bool): Should outside tokens be included?

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels.

    Raises:
        ValueError: If split is not a recognised split name.
    """
    path = check_cache_and_download("conll")

    # Table lookup instead of the original's three-way if/elif chain.
    split_to_file = {
        "train": "eng.train",
        "test": "eng.testa",
        "evaluate": "eng.testb",
    }
    if split not in split_to_file:
        raise ValueError(
            f"Split argument {split} is not one of train, test or evaluate"
        )

    X, Y = _load_data_spacy(os.path.join(path, split_to_file[split]),
                            inc_outside=inc_outside)

    if shuffle:
        shuffled = list(zip(X, Y))
        random.shuffle(shuffled)
        X, Y = zip(*shuffled)

    return X, Y
merge_entities=merge_entities) else: raise ValueError( f"Split argument {split} is not one of train, test or evaluate") if shuffle: data = list(zip(X, Y)) random.shuffle(data) X, Y = zip(*data) return X, Y if __name__ == "__main__": path = check_cache_and_download("winer") train_processed_path = os.path.join(path, "train.txt") test_processed_path = os.path.join(path, "test.txt") if not os.path.exists(train_processed_path): # Since this has been done once it shouldnt need to be done again, # including here for completeness or in th case we want to increase # the sample size logger.info("No {} training data file found, generating ...".format( train_processed_path)) NE_path = os.path.join(path, "CoarseNE.tar.bz2") docs_path = os.path.join(path, "Documents.tar.bz2") vocab_path = os.path.join(path, "document.vocab")