def get_bert_clf_demo():
    """Download the pretrained classifier for the demo dataset."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "bert_clf")

    if not os.path.exists(os.path.join(data_dir, "demo")):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["bert-base-uncased-clf-demo"])


def get_corenlp():
    """Download the Stanford CoreNLP package."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")

    if not os.path.exists(os.path.join(data_dir, "stanford-corenlp-4.1.0")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["stanford-corenlp"])


def get_nltk_data():
    """Download nltk data to ``<fibber_root_dir>/nltk_data``."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "tokenizers")
    if not os.path.exists(os.path.join(data_dir, "punkt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk-punkt"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "corpora")
    if not os.path.exists(os.path.join(data_dir, "stopwords")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk_stopwords"])


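# Usage sketch (illustrative, not part of the original module): after
# get_nltk_data() has populated ``<fibber_root_dir>/common/nltk_data``, NLTK
# can be pointed at that directory via its standard search-path list. How
# fibber itself registers this path is an assumption here.
def _example_use_nltk_data():
    import nltk

    get_nltk_data()
    nltk.data.path.append(os.path.join(get_root_dir(), "common", "nltk_data"))
    return nltk.word_tokenize("A short sentence to tokenize.")

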
def get_bert_lm_demo(path="."):
    """Download the pretrained language model for the demo dataset.

    Since this data is algorithm-specific, it is downloaded to ``path``
    instead of ``<fibber_root_dir>``.
    """
    if not os.path.exists(os.path.join(path, "lm_all")):
        download_file(
            abs_path=path,
            **downloadable_resource_urls["bert-base-uncased-lm-demo"])

    if not os.path.exists(os.path.join(path, "wordpiece_emb-demo-0500.pt")):
        download_file(abs_path=path,
                      **downloadable_resource_urls["wpe-demo"])


def get_stopwords():
    """Download the default stopword list.

    Returns:
        ([str]): a list of lowercased stopwords.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    download_file(subdir=data_dir,
                  **downloadable_resource_urls["default-stopwords"])

    with open(os.path.join(data_dir, "stopwords.txt")) as f:
        stopwords = f.readlines()
    stopwords = [x.strip().lower() for x in stopwords]
    return stopwords


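# Usage sketch (illustrative, not part of the original module): the returned
# list contains plain lowercased strings, so converting it to a set makes
# membership checks cheap when filtering tokens.
def _example_filter_stopwords(tokens):
    stopword_set = set(get_stopwords())
    return [tok for tok in tokens if tok.lower() not in stopword_set]

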
def get_universal_sentence_encoder():
    """Download pretrained universal sentence encoder.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "tfhub_pretrained",
                            "universal-sentence-encoder-large_5")

    if not os.path.exists(data_dir):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["universal-sentence-encoder"])
    return data_dir


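# Usage sketch (illustrative, not part of the original module): the returned
# directory holds a TF-Hub SavedModel, so it can presumably be loaded with
# tensorflow_hub. Whether fibber loads it this way internally is an assumption.
def _example_embed_sentences(sentences):
    import tensorflow_hub as hub

    use_dir = get_universal_sentence_encoder()
    encoder = hub.load(use_dir)
    return encoder(sentences)

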
def get_transformers(name):
    """Download pretrained transformer models.

    Args:
        name (str): the name of the pretrained model. Options are
            ``["bert-base-cased", "bert-base-uncased", "gpt2-medium"]``.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "transformers_pretrained")

    if not os.path.exists(os.path.join(data_dir, name)):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls[name])
    return os.path.join(data_dir, name)


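# Usage sketch (illustrative, not part of the original module): assuming the
# downloaded directory contains standard Hugging Face files (config, vocab,
# weights), it can be loaded locally with the transformers library.
def _example_load_transformer(name="bert-base-uncased"):
    from transformers import AutoModel, AutoTokenizer

    model_dir = get_transformers(name)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModel.from_pretrained(model_dir)
    return tokenizer, model

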
def get_demo_dataset():
    """Download the demo dataset.

    Returns:
        (dict, dict): the training set and the test set.
    """
    download_file(subdir="", **downloadable_dataset_urls["mr-demo"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "mr-demo")

    with open(os.path.join(data_dir, "train.json")) as f:
        trainset = json.load(f)
    with open(os.path.join(data_dir, "test.json")) as f:
        testset = json.load(f)

    logger.info("Demo training set has %d records.", len(trainset["data"]))
    logger.info("Demo test set has %d records.", len(testset["data"]))

    return trainset, testset


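# Usage sketch (illustrative, not part of the original module): each dataset
# dict carries a ``label_mapping`` list and a ``data`` list of records with
# ``label`` and ``text0`` fields, so records can be iterated directly.
def _example_print_demo_examples(limit=3):
    trainset, _ = get_demo_dataset()
    for record in trainset["data"][:limit]:
        label_name = trainset["label_mapping"][record["label"]]
        print(label_name, record["text0"])

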
def download_raw_and_preprocess(dataset_name, download_list, preprocess_fn,
                                preprocess_input_output_list):
    """Download and preprocess raw data into fibber's format.

    Args:
        dataset_name (str): the name of the dataset.
        download_list ([str]): a list of strings indicating which files to
            download. Each element in this list should correspond to a key in
            ``downloadable_dataset_urls``.
        preprocess_fn (fn): a function to preprocess the dataset.
        preprocess_input_output_list ([(str, str), ...]): a list of tuples.
            Each tuple indicates a pair of input and output file or path names.
    """
    root_dir = get_root_dir()
    dataset_dir = "datasets/" + dataset_name

    for item in download_list:
        download_file(**downloadable_dataset_urls[item],
                      subdir=os.path.join(dataset_dir, "raw"))

    for input_name, output_name in preprocess_input_output_list:
        preprocess_fn(os.path.join(root_dir, dataset_dir, input_name),
                      os.path.join(root_dir, dataset_dir, output_name))


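# Illustrative sketch only: a hypothetical dataset could be wired up as shown
# below. ``foo-raw`` and ``clean_raw_file`` are made-up names used for the
# example; they are not part of fibber.
def _example_download_and_preprocess_foo(clean_raw_file):
    download_raw_and_preprocess(
        dataset_name="foo",
        download_list=["foo-raw"],
        preprocess_fn=clean_raw_file,
        preprocess_input_output_list=[("raw/foo.txt", "train.json")])

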
def get_glove_emb(download_only=False):
    """Download the default pretrained GloVe embeddings and return a dict.

    We use the 300-dimensional model trained on Wikipedia 2014 + Gigaword 5.
    See https://nlp.stanford.edu/projects/glove/

    Args:
        download_only (bool): set True to only download (returns None).

    Returns:
        (dict): a dict of the GloVe word embedding model with keys:
            "emb_table": a numpy array of size (N, 300).
            "id2tok": a list of strings.
            "tok2id": a dict that maps a word (string) to its id.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")

    if not os.path.exists(os.path.join(data_dir, "glove.6B.300d.txt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["default-glove-embeddings"])

    if download_only:
        return None
    return load_glove_model(os.path.join(data_dir, "glove.6B.300d.txt"), 300)


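# Usage sketch (illustrative, not part of the original module): look up a
# single word vector using the documented "tok2id" and "emb_table" keys.
def _example_glove_vector(word):
    glove = get_glove_emb()
    if word not in glove["tok2id"]:
        return None
    return glove["emb_table"][glove["tok2id"][word]]

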
"ag": preprocess_ag.download_and_preprocess_ag, "imdb": preprocess_imdb.download_and_preprocess_imdb, "mnli": preprocess_mnli.download_and_preprocess_mnli, "mr": preprocess_mr.download_and_preprocess_mr, "snli": preprocess_snli.download_and_preprocess_snli, "yelp": preprocess_yelp.download_and_preprocess_yelp } if __name__ == "__main__": FLAGS = parser.parse_args() if FLAGS.process_raw == "1": for name, processing_func in DATASET_PREPROCESS_FN.items(): logger.info("Start download and process %s.", name) processing_func() else: download_file(subdir="", **downloadable_dataset_urls["processed-datasets"]) if FLAGS.verify == "1": root_dir = get_root_dir() datasets_dir = os.path.join(root_dir, "datasets") dataset_json_list = sorted(glob.glob(datasets_dir + "/*/*.json")) for json_filename in dataset_json_list: logger.info("Verify %s.", json_filename) with open(json_filename) as f: data = json.load(f) verify_dataset(data)
def download_and_preprocess_mr():
    """Preprocess the raw movie review dataset into Fibber's JSON format."""
    root_dir = get_root_dir()
    dataset_dir = "datasets/mr/"

    download_file(subdir=os.path.join(dataset_dir, "raw"),
                  **downloadable_dataset_urls["mr-raw"])

    logger.info("Start processing data.")

    with open(os.path.join(root_dir, dataset_dir,
                           "raw/rt-polaritydata/rt-polarity.neg"),
              encoding="utf-8", errors="ignore") as f:
        neg = f.readlines()
    with open(os.path.join(root_dir, dataset_dir,
                           "raw/rt-polaritydata/rt-polarity.pos"),
              encoding="utf-8", errors="ignore") as f:
        pos = f.readlines()

    train = {
        "label_mapping": ["negative", "positive"],
        "cased": False,
        "paraphrase_field": "text0",
    }
    test = {
        "label_mapping": ["negative", "positive"],
        "cased": False,
        "paraphrase_field": "text0",
    }

    trainlist = []
    testlist = []

    # Hold out every tenth record for the test set; keep the rest for training.
    for idx, item in enumerate(neg):
        if idx % 10 == 0:
            testlist.append({"label": 0, "text0": item.strip()})
        else:
            trainlist.append({"label": 0, "text0": item.strip()})

    for idx, item in enumerate(pos):
        if idx % 10 == 0:
            testlist.append({"label": 1, "text0": item.strip()})
        else:
            trainlist.append({"label": 1, "text0": item.strip()})

    train["data"] = trainlist
    test["data"] = testlist

    with open(os.path.join(root_dir, dataset_dir, "train.json"), "w") as f:
        json.dump(train, f, indent=2)
    with open(os.path.join(root_dir, dataset_dir, "test.json"), "w") as f:
        json.dump(test, f, indent=2)