Example #1
def get_bert_clf_demo():
    """Download the pretrained classifier for demo dataset."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "bert_clf")
    if not os.path.exists(os.path.join(data_dir, "demo")):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["bert-base-uncased-clf-demo"])
Example #2
def get_corenlp():
    """Download stanford corenlp package.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    if not os.path.exists(os.path.join(data_dir, "stanford-corenlp-4.1.0")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["stanford-corenlp"])
Example #3
def get_nltk_data():
    """Download nltk data to ``<fibber_root_dir>/nltk_data``."""
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "tokenizers")
    if not os.path.exists(os.path.join(data_dir, "punkt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk-punkt"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "nltk_data", "corpora")
    if not os.path.exists(os.path.join(data_dir, "stopwords")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["nltk_stopwords"])
Example #4
def get_bert_lm_demo(path="."):
    """Download the pretrained language model for demo dataset.

    Since this data is algorithm-specific, it is downloaded to ``path`` instead of
    ``<fibber_root_dir>``.
    """
    if not os.path.exists(os.path.join(path, "lm_all")):
        download_file(
            abs_path=path,
            **downloadable_resource_urls["bert-base-uncased-lm-demo"])

    if not os.path.exists(os.path.join(path, "wordpiece_emb-demo-0500.pt")):
        download_file(abs_path=path, **downloadable_resource_urls["wpe-demo"])
Example #5
def get_stopwords():
    """Download default stopword words.

    Returns:
        ([str]): a list of lowercased stopword strings.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    download_file(subdir=data_dir,
                  **downloadable_resource_urls["default-stopwords"])

    with open(os.path.join(data_dir, "stopwords.txt")) as f:
        stopwords = f.readlines()
    stopwords = [x.strip().lower() for x in stopwords]
    return stopwords
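
A short, illustrative usage sketch (the token list below is made up):

stopwords = set(get_stopwords())
tokens = ["the", "movie", "was", "a", "delight"]
# Keep only content words; assuming the default list contains common function
# words, this leaves something like ["movie", "delight"].
content_words = [t for t in tokens if t.lower() not in stopwords]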
Example #6
def get_universal_sentence_encoder():
    """Download pretrained universal sentence encoder.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "tfhub_pretrained",
                            "universal-sentence-encoder-large_5")
    if not os.path.exists(data_dir):
        download_file(
            subdir=data_dir,
            **downloadable_resource_urls["universal-sentence-encoder"])

    return data_dir
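
A usage sketch, assuming TensorFlow Hub is installed; loading the returned directory with hub.load is the standard way to use a locally saved module, though whether fibber does it exactly this way is not shown here.

import tensorflow_hub as hub

model_dir = get_universal_sentence_encoder()
encoder = hub.load(model_dir)
# The large_5 module is a callable that maps a batch of sentences to embeddings.
embeddings = encoder(["A sentence to embed.", "Another sentence."])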
Example #7
def get_transformers(name):
    """Download pretrained transformer models.

    Args:
        name (str): the name of the pretrained model. Options are ``["bert-base-cased",
            "bert-base-uncased", "gpt2-medium"]``.

    Returns:
        (str): directory of the downloaded model.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common", "transformers_pretrained")
    if not os.path.exists(os.path.join(data_dir, name)):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls[name])

    return os.path.join(data_dir, name)
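
A usage sketch, assuming the Hugging Face transformers library; the from_pretrained calls are standard, but how fibber itself consumes the returned directory is not shown here.

from transformers import BertForMaskedLM, BertTokenizer

model_dir = get_transformers("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForMaskedLM.from_pretrained(model_dir)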
Example #8
def get_demo_dataset():
    """download demo dataset.

    Returns:
        (dict, dict): trainset and testset.
    """
    download_file(subdir="", **downloadable_dataset_urls["mr-demo"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "mr-demo")

    with open(os.path.join(data_dir, "train.json")) as f:
        trainset = json.load(f)
    with open(os.path.join(data_dir, "test.json")) as f:
        testset = json.load(f)

    logger.info("Demo training set has %d records.", len(trainset["data"]))
    logger.info("Demo test set has %d records.", len(testset["data"]))

    return trainset, testset
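
A sketch of inspecting the returned dictionaries; the field names follow the Fibber JSON format used in Example #12.

trainset, testset = get_demo_dataset()
print(trainset["label_mapping"])       # e.g. ["negative", "positive"]
first = trainset["data"][0]
print(first["label"], first["text0"])  # integer label and the raw sentence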
Example #9
def download_raw_and_preprocess(dataset_name, download_list, preprocess_fn,
                                preprocess_input_output_list):
    """Download and preprocess raw data into fibber's format.

    Args:
        dataset_name (str): the name of the dataset.
        download_list ([str]): a list of strings indicating which files to download. Each
            element in this list should correspond to a key in ``downloadable_dataset_urls``.
        preprocess_fn (fn): a function to preprocess the dataset.
        preprocess_input_output_list ([(str, str), ...]): A list of tuples. Each tuple indicates a
            pair of input and output file or path names.
    """
    root_dir = get_root_dir()
    dataset_dir = "datasets/" + dataset_name

    for item in download_list:
        download_file(**downloadable_dataset_urls[item],
                      subdir=os.path.join(dataset_dir, "raw"))

    for input_name, output_name in preprocess_input_output_list:
        preprocess_fn(os.path.join(root_dir, dataset_dir, input_name),
                      os.path.join(root_dir, dataset_dir, output_name))
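
A hypothetical call sketch; the dataset name, resource key, file names, and preprocess_demo function below are illustrative placeholders, not real entries.

def preprocess_demo(input_path, output_path):
    # Hypothetical converter from one raw file into Fibber's JSON format.
    ...

download_raw_and_preprocess(
    dataset_name="demo",
    download_list=["demo-raw"],  # hypothetical key in downloadable_dataset_urls
    preprocess_fn=preprocess_demo,
    preprocess_input_output_list=[("raw/demo.txt", "train.json")])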
Example #10
def get_glove_emb(download_only=False):
    """Download default pretrained glove embeddings and return a dict.

    We use the 300-dimensional model trained on Wikipedia 2014 + Gigaword 5.
    See https://nlp.stanford.edu/projects/glove/

    Args:
        download_only (bool): set to True to only download the embeddings; in that case, None is returned.

    Returns:
        (dict): a dict of GloVe word embedding model.
            "emb_table": a numpy array of size(N, 300)
            "id2tok": a list of strings.
            "tok2id": a dict that maps word (string) to its id.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "common")
    if not os.path.exists(os.path.join(data_dir, "glove.6B.300d.txt")):
        download_file(subdir=data_dir,
                      **downloadable_resource_urls["default-glove-embeddings"])

    if download_only:
        return None
    return load_glove_model(os.path.join(data_dir, "glove.6B.300d.txt"), 300)
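
A usage sketch based on the documented return structure; the word lookup itself is illustrative.

glove = get_glove_emb()
word = "movie"
if word in glove["tok2id"]:
    vec = glove["emb_table"][glove["tok2id"][word]]
    print(vec.shape)  # (300,), one row of the embedding table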
Example #11
    "ag": preprocess_ag.download_and_preprocess_ag,
    "imdb": preprocess_imdb.download_and_preprocess_imdb,
    "mnli": preprocess_mnli.download_and_preprocess_mnli,
    "mr": preprocess_mr.download_and_preprocess_mr,
    "snli": preprocess_snli.download_and_preprocess_snli,
    "yelp": preprocess_yelp.download_and_preprocess_yelp
}

if __name__ == "__main__":
    FLAGS = parser.parse_args()

    if FLAGS.process_raw == "1":
        for name, processing_func in DATASET_PREPROCESS_FN.items():
            logger.info("Start download and process %s.", name)
            processing_func()

    else:
        download_file(subdir="",
                      **downloadable_dataset_urls["processed-datasets"])

    if FLAGS.verify == "1":
        root_dir = get_root_dir()
        datasets_dir = os.path.join(root_dir, "datasets")
        dataset_json_list = sorted(glob.glob(datasets_dir + "/*/*.json"))
        for json_filename in dataset_json_list:
            logger.info("Verify %s.", json_filename)
            with open(json_filename) as f:
                data = json.load(f)

            verify_dataset(data)
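
The __main__ block assumes a parser exposing --process_raw and --verify flags. A minimal sketch of that setup (the defaults and help strings are assumptions, not copied from the source) could look like:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--process_raw", choices=["0", "1"], default="0",
                    help="Set to 1 to download raw data and preprocess it locally.")
parser.add_argument("--verify", choices=["0", "1"], default="1",
                    help="Set to 1 to verify every dataset JSON after downloading.")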
Example #12
def download_and_preprocess_mr():
    """preprocess raw movie review dataset to Fibber's JSON format."""
    root_dir = get_root_dir()
    dataset_dir = "datasets/mr/"

    download_file(subdir=os.path.join(dataset_dir, "raw"),
                  **downloadable_dataset_urls["mr-raw"])

    logger.info("Start processing data.")

    with open(os.path.join(root_dir, dataset_dir, "raw/rt-polaritydata/rt-polarity.neg"),
              encoding="utf-8", errors="ignore") as f:
        neg = f.readlines()

    with open(os.path.join(root_dir, dataset_dir, "raw/rt-polaritydata/rt-polarity.pos"),
              encoding="utf-8", errors="ignore") as f:
        pos = f.readlines()

    train = {
        "label_mapping": ["negative", "positive"],
        "cased": False,
        "paraphrase_field": "text0",
    }

    test = {
        "label_mapping": ["negative", "positive"],
        "cased": False,
        "paraphrase_field": "text0",
    }

    trainlist = []
    testlist = []

    for idx, item in enumerate(neg):
        if idx % 10 == 0:
            testlist.append({
                "label": 0,
                "text0": item.strip()})
        else:
            trainlist.append({
                "label": 0,
                "text0": item.strip()})

    for idx, item in enumerate(pos):
        if idx % 10 == 0:
            testlist.append({
                "label": 1,
                "text0": item.strip()})
        else:
            trainlist.append({
                "label": 1,
                "text0": item.strip()})

    train["data"] = trainlist
    test["data"] = testlist

    with open(os.path.join(root_dir, dataset_dir, "train.json"), "w") as f:
        json.dump(train, f, indent=2)

    with open(os.path.join(root_dir, dataset_dir, "test.json"), "w") as f:
        json.dump(test, f, indent=2)
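
A quick sanity-check sketch after running the function (illustrative only):

download_and_preprocess_mr()
with open(os.path.join(get_root_dir(), "datasets/mr/test.json")) as f:
    test_check = json.load(f)
# Every 10th review of each polarity file goes to the test set, so it should
# hold roughly 10% of the corpus.
print(len(test_check["data"]))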