# --- SQuAD dataset preparation: imports, cache paths, checksums, citations ---
# NOTE(review): this chunk was collapsed onto a single physical line; the code
# below restores conventional line structure without changing any code token.
import os
import argparse
import shutil

from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.base import get_data_home_dir

# Directory containing this script (resolved through symlinks).
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
# Local cache directory where downloaded SQuAD files are stored.
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'squad')
# Manifest of expected download checksums, kept one directory up in
# `url_checksums/squad.txt`; loaded into a lookup table for verification.
_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'squad.txt')
_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)

# BibTeX citations for SQuAD 1.1 (Rajpurkar et al., 2016) and
# SQuAD 2.0 (Rajpurkar et al., 2018).  The literal is kept byte-identical.
_CITATIONS = """ @inproceedings{rajpurkar2016squad, title={Squad: 100,000+ questions for machine comprehension of text}, author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy}, booktitle={EMNLP}, year={2016} } @inproceedings{rajpurkar2018know, title={Know What You Don't Know: Unanswerable Questions for SQuAD}, author={Rajpurkar, Pranav and Jia, Robin and Liang, Percy}, booktitle={ACL}, year={2018} } """

# NOTE(review): `_URLS` is truncated in this view (the mapping continues in a
# part of the file not shown here); reproduced verbatim, do not close it here.
_URLS = {
    '1.1': {
# --- Text-classification dataset preparation: imports, checksums, task URLs ---
# NOTE(review): this chunk was collapsed onto a single physical line; the code
# below restores conventional line structure without changing any code token.
import os
import argparse
import pandas as pd
import shutil
import tarfile

from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.base import get_data_home_dir, get_repo_url

# Directory containing this script (resolved through symlinks).
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
# Expected download checksums from `url_checksums/text_classification.txt`.
_URL_FILE_STATS = load_checksum_stats(
    os.path.join(_CURR_DIR, '..', 'url_checksums', 'text_classification.txt'))

# Map from short task name to the tarball URL on the project's data repo.
# NOTE(review): this dict is truncated in this view (more entries and the
# closing brace are outside the shown chunk); reproduced verbatim.
TASK2PATH = {
    "ag": get_repo_url() + "datasets/text_classification/ag_news_csv.tar.gz",
    "imdb": get_repo_url() + "datasets/text_classification/imdb.tar.gz",
    "dbpedia": get_repo_url() + "datasets/text_classification/dbpedia_csv.tar.gz",
    "yelp2": get_repo_url() + "datasets/text_classification/yelp_review_polarity_csv.tar.gz",
    "yelp5": get_repo_url() + "datasets/text_classification/yelp_review_full_csv.tar.gz",
    "amazon2": get_repo_url() + "datasets/text_classification/amazon_review_polarity_csv.tar.gz",
    "amazon5": get_repo_url() + "datasets/text_classification/amazon_review_full_csv.tar.gz",
pages={3261--3275}, year={2019} } """
# ^ NOTE(review): the text above is the tail of a triple-quoted citation
# string whose opening quotes are outside this view; it must stay byte-exact.

# Task identifiers accepted by this script for the GLUE benchmark.
GLUE_TASKS = [
    "cola", "sst", "mrpc", "qqp", "sts", "mnli", "snli", "qnli", "rte",
    "wnli", "diagnostic"
]
# Task identifiers accepted for the SuperGLUE benchmark.
SUPERGLUE_TASKS = [
    "cb", "copa", "multirc", "rte", "wic", "wsc", "boolq", "record",
    'broadcoverage-diagnostic', 'winogender-diagnostic'
]
# Directory containing this script (resolved through symlinks).
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
# Merge the GLUE and SuperGLUE checksum manifests into one lookup table.
_URL_FILE_STATS = load_checksum_stats(
    os.path.join(_CURR_DIR, '..', 'url_checksums', 'glue.txt'))
_URL_FILE_STATS.update(
    load_checksum_stats(
        os.path.join(_CURR_DIR, '..', 'url_checksums', 'superglue.txt')))


def read_tsv_glue(tsv_file, num_skip=1, keep_column_names=False):
    """Read a GLUE-style TSV file.

    NOTE(review): this function is truncated in this view — the remainder of
    the loop body is outside the shown chunk, so the description below covers
    only what is visible.

    Parameters
    ----------
    tsv_file
        Path of the TSV file to read.
    num_skip
        Number of leading lines to skip (presumably a header; when
        `keep_column_names` is True it must be exactly 1 so that the single
        skipped line can supply the column names).
    keep_column_names
        Whether to retain the header line's column names.
    """
    out = []
    # `nrows` tracks the expected field count per row — TODO confirm against
    # the (unseen) remainder of the loop.
    nrows = None
    if keep_column_names:
        # Only a single header line can provide column names.
        assert num_skip == 1
        column_names = None
    with open(tsv_file, 'r') as f:
        for i, line in enumerate(f):
            line = line.strip()
            # Skip (or, when keep_column_names, capture) the leading lines.
            if i < num_skip:
pages={3530--3534}, year={2016} } @inproceedings{barrault2019findings, title={Findings of the 2019 conference on machine translation (wmt19)}, author={Barrault, Lo{\"\i}c and Bojar, Ond{\v{r}}ej and Costa-juss{\`a}, Marta R and Federmann, Christian and Fishel, Mark and Graham, Yvette and Haddow, Barry and Huck, Matthias and Koehn, Philipp and Malmasi, Shervin and others}, booktitle={Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)}, pages={1--61}, year={2019} } """
# ^ NOTE(review): the text above is the tail of a triple-quoted citation
# string whose opening quotes are outside this view; it must stay byte-exact.

# Directory containing this script (resolved through symlinks).
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
# Local cache directory where downloaded WMT files are stored.
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'wmt')
# Expected download checksums from `url_checksums/wmt.txt`.
_URL_FILE_STATS = load_checksum_stats(os.path.join(_CURR_DIR, '..', 'url_checksums', 'wmt.txt'))

# Here, we will make sure that the languages follow the standard ISO 639-1 language tag.
# Also, for more information related to the language tag, you may refer to
# https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# Parallel-corpus download table keyed by corpus name / version / language
# pair; each leaf maps 'url' to the archive and each ISO 639-1 code to the
# member file for that language.
# NOTE(review): this dict is truncated in this view (more entries and the
# closing braces are outside the shown chunk); reproduced verbatim.
_PARA_URLS = {
    'europarl': {
        'v7': {
            'cs-en': {
                'url': 'http://www.statmt.org/europarl/v7/cs-en.tgz',
                'cs': 'europarl-v7.cs-en.cs',
                'en': 'europarl-v7.cs-en.en',
            },
            'de-en': {
                'url': 'http://www.statmt.org/europarl/v7/de-en.tgz',