def imdb_dataset(directory='data/',
                 train=False,
                 test=False,
                 train_directory='train',
                 test_directory='test',
                 extracted_name='aclImdb',
                 check_file='aclImdb/README',
                 url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
                 sentiments=['pos', 'neg']):
    """
    Load the IMDB dataset (Large Movie Review Dataset v1.0).

    This is a dataset for binary sentiment classification containing substantially more data than
    previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for
    training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text
    and already processed bag of words formats are provided.

    **Reference:** http://ai.stanford.edu/~amaas/data/sentiment/

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_directory (str, optional): The directory of the training split.
        test_directory (str, optional): The directory of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `tar.gz` file.
        sentiments (list of str, optional): Sentiments to load from the dataset.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset and
        test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import imdb_dataset
        >>> train = imdb_dataset(train=True)
        >>> train[0:2]
        [{
          'text': 'For a movie that gets no respect there sure are a lot of memorable quotes...',
          'sentiment': 'pos'
        }, {
          'text': 'Bizarre horror movie filled with famous faces but stolen by Cristina Raines...',
          'sentiment': 'pos'
        }]
    """
    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    ret = []
    splits = [
        dir_ for (requested, dir_) in [(train, train_directory), (test, test_directory)]
        if requested
    ]
    for split_directory in splits:
        full_path = os.path.join(directory, extracted_name, split_directory)
        examples = []
        for sentiment in sentiments:
            for filename in glob.iglob(os.path.join(full_path, sentiment, '*.txt')):
                with open(filename, 'r', encoding="utf-8") as f:
                    text = f.readline()
                examples.append({
                    'text': text,
                    'sentiment': sentiment,
                })
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
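# Usage sketch for `imdb_dataset` (illustrative, not part of the library): load both splits and
# tally the sentiment labels. It assumes the download succeeds and that the returned `Dataset`
# supports `len()` and integer indexing, as the docstring examples above suggest.
def _example_imdb_label_counts():
    from collections import Counter

    from torchnlp.datasets import imdb_dataset  # import path taken from the docstring example

    train, test = imdb_dataset(train=True, test=True)
    counts = Counter(train[i]['sentiment'] for i in range(len(train)))
    # Expected: 25000 25000 Counter({'pos': 12500, 'neg': 12500})
    print(len(train), len(test), counts)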
def cache(self, name, cache, url=None):
    if os.path.isfile(name):
        path = name
        path_pt = os.path.join(cache, os.path.basename(name)) + '.pt'
    else:
        path = os.path.join(cache, name)
        path_pt = path + '.pt'

    if not os.path.isfile(path_pt) or self.is_include is not None:
        if url:
            download_compressed_directory(url, cache, name)

        if not os.path.isfile(path):
            raise RuntimeError('no vectors found at {}'.format(path))

        # str call is necessary for Python 2/3 compatibility, since
        # argument must be Python 2 str (Python 3 bytes) or
        # Python 3 str (Python 2 unicode)
        itos, vectors, dim = [], array.array(str('d')), None

        # Try to read the whole file with utf-8 encoding.
        binary_lines = False
        try:
            with io.open(path, encoding="utf8") as f:
                lines = [line for line in f]
        # If there are malformed lines, read in binary mode
        # and manually decode each word from utf-8
        except UnicodeDecodeError:
            logger.warning("Could not read {} as UTF8 file, "
                           "reading file as bytes and skipping "
                           "words with malformed UTF8.".format(path))
            with open(path, 'rb') as f:
                lines = [line for line in f]
            binary_lines = True

        logger.info("Loading vectors from {}".format(path))
        for line in tqdm(lines, total=len(lines)):
            # Explicitly splitting on " " is important, so we don't
            # get rid of Unicode non-breaking spaces in the vectors.
            entries = line.rstrip().split(b" " if binary_lines else " ")
            word, entries = entries[0], entries[1:]
            if dim is None and len(entries) > 1:
                dim = len(entries)
            elif len(entries) == 1:
                logger.warning("Skipping token {} with 1-dimensional "
                               "vector {}; likely a header".format(word, entries))
                continue
            elif dim != len(entries):
                raise RuntimeError(
                    "Vector for token {} has {} dimensions, but previously "
                    "read vectors have {} dimensions. All vectors must have "
                    "the same number of dimensions.".format(word, len(entries), dim))

            if binary_lines:
                try:
                    if isinstance(word, six.binary_type):
                        word = word.decode('utf-8')
                except UnicodeDecodeError:
                    logger.info("Skipping non-UTF8 token {}".format(repr(word)))
                    continue

            if self.is_include is not None and not self.is_include(word):
                continue

            vectors.extend(float(x) for x in entries)
            itos.append(word)

        self.itos = itos
        self.stoi = {word: i for i, word in enumerate(itos)}
        self.vectors = torch.Tensor(vectors).view(-1, dim)
        self.dim = dim
        logger.info('Saving vectors to {}'.format(path_pt))
        torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt)
    else:
        logger.info('Loading vectors from {}'.format(path_pt))
        self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)
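# Standalone sketch of the parsing step in `cache` above (illustrative; toy data, not a real
# vector file): each line is a token followed by its vector components, and the parsed values
# are packed into one flat buffer and reshaped to (num_tokens, dim).
def _example_parse_vector_lines():
    import array

    import torch

    lines = ['the 0.1 0.2 0.3', 'cat 0.4 0.5 0.6']  # hypothetical 3-dimensional vectors
    itos, flat, dim = [], array.array('d'), None
    for line in lines:
        entries = line.rstrip().split(' ')
        word, values = entries[0], entries[1:]
        dim = dim or len(values)
        flat.extend(float(x) for x in values)
        itos.append(word)
    stoi = {word: i for i, word in enumerate(itos)}
    vectors = torch.Tensor(flat).view(-1, dim)
    print(stoi, vectors.shape)  # {'the': 0, 'cat': 1} torch.Size([2, 3])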
def wmt_dataset(directory='data/wmt16_en_de',
                train=False,
                dev=False,
                test=False,
                train_filename='train.tok.clean.bpe.32000',
                dev_filename='newstest2013.tok.bpe.32000',
                test_filename='newstest2014.tok.bpe.32000',
                check_file='train.tok.clean.bpe.32000.en',
                url='https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8'):
    """
    The Workshop on Machine Translation (WMT) 2014 English-German dataset.

    Initially this dataset was preprocessed by Google Brain. Though this download contains test
    sets from 2015 and 2016, the train set differs slightly from WMT 2015 and 2016 and
    significantly from WMT 2017.

    The provided data is mainly taken from version 7 of the Europarl corpus, which is freely
    available. Note that this is the same data as last year, since Europarl is no longer
    translated across all 23 official European languages. Additional training data is taken from
    the new News Commentary corpus. There are about 50 million words of training data per language
    from the Europarl corpus and 3 million words from the News Commentary corpus.

    A new data resource from 2013 is the Common Crawl corpus which was collected from web sources.
    Each parallel corpus comes with an annotation file that gives the source of each sentence
    pair.

    References:
        * https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/translate_ende.py # noqa: E501
        * http://www.statmt.org/wmt14/translation-task.html

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the dev split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the dev split.
        test_filename (str, optional): The filename of the test split.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `tar.gz` file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset, dev
        dataset and test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import wmt_dataset
        >>> train = wmt_dataset(train=True)
        >>> train[:2]
        [{
          'en': 'Res@@ um@@ ption of the session',
          'de': 'Wiederaufnahme der Sitzungsperiode'
        }, {
          'en': 'I declare resumed the session of the European Parliament ad@@ jour@@ ned on...',
          'de': 'Ich erklär@@ e die am Freitag , dem 17. Dezember unterbro@@ ch@@ ene...'
        }]
    """
    download_compressed_directory(
        file_url=url, directory=directory, check_file=check_file, filename='wmt16_en_de.tar.gz')

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        examples = []

        en_path = os.path.join(directory, filename + '.en')
        de_path = os.path.join(directory, filename + '.de')
        en_file = [l.strip() for l in open(en_path, 'r', encoding='utf-8')]
        de_file = [l.strip() for l in open(de_path, 'r', encoding='utf-8')]
        assert len(en_file) == len(de_file)
        for i in range(len(en_file)):
            if en_file[i] != '' and de_file[i] != '':
                examples.append({'en': en_file[i], 'de': de_file[i]})

        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
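# Sketch of the line-pairing step in `wmt_dataset` above (illustrative, using in-memory lines
# instead of the real `.en` / `.de` files): parallel files are aligned by line number and lines
# that are empty on either side are dropped.
def _example_pair_parallel_lines():
    en_lines = ['Res@@ um@@ ption of the session', '', 'Thank you .']    # toy data
    de_lines = ['Wiederaufnahme der Sitzungsperiode', '', 'Danke .']     # toy data
    assert len(en_lines) == len(de_lines)
    examples = [{'en': en, 'de': de}
                for en, de in zip(en_lines, de_lines)
                if en != '' and de != '']
    print(examples)  # two aligned sentence pairs; the empty line is skipped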
def smt_dataset(directory='data/',
                train=False,
                dev=False,
                test=False,
                train_filename='train.txt',
                dev_filename='dev.txt',
                test_filename='test.txt',
                extracted_name='trees',
                check_file='trees/train.txt',
                url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip',
                fine_grained=False,
                subtrees=False):
    """
    Load the Stanford Sentiment Treebank dataset.

    Semantic word spaces have been very useful but cannot express the meaning of longer phrases in
    a principled way. Further progress towards understanding compositionality in tasks such as
    sentiment detection requires richer supervised training and evaluation resources and more
    powerful models of composition. To remedy this, we introduce a Sentiment Treebank. It includes
    fine grained sentiment labels for 215,154 phrases in the parse trees of 11,855 sentences and
    presents new challenges for sentiment compositionality.

    **Reference**: https://nlp.stanford.edu/sentiment/index.html

    **Citation:**
    Richard Socher, Alex Perelygin, Jean Y. Wu, Jason Chuang, Christopher D. Manning, Andrew Y. Ng
    and Christopher Potts. Recursive Deep Models for Semantic Compositionality Over a Sentiment
    Treebank

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `zip` file.
        subtrees (bool, optional): Whether to include sentiment-tagged subphrases in addition to
            complete examples.
        fine_grained (bool, optional): Whether to use 5-class instead of 3-class labeling.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset, dev
        dataset and test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import smt_dataset
        >>> train = smt_dataset(train=True)
        >>> train[5]
        {
          'text': "Whether or not you 're enlightened by any of Derrida 's lectures on ...",
          'label': 'positive'
        }
    """
    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if subtrees:
                    examples.extend(parse_tree(line, subtrees=subtrees))
                else:
                    examples.append(parse_tree(line, subtrees=subtrees))
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
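# `parse_tree` is defined elsewhere in the library and is not shown here. The sketch below is a
# hypothetical stand-in illustrating the expected input/output for a single non-subtree example:
# an SST-style line such as '(3 (2 Effective) (2 (2 but) (2 too-tepid)))' carries a 0-4 root
# label, and its leaves form the sentence text. The label names follow the usual SST convention
# and are an assumption, not necessarily the library's exact mapping.
def _example_parse_sst_line(line, fine_grained=False):
    import re

    root_label = int(line[1])  # the digit right after the opening parenthesis
    tokens = re.findall(r'\s([^()\s]+)\)', line)  # leaf tokens each directly precede a ')'
    if fine_grained:
        label = ['very negative', 'negative', 'neutral', 'positive', 'very positive'][root_label]
    else:
        label = ['negative', 'negative', 'neutral', 'positive', 'positive'][root_label]
    return {'text': ' '.join(tokens), 'label': label}


# _example_parse_sst_line('(3 (2 Effective) (2 (2 but) (2 too-tepid)))')
# -> {'text': 'Effective but too-tepid', 'label': 'positive'}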
def wikitext_2_dataset(directory='data/',
                       train=False,
                       dev=False,
                       test=False,
                       train_filename='wiki.train.tokens',
                       dev_filename='wiki.valid.tokens',
                       test_filename='wiki.test.tokens',
                       extracted_name='wikitext-2',
                       check_file='wikitext-2/wiki.train.tokens',
                       url='https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip'):
    """
    Load the WikiText-2 dataset.

    The WikiText language modeling dataset is a collection of over 100 million tokens extracted
    from the set of verified Good and Featured articles on Wikipedia. The dataset is available
    under the Creative Commons Attribution-ShareAlike License.

    **Reference:**
    https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `zip` file.

    Returns:
        :class:`tuple` of :class:`list` of :class:`str`: Tuple with the training tokens, dev
        tokens and test tokens in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import wikitext_2_dataset
        >>> train = wikitext_2_dataset(train=True)
        >>> train[:10]
        ['</s>', '=', 'Valkyria', 'Chronicles', 'III', '=', '</s>', '</s>', 'Senjō', 'no']
    """
    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        text = []
        with io.open(full_path, encoding='utf-8') as f:
            for line in f:
                text.extend(line.replace('<unk>', UNKNOWN_TOKEN).split())
                text.append(EOS_TOKEN)
        ret.append(text)

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
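# One common way to consume the flat token list returned above (an illustrative sketch, not
# library code): map tokens to integer ids and fold the stream into `batch_size` contiguous
# columns, the usual layout for truncated-BPTT language modeling.
def _example_batchify(tokens, batch_size=2):
    import torch

    stoi = {token: i for i, token in enumerate(sorted(set(tokens)))}
    ids = torch.tensor([stoi[token] for token in tokens], dtype=torch.long)
    n_steps = ids.size(0) // batch_size
    ids = ids[:n_steps * batch_size]     # trim tokens that do not fit evenly
    return ids.view(batch_size, -1).t()  # shape: (n_steps, batch_size)


# _example_batchify(['</s>', '=', 'Valkyria', 'Chronicles', 'III', '=', '</s>', '</s>'])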
def snli_dataset(directory='data/',
                 train=False,
                 dev=False,
                 test=False,
                 train_filename='snli_1.0_train.jsonl',
                 dev_filename='snli_1.0_dev.jsonl',
                 test_filename='snli_1.0_test.jsonl',
                 extracted_name='snli_1.0',
                 check_file='snli_1.0/snli_1.0_train.jsonl',
                 url='http://nlp.stanford.edu/projects/snli/snli_1.0.zip'):
    """
    Load the Stanford Natural Language Inference (SNLI) dataset.

    The SNLI corpus (version 1.0) is a collection of 570k human-written English sentence pairs
    manually labeled for balanced classification with the labels entailment, contradiction, and
    neutral, supporting the task of natural language inference (NLI), also known as recognizing
    textual entailment (RTE). We aim for it to serve both as a benchmark for evaluating
    representational systems for text, especially including those induced by representation
    learning methods, as well as a resource for developing NLP models of any kind.

    **Reference:** https://nlp.stanford.edu/projects/snli/

    **Citation:**
    Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large
    annotated corpus for learning natural language inference. In Proceedings of the 2015
    Conference on Empirical Methods in Natural Language Processing (EMNLP).

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `zip` file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset, dev
        dataset and test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import snli_dataset
        >>> train = snli_dataset(train=True)
        >>> train[0]
        {
          'premise': 'Kids are on a amusement ride.',
          'hypothesis': 'A car is broke down on the side of the road.',
          'label': 'contradiction',
          'premise_transitions': ['shift', 'shift', 'shift', 'shift', 'shift', 'shift', ...],
          'hypothesis_transitions': ['shift', 'shift', 'shift', 'shift', 'shift', 'shift', ...],
        }
    """
    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    def get_transitions(parse):
        # The binary parse is a whitespace-delimited string such as '( ( Kids ... ) )'.
        return ['reduce' if t == ')' else 'shift' for t in parse.split() if t != '(']

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                line = json.loads(line)
                examples.append({
                    'premise': line['sentence1'],
                    'hypothesis': line['sentence2'],
                    'label': line['gold_label'],
                    'premise_transitions': get_transitions(line['sentence1_binary_parse']),
                    'hypothesis_transitions': get_transitions(line['sentence2_binary_parse'])
                })
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
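# Quick illustration of the shift/reduce conversion used in `snli_dataset` above (toy parse
# string, not real SNLI data): every token becomes 'shift', every ')' becomes 'reduce', and '('
# is dropped.
def _example_get_transitions():
    parse = '( ( The cat ) ( sat down ) )'  # hypothetical binary parse
    transitions = ['reduce' if t == ')' else 'shift' for t in parse.split() if t != '(']
    print(transitions)
    # ['shift', 'shift', 'reduce', 'shift', 'shift', 'reduce', 'reduce']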
def iwslt_dataset(
        directory='data/iwslt/',
        train=False,
        dev=False,
        test=False,
        language_extensions=['en', 'de'],
        train_filename='{source}-{target}/train.{source}-{target}.{lang}',
        dev_filename='{source}-{target}/IWSLT16.TED.tst2013.{source}-{target}.{lang}',
        test_filename='{source}-{target}/IWSLT16.TED.tst2014.{source}-{target}.{lang}',
        check_file='{source}-{target}/train.tags.{source}-{target}.{source}',
        url='https://wit3.fbk.eu/archive/2016-01/texts/{source}/{target}/{source}-{target}.tgz'):
    """
    Load the International Workshop on Spoken Language Translation (IWSLT) 2017 translation
    dataset.

    In-domain training, development and evaluation sets were supplied through the website of the
    WIT3 project, while out-of-domain training data were linked in the workshop's website. With
    respect to edition 2016 of the evaluation campaign, some of the talks added to the TED
    repository during the last year have been used to define the evaluation sets (tst2017), while
    the remaining new talks have been included in the training sets.

    The English data that participants were asked to recognize and translate consists in part of
    TED talks as in the years before, and in part of real-life lectures and talks that have been
    mainly recorded in lecture halls at KIT and Carnegie Mellon University. TED talks are
    challenging due to their variety in topics, but are very benign as they are very thoroughly
    rehearsed and planned, leading to language that is easy to recognize and translate.

    References:
        * http://workshop2017.iwslt.org/downloads/iwslt2017_proceeding_v2.pdf
        * http://workshop2017.iwslt.org/

    **Citation:**
    M. Cettolo, C. Girardi, and M. Federico. 2012. WIT3: Web Inventory of Transcribed and
    Translated Talks. In Proc. of EAMT, pp. 261-268, Trento, Italy.

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the dev split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        language_extensions (:class:`list` of :class:`str`): Two language extensions
            ['en'|'de'|'it'|'nl'|'ro'] to load.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the dev split.
        test_filename (str, optional): The filename of the test split.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset, dev
        dataset and test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import iwslt_dataset
        >>> train = iwslt_dataset(train=True)
        >>> train[:2]
        [{
          'en': "David Gallo: This is Bill Lange. I'm Dave Gallo.",
          'de': 'David Gallo: Das ist Bill Lange. Ich bin Dave Gallo.'
        }, {
          'en': "And we're going to tell you some stories from the sea here in video.",
          'de': 'Wir werden Ihnen einige Geschichten über das Meer in Videoform erzählen.'
        }]
    """
    if len(language_extensions) != 2:
        raise ValueError("`language_extensions` must be two language extensions "
                         "['en'|'de'|'it'|'nl'|'ro'] to load.")

    # Format Filenames
    source, target = tuple(language_extensions)
    check_file = check_file.format(source=source, target=target)
    url = url.format(source=source, target=target)

    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    iwslt_clean(os.path.join(directory, '{source}-{target}'.format(source=source, target=target)))

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        examples = []

        for extension in language_extensions:
            path = os.path.join(directory,
                                filename.format(lang=extension, source=source, target=target))
            with open(path, 'r', encoding='utf-8') as f:
                language_specific_examples = [l.strip() for l in f]

            if len(examples) == 0:
                examples = [{} for _ in range(len(language_specific_examples))]
            for i, example in enumerate(language_specific_examples):
                examples[i][extension] = example

        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
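# Illustration of how the filename templates in `iwslt_dataset` above expand for a given language
# pair (pure string formatting, no download involved).
def _example_iwslt_paths():
    source, target = 'en', 'de'
    train_filename = '{source}-{target}/train.{source}-{target}.{lang}'
    for lang in (source, target):
        print(train_filename.format(source=source, target=target, lang=lang))
    # en-de/train.en-de.en
    # en-de/train.en-de.de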
def ud_pos_dataset(directory='data/',
                   train=False,
                   dev=False,
                   test=False,
                   train_filename='en-ud-tag.v2.train.txt',
                   dev_filename='en-ud-tag.v2.dev.txt',
                   test_filename='en-ud-tag.v2.test.txt',
                   extracted_name='en-ud-v2',
                   check_file='en-ud-v2/en-ud-tag.v2.train.txt',
                   url='https://bitbucket.org/sivareddyg/public/downloads/en-ud-v2.zip'):
    """
    Load the Universal Dependencies - English Dependency Treebank dataset.

    Corpus of sentences annotated using Universal Dependencies annotation. The corpus comprises
    254,830 words and 16,622 sentences, taken from various web media including weblogs,
    newsgroups, emails, reviews, and Yahoo! answers.

    References:
        * http://universaldependencies.org/
        * https://github.com/UniversalDependencies/UD_English

    **Citation:**
    Natalia Silveira and Timothy Dozat and Marie-Catherine de Marneffe and Samuel Bowman and
    Miriam Connor and John Bauer and Christopher D. Manning (2014). A Gold Standard Dependency
    Corpus for English

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `zip` file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset, dev
        dataset and test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import ud_pos_dataset
        >>> train = ud_pos_dataset(train=True)
        >>> train[17]  # Sentence at index 17 is shortish
        {
          'tokens': ['Guerrillas', 'killed', 'an', 'engineer', ',', 'Asi', 'Ali', ',', 'from',
                     'Tikrit', '.'],
          'ud_tags': ['NOUN', 'VERB', 'DET', 'NOUN', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'ADP',
                      'PROPN', 'PUNCT'],
          'ptb_tags': ['NNS', 'VBD', 'DT', 'NN', ',', 'NNP', 'NNP', ',', 'IN', 'NNP', '.']
        }
    """
    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
            for line in f:
                line = line.strip()
                if line == '' and len(sentence['tokens']) > 0:
                    examples.append(sentence)
                    sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
                elif line != '':
                    token, ud_tag, ptb_tag = tuple(line.split('\t'))
                    sentence['tokens'].append(token)
                    sentence['ud_tags'].append(ud_tag)
                    sentence['ptb_tags'].append(ptb_tag)
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
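# Standalone sketch of the line format consumed by `ud_pos_dataset` above (toy lines, not real
# corpus data): one tab-separated `token<TAB>UD tag<TAB>PTB tag` triple per line, with a blank
# line ending each sentence.
def _example_parse_tagged_lines():
    lines = ['Dogs\tNOUN\tNNS', 'bark\tVERB\tVBP', '.\tPUNCT\t.', '']
    examples = []
    sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
    for line in lines:
        line = line.strip()
        if line == '' and len(sentence['tokens']) > 0:
            examples.append(sentence)
            sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
        elif line != '':
            token, ud_tag, ptb_tag = line.split('\t')
            sentence['tokens'].append(token)
            sentence['ud_tags'].append(ud_tag)
            sentence['ptb_tags'].append(ptb_tag)
    print(examples)
    # [{'tokens': ['Dogs', 'bark', '.'], 'ud_tags': ['NOUN', 'VERB', 'PUNCT'],
    #   'ptb_tags': ['NNS', 'VBP', '.']}]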