Example #1
	def __init__(self, max_seq_len, train=False, val=False, test=False, root="data/", **kwargs):
		self.max_seq_len = max_seq_len
		self.dist_between_sents = int(self.max_seq_len / 10)
		self.is_train = train

		dataset = penn_treebank_dataset(root + "penn-treebank", train=train, dev=val, test=test)

		self.vocabulary = PennTreeBankDataset.get_vocabulary(root=root)
		self.index_to_word = {val: key for key, val in self.vocabulary.items()}

		# Split the token stream into sentences at "</s>" markers. Each word is encoded
		# as its vocabulary index, falling back to character-level indices for
		# out-of-vocabulary words, with a space token appended after every word.
		words = [[]]
		for word in dataset:
			if word == "</s>":
				words.append([])
			else:
				if word in self.vocabulary:
					words[-1].append(self.vocabulary[word])
				else:
					words[-1] += [self.vocabulary[c] for c in word]
				words[-1].append(self.vocabulary[" "])

		# Keep only non-empty sentences shorter than max_seq_len.
		self.data = [np.array(sent) for sent in words if (len(sent) != 0 and len(sent) < self.max_seq_len)]

		print("Length of dataset: ", len(self))
Example #2
def initialize_dataset(
    corpus_name,
    max_word_length=None,
    max_words=None,
    data_splits=(0.7, 0.2, 0.1),
):
    print(f"Initializing dataset ..")
    assert abs(sum(data_splits) - 1) < 0.0001
    if corpus_name == "penn-treebank":

        from torchnlp.datasets import penn_treebank_dataset

        train, val, test = penn_treebank_dataset(
            train=True, dev=True, test=True
        )
        train = preprocess_sentence(train)
        val = preprocess_sentence(val)
        test = preprocess_sentence(test)
        datasets = {
            "train": train,
            "val": val,
            "test": test,
        }

    elif corpus_name == "brown":
        import nltk

        nltk.download("brown")
        from nltk.corpus import brown

        processed_txt = []
        for s in brown.sents():
            processed_txt += preprocess_sentence(s)

        n_tokens = len(processed_txt)
        split_n = [int(s * n_tokens) for s in np.cumsum(data_splits)]
        train = processed_txt[: split_n[0]]
        val = processed_txt[split_n[0] : split_n[1]]
        test = processed_txt[split_n[1] :]
        datasets = {
            "train": train,
            "val": val,
            "test": test,
        }

    else:
        raise ValueError(f"Corpus {corpus_name} not supported")

    root_path = os.path.join("data", corpus_name, "objects")
    if os.path.exists(root_path):
        shutil.rmtree(root_path)
    os.makedirs(root_path + "/train", exist_ok=True)
    os.makedirs(root_path + "/val", exist_ok=True)
    os.makedirs(root_path + "/test", exist_ok=True)

    return create_objects(
        corpus_name, datasets, root_path, max_word_length, max_words
    )
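A hypothetical call of initialize_dataset (preprocess_sentence and create_objects are assumed to be defined elsewhere in the same module, since they are not shown here; the limits below are illustrative only):

# Illustrative usage sketch only: builds train/val/test objects for the PTB corpus.
dataset_objects = initialize_dataset(
    "penn-treebank",
    max_word_length=20,
    max_words=10000,
)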
Example #3
	def create_vocabulary(root="data/"):
		if root is None:
			root = ""
		dataset = penn_treebank_dataset(root + "penn-treebank", train=True, dev=False, test=False)
		all_words = [w for w in dataset]
		# Character-level vocabulary plus special tokens for spaces, unknown words and sentence ends.
		vocabulary = list(set([c for w in all_words for c in w])) + [" ", "<unk>", "</s>"]
		vocabulary = sorted(vocabulary)
		vocabulary = {c: i for i, c in enumerate(vocabulary)}
		with open(root + PennTreeBankDataset.VOCABULARY_FILE, "w") as f:
			json.dump(vocabulary, f, indent=4)
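Example #1 calls PennTreeBankDataset.get_vocabulary, which is not shown here. A minimal sketch of such a loader, assuming it simply reads back the JSON file written by create_vocabulary above (os and json imports assumed):

	@staticmethod
	def get_vocabulary(root="data/"):
		# Sketch under the assumption above: load the character vocabulary from disk,
		# creating it first via create_vocabulary if the JSON file does not exist yet.
		if root is None:
			root = ""
		path = root + PennTreeBankDataset.VOCABULARY_FILE
		if not os.path.isfile(path):
			PennTreeBankDataset.create_vocabulary(root=root)
		with open(path, "r") as f:
			return json.load(f)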
Example #4
def test_penn_treebank_dataset_row(mock_urlretrieve):
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # Check that rows are parsed correctly
    train, dev, test = penn_treebank_dataset(
        directory=directory, test=True, dev=True, train=True, check_files=[])
    assert len(train) > 0
    assert len(test) > 0
    assert len(dev) > 0
    assert train[0:10] == [
        'aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano',
        'guterman', 'hydro-quebec'
    ]
Example #5
def sample_data():
    return penn_treebank_dataset(dev=True)
Example #6
    def _build_dataloader(self):
        self.val_loader = self.corpus = None
        if self.dataset_kind == "mnist":
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ])
            self.dataset = MNISTBufferedDataset(self.data_dir,
                                                download=True,
                                                train=True,
                                                transform=transform)
            self.val_dataset = MNISTBufferedDataset(self.data_dir,
                                                    download=True,
                                                    transform=transform)

            self.train_sampler = MNISTSequenceSampler(
                self.dataset,
                sequences=self.sequences,
                batch_size=self.batch_size,
                random_mnist_images=not self.static_digit,
                randomize_sequence_cursors=self.randomize_sequence_cursors,
                noise_buffer=self.noise_buffer,
                use_mnist_pct=self.use_mnist_pct,
                max_batches=self.batches_in_epoch,
            )

            if self.static_digit:
                # For the static digit paradigm, the val & train samplers must
                # match to ensure the same digit prototype is used for each sequence item.
                self.val_sampler = self.train_sampler
            else:
                self.val_sampler = MNISTSequenceSampler(
                    self.val_dataset,
                    sequences=self.sequences,
                    batch_size=self.batch_size,
                    random_mnist_images=not self.static_digit,
                    randomize_sequence_cursors=self.randomize_sequence_cursors,
                    noise_buffer=self.noise_buffer,
                    use_mnist_pct=self.use_mnist_pct,
                    max_batches=self.eval_batches_in_epoch,
                )
            self.train_loader = DataLoader(
                self.dataset,
                batch_sampler=self.train_sampler,
                collate_fn=pred_sequence_collate,
            )
            self.val_loader = DataLoader(
                self.val_dataset,
                batch_sampler=self.val_sampler,
                collate_fn=pred_sequence_collate,
            )

        elif self.dataset_kind == "ptb":
            # Download "Penn Treebank" dataset
            from torchnlp.datasets import penn_treebank_dataset

            print("Maybe download PTB...")
            penn_treebank_dataset(self.data_dir + "/PTB",
                                  train=True,
                                  test=True)
            corpus = lang_util.Corpus(self.data_dir + "/PTB")
            train_sampler = PTBSequenceSampler(
                corpus.train,
                batch_size=self.batch_size,
                max_batches=self.batches_in_epoch,
            )

            if self.embedding_kind == "rsm_bitwise":
                embedding = lang_util.BitwiseWordEmbedding().embedding_dict
            elif self.embedding_kind in ["bpe", "glove"]:
                from torchnlp.word_to_vector import BPEmb, GloVe

                cache_dir = self.data_dir + "/torchnlp/.word_vectors_cache"
                if self.embedding_kind == "bpe":
                    vectors = BPEmb(dim=self.embed_dim, cache=cache_dir)
                else:
                    vectors = GloVe(name="6B",
                                    dim=self.embed_dim,
                                    cache=cache_dir)
                embedding = {}
                for word_id, word in enumerate(corpus.dictionary.idx2word):
                    embedding[word_id] = vectors[word]
            elif "ptb_fasttext" in self.embedding_kind:
                import fasttext

                # Generated via notebooks/ptb_embeddings.ipynb
                embedding = {}
                ft_model = fasttext.load_model(self.data_dir +
                                               "/embeddings/%s.bin" %
                                               self.embedding_kind)
                for word_id, word in enumerate(corpus.dictionary.idx2word):
                    embedding[word_id] = torch.tensor(ft_model[word])

            if self.embedding_kind:
                print("Loaded embedding dict (%s) with %d entries" %
                      (self.embedding_kind, len(embedding)))

            collate_fn = partial(ptb_pred_sequence_collate,
                                 vector_dict=embedding)
            self.train_loader = DataLoader(corpus.train,
                                           batch_sampler=train_sampler,
                                           collate_fn=collate_fn)
            val_sampler = PTBSequenceSampler(
                corpus.test,
                batch_size=self.eval_batch_size,
                max_batches=self.eval_batches_in_epoch,
                uniform_offsets=True,
            )
            self.val_loader = DataLoader(corpus.test,
                                         batch_sampler=val_sampler,
                                         collate_fn=collate_fn)
            self.corpus = corpus
            print("Built dataloaders...")
Example #7
import torch
from torchnlp.datasets import penn_treebank_dataset
train = penn_treebank_dataset(train=True)
print(train[:100])
Example #8
    def _build_dataloader(self):
        # Extra element for sequential prediction labels

        self.val_loader = None
        if self.dataset_kind == "mnist":
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ])
            self.dataset = MNISTBufferedDataset(self.data_dir,
                                                download=True,
                                                train=True,
                                                transform=transform)
            self.val_dataset = MNISTBufferedDataset(self.data_dir,
                                                    download=True,
                                                    transform=transform)

            self.train_sampler = MNISTSequenceSampler(
                self.dataset,
                sequences=self.sequences,
                batch_size=self.batch_size,
                random_mnist_images=not self.static_digit,
                noise_buffer=self.noise_buffer,
                use_mnist_pct=self.use_mnist_pct,
                max_batches=self.batches_in_epoch,
            )

            if self.static_digit:
                # For the static digit paradigm, the val & train samplers must
                # match to ensure the same digit prototype is used for each sequence item.
                self.val_sampler = self.train_sampler
            else:
                self.val_sampler = MNISTSequenceSampler(
                    self.val_dataset,
                    sequences=self.sequences,
                    batch_size=self.batch_size,
                    random_mnist_images=not self.static_digit,
                    noise_buffer=self.noise_buffer,
                    use_mnist_pct=self.use_mnist_pct,
                    max_batches=self.batches_in_epoch,
                )
            self.train_loader = DataLoader(
                self.dataset,
                batch_sampler=self.train_sampler,
                collate_fn=pred_sequence_collate,
            )
            self.val_loader = DataLoader(
                self.val_dataset,
                batch_sampler=self.val_sampler,
                collate_fn=pred_sequence_collate,
            )

        elif self.dataset_kind == "ptb":
            # Download "Penn Treebank" dataset
            from torchnlp.datasets import penn_treebank_dataset

            # Download both splits; corpus.test is used below for the validation loader.
            penn_treebank_dataset(self.data_dir + "/PTB", train=True, test=True)
            corpus = lang_util.Corpus(self.data_dir + "/PTB")
            train_sampler = PTBSequenceSampler(
                corpus.train,
                batch_size=self.batch_size,
                max_batches=self.batches_in_epoch,
            )

            if self.embedding_kind == "rsm_bitwise":
                embedding = lang_util.BitwiseWordEmbedding().embedding_dict
            elif self.embedding_kind == "bpe":
                from torchnlp.word_to_vector import BPEmb
                cache_dir = self.data_dir + "/torchnlp/.word_vectors_cache"
                vectors = BPEmb(dim=self.embed_dim, cache=cache_dir)
                embedding = {}
                for word_id, word in enumerate(corpus.dictionary.idx2word):
                    embedding[word_id] = vectors[word]

            collate_fn = partial(ptb_pred_sequence_collate,
                                 vector_dict=embedding)
            self.train_loader = DataLoader(corpus.train,
                                           batch_sampler=train_sampler,
                                           collate_fn=collate_fn)
            val_sampler = PTBSequenceSampler(
                corpus.test,
                batch_size=self.batch_size,
                max_batches=self.batches_in_epoch,
            )
            self.val_loader = DataLoader(corpus.test,
                                         batch_sampler=val_sampler,
                                         collate_fn=collate_fn)
Example #9
#  You should have received a copy of the GNU Affero Public License
#  along with this program.  If not, see http://www.gnu.org/licenses.
#
#  http://numenta.org/licenses/

import os
import sys

import fasttext
from torchnlp.datasets import penn_treebank_dataset

PATH = "/home/ubuntu"
# PATH = "/Users/jgordon"

print("Maybe download ptb...")
penn_treebank_dataset(PATH + "/nta/datasets/PTB", train=True, test=True)

PTB_TRAIN_PATH = PATH + "/nta/datasets/PTB/ptb.train.txt"

if len(sys.argv) > 1:
    epoch = int(sys.argv[1])
else:
    epoch = 5

model = fasttext.train_unsupervised(PTB_TRAIN_PATH,
                                    model="skipgram",
                                    minCount=1,
                                    epoch=epoch)
embed_dir = PATH + "/nta/datasets/embeddings"
filename = PATH + "/nta/datasets/embeddings/ptb_fasttext_e%d.bin" % epoch
if not os.path.exists(embed_dir):
    os.makedirs(embed_dir)