Example #1
    def test_dataset(self):
        # Synthetic sentences: 1..i wrapped with 0 as the boundary token.
        def generator():
            for i in range(1, 10):
                yield [0] + list(range(1, i + 1)) + [0]

        SIZE = 1000
        counts = [0] * SIZE  # token frequencies in the raw corpus

        # Flatten the sentences into one corpus list and record each
        # sentence's [start, end, length] slice in sid.
        corpus = list()
        sid = list()
        start_id = 0
        for seq in generator():
            for v in seq:
                corpus.append(v)
                counts[v] += 1
            sid.append([start_id, start_id + len(seq), len(seq)])
            start_id += len(seq)

        dataset = FastGBWDataset(
            torch.from_numpy(np.asarray(corpus)).long(), np.asarray(sid))
        counts2 = [0] * SIZE  # token frequencies seen through the batch generator
        for x, y in dataset.batch_generator(4, 2):
            for v in x.numpy().ravel():
                counts2[v] += 1

        # Skip id 0 (the boundary token wrapped around every sentence).
        for i in range(1, SIZE):
            self.assertEqual(
                counts[i], counts2[i],
                "Mismatch at i=%d. counts[i]=%s, counts2[i]=%s" %
                (i, counts[i], counts2[i]))
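
The loop above flattens each generated sentence into one corpus array and stores a [start, end, length] row per sentence in sid. A minimal standalone sketch of that index layout (illustrative arrays, independent of FastGBWDataset):

import numpy as np

# Two sentences flattened into one corpus array, with 0 as the boundary token.
corpus = np.array([0, 1, 0, 0, 1, 2, 0])
# One row per sentence: [start, end, length], where end is exclusive.
sid = np.array([[0, 3, 3],
                [3, 7, 4]])

for start, end, length in sid:
    assert end - start == length
    print(corpus[start:end])   # [0 1 0] then [0 1 2 0]
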
Example #2
    ntokens = len(corpus.dictionary.idx2word)
    cutoffs = args.cutoffs + [ntokens]
else:
    ###############################################################################
    # Load data
    ###############################################################################

    # Torch
    word_freq = load_lua(os.path.join(args.data, 'word_freq.th7')).numpy()
    mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
    print("load word frequency mapping - complete")

    ntokens = len(word_freq)
    nsampled = 8192

    train_corpus = FastGBWDataset(args.data, 'train_data.th7', 'train_data.sid', mapto)
    print("load train data - complete")

    test_corpus = GBWDataset(args.data, 'test_data.th7', mapto)
    print("load test data - complete")

    cutoffs = args.cutoffs + [ntokens]


# with doing('Constructing model'):
    # if not args.lm1b:
    #     criterion = AdaptiveLoss(cutoffs)
    # else:
    #     criterion = SplitCrossEntropyLoss(args.emsize, args.cutoffs, verbose=False)
    #     criterion.cuda()
logging.info("Constructing model")
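
Every lm1b example here builds mapto with util.reverse(np.argsort(-word_freq)), remapping each original token id to its rank by descending frequency. A small sketch of the inverse-permutation behaviour this appears to rely on; the reverse helper below is a hypothetical stand-in, not the actual util implementation:

import numpy as np

def reverse(perm):
    # Hypothetical stand-in: invert a permutation so that inv[perm[i]] == i.
    inv = np.empty_like(perm)
    inv[perm] = np.arange(len(perm))
    return inv

word_freq = np.array([3, 50, 7, 50, 1])
mapto = reverse(np.argsort(-word_freq))
print(mapto)   # [3 0 2 1 4]: the most frequent token is remapped to id 0
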
Example #3
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

###############################################################################
# Load data
###############################################################################

# Torch
word_freq = torch.load(os.path.join(args.data, 'word_freq.pt')).numpy()
mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
print("load word frequency mapping - complete")

ntokens = len(word_freq)
nsampled = 16384

train_corpus = FastGBWDataset(args.data, 'train_data.pt', 'train_data.sid', mapto, seq_length=args.bptt, batch_size=args.batch_size)
print("load train data - complete")

test_corpus = GBWDataset(args.data, 'test_data.pt', mapto)
print("load test data - complete")

# Streaming alternative (disabled): read the raw tokenized shards instead of preprocessed tensors
'''
vocabulary = Vocabulary.from_file(os.path.join(args.data, "1b_word_vocab.txt"))

ntokens = len(vocabulary)
nsampled = 16384

train_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "training-monolingual.tokenized.shuffled/*"))
test_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "heldout-monolingual.tokenized.shuffled/*"), deterministic=True)
print("load dataset - complete")
Example #4
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

###############################################################################
# Load data
###############################################################################

# Torch
word_freq = np.load(os.path.join(args.data, args.freq_file))
mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
print("load word frequency mapping - complete")

ntokens = len(word_freq)
nsampled = 8192

train_corpus = FastGBWDataset(args.data, args.train_file, args.sid_file, mapto)
print("load train data - complete")

test_corpus = GBWDataset(args.data, args.validation_file, mapto)
print("load test data - complete")

# Streaming alternative (disabled): read the raw tokenized shards instead of preprocessed tensors
'''
vocabulary = Vocabulary.from_file(os.path.join(args.data, "1b_word_vocab.txt"))

ntokens = len(vocabulary)
nsampled = 8192

train_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "training-monolingual.tokenized.shuffled/*"))
test_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "heldout-monolingual.tokenized.shuffled/*"), deterministic=True)
print("load dataset - complete")