def test_dataset(self):
    """Verify FastGBWDataset preserves per-token frequencies across batching.

    Builds a tiny corpus of sentences (each sentence is 0, 1..k, 0 for
    k = 1..9), records how often each token id occurs, then iterates the
    dataset through batch_generator(4, 2) and checks that the batched
    stream reproduces exactly the same counts for every token id >= 1.
    """
    # Yield the sentences 0, 1, ..., k, 0 for k = 1..9.
    def sentences():
        for length in range(1, 10):
            yield [0] + list(range(1, length + 1)) + [0]

    SIZE = 1000
    counts = [0] * SIZE
    corpus = []
    sid = []
    offset = 0
    for sentence in sentences():
        for token in sentence:
            corpus.append(token)
            counts[token] += 1
        n = len(sentence)
        # sid rows are [start, end, length] in flat-corpus coordinates.
        sid.append([offset, offset + n, n])
        offset += n

    dataset = FastGBWDataset(
        torch.from_numpy(np.asarray(corpus)).long(),
        np.asarray(sid))

    # Re-count tokens as they come out of the batch generator.
    counts2 = [0] * SIZE
    for x, y in dataset.batch_generator(4, 2):
        for token in x.numpy().ravel():
            counts2[token] += 1

    # Token id 0 is excluded from the comparison (original behavior).
    for i in range(1, SIZE):
        self.assertEqual(
            counts[i], counts2[i],
            "Mismatch at i=%d. counts[i]=%s, counts2[i]=%s"
            % (i, counts[i], counts2[i]))
# NOTE(review): whitespace-mangled paste — this single physical line is a collapsed
# multi-line chunk. It contains a dangling `else:` whose matching `if` lies outside
# this view, plus embedded `#` comment banners, so it is NOT valid as one line and
# must not be reformatted without the surrounding file. Left byte-identical.
# What the statements do (once un-collapsed): the `else:` branch loads Torch7-format
# GBW data — word_freq via load_lua('word_freq.th7'), builds a frequency-sorted id
# remapping `mapto`, constructs FastGBWDataset/GBWDataset train/test corpora, and
# sets cutoffs = args.cutoffs + [ntokens]. The commented-out block afterwards is an
# abandoned criterion-construction alternative (AdaptiveLoss vs SplitCrossEntropyLoss).
ntokens = len(corpus.dictionary.idx2word) cutoffs = args.cutoffs + [ntokens] else: ############################################################################### # Load data ############################################################################### # Torch word_freq = load_lua(os.path.join(args.data, 'word_freq.th7')).numpy() mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long() print("load word frequency mapping - complete") ntokens = len(word_freq) nsampled = 8192 train_corpus = FastGBWDataset(args.data, 'train_data.th7', 'train_data.sid', mapto) print("load train data - complete") test_corpus = GBWDataset(args.data, 'test_data.th7', mapto) print("load test data - complete") cutoffs = args.cutoffs + [ntokens] # with doing('Constructing model'): # if not args.lm1b: # criterion = AdaptiveLoss(cutoffs) # else: # criterion = SplitCrossEntropyLoss(args.emsize, args.cutoffs, verbose=False) # criterion.cuda() logging.info("Constructing model")
# NOTE(review): whitespace-mangled paste — one collapsed multi-line chunk. It embeds
# `#` banners and opens a triple-quoted ''' (the commented-out "Streaming" variant)
# that is NOT closed within this view, so the chunk is incomplete at its tail and
# cannot be safely reformatted in isolation. Left byte-identical.
# What the statements do (once un-collapsed): seed CPU+CUDA RNGs from args.seed,
# load word_freq from 'word_freq.pt' via torch.load, build the frequency-sorted id
# remapping `mapto` (util.reverse(np.argsort(-word_freq))), then construct
# FastGBWDataset train corpus (with seq_length/batch_size from args) and GBWDataset
# test corpus from the .pt files; nsampled = 16384. The ''' section is a disabled
# StreamGBWDataset alternative over the raw 1B-word shards.
torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) ############################################################################### # Load data ############################################################################### # Torch word_freq = torch.load(os.path.join(args.data, 'word_freq.pt')).numpy() mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long() print("load word frequency mapping - complete") ntokens = len(word_freq) nsampled = 16384 train_corpus = FastGBWDataset(args.data, 'train_data.pt', 'train_data.sid', mapto, seq_length=args.bptt, batch_size=args.batch_size) print("load train data - complete") test_corpus = GBWDataset(args.data, 'test_data.pt', mapto) print("load test data - complete") # Streaming ''' vocabulary = Vocabulary.from_file(os.path.join(args.data, "1b_word_vocab.txt")) ntokens = len(vocabulary) nsampled = 16384 train_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "training-monolingual.tokenized.shuffled/*")) test_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "heldout-monolingual.tokenized.shuffled/*"), deterministic=True) print("load dataset - complete")
# NOTE(review): whitespace-mangled paste — one collapsed multi-line chunk, same shape
# as the variant above but parameterized by args (freq_file/train_file/sid_file/
# validation_file) and using np.load for word frequencies. It also opens a
# triple-quoted ''' ("Streaming" variant) that is NOT closed within this view, so
# the chunk is incomplete at its tail and must not be reformatted in isolation.
# Left byte-identical.
# What the statements do (once un-collapsed): seed CPU+CUDA RNGs, load word_freq
# with np.load(args.freq_file), build the frequency-sorted id remapping `mapto`,
# construct FastGBWDataset train and GBWDataset validation corpora from the
# args-named files; nsampled = 8192. The ''' section is a disabled StreamGBWDataset
# alternative over the raw 1B-word shards.
torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) ############################################################################### # Load data ############################################################################### # Torch word_freq = np.load(os.path.join(args.data, args.freq_file)) mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long() print("load word frequency mapping - complete") ntokens = len(word_freq) nsampled = 8192 train_corpus = FastGBWDataset(args.data, args.train_file, args.sid_file, mapto) print("load train data - complete") test_corpus = GBWDataset(args.data, args.validation_file, mapto) print("load test data - complete") # Streaming ''' vocabulary = Vocabulary.from_file(os.path.join(args.data, "1b_word_vocab.txt")) ntokens = len(vocabulary) nsampled = 8192 train_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "training-monolingual.tokenized.shuffled/*")) test_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "heldout-monolingual.tokenized.shuffled/*"), deterministic=True) print("load dataset - complete")