def loaders(self, batch_size: int, shuffle_train=False, shuffle_test=False,
            num_workers: int = 0) -> Tuple[DataLoader, DataLoader]:  # Tuple imported from typing
    # Use BucketBatchSampler so each batch groups texts of similar length.
    # Note: shuffle_train/shuffle_test are accepted but not used by the bucketing samplers.
    train_sampler = BucketBatchSampler(self.train_set, batch_size=batch_size, drop_last=True,
                                       sort_key=lambda r: len(r['text']))
    test_sampler = BucketBatchSampler(self.test_set, batch_size=batch_size, drop_last=True,
                                      sort_key=lambda r: len(r['text']))
    print("trainset", self.train_set)
    train_loader = DataLoader(dataset=self.train_set, batch_sampler=train_sampler,
                              collate_fn=collate_fn, num_workers=num_workers)
    test_loader = DataLoader(dataset=self.test_set, batch_sampler=test_sampler,
                             collate_fn=collate_fn, num_workers=num_workers)
    return train_loader, test_loader
def test_bucket_batch_sampler_uneven():
    data_source = [[1], [2], [3], [4], [5]]
    sort_key = lambda r: len(r)
    batch_size = 2

    sampler = BucketBatchSampler(
        data_source, batch_size, sort_key=sort_key, drop_last=False, bucket_size_multiplier=2)
    batches = list(sampler)
    assert len(batches) == 3
    assert len(sampler) == 3

    sampler = BucketBatchSampler(
        data_source, batch_size, sort_key=sort_key, drop_last=True, bucket_size_multiplier=2)
    batches = list(sampler)
    assert len(batches) == 2
    assert len(sampler) == 2
def test_pickleable():
    sampler = SequentialSampler(list(range(10)))
    batch_sampler = BucketBatchSampler(sampler, batch_size=2, drop_last=False, bucket_size_multiplier=2)
    pickle.dumps(batch_sampler)
def get_data_loader(sampler_name, dataset, batch_size, max_tokens, num_workers, shuffle):
    kwargs_test = {}
    if sampler_name == "bucket":
        sampler = BucketBatchSampler(SequentialSampler(dataset), batch_size=batch_size, drop_last=False,
                                     sort_key=lambda i: len(dataset.datasets.iloc[i]["src"].split()))
    elif sampler_name == "maxtokens":
        sampler = MaxTokensBatchSampler(SequentialSampler(dataset), shuffle=shuffle, batch_size=batch_size,
                                        max_tokens=max_tokens, drop_last=False,
                                        sort_key=lambda i: len(dataset.datasets.iloc[i]["src"].split()))
    else:
        sampler = None
        kwargs_test = {"batch_size": batch_size, "shuffle": shuffle}

    # Define dataloader
    data_loader = DataLoader(dataset,
                             num_workers=num_workers,
                             collate_fn=lambda x: TranslationDataset.collate_fn(x, max_tokens),
                             pin_memory=True,
                             batch_sampler=sampler,
                             **kwargs_test)
    return data_loader
def test_bucket_batch_sampler__drop_last():
    sampler = SequentialSampler(list(range(10)))
    batch_sampler = BucketBatchSampler(sampler, batch_size=3, drop_last=True, bucket_size_multiplier=2)
    assert len(batch_sampler) == 3
    assert len(list(iter(batch_sampler))) == 3
def test_bucket_batch_sampler():
    sampler = SequentialSampler(list(range(10)))
    batch_sampler = BucketBatchSampler(sampler, batch_size=3, drop_last=False, bucket_size_multiplier=2)
    assert len(batch_sampler) == 4
    assert list(batch_sampler) == [[0, 1, 2], [3, 4, 5], [9], [6, 7, 8]]
def test_bucket_batch_sampler_last_batch_first():
    data_source = [torch.tensor([j for j in range(i)]) for i in range(100)]
    sort_key = lambda r: len(r)
    batch_size = 1
    batches = list(
        BucketBatchSampler(data_source, batch_size, sort_key=sort_key, drop_last=False,
                           bucket_size_multiplier=2))
    # The longest example (index 99) is placed in the first batch
    assert 99 == batches[0][0]
def test_bucket_batch_sampler():
    data_source = [[1], [2], [3], [4], [5], [6]]
    sort_key = lambda r: len(r)
    batch_size = 2
    batches = list(
        BucketBatchSampler(data_source, batch_size, sort_key, bucket_size_multiplier=2))
    assert len(batches) == 3
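# The tests above mix two BucketBatchSampler call styles: an older one that receives the
# data source directly (sort_key gets a row) and a newer index-sampler API (sort_key gets
# an index). A minimal sketch of the index-based style, assuming the sampler API of recent
# torchnlp releases; the data and parameter values below are illustrative only.
from torch.utils.data.sampler import SequentialSampler
from torchnlp.samplers import BucketBatchSampler

lengths = [5, 2, 9, 3, 7, 1]
bucket_sampler = BucketBatchSampler(
    SequentialSampler(lengths),     # yields indices 0..5
    batch_size=2,
    drop_last=False,
    sort_key=lambda i: lengths[i],  # length of example i
    bucket_size_multiplier=3)       # each bucket holds batch_size * 3 indices
for batch in bucket_sampler:
    # indices inside a batch have similar lengths, e.g. [5, 1] -> lengths [1, 2]
    print(batch, [lengths[i] for i in batch])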
def main(args):
    """
    Main training routine specific for this project.

    :param args: parsed command-line arguments
    """
    if args.train_file is not None and args.dev_file is not None:
        if args.load_checkpoint:
            model = StreamingPunctuatorModel.load_from_checkpoint(args.load_checkpoint, **vars(args))
        else:
            model = StreamingPunctuatorModel(**vars(args))

        batch_size = args.batch_size
        train_dataset = PunctuationDataset(tokenizer=model.tokenizer, filename=args.train_file,
                                           label_delay=args.label_delay)
        dev_dataset = PunctuationDataset(tokenizer=model.tokenizer, filename=args.dev_file,
                                         label_delay=args.label_delay)

        random_sampler = RandomSampler(train_dataset)
        batch_iterator = BucketBatchSampler(random_sampler,
                                            batch_size=batch_size,
                                            drop_last=False,
                                            sort_key=lambda i: train_dataset[i]["length"],
                                            bucket_size_multiplier=100)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_sampler=batch_iterator,
                                                   collate_fn=train_dataset.collate_batch,
                                                   num_workers=8)
        dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 collate_fn=dev_dataset.collate_batch,
                                                 num_workers=2)

        checkpoint_callback = ModelCheckpoint(
            save_top_k=4,
            save_last=True,
            verbose=True,
            monitor='val_f1',
            mode='max',
            prefix=''
        )

        trainer = Trainer.from_argparse_args(args, checkpoint_callback=checkpoint_callback, callbacks=[])
        trainer.fit(model, train_dataloader=train_loader, val_dataloaders=dev_loader)
    elif args.process_stdin and args.load_checkpoint is not None:
        model = StreamingPunctuatorModel.load_from_checkpoint(args.load_checkpoint)
        while True:
            l = sys.stdin.readline()
            if not l:
                break
            print(model.process_line(l.strip()))
            sys.stdout.flush()
    else:
        raise Exception("Either --train-file and --dev-file or --process-stdin and --load-checkpoint "
                        "should be specified")
def get_data_loader(data, batch_size, drop_last, collate_fn=collate_fn_eval_base):
    sampler = BucketBatchSampler(data, batch_size, drop_last=drop_last,
                                 sort_key=lambda row: -len(row['word_ids']))
    loader = DataLoader(data, batch_sampler=sampler, collate_fn=collate_fn)
    return loader
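# Illustrative sketch (not part of the helper above): negating the length in sort_key makes
# each batch come back longest-first, because torchnlp's BucketBatchSampler sorts ascending
# by sort_key within a bucket. Descending lengths are the order that
# torch.nn.utils.rnn.pack_padded_sequence expects when enforce_sorted=True. The rows and
# the index-based call below are assumptions, not taken from the surrounding code.
from torch.utils.data.sampler import SequentialSampler
from torchnlp.samplers import BucketBatchSampler

rows = [{'word_ids': list(range(n))} for n in (3, 7, 5, 2)]
demo_sampler = BucketBatchSampler(
    SequentialSampler(rows), batch_size=4, drop_last=False,
    sort_key=lambda i: -len(rows[i]['word_ids']))
for batch in demo_sampler:
    print([len(rows[i]['word_ids']) for i in batch])  # e.g. [7, 5, 3, 2]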
def test_bucket_batch_sampler_last_batch_first():
    data_source = [[1], [2], [3], [4], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    sort_key = lambda r: len(r)
    batch_size = 2
    batches = list(
        BucketBatchSampler(data_source, batch_size, sort_key, biggest_batches_first=True,
                           bucket_size_multiplier=2))
    # The longest example (index 4) ends up in the first batch
    assert 4 in batches[0]
def test_bucket_batch_sampler_uneven_length():
    data_source = [[1], [2], [3], [4], [5]]
    sort_key = lambda i: len(data_source[i])
    batch_size = 2
    sampler = SequentialSampler(data_source)

    batch_sampler = BucketBatchSampler(sampler, batch_size, sort_key=sort_key, drop_last=False,
                                       bucket_size_multiplier=2)
    batches = list(batch_sampler)
    assert len(batches) == 3
    assert len(batch_sampler) == 3

    batch_sampler = BucketBatchSampler(sampler, batch_size, sort_key=sort_key, drop_last=True,
                                       bucket_size_multiplier=2)
    batches = list(batch_sampler)
    assert len(batches) == 2
    assert len(batch_sampler) == 2
def test_bucket_batch_sampler_sorted():
    data_source = [[1], [2], [3], [4], [5]]
    sort_key = lambda i: data_source[i]
    batch_size = len(data_source)
    sampler = SequentialSampler(data_source)
    batches = list(
        BucketBatchSampler(sampler, batch_size, sort_key=sort_key, drop_last=False,
                           bucket_size_multiplier=1))
    for i, batch in enumerate(batches):
        assert batch[0] == i
def test_bucket_batch_sampler_sorted():
    data_source = [[1], [2], [3], [4], [5]]
    sort_key = lambda r: r[0]
    batch_size = len(data_source)
    batches = list(
        BucketBatchSampler(data_source, batch_size, sort_key, biggest_batches_first=False,
                           bucket_size_multiplier=1))
    # With a single full-size bucket, examples come back in sorted order
    for i, batch in enumerate(batches):
        assert batch[0] == i
def get_data_loader(data, batch_size, drop_last, use_rnn, is_train=False):
    # if is_train:
    sampler = BucketBatchSampler(data, batch_size, drop_last=drop_last,
                                 sort_key=lambda row: -len(row["sents"]))
    # else:
    #     sampler = SequentialSampler(data)
    collate_fn = collate_fn_rnn if use_rnn else collate_fn_transformer
    loader = DataLoader(data,
                        batch_sampler=sampler,
                        collate_fn=collate_fn,
                        pin_memory=False)
                        # shuffle=True,
                        # num_workers=1)
    return loader
def test_dataloader(self):
    tokenizer = Tokenizer.from_file("test/tokenizer.json")
    tokenizer.add_special_tokens(["<s>", "</s>"])
    dataset = PunctuationDataset(tokenizer, "test/dev.txt")
    batch_size = 8
    random_sampler = RandomSampler(dataset)
    batch_iterator = BucketBatchSampler(random_sampler,
                                        batch_size=batch_size,
                                        drop_last=False,
                                        sort_key=lambda x: dataset[x]["length"],
                                        bucket_size_multiplier=100)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_sampler=batch_iterator,
                                             collate_fn=dataset.collate_batch)
    for i in range(2):
        print(f"Testing epoch {i}")
        for j, batch in enumerate(dataloader):
            if j == 0:
                # make sure that the length difference inside a batch is not > 20%
                self.assertTrue((batch["lengths"].max() - batch["lengths"].min()) / batch["lengths"].max() < 0.2)
def train_dataloader(self) -> torch.utils.data.DataLoader:
    """Return a PyTorch DataLoader for the training set.

    Requires calling ``prepare_data`` beforehand.

    Return:
        PyTorch DataLoader
    """
    sampler = BucketBatchSampler(
        RandomSampler(self.train_dataset),
        batch_size=self.config.batch_size.train,
        drop_last=False,
        sort_key=lambda sample: len(self.train_dataset[sample][const.TARGET].split()),
        # bucket_size_multiplier=100,
    )
    return torch.utils.data.DataLoader(
        self.train_dataset,
        batch_sampler=sampler,
        num_workers=self.config.num_data_workers,
        collate_fn=self.data_encoders.collate_fn,
        pin_memory=torch.cuda.is_initialized(),  # NOQA
    )
dev_log_template = ' '.join(
    '{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'.split(','))
log_template = ' '.join(
    '{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{},{:12.4f},{}'.split(','))

makedirs(args.save_path)
print(header)

for epoch in range(args.epochs):
    n_correct, n_total = 0, 0
    train_sampler = SequentialSampler(train)
    train_batch_sampler = BucketBatchSampler(
        train_sampler, args.batch_size, True, sort_key=lambda r: len(train[r]['premise']))
    train_iterator = DataLoader(train,
                                batch_sampler=train_batch_sampler,
                                collate_fn=collate_fn,
                                pin_memory=torch.cuda.is_available(),
                                num_workers=0)
    for batch_idx, (premise_batch, hypothesis_batch, label_batch) in enumerate(train_iterator):
        # switch model to training mode, clear gradient accumulators
        model.train()
        torch.set_grad_enabled(True)
        opt.zero_grad()
TEXT_LENGTH = opt.text_length
ROUTING_TYPE = opt.routing_type
NUM_ITERATIONS = opt.num_iterations
BATCH_SIZE = opt.batch_size
MODEL_WEIGHT = opt.load_model_weight
GPU = opt.gpu

vocab_size, num_class, test_dataset = load_data(data_type='custom', preprocessing=False,
                                                fine_grained=FINE_GRAINED, verbose=True,
                                                text_length=TEXT_LENGTH)
print("[!] vocab_size: {}, num_class: {}".format(vocab_size, num_class))

test_sampler = BucketBatchSampler(test_dataset, BATCH_SIZE, False, sort_key=lambda row: len(row['text']))
test_iterator = DataLoader(test_dataset, batch_sampler=test_sampler, collate_fn=collate_fn)

model = Model(vocab_size, num_class=num_class, routing_type=ROUTING_TYPE, num_iterations=NUM_ITERATIONS)
if MODEL_WEIGHT is not None:
    model.load_state_dict(torch.load('epochs/' + MODEL_WEIGHT))
margin_loss, focal_loss = MarginLoss(), FocalLoss()
if torch.cuda.is_available():
    model, margin_loss, focal_loss = model.to('cuda:{}'.format(GPU)), margin_loss.to(
iterations = 0
start = time.time()
best_dev_acc = -1
header = ' Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss Accuracy Dev/Accuracy'
dev_log_template = ' '.join(
    '{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'.split(','))
log_template = ' '.join(
    '{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{},{:12.4f},{}'.split(','))

makedirs(args.save_path)
print(header)

for epoch in range(args.epochs):
    n_correct, n_total = 0, 0
    train_sampler = BucketBatchSampler(
        train, args.batch_size, True, sort_key=lambda r: len(r['premise']))
    train_iterator = DataLoader(train,
                                batch_sampler=train_sampler,
                                collate_fn=collate_fn,
                                pin_memory=torch.cuda.is_available(),
                                num_workers=0)
    for batch_idx, (premise_batch, hypothesis_batch, label_batch) in enumerate(train_iterator):
        # switch model to training mode, clear gradient accumulators
        model.train()
        torch.set_grad_enabled(True)
        opt.zero_grad()
        iterations += 1
def test_pickleable():
    data_source = [[1], [2], [3], [4], [5]]
    sampler = BucketBatchSampler(data_source, batch_size=2, drop_last=False)
    pickle.dumps(sampler)
def run_experiment(datapath, src, trg, model_name, domain=None, smart_batch=False):
    start_time = time.time()

    ###########################################################################
    ###########################################################################

    wandb.init(project=WANDB_PROJECT, entity='salvacarrion', reinit=True)
    config = wandb.config
    config.model_name = MODEL_NAME
    config.domain = domain
    config.max_epochs = MAX_EPOCHS
    config.learning_rate = LEARNING_RATE
    config.batch_size = BATCH_SIZE
    config.max_tokens = MAX_TOKENS
    config.warmup_updates = WARMUP_UPDATES
    config.patience = PATIENCE
    config.acc_gradients = ACC_GRADIENTS
    config.weight_decay = WEIGHT_DECAY
    config.clip_gradients = CLIP_GRADIENTS
    config.multigpu = MULTIGPU
    config.device1 = str(DEVICE1)
    config.device2 = str(DEVICE2)
    config.num_workers = NUM_WORKERS
    config.tok_model = TOK_MODEL
    config.tok_size = TOK_SIZE
    config.tok_folder = TOK_FOLDER
    config.lowercase = LOWERCASE
    config.truncate = TRUNCATE
    config.max_length_truncate = MAX_LENGTH_TRUNC
    config.sampler_name = str(SAMPLER_NAME)
    print(config)

    ###########################################################################
    ###########################################################################

    checkpoint_path = os.path.join(datapath, DATASET_CHECKPOINT_NAME, f"{model_name}_{domain}_acc")

    # Load tokenizers
    src_tok, trg_tok = helpers.get_tokenizers(
        os.path.join(datapath, DATASET_TOK_NAME, TOK_FOLDER), src, trg,
        tok_model=TOK_MODEL, lower=LOWERCASE, truncation=TRUNCATE, max_length=MAX_LENGTH_TRUNC)

    # Load dataset
    datapath_clean = DATASET_CLEAN_SORTED_NAME if smart_batch else DATASET_CLEAN_NAME
    if TOK_MODEL == "bpe":  # Do not preprocess again when using bpe
        src_tok.apply_bpe = False
        trg_tok.apply_bpe = False
        datapath_clean = os.path.join(DATASET_TOK_NAME, TOK_FOLDER)

    train_ds = TranslationDataset(os.path.join(datapath, datapath_clean), src_tok, trg_tok, "train")
    val_ds = TranslationDataset(os.path.join(datapath, datapath_clean), src_tok, trg_tok, "val")

    # Build dataloaders
    kwargs_train = {}
    kwargs_val = {}
    if SAMPLER_NAME == "bucket":
        train_sampler = BucketBatchSampler(SequentialSampler(train_ds), batch_size=BATCH_SIZE, drop_last=False,
                                           sort_key=lambda i: len_func(train_ds, i))
        val_sampler = BucketBatchSampler(SequentialSampler(val_ds), batch_size=BATCH_SIZE, drop_last=False,
                                         sort_key=lambda i: len_func(val_ds, i))
    elif SAMPLER_NAME == "maxtokens":
        train_sampler = MaxTokensBatchSampler(SequentialSampler(train_ds), shuffle=True, batch_size=BATCH_SIZE,
                                              max_tokens=MAX_TOKENS, drop_last=False,
                                              sort_key=lambda i: len_func(train_ds, i))
        val_sampler = MaxTokensBatchSampler(SequentialSampler(val_ds), shuffle=False, batch_size=BATCH_SIZE,
                                            max_tokens=MAX_TOKENS, drop_last=False,
                                            sort_key=lambda i: len_func(val_ds, i))
    else:
        train_sampler = val_sampler = None
        kwargs_train = {"batch_size": BATCH_SIZE, "shuffle": True}
        kwargs_val = {"batch_size": BATCH_SIZE, "shuffle": False}

    # Define dataloaders
    train_loader = DataLoader(train_ds, num_workers=NUM_WORKERS,
                              collate_fn=lambda x: TranslationDataset.collate_fn(x, MAX_TOKENS),
                              pin_memory=True, batch_sampler=train_sampler, **kwargs_train)
    val_loader = DataLoader(val_ds, num_workers=NUM_WORKERS,
                            collate_fn=lambda x: TranslationDataset.collate_fn(x, MAX_TOKENS),
                            pin_memory=True, batch_sampler=val_sampler, **kwargs_val)

    # Instantiate model #1
    model = Transformer(d_model=256,
                        enc_layers=3, dec_layers=3,
                        enc_heads=8, dec_heads=8,
                        enc_dff_dim=512, dec_dff_dim=512,
                        enc_dropout=0.1, dec_dropout=0.1,
                        max_src_len=2000, max_trg_len=2000,
                        src_tok=src_tok, trg_tok=trg_tok,
                        static_pos_emb=True)  # .to(DEVICE1)
    model.apply(initialize_weights)
    print(f'The model has {model.count_parameters():,} trainable parameters')
    criterion = nn.CrossEntropyLoss(ignore_index=trg_tok.word2idx[trg_tok.PAD_WORD])
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Load weights
    # checkpoint_path = os.path.join(datapath, DATASET_CHECKPOINT_NAME, "transformer_multi30k_best_new.pt")
    # print(f"Loading weights from: {checkpoint_path}")
    # model.load_state_dict(torch.load(checkpoint_path))

    # Tensorboard (it needs some epochs to start working ~10-20)
    tb_writer = SummaryWriter(os.path.join(datapath, DATASET_LOGS_NAME, f"{model_name}"))
    wandb.watch(model)

    # Prepare model and data for acceleration
    model, optimizer, train_loader, val_loader = accelerator.prepare(model, optimizer, train_loader, val_loader)

    # Train and validate model
    fit(model, optimizer, train_loader=train_loader, val_loader=val_loader, epochs=MAX_EPOCHS,
        criterion=criterion, checkpoint_path=checkpoint_path,
        src_tok=src_tok, trg_tok=trg_tok, tb_writer=tb_writer)

    print("************************************************************")
    epoch_hours, epoch_mins, epoch_secs = helpers.epoch_time(start_time, end_time=time.time())
    print(f'Time experiment: {epoch_hours}h {epoch_mins}m {epoch_secs}s')
    print("************************************************************")
    print("Done!")
def train(arg):
    version = arg.version
    device = arg.device
    block = arg.block
    batch_size = arg.batch_size
    eval_batch_size = arg.eval_batch_size
    epoch_num = arg.epoch_num
    learning_rate = arg.learning_rate
    early_stop_epoch = arg.early_stop_epoch
    valid_sample_num = arg.valid_sample_num
    train_sample_num = arg.train_sample_num
    max_len = arg.max_len
    POSSIBLE_BATCH_SIZE = arg.possible_batch_size

    # build collate_fn function
    def my_collate_fn(data):
        # x, pad_id = 1
        # bert, pad_id = 0, cls = 101, seq = 102
        length = max(d[0].shape[0] for d in data)
        length = min(max_len, length)
        x = np.empty([len(data), length + 2], dtype=np.int64)
        x.fill(0)
        x[:, 0] = 101
        x[:, -1] = 102
        for i, d in enumerate(data):
            l = min(d[0].shape[0], max_len)
            x[i, 1:l + 1] = d[0][-l:]
        y = np.vstack([d[1] for d in data])

        # turn to torch tensor
        x = torch.LongTensor(x)
        y = torch.FloatTensor(y)
        return x, y

    # load data
    dl_model_dir = os.path.join(model_dir, arg.data_name, "bert", version)
    data_cached_path = os.path.join(dl_model_dir, "data.h5")
    os.makedirs(dl_model_dir, exist_ok=True)
    print(f"output model and all the info to '{dl_model_dir}'")

    # save config
    with open(os.path.join(dl_model_dir, "config.json"), 'w', encoding='utf-8') as outfile:
        json.dump(
            {
                "block": block,
                "batch_size": batch_size,
                "epoch_num": epoch_num,
                "learning_rate": learning_rate,
                "early_stop_epoch": early_stop_epoch,
                "valid_sample_num": valid_sample_num,
                "train_sample_num": train_sample_num,
                "max_len": max_len,
            },
            outfile,
            indent=4)

    if arg.data_name == "bookcorpus":
        if arg.history == 1:
            _, x_train, y_train = load_text_data(block=block, phase="train", target_model=arg.model_type, verbose=True)
            _, x_valid, y_valid = load_text_data(block=block, phase="valid", target_model=arg.model_type, verbose=True)
            _, x_test, y_test = load_text_data(block=block, phase="test", target_model=arg.model_type, verbose=True)
        else:
            _, x_train, y_train = load_text_data_long(block=block, phase="train", target_model=arg.model_type,
                                                      verbose=True, history=arg.history)
            _, x_valid, y_valid = load_text_data_long(block=block, phase="valid", target_model=arg.model_type,
                                                      verbose=True, history=arg.history)
            _, x_test, y_test = load_text_data_long(block=block, phase="test", target_model=arg.model_type,
                                                    verbose=True, history=arg.history)
    elif arg.data_name == "coda19":
        _, x_train, y_train = coda_load_text_data(block=block, phase="train", target_model=arg.model_type, verbose=True)
        _, x_valid, y_valid = coda_load_text_data(block=block, phase="valid", target_model=arg.model_type, verbose=True)
        _, x_test, y_test = coda_load_text_data(block=block, phase="test", target_model=arg.model_type, verbose=True)
    else:
        print(f"{arg.data_name} not supported yet!")
        quit()

    if arg.downsample != -1:
        random_index = np.random.RandomState(5516).permutation(x_train.shape[0])[:arg.downsample]
        x_train, y_train = x_train[random_index], y_train[random_index]

    random_index = np.random.permutation(x_valid.shape[0])[:valid_sample_num]
    x_valid, y_valid = x_valid[random_index], y_valid[random_index]
    random_index = np.random.permutation(x_test.shape[0])[:]
    x_test, y_test = x_test[random_index], y_test[random_index]

    print("Train", x_train.shape, y_train.shape)
    print("Test", x_test.shape, y_test.shape)
    print("Valid", x_valid.shape, y_valid.shape)

    x_valid, x_test = x_valid.tolist(), x_test.tolist()
    validation = data.DataLoader(
        StoryDataset(x_valid, y_valid),
        batch_sampler=BucketBatchSampler(
            torch.utils.data.sampler.SequentialSampler(x_valid),
            batch_size=batch_size,
            drop_last=True,
            sort_key=lambda i: x_valid[i].shape[0],
            bucket_size_multiplier=100),
        num_workers=3,
        collate_fn=my_collate_fn,
    )
    testing = data.DataLoader(
        StoryDataset(x_test, y_test),
        batch_sampler=BucketBatchSampler(
            torch.utils.data.sampler.SequentialSampler(x_test),
            batch_size=batch_size,
            drop_last=True,
            sort_key=lambda i: x_test[i].shape[0],
            bucket_size_multiplier=100),
        num_workers=3,
        collate_fn=my_collate_fn,
    )

    if arg.model_type == "bert":
        model = BertRegressor(output_size=y_train.shape[1])
    elif arg.model_type == "scibert":
        pretrained_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
        pretrained_config = AutoConfig.from_pretrained("allenai/scibert_scivocab_uncased")
        model = BertRegressor(output_size=y_train.shape[1], model=pretrained_model, config=pretrained_config)
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_function = lambda y_pred, y_batch: 1 - F.cosine_similarity(y_pred, y_batch).mean()

    best_epoch = 0
    best_cosine = 0.0
    stopper = EarlyStop(mode="max", history=early_stop_epoch)
    for epoch in range(1, epoch_num + 1):
        # generate data
        if arg.downsample == -1 or arg.downsample > train_sample_num:
            random_index = np.random.permutation(x_train.shape[0])[:train_sample_num]
            x_train_epoch, y_train_epoch = x_train[random_index], y_train[random_index]
            x_train_epoch = x_train_epoch.tolist()
        else:
            x_train_epoch, y_train_epoch = x_train, y_train
            x_train_epoch = x_train_epoch.tolist()

        training = data.DataLoader(
            StoryDataset(x_train_epoch, y_train_epoch),
            batch_sampler=BucketBatchSampler(
                torch.utils.data.sampler.SequentialSampler(x_train_epoch),
                batch_size=batch_size if POSSIBLE_BATCH_SIZE == -1 else POSSIBLE_BATCH_SIZE,
                drop_last=True,
                sort_key=lambda i: x_train_epoch[i].shape[0],
                bucket_size_multiplier=100),
            num_workers=3,
            collate_fn=my_collate_fn,
        )

        # training
        model.train()
        total_loss = 0
        total_acc = 0
        total_count = len(training.dataset) // training.batch_sampler.batch_size
        error_case = 0
        if POSSIBLE_BATCH_SIZE != -1:
            accumulation_steps = batch_size // POSSIBLE_BATCH_SIZE
        for count, (x_batch, y_batch) in enumerate(training, 1):
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            try:
                y_pred = model(x_batch)
                loss = loss_function(y_pred, y_batch)
                loss.backward()
                total_loss += loss.item()

                if POSSIBLE_BATCH_SIZE == -1:
                    optimizer.step()
                    optimizer.zero_grad()
                elif count % accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
            except RuntimeError:
                # print(x_batch.shape, y_batch.shape)
                error_case += 1
                continue

            # compute cosine
            total_acc += F.cosine_similarity(y_pred, y_batch, dim=1).mean().item()
            print("\x1b[2K\rEpoch: {} / {} [{:.2f}%] Loss: {:.5f} Acc: {:.5f} Error: {}".format(
                epoch, epoch_num, 100.0 * count / total_count,
                total_loss / count, total_acc / count, error_case), end="")
        print()

        if epoch % 1 == 0:
            cosine = evaluate(model, validation, device=device)
            if cosine > best_cosine:
                best_model = copy.deepcopy(model.state_dict())
                best_cosine = cosine
                best_epoch = epoch

            # check early stopping
            if stopper.check(cosine):
                print("Early Stopping at Epoch = ", epoch)
                break

    # finish training
    print("Loading model from epoch {}".format(best_epoch))
    torch.save(best_model, os.path.join(dl_model_dir, "best_model.pt"))
    model.load_state_dict(best_model)
    test_cosine = evaluate(model, testing, device)
    print("Testing Cosine = ", test_cosine)

    with open(os.path.join(dl_model_dir, "result.json"), 'w', encoding='utf-8') as outfile:
        json.dump(
            {
                "cosine": float(test_cosine),
                "best_cosine": float(best_cosine),
                "best_epoch": best_epoch,
                "max_len": max_len,
            },
            outfile,
            indent=4)
def train_f(config):
    run_name = 'run_%d' % run_config['run']
    run_config['run'] = run_config['run'] + 1
    visdom_logger.new_run(run_name)

    model_path = Path('/tmp/models/')
    delete_checkpoint(model_path)

    train_batch_sampler = FlexibleBucketBatchSampler(
        train, config.batch_size, sampler=train_sampler, drop_last=True,
        sort_key=lambda r: len(r['text']))
    train_loader = DataLoader(train,
                              batch_sampler=train_batch_sampler,
                              collate_fn=collate_fn,
                              pin_memory=config.use_cuda,
                              num_workers=0)

    dev_batch_sampler = FlexibleBucketBatchSampler(
        train, config.test_batch_size, drop_last=True, sampler=dev_sampler,
        sort_key=lambda r: len(r['text']))
    dev_loader = DataLoader(train,
                            batch_sampler=dev_batch_sampler,
                            collate_fn=collate_fn,
                            pin_memory=config.use_cuda,
                            num_workers=0)

    test_sampler = BucketBatchSampler(test, config.test_batch_size, drop_last=True,
                                      sort_key=lambda r: len(r['text']))
    test_loader = DataLoader(test,
                             batch_sampler=test_sampler,
                             collate_fn=collate_fn,
                             pin_memory=config.use_cuda,
                             num_workers=0)

    embedding = nn.Embedding(text_encoder.vocab_size, config.d_embedding)
    if config.word_vectors_freeze:
        embedding.weight.requires_grad = False
    if config.word_vectors:
        # Load word vectors
        word_vectors = word_to_vector.aliases[config.word_vectors](cache=config.vector_cache_dir)
        for i, token in enumerate(text_encoder.vocab):
            embedding.weight.data[i] = word_vectors[token]
        print('Found vectors for %d tokens in vocabulary' %
              len([t for t in text_encoder.vocab if t in word_vectors.stoi]))

    model = LSTMClassifier(d_in=embedding.embedding_dim,
                           d_out=label_encoder.vocab_size,
                           d_hidden=config.d_hidden,
                           dropout=config.dropout,
                           embedding=embedding)
    model.to(device)

    optimizer_params = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = torch.optim.SGD(optimizer_params, lr=config.lr, momentum=config.momentum)

    trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)
    evaluator_train = create_supervised_evaluator(
        model, metrics={'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss)}, device=device)
    evaluator_dev = create_supervised_evaluator(
        model, metrics={'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss)}, device=device)

    visdom_logger.attach_trainer(trainer)
    visdom_logger.attach_evaluator(evaluator_train, trainer, phase='train')
    visdom_logger.attach_evaluator(evaluator_dev, trainer, phase='dev')

    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch_: 1. / (1 + config.lr_decay * (epoch_ - 1)))

    # scoring function for early stopping and checkpointing
    def score_function(engine):
        dev_loss = engine.state.metrics['nll']
        return -dev_loss

    early_stopping = EarlyStopping(patience=15, score_function=score_function, trainer=trainer)

    def checkpoint_score_function(engine):
        dev_accuracy = engine.state.metrics['accuracy']
        return dev_accuracy

    checkpoint = ModelCheckpoint('/tmp/models', 'checkpoint',
                                 score_function=checkpoint_score_function,
                                 n_saved=1, create_dir=True, score_name="dev_accuracy")

    # lets train!
    train_model(model=model,
                trainer=trainer,
                epochs=config.epochs,
                evaluator_train=evaluator_train,
                evaluator_dev=evaluator_dev,
                train_loader=train_loader,
                dev_loader=dev_loader,
                lr_scheduler=lr_scheduler,
                early_stopping=early_stopping if config.early_stopping else None,
                checkpoint=checkpoint if config.checkpoint else None)

    # load checkpointed (best) model and evaluate on test loader
    model = torch.load(list(model_path.glob('checkpoint_model*.pth'))[0])
    test_evaluator = create_supervised_evaluator(
        model, metrics={'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss)}, device=device)
    test_evaluator.run(test_loader)
    metrics = test_evaluator.state.metrics
    print("Test Results: Avg accuracy: {:.2f} Avg loss: {:.2f}".format(metrics['accuracy'], metrics['nll']))

    test_evaluator.run(dev_loader)
    metrics = test_evaluator.state.metrics
    return metrics['nll']
import torch
import xgboost as xgb
import pandas as pd
from torchnlp.encoders.text import WhitespaceEncoder
from torchnlp.samplers import BucketBatchSampler
from torchnlp.utils import collate_tensors
from torchnlp.encoders.text import stack_and_pad_tensors
from torchnlp.nn import LockedDropout

loaded_data = ["now this ain't funny", "so don't you dare laugh"]
encoder = WhitespaceEncoder(loaded_data)
encoded_data = [encoder.encode(example) for example in loaded_data]
print("encoded_data", encoded_data)

encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)]
train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data)
train_batch_sampler = BucketBatchSampler(
    train_sampler, batch_size=2, drop_last=False,
    sort_key=lambda i: encoded_data[i].shape[0])

batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler]
batches = [collate_tensors(batch, stack_tensors=stack_and_pad_tensors) for batch in batches]
print("batches=", batches)
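# Follow-up sketch (not part of the original snippet): instead of collating the bucketed
# batches by hand as above, the same batch sampler can be handed to a standard DataLoader
# via batch_sampler, with padding done in collate_fn. Reuses encoded_data and
# train_batch_sampler from above; stack_and_pad_tensors returns the padded tensor together
# with the original lengths.
from torch.utils.data import DataLoader

loader = DataLoader(encoded_data,
                    batch_sampler=train_batch_sampler,
                    collate_fn=stack_and_pad_tensors)
for padded, lengths in loader:
    print(padded.shape, lengths)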
def main(params_file, batch_size, epochs, model_file_name, learning_rate=1e-3, weight_decay=1e-5,
         n_workers=6, use_pretrained_embs=False):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    seed = 0
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Files to Load
    train_json_file = './cleaned_datasets/intro_1_para/train.txt'
    val_json_file = './cleaned_datasets/intro_1_para/val.txt'
    word2vec_model_path = '/word2vec_100D.w2v'

    print('[' + str(datetime.datetime.now()) + '] : Reading Files')
    if use_pretrained_embs:
        wordsModel = load_w2v_model(word2vec_model_path)
        vocab = sorted(list(wordsModel.wv.vocab))
    else:
        vocab = None

    print('[' + str(datetime.datetime.now()) + '] : Creating Dataset Objects')
    train_dataset = WikiDataset.fromJsonFile(train_json_file, vocab=vocab, mode='train')
    val_dataset = WikiDataset.fromJsonFile(val_json_file,
                                           text_encoder=train_dataset.text_encoder,
                                           label_encoder=train_dataset.label_encoder,
                                           vocab=train_dataset.text_encoder.vocab,
                                           mode='train')

    trainset = DataLoader(train_dataset,
                          num_workers=n_workers,
                          batch_sampler=BucketBatchSampler(train_dataset.data['data'],
                                                           batch_size=batch_size,
                                                           drop_last=True,
                                                           sort_key=lambda a: -len(a['intro'].split())),
                          collate_fn=train_dataset.collate_fn)
    valset = DataLoader(val_dataset,
                        num_workers=n_workers,
                        batch_sampler=BucketBatchSampler(val_dataset.data['data'],
                                                         batch_size=batch_size,
                                                         drop_last=True,
                                                         sort_key=lambda a: -len(a['intro'].split())),
                        collate_fn=val_dataset.collate_fn)

    print('[' + str(datetime.datetime.now()) + '] : Reading params_file')
    with open(params_file, 'r') as stream:
        params = yaml.load(stream)
    params['emb_size'] = (train_dataset.vocab_size, 100)
    params['num_classes'] = train_dataset.label_encoder.vocab_size

    print('[' + str(datetime.datetime.now()) + '] : Creating Model Object')
    classifier = create_model(params)
    if use_pretrained_embs:
        print('[' + str(datetime.datetime.now()) + '] : Creating Embedding Matrix')
        embedding_matrix = create_embedding_matrix(wordsModel, train_dataset)
        classifier.embeddings.weight = nn.Parameter(embedding_matrix)
        del embedding_matrix
    classifier.to(device)

    criterion = nn.CrossEntropyLoss(weight=torch.tensor([0, 1.36 / 1, 1.36 / 0.36]).to(device))
    optimizer = optim.Adam(classifier.parameters(), lr=learning_rate, weight_decay=weight_decay)

    print('[' + str(datetime.datetime.now()) + '] : Training Model ...')
    classifier = train_model(classifier, epochs, trainset, valset, criterion, optimizer, device, model_file_name)

    model_utils = {
        'text_encoder': train_dataset.text_encoder,
        'label_encoder': train_dataset.label_encoder
    }
    joblib.dump(model_utils, model_file_name + str('_model_utils.pkl'))

    with open(params_file, 'w') as stream:
        yaml.dump(params, stream)