def lc_train(author1, author2, model):
    # evaluate linear classifier model
    # create and build evaluation dataset
    dataset = TextDataset([author1, author2], norm=None, vectorizer='tfidf')
    X, _, Y, _ = dataset.build_dataset(eval=True)
    # load model from pickle
    with open(model, 'rb') as model_file:
        model = pickle.load(model_file)
    # predict dataset labels
    predictions = model.predict(X)
    predictions_proba = model.predict_proba(X)
    # print results
    print(f'accuracy: {accuracy_score(Y, predictions) * 100}%')
    print(f'logloss: {log_loss(Y, predictions_proba)}')
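# Usage sketch (assumption): evaluating a saved Naive Bayes model on two author
# files. The author text paths are hypothetical placeholders; the model path
# matches the one written by the training function further below.
lc_train('data/author1.txt', 'data/author2.txt', 'models/nb_model.sav')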
def nn_train(author1, author2, model, w2v_path):
    # evaluate neural network classifier
    # define batch size
    batch_size = 128
    # select device (CPU | GPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # create and build evaluation dataset
    dataset = TextDataset([author1, author2],
                          norm=None,
                          vectorizer='embed',
                          w2v_path=w2v_path)
    X, _, Y, _ = dataset.build_dataset(eval=True)
    # load model from file
    model = torch.load(model)
    model.to(device)
    model.eval()
    valid_steps = int(len(X) / batch_size)
    predictions = list()
    # no gradients are needed for evaluation
    with torch.no_grad():
        # loop over the full evaluation dataset
        for step in tqdm(range(valid_steps)):
            # get x and y batches
            x_batch = X[step * batch_size:(step + 1) * batch_size]
            x_batch = np.stack(x_batch, axis=0)
            x_batch = torch.from_numpy(x_batch).float().to(device)
            y_batch = Y[step * batch_size:(step + 1) * batch_size]
            y_batch = np.stack(y_batch, axis=0)
            # model forward pass
            y_out = model(x_batch)
            y_out = torch.squeeze(y_out, dim=1).cpu().detach().numpy()
            # save predictions
            predictions.append(y_out)
        # perform inference on the remaining samples (if any)
        x_batch = X[valid_steps * batch_size:]
        if len(x_batch) > 0:
            x_batch = np.stack(x_batch, axis=0)
            x_batch = torch.from_numpy(x_batch).float().to(device)
            y_batch = Y[valid_steps * batch_size:]
            y_batch = np.stack(y_batch, axis=0)
            y_out = model(x_batch)
            y_out = torch.squeeze(y_out, dim=1).cpu().detach().numpy()
            predictions.append(y_out)
    # concatenate all predictions
    predictions_proba = np.concatenate(predictions, axis=0)
    predictions = predictions_proba > 0.5
    # print results
    print(f'accuracy: {accuracy_score(Y, predictions) * 100}%')
    print(
        f'logloss: {F.binary_cross_entropy(torch.from_numpy(predictions_proba).float(), torch.from_numpy(np.array(Y)).float())}'
    )
def lc_train(config):
    # train linear classifier (Naive Bayes)
    # create and build dataset
    dataset = TextDataset(config['txt_list'],
                          norm=config['norm'],
                          vectorizer=config['vectorizer'])
    xtrain, xvalid, ytrain, yvalid = dataset.build_dataset()
    # define model
    model = StylometryLC(truncation=config['truncation'])
    # fit model
    model.fit(xtrain, ytrain)
    # infer on validation data
    predictions = model.predict(xvalid)
    predictions_proba = model.predict_proba(xvalid)
    # dump model pickle
    with open('models/nb_model.sav', 'wb') as model_file:
        pickle.dump(model, model_file)
    # print results
    print(f'accuracy: {accuracy_score(yvalid, predictions) * 100}%')
    print(f'logloss: {log_loss(yvalid, predictions_proba)}')
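# Usage sketch (assumption): the config keys below are the ones read by
# lc_train above; the concrete values and file paths are illustrative only.
lc_config = {
    'txt_list': ['data/author1.txt', 'data/author2.txt'],  # hypothetical paths
    'norm': None,           # text normalisation option passed to TextDataset
    'vectorizer': 'tfidf',  # 'tfidf' matches the vectorizer used by the evaluation code above
    'truncation': 100,      # StylometryLC truncation; value is a guess
}
lc_train(lc_config)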
parser = argparse.ArgumentParser(
    description='Train a simple LSTM language model.')
parser.add_argument('weights_file', help='Path to model weights file')
parser.add_argument('train_dataset',
                    help='Path to processed train dataset file')
parser.add_argument('valid_dataset',
                    help='Path to processed validation dataset file')
parser.add_argument('test_dataset',
                    help='Path to processed test dataset file')
parser.add_argument(
    '--vocab_unk_rate',
    type=float,
    default=-1.0,
    help='UNKing rate to use for the vocabulary; by default the true UNK '
    'rate based on the validation set OOV rate is used')
args = parser.parse_args()

train_dataset = TextDataset(args.train_dataset, 50)
valid_dataset = TextDataset(args.valid_dataset, 50)
test_dataset = TextDataset(args.test_dataset, 50)

if args.vocab_unk_rate == -1.0:
    train_dataset.unk_vocabulary_with_true_oov_rate(valid_dataset)
elif args.vocab_unk_rate > 0:
    train_dataset.unk_vocabulary_with_oov_rate(args.vocab_unk_rate)

test_dataset.use_vocabulary_from_dataset(train_dataset)

max_word_id = train_dataset.vocabulary.get_max_word_id()
lm_min_word_id = train_dataset.vocabulary.get_min_valid_lm_output_word_id()

dataset_transformer = transforms.Compose(
    [Seq2Seq(),
     RemapUsingMinWordID('target', lm_min_word_id),
parser.add_argument('valid_dataset',
                    help='Path to processed validation dataset file')
parser.add_argument(
    '--vocab_unk_rate',
    type=float,
    default=-1.0,
    help='UNKing rate to use for the vocabulary; by default the true UNK '
    'rate based on the validation set OOV rate is used')
args = parser.parse_args()

n_epochs = 1000
train_samples_per_epoch = 1000
valid_samples_per_epoch = 100
batch_size = 4
max_sequence_length = 50

train_dataset = TextDataset(args.train_dataset, max_sequence_length)
valid_dataset = TextDataset(args.valid_dataset, max_sequence_length)

if args.vocab_unk_rate == -1.0:
    train_dataset.unk_vocabulary_with_true_oov_rate(valid_dataset)
elif args.vocab_unk_rate > 0:
    train_dataset.unk_vocabulary_with_oov_rate(args.vocab_unk_rate)

max_word_id = train_dataset.vocabulary.get_max_word_id()
lm_min_word_id = train_dataset.vocabulary.get_min_valid_lm_output_word_id()
vocabulary_size = train_dataset.vocabulary.get_vocab_size()

valid_dataset.use_vocabulary_from_dataset(train_dataset)

print(f'Vocabulary Size: {vocabulary_size}')

dataset_transformer = transforms.Compose(
    [Seq2Seq(),
def main(args):
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    RANDOM_SEED = 42
    dataset = load_dataset("yelp_polarity", split="train")
    TRAIN_SIZE = len(dataset) - 2_000
    VALID_SIZE = 1_000
    TEST_SIZE = 1_000

    train_test_split = dataset.train_test_split(train_size=TRAIN_SIZE,
                                                seed=RANDOM_SEED)
    train_dataset = train_test_split["train"]
    test_val_dataset = train_test_split["test"].train_test_split(
        train_size=VALID_SIZE, test_size=TEST_SIZE, seed=RANDOM_SEED)
    val_dataset, test_dataset = test_val_dataset["train"], test_val_dataset[
        "test"]

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)

    datasets = OrderedDict()
    datasets['train'] = TextDataset(train_dataset, tokenizer,
                                    args.max_sequence_length,
                                    not args.disable_sent_tokenize)
    datasets['valid'] = TextDataset(val_dataset, tokenizer,
                                    args.max_sequence_length,
                                    not args.disable_sent_tokenize)
    if args.test:
        datasets['test'] = TextDataset(test_dataset, tokenizer,
                                       args.max_sequence_length,
                                       not args.disable_sent_tokenize)

    print(
        f"Loading {args.model_name} model. Setting {args.trainable_layers} trainable layers."
    )
    encoder = AutoModel.from_pretrained(args.model_name, return_dict=True)
    if not args.train_embeddings:
        for p in encoder.embeddings.parameters():
            p.requires_grad = False
    encoder_layers = encoder.encoder.layer
    if args.trainable_layers > len(encoder_layers):
        warnings.warn(
            f"You are asking to train {args.trainable_layers} layers, but this model has only {len(encoder_layers)}"
        )
    # freeze every encoder layer except the last args.trainable_layers ones
    for layer in range(len(encoder_layers) - args.trainable_layers):
        for p in encoder_layers[layer].parameters():
            p.requires_grad = False

    params = dict(vocab_size=datasets['train'].vocab_size,
                  embedding_size=args.embedding_size,
                  rnn_type=args.rnn_type,
                  hidden_size=args.hidden_size,
                  word_dropout=args.word_dropout,
                  embedding_dropout=args.embedding_dropout,
                  latent_size=args.latent_size,
                  num_layers=args.num_layers,
                  bidirectional=args.bidirectional,
                  max_sequence_length=args.max_sequence_length)
    model = SentenceVAE(encoder=encoder, tokenizer=tokenizer, **params)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    with open(os.path.join(save_model_path, 'model_params.json'), 'w') as f:
        json.dump(params, f, indent=4)
    with open(os.path.join(save_model_path, 'train_args.json'), 'w') as f:
        json.dump(vars(args), f, indent=4)

    def kl_anneal_function(anneal_function, step, k, x0):
        # keep the KL weight at its initial value until step x0
        if step <= x0:
            return args.initial_kl_weight
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0 - 2500))))
        elif anneal_function == 'linear':
            return min(1, step / x0)

    NLL = torch.nn.NLLLoss(ignore_index=datasets['train'].pad_idx,
                           reduction='sum')

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k,
                x0):
        # cut-off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    params = [{
        'params': model.encoder.parameters(),
        'lr': args.encoder_learning_rate
    }, {
        'params': [
            *model.decoder_rnn.parameters(), *model.hidden2mean.parameters(),
            *model.hidden2logv.parameters(), *model.latent2hidden.parameters(),
            *model.outputs2vocab.parameters()
        ]
    }]
    optimizer = torch.optim.Adam(params,
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available(
    ) else torch.Tensor

    step = 0
    for epoch in range(args.epochs):

        for split in splits:

            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=(split == 'train'),
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available(),
                                     collate_fn=DataCollator(tokenizer))

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'],
                                            batch['attention_mask'],
                                            batch['length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                                                       batch['length'], mean,
                                                       logv,
                                                       args.anneal_function,
                                                       step, args.k, args.x0)

                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['ELBO'] = torch.cat(
                    (tracker['ELBO'], loss.data.view(1, -1)), dim=0)

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(),
                                      NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(),
                                      KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(),
                                      KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(
                        data_loader):
                    print(
                        "%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                        % (split.upper(), iteration, len(data_loader) - 1,
                           loss.item(), NLL_loss.item() / batch_size,
                           KL_loss.item() / batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(
                        batch['target'].tolist(), tokenizer=tokenizer)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f" %
                  (split.upper(), epoch, args.epochs, tracker['ELBO'].mean()))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(),
                                  torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences, the encoded latent space and
            # generated sequences
            if split == 'valid':
                samples, _ = model.inference(z=tracker['z'])
                generated_sents = idx2word(samples.tolist(), tokenizer)
                sents = [{
                    'original': target,
                    'generated': generated
                } for target, generated in zip(tracker['target_sents'],
                                               generated_sents)]
                dump = {'sentences': sents, 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(
                        os.path.join('dumps/' + ts +
                                     '/valid_E%i.json' % epoch),
                        'w') as dump_file:
                    json.dump(dump, dump_file, indent=3)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path,
                                               "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
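# Illustration (assumption): the KL-weight schedules used in loss_fn above,
# written as a standalone function so they can be inspected without running
# training. The k, x0 and initial_kl_weight values below are example numbers,
# not defaults taken from this repository.
import numpy as np

def kl_weight(anneal_function, step, k, x0, initial_kl_weight=0.0):
    # the weight is held at its initial value until step x0, then annealed
    if step <= x0:
        return initial_kl_weight
    if anneal_function == 'logistic':
        return float(1 / (1 + np.exp(-k * (step - x0 - 2500))))
    elif anneal_function == 'linear':
        return min(1, step / x0)

# logistic schedule reaches 0.5 at step x0 + 2500 and approaches 1 afterwards
for s in (0, 2500, 5000, 10000):
    print(s, kl_weight('logistic', s, k=0.0025, x0=2500))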
args = parser.parse_args()

n_epochs = 1000
train_samples_per_epoch = 80000
valid_samples_per_epoch = 500
batch_size = 24
max_sequence_length = 50

logfile_prefix = os.path.splitext(args.log_file)[0]
logfile_dir = os.path.dirname(args.log_file)
weight_files = get_model_weight_files(logfile_dir)

lm_train_vocab = None
if not args.character_level and not args.phoneme_level:
    lm_train_dataset = TextDataset(args.lm_train_dataset, max_sequence_length)
    lm_valid_dataset = TextDataset(args.lm_valid_dataset, max_sequence_length)
    if args.vocab_unk_rate == -1.0:
        lm_train_dataset.unk_vocabulary_with_true_oov_rate(lm_valid_dataset)
    elif args.vocab_unk_rate > 0:
        lm_train_dataset.unk_vocabulary_with_oov_rate(args.vocab_unk_rate)
    lm_train_vocab = lm_train_dataset.vocabulary

train_dataset = SpeechDataset(args.train_dataset,
                              vocabulary=lm_train_vocab,
                              character_level=args.character_level,
                              phoneme_level=args.phoneme_level)
def nn_train(config):
    # train neural network classifier
    # select device (CPU | GPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # create and build dataset
    dataset = TextDataset(config['txt_list'],
                          norm=config['norm'],
                          vectorizer=config['vectorizer'],
                          w2v_path=config['w2v_path'])
    xtrain, xvalid, ytrain, yvalid = dataset.build_dataset()
    # define network configuration
    network_config = {
        'emb_dim': config['emb_dim'],
        'rnn_hid_dim': config['rnn_hid_dim'],
        'dense_hid_dim': config['dense_hid_dim'],
    }
    # define model
    model = StylometryNN(network_config)
    model.to(device)
    model.train()
    # define BCE loss
    criterion = nn.BCELoss()
    # define optimizer
    optimizer = Adam(model.parameters(),
                     lr=config['initial_lr'],
                     weight_decay=config['weight_decay'])
    train_steps = int(len(xtrain) / config['batch_size'])
    valid_steps = int(len(xvalid) / config['batch_size'])
    best_accuracy = 0.5
    # training loop
    for epoch in range(config['num_epochs']):
        total_loss = 0.0
        # loop over all training dataset samples
        for step in tqdm(range(train_steps)):
            # get x and y batches
            x_batch = xtrain[step * config['batch_size']:(step + 1) *
                             config['batch_size']]
            x_batch = np.stack(x_batch, axis=0)
            x_batch = torch.from_numpy(x_batch).float().to(device)
            y_batch = ytrain[step * config['batch_size']:(step + 1) *
                             config['batch_size']]
            y_batch = np.stack(y_batch, axis=0)
            y_batch = torch.from_numpy(y_batch).float().to(device)
            # model forward pass
            y_out = model(x_batch)
            y_out = torch.squeeze(y_out, dim=1)
            # calculate loss
            loss = criterion(y_out, y_batch)
            total_loss += loss.item()
            # back propagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(
            f'Epoch [{int(epoch + 1)}/{int(config["num_epochs"])}], Total Epoch Loss: {total_loss / train_steps}'
        )
        model.eval()
        accuracies = list()
        losses = list()
        # loop over all validation dataset samples (no gradients needed)
        with torch.no_grad():
            for step in tqdm(range(valid_steps)):
                # get x and y batches
                x_batch = xvalid[step * config['batch_size']:(step + 1) *
                                 config['batch_size']]
                x_batch = np.stack(x_batch, axis=0)
                x_batch = torch.from_numpy(x_batch).float().to(device)
                y_batch = yvalid[step * config['batch_size']:(step + 1) *
                                 config['batch_size']]
                y_batch = np.stack(y_batch, axis=0)
                # model forward pass
                y_out = model(x_batch)
                y_out = torch.squeeze(y_out, dim=1).cpu().detach().numpy()
                # calculate loss and accuracy
                y_out_labels = y_out > 0.5
                accuracies.append(accuracy_score(y_batch, y_out_labels))
                losses.append(
                    F.binary_cross_entropy(
                        torch.from_numpy(y_out).float(),
                        torch.from_numpy(y_batch).float()))
        # print results
        print(f'Validation accuracy: {(sum(accuracies) / len(accuracies)) * 100}%')
        print(f'Validation logloss: {sum(losses) / len(losses)}')
        # save model (based on best validation accuracy)
        if sum(accuracies) / len(accuracies) > best_accuracy:
            torch.save(model, 'models/deep_model.pt')
            best_accuracy = sum(accuracies) / len(accuracies)
        model.train()
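# Usage sketch (assumption): the keys below are exactly those read by nn_train
# above; every concrete value and path is illustrative only.
nn_config = {
    'txt_list': ['data/author1.txt', 'data/author2.txt'],  # hypothetical paths
    'norm': None,
    'vectorizer': 'embed',         # matches the embedding-based evaluation code above
    'w2v_path': 'models/w2v.bin',  # hypothetical word2vec path
    'emb_dim': 300,
    'rnn_hid_dim': 128,
    'dense_hid_dim': 64,
    'initial_lr': 1e-3,
    'weight_decay': 1e-5,
    'batch_size': 128,
    'num_epochs': 10,
}
nn_train(nn_config)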