def train(**kwargs):
    # Override the module-level options with any keyword arguments passed in.
    for k_, v_ in kwargs.items():
        setattr(options, k_, v_)

    training_set = TextDataset(path='data/train/train.csv',
                               model='wordvec/skipgram.bin',
                               max_length=options.max_length,
                               word_dim=options.word_dim)
    training_loader = Data.DataLoader(dataset=training_set,
                                      batch_size=options.batch_size,
                                      shuffle=True,
                                      drop_last=True)

    model = TextCNN(options.word_dim, options.max_length,
                    training_set.encoder.classes_.shape[0])
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=options.learning_rate)

    for epoch in tqdm(range(options.epochs)):
        loss_sum = 0
        for data, label in tqdm(training_loader):
            if torch.cuda.is_available():
                data = data.cuda()
                label = label.cuda()
            out = model(data)
            # `criteration` is the module-level loss function (e.g. nn.CrossEntropyLoss()).
            loss = criteration(out, label.squeeze().long())
            loss_sum += loss.item() / options.batch_size
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        tqdm.write(f'epoch {epoch + 1}: loss = {loss_sum / len(training_set.data)}')
        # TextCNN is assumed to provide its own save() helper.
        model.save(f'checkpoints/loss-{loss_sum / len(training_set.data)}.pt')

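# Each snippet in this collection constructs its own project-specific TextDataset.
# For context only, a minimal sketch of such a torch.utils.data.Dataset (the class
# name, fields, and constructor here are illustrative assumptions, not taken from
# any of the projects above) might look like:
import torch
from torch.utils.data import Dataset

class MinimalTextDataset(Dataset):
    """Illustrative only: yields (token-id tensor, label tensor) pairs."""

    def __init__(self, encoded_texts, labels):
        # encoded_texts: list of equal-length lists of token ids
        # labels: list of integer class labels
        self.data = [torch.tensor(t, dtype=torch.long) for t in encoded_texts]
        self.labels = [torch.tensor(l, dtype=torch.long) for l in labels]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
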
def main():
    testset = TextDataset(args.testset)
    test_loader = DataLoader(dataset=testset, batch_size=args.test_batch,
                             drop_last=False, shuffle=False,
                             collate_fn=synth_collate_fn, pin_memory=True)

    t2m = Text2Mel().to(DEVICE)
    ssrn = SSRN().to(DEVICE)

    # Restore the latest Text2Mel checkpoint.
    mname = type(t2m).__name__
    ckpt = sorted(glob.glob(os.path.join(args.logdir, mname, '{}-*k.pth'.format(mname))))
    state = torch.load(ckpt[-1])
    t2m.load_state_dict(state['model'])
    args.global_step = state['global_step']

    # Restore the latest SSRN checkpoint.
    mname = type(ssrn).__name__
    ckpt = sorted(glob.glob(os.path.join(args.logdir, mname, '{}-*k.pth'.format(mname))))
    state = torch.load(ckpt[-1])
    ssrn.load_state_dict(state['model'])
    print('All models are loaded.')

    t2m.eval()
    ssrn.eval()

    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    synthesize(t2m, ssrn, test_loader, args.test_batch)

def main():
    testset = TextDataset(args.testset)
    test_loader = DataLoader(dataset=testset, batch_size=args.test_batch,
                             drop_last=False, shuffle=False,
                             collate_fn=synth_collate_fn, pin_memory=True)

    t2m = Text2Mel().to(DEVICE)
    ssrn = SSRN().to(DEVICE)

    # Restore the Text2Mel checkpoint with the lowest recorded loss.
    ckpt = pd.read_csv(os.path.join(args.logdir, t2m.name, 'ckpt.csv'), sep=',', header=None)
    ckpt.columns = ['models', 'loss']
    ckpt = ckpt.sort_values(by='loss', ascending=True)
    # iloc picks the first row after sorting (loc[0] would pick the original row 0).
    state = torch.load(os.path.join(args.logdir, t2m.name, ckpt.models.iloc[0]))
    t2m.load_state_dict(state['model'])
    args.global_step = state['global_step']

    # Restore the SSRN checkpoint with the lowest recorded loss.
    ckpt = pd.read_csv(os.path.join(args.logdir, ssrn.name, 'ckpt.csv'), sep=',', header=None)
    ckpt.columns = ['models', 'loss']
    ckpt = ckpt.sort_values(by='loss', ascending=True)
    state = torch.load(os.path.join(args.logdir, ssrn.name, ckpt.models.iloc[0]))
    ssrn.load_state_dict(state['model'])
    print('All models are loaded.')

    t2m.eval()
    ssrn.eval()

    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    return synthesize(t2m=t2m, ssrn=ssrn, data_loader=test_loader,
                      batch_size=args.test_batch)

def main():
    testset = TextDataset(args.testset, args.lang, args.ref_path)
    test_loader = DataLoader(dataset=testset, batch_size=args.test_batch,
                             drop_last=False, shuffle=False,
                             collate_fn=synth_collate_fn, pin_memory=True)

    model = DCTTS(args).to(DEVICE)

    # Restore the latest checkpoint for the requested model.
    ckpt = sorted(glob.glob(os.path.join(args.logdir, args.model_name,
                                         '{}-*k.pth'.format(args.model_name))))
    state = torch.load(ckpt[-1])
    model.load_state_dict(state['model'])
    args.global_step = state['global_step']
    print('All models are loaded.')

    model.eval()

    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
        os.makedirs(os.path.join(args.sampledir, 'f0'))
    synthesize(model, test_loader, args.test_batch)

def main():
    testset = TextDataset(args.testset)
    test_loader = DataLoader(dataset=testset, batch_size=args.test_batch,
                             drop_last=False, shuffle=False,
                             collate_fn=synth_collate_fn, pin_memory=True)

    model = Tacotron().to(DEVICE)
    model_path = sorted(glob.glob(os.path.join(args.logdir, model.name, 'model-*.tar')))[-1]  # latest model
    state = torch.load(model_path)
    model.load_state_dict(state['model'])
    args.global_step = state['global_step']
    print('The model is loaded. Step: {}'.format(args.global_step))

    model.eval()

    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    synthesize(model, test_loader, args.test_batch)

def get_data(self):
    self.convert = TextConverter(self.args.txt, max_vocab=self.args.max_vocab)
    dataset = TextDataset(self.args.txt, self.args.len, self.convert.text_to_arr)
    self.train_loader = DataLoader(dataset, self.args.batch_size,
                                   shuffle=True, num_workers=self.args.num_workers)

def main(load_model='latest'):
    """Synthesis entry point.

    :param load_model: String. {best, latest, <model_path>}
    :param synth_mode: {'test', 'style', 'tp', 'fix'} (module-level setting)
    """
    assert os.path.exists(args.testset), 'Test sentence path is wrong.'

    model = TPGST().to(DEVICE)

    testset = TextDataset(args.testset, args.ref_path)
    test_loader = DataLoader(dataset=testset, batch_size=args.test_batch,
                             drop_last=False, shuffle=False,
                             collate_fn=text_collate_fn, pin_memory=True)

    if load_model.lower() == 'best':
        # Pick the checkpoint with the lowest recorded loss.
        ckpt = pd.read_csv(os.path.join(args.logdir, model.name, 'ckpt.csv'),
                           sep=',', header=None)
        ckpt.columns = ['models', 'loss']
        model_path = ckpt.sort_values(by='loss', ascending=True).models.iloc[0]
        model_path = os.path.join(args.logdir, model.name, model_path)
    elif 'pth.tar' in load_model:
        model_path = load_model
    else:
        model_path = sorted(glob.glob(os.path.join(args.logdir, model.name, 'model-*.tar')))[-1]  # latest model

    state = torch.load(model_path)
    model.load_state_dict(state['model'])
    args.global_step = state['global_step']
    print('The model is loaded. Step: {}'.format(args.global_step))

    model.eval()

    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))

    if synth_mode == 'test':
        ref_synthesize(model, test_loader, args.test_batch)
    elif synth_mode == 'style':
        style_synthesize(model, test_loader, args.test_batch)
    elif synth_mode == 'tp':
        tp_synthesize(model, test_loader, args.test_batch)
    elif synth_mode == 'fix':
        fixed_synthesize(model, test_loader, args.test_batch)

bptt = 8
batch_size = 32

train = [
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube1120_hq.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube1120.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube700.txt'
]
test = [
    '/media/lytic/STORE/ru_open_stt_wav/text/asr_calls_2_val.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/buriy_audiobooks_2_val.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube700_val.txt'
]

train = TextDataset(train, labels, batch_size)
test = TextDataset(test, labels, batch_size)
test.shuffle(0)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5)
scheduler = StepLR(optimizer, step_size=10000, gamma=0.99)

for epoch in range(20):
    model.train()
    hidden = model.step_init(batch_size)
    err = AverageMeter('loss')

torch.cuda.manual_seed(LUCKY_NUM)
np.random.seed(LUCKY_NUM)

# initialize matplotlib and CUDA
# plt.ion()
torch.cuda.set_device(config.deviceID)

# set the work path
PATH = config.path
if not os.path.isdir(PATH):
    os.makedirs(PATH)

# Parameters used in the net
ERROR_PER = config.ERROR_PER
NE = config.ne  # number of ensemble members
GAMMA = config.GAMMA
T = config.T

# Load data and initialize the ENN net
text = TextDataset()

# Set the loss function
criterion = torch.nn.MSELoss()

INFO = {
    "train len": config.train_len,
    "shrink len": config.shrink_len,
    "window step": config.window_step,
    "Error per": config.ERROR_PER,
    "input dim": config.input_dim,
    "hid dim": config.hid_dim,
    "num layer": config.num_layer,
    "number of ensemble": config.ne,
    "T": config.T,
    "batch size": config.batch_size,
    "epoch": config.epoch,
    "GAMMA": config.GAMMA,

### Rebuild dictionary
print('Build word2idx ... ', end='')
word2idx = {}
for k, v in word2vec.wv.vocab.items():
    word2idx[k] = v.index
word2vec.wv.syn0[word2idx['<pad>']] = np.zeros(embedding_dim)
pickle.dump(word2idx, open('_word2vec.pkl', 'wb'))
print('Done !')

### Load dataset
print('Load dataset ... ', end='')
d_train = TextDataset(word2idx, fp_train_labeled, train=True)
d_val = TextDataset(word2idx, fp_train_labeled, train=True, val=True)
train_loader = DataLoader(d_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(d_val, batch_size=batch_size, shuffle=False)
print('Done !')

### Train model
print('Train LSTM ... ')
model = LSTMClassifier(embedding_dim, hidden_dim, num_layers, batch_size)
model.init_weights()
# Initialize the embedding layer with the pretrained word2vec weights and freeze it.
model.embedding.weight = torch.nn.Parameter(torch.Tensor(word2vec.wv.syn0))
model.embedding.weight.requires_grad = False
model.cuda()
print(model)

def main(_):
    # Set up logging
    configure_logging(FLAGS.debug_log)

    # Load configuration
    with open(FLAGS.config, 'r') as f:
        config = yaml.safe_load(f)

    # Get the directory paths
    ckpt_dir = os.path.join(config['training']['ckpt_dir'], config['experiment_name'])
    summary_dir = os.path.join(config['training']['summary_dir'], config['experiment_name'])

    # Create the directories if they do not already exist
    if not os.path.exists(ckpt_dir):
        logging.info('Creating checkpoint directory: `%s`.' % ckpt_dir)
        os.makedirs(ckpt_dir)
    if not os.path.exists(summary_dir):
        logging.info('Creating summary directory: `%s`.' % summary_dir)
        os.makedirs(summary_dir)

    # Check for conflicting configurations
    safe_copy_config(config, FLAGS.force_overwrite)

    # Init summary writer
    summary_writer = SummaryWriter(summary_dir)

    # Load vocab and datasets
    logging.info('Loading the vocabulary.')
    with open(config['data']['vocab'], 'r') as f:
        vocab = Vocab.load(f)
    logging.info('Loading train and valid data.')
    train_data = TextDataset(config['data']['train'], vocab=vocab,
                             max_length=config['training']['max_length'])
    valid_data = TextDataset(config['data']['valid'], vocab=vocab,
                             max_length=config['training']['max_length'])

    # Initialize models
    logging.info('Initializing the inference network and generative model.')
    inference_network = RNNTextInferenceNetwork(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        encoder_kwargs=config['model']['encoder'],
        normalizing_flow_kwargs=config['model']['normalizing_flow'])
    generative_model = RNNTextGenerativeModel(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        max_length=config['training']['max_length'],
        sos_idx=vocab.sos_idx,
        **config['model']['generator'])
    if torch.cuda.is_available():
        inference_network = inference_network.cuda()
        generative_model = generative_model.cuda()

    # Setup model optimizers
    optimizer_in = torch.optim.Adam(inference_network.parameters(),
                                    lr=config['training']['learning_rate'])
    optimizer_gm = torch.optim.Adam(generative_model.parameters(),
                                    lr=config['training']['learning_rate'])

    # Restore
    ckpt = os.path.join(ckpt_dir, 'model.pt')
    if os.path.exists(ckpt):
        logging.info('Model checkpoint detected at: `%s`. Restoring.' % ckpt)
        checkpoint = torch.load(ckpt)
        epoch = checkpoint['epoch']
        t = checkpoint['t']
        best_loss = checkpoint['best_loss']
        inference_network.load_state_dict(checkpoint['state_dict_in'])
        generative_model.load_state_dict(checkpoint['state_dict_gm'])
        optimizer_in.load_state_dict(checkpoint['optimizer_in'])
        optimizer_gm.load_state_dict(checkpoint['optimizer_gm'])
    else:
        logging.info('No existing checkpoint found.')
        epoch = 0
        t = 0
        best_loss = float('inf')

    # Start train
    weight = torch.ones(len(vocab))
    weight[vocab.unk_idx] = config['training']['unk_weight']
    if torch.cuda.is_available():
        weight = weight.cuda()

    while epoch < config['training']['epochs']:
        logging.info('Starting epoch - %i.' % epoch)
        inference_network.train()
        generative_model.train()

        # Training step
        logging.info('Start train step.')
        train_loader = DataLoader(
            dataset=train_data,
            batch_size=config['training']['batch_size'],
            shuffle=True,
            num_workers=cpu_count(),
            pin_memory=torch.cuda.is_available())

        # Init train summaries
        train_nll = 0.0
        train_kl = 0.0
        train_loss = 0.0

        for batch in train_loader:
            optimizer_in.zero_grad()
            optimizer_gm.zero_grad()

            x = batch['input']
            target = batch['target']
            lengths = batch['lengths']
            if torch.cuda.is_available():
                x = x.cuda()
                target = target.cuda()
                lengths = lengths.cuda()

            # Forward pass of inference network
            z, kl = inference_network(x, lengths)

            # Teacher forcing with word dropout
            x_hat = word_dropout(x, config['training']['word_dropout_rate'], vocab.unk_idx)
            logp, _ = generative_model(z, x_hat, lengths)

            # Obtain current value of the annealing constant (beta trick)
            beta = get_beta(config, epoch)

            # Compute annealed loss
            length = logp.shape[1]
            logp = logp.view(-1, len(vocab))
            target = target[:, :length].contiguous().view(-1)
            nll = F.nll_loss(logp, target, ignore_index=vocab.pad_idx,
                             weight=weight, reduction='sum')
            loss = nll + beta * kl

            # Update summaries
            train_nll += nll.data
            train_kl += kl.data
            train_loss += loss.data

            # Backpropagate gradients
            batch_size = config['training']['batch_size']
            loss /= batch_size
            kl /= batch_size
            nll /= batch_size
            loss.backward()
            optimizer_in.step()
            optimizer_gm.step()

            # Log
            if not t % config['training']['log_frequency']:
                # Note: the logged train loss covers only a single batch - see
                # tensorboard for the summary over epochs.
                line = 'Iteration: %i - Loss: %0.4f. - KL: %0.4f - NLL: %0.4f'
                logging.info(line % (t, loss.data, kl.data, nll.data))

                # Print a greedy sample
                z_k, _ = inference_network(x, lengths)
                _, sample = generative_model(z_k)
                example = [vocab.id2word(int(x)) for x in sample[0]]
                try:
                    T = example.index(vocab.eos_token)
                    example = example[:T]
                except ValueError:
                    pass
                example = ' '.join(example)
                logging.info('Example - `%s`' % example)
            t += 1

        # Validation step
        logging.info('Start valid step.')
        valid_loader = DataLoader(
            dataset=valid_data,
            batch_size=config['training']['batch_size'],
            shuffle=False,
            num_workers=cpu_count(),
            pin_memory=torch.cuda.is_available())

        # Init valid summaries
        valid_nll = 0.0
        valid_kl = 0.0
        valid_loss = 0.0

        for batch in valid_loader:
            x = batch['input']
            target = batch['target']
            lengths = batch['lengths']
            if torch.cuda.is_available():
                x = x.cuda()
                target = target.cuda()
                lengths = lengths.cuda()

            # Forward pass of inference network
            z, kl = inference_network(x, lengths)

            # Teacher forcing
            logp, _ = generative_model(z, x, lengths)

            # Compute loss (no annealing during validation)
            length = logp.shape[1]
            logp = logp.view(-1, len(vocab))
            target = target[:, :length].contiguous().view(-1)
            nll = F.nll_loss(logp, target, ignore_index=vocab.pad_idx, reduction='sum')
            loss = nll + kl

            # Update summaries
            valid_nll += nll.data
            valid_kl += kl.data
            valid_loss += loss.data

        # Normalize losses
        train_nll /= len(train_data)
        train_kl /= len(train_data)
        train_loss /= len(train_data)
        valid_nll /= len(valid_data)
        valid_kl /= len(valid_data)
        valid_loss /= len(valid_data)

        # Tensorboard logging
        summary_writer.add_scalar("elbo/train", train_loss.data, epoch)
        summary_writer.add_scalar("kl/train", train_kl.data, epoch)
        summary_writer.add_scalar("nll/train", train_nll.data, epoch)
        summary_writer.add_scalar("elbo/val", valid_loss.data, epoch)
        summary_writer.add_scalar("kl/val", valid_kl.data, epoch)
        summary_writer.add_scalar("nll/val", valid_nll.data, epoch)

        # Save checkpoint
        is_best = valid_loss < best_loss
        best_loss = min(valid_loss, best_loss)
        save_checkpoint({
            'epoch': epoch + 1,
            't': t,
            'best_loss': best_loss,
            'state_dict_in': inference_network.state_dict(),
            'state_dict_gm': generative_model.state_dict(),
            'optimizer_in': optimizer_in.state_dict(),
            'optimizer_gm': optimizer_gm.state_dict()
        }, is_best, ckpt)

        epoch += 1

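# The annealed loss above scales the KL term by a constant returned from
# get_beta(config, epoch). The real schedule lives in the project's code; a
# minimal sketch of a linear warm-up schedule is shown below (the function name
# and the 'anneal_epochs' config key are assumptions, not the project's own):
def get_beta_linear(config, epoch):
    """Illustrative: linearly anneal beta from 0 to 1 over `anneal_epochs` epochs."""
    anneal_epochs = config['training'].get('anneal_epochs', 10)
    return min(1.0, epoch / float(anneal_epochs))
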
def train_iters(ae_model, dis_model):
    if args.use_albert:
        tokenizer = BertTokenizer.from_pretrained("clue/albert_chinese_tiny",
                                                  do_lower_case=True)
    elif args.use_tiny_bert:
        tokenizer = AutoTokenizer.from_pretrained(
            "google/bert_uncased_L-2_H-256_A-4", do_lower_case=True)
    elif args.use_distil_bert:
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased', do_lower_case=True)
    # tokenizer = BertTokenizer.from_pretrained(args.PRETRAINED_MODEL_NAME, do_lower_case=True)
    tokenizer.add_tokens('[EOS]')
    bos_id = tokenizer.convert_tokens_to_ids(['[CLS]'])[0]
    ae_model.bert_encoder.resize_token_embeddings(len(tokenizer))
    # print("[CLS] ID: ", bos_id)

    print("Load trainData...")
    if args.load_trainData and os.path.exists('./{}_trainData.pkl'.format(args.task)):
        with open('./{}_trainData.pkl'.format(args.task), 'rb') as f:
            trainData = pickle.load(f)
    else:
        trainData = TextDataset(batch_size=args.batch_size,
                                id_bos='[CLS]',
                                id_eos='[EOS]',
                                id_unk='[UNK]',
                                max_sequence_length=args.max_sequence_length,
                                vocab_size=0,
                                file_list=args.train_file_list,
                                label_list=args.train_label_list,
                                tokenizer=tokenizer)
        with open('./{}_trainData.pkl'.format(args.task), 'wb') as f:
            pickle.dump(trainData, f)

    add_log("Start train process.")
    ae_model.train()
    dis_model.train()
    ae_model.to(device)
    dis_model.to(device)

    # Fixing or distilling BERT encoder layers
    if args.fix_first_6:
        print("Try fixing first 6 BERT layers")
        for layer in range(6):
            for param in ae_model.bert_encoder.encoder.layer[layer].parameters():
                param.requires_grad = False
    elif args.fix_last_6:
        print("Try fixing last 6 BERT layers")
        for layer in range(6, 12):
            for param in ae_model.bert_encoder.encoder.layer[layer].parameters():
                param.requires_grad = False
    if args.distill_2:
        print("Get result from layer 2")
        for layer in range(2, 12):
            for param in ae_model.bert_encoder.encoder.layer[layer].parameters():
                param.requires_grad = False

    ae_optimizer = NoamOpt(
        ae_model.d_model, 1, 2000,
        torch.optim.Adam(ae_model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    dis_optimizer = torch.optim.Adam(dis_model.parameters(), lr=0.0001)

    # ae_criterion = get_cuda(LabelSmoothing(size=args.vocab_size, padding_idx=args.id_pad, smoothing=0.1))
    ae_criterion = LabelSmoothing(size=ae_model.bert_encoder.config.vocab_size,
                                  padding_idx=0, smoothing=0.1).to(device)
    dis_criterion = nn.BCELoss(reduction='mean')

    history = {'train': []}
    for epoch in range(args.epochs):
        print('-' * 94)
        epoch_start_time = time.time()
        total_rec_loss = 0
        total_dis_loss = 0

        train_data_loader = DataLoader(trainData,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       collate_fn=trainData.collate_fn,
                                       num_workers=4)
        num_batch = len(train_data_loader)
        trange = tqdm(enumerate(train_data_loader), total=num_batch,
                      desc='Training', file=sys.stdout, position=0, leave=True)

        for it, data in trange:
            (batch_sentences, tensor_labels, tensor_src, tensor_src_mask,
             tensor_tgt, tensor_tgt_y, tensor_tgt_mask, tensor_ntokens) = data

            tensor_labels = tensor_labels.to(device)
            tensor_src = tensor_src.to(device)
            tensor_tgt = tensor_tgt.to(device)
            tensor_tgt_y = tensor_tgt_y.to(device)
            tensor_src_mask = tensor_src_mask.to(device)
            tensor_tgt_mask = tensor_tgt_mask.to(device)

            # Forward pass
            latent, out = ae_model.forward(tensor_src, tensor_tgt,
                                           tensor_src_mask, tensor_tgt_mask)

            # Reconstruction loss and autoencoder update
            loss_rec = ae_criterion(
                out.contiguous().view(-1, out.size(-1)),
                tensor_tgt_y.contiguous().view(-1)) / tensor_ntokens.data
            ae_optimizer.optimizer.zero_grad()
            loss_rec.backward()
            ae_optimizer.step()

            # Classifier on the detached latent code
            latent = latent.detach()
            next_latent = latent.to(device)
            dis_lop = dis_model.forward(next_latent)
            loss_dis = dis_criterion(dis_lop, tensor_labels)
            dis_optimizer.zero_grad()
            loss_dis.backward()
            dis_optimizer.step()

            total_rec_loss += loss_rec.item()
            total_dis_loss += loss_dis.item()

            trange.set_postfix(total_rec_loss=total_rec_loss / (it + 1),
                               total_dis_loss=total_dis_loss / (it + 1))

            if it % 100 == 0:
                add_log(
                    '| epoch {:3d} | {:5d}/{:5d} batches | rec loss {:5.4f} | dis loss {:5.4f} |'
                    .format(epoch, it, num_batch, loss_rec, loss_dis))
                print(id2text_sentence(tensor_tgt_y[0], tokenizer, args.task))
                generator_text = ae_model.greedy_decode(
                    latent, max_len=args.max_sequence_length, start_id=bos_id)
                print(id2text_sentence(generator_text[0], tokenizer, args.task))

        # Save model
        # torch.save(ae_model.state_dict(), args.current_save_path / 'ae_model_params.pkl')
        # torch.save(dis_model.state_dict(), args.current_save_path / 'dis_model_params.pkl')

        history['train'].append({
            'epoch': epoch,
            'total_rec_loss': total_rec_loss / len(trange),
            'total_dis_loss': total_dis_loss / len(trange)
        })

        add_log('| end of epoch {:3d} | time: {:5.2f}s |'.format(
            epoch, (time.time() - epoch_start_time)))

    # Save model
    torch.save(ae_model.state_dict(), args.current_save_path / 'ae_model_params.pkl')
    torch.save(dis_model.state_dict(), args.current_save_path / 'dis_model_params.pkl')
    print("Save in ", args.current_save_path)
    return

embedding_dim = 128
hidden_dim = 128
num_layers = 2
batch_size = 128

### Load dictionary
print('Loading Dictionary ... ', end='')
word2idx = pickle.load(open(fp_word2idx, 'rb'))
print('Done !')

### Load data
print('Loading Data ... ', end='')
d_test = TextDataset(word2idx, fp_test, train=False)
test_loader = DataLoader(d_test, batch_size=batch_size, shuffle=False)
print('Done !')

### Load model
print('Loading Model ... ', end='')
model = LSTMClassifier(embedding_dim, hidden_dim, num_layers, batch_size)
model.cuda()
model.load_state_dict(torch.load(fp_model))
print('Done !')

### Predict
print('Predict ... ', end='')

if n in pretrained_state_dict:
    w = pretrained_state_dict[n]
    p.data.copy_(w.data)
model = cuda(model)
print('loaded pretrained ckpt')

optimizer = AdamW(
    optimizer_params(model),
    lr=args.lr,
    weight_decay=args.wd,
    eps=args.eps,
)
criterion = nn.CrossEntropyLoss()
best_loss = float('inf')

train_ds = TextDataset(f'amazon/{args.src}_train.csv', args.src_p)
valid_ds = TextDataset(f'amazon/{args.src}_valid.csv', args.src_p)
test_ds = TextDataset(f'amazon/{args.src}_test.csv', args.src_p)

if args.train:
    for epoch in range(1, args.epochs + 1):
        train_loss = train(train_ds)
        valid_loss = valid(valid_ds)
        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), args.ckpt)
        print(f'epoch: {epoch} | '
              f'train loss: {train_loss:.6f} | '
              f'valid loss: {valid_loss:.6f}')

model.load_state_dict(torch.load(args.ckpt))

    parser.add_argument('--output', type=str, default='./result')
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--batchsize', type=int, default=5)
    parser.add_argument('--manga_name', type=str, default=None)
    parser.add_argument('--visualize', action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    # Good formatting when printing the APs for each class and mAP
    pp = PrettyPrinter()

    args = get_args()
    obj = torch.load(args.model)
    model = obj['model']

    if args.manga_name is not None:
        args.output = os.path.join(args.output, args.manga_name)
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    if args.visualize:
        mytransforms = MyTransform()
        test_dataset = TextDataset(args.root, model_type='ssd-fork',
                                   transforms=None,
                                   specific_manga=args.manga_name)
    else:
        mytransforms = None
        test_dataset = TextDataset(args.root, model_type='ssd-fork',
                                   transforms=MyTransform(),
                                   specific_manga=args.manga_name)

    test_loader = DataLoader(test_dataset, batch_size=args.batchsize,
                             shuffle=False, collate_fn=my_collate_fn,
                             num_workers=4, pin_memory=True)

    evaluate(test_loader, model, args.visualize, args.output, mytransforms)

print('Vocabulary has been loaded from {}'.format(args.vocab_file))

if args.tokenized == 1:
    corpus = Corpus_tok(path, args.train, args.valid, args.test,
                        load_vocab=args.load_vocab, vocab_file=args.vocab_file)
else:
    corpus = Corpus(path, args.train, args.valid, args.test,
                    load_vocab=args.load_vocab, vocab_file=args.vocab_file)
torch.save(corpus, fn)

if args.save_vocab:
    with open('{}/{}'.format(path, args.vocab_file), 'wb') as f:
        torch.save([corpus.vocabulary.word2idx, corpus.vocabulary.idx2word], f)

vocab_sz = len(corpus.vocabulary)

# Produce dataloaders
if args.tokenized == 1:
    print("Producing train dataloader...")
    train_loader = TextDataset(path, args.train, corpus.vocabulary)
    dlt = DataLoader(train_loader, batch_size=args.bs, drop_last=True)
    train_data = SortingTextDataLoader(dlt)
    print("Num sentences train loader:", len(train_loader))

    print("Producing val dataloader...")
    valid_loader = TextDataset(path, args.valid, train_loader.vocabulary)
    dlv = DataLoader(valid_loader, batch_size=args.bs, drop_last=True)
    valid_data = SortingTextDataLoader(dlv)
    print("Num sentences valid loader:", len(valid_loader))

    print("Producing test dataloader...")
    test_loader = TextDataset(path, args.test, valid_loader.vocabulary)
    dlte = DataLoader(test_loader, batch_size=args.bs, drop_last=True)
    test_data = SortingTextDataLoader(dlte)
    corpus.vocabulary = test_loader.vocabulary
    print("Num sentences test loader:", len(test_loader))
else:

def main(_):
    # Set up logging
    configure_logging(FLAGS.debug_log)

    # Load configuration
    with open(FLAGS.config, 'r') as f:
        config = yaml.safe_load(f)

    # Get the checkpoint path
    ckpt_dir = os.path.join(config['training']['ckpt_dir'], config['experiment_name'])

    # Load vocab and datasets
    logging.info('Loading the vocabulary.')
    with open(config['data']['vocab'], 'r') as f:
        vocab = Vocab.load(f)
    logging.info('Loading test data.')
    test_data = TextDataset(config['data']['test'], vocab=vocab,
                            max_length=config['training']['max_length'])
    test_loader = DataLoader(dataset=test_data,
                             batch_size=config['training']['batch_size'],
                             shuffle=False,
                             num_workers=cpu_count(),
                             pin_memory=torch.cuda.is_available())

    # Initialize models
    logging.info('Initializing the inference network and generative model.')
    inference_network = RNNTextInferenceNetwork(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        encoder_kwargs=config['model']['encoder'],
        normalizing_flow_kwargs=config['model']['normalizing_flow'])
    generative_model = RNNTextGenerativeModel(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        max_length=config['training']['max_length'],
        sos_idx=vocab.sos_idx,
        **config['model']['generator'])
    if torch.cuda.is_available():
        inference_network = inference_network.cuda()
        generative_model = generative_model.cuda()

    # Restore the best checkpoint
    ckpt = os.path.join(ckpt_dir, 'model.pt.best')
    if os.path.exists(ckpt):
        logging.info('Model checkpoint detected at: `%s`. Restoring.' % ckpt)
        checkpoint = torch.load(ckpt)
        inference_network.load_state_dict(checkpoint['state_dict_in'])
        generative_model.load_state_dict(checkpoint['state_dict_gm'])
    else:
        logging.error('No model checkpoint found. Terminating.')
        sys.exit(1)

    # Init test summaries
    test_nll = 0.0
    test_kl = 0.0
    test_loss = 0.0
    test_suml2p = 0.0
    test_n = 0.0

    # Evaluate
    inference_network.eval()
    generative_model.eval()
    for batch in test_loader:
        x = batch['input']
        target = batch['target']
        lengths = batch['lengths']
        if torch.cuda.is_available():
            x = x.cuda()
            target = target.cuda()
            lengths = lengths.cuda()

        # Forward pass of inference network
        z, kl = inference_network(x, lengths)

        # Teacher forcing
        logp, _ = generative_model(z, x, lengths)

        # Compute loss
        length = logp.shape[1]
        logp = logp.view(-1, len(vocab))
        target = target[:, :length].contiguous().view(-1)
        nll = F.nll_loss(logp, target, ignore_index=vocab.pad_idx, reduction='sum')
        loss = nll + kl
        l2p, n = suml2p(logp, target, vocab.pad_idx)

        # Update summaries
        test_nll += nll.data
        test_kl += kl.data
        test_loss += loss.data
        test_suml2p += l2p.data
        test_n += n

    # Normalize losses
    test_nll /= len(test_data)
    test_kl /= len(test_data)
    test_loss /= len(test_data)

    # Perplexity: 2 ** (cross-entropy in bits per token)
    H = -test_suml2p / test_n
    test_perplexity = 2**H

    # Log output
    logging.info('NLL: %0.4f' % test_nll)
    logging.info('KL: %0.4f' % test_kl)
    logging.info('ELBO: %0.4f' % test_loss)
    logging.info('Perplexity: %0.4f' % test_perplexity)

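# suml2p above is project-specific. Judging only from how it is used
# (H = -suml2p / n, perplexity = 2 ** H), it appears to sum the base-2
# log-probabilities assigned to the non-padding target tokens and return that
# sum together with the token count. A rough sketch under that assumption
# (function name and details are illustrative, not the project's code):
import math
import torch

def suml2p_sketch(logp, target, pad_idx):
    """Illustrative: sum of base-2 log-probs of target tokens, plus token count.

    logp:   (N, V) per-position log-probabilities in nats
    target: (N,)   token ids, with pad_idx marking padding
    """
    mask = target != pad_idx
    idx = torch.arange(logp.size(0), device=logp.device)
    token_logp = logp[idx, target]              # log p(target_i) per position
    l2p = (token_logp[mask] / math.log(2)).sum()  # convert nats -> bits and sum
    n = mask.sum().item()
    return l2p, n
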
    words = []
    for t in text_corpus:
        splits = t.split(' ')
        words.extend([s for s in splits if len(s) > 2])
    return words


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, help="path to config with parameters")
    args = parser.parse_args()

    cfg = config_from_file(args.config)
    dataset = TextDataset(cfg)

    word_sets = dict()
    pos_words = text_corpus_to_words(dataset.pos_text_corpus)
    neg_words = text_corpus_to_words(dataset.neg_text_corpus)

    for name, words in zip(['pos', 'neg'], [pos_words, neg_words]):
        plt.hist(words)
        plt.title(f'hist of {name} words')
        plt.show()

        counter = Counter(words)
        word_sets[name] = (set(words), counter)

        print(f'{name} descriptive stat:')
        most_common = counter.most_common()
        print(f' most frequent words in {name}: ')

torch.manual_seed(args.seed)
if torch.cuda.is_available():
    torch.cuda.set_device(args.gpu)

# Create the configuration
config = Config(sentence_max_size=50,
                batch_size=args.batch_size,
                word_num=11000,
                label_num=args.label_num,
                learning_rate=args.lr,
                cuda=args.gpu,
                epoch=args.epoch,
                out_channel=args.out_channel)

training_set = TextDataset(path='data/train')
training_iter = data.DataLoader(dataset=training_set,
                                batch_size=config.batch_size,
                                num_workers=2)

model = DPCNN(config)
embeds = nn.Embedding(config.word_num, config.word_embedding_dimension)

if torch.cuda.is_available():
    model.cuda()
    embeds = embeds.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=config.lr)

def eval_iters(ae_model, dis_model):
    # tokenizer = BertTokenizer.from_pretrained(args.PRETRAINED_MODEL_NAME, do_lower_case=True)
    if args.use_albert:
        tokenizer = BertTokenizer.from_pretrained("clue/albert_chinese_tiny",
                                                  do_lower_case=True)
    elif args.use_tiny_bert:
        tokenizer = AutoTokenizer.from_pretrained(
            "google/bert_uncased_L-2_H-256_A-4", do_lower_case=True)
    elif args.use_distil_bert:
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased', do_lower_case=True)
    tokenizer.add_tokens('[EOS]')
    bos_id = tokenizer.convert_tokens_to_ids(['[CLS]'])[0]
    ae_model.bert_encoder.resize_token_embeddings(len(tokenizer))
    print("[CLS] ID: ", bos_id)

    # if args.task == 'news_china_taiwan':
    eval_file_list = [
        args.data_path + 'test.0',
        args.data_path + 'test.1',
    ]
    eval_label_list = [
        [0],
        [1],
    ]
    if args.eval_positive:
        eval_file_list = eval_file_list[::-1]
        eval_label_list = eval_label_list[::-1]

    print("Load testData...")
    testData = TextDataset(batch_size=args.batch_size,
                           id_bos='[CLS]',
                           id_eos='[EOS]',
                           id_unk='[UNK]',
                           max_sequence_length=args.max_sequence_length,
                           vocab_size=0,
                           file_list=eval_file_list,
                           label_list=eval_label_list,
                           tokenizer=tokenizer)

    dataset = testData
    eval_data_loader = DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=False,
                                  collate_fn=dataset.collate_fn,
                                  num_workers=4)
    num_batch = len(eval_data_loader)
    trange = tqdm(enumerate(eval_data_loader), total=num_batch,
                  desc='Evaluating', file=sys.stdout, position=0, leave=True)
    gold_ans = [''] * num_batch

    add_log("Start eval process.")
    ae_model.to(device)
    dis_model.to(device)
    ae_model.eval()
    dis_model.eval()

    total_latent_lst = []
    for it, data in trange:
        (batch_sentences, tensor_labels, tensor_src, tensor_src_mask,
         tensor_tgt, tensor_tgt_y, tensor_tgt_mask, tensor_ntokens) = data

        tensor_labels = tensor_labels.to(device)
        tensor_src = tensor_src.to(device)
        tensor_tgt = tensor_tgt.to(device)
        tensor_tgt_y = tensor_tgt_y.to(device)
        tensor_src_mask = tensor_src_mask.to(device)
        tensor_tgt_mask = tensor_tgt_mask.to(device)

        print("------------%d------------" % it)
        print(id2text_sentence(tensor_tgt_y[0], tokenizer, args.task))
        print("origin_labels", tensor_labels.cpu().detach().numpy()[0])

        latent, out = ae_model.forward(tensor_src, tensor_tgt,
                                       tensor_src_mask, tensor_tgt_mask)
        generator_text = ae_model.greedy_decode(latent,
                                                max_len=args.max_sequence_length,
                                                start_id=bos_id)
        print(id2text_sentence(generator_text[0], tokenizer, args.task))

        # Define the target label (flip the original style)
        target = torch.FloatTensor([[1.0]]).to(device)
        if tensor_labels[0].item() > 0.5:
            target = torch.FloatTensor([[0.0]]).to(device)
        print("target_labels", target)

        modify_text, latent_lst = fgim_attack(dis_model, latent, target,
                                              ae_model, args.max_sequence_length,
                                              bos_id, id2text_sentence, None,
                                              gold_ans[it], tokenizer, device,
                                              task=args.task,
                                              save_latent=args.save_latent)
        if args.save_latent != -1:
            total_latent_lst.append(latent_lst)
        add_output(modify_text)
        if it >= args.save_latent_num:
            break

    print("Save log in ", args.output_file)
    if args.save_latent == -1:
        return

    folder = './latent_{}/'.format(args.task)
    if not os.path.exists(folder):
        os.mkdir(folder)

    if args.save_latent == 0:    # full
        prefix = 'full'
    elif args.save_latent == 1:  # first 6 layers
        prefix = 'first_6'
    elif args.save_latent == 2:  # last 6 layers
        prefix = 'last_6'
    elif args.save_latent == 3:  # second layer only
        prefix = 'distill_2'

    total_latent_lst = np.asarray(total_latent_lst)
    save_label = 0 if args.eval_negative else 1
    with open(folder + '{}_{}.pkl'.format(prefix, save_label), 'wb') as f:
        pickle.dump(total_latent_lst, f)
    print("Save latent in ", folder + '{}_{}.pkl'.format(prefix, save_label))

def get_data(convert):
    dataset = TextDataset(opt.txt, opt.len, convert.text_to_arr)
    return DataLoader(dataset, opt.batch_size, shuffle=True,
                      num_workers=opt.num_workers)

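# Illustrative use of get_data above; it assumes a module-level `opt` and a
# TextConverter-like `convert` object, as in the snippet, and is not part of
# the original project code.
train_loader = get_data(convert)
for batch in train_loader:
    # Each batch is whatever TextDataset yields (typically encoded token ids),
    # grouped into mini-batches by the DataLoader.
    print(type(batch))
    break
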
if __name__ == '__main__':
    args = get_args()

    model, loss_func = build_fork_model_and_loss_function(args.n_classes)
    model.to(device)
    loss_func.to(device)

    optim = torch.optim.SGD(model.parameters(), lr=0.001,
                            momentum=0.9, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=6, gamma=0.1)

    # create dataloader
    dataset = TextDataset(args.root, model_type='ssd-fork', transforms=MyTransform())

    # test split size according to the fork paper
    test_size = 927
    # test_size = 20
    indices = list(range(len(dataset)))

    # split the dataset into training and testing subsets
    train_set = torch.utils.data.Subset(dataset, indices[:-test_size])
    test_set = torch.utils.data.Subset(dataset, indices[-test_size:])

    train_dataloader = DataLoader(train_set,
                                  batch_size=args.batchsize,
                                  collate_fn=partial(my_collate_fn),
                                  shuffle=True,
                                  num_workers=4)

model = cuda(model)

optimizer = AdamW(
    optimizer_params(model),
    lr=args.lr,
    weight_decay=args.wd,
    eps=args.eps,
)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
best_loss = float('inf')

train_ds_list = []
valid_ds_list = []
test_ds_list = []
if args.src_p > 0:
    train_ds_list.append(TextDataset('train-labels2.csv', args.src_p))
    valid_ds_list.append(TextDataset('valid-labels2.csv', args.src_p))
    test_ds_list.append(TextDataset('valid-labels2.csv', args.src_p))
if args.trg_p > 0:
    train_ds_list.append(TextDataset('train-labels2.csv', args.trg_p))
    valid_ds_list.append(TextDataset('valid-labels2.csv', args.trg_p))
    test_ds_list.append(TextDataset('valid-labels2.csv', args.trg_p))

if args.train:
    for epoch in range(1, args.epochs + 1):
        print(args.src_p, args.trg_p)
        print(len(train_ds_list[0]))
        # print(train_ds_list[0].__getitem__(10))
        train_loss = train(train_ds_list)
        valid_loss = test(valid_ds_list)
