def train():
    # Turn on training mode which enables dropout.
    print(f'starting training for epoch {epoch}')
    if args.model == 'QRNN':
        model.reset()
    global STEP
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch = 0
    for tr_data_shard in corpus.iterate_train_shards():
        print('opening new data shard!')
        train_data = batchify(tr_data_shard, args.batch_size, args)
        i = 0
        while i < train_data.size(0) - 1 - 1:
            bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
            # Prevent excessively small or negative sequence lengths.
            seq_len = max(5, int(np.random.normal(bptt, 5)))
            # There's a very small chance that it could select a very long
            # sequence length, resulting in OOM.
            # seq_len = min(seq_len, args.bptt + 10)
            lr2 = optimizer.param_groups[0]['lr']
            optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
            model.train()
            data, targets = get_batch(train_data, i, args, seq_len=seq_len)
            # Starting each batch, we detach the hidden state from how it was
            # previously produced. If we didn't, the model would try
            # backpropagating all the way to the start of the dataset.
            hidden = repackage_hidden(hidden)
            optimizer.zero_grad()
            output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
            raw_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets)
            loss = raw_loss
            # Activation Regularization
            if args.alpha:
                loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                                  for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            if args.beta:
                loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                                  for rnn_h in rnn_hs[-1:])
            loss.backward()
            STEP += 1
            writer.add_scalar('Loss/train-batch', raw_loss.data, STEP)
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            if args.clip:
                torch.nn.utils.clip_grad_norm_(params, args.clip)
            optimizer.step()

            total_loss += raw_loss.data
            optimizer.param_groups[0]['lr'] = lr2
            if batch % args.log_interval == 0 and batch > 0:
                cur_loss = total_loss.item() / args.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                          epoch, batch, len(train_data) // args.bptt,
                          optimizer.param_groups[0]['lr'],
                          elapsed * 1000 / args.log_interval,
                          cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
                writer.add_scalar('Loss/train', cur_loss, STEP)
                writer.add_scalar('BPC/train', cur_loss / math.log(2), STEP)
                total_loss = 0
                start_time = time.time()
            ###
            batch += 1
            i += seq_len
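# The training loop above depends on `repackage_hidden` and `get_batch`, which
# are not shown in this excerpt. A minimal sketch, assuming the standard
# AWD-LSTM data layout where `train_data` has shape (tokens // batch_size, batch_size):
def repackage_hidden(h):
    """Detach hidden states from their history, so backprop stops at the
    current BPTT window instead of reaching back to the start of the shard."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)


def get_batch(source, i, args, seq_len=None):
    """Slice a (seq_len, batch) chunk of inputs plus the targets shifted by one token."""
    seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target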
parser = argparse.ArgumentParser(description='LSTM language model')
parser.add_argument('--test', type=str, help='test data')
parser.add_argument('--model', type=str, required=True)
parser.add_argument('--vocab', type=str, required=True)
args = parser.parse_args()

# load dictionary
word2idx = utils.read_json(args.vocab)

# load data
test = utils.process_valid_data(args.test, word2idx)

# load model parameters (used below for batching and model construction)
params = torch.load(args.model)

test_batches = utils.batchify(test, params["sequence_length"],
                              params["batch_size"], word2idx)

model = lstm_lm.LSTMLM(params)
optimizer = optim.SGD(model.parameters(), lr=0.1)
loss_function = nn.NLLLoss()
load_model(params, model, optimizer)

if params["use_gpu"]:
    model = model.cuda()

model.eval()  # change state to evaluation mode

print("Test perplexity: ",
      utils.evaluate(model, loss_function, test_batches,
                     params["use_gpu"]) / len(test_batches))
                          decoder_args=dec_args)
conf.experiment_name = experiment_name
conf.save(os.path.join(train_dir, 'configuration'))

reset_tf_graph()
ae = PointNetAutoEncoder(conf.experiment_name, conf)

# buf_size = 1  # flush each line
# fout = open(os.path.join(conf.train_dir, 'train_stats.txt'), 'a', buf_size)
# train_stats = ae.train(pcd_dataset, conf, log_file=fout)
# fout.close()
ae.restore_model('data/shapenet_1024_ae_128', 90, True)

print("Transforming training data")
X_train_trans = []
for x_b in batchify(X_train, 100):
    X_train_trans.append(ae.transform(x_b))
X_train_trans = np.concatenate(X_train_trans)

print("Transforming test data")
X_test_trans = []
for x_b in batchify(X_test, 100):
    X_test_trans.append(ae.transform(x_b))
X_test_trans = np.concatenate(X_test_trans)

print("Fitting SVM")
svm = LinearSVC()
svm.fit(X_train_trans, y_train[:len(X_train_trans)])
print(svm.score(X_test_trans, y_test[:len(X_test_trans)]))
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.set_device(0)
        torch.cuda.manual_seed(args.seed)

###############################################################################
# Load data
###############################################################################

corpus = data_ori_type.Corpus(args.data)

eval_batch_size = 10
test_batch_size = 1
train_data = batchify(corpus.train, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)
train_data_type = batchify(corpus.train_type, args.batch_size, args)
val_data_type = batchify(corpus.valid_type, eval_batch_size, args)
test_data_type = batchify(corpus.test_type, test_batch_size, args)

corpus2 = data.Corpus(args.data_type)
train_data2 = batchify(corpus2.train, args.batch_size, args)
val_data2 = batchify(corpus2.valid, eval_batch_size, args)
test_data2 = batchify(corpus2.test, test_batch_size, args)
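# `batchify` is referenced throughout these scripts but not defined in this
# excerpt. A minimal sketch, assuming the standard AWD-LSTM recipe of folding
# the token stream into contiguous columns:
def batchify(data, bsz, args):
    nbatch = data.size(0) // bsz
    # Trim off any remainder that would not cleanly divide into bsz columns.
    data = data.narrow(0, 0, nbatch * bsz)
    # Reshape into (nbatch, bsz): each column is a contiguous slice of text.
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data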
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        cudnn.enabled = True
        torch.cuda.manual_seed_all(args.seed)

corpus = data.Corpus(args.data)

test_batch_size = 1
test_data = batchify(corpus.test, test_batch_size, args)


def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        print(i, data_source.size(0) - 1)
        data, targets = get_batch(data_source, i, args, evaluation=True)
        targets = targets.view(-1)
        log_prob, hidden = parallel_model(data, hidden)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)),
                                      targets).data
def worker(gpu, ngpus_per_node, config_in):
    # init
    config = copy.deepcopy(config_in)
    args = config
    jobid = os.environ["SLURM_JOBID"]
    procid = int(os.environ["SLURM_PROCID"])
    config.gpu = gpu
    if config.gpu is not None:
        writer_name = "tb.{}-{:d}-{:d}".format(jobid, procid, gpu)
        logger_name = "{}.{}-{:d}-{:d}.search.log".format(config.name, jobid, procid, gpu)
        model_name = "{}-{:d}-{:d}-model.pt".format(jobid, procid, gpu)
        optimizer_name = "{}-{:d}-{:d}-optimizer.pt".format(jobid, procid, gpu)
        misc_name = "{}-{:d}-{:d}-misc.pt".format(jobid, procid, gpu)
        ck_name = "{}-{:d}-{:d}".format(jobid, procid, gpu)
    else:
        writer_name = "tb.{}-{:d}-all".format(jobid, procid)
        logger_name = "{}.{}-{:d}-all.search.log".format(config.name, jobid, procid)
        model_name = "{}-{:d}-all-model.pt".format(jobid, procid)
        optimizer_name = "{}-{:d}-all-optimizer.pt".format(jobid, procid)
        misc_name = "{}-{:d}-all-misc.pt".format(jobid, procid)
        ck_name = "{}-{:d}-all".format(jobid, procid)
    writer = SummaryWriter(log_dir=os.path.join(config.path, writer_name))
    # writer.add_text('config', config.as_markdown(), 0)
    logger = get_logger(os.path.join(config.path, logger_name))

    # get cuda device
    device = torch.device('cuda', gpu)

    # ============================== begin ==============================
    logger.info("Logger is set - training start")
    logger.info('Args: {}'.format(args))

    if config.dist_url == "env://" and config.rank == -1:
        config.rank = int(os.environ["RANK"])
    if config.mp_dist:
        # For multiprocessing distributed training, rank needs to be the
        # global rank among all the processes.
        config.rank = config.rank * ngpus_per_node + gpu
    # print('back:{}, dist_url:{}, world_size:{}, rank:{}'.format(config.dist_backend, config.dist_url, config.world_size, config.rank))
    dist.init_process_group(backend=config.dist_backend, init_method=config.dist_url,
                            world_size=config.world_size, rank=config.rank)

    # get data
    corpus = data.Corpus(args.data)
    eval_batch_size = 10
    test_batch_size = 1
    train_data = batchify(corpus.train, args.batch_size, args)
    search_data = batchify(corpus.valid, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    # split data (with respect to GPU id)
    def split_set(set_in):
        per_set_length = set_in.size(0) // config.world_size
        set_out = set_in[per_set_length * config.rank:
                         per_set_length * config.rank + per_set_length]
        return set_out

    train_data = split_set(train_data).to(device)
    search_data = split_set(search_data).to(device)
    val_data = split_set(val_data).to(device)
    test_data = split_set(test_data).to(device)

    if config.dist_privacy:
        logger.info("PRIVACY ENGINE ON")

    # build model
    ntokens = len(corpus.dictionary)
    if args.continue_train:
        model = torch.load(os.path.join(args.save, model_name))
    else:
        genotype = eval("genotypes.%s" % args.arch)
        model = model_aug.RNNModel(ntokens, args.emsize, args.nhid, args.nhidlast,
                                   args.dropout, args.dropouth, args.dropoutx,
                                   args.dropouti, args.dropoute,
                                   cell_cls=model_aug.DARTSCell, genotype=genotype)

    # make model distributed
    if config.gpu is not None:
        torch.cuda.set_device(config.gpu)
        # model = model.to(device)
        model.cuda(config.gpu)
        # When using a single GPU per process and per DistributedDataParallel,
        # we need to divide the batch size ourselves based on the total number
        # of GPUs we have.
        # config.batch_size = int(config.batch_size / ngpus_per_node)
        config.workers = int((config.workers + ngpus_per_node - 1) / ngpus_per_node)
        # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.rank])
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu])
        # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=None, output_device=None)
    else:
        model.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set.
        model = torch.nn.parallel.DistributedDataParallel(model)

    total_params = sum(x.data.nelement() for x in model.parameters())
    logger.info('Model total parameters: {}'.format(total_params))
    logger.info('Genotype: {}'.format(genotype))

    # Loop over epochs.
    lr = args.lr
    best_val_loss = []
    stored_loss = 100000000

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        if args.continue_train:
            optimizer_state = torch.load(os.path.join(args.save, optimizer_name))
            if 't0' in optimizer_state['param_groups'][0]:
                optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0,
                                             lambd=0., weight_decay=args.wdecay)
            else:
                optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                            weight_decay=args.wdecay)
            optimizer.load_state_dict(optimizer_state)
        else:
            optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                        weight_decay=args.wdecay)

        epoch = 1
        while epoch < args.epochs + 1:
            epoch_start_time = time.time()
            try:
                # train()
                train(model, epoch, corpus, train_data, search_data,
                      optimizer, device, logger, writer, args)
            except Exception:
                # Avoid a bare `except:` here so Ctrl + C still reaches the
                # outer KeyboardInterrupt handler.
                logger.info('rolling back to the previous best model ...')
                model = torch.load(os.path.join(args.save, model_name))
                model = model.cuda()
                optimizer_state = torch.load(os.path.join(args.save, optimizer_name))
                if 't0' in optimizer_state['param_groups'][0]:
                    optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0,
                                                 lambd=0., weight_decay=args.wdecay)
                else:
                    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                                weight_decay=args.wdecay)
                optimizer.load_state_dict(optimizer_state)
                epoch = torch.load(os.path.join(args.save, misc_name))['epoch']
                continue

            if 't0' in optimizer.param_groups[0]:
                # ASGD: temporarily swap in the averaged weights ('ax') for evaluation.
                tmp = {}
                for prm in model.parameters():
                    tmp[prm] = prm.data.clone()
                    prm.data = optimizer.state[prm]['ax'].clone()

                val_loss2 = evaluate(model, corpus, args, val_data)
                logger.info('-' * 89)
                logger.info('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                       val_loss2, math.exp(val_loss2)))
                logger.info('-' * 89)

                if val_loss2 < stored_loss:
                    save_checkpoint(model, optimizer, epoch, args.save, dist_name=ck_name)
                    logger.info('Saving Averaged!')
                    stored_loss = val_loss2

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()
            else:
                val_loss = evaluate(model, corpus, args, val_data, eval_batch_size)
                logger.info('-' * 89)
                logger.info('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                       val_loss, math.exp(val_loss)))
                logger.info('-' * 89)

                if val_loss < stored_loss:
                    save_checkpoint(model, optimizer, epoch, args.save, dist_name=ck_name)
                    logger.info('Saving Normal!')
                    stored_loss = val_loss

                if 't0' not in optimizer.param_groups[0] and \
                        (len(best_val_loss) > args.nonmono and
                         val_loss > min(best_val_loss[:-args.nonmono])):
                    logger.info('Switching!')
                    optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0,
                                                 lambd=0., weight_decay=args.wdecay)
                best_val_loss.append(val_loss)
            epoch += 1
    except KeyboardInterrupt:
        logger.info('-' * 89)
        logger.info('Exiting from training early')

    # Load the best saved model.
    model = torch.load(os.path.join(args.save, model_name))
    model = model.cuda()

    # Run on test data.
    test_loss = evaluate(model, corpus, args, test_data, test_batch_size)
    logger.info('=' * 89)
    logger.info('| End of training & Testing | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    logger.info('=' * 89)
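# This worker is written for a one-process-per-GPU launch under SLURM. The
# actual entry point is not part of this excerpt; a hypothetical launcher
# sketch using torch.multiprocessing would look like:
import torch.multiprocessing as mp

def main(config):
    ngpus_per_node = torch.cuda.device_count()
    # world_size starts as the number of nodes; scale it to total processes.
    config.world_size = ngpus_per_node * config.world_size
    # Spawn one process per GPU; each receives its local gpu index as the first argument.
    mp.spawn(worker, nprocs=ngpus_per_node, args=(ngpus_per_node, config))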
import os
import hashlib

fn = 'corpus'
if os.path.exists(fn):
    print('Loading cached dataset...')
    corpus = torch.load(fn)
else:
    print('Producing dataset...')
    corpus = data.Corpus(args.data)
    torch.save(corpus, fn)

eval_batch_size = 10
test_batch_size = 1
train_data, train_rps = batchify(corpus.train, corpus.train_rps, args.batch_size, args)
val_data, val_rps = batchify(corpus.valid, corpus.valid_rps, eval_batch_size, args)
test_data, test_rps = batchify(corpus.test, corpus.test_rps, test_batch_size, args)

print('Args:', args)


def evaluate(data_source, rps, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    criterion = torch.nn.CrossEntropyLoss()
    ntokens = len(corpus.dictionary)
    model.eval()
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
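# Here `batchify` batches the token stream together with an aligned auxiliary
# stream (`*_rps`). A sketch, assuming both tensors are folded the same way as
# in the single-tensor sketch earlier (an assumption; the project's helper may differ):
def batchify(data, rps, bsz, args):
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz).view(bsz, -1).t().contiguous()
    rps = rps.narrow(0, 0, nbatch * bsz).view(bsz, -1).t().contiguous()
    if args.cuda:
        data, rps = data.cuda(), rps.cuda()
    return data, rps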
def _get_online_predictions(self, lines: List[str], types: List[str] = None) -> List[str]:
    """Retrieves predictions by triggering a Google Cloud Function, which
    invokes ml-engine to make a prediction for each line.
    """
    contexts = self._get_line_context(lines, n=CONTEXT_N_LINES)
    instances = []
    for i, line in enumerate(lines):
        context = contexts[i]
        if MAX_LENGTH > 0:
            if len(line) > MAX_LENGTH:
                line = line[:MAX_LENGTH]
                context = ''
            elif (len(line) + len(context)) > MAX_LENGTH:
                context = context[:MAX_LENGTH - len(line)]
            assert (len(line) + len(context)) <= MAX_LENGTH
        instances.append({'inputs': line, 'context': context})
    if self.verbosity > 1:
        raw_instances = instances.copy()
    if LABEL_SPEECHES_ONLY:
        assert types is not None, '`types` must be provided when LABEL_SPEECHES_ONLY == True.'
        assert len(types) == len(lines), \
            f'types must have same length as lines, but {len(types)} != {len(lines)}.'
        speeches = []
        speeches_idx = []
        for i, instance in enumerate(instances):
            if types[i] == 'speech':
                speeches.append(instance)
                speeches_idx.append(i)
        instances = speeches
    if self.verbosity > 0:
        print(f'Making "speaker span" predictions for {len(instances)} lines...')
    problem_class = PROBLEM_CLASSES[PROBLEM]
    problem = problem_class()
    encoders = problem.feature_encoders(data_dir=DATA_DIR)
    instances_b64 = []
    for instance in instances:
        if 'targets' not in instance:
            instance['targets'] = ''
        encoded_instance = problem.encode_example(instance, encoders)
        # encoded_sample.pop('targets')
        # encoded_sample.pop('context')
        serialized_instance = to_example(encoded_instance).SerializeToString()
        instances_b64.append({"b64": base64.b64encode(serialized_instance).decode('utf-8')})
    instances = instances_b64
    preds = []
    batch_generator = batchify(instances, BATCH_SIZE)
    if self.verbosity > 0:
        batch_generator = tqdm(batch_generator,
                               total=np.ceil(len(instances) / BATCH_SIZE).astype(int))
    for batch in batch_generator:
        try:
            # print([len(inst['inputs']) + len(inst['context']) for inst in raw_instances[len(preds):len(preds)+BATCH_SIZE]])
            if LOCAL:
                res = requests.post(LOCAL_URL, data=json.dumps({"instances": batch}),
                                    headers={"Content-Type": "application/json"})
            else:
                res = self._get_cloud_predictions(project=PROJECT, model=MODEL,
                                                  instances=batch, version=VERSION)
            assert res.ok, f'request failed. Reason: {res.reason}.'
            predictions = json.loads(res.content)
            predictions = predictions['predictions']
            for i, pred in enumerate(predictions):
                pred_out = pred['outputs']
                pred_out = encoders['targets'].decode(pred_out)
                # removes spaces.
                pred_out = re.sub(r'\s+', '', pred_out)
                try:
                    eos_idx = pred_out.lower().index('<eos>')
                    pred_out = pred_out[:eos_idx]
                except ValueError:
                    if self.verbosity > 1:
                        logging.warning(f'<eos> not found in prediction: {pred_out}')
                preds.append(pred_out)
                # preds.append([token_pred[0][0] for token_pred in pred['outputs']])
        except AssertionError as e:
            print(e)
            for i in range(len(batch)):
                preds.append(None)
    if LABEL_SPEECHES_ONLY:
        preds_all_lines = []
        for i, line in enumerate(lines):
            pred = 'O' * len(line)
            preds_all_lines.append(pred)
        n_preds = 0
        assert len(speeches_idx) == len(preds)
        for i, idx in enumerate(speeches_idx):
            preds_all_lines[idx] = preds[i]
            n_preds += 1
        # sanity check.
        assert n_preds == len(preds)
        preds = preds_all_lines
    if self.verbosity > 1:
        for i, pred in enumerate(preds):
            instance = raw_instances[i]
            if 'targets' in instance:
                instance.pop('targets')
            if 'label' in instance:
                instance.pop('label')
            print(f'INPUT (len={len(instance["inputs"])}): {instance}\n'
                  f'OUTPUT (len={len(pred) if pred is not None else None}): {pred}')
    return preds
def trainEvalLM(args):
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    if torch.cuda.is_available():
        args.cuda = True

    ntokens = len(corpus.dictionary)
    eval_batch_size = 10
    train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)

    # Build the model and loss function
    model = lmModel.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                             args.nlayers, args.dropout, args.tied,
                             g=args.g, k=args.k)
    criterion = nn.CrossEntropyLoss()
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    # compute network parameters
    params = list(model.parameters())
    total_params = np.sum([np.prod(p.size()) for p in params])
    print('\033[1;32;40mTotal parameters (in million):\033[0m\033[1;31;40m {:0.2f} \033[0m\n'
          .format(total_params / 1e6))

    optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)

    start_epoch = 1
    if args.resume:
        print('Resuming model ...')
        model, criterion, optimizer, start_epoch = model_load(args.resume)
        optimizer.param_groups[0]['lr'] = args.lr
        model.dropout = args.dropout

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        # Create folder for saving model and log files
        args.saveDir += '_' + args.model
        # =====================
        if not os.path.isdir(args.saveDir):
            os.mkdir(args.saveDir)
        save_str = ('nl_' + str(args.nlayers) + '_nh_' + str(args.nhid) +
                    '_g_' + str(args.g) + '_k_' + str(args.k))
        args.save = args.saveDir + '/model_' + save_str + '.pt'

        logFileLoc = args.saveDir + '/logs_' + save_str + '.txt'
        logger = open(logFileLoc, 'w')
        logger.write(str(args))
        logger.write('\n Total parameters (in million): {:0.2f}'.format(total_params / 1e6))
        logger.write('\n\n')
        logger.write("\n%s\t%s\t%s\t%s\t%s" %
                     ('Epoch', 'Loss(Tr)', 'Loss(val)', 'ppl (tr)', 'ppl (val)'))
        logger.flush()

        best_val_loss = []
        stored_loss = 100000000

        # Loop over epochs.
        for epoch in range(start_epoch, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train(args, model, criterion, optimizer, epoch, train_data, ntokens)

            ### TRAIN WITH ASGD
            if 't0' in optimizer.param_groups[0]:
                # Temporarily swap in the ASGD averaged weights ('ax') for evaluation.
                tmp = {}
                for prm in model.parameters():
                    tmp[prm] = prm.data.clone()
                    prm.data = optimizer.state[prm]['ax'].clone()

                val_loss = evaluate(args, model, criterion, val_data, ntokens, eval_batch_size)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                 val_loss, math.exp(val_loss)))
                print('-' * 89)
                logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" %
                             (epoch, train_loss, val_loss,
                              math.exp(train_loss), math.exp(val_loss)))
                logger.flush()

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion, optimizer, epoch)
                    print('Saving Averaged (new best validation)')
                    stored_loss = val_loss

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()
            else:
                val_loss = evaluate(args, model, criterion, val_data, ntokens, eval_batch_size)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                 val_loss, math.exp(val_loss)))
                print('-' * 89)
                logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" %
                             (epoch, train_loss, val_loss,
                              math.exp(train_loss), math.exp(val_loss)))
                logger.flush()

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion, optimizer, epoch)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                if 't0' not in optimizer.param_groups[0] and (
                        len(best_val_loss) > args.nonmono and
                        val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0,
                                                 lambd=0., weight_decay=args.wdecay)
                best_val_loss.append(val_loss)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
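# `model_save` / `model_load` are not shown in this excerpt. A minimal sketch
# consistent with how they are called above (an assumption, not the project's
# exact helpers):
def model_save(path, model, criterion, optimizer, epoch):
    with open(path, 'wb') as f:
        torch.save([model, criterion, optimizer, epoch], f)


def model_load(path):
    with open(path, 'rb') as f:
        model, criterion, optimizer, epoch = torch.load(f)
    return model, criterion, optimizer, epoch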
def get_df(text):
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if args.philly:
        fn = os.path.join(os.environ['PT_OUTPUT_DIR'], fn)
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(data_path, mode=mode)
        torch.save(corpus, fn)
    ntokens = len(corpus.dictionary)

    # initialize the model
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.chunk_size,
                     args.nlayers, args.dropout, args.dropouth, args.dropouti,
                     args.dropoute, args.wdrop, args.tied)
    with open(model_path, "rb") as f:
        model, criterion, optimizer = torch.load(f)

    # prepare data
    eval_batch_size = 10
    test_batch_size = 1
    train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    def idx2text(index):
        global corpus
        text = [corpus.dictionary.idx2word[idx] for idx in index]
        text = " ".join(text)
        return text

    def text2idx(text, mode="chinese"):
        global corpus
        if mode == "chinese":
            idx = [corpus.dictionary.word2idx.get(word, corpus.dictionary.word2idx['K'])
                   for word in text]
        else:
            idx = [corpus.dictionary.word2idx.get(word, corpus.dictionary.word2idx['<unk>'])
                   for word in text.split()]
        return idx

    idx = torch.tensor(text2idx(text, mode=mode)).unsqueeze(dim=-1).cuda()
    # seq_len = idx.size(0)
    hidden = model.init_hidden(args.batch_size)
    hidden = repackage_hidden(hidden)
    output, hidden, distances = model(idx, hidden, return_d=True)

    target_layer = 2
    target_idx = 0
    df = distances[0].cpu().data.numpy()
    target_text = [word for word in texts]
    df = df[target_layer, :, target_idx]
    return df
else:
    # to convert a model trained on cuda to a cpu model
    model = torch.load(f, map_location=lambda storage, loc: storage)
model.eval()

if args.cuda:
    model.cuda()
else:
    model.cpu()

eval_batch_size = 1
seq_len = 20

dictionary = dictionary_corpus.Dictionary(args.data)
vocab_size = len(dictionary)
print("Vocab size", vocab_size)
print("TESTING")

# assuming the mask file contains one number per line indicating the index of the target word
index_col = 0

mask = create_target_mask(args.path + ".text", args.path + ".eval", index_col)
mask_data = batchify(torch.LongTensor(mask), eval_batch_size, args.cuda)
test_data = batchify(dictionary_corpus.tokenize(dictionary, args.path + ".text"),
                     eval_batch_size, args.cuda)

f_output = open(args.path + ".output_" + args.suffix, 'w')
evaluate(test_data, mask_data)
f_output.close()
def main(argv):
    parser = argparse.ArgumentParser(description='WikiText-2 language modeling')
    parser.add_argument('--batch-size', type=int, default=20, metavar='N',
                        help='input batch size for training (default: 20)')
    parser.add_argument('--eval-batch-size', type=int, default=20, metavar='N',
                        help='input batch size for evaluation (default: 20)')
    parser.add_argument('--save-directory', type=str, default='output/wikitext-2',
                        help='output directory')
    parser.add_argument('--model-save-directory', type=str, default='models/',
                        help='model output directory')
    parser.add_argument('--epochs', type=int, default=5, metavar='N',
                        help='number of epochs to train')
    parser.add_argument('--base-seq-len', type=int, default=35, metavar='N',
                        help='base sequence length')
    parser.add_argument('--min-seq-len', type=int, default=35, metavar='N',
                        help='minimum sequence length')
    parser.add_argument('--seq-prob', type=float, default=0.95, metavar='P',
                        help='probability controlling whether the sequence length is halved')
    parser.add_argument('--seq-std', type=int, default=5, metavar='N',
                        help='sequence length std')
    parser.add_argument('--hidden-dim', type=int, default=200, metavar='N',
                        help='hidden dim')
    parser.add_argument('--embedding-dim', type=int, default=200, metavar='N',
                        help='embedding dim')
    parser.add_argument('--lr', type=float, default=20, metavar='LR',
                        help='learning rate')
    parser.add_argument('--weight-decay', type=float, default=2e-6, metavar='W',
                        help='weight decay')
    parser.add_argument('--tag', type=str, default='drop-out-training.pt', metavar='TAG',
                        help='checkpoint file name')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    args = parser.parse_args(argv)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # load dataset
    train_data, val_data, vocabulary = (np.load('./dataset/wiki.train.npy'),
                                        np.load('./dataset/wiki.valid.npy'),
                                        np.load('./dataset/vocab.npy'))
    word_count = len(vocabulary)
    # model = models.RNNModel(word_count, args)
    loss_fn = torch.nn.CrossEntropyLoss()
    checkpoint_path = os.path.join(args.model_save_directory, args.tag)
    if not os.path.exists(checkpoint_path):
        model = models.LSTMModelSingle(word_count, args.embedding_dim, args.hidden_dim)
    else:
        print("Using pre-trained model")
        print("*" * 90)
        model = models.LSTMModelSingle(word_count, args.embedding_dim, args.hidden_dim)
        model.load_state_dict(torch.load(checkpoint_path))
    if args.cuda:
        model = model.cuda()
        loss_fn = loss_fn.cuda()
    '''
    generated = utils.generate(
        model, sequence_length=10, batch_size=2, stochastic=True,
        args=args).data.cpu().numpy()
    utils.print_generated(utils.to_text(preds=generated, vocabulary=vocabulary))
    '''
    print('Model: ', model)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)
    logging = dict()
    logging['loss'] = []
    logging['train_acc'] = []
    logging['val_loss'] = []
    val_data_ = utils.batchify(utils.to_tensor(np.concatenate(val_data)),
                               args.eval_batch_size)
    val_data_loader = utils.custom_data_loader(val_data_, args, evaluation=True)
    # X, y, seq_len = next(val_data_loader)
    # model.eval()
    # hidden = model.init_hidden(args.batch_size)
    # output = model.generate(X, hidden, 10)
    # print('output: ', output.shape)
    model.train()
    for epoch in range(args.epochs):
        epoch_time = time.time()
        np.random.shuffle(train_data)
        train_data_ = utils.batchify(utils.to_tensor(np.concatenate(train_data)),
                                     args.batch_size)
        train_data_loader = utils.custom_data_loader(train_data_, args, evaluation=True)
        val_data_loader = utils.custom_data_loader(val_data_, args, evaluation=True)
        # number of words
        train_size = train_data_.size(0) * train_data_.size(1)
        val_size = val_data_.size(0) * val_data_.size(1)
        n_batchs = len(train_data_)
        n_batchs_val = len(val_data_)
        correct = 0
        epoch_loss = 0
        batch_index = 0
        seq_len = 0
        counter = 0
        hidden = model.init_hidden(args.batch_size)
        while batch_index < n_batchs - 1:
            # optimizer.zero_grad()
            X, y, seq_len = next(train_data_loader)
            # print('X: ', X.shape, 'y: ', y.shape)
            hidden = repackage_hidden(hidden)
            # out, hidden = model(X, hidden)
            model.zero_grad()
            out, hidden = model(X, hidden)
            loss = loss_fn(out.view(-1, word_count), y)
            loss.backward()
            # scale lr with respect to the size of the seq_len
            # utils.adjust_learning_rate(optimizer, args, seq_len)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
            for p in model.parameters():
                p.data.add_(-args.lr, p.grad.data)
            # optimizer.step()
            # utils.adjust_learning_rate(optimizer, args, args.base_seq_len)
            epoch_loss += loss.data.sum()
            batch_index += seq_len
            if counter % 200 == 0 and counter != 0:
                print('|batch {:3d}|train loss {:5.2f}|'.format(counter, epoch_loss / counter))
            counter += 1
        train_loss = epoch_loss / counter
        val_loss = validate(model, val_data_loader, loss_fn, n_batchs_val, word_count)
        logging['loss'].append(train_loss)
        logging['val_loss'].append(val_loss)
        utils.save_model(model, checkpoint_path)
        print('=' * 83)
        print('|epoch {:3d}|time: {:5.2f}s|valid loss {:5.2f}|'
              'train loss {:8.2f}'.format(epoch + 1, (time.time() - epoch_time),
                                          val_loss, train_loss))
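# Note: the per-parameter update `p.data.add_(-args.lr, p.grad.data)` above is
# plain SGD applied by hand; the Adam optimizer constructed earlier is never
# stepped inside the loop. An equivalent formulation using torch.optim.SGD
# (a sketch, not what the script currently does) would be:
#
#     optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
#                                 weight_decay=args.weight_decay)
#     ...
#     optimizer.zero_grad()
#     loss.backward()
#     torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
#     optimizer.step()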
def main():
    parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data', type=str, default='data/penn/',
                        help='location of the data corpus')
    parser.add_argument('--model', type=str, default='LSTM',
                        help='type of recurrent net (LSTM, QRNN, GRU)')
    parser.add_argument('--emsize', type=int, default=400,
                        help='size of word embeddings')
    parser.add_argument('--nhid', type=int, default=1150,
                        help='number of hidden units per layer')
    parser.add_argument('--nlayers', type=int, default=3,
                        help='number of layers')
    parser.add_argument('--lr', type=float, default=30,
                        help='initial learning rate')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--epochs', type=int, default=8000,
                        help='upper epoch limit')
    parser.add_argument('--max-steps-per-epoch', type=int, default=-1,
                        help='upper limit on steps per epoch')
    parser.add_argument('--batch-size', type=int, default=80, metavar='N',
                        help='batch size')
    parser.add_argument('--bptt', type=int, default=70,
                        help='sequence length')
    parser.add_argument('--warmup', type=int, default=4000,
                        help='warmup for learning rate')
    parser.add_argument('--cooldown', type=int, default=None,
                        help='cooldown for learning rate')
    parser.add_argument('--accumulate', type=int, default=1,
                        help='number of batches to accumulate before gradient update')
    parser.add_argument('--dropout', type=float, default=0.4,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--dropouth', type=float, default=0.3,
                        help='dropout for rnn layers (0 = no dropout)')
    parser.add_argument('--dropouti', type=float, default=0.65,
                        help='dropout for input embedding layers (0 = no dropout)')
    parser.add_argument('--dropoute', type=float, default=0.1,
                        help='dropout to remove words from embedding layer (0 = no dropout)')
    parser.add_argument('--wdrop', type=float, default=0.0,
                        help='amount of weight dropout to apply to the RNN hidden to hidden matrix')
    parser.add_argument('--seed', type=int, default=1111,
                        help='random seed')
    parser.add_argument('--nonmono', type=int, default=5,
                        help='number of recent epochs without improvement before switching to ASGD')
    parser.add_argument('--cuda', action='store_false',
                        help='use CUDA')
    parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                        help='report interval')
    randomhash = ''.join(str(time.time()).split('.'))
    parser.add_argument('--save', type=str, default=randomhash + '.pt',
                        help='path to save the final model')
    parser.add_argument('--alpha', type=float, default=2,
                        help='alpha L2 regularization on RNN activation (alpha = 0 means no regularization)')
    parser.add_argument('--beta', type=float, default=1,
                        help='beta slowness regularization applied on RNN activation (beta = 0 means no regularization)')
    parser.add_argument('--wdecay', type=float, default=1.2e-6,
                        help='weight decay applied to all weights')
    parser.add_argument('--resume', type=str, default='',
                        help='path of model to resume')
    parser.add_argument('--optimizer', type=str, default='sgd',
                        help='optimizer to use (sgd, adam)')
    parser.add_argument('--when', nargs="+", type=int, default=[-1],
                        help='When (which epochs) to divide the learning rate by 10 - accepts multiple')
    parser.add_argument('--discard-highest-losses', type=float, default=0.0,
                        help='discard highest percentage of prediction losses before executing an optimizer step')
    parser.add_argument('--enlarge-model-every-n-epochs', type=int, default=-1,
                        help='enlarge model (hidden and embedding dims) after every n epochs')
    args = parser.parse_args()
    args.tied = True

    # Set the random seed manually for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    ###############################################################################
    # Load data
    ###############################################################################

    import os
    import hashlib
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    eval_batch_size = min(100, args.batch_size)
    print('Eval batch size of', eval_batch_size)
    test_batch_size = 8
    train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    ###############################################################################
    # Build the model
    ###############################################################################

    from splitcross import SplitCrossEntropyLoss
    criterion = None

    ntokens = len(corpus.dictionary)
    print('Total number of tokens:', ntokens)
    #model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.BoomRNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    if args.enlarge_model_every_n_epochs <= 0:
        model = SHARNN(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                       args.dropout, args.dropouth, args.dropouti, args.dropoute,
                       args.wdrop, args.tied)
    else:
        model = None
    #model = model.AttnRNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.RecAttn(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.LNRNN(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.LNRR(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    ###
    splits = []
    if ntokens > 500000:
        # One Billion Word benchmark
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)

    if model is not None:
        if args.resume and args.epochs > 0:
            print('Resuming model ...')
            criterion = model_load(args.resume, model)
            #optimizer.param_groups[0]['lr'] = args.lr
            model.dropouti, model.dropouth, model.dropout, args.dropoute = \
                args.dropouti, args.dropouth, args.dropout, args.dropoute
            #if args.wdrop:
            #    from weight_drop import WeightDrop
            #    for rnn in model.rnns:
            #        if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
            #        elif rnn.zoneout > 0: rnn.zoneout = args.wdrop
        ###
        if not criterion:
            criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
        ###
        if args.cuda:
            model = model.cuda()
            criterion = criterion.cuda()
        if False:  # or args.jit:
            print('Jitting ...')
            model.eval()
            model.lmr = torch.jit.trace(
                model.lmr,
                (torch.rand([args.bptt, args.batch_size, args.emsize]).cuda(),
                 torch.rand([1, args.batch_size, args.emsize]).cuda()))
            #model = torch.jit.trace_module(model, torch.zeros((args.bptt, args.batch_size), dtype=torch.long))
    ###

    ###############################################################################
    # Training code
    ###############################################################################

    # Loop over epochs.
    #lr = args.lr
    best_val_loss = []
    stored_loss = 100000000

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        if model is not None:
            model, optimizer, params = init_optimizer(args, model, criterion)
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            discard_highest_losses = args.discard_highest_losses * (
                args.epochs - epoch + 1) / args.epochs
            if args.enlarge_model_every_n_epochs > 0 and \
                    (epoch - 1) % args.enlarge_model_every_n_epochs == 0:
                prev_model = model
                current_factor = (args.enlarge_model_every_n_epochs + epoch - 1) / \
                    (args.enlarge_model_every_n_epochs + args.epochs)
                emsize = int(args.emsize * current_factor)
                nhid = int(args.nhid * current_factor)
                print(f'enlarge model: emsize={emsize}, nhid={nhid} '
                      f'(discard_highest_losses={discard_highest_losses})')
                model = SHARNN(args.model, ntokens, emsize, nhid, args.nlayers,
                               args.dropout, args.dropouth, args.dropouti,
                               args.dropoute, args.wdrop, args.tied)
                criterion = SplitCrossEntropyLoss(emsize, splits=splits, verbose=False)
                if args.cuda:
                    model = model.cuda()
                    criterion = criterion.cuda()
                if prev_model is not None:
                    model.load_from_smaller_and_freeze(prev_model)
                model, optimizer, params = init_optimizer(args, model, criterion)
            train(model, optimizer, criterion, args, train_data, params,
                  epoch=epoch - 1, max_steps=args.max_steps_per_epoch,
                  discard_highest_losses=discard_highest_losses)
            if 't0' in optimizer.param_groups[0]:
                # ASGD: evaluate with the averaged weights ('ax'), then restore.
                tmp = {}
                for prm in model.parameters():
                    tmp[prm] = prm.data.clone()
                    prm.data = optimizer.state[prm]['ax'].clone()

                val_loss2 = evaluate(model, criterion, args, val_data)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                          epoch, (time.time() - epoch_start_time), val_loss2,
                          math.exp(val_loss2), val_loss2 / math.log(2)))
                print('-' * 89)

                if val_loss2 < stored_loss:
                    model_save(args.save, model, criterion)
                    print('Saving Averaged!')
                    stored_loss = val_loss2

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()
            else:
                val_loss = evaluate(model, criterion, args, val_data, eval_batch_size)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                          epoch, (time.time() - epoch_start_time), val_loss,
                          math.exp(val_loss), val_loss / math.log(2)))
                print('-' * 89)

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                if args.optimizer == 'sgd' and 't0' not in optimizer.param_groups[0] and \
                        (len(best_val_loss) > args.nonmono and
                         val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr,
                                                 t0=0, lambd=0., weight_decay=args.wdecay)

                if epoch in args.when:
                    print('Saving model before learning rate decreased')
                    model_save('{}.e{}'.format(args.save, epoch), model, criterion)
                    print('Dividing learning rate by 10')
                    optimizer.param_groups[0]['lr'] /= 10.

                best_val_loss.append(val_loss)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Load the best saved model.
    criterion = model_load(args.save, model)

    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in params if x.size())
    print('Model total parameters:', total_params)

    # Run on test data.
    test_loss = evaluate(model, criterion, args, test_data, test_batch_size)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'
          .format(test_loss, math.exp(test_loss), test_loss / math.log(2)))
    print('=' * 89)
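# `init_optimizer` is not shown in this excerpt. A plausible sketch based on
# how it is called above (hypothetical; the real helper may differ):
def init_optimizer(args, model, criterion):
    params = list(model.parameters()) + list(criterion.parameters())
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)
    return model, optimizer, params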
import os
import hashlib
fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
if os.path.exists(fn):
    print('Loading cached dataset...')
    corpus = torch.load(fn)
else:
    print('Producing dataset...')
    corpus = data.Corpus(args.data)
    torch.save(corpus, fn)

eval_batch_size = 10
test_batch_size = 1
train_data = batchify(corpus.train, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

from splitcross import SplitCrossEntropyLoss
criterion = None

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                       args.dropout, args.dropouth, args.dropouti, args.dropoute,
                       args.wdrop, args.tied)
###
    vocabdict = json.load(f)
    vocabdict = {k: int(v) for k, v in vocabdict.items()}
corpus = Corpus(datafiles,
                maxlen=args.maxlen,
                vocab_size=args.vocab_size,
                lowercase=args.lowercase,
                vocab=vocabdict)

# save arguments
ntokens = len(corpus.dictionary.word2idx)
print("Vocabulary Size: {}".format(ntokens))
args.ntokens = ntokens

eval_batch_size = 100
en_data = batchify(corpus.data[args.corpus_name], eval_batch_size, shuffle=False)
print(len(en_data))
print("Loaded data!")

model_args, idx2word, autoencoder, gan_gen, gan_disc = load_models(
    args.outf, args.epochs, twodecoders=True)

if args.cuda:
    autoencoder = autoencoder.cuda()
    gan_gen = gan_gen.cuda()
    gan_disc = gan_disc.cuda()

one = to_gpu(args.cuda, torch.FloatTensor([1]))
mone = one * -1
parser.add_argument('--theta', type=float, default=0.6625523432485668,
                    help='mix between uniform distribution and pointer softmax distribution over previous words')
parser.add_argument('--lambdasm', type=float, default=0.12785920428335693,
                    help='linear mix between only pointer (1) and only vocab (0) distribution')
args = parser.parse_args()

###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)

eval_batch_size = 1
test_batch_size = 1
#train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, test_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
criterion = nn.CrossEntropyLoss()


def one_hot(idx, size, cuda=True):
    a = np.zeros((1, size), np.float32)
    a[0][idx] = 1
    v = Variable(torch.from_numpy(a))
    if cuda:
        v = v.cuda()
    return v
def train(source, target, encoder, decoder, lr, conf):
    """
    ----------
    @params
    source: list of list, sequences of source language
    target: list of list, sequences of target language
    encoder: Encoder, object of encoder in NMT
    decoder: Decoder, object of decoder in NMT
    lr: float, learning rate
    conf: Config, wraps anything needed
    ----------
    """
    encoder.train()
    decoder.train()
    enc_opt = optim.Adam(encoder.parameters(), lr=lr)
    dec_opt = optim.Adam(decoder.parameters(), lr=lr)
    loss_fn = nn.NLLLoss()
    total_loss = 0
    for batch, (x, x_len, y, mask) in enumerate(utils.batchify(
            source, target, conf.stride, conf.batch_size, True)):
        enc_opt.zero_grad()
        dec_opt.zero_grad()
        loss = 0
        x = x[:, 1:]  # skip <SOS>
        batch_size, src_len = x.shape
        x = Variable(torch.LongTensor(x.tolist()), volatile=False)
        y = Variable(torch.LongTensor(y.tolist()))
        enc_h = encoder.init_hidden(batch_size)
        if conf.cuda:
            x = x.cuda()
            y = y.cuda()
            enc_h = enc_h.cuda()
        encoder_out, enc_h = encoder(x, enc_h, x_len - 1)
        # use last forward hidden state in encoder
        dec_h = enc_h[:decoder.n_layers]
        #dec_h = decoder.init_hidden(enc_h)
        target_len = y.size(1)
        decoder_input = y[:, 0:1]
        # Scheduled sampling
        use_teacher_forcing = random.random() < conf.teaching_ratio
        if use_teacher_forcing:
            for i in range(1, target_len):
                decoder_out, dec_h = decoder(decoder_input, dec_h)
                loss += utils.loss_in_batch(decoder_out, y[:, i], mask[:, i], loss_fn)
                decoder_input = y[:, i:i + 1]
        else:
            for i in range(1, target_len):
                decoder_out, dec_h = decoder(decoder_input, dec_h)
                loss += utils.loss_in_batch(decoder_out, y[:, i], mask[:, i], loss_fn)
                topv, topi = decoder_out.data.topk(1)
                ni = topi[:, :1]
                decoder_input = Variable(torch.LongTensor(ni.tolist()))
                if conf.cuda:
                    decoder_input = decoder_input.cuda()
        total_loss += loss.data[0]
        loss /= batch_size
        loss.backward()
        enc_opt.step()
        dec_opt.step()
    return total_loss / len(source)
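# `utils.loss_in_batch` is not defined in this excerpt. A minimal sketch of a
# mask-aware per-step loss consistent with its call signature above (an
# assumption, not the project's exact helper):
def loss_in_batch(output, target, mask, loss_fn):
    # Sum the loss over non-padded positions of this time step only.
    loss = 0
    for i in range(output.size(0)):
        if mask[i] == 1:
            loss += loss_fn(output[i].unsqueeze(0), target[i].unsqueeze(0))
    return loss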
parser.add_argument('--reward', type=int, default=80)
parser.add_argument('--mu0', type=int, default=1)  # mean of the Gaussian distribution
parser.add_argument('--sigma0', type=int, default=1)  # variance of the Gaussian distribution
parser.add_argument('--sigma_tilde', type=int, default=1)
args = parser.parse_args()

np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.set_device(args.gpu)
    # Let cuDNN's auto-tuner search for the most efficient algorithms for the
    # current configuration (non-deterministic, but faster).
    cudnn.benchmark = True
    cudnn.enabled = True

corpus = data.Corpus(data_path)
train_data = batchify(corpus.train, train_batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, test_batch_size)

n_tokens = len(corpus.dictionary)
model = RNNModel(n_tokens, embed_size, n_hid, n_hid_last,
                 dropout, dropout_h, dropout_x, dropout_i, dropout_e,
                 cell_cls=DARTSCell)
parallel_model = model.cuda()
# ************** CREATE DATASET, MODEL AND OPTIMIZER ******************
bpe = yttm.BPE(model=args.bpe_path)
TEXT = torchtext.data.Field(tokenize=lambda x: utils.bpe_tokenize(x, bpe), lower=True)
train_txt, val_txt, test_txt = utils.get_datasets(args.dataset).splits(TEXT)
print('Dataset fetched')

TEXT.build_vocab(train_txt)
vocab_size = len(TEXT.vocab.stoi)
print(f"Unique tokens in vocabulary: {len(TEXT.vocab)}")

device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
print(f'device={device}')

train_data = utils.batchify(train_txt, TEXT, args.batch_size, device)
val_data = utils.batchify(val_txt, TEXT, args.batch_size, device)

layernorm = not args.nolayernorm
model = transformer.LMTransformer(vocab_size,
                                  args.dmodel,
                                  args.nheads,
                                  args.dff,
                                  args.nlayers,
                                  args.dropout,
                                  tie_embeddings=args.tie_embeddings,
                                  dfa=args.dfa,
                                  no_training=args.no_training,
                                  dfa_after_vocab=args.dfa_after_vocab,
                                  dfa_embed=args.dfa_embed,
                                  attn=args.attention,
def __init__(self, args, dataset):
    """Constructor for training algorithm.

    Args:
        args: From command line, picked up by `argparse`.
        dataset: Currently only `data.text.Corpus` is supported.

    Initializes:
        - Data: train, val and test.
        - Model: shared and controller.
        - Inference: optimizers for shared and controller parameters.
        - Criticism: cross-entropy loss for training the shared model.
    """
    self.args = args
    self.controller_step = 0
    self.cuda = args.cuda
    self.dataset = dataset
    self.epoch = 0
    self.shared_step = 0
    self.start_epoch = 0

    # best_evaluated_dag on the validation set
    self.best_evaluated_dag = None
    self.best_ppl = np.inf
    self.best_epoch = None

    logger.info('regularizing:')
    for regularizer in [('activation regularization',
                         self.args.activation_regularization),
                        ('temporal activation regularization',
                         self.args.temporal_activation_regularization),
                        ('norm stabilizer regularization',
                         self.args.norm_stabilizer_regularization)]:
        if regularizer[1]:
            logger.info(f'{regularizer[0]}')

    self.train_data = utils.batchify(dataset.train, args.batch_size, self.cuda)
    # NOTE(brendan): The validation set data is batchified twice
    # separately: once for computing rewards during the Train Controller
    # phase (valid_data, batch size == 64), and once for evaluating ppl
    # over the entire validation set (eval_data, batch size == 1)
    self.valid_data = utils.batchify(dataset.valid, args.batch_size, self.cuda)
    self.eval_data = utils.batchify(dataset.valid, args.test_batch_size, self.cuda)
    self.test_data = utils.batchify(dataset.test, args.test_batch_size, self.cuda)

    self.max_length = self.args.shared_rnn_max_length

    if args.use_tensorboard:
        self.tb = TensorBoard(args.model_dir)
    else:
        self.tb = None
    self.build_model()

    if self.args.load_path:
        self.load_model()

    shared_optimizer = _get_optimizer(self.args.shared_optim)
    controller_optimizer = _get_optimizer(self.args.controller_optim)

    self.shared_optim = shared_optimizer(
        self.shared.parameters(),
        lr=self.shared_lr,
        weight_decay=self.args.shared_l2_reg)

    self.controller_optim = controller_optimizer(
        self.controller.parameters(),
        lr=self.args.controller_lr)

    self.ce = nn.CrossEntropyLoss()
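# `_get_optimizer` maps a command-line optimizer name to a torch.optim class.
# A minimal sketch consistent with the calls above (an assumption; the actual
# helper may differ):
def _get_optimizer(name):
    if name.lower() == 'sgd':
        return torch.optim.SGD
    if name.lower() == 'adam':
        return torch.optim.Adam
    raise NotImplementedError('Unknown optimizer: {}'.format(name))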
    print("Error: input files directory does not exist")
    exit(1)

params = {}
params["embedding_size"] = args.embedding_size
params["rnn_size"] = args.rnn_size
params["rnn_layers"] = args.rnn_layers
params["dropout"] = args.dropout
params["use_gpu"] = use_gpu
params["sequence_length"] = args.sequence_length
params["batch_size"] = args.batch_size

train, valid, test, word2idx = load_data(args.data_dir)
params["vocab_size"] = len(word2idx)
train_batches = utils.batchify(train, args.sequence_length, args.batch_size, word2idx)
valid_batches = utils.batchify(valid, args.sequence_length, args.batch_size, word2idx)
test_batches = utils.batchify(test, args.sequence_length, args.batch_size, word2idx)

# define loss, model and optimization
model = LSTMLM(params)
if use_gpu:
    print("CUDA found!")
    model.cuda()
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# model summary
print(str(model))

# training
def __init__(self, save_path, seed, batch_size, grad_clip, config='eval'):
    if config == 'search':
        args = {
            'emsize': 300,
            'nhid': 300,
            'nhidlast': 300,
            'dropoute': 0,
            'wdecay': 5e-7
        }
    elif config == 'eval':
        args = {
            'emsize': 850,
            'nhid': 850,
            'nhidlast': 850,
            'dropoute': 0.1,
            'wdecay': 8e-7
        }
    args['config'] = config
    args['data'] = '/home/liamli4465/darts/data/penn'
    args['lr'] = 20
    args['clip'] = grad_clip
    args['batch_size'] = batch_size
    args['search_batch_size'] = 256 * 4
    args['small_batch_size'] = batch_size
    args['bptt'] = 35
    args['dropout'] = 0.75
    args['dropouth'] = 0.25
    args['dropoutx'] = 0.75
    args['dropouti'] = 0.2
    args['seed'] = seed
    args['nonmono'] = 5
    args['log_interval'] = 50
    args['save'] = save_path
    args['alpha'] = 0
    args['beta'] = 1e-3
    args['max_seq_length_delta'] = 20
    args['unrolled'] = True
    args['gpu'] = 0
    args['cuda'] = True
    args = AttrDict(args)
    self.args = args
    self.seed = seed

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    cudnn.enabled = True
    torch.cuda.manual_seed_all(args.seed)

    corpus = data.Corpus(args.data)
    self.corpus = corpus

    eval_batch_size = 10
    test_batch_size = 1

    self.train_data = batchify(corpus.train, args.batch_size, args)
    self.search_data = batchify(corpus.valid, args.search_batch_size, args)
    self.val_data = batchify(corpus.valid, eval_batch_size, args)
    self.test_data = batchify(corpus.test, test_batch_size, args)

    self.batch = 0
    self.steps = 0
    self.epochs = 0
    self.total_loss = 0
    self.start_time = time.time()

    ntokens = len(corpus.dictionary)
    # if args.continue_train:
    #     model = torch.load(os.path.join(args.save, 'model.pt'))
    try:
        model = torch.load(os.path.join(args.save, 'model.pt'))
        print('Loaded model from checkpoint')
    except Exception as e:
        print(e)
        model = RNNModel(ntokens, args.emsize, args.nhid, args.nhidlast,
                         args.dropout, args.dropouth, args.dropoutx,
                         args.dropouti, args.dropoute,
                         genotype=genotypes.DARTS)

    size = 0
    for p in model.parameters():
        size += p.nelement()
    logging.info('param size: {}'.format(size))
    logging.info('initial genotype:')
    logging.info(model.rnns[0].genotype)

    total_params = sum(x.data.nelement() for x in model.parameters())
    logging.info('Args: {}'.format(args))
    logging.info('Model total parameters: {}'.format(total_params))

    self.model = model.cuda()
    self.optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                     weight_decay=args.wdecay)
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed_all(args.seed)

###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)

eval_batch_size = 10
test_batch_size = 1
train_data_src, train_data_trg = batchify(corpus.train_src, corpus.train_trg,
                                          args.batch_size, args)
val_data_src, val_data_trg = batchify(corpus.valid_src, corpus.valid_trg,
                                      eval_batch_size, args)
# test data is the same as valid data; we just use a different batch size
test_data_src, test_data_trg = batchify(corpus.valid_src, corpus.valid_trg,
                                        test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
if args.continue_train:  # probably needs to be fixed
    model = torch.load(os.path.join(args.save, 'model.pt'))
    print("Loaded existing model.")
forget_gates = {}
input_gates = {}
output_gates = {}
cell_states = {}
hidden_states = {}
relevant_labels = {}

with open(input_path) as input_file:
    for line in input_file:
        sentence, labels = parse_line(line)
        tokenized_data = corpus.safe_tokenize_sentence(sentence.strip())
        batch_size = 1
        input_data = batchify(tokenized_data, batch_size, False)
        gate_data, outputs = evaluate(input_data, batch_size)
        for lyr, gates in enumerate(gate_data):
            if lyr not in forget_gates:
                forget_gates[lyr] = []
            if lyr not in input_gates:
                input_gates[lyr] = []
            if lyr not in output_gates:
                output_gates[lyr] = []
            if lyr not in cell_states:
                cell_states[lyr] = []
            if lyr not in hidden_states:
                hidden_states[lyr] = []
            if lyr not in relevant_labels:
                relevant_labels[lyr] = []
args = parser.parse_args()

scope_autoencoder = 'autoencoder'
scope_critic = 'critic'
scope_generator = 'generator'

corpus = Corpus(args.data_path,
                maxlen=args.maxlen,
                vocab_size=args.vocab_size,
                lowercase=True)

# Prepare data
ntokens = len(corpus.dictionary.word2idx)
args.ntokens = ntokens
test_data = batchify(corpus.test, args.batch_size, args.maxlen, shuffle=False)
train_data = batchify(corpus.train, args.batch_size, args.maxlen, shuffle=False)

tf.reset_default_graph()

# Build graph
fixed_noise = tf.Variable(
    tf.random_normal(shape=(args.batch_size, args.z_size),
                     mean=0.0, stddev=1.0, dtype=tf.float32))

with tf.variable_scope(scope_autoencoder):
        model, criterion, optimizer = torch.load(f)

import os
import hashlib
fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
if os.path.exists(fn):
    print('Loading cached dataset...')
    corpus = torch.load(fn)
else:
    print('Producing dataset...')
    corpus = data.Corpus(args.data)
    torch.save(corpus, fn)

eval_batch_size = 10
test_batch_size = 1
train_data = batchify(corpus.train, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

from splitcross import SplitCrossEntropyLoss
criterion = None

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                       args.dropout, args.dropouth, args.dropouti, args.dropoute,
                       args.wdrop, args.tied)
###
if args.resume:
    print('Resuming model ...')
# dumping vocabulary
with open('{}/vocab.json'.format(args.outf), 'w') as f:
    json.dump(corpus.dictionary.word2idx, f)

# save arguments
ntokens = len(corpus.dictionary.word2idx)
print("Vocabulary Size: {}".format(ntokens))
args.ntokens = ntokens
with open('{}/args.json'.format(args.outf), 'w') as f:
    json.dump(vars(args), f)
with open("{}/log.txt".format(args.outf), 'w') as f:
    f.write(str(vars(args)))
    f.write("\n\n")

eval_batch_size = 100
test1_data = batchify(corpus.data['valid1'], eval_batch_size, shuffle=False)
test2_data = batchify(corpus.data['valid2'], eval_batch_size, shuffle=False)
train1_data = batchify(corpus.data['train1'], args.batch_size, shuffle=True)
train2_data = batchify(corpus.data['train2'], args.batch_size, shuffle=True)
print("Loaded data!")

###############################################################################
# Build the models
###############################################################################

ntokens = len(corpus.dictionary.word2idx)
autoencoder = Seq2Seq2Decoder(emsize=args.emsize,
                              nhidden=args.nhidden,
                              ntokens=ntokens,
                              nlayers=args.nlayers,
                    maxlen=args.maxlen,
                    vocab_size=args.vocab_size,
                    lowercase=args.lowercase,
                    load_vocab=cur_dir + '/vocab.json')
else:
    corpus = Corpus(args.data_path,
                    maxlen=args.maxlen,
                    vocab_size=args.vocab_size,
                    lowercase=args.lowercase)

eval_batch_size = 10

if not args.convolution_enc:
    args.packed_rep = True

train_data = batchify(corpus.train, args.batch_size, args.maxlen,
                      packed_rep=args.packed_rep, shuffle=True)

corpus_test = SNLIDataset(
    train=False,
    vocab_size=41578,
    reset_vocab="/home/ddua/data/arae/output/example/1504200881/vocab.json")
testloader = torch.utils.data.DataLoader(corpus_test, batch_size=10,
                                         collate_fn=collate_snli, shuffle=False)
test_data = iter(testloader)

classifier1 = Baseline_Embeddings(100, maxlen=10, gpu=True, vocab_size=41578)
classifier1.load_state_dict(torch.load("/home/ddua/data/snli/baseline/model_emb.pt"))
# load conditional information
test_C = np.load('data/test_weight-YAGO.npy')
train_C = np.load('data/train_weight-YAGO.npy')
test_C = preprocessing.normalize(test_C, norm='l2')
train_C = preprocessing.normalize(train_C, norm='l2')

test_data, test_c = batchify_C(corpus.test, test_C, eval_batch_size, shuffle=False)
train_data, train_c = batchify_C(corpus.train, train_C, args.batch_size, shuffle=False)
test_final = batchify(test_C, len(test_C), shuffle=False)
print("Loaded data!")

###############################################################################
# Build the models
###############################################################################

ntokens = len(corpus.dictionary.word2idx)
autoencoder = Seq2Seq(emsize=args.emsize,
                      nhidden=args.nhidden,
                      ntokens=ntokens,
                      nlayers=args.nlayers,
                      noise_radius=args.noise_radius,
                      hidden_init=args.hidden_init,
                      dropout=args.dropout,
# dumping vocabulary
with open('./output/{}/vocab.json'.format(args.outf), 'w') as f:
    json.dump(corpus.dictionary.word2idx, f)

# save arguments
ntokens = len(corpus.dictionary.word2idx)
print("Vocabulary Size: {}".format(ntokens))
args.ntokens = ntokens
with open('./output/{}/args.json'.format(args.outf), 'w') as f:
    json.dump(vars(args), f)
with open("./output/{}/logs.txt".format(args.outf), 'w') as f:
    f.write(str(vars(args)))
    f.write("\n\n")

eval_batch_size = 10
test_data = batchify(corpus.test, eval_batch_size, shuffle=False)
train_data = batchify(corpus.train, args.batch_size, shuffle=True)
print("Loaded data!")

###############################################################################
# Build the models
###############################################################################

ntokens = len(corpus.dictionary.word2idx)
autoencoder = Seq2Seq(emsize=args.emsize,
                      nhidden=args.nhidden,
                      ntokens=ntokens,
                      nlayers=args.nlayers,
                      noise_radius=args.noise_radius,
                      hidden_init=args.hidden_init,
                    help='linear mix between only pointer (1) and only vocab (0) distribution')
# ThinkNet params
add_tn_params(parser)
args = parser.parse_args()

###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)

eval_batch_size = 1
test_batch_size = 1
#train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, test_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
criterion = nn.CrossEntropyLoss()


def one_hot(idx, size, cuda=True):
    a = np.zeros((1, size), np.float32)
    a[0][idx] = 1
    v = Variable(torch.from_numpy(a))
    if cuda:
        v = v.cuda()
    return v
if args.philly:
    fn = os.path.join(os.environ['PT_OUTPUT_DIR'], fn)
if os.path.exists(fn):
    tools.print_log(args.save, 'Loading cached dataset...')
    corpus = torch.load(fn)
else:
    tools.print_log(args.save, 'Producing dataset...')
    corpus = data.Corpus(args.data)
    torch.save(corpus, fn)

# Generate data
eval_batch_size = 10
test_batch_size = 1
train_data = batchify(corpus.train, args.batch_size, args)  # tensor (46479 x 20); 929589 total words / 887521 tokens
val_data = batchify(corpus.valid, eval_batch_size, args)    # 7376 x 10 / 70390
test_data = batchify(corpus.test, test_batch_size, args)    # 82430 x 1 / 78669 (total tokens) + 3761 ('eos')
if args.debug:
    train_data = train_data[:50]
    val_data = val_data[:50]
    test_data = test_data[:50]

###############################################################################
# Build the model
###############################################################################

criterion = None