def prepare_data(data_path, w2v_path, vocab_path, make_vocab=True,
                 elmo_w2v_path=None, elmo_pca=False):
    [anchor, check, label, anchor_len, check_len] = data_utils.read_data(
        data_path, "train", cut_tool, data_clearner_api, "tab")
    if make_vocab:
        dic = data_utils.make_dic(anchor + check)
        if not elmo_w2v_path:
            data_utils.read_pretrained_embedding(w2v_path, dic, vocab_path,
                                                 min_freq=3)
        else:
            data_utils.read_pretrained_elmo_embedding(
                w2v_path, dic, vocab_path, min_freq=3,
                elmo_embedding_path=elmo_w2v_path, elmo_pca=elmo_pca)
    if sys.version_info < (3,):
        embedding_info = pkl.load(open(vocab_path, "rb"))
    else:
        embedding_info = pkl.load(open(vocab_path, "rb"),
                                  encoding="iso-8859-1")
    return [anchor, check, label, anchor_len, check_len, embedding_info]
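# Usage sketch (not part of the original module; all paths below are
# hypothetical placeholders): prepare_data returns the tokenized pairs plus
# the embedding_info dict produced by read_pretrained_embedding.
if __name__ == "__main__":
    [anchor, check, label, anchor_len, check_len, embedding_info] = prepare_data(
        data_path="/path/to/train.tsv",        # hypothetical path
        w2v_path="/path/to/w2v.pkl",           # hypothetical path
        vocab_path="/path/to/emb_mat.pkl",     # hypothetical path
        make_vocab=True)
    print(len(anchor), "sentence pairs,",
          len(embedding_info["token2id"]), "tokens in vocabulary")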
def test(args, vocab_size):
    device = t.device('cuda') if args.use_gpu else t.device('cpu')
    beam_size = args.beam_size
    topk = args.topk
    rev_model = args.load_model_path
    model = Seq2Seq(embed_size=args.embed_size,
                    enc_dec_output_size=args.enc_dec_output_size,
                    attn_size=args.attn_size,
                    num_layers=args.num_layers,
                    bidirectional=args.bidirectional,
                    use_gpu=args.use_gpu,
                    vocab_size=vocab_size).to(device)
    assert rev_model is not None
    # load the saved model checkpoint
    rev_path = os.path.join(model_dir, rev_model)
    if os.path.exists(rev_path):
        print('read in model from', rev_path)
        model.load(load_path=rev_path)

    batch_size = args.batch_size
    test_set = Set(read_data(args.test_data_root))
    test_loader = Loader(test_set, batch_size, shuffle=False,
                         use_gpu=args.use_gpu,
                         num_workers=args.num_workers).loader

    model.eval()
    with t.no_grad():
        recorder.epoch_start(0, 'test', len(test_set))
        for batch_id, batch in enumerate(test_loader):
            encoder_inputs, seq_len, decoder_inputs, weights = batch
            encoder_inputs = encoder_inputs.to(device)
            seq_len = seq_len.to(device)
            decoder_inputs = decoder_inputs.to(device)
            weights = weights.to(device)
            logits, output_symbols = model(
                encoder_inputs, seq_len, decoder_inputs[:, :-1],
                mode='test', max_len=args.max_len,
                beam_search=args.beam_size > 1,
                beam_size=args.beam_size, topk=args.topk)
            nll_loss = compute_loss(logits, decoder_inputs[:, 1:], weights)
            ppl = perplexity(nll_loss)
            recorder.batch_end(batch_id, batch_size, nll_loss, ppl)
            recorder.log_text(encoder_inputs.tolist(),
                              decoder_inputs[:, 1:].tolist(),
                              output_symbols.tolist())
        recorder.epoch_end()
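# Sketch of the loss helpers used above (assumption: the real compute_loss and
# perplexity are defined elsewhere in the repo; this is only a plausible
# masked-NLL version in which `weights` zeroes out padding positions).
import torch
import torch.nn.functional as F

def compute_loss(logits, targets, weights):
    # logits: (batch, seq_len, vocab); targets, weights: (batch, seq_len)
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    # average negative log-likelihood over the non-padding tokens
    return (nll * weights).sum() / weights.sum().clamp(min=1)

def perplexity(nll_loss):
    # perplexity is the exponential of the average per-token NLL
    return torch.exp(nll_loss)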
def main(argv):
    # random.seed(21)  # so we get the same partition every time
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Read in data from .pickle as a list of (features, label) tuples,
    # each representing a zipcode datapoint.
    data_and_labels = data_utils.read_data()
    need = np.sum([1 if pair[1] == 0 else 0 for pair in data_and_labels])
    print('need', need)

    # Oversample
    # data_and_labels = data_utils.oversample(data_and_labels)
    # data_and_labels = data_utils.undersample(data_and_labels)

    # Standardize the data.
    x_data = [x[0] for x in data_and_labels]
    y_data = [x[1] for x in data_and_labels]
    scaler = preprocessing.StandardScaler()
    x_data = scaler.fit_transform(x_data)

    # New dataset that is standardized.
    data = [(x_data[i], y_data[i]) for i in range(len(x_data))]

    # Separate 80/10/10 as train/val/test partition.
    data_size = len(data)
    random.shuffle(data)
    train_data = data[:(data_size // 10) * 8]
    val_data = data[(data_size // 10) * 8:(data_size // 10) * 9]
    test_data = data[(data_size // 10) * 9:]
    train_data = data_utils.oversample_train(train_data)
    print(len(train_data), 'training points.')
    print(len(val_data), 'validation points.')
    print(len(test_data), 'testing points.')

    input_dim = len(data[0][0])  # number of features
    output_dim = 2               # two classes: food desert and not food desert
    hidden_dim_list = [16, 36, 36, 24]
    model_nn = FoodDesertClassifier(input_dim, hidden_dim_list,
                                    output_dim).to(device)
    loss = optimize_nn(model_nn, train_data, val_data, test_data)
    eval_model_nn(model_nn, loss, test_data, "Testing")
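# Sketch of what data_utils.oversample_train could look like (assumption: the
# real helper lives in data_utils; this version simply duplicates random
# minority-class points until the two classes are balanced).
import random

def oversample_train(train_data):
    positives = [d for d in train_data if d[1] == 1]
    negatives = [d for d in train_data if d[1] == 0]
    minority, majority = sorted([positives, negatives], key=len)
    if not minority:
        return train_data
    extra = [random.choice(minority)
             for _ in range(len(majority) - len(minority))]
    balanced = train_data + extra
    random.shuffle(balanced)
    return balanced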
def prepare_data(data_path, w2v_path, vocab_path):
    [anchor, check, label, anchor_len, check_len] = data_utils.read_data(
        data_path, "train", cut_tool, data_clearner_api, "tab")
    dic = data_utils.make_dic(anchor + check)
    data_utils.read_pretrained_embedding(w2v_path, dic, vocab_path, min_freq=3)
    if sys.version_info < (3,):
        embedding_info = pkl.load(open(vocab_path, "rb"))
    else:
        embedding_info = pkl.load(open(vocab_path, "rb"),
                                  encoding="iso-8859-1")
    return [anchor, check, label, anchor_len, check_len, embedding_info]
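# Sketch of the vocabulary builder used above (assumption: data_utils.make_dic
# is defined elsewhere in the repo; a plausible version just counts token
# frequencies over the tokenized sentences so that read_pretrained_embedding
# can apply its min_freq cutoff).
from collections import Counter

def make_dic(sentences):
    dic = Counter()
    for tokens in sentences:
        dic.update(tokens)
    return dic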
def train(args, vocab_size):
    # opt._parse(kwarg)
    print('enter train func')
    device = t.device('cuda') if args.use_gpu else t.device('cpu')
    model = Seq2Seq(embed_size=args.embed_size,
                    enc_dec_output_size=args.enc_dec_output_size,
                    attn_size=args.attn_size,
                    num_layers=args.num_layers,
                    bidirectional=args.bidirectional,
                    use_gpu=args.use_gpu,
                    vocab_size=vocab_size).to(device)
    print('Model structure')
    print(model)
    print('The model has %d parameters' % count_parameters(model))

    # Resume from a saved checkpoint if one is given.
    start_epoch = 1
    last_epoch = -1
    if args.load_model_path is not None:
        rev_path = os.path.join(model_dir, args.load_model_path)
        if os.path.exists(rev_path):
            print('read in model from', rev_path)
            last_epoch = model.load(load_path=rev_path, return_list=['epoch'])[0]
            start_epoch = last_epoch + 1

    optimizer = Adam(model.parameters(), lr=args.lr)
    if args.scheduler_type == 'exponential':
        scheduler = lr_scheduler.ExponentialLR(optimizer,
                                               gamma=args.exponential_lr_decay,
                                               last_epoch=last_epoch)
    elif args.scheduler_type == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=args.step_size,
                                        gamma=args.step_lr_decay)

    print('read in data')
    # read the data
    batch_size = args.batch_size
    train_set = Set(read_data(args.train_data_root))
    valid_set = Set(read_data(args.valid_data_root))
    # build the dataloaders
    train_loader = Loader(train_set, batch_size, shuffle=True,
                          use_gpu=args.use_gpu,
                          num_workers=args.num_workers).loader
    valid_loader = Loader(valid_set, batch_size, shuffle=False,
                          use_gpu=args.use_gpu,
                          num_workers=args.num_workers).loader
    # data statistics
    print('data scale:')
    print('train data:', len(train_set), "batch_nums:", len(train_loader))
    print('valid data:', len(valid_set), "batch_nums:", len(valid_loader))

    # train
    print('start training...')
    epochs = args.max_epoch
    for epoch in range(start_epoch, epochs + 1):
        model.train()
        # record at the start of the epoch
        recorder.epoch_start(epoch, 'train', len(train_set))
        if args.scheduler_type is not None:
            print(epoch, 'lr={:.10f}'.format(scheduler.get_lr()[0]))
        for batch_id, batch in enumerate(train_loader):
            encoder_inputs, seq_len, decoder_inputs, weights = batch
            encoder_inputs = encoder_inputs.to(device)
            seq_len = seq_len.to(device)
            decoder_inputs = decoder_inputs.to(device)
            weights = weights.to(device)
            optimizer.zero_grad()
            # Third argument: the last token of the longest sentence is EOS_I,
            # which is never needed as a decoder input, so dropping it saves
            # a little computation.
            logits, output_symbols = model(
                encoder_inputs, seq_len, decoder_inputs[:, :-1],
                mode='train', max_len=None,
                teacher_forcing_ratio=args.teacher_forcing_ratio)
            # compute the loss
            nll_loss = compute_loss(logits, decoder_inputs[:, 1:], weights)
            # compute the perplexity
            ppl = perplexity(nll_loss)
            # backpropagate and update the parameters
            nll_loss.backward()
            # small trick to mitigate exploding gradients
            nn.utils.clip_grad_norm_(model.parameters(), args.max_gradient_norm)
            optimizer.step()
            recorder.batch_end(batch_id, batch_size, nll_loss, ppl)
        if args.scheduler_type is not None:
            scheduler.step()
        recorder.epoch_end()

        # save the model
        if epoch % 5 == 0:
            model.save(os.path.join(
                model_dir,
                f'{args.project}_{datetime.datetime.now().strftime("%y_%m_%d_%H:%M:%S")}_{nll_loss.item()}_{ppl.item()}'
            ), epoch=epoch)

        # after each training epoch, compute loss and perplexity on the validation set
        model.eval()
        with t.no_grad():
            recorder.epoch_start(epoch, 'eval', len(valid_set))
            for batch_id, batch in enumerate(valid_loader):
                encoder_inputs, seq_len, decoder_inputs, weights = batch
                encoder_inputs = encoder_inputs.to(device)
                seq_len = seq_len.to(device)
                decoder_inputs = decoder_inputs.to(device)
                weights = weights.to(device)
                logits, output_symbols = model(
                    encoder_inputs, seq_len, decoder_inputs[:, :-1],
                    mode='eval', max_len=args.max_len,
                    beam_search=args.beam_size > 1,
                    beam_size=args.beam_size, topk=args.topk)
                nll_loss = compute_loss(logits, decoder_inputs[:, 1:], weights)
                ppl = perplexity(nll_loss)
                recorder.batch_end(batch_id, batch_size, nll_loss, ppl)
                recorder.log_text(encoder_inputs.tolist(),
                                  decoder_inputs[:, 1:].tolist(),
                                  output_symbols.tolist())
            recorder.epoch_end()
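# Sketch of count_parameters as used in train() above (assumption: the real
# helper is defined elsewhere in the repo; counting trainable parameters this
# way is the standard PyTorch idiom).
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)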
parser.add_argument("--z_dim", type=int, default=32) parser.add_argument("--seq_len", type=int, default=10) parser.add_argument("--epochs", type=int, default=100) parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--prt_evry", type=int, default=1) parser.add_argument("--save_evry", type=int, default=10) parser.add_argument("--lr", type=float, default=1e-3) config = parser.parse_args() device = th.device('cuda' if th.cuda.is_available() else 'cpu') dir_name = mk_dir(config.data + 'experiment') print(config, "DEVICE", device) data = read_data('data/pianorolls/{}.pkl'.format(config.data)) train_data, test_data = data2seq(data=data, split='train', seq_len=config.seq_len) if config.model == "VRNN": model = VRNN(config, device) else: print("NotImplementedERROR") model.to(device) epoch = 0 while (epoch < config.epochs):
w2v_path = "/data/xuht/Chinese_w2v/sgns.merge.char/sgns.merge.char.pkl" # vocab_path = "/data/xuht/duplicate_sentence/ChineseSTSCorpus/emb_mat.pkl" vocab_path = "/data/xuht/duplicate_sentence/LCQMC/emb_mat.pkl" data_clearner_api = data_clean.DataCleaner({}) cut_tool = data_utils.cut_tool_api() import time [train_anchor, train_check, train_label, train_anchor_len, train_check_len] = data_utils.read_data(train_data_path, "train", cut_tool, data_clearner_api, "tab") dic = data_utils.make_dic(train_anchor+train_check) data_utils.read_pretrained_embedding(w2v_path, dic, vocab_path, min_freq=3) if sys.version_info < (3, ): embedding_info = pkl.load(open(os.path.join(vocab_path), "rb")) else: embedding_info = pkl.load(open(os.path.join(vocab_path), "rb"), encoding="iso-8859-1") token2id = embedding_info["token2id"] id2token = embedding_info["id2token"] embedding_mat = embedding_info["embedding_matrix"]