def train_main(args): """ Trains model with specified args. """ # Load text with open(args.text_path) as f: text = f.read() logger.info("corpus length: %s.", len(text)) # Restore model from checkpoint or build model if args.restore: load_path = args.checkpoint_path if args.restore is True else args.restore model = load_model(load_path) logger.info("model restored: %s.", load_path) else: model = build_model(batch_size=args.batch_size, seq_len=args.seq_len, vocab_size=VOCAB_SIZE, embedding_size=args.embedding_size, rnn_size=args.rnn_size, num_layers=args.num_layers, drop_rate=args.drop_rate, learning_rate=args.learning_rate, clip_norm=args.clip_norm) # Make and clear checkpoint directory model.save(args.checkpoint_path) logger.info("model saved: %s.", args.checkpoint_path) callbacks = [ ModelCheckpoint(args.checkpoint_path, verbose=1, save_best_only=False), LoggerCallback(text, model) ] # Split data into training and validation training_fraction = 0.95 split = int(round(len(text) * training_fraction)) text_train = text[:split] text_validation = text[split:] # Start training num_batches = (len(text_train) - 1) // (args.batch_size * args.seq_len) val_batches = (len(text_validation) - 1) // (args.batch_size * args.seq_len) model.reset_states() model.fit_generator(batch_generator(encode_text(text_train), args.batch_size, args.seq_len, one_hot_labels=True), num_batches, args.num_epochs, callbacks=callbacks, validation_data=batch_generator( encode_text(text_validation), args.batch_size, args.seq_len, one_hot_labels=True), validation_steps=val_batches) return model
def get_data(in_file, out_file):
    """Load a parallel zh->en corpus and encode it into id-sequence samples.

    :param in_file: path to the Chinese side of the corpus
    :param out_file: path to the English side of the corpus
    :return: list of ``{'in': ids, 'out': ids}`` pairs; pairs that are too
        long or contain unknown tokens are dropped
    """
    print('getting data {}->{}...'.format(in_file, out_file))
    with open(in_file, 'r', encoding='utf-8') as file:
        in_lines = file.readlines()
    with open(out_file, 'r', encoding='utf-8') as file:
        out_lines = file.readlines()
    samples = []
    for i in tqdm(range(len(in_lines))):
        # Chinese side: segment with jieba, then map tokens to ids.
        zh_sentence = in_lines[i].strip()
        in_data = encode_text(src_char2idx, jieba.cut(zh_sentence.strip()))
        # English side: tokenize, normalize, and wrap with <sos>/<eos> ids.
        en_sentence = out_lines[i].strip().lower()
        en_tokens = [normalizeString(tok.strip())
                     for tok in nltk.word_tokenize(en_sentence)]
        out_data = [Config.sos_id] + encode_text(tgt_char2idx, en_tokens) + [Config.eos_id]
        # Keep only pairs within the configured length caps (maxlen_in /
        # maxlen_out are hyperparameters) that are fully in-vocabulary.
        fits = len(in_data) < Config.maxlen_in and len(out_data) < Config.maxlen_out
        known = Config.unk_id not in in_data and Config.unk_id not in out_data
        if fits and known:
            samples.append({'in': in_data, 'out': out_data})
    return samples
def main():
    """Filter, shuffle, split, and id-encode the raw dump into train/test sets."""
    print('Load raw data')
    raw = utils.load_dumped('../data/raw/dump.txt')
    print('Filter text')
    # Each dump record is a tuple; index 1 holds the text payload.
    texts = [utils.filter_text(record[1]) for record in tqdm(raw)]
    # Deterministic shuffle (fixed seed) of record indices.
    order = np.arange(len(texts))
    np.random.seed(19)
    np.random.shuffle(order)
    n_test = int(0.1 * len(order))
    print('Split into train/test')
    # First 10% of the shuffled order becomes test, the rest train.
    test = "".join(texts[i] for i in tqdm(order[:n_test]))
    train = "".join(texts[i] for i in tqdm(order[n_test:]))
    vocab = utils.generate_vocab()
    with open('../data/processed/vocab.json', 'w') as fout:
        json.dump(vocab, fout)
    print('Encoding test')
    test = utils.encode_text(test, vocab)
    np.save('../data/processed/test', test)
    print('Encoding train')
    train = utils.encode_text(train, vocab)
    np.save('../data/processed/train', train)
def get_data(in_file):
    """Load a '|'-delimited parallel corpus and encode it into samples.

    Each line is expected to look like ``input sentence|output sentence``.
    Lines without a ``'|'`` separator are skipped (previously they raised
    IndexError on ``sentences[1]``).

    :param in_file: path to the corpus file
    :return: list of ``{'in': ids, 'out': ids}`` pairs; pairs that are too
        long or contain unknown tokens are dropped
    """
    print('getting data {}...'.format(in_file))
    with open(in_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    samples = []
    for line in lines:
        sentences = line.split('|')
        if len(sentences) < 2:
            # Malformed line: no separator -- skip instead of crashing.
            continue
        in_sentence = sentences[0].strip()
        out_sentence = sentences[1].strip()
        in_data = encode_text(char2idx, in_sentence)
        # Target sequence is wrapped with start/end-of-sequence markers.
        out_data = [Config.sos_id] + encode_text(char2idx, out_sentence) + [Config.eos_id]
        # Keep only pairs within the configured length caps that are fully
        # in-vocabulary (no unk ids on either side).
        if (len(in_data) < Config.maxlen_in
                and len(out_data) < Config.maxlen_out
                and Config.unk_id not in in_data
                and Config.unk_id not in out_data):
            samples.append({'in': in_data, 'out': out_data})
    return samples
def get_data(in_file, out_file):
    """Build id-encoded zh->en samples from two parallel corpus files.

    :param in_file: Chinese corpus path
    :param out_file: English corpus path
    :return: list of ``{'in': ids, 'out': ids}`` pairs
    """
    print('getting data {}->{}...'.format(in_file, out_file))
    with open(in_file, 'r', encoding='utf-8') as file:
        in_lines = file.readlines()
    with open(out_file, 'r', encoding='utf-8') as file:
        out_lines = file.readlines()
    samples = []
    for idx in tqdm(range(len(in_lines))):
        # Chinese source: jieba segmentation, then ids.
        source = in_lines[idx].strip()
        in_data = encode_text(src_char2idx, jieba.cut(source.strip()))
        # English target: tokenize, normalize, add <sos>/<eos> markers.
        target = out_lines[idx].strip().lower()
        en_tokens = [normalizeString(tok.strip())
                     for tok in nltk.word_tokenize(target)]
        out_data = [sos_id] + encode_text(tgt_char2idx, en_tokens) + [eos_id]
        # Filter: length caps and no unknown tokens on either side.
        short_enough = len(in_data) < maxlen_in and len(out_data) < maxlen_out
        no_unknowns = unk_id not in in_data and unk_id not in out_data
        if short_enough and no_unknowns:
            samples.append({'in': in_data, 'out': out_data})
    return samples
def build_samples():
    """Build id-encoded zh->en samples for the train and valid splits.

    Reads the pre-built word maps, encodes each sentence pair, filters by
    length and unknown tokens, and dumps the result to
    ``data/samples_train.json`` / ``data/samples_valid.json``.
    """
    word_map_zh = json.load(open('data/WORDMAP_zh.json', 'r'))
    word_map_en = json.load(open('data/WORDMAP_en.json', 'r'))
    for usage in ['train', 'valid']:
        # Pick the split-specific input files and output filename.
        if usage == 'train':
            translation_path_en = os.path.join(train_translation_folder,
                                               train_translation_en_filename)
            translation_path_zh = os.path.join(train_translation_folder,
                                               train_translation_zh_filename)
            filename = 'data/samples_train.json'
        else:
            translation_path_en = os.path.join(valid_translation_folder,
                                               valid_translation_en_filename)
            translation_path_zh = os.path.join(valid_translation_folder,
                                               valid_translation_zh_filename)
            filename = 'data/samples_valid.json'
        print('loading {} texts and vocab'.format(usage))
        with open(translation_path_en, 'r') as f:
            data_en = f.readlines()
        with open(translation_path_zh, 'r') as f:
            data_zh = f.readlines()
        print('building {} samples'.format(usage))
        samples = []
        for idx in tqdm(range(len(data_en))):
            # Chinese side: jieba segmentation -> word ids.
            sentence_zh = data_zh[idx].strip()
            seg_list = jieba.cut(sentence_zh)
            input_zh = encode_text(word_map_zh, list(seg_list))
            # English side: tokenize, normalize, drop tokens that normalize
            # to the empty string.
            sentence_en = data_en[idx].strip().lower()
            tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)
                      if len(normalizeString(s)) > 0]
            output_en = encode_text(word_map_en, tokens)
            # Keep only pairs within max_len that contain no unknown tokens.
            if len(input_zh) <= max_len and len(output_en) <= max_len \
                    and UNK_token not in input_zh and UNK_token not in output_en:
                samples.append({
                    'input': list(input_zh),
                    'output': list(output_en)
                })
        with open(filename, 'w') as f:
            json.dump(samples, f, indent=4)
        print('{} {} samples created at: {}.'.format(len(samples), usage, filename))
def generate_text(model, seed, length=512, top_n=10):
    """Generate ``length`` characters from a trained stateful model.

    The RNN state is primed on the seed characters, then one character is
    sampled at a time from the ``top_n`` most likely choices.
    """
    print("generating {} characters from top {} choices.".format(length, top_n),
          file=sys.stderr)
    print('generating with seed: "{}".'.format(seed), file=sys.stderr)
    generated = seed
    ids = utils.encode_text(seed)
    model.reset_states()
    # Warm up the RNN state on every seed character except the last;
    # the predictions themselves are discarded.
    for seed_id in ids[:-1]:
        model.predict(np.array([[seed_id]]))  # input shape: (1, 1)
    current = ids[-1]
    for _ in range(length):
        probs = model.predict(np.array([[current]]))  # (1, 1, vocab_size)
        current = sample_from_probs(probs.squeeze(), top_n)
        generated += utils.ID2CHAR[current]
    return generated
def generate_text(model, sess, seed, length=512, top_n=10):
    """Generate ``length`` characters from a trained TF model.

    Args:
        model: dict of graph tensors ("X", "input_state", "output_state",
            "probs", ...) as built elsewhere in this file.
        sess: a tf.Session with the model's weights loaded.
        seed: seed character sequence used to prime the RNN state.
        length: number of characters to generate.
        top_n: sample only from the top-n most probable characters.

    Returns:
        The seed followed by the generated characters.
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = encode_text(seed)
    # Feed all but the last seed character in one pass purely to compute
    # the RNN state; predictions for these positions are not needed.
    x = np.expand_dims(encoded[:-1], 0)  # input shape: [1, seq_len]
    # get rnn state due to seed sequence
    state = sess.run(model["output_state"], feed_dict={model["X"]: x})
    next_index = encoded[-1]
    for i in range(length):
        # One character at a time, threading the state back in each step.
        x = np.array([[next_index]])  # input shape: [1, 1]
        feed_dict = {model["X"]: x, model["input_state"]: state}
        probs, state = sess.run([model["probs"], model["output_state"]],
                                feed_dict=feed_dict)
        # output shape: [1, 1, vocab_size]
        next_index = sample_from_probs(probs.squeeze(), top_n)
        # append to sequence
        generated += ID2CHAR[next_index]
    logger.info("generated text: \n%s\n", generated)
    return generated
def generate_text(model, seed, length=512, top_n=10):
    """Generate ``length`` characters from a trained Chainer model.

    The predictor's recurrent state is primed on the seed, then characters
    are sampled one at a time from the ``top_n`` most likely choices.
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = encode_text(seed).astype(np.int32)
    model.predictor.reset_state()
    # Inference mode: disable train-time behavior and gradient bookkeeping.
    with chainer.using_config("train", False), chainer.no_backprop_mode():
        for idx in encoded[:-1]:
            x = Variable(np.array([idx]))  # input shape: [1]
            # Forward pass only to set the internal recurrent state.
            model.predictor(x)
        next_index = encoded[-1]
        for i in range(length):
            x = Variable(np.array([next_index], dtype=np.int32))  # input shape: [1]
            probs = F.softmax(model.predictor(x))  # output shape: [1, vocab_size]
            next_index = sample_from_probs(probs.data.squeeze(), top_n)
            # append to sequence
            generated += ID2CHAR[next_index]
    logger.info("generated text: \n%s\n", generated)
    return generated
def generate_text(model, seed, length=512, top_n=2):
    """Generate text from a trained stateful model, stopping at sentence end.

    Unlike the open-ended variants, generation terminates early: a sampled
    '.', '!' or '?' is appended and then generation stops, while a sampled
    newline stops generation WITHOUT being appended. ``length`` is therefore
    an upper bound on the number of generated characters.
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = encode_text(seed)
    model.reset_states()
    # Prime the stateful RNN on all seed characters except the last.
    for idx in encoded[:-1]:
        x = np.array([[idx]])  # Input shape: (1, 1)
        # Set internal states; prediction output is discarded.
        model.predict(x)
    next_index = encoded[-1]
    for i in range(length):
        x = np.array([[next_index]])  # Input shape: (1, 1)
        probs = model.predict(x)  # Output shape: (1, 1, vocab_size)
        next_index = sample_from_probs(probs.squeeze(), top_n)
        # Sentence-ending punctuation is kept, then generation stops.
        if ID2CHAR[next_index] in [".", "!", "?"]:
            generated += ID2CHAR[next_index]
            break
        # A newline terminates generation and is dropped.
        elif ID2CHAR[next_index] == "\n":
            break
        generated += ID2CHAR[next_index]
    logger.info("generated text: \n%s\n", generated)
    return generated
def analyze_after_init(comment, encoder, voc):
    """Run a single comment through the encoder and return predicted labels.

    :param comment: raw comment text to analyze
    :param encoder: trained encoder model
    :param voc: vocabulary object providing ``word2index``
    :return: dict with the original 'content' and the predicted 'labels'
    """
    # Wrap the raw comment with an all-zero 20-element placeholder label vector.
    sample = {'content': comment, 'label_tensor': [0] * 20}
    result = [{'content': sample['content']}]
    # Segment the stripped comment with jieba and map tokens to vocab ids.
    tokens = list(jieba.cut(sample['content'].strip()))
    input_indexes = encode_text(voc.word2index, tokens)
    pair_batch = [(input_indexes, sample['label_tensor'])]
    # Collate into tensors and move them onto the configured device.
    input_variable, lengths, _ = batch2TrainData(pair_batch)
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    # Forward pass; argmax over dim 1 selects the most likely class.
    outputs = encoder(input_variable, lengths)
    _, outputs = torch.max(outputs, 1)
    print('outputs.size(): ' + str(outputs.size()))
    # Bring predictions back to host memory as numpy.
    outputs = outputs.cpu().numpy()
    # Shift class indices back by 2 -- presumably the dataset's label
    # offset; verify against the training-side encoding.
    result[0]['labels'] = (outputs[0] - 2).tolist()
    return result[0]
def generate_text(model, seed, length=512, top_n=10):
    """Generate ``length`` characters from a trained MXNet/Gluon model.

    The RNN state is primed on the seed in one pass, then characters are
    sampled one at a time from the ``top_n`` most likely choices.
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = mx.nd.array(encode_text(seed))
    seq_len = encoded.shape[0]
    # All but the last seed character, fed as a single [seq_len, 1] batch.
    x = F.expand_dims(encoded[:seq_len - 1], 1)  # input shape: [seq_len, 1]
    state = model.begin_state()
    # get rnn state due to seed sequence (output logits are discarded)
    _, state = model(x, state)
    next_index = encoded[seq_len - 1].asscalar()
    for i in range(length):
        x = mx.nd.array([[next_index]])  # input shape: [1, 1]
        logit, state = model(x, state)  # output shape: [1, vocab_size]
        probs = F.softmax(logit)
        next_index = sample_from_probs(probs.asnumpy().squeeze(), top_n)
        # append to sequence
        generated += ID2CHAR[next_index]
    logger.info("generated text: \n%s\n", generated)
    return generated
def __init__(self, text, batch_size=64, seq_len=64):
    """Set up a batch iterator over ``text`` plus epoch/iteration counters.

    :param text: raw corpus string to iterate over
    :param batch_size: sequences per batch
    :param seq_len: characters per sequence
    """
    encoded = encode_text(text).astype(np.int32)
    self.data_iterator = batch_generator(encoded, batch_size, seq_len)
    # Each batch consumes batch_size * seq_len characters; the -1 accounts
    # for labels being the input shifted by one character.
    self.num_batches = (len(text) - 1) // (batch_size * seq_len)
    self.iteration = 0
    self.epoch = 0
    self.is_new_epoch = True
def generate_text(model, seed, length=512, top_n=None):
    """Generate ``length`` characters from a trained stateful model.

    The RNN state is primed on the seed characters; each subsequent
    character is sampled from the ``top_n`` most likely choices (``None``
    is forwarded to ``sample_from_probs`` unchanged).
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    ids = encode_text(seed, get_CHAR2ID())
    model.reset_states()
    # Prime the internal RNN state on all but the final seed character;
    # the predictions themselves are discarded.
    for seed_id in ids[:-1]:
        model.predict(np.array([[seed_id]]))  # input shape: (1, 1)
    current = ids[-1]
    for _ in range(length):
        probs = model.predict(np.array([[current]]))  # (1, 1, vocab_size)
        current = sample_from_probs(probs.squeeze(), top_n)
        generated += get_ID2CHAR()[current]
    logger.info("generated text: \n%s\n", generated)
    return generated
def generate_text(model, seed, length=512, top_n=10):
    """Generate ``length`` characters from a trained PyTorch model.

    NOTE(review): this uses the pre-0.4 PyTorch API (``Variable`` with
    ``volatile=True``, ``.data[0]``); on modern PyTorch ``volatile`` is
    ignored and ``torch.no_grad()`` is the replacement -- confirm the
    pinned torch version before modernizing.
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = encode_text(seed)
    # volatile=True: old-style inference mode (no autograd graph).
    encoded = Variable(torch.from_numpy(encoded), volatile=True)
    model.eval()
    # Feed all but the last seed character in one pass to build the state.
    x = encoded[:-1].unsqueeze(1)  # input shape: [seq_len, 1]
    state = model.init_state()
    # get rnn state due to seed sequence (predictions discarded)
    _, state = model.predict(x, state)
    next_index = encoded[-1:]
    for i in range(length):
        x = next_index.unsqueeze(1)  # input shape: [1, 1]
        probs, state = model.predict(x, state)
        # output shape: [1, 1, vocab_size]
        next_index = sample_from_probs(probs.squeeze(), top_n)
        # append to sequence
        generated += ID2CHAR[next_index.data[0]]
    logger.info("generated text: \n%s\n", generated)
    return generated
def get_data(in_file):
    """Encode context/question pairs from ``in_file`` into id-sequence samples.

    :param in_file: dataset path handed to ``getCQpair``
    :return: list of ``{'in': context_ids, 'out': question_ids}`` pairs,
        dropping pairs that exceed the length caps
    """
    contexts, questions = getCQpair(in_file)
    samples = []
    for i in tqdm(range(len(contexts))):
        # Context: tokenize and map to vocabulary ids.
        context_tokens = [tok.strip() for tok in nltk.word_tokenize(contexts[i])]
        in_data = encode_text(word2idx_dict, context_tokens)
        # Question: tokenize, map to ids, wrap with <sos>/<eos> markers.
        question_tokens = [tok.strip() for tok in nltk.word_tokenize(questions[i])]
        out_data = [sos_id] + encode_text(word2idx_dict, question_tokens) + [eos_id]
        within_limits = len(in_data) < maxlen_in and len(out_data) < maxlen_out
        if within_limits:
            samples.append({'in': in_data, 'out': out_data})
    return samples
def train_main(args): """ trains model specfied in args. main method for train subcommand. """ # load text text = load_text(args.text_path) if args.test_path: test_text = load_text(args.test_path) else: test_text = None # load or build model if args.restore: load_path = args.checkpoint_path if args.restore is True else args.restore model = load_model(load_path) logger.info("model restored: %s.", load_path) else: model = build_model(batch_size=args.batch_size, seq_len=args.seq_len, vocab_size=get_VOCAB_SIZE(), embedding_size=args.embedding_size, rnn_size=args.rnn_size, num_layers=args.num_layers, drop_rate=args.drop_rate, learning_rate=args.learning_rate, clip_norm=args.clip_norm) # make and clear checkpoint directory log_dir = make_dirs(args.checkpoint_path, empty=True) model.save(args.checkpoint_path) logger.info("model saved: %s.", args.checkpoint_path) # callbacks callbacks = [ ModelCheckpoint(args.checkpoint_path, verbose=1, save_best_only=False), TensorBoard(log_dir, write_graph=True, embeddings_freq=1, embeddings_metadata={ "embedding_1": os.path.abspath(os.path.join("data", "id2char.tsv")) }), LoggerCallback(text, test_text, model, args.checkpoint_path) ] # training start num_batches = (len(text) - 1) // (args.batch_size * args.seq_len) model.reset_states() model.fit_generator(batch_generator(encode_text(text, get_CHAR2ID()), args.batch_size, args.seq_len, one_hot_labels=True), num_batches, args.num_epochs, callbacks=callbacks) return model
def mp_func(data_en, data_zh, manager_d, index):
    """Multiprocess worker: encode one shard of parallel data into samples.

    The encoded samples are stored under ``manager_d[index]`` (a shared
    manager dict) and the dict is also returned.
    """
    samples = []
    for i in tqdm(range(len(data_en))):
        # Chinese source: jieba segmentation -> vocabulary ids.
        zh_sentence = data_zh[i].strip()
        input_zh = encode_text(word_map_zh, list(jieba.cut(zh_sentence)))
        # English target: tokenize, normalize, drop tokens that normalize
        # to the empty string.
        en_sentence = data_en[i].strip().lower()
        en_tokens = [normalizeString(tok)
                     for tok in nltk.word_tokenize(en_sentence)
                     if len(normalizeString(tok)) > 0]
        output_en = encode_text(word_map_en, en_tokens)
        # Keep only pairs within max_len with no unknown tokens.
        within_len = len(input_zh) <= max_len and len(output_en) <= max_len
        no_unk = UNK_token not in input_zh and UNK_token not in output_en
        if within_len and no_unk:
            samples.append({
                'input': list(input_zh),
                'output': list(output_en)
            })
    manager_d[index] = samples
    return manager_d
def train_main(args):
    """Train a character-level RNN language model (TensorFlow 1.x).

    Main method for the train subcommand: builds (or restores) a training
    graph and a separate inference graph, trains epoch by epoch while
    logging to TensorBoard, checkpoints after every epoch, and samples
    text from the latest checkpoint with the inference graph.

    Returns:
        The train_model dict of graph handles.
    """
    # load text
    with open(args.text_path) as f:
        text = f.read()
    logger.info("corpus length: %s.", len(text))
    # restore or build model args; on restore, hyperparameters come from
    # the JSON saved next to the checkpoint, not from the CLI.
    if args.restore:
        load_path = args.checkpoint_path if args.restore is True else args.restore
        with open("{}.json".format(args.checkpoint_path)) as f:
            model_args = json.load(f)
        logger.info("model restored: %s.", load_path)
    else:
        load_path = None
        model_args = {
            "batch_size": args.batch_size,
            "vocab_size": VOCAB_SIZE,
            "embedding_size": args.embedding_size,
            "rnn_size": args.rnn_size,
            "num_layers": args.num_layers,
            "p_keep": 1 - args.drop_rate,  # dropout expressed as keep prob
            "learning_rate": args.learning_rate,
            "clip_norm": args.clip_norm
        }
    # build train model in its own graph
    train_graph = tf.Graph()
    with train_graph.as_default():
        train_model = build_model(**model_args)
    with tf.Session(graph=train_graph) as train_sess:
        # restore or initialise model weights
        if load_path is not None:
            train_model["saver"].restore(train_sess, load_path)
            logger.info("model weights restored: %s.", load_path)
        else:
            train_sess.run(train_model["init_op"])
        # clear checkpoint directory
        log_dir = make_dirs(args.checkpoint_path, empty=True)
        # save model args alongside the checkpoint so restore can rebuild
        # the graph identically.
        with open("{}.json".format(args.checkpoint_path), "w") as f:
            json.dump(train_model["args"], f, indent=2)
        checkpoint_path = train_model["saver"].save(train_sess,
                                                    args.checkpoint_path)
        logger.info("model saved: %s.", checkpoint_path)
        # tensorboard logger
        summary_writer = tf.summary.FileWriter(log_dir, train_sess.graph)
        # embeddings visualisation
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = "EmbedSequence/embeddings"
        embedding.metadata_path = os.path.abspath(
            os.path.join("data", "id2char.tsv"))
        projector.visualize_embeddings(summary_writer, config)
        logger.info("tensorboard set up.")
        # build infer model in a separate graph so sampling does not
        # disturb the training graph/session.
        inference_graph = tf.Graph()
        with inference_graph.as_default():
            inference_model = load_inference_model(args.checkpoint_path)
        # training start; -1 because labels are inputs shifted by one char.
        num_batches = (len(text) - 1) // (args.batch_size * args.seq_len)
        data_iter = batch_generator(encode_text(text), args.batch_size,
                                    args.seq_len)
        fetches = [
            train_model["train_op"], train_model["output_state"],
            train_model["loss"], train_model["summary"]
        ]
        # initial RNN state (zeros) fetched from the graph.
        state = train_sess.run(train_model["input_state"])
        logger.info("start of training.")
        time_train = time.time()
        for i in range(args.num_epochs):
            epoch_losses = np.empty(num_batches)
            time_epoch = time.time()
            # training epoch; the RNN state is threaded across batches.
            for j in tqdm(range(num_batches),
                          desc="epoch {}/{}".format(i + 1, args.num_epochs)):
                x, y = next(data_iter)
                feed_dict = {
                    train_model["X"]: x,
                    train_model["Y"]: y,
                    train_model["input_state"]: state
                }
                _, state, loss, summary_log = train_sess.run(
                    fetches, feed_dict)
                epoch_losses[j] = loss
            # logs
            duration_epoch = time.time() - time_epoch
            logger.info("epoch: %s, duration: %ds, loss: %.6g.", i + 1,
                        duration_epoch, epoch_losses.mean())
            # tensorboard logs (last batch's summary for the epoch)
            summary_writer.add_summary(summary_log, i + 1)
            summary_writer.flush()
            # checkpoint
            checkpoint_path = train_model["saver"].save(
                train_sess, args.checkpoint_path)
            logger.info("model saved: %s.", checkpoint_path)
            # generate text from the fresh checkpoint via the infer graph.
            seed = generate_seed(text)
            with tf.Session(graph=inference_graph) as infer_sess:
                # restore weights
                inference_model["saver"].restore(infer_sess, checkpoint_path)
                generate_text(inference_model, infer_sess, seed)
        # training end
        duration_train = time.time() - time_train
        logger.info("end of training, duration: %ds.", duration_train)
        # generate a longer, lower-temperature (top-3) sample at the end.
        seed = generate_seed(text)
        with tf.Session(graph=inference_graph) as infer_sess:
            # restore weights
            inference_model["saver"].restore(infer_sess, checkpoint_path)
            generate_text(inference_model, infer_sess, seed, 1024, 3)
    return train_model
def train_main(args):
    """Train a character-level RNN language model (PyTorch).

    Main method for the train subcommand: restores or builds the model,
    trains with truncated BPTT (the RNN state is detached between batches),
    checkpoints and samples text after every epoch.

    NOTE(review): uses pre-0.4 PyTorch APIs (``Variable``, ``loss.data[0]``,
    ``nn.utils.clip_grad_norm`` without trailing underscore) -- confirm the
    pinned torch version before modernizing.

    Returns:
        The trained model.
    """
    # load text
    with open(args.text_path) as f:
        text = f.read()
    logger.info("corpus length: %s.", len(text))
    # load or build model; args.restore may be a flag or a path.
    if args.restore:
        logger.info("restoring model.")
        load_path = args.checkpoint_path if args.restore is True else args.restore
        model = Model.load(load_path)
    else:
        model = Model(vocab_size=VOCAB_SIZE,
                      embedding_size=args.embedding_size,
                      rnn_size=args.rnn_size,
                      num_layers=args.num_layers,
                      drop_rate=args.drop_rate)
    # make checkpoint directory
    make_dirs(args.checkpoint_path)
    model.save(args.checkpoint_path)
    # loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    # training start; -1 because labels are inputs shifted by one char.
    num_batches = (len(text) - 1) // (args.batch_size * args.seq_len)
    data_iter = batch_generator(encode_text(text), args.batch_size,
                                args.seq_len)
    state = model.init_state(args.batch_size)
    logger.info("start of training.")
    time_train = time.time()
    for i in range(args.num_epochs):
        epoch_losses = torch.Tensor(num_batches)
        time_epoch = time.time()
        # training epoch
        for j in tqdm(range(num_batches),
                      desc="epoch {}/{}".format(i + 1, args.num_epochs)):
            # prepare inputs; transpose to time-major [seq_len, batch].
            x, y = next(data_iter)
            x = Variable(torch.from_numpy(x)).t()
            y = Variable(torch.from_numpy(y)).t().contiguous()
            # reset state variables to remove their history (truncated
            # BPTT: detach so gradients do not flow across batches).
            state = tuple([Variable(var.data) for var in state])
            # prepare model
            model.train()
            model.zero_grad()
            # calculate loss
            logits, state = model.forward(x, state)
            loss = criterion(logits, y.view(-1))
            epoch_losses[j] = loss.data[0]
            # calculate gradients
            loss.backward()
            # clip gradient norm
            nn.utils.clip_grad_norm(model.parameters(), args.clip_norm)
            # apply gradient update
            optimizer.step()
        # logs
        duration_epoch = time.time() - time_epoch
        logger.info("epoch: %s, duration: %ds, loss: %.6g.", i + 1,
                    duration_epoch, epoch_losses.mean())
        # checkpoint
        model.save(args.checkpoint_path)
        # generate text with the epoch's model.
        seed = generate_seed(text)
        generate_text(model, seed)
    # training end
    duration_train = time.time() - time_train
    logger.info("end of training, duration: %ds.", duration_train)
    # generate a longer, top-3 sample at the end of training.
    seed = generate_seed(text)
    generate_text(model, seed, 1024, 3)
    return model
def equal_file(path1, path2):
    """Return True when the two files have identical content.

    The C extension can only handle encoded paths, so both paths are
    encoded before being handed to ``compare_file``.
    """
    encoded1 = utils.encode_text(path1)
    encoded2 = utils.encode_text(path2)
    # compare_file returns 0 on equality.
    return compare_file(encoded1, encoded2) == 0
def copy_file(src, dst):
    """Copy ``src`` to ``dst`` via the C extension.

    The C extension can only handle encoded paths, so both are encoded
    before the call.

    :return: tuple ``(ok, errmsg)`` where ``ok`` is True on success.
    """
    ret, errmsg = cp_file(utils.encode_text(src), utils.encode_text(dst))
    # cp_file returns 0 on success.
    return ret == 0, errmsg
def hex_middle_md5(path, full=False):
    """Return the hex string of the file's (middle-block) MD5 digest.

    The C extension can only handle encoded paths, so the path is encoded
    first. ``full=True`` hashes the whole file instead of the middle block.

    Fix: iterating the digest yields ints on Python 3 (``bytes``) but
    1-char strings on Python 2 (``str``); the previous unconditional
    ``ord(x)`` raised TypeError on Python 3. Handle both.
    """
    # c extension module can only handle encoded path
    path = utils.encode_text(path)
    digest = calc_middle_md5(path, 1 if full else 0)
    return ''.join('{:02x}'.format(x if isinstance(x, int) else ord(x))
                   for x in digest)
def train_main(args): """ trains model specfied in args. main method for train subcommand. """ # load text with open(args.text_path) as f: text = f.read() logger.info("corpus length: %s.", len(text)) # restore or build model if args.restore: logger.info("restoring model.") load_path = args.checkpoint_path if args.restore is True else args.restore model = Model.load(load_path) else: model = Model(vocab_size=VOCAB_SIZE, embedding_size=args.embedding_size, rnn_size=args.rnn_size, num_layers=args.num_layers, drop_rate=args.drop_rate) model.initialize(mx.init.Xavier()) model.hybridize() # make checkpoint directory make_dirs(args.checkpoint_path) model.save(args.checkpoint_path) # loss function loss = gluon.loss.SoftmaxCrossEntropyLoss(batch_axis=1) # optimizer optimizer = mx.optimizer.Adam(learning_rate=args.learning_rate, clip_gradient=args.clip_norm) # trainer trainer = gluon.Trainer(model.collect_params(), optimizer) # training start num_batches = (len(text) - 1) // (args.batch_size * args.seq_len) data_iter = batch_generator(encode_text(text), args.batch_size, args.seq_len) state = model.begin_state(args.batch_size) logger.info("start of training.") time_train = time.time() for i in range(args.num_epochs): epoch_losses = mx.nd.empty(num_batches) time_epoch = time.time() # training epoch for j in tqdm(range(num_batches), desc="epoch {}/{}".format(i + 1, args.num_epochs)): # prepare inputs x, y = next(data_iter) x = mx.nd.array(x.T) y = mx.nd.array(y.T) # reset state variables to remove their history state = [arr.detach() for arr in state] with autograd.record(): logits, state = model(x, state) # calculate loss L = loss(logits, y) L = F.mean(L) epoch_losses[j] = L.asscalar() # calculate gradient L.backward() # apply gradient update trainer.step(1) # logs duration_epoch = time.time() - time_epoch logger.info("epoch: %s, duration: %ds, loss: %.6g.", i + 1, duration_epoch, F.mean(epoch_losses).asscalar()) # checkpoint model.save_params(args.checkpoint_path) 
logger.info("model saved: %s.", args.checkpoint_path) # generate text seed = generate_seed(text) generate_text(model, seed) # training end duration_train = time.time() - time_train logger.info("end of training, duration: %ds.", duration_train) # generate text seed = generate_seed(text) generate_text(model, seed, 1024, 3) return model
def train_model(config, is_wandb=False):
    """
    Build and train a classifier based on a configuration object.

    Runs a stratified K-fold cross-validation process balancing both labels
    and languages in each fold.

    Arguments:
    - config: A configuration object for the run.
    - is_wandb: a flag for sweeps, adding the Weights and Biases callback
      to the model.

    Returns:
    - model: The best trained classifier out of all folds.
    - preds_oof: Out of fold predictions on the training set.
    - preds_test: Predictions on the test set.
    """
    if config.VERBOSE:
        print("--- Reading Data ---")
    df_train = pd.read_csv(config.PATH_TRAIN)
    df_test = pd.read_csv(config.PATH_TEST)
    if config.VERBOSE:
        print("Done!")
    if is_wandb:
        wb = wandb.keras.WandbCallback()
    # Optionally translate all non-English rows to English first.
    if config.TRANSLATION:
        if config.VERBOSE:
            print("--- Translating Premises ---")
        df_train.loc[df_train.language != "English", "premise"] = df_train[
            df_train.language != "English"].premise.apply(
                lambda x: translate_text(x))
        df_test.loc[df_test.language != "English", "premise"] = df_test[
            df_test.language != "English"].premise.apply(
                lambda x: translate_text(x))
        if config.VERBOSE:
            print("Done!")
            print("--- Translating Hypotheses ---")
        df_train.loc[df_train.language != "English", "hypothesis"] = df_train[
            df_train.language != "English"].hypothesis.apply(
                lambda x: translate_text(x))
        df_test.loc[df_test.language != "English", "hypothesis"] = df_test[
            df_test.language != "English"].hypothesis.apply(
                lambda x: translate_text(x))
        if config.VERBOSE:
            print("Done!")
    if config.VERBOSE:
        print("--- Preprocessing ---")
    # Combined language+label column for stratified splitting, so each fold
    # is balanced on both.
    df_train["language_label"] = df_train.language.astype(
        str) + "_" + df_train.label.astype(str)
    skf = StratifiedKFold(n_splits=config.TRAIN_SPLITS,
                          shuffle=True,
                          random_state=config.SEED)
    # 3 columns: one probability per NLI class.
    preds_oof = np.zeros((df_train.shape[0], 3))
    preds_test = np.zeros((df_test.shape[0], 3))
    acc_oof = []
    if config.VERBOSE:
        print("Done!")
    # Fix: build the checkpoint path portably; the former "models\model.h5"
    # literal only worked on Windows.
    model_path = os.path.join("models", "model.h5")
    for (fold, (train_index, valid_index)) in enumerate(
            skf.split(df_train, df_train.language_label)):
        if config.VERBOSE:
            print(f"--- Fold {fold+1} ---")
        # Re-initialize the TPU at the start of each fold.
        if config.ACCELERATOR == "TPU":
            if config.tpu:
                config.initialize_accelerator()
        if config.VERBOSE:
            print("Building Model...")
        tf.keras.backend.clear_session()
        with config.strategy.scope():
            model = build_classifier(config.MODEL_NAME, config.MAX_LENGTH,
                                     config.LEARNING_RATE, config.METRICS)
            if fold == 0:
                print(model.summary())
        X_train = df_train.iloc[train_index]
        X_valid = df_train.iloc[valid_index]
        y_train = X_train.label.values
        y_valid = X_valid.label.values
        if config.VERBOSE:
            print("Tokenizing...")
        # Encoding text data using tokenizer
        X_train_encoded = encode_text(df=X_train,
                                      tokenizer=config.TOKENIZER,
                                      max_len=config.MAX_LENGTH,
                                      padding=config.PAD_TO_MAX_LENGTH)
        X_valid_encoded = encode_text(df=X_valid,
                                      tokenizer=config.TOKENIZER,
                                      max_len=config.MAX_LENGTH,
                                      padding=config.PAD_TO_MAX_LENGTH)
        # Creating TF Datasets; validation uses a larger batch (no grads).
        ds_train = to_tfds(X_train_encoded,
                           y_train,
                           config.AUTO,
                           repeat=True,
                           shuffle=True,
                           batch_size=config.BATCH_SIZE * config.REPLICAS)
        ds_valid = to_tfds(X_valid_encoded,
                           y_valid,
                           config.AUTO,
                           batch_size=config.BATCH_SIZE * config.REPLICAS * 4)
        n_train = X_train.shape[0]
        # Only need to encode test data once
        if fold == 0:
            X_test_encoded = encode_text(df=df_test,
                                         tokenizer=config.TOKENIZER,
                                         max_len=config.MAX_LENGTH,
                                         padding=config.PAD_TO_MAX_LENGTH)
        # Checkpoint the best (by validation accuracy) weights of the fold.
        sv = tf.keras.callbacks.ModelCheckpoint(
            model_path,
            monitor="val_sparse_categorical_accuracy",
            verbose=0,
            save_best_only=True,
            save_weights_only=True,
            mode="max",
            save_freq="epoch")
        # Adding wandb callback
        cbs = [sv]
        if is_wandb:
            cbs.append(wb)
        if config.VERBOSE:
            print("Training...")
        # Fix: steps_per_epoch must be an integer; the former
        # n_train / BATCH_SIZE // REPLICAS produced a float.
        model_history = model.fit(
            ds_train,
            epochs=config.EPOCHS,
            callbacks=cbs,
            steps_per_epoch=n_train // (config.BATCH_SIZE * config.REPLICAS),
            validation_data=ds_valid,
            verbose=config.VERBOSE)
        if config.VERBOSE:
            print("Validating...")
        # Scoring validation data with the fold's best checkpoint.
        model.load_weights(model_path)
        ds_valid = to_tfds(X_valid_encoded,
                           -1,
                           config.AUTO,
                           labelled=False,
                           batch_size=config.BATCH_SIZE * config.REPLICAS * 4)
        preds_valid = model.predict(ds_valid, verbose=config.VERBOSE)
        acc = accuracy_score(y_valid, np.argmax(preds_valid, axis=1))
        preds_oof[valid_index] = preds_valid
        acc_oof.append(acc)
        if config.VERBOSE:
            print("Testing...")
        # Scoring test data; fold predictions are averaged.
        ds_test = to_tfds(X_test_encoded,
                          -1,
                          config.AUTO,
                          labelled=False,
                          batch_size=config.BATCH_SIZE * config.REPLICAS * 4)
        preds_test += model.predict(
            ds_test, verbose=config.VERBOSE) / config.TRAIN_SPLITS
        print(f"Fold {fold + 1} Accuracy: {round(acc, 4)}")
        gc.collect()
    # overall CV score and standard deviation
    print(f"CV Mean Accuracy: {round(np.mean(acc_oof), 4)}")
    print(f"CV StdDev Accuracy: {round(np.std(acc_oof), 4)}")
    return model, preds_oof, preds_test
# Evaluate the encoder on 10 random validation reviews.
encoder.eval()
filename = os.path.join(valid_folder, valid_filename)
user_reviews = pd.read_csv(filename)
samples = parse_user_reviews(user_reviews)
# Pick 10 random samples to score.
samples = random.sample(samples, 10)
pair_batch = []
result = []
for i, sample in enumerate(samples):
    content = sample['content']
    result.append({'content': content})
    content = content.strip()
    # Segment the review with jieba and map tokens to vocab ids.
    seg_list = jieba.cut(content)
    input_indexes = encode_text(voc.word2index, list(seg_list))
    label_tensor = sample['label_tensor']
    pair_batch.append((input_indexes, label_tensor))
# Collate the 10 samples into tensors and move them to the device.
test_data = batch2TrainData(pair_batch)
input_variable, lengths, _ = test_data
input_variable = input_variable.to(device)
lengths = lengths.to(device)
# Forward pass; argmax over dim 1 selects the most likely class.
outputs = encoder(input_variable, lengths)
_, outputs = torch.max(outputs, 1)
print('outputs.size(): ' + str(outputs.size()))
outputs = outputs.cpu().numpy()
# Shift class indices back by 2 -- presumably the dataset's label offset;
# verify against the training-side encoding.
for i in range(10):
    result[i]['labels'] = (outputs[i] - 2).tolist()