def main(args):
    conf = getattr(configs, 'config_' + args.model)()

    # Set the random seed manually for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
    else:
        print("Note that our pre-trained models require CUDA to evaluate.")

    # Load data
    test_set = APIDataset(args.data_path + 'valid.h5', conf['maxlen'])
    test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=1,
                                              shuffle=False, num_workers=1)
    vocab_api = load_dict(args.data_path + 'vocab.apiseq.pkl')
    vocab_desc = load_dict(args.data_path + 'vocab.desc.pkl')
    n_tokens = len(vocab_api)
    metrics = Metrics()

    # Load model checkpoint
    model = getattr(models, args.model)(conf, n_tokens)
    ckpt = './output/{}/{}/models/model_epo{}.pkl'.format(args.model, args.expname, args.reload_from)
    model.load_state_dict(torch.load(ckpt))
    if torch.cuda.is_available():
        model = model.cuda()
    model.eval()

    f_eval = open("./output/{}/{}/results.txt".format(args.model, args.expname), "w")
    repeat = args.n_samples
    evaluate(model, metrics, test_loader, vocab_desc, vocab_api, f_eval, repeat)
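# The snippet above (and the checkpoint-reloading block near the end of this
# section) calls a load_dict helper on vocab.apiseq.pkl / vocab.desc.pkl without
# showing its definition. A minimal sketch, assuming each .pkl file is a plain
# pickled {token: index} dict; the real helper may store extra metadata.
import pickle

def load_dict(path):
    with open(path, 'rb') as f:
        return pickle.load(f)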
def __init__(self, conf):
    self.conf = conf
    self.path = conf['workdir']
    self.vocab_methname = load_dict(self.path + conf['vocab_name'])
    self.vocab_apiseq = load_dict(self.path + conf['vocab_api'])
    self.vocab_tokens = load_dict(self.path + conf['vocab_tokens'])
    self.vocab_desc = load_dict(self.path + conf['vocab_desc'])
    self.codevecs = []
    self.codebase = []
    self.codebase_chunksize = 2000000
def __init__(self, conf):
    self.model_params = conf
    self.path = conf['workdir']
    self.vocab_methname = load_dict(self.path + conf['vocab_name'])
    self.vocab_apiseq = load_dict(self.path + conf['vocab_api'])
    self.vocab_tokens = load_dict(self.path + conf['vocab_tokens'])
    self.vocab_desc = load_dict(self.path + conf['vocab_desc'])
    self.codevecs = []
    self.codebase = []
    self.codebase_chunksize = conf['chunk_size']
    self.validation_set = None
def main():
    print("Loading dictionary...")
    print(opt.dictionary)
    dict = data.load_dict(opt.dictionary)
    DataLoader = data.HierDataLoader if opt.hier else data.AbsDataLoader
    print("Constructing train tensors...")
    train_data = DataLoader(opt.train, dict, opt, window=opt.window, max_size=opt.maxSize)
    print("Constructing validation tensors...")
    valid_data = DataLoader(opt.valid, dict, opt, window=opt.window, max_size=opt.maxSize)
    print("Setting up language model and training parameters...")
    t = trainer.Trainer(opt, dict)
    print("Training...")
    t.train(train_data, valid_data)
def __init__(self, args, logger, hvd=False):
    self.args = args
    self.logger = logger
    self.hvd = hvd
    self.optimizer = args.optimizer
    self.num_hidden_layers = args.layer_depth
    self.embedding_dim = args.embedding_dim
    self.bucket = BUCKET
    self.ngram_dim = args.ngram_dim
    self.tag2id, self.id2tag = load_dict(args.tag_to_id_path, args.encoding)
    self.num_tags = 3  # len(self.tag2id)
    self.word2id, self.id2word = load_dict(args.word_to_id_path, args.encoding)
    self.logger.info("tag2id size: %d" % self.num_tags)
    self.logger.info("word2id size: %d" % len(self.word2id))
    self.lambda1 = args.lambda1
    self.lambda2 = args.lambda2
    self.lambda3 = args.lambda3
    self.lambda4 = args.lambda4
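# This trainer unpacks two mappings from load_dict(path, encoding), whose
# definition is not shown in these snippets. A minimal sketch under the
# assumption that the dictionary file lists one tag/word per line and the line
# index is used as the id; the project's real format (e.g. item<TAB>id pairs)
# may differ.
def load_dict(path, encoding="utf-8"):
    item2id, id2item = {}, {}
    with open(path, "r", encoding=encoding) as f:
        for idx, line in enumerate(f):
            item = line.rstrip("\n")
            item2id[item] = idx
            id2item[idx] = item
    return item2id, id2item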
def __init__(self, seqmodel, paths, config):
    self.model = seqmodel
    self.args = seqmodel.args
    self.encoding = self.args.encoding
    self.logger = seqmodel.logger
    self.model_global_step = seqmodel.global_step
    self.global_step = 0
    self.hvd = seqmodel.hvd
    self.model_path = paths['model_path']
    if self.model.args.restore:
        self.restore_model_path = paths['restore_model_path']
    self.summary_path = paths['summary_path']
    self.result_path = paths['result_path']
    self.tag2id, self.id2tag = load_dict(self.args.tag_to_id_path, self.encoding)
    self.word2id, self.id2word = load_dict(self.args.word_to_id_path, self.encoding)
    self.dataset2flag, self.flag2dataset = load_dict_ano(
        self.args.dataset_to_flag_path, self.encoding)
    self.config = config
    self.batch_size = self.args.batch_size
    self.epoch_num = self.args.epoch
    self.min_epoch_num = self.args.min_epoch
    self.restore = self.args.restore
    self.dropout = self.args.dropout
    self.optimizer = self.args.optimizer
    self.lr = self.args.lr
    self.max_scores = 0.0
    self.unseccessful_step_num = 0
    self.eval_step = self.args.eval_step
    self.local_step_num = 0
    self.total_w_count = 0
    self.total_w_loss = 0
    self.total_w_loss1 = 0
    self.total_w_loss2 = 0
    self.total_w_loss3 = 0
    self.total_w_loss4 = 0
    self.train_sample_num = 0
    self.save_max = self.args.save_max
    self.logger.info("model path: %s " % self.model_path)
def quant_post(args):
    place = paddle.set_device("gpu")
    exe = paddle.static.Executor(place)

    label2id, id2label = load_dict(args.label_path)
    train_ds = load_dataset(read, data_path=args.dev_path, lazy=False)
    tokenizer = PPMiniLMTokenizer.from_pretrained(args.base_model_name)
    trans_func = partial(convert_example_to_feature,
                         tokenizer=tokenizer,
                         label2id=label2id,
                         max_seq_len=args.max_seq_len)
    train_ds = train_ds.map(trans_func, lazy=True)

    def batch_generator_func():
        batch_data = [[], []]
        for data in train_ds:
            batch_data[0].append(data[0])
            batch_data[1].append(data[1])
            if len(batch_data[0]) == args.batch_size:
                input_ids = Pad(axis=0, pad_val=0, dtype="int64")(batch_data[0])
                segment_ids = Pad(axis=0, pad_val=0, dtype="int64")(batch_data[1])
                yield [input_ids, segment_ids]
                batch_data = [[], []]

    paddleslim.quant.quant_post_static(
        exe,
        args.static_model_dir,
        args.quant_model_dir,
        save_model_filename=args.save_model_filename,
        save_params_filename=args.save_params_filename,
        algo=args.algorithm,
        hist_percent=0.9999,
        batch_generator=batch_generator_func,
        model_filename=args.input_model_filename,
        params_filename=args.input_param_filename,
        quantizable_op_type=['matmul', 'matmul_v2'],
        weight_bits=8,
        weight_quantize_type='channel_wise_abs_max',
        batch_nums=1)
def dict_to_mongo():
    items = load_dict()
    for word, item in items.items():
        if Word.objects(word=word).count() == 0:
            dbitem = Word(**item)
            dbitem.save()
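# The Word document class used above is defined elsewhere. A minimal MongoEngine
# sketch: a DynamicDocument accepts whatever extra keys each loaded item carries,
# so Word(**item) works without declaring every field up front; the field below
# is an assumption, not the project's actual schema.
from mongoengine import DynamicDocument, StringField

class Word(DynamicDocument):
    word = StringField(required=True, unique=True)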
def main(args): global anno, infer_y, h_pre, alpha_past, if_trainning, dictLen worddicts = load_dict(args.dictPath) dictLen = len(worddicts) worddicts_r = [None] * len(worddicts) for kk, vv in worddicts.items(): worddicts_r[vv] = kk test, test_uid_list = dataIterator( args.testPklPath, args.testCaptionPath, worddicts, batch_size=2, batch_Imagesize=400000, maxlen=100, maxImagesize=400000, ) x = tf.placeholder(tf.float32, shape=[None, None, None, 1]) y = tf.placeholder(tf.int32, shape=[None, None]) x_mask = tf.placeholder(tf.float32, shape=[None, None, None]) y_mask = tf.placeholder(tf.float32, shape=[None, None]) lr = tf.placeholder(tf.float32, shape=()) if_trainning = tf.placeholder(tf.bool, shape=()) watcher_train = Watcher_train(blocks=3, level=16, growth_rate=24, training=if_trainning) annotation, anno_mask = watcher_train.dense_net(x, x_mask) # for initilaizing validation anno = tf.placeholder( tf.float32, shape=[ None, annotation.shape.as_list()[1], annotation.shape.as_list()[2], annotation.shape.as_list()[3], ], ) infer_y = tf.placeholder(tf.int64, shape=(None, )) h_pre = tf.placeholder(tf.float32, shape=[None, 256]) alpha_past = tf.placeholder( tf.float32, shape=[ None, annotation.shape.as_list()[1], annotation.shape.as_list()[2] ], ) attender = Attender(annotation.shape.as_list()[3], 256, 512) parser = Parser(256, 256, attender, annotation.shape.as_list()[3]) wap = WAP( watcher_train, attender, parser, 256, 256, annotation.shape.as_list()[3], dictLen, if_trainning, ) hidden_state_0 = tf.tanh( tf.tensordot(tf.reduce_mean(anno, axis=[1, 2]), wap.Wa2h, axes=1) + wap.ba2h) # [batch, hidden_dim] cost = wap.get_cost(annotation, y, anno_mask, y_mask) vs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) for vv in vs: if not vv.name.startswith("batch_normalization"): cost += 1e-4 * tf.reduce_sum(tf.pow(vv, 2)) p, w, h, alpha = wap.get_word(infer_y, h_pre, alpha_past, anno) optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): trainer = optimizer.minimize(cost) max_epoch = 200 config = tf.ConfigProto() config.gpu_options.allow_growth = True init = tf.global_variables_initializer() saver = tf.train.Saver() saver = tf.train.Saver() with tf.Session(config=config) as sess: sess.run(init) saver.restore( sess, os.path.join(args.modelPath, args.modelFileName) + ".ckpt") print("Start sampling...") _t = time.time() fpp_sample = open( os.path.join(args.resultPath, str(args.resultFileName) + ".txt"), "w", ) test_count_idx = 0 for batch_x, batch_y in test: for xx in batch_x: xx = np.moveaxis(xx, 0, -1) xx_pad = np.zeros((xx.shape[0], xx.shape[1], xx.shape[2]), dtype="float32") xx_pad[:, :, :] = xx / 255.0 xx_pad = xx_pad[None, :, :, :] annot = sess.run(annotation, feed_dict={ x: xx_pad, if_trainning: False }) h_state = sess.run(hidden_state_0, feed_dict={anno: annot}) sample, score = wap.get_sample( p, w, h, alpha, annot, h_state, 10, 100, False, sess, training=False, ) score = score / np.array([len(s) for s in sample]) ss = sample[score.argmin()] fpp_sample.write(test_uid_list[test_count_idx]) test_count_idx = test_count_idx + 1 if np.mod(test_count_idx, 100) == 0: print("gen %d samples" % test_count_idx) log.write("gen %d samples" % test_count_idx + "\n") log.flush() for vv in ss: if vv == 0: # <eol> break fpp_sample.write(" " + worddicts_r[vv]) fpp_sample.write("\n") fpp_sample.close() print("valid set decode done") log.write("valid set decode done\n") log.flush() print("Done sampling, took" + 
str(time.time() - _t))

        print("Start validating...")
        _t = time.time()
        probs = []
        for batch_x, batch_y in test:
            batch_x, batch_x_m, batch_y, batch_y_m = prepare_data(batch_x, batch_y)
            pprobs, annot = sess.run(
                [cost, annotation],
                feed_dict={
                    x: batch_x,
                    y: batch_y,
                    x_mask: batch_x_m,
                    y_mask: batch_y_m,
                    if_trainning: False,
                },
            )
            probs.append(pprobs)
        test_errs = np.array(probs)
        test_err_cost = test_errs.mean()

        wer_process(
            os.path.join(args.resultPath, args.resultFileName + ".txt"),
            args.validCaptionPath,
            os.path.join(args.resultPath, args.resultFileName + ".wer"),
        )
        fpp = open(os.path.join(args.resultPath, f"{args.resultFileName}.wer"))
        stuff = fpp.readlines()
        fpp.close()
        m = re.search("WER (.*)\n", stuff[0])
        test_per = 100.0 * float(m.group(1))
        m = re.search("ExpRate (.*)\n", stuff[1])
        test_sacc = 100.0 * float(m.group(1))
        test_err = test_per
        print("Test WER: %.2f%%, ExpRate: %.2f%%, Cost: %f" %
              (test_per, test_sacc, test_err_cost))
        print(f"Done validating, took {time.time() - _t}.")
        token_type_ids = paddle.to_tensor(token_type_ids)
        seq_len = paddle.to_tensor(seq_len)
        pred_tags = model(input_ids, token_type_ids, lengths=seq_len)
        all_pred_tags.extend(pred_tags.numpy().tolist())
    results = decode(data, all_pred_tags, summary_num, idx_to_tags)
    return results


if __name__ == "__main__":
    paddle.set_device(args.device)
    data = [
        '美人鱼是周星驰执导的一部电影',
    ]
    tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))
    idx_to_tags = dict(zip(*(tags_to_idx.values(), tags_to_idx.keys())))
    model = ErnieCtmWordtagModel.from_pretrained("wordtag",
                                                 num_tag=len(tags_to_idx))
    tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")
    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.params_path)
    results = do_predict(data, model, tokenizer, model.viterbi_decoder,
        preds = [pred[1:] for pred in preds.numpy()]
        all_preds.append(preds)
        all_lens.append(lens)
    sentences = [example[0] for example in ds.data]
    results = parse_decodes(sentences, all_preds, all_lens, label_vocab)
    return results


if __name__ == '__main__':
    paddle.set_device('gpu')

    # Create dataset, tokenizer and dataloader.
    train_ds, dev_ds, test_ds = load_dataset(datafiles=(
        './waybill_data/train.txt', './waybill_data/dev.txt', './waybill_data/test.txt'))
    label_vocab = load_dict('./conf/tag.dic')
    tokenizer = ErnieGramTokenizer.from_pretrained('ernie-gram-zh')
    trans_func = partial(
        convert_to_features, tokenizer=tokenizer, label_vocab=label_vocab)
    train_ds.map(trans_func)
    dev_ds.map(trans_func)
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=label_vocab.get("O", 0), dtype='int64')  # labels
    ): fn(samples)
if __name__ == "__main__":
    # yapf: disable
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_name", type=str, default=None, help="The name of base model.")
    parser.add_argument("--model_path", type=str, default=None, help="The path of saved model that you want to load.")
    parser.add_argument('--test_path', type=str, default=None, help="The path of test set.")
    parser.add_argument("--label_path", type=str, default=None, help="The path of label dict.")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size per GPU/CPU for training.")
    parser.add_argument("--max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization.")
    args = parser.parse_args()
    # yapf: enable

    # load test data
    label2id, id2label = load_dict(args.label_path)
    test_ds = load_dataset(read, data_path=args.test_path, lazy=False)
    tokenizer = PPMiniLMTokenizer.from_pretrained(args.base_model_name)
    trans_func = partial(convert_example_to_feature,
                         tokenizer=tokenizer,
                         label2id=label2id,
                         max_seq_len=args.max_seq_len)
    test_ds = test_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64"),
        Stack(dtype="int64")
    ): fn(samples)
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=batchify_fn)
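# The batchify_fn idiom used in these PaddleNLP-style snippets composes per-field
# collate functions: Pad right-pads the variable-length id lists, Stack simply
# stacks scalars. An illustrative toy example (values invented, assuming
# paddlenlp.data is installed):
from paddlenlp.data import Pad, Stack, Tuple

# Two toy samples of (input_ids, token_type_ids, seq_len, label).
samples = [
    ([1, 2, 3], [0, 0, 0], 3, 1),
    ([4, 5], [0, 0], 2, 0),
]
collate = Tuple(
    Pad(axis=0, pad_val=0, dtype="int64"),  # input_ids      -> shape [2, 3]
    Pad(axis=0, pad_val=0, dtype="int64"),  # token_type_ids -> shape [2, 3]
    Stack(dtype="int64"),                   # seq_len        -> shape [2]
    Stack(dtype="int64"),                   # labels         -> shape [2]
)
input_ids, token_type_ids, seq_len, labels = collate(samples)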
def train(): # set running envir model_name = "skep_ernie_1.0_large_ch" paddle.set_device(args.device) set_seed(args.seed) if not os.path.exists(args.checkpoints): os.mkdir(args.checkpoints) # load and process data label2id, id2label = load_dict(args.label_path) train_ds = load_dataset(read, data_path=args.train_path, lazy=False) dev_ds = load_dataset(read, data_path=args.dev_path, lazy=False) tokenizer = SkepTokenizer.from_pretrained(model_name) trans_func = partial( convert_example_to_feature, tokenizer=tokenizer, label2id=label2id, max_seq_len=args.max_seq_len) train_ds = train_ds.map(trans_func, lazy=False) dev_ds = dev_ds.map(trans_func, lazy=False) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), Stack(dtype="int64"), Pad(axis=0, pad_val= -1, dtype="int64") ): fn(samples) train_batch_sampler = paddle.io.BatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.batch_size, shuffle=False) train_loader = paddle.io.DataLoader( train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn) dev_loader = paddle.io.DataLoader( dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn) # configure model training model = SkepForTokenClassification.from_pretrained( model_name, num_classes=len(label2id)) num_training_steps = len(train_loader) * args.num_epochs lr_scheduler = LinearDecayWithWarmup( learning_rate=args.learning_rate, total_steps=num_training_steps, warmup=args.warmup_proportion) decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params, grad_clip=grad_clip) metric = ChunkEvaluator(label2id.keys()) # start to train model global_step, best_f1 = 1, 0. model.train() for epoch in range(1, args.num_epochs + 1): for batch_data in train_loader(): input_ids, token_type_ids, _, labels = batch_data # logits: batch_size, seql_len, num_tags logits = model(input_ids, token_type_ids=token_type_ids) loss = F.cross_entropy( logits.reshape([-1, len(label2id)]), labels.reshape([-1]), ignore_index=-1) loss.backward() lr_scheduler.step() optimizer.step() optimizer.clear_grad() if global_step > 0 and global_step % args.log_steps == 0: print( f"epoch: {epoch} - global_step: {global_step}/{num_training_steps} - loss:{loss.numpy().item():.6f}" ) if (global_step > 0 and global_step % args.eval_steps == 0 ) or global_step == num_training_steps: precision, recall, f1 = evaluate(model, dev_loader, metric) model.train() if f1 > best_f1: print( f"best F1 performence has been updated: {best_f1:.5f} --> {f1:.5f}" ) best_f1 = f1 paddle.save(model.state_dict(), f"{args.checkpoints}/best.pdparams") print( f'evalution result: precision: {precision:.5f}, recall: {recall:.5f}, F1: {f1:.5f}' ) global_step += 1 paddle.save(model.state_dict(), f"{args.checkpoints}/final.pdparams")
def main(): state = torch.load(opt.model) if opt.hier: mlp, encoder = state else: mlp = state dict = data.load_dict(opt.dictionary) sent_file = open(opt.inputf).read().split("\n") length = opt.length if not opt.hier: W = mlp.window opt.window = mlp.window else: W = 1 w2i = dict["w2i"] i2w = dict["i2w"] K = opt.beamSize actual = open(opt.outputf).read().split('\n') sent_num = 0 with torch.no_grad(): for line in sent_file: if line.strip() == "": continue # Add padding if opt.hier: summaries = extractive(line).split("\t") print("\n> {}...".format(summaries[0])) encoded_summaries = [ encode("<s> {} </s>".format(normalize(summary)), w2i) for summary in summaries ] article = HierDataLoader.torchify(encoded_summaries, variable=True, revsort=True, opt=opt) hidden_state = encoder.init_hidden() summ_hidden_state = encoder.init_hidden(n=opt.summLstmLayers, K=opt.K) print(hidden_state[0].shape, summ_hidden_state[0].shape) print(article[0].shape) encoder_out, hidden_state, _ = encoder(article, hidden_state, summ_hidden_state) else: print("\n> {}".format(line)) true_line = "<s> <s> <s> {} </s> </s> </s>".format( normalize(line)) article = torch.tensor(encode(true_line, w2i)) n = opt.length hyps = apply_cuda(torch.zeros(K, W + n).long().fill_(w2i["<s>"])) scores = apply_cuda(torch.zeros(K).float()) if opt.hier: hidden_size = len(hidden_state[0][0][0]) hidden = apply_cuda(torch.zeros(K, hidden_size).float()) cell = apply_cuda(torch.zeros(K, hidden_size).float()) for k in range(K): hidden[k] = hidden_state[0][0] cell[k] = hidden_state[1][0] for step in range(n): new_candidates = [] start = step end = step + W context = hyps[:, start:end] # context if opt.hier: model_scores = torch.zeros(K, len(w2i)) for c in range(K): ctx = context[c].view(1, -1) ctx = article[0][0][step].view(1, -1) model_scores[c], new_hidden, attn = mlp( encoder_out, ctx, (hidden[c].view(1, 1, -1), cell[c].view(1, 1, -1))) hidden[c] = new_hidden[0] cell[c] = new_hidden[1] else: article_t, context_t = AbsDataLoader.make_input( article, context, K) model_scores, attn = mlp(article_t, context_t) out_scores = model_scores.data # Apply hard constraints finalized = (step == n - 1) and opt.fixedLength set_hard_constraints(out_scores, w2i, finalized) for sample in range(K): # Per certain context top_scores, top_indexes = torch.topk(out_scores[sample], K) for ix, score in zip(top_indexes, top_scores): repetition = opt.noRepeat and apply_cuda( ix) in apply_cuda(hyps[sample]) combined = torch.cat((hyps[sample][:end], apply_cuda(torch.tensor([ix])))) if opt.hier: candidate = [ combined, -INF if repetition else scores[sample] + apply_cuda(score), hidden[c], cell[c] ] else: candidate = [ combined, -INF if repetition else scores[sample] + apply_cuda(score), None, None ] new_candidates.append(candidate) ordered = list( reversed(sorted(new_candidates, key=lambda cand: cand[1]))) h, s, hidden_temp, cell_temp = zip(*ordered) for r in range(K): hyps[r][0:end + 1] = h[r] scores[r] = s[r] if opt.hier: hidden[r] = hidden_temp[r] cell[r] = cell_temp[r] s, top_ixs = torch.topk(scores, 1) final = hyps[int(top_ixs)][W:-1] print("= {}".format(actual[sent_num])) print("< {}".format(decode(final, i2w))) print("") sent_num += 1
if use_cuda:
    embedder = embedder.cuda()
    encoder = encoder.cuda()
    hidvar = hidvar.cuda()  #!!!!!
    decoder = decoder.cuda()

TRAIN_FILE = input_dir + 'train.h5'
train_set = UbuntuDataset(TRAIN_FILE, max_seq_len=20)
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1  # multiple num_workers could introduce error (conflict?)
)
vocab = load_dict(input_dir + 'vocab.json')
train(embedder, encoder, hidvar, decoder, train_data_loader, vocab, n_iters,
      model_dir, p_teach_force, save_every=save_every, sample_every=sample_every,
      print_every=print_every, learning_rate=learning_rate)
def main(args): global anno, infer_y, h_pre, alpha_past, if_trainning, dictLen worddicts = load_dict(args.dictPath) dictLen = len(worddicts) worddicts_r = [None] * len(worddicts) for kk, vv in worddicts.items(): worddicts_r[vv] = kk train, train_uid_list = dataIterator( args.trainPklPath, args.trainCaptionPath, worddicts, batch_size=args.batch_size, batch_Imagesize=500000, maxlen=150, maxImagesize=500000, ) valid, valid_uid_list = dataIterator( args.validPklPath, args.validCaptionPath, worddicts, batch_size=args.batch_size, batch_Imagesize=500000, maxlen=150, maxImagesize=500000, ) print("train lenth is ", len(train)) print("valid lenth is ", len(valid)) x = tf.placeholder(tf.float32, shape=[None, None, None, 1]) y = tf.placeholder(tf.int32, shape=[None, None]) x_mask = tf.placeholder(tf.float32, shape=[None, None, None]) y_mask = tf.placeholder(tf.float32, shape=[None, None]) lr = tf.placeholder(tf.float32, shape=()) if_trainning = tf.placeholder(tf.bool, shape=()) watcher_train = Watcher_train(blocks=3, level=16, growth_rate=24, training=if_trainning) annotation, anno_mask = watcher_train.dense_net(x, x_mask) # for initilaizing validation anno = tf.placeholder( tf.float32, shape=[ None, annotation.shape.as_list()[1], annotation.shape.as_list()[2], annotation.shape.as_list()[3], ], ) infer_y = tf.placeholder(tf.int64, shape=(None, )) h_pre = tf.placeholder(tf.float32, shape=[None, 256]) alpha_past = tf.placeholder( tf.float32, shape=[ None, annotation.shape.as_list()[1], annotation.shape.as_list()[2] ], ) attender = Attender(annotation.shape.as_list()[3], 256, 512) parser = Parser(256, 256, attender, annotation.shape.as_list()[3]) wap = WAP( watcher_train, attender, parser, 256, 256, annotation.shape.as_list()[3], dictLen, if_trainning, ) hidden_state_0 = tf.tanh( tf.tensordot(tf.reduce_mean(anno, axis=[1, 2]), wap.Wa2h, axes=1) + wap.ba2h) # [batch, hidden_dim] cost = wap.get_cost(annotation, y, anno_mask, y_mask) vs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) for vv in vs: if not vv.name.startswith("batch_normalization"): cost += 1e-4 * tf.reduce_sum(tf.pow(vv, 2)) p, w, h, alpha = wap.get_word(infer_y, h_pre, alpha_past, anno) optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): trainer = optimizer.minimize(cost) max_epoch = 200 config = tf.ConfigProto() config.gpu_options.allow_growth = True init = tf.global_variables_initializer() uidx = 0 cost_s = 0 dispFreq = 100 if args.dispFreq is None else args.dispFreq saveFreq = (len(train) * args.epochDispRatio if args.saveFreq is None else args.saveFreq) sampleFreq = (len(train) * args.epochSampleRatio if args.sampleFreq is None else args.sampleFreq) validFreq = (len(train) * args.epochValidRatio if args.validFreq is None else args.validFreq) history_errs = [] estop = False halfLrFlag = 0 patience = 15 if args.patience is None else args.patience lrate = args.lr logPath = "./log.txt" if args.logPath is None else args.logPath log = open(logPath, "w") log.write(str(vars(args))) log.write(str(patience)) log.write(str(lr)) saver = tf.train.Saver() with tf.Session(config=config) as sess: sess.run(init) for epoch in range(max_epoch): n_samples = 0 random.shuffle(train) for batch_x, batch_y in train: batch_x, batch_x_m, batch_y, batch_y_m = prepare_data( batch_x, batch_y) n_samples += len(batch_x) uidx += 1 cost_i, _ = sess.run( [cost, trainer], feed_dict={ x: batch_x, y: batch_y, x_mask: batch_x_m, y_mask: batch_y_m, if_trainning: True, 
lr: lrate, }, ) cost_s += cost_i if np.isnan(cost_i) or np.isinf(cost_i): print("invalid cost value detected") sys.exit(0) if np.mod(uidx, dispFreq) == 0: cost_s /= dispFreq print("Epoch ", epoch, "Update ", uidx, "Cost ", cost_s, "Lr ", lrate) log.write("Epoch " + str(epoch) + " Update " + str(uidx) + " Cost " + str(cost_s) + " Lr " + str(lrate) + "\n") log.flush() cost_s = 0 if np.mod(uidx, sampleFreq) == 0: print("Start sampling...") _t = time.time() fpp_sample = open( os.path.join(args.resultPath, str(args.resultFileName) + ".txt"), "w", ) valid_count_idx = 0 for batch_x, batch_y in valid: for xx in batch_x: xx = np.moveaxis(xx, 0, -1) xx_pad = np.zeros( (xx.shape[0], xx.shape[1], xx.shape[2]), dtype="float32") xx_pad[:, :, :] = xx / 255.0 xx_pad = xx_pad[None, :, :, :] annot = sess.run(annotation, feed_dict={ x: xx_pad, if_trainning: False }) h_state = sess.run(hidden_state_0, feed_dict={anno: annot}) sample, score = wap.get_sample( p, w, h, alpha, annot, h_state, 10, 100, False, sess, training=False, ) score = score / np.array([len(s) for s in sample]) ss = sample[score.argmin()] fpp_sample.write(valid_uid_list[valid_count_idx]) valid_count_idx = valid_count_idx + 1 if np.mod(valid_count_idx, 100) == 0: print("gen %d samples" % valid_count_idx) log.write("gen %d samples" % valid_count_idx + "\n") log.flush() for vv in ss: if vv == 0: # <eol> break fpp_sample.write(" " + worddicts_r[vv]) fpp_sample.write("\n") fpp_sample.close() print("valid set decode done") log.write("valid set decode done\n") log.flush() print("Done sampling, took" + str(time.time() - _t)) if np.mod(uidx, validFreq) == 0: print("Start validating...") _t = time.time() probs = [] for batch_x, batch_y in valid: batch_x, batch_x_m, batch_y, batch_y_m = prepare_data( batch_x, batch_y) pprobs, annot = sess.run( [cost, annotation], feed_dict={ x: batch_x, y: batch_y, x_mask: batch_x_m, y_mask: batch_y_m, if_trainning: False, }, ) probs.append(pprobs) valid_errs = np.array(probs) valid_err_cost = valid_errs.mean() wer_process( os.path.join(args.resultPath, args.resultFileName + ".txt"), args.validCaptionPath, os.path.join(args.resultPath, args.resultFileName + ".wer"), ) fpp = open( os.path.join(args.resultPath, args.resultFileName + ".wer")) stuff = fpp.readlines() fpp.close() m = re.search("WER (.*)\n", stuff[0]) valid_per = 100.0 * float(m.group(1)) m = re.search("ExpRate (.*)\n", stuff[1]) valid_sacc = 100.0 * float(m.group(1)) valid_err = valid_per history_errs.append(valid_err) if (uidx / validFreq == 0 or valid_err <= np.array(history_errs).min()): bad_counter = 0 if (uidx / validFreq != 0 and valid_err > np.array(history_errs).min()): bad_counter += 1 if bad_counter > patience: if halfLrFlag == 2: print("Early Stop!") log.write("Early Stop!\n") log.flush() estop = True break else: print("Lr decay and retrain!") log.write("Lr decay and retrain!\n") log.flush() bad_counter = 0 lrate = lrate / 10 halfLrFlag += 1 print("bad_counter" + str(bad_counter)) print("Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f" % (valid_per, valid_sacc, valid_err_cost)) log.write("Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f" % (valid_per, valid_sacc, valid_err_cost) + "\n") log.flush() print("Done validating, took" + str(time.time() - _t)) if estop: break save_path = saver.save(sess, os.path.join(args.savePath + args.saveName))
def do_train(args): paddle.set_device(args.device) rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args.seed) train_ds = load_dataset(read_custom_data, filename=os.path.join(args.data_dir, "train.txt"), is_test=False, lazy=False) dev_ds = load_dataset(read_custom_data, filename=os.path.join(args.data_dir, "dev.txt"), is_test=False, lazy=False) tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt")) tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag") model = ErnieCtmWordtagModel.from_pretrained("wordtag", num_tag=len(tags_to_idx)) model.crf_loss = LinearChainCrfLoss( LinearChainCrf(len(tags_to_idx), 0.1, with_start_stop_tag=False)) trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_len=args.max_seq_len, tags_to_idx=tags_to_idx) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64' ), # input_ids Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64' ), # token_type_ids Stack(dtype='int64'), # seq_len Pad(axis=0, pad_val=tags_to_idx["O"], dtype='int64'), # tags ): fn(samples) train_data_loader = create_dataloader(train_ds, mode="train", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) dev_data_loader = create_dataloader(dev_ds, mode="dev", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.num_train_epochs warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) num_train_optimization_steps = len( train_ds) / args.batch_size * args.num_train_epochs decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) logger.info("Total steps: %s" % num_training_steps) logger.info("WarmUp steps: %s" % warmup) metric = SequenceAccuracy() total_loss = 0 global_step = 0 for epoch in range(1, args.num_train_epochs + 1): logger.info(f"Epoch {epoch} beginnig") start_time = time.time() for total_step, batch in enumerate(train_data_loader): global_step += 1 input_ids, token_type_ids, seq_len, tags = batch loss, _ = model(input_ids, token_type_ids, lengths=seq_len, tag_labels=tags) loss = loss.mean() total_loss += loss loss.backward() optimizer.step() optimizer.clear_grad() lr_scheduler.step() if global_step % args.logging_steps == 0 and rank == 0: end_time = time.time() speed = float(args.logging_steps) / (end_time - start_time) logger.info( "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s" % (global_step, epoch, total_loss / args.logging_steps, speed)) start_time = time.time() total_loss = 0 if (global_step % args.save_steps == 0 or global_step == num_training_steps) and rank == 0: output_dir = os.path.join(args.output_dir, "model_%d" % (global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) 
evaluate(model, metric, dev_data_loader, tags, tags_to_idx)
def predict(text): """ :return: """ def load_model(): output_graph_def = tf.GraphDef() with open('./ckpt/ner-1.pb', "rb") as f: output_graph_def.ParseFromString(f.read()) _ = tf.import_graph_def(output_graph_def, name="") sess = tf.Session() init = tf.global_variables_initializer() sess.run(init) return sess # 读入数据 char2id, type2id = load_dict(char_dict="train_data/char2id.json", type_dict="train_data/type2id.json") id2type = {value: key for key, value in type2id.items()} ids = list(id2type.keys()) sess = load_model() input_ids = sess.graph.get_tensor_by_name("placeholder/input_ids:0") # is_training = sess.graph.get_tensor_by_name("placeholder/Placeholder:0") # is_training viterbi_sequence = sess.graph.get_tensor_by_name( "doc_segment/ReverseSequence_1:0") t1 = time.time() x = sequence_padding([char2id.get(c, 1) for c in text], max_len=384) feed_dict = { input_ids: [x], # is_training: True, } predicts_d = sess.run([viterbi_sequence], feed_dict)[0] p = predicts_d.tolist()[0] # 封装一下,输出结果 IOS = [] index = 0 start = None for i in p: if i == 0: if start is None: pass else: IOS.append((start, index)) break elif i == 1: if start is None: pass else: if index > 0: IOS.append((start, index)) start = None else: # 包含实体 if start is None: start = index else: if i == p[index - 1]: pass else: IOS.append((start, index)) start = index index += 1 print(p) print(IOS) extract_dict = [] for i in IOS: extract_id = p[i[0]] tag = id2type.get(extract_id) value = text[i[0]:i[1]] extract_dict.append({"type": tag, "value": value}) return extract_dict
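# sequence_padding above pads a single character-id sequence to a fixed length
# before it is fed to the frozen graph; the helper itself is not shown. A
# minimal sketch assuming pad id 0 and simple truncation (the parameter names
# are guesses).
def sequence_padding(ids, max_len=384, pad_id=0):
    ids = list(ids)[:max_len]
    return ids + [pad_id] * (max_len - len(ids))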
def train(): """ 模型训练 :return: """ char2id, type2id = load_dict(char_dict="train_data/char2id.json", type_dict="train_data/type2id.json") # tf.flags.DEFINE_string("data_dir", "data/data.dat", "data directory") tf.flags.DEFINE_integer("vocab_size", len(char2id), "vocabulary size") tf.flags.DEFINE_integer("num_classes", len(type2id), "number of classes") tf.flags.DEFINE_integer("max_num", 384, "max_sentence_num") tf.flags.DEFINE_integer( "embedding_size", 256, "Dimensionality of character embedding (default: 200)") tf.flags.DEFINE_integer( "hidden_size", 128, "Dimensionality of GRU hidden layer (default: 50)") tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 64)") tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 50)") tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)") tf.flags.DEFINE_integer("num_checkpoints", 3, "Number of checkpoints to store (default: 5)") tf.flags.DEFINE_integer("evaluate_every", 300, "evaluate every this many batches") tf.flags.DEFINE_float("learning_rate", 0.01, "learning rate") tf.flags.DEFINE_float("grad_clip", 5, "grad clip to prevent gradient explode") FLAGS = tf.flags.FLAGS with tf.Session(config=config) as sess: ner = NER(vocab_size=FLAGS.vocab_size, num_classes=FLAGS.num_classes, embedding_size=FLAGS.embedding_size, hidden_size=FLAGS.hidden_size, max_num=FLAGS.max_num) # 外部定义 优化器 global_step = tf.Variable(0, trainable=False) optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate) # RNN中常用的梯度截断,防止出现梯度过大难以求导的现象 tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(ner.loss, tvars), FLAGS.grad_clip) grads_and_vars = tuple(zip(grads, tvars)) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) if not os.path.exists('./ckpt/'): os.makedirs("./ckpt/") # 恢复模型 / 重新初始化参数 # model_file = tf.train.latest_checkpoint('./ckpt/') ckpt = tf.train.get_checkpoint_state('./ckpt/') if ckpt: print("load saved model:\t", ckpt.model_checkpoint_path) saver = tf.train.Saver() saver.restore(sess, ckpt.model_checkpoint_path) else: print("init model...") sess.run(tf.global_variables_initializer()) def evaluate(viterbi_sequence, Y): ''' 计算变长的 准确率 指标 :return: ''' t_all = 0 t_true = 0 for p, y in zip(viterbi_sequence, Y): # 当前句子的长度 l = len(np.nonzero(y)) # 通过两个序列,计算准确率 t_all += l t_true += np.sum(np.equal(p[:l], y[:l])) return float(t_true) / float(t_all), t_true, t_all def train_step(x, y): feed_dict = { ner.input_ids: x, ner.output_types: y, ner.is_training: True, } _, step, predicts_t, cost, accuracy = sess.run([ train_op, global_step, ner.viterbi_sequence, ner.loss, ner.acc ], feed_dict) acc_t, count, total = evaluate(np.array(predicts_t), y) time_str = str(int(time.time())) print("{}: step {}, loss {}, f_acc {}, t_acc {}".format( time_str, step, cost, accuracy, acc_t)) # train_summary_writer.add_summary(summaries, step) return step def dev_step(x, y, writer=None): feed_dict = { ner.input_ids: x, ner.output_types: y, ner.is_training: False, } step, predicts_d, cost, accuracy = sess.run( [global_step, ner.viterbi_sequence, ner.loss, ner.acc], feed_dict) acc_d, count, total = evaluate(np.array(predicts_d), y) time_str = str(int(time.time())) print("+dev+{}: step {}, loss {}, f_acc {}, t_acc {}".format( time_str, step, cost, accuracy, acc_d)) return cost, accuracy, count, total best_accuracy, best_at_step = 0, 0 train_example_len = 173109 dev_example_len = 21639 
        num_train_steps = int(train_example_len / FLAGS.batch_size * FLAGS.num_epochs)
        num_dev_steps = int(dev_example_len / FLAGS.batch_size)

        max_acc = 0.0
        input_ids_train, output_types_train = get_input_data(
            "./train_data/train_ner.tf_record", FLAGS.batch_size)
        input_ids_dev, output_types_dev = get_input_data(
            "./train_data/dev_ner.tf_record", FLAGS.batch_size)
        for i in range(num_train_steps):
            # fetch one training batch
            input_ids_train_, output_types_train_ = sess.run(
                [input_ids_train, output_types_train])
            step = train_step(input_ids_train_, output_types_train_)
            if step % FLAGS.evaluate_every == 0:
                # the dev set is large, so it also has to be evaluated in batches
                total_dev_correct = 0
                total_devs = 0
                for j in range(num_dev_steps):
                    input_ids_dev_, output_types_dev_ = sess.run(
                        [input_ids_dev, output_types_dev])
                    loss, acc, count, total = dev_step(input_ids_dev_, output_types_dev_)
                    total_dev_correct += count
                    total_devs += total
                dev_accuracy = float(total_dev_correct) / total_devs
                print("correct predictions:", total_dev_correct)
                print("total evaluated:", total_devs)
                print("final dev accuracy:", dev_accuracy)
                if dev_accuracy > max_acc:
                    print("save model:\t%f\t>%f" % (dev_accuracy, max_acc))
                    max_acc = dev_accuracy
                    saver.save(sess, './ckpt/ner.ckpt', global_step=step)
    sess.close()
def load_model(model, epoch):
    """Load parameters from checkpoint"""
    ckpt_path = './output/{}/{}/models/model_epo{}.pkl'.format(args.model, args.expname, epoch)
    print(f'Loading model parameters from {ckpt_path}')
    model.load_state_dict(torch.load(ckpt_path))


config = getattr(configs, 'config_' + args.model)()

###############################################################################
# Load data
###############################################################################
train_set = APIDataset(args.data_path + 'train.desc.shuf.h5',
                       args.data_path + 'train.apiseq.shuf.h5', config['maxlen'])
valid_set = APIDataset(args.data_path + 'test.desc.shuf.h5',
                       args.data_path + 'test.apiseq.shuf.h5', config['maxlen'])
vocab_api = load_dict(args.data_path + 'vocab.apiseq.pkl')
vocab_desc = load_dict(args.data_path + 'vocab.desc.pkl')
n_tokens = len(vocab_api)
metrics = Metrics()
print("Loaded data!")

###############################################################################
# Define the models
###############################################################################
model = getattr(models, args.model)(config, n_tokens)
if args.reload_from >= 0:
    load_model(model, args.reload_from)
def main(data_path):
    dataset_en, unknown_en = data.load_dataset(data_path + "/dataset_en.pkl")
    dataset_vi, unknown_vi = data.load_dataset(data_path + "/dataset_vi.pkl")
    dict_en, dict_vi = data.load_dict(data_path)
    print(dataset_en[1])
def main(args): worddicts = load_dict(args.path + '/data/dictionary.txt') worddicts_r = [None] * len(worddicts) for kk, vv in worddicts.items(): worddicts_r[vv] = kk train, train_uid_list = dataIterator(args.path + '/data/offline-train.pkl', args.path + '/data/train_caption.txt', worddicts, batch_size=args.batch_size, batch_Imagesize=400000, maxlen=100, maxImagesize=400000) valid, valid_uid_list = dataIterator(args.path + '/data/offline-test.pkl', args.path + '/data/test_caption.txt', worddicts, batch_size=args.batch_size, batch_Imagesize=400000, maxlen=100, maxImagesize=400000) print('train lenght is ', len(train)) x = tf.placeholder(tf.float32, shape=[None, None, None, 1]) y = tf.placeholder(tf.int32, shape=[None, None]) x_mask = tf.placeholder(tf.float32, shape=[None, None, None]) y_mask = tf.placeholder(tf.float32, shape=[None, None]) lr = tf.placeholder(tf.float32, shape=()) if_trainning = tf.placeholder(tf.bool, shape=()) watcher_train = Watcher_train(blocks=3, level=16, growth_rate=24, training=if_trainning) annotation, anno_mask = watcher_train.dense_net(x, x_mask) # for initilaizing validation anno = tf.placeholder(tf.float32, shape=[ None, annotation.shape.as_list()[1], annotation.shape.as_list()[2], annotation.shape.as_list()[3] ]) infer_y = tf.placeholder(tf.int64, shape=(None, )) h_pre = tf.placeholder(tf.float32, shape=[None, 256]) alpha_past = tf.placeholder(tf.float32, shape=[ None, annotation.shape.as_list()[1], annotation.shape.as_list()[2] ]) attender = Attender(annotation.shape.as_list()[3], 256, 512) parser = Parser(256, 256, attender, annotation.shape.as_list()[3]) w = WAP(watcher_train, attender, parser, 256, 256, annotation.shape.as_list()[3], 111, if_trainning) hidden_state_0 = tf.tanh( tf.tensordot(tf.reduce_mean(anno, axis=[1, 2]), w.Wa2h, axes=1) + w.ba2h) # [batch, hidden_dim] cost = w.get_cost(annotation, y, anno_mask, y_mask) vs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) for vv in vs: if not vv.name.startswith('batch_normalization'): cost += 1e-4 * tf.reduce_sum(tf.pow(vv, 2)) p, w, h, alpha = w.get_word(infer_y, h_pre, alpha_past, anno) optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): trainer = optimizer.minimize(cost) max_epoch = 200 config = tf.ConfigProto() config.gpu_options.allow_growth = True init = tf.global_variables_initializer() uidx = 0 cost_s = 0 dispFreq = 100 saveFreq = len(train) sampleFreq = len(train) validFreq = len(train) history_errs = [] estop = False halfLrFlag = 0 patience = 15 lrate = 1.0 log = open(args.path + '/log-bs-6.txt', 'w') with tf.Session(config=config) as sess: sess.run(init) for epoch in range(max_epoch): n_samples = 0 random.shuffle(train) for batch_x, batch_y in train: batch_x, batch_x_m, batch_y, batch_y_m = prepare_data( batch_x, batch_y) n_samples += len(batch_x) uidx += 1 cost_i, _ = sess.run( [cost, trainer], feed_dict={ x: batch_x, y: batch_y, x_mask: batch_x_m, y_mask: batch_y_m, if_trainning: True, lr: lrate }) cost_s += cost_i if np.isnan(cost_i) or np.isinf(cost_i): print('invalid cost value detected') sys.exit(0) if np.mod(uidx, dispFreq) == 0: cost_s /= dispFreq print('Epoch ', epoch, 'Update ', uidx, 'Cost ', cost_s, 'Lr ', lrate) log.write('Epoch ' + str(epoch) + ' Update ' + str(uidx) + ' Cost ' + str(cost_s) + ' Lr ' + str(lrate) + '\n') log.flush() cost_s = 0 if np.mod(uidx, sampleFreq) == 0: fpp_sample = open( args.path + '/result/valid_decode_result-bs-6.txt', 'w') valid_count_idx = 0 
for batch_x, batch_y in valid: for xx in batch_x: xx = np.moveaxis(xx, 0, -1) xx_pad = np.zeros( (xx.shape[0], xx.shape[1], xx.shape[2]), dtype='float32') xx_pad[:, :, :] = xx / 255. xx_pad = xx_pad[None, :, :, :] annot = sess.run(annotation, feed_dict={ x: xx_pad, if_trainning: False }) h_state = sess.run(hidden_state_0, feed_dict={anno: annot}) sample, score = w.get_sample(p, w, h, alpha, annot, h_state, 10, 100, False, sess, training=False) score = score / np.array([len(s) for s in sample]) ss = sample[score.argmin()] fpp_sample.write(valid_uid_list[valid_count_idx]) valid_count_idx = valid_count_idx + 1 if np.mod(valid_count_idx, 100) == 0: print('gen %d samples' % valid_count_idx) log.write('gen %d samples' % valid_count_idx + '\n') log.flush() for vv in ss: if vv == 0: # <eol> break fpp_sample.write(' ' + worddicts_r[vv]) fpp_sample.write('\n') fpp_sample.close() print('valid set decode done') log.write('valid set decode done\n') log.flush() if np.mod(uidx, validFreq) == 0: probs = [] for batch_x, batch_y in valid: batch_x, batch_x_m, batch_y, batch_y_m = prepare_data( batch_x, batch_y) pprobs, annot = sess.run( [cost, annotation], feed_dict={ x: batch_x, y: batch_y, x_mask: batch_x_m, y_mask: batch_y_m, if_trainning: False }) probs.append(pprobs) valid_errs = np.array(probs) valid_err_cost = valid_errs.mean() os.system('python3.4 compute-wer.py ' + args.path + '/result/valid_decode_result-bs-6.txt' + ' ' + args.path + '/data/test_caption.txt' + ' ' + args.path + '/result/valid-bs-6.wer') fpp = open(args.path + '/result/valid-bs-6.wer') stuff = fpp.readlines() fpp.close() m = re.search('WER (.*)\n', stuff[0]) valid_per = 100. * float(m.group(1)) m = re.search('ExpRate (.*)\n', stuff[1]) valid_sacc = 100. * float(m.group(1)) valid_err = valid_per history_errs.append(valid_err) if uidx / validFreq == 0 or valid_err <= np.array( history_errs).min(): bad_counter = 0 if uidx / validFreq != 0 and valid_err > np.array( history_errs).min(): bad_counter += 1 if bad_counter > patience: if halfLrFlag == 2: print('Early Stop!') log.write('Early Stop!\n') log.flush() estop = True break else: print('Lr decay and retrain!') log.write('Lr decay and retrain!\n') log.flush() bad_counter = 0 lrate = lrate / 10 halfLrFlag += 1 print('Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f' % (valid_per, valid_sacc, valid_err_cost)) log.write('Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f' % (valid_per, valid_sacc, valid_err_cost) + '\n') log.flush() if estop: break
        all_preds.append(preds)
        all_lens.append(lens)
    sentences = [example[0] for example in ds.data]
    results = parse_decodes(sentences, all_preds, all_lens, label_vocab)
    return results


if __name__ == '__main__':
    paddle.set_device('gpu')

    # Create dataset, tokenizer and dataloader.
    train_ds, dev_ds, test_ds = load_dataset(datafiles=('./data/train.txt',
                                                        './data/dev.txt',
                                                        './data/test.txt'))
    label_vocab = load_dict('./data/tag.dic')
    tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
    trans_func = partial(convert_to_features,
                         tokenizer=tokenizer,
                         label_vocab=label_vocab)
    train_ds.map(trans_func)
    dev_ds.map(trans_func)
    test_ds.map(trans_func)

    ignore_label = -1
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
        all_lens.append(lens)
    sentences = [example[0] for example in ds.data]
    results = parse_decodes(sentences, all_preds, all_lens, label_vocab)
    return results


if __name__ == '__main__':
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, dev_ds, test_ds = load_dataset(
        datafiles=(os.path.join(args.data_dir, 'train.txt'),
                   os.path.join(args.data_dir, 'dev.txt'),
                   os.path.join(args.data_dir, 'test.txt')))
    label_vocab = load_dict(os.path.join(args.data_dir, 'tag.dic'))
    tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
    trans_func = partial(convert_to_features,
                         tokenizer=tokenizer,
                         label_vocab=label_vocab)
    train_ds.map(trans_func)
    dev_ds.map(trans_func)
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # token_type_ids
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() train_ds = load_dataset(datafiles=('./data/train.json')) tags_to_idx = load_dict("./data/tags.txt") labels_to_idx = load_dict("./data/classifier_labels.txt") tokenizer = ErnieCtmTokenizer.from_pretrained(args.model_dir) trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_len=args.max_seq_len, tags_to_idx=tags_to_idx, labels_to_idx=labels_to_idx) train_ds.map(trans_func) ignore_label = tags_to_idx["O"] batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64' ), # input_ids Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64' ), # token_type_ids Stack(dtype='int64'), # seq_len Pad(axis=0, pad_val=ignore_label, dtype='int64'), # tags Stack(dtype='int64'), # cls_label ): fn(samples) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=False, drop_last=True) train_data_loader = DataLoader(train_ds, batch_sampler=train_batch_sampler, num_workers=0, collate_fn=batchify_fn, return_list=True) model = ErnieCtmWordtagModel.from_pretrained( args.model_dir, num_cls_label=len(labels_to_idx), num_tag=len(tags_to_idx), ignore_index=tags_to_idx["O"]) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else ( len(train_data_loader) * args.num_train_epochs) warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) num_train_optimization_steps = len( train_ds) / args.batch_size * args.num_train_epochs decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) logger.info("Total steps: %s" % num_training_steps) logger.info("WarmUp steps: %s" % warmup) cls_acc = paddle.metric.Accuracy() seq_acc = SequenceAccuracy() total_loss = 0 global_step = 0 for epoch in range(1, args.num_train_epochs + 1): logger.info(f"Epoch {epoch} beginnig") start_time = time.time() for total_step, batch in enumerate(train_data_loader): global_step += 1 input_ids, token_type_ids, seq_len, tags, cls_label = batch outputs = model(input_ids, token_type_ids, lengths=seq_len, tag_labels=tags, cls_label=cls_label) loss, seq_logits, cls_logits = outputs[0], outputs[1], outputs[2] loss = loss.mean() total_loss += loss loss.backward() optimizer.step() optimizer.clear_grad() lr_scheduler.step() cls_correct = cls_acc.compute(pred=cls_logits.reshape( [-1, len(labels_to_idx)]), label=cls_label.reshape([-1])) cls_acc.update(cls_correct) seq_correct = seq_acc.compute(pred=seq_logits.reshape( [-1, len(tags_to_idx)]), label=tags.reshape([-1]), ignore_index=tags_to_idx["O"]) seq_acc.update(seq_correct) if global_step % args.logging_steps == 0 and global_step != 0: end_time = time.time() speed = float(args.logging_steps) / (end_time - start_time) logger.info( "[Training][" "epoch: %s/%s][step: %s/%s] loss: %6f, Classification Accuracy: %6f, Sequence Labeling Accuracy: %6f, speed: %6f" % (epoch, args.num_train_epochs, global_step, num_training_steps, total_loss / args.logging_steps, cls_acc.accumulate(), seq_acc.accumulate(), speed)) start_time = time.time() 
                cls_acc.reset()
                seq_acc.reset()
                total_loss = 0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps
                ) and paddle.distributed.get_rank() == 0:
                output_dir = os.path.join(
                    args.output_dir,
                    "ernie_ctm_ft_model_%d.pdparams" % (global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)