from elasticsearch import Elasticsearch, helpers
from tqdm import tqdm


def import_sejm_data():
    es = Elasticsearch()
    df = load_clean_df()
    embeddings = load_embeddings(LEXRANK_WEIGHTED, dim=None)
    df['embedding'] = embeddings

    def to_doc(row):
        return {"_index": 'sejm', "_id": row.id, "_source": row.to_dict()}

    docs_gen = (to_doc(row) for _, row in df.iterrows())
    sejm_settings = {
        "settings": {
            "max_result_window": 300000
        },
        "mappings": {
            "properties": {
                "embedding": {
                    "type": "dense_vector",
                    "dims": 768
                }
            }
        }
    }
    # ignore=400 so an "index already exists" error is not raised on re-runs.
    es.indices.create(index='sejm', ignore=400, body=sejm_settings)
    helpers.bulk(es, tqdm(docs_gen, total=len(df)))
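# A minimal sketch (not part of the original script) of querying the 'sejm'
# index built above with an Elasticsearch 7.x script_score cosine-similarity
# query; query_vector is a placeholder for a real 768-dim sentence embedding.
from elasticsearch import Elasticsearch


def search_similar(query_vector, size=10):
    es = Elasticsearch()
    body = {
        "size": size,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {"query_vector": query_vector},
                },
            }
        },
    }
    resp = es.search(index='sejm', body=body)
    return [(hit['_id'], hit['_score']) for hit in resp['hits']['hits']]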
def main(test_file,
         vocab_file,
         embeddings_file,
         pretrained_file,
         max_length=50,
         gpu_index=0,
         batch_size=128):
    """
    Test the ESIM model with pretrained weights on some dataset.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        vocab_file: The path to the vocabulary file of the model being tested.
        embeddings_file: The path to the pretrained word embeddings.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script. The hidden size and number of classes
            are recovered from this checkpoint and must therefore match
            the values used during training.
        max_length: The maximum sequence length. Defaults to 50.
        gpu_index: The index of the GPU to test on. Defaults to 0.
        batch_size: The size of the batches used for testing.
            Defaults to 128.
    """
    device = torch.device("cuda:{}".format(gpu_index)
                          if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    hidden_size = checkpoint["model"]["projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["classification.6.weight"].size(0)
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = LCQMC_Dataset(test_file, vocab_file, max_length)
    test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
    print("\t* Building model...")
    model = ESIM(hidden_size,
                 embeddings=embeddings,
                 num_classes=num_classes,
                 device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, auc = test(model, test_loader)
    print(
        "\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%, auc: {:.4f}\n"
        .format(batch_time, total_time, (accuracy * 100), auc))
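# load_embeddings is a project helper not shown in these snippets. A minimal
# sketch, assuming a whitespace-separated text format with one
# "token v1 ... vN" entry per line (e.g. GloVe-style vectors), returning the
# tensor the model constructors above expect:
import numpy as np
import torch


def load_embeddings_sketch(path):
    vectors = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vectors.append([float(v) for v in parts[1:]])
    return torch.from_numpy(np.asarray(vectors, dtype=np.float32))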
def main(test_file,
         vocab_file,
         embeddings_file,
         pretrained_file,
         max_length=50,
         gpu_index=0,
         batch_size=128):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = LCQMC_Dataset(test_file, vocab_file, max_length)
    test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
    print("\t* Building model...")
    model = SiaGRU(embeddings, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing SiaGRU model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, auc = test(model, test_loader)
    print(
        "\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%, auc: {:.4f}\n"
        .format(batch_time, total_time, (accuracy * 100), auc))
def model_load_test(test_df,
                    vocab_file,
                    embeddings_file,
                    pretrained_file,
                    test_prediction_dir,
                    test_prediction_name,
                    mode,
                    num_labels=2,
                    max_length=64,
                    gpu_index=0,
                    batch_size=128):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print("\t* Building model...")
    model = ABCNN(embeddings,
                  num_labels=num_labels,
                  num_layer=1,
                  linear_size=300,
                  max_length=max_length,
                  device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing ABCNN model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, predictions = test(model, test_loader)
    print(
        "\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%\n"
        .format(batch_time, total_time, (accuracy * 100)))
    test_prediction = pd.DataFrame({'prediction': predictions})
    if not os.path.exists(test_prediction_dir):
        os.makedirs(test_prediction_dir)
    test_prediction.to_csv(os.path.join(test_prediction_dir,
                                        test_prediction_name),
                           index=False)
def main(vocab_file,
         embeddings_file,
         pretrained_file,
         max_length=50,
         gpu_index=0,
         batch_size=128):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    embeddings = load_embeddings(embeddings_file)
    print("\t* Building model...")
    model = SiaGRU(embeddings, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing SiaGRU model on device: {} ".format(device),
          20 * "=")
    with open('./data/rumors.txt', 'r', encoding='utf-8') as f:
        database = f.readlines()
    while True:
        input("enter to continue")
        with open('./data/input.txt', 'r', encoding='utf-8') as f:
            inputs = f.readlines()
        # Build every (input, rumor) pair and score it with the model.
        init_csv(inputs, database, './data/work_data.csv')
        dataset = LCQMC_Dataset('./data/work_data.csv', vocab_file,
                                max_length)
        dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)
        prob = get_score(model, dataloader)
        for i, p in enumerate(prob):
            if p > 0.5:
                print("text:", inputs[i // len(database)])
                print("rumor:", database[i % len(database)])
                print("prob:", p)
def main(args):
    print(20 * "=", " Preparing for training ", 20 * "=")
    if not os.path.exists(args.result):
        os.makedirs(args.result)
    # -------------------- Load pretrained model ------------------- #
    checkpoints = torch.load(args.pretrained_file)
    # Model parameters can be recovered directly from the checkpoint,
    # or defined up front:
    # hidden_size = checkpoints["model"]["projection.0.weight"].size(0)
    # num_classes = checkpoints["model"]["classification.6.weight"].size(0)
    # -------------------- Data loading ------------------- #
    print("\t* Loading test data...")
    test_data = LCQMC_dataset(args.test_file,
                              args.vocab_file,
                              args.max_length,
                              test_flag=True)
    test_loader = DataLoader(test_data, batch_size=args.batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(args.embed_file)
    model = ESIM(args, embeddings=embeddings).to(args.device)
    model.load_state_dict(checkpoints["model"])
    print(20 * "=", " Testing ESIM model on device: {} ".format(args.device),
          20 * "=")
    all_predict = predict(model, test_loader)
    index = np.arange(len(all_predict))
    # -------------------- Generate the submission file ------------------- #
    df_test = pd.DataFrame(columns=['index', 'prediction'])
    df_test['index'] = index
    df_test['prediction'] = all_predict
    df_test.to_csv(args.submit_example_path,
                   index=False,
                   columns=['index', 'prediction'],
                   sep='\t')
def _create_loss(self):
    '''Create loss, output projection, RNN cell, embeddings'''
    print 'Creating loss... ',
    start = time.time()
    xavier = tf.contrib.layers.xavier_initializer()

    # Use an output projection if we're using sampled softmax.
    if config.NUM_SAMPLES > 0 and config.NUM_SAMPLES < self.dec_vocab:
        proj_w_size = config.HIDDEN_SIZE
        w = tf.Variable(xavier([proj_w_size, self.dec_vocab]), name='w')
        b = tf.Variable(xavier([self.dec_vocab]), name='b')
        self.output_projection = (w, b)

        def sampled_loss(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(tf.transpose(w), b, inputs,
                                              labels, config.NUM_SAMPLES,
                                              self.dec_vocab)

        self.softmax_loss = sampled_loss

    single_cell = tf.nn.rnn_cell.GRUCell(config.HIDDEN_SIZE)
    self.cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * config.NUM_LAYERS)

    if self.pretrained:
        # Set up variables for the special tokens and concatenate them
        # with the pretrained embeddings.
        pad = tf.zeros([1, config.EMBED_SIZE])
        flags = tf.Variable(xavier([3, config.EMBED_SIZE], dtype=tf.float32),
                            name='flags')
        embeddings = tf.constant(data.load_embeddings(self.data_path),
                                 dtype=tf.float32)
        self.embeddings = tf.concat(0, [pad, flags, embeddings])
    else:
        self.embeddings = tf.Variable(xavier(
            [self.enc_vocab, config.EMBED_SIZE]), dtype=tf.float32)

    feed_prev = self.feed_prev_placeholder
    self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, self.targets,
        self.decoder_masks, config.BUCKETS,
        lambda x, y: self._seq_f(x, y, feed_prev),
        softmax_loss_function=self.softmax_loss)

    # If we use an output projection, project the outputs for decoding.
    def project_outputs(cur, bucket):
        if self.output_projection:
            return [
                tf.matmul(output, self.output_projection[0]) +
                self.output_projection[1] for output in self.outputs[bucket]
            ]
        return tf.constant(False)

    for bucket in xrange(len(config.BUCKETS)):
        cur = self.outputs[bucket]
        self.outputs[bucket] = tf.cond(self.feed_prev_placeholder,
                                       lambda: project_outputs(cur, bucket),
                                       lambda: cur)

    print 'Took', time.time() - start, 'seconds'
parser.add_argument('--mode', type=str, default='demo',
                    help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()

## get char embeddings
# word2id: assigns an index to every distinct character; UNK takes the last slot.
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
print("\n========word2id=========\n", word2id)
if args.embedding_type == 'random':
    # Randomly initialize the embedding matrix
    # (3905 characters, 300 features by default, so shape 3905x300).
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    # Chinese word vectors trained on the wiki corpus with gensim (word2vec).
    embeddings = load_embeddings(args.embedding_dim, word2id,
                                 args.embedding_type)
print("\n=========embeddings==========\n", embeddings,
      "\ndim(embeddings)=", embeddings.shape)

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'ner_train_data')
    test_path = os.path.join('.', args.test_data, 'ner_test_data')
    train_data = read_corpus(train_path)  # read the training set
    test_data = read_corpus(test_path)  # read the test set
    test_size = len(test_data)
    print('train_data=\n', train_data)
    #print("\n==========train_data================\n", train_data)
    #print("\n==========test_data================\n", test_data)
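# random_embedding is project-specific and not shown here. A minimal sketch,
# assuming the common convention of uniform initialization in [-0.25, 0.25]
# with one row per vocabulary entry:
import numpy as np


def random_embedding_sketch(word2id, embedding_dim):
    mat = np.random.uniform(-0.25, 0.25, (len(word2id), embedding_dim))
    return np.float32(mat)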
def main(train_file,
         dev_file,
         embeddings_file,
         vocab_file,
         target_dir,
         max_length=50,
         epochs=50,
         batch_size=128,
         lr=0.0005,
         patience=5,
         max_grad_norm=10.0,
         gpu_index=0,
         checkpoint=None):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model is saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = LCQMC_Dataset(train_file, vocab_file, max_length)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = LCQMC_Dataset(dev_file, vocab_file, max_length)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(embeddings_file)
    model = BIMPM(embeddings, device=device).to(device)
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, auc = validate(model, dev_loader,
                                                  criterion)
    print(
        "\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
        .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training BIMPM model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print(
            "-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
            .format(epoch_time, epoch_loss, (epoch_accuracy * 100),
                    epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    # Optimizer state, needed to resume from this checkpoint.
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses
                }, os.path.join(target_dir, "best.pth.tar"))
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
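# A hypothetical invocation resuming the run above from the checkpoint it
# saves; all file paths here are placeholders, not the project's real data.
main(train_file="data/train.tsv",
     dev_file="data/dev.tsv",
     embeddings_file="data/embeddings.txt",
     vocab_file="data/vocab.txt",
     target_dir="models/bimpm",
     checkpoint="models/bimpm/best.pth.tar")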
def run_task(task_name, task_data, task_format, model_name, model_id):
    mode = config.layer_mode
    train_data, dev_data, test_data = None, None, None
    if config.train:
        dev_data = load_embeddings(task_name, task_data, task_format, 'dev',
                                   model_name, model_id, config.label_map)
        train_data = dev_data if config.sample else load_embeddings(
            task_name, task_data, task_format, 'train', model_name, model_id,
            config.label_map)
    if config.export:
        test_data = load_embeddings(task_name, task_data, task_format, 'test',
                                    model_name, model_id, config.label_map)
    if config.dry_run:
        print('loaded data, now stopping dry run')
        return

    layers_labels = []
    n_workers = config.num_workers if config.num_workers > 0 else 1
    if n_workers == 1:
        for layer in range(*config.layer_range):
            layer_labels, layer_preds = run_layer(train_data, dev_data,
                                                  test_data, task_name,
                                                  model_name, mode, layer)
            if config.export and layer_labels is not None:
                if len(layers_labels) == 0:
                    layers_labels.append(layer_labels)
                layers_labels.append(layer_preds)
    else:
        procs_queue, procs_running = [], []
        for layer in range(*config.layer_range):
            p = mp.Process(target=run_layer,
                           args=(train_data, dev_data, test_data, task_name,
                                 model_name, mode, layer))
            procs_queue.append(p)
        for p in procs_queue:
            # Wait for a worker slot to free up before starting the next
            # process.
            while len(procs_running) >= n_workers:
                time.sleep(1)
                for i in range(n_workers):
                    if not procs_running[i].is_alive():
                        procs_running.pop(i)
                        break
            procs_running.append(p)
            p.start()
        for p in procs_running:
            p.join()

    if len(layers_labels) > 0:
        preds_dir = os.path.join(task_name, 'predictions', config.name)
        preds_path = data_path(preds_dir, mode, 'json', model_name)
        with open(preds_path, 'w') as f:
            for labels in zip(*layers_labels):
                labels = [config.label_map[lab] for lab in labels]
                f.write('\t'.join(labels) + '\n')
        print(f'Saved layer-wise predictions to {preds_path}')

    if config.report:
        summaries = []
        for layer in range(*config.layer_range):
            summary_dir = os.path.join(task_name, 'summaries', config.name,
                                       mode)
            summary_path = data_path(summary_dir, mode, 'json', model_name,
                                     layer)
            if not os.path.exists(summary_path):
                print('skipping, {} does not exist'.format(summary_path))
            else:
                with open(summary_path) as f:
                    summary = json.load(f)
                summaries.append((layer, summary))
        report(summaries)
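# The manual queue above caps concurrency at n_workers by polling for
# finished processes. A sketch of the same fan-out with the standard
# multiprocessing.Pool instead (assumes run_layer and its arguments are
# picklable):
import multiprocessing as mp


def run_layers_pooled(train_data, dev_data, test_data, task_name, model_name,
                      mode, layer_range, n_workers):
    jobs = [(train_data, dev_data, test_data, task_name, model_name, mode,
             layer) for layer in range(*layer_range)]
    with mp.Pool(processes=n_workers) as pool:
        pool.starmap(run_layer, jobs)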
                n_gram=config.n_gram,
                context_mode=config.context_mode)
validation_corpus = Corpus(args.data + "/valid.txt",
                           dictionary,
                           create_dict=True,
                           use_cuda=args.cuda,
                           n_gram=config.n_gram,
                           context_mode=config.context_mode)

# TensorboardX object
writer = SummaryWriter("saved_runs/" + args.save)

# Word embeddings
embedding = nn.Embedding(len(dictionary), config.em_size, padding_idx=0)
if config.pre_trained:
    load_embeddings(embedding, dictionary.word2idx, config.pre_trained,
                    config.em_size)

# Model, Optimizer and Loss
model = LSTM_LM(embedding, config)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
criterion = nn.CrossEntropyLoss(ignore_index=0)

if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()

total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1
                   else x.size()[0] for x in model.parameters())
print('Model total parameters:', total_params, flush=True)
def test(
    name: str,
    model_type: str,
    step: int = None,
    mode: str = 'test',
    data_seed: int = None,
    data_name: str = 'SNLI',
    data_embedding: str = 'GloVe',
    data_pad: bool = True,
    batch_size: int = 10,
    print_errors: bool = False,
    print_errors_limit: int = 10,
    **kwargs,
) -> None:
    model_path = build.get_model_path(name)
    model = getattr(nn, model_type)(embeddings=data.load_embeddings(
        data_name, data_embedding, data_seed), **kwargs)
    log.info(str(model))
    log.debug('Model parameters:\n\n\t' +
              '\n\t'.join(graph.print_trainable_variables().split('\n')))

    with tf.Session(config=_make_config()) as sess:
        dataset = data.load_dataset(data_name, mode, data_embedding,
                                    data_seed)
        data_iter, data_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='data_handle',
            dataset=dataset,
            batch_size=batch_size,
            shuffle=False,
            pad=data_pad,
            session=sess)

        _restore_model(sess, model_path, step)

        y_preds, y_trues = [], []  # type: ignore
        sess.run(data_iter.initializer)
        while True:
            try:
                true, pred = sess.run(
                    [model.y, model.prediction],
                    feed_dict={
                        model.handle: data_hd,
                        model.keep_prob: 1.0,
                        model.is_training: False
                    })
                y_preds.extend(np.squeeze(pred).tolist())
                y_trues.extend(np.squeeze(true).tolist())
            except tf.errors.OutOfRangeError:
                break

        # Print accuracy
        print('Acc: %.4f' % sklearn.metrics.accuracy_score(y_trues, y_preds))

        # Print confusion matrix
        labels = list(
            sorted(data.SNLI.LABELS.keys(),
                   key=lambda x: data.SNLI.LABELS[x]))
        cm = sklearn.metrics.confusion_matrix(y_trues,
                                              y_preds,
                                              labels=range(len(labels)))
        tmpl = '%15s ' * (len(labels) + 2)
        print(tmpl % tuple([''] + labels + ['']))
        corr = 0
        for i in range(len(labels)):
            stats = cm[i]
            prob = stats[i] / sum(stats)
            corr += stats[i]
            print(tmpl % tuple([labels[i]] + list(map(str, cm[i])) +
                               ['%.4f' % prob]))
        print(tmpl % tuple(['%d / %d' % (corr, len(y_trues))] +
                           [''] * len(labels) +
                           ['%.4f' % (corr / len(y_trues))]))

        # Print errors
        if print_errors:
            tmpl = '\n%4d. Pred: %-20s True: %s\n %s\n %s'
            for i, (y_pred, y_true) in enumerate(zip(y_preds, y_trues)):
                if y_pred != y_true and print_errors_limit != 0:
                    s1 = ' '.join(dataset.x1_words[i])
                    s2 = ' '.join(dataset.x2_words[i])
                    l_pred = labels[y_pred]
                    l_true = labels[y_true]
                    print(tmpl % (i, l_pred, l_true, s1, s2))
                    print_errors_limit -= 1
def train(name: str,
          model_type: str,
          batch_size: int = 256,
          epoch_num: int = 200,
          keep_prob: float = 0.8,
          train_regex_list: t.Union[t.List[str], str] = None,
          optim_manager_type: str = 'NotChange',
          data_name: str = 'SNLI',
          data_embedding: str = 'GloVe',
          data_argument: bool = False,
          data_pad: bool = True,
          data_cache: bool = False,
          data_seed: int = None,
          record_every: int = 64000,
          validate_every: int = 640000,
          save_every: int = 6400000,
          restore_from: str = None,
          restore_step: int = None,
          profiling: bool = False,
          clip_norm: int = None,
          seed: int = None,
          debug: bool = False,
          **kwargs) -> None:
    # Data preparation
    model_path = build.get_model_path(name)
    shutil.rmtree(model_path, ignore_errors=True)  # remove previous trained

    # Network setup
    model = getattr(nn, model_type)(
        embeddings=data.load_embeddings(data_name, data_embedding, data_seed),
        **_select_kwargs_regex(kwargs, r'^optim[0-9]*_', invert=True))
    log.info(str(model))
    log.debug('Model parameters:\n\n\t' +
              '\n\t'.join(graph.print_trainable_variables().split('\n')))

    # Control randomization
    if seed:
        log.info(
            'Set random seed for data shuffling and graph computation: %d' %
            seed)
        tf.set_random_seed(seed)

    train_summary = _make_model_summary(model)

    with tf.Session(config=_make_config()) as sess:
        if debug:
            from tensorflow.python import debug as tf_debug
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        dataset_opts = {
            'pad': data_pad,
            'batch_size': batch_size,
            'session': sess,
        }
        train_iter, train_hd = _make_dataset_iterator(
            type_name='one_shot_iterator',
            handle_name='train_handle',
            dataset=data.load_dataset(data_name, 'train', data_embedding,
                                      data_seed),
            argument=data_argument,
            bucket_boundaries=[20, 50],
            repeat_num=epoch_num,
            cache=data_cache,
            seed=seed,
            **dataset_opts)
        valid_iter, valid_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='valid_handle',
            dataset=data.load_dataset(data_name, 'validation', data_embedding,
                                      data_seed),
            shuffle=False,
            cache=True,
            **dataset_opts)
        test_iter, test_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='test_handle',
            dataset=data.load_dataset(data_name, 'test', data_embedding,
                                      data_seed),
            shuffle=False,
            cache=True,
            **dataset_opts)

        om = _make_optim_manager(optim_manager_type, model.loss, clip_norm,
                                 train_regex_list, kwargs)

        test_wtr = tf.summary.FileWriter(os.path.join(model_path, 'test'))
        train_wtr = tf.summary.FileWriter(os.path.join(model_path, 'train'),
                                          sess.graph)
        # Build a validation summary writer for each optimizer
        valid_wtr = {}
        for optim in om.optims:
            valid_wtr[optim.get_name()] = tf.summary.FileWriter(
                os.path.join(model_path, 'valid-%s' % optim.get_name()))

        if restore_from:
            _copy_checkpoint(restore_from, model_path, restore_step)
            _restore_model(sess, model_path, restore_step)
            # Evaluate the pretrained model
            step = restore_step
            _iterate_dataset(sess, model, valid_iter, valid_hd,
                             valid_wtr[om.optim.get_name()], step)
            _iterate_dataset(sess, model, test_iter, test_hd, test_wtr, step)
        else:
            sess.run(tf.global_variables_initializer())
            step = 0

        if profiling:
            _profile_and_exit(sess, model, om.optim_op, train_hd)

        pbar = tqdm.tqdm(total=save_every, desc='Train', unit=' inst')
        try:
            while True:
                feed_dict = {
                    model.handle: train_hd,
                    model.keep_prob: keep_prob,
                    model.is_training: True
                }
                if om.feed_lr:
                    feed_dict[om.lr_op] = om.lr_val
                if step % record_every == 0:
                    summary, _, loss = sess.run(
                        [train_summary, om.optim_op, model.loss],
                        feed_dict=feed_dict)
                    pbar.set_postfix(loss='{:.3f}'.format(loss))
                    train_wtr.add_summary(summary, step)
                else:
                    sess.run([om.optim_op], feed_dict=feed_dict)

                if step and step % validate_every == 0:
                    pbar.set_description('Valid')
                    valid_acc = _iterate_dataset(
                        sess, model, valid_iter, valid_hd,
                        valid_wtr[om.optim.get_name()], step)
                    # Update upon the validation performance
                    om.update(valid_acc, step)
                    pbar.set_description('Test')
                    _iterate_dataset(sess, model, test_iter, test_hd,
                                     test_wtr, step)
                    pbar.set_description('Train')

                if step and step % save_every == 0:
                    save_path = _save_model(sess, model_path, step)
                    pbar.set_description(save_path)
                    pbar.update(batch_size)
                    pbar.close()
                    pbar = tqdm.tqdm(total=save_every, desc='Train',
                                     unit=' inst')
                else:
                    pbar.update(batch_size)
                step += batch_size
        except tf.errors.OutOfRangeError:
            save_path = _save_model(sess, model_path, step)
            pbar.set_description(save_path)
            log.info('Training finished!')
from data import model_path, load_embeddings
from tqdm import tqdm
from glob import glob

# (embedding name, n_neighbors) combinations to leave out.
skip = [
    ('sbert-lexrank-top1', 5),
    ('sbert-lexrank-top1', 10),
    ('sbert-tf-idf-top1', 5),
    ('sbert-tf-idf-top1', 10),
    ('use-lexrank-top1', 5),
    ('use-lexrank-top1', 10),
    ('use-tf-idf-top1', 5),
    ('use-tf-idf-top1', 10),
    ('use-tf-idf-top1', 15),
    ('use-tf-idf-top1', 20),
    ('use-tf-idf-top5', 5),
    ('use-tf-idf-top5', 10),
]

for emb_file in tqdm(sorted(glob(model_path + 'embeddings/*.pkl')),
                     position=0):
    emb = emb_file.split('/')[-1][:-4]  # strip directory and '.pkl' suffix
    for n_neighbors in tqdm([5, 10, 15, 20, 25, 50], position=1, leave=False):
        if (emb, n_neighbors) in skip:
            continue
        if 'top' in emb and n_neighbors in [25, 50]:
            continue
        load_embeddings(emb, dim=5, n_neighbors=n_neighbors)
def model_train_validate_test(train_df,
                              dev_df,
                              test_df,
                              embeddings_file,
                              vocab_file,
                              target_dir,
                              mode,
                              max_length=64,
                              num_labels=2,
                              epochs=50,
                              batch_size=256,
                              lr=0.0005,
                              patience=3,
                              max_grad_norm=10.0,
                              gpu_index=0,
                              if_save_model=False,
                              checkpoint=None):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model is saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if embeddings_file is not None:
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = ABCNN(embeddings=embeddings,
                  num_labels=num_labels,
                  num_layer=1,
                  linear_size=300,
                  max_length=max_length,
                  device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _ = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ABCNN model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _ = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            if if_save_model:
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        # Optimizer state, needed to resume from this
                        # checkpoint.
                        "optimizer": optimizer.state_dict(),
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses
                    }, os.path.join(target_dir, "best.pth.tar"))
                print("save model successfully!\n")
            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(model, test_loader,
                                                        criterion)
            print("Test accuracy: {:.4f}%\n".format(test_accuracy * 100))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir,
                                                "test_prediction.csv"),
                                   index=False)
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
import sys
import logging as log

from data import load_data, load_embeddings
from model import create_model, train_model
from evaluation import evaluate_model
from sklearn.model_selection import train_test_split, StratifiedKFold

log.basicConfig(format='%(asctime)s %(message)s', level=log.INFO)

try:
    experiment = experiments[sys.argv[1]]
except (IndexError, KeyError):
    log.error("experiment \"{0}\" does not exist".format(sys.argv[1]))
    sys.exit(1)

X_train, X_test, y_train, y_test, word_index = load_data(experiment)

if "embedding_file" in experiment:
    embedding_matrix = load_embeddings(experiment, word_index)
    model = create_model(experiment,
                         X_train,
                         y_train,
                         embedding_matrix=embedding_matrix,
                         word_index=word_index)
else:
    model = create_model(experiment, X_train, y_train, word_index=word_index)

model = train_model(model, X_train, y_train)
evaluate_model(model, X_test, y_test)

#pred = model.predict_classes(X_test)
#for p in pred:
#    print(p)
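# The experiments registry is defined elsewhere in this project. A
# hypothetical entry illustrating the one field the script above actually
# branches on ("embedding_file" toggles the pretrained-embedding path);
# every key and value here is a placeholder:
experiments = {
    "cnn-glove": {
        "embedding_file": "embeddings/glove.6B.300d.txt",
        "embedding_dim": 300,
    },
    "cnn-random": {
        "embedding_dim": 300,
    },
}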
def main(args):
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model is saved.
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = LCQMC_dataset(args.train_file, args.vocab_file,
                               args.max_length, test_flag=False)
    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              shuffle=True)
    print("\t* Loading valid data...")
    dev_data = LCQMC_dataset(args.dev_file, args.vocab_file, args.max_length,
                             test_flag=False)
    dev_loader = DataLoader(dev_data, batch_size=args.batch_size,
                            shuffle=True)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(args.embed_file)
    model = ESIM(args, embeddings=embeddings).to(args.device)
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()  # cross-entropy loss
    # Keep only the parameters that require gradient updates.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=args.lr)  # optimizer
    # Learning-rate schedule.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='max',
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if args.checkpoint:
        # Load the checkpoint data so training resumes where it stopped.
        checkpoints = torch.load(args.checkpoint)
        start_epoch = checkpoints["epoch"] + 1
        best_score = checkpoints["best_score"]
        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))
        model.load_state_dict(checkpoints["model"])
        optimizer.load_state_dict(checkpoints["optimizer"])
        epochs_count = checkpoints["epochs_count"]
        train_losses = checkpoints["train_losses"]
        valid_losses = checkpoints["valid_losses"]
        # Compute validation loss and accuracy before resuming training;
        # this is only done when continuing from a loaded checkpoint.
        _, valid_loss, valid_accuracy, auc = validate(model, dev_loader,
                                                      criterion)
        print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
              .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=",
          "Training ESIM model on device: {}".format(args.device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, args.epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch,
                                                       args.max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100),
                      epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best result; every entry stored here can be read back
            # from the checkpoint when resuming.
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses
                }, os.path.join(args.target_dir, "new_best.pth.tar"))
        # Save the model at each epoch as well (optional).
        torch.save(
            {
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "optimizer": optimizer.state_dict(),
                "epochs_count": epochs_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses
            }, os.path.join(args.target_dir,
                            "new_esim_{}.pth.tar".format(epoch)))
        if patience_counter >= args.patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break