def init_dataset(seq_path, tag_path, word_to_ix, max_seq_len, batch_size):
    """Build a shuffled ``DataLoader`` of sequences, attention masks and tags.

    The raw data is read with ``load_dataset``, converted by
    ``create_dataset`` (using ``word_to_ix['[PAD]']`` as the padding index),
    and the masks it returns are expanded by ``create_attention_mask``.

    Args:
        seq_path: Location of the sequences, handed to ``load_dataset``.
        tag_path: Location of the tags, handed to ``load_dataset``.
        word_to_ix: Vocabulary mapping; must contain the ``'[PAD]'`` key.
        max_seq_len: Length limit forwarded to ``create_dataset``.
        batch_size: Batch size of the returned loader.

    Returns:
        A ``DataLoader`` with ``shuffle=True`` over a ``TensorDataset`` of
        ``(seqs, extended_attention_mask, tags)``.
    """
    raw_seqs, raw_tags = load_dataset(seq_path, tag_path)

    # Looked up after loading so a missing '[PAD]' entry fails at the same
    # point as before.
    pad_index = word_to_ix['[PAD]']
    seqs, masks, tags = create_dataset(
        raw_seqs, raw_tags, word_to_ix, max_seq_len, pad_index)

    tensors = TensorDataset(seqs, create_attention_mask(masks), tags)
    return DataLoader(tensors, batch_size=batch_size, shuffle=True)
def infer(documents_path, beam_size, checkpoint_path):
    """Decode every document with beam search and print it beside its reference.

    Documents and reference summaries come from ``create_dataset``. The
    examples are encoded with ``tf_encode``, filtered with
    ``filter_token_size``, cached, and padded into batches whose size is half
    of the unfiltered example count. The checkpoint is restored before
    decoding starts, and the elapsed wall-clock time is printed at the end.

    Args:
        documents_path: Location handed to ``create_dataset``.
        beam_size: Beam width forwarded to ``beam_search_eval``.
        checkpoint_path: Checkpoint handed to ``restore_chkpt``.
    """
    documents, summaries = create_dataset(
        documents_path, num_examples=config.num_examples)
    examples = tf.data.Dataset.from_tensor_slices((documents, summaries))
    batch = len(documents) // 2
    train_dataset = (
        examples
        .map(tf_encode)
        .filter(filter_token_size)
        .cache()
        .padded_batch(batch, padded_shapes=([-1], [-1]))
    )

    restore_chkpt(checkpoint_path)
    started_at = time.time()
    for inp, tar in train_dataset:
        beam_output = beam_search_eval(inp, beam_size)
        # NOTE(review): index 0 on the second axis is presumably the
        # best-ranked beam -- confirm against beam_search_eval.
        first_beams = beam_output[0][:, 0, :]
        for reference_ids, predicted_ids in zip(tar, first_beams):
            print()
            reference_text = tokenizer_en.decode([
                tok for tok in reference_ids
                if tok < tokenizer_en.vocab_size
            ])
            print(f'Original summary: {reference_text}')
            # Ids of 0 are dropped from the prediction only.
            predicted_text = tokenizer_en.decode([
                tok for tok in predicted_ids
                if tok < tokenizer_en.vocab_size and tok > 0
            ])
            print(f'Predicted summary: {predicted_text}')
            print()
    print(f'time to process {time.time() - started_at}')
def change_dataset_and_train(addtional_tokens_per_batch, batch_size):
    """Train on a slice of the training split and report GPU utilization.

    A dataset is built from the 90-100 range of the ``train`` split with the
    requested ``batch_size``, ``training_loop`` is run on its first 1000
    elements, and the value returned by ``check_gpu_usage`` is logged and
    returned.

    Args:
        addtional_tokens_per_batch: Only written to the log by this function;
            it is not passed to ``create_dataset`` or ``training_loop``.
            (The misspelled name is kept for keyword-argument compatibility.)
        batch_size: Batch size used to build the dataset.

    Returns:
        Whatever ``check_gpu_usage()`` returns.
    """
    memory_test_dataset = create_dataset(
        split='train',
        source_tokenizer=source_tokenizer,
        target_tokenizer=target_tokenizer,
        from_=90,
        to=100,
        shuffle=True,
        batch_size=batch_size
    )
    # Adjacent literals instead of a backslash continuation inside the string:
    # the continuation pulled the next line's indentation into the message.
    log.info(
        f'Training with tokens_per_batch set to {addtional_tokens_per_batch} '
        f'and batch_size set to {batch_size}'
    )
    training_loop(memory_test_dataset.take(1000), False)
    gpu_usage = check_gpu_usage()
    log.info(f'GPU memory utilization is {gpu_usage}')
    return gpu_usage
def main(_):
    """Train the predicator, writing summaries and periodic checkpoints.

    Data comes from ``preprocess.create_dataset()``. For each of
    ``FLAGS.epoch`` epochs the model is trained on ``batch.iter_per_epoch``
    mini-batches, then one summary is written for the last training batch and
    one for the full test set. Every ``FLAGS.ckpt_interval`` epochs the model
    is dumped to ``<FLAGS.ckpt_dir>/<FLAGS.name>``.

    Args:
        _: Unused positional argument.
    """
    ckpt_path = os.path.join(FLAGS.ckpt_dir, FLAGS.name)

    (train_x, train_y), (test_x, test_y) = preprocess.create_dataset()
    batch = model.Batch(train_x, train_y, FLAGS.epoch)

    print('start session')
    with tf.Session() as sess:
        predicator = model.Predicator(matrix_shape=[9, 8],
                                      num_time=7,
                                      out_time=7,
                                      kernels=[[5, 5], [5, 5], [5, 5],
                                               [5, 5], [5, 5]],
                                      depths=[256, 128, 128, 64, 32],
                                      learning_rate=FLAGS.learning_rate,
                                      beta1=FLAGS.beta1)

        train_path = os.path.join(FLAGS.summary_dir, FLAGS.name, 'train')
        test_path = os.path.join(FLAGS.summary_dir, FLAGS.name, 'test')

        # The writers are created after the model so that sess.graph already
        # contains its ops.
        train_writer = tf.summary.FileWriter(train_path, sess.graph)
        test_writer = tf.summary.FileWriter(test_path, sess.graph)

        # Close the writers even if training raises, so buffered events are
        # flushed to disk and the event files are released.
        try:
            print('start training')
            sess.run(tf.global_variables_initializer())
            for i in range(FLAGS.epoch):
                for n in range(batch.iter_per_epoch):
                    batch_x, batch_y = batch()
                    predicator.train(sess, batch_x, batch_y)

                print(i, 'th epoch')

                # The training summary uses the last mini-batch of the epoch.
                summary = predicator.inference(sess, predicator.summary,
                                               batch_x, batch_y)
                train_writer.add_summary(summary, global_step=i)

                summary = predicator.inference(sess, predicator.summary,
                                               test_x, test_y)
                test_writer.add_summary(summary, global_step=i)

                if (i + 1) % FLAGS.ckpt_interval == 0:
                    predicator.dump(sess, ckpt_path, i)
        finally:
            train_writer.close()
            test_writer.close()
if __name__ == "__main__": args = parser.parse_args() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") with open(args.config_path, 'r') as f: config = json.load(f) model = BertAbsSum(args.bert_model, config['decoder_config'], device) model.load_state_dict(torch.load(args.model_path)) model.to(device) processor = LCSTSProcessor() tokenizer = BertTokenizer.from_pretrained(os.path.join(args.bert_model, 'vocab.txt')) test_examples = processor.get_examples(args.eval_path) test_features = convert_examples_to_features(test_examples, args.max_src_len, args.max_tgt_len, tokenizer) test_data = create_dataset(test_features) test_sampler = RandomSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE, drop_last=True) logger.info('Loading complete. Writing results to %s' % (args.result_path)) model.eval() f_log = open(os.path.join(args.result_path, 'log.txt'), 'w', encoding='utf-8') # f_hyp = open(os.path.join(args.result_path, 'hyp.txt'), 'w', encoding='utf-8') # f_ref = open(os.path.join(args.result_path, 'ref.txt'), 'w', encoding='utf-8') hyp_list = [] ref_list = [] for batch in tqdm(test_dataloader, desc="Iteration"): batch = tuple(t.to(device) for t in batch) pred, _ = model.beam_decode(batch[0], batch[1], 3, 3) src, tgt = batch[0], batch[2] for i in range(BATCH_SIZE):
example = train_examples[0] example_feature = train_features[0] logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("src text: %s" % example.src) logger.info("src_ids: %s" % " ".join([str(x) for x in example_feature.src_ids])) logger.info("src_mask: %s" % " ".join([str(x) for x in example_feature.src_mask])) logger.info("tgt text: %s" % example.tgt) logger.info("tgt_ids: %s" % " ".join([str(x) for x in example_feature.tgt_ids])) logger.info("tgt_mask: %s" % " ".join([str(x) for x in example_feature.tgt_mask])) logger.info('Building dataloader...') train_data = create_dataset(train_features) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, drop_last=True) # eval data preprocess if not os.path.exists(os.path.join(args.data_dir, 'eval.csv')): logger.info( 'No eval data found in data directory. Eval will not be performed.' ) eval_dataloader = None else: logger.info('Loading eval dataset...') eval_examples = processor.get_examples(
def restore_chkpt(checkpoint_path): ckpt = tf.train.Checkpoint( Model=Model ) assert tf.train.latest_checkpoint(os.path.split(checkpoint_path)[0]), 'Incorrect checkpoint directory' ckpt.restore(checkpoint_path).expect_partial() print(f'{checkpoint_path} restored') restore_chkpt(config.infer_ckpt_path) test_dataset = create_dataset( split='test', source_tokenizer=source_tokenizer, target_tokenizer=target_tokenizer, from_=0, to=100, batch_size=config.validation_batch_size, drop_remainder=True ) max_combined_metric = 0 decoder_type = 'topktopp' temperatures = [1] length_penalties = [0.8] beams = [12, 13, 14, 15] top_ps = [1] top_ks = [10] for beam_size in beams: for length_penalty in length_penalties: for top_p in top_ps:
def _print_binary_metrics(cm, suffix=''):
    """Print classification metrics derived from a 2x2 confusion matrix.

    Cells are read as ``tn=cm[0, 0]``, ``fp=cm[0, 1]``, ``fn=cm[1, 0]`` and
    ``tp=cm[1, 1]``. Each metric is printed as ``<name><suffix>: <value>``.
    """
    tn = cm[0, 0]
    tp = cm[1, 1]
    fn = cm[1, 0]
    fp = cm[0, 1]

    rec = tp / (tp + fn)  # recall / sensitivity
    sp = tn / (tn + fp)  # specificity
    pre = tp / (tp + fp)  # precision
    q_ = tn / (tn + fn)  # negative predictive value
    bacc = (rec + sp) / 2  # balanced accuracy
    f1 = 2 * pre * rec / (pre + rec)
    # Matthews correlation coefficient
    mcc = ((tp * tn) - (fp * fn)) / math.sqrt(
        (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    acc = (tp + tn) / (tp + fp + fn + tn)

    for name, value in (('bacc', bacc), ('pre', pre), ('rec', rec),
                        ('f1', f1), ('mcc', mcc), ('sp', sp), ('q_', q_),
                        ('acc', acc)):
        print(f'{name}{suffix}:', value)


def train(test_name,
          radius=1,
          dim=52,
          layer_hidden=4,
          layer_output=10,
          dropout=0.45,
          batch_train=8,
          batch_test=8,
          lr=3e-4,
          lr_decay=0.85,
          decay_interval=25,
          iteration=140,
          N=5000,
          dataset_train='../dataset/data_train.txt'):
    """Train a ``MolecularGraphNeuralNetwork`` classifier and report metrics.

    Per epoch a tab-separated result line is appended to ``AUC.txt`` and the
    model is saved to ``model.h5`` (both in the working directory). After the
    last epoch the AUC curves are plotted to ``loss.tif`` and confusion-matrix
    metrics of the final epoch are printed for the train and test sets.

    Args:
        test_name: Test-set file handed to ``pp.create_dataset``.
        radius, dim, layer_hidden, layer_output, batch_train, batch_test,
            decay_interval, iteration: Integer hyperparameters (cast with
            ``int``). ``radius`` is not used by this function.
        dropout: Dropout value handed to the model (cast with ``float``).
        lr: Learning rate handed to ``Trainer``.
        lr_decay: Factor applied to the optimizer's learning rate every
            ``decay_interval`` epochs.
        N: First argument of ``MolecularGraphNeuralNetwork``.
        dataset_train: Training-set file handed to ``pp.create_dataset``.

    Returns:
        The transposed test result of the final epoch (``test_res.T``).
    """
    dataset_test = test_name
    (radius, dim, layer_hidden, layer_output, batch_train, batch_test,
     decay_interval, iteration) = map(int, [
         radius, dim, layer_hidden, layer_output, batch_train, batch_test,
         decay_interval, iteration
     ])
    # dropout must stay a float: int(0.45) is 0, which silently switched
    # dropout off when it was cast together with the integer settings.
    lr, lr_decay, dropout = map(float, [lr, lr_decay, dropout])

    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('The code uses a GPU!')
    else:
        device = torch.device('cpu')
        print('The code uses a CPU...')

    print('-' * 100)
    print('Just a moment......')
    print('-' * 100)
    path = ''
    dataname = ''
    dataset_train = pp.create_dataset(dataset_train, path, dataname)
    dataset_test = pp.create_dataset(dataset_test, path, dataname)
    np.random.seed(0)
    np.random.shuffle(dataset_train)
    np.random.shuffle(dataset_test)
    print('The preprocess has finished!')
    print('# of training data samples:', len(dataset_train))
    print('# of test data samples:', len(dataset_test))
    print('-' * 100)

    print('Creating a model.')
    torch.manual_seed(0)
    model = MolecularGraphNeuralNetwork(N, dim, layer_hidden, layer_output,
                                        dropout).to(device)
    trainer = Trainer(model, lr, batch_train)
    tester = Tester(model, batch_test)
    print('# of model parameters:',
          sum(np.prod(p.size()) for p in model.parameters()))
    print('-' * 100)

    file_result = path + 'AUC' + '.txt'
    result = 'Epoch\tTime(sec)\tLoss_train\tLoss_test\tAUC_train\tAUC_test'
    file_model = path + 'model' + '.h5'
    with open(file_result, 'w') as f:
        f.write(result + '\n')

    print('Start training.')
    print('The result is saved in the output directory every epoch!')

    np.random.seed(0)

    start = timeit.default_timer()

    for epoch in range(1, iteration + 1):
        if epoch % decay_interval == 0:
            trainer.optimizer.param_groups[0]['lr'] *= lr_decay

        prediction_train, loss_train, train_res = trainer.train(dataset_train)
        prediction_test, loss_test, test_res = tester.test_classifier(
            dataset_test)

        elapsed = timeit.default_timer() - start

        if epoch == 1:
            # Extrapolate the total run time from the first epoch.
            minutes = elapsed * iteration / 60
            hours = int(minutes / 60)
            minutes = int(minutes - 60 * hours)
            print('The training will finish in about', hours, 'hours',
                  minutes, 'minutes.')
            print('-' * 100)
            print(result)

        result = '\t'.join(
            map(str, [
                epoch, elapsed, loss_train, loss_test, prediction_train,
                prediction_test
            ]))
        tester.save_result(result, file_result)
        tester.save_model(model, file_model)

        print(result)

    loss = pd.read_table(file_result)
    plt.plot(loss['AUC_train'], color='r', label='AUC of train set')
    plt.plot(loss['AUC_test'], color='b', label='AUC of test set')
    plt.ylabel('AUC')
    plt.xlabel('Epoch')
    # Follows the epoch count instead of a fixed 145 (same value for the
    # default of 140 epochs), so longer runs are not clipped.
    plt.xlim(-1, iteration + 5)
    plt.ylim(0, 1)
    plt.legend()
    plt.savefig(path + 'loss.tif', dpi=300)
    plt.show()

    res_test = test_res.T
    res_train = train_res.T

    _print_binary_metrics(
        confusion_matrix(res_train[:, 0], res_train[:, 1]), '_train')
    _print_binary_metrics(confusion_matrix(res_test[:, 0], res_test[:, 1]))
    print('auc:', prediction_test)

    return res_test
import tensorflow as tf
# Reset Keras global state and fix the TensorFlow seed before anything else
# is imported or built.
tf.keras.backend.clear_session()
tf.random.set_seed(100)
import time
from tqdm import tqdm
from preprocess import create_dataset
from configuration import config, source_tokenizer, target_tokenizer
from calculate_metrics import mask_and_calculate_loss
from utilities import log
from model_training_helper import (check_ckpt, eval_step, train_step,
                                   batch_run_check, save_evaluate_monitor)

# Training data: the 0-100 range of the 'train' split, shuffled, batched with
# the configured training batch size.
train_dataset = create_dataset(split='train',
                               source_tokenizer=source_tokenizer,
                               target_tokenizer=target_tokenizer,
                               from_=0,
                               to=100,
                               batch_size=config.train_batch_size,
                               shuffle=True)
# Validation data: shuffled, incomplete final batch dropped, limited to
# config.samples_to_validate examples.
val_dataset = create_dataset(split='validation',
                             source_tokenizer=source_tokenizer,
                             target_tokenizer=target_tokenizer,
                             from_=0,
                             to=100,
                             batch_size=config.validation_batch_size,
                             shuffle=True,
                             drop_remainder=True,
                             num_examples_to_select=config.samples_to_validate)
# if a checkpoint exists, restore the latest checkpoint.
ck_pt_mgr = check_ckpt(config.checkpoint_path)
import tensorflow as tf
tf.keras.backend.clear_session()
tf.random.set_seed(100)
import time
from tqdm import tqdm
from preprocess import create_dataset
from configuration import config, source_tokenizer, target_tokenizer
from calculate_metrics import mask_and_calculate_loss
from utilities import log
from model_training_helper import (check_ckpt, eval_step, train_step,
                                   batch_run_check, train_sanity_check)

# Small, unshuffled batches so the printed inputs are easy to inspect.
train_dataset = create_dataset(split='train',
                               source_tokenizer=source_tokenizer,
                               target_tokenizer=target_tokenizer,
                               from_=0,
                               to=100,
                               batch_size=2,
                               shuffle=False)

# if a checkpoint exists, restore the latest checkpoint.
ck_pt_mgr = check_ckpt(config.checkpoint_path)
total_steps = int(config.epochs * (config.gradient_accumulation_steps))
train_dataset = train_dataset.repeat(total_steps)

# Print the input ids of the first 100 batches (steps are counted from 1).
for (step, (input_ids, target_ids)) in tqdm(enumerate(train_dataset, 1),
                                            initial=1):
    # Was print(inputs_ids): an undefined name that raised NameError on the
    # first batch.
    print(input_ids)
    if step == 100:
        break
import GPUtil
from io import StringIO
from tqdm import tqdm
from preprocess import create_dataset
from configuration import config, source_tokenizer, target_tokenizer
from calculate_metrics import mask_and_calculate_loss, monitor_run
from utilities import log, detokenize
from create_model import Model
from model_training_helper import (check_ckpt, eval_step, train_step,
                                   batch_run_check, train_sanity_check,
                                   evaluate_validation_set)

# Dataset for the unit tests: the 90-100 range of the 'train' split, with the
# incomplete final batch dropped.
unit_test_dataset = create_dataset(
    split='train',
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    from_=90,
    to=100,
    batch_size=config.unit_test_dataset_batch_size,
    drop_remainder=True
)


def check_gpu_usage():
    """Return one field of the table printed by ``GPUtil.showUtilization()``.

    ``showUtilization`` only prints, so its output is captured from stdout.
    The captured text is split on ``'|'`` and the second-to-last field is
    returned with surrounding whitespace removed.

    Returns:
        The extracted field as a string.
        NOTE(review): presumably the memory-utilization cell of the last GPU
        row -- confirm against GPUtil's table layout.

    Raises:
        IndexError: If the captured output contains fewer than two ``'|'``
            separated fields.
    """
    # Local import keeps this block self-contained.
    import contextlib

    captured = StringIO()
    # redirect_stdout restores sys.stdout even if showUtilization() raises;
    # the previous manual swap left stdout redirected in that case.
    with contextlib.redirect_stdout(captured):
        GPUtil.showUtilization()
    gpu_usage = captured.getvalue().strip().split('|')[-2].strip()
    return gpu_usage
# Data preprocessing
print("Data preprocess ...")

# Scale the 'wti_price' column into [0, 1]; the scaler expects a 2-D column.
scaler = MinMaxScaler(feature_range=(0, 1))
price_column = df['wti_price'].values.reshape(-1, 1)
dataset = scaler.fit_transform(price_column)

# Train/validation split: the first 80% of rows train, the rest validate.
split_at = math.floor(len(dataset) * 0.8)
train = dataset[:split_at, :]
val = dataset[split_at:, :]

# Turn each series into supervised (X, y) pairs.
look_back = 1
trainX, trainY = p.create_dataset(train, look_back)
valX, valY = p.create_dataset(val, look_back)

# Wrap the arrays as datasets for the DataLoader.
train_dataset = d.PriceDataset(X=trainX, y=trainY)
val_dataset = d.PriceDataset(X=valX, y=valY)

# Batch the data; only the training set is shuffled.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True)
val_loader = DataLoader(dataset=val_dataset,
                        batch_size=batch_size,
                        shuffle=False)
# -*- coding: utf-8 -*- """ Created on Sat Oct 26 12:50:32 2019 @author: pravech3 """ import tensorflow_datasets as tfds from input_path import file_path from preprocess import create_dataset # def create_dataset(path, num_examples): # df = pd.read_csv(path) # df = df[:num_examples] # return (df['cisco_technical_team'].values, df['Actions_taken'].values) doc, summ = create_dataset(file_path.csv_path, None) def tokenizer(doc, summ): try: tokenizer_en = tfds.features.text.SubwordTextEncoder.load_from_file( file_path.subword_vocab_path) except: print( f'creating the subword vocab . This may take some time depending on the training data size' ) tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus( (doc for doc, _ in zip(doc, summ)), target_vocab_size=2**13) tokenizer_en = tokenizer_en.save_to_file(file_path.subword_vocab_path) print("Subword Tokenizer created")
# Hyperparameters and data location for the SMRT run.
batch_train = 32
batch_test = 32
lr = 1e-4
lr_decay = 0.85
decay_interval = 10
iteration = 200
N = 5000
path = '/data/'
dataname = 'SMRT'

# Select the compute device once. The same block used to be repeated after
# the datasets were loaded, which only re-assigned `device` and printed the
# banner a second time.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('The code uses a GPU!')
else:
    device = torch.device('cpu')
    print('The code uses a CPU...')

# 90/10 split of the training file into train and dev sets; the test set is
# read from its own file.
dataset_train = pp.create_dataset('SMRT_train_set.txt', path, dataname)
dataset_train, dataset_dev = split_dataset(dataset_train, 0.9)
dataset_test = pp.create_dataset('SMRT_test_set.txt', path, dataname)

lr, lr_decay = map(float, [lr, lr_decay])

print('-' * 100)
# NOTE(review): `dataset` is not assigned in the visible code -- confirm it is
# defined earlier (perhaps `dataname` was intended).
print('Preprocessing the', dataset, 'dataset.')
print('Just a moment......')
print('-' * 100)
print('The preprocess has finished!')
print('# of training data samples:', len(dataset_train))
return score_1 if __name__ == '__main__': history_data, future_data, sample_ps, sample_vm, dim_to_be_optimized, history_begin, predict_begin, predict_end, flavor_num = read_data( ) lse_model = linear_regression() predict = [] actual = [] for i in range(total_flavors): predict_list = [] # history_data[i] = avg_filter(history_data[i]) history_data[i] = get_pow(history_data[i], exponent) history_data[i] = batch_add(history_data[i], addition) x_train, y_train, x_last = create_dataset(history_data[i], 7, 1) x_train = gaussian_weighted(x_train) x_last = gaussian_weighted(x_last) lse_model.lse_fit(x_train, y_train) x_train.show() for j in range(predict_span): predict_val = lse_model.predict(x_last) predict_list.append(predict_val) predict_mat = matrix(1, 1, predict_val) x_last.col_append(predict_mat) x_last.col_deque() predict_list = batch_add(predict_list, -addition) predict_list = get_pow(predict_list, 1 / exponent)
from sklearn.model_selection import train_test_split
from structure import structure
from preprocess import create_dataset
from model import Model

if __name__ == "__main__":
    # Step 1: fetch the raw files.
    print("Downloading files")
    structure()

    # Step 2: build the dataset and hold out 5% of it.
    print("Creating dataset")
    features, targets = create_dataset()
    splits = train_test_split(features, targets, test_size=0.05)
    X_train, X_test, y_train, y_test = splits

    # Step 3: fit the model; the held-out split is passed to fit() as well.
    model = Model()
    fit_options = {"early": 4, "epoches": 50}
    model.fit(X_train, y_train, X_test, y_test, **fit_options)
# Make sure both learning-rate settings are floats.
lr, lr_decay = float(lr), float(lr_decay)

# Pick the compute device and announce the choice.
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print('The code uses a CPU...')
else:
    device = torch.device('cuda')
    print('The code uses a GPU!')

print('-' * 100)
print('Just a moment......')
print('-' * 100)

# Train and test sets are read from separate files under `path`.
path = 'E:/code/drug/drugnn/'
dataname = ''
dataset_train = pp.create_dataset('data_train.txt', path, dataname)
dataset_test = pp.create_dataset('data_test.txt', path, dataname)

print('The preprocess has finished!')
print('# of training data samples:', len(dataset_train))
print('# of test data samples:', len(dataset_test))
print('-' * 100)

# Seed torch before the model is built.
print('Creating a model.')
torch.manual_seed(111)
network = MolecularGraphNeuralNetwork(N, dim, layer_hidden, layer_output)
model = network.to(device)
trainer = Trainer(model)
import tensorflow as tf
# Reset Keras global state and fix the TensorFlow seed before anything else
# is imported or built.
tf.keras.backend.clear_session()
tf.random.set_seed(100)
import time
from tqdm import tqdm
from preprocess import create_dataset
from configuration import config, source_tokenizer, target_tokenizer
from calculate_metrics import mask_and_calculate_loss
from utilities import log
from model_training_helper import (check_ckpt, evaluate_validation_set,
                                   training_results)

# Validation data: the 0-100 range of the 'validation' split in shuffled
# batches of 8, with the incomplete final batch dropped.
val_dataset = create_dataset(split='validation',
                             source_tokenizer=source_tokenizer,
                             target_tokenizer=target_tokenizer,
                             from_=0,
                             to=100,
                             batch_size=8,
                             shuffle=True,
                             drop_remainder=True)
count = 0
step = 1
# Count the elements the dataset yields.
# NOTE(review): if create_dataset returns a batched dataset, this counts
# batches rather than individual records -- confirm the message below.
for (i, o) in val_dataset:
    #print(f'input {tf.shape(i)}')
    #print(f'output {tf.shape(o)}')
    count += 1
print(f'Total records count is {count}')
#sys.exit()
#restore checkpoint
ck_pt_mgr = check_ckpt(config.checkpoint_path)
start_time = time.time()