Example #1
def init_dataset(seq_path, tag_path, word_to_ix, max_seq_len, batch_size):
    seqs, tags = load_dataset(seq_path, tag_path)
    seqs, masks, tags = create_dataset(seqs, tags, word_to_ix, max_seq_len,
                                       word_to_ix['[PAD]'])
    extended_attention_mask = create_attention_mask(masks)
    dataset = TensorDataset(seqs, extended_attention_mask, tags)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
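# Hedged usage sketch for init_dataset above; the file names and the toy
# vocabulary are illustrative stand-ins, not values from the original project.
word_to_ix = {'[PAD]': 0, '[UNK]': 1}
train_loader = init_dataset('seqs.txt', 'tags.txt', word_to_ix,
                            max_seq_len=128, batch_size=32)
for seqs, attention_mask, tags in train_loader:
    pass  # each batch here is ready to be fed to a token-classification model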
def infer(documents_path, beam_size, checkpoint_path):
    doc, summ = create_dataset(documents_path,
                               num_examples=config.num_examples)
    train_examples = tf.data.Dataset.from_tensor_slices((doc, summ))
    batch = len(doc) // 2
    train_dataset = train_examples.map(tf_encode)
    train_dataset = train_dataset.filter(filter_token_size)
    train_dataset = train_dataset.cache()
    train_dataset = train_dataset.padded_batch(batch,
                                               padded_shapes=([-1], [-1]))
    #print(f'Number of records before filtering was {len(doc)}')
    #print(f'Number of records to be inferenced is {sum(1 for l in train_dataset) * batch} approx')
    restore_chkpt(checkpoint_path)
    start_time = time.time()
    for (_, (inp, tar)) in enumerate(train_dataset):
        translated_output_temp = beam_search_eval(inp, beam_size)
        #print(translated_output_temp)
        for true_summary, top_sentence_ids in zip(
                tar, translated_output_temp[0][:, 0, :]):
            print()
            print('Original summary: {}'.format(
                tokenizer_en.decode(
                    [j for j in true_summary if j < tokenizer_en.vocab_size])))
            print('Predicted summary: {}'.format(
                tokenizer_en.decode([
                    j for j in top_sentence_ids if j < tokenizer_en.vocab_size
                    if j > 0
                ])))
            print()
    print('time to process {}'.format(time.time() - start_time))
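# Hedged usage sketch for infer() above; the document path, beam size and
# checkpoint path are hypothetical placeholders, not the project's real config.
infer('data/documents.csv', beam_size=3,
      checkpoint_path='checkpoints/ckpt-10')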
Example #3
def change_dataset_and_train(additional_tokens_per_batch, batch_size):
    
    memory_test_dataset = create_dataset(
                              split='train', 
                              source_tokenizer=source_tokenizer, 
                              target_tokenizer=target_tokenizer, 
                              from_=90, 
                              to=100, 
                              shuffle=True,
                              batch_size=batch_size
                              )
    log.info(f'Training with tokens_per_batch set to {additional_tokens_per_batch} '
             f'and batch_size set to {batch_size}')
    training_loop(memory_test_dataset.take(1000), False)
    gpu_usage = check_gpu_usage()
    log.info(f'GPU memory utilization is {gpu_usage}')

    return gpu_usage
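# Hedged sketch of how change_dataset_and_train might be swept over a few
# hypothetical (tokens_per_batch, batch_size) pairs to compare GPU memory use;
# the values below are illustrative only.
for tokens_per_batch, bs in [(4096, 8), (8192, 16), (16384, 32)]:
    usage = change_dataset_and_train(tokens_per_batch, bs)
    log.info(f'batch_size={bs} -> GPU usage {usage}')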
Example #4
def main(_):
    # total_x, total_y, x_dim, y_dim
    ckpt_path = os.path.join(FLAGS.ckpt_dir, FLAGS.name)
    (train_x, train_y), (test_x, test_y) = preprocess.create_dataset()

    batch = model.Batch(train_x, train_y, FLAGS.epoch)

    print('start session')
    with tf.Session() as sess:
        predicator = model.Predicator(matrix_shape=[9, 8],
                                      num_time=7,
                                      out_time=7,
                                      kernels=[[5, 5], [5, 5], [5, 5], [5, 5],
                                               [5, 5]],
                                      depths=[256, 128, 128, 64, 32],
                                      learning_rate=FLAGS.learning_rate,
                                      beta1=FLAGS.beta1)

        train_path = os.path.join(FLAGS.summary_dir, FLAGS.name, 'train')
        test_path = os.path.join(FLAGS.summary_dir, FLAGS.name, 'test')

        train_writer = tf.summary.FileWriter(train_path, sess.graph)
        test_writer = tf.summary.FileWriter(test_path, sess.graph)

        print('start training')
        sess.run(tf.global_variables_initializer())
        for i in range(FLAGS.epoch):
            for n in range(batch.iter_per_epoch):
                batch_x, batch_y = batch()
                predicator.train(sess, batch_x, batch_y)

            print(i, 'th epoch')
            summary = predicator.inference(sess, predicator.summary, batch_x,
                                           batch_y)
            train_writer.add_summary(summary, global_step=i)

            summary = predicator.inference(sess, predicator.summary, test_x,
                                           test_y)
            test_writer.add_summary(summary, global_step=i)

            if (i + 1) % FLAGS.ckpt_interval == 0:
                predicator.dump(sess, ckpt_path, i)
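# Hedged entry-point sketch: TF1-style scripts like this one are usually
# launched through tf.app.run(); the FLAGS referenced above (ckpt_dir, name,
# epoch, summary_dir, learning_rate, beta1, ckpt_interval) are assumed to be
# defined elsewhere via tf.app.flags, not shown in this snippet.
if __name__ == '__main__':
    tf.app.run()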
Example #5
if __name__ == "__main__":
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with open(args.config_path, 'r') as f:
        config = json.load(f)
    model = BertAbsSum(args.bert_model, config['decoder_config'], device)
    model.load_state_dict(torch.load(args.model_path))
    model.to(device)

    processor = LCSTSProcessor()
    tokenizer = BertTokenizer.from_pretrained(os.path.join(args.bert_model, 'vocab.txt'))
    test_examples = processor.get_examples(args.eval_path)
    test_features = convert_examples_to_features(test_examples, args.max_src_len, args.max_tgt_len, tokenizer)
    test_data = create_dataset(test_features)
    test_sampler = RandomSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE, drop_last=True)
    logger.info('Loading complete. Writing results to %s' % (args.result_path))

    model.eval()
    f_log = open(os.path.join(args.result_path, 'log.txt'), 'w', encoding='utf-8')
    # f_hyp = open(os.path.join(args.result_path, 'hyp.txt'), 'w', encoding='utf-8')
    # f_ref = open(os.path.join(args.result_path, 'ref.txt'), 'w', encoding='utf-8')
    hyp_list = []
    ref_list = []
    for batch in tqdm(test_dataloader, desc="Iteration"):
        batch = tuple(t.to(device) for t in batch)
        pred, _ = model.beam_decode(batch[0], batch[1], 3, 3)
        src, tgt = batch[0], batch[2]
        for i in range(BATCH_SIZE):
Example #6
    example = train_examples[0]
    example_feature = train_features[0]
    logger.info("*** Example ***")
    logger.info("guid: %s" % (example.guid))
    logger.info("src text: %s" % example.src)
    logger.info("src_ids: %s" %
                " ".join([str(x) for x in example_feature.src_ids]))
    logger.info("src_mask: %s" %
                " ".join([str(x) for x in example_feature.src_mask]))
    logger.info("tgt text: %s" % example.tgt)
    logger.info("tgt_ids: %s" %
                " ".join([str(x) for x in example_feature.tgt_ids]))
    logger.info("tgt_mask: %s" %
                " ".join([str(x) for x in example_feature.tgt_mask]))
    logger.info('Building dataloader...')
    train_data = create_dataset(train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  drop_last=True)

    # eval data preprocess
    if not os.path.exists(os.path.join(args.data_dir, 'eval.csv')):
        logger.info(
            'No eval data found in data directory. Eval will not be performed.'
        )
        eval_dataloader = None
    else:
        logger.info('Loading eval dataset...')
        eval_examples = processor.get_examples(
Example #7
def restore_chkpt(checkpoint_path):
    ckpt = tf.train.Checkpoint(
                               Model=Model
                               )
    assert tf.train.latest_checkpoint(os.path.split(checkpoint_path)[0]), 'Incorrect checkpoint directory'
    ckpt.restore(checkpoint_path).expect_partial()
    print(f'{checkpoint_path} restored')

restore_chkpt(config.infer_ckpt_path)

test_dataset = create_dataset(
                             split='test', 
                             source_tokenizer=source_tokenizer, 
                             target_tokenizer=target_tokenizer, 
                             from_=0, 
                             to=100, 
                             batch_size=config.validation_batch_size,
                             drop_remainder=True
                             )
max_combined_metric = 0
decoder_type = 'topktopp'
temperatures = [1]
length_penalties = [0.8]
beams =  [12, 13, 14, 15]
top_ps = [1]
top_ks = [10]

for beam_size in beams:
    for length_penalty in length_penalties:
        for top_p in top_ps:
Example #8
def train(test_name,
          radius=1,
          dim=52,
          layer_hidden=4,
          layer_output=10,
          dropout=0.45,
          batch_train=8,
          batch_test=8,
          lr=3e-4,
          lr_decay=0.85,
          decay_interval=25,
          iteration=140,
          N=5000,
          dataset_train='../dataset/data_train.txt'):

    dataset_test = test_name

    (radius, dim, layer_hidden, layer_output, batch_train, batch_test,
     decay_interval, iteration) = map(int, [
         radius, dim, layer_hidden, layer_output, batch_train, batch_test,
         decay_interval, iteration
     ])

    # dropout stays a float; casting it to int would silently zero it out
    lr, lr_decay, dropout = map(float, [lr, lr_decay, dropout])
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('The code uses a GPU!')
    else:
        device = torch.device('cpu')
        print('The code uses a CPU...')


    print('-' * 100)
    print('Just a moment......')
    print('-' * 100)
    path = ''
    dataname = ''

    dataset_train = pp.create_dataset(dataset_train, path, dataname)
    dataset_test = pp.create_dataset(dataset_test, path, dataname)
    np.random.seed(0)
    np.random.shuffle(dataset_train)
    np.random.shuffle(dataset_test)

    print('The preprocess has finished!')
    print('# of training data samples:', len(dataset_train))
    # print('# of development data samples:', len(dataset_dev))
    print('# of test data samples:', len(dataset_test))
    print('-' * 100)

    print('Creating a model.')
    torch.manual_seed(0)
    model = MolecularGraphNeuralNetwork(N, dim, layer_hidden, layer_output,
                                        dropout).to(device)
    trainer = Trainer(model, lr, batch_train)
    tester = Tester(model, batch_test)
    print('# of model parameters:',
          sum([np.prod(p.size()) for p in model.parameters()]))
    print('-' * 100)
    file_result = path + 'AUC' + '.txt'
    #    file_result = '../output/result--' + setting + '.txt'
    result = 'Epoch\tTime(sec)\tLoss_train\tLoss_test\tAUC_train\tAUC_test'
    file_test_result = path + 'test_prediction' + '.txt'
    file_predictions = path + 'train_prediction' + '.txt'
    file_model = path + 'model' + '.h5'
    with open(file_result, 'w') as f:
        f.write(result + '\n')

    print('Start training.')
    print('The result is saved in the output directory every epoch!')

    np.random.seed(0)

    start = timeit.default_timer()

    for epoch in range(iteration):

        epoch += 1
        if epoch % decay_interval == 0:
            trainer.optimizer.param_groups[0]['lr'] *= lr_decay
        # ['amsgrad', 'params', 'lr', 'betas', 'weight_decay', 'eps']
        prediction_train, loss_train, train_res = trainer.train(dataset_train)
        prediction_test, loss_test, test_res = tester.test_classifier(
            dataset_test)

        time = timeit.default_timer() - start

        if epoch == 1:
            minutes = time * iteration / 60
            hours = int(minutes / 60)
            minutes = int(minutes - 60 * hours)
            print('The training will finish in about', hours, 'hours', minutes,
                  'minutes.')
            print('-' * 100)
            print(result)

        result = '\t'.join(
            map(str, [
                epoch, time, loss_train, loss_test, prediction_train,
                prediction_test
            ]))
        tester.save_result(result, file_result)
        tester.save_model(model, file_model)
        print(result)

    loss = pd.read_table(file_result)

    plt.plot(loss['AUC_train'], color='r', label='AUC of train set')
    plt.plot(loss['AUC_test'], color='b', label='AUC of test set')
    plt.ylabel('AUC')
    plt.xlabel('Epoch')
    plt.xlim(-1, 145)
    plt.ylim(0, 1)
    plt.legend()
    plt.savefig(path + 'loss.tif', dpi=300)
    plt.show()

    res_test = test_res.T
    res_train = train_res.T
    cn_matrix = confusion_matrix(res_train[:, 0], res_train[:, 1])

    tn1 = cn_matrix[0, 0]
    tp1 = cn_matrix[1, 1]
    fn1 = cn_matrix[1, 0]
    fp1 = cn_matrix[0, 1]

    bacc_train = ((tp1 / (tp1 + fn1)) + (tn1 / (tn1 + fp1))) / 2  # balanced accuracy
    pre_train = tp1 / (tp1 + fp1)  # precision / q+
    rec_train = tp1 / (tp1 + fn1)  # recall / sensitivity
    sp_train = tn1 / (tn1 + fp1)  # specificity
    q__train = tn1 / (tn1 + fn1)  # negative predictive value / q-
    f1_train = 2 * pre_train * rec_train / (pre_train + rec_train)  # F1 score
    mcc_train = ((tp1 * tn1) - (fp1 * fn1)) / math.sqrt(
        (tp1 + fp1) * (tp1 + fn1) * (tn1 + fp1) *
        (tn1 + fn1))  # Matthews correlation coefficient
    acc_train = (tp1 + tn1) / (tp1 + fp1 + fn1 + tn1)  # accuracy
    fpr_train, tpr_train, thresholds_train = roc_curve(res_train[:, 0],
                                                       res_train[:, 1])
    print('bacc_train:', bacc_train)
    print('pre_train:', pre_train)
    print('rec_train:', rec_train)
    print('f1_train:', f1_train)
    print('mcc_train:', mcc_train)
    print('sp_train:', sp_train)
    print('q__train:', q__train)
    print('acc_train:', acc_train)

    cnf_matrix = confusion_matrix(res_test[:, 0], res_test[:, 1])

    tn = cnf_matrix[0, 0]
    tp = cnf_matrix[1, 1]
    fn = cnf_matrix[1, 0]
    fp = cnf_matrix[0, 1]

    bacc = ((tp / (tp + fn)) + (tn / (tn + fp))) / 2  # balanced accuracy
    pre = tp / (tp + fp)  # precision / q+
    rec = tp / (tp + fn)  # recall / sensitivity
    sp = tn / (tn + fp)  # specificity
    q_ = tn / (tn + fn)  # negative predictive value / q-
    f1 = 2 * pre * rec / (pre + rec)  # F1 score
    mcc = ((tp * tn) - (fp * fn)) / math.sqrt(
        (tp + fp) * (tp + fn) * (tn + fp) *
        (tn + fn))  # Matthews correlation coefficient
    acc = (tp + tn) / (tp + fp + fn + tn)  # accuracy
    fpr, tpr, thresholds = roc_curve(res_test[:, 0], res_test[:, 1])
    print('bacc:', bacc)
    print('pre:', pre)
    print('rec:', rec)
    print('f1:', f1)
    print('mcc:', mcc)
    print('sp:', sp)
    print('q_:', q_)
    print('acc:', acc)
    print('auc:', prediction_test)
    return res_test
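# Hedged usage sketch for train() above; the test-set path is a hypothetical
# placeholder and the remaining hyperparameters keep their defaults.
res_test = train('../dataset/data_test.txt')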
import tensorflow as tf
tf.keras.backend.clear_session()
tf.random.set_seed(100)
import time
from tqdm import tqdm
from preprocess import create_dataset
from configuration import config, source_tokenizer, target_tokenizer
from calculate_metrics import mask_and_calculate_loss
from utilities import log
from model_training_helper import (check_ckpt, eval_step, train_step,
                                   batch_run_check, save_evaluate_monitor)

train_dataset = create_dataset(split='train',
                               source_tokenizer=source_tokenizer,
                               target_tokenizer=target_tokenizer,
                               from_=0,
                               to=100,
                               batch_size=config.train_batch_size,
                               shuffle=True)
val_dataset = create_dataset(split='validation',
                             source_tokenizer=source_tokenizer,
                             target_tokenizer=target_tokenizer,
                             from_=0,
                             to=100,
                             batch_size=config.validation_batch_size,
                             shuffle=True,
                             drop_remainder=True,
                             num_examples_to_select=config.samples_to_validate)

# if a checkpoint exists, restore the latest checkpoint.
ck_pt_mgr = check_ckpt(config.checkpoint_path)
import tensorflow as tf
tf.keras.backend.clear_session()
tf.random.set_seed(100)
import time
from tqdm import tqdm
from preprocess import create_dataset
from configuration import config, source_tokenizer, target_tokenizer
from calculate_metrics import mask_and_calculate_loss
from utilities import log
from model_training_helper import (check_ckpt, eval_step, train_step,
                                   batch_run_check, train_sanity_check)

train_dataset = create_dataset(split='train',
                               source_tokenizer=source_tokenizer,
                               target_tokenizer=target_tokenizer,
                               from_=0,
                               to=100,
                               batch_size=2,
                               shuffle=False)

# if a checkpoint exists, restore the latest checkpoint.
ck_pt_mgr = check_ckpt(config.checkpoint_path)
total_steps = int(config.epochs * (config.gradient_accumulation_steps))
train_dataset = train_dataset.repeat(total_steps)

for (step, (input_ids, target_ids)) in tqdm(enumerate(train_dataset, 1),
                                            initial=1):

    print(input_ids)
    if step == 100:
        break
Example #11
import sys
import GPUtil
from io import StringIO
from tqdm import tqdm
from preprocess import create_dataset
from configuration import config, source_tokenizer, target_tokenizer
from calculate_metrics import mask_and_calculate_loss, monitor_run
from utilities import log, detokenize
from create_model import Model
from model_training_helper import (check_ckpt, eval_step, train_step, batch_run_check, 
                          train_sanity_check, evaluate_validation_set)

unit_test_dataset = create_dataset(
                              split='train', 
                              source_tokenizer=source_tokenizer, 
                              target_tokenizer=target_tokenizer, 
                              from_=90, 
                              to=100, 
                              batch_size=config.unit_test_dataset_batch_size,
                              drop_remainder=True
                              )

def check_gpu_usage():

    old_stdout = sys.stdout
    sys.stdout = mystdout = StringIO()
    GPUtil.showUtilization()
    sys.stdout = old_stdout
    gpu_usage = mystdout.getvalue().strip().split('|')[-2].strip()

    return gpu_usage
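# Hedged usage sketch: check_gpu_usage() can be logged around any training or
# evaluation step on the unit_test_dataset above to track memory pressure.
log.info(f'GPU memory utilization is {check_gpu_usage()}')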




#data preprocess
print("Data preprocess ...")
##Scale
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(df['wti_price'].values.reshape(-1,1))
##train_test split
train = dataset[: math.floor(len(dataset) * 0.8), :]
val = dataset[math.floor(len(dataset) * 0.8): , :]
##transform data to be supervised learning
look_back = 1
trainX, trainY = p.create_dataset(train, look_back)
valX, valY = p.create_dataset(val, look_back)
##To dataset for Dataloader
train_dataset = d.PriceDataset(X=trainX, y=trainY)
val_dataset = d.PriceDataset(X=valX, y=valY)
## To batch of tensors
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True)
val_loader = DataLoader(dataset=val_dataset,
                        batch_size=batch_size,
                        shuffle=False)
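# Hedged training-loop sketch for the loaders above; the tiny linear model,
# MSE loss and Adam optimizer are illustrative assumptions, not the original
# script's network.
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(look_back, 1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        pred = model(X_batch.float()).view(-1)
        loss = criterion(pred, y_batch.float().view(-1))
        loss.backward()
        optimizer.step()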



# -*- coding: utf-8 -*-
"""
Created on Sat Oct 26 12:50:32 2019

@author: pravech3
"""
import tensorflow_datasets as tfds
from input_path import file_path
from preprocess import create_dataset

# def create_dataset(path, num_examples):
#     df = pd.read_csv(path)
#     df = df[:num_examples]
#     return (df['cisco_technical_team'].values, df['Actions_taken'].values)

doc, summ = create_dataset(file_path.csv_path, None)


def tokenizer(doc, summ):
    try:
        tokenizer_en = tfds.features.text.SubwordTextEncoder.load_from_file(
            file_path.subword_vocab_path)
    except:
        print(
            'creating the subword vocab. This may take some time depending on the training data size'
        )
        tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
            (d for d, _ in zip(doc, summ)), target_vocab_size=2**13)
        # save_to_file returns None, so keep the encoder object instead of
        # overwriting it with the return value
        tokenizer_en.save_to_file(file_path.subword_vocab_path)
    print("Subword Tokenizer created")
    return tokenizer_en
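# Hedged usage sketch: build (or load) the subword vocabulary with tokenizer()
# above and round-trip the first document through it.
tokenizer_en = tokenizer(doc, summ)
sample_ids = tokenizer_en.encode(doc[0])
print(tokenizer_en.decode(sample_ids))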
Example #14
 batch_train = 32
 batch_test = 32
 lr = 1e-4
 lr_decay = 0.85
 decay_interval = 10
 iteration = 200
 N = 5000
 path = '/data/'
 dataname = 'SMRT'
 if torch.cuda.is_available():
     device = torch.device('cuda')
     print('The code uses a GPU!')
 else:
     device = torch.device('cpu')
     print('The code uses a CPU...')
 dataset_train = pp.create_dataset('SMRT_train_set.txt', path, dataname)
 dataset_train, dataset_dev = split_dataset(dataset_train, 0.9)
 dataset_test = pp.create_dataset('SMRT_test_set.txt', path, dataname)
 lr, lr_decay = map(float, [lr, lr_decay])
 print('-' * 100)
 print('Preprocessing the', dataname, 'dataset.')
 print('Just a moment......')
 print('-' * 100)
 print('The preprocess has finished!')
 print('# of training data samples:', len(dataset_train))
Example #15
    return score_1


if __name__ == '__main__':
    (history_data, future_data, sample_ps, sample_vm, dim_to_be_optimized,
     history_begin, predict_begin, predict_end, flavor_num) = read_data()
    lse_model = linear_regression()
    predict = []
    actual = []
    for i in range(total_flavors):
        predict_list = []
        # history_data[i] = avg_filter(history_data[i])
        history_data[i] = get_pow(history_data[i], exponent)
        history_data[i] = batch_add(history_data[i], addition)

        x_train, y_train, x_last = create_dataset(history_data[i], 7, 1)
        x_train = gaussian_weighted(x_train)
        x_last = gaussian_weighted(x_last)
        lse_model.lse_fit(x_train, y_train)
        x_train.show()

        for j in range(predict_span):
            predict_val = lse_model.predict(x_last)
            predict_list.append(predict_val)
            predict_mat = matrix(1, 1, predict_val)
            x_last.col_append(predict_mat)
            x_last.col_deque()

        predict_list = batch_add(predict_list, -addition)
        predict_list = get_pow(predict_list, 1 / exponent)
Example #16
from sklearn.model_selection import train_test_split
from structure import structure
from preprocess import create_dataset
from model import Model

if __name__ == "__main__":
    print("Downloading files")
    structure()
    print("Creating dataset")
    X, y = create_dataset()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

    model = Model()
    model.fit(
        X_train, y_train,
        X_test, y_test,
        early=4,
        epoches=50
    )
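    # Hedged follow-up sketch: scoring the held-out split, assuming the custom
    # Model class exposes a predict() method (not shown in this snippet).
    predictions = model.predict(X_test)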
Example #17
    lr, lr_decay = map(float, [lr, lr_decay])
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('The code uses a GPU!')
    else:
        device = torch.device('cpu')
        print('The code uses a CPU...')
    print('-' * 100)

    #    print('Preprocessing the', dataset, 'dataset.')
    print('Just a moment......')
    print('-' * 100)
    path = 'E:/code/drug/drugnn/'
    dataname = ''

    dataset_train = pp.create_dataset('data_train.txt', path, dataname)
    dataset_test = pp.create_dataset('data_test.txt', path, dataname)

    #dataset_train, dataset_test = edit_dataset(dataset_drug, dataset_nondrug,'balance')
    #dataset_train, dataset_dev = split_dataset(dataset_train, 0.9)
    print('The preprocess has finished!')
    print('# of training data samples:', len(dataset_train))
    #print('# of development data samples:', len(dataset_dev))
    print('# of test data samples:', len(dataset_test))
    print('-' * 100)

    print('Creating a model.')
    torch.manual_seed(111)
    model = MolecularGraphNeuralNetwork(N, dim, layer_hidden,
                                        layer_output).to(device)
    trainer = Trainer(model)
Example #18
import tensorflow as tf
tf.keras.backend.clear_session()
tf.random.set_seed(100)
import time
from tqdm import tqdm
from preprocess import create_dataset
from configuration import config, source_tokenizer, target_tokenizer
from calculate_metrics import mask_and_calculate_loss
from utilities import log
from model_training_helper import (check_ckpt, evaluate_validation_set,
                                   training_results)

val_dataset = create_dataset(split='validation',
                             source_tokenizer=source_tokenizer,
                             target_tokenizer=target_tokenizer,
                             from_=0,
                             to=100,
                             batch_size=8,
                             shuffle=True,
                             drop_remainder=True)
count = 0
step = 1
for (i, o) in val_dataset:
    #print(f'input {tf.shape(i)}')
    #print(f'output {tf.shape(o)}')
    count += 1
print(f'Total records count is {count}')
#sys.exit()

#restore checkpoint
ck_pt_mgr = check_ckpt(config.checkpoint_path)
start_time = time.time()