Example #1
def test_gettopk(data_dir='data/memes/',
          dim_proj=256,
          dim_att=128,
          maxlen=None,
          batch_size=700,
          keep_ratio=1.,
          shuffle_data=False,
          learning_rate=0.0005,
          global_steps=50000,
          disp_freq=100,
          save_freq=300,
          test_freq=300,
          saveto_file='params.npz',
          weight_decay=0.0005,
          sigmasqr=1,
          tdim = 1.,
          reload_model=False,
          train=True):
    """
    Topo-LSTM model training.
    tdim: scale time down by how many times
    """
    options = locals().copy()
    saveto = data_dir + saveto_file
    tmsaveto = data_dir + 'timeparams.npz'

    # loads graph
    Gp, node_index, node_reverse_index = data_utils.load_graph_withtrack(data_dir)
    options['n_events'] = len(node_index)

    print options

    # creates and initializes shared variables.
    print 'Initializing variables...'
    params = init_params(options)
    print 'reusing saved model.'
    load_params(saveto, params)
    tparams = init_tparams(params)

    timeparams = init_timeparams(options)
    print 'reusing saved model.'
    load_params(tmsaveto, timeparams)
    timetparams = init_tparams(timeparams)

    # builds Topo-LSTM model
    print 'Building model...'
    model = tpgru_model.build_model(tparams, timetparams, options)

    print 'Loading test data...'
    test_examples = data_utils.load_examples_seq(data_dir,
                                             dataset='test',
                                             node_index=node_index,
                                             maxlen=maxlen,
                                             Gp=Gp)
    test_loader = data_utils.Loader(test_examples, options=options)
    print 'Loaded %d test examples' % len(test_examples)

    scores = evaluate_topk(model['f_prob'], test_loader, model['f_tprob'], options['tdim'], node_reverse_index, data_dir)
    print 'eval scores: ', scores
    pprint.pprint(scores)
Example #2
print("Shape of test_mask: {}".format(test_mask.shape))
print("Average train cascade size: {}".format(
    np.mean(np.sum(train_mask, axis=1))))
print("Average validation cascade size: {}".format(
    np.mean(np.sum(validation_mask, axis=1))))
print("Average test cascade size: {}".format(np.mean(np.sum(test_mask,
                                                            axis=1))))

print("***** Hyper Parameters *****")
print("Learning rate: {}".format(FLAGS.learning_rate))
print("Batch size: {}".format(FLAGS.batch_size))
print("Max steps: {}".format(FLAGS.max_steps))
print("Regularization scale: {}".format(FLAGS.regularization_scale))
print("hidden_dim: {}".format(FLAGS.hidden_dim))

train_batches = data_utils.Loader(train_examples, train_mask, FLAGS.batch_size)
print("Number of train batches: {}".format(len(train_batches)))

# Define placeholders
placeholders = {
    'contents': tf.placeholder(tf.float32, shape=(None, FLAGS.hidden_dim)),
    'sequences': tf.placeholder(tf.int32, shape=(None, FLAGS.max_steps + 1)),
    'seq_mask': tf.placeholder(tf.int32, shape=(None, FLAGS.max_steps)),
    'hit_at': tf.placeholder(tf.int32)
}
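
# A hedged sketch (not part of the original script) of how one batch could be
# fed into these placeholders; the exact arrays produced by data_utils.Loader
# and the k passed to 'hit_at' are assumptions here.
# feed_dict = {
#     placeholders['contents']: batch_contents,    # (batch, hidden_dim) node/content features
#     placeholders['sequences']: batch_sequences,  # (batch, max_steps + 1) padded cascade node ids
#     placeholders['seq_mask']: batch_mask,        # (batch, max_steps) 1 for real steps, 0 for padding
#     placeholders['hit_at']: 10,                  # k used for the hits@k metric
# }
# sess.run(train_op, feed_dict=feed_dict)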

# Create model
model = CascadeRNN(number_of_nodes,
                   FLAGS.hidden_dim,
                   FLAGS.max_steps,
                   nx.to_numpy_matrix(G).astype(np.float32),
Example #3
def train(data_dir='data/memes/',
          dim_proj=512,
          maxlen=30,
          batch_size=256,
          keep_ratio=1.,
          shuffle_data=True,
          learning_rate=0.001,
          global_steps=50000,
          disp_freq=100,
          save_freq=1000,
          test_freq=1000,
          saveto_file='params.npz',
          weight_decay=0.0005,
          reload_model=False,
          train=True):
    """
    Topo-LSTM model training.
    """
    options = locals().copy()
    saveto = data_dir + saveto_file

    # loads graph
    G, node_index = data_utils.load_graph(data_dir)
    print nx.info(G)
    options['n_words'] = len(node_index)

    print options

    # creates and initializes shared variables.
    print 'Initializing variables...'
    params = init_params(options)
    if reload_model:
        print 'reusing saved model.'
        load_params(saveto, params)
    tparams = init_tparams(params)

    # builds Topo-LSTM model
    print 'Building model...'
    model = tprnn_model.build_model(tparams, options)

    print 'Loading test data...'
    test_examples = data_utils.load_examples(data_dir,
                                             dataset='test',
                                             node_index=node_index,
                                             maxlen=maxlen,
                                             G=G)
    test_loader = data_utils.Loader(test_examples, options=options)
    print 'Loaded %d test examples' % len(test_examples)

    if train:
        # prepares training data.
        print 'Loading train data...'
        train_examples = data_utils.load_examples(
            data_dir,
            dataset='train',
            keep_ratio=options['keep_ratio'],
            node_index=node_index,
            maxlen=maxlen,
            G=G)
        train_loader = data_utils.Loader(train_examples, options=options)
        print 'Loaded %d training examples.' % len(train_examples)

        # compiles updates.
        optimizer = downhill.build(algo='adam',
                                   loss=model['cost'],
                                   params=tparams.values(),
                                   inputs=model['data'])

        updates = optimizer.get_updates(max_gradient_elem=5.,
                                        learning_rate=learning_rate)

        f_update = theano.function(model['data'],
                                   model['cost'],
                                   updates=list(updates))

        # training loop.
        start_time = timeit.default_timer()

        # downhill.minimize(
        #     loss=cost,
        #     algo='adam',
        #     train=train_loader,
        #     # inputs=input_list + [labels],
        #     # params=tparams.values(),
        #     # patience=0,
        #     max_gradient_clip=1,
        #     # max_gradient_norm=1,
        #     learning_rate=learning_rate,
        #     monitors=[('cost', cost)],
        #     monitor_gradients=False)

        n_examples = len(train_examples)
        batches_per_epoch = n_examples // options['batch_size'] + 1
        n_epochs = global_steps // batches_per_epoch + 1

        global_step = 0
        cost_history = []
        for _ in range(n_epochs):
            for _ in range(batches_per_epoch):
                cost = f_update(*train_loader())
                cost_history += [cost]

                if global_step % disp_freq == 0:
                    print 'global step %d, cost: %f' % (global_step, cost)

                # dump model parameters.
                if global_step % save_freq == 0:
                    params = unzip(tparams)
                    np.savez(saveto, **params)
                    pickle.dump(options, open('%s.pkl' % saveto, 'wb'), -1)

                # evaluate on test data.
                if global_step % test_freq == 0:
                    scores = evaluate(model['f_prob'], test_loader)
                    print 'eval scores: ', scores
                    end_time = timeit.default_timer()
                    print 'time used: %d seconds.' % (end_time - start_time)

                global_step += 1

    scores = evaluate(model['f_prob'], test_loader)
    pprint.pprint(scores)
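
# The evaluate() helper used above is not shown in this snippet. Below is a
# minimal, self-contained sketch of the kind of ranking metric it presumably
# computes (hits@k and MAP@k over the predicted next-node probabilities); this
# is an illustration, not the repository's actual implementation.
import numpy as np

def sketch_ranked_eval(prob_matrix, targets, k=10):
    """prob_matrix: (n_examples, n_nodes) predicted probabilities.
    targets: (n_examples,) index of the true next node."""
    order = (-prob_matrix).argsort(axis=1)[:, :k]   # top-k predictions per example
    hits, ap = 0.0, 0.0
    for topk, t in zip(order, targets):
        pos = np.where(topk == t)[0]
        if pos.size:
            hits += 1.0
            ap += 1.0 / (pos[0] + 1)                # reciprocal rank within the top-k
    n = len(targets)
    return {'hits@%d' % k: hits / n, 'map@%d' % k: ap / n}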
Example #4
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default='./data',
                        type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default='bert-base-uncased', type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--pre_training_path", default='./pre_training', type=str,
                        help="model pre training")

    parser.add_argument("--save_path", default='./output', type=str,
                        help="model save path")

    parser.add_argument("--ngpu", default=1, type=int,
                        help="use gpu number")

    parser.add_argument("--load_model", default=False, action='store_true',
                        help="model load")

    parser.add_argument("--save_model", default=False, action='store_true',
                        help="model save ")

    parser.add_argument("--load_path", default='./output', type=str,
                        help="model save path")

    parser.add_argument("--is_test", default='./output', type=str,
                        help="model save path")

    parser.add_argument("--task_name",
                        default='cloth',
                        type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='EXP/',
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=4,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--cache_size",
                        default=256,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--num_log_steps",
                        default=10,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
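
    # Example invocation (the script name below is hypothetical; only
    # --output_dir is required, the other flags show typical choices):
    #   python run_cloze.py --data_dir ./data --bert_model bert-base-uncased \
    #       --output_dir EXP/ --do_train --do_eval --train_batch_size 4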

    args = parser.parse_args()

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    suffix = time.strftime('%Y%m%d-%H%M%S')
    args.output_dir = os.path.join(args.output_dir, suffix)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)


    bert_list = []
    model_list = []
    for m in args.bert_model.split('+'):
        bert_list.append(m)
        model_list.append(chose_model_model(m, args))

    logging = get_logger(os.path.join(args.output_dir, 'log.txt'))

    data_file = []
    for m in bert_list:
        data_file.append({'train': 'train', 'valid': 'dev', 'test': 'test'})
        for key in data_file[-1].keys():
            data_file[-1][key] = data_file[-1][key] + '-' + m + '.pt'
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logging("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logging("device {} n_gpu {} distributed training {}".format(device, n_gpu, bool(args.local_rank != -1)))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    task_name = args.task_name.lower()

    num_train_steps = []
    train_data = []
    if args.do_train:
        for id, m in enumerate(bert_list):
            train_data.append(data_utils.Loader(args.data_dir, data_file[id]['train'], args.cache_size, args.train_batch_size,
                                           device))
            num_train_steps.append(int(
                train_data[-1].data_num / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs))

    # Prepare model
    # model = RobertaForCloze.from_pretrained("roberta-base",
    #           cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
    #              #proxies={ "socks":"127.0.0.1:1080",}
    #
    # tokenizer = chose_model_token(args.bert_model,args)
    # model.resize_token_embeddings(len(tokenizer))
    #  model = torch.load()
    if args.fp16:
        for id, model in enumerate(model_list):
            model_list[id].half()
    for id, model in enumerate(model_list):
        model_list[id].to(device)
    if args.local_rank != -1:
        for id, model in enumerate(model_list):
            model_list[id] = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        for id, model in enumerate(model_list):
            model_list[id] = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = []
    if args.fp16:
        # materialize as lists (not generators) so the optimizer groups below
        # can iterate each model's parameters more than once
        for model in model_list:
            param_optimizer.append([(n, param.clone().detach().to('cpu').float().requires_grad_())
                                    for n, param in model.named_parameters()])
    elif args.optimize_on_cpu:
        for model in model_list:
            param_optimizer.append([(n, param.clone().detach().to('cpu').requires_grad_())
                                    for n, param in model.named_parameters()])
    else:
        for model in model_list:
            param_optimizer.append(list(model.named_parameters()))
    no_decay = ['bias', 'gamma', 'beta']

    optimizer_grouped_parameters = []
    for p_o in param_optimizer:
        optimizer_grouped_parameters.append([
            {'params': [p for n, p in p_o if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in p_o if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ])


    global_step = 0

    if args.load_model:
        if args.ngpu > 1:
            for id, m in enumerate(bert_list):
                load_file = args.load_path + '/' + m + '_' + str(args.ngpu) + '.bin'
                print('        model is loading......   PATH:' + load_file)
                model_list[id] = torch.load(load_file)
        else:
            for id, m in enumerate(bert_list):
                load_file = args.load_path + '/' + m + '.bin'
                print('        model is loading......   PATH:' + load_file)
                model_list[id] = torch.load(load_file)

    if args.do_train:
        # import time

        for id, model in enumerate(model_list):
            start = time.time()
            logging("***** Running training *****")
            logging("  Batch size = {}".format(args.train_batch_size))
            logging("  Num steps = {}".format(num_train_steps[id]))

            model.train()
            loss_history = []
            acc_history = []

            t_total = num_train_steps[id]
            if args.local_rank != -1:
                t_total = t_total // torch.distributed.get_world_size()
            optimizer = BertAdam(optimizer_grouped_parameters[id],
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=t_total)

            for _ in range(int(args.num_train_epochs)):
                tr_loss = 0
                tr_acc = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                for inp, tgt in train_data[id].data_iter():
                    loss, acc = model(inp, tgt)
                    # print(loss)
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                        acc = acc.sum()
                    if args.fp16 and args.loss_scale != 1.0:
                        # rescale loss for fp16 training
                        # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                        loss = loss * args.loss_scale
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    loss.backward()
                    # print(loss.shape)
                    tr_loss += loss.item()

                    tr_acc += acc.item()
                    # print(tr_acc)
                    nb_tr_examples += inp[-1].sum()
                    nb_tr_steps += 1
                    if nb_tr_steps % args.gradient_accumulation_steps == 0:
                        if args.fp16 or args.optimize_on_cpu:
                            if args.fp16 and args.loss_scale != 1.0:
                                # scale down gradients for fp16 training
                                for param in model.parameters():
                                    if param.grad is not None:
                                        param.grad.data = param.grad.data / args.loss_scale
                            is_nan = set_optimizer_params_grad(param_optimizer[id], model.named_parameters(), test_nan=True)
                            if is_nan:
                                logging("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                                args.loss_scale = args.loss_scale / 2
                                model.zero_grad()
                                continue
                            optimizer.step()
                            copy_optimizer_params_to_model(model.named_parameters(), param_optimizer[id])
                        else:
                            optimizer.step()
                        model.zero_grad()
                        global_step += 1
                    if (global_step % args.num_log_steps == 0):
                        logging('step: {} | train loss: {} | train acc {}'.format(
                            global_step, tr_loss / nb_tr_examples, tr_acc / nb_tr_examples))

                        loss_history.append([global_step, tr_loss])
                        acc_history.append([global_step, tr_acc])

                        tr_loss = 0
                        tr_acc = 0
                        nb_tr_examples = 0

                save_history_path = "./Cord_Pic"
                end = time.time()
                print(end - start)
                # save as .npy (convert on the fly so the Python lists keep
                # accepting .append in the next epoch)
                np.save(save_history_path + '/' + bert_list[id] + '.loss_history.npy', np.array(loss_history))
                np.save(save_history_path + '/' + bert_list[id] + '.acc_history.npy', np.array(acc_history))

                # to read back later:
                # a = np.load('a.npy')
                # a = a.tolist()

                if args.save_model:
                    if args.ngpu > 1:
                        save_file = args.save_path + '/' + bert_list[id] + '_' + str(args.ngpu) + '.bin'
                    else:
                        save_file = args.save_path + '/' + bert_list[id] + '.bin'
                    print('        model is saving......   PATH:' + save_file)
                    torch.save(model, save_file)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logging("***** Running evaluation *****")
        logging("  Batch size = {}".format(args.eval_batch_size))
        valid_data = []
        for id, m in enumerate(bert_list):
            valid_data.append(data_utils.Loader(args.data_dir, data_file[id]['valid'], args.cache_size, args.eval_batch_size, device))
        # Run prediction for full data

        for id, model in enumerate(model_list):
            out = []
            for inp, tgt in valid_data[id].data_iter(shuffle=False):
                with torch.no_grad():
                    one_out = model(inp, tgt)
                    out.append(one_out)

            # collect per-example outputs into a (data_num, 4) tensor
            output = torch.zeros(valid_data[id].data_num, 4)
            for batch in range(int(valid_data[id].data_num / args.eval_batch_size)):
                output[batch * args.eval_batch_size : (batch + 1) * args.eval_batch_size] = out[batch]

            torch.save(output, bert_list[id] + '_out.pt')
Example #5
def train(data_dir='data/memes/',
          dim_proj=256,
          dim_att=128,
          maxlen=30,
          batch_size=256,
          keep_ratio=1.,
          shuffle_data=True,
          learning_rate=0.001,
          global_steps=50000,
          disp_freq=100,
          save_freq=100,
          test_freq=100,
          saveto_file='params.npz',
          tmsaveto_file='timeparams.npz',
          weight_decay=0.0005,
          sigmasqr=1,
          tdim=1.,
          reload_model=False,
          train=True):
    """
    Topo-LSTM model training.
    tdim: scale time down by how many times
    """
    options = locals().copy()
    #savedstep = '0'
    saveto = data_dir + saveto_file
    tmsaveto = data_dir + tmsaveto_file

    # loads graph
    Gp, node_index = data_utils.load_graph(data_dir)
    #print nx.info(G)
    options['n_events'] = len(node_index)

    print options

    # creates and initializes shared variables.
    print 'Initializing variables...'
    params = init_params(options)
    if reload_model:
        print 'reusing saved model.'
        load_params(saveto, params)
    tparams = init_tparams(params)

    timeparams = init_timeparams(options)
    if reload_model:
        print 'reusing saved model.'
        load_params(tmsaveto, timeparams)
    timetparams = init_tparams(timeparams)

    # builds Topo-LSTM model
    print 'Building model...'
    model = tpgru_model.build_model(tparams, timetparams, options)

    print 'Loading test data...'
    test_examples = data_utils.load_examples(data_dir,
                                             dataset='test',
                                             node_index=node_index,
                                             maxlen=maxlen,
                                             Gp=Gp)
    test_loader = data_utils.Loader(test_examples, options=options)
    print 'Loaded %d test examples' % len(test_examples)

    if train:
        # prepares training data.
        print 'Loading train data...'
        train_examples = data_utils.load_examples(
            data_dir,
            dataset='train',
            keep_ratio=options['keep_ratio'],
            node_index=node_index,
            maxlen=maxlen,
            Gp=Gp)
        train_loader = data_utils.Loader(train_examples, options=options)
        print 'Loaded %d training examples.' % len(train_examples)

        # compiles updates.
        optimizer = downhill.build(algo='adam',
                                   loss=model['cost'],
                                   params=tparams.values(),
                                   inputs=model['data'])

        updates = optimizer.get_updates(max_gradient_elem=5.,
                                        learning_rate=learning_rate)

        f_update = theano.function(model['data'],
                                   model['cost'],
                                   updates=list(updates))

        toptimizer = downhill.build(algo='adam',
                                    loss=model['timecost'],
                                    params=timetparams.values(),
                                    inputs=model['timedata'])

        tupdates = toptimizer.get_updates(max_gradient_elem=5.,
                                          learning_rate=0.005)

        f_t_update = theano.function(model['timedata'],
                                     model['timecost'],
                                     updates=list(tupdates))

        # training loop.
        start_time = timeit.default_timer()

        n_examples = len(train_examples)
        batches_per_epoch = n_examples // options['batch_size'] + 1
        n_epochs = global_steps // batches_per_epoch + 1

        global_step = 0
        #cost_history = []
        for _ in range(n_epochs):
            for _ in range(batches_per_epoch):
                batch_data = train_loader()
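                # Field layout of batch_data is inferred from the slicing below
                # (see data_utils.Loader for the authoritative order): f_update
                # gets every shared input up to the third-from-last entry plus
                # batch_data[-2] (presumably the next-activation labels), while
                # f_t_update additionally keeps batch_data[-3] as an input and
                # uses batch_data[-1] (presumably the time targets) as its label.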
                cost = f_update(*(batch_data[:-3] + (batch_data[-2], )))
                #cost_history += [cost]
                timecost = f_t_update(*(batch_data[:-2] + (batch_data[-1], )))

                if global_step % disp_freq == 0:
                    print 'global step %d, cost: %f' % (global_step, cost)
                    print 'timecost: %f' % (timecost)

                # dump model parameters.
                if global_step % save_freq == 0:
                    params = unzip(tparams)
                    np.savez(saveto, **params)
                    pickle.dump(options, open('%s.pkl' % saveto, 'wb'), -1)
                    timeparams = unzip(timetparams)
                    np.savez(tmsaveto, **timeparams)

                # evaluate on test data.
                if global_step % test_freq == 0:
                    scores = evaluate(model['f_prob'], test_loader,
                                      model['f_tprob'], options['tdim'])
                    print 'eval scores: ', scores
                    end_time = timeit.default_timer()
                    print 'time used: %d seconds.' % (end_time - start_time)

                global_step += 1

    scores = evaluate(model['f_prob'], test_loader, model['f_tprob'],
                      options['tdim'])
    pprint.pprint(scores)