예제 #1
0
def train(args):
    """Train the network and keep the best-scoring inference model.

    Builds a train program and a test program around the same ``Network``,
    runs ``args.num_scan_data`` passes over ``args.train_path`` and, every
    ``args.save_step`` global steps, evaluates recall on ``args.val_path``.
    When the ``1_in_10`` recall beats the best value seen so far (and the
    step is not 0) the inference model is written to ``args.save_path``.

    When the ``CE_MODE_X`` environment variable is set, program random seeds
    are fixed to 110 and ``kpis`` lines are printed after the last pass.

    Args:
        args: parsed options. Attributes read here: ``save_path``,
            ``vocab_size``, ``emb_size``, ``hidden_size``, ``loss_type``,
            ``learning_rate``, ``use_cuda``, ``batch_size``,
            ``word_emb_init``, ``train_path``, ``val_path``, ``max_len``,
            ``sample_pro``, ``num_scan_data``, ``save_step`` and
            ``print_step``.
    """
    # save_path may be None (the training loop then skips checkpointing), so
    # only touch the filesystem when a path was actually given.
    if args.save_path is not None and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    net = Network(args.vocab_size, args.emb_size, args.hidden_size)

    train_program = fluid.Program()
    train_startup = fluid.Program()
    if "CE_MODE_X" in os.environ:
        train_program.random_seed = 110
        train_startup.random_seed = 110
    with fluid.program_guard(train_program, train_startup):
        with fluid.unique_name.guard():
            logits, loss = net.network(args.loss_type)
            loss.persistable = True
            logits.persistable = True
            # gradient clipping
            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByValue(max=1.0, min=-1.0))

            optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
            optimizer.minimize(loss)
            print("begin memory optimization ...")
            fluid.memory_optimize(train_program)
            print("end memory optimization ...")

    test_program = fluid.Program()
    test_startup = fluid.Program()
    if "CE_MODE_X" in os.environ:
        test_program.random_seed = 110
        test_startup.random_seed = 110
    with fluid.program_guard(test_program, test_startup):
        with fluid.unique_name.guard():
            # From here on `logits` and `loss` refer to the variables of the
            # test program; later code only uses them by name or as the
            # inference target.
            # NOTE(review): this presumably relies on unique_name.guard()
            # producing the same variable names in both programs - confirm.
            logits, loss = net.network(args.loss_type)
            loss.persistable = True
            logits.persistable = True

    test_program = test_program.clone(for_test=True)
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("device count %d" % dev_count)
    print("theoretical memory usage: ")
    print(
        fluid.contrib.memory_usage(program=train_program,
                                   batch_size=args.batch_size))

    exe = fluid.Executor(place)
    exe.run(train_startup)
    exe.run(test_startup)

    train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=loss.name,
                                       main_program=train_program)

    test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                      main_program=test_program,
                                      share_vars_from=train_exe)

    if args.word_emb_init is not None:
        print("start loading word embedding init ...")
        # NOTE: pickle can execute arbitrary code while loading; only point
        # word_emb_init at files from a trusted source.
        # The context manager closes the file even if unpickling fails.
        with open(args.word_emb_init, 'rb') as emb_file:
            if six.PY2:
                raw_emb = pickle.load(emb_file)
            else:
                raw_emb = pickle.load(emb_file, encoding="bytes")
        word_emb = np.array(raw_emb).astype('float32')
        net.set_word_embedding(word_emb, place)
        print("finish init word embedding  ...")

    print("start loading data ...")

    def train_with_feed(batch_data):
        """
        Run one training step on a batch and return the fetched loss.
        """
        #to do get_feed_names
        feed_dict = dict(zip(net.get_feed_names(), batch_data))

        cost = train_exe.run(feed=feed_dict, fetch_list=[loss.name])
        return cost[0]

    def test_with_feed(batch_data):
        """
        Run the test program on a batch and return the fetched logits.
        """
        feed_dict = dict(zip(net.get_feed_names(), batch_data))

        score = test_exe.run(feed=feed_dict, fetch_list=[logits.name])
        return score[0]

    def evaluate():
        """
        Score the whole validation set and return the recall metrics.
        """
        val_batches = reader.batch_reader(args.val_path, args.batch_size,
                                          place, args.max_len, 1)
        scores = []
        labels = []
        for batch in val_batches:
            scores.extend(test_with_feed(batch))
            labels.extend([x[0] for x in batch[2]])

        # Materialize the pairs: on Python 3 zip() is a one-shot iterator
        # without len()/indexing, while on Python 2 it was already a list.
        return eva.evaluate_Recall(list(zip(scores, labels)))

    def save_exe(step, best_recall):
        """
        Evaluate, save the inference model if 1_in_10 recall improved, and
        return the (possibly updated) best recall.
        """
        recall_dict = evaluate()
        print('evaluation recall result:')
        print('1_in_2: %s\t1_in_10: %s\t2_in_10: %s\t5_in_10: %s' %
              (recall_dict['1_in_2'], recall_dict['1_in_10'],
               recall_dict['2_in_10'], recall_dict['5_in_10']))

        # Step 0 is evaluated for reference only; nothing is saved before
        # the first training step has run.
        if recall_dict['1_in_10'] > best_recall and step != 0:
            fluid.io.save_inference_model(args.save_path,
                                          net.get_feed_inference_names(),
                                          logits,
                                          exe,
                                          main_program=train_program)

            print("Save model at step %d ... " % step)
            print(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())))
            best_recall = recall_dict['1_in_10']
        return best_recall

    # train over different epochs
    global_step = 0
    best_recall = 0
    for epoch in six.moves.xrange(args.num_scan_data):
        train_batches = reader.batch_reader(args.train_path, args.batch_size,
                                            place, args.max_len,
                                            args.sample_pro)

        begin_time = time.time()
        sum_cost = 0
        # Number of steps accumulated in sum_cost. The sum is reset at every
        # epoch start while global_step keeps counting, so the reporting
        # window can be shorter than print_step.
        window_steps = 0
        ce_cost = 0
        for batch in train_batches:
            if (args.save_path is not None) and (global_step % args.save_step
                                                 == 0):
                best_recall = save_exe(global_step, best_recall)

            cost = train_with_feed(batch)
            global_step += 1
            batch_cost = cost.mean()
            sum_cost += batch_cost
            window_steps += 1
            # Loss of the most recent batch, reported as a kpi in CE mode.
            ce_cost = batch_cost

            if global_step % args.print_step == 0:
                print('training step %s avg loss %s' %
                      (global_step, sum_cost / window_steps))
                sum_cost = 0
                window_steps = 0

        pass_time_cost = time.time() - begin_time
        print("Pass {0}, pass_time_cost {1}".format(
            epoch, "%2.2f sec" % pass_time_cost))
        if "CE_MODE_X" in os.environ and epoch == args.num_scan_data - 1:
            card_num = get_cards()
            print("kpis\ttrain_duration_card%s\t%s" %
                  (card_num, pass_time_cost))
            print("kpis\ttrain_loss_card%s\t%s" % (card_num, ce_cost))
예제 #2
0
def finetune(args):
    """Finetune the network and keep the best-scoring inference model.

    Builds a train program and a test program around the same ``Network``,
    optionally initializes parameters from ``args.init_model``, then runs
    ``args.num_scan_data`` passes over ``args.train_path``. Every
    ``args.save_step`` global steps the model is evaluated on
    ``args.val_path`` with ``eva.evaluate_cor``; when the score beats the
    best value seen so far (and the step is not 0) the inference model is
    written to ``args.save_path``.

    The Adam learning rate starts at ``args.learning_rate`` and is decayed
    with a staircase exponential schedule (rate 0.9 every 400 steps).

    Args:
        args: parsed options. Attributes read here: ``save_path``,
            ``vocab_size``, ``emb_size``, ``hidden_size``, ``loss_type``,
            ``learning_rate``, ``use_cuda``, ``batch_size``, ``init_model``,
            ``train_path``, ``val_path``, ``max_len``, ``sample_pro``,
            ``num_scan_data``, ``save_step`` and ``print_step``.
    """
    # save_path may be None (the training loop then skips checkpointing), so
    # only touch the filesystem when a path was actually given.
    if args.save_path is not None and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    net = Network(args.vocab_size, args.emb_size, args.hidden_size)

    train_program = fluid.Program()
    train_startup = fluid.Program()
    if "CE_MODE_X" in os.environ:
        train_program.random_seed = 110
        train_startup.random_seed = 110
    with fluid.program_guard(train_program, train_startup):
        with fluid.unique_name.guard():
            logits, loss = net.network(args.loss_type)
            loss.persistable = True
            logits.persistable = True
            # gradient clipping
            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByValue(max=1.0, min=-1.0))

            optimizer = fluid.optimizer.Adam(
                learning_rate=fluid.layers.exponential_decay(
                    learning_rate=args.learning_rate,
                    decay_steps=400,
                    decay_rate=0.9,
                    staircase=True))
            optimizer.minimize(loss)
            print("begin memory optimization ...")
            fluid.memory_optimize(train_program)
            print("end memory optimization ...")

    test_program = fluid.Program()
    test_startup = fluid.Program()
    if "CE_MODE_X" in os.environ:
        test_program.random_seed = 110
        test_startup.random_seed = 110
    with fluid.program_guard(test_program, test_startup):
        with fluid.unique_name.guard():
            # From here on `logits` and `loss` refer to the variables of the
            # test program; later code only uses them by name or as the
            # inference target.
            # NOTE(review): this presumably relies on unique_name.guard()
            # producing the same variable names in both programs - confirm.
            logits, loss = net.network(args.loss_type)
            loss.persistable = True
            logits.persistable = True

    test_program = test_program.clone(for_test=True)
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("device count %d" % dev_count)
    print("theoretical memory usage: ")
    print(
        fluid.contrib.memory_usage(program=train_program,
                                   batch_size=args.batch_size))

    exe = fluid.Executor(place)
    exe.run(train_startup)
    exe.run(test_startup)

    train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=loss.name,
                                       main_program=train_program)

    test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                      main_program=test_program,
                                      share_vars_from=train_exe)

    # Runs after both startup programs, so loaded values replace the freshly
    # initialized ones.
    if args.init_model:
        init.init_pretraining_params(exe,
                                     args.init_model,
                                     main_program=train_startup)
        print('sccuess init %s' % args.init_model)

    print("start loading data ...")

    def train_with_feed(batch_data):
        """
        Run one training step on a batch and return the fetched loss.
        """
        #to do get_feed_names
        feed_dict = dict(zip(net.get_feed_names(), batch_data))

        cost = train_exe.run(feed=feed_dict, fetch_list=[loss.name])
        return cost[0]

    def test_with_feed(batch_data):
        """
        Run the test program on a batch and return the fetched logits.
        """
        feed_dict = dict(zip(net.get_feed_names(), batch_data))

        score = test_exe.run(feed=feed_dict, fetch_list=[logits.name])
        return score[0]

    def evaluate():
        """
        Score the whole validation set and return eva.evaluate_cor's result.
        """
        val_batches = reader.batch_reader(args.val_path, args.batch_size,
                                          place, args.max_len, 1)
        scores = []
        labels = []
        for batch in val_batches:
            scores.extend(test_with_feed(batch))
            labels.extend([x[0] for x in batch[2]])
        # Keep only the first element of every per-sample score row.
        scores = [x[0] for x in scores]
        return eva.evaluate_cor(scores, labels)

    def save_exe(step, best_cor):
        """
        Evaluate, save the inference model if the score improved, and return
        the (possibly updated) best score.
        """
        cor = evaluate()
        print('evaluation cor relevance %s' % cor)
        # Step 0 is evaluated for reference only; nothing is saved before
        # the first training step has run.
        if cor > best_cor and step != 0:
            fluid.io.save_inference_model(args.save_path,
                                          net.get_feed_inference_names(),
                                          logits,
                                          exe,
                                          main_program=train_program)
            print("Save model at step %d ... " % step)
            print(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())))
            best_cor = cor
        return best_cor

    # train over different epochs
    # NOTE(review): train_time and pre_index do not appear to be read
    # anywhere in this function - confirm and remove.
    global_step, train_time = 0, 0.0
    best_cor = 0.0
    pre_index = -1
    for epoch in six.moves.xrange(args.num_scan_data):
        train_batches = reader.batch_reader(args.train_path, args.batch_size,
                                            place, args.max_len,
                                            args.sample_pro)

        begin_time = time.time()
        sum_cost = 0
        # Number of steps accumulated in sum_cost. The sum is reset at every
        # epoch start while global_step keeps counting, so the reporting
        # window can be shorter than print_step.
        window_steps = 0
        for batch in train_batches:
            if (args.save_path is not None) and (global_step % args.save_step
                                                 == 0):
                best_cor = save_exe(global_step, best_cor)

            cost = train_with_feed(batch)
            global_step += 1
            sum_cost += cost.mean()
            window_steps += 1

            if global_step % args.print_step == 0:
                print('training step %s avg loss %s' %
                      (global_step, sum_cost / window_steps))
                sum_cost = 0
                window_steps = 0

        pass_time_cost = time.time() - begin_time
        train_time += pass_time_cost
        print("Pass {0}, pass_time_cost {1}".format(
            epoch, "%2.2f sec" % pass_time_cost))