Example #1
    def build_graph(self):
        tf.reset_default_graph()
        self.dataset = as_dataset(FLAGS.dataset)

        with tf.device(self.device_op(0)):
            with tf.variable_scope(tf.get_variable_scope()):
                self.global_step = tf.get_variable(
                    name='global_step',
                    dtype=tf.int32,
                    shape=[],
                    initializer=tf.constant_initializer(0),
                    trainable=False)
                self.learning_rate = tf.get_variable(
                    name='learning_rate',
                    dtype=tf.float32,
                    shape=[],
                    initializer=tf.constant_initializer(FLAGS.learning_rate),
                    trainable=False)
                # self.lr_decay_op = tf.assign(self.learning_rate, self.learning_rate * FLAGS.decay)
                self.opt = get_optimizer(FLAGS.optimizer, self.learning_rate)
                self.model = as_model(FLAGS.model,
                                      input_dim=self.dataset.num_features,
                                      num_fields=self.dataset.num_fields,
                                      **self.model_param)
                tf.get_variable_scope().reuse_variables()
                self.grads = self.opt.compute_gradients(self.model.loss)

        with tf.device(self.device_op(0, local=True)):
            if self.lazy_update > 1:
                # Accumulate gradients into non-trainable local buffers and
                # apply them to the variables only every `lazy_update` steps.
                local_grads = []
                accumulate_op = []
                reset_op = []
                self.local_grads = []
                for grad, v in self.grads:
                    zero_grad = tf.zeros_like(v)
                    local_grad = tf.Variable(
                        zero_grad,
                        dtype=tf.float32,
                        trainable=False,
                        name=v.name.split(':')[0] + '_local_grad',
                        collections=[tf.GraphKeys.LOCAL_VARIABLES])
                    self.local_grads.append(local_grad)
                    reset_grad = local_grad.assign(zero_grad)
                    if FLAGS.sparse_grad and isinstance(
                            grad, tf.IndexedSlices):
                        # scatter_sub(-grad) adds a sparse gradient into the
                        # buffer without densifying it.
                        accumulate_grad = local_grad.scatter_sub(-grad)
                    else:
                        accumulate_grad = local_grad.assign_add(grad)
                    local_grads.append((local_grad, v))
                    accumulate_op.append(accumulate_grad)
                    reset_op.append(reset_grad)
                self.update_op = self.opt.apply_gradients(
                    local_grads, global_step=self.global_step)
                self.accumulate_op = tf.group(*accumulate_op)
                self.reset_op = tf.group(*reset_op)
            else:
                self.train_op = self.opt.minimize(self.model.loss,
                                                  global_step=self.global_step)
            self.saver = tf.train.Saver()
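
The lazy-update path above only builds the ops; it does not show how they are driven. A minimal sketch of the training loop, assuming a `trainer` holding these ops plus a `sess` and a batch generator from the surrounding Trainer class (none of this loop is in the original source):

# Sketch only: `trainer`, `sess`, and `train_gen` are assumed to exist.
for step, (xs, ys) in enumerate(train_gen, start=1):
    feed = {trainer.model.inputs: xs, trainer.model.labels: ys}
    # accumulate this batch's gradients into the local buffers
    sess.run(trainer.accumulate_op, feed_dict=feed)
    if step % trainer.lazy_update == 0:
        sess.run(trainer.update_op)   # apply accumulated gradients once
        sess.run(trainer.reset_op)    # clear the buffers for the next window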
Example #2
    def build_graph_multi_gpu(self):
        tf.reset_default_graph()
        self.dataset = as_dataset(FLAGS.dataset)
        self.tower_grads = []
        self.models = []

        with tf.device(self.device_op(0)):
            with tf.variable_scope(tf.get_variable_scope()):
                self.global_step = tf.get_variable(name='global_step', dtype=tf.int32, shape=[],
                                                   initializer=tf.constant_initializer(0), trainable=False)
                self.learning_rate = tf.get_variable(name='learning_rate', dtype=tf.float32, shape=[],
                                                     initializer=tf.constant_initializer(FLAGS.learning_rate),
                                                     trainable=False)
                self.opt = get_optimizer(FLAGS.optimizer, self.learning_rate)
                for i in range(self.num_gpus):
                    with tf.device(self.device_op(i)):
                        print('Deploying gpu:%d ...' % i)
                        with tf.name_scope('tower_%d' % i):
                            model = as_model(FLAGS.model, input_dim=self.dataset.num_features,
                                             num_fields=self.dataset.num_fields,
                                             **self.model_param)
                            self.models.append(model)
                            tf.get_variable_scope().reuse_variables()
                            grads = self.opt.compute_gradients(model.loss)
                            self.tower_grads.append(grads)

        with tf.device(self.device_op(0, local=True)):
            print('###################################')
            average_grads = []
            if self.lazy_update > 1:
                local_grads = []
                accumulate_op = []
                reset_op = []
                self.local_grads = []
            for grad_and_vars in zip(*self.tower_grads):
                grads = []
                if FLAGS.sparse_grad and isinstance(grad_and_vars[0][0], tf.IndexedSlices):
                    grad = sparse_grads_mean(grad_and_vars)
                    grad_shape = grad.dense_shape
                else:
                    for g, _ in grad_and_vars:
                        expanded_g = tf.expand_dims(g, 0)
                        grads.append(expanded_g)
                    grad = tf.concat(axis=0, values=grads)
                    grad = tf.reduce_mean(grad, 0)
                    grad_shape = grad.shape
                v = grad_and_vars[0][1]
                grad_and_var = (grad, v)
                print(type(grad), grad_shape, type(v), v.shape)
                average_grads.append(grad_and_var)

                if self.lazy_update > 1:
                    zero_grad = tf.zeros_like(v)
                    local_grad = tf.Variable(zero_grad, dtype=tf.float32, trainable=False,
                                             name=v.name.split(':')[0] + '_local_grad',
                                             collections=[tf.GraphKeys.LOCAL_VARIABLES])
                    self.local_grads.append(local_grad)
                    reset_grad = local_grad.assign(zero_grad)
                    if FLAGS.sparse_grad and isinstance(grad, tf.IndexedSlices):
                        accumulate_grad = local_grad.scatter_sub(-grad)
                    else:
                        accumulate_grad = local_grad.assign_add(grad)
                    local_grads.append((local_grad, v))
                    accumulate_op.append(accumulate_grad)
                    reset_op.append(reset_grad)
            print('###################################')
            # TODO test this
            # self.grad_op = tf.group([(x[0].op, x[1].op) for x in average_grads])
            if self.lazy_update > 1:
                self.update_op = self.opt.apply_gradients(local_grads, global_step=self.global_step)
                # self.grad_op = tf.group(average_grads)
                # tf versions < 1.5 need the ops unpacked: tf.group(*inputs)
                self.accumulate_op = tf.group(*accumulate_op)
                self.reset_op = tf.group(*reset_op)
            else:
                self.train_op = self.opt.apply_gradients(average_grads, global_step=self.global_step)
            self.saver = tf.train.Saver()
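
`sparse_grads_mean` is called above but not shown. A plausible minimal implementation, assuming each tower yields a `tf.IndexedSlices` gradient for the same variable, is to concatenate the towers' (indices, values) pairs and scale by the tower count; `apply_gradients` accumulates duplicate indices, so no dedup pass is needed:

def sparse_grads_mean(grad_and_vars):
    # Hypothetical helper (not from the source): average IndexedSlices
    # gradients for one variable across towers without densifying them.
    grads = [g for g, _ in grad_and_vars]
    indices = tf.concat([g.indices for g in grads], axis=0)
    values = tf.concat([g.values for g in grads], axis=0) / len(grads)
    return tf.IndexedSlices(values=values, indices=indices,
                            dense_shape=grads[0].dense_shape)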
Example #3
import os
import sys
import __init__

sys.path.append(__init__.config['data_path'])  # add your data path here
from datasets import as_dataset
from tf_trainer import Trainer
from tf_models import AutoDeepFM
import tensorflow as tf
import traceback

seeds = [
    0x0123, 0x4567, 0x3210, 0x7654, 0x89AB, 0xCDEF, 0xBA98, 0xFEDC, 0x0123,
    0x4567, 0x3210, 0x7654, 0x89AB, 0xCDEF, 0xBA98, 0xFEDC
]
data_name = 'avazu'
dataset = as_dataset(data_name)
backend = 'tf'
batch_size = 2000

train_data_param = {
    'gen_type': 'train',
    'random_sample': True,
    'batch_size': batch_size,
    'split_fields': False,
    'on_disk': True,
    'squeeze_output': True,
}
test_data_param = {
    'gen_type': 'test',
    'random_sample': False,
    'batch_size': batch_size,
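
These parameter dicts are consumed by the dataset's batch generator (see Example #5, which calls `dataset.batch_generator` with exactly such a dict). A minimal sketch of the resulting loop:

# Sketch based on Example #5: batch_generator yields (features, labels)
# batches of `batch_size` rows according to the dict above.
train_gen = dataset.batch_generator(train_data_param)
for batch_xs, batch_ys in train_gen:
    pass  # feed the batch to the model's inputs/labels placeholders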
Example #4
import tensorflow as tf
from keras import backend as K  # assumed import: K.set_session below is the Keras backend API
from datasets import as_dataset

class Config:

    #
    #   general config
    #
    epoch_display_periods = 10  # log progress every this many epochs
    summaries_dir = "./summaries"  # tensorboard writer target directory
    model_dir = "checkpoints"  # save model in this directory
    save_periods = 100  # checkpoint save interval
    sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    sess_config.gpu_options.allow_growth = True
    keras_sess = tf.Session(config=sess_config)
    K.set_session(keras_sess)


    #
    #   environment config
    #
    environment_combination_len = 3
    environment_combinations_num = 10

    #
    #   actor config
    #
    lr = 0.001  # learning rate
    gamma = 0.5  # the discount factor in G
    value_scale = 0.5  # the weight of value function approximation in total loss
    reinforce_batch_size = 100  # batch size used in Reinforce algorithm
    gradient_clip = 40  # gradient clip, to avoid overly large gradients

    #
    #   encoder config
    #
    encoder_dim = 64

    #
    #   reinforce config
    #
    reinforce_logdir = "./summaries/reinforce_logdir"
    reinforce_learning_rate = 0.001


    #
    #   evaluator configs
    #
    evaluator_model_name = "lr"  #  'pin', 'lr'
    evaluator_optimizer_name = 'adam'
    evaluator_learning_rate = 0.03
    evaluator_epsilon = 1e-4
    evaluator_max_rounds = 2000
    evaluator_early_stop = 8
    evaluator_embedding_size = 20
    evaluator_log_step_frequency = 0
    evaluator_eval_round_frequency = 1
    evaluator_train_logdir = "./summaries/evaluator_train"
    evaluator_valid_logdir = "./summaries/evaluator_valid"
    evaluator_graph_logdir = "./summaries/evaluator_graph"


    #
    #   dataset
    #
    data_name = "Couple"
    dataset = as_dataset(data_name, True)
    dataset.load_data(gen_type='train')
    dataset.load_data(gen_type='test')
    dataset.summary()
    num_fields = dataset.num_fields
    feat_sizes = dataset.feat_sizes
    feat_min = dataset.feat_min
    target_combination_num = 30
    target_combination_len = 4
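
All settings here are class attributes, so `Config` acts as a plain namespace and is read without being instantiated. A usage sketch (not from the source):

# Sketch: consumers read Config directly and can reuse the shared
# session config built above.
print('evaluator: %s, lr=%g' % (Config.evaluator_model_name,
                                Config.evaluator_learning_rate))
sess = tf.Session(config=Config.sess_config)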
Example #5
    def __init__(self):
        # parse params
        self.config = {}
        self.logdir, self.logfile = get_logdir(FLAGS=FLAGS)
        self.ckpt_dir = os.path.join(self.logdir, 'checkpoints')
        self.ckpt_name = 'model.ckpt'
        self.worker_dir = ''
        self.sub_file = os.path.join(self.logdir, 'submission.%d.csv')
        redirect_stdout(self.logfile)
        self.train_data_param = {
            'gen_type': 'train',
            'random_sample': True,
            'batch_size': FLAGS.batch_size,
            'squeeze_output': False,
            'val_ratio': FLAGS.val_ratio,
        }
        self.valid_data_param = {
            'gen_type': 'valid' if FLAGS.val else 'test',
            'random_sample': False,
            'batch_size': FLAGS.test_batch_size,
            'squeeze_output': False,
            'val_ratio': FLAGS.val_ratio,
        }
        self.test_data_param = {
            'gen_type': 'test',
            'random_sample': False,
            'batch_size': FLAGS.test_batch_size,
            'squeeze_output': False,
        }
        self.train_logdir = os.path.join(self.logdir, 'train', self.worker_dir)
        self.valid_logdir = os.path.join(self.logdir, 'valid', self.worker_dir)
        self.test_logdir = os.path.join(self.logdir, 'test', self.worker_dir)
        gpu_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False,
                                    gpu_options={'allow_growth': True})

        self.model_param = {
            'l2_embed': FLAGS.l2_embed,
            'input_norm': FLAGS.input_norm,
            'init_sparse': FLAGS.init_sparse,
            'init_fused': FLAGS.init_fused,
            'loss_mode': FLAGS.loss_mode
        }
        if FLAGS.model != 'lr':
            self.model_param['embed_size'] = FLAGS.embed_size
        if FLAGS.model == 'kfm':
            self.model_param['unit_kernel'] = FLAGS.unit_kernel
            self.model_param['fix_kernel'] = FLAGS.fix_kernel
            self.model_param['l2_kernel'] = FLAGS.l2_kernel
            self.model_param['kernel_type'] = FLAGS.kernel_type
        self.dump_config()

        # create graph
        tf.reset_default_graph()
        # load dataset
        self.dataset = as_dataset(FLAGS.dataset)

        # build model
        with tf.device('/gpu:0'):
            with tf.variable_scope(tf.get_variable_scope()):
                self.global_step = tf.get_variable(
                    name='global_step',
                    dtype=tf.int32,
                    shape=[],
                    initializer=tf.constant_initializer(1),
                    trainable=False)
                self.learning_rate = tf.get_variable(
                    name='learning_rate',
                    dtype=tf.float32,
                    shape=[],
                    initializer=tf.constant_initializer(FLAGS.learning_rate),
                    trainable=False)
                self.opt = get_optimizer(FLAGS.optimizer, self.learning_rate)
                self.model = as_model(FLAGS.model,
                                      input_dim=self.dataset.num_features,
                                      num_fields=self.dataset.num_fields,
                                      **self.model_param)
                tf.get_variable_scope().reuse_variables()

            self.train_op = self.opt.minimize(self.model.loss,
                                              global_step=self.global_step)
            self.saver = tf.train.Saver()

        def sess_op():
            return tf.Session(config=gpu_config)

        train_size = int(self.dataset.train_size * (1 - FLAGS.val_ratio))
        self.num_steps = int(np.ceil(train_size / FLAGS.batch_size))
        self.eval_steps = self.num_steps

        # start train
        with sess_op() as self.sess:
            print('Train size = %d, Batch size = %d' %
                  (self.dataset.train_size, FLAGS.batch_size))
            print(
                '%d rounds in total, One round = %d steps, One evaluation = %d steps'
                % (FLAGS.num_rounds, self.num_steps, self.eval_steps))

            # data generator
            self.train_gen = self.dataset.batch_generator(
                self.train_data_param)
            self.valid_gen = self.dataset.batch_generator(
                self.valid_data_param)
            self.test_gen = self.dataset.batch_generator(self.test_data_param)
            # log writer
            self.train_writer = tf.summary.FileWriter(logdir=self.train_logdir,
                                                      graph=self.sess.graph,
                                                      flush_secs=30)
            self.test_writer = tf.summary.FileWriter(logdir=self.test_logdir,
                                                     graph=self.sess.graph,
                                                     flush_secs=30)
            self.valid_writer = tf.summary.FileWriter(logdir=self.valid_logdir,
                                                      graph=self.sess.graph,
                                                      flush_secs=30)

            # init model
            if not FLAGS.restore:
                self.sess.run(tf.global_variables_initializer())
            else:
                checkpoint_state = tf.train.get_checkpoint_state(self.ckpt_dir)
                if checkpoint_state and checkpoint_state.model_checkpoint_path:
                    self.saver.restore(self.sess,
                                       checkpoint_state.model_checkpoint_path)
                    print('Restore model from:',
                          checkpoint_state.model_checkpoint_path)
                    print('Run initial evaluation...')
                    self.evaluate(self.test_gen, self.test_writer)
                else:
                    print('Restore failed')

            # init check
            print('Initial evaluation')
            cnt = 0
            for xs, ys in self.test_gen:
                feed_dict = {self.model.inputs: xs, self.model.labels: ys}
                if self.model.training is not None:
                    feed_dict[self.model.training] = False
                self.sess.run(fetches=self.model.preds, feed_dict=feed_dict)
                cnt += 1
                if cnt == 100:
                    break

            self.begin_step = self.global_step.eval(self.sess)
            self.step = self.begin_step
            self.start_time = time.time()

            for r in range(1, FLAGS.num_rounds + 1):
                print('Round: %d' % r)
                for batch_xs, batch_ys in self.train_gen:
                    fetches = [self.train_op, self.global_step]
                    train_feed = {}
                    fetches += [
                        self.model.loss, self.model.log_loss,
                        self.model.l2_loss
                    ]
                    train_feed[self.model.inputs] = batch_xs
                    train_feed[self.model.labels] = batch_ys
                    if self.model.training is not None:
                        train_feed[self.model.training] = True

                    _, self.step, _loss_, _log_loss_, _l2_loss_ = self.sess.run(
                        fetches=fetches, feed_dict=train_feed)

                    if self.step % FLAGS.log_frequency == 0:
                        elapsed_time = self.get_elapsed()
                        print(
                            'Done step %d, Elapsed: %.2fs, Train-Loss: %.4f, Log-Loss: %.4f, L2-Loss: %g'
                            % (self.step, elapsed_time, _loss_, _log_loss_,
                               _l2_loss_))
                        summary = tf.Summary(value=[
                            tf.Summary.Value(tag='loss', simple_value=_loss_),
                            tf.Summary.Value(tag='log_loss',
                                             simple_value=_log_loss_),
                            tf.Summary.Value(tag='l2_loss',
                                             simple_value=_l2_loss_)
                        ])
                        self.train_writer.add_summary(summary,
                                                      global_step=self.step)

                self.saver.save(
                    self.sess,
                    os.path.join(self.logdir, 'checkpoints', 'model.ckpt'),
                    self.step)
                print('Round %d finished, Elapsed: %s' %
                      (r, self.get_timedelta()))
                self.evaluate(self.test_gen, submission=r)
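
The learning rate is stored as a non-trainable variable here as well, so the decay op commented out in Example #1 applies directly. A sketch of wiring it in, assuming a `FLAGS.decay` factor is defined:

# Sketch: build once next to the other ops ...
lr_decay_op = tf.assign(self.learning_rate, self.learning_rate * FLAGS.decay)
# ... then run it between rounds, e.g. after each evaluation:
self.sess.run(lr_decay_op)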
Example #6
import time
import os
import sys
import __init__
sys.path.append(__init__.config['data_path'])  # add your data path here
from datasets import as_dataset
from tf_trainer import Trainer
from tf_models import AutoFM
import tensorflow as tf
import traceback
import random
seeds = [
    0x0123, 0x4567, 0x3210, 0x7654, 0x89AB, 0xCDEF, 0xBA98, 0xFEDC, 0x0123,
    0x4567, 0x3210, 0x7654, 0x89AB, 0xCDEF, 0xBA98, 0xFEDC
]
data_name = 'avazu'
dataset = as_dataset(
    data_name)  # uses https://github.com/Atomu2014/Ads-RecSys-Datasets
backend = 'tf'
batch_size = 2000

train_data_param = {
    'gen_type': 'train',
    'random_sample': True,
    'batch_size': batch_size,
    'split_fields': False,
    'on_disk': True,
    'squeeze_output': True,
}
test_data_param = {
    'gen_type': 'test',
    'random_sample': False,
    'batch_size': batch_size,
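
The snippet is truncated here; the `seeds` list and the `random` import above suggest per-run seeding. A minimal sketch of how one seed would typically be applied for reproducibility (assumed, not shown in the source):

# Sketch only: apply one seed per run.
seed = seeds[0]
random.seed(seed)
tf.set_random_seed(seed)  # TF1 graph-level seed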