Example #1
def main(_):
    data_dir = FLAGS.train_path
    data_files = []
    for i in range(FLAGS.train_parts):
        data_files.append(data_dir + 'part-r-{:0>5}'.format(i))

    train_files = data_files[:-FLAGS.eval_parts]
    eval_files = data_files[-FLAGS.eval_parts:]

    linear_feature_columns, embedding_feature_columns = build_feature_columns(
        FLAGS.embedding_size)

    distribute_strategy = None
    if FLAGS.mirror:
        distribute_strategy = tf.distribute.MirroredStrategy()
    config = estimator.RunConfig(
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=5,
        log_step_count_steps=FLAGS.log_steps,
        save_summary_steps=200,
        train_distribute=distribute_strategy,
        eval_distribute=distribute_strategy)

    model_params = {
        'linear_feature_columns': linear_feature_columns,
        'embedding_feature_columns': embedding_feature_columns,
        'embedding_size': FLAGS.embedding_size,
        "learning_rate": FLAGS.learning_rate,
        "dropout": FLAGS.dropout,
        "deep_layers": FLAGS.deep_layers
    }

    dcn = estimator.Estimator(model_fn=model_fn,
                              model_dir='./model/',
                              params=model_params,
                              config=config)

    if FLAGS.task_type == 'train':
        train_spec = estimator.TrainSpec(
            input_fn=lambda: input_fn(train_files,
                                      num_epochs=FLAGS.num_epochs,
                                      batch_size=FLAGS.batch_size,
                                      need_shuffle=True))
        eval_spec = estimator.EvalSpec(
            input_fn=lambda: input_fn(eval_files,
                                      num_epochs=-1,
                                      batch_size=FLAGS.batch_size,
                                      need_shuffle=True),
            steps=200,
            start_delay_secs=1,
            throttle_secs=5)
        estimator.train_and_evaluate(dcn, train_spec, eval_spec)
    elif FLAGS.task_type == 'eval':
        dcn.evaluate(input_fn=lambda: input_fn(
            eval_files, num_epochs=1, batch_size=FLAGS.batch_size),
                     steps=200)
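
All of these Estimator examples delegate data loading to an input_fn that is not shown. A minimal sketch with the signature used above, assuming TFRecord part files and an illustrative (feat_idx, feat_val, label) layout (FIELD_SIZE and the feature names are assumptions, not taken from the example):

def input_fn(files, num_epochs, batch_size, need_shuffle=False):
    FIELD_SIZE = 39  # illustrative constant; the real field count is not shown
    feature_spec = {
        'feat_idx': tf.io.FixedLenFeature([FIELD_SIZE], tf.int64),    # assumed
        'feat_val': tf.io.FixedLenFeature([FIELD_SIZE], tf.float32),  # assumed
        'label': tf.io.FixedLenFeature([1], tf.float32),              # assumed
    }

    def parse(serialized):
        # Parse one serialized tf.train.Example and split off the label.
        parsed = tf.io.parse_single_example(serialized, feature_spec)
        label = parsed.pop('label')
        return parsed, label

    dataset = tf.data.TFRecordDataset(files)
    if need_shuffle:
        dataset = dataset.shuffle(buffer_size=10000)
    dataset = dataset.repeat(num_epochs)  # num_epochs=-1 repeats indefinitely
    dataset = dataset.map(parse, num_parallel_calls=4)
    return dataset.batch(batch_size).prefetch(1)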
Example #2
def main(_):
    print(
        "==================== 1.Check Args and Initialize Distributed Env...")
    if FLAGS.model_dir == "":  # checkpoint directory for the model
        FLAGS.model_dir = (date.today() + timedelta(-1)
                           ).strftime("%Y%m") + "_ckt_" + FLAGS.algorithm
    if FLAGS.serve_dir == "":  # export directory for the SavedModel (pb)
        FLAGS.serve_dir = (date.today() + timedelta(-1)
                           ).strftime("%Y%m") + "_exp_" + FLAGS.algorithm
    if FLAGS.input_dir == "":  # default for testing on Windows
        FLAGS.input_dir = os.getcwd() + "\\feat1\\CV\\"

    train_files = glob.glob("%s/train.set" % FLAGS.input_dir)  # train files under input_dir
    valid_files = glob.glob("%s/valid.set" % FLAGS.input_dir)  # valid files under input_dir
    tests_files = glob.glob("%s/tests.set" % FLAGS.input_dir)  # tests files under input_dir
    infer_files = glob.glob("%s/infer.set" % FLAGS.input_dir)  # infer files under input_dir
    random.shuffle(train_files)  # shuffle the train files

    if FLAGS.clear_mod == "True" and FLAGS.task_mode == "train":  # remove any existing model files
        try:
            shutil.rmtree(FLAGS.model_dir)  # recursively delete the directory and its contents
        except Exception as e:
            print(e, "At clear_existed_model")
        else:
            print("Existing model cleared at %s folder" % FLAGS.model_dir)

    print("==================== 2.Set model params and Build CTR model...")
    model_params = {
        "algorithm": FLAGS.algorithm,
        "feature_size": FLAGS.feature_size,
        "field_size": FLAGS.field_size,
        "embed_size": FLAGS.embed_size,
        "loss_mode": FLAGS.loss_mode,
        "optimizer": FLAGS.optimizer,
        "learning_rate": FLAGS.learning_rate,
        "l2_reg_lambda": FLAGS.l2_reg_lambda,
        "deep_layers": FLAGS.deep_layers,
        "cross_layers": FLAGS.cross_layers,
        "dropout": FLAGS.dropout
    }
    model_fns = {
        "LR": lr, "FM": fm, "DC": deepcrossing,
        "FNN": fpnn, "IPNN": fpnn, "OPNN": fpnn,
        "WD": wd, "DeepFM": deepfm, "DCN": dcn, "NFM": nfm
    }
    model_fn = model_fns.get(FLAGS.algorithm)
    if model_fn is None:
        print("Invalid algorithm, not supported!")

    epoch_step = int(FLAGS.train_size /
                     FLAGS.batch_size)  # steps per epoch
    train_step = epoch_step * FLAGS.num_epochs  # total training steps
    session_config = tf.ConfigProto(device_count={
        "GPU": 1,
        "CPU": FLAGS.num_thread
    })
    config = estimator.RunConfig(session_config=session_config,
                                 save_checkpoints_steps=epoch_step * 2,
                                 save_summary_steps=FLAGS.log_steps,
                                 log_step_count_steps=FLAGS.log_steps)
    ctr = estimator.Estimator(model_fn=model_fn,
                              model_dir=FLAGS.model_dir,
                              params=model_params,
                              config=config)

    print("==================== 3.Apply CTR model to diff tasks...")
    if FLAGS.task_mode == "train":
        train_spec = estimator.TrainSpec(input_fn=lambda: input_fn(
            train_files, FLAGS.batch_size, FLAGS.num_epochs, True),
                                         max_steps=train_step)
        eval_spec = estimator.EvalSpec(
            input_fn=lambda: input_fn(valid_files, FLAGS.batch_size, 1, False),
            steps=None,
            start_delay_secs=50,
            throttle_secs=15)
        estimator.train_and_evaluate(ctr, train_spec, eval_spec)
    elif FLAGS.task_mode == "eval":
        ctr.evaluate(
            input_fn=lambda: input_fn(train_files, FLAGS.batch_size, 1, False))
    elif FLAGS.task_mode == "infer":
        preds = ctr.predict(
            input_fn=lambda: input_fn(infer_files, FLAGS.batch_size, 1, False),
            predict_keys="prob")
        with open(FLAGS.input_dir + "/infer.csv", "w") as fo:
            for prob in preds:
                fo.write("%f\n" % (prob['prob']))
    elif FLAGS.task_mode == "export":
        feature_spec = {
            "feat_idx":
            tf.placeholder(dtype=tf.int64,
                           shape=[None, FLAGS.field_size],
                           name="feat_idx"),
            "feat_val":
            tf.placeholder(dtype=tf.float32,
                           shape=[None, FLAGS.field_size],
                           name="feat_val")
        }
        serving_input_receiver_fn = estimator.export.build_raw_serving_input_receiver_fn(
            feature_spec)
        ctr.export_savedmodel(FLAGS.serve_dir, serving_input_receiver_fn)
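
The model_fn implementations dispatched above (lr, fm, dcn, ...) are defined elsewhere. What the Estimator API requires of each is an EstimatorSpec per mode; a minimal sketch of the lr case, assuming the same (feat_idx, feat_val) encoding and emitting the "prob" key that the infer branch reads:

def lr(features, labels, mode, params):
    # Sparse logistic regression over the assumed (feat_idx, feat_val) encoding.
    w = tf.get_variable("w", [params["feature_size"]],
                        initializer=tf.glorot_uniform_initializer())
    b = tf.get_variable("b", [1], initializer=tf.zeros_initializer())
    logits = tf.reduce_sum(
        tf.nn.embedding_lookup(w, features["feat_idx"]) * features["feat_val"],
        axis=1) + b
    predictions = {"prob": tf.sigmoid(logits)}
    if mode == estimator.ModeKeys.PREDICT:
        return estimator.EstimatorSpec(mode, predictions=predictions)

    labels = tf.reshape(labels, [-1])
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))
    if mode == estimator.ModeKeys.EVAL:
        metrics = {"auc": tf.metrics.auc(labels, predictions["prob"])}
        return estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    train_op = tf.train.AdamOptimizer(params["learning_rate"]).minimize(
        loss, global_step=tf.train.get_global_step())
    return estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)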
Example #3
# ProfilerHook construction (arguments assumed, mirroring the commented-out
# call in Example #5)
hook = estimator.ProfilerHook(save_steps=10,
                              output_dir='./time/',
                              show_memory=True,
                              show_dataflow=True)
feature_columns = [
    tf.feature_column.numeric_column(key=key) for key in train_x.keys()
]

test = tf.feature_column.numeric_column(list(train_x.keys())[0], default_value=0.0)
test = tf.feature_column.bucketized_column(test, [0.1, 1, 100])
test_emb = tf.feature_column.embedding_column(test, 10)
feature_columns.append(test_emb)

session_config = tf.ConfigProto()

mirrored_strategy = tf.distribute.MirroredStrategy()

config = estimator.RunConfig(
    train_distribute=mirrored_strategy,
    eval_distribute=mirrored_strategy,
    session_config=session_config,
)

classifier = estimator.DNNClassifier(feature_columns=feature_columns,
                                     hidden_units=[10, 10],
                                     n_classes=3,
                                     config=config)

classifier.train(
    input_fn=lambda: train_input_fn(train_x, train_y, batch_size=BATCH_SIZE),
    hooks=[hook],
    steps=STEPS)

# evaluate
eval_result = classifier.evaluate(
    input_fn=lambda: eval_input_fn(test_x, test_y, batch_size=BATCH_SIZE))
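
train_input_fn and eval_input_fn are not defined in this snippet. A minimal sketch in the style of the TensorFlow premade-estimator tutorial, assuming train_x/train_y are a pandas DataFrame and Series:

def train_input_fn(features, labels, batch_size):
    # dict(features) maps each column name to its pandas Series.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    return dataset.shuffle(1000).repeat().batch(batch_size)

def eval_input_fn(features, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    return dataset.batch(batch_size)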
Example #4
def main(_):
    # tf.enable_eager_execution()
    # dataset = input_fn(['train2'], 1)
    # for raw_record in dataset.take(1):
    # 	print(raw_record[1])

    data_dir = FLAGS.train_path
    data_files = []
    for i in range(FLAGS.train_parts):
        data_files.append(data_dir + 'part-r-{:0>5}'.format(i))

    train_files = data_files[:-FLAGS.eval_parts]
    eval_files = data_files[-FLAGS.eval_parts:]

    train_files = ['train2']  # local debug override of the file lists built above
    eval_files = ['valid2']

    test_files = []
    for i in range(FLAGS.test_parts):
        test_files.append(FLAGS.test_path + 'part-r-{:0>5}'.format(i))

    distribute_strategy = None
    if FLAGS.mirror:
        distribute_strategy = tf.distribute.MirroredStrategy()

    config = estimator.RunConfig(
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=5,
        log_step_count_steps=FLAGS.log_steps,
        save_summary_steps=200,
        train_distribute=distribute_strategy,
        eval_distribute=distribute_strategy)

    model_params = {
        'pkg_count': FLAGS.pkg_count,
        'pkgc_count': FLAGS.pkgc_count,
        'ssid_count': FLAGS.ssid_count,
        'oper_count': FLAGS.oper_count,
        'embedding_size': FLAGS.embedding_size,
        "learning_rate": FLAGS.learning_rate,
        "dropout": FLAGS.dropout,
        "deep_layers": FLAGS.deep_layers,
    }

    din = estimator.Estimator(model_fn=model_fn,
                              model_dir='./models/',
                              params=model_params,
                              config=config)

    if FLAGS.task_type == 'train':
        train_spec = estimator.TrainSpec(
            input_fn=lambda: input_fn(train_files,
                                      num_epochs=FLAGS.num_epochs,
                                      batch_size=FLAGS.batch_size,
                                      need_shuffle=True))
        eval_spec = estimator.EvalSpec(input_fn=lambda: input_fn(
            eval_files, num_epochs=-1, batch_size=FLAGS.batch_size),
                                       steps=FLAGS.eval_steps,
                                       start_delay_secs=1,
                                       throttle_secs=5)
        start = time()
        estimator.train_and_evaluate(din, train_spec, eval_spec)
        elapsed = (time() - start)
        tf.logging.info("Training time used: {0}ms".format(
            round(elapsed * 1000, 2)))
    elif FLAGS.task_type == 'eval':
        din.evaluate(input_fn=lambda: input_fn(
            eval_files, num_epochs=1, batch_size=FLAGS.batch_size),
                     steps=FLAGS.eval_steps * 10)
    elif FLAGS.task_type == 'predict':
        p = din.predict(input_fn=lambda: input_fn(
            eval_files, num_epochs=1, batch_size=FLAGS.batch_size))
        tf.logging.info('done predict')
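
Note that Estimator.predict returns a lazy generator, so the 'predict' branch above completes without computing anything until p is iterated. A minimal consumption sketch (the 'prob' key is an assumption about what this model_fn emits):

for i, pred in enumerate(p):
    if i >= 5:  # peek at the first few predictions only
        break
    tf.logging.info('sample %d: prob=%s', i, pred['prob'])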
Example #5
def main(_):
    linear_feature_columns, embedding_feature_columns = build_model_columns(FLAGS.embedding_size)

    # session_config = tf.ConfigProto(log_device_placement=True)
    # session_config.gpu_options.allow_growth = True

    distribute_strategy = None
    if FLAGS.mirror:
        distribute_strategy = tf.distribute.MirroredStrategy()

    config = estimator.RunConfig(
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=5,
        log_step_count_steps=FLAGS.log_steps,
        save_summary_steps=200,
        train_distribute=distribute_strategy,
        eval_distribute=distribute_strategy
    )

    model_params = {
        'linear_feature_columns': linear_feature_columns,
        'embedding_feature_columns': embedding_feature_columns,
        'embedding_size': FLAGS.embedding_size,
        "learning_rate": FLAGS.learning_rate,
        "dropout": FLAGS.dropout,
        "deep_layers": FLAGS.deep_layers
    }

    deepfm = estimator.Estimator(
        model_fn=model_fn,
        model_dir='./models/deepfm',
        params=model_params,
        config=config
    )

    data_dir = FLAGS.dataset_path
    data_files = []
    for i in range(FLAGS.dataset_parts):
        data_files.append(data_dir + 'part-r-{:0>5}'.format(i))

    train_files = data_files[:-FLAGS.dataset_eval]
    eval_files = data_files[-FLAGS.dataset_eval:]
    # hook = estimator.ProfilerHook(save_steps=10, output_dir='./time/', show_memory=True, show_dataflow=True)

    if FLAGS.task_type == 'train':
        train_spec = estimator.TrainSpec(input_fn=lambda: input_fn(
            train_files,
            num_epochs=FLAGS.num_epochs,
            batch_size=FLAGS.batch_size,
            need_shuffle=True))
        eval_spec = estimator.EvalSpec(input_fn=lambda: input_fn(
            eval_files,
            num_epochs=1,
            batch_size=FLAGS.batch_size), steps=None, start_delay_secs=1, throttle_secs=5)
        start = time()
        estimator.train_and_evaluate(deepfm, train_spec, eval_spec)
        elapsed = (time() - start)
        print("Training time used: {0}ms".format(round(elapsed * 1000, 2)))
    elif FLAGS.task_type == 'eval':
        deepfm.evaluate(input_fn=lambda: input_fn(eval_files, num_epochs=1, batch_size=FLAGS.batch_size))
    elif FLAGS.task_type == 'predict':
        p = deepfm.predict(input_fn=lambda: input_fn(eval_files, num_epochs=1, batch_size=FLAGS.batch_size))
        print('done predict')
        # tf.data.Dataset.from_tensor_slices()
        # for i in p:
        #     print(i["prob"])

    feature_description.pop('label')
    serving_fn = estimator.export.build_parsing_serving_input_receiver_fn(feature_description)

    # features = {
    #     "u_id": tf.placeholder(dtype=tf.int32, shape=1, name='u_id'),
    #     "i_id": tf.placeholder(dtype=tf.int32, shape=1, name='i_id'),
    # }
    # serving_fn = estimator.export.build_raw_serving_input_receiver_fn(features)

    deepfm.export_savedmodel(
        export_dir_base=FLAGS.export_path,
        serving_input_receiver_fn=serving_fn,
        as_text=True,
    )
    tf.logging.info('Model exported.')
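
To query the exported parsing signature, a client serializes a tf.train.Example and feeds the bytes under the receiver's default 'examples' key. A minimal TF 1.x sketch, where export_dir stands for the timestamped directory that export_savedmodel returns and the u_id/i_id feature names follow the commented-out raw spec above (both are assumptions):

from tensorflow.contrib import predictor

example = tf.train.Example(features=tf.train.Features(feature={
    'u_id': tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
    'i_id': tf.train.Feature(int64_list=tf.train.Int64List(value=[42])),
}))
predict_fn = predictor.from_saved_model(export_dir)  # assumed path variable
print(predict_fn({'examples': [example.SerializeToString()]}))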