Example #1
File: train.py Project: codescv/paddle-ctr
def train():
    args = parse_args()

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(
        args.embedding_size, args.sparse_feature_dim)
    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(loss)

    if args.is_local:
        logger.info("run local training")
        main_program = fluid.default_main_program()
        train_loop(args, main_program, data_list, loss, auc_var, batch_auc_var,
                   1, 0)  # single trainer: trainers=1, trainer_id=0
    else:
        logger.info("run dist training")
        # Transpile the single-node program into separate parameter-server
        # and trainer programs for distributed training.
        t = fluid.DistributeTranspiler()
        t.transpile(args.trainer_id,
                    pservers=args.endpoints,
                    trainers=args.trainers)
        if args.role == "pserver":
            logger.info("run pserver")
            prog = t.get_pserver_program(args.current_endpoint)
            startup = t.get_startup_program(args.current_endpoint,
                                            pserver_program=prog)
            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif args.role == "trainer":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()
            train_loop(args, train_prog, data_list, loss, auc_var,
                       batch_auc_var, args.trainers, args.trainer_id)
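
All five examples depend on a parse_args helper that the page does not show. The sketch below is a hypothetical reconstruction inferred only from the attributes the snippets read; the flag names match those attributes, but the defaults are placeholders, not the project's values.

import argparse

def parse_args():
    # Hypothetical reconstruction of the project's argument parser; every
    # flag name is taken from an args.<name> access in the snippets, while
    # the defaults are assumed.
    parser = argparse.ArgumentParser(description="PaddlePaddle CTR-DNN")
    parser.add_argument('--embedding_size', type=int, default=10)
    parser.add_argument('--sparse_feature_dim', type=int, default=1000001)
    parser.add_argument('--model_output_dir', type=str, default='models')
    parser.add_argument('--is_local', type=int, default=1)
    parser.add_argument('--cloud_train', type=int, default=0)
    parser.add_argument('--role', type=str, default='trainer')  # trainer | pserver
    parser.add_argument('--endpoints', type=str, default='127.0.0.1:6174')
    parser.add_argument('--current_endpoint', type=str, default='127.0.0.1:6174')
    parser.add_argument('--trainers', type=int, default=1)
    parser.add_argument('--trainer_id', type=int, default=0)
    parser.add_argument('--model_path', type=str, default='models/pass-0')
    parser.add_argument('--data_path', type=str, default='data/valid.txt')
    parser.add_argument('--output_data_path', type=str, default='output')
    parser.add_argument('--batch_size', type=int, default=1000)
    return parser.parse_args()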
Example #2
def train():
    args = parse_args()

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, auc_var, batch_auc_var, py_reader, _ = ctr_dnn_model(
        args.embedding_size, args.sparse_feature_dim)
    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(loss)
    if args.cloud_train:
        # the port of all pservers, needed by both trainer and pserver
        port = os.getenv("PADDLE_PORT", "6174")
        # comma-separated IPs of all pservers, needed by both trainer and pserver
        pserver_ips = os.getenv("PADDLE_PSERVERS", "")
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        args.endpoints = ",".join(eplist)
        args.trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        args.current_endpoint = os.getenv("POD_IP", "localhost") + ":" + port
        args.role = os.getenv("TRAINING_ROLE", "TRAINER")
        args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        args.is_local = bool(int(os.getenv("PADDLE_IS_LOCAL", "0")))

    if args.is_local:
        logger.info("run local training")
        main_program = fluid.default_main_program()
        train_loop(args, main_program, py_reader, loss, auc_var, batch_auc_var,
                   1, 0)  # single trainer: trainers=1, trainer_id=0
    else:
        logger.info("run dist training")
        t = fluid.DistributeTranspiler()
        t.transpile(args.trainer_id,
                    pservers=args.endpoints,
                    trainers=args.trainers)
        if args.role == "pserver" or args.role == "PSERVER":
            logger.info("run pserver")
            prog = t.get_pserver_program(args.current_endpoint)
            startup = t.get_startup_program(args.current_endpoint,
                                            pserver_program=prog)
            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif args.role == "trainer" or args.role == "TRAINER":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()
            train_loop(args, train_prog, py_reader, loss, auc_var,
                       batch_auc_var, args.trainers, args.trainer_id)
        else:
            raise ValueError(
                'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
            )
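
The cloud_train branch above configures itself entirely from environment variables. For illustration, a cluster scheduler would need to export something like the following before launching train(); every variable name comes from the snippet, and the values are placeholders.

import os

os.environ["PADDLE_PORT"] = "6174"                   # port shared by all pservers
os.environ["PADDLE_PSERVERS"] = "10.0.0.1,10.0.0.2"  # comma-separated pserver IPs
os.environ["PADDLE_TRAINERS_NUM"] = "2"              # number of trainer nodes
os.environ["POD_IP"] = "10.0.0.3"                    # this node's own IP
os.environ["TRAINING_ROLE"] = "TRAINER"              # TRAINER or PSERVER
os.environ["PADDLE_TRAINER_ID"] = "0"                # index of this trainer
os.environ["PADDLE_IS_LOCAL"] = "0"                  # 0 = distributed run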
Example #3
def dump():
    args = parse_args()

    output_data_path = os.path.abspath(args.output_data_path)
    base_datafile = os.path.join(output_data_path, NOW_DATETIME, "base", "feature")
    base_donefile = os.path.join(output_data_path, "donefile", "base.txt")
    patch_datafile = os.path.join(output_data_path, NOW_DATETIME, "patch", "feature")
    patch_donefile = os.path.join(output_data_path, "donefile", "patch.txt")

    place = fluid.CPUPlace()
    inference_scope = fluid.Scope()

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    with fluid.framework.program_guard(test_program, startup_program):
        loss, auc_var, batch_auc_var, _, data_list = ctr_dnn_model(
            args.embedding_size, args.sparse_feature_dim, False)

        exe = fluid.Executor(place)

        feeder = fluid.DataFeeder(feed_list=data_list, place=place)

        fluid.io.load_persistables(executor=exe,
                                   dirname=args.model_path,
                                   main_program=fluid.default_main_program())

        # Dump embedding
        t = np.array(
            fluid.global_scope().find_var('SparseFeatFactors').get_tensor())

        if not os.path.exists(os.path.dirname(base_datafile)):
            os.makedirs(os.path.dirname(base_datafile))

        # Write one record per embedding row: the row id packed as an
        # unsigned 64-bit key and the row packed as float32 values.
        with open(base_datafile, "wb") as f:
            writer = SequenceFileWriter(f)
            for i in range(t.shape[0]):
                key_bytes = struct.pack('Q', i)
                row_bytes = struct.pack('%sf' % t.shape[1], *t[i])
                writer.write(key_bytes, row_bytes)

    write_donefile(base_datafile, base_donefile)
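
write_donefile and SequenceFileWriter are project helpers that the page does not show. Below is a minimal sketch of what write_donefile might look like, assuming the donefile simply accumulates one line per finished dump so downstream consumers can discover new data; the format is a guess, not the project's definition.

import os

def write_donefile(datafile, donefile):
    # Hypothetical helper: append the directory of the finished dump to the
    # donefile. The one-line-per-dump text format is an assumption.
    if not os.path.exists(os.path.dirname(donefile)):
        os.makedirs(os.path.dirname(donefile))
    with open(donefile, "a") as f:
        f.write(os.path.dirname(datafile) + "\n")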
Example #4
def infer():
    args = parse_args()

    place = fluid.CPUPlace()
    inference_scope = fluid.Scope()

    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    test_reader = paddle.batch(dataset.test([args.data_path]),
                               batch_size=args.batch_size)

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    with fluid.scope_guard(inference_scope):
        with fluid.framework.program_guard(test_program, startup_program):
            loss, auc_var, batch_auc_var, _, data_list, auc_states = ctr_dnn_model(
                args.embedding_size, args.sparse_feature_dim, False)

            exe = fluid.Executor(place)

            feeder = fluid.DataFeeder(feed_list=data_list, place=place)

            fluid.io.load_persistables(
                executor=exe,
                dirname=args.model_path,
                main_program=fluid.default_main_program())

            def set_zero(var_name):
                # Zero out an accumulator tensor in the inference scope.
                param = inference_scope.var(var_name).get_tensor()
                param_array = np.zeros(param._get_dims()).astype("int64")
                param.set(param_array, place)

            # Reset the accumulated AUC states so the reported AUC reflects
            # only this evaluation pass.
            for var in auc_states:
                set_zero(var.name)

            for batch_id, data in enumerate(test_reader()):
                loss_val, auc_val = exe.run(test_program,
                                            feed=feeder.feed(data),
                                            fetch_list=[loss, auc_var])
                if batch_id % 100 == 0:
                    logger.info("TEST --> batch: {} loss: {} auc: {}".format(
                        batch_id, loss_val / args.batch_size, auc_val))
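
test_reader above is a batch-level reader produced by paddle.batch. Conceptually it behaves like the sketch below, which is illustrative rather than the library's implementation.

def batch_like(sample_reader, batch_size):
    # Illustrative stand-in for paddle.batch: wrap a sample-level reader in
    # one that yields lists of batch_size samples, which feeder.feed(data)
    # then converts into feedable tensors.
    def batched_reader():
        batch = []
        for sample in sample_reader():
            batch.append(sample)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:  # paddle.batch keeps the short final batch unless drop_last is set
            yield batch
    return batched_reader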
Example #5
def infer():
    args = parse_args()

    place = fluid.CPUPlace()
    inference_scope = fluid.core.Scope()

    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    test_reader = paddle.batch(dataset.test([args.data_path]),
                               batch_size=args.batch_size)

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    with fluid.framework.program_guard(test_program, startup_program):
        loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(
            args.embedding_size, args.sparse_feature_dim)

    exe = fluid.Executor(place)

    feeder = fluid.DataFeeder(feed_list=data_list, place=place)

    with fluid.scope_guard(inference_scope):
        [inference_program, _,
         fetch_targets] = fluid.io.load_inference_model(args.model_path, exe)

        def set_zero(var_name):
            param = inference_scope.var(var_name).get_tensor()
            param_array = np.zeros(param._get_dims()).astype("int64")
            param.set(param_array, place)

        # Auto-generated accumulator variables backing the AUC metric in the
        # saved inference program; zero them before evaluating.
        auc_states_names = ['_generated_var_2', '_generated_var_3']
        for name in auc_states_names:
            set_zero(name)

        for batch_id, data in enumerate(test_reader()):
            loss_val, auc_val = exe.run(inference_program,
                                        feed=feeder.feed(data),
                                        fetch_list=fetch_targets)
            if batch_id % 100 == 0:
                logger.info("TEST --> batch: {} loss: {} auc: {}".format(
                    batch_id, loss_val / args.batch_size, auc_val))
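
Note the contrast with Example #4: there the network is rebuilt with ctr_dnn_model and its parameters are restored with fluid.io.load_persistables, whereas here fluid.io.load_inference_model reloads the saved inference program together with its parameters, so the fetched loss and AUC come from fetch_targets rather than from freshly constructed variables.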