Example No. 1
    def test_run(self):
        # Initialize dataset description
        data_feed = fluid.DataFeedDesc('train_data/data.prototxt')
        data_feed.set_batch_size(
            128)  # See API doc for how to change other fields

        # define network
        # input text data
        data = fluid.layers.data(name="words",
                                 shape=[1],
                                 dtype="int64",
                                 lod_level=1)
        # label data
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")

        avg_cost, acc, prediction = bow_net(data, label)
        optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
        opt_ops, weight_and_grad = optimizer.minimize(avg_cost)

        # Run startup program
        startup_program = fluid.default_startup_program()
        place = fluid.CPUPlace()
        executor = fluid.Executor(place)
        executor.run(startup_program)

        main_program = fluid.default_main_program()
        async_executor = fluid.AsyncExecutor(place)

        self.assertRaises(TypeError, async_executor.run)
        self.assertRaises(TypeError, async_executor.run, main_program)
        self.assertRaises(TypeError, async_executor.run, main_program,
                          data_feed)

        filelist = ['train_data/part-%d' % i for i in range(10)]
        self.assertRaises(TypeError, async_executor.run, main_program,
                          data_feed, filelist)

        thread_num = 4
        self.assertRaises(TypeError, async_executor.run, main_program,
                          data_feed, filelist, thread_num)

        async_executor.run(main_program, data_feed, filelist, thread_num,
                           [acc])
        fluid.io.save_inference_model("imdb.model", [data.name, label.name],
                                      [acc], executor)
        statinfo = os.stat('imdb.model/__model__')
        self.assertGreater(statinfo.st_size, 0)

        os.remove('./data.prototxt')
        shutil.rmtree('./train_data')
        shutil.rmtree('./imdb.model')
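
Every example on this page parses its dataset description from a protobuf text
file. As a hedged sketch (not taken from the source), the following writes a
'train_data/data.prototxt' in the MultiSlotDataFeed text format that
fluid.DataFeedDesc parses; the "uint64" slot type is an assumption matching the
int64 id slots declared in the network above.

import os

# Sketch: write a minimal feed description for the two slots used above.
proto_text = '''name: "MultiSlotDataFeed"
batch_size: 128
multi_slot_desc {
    slots { name: "words" type: "uint64" is_dense: false is_used: true }
    slots { name: "label" type: "uint64" is_dense: false is_used: true }
}
'''
if not os.path.exists('train_data'):
    os.makedirs('train_data')
with open('train_data/data.prototxt', 'w') as f:
    f.write(proto_text)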
Example No. 2
def train():
    # Download data
    with tarfile.open(paddle.dataset.common.download(URL, "imdb",
                                                     MD5)) as tarf:
        tarf.extractall(path='./')  # the with-statement closes the archive

    # Initialize dataset description
    dataset = fluid.DataFeedDesc('train_data/data.prototxt')
    dataset.set_batch_size(128)  # See API doc for how to change other fields
    print(dataset.desc())  # Debug purpose: see what we get

    # define network
    # input text data
    data = fluid.layers.data(name="words",
                             shape=[1],
                             dtype="int64",
                             lod_level=1)
    # label data
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = bow_net(data, label)
    optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
    opt_ops, weight_and_grad = optimizer.minimize(avg_cost)

    # Run startup program
    startup_program = fluid.default_startup_program()
    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    async_executor = fluid.AsyncExecutor(place)
    main_program = fluid.default_main_program()
    epochs = 10
    filelist = ["train_data/part-%d" % i for i in range(12)]
    for i in range(epochs):
        thread_num = 4
        async_executor.run(
            main_program,  # This can be changed during iteration
            dataset,  # This can be changed during iteration
            filelist,  # This can be changed during iteration
            thread_num,  # This can be changed during iteration
            [data, acc],  # Multiple fetch targets can be specified
            debug=False)
        fluid.io.save_inference_model('imdb/epoch%d.model' % i,
                                      [data.name, label.name], [acc], executor)
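
The loop above leaves one saved model directory per epoch. As a usage sketch
(not part of the source), such a directory can be loaded back with the standard
fluid.io.load_inference_model API; 'imdb/epoch0.model' assumes epoch 0 finished.

import paddle.fluid as fluid

# Sketch: reload a saved epoch for inference.
infer_exe = fluid.Executor(fluid.CPUPlace())
infer_prog, feed_names, fetch_targets = fluid.io.load_inference_model(
    'imdb/epoch0.model', infer_exe)
# feed_names matches the [data.name, label.name] list passed when saving.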
Example No. 3
def async_train_loop(args, train_program, loss, dataset, filelist):
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    async_executor = fluid.AsyncExecutor(place)
    thread_num = 40
    for i in range(args.epochs):
        async_executor.run(
            train_program, # main program
            dataset, # dataset
            filelist, # filelist
            thread_num, # thread
            [], # fetch
            debug=True) # debug
        epoch_model = "word2vec_model/epoch" + str(i + 1)
        # NOTE: `data`, `label` and `acc` are not defined in this function;
        # they must come from the (elided) code that built `train_program`.
        fluid.io.save_inference_model(
            epoch_model,
            [data.name, label.name],
            [acc],
            exe)
Example No. 4
def train(network, dict_dim, lr, save_dirname, training_data_dirname, pass_num,
          thread_num, batch_size):
    # Collect every file in the training directory except the feed description.
    filelist = [
        os.path.join(training_data_dirname, name)
        for name in os.listdir(training_data_dirname)
        if name != 'data_feed.proto'
    ]

    dataset = fluid.DataFeedDesc(
        os.path.join(training_data_dirname, 'data_feed.proto'))
    dataset.set_batch_size(
        batch_size)  # datafeed should be assigned a batch size
    dataset.set_use_slots(['words', 'label'])

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = network(data, label, dict_dim)
    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    opt_ops, weight_and_grad = optimizer.minimize(avg_cost)

    startup_program = fluid.default_startup_program()
    main_program = fluid.default_main_program()

    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    async_executor = fluid.AsyncExecutor(place)
    for i in range(pass_num):
        pass_start = time.time()
        async_executor.run(main_program,
                           dataset,
                           filelist,
                           thread_num, [acc],
                           debug=False)
        print('pass_id: %u pass_time_cost %f' % (i, time.time() - pass_start))
        fluid.io.save_inference_model('%s/epoch%d.model' % (save_dirname, i),
                                      [data.name, label.name], [acc], executor)
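
No pipe command is configured here, so MultiSlotDataFeed reads the part files
directly. A hedged sketch of that default text layout (the concrete ids are
illustrative, not from the source): each line holds, for every slot in order,
an element count followed by the elements themselves.

# One illustrative line for the slots [words, label]:
#   "3 939 74 11 1 0"  ->  words: 3 ids (939, 74, 11); label: 1 value (0)
with open('train_data/part-0', 'w') as f:
    f.write('3 939 74 11 1 0\n')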
Example No. 5
def train_async_local(batch_size):
    fea_sz, fea_sections, model_dict = model_conf.model_conf(
        'thirdparty/model.conf')
    id_dict = None

    #data_list, predict, auc_var, cur_auc_var, auc_states, avg_cost, label = \
    #fluid_net.async_net(fea_sections)
    data_list, predict, avg_cost, label = fluid_net.async_net(fea_sections)

    optimizer = fluid.optimizer.Adam(learning_rate=0.0005, lazy_mode=True)
    #optimizer = fluid.optimizer.SGD(learning_rate=0.01)
    optimize_ops, params_grads = optimizer.minimize(avg_cost)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    async_exe = fluid.AsyncExecutor(place)

    def train_loop(main_program, trainer_id=None):
        dataset = fluid.DataFeedDesc('data_feed.proto')
        dataset.set_batch_size(32)
        dataset.set_use_slots([d.name for d in data_list])
        dataset.set_pipe_command(
            '/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python ctr_reader.py'
        )
        # The pipe command turns each raw input line into the slots declared
        # above; see the reader sketch after Example No. 6 for the layout.
        thread_num = 10
        for pass_id in range(PASS_NUM):  # PASS_NUM is defined at module scope
            for hour in range(24):
                hour_filelist = [
                    "./test_data_dir/%s" % x
                    for x in os.listdir("./test_data_dir/") if "part" in x
                ]
                print(hour_filelist)
                async_exe.run(main_program,
                              dataset,
                              hour_filelist,
                              thread_num, [avg_cost],
                              debug=True)

    train_loop(fluid.default_main_program())
Example No. 6
def train(network, dict_dim, lr, save_dirname, training_data_dirname, pass_num,
          thread_num, batch_size):
    # NOTE: the filelist is built from a fixed path rather than from
    # `training_data_dirname`.
    filelist = ['data_generator/train_data/%s' % x
                for x in os.listdir('data_generator/train_data')]

    dataset = fluid.DataFeedDesc('data_feed.proto')
    dataset.set_batch_size(
        batch_size)  # datafeed should be assigned a batch size
    dataset.set_use_slots(['words', 'label'])
    #dataset.set_pipe_command('/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python new_data_reader.py')
    dataset.proto_desc.pipe_command = "/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python new_data_reader.py"

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = network(data, label, dict_dim)
    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    opt_ops, weight_and_grad = optimizer.minimize(avg_cost)

    startup_program = fluid.default_startup_program()
    main_program = fluid.default_main_program()

    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    async_executor = fluid.AsyncExecutor(place)
    for i in range(pass_num):
        pass_start = time.time()
        async_executor.run(main_program,
                           dataset,
                           filelist,
                           thread_num, [acc],
                           debug=False)
        print('pass_id: %u pass_time_cost %f' % (i, time.time() - pass_start))
        fluid.io.save_inference_model('%s/epoch%d.model' % (save_dirname, i),
                                      [data.name, label.name], [acc], executor)
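
The pipe command configured above (here via proto_desc.pipe_command) is run
over each input file and must print the same MultiSlot text layout to stdout.
A hypothetical reader sketch follows; new_data_reader.py itself is not shown in
the source, and the tab-separated raw format is an assumption.

import sys

# Hypothetical reader: raw lines are assumed to be "label<TAB>id id id ...".
# For each instance, emit per slot (words first, then label) a count followed
# by the values, as MultiSlotDataFeed expects.
for line in sys.stdin:
    label, text = line.rstrip('\n').split('\t')
    words = text.split()
    out = [str(len(words))] + words + ['1', label]
    sys.stdout.write(' '.join(out) + '\n')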
Example No. 7
# NOTE: this snippet is an excerpt; its opening lines (defining cos_q_pt,
# cos_q_nt, loss_op2, etc.) are cut off in the source. The dangling fragment
# below is reconstructed as the usual hinge clamp, max(0, loss_op2):
loss_op3 = fluid.layers.elementwise_max(
    fluid.layers.fill_constant_batch_size_like(input=loss_op2,
                                               shape=[-1, 1], value=0.0,
                                               dtype='float32'),
    loss_op2)
avg_cost = fluid.layers.mean(loss_op3)
'''
acc = fluid.layers.accuracy(input=cos_q_pt, \
                            label=label, k=1)
'''
#real_acc = get_acc(cos_q_nt, cos_q_pt)
# SGD optimizer
sgd_optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
sgd_optimizer.minimize(avg_cost)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
async_exe = fluid.AsyncExecutor(place)
thread_num = 40
dataset = fluid.DataFeedDesc('data_feed.proto')
dataset.set_batch_size(128)
dataset.set_use_slots([q.name, pt.name, nt.name])
dataset.set_pipe_command(
    "/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python pairwise_reader.py"
)
#dataset.set_pipe_command("cat")
filelist = ["ids/%s" % x for x in os.listdir("ids")]
#filelist = ["prepared.txt"]
print(filelist)
# NOTE: the source snippet is cut off after the empty fetch list; the call is
# closed here with debug=True to mirror the run(...) calls in other examples.
async_exe.run(fluid.default_main_program(),
              dataset,
              filelist,
              thread_num, [],
              debug=True)
Example No. 8
        # (excerpt: these lines sit inside the Model class; the network code
        # that produces self.avg_cost is cut off in the source)
        sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)

        self.startup_program = fluid.default_startup_program()
        self.program_desc = fluid.default_main_program()


if __name__ == "__main__":

    model = Model()

    dp = downpour.DownpourSGD(learning_rate=0.1, window=1)
    server_desc, skipped_ops = dp.minimize(model.avg_cost)

    server_desc_str = text_format.MessageToString(server_desc)

    async_exe = fluid.AsyncExecutor()
    instance = async_exe.config_distributed_nodes()

    if instance.is_server():
        async_exe.init_server(server_desc_str)
    elif instance.is_worker():
        async_exe.init_worker(server_desc_str, model.startup_program)
        local_data_dir = "./data/"
        # you can use this to download data from hadoop
        # async_exe.download_data("your_HADOOP_data_dir", local_data_dir, "fs_default_name", "ugi", 10)
        data_set = data_feed.DataFeedDesc(local_data_dir + "data_feed.proto")
        data_set.set_use_slots(["click"] + [str(i) for i in range(100)])
        # list(...) keeps this working on Python 3, where filter is lazy.
        file_list = list(filter(
            lambda x: x.find("part") != -1,
            [local_data_dir + i for i in os.listdir(local_data_dir)]))
        # NOTE: the source snippet ends mid-call; the empty fetch list and the
        # final stop() are assumed, following the AsyncExecutor distributed API.
        async_exe.run(model.program_desc, data_set, file_list, 10, [],
                      debug=False)
        async_exe.stop()