Example #1
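A unit test of AsyncExecutor.run: it builds a bag-of-words sentiment network (bow_net and the imports come from the surrounding test module), verifies that incomplete argument lists raise TypeError, then runs one full pass, saves an inference model, and cleans up the test artifacts.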
    def test_run(self):
        # Initialize dataset description
        data_feed = fluid.DataFeedDesc('train_data/data.prototxt')
        data_feed.set_batch_size(128)  # See API doc for how to change other fields

        # define network
        # input text data
        data = fluid.layers.data(name="words",
                                 shape=[1],
                                 dtype="int64",
                                 lod_level=1)
        # label data
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")

        avg_cost, acc, prediction = bow_net(data, label)
        sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
        opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)

        # Run startup program
        startup_program = fluid.default_startup_program()
        place = fluid.CPUPlace()
        executor = fluid.Executor(place)
        executor.run(startup_program)

        main_program = fluid.default_main_program()
        async_executor = fluid.AsyncExecutor(place)

        self.assertRaises(TypeError, async_executor.run)
        self.assertRaises(TypeError, async_executor.run, main_program)
        self.assertRaises(TypeError, async_executor.run, main_program,
                          data_feed)

        filelist = ['train_data/part-%d' % i for i in range(10)]
        self.assertRaises(TypeError, async_executor.run, main_program,
                          data_feed, filelist)

        thread_num = 4
        self.assertRaises(TypeError, async_executor.run, main_program,
                          data_feed, filelist, thread_num)

        async_executor.run(main_program, data_feed, filelist, thread_num,
                           [acc])
        fluid.io.save_inference_model("imdb.model", [data.name, label.name],
                                      [acc], executor)
        statinfo = os.stat('imdb.model/__model__')
        self.assertGreater(statinfo.st_size, 0)

        os.remove('./data.prototxt')
        shutil.rmtree('./train_data')
        shutil.rmtree('./imdb.model')
Example #2
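A standalone training entry point: it downloads and extracts the IMDB data, describes it with DataFeedDesc, builds the same bag-of-words network as Example #1, and runs 10 epochs through AsyncExecutor, saving an inference model after each epoch. bow_net, URL, and MD5 are defined elsewhere in the module.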
def train():
    # Download data
    with tarfile.open(paddle.dataset.common.download(URL, "imdb",
                                                     MD5)) as tarf:
        tarf.extractall(path='./')

    # Initialize dataset description
    dataset = fluid.DataFeedDesc('train_data/data.prototxt')
    dataset.set_batch_size(128)  # See API doc for how to change other fields
    print(dataset.desc())  # Debug purpose: see what we get

    # define network
    # input text data
    data = fluid.layers.data(name="words",
                             shape=[1],
                             dtype="int64",
                             lod_level=1)
    # label data
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = bow_net(data, label)
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
    opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)

    # Run startup program
    startup_program = fluid.default_startup_program()
    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    async_executor = fluid.AsyncExecutor(place)
    main_program = fluid.default_main_program()
    epochs = 10
    filelist = ["train_data/part-%d" % i for i in range(12)]
    for i in range(epochs):
        thread_num = 4
        async_executor.run(
            main_program,  # This can be changed during iteration
            dataset,  # This can be changed during iteration
            filelist,  # This can be changed during iteration
            thread_num,  # This can be changed during iteration
            [data, acc],  # Multiple fetch targets can be specified
            debug=False)
        fluid.io.save_inference_model('imdb/epoch%d.model' % i,
                                      [data.name, label.name], [acc], executor)
Example #3
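A parameterized training function: it collects the data files from a directory (skipping the data_feed.proto description file itself), configures the batch size and the slots to use, and times each pass through AsyncExecutor.run.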
def train(network, dict_dim, lr, save_dirname, training_data_dirname, pass_num,
          thread_num, batch_size):
    filelist = [
        os.path.join(training_data_dirname, fname)
        for fname in os.listdir(training_data_dirname)
        if fname != 'data_feed.proto'
    ]

    dataset = fluid.DataFeedDesc(
        os.path.join(training_data_dirname, 'data_feed.proto'))
    dataset.set_batch_size(batch_size)  # datafeed should be assigned a batch size
    dataset.set_use_slots(['words', 'label'])

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = network(data, label, dict_dim)
    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    opt_ops, weight_and_grad = optimizer.minimize(avg_cost)

    startup_program = fluid.default_startup_program()
    main_program = fluid.default_main_program()

    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    async_executor = fluid.AsyncExecutor(place)
    for i in range(pass_num):
        pass_start = time.time()
        async_executor.run(main_program,
                           dataset,
                           filelist,
                           thread_num, [acc],
                           debug=False)
        print('pass_id: %u pass_time_cost %f' % (i, time.time() - pass_start))
        fluid.io.save_inference_model('%s/epoch%d.model' % (save_dirname, i),
                                      [data.name, label.name], [acc], executor)
Example #4
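A training loop that feeds data through an external reader: set_pipe_command points the feed at a ctr_reader.py script, and each pass walks 24 hourly file lists. data_list, async_exe, avg_cost, and PASS_NUM come from the enclosing scope.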
def train_loop(main_program, trainer_id=None):
    dataset = fluid.DataFeedDesc('data_feed.proto')
    dataset.set_batch_size(32)
    dataset.set_use_slots([d.name for d in data_list])
    dataset.set_pipe_command(
        '/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python ctr_reader.py'
    )
    # how to define the protocol
    thread_num = 10
    for pass_id in range(PASS_NUM):
        for hour in range(24):
            hour_filelist = [
                "./test_data_dir/%s" % x
                for x in os.listdir("./test_data_dir/") if "part" in x
            ]
            print(hour_filelist)
            async_exe.run(main_program,
                          dataset,
                          hour_filelist,
                          thread_num, [avg_cost],
                          debug=True)
Example #5
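An asynchronous word2vec setup: it builds a skip-gram network, binds the dataset's slots to the network's input variables by name, and hands off to a training loop. GetFileList, reader, skip_gram_word2vec, and async_train_loop are defined elsewhere in the project.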
def async_train(args):
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)
    filelist = GetFileList(args.train_data_path)
    word2vec_reader = reader.Word2VecReader(
        args.dict_path, args.train_data_path, filelist, 0, 1)
    loss, words = skip_gram_word2vec(
        word2vec_reader.dict_size,
        word2vec_reader.word_frequencys,
        args.embedding_size,
        args.max_code_length,
        args.with_hs,
        args.with_nce,
        is_sparse=args.is_sparse)
    dataset = fluid.DataFeedDesc('data_feed.proto')
    dataset.set_batch_size(args.batch_size)
    dataset.set_use_slots([w.name for w in words])
    dataset.set_pipe_command("/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python word2vec_data_gen.py")
    optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
    optimizer.minimize(loss)
    async_train_loop(args, fluid.default_main_program(), loss, dataset, filelist)
Example #6
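A variant of Example #3 that sets the pipe command by writing the underlying protobuf field (dataset.proto_desc.pipe_command) directly instead of calling set_pipe_command; both routes fill the same field in the feed description.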
def train(network, dict_dim, lr, save_dirname, training_data_dirname, pass_num,
          thread_num, batch_size):
    filelist = ['data_generator/train_data/%s' % x
                for x in os.listdir('data_generator/train_data')]

    dataset = fluid.DataFeedDesc('data_feed.proto')
    dataset.set_batch_size(batch_size)  # datafeed should be assigned a batch size
    dataset.set_use_slots(['words', 'label'])
    #dataset.set_pipe_command('/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python new_data_reader.py')
    dataset.proto_desc.pipe_command = "/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python new_data_reader.py"

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = network(data, label, dict_dim)
    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    opt_ops, weight_and_grad = optimizer.minimize(avg_cost)

    startup_program = fluid.default_startup_program()
    main_program = fluid.default_main_program()

    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    async_executor = fluid.AsyncExecutor(place)
    for i in range(pass_num):
        pass_start = time.time()
        async_executor.run(main_program,
                           dataset,
                           filelist,
                           thread_num, [acc],
                           debug=False)
        print('pass_id: %u pass_time_cost %f' % (i, time.time() - pass_start))
        fluid.io.save_inference_model('%s/epoch%d.model' % (save_dirname, i),
                                      [data.name, label.name], [acc], executor)
Example #7
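A module-level script for a pairwise ranking model: the loss, the input layers q, pt, nt, and base_lr are defined earlier in the script; this part wires up SGD, the feed description with an external pairwise_reader.py pipe command, and a single AsyncExecutor pass with debug output and no fetch targets.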
avg_cost = fluid.layers.mean(loss_op3)
'''
acc = fluid.layers.accuracy(input=cos_q_pt, \
                            label=label, k=1)
'''
#real_acc = get_acc(cos_q_nt, cos_q_pt)
# SGD optimizer
sgd_optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
sgd_optimizer.minimize(avg_cost)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
async_exe = fluid.AsyncExecutor(place)
thread_num = 40
dataset = fluid.DataFeedDesc('data_feed.proto')
dataset.set_batch_size(128)
dataset.set_use_slots([q.name, pt.name, nt.name])
dataset.set_pipe_command(
    "/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python pairwise_reader.py"
)
#dataset.set_pipe_command("cat")
filelist = ["ids/%s" % x for x in os.listdir("ids")]
#filelist = ["prepared.txt"]
print(filelist)
async_exe.run(fluid.default_main_program(),
              dataset,
              filelist,
              thread_num, [],
              debug=True)
Example #8
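A minimal unit test for DataFeedDesc itself: it loads a description from ./data.prototxt and asserts that the serialized description matches the expected protobuf text proto_str (defined elsewhere in the test), ignoring whitespace differences.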
    def test_data_feed_desc(self):
        data_feed = fluid.DataFeedDesc('./data.prototxt')
        # assertEqual(data_feed.proto_desc.batch, 2)
        # assertEqual(len(data_feed.proto_desc.multi_slot_desc), 2)
        self.assertEqual(" ".join(data_feed.desc().split()),
                         " ".join(proto_str.split()))