def arci_api(qpool, logdir, dataset_path, train_id, parameter):
    keras.backend.clear_session()
    # Load the data and create the preprocessor object
    train_pack = load_train_data(train_id, parameter['existing_dataset'], parameter['task'])
    predict_pack = load_test_data(train_id, parameter['existing_dataset'], parameter['task'])
    preprocessor = mz.preprocessors.BasicPreprocessor(fixed_length_left=10, fixed_length_right=100,
                                                      remove_stop_words=False)
    # Redirect stderr to the log file
    logdir.set_preprocess_id(train_id)
    err_old = sys.stderr
    sys.stderr = logdir
    # Write the preprocessor.fit output to the log, then undo the redirection and save the preprocessor
    train_pack_processed = preprocessor.fit_transform(train_pack)
    sys.stderr = err_old
    preprocessor.save(ROOT_PATH + 'matchzoo_temp_files/preprocessors/' + train_id + '.arci_preprocessor')
    predict_pack_processed = preprocessor.transform(predict_pack)
    with open(ROOT_PATH + 'matchzoo_temp_files/logger/' + train_id + '.preprocess_log', 'a') as f:
        f.write('Preprocess finished!')
    ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss())
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
        mz.metrics.MeanAveragePrecision()
    ]
    model = mz.models.ArcI()
    model.params['input_shapes'] = preprocessor.context['input_shapes']
    model.params['task'] = ranking_task
    model.params['embedding_input_dim'] = preprocessor.context['vocab_size']
    model.params['embedding_output_dim'] = parameter['embedding_output_dim']
    model.params['num_blocks'] = parameter['num_blocks']
    model.params['left_filters'] = [parameter['left_filters']]
    model.params['left_kernel_sizes'] = [parameter['left_kernel_sizes']]
    model.params['left_pool_sizes'] = [parameter['left_pool_sizes']]
    model.params['right_filters'] = [parameter['right_filters']]
    model.params['right_kernel_sizes'] = [parameter['right_kernel_sizes']]
    model.params['right_pool_sizes'] = [parameter['right_pool_sizes']]
    model.params['conv_activation_func'] = 'relu'
    model.params['mlp_num_layers'] = parameter['mlp_num_layers']
    model.params['mlp_num_units'] = parameter['mlp_num_units']
    model.params['mlp_num_fan_out'] = parameter['mlp_num_fan_out']
    model.params['mlp_activation_func'] = 'relu'
    model.params['dropout_rate'] = 0.9
    model.params['optimizer'] = 'adadelta'
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.backend.summary()
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
    embedding_matrix = glove_embedding.build_matrix(preprocessor.context['vocab_unit'].state['term_index'])
    model.load_embedding_matrix(embedding_matrix)
    pred_x, pred_y = predict_pack_processed[:].unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y, batch_size=len(pred_y))
    train_generator = mz.PairDataGenerator(train_pack_processed, num_dup=2, num_neg=1, batch_size=20)
    # Redirect stdout to the log
    qpool.set_trainid(train_id)
    old = sys.stdout
    sys.stdout = qpool
    model.fit_generator(train_generator, epochs=parameter['epochs'], callbacks=[evaluate], workers=5, use_multiprocessing=False)
    sys.stdout = old
    model.save(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + '.arci_model')
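
# Sketch (illustrative, not part of the original source): the manual
# sys.stderr swap in arci_api can be written with contextlib from the
# standard library, which restores the stream even if the wrapped call
# raises. `sink` stands in for any writable object such as `logdir` above.
def redirect_stderr_example(sink, preprocessor, train_pack):
    import contextlib
    with contextlib.redirect_stderr(sink):  # stream restored on exit, even on error
        return preprocessor.fit_transform(train_pack)
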
def train(train_id='test_file'):
    train_pack = mz.datasets.wiki_qa.load_data(stage='train')[:1000]
    dev_pack = mz.datasets.wiki_qa.load_data(stage='dev')[:1000]
    predict_pack = mz.datasets.wiki_qa.load_data(
        stage='test').drop_label()[:1000]

    preprocessor = mz.preprocessors.DSSMPreprocessor()
    preprocessor.fit(train_pack)
    preprocessor.save(ROOT_PATH + 'matchzoo_temp_files/preprocessors/' +
                      train_id + '.dssm_preprocessor')

    train_pack_processed = preprocessor.transform(train_pack)
    dev_pack_processed = preprocessor.transform(dev_pack)

    train_generator = mz.PairDataGenerator(train_pack_processed,
                                           num_dup=5,
                                           num_neg=1,
                                           batch_size=32)

    ranking_task = mz.tasks.Ranking(
        loss=mz.losses.RankHingeLoss(num_neg=1, margin=1.0))
    ranking_task.metrics = [
        'mae', 'map', 'precision',
        mz.metrics.Precision(k=3),
        mz.metrics.DiscountedCumulativeGain(k=1),
        mz.metrics.DiscountedCumulativeGain(k=3),
        mz.metrics.DiscountedCumulativeGain(k=5),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=1),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5)
    ]

    model = mz.models.DSSM()
    model.params['task'] = ranking_task
    model.params['input_shapes'] = preprocessor.context['input_shapes']
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()

    dev_x, dev_y = dev_pack_processed.unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                               x=dev_x,
                                               y=dev_y,
                                               batch_size=32)
    model.fit(*train_pack_processed.unpack(),
              epochs=10,
              batch_size=32,
              callbacks=[evaluate])

    model.save(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id +
               '.dssm_model')

    model.fit_generator(train_generator,
                        epochs=5,
                        callbacks=[evaluate],
                        workers=4,
                        use_multiprocessing=True)
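
# Note (editor assumption, based on MatchZoo's pairwise training scheme):
# the num_neg given to PairDataGenerator should match the num_neg of the
# ranking loss, since the loss slices each batch into groups of one
# positive followed by num_neg negatives. A consistent pairing:
#
#     loss = mz.losses.RankHingeLoss(num_neg=1, margin=1.0)
#     generator = mz.PairDataGenerator(train_pack_processed,
#                                      num_dup=5, num_neg=1, batch_size=32)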
Example #3

def tutorial():
    # data = get_processed_data()
    # data = knrm_processed()
    print("Loading data")
    data = get_processed_data_from_cache()
    preprocessor, train_processed, valid_processed = data
    # save_to_pickle(data, "matchzoo_prac1")
    print("Defining task")
    ranking_task = mz.tasks.Ranking(loss=mz.losses.RankCrossEntropyLoss(
        num_neg=4))
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.MeanAveragePrecision()
    ]
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(
        dimension=300)
    print('output_dim', glove_embedding.output_dim)

    # Initialize the model and fine-tune the hyper-parameters.
    print("building model")
    model = mz.models.KNRM()
    model.params.update(preprocessor.context)
    model.params['task'] = ranking_task
    model.params['embedding_output_dim'] = glove_embedding.output_dim
    model.params['embedding_trainable'] = True
    model.params['kernel_num'] = 21
    model.params['sigma'] = 0.1
    model.params['exact_sigma'] = 0.001
    model.params['optimizer'] = 'adadelta'
    model.build()
    model.compile()
    embedding_matrix = glove_embedding.build_matrix(
        preprocessor.context['vocab_unit'].state['term_index'])
    print(embedding_matrix.shape)
    # normalize the word embeddings for fast histogram generation.
    l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
    print(l2_norm.shape)
    embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
    print(embedding_matrix.shape)
    model.load_embedding_matrix(embedding_matrix)

    print("defining generator")
    train_generator = mz.PairDataGenerator(train_processed,
                                           num_dup=1,
                                           num_neg=4,
                                           batch_size=64,
                                           shuffle=True)
    valid_x, valid_y = valid_processed.unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                               x=valid_x,
                                               y=valid_y,
                                               batch_size=len(valid_y))
    print("fitting")
    history = model.fit_generator(train_generator,
                                  epochs=20,
                                  callbacks=[evaluate],
                                  workers=5,
                                  use_multiprocessing=False)
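
# Sketch (illustrative, not part of the original source): the row-wise L2
# normalization in tutorial() divides by zero for all-zero rows (e.g. the
# padding index), which yields NaNs. A guarded variant with the same
# (vocab_size, dim) matrix layout:
import numpy as np

def l2_normalize_rows(matrix, eps=1e-12):
    norms = np.sqrt((matrix * matrix).sum(axis=1, keepdims=True))
    return matrix / np.maximum(norms, eps)  # clamp zero norms to eps
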
def cdssm_api(qpool, logdir, dataset_path, train_id, parameter):
    keras.backend.clear_session()
    # Load the data and create the preprocessor object
    train_pack = load_train_data(train_id, parameter['existing_dataset'], parameter['task'])
    predict_pack = load_test_data(train_id, parameter['existing_dataset'], parameter['task'])
    preprocessor = mz.preprocessors.CDSSMPreprocessor()
    # Redirect stderr to the log file
    logdir.set_preprocess_id(train_id)
    err_old = sys.stderr
    sys.stderr = logdir
    # Write the preprocessor.fit output to the log, then undo the redirection and save the preprocessor
    train_pack_processed = preprocessor.fit_transform(train_pack)
    sys.stderr = err_old
    preprocessor.save(ROOT_PATH + 'matchzoo_temp_files/preprocessors/' + train_id + '.cdssm_preprocessor')
    predict_pack_processed = preprocessor.transform(predict_pack)
    with open(ROOT_PATH + 'matchzoo_temp_files/logger/' + train_id + '.preprocess_log', 'a') as f:
        f.write('Preprocess finished!')
    ranking_task = mz.tasks.Ranking(loss=mz.losses.RankCrossEntropyLoss(num_neg=4))
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
        mz.metrics.MeanAveragePrecision()
    ]
    model = mz.models.CDSSM()
    model.params['input_shapes'] = preprocessor.context['input_shapes']
    model.params['task'] = ranking_task
    model.params['filters'] = parameter['filters']
    model.params['kernel_size'] = parameter['kernel_size']
    model.params['strides'] = parameter['strides']
    model.params['padding'] = parameter['padding']
    model.params['conv_activation_func'] = parameter['conv_activation_func']
    model.params['w_initializer'] = parameter['w_initializer']
    model.params['b_initializer'] = parameter['b_initializer']
    model.params['mlp_num_layers'] = parameter['mlp_num_layers']
    model.params['mlp_num_units'] = parameter['mlp_num_units']
    model.params['mlp_num_fan_out'] = parameter['mlp_num_fan_out']
    model.params['mlp_activation_func'] = parameter['mlp_activation_func']
    model.params['dropout_rate'] = 0.8
    model.params['optimizer'] = 'adadelta'
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.backend.summary()
    pred_x, pred_y = predict_pack_processed[:].unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y, batch_size=len(pred_y))
    train_generator = mz.PairDataGenerator(train_pack_processed, num_dup=1, num_neg=4, batch_size=64, shuffle=True)
    # Redirect stdout to the log
    qpool.set_trainid(train_id)
    old = sys.stdout
    sys.stdout = qpool
    model.fit_generator(train_generator, epochs=parameter['epochs'], callbacks=[evaluate], workers=5, use_multiprocessing=False)
    sys.stdout = old
    model.save(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + '.cdssm_model')
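
# Sketch (hypothetical stand-in, not from the original source): arci_api
# and cdssm_api both assume `logdir` and `qpool` are writable file-like
# objects that also expose set_preprocess_id()/set_trainid(). A minimal
# object satisfying that interface, for local testing:
class SimpleLogSink:
    def __init__(self, path_prefix):
        self._prefix = path_prefix
        self._file = None

    def set_preprocess_id(self, train_id):
        # open one append-mode log per training run
        self._file = open(self._prefix + train_id + '.log', 'a')

    set_trainid = set_preprocess_id  # qpool calls the same hook by this name

    def write(self, text):
        if self._file is not None:
            self._file.write(text)

    def flush(self):
        if self._file is not None:
            self._file.flush()
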
callback_earlystopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                       mode='min',
                                                       verbose=0,
                                                       patience=100,
                                                       min_delta=0.001)
mcp_save = keras.callbacks.ModelCheckpoint('best_one_cdssm',
                                           save_best_only=True,
                                           monitor='val_loss',
                                           mode='min')
evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                           x=val_x,
                                           y=val_y,
                                           batch_size=len(val_y))
data_generator = mz.PairDataGenerator(train_processed,
                                      num_dup=2,
                                      num_neg=2,
                                      batch_size=128)

start_time = time.time()
model.fit_generator(data_generator,
                    epochs=1000,
                    validation_data=(val_x, val_y),
                    callbacks=[evaluate, callback_earlystopping, mcp_save],
                    verbose=2)
print(
    "===================================================== Training time ====================================================="
)
print("--- %s seconds ---" % (time.time() - start_time))
print(
    "========================================================================================================================="
)
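
# Variant (assumes Keras >= 2.2.3, where restore_best_weights was added):
# instead of reloading 'best_one_cdssm' from disk after training, early
# stopping can roll the live model back to its best weights on stop:
#
#     callback_earlystopping = keras.callbacks.EarlyStopping(
#         monitor='val_loss', mode='min', patience=100,
#         min_delta=0.001, restore_best_weights=True)
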
Example #6

model.params['mlp_num_layers'] = 3
model.params['mlp_num_units'] = 300
model.params['mlp_num_fan_out'] = 128
model.params['mlp_activation_func'] = 'relu'
model.guess_and_fill_missing_params()
model.build()
model.compile()
model.backend.summary()


pred_x, pred_y = valid_pack_processed[:].unpack()
# print(pred_x,pred_y)
evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                           x=pred_x,
                                           y=pred_y,
                                           batch_size=len(pred_y))

train_generator = mz.PairDataGenerator(train_pack_processed,
                                       num_dup=1,
                                       num_neg=4,
                                       batch_size=32,
                                       shuffle=True)
print(len(train_generator))  # number of batches per epoch

history = model.fit_generator(train_generator,
                              epochs=100,
                              callbacks=[evaluate],
                              workers=5,
                              use_multiprocessing=False)
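
# Sketch (illustrative): fit_generator returns a Keras History object whose
# .history dict holds one value per epoch, so the loss curve can be
# inspected directly, e.g.:
#
#     for epoch, loss in enumerate(history.history['loss'], start=1):
#         print('epoch %d: loss=%.4f' % (epoch, loss))
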
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.backend.summary()

    matrix = embedding.build_matrix(
        preprocessor.context['vocab_unit'].state['term_index'])
    model.load_embedding_matrix(matrix)

    pred_x, pred_y = predict_pack_processed[:].unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                               x=pred_x,
                                               y=pred_y,
                                               batch_size=len(pred_y))
    train_generator = mz.PairDataGenerator(train_pack_processed,
                                           num_dup=2,
                                           num_neg=1,
                                           batch_size=20)
    history = model.fit_generator(train_generator,
                                  epochs=30,
                                  callbacks=[evaluate],
                                  workers=30,
                                  use_multiprocessing=True)
    # evaluate the model
    scores = model.evaluate(pred_x, pred_y, batch_size=len(pred_y))
    mrrscores.append(scores[mz.engine.parse_metric('mrr')] * 100)
    mapscores.append(scores[mz.metrics.MeanAveragePrecision()] * 100)
    ndcgscores.append(
        scores[mz.metrics.NormalizedDiscountedCumulativeGain(k=1)] * 100)
model.save(model_path)
preprocessor.save(pre_path)
print("\n>>> Resultat mrr:  %.2f%% (+/- %.2f%%)" %