Example #1
def predict(self, test_pack_processed):
    # restore the trained model from disk
    model = mz.load_model(self.modeldir)
    test_generator = mz.DPoolDataGenerator(test_pack_processed,
                                           fixed_length_left=20,
                                           fixed_length_right=20,
                                           batch_size=20)
    # slicing the generator with [:] materializes the full input dict and labels
    pred_x, pred_y = test_generator[:]
    # score everything in a single batch
    predict_value = model.predict(pred_x, batch_size=len(pred_y))
    return predict_value
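
A hypothetical call site for this method (the Ranker wrapper class, its modeldir, and the preprocessor path are assumptions; only predict itself comes from the example):

# minimal sketch, assuming a wrapper class that stores the model directory
ranker = Ranker(modeldir='./models/matchpyramid')  # hypothetical wrapper class
pre = mz.engine.base_preprocessor.load_preprocessor('./preprocessors/matchpyramid')  # assumed path
test_pack_processed = pre.transform(test_pack)  # test_pack built earlier via mz.pack(...)
predictions = ranker.predict(test_pack_processed)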
Example #2
def predict_similarity_scores(tweets, q, n_top):
    keras.backend.clear_session()
    # load the saved preprocessor and model
    pre = mz.engine.base_preprocessor.load_preprocessor('./search/Preprocessors/MatchPy_full_fasttext')
    model = mz.engine.base_model.load_model('./search/Models/MatchPy_full_fasttext')

    tokenizer = RegexpTokenizer(r'\w+')
    # build one (query, tweet) pair per tweet
    rows = []
    for _, tweet in tweets.iterrows():
        rows.append({
            'id_left': 1,
            'text_left': q,
            'id_right': tweet['id_str'],
            'text_right': tweet['processed_text'],
            'label': 1
        })
    if len(rows) == 0:
        return None
    data = pd.DataFrame.from_dict(rows)
    data_pack = mz.pack(data)
    del data
    data_pack.relation['label'] = data_pack.relation['label'].astype('float32')
    predict_pack_processed = pre.transform(data_pack)
    predict_generator = mz.DPoolDataGenerator(predict_pack_processed,
                                              fixed_length_left=10,
                                              fixed_length_right=40,
                                              batch_size=20)
    logging.info('\n# Predicting...\n')
    pred_x, pred_y = predict_generator[:]
    predictions = model.predict(pred_x)
    del data_pack
    # map each prediction back to its tweet via pred_x's id_right column
    tweets.set_index('id_str', inplace=True)
    tweets['score'] = 0
    x = pd.DataFrame(pred_x, columns=['id_left', 'id_right'])
    for i, (_, row) in enumerate(x.iterrows()):
        tweets.loc[row['id_right'], 'score'] = predictions[i][0]
    del x
    tweets = tweets.sort_values(['score'], ascending=[False])
    # drop negative scores
    logging.info('\n# Prediction [OK]\n')
    return tweets
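
The "drop negative scores" step is only noted as a comment above; a one-line sketch of that filter, applied before the return (not part of the original):

tweets = tweets[tweets['score'] > 0]  # keep only positively scored tweets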
Example #3
        model.params['dropout_rate'] = 0.4
        model.guess_and_fill_missing_params()
        model.build()
        model.compile()
        model.backend.summary()
        matrix = embedding.build_matrix(
            preprocessor.context['vocab_unit'].state['term_index'])
        model.load_embedding_matrix(matrix)
        train_generator = mz.DPoolPairDataGenerator(train_pack_processed,
                                                    fixed_length_left=10,
                                                    fixed_length_right=128,
                                                    num_dup=2,
                                                    num_neg=1,
                                                    batch_size=20)
        predict_generator = mz.DPoolDataGenerator(predict_pack_processed,
                                                  fixed_length_left=10,
                                                  fixed_length_right=128,
                                                  batch_size=20)
        pred_x, pred_y = predict_generator[:]
        # Fit the model
        evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                                   x=pred_x,
                                                   y=pred_y,
                                                   batch_size=len(pred_y))
        history = model.fit_generator(train_generator,
                                      epochs=30,
                                      callbacks=[evaluate],
                                      workers=30,
                                      use_multiprocessing=True)

        # Evaluate the model
        scores = model.evaluate(pred_x, pred_y, batch_size=len(pred_y))
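
For a quick look at convergence, the history object returned by fit_generator can be plotted; a minimal sketch, assuming matplotlib is available (it is not imported in the original):

import matplotlib.pyplot as plt

plt.plot(history.history['loss'])  # per-epoch training loss recorded by Keras
plt.xlabel('epoch')
plt.ylabel('rank hinge loss')
plt.savefig('training_loss.png')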
Example #4
def matchpyramid_api(qpool, logdir, dataset_path, train_id, parameter):
    keras.backend.clear_session()
    # load the data and create the preprocessor object
    train_pack = load_train_data(train_id, parameter['existing_dataset'], parameter['task'])
    predict_pack = load_test_data(train_id, parameter['existing_dataset'], parameter['task'])
    preprocessor = mz.preprocessors.BasicPreprocessor(fixed_length_left=10, fixed_length_right=40,
                                                      remove_stop_words=True)
    # redirect stderr to the log file
    logdir.set_preprocess_id(train_id)
    err_old = sys.stderr
    sys.stderr = logdir
    # preprocessor.fit writes its progress to the log; afterwards restore stderr and save the preprocessor
    train_pack_processed = preprocessor.fit_transform(train_pack)
    sys.stderr = err_old
    preprocessor.save(ROOT_PATH + 'matchzoo_temp_files/preprocessors/' + train_id + '.matchpyramid_preprocessor')
    predict_pack_processed = preprocessor.transform(predict_pack)
    with open(ROOT_PATH + 'matchzoo_temp_files/logger/' + train_id + '.preprocess_log', 'a') as f:
        f.write('Preprocess finished!')
    ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss())
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
        mz.metrics.MeanAveragePrecision()
    ]
    model = mz.models.MatchPyramid()
    model.params['input_shapes'] = preprocessor.context['input_shapes']
    model.params['task'] = ranking_task
    model.params['embedding_input_dim'] = preprocessor.context['vocab_size']
    model.params['embedding_output_dim'] = parameter['embedding_output_dim']
    model.params['embedding_trainable'] = True
    model.params['num_blocks'] = parameter['num_blocks']
    model.params['kernel_count'] = [16, 32]
    model.params['kernel_size'] = [[3, 3], [3, 3]]
    model.params['dpool_size'] = [3, 10]
    model.params['optimizer'] = 'adam'
    model.params['dropout_rate'] = 0.1
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.backend.summary()
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
    embedding_matrix = glove_embedding.build_matrix(preprocessor.context['vocab_unit'].state['term_index'])
    model.load_embedding_matrix(embedding_matrix)

    train_generator = mz.DPoolPairDataGenerator(train_pack_processed,
                                                fixed_length_left=10,
                                                fixed_length_right=40,
                                                num_dup=2,
                                                num_neg=1,
                                                batch_size=20)
    predict_generator = mz.DPoolDataGenerator(predict_pack_processed,
                                              fixed_length_left=10,
                                              fixed_length_right=40,
                                              batch_size=20)
    pred_x, pred_y = predict_generator[:]
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y, batch_size=len(pred_y))
    # redirect stdout into the log during training
    qpool.set_trainid(train_id)
    old = sys.stdout
    sys.stdout = qpool
    model.fit_generator(train_generator, epochs=parameter['epochs'], callbacks=[evaluate], workers=5, use_multiprocessing=False)
    sys.stdout = old
    model.save(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + '.matchpyramid_model')
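
The saved artifacts can later be restored with the loaders used in Examples #2 and #5; the paths simply mirror the save calls above:

pre = mz.engine.base_preprocessor.load_preprocessor(
    ROOT_PATH + 'matchzoo_temp_files/preprocessors/' + train_id + '.matchpyramid_preprocessor')
model = mz.engine.base_model.load_model(
    ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + '.matchpyramid_model')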
Example #5
def predict_similarity_scores(processed_tweets, n_top):
    tokenizer = RegexpTokenizer(r'\w+')

    pre = mz.engine.base_preprocessor.load_preprocessor(
        '/data/Preprocessors/MatchPy/MatchPy_full_2015-2016_fasttext_CBOW')
    model = mz.engine.base_model.load_model(
        '/data/Models/MatchPy/MatchPy_full_2015-2016_fasttext_CBOW')

    tweets = pd.DataFrame.from_dict(processed_tweets)
    del processed_tweets
    tweets.drop_duplicates(subset='tweetid', keep='first', inplace=True)
    tweets.set_index('tweetid', inplace=True)
    # Retrieve topics
    #with open('/home/lila/CORPUS/left/left_2017.json') as f:
    with open(
            "/data/CORPUS/left/left_2017_nist_evaluated_qrels_real.json") as f:
        topics = json.load(f)
    rows = []
    for topic in topics:
        for id, tweet in tweets.iterrows():
            #overlap=set(tokenizer.tokenize(topic['title'])) & set(tokenizer.tokenize(tweet['text']))
            #if len(overlap)>0:
            rows += [{
                'id_left': topic['topid'],
                'text_left': topic['title'],
                'id_right': str(id),
                'text_right': tweet['text'],
                'label': 0
            }]
    if len(rows) == 0:
        return None
    data = pd.DataFrame.from_dict(rows)
    data_pack = mz.pack(data)
    del data
    data_pack.relation['label'] = data_pack.relation['label'].astype('float32')
    predict_pack_processed = pre.transform(data_pack)
    predict_generator = mz.DPoolDataGenerator(predict_pack_processed,
                                              fixed_length_left=10,
                                              fixed_length_right=128,
                                              batch_size=20)
    pred_x, pred_y = predict_generator[:]
    predictions = model.predict(pred_x)
    del data_pack
    scores = []
    x = pd.DataFrame(pred_x, columns=['id_left', 'id_right'])
    for i, (_, row) in enumerate(x.iterrows()):
        score = {
            "id_left": row['id_left'],
            "id_right": row['id_right'],
            "text": tweets.loc[row['id_right'], 'text'],
            "created_at": tweets.loc[row['id_right'], 'created_at'],
            "score": predictions[i][0]
        }
        scores.append(score)
    del x
    pred_table = pd.DataFrame.from_dict(scores)
    del scores
    pred_table = pred_table.sort_values(['id_left', 'score'],
                                        ascending=[True, False])
    pred_table.set_index('id_left', inplace=True)
    # keep only the n_top highest-scoring tweets per topic
    topics = set(pred_table.index.values)
    df = pd.DataFrame()
    for topic in topics:
        df = pd.concat([df, pred_table.loc[topic, :].head(n_top)], sort=True)
    del pred_table
    del topics
    return df
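
A hypothetical call site (the field names follow the columns the function reads; the values are made up):

processed_tweets = [
    {'tweetid': '1', 'text': 'flood relief volunteers needed', 'created_at': '2017-08-01'},
    {'tweetid': '2', 'text': 'road closures after the storm', 'created_at': '2017-08-02'},
]
top_per_topic = predict_similarity_scores(processed_tweets, n_top=10)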
Example #6
    def model_build(preprocessor, train_pack_processed, valid_pack_processed):
        # model: ranking task and metrics
        ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss())
        ranking_task.metrics = [
            mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
            mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
            mz.metrics.MeanAveragePrecision()
        ]

        model = mz.models.MatchPyramid()
        model.params['input_shapes'] = preprocessor.context['input_shapes']
        model.params['task'] = ranking_task
        model.params['embedding_input_dim'] = preprocessor.context[
            'vocab_size']
        model.params['embedding_output_dim'] = 300
        model.params['embedding_trainable'] = True
        model.params['num_blocks'] = 1
        model.params['kernel_count'] = [64]
        model.params['kernel_size'] = [[3, 3]]
        model.params['dpool_size'] = [3, 10]
        model.params['optimizer'] = 'adam'
        model.params['dropout_rate'] = 0.1
        model.guess_and_fill_missing_params()
        model.build()
        model.compile()
        model.backend.summary()
        assert model.params.completed()  # sanity check: every parameter is set
        print(model.params)

        # pre-train embedding
        save_embedding(embeddingdir,
                       preprocessor.context['vocab_unit'].state['term_index'],
                       embeddingmatricdir)

        embed_dict = read_embedding(filename=embeddingmatricdir)
        embedding_matrix = build_matrix(embed_dict,
                                        preprocessor.context['vocab_size'],
                                        embed_size=300)
        # for online prediction, load this embedding matrix into memory
        model.load_embedding_matrix(embedding_matrix)

        # training
        train_generator = mz.DPoolPairDataGenerator(train_pack_processed,
                                                    fixed_length_left=20,
                                                    fixed_length_right=20,
                                                    num_dup=2,
                                                    num_neg=1,
                                                    batch_size=20)

        valid_generator = mz.DPoolDataGenerator(valid_pack_processed,
                                                fixed_length_left=20,
                                                fixed_length_right=20,
                                                batch_size=20)

        pred_x, pred_y = valid_generator[:]
        evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                                   x=pred_x,
                                                   y=pred_y,
                                                   batch_size=len(pred_y))
        history = model.fit_generator(train_generator,
                                      epochs=20,
                                      callbacks=[evaluate],
                                      verbose=2,
                                      workers=30,
                                      use_multiprocessing=True)
        model.save(modeldir)
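
save_embedding, read_embedding, and build_matrix are project helpers, not MatchZoo APIs. A plausible minimal build_matrix, assuming embed_dict maps term indices to vectors, might look like:

import numpy as np

def build_matrix(embed_dict, vocab_size, embed_size=300):
    # hypothetical helper: terms without a pre-trained vector keep small random values
    matrix = np.random.uniform(-0.2, 0.2, (vocab_size, embed_size))
    for term_index, vector in embed_dict.items():
        matrix[term_index] = np.asarray(vector, dtype='float32')
    return matrix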