import matchzoo as mz


def predict(self, test_pack_processed):
    # Restore the trained model from disk.
    model = mz.load_model(self.modeldir)
    # Pad both sides to the fixed lengths the model was trained with.
    test_generator = mz.DPoolDataGenerator(test_pack_processed,
                                           fixed_length_left=20,
                                           fixed_length_right=20,
                                           batch_size=20)
    pred_x, pred_y = test_generator[:]
    # Score the whole test set in a single batch.
    predict_value = model.predict(pred_x, batch_size=len(pred_y))
    return predict_value
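# A minimal sketch of how the processed pack passed to predict() is usually
# produced in MatchZoo 2.x, using the same pack/transform calls that appear
# in the functions below; the preprocessor path and the raw column values
# here are illustrative assumptions.
import pandas as pd

raw = pd.DataFrame([{
    'id_left': 1, 'text_left': 'query text',
    'id_right': 't1', 'text_right': 'candidate text',
    'label': 1.0,
}])
pre = mz.engine.base_preprocessor.load_preprocessor('./preprocessor_dir')  # hypothetical path
test_pack_processed = pre.transform(mz.pack(raw))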
import logging

import keras
import matchzoo as mz
import pandas as pd
from nltk.tokenize import RegexpTokenizer


def predict_similarity_scores(tweets, q, n_top):
    keras.backend.clear_session()
    # Load the saved preprocessor and the trained model.
    pre = mz.engine.base_preprocessor.load_preprocessor(
        './search/Preprocessors/MatchPy_full_fasttext')
    model = mz.engine.base_model.load_model(
        './search/Models/MatchPy_full_fasttext')
    tokenizer = RegexpTokenizer(r'\w+')
    # Pair the query q with every candidate tweet.
    rows = []
    for id, tweet in tweets.iterrows():
        rows += [{
            'id_left': 1,
            'text_left': q,
            'id_right': tweet['id_str'],
            'text_right': tweet['processed_text'],
            'label': 1
        }]
    if len(rows) == 0:
        return None
    data = pd.DataFrame.from_dict(rows)
    data_pack = mz.pack(data)
    del data
    data_pack.relation['label'] = data_pack.relation['label'].astype('float32')
    predict_pack_processed = pre.transform(data_pack)
    predict_generator = mz.DPoolDataGenerator(predict_pack_processed,
                                              fixed_length_left=10,
                                              fixed_length_right=40,
                                              batch_size=20)
    logging.info('\n# Predicting...\n')
    pred_x, pred_y = predict_generator[:]
    predictions = model.predict(pred_x)
    del data_pack
    # Map each prediction back to its tweet id.
    i = 0
    tweets.set_index('id_str', inplace=True)
    tweets['score'] = 0
    x = pd.DataFrame(pred_x, columns=['id_left', 'id_right'])
    for index, row in x.iterrows():
        tweets.loc[row['id_right'], 'score'] = predictions[i][0]
        i += 1
    del x
    tweets = tweets.sort_values(['score'], ascending=[False])  # eliminate negative scores
    logging.info('\n# Prediction [OK]\n')
    return tweets
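# A hedged usage sketch for predict_similarity_scores(); the required
# columns ('id_str', 'processed_text') come from the function body above,
# but the values and the query are made up, and the saved model files must
# exist at the paths hard-coded in the function.
tweets = pd.DataFrame([
    {'id_str': '42', 'processed_text': 'earthquake hits the coast'},
    {'id_str': '43', 'processed_text': 'new phone released today'},
])
# ranked = predict_similarity_scores(tweets, q='earthquake damage', n_top=10)
# 'ranked' is the same frame indexed by id_str, sorted by descending 'score'.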
# Training-script fragment: set the remaining hyper-parameters, build and
# compile the model, then train with pairwise generators.
model.params['dropout_rate'] = 0.4
model.guess_and_fill_missing_params()
model.build()
model.compile()
model.backend.summary()

# Build the embedding matrix from the preprocessor vocabulary
# ('embedding' is a MatchZoo Embedding loaded earlier in the script).
matrix = embedding.build_matrix(
    preprocessor.context['vocab_unit'].state['term_index'])
model.load_embedding_matrix(matrix)

# Pair generator for rank-hinge training; plain generator for evaluation.
train_generator = mz.DPoolPairDataGenerator(train_pack_processed,
                                            fixed_length_left=10,
                                            fixed_length_right=128,
                                            num_dup=2,
                                            num_neg=1,
                                            batch_size=20)
predict_generator = mz.DPoolDataGenerator(predict_pack_processed,
                                          fixed_length_left=10,
                                          fixed_length_right=128,
                                          batch_size=20)
pred_x, pred_y = predict_generator[:]

# Fit the model, evaluating all ranking metrics after each epoch.
evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                           x=pred_x,
                                           y=pred_y,
                                           batch_size=len(pred_y))
history = model.fit_generator(train_generator,
                              epochs=30,
                              callbacks=[evaluate],
                              workers=30,
                              use_multiprocessing=True)

# Evaluate the model on the held-out pack.
scores = model.evaluate(pred_x, pred_y, batch_size=len(pred_y))
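# A sketch of where the 'embedding' object used above typically comes from.
# The bundled GloVe loader is the same call used in matchpyramid_api() below;
# the custom-file alternative and its path are assumptions.
embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
# or, from a local word2vec/fastText-format file (hypothetical path):
# embedding = mz.embedding.load_from_file('vectors.vec', mode='word2vec')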
import sys

import keras
import matchzoo as mz


def matchpyramid_api(qpool, logdir, dataset_path, train_id, parameter):
    keras.backend.clear_session()
    # Load the data and create the preprocessor object.
    train_pack = load_train_data(train_id, parameter['existing_dataset'],
                                 parameter['task'])
    predict_pack = load_test_data(train_id, parameter['existing_dataset'],
                                  parameter['task'])
    preprocessor = mz.preprocessors.BasicPreprocessor(fixed_length_left=10,
                                                      fixed_length_right=40,
                                                      remove_stop_words=True)
    # Redirect stderr to the log file.
    logdir.set_preprocess_id(train_id)
    err_old = sys.stderr
    sys.stderr = logdir
    # Write the preprocessor.fit output to the log, close the redirection
    # when done, and save the preprocessor.
    train_pack_processed = preprocessor.fit_transform(train_pack)
    sys.stderr = err_old
    preprocessor.save(ROOT_PATH + 'matchzoo_temp_files/preprocessors/' +
                      train_id + '.matchpyramid_preprocessor')
    predict_pack_processed = preprocessor.transform(predict_pack)
    with open(ROOT_PATH + 'matchzooo_temp_files/logger/'.replace('zoo' + 'o', 'zoo') +
              train_id + '.preprocess_log', 'a') as f:
        f.write('Preprocess finished!')
    # Ranking task with hinge loss and NDCG/MAP metrics.
    ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss())
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
        mz.metrics.MeanAveragePrecision()
    ]
    model = mz.models.MatchPyramid()
    model.params['input_shapes'] = preprocessor.context['input_shapes']
    model.params['task'] = ranking_task
    model.params['embedding_input_dim'] = preprocessor.context['vocab_size']
    model.params['embedding_output_dim'] = parameter['embedding_output_dim']
    model.params['embedding_trainable'] = True
    model.params['num_blocks'] = parameter['num_blocks']
    model.params['kernel_count'] = [16, 32]
    model.params['kernel_size'] = [[3, 3], [3, 3]]
    model.params['dpool_size'] = [3, 10]
    model.params['optimizer'] = 'adam'
    model.params['dropout_rate'] = 0.1
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.backend.summary()
    # Initialise the embedding layer with pre-trained GloVe vectors.
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
    embedding_matrix = glove_embedding.build_matrix(
        preprocessor.context['vocab_unit'].state['term_index'])
    model.load_embedding_matrix(embedding_matrix)
    train_generator = mz.DPoolPairDataGenerator(train_pack_processed,
                                                fixed_length_left=10,
                                                fixed_length_right=40,
                                                num_dup=2,
                                                num_neg=1,
                                                batch_size=20)
    predict_generator = mz.DPoolDataGenerator(predict_pack_processed,
                                              fixed_length_left=10,
                                              fixed_length_right=40,
                                              batch_size=20)
    pred_x, pred_y = predict_generator[:]
    evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                               x=pred_x,
                                               y=pred_y,
                                               batch_size=len(pred_y))
    # Redirect stdout into the log while training.
    qpool.set_trainid(train_id)
    old = sys.stdout
    sys.stdout = qpool
    model.fit_generator(train_generator,
                        epochs=parameter['epochs'],
                        callbacks=[evaluate],
                        workers=5,
                        use_multiprocessing=False)
    sys.stdout = old
    model.save(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id +
               '.matchpyramid_model')
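# A hedged sketch of the 'parameter' dict matchpyramid_api() expects. The
# keys are exactly those read in the function body; the values are
# illustrative. qpool and logdir must be writable, file-like objects (they
# are swapped in for stdout/stderr), and ROOT_PATH, load_train_data and
# load_test_data are assumed to be defined elsewhere in the module.
parameter = {
    'existing_dataset': 'wiki_qa',   # illustrative value
    'task': 'ranking',               # illustrative value
    'embedding_output_dim': 300,
    'num_blocks': 2,
    'epochs': 10,
}
# matchpyramid_api(qpool, logdir, dataset_path, 'train_001', parameter)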
import json

import matchzoo as mz
import pandas as pd
from nltk.tokenize import RegexpTokenizer


def predict_similarity_scores(processed_tweets, n_top):
    tokenizer = RegexpTokenizer(r'\w+')
    # Load the saved preprocessor and the trained model.
    pre = mz.engine.base_preprocessor.load_preprocessor(
        '/data/Preprocessors/MatchPy/MatchPy_full_2015-2016_fasttext_CBOW')
    model = mz.engine.base_model.load_model(
        '/data/Models/MatchPy/MatchPy_full_2015-2016_fasttext_CBOW')
    tweets = pd.DataFrame.from_dict(processed_tweets)
    del processed_tweets
    tweets.drop_duplicates(subset='tweetid', keep='first', inplace=True)
    tweets.set_index('tweetid', inplace=True)
    # Retrieve topics.
    # with open('/home/lila/CORPUS/left/left_2017.json') as f:
    with open(
            '/data/CORPUS/left/left_2017_nist_evaluated_qrels_real.json') as f:
        topics = json.load(f)
    # Pair every topic with every tweet.
    rows = []
    for topic in topics[:]:
        for id, tweet in tweets.iterrows():
            # Optional token-overlap filter, disabled here:
            # overlap = set(tokenizer.tokenize(topic['title'])) & \
            #     set(tokenizer.tokenize(tweet['text']))
            # if len(overlap) > 0:
            rows += [{
                'id_left': topic['topid'],
                'text_left': topic['title'],
                'id_right': str(id),
                'text_right': tweet['text'],
                'label': 0
            }]
    if len(rows) == 0:
        return None
    data = pd.DataFrame.from_dict(rows)
    data_pack = mz.pack(data)
    del data
    data_pack.relation['label'] = data_pack.relation['label'].astype('float32')
    predict_pack_processed = pre.transform(data_pack)
    predict_generator = mz.DPoolDataGenerator(predict_pack_processed,
                                              fixed_length_left=10,
                                              fixed_length_right=128,
                                              batch_size=20)
    pred_x, pred_y = predict_generator[:]
    predictions = model.predict(pred_x)
    del data_pack
    # Collect one scored row per (topic, tweet) pair.
    i = 0
    scores = []
    x = pd.DataFrame(pred_x, columns=['id_left', 'id_right'])
    for index, row in x.iterrows():
        score = {
            'id_left': row['id_left'],
            'id_right': row['id_right'],
            'text': tweets.loc[row['id_right'], 'text'],
            'created_at': tweets.loc[row['id_right'], 'created_at'],
            'score': predictions[i][0]
        }
        scores.append(score)
        i += 1
    del x
    pred_table = pd.DataFrame.from_dict(scores)
    del scores
    # Keep the n_top highest-scoring tweets per topic.
    pred_table = pred_table.sort_values(['id_left', 'score'],
                                        ascending=[True, False])
    pred_table.set_index('id_left', inplace=True)
    topics = set(pred_table.index.values)
    df = pd.DataFrame()
    for topic in topics:
        df = pd.concat([df, pred_table.loc[topic, :].head(n_top)], sort=True)
    del pred_table
    del topics
    return df
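# Hedged usage sketch; the tweet fields ('tweetid', 'text', 'created_at')
# match those read in the function above, but the values are made up and
# the model/topic files must exist at the paths hard-coded in the function.
processed_tweets = [
    {'tweetid': '1', 'text': 'flood warning issued',
     'created_at': '2017-07-25'},
    {'tweetid': '2', 'text': 'concert tickets on sale',
     'created_at': '2017-07-26'},
]
# top_per_topic = predict_similarity_scores(processed_tweets, n_top=10)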
import matchzoo as mz


def model_build(preprocessor, train_pack_processed, valid_pack_processed):
    # Model: ranking task with hinge loss and NDCG/MAP metrics.
    ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss())
    ranking_task.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
        mz.metrics.MeanAveragePrecision()
    ]
    model = mz.models.MatchPyramid()
    model.params['input_shapes'] = preprocessor.context['input_shapes']
    model.params['task'] = ranking_task
    model.params['embedding_input_dim'] = preprocessor.context['vocab_size']
    model.params['embedding_output_dim'] = 300
    model.params['embedding_trainable'] = True
    model.params['num_blocks'] = 1
    model.params['kernel_count'] = [64]
    model.params['kernel_size'] = [[3, 3]]
    model.params['dpool_size'] = [3, 10]
    model.params['optimizer'] = 'adam'
    model.params['dropout_rate'] = 0.1
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.backend.summary()
    model.params.completed()
    print(model.params)

    # Pre-trained embedding: dump the vocabulary's vectors to disk, read
    # them back, and build the embedding matrix (load this file in memory
    # for online prediction).
    save_embedding(embeddingdir,
                   preprocessor.context['vocab_unit'].state['term_index'],
                   embeddingmatricdir)
    embed_dict = read_embedding(filename=embeddingmatricdir)
    embedding_matrix = build_matrix(embed_dict,
                                    preprocessor.context['vocab_size'],
                                    embed_size=300)
    model.load_embedding_matrix(embedding_matrix)

    # Training: pair generator for training, plain generator for validation.
    train_generator = mz.DPoolPairDataGenerator(train_pack_processed,
                                                fixed_length_left=20,
                                                fixed_length_right=20,
                                                num_dup=2,
                                                num_neg=1,
                                                batch_size=20)
    valid_generator = mz.DPoolDataGenerator(valid_pack_processed,
                                            fixed_length_left=20,
                                            fixed_length_right=20,
                                            batch_size=20)
    pred_x, pred_y = valid_generator[:]
    evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                               x=pred_x,
                                               y=pred_y,
                                               batch_size=len(pred_y))
    history = model.fit_generator(train_generator,
                                  epochs=20,
                                  callbacks=[evaluate],
                                  verbose=2,
                                  workers=30,
                                  use_multiprocessing=True)
    model.save(modeldir)
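# Hedged end-to-end sketch for model_build(). The BasicPreprocessor settings
# mirror the fixed lengths used in the generators above; train_df and
# valid_df are hypothetical raw DataFrames with the usual pack columns
# (id_left, text_left, id_right, text_right, label), and embeddingdir,
# embeddingmatricdir and modeldir are assumed module-level paths.
preprocessor = mz.preprocessors.BasicPreprocessor(fixed_length_left=20,
                                                  fixed_length_right=20,
                                                  remove_stop_words=True)
# train_pack_processed = preprocessor.fit_transform(mz.pack(train_df))
# valid_pack_processed = preprocessor.transform(mz.pack(valid_df))
# model_build(preprocessor, train_pack_processed, valid_pack_processed)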