def _rank_top_answers(answers, scores, top_n):
    """Return up to ``top_n`` ``{'answer', 'score'}`` dicts, best score first."""
    # Ascending stable sort read from the end: among equal scores the answer
    # with the higher index is listed first, exactly as before.
    order = sorted(range(len(scores)), key=lambda k: scores[k])
    best_first = order[::-1][:max(top_n, 0)]
    return [{'answer': answers[k], 'score': str(scores[k])} for k in best_first]


def Ask_Question(question, languege='Persian', top_n=5):
    """Return the stored answers that the model scores highest for a question.

    The question and every stored answer are cleaned with the
    language-specific preprocessing, paired up, and scored by the first
    ``*.h5`` model matched by the language's glob pattern.

    Args:
        question: Raw question text.
        languege: ``'Persian'`` (default) or ``'English'``. The spelling of
            the parameter name is kept so keyword callers keep working.
        top_n: Maximum number of answers to return (default 5).

    Returns:
        A list of at most ``top_n`` dicts ``{'answer': ..., 'score': ...}``
        ordered from highest to lowest score, where ``score`` is the ``str``
        of the prediction. Fewer entries are returned when fewer answers are
        stored, and an empty list when none are. ``None`` is returned for any
        other ``languege`` value.

    Raises:
        IndexError: If no ``*.h5`` file matches the model glob pattern.
    """
    if languege == 'Persian':
        def clean(text):
            return Stemmer_Text(Normalizer_Text(text))

        data_dir = pre_path_files + 'Persian/'
        model_pattern = data_dir + 'Model/*.h5'
    elif languege == 'English':
        clean = Clean_Text
        data_dir = pre_path_files + 'English/'
        # NOTE(review): the English model is looked up under Word2Vec/ while
        # the Persian one lives under Model/ -- confirm this is intentional.
        model_pattern = data_dir + 'Word2Vec/*.h5'
    else:
        # Unsupported languages yield None, as they always have.
        return None

    question = clean(question)
    Answers = load_data(data_dir + 'ListAnswering')
    pairs = [(question, clean(answer)) for answer in Answers]
    if not pairs:
        return []

    tokenizer = load_data(data_dir + 'Word2Vec/tokenizer')
    model = load_model(glob.glob(model_pattern)[0])

    x1, x2, leaks = create_test_data(
        tokenizer, pairs, siamese_config['MAX_SEQUENCE_LENGTH'])
    preds = list(model.predict([x1, x2, leaks], verbose=1).ravel())
    # One score per pair; any surplus prediction values are ignored.
    scores = preds[:len(pairs)]
    return _rank_top_answers(Answers, scores, top_n)
def output(l1, l2):
    """Score a single sentence pair and return the model's prediction.

    Relies on ``tokenizer``, ``model``, ``siamese_config`` and ``itemgetter``
    being defined outside this function.

    Args:
        l1: First sentence of the pair.
        l2: Second sentence of the pair.

    Returns:
        The highest prediction value among the scored pairs (only one pair is
        scored here).
    """
    pair = [(l1, l2)]
    x1, x2, leaks = create_test_data(
        tokenizer, pair, siamese_config['MAX_SEQUENCE_LENGTH'])
    scores = list(model.predict([x1, x2, leaks], verbose=1).ravel())
    ranked = sorted(
        ((first, second, score) for (first, second), score in zip(pair, scores)),
        key=itemgetter(2),
        reverse=True,
    )
    return ranked[0][2]
def test(test_pair):
    """Score sentence pairs and return the best score clamped to random bounds.

    The model is loaded from ``best_model_path`` on every call. The highest
    prediction over ``test_pair`` is printed together with the input, then
    limited to an interval whose upper and lower ends are drawn at random.

    NOTE(review): forcing the score into a randomly drawn interval hides the
    real prediction -- confirm this is intended.

    Args:
        test_pair: Sequence of ``(sentence1, sentence2)`` tuples.

    Returns:
        ``max(min(best_score, upper), lower)`` with ``upper`` drawn from
        [0.28123, 0.3156] and ``lower`` from [0.07682, 0.09354].
    """
    net = load_model(best_model_path)

    # Keep this draw order (upper first) so seeded runs stay reproducible.
    upper = random.uniform(0.28123, 0.3156)
    lower = random.uniform(0.07682, 0.09354)

    x1, x2, leaks = create_test_data(
        tokenizer, test_pair, siamese_config['MAX_SEQUENCE_LENGTH'])
    scores = list(net.predict([x1, x2, leaks], verbose=1).ravel())

    ranked = sorted(
        [(first, second, score) for (first, second), score in zip(test_pair, scores)],
        key=itemgetter(2),
        reverse=True,
    )
    best = ranked[0][2]

    print(test_pair)
    print(best)

    capped = min(best, upper)
    return max(capped, lower)
def get_doc2vec_vectors_train_valid_split(trainingData):
    """Split question-pair data and build training pairs plus validation inputs.

    Args:
        trainingData: Dataset passed to ``get_train_test_split_of_dataframe``.
            The feature splits it returns must expose ``'Q1'`` and ``'Q2'``.

    Returns:
        A 5-tuple:
          * ``sentences_pairs`` -- list of ``(Q1, Q2)`` training tuples,
          * ``is_similar`` -- list of label-encoded training targets,
          * ``test_data_x`` -- ``[x1, x2, leaks]`` as produced by
            ``create_test_data`` for the validation pairs,
          * ``is_similar_validate`` -- list of label-encoded validation targets,
          * ``embedding_meta_data`` -- dict with ``'tokenizer'`` and
            ``'embedding_matrix'``.

    Raises:
        ValueError: If the validation targets contain a label that never
            occurs in the training targets.
    """
    # split the dataset into training and validation datasets
    train_x, valid_x, train_y, valid_y = get_train_test_split_of_dataframe(
        trainingData, False)

    # Fit the encoder on the training labels only and reuse that mapping for
    # validation; refitting on the validation labels could assign different
    # codes to the same label.
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(valid_y)

    # Materialise as lists so that ``+`` below joins the two corpora instead
    # of concatenating the strings element-wise.
    sentences1 = list(train_x['Q1'])
    sentences2 = list(train_x['Q2'])
    is_similar = list(train_y)

    sentences1_validate = list(valid_x['Q1'])
    sentences2_validate = list(valid_x['Q2'])
    is_similar_validate = list(valid_y)

    tokenizer, embedding_matrix = word_embed_meta_data(
        sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix,
    }

    # creating sentence pairs
    sentences_pairs = list(zip(sentences1, sentences2))
    sentences_pairs_validate = list(zip(sentences1_validate, sentences2_validate))

    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, sentences_pairs_validate,
        siamese_config['MAX_SEQUENCE_LENGTH'])
    test_data_x = [test_data_x1, test_data_x2, leaks_test]

    return (sentences_pairs, is_similar, test_data_x, is_similar_validate,
            embedding_meta_data)
def single_pred(x, y):
    """Score one pair of raw sentences and return the result as a percentage.

    Both sentences go through ``pre.preprocessing_single_sentence`` before
    being scored by the model stored at a fixed path; the first prediction is
    printed and then scaled by 100.

    Args:
        x: First raw sentence.
        y: Second raw sentence.

    Returns:
        ``str`` of the first prediction multiplied by 100.
    """
    from keras.models import load_model

    net = load_model("./stored_model/1589893283/lstm_new.h5")

    first = pre.preprocessing_single_sentence(x)
    second = pre.preprocessing_single_sentence(y)
    x1, x2, leaks = create_test_data(
        tokenizer, [(first, second)], siamese_config['MAX_SEQUENCE_LENGTH'])

    score = net.predict([x1, x2, leaks]).ravel()[0]
    print(score)
    return str(score * 100)
CONFIG.rate_drop_lstm, CONFIG.rate_drop_dense, CONFIG.activation_function, CONFIG.validation_split_ratio) best_model_path = siamese.train_model(sentences_pair, is_similar, model_save_directory='./') ####################### ##### Testing ######### ####################### # # model = load_model( # r'/Users/caowenli/Desktop/ml_pj/dl/time_series_similarity/checkpoints/1593418708/lstm_128_64_0.17_0.30.h5') model = load_model(best_model_path) train_data_x1, train_data_x2 = create_test_data(sentences_pair) trian_preds = list( model.predict([train_data_x1, train_data_x1], verbose=1).ravel()) res = [] for i in trian_preds: if i < 0.5: res.append(0) else: res.append(1) print(len(res)) print(res) data['predict'] = np.array(res) data.to_csv("labeled_train_data.csv", index=None) test_data = pd.read_csv('test_data.csv') test_sentences1 = list(test_data['s_link_dir_speed'])
# Top-n retrieval check over ../data/final_hate_dataset_test.csv (tab-separated).
# The 'HS', 'CN' and 'CNtype' columns are read into lists, and a tokenizer and
# embedding matrix are built from the HS and CN sentences together.
# For every HS sentence, each CN sentence is paired with it; the pairs and the
# CN types go through create_test_data and are scored by `model`, which is
# defined outside this fragment. Pairs are sorted by score, highest first, and
# `guess` is incremented when the CN text from the same row is among the
# `top_n` (3) best-scoring candidates.
# The value printed as 'Accuracy' is guess / number of HS sentences.
# NOTE(review): the original line breaks and indentation are missing from this
# line, so the nesting of the trailing print calls cannot be confirmed here.
# NOTE(review): if the CSV has no rows and the 'Accuracy' line is reached, the
# division raises ZeroDivisionError.
df = pd.read_csv('../data/final_hate_dataset_test.csv', sep='\t') hs_sentences = list(df['HS']) cn_sentences = list(df['CN']) cntype = list(df['CNtype']) # remove the cn type # cntype = ['none' for x in cntype] top_n = 3 tokenizer, embedding_matrix = word_embed_meta_data(hs_sentences + cn_sentences, siamese_config['EMBEDDING_DIM']) guess = 0 for i in range(len(hs_sentences)): resp = cn_sentences[i] hs = hs_sentences[i] all_pairs = [(hs, cn) for cn in cn_sentences] test_data_x1, test_data_x2, leaks_test, cntypes = create_test_data(tokenizer, all_pairs, cntype, siamese_config['MAX_SEQUENCE_LENGTH']) preds = list(model.predict([test_data_x1, test_data_x2, leaks_test, cntypes], verbose=1).ravel()) results = [(x, y, z) for (x, y), z in zip(all_pairs, preds)] results.sort(key=itemgetter(2), reverse=True) top = results[:top_n] responses = [x[1] for x in top] if resp in responses: print('Guess') guess += 1 print(hs, resp, top) print('Accuracy', float(guess)/len(hs_sentences))
CONFIG.activation_function, CONFIG.validation_split_ratio) best_model_path = siamese.train_model(sentences_pair, is_similar, embedding_meta_data, model_save_directory='./') ######################## ###### Testing ######### ######################## model = load_model(best_model_path) test_sentence_pairs = [ ('What can make Physics easy to learn?', 'How can you make physics easy to learn?'), ('How many times a day do a clocks hands overlap?', 'What does it mean that every time I look at the clock the numbers are the same?' ) ] test_data_x1, test_data_x2, leaks_test = create_test_data( tokenizer, test_sentence_pairs, siamese_config['MAX_SEQUENCE_LENGTH']) preds = list( model.predict([test_data_x1, test_data_x2, leaks_test], verbose=1).ravel()) results = [(x, y, z) for (x, y), z in zip(test_sentence_pairs, preds)] results.sort(key=itemgetter(2), reverse=True) print(results)
# Read the two question columns of the test set.
df_test = pd.read_csv('data/test_word_char.csv')
sentences_test1 = list(df_test['q1_word'])
sentences_test2 = list(df_test['q2_word'])

## creating sentence pairs
test_sentence_pairs = list(zip(sentences_test1, sentences_test2))
del sentences_test1, sentences_test2

# Hard-coded alternative input (disabled):
#test_sentence_pairs = [('What can make Physics easy to learn?','How can you make physics easy to learn?'),
#                       ('How many times a day do a clocks hands overlap?','What does it mean that every time I look at the clock the numbers are the same?')]

test_data_x1, test_data_x2, leaks_test = create_test_data(
    tokenizer,
    test_sentence_pairs,
    CONFIG.max_sequence_length,
    embedding_index,
    tfidf_dict,
)
result = model.predict(
    [test_data_x1, test_data_x2, leaks_test],
    verbose=1,
    batch_size=1024,
)

# Submit the results: first column of the predictions, written without index.
submit_dir = './submit/'
if not os.path.exists(submit_dir):
    os.makedirs(submit_dir)

submit = pd.DataFrame()
submit['y_pre'] = list(result[:, 0])
submit.to_csv(submit_dir + 'result0618_add_features.csv', index=False)
def _question_text(title, body):
    """Join the cleaned title and the tag-stripped, cleaned body with a space."""
    return review_to_wordlist(title) + " " + review_to_wordlist(strip_tags(body))


def _rerank_candidates(model, tokenizer, test_query, candidates):
    """Score BM25 candidates against the query; return (doc_id, score), best first."""
    # Candidate title/body were already cleaned when the corpus was built; the
    # second cleaning pass is retained so the model sees the same text as
    # before this refactoring.
    pairs = [(test_query, _question_text(title, body))
             for _doc_id, title, body in candidates]
    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, pairs, siamese_config['MAX_SEQUENCE_LENGTH'])
    scores = list(
        model.predict([test_data_x1, test_data_x2, leaks_test],
                      verbose=0).ravel())
    doc_ids = [doc_id for doc_id, _title, _body in candidates]
    return sorted(zip(doc_ids, scores), key=itemgetter(1), reverse=True)


def _evaluate_dataset(fileName, numberOfRelevantQs, outputFile):
    """Train on one dataset, then write accuracy, precision@k and MAP to the file."""
    df_for_file = get_df_from_csv_file(fileName)
    (train_x, train_y, valid_x, valid_y,
     embedding_meta_data) = get_doc2vec_vectors_train_valid_split(df_for_file)
    model_path = train_model(train_x, train_y, embedding_meta_data, fileName)
    siamese_lstm_model = load_model(model_path)
    _preds, accuracy = evaluate_model(siamese_lstm_model, valid_x, valid_y)
    print('Classification Accuracy for : ' + fileName + ' Siamese LSTM ' +
          str(accuracy[1]),
          file=outputFile)
    tokenizer = embedding_meta_data['tokenizer']

    df = pd.read_json('../data/json/' + str(fileName) + '_questions.json',
                      orient='index')
    df['QuestionID'] = df.index
    df = df[['QuestionID', 'title', 'body', 'dups']]
    numberOfTrainData = df['QuestionID'].size

    # BM25 indexes the cleaned titles only; each hit maps back to
    # (question id as str, cleaned title, cleaned body).
    corpus_to_query = [
        (str(qid), review_to_wordlist(title),
         review_to_wordlist(strip_tags(body)))
        for qid, title, body in zip(df['QuestionID'], df['title'], df['body'])
    ]
    tokenized_corpus = [title.split(" ") for _qid, title, _body in corpus_to_query]
    bm25 = BM25Okapi(tokenized_corpus)

    foundDupsAll = 0
    queriesWithDuplicates = 0
    sumOfAveragePrecision = 0.0
    precisionAtSum = 0
    for title, body, dupids in zip(df['title'], df['body'], df['dups']):
        # Only questions with annotated duplicates serve as queries.
        if len(dupids) == 0:
            continue
        queriesWithDuplicates += 1

        test_query = _question_text(title, body)
        candidate_docs_bm25 = bm25.get_top_n(test_query.split(" "),
                                             corpus_to_query,
                                             n=10)
        ranked = _rerank_candidates(siamese_lstm_model, tokenizer, test_query,
                                    candidate_docs_bm25)

        foundDups = 0
        apForQuery = 0.0
        # Slicing copes with fewer candidates than numberOfRelevantQs, which
        # used to raise IndexError on small corpora.
        for rank, (similarDocId, _score) in enumerate(
                ranked[:numberOfRelevantQs], start=1):
            if similarDocId in dupids:
                foundDups += 1
                apForQuery += foundDups / rank

        if foundDups > 0:
            sumOfAveragePrecision += apForQuery / len(dupids)
        precisionAtSum += foundDups / numberOfRelevantQs
        foundDupsAll += foundDups

    # With no query at all, report 0.0 rather than dividing by zero.
    if queriesWithDuplicates:
        precisionAtK = precisionAtSum / queriesWithDuplicates
        meanAveragePrecision = sumOfAveragePrecision / queriesWithDuplicates
    else:
        precisionAtK = 0.0
        meanAveragePrecision = 0.0

    print(str(fileName) + " Train Data Count : " + str(numberOfTrainData),
          file=outputFile)
    print(str(fileName) + " Relevant Query Count (Test) : " +
          str(queriesWithDuplicates),
          file=outputFile)
    print(str(fileName) + " Found within " + str(numberOfRelevantQs) +
          " Count : " + str(foundDupsAll),
          file=outputFile)
    print(str(fileName) + " Precision @ " + str(numberOfRelevantQs) + " : " +
          str(precisionAtK),
          file=outputFile)
    print(str(fileName) + " MAP : " + str(meanAveragePrecision),
          file=outputFile)


def main():
    """Train and evaluate a duplicate-question ranker for each listed dataset.

    For every dataset name in the hard-coded list this trains a model,
    appends its classification accuracy to ``../data/output/result.txt``,
    indexes the question titles with BM25, and for each question that has
    annotated duplicates re-ranks the BM25 top-10 candidates with the trained
    model. The hit count, precision@10 and MAP are appended to the same file.

    The result file is opened in append mode once per dataset and is closed
    even if processing that dataset raises.
    """
    numberOfRelevantQs = 10
    # Other dataset names previously listed here: 'android', 'english',
    # 'gaming', 'gis', 'mathematica', 'physics', 'programmers'.
    fileNameList = ['stats', 'tex', 'unix', 'webmasters', 'wordpress']

    for fileName in fileNameList:
        with open('../data/output/result.txt', 'a') as outputFile:
            _evaluate_dataset(str(fileName), numberOfRelevantQs, outputFile)
        # The per-dataset model, frame and index went out of scope with the
        # helper call; collect them before the next dataset is loaded.
        gc.collect()