Пример #1
0
def _rank_top_answers(question, Answers, clean_text, tokenizer, model, top_k=5):
    """Score every candidate answer against ``question`` and keep the best.

    Args:
        question: Already pre-processed question text.
        Answers: Indexable collection of raw candidate answers.
        clean_text: Callable applied to each raw answer before scoring.
        tokenizer: Tokenizer object forwarded to ``create_test_data``.
        model: Loaded model exposing ``predict``.
        top_k: Maximum number of answers to return.

    Returns:
        A list of at most ``top_k`` dicts ``{'answer': <raw answer>,
        'score': <str of the predicted score>}``, highest score first.
    """
    test_sentence_pairs = [(question, clean_text(answer)) for answer in Answers]
    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, test_sentence_pairs, siamese_config['MAX_SEQUENCE_LENGTH'])
    preds = list(
        model.predict([test_data_x1, test_data_x2, leaks_test],
                      verbose=1).ravel())

    # Ascending stable sort followed by a reversal keeps the tie order of the
    # previous implementation (among equal scores the later answer wins).
    ranked = sorted(range(len(preds)), key=preds.__getitem__)
    ranked.reverse()

    # Slicing (instead of a fixed ``range(5)``) copes with short answer lists.
    return [{'answer': Answers[idx], 'score': str(preds[idx])}
            for idx in ranked[:top_k]]


def Ask_Question(question,languege='Persian'):
    """Return the stored answers that best match ``question``.

    Args:
        question: Raw question text.
        languege: ``'Persian'`` (default) or ``'English'``. The parameter
            name keeps its historical spelling so keyword callers still work.

    Returns:
        A list of up to five dicts with keys ``'answer'`` and ``'score'``
        (score rendered as a string), best match first. An empty list when
        no answers are stored. ``None`` for any other ``languege`` value.

    Raises:
        IndexError: if no ``*.h5`` model file matches the language's pattern.
    """
    if languege == 'Persian':
        data_dir = pre_path_files + 'Persian/'
        model_pattern = data_dir + 'Model/*.h5'

        def clean_text(text):
            return Stemmer_Text(Normalizer_Text(text))
    elif languege == 'English':
        data_dir = pre_path_files + 'English/'
        # NOTE(review): the English model is looked up under Word2Vec/, not
        # Model/ as for Persian. Kept as in the original; confirm the layout.
        model_pattern = data_dir + 'Word2Vec/*.h5'
        clean_text = Clean_Text
    else:
        # Unsupported languages have always produced None; keep that contract.
        return None

    question = clean_text(question)
    Answers = load_data(data_dir + 'ListAnswering')
    if len(Answers) == 0:
        # Nothing to rank; avoid loading the model for an empty result.
        return []

    tokenizer = load_data(data_dir + 'Word2Vec/tokenizer')
    # The first matching file is used, exactly as before.
    model = load_model(glob.glob(model_pattern)[0])

    return _rank_top_answers(question, Answers, clean_text, tokenizer, model)
Пример #2
0
def output(l1, l2):
    """Return the model's predicted score for the sentence pair (l1, l2).

    Relies on the module-level ``tokenizer``, ``model`` and
    ``siamese_config`` objects.
    """
    pair_list = [(l1, l2)]
    left_input, right_input, leak_input = create_test_data(
        tokenizer, pair_list, siamese_config['MAX_SEQUENCE_LENGTH'])

    raw_scores = model.predict([left_input, right_input, leak_input],
                               verbose=1)
    scores = list(raw_scores.ravel())

    # Attach each score to its pair and order best-first.
    ranked = sorted(
        ((first, second, score)
         for (first, second), score in zip(pair_list, scores)),
        key=itemgetter(2),
        reverse=True)
    best = ranked[0]
    return best[2]
Пример #3
0
def test(test_pair):
    """Score sentence pairs and return the top score clamped to a random band.

    The model at ``best_model_path`` is loaded on every call. The highest
    predicted score is limited from above by a value drawn uniformly from
    [0.28123, 0.3156] and from below by a value drawn uniformly from
    [0.07682, 0.09354]. The pairs and the unclamped top score are printed.
    """
    model = load_model(best_model_path)

    # Draw the upper bound first, then the lower bound.
    upper_bound = random.uniform(0.28123, 0.3156)
    lower_bound = random.uniform(0.07682, 0.09354)

    left_input, right_input, leak_input = create_test_data(
        tokenizer, test_pair, siamese_config['MAX_SEQUENCE_LENGTH'])
    scores = list(
        model.predict([left_input, right_input, leak_input],
                      verbose=1).ravel())

    ranked = sorted(
        ((first, second, score)
         for (first, second), score in zip(test_pair, scores)),
        key=itemgetter(2),
        reverse=True)

    print(test_pair)
    top_score = ranked[0][2]
    print(top_score)
    return max(min(top_score, upper_bound), lower_bound)
def get_doc2vec_vectors_train_valid_split(trainingData):
    """Split the data and prepare Siamese-model training/validation inputs.

    Args:
        trainingData: Dataset passed to ``get_train_test_split_of_dataframe``;
            the resulting splits must expose ``'Q1'`` and ``'Q2'`` columns.

    Returns:
        A 5-tuple ``(sentences_pairs, is_similar, test_data_x,
        is_similar_validate, embedding_meta_data)``:

        * ``sentences_pairs``: list of ``(Q1, Q2)`` training pairs.
        * ``is_similar``: list of encoded training labels.
        * ``test_data_x``: ``[x1, x2, leaks]`` as returned by
          ``create_test_data`` for the validation pairs.
        * ``is_similar_validate``: list of encoded validation labels.
        * ``embedding_meta_data``: dict with ``'tokenizer'`` and
          ``'embedding_matrix'`` built from the training sentences.

    Raises:
        ValueError: if the validation split contains a label that does not
            occur in the training split (raised by ``LabelEncoder.transform``).
    """
    # Split the dataset into training and validation parts.
    train_x, valid_x, train_y, valid_y = get_train_test_split_of_dataframe(
        trainingData, False)

    # Fit the label encoder on the training labels only and reuse that
    # mapping for validation. Re-fitting on the validation labels could
    # assign different integers to the same class.
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(valid_y)

    # Materialise the columns as plain lists so that ``+`` below is a list
    # concatenation. If the columns are pandas Series (TODO confirm), ``+``
    # would instead join the two questions element-wise into one string.
    sentences1 = list(train_x['Q1'])
    sentences2 = list(train_x['Q2'])
    is_similar = list(train_y)
    is_similar_validate = list(valid_y)

    tokenizer, embedding_matrix = word_embed_meta_data(
        sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }

    # Build the sentence pairs.
    sentences_pairs = list(zip(sentences1, sentences2))
    sentences_pairs_validate = list(zip(valid_x['Q1'], valid_x['Q2']))

    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, sentences_pairs_validate,
        siamese_config['MAX_SEQUENCE_LENGTH'])
    test_data_x = [test_data_x1, test_data_x2, leaks_test]

    return sentences_pairs, is_similar, test_data_x, is_similar_validate, embedding_meta_data
def single_pred(x, y):
    """Predict the similarity of two sentences and return it as a percentage.

    Both sentences are passed through ``pre.preprocessing_single_sentence``,
    scored with the model stored at ``./stored_model/1589893283/lstm_new.h5``
    (loaded on every call), and the raw score is printed.

    Returns:
        ``str`` of the predicted score multiplied by 100.
    """
    from keras.models import load_model

    model = load_model("./stored_model/1589893283/lstm_new.h5")

    cleaned_pair = (pre.preprocessing_single_sentence(x),
                    pre.preprocessing_single_sentence(y))
    left_input, right_input, leak_input = create_test_data(
        tokenizer, [cleaned_pair], siamese_config['MAX_SEQUENCE_LENGTH'])

    scores = model.predict([left_input, right_input, leak_input]).ravel()
    score = scores[0]
    print(score)
    return str(score * 100)
Пример #6
0
                        CONFIG.rate_drop_lstm, CONFIG.rate_drop_dense,
                        CONFIG.activation_function,
                        CONFIG.validation_split_ratio)

# Train the Siamese model; the returned path is loaded below for inference.
best_model_path = siamese.train_model(sentences_pair,
                                      is_similar,
                                      model_save_directory='./')

#######################
##### Testing #########
#######################

model = load_model(best_model_path)
train_data_x1, train_data_x2 = create_test_data(sentences_pair)
# Feed BOTH sides of every pair. The previous code passed train_data_x1
# twice, so each left-hand input was compared with itself.
trian_preds = list(
    model.predict([train_data_x1, train_data_x2], verbose=1).ravel())

# Binarise the scores: below 0.5 -> 0, otherwise 1.
res = [0 if score < 0.5 else 1 for score in trian_preds]
print(len(res))
print(res)

# Store the predicted labels next to the training data.
data['predict'] = np.array(res)
data.to_csv("labeled_train_data.csv", index=None)

test_data = pd.read_csv('test_data.csv')
test_sentences1 = list(test_data['s_link_dir_speed'])
Пример #7
0

# Load the tab-separated test set: hate-speech text (HS), its counter
# narrative (CN) and the counter-narrative type (CNtype).
df = pd.read_csv('../data/final_hate_dataset_test.csv', sep='\t')
hs_sentences, cn_sentences, cntype = (
    list(df[column]) for column in ('HS', 'CN', 'CNtype'))
# To drop the CN type information, replace every entry with 'none':
# cntype = ['none' for x in cntype]

top_n = 3

tokenizer, embedding_matrix = word_embed_meta_data(hs_sentences + cn_sentences,  siamese_config['EMBEDDING_DIM'])

# For each HS sentence, score it against every CN sentence and count a hit
# when its own CN is among the top_n highest-scoring candidates.
guess = 0
for hs, resp in zip(hs_sentences, cn_sentences):
    all_pairs = [(hs, cn) for cn in cn_sentences]
    test_data_x1, test_data_x2, leaks_test, cntypes = create_test_data(
        tokenizer, all_pairs, cntype, siamese_config['MAX_SEQUENCE_LENGTH'])

    preds = list(
        model.predict([test_data_x1, test_data_x2, leaks_test, cntypes],
                      verbose=1).ravel())
    results = sorted(
        ((x, y, z) for (x, y), z in zip(all_pairs, preds)),
        key=itemgetter(2),
        reverse=True)
    top = results[:top_n]
    responses = [candidate[1] for candidate in top]
    if resp in responses:
        print('Guess')
        guess += 1
    print(hs, resp, top)
print('Accuracy', float(guess)/len(hs_sentences))
                        CONFIG.activation_function,
                        CONFIG.validation_split_ratio)

# Train the Siamese model; the returned path is loaded below for inference.
best_model_path = siamese.train_model(
    sentences_pair, is_similar, embedding_meta_data,
    model_save_directory='./')

########################
###### Testing #########
########################

model = load_model(best_model_path)

# Two hand-written question pairs used as a smoke test.
test_sentence_pairs = [
    (
        'What can make Physics easy to learn?',
        'How can you make physics easy to learn?',
    ),
    (
        'How many times a day do a clocks hands overlap?',
        'What does it mean that every time I look at the clock the numbers are the same?',
    ),
]

test_data_x1, test_data_x2, leaks_test = create_test_data(
    tokenizer, test_sentence_pairs, siamese_config['MAX_SEQUENCE_LENGTH'])

raw_scores = model.predict([test_data_x1, test_data_x2, leaks_test],
                           verbose=1)
preds = list(raw_scores.ravel())

# Attach each score to its pair and print the pairs best-first.
results = sorted(
    ((x, y, z) for (x, y), z in zip(test_sentence_pairs, preds)),
    key=itemgetter(2),
    reverse=True)
print(results)
Пример #9
0
# Load the test questions and pair the two question columns row by row.
df_test = pd.read_csv('data/test_word_char.csv')
test_sentence_pairs = list(zip(df_test['q1_word'], df_test['q2_word']))

# Turn the sentence pairs into model inputs.
test_data_x1, test_data_x2, leaks_test = create_test_data(
    tokenizer,
    test_sentence_pairs,
    CONFIG.max_sequence_length,
    embedding_index,
    tfidf_dict,
)

result = model.predict(
    [test_data_x1, test_data_x2, leaks_test],
    batch_size=1024,
    verbose=1,
)

# Write the submission file (first prediction column only).
submit_dir = './submit/'
if not os.path.exists(submit_dir):
    os.makedirs(submit_dir)

submit = pd.DataFrame()
submit['y_pre'] = list(result[:, 0])
submit.to_csv(submit_dir + 'result0618_add_features.csv', index=False)
Пример #10
0
def _evaluate_dataset(fileName, numberOfRelevantQs, outputFile):
    """Train, validate and rank-evaluate the Siamese LSTM for one dataset.

    Steps: train a model on the dataset's CSV data, report its validation
    accuracy, then for every question that has annotated duplicates fetch
    BM25 candidates, re-rank them with the model and accumulate
    precision@k and average precision.

    Args:
        fileName: Dataset name. Passed to ``get_df_from_csv_file`` and used
            to locate ``../data/json/<fileName>_questions.json``.
        numberOfRelevantQs: Rank cutoff ``k`` for precision@k and MAP.
        outputFile: Open, writable text file that receives the report lines.
    """
    df_for_file = get_df_from_csv_file(fileName)
    train_x, train_y, valid_x, valid_y, embedding_meta_data = (
        get_doc2vec_vectors_train_valid_split(df_for_file))
    model_path = train_model(train_x, train_y, embedding_meta_data, fileName)
    siamese_lstm_model = load_model(model_path)
    _preds, accuracy = evaluate_model(siamese_lstm_model, valid_x, valid_y)
    print('Classification Accuracy for : ' + fileName + ' Siamese LSTM ' +
          str(accuracy[1]),
          file=outputFile)
    tokenizer = embedding_meta_data['tokenizer']

    df = pd.read_json('../data/json/' + str(fileName) + '_questions.json',
                      orient='index')
    df['QuestionID'] = df.index
    df = df[['QuestionID', 'title', 'body', 'dups']]
    numberOfTrainData = df['QuestionID'].size

    # corpus: cleaned titles indexed by BM25.
    # corpus_to_query: (id, cleaned title, cleaned body) records that BM25
    # returns for the matching titles.
    corpus = []
    corpus_to_query = []
    for i in range(numberOfTrainData):
        row = df.iloc[i]
        title = review_to_wordlist(row.loc['title'])
        body = review_to_wordlist(strip_tags(row.loc['body']))
        corpus.append(title)
        corpus_to_query.append((str(row.loc['QuestionID']), title, body))

    bm25 = BM25Okapi([doc.split(" ") for doc in corpus])

    foundDupsAll = 0
    queriesWithDuplicates = 0
    sumOfAveragePrecision = 0.0
    precisionAtSum = 0
    for i in range(numberOfTrainData):
        dupids = df.iloc[i].loc['dups']
        if len(dupids) == 0:
            # Only questions with annotated duplicates serve as queries.
            continue
        queriesWithDuplicates += 1

        # Reuse the cleaned title/body computed while building the corpus.
        _, query_title, query_body = corpus_to_query[i]
        test_query = query_title + " " + query_body
        tokenized_query = test_query.split(" ")

        candidate_docs_bm25 = bm25.get_top_n(tokenized_query,
                                             corpus_to_query,
                                             n=10)
        topn_similar_docs_as_pairs = []
        topn_doc_indexes = []
        for cand_id, cand_title, cand_body in candidate_docs_bm25:
            # NOTE(review): the candidate text was already cleaned when the
            # corpus was built; it is cleaned again here as in the original.
            candidateQuestion = review_to_wordlist(
                cand_title) + " " + review_to_wordlist(strip_tags(cand_body))
            topn_similar_docs_as_pairs.append((test_query, candidateQuestion))
            topn_doc_indexes.append(cand_id)

        test_data_x1, test_data_x2, leaks_test = create_test_data(
            tokenizer, topn_similar_docs_as_pairs,
            siamese_config['MAX_SEQUENCE_LENGTH'])
        preds = list(
            siamese_lstm_model.predict(
                [test_data_x1, test_data_x2, leaks_test],
                verbose=0).ravel())

        # Re-rank the BM25 candidates by model score, best first.
        ranked = sorted(zip(topn_doc_indexes, preds),
                        key=itemgetter(1),
                        reverse=True)

        foundDups = 0
        apForQuery = 0.0
        # Slicing copes with fewer than numberOfRelevantQs candidates
        # (small corpora), where fixed-range indexing raised IndexError.
        for rank, (similarDocId, _score) in enumerate(
                ranked[:numberOfRelevantQs], start=1):
            # Candidate ids are strings (built with str(qid)); this assumes
            # 'dups' holds string ids as well. TODO confirm.
            if similarDocId in dupids:
                foundDups += 1
                apForQuery += foundDups / rank

        # apForQuery is 0.0 when nothing was found, so no guard is needed.
        sumOfAveragePrecision += apForQuery / len(dupids)
        precisionAtSum += foundDups / numberOfRelevantQs
        foundDupsAll += foundDups

    # Without any query the averages are undefined; report 0.0 instead of
    # failing with ZeroDivisionError.
    if queriesWithDuplicates:
        precisionAtK = precisionAtSum / queriesWithDuplicates
        meanAveragePrecision = sumOfAveragePrecision / queriesWithDuplicates
    else:
        precisionAtK = 0.0
        meanAveragePrecision = 0.0

    print(str(fileName) + " Train Data Count : " + str(numberOfTrainData),
          file=outputFile)
    print(str(fileName) + " Relevant Query Count (Test) : " +
          str(queriesWithDuplicates),
          file=outputFile)
    print(str(fileName) + " Found within " + str(numberOfRelevantQs) +
          " Count : " + str(foundDupsAll),
          file=outputFile)
    print(str(fileName) + " Precision @ " + str(numberOfRelevantQs) +
          " : " + str(precisionAtK),
          file=outputFile)
    print(str(fileName) + " MAP : " + str(meanAveragePrecision),
          file=outputFile)


def main():
    """Run the Siamese-LSTM duplicate-question evaluation for each dataset.

    For every dataset name in the hard-coded list, the report lines are
    appended to ``../data/output/result.txt``.
    """
    numberOfRelevantQs = 10
    # Other dataset names listed in earlier revisions: 'android', 'english',
    # 'gaming', 'gis', 'mathematica', 'physics', 'programmers'.
    fileNameList = ['stats', 'tex', 'unix', 'webmasters', 'wordpress']

    for fileName in fileNameList:
        # The context manager closes the report file even if the
        # evaluation raises.
        with open('../data/output/result.txt', 'a') as outputFile:
            _evaluate_dataset(str(fileName), numberOfRelevantQs, outputFile)
        # The helper's locals (model, data frame, BM25 index) are released
        # when it returns; collect before the next dataset is trained.
        gc.collect()