# Exemplo n.º 1 (score: 0)
def tokenize(sentence):
    """Lower-case *sentence*, split it into word/punctuation tokens, and
    return the tokens re-joined by single spaces."""
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(sentence.lower())
    return ' '.join(tokens)
# Exemplo n.º 2 (score: 0)
    sim21 = (idf2 * (matrix2.dot(matrix1.T).max(axis=1))).sum() / idf2.sum()

    return 2 * sim12 * sim21 / (sim12 + sim21)
    total_len = matrix1.shape[0] + matrix2.shape[0]
    return sim12 * matrix2.shape[0] / total_len + sim21 * matrix1.shape[
        0] / total_len


# Demo driver: builds embedding matrices and IDF vectors for two sample
# questions using helpers defined elsewhere in this file.
if __name__ == "__main__":
    # Pre-trained (stemmed) word2vec model; path is relative to this script.
    w2v = gensim.models.Word2Vec.load('../data/w2v_model_stemmed')

    # NOTE(review): pickle data should normally be opened in binary mode
    # ('rb'); the text-mode open here looks like Python-2-era code — confirm.
    # The file handle is also never closed.
    idf = pickle.load(open('../data/idf'))

    # First demo question: lower-case, tokenize, then stem every token.
    question1 = 'intialize all elements in an ArrayList as a specific integer'
    question1 = WordPunctTokenizer().tokenize(question1.lower())
    question1 = [SnowballStemmer('english').stem(word) for word in question1]

    # Second demo question, same tokenize/stem pipeline.
    question2 = 'set every element of a list to the same constant value'
    question2 = WordPunctTokenizer().tokenize(question2.lower())
    question2 = [SnowballStemmer('english').stem(word) for word in question2]

    # Per-question embedding matrices (init_doc_matrix is defined elsewhere;
    # presumably one row per token — verify against its definition).
    matrix1 = init_doc_matrix(question1, w2v)
    matrix2 = init_doc_matrix(question2, w2v)
    matrix1_trans = matrix1.T
    matrix2_trans = matrix2.T

    # Per-question IDF weight vectors (init_doc_idf_vector defined elsewhere).
    idf1 = init_doc_idf_vector(question1, idf)
    idf2 = init_doc_idf_vector(question2, idf)

    #print sim_question_api(question1, question2, idf, w2v)