Example #1
def clean_compute_similarity(d1, d2):


    d1 = remove_stopwords(d1).split()
    d2 = remove_stopwords(d2).split()


    # Dictionary and Corpus
    documents = [d1, d2]
    dictionary = corpora.Dictionary(documents)

    # Composing the similarity matrix
    similarity_matrix = fasttext_model300.similarity_matrix(dictionary,
                                                            tfidf=None,
                                                            threshold=0.0,
                                                            exponent=2.0,
                                                            nonzero_limit=100)

    # Convert the sentences into bag-of-words vectors. doc2bow() counts the
    # occurrences of each distinct word, maps each word to its integer id, and
    # returns the result as a sparse vector.
    d1 = dictionary.doc2bow(d1)
    d2 = dictionary.doc2bow(d2)


    # Soft cosine similarity - Considers similarities between pairs of features
    score = softcossim(d1, d2, similarity_matrix)

    return score
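
For reference, the score returned by softcossim is the soft cosine measure between the two bag-of-words vectors x and y under the term similarity matrix S (gensim's definition, stated here for context):

    softcossim(x, y, S) = (x^T S y) / (sqrt(x^T S x) * sqrt(y^T S y))

When S is the identity matrix, this reduces to ordinary cosine similarity.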
Example #2
def soft_cosine_similarity(data_source, data_target):
    data_source = [s.strip() for s in data_source]
    data_source = [s.lower() for s in data_source]
    data_source = [s.split() for s in data_source]

    data_target = [s.strip() for s in data_target]
    data_target = [s.lower() for s in data_target]
    data_target = [s.split() for s in data_target]

    random.shuffle(data_source)

    overall_data = data_source + data_target
    assert len(overall_data) == len(data_source) + len(
        data_target), 'Lengths should be equal'

    dictionary = corpora.Dictionary(overall_data)
    print('Making similarity matrix')
    similarity_matrix = fasttext_model300.similarity_matrix(dictionary,
                                                            tfidf=None,
                                                            threshold=0.0,
                                                            exponent=2.0,
                                                            nonzero_limit=100)
    overlap_list = []
    same_counter = 0

    target_bow_list = []
    print('Processing target data')
    for k in range(len(data_target)):
        if k % 100 == 0:
            print(k)
        target_sent = data_target[k]
        target_bow = dictionary.doc2bow(target_sent)
        target_bow_list.append(target_bow)

    for i in range(1000):
        print('Iteration ', i)
        max_overlap = -100
        overlapper = 0
        source_sentence = data_source[i]
        source_bow = dictionary.doc2bow(source_sentence)
        for j in range(len(data_target)):
            target_bow = target_bow_list[j]
            similarity = softcossim(source_bow, target_bow, similarity_matrix)
            if similarity == 1:
                same_counter += 1
            if similarity > max_overlap:
                max_overlap = similarity
                overlapper = j
        overlap_list.append(max_overlap)
        print('Source: ', data_source[i])
        print('Closest sentence: ', data_target[overlapper])
        print('Overlap: ', max_overlap)
        print('Perfect matches: ', same_counter)

    avg_overlap = sum(overlap_list) / len(overlap_list)
    print(overlap_list)
    print('Average overlap: ', avg_overlap)
Example #3
    def test_distributions(self):
        # checking bag of words as inputs
        vec_1 = [(0, 1.0), (2, 1.0)]  # hello world
        vec_2 = [(1, 1.0), (2, 1.0)]  # hi world
        similarity_matrix = csc_matrix([[1, 0.5, 0], [0.5, 1, 0], [0, 0, 1]])
        result = matutils.softcossim(vec_1, vec_2, similarity_matrix)
        expected = 0.75
        self.assertAlmostEqual(expected, result)
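
The expected value of 0.75 can be verified by hand. A minimal numpy sketch using the dense forms of vec_1 and vec_2 (a check of the arithmetic only, not part of the test suite):

import numpy as np

x = np.array([1.0, 0.0, 1.0])  # dense form of vec_1 ("hello world")
y = np.array([0.0, 1.0, 1.0])  # dense form of vec_2 ("hi world")
S = np.array([[1.0, 0.5, 0.0],
              [0.5, 1.0, 0.0],
              [0.0, 0.0, 1.0]])

numerator = x @ S @ y                                  # 1.5
denominator = np.sqrt(x @ S @ x) * np.sqrt(y @ S @ y)  # sqrt(2) * sqrt(2) = 2
print(numerator / denominator)                         # 0.75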
Example #4
def soft_cosine_similarity_matrix(sentences):
    len_array = np.arange(len(sentences))
    xx, yy = np.meshgrid(len_array, len_array)
    cossim_mat = pd.DataFrame([[
        round(softcossim(sentences[i], sentences[j], similarity_matrix), 2)
        for i, j in zip(x, y)
    ] for y, x in zip(xx, yy)])
    return cossim_mat
Example #5
    def get_category_seed_similarity(self, sentence, seeds, similarity_matrix):
        result = 0
        length = len(seeds)
        sentence_d2b = self.dictionary.doc2bow(sentence)
        for word in seeds:
            seed_d2b = self.dictionary.doc2bow([word])
            result += softcossim(sentence_d2b, seed_d2b, similarity_matrix)
        return result / length
Example #7
    def test_inputs(self):
        # checking empty inputs
        vec_1 = []
        vec_2 = []
        similarity_matrix = csc_matrix((0, 0))
        result = matutils.softcossim(vec_1, vec_2, similarity_matrix)
        expected = 0.0
        self.assertEqual(expected, result)

        # checking CSR term similarity matrix format
        similarity_matrix = csr_matrix((0, 0))
        result = matutils.softcossim(vec_1, vec_2, similarity_matrix)
        expected = 0.0
        self.assertEqual(expected, result)

        # checking unknown term similarity matrix format
        with self.assertRaises(ValueError):
            matutils.softcossim(vec_1, vec_2, np.matrix([]))
Example #9
def create_soft_cossim_matrix(sentences, title, des):
    len_array = np.arange(len(sentences))
    xx, yy = np.meshgrid(0, len_array)
    cossim_mat = pd.DataFrame([[
        round(softcossim(sentences[i], sentences[j], similarity_matrix), 2)
        for i, j in zip(x, y)
    ] for y, x in zip(xx, yy)])
    k = cossim_mat.sort_values(by=0, ascending=False)
    k1 = pd.DataFrame(k)
    return k1
Example #10
    def get_similarity(self, first_document, second_document):
        documents = [first_document, second_document]
        dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in documents])
        similarity_matrix = self.__fasttext_model.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

        sentences_of_first_document = dictionary.doc2bow(simple_preprocess(first_document))
        sentences_of_second_document = dictionary.doc2bow(simple_preprocess(second_document))

        return softcossim(sentences_of_first_document, sentences_of_second_document, similarity_matrix)
Example #11
def is_duplicate(text1, text2, threshold):
    preprocessed_text1 = preprocessor.preprocess_text(text1)
    preprocessed_text2 = preprocessor.preprocess_text(text2)

    bow1 = dictionary.doc2bow(preprocessed_text1)
    bow2 = dictionary.doc2bow(preprocessed_text2)

    softcossim_similarity = softcossim(bow1, bow2, similarity_matrix)

    return softcossim_similarity >= threshold
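
A hypothetical call, assuming the module-level preprocessor, dictionary, and similarity_matrix that the function relies on have already been built as in the surrounding examples:

print(is_duplicate('How do I learn Python?',
                   'What is the best way to learn Python?',
                   threshold=0.7))  # True if the soft cosine score reaches 0.7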
Example #12
def soft_cosine(tokens, stem=False, lemma=False):
    """
    Apply soft cosine between consecutive paragraphs using fasttext embeddings
    and collect the scores in a dataframe.
    :param tokens: pandas Series whose rows are token lists, one per paragraph
    :param stem: if True, output cosine scores are saved for paragraphs with stemmed tokens
    :param lemma: if True, output cosine scores are saved for paragraphs with lemmatized tokens
    :return: None
    """
    softcosout = []
    colnames = []
    tokens = tokens.apply(lambda x: ' '.join(x))
    
    token_1 = []
    token_2 = []
    
    for count in range(0, len(tokens)-1):
        sent1 = tokens[count]
        sent2 = tokens[count+1]

        parag1 = 'parag#' + str(count+1)
        parag2 = ' & ' + str(count+2)
        paragnumber = parag1 + parag2
        parag_1 = str(count+1)
        parag_2 = str(count+2)

        documents = [sent1, sent2]
        # create vocabulary
        dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in documents])
        # apply fasttext model
        similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0,
                                                                nonzero_limit=100)
        # create bag of words
        sent_1 = dictionary.doc2bow(simple_preprocess(sent1))
        sent_2 = dictionary.doc2bow(simple_preprocess(sent2))
        # apply softcosine similarity
        soft_cosine_output = softcossim(sent_1, sent_2, similarity_matrix)
        print(soft_cosine_output)
        colnames.append(paragnumber)
        softcosout.append(soft_cosine_output)
        token_1.append(parag_1)
        token_2.append(parag_2)
    # create and export dataframe
    df_softcos = pd.DataFrame(softcosout, columns = ['Soft cosine'])
    df_softcos['token 1'] = token_1
    df_softcos['token 2'] = token_2
    print(df_softcos)
    
    if stem:
        df_softcos.to_csv(r'softcosine_stem_output15.csv',
                          index=None, header=True)
    if lemma:
        df_softcos.to_csv(r'softcosine_lemma_output15.csv',
                          index=None, header=True)
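
A hypothetical call for the function above, assuming df['tokens'] is a pandas Series whose rows are token lists for consecutive paragraphs:

soft_cosine(df['tokens'], stem=True)  # writes softcosine_stem_output15.csv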
Example #13
def cosine_similarity(reference, hypothesis, model):
    reference = reference.split()
    hypothesis = hypothesis.split()
    documents = [hypothesis, reference]
    dictionary = corpora.Dictionary(documents)

    similarity_matrix = emb_models[model].similarity_matrix(dictionary)

    hypothesis = dictionary.doc2bow(hypothesis)
    reference = dictionary.doc2bow(reference)

    return softcossim(hypothesis, reference, similarity_matrix)
Example #14
def soft_cosine_similarity(text_1, text_2, corpus):
    dictionary = Dictionary(corpus)
    text_1 = dictionary.doc2bow(text_1)
    text_2 = dictionary.doc2bow(text_2)
    w2v_model = Word2Vec(corpus,
                         workers=cpu_count(),
                         min_count=1,
                         size=300,
                         seed=12345)
    similarity_matrix = sparse.csr_matrix(
        MatrixSimilarity(Dense2Corpus(w2v_model.wv.syn0.T)))
    return softcossim(text_1, text_2, similarity_matrix)
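
For comparison, newer gensim releases (3.7 and later) replace KeyedVectors.similarity_matrix and matutils.softcossim with a dedicated term-similarity API. A rough sketch of the equivalent construction, assuming the same dictionary, bag-of-words vectors, and Word2Vec model as above; treat the exact call signatures as assumptions to check against the installed version:

from gensim.similarities import (SparseTermSimilarityMatrix,
                                 WordEmbeddingSimilarityIndex)

# Index term similarities straight from the trained embeddings, then build a
# sparse term similarity matrix over the dictionary.
termsim_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

# The normalized inner product of two bag-of-words vectors is the soft cosine
# measure (gensim 4.x signature; older releases take normalized=True).
score = termsim_matrix.inner_product(text_1, text_2, normalized=(True, True))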
Example #15
def deriveSoftCosineSimilarityMatrix(allDict,
                                     limit=None,
                                     weName="glove-wiki-gigaword-50",
                                     simThreshold=0.3):
    # documents=getTestDocuments()
    docsZip = getDocList(allDict,
                         limit,
                         stop_list=getCustomStopWords(),
                         with_ids=True)

    documents = []
    ids = []
    for i, j in docsZip:
        documents.append(j)
        ids.append(i)
    model = getWordEmbeddingModel(weName=weName)
    # Create gensim Dictionary of unique IDs of all words in all documents
    # pyLDAvis param "d"
    dictionary = corpora.Dictionary(
        [simple_preprocess(doc) for doc in documents])

    # Convert the sentences into bag-of-words vectors.
    sentences = []  # pyLDAvis param "c"
    for doc in documents:
        sentences.append(dictionary.doc2bow(simple_preprocess(doc)))

    # Create a TF-IDF model. TF-IDF encoding represents words as their
    # relative importance to the whole document in a collection of documents,
    # i.e. the sentences.
    # pyLDAvis param "lda"
    tf_idf = models.TfidfModel(sentences)

    # Prepare the similarity matrix
    similarity_matrix = model.similarity_matrix(dictionary,
                                                tfidf=tf_idf,
                                                threshold=simThreshold,
                                                exponent=2.0,
                                                nonzero_limit=100)

    # create 1xN vector filled with 1,2,..N
    len_array = np.arange(len(sentences))
    # create NxN array filled with 1..N down, 1..N across
    xx, yy = np.meshgrid(len_array, len_array)
    # Iterate over the 2D grid, computing the pairwise soft cosine similarities
    theMatrix = [[
        round(softcossim(sentences[i], sentences[j], similarity_matrix), 2)
        for i, j in zip(x, y)
    ] for y, x in zip(xx, yy)]

    cossim_mat = pd.DataFrame(theMatrix, index=ids, columns=ids)

    return cossim_mat
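
To make the TF-IDF step above concrete, a minimal standalone sketch (the toy corpus is invented for illustration):

from gensim import corpora, models

docs = [['soft', 'cosine', 'similarity'],
        ['cosine', 'distance'],
        ['soft', 'margin']]
dictionary = corpora.Dictionary(docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

tf_idf = models.TfidfModel(bow_corpus)
print(tf_idf[bow_corpus[0]])  # [(term_id, weight), ...] -- rarer terms weigh more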
Example #16
def similarity(path1, path2):
    '''
    returns the cosine and soft cosine similarities of the text objects located
    at path1 and path2
    '''
    # read the files and drop the newlines
    with open(path1) as f:
        one = f.read().replace('\n', '')
    with open(path2) as f:
        two = f.read().replace('\n', '')

    #creates a list containing the objects
    lst = [one, two]

    # drops the stop words
    vectorizer = CountVectorizer(stop_words='english')

    #converts the text documents to a (sparse) matrix of token counts
    sparse_matrix = vectorizer.fit_transform(lst)

    #creates dense matrix
    dense_matrix = sparse_matrix.todense()
    df = pd.DataFrame(dense_matrix,
                      columns=vectorizer.get_feature_names(),
                      index=['1', '2'])

    # computes the cosine similarity
    cos_similarity = cosine_similarity(df, df)[0][1]

    # creates the gensim dictionary
    dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in lst])
    print(dictionary)

    #creates similarity matrix
    similarity_matrix = fasttext_model300.similarity_matrix(dictionary,
                                                            tfidf=None,
                                                            threshold=0.0,
                                                            exponent=2.0,
                                                            nonzero_limit=100)

    #create bag-of-words vectors for each text file
    one = dictionary.doc2bow(simple_preprocess(one))
    two = dictionary.doc2bow(simple_preprocess(two))

    # computes the soft cosine similarity
    soft_similarity = softcossim(one, two, similarity_matrix)

    return [
        'Cosine Similarity: ' + str(cos_similarity),
        'Soft Cosine Similarity: ' + str(soft_similarity)
    ]
Example #17
def compare_sentences(sentence1, sentence2, model=word2vec_model300):
    sentence1 = sentence1.split()
    sentence2 = sentence2.split()

    documents = [sentence1, sentence2]
    dictionary = corpora.Dictionary(documents)
    ws1 = dictionary.doc2bow(sentence1)
    ws2 = dictionary.doc2bow(sentence2)

    similarity_matrix = model.similarity_matrix(dictionary,
                                                tfidf=None,
                                                threshold=0.0,
                                                exponent=2.0,
                                                nonzero_limit=100)
    return softcossim(ws1, ws2, similarity_matrix)
Example #18
def make_dist_vec(vector, doc_vectors, similarity_matrix, length=5):
    """
    Return the `length` highest soft cosine similarities between `vector`
    and every other vector in `doc_vectors`.
    """
    gen_vector = []

    for i in doc_vectors:
        if vector == i:
            continue

        gen_vector.append(softcossim(vector, i, similarity_matrix))

    gen_vector.sort(reverse=True)

    return gen_vector[:length]
Example #19
def softCosineSimilarityTest(numtestdocs=20, weName="glove-wiki-gigaword-50"):
    # documents=getTestDocuments()
    # documents=getSampleDocs(numtestdocs)
    documents = getDocList(limit=numtestdocs)
    model = getWordEmbeddingModel(weName=weName)
    # Create gensim Dictionary of unique IDs of all words in all documents
    dictionary = corpora.Dictionary(
        [simple_preprocess(doc) for doc in documents])

    # Prepare the similarity matrix
    similarity_matrix = model.similarity_matrix(dictionary,
                                                tfidf=None,
                                                threshold=0.0,
                                                exponent=2.0,
                                                nonzero_limit=100)

    # Convert the sentences into bag-of-words vectors.
    sentences = []
    for doc in documents:
        sentences.append(dictionary.doc2bow(simple_preprocess(doc)))

    # Create a TF-IDF model. TF-IDF encoding represents words as their
    # relative importance to the whole document in a collection of documents,
    # i.e. the sentences.
    # tf_idf = models.TfidfModel(sentences)
    # print("tf_idf:", tf_idf)

    # create 1xN vector filled with 1,2,..N
    len_array = np.arange(len(sentences))
    # create NxN array filled with 1..N down, 1..N across
    xx, yy = np.meshgrid(len_array, len_array)
    # Iterate over the 2D grid, computing the pairwise soft cosine similarities
    theMatrix = [[
        round(softcossim(sentences[i], sentences[j], similarity_matrix), 2)
        for i, j in zip(x, y)
    ] for y, x in zip(xx, yy)]

    names = []  # for identifying rows and columns
    jj = 0
    for doc in documents:
        names.append(str(jj) + " " + doc[:15] + "\t")
        jj += 1

    cossim_mat = pd.DataFrame(theMatrix, index=names, columns=names)

    return cossim_mat
Example #20
    def get_similarities(self, query):
        """Get similarity between `query` and current index instance.

        Warnings
        --------
        Do not use this function directly; use the self[query] syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix.

        """
        if isinstance(query, numpy.ndarray):
            # Convert document indexes to actual documents.
            query = [self.corpus[i] for i in query]

        if not query or not isinstance(query[0], list):
            query = [query]

        n_queries = len(query)
        result = []
        for qidx in range(n_queries):
            # Compute similarity for each query.
            qresult = [
                matutils.softcossim(document, query[qidx],
                                    self.similarity_matrix)
                for document in self.corpus
            ]
            qresult = numpy.array(qresult)

            # Append single query result to list of all results.
            result.append(qresult)

        if len(result) == 1:
            # Only one query.
            result = result[0]
        else:
            result = numpy.array(result)

        return result
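
This method backs gensim's SoftCosineSimilarity index, which is normally driven through the self[query] syntax the docstring mentions. A usage sketch, assuming a bag-of-words corpus, dictionary, and similarity_matrix built as in the earlier examples (pre-4.0 gensim API):

from gensim.similarities import SoftCosineSimilarity

index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=3)
query = dictionary.doc2bow('sample query text'.split())
for doc_position, score in index[query]:  # self[query] ends up in get_similarities()
    print(doc_position, score)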
Example #21
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix.

        """

        is_corpus, query = utils.is_corpus(query)
        if not is_corpus:
            if isinstance(query, numpy.ndarray):
                # Convert document indexes to actual documents.
                query = [self.corpus[i] for i in query]
            else:
                query = [query]

        result = []
        for query_document in query:
            # Compute similarity for each query.
            qresult = [
                matutils.softcossim(query_document, corpus_document,
                                    self.similarity_matrix)
                for corpus_document in self.corpus
            ]
            qresult = numpy.array(qresult)

            # Append single query result to list of all results.
            result.append(qresult)

        if is_corpus:
            result = numpy.array(result)
        else:
            result = result[0]

        return result
Example #22
def similarity(quest, faq=faq):
    faq_clean = cleaner(faq)
    dictionary = corpora.Dictionary(faq_clean)
    corpus = [dictionary.doc2bow(q) for q in faq_clean]
    similarity_matrix = w2v_model.similarity_matrix(dictionary)
    question = cleanq(quest)

    similarities = []
    for i in range(len(corpus)):
        similarities.append(
            softcossim(dictionary.doc2bow(question), corpus[i],
                       similarity_matrix))
    best = similarities.index(max(similarities))
    return (faq[best], best)
Example #23
File: docsim.py  Project: lopusz/gensim
    def get_similarities(self, query):
        """Get similarity between `query` and current index instance.

        Warnings
        --------
        Do not use this function directly; use the self[query] syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix.

        """
        if isinstance(query, numpy.ndarray):
            # Convert document indexes to actual documents.
            query = [self.corpus[i] for i in query]

        if not query or not isinstance(query[0], list):
            query = [query]

        n_queries = len(query)
        result = []
        for qidx in range(n_queries):
            # Compute similarity for each query.
            qresult = [matutils.softcossim(document, query[qidx], self.similarity_matrix)
                       for document in self.corpus]
            qresult = numpy.array(qresult)

            # Append single query result to list of all results.
            result.append(qresult)

        if len(result) == 1:
            # Only one query.
            result = result[0]
        else:
            result = numpy.array(result)

        return result
Example #24
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix.

        """

        is_corpus, query = utils.is_corpus(query)
        if not is_corpus:
            if isinstance(query, numpy.ndarray):
                # Convert document indexes to actual documents.
                query = [self.corpus[i] for i in query]
            else:
                query = [query]

        result = []
        for query_document in query:
            # Compute similarity for each query.
            qresult = [matutils.softcossim(query_document, corpus_document, self.similarity_matrix)
                       for corpus_document in self.corpus]
            qresult = numpy.array(qresult)

            # Append single query result to list of all results.
            result.append(qresult)

        if is_corpus:
            result = numpy.array(result)
        else:
            result = result[0]

        return result
Example #25
    def softCosine(self, model, documents):
        """
        Returns a similarity score using soft cosine similarity between the combined word vectors of two documents.
        Credit and additional information: https://www.machinelearningplus.com/nlp/cosine-similarity/

        @param model: A set of pretrained word embeddings, such as GoogleNews-vectors-negative300.bin.
        @param documents: A size 2 array of strings. Example: ['This is a short sentence.', 'One. Two sentences here.']
        """
        # Prepare a dictionary and a corpus.
        dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in documents])

        # Prepare the similarity matrix
        similarity_matrix = model.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)
        
        # Convert the sentences into bag-of-words vectors.
        sentenceVector = []
        for doc in documents:
            sentenceVector.append(dictionary.doc2bow(simple_preprocess(doc)))

        return softcossim(sentenceVector[0], sentenceVector[1], similarity_matrix)
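
A hypothetical usage of the method above, assuming scorer is an instance of the enclosing class and the embeddings are loaded with gensim's KeyedVectors (the file path is illustrative):

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                          binary=True)
score = scorer.softCosine(model, ['This is a short sentence.',
                                  'One. Two sentences here.'])
print(score)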
print("Model loaded Input the document")
s = input()
s = s.lower()
test_data = word_tokenize(s)

v1 = model.infer_vector(test_data)
print("V1_infer", v1)


best_score = 0
idx = 0

for i, d in enumerate(data):
    score = softcossim(dictionary.doc2bow(s.split()),
                       dictionary.doc2bow(d.lower().split()),
                       similarity_matrix)
    if score > best_score:
        best_score = score
        idx = i

print(best_score)
print(data[idx])

# to find most similar doc using tags
#similar_doc = model.docvecs.most_similar('1')

#print("work")
#print(similar_doc)
Example #27
corpus = [dictionary.doc2bow(document) for document in documents]

# Convert the sentences into bag-of-words vectors.
question1 = dictionary.doc2bow(question1)
question2 = dictionary.doc2bow(question2)
question3 = dictionary.doc2bow(question3)
question4 = dictionary.doc2bow(question4)

import gensim.downloader as api

w2v_model = api.load("glove-wiki-gigaword-50")
similarity_matrix = w2v_model.similarity_matrix(dictionary)

from gensim.matutils import softcossim

similarity = softcossim(question1, question2, similarity_matrix)
print('similarity = %.4f' % similarity)

"""The similarity for the 1st pair is relative large, this means soft cosine thinks these two sentence are very similar."""

similarity = softcossim(question3, question4, similarity_matrix)
print('similarity = %.4f' % similarity)

"""On the other hand, the similarity for the 2nd pair is very small, this means soft cosine thinks this pair are not similar.

### FuzzyWuzzy

We have covered some basics on Fuzzy String Matching in Python, let's have a quick peak on whether FuzzyWuzzy can help with our question dedupe problem.
"""

from fuzzywuzzy import fuzz
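
As a quick taste of what FuzzyWuzzy offers for the dedupe problem, a minimal sketch (the two questions are illustrative):

q1 = 'How can I be a good geologist?'
q2 = 'What should I do to be a great geologist?'

print(fuzz.ratio(q1, q2))            # plain edit-distance ratio, 0-100
print(fuzz.token_set_ratio(q1, q2))  # order- and duplication-insensitive variant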
Example #28
                                                        exponent=2.0,
                                                        nonzero_limit=100)

# Convert sentences into bag-of-words vectors.
sentence_1 = dictionary.doc2bow(sentence_1)
sentence_2 = dictionary.doc2bow(sentence_2)
sentence_3 = dictionary.doc2bow(sentence_3)
sentence_4 = dictionary.doc2bow(sentence_4)

print(sentence_1)
print(sentence_2)
print(sentence_3)
print(sentence_4)

# Soft cosine similarity
print(softcossim(sentence_1, sentence_2, similarity_matrix))
print(softcossim(sentence_1, sentence_3, similarity_matrix))
print(softcossim(sentence_2, sentence_3, similarity_matrix))
print(softcossim(sentence_2, sentence_4, similarity_matrix))

# In[21]:

print(dataset['SYNONYM_VALUE'])

# In[86]:

# Testing Gensim with the actual KOIOS data
# Step 1 - Clean data (Removing stopwords and punctuation)

from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
Example #29
similarity_matrix = fasttext_model300.similarity_matrix(dictionary,
                                                        tfidf=None,
                                                        threshold=0.0,
                                                        exponent=2.0,
                                                        nonzero_limit=100)

# Convert the sentences into bag-of-words vectors.
sent_1 = dictionary.doc2bow(simple_preprocess(doc_trump))
sent_2 = dictionary.doc2bow(simple_preprocess(doc_election))
sent_3 = dictionary.doc2bow(simple_preprocess(doc_putin))
sent_4 = dictionary.doc2bow(simple_preprocess(doc_soup))
sent_5 = dictionary.doc2bow(simple_preprocess(doc_noodles))
sent_6 = dictionary.doc2bow(simple_preprocess(doc_dosa))

sentences = [sent_1, sent_2, sent_3, sent_4, sent_5, sent_6]

print(softcossim(sent_1, sent_2, similarity_matrix))


def create_soft_cossim_matrix(sentences):
    len_array = np.arange(len(sentences))
    xx, yy = np.meshgrid(len_array, len_array)
    cossim_mat = pd.DataFrame([[
        round(softcossim(sentences[i], sentences[j], similarity_matrix), 2)
        for i, j in zip(x, y)
    ] for y, x in zip(xx, yy)])
    return cossim_mat


print(create_soft_cossim_matrix(sentences))
print('#################################################################')
print('')

Example #30
duplicates = []
none_duplicates = []

train = get_train()

for data in tqdm(train, desc='Calculating similarities'):
    preprocessed_text1 = preprocessor.preprocess_text(data['text1'])
    preprocessed_text2 = preprocessor.preprocess_text(data['text2'])

    bow1 = dictionary.doc2bow(preprocessed_text1)
    bow2 = dictionary.doc2bow(preprocessed_text2)

    softcossim_similarity = softcossim(bow1, bow2, similarity_matrix)

    if data['duplicate']:
        duplicates.append(softcossim_similarity)
    else:
        none_duplicates.append(softcossim_similarity)

sleep(2)
print('Mean duplicates:', mean(duplicates))

print()

mean_duplicates = mean(duplicates)

test = get_test()
correct = 0
Example #31
non_col = "Objective: To investigate the clinicopathologic and molecular features of the rare cribriform morular variant of papillary thyroid carcinoma (CMV-PTC). Methods: The clinicopathologic data of 10 patients with CMV-PTC were retrospectively reviewed. Immunohistochemical (IHC) staining was done using LSAB method. DNA sequencing for APC were applied using Sanger method. BRAF V600E mutation was examined using ARMS method. The cytological, morphological, IHC and molecular features were analyzed. Results: All patients were female at an average age of 27 years old. The tumors were mostly located in the right lobe of thyroid. Fine needle aspiration cytology was performed in three patients; two were diagnosed as suspicious for PTC and one as PTC. Nine tumors presented as solitary nodule and two as multiple nodules in both lobes. Infiltration was demonstrated in three cases. The average size was 2.6 cm. The neoplastic cells were arranged in papillary, cribriform, solid and glandular patterns, with rare or without colloid inside the lumen. The number of morula varied, ranging from zero to many. The neoplastic cells were variably enlarged, showing round, oval or spindle shape. Nuclear irregularity was identified as irregular membrane, nuclear grooves or pseudoinclusion, but no typical ground glass feature. Peculiar nuclear clearing could be observed in the morular cells. IHC staining showed the neoplastic cells were negative for thyroglobulin and p63, but positive for TTF1, cytokeratin 19 and estrogen receptor. Diffuse staining with cytokeratin was seen in the neoplastic cells and the morula. Specific cytoplasmic and nuclear staining of β-catenin was seen in the neoplastic cells but not the morula. Ki-67 proliferation index was 1%-30%. No recurrence or metastasis was observed. One patient was demonstrated to harbor both somatic and germline mutations of the APC gene, who was found to have adenomatous polyposis and her mother died of colonic carcinoma. No BRAF V600E mutation was detected. Conclusions: CMV-PTC is rare and shows atypical cytological and clinicopathological features, and it is easily misdiagnosed.TG, TTF1, ER and β-catenin are specific IHC markers for CMV-PTC. The morula is negative for cytokeratin 19, in contrast to squamous metaplasia. Although CMV-PTC has indolent clinical behavior, a definite diagnosis is necessary to rule out the possibility of APC gene mutation and related extra-thyroidal neoplasm, such as FAP and Gardner syndrome."
non_col = re.sub(
    r"(?<=\w[^\d])\.|\.(?=[^\d])|\(|\)|\[|\]|,(?= )|((?<=[^\w])-|-(?=[^\w]))|:|\?|\;",
    " ", non_col)

non_col = remove_stopwords(non_col)
non_col = stem_text(non_col).split()

non_col = trigram[bigram[non_col]]

col_1 = dictionary.doc2bow(col_1)
col_2 = dictionary.doc2bow(col_2)
non_col = dictionary.doc2bow(non_col)

similarity = softcossim(col_1, col_2, similarity_matrix)
print('similarity = %.4f' % similarity)

similarity = softcossim(col_1, non_col, similarity_matrix)
print('similarity = %.4f' % similarity)

#print non_col
#inferred_docvec = model.infer_vector(trigram[bigram[non_col]],steps=5000)
'''

#age_test = stem_text("16 years-old").split()

ngrams = trigram[bigram[query_doc]]


for item in ngrams:
Example #32
    dictionary.doc2bow(document) for document in all_documents_stop_removed
]

print("document loaded and corpus created.")

size = len(all_documents_stop_removed)
Matrix = [0] * size
for i in range(size):
    Matrix[i] = [0] * size

model = KeyedVectors.load_word2vec_format(
    '/home/mostafa/Desktop/WMD/wiki.fa.vec', binary=False)
similarity_matrix = model.similarity_matrix(dictionary)

print('model loaded')

for i in range(1, len(all_documents_stop_removed)):
    for j in range(0, i):
        # print i,",",j
        doc_i = all_documents_stop_removed[i]
        doc_j = all_documents_stop_removed[j]
        doc_i = dictionary.doc2bow(doc_i)
        doc_j = dictionary.doc2bow(doc_j)
        similarity = softcossim(doc_i, doc_j, similarity_matrix)
        # print similarity
        Matrix[i][j] = similarity
    b = Matrix[i]
    max_similarity = np.amax(b)
    print(str(i) + "\t" + str(khabarID[i]) + "\t" + str(max_similarity))
    # print i
Example #33
def preprocess(words):
    common_words = ["habitat", "stay", "just", "the", "is", "of", "and", "for", "anything", "it", "a", "an", "in", "if", "that", "to", "here", "find", "your", "you", "more", "become", "some", "individuals", "can", "all", "about", "regardless", "we", "so", "be", "as", "ever"]
    punctuation = [".", "!", "?", ",", ";", ":"]

    output = []

    for i in range(len(words)):
        initial_word = words[i]
        if words[i][len(words[i])-1] in punctuation:
            initial_word = words[i][:-1]
        if not initial_word.lower() in common_words:
            output.append(initial_word)
    
    return output


dictionary = corpora.Dictionary([preprocess(doc) for doc in documents])
similarity_matrix = fast_text_model.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)


first_sentence = dictionary.doc2bow(preprocess(documents[0]))
second_sentence = dictionary.doc2bow(preprocess(documents[1]))
third_sentence = dictionary.doc2bow(preprocess(documents[2]))


print(softcossim(first_sentence, second_sentence, similarity_matrix))
print(softcossim(first_sentence, third_sentence, similarity_matrix))
print(softcossim(second_sentence, third_sentence, similarity_matrix))
print(softcossim(third_sentence, second_sentence, similarity_matrix))