Example #1
File: asn3.py Project: dddd999/asn3
# Imports implied by the snippet (nltk corpora and gensim):
from nltk.corpus import brown, stopwords
from gensim import corpora
from gensim.models import Word2Vec
from gensim.similarities import WordEmbeddingSimilarityIndex, SparseTermSimilarityMatrix


def W2VH():
    # Collect the lowercased words of the Brown 'mystery' category as one token list.
    docbrown = [w.lower() for w in brown.words(categories='mystery')]

    # Split the token list (not a string) into two halves.
    half = len(docbrown) // 2
    docbrown1, docbrown2 = docbrown[:half], docbrown[half:]

    stop_words = stopwords.words('english')
    docbrown1 = [w for w in docbrown1 if w not in stop_words]
    docbrown2 = [w for w in docbrown2 if w not in stop_words]

    documents = [docbrown1, docbrown2]
    dictionary = corpora.Dictionary(documents)

    docbrown1 = dictionary.doc2bow(docbrown1)
    docbrown2 = dictionary.doc2bow(docbrown2)

    # Train the word vectors on the documents themselves so every dictionary
    # term has an embedding.
    model = Word2Vec(documents, size=20, min_count=1)
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

    similarity = similarity_matrix.inner_product(docbrown1,
                                                 docbrown2,
                                                 normalized=True)
    print('= %.4f' % similarity)
Example #2
    def test_dtype(self):
        """Test the dtype parameter of the matrix constructor."""
        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, dtype=numpy.float32).matrix.todense()
        self.assertEqual(numpy.float32, matrix.dtype)

        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, dtype=numpy.float64).matrix.todense()
        self.assertEqual(numpy.float64, matrix.dtype)
Example #3
    def walid_similarity_query(self, answer: str, key: str):
        if len(answer) == 0 or len(key) == 0:
            return False

        if self.model_ready:
            documents = [answer, key]

            if self.verbose:
                print(
                    f'{len(documents)} documents loaded and ready to preprocess'
                )

            corpus = [self.preprocess(document) for document in documents]

            if self.verbose:
                print(f'{len(corpus)} documents loaded into corpus')

            dictionary = Dictionary(corpus)
            tfidf = TfidfModel(dictionary=dictionary)
            similarity_matrix = SparseTermSimilarityMatrix(
                self.similarity_index, dictionary, tfidf)

            answer_bow = dictionary.doc2bow(self.preprocess(answer))
            key_bow = dictionary.doc2bow(self.preprocess(key))

            # Measure soft cosine similarity
            scores = similarity_matrix.inner_product(answer_bow,
                                                     key_bow,
                                                     normalized=True)

            return scores

        else:
            raise NotReadyError('Word embedding model is not ready.')
Example #4
    def calculate_soft_cosine_similarity(self, topic_models, sentences, *args,
                                         **kwargs):

        topic_claim_relations = {topic: [] for topic in topic_models}

        documents = []
        for topic in topic_models:
            documents.append(topic.lower().split())
        for sentence in sentences:
            documents.append(sentence.lower().split())
        dictionary = corpora.Dictionary(documents)

        w2v_model = api.load("glove-wiki-gigaword-100")
        similarity_index = WordEmbeddingSimilarityIndex(w2v_model)
        similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                       dictionary)

        stop_words = stopwords.words('english')
        for sentence in sentences:
            best_cosine_result = 0
            matched_topic = None
            normal_sentence = sentence
            sentence = [w for w in sentence.lower().split()
                        if w not in stop_words]
            sentence_bow = dictionary.doc2bow(sentence)

            for topic in topic_models:
                topic_model = [w for w in topic.lower().split()
                               if w not in stop_words]
                topic_model_bow = dictionary.doc2bow(topic_model)

                similarity = similarity_matrix.inner_product(topic_model_bow,
                                                             sentence_bow,
                                                             normalized=True)
                print('similarity = %.4f' % similarity)

                if similarity > best_cosine_result:
                    best_cosine_result = similarity
                    matched_topic = topic

            if best_cosine_result > 0.3:
                topic_claim_relations[matched_topic].append(normal_sentence)

        return topic_claim_relations
Example #5
    def test_nonzero_limit(self):
        """Test the nonzero_limit parameter of the matrix constructor."""
        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=100).matrix.todense()
        self.assertGreaterEqual(101, numpy.max(numpy.sum(matrix != 0, axis=0)))

        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=4).matrix.todense()
        self.assertGreaterEqual(5, numpy.max(numpy.sum(matrix != 0, axis=0)))

        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=1).matrix.todense()
        self.assertGreaterEqual(2, numpy.max(numpy.sum(matrix != 0, axis=0)))

        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=0).matrix.todense()
        self.assertEqual(1, numpy.max(numpy.sum(matrix != 0, axis=0)))
        self.assertTrue(numpy.all(matrix == numpy.eye(matrix.shape[0])))
Example #6
    def setUp(self):
        self.cls = similarities.SoftCosineSimilarity
        self.tfidf = TfidfModel(dictionary=dictionary)
        similarity_matrix = scipy.sparse.identity(12, format="lil")
        similarity_matrix[dictionary.token2id["user"], dictionary.token2id["human"]] = 0.5
        similarity_matrix[dictionary.token2id["human"], dictionary.token2id["user"]] = 0.5
        self.similarity_matrix = SparseTermSimilarityMatrix(similarity_matrix)
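For context, a minimal sketch of querying a matrix built this way (an assumption, reusing the test module's `dictionary` and `texts` fixtures):

similarity_matrix = SparseTermSimilarityMatrix(scipy.sparse.identity(12, format="lil"))
tfidf = TfidfModel(dictionary=dictionary)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
index = similarities.SoftCosineSimilarity(tfidf[bow_corpus], similarity_matrix, num_best=3)
query = dictionary.doc2bow("human computer interaction".split())
print(index[tfidf[query]])  # [(document index, soft cosine score), ...] for the 3 best matches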
Example #7
class Similarity:
    def __init__(self):
        self.stop_words = stopwords.words('english')
        self.w2v_model = api.load("glove-wiki-gigaword-50")
        self.similarity_index = WordEmbeddingSimilarityIndex(self.w2v_model)

    def make_document(self, headline, articles):
        temp = []
        headline = [
            w for w in headline.lower().split() if w not in self.stop_words
        ]
        for article in articles:
            article = [
                w for w in article.lower().split() if w not in self.stop_words
            ]
            temp.append(article)
        self.documents = [headline] + temp
        dictionary = corpora.Dictionary(self.documents)
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, dictionary)
        headline = dictionary.doc2bow(headline)
        articles = [dictionary.doc2bow(i) for i in temp]
        similarities = []
        for i in articles:
            similarities.append(self.get_similarity(headline, i))
        return similarities

    def get_similarity(self, s1, s2):
        return self.similarity_matrix.inner_product(s1, s2, normalized=True)
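A short usage sketch for this class (the headline and articles are hypothetical placeholders; constructing Similarity downloads the GloVe vectors on first use):

sim = Similarity()
scores = sim.make_document(
    "markets rally after rate cut",
    ["stocks climb as the central bank lowers rates",
     "local team wins the championship"])
print(scores)  # one normalized soft cosine score per article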
Example #8
    def test_encapsulation(self):
        """Test the matrix encapsulation."""

        # check that a sparse matrix will be converted to a CSC format
        expected_matrix = numpy.array([
            [1.0, 2.0, 3.0],
            [0.0, 1.0, 4.0],
            [0.0, 0.0, 1.0]])

        matrix = SparseTermSimilarityMatrix(scipy.sparse.csc_matrix(expected_matrix)).matrix
        self.assertTrue(isinstance(matrix, scipy.sparse.csc_matrix))
        self.assertTrue(numpy.all(matrix.todense() == expected_matrix))

        matrix = SparseTermSimilarityMatrix(scipy.sparse.csr_matrix(expected_matrix)).matrix
        self.assertTrue(isinstance(matrix, scipy.sparse.csc_matrix))
        self.assertTrue(numpy.all(matrix.todense() == expected_matrix))
Example #9
def create_softcosine_resourse(model_source, all_in_sentence):  # create resources for soft cosine
    overall_dict = gensim.corpora.Dictionary(all_in_sentence)
    model = gensim.models.Word2Vec.load(model_source)
    similarity_index = WordEmbeddingSimilarityIndex(model.wv)

    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, overall_dict)
    return overall_dict, similarity_matrix
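A minimal sketch of using the returned resources (the model path and sentences are hypothetical; assumes a Word2Vec model saved at that path):

sentences = [["soft", "cosine", "measure"], ["cosine", "similarity", "measure"]]
overall_dict, similarity_matrix = create_softcosine_resourse("w2v.model", sentences)
score = similarity_matrix.inner_product(overall_dict.doc2bow(sentences[0]),
                                        overall_dict.doc2bow(sentences[1]),
                                        normalized=True)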
Example #10
def createW2VecIndex(reference_dict):
    from gensim.corpora import Dictionary
    from gensim.models import Word2Vec
    # WordEmbeddingSimilarityIndex lives in gensim.similarities, not gensim.models.
    from gensim.similarities import (SoftCosineSimilarity,
                                     SparseTermSimilarityMatrix,
                                     WordEmbeddingSimilarityIndex)
    print("Prepare Word2Vec model")
    import time
    t1 = time.time()
    corpus = [word_tokenize(term) for term in reference_dict]
    model = Word2Vec(corpus, size=20, min_count=1)  # train word-vectors
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    dictionary = Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(document) for document in corpus]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus,
                                        similarity_matrix,
                                        num_best=3)
    t2 = time.time()
    print("W2v index and dictionary in", (t2 - t1) / 60, "minutes")
    import pickle
    with open("./models/W2VecIndexes.bin", 'wb') as f:
        pickle.dump((docsim_index, dictionary), f)
    return docsim_index, dictionary
Example #11
File: approach.py Project: LeyliG/ds4se
    def __init__(self, params):
        super().__init__(params)
        self.new_model = gensim.models.Word2Vec.load( params['path_to_trained_model'] )
        self.new_model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.
        #Computes cosine similarities between word embeddings and retrieves the closest
        #word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        #Build a term similarity matrix and compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(self.similarity_index, self.dictionary)

        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim
        }
Example #12
    def __init__(self, documents):
        print("Initializing GloVe")
        if isinstance(documents[0], list):
            print("It is a list")
            # Join token lists back into plain strings before preprocessing
            # (wrapping them in one-element lists would leave "['...']" brackets
            # in the text after str()).
            documents = [" ".join(document) for document in documents
                         if isinstance(document, list)]

        documents = [str(document) for document in documents]

        self.corpus = [
            preprocess(document) for document in documents
            if type(document) is str
        ]
        self.documents = documents
        '''
        Then we create a similarity matrix that contains the similarity between each pair of words,
        weighted using the term frequency:
        '''
        # Load the model: this is a big file and can take a while to download and open.
        glove = api.load("glove-wiki-gigaword-50")
        print("Document loaded")
        self.similarity_index = WordEmbeddingSimilarityIndex(glove)
        self.dictionary = Dictionary(self.corpus)
        self.tfidf = TfidfModel(dictionary=self.dictionary)
        print("Model is running")

        # Create the term similarity matrix.
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, self.dictionary, self.tfidf)
        print("Everything has been initialized")
Example #13
    def compute_sim_matrix(self):
        '''    
        if(self.model_type.lower() == "fasttext"):
            model = FastText(self.questions) 
        else:
            model = Word2Vec(self.questions)
        '''
        self.dictionary = Dictionary(self.questions)
        self.tfidf = TfidfModel(dictionary=self.dictionary)
        word2vec_model = Word2Vec(self.questions,
                                  workers=cpu_count(),
                                  min_count=5,
                                  size=300,
                                  seed=12345)

        sim_index = WordEmbeddingSimilarityIndex(word2vec_model.wv)
        sim_matrix = SparseTermSimilarityMatrix(sim_index,
                                                self.dictionary,
                                                self.tfidf,
                                                nonzero_limit=100)
        bow_corpus = [
            self.dictionary.doc2bow(document) for document in self.questions
        ]

        tfidf_corpus = [self.tfidf[bow] for bow in bow_corpus]

        self.docsim_index = SoftCosineSimilarity(tfidf_corpus,
                                                 sim_matrix,
                                                 num_best=10)
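A sketch of a companion query method (hypothetical, not part of the original class) that uses the index built above:

    def query_similar_questions(self, question):
        """Return the 10 indexed questions most similar to `question` (a sketch)."""
        query_bow = self.dictionary.doc2bow(question.lower().split())
        return self.docsim_index[self.tfidf[query_bow]]  # [(question index, score), ...]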
Example #14
    def initializeSimilarityMatrix(self):
        self.similarity_index = WordEmbeddingSimilarityIndex(self.w2v_model)
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index,
            self.dictionary,
            self.tfidf,
            nonzero_limit=100)
Example #15
    def load(self, directory):
        """Load model files from a directory"""

        self.ft = Word2Vec.load(os.path.join(directory, "w2v.model"))
        self.dictionary = Dictionary.load(os.path.join(directory,
                                                       "dict.model"))
        self.matrix = SparseTermSimilarityMatrix.load(
            os.path.join(directory, "stsm.model"))
Example #16
def get_sim_index(wv_model, bow_corpus, dictionary):
    termsim_index = WordEmbeddingSimilarityIndex(wv_model.wv)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)
    docsim_index = SoftCosineSimilarity(bow_corpus,
                                        similarity_matrix,
                                        num_best=10)

    return docsim_index
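A self-contained sketch of calling this helper (toy texts; `size=` follows the gensim 3.x API used throughout these examples):

texts = [["soft", "cosine", "measure"], ["vector", "space", "model"]]
wv_model = Word2Vec(texts, size=20, min_count=1)
dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
docsim_index = get_sim_index(wv_model, bow_corpus, dictionary)
print(docsim_index[dictionary.doc2bow(["cosine", "model"])])  # up to 10 (index, score) pairs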
Example #17
File: eval.py Project: LeyliG/ds4se
class Word2VecSeqVect(BasicSequenceVectorization):
    def __init__(self, params):
        super().__init__(params)
        self.new_model = gensim.models.Word2Vec.load(
            params['path_to_trained_model'])
        self.new_model.init_sims(
            replace=True)  # Normalizes the vectors in the word2vec class.
        #Computes cosine similarities between word embeddings and retrieves the closest
        #word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        #Build a term similarity matrix and compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, self.dictionary)

        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim
        }

    def wmd_gensim(self, sentence_a, sentence_b):
        wmd = self.new_model.wv.wmdistance(sentence_a, sentence_b)
        return [wmd, self.wmd_similarity(wmd)]

    def wmd_similarity(self, dist):
        return 1. / (1. + float(dist))  #Associated Similarity

    def scm_gensim(self, sentence_a, sentence_b):
        '''Compute SoftCosine Similarity of Gensim'''
        #Convert the sentences into bag-of-words vectors.
        sentence_1 = self.dictionary.doc2bow(sentence_a)
        sentence_2 = self.dictionary.doc2bow(sentence_b)

        #Return the inner product(s) between real vectors / corpora vec1 and vec2 expressed in a non-orthogonal normalized basis,
        #where the dot product between the basis vectors is given by the sparse term similarity matrix.
        scm_similarity = self.similarity_matrix.inner_product(sentence_1,
                                                              sentence_2,
                                                              normalized=True)
        return [1 - scm_similarity, scm_similarity]

    def distance(self, metric_list, link):
        '''Iterate on the metrics'''
        #Computation of sentences can be moved directly to wmd_gensim method if we cannot generalize it for
        #the remaining metrics
        sentence_a = self.df_source[self.df_source['ids'].str.contains(
            link[0])]['text'].values[0].split()
        sentence_b = self.df_target[self.df_target['ids'].str.contains(
            link[1])]['text'].values[0].split()

        dist = [
            self.dict_distance_dispatcher[metric](sentence_a, sentence_b)
            for metric in metric_list
        ]
        logging.info("Computed distances or similarities " + str(link) +
                     str(dist))
        return functools.reduce(lambda a, b: a + b,
                                dist)  #Always return a list
Example #18
    def make_document(self, headline, articles):
        temp = []
        headline = [
            w for w in headline.lower().split() if w not in self.stop_words
        ]
        for article in articles:
            article = [
                w for w in article.lower().split() if w not in self.stop_words
            ]
            temp.append(article)
        self.documents = [headline] + temp
        dictionary = corpora.Dictionary(self.documents)
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, dictionary)
        headline = dictionary.doc2bow(headline)
        articles = [dictionary.doc2bow(i) for i in temp]
        similarities = []
        for i in articles:
            similarities.append(self.get_similarity(headline, i))
        return similarities
Example #19
def prepare_index(dictionary, model, tfidf, documents):
    if not os.path.isfile('soft_cosine.index'):
        similarity_index = WordEmbeddingSimilarityIndex(model.wv)
        similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                       dictionary, tfidf)
        index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in documents]],
            similarity_matrix)
        index.save('soft_cosine.index')

    return SoftCosineSimilarity.load('soft_cosine.index')
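Note that the cached soft_cosine.index file is reused on every later call, so it must be deleted whenever the documents, dictionary, or model change. A sketch of querying the returned index (the query tokens are hypothetical):

index = prepare_index(dictionary, model, tfidf, documents)
query_bow = dictionary.doc2bow(["soft", "cosine", "measure"])
doc_scores = index[tfidf[query_bow]]  # similarity of the query to every indexed document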
Example #20
    def __init__(self, cut_off=0.2, cleanup_urls=True, nltk_tokenizer=False):
        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)

        terms_idx = WordEmbeddingSimilarityIndex(self.w2vmodel.wv)
        self.dictionary = Dictionary(self.corpus)

        bow = [self.dictionary.doc2bow(doc) for doc in self.corpus]

        similarity_matrix = SparseTermSimilarityMatrix(terms_idx, self.dictionary)
        self.softcosinesimilarity = SoftCosineSimilarity(
            bow, similarity_matrix, num_best=10
        )
Example #21
def glove_score_1v1(query_string, documents):
    # query_string = 'Leticia has 3+ years of experience in data science. She has a background in applied mathematics and computer science and currently works as a data scientist at Ørsted. In her work, she builds condition-based algorithms to predict when their offshore wind turbines are going to fail in order to optimize daily operations. Leticia has an international upbringing and has lived in 9 different countries, and she is driven by a great work environment with diversity in the workplace. Leticia wants to become a mentor to help students in their transition to professional life and share their own experiences of studying and working abroad and succeeding as a woman in a male-dominated field. Leticia would prefer a mentee that has ambition and drive, such that she has a better understanding of where he or she wants to go and how she can help in the best way.'
    # documents = ['I would describe myself as being dedicated and curious. I am very interested in data analytics and operations research, specially in connection with logistics and planning. For my Bachelor thesis I did a simulation project with Copenhagen Malmö Port on how to optimise the logistics operations at their container-terminal, which really sparked my interest in this area. I am always interesting in learning new things and I try to take advantage of the great opportunities offered through my studies at DTU - like this mentorship or having the opportunity to go abroad for a semester. Last year I spent a semester at Hong Kong University of Science and Technology which was a big experience both academically and personally. Currently, I am working as a student assistant in Danmarks Nationalbank, and even though it is interesting getting an insight into the financial world and having to apply my skills to a different area, at some time, I would like to try something more related to my studies. I would like to be part of the program to gain more knowledge of what it is like working in the industry as a data analyst or engineer - preferably working with logistics, data analytics or operations research. I know very few engineers outside the academic world at DTU, so I would appreciate a mentor who could share some of their experiences and tips on transitioning from student to professional. I am leaning towards specialising in prescriptive analytics, so I would also be very interested in learning more about how optimisation methods and simulation studies are actually applied to real-world problems. What I hope to achieve as a mentee is to be more prepared for working in the industry and get advice on how to make smart choices regarding my studies. I would also appreciate some advice on whether to take another semester abroad during my Masters or gain more work-experience.',
    # 'My greatest ambition is to leave the world in a better state for humans to experience the quality of life than it was when I entered it. This reason lead me to choose scientific studies - general engineering in Paris at first, and then Applied Mathematics in DTU - in the hope to use technologys leverage for maximum impact. Disclaimer: I am currently not looking for a position as I am to continue working for Tomorrow, the fantastic company I am already working for I nevertheless am very interested to get some insights, from a mentor that went through a similar line of study, into how they decided on starting to work straight away vs continue in the academic world by applying for a PhD. I am also eager to learn more about what it actually means to be a professional "data scientist". How much research/theory is actually useful in day-to-day operations and what level of freedom they can have in their decisions and organisation. I am also curious to learn more about career path for data scientist. The popularity of this position is fairly recent and for this reason, career evolution for a data scientist is still rather obscure to me.']
    # 'I would describe myself as focused, structured and vigorous. My main interest is overall concrete technology. It is from the mixing recipes to the maintaining of old structures to "cure" its sickness. The topic of my bachelor project was about testing the different national and international test methods for alkali silica reactions (ASR). To find out the most optimal methods, to catch that sand and stone which could develop ASR. My master thesis is about testing if mine tailings could be used as a substitute for fly ash, which soon not will be available at the same amount as earlier. In my free time, I have been doing a lot of volunteering. I have been a coach for a handball team for 11-12 year old girls for two years. I learned a lot about coaching, planning and taught the girls to be team players. Further I have been part of the organizing committee for the study start and the council for my study line for three years. Where I further developed my competencies planning, leading and get things done. I usually take the lead when things need to be done, but I dont know if Im suited for management. I hope to get a closer look at "the real life", to get ready when I finish my thesis in January. I want to a mentee to get knowledge about the "life" after university. I would prefer a mentor who works with civil engineering, but a mentor who can taught me difference between consulting and entrepreneur firms, so I can find out what is right for me, would be a nice. I still don\'t know what exactly I can be, but I would appreciate some advice. I hope to achieve a way into the business, which could help me find a job after my thesis.']

    # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb

    # Preprocess the documents, including the query string
    corpus = [preprocess(document) for document in documents]
    query = preprocess(query_string)
    '''
    Then we create a similarity matrix, that contains the similarity between each pair of words, 
    weighted using the term frequency:
    '''
    # Load the model: this is a big file, can take a while to download and open
    glove = api.load("glove-wiki-gigaword-50")
    similarity_index = WordEmbeddingSimilarityIndex(glove)

    # Build the term dictionary, TF-idf model
    print("Everything has been initialized")
    dictionary = Dictionary(corpus + [query])
    tfidf = TfidfModel(dictionary=dictionary)

    # Create the term similarity matrix.
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                   dictionary, tfidf)
    '''
    Finally, we calculate the soft cosine similarity between the query and each of the documents. 
    Unlike the regular cosine similarity (which would return zero for vectors with no overlapping terms), 
    the soft cosine similarity considers word similarity as well.
    '''
    # Compute Soft Cosine Measure between the query and the documents.
    # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
    query_tf = tfidf[dictionary.doc2bow(query)]

    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in corpus]],
        similarity_matrix)

    doc_similarity_scores = index[query_tf]

    # Output the sorted similarity scores and documents
    sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
    count = 0
    print("Mentee values: {}".format(query_string))
    for idx in sorted_indexes:
        count += 1
        if count > 10:
            break
        print(
            f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
    return doc_similarity_scores
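A minimal sketch of calling this function (the strings are hypothetical placeholders for the mentee/mentor profiles shown in the comments above):

scores = glove_score_1v1(
    "data scientist with a background in applied mathematics",
    ["student interested in operations research and logistics",
     "civil engineer focused on concrete technology"])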
Example #22
    def test_tfidf(self):
        """Test the tfidf parameter of the matrix constructor."""
        matrix = SparseTermSimilarityMatrix(
            self.index, self.dictionary, nonzero_limit=1).matrix.todense()
        expected_matrix = numpy.array([
            [1.0, 0.5, 0.0, 0.0, 0.0],
            [0.5, 1.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0]])
        self.assertTrue(numpy.all(expected_matrix == matrix))

        matrix = SparseTermSimilarityMatrix(
            self.index, self.dictionary, nonzero_limit=1, tfidf=self.tfidf).matrix.todense()
        expected_matrix = numpy.array([
            [1.0, 0.0, 0.0, 0.5, 0.0],
            [0.0, 1.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 0.0, 0.0],
            [0.5, 0.0, 0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0]])
        self.assertTrue(numpy.all(expected_matrix == matrix))
Example #23
def find_similarity(search_w, corpus_w):
    rv = {'result': []}
    bmatch = False
    search_tokens = [search_w]
    corpus_tokens = corpus_w
    print(corpus_tokens)
    print(search_tokens)

    # Prepare a dictionary and a corpus.
    dictionary = corpora.Dictionary(corpus_tokens)

    # `model` is a module-level Word2Vec instance in the original project.
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus_tokens]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus,
                                        similarity_matrix,
                                        num_best=10)

    # Compute soft cosine similarity for each comma-separated search term.
    for t in search_tokens:
        for e in t.split(','):
            match = {}
            e = e.strip()
            lkup = [e]
            try:
                result = docsim_index[dictionary.doc2bow(lkup)]
            except Exception:
                result = [(0, 0)]
            print(f"looking for {lkup}, result {result}")
            if len(result) and result[0][1] > 0.5:
                match['word'] = e.split()
                match['value'] = str(result)
                rv['result'].append(match)
                bmatch = True
    return rv if bmatch else None
Example #24
    def test_positive_definite(self):
        """Test the positive_definite parameter of the matrix constructor."""
        negative_index = UniformTermSimilarityIndex(self.dictionary, term_similarity=-0.5)
        matrix = SparseTermSimilarityMatrix(
            negative_index, self.dictionary, nonzero_limit=2).matrix.todense()
        expected_matrix = numpy.array([
            [1.0, -.5, -.5, 0.0, 0.0],
            [-.5, 1.0, 0.0, -.5, 0.0],
            [-.5, 0.0, 1.0, 0.0, 0.0],
            [0.0, -.5, 0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0]])
        self.assertTrue(numpy.all(expected_matrix == matrix))

        matrix = SparseTermSimilarityMatrix(
            negative_index, self.dictionary, nonzero_limit=2, positive_definite=True).matrix.todense()
        expected_matrix = numpy.array([
            [1.0, -.5, 0.0, 0.0, 0.0],
            [-.5, 1.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0]])
        self.assertTrue(numpy.all(expected_matrix == matrix))
Example #25
def compute_msg_dist_matrix(data):
    lst_notifications = data
    data_2 = [d.split() for d in lst_notifications]
    # Train word-vectors on the tokenized notifications (not on the raw strings,
    # which Word2Vec would treat as character sequences).
    model = Word2Vec(data_2, min_count=1)
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    dictionary = Dictionary(data_2)
    bow_corpus = [dictionary.doc2bow(document) for document in data_2]
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix)
    # Iterating over the index yields each document's similarities to the whole
    # corpus, giving an n x n matrix; convert similarities to distances.
    soft_cosine_distance_matrix = 1 - np.array(list(docsim_index))
    return soft_cosine_distance_matrix
Example #26
def calculate_softcosine_w2v(test_data):
    data = [i.split() for i in (test_data.text).tolist()]
    dictionary = corpora.Dictionary(data)
    corpus = [dictionary.doc2bow(d) for d in data]

    similarity_index = WordEmbeddingSimilarityIndex(w2v_model)
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                   dictionary)

    softsim_w2v_matrix = np.empty(shape=(len(data), len(data))) * np.nan
    for d1 in range(0, len(data)):
        for d2 in range(0, len(data)):
            softsim_w2v_matrix[d1, d2] = similarity_matrix.inner_product(
                corpus[d1], corpus[d2], normalized=True)

    doc_sim_max_index, doc_sim_max_values = calculate_max_similarity(
        softsim_w2v_matrix)
    softsim_w2v_df = export_result(test_data, doc_sim_max_index,
                                   doc_sim_max_values, 'softsim_w2v')
    print("Soft cosine similarity using w2v vectors has been calculated.")
    return softsim_w2v_df
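Since `normalized=True` makes the measure symmetric, a variation (a sketch, not the original code) can fill only the upper triangle and halve the work; the diagonal is 1.0 for any non-empty document:

    n = len(data)
    softsim_w2v_matrix = np.eye(n)
    for d1 in range(n):
        for d2 in range(d1 + 1, n):
            score = similarity_matrix.inner_product(corpus[d1], corpus[d2],
                                                    normalized=True)
            softsim_w2v_matrix[d1, d2] = softsim_w2v_matrix[d2, d1] = score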
Example #27
    def test_symmetric(self):
        """Test the symmetric parameter of the matrix constructor."""
        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary).matrix.todense()
        self.assertTrue(numpy.all(matrix == matrix.T))

        matrix = SparseTermSimilarityMatrix(
            self.index, self.dictionary, nonzero_limit=1).matrix.todense()
        expected_matrix = numpy.array([
            [1.0, 0.5, 0.0, 0.0, 0.0],
            [0.5, 1.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0]])
        self.assertTrue(numpy.all(expected_matrix == matrix))

        matrix = SparseTermSimilarityMatrix(
            self.index, self.dictionary, nonzero_limit=1, symmetric=False).matrix.todense()
        expected_matrix = numpy.array([
            [1.0, 0.5, 0.5, 0.5, 0.5],
            [0.5, 1.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0]])
        self.assertTrue(numpy.all(expected_matrix == matrix))
Example #28
    def train(self, sentences):
        """Train a word2vec model with sentences"""

        dictionary = Dictionary(sentences)

        ft = Word2Vec(sentences,
                      workers=cpu_count(),
                      min_count=5,
                      size=300,
                      seed=12345)

        index = WordEmbeddingSimilarityIndex(ft.wv)
        matrix = SparseTermSimilarityMatrix(index, dictionary)

        self.dictionary = dictionary
        self.ft = ft
        self.matrix = matrix
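A matching save method (a sketch, assuming `import os` and the same file names that the load() method in Example #15 expects) would persist these components:

    def save(self, directory):
        """Save model files to a directory (sketch; counterpart of load())."""
        os.makedirs(directory, exist_ok=True)
        self.ft.save(os.path.join(directory, "w2v.model"))
        self.dictionary.save(os.path.join(directory, "dict.model"))
        self.matrix.save(os.path.join(directory, "stsm.model"))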
Example #29
    def computeDocumentSimilarityIndex(self, corpus):
        """
            Compute the similarity matrix of the model

            Args:
                corpus: gensim Dictionary used to build the term similarity matrix

            Returns:
                SoftCosineSimilarity instance
        """

        if self.wordEmbedding is None:
            self.wordEmbedding = WordEmbeddingSimilarityIndex(self.model)

        # create similarity matrix, update flags
        simMatrix = SparseTermSimilarityMatrix(self.wordEmbedding, corpus)
        return SoftCosineSimilarity([x.sentence for x in self.itemScores], simMatrix)
Example #30
    def get_embedding_files(self, num_best=10):
        """
        Get the dictionary, bow_corpus, similarity matrix and docsim index pre-trained on all image tags.
        """
        # embeddings
        try:
            with open(f'{constants.EMBEDDING_DIR}/soft_cosine.pkl', "rb") as f:
                self.dictionary, self.bow_corpus, self.similarity_matrix, _ = pickle.load(
                    f)
            self.docsim_index = SoftCosineSimilarity(self.bow_corpus,
                                                     self.similarity_matrix,
                                                     num_best=num_best)

        except FileNotFoundError:
            print(
                f'no file found, training word2vec to get bow_corpus, similarity matrix and docsim index'
            )
            # read in all tags
            try:
                with open(f'{constants.DATA_DIR}/all_img_tags.pkl',
                          'rb') as fp:
                    all_img_tags_lower = pickle.load(fp)
            except FileNotFoundError:
                print(
                    f'no file found at {constants.DATA_DIR}/all_img_tags.pkl')
            model = Word2Vec(all_img_tags_lower, size=20,
                             min_count=1)  # train word2vec
            termsim_index = WordEmbeddingSimilarityIndex(model.wv)
            self.dictionary = Dictionary(all_img_tags_lower)
            self.bow_corpus = [
                self.dictionary.doc2bow(document)
                for document in all_img_tags_lower
            ]
            self.similarity_matrix = SparseTermSimilarityMatrix(
                termsim_index, self.dictionary)  # construct similarity matrix
            # 10 (default) most similar image tag vectors
            self.docsim_index = SoftCosineSimilarity(self.bow_corpus,
                                                     self.similarity_matrix,
                                                     num_best=num_best)
            print(
                f'Saving bow_corpus, similarity matrix and docsim index to {constants.EMBEDDING_DIR}'
            )
            with open(f'{constants.EMBEDDING_DIR}/soft_cosine.pkl', "wb") as f:
                pickle.dump((self.dictionary, self.bow_corpus,
                             self.similarity_matrix, self.docsim_index), f)
Example #31
    def calculate_distance(self, query_string, documents):

        def preprocess(doc):
            # Tokenize, clean up input document string
            doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
            doc = sub(r'<[^<>]+(>|$)', " ", doc)
            doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
            doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
            return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in STOPWORDS]

        # Preprocess the documents, including the query string
        corpus = [preprocess(document) for document in documents]
        query = preprocess(query_string)


        # Load the model: this is a big file, can take a while to download and open
        glove = api.load("glove-wiki-gigaword-50")
        similarity_index = WordEmbeddingSimilarityIndex(glove)

        # Build the term dictionary, TF-idf model

        dictionary = Dictionary(corpus+[query])
        tfidf = TfidfModel(dictionary=dictionary)

        # Create the term similarity matrix.  
        similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

        query_tf = tfidf[dictionary.doc2bow(query)]

        index = SoftCosineSimilarity(
                    tfidf[[dictionary.doc2bow(document) for document in corpus]],
                    similarity_matrix)

        doc_similarity_scores = index[query_tf]

        # Output the sorted similarity scores and documents
        sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
        if len(documents) > 1:
            for idx in sorted_indexes:
                print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
        # print(doc_similarity_scores)
        return doc_similarity_scores
Example #32
    def test_inner_product(self):
        """Test the inner product."""

        matrix = SparseTermSimilarityMatrix(
            UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5), self.dictionary)

        # check zero vectors work as expected
        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])

        self.assertEqual(0.0, matrix.inner_product([], vec2))
        self.assertEqual(0.0, matrix.inner_product(vec1, []))
        self.assertEqual(0.0, matrix.inner_product([], []))

        self.assertEqual(0.0, matrix.inner_product([], vec2, normalized=True))
        self.assertEqual(0.0, matrix.inner_product(vec1, [], normalized=True))
        self.assertEqual(0.0, matrix.inner_product([], [], normalized=True))

        # check that real-world vectors work as expected
        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = 0.0
        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
        result = matrix.inner_product(vec1, vec2)
        self.assertAlmostEqual(expected_result, result, places=5)

        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = matrix.inner_product(vec1, vec2)
        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
        result = matrix.inner_product(vec1, vec2, normalized=True)
        self.assertAlmostEqual(expected_result, result, places=5)

        # check that real-world (vector, corpus) pairs work as expected
        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = 0.0
        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
        expected_result = numpy.full((1, 2), expected_result)
        result = matrix.inner_product(vec1, [vec2] * 2)
        self.assertTrue(isinstance(result, numpy.ndarray))
        self.assertTrue(numpy.allclose(expected_result, result))

        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = matrix.inner_product(vec1, vec2)
        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
        expected_result = numpy.full((1, 2), expected_result)
        result = matrix.inner_product(vec1, [vec2] * 2, normalized=True)
        self.assertTrue(isinstance(result, numpy.ndarray))
        self.assertTrue(numpy.allclose(expected_result, result))

        # check that real-world (corpus, vector) pairs work as expected
        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = 0.0
        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
        expected_result = numpy.full((3, 1), expected_result)
        result = matrix.inner_product([vec1] * 3, vec2)
        self.assertTrue(isinstance(result, numpy.ndarray))
        self.assertTrue(numpy.allclose(expected_result, result))

        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = matrix.inner_product(vec1, vec2)
        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
        expected_result = numpy.full((3, 1), expected_result)
        result = matrix.inner_product([vec1] * 3, vec2, normalized=True)
        self.assertTrue(isinstance(result, numpy.ndarray))
        self.assertTrue(numpy.allclose(expected_result, result))

        # check that real-world corpora work as expected
        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = 0.0
        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
        expected_result = numpy.full((3, 2), expected_result)
        result = matrix.inner_product([vec1] * 3, [vec2] * 2)
        self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
        self.assertTrue(numpy.allclose(expected_result, result.todense()))

        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = matrix.inner_product(vec1, vec2)
        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
        expected_result = numpy.full((3, 2), expected_result)
        result = matrix.inner_product([vec1] * 3, [vec2] * 2, normalized=True)
        self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
        self.assertTrue(numpy.allclose(expected_result, result.todense()))
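The expected values in this test follow the soft cosine definition: the inner product is v1' * S * v2, and the normalized variant divides by sqrt(v1' * S * v1) * sqrt(v2' * S * v2). A dense numpy sketch of the same arithmetic, restricted to the three terms that actually occur:

import numpy

# Terms: (government, denied, holiday); every distinct pair has similarity 0.5,
# matching UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5).
S = numpy.array([[1.0, 0.5, 0.5],
                 [0.5, 1.0, 0.5],
                 [0.5, 0.5, 1.0]])
v1 = numpy.array([2.0, 1.0, 0.0])  # "government" x2, "denied" x1
v2 = numpy.array([1.0, 0.0, 1.0])  # "government" x1, "holiday" x1

inner = v1 @ S @ v2  # 4.0, the expected_result computed term by term above
normalized = inner / (numpy.sqrt(v1 @ S @ v1) * numpy.sqrt(v2 @ S @ v2))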