Exemplo n.º 1
0
    def _setup_model(self, model):
        """Resolve the word-vector model and build the term similarity index.

        Args:
            model: One of

                * a ``Word2VecKeyedVectors`` instance, which is used as-is;
                * a ``str``, which is passed to ``api.load`` as the model name;
                * ``None``, in which case ``self.default_model`` is loaded.

        Raises:
            ValueError: If ``model`` is none of the above.

        On success ``self.model``, ``self.similarity_index`` and
        ``self.model_ready`` are set.
        """
        if isinstance(model, Word2VecKeyedVectors):
            # The caller supplied ready-made vectors; nothing to load.
            self.model = model
        else:
            use_default = model is None
            if not use_default and not isinstance(model, str):
                raise ValueError("Unable to load word vector model")

            if self.verbose:
                if use_default:
                    print(
                        f"Loading default GloVe word vector model: {self.default_model}"
                    )
                else:
                    print(f"Loading word vector model: {model}")
            self.model = api.load(self.default_model if use_default else model)
            if self.verbose:
                print("Model loaded Succesfully")

        self.similarity_index = WordEmbeddingSimilarityIndex(self.model)
        self.model_ready = True
Exemplo n.º 2
0
Arquivo: asn3.py Projeto: dddd999/asn3
def W2VH():
    """Print the soft cosine similarity of the two halves of Brown 'mystery'.

    The lower-cased words of the Brown corpus category ``'mystery'`` are split
    into two halves, English stop words are removed from each half, and the
    normalized soft cosine similarity of the two bag-of-words vectors is
    printed as ``= <value>``.
    """
    # Keep the corpus as a list of tokens.  The previous version concatenated
    # ``str([...])`` of every word into one big string, so the halves, the
    # stop-word filter and the dictionary all operated on single characters of
    # a list repr instead of on words.
    tokens = [w.lower() for w in brown.words(categories='mystery')]
    middle = len(tokens) // 2

    # A set gives O(1) membership tests for the filtering below.
    stop_words = set(stopwords.words('english'))
    docbrown1 = [w for w in tokens[:middle] if w not in stop_words]
    docbrown2 = [w for w in tokens[middle:] if w not in stop_words]

    documents = [docbrown1, docbrown2]
    dictionary = corpora.Dictionary(documents)

    bow1 = dictionary.doc2bow(docbrown1)
    bow2 = dictionary.doc2bow(docbrown2)

    # NOTE(review): the embeddings are trained on gensim's tiny `common_texts`
    # sample, not on the Brown documents, so few dictionary terms will have a
    # vector. Confirm this is intended; training on `documents` may be wanted.
    model = Word2Vec(common_texts, size=20, min_count=1)
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

    similarity = similarity_matrix.inner_product(bow1, bow2, normalized=True)
    print('= %.4f' % similarity)
def create_softcosine_resourse(model_source, all_in_sentence):
    """Create the resources needed for soft cosine similarity.

    Args:
        model_source: Location handed to ``gensim.models.Word2Vec.load``.
        all_in_sentence: Documents handed to ``gensim.corpora.Dictionary``.

    Returns:
        A tuple ``(dictionary, similarity_matrix)``.
    """
    vocabulary = gensim.corpora.Dictionary(all_in_sentence)
    word_vectors = gensim.models.Word2Vec.load(model_source).wv
    term_matrix = SparseTermSimilarityMatrix(
        WordEmbeddingSimilarityIndex(word_vectors), vocabulary)
    return vocabulary, term_matrix
Exemplo n.º 4
0
def createW2VecIndex(reference_dict):
    """Train Word2Vec on the reference terms and build a soft-cosine index.

    Args:
        reference_dict: Iterable of term strings; each one is tokenized with
            ``word_tokenize`` and used as one training document.

    Returns:
        A tuple ``(docsim_index, dictionary)``.  The same tuple is also pickled
        to ``./models/W2VecIndexes.bin``.
    """
    import pickle
    import time
    from gensim.corpora import Dictionary
    from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
    from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix

    print("Prepare Word2Vec model")
    t1 = time.time()

    corpus = [word_tokenize(term) for term in reference_dict]
    model = Word2Vec(corpus, size=20, min_count=1)  # train word-vectors
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    dictionary = Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(document) for document in corpus]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus,
                                        similarity_matrix,
                                        num_best=3)

    t2 = time.time()
    print(" W2v index and dictionary in ", (t2 - t1) / 60, " minutes")

    # Use a context manager so the file is flushed and closed even when
    # pickling fails; the handle was previously never closed.
    with open("./models/W2VecIndexes.bin", 'wb') as f:
        pickle.dump((docsim_index, dictionary), f)
    return docsim_index, dictionary
Exemplo n.º 5
0
    def __init__(self, documents):
        """Prepare GloVe-based soft-cosine resources for ``documents``.

        Args:
            documents: Non-empty sequence of documents.  When the first element
                is a list, every list element is treated as a list of string
                tokens and joined with spaces (non-list elements are dropped);
                otherwise every element is converted with ``str``.

        Sets ``self.documents``, ``self.corpus``, ``self.similarity_index``,
        ``self.dictionary``, ``self.tfidf`` and ``self.similarity_matrix``.
        """
        print("Initializing GloVe")
        if isinstance(documents[0], list):
            print("It is a list")
            # Join the tokens into a plain string.  Wrapping the joined text in
            # a one-element list and calling str() on it (as done previously)
            # stored the list repr -- "['some text']" with escaped quotes and
            # backslashes -- instead of the text itself.
            documents = [" ".join(document) for document in documents
                         if isinstance(document, list)]
        else:
            documents = [str(document) for document in documents]

        self.corpus = [preprocess(document) for document in documents]
        self.documents = documents

        # Build the similarity matrix holding the similarity between each pair
        # of words, weighted using the term frequency.
        # Load the model: this is a big file, can take a while to download and open
        glove = api.load("glove-wiki-gigaword-50")
        print("Document loaded")
        self.similarity_index = WordEmbeddingSimilarityIndex(glove)
        self.dictionary = Dictionary(self.corpus)
        self.tfidf = TfidfModel(dictionary=self.dictionary)
        print("Model is running")

        # Create the term similarity matrix.
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, self.dictionary, self.tfidf)
        print("Everything has been initialized")
 def initializeSimilarityMatrix(self):
     """Build the term index from ``self.w2v_model`` and the similarity matrix.

     Sets ``self.similarity_index`` and ``self.similarity_matrix``; the matrix
     is built from ``self.dictionary`` and ``self.tfidf`` with
     ``nonzero_limit=100``.
     """
     term_index = WordEmbeddingSimilarityIndex(self.w2v_model)
     self.similarity_index = term_index
     self.similarity_matrix = SparseTermSimilarityMatrix(
         term_index, self.dictionary, self.tfidf, nonzero_limit=100)
Exemplo n.º 7
0
    def compute_sim_matrix(self):
        """Train Word2Vec on ``self.questions`` and build the soft-cosine index.

        Sets ``self.dictionary``, ``self.tfidf`` and ``self.docsim_index``.
        The index holds the TF-IDF weighted bag-of-words of every question and
        is created with ``num_best=10``.

        Only Word2Vec is used here; an earlier sketch that switched to FastText
        based on ``self.model_type`` was never enabled.
        """
        questions = self.questions

        vocabulary = Dictionary(questions)
        self.dictionary = vocabulary
        weighting = TfidfModel(dictionary=vocabulary)
        self.tfidf = weighting

        embeddings = Word2Vec(questions,
                              workers=cpu_count(),
                              min_count=5,
                              size=300,
                              seed=12345)

        term_matrix = SparseTermSimilarityMatrix(
            WordEmbeddingSimilarityIndex(embeddings.wv),
            vocabulary,
            weighting,
            nonzero_limit=100)

        # Bag-of-words of each question, re-weighted by TF-IDF.
        weighted_corpus = [
            weighting[vocabulary.doc2bow(question)] for question in questions
        ]

        self.docsim_index = SoftCosineSimilarity(weighted_corpus,
                                                 term_matrix,
                                                 num_best=10)
Exemplo n.º 8
0
def get_sim_index(wv_model, bow_corpus, dictionary):
    """Return a soft-cosine document index over ``bow_corpus``.

    Args:
        wv_model: Model whose ``.wv`` attribute supplies the word vectors.
        bow_corpus: Bag-of-words documents to index.
        dictionary: Dictionary used to build the term similarity matrix.

    Returns:
        A ``SoftCosineSimilarity`` index created with ``num_best=10``.
    """
    term_matrix = SparseTermSimilarityMatrix(
        WordEmbeddingSimilarityIndex(wv_model.wv), dictionary)
    return SoftCosineSimilarity(bow_corpus, term_matrix, num_best=10)
Exemplo n.º 9
0
def prepare_index(dictionary, model, tfidf, documents):
    """Return the soft-cosine index stored in ``soft_cosine.index``.

    When the file does not exist yet, the index is first built from the
    TF-IDF weighted bag-of-words of ``documents`` and saved.  In both cases
    the returned object is the one loaded back from the file.

    Args:
        dictionary: Dictionary providing ``doc2bow``.
        model: Model whose ``.wv`` attribute supplies the word vectors.
        tfidf: TF-IDF model used for the matrix and to weight the corpus.
        documents: Tokenized documents to index.
    """
    index_path = 'soft_cosine.index'
    if os.path.isfile(index_path):
        return SoftCosineSimilarity.load(index_path)

    term_index = WordEmbeddingSimilarityIndex(model.wv)
    term_matrix = SparseTermSimilarityMatrix(term_index, dictionary, tfidf)
    bow_corpus = [dictionary.doc2bow(document) for document in documents]
    SoftCosineSimilarity(tfidf[bow_corpus], term_matrix).save(index_path)

    return SoftCosineSimilarity.load(index_path)
Exemplo n.º 10
0
    def __init__(self, cut_off=0.2, cleanup_urls=True, nltk_tokenizer=False):
        """Build the soft-cosine index over ``self.corpus``.

        Args:
            cut_off: Accepted but not used by this initializer.
            cleanup_urls: Forwarded to the parent initializer.
            nltk_tokenizer: Forwarded to the parent initializer.

        Reads ``self.w2vmodel`` and ``self.corpus`` (not set here; presumably
        by the parent initializer) and sets ``self.dictionary`` and
        ``self.softcosinesimilarity``.
        """
        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)

        term_index = WordEmbeddingSimilarityIndex(self.w2vmodel.wv)
        vocabulary = Dictionary(self.corpus)
        self.dictionary = vocabulary

        bow_corpus = [vocabulary.doc2bow(tokens) for tokens in self.corpus]

        self.softcosinesimilarity = SoftCosineSimilarity(
            bow_corpus,
            SparseTermSimilarityMatrix(term_index, vocabulary),
            num_best=10,
        )
Exemplo n.º 11
0
def glove_score_1v1(query_string, documents):
    """Score ``documents`` against ``query_string`` with GloVe soft cosine.

    Adapted from the gensim soft cosine tutorial:
    https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb

    Args:
        query_string: Free-text query.
        documents: Sequence of free-text documents to compare with the query.

    Returns:
        The similarity scores produced by the index for the query, one per
        document and in document order.  The query and the ten best-scoring
        documents are also printed.
    """
    # Tokenize the documents and the query the same way.
    corpus = [preprocess(text) for text in documents]
    query = preprocess(query_string)

    # Load the model: this is a big file, can take a while to download and open
    term_index = WordEmbeddingSimilarityIndex(api.load("glove-wiki-gigaword-50"))

    print("Everything has been initialized")
    # The dictionary must know the query terms as well as the document terms.
    dictionary = Dictionary(corpus + [query])
    tfidf = TfidfModel(dictionary=dictionary)

    # Term similarity matrix: similarity between each pair of words, weighted
    # using the term frequency.
    term_matrix = SparseTermSimilarityMatrix(term_index, dictionary, tfidf)

    # Soft cosine similarity between the query and each document.  Unlike the
    # regular cosine similarity (zero for vectors with no overlapping terms),
    # it takes word similarity into account as well.
    weighted_query = tfidf[dictionary.doc2bow(query)]
    weighted_corpus = tfidf[[dictionary.doc2bow(tokens) for tokens in corpus]]
    doc_similarity_scores = SoftCosineSimilarity(weighted_corpus,
                                                 term_matrix)[weighted_query]

    # Report the ten best matches, best first.
    ranking = np.argsort(doc_similarity_scores)[::-1]
    print("Mentee values: {}".format(query_string))
    for idx in ranking[:10]:
        print(
            f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
    return doc_similarity_scores
Exemplo n.º 12
0
    def calculate_soft_cosine_similarity(self, topic_models, sentences, *args,
                                         **kwargs):
        """Attach each sentence to its most similar topic by soft cosine.

        Args:
            topic_models: Sequence of topic strings.
            sentences: Sentence strings.  Iterated twice (once to build the
                dictionary, once to score), so it must be re-iterable.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            A dict mapping every topic to the list of original sentences whose
            best similarity was reached with that topic and exceeds 0.3.  On
            ties the earliest topic wins.
        """
        topic_claim_relations = {topic: [] for topic in topic_models}

        documents = [topic.lower().split() for topic in topic_models]
        documents.extend(sentence.lower().split() for sentence in sentences)
        dictionary = corpora.Dictionary(documents)

        w2v_model = api.load("glove-wiki-gigaword-100")
        similarity_index = WordEmbeddingSimilarityIndex(w2v_model)
        similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                       dictionary)

        # Loop-invariant work, hoisted: the stop-word list used to be reloaded
        # for every sentence and the bag-of-words vectors rebuilt for every
        # (sentence, topic) pair.  A set also makes membership tests O(1).
        stop_words = set(stopwords.words('english'))

        def to_bow(text):
            # Lower-case, whitespace-tokenize, drop stop words, convert to BoW.
            tokens = [w for w in text.lower().split() if w not in stop_words]
            return dictionary.doc2bow(tokens)

        topic_bows = [(topic, to_bow(topic)) for topic in topic_models]

        for sentence in sentences:
            sentence_bow = to_bow(sentence)
            best_cosine_result = 0
            matched_topic = None

            for topic, topic_bow in topic_bows:
                similarity = similarity_matrix.inner_product(topic_bow,
                                                             sentence_bow,
                                                             normalized=True)
                print('similarity = %.4f' % similarity)

                # Strict comparison keeps the first topic on ties.
                if similarity > best_cosine_result:
                    best_cosine_result = similarity
                    matched_topic = topic

            # best > 0.3 implies matched_topic was assigned in this iteration.
            if best_cosine_result > 0.3:
                topic_claim_relations[matched_topic].append(sentence)

        return topic_claim_relations
def find_similarity(search_w, corpus_w):
    """Look up comma-separated search terms in a soft-cosine index of a corpus.

    Args:
        search_w: String of comma-separated search terms.
        corpus_w: Tokenized documents (passed to ``corpora.Dictionary`` and
            converted to bag-of-words for the index).

    Returns:
        ``{'result': [{'word': [...], 'value': '<index result>'}, ...]}`` with
        one entry per term whose best match scores above 0.5, or ``None`` when
        no term matches.

    Uses the module-level ``model`` for the word vectors.
    """
    search_tokens = [search_w]
    print(corpus_w)
    print(search_tokens)

    # Prepare a dictionary and a corpus.
    dictionary = corpora.Dictionary(corpus_w)

    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus_w]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus,
                                        similarity_matrix,
                                        num_best=10)

    # Compute soft cosine similarity for every comma-separated term.
    matches = []
    for term in search_w.split(','):
        term = term.strip()
        lkup = [term]
        try:
            result = docsim_index[dictionary.doc2bow(lkup)]
        except Exception:
            # Best effort: a failed lookup counts as "no match".  Catch
            # Exception rather than using a bare except so KeyboardInterrupt
            # and SystemExit still propagate.
            result = [(0, 0)]
        print(f"looking for {lkup}, result {result}")
        if len(result) and result[0][1] > 0.5:
            matches.append({'word': term.split(), 'value': str(result)})

    return {'result': matches} if matches else None
Exemplo n.º 14
0
    def __init__(self, model):
        """
            Creates the class.

            Args:
                model: word-embedding model; it is stored as-is and handed to
                    WordEmbeddingSimilarityIndex
        """

        # public properties
        self.itemScores, self.dictionary = [], corpora.Dictionary()
        self.model = model
        # term similarity index built on top of the supplied model
        self.wordEmbedding = WordEmbeddingSimilarityIndex(model)
    def _setup_model(self):
        """Load the model at ``self.default_model`` and index its vectors.

        The model is read with ``load_facebook_model``; only its ``.wv``
        vectors are kept in ``self.model``.  Also sets
        ``self.similarity_index`` and flips ``self.model_ready`` to ``True``.
        Progress messages are printed when ``self.verbose`` is truthy.
        """
        if self.verbose:
            print('Loading model')

        self.model = load_facebook_model(self.default_model).wv

        if self.verbose:
            print('Model loaded')

        self.similarity_index = WordEmbeddingSimilarityIndex(self.model)
        self.model_ready = True
def compute_msg_dist_matrix(data):
    """Return the pairwise soft-cosine distance matrix of the messages.

    Args:
        data: Iterable of message strings; each is tokenized on whitespace.

    Returns:
        A numpy array holding ``1 - similarity`` for the similarities obtained
        by iterating the soft-cosine index built over all messages.
    """
    tokenized = [message.split() for message in data]

    # Train on token lists.  Word2Vec expects each sentence to be a list of
    # tokens; given the raw strings (as before) it treats every character as a
    # word, so none of the dictionary terms below would have a vector.
    model = Word2Vec(tokenized, min_count=1)  # train word-vectors
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)

    dictionary = Dictionary(tokenized)
    bow_corpus = [dictionary.doc2bow(document) for document in tokenized]
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix)

    # Distance is the complement of similarity.
    return 1 - np.array(docsim_index)
Exemplo n.º 17
0
    def __init__(self, params):
        """Load the trained Word2Vec model and set up the similarity resources.

        Args:
            params: Mapping forwarded to the parent initializer; its
                ``'path_to_trained_model'`` entry is the path given to
                ``gensim.models.Word2Vec.load``.

        Sets ``self.new_model``, ``self.similarity_index``,
        ``self.similarity_matrix`` and ``self.dict_distance_dispatcher``.
        """
        super().__init__(params)

        model_path = params['path_to_trained_model']
        self.new_model = gensim.models.Word2Vec.load(model_path)
        # Normalizes the vectors in the word2vec class.
        self.new_model.init_sims(replace=True)

        # Term index: computes cosine similarities between word embeddings and
        # retrieves the closest embeddings for a given word embedding.
        term_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        self.similarity_index = term_index

        # Term similarity matrix used to compute the Soft Cosine Measure.
        # self.dictionary is not assigned here; presumably the parent
        # initializer provides it.
        self.similarity_matrix = SparseTermSimilarityMatrix(
            term_index, self.dictionary)

        # Metric -> bound method implementing it.
        self.dict_distance_dispatcher = dict([
            (DistanceMetric.COS, self.cos_scipy),
            (SimilarityMetric.Pearson, self.pearson_abs_scipy),
            (DistanceMetric.WMD, self.wmd_gensim),
            (DistanceMetric.SCM, self.scm_gensim),
        ])
Exemplo n.º 18
0
    def train(self, sentences):
        """Train a word2vec model with sentences.

        Args:
            sentences: Tokenized sentences.  They are consumed twice (for the
                dictionary and for Word2Vec).

        Sets ``self.dictionary``, ``self.ft`` (the Word2Vec model) and
        ``self.matrix`` (the term similarity matrix) once everything has been
        built.
        """
        vocabulary = Dictionary(sentences)

        w2v = Word2Vec(sentences,
                       workers=cpu_count(),
                       min_count=5,
                       size=300,
                       seed=12345)

        term_matrix = SparseTermSimilarityMatrix(
            WordEmbeddingSimilarityIndex(w2v.wv), vocabulary)

        self.dictionary, self.ft, self.matrix = vocabulary, w2v, term_matrix
Exemplo n.º 19
0
    def computeDocumentSimilarityIndex(self, corpus):
        """
            Build the soft cosine document index of the model

            Args:
                corpus: dictionary used to create the term similarity matrix

            Returns:
                SoftCosineSimilarity instance over the `sentence` of every
                entry in self.itemScores
        """

        # lazily create the word-embedding index on first use
        embedding = self.wordEmbedding
        if embedding is None:
            embedding = WordEmbeddingSimilarityIndex(self.model)
            self.wordEmbedding = embedding

        term_matrix = SparseTermSimilarityMatrix(embedding, corpus)
        indexed_sentences = [item.sentence for item in self.itemScores]
        return SoftCosineSimilarity(indexed_sentences, term_matrix)
Exemplo n.º 20
0
    def get_embedding_files(self, num_best=10):
        """
        Get the dictionary, bow_corpus, similarity matrix and docsim index
        pre-trained on all image tags.

        The first three are read from
        ``{constants.EMBEDDING_DIR}/soft_cosine.pkl`` when that file exists.
        Otherwise Word2Vec is trained on the tags stored in
        ``{constants.DATA_DIR}/all_img_tags.pkl`` and the results are pickled
        to the embedding file.  In both cases ``self.docsim_index`` is built
        with the requested ``num_best``.

        Args:
            num_best: Number of best matches the docsim index returns.

        Raises:
            FileNotFoundError: If neither the embedding file nor the tag file
                exists.
        """
        # embeddings
        try:
            # NOTE(security): pickle.load can execute arbitrary code; these
            # files must come from a trusted location.
            with open(f'{constants.EMBEDDING_DIR}/soft_cosine.pkl', "rb") as f:
                self.dictionary, self.bow_corpus, self.similarity_matrix, _ = pickle.load(
                    f)
            self.docsim_index = SoftCosineSimilarity(self.bow_corpus,
                                                     self.similarity_matrix,
                                                     num_best=num_best)

        except FileNotFoundError:
            print(
                'no file found, training word2vec to get bow_corpus, similarity matrix and docsim index'
            )
            # read in all tags
            try:
                with open(f'{constants.DATA_DIR}/all_img_tags.pkl',
                          'rb') as fp:
                    all_img_tags_lower = pickle.load(fp)
            except FileNotFoundError:
                print(
                    f'no file found at {constants.DATA_DIR}/all_img_tags.pkl')
                # Without the tags nothing can be trained.  Re-raise instead
                # of falling through to an UnboundLocalError on
                # `all_img_tags_lower` below.
                raise
            model = Word2Vec(all_img_tags_lower, size=20,
                             min_count=1)  # train word2vec
            termsim_index = WordEmbeddingSimilarityIndex(model.wv)
            self.dictionary = Dictionary(all_img_tags_lower)
            self.bow_corpus = [
                self.dictionary.doc2bow(document)
                for document in all_img_tags_lower
            ]
            self.similarity_matrix = SparseTermSimilarityMatrix(
                termsim_index, self.dictionary)  # construct similarity matrix
            # 10 (default) most similar image tag vectors
            self.docsim_index = SoftCosineSimilarity(self.bow_corpus,
                                                     self.similarity_matrix,
                                                     num_best=num_best)
            print(
                f'Saving bow_corpus, similarity matrix and docsim index to {constants.EMBEDDING_DIR}'
            )
            # The docsim index is stored as a fourth element to keep the file
            # layout expected by the loader above, which ignores it and
            # rebuilds the index.
            with open(f'{constants.EMBEDDING_DIR}/soft_cosine.pkl', "wb") as f:
                pickle.dump((self.dictionary, self.bow_corpus,
                             self.similarity_matrix, self.docsim_index), f)
Exemplo n.º 21
0
    def calculate_distance(self, query_string, documents):
        """Score ``documents`` against ``query_string`` with soft cosine.

        Args:
            query_string: Free-text query.
            documents: Sequence of free-text documents.

        Returns:
            The similarity scores the index yields for the query, in document
            order.  When more than one document is given, the ranking is
            printed as well, best first.

        Uses the module-level ``glove`` word vectors.
        """
        # (pattern, replacement) pairs applied in order before tokenizing.
        cleanup_rules = (
            (r'<img[^<>]+(>|$)', " image_token "),
            (r'<[^<>]+(>|$)', " "),
            (r'\[img_assist[^]]*?\]', " "),
            (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token "),
        )

        def tokenize(text):
            # Clean up the markup, tokenize, and drop stop words.
            for pattern, replacement in cleanup_rules:
                text = sub(pattern, replacement, text)
            words = simple_preprocess(text, min_len=0, max_len=float("inf"))
            return [word for word in words if word not in STOPWORDS]

        # Tokenize the documents and the query the same way.
        corpus = [tokenize(text) for text in documents]
        query = tokenize(query_string)

        term_index = WordEmbeddingSimilarityIndex(glove)

        # Term dictionary (documents plus query) and TF-IDF model.
        dictionary = Dictionary(corpus + [query])
        tfidf = TfidfModel(dictionary=dictionary)

        # Term similarity matrix.
        term_matrix = SparseTermSimilarityMatrix(term_index, dictionary, tfidf)

        weighted_query = tfidf[dictionary.doc2bow(query)]
        weighted_corpus = tfidf[[dictionary.doc2bow(tokens) for tokens in corpus]]
        doc_similarity_scores = SoftCosineSimilarity(
            weighted_corpus, term_matrix)[weighted_query]

        # Output the documents sorted by similarity score, best first.
        ranking = np.argsort(doc_similarity_scores)[::-1]
        if len(documents) > 1:
            for idx in ranking:
                print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
        return doc_similarity_scores
Exemplo n.º 22
0
    def __init__(self, documents):
        """Preprocess ``documents`` and create the GloVe term similarity index.

        Args:
            documents: Non-empty sequence of documents.  When the first element
                is a list, every list element is treated as a list of string
                tokens and joined with spaces (non-list elements are dropped);
                otherwise every element is converted with ``str``.

        Sets ``self.documents``, ``self.corpus`` and ``self.similarity_index``.
        """
        print(type(documents[0]))
        if isinstance(documents[0], list):
            print("It is a list")
            # Join the tokens into a plain string.  Wrapping the joined text in
            # a one-element list and calling str() on it (as done previously)
            # stored the list repr -- "['some text']" with escaped quotes and
            # backslashes -- instead of the text itself.
            documents = [" ".join(document) for document in documents
                         if isinstance(document, list)]
        else:
            documents = [str(document) for document in documents]

        self.corpus = [preprocess(document) for document in documents]
        self.documents = documents

        # The index below is the basis for a similarity matrix holding the
        # similarity between each pair of words.
        # Load the model: this is a big file, can take a while to download and open
        glove = api.load("glove-wiki-gigaword-50")
        self.similarity_index = WordEmbeddingSimilarityIndex(glove)
Exemplo n.º 23
0
def calculate_softcosine_w2v(test_data):
    """Compute pairwise soft cosine similarities of ``test_data.text``.

    Args:
        test_data: Object whose ``text`` attribute provides ``tolist()``
            returning strings (presumably a pandas DataFrame; confirm with the
            caller).  Each string is tokenized on whitespace.

    Returns:
        Whatever ``export_result`` returns for the best matches found by
        ``calculate_max_similarity``, labelled ``'softsim_w2v'``.

    Uses the module-level ``w2v_model`` word vectors.
    """
    data = [text.split() for text in test_data.text.tolist()]
    dictionary = corpora.Dictionary(data)
    corpus = [dictionary.doc2bow(tokens) for tokens in data]

    term_matrix = SparseTermSimilarityMatrix(
        WordEmbeddingSimilarityIndex(w2v_model), dictionary)

    # Full (not just triangular) document-by-document similarity matrix,
    # pre-filled with NaN before every cell is overwritten.
    n_docs = len(data)
    softsim_w2v_matrix = np.full((n_docs, n_docs), np.nan)
    for row, bow_a in enumerate(corpus):
        for col, bow_b in enumerate(corpus):
            softsim_w2v_matrix[row, col] = term_matrix.inner_product(
                bow_a, bow_b, normalized=True)

    best_index, best_values = calculate_max_similarity(softsim_w2v_matrix)
    softsim_w2v_df = export_result(test_data, best_index, best_values,
                                   'softsim_w2v')
    print(
        "Similarity using soft cosine similarity using w2v vectors is calculated!!"
    )
    return softsim_w2v_df
def main(chosen_model_no=0, num_items_displayed=10, use_spacy=False, use_soft_cosine_similarity=False,
         num_topics=None, no_below=5, no_above=0.5, normalize_vectors=False):
    """Rank games by similarity to a set of query appIDs with a chosen model.

    Builds a dictionary and bag-of-words corpus from the tokens returned by
    ``load_tokens()``, fits the model selected by ``chosen_model_no``, scores
    every query appID returned by ``load_benchmarked_app_ids`` against all
    documents, and prints the result through ``print_ranking``.

    Args:
        chosen_model_no: index into ``possible_model_names``
            (0 = 'tf_idf', 1-8 = LSI/RP/LDA/HDP variants, 9 = 'word2vec').
        num_items_displayed: forwarded to ``print_most_similar_sentences``.
        use_spacy: with the 'word2vec' model, score with spaCy ``Doc.similarity``
            instead of training a gensim Word2Vec model.
        use_soft_cosine_similarity: for the non-word2vec models, index with
            ``SoftCosineSimilarity`` instead of ``MatrixSimilarity``.
        num_topics: topic count for LSI, RP and LDA; ``None`` means 100.
        no_below: forwarded to ``Dictionary.filter_extremes``.
        no_above: forwarded to ``Dictionary.filter_extremes``.
        normalize_vectors: passed as ``normalize`` to ``TfidfModel`` and as
            ``replace`` to ``wv.init_sims``.

    Returns:
        None.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if num_topics is None:
        num_topics = 100

    possible_model_names = [
        'tf_idf',  # 0
        'lsi_bow', 'lsi_tf_idf',  # 1, 2
        'rp_bow', 'rp_tf_idf',  # 3, 4
        'lda_bow', 'lda_tf_idf',  # 5, 6
        'hdp_bow', 'hdp_tf_idf',  # 7, 8
        'word2vec',  # 9
    ]
    chosen_model_name = possible_model_names[chosen_model_no]
    print(chosen_model_name)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    # NOTE(review): the spaCy pipeline is loaded even when use_spacy is False.
    nlp = spacy.load('en_core_web_lg')

    documents = list(steam_tokens.values())

    # The dictionary size is printed before and after pruning extreme terms.
    dct = Dictionary(documents)
    print(len(dct))
    dct.filter_extremes(no_below=no_below, no_above=no_above)
    print(len(dct))

    corpus = [dct.doc2bow(doc) for doc in documents]

    # Pre-processing

    pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf')

    tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors)

    if pre_process_corpus_with_tf_idf:
        # Caveat: the leading underscore is important. Do not use this pre-processing if the chosen model is Tf-Idf!
        print('Corpus as Tf-Idf')
        pre_processed_corpus = tfidf_model[corpus]
    else:
        print('Corpus as Bag-of-Words')
        pre_processed_corpus = corpus

    # Model

    model = None
    wv = None
    index2word_set = None

    if chosen_model_name == 'tf_idf':
        print('Term Frequency * Inverse Document Frequency (Tf-Idf)')
        model = tfidf_model

    elif chosen_model_name.startswith('lsi'):
        print('Latent Semantic Indexing (LSI/LSA)')
        model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('rp'):
        print('Random Projections (RP)')
        model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('lda'):
        print('Latent Dirichlet Allocation (LDA)')
        model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('hdp'):
        print('Hierarchical Dirichlet Process (HDP)')
        model = HdpModel(pre_processed_corpus, id2word=dct)

    elif chosen_model_name == 'word2vec':
        # Hard-coded switch: while it is False the Google News branch below never runs.
        use_a_lot_of_ram = False

        if use_a_lot_of_ram:
            model = None

            print('Loading Word2Vec based on Google News')
            # Warning: this takes a lot of time and uses a ton of RAM!
            wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        else:
            if use_spacy:
                print('Using Word2Vec with spaCy')
            else:
                print('Training Word2Vec')

                model = Word2Vec(documents)

                wv = model.wv

        if not use_spacy:
            wv.init_sims(replace=normalize_vectors)

            # Set of known words, used to filter sentences before n_similarity.
            index2word_set = set(wv.index2word)

    else:
        print('No model specified.')
        model = None

    # Similarity index: only the non-word2vec models are queried through a gensim index.
    if chosen_model_name != 'word2vec':
        if not use_soft_cosine_similarity:
            index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct))
        else:
            # A separate Word2Vec model supplies the term similarities for soft cosine.
            w2v_model = Word2Vec(documents)
            similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
            similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model, nonzero_limit=100)
            index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix)
    else:
        index = None

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    # Document positions map to appIDs in the same order as steam_tokens.
    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    matches_as_app_ids = []

    for query_count, query_app_id in enumerate(query_app_ids):
        print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids),
                                                    query_app_id, get_app_name(query_app_id, game_names)))

        query = steam_tokens[str(query_app_id)]

        if use_spacy:
            spacy_query = Doc(nlp.vocab, query)
        else:
            spacy_query = None

        if chosen_model_name != 'word2vec':
            # Project the query into the same space as the indexed corpus.
            vec_bow = dct.doc2bow(query)
            if pre_process_corpus_with_tf_idf:
                pre_preoccessed_vec = tfidf_model[vec_bow]
            else:
                pre_preoccessed_vec = vec_bow
            vec_lsi = model[pre_preoccessed_vec]
            sims = index[vec_lsi]

            # NOTE(review): presumably the soft-cosine index (built without num_best)
            # yields one score per document, so scores are paired with their
            # positions here to match the (position, score) pairs of the other index -- confirm.
            if use_soft_cosine_similarity:
                sims = enumerate(sims)

            similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims]
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        else:
            if use_spacy:
                similarity_scores = {}
                for app_id in steam_tokens:
                    reference_sentence = steam_tokens[app_id]
                    spacy_reference = Doc(nlp.vocab, reference_sentence)
                    similarity_scores[app_id] = spacy_query.similarity(spacy_reference)
            else:
                query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set)

                similarity_scores = {}

                counter = 0
                num_games = len(steam_tokens)

                for app_id in steam_tokens:
                    counter += 1

                    # Progress message every 1000 games.
                    if (counter % 1000) == 0:
                        print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

                    reference_sentence = steam_tokens[app_id]
                    reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set)

                    # A ZeroDivisionError from n_similarity is scored as 0.
                    try:
                        similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence)
                    except ZeroDivisionError:
                        similarity_scores[app_id] = 0

        similar_app_ids = print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed,
                                                       verbose=False)
        matches_as_app_ids.append(similar_app_ids)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  only_print_banners=True)

    return
Exemplo n.º 25
0
def create_model(storage_client, json_in, video_id):
    """ Create (or load from storage) a soft cosine similarity model

    If a blob named after the lower-cased video ID already exists, the
    pickled model is downloaded and returned. Otherwise the model is built
    from the captions, pickled, uploaded under that name, and returned.

    Keywords arguments:
    storage_client -- a Storage instance, used for both download and upload
    json_in        -- json returned from the YouTube Captions API
    video_id       -- the Youtube video_id

    Returns:
    A (dictionary, index) tuple:
    - dictionary -- the dictionary of terms computed
    - index      -- the Soft Cosine Measure model
    """

    video_id = video_id.lower()

    # check if a cached blob exists for this video
    if blob_exists(storage_client, video_id):
        # retrieve blob from bucket
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(video_id)  # The blob's name is the video ID

        # download the storage pickle as a binary string
        blob_str = blob.download_as_string()
        # NOTE(security): pickle.loads can execute arbitrary code. This is
        # only acceptable while write access to the bucket is restricted to
        # trusted writers.
        dictionary, index = pickle.loads(blob_str)
        return dictionary, index

    # download stop_words and glove
    stop_words, glove = download_resources()

    # Create Glove similarity Index
    similarity_index = WordEmbeddingSimilarityIndex(glove)

    # parse json captions into document form
    documents = processInput(json_in)

    # create a corpus from documents
    corpus = [preprocess(document, stop_words) for document in documents]

    # create dictionary from documents
    dictionary = Dictionary(corpus)
    tfidf = TfidfModel(dictionary=dictionary)

    # create a term similarity matrix
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                   dictionary, tfidf)

    # Compute Soft Cosine Measure between documents
    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in corpus]],
        similarity_matrix)

    # save index and dictionary with the caller's client; the parameter used
    # to be overwritten here by a fresh storage.Client(), so the upload could
    # use different credentials or project than the existence check above.

    # create a binary pickle representation
    bin_tuple = pickle.dumps((dictionary, index))
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(video_id)

    # save to storage
    blob.upload_from_string(bin_tuple)

    if debug_messages:
        print("Binary model with name {} and dictionary uploaded.".format(
            video_id))

    return dictionary, index
                total_examples=w2v_model.corpus_count,
                epochs=30,
                report_delay=1)

# Report the elapsed training time in minutes, rounded to two decimals
# (t is presumably the time.time() value captured before training -- confirm).
print('Time to train the model: {} mins'.format(
    round((time.time() - t) / 60, 2)))

# NOTE(review): replace=True presumably overwrites the raw vectors with their
# normalized form -- confirm against the installed gensim version.
w2v_model.init_sims(replace=True)

# Persist the trained model under the name "w2v-20newsgroups".
w2v_model.save("w2v-20newsgroups")

print(w2v_model.vector_size)

# Bare expression: the vocabulary size is computed but neither stored nor printed.
len(w2v_model.wv.vocab)

termsim_index = WordEmbeddingSimilarityIndex(w2v_model.wv)  # get termsim index


# dictionary = Dictionary(df['tokenized'])  # dictionary for model to use for indexing later
# finding a similarity matrix
def load_obj(name):
    """Return the object unpickled from the file ``<name>.pkl``.

    ``name`` is the file path without the ``.pkl`` extension.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded


# Reuse the dictionary pickled as LDADICT.pkl
dictionary = load_obj('LDADICT')
# Bag-of-words vector for every tokenized document
bow_corpus = list(map(dictionary.doc2bow, df['tokenized']))
# Term similarity matrix over the dictionary's vocabulary
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

# Testing: Case 1 Atheism vs Windows
Exemplo n.º 27
0
def main():
    """Interactively compare Brown corpus categories by document similarity.

    Repeatedly prompts for a model ("tfidf" or "word2vec") and a grouping
    ("within": one document per category, "between": each category split
    into two halves), builds the matching similarity index, and prints the
    similarity of every unordered pair of documents.

    Entering "exit" at the first prompt ends the program. Entering "return"
    at the second prompt goes back to the first prompt.

    Returns:
        None.
    """
    while True:
        choice = ""
        while choice not in ["tfidf", "word2vec", "exit"]:
            choice = input(
                "TF-IDF or Word2Vec? [TFIDF, Word2Vec, Exit]\n>").lower()

        if choice == "exit":
            break

        catType = ""
        while catType not in ["within", "between", "return"]:
            catType = input(
                "Within or between clusters? [Within, Between, Return]\n>"
            ).lower()

        if catType == "return":
            # Go back to the model prompt. This used to `break`, which quit
            # the program exactly like "exit".
            continue

        # Labels and token lists are rebuilt on every pass and kept in
        # lockstep, so position i in the corpus always belongs to keys[i].
        # The old shared dict kept keys from earlier passes, which misaligned
        # labels and corpus rows after switching between "within"/"between".
        keys = []
        texts = []
        for c in brown.categories():
            words = NormalizeWords(brown.words(categories=c))
            if catType == "within":
                keys.append(c)
                texts.append(words)
            else:
                half = len(words) // 2
                keys.append(c + "1/2")
                texts.append(words[:half])
                keys.append(c + "2/2")
                texts.append(words[half:])

        # create the corpora dictionary built from gensim
        corporadict = corpora.Dictionary(texts)
        # create a corpus for the training
        corpus = [corporadict.doc2bow(line) for line in texts]

        if choice == "tfidf":
            # create the tfidf model from our built corpus
            tfidf = TfidfModel(corpus=corpus)

            # Index the TF-IDF-weighted corpus so that indexed documents and
            # queries live in the same vector space. The index used to hold
            # raw bag-of-words vectors while queries were TF-IDF weighted.
            similarityMatrix = MatrixSimilarity(tfidf[corpus],
                                                num_features=len(corporadict))

            _print_pairwise_similarities(
                keys, lambda i: similarityMatrix[tfidf[corpus[i]]])
        else:
            word2vec = Word2Vec(brown.sents())

            # build term similarity index from our model's word vectors
            termSimilarityIndex = WordEmbeddingSimilarityIndex(word2vec.wv)

            # build sparse term similarity matrix
            sparseSimiliarityMatrix = SparseTermSimilarityMatrix(
                termSimilarityIndex, corporadict)

            # build the soft cosine similarity index over the corpus
            WV_SimilarityMatrix = SoftCosineSimilarity(
                corpus, sparseSimiliarityMatrix)

            _print_pairwise_similarities(
                keys, lambda i: WV_SimilarityMatrix[corpus[i]])


def _print_pairwise_similarities(keys, similarities_for):
    """Print the similarity of every unordered pair of labelled documents.

    keys             -- document labels, ordered like the indexed corpus
    similarities_for -- callable taking a document position and returning
                        that document's similarity to every indexed document
    """
    for i in range(len(keys) - 1):
        query_similarity = similarities_for(i)
        # Only j > i: each pair is reported once and self-similarity is skipped.
        for j in range(i + 1, len(query_similarity)):
            print(keys[i], "and", keys[j], "have a similarity of:",
                  query_similarity[j])
        print("")
Exemplo n.º 28
0
    n_topics = len(topics)

    # pre-process topic keyword lists
    topics_cleaned = list(map(clean_sentence,topics))

    # build complete dictionary
    tokenized_neg_reviews_and_topics = topics_cleaned + list(df_negative_sentences['review_sentence_cleaned'])
    neg_dictionary = corpora.Dictionary(tokenized_neg_reviews_and_topics)

    # create bag-of-words vectors
    corpus_neg_reviews = [neg_dictionary.doc2bow(text) for text in list(df_negative_sentences['review_sentence_cleaned'])]
    corpus_neg_topics = [neg_dictionary.doc2bow(text) for text in topics_cleaned]

    # build similarity matrix of word embeddings
    print('Building similarity matrix of word embeddings. Might take a few minutes...')
    termsim_index = WordEmbeddingSimilarityIndex(fasttext_model300)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index,neg_dictionary)
    print('done')

    # compute soft cosine similarity between sentences and topics
    print('Computing soft cosine similarity between sentences and topics. Might take a few minutes...')
    neg_data_topics = []
    for review_item in corpus_neg_reviews:
        review_item_topics = []
        for topic in corpus_neg_topics:
            review_item_topics.append(similarity_matrix.inner_product(review_item,topic,normalized=True))
        neg_data_topics.append(review_item_topics)
    print('done')    

    # extract topic with highest soft cosine similarity
    # I set a minimum threshold (0.10) that needs to be reached in order to assign a topic.
Exemplo n.º 29
0
        embedding1 = np.average([model.wv[token] for token in tokfreqs1],
                                axis=0,
                                weights=weights1).reshape(1, -1)
        embedding2 = np.average([model.wv[token] for token in tokfreqs2],
                                axis=0,
                                weights=weights2).reshape(1, -1)

        sim = cosine_similarity(embedding1, embedding2)[0][0]
        sims.append(sim)

    return sims


# Term similarity index backed by the Word2Vec model's keyed vectors
termsim_index = WordEmbeddingSimilarityIndex(model.wv)

# One text per dataset entry: every 'gejala' sentence, each prefixed by a space
corpus_list = []
for data in dataset:
    docs = "".join(" " + sentence for sentence in data['gejala'])
    corpus_list.append(docs)

# Token list for every document in the corpus
corpus_list_token = list(map(preprocess, corpus_list))

# Dictionary over all tokens, then a bag-of-words vector per document
dictionary = Dictionary(corpus_list_token)
bow_corpus = list(map(dictionary.doc2bow, corpus_list_token))
Exemplo n.º 30
0
    stopwords = ['the', 'and', 'are', 'a']

    # Preprocess the documents, including the query string
    corpus = [preprocess(document) for document in documents]

    file_corpus_w = open("data/corpus.pickle", 'wb')
    pickle.dump(corpus, file_corpus_w)
    file_corpus_w.close()
    print("Preprocessing finished")

    print(time.time() - start_time)
    print("Loading model")
    # Load the model: this is a big file, can take a while to download and open
    glove = api.load("glove-wiki-gigaword-50")    
    similarity_index = WordEmbeddingSimilarityIndex(glove)
    file_sim_idx_w = open("data/sim_idx.pickle", 'wb')
    pickle.dump(similarity_index, file_sim_idx_w)
    file_sim_idx_w.close()

    print("Model loaded")
    print(time.time() - start_time)
    #####################

    print("Building term dictionary and similarity matrix")
    # Build the term dictionary, TF-idf model
    dictionary = Dictionary(corpus)
    tfidf = TfidfModel(dictionary=dictionary)
    file_tfidf_w = open("data/tfidf.pickle", 'wb')
    pickle.dump(tfidf, file_tfidf_w)
    file_tfidf_w.close()