Example #1
def prepare_index(dictionary, model, tfidf, documents):
    if not os.path.isfile('soft_cosine.index'):
        similarity_index = WordEmbeddingSimilarityIndex(model.wv)
        similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                       dictionary, tfidf)
        index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in documents]],
            similarity_matrix)
        index.save('soft_cosine.index')

    return SoftCosineSimilarity.load('soft_cosine.index')
Example #2
    def get_score(self, query_string):
        if isinstance(query_string, list):
            query_string = " ".join(query_string)

        query = preprocess(query_string)
        '''
        Finally, we calculate the soft cosine similarity between the query and each of the documents. 
        Unlike the regular cosine similarity (which would return zero for vectors with no overlapping terms), 
        the soft cosine similarity considers word similarity as well.
        '''
        # Compute Soft Cosine Measure between the query and the documents.
        # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
        query_tf = self.tfidf[self.dictionary.doc2bow(query)]

        index = SoftCosineSimilarity(
            self.tfidf[[
                self.dictionary.doc2bow(document) for document in self.corpus
            ]], self.similarity_matrix)

        doc_similarity_scores = index[query_tf]

        # Output the sorted similarity scores and documents
        print("Mentee values: {}".format(query_string))

        return doc_similarity_scores
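The docstring above notes that plain cosine similarity is zero for documents with no overlapping terms, while the soft cosine measure can still score them as related. A minimal self-contained sketch of that behaviour, using a toy Word2Vec model in place of the pretrained embeddings assumed by these examples (gensim 3.x API, imports mirroring Example #3; the documents and query below are made up for illustration):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, Word2Vec, WordEmbeddingSimilarityIndex
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix

docs = [["soccer", "match"], ["weather", "forecast"]]  # toy corpus (made up)
query = ["football", "game"]                           # shares no terms with either document

model = Word2Vec(docs + [query], size=20, min_count=1, seed=1)  # toy word vectors
dictionary = Dictionary(docs + [query])
tfidf = TfidfModel(dictionary=dictionary)

termsim_index = WordEmbeddingSimilarityIndex(model.wv)
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf)

index = SoftCosineSimilarity(tfidf[[dictionary.doc2bow(d) for d in docs]], similarity_matrix)
# Plain cosine between the query and either document is 0 (no shared terms);
# the soft cosine scores printed below can be non-zero because term-term similarities contribute.
print(index[tfidf[dictionary.doc2bow(query)]])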
Example #3
def createW2VecIndex(reference_dict):
    from gensim.corpora import Dictionary
    from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
    from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
    from nltk.tokenize import word_tokenize  # word_tokenize is assumed to come from NLTK
    print("Prepare Word2Vec model")
    import time
    t1 = time.time()
    corpus = []
    #reference = []
    for term in reference_dict:
        corpus.append(word_tokenize(term))
        #reference.append(term)
    model = Word2Vec(corpus, size=20, min_count=1)  # train word-vectors
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)  #<----
    dictionary = Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(document) for document in corpus]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus,
                                        similarity_matrix,
                                        num_best=3)
    t2 = time.time()
    print(" W2v index and dictionary in ", (t2 - t1) / 60, " minutes")
    import pickle
    f = open("./models/W2VecIndexes.bin", 'wb')
    pickle.dump((docsim_index, dictionary), f)
    return docsim_index, dictionary
Example #4
    def compute_sim_matrix(self):
        '''    
        if(self.model_type.lower() == "fasttext"):
            model = FastText(self.questions) 
        else:
            model = Word2Vec(self.questions)
        '''
        self.dictionary = Dictionary(self.questions)
        self.tfidf = TfidfModel(dictionary=self.dictionary)
        word2vec_model = Word2Vec(self.questions,
                                  workers=cpu_count(),
                                  min_count=5,
                                  size=300,
                                  seed=12345)

        sim_index = WordEmbeddingSimilarityIndex(word2vec_model.wv)
        sim_matrix = SparseTermSimilarityMatrix(sim_index,
                                                self.dictionary,
                                                self.tfidf,
                                                nonzero_limit=100)
        bow_corpus = [
            self.dictionary.doc2bow(document) for document in self.questions
        ]

        tfidf_corpus = [self.tfidf[bow] for bow in bow_corpus]

        self.docsim_index = SoftCosineSimilarity(tfidf_corpus,
                                                 sim_matrix,
                                                 num_best=10)
Example #5
def get_sim_index(wv_model, bow_corpus, dictionary):
    termsim_index = WordEmbeddingSimilarityIndex(wv_model.wv)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)
    docsim_index = SoftCosineSimilarity(bow_corpus,
                                        similarity_matrix,
                                        num_best=10)

    return docsim_index
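A possible way to query the index returned by get_sim_index, assuming the same wv_model, bow_corpus and dictionary passed to the function and a hypothetical query_tokens list; with num_best=10 a lookup returns up to ten (document_position, similarity) pairs, as also used in Example #13:

docsim_index = get_sim_index(wv_model, bow_corpus, dictionary)
query_tokens = ["example", "query"]                    # hypothetical tokenised query
sims = docsim_index[dictionary.doc2bow(query_tokens)]  # e.g. [(3, 0.87), (0, 0.64), ...]
for doc_position, score in sims:
    print(doc_position, score)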
Example #6
def softcossim(query, documents):
    # Compute Soft Cosine Measure between the query and the documents.
    query = tfidf[dictionary.doc2bow(query)]
    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in documents]],
        similarity_matrix)
    similarities = index[query]
    return similarities
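softcossim above relies on module-level dictionary, tfidf and similarity_matrix objects that are not shown in the snippet. A minimal sketch of how they might be prepared, assuming documents is a list of token lists and model is an already trained Word2Vec model (the same construction appears in Examples #1 and #12):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix

dictionary = Dictionary(documents)                          # `documents`: list of token lists (assumed)
tfidf = TfidfModel(dictionary=dictionary)
similarity_index = WordEmbeddingSimilarityIndex(model.wv)   # `model`: trained Word2Vec (assumed)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)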
Example #7
    def get_embedding_files(self, num_best=10):
        """
        Get the dictionary, bow_corpus, similarity matrix and docsim index pre-trained on all image tags.
        """
        # embeddings
        try:
            with open(f'{constants.EMBEDDING_DIR}/soft_cosine.pkl', "rb") as f:
                self.dictionary, self.bow_corpus, self.similarity_matrix, _ = pickle.load(
                    f)
            self.docsim_index = SoftCosineSimilarity(self.bow_corpus,
                                                     self.similarity_matrix,
                                                     num_best=num_best)

        except FileNotFoundError:
            print(
                f'no file found, training word2vec to get bow_corpus, similarity matrix and docsim index'
            )
            # read in all tags
            try:
                with open(f'{constants.DATA_DIR}/all_img_tags.pkl',
                          'rb') as fp:
                    all_img_tags_lower = pickle.load(fp)
            except FileNotFoundError:
                print(
                    f'no file found at {constants.DATA_DIR}/all_img_tags.pkl')
            model = Word2Vec(all_img_tags_lower, size=20,
                             min_count=1)  # train word2vec
            termsim_index = WordEmbeddingSimilarityIndex(model.wv)
            self.dictionary = Dictionary(all_img_tags_lower)
            self.bow_corpus = [
                self.dictionary.doc2bow(document)
                for document in all_img_tags_lower
            ]
            self.similarity_matrix = SparseTermSimilarityMatrix(
                termsim_index, self.dictionary)  # construct similarity matrix
            # 10 (default) most similar image tag vectors
            self.docsim_index = SoftCosineSimilarity(self.bow_corpus,
                                                     self.similarity_matrix,
                                                     num_best=num_best)
            print(
                f'Saving bow_corpus, similarity matrix and docsim index to {constants.EMBEDDING_DIR}'
            )
            with open(f'{constants.EMBEDDING_DIR}/soft_cosine.pkl', "wb") as f:
                pickle.dump((self.dictionary, self.bow_corpus,
                             self.similarity_matrix, self.docsim_index), f)
Example #8
    def _softcossim(self, query: str, documents: list):
        # Compute Soft Cosine Measure between the query and each of the documents.
        query = self.tfidf[self.dictionary.doc2bow(query)]
        index = SoftCosineSimilarity(
            self.tfidf[[
                self.dictionary.doc2bow(document) for document in documents
            ]], self.similarity_matrix)
        similarities = index[query]

        return similarities
Example #9
    def similarity(self, query, documents):
        """Caclulate cosine simularity between query and all documents"""

        bow_query = self.dictionary.doc2bow(query)
        bow_docs = [
            self.dictionary.doc2bow(document) for document in documents
        ]

        index = SoftCosineSimilarity(bow_docs, self.matrix)
        similarities = index[bow_query]

        return similarities
Example #10
    def __init__(self, initializer):
        tfidf = initializer.getTfIdf()
        dictionary = initializer.getDictionary()
        query = tfidf[dictionary.doc2bow(initializer.getPreprocessedNews())]
        preprocessed_documents = initializer.getPreprocessedDocuments()
        index = SoftCosineSimilarity(
            tfidf[[
                dictionary.doc2bow(document)
                for document in preprocessed_documents
            ]], initializer.getSimilarityMatrix())
        similarities = index[query]
        self.scores = similarities[1:]
Example #11
    def __init__(self, cut_off=0.2, cleanup_urls=True, nltk_tokenizer=False):
        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)

        terms_idx = WordEmbeddingSimilarityIndex(self.w2vmodel.wv)
        self.dictionary = Dictionary(self.corpus)

        bow = [self.dictionary.doc2bow(doc) for doc in self.corpus]

        similarity_matrix = SparseTermSimilarityMatrix(terms_idx, self.dictionary)
        self.softcosinesimilarity = SoftCosineSimilarity(
            bow, similarity_matrix, num_best=10
        )
Example #12
def glove_score_1v1(query_string, documents):
    # query_string = 'Leticia has 3+ years of experience in data science. She has a background in applied mathematics and computer science and currently works as a data scientist at Ørsted. In her work, she builds condition-based algorithms to predict when their offshore wind turbines are going to fail in order to optimize daily operations. Leticia has an international upbringing and has lived in 9 different countries, and she is driven by a great work environment with diversity in the workplace. Leticia wants to become a mentor to help students in their transition to professional life and share their own experiences of studying and working abroad and succeeding as a woman in a male-dominated field. Leticia would prefer a mentee that has ambition and drive, such that she has a better understanding of where he or she wants to go and how she can help in the best way.'
    # documents = ['I would describe myself as being dedicated and curious. I am very interested in data analytics and operations research, specially in connection with logistics and planning. For my Bachelor thesis I did a simulation project with Copenhagen Malmö Port on how to optimise the logistics operations at their container-terminal, which really sparked my interest in this area. I am always interesting in learning new things and I try to take advantage of the great opportunities offered through my studies at DTU - like this mentorship or having the opportunity to go abroad for a semester. Last year I spent a semester at Hong Kong University of Science and Technology which was a big experience both academically and personally. Currently, I am working as a student assistant in Danmarks Nationalbank, and even though it is interesting getting an insight into the financial world and having to apply my skills to a different area, at some time, I would like to try something more related to my studies. I would like to be part of the program to gain more knowledge of what it is like working in the industry as a data analyst or engineer - preferably working with logistics, data analytics or operations research. I know very few engineers outside the academic world at DTU, so I would appreciate a mentor who could share some of their experiences and tips on transitioning from student to professional. I am leaning towards specialising in prescriptive analytics, so I would also be very interested in learning more about how optimisation methods and simulation studies are actually applied to real-world problems. What I hope to achieve as a mentee is to be more prepared for working in the industry and get advice on how to make smart choices regarding my studies. I would also appreciate some advice on whether to take another semester abroad during my Masters or gain more work-experience.',
    # 'My greatest ambition is to leave the world in a better state for humans to experience the quality of life than it was when I entered it. This reason lead me to choose scientific studies - general engineering in Paris at first, and then Applied Mathematics in DTU - in the hope to use technologys leverage for maximum impact. Disclaimer: I am currently not looking for a position as I am to continue working for Tomorrow, the fantastic company I am already working for I nevertheless am very interested to get some insights, from a mentor that went through a similar line of study, into how they decided on starting to work straight away vs continue in the academic world by applying for a PhD. I am also eager to learn more about what it actually means to be a professional "data scientist". How much research/theory is actually useful in day-to-day operations and what level of freedom they can have in their decisions and organisation. I am also curious to learn more about career path for data scientist. The popularity of this position is fairly recent and for this reason, career evolution for a data scientist is still rather obscure to me.']
    # 'I would describe myself as focused, structured and vigorous. My main interest is overall concrete technology. It is from the mixing recipes to the maintaining of old structures to "cure" its sickness. The topic of my bachelor project was about testing the different national and international test methods for alkali silica reactions (ASR). To find out the most optimal methods, to catch that sand and stone which could develop ASR. My master thesis is about testing if mine tailings could be used as a substitute for fly ash, which soon not will be available at the same amount as earlier. In my free time, I have been doing a lot of volunteering. I have been a coach for a handball team for 11-12 year old girls for two years. I learned a lot about coaching, planning and taught the girls to be team players. Further I have been part of the organizing committee for the study start and the council for my study line for three years. Where I further developed my competencies planning, leading and get things done. I usually take the lead when things need to be done, but I dont know if Im suited for management. I hope to get a closer look at "the real life", to get ready when I finish my thesis in January. I want to a mentee to get knowledge about the "life" after university. I would prefer a mentor who works with civil engineering, but a mentor who can taught me difference between consulting and entrepreneur firms, so I can find out what is right for me, would be a nice. I still don\'t know what exactly I can be, but I would appreciate some advice. I hope to achieve a way into the business, which could help me find a job after my thesis.']

    # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb

    # Preprocess the documents, including the query string
    corpus = [preprocess(document) for document in documents]
    query = preprocess(query_string)
    '''
    Then we create a similarity matrix, that contains the similarity between each pair of words, 
    weighted using the term frequency:
    '''
    # Load the model: this is a big file, can take a while to download and open
    glove = api.load("glove-wiki-gigaword-50")
    similarity_index = WordEmbeddingSimilarityIndex(glove)

    # Build the term dictionary, TF-idf model
    print("Everything has been initialized")
    dictionary = Dictionary(corpus + [query])
    tfidf = TfidfModel(dictionary=dictionary)

    # Create the term similarity matrix.
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                   dictionary, tfidf)
    '''
    Finally, we calculate the soft cosine similarity between the query and each of the documents. 
    Unlike the regular cosine similarity (which would return zero for vectors with no overlapping terms), 
    the soft cosine similarity considers word similarity as well.
    '''
    # Compute Soft Cosine Measure between the query and the documents.
    # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
    query_tf = tfidf[dictionary.doc2bow(query)]

    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in corpus]],
        similarity_matrix)

    doc_similarity_scores = index[query_tf]

    # Output the sorted similarity scores and documents
    sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
    count = 0
    print("Mentee values: {}".format(query_string))
    for idx in sorted_indexes:
        count += 1
        if count > 10:
            break
        print(
            f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
    return doc_similarity_scores
Example #13
def find_similarity(search_w, corpus_w):
    rv = {}
    rv['result'] = []
    bmatch = False
    #Tokenize the sentence into words
    #search_tokens = [word for word in search_w.split()]
    #corpus_tokens = [word for word in corpus_w.split()]
    search_tokens = search_w
    corpus_tokens = corpus_w

    #print(search_tokens)
    #print(corpus_tokens)
    #print("-----")
    #cp = []
    #for c in corpus_tokens:
    #    cp.append([c])

    #corpus_tokens = cp
    search_tokens = [search_w]
    print(corpus_tokens)
    print(search_tokens)
    # Prepare a dictionary and a corpus.
    #documents = [svc_tokens, specs_tokens]
    dictionary = corpora.Dictionary(corpus_tokens)

    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus_tokens]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus,
                                        similarity_matrix,
                                        num_best=10)

    # Compute soft cosine similarity
    for t in search_tokens:
        #print("looking for %s" %(t.split()))
        for e in t.split(','):
            match = {}
            e = e.strip()
            lkup = [e]
            try:
                result = docsim_index[dictionary.doc2bow(lkup)]
            except:
                result = [(0, 0)]
            print(f"looking for {lkup}, result {result}")
            if len(result) and result[0][1] > 0.5:
                match['word'] = e.split()
                match['value'] = str(result)
                rv['result'].append(match)
                bmatch = True
    #print(docsim_index[dictionary.doc2bow(search_tokens)])
    return rv if bmatch else None
Example #14
def compute_msg_dist_matrix(data):
    lst_notifications = data
    # print(lst_notifications)
    data_2 = [d.split() for d in lst_notifications]
    model = Word2Vec(data_2, min_count=1)  # train word-vectors on the tokenized messages
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    dictionary = Dictionary(data_2)
    bow_corpus = [dictionary.doc2bow(document) for document in data_2]
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix)
    # iterating over the index yields one row of document-to-document similarities
    soft_cosine_distance_matrix = 1 - np.array([row for row in docsim_index])
    return soft_cosine_distance_matrix
Example #15
    def computeDocumentSimilarityIndex(self, corpus):
        """
            Build the soft cosine similarity index for the model

            Args:
                corpus: dictionary used to build the term similarity matrix

            Returns:
                SoftCosineSimilarity instance
        """

        if self.wordEmbedding is None:
            self.wordEmbedding = WordEmbeddingSimilarityIndex(self.model)

        # create similarity matrix, update flags
        simMatrix = SparseTermSimilarityMatrix(self.wordEmbedding, corpus)
        return SoftCosineSimilarity([x.sentence for x in self.itemScores], simMatrix)
Example #16
def softcos(defns, return_centers=False):
    keys = list(defns.keys())
    if len(defns) == 1:
        return unclusterable_default(keys, return_centers=return_centers)
    dictionary, bow_corpus = mk_dictionary_bow_corpus(defns.values())
    if len(dictionary) == 0:
        return unclusterable_default(keys, return_centers=return_centers)

    similarity_index = WordEmbeddingSimilarityIndex(vecs.get_en())
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                   dictionary)
    index = SoftCosineSimilarity(bow_corpus, similarity_matrix)
    affinities = np.zeros((len(defns), len(defns)))

    for row, similarities in enumerate(index):
        affinities[row] = similarities

    return graph_clust_grouped(affinities, keys, return_centers)
Example #17
    def calculate_distance(self, query_string, documents):
        def preprocess(doc):
            # Tokenize, clean up input document string
            doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
            doc = sub(r'<[^<>]+(>|$)', " ", doc)
            doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
            doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
            return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in STOPWORDS]

        # Preprocess the documents, including the query string
        corpus = [preprocess(document) for document in documents]
        query = preprocess(query_string)


        # Load the model: this is a big file, can take a while to download and open
        # (glove is assumed to be loaded beforehand, e.g. glove = api.load("glove-wiki-gigaword-50") as in Example #12)
        similarity_index = WordEmbeddingSimilarityIndex(glove)

        # Build the term dictionary, TF-idf model

        dictionary = Dictionary(corpus+[query])
        tfidf = TfidfModel(dictionary=dictionary)

        # Create the term similarity matrix.  
        similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

        query_tf = tfidf[dictionary.doc2bow(query)]

        index = SoftCosineSimilarity(
                    tfidf[[dictionary.doc2bow(document) for document in corpus]],
                    similarity_matrix)

        doc_similarity_scores = index[query_tf]

        # Output the sorted similarity scores and documents
        sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
        if len(documents) > 1:
            for idx in sorted_indexes:
                print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
        # print(doc_similarity_scores)
        return doc_similarity_scores
Example #18
    def get_score(self, query_string):
        if isinstance(query_string, list):
            query_string = " ".join(query_string)

        query = preprocess(query_string)
        print("Everything has been initialized")
        dictionary = Dictionary(self.corpus + [query])
        tfidf = TfidfModel(dictionary=dictionary)

        # Create the term similarity matrix.
        similarity_matrix = SparseTermSimilarityMatrix(self.similarity_index,
                                                       dictionary, tfidf)
        '''
        Finally, we calculate the soft cosine similarity between the query and each of the documents. 
        Unlike the regular cosine similarity (which would return zero for vectors with no overlapping terms), 
        the soft cosine similarity considers word similarity as well.
        '''
        # Compute Soft Cosine Measure between the query and the documents.
        # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
        query_tf = tfidf[dictionary.doc2bow(query)]

        index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in self.corpus]],
            similarity_matrix)

        doc_similarity_scores = index[query_tf]

        # Output the sorted similarity scores and documents
        sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
        count = 0
        print("Mentee values: {}".format(query_string))
        for idx in sorted_indexes:
            count += 1
            if count > 10:
                break
            # print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {self.documents[idx]}')

        return doc_similarity_scores
Example #19
def create_model(storage_client, json_in, video_id):
    """ Create soft cosine similarity model

    Keyword arguments:
    storage_client -- a Storage instance
    json_in        -- json returned from the YouTube Captions API
    video_id       -- the Youtube video_id

    Returns:
    - A Soft Cosine Measure model
    - The dictionary of terms computed
    """

    video_id = video_id.lower()

    # check if bucket exists
    if blob_exists(storage_client, video_id):
        # retrieve blob from bucket
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(video_id)  # The blob's name is the video ID

        # download the storage pickle as a binary string
        blob_str = blob.download_as_string()
        dictionary, index = pickle.loads(blob_str)
        return dictionary, index

    # download stop_words and glove
    stop_words, glove = download_resources()

    # Create Glove similarity Index
    similarity_index = WordEmbeddingSimilarityIndex(glove)

    # parse json captions into document form
    documents = processInput(json_in)

    # create a corpus from documents
    corpus = [preprocess(document, stop_words) for document in documents]

    # create dictionary from documents
    dictionary = Dictionary(corpus)
    tfidf = TfidfModel(dictionary=dictionary)

    # create a term similarity matrix
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                   dictionary, tfidf)

    # Compute Soft Cosine Measure between documents
    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in corpus]],
        similarity_matrix)

    # save index and dictionary
    storage_client = storage.Client()

    # create a binary pickle representation
    bin_tuple = pickle.dumps((dictionary, index))
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(video_id)

    # save to storage
    blob.upload_from_string(bin_tuple)

    if debug_messages:
        print("Binary model with name {} and dictionary uploaded.".format(
            video_id))

    return dictionary, index
Example #20
def main(chosen_model_no=0, num_items_displayed=10, use_spacy=False, use_soft_cosine_similarity=False,
         num_topics=None, no_below=5, no_above=0.5, normalize_vectors=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if num_topics is None:
        num_topics = 100

    possible_model_names = [
        'tf_idf',  # 0
        'lsi_bow', 'lsi_tf_idf',  # 1, 2
        'rp_bow', 'rp_tf_idf',  # 3, 4
        'lda_bow', 'lda_tf_idf',  # 5, 6
        'hdp_bow', 'hdp_tf_idf',  # 7, 8
        'word2vec',  # 9
    ]
    chosen_model_name = possible_model_names[chosen_model_no]
    print(chosen_model_name)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    nlp = spacy.load('en_core_web_lg')

    documents = list(steam_tokens.values())

    dct = Dictionary(documents)
    print(len(dct))
    dct.filter_extremes(no_below=no_below, no_above=no_above)
    print(len(dct))

    corpus = [dct.doc2bow(doc) for doc in documents]

    # Pre-processing

    pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf')

    tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors)

    if pre_process_corpus_with_tf_idf:
        # Caveat: the leading underscore is important. Do not use this pre-processing if the chosen model is Tf-Idf!
        print('Corpus as Tf-Idf')
        pre_processed_corpus = tfidf_model[corpus]
    else:
        print('Corpus as Bag-of-Words')
        pre_processed_corpus = corpus

    # Model

    model = None
    wv = None
    index2word_set = None

    if chosen_model_name == 'tf_idf':
        print('Term Frequency * Inverse Document Frequency (Tf-Idf)')
        model = tfidf_model

    elif chosen_model_name.startswith('lsi'):
        print('Latent Semantic Indexing (LSI/LSA)')
        model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('rp'):
        print('Random Projections (RP)')
        model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('lda'):
        print('Latent Dirichlet Allocation (LDA)')
        model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('hdp'):
        print('Hierarchical Dirichlet Process (HDP)')
        model = HdpModel(pre_processed_corpus, id2word=dct)

    elif chosen_model_name == 'word2vec':
        use_a_lot_of_ram = False

        if use_a_lot_of_ram:
            model = None

            print('Loading Word2Vec based on Google News')
            # Warning: this takes a lot of time and uses a ton of RAM!
            wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        else:
            if use_spacy:
                print('Using Word2Vec with spaCy')
            else:
                print('Training Word2Vec')

                model = Word2Vec(documents)

                wv = model.wv

        if not use_spacy:
            wv.init_sims(replace=normalize_vectors)

            index2word_set = set(wv.index2word)

    else:
        print('No model specified.')
        model = None

    if chosen_model_name != 'word2vec':
        if not use_soft_cosine_similarity:
            index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct))
        else:
            w2v_model = Word2Vec(documents)
            similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
            similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model, nonzero_limit=100)
            index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix)
    else:
        index = None

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    matches_as_app_ids = []

    for query_count, query_app_id in enumerate(query_app_ids):
        print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids),
                                                    query_app_id, get_app_name(query_app_id, game_names)))

        query = steam_tokens[str(query_app_id)]

        if use_spacy:
            spacy_query = Doc(nlp.vocab, query)
        else:
            spacy_query = None

        if chosen_model_name != 'word2vec':
            vec_bow = dct.doc2bow(query)
            if pre_process_corpus_with_tf_idf:
                pre_processed_vec = tfidf_model[vec_bow]
            else:
                pre_processed_vec = vec_bow
            vec_lsi = model[pre_processed_vec]
            sims = index[vec_lsi]

            if use_soft_cosine_similarity:
                sims = enumerate(sims)

            similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims]
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        else:
            if use_spacy:
                similarity_scores = {}
                for app_id in steam_tokens:
                    reference_sentence = steam_tokens[app_id]
                    spacy_reference = Doc(nlp.vocab, reference_sentence)
                    similarity_scores[app_id] = spacy_query.similarity(spacy_reference)
            else:
                query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set)

                similarity_scores = {}

                counter = 0
                num_games = len(steam_tokens)

                for app_id in steam_tokens:
                    counter += 1

                    if (counter % 1000) == 0:
                        print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

                    reference_sentence = steam_tokens[app_id]
                    reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set)

                    try:
                        similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence)
                    except ZeroDivisionError:
                        similarity_scores[app_id] = 0

        similar_app_ids = print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed,
                                                       verbose=False)
        matches_as_app_ids.append(similar_app_ids)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  only_print_banners=True)

    return
Example #21
print(now(), 'loaded speech data')
np.seterr(divide='ignore', invalid='ignore')

dictionary = Dictionary(documents)
tfidf = TfidfModel(dictionary=dictionary)
wv = KeyedVectors.load("word2vec_100_3_polish.bin")

print(now(), 'loaded model')

similarity_index = WordEmbeddingSimilarityIndex(wv)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)
print(now(), 'created similarity matrix')

index = SoftCosineSimilarity(
    [dictionary.doc2bow(document) for document in documents],
    similarity_matrix, num_best=10)  # num_best so that queries return (document, score) pairs as used below

print(now(), 'created index')
# index.save('soft_cosine.index')

# index = SoftCosineSimilarity.load('soft_cosine.index')
while True:
    try:
        query = input("Query: ").lower().split()
        # query = tfidf[dictionary.doc2bow(query)]
        similarities = index[dictionary.doc2bow(query)]
        result_list = [partie[i] for i in [a[0] for a in similarities]]
        score_list = [a[1] for a in similarities]
        results = [' '.join(each) for each in result_list]
        for score, result in zip(score_list, results):
            # the original snippet is truncated here; printing the hits is an assumed completion
            print(score, result)
    except (EOFError, KeyboardInterrupt):
        break
Example #22
    def fit(
        self,
        path_to_model,
        source,
        target,
        sourcetext="text",
        sourcedate="publication_date",
        targettext="text",
        targetdate="publication_date",
        keyword_source=None,
        keyword_target=None,
        keyword_source_must=False,
        keyword_target_must=False,
        condition_source=None,
        condition_target=None,
        days_before=None,
        days_after=None,
        merge_weekend=False,
        threshold=None,
        from_time=None,
        to_time=None,
        to_csv=False,
        destination="comparisons",
        to_pajek=False,
        filter_above=0.5,
        filter_below=5,
    ):
        """
        path_to_model = Supply a pre-trained word2vec model. Information on how to train such a model
        can be found here: https://rare-technologies.com/word2vec-tutorial/
        source/target = doctype of source/target (can also be a list of multiple doctypes)

        sourcetext/targettext = field where text of target/source can be found (defaults to 'text')
        sourcedate/targetdate = field where date of source/target can be found (defaults to 'publication_date')
        keyword_source/_target = optional: specify keywords that need to be present in the textfield; list or string (lowercase)
        keyword_source/_target_must = optional: In case of a list, do all keywords need to appear in the text (logical AND) or does at least one of the words need to be in the text (logical OR). Defaults to False (logical OR)
        condition_source/target = optional: supply the field and its value as a dict as a condition for analysis, e.g. {'topic':1} (defaults to None)
        days_before = days target is before source (e.g. 2); days_after = days target is after source (e.g. 2) -> either both or none should be supplied. Additionally, merge_weekend = True will merge articles published on Saturday and Sunday. 
        threshold = threshold to determine at which point similarity is sufficient; if supplied only the rows who pass it are included in the dataset
        from_time, to_time = optional: specifying a date range to filter source and target articles. Supply the date in the yyyy-MM-dd format.
        to_csv = if True save the resulting data in a csv file - otherwise a pandas dataframe is returned
        destination = optional: where should the resulting datasets be saved? (defaults to 'comparisons' folder)
        to_pajek = if True save - in addition to csv/pickle - the result (source, target and similarity score) as pajek file to be used in the Infomap method (defaults to False) - not available in combination with days_before/days_after parameters
        filter_above = Words occurring in more than this fraction of all documents will be filtered
        filter_below = Words occurring in less than this absolute number of documents will be filtered
        """
        now = time.localtime()

        logger.info(
            "The results of the similarity analysis could be inflated when not using the recommended text processing steps (stopword removal, punctuation removal, stemming) beforehand"
        )

        # Load the pretrained model (different ways depending on how the model was saved)
        logger.info("Loading word embeddings...")
        try:
            softcosine_model = gensim.models.Word2Vec.load(path_to_model)
        except:
            softcosine_model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(
                path_to_model, binary=True)

        logger.info("Done")

        # Construct source and target queries for elasticsearch
        if isinstance(source, list):  # multiple doctypes
            source_query = {
                "query": {
                    "bool": {
                        "filter": {
                            "bool": {
                                "must": [{
                                    "terms": {
                                        "doctype": source
                                    }
                                }]
                            }
                        }
                    }
                }
            }
        elif isinstance(source, str):  # one doctype
            source_query = {
                "query": {
                    "bool": {
                        "filter": {
                            "bool": {
                                "must": [{
                                    "term": {
                                        "doctype": source
                                    }
                                }]
                            }
                        }
                    }
                }
            }

        if isinstance(target, list):  # multiple doctypes
            target_query = {
                "query": {
                    "bool": {
                        "filter": {
                            "bool": {
                                "must": [{
                                    "terms": {
                                        "doctype": target
                                    }
                                }]
                            }
                        }
                    }
                }
            }
        elif isinstance(target, str):  # one doctype
            target_query = {
                "query": {
                    "bool": {
                        "filter": {
                            "bool": {
                                "must": [{
                                    "term": {
                                        "doctype": target
                                    }
                                }]
                            }
                        }
                    }
                }
            }

        # Change query if date range was specified
        source_range = {"range": {sourcedate: {}}}
        target_range = {"range": {targetdate: {}}}
        if from_time:
            source_range["range"][sourcedate].update({"gte": from_time})
            target_range["range"][targetdate].update({"gte": from_time})
        if to_time:
            source_range["range"][sourcedate].update({"lte": to_time})
            target_range["range"][targetdate].update({"lte": to_time})
        if from_time or to_time:
            source_query["query"]["bool"]["filter"]["bool"]["must"].append(
                source_range)
            target_query["query"]["bool"]["filter"]["bool"]["must"].append(
                target_range)

        # Change query if keywords were specified
        if isinstance(keyword_source, str) == True:
            source_query["query"]["bool"]["filter"]["bool"]["must"].append(
                {"term": {
                    sourcetext: keyword_source
                }})
        elif isinstance(keyword_source, list) == True:
            if keyword_source_must == True:
                for item in keyword_source:
                    source_query["query"]["bool"]["filter"]["bool"][
                        "must"].append({"term": {
                            sourcetext: item
                        }})
            elif keyword_source_must == False:
                source_query["query"]["bool"]["should"] = []
                source_query["query"]["bool"]["minimum_should_match"] = 1
                for item in keyword_source:
                    source_query["query"]["bool"]["should"].append(
                        {"term": {
                            sourcetext: item
                        }})
        if isinstance(keyword_target, str) == True:
            target_query["query"]["bool"]["filter"]["bool"]["must"].append(
                {"term": {
                    targettext: keyword_target
                }})
        elif isinstance(keyword_target, list) == True:
            if keyword_target_must == True:
                for item in keyword_target:
                    target_query["query"]["bool"]["filter"]["bool"][
                        "must"].append({"term": {
                            targettext: item
                        }})
            elif keyword_target_must == False:
                target_query["query"]["bool"]["should"] = []
                target_query["query"]["bool"]["minimum_should_match"] = 1
                for item in keyword_target:
                    target_query["query"]["bool"]["should"].append(
                        {"term": {
                            targettext: item
                        }})

        # Change query if condition_target or condition_source is specified
        if isinstance(condition_target, dict) == True:
            target_query["query"]["bool"]["filter"]["bool"]["must"].append(
                {"match": condition_target})
        if isinstance(condition_source, dict) == True:
            source_query["query"]["bool"]["filter"]["bool"]["must"].append(
                {"match": condition_source})

        # Retrieve source and target articles as generators
        source_query = scroll_query(source_query)
        target_query = scroll_query(target_query)

        # Make generators into lists and filter out those who do not have the specified keys (preventing KeyError)
        target_query = [
            a for a in target_query if targettext in a["_source"].keys()
            and targetdate in a["_source"].keys()
        ]
        source_query = [
            a for a in source_query if sourcetext in a["_source"].keys()
            and sourcedate in a["_source"].keys()
        ]

        # Target and source texts (split)
        target_text = []
        for doc in target_query:
            target_text.append(doc["_source"][targettext].split())
        source_text = []
        for doc in source_query:
            source_text.append(doc["_source"][sourcetext].split())

        logger.info("Preparing dictionary")
        dictionary = Dictionary(source_text + target_text)
        logger.info(
            "Removing all tokens that occur in less than {} documents or in more than {:.1f}% or all documents from dictionary"
            .format(filter_below, filter_above * 100))
        dictionary.filter_extremes(no_below=filter_below,
                                   no_above=filter_above)
        logger.info("Preparing tfidf model")
        tfidf = TfidfModel(dictionary=dictionary)
        logger.info("Preparing soft cosine similarity matrix")
        similarity_matrix = softcosine_model.wv.similarity_matrix(
            dictionary, tfidf)

        # extract additional information from sources
        source_dates = [doc["_source"][sourcedate] for doc in source_query]
        source_ids = [doc["_id"] for doc in source_query]
        source_doctype = [doc["_source"]["doctype"] for doc in source_query]
        source_dict = dict(zip(source_ids, source_dates))
        source_dict2 = dict(zip(source_ids, source_doctype))

        # extract information from targets
        target_ids = [doc["_id"] for doc in target_query]
        target_dates = [doc["_source"][targetdate] for doc in target_query]
        target_dict = dict(zip(target_ids, target_dates))
        target_doctype = [doc["_source"]["doctype"] for doc in target_query]
        target_dict2 = dict(zip(target_ids, target_doctype))

        # If specified, comparisons compare docs within sliding date window
        if days_before != None or days_after != None:
            logger.info("Performing sliding window comparisons...")
            # merge queries including identifier key
            for i in source_query:
                i.update({"identifier": "source"})
            for i in target_query:
                i.update({"identifier": "target"})
            source_query.extend(target_query)

            # sourcedate and targetdate need to be the same key (bc everything is done for sourcedate)
            if targetdate is not sourcedate:
                logger.info(
                    "Make sure that sourcedate and targetdate are the same key."
                )

            else:
                # convert dates into datetime objects
                for a in source_query:
                    if isinstance(a["_source"][sourcedate],
                                  datetime.date) == True:
                        pass  # is already datetime object
                    else:
                        a["_source"][sourcedate] = [
                            int(i)
                            for i in a["_source"][sourcedate][:10].split("-")
                        ]
                        a["_source"][sourcedate] = datetime.date(
                            a["_source"][sourcedate][0],
                            a["_source"][sourcedate][1],
                            a["_source"][sourcedate][2],
                        )

                # sort query by date
                source_query.sort(key=lambda item: item["_source"][sourcedate])

                # create list of all possible dates
                d1 = source_query[0]["_source"][sourcedate]
                d2 = source_query[-1]["_source"][sourcedate]
                delta = d2 - d1
                date_list = []
                for i in range(delta.days + 1):
                    date_list.append(d1 + datetime.timedelta(i))

                # create list of docs grouped by date (dates without docs are empty lists)
                grouped_query = []
                for d in date_list:
                    dt = []
                    for a in source_query:
                        if a["_source"][sourcedate] == d:
                            dt.append(a)
                    grouped_query.append(dt)
                # Optional: merges saturday and sunday into one weekend group
                # Checks whether group is Sunday, then merge together with previous (saturday) group.
                if merge_weekend == True:
                    grouped_query_new = []
                    for group in grouped_query:
                        # if empty, append empty list
                        if not group:
                            grouped_query_new.append([])
                        # if group is sunday, extend previous (saturday) list, except when it is the first day in the data.
                        elif group[0]["_source"][sourcedate].weekday() == 6:
                            if not grouped_query_new:
                                grouped_query_new.append(group)
                            else:
                                grouped_query_new[-1].extend(group)
                        # for all other weekdays, append new list
                        else:
                            grouped_query_new.append(group)
                    grouped_query = grouped_query_new

                # Sliding window starts here... How it works:
                # A sliding window cuts the documents into groups that should be compared to
                # each other based on their publication dates. A list of source documents
                # published on the reference date is created. For each of the target dates in
                # the window, the source list is compared to the targets, the information is
                # put in a dataframe, and the dataframe is added to a list. This process is
                # repeated for each window. We end up with a list of dataframes, which are
                # eventually merged together into one dataframe.
                # (A minimal sketch of the window() helper used below is given after this example.)

                len_window = days_before + days_after + 1
                source_pos = (
                    days_before
                )  # source position is equivalent to days_before (e.g. 2 days before, means 3rd day is source with the index position [2])
                n_window = 0

                for e in tqdm(self.window(grouped_query, n=len_window)):
                    n_window += 1
                    df_window = []

                    source_texts = []
                    source_ids = []
                    if not "source" in [
                            l2["identifier"] for l2 in e[source_pos]
                    ]:
                        pass

                    else:
                        for doc in e[source_pos]:
                            try:
                                if doc["identifier"] == "source":
                                    # create sourcetext list to compare against
                                    source_texts.append(
                                        doc["_source"][sourcetext].split())
                                    # extract additional information
                                    source_ids.append(doc["_id"])
                            except:
                                logger.error(
                                    "This does not seem to be a valid document"
                                )
                                print(doc)

                        # create index of source texts
                        query = tfidf[[
                            dictionary.doc2bow(d) for d in source_texts
                        ]]

                        # iterate through targets
                        for d in e:
                            target_texts = []
                            target_ids = []

                            for doc in d:
                                try:
                                    if doc["identifier"] == "target":
                                        target_texts.append(
                                            doc["_source"][targettext].split())
                                        # extract additional information
                                        target_ids.append(doc["_id"])
                                except:
                                    logger.error(
                                        "This does not seem to be a valid document"
                                    )
                                    print(doc)
                            # do comparison
                            if len(target_ids) == 0:
                                logger.warning(
                                    "Empty list of target ids. Skipping comparisons."
                                )
                                continue
                            index = SoftCosineSimilarity(
                                tfidf[[
                                    dictionary.doc2bow(d) for d in target_texts
                                ]],
                                similarity_matrix,
                            )
                            try:
                                sims = index[query]
                            except:
                                logger.warning(
                                    "There was a problem calculating the similarities, skipping this one"
                                )
                                print(target_ids)
                                sims = []
                            # make dataframe
                            df_temp = (pd.DataFrame(
                                sims, columns=target_ids,
                                index=source_ids).stack().reset_index())
                            df_window.append(df_temp)

                        df = pd.concat(df_window, ignore_index=True)
                        df.columns = ["source", "target", "similarity"]
                        df["source_date"] = df["source"].map(source_dict)
                        df["target_date"] = df["target"].map(target_dict)
                        df["source_doctype"] = df["source"].map(source_dict2)
                        df["target_doctype"] = df["target"].map(target_dict2)

                        # Optional: if threshold is specified
                        if threshold:
                            df = df.loc[df["similarity"] >= threshold]

                        # Make exports folder if it does not exist yet
                        if not os.path.exists(destination):
                            os.mkdir(destination)

                        # Optional: save as csv file
                        if to_csv == True:
                            df.to_csv(
                                os.path.join(
                                    destination,
                                    r"INCA_softcosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{n_window}.csv"
                                    .format(
                                        now=now,
                                        target=target,
                                        source=source,
                                        n_window=n_window,
                                    ),
                                ))
                            # Otherwise: save as pickle file
                        else:
                            df.to_pickle(
                                os.path.join(
                                    destination,
                                    r"INCA_softcosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{n_window}.pkl"
                                    .format(
                                        now=now,
                                        target=target,
                                        source=source,
                                        n_window=n_window,
                                    ),
                                ))

                # Optional: save as pajek file not for days_before/days_after
                if to_pajek == True:
                    logger.info(
                        "Does not save as Pajek file with days_before/days_after because of the size of the files."
                    )

        # Same procedure as above, but without specifying a time frame (thus: comparing all sources to all targets)
        else:

            # Create index out of target texts
            logger.info("Preparing the index out of target texts...")
            index = SoftCosineSimilarity(
                tfidf[[dictionary.doc2bow(d) for d in target_text]],
                similarity_matrix)

            # Retrieve source IDs and make generator to compute similarities between each source and the index
            logger.info("Preparing the query out of source texts...")
            query = tfidf[[dictionary.doc2bow(d) for d in source_text]]
            query_generator = (item for item in query)

            # Retrieve similarities
            # Makes a separate dataframe for each source doc and saves this.
            logger.info("Starting comparisons...")

            i = 0
            s_ids = 0
            for doc in query_generator:
                i += 1  # count each round of comparisons
                # if doc is empty (which may happen due to pruning)
                # then we skip this comparison
                if len(doc) == 0:
                    s_ids += 1
                    logger.info("Skipped one empty document")
                    continue
                # make comparison
                sims = index[doc]
                # make dataframe
                df = pd.DataFrame([sims]).transpose()
                logger.debug("Created dataframe of shape {}".format(df.shape))
                logger.debug("Length of target_id list: {}".format(
                    len(target_ids)))
                df["target"] = target_ids
                df["source"] = source_ids[s_ids]
                df.columns = ["similarity", "target", "source"]
                df["source_date"] = df["source"].map(source_dict)
                df["target_date"] = df["target"].map(target_dict)
                df["source_doctype"] = df["source"].map(source_dict2)
                df["target_doctype"] = df["target"].map(target_dict2)
                df = df.set_index("source")

                # Optional: if threshold is specified
                if threshold:
                    df = df.loc[df["similarity"] >= threshold]

                # Make exports folder if it does not exist yet
                if not "comparisons" in os.listdir("."):
                    os.mkdir("comparisons")

                # Optional: save as csv file
                if to_csv:
                    df.to_csv(
                        os.path.join(
                            destination,
                            r"INCA_softcosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.csv"
                            .format(now=now, target=target, source=source,
                                    i=i),
                        ))
                # Otherwise: save as pickle file
                else:
                    df.to_pickle(
                        os.path.join(
                            destination,
                            r"INCA_softcosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.pkl"
                            .format(now=now, target=target, source=source,
                                    i=i),
                        ))

                # Optional: additionally save as pajek file
                if to_pajek:
                    # convert similarity values to str (required by the Pajek format)
                    df["similarity"] = df["similarity"].apply(str)
                    # rename the similarity column to 'weight' to facilitate later analysis
                    df.rename({"similarity": "weight"}, axis=1, inplace=True)
                    # build the graph with nodes and edge weights from the dataframe
                    G = nx.from_pandas_edgelist(df,
                                                source="source",
                                                target="target",
                                                edge_attr="weight")
                    # write to pajek
                    nx.write_pajek(
                        G,
                        os.path.join(
                            destination,
                            r"INCA_softcosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.net"
                            .format(now=now, target=target, source=source,
                                    i=i),
                        ),
                    )

                s_ids += 1  # move one doc down in source_ids

                logger.info("Done with source " + str(i) + " out of " +
                            str(len(source_text)))
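
The loop above writes one dataframe per source document to `destination`. A minimal sketch, assuming pickle output and the file-name pattern used above, of how those per-source files could be read back and combined for analysis (the helper name is hypothetical):

import glob
import os

import pandas as pd


def load_comparisons(destination, pattern="INCA_softcosine_*.pkl"):
    # collect every per-source pickle written by the comparison loop above
    paths = sorted(glob.glob(os.path.join(destination, pattern)))
    frames = [pd.read_pickle(path) for path in paths]
    # concatenate into one long dataframe (empty dataframe if nothing was found)
    return pd.concat(frames) if frames else pd.DataFrame()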
Example #23
0
# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")
similarity_index = WordEmbeddingSimilarityIndex(glove)

# Build the term dictionary, TF-idf model
dictionary = Dictionary(corpus + [query])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary,
                                               tfidf)

# Compute Soft Cosine Measure between the query and the documents.
# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
query_tf = tfidf[dictionary.doc2bow(query)]

index = SoftCosineSimilarity(
    tfidf[[dictionary.doc2bow(document) for document in corpus]],
    similarity_matrix)

doc_similarity_scores = index[query_tf]

# Output the sorted similarity scores and documents
sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
results = []
if __name__ == "__main__":
    for idx in sorted_indexes:
        print(
            f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
    # results.append(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
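
The snippet above assumes that `documents` (raw strings), `corpus` (their tokenized form), and `query` (a tokenized query), as well as the numpy and gensim imports, are defined elsewhere. A minimal sketch of those assumed inputs, following the import layout used in the other snippets here:

# Assumed inputs and imports for the snippet above (illustrative only).
import gensim.downloader as api
import numpy as np
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, WordEmbeddingSimilarityIndex
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix

documents = [
    "the cat sat on the mat",
    "a dog barked at the mailman",
    "cats and dogs are common pets",
]
corpus = [doc.lower().split() for doc in documents]  # naive whitespace tokenization
query = "pet cat".lower().split()                    # tokenized query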
Example #24
0
termsim_index = WordEmbeddingSimilarityIndex(model.wv)

# Create Corpus List
corpus_list = []
for data in dataset:
    docs = ""
    for sentence in data['gejala']:
        docs += " " + sentence
    corpus_list.append(docs)

# Create token list for all document corpus
corpus_list_token = [preprocess(doc) for doc in corpus_list]

dictionary = Dictionary(corpus_list_token)
bow_corpus = [dictionary.doc2bow(document) for document in corpus_list_token]

# Create Term similarity matrix
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

# Compute Soft Cosine Similarity
docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)

text = "Diare (10 hingga 12 kali per hari) Diare disertai darah.Kram pada perut.Buang air besar yang kental.Gas dalam perut.Gejala yang umum seperti demam, sakit punggung, dan lelah."
predict = predict_decease(text, docsim_index, dictionary)
print(predict)

# sentence_end = re.compile(r'''[.!?]['"]?\s{1,2}(?=)''')
# input_sentences = re.split(r"\.|\?|\!",text)

# print(docs_similarity(input_sentences, [sentence for sentence in predict['data']['gejala']], model))
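
The `predict_decease` helper is not shown in this snippet. A minimal, hypothetical sketch of what it might look like, assuming it reuses the `preprocess` function and `dataset` from above and returns the best-matching entry of the soft cosine index:

# Hypothetical sketch of the predict_decease helper used above.
def predict_decease(text, docsim_index, dictionary):
    query = dictionary.doc2bow(preprocess(text))
    # docsim_index was built with num_best=10, so this returns (doc_id, score) pairs
    hits = docsim_index[query]
    if not hits:
        return None
    best_id, best_score = hits[0]
    return {"score": float(best_score), "data": dataset[best_id]}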
Example #25
0
def main():
    tfidf = None
    word2vec = None
    similarityMatrix = None
    browndict = {}
    corporadict = None

    choice = ""
    while choice != "exit":
        choice = ""
        while choice not in ["tfidf", "word2vec", "exit"]:
            choice = input(
                "TF-IDF or Word2Vec? [TFIDF, Word2Vec, Exit]\n>").lower()

        if choice == "exit":
            break

        catType = ""
        while catType not in ["within", "between", "return"]:
            catType = input(
                "Within or between clusters? [Within, Between, Return]\n>"
            ).lower()

        if catType == "return":
            break

        # get all of the words for each document per category
        texts = []
        if catType == "within":
            for c in brown.categories():
                words = NormalizeWords(brown.words(categories=c))
                texts.append(words)
                # build a dictionary for me to use later
                browndict[c] = words
        elif catType == "between":
            for c in brown.categories():
                words = NormalizeWords(brown.words(categories=c))
                texts.append(words[:len(words) // 2])
                texts.append(words[len(words) // 2:])
                # build a dictionary for me to use later
                browndict[c + "1/2"] = words[:len(words) // 2]
                browndict[c + "2/2"] = words[len(words) // 2:]

        # create the corpora dictionary built from gensim
        corporadict = corpora.Dictionary(texts)
        # create a corpus for the training
        corpus = []
        for line in texts:
            corpus.append(corporadict.doc2bow(line))

        if choice == "tfidf":
            # create the tfidf model from our built corpus
            tfidf = TfidfModel(corpus=corpus)

            # build the similarity matrix
            similarityMatrix = MatrixSimilarity(corpus,
                                                num_features=len(corporadict))
        elif choice == "word2vec":
            word2vec = Word2Vec(brown.sents())

            # build term similarity matrix from our model's word vectors
            termSimilarityIndex = WordEmbeddingSimilarityIndex(word2vec.wv)

            # build sparse similarity matrix
            sparseSimiliarityMatrix = SparseTermSimilarityMatrix(
                termSimilarityIndex, corporadict)

            # build similarity word-vector
            WV_SimilarityMatrix = SoftCosineSimilarity(
                corpus, sparseSimiliarityMatrix)

        maxes = {}
        if choice == "tfidf":
            # Print out the code
            keys = list(browndict.keys())
            for i in range(len(keys) - 1):
                # Convert to a bag of words and to a tfidf vector, then query it.
                query_bow = corporadict.doc2bow(browndict[keys[i]])
                query_tfidf = tfidf[query_bow]

                # Get the similarity of every cluster
                query_similarity = similarityMatrix[query_tfidf]
                for j in range(i + 1, len(query_similarity)):
                    sim = query_similarity[j]
                    print(keys[i], "and", keys[j], "have a similarity of:",
                          sim)
                print("")
        elif choice == "word2vec":
            keys = list(browndict.keys())
            for i in range(len(keys) - 1):
                # Convert to a bag of words and query it
                query_bow = corporadict.doc2bow(browndict[keys[i]])

                # Get the similarity of every cluster
                query_similarity = WV_SimilarityMatrix[query_bow]
                for j in range(i + 1, len(query_similarity)):
                    sim = query_similarity[j]
                    print(keys[i], "and", keys[j], "have a similarity of:",
                          sim)
                print("")
    # Read the data into a pandas dataframe
    df = pd.DataFrame([text.data]).T
    df['text'] = df[0]
    df = df[df['text'].map(type) == str]
    df.dropna(axis=0, inplace=True, subset=['text'])
    df = df.sample(frac=1.0)
    df.reset_index(drop=True, inplace=True)
    corpus = df['text'].apply(apply_all)  # preprocessed tokenized corpus
    bow = [dictionary.doc2bow(doc)
           for doc in corpus]  # transform into a gen bow
    bow = [i for i in bow if len(i) > 0]  # remove empty lists
    return bow


train_ath = clean(cat='soc.religion.christian', subset='test')  # reference set
docsim_index = SoftCosineSimilarity(
    train_ath, similarity_matrix)  # SCM with ref set for later use

atheism_test = clean('soc.religion.christian',
                     subset='test')  # query set 1: similar to the reference set
windows_test = clean('comp.windows.x', subset='test')  # query set 2: different from the reference set

SCM_ath_ath = docsim_index[atheism_test]  # ref vs q1
SCM_ath_win = docsim_index[windows_test]  # ref vs q2

ath_ath = SCM_ath_ath.flatten()
ath_win = SCM_ath_win.flatten()
fig = plt.figure(1, figsize=(10, 5))
sns.distplot(ath_win, bins=50, color='red',
             label='Christian-WindowsX')  # christian vs. windows (different topics)
sns.distplot(ath_ath, bins=50, color='green',
             label='Christian-Christian')  # christian vs. christian (same topic)
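
The `clean` helper used here is only partially shown above. A minimal, hypothetical sketch of an equivalent helper, assuming it fetches one 20 Newsgroups category with scikit-learn, tokenizes the posts, and converts them with a module-level gensim `dictionary` consistent with the one behind `similarity_matrix`:

# Hypothetical sketch of the clean() helper used above.
from sklearn.datasets import fetch_20newsgroups
from gensim.utils import simple_preprocess

def clean(cat, subset='test'):
    posts = fetch_20newsgroups(subset=subset, categories=[cat],
                               remove=('headers', 'footers', 'quotes'))
    tokenized = [simple_preprocess(post) for post in posts.data]
    bow = [dictionary.doc2bow(doc) for doc in tokenized]
    return [b for b in bow if len(b) > 0]  # drop empty documents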
Example #27
0
# dictionary, index and article_ids are only needed when the past_behavior (SoftCosine) recommender is used

try:
    lda_model = gensim.models.LdaModel.load("put path to model here")
except Exception:
    lda_model = None
try:
    lda_dict = gensim.corpora.Dictionary.load("/put path to dict here")
except Exception:
    lda_dict = None
try:
    dictionary = gensim.corpora.Dictionary.load("put path to dict here")
except Exception:
    dictionary = None
try:
    index = SoftCosineSimilarity.load('put path to index here')
except Exception:
    index = None
try:
    article_ids = pickle.load(open('put path to article ids here', 'rb'))
except Exception:
    article_ids = None
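
A minimal sketch of how the loaded artifacts might later be used by the past_behavior recommender, assuming the user's reading history is available as raw texts, that `article_ids` is aligned with the order of documents in the index, and that simple whitespace tokenization is acceptable (all of these are assumptions, and the helper name is hypothetical):

# Hypothetical sketch: score indexed articles against a user's reading history
# with the loaded dictionary and SoftCosineSimilarity index.
def recommend_from_history(history_texts, num_recommendations=5):
    if dictionary is None or index is None or article_ids is None:
        return []
    # represent the whole history as one bag-of-words query
    tokens = [t for text in history_texts for t in text.lower().split()]
    query = dictionary.doc2bow(tokens)
    sims = index[query]  # similarity of the history to every indexed article
    ranked = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)
    return [article_ids[i] for i, _ in ranked[:num_recommendations]]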

login.login_view = 'login'
if not app.debug:
    if app.config['MAIL_SERVER']:
        auth = None
        if app.config['MAIL_USERNAME'] or app.config['MAIL_PASSWORD']:
            auth = (app.config['MAIL_USERNAME'], app.config['MAIL_PASSWORD'])
        secure = None
        if app.config['MAIL_USE_TLS']:
Example #28
0
def get_bow(corpus):
    # corpus: preprocessed, tokenized documents ('dictionary' is assumed to be
    # a gensim Dictionary defined at module level)
    bow = [dictionary.doc2bow(doc)
           for doc in corpus]  # transform into a gensim bag-of-words corpus
    bow = [i for i in bow if len(i) > 0]  # remove empty lists
    return bow


#%%
test1BOW = get_bow(df_test1['tokenized'])
train1 = df['Dominant_Topic'] == 1
dftrain1 = df[train1]
dftrain1.reset_index(drop=True, inplace=True)
train1BOW = get_bow(dftrain1['Text'])
#%%

testvstraining_1 = SoftCosineSimilarity(train1BOW, similarity_matrix)
scs_topic1 = testvstraining_1[test1BOW]
#%%

test4BOW = get_bow(df_test4['tokenized'])
scs_topic1vstopic4 = testvstraining_1[test4BOW]

#%%
fig = plt.figure(1, figsize=(10, 5))
sns.distplot(scs_topic1, bins=50, color='red',
             label='TrainingvsTest (Topic1)')  # topic 1 training vs. topic 1 test
sns.distplot(scs_topic1vstopic4,
             bins=50,
             color='green',
             label='Topic1vsTopic4')  # topic 1 training vs. topic 4 test
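
This snippet relies on a `dictionary` and a `similarity_matrix` built elsewhere. A minimal sketch of that assumed setup, training a Word2Vec model on the already-tokenized 'Text' column of the training dataframe `df` (names and parameters here are assumptions, following the import layout used in the other snippets):

# Assumed setup for the snippet above (illustrative only).
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix

tokenized_docs = df['Text'].tolist()  # each entry is a list of tokens
dictionary = Dictionary(tokenized_docs)
w2v_model = Word2Vec(tokenized_docs, min_count=1, seed=1)
termsim_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)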
Example #29
0
    def _preprocess_dataset(
        self, level: str
    ) -> Tuple[List[Document], List[Document], np.ndarray, List[Tuple[int,
                                                                      float]]]:
        LOGGER.info('Preprocessing {} ({})'.format(self.dataset, level))

        if level == 'validation':
            pivot = int(round(len(self.dataset.train_documents) * 0.8))
            train_documents = self.dataset.train_documents[:pivot]
            test_documents = self.dataset.train_documents[pivot:]
        elif level == 'test':
            train_documents = self.dataset.train_documents
            test_documents = self.dataset.test_documents
        else:
            message = 'Expected validation or test level, but got {}'
            raise ValueError(message.format(level))

        cache_path = self.model.cache_dir / 'text_classification'
        cache_path.mkdir(exist_ok=True)
        method_parameters = TEXT_CLASSIFICATION_METHOD_PARAMETERS[self.method]
        if self.method == 'scm':
            train_corpus = [document.words for document in train_documents]
            dictionary = Dictionary(train_corpus, prune_at=None)
            tfidf = TfidfModel(dictionary=dictionary, smartirs='nfn')
            termsim_index = WordEmbeddingSimilarityIndex(
                self.model.vectors, **method_parameters['similarity_index'])
            cache_path = cache_path / '{}-{}-{}-{}'.format(
                self.dataset.name, self.dataset.split_idx, self.method, level)
            try:
                similarity_matrix = SparseTermSimilarityMatrix.load(
                    str(cache_path), mmap='r')
            except IOError:
                similarity_matrix = SparseTermSimilarityMatrix(
                    termsim_index, dictionary, tfidf,
                    **method_parameters['similarity_matrix'])
                similarity_matrix.save(str(cache_path))
            train_corpus = [
                dictionary.doc2bow(document) for document in train_corpus
            ]
            train_corpus = tfidf[train_corpus]
            similarity_model = SoftCosineSimilarity(train_corpus,
                                                    similarity_matrix)
            test_corpus = (document.words for document in test_documents)
            test_corpus = [
                dictionary.doc2bow(document) for document in test_corpus
            ]
            test_corpus = tfidf[test_corpus]
        elif self.method == 'wmd':
            train_corpus = [document.words for document in train_documents]
            cache_path = cache_path / '{}-{}'.format(self.dataset.name,
                                                     self.method)
            cache_path = cache_path.with_suffix('.shelf')
            similarity_model = ParallelCachingWmdSimilarity(
                train_corpus, self.model.vectors, cache_path)
            test_corpus = [document.words for document in test_documents]
        else:
            message = 'Preprocessing for method {} not yet implemented'.format(
                self.method)
            raise ValueError(message)

        with np.errstate(all='ignore'):
            similarities = similarity_model[test_corpus]
        expected_shape = (len(test_documents), len(train_documents))
        if similarities.shape != expected_shape:
            message = 'Expected similarities with shape {}, but received shape {}'
            raise ValueError(message.format(expected_shape,
                                            similarities.shape))

        return (train_documents, test_documents, similarities, test_corpus)
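
The similarity matrix returned by `_preprocess_dataset` has one row per test document and one column per training document. A minimal sketch of how it could drive a majority-vote k-nearest-neighbours classifier (the `.label` attribute and the value of k are assumptions):

# Hypothetical sketch: k-NN classification on top of the (num_test, num_train)
# similarity matrix returned by _preprocess_dataset.
from collections import Counter

import numpy as np

def knn_predict(similarities, train_documents, k=5):
    predictions = []
    for row in similarities:
        top_k = np.argsort(row)[::-1][:k]                    # most similar training docs
        labels = [train_documents[i].label for i in top_k]   # .label is an assumption
        predictions.append(Counter(labels).most_common(1)[0][0])
    return predictions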