Example #1
def build_tfidf_or_lsi(corpus, method='tfidf'):
    '''
    Build a model for ranking documents.
    Input: a text corpus and a method ("tfidf" or "lsi").
    Output: a tuple (the dictionary of corpus terms,
    the fitted model, and the similarity matrix).
    '''

    dictionary = Dictionary(corpus)
    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]
    model_tfidf = TfidfModel(corpus_bow)
    corpus_tfidf = [model_tfidf[doc] for doc in corpus_bow]
    simil_tfidf = MatrixSimilarity(corpus_tfidf)
    if method == 'tfidf':

        return dictionary, model_tfidf, simil_tfidf

    elif method == 'lsi':

        model_lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
        corpus_lsi = [model_lsi[doc] for doc in corpus_tfidf]  # project the tf-idf corpus the LSI model was trained on
        simil_lsi = MatrixSimilarity(corpus_lsi)

        return dictionary, model_lsi, simil_lsi
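
A minimal usage sketch for the helper above (the toy corpus, tokens, and query below are illustrative assumptions, not part of the original example):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel
from gensim.similarities import MatrixSimilarity

# Hypothetical pre-tokenized corpus.
texts = [
    ["human", "machine", "interface", "survey"],
    ["graph", "trees", "minors", "survey"],
    ["user", "response", "time", "system"],
]

dictionary, model, index = build_tfidf_or_lsi(texts, method='tfidf')

# Rank all documents against a query expressed in the same vector space.
query_bow = dictionary.doc2bow(["graph", "survey"])
sims = index[model[query_bow]]
print(sorted(enumerate(sims), key=lambda item: -item[1]))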
Example #2
 def trainModel(self):
     if self.toweight:
         self.model = LsiModel(self.tfidf[self.corpus], num_topics=self.num_topics)
         self.index = MatrixSimilarity(self.model[self.tfidf[self.corpus]])
     else:
         self.model = LsiModel(self.corpus, num_topics=self.num_topics)
         self.index = MatrixSimilarity(self.model[self.corpus])
 def train_model(self):
     """
     Read the preprocessed data and generate corpus dictionary, tfidf model and matrix(Cosine) similarity
     :return: status of training
     """
     try:
         data = pd.read_csv(self.processed_data)
         del data['Unnamed: 0']
         # creating tokens for the doc column
         corpus = data['doc'].map(break_to_tokens)
         # creating dictionary of words in the movie dataset
         dictionary = gensim.corpora.Dictionary(corpus)
         dictionary.save(self.corpus_dictionary)
         # creating vector with bag of words for the corpus
         vector = [dictionary.doc2bow(d) for d in corpus]
         # creating tfidf values for the vector
         tfidf = models.TfidfModel(vector)
         tfidf.save(self.tfidf_model)
         corpus_tfidf = tfidf[vector]
         # Compute Similarities
         similarity = MatrixSimilarity(corpus_tfidf,
                                       num_features=len(dictionary))
         similarity.save(self.matrix_similarity)
         return "Model Trained Successfully"
     except Exception:
         return "Error While Training Model"
Example #4
def calAuthorSim():
    conn = sqlite3.connect(config.db_path)
    db = conn.cursor()

    model = AuthorTopicModel.load(config.author_model128_path)
    poets = list(model.id2author.values())
    print(len(poets))
    # vec = model.get_author_topics('苏轼')
    index = MatrixSimilarity(model[list(model.id2author.values())], num_best=30)
    index.save(config.author_simMatrix_path)
    # index = MatrixSimilarity.load(config.author_simMatrix_path)

    for name in poets:
        # print(name)
        sims = index[model[name]]
        sims = sorted(sims, key=lambda item: -item[1])
    sims = [[poets[sim[0]], sim[1]] for sim in sims]
        # print(sims)
        # sql_comment  = "UPDATE author SET sims=? WHERE id=?"
        # db.execute(sql_comment, (toJson(sims), name))

        sql_comment  = "UPDATE author SET sims=\'{}\' WHERE id=\'{}\'".format(toJson(sims), name)
        db.execute(sql_comment)
        # print(sql_comment)
    # print(len(poets))
    conn.commit()
Example #5
    def train(self, classdict, nb_topics, *args, **kwargs):
        """ Train the topic modeler.

        :param classdict: training data
        :param nb_topics: number of latent topics
        :param args: arguments to pass to the `train` method for gensim topic models
        :param kwargs: arguments to pass to the `train` method for gensim topic models
        :return: None
        :type classdict: dict
        :type nb_topics: int
        """
        self.nb_topics = nb_topics
        self.generate_corpus(classdict)
        if self.toweigh:
            self.tfidf = TfidfModel(self.corpus)
            normcorpus = self.tfidf[self.corpus]
        else:
            self.tfidf = None
            normcorpus = self.corpus

        self.topicmodel = gensim_topic_model_dict[self.algorithm](
            normcorpus, num_topics=self.nb_topics, *args, **kwargs)
        self.matsim = MatrixSimilarity(self.topicmodel[normcorpus])

        # change the flag
        self.trained = True
Example #6
	def compute(self):
		vec_texts = [text.split() for text in self.texts]
		write("\n    "+"-> Computing the dictionary".ljust(50,'.')) if self.debug else ''
		dictionary = Dictionary(vec_texts)
		write("[OK]") if self.debug else ''
		write("\n    "+"-> Creating the bag-of-words space".ljust(50,'.')) if self.debug else '' 
		corpus = [dictionary.doc2bow(vec) for vec in vec_texts]
		write("[OK]") if self.debug else ''
		write("\n    "+("-> Creating the %s space" % self.method).ljust(50,'.') ) if self.debug else '' 
		tfidf_space = TfidfModel(corpus)
		tfidf_corpus = tfidf_space[corpus]
		if self.method == 'TFIDF':
			self.space = tfidf_space
			self.index = MatrixSimilarity(tfidf_corpus)
		elif self.method == 'LSI': 
			self.space = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t) 
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		elif self.method == 'RP': 
			self.space = RpModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t) 
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		elif self.method == 'LDA':
			self.space = ldamodel.LdaModel(tfidf_corpus, id2word=dictionary, 
														 num_topics=self.num_t)
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		self.dictionary = dictionary
		write("[OK]\n") if self.debug else ''
Example #7
def create_model_tfidf_model(documents, model_name, matrix_name, dic_name):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidfmodel = TfidfModel(corpus)
    index = MatrixSimilarity(tfidfmodel[corpus], num_features=len(dictionary))
    index.save(matrix_name)
    tfidfmodel.save(model_name)
    dictionary.save(dic_name)
    return tfidfmodel, index, dictionary
    def main(self):

        print("Recommendation using TF_IDF")

        # Loading preprocessed data
        vagas_ti = pd.read_csv(self.dataPrepFile)
        vagas_ids = pickle.load(
            open(self.out + "preprocessing/vagas_ids.array", "rb"))
        vagas_words = pickle.load(
            open(self.out + "preprocessing/vagas_words.list", "rb"))
        cvs_words = pickle.load(
            open(self.out + "preprocessing/cvs_words.series", "rb"))
        cvs = pd.read_csv(self.dataCvsFile)
        cvs = cvs.fillna("")
        cvs.isnull().any()
        #print("Loading cvs done!")

        # Creating a dictionary
        dictionary = gcorp.Dictionary(vagas_words)
        dictionary.save(self.out + 'preprocessing/tf_idf/vagas.dict'
                        )  # store the dictionary, for future reference

        # compile corpus (vectors number of times each elements appears)
        raw_corpus = [dictionary.doc2bow(v) for v in vagas_words]
        gcorp.MmCorpus.serialize(self.out + 'preprocessing/tf_idf/vagas.mm',
                                 raw_corpus)  # store to disk
        print("Tamanho do dicionário: " + str(len(dictionary)))

        # STEP 2 : similarity between corpuses
        dictionary = gcorp.Dictionary.load(self.out +
                                           'preprocessing/tf_idf/vagas.dict')
        corpus = gcorp.MmCorpus(self.out + 'preprocessing/tf_idf/vagas.mm')

        # Transform Text with TF-IDF
        tfidf = gsm.TfidfModel(corpus)  # step 1 -- initialize a model

        # corpus tf-idf
        corpus_tfidf = tfidf[corpus]

        # STEP 3 : Create similarity matrix of all files
        index = MatrixSimilarity(corpus_tfidf,
                                 num_features=len(dictionary),
                                 num_best=10)
        index.save(self.out + 'preprocessing/tf_idf/vagas.index')
        index = MatrixSimilarity.load(self.out +
                                      'preprocessing/tf_idf/vagas.index')

        self.recommendationTf_idf(cvs, vagas_ti, vagas_ids, cvs_words,
                                  dictionary, tfidf, index)

        print("Recommendation using TF_IDF done!")
Example #9
def custom_queries(corpus, dictionary, paragraphs):

    # tfidf query:
    tfidf_model = TfidfModel(corpus, dictionary=dictionary)
    query = process_query("What is the function of money?", dictionary)
    tfidf_query = tfidf_model[query]

    tfidf_corpus = []
    for i in range(len(corpus)):
        tfidf_corpus.append(tfidf_model[corpus[i]])

    tfidf_index = MatrixSimilarity(tfidf_corpus)

    print("tfidf query:")
    doc2similarity_tfidf = enumerate(tfidf_index[tfidf_query])
    # use a separate loop variable so the MatrixSimilarity index object is not shadowed
    for doc_idx, similarity in sorted(doc2similarity_tfidf,
                                      key=lambda kv: -kv[1])[:3]:
        paragraph = paragraphs[doc_idx].split("\n")
        number = doc_idx + 1
        print("[paragraph: " + str(number) + "]")
        for i in range(5):
            print(paragraph[i])
            if (i + 1) == len(paragraph):
                break
        print("\n")

    # lsi query:
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    lsi_query = lsi_model[query]  # query in the same bag-of-words space the LSI model was trained on

    lsi_corpus = []
    for i in range(len(corpus)):
        lsi_corpus.append(lsi_model[corpus[i]])

    lsi_index = MatrixSimilarity(lsi_corpus)
    doc2similarity_lsi = enumerate(lsi_index[lsi_query])

    print("lsi query:")
    for doc_idx, similarity in sorted(doc2similarity_lsi,
                                      key=lambda kv: -kv[1])[:3]:
        paragraph = paragraphs[doc_idx].split("\n")
        number = doc_idx + 1
        print("[paragraph: " + str(number) + "]")
        for i in range(5):
            print(paragraph[i])
            if (i + 1) == len(paragraph):
                break
        print("\n")
Example #10
def calculate_tfidf_cos_sim(text1, text2, dictionary_tfidf, corpus):

    tfidf1 = to_tfidf(text1, dictionary_tfidf, corpus)
    tfidf2 = to_tfidf(text2, dictionary_tfidf, corpus)
    index = MatrixSimilarity([tfidf1], num_features=len(dictionary_tfidf))
    sim = index[tfidf2]
    return float(sim[0])
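
The snippet above relies on a `to_tfidf` helper defined elsewhere in the same module; a plausible sketch of its shape (an assumption for illustration, not the original implementation):

from gensim.models import TfidfModel

def to_tfidf(text, dictionary_tfidf, corpus):
    # Fit a tf-idf model over the bag-of-words corpus (in practice this would be
    # cached) and return the tf-idf vector of `text` under the shared dictionary.
    tfidf_model = TfidfModel(corpus)
    bow = dictionary_tfidf.doc2bow(text.lower().split())
    return tfidf_model[bow]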
Example #11
    def find_similarity_scores(self, topics):
        # Create similarities container
        similarities = {'Resumes': {}}
        # Gensim requires a corpora data structure for transformations and analysis
        dictionary = corpora.Dictionary(self.corpus)

        # Convert text to BoW.  It already is but lets be sure.
        corpus_gensim = [dictionary.doc2bow(doc) for doc in self.corpus]

        # Term Frequency-Inverse Document Frequency (TF-IDF) down-weights terms
        # that appear in many documents across the corpus.
        self.tfidf = TfidfModel(corpus_gensim)
        print(self.tfidf)
        self.tfidf = self.tfidf[corpus_gensim]
        print(self.tfidf)
        # Find pair-wise cosine similarity in the Latent Semantic Indexing (LSI) vector space
        # https://en.wikipedia.org/wiki/Latent_semantic_analysis#Latent_semantic_indexing
        lsi = LsiModel(self.tfidf, id2word=dictionary, num_topics=topics)
        lsi_index = MatrixSimilarity(lsi[self.tfidf])
        similarities['Resumes']["LSI_Similarity"] = np.array(
            [lsi_index[lsi[self.tfidf[i]]] for i in range(len(self.corpus))])

        for doc in self.tfidf:
            for word_id, value in doc:
                word = dictionary.get(word_id)
                self.ind_word_scores[word] = value

        # Convert to numpy arrays
        self.f_list = np.array(self.f_list)
        self.data = np.array(self.data)

        # Return results to object
        self.sim_matrix = similarities
    def get_recommendation(self, movie_title: str):
        """
        Accepts Movie Name and fetches the list of recommended movie names using matrix(cosine) similarity
        :param movie_title:
        :return: array of movie names
        """
        print("movie : ", movie_title)
        dictionary = gensim.corpora.Dictionary.load(self.corpus_dictionary)
        tfidf_model = gensim.models.TfidfModel.load(self.tfidf_model)
        similarity = MatrixSimilarity.load(self.matrix_similarity)
        data = pd.read_csv(self.processed_data)

        del data['Unnamed: 0']
        data["original_title"] = data["original_title"].str.lower()
        movie = data.loc[data.original_title == movie_title]
        print(movie)
        if movie.shape[0] == 0:
            status = ["Failed to Recommend Movies with existing movie data."]
            return status
        else:
            movie_doc_bow = dictionary.doc2bow(
                movie['doc'].map(break_to_tokens)[0])
            movie_tfidf = tfidf_model[movie_doc_bow]
            movie_recommendations = pd.DataFrame({
                'Cosine_sim_values':
                similarity[movie_tfidf],
                'title':
                data.original_title.values
            }).sort_values(by="Cosine_sim_values", ascending=False)
            top_recommendations = movie_recommendations['title'].head(11)
            return top_recommendations.to_numpy()
Example #13
    def __init__(self, model_prefix=None, num_best=None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix +
                                                    '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix +
                                           '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        sim_fname = "%s.cluster.%d.centroids" % (model_prefix, 2000)
        self.similarity_index = MatrixSimilarity.load(sim_fname, mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example #14
File: lsi.py  Project: jwilber/artcamp
    def load(lsi_path=None, id2word_path=None, index_path=None):
        """
        If specified, attempts to load gensim LsiModel from `lsi_path`
        and gensim Dictionary from `dictionary_path`.

        Parameters
        ----------
        lsi_path: str
            File-path designating where self.model should be saved.
        id2word_path: str
            File-path designating where self.dictionary should be saved.
        """
        if lsi_path is not None:
            from gensim.models import LsiModel
            if not os.path.exists(lsi_path):
                raise IOError(
                    'The provided file path to the LsiModel was not found. '
                    'Please ensure that the argument is the correct path.')
            return LsiModel.load(lsi_path)
        if id2word_path is not None:
            from gensim.corpora.dictionary import Dictionary
            if not os.path.exists(id2word_path):
                raise IOError(
                    'The provided file path to the Dictionary was not found. '
                    'Please ensure that the argument is the correct path.')
            return Dictionary.load(id2word_path)
        if index_path is not None:
            from gensim.similarities import MatrixSimilarity
            if not os.path.exists(index_path):
                raise IOError(
                    'The provided file path to the MatrixSimilarity index was not found. '
                    'Please ensure that the argument is the correct path.')
            return MatrixSimilarity.load(index_path)
Example #15
 def recomended_projects(self, request):
     projects = ProjectRequest.objects.all()
     project_keywords_dict = {}
     projects_dict = {}
     tags_list = []
     for project in projects:
         description = project.description
         description_keywords = get_keywords(description.replace('"', ''))
         tags = project.tags.replace('  ', ',').lower() 
         for keyword in description_keywords:
             tags += ',' + keyword[0].lower()
         tags_list.append(tags)
     df = read_frame(projects, fieldnames=['id', 'tags'], index_col=['id'])
     df['tags'] = tags_list
     keywords = df['tags'].tolist()
     keywords = [word_tokenize(keyword.lower()) for keyword in keywords]
     keywords = [no_commas(kw) for kw in keywords]
     processed_keywords = keywords
     dictionary = Dictionary(processed_keywords)
     corpus = [dictionary.doc2bow(doc) for doc in processed_keywords]
     tfidf = TfidfModel(corpus)
     sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
     top_3 = keywords_recommendation(all_projects=df, keywords=['uvg', 'gasolina', 'potente', 'mcdonald', 'mecanico', 'gg', 'carros'], number_of_hits=3, data=[dictionary, tfidf, sims])
     projects = []
     for id in top_3:
         projects.append(ProjectRequestSerializer(ProjectRequest.objects.get(pk=id)).data)
     return Response(projects)
Example #16
    def loadmodel(self, nameprefix):
        """ Load the topic model with the given prefix of the file paths.

        Given the prefix of the file paths, load the corresponding topic model. The files
        include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict),
        and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf).

        :param nameprefix: prefix of the file paths
        :return: None
        :type nameprefix: str
        """
        # load the JSON file (parameters)
        parameters = json.load(open(nameprefix + '.json', 'rb'))
        self.nb_topics = parameters['nb_topics']
        self.toweigh = parameters['toweigh']
        self.algorithm = parameters['algorithm']
        self.classlabels = parameters['classlabels']

        # load the dictionary
        self.dictionary = Dictionary.load(nameprefix + '.gensimdict')

        # load the topic model
        self.topicmodel = gensim_topic_model_dict[self.algorithm].load(
            nameprefix + '.gensimmodel')

        # load the similarity matrix
        self.matsim = MatrixSimilarity.load(nameprefix + '.gensimmat')

        # load the tf-idf model
        if self.toweigh:
            self.tfidf = TfidfModel.load(nameprefix + '.gensimtfidf')

        # flag
        self.trained = True
Example #17
def train_model_get_cosine_matrix(statements, num):

    statements = [statement.split() for statement in statements]
    dictionary = corpora.Dictionary(statements)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in statements]

    ###tfidf model
    # https://stackoverflow.com/questions/50521304/why-i-get-different-length-of-vectors-using-gensim-lsi-model
    tfidf = models.TfidfModel(doc_term_matrix, normalize=True)
    corpus_tfidf = tfidf[doc_term_matrix]

    lsi = models.LsiModel(corpus_tfidf, num_topics=num, id2word=dictionary)

    # represent every vocabulary word as a one-word bag-of-words document
    words = [
        dictionary.doc2bow([word])
        for word in sorted(list(dictionary.token2id.keys()))
    ]

    vectorized_corpus = lsi[words]

    index = MatrixSimilarity(vectorized_corpus)
    index[vectorized_corpus]

    out = pd.DataFrame(index[vectorized_corpus])
    out.columns = sorted(list(dictionary.token2id.keys()))
    out.index = sorted(list(dictionary.token2id.keys()))
    return out
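
A minimal usage sketch for the function above, assuming the imports it relies on (`from gensim import corpora, models`, `from gensim.similarities import MatrixSimilarity`, `import pandas as pd`) are already in place; the toy sentences are illustrative:

statements = [
    "the cat sat on the mat",
    "a dog sat on the log",
    "stocks fell sharply on monday",
]

# Two latent LSI topics are enough for a toy vocabulary this small.
word_sims = train_model_get_cosine_matrix(statements, num=2)

# word_sims is a square pandas DataFrame: rows and columns are the sorted
# vocabulary, and each cell holds the cosine similarity of the two words in LSI space.
print(word_sims.loc["cat", "dog"])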
Example #18
def main(argv):

    if len(sys.argv) != 2:
        print 'usage: text_exp sentence'
        sys.exit(2)

    # encode this sentence into semantic space
    # text = "Rice wheat and barley are all important feed crops."
    # text = "Brazil issues with industrial pollution"

    text = sys.argv[1]

    # first load the basic models
    MODELS_DIR = "models"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.ERROR)
    dictionary = gensim.corpora.Dictionary.load(
        os.path.join(MODELS_DIR, "mtsamples.dict"))
    corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, "mtsamples.mm"))

    # now, transform the text
    bow_text = dictionary.doc2bow(gensim.utils.tokenize(text))
    # show me the transformed text
    # print([(dictionary[id], count) for id, count in bow_text])

    # generate a tfidf model from the set of all articles
    tfidf = gensim.models.TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf[corpus]
    # then generate a LSI model from the set of all articles
    lsi = gensim.models.LsiModel(corpus_tfidf,
                                 id2word=dictionary,
                                 num_topics=10)

    # now, create a dense index from the set of all articles
    index_dense = MatrixSimilarity(lsi[corpus])

    # finally, let's use the input query and translate it into the lsi space.
    vec_lsi = lsi[bow_text]
    # compute the similarity index
    sims = index_dense[vec_lsi]
    # print the raw vector numbers
    # print (list(enumerate(sims)))

    # now, sort by similarity number and print the highest similar articles to the query
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    # print (sims)

    # load the file list
    file_list = pickle.load(open('models/file_list.p', 'rb'))

    # use it to show the article names

    dictSimilars = {}
    for i in range(len(sims)):
        ind = sims[i][0]
        dictSimilars[str(sims[i][1])] = file_list[ind]

    js = json.dumps(dictSimilars)
    return js
Example #19
def cs_lda(sp, df, feature, session, update_model):
    print(f"Starting LDA ...")
    if update_model:
        session = session + "lda-" + feature
        if not os.path.exists(session):
            print(f"New directory: {session}")
            os.mkdir(session)
        session = session + "/"
        create_dictionary(session, df, feature)
        create_model(session, df, feature)
    else:
        session = session + "lda-" + feature + "/"
    print(f"Computing Cosine Similarity on feature {feature}")
    dct = get_dict(feature, session)
    if not feature == "title":
        corpus = common.remove_stopwords(df[feature]).tolist()
    else:
        corpus = df[feature].tolist()
    corpus = [doc.split() for doc in corpus]
    corpus = [dct.doc2bow(text) for text in corpus]
    lda = LdaMulticore.load(session + "LDA-model-" + feature)
    res = lda[corpus]
    index = MatrixSimilarity(res)

    # index.save("simIndex.index")

    def compute(text):
        vec_bow = dct.doc2bow(text.split())
        vec_lda = lda[vec_bow]
        sims = index[vec_lda]
        return sims

    results = for_pivot(df[feature], df, compute)
    common.save_as_pivot(results, sp=sp)
Example #20
    def get_similarity_index(self, bow_corpus, lsa: LsiModel, recalculate=False, from_scratch=True):

        filepath = self.paths.get_lsa_index(lsa.num_topics)

        if not os.path.isfile(filepath) or recalculate:

            if not from_scratch:
                raise ValueError('No similarity index file exists but from_scratch is False')

            print('Building index...')
            index = MatrixSimilarity(lsa[bow_corpus])
            index.save(filepath)
        else:
            print('Loading index...')
            index = MatrixSimilarity.load(filepath)

        return index
Example #21
def cossim(query, documents):
    # Compute cosine similarity between the query and the documents.
    query = tfidf[dictionary.doc2bow(query)]
    index = MatrixSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in documents]],
        num_features=len(dictionary))
    similarities = index[query]
    return similarities
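
The function above relies on module-level `dictionary` and `tfidf` objects; a minimal sketch of how they might be prepared (the toy documents and query are illustrative assumptions):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity

documents = [
    ["money", "is", "a", "medium", "of", "exchange"],
    ["barter", "predates", "money"],
    ["the", "cat", "sat", "on", "the", "mat"],
]

dictionary = Dictionary(documents)
tfidf = TfidfModel(dictionary=dictionary)  # fit IDF weights directly from the dictionary

print(cossim(["what", "is", "money"], documents))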
Example #22
def cos_sim(text1, text2):
    tfidf1 = to_tfidf(text1)
    tfidf2 = to_tfidf(text2)
    index = MatrixSimilarity([tfidf1],num_features=len(dictionary))
    sim = index[tfidf2]
    # sim comes back as an array, but we only need a single value,
    # so we cast it straight to a float
    return float(sim[0])
Example #23
 def __init__(self,
              num_topics=100,
              wiki_tokens_path='data/token_sents.pkl',
              wiki_sents_path='data/sents.pkl',
              student_tokens_path='data/children_data.json'):
     super().__init__(wiki_tokens_path, wiki_sents_path,
                      student_tokens_path)
     self.lsi = self.compute_lsi(num_topics)
     self.lsi_index = MatrixSimilarity(self.lsi[self.wiki_tfidf_corpus])
Example #24
def lsi(corpus, dictionary):
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    lsi_corpus = []
    for i in range(len(corpus)):
        lsi_corpus.append(lsi_model[corpus[i]])

    lsi_similarity_matrix = MatrixSimilarity(lsi_corpus)
    print(lsi_model.show_topics())
    return lsi_similarity_matrix
Example #25
    def build_models(self):
        # Create tfidf model
        self.tfidf_model = TfidfModel(self.corpus)

        # Map bag of words to (word-index, word-weight)
        self.tfidf_corpus = list(
            map(lambda c: self.tfidf_model[c], self.corpus))

        self.tfidf_similarity = MatrixSimilarity(self.tfidf_corpus)

        self.lsi_model = LsiModel(self.tfidf_corpus,
                                  id2word=self.dictionary,
                                  num_topics=100)

        self.lsi_corpus = list(
            map(lambda c: self.lsi_model[c], self.tfidf_corpus))

        self.lsi_similarity = MatrixSimilarity(self.lsi_corpus)
def generate_embeddings(documents, dictionary):
    doc_word_index = MatrixSimilarity(
        [dictionary.doc2bow(document) for document in documents],
        num_features=len(dictionary))
    doc_doc_index = np.array(
        [doc_word_index[dictionary.doc2bow(doc)] for doc in documents])
    np.save('models/Word2vec/doc_word_index', doc_word_index.index)
    np.save('models/Word2Vec/doc_doc_index', doc_doc_index)
    return doc_word_index.index, doc_doc_index
Example #27
def tf_idf(corpus):
    tfidf_model = TfidfModel(corpus)
    tfidf_corpus = []
    for i in range(len(corpus)):
        tfidf_corpus.append(tfidf_model[corpus[i]])

    tfidf_similarity_matrix = MatrixSimilarity(tfidf_corpus)

    return tfidf_similarity_matrix
    def load(self):
        """
        load the corpora created by `make_corpus.py`
        """
        self.corpus = MmCorpus(self.corpus_file)
        self.dictionary = Dictionary.load_from_text(self.dict_file)
        self.titles = load_titles(self.title_file)

        self.tfidf_model = TfidfModel.load(self.tfidf_model_file)
        self.index = MatrixSimilarity(self.tfidf_model[self.corpus])
Example #29
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.info("loading %s object from %s and %s" % (cls.__name__,
                                                       fname,
                                                       fname + ".index"))
     result = utils.unpickle(fname)
     result.similarity_index = MatrixSimilarity.load(fname + ".index")
     return result
Example #30
    def __init__(self, filename):
        self.docs = loads(open(filename, "r").read())
        self.docmap = hoist_dict(self.docs, "id")

        if isfile("data.dict"):
            self.dictionary = Dictionary.load("data.dict")
        else:
            self.dictionary = Dictionary(iterate_summaries(self.docs))
            self.dictionary.save("data.dict")

        if isfile("data.mm"):
            self.corpus = MmCorpus("data.mm")
        else:
            corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
            MmCorpus.serialize("data.mm", corpus)
            self.corpus = MmCorpus("data.mm")

        self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

        if isfile("data.sim"):
            self.sim = MatrixSimilarity.load("data.sim")
        else:
            self.sim = MatrixSimilarity(self.lsi[self.corpus])
            self.sim.save("data.sim")

        # self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100, update_every=1, chunksize=10000, passes=1)

        self.sentiment_model = Doc2Vec.load("imdb.d2v")
        self.sentiment = LogisticRegression()
        self.sentiment.fit([self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
                           [self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
                           asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

        if isfile("arxiv.d2v"):
            self.doc_model = Doc2Vec.load("arxiv.d2v")
        else:
            tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")]) for doc in self.docs]
            doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
            doc_model.build_vocab(tagged)
            shuffle(tagged) # Replace with functional stuff
            for epoch in range(10):
                doc_model.train(tagged, total_examples=doc_model.corpus_count, epochs=doc_model.iter)
            doc_model.save("arxiv.d2v")
Example #31
def find_similarity(vec_lsi, df, k=30):

    with open('models/LSI_model/corpus_lsi.pickle', 'rb') as handle:
        corpus_lsi = pickle.load(handle)
    index = MatrixSimilarity(corpus_lsi, num_features=72)
    sims = index[vec_lsi]
    index = sims[0].argsort()[-k:][::-1]
    for i in index:
        print(i, "------->", df[i])
    return index
Example #32
File: esa.py  Project: consciousgaze/cs224u
    def __init__(self, model_prefix = None, num_best = None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        sim_fname = "%s.cluster.%d.centroids" % (model_prefix, 2000)
        self.similarity_index = MatrixSimilarity.load(sim_fname, mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example #33
    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    language = sys.argv[1]
    method = sys.argv[2].strip().lower()

    logging.info("loading corpus mappings")
    config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language),
                                 resultDir=gensim_build.RESULT_DIR, acceptLangs=[language])

    logging.info("loading word id mapping from %s", config.resultFile('wordids.txt'))
    id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids", len(id2word))

    corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
    input = MmCorpus(config.resultFile('_%s.mm' % method))
    assert len(input) == len(corpus), \
        "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus))

    # initialize structure for similarity queries
    if method == 'lsi' or method == 'rp':  # for these methods, use dense vectors
        index = MatrixSimilarity(input, num_best=MAX_SIMILAR + 1, num_features=input.numTerms)
    else:
        index = SparseMatrixSimilarity(input, num_best=MAX_SIMILAR + 1)

    index.normalize = False
    generateSimilar(corpus, index, method)

    logging.info("finished running %s", program)
logger.info('loading lsi model')
lsi_model = lsimodel.LsiModel.load(os.path.join(settings.PERSIST_DIR, 'lsi_model{}-200'.format(
                                                     fname_suffix)))

fnames = [line.strip() for line in open(os.path.join(settings.PERSIST_DIR, 'document_index{}'.format(
                                                     fname_suffix)))]

doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames),
                    dtype=object)

matrix_sim_loc = os.path.join(settings.PERSIST_DIR,
                              'tfidf_corpus_lsi{}-200_matrix_similarity'.format(fname_suffix))

if not os.path.exists(matrix_sim_loc):
    logger.info('building matrix similarity')
    doc_topic = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)

    logger.info('persisting matrix similarity index')
    doc_topic.save(matrix_sim_loc)
else:
    logger.info('matrix similarity already available. using that')
    doc_topic = MatrixSimilarity.load(matrix_sim_loc)

def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=nbranches, n_init=1,
                          init_size=1000, batch_size=1000)
Example #35
# load models

print "\n    Loading models, etc..\n"
id2word_pgfin = gensim.corpora.Dictionary.load('./data/pgfin.dictionary')
tfidf_model = gensim.models.TfidfModel.load('./data/tfidf_pgfin.model')
lsi_model = gensim.models.LsiModel.load('./data/lsi_pgfin.model')
indexfile = ('./data/ta_index.txt')
queryfile = './queryfiles/queryfile.txt'  # text in corpus
# queryfile = './queryfiles/45vuotta.txt'  # Film review
# queryfile = './queryfiles/tktjohdessee2.txt'  # Ancient essay

# check similarity

print "\n    Load similarity indices.\n"
index = Similarity.load('./data/pgfin_index.index')
index_dense = MatrixSimilarity.load('./data/pgfin_matrixindex.index')

with open(queryfile, 'r') as datafile:
    query = datafile.read()

# vectorize the query text into bag-of-words and tfidf
query_bow = id2word_pgfin.doc2bow(tokenize(query))
query_tfidf = tfidf_model[query_bow]
query_lsi = lsi_model[query_tfidf]

index_dense.num_best = 5


class BookHitValue(object):

    def __init__(self, indexfile, author_title, hit_percent):
tfidf_corpus_lsi = corpora.MmCorpus(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200'))

logger.info('loading lsi model')
lsi_model = lsimodel.LsiModel.load(os.path.join(settings.PERSIST_DIR, 'lsi_model-200'))

fnames = [line.strip() for line in open(os.path.join(settings.PERSIST_DIR, 'document_index'))]
doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames),
                    dtype=object)

#logger.info('building matrix similarity')
#doc_topic = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)

#logger.info('persisting matrix similarity index')
#doc_topic.save(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200_matrix_similarity'))

doc_topic = MatrixSimilarity.load(os.path.join(settings.PERSIST_DIR,
                                               'tfidf_corpus_lsi-200_matrix_similarity'))

def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=nbranches, n_init=1,
                          init_size=1000, batch_size=1000)
    mbk.fit(doc_topic.index[group['original_id']])
    return mbk


def index_freq_above(na, minval):
    l = pd.Series(na)
Example #37
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary, passes=20)

index = MatrixSimilarity(ldamodel[corpus])
index.save("simIndex.index")

print(ldamodel.print_topics(num_topics=30, num_words=2))

doc = stories['cast56']
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lda = ldamodel[vec_bow]

sims = index[vec_lda]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)
Example #38
import gensim
from gensim.similarities import Similarity, MatrixSimilarity

# from pgfin_timing import Timer

from pgfin_helpers import tokenize


logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore


# load the corpora

print "\n    Loading corpora.\n"
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_tfidf.mm')
# lsi_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_lsa.mm')
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfin_tfidf.mm')
lsi_corpus = gensim.corpora.MmCorpus('./data/pgfin_lsa.mm')
# print(tfidf_corpus)
# print(lsi_corpus)

print "\n    Start similarity index.\n"
index = Similarity('./data/pgfin_index', lsi_corpus, num_features=lsi_corpus.num_terms)
index.save('./data/pgfin_index.index')  # save to disk
# print index
index_dense = MatrixSimilarity(lsi_corpus, num_features=lsi_corpus.num_terms)
index_dense.save('./data/pgfin_matrixindex.index')  # save to disk
# print index_dense
Example #39
def k_cluster_wiki(input_prefix, output_prefix):
    k = 2000
    delta = 0.001
    max_iters = 10
    error = float('nan')
    old_error = float('nan')
    relative_error_change = float('nan')

    logger.info("Starting k-means clustering with k=%d, max iters=%d, delta=%f", k, max_iters, delta)

    m = ESAModel(input_prefix)
    similarity_index = m.similarity_index
    dictionary = m.dictionary

    num_topics = len(similarity_index)
    num_terms = len(dictionary)

    # Create initial cluster centroids.
    # L2-normalize them so we can calculate cosine similarity with a simple dot product.
    cluster_centroids = normalize(np.random.uniform(size=(k, num_terms)))

    # The cluster that each document belongs to.
    cluster_assignments = None

    logger.info("Preloading memory-mapped shards...")
    for i, shard in enumerate(similarity_index.shards):
        shard.get_index()

    iter = 0
    while iter < max_iters:

        # Calculate cosine similarities between each centroid and each topic.
        # To save time, we also calculate the error for the previous assignment during this step.
        logger.info("Calculating cosine similarity of each cluster with each document...")
        previous_cluster_assignments = np.copy(cluster_assignments)
        previous_cluster_centroids = np.copy(cluster_centroids)
        cluster_counts = np.ones(k) # Use ones instead of zeros to avoid divide by zero.

        cluster_centroids = np.zeros((k, num_terms))
        previous_centroid_distances = np.zeros(k)
        cluster_assignments = []
        docid = 0
        num_shards = len(similarity_index.shards)
        for i, shard in enumerate(similarity_index.shards):
            logger.info("Processing shard %d/%d ...", i, num_shards)
            # Calculate a (Cluster X Document) cosine similarity matrix for the current shard.
            # (C X T) . (T X D) = (C X D)
            logger.info("  Calculating similarities...")
            cluster_shard_similarities = previous_cluster_centroids * shard.get_index().index.transpose()

            # Select most similar cluster for each document.
            logger.info("  Calculating argmax...")
            cluster_selections = np.argmax(cluster_shard_similarities, axis=0)
            cluster_assignments = np.hstack([cluster_assignments, cluster_selections])

            shard_first_docid = docid

            # Calculate errors for the previous assignment.
            # We don't calculate errors on the first iteration since we don't
            # have an assignment yet.
            if previous_cluster_assignments.size != 1: # np.copy() of None has size 1
                logger.info("  Calculating error...")
                for doc_cluster_sims in cluster_shard_similarities.transpose():
                    cluster = previous_cluster_assignments[docid]
                    previous_centroid_distances[cluster] += (1 - doc_cluster_sims[cluster])
                    docid += 1

            # Iteratively recalculate the centroid of each cluster, so we don't
            # have to swap each shard out and back in.
            docid = shard_first_docid # Reset docid counter to before the error calcs.
            logger.info("  Computing new cluster centroids...")
            for topic_vec in shard.get_index().index:
                cluster = cluster_assignments[docid]
                cluster_centroids[cluster] += topic_vec
                cluster_counts[cluster] += 1
                docid += 1

        #print("Cluster assignments:", cluster_assignments)
        cluster_centroids /= cluster_counts[:,None]         # Take the average (off by one to avoid /0)
        cluster_centroids = normalize(cluster_centroids)    # And normalize.

        # We just use the sum of all cosine distances as our error metric.
        old_error = error
        error = np.sum(previous_centroid_distances)
        relative_error_change = abs(1 - error / old_error)

        logger.info("> Iteration: %d, previous error: %f, old error: %f, rel change: %f",
                    iter, error, old_error, relative_error_change)

        # TODO: Drop clusters with zero members assigned and merge clusters that
        # have converged to the same centroid.

        # Checkpoint the clusterings in every iteration so we can test them
        # before they converge.
        # Save centroids.
        centroids_fname = "%s.cluster.%d.centroids" % (output_prefix, k)
        logger.info("Saving clusters to file: %s", centroids_fname)
        s = MatrixSimilarity(None, dtype = np.float64, num_features = num_terms)
        s.index = cluster_centroids
        s.save(centroids_fname)
        del s   # Free any RAM the similarity index might use.

        # Save assignments.
        assignments_fname = "%s.cluster.%d.assignments" % (output_prefix, k)
        logger.info("Saving cluster assignments to file: %s", assignments_fname)
        np.save(open(assignments_fname, 'wb'), cluster_assignments)

        if relative_error_change < delta:
            logger.info("Converged.")
            break

        iter += 1

    logger.info("Done.")
Example #40
# query = 'oil and gas'
# from src.engine.preprocess import preprocess_body_lda
# query = preprocess_body_lda(query)
# corpus_query = [dictionary.doc2bow(query.split(" "))]
# transformed = tfidf[corpus_query]
#
# logentropy = models.LogEntropyModel(tfidf[corpus], id2word=dictionary, normalize=True)
# logentropy.save(settings.LOGENTROPY_MODEL)

# logentropy_query = logentropy[transformed]
lsi = models.LdaModel(corpus, id2word=dictionary, num_topics=30, passes=3, alpha='auto', chunksize=4000)
lsi.save(settings.LDA_MODEL)

lsi = models.LdaModel.load(settings.LDA_MODEL)
from gensim.similarities import MatrixSimilarity
similarity_matrix = MatrixSimilarity(lsi[corpus], num_features=100)
similarity_matrix.save(settings.SIMILARITY_MATRIX)

# similarities = similarity_matrix.get_similarities(lsi[logentropy_query])

#
#
#

# lsi_query = lsi[logentropy_query]
from gensim import matutils

# matutils.cossim(lsi.)


# passes = 1, per = 11000; alpha='auto', per=9200
Example #41
    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    language = sys.argv[1]
    method = sys.argv[2].strip().lower()

    logging.info("loading corpus mappings")
    config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language),
                                 resultDir=gensim_build.RESULT_DIR, acceptLangs=[language])

    logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
    id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
    input = MmCorpus(config.resultFile('_%s.mm' % method))
    assert len(input) == len(corpus), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus))

     # initialize structure for similarity queries
    if method == 'lsi' or method == 'rp': # for these methods, use dense vectors
        index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms)
    else:
        index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1)

    index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op)
    generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format

    logging.info("finished running %s" % program)