def train_model(self):
    """
    Read the preprocessed data and build the corpus dictionary, the TF-IDF model
    and the MatrixSimilarity (cosine) index.
    :return: status of training
    """
    try:
        data = pd.read_csv(self.processed_data)
        del data['Unnamed: 0']
        # create tokens for the doc column
        corpus = data['doc'].map(break_to_tokens)
        # create a dictionary of the words in the movie dataset
        dictionary = gensim.corpora.Dictionary(corpus)
        dictionary.save(self.corpus_dictionary)
        # create bag-of-words vectors for the corpus
        vector = [dictionary.doc2bow(d) for d in corpus]
        # compute TF-IDF weights for the vectors
        tfidf = models.TfidfModel(vector)
        tfidf.save(self.tfidf_model)
        corpus_tfidf = tfidf[vector]
        # compute similarities
        similarity = MatrixSimilarity(corpus_tfidf,
                                      num_features=len(dictionary))
        similarity.save(self.matrix_similarity)
        return "Model Trained Successfully"
    except Exception:
        return "Error While Training Model"
Example #2
def calAuthorSim():
    conn = sqlite3.connect(config.db_path)
    db = conn.cursor()

    model = AuthorTopicModel.load(config.author_model128_path)
    poets = list(model.id2author.values())
    print(len(poets))
    # vec = model.get_author_topics('苏轼')
    index = MatrixSimilarity(model[list(model.id2author.values())], num_best=30)
    index.save(config.author_simMatrix_path)
    # index = MatrixSimilarity.load(config.author_simMatrix_path)

    for name in poets:
        # print(name)
        sims = index[model[name]]
        sims = sorted(sims, key=lambda item: -item[1])
        sims = [[poets[sim[0]], sim[1]] for sim in sims]
        # print(sims)

        # use a parameterized query so quotes in the JSON cannot break the SQL
        sql_comment = "UPDATE author SET sims=? WHERE id=?"
        db.execute(sql_comment, (toJson(sims), name))
        # print(sql_comment)
    # print(len(poets))
    conn.commit()
Example #3
def create_model_tfidf_model(documents, model_name, matrix_name, dic_name):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidfmodel = TfidfModel(corpus)
    index = MatrixSimilarity(tfidfmodel[corpus], num_features=len(dictionary))
    index.save(matrix_name)
    tfidfmodel.save(model_name)
    dictionary.save(dic_name)
    return tfidfmodel, index, dictionary
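A brief usage sketch of create_model_tfidf_model; tokenized_docs, query_tokens and the three file names are placeholders.

tfidfmodel, index, dictionary = create_model_tfidf_model(
    tokenized_docs, 'docs.tfidf', 'docs.index', 'docs.dict')
query_bow = dictionary.doc2bow(query_tokens)
sims = index[tfidfmodel[query_bow]]  # cosine similarity against every training document
best = sorted(enumerate(sims), key=lambda item: -item[1])[:5]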
    def main(self):

        print("Recommendation using TF_IDF")

        # Loading preprocessed data
        vagas_ti = pd.read_csv(self.dataPrepFile)
        vagas_ids = pickle.load(
            open(self.out + "preprocessing/vagas_ids.array", "rb"))
        vagas_words = pickle.load(
            open(self.out + "preprocessing/vagas_words.list", "rb"))
        cvs_words = pickle.load(
            open(self.out + "preprocessing/cvs_words.series", "rb"))
        cvs = pd.read_csv(self.dataCvsFile)
        cvs = cvs.fillna("")
        cvs.isnull().any()
        #print("Loading cvs done!")

        # Creating a dictionary
        dictionary = gcorp.Dictionary(vagas_words)
        dictionary.save(self.out + 'preprocessing/tf_idf/vagas.dict'
                        )  # store the dictionary, for future reference

        # compile the corpus (bag-of-words vectors: counts of each term per document)
        raw_corpus = [dictionary.doc2bow(v) for v in vagas_words]
        gcorp.MmCorpus.serialize(self.out + 'preprocessing/tf_idf/vagas.mm',
                                 raw_corpus)  # store to disk
        print("Tamanho do dicionário: " + str(len(dictionary)))

        # STEP 2 : similarity between corpora
        dictionary = gcorp.Dictionary.load(self.out +
                                           'preprocessing/tf_idf/vagas.dict')
        corpus = gcorp.MmCorpus(self.out + 'preprocessing/tf_idf/vagas.mm')

        # Transform Text with TF-IDF
        tfidf = gsm.TfidfModel(corpus)  # step 1 -- initialize a model

        # corpus tf-idf
        corpus_tfidf = tfidf[corpus]

        # STEP 3 : Create similarity matrix of all files
        index = MatrixSimilarity(corpus_tfidf,
                                 num_features=len(dictionary),
                                 num_best=10)
        index.save(self.out + 'preprocessing/tf_idf/vagas.index')
        index = MatrixSimilarity.load(self.out +
                                      'preprocessing/tf_idf/vagas.index')

        self.recommendationTf_idf(cvs, vagas_ti, vagas_ids, cvs_words,
                                  dictionary, tfidf, index)

        print("Recommendation using TF_IDF done!")
Example #5
    def get_similarity_index(self, bow_corpus, lsa: LsiModel, recalculate=False, from_scratch=True):

        filepath = self.paths.get_lsa_index(lsa.num_topics)

        if not os.path.isfile(filepath) or recalculate:

            if not from_scratch:
                raise ValueError('No similarity index file exists but from_scratch is False')

            print('Building index...')
            index = MatrixSimilarity(lsa[bow_corpus])
            index.save(filepath)
        else:
            print('Loading index...')
            index = MatrixSimilarity.load(filepath)

        return index
Example #6
    def train(self):
        print("Reading serializations...")
        sr = SerializationReader(self.series)
        documents, doc2idx, idx2doc = sr.read()

        print("Building dictionary...")
        dictionary = Dictionary(documents)
        corpus = [dictionary.doc2bow(doc) for doc in documents]

        print("Building model...")
        lsi = LsiModel(corpus, id2word=dictionary, num_topics=self.dimensions)

        print("Building index...")
        index = MatrixSimilarity(lsi[corpus])

        print("Saving...")
        dictionary.save(self.dictionary)
        lsi.save(self.lsi)
        index.save(self.index)
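A hedged sketch of querying the files that train writes; the literal paths stand in for self.dictionary, self.lsi and self.index, and query_tokens is a placeholder.

from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

dictionary = Dictionary.load("series.dict")    # self.dictionary
lsi = LsiModel.load("series.lsi")              # self.lsi
index = MatrixSimilarity.load("series.index")  # self.index

query_tokens = ["example", "query", "tokens"]
vec_lsi = lsi[dictionary.doc2bow(query_tokens)]  # project the query into LSI space
sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])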
Example #7
class GensimTopicModeler(LatentTopicModeler):
    """
    This class facilitates the creation of topic models (options: LDA (latent Dirichlet Allocation),
    LSI (latent semantic indexing), and Random Projections)
    from the given short text training data, and converts future
    short texts into topic vectors using the trained topic model.

    This class extends :class:`LatentTopicModeler`.
    """
    def __init__(self,
                 preprocessor=textpreprocess.standard_text_preprocessor_1(),
                 algorithm='lda',
                 toweigh=True,
                 normalize=True):
        """ Initialize the topic modeler.

        :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
        :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda)
        :param toweigh: whether to weigh the words using tf-idf. (Default: True)
        :param normalize: whether the retrieved topic vectors are normalized. (Default: True)
        :type preprocessor: function
        :type algorithm: str
        :type toweigh: bool
        :type normalize: bool
        """
        LatentTopicModeler.__init__(self,
                                    preprocessor=preprocessor,
                                    normalize=normalize)
        self.algorithm = algorithm
        self.toweigh = toweigh

    def train(self, classdict, nb_topics, *args, **kwargs):
        """ Train the topic modeler.

        :param classdict: training data
        :param nb_topics: number of latent topics
        :param args: arguments to pass to the `train` method for gensim topic models
        :param kwargs: arguments to pass to the `train` method for gensim topic models
        :return: None
        :type classdict: dict
        :type nb_topics: int
        """
        self.nb_topics = nb_topics
        self.generate_corpus(classdict)
        if self.toweigh:
            self.tfidf = TfidfModel(self.corpus)
            normcorpus = self.tfidf[self.corpus]
        else:
            self.tfidf = None
            normcorpus = self.corpus

        self.topicmodel = gensim_topic_model_dict[self.algorithm](
            normcorpus, num_topics=self.nb_topics, *args, **kwargs)
        self.matsim = MatrixSimilarity(self.topicmodel[normcorpus])

        # change the flag
        self.trained = True

    def retrieve_corpus_topicdist(self, shorttext):
        """ Calculate the topic vector representation of the short text, in the corpus form.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param shorttext: text to be represented
        :return: topic vector in the corpus form
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: list
        """
        if not self.trained:
            raise e.ModelNotTrainedException()
        bow = self.retrieve_bow(shorttext)
        return self.topicmodel[self.tfidf[bow] if self.toweigh else bow]

    def retrieve_topicvec(self, shorttext):
        """ Calculate the topic vector representation of the short text.

        This function calls :func:`~retrieve_corpus_topicdist`.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param shorttext: text to be represented
        :return: topic vector
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        if not self.trained:
            raise e.ModelNotTrainedException()
        topicdist = self.retrieve_corpus_topicdist(shorttext)
        topicvec = np.zeros(self.nb_topics)
        for topicid, frac in topicdist:
            topicvec[topicid] = frac
        if self.normalize:
            topicvec /= np.linalg.norm(topicvec)
        return topicvec

    def get_batch_cos_similarities(self, shorttext):
        """ Calculate the score, which is the cosine similarity with the topic vector of the model,
        of the short text against each class labels.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param shorttext: short text
        :return: dictionary of scores of the text to all classes
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: dict
        """
        if not self.trained:
            raise e.ModelNotTrainedException()
        simdict = {}
        similarities = self.matsim[self.retrieve_corpus_topicdist(shorttext)]
        for label, similarity in zip(self.classlabels, similarities):
            simdict[label] = similarity
        return simdict

    def loadmodel(self, nameprefix):
        """ Load the topic model with the given prefix of the file paths.

        Given the prefix of the file paths, load the corresponding topic model. The files
        include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict),
        and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf).

        :param nameprefix: prefix of the file paths
        :return: None
        :type nameprefix: str
        """
        # load the JSON file (parameters)
        parameters = json.load(open(nameprefix + '.json', 'r'))
        self.nb_topics = parameters['nb_topics']
        self.toweigh = parameters['toweigh']
        self.algorithm = parameters['algorithm']
        self.classlabels = parameters['classlabels']

        # load the dictionary
        self.dictionary = Dictionary.load(nameprefix + '.gensimdict')

        # load the topic model
        self.topicmodel = gensim_topic_model_dict[self.algorithm].load(
            nameprefix + '.gensimmodel')

        # load the similarity matrix
        self.matsim = MatrixSimilarity.load(nameprefix + '.gensimmat')

        # load the tf-idf model
        if self.toweigh:
            self.tfidf = TfidfModel.load(nameprefix + '.gensimtfidf')

        # flag
        self.trained = True

    def savemodel(self, nameprefix):
        """ Save the model with names according to the prefix.

        Given the prefix of the file paths, save the corresponding topic model. The files
        include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict),
        and a topic model (.gensimmodel). If weighing is applied, the tf-idf model (.gensimtfidf) is saved as well.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param nameprefix: prefix of the file paths
        :return: None
        :raise: ModelNotTrainedException
        :type nameprefix: str
        """
        if not self.trained:
            raise e.ModelNotTrainedException()
        parameters = {}
        parameters['nb_topics'] = self.nb_topics
        parameters['toweigh'] = self.toweigh
        parameters['algorithm'] = self.algorithm
        parameters['classlabels'] = self.classlabels
        json.dump(parameters, open(nameprefix + '.json', 'w'))

        self.dictionary.save(nameprefix + '.gensimdict')
        self.topicmodel.save(nameprefix + '.gensimmodel')
        self.matsim.save(nameprefix + '.gensimmat')
        if self.toweigh:
            self.tfidf.save(nameprefix + '.gensimtfidf')
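An end-to-end usage sketch of GensimTopicModeler; the classdict contents and the nameprefix are invented for illustration.

# Hypothetical training data: class label -> list of short texts.
classdict = {
    'sports': ['the team won the final match', 'a new record in the marathon'],
    'finance': ['stocks rallied after the earnings report', 'the bank cut interest rates'],
}

modeler = GensimTopicModeler(algorithm='lda', toweigh=True)
modeler.train(classdict, nb_topics=8)

# cosine similarity of a new short text against each class label
scores = modeler.get_batch_cos_similarities('interest rates and bond markets')

# persists .json, .gensimdict, .gensimmodel, .gensimmat and (with toweigh) .gensimtfidf
modeler.savemodel('demo_topicmodel')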
Example #9
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics=30,
                                           id2word=dictionary,
                                           passes=20)

index = MatrixSimilarity(ldamodel[corpus])
index.save("simIndex.index")

print(ldamodel.print_topics(num_topics=30, num_words=2))

doc = stories['cast56']
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lda = ldamodel[vec_bow]

sims = index[vec_lda]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)
Example #10
# from src.engine.preprocess import preprocess_body_lda
# query = preprocess_body_lda(query)
# corpus_query = [dictionary.doc2bow(query.split(" "))]
# transformed = tfidf[corpus_query]
#
# logentropy = models.LogEntropyModel(tfidf[corpus], id2word=dictionary, normalize=True)
# logentropy.save(settings.LOGENTROPY_MODEL)

# logentropy_query = logentropy[transformed]
lsi = models.LdaModel(corpus, id2word=dictionary, num_topics=30, passes=3, alpha='auto', chunksize=4000)
lsi.save(settings.LDA_MODEL)

lsi = models.LdaModel.load(settings.LDA_MODEL)
from gensim.similarities import MatrixSimilarity
similarity_matrix = MatrixSimilarity(lsi[corpus], num_features=100)
similarity_matrix.save(settings.SIMILARITY_MATRIX)

# similarities = similarity_matrix.get_similarities(lsi[logentropy_query])

#
#
#

# lsi_query = lsi[logentropy_query]
from gensim import matutils

# matutils.cossim(lsi.)


# passes = 1, per = 11000; alpha='auto', per=9200
# passes = 2, per = 5100; alpha='auto', per=3200
import codecs
import json
from gensim import corpora
from gensim.similarities import MatrixSimilarity
import utils

# for real file path
huffPostDataFilePath = '../../lda-ner-result-data/rawHuffPostData.json'
gensimDictionaryBaseFilePath = '../../5w1h-result-data/gensim-in-time/gensimDictionary'
gensimCorpusBaseFilePath = '../../5w1h-result-data/gensim-in-time/gensimCorpus'
writingSimilarityIndexBaseFilePath = '../../5w1h-result-data/gensim-in-time/similarityIndex'

huffPostData = json.load(codecs.open(huffPostDataFilePath, 'r', 'utf-8-sig'))
year_months = utils.make_year_months_from_huff_post_data(huffPostData)

print('year_months', year_months)

# read dictionary and corpus in times
for year_month in year_months:
    dictionary_in_time_file_path = gensimDictionaryBaseFilePath + '_' + year_month + '.dict'
    corpus_in_time_file_path = gensimCorpusBaseFilePath + '_' + year_month + '.mm'

    dictionary = corpora.Dictionary.load(dictionary_in_time_file_path)

    corpus = corpora.MmCorpus(corpus_in_time_file_path)

    # make similarityIndex
    similarityIndex = MatrixSimilarity(corpus, num_best=10,
                                       num_features=len(dictionary))

    similarityIndex.save(writingSimilarityIndexBaseFilePath + '_' + year_month)
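A short sketch of loading one of the monthly indexes saved above and running a toy query against it (the keyword list is made up).

# Illustrative lookup against a single month's index.
year_month = year_months[0]
dictionary = corpora.Dictionary.load(gensimDictionaryBaseFilePath + '_' + year_month + '.dict')
similarityIndex = MatrixSimilarity.load(writingSimilarityIndexBaseFilePath + '_' + year_month)

queryBow = dictionary.doc2bow(['advance', 'april'])
print(similarityIndex[queryBow])  # the 10 best (document id, similarity) pairs, since num_best=10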
Example #12
import logging

import gensim
from gensim.similarities import Similarity, MatrixSimilarity

# from pgfin_timing import Timer

from pgfin_helpers import tokenize


logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore


# load the corpora

print "\n    Loading corpora.\n"
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_tfidf.mm')
# lsi_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_lsa.mm')
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfin_tfidf.mm')
lsi_corpus = gensim.corpora.MmCorpus('./data/pgfin_lsa.mm')
# print(tfidf_corpus)
# print(lsi_corpus)

print "\n    Start similarity index.\n"
index = Similarity('./data/pgfin_index', lsi_corpus, num_features=lsi_corpus.num_terms)
index.save('./data/pgfin_index.index')  # save to disk
# print index
index_dense = MatrixSimilarity(lsi_corpus, num_features=lsi_corpus.num_terms)
index_dense.save('./data/pgfin_matrixindex.index')  # save to disk
# print index_dense
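Both indexes built above answer queries the same way; Similarity keeps its shards on disk while MatrixSimilarity holds the whole matrix in RAM. A minimal query sketch, reusing the first LSI-space document as the query.

# Illustrative query; any vector in the LSI space of lsi_corpus would do.
query_vec = next(iter(lsi_corpus))
print(list(index[query_vec]))        # disk-backed, sharded Similarity index
print(list(index_dense[query_vec]))  # in-memory MatrixSimilarity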
Example #13
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word=dictionary, passes=20)

index = MatrixSimilarity(ldamodel[corpus])
index.save("simIndex.index")

print(ldamodel.print_topics(num_topics=30, num_words=2))

doc = stories['cast56']
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lda = ldamodel[vec_bow]

sims = index[vec_lda]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)
Example #14
# logentropy = models.LogEntropyModel(tfidf[corpus], id2word=dictionary, normalize=True)
# logentropy.save(settings.LOGENTROPY_MODEL)

# logentropy_query = logentropy[transformed]
lsi = models.LdaModel(corpus,
                      id2word=dictionary,
                      num_topics=30,
                      passes=3,
                      alpha='auto',
                      chunksize=4000)
lsi.save(settings.LDA_MODEL)

lsi = models.LdaModel.load(settings.LDA_MODEL)
from gensim.similarities import MatrixSimilarity
similarity_matrix = MatrixSimilarity(lsi[corpus], num_features=100)
similarity_matrix.save(settings.SIMILARITY_MATRIX)

# similarities = similarity_matrix.get_similarities(lsi[logentropy_query])

#
#
#

# lsi_query = lsi[logentropy_query]
from gensim import matutils

# matutils.cossim(lsi.)

# passes = 1, per = 11000; alpha='auto', per=9200
# passes = 2, per = 5100; alpha='auto', per=3200
# passes = 3. per = 4400; alpha='auto', per=2000
Example #15
# tokenize to vectors
corpus = [dictionary.doc2bow(text) for text in texts]
# MmCorpus.serialize('./gen_sim_corpus.mm', corpus)

from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

lsi = LsiModel(corpus, id2word=dictionary)
corpus_lsi = lsi[corpus]

lsi.save('gensim_lsi_model.lsi')

# transform corpus to LSI space and index it
index = MatrixSimilarity(corpus_lsi)
index.save('./gensim_lsi_matrix_similarity.index')

query = documents[0]
query_vec = dictionary.doc2bow(query.lower().split())
# convert the query to LSI space
vec_lsi = lsi[query_vec]

# perform a similarity query against the corpus
sims = index[vec_lsi]
sims_s = sorted(list(enumerate(sims)), key=lambda tup: tup[1], reverse=True)
# sorted (document number, similarity score) 2-tuples
print('\n')
print('Printing first 10')
real_documents = bugs.subject
c = 0
for item in sims_s:
Example #16
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

corpora.MmCorpus.serialize('corpus.mm', corpus_tfidf)
tfidf.save("my_model.tfidf")
tfidf = models.TfidfModel.load("my_model.tfidf")

print('Building LsiModel...')
corpus_tfidf = corpora.MmCorpus('corpus.mm')
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)

print('Building MatrixSimilarity...')
from gensim.similarities import MatrixSimilarity
index = MatrixSimilarity(lsi[corpus_tfidf])

index.save('deerwester.index')
index = MatrixSimilarity.load('deerwester.index')

print('Testing...')
result = np.zeros((20, 300)).astype('str')
j = 0
for doc in query_test['Query']:
    doc = jieba.cut(doc)
    tokens = []
    for word in doc:
        tokens.append(word)

    vec_bow = dictionary.doc2bow(tokens)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
Example #17
def k_cluster_wiki(input_prefix, output_prefix):
    k = 2000
    delta = 0.001
    max_iters = 10
    error = float('nan')
    old_error = float('nan')
    relative_error_change = float('nan')

    logger.info(
        "Starting k-means clustering with k=%d, max iters=%d, delta=%f", k,
        max_iters, delta)

    m = ESAModel(input_prefix)
    similarity_index = m.similarity_index
    dictionary = m.dictionary

    num_topics = len(similarity_index)
    num_terms = len(dictionary)

    # Create initial cluster centroids.
    # L2-normalize them so we can calculate cosine similarity with a simple dot product.
    cluster_centroids = normalize(np.random.uniform(size=(k, num_terms)))

    # The cluster that each document belongs to.
    cluster_assignments = None

    logger.info("Preloading memory-mapped shards...")
    for i, shard in enumerate(similarity_index.shards):
        shard.get_index()

    iter = 0
    while iter < max_iters:

        # Calculate cosine similarities between each centroid and each topic.
        # To save time, we also calculate the error for the previous assignment during this step.
        logger.info("Calculating cosine similarity of each cluster with each document...")
        previous_cluster_assignments = np.copy(cluster_assignments)
        previous_cluster_centroids = np.copy(cluster_centroids)
        cluster_counts = np.ones(k)  # Use ones instead of zeros to avoid divide by zero.

        cluster_centroids = np.zeros((k, num_terms))
        previous_centroid_distances = np.zeros(k)
        cluster_assignments = []
        docid = 0
        num_shards = len(similarity_index.shards)
        for i, shard in enumerate(similarity_index.shards):
            logger.info("Processing shard %d/%d ...", i, num_shards)
            # Calculate a (Cluster X Document) cosine similarity matrix for the current shard.
            # (C X T) . (T X D) = (C X D)
            logger.info("  Calculating similarities...")
            cluster_shard_similarities = previous_cluster_centroids * shard.get_index().index.transpose()

            # Select most similar cluster for each document.
            logger.info("  Calculating argmax...")
            cluster_selections = np.argmax(cluster_shard_similarities, axis=0)
            cluster_assignments = np.hstack(
                [cluster_assignments, cluster_selections])

            shard_first_docid = docid

            # Calculate errors for the previous assignment.
            # We don't calculate errors on the first iteration since we don't
            # have an assignment yet.
            if previous_cluster_assignments.size != 1:  # np.copy() of None has size 1
                logger.info("  Calculating error...")
                for doc_cluster_sims in cluster_shard_similarities.transpose():
                    cluster = previous_cluster_assignments[docid]
                    previous_centroid_distances[cluster] += (1 - doc_cluster_sims[cluster])
                    docid += 1

            # Iteratively recalculate the centroid of each cluster, so we don't
            # have to swap each shard out and back in.
            docid = shard_first_docid  # Reset docid counter to before the error calcs.
            logger.info("  Computing new cluster centroids...")
            for topic_vec in shard.get_index().index:
                cluster = cluster_assignments[docid]
                cluster_centroids[cluster] += topic_vec
                cluster_counts[cluster] += 1
                docid += 1

        #print("Cluster assignments:", cluster_assignments)
        cluster_centroids /= cluster_counts[:, None]  # Take the average (off by one to avoid /0)
        cluster_centroids = normalize(cluster_centroids)  # And normalize.

        # We just use the sum of all cosine distances as our error metric.
        old_error = error
        error = np.sum(previous_centroid_distances)
        relative_error_change = abs(1 - error / old_error)

        logger.info(
            "> Iteration: %d, previous error: %f, old error: %f, rel change: %f",
            iter, error, old_error, relative_error_change)

        # TODO: Drop clusters with zero members assigned and merge clusters that
        # have converged to the same centroid.

        # Checkpoint the clusterings in every iteration so we can test them
        # before they converge.
        # Save centroids.
        centroids_fname = "%s.cluster.%d.centroids" % (output_prefix, k)
        logger.info("Saving clusters to file: %s", centroids_fname)
        s = MatrixSimilarity(None, dtype=np.float64, num_features=num_terms)
        s.index = cluster_centroids
        s.save(centroids_fname)
        del s  # Free any RAM the similarity index might use.

        # Save assignments.
        assignments_fname = "%s.cluster.%d.assignments" % (output_prefix, k)
        logger.info("Saving cluster assignments to file: %s",
                    assignments_fname)
        np.save(open(assignments_fname, 'wb'), cluster_assignments)

        if relative_error_change < delta:
            logger.info("Converged.")
            break

        iter += 1

    logger.info("Done.")
fnames = [line.strip() for line in open(os.path.join(settings.PERSIST_DIR, 'document_index{}'.format(
                                                     fname_suffix)))]

doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames),
                    dtype=object)

matrix_sim_loc = os.path.join(settings.PERSIST_DIR,
                              'tfidf_corpus_lsi{}-200_matrix_similarity'.format(fname_suffix))

if not os.path.exists(matrix_sim_loc):
    logger.info('building matrix similarity')
    doc_topic = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)

    logger.info('persisting matrix similarity index')
    doc_topic.save(matrix_sim_loc)
else:
    logger.info('matrix similarity already available. using that')
    doc_topic = MatrixSimilarity.load(matrix_sim_loc)

def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=nbranches, n_init=1,
                          init_size=1000, batch_size=1000)
    mbk.fit(doc_topic.index[group['original_id'],:TOPIC_LIMIT])
    return mbk
Example #19

def get_user_vec(user_doc):
    user_vec = []
    for doc in user_doc:
        tokens = tokenizer.tokenize(doc)
        bow = dictionary.doc2bow(tokens)
        bow_tfidf = tfidf[bow]
        vec = ldamodel.get_document_topics(bow_tfidf)
        temp = np.zeros(num_topics)
        for topic in vec:
            temp[topic[0]] = topic[1]
        user_vec.append(temp)
    return user_vec


user_doc = get_user_doc(review_data, info_data)
user_vec = get_user_vec(user_doc)
pickle_data(picklepath, "lda_inputs", user_vec)

" MAKE SIMILARITY MATRIX "
from gensim.similarities import MatrixSimilarity
index = MatrixSimilarity(ldamodel[corpus_tfidf])
index_file = dp(os.path.join(modelpath, "lda_similarity.index"))
index.save(index_file)
'''
vector = prediction[:,-num_topics:][0]
sims = index[vector]
sims = sorted(enumerate(sims), key=lambda item: -item[1])[:10]
'''
Example #20
class Model:
    def __init__(self, filename):
        self.docs = loads(open(filename, "r").read())
        self.docmap = hoist_dict(self.docs, "id")

        if isfile("data.dict"):
            self.dictionary = Dictionary.load("data.dict")
        else:
            self.dictionary = Dictionary(iterate_summaries(self.docs))
            self.dictionary.save("data.dict")

        if isfile("data.mm"):
            self.corpus = MmCorpus("data.mm")
        else:
            corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
            MmCorpus.serialize("data.mm", corpus)
            self.corpus = MmCorpus("data.mm")

        self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

        if isfile("data.sim"):
            self.sim = MatrixSimilarity.load("data.sim")
        else:
            self.sim = MatrixSimilarity(self.lsi[self.corpus])
            self.sim.save("data.sim")

        # self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100, update_every=1, chunksize=10000, passes=1)

        self.sentiment_model = Doc2Vec.load("imdb.d2v")
        self.sentiment = LogisticRegression()
        self.sentiment.fit([self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
                           [self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
                           asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

        if isfile("arxiv.d2v"):
            self.doc_model = Doc2Vec.load("arxiv.d2v")
        else:
            tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")]) for doc in self.docs]
            doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
            doc_model.build_vocab(tagged)
            shuffle(tagged) # Replace with functional stuff
            for epoch in range(10):
                doc_model.train(tagged, total_examples=doc_model.corpus_count, epochs=doc_model.iter)
            doc_model.save("arxiv.d2v")

    def similar(self, query):
        """ Return all documents with similarity scores for the given query """
        return [{**{"similarity": float(sim)}, **doc}
                for doc, sim in zip(self.docs, self.sim[self.lsi[self.dictionary.doc2bow(query.lower().split())]])]

    def sentiment(self, id):
        """ Return a sentiment score for this document. """
        # TODO: Gensim
        import random
        return random.uniform(0, 1)

    def doc(self, id):
        return self.docmap.get(id)

    @property
    def positions(self):
        """ Return the positions for all document. Coordinates are arbitrary,
        but similar documents are physically close. """
        vectors = [self.sentiment_model[s] for s in self.sentiment_model.wv.index2word]
        pca = PCA(n_components=3, whiten=True)
        return pca.fit(vectors).transform(vectors)

    @property
    def everything(self):
        """ Return all documents with coordinates and sentiment scores """
        # TODO: Sentiment
        return [{**{"coordinates": list(pos)}, **doc}
                for doc, pos in zip(self.docs, self.positions)]
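A small usage sketch of the Model class; 'arxiv.json' is a placeholder for the JSON dump of documents (each with id and summary fields) that the constructor expects, and as written the constructor also needs imdb.d2v on disk.

# Hypothetical driver for the Model class above.
model = Model("arxiv.json")
hits = model.similar("neural networks for image classification")
hits.sort(key=lambda d: d["similarity"], reverse=True)
for doc in hits[:5]:
    print(doc["id"], round(doc["similarity"], 3))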
# gensimDictionaryFilePath = '../../5w1h-test-data/gensimDictionary.dict'
# gensimCorpusFilePath = '../../5w1h-test-data/gensimCorpus.mm'
# writingSimilarityIndexFilePath = '../../5w1h-test-data/similarityIndex'

# for real file path
gensimDictionaryFilePath = '../../5w1h-result-data/gensimDictionary.dict'
gensimCorpusFilePath = '../../5w1h-result-data/gensimCorpus.mm'
writingSimilarityIndexFilePath = '../../5w1h-result-data/similarityIndex'

# load gensim_dictionary and gensim_corpus
dictionary = corpora.Dictionary.load(gensimDictionaryFilePath)
corpus = corpora.MmCorpus(gensimCorpusFilePath)

# make similarityIndex
similarityIndex = MatrixSimilarity(corpus, num_best=9,
                                   num_features=len(dictionary))

# save similarityIndex
similarityIndex.save(writingSimilarityIndexFilePath)


# # test query_keywords
# queryKeywords = ['advance', 'april', 'april']

# # make doc2bow
# queryBow = dictionary.doc2bow(queryKeywords)
# print('queryBow', queryBow)

# similarityResult = similarityIndex[queryBow]
# print('similarityResult', similarityResult)