Example #1
    def loadmodel(self, nameprefix):
        """ Load the topic model with the given prefix of the file paths.

        Given the prefix of the file paths, load the corresponding topic model. The files
        include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict),
        and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf).

        :param nameprefix: prefix of the file paths
        :return: None
        :type nameprefix: str
        """
        # load the JSON file (parameters)
        parameters = json.load(open(nameprefix + '.json', 'r'))
        self.nb_topics = parameters['nb_topics']
        self.toweigh = parameters['toweigh']
        self.algorithm = parameters['algorithm']
        self.classlabels = parameters['classlabels']

        # load the dictionary
        self.dictionary = Dictionary.load(nameprefix + '.gensimdict')

        # load the topic model
        self.topicmodel = gensim_topic_model_dict[self.algorithm].load(
            nameprefix + '.gensimmodel')

        # load the similarity matrix
        self.matsim = MatrixSimilarity.load(nameprefix + '.gensimmat')

        # load the tf-idf model (only saved when weighting is applied)
        if self.toweigh:
            self.tfidf = TfidfModel.load(nameprefix + '.gensimtfidf')

        # flag
        self.trained = True
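A minimal usage sketch (hedged: the class name `TopicModeler` and the prefix are illustrative assumptions; the snippet only defines `loadmodel`):

# Hypothetical usage; assumes a model was previously saved under the same prefix.
modeler = TopicModeler()
modeler.loadmodel('models/news_topics')
assert modeler.trained  # dictionary, topic model, and similarity matrix are now ready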
Example #2
    def __init__(self, model_prefix=None, num_best=None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix +
                                                    '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix +
                                           '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        sim_fname = "%s.cluster.%d.centroids" % (model_prefix, 2000)
        self.similarity_index = MatrixSimilarity.load(sim_fname, mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
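A hedged instantiation sketch; the class name `EsaSimilarity` is an assumption, since the snippet shows only `__init__`:

# Hypothetical usage; expects the artifacts referenced above on disk:
# wiki_en_wordids.txt.bz2, wiki_en_bow.mm.metadata.cpickle,
# wiki_en.tfidf_model, and wiki_en.cluster.2000.centroids.
esa = EsaSimilarity(model_prefix='wiki_en', num_best=10)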
Example #3
    def get_recommendation(self, movie_title: str):
        """
        Accepts a movie name and fetches a list of recommended movie titles using matrix (cosine) similarity.
        :param movie_title: title of the movie (lowercase) to base recommendations on
        :return: array of recommended movie names
        """
        print("movie : ", movie_title)
        dictionary = gensim.corpora.Dictionary.load(self.corpus_dictionary)
        tfidf_model = gensim.models.TfidfModel.load(self.tfidf_model)
        similarity = MatrixSimilarity.load(self.matrix_similarity)
        data = pd.read_csv(self.processed_data)

        del data['Unnamed: 0']  # drop the stray index column written by to_csv
        data["original_title"] = data["original_title"].str.lower()
        movie = data.loc[data.original_title == movie_title]
        print(movie)
        if movie.shape[0] == 0:
            status = ["Failed to Recommend Movies with existing movie data."]
            return status
        else:
            # .iloc[0]: take the first matched row by position ([0] would index by label)
            movie_doc_bow = dictionary.doc2bow(
                movie['doc'].map(break_to_tokens).iloc[0])
            movie_tfidf = tfidf_model[movie_doc_bow]
            movie_recommendations = pd.DataFrame({
                'Cosine_sim_values':
                similarity[movie_tfidf],
                'title':
                data.original_title.values
            }).sort_values(by="Cosine_sim_values", ascending=False)
            top_recommendations = movie_recommendations['title'].head(11)
            return top_recommendations.to_numpy()
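A hedged usage sketch; the recommender instance and the constructor that sets its path attributes are assumptions from the surrounding project:

# Hypothetical call; titles are lower-cased before matching, so pass a lowercase name.
titles = recommender.get_recommendation('inception')
print(titles[:5])  # the result includes the query movie itself (head(11))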
Example #4
File: lsi.py Project: jwilber/artcamp
    def load(lsi_path=None, id2word_path=None, index_path=None):
        """
        If specified, attempts to load gensim LsiModel from `lsi_path`
        and gensim Dictionary from `dictionary_path`.

        Parameters
        ----------
        lsi_path: str
            File-path designating where self.model should be saved.
        id2word_path: str
            File-path designating where self.dictionary should be saved.
        """
        if lsi_path is not None:
            from gensim.models import LsiModel
            if not os.path.exists(lsi_path):
                raise IOError(
                    'The provided file path to the LsiModel was not found. '
                    'Please ensure that the argument is the correct path.')
            return LsiModel.load(lsi_path)
        if id2word_path is not None:
            from gensim.corpora.dictionary import Dictionary
            if not os.path.exists(id2word_path):
                raise IOError(
                    'The provided file path to the Dictionary was not found. '
                    'Please ensure that the argument is the correct path.')
            return Dictionary.load(id2word_path)
        if index_path is not None:
            from gensim.similarities import MatrixSimilarity
            if not os.path.exists(index_path):
                raise IOError(
                    'The provided file path to the MatrixSimilarity index was not found. '
                    'Please ensure that the argument is the correct path.')
            return MatrixSimilarity.load(index_path)
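A usage sketch for `load`; paths are checked in order and the first non-None one returns, so pass exactly one per call (the class name `Lsi` and the file paths are hypothetical):

# Hypothetical calls, assuming load() is exposed as a static method.
lsi = Lsi.load(lsi_path='artifacts/topics.lsi')
vocab = Lsi.load(id2word_path='artifacts/topics.dict')
index = Lsi.load(index_path='artifacts/topics.index')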
Example #5
    @classmethod
    def load(cls, fname):
        """
        Load a previously saved object from file (also see `save`).
        """
        logger.info("loading %s object from %s and %s" % (cls.__name__,
                                                          fname,
                                                          fname + ".index"))
        result = utils.unpickle(fname)
        # the similarity index is persisted separately under fname + ".index"
        result.similarity_index = MatrixSimilarity.load(fname + ".index")
        return result
Example #6
def getMatrixSimilarity(tfidfModel, lsiModel=None) -> SparseMatrixSimilarity:
    similarityPath = os.path.join('.cache', 'sim_mat.gensim_sim')
    try:
        sim = MatrixSimilarity.load(similarityPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        if lsiModel is None:
            lsiModel = getLsiModel(tfidfModel)
        sim = SparseMatrixSimilarity(lsiModel[corpus],
                                     num_best=21,
                                     num_features=tfidfModel.vectors.shape[0])
        sim.save(similarityPath)
    return sim
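A hedged usage sketch: `tfidfModel` is assumed to be a wrapper exposing its sparse term-document matrix as `.vectors` (that is how the function consumes it), and `getLsiModel` comes from the surrounding project:

# Hypothetical caller; the cache directory must exist before the first save().
os.makedirs('.cache', exist_ok=True)
sim = getMatrixSimilarity(tfidfModel)  # builds and caches on the first call, loads afterwards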
Example #7
    def main(self):

        print("Recommendation using TF_IDF")

        # Loading preprocessed data
        vagas_ti = pd.read_csv(self.dataPrepFile)
        vagas_ids = pickle.load(
            open(self.out + "preprocessing/vagas_ids.array", "rb"))
        vagas_words = pickle.load(
            open(self.out + "preprocessing/vagas_words.list", "rb"))
        cvs_words = pickle.load(
            open(self.out + "preprocessing/cvs_words.series", "rb"))
        cvs = pd.read_csv(self.dataCvsFile)
        cvs = cvs.fillna("")
        cvs.isnull().any()  # leftover sanity check; the result is unused
        #print("Loading cvs done!")

        # Creating a dictionary
        dictionary = gcorp.Dictionary(vagas_words)
        dictionary.save(self.out + 'preprocessing/tf_idf/vagas.dict'
                        )  # store the dictionary, for future reference

        # compile the corpus (bag-of-words vectors: counts of each token per document)
        raw_corpus = [dictionary.doc2bow(v) for v in vagas_words]
        gcorp.MmCorpus.serialize(self.out + 'preprocessing/tf_idf/vagas.mm',
                                 raw_corpus)  # store to disk
        print("Dictionary size: " + str(len(dictionary)))

        # STEP 2: similarity between corpora
        dictionary = gcorp.Dictionary.load(self.out +
                                           'preprocessing/tf_idf/vagas.dict')
        corpus = gcorp.MmCorpus(self.out + 'preprocessing/tf_idf/vagas.mm')

        # Transform Text with TF-IDF
        tfidf = gsm.TfidfModel(corpus)  # step 1 -- initialize a model

        # corpus tf-idf
        corpus_tfidf = tfidf[corpus]

        # STEP 3: create the similarity matrix of all files
        index = MatrixSimilarity(corpus_tfidf,
                                 num_features=len(dictionary),
                                 num_best=10)
        index.save(self.out + 'preprocessing/tf_idf/vagas.index')
        # reload the persisted index from disk
        index = MatrixSimilarity.load(self.out +
                                      'preprocessing/tf_idf/vagas.index')

        self.recommendationTf_idf(cvs, vagas_ti, vagas_ids, cvs_words,
                                  dictionary, tfidf, index)

        print("Recommendation using TF_IDF done!")
Example #8
def load_model(documents, model_name, matrix_name, dic_name, queue=None):
    try:
        tfidfmodel = TfidfModel.load(model_name)
        index = MatrixSimilarity.load(matrix_name)
        dictionary = Dictionary.load(dic_name)
    except Exception:  # any missing or corrupt artifact triggers a rebuild
        tfidfmodel, index, dictionary = create_model_tfidf_model(
            documents=documents,
            model_name=model_name,
            matrix_name=matrix_name,
            dic_name=dic_name)
    if queue is not None:
        queue.put([tfidfmodel, index, dictionary])
    return tfidfmodel, index, dictionary
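The `queue` parameter lets the loader run in a worker process; a hedged sketch (the document list and file names are placeholders):

from multiprocessing import Process, Queue

# Hypothetical usage: load (or rebuild) the models off the main process.
q = Queue()
p = Process(target=load_model,
            args=(documents, 'tfidf.model', 'sim.index', 'tokens.dict', q))
p.start()
tfidfmodel, index, dictionary = q.get()  # blocks until the loader finishes
p.join()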
Example #9
    def __init__(self, series, dictionary, lsi, index, sim_opt, rank_opt):
        super().__init__()

        self.norm = LookupNormalization()

        self.dictionary: Dictionary = Dictionary.load(dictionary)
        self.lsi: LsiModel = LsiModel.load(lsi)
        self.index: MatrixSimilarity = MatrixSimilarity.load(index)

        sr = SerializationReader(series)
        self.documents, self.doc2idx, self.idx2doc = sr.read()

        sim_class = globals()[self.SIM_OPTS[sim_opt]["cls"]]
        self.sim_strategy: SimilarityStrategy = sim_class(self.SIM_OPTS[sim_opt]["constant"])

        rank_class = globals()[self.RANK_OPTS[rank_opt]]
        self.rank_strategy: RankingStrategy = rank_class()
Example #10
    def get_similarity_index(self, bow_corpus, lsa: LsiModel, recalculate=False, from_scratch=True):

        filepath = self.paths.get_lsa_index(lsa.num_topics)

        if not os.path.isfile(filepath) or recalculate:

            if not from_scratch:
                raise ValueError('No similarity index file exists but from_scratch is False')

            print('Building index...')
            index = MatrixSimilarity(lsa[bow_corpus])
            index.save(filepath)
        else:
            print('Loading index...')
            index = MatrixSimilarity.load(filepath)

        return index
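A hedged usage sketch; the enclosing object (`builder`, with its `paths` helper) and the corpus/model variables are assumptions from the surrounding project:

# Hypothetical call: reuses the cached index file unless recalculate=True forces a rebuild.
index = builder.get_similarity_index(bow_corpus, lsa_model)
sims = index[lsa_model[query_bow]]  # cosine similarities against the indexed corpus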
Example #11
    def __init__(self, filename):
        self.docs = loads(open(filename, "r").read())
        self.docmap = hoist_dict(self.docs, "id")

        if isfile("data.dict"):
            self.dictionary = Dictionary.load("data.dict")
        else:
            self.dictionary = Dictionary(iterate_summaries(self.docs))
            self.dictionary.save("data.dict")

        if isfile("data.mm"):
            self.corpus = MmCorpus("data.mm")
        else:
            corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
            MmCorpus.serialize("data.mm", corpus)
            self.corpus = MmCorpus("data.mm")

        self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

        if isfile("data.sim"):
            self.sim = MatrixSimilarity.load("data.sim")
        else:
            self.sim = MatrixSimilarity(self.lsi[self.corpus])
            self.sim.save("data.sim")

        # self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100, update_every=1, chunksize=10000, passes=1)

        self.sentiment_model = Doc2Vec.load("imdb.d2v")
        self.sentiment = LogisticRegression()
        self.sentiment.fit([self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
                           [self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
                           asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

        if isfile("arxiv.d2v"):
            self.doc_model = Doc2Vec.load("arxiv.d2v")
        else:
            tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")]) for doc in self.docs]
            doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
            doc_model.build_vocab(tagged)
            shuffle(tagged) # Replace with functional stuff
            for epoch in range(10):
                doc_model.train(tagged, total_examples=doc_model.corpus_count, epochs=doc_model.iter)
            doc_model.save("arxiv.d2v")
Example #13
import json
import codecs
import re

import utils  # project-local helpers
# imports needed by the code below but absent from the original snippet:
from gensim import corpora
from gensim.similarities import MatrixSimilarity

huffPostDataFilePath = '../../5w1h-result-data/huffPostDataIncludingKeywords.json'
gensimDictionaryInTimeBaseFilePath = '../../5w1h-result-data/gensim-in-time/gensimDictionary'
gensimSimilarityIndexInTimeBaseFilePath = '../../5w1h-result-data/gensim-in-time/similarityIndex'

huffPostData = json.load(codecs.open(huffPostDataFilePath, 'r', 'utf-8-sig'))

# corpus = corpora.MmCorpus('../../lda-ner-result-data/gensimCorpus.mm')
dictionary = corpora.Dictionary.load(
    '../../5w1h-result-data/gensimDictionary.dict')
# load similarity_index
similarityIndex = MatrixSimilarity.load(
    '../../5w1h-result-data/similarityIndex')


def get_top_k_documents_total_time(queryKeywords):
    bow = dictionary.doc2bow(queryKeywords)
    sims = similarityIndex[bow]
    print('sims', sims)
    # make [(documentIndex, point)]
    # convertedSims = []
    # for sim in sims:
    #     convertedSims.append((int(sim[0]), sim[1]))

    # return convertedSims

    # return huffPostDatum
    topKHuffPostData = []
Example #14
# imports inferred from usage below; os, time, Flask, gensim, and pandas were absent from the snippet
import os
import time

import pandas as pd
from flask import Flask
from gensim import corpora
from gensim.models import LdaModel
from gensim.similarities import MatrixSimilarity
from sklearn.linear_model import LogisticRegression, Ridge

from src.text import preprocess
from src.transformers import PreprocessTokensTransformer

# constants
API_KEY = os.getenv('API_KEY', '_')
MIN_DF = 3
MAX_DF = 0.6

app = Flask(__name__)  # pylint: disable=C0103

# load data
dictionary = corpora.Dictionary.load('data/lda.dictionary')  # pylint: disable=C0103
lda = LdaModel.load('data/lda.model')  # pylint: disable=C0103
similarities = MatrixSimilarity.load('data/lda.similarity')  # pylint: disable=C0103
data = pd.read_csv(
    'data/data.csv',
    index_col='index',  # pylint: disable=C0103
    usecols=[
        'index', 'project_number', 'title', 'abstract', 'rcr',
        'preprocessed_text'
    ])
topic_counts = pd.read_csv('data/lda.topic_counts.csv', index_col='year')  # pylint: disable=C0103
topic_counts.columns = topic_counts.columns.astype(int)


def get_most_similar_documents(query, k=10):
    """Return indices for similar documents"""
    stime = time.time()
    sims = -similarities[query]
Example #15
tfidf_corpus_lsi = corpora.MmCorpus(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200'))

logger.info('loading lsi model')
lsi_model = lsimodel.LsiModel.load(os.path.join(settings.PERSIST_DIR, 'lsi_model-200'))

fnames = [line.strip() for line in open(os.path.join(settings.PERSIST_DIR, 'document_index'))]
doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames),
                    dtype=object)

#logger.info('building matrix similarity')
#doc_topic = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)

#logger.info('persisting matrix similarity index')
#doc_topic.save(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200_matrix_similarity'))

doc_topic = MatrixSimilarity.load(os.path.join(settings.PERSIST_DIR,
                                               'tfidf_corpus_lsi-200_matrix_similarity'))

def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=nbranches, n_init=1,
                          init_size=1000, batch_size=1000)
    mbk.fit(doc_topic.index[group['original_id']])
    return mbk


def index_freq_above(na, minval):
    l = pd.Series(na)
Example #16
import numpy as np
import pickle
import requests
import re
import gensim
from textwrap import dedent as d

from collections import defaultdict
from gensim import corpora, models
from gensim.similarities import MatrixSimilarity, SoftCosineSimilarity, SparseTermSimilarityMatrix
from gensim.corpora import Dictionary
from gensim.models import LsiModel

# Dash imports (assumed; the layout code below uses dash.Dash and html.*)
import dash
import dash_html_components as html

td = corpora.Dictionary.load('scratch_work/NLPScratch/brand_new_strains.dict')
m = LsiModel.load('scratch_work/NLPScratch/_fit_LSI_Model.model')
s_index = MatrixSimilarity.load('scratch_work/NLPScratch/strain_sim.index')
# load the pickled description and strain lookup tables; context managers close the files
with open("models/descriptions.pkl", "rb") as descripts:
    descripion_dict = pickle.load(descripts)
with open("models/strain_lookup.pkl", "rb") as strain_lookup:
    lookup = pickle.load(strain_lookup)

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

app.layout = html.Div([
    html.H1("Green-Rex: We Recommend It, You Smoke It!",
            style={
                'color': '#008000',
                "textAlign": "center"
            }),
Example #17
# load models

print("\n    Loading models, etc..\n")
id2word_pgfin = gensim.corpora.Dictionary.load('./data/pgfin.dictionary')
tfidf_model = gensim.models.TfidfModel.load('./data/tfidf_pgfin.model')
lsi_model = gensim.models.LsiModel.load('./data/lsi_pgfin.model')
indexfile = './data/ta_index.txt'
queryfile = './queryfiles/queryfile.txt'  # text in corpus
# queryfile = './queryfiles/45vuotta.txt'  # Film review
# queryfile = './queryfiles/tktjohdessee2.txt'  # Ancient essay

# check similarity

print("\n    Load similarity indices.\n")
index = Similarity.load('./data/pgfin_index.index')
index_dense = MatrixSimilarity.load('./data/pgfin_matrixindex.index')

with open(queryfile, 'r') as datafile:
    query = datafile.read()

# vectorize the query text into bag-of-words and tfidf
query_bow = id2word_pgfin.doc2bow(tokenize(query))
query_tfidf = tfidf_model[query_bow]
query_lsi = lsi_model[query_tfidf]

index_dense.num_best = 5  # return only the top 5 hits per query


class BookHitValue(object):

    def __init__(self, indexfile, author_title, hit_percent):
Example #18
doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames),
                    dtype=object)

matrix_sim_loc = os.path.join(settings.PERSIST_DIR,
                              'tfidf_corpus_lsi{}-200_matrix_similarity'.format(fname_suffix))

if not os.path.exists(matrix_sim_loc):
    logger.info('building matrix similarity')
    doc_topic = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)

    logger.info('persisting matrix similarity index')
    doc_topic.save(matrix_sim_loc)
else:
    logger.info('matrix similarity already available. using that')
    doc_topic = MatrixSimilarity.load(matrix_sim_loc)

def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=nbranches, n_init=1,
                          init_size=1000, batch_size=1000)
    mbk.fit(doc_topic.index[group['original_id'],:TOPIC_LIMIT])
    return mbk


def index_freq_above(na, minval):
    l = pd.Series(na)
Example #19
corpus_tfidf = tfidf[corpus]

corpora.MmCorpus.serialize('corpus.mm', corpus_tfidf)
tfidf.save("my_model.tfidf")
tfidf = models.TfidfModel.load("my_model.tfidf")

print('Building LsiModel...')
corpus_tfidf = corpora.MmCorpus('corpus.mm')
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)

print('Building MatrixSimilarity...')
from gensim.similarities import MatrixSimilarity
index = MatrixSimilarity(lsi[corpus_tfidf])

index.save('deerwester.index')
index = MatrixSimilarity.load('deerwester.index')

print('Testing...')
result = np.zeros((20, 300)).astype('str')
j = 0
for doc in query_test['Query']:
    tokens = list(jieba.cut(doc))  # segment the query into tokens

    vec_bow = dictionary.doc2bow(tokens)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    for i in range(300):
Example #20
doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames),
                    dtype=object)

matrix_sim_loc = os.path.join(
    settings.PERSIST_DIR,
    'tfidf_corpus_lsi{}-200_matrix_similarity'.format(fname_suffix))

if not os.path.exists(matrix_sim_loc):
    logger.info('building matrix similarity')
    doc_topic = MatrixSimilarity(tfidf_corpus_lsi,
                                 num_features=tfidf_corpus_lsi.num_terms)

    logger.info('persisting matrix similarity index')
    doc_topic.save(matrix_sim_loc)
else:
    logger.info('matrix similarity already available. using that')
    doc_topic = MatrixSimilarity.load(matrix_sim_loc)


def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return

    mbk = MiniBatchKMeans(init='k-means++',
                          n_clusters=nbranches,
                          n_init=1,
                          init_size=1000,
                          batch_size=1000)
    mbk.fit(doc_topic.index[group['original_id'], :TOPIC_LIMIT])
    return mbk
Example #21
File: main.py Project: wnozawa/mbtfidf
# load dictionary
from gensim import corpora
dictionary_name = "maqa.dict"
dictionary = corpora.Dictionary.load(dictionary_name)

# load model
from gensim import models

tfidf_name = "tfidf.model"
tfidf = models.TfidfModel.load(tfidf_name)

# load index
from gensim.similarities import MatrixSimilarity
index_name = "index_tfidf.index"
index = MatrixSimilarity.load(index_name)

# load original Q&A
import pandas
df_qa = pandas.read_csv('AllQA.csv')

from TFIDF import TFIDF_sims_argmax


@handler.add(MessageEvent, message=TextMessage)
def handle_message(event):
    # response=LSI_prediction(event.message.text, dictionary, tfidf, lsi_model, index, text_corpus)
    sims_argmax = TFIDF_sims_argmax(event.message.text, dictionary, tfidf,
                                    index)
    response = df_qa.iloc[sims_argmax, 1]
    conn = psycopg2.connect(DATABASE_URL, sslmode='require')
Example #22
from gensim.similarities import MatrixSimilarity
from gensim.models import LsiModel
from gensim.corpora import Dictionary
import pandas as pd
import time

bugs = pd.read_csv('./data/query_result_4189_cleared.csv')
documents = bugs.tokenized
real_documents = bugs.subject

lsi = LsiModel.load('gensim_lsi_model.lsi')
dictionary = Dictionary.load('./gen_sim.dict')
index = MatrixSimilarity.load('./gensim_lsi_matrix_similarity.index')

start = time.time()

query = documents[0]
query_vec = dictionary.doc2bow(query.lower().split())
# convert the query to LSI space
vec_lsi = lsi[query_vec]

sims = index[vec_lsi]
sims_s = sorted(enumerate(sims), key=lambda tup: tup[1], reverse=True)

end = time.time()
print('Counting one query took: {}'.format(end - start))
c = 0

for item in sims_s:
    i = item[0]
    v = item[1]
Example #23
def createSearchObjs():
    """
    Creates the SimSearch and KeySearch objects using the data structures
    created in `make_wikicorpus.py`.
    Returns (simsearch, keysearch, titles_to_id)
    """
    
    # Load the article titles. These have the format (pageid, article title)
    fprint('Loading Wikipedia article titles...')
    t0 = time.time()
    
    id_to_titles = utils.unpickle('./data/bow.mm.metadata.cpickle')
    titles_to_id = utils.unpickle('./data/titles_to_id.pickle')

    # id_to_titles is actually a map of indices to (pageid, article title).
    # The 'pageid' property is unused.
    # Convert id_to_titles into a simple list of titles.
    titles = [item[1][1] for item in id_to_titles.items()]
    
    fprint('    Took %.2f seconds' % (time.time() - t0))        
    
    # Load the dictionary (830ms on my machine)
    fprint('\nLoading dictionary...')
    t0 = time.time()
    
    dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
    
    fprint('    Took %.2f seconds' % (time.time() - t0))    
    
    # Load tf-idf model (60ms on my machine).
    fprint('\nLoading tf-idf model...')
    t0 = time.time()
    
    tfidf_model = TfidfModel.load('./data/tfidf.tfidf_model')    
    
    fprint('    Took %.2f seconds' % (time.time() - t0))        
    
    # We must not use `load`--that would attempt to load the corpus into 
    # memory, and it's 16.7 GB!!
    #corpus_tfidf = MmCorpus.load('./data/corpus_tfidf.mm')
    
    fprint('\nCreating tf-idf corpus object (leaves the vectors on disk)...')
    t0 = time.time()
    
    corpus_tfidf = MmCorpus('./data/corpus_tfidf.mm')
    
    fprint('    Took %.2f seconds' % (time.time() - t0))            
    
    # Create the KeySearch and SimSearch objects.    
    ksearch = KeySearch(dictionary, tfidf_model, corpus_tfidf, titles)
    simsearch = SimSearch(ksearch)
    
    # TODO - SimSearch doesn't currently have a clean way to provide the index
    # and model.
    
    fprint('\nLoading LSI model...')
    t0 = time.time()    
    simsearch.lsi = LsiModel.load('./data/lsi.lsi_model')
    
    fprint('    Took %.2f seconds' % (time.time() - t0))        
    
    # Load the Wikipedia LSI vectors into memory.
    # The matrix is 4.69GB for me, and takes ~15 seconds on my machine to load.
    fprint('\nLoading Wikipedia LSI index...')
    t0 = time.time()
        
    simsearch.index = MatrixSimilarity.load('./data/lsi_index.mm')
    
    fprint('    Took %.2f seconds' % (time.time() - t0))    

    # TODO - It would be interesting to try the 'Similarity' class which 
    #       shards the dataset on disk for you...

    return (simsearch, ksearch, titles_to_id)
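A usage sketch for the factory above; paths are hard-coded inside the function, so no arguments are needed (the title lookup is a hypothetical illustration):

simsearch, ksearch, titles_to_id = createSearchObjs()
page_id = titles_to_id['Machine learning']  # hypothetical: map an article title to its id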
Example #24

def explain_lda_prediction(inputs, prediction, index, num_topics):
    book_meta_data = pd.read_csv(os.path.join(datapath, "meta_dataset.csv"))

    vector = prediction[-num_topics:]  # last num_topics entries: the LDA topic portion of the prediction
    sims = index[vector]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])[:10]

    print("books that are similar by LDA: ")
    for sim in sims:
        book_id = sim[0]
        book_title = book_meta_data.title[book_id]
        print("\t", book_title)


" OPEN RBM MODEL PARAMETERS "
rbm_parameters = open_pickle(modelpath, "lda_rbm_parameters")
W = rbm_parameters['W']
bv = rbm_parameters['bv']
bh = rbm_parameters['bh']
num_topics = 50

from gensim.test.utils import datapath as dp
from gensim.similarities import MatrixSimilarity

index_file = dp(os.path.join(modelpath, "lda_similarity.index"))
index = MatrixSimilarity.load(index_file)

user_vecs = open_pickle(picklepath, "lda_rbm_inputs")