def __init__(self):
    self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
    self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
    self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
    self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
    self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
    self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
    self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
    # Parse "id=label" lines into an int-keyed label lookup
    with open(app.config["RCMDR_JOB_LABELS"]) as f:
        self.job_labels = {
            int(k): v
            for k, v in (line.split("=") for line in f.read().strip().split("\n"))
        }
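A minimal query sketch against the objects loaded above; the method name, the whitespace tokenization, and the choice to query the LDA index straight from bag-of-words are assumptions, not part of the original class.

    # Hypothetical helper (not in the original): rank indexed documents
    # against raw text via the LDA index loaded in __init__.
    def recommend(self, text, top_n=10):
        bow = self.dictionary.doc2bow(text.lower().split())  # simplistic tokenization
        lda_vec = self.lda_model[bow]                        # bow -> topic space
        self.lda_index.num_best = top_n                      # return only the top-n hits
        return [(doc_id, score, self.job_labels.get(doc_id))
                for doc_id, score in self.lda_index[lda_vec]]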
def __init__(self, model_prefix=None, num_best=None):
    self.model_prefix = model_prefix
    self.num_best = num_best
    if self.model_prefix is None:
        raise ValueError("model_prefix must be specified")
    logger.info("ESA: Loading word dictionary...")
    self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
    logger.info("ESA: Loading document name map...")
    self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')
    logger.info("ESA: Loading TF-IDF model...")
    self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')
    logger.info("ESA: Loading similarity index...")
    self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
    # logger.info("ESA: Preloading reverse indexes...")
    # self.similarity_index.preload_reverse_index()
    logger.info("ESA: Finished loading model files.")
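A hedged sketch of how the loaded pieces fit together for an ESA-style query; the method name and whitespace tokenization are assumptions, and `article_dict` is only assumed to map index positions to article metadata.

    # Hypothetical query method (not in the original class).
    def top_articles(self, text, topn=10):
        bow = self.dictionary.doc2bow(text.lower().split())  # simplistic tokenization
        self.similarity_index.num_best = topn
        # With num_best set, the index yields (position, cosine similarity) pairs
        return [(self.article_dict[pos], sim)
                for pos, sim in self.similarity_index[self.tfidf[bow]]]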
def create_similarity_index(self):
    # Build the index once and cache it on disk; later runs just reload it
    if not os.path.isfile(self.similarity_file):
        self.similarity_index = Similarity('./LSM/', self.corpora, self.num_topics)
        self.similarity_index.save(self.similarity_file)
    else:
        self.similarity_index = Similarity.load(self.similarity_file)
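For context, a self-contained sketch of the build-then-reload pattern above. The toy corpus and file names are illustrative; note that `num_topics` is passed as `Similarity`'s `num_features` argument, i.e. the dimensionality of the indexed vectors.

import os
from gensim.similarities import Similarity

os.makedirs('./LSM', exist_ok=True)
num_topics = 4
corpus_in_topic_space = [[(0, 0.5), (3, 0.25)], [(2, 1.0)]]  # toy LSI-style vectors
index = Similarity('./LSM/', corpus_in_topic_space, num_features=num_topics)
index.save('./LSM/similarity.index')
index = Similarity.load('./LSM/similarity.index')
print(index[[(0, 1.0)]])  # similarity of a toy query to both indexed documents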
def initiate_recommender():
    # Retrieve all the necessary files for the recommender system
    baseDir = settings.BASE_DIR

    # Load dictionary and corpus
    dictFile = baseDir + "/static/data/DBLP_Dictionary.dict"
    corpusFile = baseDir + "/static/data/DBLP_Corpus.mm"
    dictionary = corpora.Dictionary.load(dictFile)
    corpus = corpora.MmCorpus(corpusFile)

    # Load the TF-IDF model
    tfidfFile = baseDir + "/static/data/TF-IDF"
    tfidf = models.TfidfModel.load(tfidfFile)

    # Load the gensim similarity index
    indexFile = baseDir + "/static/data/Index"
    sims = Similarity.load(indexFile)
    # If the matrix fits in memory, use this instead and comment out the two lines above:
    # sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

    # Point to the text csv file
    textFile = baseDir + "/static/data/Text.csv"

    # Load the ID dataframe for the recommender
    paperIDs = baseDir + "/static/data/AbsID.csv"
    cols = ["paperID"]
    dfIDs = pd.read_csv(paperIDs, names=cols, header=None)

    return dictionary, corpus, tfidf, sims, textFile, dfIDs
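A hedged sketch of consuming the returned objects: score a new abstract against the index and map the best positions back to paper IDs. The query string and the whitespace tokenization are assumptions.

# Illustrative usage, not from the source project.
dictionary, corpus, tfidf, sims, textFile, dfIDs = initiate_recommender()
query_bow = dictionary.doc2bow("graph neural networks survey".lower().split())
ranking = sims[tfidf[query_bow]]        # one cosine similarity per indexed paper
top10 = ranking.argsort()[::-1][:10]    # positions of the ten closest papers
print(dfIDs.iloc[top10])                # map positions back to paper IDs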
def run(self):
    if self.clean_level in ('raw', 'clean', 'stopwords'):
        kind = self.clean_level
    else:
        kind = 'stopwords'

    # Save the similarities to a file in a simple format
    # NOTE: the index already stores the similarities; there is no need to recompute them
    for idioma, salida in self.output()['langs'].items():
        file_list = os.listdir(os.path.join(self.txt_dir, kind, idioma))
        for n_topics, o in salida.items():
            index = Similarity.load(self.input()['langs'][idioma][n_topics]['lsi-index'].path)

            # JSON
            sims = index2dict(index, file_list, num_sims=self.num_similar_docs)
            with o['json'].open('w') as f:
                json.dump(sims, f)

            # HTML + CSV
            s = u''
            rows = []
            for book, v in sims.items():
                s += u'-------------------------------------------\n'
                s += u'### %s\n\n' % book
                s += u'| Ranking | Book | Similarity |\n|:--------:|:-------|-------------:|\n'
                for rank, attrs in v.items():
                    s += u'| %d | %s | %f |\n' % (rank, attrs['name'], round(attrs['similarity'], 3))
                    rows.append({'from_name': book, 'to_name': attrs['name'], 'sim': attrs['similarity']})
                s += u'\n\n'
            md = markdown.markdown(s, extensions=['markdown.extensions.tables'])
            net = pd.DataFrame(rows, columns=['from_name', 'to_name', 'sim'])
            books = sorted(set(net['from_name']).union(net['to_name']))
            ids = {v: i for i, v in enumerate(books)}
            net['from'] = [ids[k] for k in net['from_name']]
            net['to'] = [ids[k] for k in net['to_name']]
            with o['html'].open('w') as f:
                f.write(md)
            with o['csv'].open('w') as f:
                net.to_csv(f, index=False)

            # Network graph (rendered in R)
            tempname = 'net_temp0.html'
            i = 1
            while os.path.exists(tempname):
                tempname = 'net_temp%d.html' % i
                i += 1
                if i >= 100:
                    print('ERROR: Cannot create the temporary network file. Check that no file '
                          'named %s exists in this folder and that you have write permission.' % tempname)
                    break
            subprocess.call(['itam-d3-network.R',
                             '--input', o['csv'].path,
                             '--output', tempname,
                             '--max_links', str(self.num_similar_docs),
                             '--min_sim', str(self.min_similarity)])
            print('USER INFO: Creating temporary file: ' + tempname)
            shutil.move(tempname, o['net'].path)
            print('USER INFO: Move complete, %s --> %s' % (tempname, o['net'].path))
            if os.path.exists(tempname):
                os.remove(tempname)
def load(self, path):
    if isinstance(path, str):
        path = Path(path)
    with open(path / 'paragraph-ids.txt') as f:
        self.paragraph_ids = [paragraph_id.strip() for paragraph_id in f]
    dictionary_path = str(path / 'dct.pkl')
    self.dictionary = Dictionary.load(dictionary_path)
    index_path = str(path / 'indexes' / 'master-index')
    self.index = Similarity.load(index_path)
    self.index.num_best = self.num_best
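A query sketch for the loader above; the method name and the assumption that queries arrive pre-tokenized are mine, not the source's.

    # Hypothetical query method (not in the original class).
    def most_similar(self, tokens):
        query = self.dictionary.doc2bow(tokens)
        # num_best was set in load(), so the index returns (position, score) pairs
        return [(self.paragraph_ids[pos], score) for pos, score in self.index[query]]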
def run(self):
    if self.clean_level in ('raw', 'clean', 'stopwords'):
        kind = self.clean_level
    else:
        kind = 'stopwords'

    # Save the similarities to a file in a simple format
    # NOTE: the index already stores the similarities; there is no need to recompute them
    for idioma, salida in self.output()['langs'].items():
        file_list = os.listdir(os.path.join(self.txt_dir, kind, idioma))
        for n_topics, o in salida.items():
            index = Similarity.load(self.input()['langs'][idioma][n_topics]['lsi-index'].path)
            sims = arrange_similarities(index, file_list, num_sims=self.num_similar_docs)
            sims = '\n'.join(['\t'.join([str(i) for i in t]) for t in sims])
            with o.open('w') as f:
                f.write(sims)
def load_precomputed():
    global questions, documents, dct, corpus, tfidf_model, corpus_tfidf, index
    if questions is None:
        questions = pickle.load(open('precompute/questions.pkl', 'rb'))
        logger.info("Loaded questions")
    if documents is None:
        documents = pickle.load(open('precompute/documents.pkl', 'rb'))
        logger.info("Loaded tokenized questions")
    if dct is None:
        dct = pickle.load(open('precompute/dct.pkl', 'rb'))
        logger.info("Loaded dictionary")
    if corpus is None:
        corpus = pickle.load(open('precompute/corpus.pkl', 'rb'))
        logger.info("Loaded corpus")
    if tfidf_model is None:
        tfidf_model = pickle.load(open('precompute/tfidf_model.pkl', 'rb'))
        logger.info("Loaded tfidf model")
    if corpus_tfidf is None:
        corpus_tfidf = pickle.load(open('precompute/corpus_tfidf.pkl', 'rb'))
        logger.info("Loaded tfidf corpus")
    if index is None:
        index = Similarity.load("precompute/similarities.pkl")
        logger.info("Loaded similarities")
    logger.info("Loaded all precomputed artifacts")
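A hedged sketch of using the loaded globals; it assumes the index was built over `corpus_tfidf` (so queries must pass through the same dictionary and TF-IDF model) and that `questions` is a position-indexable sequence.

# Illustrative lookup (function name and preprocessing are assumptions).
def most_similar_questions(text, topn=5):
    load_precomputed()
    query = tfidf_model[dct.doc2bow(text.lower().split())]
    index.num_best = topn
    return [(questions[pos], score) for pos, score in index[query]]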
def largevisformat(c_file, s_file, o_file):
    # Convert a gensim corpus (and optional similarity index) into LargeVis input
    import gensim
    from gensim import corpora

    corpus = gensim.corpora.MmCorpus(c_file)
    with open(o_file, 'w') as ofile:
        if s_file:
            from gensim.similarities import Similarity
            sim_index = Similarity.load(s_file)
            # Edge-list format: one "i j similarity" line per document pair
            for i, doc1 in enumerate(corpus):
                sims = sim_index[doc1]
                for j, dist in enumerate(sims):
                    ofile.write("%d %d %f\n" % (i, j, dist))
        else:
            # Dense format: "num_docs num_terms" header, then one weight row per document
            ofile.write("%d %d\n" % (corpus.num_docs, corpus.num_terms))
            # ofile.write("%d %d\n" % (10000, corpus.num_terms))
            for doc in corpus:
                doc.sort(key=lambda x: x[0])
                ps = 0
                for (s, w) in doc:
                    # Zero-fill the gap between the previous term id and this one
                    # (the original advanced ps only to s, which double-counted one slot)
                    for _ in range(s - ps):
                        ofile.write('0.0 ')
                    ofile.write('%f ' % w)
                    ps = s + 1
                ofile.write('\n')  # one document per row
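Example invocations under the assumed file layout; the paths are illustrative.

# With an index: pairwise similarity edges; without one: the dense corpus matrix.
largevisformat('corpus.mm', 'similarity.index', 'largevis_edges.txt')
largevisformat('corpus.mm', None, 'largevis_dense.txt')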
def __init__(self, model_filename, index_filename):
    # Lemmatizer and Doc2Vec model for keyword inference
    self.lemmatize = nltk.stem.WordNetLemmatizer().lemmatize
    self.model = Doc2Vec.load(model_filename)
    # Index for similarity queries
    self.index = Similarity.load(index_filename)
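A hedged sketch of a query method for the class above; the method name and preprocessing are assumptions, as is the premise that the index was built over inferred Doc2Vec vectors.

    # Hypothetical query method (not in the original class).
    def similar_documents(self, text, topn=10):
        tokens = [self.lemmatize(w) for w in text.lower().split()]
        vector = self.model.infer_vector(tokens)   # dense Doc2Vec representation
        self.index.num_best = topn
        return self.index[vector]                  # [(document position, similarity), ...]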
import logging

import pandas as pd
from gensim import corpora
from gensim.models import LdaModel, LdaMulticore
from gensim.similarities import Similarity

logger = logging.getLogger(__name__)

# Create references
# Load the LDA model
lda_model_tfidf = LdaModel.load('./lda_data/lda_model_tfidf.model')
# Load the bag-of-words corpus
bow_corpus = corpora.MmCorpus('./lda_data/bow_corpus.mm')
# Load the similarity index
index = Similarity.load('./lda_data/wine.index')

# Build indices: a Series mapping wine names to row positions in the data
wine_data = pd.read_csv('./lda_data/df_out_data.csv', index_col=0)
wine_best = wine_data.loc[wine_data['points'] > 90,
                          ['winery', 'variety', 'designation_replace']].sample(5)
indices = pd.Series(wine_data.index, index=wine_data.designation_replace).drop_duplicates()
indices.index.names = ['name']

# Function that takes a wine name as input and outputs the most similar wines
# (adapted from DataCamp); a hedged reconstruction follows below.
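The function the closing comment promises is not included in the snippet; this is a hedged reconstruction in the DataCamp recommender style. It assumes the index was built over LDA vectors of the BoW corpus and that corpus positions align with `wine_data` rows.

# Hypothetical reconstruction (logic assumed, names taken from the snippet above).
def similar_wines(name, topn=10):
    pos = indices[name]                              # wine name -> row position
    sims = index[lda_model_tfidf[bow_corpus[pos]]]   # similarity to every wine
    best = sims.argsort()[::-1][1:topn + 1]          # skip the wine itself at rank 0
    return wine_data['designation_replace'].iloc[best]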
model_prefix = sys.argv[1]

logger.info("running %s" % ' '.join(sys.argv))

logger.info("Loading word dictionary...")
dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
logger.debug(dictionary)

logger.info("Loading document name map...")
article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

logger.info("Loading tf-idf model...")
tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

logger.info("Loading similarity index...")
similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
similarity_index.use_reverse_index = True

logger.info("Finished loading model files.")

mismatches = 0
for doc_idx in range(len(similarity_index)):
    logger.info("Checking doc: %d %s" % (doc_idx, article_dict[doc_idx]))
    # Rebuild each document's vector from the reverse index and compare it
    # against the forward vector to detect mismatches
    rev_doc = scipy.sparse.dok_matrix((1, len(dictionary)), dtype=np.float64)
    fwd_doc = similarity_index.vector_by_id(doc_idx)
    for feature_id, val in enumerate(fwd_doc.toarray().flatten()):
        if val == 0:
            continue
        feat_rev_docs = similarity_index.docs_by_feature_id(feature_id).toarray().flatten()
        rev_doc[0, feature_id] = feat_rev_docs[doc_idx]
    rev_doc = rev_doc.tocsr()
# Load models
print("\nLoading models, etc.\n")
id2word_pgfin = gensim.corpora.Dictionary.load('./data/pgfin.dictionary')
tfidf_model = gensim.models.TfidfModel.load('./data/tfidf_pgfin.model')
lsi_model = gensim.models.LsiModel.load('./data/lsi_pgfin.model')
indexfile = './data/ta_index.txt'
queryfile = './queryfiles/queryfile.txt'        # text in corpus
# queryfile = './queryfiles/45vuotta.txt'       # film review
# queryfile = './queryfiles/tktjohdessee2.txt'  # old essay

# Check similarity
print("\nLoading similarity indices.\n")
index = Similarity.load('./data/pgfin_index.index')
index_dense = MatrixSimilarity.load('./data/pgfin_matrixindex.index')

with open(queryfile, 'r') as datafile:
    query = datafile.read()

# Vectorize the query text into bag-of-words, then tf-idf, then LSI space
query_bow = id2word_pgfin.doc2bow(tokenize(query))
query_tfidf = tfidf_model[query_bow]
query_lsi = lsi_model[query_tfidf]

index_dense.num_best = 5


class BookHitValue(object):
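The snippet breaks off at the class definition. For completeness, a minimal ranking sketch using only the variables already defined above: with `num_best` set, the dense index returns (document position, score) pairs. The print format is illustrative.

# Top-5 most similar books for the LSI query built above.
for doc_pos, score in index_dense[query_lsi]:
    print('%.3f  document #%d' % (score, doc_pos))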
input_file, output_prefix = sys.argv[1:3]

logger.info("running %s" % ' '.join(sys.argv))

logger.info("Loading word dictionary...")
dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
logger.debug(dictionary)

logger.info("Loading document name map...")
article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

logger.info("Loading tf-idf model...")
tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

logger.info("Loading similarity index...")
similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
similarity_index.use_reverse_index = True
similarity_index.preload_reverse_index()

logger.info("Finished loading model files.")
logger.info("Processing input documents...")

try:
    infile = open(input_file, 'r')
except IOError:
    print('cannot open %s' % (input_file,))
    sys.exit(1)

for docnum, line in enumerate(infile):
    line = line.rstrip()