def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    corpus = SublexicalizedCorpus(
        WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
        order=(3, 6), clean_func=normalize_whitespace,
        n_proc=n_proc_sublex, create_dictionary=False)

    if vocab_fn and os.path.exists(vocab_fn):
        logging.info("Loading vocabulary from %s" % vocab_fn)
        vocab = Dictionary.load(vocab_fn)
    else:
        logging.info("Creating vocabulary")

        start = time.time()
        vocab = Dictionary(corpus.get_texts())
        end = time.time()

        logging.info("Vocabulary created in %d seconds" % (end - start))

        if vocab_fn:
            logging.info("Saving dictionary to %s" % vocab_fn)
            vocab.save(vocab_fn)

    corpus.dictionary = vocab
    corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
    corpus.dictionary.compactify()

    if tfidf_fn and os.path.exists(tfidf_fn):
        logging.info("Reading TF-IDF model from %s" % tfidf_fn)
        tfidf = TfidfModel.load(tfidf_fn)
    else:
        logging.info("Creating TF-IDF model")
        tfidf = TfidfModel(corpus)

        if tfidf_fn:
            logging.info("Saving TF-IDF model to %s" % tfidf_fn)
            tfidf.save(tfidf_fn)

    bow_corpus = (tfidf[art] for art in corpus)

    model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)
    model.save(model_fn)
def train_lda():
    """
    Usage: python Wechat_LDA.py wechat.csv
    """
    with open(sys.argv[1], 'r') as wx:
        for f in wx:
            seg = jieba.cut(f)
            seg = [word for word in seg if word not in stopwords]
            with codecs.open('wechat_seg.txt', encoding='utf-8', mode='ab') as wx_seg:
                # write one segmented document per line so LineSentence sees separate documents
                wx_seg.write(' '.join(seg) + '\n')

    documents = 'wechat_seg.txt'  # pass the path so LineSentence can re-read the file for each pass
    dictionary = corpora.Dictionary(LineSentence(documents))
    corpus = [dictionary.doc2bow(text) for text in LineSentence(documents)]
    tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
    tfidf_model.save('wechat_seg.txt.tfidf_model')
    # corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])
    lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=cpu_count() - 1)
    lda_model.save('wechat_lda_model.pkl')

    topics = []
    for doc in corpus:
        topics.append(lda_model[doc])

    counts = np.zeros(100)
    for top_doc in topics:
        for ti, _ in top_doc:
            counts[ti] += 1

    words = lda_model.show_topic(counts.argmax(), 64)
    with open('top_words.txt', 'w') as tw:
        writer = UnicodeWriter(tw)
        for w in words:
            writer.writerow((w[0], int(float(w[1]) * 1000)))
def main(argv=None): if argv is None: argv = sys.argv print('Creating simple wiki serialized corpus') # Download the raw file if we do not have it already if not os.path.isfile(WIKIFILE): # Get the file wget.download(WIKIURL) wiki = WikiCorpus(WIKIFILE, lemmatize=False) i = 0 article_dict = {} for text in wiki.get_texts(meta=True): url_string = 'https://simple.wikipedia.org/wiki/?curid={}' article_dict[i] = (url_string.format(text[0]), text[1]) i += 1 with open(ARTICLEDICT, 'w') as f: json.dump(article_dict, f) wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000, ) wiki.dictionary.save_as_text(DICTFILE) print('Simple wiki serialized corpus created') # Now run LSI dictionary = Dictionary.load_from_text(DICTFILE) mm = MmCorpus(MMFILE) tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) tfidf.save(TDIFMODEL) MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000) mm_tdif = MmCorpus(TDIFFILE) lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300) index = similarities.MatrixSimilarity(lsi[mm_tdif]) index.save(SIMMATRIX) lsi.save(LSIMODEL) print("LSI model and index created")
def make_corpus():
    corpus = MyCorpus()
    tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    num_terms = 400
    lsi_model = LsiModel(corpus_idf, id2word=corpus.dictionary, num_topics=num_terms)
    # corpora.MmCorpus.serialize('wiki_en_corpus.mm', corpus)  # store to disk, for later use
    corpus.dictionary.save(os.path.join(HERE, "sogou.dict"))  # store the dictionary, for future reference
    tfidf_model.save(os.path.join(HERE, "sogou.model"))
    lsi_model.save(os.path.join(HERE, "sogou.lsi"))
    print("save dictionary and tfidf model")
def apply_tfidf(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize('/home/andre/Develop/corpora/lsamodel_tfidf.mm', tfidf[mm],
                       progress_cnt=10000)
def process_records(records, fields, target, textmodel=None): tokenize = CountVectorizer().build_analyzer() input = None X = None y_labels = [] for i, record in enumerate(records): nums = [] strs = [] y_labels.append(record.get(target)) for field in fields: if is_number(record.get(field)): nums.append(record[field]) else: strs.append(str(record.get(field) or "").lower()) if strs: if input is None: input = StringIO.StringIO() print >> input, " ".join(tokenize(" ".join(strs))) if nums: if X is None: X = sp.lil_matrix((len(records),len(nums))) X[i] = np.array(nums, dtype=np.float64) if input is not None: if X is not None: X_2 = X.tocsr() else: X_2 = None if isinstance(textmodel,basestring): if textmodel == 'lsi': corpus = TextCorpus(input) textmodel = LsiModel(corpus, chunksize=1000) elif textmodel == 'tfidf': corpus = TextCorpus(input) textmodel = TfidfModel(corpus) elif textmodel == 'hashing': textmodel = None hasher = FeatureHasher(n_features=2 ** 18, input_type="string") input.seek(0) X = hasher.transform(tokenize(line.strip()) for line in input) if textmodel: num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs',[])) X = corpus2csc(textmodel[corpus], num_terms).transpose() if X_2 is not None: # print >> sys.stderr, "X SHAPE:", X.shape # print >> sys.stderr, "X_2 SHAPE:", X_2.shape X = sp.hstack([X, X_2], format='csr') elif X is not None: textmodel = None X = X.tocsr() print >> sys.stderr, "X SHAPE:", X.shape return X, y_labels, textmodel
def create_movie_profile(movie_dataset):
    '''
    Use TF-IDF to extract the top-N keywords for each movie
    :param movie_dataset:
    :return:
    '''
    dataset = movie_dataset["tags"].values

    from gensim.corpora import Dictionary
    # Build the vocabulary from the dataset and count word frequencies; each word is looked up by its index
    dct = Dictionary(dataset)
    # Convert each document into (word index, word count) pairs
    corpus = [dct.doc2bow(line) for line in dataset]
    # Train the TF-IDF model, i.e. compute the TF-IDF weights
    model = TfidfModel(corpus)

    _movie_profile = []
    for i, data in enumerate(movie_dataset.itertuples()):
        mid = data[0]
        title = data[1]
        genres = data[2]
        vector = model[corpus[i]]
        movie_tags = sorted(vector, key=lambda x: x[1], reverse=True)[:30]
        topN_tags_weights = dict(map(lambda x: (dct[x[0]], x[1]), movie_tags))
        # Add the genre words as well, with a weight of 1.0
        for g in genres:
            topN_tags_weights[g] = 1.0
        topN_tags = [i[0] for i in topN_tags_weights.items()]
        _movie_profile.append((mid, title, topN_tags, topN_tags_weights))

    movie_profile = pd.DataFrame(
        _movie_profile,
        columns=["movieId", "title", "profile", "weights"])
    movie_profile.set_index("movieId", inplace=True)
    return movie_profile
class Cos():
    def __init__(self):
        self.tfidf = {}
        self.dict = Dictionary()

    def init(self, traindata, dict_path, tfidf_path):
        self.dict = Dictionary(traindata)  # fit dictionary
        corpus = [self.dict.doc2bow(line) for line in traindata]  # convert corpus to BoW format
        self.tfidf = TfidfModel(corpus)  # fit model
        self.dict.save(dict_path)
        self.tfidf.save(tfidf_path)

    def load(self, dict_path, tfidf_path):
        self.dict = Dictionary.load(dict_path)
        self.tfidf = TfidfModel.load(tfidf_path)
def getSparseMatrixSimilarity(keyword, texts):
    # 1. Tokenize each text in the collection
    texts = [jieba.lcut(text) for text in texts]

    # 2. Build a dictionary from the text collection and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)

    # 3.1 Using the dictionary, convert the tokenized texts into sparse vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # 3.2 Likewise, convert the search keyword into a sparse vector
    kw_vector = dictionary.doc2bow(jieba.lcut(keyword))

    # 4. Create the TF-IDF model and train it on the corpus
    tfidf = TfidfModel(corpus)

    # 5. Apply the trained TF-IDF model to the searched texts and the keyword
    tf_texts = tfidf[corpus]  # the corpus serves as the set of texts being searched
    tf_kw = tfidf[kw_vector]

    # 6. Compute similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    for e, s in enumerate(similarities, 1):
        print('Similarity between kw and text%d: %.2f' % (e, s))
    print(sparse_matrix)
    print(similarities)
def _build_vocab(self, max_vocab_cnt): all_words = [] for data in self.valid + self.non_valid: all_words.append(data["title"] + data["content"]) vocab = Dictionary(all_words) raw_vocab_size = len(vocab) vocab.filter_extremes(no_below=5) vocab.filter_extremes(keep_n=max_vocab_cnt) len_1_words = list( filter( lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w not in ["a", "i"] and True or False, vocab.values())) vocab.filter_tokens(list(map(vocab.token2id.get, len_1_words))) if self.config.use_dict == "seq" and self.config.enable_pad: vocab.token2id[PAD] = len(vocab) vocab.compactify() self.pad_wid = vocab.token2id.get(PAD) self.vocab_seq = vocab # seq dictionary # build bow dictionary self.vocab_bow = copy.deepcopy(vocab) self.vocab_bow.filter_tokens( map(self.vocab_bow.token2id.get, STOPWORDS)) # filter stop words self.vocab_bow.compactify() if self.config.tfidf: tfidf_corpus = [self.vocab_bow.doc2bow(line) for line in all_words] self.tfidf_model = TfidfModel(tfidf_corpus) print("Load corpus with non_valid size %d, valid size %d, " "raw vocab size %d seq vocab size %d, bow vocab size %d" % (len(self.non_valid), len(self.valid), raw_vocab_size, len(self.vocab_seq), len(self.vocab_bow)))
def cluster_data(state):
    # Bigram model
    data_words_bigrams = make_bigrams(state)
    INPUT = data_words_bigrams

    # Create Dictionary
    id2word = corpora.Dictionary(INPUT)
    # Create Corpus
    texts = INPUT
    # Filter out words that occur in fewer than no_below documents or in more than no_above of them
    id2word.filter_extremes(no_below=state.no_below, no_above=state.no_above)
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    TOPICS_LIST = range(1, state.lda_topics + 1)
    lda_models = []
    coherence_scores = []
    for TOPICS in TOPICS_LIST:
        lda_model = run_LDA_model(corpus, id2word, TOPICS)
        lda_models.append(lda_model)
        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=data_words_bigrams,
                                             dictionary=id2word,
                                             coherence='c_v')
        score = coherence_model_lda.get_coherence()
        coherence_scores.append(score)

    return coherence_scores, lda_models, corpus
def get_matrix_pinyin(pos_path="data/samples/positive.txt",
                      neg_path="data/samples/negative.txt"):
    from xpinyin import Pinyin

    dataset = []
    pin = Pinyin()
    with open(pos_path, encoding='utf8') as f:
        dataset += [
            pin.get_pinyin(line, '').split() for line in f if line != '\n'
        ]
    pos_len = len(dataset)
    print("positive matrix length", pos_len)

    with open(neg_path, encoding='utf8') as f:
        dataset += [
            pin.get_pinyin(line, '').split() for line in f if line != '\n'
        ]
    neg_len = len(dataset) - pos_len
    print("negative matrix length", neg_len)

    dct = Dictionary(dataset)
    print("dictionary length", len(dct))
    corpus = [dct.doc2bow(line) for line in dataset]
    model = TfidfModel(corpus)

    pos_matrix = np.zeros((pos_len, len(dct)))
    neg_matrix = np.zeros((neg_len, len(dct)))
    for i, line in enumerate(model[corpus][:pos_len]):
        for j, n in line:
            pos_matrix[i, j] = n
    for i, line in enumerate(model[corpus][pos_len:]):
        for j, n in line:
            neg_matrix[i, j] = n
    print("get matrix completed")
    return pos_matrix, neg_matrix
def compute_sim_matrix(self):
    '''
    if(self.model_type.lower() == "fasttext"):
        model = FastText(self.questions)
    else:
        model = Word2Vec(self.questions)
    '''
    self.dictionary = Dictionary(self.questions)
    self.tfidf = TfidfModel(dictionary=self.dictionary)
    word2vec_model = Word2Vec(self.questions,
                              workers=cpu_count(),
                              min_count=5,
                              size=300,
                              seed=12345)
    sim_index = WordEmbeddingSimilarityIndex(word2vec_model.wv)
    sim_matrix = SparseTermSimilarityMatrix(sim_index,
                                            self.dictionary,
                                            self.tfidf,
                                            nonzero_limit=100)
    bow_corpus = [
        self.dictionary.doc2bow(document) for document in self.questions
    ]
    tfidf_corpus = [self.tfidf[bow] for bow in bow_corpus]
    self.docsim_index = SoftCosineSimilarity(tfidf_corpus,
                                             sim_matrix,
                                             num_best=10)
def sim_calculator(DF, column_name):
    print("Number of {}: {}".format(column_name, len(DF[column_name])))

    # Preprocessing
    print('\nCreating Dictionary...')
    processed_docs = DF[column_name].map(preprocess)

    # Generating dictionary
    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=100, no_above=0.9, keep_n=100000)
    print('Dictionary created')
    print("Size of vocabulary: ", len(dictionary))

    # Bag of words
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # TF-IDF vectorization
    print('\nRunning TFIDF vectorization...')
    model2 = TfidfModel(bow_corpus)
    abs_tfidf = model2[bow_corpus]
    print('TFIDF complete')

    # Calculating similarities
    print('\nCalculating Similarity Matrix...')
    index = similarities.MatrixSimilarity(abs_tfidf)
    sims = index[abs_tfidf]
    print("size of similarity matrix: ", sims.shape)

    return sims
def text_matching_tfidf(text, candidate_texts, top_n=1):
    """
    Text matching based on TF-IDF
    :param text:
    :param candidate_texts:
    :param top_n:
    :return:
    """
    text_cut = jieba.lcut(text)
    candidate_texts_cut = [jieba.lcut(item) for item in candidate_texts]  # todo: optionally remove stop words here
    dct = Dictionary(candidate_texts_cut)
    dct_size = len(dct.token2id.keys())
    corpus_bow = [dct.doc2bow(item) for item in candidate_texts_cut]
    tfidf_model = TfidfModel(corpus_bow, dictionary=dct)
    corpus_tfidf = tfidf_model[corpus_bow]
    similarity = SparseMatrixSimilarity(corpus_tfidf, num_features=dct_size)

    text_bow = dct.doc2bow(text_cut)
    text_tfidf = tfidf_model[text_bow]
    cosine_similarities = similarity[text_tfidf]
    sims_argsort = (-cosine_similarities).argsort()[:top_n]
    return [(candidate_texts[idx], cosine_similarities[idx]) for idx in sims_argsort]
def main(): """ Executes all the scripts defined above """ nlp = spacy.load("en_core_web_sm") sop_df = pd.read_csv('data/interim/sop_types_valid.csv', converters={ 'juri': eval, 'filename': eval }) type_list = sop_df['type'] try: calltaker_all = pd.read_csv('data/interim/calltaker_all.csv', converters={'sop': eval}) except: calltaker_all = load_event_types_for_role(sop_df, type_list, 'call taker') save_df(calltaker_all, 'calltaker_all.csv') doc_term_bow, corpus, dictionary = get_dct_dtmatrix( nlp, calltaker_all['sop']) tfidf_type = TfidfModel(doc_term_bow) tfidf_mtx = bow2tfidf(doc_term_bow, tfidf_type) km_alltype = KMeans(n_clusters=87, random_state=911).fit(tfidf_mtx) type_topics_kmeans_tfidf = calltaker_all.copy() type_topics_kmeans_tfidf['cluster'] = km_alltype.labels_ type_topics_kmeans_tfidf = type_topics_kmeans_tfidf.sort_values( by=['cluster', 'type', 'juri'], ignore_index=True) type_topics_kmeans_tfidf.to_csv( 'data/interim/type_topics_kmeans_tfidf.csv', index=False)
def build_tfidf_or_lsi(corpus, method='tfidf'):
    '''
    Build a model for ranking documents.
    Input: a corpus of texts and a method ("tfidf" or "lsi").
    Output: a tuple (dictionary of terms in the corpus, fitted model, word similarity matrix)
    '''
    dictionary = Dictionary(corpus)
    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]

    model_tfidf = TfidfModel(corpus_bow)
    corpus_tfidf = [model_tfidf[doc] for doc in corpus_bow]
    simil_tfidf = MatrixSimilarity(corpus_tfidf)

    if method == 'tfidf':
        return dictionary, model_tfidf, simil_tfidf
    elif method == 'lsi':
        model_lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
        corpus_lsi = [model_lsi[doc] for doc in corpus_bow]
        simil_lsi = MatrixSimilarity(corpus_lsi)
        return dictionary, model_lsi, simil_lsi
def train(self): self.process_dataset(self.training_path, True) self.process_dataset(self.training_path, False) self.training_sources_length = len(self.sources) self.logger.debug( f'After train set processing: sources len {len(self.sources)}, labels len {len(self.labels)}' ) self.process_dataset(self.test_path, True) self.process_dataset(self.test_path, False) self.logger.debug( f'After full processing: sources len {len(self.sources)}, labels len {len(self.labels)}' ) corpus = Texts(self.sources).to_vector() dictionary = corpora.Dictionary(corpus) corpus = [dictionary.doc2bow(text) for text in corpus] model = TfidfModel(corpus) corpus = [text for text in model[corpus]] self.training_text_matrix = corpus2dense(corpus, num_terms=len( dictionary.token2id)).T if self.pca: self.training_text_matrix = self.pca.fit_transform( self.training_text_matrix) self.classifier.fit( self.training_text_matrix[:self.training_sources_length], self.labels[:self.training_sources_length]) self.is_trained = True
def main(JDK, url, title, query):
    dictionary = corpora.Dictionary.load(
        './TFIDF_Word2Vec/data/tfidf-w2v_dictionary.dict')
    tfidf = TfidfModel.load('./TFIDF_Word2Vec/data/tfidf.model')
    word2vec = gensim.models.keyedvectors.Word2VecKeyedVectors.load(
        './TFIDF_Word2Vec/data/word2vec.model')
    tfidf_w2v_model = models.keyedvectors.Word2VecKeyedVectors.load(
        './TFIDF_Word2Vec/data/tfidf-w2v.model')
    query_vec = get_tfidf_w2v_vec(query, dictionary, tfidf, word2vec)
    full_entity_score_vec = tfidf_w2v_model.similar_by_vector(query_vec, topn=False)
    sort_sims = sorted(enumerate(full_entity_score_vec), key=lambda item: -item[1])
    result = []
    for i in range(10):
        dic = {
            'url': url[sort_sims[i][0]].strip('\n'),
            'JDK': JDK[sort_sims[i][0]].strip('\n'),
            'title': title[sort_sims[i][0]].strip('\n'),
            'score': sort_sims[i][1]
        }
        result.append(dic)
    return result
def create_data(corpus_path):
    # Build the data: represent each text first with doc2bow, then with the TF-IDF model
    sentences = []
    sentence_dict = {}
    count = 0
    for line in open(corpus_path):
        # print(line)
        line = line.strip().split('\t')
        if len(line) == 2:
            sentence_dict[count] = line[1]
            count += 1
            sentences.append(line[1].split(' '))
        else:
            break
    # print(sentence_dict)
    print(sentences)
    # Process the texts to obtain the vocabulary of the collection
    dictionary = corpora.Dictionary(sentences)
    # print(dictionary)
    # Use the vocabulary to build a bag-of-words representation of each text
    corpus = [dictionary.doc2bow(text) for text in sentences]
    print(corpus)
    # Use the bag-of-words vectors to build the TF-IDF representation
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # print(corpus_tfidf)
    return sentence_dict, dictionary, corpus, corpus_tfidf
def fit(self, X, y=None):
    """
    Fit the model according to the given training data.
    """
    self.gensim_model = TfidfModel(corpus=X,
                                   id2word=self.id2word,
                                   dictionary=self.dictionary,
                                   wlocal=self.wlocal,
                                   wglobal=self.wglobal,
                                   normalize=self.normalize)
    return self
def setUp(self):
    self.cls = similarities.SoftCosineSimilarity
    self.tfidf = TfidfModel(dictionary=dictionary)
    similarity_matrix = scipy.sparse.identity(12, format="lil")
    similarity_matrix[dictionary.token2id["user"], dictionary.token2id["human"]] = 0.5
    similarity_matrix[dictionary.token2id["human"], dictionary.token2id["user"]] = 0.5
    self.similarity_matrix = similarity_matrix.tocsc()
def __init__(self, kp_archives_by_paperid, kp_archives_by_userid):
    self.dictionary = corpora.Dictionary()
    # self.bow_by_userid = defaultdict(Counter)
    # self.bow_by_paperid = defaultdict(Counter)
    self.all_documents = []
    self.kp_archives_by_paperid = kp_archives_by_paperid
    self.kp_archives_by_userid = kp_archives_by_userid

    for archive in self.kp_archives_by_paperid.values():
        for token_list in archive:
            self.dictionary.add_documents([token_list])
            self.all_documents += [token_list]

    for archive in self.kp_archives_by_userid.values():
        for token_list in archive:
            self.dictionary.add_documents([token_list])
            self.all_documents += [token_list]

    self.corpus_bows = [
        self.dictionary.doc2bow(doc) for doc in self.all_documents
    ]
    self.tfidf = TfidfModel(self.corpus_bows)
def create_doc_term_matrix(docs, id2word, tfidf=False, logentropy=False, random_projections=False):
    doc_term_matrix = [id2word.doc2bow(doc) for doc in docs]
    _save_model2(doc_term_matrix, 'doc_term_matrix')

    if random_projections:
        rp_model = RpModel(corpus=doc_term_matrix, id2word=id2word, num_topics=params['num_topics'])
        doc_term_matrix = rp_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_random_projections')

    if tfidf:
        tfidf_model = TfidfModel(id2word=id2word, corpus=doc_term_matrix, normalize=True)
        doc_term_matrix = tfidf_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_tfidf')

    if logentropy:
        log_model = LogEntropyModel(corpus=doc_term_matrix, normalize=True)
        doc_term_matrix = log_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_logentropy')

    return doc_term_matrix
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS): """\ """ wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle') bow_filename = os.path.join(out_dir, 'cables_bow.mm') tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm') predicate = None # Could be set to something like pred.origin_filter(pred.origin_germany) # 1. Create word dict dct = Dictionary() dct_handler = DictionaryHandler(dct) handler = create_filter(dct_handler) handle_source(src, handler, predicate) dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words) dct.save(wordid_filename) # 2. Reiterate through the cables and create the vector space corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False) handler = create_filter(corpus_handler) handle_source(src, handler, predicate) # 3. Load corpus mm = MmCorpus(bow_filename) # 4. Create TF-IDF model tfidf = TfidfModel(mm, id2word=dct, normalize=True) # 5. Save the TF-IDF model MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
def tfidf_w2v_top5w(all_docs_prepro, id_dict):
    with open('../code/similarity/mappings/map_w2v_tfidf_5w.pkl', 'rb') as fp:
        Classes = pickle.load(fp)
    mapping = Classes['mapping']

    print('Loading Word2vec model')
    model_path = 'embedding/models/word2vec_all.model'
    model_w2v = Word2Vec.load(model_path)

    print('Loading Tfidf model')
    model_path = 'embedding/models/tfidf_all.model'
    model_tfidf = TfidfModel.load(model_path)

    dct = Dictionary(all_docs_prepro)
    corpus = [dct.doc2bow(line) for line in all_docs_prepro]

    mean_ticket_ques = top5_average('ticket_ques',
                                    corpus=corpus,
                                    dct=dct,
                                    model_w2v=model_w2v,
                                    model_tfidf=model_tfidf,
                                    id_dict=id_dict,
                                    all_docs_prepro=all_docs_prepro)

    return (mean_ticket_ques, mapping)
def __init__(self): self.host = 'localhost' self.port = 3306 self.user = '******' self.password = '******' self.db = 'gaojiruangong' self.charset = 'utf8' db = pymysql.Connect(host=self.host, port=self.port, user=self.user, passwd=self.password, db=self.db, charset=self.charset) cursor = db.cursor() query_sql = "SELECT id, api FROM apisamplecode" cursor.execute(query_sql) results = cursor.fetchall() all_api_name_set = set() for item in results: delete_left_brackets_api_name = item[1].split('(')[0] all_api_name_set.add(delete_left_brackets_api_name) api_name = delete_left_brackets_api_name.split('.')[-1].lower() api_id = item[0] if api_name in self.api_name_2_id.keys(): self.api_name_2_id[api_name].append(api_id) else: self.api_name_2_id[api_name] = [] self.api_name_2_id[api_name].append(api_id) self.all_qualified_api_name = list(all_api_name_set) self.dictionary = corpora.Dictionary.load( ROOT_DIR + '/output/model/tfidf/tfidf_dictionary.dict') self.index = similarities.Similarity.load( ROOT_DIR + '/output/model/tfidf/tfidf_index.index') self.tfidf = TfidfModel.load(ROOT_DIR + '/output/model/tfidf/tfidf.model')
def recomended_projects(self, request): projects = ProjectRequest.objects.all() project_keywords_dict = {} projects_dict = {} tags_list = [] for project in projects: description = project.description description_keywords = get_keywords(description.replace('"', '')) tags = project.tags.replace(' ', ',').lower() for keyword in description_keywords: tags += ',' + keyword[0].lower() tags_list.append(tags) df = read_frame(projects, fieldnames=['id', 'tags'], index_col=['id']) df['tags'] = tags_list keywords = df['tags'].tolist() keywords = [word_tokenize(keyword.lower()) for keyword in keywords] keywords = [no_commas(kw) for kw in keywords] processed_keywords = keywords dictionary = Dictionary(processed_keywords) corpus = [dictionary.doc2bow(doc) for doc in processed_keywords] tfidf = TfidfModel(corpus) sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary)) top_3 = keywords_recommendation(all_projects=df, keywords=['uvg', 'gasolina', 'potente', 'mcdonald', 'mecanico', 'gg', 'carros'], number_of_hits=3, data=[dictionary, tfidf, sims]) projects = [] for id in top_3: projects.append(ProjectRequestSerializer(ProjectRequest.objects.get(pk=id)).data) return Response(projects)
def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

        # remove stop words and tokenize
        document = [
            word for word in TreebankWordTokenizer().tokenize(line.lower())
            if word not in stopwords
        ]
        documents.append(document)
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")

    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    # print(documents)
    print(len(documents), "documents read")
    print(len(self.dictionary), "unique tokens", self.dictionary)
def mergeTags():
    res = {}  # create an empty dict
    for i in range(len(displayArr)):
        texts = default_tags
        keyword = displayArr[i]

        # 1. Tokenize each text in the collection
        texts = [lcut(text) for text in texts]
        # 2. Build a dictionary from the text collection and get the number of features
        dictionary = Dictionary(texts)
        num_features = len(dictionary.token2id)
        # 3.1 Using the dictionary, convert the tokenized texts into sparse vectors (the corpus)
        corpus = [dictionary.doc2bow(text) for text in texts]
        # 3.2 Likewise, convert the search keyword into a sparse vector
        kw_vector = dictionary.doc2bow(lcut(keyword))
        # 4. Create the TF-IDF model and train it on the corpus
        tfidf = TfidfModel(corpus)
        # 5. Apply the trained TF-IDF model to the searched texts and the keyword
        tf_texts = tfidf[corpus]  # the corpus serves as the set of texts being searched
        tf_kw = tfidf[kw_vector]
        # 6. Compute similarities
        sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
        similarities = sparse_matrix.get_similarities(tf_kw)
        for e, s in enumerate(similarities, 1):
            if s > 0.5:
                # print(keyword, 'and', ''.join(texts[e - 1]), 'similarity:', s)
                key = ''.join(texts[e - 1]).strip()
                res[key] = s

        arrSorted = sorted(res.items(), key=lambda item: item[1], reverse=True)
        for ind, (k, v) in enumerate(arrSorted):
            if ind == 0:
                ids = textsOld[i].strip().split('.')[0]
                textsOld[i] = textsOld[i] + '----------' + k
                # textsOld[i] = ids + '.' + k

        res = {}  # reset the dict
    return textsOld
def loadmodel(self, nameprefix):
    """ Load the topic model with the given prefix of the file paths.

    Given the prefix of the file paths, load the corresponding topic model. The files
    include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict),
    and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf).

    :param nameprefix: prefix of the file paths
    :return: None
    :type nameprefix: str
    """
    # load the JSON file (parameters)
    parameters = json.load(open(nameprefix + '.json', 'r'))
    self.nb_topics = parameters['nb_topics']
    self.toweigh = parameters['toweigh']
    self.algorithm = parameters['algorithm']
    self.classlabels = parameters['classlabels']

    # load the dictionary
    self.dictionary = Dictionary.load(nameprefix + '.gensimdict')

    # load the topic model
    self.topicmodel = gensim_topic_model_dict[self.algorithm].load(nameprefix + '.gensimmodel')

    # load the similarity matrix
    self.matsim = MatrixSimilarity.load(nameprefix + '.gensimmat')

    # load the tf-idf model
    if self.toweigh:
        self.tfidf = TfidfModel.load(nameprefix + '.gensimtfidf')

    # flag
    self.trained = True
def processing_data():
    corpus = list(
        chain(*[
            chain([
                preprocess(thread["RelQuestion"]["RelQSubject"]),
                preprocess(thread["RelQuestion"]["RelQBody"])
            ], [
                preprocess(relcomment["RelCText"])
                for relcomment in thread["RelComments"]
            ]) for thread in api.load(
                "semeval-2016-2017-task3-subtaskA-unannotated")
        ]))

    dictionary = Dictionary(corpus)
    tfidf = TfidfModel(dictionary=dictionary)
    w2v_model = Word2Vec(corpus, workers=cpu_count(), min_count=5, size=300, seed=12345)
    similarity_matrix = w2v_model.wv.similarity_matrix(dictionary, tfidf, nonzero_limit=100)

    pickle.dump(dictionary,
                open(r'C:\Code\201810\Similarity\data\dic_path', 'wb+'))  # dictionary
    pickle.dump(similarity_matrix,
                open(r'C:\Code\201810\Similarity\data\similarity_matrix_path', 'wb+'))  # similarity matrix
    pickle.dump(tfidf,
                open(r'C:\Code\201810\Similarity\data\tfidf_path', 'wb+'))  # tfidf
def remove_rare_often_word(texts, low_value, high_value):
    # removing frequent and rare words
    texts_tokenized = [simple_preprocess(doc) for doc in texts]
    dictionary = Dictionary(texts_tokenized)
    corpus = [dictionary.doc2bow(doc) for doc in texts_tokenized]
    tfidf = TfidfModel(corpus, id2word=dictionary)
    corpus_tfidf = tfidf[corpus]

    bad_words = []
    for sent_tfidf in tqdm(corpus_tfidf, desc="selecting bad words"):
        bad_words += [
            id for id, value in sent_tfidf
            if (value < low_value) or (value > high_value)
        ]

    dictionary.filter_tokens(bad_ids=bad_words)

    out_bow = [dictionary.doc2bow(doc) for doc in texts_tokenized]
    out_corpus = []
    for doc in tqdm(out_bow, desc='Creating out corpus'):
        out_corpus.append([dictionary.get(id) for id, value in doc])

    dict_tfidf = {
        dictionary.get(id): value
        for doc in corpus_tfidf for id, value in doc
        if (value >= low_value) and (value <= high_value)
    }

    return {
        'texts': out_corpus,
        'dict_tfidf': dict_tfidf,
        'dictionary': dictionary
    }
def setUp(self):
    self.documents = [
        [u"government", u"denied", u"holiday"],
        [u"government", u"denied", u"holiday", u"slowing", u"hollingworth"]]
    self.dictionary = Dictionary(self.documents)
    self.tfidf = TfidfModel(dictionary=self.dictionary)
    self.index = UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5)
def load_data(self):
    if not self.tf_idf_model:
        if not os.path.exists(self.tf_idf_model_path):
            raise Exception('TF-IDF model file not found')

        self.dictionary = Dictionary.load(self.dictionary_path)
        self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)
def build_tfidf(corpus_dir, model_filename):
    stemmer = nltk.stem.PorterStemmer()
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt$')  # a memory-friendly iterator
    dictionary = corpora.Dictionary()

    bigram_transformer = Phrases(TextCorpus(corpus))
    for myfile in corpus.fileids():
        try:
            chunks = bigram_transformer[[word.lower() for word in corpus.words(myfile)]]
            dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])
        except Exception as e:
            print('Warning error in file:', myfile)

    model = TfidfModel(BowCorpus(corpus, dictionary, bigram_transformer), id2word=dictionary)
    model.save(model_filename)
def fit(self, raw_documents, y=None):
    self.analyzer_func = self.build_analyzer()

    self.model = LsiModel.load(self.model_fn)

    if os.path.exists(self.model_fn + '.tfidf'):
        self.tfidf = TfidfModel.load(self.model_fn + '.tfidf')

    return self
def __init__(self):
    self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
    self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
    self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
    self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
    self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
    self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
    self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
    self.job_labels = {
        int(k): v
        for k, v in (line.split("=")
                     for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
    }
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load back the id->word mapping directly from file
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(fin)

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    fout = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(fout)

    # save TF-IDF vectors in matrix market format
    fout = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(fout, tfidf[mm], progress_cnt=10000)
def main(): program = os.path.basename(sys.argv[0]) logger = logging.getLogger(program) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) logger.info("running %s" % ' '.join(sys.argv)) outp = OUT_PREFIX keep_words = DEFAULT_DICT_SIZE # the doc index dbc = get_cursor() dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id') docindex = [(pageid, title) for pageid, title in dbc] pickle.dump(docindex, open(outp + '_docindex.p', 'wb')) lemmatize = True # 'lemma' in program wiki = WikiCorpus(pages_gen, lemmatize=lemmatize) # only keep the most frequent words wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') # initialize corpus reader and word->id mapping mm = MmCorpus(outp + '_bow.mm') # build tfidf, ~50min tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) tfidf.save(outp + '.tfidf_model') # save tfidf vectors in matrix market format # another long task MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) logger.info("finished running %s" % program)
def build_tfidf_model(data_directory, corpus_path, wiki_text_output_path, model_output_path,
                      multiwords=True, druid_cutoff_score=0.3):
    stemmer = nltk.stem.PorterStemmer()
    tokenid_dictionary = corpora.Dictionary()

    if not exists(wiki_text_output_path):
        logger.info('Converting ' + str(corpus_path) + ' into plain text file: ' + wiki_text_output_path)
        # Convert Wikipedia XML dump into .txt format
        wikidump2text.convert(corpus_path, wiki_text_output_path)
    else:
        logger.info('Found ' + wiki_text_output_path + ', not converting from the raw bz2 file.')

    # Load Multiword Expressions as Dictionary
    stopwords_path = join(data_directory, 'stopwords_en.txt')

    if multiwords:
        druid_path = join(data_directory, 'druid_en.bz2')
        druid_dict = druid.DruidDictionary(druid_path, stopwords_path, cutoff_score=druid_cutoff_score)
        logger.info('Loaded Druid with cutoff ' + str(druid_cutoff_score))
    else:
        druid_dict = None

    logger.info("Building tfidf model...")
    start_time = time.time()

    if multiwords:
        logger.info('Using druid_en.bz2 in ' + data_directory + ' as multiword dictionary.')
        articles = TextCorpus(wiki_text_output_path, druid_dict, multiwords=True)  # a memory-friendly iterator
    else:
        logger.info('Using no multiword dictionary, just single words')
        articles = TextCorpus(wiki_text_output_path, None, multiwords=False)

    tokenid_dictionary.add_documents(articles)

    model = TfidfModel(BowCorpus(wiki_text_output_path, druid_dict, tokenid_dictionary, multiwords=multiwords),
                       id2word=tokenid_dictionary)
    model.save(model_output_path)

    logger.info("Finished building tfidf model. Time needed: " + str(time.time() - start_time))
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')

    # Now run LSI on TF-IDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def construct_model(self, documents):
    logging.basicConfig(
        format='%(asctime)s:%(levelname)s:%(message)s',
        level=logging.INFO
    )

    logging.info("Obtaining word tokens")
    tokens = [self.get_tokens(document) for document in documents]
    # self.tf_idf_model = TfidfModel(tokens)

    logging.info("Constructing dictionary")
    self.dictionary = Dictionary(tokens)
    self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
    self.dictionary.compactify()
    self.dictionary.save(self.dictionary_path)

    logging.info("Constructing TF-IDF model")
    self.tf_idf_model = TfidfModel(dictionary=self.dictionary)
    self.tf_idf_model.save(self.tf_idf_model_path)
def __init__(self, model_prefix=None, num_best=None):
    self.model_prefix = model_prefix
    self.num_best = num_best
    if self.model_prefix is None:
        raise ValueError("model_prefix must be specified")

    logger.info("ESA: Loading word dictionary...")
    self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

    logger.info("ESA: Loading document name map...")
    self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("ESA: Loading TF-IDF model...")
    self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("ESA: Loading similarity index...")
    self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')

    # logger.info("ESA: Preloading reverse indexes...")
    # self.similarity_index.preload_reverse_index()

    logger.info("ESA: Finished loading model files.")
def main(train, model, dic):
    logging.basicConfig(level=logging.INFO)

    corpus = SentenceDocCorpus(train)
    tfidf = TfidfModel(corpus)

    tfidf.save(model)
    corpus.dictionary.save(dic)
# print('Saved dictionary')

print('Creating LogEntropy TF-IDF and regular TF-IDF matrices and models')
BOW_corpus = MmCorpus('../data/wiki_en_vocab200k')  # Resurrect BOW corpus

# log_entropy = LogEntropyModel(BOW_corpus)
# log_entropy.save('../models/logEntropy.model')  # already provided
log_entropy = LogEntropyModel.load('../models/logEntropy.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    log_entropy[BOW_corpus])
print('Saved LogEntropy TF-IDF matrix')

# tfidf = TfidfModel(BOW_corpus)
# tfidf.save('../models/tfidf.model')  # already provided
tfidf = TfidfModel.load('../models/tfidf.model')
# serialize the TF-IDF matrix to its own file so the log-entropy matrix above is not overwritten
corpora.mmcorpus.MmCorpus.serialize('../data/tfidf_matrix',
                                    tfidf[BOW_corpus])
print('Saved TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')
num_feat = len(wiki.dictionary.keys())
index = Similarity('../data/logEntropyShards/logEntropySimilarity',
                   logent_corpus, num_features=num_feat)
index.save('../data/logEntropyShards/logEntropySimilarityIndex')
print('Saved Shards and similarity index')

print('Getting list of titles...')
def data_directory():
    return os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')


corpus_dir = os.path.join(data_directory(), 'audio_transcripts')
model_filename = os.path.join(data_directory(), 'conversation.tfidf')

stemmer = nltk.stem.PorterStemmer()
corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt$')  # a memory-friendly iterator
dictionary = corpora.Dictionary()


# Train bigram transformer
class TextCorpus(object):
    def __iter__(self):
        for file in corpus.fileids():
            yield [word.lower() for word in corpus.words(file)]


bigram_transformer = Phrases(TextCorpus())

for file in corpus.fileids():
    chunks = bigram_transformer[[word.lower() for word in corpus.words(file)]]
    dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])


class BowCorpus(object):
    def __iter__(self):
        for file in corpus.fileids():
            chunks = bigram_transformer[[word.lower() for word in corpus.words(file)]]
            yield dictionary.doc2bow([stemmer.stem(chunk) for chunk in chunks])


model = TfidfModel(BowCorpus(), id2word=dictionary)
model.save(model_filename)
# corpus.save(f_bow) else: # models will be trained on your own corpus if os.path.exists(f_bow): corpus = TextCorpus.load(f_bow) else: corpus = TextCorpus(f_corpus) # corpus.save(f_bow) # filter dictionary corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size) corpus.dictionary.save(f_dict) corpus.save(f_bow) # tf-idf model if os.path.exists(f_tfidf): tfidf = TfidfModel.load(f_tfidf) else: tfidf = TfidfModel(corpus, id2word=corpus.dictionary) tfidf.save(f_tfidf) # TRAINING # lsa model if not os.path.exists(f_lsa): lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim) lsa.save(f_lsa) # word2vec model class MyCorpus(): def __iter__(self): for d in corpus.get_texts():
elif not opts.scaling:
    scaling = None
else:
    raise ValueError("Only tfidf scaling is supported")

word_model = opts.word_model

if word_model:
    logging.info("Building word model")
    corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
else:
    corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()),
                                  order=order, word_limit=word_limit)

voc = Dictionary(corpus)
voc.filter_extremes(no_below=cutoff)
voc.compactify()

bow_corpus = (voc.doc2bow(art) for art in corpus)

tfidf = None

if scaling == 'tfidf':
    tfidf = TfidfModel(bow_corpus)
    bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
model.save(model_fn)

if tfidf:
    tfidf.save(model_fn + '.tfidf')
class TfidfVectorizer(): """ Transform text to tf-idf representation """ def __init__(self): self.base_path = os.path.dirname(__file__) self.dictionary_path = os.path.join(self.base_path, "dictionary") self.tf_idf_model_path = os.path.join(self.base_path, "tfidf") self.stemmer = NepStemmer() self.tf_idf_model = None def get_tokens(self, document): if not self.stemmer: raise Exception("Stemmer not available") return self.stemmer.get_stems(document) def construct_model(self, documents): logging.basicConfig( format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO ) logging.info("Obtaining word tokens") tokens = [self.get_tokens(document) for document in documents] # self.tf_idf_model = TfidfModel(tokens) logging.info("Constructing dictionary") self.dictionary = Dictionary(tokens) self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000) self.dictionary.compactify() self.dictionary.save(self.dictionary_path) logging.info("Constructing TF-IDF model") self.tf_idf_model = TfidfModel(dictionary=self.dictionary) self.tf_idf_model.save(self.tf_idf_model_path) def load_data(self): if not self.tf_idf_model: if not os.path.exists(self.tf_idf_model_path): raise Exception('TF-IDF model file not found') self.dictionary = Dictionary.load(self.dictionary_path) self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path) def doc2vector(self, document): """ Returns the sparse tf-idf vector for given document """ tokens = self.get_tokens(document) bag_of_words = self.dictionary.doc2bow(tokens) return (self.tf_idf_model[bag_of_words]) def obtain_feature_vector(self, document): """ Returns a single dense tf-idf vector for a given document """ self.load_data() tf_idf_vector = matutils.sparse2full( self.doc2vector(document), self.no_of_features ).reshape(1, -1) return tf_idf_vector def obtain_feature_matrix(self, documents): """ Returns the tf-idf dense matrix for the given documents """ self.load_data() input_matrix_sparse = [ self.doc2vector(x) for x in documents ] no_of_features = len(self.tf_idf_model.idfs) input_matrix = matutils.corpus2dense( input_matrix_sparse, no_of_features ).transpose() return input_matrix
# Remove stop words (additional removal of common words used in spoken language) stop_ids = [] with open(stop_words_file, 'r') as infile: for line in infile: try: stop_ids.append(wiki.dictionary.token2id[line.lower().strip()]) except KeyError: continue wiki.dictionary.filter_tokens(bad_ids=stop_ids) # save dictionary and bag-of-words (term-document frequency matrix) MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') # load back the id->word mapping directly from file # this seems to save more memory, compared to keeping the wiki.dictionary object from above dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') del wiki # initialize corpus reader and word->id mapping mm = MmCorpus(outp + '_bow.mm') # build tfidf tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) tfidf.save(outp + '.tfidf_model') # save tfidf vectors in matrix market format MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) logger.info("finished running %s" % program)
# In[18]:

df.tokens.values[0:3]


# In[22]:

d = Dictionary.from_documents(df.tokens)
d


# In[20]:

tfidf = TfidfModel(dictionary=d)


# *Hint-Hint:* `gensim` is sprinting this week at PyCon!

# In[24]:

get_ipython().magic(u'pinfo TfidfModel')


# In[26]:

TfidfModel(df.txt)


# In[27]:
def scorer(model, dic):
    tfidf = TfidfModel.load(model)
    dictionary = Dictionary.load(dic)

    def score(words):
        return tfidf[dictionary.doc2bow(words)]

    return score
if len(sys.argv) < 2: print(inspect.cleandoc(__doc__) % locals()) sys.exit(1) model_prefix = sys.argv[1] logger.info("running %s" % ' '.join(sys.argv)) logger.info("Loading word dictionary...") dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2') logger.debug(dictionary) logger.info("Loading document name map...") article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle') logger.info("Loading tf-idf model...") tfidf = TfidfModel.load(model_prefix + '.tfidf_model') logger.info("Loading similarity index...") similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r') similarity_index.use_reverse_index = True logger.info("Finished loading model files.") mismatches = 0 for doc_idx in range(0, len(similarity_index)): logger.info("Checking doc: %d %s" % (doc_idx, article_dict[doc_idx])) rev_doc = scipy.sparse.dok_matrix((1, len(dictionary)), dtype=np.float64) fwd_doc = similarity_index.vector_by_id(doc_idx) for feature_id, val in enumerate(fwd_doc.toarray().flatten()): if val == 0: continue feat_rev_docs = similarity_index.docs_by_feature_id(feature_id).toarray().flatten()
class TextCorpus(object): def __init__(self, filename): self.corpus = codecs.open(filename, 'r', encoding='utf-8') def __iter__(self): # One line contains one wiki article. for line in self.corpus: ngrams = druid_dict.find_ngrams(line.lower().split()) yield [stemmer.stem(token) for token in ngrams] articles = TextCorpus(wiki_text_output_path) # a memory-friendly iterator dictionary.add_documents(articles) class BowCorpus(object): def __init__(self, filename): self.corpus = codecs.open(filename, 'r', encoding='utf-8') def __iter__(self): for line in self.corpus: ngrams = druid_dict.find_ngrams(line.lower().split()) stemmed_article = [stemmer.stem(token) for token in ngrams] yield dictionary.doc2bow(stemmed_article) model = TfidfModel(BowCorpus(wiki_text_output_path), id2word=dictionary) model.save(model_output_path) logger.info("Finished building tfidf model. Time needed: " + str(time.time() - start_time))
# What about the raw, unprocessed unicode tweet text itself? # In[6]: import gzip with gzip.open(os.path.join(DATA_PATH, 'datetimes.csv.gz'), 'rb') as f: nums = pd.read_csv(f, engine='python', encoding='utf-8') with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f: corpus = pd.DataFrame.from_csv(f, encoding='utf8') # Now load previously compiled vocabulary and TFIDF matrix (transformation) # In[11]: tfidf = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf')) tfidf.num_docs # In[17]: bows = pd.Series(vocab.doc2bow(toks) for toks in corpus.tokens) bows # This would make a nice, compact sparse matrix representation of our entire corpus... # Which would mean we could do more in RAM at once. # Left as an exercise. (check out `scipy.sparse.coo_matrix`) # In[18]:
        self.dictionary.filter_extremes(no_below=1, keep_n=5000)  # check API docs for pruning params
        self.dictionary.save_as_text("wiki_en_wordids.txt")

    def __iter__(self):
        for tokens in iter_documents():
            yield self.dictionary.doc2bow(tokens)


corpus = MyCorpus()  # create a dictionary
corpora.MmCorpus.serialize("wiki_en_corpus.mm", corpus)  # store to disk, for later use
# for vector in corpus:  # convert each document to a bag-of-word vector
#     print(vector)

print("Create models")
tfidf_model = TfidfModel(corpus)
tfidf_model.save("wiki_en_tfidf.model")

# lsi_model = LsiModel(corpus)
# topic_id = 0
# for topic in lsi_model.show_topics():
#     topic_id += 1
#     print("TOPIC (LSI) " + str(topic_id) + " : " + topic)
# lsi_model.print_topic(20, topn=10)
# corpus_lsi = lsi_model[corpus]

corpus_tfidf = tfidf_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

# check and process input arguments
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE
online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
# only keep the most frequent words (out of total ~8.2m unique tokens)
wiki.dictionary.filter_extremes(no_below=20, keep_n=DEFAULT_DICT_SIZE)
wiki.dictionary.save_as_text(outp + '_wordids.txt')
# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(outp + '_wordids.txt')

# build tfidf, ~50min
tfidf = TfidfModel(wiki, normalize=True)
tfidf.save('tfidf_all_words')

logger.info("finished running %s" % program)
if len(sys.argv) < 3: print(inspect.cleandoc(__doc__) % locals()) sys.exit(1) input_file, output_prefix = sys.argv[1:3] logger.info("running %s" % ' '.join(sys.argv)) logger.info("Loading word dictionary...") dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2') logger.debug(dictionary) logger.info("Loading document name map...") article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle') logger.info("Loading tf-idf model...") tfidf = TfidfModel.load(output_prefix + '.tfidf_model') logger.info("Loading similarity index...") similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r') similarity_index.use_reverse_index = True similarity_index.preload_reverse_index() logger.info("Finished loading model files.") logger.info("Processing input documents...") try: infile = open(input_file, 'r') except IOError: print('cannot open %s' % (input_file,)) sys.exit(1)