def cal_tfidf(documents, topk=10) -> List:
    """
    Train a tf-idf model.
    :param documents: documents to train on
    :param topk: number of top-scoring tf-idf words to return per document;
                 if topk exceeds the number of extracted words, all words are returned
    :return:
    """
    # Split each document into a list of tokens
    docs = [[word for word in document.split(' ')] for document in documents]
    # Build the dictionary
    dictionary = corpora.Dictionary(docs)
    # Build the bag-of-words corpus
    docs_bow = [dictionary.doc2bow(doc) for doc in docs]
    if os.path.isfile(tfidfmodel):
        model = TfidfModel.load(tfidfmodel)
    else:
        model = TfidfModel(docs_bow)
        model.save(tfidfmodel)
    # Build the tf-idf vectors for every document
    docs_vector = list(model[docs_bow])
    # Sort each document's vector by score and keep the top topk entries
    docs_sort_vector = [
        sorted(doc, key=lambda x: x[1], reverse=True)[:topk]
        for doc in docs_vector
    ]
    # Map token ids back to Chinese words; docs_sort_chinese is a list of
    # (word, tf-idf score) pairs per document
    docs_sort_chinese = [[(dictionary[vec[0]], vec[1]) for vec in doc]
                         for doc in docs_sort_vector]
    return docs_sort_chinese
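# --- Usage sketch (illustrative, not part of the original source): assumes the
# gensim/os imports and the module-level `tfidfmodel` save path that
# cal_tfidf relies on are already defined.
docs = [
    "machine learning is fun",
    "deep learning builds on machine learning",
]
for doc_terms in cal_tfidf(docs, topk=3):
    print(doc_terms)  # [(word, tf-idf score), ...] sorted by score, at most 3 per doc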
def get_lda_feature():
    doc_train = pd.read_csv(id_content_path)
    documents = doc_train['content'].apply(lambda x: x.split(' '))
    # Build the id<->word mapping dictionary
    dictionary = corpora.Dictionary(documents)
    # Convert each document into a bag of words: a list of (id, count) tuples
    ds_df = [dictionary.doc2bow(document) for document in documents]
    # Train a tf-idf model on the corpus term frequencies; at prediction time
    # only the document frequencies are needed
    tfidf_model = TfidfModel(ds_df)
    # Get the tf-idf representation of every document
    ds_tfidf = tfidf_model[ds_df]
    # Number of topics
    n = 60
    # Build the LDA model from the tf-idf corpus with the given number of topics
    lda_model = LdaModel(ds_tfidf, num_topics=n, passes=10, random_state=12)
    vec_size = (len(documents), n)
    lda_feature = np.zeros(vec_size)
    i = 0
    for doc in ds_tfidf:
        topics = lda_model.get_document_topics(doc, minimum_probability=0.01)
        for topic in topics:
            num_topic = topic[0]
            prob = round(topic[1], 5)
            lda_feature[i, num_topic] = prob
        i += 1
    f_names = get_lda_feacture_name(n)
    pd.DataFrame(lda_feature, columns=f_names).to_csv(id_content_lda_path, index=0)
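# --- Standalone sketch (illustrative) of the tf-idf -> LDA -> topic-vector
# pipeline used in get_lda_feature, on a toy corpus instead of id_content_path:
from gensim import corpora
from gensim.models import TfidfModel, LdaModel
import numpy as np

toy_docs = [["cat", "dog", "pet"], ["stock", "market", "trade"], ["dog", "pet", "food"]]
toy_dict = corpora.Dictionary(toy_docs)
toy_bow = [toy_dict.doc2bow(d) for d in toy_docs]
toy_tfidf = TfidfModel(toy_bow)[toy_bow]
toy_lda = LdaModel(toy_tfidf, num_topics=2, passes=10, random_state=12)
feature = np.zeros((len(toy_docs), 2))
for i, doc in enumerate(toy_tfidf):
    for topic_id, prob in toy_lda.get_document_topics(doc, minimum_probability=0.01):
        feature[i, topic_id] = round(prob, 5)
# each row of `feature` is a document's topic-probability vector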
def __init__(self, docs, strip_diac=True,
             num_option=OPTION_GROUP,
             usr_option=OPTION_GROUP,
             url_option=OPTION_GROUP,
             emo_option=OPTION_GROUP,
             lc=True, del_dup1=True, token_list=[-1], lang=None, **kwargs):
    self.strip_diac = strip_diac
    self.num_option = num_option
    self.usr_option = usr_option
    self.url_option = url_option
    self.emo_option = emo_option
    self.emoclassifier = EmoticonClassifier()
    self.lc = lc
    self.del_dup1 = del_dup1
    self.token_list = token_list
    if lang:
        self.lang = LangDependency(lang)
    else:
        self.lang = None
    self.kwargs = {k: v for k, v in kwargs.items() if k[0] != '_'}
    docs = [self.tokenize(d) for d in docs]
    self.dictionary = corpora.Dictionary(docs)
    corpus = [self.dictionary.doc2bow(d) for d in docs]
    self.model = TfidfModel(corpus)
def gensim_similarity(data_c):
    """
    Compute similarities with gensim:
        term counts: COUNT, LDA, LSI
        tf-idf:      TFIDF, LDA, LSI
    """
    # Merge both sides to build the shared vocabulary
    data_c['s1'] = data_c['s1'].apply(lambda text: list(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: list(text))
    # (Series.append was removed in pandas 2.0; pd.concat is the modern equivalent)
    data_c_all = data_c['s1'].append(data_c['s2'], ignore_index=True).to_frame(name='s')

    # Build the dictionary
    print("starting create dic....")
    dic = corpora.Dictionary(data_c['s1'].values)
    dic.add_documents(data_c['s2'].values)
    print("number of documents:", dic.num_docs)

    print("starting create count bow...")
    data_c['s1'] = data_c['s1'].apply(lambda text: dic.doc2bow(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: dic.doc2bow(text))
    data_c_all['s'] = data_c_all['s'].apply(lambda text: dic.doc2bow(text))
    cps1 = list(data_c['s1'])
    cps2 = list(data_c['s2'])
    cps = list(data_c_all['s'])

    # Count-based similarity between s1 and s2
    print("starting count similarity....")
    sm = similarities.SparseMatrixSimilarity(corpus=cps1, num_features=10000)
    count_sm = np.diag(sm[cps2])

    # Count-based LDA (and optionally LSI) similarity between s1 and s2
    count_lda_sm = lda_similarity(cps, cps1, cps2, dic)
    # count_lsi_sm = lsi_similarity(cps, cps1, cps2, dic)

    # tf-idf similarity between s1 and s2
    print("starting tfidf similarity....")
    tfidf = TfidfModel(corpus=cps, id2word=dic)
    cps1_tfidf = tfidf[cps1]
    cps2_tfidf = tfidf[cps2]
    cps_tfidf = tfidf[cps]
    sm = similarities.SparseMatrixSimilarity(corpus=cps1_tfidf, num_features=10000)
    tfidf_sm = np.diag(sm[cps2_tfidf])

    # tf-idf LDA and LSI similarities between s1 and s2
    tfidf_lda_sm = lda_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)
    tfidf_lsi_sm = lsi_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)

    return count_sm, count_lda_sm, tfidf_sm, tfidf_lda_sm, tfidf_lsi_sm
def calculate_embedding(corpus: Corpus, *, rank=2, svd_dims=50, perplexity=30, seed=0):
    """
    Calculate a document embedding that assigns each document in the
    corpus an N-d position based on its word usage.

    :returns: A list of N-d tuples for the documents in the corpus.
    """
    from gensim.matutils import corpus2dense
    from gensim.models.tfidfmodel import TfidfModel
    from sklearn.decomposition import TruncatedSVD
    from sklearn.manifold import TSNE

    dic = corpus.dictionary
    freqs = corpus.frequencies
    tfidf = corpus2dense(TfidfModel(dictionary=dic)[freqs], len(dic)).T

    if svd_dims is not None:
        svd = TruncatedSVD(n_components=svd_dims, random_state=seed)
        components = svd.fit_transform(tfidf)
    else:
        components = tfidf

    model = TSNE(rank, metric='cosine', square_distances=True,
                 perplexity=perplexity, random_state=seed)
    return model.fit_transform(components)
def fit(self, documents, labels=None):
    self.lexicon = Dictionary(documents)
    self.tfidf = TfidfModel(
        [self.lexicon.doc2bow(doc) for doc in documents],
        id2word=self.lexicon)
    self.save()
    return self
def lsi(documents, topicNum):
    texts = [[word for word in document.split(' ')] for document in documents]
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(len(texts)))
    dictionary = corpora.Dictionary(texts)
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' get corpus..')
    corpusD = [dictionary.doc2bow(text) for text in texts]
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' tfidf Model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]  # note: computed but unused; the LSI model below is trained on raw counts
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000,
                     extra_samples=100)  # , distributed=True, sample=1e-5, iter=10, seed=1
    lsiFeature = np.zeros((len(texts), topicNum))
    print('translate...')
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
            lsiFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
        if i % 1000 == 1:
            print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(i))
    return lsiFeature
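# --- Usage sketch (illustrative) for lsi, assuming the gensim/numpy/time
# imports the function relies on are present:
docs = ["the cat sat on the mat", "the dog sat on the log", "cats and dogs"]
features = lsi(docs, topicNum=2)
# `features` is a (3, 2) array of LSI topic weights, one row per document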
def getLsiFeature(documents, topicNum):
    '''
    Function: generate LSI features by training an LSI model
    Input:
        documents: list of preprocessed sentences
        topicNum: output vector dimension
    Output: LSI features (DataFrame format)
    '''
    # get corpus
    # LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # train lsi model
    # LogInfo(' Train LSI model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]  # note: computed but unused; the LSI model is trained on raw counts
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000,
                     extra_samples=100)  # , distributed=True, sample=1e-5, iter=10, seed=1

    # generate lsi features
    LogInfo(' Generate LSI features...')
    lsiFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
            lsiFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
    colName = getColName(topicNum, "qlsi")
    lsiFeature = pd.DataFrame(lsiFeature, columns=colName)
    return lsiFeature
def load(self):
    if os.path.exists(self._lexicon_path):
        self.lexicon = Dictionary.load(self._lexicon_path)
    if os.path.exists(self._tfidf_path):
        self.tfidf = TfidfModel.load(self._tfidf_path)
def tf_idf_transform(self, doc):
    """
    Perform tf-idf transformation on doc.
    """
    self.dictionary = corpora.Dictionary(doc)
    corpus = [self.dictionary.doc2bow(text) for text in doc]
    self.tfIdfModel = TfidfModel(corpus)

    conf.mk_dir(self.tfIdfPath)
    self.dictionary.save(self.dictPath)
    logger.info('Dictionary has been saved in %s.' % self.dictPath)
    self.tfIdfModel.save(self.tfIdfPath)
    logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

    tfidf_corpus = self.tfIdfModel[corpus]
    tfidf_corpus_path = conf.get_filename_via_tpl('tfidf', n_users=self.nUsers,
                                                  postfix='mm', n_samples=self.nSamples)
    corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
    logger.info('TF-IDF corpus with a shape of %s has been saved in %s.'
                % (np.array(tfidf_corpus).shape, tfidf_corpus_path))

    return tfidf_corpus
def get_tfidf_model(path="data/swiki.json", save_path="data/swiki_dict.txt", stem=False):
    """
    :param path: JSON corpus to load
    :param save_path: where the dictionary is cached as text
    :param stem: whether to stem tokens during preprocessing
    :return: (dictionary, tfidf model)
    """
    # Materialize as a list: the texts may be iterated twice below
    texts = [_preprocess_text(x, stem=stem) for x in _load_json_list(path)]

    def _get_swiki_dictionary():
        dict_file = os.path.join(BASE_DIR, save_path)
        if os.path.exists(dict_file):
            dictionary = corpora.Dictionary.load_from_text(dict_file)
        else:
            dictionary = corpora.Dictionary(texts)
            dictionary.save_as_text(dict_file)
        return dictionary

    dct = _get_swiki_dictionary()

    bow_texts = map(dct.doc2bow, texts)
    tfidf = TfidfModel(bow_texts)
    return dct, tfidf
def __init__(self):
    self.inner_model = None

    # load dictionary and corpus
    vocabulary = "raw"
    corpora_folder = os.path.join(*[
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'data', 'corpora'
    ])
    self.dictionary = corpora.Dictionary.load(
        os.path.join(corpora_folder, "%s.dict" % (vocabulary, )))
    self.corpus = corpora.MmCorpus(
        os.path.join(corpora_folder, "%s.mm" % (vocabulary, )))

    # parameters
    self.dataset = "CASEREPORT"

    # data file path
    models_folder = os.path.join(*[
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'data', 'models'
    ])
    filename = "TFIDF_%s" % (self.dataset, )
    self.filepath = os.path.join(models_folder, filename)
    model_exists = os.path.isfile(self.filepath)

    if model_exists:
        logging.info("found data file %s" % (self.filepath, ))
        self.inner_model = TfidfModel.load(self.filepath)
    else:
        self.inner_model = TfidfModel(corpus=self.corpus)
        self.inner_model.save(self.filepath)
def get_tfidf(self):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]
    docs_vecs = np.vstack(
        [sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
    return docs_vecs
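# --- Standalone sketch (illustrative) of the sparse-to-dense conversion used
# in get_tfidf, independent of the surrounding class:
from gensim import corpora
from gensim.models import TfidfModel
from gensim.matutils import sparse2full
import numpy as np

docs = [["apple", "banana"], ["banana", "cherry", "cherry"]]
docs_dict = corpora.Dictionary(docs)
bow = [docs_dict.doc2bow(doc) for doc in docs]
tfidf = TfidfModel(bow, id2word=docs_dict)
vecs = np.vstack([sparse2full(v, len(docs_dict)) for v in tfidf[bow]])
# vecs has shape (2, len(docs_dict)); zeros fill the terms absent from a document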
def tf_idf_weight(spacy_contexts):
    """
    @param spacy_contexts Spacy-fied contexts

    Returns a list of dicts; each dictionary corresponds to one document
    and maps its words to their tf-idf weights.
    """
    docs_dict = Dictionary(spacy_contexts)
    docs_dict.compactify()
    docs_corpus = [docs_dict.doc2bow(doc) for doc in spacy_contexts]
    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]

    # Now generate a list of dicts with k,v = "word": tfidf_frequency;
    # each dict contains words from one document (sentence)
    doc_tfidf_dicts = []
    for doc in docs_tfidf:
        d = dict()
        for term, freq in doc:
            d[docs_dict[term]] = freq
        doc_tfidf_dicts.append(d)

    return doc_tfidf_dicts
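# --- Usage sketch (illustrative) for tf_idf_weight; any pre-tokenized lists of
# strings work in place of spaCy output:
contexts = [["good", "movie"], ["bad", "movie", "plot"]]
weights = tf_idf_weight(contexts)
# weights is a list of dicts mapping words to tf-idf scores, one per document;
# terms that occur in every document get idf 0 and drop out of the dicts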
def getLdaFeature(documents, topicNum):
    '''
    Function: generate LDA features by training an LDA model
    Input:
        documents: list of preprocessed sentences
        topicNum: output vector dimension
    Output: LDA features (DataFrame format)
    '''
    # get corpus
    # LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # train lda model
    # LogInfo(' Train LDA model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    # ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, workers=8, num_topics=topicNum, chunksize=8000, passes=10, random_state=12)
    ldaModel = LdaModel(corpus_tfidf, num_topics=topicNum, chunksize=8000,
                        passes=10, random_state=12)

    # generate lda features
    LogInfo(' Generate LDA features...')
    ldaFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpus_tfidf:
        topic = ldaModel.get_document_topics(doc, minimum_probability=0.01)
        for t in topic:
            ldaFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
    colName = getColName(topicNum, "qlda")
    ldaFeature = pd.DataFrame(ldaFeature, columns=colName)
    return ldaFeature
def lda(documents, topicNum):
    texts = [[word for word in document.split(' ')] for document in documents]
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(len(texts)))
    dictionary = corpora.Dictionary(texts)
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' get corpus..')
    corpusD = [dictionary.doc2bow(text) for text in texts]
    # id2word = dictionary.id2word
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' tfidf Model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' train lda Model...')
    ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, workers=8,
                                                       num_topics=topicNum,
                                                       chunksize=8000, passes=10,
                                                       random_state=12)
    # ldaModel = gensim.models.ldamodel.LdaModel(corpus=corpusD, num_topics=topicNum, update_every=1, chunksize=8000, passes=10)
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' get lda feature...')
    ldaFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpus_tfidf:
        topic = ldaModel.get_document_topics(doc, minimum_probability=0.01)
        for t in topic:
            ldaFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
        if i % 1000 == 1:
            print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(i))
    return ldaFeature
def compute_tfidf():
    from gensim.models.tfidfmodel import TfidfModel

    keys, unstem_map, paragraph_lengths, int2word, word2int = compute_all_words()
    with time_code('compute_tfidf'):
        # 'ltc' SMART scheme: logarithmic term frequency, idf weighting, cosine normalization
        tfidf = TfidfModel(corpus, smartirs='ltc', id2word=int2word)
    return tfidf
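# --- Toy sketch (illustrative) of the 'ltc' SMART scheme used above,
# independent of the module globals:
from gensim import corpora
from gensim.models.tfidfmodel import TfidfModel

toy = [["a", "a", "b"], ["b", "c"]]
toy_dict = corpora.Dictionary(toy)
toy_bow = [toy_dict.doc2bow(t) for t in toy]
toy_model = TfidfModel(toy_bow, smartirs='ltc', id2word=toy_dict)
print(toy_model[toy_bow[0]])  # 'b' appears in every document, so only 'a' gets weight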
def loadCorpus(self, mmfile, dictfile, doctuplesfile=None):
    self.corpus = corpora.MmCorpus(mmfile)
    self.dictionary = corpora.Dictionary.load(dictfile)
    if doctuplesfile is not None:
        with open(doctuplesfile, 'rb') as docpicklef:
            self.doctuples = pickle.load(docpicklef)
    if self.toweight:
        self.tfidf = TfidfModel(self.corpus)
def predict_on_group(model, docs_data, word2vec_model300, length=5) -> 'pd.DataFrame of type : pair_id || target':
    """
    Parameters:
        model -- model object with methods train and predict
        docs_data -- pandas DataFrame with fields pair_id, content, target
        word2vec_model300 -- w2v model (object)
    Returns:
        pd.DataFrame of type {pair_id || target} with the predicted target for each pair_id
    """
    dictionary = corpora.Dictionary()
    for i in docs_data.content:
        try:
            dictionary.add_documents([i])
        except Exception:
            dictionary.add_documents([['a']])

    docs_data['vector'] = docs_data.content.apply(doc_opti, args=(dictionary, ))

    corpus = []
    for line in docs_data.content:
        try:
            if math.isnan(line):
                line = ["мимо"]  # placeholder token for NaN rows
        except Exception:
            pass
        corpus = corpus + [dictionary.doc2bow(line)]

    similarity_matrix = word2vec_model300.similarity_matrix(
        dictionary,
        tfidf=TfidfModel(corpus, dictionary=dictionary),
        threshold=0.0, exponent=2.0, nonzero_limit=100)

    docs_data['dist_vec'] = docs_data.vector.apply(make_dist_vec,
                                                   args=(docs_data.vector, similarity_matrix))
    features = [str(i) for i in range(length)]
    for i in range(length):
        docs_data[str(i)] = docs_data.dist_vec.apply(take_S, args=(i, ))
    docs_data['target'] = model.predict(np.array(docs_data[features]))
    return docs_data[['pair_id', 'target']]
def train_model_on_group(model, docs_data, word2vec_model300, length=5):
    """
    Parameters:
        model -- model object with methods train and predict
        docs_data -- pandas DataFrame with fields pair_id, content, target
        word2vec_model300 -- w2v model (object)
    Returns:
        model trained on the data
    """
    dictionary = corpora.Dictionary()
    for i in docs_data.content:
        try:
            dictionary.add_documents([i])
        except Exception:
            dictionary.add_documents([['a']])

    docs_data['vector'] = docs_data.content.apply(doc_opti, args=(dictionary, ))

    corpus = []
    for line in docs_data.content:
        try:
            if math.isnan(line):
                line = ["мимо"]  # placeholder token for NaN rows
        except Exception:
            pass
        corpus = corpus + [dictionary.doc2bow(line)]

    similarity_matrix = word2vec_model300.similarity_matrix(
        dictionary,
        tfidf=TfidfModel(corpus, dictionary=dictionary),
        threshold=0.0, exponent=2.0, nonzero_limit=100)

    docs_data['dist_vec'] = docs_data.vector.apply(make_dist_vec,
                                                   args=(docs_data.vector, similarity_matrix))
    features = [str(i) for i in range(length)]
    for i in range(length):
        docs_data[str(i)] = docs_data.dist_vec.apply(take_S, args=(i, ))

    print(docs_data.head())
    model = model.fit(docs_data[features], docs_data['target'])
    print(model.score(docs_data[features], docs_data['target']))
    return model
def __init__(self, docs, num_option=OPTION_GROUP,
             usr_option=OPTION_GROUP,
             url_option=OPTION_GROUP,
             emo_option=OPTION_GROUP,
             lc=True, del_dup=True, del_punc=False, del_diac=True,
             token_list=[-1],
             token_min_filter=-1,
             token_max_filter=1.0,
             tfidf=True,
             **kwargs):
    self.del_diac = del_diac
    self.num_option = num_option
    self.usr_option = usr_option
    self.url_option = url_option
    self.emo_option = emo_option
    self.lc = lc
    self.del_dup = del_dup
    self.del_punc = del_punc
    self.token_list = token_list
    self.token_min_filter = token_min_filter
    self.token_max_filter = token_max_filter
    self.tfidf = tfidf
    self.kwargs = {k: v for k, v in kwargs.items() if k[0] != '_'}
    if emo_option == OPTION_NONE:
        self.emo_map = None
    else:
        # self.emo_map = get_compiled_map(os.path.join(os.path.dirname(__file__), 'resources', 'emoticons.json'))
        self.emo_map = EmoticonClassifier()
    docs = [self.tokenize(d) for d in docs]
    self.dictionary = corpora.Dictionary(docs)
    corpus = [self.dictionary.doc2bow(d) for d in docs]
    if self.token_min_filter != 1 or self.token_max_filter != 1.0:
        if self.token_min_filter < 0:
            self.token_min_filter = abs(self.token_min_filter)
        else:
            self.token_min_filter = int(len(corpus) * self.token_min_filter)
        if self.token_max_filter < 0:
            self.token_max_filter = abs(self.token_max_filter) / len(corpus)
        self.dictionary.filter_extremes(no_below=self.token_min_filter,
                                        no_above=self.token_max_filter,
                                        keep_n=None)
    if self.tfidf:
        self.model = TfidfModel(corpus)
    else:
        self.model = None
def corpus_vec(docs, model, corpus, size=DEFAULT_SAMPLE_SIZE):
    """Creates an NxD array of document vectors for each document in a list."""
    tfidf = TfidfModel(corpus)
    N, D = len(docs), model.wv.syn0.shape[1]
    arr = np.empty((N, D))
    for i in range(N):
        arr[i, :] = doc_vec(docs[i], model, corpus, size, tfidf)
    return arr
def __init__(self, documents):
    self.documents = documents
    self.texts = [[word for word in document.lower().split()]
                  for document in documents]
    self.dictionary = corpora.Dictionary(self.texts)
    self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
    self.tfidf = TfidfModel(self.corpus)
    self._make_random_indexing()
    print("initialized!")
def buildTfidfModel(corpus):
    print('get tfidf model...')
    if not os.path.exists(modelpath + 'tfidf.model'):
        # build the tf-idf model
        tfidf = TfidfModel(corpus)
        tfidf.save(modelpath + 'tfidf.model')
    else:
        tfidf = TfidfModel.load(modelpath + 'tfidf.model')
    print('done')
    return tfidf
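# --- Usage sketch (illustrative) for buildTfidfModel; assumes the module-level
# `modelpath` directory exists and the gensim imports are in place:
docs = [["cat", "dog"], ["dog", "fish"]]
dictionary = corpora.Dictionary(docs)
bow = [dictionary.doc2bow(d) for d in docs]
tfidf = buildTfidfModel(bow)  # trains and saves on the first call, loads thereafter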
def tf_idf_keywords(text, bow, dictionary):
    tfidf = TfidfModel(bow)  # train the model on the bag-of-words corpus
    text = dictionary.doc2bow(text)
    tfidf_weights = tfidf[text]
    # sort terms by tf-idf weight, highest first
    sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)
    keywords = []
    for term_id, weight in sorted_tfidf_weights[:5]:
        keywords.append(str(dictionary.get(term_id)))
    return keywords
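# --- Usage sketch (illustrative) for tf_idf_keywords:
from gensim import corpora

docs = [["cat", "sat", "mat"], ["dog", "ate", "bone"], ["cat", "dog", "play"]]
dictionary = corpora.Dictionary(docs)
bow = [dictionary.doc2bow(doc) for doc in docs]
print(tf_idf_keywords(["cat", "sat", "mat"], bow, dictionary))
# -> up to five keywords, highest tf-idf first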
def fit(self, documents, labels=None):
    if self.lexicon is None or self.tfidf is None:
        inputDocuments = list(documents)
        self.lexicon = Dictionary(inputDocuments)
        self.tfidf = TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in inputDocuments],
            id2word=self.lexicon)
        self.save()
    return self
def tf_idf(dataSeg_save):
    corpus = pd.read_csv(dataSeg_save, header=None)[0]
    texts = [sentence.split(' ') for sentence in corpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tf_idf_model = TfidfModel(corpus, normalize=False)
    word_tf_tdf = list(tf_idf_model[corpus])
    # print('dictionary:', dictionary.token2id)
    # print('term counts:', corpus)
    # print('tf-idf values:', word_tf_tdf)
    return word_tf_tdf, dictionary.token2id
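# --- Runnable usage sketch (illustrative) for tf_idf; the temporary file
# stands in for dataSeg_save, a headerless one-column CSV of space-separated tokens:
import tempfile, os

tmp = tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False)
tmp.write("the cat sat\nthe dog barked\n")
tmp.close()
weights, token2id = tf_idf(tmp.name)
os.unlink(tmp.name)
# `weights` holds unnormalized per-document (token_id, tf-idf) pairs;
# `token2id` maps each token to its dictionary id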
def _get_tfidf(self):
    # Convert each document (a list of words) into the bag-of-words format
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_tf_idf = TfidfModel(docs_corpus, id2word=self.docs_dict)
    docs_tf_idf = model_tf_idf[docs_corpus]
    docs_tuples = []
    for c in docs_tf_idf:
        docs_tuples.append(sparse2full(c, len(self.docs_dict)))
    tf_idf_vec = np.vstack(docs_tuples)
    return tf_idf_vec
def news_recommend_keywords(keywords, num=10):
    keywords = [word for word in keywords.split()]
    path_df = "Pickles/News_central_rec2.pickle"
    with open(path_df, 'rb') as data:
        df = pickle.load(data)

    df['bag_of_words'] = ''
    columns = df.columns
    for index, row in df.iterrows():
        words = []
        for col in columns:
            if col == 'Content':
                words += row[col].split()
        words = list(set(words))
        row['bag_of_words'] = words

    processed_keywords = df.bag_of_words.to_list()
    # Build a dictionary of words from our keywords
    dictionary = Dictionary(processed_keywords)
    # Build the corpus: a bag of words for each document
    corpus = [dictionary.doc2bow(doc) for doc in processed_keywords]
    # Build a tf-idf model of the corpus
    tfidf = TfidfModel(corpus)
    # Build the similarity index; this is where we get the similarities between news items
    sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

    # Turn the query into a bag of words, then into tf-idf (news ID, weight) tuples
    query_doc_bow = dictionary.doc2bow(keywords)
    query_doc_tfidf = tfidf[query_doc_bow]
    # Array of similarity values between our query and every other news item
    similarity_array = sims[query_doc_tfidf]

    similarity_series = pd.Series(similarity_array.tolist(), index=df.Title.values)
    # Top matching results, i.e. the most similar news
    top_hits = similarity_series.sort_values(ascending=False)[:num]

    titles = []
    scores = []
    for idx, (title, score) in enumerate(zip(top_hits.index, top_hits)):
        titles.append(title)
        scores.append(score)
    return titles, scores
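# --- Standalone sketch (illustrative) of the tf-idf similarity query pattern
# used above, without the pickled news DataFrame:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity

docs = [["economy", "market", "trade"], ["football", "league", "cup"], ["market", "shares"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
tfidf = TfidfModel(corpus)
index = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
query = tfidf[dictionary.doc2bow(["market", "economy"])]
print(index[query])  # cosine similarity of the query against every document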
def transform_to_sparse(infiles, N, feature_size, vectorizer=None, feature_weight='logent'):
    """
    Param
    ------
    infiles: dict, in the form {word_file: "path", lemma_file: "path"}
    N: the number of instances in the file
    feature_size: int
    vectorizer: sklearn vectorizer
    feature_weight: {'logent', 'tfidf', 'binary'}, weighting scheme

    Return
    ------
    X: sparse matrix, feature representation of infiles with the given weighting scheme
    y: 1d array, indicators of labels in infiles
    """
    infile = infiles['word_file'] if infiles['word_file'] is not None else infiles['lemma_file']
    if vectorizer is not None:
        if feature_weight == 'binary':
            X = vectorizer.fit_transform(get_line_as_str(**infiles))
            y = get_y(infile)
        else:
            X = Scipy2Corpus(
                vectorizer.fit_transform(get_line_as_str(**infiles)))
            if feature_weight == 'tfidf':
                weighting_scheme = TfidfModel(X)
            elif feature_weight == 'logent':
                weighting_scheme = LogEntropyModel(X)
            x = weighting_scheme[X]
            y = get_y(infile)

            data = []
            rows = []
            cols = []
            line_count = 0
            for line in x:
                for elem in line:
                    rows.append(line_count)
                    cols.append(elem[0])
                    data.append(elem[1])
                line_count += 1
            print(len(data))
            print(len(rows))
            print(len(cols))
            print(N)
            print(feature_size)
            X = csr_matrix((data, (rows, cols)), shape=(N, feature_size))
    return X, y