def build_lda_by_keywords(keywords, num_article_for_search, num_topics=0):
    # default to one topic per keyword
    if num_topics == 0:
        num_topics = len(keywords)
    dir_name = '../data'
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    filename = dir_name + '/' + "_".join(keywords) + "_" + str(num_article_for_search)
    with open(filename, 'w', encoding="utf-8") as file:
        # collect articles for every keyword and log how many each keyword returned
        articles = []
        for keyword in keywords:
            articles_keyword = fetch_articles(keyword, number=num_article_for_search, days=-1)
            articles.extend(articles_keyword)
            log(file, "%s : %d" % (keyword, len(articles_keyword)))
        # tokenize title + content of every article
        texts = []
        for article in articles:
            tokens = cut(
                article.title + article.content,
                using_stopwords=True, simplified_convert=True)
            texts.append(tokens)
        # train the LDA model and log the top words of each topic
        start = time.time()
        model = lda.build_lda_model(texts, num_topics)
        for topic_key, tokens in lda.get_topic(model, num_topics=num_topics, num_words=15).items():
            log(file, tokens)
        end = time.time()
        log(file, "model train time : " + str(end - start))
        print("\n\n\n\n", file=file)
        for article in articles:
            print(article.title, end="\n", file=file)
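# Usage sketch (illustrative, not from the repo): build_lda_by_keywords only needs a
# keyword list and a per-keyword article count; num_topics defaults to one topic per
# keyword. The keywords and count below are made-up example values.
if __name__ == '__main__':
    build_lda_by_keywords(['地震', '日本'], num_article_for_search=100)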
def _compute_vector(self, input_data):
    weights = None
    if isinstance(input_data, list):
        if len(input_data) == 0:
            tokens = []
        elif isinstance(input_data[0], tuple):
            # (token, weight) pairs
            tokens = [data_tuple[0] for data_tuple in input_data]
            weights = [data_tuple[1] for data_tuple in input_data]
        else:
            # already a plain token list
            tokens = input_data
    else:
        # raw string: tokenize it first
        tokens = cut(input_data, using_stopwords=True, simplified_convert=True)
    # drop a trailing "八卦"/"卦" (PTT title boilerplate)
    if len(tokens) > 0 and tokens[-1] in ['八卦', '卦']:
        del tokens[-1]
    v1 = []
    for index, word in enumerate(tokens):
        if word not in self._model:
            continue
        word_vector = self._model[word]
        if weights:
            # index positionally so repeated tokens keep their own weight
            word_vector = word_vector * weights[index]
        v1.append(word_vector)
    if len(v1) == 0:
        print('invalid article:', input_data)
        return None
    return sum(v1)
def get_sentence(keyword, number, page=1):
    articles = fetch_articles(keyword, number, page=page, fl='title, content', desc=False)
    result_sentences = []
    for article in articles:
        tokens = cut(article.title, using_stopwords=False, simplified_convert=True)
        result_sentences.append(tokens)
        if hasattr(article, 'content_sentence'):
            for sen in article.content_sentence:
                result_sentences.append(
                    cut(sen, using_stopwords=False, simplified_convert=True))
        # if hasattr(article, 'content'):
        #     result_sentences.append(cut(article.content, using_stopwords=False, simplified_convert=True))
    return result_sentences
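# Illustrative follow-up (an assumption, not code from the repo): the nested token lists
# returned by get_sentence are already in the sentence format gensim's Word2Vec expects,
# so a model can be trained on them directly. The keyword, count, and output path are
# made-up values.
from gensim.models import Word2Vec

sentences = get_sentence('地震', number=1000)
w2v_model = Word2Vec(sentences, min_count=1)  # keep rare tokens; tune for real data
w2v_model.save('word2vec_by_sentences.model')  # hypothetical output path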
def n_similarity_test():
    sentence_list = [
        '馬總統走光照 蔡正元:經專家鑑定為光影',
        '馬走光照瘋傳總統府譴責',
        '2016 全球 軍事力量排名',
        '舉債也最低... 「六都還款王」第2名令人',
        '【北捷殺人案】鄭捷判死定讞5大理由曝光',
        '地震',
        '日本紅十字會:捐款不用手續費 善款100%',
        '日本熊本強震 屏縣府擬捐香蕉賑災',
        '有沒有日本重新定義島的八卦',
        '日本只在利益不衝突時才是朋友'
    ]
    tokens_list = [cut(sentence) for sentence in sentence_list]
    for i in range(1, len(tokens_list)):
        print(tokens_list[i - 1])
        print(tokens_list[i])
        for model in models:
            print(model.n_similarity(tokens_list[i], tokens_list[i - 1]))
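# The global `models` list iterated above is assumed to hold already-trained gensim
# word2vec models. A minimal loading sketch with made-up file paths:
from gensim.models import Word2Vec

models = [Word2Vec.load(path) for path in ['model/w2v_100.model', 'model/w2v_300.model']]
n_similarity_test()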
def compute_vector(model, input_data, need_log=False):
    if isinstance(input_data, str):
        tokens = cut(input_data, using_stopwords=True, simplified_convert=True, log=need_log)
    else:
        tokens = input_data
    # drop a trailing "八卦"/"卦" (PTT title boilerplate)
    if len(tokens) > 0 and tokens[-1] in ['八卦', '卦']:
        del tokens[-1]
    if need_log:
        print(tokens)
    tokens_not_found = [word for word in tokens if word not in model]
    if tokens_not_found:
        log('token not in model :' + " ".join(tokens_not_found))
    v1 = [model[word] for word in tokens if word in model]
    if len(v1) == 0:
        # input_data may be a token list, so avoid string concatenation here
        print('invalid article:', input_data)
        return None
    # normalized mean of the word vectors
    return matutils.unitvec(array(v1, float).mean(axis=0))
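# Because compute_vector returns a unit vector (matutils.unitvec), the cosine similarity
# of two texts is simply the dot product of their vectors. `w2v_model` is a hypothetical
# loaded word2vec model; the two titles are reused from the test sentences above.
from numpy import dot

vec_a = compute_vector(w2v_model, '日本熊本強震 屏縣府擬捐香蕉賑災')
vec_b = compute_vector(w2v_model, '日本紅十字會:捐款不用手續費 善款100%')
if vec_a is not None and vec_b is not None:
    print(dot(vec_a, vec_b))  # cosine similarity in [-1, 1]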
def build_lda_model(input_data, num_topics=1):
    if len(input_data) == 0:
        print('data is empty')
        return
    if isinstance(input_data, str):
        input_data = [input_data]
    texts = []
    for data in input_data:
        # accept pre-tokenized documents (token lists) as well as raw strings
        if isinstance(data, list):
            tokens = data
        else:
            tokens = cut(data, using_stopwords=True, simplified_convert=True)
        texts.append(tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda_model = models.ldamodel.LdaModel(
        corpus, num_topics=num_topics, id2word=dictionary, passes=1)
    return lda_model
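# Quick sanity check (illustrative): gensim's own show_topics() lists the top words per
# topic of the model returned above; the two input strings are titles reused from the
# similarity test and are not a real corpus.
lda_model = build_lda_model(
    ['日本熊本強震 屏縣府擬捐香蕉賑災', '【北捷殺人案】鄭捷判死定讞5大理由曝光'],
    num_topics=2)
if lda_model is not None:
    for topic in lda_model.show_topics(num_topics=2, num_words=15):
        print(topic)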
def _compute_vector(self, input_data, tfidf_vectorizer=None):
    weights = None
    if isinstance(input_data, list):
        if len(input_data) > 0 and isinstance(input_data[0], tuple):
            # (token, weight) pairs
            tokens = [data_tuple[0] for data_tuple in input_data]
            weights = [data_tuple[1] for data_tuple in input_data]
        else:
            tokens = input_data
    else:
        tokens = cut(input_data, using_stopwords=True, simplified_convert=True)
    # drop a trailing "八卦"/"卦" (PTT title boilerplate)
    if len(tokens) > 0 and tokens[-1] in ['八卦', '卦']:
        del tokens[-1]
    v1 = []
    if tfidf_vectorizer is not None:
        idf_table = self.build_idf_table(tfidf_vectorizer)
    for index, word in enumerate(tokens):
        if word not in self.model:
            continue
        word_vector = self.model[word]
        if weights:
            # index positionally so repeated tokens keep their own weight
            word_vector = word_vector * weights[index]
        if tfidf_vectorizer is not None and word in idf_table:
            word_vector = word_vector * idf_table[word]
        v1.append(word_vector)
    if len(v1) == 0:
        print('invalid article:', input_data)
        return None
    if tfidf_vectorizer is None:
        return array(v1, float).mean(axis=0)
    return sum(v1)
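# Standalone sketch of the word -> idf lookup that build_idf_table is assumed to provide:
# fit a scikit-learn TfidfVectorizer on whitespace-joined token strings and read its idf_
# weights. The documents and token_pattern here are illustrative only.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['日本 地震 賑災', '日本 捐款 手續費']
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\S+')  # keep single-character tokens
vectorizer.fit(docs)
# on scikit-learn >= 1.2 use get_feature_names_out() instead of get_feature_names()
idf_table = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))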
def _split_string(article, split_content=True):
    # join the title tokens (plus extracted content keywords) into one whitespace-separated string
    tokens = cut(article.title)
    if split_content:
        tokens.extend(keywords_extraction([article], 1))
    return ' '.join(tokens)
from python_code.model import ptt_article_fetcher
from python_code.model.my_tokenize.tokenizer import cut

# Compare tokenization with and without simplified-Chinese conversion on real PTT titles.
articles = ptt_article_fetcher.fetch_articles('', number=10, page=6)
using_stopwords = False
equals_tokens = []
for article in articles:
    token1 = cut(article.title, using_stopwords, True)   # with simplified conversion
    token2 = cut(article.title, using_stopwords, False)  # without conversion
    if token1 == token2:
        equals_tokens.append(token1)
    else:
        print('converted  ' + str(token1))
        print('unconverted' + str(token2))
for i in equals_tokens:
    print(i)