def fit_with_extraction(self, articles, method, topic=10, use_idf=False, with_weight=False):
    """Vectorize articles via keyword extraction plus a (TF-)IDF vectorizer.

    Extracts keywords for every article, fits the vectorizer on the joined
    keyword documents, then assigns each article its vector.  Returns only
    the articles whose vector turned out valid.
    """
    extracted = {}
    documents = []
    for article in articles:
        keywords = keywords_extraction(article, method, topic, with_weight=with_weight)
        extracted[article.id] = keywords
        # With weights attached, each entry is a (word, weight) pair.
        words = [pair[0] for pair in keywords] if with_weight else keywords
        documents.append(' '.join(words))
    vectorizer = self.idf_vectorizer(documents, use_idf)
    for article in articles:
        article.vector = self._compute_vector(extracted[article.id], vectorizer)
    return self.remove_invalid_articles(articles)
def clustering4(model, articles, threshold=0.55, t=0.7, c=0.3):
    """Cluster articles using combination method 2 (title/content blend).

    Gives each article a keyword-based content vector, computes every
    cluster's blended centroid, then merges clusters under `threshold`.
    """
    method = [2, t, c]
    clusters = initialize_clusters(articles)
    for cluster in clusters:
        # Each member article needs a content vector before the centroid.
        for article in cluster['articles']:
            article.content_vector = compute_vector(model, keywords_extraction(article))
        compute_cluster_vector(model, cluster, method)
    return merge_clusters(model, clusters, threshold, combined_method=method)
def fit(self, articles):
    """Vectorize each article from its extracted keywords, then drop the
    articles whose vector could not be computed."""
    for article in articles:
        extracted = keywords_extraction(
            article, self.method, self.k, with_weight=self.with_weight)
        article.vector = self._compute_vector(extracted)
    self.remove_invalid_articles(articles)
def compute_cluster_vector(model, cluster, combined_method):
    """Compute and store cluster['centroid'] (and, for mode 1, cluster['keywords']).

    combined_method[0] selects the strategy:
      0 -- centroid = mean of the member articles' vectors
      1 -- same mean centroid, plus a keyword vector for the whole cluster
      2 -- weighted mean: combined_method[1] * title vectors
           + combined_method[2] * content vectors, divided by cluster size

    Fix: the original compared ints with `is` (identity, not equality),
    which is unreliable outside the small-int cache and a SyntaxWarning on
    CPython >= 3.8; use `==`.  Modes 0 and 1 shared an identical centroid
    expression, now computed once.
    """
    articles = cluster['articles']
    mode = combined_method[0]
    if mode == 0 or mode == 1:
        cluster['centroid'] = sum(a.vector for a in articles) / len(articles)
        if mode == 1:
            cluster['keywords'] = compute_vector(model, keywords_extraction(articles))
    elif mode == 2:
        weighted = (sum(a.vector for a in articles) * combined_method[1]
                    + sum(a.content_vector for a in articles) * combined_method[2])
        cluster['centroid'] = weighted / len(articles)
def fit_with_extraction_ratio(self, articles, method=1, k=25, t=0.5, c=0.5):
    """Vectorize articles as a weighted blend of the title vector (weight t)
    and the extracted-keyword vector (weight c).

    An article whose title yields no vector falls back to the keyword
    vector alone.

    Fix: the original wrote `t = 0` inside the loop when one article's
    title vector was None, silently switching every SUBSEQUENT article to
    the keywords-only path.  The fallback is now a per-article local and
    the parameter is never mutated.
    """
    for article in articles:
        title_weight = t  # per-article copy; do not clobber the parameter
        if c != 0 and title_weight != 0:
            title_vector = self._compute_vector(article.title)
            if title_vector is None:
                # No usable title for this article only.
                title_weight = 0
            else:
                keyword_vector = self._compute_vector(
                    keywords_extraction(article, method, k, with_weight=True))
                article.vector = title_vector * t + keyword_vector * c
        if c == 0:
            article.vector = self._compute_vector(article.title)
        elif title_weight == 0:
            article.vector = self._compute_vector(
                keywords_extraction(article, method, k, with_weight=True))
    self.remove_invalid_articles(articles)
def fit(self, articles):
    """Blend title and keyword-content vectors using the configured ratios.

    Falls back to the other vector when one side is missing (only if its
    ratio is nonzero), then discards articles without a valid vector.
    """
    for article in articles:
        extracted = keywords_extraction(
            article, self.method, self.k, with_weight=self.with_weight)
        title_vec = self._compute_vector(article.title)
        content_vec = self._compute_vector(extracted)
        if title_vec is None:
            article.vector = content_vec if self.c_ratio != 0 else None
        elif content_vec is None:
            article.vector = title_vec if self.t_ratio != 0 else None
        else:
            article.vector = title_vec * self.t_ratio + content_vec * self.c_ratio
    self.remove_invalid_articles(articles)
def _split_string(article, split_content=True):
    """Return the article's title tokens -- optionally followed by extracted
    content keywords -- joined into one space-separated string."""
    parts = cut(article.title)
    if split_content:
        parts += keywords_extraction([article], 1)
    return ' '.join(parts)
def clustering1(model, articles, threshold=0.55, t=0.9, c=0.1):
    """Cluster articles with combination method 1.

    Each cluster first gets a keyword vector, then clusters are merged
    using similarity method 2 with title/content weights t and c.
    """
    clusters = initialize_clusters(articles)
    for cluster in clusters:
        keyword_vec = compute_vector(model, keywords_extraction(cluster['articles']))
        cluster['keywords'] = keyword_vec
    return merge_clusters(model, clusters, threshold,
                          combined_method=[1], similarity_method=[2, t, c])
def get_cluster_keyword(cluster):
    """Return the cluster's keywords as extracted by methods 0 and 1."""
    articles = cluster['articles']
    return [keywords_extraction(articles, method) for method in (0, 1)]