def fit_with_extraction(self,
                        articles,
                        method,
                        topic=10,
                        use_idf=False,
                        with_weight=False):
    """Extract keywords per article, fit a vectorizer on them, assign vectors.

    Args:
        articles: Re-iterable collection of article objects; each must expose
            an ``id`` attribute and accept a ``vector`` attribute.
        method: Forwarded as the second argument of ``keywords_extraction``.
        topic: Forwarded as the third argument of ``keywords_extraction``.
        use_idf: Forwarded to ``self.idf_vectorizer``.
        with_weight: Forwarded to ``keywords_extraction``. When true, each
            extracted entry is indexed with ``[0]`` to obtain the term used
            in the vectorizer corpus.

    Returns:
        Whatever ``self.remove_invalid_articles(articles)`` returns.
    """
    keywords_by_id = {}
    corpus = []
    for item in articles:
        extracted = keywords_extraction(item,
                                        method,
                                        topic,
                                        with_weight=with_weight)
        keywords_by_id[item.id] = extracted
        # Weighted entries carry the term in position 0; unweighted entries
        # are joined as they are.
        terms = [entry[0] for entry in extracted] if with_weight else extracted
        corpus.append(' '.join(terms))

    vectorizer = self.idf_vectorizer(corpus, use_idf)

    # Second pass: the vectorizer must be built from all documents before any
    # single article can be vectorized.
    for item in articles:
        item.vector = self._compute_vector(keywords_by_id[item.id], vectorizer)
    return self.remove_invalid_articles(articles)
예제 #2
0
def clustering4(model, articles, threshold=0.55, t=0.7, c=0.3):
    """Cluster articles using combined method 2 with weights ``t`` and ``c``.

    Every article in every initial cluster receives a ``content_vector``
    computed from its extracted keywords; each cluster's vector is then
    computed with ``[2, t, c]`` and the clusters are merged with the same
    combined-method specification.

    Args:
        model: Passed through to ``compute_vector``,
            ``compute_cluster_vector`` and ``merge_clusters``.
        articles: Input handed to ``initialize_clusters``.
        threshold: Passed to ``merge_clusters``.
        t: Second element of the combined-method specification.
        c: Third element of the combined-method specification.

    Returns:
        The result of ``merge_clusters``.
    """
    groups = initialize_clusters(articles)
    for group in groups:
        for member in group['articles']:
            extracted = keywords_extraction(member)
            member.content_vector = compute_vector(model, extracted)
        compute_cluster_vector(model, group, [2, t, c])
    return merge_clusters(model, groups, threshold, combined_method=[2, t, c])
 def fit(self, articles):
     """Assign each article a vector computed from its extracted keywords.

     Keywords are extracted with ``self.method``, ``self.k`` and
     ``self.with_weight``; the result is passed to ``self._compute_vector``
     and stored on ``article.vector``. Afterwards
     ``self.remove_invalid_articles`` is called on the same collection.

     Args:
         articles: Iterable of article objects to vectorize in place.
     """
     for item in articles:
         item.vector = self._compute_vector(
             keywords_extraction(item,
                                 self.method,
                                 self.k,
                                 with_weight=self.with_weight))
     self.remove_invalid_articles(articles)
예제 #4
0
def compute_cluster_vector(model, cluster, combined_method):
    """Compute and store the representative vector(s) of ``cluster`` in place.

    The first element of ``combined_method`` selects the strategy:

    * ``0`` -- ``cluster['centroid']`` is the mean of the articles'
      ``vector`` attributes.
    * ``1`` -- same centroid as ``0``; additionally ``cluster['keywords']``
      is set to ``compute_vector(model, keywords_extraction(articles))``.
    * ``2`` -- ``cluster['centroid']`` is the sum of ``vector`` attributes
      times ``combined_method[1]`` plus the sum of ``content_vector``
      attributes times ``combined_method[2]``, divided by the article count.

    Any other selector leaves ``cluster`` untouched.

    Args:
        model: Only used for strategy ``1``, where it is passed to
            ``compute_vector``.
        cluster: Mapping with an ``'articles'`` sequence; updated in place.
        combined_method: Sequence whose first element is the strategy
            selector, followed by the two weights for strategy ``2``.

    Raises:
        ZeroDivisionError: If the cluster has no articles and a centroid is
            requested with plain numeric vectors.
    """
    articles = cluster['articles']
    # Compare by value: identity checks against int literals only work by
    # accident of small-int caching and fail for IntEnum / numpy integers.
    mode = combined_method[0]
    if mode == 0 or mode == 1:
        cluster['centroid'] = sum(a.vector for a in articles) / len(articles)
        if mode == 1:
            cluster['keywords'] = compute_vector(
                model, keywords_extraction(articles))
    elif mode == 2:
        title_weight = combined_method[1]
        content_weight = combined_method[2]
        blended = (sum(a.vector for a in articles) * title_weight
                   + sum(a.content_vector for a in articles) * content_weight)
        cluster['centroid'] = blended / len(articles)
 def fit_with_extraction_ratio(self,
                               articles,
                               method=1,
                               k=25,
                               t=0.5,
                               c=0.5):
     """Assign each article a weighted blend of title and keyword vectors.

     Per article:

     * ``c == 0``: the vector is the title vector alone.
     * ``t == 0``: the vector is the keyword vector alone (the title is not
       vectorized).
     * otherwise: ``title_vector * t + keyword_vector * c``. If one of the
       two vectors is ``None`` the other one is used unscaled, so a single
       article with a missing part no longer affects how the remaining
       articles are weighted and no longer raises on ``None * weight``.

     Afterwards ``self.remove_invalid_articles`` is called on ``articles``.

     Args:
         articles: Iterable of article objects exposing ``title``; each gets
             its ``vector`` attribute set.
         method: Forwarded as the second argument of ``keywords_extraction``.
         k: Forwarded as the third argument of ``keywords_extraction``.
         t: Weight applied to the title vector.
         c: Weight applied to the keyword vector.
     """
     for article in articles:
         if c == 0:
             # Title-only mode takes precedence, including when t is also 0.
             article.vector = self._compute_vector(article.title)
             continue

         # The fallback is decided per article; ``t`` itself is never
         # modified, so later articles keep the caller's weighting.
         title_vector = None
         if t != 0:
             title_vector = self._compute_vector(article.title)
         keyword_vector = self._compute_vector(
             keywords_extraction(article, method, k, with_weight=True))

         if title_vector is None:
             article.vector = keyword_vector
         elif keyword_vector is None:
             article.vector = title_vector
         else:
             article.vector = title_vector * t + keyword_vector * c
     self.remove_invalid_articles(articles)
 def fit(self, articles):
     """Assign each article a blend of its title and keyword vectors.

     When both vectors exist the result is
     ``title_vector * self.t_ratio + content_vector * self.c_ratio``.
     When the title vector is ``None`` the content vector is used only if
     ``self.c_ratio`` is non-zero; when only the content vector is ``None``
     the title vector is used only if ``self.t_ratio`` is non-zero. In the
     remaining cases the article's vector is set to ``None``. Finally
     ``self.remove_invalid_articles`` is called on ``articles``.

     Args:
         articles: Iterable of article objects exposing ``title``.
     """
     for item in articles:
         extracted = keywords_extraction(item,
                                         self.method,
                                         self.k,
                                         with_weight=self.with_weight)
         from_title = self._compute_vector(item.title)
         from_content = self._compute_vector(extracted)

         if from_title is not None and from_content is not None:
             blended = from_title * self.t_ratio + from_content * self.c_ratio
         elif from_title is None:
             # No title vector: fall back to content if it carries weight.
             blended = from_content if self.c_ratio != 0 else None
         else:
             # No content vector: fall back to title if it carries weight.
             blended = from_title if self.t_ratio != 0 else None
         item.vector = blended
     self.remove_invalid_articles(articles)
예제 #7
0
def _split_string(article, split_content=True):
    """Return the article's tokens joined by single spaces.

    Args:
        article: Object exposing a ``title`` attribute.
        split_content: When true, the result of
            ``keywords_extraction([article], 1)`` is appended after the
            title tokens.

    Returns:
        A single space-separated string of tokens.
    """
    # Copy into a fresh list so this works when ``cut`` yields lazily and
    # never mutates a list owned by the tokenizer.
    tokens = list(cut(article.title))
    if split_content:
        tokens.extend(keywords_extraction([article], 1))
    return ' '.join(tokens)
예제 #8
0
def clustering1(model, articles, threshold=0.55, t=0.9, c=0.1):
    """Cluster articles with combined method 1 and similarity method 2.

    Each initial cluster gets a ``'keywords'`` entry computed from the
    keywords extracted from its articles, then the clusters are merged.

    Args:
        model: Passed through to ``compute_vector`` and ``merge_clusters``.
        articles: Input handed to ``initialize_clusters``.
        threshold: Passed to ``merge_clusters``.
        t: Second element of the similarity-method specification.
        c: Third element of the similarity-method specification.

    Returns:
        The result of ``merge_clusters``.
    """
    groups = initialize_clusters(articles)
    for group in groups:
        extracted = keywords_extraction(group['articles'])
        group['keywords'] = compute_vector(model, extracted)
    merged = merge_clusters(model,
                            groups,
                            threshold,
                            combined_method=[1],
                            similarity_method=[2, t, c])
    return merged
예제 #9
0
def get_cluster_keyword(cluster):
    """Return keyword extractions of the cluster's articles for methods 0 and 1.

    Args:
        cluster: Mapping with an ``'articles'`` entry.

    Returns:
        A two-element list: the ``keywords_extraction`` result for method
        ``0`` followed by the result for method ``1``.
    """
    members = cluster['articles']
    return [keywords_extraction(members, method) for method in (0, 1)]