def compare_different_method(self, name, args, sampling=False, times=1):
    """Benchmark several feature/clustering configurations side by side.

    For each of ``times`` rounds a set of test articles is fetched with
    ``self._get_test_articles(sampling)``.  Every configuration in ``args``
    then re-runs feature extraction on those articles, clusters them with
    HAC and scores the outcome with ``validate_clustering`` against
    ``self._labeled_clusters``.  The elapsed wall-clock time of
    extraction + clustering + validation is stored under ``result['time']``.

    Args:
        name: Prefix of the output file name.
        args: Iterable of 6-tuples
            ``(feature, linkage, threshold, sim, quick, use_idf)``.
            A truthy ``quick`` selects ``HAC.quick_fit``, otherwise
            ``HAC.fit`` is used.
        sampling: Forwarded to ``self._get_test_articles``.
        times: Number of rounds.

    The per-configuration result lists are handed to
    ``self._print_test_result`` and ``self._save_as_csv``.
    """
    file_name = '{} {} sampling={} times={}'.format(
        name, self._number_article_per_test_cluster, sampling, times)
    print(file_name)

    result_table = {}
    for round_index in range(times):
        print(round_index)
        articles = self._get_test_articles(sampling)
        for config in args:
            feature, linkage, threshold, sim, quick, use_idf = config
            started = time.time()
            self._feature_extraction(feature, articles, use_idf=use_idf)
            model = HAC(threshold, linkage=linkage, similarity=sim)
            run_clustering = model.quick_fit if quick else model.fit
            clusters = run_clustering(articles)
            result = validate_clustering(self._labeled_clusters, clusters)
            result['time'] = time.time() - started
            key = '{} {} {} {} {} {}'.format(feature, linkage, threshold, sim, quick, use_idf)
            # One list of results per configuration, one entry per round.
            result_table.setdefault(key, []).append(result)

    self._print_test_result(result_table)
    self._save_as_csv(result_table, 'compare all', file_name)
def find_best_threshold(self, linkage, sim, quick, start_th=0.3, end_th=0.8,
                        step=0.05, sampling=True, times=1):
    """Sweep the HAC threshold and record the validation result per value.

    For each of ``times`` rounds a set of test articles is fetched with
    ``self._get_test_articles(sampling)`` and clustered once per threshold
    in the sweep ``start_th, start_th + step, ...``.  The sweep keeps going
    while the threshold is below ``end_th + step``, so ``end_th`` itself is
    included when it lies on the grid.

    Args:
        linkage: Forwarded to ``HAC`` as ``linkage``.
        sim: Forwarded to ``HAC`` as ``similarity``.
        quick: ``HAC.quick_fit`` is used only when this is exactly ``True``;
            any other value selects ``HAC.fit``.
        start_th: First threshold of the sweep.
        end_th: Last threshold of the sweep.
        step: Distance between thresholds; must be positive.
        sampling: Forwarded to ``self._get_test_articles``.
        times: Number of rounds.

    Raises:
        ValueError: If ``step`` is not positive (the sweep would never end).

    Results are grouped under the threshold formatted with two decimals and
    handed to ``self._print_test_result`` and ``self._save_as_csv``.
    """
    if step <= 0:
        raise ValueError('step must be positive, got {!r}'.format(step))

    file_name = 'threshold {} {} quick={} idf={} sampling={} times={}'.format(
        linkage, sim, quick, self.use_idf, sampling, times)
    print(file_name)

    # Repeated ``threshold += step`` accumulates rounding error and can push
    # the sweep one step past ``end_th`` (e.g. 0.5..0.8 by 0.1 would also run
    # 0.9).  Deriving each threshold from an integer index and comparing
    # against a bound with a tiny tolerance keeps the iteration count stable.
    upper_bound = end_th + step - step * 1e-9

    result_table = {}
    for time_counter in range(times):
        print(time_counter)
        articles = self._get_test_articles(sampling)
        index = 0
        threshold = start_th
        while threshold < upper_bound:
            print('threshold', threshold)
            model = HAC(threshold, linkage=linkage, similarity=sim)
            if quick is True:
                clusters = model.quick_fit(articles)
            else:
                clusters = model.fit(articles)
            result = validate_clustering(self._labeled_clusters, clusters)
            key = '{0:.2f}'.format(threshold)
            result_table.setdefault(key, []).append(result)
            index += 1
            threshold = start_th + index * step

    self._print_test_result(result_table)
    self._save_as_csv(result_table, self._feature_mode, file_name)
def compare_extraction(self, args, sampling=False, times=1):
    """Compare feature-extraction settings using quick HAC clustering.

    For each of ``times`` rounds a set of test articles is fetched with
    ``self._get_test_articles(sampling)``.  Every configuration in ``args``
    runs ``self._feature_extractor.fit_with_extraction`` on the articles;
    the ids it returns are removed from ``self._labeled_clusters`` (this
    mutation persists on the instance) before clustering with
    ``HAC(...).quick_fit`` using ``HAC.SIMILARITY_DOT``.  The result of
    ``validate_clustering`` gets the elapsed wall-clock time stored under
    ``'time'``.

    Args:
        args: Iterable of 5-tuples
            ``(method, k, linkage, threshold, with_weight)``.
        sampling: Forwarded to ``self._get_test_articles``.
        times: Number of rounds.

    The per-configuration result lists are handed to
    ``self._print_test_result`` and ``self._save_as_csv``.
    """

    def drop_labeled_article(article_id):
        # Remove the first labeled article carrying this id, then stop:
        # at most one article is dropped per id.
        for labeled in self._labeled_clusters:
            for candidate in labeled['articles']:
                if candidate.id == article_id:
                    labeled['articles'].remove(candidate)
                    return

    file_name = 'extraction {} sampling={} times={}'.format(
        self._number_article_per_test_cluster, sampling, times)
    print(file_name)

    result_table = {}
    for round_index in range(times):
        print(round_index)
        articles = self._get_test_articles(sampling)
        for method, k, linkage, threshold, with_weight in args:
            started = time.time()
            rejected_ids = self._feature_extractor.fit_with_extraction(
                articles, method, k, with_weight=with_weight)
            for rejected_id in rejected_ids:
                drop_labeled_article(rejected_id)

            model = HAC(
                threshold, linkage=linkage, similarity=HAC.SIMILARITY_DOT)
            clusters = model.quick_fit(articles)
            result = validate_clustering(self._labeled_clusters, clusters)
            result['time'] = time.time() - started

            key = 'method{} k={} {} {} weight={}'.format(
                method, k, linkage, threshold, with_weight)
            result_table.setdefault(key, []).append(result)

    self._print_test_result(result_table)
    self._save_as_csv(result_table, 'compare all', file_name)
def find_ratio_threshold(self, method, k, t, c, start_th=0.3, end_th=0.8,
                         step=0.05, sampling=True, times=1):
    """Sweep the HAC threshold on ratio-extracted features.

    For each of ``times`` rounds a set of test articles is fetched with
    ``self._get_test_articles(sampling)`` and passed once through
    ``self._feature_extractor.fit_with_extraction_ratio``.  The articles are
    then clustered with ``HAC(...).quick_fit`` (centroid linkage, dot
    similarity) for every threshold ``start_th, start_th + step, ...`` while
    the threshold stays below ``end_th + step``.

    Args:
        method, k, t, c: Forwarded to ``fit_with_extraction_ratio`` and
            embedded in the file name and result keys.
        start_th: First threshold of the sweep.
        end_th: Last threshold of the sweep.
        step: Distance between thresholds; must be positive.
        sampling: Forwarded to ``self._get_test_articles``.
        times: Number of rounds.

    Raises:
        ValueError: If ``step`` is not positive (the sweep would never end).

    Results are grouped per threshold and handed to
    ``self._print_test_result`` and ``self._save_as_csv``.
    """
    if step <= 0:
        raise ValueError('step must be positive, got {!r}'.format(step))

    file_name = 'ratio th method{} k={} t={} c={} sampling={} times={}'.format(
        method, k, t, c, sampling, times)
    print(file_name)

    # Repeated ``threshold += step`` accumulates rounding error: it can run
    # one step past ``end_th`` and leaks noise such as 0.39999999999999997
    # into the result keys.  Each threshold is therefore derived from an
    # integer index and compared against a bound with a tiny tolerance.
    upper_bound = end_th + step - step * 1e-9

    result_table = {}
    for time_counter in range(times):
        print(time_counter)
        articles = self._get_test_articles(sampling)
        self._feature_extractor.fit_with_extraction_ratio(
            articles, method, k, t, c)
        index = 0
        threshold = start_th
        while threshold < upper_bound:
            print('threshold', threshold)
            clusters = HAC(
                threshold, linkage=HAC.LINKAGE_CENTROID,
                similarity=HAC.SIMILARITY_DOT).quick_fit(articles)
            result = validate_clustering(self._labeled_clusters, clusters)
            # Rounding strips residual float noise from the key without
            # merging distinct thresholds.
            key = 'th{} method{} k{} t{} c{}'.format(
                round(threshold, 10), method, k, t, c)
            result_table.setdefault(key, []).append(result)
            index += 1
            threshold = start_th + index * step

    self._print_test_result(result_table)
    self._save_as_csv(result_table, self._feature_mode, file_name)
def stable_test(self, times=3):
    """Check how stable the clustering results are under input reordering.

    For each of ``times`` rounds the unsampled test articles are fetched
    with ``self._get_test_articles(False)``, shuffled in place, and clustered
    with both ``HAC(0.55).quick_fit`` and ``HAC(0.55).fit``.  Each clustering
    is scored with ``validate_clustering`` against ``self._labeled_clusters``
    and the result is printed.

    Args:
        times: Number of shuffle-and-cluster rounds.

    Results are grouped by the qualified name of the fitting method
    (e.g. ``'HAC.quick_fit'``) and handed to ``self._print_test_result`` and
    ``self._save_as_csv``.
    """
    file_name = 'stable_test times={}'.format(times)
    result_table = {}
    for time_counter in range(times):
        articles = self._get_test_articles(False)
        random.shuffle(articles)
        print('time counter', time_counter)
        for fit_method in (HAC(0.55).quick_fit, HAC(0.55).fit):
            clusters = fit_method(articles)
            result = validate_clustering(self._labeled_clusters, clusters)
            # ``__qualname__`` is the same text the bound-method repr embeds,
            # without depending on the repr's whitespace layout.
            algorithm_name = fit_method.__qualname__
            print(result)
            result_table.setdefault(algorithm_name, []).append(result)

    self._print_test_result(result_table)
    self._save_as_csv(result_table, self._feature_mode, file_name)
def compare_time_feature(self, name, threshold, linkage, sim, sampling=False, times=1):
    """Compare plain HAC against quick HAC with and without time ordering.

    For each of ``times`` rounds a set of test articles is fetched with
    ``self._get_test_articles(sampling)`` and clustered three ways, in this
    order: ``fit``, ``quick_fit(time_order=True)`` and
    ``quick_fit(time_order=False)``.  Each clustering is scored with
    ``validate_clustering`` against ``self._labeled_clusters`` and its
    elapsed wall-clock time is stored under ``result['time']``.

    Args:
        name: Prefix of the output file name.
        threshold: Forwarded to ``HAC``.
        linkage: Forwarded to ``HAC`` as ``linkage``.
        sim: Forwarded to ``HAC`` as ``similarity``.
        sampling: Forwarded to ``self._get_test_articles``.
        times: Number of rounds.

    The per-variant result lists are handed to ``self._print_test_result``
    and ``self._save_as_csv``.
    """
    file_name = '{} {} sampling={} times={}'.format(
        name, self._number_article_per_test_cluster, sampling, times)
    print(file_name)

    # (key template, HAC method name, extra keyword arguments), in run order.
    variants = (
        ('normal {} {} {}', 'fit', {}),
        ('time_order {} {} {}', 'quick_fit', {'time_order': True}),
        ('random {} {} {}', 'quick_fit', {'time_order': False}),
    )

    result_table = {}
    for round_index in range(times):
        print(round_index)
        articles = self._get_test_articles(sampling)
        for key_template, method_name, extra in variants:
            started = time.time()
            model = HAC(threshold, linkage=linkage, similarity=sim)
            clusters = getattr(model, method_name)(articles, **extra)
            key = key_template.format(linkage, threshold, sim)
            result = validate_clustering(self._labeled_clusters, clusters)
            result['time'] = time.time() - started
            result_table.setdefault(key, []).append(result)

    self._print_test_result(result_table)
    self._save_as_csv(result_table, 'compare all', file_name)
def compare(self, sim, quick, args, sampling=False, times=1):
    """Compare (linkage, threshold) pairs for one similarity measure.

    For each of ``times`` rounds a set of test articles is fetched with
    ``self._get_test_articles(sampling)`` and clustered once per pair in
    ``args``.  Each clustering is scored with ``validate_clustering``
    against ``self._labeled_clusters``.

    Args:
        sim: Forwarded to ``HAC`` as ``similarity``.
        quick: ``HAC.quick_fit`` is used only when this is exactly ``True``;
            any other value selects ``HAC.fit``.
        args: Iterable of ``(linkage, threshold)`` pairs.
        sampling: Forwarded to ``self._get_test_articles``.
        times: Number of rounds.

    Results are grouped under ``'<linkage>-<threshold>'`` and handed to
    ``self._print_test_result`` and ``self._save_as_csv``.
    """
    file_name = 'compare {} quick={} sampling={} times={}'.format(
        sim, quick, sampling, times)
    print(file_name)

    result_table = {}
    for round_index in range(times):
        print(round_index)
        articles = self._get_test_articles(sampling)
        for pair in args:
            linkage, threshold = pair
            model = HAC(threshold, linkage=linkage, similarity=sim)
            run_clustering = model.quick_fit if quick is True else model.fit
            clusters = run_clustering(articles)
            result = validate_clustering(self._labeled_clusters, clusters)
            key = '{}-{}'.format(linkage, threshold)
            result_table.setdefault(key, []).append(result)

    self._print_test_result(result_table)
    self._save_as_csv(result_table, self._feature_mode, file_name)
def compare_ratio(self, method, k, args, sampling=True, times=1):
    """Compare ratio-extraction settings using full HAC clustering.

    For each of ``times`` rounds a set of test articles is fetched with
    ``self._get_test_articles(sampling)``.  Every ``(t, c, threshold)``
    triple in ``args`` re-runs
    ``self._feature_extractor.fit_with_extraction_ratio`` on the articles
    and clusters them with ``HAC(...).fit`` (centroid linkage, dot
    similarity).  Each clustering is scored with ``validate_clustering``
    against ``self._labeled_clusters``.

    Args:
        method, k: Forwarded to ``fit_with_extraction_ratio`` and embedded
            in the file name and result keys.
        args: Iterable of ``(t, c, threshold)`` triples.
        sampling: Forwarded to ``self._get_test_articles``.
        times: Number of rounds.

    The per-setting result lists are handed to ``self._print_test_result``
    and ``self._save_as_csv``.
    """
    file_name = 'compare ratio method{} k={} sampling={} times={}'.format(
        method, k, sampling, times)
    print(file_name)

    result_table = {}
    for round_index in range(times):
        print(round_index)
        articles = self._get_test_articles(sampling)
        for setting in args:
            t, c, threshold = setting
            print('t ratio', t)
            self._feature_extractor.fit_with_extraction_ratio(
                articles, method, k, t, c)
            model = HAC(threshold, linkage=HAC.LINKAGE_CENTROID,
                        similarity=HAC.SIMILARITY_DOT)
            clusters = model.fit(articles)
            result = validate_clustering(self._labeled_clusters, clusters)
            # The trailing space in the template is part of the stored key.
            key = 't{} c{} th{} method{} k{} '.format(
                t, c, threshold, method, k)
            result_table.setdefault(key, []).append(result)

    self._print_test_result(result_table)
    self._save_as_csv(result_table, self._feature_mode, file_name)