def cluster_alchemy(dataset, gamma=None, filter=False): doc_proc = dp.DocumentsProcessor(dataset) if gamma: tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_alchemy( gamma=gamma, filter=filter) else: tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_alchemy() print 'starting clustering: found %s document and %s features' \ % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]) linkage_matrix = hr.average(tfidf_matrix.toarray()) t = hr.to_tree(linkage_matrix, rd=True) clusters = {} for node in t[1]: if not node.is_leaf(): l = [] clusters[node.get_id()] = collect_leaf_nodes(node, l) f = f_score(clusters, f_score_dict) l = print_f_score_dict(f) params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0]) params['all_fscore'] = l print 'average f_score: %s' % params['avg_f_score'] return params
def cluster_dandelion_2(dataset, gamma=0.91, filter=False):
    """Duplicate of the Dandelion clustering routine that returns the linkage
    matrix instead of the scoring params.

    (Original Italian note: "duplicato, mi serve solo per tornare la
    linkage_matrix" -- duplicate, only needed to return the linkage matrix.)

    :param dataset: dataset identifier understood by dp.DocumentsProcessor
    :param gamma: weight forwarded to get_data_with_dandelion (default 0.91)
    :param filter: forwarded to get_data_with_dandelion (name kept for
        backward compatibility although it shadows the builtin)
    :return: the scipy linkage matrix of the LSA-reduced tf-idf matrix
    """
    doc_proc = dp.DocumentsProcessor(dataset)
    # Fixed: was `if gamma:`; an explicit gamma=0 is now forwarded instead of
    # being silently dropped.
    if gamma is not None:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_dandelion(
            gamma=gamma, filter=filter)
    else:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_dandelion()

    # LSA: truncated SVD followed by row re-normalisation.
    # NOTE(review): n_components is set to the number of documents; confirm
    # this is intended, as TruncatedSVD normally takes a value < n_features.
    svd = TruncatedSVD(tfidf_matrix.shape[0])
    lsa = make_pipeline(svd, Normalizer(copy=False))
    tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    linkage_matrix = hr.average(tfidf_matrix)
    t = hr.to_tree(linkage_matrix, rd=True)

    # Candidate clusters: leaves under every internal dendrogram node.
    clusters = {}
    for node in t[1]:
        if not node.is_leaf():
            clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)
    all_fscore = print_f_score_dict(f)
    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = all_fscore
    return linkage_matrix
def scipy_algo(dataset, abstract=False): doc_proc = dp.DocumentsProcessor(dataset) tfidf_matrix, f_score_dict = doc_proc.get_data(abstract) svd = TruncatedSVD(tfidf_matrix.shape[0]) lsa = make_pipeline(svd, Normalizer(copy=False)) #tfidf_matrix = lsa.fit_transform(tfidf_matrix) print 'starting clustering after lsa: found %s document and %s features' \ % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]) linkage_matrix = hr.average(tfidf_matrix.toarray()) #linkage_matrix = hr.average(tfidf_matrix) t = hr.to_tree(linkage_matrix, rd=True) clusters = {} for node in t[1]: if not node.is_leaf(): l = [] clusters[node.get_id()] = collect_leaf_nodes(node, l) f = f_score(clusters, f_score_dict) print_f_score_dict(f) avg_f_score = average_f_score(f, tfidf_matrix.shape[0]) print 'average f_score: %s' % avg_f_score return avg_f_score
def cluster_dandelion_entities(dataset, gamma=None, filter=False): doc_proc = dp.DocumentsProcessor(dataset) if gamma: tfidf_matrix, f_score_dict, params = doc_proc.get_data_only_with_entities( gamma=gamma, filter=filter) else: tfidf_matrix, f_score_dict, params = doc_proc.get_data_only_with_entities( ) doc, features = tfidf_matrix.shape print 'starting clustering: found %s document and %s features' \ % (doc, features) svd = TruncatedSVD(tfidf_matrix.shape[0]) lsa = make_pipeline(svd, Normalizer(copy=False)) tfidf_matrix = lsa.fit_transform(tfidf_matrix) print 'starting clustering: found %s document and %s features after LSA' \ % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]) #linkage_matrix = hr.average(tfidf_matrix.toarray()) linkage_matrix = hr.average(tfidf_matrix) t = hr.to_tree(linkage_matrix, rd=True) clusters = {} for node in t[1]: if not node.is_leaf(): l = [] clusters[node.get_id()] = collect_leaf_nodes(node, l) f = f_score(clusters, f_score_dict) l = print_f_score_dict(f) params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0]) params['all_fscore'] = l print 'average f_score: %s' % params['avg_f_score'] return params
def scipy_algo(dataset): doc_proc = dp.DocumentsProcessor(dataset) tfidf_matrix, f_score_dict = doc_proc.get_data() linkage_matrix = hr.average(tfidf_matrix.toarray()) t = hr.to_tree(linkage_matrix, rd=True) clusters = {} for node in t[1]: if not node.is_leaf(): l = [] clusters[node.get_id()] = collect_leaf_nodes(node, l) f = f_score(clusters, f_score_dict) print_f_score_dict(f) print 'average f_score: %s' % average_f_score(f, tfidf_matrix.shape[0])
def cluster_fabio(db, dataset, gamma=None, with_lsa=False, ranking_metric='r'): doc_proc = dp.DocumentsProcessor(dataset, db=db) if gamma: tfidf_matrix, f_score_dict, params = doc_proc.get_data_fabio( rank_metric=ranking_metric, gamma=gamma) else: tfidf_matrix, f_score_dict, params = doc_proc.get_data_fabio( rank_metric=ranking_metric) doc, features = tfidf_matrix.shape print 'starting clustering: found %s document and %s features' \ % (doc, features) if with_lsa: svd = TruncatedSVD(tfidf_matrix.shape[0]) lsa = make_pipeline(svd, Normalizer(copy=False)) tfidf_matrix = lsa.fit_transform(tfidf_matrix) linkage_matrix = hr.average(tfidf_matrix) else: linkage_matrix = hr.average(tfidf_matrix.toarray()) t = hr.to_tree(linkage_matrix, rd=True) clusters = {} for node in t[1]: if not node.is_leaf(): l = [] clusters[node.get_id()] = collect_leaf_nodes(node, l) f = f_score(clusters, f_score_dict) l = print_f_score_dict(f) params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0]) params['all_fscore'] = l print 'average f_score: %s' % params['avg_f_score'] return params
def scipy_algo_cosine(dataset, abstract=False):
    '''
    Currently clustering with cosine similarity does not work: scipy appears
    to have a bug that returns a negative distance, see
    https://github.com/scipy/scipy/issues/5208
    (Original note was in Italian; translated.)

    :param dataset: dataset identifier passed to dp.DocumentsProcessor
    :param abstract: forwarded to DocumentsProcessor.get_data
    :return: None (the function bails out early; see the `return` below)
    '''
    doc_proc = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = doc_proc.get_data(abstract)
    # Cosine-distance average linkage; this is where the negative distances
    # mentioned in the docstring appear.
    linkage_matrix = hr.linkage(tfidf_matrix.todense(), method='average', metric='cosine')
    print linkage_matrix.shape
    # Diagnostic: how many linkage entries came out negative.
    print linkage_matrix[linkage_matrix < 0].shape
    # Deliberate early exit: everything below is unreachable until the scipy
    # issue referenced above is resolved.
    return
    t = hr.to_tree(linkage_matrix, rd=True)
    clusters = {}
    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)
    f = f_score(clusters, f_score_dict)
    print_f_score_dict(f)
    print 'average f_score: %s' % average_f_score(f, tfidf_matrix.shape[0])