def create_soft_clusters(doc_cluster_pairs): """ Caution: May become deprecated @param doc_cluster_pairs: Pairs clustered together Transitively create soft-bordered cluster groups, which can be used to "see" which documents belong semantically -- directly or indirectly -- together. Use for that the clustered pairs. Problem: There are documents ("vertices") which belong to several clusters, which may be indicators of a topic shift. """ soft_clusters = list() paths_list = list() no_of_cluster_pairs = len(doc_cluster_pairs) max_doc_pair_idx = no_of_cluster_pairs - 1 doc_pair_idx1 = 0 print "Making out soft clusters ... may take a while." pb = ProgressBar(maxval=no_of_cluster_pairs).start() for doc_pair, _ in doc_cluster_pairs: doc_pair1 = set(doc_pair) path_vertices = doc_pair1 doc_pair_idx2 = doc_pair_idx1 + 1 # No more paths to find if maximum index is reached if (doc_pair_idx1 == max_doc_pair_idx): break while True: # Last element reached before; break loop if(doc_pair_idx2 == max_doc_pair_idx): break doc_pair2 = set(doc_cluster_pairs[doc_pair_idx2][0]) # If there's a path to a pair, add it to the vertices' list if len(path_vertices.intersection(doc_pair2)) == 1: path_vertices = path_vertices.union(doc_pair2) doc_pair_idx2 += 1 path = tuple(sorted(path_vertices)) paths_list.append(path) # Remove duplicates and subsets # => remove paths fully or partly contained in other paths paths_list = filter_subsets(paths_list) doc_pair_idx1 += 1 pb.update(doc_pair_idx1) # XXX: To be put into seperate function. print "\n", "Number of unique paths found:", len(paths_list) for path in paths_list: print path return soft_clusters
def process_project(tfidf_matrix_file, xmlcollection): """ Here starts the classification upon the TF*IDF matrix. """ pos_idx = get_positional_index(tfidf_matrix_file) no_of_docs = len(xmlcollection.get_docs()) cluster_pairs = list() # In here create cluster pairs soft_clusters = list() # In here create soft clusters doc_idx1 = 0 max_doc_idx = no_of_docs - 1 for doc_line1 in pos_idx: doc_idx2 = doc_idx1 + 1 # Do comparison as of next document terms1 = set(doc_line1) common_terms = set() soft_cluster = set() soft_cluster_common_terms = set() # Last document doesn't have other document to compare to; # break loop if(doc_idx1 == max_doc_idx): break already_added = False while True: # Break loop if last document reached to compare to # already reached before if(doc_idx2 == max_doc_idx): break terms2 = set(pos_idx[doc_idx2]) common_terms = terms1.intersection(terms2) soft_cluster_common_terms = \ soft_cluster_common_terms.union(common_terms) if len(common_terms) >= get_def_common_terms_no(): doc_no1 = doc_idx1 + 1 doc_no2 = doc_idx2 + 1 clustered_doc_pair = [doc_no1, doc_no2] if already_added == False: soft_cluster.add(doc_no1) already_added = True soft_cluster.add(doc_no2) cluster_pairs.append([clustered_doc_pair, common_terms]) doc_idx2 += 1 if len(soft_cluster) > 0: soft_clusters.append([tuple(sorted(soft_cluster)), tuple(sorted(soft_cluster_common_terms))]) doc_idx1 += 1 # Print found soft cluster groups print_line() soft_clusters = filter_subsets(soft_clusters, nested=True) print "Soft clustering (statistics): " print_clusters(soft_clusters, no_of_docs) print_line() # Print found hard cluster groups print "Hard clustering (statistics): " hard_clusters = create_hard_clusters(soft_clusters, no_of_docs) print_clusters(hard_clusters, no_of_docs) # Write found soft & hard clusters base_clust_dir = get_clustdir() write_clusters(xmlcollection, soft_clusters, base_clust_dir) write_clusters(xmlcollection, hard_clusters, base_clust_dir, type_='hard')