예제 #1
0
def print_clusters(clusters, no_of_docs):
    """
    
    Prints summary statistics about clusters of documents: the IDF
    filter value and common-terms threshold in use, the number of
    clusters, how many distinct documents could be clustered, average
    and median cluster size, the ten biggest cluster sizes and their
    coverage, and the rate of documents clustered.
    
    @param clusters: A non-empty list of two-item entries. The first
                     item holds the document numbers clustered
                     together; the second item (presumably the common
                     terms of that cluster) is ignored here.
    @param no_of_docs: The number (int) of documents the collection we
                       clustered consists of. Must not be 0.
    @raise VoidStructureError: If clusters is empty or no_of_docs is 0.
                       
    """
    if not clusters or no_of_docs == 0:
        raise VoidStructureError("Please provide non-zero/empty values.")
    
    set_of_docs_clustered = set()
    cluster_sizes = []
    for docs, _ in clusters:
        cluster_sizes.append(len(docs))
        set_of_docs_clustered.update(docs)
    cluster_sizes.sort()
    
    no_of_clusters = len(cluster_sizes)
    no_of_docs_clustered = len(set_of_docs_clustered)
    rate_of_docs_clustered = float(no_of_docs_clustered) / no_of_docs
    
    # True median: with an even number of clusters take the mean of the
    # two middle sizes instead of just the upper one.
    middle = no_of_clusters // 2
    if no_of_clusters % 2:
        median_cluster_size = cluster_sizes[middle]
    else:
        median_cluster_size = \
            (cluster_sizes[middle - 1] + cluster_sizes[middle]) / 2.0
    
    # Sizes are sorted ascending, so the reversed head is the top ten.
    ten_biggest_clusters = cluster_sizes[::-1][:10]
    
    # Every cluster may carry an empty document list; avoid dividing by 0.
    if no_of_docs_clustered:
        coverage = sum(ten_biggest_clusters) / float(no_of_docs_clustered)
    else:
        coverage = 0.0
    
    # Single-argument print(...) with %-formatting yields the same text as
    # the comma-separated print statement and is valid in Python 2 and 3.
    print("Lowest IDF value considered for terms: %s"
          % (get_def_idf_filter_val(),))
    print("Number of feature terms used to cluster: %s"
          % (get_def_common_terms_no(),))
    print("Number of clusters built: %s" % (len(clusters),))
    
    print("Number of docs clustered: %s / %s"
          % (no_of_docs_clustered, no_of_docs))
    print("Average cluster size: %s"
          % (sum(cluster_sizes) / float(no_of_clusters),))
    print("Median cluster size: %s" % (median_cluster_size,))
    print("Ten biggest cluster sizes: %s" % (ten_biggest_clusters,))
    print("Coverage of ten biggest clusters over docs clustered: %s"
          % (coverage,))
    print("Rate of docs clustered: %s" % (rate_of_docs_clustered,))
예제 #2
0
def process_project(tfidf_matrix_file, xmlcollection):
    """
    Here starts the classification upon the TF*IDF matrix.
    
    Every document is compared with each document that follows it in
    the positional index. Whenever two documents share at least
    get_def_common_terms_no() terms, both are put into the soft cluster
    of the first document. The soft clusters are then filtered with
    filter_subsets(), turned into hard clusters with
    create_hard_clusters(), and statistics for both are printed before
    both are written out via write_clusters().
    
    @param tfidf_matrix_file: Passed to get_positional_index() to obtain
                              one row of terms per document.
    @param xmlcollection: Collection object; get_docs() is used to
                          count the documents and the object is passed
                          on to write_clusters().
    """
    pos_idx = get_positional_index(tfidf_matrix_file)
    no_of_docs = len(xmlcollection.get_docs())
    min_common_terms = get_def_common_terms_no()
    soft_clusters = list() # In here create soft clusters
    
    # Build each document's term set once instead of once per comparison.
    term_sets = [set(doc_line) for doc_line in pos_idx]
    
    # Never look past the rows that actually exist in the index.
    docs_to_compare = min(len(term_sets), no_of_docs)
    
    # The last document has no successor to compare to, hence the "- 1".
    for doc_idx1 in range(docs_to_compare - 1):
        terms1 = term_sets[doc_idx1]
        doc_no1 = doc_idx1 + 1 # Document numbers are 1-based
        soft_cluster = set()
        soft_cluster_common_terms = set()
        
        # Compare with every following document INCLUDING the last one;
        # stopping one short would leave the last document unclustered.
        for doc_idx2 in range(doc_idx1 + 1, docs_to_compare):
            common_terms = terms1 & term_sets[doc_idx2]
            # NOTE(review): terms shared with documents that stay below
            # the threshold are collected as well -- confirm intended.
            soft_cluster_common_terms |= common_terms
            
            if len(common_terms) >= min_common_terms:
                # Adding to a set is idempotent, so doc_no1 needs no flag.
                soft_cluster.add(doc_no1)
                soft_cluster.add(doc_idx2 + 1)
        
        if soft_cluster:
            soft_clusters.append([tuple(sorted(soft_cluster)),
                                  tuple(sorted(soft_cluster_common_terms))])
    
    # Print found soft cluster groups
    print_line()
    soft_clusters = filter_subsets(soft_clusters, nested=True)
    print("Soft clustering (statistics): ")
    print_clusters(soft_clusters, no_of_docs)
    print_line()
    
    # Print found hard cluster groups
    print("Hard clustering (statistics): ")
    hard_clusters = create_hard_clusters(soft_clusters, no_of_docs)
    print_clusters(hard_clusters, no_of_docs)
    
    # Write found soft & hard clusters
    base_clust_dir = get_clustdir()
    write_clusters(xmlcollection, soft_clusters, base_clust_dir)
    write_clusters(xmlcollection, hard_clusters, base_clust_dir, type_='hard')