Пример #1
0
def main():
    '''Command-line entry point: print per-newsgroup IDFs, then the cluster tree.

    Usage: ``<script> [--debug] <source_dir>``

    The last command-line argument is used as the source directory.
    ``--debug`` sets the module-level ``DEBUG`` flag, which additionally
    dumps every newsgroup's IDF table here (and is read by other functions
    in this module).

    On a malformed command line, or when no positional argument is given,
    ``err()`` is called and the function stops.
    '''
    # Declared up-front so the assignment below rebinds the module-level flag.
    global DEBUG

    try:
        # getopt expects a *string* of short options and a *list* of long ones.
        opts, args = getopt.getopt(sys.argv[1:], "", ["debug"])
    except getopt.GetoptError:
        err()
        # NOTE(review): err() presumably terminates the program; the explicit
        # return keeps us from continuing with unbound names if it does not.
        return

    if not args:
        err()
        return

    # getopt stops at the first non-option, so the last positional argument
    # is also the last element of sys.argv.
    source_dir = args[-1]

    for opt, _ in opts:
        if opt == "--debug":
            DEBUG = True

    # Single-argument print(...) behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    print("***************************************************************")
    print("Newsgroups with IDFs")
    newsgroups_IDFs = computeDocumentFrequency(source_dir)
    for newsgroup, IDF in newsgroups_IDFs.items():
        print(newsgroup)
        if DEBUG:
            print(dumps(IDF, sort_keys=True, indent=4))

    print("***************************************************************")
    print("hClustering")
    newsgroup_TFIDF = computeNewsGroupCategory(source_dir)
    root = hCluster(newsgroup_TFIDF)
    print(tree.printChildren(root))
Пример #2
0
def hCluster(S):
    '''Build a dendrogram by repeatedly merging the two most similar categories.

    S is a dict mapping a category key to the value that ``cosineSimilarity``
    and ``merge`` operate on (presumably a TF-IDF vector -- confirm against
    the caller).  On every round the pair of entries with the highest cosine
    similarity is removed from S and replaced by a single entry whose key is
    a new ``tree.Node`` and whose value is the merge of the two originals.

    NOTE: S is modified in place; when the function returns it holds at most
    one entry.

    Returns the root node of the tree, or None when S is empty.
    '''
    if not S:
        return None

    if len(S) == 1:
        # Nothing to merge: the single category is the whole tree.
        # NOTE(review): returning a leaf here is a choice made for this edge
        # case (previously it raised UnboundLocalError) -- confirm it suits
        # the callers.
        for only_key in S:
            return tree.assertCreateNode(only_key)

    node = None
    while len(S) > 1:
        best_score = None
        selected = None

        # 1) find the two most similar elements e1 and e2 in S using cos()
        for k, v in S.items():
            for k_, v_ in S.items():
                if k_ is k:
                    continue
                score = cosineSimilarity(v, v_)
                # Starting from "no score yet" (instead of 0.0) guarantees a
                # pair is always selected, even if every similarity is <= 0.
                if best_score is None or score > best_score:
                    best_score = score
                    selected = (k, k_)

        first, second = selected

        # Create the child nodes only for the winning pair, not for every
        # candidate that was temporarily the best.
        right = tree.assertCreateNode(second)
        left = tree.assertCreateNode(first)

        # 2) replace them in S with e1Ve2
        parent = str(first) + " && " + str(second)
        if DEBUG:
            # Spacing reproduces the output of the former Python 2
            # `print "label : ", value` statements.
            print("parent :  %s" % (parent,))
            print("right :  %s" % (right,))
            print("left :  %s" % (left,))

        node = tree.Node(parent, right, left)
        # Merge the vectors of the *selected* pair; the loop variables v/v_
        # merely hold whatever entry happened to be iterated last.
        merged = merge(S[first], S[second])
        del S[first]
        del S[second]
        S[node] = merged

        if DEBUG:
            print("\n")
            print("Tree: ")
            print(tree.printChildren(node))
            print("\n\n")

    return node