예제 #1
0
    # Give every word key of inverse_index a consecutive integer id, and do the
    # same for every document found under inverse_index[word][2]. The index_*
    # dicts store the reverse (id -> name) direction.
    # NOTE(review): word_map, index_word_map, doc_map, index_doc_map, i and j
    # are initialised above this excerpt -- presumably empty dicts and 0; confirm.
    for word in inverse_index:
        word_map[word] = i
        index_word_map[i] = word
        i = i + 1
        # Element 2 of each entry is iterated as that word's documents
        # (entry layout is not shown here -- verify against the index builder).
        for doc in inverse_index[word][2]:
            if doc not in doc_map:
                # First time this document is seen: hand out the next id.
                doc_map[doc] = j
                index_doc_map[j] = doc
                j = j + 1

    # Debug dump of the document table as "<id> <doc>" lines.
    # The lookup index_doc_map[i] relies on the ids being exactly
    # 0..len-1, i.e. on j having started at 0 (assumption -- confirm).
    # This loop rebinds i; i is reset to 0 below before it is reused.
    for i in range(len(index_doc_map)):
        print str(i) + ' ' + index_doc_map[i]
    #print doc_map
    # Number of distinct words and distinct documents.
    print len(word_map), len(doc_map)
    #print inverse_index
    # tf_idf_all is defined elsewhere; here its result is unpacked into two
    # values (exact shapes are not visible from this excerpt).
    tf_idf_index, tf_idf_dict = tf_idf_all(inverse_index, word_map, doc_map)

    # Second pass over tf_doc (defined outside this excerpt): number each
    # title key, and number each distinct word found under any title.
    title_map = {}
    word_map_1 = {}
    index_title_map = {}
    i = 0
    j = 0
    for title in tf_doc:
        title_map[title] = i
        index_title_map[i] = title
        #print str(i) + ' ' + title
        i = i + 1
        for word in tf_doc[title]:
            if word not in word_map_1:
                # New word: assign the next id from the j counter.
                word_map_1[word] = j
                j = j + 1
예제 #2
0
 # Number the words of inverse_index and the documents listed under
 # inverse_index[word][2] with consecutive integers, keeping both the
 # name -> id maps and the id -> name maps (index_*).
 # NOTE(review): the dicts and the counters i / j are set up before this
 # excerpt -- presumably {} and 0; confirm against the full function.
 for word in inverse_index:
     word_map[word] = i
     index_word_map[i] = word
     i = i + 1
     # Position 2 of each entry is walked as the documents of this word
     # (the entry schema is not visible here -- TODO confirm).
     for doc in inverse_index[word][2]:
         if doc not in doc_map:
             # Unseen document: record it under the next free id.
             doc_map[doc] = j
             index_doc_map[j] = doc
             j = j + 1 
 
 # Print every document as "<id> <doc>". Indexing index_doc_map by
 # range(len(...)) assumes ids run 0..len-1 (depends on j starting at 0).
 # Note that i is rebound by this loop and reset to 0 further down.
 for i in range(len(index_doc_map)):
     print str(i) + ' ' + index_doc_map[i]
 #print doc_map
 # Counts of distinct words and distinct documents.
 print len(word_map),len(doc_map)
 #print inverse_index
 # tf_idf_all lives elsewhere; this call site unpacks two return values.
 tf_idf_index,tf_idf_dict= tf_idf_all(inverse_index,word_map,doc_map) 
 
 # Build title -> id / id -> title maps from tf_doc (defined outside this
 # excerpt) and a word -> id map over all words appearing under its titles.
 title_map = {}
 word_map_1 = {}
 index_title_map = {}
 i = 0
 j = 0
 for title in tf_doc:
     title_map[title] = i
     index_title_map[i] = title
     #print str(i) + ' ' + title
     i = i + 1
     for word in tf_doc[title]:
         if word not in word_map_1:
             # Word not numbered yet: give it the next id.
             word_map_1[word] = j
             j = j + 1 
예제 #3
0
if __name__ == '__main__':
    # Script entry point: load the inverted index stored in bing_index.json,
    # number its words and documents, build the tf-idf data, run k_means with
    # 5 clusters and print the purity / rand_index scores of the result.
    # NOTE(review): tf_idf_all, k_means, get_class, purity and rand_index are
    # defined elsewhere in the project; their contracts are not visible here.

    # Disabled call kept from the original; presumably it regenerates
    # bing_index.json -- confirm before re-enabling.
    #inverse_index('bing','bing_index.json')

    # Use a context manager so the file handle is closed even when JSON
    # parsing raises (the previous bare open() never closed it).
    with open('bing_index.json') as f:
        inverse_index = json.load(f)

    word_map = {}
    doc_map = {}
    for word_id, word in enumerate(inverse_index):
        word_map[word] = word_id
        # Element 2 of each entry is iterated as the documents of this word.
        for doc in inverse_index[word][2]:
            # len(doc_map) is always the next unused id; setdefault leaves
            # documents that were already numbered untouched.
            doc_map.setdefault(doc, len(doc_map))

    tf_idf_index = tf_idf_all(inverse_index, word_map, doc_map)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(doc_map)

    miu, w = k_means(tf_idf_index, 5)
    print(w)

    c = get_class(doc_map)

    # Both scores are computed from the k_means output w, the labels c and
    # the number of tf-idf entries.
    sample_count = len(tf_idf_index)
    p = purity(w, c, sample_count)
    ri = rand_index(w, c, sample_count)

    print('purity = %f ri = %f' % (p, ri))