# Number every word of the inverted index (word_map) and keep the reverse
# lookup (index_word_map). The counters i / j and the four maps are expected
# to have been initialised before this point.
for word in inverse_index:
    word_map[word] = i
    index_word_map[i] = word
    i += 1
    # Element 2 of a word's entry is iterated as that word's documents;
    # each document gets an id the first time it is encountered.
    for doc in inverse_index[word][2]:
        if doc in doc_map:
            continue
        doc_map[doc] = j
        index_doc_map[j] = doc
        j += 1

# Dump the "id document" table, one entry per line.
for i in range(len(index_doc_map)):
    print(str(i) + ' ' + index_doc_map[i])
# print doc_map

# Vocabulary size and document count, separated by a single space.
print('%d %d' % (len(word_map), len(doc_map)))
# print inverse_index

tf_idf_index, tf_idf_dict = tf_idf_all(inverse_index, word_map, doc_map)

# Second numbering pass, this time driven by tf_doc: titles get ids in
# title_map / index_title_map, and the words found under each title get
# their own ids in word_map_1.
title_map = {}
word_map_1 = {}
index_title_map = {}
i = j = 0
for title in tf_doc:
    title_map[title] = i
    index_title_map[i] = title
    # print str(i) + ' ' + title
    i += 1
    for word in tf_doc[title]:
        if word in word_map_1:
            continue
        word_map_1[word] = j
        j += 1
# Assign consecutive integer ids to the words of the inverted index
# (word_map / index_word_map) and to the documents listed under them
# (doc_map / index_doc_map). The maps and the counters i / j come from
# earlier in the script.
for word in inverse_index:
    word_map[word] = i
    index_word_map[i] = word
    i += 1
    # Index 2 of the word's entry is what gets iterated for documents.
    for doc in inverse_index[word][2]:
        if doc in doc_map:
            # Already numbered while handling an earlier word.
            continue
        doc_map[doc] = j
        index_doc_map[j] = doc
        j += 1

# Print each document id next to its document.
for i in range(len(index_doc_map)):
    print(str(i) + ' ' + index_doc_map[i])
# print doc_map

# Number of distinct words, then number of distinct documents.
print('%d %d' % (len(word_map), len(doc_map)))
# print inverse_index

tf_idf_index, tf_idf_dict = tf_idf_all(inverse_index, word_map, doc_map)

# Repeat the numbering for tf_doc: one id per title (title_map and its
# inverse index_title_map) and one id per distinct word (word_map_1).
title_map = {}
word_map_1 = {}
index_title_map = {}
i = j = 0
for title in tf_doc:
    title_map[title] = i
    index_title_map[i] = title
    # print str(i) + ' ' + title
    i += 1
    for word in tf_doc[title]:
        if word in word_map_1:
            continue
        word_map_1[word] = j
        j += 1
if __name__ == '__main__':
    # Driver: load the inverted index from bing_index.json, number its
    # words and documents, build the tf-idf data, run k_means with 5
    # clusters and print the purity() / rand_index() scores.

    # inverse_index('bing', 'bing_index.json')
    # NOTE(review): the commented-out call above suggests a function named
    # inverse_index exists; the assignment below rebinds that name to the
    # loaded data - confirm this shadowing is intended.

    # Use a context manager so the file handle is closed deterministically
    # (the previous bare open() never closed it).
    with open('bing_index.json') as f:
        inverse_index = json.load(f)

    word_map = {}
    doc_map = {}
    i = 0
    j = 0
    for word in inverse_index:
        word_map[word] = i
        i += 1
        # Element 2 of a word's entry is iterated as its documents; a
        # document receives an id only the first time it is seen.
        for doc in inverse_index[word][2]:
            if doc not in doc_map:
                doc_map[doc] = j
                j += 1

    # NOTE(review): elsewhere the result of tf_idf_all is unpacked as
    # (tf_idf_index, tf_idf_dict); confirm a single return value is what
    # this version of tf_idf_all provides.
    tf_idf_index = tf_idf_all(inverse_index, word_map, doc_map)
    print(doc_map)

    # presumably miu holds the cluster centres and w the resulting
    # grouping - verify against k_means.
    miu, w = k_means(tf_idf_index, 5)
    print(w)

    c = get_class(doc_map)
    p = purity(w, c, len(tf_idf_index))
    ri = rand_index(w, c, len(tf_idf_index))
    print('purity = %f ri = %f' % (p, ri))