def main(): folders=[i for i in os.listdir(pathi) if not i.startswith(".")] folders=['files9_output_0102'] print "We have {} folders".format(len(folders)) featuredict=dictmaker(folders) wordmatrix_without_cat, wordmatrix_with_cat, catdicti = matrixmachine(folders, featuredict, "category1") x=clustermachine(wordmatrix_without_cat, scipy.cluster.vq.kmeans2) print x f=[(i.name, i.no_of_clusters) for i in x] g=[ct.Clusteringstats(wordmatrix_with_cat, type(i), i.labels).size_of_clusters() for i in x] #print g h=[len(ct.Clusteringstats(wordmatrix_with_cat, type(i), i.labels).cluster_features()) for i in x] print "no of clusters", h g=[ct.Centroidstats(wordmatrix_with_cat, i.name, i.labels, i.centroids, i.centroids)._centroiddictmaker() for i in x] test=x[0] #print test #g=ct.Centroidstats(wordmatrix_with_cat, test.name, test.labels,i.centroids ) #print g #t=g.cluster_predictors(featuredict) # t=Partitionsimilarity(x[0], x[0]) # print t.compare_partitions() t=[Categorystats(wordmatrix_with_cat, type(i), i.labels).size_of_categories() for i in x] print t
def word2vecclustermaker(model, no_clusters, distance_metric): """ This takes a word2vec model created by gensim.Word2Vec, a list of cluster numbers to compute, and a distance metric to use. The resulting clusters are output as json files formatted {cluster X: {indexes:[], words:[], vectors:[], cluster Y:{}}. It prints the silhouette score and the distribution of words over clusters. """ for k in no_clusters: print "clustering" #clustering=AgglomerativeClustering(n_clusters=k, affinity=distance_metric, linkage='average') clustering = KMeans(n_clusters=k, max_iter=1000, n_init=100) result = clustering.fit(newmod.syn0) clusteringstats = ct.Clusteringstats(newmod.syn0, newmod.syn0, result, result.labels_) print clustering print clusteringstats.size_of_clusters() print "silhouette", clusteringstats.cluster_silhouette(distance_metric) clusterdict = defaultdict(dict) for item in range(0, k): #print item clusterdict[item]['indexes'] = [ i for i, x in enumerate(result.labels_) if x == item ] clusterdict[item]['words'] = [ e for e in newmod.vocab.keys() for i in clusterdict[item]['indexes'] if newmod.vocab[e].__dict__['index'] == i ] #clusterdict[item]['vectors']=[newmod[e].tolist() for e in clusterdict[item]['words']] with codecs.open( os.path.join( "outputfiles", "clusters_" + str(k) + "_" + time.strftime("%H_%M_%m_%d") + ".json"), "w", "utf-8") as outputfile: json.dump(clusterdict, outputfile) print "output to ", "clusters_" + str(k) + "_" + time.strftime( "%H_%M_%m_%d") + ".json"
def main(distance_metric, threshold, testmode=False):
    """Cluster bag-of-words document vectors and print a full report.

    Parameters (inferred from use -- confirm against callers):
        distance_metric -- metric name forwarded to the ct.* helpers;
                           'manhattan' is translated to scipy's 'cityblock'
                           only for the prototype-document lookup.
        threshold -- feature cutoff forwarded to dictmaker().
        testmode -- forwarded to matrixmachine(); presumably limits input
                    size for test runs -- TODO confirm.

    Relies on module globals: pathi, dictmaker, matrixmachine,
    clustermachine, ct, os, time.  Python 2 print statements throughout.
    """
    starttime = time.time()
    #make this flexible in case there are no subfolders
    folders = [i for i in os.listdir(pathi) if not i.startswith(".")]
    print ", ".join(folders)
    print "Items in folders", ", ".join([str(len(os.listdir(os.path.join(pathi, f)))) for f in folders])
    #folders=['files9_output_0102']#, 'files9_output_0102', 'files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102', 'files9_output_0102']
    print "We have {} folders".format(len(folders))
    featuredict = dictmaker(folders, threshold, remove_stopwords=True, remove_punct=True)
    wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(folders, featuredict, testmode, "category1")
    # normalize / clean both matrices; note outlier removal is ON here
    wordmatrix_without_cat, wordmatrix_with_cat = ct.matrixstats(wordmatrix_without_cat, wordmatrix_with_cat, distance_metric, zscores=False, outlier_removal=True, outlier_threshold=2, median_metric='median')
    #apply to wordmatrix with cats
    x = clustermachine(wordmatrix_without_cat, distance_metric, 4)
    #print [(i.name, i.no_of_clusters) for i in x]
    # keys that are bookkeeping entries, not category ids
    excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats']
    print "These clusterings have less than 2 clusters\n{}\n\n".format("\n".join([str(c.name) for c in x if c.no_of_clusters < 2]))
    #PRINTING STUFF
    headline = "\n\n-----------\n\n"
    print "Working with {} distance metric".format(distance_metric)
    #CROSS CLUSTERING COMPARISON
    # only clusterings that actually produced >1 cluster are reported
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels)
        sili = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric)
        #GENERAL STATS
        print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(clustering.getname()[1], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).size_of_clusters()
        catstats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cats_per_cluster()
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(cluster, stats[cluster], round(float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                # reverse-lookup of the category name from its numeric id
                print "{} items of category {} make up {} % of this cluster".format(catstats[cluster][cat], "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), round(catstats[cluster][cat] / catstats[cluster]['total'] * 100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories()
        #STATS PER CAT
        print headline, "Statistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format("".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total'])
            for entry in [i for i in cats[cat]['cat_per_cluster'] if not i in excludelist]:
                print "{} items or {} percent in cluster {}".format(cats[cat]['cat_per_cluster'][entry], round(float(cats[cat]['cat_per_cluster'][entry]) / float(cats[cat]['total']) * 100), entry)
        #PREDICTIVE FEATURES
        print headline, "Strongly predictive features are"
        cents = ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).cluster_predictors(featuredict)
        if cents:
            for diff in cents:
                print "\nRaw Scores"
                # i[::-1] reverses each (feature, score) pair; top 10 shown
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['raw_diff']][:10]))
                print "Zscores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['zscores_diff']][:10]))
        #PROTOTYPES
        print headline, "Here is a typical document for each cluster"
        # scipy names the manhattan metric 'cityblock'
        distance = distance_metric
        if distance_metric == 'manhattan':
            distance = 'cityblock'
        print "We set the distance metric to {}".format(distance)
        docs = ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).central_documents(wordmatrix_with_cat, filedicti)
        if docs:
            for cluster in docs:
                print "\nCLUSTER {} \n".format(cluster)
                # first entry is the file closest to the centroid
                with open(docs[cluster][distance][0]) as f:
                    print f.read()
                if len(docs[cluster][distance]) > 8:
                    print "\nOther files close by in cluster {}:\n".format(cluster)
                    print ("{}\n" * 8).format(*docs[cluster][distance][1:9])
    print headline, "Comparing clusterings"
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric)))
    #all input does it just concatenate name + cluster
    # and supply clustering object to similarity measurement
    # (NOTE: 'input' shadows the builtin)
    input = [(str(type(i.name)).split(".")[3].rstrip("'>") + "--" + str(i.no_of_clusters), i) for i in x]
    simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input)
    options = ['adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim', 'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim']
    for o in options:
        print "\n---\n"
        ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input).similarity_matrix(o)
        print "\n---\n"
    endtime = time.time()
    process = endtime - starttime
    print headline, "This took us {} minutes".format(process / 60)
    #or do we want to do predictive features and typical document per cluster as well????
    # macOS-only audible completion notice
    os.system('say "your program has finished"')
def main(): starttime = time.time() folders = [i for i in os.listdir(pathi) if not i.startswith(".")] print ", ".join(folders) print ", ".join( [str(len(os.listdir(os.path.join(pathi, f)))) for f in folders]) folders = [ 'files9_output_0102' ] #, 'files9_output_0102', 'files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102', 'files9_output_0102'] print "We have {} folders".format(len(folders)) featuredict = dictmaker(folders, 5000) wordmatrix_without_cat, wordmatrix_with_cat, catdicti = matrixmachine( folders, featuredict, "category1") #self.matrix_with_cats=matrix_with_cats #data frame including "gold labels" #self.matrix_without_cats=matrix_with_cats[:,1:] #data frame without "gold labels" x = clustermachine(wordmatrix_without_cat, 2) print[(i.name, i.no_of_clusters) for i in x] #print [i.name for i in x] excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats'] for clustering in x: cati = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels) sili = ct.Clusteringstats(wordmatrix_with_cat, clustering.name, clustering.labels).cluster_silhouette() # print cati.size_of_categories() print "\n\n-----------\n\nClustering called {} has {} clusters".format( clustering.getname()[0], clustering.no_of_clusters) print "Its silhouette score is {}".format(str(sili)) stats = ct.Clusteringstats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_clusters() catstats = ct.Clusteringstats(wordmatrix_with_cat, clustering.name, clustering.labels).cats_per_cluster() for cluster in stats: print "\nCluster {} contains {} items, {} % of the total".format( cluster, stats[cluster], round( float(stats[cluster]) / len(wordmatrix_without_cat) * 100)) for cat in [i for i in catstats[cluster] if not i in excludelist]: print "{} items of category {} make up {} % of this cluster".format( catstats[cluster][cat], "".join( [i[0] for i in catdicti.items() if i[1] == int(cat)]), 
round(catstats[cluster][cat] / catstats[cluster]['total'] * 100)) cats = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories() print "\n\n-----------\n\nStatistics per category" for cat in [i for i in cats if not i in excludelist]: print "\nCategory {} has {} items".format( "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total']) for entry in [ i for i in cats[cat]['cat_per_cluster'] if not i in excludelist ]: print "{} items or {} percent in cluster {}".format( cats[cat]['cat_per_cluster'][entry], round( float(cats[cat]['cat_per_cluster'][entry]) / float(cats[cat]['total']) * 100), entry) print "\n\n-----------\n\nStronly predictive features are" cents = ct.Centroidstats( clustering.name, clustering.labels, clustering.centroids).cluster_predictors(featuredict) for diff in cents: print "\n Raw Scores" print "{} differentiate {} and {}\n".format( ", ".join([ " : ".join(map(str, i[::-1])) for i in cents[diff]['raw_diff'] ]), diff[0], diff[1]) print "Zscores" print "{} differentiate {} and {}".format( ", ".join([ " : ".join(map(str, i[::-1])) for i in cents[diff]['zscores_diff'] ]), diff[0], diff[1]) "We can also add equivalent features if we want" "And stems and whatnot" print "\n\n-----------\n\nHere is a typical document for each cluster" endtime = time.time() process = endtime - starttime print "This took us {} minutes".format(process / 60)
def main(distance_metric, testmode=False):
    """Reporting driver with category-size-based exclusion (appears WIP).

    NOTE(review): as written this function raises NameError immediately --
    ``x``, ``wordmatrix_with_cat``, ``wordmatrix_without_cat``, ``catdicti``,
    ``featuredict`` and ``filedicti`` are never defined here; the
    clustermachine call that would produce ``x`` is commented out below.
    Presumably these were globals in an interactive session, or the commented
    line is meant to be restored -- confirm before use.
    """
    starttime = time.time()
    #x=ct.clustermachine(wordmatrix_without_cat,distance_metric,4)
    print "These clusterings have less than 2 clusters\n{}\n\n".format("\n".join([str(c.name) for c in x if c.no_of_clusters < 2]))
    #PRINTING STUFF
    headline = "\n\n-----------\n\n"
    print "Working with {} distance metric".format(distance_metric)
    #v is a number, k a word
    # also exclude categories with fewer than 100 rows in the matrix
    excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats'] + [v for k, v in catdicti.items() if wordmatrix_with_cat[wordmatrix_with_cat[:, 0] == v].shape[0] < 100]
    print "excludelist", excludelist
    #CROSS CLUSTERING COMPARISON
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels)
        print "Categorystats done"
        sili = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric)
        print "Clusteringstats done"
        #GENERAL STATS
        print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(clustering.getname()[1], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).size_of_clusters()
        print "stats done"
        catstats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cats_per_cluster()
        print "catstats done"
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(cluster, stats[cluster], round(float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                # reverse-lookup of the category name from its numeric id
                print "{} items of category {} make up {} % of this cluster".format(catstats[cluster][cat], "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), round(catstats[cluster][cat] / catstats[cluster]['total'] * 100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories()
        #STATS PER CAT
        print headline, "Statistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format("".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total'])
            for entry in [i for i in cats[cat]['cat_per_cluster'] if not i in excludelist]:
                print "{} items or {} percent in cluster {}".format(cats[cat]['cat_per_cluster'][entry], round(float(cats[cat]['cat_per_cluster'][entry]) / float(cats[cat]['total']) * 100), entry)
        #
        #PREDICTIVE FEATURES
        print headline, "Strongly predictive features are"
        cents = ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).cluster_predictors(featuredict)
        if cents:
            for diff in cents:
                print "\nRaw Scores"
                # i[::-1] reverses each (feature, score) pair; top 10 shown
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['raw_diff']][:10]))
                print "Zscores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['zscores_diff']][:10]))
        #PROTOTYPES
        print headline, "Here is a typical document for each cluster"
        # scipy names the manhattan metric 'cityblock'
        distance = distance_metric
        if distance_metric == 'manhattan':
            distance = 'cityblock'
        print "We set the distance metric to {}".format(distance)
        docs = ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).central_documents(wordmatrix_with_cat, filedicti)
        if docs:
            for cluster in docs:
                print "\nCLUSTER {} \n".format(cluster)
                print docs[cluster][distance]
                # first entry is the file closest to the centroid
                with open(docs[cluster][distance][0]) as f:
                    print f.read()
                if len(docs[cluster][distance]) > 8:
                    print "\nOther files close by in cluster {}:\n".format(cluster)
                    print ("{}\n" * 8).format(*docs[cluster][distance][1:9])
    #COMPARING CLUSTERINGS
    print headline, "Comparing clusterings"
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric)))
    #all input does it just concatenate name + cluster
    # and supply clustering object to similarity measurement
    # (NOTE: 'input' shadows the builtin)
    input = [(str(type(i.name)).split(".")[3].rstrip("'>") + "--" + str(i.no_of_clusters), i) for i in x]
    simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input)
    options = ['adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim', 'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim']
    for o in options:
        print "\n---\n"
        ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input).similarity_matrix(o)
        print "\n---\n"
    endtime = time.time()
    process = endtime - starttime
    print headline, "This took us {} minutes".format(process / 60)
def main(distance_metric, threshold, testmode=False):
    """Clustering driver seeded from a fixed emotion-word feature list.

    Unlike the dictmaker-based variants, the feature dictionary here is read
    from a hard-coded text file (one feature per line).  Both word matrices
    are saved to gzipped text before clustering.

    Parameters (inferred from use -- confirm against callers):
        distance_metric -- metric name forwarded to ct.* helpers;
                           'manhattan' becomes scipy's 'cityblock' for the
                           prototype lookup.
        threshold -- accepted but unused in this variant (featuredict comes
                     from the file, not dictmaker) -- NOTE(review).
        testmode -- forwarded to matrixmachine().

    Relies on module globals: pathi, matrixmachine, clustermachine, ct, np,
    codecs, os, time.
    """
    starttime = time.time()
    #here we read in the featuredict
    #create the featuredict from a text file
    featuredict = {}
    # NOTE(review): absolute, user-specific path -- breaks on other machines
    with codecs.open('/Users/ps22344/Downloads/chapter2/textfiles/emolist_final.txt', "r", "utf-8") as inputtext:
        for line in inputtext.readlines():
            featuredict[line.rstrip("\n")] = 0
    print "pre length", len(featuredict)
    folders = [i for i in os.listdir(pathi) if not i.startswith(".")]
    print ", ".join(folders)
    print "Items in folders", ", ".join([str(len(os.listdir(os.path.join(pathi, f)))) for f in folders])
    print "We have {} folders".format(len(folders))
    #here we input the featuredict
    wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(folders, featuredict, testmode, "category1")
    # note outlier removal is OFF in this variant
    wordmatrix_without_cat, wordmatrix_with_cat = ct.matrixstats(wordmatrix_without_cat, wordmatrix_with_cat, distance_metric, zscores=False, outlier_removal=False, outlier_threshold=2, median_metric='median')
    # persist matrices for later inspection / reuse
    np.savetxt('wordmatrix_without_cat.gz', wordmatrix_without_cat)
    np.savetxt('wordmatrix_with_cat.gz', wordmatrix_with_cat)
    x = clustermachine(wordmatrix_without_cat, distance_metric, 4)
    #print [(i.name, i.no_of_clusters) for i in x]
    # bookkeeping keys, not category ids
    excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats']
    print "These clusterings have less than 2 clusters\n{}\n\n".format("\n".join([str(c.name) for c in x if c.no_of_clusters < 2]))
    #PRINTING STUFF
    headline = "\n\n-----------\n\n"
    print "Working with {} distance metric".format(distance_metric)
    #CROSS CLUSTERING COMPARISON
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels)
        print "Categorystats established"
        sili = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric)
        print "Clusteringstats established"
        #GENERAL STATS
        print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(clustering.getname()[1], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).size_of_clusters()
        catstats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cats_per_cluster()
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(cluster, stats[cluster], round(float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                # reverse-lookup of the category name from its numeric id
                print "{} items of category {} make up {} % of this cluster".format(catstats[cluster][cat], "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), round(catstats[cluster][cat] / catstats[cluster]['total'] * 100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories()
        #STATS PER CAT
        print headline, "Statistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format("".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total'])
            for entry in [i for i in cats[cat]['cat_per_cluster'] if not i in excludelist]:
                print "{} items or {} percent in cluster {}".format(cats[cat]['cat_per_cluster'][entry], round(float(cats[cat]['cat_per_cluster'][entry]) / float(cats[cat]['total']) * 100), entry)
        #PREDICTIVE FEATURES
        print headline, "Strongly predictive features are"
        cents = ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).cluster_predictors(featuredict)
        if cents:
            for diff in cents:
                print "\nRaw Scores"
                print diff
                #the two below should return the same results; one was worked over because of unicode issues.
                print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(unicode(diff[0]), diff[1], ", ".join([" : ".join([i, unicode(k)]) for i, k in cents[diff]['raw_diff'][:10]]))
                print "Zscores"
                #this is python slice notation: "a[::-1] to reverse a string" ; http://stackoverflow.com/questions/509211/explain-pythons-slice-notation
                print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['zscores_diff']][:10]))
        #PROTOTYPES
        print headline, "Here is a typical document for each cluster"
        # scipy names the manhattan metric 'cityblock'
        distance = distance_metric
        if distance_metric == 'manhattan':
            distance = 'cityblock'
        print "We set the distance metric to {}".format(distance)
        docs = ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).central_documents(wordmatrix_with_cat, filedicti)
        if docs:
            for cluster in docs:
                print "\nCLUSTER {} \n".format(cluster)
                # first entry is the file closest to the centroid
                with open(docs[cluster][distance][0]) as f:
                    print f.read()
                if len(docs[cluster][distance]) > 8:
                    print "\nOther files close by in cluster {}:\n".format(cluster)
                    print ("{}\n" * 8).format(*docs[cluster][distance][1:9])
    print headline, "Comparing clusterings"
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric)))
    #all input does it just concatenate name + cluster
    # and supply clustering object to similarity measurement
    # (NOTE: 'input' shadows the builtin)
    input = [(str(type(i.name)).split(".")[3].rstrip("'>") + "--" + str(i.no_of_clusters), i) for i in x]
    simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input)
    options = ['adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim', 'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim']
    for o in options:
        print "\n---\n"
        ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input).similarity_matrix(o)
        print "\n---\n"
    endtime = time.time()
    process = endtime - starttime
    print headline, "This took us {} minutes".format(process / 60)
    #or do we want to do predictive features and typical document per cluster as well????
    # macOS-only audible completion notice
    os.system('say "your program has finished"')
def main():
    """Clustering driver: 10000-feature dictionary, 4-way clustermachine.

    Builds the feature dictionary over all folders under the module global
    ``pathi``, clusters, and prints per-cluster/per-category statistics,
    predictive features, a prototype document per cluster, and pairwise
    clustering-similarity matrices.  Relies on module globals: pathi,
    dictmaker, matrixmachine, clustermachine, ct, os, time.
    """
    starttime = time.time()
    folders = [i for i in os.listdir(pathi) if not i.startswith(".")]
    print ", ".join(folders)
    print ", ".join([str(len(os.listdir(os.path.join(pathi, f)))) for f in folders])
    #folders=['files9_output_0102']#, 'files9_output_0102', 'files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102', 'files9_output_0102']
    print "We have {} folders".format(len(folders))
    featuredict = dictmaker(folders, 10000)
    wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(folders, featuredict, "category1")
    x = clustermachine(wordmatrix_without_cat, 4)
    print [(i.name, i.no_of_clusters) for i in x]
    #print [i.name for i in x]
    # bookkeeping keys, not category ids
    excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats']
    for clustering in x:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels)
        sili = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette()
        # print cati.size_of_categories()
        headline = "\n\n-----------\n\n"
        #GENERAL STATS
        print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).size_of_clusters()
        catstats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cats_per_cluster()
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(cluster, stats[cluster], round(float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                # reverse-lookup of the category name from its numeric id
                print "{} items of category {} make up {} % of this cluster".format(catstats[cluster][cat], "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), round(catstats[cluster][cat] / catstats[cluster]['total'] * 100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories()
        #STATS PER CAT
        print headline, "Statistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format("".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total'])
            for entry in [i for i in cats[cat]['cat_per_cluster'] if not i in excludelist]:
                print "{} items or {} percent in cluster {}".format(cats[cat]['cat_per_cluster'][entry], round(float(cats[cat]['cat_per_cluster'][entry]) / float(cats[cat]['total']) * 100), entry)
        #PREDICTIVE FEATURES
        # NOTE(review): "Stronly" is a typo for "Strongly" in the output text
        print headline, "Stronly predictive features are"
        cents = ct.Centroidstats(clustering.name, clustering.labels, clustering.centroids).cluster_predictors(featuredict)
        if cents:
            for diff in cents:
                print "\nRaw Scores"
                # i[::-1] reverses each (feature, score) pair; top 10 shown
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(str, i[::-1])) for i in cents[diff]['raw_diff']][:10]))
                print "Zscores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(str, i[::-1])) for i in cents[diff]['zscores_diff']][:10]))
        print "We can also add equivalent features if we want"
        print "And stems and whatnot"
        #PROTOTYPES
        print headline, "Here is a typical document for each cluster"
        # hard-coded metric in this variant (others derive it from an argument)
        distance = 'euclidean'
        print "We set the distance metric to {}".format(distance)
        docs = ct.Centroidstats(clustering.name, clustering.labels, clustering.centroids).central_documents(wordmatrix_with_cat, filedicti)
        if docs:
            for cluster in docs:
                print "\nCLUSTER {} \n".format(cluster)
                # NOTE(review): file handle is never closed; other variants
                # use a `with` block here
                f = open(docs[cluster][distance][0]).read()
                print f
    #CROSS CLUSTERING COMPARISON
    print headline, "Comparing clusterings"
    #
    # (NOTE: 'input' shadows the builtin; label is type name only, no
    # cluster-count suffix as in later variants)
    input = [(str(type(i.name)).split(".")[3].rstrip("'>"), i) for i in x]
    simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input)
    options = ['adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim', 'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim']
    for o in options:
        print "\n---\n"
        ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input).similarity_matrix(o)
        print "\n---\n"
    endtime = time.time()
    process = endtime - starttime
    print headline, "This took us {} minutes".format(process / 60)