def kmedoids_clustering(similarity_matrix, k, maxits=1000000):
    """Cluster objects by running the external MATLAB k-medoids script.

    The similarity matrix is written to TMP_INPATH, ``kmedoids.m`` is run on
    it through ``matlab_wrapper.run_matlab``, and the assignment vector is
    read back from TMP_OUTPATH.

    Args:
        similarity_matrix: symmetric similarity matrix (list of lists).
        k: requested number of clusters.
        maxits: forwarded unchanged to the MATLAB script (presumably its
            iteration limit -- confirm against kmedoids.m).

    Returns:
        A list with one cluster number per row of ``similarity_matrix``.
        Cluster numbers are consecutive integers starting at 0.  When there
        are no more objects than requested clusters, every object gets its
        own cluster and MATLAB is not invoked.

    Raises:
        Exception: if the MATLAB run reports a non-zero status.
    """
    n_objects = len(similarity_matrix)
    logging.info("[kmedoids_clustering] clustering objects=%s k=%s",
                 n_objects, k)

    if n_objects <= k:
        logging.warning(
            "[kmedoids_clustering] objects=%s is no more than clusters=%s!",
            n_objects, k)
        # Always hand back a real list, as the docstring promises.
        return list(range(n_objects))

    labels = list(range(n_objects))
    matrix_io.fwrite_smatrix(similarity_matrix, labels, labels, TMP_INPATH)

    status = matlab_wrapper.run_matlab(
        "../clustering/kmedoids_matlab/kmedoids.m",
        [TMP_INPATH, k, maxits, TMP_OUTPATH])
    if status != 0:
        raise Exception("[kmedoids_clustering] Matlab failure!")

    assignment = matrix_io.fread_ivector(TMP_OUTPATH)

    # Renumber clusters to consecutive ids 0..m-1.  Sorting the distinct
    # labels makes the mapping deterministic instead of depending on set
    # iteration order.
    clust2clust = dict(
        (old_no, new_no)
        for new_no, old_no in enumerate(sorted(set(assignment))))
    return [clust2clust[c] for c in assignment]
def kmedoids_clustering(similarity_matrix, k, maxits=1000000):
    """Cluster objects by running the external MATLAB k-medoids script.

    The similarity matrix is written to TMP_INPATH, ``kmedoids.m`` is run on
    it through ``matlab_wrapper.run_matlab``, and the assignment vector is
    read back from TMP_OUTPATH.

    Args:
        similarity_matrix: symmetric similarity matrix (list of lists).
        k: requested number of clusters.
        maxits: forwarded unchanged to the MATLAB script (presumably its
            iteration limit -- confirm against kmedoids.m).

    Returns:
        A list with one cluster number per row of ``similarity_matrix``.
        Cluster numbers are consecutive integers starting at 0.  When there
        are no more objects than requested clusters, every object gets its
        own cluster and MATLAB is not invoked.

    Raises:
        Exception: if the MATLAB run reports a non-zero status.
    """
    n_objects = len(similarity_matrix)
    logging.info("[kmedoids_clustering] clustering objects=%s k=%s",
                 n_objects, k)

    if n_objects <= k:
        logging.warning(
            "[kmedoids_clustering] objects=%s is no more than clusters=%s!",
            n_objects, k)
        # Always hand back a real list, as the docstring promises.
        return list(range(n_objects))

    labels = list(range(n_objects))
    matrix_io.fwrite_smatrix(similarity_matrix, labels, labels, TMP_INPATH)

    status = matlab_wrapper.run_matlab(
        "../clustering/kmedoids_matlab/kmedoids.m",
        [TMP_INPATH, k, maxits, TMP_OUTPATH])
    if status != 0:
        raise Exception("[kmedoids_clustering] Matlab failure!")

    assignment = matrix_io.fread_ivector(TMP_OUTPATH)

    # Renumber clusters to consecutive ids 0..m-1.  Sorting the distinct
    # labels makes the mapping deterministic instead of depending on set
    # iteration order.
    clust2clust = dict(
        (old_no, new_no)
        for new_no, old_no in enumerate(sorted(set(assignment))))
    return [clust2clust[c] for c in assignment]
print "Loading ZBL records from zbl_path=", zbl_path zblid2zbl = dict((zbl[zbl_io.ZBL_ID_FIELD], zbl) for zbl in _get_zbl_generator_(zbl_path)) print " zblid2zbl [", len(zblid2zbl), "docs loaded] =", str( list(zblid2zbl.iteritems()))[:100] print "--------------------------------------------------------" print "Building model MSC codes counts..." mscmodel = msc_processing.MscModel(zblid2zbl.values()) print "--------------------------------------------------------" print "Filtering msccodes with MIN_COUNT_MSC=", MIN_COUNT_MSC, " MIN_COUNT_MSCPRIM=", MIN_COUNT_MSCPRIM, " MIN_COUNT_MSCSEC=", MIN_COUNT_MSCSEC mscmodel.keep_msc_mincount(MIN_COUNT_MSC, MIN_COUNT_MSCPRIM, MIN_COUNT_MSCSEC) mscmodel.report() #store_mscgroups_primary(open("msc_groups.txt", "w"), mscmodel.mscprim2zblidlist) print "--------------------------------------------------------" print "Calculating msc2ix mapping..." msc2ix = dict( (msc, ix) for ix, msc in enumerate(sorted(mscmodel.allcodes()))) ix2msc = dict((ix, msc) for msc, ix in msc2ix.iteritems()) msc_list = list(ix2msc[ix] for ix in xrange(len(ix2msc))) print " msc2ix=", str(list(msc2ix.iteritems()))[:100] print "--------------------------------------------------------" print "Storing random similarity matrix to", output_path matrix_io.fwrite_smatrix([[random.random() for msc in msc_list] for msc in msc_list], msc_list, msc_list, output_path)
print "The program generates random similarity matrix for MSC codes (filtered and extracted from ZBL file)." print "--------------------------------------------------------" print "Loading ZBL records from zbl_path=",zbl_path zblid2zbl = dict( (zbl[zbl_io.ZBL_ID_FIELD],zbl) for zbl in _get_zbl_generator_(zbl_path) ) print " zblid2zbl [",len(zblid2zbl),"docs loaded] =",str(list(zblid2zbl.iteritems()))[:100] print "--------------------------------------------------------" print "Building model MSC codes counts..." mscmodel = msc_processing.MscModel( zblid2zbl.values() ) print "--------------------------------------------------------" print "Filtering msccodes with MIN_COUNT_MSC=",MIN_COUNT_MSC," MIN_COUNT_MSCPRIM=",MIN_COUNT_MSCPRIM," MIN_COUNT_MSCSEC=",MIN_COUNT_MSCSEC mscmodel.keep_msc_mincount(MIN_COUNT_MSC, MIN_COUNT_MSCPRIM, MIN_COUNT_MSCSEC) mscmodel.report() #store_mscgroups_primary(open("msc_groups.txt", "w"), mscmodel.mscprim2zblidlist) print "--------------------------------------------------------" print "Calculating msc2ix mapping..." msc2ix = dict((msc,ix) for ix,msc in enumerate(sorted(mscmodel.allcodes()))) ix2msc = dict((ix,msc) for msc,ix in msc2ix.iteritems()) msc_list = list( ix2msc[ix] for ix in xrange(len(ix2msc)) ) print " msc2ix=",str(list(msc2ix.iteritems()))[:100] print "--------------------------------------------------------" print "Storing random similarity matrix to",output_path matrix_io.fwrite_smatrix([ [random.random() for msc in msc_list] for msc in msc_list], msc_list, msc_list, output_path)