예제 #1
0
def kmedoids_clustering(similarity_matrix, k, maxits=1000000):
    """Cluster objects with the Matlab k-medoids script and return labels.

    The matrix is written to TMP_INPATH, the Matlab script is run on it and
    the resulting assignment vector is read back from TMP_OUTPATH.

    Args:
        similarity_matrix: symmetric similarity matrix (list of lists), one
            row per object.
        k: requested number of clusters.
        maxits: passed through to the Matlab script (presumably its
            iteration limit -- confirm against kmedoids.m).

    Returns:
        A list with one cluster number per object. If there are no more
        objects than clusters, every object gets its own cluster
        (``[0, 1, ..., n-1]``) and Matlab is not invoked. Otherwise the
        labels read back are renumbered to 0..m-1, preserving their
        relative order.

    Raises:
        RuntimeError: if the Matlab run returns a non-zero value.
    """
    n_objects = len(similarity_matrix)
    logging.info("[kmedoids_clustering] clustering objects=%s k=%s",
                 n_objects, k)

    # Trivial case: nothing to cluster, each object is its own cluster.
    if n_objects <= k:
        logging.warning(
            "[kmedoids_clustering] objects=%s is no more than clusters=%s!",
            n_objects, k)
        # list(...) keeps the documented list return type on Python 3 too.
        return list(range(n_objects))

    # Rows and columns are labelled simply with the object indexes.
    labels = list(range(n_objects))
    matrix_io.fwrite_smatrix(similarity_matrix, labels, labels, TMP_INPATH)
    status = matlab_wrapper.run_matlab(
        "../clustering/kmedoids_matlab/kmedoids.m",
        [TMP_INPATH, k, maxits, TMP_OUTPATH])
    if status != 0:
        raise RuntimeError("[kmedoids_clustering] Matlab failure!")
    assignment = matrix_io.fread_ivector(TMP_OUTPATH)

    # Renumber the clusters to consecutive integers starting at 0; sorting
    # makes the mapping deterministic instead of relying on set order.
    clust2clust = dict(
        (old_no, new_no)
        for new_no, old_no in enumerate(sorted(set(assignment))))
    return [clust2clust[c] for c in assignment]
예제 #2
0
def kmedoids_clustering(similarity_matrix, k, maxits=1000000):
    """Cluster objects with the Matlab k-medoids script and return labels.

    The matrix is written to TMP_INPATH, the Matlab script is run on it and
    the resulting assignment vector is read back from TMP_OUTPATH.

    Args:
        similarity_matrix: symmetric similarity matrix (list of lists), one
            row per object.
        k: requested number of clusters.
        maxits: passed through to the Matlab script (presumably its
            iteration limit -- confirm against kmedoids.m).

    Returns:
        A list with one cluster number per object. If there are no more
        objects than clusters, every object gets its own cluster
        (``[0, 1, ..., n-1]``) and Matlab is not invoked. Otherwise the
        labels read back are renumbered to 0..m-1, preserving their
        relative order.

    Raises:
        RuntimeError: if the Matlab run returns a non-zero value.
    """
    n_objects = len(similarity_matrix)
    logging.info("[kmedoids_clustering] clustering objects=%s k=%s",
                 n_objects, k)

    # Trivial case: nothing to cluster, each object is its own cluster.
    if n_objects <= k:
        logging.warning(
            "[kmedoids_clustering] objects=%s is no more than clusters=%s!",
            n_objects, k)
        # list(...) keeps the documented list return type on Python 3 too.
        return list(range(n_objects))

    # Rows and columns are labelled simply with the object indexes.
    labels = list(range(n_objects))
    matrix_io.fwrite_smatrix(similarity_matrix, labels, labels, TMP_INPATH)
    status = matlab_wrapper.run_matlab(
        "../clustering/kmedoids_matlab/kmedoids.m",
        [TMP_INPATH, k, maxits, TMP_OUTPATH])
    if status != 0:
        raise RuntimeError("[kmedoids_clustering] Matlab failure!")
    assignment = matrix_io.fread_ivector(TMP_OUTPATH)

    # Renumber the clusters to consecutive integers starting at 0; sorting
    # makes the mapping deterministic instead of relying on set order.
    clust2clust = dict(
        (old_no, new_no)
        for new_no, old_no in enumerate(sorted(set(assignment))))
    return [clust2clust[c] for c in assignment]
    print "Loading ZBL records from zbl_path=", zbl_path
    zblid2zbl = dict((zbl[zbl_io.ZBL_ID_FIELD], zbl)
                     for zbl in _get_zbl_generator_(zbl_path))
    print " zblid2zbl [", len(zblid2zbl), "docs loaded] =", str(
        list(zblid2zbl.iteritems()))[:100]

    print "--------------------------------------------------------"
    print "Building model MSC codes counts..."
    mscmodel = msc_processing.MscModel(zblid2zbl.values())

    print "--------------------------------------------------------"
    print "Filtering msccodes with MIN_COUNT_MSC=", MIN_COUNT_MSC, " MIN_COUNT_MSCPRIM=", MIN_COUNT_MSCPRIM, " MIN_COUNT_MSCSEC=", MIN_COUNT_MSCSEC
    mscmodel.keep_msc_mincount(MIN_COUNT_MSC, MIN_COUNT_MSCPRIM,
                               MIN_COUNT_MSCSEC)
    mscmodel.report()
    #store_mscgroups_primary(open("msc_groups.txt", "w"), mscmodel.mscprim2zblidlist)

    print "--------------------------------------------------------"
    print "Calculating msc2ix mapping..."
    msc2ix = dict(
        (msc, ix) for ix, msc in enumerate(sorted(mscmodel.allcodes())))
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.iteritems())
    msc_list = list(ix2msc[ix] for ix in xrange(len(ix2msc)))
    print " msc2ix=", str(list(msc2ix.iteritems()))[:100]

    print "--------------------------------------------------------"
    print "Storing random similarity matrix to", output_path
    matrix_io.fwrite_smatrix([[random.random() for msc in msc_list]
                              for msc in msc_list], msc_list, msc_list,
                             output_path)
     
 print "The program generates random similarity matrix for MSC codes (filtered and extracted from ZBL file)."
     
 print "--------------------------------------------------------"           
 print "Loading ZBL records from zbl_path=",zbl_path    
 zblid2zbl = dict( (zbl[zbl_io.ZBL_ID_FIELD],zbl) for zbl in _get_zbl_generator_(zbl_path) )
 print " zblid2zbl [",len(zblid2zbl),"docs loaded] =",str(list(zblid2zbl.iteritems()))[:100]
 
 print "--------------------------------------------------------"
 print "Building model MSC codes counts..."
 mscmodel = msc_processing.MscModel( zblid2zbl.values() )
 
 print "--------------------------------------------------------"
 print "Filtering msccodes with MIN_COUNT_MSC=",MIN_COUNT_MSC," MIN_COUNT_MSCPRIM=",MIN_COUNT_MSCPRIM," MIN_COUNT_MSCSEC=",MIN_COUNT_MSCSEC
 mscmodel.keep_msc_mincount(MIN_COUNT_MSC, MIN_COUNT_MSCPRIM, MIN_COUNT_MSCSEC)
 mscmodel.report()
 #store_mscgroups_primary(open("msc_groups.txt", "w"), mscmodel.mscprim2zblidlist)
 
 print "--------------------------------------------------------"
 print "Calculating msc2ix mapping..."
 msc2ix = dict((msc,ix) for ix,msc in enumerate(sorted(mscmodel.allcodes())))
 ix2msc = dict((ix,msc) for msc,ix in msc2ix.iteritems())
 msc_list = list( ix2msc[ix] for ix in xrange(len(ix2msc)) )    
 print " msc2ix=",str(list(msc2ix.iteritems()))[:100]
 
 
 print "--------------------------------------------------------"
 print "Storing random similarity matrix to",output_path
 matrix_io.fwrite_smatrix([ [random.random() for msc in msc_list] for msc in msc_list], msc_list, msc_list, output_path)