Пример #1
0
def dist_tree(training_distances, training_labels, outpath = "/tmp/training_distances_avg_tree"):
    import jrs_labels_tree
    print "","building label vs label distance matrix using averaging..."
    
    #dmatrix = jrs_io.load_data(open("/tmp/dist_tree_dmatrix.txt"), cast_method=float)
    #print "","loaded matrix =",len(dmatrix),"x",len(dmatrix[0]),"..."
    #import numpy
    #print numpy.array(dmatrix)
    
    print "","training on matrix =",len(training_distances),"x",len(training_distances[0]),"/",len(training_labels)," labels' sets..."
    dmatrix = jrs_labels_tree.build_sim_matrix_labels(training_distances, training_labels)
    print "","clearing diagonal..."
    for i in xrange(len(dmatrix)):
        dmatrix[i][i] = 0.0
    minval = min_matrix(dmatrix)
    maxval = max_matrix(dmatrix)
    print numpy.array(dmatrix)    
    print "","minval:",minval
    print "","maxval:",maxval
    jrs_io.store_data(open("/tmp/dist_tree_dmatrix.txt","w"), dmatrix)
        
    
    print "","transforming dmatrix by substracting minval"
    dmatrix_t = [[(e-minval) for e in row ] for row in dmatrix ]
    dmatrix = dmatrix_t         
    minval = min_matrix(dmatrix)
    maxval = max_matrix(dmatrix)
    print numpy.array(dmatrix)
    print "","minval:",minval
    print "","maxval:",maxval
    jrs_io.store_data(open("/tmp/dist_tree_dmatrix_t.txt","w"), dmatrix)
    
    #dmatrix = jrs_io.load_data(open("/tmp/dist_tree_dmatrix_t.txt"), cast_method=float)
    #print "","loaded matrix =",len(dmatrix),"x",len(dmatrix[0]),"..."
    #import numpy
    #print numpy.array(dmatrix)
    
    treelabels = [str(i+1) for i in range(len(single_labels))]
    print "","treelabels:", treelabels            
    def gen_tree(outph, hd):
        phylo_tree = jrs_labels_tree.upgma(treelabels, dmatrix, agreggation_method = 'a', anonclades = hd)    
        print "","writing tree to:",outph
        jrs_labels_tree.write_tree(phylo_tree, outph)
        print "","dict_tree=",jrs_labels_tree.phylotree2dicttree(phylo_tree)
        
    gen_tree(outpath, False)
    gen_tree(outpath+"_hd", True)
 print "Shuffling labels..."
 labels_shuffled = [labels[ix] for ix in order]
 jrs_io.store_labels(open(labels_path+"_shuffled","w"), labels_shuffled)
 
 print "Loading distances' file:",  distance_matrix_path
 distances = jrs_io.load_data(open(distance_matrix_path), lambda x: x)
 try: print "",len(distances), "x",len(distances[0])
 except: pass
 
 print "Extending order..."    
 order = order + range(n, len(distances))
 print "Extended order:", order    
 
 print "Shuffling columns"
 distances_tmp = []
 for row in distances:
     new_row = [ row[ix] for ix in order ]
     distances_tmp.append(new_row)
 
 print "Shuffling rows"
 distances_shuffled = []
 for ix in order:
     distances_shuffled.append(distances_tmp[ix])
 
 fout = open(distance_matrix_path+"_shuffled","w")
 print "Storing to ",fout
 jrs_io.store_data(fout, distances_shuffled)
       
 
 
 
    print "Shuffling labels..."
    labels_shuffled = [labels[ix] for ix in order]
    jrs_io.store_labels(open(labels_path + "_shuffled", "w"), labels_shuffled)

    print "Loading distances' file:", distance_matrix_path
    distances = jrs_io.load_data(open(distance_matrix_path), lambda x: x)
    try:
        print "", len(distances), "x", len(distances[0])
    except:
        pass

    print "Extending order..."
    order = order + range(n, len(distances))
    print "Extended order:", order

    print "Shuffling columns"
    distances_tmp = []
    for row in distances:
        new_row = [row[ix] for ix in order]
        distances_tmp.append(new_row)

    print "Shuffling rows"
    distances_shuffled = []
    for ix in order:
        distances_shuffled.append(distances_tmp[ix])

    fout = open(distance_matrix_path + "_shuffled", "w")
    print "Storing to ", fout
    jrs_io.store_data(fout, distances_shuffled)