示例#1
0
def walk(adj_file, k, path_file, strand_specific=False, cov_gradient=0.05, dist_file=None):
    """Walk paths through the adjacency graph and write them to `path_file`.

    The graph is loaded with `parse_adj`. Every entry of `adj_graph.mkcs`
    is used as a seed, visited from the highest to the lowest mean kmer
    coverage, and extended with `extend_seed`. When `dist_file` is given,
    non-empty paths are further extended with
    `extend_path_with_paired_support` using the partners returned by
    `parse_dist`. Only paths with more than one vertex are written, one per
    line, as ``<path id>\\t<space-separated vertex names>``. Path ids start
    at ``largest_cid + 1``.

    Args:
        adj_file: adjacency file handed to `parse_adj`.
        k: k value handed to `parse_adj`.
        path_file: output file; it is opened in 'w' mode (truncated).
        strand_specific: forwarded to `parse_adj`; also decides how an
            index into `adj_graph.mkcs` maps to a vertex index (same index
            when true, twice the index otherwise).
        cov_gradient: forwarded to `extend_seed`.
        dist_file: optional file handed to `parse_dist`; skipped when falsy.

    Returns:
        The number of paths written to `path_file`.
    """
    adj_graph, largest_cid = parse_adj(adj_file, k, strand_specific=strand_specific)

    graph = adj_graph.graph
    log('ADJ: %d vertices, %d edges' % (graph.vcount(), graph.ecount()))

    # An mkcs position maps straight to a vertex index in strand-specific
    # mode, and to every second vertex index otherwise.
    stride = 1 if strand_specific else 2

    # Seeds as (vertex index, coverage), highest coverage first.
    seeds = sorted(
        ((pos * stride, coverage) for pos, coverage in enumerate(adj_graph.mkcs)),
        key=itemgetter(1),
        reverse=True
    )

    # New path ids continue after the largest existing contig id.
    next_pathid = largest_cid + 1

    # Paired-end partner information is optional.
    cid_partners_dict = parse_dist(dist_file) if dist_file else None

    with open(path_file, 'w') as fh_path:
        for seed_index, mkc in seeds:
            path = extend_seed(seed_index, mkc, adj_graph, cov_gradient=cov_gradient)

            if len(path) > 0 and cid_partners_dict is not None:
                path = extend_path_with_paired_support(path, adj_graph, cid_partners_dict)

            # A path of a single vertex (or none) is not worth reporting.
            if len(path) <= 1:
                continue

            vertex_names = [graph.vs[vertex][GRAPH_ATT_NAME] for vertex in path]
            fh_path.write('%s\t%s\n' % (next_pathid, ' '.join(vertex_names)))

            next_pathid += 1

    # Number of ids consumed == number of paths written.
    return next_pathid - largest_cid - 1
示例#2
0
def unbraid(adj_file, k, path_file, err_cid_file, strand_specific=False, cov_gradient=0.05, length_diff_tolerance=1):
    """Walk paths through the adjacency graph and report vertices marked for removal.

    The graph is loaded with `parse_adj`. Every entry of `adj_graph.mkcs`
    is used as a seed, visited from the highest to the lowest mean kmer
    coverage, and extended with `extend_seed`. Each path with more than one
    vertex is written to `path_file` as
    ``<path id>\\t<space-separated vertex names>`` and then handed to
    `find_erroneous_branches`. Afterwards, every entry of
    `adj_graph.states` equal to `REMOVE_VERTEX_STATE` has its contig id
    (via `get_cid`) written, one per line, to both `path_file` and
    `err_cid_file`.

    Args:
        adj_file: adjacency file handed to `parse_adj`.
        k: k value handed to `parse_adj` and `find_erroneous_branches`.
        path_file: output file for paths; opened in 'w' mode (truncated).
        err_cid_file: output file for removed contig ids; opened in 'w' mode.
        strand_specific: forwarded to `parse_adj`; also decides how an
            index into `adj_graph.mkcs` / `adj_graph.states` maps to a
            vertex index (same index when true, twice the index otherwise).
        cov_gradient: forwarded to `extend_seed`.
        length_diff_tolerance: forwarded to `find_erroneous_branches`.

    Returns:
        A tuple ``(number of paths written, number of vertices marked for
        removal)``.
    """
    adj_graph, largest_cid = parse_adj(adj_file, k, strand_specific=strand_specific)

    graph = adj_graph.graph
    log('ADJ: %d vertices, %d edges' % (graph.vcount(), graph.ecount()))

    # A per-contig position maps straight to a vertex index in
    # strand-specific mode, and to every second vertex index otherwise.
    stride = 1 if strand_specific else 2

    # Seeds as (vertex index, coverage), highest coverage first.
    seeds = sorted(
        ((pos * stride, coverage) for pos, coverage in enumerate(adj_graph.mkcs)),
        key=itemgetter(1),
        reverse=True
    )

    # New path ids continue after the largest existing contig id.
    next_pathid = largest_cid + 1

    num_errors = 0
    with open(path_file, 'w') as fh_path:
        for seed_index, mkc in seeds:
            path = extend_seed(seed_index, mkc, adj_graph, cov_gradient=cov_gradient)

            # A path of a single vertex (or none) is not worth reporting.
            if len(path) <= 1:
                continue

            vertex_names = [graph.vs[vertex][GRAPH_ATT_NAME] for vertex in path]
            fh_path.write('%s\t%s\n' % (next_pathid, ' '.join(vertex_names)))

            # Walk along the path to flag erroneous branches.
            find_erroneous_branches(path, adj_graph, k, length_diff_tolerance=length_diff_tolerance)

            next_pathid += 1

        # The error file is only opened once all paths have been walked.
        with open(err_cid_file, 'w') as fh_err:
            for pos, state in enumerate(adj_graph.states):
                if state == REMOVE_VERTEX_STATE:
                    num_errors += 1
                    cid = get_cid(graph.vs[pos * stride][GRAPH_ATT_NAME])
                    id_line = str(cid) + '\n'
                    # The id goes to the path file as well as the error file.
                    # NOTE(review): presumably an id-only line in the path
                    # file marks the contig for removal downstream -- confirm.
                    fh_path.write(id_line)
                    fh_err.write(id_line)

    # (paths written, vertices marked for removal)
    return next_pathid - largest_cid - 1, num_errors