def walk(adj_file, k, path_file, strand_specific=False, cov_gradient=0.05, dist_file=None):
    """Walk paths through the ADJ graph and write them to `path_file`.

    Every entry of `adj_graph.mkcs` is used as a seed, visited in descending
    order of mean kmer coverage. Each seed is extended with `extend_seed`;
    when `dist_file` is given, non-empty paths are extended further with
    `extend_path_with_paired_support`. Only paths of more than one vertex
    are written, one per line, as `<pathid>\\t<space-separated vertex names>`.
    Path ids start at `largest_cid + 1`.

    Args:
        adj_file: ADJ file handed to `parse_adj`.
        k: k value handed to `parse_adj`.
        path_file: output file; opened for writing (truncated).
        strand_specific: forwarded to `parse_adj`; also selects how an index
            into `mkcs` maps to a seed vertex index (i versus 2*i).
        cov_gradient: forwarded to `extend_seed`.
        dist_file: optional file handed to `parse_dist`; skipped when falsy.

    Returns:
        The number of paths written to `path_file`.
    """
    adj_graph, largest_cid = parse_adj(adj_file, k, strand_specific=strand_specific)
    graph = adj_graph.graph
    log('ADJ: %d vertices, %d edges' % (graph.vcount(), graph.ecount()))

    # Entry i of `mkcs` seeds vertex i when strand-specific, vertex 2*i otherwise.
    # NOTE(review): presumably the non-strand-specific graph keeps two vertices
    # per contig and the even one is used as the seed -- confirm in parse_adj.
    stride = 1 if strand_specific else 2
    seeds_by_coverage = sorted(
        ((pos * stride, coverage) for pos, coverage in enumerate(adj_graph.mkcs)),
        key=itemgetter(1),
        reverse=True,
    )

    next_path_id = largest_cid + 1

    # Paired-support partners are only loaded when a dist file was supplied.
    partners_by_cid = parse_dist(dist_file) if dist_file else None

    with open(path_file, 'w') as fh_path:
        for seed_vertex, seed_coverage in seeds_by_coverage:
            path = extend_seed(seed_vertex, seed_coverage, adj_graph, cov_gradient=cov_gradient)

            if len(path) > 0 and partners_by_cid is not None:
                path = extend_path_with_paired_support(path, adj_graph, partners_by_cid)
            #endif

            # A single-vertex (or empty) path is not recorded.
            if len(path) <= 1:
                continue
            #endif

            vertex_names = ' '.join(graph.vs[v][GRAPH_ATT_NAME] for v in path)
            fh_path.write(str(next_path_id) + '\t' + vertex_names + '\n')
            next_path_id += 1
        #endfor
    #endwith

    # The id advances once per written path, so the offset is the path count.
    return next_path_id - largest_cid - 1

def unbraid(adj_file, k, path_file, err_cid_file, strand_specific=False, cov_gradient=0.05, length_diff_tolerance=1):
    """Walk paths through the ADJ graph and flag erroneous branches along them.

    Every entry of `adj_graph.mkcs` is used as a seed, visited in descending
    order of mean kmer coverage, and extended with `extend_seed`. Each path of
    more than one vertex is written to `path_file` as
    `<pathid>\\t<space-separated vertex names>` and then handed to
    `find_erroneous_branches`. Once all seeds are processed, the contig id of
    every entry of `adj_graph.states` equal to `REMOVE_VERTEX_STATE` is written,
    one per line, to BOTH `path_file` and `err_cid_file`.

    Args:
        adj_file: ADJ file handed to `parse_adj`.
        k: k value handed to `parse_adj` and `find_erroneous_branches`.
        path_file: output file for paths (and removed contig ids); truncated.
        err_cid_file: output file for removed contig ids; truncated.
        strand_specific: forwarded to `parse_adj`; also selects how an index
            into `mkcs` / `states` maps to a vertex index (i versus 2*i).
        cov_gradient: forwarded to `extend_seed`.
        length_diff_tolerance: forwarded to `find_erroneous_branches`.

    Returns:
        A tuple `(number of paths written, number of vertices marked for removal)`.
    """
    adj_graph, largest_cid = parse_adj(adj_file, k, strand_specific=strand_specific)
    graph = adj_graph.graph
    log('ADJ: %d vertices, %d edges' % (graph.vcount(), graph.ecount()))

    # Entry i of `mkcs` / `states` maps to vertex i when strand-specific,
    # vertex 2*i otherwise.
    # NOTE(review): presumably the non-strand-specific graph keeps two vertices
    # per contig -- confirm in parse_adj.
    stride = 1 if strand_specific else 2
    seeds_by_coverage = sorted(
        ((pos * stride, coverage) for pos, coverage in enumerate(adj_graph.mkcs)),
        key=itemgetter(1),
        reverse=True,
    )

    next_path_id = largest_cid + 1
    num_errors = 0

    with open(path_file, 'w') as fh_path:
        for seed_vertex, seed_coverage in seeds_by_coverage:
            path = extend_seed(seed_vertex, seed_coverage, adj_graph, cov_gradient=cov_gradient)

            # A single-vertex (or empty) path is neither recorded nor inspected.
            if len(path) <= 1:
                continue
            #endif

            vertex_names = ' '.join(graph.vs[v][GRAPH_ATT_NAME] for v in path)
            fh_path.write(str(next_path_id) + '\t' + vertex_names + '\n')

            # Inspect branches along the recorded path; the results are read
            # back from `adj_graph.states` below.
            find_erroneous_branches(path, adj_graph, k, length_diff_tolerance=length_diff_tolerance)
            next_path_id += 1
        #endfor

        # The error file is opened only after the walk has finished, while the
        # path file is still open, because removed ids go to both files.
        # NOTE(review): appending bare contig ids to the path file looks
        # deliberate (presumably consumed downstream) -- confirm with the caller.
        with open(err_cid_file, 'w') as fh_err:
            for pos, state in enumerate(adj_graph.states):
                if state == REMOVE_VERTEX_STATE:
                    num_errors += 1
                    cid = get_cid(graph.vs[pos * stride][GRAPH_ATT_NAME])
                    cid_line = str(cid) + '\n'
                    fh_path.write(cid_line)
                    fh_err.write(cid_line)
                #endif
            #endfor
        #endwith
    #endwith

    # The id advances once per written path, so the offset is the path count.
    return next_path_id - largest_cid - 1, num_errors