def mp_build_scaffolds(paths, gapsize, n_proc):
    '''Build scaffold sequences from paths, using n_proc parallel workers.

    Args:
        paths (list): list of paths (lists of Junction objects) to scaffold.
        gapsize (int): gap size between unmerged contigs. Not read here;
            presumably consumed by process_scaffold via module state --
            TODO confirm. Kept for interface compatibility.
        n_proc (int): number of worker processes.

    Returns:
        tuple: (scaffold_sequences, scaffold_correspondences, all_edges,
            n_gaps, n_merges, bed)
    '''
    scaffold_sequences, scaffold_correspondences = {}, {}
    # NOTE(review): all_edges is never filled in the multiprocessing variant;
    # returned empty to keep the same return shape as build_scaffolds.
    all_edges = []
    n_gaps, n_merges = 0, 0
    misc.printstatus("Number of paths: " + str(len(paths)))
    bed = {}  # To collect bed coordinates

    # Context manager guarantees the pool is torn down even if waiting or
    # result collection raises (the original never closed/joined the pool).
    with multiprocessing.Pool(n_proc) as pool:
        result = pool.map_async(process_scaffold, paths)
        while not result.ready():
            # _number_left is a private AsyncResult attribute; presumably the
            # number of pending task chunks -- TODO confirm, it may overcount
            # progress when chunksize > 1.
            misc.printstatusFlush(
                "[ SCAFFOLDING ]\t" +
                misc.reportProgress(len(paths) - result._number_left,
                                    len(paths)))
            time.sleep(4)
        # Get the result and remove Nones (paths that produced no scaffold).
        mp_output = [i for i in result.get() if i]

    misc.printstatus("[ SCAFFOLDING ]\t" +
                     misc.reportProgress(len(paths), len(paths)))

    # Unpack multiprocessing data results into the output dicts.
    for idx, dat in enumerate(mp_output):
        scaffold_sequence, included, ng, nm, bed_coords = dat[:5]
        name = "scaffold_" + str(idx)
        scaffold_sequences[name] = scaffold_sequence
        scaffold_correspondences[name] = included
        bed[name] = bed_coords
        n_gaps += ng
        n_merges += nm

    return scaffold_sequences, scaffold_correspondences, all_edges, \
        n_gaps, n_merges, bed
def fillJunctions(backbone_graph, GEMlist, barcode_factor):
    '''Fill the Linkgraph junctions with short contigs.

    Connections section of the linkgraph is filled using the barcodes
    of the junction. The short contigs with matching set of barcodes
    to the junction are inserted into the connections.

    Args:
        backbone_graph (Linkgraph): graph whose paths are to be filled.
        GEMlist (dict): window name -> barcode collection.
        barcode_factor (int): outlier factor passed to calcOutliers.

    Returns:
        list: list of paths with junctions filled.
    '''
    filled_junction_paths = []
    n_paths = len(backbone_graph.paths)

    # Iterate over paths and every junction in the path.
    # Create a barcode comparison of the junction and all small contigs.
    for idx, path in enumerate(backbone_graph.paths):
        # Report progress every 100 paths. FIX: the original guard
        # `idx in range(0, 10000000, 1)` was always true, so the status
        # line was flushed on every single iteration.
        if idx % 100 == 0:
            misc.printstatusFlush("[ PATH FILLING ]\t" +
                                  misc.reportProgress(idx + 1, n_paths))

        filled_path = []
        # Check outgoing edges from both start and target in full_graph.
        # If they are connected to both sides, add them to junction.
        for junction in path:
            tigs, fractions = zip(
                *[(k, graph_building.compareGEMlibs(junction.barcodes, v))
                  for k, v in GEMlist.items()])
            fracs = pd.Series(fractions, index=tigs)
            fracs = fracs[fracs > 0]

            # FIX: was `len(fracs > 0)`, which is the length of the boolean
            # mask (always == len(fracs)); spell the intent directly.
            if len(fracs) > 0:
                outliers = graph_building.calcOutliers(fracs, barcode_factor)
                # Add any outliers to junction.connections
                filled_path.append(
                    graph_building.Junction(
                        junction.start,
                        junction.target,
                        junction.connections +
                        [o[:-1] for o in list(outliers.index)]))

        filled_junction_paths.append(filled_path)

    # FIX: report totals instead of reusing the loop variable, which is
    # unbound (NameError) when backbone_graph.paths is empty.
    misc.printstatus("[ PATH FILLING ]\t" +
                     misc.reportProgress(n_paths, n_paths))
    return filled_junction_paths
def main(input_bam, contig_dict, region_size=20000, mapq=60, bc_quant=2):
    '''Collect barcode sets from windows at the ends of each contig.

    Opens the given bam file (kept in the module-level `samfile` global so
    helper functions can reach it), builds windows over the contigs, and
    gathers the barcodes aligned within each window.

    Args:
        input_bam (str): path to an indexed bam file.
        contig_dict (dict): contig name -> contig info.
        region_size (int): window length at each contig end.
        mapq (int): minimum mapping quality for collected alignments.
        bc_quant (int): barcode quantity cutoff passed to collectGEMs.

    Returns:
        dict: window name -> barcode collection (only windows with > 100
            barcodes are kept).
    '''
    global samfile
    samfile = pysam.AlignmentFile(input_bam, "rb")
    GEMlist = {}  # Maps window name -> barcodes (a dict despite the name)

    # First step is to collect all barcodes (passing -q cutoff) that are
    # aligned to each contigs first and last regions (-l)
    misc.printstatus("Starting barcode collection. Found {0} contigs.".format(
        len(contig_dict.keys())))

    # Generate windows
    windows = getWindows(region_size, contig_dict)

    # Iterate over windows to collect barcode sets
    for idx, window in enumerate(windows):
        # Unpack variables, for readability
        region = window[0]
        contig, start, end = region[:-1], window[1], window[2]

        # Print progress every 20 windows. The window total depends on
        # whether we are running on the backbone (suffix "a") or on the
        # small contigs (two windows per contig).
        if idx in range(0, 100000000, 20):
            total = len(contig_dict.keys()) if region[-1] == "a" \
                else len(contig_dict.keys()) * 2
            misc.printstatusFlush("[ BARCODE COLLECTION ]\t" +
                                  misc.reportProgress(idx, total))

        # Collect barcodes from the window
        GEMs = collectGEMs((contig, start, end), mapq, bc_quant)

        # Keep the window only if it yielded at least 100 barcodes
        if len(GEMs) > 100:
            GEMlist[region] = GEMs

    # Final progress report; uses the last window's region to pick the total
    total = len(contig_dict.keys()) if region[-1] == "a" \
        else len(contig_dict.keys()) * 2
    misc.printstatus("[ BARCODE COLLECTION ]\t" +
                     misc.reportProgress(total, total))

    samfile.close()
    return GEMlist
def build_scaffolds(paths, gapsize):
    '''Build scaffold sequences from paths, single-threaded.

    Args:
        paths (list): list of paths (lists of Junction objects) to scaffold.
        gapsize (int): gap size passed to mergeSeq for unmerged contigs.

    Returns:
        tuple: (scaffold_sequences, scaffold_correspondences, all_edges,
            n_gaps, n_merges, bed)
    '''
    scaffold_sequences, scaffold_correspondences = {}, {}
    all_edges = []
    n_gaps, n_merges = 0, 0
    misc.printstatus("Number of paths: " + str(len(paths)))
    bed = {}  # To collect bed coordinates

    for idx, path in enumerate(paths):
        misc.printstatusFlush("[ SCAFFOLDING ]\t" +
                              misc.reportProgress(idx + 1, len(paths)))
        # FIX: removed dead code -- the original built a flattened
        # `linked_contigs` list from the path here and never used it.

        # Start overlapping
        filled_path, edges = combine_paths(path)

        # It is possible that there is no filled_path, in the case that the
        # path had a single junction which had a None at junction.start or
        # junction.target and no overlaps were found. In this case, continue.
        if filled_path:
            all_edges.extend(edges)

            # Create scaffold
            scaffold_sequence, included, ng, nm, bed_coords = mergeSeq(
                filled_path, gapsize)
            name = "scaffold_" + str(idx)
            scaffold_sequences[name] = scaffold_sequence
            scaffold_correspondences[name] = included
            bed[name] = bed_coords
            n_gaps += ng
            n_merges += nm

    # FIX: report totals instead of reusing the loop variable, which is
    # unbound (NameError) when paths is empty.
    misc.printstatus("[ SCAFFOLDING ]\t" +
                     misc.reportProgress(len(paths), len(paths)))

    return scaffold_sequences, scaffold_correspondences, all_edges, \
        n_gaps, n_merges, bed
def pairwise_comparisons(GEMlist):
    '''Performs all pairwise comparisons between windows in GEMlist.

    Args:
        GEMlist (dict): window name -> barcode collection.

    Returns:
        GEMcomparison (pd.DataFrame): symmetric matrix of shared-barcode
            fractions between every pair of windows.
    '''
    n_windows = len(GEMlist)
    # Compare the barcodes in every region to all other regions
    GEMcomparison = pd.DataFrame(np.zeros((n_windows, n_windows)),
                                 index=GEMlist.keys())
    GEMcomparison.columns = GEMcomparison.index

    # Iterate over rows in GEMcomparison. Only positions from idx onward are
    # computed, then mirrored into the matching column, so each symmetric
    # fraction is calculated once.
    for idx, region1 in enumerate(GEMcomparison.index):
        lib1 = GEMlist[region1]

        # Report progress every 20 windows
        if idx % 20 == 0:
            misc.printstatusFlush("[ BARCODE COMPARISON ]\t" +
                                  misc.reportProgress(idx + 1, n_windows))

        fractions = [
            compareGEMlibs(lib1, GEMlist[col])
            for col in GEMcomparison.columns[idx:]
        ]

        # FIX: positional (iloc) assignment replaces the original chained
        # indexing (`df.loc[row][idx:] = ...` / `df[col][idx:] = ...`),
        # which assigns through an intermediate object and is not guaranteed
        # to update the frame (pandas chained-assignment pitfall); it also
        # misbehaves when window names are not unique.
        GEMcomparison.iloc[idx, idx:] = fractions  # row values from idx
        GEMcomparison.iloc[idx:, idx] = fractions  # column values from idx

    misc.printstatus("[ BARCODE COMPARISON ]\t" +
                     misc.reportProgress(n_windows, n_windows))
    return GEMcomparison
def makeEdges(GEMcomparison, barcode_factor, min_barcode_fraction):
    '''Create edges from the GEMcomparison dataframe.

    Also dumps the full fraction matrix to "fractions.txt" in the working
    directory (header row of window names, then one row per window).

    Args:
        GEMcomparison (pd.DataFrame): All-against-all comparison of the
            windows' barcodes.
        barcode_factor (int): Factor for calculating outliers.
        min_barcode_fraction (float): Minimum fraction of shared barcodes
            to create an edge in the linkgraph.

    Returns:
        list: Edges inferred from the fractions of shared barcodes, as
            (region, connected_window, fraction) tuples.
    '''
    misc.printstatus("Number of windows: " + str(len(GEMcomparison.keys())))
    edges = []

    with open("fractions.txt", "w") as out:
        # Header row: every window name, tab-separated.
        for f in GEMcomparison.index:
            out.write("{}\t".format(f))
        out.write("\n")

        # Iterate over rows in GEMcomparison
        for idx, (region, fractions) in enumerate(GEMcomparison.iterrows()):
            contig = region[:-1]

            # Dump this window's raw fractions.
            out.write(region + "\t")
            for f in fractions:
                out.write("{}\t".format(f))
            out.write("\n")

            # Report progress every 100 windows
            if idx % 100 == 0:
                misc.printstatusFlush(
                    "[ BARCODE LINKING ]\t" +
                    misc.reportProgress(idx, len(GEMcomparison)))

            # FIX: deleted a ~30-line commented-out block here (an older
            # ESD-based outlier method) -- dead code.

            # Ignore comparisons to the same contig and calculate outliers.
            # In low coverage datasets the amount of 0's might cloud any
            # actual signal.
            fractions = fractions.drop(labels=[contig + "s", contig + "e"],
                                       errors="ignore")
            fractions = fractions[fractions > 0]

            if len(fractions) > 0:
                minor_outliers = calcOutliers(fractions, barcode_factor)
                minor_outliers = minor_outliers[
                    minor_outliers > min_barcode_fraction]
                # FIX: Series.items() replaces iteritems(), which was
                # removed in pandas 2.0; items() exists in all versions.
                for ix, mo in minor_outliers.items():
                    edges.append((region, ix, mo))

    misc.printstatus(
        "[ BARCODE LINKING ]\t" +
        misc.reportProgress(len(GEMcomparison), len(GEMcomparison)))
    return edges
def trimSequences(paths, mincov):
    '''Trim away low quality regions of input sequences

    Description:
        Because de novo assembled contigs often end in low quality regions
        that are of too poor sequence to find good overlaps between, we want
        to trim input contigs of regions where reads don't map. Only trim
        regions where there is a potential overlap, i.e. NOT at the start of
        the first contig and end of the last contig in a path.

    Args:
        paths (list): list of lists. Each nested list contains ordered
            graph_building.Junction objects.
        mincov (int): Trim contig ends with lower average coverage than
            this value

    Returns:
        dict: trimmed_fasta_coords. Keys: input contig headers, values: start
            and end coordinates to keep, in addition to True or False for
            start and end if they were trimmed or not.
    '''
    # trimmed_fasta_coords is a dict with coords to keep from original fasta
    # Format: {contig: [start_coord, end_coord, bool, bool]}
    # Start by filling with old coords, which will then be changed.
    # Reads the module-level `fastafile` (a pysam-like object with parallel
    # .references / .lengths attributes -- TODO confirm).
    trimmed_fasta_coords = {}
    for idx, ctg in enumerate(fastafile.references):
        trimmed_fasta_coords[ctg] = [0, fastafile.lengths[idx], False, False]

    # Then find new coordinates for all sequences to merge
    for idx, path in enumerate(paths):
        # Progress every 5 paths (range membership with step 5 is
        # equivalent to idx % 5 == 0 for idx < 100000000).
        if idx in range(0, 100000000, 5):
            misc.printstatusFlush("[ TRIMMING ]\t" +
                                  misc.reportProgress(idx + 1, len(paths)))
        for junction in path:
            # Split "name<s|e>" window labels into contig name and side;
            # either end of a junction may be None (open junction).
            if junction.start != None:
                start_tig, start_side = junction.start[:-1], junction.start[-1]
            else:
                start_tig, start_side = None, None
            if junction.target != None:
                target_tig, target_side = junction.target[:-1], junction.target[-1]
            else:
                target_tig, target_side = None, None
            connections = junction.connections

            # Trim the sides of contigs where a junction is formed,
            # and don't trim places where there are no junctions.
            # The booleans at [2]/[3] mark start/end as already trimmed so a
            # contig end is never trimmed twice. NOTE(review): trimFasta
            # presumably returns a signed offset (positive to move the start
            # forward, negative to pull the end back) computed from coverage
            # -- TODO confirm against trimFasta's definition; it also
            # re-reads trimmed_fasta_coords, so the update order here
            # matters.
            if start_side == "s" \
            and trimmed_fasta_coords[start_tig][2] == False:
                trimmed_fasta_coords[start_tig] = [trimmed_fasta_coords[start_tig][0] + \
                                                trimFasta(trimmed_fasta_coords, junction.start, mincov), \
                                                trimmed_fasta_coords[start_tig][1], \
                                                True, \
                                                trimmed_fasta_coords[start_tig][3]]
            elif start_side == "e" \
            and trimmed_fasta_coords[start_tig][3] == False:
                trimmed_fasta_coords[start_tig] = [trimmed_fasta_coords[start_tig][0], \
                                                trimmed_fasta_coords[start_tig][1] + \
                                                trimFasta(trimmed_fasta_coords, junction.start, mincov), \
                                                trimmed_fasta_coords[start_tig][2], \
                                                True]

            # Same treatment for the target side of the junction.
            if target_side == "s" \
            and trimmed_fasta_coords[target_tig][2] == False:
                trimmed_fasta_coords[target_tig] = [trimmed_fasta_coords[target_tig][0] + \
                                                trimFasta(trimmed_fasta_coords, junction.target, mincov), \
                                                trimmed_fasta_coords[target_tig][1], \
                                                True, \
                                                trimmed_fasta_coords[target_tig][3]]
            elif target_side == "e" \
            and trimmed_fasta_coords[target_tig][3] == False:
                trimmed_fasta_coords[target_tig] = [trimmed_fasta_coords[target_tig][0], \
                                                trimmed_fasta_coords[target_tig][1] + \
                                                trimFasta(trimmed_fasta_coords, junction.target, mincov), \
                                                trimmed_fasta_coords[target_tig][2], \
                                                True]

            # Also trim everything in connections. Both ends of a connection
            # contig are trimmed together if either end is still untrimmed.
            # NOTE(review): this can re-trim an end whose flag is already
            # True (only one of the two flags needs to be False) -- confirm
            # whether that double trim is intended.
            for conn in connections:
                if not trimmed_fasta_coords[conn][2] == True \
                or not trimmed_fasta_coords[conn][3] == True:
                    trimmed_fasta_coords[conn] = [trimmed_fasta_coords[conn][0] + \
                                            trimFasta(trimmed_fasta_coords, conn+"s", mincov), \
                                            trimmed_fasta_coords[conn][1] + \
                                            trimFasta(trimmed_fasta_coords, conn+"e", mincov), \
                                            True, True]

    # Final progress line; reuses the loop variable idx (raises NameError if
    # paths is empty -- presumably never the case for callers).
    misc.printstatus("[ TRIMMING ]\t" +
                     misc.reportProgress(idx + 1, len(paths)))

    return trimmed_fasta_coords