def mp_build_scaffolds(paths, gapsize, n_proc):
    '''Build scaffolds from paths using a multiprocessing pool.

    Args:
        paths (list): list of paths (lists of graph_building.Junction
            objects) to turn into scaffolds.
        gapsize (int): gap size to insert when scaffolding by gap
            introduction.
        n_proc (int): number of worker processes.

    Returns:
        tuple: (scaffold_sequences, scaffold_correspondences, all_edges,
            n_gaps, n_merges, bed)
    '''
    scaffold_sequences, scaffold_correspondences = {}, {}
    # NOTE(review): unlike build_scaffolds, the worker output carries no
    # overlap edges, so all_edges is always returned empty here — confirm
    # this is intended.
    all_edges = []
    n_gaps, n_merges = 0, 0
    misc.printstatus("Number of paths: " + str(len(paths)))
    bed = {}  # To collect bed coordinates

    # Context manager guarantees the pool is torn down even if a worker
    # raises; the original created the pool and never closed/joined it.
    with multiprocessing.Pool(n_proc) as pool:
        result = pool.map_async(process_scaffold, paths)
        while not result.ready():
            # _number_left is a private multiprocessing attribute, used
            # here only for progress reporting — TODO confirm it survives
            # the installed Python version.
            misc.printstatusFlush(
                "[ SCAFFOLDING ]\t" +
                misc.reportProgress(len(paths) - result._number_left,
                                    len(paths)))
            time.sleep(4)

        # Get the result and remove Nones (paths that yielded no scaffold).
        mp_output = [i for i in result.get() if i]

    misc.printstatus("[ SCAFFOLDING ]\t" +
                     misc.reportProgress(len(paths), len(paths)))

    # Unpack multiprocessing data results
    for idx, dat in enumerate(mp_output):
        scaffold_sequence, included, ng, nm, bed_coords = \
            dat[0], dat[1], dat[2], dat[3], dat[4]
        scaffold_sequences["scaffold_" + str(idx)] = scaffold_sequence
        scaffold_correspondences["scaffold_" + str(idx)] = included
        bed["scaffold_" + str(idx)] = bed_coords
        n_gaps += ng
        n_merges += nm

    return scaffold_sequences, scaffold_correspondences, all_edges, \
        n_gaps, n_merges, bed
def fillJunctions(backbone_graph, GEMlist, barcode_factor):
    '''Fill the Linkgraph junctions with short contigs.

    Connections section of the linkgraph is filled using the barcodes of
    the junction. The short contigs with a matching set of barcodes to the
    junction are inserted into the connections.

    Args:
        backbone_graph (Linkgraph)
        GEMlist (dict): window name -> barcode collection.
        barcode_factor (int): factor passed to calcOutliers.

    Returns:
        list: list of paths with junctions filled.
    '''
    filled_junction_paths = []
    n_paths = len(backbone_graph.paths)

    # Iterate over paths and every junction in the path.
    # Create a barcode comparison of the junction and all small contigs.
    for idx, path in enumerate(backbone_graph.paths):
        # Report progress every 100 paths. The original tested
        # `idx in range(0, 10000000, 1)`, which is true for EVERY idx and
        # therefore flushed a progress line on every single path.
        if idx % 100 == 0:
            misc.printstatusFlush("[ PATH FILLING ]\t" + \
                misc.reportProgress(idx + 1, n_paths))

        filled_path = []
        # Check outgoing edges from both start and target in full_graph.
        # If they are connected to both sides, add them to junction.
        for junction in path:
            tigs, fractions = zip(*[(k, graph_building.compareGEMlibs(junction.barcodes, v)) \
                                    for k, v in GEMlist.items()])
            fracs = pd.Series(fractions, index=tigs)
            fracs = fracs[fracs > 0]

            # Original condition was `len(fracs > 0)` — the length of a
            # boolean Series, which happens to equal len(fracs); written
            # as intended here.
            if len(fracs) > 0:
                outliers = graph_building.calcOutliers(fracs, barcode_factor)
                # Add any outliers to junction.connections. Outlier index
                # labels end in a side character, stripped with [:-1].
                filled_path.append(
                    graph_building.Junction(junction.start,
                                            junction.target,
                                            junction.connections +
                                            [o[:-1] for o in list(outliers.index)]))

        filled_junction_paths.append(filled_path)

    # Final report uses the total count, so it also works when there were
    # no paths at all (the original referenced the loop variable `idx`,
    # which is undefined for an empty path list).
    misc.printstatus("[ PATH FILLING ]\t" + \
        misc.reportProgress(n_paths, n_paths))

    return filled_junction_paths
def build_scaffolds(paths, gapsize):
    '''Build scaffolds from paths in a single process.

    Args:
        paths (list): list of paths (lists of graph_building.Junction
            objects) to turn into scaffolds.
        gapsize (int): gap size to insert when scaffolding by gap
            introduction.

    Returns:
        tuple: (scaffold_sequences, scaffold_correspondences, all_edges,
            n_gaps, n_merges, bed)
    '''
    scaffold_sequences, scaffold_correspondences = {}, {}
    all_edges = []
    n_gaps, n_merges = 0, 0
    misc.printstatus("Number of paths: " + str(len(paths)))
    bed = {}  # To collect bed coordinates

    for idx, path in enumerate(paths):
        misc.printstatusFlush("[ SCAFFOLDING ]\t" +
                              misc.reportProgress(idx + 1, len(paths)))

        # (The original also built a flattened `linked_contigs` list of
        # every contig in the path here, but never used it — removed.)

        # Start overlapping
        filled_path, edges = combine_paths(path)

        # It is possible that there is no filled_path, in the case that the
        # path had a single junction which had a None at junction.start or
        # junction.target and no overlaps were found. In this case, continue.
        if filled_path:
            all_edges.extend(edges)

            # Create scaffold
            scaffold_sequence, included, ng, nm, bed_coords = mergeSeq(
                filled_path, gapsize)
            scaffold_sequences["scaffold_" + str(idx)] = scaffold_sequence
            scaffold_correspondences["scaffold_" + str(idx)] = included
            bed["scaffold_" + str(idx)] = bed_coords
            n_gaps += ng
            n_merges += nm

    # Use the total count so the final report also works for an empty
    # `paths` (the original referenced the loop variable after the loop).
    misc.printstatus("[ SCAFFOLDING ]\t" +
                     misc.reportProgress(len(paths), len(paths)))
    return scaffold_sequences, scaffold_correspondences, all_edges, \
        n_gaps, n_merges, bed
def pairwise_comparisons(GEMlist):
    '''Perform all pairwise barcode comparisons between windows in GEMlist.

    The comparison is symmetric, so each fraction is computed once for the
    upper triangle and mirrored into the lower triangle.

    Args:
        GEMlist (dict): window name -> barcode collection.

    Returns:
        GEMcomparison (pd.DataFrame): square matrix of shared-barcode
            fractions, indexed by window name on both axes.
    '''
    # Compare the barcodes in every region to all other regions
    GEMcomparison = pd.DataFrame(np.zeros((len(GEMlist), len(GEMlist))),
                                 index=GEMlist.keys())
    GEMcomparison.columns = GEMcomparison.index

    # Iterate over rows in GEMcomparison. The positional index lets us skip
    # fractions already computed for the lower triangle.
    for idx, region1 in enumerate(GEMcomparison.index):
        lib1 = GEMlist[region1]

        # Report progress every 20 windows
        if idx % 20 == 0:
            misc.printstatusFlush("[ BARCODE COMPARISON ]\t" +
                                  misc.reportProgress(idx + 1, len(GEMlist)))

        fractions = [
            compareGEMlibs(lib1, GEMlist[col])
            for col in GEMcomparison.columns[idx:]
        ]

        # Positional assignment replaces the original chained indexing
        # (`GEMcomparison.loc[region1][idx:] = ...`), which relies on the
        # first lookup returning a view and silently stops updating the
        # frame under pandas copy-on-write.
        GEMcomparison.iloc[idx, idx:] = fractions  # row values from idx
        GEMcomparison.iloc[idx:, idx] = fractions  # column values from idx

    misc.printstatus("[ BARCODE COMPARISON ]\t" +
                     misc.reportProgress(len(GEMlist), len(GEMlist)))
    return GEMcomparison
def main(input_bam, contig_dict, region_size=20000, mapq=60, bc_quant=2):
    '''Collect barcode sets from the end windows of each contig.

    Args:
        input_bam (str): path to the position-sorted, indexed bam file.
        contig_dict (dict): contig name -> contig length.
        region_size (int): size of the window at each contig end.
        mapq (int): minimum mapping quality cutoff (-q).
        bc_quant (int): minimum barcode quantity, forwarded to collectGEMs.

    Returns:
        dict: region name -> barcode collection. Region names carry a
            trailing side character; "a" marks backbone windows.
    '''
    global samfile
    samfile = pysam.AlignmentFile(input_bam, "rb")
    GEMlist = {}  # Inappropriately named "list"

    # First step is to collect all barcodes (passing -q cutoff) that are
    # aligned to each contig's first and last regions (-l)
    misc.printstatus("Starting barcode collection. Found {0} contigs.".format(
        len(contig_dict.keys())))

    try:
        # Generate windows
        windows = getWindows(region_size, contig_dict)

        # Iterate over windows to collect barcode sets
        for idx, window in enumerate(windows):
            # Unpack variables, for readability
            region, contig, start, end = \
                window[0], window[0][:-1], window[1], window[2]

            # Print progress every 20 windows. The window total depends on
            # whether we run on the backbone ("a" windows, one per contig)
            # or on small contigs (two windows per contig).
            if idx % 20 == 0:
                if region[-1] == "a":
                    misc.printstatusFlush("[ BARCODE COLLECTION ]\t" + \
                        misc.reportProgress(idx, len(contig_dict.keys())))
                else:
                    misc.printstatusFlush("[ BARCODE COLLECTION ]\t" + \
                        misc.reportProgress(idx, len(contig_dict.keys())*2))

            # Collect barcodes from the window
            GEMs = collectGEMs((contig, start, end), mapq, bc_quant)

            # Keep the window only if it has more than 100 barcodes
            # (the original comment said "at least 100"; the code says >).
            if len(GEMs) > 100:
                GEMlist[region] = GEMs

        # NOTE(review): `region` leaks from the loop above — this final
        # report raises NameError if there were no windows at all.
        if region[-1] == "a":
            misc.printstatus("[ BARCODE COLLECTION ]\t" + \
                misc.reportProgress(len(contig_dict.keys()), len(contig_dict.keys())))
        else:
            misc.printstatus("[ BARCODE COLLECTION ]\t" + \
                misc.reportProgress(len(contig_dict.keys())*2, len(contig_dict.keys())*2))
    finally:
        # Close the bam handle even if window generation or barcode
        # collection raises (the original leaked it on error).
        samfile.close()

    return GEMlist
def main():
    '''Run the full ARBitR pipeline.

    Reads options from the module-level `args` namespace and writes the
    gfa / path / fasta / bed output files under the prefix from getOut().
    '''
    misc.printstatus("Starting ARBitR.")

    # Unpack arguments
    region_size = args.region_size
    molecule_size = args.molecule_size
    mapq = args.mapq
    n_proc = args.n_proc
    short_mapq = args.short_mapq
    short_bc_quant = args.short_bc_quant
    short_bc_factor = args.short_barcode_factor
    barcode_factor = args.barcode_factor
    barcode_fraction = args.barcode_fraction
    mincov = args.coverage
    bc_quantity = args.bc_quantity
    gapsize = 100  # fixed gap size (bp) used when scaffolding without overlap

    # Windows must not be larger than the expected molecule size;
    # otherwise fall back to the defaults.
    if region_size > molecule_size:
        misc.printstatus("Larger --region_size than --molecule_size detected. \
Using default values instead.")
        region_size, molecule_size = 20000, 45000

    outfilename = getOut()  # Create a prefix for output files

    # Read contig names and lengths from the bam header.
    samfile = pysam.AlignmentFile(args.input_bam, "rb")
    input_contig_lengths = dict(zip(samfile.references, samfile.lengths))
    samfile.close()

    # Split dataset into backbone (long) and small contigs; only contigs
    # longer than a molecule can anchor the link graph.
    misc.printstatus("Collecting contigs.")
    backbone_contig_lengths = {ctg: length for ctg, length
                               in input_contig_lengths.items()
                               if length > molecule_size}
    # Everything not in the backbone (dict-view set difference).
    small_contig_lengths = {k: input_contig_lengths[k]
                            for k in input_contig_lengths.keys()
                            - backbone_contig_lengths.keys()}

    # First step is to collect the barcodes for the backbone graph
    misc.printstatus("Collecting barcodes for linkgraph.")
    GEMlist = barcode_collection.main(args.input_bam,
                                      backbone_contig_lengths,
                                      region_size,
                                      mapq,
                                      bc_quantity)

    # Second step is to build the link graph based on the barcodes
    misc.printstatus("Creating link graph.")
    backbone_graph = graph_building.main(backbone_contig_lengths,
                                         GEMlist,
                                         barcode_factor,
                                         barcode_fraction)

    misc.printstatus(
        "Writing link graph to {}.backbone.gfa.".format(outfilename))
    writeGfa(outfilename + ".backbone", backbone_contig_lengths,
             backbone_graph)

    # Third step is to traverse the graph and build paths
    misc.printstatus("Finding paths.")
    backbone_graph.unambiguousPaths()  # Fill graph.paths
    misc.printstatus("Found {} paths.".format(len(backbone_graph.paths)))
    writePaths(outfilename + ".pre-fill",
               {str(idx): path for idx, path in enumerate(backbone_graph.paths)})

    # Fourth step is to collect the barcodes from the input bam file,
    # this time for the small contigs
    misc.printstatus("Collecting barcodes from short contigs.")
    GEMlist = barcode_collection.main(args.input_bam,
                                      small_contig_lengths,
                                      molecule_size,
                                      short_mapq,
                                      short_bc_quant)

    # Fifth step is to pull the short contigs into the linkgraph junctions
    # by matching their barcodes against the junction barcodes.
    paths = fill_junctions.fillJunctions(backbone_graph, GEMlist,
                                         short_bc_factor)
    writePaths(outfilename + ".pre-merge",
               {str(idx): path for idx, path in enumerate(paths)})

    if os.path.isfile(args.input_fasta):
        # If user gave an assembly fasta file, use this for merging
        misc.printstatus("Found fasta file for merging: {}".format(
            args.input_fasta))
        new_scaffolds, \
            scaffold_correspondence, \
            bed = merge_fasta.main(args.input_fasta,
                                   args.input_bam,
                                   paths,
                                   mincov,
                                   gapsize,
                                   n_proc)
        misc.printstatus(
            "Writing merged fasta to {0}.fasta".format(outfilename))
        writeFasta(outfilename, new_scaffolds)
        writePaths(outfilename + ".correspondence", scaffold_correspondence)
        writeBed(outfilename, bed)
    else:
        misc.printstatus("No fasta file found for merging. Pipeline finished.")

    # Cleanup temp files left behind by earlier stages.
    tmp_files = [file for file in os.listdir() if file.endswith(".tmp.fasta")]
    for tmp in tmp_files:
        os.remove(tmp)

    misc.printstatus("ARBitR successfully completed!\n")
def makeEdges(GEMcomparison, barcode_factor, min_barcode_fraction):
    '''Create edges from the GEMcomparison dataframe.

    Args:
        GEMcomparison (pd.DataFrame): All-against-all comparison of the
            windows' barcodes.
        barcode_factor (int): Factor for calculating outliers.
        min_barcode_fraction (float): Minimum fraction of shared barcodes
            to create an edge in the linkgraph.

    Returns:
        list: Edges (region, connected_window, fraction) inferred from the
            fractions of shared barcodes.
    '''
    misc.printstatus("Number of windows: " + str(len(GEMcomparison.keys())))
    edges = []

    # NOTE(review): this dumps the full fraction matrix to ./fractions.txt
    # in the working directory on every run — looks like a debug artifact;
    # confirm it is still wanted. (A large commented-out legacy outlier
    # method using esd.getOutliers_QC was removed from this loop.)
    with open("fractions.txt", "w") as out:
        for f in GEMcomparison.index:
            out.write("{}\t".format(f))
        out.write("\n")

        # Iterate over rows in GEMcomparison
        for idx, (region, fractions) in enumerate(GEMcomparison.iterrows()):
            contig = region[:-1]  # strip the trailing side character

            out.write(region + "\t")
            for f in fractions:
                out.write("{}\t".format(f))
            out.write("\n")

            # Report progress every 100 windows
            if idx % 100 == 0:
                misc.printstatusFlush(
                    "[ BARCODE LINKING ]\t" +
                    misc.reportProgress(idx, len(GEMcomparison)))

            # Ignore comparisons to the same contig and calculate outliers.
            # In low coverage datasets the amount of 0's might cloud any
            # actual signal, so zero fractions are dropped first.
            fractions = fractions.drop(labels=[contig + "s", contig + "e"],
                                       errors="ignore")
            fractions = fractions[fractions > 0]

            if len(fractions) > 0:
                minor_outliers = calcOutliers(fractions, barcode_factor)
                minor_outliers = minor_outliers[
                    minor_outliers > min_barcode_fraction]

                # Series.items() replaces Series.iteritems(), which was
                # deprecated and removed in pandas 2.0.
                for ix, mo in minor_outliers.items():
                    edges.append((region, ix, mo))

    misc.printstatus(
        "[ BARCODE LINKING ]\t" +
        misc.reportProgress(len(GEMcomparison), len(GEMcomparison)))
    return edges
def trimSequences(paths, mincov):
    '''Trim away low quality regions of input sequences.

    Description: Because de novo assembled contigs often end in low quality
    regions that are of too poor sequence to find good overlaps between, we
    want to trim input contigs of regions where reads don't map. Only trim
    regions where there is a potential overlap, i.e. NOT at the start of
    the first contig and end of the last contig in a path.

    Args:
        paths (list): list of lists. Each nested list contains ordered
            graph_building.Junction objects.
        mincov (int): Trim contig ends with lower average coverage than
            this value.

    Returns:
        dict: trimmed_fasta_coords. Keys: input contig headers, values:
            start and end coordinates to keep, plus True/False flags for
            whether the start and end were trimmed.
    '''
    # trimmed_fasta_coords is a dict with coords to keep from original fasta
    # Format: {contig: [start_coord, end_coord, start_trimmed, end_trimmed]}
    # Start by filling with full-length coords, which are then adjusted.
    trimmed_fasta_coords = {}
    for idx, ctg in enumerate(fastafile.references):
        trimmed_fasta_coords[ctg] = [0, fastafile.lengths[idx], False, False]

    # Then find new coordinates for all sequences to merge
    for idx, path in enumerate(paths):
        # Report progress every 5 paths
        if idx % 5 == 0:
            misc.printstatusFlush("[ TRIMMING ]\t" +
                                  misc.reportProgress(idx + 1, len(paths)))

        for junction in path:
            # Junction endpoints are "<contig><side>" strings; split off
            # the trailing side character ("s" = start, "e" = end).
            if junction.start is not None:
                start_tig, start_side = junction.start[:-1], junction.start[-1]
            else:
                start_tig, start_side = None, None
            if junction.target is not None:
                target_tig, target_side = junction.target[:-1], junction.target[-1]
            else:
                target_tig, target_side = None, None
            connections = junction.connections

            # Trim the sides of contigs where a junction is formed,
            # and don't trim places where there are no junctions.
            # The boolean flags record that a side was trimmed so it is
            # never trimmed twice.
            if start_side == "s" \
                    and not trimmed_fasta_coords[start_tig][2]:
                trimmed_fasta_coords[start_tig] = [
                    trimmed_fasta_coords[start_tig][0] +
                    trimFasta(trimmed_fasta_coords, junction.start, mincov),
                    trimmed_fasta_coords[start_tig][1],
                    True,
                    trimmed_fasta_coords[start_tig][3]]
            elif start_side == "e" \
                    and not trimmed_fasta_coords[start_tig][3]:
                trimmed_fasta_coords[start_tig] = [
                    trimmed_fasta_coords[start_tig][0],
                    trimmed_fasta_coords[start_tig][1] +
                    trimFasta(trimmed_fasta_coords, junction.start, mincov),
                    trimmed_fasta_coords[start_tig][2],
                    True]

            if target_side == "s" \
                    and not trimmed_fasta_coords[target_tig][2]:
                trimmed_fasta_coords[target_tig] = [
                    trimmed_fasta_coords[target_tig][0] +
                    trimFasta(trimmed_fasta_coords, junction.target, mincov),
                    trimmed_fasta_coords[target_tig][1],
                    True,
                    trimmed_fasta_coords[target_tig][3]]
            elif target_side == "e" \
                    and not trimmed_fasta_coords[target_tig][3]:
                trimmed_fasta_coords[target_tig] = [
                    trimmed_fasta_coords[target_tig][0],
                    trimmed_fasta_coords[target_tig][1] +
                    trimFasta(trimmed_fasta_coords, junction.target, mincov),
                    trimmed_fasta_coords[target_tig][2],
                    True]

            # Also trim everything in connections (both ends at once).
            # NOTE(review): if exactly one end of `conn` was already
            # trimmed, this trims BOTH ends again — confirm intended.
            for conn in connections:
                if not trimmed_fasta_coords[conn][2] \
                        or not trimmed_fasta_coords[conn][3]:
                    trimmed_fasta_coords[conn] = [
                        trimmed_fasta_coords[conn][0] +
                        trimFasta(trimmed_fasta_coords, conn + "s", mincov),
                        trimmed_fasta_coords[conn][1] +
                        trimFasta(trimmed_fasta_coords, conn + "e", mincov),
                        True, True]

    # Use the total count so the final report also works for empty `paths`
    # (the original referenced the loop variable after the loop).
    misc.printstatus("[ TRIMMING ]\t" +
                     misc.reportProgress(len(paths), len(paths)))
    return trimmed_fasta_coords
def main(input_fasta, input_bam, paths, mincov, gapsize, n_proc):
    '''Controller for merge_fasta.

    Args:
        input_fasta (str): Path to fasta file to create scaffolds from.
        input_bam (str): Path to bam file of reads mapped to input_fasta.
        paths (list): list of lists containing graph_building.Junction
            objects describing the paths inferred previously during the
            pipeline.
        mincov (int): Minimum average coverage for trimming.
        gapsize (int): Gap size when scaffolding by gap introduction.
        n_proc (int): Number of processes to run during scaffolding.

    Returns:
        dict: scaffolded fasta to output. Keys: fasta headers. Values: the
            resulting sequence.
        dict: correspondence, which contigs went into which scaffold.
        dict: bed coordinates per scaffold.
    '''
    global samfile
    global fastafile
    fastafile = pysam.FastaFile(input_fasta)
    samfile = pysam.AlignmentFile(input_bam, "rb")

    try:
        # Get trim coordinates based on read mappings in samfile
        misc.printstatus("Trimming contig ends...")
        trimmed_fasta_coords = trimSequences(paths, mincov)

        # Trim fasta
        global trimmed_fasta
        trimmed_fasta = {}
        for tig in samfile.references:
            trimmed_fasta[tig] = fastafile.fetch(
                reference=tig,
                start=trimmed_fasta_coords[tig][0],
                end=trimmed_fasta_coords[tig][1])
    finally:
        # Close the handles even if trimming raises (originally leaked).
        samfile.close()
        fastafile.close()

    # Start finding overlaps
    misc.printstatus("Creating scaffolds...")
    if n_proc == 1:
        scaffold_sequences, scaffold_correspondences, all_edges, \
            n_gaps, n_merges, bed = build_scaffolds(paths, gapsize)
    else:
        scaffold_sequences, scaffold_correspondences, all_edges, \
            n_gaps, n_merges, bed = mp_build_scaffolds(paths, gapsize, n_proc)
    all_edges = [edge for ls in all_edges for edge in ls]

    misc.printstatus("Scaffolding completed.")
    misc.printstatus("Number of aligned merges: {}".format(str(n_merges)))
    misc.printstatus("Number of gaps introduced: {}".format(str(n_gaps)))

    #complete_overlap_graph = Overlapgraph(list(trimmed_fasta.keys()), all_edges)
    #writeGfa(complete_overlap_graph)

    # Collect contigs that were not put into a scaffold
    misc.printstatus("Collecting leftover sequences.")
    # A set gives O(1) membership tests below; the original used a list,
    # making the leftover scan O(n^2).
    used_contigs = {
        tig for value in scaffold_correspondences.values() for tig in value}
    leftover_contigs = [
        ctg for ctg in trimmed_fasta.keys() if ctg not in used_contigs]
    for idx, tig in enumerate(leftover_contigs):
        scaffold_sequences["unplaced_contig_" + str(idx)] = trimmed_fasta[tig]
        scaffold_correspondences["unplaced_contig_" + str(idx)] = [tig]

    return scaffold_sequences, scaffold_correspondences, bed