Example #1
def mp_build_scaffolds(paths, gapsize, n_proc):
    '''Multiprocessing version of build_scaffolds: scaffold each path in a
    worker process and collect the merged sequences, correspondence lists,
    bed coordinates and gap/merge counts.
    '''
    scaffold_sequences, scaffold_correspondences = {}, {}
    all_edges = []
    n_gaps, n_merges = 0, 0
    misc.printstatus("Number of paths: " + str(len(paths)))
    bed = {}  # To collect bed coordinates

    pool = multiprocessing.Pool(n_proc)
    result = pool.map_async(process_scaffold, paths)
    pool.close()  # No more tasks will be submitted to this pool

    # Poll the AsyncResult to report progress while the workers run
    # (_number_left is a private, undocumented attribute of AsyncResult).
    while not result.ready():
        misc.printstatusFlush(
            "[ SCAFFOLDING ]\t" +
            misc.reportProgress(len(paths) - result._number_left, len(paths)))
        time.sleep(4)

    # Get the results, remove Nones and wait for the workers to exit.
    mp_output = [i for i in result.get() if i]
    pool.join()
    misc.printstatus("[ SCAFFOLDING ]\t" +
                     misc.reportProgress(len(paths), len(paths)))

    # Unpack multiprocessing data results
    for idx, dat in enumerate(mp_output):
        scaffold_sequence, included, ng, nm, bed_coords = dat
        scaffold_sequences["scaffold_" + str(idx)] = scaffold_sequence
        scaffold_correspondences["scaffold_" + str(idx)] = included
        bed["scaffold_" + str(idx)] = bed_coords
        n_gaps += ng
        n_merges += nm

    return scaffold_sequences, scaffold_correspondences, all_edges, n_gaps, n_merges, bed
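
A minimal, self-contained sketch of the progress-polling pattern used above: map_async() returns an AsyncResult whose private _number_left attribute is read while the workers run (the worker function and numbers below are purely illustrative).

import multiprocessing
import time

def slow_square(x):
    # Stand-in for process_scaffold: any picklable worker function
    time.sleep(0.1)
    return x * x

if __name__ == "__main__":
    items = list(range(50))
    with multiprocessing.Pool(4) as pool:
        result = pool.map_async(slow_square, items)
        while not result.ready():
            done = len(items) - result._number_left  # private, undocumented attribute
            print("progress: {}/{}".format(done, len(items)), end="\r")
            time.sleep(0.5)
        print("\nfinished with", len(result.get()), "results")
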
Example #2
def fillJunctions(backbone_graph, GEMlist, barcode_factor):
    '''Fill the Linkgraph junctions with short contigs.

    The connections of each junction are filled using the junction's
    barcodes: short contigs whose barcode sets match those of the junction
    are inserted into its connections.

    Args:
        backbone_graph (Linkgraph)
        GEMlist (dict)
        barcode_factor (int)

    Returns:
        list: list of paths with junctions filled.
    '''

    filled_junction_paths = []

    # Iterate over paths and every junction in the path
    # Create a barcode comparison of the junction and all small contigs
    for idx, path in enumerate(backbone_graph.paths):
        # Report progress for every path
        misc.printstatusFlush("[ PATH FILLING ]\t" + \
        misc.reportProgress(idx+1, len(backbone_graph.paths)))

        filled_path = []

        # Compare the barcodes of each junction to those of every short
        # contig and add outlier matches to the junction's connections.
        for junction in path:
            tigs, fractions = zip(*[(k, graph_building.compareGEMlibs(junction.barcodes, v)) \
                                    for k,v in GEMlist.items()])
            fracs = pd.Series(fractions, index=tigs)
            fracs = fracs[fracs > 0]

            if len(fracs) > 0:
                outliers = graph_building.calcOutliers(fracs, barcode_factor)

                # Old outlier method:
                #outliers = esd.getOutliers_QC(np.array(fractions),tigs,10)

                # Add any outliers to junction.connections
                filled_path.append( graph_building.Junction(junction.start, \
                                                            junction.target, \
                                                            junction.connections + \
                                                            [ o[:-1] for o in list(outliers.index)] ))

        filled_junction_paths.append(filled_path)

    misc.printstatus("[ PATH FILLING ]\t" + \
    misc.reportProgress(idx+1, len(backbone_graph.paths)))

    return filled_junction_paths
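
graph_building.compareGEMlibs is not reproduced among these examples. A plausible stand-in that returns the fraction of barcodes shared between a junction and a short contig's window might look like the sketch below; this is an assumption about its behaviour, not the actual ARBitR formula.

def shared_barcode_fraction(barcodes_a, barcodes_b):
    # Jaccard-style fraction of shared barcodes; purely illustrative.
    a, b = set(barcodes_a), set(barcodes_b)
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

# e.g. shared_barcode_fraction({"BC1", "BC2"}, {"BC2", "BC3"}) -> 1/3
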
Example #3
def build_scaffolds(paths, gapsize):
    '''Single-process scaffolding: overlap and merge the contigs along each
    path into scaffold sequences, collecting correspondence lists, bed
    coordinates and gap/merge counts.
    '''
    scaffold_sequences, scaffold_correspondences = {}, {}
    all_edges = []
    n_gaps, n_merges = 0, 0
    misc.printstatus("Number of paths: " + str(len(paths)))
    bed = {}  # To collect bed coordinates

    for idx, path in enumerate(paths):
        misc.printstatusFlush("[ SCAFFOLDING ]\t" +
                              misc.reportProgress(idx + 1, len(paths)))

        # Collect all relevant sequences from fasta
        linked_contigs = [ [junction.start[:-1], junction.target[:-1]] + \
                            junction.connections for junction in path \
                            if junction.start and junction.target]
        linked_contigs = [
            step for partial_path in linked_contigs for step in partial_path
        ]

        # Start overlapping
        filled_path, edges = combine_paths(path)
        # It is possible that there is no filled_path, in the case that the
        # path had a single junction which had a None at junction.start or
        # junction.target and no overlaps were found. In this case, continue.
        if filled_path:
            all_edges.extend(edges)

            # Create scaffold
            scaffold_sequence, included, ng, nm, bed_coords = mergeSeq(
                filled_path, gapsize)
            scaffold_sequences["scaffold_" + str(idx)] = scaffold_sequence
            scaffold_correspondences["scaffold_" + str(idx)] = included
            bed["scaffold_" + str(idx)] = bed_coords
            n_gaps += ng
            n_merges += nm

    misc.printstatus("[ SCAFFOLDING ]\t" +
                     misc.reportProgress(idx + 1, len(paths)))

    return scaffold_sequences, scaffold_correspondences, all_edges, n_gaps, n_merges, bed
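
combine_paths and mergeSeq are defined elsewhere in merge_fasta. As a rough illustration of the gap-introduction part of scaffolding only (the real mergeSeq also merges overlapping contig ends and counts merges), joining trimmed sequences with N-runs of length gapsize while recording bed-style coordinates could look like:

def join_with_gaps(sequences, gapsize):
    # Simplified sketch: concatenate sequences, separated by gaps of N's,
    # and record the (start, end) of each contig within the scaffold.
    scaffold, bed, n_gaps = "", [], 0
    for i, seq in enumerate(sequences):
        if i > 0:
            scaffold += "N" * gapsize
            n_gaps += 1
        bed.append((len(scaffold), len(scaffold) + len(seq)))
        scaffold += seq
    return scaffold, bed, n_gaps
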
Example #4
def pairwise_comparisons(GEMlist):
    '''
    Performs all pairwise comparisons between windows in GEMlist.

    Returns:
        GEMcomparison (pd.DataFrame)
    '''
    # Compare the barcodes in every region to all other regions
    GEMcomparison = pd.DataFrame(np.zeros(( len(GEMlist), len(GEMlist) )), \
                                index=GEMlist.keys())
    GEMcomparison.columns = GEMcomparison.index

    # Iterate over rows in GEMcomparison
    # Index to keep track of position so we can skip calculating some fractions
    # twice
    idx = 0
    for idx, region1 in enumerate(GEMcomparison.index):
        lib1 = GEMlist[region1]

        # Report progress every 20 windows
        if idx % 20 == 0:
            misc.printstatusFlush("[ BARCODE COMPARISON ]\t" +
                                  misc.reportProgress(idx + 1, len(GEMlist)))

        fractions = [
            compareGEMlibs(lib1, GEMlist[col])
            for col in GEMcomparison.columns[idx:]
        ]

        # The matrix is symmetric: update both the row and the column from idx
        GEMcomparison.loc[region1, GEMcomparison.columns[idx:]] = fractions
        GEMcomparison.loc[GEMcomparison.index[idx:], region1] = fractions

    misc.printstatus("[ BARCODE COMPARISON ]\t" +
                     misc.reportProgress(idx + 1, len(GEMlist)))

    return GEMcomparison
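
Only the values from the diagonal onward are computed and then mirrored into both the row and the column. A small self-contained illustration of that fill pattern, with a placeholder compare() standing in for compareGEMlibs:

import numpy as np
import pandas as pd

def compare(a, b):
    # Placeholder similarity: fraction of shared elements.
    return len(a & b) / max(len(a | b), 1)

libs = {"w1": {1, 2, 3}, "w2": {2, 3, 4}, "w3": {9}}
mat = pd.DataFrame(np.zeros((len(libs), len(libs))),
                   index=list(libs), columns=list(libs))
for i, row in enumerate(mat.index):
    vals = [compare(libs[row], libs[col]) for col in mat.columns[i:]]
    mat.loc[row, mat.columns[i:]] = vals   # fill the row from the diagonal on
    mat.loc[mat.columns[i:], row] = vals   # mirror into the column
print(mat)
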
Example #5
def main(input_bam, contig_dict, region_size=20000, mapq=60, bc_quant=2):
    global samfile
    samfile = pysam.AlignmentFile(input_bam, "rb")
    GEMlist = {}  # Inappropriately named "list"

    # First step is to collect all barcodes (passing -q cutoff) that are aligned
    # to each contig's first and last regions (-l)
    misc.printstatus("Starting barcode collection. Found {0} contigs.".format(
        len(contig_dict.keys())))

    # Generate windows
    windows = getWindows(region_size, contig_dict)

    # Iterate over windows to collect barcodes sets
    for idx, window in enumerate(windows):
        # Unpack variables, for readability
        region, contig = window[0], window[0][:-1]
        start, end = window[1], window[2]

        # Print progress. The number of windows depends on whether we are
        # running on the backbone or on the small contigs.
        if idx % 20 == 0:
            if region[-1] == "a":
                misc.printstatusFlush("[ BARCODE COLLECTION ]\t" + \
                misc.reportProgress(idx, len(contig_dict.keys())))
            else:
                misc.printstatusFlush("[ BARCODE COLLECTION ]\t" + \
                misc.reportProgress(idx, len(contig_dict.keys())*2))

        # Collect barcodes from the window
        GEMs = collectGEMs((contig, start, end), mapq, bc_quant)

        # Keep the window if more than 100 barcodes were collected
        if len(GEMs) > 100:
            GEMlist[region] = GEMs

    if region[-1] == "a":
        misc.printstatus("[ BARCODE COLLECTION ]\t" + \
        misc.reportProgress(len(contig_dict.keys()), len(contig_dict.keys())))
    else:
        misc.printstatus("[ BARCODE COLLECTION ]\t" + \
        misc.reportProgress(len(contig_dict.keys())*2, len(contig_dict.keys())*2))
    samfile.close()

    return GEMlist
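
collectGEMs and getWindows are defined elsewhere in barcode_collection. Assuming that barcodes are 10x-style BX tags and that bc_quant is a minimum read-support threshold per barcode (both assumptions), a stand-in for the per-window barcode collection could be sketched with pysam as:

from collections import Counter
import pysam

def collect_barcodes(bam_path, contig, start, end, mapq, bc_quant):
    # Count BX barcode tags on reads in the window that pass the mapping
    # quality cutoff, and keep barcodes seen at least bc_quant times.
    counts = Counter()
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        for read in bam.fetch(contig, start, end):
            if read.mapping_quality >= mapq and read.has_tag("BX"):
                counts[read.get_tag("BX")] += 1
    return {bc for bc, n in counts.items() if n >= bc_quant}
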
Example #6
def main():
    misc.printstatus("Starting ARBitR.")

    # Unpack arguments
    region_size = args.region_size
    molecule_size = args.molecule_size
    mapq = args.mapq
    n_proc = args.n_proc
    short_mapq = args.short_mapq
    short_bc_quant = args.short_bc_quant
    short_bc_factor = args.short_barcode_factor
    barcode_factor = args.barcode_factor
    barcode_fraction = args.barcode_fraction
    mincov = args.coverage
    bc_quantity = args.bc_quantity
    gapsize = 100

    if region_size > molecule_size:
        misc.printstatus("Larger --region_size than --molecule_size detected. \
                            Using default values instead.")
        region_size, molecule_size = 20000, 45000

    outfilename = getOut()  # Create a prefix for output files
    samfile = pysam.AlignmentFile(args.input_bam, "rb")
    input_contig_lengths = dict(zip(samfile.references, samfile.lengths))
    samfile.close()

    # Split dataset into backbone and small contigs
    misc.printstatus("Collecting contigs.")
    backbone_contig_lengths = { ctg:length for ctg, length \
                                in input_contig_lengths.items() \
                                if length > molecule_size}
    small_contig_lengths = {    k:input_contig_lengths[k] \
                                for k in input_contig_lengths.keys() \
                                - backbone_contig_lengths.keys()}

    # First step is to collect the barcodes for the backbone graph
    misc.printstatus("Collecting barcodes for linkgraph.")
    GEMlist = barcode_collection.main(  args.input_bam, \
                                        backbone_contig_lengths, \
                                        region_size, \
                                        mapq, \
                                        bc_quantity)

    # Second step is to build the link graph based on the barcodes
    misc.printstatus("Creating link graph.")
    backbone_graph = graph_building.main(backbone_contig_lengths, \
                                        GEMlist, \
                                        barcode_factor, \
                                        barcode_fraction)

    misc.printstatus(
        "Writing link graph to {}.backbone.gfa.".format(outfilename))
    writeGfa(outfilename + ".backbone", backbone_contig_lengths,
             backbone_graph)

    # Third step is to traverse the graph and build paths
    misc.printstatus("Finding paths.")
    backbone_graph.unambiguousPaths()  # Fill graph.paths
    misc.printstatus("Found {} paths.".format(len(backbone_graph.paths)))
    writePaths( outfilename+".pre-fill", \
                {str(idx):path for idx, path in enumerate(backbone_graph.paths)})

    # Fourth step is to collect the barcodes from the input bam file,
    # this time for the small contigs
    misc.printstatus("Collecting barcodes from short contigs.")
    GEMlist = barcode_collection.main(  args.input_bam, \
                                        small_contig_lengths, \
                                        molecule_size, \
                                        short_mapq, \
                                        short_bc_quant)

    # Fifth step is to pull the short contigs into the linkgraph junctions,
    # if their barcodes match those of the junction, i.e. to fill the
    # junctions in the backbone_graph
    paths = fill_junctions.fillJunctions(backbone_graph, GEMlist,
                                         short_bc_factor)

    writePaths( outfilename+".pre-merge", \
                {str(idx):path for idx, path in enumerate(paths)})

    if os.path.isfile(args.input_fasta):
        # If user gave an assembly fasta file, use this for merging
        misc.printstatus("Found fasta file for merging: {}".format(
            args.input_fasta))
        new_scaffolds, \
        scaffold_correspondence, \
        bed = merge_fasta.main( args.input_fasta, \
                                args.input_bam, \
                                paths, \
                                mincov, \
                                gapsize, \
                                n_proc)
        misc.printstatus(
            "Writing merged fasta to {0}.fasta".format(outfilename))
        writeFasta(outfilename, new_scaffolds)
        writePaths(outfilename + ".correspondence", scaffold_correspondence)
        writeBed(outfilename, bed)

    else:
        misc.printstatus("No fasta file found for merging. Pipeline finished.")

    # Cleanup temp files
    tmp_files = [file for file in os.listdir() if file.endswith(".tmp.fasta")]
    for tmp in tmp_files:
        os.remove(tmp)

    misc.printstatus("ARBitR successfully completed!\n")
Example #7
def makeEdges(GEMcomparison, barcode_factor, min_barcode_fraction):
    '''Create edges from the GEMcomparison dataframe.

    Args:
        GEMcomparison (pd.DataFrame): All-against-all comparison of the
            windows' barcodes.
        barcode_factor (int): Factor for calculating outliers.
        min_barcode_fraction (float): Minimum fraction of shared barcodes to create
            an edge in the linkgraph.
    Returns:
        list: Edges inferred from the fractions of shared barcodes.
    '''

    misc.printstatus("Number of windows: " + str(len(GEMcomparison.keys())))
    edges = []

    with open("fractions.txt", "w") as out:
        for f in GEMcomparison.index:
            out.write("{}\t".format(f))
        out.write("\n")

        # Iterate over rows in GEMcomparison
        for idx, (region, fractions) in enumerate(GEMcomparison.iterrows()):
            contig = region[:-1]
            window = region[-1]

            out.write(region + "\t")
            for f in fractions:
                out.write("{}\t".format(f))
            out.write("\n")

            # Report progress every 100 windows
            if idx % 100 == 0:
                misc.printstatusFlush(
                    "[ BARCODE LINKING ]\t" +
                    misc.reportProgress(idx, len(GEMcomparison)))
            '''
            # Calculate outliers from the comparisons of window k to all other windows
            # outliers is a dict where each key is a connected window to region,
            # and value is the fraction of shared barcodes between region and window
            outliers = esd.getOutliers_QC(np.array(fractions),fractions.index,10)
            # Get rid of edges to the same contig.
            outliers = { k:v for k,v in outliers.items() if k[:-1] != region[:-1] \
                        and v > np.mean(fractions)}
            outliers = pd.Series(outliers)
            # If there are any outliers, i.e. edges to create, add them to the edges
            # list. Don't add edges for lower outliers (fractions < mean(fractions))
            # or where the fraction is less than
            # min_barcode_fraction (-f) and edges back to the same contig
            if len(outliers.keys()) > 1:
                sorted_outliers = outliers.sort_values(ascending = False)
                if sorted_outliers[0] > sorted_outliers[1] * barcode_factor:
                    outliers = outliers[outliers == sorted_outliers[0]]

            new_edges = [(region, connected_window, fraction) \
                        for connected_window, fraction in outliers.items()]

            # Let's try only writing single edges
            #if len(new_edges) == 1:
            for idx, mo in outliers.iteritems():
                edges.append( (region, idx, mo ) )

            '''

            # Ignore comparisons to the same contig, then calculate outliers.
            # In low-coverage datasets the number of zeros might cloud any
            # actual signal, so zero fractions are dropped first.
            fractions = fractions.drop(labels=[contig + "s", contig + "e"],
                                       errors="ignore")
            fractions = fractions[fractions > 0]
            if len(fractions) > 0:
                minor_outliers = calcOutliers(fractions, barcode_factor)
                minor_outliers = minor_outliers[
                    minor_outliers > min_barcode_fraction]

                for ix, mo in minor_outliers.items():
                    edges.append((region, ix, mo))

        misc.printstatus(
            "[ BARCODE LINKING ]\t" +
            misc.reportProgress(len(GEMcomparison), len(GEMcomparison)))

        return edges
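
calcOutliers itself is not shown in these examples. A simple stand-in with a similar shape of behaviour (an assumption, not ARBitR's actual outlier test) could flag fractions that exceed the series median by the given factor:

import pandas as pd

def calc_outliers_sketch(fractions, factor):
    # fractions: pd.Series of shared-barcode fractions indexed by window name.
    # Keep the values that exceed factor * median(fractions).
    cutoff = factor * fractions.median()
    return fractions[fractions > cutoff]

# e.g. calc_outliers_sketch(pd.Series({"tig2s": 0.40, "tig3e": 0.02,
#                                      "tig4s": 0.01}), factor=3)
# keeps only "tig2s".
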
Example #8
def trimSequences(paths, mincov):
    '''Trim away low quality regions of input sequences

    Description:
        Because de novo assembled contigs often end in low-quality regions
        where the sequence is too poor to find good overlaps, we want to
        trim input contigs of regions where reads don't map. Only regions
        where there is a potential overlap are trimmed, i.e. NOT the start
        of the first contig or the end of the last contig in a path.

    Args:
        paths (list): list of lists. Each nested list contains ordered
            graph_building.Junction objects.
        mincov (int): Trim contig ends with lower average coverage than this
            value
    Returns:
        dict: trimmed_fasta_coords. Keys: input contig headers, values:
            start and end coordinates to keep, in addition to True or False
            for start and end if they were trimmed or not.
    '''
    # trimmed_fasta_coords is a dict with coords to keep from original fasta
    # Format: {contig: [start_coord, end_coord, bool, bool]}
    # Start by filling with old coords, which will then be changed
    trimmed_fasta_coords = {}
    for idx, ctg in enumerate(fastafile.references):
        trimmed_fasta_coords[ctg] = [0, fastafile.lengths[idx], False, False]

    # Then find new coordinates for all sequences to merge
    for idx, path in enumerate(paths):
        # Report progress every 5 paths
        if idx % 5 == 0:
            misc.printstatusFlush("[ TRIMMING ]\t" +
                                  misc.reportProgress(idx + 1, len(paths)))

        for junction in path:
            if junction.start is not None:
                start_tig, start_side = junction.start[:-1], junction.start[-1]
            else:
                start_tig, start_side = None, None
            if junction.target is not None:
                target_tig, target_side = junction.target[:-1], junction.target[-1]
            else:
                target_tig, target_side = None, None
            connections = junction.connections

            # Trim the sides of contigs where a junction is formed,
            # and don't trim places where there are no junctions.
            if start_side == "s" and not trimmed_fasta_coords[start_tig][2]:
                old = trimmed_fasta_coords[start_tig]
                trimmed_fasta_coords[start_tig] = \
                    [old[0] + trimFasta(trimmed_fasta_coords, junction.start, mincov),
                     old[1], True, old[3]]
            elif start_side == "e" and not trimmed_fasta_coords[start_tig][3]:
                old = trimmed_fasta_coords[start_tig]
                trimmed_fasta_coords[start_tig] = \
                    [old[0],
                     old[1] + trimFasta(trimmed_fasta_coords, junction.start, mincov),
                     old[2], True]
            if target_side == "s" and not trimmed_fasta_coords[target_tig][2]:
                old = trimmed_fasta_coords[target_tig]
                trimmed_fasta_coords[target_tig] = \
                    [old[0] + trimFasta(trimmed_fasta_coords, junction.target, mincov),
                     old[1], True, old[3]]
            elif target_side == "e" and not trimmed_fasta_coords[target_tig][3]:
                old = trimmed_fasta_coords[target_tig]
                trimmed_fasta_coords[target_tig] = \
                    [old[0],
                     old[1] + trimFasta(trimmed_fasta_coords, junction.target, mincov),
                     old[2], True]

            # Also trim everything in connections
            for conn in connections:
                if not (trimmed_fasta_coords[conn][2] and trimmed_fasta_coords[conn][3]):
                    old = trimmed_fasta_coords[conn]
                    trimmed_fasta_coords[conn] = \
                        [old[0] + trimFasta(trimmed_fasta_coords, conn+"s", mincov),
                         old[1] + trimFasta(trimmed_fasta_coords, conn+"e", mincov),
                         True, True]
    misc.printstatus("[ TRIMMING ]\t" +
                     misc.reportProgress(idx + 1, len(paths)))

    return trimmed_fasta_coords
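
trimFasta is defined elsewhere in merge_fasta. Under the assumption that it walks inward from a contig end until the mean read depth reaches mincov (an assumption about its behaviour), an illustrative standalone version using pysam's count_coverage could look like this; the 500 bp window size is an arbitrary choice:

import numpy as np
import pysam

def trim_low_coverage_end(bam_path, contig, length, side, mincov, step=500):
    # Walk inward from the chosen contig end ("s" or "e") in windows of
    # `step` bp until the mean per-base depth reaches mincov; return the
    # number of bases to trim away from that end.
    trim = 0
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        while trim + step <= length:
            if side == "s":
                start, end = trim, trim + step
            else:
                start, end = length - trim - step, length - trim
            depth = np.sum(bam.count_coverage(contig, start, end), axis=0)
            if depth.mean() >= mincov:
                break
            trim += step
    return trim
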
Example #9
def main(input_fasta, input_bam, paths, mincov, gapsize, n_proc):
    '''Controller for merge_fasta.

    Args:
        input_fasta (str): Path to fasta file to create scaffolds from.
        input_bam (str): Path to bam file of reads mapped to input_fasta.
        paths (list): list of lists containing graph_building.Junction objects
            describing the paths inferred previously during the pipeline.
        mincov (int): Minimum average coverage for trimming.
        gapsize (int): Gap size when scaffolding by gap introduction.
        n_proc (int): Number of processes to run during scaffolding.
    Returns:
        dict: scaffolded fasta to output. Keys: fasta headers. Values: the
            resulting sequence.
        dict: correspondence, i.e. which contigs went into which scaffold.
        dict: bed coordinates of the merged contigs within each scaffold.
    '''

    global samfile
    global fastafile
    fastafile = pysam.FastaFile(input_fasta)
    samfile = pysam.AlignmentFile(input_bam, "rb")

    # Get trim coordinates based on read mappings in samfile
    misc.printstatus("Trimming contig ends...")
    trimmed_fasta_coords = trimSequences(paths, mincov)

    # Trim fasta
    global trimmed_fasta
    trimmed_fasta = {}
    for tig in samfile.references:
        trimmed_fasta[tig] = fastafile.fetch(reference=tig, \
                                             start=trimmed_fasta_coords[tig][0], \
                                             end=trimmed_fasta_coords[tig][1])
    samfile.close()
    fastafile.close()

    # Start finding overlaps
    misc.printstatus("Creating scaffolds...")

    if n_proc == 1:
        scaffold_sequences, scaffold_correspondences, all_edges, \
        n_gaps, n_merges, bed = build_scaffolds(paths, gapsize)
    else:
        scaffold_sequences, scaffold_correspondences, all_edges, \
        n_gaps, n_merges, bed = mp_build_scaffolds(paths, gapsize, n_proc)

    all_edges = [edge for ls in all_edges for edge in ls]

    misc.printstatus("Scaffolding completed.")
    misc.printstatus("Number of aligned merges: {}".format(str(n_merges)))
    misc.printstatus("Number of gaps introduced: {}".format(str(n_gaps)))
    #complete_overlap_graph = Overlapgraph(list(trimmed_fasta.keys()), all_edges)
    #writeGfa(complete_overlap_graph)

    # Collect contigs that were not put into a scaffold
    misc.printstatus("Collecting leftover sequences.")
    used_contigs = [
        tig for value in scaffold_correspondences.values() for tig in value
    ]
    leftover_contigs = [
        ctg for ctg in trimmed_fasta.keys() if ctg not in used_contigs
    ]

    for idx, tig in enumerate(leftover_contigs):
        scaffold_sequences["unplaced_contig_" + str(idx)] = trimmed_fasta[tig]
        scaffold_correspondences["unplaced_contig_" + str(idx)] = [tig]

    return scaffold_sequences, scaffold_correspondences, bed
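
A hypothetical call site, mirroring how the pipeline controller in Example #6 invokes merge_fasta.main (the file names and mincov value are illustrative; gapsize=100 matches the pipeline default, and paths would come from fill_junctions.fillJunctions):

scaffolds, correspondence, bed = main("assembly.fasta", "reads.bam", paths,
                                      mincov=20, gapsize=100, n_proc=4)
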