Example #1
def form_collapsed_clusters(sorted_fn,
                            max_hq_mismatches,
                            max_indels,
                            max_UMI_distance,
                            show_progress=True):

    collapsed_fn = '.'.join(sorted_fn.split(".")[:-1]) + ".collapsed.bam"

    yaml_fn = '.'.join(sorted_fn.split(".")[:-1]) + ".yaml"
    with open(yaml_fn, "r") as yaml_fh:
        stats = yaml.safe_load(yaml_fh)
    max_read_length = stats['max_read_length']
    total_reads = stats['total_reads']

    sorted_als = pysam.AlignmentFile(str(sorted_fn), check_sq=False)
    if show_progress:
        sorted_als = progress(sorted_als, total=total_reads, desc='Collapsing')

    cell_groups = utilities.group_by(sorted_als, cell_key)

    with pysam.AlignmentFile(str(collapsed_fn), 'wb', header=empty_header) as collapsed_fh:
        for cell_BC, cell_group in cell_groups:

            for UMI, UMI_group in utilities.group_by(cell_group, UMI_key):
                clusters = form_clusters(UMI_group, max_read_length, max_hq_mismatches)
                clusters = sorted(clusters, key=lambda c: c.get_tag(NUM_READS_TAG), reverse=True)

                for i, cluster in enumerate(clusters):
                    cluster.set_tag(CELL_BC_TAG, cell_BC, 'Z')
                    cluster.set_tag(UMI_TAG, UMI, 'Z')
                    cluster.set_tag(CLUSTER_ID_TAG, str(i), 'Z')

                biggest = clusters[0]
                rest = clusters[1:]

                not_collapsed = []

                for other in rest:
                    if other.get_tag(NUM_READS_TAG) == biggest.get_tag(NUM_READS_TAG):
                        not_collapsed.append(other)
                    else:
                        indels, hq_mismatches = align_clusters(biggest, other)

                        if indels <= max_indels and hq_mismatches <= max_hq_mismatches:
                            biggest = merge_annotated_clusters(biggest, other)
                        else:
                            not_collapsed.append(other)

                for cluster in [biggest] + not_collapsed:
                    annotation = cluster_Annotation(cell_BC=cluster.get_tag(CELL_BC_TAG),
                                                    UMI=cluster.get_tag(UMI_TAG),
                                                    num_reads=cluster.get_tag(NUM_READS_TAG),
                                                    cluster_id=cluster.get_tag(CLUSTER_ID_TAG),
                                                   )

                    cluster.query_name = str(annotation)
                    collapsed_fh.write(cluster)
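A minimal invocation sketch. The file names are hypothetical; the function expects a stats YAML (here sample.sorted.yaml) with max_read_length and total_reads keys next to the sorted BAM:

# Hypothetical inputs: a BAM sorted by cell barcode and UMI, plus the
# stats YAML written by the earlier sorting step.
form_collapsed_clusters('sample.sorted.bam',
                        max_hq_mismatches=3,
                        max_indels=2,
                        max_UMI_distance=1)
# -> writes sample.sorted.collapsed.bam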
Example #2
def collapse_pooled_UMI_outcomes(outcome_iter):
    def is_relevant(outcome):
        return (outcome.category != 'bad sequence' and outcome.outcome !=
                ('no indel', 'other', 'ambiguous'))

    all_outcomes = [o for o in outcome_iter if is_relevant(o)]
    all_outcomes = sorted(all_outcomes, key=lambda u: (u.UMI, u.cluster_id))

    all_collapsed_outcomes = []
    most_abundant_outcomes = []

    for UMI, UMI_outcomes in group_by(all_outcomes, lambda u: u.UMI):
        observed = set(u.outcome for u in UMI_outcomes)

        collapsed_outcomes = []
        for outcome in observed:
            relevant = [u for u in UMI_outcomes if u.outcome == outcome]
            representative = max(relevant, key=lambda u: u.num_reads)
            representative.num_reads = sum(u.num_reads for u in relevant)

            collapsed_outcomes.append(representative)
            all_collapsed_outcomes.append(representative)

        max_count = max(u.num_reads for u in collapsed_outcomes)
        has_max_count = [
            u for u in collapsed_outcomes if u.num_reads == max_count
        ]

        if len(has_max_count) == 1:
            most_abundant_outcomes.append(has_max_count[0])

    all_collapsed_outcomes = sorted(all_collapsed_outcomes,
                                    key=lambda u: (u.UMI, u.cluster_id))
    return all_collapsed_outcomes, most_abundant_outcomes
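A worked sketch under assumptions: Outcome is a hypothetical stand-in for the real pooled-UMI records, and the module-level group_by is assumed to behave like itertools.groupby on pre-sorted input.

from dataclasses import dataclass

@dataclass
class Outcome:  # hypothetical stand-in for the real outcome records
    UMI: str
    cluster_id: str
    category: str
    outcome: tuple
    num_reads: int

outcomes = [
    Outcome('AAAC', '0', 'indel', ('deletion', '5', '2'), 10),
    Outcome('AAAC', '1', 'indel', ('deletion', '5', '2'), 3),
    Outcome('AAAC', '2', 'indel', ('insertion', '7', '1'), 4),
]
collapsed, most_abundant = collapse_pooled_UMI_outcomes(outcomes)
# collapsed: two records for UMI 'AAAC', the deletion with num_reads
# summed to 13 and the insertion with 4 reads.
# most_abundant: only the deletion record, since 13 is a unique maximum.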
Example #3
def _consolidate_counts(positions_list):
    positions_list = sorted(positions_list)
    consolidated_list = []
    for position, items in utilities.group_by(
            positions_list,
            key=lambda x: x[:4],
    ):
        ref_seq_name, ref_pos, ref_char, read_char = position
        count = sum(item[4] for item in items)
        consolidated = (ref_seq_name, ref_pos, ref_char, read_char, count)
        consolidated_list.append(consolidated)
    return consolidated_list
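A worked example, assuming utilities.group_by groups consecutive items by key like itertools.groupby. Rows that agree on (reference name, position, reference base, read base) are merged and their counts summed:

positions = [
    ('chr1', 100, 'A', 'G', 2),
    ('chr1', 101, 'C', 'T', 1),
    ('chr1', 100, 'A', 'G', 3),
]
_consolidate_counts(positions)
# [('chr1', 100, 'A', 'G', 5), ('chr1', 101, 'C', 'T', 1)]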
Example #4
def make_sample_sheet(group_dir, target, guides):
    # Take the first 16 palette colors and chunk them into cycling blocks of
    # 4 related shades, one block per guide group.
    color_list = bokeh.palettes.Category20c_20[:16] #+ bokeh.palettes.Category20b_20
    color_groups = itertools.cycle(list(zip(*[iter(color_list)]*4)))

    sample_sheet = {}

    grouped_guides = utilities.group_by(sorted(guides), lambda n: n.split('-')[0])
    for (group_name, group), color_group in zip(grouped_guides, color_groups):
        for name, color in zip(group, color_group[1:]):
            sample_sheet[name] = {
                'fastq_fns': name + '.fastq',
                'target_info': target,
                'project': 'screen',
                'color': color,
            }

    sample_sheet_fn = group_dir / 'sample_sheet.yaml'
    sample_sheet_fn.write_text(yaml.dump(sample_sheet, default_flow_style=False))
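A usage sketch with hypothetical paths and guide names. Guides sharing the prefix before the first '-' (here BRAF and KRAS) are grouped and drawn from the same color block:

from pathlib import Path

guides = ['BRAF-1', 'BRAF-2', 'KRAS-1', 'KRAS-2']
make_sample_sheet(Path('/tmp/screen_group'), 'my_target_info', guides)
# writes /tmp/screen_group/sample_sheet.yaml with one entry per guide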
Example #5
def error_correct_allUMIs(sorted_fn,
                          max_UMI_distance,
                          sampleID,
                          log_fh=None,
                          show_progress=True):

    collapsed_fn = sorted_fn.with_name(sorted_fn.stem + '_ec.bam')
    log_fn = sorted_fn.with_name(sorted_fn.stem + '_umi_ec_log.txt')

    sorted_als = pysam.AlignmentFile(str(sorted_fn), check_sq=False)

    # group_by only works if sorted_als is already sorted by loc_key
    allele_groups = utilities.group_by(sorted_als, loc_key)

    num_corrected = 0
    total = 0

    with pysam.AlignmentFile(str(collapsed_fn), 'wb',
                             header=sorted_als.header) as collapsed_fh:
        for allele_bc, allele_group in allele_groups:
            # Initialize so these names are defined even when no correction
            # is performed for this allele group.
            num_corr, tot, erstring = 0, 0, ''
            if max_UMI_distance > 0:
                allele_group, num_corr, tot, erstring = error_correct_UMIs(
                    allele_group, sampleID, max_UMI_distance)

            for a in allele_group:
                collapsed_fh.write(a)

            if erstring:
                if log_fh is None:
                    print(erstring, end=' ')
                    sys.stdout.flush()
                else:
                    with open(log_fh, "a") as f:
                        f.write(erstring)

            num_corrected += num_corr
            total += tot

    percent = 100 * num_corrected / total if total else 0.0
    print(f"{num_corrected} UMIs corrected of {total} ({percent:.3f}%)",
          file=sys.stderr)
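A minimal call sketch; sorted_fn must be a pathlib.Path, since the output names are derived via .with_name() and .stem, and the file names here are hypothetical:

from pathlib import Path

error_correct_allUMIs(Path('sample.sorted.bam'),
                      max_UMI_distance=1,
                      sampleID='sample1',
                      log_fh='sample1_ec_log.txt')  # path appended to; None prints instead
# -> writes sample.sorted_ec.bam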
Example #6
def form_collapsed_clusters(
    sorted_fn: str,
    collapsed_fn: str,
    max_hq_mismatches: int,
    max_indels: int,
    cell_key: Callable[[pysam.AlignedSegment], str] = cell_key,
    UMI_key: Callable[[pysam.AlignedSegment], str] = UMI_key,
    method: Literal["cutoff", "likelihood"] = "cutoff",
    n_threads: int = 1,
):
    """Aggregates together aligned segments (reads) that share UMIs if their
    sequences are close.

    Clusters aligned segments within a UMI and with sequences are less than a
    threshold of mismatches from one another, effectively determining the
    sequence and number of reads for each UMI while accounting for sequencing
    errors. If ties do not exist, the most frequent sequence in reads with a
    certain UMI is chosen as the 'true' sequence for that UMI, otherwise a
    consensus is created between sequences. Then, it further attempts to
    cluster clusters with similar sequences after the consensuses are created.
    Clusters are represented by a single annotated aligned segment that records
    the UMI, cellBC, and number of reads. Then saves these annotated aligned
    segments representing clusters to a BAM file. Multiple clusters can be
    created for a given UMI, if there are multiple clusters with significantly
    different sequences that could not be resolved at this step.

    Args:
        sorted_fn: The file name of the sorted BAM.
        collapsed_fn: The file name of the collapsed BAM.
        max_hq_mismatches: A threshold specifying the maximum number of high
            quality mismatches between the sequences of 2 aligned segments to
            be collapsed.
        max_indels: A threshold specifying the maximum number of differing indels
            allowed between the sequences of 2 aligned segments to be collapsed.
        cell_key: A function that takes an alignment and returns the cell
            barcode.
        UMI_key: A function that takes an alignment and returns the UMI
            sequence.
        method: Which method to use to form initial sequence clusters. Must be
            one of the following:
            * cutoff: Uses a hard quality-score cutoff of 30; any mismatches
                below this quality are ignored. Initial sequence clusters are
                formed by selecting the most common base at each position (with
                quality at least 30).
            * likelihood: Uses the error probability encoded in the quality
                score. Initial sequence clusters are formed by selecting the
                most probable base at each position.
        n_threads: Number of threads to use.
    """

    sorted_als = pysam.AlignmentFile(sorted_fn, check_sq=False)

    cellBC_UMIs = set()
    max_read_length = 0
    for al in sorted_als:
        cellBC_UMIs.add((cell_key(al), UMI_key(al)))
        max_read_length = max(max_read_length, al.query_length)

    # Warn when max_hq_mismatches exceeds half the maximum read length.
    if max_read_length and max_hq_mismatches / max_read_length > 0.5:
        warnings.warn(
            "Provided `max_hq_mismatches` exceeds half of the maximum read "
            "length. Most reads will be collapsed into a single consensus "
            "sequence.",
            PreprocessWarning,
        )

    # Re-open the AlignmentFile: iterating over it in the loop above exhausted
    # its alignment iterator.
    sorted_als = pysam.AlignmentFile(sorted_fn, check_sq=False)
    cell_groups = utilities.group_by(sorted_als, cell_key)

    # Helper function so that we can use joblib to parallelize the computation
    def cluster_group(cell_BC, UMI, UMI_group, header_text):
        header = pysam.AlignmentHeader.from_text(header_text)
        UMI_group = [
            pysam.AlignedSegment.fromstring(s, header) for s in UMI_group
        ]
        if method == "cutoff":
            clusters = form_clusters(UMI_group, max_read_length,
                                     max_hq_mismatches)
        elif method == "likelihood":
            clusters = form_clusters_likelihood(UMI_group,
                                                proportion=max_hq_mismatches /
                                                max_read_length)
        else:
            raise PreprocessError(
                f"Unknown method to form UMI clusters: {method}")
        clusters = sorted(clusters,
                          key=lambda c: c.get_tag(NUM_READS_TAG),
                          reverse=True)

        for i, cluster in enumerate(clusters):
            cluster.set_tag(CELL_BC_TAG, cell_BC, "Z")
            cluster.set_tag(UMI_TAG, UMI, "Z")
            cluster.set_tag(CLUSTER_ID_TAG, str(i), "Z")

        biggest = clusters[0]
        rest = clusters[1:]

        not_collapsed = []

        for other in rest:
            if other.get_tag(NUM_READS_TAG) == biggest.get_tag(NUM_READS_TAG):
                not_collapsed.append(other)
            else:
                indels, hq_mismatches = align_clusters(biggest, other)

                if indels <= max_indels and hq_mismatches <= max_hq_mismatches:
                    biggest = merge_annotated_clusters(biggest, other)
                else:
                    not_collapsed.append(other)

        clusters = []
        for cluster in [biggest] + not_collapsed:
            annotation = cluster_Annotation(
                cell_BC=cluster.get_tag(CELL_BC_TAG),
                UMI=cluster.get_tag(UMI_TAG),
                num_reads=cluster.get_tag(NUM_READS_TAG),
                cluster_id=cluster.get_tag(CLUSTER_ID_TAG),
            )

            cluster.query_name = str(annotation)
            clusters.append(cluster.to_string())
        return clusters

    # Because pysam alignments cannot be pickled, we pass them between
    # processes as strings (via to_string/fromstring).
    all_clusters = ngs.utils.ParallelWithProgress(
        n_jobs=n_threads, total=len(cellBC_UMIs), desc="Collapsing UMIs")(
            delayed(cluster_group)(
                cell_BC, UMI, [aln.to_string()
                               for aln in UMI_group], str(sorted_als.header))
            for cell_BC, cell_group in cell_groups
            for UMI, UMI_group in utilities.group_by(cell_group, UMI_key))

    with pysam.AlignmentFile(collapsed_fn,
                             "wb",
                             header=empty_header,
                             threads=n_threads) as collapsed_fh:
        for clusters in progress(all_clusters, desc="Writing collapsed UMIs"):
            for cluster in clusters:
                collapsed_fh.write(
                    pysam.AlignedSegment.fromstring(cluster, empty_header))
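A call sketch for this parallel variant; the file names are hypothetical:

form_collapsed_clusters(
    'sample.sorted.bam',
    'sample.collapsed.bam',
    max_hq_mismatches=3,
    max_indels=2,
    method='likelihood',  # use quality-score error probabilities
    n_threads=4,
)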