Example #1
def main(dat,
         gzipped,
         flags,
         flag_filter,
         min_quality,
         bin_size,
         file=stdout,
         **kwargs):
    """Dispatch data to subroutines"""
    samfilters = [flags, flag_filter, min_quality]
    kmerscans = [
        load_kmerscan(fn, gzipped, samfilters, bin_size) for fn in dat
    ]
    entropies = concat(
        calculate_entropies(bdf) for bdf in progressbar(
            chain(*(ks.values() for ks in kmerscans)),
            desc="Calculating entropies",
            unit="arm",
            total=sum(len(ks) for ks in kmerscans),
        ))
    quantiles = {
        q: weighted_quantile(
            entropies["#entropy"],
            entropies["coverage"] - 1,
            q / 100,
        )
        for q in progressbar(range(5, 101, 5), desc="Calculating quantiles")
    }
    print("#" + ",".join(f"q{k}={v}" for k, v in quantiles.items()), file=file)
    entropies.to_csv(file, sep="\t", index=False)
Example #2
def filter_and_read_tsv(dat, gzipped, integer_samfilters):
    """If filters supplied, subset DAT first, then read with pandas"""
    number_retained = 0
    if gzipped:
        opener = gzopen
    else:
        opener = open
    with opener(dat, mode="rt") as dat_handle:
        with TemporaryDirectory() as tempdir:
            datflt_name = path.join(tempdir, "dat.gz")
            with gzopen(datflt_name, mode="wt") as datflt:
                decorated_line_iterator = progressbar(
                    dat_handle,
                    desc="Filtering",
                    unit=" lines",
                )
                for line in decorated_line_iterator:
                    if line[0] == "#":
                        print(line, end="", file=datflt)
                    else:
                        fields = line.split("\t")
                        line_passes_filter = entry_filters_ok(
                            int(fields[1]),
                            int(fields[4]),
                            integer_samfilters,
                        )
                        if line_passes_filter:
                            number_retained += 1
                            print(line, end="", file=datflt)
                print("Kept {} records".format(number_retained), file=stderr)
            print("Loading DAT...", file=stderr, flush=True)
            return read_csv(datflt_name, sep="\t", escapechar="#")
Example #3
def parse_bam_with_ambiguity(bam, ecxfd, max_read_length, min_map_overlap,
                             targets, samfilters):
    """Parse BAM file, select overhanging reads, possibly mapping to more than one arm"""
    with AlignmentFile(bam) as bam_data:
        reflens = dict(zip(bam_data.references, bam_data.lengths))
        bam_header_string = str(bam_data.header).rstrip("\n")
        decorated_bam_iterator = progressbar(
            ecxfd,
            total=len(ecxfd),
            desc="Pulling",
            unit="chromosome",
        )
        entries = []
        for chrom in decorated_bam_iterator:
            bam_chunk = get_bam_chunk(
                bam_data,
                chrom,
                ecxfd,
                reflens,
                max_read_length,
            )
            entries.extend(
                filter_entries(
                    bam_chunk,
                    ecxfd,
                    targets,
                    samfilters,
                    min_map_overlap,
                ),
            )
    return bam_header_string, entries
Example #4
def filter_bam(alignment, samfilters, desc=None):
    """Wrap alignment iterator with a flag and quality filter"""
    integer_samfilters = list(map(interpret_flags, samfilters))
    filtered_iterator = (
        entry for entry in alignment
        if entry_filters_ok(entry.flag, entry.mapq, integer_samfilters))
    if desc is None:
        return filtered_iterator
    else:
        return progressbar(filtered_iterator, desc=desc, unit="read")
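A minimal usage sketch for filter_bam, assuming pysam's AlignmentFile as the read source; the BAM path, flag values, and MAPQ threshold are placeholders, and interpret_flags() is assumed to pass integer values through unchanged.

from pysam import AlignmentFile

# Assumed filters: require flag 0x1 (paired), exclude flags in 0xF00, keep MAPQ >= 20
with AlignmentFile("reads.bam") as alignment:
    for entry in filter_bam(alignment, [0x1, 0xF00, 20], desc="Filtering"):
        print(entry.query_name)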
Example #5
def load_kmerscan(dat,
                  gzipped,
                  samfilters,
                  bin_size=None,
                  no_align=False,
                  each_once=True):
    """Load densities from dat file, split into dataframes per chromosome"""
    integer_samfilters = list(map(interpret_flags, samfilters))
    if not any(integer_samfilters):  # all zero / None
        print("Loading DAT...", file=stderr, flush=True)
        raw_densities = read_csv(dat, sep="\t", escapechar="#")
    else:
        raw_densities = filter_and_read_tsv(dat, gzipped, integer_samfilters)
    if len(raw_densities) == 0:
        raise EmptyKmerscanError
    if not are_motifs_consistent(raw_densities):
        raise NotImplementedError(KMERSCANNER_INCONSISTENT_NUMBER_OF_MOTIFS)
    bin_size_data = raw_densities.columns[-1]
    raw_densities.rename(columns={bin_size_data: "density"}, inplace=True)
    if bin_size is None:
        bin_size_matcher = search(r'[0-9]+$', bin_size_data)
        if bin_size_matcher:
            bin_size = int(bin_size_matcher.group())
        else:
            raise ValueError("No bin size in DAT, user must specify")
    if each_once:
        count_commas = lambda d: d.count(",") + 1
        raw_densities["length"] = raw_densities["density"].apply(count_commas)
        groups = raw_densities[["name", "motif", "length"]].groupby(
            ["name", "motif"],
            as_index=False,
        ).max()
        raw_densities = merge(groups, raw_densities).drop(columns="length")
    if no_align:
        raw_densities["chrom"] = "None"
    chromosome_iterator = progressbar(
        raw_densities["chrom"].drop_duplicates(),
        desc="Interpreting data",
        unit="chromosome",
    )
    return {
        chrom: get_binned_density_dataframe(
            raw_densities,
            chrom,
            bin_size,
            no_align,
        )
        for chrom in chromosome_iterator
    }
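A hedged sketch of calling load_kmerscan on a gzipped DAT file; the file name is a placeholder, and passing all-zero samfilters is assumed to disable SAM-based filtering so that the DAT is read directly with pandas (the bin size is then parsed from the last column header).

# placeholder file name; bin_size is left as None and recovered from the DAT header
kmerscan = load_kmerscan("kmerscan.dat.gz", gzipped=True, samfilters=[0, 0, 0])
for chrom, binned_density_dataframe in kmerscan.items():
    print(chrom, len(binned_density_dataframe))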
Example #6
def analyze_repeats(full_report,
                    collapse_reverse_complement=False,
                    adj="bonferroni"):
    """Analyze repeat enrichment for multiple lengths and apply multiple testing adjustment"""
    candidates = concat([
        get_motifs_fisher(
            full_report[full_report["length"] == length],
            collapse_reverse_complement=collapse_reverse_complement,
        ) for length in progressbar(
            unique(full_report["length"].values),
            unit="k",
            desc="Calculating enrichment",
        )
    ])
    candidates["p_adjusted"] = multipletests(candidates["p"], method=adj)[1]
    return candidates[["motif", "length", "count", "p", "p_adjusted"]]
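The enrichment report can then be thresholded on the adjusted p-values; a sketch, assuming full_report is the concatenated per-k report produced by find_repeats (Example #11) and using an illustrative 0.05 cutoff.

# 0.05 is an illustrative significance cutoff, not one mandated by the listing
candidates = analyze_repeats(full_report, collapse_reverse_complement=True)
significant = candidates[candidates["p_adjusted"] < 0.05]
print(significant.sort_values(by="p_adjusted").to_string(index=False))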
Example #7
def make_decorated_densities_iterator(densities, chroms_to_plot=None):
    """Order chromosomes and wrap with progress bar"""
    if chroms_to_plot:
        sorted_chromosomes = natsorted_chromosomes(
            set(densities.keys()) | set(chroms_to_plot.split(",")))
    else:
        sorted_chromosomes = natsorted_chromosomes(densities.keys())
    sorted_densities_iterator = ((chrom, densities.get(chrom))
                                 for chrom in sorted_chromosomes)
    decorated_densities_iterator = progressbar(
        sorted_densities_iterator,
        total=len(sorted_chromosomes),
        desc="Plotting",
        unit="chromosome",
    )
    return decorated_densities_iterator, len(sorted_chromosomes)
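A usage sketch, assuming densities is the per-chromosome dictionary returned by load_kmerscan (Example #5); chromosomes requested via chroms_to_plot but absent from densities come through the iterator as None.

decorated_iterator, n_chromosomes = make_decorated_densities_iterator(
    densities, chroms_to_plot="chr1,chr2,chrX",
)
for chrom, binned_density_dataframe in decorated_iterator:
    if binned_density_dataframe is None:  # requested but missing from densities
        continue
    print(chrom, len(binned_density_dataframe))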
Example #8
def calculate_chromosome_lds(chrom, entries, jobs):
    """Calculate pairwise relative levenshtein distances between all reads mapping to one chromosome"""
    with ThreadPoolExecutor(max_workers=jobs) as pool:
        workers = [
            pool.submit(
                get_relative_read_ld,
                aname, bname, entries[aname], entries[bname],
            )
            for aname, bname in combinations_with_replacement(
                sorted(entries.keys()), r=2,
            )
        ]
        iterator = progressbar(
            as_completed(workers), desc=chrom, unit="pair", total=len(workers),
        )
        for worker in iterator:
            yield worker.result()
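A sketch of consuming the generator; entries is assumed to map read names to whatever sequence representation get_relative_read_ld expects, and the chromosome name and thread count are placeholders.

# one result is yielded per read pair (combinations with replacement)
pairwise_lds = list(calculate_chromosome_lds("chr1", entries, jobs=4))
print(len(pairwise_lds), "read pairs processed")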
Example #9
def explain_report(filtered_analysis, sequencefile, min_repeats, jobs=1):
    """Calculate fraction of reads explainable by each motif"""
    explained_analysis = filtered_analysis.copy()
    explained_analysis["bases_explained"], total_bases = 0.0, 0
    with FastxFile(sequencefile) as fastx:

        def get_number_of_masked_positions(sequence, motifs):
            n_masked_positions_per_motif = {}
            for motif in motifs:
                positions_to_mask = set()
                motifs_pattern = get_circular_pattern(
                    motif,
                    repeats=min_repeats,
                )
                matcher = motifs_pattern.finditer(sequence, overlapped=True)
                for match in matcher:
                    positions_to_mask |= set(range(match.start(), match.end()))
                n_masked_positions_per_motif[motif] = len(positions_to_mask)
            return n_masked_positions_per_motif, len(sequence)

        with ThreadPoolExecutor(max_workers=jobs) as pool:
            workers = [
                pool.submit(
                    get_number_of_masked_positions,
                    entry.sequence,
                    set(filtered_analysis["motif"]),
                ) for entry in fastx
            ]
            iterator = progressbar(
                as_completed(workers),
                total=len(workers),
                desc="Calculating fractions",
                unit="read",
            )
            for worker in iterator:
                n_masked_positions_per_motif, total_seq_bases = worker.result()
                for motif, n_pos in n_masked_positions_per_motif.items():
                    indexer = (
                        explained_analysis["motif"] == motif,
                        "bases_explained",
                    )
                    explained_analysis.loc[indexer] += n_pos
                total_bases += total_seq_bases
    return explained_analysis, total_bases
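Downstream, the per-motif base counts can be normalized by the returned total; the input names, the FASTQ path, and the normalization step below are assumptions for illustration.

# filtered_analysis must carry a "motif" column; file name and parameters are placeholders
explained_analysis, total_bases = explain_report(
    filtered_analysis, "reads.fastq.gz", min_repeats=2, jobs=4,
)
# assumed normalization: fraction of all sequenced bases masked by each motif
explained_analysis["fraction_explained"] = (
    explained_analysis["bases_explained"] / total_bases
)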
Example #10
def main(bam,
         index,
         flags,
         flag_filter,
         min_quality,
         target,
         file=stdout,
         **kwargs):
    """Interpret arguments and dispatch data to subroutines"""
    if target == "cigar":
        chopper, integer_target = cigar_chopper, None
    else:
        chopper, integer_target = relative_chopper, interpret_flags(target)
    ecx = load_index(index)
    with AlignmentFile(bam) as alignment:
        print(str(alignment.header).rstrip("\n"), file=file)
        n_skipped = 0
        bam_iterator = progressbar(
            filter_bam(alignment, [flags, flag_filter, min_quality]),
            desc="Chopping",
            unit="read",
        )
        with errstate(invalid="ignore"):
            for entry in bam_iterator:
                if entry.query_sequence:
                    chopped_entry, error = chopper(
                        entry,
                        ecx,
                        integer_target,
                    )
                    if chopped_entry.query_sequence:
                        print(chopped_entry.to_string(), file=file)
                    else:
                        n_skipped += 1
    if n_skipped:
        msg_mask = "Skipped {} reads to be safe (unsure where to chop)"
        print(msg_mask.format(n_skipped), file=stderr)
    warning = [
        "WARNING: Read mapping positions were adjusted and retained;",
        "         this is needed to comply with the SAM spec.",
        "         Do not use these positions for analyses outside of edgeCase!",
    ]
    print("\n".join(warning), file=stderr)
    return 0
Example #11
def find_repeats(sequencefile, min_k, max_k, min_repeats, jellyfish,
                 jellyfish_hash_size, collapse_reverse_complement, jobs,
                 tempdir):
    """Find all repeats in sequencefile"""
    per_k_reports = []
    k_iterator = progressbar(
        range(min_k, max_k + 1),
        desc="Sweeping lengths",
        unit="k",
    )
    for k in k_iterator:
        db = path.join(tempdir, "{}.db".format(k))
        jellyfish_count_options = [
            jellyfish, "count",
            "-t", str(jobs),
            "-s", jellyfish_hash_size,
            "-L", "0",
            "-m", str(k * min_repeats),
        ]
        if collapse_reverse_complement:
            jellyfish_count_options += ["-C"]
        check_output(jellyfish_count_options + ["-o", db, sequencefile])
        tsv = path.join(tempdir, "{}.tsv".format(k))
        check_output([
            jellyfish,
            "dump",
            "-c",
            "-t",
            "-L",
            "0",
            "-o",
            tsv,
            db,
        ])
        k_report = read_csv(tsv, sep="\t", names=["kmer", "count"])
        if len(k_report) == 0:
            return None
        repeats_indexer = k_report["kmer"].apply(
            lambda kmer: kmer[:k] * min_repeats == kmer)
        k_report = k_report[repeats_indexer]
        k_report["kmer"] = k_report["kmer"].apply(lambda kmer: kmer[:k])
        k_report["length"] = k
        per_k_reports.append(k_report)
    return concat(per_k_reports, axis=0)
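A hedged invocation sketch; it assumes the jellyfish binary is available on PATH and uses placeholder values for the input file, k range, hash size, and thread count.

from tempfile import TemporaryDirectory

with TemporaryDirectory() as tempdir:
    full_report = find_repeats(
        "reads.fa", min_k=4, max_k=16, min_repeats=2,
        jellyfish="jellyfish", jellyfish_hash_size="2G",
        collapse_reverse_complement=True, jobs=4, tempdir=tempdir,
    )
if full_report is None:  # some k produced an empty jellyfish dump
    print("No repeats found")
else:
    print(full_report.groupby("length").size())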
Example #12
def pattern_scanner(entry_iterator, fmt, samfilters, motif_patterns, bin_size,
                    num_reads, jobs):
    """Calculate density of pattern hits in a rolling window along each read"""
    if fmt == "sam":
        filtered_iterator = filter_bam(entry_iterator, samfilters)
    else:
        filtered_iterator = entry_iterator
    simple_entry_iterator = (SimpleNamespace(
        query_name=getattr(entry, "query_name", getattr(entry, "name", None)),
        flag=getattr(entry, "flag", None),
        reference_name=getattr(entry, "reference_name", None),
        reference_start=getattr(entry, "reference_start", None),
        mapping_quality=getattr(entry, "mapping_quality", None),
        query_sequence=getattr(entry, "query_sequence",
                               getattr(entry, "sequence", None)),
        cigarstring=getattr(entry, "cigarstring", ""),
    ) for entry in filtered_iterator)
    with Pool(jobs) as pool:
        # imap_unordered() only accepts single-argument functions:
        density_calculator = partial(
            calculate_density_of_patterns,
            motif_patterns=motif_patterns,
            bin_size=bin_size,
        )
        # lazy multiprocess evaluation:
        read_density_iterator = pool.imap_unordered(
            density_calculator,
            simple_entry_iterator,
        )
        # iterate over the (entry.query_name, density_array) pairs returned by calculate_density_of_patterns():
        desc = "Calculating density"
        yield from progressbar(
            read_density_iterator,
            desc=desc,
            unit="read",
            total=num_reads,
        )
Example #13
def get_unambiguous_entries(entries, ecx):
    """Subset candidate entries to those that map unambiguously"""
    entry_dispatcher, valid_qnames = make_entry_dispatchers(entries, ecx)
    chromosomes = set(ecx["chromosome"].drop_duplicates())
    for qname in progressbar(valid_qnames, desc="Filtering", unit="read"):
        entry_mappings = entry_dispatcher[entry_dispatcher["qname"] == qname]
        entry_mapped_to_main = entry_mappings["rname"].isin(chromosomes)
        if entry_mapped_to_main.any():  # prefer canonical chromosomes
            target_entry_mappings = entry_mappings[entry_mapped_to_main]
        else:  # fall back to forks / subtelomeres
            target_entry_mappings = entry_mappings
        rnames = target_entry_mappings["rname"].drop_duplicates()
        primes = target_entry_mappings["prime"].drop_duplicates()
        if (len(rnames) == 1) and (len(primes) == 1):
            if primes.iloc[0] == 3:  # find innermost mappos on same q arm
                target_mappos = target_entry_mappings["mappos"].min()
            else:  # find innermost mappos on same p arm
                target_mappos = target_entry_mappings["mappos"].max()
            entry_candidates = target_entry_mappings.loc[
                target_entry_mappings["mappos"] == target_mappos, "entry", ]
            if len(entry_candidates) == 1:
                entry = entry_candidates.iloc[0]
                if entry.flag & 0x800 == 0:  # non-supplementary
                    yield entry
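A usage sketch, assuming ecx is the index DataFrame returned by load_index (Example #10), entries is the candidate list produced by parse_bam_with_ambiguity (Example #3), and the yielded entries behave like pysam alignment records.

for entry in get_unambiguous_entries(entries, ecx):
    print(entry.query_name, entry.reference_name, entry.reference_start)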