Example #1
def main(args, outs):
    '''Find cut sites on a per chromosome basis and write out a bedgraph'''
    if args.fragments is None:
        outs.count_dict = None
        outs.cut_sites = None
        return

    ctg_mgr = ReferenceManager(args.reference_path)
    contig_len = ctg_mgr.get_contig_lengths()
    chrom_len = contig_len[args.contig]
    half_window = WINDOW_SIZE // 2
    Cuts = np.zeros(chrom_len, dtype='int32')

    # find windowed cut sites
    for _, start, stop, _, _ in parsed_fragments_from_contig(contig=args.contig, filename=args.fragments, index=args.fragments_index):
        Cuts[max(0, start - half_window): min(start + half_window + 1, chrom_len)] += 1
        Cuts[max(0, stop - half_window): min(stop + half_window + 1, chrom_len)] += 1

    # get count dict
    count_dict = Counter(v for v in Cuts if v > 0)
    with open(outs.count_dict, 'w') as count_dict_out:
        pickle.dump(count_dict, count_dict_out)

    # write bedgraph of windowed cut sites
    if len(count_dict):
        write_chrom_bedgraph(args.contig, chrom_len, Cuts, outs.cut_sites)
    else:
        outs.cut_sites = None
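A minimal, self-contained sketch of the windowed cut-site counting above, on a toy contig. The WINDOW_SIZE value and the fragment coordinates are made up for illustration; the real stage takes them from the pipeline's constants and the fragments file.

import numpy as np
from collections import Counter

WINDOW_SIZE = 5  # hypothetical value, for illustration only
half_window = WINDOW_SIZE // 2
chrom_len = 20
cuts = np.zeros(chrom_len, dtype='int32')

# each fragment contributes two cut sites: a window around its start and one around its stop
for start, stop in [(4, 9), (6, 15)]:
    cuts[max(0, start - half_window): min(start + half_window + 1, chrom_len)] += 1
    cuts[max(0, stop - half_window): min(stop + half_window + 1, chrom_len)] += 1

count_dict = Counter(v for v in cuts if v > 0)
print(count_dict)  # histogram of windowed cut-site depth over nonzero bases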
Example #2
def split(args):
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)
    contig_len = ctg_mgr.get_contig_lengths()
    BYTES_PER_INT32_WITH_SAFETY = 5

    chunks = []
    for contig in all_contigs:
        chunks.append({'contig': contig,
                       # float division so np.ceil rounds up rather than flooring first under Python 2
                       '__mem_gb': int(np.ceil(BYTES_PER_INT32_WITH_SAFETY * contig_len[contig] / (1024.0 ** 3)))})

    return {'chunks': chunks, 'join': {'__mem_gb': 5}}
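For scale: with the 5-bytes-per-int32 safety factor, a human-sized contig such as GRCh38 chr1 (about 249 Mbp, used here purely for illustration) needs roughly 1.16 GiB, which the ceiling rounds up to a 2 GB chunk.

import numpy as np

chr1_len = 248956422  # GRCh38 chr1 length, for illustration
print(int(np.ceil(5 * chr1_len / (1024.0 ** 3))))  # -> 2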
Example #3
def split(args):
    """split into a chunk for each library in aggr csv, and define a unique gem group"""
    aggr_df = pd.read_csv(args.aggr_csv, sep=',')
    nchunks = len(aggr_df)

    ctg_mgr = ReferenceManager(args.reference_path)
    max_contig_len = max(ctg_mgr.get_contig_lengths().values())
    BYTES_PER_INT32_WITH_SAFETY = 5
    # float division so np.ceil rounds up rather than flooring first under Python 2
    mem_gb = 2 * int(np.ceil(BYTES_PER_INT32_WITH_SAFETY * max_contig_len / (1024.0 ** 3)))

    return {
        'chunks': [{
            'n': group,
            '__mem_gb': mem_gb,
            '__vmem_gb': mem_gb + 6
        } for group in range(nchunks)],
        'join': {
            '__mem_gb': 12
        }
    }
Example #4
def main(args, outs):
    ref = ReferenceManager(args.reference_path)
    contig_len = ref.get_contig_lengths()[args.contig]

    with open(args.peaks, "r") as infile:
        peak_regions = get_target_regions(infile)

    fragment_counts = Counter()
    targeted_counts = Counter()

    cumulative_fragment_length = {padding: Counter() for padding in PADDING_VALUES}
    covered_bases = {padding: {} for padding in PADDING_VALUES}

    for contig, start, stop, barcode, _ in parsed_fragments_from_contig(
        args.contig, args.fragments, args.fragments_index
    ):
        fragment_counts[barcode] += 1
        if fragment_overlaps_target(contig, start, stop, peak_regions):
            targeted_counts[barcode] += 1

        for padding in PADDING_VALUES:
            adj_start = max(0, start - padding)
            adj_stop = min(contig_len, stop + padding)

            cumulative_fragment_length[padding][barcode] += adj_stop - adj_start

            if barcode not in covered_bases[padding]:
                # (total covered bases so far, start/stop of the currently active region)
                covered_bases[padding][barcode] = 0, None, None

            # fragments are position-sorted, so each barcode's padded intervals arrive
            # in start order and can be merged online into a running union
            current_covered, active_start, active_stop = covered_bases[padding][barcode]
            if active_start is None:
                active_start = adj_start
                active_stop = adj_stop
            else:
                if adj_start < active_stop:
                    # overlaps the active region: extend it
                    active_stop = max(adj_stop, active_stop)
                else:
                    # gap: flush the active region and start a new one
                    current_covered += active_stop - active_start
                    active_start = adj_start
                    active_stop = adj_stop
            covered_bases[padding][barcode] = current_covered, active_start, active_stop

    # flush each barcode's still-active region into the final totals
    final_covered = {padding: Counter() for padding in PADDING_VALUES}
    for padding in PADDING_VALUES:
        for barcode in covered_bases[padding]:
            current_covered, active_start, active_stop = covered_bases[padding][barcode]
            if active_start is None:
                final_covered[padding][barcode] = current_covered
            else:
                final_covered[padding][barcode] = current_covered + active_stop - active_start

    with open(outs.fragment_counts, "w") as outfile:
        pickle.dump(fragment_counts, outfile)
    with open(outs.targeted_counts, "w") as outfile:
        pickle.dump(targeted_counts, outfile)
    with open(outs.fragment_lengths, "w") as outfile:
        pickle.dump(cumulative_fragment_length, outfile)
    with open(outs.covered_bases, "w") as outfile:
        pickle.dump(final_covered, outfile)

    outs.peak_coverage = 0
    if args.contig in peak_regions:
        outs.peak_coverage = count_covered_bases(
            peak_regions[args.contig].starts,
            peak_regions[args.contig].ends,
            contig_len,  # contig length already fetched via get_contig_lengths() above
            padding=DISTANCE,
        )
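The covered-bases bookkeeping above is an online interval union: because the fragments file is position-sorted, each barcode's padded intervals arrive in start order and can be merged on the fly, keeping just one active region per barcode. A stripped-down sketch of the same technique:

def union_length(intervals):
    """Total bases covered by start-sorted, half-open intervals."""
    covered, active_start, active_stop = 0, None, None
    for start, stop in intervals:
        if active_start is None:
            active_start, active_stop = start, stop
        elif start < active_stop:
            active_stop = max(active_stop, stop)  # overlap: extend the active region
        else:
            covered += active_stop - active_start  # gap: flush the active region
            active_start, active_stop = start, stop
    if active_start is not None:
        covered += active_stop - active_start
    return covered

print(union_length([(0, 10), (5, 15), (20, 25)]))  # (0,15) covers 15, (20,25) covers 5 -> 20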
Example #5
def join(args, outs, chunk_defs, chunk_outs):
    # Sample ID / pipestance name
    check_sample_id(args.sample_id)

    # force_cells
    check_force_cells(args.force_cells, ulimit=10000000)  # allow arbitrarily large limit for reanalyzer

    # Reference: directory structure and timestamps
    ok, msg = check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # formatting
    check_reference_format(args.reference_path)
    contig_manager = ReferenceManager(args.reference_path)

    # peaks: check the format and ensure regions are non-overlapping
    if args.peaks is None:
        martian.exit("peaks file not provided")
    exists_and_readable(args.peaks, "peaks")
    bed_format_checker(args.peaks, contig_manager.fasta_index)
    contain_three_columns(args.peaks)
    if is_overlapping(args.peaks):
        martian.exit("{} contains overlapping peak regions".format(args.peaks))

    # check parameters file
    if args.parameters is not None:
        if not os.path.exists(args.parameters):
            martian.exit("{} does not exist".format(args.parameters))

    # fragments checks
    whitelist_barcodes = load_barcode_whitelist(args.barcode_whitelist)
    species_list = contig_manager.list_species()
    observed_gem_groups = set()
    observed_species = set()
    if args.fragments is None:
        martian.exit("fragments file not provided")
    exists_and_readable(args.fragments, "fragments")
    contig_lens = contig_manager.get_contig_lengths()
    # check bounds and matching contigs in reference and species
    for chrom, start, stop, bc, _ in open_fragment_file(args.fragments):
        spec = chrom.split("_")
        observed_species.add(spec[0] if spec[0] != chrom else "")
        barcode, gem_group = bc.split("-")
        observed_gem_groups.add(gem_group)
        if args.check_executables:  # only run these per-fragment checks in non-local mode
            if barcode not in whitelist_barcodes:
                martian.exit("{} is not a valid whitelist barcode".format(barcode))
            if chrom not in contig_lens:
                martian.exit("contig {} not present in reference".format(chrom))
            if stop > contig_lens[chrom]:
                martian.exit("fragment {}:{}-{} boundaries exceed contig size ({} bp)".format(chrom, start, stop, contig_lens[chrom]))
    # ensure fragments are on the correct reference
    for species in observed_species:
        if species not in species_list:
            martian.exit("{} contains fragments mapped to species not recognized in the reference".format(args.fragments))
    if len(observed_gem_groups) > 1:
        martian.log_info("multiple gem groups present in {}, likely generated in a previous aggregation run".format(args.fragments))

    # fragments index is synced with fragments
    if args.fragments_index is None:
        martian.exit("fragments index file not provided")
    if not os.path.exists(args.fragments_index):
        martian.exit("{} does not exist".format(args.fragments_index))
    try:
        all_contigs = contig_manager.primary_contigs(allow_sex_chromosomes=True)
        for contig in all_contigs:
            en = 0
            for chrom, start, end, bc, dups in parsed_fragments_from_contig(contig, args.fragments, index=args.fragments_index):
                if en >= FRAGMENTS_SCAN_SIZE:
                    break
                en += 1
    except Exception:
        martian.exit("fragments index is not in sync with the fragments file")

    # aggr csv checks
    if args.aggregation_csv is not None:
        check_aggr_csv(args.aggregation_csv, args.reference_path, cursory=True)

    # cell barcode checks
    if args.cell_barcodes is not None:
        if not os.path.exists(args.cell_barcodes):
            martian.exit("{} does not exist".format(args.cell_barcodes))
        check_singlecell_format(args.cell_barcodes, species_list, whitelist_barcodes)

    # Open file handles limit
    if args.check_executables:
        check_filehandle_limit()

    martian.log_info(tk_preflight.record_package_versions())
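The species bookkeeping above relies on the contig-naming convention for multi-species (barnyard) references, where contigs carry the species as a prefix, e.g. GRCh38_chr1; a plain single-species contig name maps to the empty string. For illustration:

for chrom in ["GRCh38_chr1", "mm10_chr1", "chr1"]:
    spec = chrom.split("_")
    print("{} -> {!r}".format(chrom, spec[0] if spec[0] != chrom else ""))
# GRCh38_chr1 -> 'GRCh38'
# mm10_chr1 -> 'mm10'
# chr1 -> ''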
Example #6
def main(args, outs):
    """Compute the depth and signal per library"""
    # read this library's row from the aggr csv
    lib_id = args.n + 1
    aggr_df = pd.read_csv(args.aggr_csv, sep=',')
    library_info = {lib_id: {}}
    for label in aggr_df.columns.values.tolist():
        library_info[lib_id][label] = str(aggr_df.iloc[args.n][label])

    # if no normalization, don't waste compute
    if args.normalization is None:
        with open(outs.library_info, 'w') as f:
            pickle.dump(library_info, f)
        return

    # set ref properties
    ctg_mgr = ReferenceManager(args.reference_path)
    contig_lens = ctg_mgr.get_contig_lengths()
    max_contig_len = max(contig_lens.values())
    curr_chrom = None
    count_dict = Counter()
    chrom_len = 1
    half_window = WINDOW_SIZE // 2

    # traverse fragments file and count stats
    fragments_f = aggr_df.iloc[args.n]['fragments']
    Cuts = None
    special_normalization = (args.normalization
                             in ["signal_mean", "signal_noise_threshold"])
    if special_normalization:
        Cuts = np.zeros(max_contig_len, dtype='int32')
    for chrom, start, stop, bc, dups in open_fragment_file(
            filename=fragments_f):
        if chrom != curr_chrom:
            curr_chrom = chrom
            if chrom not in contig_lens:
                martian.exit(
                    "fragment {}:{}-{} in {} is mapped to a contig not in the reference"
                    .format(chrom, start, stop, fragments_f))
            if special_normalization:
                # flush the previous chromosome's counts; only traverse its length
                count_dict += Counter(v for v in Cuts[:chrom_len] if v > 0)
                Cuts[:] = 0  # reset and reuse the buffer
                chrom_len = contig_lens[chrom]
        if special_normalization:
            Cuts[max(0, start - half_window): min(start + half_window + 1, chrom_len)] += 1
            Cuts[max(0, stop - half_window): min(stop + half_window + 1, chrom_len)] += 1
    if special_normalization:
        # flush the final chromosome's counts
        count_dict += Counter(v for v in Cuts[:chrom_len] if v > 0)

    scdf = pd.read_csv(library_info[lib_id]['cells'], sep=',')
    cell_mask = np.full(len(scdf), False)
    for species in ctg_mgr.list_species():
        cell_mask |= scdf['is_{}_cell_barcode'.format(species)] == 1
    cells = scdf[cell_mask]
    library_info[lib_id]['total_fragments_per_cell'] = np.median(
        cells['total'] if 'total' in cells.columns
        else cells['passed_filters'] + cells['duplicate'])
    library_info[lib_id]['unique_fragments_per_cell'] = np.median(cells['passed_filters'])

    # fit the peak-calling model to the count dict to estimate the signal component
    if special_normalization:
        threshold, params = estimate_final_threshold(count_dict, PEAK_ODDS_RATIO)
        library_info[lib_id]['original_threshold'] = threshold
        library_info[lib_id]['signal_mean'] = 1 / params.p_signal

    # dump library info
    with open(outs.library_info, 'w') as f:
        pickle.dump(library_info, f)
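The per-cell depth computed above falls back to passed_filters + duplicate when the singlecell csv has no total column. A toy illustration of the same fallback, with made-up numbers:

import numpy as np
import pandas as pd

scdf = pd.DataFrame({'passed_filters': [100, 200, 300],
                     'duplicate': [10, 20, 30]})
total = (scdf['total'] if 'total' in scdf.columns
         else scdf['passed_filters'] + scdf['duplicate'])
print(np.median(total))  # -> 220.0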

def check_aggr_csv(aggr_csv, reference_path, cursory=False):
    """Check that the aggr csv has the correct columns, then run progressively stronger
    checks on duplicates and file formatting. The stronger checks are enabled by default;
    pass cursory=True to run only the basic minimum, for example in reanalyzer."""
    contig_manager = ReferenceManager(reference_path)

    # aggr_csv checks
    exists_and_readable(aggr_csv, "aggr_csv")

    if cursory:
        nlibs, library_info, msg = parse_aggr_csv(aggr_csv, whitelist=["library_id"], blacklist=None)
    else:
        nlibs, library_info, msg = parse_aggr_csv(aggr_csv)
    if msg is not None:
        martian.exit(msg)

    # At least one library should be there
    if nlibs == 0:
        martian.exit("aggregation csv does not include any library. Provide at least two libraries.")

    if cursory:
        return
    # Enable aggr(count1) to run
    if nlibs == 1:
        martian.log_info("Aggregator should be run on more than one library")

    # avoid aggr of duplicate files (assessed by filename).
    species_list = contig_manager.list_species()
    for aggr_key in library_info[1]:  # at least one library is present
        files = set()
        for lib_id in library_info:
            fname = library_info[lib_id][aggr_key]
            if fname in files:
                martian.exit("File {} already specified for a different library under {}".format(fname, aggr_key))
            files.add(fname)  # record the file so later duplicates are caught

            # singlecell.csv should contain 'barcode' and 'is_{}_cell_barcode' columns with the correct type
            if aggr_key == "cells":
                check_singlecell_format(fname, species_list, allow_multi_gem_groups=False)

            # peaks.bed need to be formatted correctly with right contigs if provided in aggr.csv
            # also check if peaks are non overlapping
            if aggr_key == "peaks":
                exists_and_readable(fname, "peaks")
                bed_format_checker(fname, contig_manager.fasta_index)
                contain_three_columns(fname)
                if is_overlapping(fname):
                    martian.exit("{} contains overlapping peak regions".format(fname))

            # checks on fragments
            if aggr_key == "fragments":
                contig_lens = contig_manager.get_contig_lengths()
                observed_gem_groups = set()
                observed_species = set()
                exists_and_readable(fname, "fragments")
                en = 0
                for chrom, start, stop, bc, _ in open_fragment_file(fname):
                    if en >= FRAGMENTS_SCAN_SIZE:
                        break
                    spec = chrom.split("_")
                    observed_species.add(spec[0] if spec[0] != chrom else "")
                    observed_gem_groups.add(bc.split("-")[1])
                    if chrom not in contig_lens:
                        martian.exit("fragment {}:{}-{} in {} is mapped to a contig not in the reference".format(chrom, start, stop, fname))
                    if stop > contig_lens[chrom]:
                        martian.exit("fragment {}:{}-{} boundaries exceed contig size ({} bp)".format(chrom, start, stop, contig_lens[chrom]))
                    en += 1
                for species in observed_species:
                    if species not in species_list:
                        martian.exit("{} contains fragments mapped to species not recognized in the reference".format(fname))
                if len(observed_gem_groups) > 1:
                    martian.exit("multiple gem groups present in {}, likely generated in a previous aggregation run".format(fname))