Example #1
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()

    # combine the duplicate summary counts
    dup_summaries = [json.load(open(out.duplicate_summary)) for out in chunk_outs]
    combined_dups = reduce(lambda x,y: tenkit.dict_utils.add_dicts(x,y,2), dup_summaries)

    diffusion_summary = json.load(open(args.diffusion_dup_summary))

    combined_dups['read_counts'] = {}
    combined_dups['read_counts']['perfect_read_count'] = args.perfect_read_count

    for k, v in diffusion_summary.items():
        combined_dups[k] = v

    # TODO: Remove null_* observed_* ?

    with open(outs.duplicate_summary, 'w') as f:
        json.dump(combined_dups, f, indent=4)

    # combine & index the chunks of the BAM
    if args.write_bam:
        tk_bam.merge(outs.output, [c.output for c in chunk_outs], args.__threads)
        tk_bam.index(outs.output)
        outs.index = outs.output + '.bai'
    else:
        outs.output = None
        outs.index = None
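
The join above folds the per-chunk duplicate summaries together with tenkit.dict_utils.add_dicts(x, y, 2). A minimal sketch of what a depth-limited dict merge of that shape might look like (the real tenkit helper may differ in detail):

def add_dicts_sketch(d1, d2, depth):
    # Recursively sum numeric leaves of two nested dicts down to `depth` levels.
    result = dict(d1)
    for key, value in d2.items():
        if key not in result:
            result[key] = value
        elif depth > 1 and isinstance(value, dict):
            result[key] = add_dicts_sketch(result[key], value, depth - 1)
        else:
            result[key] = result[key] + value
    return result

# Two chunk-level duplicate summaries combined as in the join() above.
a = {'dup_counts': {'optical': 3, 'pcr': 10}}
b = {'dup_counts': {'optical': 1, 'pcr': 4}}
assert add_dicts_sketch(a, b, 2) == {'dup_counts': {'optical': 4, 'pcr': 14}}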
Example #2
def call_haploid(haplotype, bam, locus, reference_path, variant_caller,
                 gatk_path, mem_gb):
    bam_name = "hap" + str(haplotype) + ".bam"
    haploid_bam, _ = tenkit.bam.create_bam_outfile(bam_name,
                                                   None,
                                                   None,
                                                   template=bam)
    (chrom, start, stop) = tk_io.get_locus_info(locus)
    for read in bam.fetch(chrom, start, stop):
        readhap = dict(read.tags).get('HP')
        if readhap is not None and int(readhap) == haplotype:
            haploid_bam.write(read)
    haploid_bam.close()
    tk_bam.index(bam_name)
    tmp_vcf_name = "tmp_hap" + str(haplotype) + ".vcf"
    vcf_name = "hap" + str(haplotype) + ".vcf"

    fasta_path = tk_ref.get_fasta(reference_path)
    vc.run_variant_caller(variant_caller,
                          gatk_path,
                          mem_gb,
                          fasta_path,
                          bam_name,
                          tmp_vcf_name,
                          haploid_mode=True)

    longranger.variants.canonicalize(tmp_vcf_name, vcf_name)
    tenkit.tabix.index_vcf(vcf_name)
    bam_in = tk_bam.create_bam_infile(bam_name)
    return (vcf_name + ".gz", bam_in)
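
call_haploid splits the phased BAM by the HP tag before running the variant caller in haploid mode. A self-contained sketch of just that splitting step, with illustrative file names and plain pysam calls standing in for the tenkit wrappers:

import pysam

def split_bam_by_haplotype(in_bam_path, out_prefix, chrom, start, stop):
    # Write reads tagged HP=1 and HP=2 into separate, indexed BAMs.
    in_bam = pysam.AlignmentFile(in_bam_path, 'rb')
    out_paths = {hap: '%s.hap%d.bam' % (out_prefix, hap) for hap in (1, 2)}
    out_bams = {hap: pysam.AlignmentFile(path, 'wb', template=in_bam)
                for hap, path in out_paths.items()}
    for read in in_bam.fetch(chrom, start, stop):
        hap = read.get_tag('HP') if read.has_tag('HP') else None
        if hap in (1, 2):
            out_bams[hap].write(read)
    for hap, out_bam in out_bams.items():
        out_bam.close()
        pysam.index(out_paths[hap])
    return out_paths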
Example #3
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()

    # Concatenate chunks
    if len(chunk_outs) == 1:
        subprocess.call(['mv', chunk_outs[0].phased_possorted_bam, outs.phased_possorted_bam])
    else:
        tk_bam.concatenate(outs.phased_possorted_bam, [out.phased_possorted_bam for out in chunk_outs])
    tk_bam.index(outs.phased_possorted_bam)
    outs.phased_possorted_bam_index = outs.phased_possorted_bam + ".bai"

    total_reads = 0
    phased_reads = 0
    molecule_tagged_reads = 0
    for chunk_out in chunk_outs:
        total_reads += chunk_out.total_reads
        phased_reads += chunk_out.phased_reads
        molecule_tagged_reads += chunk_out.molecule_tagged_reads

    outs.total_reads = total_reads
    outs.phased_reads = phased_reads
    outs.molecule_tagged_reads = molecule_tagged_reads

    fract_reads_phased = tk_stats.robust_divide(float(phased_reads), float(total_reads))
    fract_reads_molecule_id = tk_stats.robust_divide(float(molecule_tagged_reads), float(total_reads))

    stats = {
        "fract_reads_phased": fract_reads_phased,
        "fract_reads_molecule_id": fract_reads_molecule_id,
        }

    with open(outs.summary, 'w') as summary_file:
        json.dump(tenkit.safe_json.json_sanitize(stats), summary_file)
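
The phasing fractions above go through tk_stats.robust_divide, which presumably guards against a zero denominator when no reads reach the stage. A guess at that behaviour (the real tenkit helper may return a different sentinel):

def robust_divide_sketch(numerator, denominator):
    # Return numerator / denominator, or NaN when the denominator is zero.
    if denominator == 0:
        return float('NaN')
    return float(numerator) / float(denominator)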
Example #4
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()
    input_bams = [str(chunk.output) for chunk in chunk_outs]
    # merge and index
    args_merge = [
        'sambamba', 'merge', '-t',
        str(args.__threads), 'output_merge.bam'
    ]
    # append the chunk BAMs to the end of the merge command
    args_merge.extend(input_bams)
    subprocess.check_call(args_merge)
    os.rename('output_merge.bam', outs.output)
    os.rename('output_merge.bam.bai', outs.output + '.bai')
    tk_bam.concatenate(outs.output, input_bams)
    tk_bam.index(outs.output)
Example #5
def main(args, outs):
    """
    Given a set of barcodes and a possorted BAM, write a new BAM that only contains reads carrying those barcodes
    """
    useful_bcs = set(args.barcode_subset.split(','))

    bam_h = pysam.Samfile(args.possorted_bam)
    outf_h = pysam.Samfile(outs.subset_bam, 'wb', template=bam_h)
    for rec in bam_h:
        try:
            cb = rec.get_tag('CB')
        except KeyError:
            continue
        if cb in useful_bcs:
            outf_h.write(rec)
    outf_h.close()
    tk_bam.index(outs.subset_bam)
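
The same CB-tag filter can be driven outside Martian with plain pysam; the file names and barcode below are placeholders, not outputs of any real pipeline run:

import pysam

def subset_bam_by_barcodes(in_path, out_path, barcodes):
    # Keep only records whose CB tag is in the requested barcode set.
    wanted = set(barcodes)
    in_bam = pysam.AlignmentFile(in_path, 'rb')
    out_bam = pysam.AlignmentFile(out_path, 'wb', template=in_bam)
    for rec in in_bam:
        if rec.has_tag('CB') and rec.get_tag('CB') in wanted:
            out_bam.write(rec)
    out_bam.close()
    pysam.index(out_path)

# subset_bam_by_barcodes('possorted.bam', 'subset.bam', ['AAACCTGAGAAACCAT-1'])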
Example #6
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()

    # combine the duplicate summary counts
    dup_summaries = [
        json.load(open(out.duplicate_summary)) for out in chunk_outs
    ]
    combined_dups = reduce(lambda x, y: tenkit.dict_utils.add_dicts(x, y, 2),
                           dup_summaries)
    combined_dups['read_counts'] = {}
    combined_dups['read_counts'][
        'perfect_read_count'] = args.perfect_read_count

    with open(outs.duplicate_summary, 'w') as f:
        json.dump(combined_dups, f)

    # combine & index the chunks of the BAM
    tk_bam.concatenate(outs.output, [c.output for c in chunk_outs])
    tk_bam.index(outs.output)
    outs.index = outs.output + '.bai'
Example #7
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()
    input_bams = [str(chunk.output) for chunk in chunk_outs]
    merge(input_bams, outs.output, args.__threads)
    outs.index = outs.output + '.bai'
    tk_bam.index(outs.output)
Example #8
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()
    input_bams = [str(chunk.default) for chunk in chunk_outs]
    merge(input_bams, outs.default, args.__threads)
    tk_bam.index(outs.default)
    outs.perfect_read_count = sum([chunk.perfect_read_count for chunk in chunk_outs])
Example #9
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()
    input_bams = [str(chunk.output) for chunk in chunk_outs]
    tk_bam.concatenate(outs.output, input_bams)
    tk_bam.index(outs.output)
Example #10
def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    if len(chunk_outs) == 0:
        # No input reads
        # Create empty BAM file
        with open(outs.contig_bam, 'w') as f:
            pass
        outs.contig_bam_bai = None
        # Create empty contig FASTA
        with open(outs.contig_fasta, 'w') as f:
            pass
        outs.contig_fasta_fai = None
        # Create empty contig FASTQ
        with open(outs.contig_fastq, 'w') as f:
            pass
        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    summary_tsvs = []
    umi_summary_tsvs = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)

        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)

        summary_tsvs.append(chunk_out.summary_tsv)
        umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_io.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        tk_subproc.check_call('samtools faidx %s' % outs.contig_fasta,
                              shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_io.concatenate_files(outs.contig_fastq, contig_fastqs)

    if len(summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if len(umi_summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.umi_summary_tsv,
                                         umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        #  risks hitting the filehandle limit.
        n_merged = 0

        while len(contig_bams) > 1:
            to_merge = contig_bams[0:MERGE_BAMS_N]

            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1

            print "Merging %d BAMs into %s ..." % (len(to_merge), tmp_bam)
            tk_bam.merge(tmp_bam, to_merge, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for in_bam in to_merge:
                if os.path.basename(in_bam).startswith('merged-'):
                    cr_io.remove(in_bam)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(to_merge):] + [tmp_bam]

        if os.path.basename(contig_bams[0]).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_io.move(contig_bams[0], outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_io.copy(contig_bams[0], outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_io.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f,
                  indent=4,
                  sort_keys=True)
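
The merge-every-N loop above keeps the number of simultaneously open BAMs below the file-handle limit. A standalone sketch of the same batching pattern, using pysam.merge in place of tk_bam.merge and an assumed batch size:

import os
import pysam

MERGE_BATCH = 64  # assumed; the stage code takes its batch size from MERGE_BAMS_N

def merge_bams_in_batches(bam_paths, out_path, batch_size=MERGE_BATCH):
    # Repeatedly merge the first batch_size BAMs into a temporary file and
    # push the result back onto the work list until a single BAM remains.
    bams = list(bam_paths)
    n_merged = 0
    while len(bams) > 1:
        batch, bams = bams[:batch_size], bams[batch_size:]
        tmp = 'merged-%04d.bam' % n_merged
        n_merged += 1
        pysam.merge('-f', tmp, *batch)
        # Delete intermediate merge products once they have been consumed.
        for b in batch:
            if os.path.basename(b).startswith('merged-'):
                os.remove(b)
        bams.append(tmp)
    # Note: with a single input this moves (rather than copies) the file.
    os.rename(bams[0], out_path)
    pysam.index(out_path)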
Example #11
def join(args, outs, chunk_defs, chunk_outs):
    args_dict = {}
    args_dict["bc_allow_indel"] = args.bc_allow_indel
    args_dict["bc_max_error_allowed"] = args.bc_max_error_allowed
    args_dict["bc_pseudo_count"] = args.bc_pseudo_count
    args_dict["bc_use_mapping"] = args.bc_use_mapping
    args_dict["bc_mapq"] = args.bc_mapq
    args_dict["frag_no_merging"] = args.frag_no_merging
    args_dict["frag_mapq"] = args.frag_mapq
    args_dict["frag_pval"] = args.frag_pval
    args_dict["frag_freq"] = args.frag_freq
    with open(outs.summary, "w") as fsummary:
        fsummary.write(safe_json.safe_jsonify(args_dict))

    tk_bam.concatenate(out_file_name=outs.pos_sorted_bam, all_in_file_names=[chunk.pos_sorted_bam for chunk in chunk_outs])
    tk_bam.index(outs.pos_sorted_bam)
    outs.pos_sorted_bam_index = outs.pos_sorted_bam + '.bai'

    bam_in = tk_bam.create_bam_infile(outs.pos_sorted_bam)
    chroms = bam_in.references
    barcode_whitelist = list(tk_seq.load_barcode_whitelist(args.barcode_whitelist))
    barcode_whitelist.sort()

    # Combine fragment csv files into a single h5 file
    in_csv_files = [co.fragments+"_"+cd.tid+".csv" for (cd, co)
        in zip(chunk_defs, chunk_outs) if os.path.exists(co.fragments+"_"+cd.tid+".csv")]


    nfrags = 0
    if len(in_csv_files) > 0:
        bc_num_frags = defaultdict(int)
        bc_num_reads = defaultdict(int)
        bc_num_single_reads = defaultdict(int)
        bc_num_lens = defaultdict(int)

        temp_csv_barcodes = outs.barcodes+"_temp.csv"
        nfrags = 0

        for f in in_csv_files:
            # TODO - sequentially append to fragments.h5 file to keep memory under control
            # - handle multiple GEM groups properly.
            # ensure the chroms column has string /categorical type in hdf5
            # - same fixes for barcodes.h5 file
            # handle 0-length outputs -- does that result in None file outs?
            frag_in = p.read_csv(f, names=["tid", "start_pos", "end_pos", "bc_id", "num_reads"])
            frag_in["obs_len"] = frag_in.end_pos - frag_in.start_pos
            frag_in.loc[frag_in.num_reads <= 1, "obs_len"] = 1000

            frag_in["est_len"] = np.maximum(1, frag_in["obs_len"] * (frag_in.num_reads + 1) / np.maximum(1, frag_in.num_reads - 1)).astype("int")
            frag_in.loc[frag_in.num_reads <= 1, "est_len"] = 1000
            
            barcode_seqs = []
            molecule_ids = []
    
            for (i, row) in frag_in.iterrows():

                bc_num_frags[row.bc_id] += 1
                bc_num_reads[row.bc_id] += row.num_reads
                bc_num_lens[row.bc_id] += row.est_len
                    
                bc_wl_id = int(row.bc_id) % len(barcode_whitelist)
                gg = int(row.bc_id) / len(barcode_whitelist) + 1
                barcode_seq = "%s-%d" % (barcode_whitelist[bc_wl_id], gg)
                barcode_seqs.append(barcode_seq)
                molecule_ids.append(nfrags)

                nfrags += 1

            frag_in["bc"] = p.Categorical(barcode_seqs)
            frag_in["chrom"] = p.Categorical.from_codes(frag_in.tid, chroms)
            frag_in["molecule_id"] = molecule_ids
            del frag_in["tid"]
            del frag_in["bc_id"]

            if len(frag_in) > 0:
                tenkit.hdf5.append_data_frame(outs.fragments, frag_in)


        with open(temp_csv_barcodes, "w") as csv_out:
            csv_out.write("bc,bc_est_len,bc_linked_read_fraction,bc_linked_fragment_fraction,bc_mean_reads_per_fragment,bc_num_fragments,bc_num_reads\n")
            for bc_id in range(len(barcode_whitelist)):
                bc = barcode_whitelist[bc_id]+"-1"
                if bc_id in bc_num_frags:
                    bc_est_len = bc_num_lens[bc_id]
                    bc_linked_read_fraction = 1.0 - bc_num_single_reads[bc_id]*1.0/bc_num_reads[bc_id]
                    bc_linked_fragment_fraction = 1.0 - bc_num_single_reads[bc_id]*1.0/bc_num_frags[bc_id]
                    bc_mean_reads_per_fragment = bc_num_reads[bc_id]*1.0/bc_num_frags[bc_id]
                    csv_out.write("%s,%d,%f,%f,%f,%d,%d\n" % (bc, bc_est_len, bc_linked_read_fraction, bc_linked_fragment_fraction, bc_mean_reads_per_fragment, bc_num_frags[bc_id], bc_num_reads[bc_id]))


        if nfrags == 0:
            outs.fragments = None
            outs.barcodes = None

        else:
            tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos', 'end_pos')

            df_barcodes = p.read_csv(temp_csv_barcodes)
            tenkit.hdf5.append_data_frame(outs.barcodes, df_barcodes)

    else:
        outs.fragments = None
        outs.barcodes= None

    summary = {}
    # Compute high-level BC summary metrics
    # Load BC data
    if outs.barcodes:
        bc_df = tenkit.hdf5.read_data_frame(outs.barcodes)
        fragment_df = tenkit.hdf5.read_data_frame(outs.fragments, query_cols=['bc', 'num_reads', 'est_len', 'chrom', 'start_pos'])

        bc_df.sort('bc_num_reads', inplace=True)

        # bin the bc counts and write a json histogram file
        n_reads = bc_df.bc_num_reads.values
        max_val = np.percentile(n_reads, 99.99) * 1.3
        min_val = n_reads.min()
        num_bins = 400
        step = math.ceil((max_val - min_val)/num_bins)
        bins = np.arange(min_val, max_val, step)
        (hist, edges) = np.histogram(n_reads, bins=bins)
        bc_count_hist = {int(edges[i]):hist[i] for i in range(len(bins)-1)}

        # Summarize properties of n50 and n90 BC set
        bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)
        n50_read_thresh = sum(bc_df.bc_num_reads) * 0.5
        n50_bcs = bc_df[bc_df.cum_reads > n50_read_thresh]
        n50_fra = fragment_df[fragment_df.bc.isin(n50_bcs.bc)]
        n50_stats = high_level_stats("n50", n50_fra, n50_bcs)
        del n50_fra

        n90_read_thresh = sum(bc_df.bc_num_reads) * 0.1
        n90_bcs = bc_df[bc_df.cum_reads > n90_read_thresh]
        n90_fra = fragment_df[fragment_df.bc.isin(n90_bcs.bc)]
        n90_stats = high_level_stats("n90", n90_fra, n90_bcs)
        del n90_fra

        for (k,v) in n50_stats.iteritems():
            summary[k] = v

        for (k,v) in n90_stats.iteritems():
            summary[k] = v

        # Generate a fragment length histogram
        fragment_df['len_bin'] = np.floor_divide(fragment_df.est_len.values, FRAG_LEN_HIST_BIN_SIZE).astype(int) * FRAG_LEN_HIST_BIN_SIZE

        multi_read_frags = fragment_df[fragment_df.num_reads > 1]
        len_bins = multi_read_frags.groupby(['len_bin']).apply(len)
        del multi_read_frags

        len_hist = {k:v for (k,v) in len_bins.iteritems()}

        # Write fragment length hist to json
        with open(outs.fragment_size, 'w') as fragment_size_file:
            tenkit.safe_json.dump_numpy(len_hist, fragment_size_file)

        # Estimate total DNA per partition by looking at hottest 1000 GEMs or GEMs w/ bc_mean_reads_per_fragment > 2, whichever is fewer
        hot_bcs = bc_df[np.logical_and(bc_df.bc_mean_reads_per_fragment > 2.0, bc_df.bc_num_reads > 25)]
        hot_bcs.sort('bc_mean_reads_per_fragment', inplace=True)
        if len(hot_bcs) > 50:
            hot_bcs = hot_bcs[-NUM_BCS_LOADING_ESTIMATE:]
            summary['estimated_dna_per_partition'] = round(scipy.stats.tmean(hot_bcs.bc_est_len, scipy.percentile(hot_bcs.bc_est_len, (1,99))))
        else:
            summary['estimated_dna_per_partition'] = None

        # Read-based effective diversity
        reads = bc_df.bc_num_reads.values
        sum_sq = (reads**2.0).sum()
        effective_diversity = tk_stats.robust_divide((reads.sum()**2.0), float(sum_sq))
        summary['effective_diversity_reads'] = effective_diversity

        # Fragment-based effective diversity
        fragments = bc_df.bc_num_fragments.values
        sum_sq = (fragments**2.0).sum()
        effective_diversity = tk_stats.robust_divide((fragments.sum()**2.0), float(sum_sq))
        summary['effective_diversity_fragments'] = effective_diversity

    else:
        # No fragment_size file emitted
        outs.fragment_size = None

        n50_stats = high_level_stats("n50", None, None)
        n90_stats = high_level_stats("n90", None, None)

        for (k,v) in n50_stats.iteritems():
            summary[k] = v

        for (k,v) in n90_stats.iteritems():
            summary[k] = v

        bc_count_hist = {}

        summary['estimated_dna_per_partition'] = None
        summary['effective_diversity_reads'] = None
        summary['effective_diversity_fragments'] = None

    with open(outs.barcode_histogram, 'w') as barcode_hist_file:
        tenkit.safe_json.dump_numpy(bc_count_hist, barcode_hist_file)

    # Write summary to json
    with open(outs.single_partition, 'w') as summary_file:
        tenkit.safe_json.dump_numpy(summary, summary_file, pretty=True)
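
The two "effective diversity" metrics above are the inverse Simpson index of the per-barcode counts, (sum r_i)^2 / sum(r_i^2). A small worked sketch of the same expression:

import numpy as np

def effective_diversity(counts):
    # (sum r_i)^2 / sum(r_i^2): equals the number of barcodes when counts are
    # perfectly even, and approaches 1 when a single barcode dominates.
    counts = np.asarray(counts, dtype=float)
    denom = (counts ** 2).sum()
    return (counts.sum() ** 2) / denom if denom > 0 else float('NaN')

assert effective_diversity([10, 10, 10, 10]) == 4.0
assert round(effective_diversity([97, 1, 1, 1]), 2) == 1.06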
Example #12
def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    summary_df_parts = []
    umi_summary_df_parts = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)

        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)
        summary_df_parts.append(
            pd.read_csv(chunk_out.summary_tsv,
                        header=0,
                        index_col=None,
                        sep='\t',
                        dtype={
                            'component': int,
                            'num_reads': int,
                            'num_pairs': int,
                            'num_umis': int
                        }))

        umi_summary_df_parts.append(
            pd.read_csv(chunk_out.umi_summary_tsv,
                        header=0,
                        index_col=None,
                        sep='\t',
                        dtype={
                            'umi_id': int,
                            'reads': int,
                            'min_umi_reads': int,
                            'contigs': str
                        }))

    summary_df = pd.concat(summary_df_parts, ignore_index=True)
    umi_summary_df = pd.concat(umi_summary_df_parts, ignore_index=True)

    cr_utils.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        subprocess.check_call('samtools faidx %s' % outs.contig_fasta,
                              shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_utils.concatenate_files(outs.contig_fastq, contig_fastqs)

    if summary_df is not None:
        summary_df.to_csv(outs.summary_tsv, header=True, index=False, sep='\t')
    if umi_summary_df is not None:
        umi_summary_df.to_csv(outs.umi_summary_tsv,
                              header=True,
                              index=False,
                              sep='\t')

    if contig_bams:
        tk_bam.merge(outs.contig_bam, contig_bams, threads=args.__threads)
        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_utils.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f,
                  indent=4,
                  sort_keys=True)
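
Both this example and Example #10 finish by combining the per-chunk metric JSONs with merge_jsons_single_level. A minimal sketch of a single-level merge of that kind (the real cellranger helper may handle key conflicts differently):

import json

def merge_jsons_single_level_sketch(paths):
    # Merge flat JSON dicts; later files overwrite earlier keys (assumed behaviour).
    merged = {}
    for path in paths:
        if path is None:
            continue
        with open(path) as f:
            merged.update(json.load(f))
    return merged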