Example #1
    def test_targets(self):
        bam_bc_file = tk_test.in_path("namesort_test.bam")
        read_info_out = tk_test.out_path("read_info.h5")
        barcode_whitelist = tk_seq.load_barcode_whitelist("737K-april-2014")

        targets_filename = tk_test.in_path('agilent_kinome_targs.bed')
        targets_file = open(targets_filename, 'r')
        target_regions = tk_io.get_target_regions(targets_file)

        bam_in = tk_bam.create_bam_infile(bam_bc_file)
        r = compute_basic_stats(bam_in,
                                target_regions,
                                1000,
                                bam_in.references,
                                barcode_whitelist=barcode_whitelist,
                                read_h5_out=read_info_out)
        # insert_size_dists, nearest_targ_dists, summary_metrics, bc_table, mapq_counts, insert_size_hist = r
        misc_sm, bc_sms = r

        nearest_targ_dists = bc_sms.get('nearest_targ_dists')
        maxTargetDist = max(nearest_targ_dists.get_summarizer(60).dict.keys())
        minTargetDist = min(nearest_targ_dists.get_summarizer(60).dict.keys())

        self.assertEqual(minTargetDist, 130)
        self.assertEqual(maxTargetDist, 10000)
Example #2
    def test_attach_bcs(self):
        #  --align_input alignment_output.bam --barcode_input phix_I2.fastq --output test2.out --complete ~/c --stats ~/s
        args = {
            'barcode_whitelist' : IN_WHITELIST,
            'align_chunk' : IN_BAM,
            'barcode_chunk' : IN_I2,
            'sample_index_chunk' : IN_I1,
            'gem_group' : None,
            'paired_end' : True,
            'exclude_non_bc_reads' : False,
            'max_expected_bc_error': 0.75,
            'subsample_rate' : 1.0,
        }
        outs = { 'output': OUT_BAM }

        args = martian.Record(args)
        outs = martian.Record(outs)

        main(args, outs)

        # Get the barcodes
        barcode_whitelist = tk_seq.load_barcode_whitelist(IN_WHITELIST)

        # Ensure each read has a barcode
        out_bam = pysam.Samfile(OUT_BAM)
        for r in out_bam:
            tag_dict = { k:v for (k,v) in r.tags }
            tag_names = [ k for (k,v) in r.tags ]
            self.assertTrue(RAW_BARCODE_TAG in tag_names)

            if tag_dict[RAW_BARCODE_TAG] in barcode_whitelist:
                self.assertTrue(PROCESSED_BARCODE_TAG in tag_names)

            self.assertTrue(SAMPLE_INDEX_TAG in tag_names)


        # Make sure we wrote out the full BAM file
        out_len = len([ x for x in pysam.Samfile(OUT_BAM)])
        in_len  = len([ x for x in pysam.Samfile(IN_BAM)])
        self.assertEqual(out_len, in_len)


        def get_bc(r):
            tags = { k:v for (k,v) in r.tags }
            return tags[RAW_BARCODE_TAG]

        # Ensure each read pair has the same barcode
        out_bam = pysam.Samfile(OUT_BAM)
        reads = [ x for x in out_bam ]

        for (qname, grp_reads) in groupby(reads, lambda x: x.qname):
            bcs = set(tk_io.get_read_barcode(r) for r in grp_reads)
            self.assertEqual(len(bcs), 1)
Example #3
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """

    # Bail out if there's no barcodes or whitelist
    if args.barcode_whitelist is None or args.chunk['barcode'] is None:
        outs.bc_counts = None
        return

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    barcode_whitelist = sorted(
        list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
    bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)}
    bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
    bad_count = 0
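    # Note: bc_counts is positional -- each slot corresponds to a barcode's
    # index in the sorted whitelist -- so the join() stage (see Example #6)
    # can sum these arrays element-wise across chunks after reloading them
    # from JSON with np.array(r['bc_counts'], dtype=np.int32).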

    barcode_file = open_maybe_gzip(args.chunk['barcode'])
    bc_iterator = tk_fasta.read_generator_fastq(barcode_file)

    for (bc_read, raw_bc_seq, raw_bc_qual) in bc_iterator:
        idx = bc_idx.get(raw_bc_seq)

        if idx is not None:
            bc_counts[idx] += 1
        else:
            bad_count += 1

    # Write BC count array and bad count to pickle
    result = {}
    result['bad_bc_count'] = bad_count
    result['bc_counts'] = list(bc_counts)

    with open(outs.bc_counts, 'w') as bc_counts_out:
        tenkit.safe_json.dump_numpy(result, bc_counts_out)
Example #4
    def test_barcode_counts(self):
        bam_bc_file = tk_test.in_path("attach_bcs/attach_bcs_output.bam")
        read_info_out = tk_test.out_path("read_info.h5")
        barcode_whitelist = tk_seq.load_barcode_whitelist("737K-april-2014")
        bam_in = tk_bam.create_bam_infile(bam_bc_file)
        r = compute_basic_stats(bam_in, {},
                                2000,
                                bam_in.references,
                                barcode_whitelist=barcode_whitelist,
                                read_h5_out=read_info_out)
        # insert_size_dists, nearest_targ_dists, summary_metrics, bc_table, mapq_counts, insert_size_hist = r
        misc_sm, bc_sms = r

        # Look at the barcode results -- there should be a raw bc count for each read pair
        # n_raw_bcs = bc_table["count"].sum()
        n_reads = len([x for x in tk_bam.create_bam_infile(bam_bc_file)])

        # self.assertEqual(n_raw_bcs, n_reads / 2)

        # Load the per-cluster table -- there should be a row for each read pair
        read_info = tenkit.hdf5.read_data_frame(read_info_out)

        self.assertEqual(read_info.shape[0], n_reads / 2)
Example #5
def split(args):
    if args.input is None or args.barcode_whitelist is None:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0"}]
        return {'chunks': chunk_defs}

    # Some R&D bc sets have very small diversity -- don't run on them
    barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 100:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0"}]
        return {'chunks': chunk_defs}

    min_chunks = 5
    if len(barcode_whitelist) > 1e6:
        min_chunks = 10

    bam_in = tk_bam.create_bam_infile(args.input)
    chunks = tk_bam.chunk_bam_records(bam_in,
                                      chunk_split_func,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)
    for c in chunks:
        c['__mem_gb'] = 7.0

    return {'chunks': chunks, 'join': {'__mem_gb': 32.0}}
Example #6
def join(args, outs, chunk_defs, chunk_outs):
    final_chunks = []

    for cl in chunk_outs:
        final_chunks.extend(cl.chunks)

    outs.chunks = final_chunks
    valid_counts = [c.bc_counts for c in chunk_outs if c.bc_counts is not None]

    # No counts if there's no whitelist or actual counts
    if args.barcode_whitelist is None or len(valid_counts) == 0:
        outs.bc_counts = None
        outs.lot_info = None
        return

    result = {}

    for (c_out, c_def) in zip(chunk_outs, chunk_defs):
        gem_group = c_def.chunk['gem_group']
        if c_out.bc_counts is None:
            continue

        with open(c_out.bc_counts) as f:
            r = json.load(f)

        gg_result = result.setdefault(gem_group, {
            'bad_bc_count': 0,
            'bc_counts': None
        })

        gg_result['bad_bc_count'] += r['bad_bc_count']

        if gg_result['bc_counts'] is None:
            gg_result['bc_counts'] = np.array(r['bc_counts'], dtype=np.int32)
        else:
            gg_result['bc_counts'] += np.array(r['bc_counts'], dtype=np.int32)

    for gg in result.keys():
        rgg = result[gg]
        rgg['bc_error_rate'] = tk_stats.robust_divide(
            float(rgg['bad_bc_count']),
            float(rgg['bad_bc_count'] + rgg['bc_counts'].sum()))
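        # Illustrative numbers (not from any actual run): if bad_bc_count is
        # 5,000 and bc_counts.sum() is 95,000, then
        # bc_error_rate = 5000 / (5000 + 95000) = 0.05. robust_divide is
        # presumably used so an empty gem group cannot raise ZeroDivisionError.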

    # possibly do lot detection
    lot_detection = {}
    lot_map = WHITELIST_TO_LOT_MAP.get(args.barcode_whitelist)
    if lot_map is not None:
        # get BC counts histogram
        # for now, just sum over all gem groups
        bc_seq = sorted(
            list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
        bc_cts = np.sum([ggr['bc_counts'] for ggr in result.values()], axis=0)
        bc_hist = {seq: cts for seq, cts in zip(bc_seq, bc_cts)}

        (gelbead_lot, gelbead_lot_confidence,
         gelbead_lot_counts) = identify_gelbead_lot(bc_hist, lot_map)
        # only report on lots with nonzero counts
        gelbead_lot_counts_nonzero = {
            lot: count
            for lot, count in gelbead_lot_counts.items() if count > 0
        }

        lot_detection['gelbead_lot'] = gelbead_lot
        lot_detection['gelbead_lot_confidence'] = gelbead_lot_confidence
        lot_detection['gelbead_lot_counts'] = gelbead_lot_counts_nonzero

        martian.log_info("Gelbead lot detected: %s, reason (if None): %s" %
                         (gelbead_lot, gelbead_lot_confidence))

    with open(outs.lot_info, 'w') as f:
        tenkit.safe_json.dump_numpy(lot_detection, f)

    with open(outs.bc_counts, 'w') as f:
        tenkit.safe_json.dump_numpy(result, f)
Example #7
def main(args, outs):
    """ Trim the reads in a series of fasta files """

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    chunk = args.chunk
    interleaved = chunk['reads_interleaved']
    have_read2 = chunk['read2'] is not None
    paired = interleaved or have_read2

    read1_trim = args.read1_trim_length
    read2_trim = args.read2_trim_length

    subsample_rate = chunk['subsample_rate']

    # BC config -- BC come from separate fastq, or are embedded in R1 or R2
    have_barcode = False
    bc_in_read1 = False
    bc_in_read2 = False
    bc_in_fastq = False

    # If we have bc in read, use that & ignore a separate BC read
    if chunk.get('bc_in_read') is not None and chunk.get('bc_length', 0) > 0:
        have_barcode = True
        bc_length = chunk['bc_length']
        if chunk['bc_in_read'] == 1:
            bc_in_read1 = True
            read1_trim += bc_length
        elif chunk['bc_in_read'] == 2:
            bc_in_read2 = True
            read2_trim += bc_length
        else:
            martian.exit(
                "bc_in_read configuration incorrect -- read must be 1 or 2")

    # Otherwise use the BC file
    elif chunk['barcode'] is not None:
        have_barcode = True
        bc_in_fastq = True
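
    # For illustration (hypothetical chunk values, not pipeline defaults): with
    # chunk = {'bc_in_read': 1, 'bc_length': 16, 'barcode': None}, the first 16
    # bases of read 1 are treated as the barcode; read1_trim grows by 16 so those
    # bases are trimmed from the emitted read, and the barcode itself is sliced
    # out later as seq1[:bc_length].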

    have_sample_index = chunk['sample_index'] is not None

    output_directory = os.path.dirname(os.path.realpath(outs.placeholder))
    max_read_num = args.max_read_num

    # counter for sub-chunked files
    file_number = 1

    # open the available read files and make the appropriate iterators
    if interleaved:
        read_in = openfq(chunk['read1'])
        read_iter = tk_fasta.read_generator_fastq(read_in, paired_end=True)
    else:
        if have_read2:
            read1_in = openfq(chunk['read1'])
            read1_iter = tk_fasta.read_generator_fastq(read1_in)

            read2_in = openfq(chunk['read2'])
            read2_iter = tk_fasta.read_generator_fastq(read2_in)

            read_iter = itertools.imap(
                lambda x, y: (x[0], x[1], x[2], y[0], y[1], y[2]), read1_iter,
                read2_iter)
        else:
            read1_in = openfq(chunk['read1'])
            read_iter = tk_fasta.read_generator_fastq(read1_in)

    # open read file
    read_name = output_directory + "/read" + str(file_number) + ".fastq"
    read_names = [read_name]
    out_read_fastq = open(read_name, 'w')

    # Bail out if there's no barcodes or whitelist
    if args.barcode_whitelist is None:
        outs.bc_counts = None
        bc_idx = None
    else:
        barcode_whitelist = sorted(
            list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
        bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)}
        bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
        bad_count = 0

    # open barcode file if there is one
    if have_barcode:
        bc_name = output_directory + "/BC" + str(file_number) + ".fastq"
        out_bc_fastq = open(bc_name, 'w')
        bc_names = [bc_name]
        if bc_in_fastq:
            bc_in = openfq(chunk['barcode'])
            bc_iter = tk_fasta.read_generator_fastq(bc_in)
        elif bc_in_read1 or bc_in_read2:
            # BC in read -- have output file but no input file
            bc_iter = itertools.repeat(None)
    else:
        bc_iter = itertools.repeat(None)
        bc_names = [None]
        outs.bc_counts = None

    # open sample_index file if there is one
    if have_sample_index:
        si_name = output_directory + "/SI" + str(file_number) + ".fastq"
        out_si_fastq = open(si_name, 'w')
        si_in = openfq(chunk['sample_index'])
        si_iter = tk_fasta.read_generator_fastq(si_in)
        si_names = [si_name]
    else:
        si_iter = itertools.repeat(None)
        si_names = [None]

    # loop through reads
    read_num = 0
    for read, barcode_read, sample_index_read in itertools.izip(
            read_iter, bc_iter, si_iter):
        if read_num > 0 and random.random() > subsample_rate:
            continue

        if paired:
            (name1, seq1, qual1, name2, seq2, qual2) = read
        else:
            (name1, seq1, qual1) = read

        new_seq1 = seq1[read1_trim:]
        new_qual1 = qual1[read1_trim:]
        if paired:
            new_seq2 = seq2[read2_trim:]
            new_qual2 = qual2[read2_trim:]

        # Get BC sequence out of the read, for BC-in-read schemes
        if bc_in_read1:
            barcode_read = (name1, seq1[:bc_length], qual1[:bc_length])

        if bc_in_read2:
            barcode_read = (name2, seq2[:bc_length], qual2[:bc_length])

        read_num += 1
        if read_num > max_read_num:
            read_num = 1
            file_number += 1
            read_name = output_directory + "/read" + str(
                file_number) + ".fastq"
            out_read_fastq.close()
            out_read_fastq = open(read_name, 'w')
            read_names.append(read_name)

            if have_barcode:
                bc_name = output_directory + "/BC" + str(
                    file_number) + ".fastq"
                out_bc_fastq.close()
                out_bc_fastq = open(bc_name, 'w')
                bc_names.append(bc_name)
            else:
                bc_names.append(None)

            if have_sample_index:
                si_name = output_directory + "/SI" + str(
                    file_number) + ".fastq"
                out_si_fastq.close()
                out_si_fastq = open(si_name, 'w')
                si_names.append(si_name)
            else:
                si_names.append(None)

        if have_barcode:
            barcode_seq = barcode_read[1]
            barcode_qual = barcode_read[2]
            if chunk['barcode_reverse_complement']:
                barcode_seq = tk_seq.get_rev_comp(barcode_seq)
                barcode_qual = barcode_qual[::-1]  # reverse the quality string to match the rev-comp
            if bc_idx is not None:
                idx = bc_idx.get(barcode_seq)
                if idx is not None:
                    bc_counts[idx] += 1
                else:
                    bad_count += 1

            tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0],
                                      barcode_seq, barcode_qual)
        if have_sample_index:
            tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0],
                                      sample_index_read[1],
                                      sample_index_read[2])

        tk_fasta.write_read_fastq(out_read_fastq, name1, new_seq1, new_qual1)
        if paired:
            tk_fasta.write_read_fastq(out_read_fastq, name2, new_seq2,
                                      new_qual2)

    if have_barcode:
        out_bc_fastq.close()
        # Only emit BC counts if we had a whitelist
        if outs.bc_counts is not None:
            result = {}
            result['bad_bc_count'] = bad_count
            result['bc_counts'] = list(bc_counts)
            with open(outs.bc_counts, 'w') as bc_counts_out:
                tenkit.safe_json.dump_numpy(result, bc_counts_out)
    if have_sample_index:
        out_si_fastq.close()
    out_read_fastq.close()

    chunks = []
    for (r, bc, si) in zip(read_names, bc_names, si_names):
        new_chunk = {
            'read1': r,
            'read2': None,
            'barcode': bc,
            'sample_index': si,
            'barcode_reverse_complement': False,
            'reads_interleaved': have_read2 or interleaved,
            'gem_group': chunk['gem_group'],
            'read_group': chunk['read_group']
        }
        chunks.append(new_chunk)

    outs.chunks = chunks
Example #8
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """

    chunk = args.chunk

    #subsample_rate = 1.0
    #if args.subsample_rate is not None:
    #    subsample_rate = args.subsample_rate

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs"))

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) }

    # set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    if chunk['barcode']:
        processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index']:
        sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter)

    # First read
    read = bam_in.next()

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]


        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name != None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name))
                assert(si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        reads_attached = 0
        #emit_read_pair = random.random() < subsample_rate
        emit_read_pair = True

        while read.qname == read_name or read_name == None:
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            reads_attached += 1
            if not (read_name is None):
                assert(read.qname == read_name)

            if emit_read_pair:
                # Count the perfect reads -- will be used when subsampling in dedup
                if tenkit.read_filter.stringent_read_filter(read, require_barcode_for_stringent):
                    perfect_read_count += 1

                if args.exclude_non_bc_reads:
                    if not(tk_io.get_read_barcode(read) is None):
                        bam_out.write(read)
                else:
                    bam_out.write(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        # We may have more than 2 reads if there was a
        # secondary alignment, but fewer than 2 means
        # something went wrong
        assert(reads_attached >= 2)


    outs.perfect_read_count = perfect_read_count
    bam_out.close()
Example #9
def join(args, outs, chunk_defs, chunk_outs):
    args_dict = {}
    args_dict["bc_allow_indel"] = args.bc_allow_indel
    args_dict["bc_max_error_allowed"] = args.bc_max_error_allowed
    args_dict["bc_pseudo_count"] = args.bc_pseudo_count
    args_dict["bc_use_mapping"] = args.bc_use_mapping
    args_dict["bc_mapq"] = args.bc_mapq
    args_dict["frag_no_merging"] = args.frag_no_merging
    args_dict["frag_mapq"] = args.frag_mapq
    args_dict["frag_pval"] = args.frag_pval
    args_dict["frag_freq"] = args.frag_freq
    fsummary = open(outs.summary, "w")
    fsummary.write(safe_json.safe_jsonify(args_dict))
    fsummary.close()

    tk_bam.concatenate(out_file_name=outs.pos_sorted_bam, all_in_file_names=[chunk.pos_sorted_bam for chunk in chunk_outs])
    tk_bam.index(outs.pos_sorted_bam)
    outs.pos_sorted_bam_index = outs.pos_sorted_bam + '.bai'

    bam_in = tk_bam.create_bam_infile(outs.pos_sorted_bam)
    chroms = bam_in.references
    barcode_whitelist = list(tk_seq.load_barcode_whitelist(args.barcode_whitelist))
    barcode_whitelist.sort()

    # Combine fragment csv files into a single h5 file
    in_csv_files = [co.fragments+"_"+cd.tid+".csv" for (cd, co)
        in zip(chunk_defs, chunk_outs) if os.path.exists(co.fragments+"_"+cd.tid+".csv")]


    nfrags = 0
    if len(in_csv_files) > 0:
        bc_num_frags = defaultdict(int)
        bc_num_reads = defaultdict(int)
        bc_num_single_reads = defaultdict(int)
        bc_num_lens = defaultdict(int)

        temp_csv_barcodes = outs.barcodes+"_temp.csv"
        nfrags = 0

        for f in in_csv_files:
            # TODO - sequentially append to fragments.h5 file to keep memory under control
            # - handle multiple GEM groups properly.
            # ensure the chroms column has string /categorical type in hdf5
            # - same fixes for barcodes.h5 file
            # handle 0-length outputs -- does that result in None file outs?
            frag_in = p.read_csv(f, names=["tid", "start_pos", "end_pos", "bc_id", "num_reads"])
            frag_in["obs_len"] = frag_in.end_pos - frag_in.start_pos
            frag_in.loc[frag_in.num_reads <= 1, "obs_len"] = 1000

            frag_in["est_len"] = np.maximum(1, frag_in["obs_len"] * (frag_in.num_reads + 1) / np.maximum(1, frag_in.num_reads - 1)).astype("int")
            frag_in.loc[frag_in.num_reads <= 1, "est_len"] = 1000
            
            barcode_seqs = []
            molecule_ids = []
    
            for (i, row) in frag_in.iterrows():

                bc_num_frags[row.bc_id] += 1
                bc_num_reads[row.bc_id] += row.num_reads
                bc_num_lens[row.bc_id] += row.est_len
                    
                bc_wl_id = int(row.bc_id) % len(barcode_whitelist)
                gg = int(row.bc_id) / len(barcode_whitelist) + 1
                barcode_seq = "%s-%d" % (barcode_whitelist[bc_wl_id], gg)
                barcode_seqs.append(barcode_seq)
                molecule_ids.append(nfrags)

                nfrags += 1

            frag_in["bc"] = p.Categorical(barcode_seqs)
            frag_in["chrom"] = p.Categorical.from_codes(frag_in.tid, chroms)
            frag_in["molecule_id"] = molecule_ids
            del frag_in["tid"]
            del frag_in["bc_id"]

            if len(frag_in) > 0:
                tenkit.hdf5.append_data_frame(outs.fragments, frag_in)


        with open(temp_csv_barcodes, "w") as csv_out:
            csv_out.write("bc,bc_est_len,bc_linked_read_fraction,bc_linked_fragment_fraction,bc_mean_reads_per_fragment,bc_num_fragments,bc_num_reads\n")
            for bc_id in range(len(barcode_whitelist)):
                bc = barcode_whitelist[bc_id]+"-1"
                if bc_id in bc_num_frags:
                    bc_est_len = bc_num_lens[bc_id]
                    bc_linked_read_fraction = 1.0 - bc_num_single_reads[bc_id]*1.0/bc_num_reads[bc_id]
                    bc_linked_fragment_fraction = 1.0 - bc_num_single_reads[bc_id]*1.0/bc_num_frags[bc_id]
                    bc_mean_reads_per_fragment = bc_num_reads[bc_id]*1.0/bc_num_frags[bc_id]
                    csv_out.write("%s,%d,%f,%f,%f,%d,%d\n" % (bc, bc_est_len, bc_linked_read_fraction, bc_linked_fragment_fraction, bc_mean_reads_per_fragment, bc_num_frags[bc_id], bc_num_reads[bc_id]))


        if nfrags == 0:
            outs.fragments = None
            outs.barcodes = None

        else:
            tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos', 'end_pos')

            df_barcodes = p.read_csv(temp_csv_barcodes)
            tenkit.hdf5.append_data_frame(outs.barcodes, df_barcodes)

    else:
        outs.fragments = None
        outs.barcodes = None

    summary = {}
    # Compute high-level BC summary metrics
    # Load BC data
    if outs.barcodes:
        bc_df = tenkit.hdf5.read_data_frame(outs.barcodes)
        fragment_df = tenkit.hdf5.read_data_frame(outs.fragments, query_cols=['bc', 'num_reads', 'est_len', 'chrom', 'start_pos'])

        bc_df.sort('bc_num_reads', inplace=True)

        # bin the bc counts and write a json histogram file
        n_reads = bc_df.bc_num_reads.values
        max_val = np.percentile(n_reads, 99.99) * 1.3
        min_val = n_reads.min()
        num_bins = 400
        step = math.ceil((max_val - min_val)/num_bins)
        bins = np.arange(min_val, max_val, step)
        (hist, edges) = np.histogram(n_reads, bins=bins)
        bc_count_hist = {int(edges[i]):hist[i] for i in range(len(bins)-1)}

        # Summarize properties of n50 and n90 BC set
        bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)
        n50_read_thresh = sum(bc_df.bc_num_reads) * 0.5
        n50_bcs = bc_df[bc_df.cum_reads > n50_read_thresh]
        n50_fra = fragment_df[fragment_df.bc.isin(n50_bcs.bc)]
        n50_stats = high_level_stats("n50", n50_fra, n50_bcs)
        del n50_fra

        n90_read_thresh = sum(bc_df.bc_num_reads) * 0.1
        n90_bcs = bc_df[bc_df.cum_reads > n90_read_thresh]
        n90_fra = fragment_df[fragment_df.bc.isin(n90_bcs.bc)]
        n90_stats = high_level_stats("n90", n90_fra, n90_bcs)
        del n90_fra

        for (k,v) in n50_stats.iteritems():
            summary[k] = v

        for (k,v) in n90_stats.iteritems():
            summary[k] = v

        # Generate a fragment length histogram
        fragment_df['len_bin'] = np.floor_divide(fragment_df.est_len.values, FRAG_LEN_HIST_BIN_SIZE).astype(int) * FRAG_LEN_HIST_BIN_SIZE

        multi_read_frags = fragment_df[fragment_df.num_reads > 1]
        len_bins = multi_read_frags.groupby(['len_bin']).apply(len)
        del multi_read_frags

        len_hist = {k:v for (k,v) in len_bins.iteritems()}

        # Write fragment length hist to json
        with open(outs.fragment_size, 'w') as fragment_size_file:
            tenkit.safe_json.dump_numpy(len_hist, fragment_size_file)

        # Estimate total DNA per partition by looking at hottest 1000 GEMs or GEMs w/ bc_mean_reads_per_fragment > 2, whichever is fewer
        hot_bcs = bc_df[np.logical_and(bc_df.bc_mean_reads_per_fragment > 2.0, bc_df.bc_num_reads > 25)]
        hot_bcs.sort('bc_mean_reads_per_fragment', inplace=True)
        if len(hot_bcs) > 50:
            hot_bcs = hot_bcs[-NUM_BCS_LOADING_ESTIMATE:]
            summary['estimated_dna_per_partition'] = round(scipy.stats.tmean(hot_bcs.bc_est_len, scipy.percentile(hot_bcs.bc_est_len, (1,99))))
        else:
            summary['estimated_dna_per_partition'] = None

        # Read-based effective diversity
        reads = bc_df.bc_num_reads.values
        sum_sq = (reads**2.0).sum()
        effective_diversity = tk_stats.robust_divide((reads.sum()**2.0), float(sum_sq))
        summary['effective_diversity_reads'] = effective_diversity
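        # This is the inverse Simpson index, (sum r_i)^2 / sum(r_i^2), i.e. an
        # "effective number" of barcodes. Illustrative values: four barcodes with
        # reads [100, 100, 100, 100] give 400^2 / 40000 = 4, whereas
        # [370, 10, 10, 10] give 400^2 / 137200 ~= 1.17.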

        # Fragment-based effective diversity
        fragments = bc_df.bc_num_fragments.values
        sum_sq = (fragments**2.0).sum()
        effective_diversity = tk_stats.robust_divide((fragments.sum()**2.0), float(sum_sq))
        summary['effective_diversity_fragments'] = effective_diversity

    else:
        # No fragment_size file emitted
        outs.fragment_size = None

        n50_stats = high_level_stats("n50", None, None)
        n90_stats = high_level_stats("n90", None, None)

        for (k,v) in n50_stats.iteritems():
            summary[k] = v

        for (k,v) in n90_stats.iteritems():
            summary[k] = v

        bc_count_hist = {}

        summary['estimated_dna_per_partition'] = None
        summary['effective_diversity_reads'] = None
        summary['effective_diversity_fragments'] = None

    with open(outs.barcode_histogram, 'w') as barcode_hist_file:
        tenkit.safe_json.dump_numpy(bc_count_hist, barcode_hist_file)

    # Write summary to json
    with open(outs.single_partition, 'w') as summary_file:
        tenkit.safe_json.dump_numpy(summary, summary_file, pretty=True)
Example #10
def join(args, outs, chunk_defs, chunk_outs):
    final_chunks = []

    for cl in chunk_outs:
        final_chunks.extend(cl.chunks)

    outs.chunks = final_chunks
    valid_counts = [c.bc_counts for c in chunk_outs if c.bc_counts is not None]

    # No counts if there's no whitelist or actual counts
    if args.barcode_whitelist is None or len(valid_counts) == 0:
        outs.bc_counts = None
        outs.lot_info = None
        return

    result = {}

    for (c_out, c_def) in zip(chunk_outs, chunk_defs):
        gem_group = c_def.chunk['gem_group']
        if c_out.bc_counts is None:
            continue

        with open(c_out.bc_counts) as f:
            r = json.load(f)

        gg_result = result.setdefault(gem_group, {
            'bad_bc_count': 0,
            'bc_counts': None
        })

        gg_result['bad_bc_count'] += r['bad_bc_count']

        if gg_result['bc_counts'] is None:
            gg_result['bc_counts'] = np.array(r['bc_counts'], dtype=np.int32)
        else:
            gg_result['bc_counts'] += np.array(r['bc_counts'], dtype=np.int32)

    total_counts = 0
    total_errors = 0
    for gg in result.keys():
        rgg = result[gg]
        rgg['bc_error_rate'] = tk_stats.robust_divide(
            float(rgg['bad_bc_count']),
            float(rgg['bad_bc_count'] + rgg['bc_counts'].sum()))
        total_counts += float(rgg['bad_bc_count'] + rgg['bc_counts'].sum())
        total_errors += float(rgg['bad_bc_count'])

    # Hardcoded bail-out if the barcode error rate is extremely high
    bc_error_rate = total_errors / total_counts
    if bc_error_rate > 0.97:
        martian.exit(
            "Extremely high rate of incorrect barcodes observed (%.2f %%). Check that input is 10x Chromium data, and that there are no missing cycles in the first 16bp of Read 1. Please note that Supernova does not support GemCode data."
            % (bc_error_rate * 100.0))

    # possibly do lot detection
    lot_detection = {}
    lot_map = WHITELIST_TO_LOT_MAP.get(args.barcode_whitelist)
    if lot_map is not None:
        # get BC counts histogram
        # for now, just sum over all gem groups
        bc_seq = sorted(
            list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
        bc_cts = np.sum([ggr['bc_counts'] for ggr in result.values()], axis=0)
        bc_hist = {seq: cts for seq, cts in zip(bc_seq, bc_cts)}

        (gelbead_lot, gelbead_lot_confidence,
         gelbead_lot_counts) = identify_gelbead_lot(bc_hist, lot_map)
        # only report on lots with nonzero counts
        gelbead_lot_counts_nonzero = {
            lot: count
            for lot, count in gelbead_lot_counts.items() if count > 0
        }

        lot_detection['gelbead_lot'] = gelbead_lot
        lot_detection['gelbead_lot_confidence'] = gelbead_lot_confidence
        lot_detection['gelbead_lot_counts'] = gelbead_lot_counts_nonzero

        martian.log_info("Gelbead lot detected: %s, reason (if None): %s" %
                         (gelbead_lot, gelbead_lot_confidence))

    with open(outs.lot_info, 'w') as f:
        tenkit.safe_json.dump_numpy(lot_detection, f, pretty=True)

    with open(outs.bc_counts, 'w') as f:
        tenkit.safe_json.dump_numpy(result, f)
Example #11
def main_report_length_mass(args, outs):
    tmp_dir = os.path.dirname(outs.summary)

    empty_stats = {
        'alpha': [],
        'alpha_mean': None,
        'alpha_cv': None,
        'mean_frags': None,
        'total_frags': [],
        'length_distribution': {},
        'empirical_length_distribution': {},
        'inferred_mean_length': None,
        'inferred_lw_mean_length': None,
        'inferred_total_mass_ng': None,
        'inferred_bp_per_bc': [],
        'mean_bp_per_bc': 0,
        'occupied_bcs': 0,
        'inferred_number_gems': 0,
    }

    if args.barcodes is None or args.barcode_whitelist is None or not os.path.exists(
            args.barcodes):
        return empty_stats

    barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)

    if len(barcode_whitelist) < 1000:
        return empty_stats

    if args.targets_file is None:
        targeted = False
        num_frags = NUM_FRAGS
    else:
        targeted = True
        num_frags = NUM_FRAGS_TARGETED

    bc_df = tenkit.hdf5.read_data_frame(args.barcodes)
    frag_df = tenkit.hdf5.read_data_frame(
        args.fragments,
        ['bc', 'chrom', 'start_pos', 'obs_len', 'num_reads', 'est_len'])
    input_num_frags = len(frag_df)

    gem_group = [int(bc.split('-')[1]) for bc in bc_df.bc]
    num_gem_groups = len(set(gem_group))

    # Start with data about all barcodes.
    # First filter out any barcodes that don't have at least 1 molecule that has > 1 read
    # This eliminates most of the background contamination of barcodes
    bc_df = bc_df[bc_df.bc_mean_reads_per_fragment > 1.0].copy()
    bc_df.sort('bc_num_reads', inplace=True)

    # Subset set to the N99 barcodes. (i.e. barcode that account for 99% of reads), and have at least 1 valid fragment
    # A valid fragment must have >= 1 MAPQ30 read and at least 1
    bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)
    prod_bc_thresh = 0.01 * bc_df.bc_num_reads.sum()
    occupied_bcs_df = bc_df[np.logical_and(bc_df.cum_reads > prod_bc_thresh,
                                           bc_df.bc_num_fragments > 0)]

    if len(occupied_bcs_df) == 0:
        martian.log_info(
            "No valid barcodes for length/mass inference -- exiting")
        return empty_stats

    # Figure out the subset of BCs likely to be singleton BCs
    # Only run estimation on that subset
    # Infer the expected total GEM count that should have been present
    occupied_bcs = len(occupied_bcs_df)
    total_diversity = len(barcode_whitelist) * num_gem_groups

    # Poisson correction -- we know how many barcodes have >= 1 GEM, and we know
    # how many total barcodes are possible. Use the Poisson distribution to
    # back-calculate the number of GEMs that must have been present.
    # For Chromium there are 4.2M barcodes.
    p_occupied = float(occupied_bcs) / total_diversity
    mean_gems_per_bc = min(100, -np.log(1 - p_occupied))
    p_singleton = scipy.stats.poisson.pmf(1, mean_gems_per_bc)
    n_singleton = p_singleton * total_diversity

    # n_gems gets reported out as 'Gems Detected' in Loupe
    n_gems = int(round(mean_gems_per_bc * total_diversity))
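    # Illustrative numbers only: if 1.0M of 4.2M possible barcodes are occupied,
    # p_occupied ~= 0.238, mean_gems_per_bc = -ln(1 - 0.238) ~= 0.272,
    # n_gems ~= 0.272 * 4.2M ~= 1.14M, and
    # p_singleton = Poisson(k=1; 0.272) = 0.272 * exp(-0.272) ~= 0.207,
    # giving n_singleton ~= 0.87M.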

    # Only use the bottom 90% of singleton BCs, to avoid contamination at high end
    bc_df_frags = occupied_bcs_df.sort('bc_num_fragments')
    singleton_bcs = bc_df_frags[int(round(n_singleton *
                                          0.1)):int(round(n_singleton * 0.9))]

    martian.log_info("Read Count Threshold for Occupied Barcodes: %f" %
                     occupied_bcs_df.iloc[0].bc_num_reads)
    martian.log_info("Occupied Barcodes: %d" % occupied_bcs)
    martian.log_info("Singleton Barcodes: %f" % n_singleton)
    martian.log_info("Number of GEMs in slice used for inference: %d" %
                     len(singleton_bcs))
    martian.log_info("Inferred Number of GEMS: %f" % n_gems)

    # Get empirical fragment length distribution
    obs_len = frag_df.obs_len.values

    # It's possible for multi-read fragments to have a size of zero, which
    # causes a vanishing density - set a lower limit
    obs_len = np.maximum(obs_len, 200)

    empirical_dist = empirical_length_distribution(frag_df)

    # Cap the obs_len at a reasonable value, then set the length bins accordingly
    if targeted:
        max_len_adj_factor = 1.6
    else:
        max_len_adj_factor = 1.3

    # select the max length for the fragment length distribution
    max_len = np.int32(np.percentile(obs_len, 99.97) * max_len_adj_factor)
    max_len = np.maximum(max_len, 100000)
    obs_len = np.minimum(obs_len, max_len, dtype=np.int32)
    max_bin = max_len * 1.01
    bin_data = gen_bin_length(NUM_LENGTH_BINS, min_len=500, max_len=max_bin)

    martian.log_info("Fragments trimmed to max length of %d" % max_len)

    # Select a random subset of BCS to work with
    # Fix random seed so that we get repeatable results
    num_bcs = int(round(max(MIN_BCS,
                            float(num_frags) / singleton_bcs.bc_num_fragments.mean())))
    np.random.seed(0)
    if len(singleton_bcs) > 0:
        sel_bcs = singleton_bcs.irow(
            np.random.randint(0, len(singleton_bcs), num_bcs)).copy()
    sel_bcs['bc_id'] = np.arange(1, len(sel_bcs) + 1)
    sel_frags = frag_df[frag_df.bc.isin(sel_bcs.bc)].copy()
    sel_frags['bc_string'] = sel_frags.bc.astype('string')
    sel_frags.sort(['bc_string'], inplace=True)
    martian.log_info("Usings %d fragments" % len(sel_frags))

    bc_id_lookup = {}
    for (bc, bc_id) in zip(sel_bcs.bc, sel_bcs.bc_id):
        bc_id_lookup[bc] = bc_id
    # Write out the fragment data for stan to consume
    nbcs = len(sel_bcs)

    obs_len = sel_frags.obs_len.values
    # It's possible for multi-read fragments to have a size of zero, which
    # causes a vanishing density - set a lower limit
    obs_len = np.maximum(obs_len, 200)
    # obs_len for single-read fragments is 1000 in the
    # fragment file -- remap to 0
    obs_len[sel_frags.num_reads.values == 1] = 0.0
    obs_len = np.minimum(obs_len, max_len, dtype=np.int32)

    # Data to be passed to stan
    data = {
        # data sizes
        'N': len(sel_frags),
        'BC': nbcs,

        # Per BC stats
        'bc_observed_frags': sel_bcs.bc_num_fragments,

        # Fragment data: bc_id maps fragments to bc, num_reads, and obs_length fragment stats
        'bc_id': [bc_id_lookup[bc] for bc in sel_frags.bc],
        'num_reads': sel_frags.num_reads,
        'obs_length': obs_len,
    }

    # The number of sizes of the length bins
    data.update(bin_data)

    # Add extra data for targeting if necessary
    if args.targets_file is not None:
        targets = tk_io.get_target_regions_dict(open(args.targets_file))
        fasta = tenkit.reference.open_reference(args.reference_path)
        ctg_sizes = [(name, len(seq)) for (name, seq) in fasta.items()]
        genome_size = float(sum(l for (name, l) in ctg_sizes))

        gb_size = 1024
        ctg_round_sizes = np.array([
            math.ceil(float(sz) / gb_size) * gb_size
            for (name, sz) in ctg_sizes
        ])
        ctg_starts = np.cumsum(np.concatenate([[0], ctg_round_sizes[:-1]]))
        ctg_start_series = p.Series(np.array(ctg_starts, dtype=np.int64),
                                    index=[name for (name, l) in ctg_sizes])

        targ_cs_ctgs = []
        on_target_bps = {}
        rsum = 0
        for ((ctg, sz), round_sz) in zip(ctg_sizes, ctg_round_sizes):
            targs = np.zeros(round_sz, dtype=np.int32)
            # Mark bases as targeted
            for (s, e) in targets.get(ctg, []):
                targs[s:e] = 1

            for frag_len in data['bin_length']:
                on_target_chrom = np.zeros(round_sz, dtype=np.int8)

                for (s, e) in targets.get(ctg, []):
                    ss = max(0, s - int(frag_len))
                    ee = min(round_sz, e)
                    on_target_chrom[ss:ee] = 1

                # Determine the probability that a fragment w/ a given length will touch an exon
                on_target_bps[frag_len] = on_target_bps.get(
                    frag_len, 0) + on_target_chrom.sum()
                del on_target_chrom

            # Running sum over chromosomes
            targs_cs = np.cumsum(targs) + rsum
            rsum += np.sum(targs)
            targ_cs_bins = targs_cs[::gb_size].copy()
            del targs
            del targs_cs
            targ_cs_ctgs.append(targ_cs_bins)

        total_target_size = sum(
            (e - s) for regs in targets.values() for (s, e) in regs)
        print "Total target size: %d" % total_target_size
        on_target_fracs = {
            k: float(v) / genome_size
            for (k, v) in on_target_bps.items()
        }
        print on_target_fracs

        # STAN will use this to interpolate the target sizes
        cum_target_bins = np.concatenate(targ_cs_ctgs)

        assert (cum_target_bins.shape[0] == int(
            np.sum(ctg_round_sizes / gb_size)))

        # Get the position of each fragment on the laid-out genome, with the position decimated by 8
        ctg_starts = ctg_start_series[sel_frags.chrom].values
        stan_pos = ((ctg_starts + sel_frags.start_pos) / 8).astype(np.int32)
        sel_frags['stan_pos'] = stan_pos

        print sel_frags.head(20)

        data['pos'] = sel_frags.stan_pos
        data['genome_size'] = genome_size
        data['gb_size'] = gb_size
        data['GB'] = len(cum_target_bins)
        data['cum_target_bases'] = cum_target_bins

    # Write out the stan input data
    input_fn = os.path.join(tmp_dir, "input.R")
    write_stan_input(input_fn, data)

    # Generate initial values for optimization
    ramp = np.linspace(1, 0.1, NUM_LENGTH_BINS)
    ramp = ramp / ramp.sum()

    # assume that fragments with 1 read were 2kb when setting initial alpha
    seen_dna = sel_frags.obs_len.sum() + 2000.0 * (sel_frags.num_reads == 1).sum()
    mean_alpha = float(sel_frags.num_reads.sum()) / seen_dna

    frags_mu = sel_bcs.bc_num_fragments.mean()

    # Initial values of parameters to be estimated by Stan
    init_data = {
        # BC amp rate
        'alpha': [mean_alpha] * nbcs,

        # Length distribution
        'theta': list(ramp),

        # Average number of fragments
        'mean_frags': frags_mu,

        # Number of unobserved fragments
        'bc_unobserved_frags': [100] * nbcs,
        'read_disp': 10,
        'amp_length_k': 1.0 / 200000,
    }

    init_fn = os.path.join(tmp_dir, "init.R")
    write_stan_input(init_fn, init_data)

    # check if we have valid data for stan
    # need some observed fragments, and a minimal reads / fragments
    mean_rpf = sel_frags.num_reads.mean()
    martian.log_info("Mean LPM of molecules selected for inference: %f" %
                     mean_rpf)

    success = 0
    if len(sel_frags) > 0 and mean_rpf > MIN_RPF and (
            not targeted or total_target_size >= MIN_TARGET_SIZE):
        success = run_model(tmp_dir, targeted)
    else:
        if targeted and total_target_size < MIN_TARGET_SIZE:
            martian.log_info(
                "Target size is too small for length/mass inference: %d" %
                total_target_size)

        if len(sel_frags) == 0:
            martian.log_info("Aborting length-mass inference: no fragments")

        if mean_rpf < MIN_RPF:
            martian.log_info(
                "Reads per fragment too low for length-mass inference: %f" %
                mean_rpf)

    if success:
        res = load_stan_output(os.path.join(tmp_dir, "output.csv"))

        # If targeted, adjust the fragment length distribution and mass according to the fragment
        # visibility function
        if targeted:
            theta = res['theta']
            bl = data['bin_length']
            vis_func = np.array([on_target_fracs[l] for l in bl])
            print vis_func
            adj_theta = theta / vis_func
            adj_theta = adj_theta / adj_theta.sum()

            missing_factor = 1.0 / (adj_theta * vis_func).sum()

            # Put back in the adjusted values
            res['theta'] = adj_theta
            res['mean_frags'] = missing_factor * res['mean_frags']
            res['bc_total_frags'] = missing_factor * res['bc_total_frags']
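            # Interpretation: vis_func[i] is the fraction of genome positions from
            # which a fragment in length bin i would overlap a target, so dividing
            # theta by vis_func and renormalizing estimates the underlying length
            # distribution, and missing_factor inflates the fragment counts to
            # account for fragments that never touched a target.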

        # print the mass distribution, alpha distributions
        mean_length = (data['bin_length'] * res['theta']).sum()
        mean_length_weighted = np.average(data['bin_length'],
                                          weights=data['bin_length'] *
                                          res['theta'])

        # Mass conversion
        ng_per_bp = 1.025e-12

        bases_per_bc = res['bc_total_frags'] * mean_length
        total_bases = res['bc_total_frags'].mean() * mean_length * n_gems
        total_mass_ng = total_bases * ng_per_bp

        # calculation
        bp_per_ng = 9.76e11

        # try to calc input mass
        # z2_vol_per_gem -- microfluidics number, corrected for empty GEMs
        # bp_per_gem = loaded_mass * bp_per_ng * z2_vol_per_gem / total_z2_vol_input
        # z2_vol_per_gem = 144 pL
        # total_z2_vol_input = 65 uL
        # FIXME -- product configuration needs to be passed in & fixed for future products
        fluidics_params = FLUIDICS_PARAMS['Chromium']
        loaded_mass = np.mean(bases_per_bc) * fluidics_params[
            'total_z2_vol_input'] / bp_per_ng / fluidics_params[
                'z2_vol_per_gem']
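        # Rough unit check, assuming the constants quoted in the comments above
        # (z2_vol_per_gem = 144 pL, total_z2_vol_input = 65 uL):
        # total_z2_vol_input / z2_vol_per_gem ~= 451,000 GEM volumes; multiplying
        # by the mean observed bp per barcode and dividing by bp_per_ng
        # (9.76e11 bp/ng) gives the loaded mass in ng.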

        # Me: magic number, David: empirically derived correction factor
        DENATURATION_FACTOR = 1.6

        # Ad-hoc correction for the apparent 'denaturation' of the input material, which leads to double counting on input DNA
        corrected_loaded_mass = loaded_mass / DENATURATION_FACTOR

        stats = {
            'alpha':
            list(res['alpha']),
            'alpha_mean':
            np.mean(res['alpha']),
            'alpha_cv':
            tk_stats.robust_divide(np.std(res['alpha']),
                                   np.mean(res['alpha'])),
            'mean_frags':
            res['mean_frags'],
            'total_frags':
            res['bc_total_frags'],
            'length_distribution': {
                str(l): frac
                for (l, frac) in zip(data['bin_length'], input_num_frags *
                                     res['theta'])
            },
            'empirical_length_distribution':
            empirical_dist,
            'inferred_mean_length':
            mean_length,
            'inferred_lw_mean_length':
            mean_length_weighted,
            'inferred_total_mass_ng':
            total_mass_ng,
            'inferred_bp_per_bc':
            bases_per_bc,
            'mean_bp_per_bc':
            np.mean(bases_per_bc),
            'loaded_mass_ng':
            loaded_mass,
            'corrected_loaded_mass_ng':
            corrected_loaded_mass,
        }
    else:

        len_dist_default = {str(k): 1.0 / k for k in data['bin_length']}

        stats = {
            'alpha': [],
            'alpha_mean': None,
            'alpha_cv': None,
            'mean_frags': None,
            'total_frags': [],
            'length_distribution': len_dist_default,
            'empirical_length_distribution': empirical_dist,
            'inferred_mean_length': None,
            'inferred_lw_mean_length': None,
            'inferred_total_mass_ng': None,
            'inferred_bp_per_bc': [],
            'mean_bp_per_bc': None,
            'loaded_mass_ng': None,
            'corrected_loaded_mass_ng': None,
        }

    stats['occupied_bcs'] = occupied_bcs
    stats['inferred_number_gems'] = n_gems
    return stats