Example #1
def read_data_frame_limited(fn, query_cols=[], max_rows=None):
    ''' Load a pandas DataFrame from an HDF5 file. If a column list is specified, only load the matching columns '''

    with h5py.File(fn, 'r') as f:

        column_names = f.attrs.get("column_names")
        column_names = get_column_intersection(column_names, query_cols)

        sz = f[column_names[0]].shape[0]
        if max_rows:
            sz = min(sz, max_rows)

        df = p.DataFrame()

        # Add the columns progressively to save memory
        for name in column_names:
            ds = f[name]
            if has_levels(ds):
                indices = ds[:sz]
                uniques = get_levels(ds)
                # This method of constructing a Categorical avoids copying the indices array,
                # which saves memory for big datasets
                df[name] = p.Categorical(indices,
                                         categories=uniques,
                                         ordered=False,
                                         fastpath=True)
            else:
                df[name] = p.Series(ds[:sz])

        return df
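A minimal usage sketch for the loader above (the file name, column names, and row cap are hypothetical; it assumes an HDF5 file written with a column_names attribute and one dataset per column, which is what the function expects):

# Hypothetical call -- load only the 'bc' and 'num_reads' columns,
# capped at the first 1,000,000 rows of the file.
df = read_data_frame_limited('fragments.h5',
                             query_cols=['bc', 'num_reads'],
                             max_rows=1000000)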
Example #2
def main_report_length_mass(args, outs):
    tmp_dir = os.path.dirname(outs.summary)

    empty_stats = {
        'alpha': [],
        'alpha_mean': None,
        'alpha_cv': None,
        'mean_frags': None,
        'total_frags': [],
        'length_distribution': {},
        'empirical_length_distribution': {},
        'inferred_mean_length': None,
        'inferred_lw_mean_length': None,
        'inferred_total_mass_ng': None,
        'inferred_bp_per_bc': [],
        'mean_bp_per_bc': 0,
        'occupied_bcs': 0,
        'inferred_number_gems': 0,
    }

    if args.barcodes is None or args.barcode_whitelist is None or not os.path.exists(
            args.barcodes):
        return empty_stats

    barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)

    if len(barcode_whitelist) < 1000:
        return empty_stats

    if args.targets_file is None:
        targeted = False
        num_frags = NUM_FRAGS
    else:
        targeted = True
        num_frags = NUM_FRAGS_TARGETED

    bc_df = tenkit.hdf5.read_data_frame(args.barcodes)
    frag_df = tenkit.hdf5.read_data_frame(
        args.fragments,
        ['bc', 'chrom', 'start_pos', 'obs_len', 'num_reads', 'est_len'])
    input_num_frags = len(frag_df)

    gem_group = [int(bc.split('-')[1]) for bc in bc_df.bc]
    num_gem_groups = len(set(gem_group))

    # Start with data about all barcodes.
    # First filter out any barcodes that don't have at least 1 molecule that has > 1 read
    # This eliminates most of the background contamination of barcodes
    bc_df = bc_df[bc_df.bc_mean_reads_per_fragment > 1.0].copy()
    bc_df.sort('bc_num_reads', inplace=True)

    # Subset to the N99 barcodes (i.e. barcodes that account for 99% of reads) that have at least 1 valid fragment
    # A valid fragment must have >= 1 MAPQ30 read and at least 1
    bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)
    prod_bc_thresh = 0.01 * bc_df.bc_num_reads.sum()
    occupied_bcs_df = bc_df[np.logical_and(bc_df.cum_reads > prod_bc_thresh,
                                           bc_df.bc_num_fragments > 0)]

    if len(occupied_bcs_df) == 0:
        martian.log_info(
            "No valid barcodes for length/mass inference -- exiting")
        return empty_stats

    # Figure out the subset of BCs likely to be singleton BCs
    # Only run estimation on that subset
    # Infer the expected total GEM count that should have been present
    occupied_bcs = len(occupied_bcs_df)
    total_diversity = len(barcode_whitelist) * num_gem_groups

    # Poisson correction -- we know how many barcodes have >= 1 GEM, and we know
    # how many total barcodes are possible. Use the Poisson distribution to back-calculate
    # the number of GEMs that must have been present.
    # For Chromium there are 4.2M barcodes.
    # For Chromium there are 4.2M barcodes.
    p_occupied = float(occupied_bcs) / total_diversity
    mean_gems_per_bc = min(100, -np.log(1 - p_occupied))
    p_singleton = scipy.stats.poisson.pmf(1, mean_gems_per_bc)
    n_singleton = p_singleton * total_diversity

    # n_gems gets reported out as 'Gems Detected' in Loupe
    n_gems = int(round(mean_gems_per_bc * total_diversity))
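    # Worked example of the correction above (illustrative numbers only):
    # with total_diversity = 4.2e6 barcodes and occupied_bcs = 80,000,
    # p_occupied ~= 0.0190, mean_gems_per_bc = -ln(1 - 0.0190) ~= 0.0192,
    # p_singleton = poisson.pmf(1, 0.0192) ~= 0.0189, so
    # n_singleton ~= 79,200 and n_gems ~= 80,800.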

    # Only use the bottom 90% of singleton BCs, to avoid contamination at high end
    bc_df_frags = occupied_bcs_df.sort('bc_num_fragments')
    singleton_bcs = bc_df_frags[int(round(n_singleton *
                                          0.1)):int(round(n_singleton * 0.9))]

    martian.log_info("Read Count Threshold for Occupied Barcodes: %f" %
                     occupied_bcs_df.iloc[0].bc_num_reads)
    martian.log_info("Occupied Barcodes: %d" % occupied_bcs)
    martian.log_info("Singleton Barcodes: %f" % n_singleton)
    martian.log_info("Number of GEMs in slice used for inference: %d" %
                     len(singleton_bcs))
    martian.log_info("Inferred Number of GEMS: %f" % n_gems)

    # Get empirical fragment length distribution
    obs_len = frag_df.obs_len.values

    # It's possible for multi-read fragments to have a size of zero, which
    # causes a vanishing density - set a lower limit
    obs_len = np.maximum(obs_len, 200)

    empirical_dist = empirical_length_distribution(frag_df)

    # Cap the obs_len at a reasonable value, then set the length bins accordingly
    if targeted:
        max_len_adj_factor = 1.6
    else:
        max_len_adj_factor = 1.3

    # select the max length for the fragment length distribution
    max_len = np.int32(np.percentile(obs_len, 99.97) * max_len_adj_factor)
    max_len = np.maximum(max_len, 100000)
    obs_len = np.minimum(obs_len, max_len, dtype=np.int32)
    max_bin = max_len * 1.01
    bin_data = gen_bin_length(NUM_LENGTH_BINS, min_len=500, max_len=max_bin)

    martian.log_info("Fragments trimmed to max length of %d" % max_len)

    # Select a random subset of BCs to work with
    # Fix random seed so that we get repeatable results
    num_bcs = max(MIN_BCS,
                  float(num_frags) / singleton_bcs.bc_num_fragments.mean())
    np.random.seed(0)
    if len(singleton_bcs) > 0:
        sel_bcs = singleton_bcs.irow(
            np.random.randint(0, len(singleton_bcs), num_bcs)).copy()
    sel_bcs['bc_id'] = np.arange(1, len(sel_bcs) + 1)
    sel_frags = frag_df[frag_df.bc.isin(sel_bcs.bc)].copy()
    sel_frags['bc_string'] = sel_frags.bc.astype('string')
    sel_frags.sort(['bc_string'], inplace=True)
    martian.log_info("Usings %d fragments" % len(sel_frags))

    bc_id_lookup = {}
    for (bc, bc_id) in zip(sel_bcs.bc, sel_bcs.bc_id):
        bc_id_lookup[bc] = bc_id
    # Write out the fragment data for stan to consume
    nbcs = len(sel_bcs)

    obs_len = sel_frags.obs_len.values
    # It's possible for multi-read fragments to have a size of zero, which
    # causes a vanishing density - set a lower limit
    obs_len = np.maximum(obs_len, 200)
    # obs_len for single-read fragments is 1000 in the
    # fragment file -- remap to 0
    obs_len[sel_frags.num_reads.values == 1] = 0.0
    obs_len = np.minimum(obs_len, max_len, dtype=np.int32)

    # Data to be passed to stan
    data = {
        # data sizes
        'N': len(sel_frags),
        'BC': nbcs,

        # Per BC stats
        'bc_observed_frags': sel_bcs.bc_num_fragments,

        # Fragment data: bc_id maps each fragment to its BC; num_reads and obs_length are per-fragment stats
        'bc_id': [bc_id_lookup[bc] for bc in sel_frags.bc],
        'num_reads': sel_frags.num_reads,
        'obs_length': obs_len,
    }

    # The number and sizes of the length bins
    data.update(bin_data)

    # Add extra data for targeting if necessary
    if args.targets_file is not None:
        targets = tk_io.get_target_regions_dict(open(args.targets_file))
        fasta = tenkit.reference.open_reference(args.reference_path)
        ctg_sizes = [(name, len(seq)) for (name, seq) in fasta.items()]
        genome_size = float(sum(l for (name, l) in ctg_sizes))

        gb_size = 1024
        ctg_round_sizes = np.array([
            math.ceil(float(sz) / gb_size) * gb_size
            for (name, sz) in ctg_sizes
        ])
        ctg_starts = np.cumsum(np.concatenate([[0], ctg_round_sizes[:-1]]))
        ctg_start_series = p.Series(np.array(ctg_starts, dtype=np.int64),
                                    index=[name for (name, l) in ctg_sizes])
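        # Worked example of the rounded layout (illustrative sizes only):
        # with gb_size = 1024 and two contigs of lengths 2,500 and 3,000 bp,
        # ctg_round_sizes = [3072, 3072] and ctg_starts = [0, 3072], so each
        # contig begins at a gb_size-aligned offset in the laid-out genome.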

        targ_cs_ctgs = []
        on_target_bps = {}
        rsum = 0
        for ((ctg, sz), round_sz) in zip(ctg_sizes, ctg_round_sizes):
            targs = np.zeros(round_sz, dtype=np.int32)
            # Mark bases as targeted
            for (s, e) in targets.get(ctg, []):
                targs[s:e] = 1

            for frag_len in data['bin_length']:
                on_target_chrom = np.zeros(round_sz, dtype=np.int8)

                for (s, e) in targets.get(ctg, []):
                    ss = max(0, s - int(frag_len))
                    ee = min(round_sz, e)
                    on_target_chrom[ss:ee] = 1

                # Determine the probability that a fragment w/ a given length will touch an exon
                on_target_bps[frag_len] = on_target_bps.get(
                    frag_len, 0) + on_target_chrom.sum()
                del on_target_chrom

            # Running sum over chromosomes
            targs_cs = np.cumsum(targs) + rsum
            rsum += np.sum(targs)
            targ_cs_bins = targs_cs[::gb_size].copy()
            del targs
            del targs_cs
            targ_cs_ctgs.append(targ_cs_bins)

        total_target_size = sum(
            (e - s) for regs in targets.values() for (s, e) in regs)
        print "Total target size: %d" % total_target_size
        on_target_fracs = {
            k: float(v) / genome_size
            for (k, v) in on_target_bps.items()
        }
        print on_target_fracs

        # STAN will use this to interpolate the target sizes
        cum_target_bins = np.concatenate(targ_cs_ctgs)

        assert (cum_target_bins.shape[0] == int(
            np.sum(ctg_round_sizes / gb_size)))

        # Get the position of each fragment on the laid-out genome, with the position decimated by 8
        ctg_starts = ctg_start_series[sel_frags.chrom].values
        stan_pos = ((ctg_starts + sel_frags.start_pos) / 8).astype(np.int32)
        sel_frags['stan_pos'] = stan_pos

        print sel_frags.head(20)

        data['pos'] = sel_frags.stan_pos
        data['genome_size'] = genome_size
        data['gb_size'] = gb_size
        data['GB'] = len(cum_target_bins)
        data['cum_target_bases'] = cum_target_bins

    # Write out the stan input data
    input_fn = os.path.join(tmp_dir, "input.R")
    write_stan_input(input_fn, data)

    # Generate initial values for optimization
    ramp = np.linspace(1, 0.1, NUM_LENGTH_BINS)
    ramp = ramp / ramp.sum()

    # assume that fragments with 1 read were 2kb when setting initial alpha
    seen_dna = sel_frags.obs_len.sum() + 2000.0 * (sel_frags.num_reads
                                                   == 1).sum()
    mean_alpha = float(sel_frags.num_reads.sum()) / seen_dna

    frags_mu = sel_bcs.bc_num_fragments.mean()

    # Initial values of parameters to be estimated by Stan
    init_data = {
        # BC amp rate
        'alpha': [mean_alpha] * nbcs,

        # Length distribution
        'theta': list(ramp),

        # Average number of fragments
        'mean_frags': frags_mu,

        # Number of unobserved fragments
        'bc_unobserved_frags': [100] * nbcs,
        'read_disp': 10,
        'amp_length_k': 1.0 / 200000,
    }

    init_fn = os.path.join(tmp_dir, "init.R")
    write_stan_input(init_fn, init_data)

    # Check whether we have valid data for Stan:
    # we need some observed fragments and a minimum reads-per-fragment
    mean_rpf = sel_frags.num_reads.mean()
    martian.log_info("Mean LPM of molecules selected for inference: %f" %
                     mean_rpf)

    success = 0
    if len(sel_frags) > 0 and mean_rpf > MIN_RPF and (
            not targeted or total_target_size >= MIN_TARGET_SIZE):
        success = run_model(tmp_dir, targeted)
    else:
        if targeted and total_target_size < MIN_TARGET_SIZE:
            martian.log_info(
                "Target size is too small for length/mass inference: %d" %
                total_target_size)

        if len(sel_frags) == 0:
            martian.log_info("Aborting length-mass inference: no fragments")

        if mean_rpf < MIN_RPF:
            martian.log_info(
                "Reads per fragment too low for length-mass inference: %f" %
                mean_rpf)

    if success:
        res = load_stan_output(os.path.join(tmp_dir, "output.csv"))

        # If targeted, adjust the fragment length distribution and mass according to the fragment
        # visibility function
        if targeted:
            theta = res['theta']
            bl = data['bin_length']
            vis_func = np.array([on_target_fracs[l] for l in bl])
            print vis_func
            adj_theta = theta / vis_func
            adj_theta = adj_theta / adj_theta.sum()

            missing_factor = 1.0 / (adj_theta * vis_func).sum()

            # Put back in the adjusted values
            res['theta'] = adj_theta
            res['mean_frags'] = missing_factor * res['mean_frags']
            res['bc_total_frags'] = missing_factor * res['bc_total_frags']
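            # Worked example of the adjustment (illustrative numbers only):
            # with two length bins, theta = [0.5, 0.5] and vis_func = [0.2, 0.8],
            # theta / vis_func = [2.5, 0.625], which renormalizes to
            # adj_theta = [0.8, 0.2]; (adj_theta * vis_func).sum() = 0.32, so
            # missing_factor = 3.125 and mean_frags / bc_total_frags scale up by 3.125x.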

        # Summarize the inferred length distribution (mean and length-weighted mean)
        mean_length = (data['bin_length'] * res['theta']).sum()
        mean_length_weighted = np.average(data['bin_length'],
                                          weights=data['bin_length'] *
                                          res['theta'])

        # Mass conversion
        ng_per_bp = 1.025e-12

        bases_per_bc = res['bc_total_frags'] * mean_length
        total_bases = res['bc_total_frags'].mean() * mean_length * n_gems
        total_mass_ng = total_bases * ng_per_bp
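        # Worked example (illustrative numbers only): with a mean of
        # 300 total fragments per barcode, mean_length = 50,000 bp and
        # n_gems = 100,000, total_bases = 300 * 50,000 * 100,000 = 1.5e12 bp,
        # so total_mass_ng = 1.5e12 * 1.025e-12 ~= 1.5 ng.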

        # Inverse conversion: base pairs per ng
        bp_per_ng = 9.76e11

        # try to calculate input mass
        # z2_vol_per_gem -- microfluidics number, corrected for empty GEMs
        # bp_per_gem = loaded_mass * bp_per_ng * z2_vol_per_gem / total_z2_vol_input
        # z2_vol_per_gem = 144 pL
        # total_z2_vol_input = 65 uL
        # FIXME -- product configuration needs to be passed in & fixed for future products
        fluidics_params = FLUIDICS_PARAMS['Chromium']
        loaded_mass = np.mean(bases_per_bc) * fluidics_params[
            'total_z2_vol_input'] / bp_per_ng / fluidics_params[
                'z2_vol_per_gem']

        # Me: magic number, David: empirically derived correction factor
        DENATURATION_FACTOR = 1.6

        # Ad-hoc correction for the apparent 'denaturation' of the input material, which leads to double counting on input DNA
        corrected_loaded_mass = loaded_mass / DENATURATION_FACTOR

        stats = {
            'alpha':
            list(res['alpha']),
            'alpha_mean':
            np.mean(res['alpha']),
            'alpha_cv':
            tk_stats.robust_divide(np.std(res['alpha']),
                                   np.mean(res['alpha'])),
            'mean_frags':
            res['mean_frags'],
            'total_frags':
            res['bc_total_frags'],
            'length_distribution': {
                str(l): frac
                for (l, frac) in zip(data['bin_length'], input_num_frags *
                                     res['theta'])
            },
            'empirical_length_distribution':
            empirical_dist,
            'inferred_mean_length':
            mean_length,
            'inferred_lw_mean_length':
            mean_length_weighted,
            'inferred_total_mass_ng':
            total_mass_ng,
            'inferred_bp_per_bc':
            bases_per_bc,
            'mean_bp_per_bc':
            np.mean(bases_per_bc),
            'loaded_mass_ng':
            loaded_mass,
            'corrected_loaded_mass_ng':
            corrected_loaded_mass,
        }
    else:

        len_dist_default = {str(k): 1.0 / k for k in data['bin_length']}

        stats = {
            'alpha': [],
            'alpha_mean': None,
            'alpha_cv': None,
            'mean_frags': None,
            'total_frags': [],
            'length_distribution': len_dist_default,
            'empirical_length_distribution': empirical_dist,
            'inferred_mean_length': None,
            'inferred_lw_mean_length': None,
            'inferred_total_mass_ng': None,
            'inferred_bp_per_bc': [],
            'mean_bp_per_bc': None,
            'loaded_mass_ng': None,
            'corrected_loaded_mass_ng': None,
        }

    stats['occupied_bcs'] = occupied_bcs
    stats['inferred_number_gems'] = n_gems
    return stats