Example #1
def join(args, outs, chunk_defs, chunk_outs):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    profiles, gc, mask = load_data(args.raw_profiles, args.tracks, chroms)
    gc_norm_params = json.load(open(args.gc_norm_params, "r"))
    scale = gc_norm_params["scale"]
    linear = gc_norm_params["linear"]
    quadratic = gc_norm_params["quadratic"]

    norm_profiles = gc_normalize(profiles, gc, linear, quadratic, chroms)

    bin_size = coverage_matrix.get_bin_size(args.raw_profiles)

    coverage_matrix.store_matrix(file_name=outs.normalized_profiles,
                                 chroms=chroms,
                                 profiles=norm_profiles,
                                 tracks=None,
                                 window_size=bin_size,
                                 masks=mask,
                                 dtype="float32")

    store = pd.HDFStore(outs.normalized_profiles, "a")
    store["constants"] = load_h5(args.raw_profiles, "constants")
    store["/gc_params/scale"] = pd.Series(scale)
    store["/gc_params/linear"] = pd.Series(linear)
    store["/gc_params/quadratic"] = pd.Series(quadratic)
    store.close()
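
The gc_normalize helper used above is defined elsewhere in the codebase and is not reproduced here. Purely as an orientation sketch, and not the actual implementation, a per-cell quadratic GC correction driven by the scale/linear/quadratic parameters could look like the following; the gc_origin value and the clipping floor are assumptions for illustration.

import numpy as np

def gc_correct_sketch(counts, gc, scale, linear, quadratic, gc_origin=0.45):
    """Illustrative only: scale raw bin counts by a per-cell quadratic GC curve.

    counts : (ncells, nbins) raw read counts
    gc     : (nbins,) per-bin GC fraction
    scale, linear, quadratic : per-cell coefficients (length ncells)
    """
    counts = np.asarray(counts, dtype="float32")
    dx = np.asarray(gc) - gc_origin
    out = np.empty_like(counts)
    for i in range(counts.shape[0]):
        curve = 1.0 + linear[i] * dx + quadratic[i] * dx ** 2
        curve = np.clip(curve, 0.1, None)      # guard against tiny or negative values
        out[i] = counts[i] / (scale[i] * curve)
    return out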
Example #2
def split(args):
    ref = contig_manager.contig_manager( args.reference_path )
    constants = load_h5(args.sc_norm_profiles, "constants").to_dict()
    ncells = constants["ncells"]
    window_size = constants["window_size"]
    # maximum memory usage is the maximum of these four values:
    # sumbins = sum(len(c) for c in primary_contigs)/window_size
    # maxbins = max(len(c) for c in all_contigs)/window_size
    # X + Q + H = ((2*sizeof(i8) + 2*sizeof(f32)) * ncells * sumbins)
    # occupancy = sizeof(f32) * levels(=6) * (ncells - 1) * sumbins / nchunks(=100)
    # het = X + Q + H + occupancy
    # X + Y + Z = ((2*sizeof(float)) * ncells * maxbins)
    # merged_bed = sc_cnv_calls_bed + internal_cnv_calls_bed
    # unmerged_bed = sc_unmerged_cnv_calls_bed + internal_unmerged_cnv_calls_bed
    # * NOTE: ask for double the matrix sizes to account for intermediate values
    f32sz = 4
    sumbins = sum(ref.contig_lengths[c]/window_size+1 for c in ref.primary_contigs())
    maxbins = max(ref.contig_lengths[c]/window_size+1 for c in ref.list_all_contigs())
    XQH_mem_gb = float((2 + 2*f32sz) * ncells * sumbins)/1e9
    occ_mem_gb = float(f32sz * 6 * (ncells - 1) * sumbins/100)/1e9
    het_mem_gb = XQH_mem_gb + occ_mem_gb
    XYZ_mem_gb = 2 * float(f32sz * ncells * maxbins) / 1e9
    merged_bed_gb = os.path.getsize(args.sc_cnv_calls)/1e9 + \
                    os.path.getsize(args.internal_cnv_calls)/1e9 + 1
    unmerged_bed_gb = os.path.getsize(args.sc_unmerged_cnv_calls)/1e9 + \
                      os.path.getsize(args.internal_unmerged_cnv_calls)/1e9 + 1
    mem_gb = int(np.ceil(max(het_mem_gb, XYZ_mem_gb, merged_bed_gb, unmerged_bed_gb))) + 3
    return {'chunks': [], 'join': {'__mem_gb': mem_gb}}
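
To make the sizing comment concrete, here is the same arithmetic evaluated for purely hypothetical inputs (ncells, window_size and the contig totals below are invented for illustration):

import numpy as np

ncells = 1000                      # hypothetical
window_size = 20000                # hypothetical
primary_genome_bp = 3.1e9          # assumed total length of primary contigs
largest_contig_bp = 2.5e8          # assumed longest contig

f32sz = 4
sumbins = primary_genome_bp / window_size                            # ~155,000 bins
maxbins = largest_contig_bp / window_size                            # ~12,500 bins
XQH_mem_gb = float((2 + 2 * f32sz) * ncells * sumbins) / 1e9         # ~1.55 GB
occ_mem_gb = float(f32sz * 6 * (ncells - 1) * sumbins / 100) / 1e9   # ~0.04 GB
het_mem_gb = XQH_mem_gb + occ_mem_gb                                 # ~1.59 GB
XYZ_mem_gb = 2 * float(f32sz * ncells * maxbins) / 1e9               # ~0.10 GB
mem_gb = int(np.ceil(max(het_mem_gb, XYZ_mem_gb))) + 3
print(mem_gb)                                                        # -> 5 for these inputs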
Example #3
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    ## every primary chromosome gets its own chunk
    ## all the secondary pieces are in one chunk
    chrom_chunks = []
    non_primary_chunk = []
    for chrom in ref.list_all_contigs():
        if ref.is_primary_contig(chrom, allow_sex_chromosomes=True):
            chrom_chunks.append([chrom])
        else:
            non_primary_chunk.append(chrom)
    
    if len(non_primary_chunk) > 0:
        chrom_chunks.append( non_primary_chunk )
    
    chrom_sizes = ref.contig_lengths
    max_size = 0
    for chroms in chrom_chunks:
        chunk_size = sum([chrom_sizes[chrom] for chrom in chroms])
        max_size = max(max_size, chunk_size)
    
    nbcs = 0
    for v in args.cell_barcodes.itervalues():
        nbcs += len(v)

    max_mat_size = 4*nbcs*max_size/args.window_size
    chunk_mem_gb = int(np.ceil((1.0*max_mat_size/1e9) + 1))
    join_mem_gb = int(np.ceil(1.0*max_mat_size/1e9 +
                              1.0*sum(chrom_sizes.values())/args.window_size/1e9 + 1))

    chunk_defs = [{'chroms': chroms, '__mem_gb': chunk_mem_gb} for chroms in chrom_chunks]
    return {'chunks': chunk_defs, 'join': {'__mem_gb': join_mem_gb}}
Example #4
def get_contig_info(args):
    manager = contig_manager.contig_manager(args.reference_path)
    contig_info = {"contig_order": {}, "contig_lengths": {}}
    contig_lengths = manager.get_contig_lengths()
    for idx, contig in enumerate(manager.contigs["primary_contigs"]):
        contig_info["contig_order"][contig] = idx
        contig_info["contig_lengths"][contig] = contig_lengths[contig]
    contig_info["species"] = manager.list_species()
    return contig_info
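
For a single-species reference the returned structure looks roughly like the following (the contig names and lengths are illustrative, not taken from any particular reference):

contig_info = {
    "contig_order":   {"chr1": 0, "chr2": 1, "chr3": 2},
    "contig_lengths": {"chr1": 248956422, "chr2": 242193529, "chr3": 198295559},
    "species":        ["homo_sapiens"],
}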
Example #5
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.list_all_contigs()
    max_chrom_size = max([ref.contig_lengths[chrom] for chrom in chroms])
    constants = load_h5(args.profiles, "constants").to_dict()
    ncells = constants["ncells"]
    window_size = constants["window_size"]
    max_mat_size_gb = float(
        2 * ncells * max_chrom_size / window_size) / 1e9 * 4
    mem_gb = int(np.ceil(max_mat_size_gb * 4 + 1))
    return {'chunks': [], 'join': {'__mem_gb': mem_gb}}
Example #6
 def _load_hdf5_file(self, filename, reference_path):
     self._contig_manager = contig_manager.contig_manager(reference_path)
     self._contig_list = self._contig_manager.primary_contigs(
         allow_sex_chromosomes=True)
     store = pd.HDFStore(filename, "r")
     self._window_size = store['constants']['window_size']
     self._conf_filter = {}
     for chrom in self._contig_list:
         cmask = (store["/CONF/"+chrom].values > crdna.constants.CONFIDENT_BIN_THRESHOLD) & \
                 (store["/N/"+chrom].values < 1.0/self._window_size)
         self._conf_filter[chrom] = cmask
     # for chrom
     store.close()
Example #7
def split(args):
    MAX_CHUNKS = 30
    MIN_CELLS_PER_CHUNK = 100

    ## TODO : store ncells in the profiles.h5 as a constant so we don't have
    ## to do this to get the number of cells
    ref = contig_manager.contig_manager(args.reference_path)
    chrom = ref.primary_contigs(allow_sex_chromosomes=True)[0]

    store = pd.HDFStore(args.profiles, "r")
    ncells, _ = store["/contigs/" + chrom].shape
    store.close()

    ## no cells, do nothing!
    if ncells < 1:
        return {'chunks': [], 'join': {}}

    nchunks = np.clip(
        ncells / MIN_CELLS_PER_CHUNK + int(ncells % MIN_CELLS_PER_CHUNK != 0),
        1, MAX_CHUNKS)
    cells_per_chunk = ncells / nchunks + int(ncells % nchunks != 0)

    mat_size_gb = coverage_matrix.get_genome_matrix_size_gb(args.profiles)
    chunk_mem_gb = int(np.ceil(4 * mat_size_gb / ncells * cells_per_chunk + 1))
    join_mem_gb = int(np.ceil(4 * mat_size_gb + 1))

    ## if this is a multi-species sample, keep all cells in a single chunk
    if len(ref.list_species()) > 1:
        return {
            'chunks': [{
                'chunk': {
                    'start': 0,
                    'end': ncells,
                    'ncells': ncells
                }
            }],
            'join': {
                '__mem_gb': join_mem_gb
            }
        }

    chunk_defs = [{
        'chunk': {
            'start': i,
            'end': min(i + cells_per_chunk, ncells),
            'ncells': ncells
        },
        '__mem_gb': chunk_mem_gb
    } for i in xrange(0, ncells, cells_per_chunk)]

    return {'chunks': chunk_defs, 'join': {'__mem_gb': join_mem_gb}}
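
The `n / d + int(n % d != 0)` expressions above are integer ceiling division; a quick standalone check of the resulting chunking (the cell counts below are illustrative):

MAX_CHUNKS = 30
MIN_CELLS_PER_CHUNK = 100

def ceil_div(n, d):
    # same as n / d + int(n % d != 0) for positive integers
    return -(-n // d)

for ncells in (1, 99, 101, 5000):
    nchunks = min(max(ceil_div(ncells, MIN_CELLS_PER_CHUNK), 1), MAX_CHUNKS)
    cells_per_chunk = ceil_div(ncells, nchunks)
    nstarts = len(range(0, ncells, cells_per_chunk))
    print("%d cells -> %d chunks of <= %d cells" % (ncells, nstarts, cells_per_chunk))
# e.g. 5000 cells -> 30 chunks of <= 167 cells each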
Example #8
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    contig_lengths = ref.get_contig_lengths()

    target_regions = None
    all_loci = []
    for (chrom_name, chrom_size) in contig_lengths.iteritems():
        all_loci.extend(
            generate_chrom_loci(target_regions, chrom_name, chrom_size,
                                100000000))

    locus_sets = pack_loci(all_loci)

    chunk_defs = [{'loci': loci} for loci in locus_sets]
    return {'chunks': chunk_defs}
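
generate_chrom_loci and pack_loci are helpers from the surrounding codebase and are not shown here. As a rough sketch only, and not the real implementation, packing loci into size-bounded sets could be done greedily like this:

def pack_loci_sketch(loci, max_bp_per_set=int(1e8)):
    """Illustrative greedy packer: group (chrom, start, end) loci so that each
    set spans at most ~max_bp_per_set bases."""
    sets, current, current_bp = [], [], 0
    for chrom, start, end in loci:
        size = end - start
        if current and current_bp + size > max_bp_per_set:
            sets.append(current)
            current, current_bp = [], 0
        current.append((chrom, start, end))
        current_bp += size
    if current:
        sets.append(current)
    return sets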
Example #9
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    contig_lengths = ref.get_contig_lengths()

    target_regions = None
    all_loci = []
    for (chrom_name, chrom_size) in contig_lengths.iteritems():
        all_loci.extend(
            generate_chrom_loci(target_regions, chrom_name, chrom_size,
                                tenkit.constants.PARALLEL_LOCUS_SIZE))

    locus_sets = pack_loci(all_loci)

    chunk_defs = [{'loci': loci, '__mem_gb': 12} for loci in locus_sets]
    return {'chunks': chunk_defs, 'join': {'__mem_gb': 12}}
Example #10
def main(args, outs):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    ## read in calls as dataframe
    calls = pd.read_csv(args.cnv_calls,
                        sep="\t",
                        names=[
                            "chrom", "start", "end", "ploidy", "confidence",
                            "cluster_index"
                        ])

    ## figure out dimensions of cnv_tracks and gather chrom data
    nclusters = len(np.unique(calls["cluster_index"].values))
    window_size = args.window_size
    contig_sizes = ref.get_contig_lengths()
    chrom_bin_sizes = {}
    nbins = 0
    chrom_offset = {}
    for chrom in chroms:
        chrom_offset[chrom] = nbins
        csize = contig_sizes[chrom]
        cbins = csize / window_size + int(csize % window_size != 0)
        chrom_bin_sizes[chrom] = cbins
        nbins += cbins

    cnv_tracks = np.zeros((nclusters, nbins), dtype="int32")
    for chrom in chroms:
        p = ref.expected_ploidy(chrom, args.sex)
        csize = chrom_bin_sizes[chrom]
        cnv_tracks[:, chrom_offset[chrom]:chrom_offset[chrom] + csize] = p

    nclusters = calls["cluster_index"].unique().shape[0]
    for ci in xrange(nclusters):
        print ci
        cluster_calls = calls[calls["cluster_index"] == ci]
        for _, row in cluster_calls.iterrows():
            offset = chrom_offset[row["chrom"]]
            assert (row["start"] - 1) % window_size == 0
            #assert row["end"] % window_size == 0
            sbin = offset + (row["start"] - 1) / window_size
            ebin = offset + row["end"] / window_size
            cnv_tracks[ci, sbin:ebin] = row["ploidy"]

    out_store = pd.HDFStore(outs.cnv_tracks, "w")
    out_store["cnv_tracks"] = pd.DataFrame(cnv_tracks)
    out_store.close()
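
The bin arithmetic above implies 1-based call start coordinates aligned to the window (hence the `- 1` and the modulo assert); evaluated for an assumed 20 kb window:

window_size = 20000                    # assumed bin size, for illustration
row_start, row_end = 40001, 100000     # a call covering bins 2, 3 and 4

assert (row_start - 1) % window_size == 0
sbin = (row_start - 1) // window_size  # -> 2
ebin = row_end // window_size          # -> 5 (exclusive)
print("bins %d..%d" % (sbin, ebin - 1))   # bins 2..4 receive the called ploidy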
Example #11
def split(args):
    ctg_mgr = contig_manager.contig_manager(args.reference_path)

    ## every primary chromosome gets its own chunk
    ## all the secondary pieces are in one chunk
    chrom_chunks = []
    non_primary_chunk = []
    for chrom in ctg_mgr.list_all_contigs():
        if ctg_mgr.is_primary_contig(chrom, allow_sex_chromosomes=True):
            chrom_chunks.append([chrom])
        else:
            non_primary_chunk.append(chrom)
    if len(non_primary_chunk) > 0:
        chrom_chunks.append( non_primary_chunk )
    
    chunk_defs = [{'chroms': chroms, '__mem_gb': 12} for chroms in chrom_chunks]

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 12}}
Example #12
def join(args, outs, chunk_defs, chunk_outs):
    store = pd.HDFStore( outs.profiles, "w" )
    
    ## put cell barcodes in canonical order
    bc_list = []
    for v in args.cell_barcodes.itervalues():
        bc_list.extend(v.keys())
    # for v
    bc_list = list(set(bc_list))
    bc_list.sort( )
    
    store["barcodes"] = pd.Series( bc_list )

    # load ref to determine primary-ness

    all_chroms = []
    masks = []
    genomebins = 0
    ncells = None
    for chunk_out, chunk_def in zip(chunk_outs, chunk_defs):
        chroms = chunk_def.chroms
        all_chroms.extend(chroms)
        profile_chunk = pd.HDFStore( chunk_out.profiles, "r" )
        for chrom in chroms:
            mask = profile_chunk["/masks/" + chrom]
            masks.extend(mask)
            store["/contigs/" + chrom] = profile_chunk["/contigs/" + chrom]
            store["/masks/" + chrom] = mask
        genomebins += profile_chunk["constants"]["genomebins"]
        if ncells is None:
            ncells = profile_chunk["constants"]["ncells"]

        # for chrom
        profile_chunk.close( )
    # for chunk_out, chunk_def
    ## store the window size in the h5
    store["constants"] = pd.Series({"window_size": args.window_size,
        "ncells" : ncells, "genomebins" : genomebins})

    ref = contig_manager.contig_manager(args.reference_path)
    write_mask_bed(outs.mappable_regions,store,all_chroms,args.window_size,ref,
        args)
    
    store.close( )
Example #13
def estimate_cnv_confidence_score_v2(raw_profiles, cnv_calls, reference_path, logp, bin_size):
    """
    Calculates a CNV confidence score (log(posterior)) for each CNV call using the pre-computed
    logp matrix of per-bin confidence scores.
    """
    ref = contig_manager.contig_manager(reference_path)
    chrom_names = ref.primary_contigs(allow_sex_chromosomes=True)
    PER_BIN_MAX_SCORE = 100.0
    scores = np.zeros( len(cnv_calls), dtype='int32' )

    for i, cnv_call in enumerate(cnv_calls.itertuples()):
        #
        # the create cnv tracks module already sets confidence to zero for masked bins
        # just use that confidence if it's already set
        #
        if cnv_call.Confidence==0.0:
            continue
        chrom_name = cnv_call.Chr
        chrom_index = chrom_names.index(chrom_name)
        start = int(round(cnv_call.Start / bin_size))
        # end in BED file is exclusive
        end = int(round(cnv_call.End / bin_size))
        cell = cnv_call.NodeID
        #
        start = max([0, start])
        n_bins = raw_profiles[chrom_index].shape[1]
        end = min([end, n_bins])
        # start can == end in the case where a CNV call happens only on the terminal bin
        # this was found by randomly breaking up a reference such that a mappable bin is
        # cut in two. This is extremely unlikely in a 'real' reference since bins at the
        # end of contigs will be unmappable and/or have the same ploidy as the neighboring
        # bins. The pipeline steps that will lead to this are in
        # CREATE_CNV_TRACKS_AND_BED which converts cluster_data.h5 into cnv_calls.bed
        if start == end:  # special case: CNV call on single bin
            score = np.nansum( logp[chrom_index][cell,start] )
            score = np.clip(score, 0, PER_BIN_MAX_SCORE)
            scores[i] = min(np.round(score * 100), np.iinfo("uint8").max)
        else:
            score = np.nansum( logp[chrom_index][cell,start:end] )
            score = np.clip(score, 0, PER_BIN_MAX_SCORE*(end-start))
            scores[i] = min(np.round(score/(end-start)*100), np.iinfo("uint8").max)
    cnv_calls['Confidence'] = scores
    return cnv_calls
Example #14
def split(args):
    if args.input is None or args.barcode_whitelist is None:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0", '__mem_gb': 1}]
        return {'chunks': chunk_defs, 'join': {'__mem_gb': 1}}

    ref = contig_manager.contig_manager(args.reference_path)
    species_list = ref.list_species()
    if (args.force_cells is not None and args.force_cells > 0
            and len(species_list) > 1):
        martian.exit(
            "force_cells can only be used for single species reference.")
    min_chunks = 10
    bam_in = tk_bam.create_bam_infile(args.input)
    chunks = tk_bam.chunk_bam_records(bam_in,
                                      chunk_split_func,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)

    # 0.03 =~ 26meg = 1M bcs * (sizeof(int64) + 18)
    join_mem_gb = int(np.ceil(0.03 * (len(chunks) + 1) + 1))
    return {'chunks': chunks, 'join': {'__mem_gb': join_mem_gb}}
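
The 0.03 GB constant in the join estimate is just the barcode-dictionary cost from the comment spelled out; evaluated for an illustrative chunk count:

import numpy as np

n_barcodes = 1000000                    # ~1M whitelist barcodes
bytes_per_bc = 8 + 18                   # sizeof(int64) + ~18 bytes of key
gb_per_copy = n_barcodes * bytes_per_bc / 1e9          # ~0.026 GB, rounded up to 0.03
nchunks = 40                            # illustrative
join_mem_gb = int(np.ceil(0.03 * (nchunks + 1) + 1))   # -> 3
print("%.3f GB per copy, join_mem_gb=%d" % (gb_per_copy, join_mem_gb))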
Example #15
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()
    
    ref = contig_manager.contig_manager( args.reference_path )
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    ## Load data
    store = pd.HDFStore( args.cnv_tracks, "r" )
    Q = store["/cnv_tracks"].values
    sf = store["/scale_factor"]
    rpb = store["/reads_per_bin"]
    segment_windows = store["constants"]["segment_windows"]
    store.close( )

    if args.tracks is None:
        gmask = np.ones(Q.shape[1], dtype=bool)
    else:
        gmask = []
        maptrack = pd.HDFStore(args.tracks, "r")
        for chrom in chroms:
            x = maptrack["/map/"+chrom].values 
            ## TODO make this consistent across stages
            gmask.extend( x > MAPPABILITY_THRESHOLD )
        maptrack.close( )
        gmask = np.array(gmask)

    ## Aggregate all cells to the same resolution and compute L1 norm
    Q_agg = np.round(aggregate_matrix( Q[:, gmask].astype("float32"), 
        segment_windows)/segment_windows).astype("int32")

    distances, Z = compute_linkage( Q_agg )

    out_store = pd.HDFStore( outs.data, "w")
    out_store["/Z"] = pd.DataFrame(Z)
    out_store["distances"] = pd.Series(distances)
    out_store["constants"] = pd.Series({"segment_windows": segment_windows})
    out_store["scale_factor"] = sf
    out_store["reads_per_bin"] = rpb
    out_store.close( )
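
aggregate_matrix and compute_linkage come from the surrounding codebase and are not reproduced here. For orientation only, fixed-window aggregation of a cells x bins matrix can be written with plain numpy along these lines (a sketch, not the actual helper):

import numpy as np

def aggregate_matrix_sketch(M, window):
    """Sum every `window` consecutive columns of M (ncells x nbins);
    trailing bins that do not fill a window are dropped in this sketch."""
    ncells, nbins = M.shape
    nwin = nbins // window
    return M[:, :nwin * window].reshape(ncells, nwin, window).sum(axis=2)

Q = np.arange(24, dtype="float32").reshape(2, 12)
print(aggregate_matrix_sketch(Q, 4))   # two rows of three window sums each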
Example #16
 def _load_hdf5_file(self, filename, reference_path, reuse_mask_from=None):
     X, Xm = load_matrix(filename, reference_path)
     if len(X) == 0:
         raise ValueError(
             "Loading profiles from %s returned zero contigs matching the reference at %s"
             % (filename, reference_path))
     # it's low cost to hang on to a contig_manager instance
     self._contig_manager = contig_manager.contig_manager(reference_path)
     # this list has to match the list inside load_matrix
     primary_contigs = self._contig_manager.primary_contigs(
         allow_sex_chromosomes=True)
     store = pd.HDFStore(filename, "r")
     self._window_size = store['constants']['window_size']
     if 'barcodes' in store:
         self._barcodes = np.array(store['barcodes'])
         self._num_cells = len(self._barcodes)
     else:
         self._barcodes = None
         self._num_cells = len(X[0])
     profile_contigs = set(list_all_contigs(store))
     i = 0
     self._contig_list = []
     self._contig_coverage = {}
     self._contig_mask = {}
     self._contig_idx = {}
     for chrom in primary_contigs:
         if chrom in profile_contigs:
             self._contig_list.append(chrom)
             self._contig_coverage[chrom] = X[i]
             if reuse_mask_from:
                 self._contig_mask[chrom] = reuse_mask_from._contig_mask[
                     chrom]
             else:
                 self._contig_mask[chrom] = Xm[i]
             nbins = X[i].shape[1]
             self._contig_idx[chrom] = np.arange(1,
                                                 nbins * self._window_size,
                                                 self._window_size)
             i = i + 1
     store.close()
Example #17
def calculate_logposterior_matrix(raw_profiles, poisson_expectations, mask,
        cnv_calls, reference_path, bin_size):
    """Create an ncells x nbins matrix of log(posterior) values."""
    ref = contig_manager.contig_manager(reference_path)
    chrom_names = ref.primary_contigs(allow_sex_chromosomes=True)

    logp = []
    ncells = 0
    for chrom_index in xrange(len(raw_profiles)):
        if ncells==0:
            ncells = raw_profiles[chrom_index].shape[0]
        nbins = raw_profiles[chrom_index].shape[1]
        logp.append( np.zeros( ( ncells, nbins ), dtype='float32' ) )

    for cnv_call in cnv_calls.itertuples():
        #
        # the create cnv tracks module already sets confidence to zero for masked bins
        # just use that confidence if it's already set
        #
        if cnv_call.Confidence==0.0:
            continue
        chrom_name = cnv_call.Chr
        chrom_index = chrom_names.index(chrom_name)
        ploidy = int(round(cnv_call.CopyNumber))
        assert(ploidy >= 0), 'Negative ploidy: %s' % repr(cnv_call)
        start = int(round(cnv_call.Start / bin_size))
        # end in BED file is exclusive
        end = int(round(cnv_call.End / bin_size))
        cell = cnv_call.NodeID
        #
        start = max(0, start)
        n_bins = raw_profiles[chrom_index].shape[1]
        end = min(end, n_bins)
        scores = get_segment_scores(raw_profiles[chrom_index][cell,:],
            poisson_expectations[chrom_index][cell,:],
            mask[chrom_index], start, end, ploidy)
        logp[chrom_index][cell,start:end] = scores
    # for cnv_call
    return logp
Example #18
    def write_sorted_bed(chunk_getter, outfilename):
        with open(outfilename, 'w') as out_file:
            #
            for chunk in chunk_outs:
                if not os.path.exists(chunk_getter(chunk)):
                    continue
                # if !exists
                with open(chunk_getter(chunk), 'r') as in_file:
                    shutil.copyfileobj(in_file, out_file, 1024 * 1024)
            # for chunk
        ref = contig_manager.contig_manager(args.reference_path)
        chroms = ref.primary_contigs(allow_sex_chromosomes=True)
        chrom_index = dict([(c, i) for i, c in enumerate(chroms)])

        cnv_df = pd.read_csv(outfilename, sep="\t", names=COLUMN_NAMES)
        cnv_df["chrom_index"] = cnv_df["Chr"].apply(chrom_index.get)
        cnv_df.sort_values(by=["chrom_index", "Start", "End"], inplace=True)
        cnv_df.to_csv(outfilename,
                      sep="\t",
                      columns=COLUMN_NAMES,
                      header=False,
                      index=False)
Example #19
def main(args, outs):
    ref = contig_manager.contig_manager(args.reference_path)
    args.coerce_strings()
    outs.coerce_strings()

    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.summary = None
        return

    bam_in = tk_bam.create_bam_infile(args.input)
    bam_chunk = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))

    # Skip reads without a barcode
    bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)
    bc_read_iter = itertools.groupby(bam_chunk_filt,
                                     lambda x: crdna_io.get_read_barcode(x))

    counts = {}

    for bc, reads in bc_read_iter:
        for r in reads:
            contig = bam_in.references[r.tid]
            species = ref.species_from_contig(contig)
            if not species in counts:
                counts[species] = {}
            if not bc in counts[species]:
                counts[species][bc] = 0
            if r.is_secondary or r.is_supplementary:
                ## we are ignoring alternate alignments
                continue
            if (r.is_unmapped or r.mapping_quality < CELL_DETECT_MAPQ_THRESHOLD
                    or r.is_duplicate):
                ## if read is unmapped, poor mapping quality or dup
                continue
            counts[species][bc] += 1
    outs.counts = counts
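
Note that itertools.groupby only groups consecutive records sharing a key, so this pattern relies on the BAM chunk being ordered by barcode (otherwise the same barcode can appear in several groups). A minimal reminder of that behaviour, with illustrative barcode tags:

import itertools

tags = ["AAACGG-1", "AAACGG-1", "CCCTAA-1", "AAACGG-1"]
groups = [(bc, len(list(rs))) for bc, rs in itertools.groupby(tags)]
print(groups)   # [('AAACGG-1', 2), ('CCCTAA-1', 1), ('AAACGG-1', 1)]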
Example #20
def split(args):
    args.coerce_strings()
    #
    ctg_mgr = contig_manager.contig_manager(args.reference_path)
    chroms = ctg_mgr.primary_contigs(allow_sex_chromosomes=False)
    #
    # Handle case when clusters = None
    if args.clusters is None:
        ncells = coverage_matrix.get_num_cells(args.coverage_profile,
                                               args.reference_path)
        clusters = [[x] for x in xrange(ncells)]
    else:
        f = open(args.clusters)
        clusters = json.load(f)
    cart_prod = []
    for chrom in chroms:
        for ci, cluster in enumerate(clusters):
            chunk_def = {'chrom': chrom, 'cluster_index': ci}
            cart_prod.append(chunk_def)
        # for cluster
    # for chrom
    #
    # Split these pieces into at most MAX_CHUNKS chunks
    MAX_CHUNKS = 100
    npieces = len(cart_prod)
    pieces_per_chunk = npieces / MAX_CHUNKS + int(npieces % MAX_CHUNKS != 0)
    chunks = []
    start = 0
    while start < npieces:
        chunk_def = {"chroms": [], "cluster_indices": []}
        end = min(start + pieces_per_chunk, npieces)
        for i in xrange(start, end):
            chunk_def["chroms"].append(cart_prod[i]["chrom"])
            chunk_def["cluster_indices"].append(cart_prod[i]["cluster_index"])
        chunks.append(chunk_def)
        start += pieces_per_chunk
    assert len(chunks) <= MAX_CHUNKS
    return {'chunks': chunks}
Example #21
def split(args):
    with open(args.cnv_calls, 'r') as infile:
        nodes = {l.rstrip().split('\t')[3] for l in infile}
    num_nodes = len(nodes)

    store = pd.HDFStore(args.raw_profiles)
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    max_chrom_nbins = max(store["/contigs/" + chrom].shape[1]
                          for chrom in chroms)
    store.close()

    MAX_CHUNKS = 30
    MIN_NODES_PER_CHUNK = 5

    nchunks = np.clip(np.ceil(1.0 * num_nodes / MIN_NODES_PER_CHUNK), 1,
                      MAX_CHUNKS)
    nodes_per_chunk = max(1, int(np.ceil(1.0 * num_nodes / nchunks)))

    chromsz_gb = 1.0 * max_chrom_nbins * max(1, num_nodes) / 1e9
    matsize_gb = (
        1.0 * coverage_matrix.get_genome_matrix_size_gb(args.raw_profiles) *
        nodes_per_chunk / max(1, num_nodes))
    unmerged_gb = int(np.ceil(os.path.getsize(args.unmerged_cnv_calls) / 1e9))
    chunk_mem_gb = int(np.ceil(6 * max(matsize_gb, chromsz_gb) + 2))
    join_mem_gb = int(np.ceil(6 * unmerged_gb + 2))

    chunk_defs = [{
        'chunk': {
            'start': i,
            'end': min(i + nodes_per_chunk, num_nodes)
        },
        '__mem_gb': chunk_mem_gb
    } for i in xrange(0, num_nodes, nodes_per_chunk)]

    return {'chunks': chunk_defs, 'join': {'__mem_gb': join_mem_gb}}
Example #22
def join(args, outs, chunk_defs, chunk_outs):
    ref = contig_manager.contig_manager(args.reference_path)
    ## only run normalization for single species samples
    species_list = ref.list_species()
    if len(species_list) == 1:
        chroms = ref.primary_contigs(allow_sex_chromosomes=True)
        profiles, mask, _ = load_genome_data(args.raw_profiles, args.tracks,
                                             chroms)
        gc = load_gc_data(args.tracks, chroms)
        scale, linear, quadratic = estimate_gc_normalization(
            profiles, gc, mask)
    else:
        ncells = coverage_matrix.get_num_cells(args.raw_profiles,
                                               args.reference_path)
        scale = [1.0] * ncells
        linear = [0.0] * ncells
        quadratic = [0.0] * ncells
    with open(outs.gc_norm_params, "w") as out:
        gc_norm_data = {
            "scale": scale,
            "linear": linear,
            "quadratic": quadratic
        }
        json.dump(gc_norm_data, out, indent=4)
Example #23
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    ## load genome data and cell profiles
    X, gmask, bdy = load_genome_data(
        args.profiles,
        args.tracks,
        chroms,
        mappability_threshold=crdna.constants.MAPPABILITY_THRESHOLD)

    # compute chromosome boundaries after masking by gmask
    cbdy = np.zeros_like(bdy)
    for i in xrange(1, len(bdy)):
        cbdy[i] = (gmask[0:bdy[i]].sum())

    ## load GC info and create GC emission track
    gctrack = []
    store = pd.HDFStore(args.tracks, "r")
    for chrom in chroms:
        gctrack.extend(store["/GC/" + chrom].values)
    gctrack = np.array(gctrack)[gmask]
    store.close()

    store = pd.HDFStore(args.ll_ratios, "r")
    llrs = store["/llrs"].values
    store.close()

    nbins = gmask.sum()
    ncells = X.shape[0]

    ## Heuristics to define breakpoints
    ll_threshold = 5
    delta_threshold = 0.10

    Y_quant = np.zeros((ncells, nbins), dtype="int8")
    scale_factor = np.zeros(ncells)
    windows_per_cell = []

    gc_norm_params = json.load(open(args.gc_norm_params, "r"))
    print "Starting loop over cells"
    sys.stdout.flush()
    for i in xrange(ncells):
        print "-" * 80
        print "Cell", i
        sys.stdout.flush()

        ## genome profile
        y = X[i][gmask]
        ## log likelihood ratio profile
        ll = llrs[i]

        ## GC coefficients
        gc_linear = gc_norm_params["linear"][i]
        gc_quadratic = gc_norm_params["quadratic"][i]

        ## GC correction track for cell
        xi = parabola(gctrack, crdna.constants.GC_ORIGIN, gc_linear,
                      gc_quadratic)
        xi_low = parabola(crdna.constants.MIN_GC, crdna.constants.GC_ORIGIN,
                          gc_linear, gc_quadratic)
        xi_high = parabola(crdna.constants.MAX_GC, crdna.constants.GC_ORIGIN,
                           gc_linear, gc_quadratic)
        xi[gctrack < crdna.constants.MIN_GC] = xi_low
        xi[gctrack > crdna.constants.MAX_GC] = xi_high

        ## Define breakpoints
        ##

        bp_cands2 = get_breakpoint_positions(y,
                                             ll,
                                             xi,
                                             ll_threshold=ll_threshold,
                                             delta_threshold=delta_threshold)

        assert bp_cands2[0] == 0, "genome start must be breakpoint"
        assert bp_cands2[-1] == y.shape[0], "genome end must be breakpoint"

        ## define segments using breakpoints
        segment_bdy = []
        for j in xrange(len(bp_cands2) - 1):
            segment_bdy.append((bp_cands2[j], bp_cands2[j + 1]))

        ## add chromosome boundaries as mandatory breakpoints
        segment_bdy = break_segments_at_points(segment_bdy,
                                               cbdy,
                                               verbose=False)
        validate_segment_intervals(segment_bdy, cbdy)

        ## aggregate bins within a segment to resolution given by window
        ## and compute segment mean read counts and lengths
        window = int(
            np.round(crdna.constants.BREAKPOINT_READ_THRESHOLD /
                     np.median(y[y > 0])))
        window = np.clip(window, 1, None)
        windows_per_cell.append(window)

        segment_means = []
        segment_lengths = []
        for s, e in segment_bdy:
            segment = y[s:e]
            xi_piece = xi[s:e]
            length = e - s
            agg = []
            xi_agg = []
            j = 0
            while j < length:
                piece = segment[j:j + window]
                assert len(piece) > 0, "%d, %d-%d" % (j, s, e)
                corr = float(window) / len(piece)
                agg.append(corr * piece.sum())
                xi_agg.append(xi_piece[j:j + window].mean())
                j += window
            agg = np.array(agg)
            xi_agg = np.array(xi_agg)
            ## remove outliers
            med = np.median(agg)
            mad = np.abs(agg - med)
            mmad = np.median(mad)
            outlier_mask = mad <= 5 * mmad
            segment_means.append(
                np.sum(agg[outlier_mask]) / np.sum(xi_agg[outlier_mask]))
            segment_lengths.append(e - s)
        segment_means = np.array(segment_means)
        segment_lengths = np.array(segment_lengths)

        ## Find the scaling factor to produce integer ploidies

        ## Heuristics

        # max ploidy to assign to initially chosen long segment
        max_ploidy_long = 10

        # max value of segment mean to consider "zero ploidy"
        zero_ploidy_count = crdna.constants.BREAKPOINT_READ_THRESHOLD / 4.0

        # longest segment with segment mean > zero_ploidy_count
        # that we will push to zero ploidy
        max_segment_push_to_zero = 200

        # prior params
        prior_params = {
            "prior_mean": args.params.get("prior_mean", 2.0),
            "prior_std": args.params.get("prior_std", 1.0)
        }
        min_ploidy = args.params.get("min_ploidy", None)
        max_ploidy = args.params.get("max_ploidy", None)
        lam_best = find_best_scale_v14(
            y,
            segment_bdy,
            segment_means,
            segment_lengths,
            window,
            max_ploidy_long=max_ploidy_long,
            zero_ploidy_count=zero_ploidy_count,
            prior_params=prior_params,
            max_segment_push_to_zero=max_segment_push_to_zero,
            min_ploidy=min_ploidy,
            max_ploidy=max_ploidy,
            verbose=True)

        print "Scaling factor:"
        print lam_best

        assert lam_best > 0.001, "run away to zero"
        scale_factor[i] = lam_best / window

        ## Compute the ploidy vector

        ploidy = get_ploidy_vector(y, segment_means, segment_bdy, lam_best)

        ## Set max ploidy at 127
        ploidy = np.clip(ploidy, None, np.iinfo("int8").max)

        print "Ploidies encountered"
        print Counter(ploidy).most_common()

        Y_quant[i, :] = ploidy.astype("int8")

    ## store in output h5
    out_store = pd.HDFStore(outs.denoised_profiles, "w")
    out_store["/constants"] = pd.Series(
        {"segment_windows": int(np.median(windows_per_cell))})
    out_store["/scale_factor"] = pd.Series(scale_factor)
    out_store["/quantized"] = pd.DataFrame(Y_quant)
    out_store.close()
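
The outlier handling inside the segment loop is a median-absolute-deviation filter; on its own it behaves like this (toy numbers):

import numpy as np

agg = np.array([98., 102., 100., 101., 400.])   # one obvious outlier
med = np.median(agg)                 # 101.0
mad = np.abs(agg - med)              # [3, 1, 1, 0, 299]
mmad = np.median(mad)                # 1.0
outlier_mask = mad <= 5 * mmad       # keeps everything except the 400
print(agg[outlier_mask].mean())      # 100.25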
Example #24
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    store = pd.HDFStore(args.cluster_data, "r")
    windows = store["windows"]
    Q = store["quantized"].values
    #
    # due to the int8 conversion and the use of -127 as a special value,
    # unpredictable bad things will happen if Q>126
    # in practice we don't expect such a thing to ever occur
    #
    martian.log_info("Found %d bins with Q>126" % (Q > 126).sum())
    Q[Q > 126] = 126
    constants = store["constants"]
    store.close()

    ## cnv track
    ncells = Q.shape[0]

    store = pd.HDFStore(args.tracks, "r")

    window_size = store["constants"]["window_size"]

    nbins = 0
    for chrom in chroms:
        nbins += store["/map/" + chrom].shape[0]

    C = MISSING_VALUE * np.ones((ncells, nbins), dtype="int8")

    #
    # The C array is filled out with the following convention:
    # * unmasked bins have positive ploidies
    # * masked bins with imputed ploidies are recorded with negative ploidies
    #
    chrom_start = 0
    masked_chrom_start = 0
    chrom_bdy = {}
    for chrom in chroms:
        ctrack = store["/map/" + chrom].values
        cmask = ctrack > crdna.constants.MAPPABILITY_THRESHOLD
        chrom_end = chrom_start + len(cmask)
        masked_chrom_end = masked_chrom_start + cmask.sum()
        chrom_bdy[chrom] = (chrom_start, chrom_end)
        C[:, chrom_start:chrom_end][:, cmask] = Q[:, masked_chrom_start:
                                                  masked_chrom_end]
        impute_ploidies_for_chromosome_nocall_boundaries(
            C, chrom_start, chrom_end, window_size)
        chrom_start = chrom_end
        masked_chrom_start = masked_chrom_end

    store.close()
    in_store = pd.HDFStore(args.cluster_data, "r")
    out_store = pd.HDFStore(outs.cnv_tracks, "w")
    out_store["/cnv_tracks"] = pd.DataFrame(C)
    out_store["/windows"] = windows
    out_store["constants"] = constants
    out_store["/ploidy_conf"] = in_store["/ploidy_conf"]
    out_store["/reads_per_bin"] = in_store["/reads_per_bin"]
    out_store["/scale_factor"] = in_store["/scale_factor"]
    out_store.close()
    in_store.close()

    ## break up profile into segments and write to BED

    with open(outs.cnv_calls, "w") as out_bed, open(outs.unmerged_cnv_calls,
                                                    "w") as out_unmerged_bed:
        for cell in xrange(ncells):
            for chrom in chroms:
                chrom_start, chrom_end = chrom_bdy[chrom]
                ## chrom piece of CNV
                chrom_piece = C[cell, chrom_start:chrom_end]

                for b in get_event_blocks_v2(cell, chrom, chrom_piece,
                                             window_size, ref):
                    out_bed.write("\t".join(map(str, b)) + os.linesep)
                for b in get_event_blocks_v2(cell,
                                             chrom,
                                             chrom_piece,
                                             window_size,
                                             ref,
                                             merge_imputed_blocks=False):
                    out_unmerged_bed.write("\t".join(map(str, b)) + os.linesep)
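
The double indexing `C[:, chrom_start:chrom_end][:, cmask] = ...` works because the first slice is a numpy view, so the masked assignment writes straight back into C. A standalone illustration of that scatter (the -127 sentinel follows the comment above; the mask and ploidies are made up):

import numpy as np

MISSING_VALUE = -127
C = MISSING_VALUE * np.ones((2, 6), dtype="int8")
cmask = np.array([True, False, True, True, False, True])   # mappable bins
Q = np.array([[2, 2, 3, 2],
              [1, 2, 2, 2]], dtype="int8")                  # ploidies for mappable bins only
C[:, 0:6][:, cmask] = Q
print(C)
# [[   2 -127    2    3 -127    2]
#  [   1 -127    2    2 -127    2]]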
Example #25
def join(args, outs, chunk_defs, chunk_outs):
    num_sc_bcs = 0
    num_qual_reads = 0
    num_sc_reads = 0
    bc_counts = {}
    
    ## compute species_list
    ref = contig_manager.contig_manager(args.reference_path)
    species_list = ref.list_species()
    species_list.sort()

    ## doublet rate estimation
    total_unique_cell_barcodes = set()
    total_cell_barcodes = []
    species_counts = {}
    for (species, species_barcodes) in args.cell_barcodes.iteritems():
        species_counts[species] = 0
        for bc in species_barcodes.iterkeys():
            total_cell_barcodes.append(bc)
            total_unique_cell_barcodes.add(bc)
            species_counts[species] += 1
    counts = species_counts.values()

    observed_doublets = len(total_cell_barcodes) - len(total_unique_cell_barcodes)
    observed_doublet_rate = tk_stats.robust_divide(observed_doublets,
        float(len(total_cell_barcodes)))
    
    inferred_doublets = float('NaN')
    inferred_doublet_rate = float('NaN')
    if len(species_counts) > 1:
        inferred_doublets = _infer_multiplets_from_observed(observed_doublets,
            counts[0], counts[1])
        inferred_doublet_rate = tk_stats.robust_divide(float(inferred_doublets),
            float(len(total_cell_barcodes)))
    
    ## combine barnyard_hits chunks
    combine_csv([c.barnyard_hits for c in chunk_outs], outs.barnyard_hits,
                 header_lines=1)
    
    ## aggregate summary.json from chunks
    raw_bc_on_whitelist = 0
    for j,chunk_out in enumerate(chunk_outs):
        if chunk_out.summary is None: continue
        chunk_summary = json.loads(open(chunk_out.summary).read())
        num_sc_bcs += chunk_summary['num_sc_bcs']
        num_qual_reads += chunk_summary['num_sc_qual_reads']
        num_sc_reads += chunk_summary['num_sc_reads']
        raw_bc_on_whitelist += chunk_summary['raw_bc_on_whitelist']

        chunk_bc_counts_file = open(chunk_out.barcode_histogram)
        chunk_bc_counts = json.loads(chunk_bc_counts_file.read())
        bc_counts.update(chunk_bc_counts)

    ## combine barnyard chunks
    combine_csv([c.barnyard for c in chunk_outs], outs.barnyard,
                 header_lines=1)

    n_reads = np.array(bc_counts.values())
    max_val = np.percentile(n_reads, 99.99) * 1.3
    min_val = n_reads.min()
    num_bins = 400
    step = math.ceil((max_val - min_val)/num_bins)
    if max_val - min_val < 1e-6:
        bins = np.array([min_val, min_val+1])
    else:
        bins = np.arange(min_val, max_val, step)
    (hist, edges) = np.histogram(n_reads, bins=bins)
    bc_hist = {int(edges[i]):hist[i] for i in range(len(bins)-1)}

    cells = 0
    for (speci, cell_list) in args.cell_barcodes.iteritems():
        cells += len(cell_list)
    summary_info = {}
    summary_info['cells_detected'] = cells
    summary_info['num_sc_bcs'] = num_sc_bcs
    summary_info['num_sc_qual_reads'] = num_qual_reads
    summary_info['num_sc_reads'] = num_sc_reads
    summary_info['fract_sc_reads'] = tk_stats.robust_divide(num_sc_reads, num_qual_reads)
    summary_info['observed_doublets'] = observed_doublets
    summary_info['observed_doublet_rate'] = observed_doublet_rate
    summary_info['inferred_doublets'] = inferred_doublets
    summary_info['inferred_doublet_rate'] = inferred_doublet_rate
    
    ## compute stats from barnyard file
    barnyard_df = pd.read_csv( outs.barnyard )
    bkeys = ["amp_rate", "library_complexity", "dup_ratio", "mapped", "mapped_frac"]
    for species in species_list:
        if len(species_list) == 1:
            key_suffix = ""
        else:
            key_suffix = "_" + species

        is_cell_filter = barnyard_df["is_%s_cell_barcode"%species] == 1
        species_barcodes = args.cell_barcodes.get(species, {} )
        
        ## compute quartiles, min, mean, max and CV
        for bkey in bkeys:
            vals = barnyard_df[bkey][is_cell_filter]
            for pct in [25, 50, 75]:
                summary_key = bkey + key_suffix + ("_cells_p%d"%pct)
                summary_info[summary_key] = tk_stats.robust_percentile(vals, pct)
            summary_key = bkey + key_suffix + "_cells_cv"
            summary_info[summary_key] = tk_stats.robust_divide(vals.std(), vals.mean())
            summary_key = bkey + key_suffix + "_cells_mean"
            summary_info[summary_key] = vals.mean()
            summary_key = bkey + key_suffix + "_cells_min"
            summary_info[summary_key] = vals.min()
            summary_key = bkey + key_suffix + "_cells_max"
            summary_info[summary_key] = vals.max()

    ## tabulate waste metrics from the barnyard file
    waste_keys = ["no_barcode", "non_cell_barcode", "unmapped", 
                  "low_mapq_lt_%d"%PROFILE_MAPQ_THRESHOLD, 
                  "dups", "denominator", "unusable_read"]
    bh_df = pd.read_csv( outs.barnyard)

    # calculate the median unmapped fraction (unmapped / denominator) over cell barcodes
    barnyard_cell_df = bh_df[~(bh_df.cell_id == 'None')]
    unmapped_frac = 1.0 * barnyard_cell_df['unmapped'] / barnyard_cell_df['denominator']
    unmapped_frac = unmapped_frac.fillna(0)
    median_unmapped_frac = unmapped_frac.median()

    waste_totals = {}
    sum_waste_keys = 0.0
    for key in waste_keys:
        waste_totals[key] = float(bh_df[key].sum( ))
        if key != "denominator":
            sum_waste_keys += waste_totals[key]
    for level, key in enumerate(waste_keys):
        if key == "denominator":
            continue
        summary_info["waste_%s_reads"%key] = waste_totals[key]
        summary_info["frac_waste_%s"%(key)] = tk_stats.robust_divide(
            waste_totals[key], waste_totals["denominator"] )
    summary_info["waste_total_reads"] = sum_waste_keys
    summary_info["frac_waste_total"] = tk_stats.robust_divide(
        sum_waste_keys, waste_totals["denominator"] )
    summary_info['frac_raw_bc_on_whitelist'] = float(raw_bc_on_whitelist)/waste_totals["denominator"]
    summary_info['median_unmapped_frac'] = median_unmapped_frac

    ## compute leakage metric and add to summary_info
    if len(species_list) == 2:
        compute_leakage( outs.barnyard_hits, ref, summary_info )
    
    with open(outs.summary, 'w') as summary_file:
        summary_file.write(tenkit.safe_json.safe_jsonify(summary_info,pretty=True))

    with open(outs.barcode_histogram, 'w') as bc_hist_file:
        bc_hist_file.write(tenkit.safe_json.safe_jsonify(bc_hist))

    # logging
    print tenkit.safe_json.safe_jsonify(summary_info, pretty=True)
Example #26
def main(args, outs):
    #min_insert_size = 0
    #max_insert_size = 1e4
    
    ## sc purity threshold: what fraction of contamination by another species
    ## will we tolerate
    SC_PURITY_THRESHOLD = 0.95

    args.coerce_strings()
    outs.coerce_strings()

    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.summary = None
        return

    ## group bam records by barcode NO_BARCODE/raw barcode tag/processed barcode tag
    bam_in = tk_bam.create_bam_infile(args.input)
    bam_chunk = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    bc_read_iter = itertools.groupby(bam_chunk, groupbybarcode)

    ## compute species_list
    refs = bam_in.references
    ref = contig_manager.contig_manager(args.reference_path)
    species_list = ref.list_species()
    has_species_info = (species_list != [""])
    species_list.sort()
    genome_size = sum(ref.get_contig_lengths().values())

    ## index cells of each species
    cell_index = {}
    for sp in species_list:
        bc_list = args.cell_barcodes.get(sp, {}).keys()
        bc_list.sort( )
        for i, b in enumerate(bc_list):
            y = cell_index.get(b, "")
            if len(y) == 0:
                cell_index[b] = "%s_cell_%d"%(sp, i)
            else:
                cell_index[b] = y + "_" + "%s_cell_%d"%(sp, i)

    ## construct and write header for barnyard file
    barnyard_file = open(outs.barnyard, 'w')
    barnyard_header = (['BC'] + ["cell_id"] +
        [s+("_" if has_species_info else "")+"reads_mapq_60" for s in species_list] +
        [s+("_" if has_species_info else "")+"contigs" for s in species_list] +
        ['mapped',
        'num_mapped_bases',
        'soft_clip_frac',
        'insert_p50',
        'num_mapped_pos',
        'mapped_frac',
        'amp_rate',
        'library_complexity',
        'dup_ratio',
        'num_pairs'] +
        ["is_%s_cell_barcode"%s for s in species_list])
    waste_keys = ["no_barcode", "non_cell_barcode", "unmapped",
                  "low_mapq_lt_%d"%PROFILE_MAPQ_THRESHOLD,
                  "dups", "denominator", "unusable_read"]
    fractional_waste_keys = [
                  "no_barcode_frac", "non_cell_barcode_frac", "unmapped_frac",
                  "low_mapq_lt_%d_frac"%PROFILE_MAPQ_THRESHOLD, "dups_frac"]

    barnyard_header.extend(waste_keys)
    barnyard_header.extend(fractional_waste_keys)
    barnyard_file.write( ",".join(barnyard_header) + "\n" )

    ## wasted data categories

    ## construct and write header for barnyard_hits file
    barnyard_hits_file = open( outs.barnyard_hits, "w" )
    bh_header = ["barcode", "is_whitelisted"]
    bh_header.extend(["is_%s_cell_barcode"%s for s in species_list])
    bh_header.extend([refname for refname in bam_in.references])
    barnyard_hits_file.write( ",".join(bh_header) + "\n" )

    # For each barcode, count reads per contig, per window (for each window size),
    # and per species (when species info is available in the contig names)
    # TODO: Add detailed matrix by contigs, windows output
    num_sc_bcs = 0
    num_qual_reads = 0
    num_sc_reads = 0

    ploidy = 2
    bc_hist = {}

    ## count number of raw barcodes that exactly match whitelist
    ## without any error correction
    raw_bc_on_whitelist = 0
    # dup_summary = json.load(open(args.duplicate_summary))
    # pcr_dup_fraction = dup_summary['dup_fraction']['pcr']
    #barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    for bc, reads in bc_read_iter:
        ## collect various forms of wasted data here per barcode
        wastebin = defaultdict(int)

        bh_hits = [0 for _ in bam_in.references]
        dup_count = 1
        non_dup = 1
        bc_count = 0
        num_per_species = defaultdict(int)
        contigs_per_species = defaultdict(set)

        total_reads_by_clip = np.zeros(2, dtype=float)

        insert_length = []
        num_pairs = 0
        num_mapped = 0
        num_mapped_bases = 0
        pos_set = set([])
        for r in reads:
            ## secondary/supplementary are never counted towards anything
            if r.is_secondary or r.is_supplementary:
                continue

            ## include everything in the denominator
            wastebin["denominator"] += 1

            ## how many reads have >= 10 soft clipped bases
            if r.cigartuples is not None:
                cigar_dict = dict(r.cigartuples)
                soft_clip_index = int(cigar_dict.get(4, 0) >= 10)
                total_reads_by_clip[soft_clip_index] += 1

            if barnyard_hits_include(r):
                bh_hits[r.tid] += 1
            ## non-whitelisted barcodes count as wasted data
            if not "-" in bc:
                wastebin["no_barcode"] += 1
                continue

            if bc[:-2] == r.get_tag(RAW_BARCODE_TAG):
                raw_bc_on_whitelist += 1

            is_cell_bc_read = True

            ## waste hierarchy
            ## if not a cell or if read doesn't belong to species, then waste
            ## else if not mapped, then waste
            ## else if mapq< 30, then waste
            ## else if dup, then waste

            ## is this is a contaminant read from a different species
            ## it is wasted
            contig = refs[r.tid]
            read_species = ref.species_from_contig(contig)
            if ( not(read_species in args.cell_barcodes) or
                 not(bc in args.cell_barcodes[read_species]) ):
                wastebin["non_cell_barcode"] += 1
                is_cell_bc_read = False
            elif r.is_unmapped:
                wastebin["unmapped"] += 1
            elif r.mapq < PROFILE_MAPQ_THRESHOLD:
                wastebin["low_mapq_lt_%d"%PROFILE_MAPQ_THRESHOLD] += 1
            elif r.is_duplicate:
                wastebin["dups"] += 1
            bad_map_or_dup = (r.is_unmapped or
                              (r.mapq < PROFILE_MAPQ_THRESHOLD) or
                              r.is_duplicate)

            if is_cell_bc_read:
                bc_count += 1
                # if (stringent_read_filter(r, True) and
                #         not(r.is_unmapped) and not(r.mate_is_unmapped)):
                #     if r.is_duplicate:
                #         dup_count += 1
                #     else:
                #         non_dup += 1
                if r.has_tag(DUPLICATE_COUNT_TAG):
                    dup_count += r.get_tag(DUPLICATE_COUNT_TAG)
                    non_dup += 1
            elif bad_map_or_dup:
                # unusable reads are reads from non-cell barcodes that are also
                # unmapped, low mapq, or duplicates
                wastebin['unusable_read'] += 1

            ## whether we have a cell barcode or not, count these stats
            if not bad_map_or_dup:
                num_mapped += 1
                num_mapped_bases += r.reference_length

                pos_set.add((r.reference_name, r.reference_start/1000))

                ## if read is part of a proper pair, only count read or its pair
                if r.is_proper_pair:
                    if r.is_read1:
                        insert_length.append( r.template_length )
                        num_pairs += 1
                    else:
                        continue

                ## Use MAPQ >= 60 to get accurate mappings only for barnyard stuff
                if r.mapq < 60:
                    continue
                num_qual_reads += 1
                if has_species_info:
                    num_per_species[read_species] += 1
                    contigs_per_species[read_species].add(contig)
            ## per-read sanity check: every counted read is either mapped or in exactly one waste bin
            assert wastebin['denominator'] - wastebin['no_barcode'] - wastebin['unusable_read'] == num_mapped + \
                   wastebin["low_mapq_lt_%d" % PROFILE_MAPQ_THRESHOLD] + wastebin['unmapped'] + wastebin['dups']

        ## compute the library complexity and amp
        ## NOTE: insert length is hardcoded as 250, so the amp rate is really the
        ## library complexity in different units 
        num_amplicons = num_mapped - num_pairs
        dup_ratio = tk_stats.robust_divide(float(dup_count + non_dup), float(non_dup))
        
        library_complexity = tk_stats.robust_divide(num_amplicons, (dup_ratio-1.0)*2)

        amp_rate = tk_stats.robust_divide(float(library_complexity * DEFAULT_AMPLICON_LENGTH) ,
            float(ploidy * genome_size))

        bc_hist[bc] = bc_count
        map_rate = tk_stats.robust_divide(float(num_mapped), wastebin["denominator"])
        
        ## write row to barnyard_hits file
        bh_row = [ bc, int("-" in bc)]
        for s in species_list:
            bh_row.append( int(s in args.cell_barcodes and bc in args.cell_barcodes[s]) )
        bh_row.extend( bh_hits )
        barnyard_hits_file.write(",".join(map(str, bh_row)) + "\n" )

        ## write row to barnyard file
        barnyard_row = ([bc, cell_index.get(bc, "None")] +
            [num_per_species[s] for s in species_list] +
            [len(contigs_per_species[s]) for s in species_list] +
            [num_mapped, num_mapped_bases] +
            [tk_stats.robust_divide(total_reads_by_clip[1], sum(total_reads_by_clip)),
            np.median(insert_length) if len(insert_length) else np.nan,
            len(pos_set),
            map_rate,
            amp_rate,
            library_complexity,
            dup_ratio,
            num_pairs])
        for speci in species_list:
            barnyard_row.append( int((speci in args.cell_barcodes) and 
                (bc in args.cell_barcodes[speci])) )

        for key in waste_keys:
            fkey = key + "_frac"
            if (fkey in fractional_waste_keys):
                wastebin[fkey] = tk_stats.robust_divide(float(wastebin[key]), float(wastebin["denominator"]))
        barnyard_row.extend( [ wastebin[x] for x in waste_keys ] )
        barnyard_row.extend( [ wastebin[x] for x in fractional_waste_keys ] )

        barnyard_file.write( ",".join(map(str, barnyard_row)) + "\n")
        
        ## metrics relating to purity - only for multi species
        if has_species_info and len(species_list) >= 2:
            counts_by_species = [float(num_per_species[s]) for s in species_list]

            major_species_index = np.argmax( counts_by_species )
            major_species = species_list[major_species_index]
            species_purity = tk_stats.robust_divide( counts_by_species[major_species_index],
                np.sum(counts_by_species) )

            if species_purity >= SC_PURITY_THRESHOLD:
                num_sc_bcs += 1
                num_sc_reads += num_per_species[major_species]
        ## END of loop over barcodes

    summary_info = {}
    summary_info['num_sc_bcs'] = num_sc_bcs
    summary_info['num_sc_qual_reads'] = num_qual_reads
    summary_info['num_sc_reads'] = num_sc_reads
    summary_info['raw_bc_on_whitelist'] = raw_bc_on_whitelist

    barnyard_file.close()
    barnyard_hits_file.close()
    
    with open(outs.summary, 'w') as summary_file:
        summary_file.write(tenkit.safe_json.safe_jsonify(summary_info))

    with open(outs.barcode_histogram, 'w') as bc_hist_file:
        bc_hist_file.write(tenkit.safe_json.safe_jsonify(bc_hist))
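
To make the complexity arithmetic concrete: the comment above notes the amplicon length is hardcoded at 250 bp, and the per-barcode tallies and genome size below are purely illustrative.

num_mapped = 120000
num_pairs = 55000
dup_count, non_dup = 30000, 90000
DEFAULT_AMPLICON_LENGTH = 250          # per the "hardcoded as 250" comment above
ploidy = 2
genome_size = int(3.1e9)               # assumed single-species genome size

num_amplicons = num_mapped - num_pairs                          # 65,000
dup_ratio = float(dup_count + non_dup) / non_dup                # ~1.33
library_complexity = num_amplicons / ((dup_ratio - 1.0) * 2)    # ~97,500
amp_rate = library_complexity * DEFAULT_AMPLICON_LENGTH / float(ploidy * genome_size)
print("complexity ~%.0f, amp_rate ~%.4f" % (library_complexity, amp_rate))   # ~97500, ~0.0039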
Example #27
def main(args, outs):
    """Compute a CNV confidence score from the profile for a specific choice of cluster
    and contig."""
    martian.log_info('Entering __init__.main()')
    node_start = args.chunk['start']
    # exclusive end
    node_end = args.chunk['end']
    raw_profiles, mask = coverage_matrix.load_matrix(args.raw_profiles,
                                                     args.reference_path,
                                                     start_cell=node_start,
                                                     end_cell=node_end)

    bin_size = coverage_matrix.get_bin_size(args.raw_profiles)

    ## read in CNV data for nodes of interest
    node_column = COLUMN_NAMES.index("NodeID")
    cnv_calls = read_cnv_data(args.cnv_calls, node_start, node_end,
                              node_column)
    #
    scale = get_scaling_factors(raw_profiles, cnv_calls)
    with open(args.gc_norm_params, "r") as handle:
        gc_norm_params = json.load(handle)
    linear = gc_norm_params["linear"]
    quadratic = gc_norm_params["quadratic"]
    #
    ref = contig_manager.contig_manager(args.reference_path)
    #
    # Get mappability, GC content:
    bin_parameters = []
    vesna.load_track_parameters(args.tracks, bin_parameters, ref)
    #
    logp, cnv_calls2 = ccs.process_cnv_calls(raw_profiles, mask,
                                             bin_parameters,
                                             args.reference_path, args.sex,
                                             scale, linear, quadratic,
                                             cnv_calls, bin_size)

    export_segments(outs.cnvs, cnv_calls2, node_start)

    # free some memory
    del cnv_calls
    del cnv_calls2

    #
    # Compute confidence values for unmerged, broken-up CNV calls
    #
    unmerged_cnv_calls = read_cnv_data(args.unmerged_cnv_calls, node_start,
                                       node_end, node_column)

    _, unmerged_cnv_calls2 = ccs.process_cnv_calls(raw_profiles,
                                                   mask,
                                                   bin_parameters,
                                                   args.reference_path,
                                                   args.sex,
                                                   scale,
                                                   linear,
                                                   quadratic,
                                                   unmerged_cnv_calls,
                                                   bin_size,
                                                   logp=logp)

    export_segments(outs.unmerged_cnvs, unmerged_cnv_calls2, node_start)
    #
    # Debugging:
    #
    martian.log_info('Leaving __init__.main()')
    martian.log_info('.' * 80)
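
read_cnv_data above is expected to return only the calls whose node id falls in [node_start, node_end); its implementation is not shown in this example. A minimal sketch of that kind of node-range filter, assuming a tab-separated calls file with the node id in node_column (the helper name, field layout, and return type are illustrative assumptions, not the pipeline's actual reader):

def _read_calls_for_node_range(path, node_start, node_end, node_column):
    rows = []
    with open(path) as handle:
        for line in handle:
            if line.startswith("#"):
                continue
            fields = line.rstrip("\n").split("\t")
            ## keep rows for this chunk's node range only
            if node_start <= int(fields[node_column]) < node_end:
                rows.append(fields)
    return rows
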
예제 #28
0
def estimate_gc_bias(profiles, tracks, reference_path):
    ## load genome tracks and profiles skipping sex chromosomes
    
    ref = contig_manager.contig_manager(reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=False)

    maptrack = pd.HDFStore(tracks, "r")
    cmask = []
    gctrack = []
    bdy = [0]
    mtrack = []
    for chrom in chroms:
        x = maptrack["/map/"+chrom].values > MAPPABILITY_THRESHOLD
        cmask.extend(x)
        z = bdy[-1] + len(x)
        gctrack.extend(maptrack["/GC/"+chrom].values)
        mtrack.extend(maptrack["/map/"+chrom].values)
        bdy.append(z)
    cmask = np.array(cmask)
    maptrack.close( )
    gctrack = np.array(gctrack)
    mtrack = np.array(mtrack)
    bdy = np.array(bdy)

    nbins = bdy[-1]
    pstore = pd.HDFStore(profiles, "r")
    ncells = len(pstore["/barcodes"].values)
    X = np.zeros((ncells, nbins), dtype="int32")
    for ci, chrom in enumerate(chroms):
        X[:, bdy[ci]:bdy[ci+1]] = pstore["/contigs/"+chrom].values
    pstore.close( )

    ## genome wide profile of all cells @ GC_RES resolution
    ## restricted to mappable regions
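    ## (aggregate_counts presumably pools consecutive windows of GC_RES bins;
    ##  dividing the pooled GC track by GC_RES then gives the mean GC per window)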
    y = aggregate_counts(X.sum( axis=0 )[cmask], GC_RES).astype(float)
    y /= y.mean( )
    gc = aggregate_counts(gctrack[cmask], GC_RES)/GC_RES

    gcbins = np.linspace(MIN_GC, MAX_GC, NUM_GC_BINS+1)
    gc_vals = 0.5 * (gcbins[1:] + gcbins[:-1])
    gc_bin_index = np.searchsorted(gcbins, gc)
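    ## searchsorted maps GC values in (gcbins[bi-1], gcbins[bi]] to bin index bi
    ## (1..NUM_GC_BINS); out-of-range values land in bins 0 or NUM_GC_BINS+1 and
    ## are ignored by the loop below, as are sparsely populated bins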
    gc0 = np.nanmean(gc_vals)

    ## group data points by GC bins and compute the median
    x_vals = []
    y_vals = []
    for bi in xrange(1, NUM_GC_BINS+1):
        bin_filter = gc_bin_index == bi
        num_data_points = bin_filter.sum( )
        if num_data_points < MIN_POINTS_PER_BIN:
            continue
        bin_gc = gc_vals[bi-1]
        bin_val = np.median(y[bin_filter])
        x_vals.append(bin_gc)
        y_vals.append(bin_val)
    # for bi
    x_vals = np.array(x_vals) - gc0
    
    ## fit to ax^2 + bx + c
    a, b, c = np.polyfit(x_vals, y_vals, 2)
    
    ## GC metric is mean absolute deviation away from 1.0
    gc_metric = np.abs(np.array(y_vals) - 1.0).sum( ) / len(y_vals)

    ## store gc data in summary
    summary = {}
    summary["GC_content"] = x_vals
    summary["scaled_read_counts"] = y_vals
    summary["quadratic_coefficients"] = [a, b, c]
    summary["gc_cells_only"] = gc_metric
    summary["gc0"] = gc0
   
    #with open(outs.summary, "w") as out:
    #    json.dump(summary, out, indent=4)
    #
    return {'GCMetric': gc_metric, 'Summary': summary}
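
The coefficients returned above describe expected coverage as a quadratic in GC content centered at gc0. A minimal sketch of how they could be applied to flatten GC bias in a profile, assuming the correction simply divides each bin by the fitted curve (the helper name and the clamping constant are illustrative, not taken from the original pipeline):

def _apply_gc_fit(profile, gc_per_bin, coeffs, gc0):
    a, b, c = coeffs
    x = np.asarray(gc_per_bin, dtype=float) - gc0
    ## expected relative coverage from the quadratic fit, clamped away from zero
    expected = np.maximum(a * x * x + b * x + c, 1e-3)
    return np.asarray(profile, dtype=float) / expected
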
예제 #29
0
def main(args, outs):
    hostname = socket.gethostname()

    # Sample ID / pipestance name
    if args.sample_id is not None:
        if not re.match("^[\w-]+$", args.sample_id):
            martian.exit("Sample name may only contain letters, numbers, underscores, and dashes: " + args.sample_id)

    # Check numerical options
    # types are already checked by mrp so only need to check ranges
    if args.force_cells is not None and (args.force_cells < 1 or
        args.force_cells > 20000):
        martian.exit("MRO parameter force_cells must be a positive integer"\
            " <= 20000.")

    # check min_ploidy, max_ploidy
    if args.cnv_params is not None:
        min_ploidy = args.cnv_params.get("min_ploidy", None)
        max_ploidy = args.cnv_params.get("max_ploidy", None)
        if min_ploidy is not None and min_ploidy <= 0:
            martian.exit("Command line argument soft-min-avg-ploidy must be a "\
                "positive real number.")
        if max_ploidy is not None and (max_ploidy <= 0 or max_ploidy > 8.0):
            martian.exit("Command line argument soft-max-avg-ploidy must be a "\
                "positive real number <= 8.")
        if (min_ploidy is not None and max_ploidy is not None and 
            max_ploidy <= min_ploidy):
            martian.exit("Command line arguments must satisfy "\
                "soft-min-avg-ploidy < soft-max-avg-ploidy.")

    # check downsample options
    if args.downsample is not None and len(args.downsample.keys()) > 0:
        keys = args.downsample.keys()
        if len(keys) > 1:
            martian.exit("Please supply either maxreads or downsample but not "\
                "both.")
        key = keys[0]
        value = args.downsample[key]
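        # map internal sample_def keys to the user-facing CLI flag names used in the error message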
        param_map = {"target_reads" : "maxreads", "gigabases" : "downsample"}
        try:
            # compare the parsed numeric value, not the raw argument
            bad_value = float(value) < 1e-12
        except (ValueError, TypeError):
            bad_value = True
        if bad_value:
            cs_key = param_map[key]
            martian.exit("Command line argument %s must be a positive number" %
                cs_key)

    # FASTQ input
    for idx, sample_def in enumerate(args.sample_def):
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit("On machine: %s, longranger does not have permission to open FASTQ folder: %s" % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match("^[\w-]+$", library_id):
                martian.exit("Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        if args.fastq_mode == "BCL_PROCESSOR":
            sample_indices, msg = tk_preflight.check_sample_indices(sample_def)
            if sample_indices is None:
                martian.exit(msg)

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            reads = []
            for sample_index in sample_indices:
                # process interleaved reads
                reads.extend(find_func(read_path, "RA", sample_index, lanes))
            if len(reads) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        elif args.fastq_mode == "ILMN_BCL2FASTQ":
            sample_names = sample_def.get("sample_names", None)
            if sample_names is None:
                martian.exit("Entry {} in sample_def missing required field: sample_names".format(idx))
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            reads1 = []
            reads2 = []
            for sample_name in sample_names:
                r1 = find_func(read_path, "R1", sample_name, lanes)
                r2 = find_func(read_path, "R2", sample_name, lanes)
                if len(r1) != len(r2):
                    martian.exit("Entry {} in sample_defs are missing input FASTQs.".format(idx))
                reads1.extend(r1)
                reads2.extend(r2)
            if len(reads1) == 0 and len(reads2) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        else:
            martian.exit("Unrecognized fastq_mode: {}".format(args.fastq_mode))

    # Reference
    ok, msg = tk_preflight.check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)
    contig_defs_json_path = os.path.join(args.reference_path, "fasta", 
        "contig-defs.json")
    faidx_path = os.path.join(args.reference_path, "fasta", 
        "genome.fa.fai")
    error_msg = contig_manager.verify_contig_defs(contig_defs_json_path,
        faidx_path)
    if error_msg is not None:
        martian.exit(error_msg)

    try:
        ref = contig_manager.contig_manager(args.reference_path)
    except Exception as e:
        martian.exit("Unexpected error occurred.\n%s"%str(e))

    # too many contigs
    primary = ref.primary_contigs(allow_sex_chromosomes=True)
    num_primary_contigs = len(primary)
    if num_primary_contigs > 100:
        martian.exit("There can be at most 100 primary contigs.")

    # contig length checks
    chrom_length_dict = ref.get_contig_lengths()

    contig_length_exit = 500 * 1000
    contig_length_warn = 10 ** 7
    offending_contigs_warn = []
    offending_contigs_exit = []
    for c in primary:
        clen = chrom_length_dict[c]
        if clen < contig_length_exit:
            offending_contigs_exit.append(c)
        elif clen < contig_length_warn:
            offending_contigs_warn.append(c)
    if len(offending_contigs_exit) > 0:
        martian.exit("Primary contig(s) \"%s\" are shorter than %d bases. "\
            "Every primary contig must be at least %d bases "\
            "in length."%(",".join(offending_contigs_exit), contig_length_exit,
                          contig_length_exit))
    elif (not args.check_executables) and len(offending_contigs_warn) > 0:
        martian.alarm("Primary contig(s) \"%s\" are shorter than %d bases. "\
            "Every primary contig is recommended to be at least %d bases "\
            "in length."%(",".join(offending_contigs_warn), contig_length_warn,
                          contig_length_warn))

    # Open file handles limit 
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    martian.log_info(tk_preflight.record_package_versions())
예제 #30
0
def join(args, outs, chunk_defs, chunk_outs):
    ## merge gc params jsons
    node_gc_params = {}
    sc_gc_params = json.load(open(args.sc_gc_params, "r"))
    internal_gc_params = json.load(open(args.internal_gc_params, "r"))

    ncells = len(sc_gc_params['linear'])
    nnodes = 2*ncells - 1
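    ## a binary tree over ncells leaves has ncells - 1 internal nodes; leaves keep
    ## indices 0..ncells-1 and internal nodes get ncells..2*ncells-2, which is why
    ## internal node ids are offset by ncells when the BED files are merged below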

    for key in ["scale", "linear", "quadratic"]:
        node_gc_params[key] = sc_gc_params[key] + internal_gc_params[key]
    with open(outs.node_gc_params, "w") as out:
        json.dump(node_gc_params, out, indent=4)

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    index_chrom = dict([(str(i), c) for i, c in enumerate(chroms)])
    chrom_index = dict([(c, str(i)) for i, c in enumerate(chroms)])
    tmp = martian.make_path('tmp.bed')
    tmp_dir = os.path.dirname(tmp)
    tmp_sorted = martian.make_path('tmp_sorted.bed')
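    ## so that GNU sort orders rows in reference order, chromosome names are
    ## temporarily replaced by their numeric index (chrom_index), the merged file
    ## is sorted numerically, and the names are restored via index_chrom when the
    ## final output is written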
    calls = [[args.sc_cnv_calls, args.internal_cnv_calls],
             [args.sc_unmerged_cnv_calls, args.internal_unmerged_cnv_calls]]
    out_calls = [outs.node_cnv_calls, outs.node_unmerged_cnv_calls]
    for call_files, out in zip(calls, out_calls):
        with open(tmp, 'w') as outf:
            for f in call_files:
                for l in open(f):
                    fields = l.split()
                    # offset internal node indices by ncells
                    if f == call_files[1]:
                        fields[3] = str(int(fields[3]) + ncells)
                    # fix type of confidence field to integer
                    fields[-1] = str(int(float(fields[-1])))
                    # replace index number at start for sorting
                    fields[0] = chrom_index[fields[0]]
                    outf.write('\t'.join(fields) + '\n')

        no_unicode = dict(os.environ, LC_ALL='C')  # byte-wise sort order; keep PATH and TMPDIR available
        tmp_mem_gib = max(1, int(np.ceil(float(os.path.getsize(tmp)) / (1024**3))))
        try:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   '--parallel=1',  # force sort to use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)
        # on some systems, --parallel is unavailable
        except subprocess.CalledProcessError:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   # will by default only use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)

        # map the numeric chromosome index back to its name and write the final output
        with open(out, 'w') as outf:
            version = martian.get_pipelines_version()
            outf.write("#cellranger-dna {}\n".format(version))
            outf.write("#reference genome: {}\n".format(args.reference_path))
            outf.write("#chrom\tstart\tend\tid\tcopy_number\tevent_confidence\n")
            for l in open(tmp_sorted):
                l = l.split('\t')
                l[0] = index_chrom[l[0]]
                outf.write('\t'.join(l))

    os.remove(tmp)
    os.remove(tmp_sorted)

    ## cnv tracks file
    sc_windows = load_h5(args.sc_cnv_tracks, "windows")
    internal_windows = load_h5(args.internal_cnv_tracks, "windows")
    windows = sc_windows.append(internal_windows).values
    constants = load_h5(args.sc_cnv_tracks, "constants")
    
    sc_ploidy_conf = scale_confidence_score(load_h5(args.sc_cnv_tracks, 
        "ploidy_conf").values)
    internal_ploidy_conf = scale_confidence_score(load_h5(
        args.internal_cnv_tracks, "ploidy_conf").values)
    
    sc_scale_factor = load_h5(args.sc_cnv_tracks, "scale_factor")
    internal_scale_factor = load_h5(args.internal_cnv_tracks, "scale_factor")

    sc_rpb = load_h5(args.sc_cnv_tracks, "reads_per_bin")
    internal_rpb = load_h5(args.internal_cnv_tracks, "reads_per_bin")
    
    X = load_h5(args.sc_cnv_tracks, "cnv_tracks").values
    nbins = X.shape[1]
    Q = np.zeros((nnodes, nbins), dtype=X.dtype)
    Q[0:ncells, :] = X
    del X
    Q[ncells:, :] = load_h5(args.internal_cnv_tracks, "cnv_tracks").values

    store = pd.HDFStore(outs.node_cnv_tracks, "w")
    store["constants"] = constants
    store["windows"] = sc_windows.append(internal_windows)
    store["ploidy_conf"] = sc_ploidy_conf.append(internal_ploidy_conf)
    store["scale_factor"] = sc_scale_factor.append(internal_scale_factor)
    store["reads_per_bin"] = sc_rpb.append(internal_rpb)
    store["cnv_tracks"] = pd.DataFrame(Q)
    store.close()
    
    ## Compute heterogeneity and store in tree_data
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    if args.tracks is None:
        gmask = np.ones(nbins, dtype=bool)
    else:
        gmask = []
        maptrack = pd.HDFStore(args.tracks, "r")
        for chrom in chroms:
            gmask.extend(maptrack["/map/"+chrom].values > MAPPABILITY_THRESHOLD)
        maptrack.close( )
        gmask = np.array(gmask)

    ## update tree data
    # load tree
    store = pd.HDFStore( args.tree_data, "r" )
    Z = store["/Z"].values
    distances = store["/distances"].values
    constants = store["/constants"]
    store.close( )

    # Compute the heterogeneity at every *internal* node of the tree
    # obviously the heterogeneity is zero at every leaf, so don't
    # store a bunch of zeros
    levels = 6
    het = compute_heterogeneity(Q, Z, gmask, windows, levels=levels)

    del Q

    # dump to disk
    store = pd.HDFStore( outs.tree_data, "w" )
    store["Z"] = pd.DataFrame(Z)
    store["het"] = pd.DataFrame(het)
    store["distances"] = pd.Series(distances)
    store["windows"] = pd.Series(windows)
    store["constants"] = constants
    store.close( )

    del het

    ## normalized profiles
    sc_store = pd.HDFStore(args.sc_norm_profiles, "r")
    internal_store = pd.HDFStore(args.internal_norm_profiles, "r")
    out_store = pd.HDFStore(outs.norm_node_profiles, "w")
    out_store["/constants"] = sc_store["/constants"]
    for chrom in chroms:
        ## first do the /contigs
        X = sc_store["/contigs/"+chrom].values
        Y = internal_store["/contigs/"+chrom].values
        assert X.shape[1] == Y.shape[1]
        nbins = X.shape[1]
        prof = np.zeros((2*ncells-1, nbins), dtype=X.dtype)
        prof[:ncells, :] = X
        prof[ncells:, :] = Y
        out_store["/contigs/"+chrom] = pd.DataFrame(prof)
        del X, Y, prof

        ## next do the /masks
        out_store["/masks/"+chrom] = sc_store["/masks/"+chrom]
    ## gc params
    for key in ["scale", "linear", "quadratic"]:
        out_store["/gc_params/"+key] = pd.concat([sc_store["/gc_params/"+key],
            internal_store["/gc_params/"+key]], ignore_index=True)

    ## do the normalization metrics
    out_store["/normalization_metrics"] =sc_store["normalization_metrics"].append(internal_store["/normalization_metrics"], ignore_index=True)

    out_store.close()
    sc_store.close()
    internal_store.close()