Example #1
def join(args, outs, chunk_defs, chunk_outs):
    if args.fragments is None:
        outs.raw_matrix = None
        outs.raw_matrix_mex = None
        return

    # Hstack per-chunk barcode blocks to generate the full peak matrix
    barcodes = []
    sp_matrix = None
    for i, chunk in enumerate(chunk_outs):
        if chunk.raw_matrix is not None and os.path.exists(chunk.raw_matrix):
            cpm = cr_matrix.CountMatrix.load_h5_file(chunk.raw_matrix)
            if i == 0:
                sp_matrix = cpm.m
            else:
                sp_matrix = hstack([sp_matrix, cpm.m])
            barcodes.extend(cpm.bcs)

    genomes = utils.generate_genome_tag(args.reference_path)
    peaks_def = atac_feature_ref.from_peaks_bed(args.peaks, genomes)
    raw_matrix = cr_matrix.CountMatrix(peaks_def, barcodes, sp_matrix)
    raw_matrix.save_h5_file(outs.raw_matrix,
                            sw_version=martian.get_pipelines_version())
    if not os.path.exists(outs.raw_matrix_mex):
        os.mkdir(outs.raw_matrix_mex)
    atac_matrix.save_mex(raw_matrix, outs.raw_matrix_mex,
                         cr_lib_constants.ATACSEQ_LIBRARY_TYPE,
                         martian.get_pipelines_version())
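A minimal, self-contained sketch of the stacking step above: per-chunk column blocks (barcodes) are hstacked into one peaks x barcodes matrix while the barcode list is accumulated in the same order. The chunk data here is made up for illustration; only scipy is needed for the stacking itself.

import numpy as np
from scipy.sparse import csc_matrix, hstack

# Two fake chunks over the same 3 peaks, with 2 and 4 barcodes respectively.
chunk_blocks = [csc_matrix(np.eye(3, 2, dtype=int)),
                csc_matrix(np.ones((3, 4), dtype=int))]
chunk_barcodes = [['AAAC-1', 'AAAG-1'], ['AACA-1', 'AACC-1', 'AACG-1', 'AACT-1']]

sp_matrix, barcodes = None, []
for block, bcs in zip(chunk_blocks, chunk_barcodes):
    # First chunk seeds the matrix; later chunks are appended column-wise.
    sp_matrix = block if sp_matrix is None else hstack([sp_matrix, block])
    barcodes.extend(bcs)

assert sp_matrix.shape == (3, len(barcodes))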
Example #2
def join(args, outs, chunk_defs, chunk_outs):
    ref_mgr = ReferenceManager(args.reference_path)
    if args.filtered_matrix is None or args.peak_motif_hits is None or len(
            ref_mgr.list_species()) > 1:
        outs.filtered_tf_bc_matrix = None
        outs.filtered_tf_bc_matrix_mex = None
        outs.tf_propZ_matrix = None
        return

    # motif scan is completed in ANNOTATE_PEAKS

    peaks = BedTool(args.peaks)
    motifs = Motifs(args.reference_path)

    peak_motif_hits = BedTool(args.peak_motif_hits)

    # extract peak coordinate to numerical index map
    peak_idx, n_peaks = _get_peak_indexes(peaks)

    # extract motif names to numerical index map
    motif_idx, n_motifs = _get_motif_indexes(motifs)

    # extract 3 lists: peak indexes, motif indexes and counts; each entry corresponds to a peak-motif pair
    peak_coor, motif_coor, values = motifscan_bed_to_sparse_matrix(
        peak_motif_hits, peak_idx, motif_idx, format='binary')

    # convert to a sparse matrix (binary by default); motifs are rows and peaks are columns
    tf_peak_matrix = sp.csr_matrix((values, (motif_coor, peak_coor)),
                                   shape=(n_motifs, n_peaks),
                                   dtype='int32')

    # compute the motif-BC matrix via pooling:
    # count the number of hits for each motif within the peaks observed in each barcode,
    # then cast the result as a CountMatrix
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_matrix)
    motif_names = motif_idx.keys()
    barcodes = peak_matrix.bcs
    genomes = utils.generate_genome_tag(args.reference_path)
    motifs_def = atac_feature_ref.from_motif_list(motif_names, genomes)
    tf_matrix = cr_matrix.CountMatrix(motifs_def, barcodes,
                                      tf_peak_matrix * peak_matrix.m)

    # perform MAD-zscoring of proportion values
    propZ_matrix = np.array(tf_matrix.m / peak_matrix.m.sum(axis=0))
    propZ_matrix = MADzscore(propZ_matrix)

    outs.coerce_strings()

    # save to h5 and csv
    tf_matrix.save_h5_file(outs.filtered_tf_bc_matrix,
                           sw_version=martian.get_pipelines_version())
    if not os.path.exists(outs.filtered_tf_bc_matrix_mex):
        os.mkdir(outs.filtered_tf_bc_matrix_mex)
    atac_matrix.save_mex(
        tf_matrix,
        outs.filtered_tf_bc_matrix_mex,
        feature_type=cr_lib_constants.ATACSEQ_LIBRARY_DERIVED_TYPE,
        sw_version=martian.get_pipelines_version())
    # save propZ matrix as gz
    np.savetxt(outs.tf_propZ_matrix, propZ_matrix)
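MADzscore is a helper defined elsewhere in this codebase; below is a minimal sketch of what a median/MAD-based z-score typically computes. The axis choice and the 1.4826 consistency factor (which makes the MAD comparable to a standard deviation for normal data) are assumptions about that helper, not its actual signature.

import numpy as np

def mad_zscore_sketch(x, axis=1, scale=1.4826):
    # Center by the median and scale by the median absolute deviation (robust z-score).
    med = np.median(x, axis=axis, keepdims=True)
    mad = np.median(np.abs(x - med), axis=axis, keepdims=True)
    return (x - med) / (scale * mad)

# z-score each motif (row) across barcodes (columns)
props = np.array([[0.10, 0.20, 0.15],
                  [0.50, 0.40, 0.45]])
print(mad_zscore_sketch(props))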
Example #3
def join(args, outs, chunk_defs, chunk_outs):
    if args.raw_matrix is None:
        outs.filtered_matrix = None
        return

    # consume cell barcodes across all species and exit with an error if not provided
    if args.cell_barcodes is None:
        martian.exit("cell barcodes not provided")
    cell_barcodes = utils.load_cell_barcodes(args.cell_barcodes, with_species=True)

    # Read the peak matrix file and keep only cell barcodes
    # Remove cell barcodes that were specified externally, such as in the reanalyzer,
    # which may not be present in the raw matrix because they're missing from the fragments file
    present_cell_barcodes = {}
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.raw_matrix)
    peak_matrix_bcs = set(peak_matrix.bcs)
    for species in cell_barcodes:
        present_cell_barcodes[species] = set()
        for bc in cell_barcodes[species]:
            if bc not in peak_matrix_bcs:
                martian.log_info("{} not found in the raw peak - bc matrix".format(bc))
            else:
                present_cell_barcodes[species].add(bc)

    peak_matrix = peak_matrix.filter_barcodes(present_cell_barcodes)
    if peak_matrix.features_dim == 0:
        martian.log_info("data has no peaks, skipping the clustering analysis")
        outs.filtered_matrix = None
        outs.filtered_matrix_mex = None
        return

    peak_matrix = prune(peak_matrix, num_analysis_bcs=args.num_analysis_bcs, random_state=args.random_seed)

    if peak_matrix.bcs_dim <= analysis_constants.MAX_N_CLUSTERS_DEFAULT:
        martian.log_info("Insufficient number of cell barcodes present after processing")
        outs.filtered_matrix = None
        outs.filtered_matrix_mex = None
        return

    if peak_matrix.features_dim < analysis_constants.MAX_N_CLUSTERS_DEFAULT:
        martian.log_info("Insufficient number of peaks present after processing")
        outs.filtered_matrix = None
        outs.filtered_matrix_mex = None
        return

    # save processed matrix
    peak_matrix.save_h5_file(outs.filtered_matrix, sw_version=martian.get_pipelines_version())
    if not os.path.exists(outs.filtered_matrix_mex):
        os.mkdir(outs.filtered_matrix_mex)
    atac_matrix.save_mex(peak_matrix, outs.filtered_matrix_mex,
                         cr_lib_constants.ATACSEQ_LIBRARY_TYPE,
                         sw_version=martian.get_pipelines_version())
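A small sketch (plain scipy rather than CountMatrix.filter_barcodes, whose internals are not shown here) of keeping only the matrix columns whose barcodes fall in the cell set, preserving the original column order:

import numpy as np
from scipy.sparse import csc_matrix

matrix = csc_matrix(np.arange(12).reshape(3, 4))           # 3 peaks x 4 barcodes
barcodes = ['AAAC-1', 'AAAG-1', 'AACA-1', 'AACC-1']
cell_set = {'AAAG-1', 'AACC-1'}

keep = [i for i, bc in enumerate(barcodes) if bc in cell_set]
filtered = matrix[:, keep]                                  # column subset
filtered_bcs = [barcodes[i] for i in keep]
assert filtered.shape == (3, 2) and filtered_bcs == ['AAAG-1', 'AACC-1']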
Example #4
def get_pipeline_info(args, reference, debug):
    """Generates a table of general pipeline information.
    """
    metadata = reference.metadata

    def get_fastq_paths(sample_def):
        if sample_def is None:
            return ""
        else:
            paths = [x["read_path"] for x in sample_def]
            return "\n".join(paths)

    rows = [
        ['Sample ID', args.sample_id],
        ['Sample description', args.sample_desc],
        ['FASTQ path', get_fastq_paths(args.sample_def)],
        ['Pipeline version',
         martian.get_pipelines_version()],
        ['Reference path', args.reference_path],
    ]

    if metadata:
        rows.extend([
            ['Organism', metadata.get('organism')],
            ['Assembly', metadata.get('assembly')],
            ['Annotation', metadata.get('annotation')],
        ])

    if debug:
        rows.append(['Barcode Whitelist', args.barcode_whitelist])

    data = {'pipeline_info_table': {'rows': rows}}
    data['pipeline_helptext'] = {'title': 'Sample', 'data': []}
    return data
Example #5
def simple_load_metrics(summary_metrics, metrics_fn):
    with open(metrics_fn, 'r') as infile:
        metrics = json.load(infile)
    summary_metrics.update(metrics)
    summary_metrics['cellranger-atac_version'] = martian.get_pipelines_version()
    return summary_metrics
Example #6
def main(args, outs):
    genomes = cr_matrix.GeneBCMatrices.load_genomes_from_h5(
        args.filtered_matrices)
    chemistry = cr_matrix.GeneBCMatrices.load_chemistry_from_h5(
        args.filtered_matrices)
    total_cells = cr_matrix.GeneBCMatrices.count_cells_from_h5(
        args.filtered_matrices)
    summary = {
        'chemistry_description': chemistry,
        'filtered_bcs_transcriptome_union': total_cells
    }
    with open(outs.summary, 'w') as f:
        json.dump(summary, f, indent=4, sort_keys=True)

    sample_properties = ReanalyzeSampleProperties(
        sample_id=args.analysis_id,
        sample_desc=args.analysis_desc,
        genomes=genomes,
        version=martian.get_pipelines_version())
    sample_properties = dict(sample_properties._asdict())

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.summary,
        analysis_path=args.analysis,
    )

    sample_data = cr_webshim.load_sample_data(sample_properties,
                                              sample_data_paths)
    cr_webshim.build_web_summary_html(outs.web_summary, sample_properties,
                                      sample_data, PIPELINE_REANALYZE)
Example #7
def main(args, outs):
    cr_report.merge_jsons(args.summaries, outs.metrics_summary_json)

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.metrics_summary_json,
        barcode_summary_path=args.barcode_summary_h5,
        analysis_path=args.analysis,
        filtered_barcodes_path=args.filtered_barcodes,
    )

    genomes = cr_utils.get_reference_genomes(args.reference_path)
    sample_properties = CountSampleProperties(
        sample_id=args.sample_id,
        sample_desc=args.sample_desc,
        genomes=genomes,
        version=martian.get_pipelines_version())
    sample_properties = dict(sample_properties._asdict())

    sample_data = cr_webshim.load_sample_data(sample_properties,
                                              sample_data_paths)

    cr_webshim.build_web_summary_html(outs.web_summary,
                                      sample_properties,
                                      sample_data,
                                      PIPELINE_COUNT,
                                      alerts_output_filename=outs.alerts)
    cr_webshim.build_metrics_summary_csv(outs.metrics_summary_csv,
                                         sample_properties, sample_data,
                                         PIPELINE_COUNT)
Example #8
def get_pipeline_info(args, reference, debug):
    """Generates a table of general pipeline information.
    """
    metadata = reference.metadata

    rows = [
        ['Sample ID', args.sample_id],
        ['Sample description', args.sample_desc],
        ['Pipeline version',
         martian.get_pipelines_version()],
        ['Reference path', args.reference_path],
    ]

    if metadata:
        rows.extend([
            ['Organism', metadata.get('organism')],
            ['Assembly', metadata.get('assembly')],
            ['Annotation', metadata.get('annotation')],
        ])

    if debug:
        rows.append(['Barcode Whitelist', args.barcode_whitelist])

    data = {'pipeline_info_table': {'rows': rows}}
    return data
Example #9
def main(args, outs):
    metrics = {}
    for fname in args.metrics:
        if fname is not None:
            with open(fname, 'r') as f:
                metrics.update(json.load(f))

    # Normalize "NaN" values
    for key in metrics:
        value = metrics[key]
        if str(value) == 'NaN' or (isinstance(value, float) and np.isnan(value)):
            metrics[key] = None

    # add version info
    metrics['cellranger-atac_version'] = martian.get_pipelines_version()

    if len(metrics) > 0:
        martian.log_info('Writing out summary_metrics')
        with open(outs.metrics, 'w') as outfile:
            outfile.write(tenkit.safe_json.safe_jsonify(metrics, pretty=True))

    # compile summary.csv metrics
    # load library info and fake libraries as species
    metric_registry = MetricAnnotations()
    metrics_csv_dict = {}
    if args.library_info is not None:
        with open(args.library_info, 'r') as f:
            library_info = pickle.load(f)
        library_list = [library_info[n]['library_id'] for n in library_info.keys()]
        metrics_csv_dict.update(metric_registry.compile_summary_metrics(metrics, species_list=library_list))

    # load species level metrics
    ctg_mgr = ReferenceManager(args.reference_path)
    metrics_csv_dict.update(metric_registry.compile_summary_metrics(metrics, species_list=ctg_mgr.list_species()))
    write_dict_to_csv(outs.metrics_csv, metrics_csv_dict, sort=True)
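The NaN normalization above exists because json.dump emits bare NaN, which strict JSON parsers reject. A self-contained sketch of the same idea using only the standard library (math.isnan stands in for np.isnan, and json.dumps for the tenkit safe_jsonify helper):

import json
import math

metrics = {'frac_mapped': 0.97, 'median_insert_size': float('nan'), 'note': 'NaN'}
for key, value in list(metrics.items()):
    if str(value) == 'NaN' or (isinstance(value, float) and math.isnan(value)):
        metrics[key] = None

print(json.dumps(metrics, sort_keys=True))
# {"frac_mapped": 0.97, "median_insert_size": null, "note": null}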
Example #10
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()
    input_vfr = tk_io.VariantFileReader(args.input)

    bc_mix_prob = args.bc_mix_prob
    min_var_hap_conf = args.min_var_hap_conf
    min_junction_hap_conf = args.min_junction_hap_conf
    hap_block_size = args.hap_block_size
    hap_block_buffer_size = args.hap_block_buffer_size
    max_reassign_rounds = args.max_reassign_rounds
    chrom, start, stop = tk_io.get_locus_info(args.locus)

    output_file = open(outs.default.strip('.gz'), 'w')
    fragment_output_file = open(outs.fragment_phasing.strip('.gz'), 'w')
    vc_mode, _, _, _ = tk_io.get_vc_mode(args.vc_precalled, args.vc_mode)

    # Add the component name and the version of the phasing code
    new_source = "10X/pipelines/stages/snpindels/phase_snpindels %s" % martian.get_pipelines_version(
    )
    new_filters = [
        ("10X_PHASING_INCONSISTENT",
         "Uses haplotype information from the fragments and the alleles to filter some variants that are not consistent with phasing."
         ),
        ("10X_HOMOPOLYMER_UNPHASED_INSERTION",
         "Unphased insertions in homopolymer regions tend to be false positives"
         )
    ]
    new_formats = [
        ("PS", 1, "Integer", "ID of Phase Set for Variant"),
        ("PQ", 1, "Integer",
         "Phred QV indicating probability at this variant is incorrectly phased"
         ),
        ("JQ", 1, "Integer",
         "Phred QV indicating probability of a phasing switch error in gap prior to this variant"
         ),
    ]
    vfw = tk_io.VariantFileWriter(output_file,
                                  template_file=open(args.input),
                                  new_source=new_source,
                                  new_format_fields=new_formats,
                                  new_filters=new_filters)
    if args.do_phasing:
        phaser = Phaser(input_vfr, args.fragments, chrom, start, stop,
                        bc_mix_prob, min_junction_hap_conf, min_var_hap_conf,
                        hap_block_buffer_size, hap_block_size,
                        max_reassign_rounds, vc_mode)
        phaser.call_haps(vfw, fragment_output_file)
    else:
        pass_variants(input_vfr,
                      vfw,
                      chrom,
                      start,
                      stop,
                      strip_phasing_info=True)
    output_file.close()
    fragment_output_file.close()

    tk_tabix.sort_unique_tabix_vcf(outs.default.strip('.gz'))
Example #11
def join_matrices(args, outs, chunk_defs, chunk_outs):
    chunk_h5s = [chunk_out.matrices_h5 for chunk_out in chunk_outs]
    matrix = cr_matrix.merge_matrices(chunk_h5s)
    matrix_attrs = cr_matrix.make_matrix_attrs_count(
        args.sample_id, args.gem_groups,
        cr_chem.get_description(args.chemistry_def))
    matrix.save_h5_file(outs.matrices_h5, extra_attrs=matrix_attrs)

    rna_matrix.save_mex(matrix, outs.matrices_mex,
                        martian.get_pipelines_version())
Example #12
def make_sample_info(args):

    p = {
        "sample_def": args.sample_def,
        "reference_path": args.reference_path,
        "sample_id": args.sample_id,
        "sample_desc": args.sample_desc,
        "version": martian.get_pipelines_version()
    }
    return p
Example #13
def join(args, outs, _chunk_defs, _chunk_outs):
    filtered_matrix = filter_barcodes(args, outs)

    matrix_attrs = cr_matrix.make_matrix_attrs_count(
        args.sample_id, args.gem_groups,
        cr_chem.get_description(args.chemistry_def))
    filtered_matrix.save_h5_file(outs.filtered_matrices_h5,
                                 extra_attrs=matrix_attrs)

    rna_matrix.save_mex(filtered_matrix, outs.filtered_matrices_mex,
                        martian.get_pipelines_version())
Example #14
def write_analysis_parameters(analysis_params_outfn):
    with open(analysis_params_outfn, 'w') as analysis_params_out:
        analysis_params = {
            'analysis_version': martian.get_pipelines_version(),
            # Dropping meowmix version -- we're moving to putting special reference datasets into main repo
            'meowmix_version': "99.9.9",
            # Lena needs this set, even though we're not trimming
            'lead_trim': 0,
        }
        analysis_params_out.write(
            tenkit.safe_json.safe_jsonify(analysis_params))
Example #15
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    outs.raw_matrix_mex = None
    if args.fragments is None:
        outs.raw_matrix = None
        return

    with open(args.peaks, 'r') as infile:
        full_peaks = tk_bio.get_target_regions(infile)
    with open(args.peaks, 'r') as pfile:
        peaks_dict = OrderedDict(
            ("{}:{}-{}".format(*peak.strip("\n").split("\t")), num)
            for num, peak in enumerate(pfile))

    with open(args.barcodes, 'r') as barcode_file:
        barcodes_dict = OrderedDict(
            (bc.strip('\n'), num) for num, bc in enumerate(barcode_file))

    if len(barcodes_dict) == 0:
        outs.raw_matrix = None
        return

    # get matrix counts
    peak_bc_counts = Counter()
    for contig, start, stop, barcode, _ in open_fragment_file(args.fragments):
        if barcode not in barcodes_dict:
            continue
        for pos in (start, stop):
            if contig in full_peaks.keys():
                peak = full_peaks[contig].get_region_containing_point(pos)
                if peak is not None:
                    peak_bc_counts[barcodes_dict[barcode],
                                   peaks_dict['{}:{}-{}'.format(
                                       contig, peak[0], peak[1])]] += 1

    data, col, row = (), (), ()
    if len(peak_bc_counts) > 0:
        data, col, row = zip(*[(val, key[0], key[1])
                               for key, val in peak_bc_counts.iteritems()])
    sp_matrix = csc_matrix(
        coo_matrix((data, (row, col)),
                   shape=(len(peaks_dict), len(barcodes_dict)),
                   dtype=int))

    # save as a CountMatrix
    genomes = utils.generate_genome_tag(args.reference_path)
    peaks_def = atac_feature_ref.from_peaks_bed(args.peaks, genomes)
    raw_matrix = cr_matrix.CountMatrix(peaks_def, barcodes_dict.keys(),
                                       sp_matrix)
    raw_matrix.save_h5_file(outs.raw_matrix,
                            sw_version=martian.get_pipelines_version())
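A compact sketch of the Counter-to-sparse-matrix step in this stage, with made-up (barcode_index, peak_index) -> count entries in place of the fragment scan:

from collections import Counter
from scipy.sparse import coo_matrix, csc_matrix

n_peaks, n_barcodes = 3, 2
# keys are (barcode_index, peak_index), values are fragment-end counts
peak_bc_counts = Counter({(0, 1): 2, (1, 0): 1, (1, 2): 5})

data, col, row = zip(*[(val, bc_idx, peak_idx)
                       for (bc_idx, peak_idx), val in peak_bc_counts.items()])
sp_matrix = csc_matrix(coo_matrix((data, (row, col)),
                                  shape=(n_peaks, n_barcodes), dtype=int))
assert sp_matrix.shape == (3, 2) and sp_matrix[2, 1] == 5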
Example #16
def main(args, outs):
    outs.coerce_strings()
    bam_in = tk_bam.create_bam_infile(args.bucket[0])
    bam_out, _ = tk_bam.create_bam_outfile(outs.default,
                                           None,
                                           None,
                                           template=bam_in,
                                           pg=tk_bam.make_pg_header(
                                               martian.get_pipelines_version(),
                                               "sort_reads_by_bc"))
    bam_in.close()

    outs.total_reads = merge_by_key(args.bucket, bc_sort_key, bam_out)
    bam_out.close()
Example #17
def main(args, outs):
    summary = {}

    filtered_mat = cr_matrix.CountMatrix.load_h5_file(
        args.filtered_matrices_h5)
    genomes = filtered_mat.get_genomes()

    # get metrics from other summaries
    if args.analyze_matrices_summary:
        with open(args.analyze_matrices_summary) as reader:
            analysis_summary = json.load(reader)
        summary.update(analysis_summary)

    with open(args.normalize_depth_summary, 'r') as reader:
        summary.update(json.load(reader))
        agg_batches = summary['batches']

    with open(outs.summary, 'w') as f:
        json.dump(summary, f, indent=4, sort_keys=True)

    # build web summary
    sample_properties = AggrSampleProperties(
        sample_id=args.sample_id,
        sample_desc=args.sample_desc,
        genomes=genomes,
        version=martian.get_pipelines_version(),
        agg_batches=agg_batches)
    sample_properties = dict(sample_properties._asdict())

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.summary,
        barcode_summary_path=args.barcode_summary_h5,
        analysis_path=args.analysis,
    )

    sample_data = cr_webshim.load_sample_data(sample_properties,
                                              sample_data_paths)
    cr_webshim.build_web_summary_html(outs.web_summary, sample_properties,
                                      sample_data, PIPELINE_AGGR)
Example #18
def write_mask_bed(bedfile, store, chroms, window_size, ref, args):
    """Write a BED file corresponding to the mask=True regions
    for our profiles."""
    chroms = sorted(chroms)
    with open(bedfile,'w') as outfile:
        version = martian.get_pipelines_version()
        outfile.write("#cellranger-dna {}\n".format(version))
        outfile.write("#reference genome: {}\n".format(args.reference_path))
        outfile.write("#chrom\tstart\tend\n")

        for chrom in chroms:
            chrom_length = ref.contig_lengths[chrom]
            mask = store['/masks/'+chrom]
            start,end = None,None
            in_mask = False
            for i in xrange(len(mask)):
                if not in_mask and mask[i]:
                    start = i
                    in_mask = True
                if in_mask and not mask[i]:
                    end = i
                    in_mask = False
                    outfile.write('\t'.join(str(s) for s in [
                        chrom,
                        start * window_size,
                        min(end * window_size, chrom_length),
                    ]) + os.linesep)
            if in_mask and \
               (start is not None) and \
               (end is None or end*window_size<chrom_length):
                outfile.write('\t'.join(str(s) for s in [
                    chrom,
                    start * window_size,
                    chrom_length,
                ]) + os.linesep)
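The run-extraction loop above can also be expressed with numpy edge detection; a short sketch that turns a boolean mask into half-open [start*window, end*window) intervals clipped to the contig length (the function name is illustrative only):

import numpy as np

def mask_to_intervals(mask, window_size, chrom_length):
    # Pad with False so runs touching either end still produce a start/end edge pair.
    padded = np.concatenate(([False], np.asarray(mask, dtype=bool), [False]))
    edges = np.flatnonzero(np.diff(padded.astype(int)))
    starts, ends = edges[0::2], edges[1::2]
    return [(int(s) * window_size, min(int(e) * window_size, chrom_length))
            for s, e in zip(starts, ends)]

print(mask_to_intervals([0, 1, 1, 0, 1], window_size=1000, chrom_length=4500))
# [(1000, 3000), (4000, 4500)]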
Example #19
def main(args, outs):
    summary = {}

    # add stats from matrices
    filtered_mats = cr_matrix.GeneBCMatrices.load_h5(args.filtered_matrices_h5)
    genomes = filtered_mats.get_genomes()
    cells_per_genome = {}
    for genome in genomes:
        matrix = filtered_mats.matrices[genome]
        cells_per_genome[genome] = matrix.bcs_dim
        median_gene_counts = np.median(
            matrix._sum(matrix.m >= cr_constants.MIN_READS_PER_GENE, axis=0))
        median_umi_counts = np.median(matrix._sum(matrix.m, axis=0))
        summary.update({
            '%s_filtered_bcs' % genome:
            cells_per_genome[genome],
            '%s_filtered_bcs_median_counts' % genome:
            median_umi_counts,
            '%s_filtered_bcs_median_unique_genes_detected' % genome:
            median_gene_counts,
        })
    del filtered_mats

    # get metrics from other summaries
    if args.analyze_matrices_summary:
        with open(args.analyze_matrices_summary) as reader:
            analysis_summary = json.load(reader)
        summary.update(analysis_summary)

    with open(args.normalize_depth_summary, 'r') as reader:
        data = json.load(reader)
        raw_conf_mapped_per_genome = data['raw_conf_mapped_per_genome']
        downsample_map = data['downsample_info']
        mol_counter_metrics = data['mol_counter_metrics']

    with open(args.count_genes_summary, 'r') as reader:
        data = json.load(reader)
        flt_conf_mapped_per_genome = data['flt_conf_mapped_per_genome']

    for genome in flt_conf_mapped_per_genome:
        frac_reads_in_cells = tk_stats.robust_divide(
            flt_conf_mapped_per_genome[genome],
            raw_conf_mapped_per_genome[genome])
        summary['%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' %
                genome] = frac_reads_in_cells

    # Pass chemistry metrics through to output
    summary.update({
        k: v
        for k, v in mol_counter_metrics.iteritems()
        if k.startswith('chemistry_')
    })

    # Molecule counter metrics
    gem_groups = []
    total_reads_per_gem_group = []
    downsampled_reads_per_gem_group = []
    for (gg, submetrics) in mol_counter_metrics[
            cr_mol_counter.GEM_GROUPS_METRIC].iteritems():
        gem_groups.append(gg)
        total_reads = submetrics[cr_mol_counter.GG_TOTAL_READS_METRIC]
        total_reads_per_gem_group.append(total_reads)
        # If metric is missing, assume no downsampling was done
        downsampled = submetrics.get(
            cr_mol_counter.GG_DOWNSAMPLED_READS_METRIC, total_reads)
        downsampled_reads_per_gem_group.append(downsampled)
    total_reads = sum(total_reads_per_gem_group)
    downsampled_reads = sum(downsampled_reads_per_gem_group)
    total_cells = sum(cells_per_genome.values())
    mean_reads_per_cell = tk_stats.robust_divide(total_reads, total_cells)
    downsampled_mean_reads_per_cell = tk_stats.robust_divide(
        downsampled_reads, total_cells)
    summary.update({
        'pre_normalization_total_reads':
        total_reads,
        'post_normalization_total_reads':
        downsampled_reads,
        'filtered_bcs_transcriptome_union':
        total_cells,
        'pre_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc':
        mean_reads_per_cell,
        'post_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc':
        downsampled_mean_reads_per_cell,
    })

    # Downsampling metrics
    gem_group_index = args.gem_group_index
    agg_batches = []
    lowest_frac_reads_kept = 1.0
    for (gg, rpg) in zip(gem_groups, total_reads_per_gem_group):
        dinfo = downsample_map[str(gg)]
        (library_id, old_gg) = gem_group_index[str(gg)]
        batch = library_id + ('-%d' % old_gg if old_gg > 1 else '')
        agg_batches.append(batch)
        # calc summary metrics
        frac_reads_kept = dinfo['frac_reads_kept']
        lowest_frac_reads_kept = min(lowest_frac_reads_kept, frac_reads_kept)
        summary['%s_frac_reads_kept' % batch] = frac_reads_kept
        summary['%s_pre_normalization_raw_reads_per_filtered_bc' %
                batch] = tk_stats.robust_divide(dinfo['total_reads'],
                                                dinfo['cells'])
        summary['%s_pre_normalization_cmb_reads_per_filtered_bc' %
                batch] = tk_stats.robust_divide(dinfo['cmb_reads'],
                                                dinfo['cells'])
        # this is an internal metric, so keep using gem group instead of batch
        summary['%s_total_reads_per_gem_group' % gg] = frac_reads_kept * rpg
    summary['lowest_frac_reads_kept'] = lowest_frac_reads_kept

    with open(outs.summary, 'w') as f:
        json.dump(summary, f, indent=4, sort_keys=True)

    # build web summary
    sample_properties = cr_webshim.get_sample_properties(
        args.aggregation_id,
        args.aggregation_desc,
        genomes,
        version=martian.get_pipelines_version(),
        agg_batches=agg_batches)

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.summary,
        barcode_summary_path=args.barcode_summary_h5,
        analysis_path=args.analysis,
    )

    sample_data = cr_webshim.load_sample_data(sample_properties,
                                              sample_data_paths)
    cr_webshim.build_web_summary_html(outs.web_summary, sample_properties,
                                      sample_data, PIPELINE_AGGR)
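tk_stats.robust_divide is used throughout the summary above; a minimal sketch of the behaviour it is relied on for here, i.e. a float division that does not raise on a zero denominator (the exact fallback value is an assumption about the helper):

def robust_divide_sketch(numerator, denominator):
    # Guess at the guard: return NaN rather than raising ZeroDivisionError.
    return float(numerator) / denominator if denominator else float('nan')

assert robust_divide_sketch(50, 100) == 0.5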
Example #20
def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [rna_read_def, rna_read2_def,
                 bc_read_def, si_read_def, umi_read_def]
    read_tags = [None, None,
                 (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
                 (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
                 (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
             ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(umi_length=cr_chem.get_umi_length(args.chemistry_def),
                                  primers=cr_utils.get_primers_from_dicts(args.primers),
                                  gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, None, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist, bc_check_rc.in_iter)
    bc_check_rc.close()

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def, args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def, args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(r2_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()


    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def, args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def, args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def, args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def, args.reads_interleaved, r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file, compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file, compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(*[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads cause issues with the STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]), bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read, args.gem_group, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
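A tiny sketch of the barcode handling at the "Reverse complement" step above: the sequence is reverse-complemented and the quality string is reversed in step so the two stay aligned (revcomp here stands in for tk_seq.get_rev_comp, whose implementation is not shown):

def revcomp(seq):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    return ''.join(complement[base] for base in reversed(seq))

bc_read = ('read1', 'ACGTN', 'IIIFF')   # (name, seq, qual)
bc_read = (bc_read[0], revcomp(bc_read[1]), bc_read[2][::-1])
assert bc_read == ('read1', 'NACGT', 'FFIII')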
Example #21
def main(args, outs):
    """Mark exact duplicate reads in the output BAM file while also writing out some summary statistics.
    PCR duplicates have the same read1 start site and read2 start site.
    """
    args.coerce_strings()
    outs.coerce_strings()

    # Chunk output doesn't get indexed
    outs.fragments_index = None
    outs.index = None

    # Pull in prior likelihoods for barcodes
    raw_barcode_abundance = None
    barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)
    if args.raw_barcode_counts is not None and barcode_whitelist is not None:
        with open(args.raw_barcode_counts, 'r') as infile:
            raw_counts = json.load(infile)
        raw_barcode_abundance = {
            '{}-{}'.format(barcode, gem_group): count
            for gem_group, subdict in raw_counts.iteritems()
            for barcode, count in zip(barcode_whitelist, subdict['bc_counts'])
        }

    bam_in = create_bam_infile(args.input)
    bam_refs = bam_in.references

    bam_prefix, ext = os.path.splitext(outs.output)
    raw_bam_file = martian.make_path(bam_prefix + '_five_prime_pos_sorted' +
                                     ext)

    frag_prefix, ext = os.path.splitext(outs.fragments)
    raw_frag_file = martian.make_path(frag_prefix + '_raw' + ext)

    # only write CO line for one chunk, so we don't have duplicates after samtools merge
    if args.chunk_num == 0:
        COs = [
            '10x_bam_to_fastq:R1(SEQ:QUAL,TR:TQ)',
            '10x_bam_to_fastq:R2(SEQ:QUAL,TR:TQ)',
            '10x_bam_to_fastq:I1(BC:QT)', '10x_bam_to_fastq:I2(CR:CY)',
            '10x_bam_to_fastq_seqnames:R1,R3,I1,R2'
        ]
    else:
        COs = None

    bam_out, _ = tk_bam.create_bam_outfile(
        raw_bam_file,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "mark_duplicates", TENX_PRODUCT_NAME)
        ],
        cos=COs)
    fragments_out = open(raw_frag_file, 'w')
    bam_in.reset()

    # Ensure the summary key indicates what kind of dup marking was actually performed.
    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)
    reference_manager = ReferenceManager(args.reference_path)
    summarizer = DupSummary(split_bcs=False,
                            lane_coordinate_system=lane_coord_sys,
                            output_bam=bam_out,
                            output_tsv=fragments_out,
                            ref=reference_manager,
                            bam_refs=bam_refs,
                            priors=raw_barcode_abundance)

    # Now broadcast the selected reads to the summarizers
    consumers = [summarizer.read_consumer()]
    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # Close outfiles
    bam_out.close()
    fragments_out.close()

    # Feed the chunk barcode_counts data back to join()
    with open(outs.singlecell_mapping, 'w') as outfile:
        pickle.dump(summarizer.bc_counts, outfile)

    # Sort the output bam & tsv files
    sort_bam(raw_bam_file,
             outs.output,
             threads=martian.get_threads_allocation())
    sort_bed(raw_frag_file,
             outs.fragments,
             genome=reference_manager.fasta_index,
             threads=martian.get_threads_allocation(),
             leave_key=True)
Example #22
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """

    chunk = args.chunk

    #subsample_rate = 1.0
    #if args.subsample_rate is not None:
    #    subsample_rate = args.subsample_rate

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs"))

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts, then all high-quality BC reads are allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) }

    # set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    if chunk['barcode']:
        processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index']:
        sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter)

    # First read
    read = bam_in.next()

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]


        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name != None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name))
                assert(si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        reads_attached = 0
        #emit_read_pair = random.random() < subsample_rate
        emit_read_pair = True

        while read.qname == read_name or read_name == None:
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            reads_attached += 1
            if not (read_name is None):
                assert(read.qname == read_name)

            if emit_read_pair:
                # Count the perfect reads -- will be used when subsampling in dedup
                if tenkit.read_filter.stringent_read_filter(read, require_barcode_for_stringent):
                    perfect_read_count += 1

                if args.exclude_non_bc_reads:
                    if not(tk_io.get_read_barcode(read) is None):
                        bam_out.write(read)
                else:
                    bam_out.write(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        # We may have more than 2 reads if there was a
        # secondary alignment, but fewer than 2 means
        # something went wrong
        assert(reads_attached >= 2)


    outs.perfect_read_count = perfect_read_count
    bam_out.close()
Example #23
def join(args, outs, chunk_defs, chunk_outs):
    summary_files = [
        args.reads_summary,
        args.filter_umis_summary,
        args.filter_barcodes_summary,
        args.trim_reads_summary,
        args.filter_reads_summary,
        args.filter_contigs_summary,
        args.report_contigs_summary,
        args.report_contig_alignments_summary,
        args.raw_consensus_summary,
        args.group_clonotypes_summary,
    ]

    summary_files = [
        sum_file for sum_file in summary_files if not sum_file is None
    ]

    cr_report.merge_jsons(summary_files, outs.metrics_summary_json)

    # Copy barcode summary h5
    if args.barcode_summary:
        cr_utils.copy(args.barcode_summary, outs.barcode_summary)

    # Copy cell barcodes
    if args.cell_barcodes:
        cr_utils.copy(args.cell_barcodes, outs.cell_barcodes)

    # Copy barcode support
    if args.barcode_support:
        cr_utils.copy(args.barcode_support, outs.barcode_support)

    # Copy barcode umi summary
    if args.barcode_umi_summary:
        cr_utils.copy(args.barcode_umi_summary, outs.barcode_umi_summary)

    # Copy umi info
    if args.umi_info:
        cr_utils.copy(args.umi_info, outs.umi_info)

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.metrics_summary_json,
        barcode_summary_path=args.barcode_summary,
        vdj_clonotype_summary_path=args.clonotype_summary,
        vdj_barcode_support_path=args.barcode_support,
    )

    # Determine chain type for the report
    if args.chain_type_spec == vdj_constants.AUTO_CHAIN_TYPE:
        chain_type = args.chain_type_auto
    elif args.chain_type_spec == vdj_constants.ALL_CHAIN_TYPES:
        chain_type = None
    else:
        chain_type = args.chain_type_spec

    sample_properties = VdjSampleProperties(
        sample_id=args.sample_id,
        sample_desc=args.sample_desc,
        chain_type=chain_type,
        version=martian.get_pipelines_version())
    sample_properties = dict(sample_properties._asdict())

    sample_data = cr_webshim.load_sample_data(sample_properties,
                                              sample_data_paths)

    if args.barcode_whitelist is not None:
        cr_webshim.build_web_summary_html(outs.web_summary,
                                          sample_properties,
                                          sample_data,
                                          PIPELINE_VDJ,
                                          alerts_output_filename=outs.alerts)
        cr_webshim.build_metrics_summary_csv(outs.metrics_summary_csv,
                                             sample_properties, sample_data,
                                             PIPELINE_VDJ)
Example #24
def join(args, outs, chunk_defs, chunk_outs):
    ## merge gc params jsons
    node_gc_params = {}
    sc_gc_params = json.load(open(args.sc_gc_params, "r"))
    internal_gc_params = json.load(open(args.internal_gc_params, "r"))

    ncells = len(sc_gc_params['linear'])
    nnodes = 2*ncells - 1

    for key in ["scale", "linear", "quadratic"]:
        node_gc_params[key] = sc_gc_params[key] + internal_gc_params[key]
    with open(outs.node_gc_params, "w") as out:
        json.dump(node_gc_params, out, indent=4)

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    index_chrom = dict([(str(i), c) for i, c in enumerate(chroms)])
    chrom_index = dict([(c, str(i)) for i, c in enumerate(chroms)])
    tmp = martian.make_path('tmp.bed')
    tmp_dir = os.path.dirname(tmp)
    tmp_sorted = martian.make_path('tmp_sorted.bed')
    calls = [[args.sc_cnv_calls, args.internal_cnv_calls],
             [args.sc_unmerged_cnv_calls, args.internal_unmerged_cnv_calls]]
    out_calls = [outs.node_cnv_calls, outs.node_unmerged_cnv_calls]
    for call_files, out in zip(calls, out_calls):
        with open(tmp, 'w') as outf:
            for f in call_files:
                for l in open(f):
                    fields = l.split()
                    # offset internal node indices by ncells
                    if f == call_files[1]:
                        fields[3] = str(int(fields[3]) + ncells)
                    # fix type of confidence field to integer
                    fields[-1] = str(int(float(fields[-1])))
                    # replace index number at start for sorting
                    fields[0] = chrom_index[fields[0]]
                    outf.write('\t'.join(fields) + '\n')

        no_unicode = dict(LC_ALL='C')
        tmp_mem_gib = max(1, int(np.ceil(float(os.path.getsize(tmp)) / (1024**3))))
        try:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   '--parallel=1',  # force sort to use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)
        # on some systems, --parallel is unavailable
        except subprocess.CalledProcessError:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   # will by default only use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)

        # strip index column into outfile
        with open(out, 'w') as outf:
            version = martian.get_pipelines_version()
            outf.write("#cellranger-dna {}\n".format(version))
            outf.write("#reference genome: {}\n".format(args.reference_path))
            outf.write("#chrom\tstart\tend\tid\tcopy_number\tevent_confidence\n")
            for l in open(tmp_sorted):
                l = l.split('\t')
                l[0] = index_chrom[l[0]]
                outf.write('\t'.join(l))

    os.remove(tmp)
    os.remove(tmp_sorted)

    ## cnv tracks file
    sc_windows = load_h5(args.sc_cnv_tracks, "windows")
    internal_windows = load_h5(args.internal_cnv_tracks, "windows")
    windows = sc_windows.append(internal_windows).values
    constants = load_h5(args.sc_cnv_tracks, "constants")
    
    sc_ploidy_conf = scale_confidence_score(load_h5(args.sc_cnv_tracks, 
        "ploidy_conf").values)
    internal_ploidy_conf = scale_confidence_score(load_h5(
        args.internal_cnv_tracks, "ploidy_conf").values)
    
    sc_scale_factor = load_h5(args.sc_cnv_tracks, "scale_factor")
    internal_scale_factor = load_h5(args.internal_cnv_tracks, "scale_factor")

    sc_rpb = load_h5(args.sc_cnv_tracks, "reads_per_bin")
    internal_rpb = load_h5(args.internal_cnv_tracks, "reads_per_bin")
    
    X = load_h5(args.sc_cnv_tracks, "cnv_tracks").values
    nbins = X.shape[1]
    Q = np.zeros((nnodes, nbins), dtype=X.dtype)
    Q[0:ncells, :] = X
    del X
    Q[ncells:, :] = load_h5(args.internal_cnv_tracks, "cnv_tracks").values

    store = pd.HDFStore(outs.node_cnv_tracks, "w")
    store["constants"] = constants
    store["windows"] = sc_windows.append(internal_windows)
    store["ploidy_conf"] = sc_ploidy_conf.append(internal_ploidy_conf)
    store["scale_factor"] = sc_scale_factor.append(internal_scale_factor)
    store["reads_per_bin"] = sc_rpb.append(internal_rpb)
    store["cnv_tracks"] = pd.DataFrame(Q)
    store.close()
    
    ## Compute heterogeneity and store in tree_data
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    if args.tracks is None:
        gmask = np.ones(nbins, dtype=bool)
    else:
        gmask = []
        maptrack = pd.HDFStore(args.tracks, "r")
        for chrom in chroms:
            gmask.extend(maptrack["/map/"+chrom].values > MAPPABILITY_THRESHOLD)
        maptrack.close( )
        gmask = np.array(gmask)

    ## update tree data
    # load tree
    store = pd.HDFStore( args.tree_data, "r" )
    Z = store["/Z"].values
    distances = store["/distances"].values
    constants = store["/constants"]
    store.close( )

    # Compute the heterogeneity at every *internal* node of the tree
    # obviously the heterogeneity is zero at every leaf, so don't
    # store a bunch of zeros
    levels = 6
    het = compute_heterogeneity(Q, Z, gmask, windows, levels=levels)

    del Q

    # dump to disk
    store = pd.HDFStore( outs.tree_data, "w" )
    store["Z"] = pd.DataFrame(Z)
    store["het"] = pd.DataFrame(het)
    store["distances"] = pd.Series(distances)
    store["windows"] = pd.Series(windows)
    store["constants"] = constants
    store.close( )

    del het

    ## normalized profiles
    sc_store = pd.HDFStore(args.sc_norm_profiles, "r")
    internal_store = pd.HDFStore(args.internal_norm_profiles, "r")
    out_store = pd.HDFStore(outs.norm_node_profiles, "w")
    out_store["/constants"] = sc_store["/constants"]
    for chrom in chroms:
        ## first do the /contigs
        X = sc_store["/contigs/"+chrom].values
        Y = internal_store["/contigs/"+chrom].values
        assert X.shape[1] == Y.shape[1]
        nbins = X.shape[1]
        Z = np.zeros((2*ncells-1, nbins), dtype=X.dtype)
        Z[:ncells, :] = X
        Z[ncells:, :] = Y
        out_store["/contigs/"+chrom] = pd.DataFrame(Z)
        del X, Y, Z

        ## next do the /masks
        out_store["/masks/"+chrom] = sc_store["/masks/"+chrom]
    ## gc params
    for key in ["scale", "linear", "quadratic"]:
        out_store["/gc_params/"+key] = pd.concat([sc_store["/gc_params/"+key],
            internal_store["/gc_params/"+key]], ignore_index=True)

    ## do the normalization metrics
    out_store["/normalization_metrics"] =sc_store["normalization_metrics"].append(internal_store["/normalization_metrics"], ignore_index=True)

    out_store.close()
    sc_store.close()
    internal_store.close()
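A small pure-Python sketch of the sort trick used for the CNV call files above: chromosome names are replaced by their numeric reference index so a numeric sort on the first column reproduces reference order, then mapped back when writing the output:

chroms = ['chr1', 'chr2', 'chrX']
chrom_index = {c: str(i) for i, c in enumerate(chroms)}
index_chrom = {str(i): c for i, c in enumerate(chroms)}

rows = [['chrX', '100', '200'], ['chr1', '500', '600'], ['chr2', '50', '75']]
encoded = [[chrom_index[r[0]]] + r[1:] for r in rows]
encoded.sort(key=lambda r: (int(r[0]), int(r[1]), int(r[2])))
decoded = [[index_chrom[r[0]]] + r[1:] for r in encoded]
assert [r[0] for r in decoded] == ['chr1', 'chr2', 'chrX']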
Example #25
def main(args, outs):
    """
    Mark exact duplicate reads in the BAM file. Duplicates have the same read1 start site and read2 start site
    """

    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)
    template = BamTemplateShim(bam_in, keep_comments=(args.chunk_index==0))
    
    if args.write_bam:
        bam_prefix, ext = os.path.splitext(outs.output)
        out_bam_name = bam_prefix + '_five_prime_pos_sorted' + ext
        bam_out, _ = tk_bam.create_bam_outfile(out_bam_name, None, None, template=template,
                                               pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(),
                                                                          "mark_duplicates")])
        outs.index = None # chunk bams don't get indexed
    else:
        bam_out = None
        outs.output = None
        outs.index = None

    # Determine whether the BAM has 10x barcodes
    bam_in.reset()
    has_barcodes = [crdna_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)]
    have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1

    # All read duplicate marking - these dup decisions are written to bam_out
    # the output bam has BC aware dup marking if available.
    # Ensure the summary key indicates what kind of dup marking was actually performed.
    if have_barcodes:
        no_filter_dups_bcs =    DupSummary(False, 1.0, True,  "no_filter_full_use_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold)
    else:
        no_filter_dups_bcs =    DupSummary(False, 1.0, True,  "no_filter_full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold)


    # Dup marking on all perfect reads
    full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold, tag_counts=True)
    full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold)

    dup_sums = [full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs, no_filter_dups_no_bcs]

    # Now broadcast the selected reads to the summarizers
    # We can't do the points that require a sample_rate > 1.0, so skip those.
    # If we don't have barcodes, don't run the set that are split by barcode.
    consumers = [x.read_consumer() for x in dup_sums if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)]

    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # We close the BAM
    if bam_out:
        bam_out.close()
        # Note - the indexing happens in join
        bam_prefix, _ = os.path.splitext(outs.output)
        tk_bam.sort(out_bam_name, bam_prefix)

    # Package up the summaries:
    dup_results = {}
    for x in dup_sums:
        (dups, optical_dups, diff_dups, custom_diff_dups) = x.result
        desc = x.description
        dup_results[desc] = dups
        optical_desc = "optical_" + desc
        dup_results[optical_desc] = optical_dups
        diff_desc = "diffusion_old_" + desc
        dup_results[diff_desc] = diff_dups
        custom_diff_desc = "diffusion_" + desc
        dup_results[custom_diff_desc] = custom_diff_dups

    if outs.duplicate_summary:
        with open(outs.duplicate_summary, 'w') as f:
            json.dump(dup_results, f, indent=4)
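
The stage above makes a single pass over the BAM chunk and fans each read out to several DupSummary objects via broadcast(). The sketch below shows one way such a broadcast can be built from generator coroutines; it is a toy approximation, not tenkit's actual implementation, and the consumer simply counts what it receives.

def make_counting_consumer(name, results):
    """Toy consumer coroutine: count the items sent to it and record the total on close()."""
    def consumer():
        count = 0
        try:
            while True:
                _ = (yield)
                count += 1
        except GeneratorExit:
            results[name] = count
    gen = consumer()
    next(gen)  # prime the coroutine so it can accept send()
    return gen

def broadcast(source, consumers):
    """Send every item from source to every consumer, then close all consumers."""
    for item in source:
        for consumer in consumers:
            consumer.send(item)
    for consumer in consumers:
        consumer.close()

results = {}
consumers = [make_counting_consumer("use_bcs", results),
             make_counting_consumer("ignore_bcs", results)]
broadcast(iter(range(1000)), consumers)
print(results)  # both consumers saw all 1000 items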
Example #26
def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Build the feature reference
    if args.reference_path:
        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)
    else:
        feature_ref = rna_feature_ref.FeatureReference.empty()

    # Setup feature barcode extraction
    feature_extractor = rna_feature_ref.FeatureExtractor(
        feature_ref, use_feature_types=[args.library_type])

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    num_libraries = len(args.library_info)
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        num_libraries=num_libraries)

    # Determine if barcode sequences need to be reverse complemented.
    with FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                     None, None) as bc_check_rc:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist, True)
        barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                      bc_check_rc.in_iter)

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def,
                            args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(
                r2_reader.in_iter,
                cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    # Record feature counts:
    feature_counts = np.zeros(feature_ref.get_num_features(), dtype=int)

    # Use a feature-barcode reader only if this library type has features to extract; otherwise use a no-op reader
    if feature_extractor.has_features_to_extract():
        feature_reads = FastqFeatureReader(args.read_chunks, feature_extractor,
                                           args.reads_interleaved, r1_length,
                                           r2_length)
    else:
        feature_reads = FastqReader(None, None, None, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads,
                     feature_reads)

    read1_writer = ChunkedFastqWriter(outs.reads,
                                      args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s,
                                          args.reads_per_file,
                                          compression=COMPRESSION)

    tag_writer = None
    if not args.augment_fastq:
        tag_writer = ChunkedFastqWriter(outs.tags,
                                        args.reads_per_file,
                                        compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter,
                                        args.chunk_initial_reads):
        # Downsample
        if random.random() > args.chunk_subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction, feature_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty, or read 2 is empty when paired_end.
            # Empty reads cause issues with the STAR aligner, so eliminate
            # them here.
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        lib_idx = [
            i for i, x in enumerate(args.library_info)
            if x['library_id'] == args.library_id
        ][0]
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              lib_idx,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        feat_raw_bc = None
        feat_proc_bc = None
        feat_qual = None
        feat_ids = None

        if feature_extraction:
            if feature_extraction.barcode:
                feat_raw_bc = feature_extraction.barcode
                feat_qual = feature_extraction.qual

            if len(feature_extraction.ids) > 0:
                feat_proc_bc = feature_extraction.barcode
                feat_ids = ';'.join(feature_extraction.ids)

                # If the read hit a single feature ID, count its frequency
                if len(feature_extraction.ids) == 1:
                    feature_counts[feature_extraction.indices[0]] += 1

        if feat_raw_bc:
            fastq_header1.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                  feat_raw_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                  feat_qual)
        if feat_ids:
            fastq_header1.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG,
                                  feat_proc_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

        if args.augment_fastq:
            read1_writer.write(
                (fastq_header1.to_string(), rna_read[1], rna_read[2]))
        else:
            read1_writer.write((rna_read[0], rna_read[1], rna_read[2]))
            tag_writer.write((fastq_header1.to_string(), '', ''))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            if feat_raw_bc:
                fastq_header2.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                      feat_raw_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                      feat_qual)
            if feat_ids:
                fastq_header2.set_tag(
                    cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

            if args.augment_fastq:
                read2_writer.write(
                    (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))
            else:
                read2_writer.write((rna_read2[0], rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    if not args.augment_fastq:
        tag_writer.close()
    bc_counter.close()

    # Write feature BC read counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()

        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []

        if args.augment_fastq:
            outs.tags = []
        else:
            outs.tags = tag_writer.get_out_paths(len(outs.tags))

        libraries = args.library_info
        library = [
            li for li in libraries if li['library_id'] == args.library_id
        ][0]

        outs.gem_groups = [library['gem_group']] * len(outs.reads)
        outs.library_types = [library['library_type']] * len(outs.reads)
        outs.library_ids = [library['library_id']] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.tags = []
        outs.gem_groups = []
        outs.library_types = []
        outs.library_ids = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    assert args.augment_fastq or len(outs.tags) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
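
Both extract-reads variants carry the raw barcode, UMI, and sample-index sequences (plus their quality strings) inside the FASTQ read name via AugmentedFastqHeader, so that later stages can attach them to alignments. The toy class below illustrates the idea only; the class name and the separator are invented for this illustration and do not claim to match the real AugmentedFastqHeader internals.

class TaggedFastqHeader(object):
    """Toy stand-in: append TAG/VALUE pairs to a FASTQ read name using a fixed separator."""
    SEP = "|||"  # arbitrary separator chosen for this illustration

    def __init__(self, header):
        fields = header.split(self.SEP)
        self.read_name = fields[0]
        self.tags = list(zip(fields[1::2], fields[2::2]))

    def set_tag(self, key, value):
        self.tags.append((key, value))

    def to_string(self):
        parts = [self.read_name]
        for key, value in self.tags:
            parts.extend([key, value])
        return self.SEP.join(parts)

hdr = TaggedFastqHeader("A00111:1:FOO:1:1101:1000:1000")
hdr.set_tag("CR", "ACGTACGTACGTACGT")  # raw cell barcode
hdr.set_tag("UR", "AACCGGTT")          # raw UMI
print(hdr.to_string())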
Example #27
def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained
    trim_defs = compute_trim_defs(
        read_defs, read_tags,
        args.chemistry_def.get('retain_trimmed_suffix_read'))

    outs.bam_comments = sorted(
        set([td.bam_to_fastq for td in trim_defs.itervalues()]))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def,
                              args.reads_interleaved, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                  bc_check_rc.in_iter)
    bc_check_rc.close()

    # Determine which read_iters need to retain trimmed sequence
    # (only one per read type, e.g., one per R1, one per R2, etc.)
    read_types_with_trim_def = set()
    rna_read_trim_defs = None
    rna_read2_trim_defs = None
    bc_read_trim_defs = None
    si_read_trim_defs = None
    umi_read_trim_defs = None

    if rna_read_def.read_type not in read_types_with_trim_def:
        rna_read_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read_def.read_type)
    if rna_read2_def.read_type not in read_types_with_trim_def:
        rna_read2_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read2_def.read_type)
    if bc_read_def.read_type not in read_types_with_trim_def:
        bc_read_trim_defs = trim_defs
        read_types_with_trim_def.add(bc_read_def.read_type)
    if si_read_def.read_type not in read_types_with_trim_def:
        si_read_trim_defs = trim_defs
        read_types_with_trim_def.add(si_read_def.read_type)
    if umi_read_def.read_type not in read_types_with_trim_def:
        umi_read_trim_defs = trim_defs
        read_types_with_trim_def.add(umi_read_def.read_type)

    # Setup read iterators.
    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, rna_read_trim_defs)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, rna_read2_trim_defs)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, bc_read_trim_defs)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, si_read_trim_defs)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, umi_read_trim_defs)
    else:
        umi_reads = FastqReader(None, None, False, None)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    # Compute trim order of the readers; this is to ensure stability in the ordering
    # in which trimmed sequence is added to the TRIMMED_SEQ tags
    trim_order = list(
        np.argsort([
            reader.read_def.read_type for reader in fastq_readers
            if reader.read_def is not None
        ]))

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    # BAM file for auxiliary data that won't fit in a FASTQ header / QNAME
    trimmed_seq_writer = ChunkedBamWriter(outs.trimmed_seqs,
                                          args.reads_per_file)

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction.read if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction.read if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction.read if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction.read if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction.read if umi_extraction is not None else EMPTY_READ

        # Extra trimming for internal purposes
        if args.rna_read_length is not None:
            rna_read = (rna_read[0], rna_read[1][0:args.rna_read_length],
                        rna_read[2][0:args.rna_read_length])

        # Accumulate trimmed sequence; ordering is by read-type (I1,I2,R1,R2)
        # to ensure stability
        trimmed_seq = ''
        trimmed_qual = ''
        for i in trim_order:
            if extractions[i] is None:
                continue
            trimmed_seq += extractions[i].trimmed_seq
            trimmed_qual += extractions[i].trimmed_qual

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              args.gem_group,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        # Write trimmed sequence data to a separate, unaligned BAM file
        # Note: We assume that there is only one trimmed sequence per read-pair
        trimmed_seq_data = pysam.AlignedSegment()
        trimmed_seq_data.query_name = fastq_header_str1.split(
            AugmentedFastqHeader.WORD_SEP)[0]
        trimmed_seq_data.flag = 4
        trimmed_seq_data.seq = trimmed_seq
        trimmed_seq_data.qual = trimmed_qual
        trimmed_seq_writer.write(trimmed_seq_data)

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            read2_writer.write(
                (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    trimmed_seq_writer.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
        outs.trimmed_seqs = trimmed_seq_writer.get_out_paths()
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []
        outs.trimmed_seqs = []

    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)
    assert len(outs.trimmed_seqs) == len(outs.reads)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
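
In Example #27 the trimmed-off bases are written as unmapped records to a side BAM because they would not fit in a FASTQ header. Below is a minimal pysam sketch of writing such unaligned records; the file name is a placeholder, and the attribute names follow current pysam (the example above uses the older .seq/.qual spellings).

import pysam

# A header with no @SQ lines is sufficient for a purely unaligned BAM.
header = {"HD": {"VN": "1.6", "SO": "unknown"}}
with pysam.AlignmentFile("trimmed_seqs.bam", "wb", header=header) as out_bam:
    rec = pysam.AlignedSegment()
    rec.query_name = "A00111:1:FOO:1:1101:1000:1000"
    rec.flag = 4                                    # 0x4 = segment unmapped
    rec.query_sequence = "ACGTACGTAC"               # the trimmed-off bases
    rec.query_qualities = pysam.qualitystring_to_array("IIIIIIIIII")
    out_bam.write(rec)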
Example #28
def main_mark_duplicates(args, outs):
    """
    Mark exact duplicate reads in the BAM file. Duplicates have the same read1 start site and read2 start site.
    """

    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)
    bam_out, tids = tk_bam.create_bam_outfile(
        outs.output,
        None,
        None,
        template=bam_in,
        pg=tk_bam.make_pg_header(martian.get_pipelines_version(),
                                 "mark_duplicates"))

    # Determine whether the BAM has 10x barcodes
    bam_in.reset()
    has_barcodes = [
        tk_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)
    ]
    have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1

    # We do the subsampling to achieve the desired coverage on _perfect reads_, as
    # defined by tenkit.read_filter.stringent_read_filter.  This is tallied in ATTACH_BCS,
    # and passed into the perfect_read_count argument.  We will fail if it's not supplied.
    total_coverage = args.estimated_coverage

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    sampling_rates = []
    for sample_cov in DUPLICATE_SUBSAMPLE_COVERAGES:
        rate = tk_stats.robust_divide(float(sample_cov), total_coverage)
        sampling_rates.append((rate, sample_cov))

    # All read duplicate marking - these dup decisions are written to bam_out
    # the output bam has BC aware dup marking if available.
    # Ensure the summary key indicates what kind of dup marking was actually performed.
    if have_barcodes:
        no_filter_dups_bcs = DupSummary(False, 1.0, True,
                                        "no_filter_full_use_bcs",
                                        lane_coord_sys, bam_out)
        no_filter_dups_no_bcs = DupSummary(False,
                                           1.0,
                                           False,
                                           "no_filter_full_ignore_bcs",
                                           lane_coord_sys,
                                           write_to_stdout=False)
    else:
        no_filter_dups_bcs = DupSummary(False, 1.0, True,
                                        "no_filter_full_use_bcs",
                                        lane_coord_sys)
        no_filter_dups_no_bcs = DupSummary(False,
                                           1.0,
                                           False,
                                           "no_filter_full_ignore_bcs",
                                           lane_coord_sys,
                                           bam_out,
                                           write_to_stdout=False)

    # Dup marking on all perfect reads
    full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys)
    full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs",
                                  lane_coord_sys)

    # Make a battery of duplicate summaries at different coverages, with and w/o
    # barcode splitting
    split_options = [True, False]

    dup_sums = [
        full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs,
        no_filter_dups_no_bcs
    ]

    # Duplicate marking on perfect reads subsampled to the requested coverage
    for (sr, cov) in sampling_rates:
        for split_bc in split_options:
            description = "cov_" + str(cov) + ('_use_bcs'
                                               if split_bc else '_ignore_bcs')
            dup_sums.append(
                DupSummary(True, sr, split_bc, description, lane_coord_sys))

    # Now broadcast the selected reads to the summarizers
    # We can't do the points that require a sample_rate > 1.0, so skip those.
    # If we don't have barcodes, don't run the summaries that split by barcode.
    consumers = [
        x.read_consumer() for x in dup_sums
        if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)
    ]

    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # We close the BAM
    bam_out.close()
    # Note - the indexing happens in join

    # Package up the summaries:
    dup_results = {}
    for x in dup_sums:
        (dups, optical_dups, diff_dups) = x.result
        desc = x.description
        dup_results[desc] = dups
        optical_desc = "optical_" + desc
        dup_results[optical_desc] = optical_dups
        diff_desc = "diffusion_" + desc
        dup_results[diff_desc] = diff_dups

    if outs.duplicate_summary:
        with open(outs.duplicate_summary, 'w') as f:
            json.dump(dup_results, f)
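
The battery of subsampled duplicate summaries above is driven by one rate per target coverage, rate = target_coverage / estimated_coverage, and points whose rate would exceed 1.0 are skipped. A small sketch of that bookkeeping, with placeholder coverages and an assumed robust_divide that returns NaN on a zero denominator:

def robust_divide(numerator, denominator):
    # Assumed behavior: avoid ZeroDivisionError by returning NaN.
    return float(numerator) / denominator if denominator else float("nan")

DUPLICATE_SUBSAMPLE_COVERAGES = [0.25, 1.0, 5.0, 30.0]  # placeholder target coverages
estimated_coverage = 12.0                               # placeholder estimate from upstream

sampling_rates = [(robust_divide(cov, estimated_coverage), cov)
                  for cov in DUPLICATE_SUBSAMPLE_COVERAGES]

# Rates above 1.0 cannot be reached by downsampling, mirroring the
# `sample_rate <= 1.0` filter applied to the consumers above.
usable = [(rate, cov) for rate, cov in sampling_rates if rate <= 1.0]
print(usable)  # [(0.0208..., 0.25), (0.0833..., 1.0), (0.4166..., 5.0)]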
Example #29
def join(args, outs, chunk_defs, chunk_outs):
    if (args.filtered_peak_bc_matrix is None
            or not args.reduction_summary['h5'].keys()):
        outs.analysis = None
        outs.analysis_csv = None
        outs.feature_bc_matrix = None
        return

    # Make the FBM
    # build joint Peak + TF count matrix for single genomes
    # combine peak annotations for single genome analysis
    peak_annotation = None
    if args.peak_annotation:
        annotations = pd.read_csv(args.peak_annotation,
                                  sep='\t')[['gene', 'peak_type']]
        annotations = annotations.replace(np.nan, '', regex=True)
        annotations = annotations.values.astype(str).tolist()
        peak_annotation = []
        for row in annotations:
            genes = row[0].split(";")
            annotation = row[1].split(";")
            promoter = []
            nearby_gene = []
            assert len(annotation) == len(genes)
            for en, kind in enumerate(annotation):
                if kind == 'promoter':
                    promoter += [genes[en]]
                nearby_gene += [genes[en]]
            peak_annotation += [[';'.join(promoter), ';'.join(nearby_gene)]]
    fbm = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix)
    mapping = None
    if args.filtered_tf_bc_matrix:
        # combine matrices, ensure the barcodes are same and ordered the same way
        tf_matrix = cr_matrix.CountMatrix.load_h5_file(
            args.filtered_tf_bc_matrix)
        assert (fbm.bcs == tf_matrix.bcs).all()
        if peak_annotation is not None:
            fbm.feature_ref = FeatureReference.addtags(
                fbm.feature_ref, ['promoter', 'nearby_gene'], peak_annotation)
            tf_matrix.feature_ref = FeatureReference.addtags(
                tf_matrix.feature_ref, ['promoter', 'nearby_gene'])
        combined_feature_defs = FeatureReference.join(fbm.feature_ref,
                                                      tf_matrix.feature_ref)
        combined_matrix = vstack([fbm.m, tf_matrix.m])
        # explicit map linking rows in diffexp to combined matrix
        mapping = np.zeros((tf_matrix.features_dim, 2))
        for x in range(tf_matrix.features_dim):
            mapping[x, 0] = x
            mapping[x, 1] = x + fbm.features_dim
        fbm = cr_matrix.CountMatrix(combined_feature_defs, fbm.bcs,
                                    combined_matrix)
    fbm.save_h5_file(outs.feature_bc_matrix,
                     sw_version=martian.get_pipelines_version())

    # PyTables doesn't support variable-length strings, so use h5py first
    with h5.File(outs.feature_bc_matrix, 'r') as matrix, \
            h5.File(outs.analysis, 'w') as out:
        # TODO: copy the first group; fixme when we have a key
        name = matrix.keys()[0]
        matrix.copy(matrix[name], out, name='matrix')

    factorizations = args.reduction_summary['h5'].keys()
    USE_FACTORIZATION = (DEFAULT_FACTORIZATION
                         if DEFAULT_FACTORIZATION in factorizations
                         else factorizations[0])
    with tables.open_file(outs.analysis, 'a') as out:
        for summary, key in zip([
                args.reduction_summary, args.clustering_summary,
                args.tsne_summary, args.enrichment_analysis_summary
        ], [USE_FACTORIZATION, 'clustering', 'tsne', 'enrichment']):
            if summary is None or not summary:
                continue
            print(key, summary)
            data_h5 = summary['h5'][USE_FACTORIZATION]
            with tables.open_file(data_h5, 'r') as indata:
                indata.copy_children(indata.root, out.root, recursive=True)
            dirname = os.path.join(outs.analysis_csv, key)
            cr_io.copytree(summary['csv'][USE_FACTORIZATION], dirname)

    # if mapping is present (single genome case), so is the coloring matrix
    if mapping is not None:
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('feature_DE_map', data=mapping)
        args.coerce_strings()
        tf_propZ_matrix = np.loadtxt(args.tf_propZ_matrix)
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('diffexp_coloring_matrix', data=tf_propZ_matrix)
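
The h5py block in Example #29 works around PyTables' lack of variable-length string support by copying the matrix group with h5py before PyTables appends the analysis groups. A minimal sketch of that copy step, with placeholder file names:

import h5py as h5

with h5.File("feature_bc_matrix.h5", "r") as matrix, h5.File("analysis.h5", "w") as out:
    # Copy the first (and only) top-level group into the output under the name 'matrix'.
    name = list(matrix.keys())[0]
    matrix.copy(matrix[name], out, name="matrix")

    # Additional datasets, e.g. a row mapping for differential expression, can sit alongside it.
    out.create_dataset("feature_DE_map", data=[[0, 10], [1, 11]])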
Example #30
def join(args, outs, chunk_defs, chunk_outs):
    pred_to_match, _, pred_df, true_df, min_qv = merge_predictions(chunk_outs)

    # Change TRANS type to DISTAL. This change will only
    # affect the type reported, not the names of the metrics.
    new_info = []
    for _, row in pred_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        if sv_type == 'TRANS':
            sv_type = 'DISTAL'
        new_info.append(tk_sv_io.update_info(row.info, ['TYPE'], [sv_type]))
    pred_df['info'] = new_info

    if true_df is not None:
        true_df.to_csv(outs.feasible_gt,
                       index=False,
                       header=True,
                       sep='\t',
                       na_rep='NaN')

    ##### Write BEDPE/VCF outputs
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_candidates)
    source_str = '10X/pipelines/stages/analyze_sv_calls {}'.format(
        martian.get_pipelines_version())
    sample_id = 'sample' if args.sample_id is None else args.sample_id
    tk_sv_io.bedpe_to_vcf(outs.sv_candidates, outs.svs.strip('.gz'), sample_id,
                          source_str, args.reference_path)
    # this will sort and gzip
    tk_sv_io.index_sv_vcf(outs.svs.strip(".gz"))
    outs.svs_index = outs.svs + '.tbi'
    # delete the non-gzipped file
    os.remove(outs.svs.strip('.gz'))

    if not pred_df.empty:
        call_df = pred_df[np.logical_or(pred_df['filters'] == '.',
                                        pred_df['filters'] == "PASS")]
    else:
        call_df = None
    tk_sv_io.write_sv_df_to_bedpe(call_df, outs.sv_calls)

    # Annotate each call with the matching ground truth svs. The resulting
    # dataframe might have multiple rows for the same call if there were multiple
    # matching ground truth svs.
    martian.log_info("merging calls and gt")
    if not pred_df.empty:
        pred_df = merge_calls_and_gt(pred_df, true_df, pred_to_match)

    martian.log_info("writing call_tsv")
    pred_df.to_csv(outs.call_tsv,
                   index=False,
                   header=True,
                   sep='\t',
                   na_rep='NaN')

    pred_df = pred_df[np.logical_not(pd.isnull(pred_df['name']))]

    max_dists = sorted(np.array(args.detect_dists))

    gt_sv_types = get_all_sv_types(true_df)
    call_sv_types = get_all_sv_types(pred_df)

    if true_df is not None:
        # Use the default MAX_PPV_TIER unless this is greater than the maximum tier
        # present in the data.
        max_ppv_tier = min(MAX_PPV_TIER, np.max(true_df.tier))
        # Use the default unless this is smaller than the minimum tier present in
        # the data.
        max_sens_tier = max(MAX_SENS_TIER, np.min(true_df.tier))
    else:
        max_ppv_tier = 1
        max_sens_tier = 1

    tiers = [max_ppv_tier, max_sens_tier]

    # All combinations of filters in ground truth and call set
    if args.targets is not None and args.target_dists is not None:
        target_dists = list(sorted(np.array(args.target_dists,
                                            dtype=np.float)))
        target_dists.append(float('NaN'))
    else:
        target_dists = [float('NaN')]

    combs = product([0, 1, 2, None], target_dists, gt_sv_types, tiers,
                    [True, False], call_sv_types, max_dists)

    metrics = defaultdict(list)

    gt_filters = ['genic_breaks', 'target_dist', 'gt_sv_type', 'tier']
    call_filters = ['call_filtered', 'call_sv_type', 'match_dist']

    for (genic_breaks, tdist, gt_sv_type, tier, is_filtered, call_sv_type,
         dist) in combs:
        if gt_sv_type != 'NA' and call_sv_type != 'NA' and gt_sv_type != call_sv_type:
            continue

        metrics['genic_breaks'].append(genic_breaks)
        metrics['target_dist'].append(tdist)
        metrics['gt_sv_type'].append(gt_sv_type)
        metrics['tier'].append(tier)
        metrics['call_filtered'].append(is_filtered)
        metrics['call_sv_type'].append(call_sv_type)
        metrics['match_dist'].append(dist)

        if true_df is None:
            sel_true_df = None
        else:
            sel_true_df = true_df
            if gt_sv_type != 'NA':
                sel_true_df = sel_true_df[sel_true_df.sv_type == gt_sv_type]
            if not np.isnan(tdist):
                sel_true_df = sel_true_df[sel_true_df.targ_dist <= tdist]
            sel_true_df = sel_true_df[sel_true_df.tier <= tier]
            # Restrict to genic or non-genic breaks, or take everything if this is None.
            if genic_breaks is not None:
                sel_true_df = sel_true_df[sel_true_df.genic_breaks ==
                                          genic_breaks]

            if len(sel_true_df) == 0:
                sel_true_df = None

        sel_pred_df = pred_df

        if is_filtered and not pred_df.empty:
            sel_pred_df = sel_pred_df[(sel_pred_df.filters == '.') |
                                      (sel_pred_df.filters == 'PASS')]
        if call_sv_type != 'NA' and not pred_df.empty:
            sel_pred_df = sel_pred_df[sel_pred_df.sv_type == call_sv_type]
        if not pred_df.empty and (args.min_rel_overlap is None
                                  or args.min_rel_overlap == 0):
            # Do not apply this filter if the matching is done based on overlap.
            sel_pred_df = sel_pred_df[np.logical_or(
                np.isnan(sel_pred_df.match_dist),
                sel_pred_df.match_dist <= dist)]

        add_metrics(sel_pred_df, sel_true_df, metrics)

    column_names = gt_filters
    column_names.extend(call_filters)
    other_names = set(metrics.keys()).difference(set(column_names))
    column_names.extend(other_names)

    metric_df = pd.DataFrame(metrics)
    metric_df = metric_df[column_names]

    martian.log_info("writing summary tsv")
    metric_df.to_csv(outs.summary_tsv,
                     index=False,
                     header=True,
                     sep='\t',
                     na_rep='NaN')

    short_metrics = get_short_metrics(metric_df, other_names, max_ppv_tier,
                                      max_sens_tier, args)

    if args.call_summary is not None:
        with open(args.call_summary, 'r') as in_summary_fn:
            in_summary = json.load(in_summary_fn)
            for key, val in in_summary.iteritems():
                short_metrics[key] = val

    short_metrics['min_qv'] = min_qv

    with open(outs.summary, 'w') as out_file:
        out_file.write(
            tenkit.safe_json.safe_jsonify(short_metrics, pretty=True))
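
add_metrics and get_short_metrics are not shown above, but the core quantities they accumulate for each filter combination are call counts, PPV, and sensitivity derived from the call/ground-truth matching. A toy illustration of that calculation, assuming plain matched flags instead of the real data frames:

def toy_call_metrics(call_matched, truth_matched):
    """call_matched: one bool per call (True if it matched a ground-truth SV).
    truth_matched: one bool per ground-truth SV (True if some call matched it)."""
    n_calls = len(call_matched)
    n_truth = len(truth_matched)
    ppv = float(sum(call_matched)) / n_calls if n_calls else float("nan")
    sensitivity = float(sum(truth_matched)) / n_truth if n_truth else float("nan")
    return {"num_calls": n_calls, "PPV": ppv, "sensitivity": sensitivity}

metrics = toy_call_metrics([True, True, False], [True, False, True, True])
print(metrics)  # PPV = 2/3, sensitivity = 3/4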