Exemplo n.º 1
0
def join_matrices(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk matrix HDF5 files and write the combined matrices.

    The merged matrices are saved to ``outs.matrices_h5`` (with extra
    attributes built from the sample id, the GEM groups and the chemistry
    description) and to ``outs.matrices_mex``. ``chunk_defs`` is not used.
    """
    merged = cr_matrix.merge_matrices(
        [out.matrices_h5 for out in chunk_outs])

    chemistry_description = cr_chem.get_description(args.chemistry_def)
    attrs = cr_matrix.make_matrix_attrs_count(
        args.sample_id, args.gem_groups, chemistry_description)

    merged.save_h5(outs.matrices_h5, extra_attrs=attrs)
    merged.save_mex(outs.matrices_mex)
Exemplo n.º 2
0
def join_matrices(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk matrix HDF5 files into a single matrix and save it.

    The merged matrix is written to ``outs.matrices_h5`` (with extra
    attributes built from the sample id, the GEM groups and the chemistry
    description) and exported through ``rna_matrix.save_mex`` to
    ``outs.matrices_mex``, passing along the pipelines version.
    ``chunk_defs`` is not used.
    """
    merged = cr_matrix.merge_matrices(
        [out.matrices_h5 for out in chunk_outs])

    chemistry_description = cr_chem.get_description(args.chemistry_def)
    attrs = cr_matrix.make_matrix_attrs_count(
        args.sample_id, args.gem_groups, chemistry_description)
    merged.save_h5_file(outs.matrices_h5, extra_attrs=attrs)

    pipelines_version = martian.get_pipelines_version()
    rna_matrix.save_mex(merged, outs.matrices_mex, pipelines_version)
Exemplo n.º 3
0
def join(args, outs, chunk_defs, chunk_outs):
    """Combine the per-chunk VCFs and allele-barcode matrices.

    The chunks' filtered variant files are combined into
    ``outs.filtered_variants``. The raw and the likelihood allele-barcode
    matrices are each merged across chunks and then saved in both H5 and MEX
    form to the corresponding ``outs`` paths. ``args`` and ``chunk_defs`` are
    not used.
    """
    outs.coerce_strings()

    tk_io.combine_vcfs(
        outs.filtered_variants,
        [out.filtered_variants for out in chunk_outs])

    raw_merged = cr_matrix.merge_matrices(
        [out.raw_allele_bc_matrices_h5 for out in chunk_outs])
    likelihood_merged = cr_matrix.merge_matrices(
        [out.likelihood_allele_bc_matrices_h5 for out in chunk_outs])

    # Save order is kept: raw (H5, MEX) first, then likelihood (H5, MEX).
    save_plan = (
        (raw_merged,
         outs.raw_allele_bc_matrices_h5,
         outs.raw_allele_bc_matrices_mex),
        (likelihood_merged,
         outs.likelihood_allele_bc_matrices_h5,
         outs.likelihood_allele_bc_matrices_mex),
    )
    for merged, h5_path, mex_path in save_plan:
        merged.save_h5(h5_path)
        merged.save_mex(mex_path)
Exemplo n.º 4
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the subsampled matrices of all chunks and write the summary.

    Chunks are grouped by their ``(subsample_type, target_rpc,
    subsample_rate)`` key. For every group the raw and the filtered matrices
    are merged, metrics are computed on the merged filtered matrix, both
    merged matrices are saved to newly created HDF5 paths, and a descriptor
    dict is appended to ``outs.subsampled_matrices``. Finally the merged
    reporter writes its summary JSON to ``outs.summary``.

    If there are no chunks, or the first chunk definition has no subsample
    type or no subsample rate, ``outs.summary`` is set to ``None`` and the
    function returns without doing anything else.

    Args:
        args: Stage arguments (not used here).
        outs: Stage outputs; ``summary`` and ``subsampled_matrices`` are set.
        chunk_defs: Per-chunk definitions, each with a ``subsample_info`` dict.
        chunk_outs: Per-chunk outputs, each providing ``chunked_reporter`` and
            a ``subsampled_matrices`` dict.
    """
    # Materialize the pairs: on Python 3 zip() returns a one-shot iterator
    # that does not support len(), which made the emptiness check fail.
    chunks = list(zip(chunk_defs, chunk_outs))

    # Check for an empty chunk
    if not chunks:
        outs.summary = None
        return

    first_info = chunk_defs[0].subsample_info
    if (first_info.get('subsample_type') is None
            or first_info.get('subsample_rate') is None):
        outs.summary = None
        return

    def chunk_key(chunk):
        """Return the grouping key of a (chunk_def, chunk_out) pair."""
        info = chunk[0].subsample_info
        return (info['subsample_type'], info['target_rpc'],
                info['subsample_rate'])

    # Merge reporter objects from main
    reporter_file_names = [
        chunk_out.chunked_reporter for chunk_out in chunk_outs
        if os.path.isfile(chunk_out.chunked_reporter)
    ]
    merged_reporter = cr_report.merge_reporters(reporter_file_names)

    outs.subsampled_matrices = []

    # Aggregate the molecule info chunks that belong together.
    # groupby() only groups adjacent items, hence the sort by the same key.
    for chunk_group, (subsample_key, chunk_iter) in enumerate(
            itertools.groupby(sorted(chunks, key=chunk_key), chunk_key)):
        subsample_type, target_rpc, subsample_rate = subsample_key

        if subsample_type is None or subsample_rate is None:
            continue

        # Aggregate information over chunks with same key
        chunk_raw_h5s = []
        chunk_filtered_h5s = []
        all_subsample_types = cr_constants.ALL_SUBSAMPLE_TYPES
        all_target_rpc = None

        for chunk_def, chunk_out in chunk_iter:
            # List of target rpcs should be identical among all chunks
            chunk_all_target_rpc = chunk_def.subsample_info['all_target_rpc']
            assert (all_target_rpc is None
                    or all_target_rpc == chunk_all_target_rpc)
            all_target_rpc = chunk_all_target_rpc

            chunk_raw_h5s.append(chunk_out.subsampled_matrices['raw_matrices'])
            chunk_filtered_h5s.append(
                chunk_out.subsampled_matrices['filtered_matrices'])

        raw_matrices = cr_matrix.merge_matrices(chunk_raw_h5s)
        filtered_matrices = cr_matrix.merge_matrices(chunk_filtered_h5s)

        # Compute metrics on subsampled matrices
        merged_reporter.summarize_subsampled_matrices_cb(
            filtered_matrices, subsample_type, target_rpc)

        # Output paths are made unique per group via the group index.
        raw_path = martian.make_path(
            '%s_%s_%s_raw_matrices.h5' %
            (subsample_type, target_rpc, chunk_group))
        filtered_path = martian.make_path(
            '%s_%s_%s_filtered_matrices.h5' %
            (subsample_type, target_rpc, chunk_group))

        # Write the merged matrices
        outs.subsampled_matrices.append({
            'subsample_type': subsample_type,
            'target_rpc': target_rpc,
            'subsample_rate': subsample_rate,
            'all_subsample_types': all_subsample_types,
            'all_target_rpc': all_target_rpc,
            'raw_matrices': raw_path,
            'filtered_matrices': filtered_path,
        })

        # Never overwrite a matrix file written for another group.
        assert not os.path.exists(raw_path)
        assert not os.path.exists(filtered_path)

        raw_matrices.save_h5(raw_path)
        filtered_matrices.save_h5(filtered_path)

    merged_reporter.report_summary_json(filename=outs.summary)
Exemplo n.º 5
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the aggregated raw/filtered matrices and update the summary.

    Steps, in order:
      1. Load the input summary JSON and the library/barcode information from
         the molecule info file.
      2. Merge the raw matrices, save them as H5 (with the custom aggr columns
         and the library map as extra attributes), write the barcode summary
         H5 and the raw MEX output, then release the raw matrix.
      3. Merge the filtered matrices, save them as H5, and for every library
         type (and genome, where applicable) add median counts, median
         detected features and the fraction of reads in filtered barcodes to
         the summary.
      4. Write the filtered MEX output and the sanitized summary JSON.

    ``chunk_defs`` and ``chunk_outs`` are not used.
    """
    pipelines_version = martian.get_pipelines_version()

    with open(args.summary) as summary_in:
        summary = json.load(summary_in)

    with MoleculeCounter.open(args.molecules, 'r') as counter:
        library_info = counter.get_library_info()
        barcode_info = counter.get_barcode_info()
        barcode_seqs = counter.get_barcodes()

    library_types = sorted({lib['library_type'] for lib in library_info})

    # Attributes for user-added columns in the aggr csv, plus the original
    # library/gem group information.
    h5_attrs = get_custom_aggr_columns(args.sample_defs)
    h5_attrs.update(cr_matrix.make_library_map_aggr(args.gem_group_index))

    # --- Raw matrix ---
    raw_merged = cr_matrix.merge_matrices(args.raw_matrices_h5)
    raw_merged.save_h5_file(outs.raw_matrix_h5, extra_attrs=h5_attrs)

    raw_genomes = raw_merged.get_genomes()

    # Barcode summary HDF5 file with GEX data for the barcode rank plot
    with h5py.File(outs.barcode_summary_h5, 'w') as bc_summary:
        cr_io.create_hdf5_string_dataset(
            bc_summary, cr_constants.H5_BC_SEQUENCE_COL, raw_merged.bcs)

        gex_view = raw_merged.view().select_features_by_type(
            lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)
        gex_counts = gex_view.sum(axis=0).astype('uint64')

        if len(raw_genomes) == 1:
            dataset_genome = raw_genomes[0]
        else:
            dataset_genome = lib_constants.MULTI_REFS_PREFIX
        bc_summary.create_dataset(
            '_%s_transcriptome_conf_mapped_deduped_barcoded_reads' % dataset_genome,
            data=gex_counts)

    rna_matrix.save_mex(raw_merged, outs.raw_matrix_mex, pipelines_version)
    # Drop the raw matrix before the filtered one is merged.
    del raw_merged

    # --- Filtered matrix ---
    filtered_merged = cr_matrix.merge_matrices(args.filtered_matrices_h5)
    filtered_merged.save_h5_file(outs.filtered_matrix_h5, extra_attrs=h5_attrs)

    # Summarize the matrix across library types and genomes
    for library_type in library_types:
        metric_prefix = rna_library.get_library_type_metric_prefix(library_type)

        # Library types without genomes are summarized once, under the
        # multi-reference prefix.
        lib_genomes = [None]
        if rna_library.has_genomes(library_type):
            lib_genomes = filtered_merged.get_genomes()

        lib_view = filtered_merged.view().select_features_by_type(library_type)

        for genome in lib_genomes:
            has_genome = genome is not None
            if has_genome:
                genome_view = lib_view.select_features_by_genome(genome)
                genome_index = barcode_info.genomes.index(genome)
                key_parts = (metric_prefix, genome)
            else:
                genome_view = lib_view
                genome_index = None
                key_parts = (metric_prefix, lib_constants.MULTI_REFS_PREFIX)

            # Keep only barcodes passing filter for this (lib_type, genome)
            passing_bcs = MoleculeCounter.get_filtered_barcodes(
                barcode_info,
                library_info,
                barcode_seqs,
                genome_idx=genome_index,
                library_type=library_type)
            genome_view = genome_view.select_barcodes_by_seq(passing_bcs)

            features_detected = genome_view.count_ge(
                axis=0, threshold=cr_constants.MIN_COUNTS_PER_GENE)
            median_features = np.median(features_detected)
            median_counts = np.median(genome_view.sum(axis=0))

            if has_genome:
                genome_flt_reads = summary['%s%s_flt_mapped_reads' % key_parts]
                genome_raw_reads = summary['%s%s_raw_mapped_reads' % key_parts]
                summary['%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % key_parts] = \
                    tk_stats.robust_divide(genome_flt_reads, genome_raw_reads)

            summary['%s%s_filtered_bcs_median_counts' % key_parts] = median_counts
            summary['%s%s_filtered_bcs_median_unique_genes_detected' % key_parts] = median_features

        # Fraction of reads in cells across all genomes of this library type
        read_key_parts = [
            (metric_prefix, g) for g in lib_genomes if g is not None
        ]
        if not read_key_parts:
            read_key_parts = [(metric_prefix, lib_constants.MULTI_REFS_PREFIX)]

        total_flt_reads = sum(
            summary['%s%s_flt_mapped_reads' % parts] for parts in read_key_parts)
        total_raw_reads = sum(
            summary['%s%s_raw_mapped_reads' % parts] for parts in read_key_parts)

        multi_key_parts = (metric_prefix, lib_constants.MULTI_REFS_PREFIX)
        summary['%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % multi_key_parts] = \
            tk_stats.robust_divide(total_flt_reads, total_raw_reads)

    # Write MEX format (do it last because it converts the matrices to COO)
    rna_matrix.save_mex(filtered_merged, outs.filtered_matrix_mex,
                        pipelines_version)

    with open(outs.summary, 'w') as summary_out:
        json.dump(tk_safe_json.json_sanitize(summary), summary_out,
                  indent=4, sort_keys=True)