Example #1
0
def main(args, outs):
    genomes = cr_matrix.CountMatrix.get_genomes_from_h5(args.filtered_matrices)
    chemistry = cr_matrix.CountMatrix.load_chemistry_from_h5(args.filtered_matrices)
    total_cells = cr_matrix.CountMatrix.count_cells_from_h5(args.filtered_matrices)

    summary = {'chemistry_description': chemistry, 'filtered_bcs_transcriptome_union': total_cells}
    if args.analyze_matrices_summary:
        with open(args.analyze_matrices_summary) as reader:
            analysis_summary = json.load(reader)
        summary.update(analysis_summary)

    with open(outs.summary, 'w') as f:
        json.dump(tk_json.json_sanitize(summary), f, indent=4, sort_keys=True)

    sample_properties = ReanalyzeSampleProperties(sample_id=args.sample_id,
                                                  sample_desc=args.sample_desc,
                                                  genomes=genomes,
                                                  version=martian.get_pipelines_version())
    sample_properties = dict(sample_properties._asdict())

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.summary,
        analysis_path=args.analysis,
    )

    sample_data = cr_webshim.load_sample_data(sample_properties, sample_data_paths)
    cr_webshim.build_web_summary_html(outs.web_summary, sample_properties, sample_data, PIPELINE_REANALYZE)
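Note: nearly every example below passes its summary dict through a json_sanitize helper (tk_json / tk_safe_json) before json.dump. That helper is assumed to recursively coerce numpy scalars/arrays and non-finite floats into JSON-serializable values; a minimal, hypothetical stand-in illustrating the idea:

import math

import numpy as np


def sanitize_for_json(obj):
    # Hypothetical equivalent of json_sanitize: recursively coerce values into
    # something the standard json encoder can handle.
    if isinstance(obj, dict):
        return dict((str(k), sanitize_for_json(v)) for k, v in obj.items())
    if isinstance(obj, (list, tuple, set)):
        return [sanitize_for_json(v) for v in obj]
    if isinstance(obj, np.ndarray):
        return [sanitize_for_json(v) for v in obj.tolist()]
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, (np.floating, float)):
        return None if (math.isnan(obj) or math.isinf(obj)) else float(obj)
    return obj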
Example #2
0
def plot_clonotype_table(chart, sample_properties, sample_data):
    if sample_data.vdj_clonotype_summary is None:
        return None

    clonotypes = sample_data.vdj_clonotype_summary.iloc[0:10]

    # This column used to be called 'cdr3s'; allow the webshim to work on older data
    cdr3_aa_col = 'cdr3s_aa'
    if cdr3_aa_col not in clonotypes:
        cdr3_aa_col = 'cdr3s'

    col_defs = collections.OrderedDict([
        ('clonotype_id', {'label': 'Clonotype ID',
                          'format': 'string',
                          'title': 'Clonotype ID',
                          'style': 'text-align: left'}),
        (cdr3_aa_col,     {'label': 'CDR3s',
                           'format': 'string',
                           'title': 'CDR3s in clonotype',
                           'style': 'text-align: left'}),
        ('frequency',    {'label': 'Frequency',
                          'format': 'integer',
                          'title': 'Number of cells with clonotype',
                          'style': 'text-align: right'}),
        ('proportion',   {'label': 'Proportion',
                          'format': '%0.4f',
                          'title': 'Fraction of cells with clonotype',
                          'style': 'text-align: right'}),
    ])

    cols = []
    for name, col_def in col_defs.iteritems():
        if name not in clonotypes:
            raise ValueError('Column not found in clonotype summary: %s' % name)
        cols.append({
            'label': col_defs[name]['label'],
            'title': col_defs[name]['title'],
        })

    rows = []
    for _, cl_row in clonotypes.iterrows():
        row = []
        for col_name, col_def in col_defs.iteritems():
            value = cl_row[col_name]
            formatted_value = format_value(value, col_def['format'])

            # Make the CDR3 list a bit more readable
            formatted_value = formatted_value.replace(';', '; ')

            row.append({
                'v': tk_safe_json.json_sanitize(value),
                'f': formatted_value,
                's': col_def['style'],
            })
        rows.append(row)

    chart['table'].update({'rows': rows, 'cols': cols})

    return chart
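Each cell emitted above is a dict carrying a raw, JSON-sanitized value ('v'), a pre-formatted display string ('f'), and an inline CSS style ('s'). For illustration only (made-up values), a single rendered clonotype row would look roughly like:

example_row = [
    {'v': 'clonotype1', 'f': 'clonotype1', 's': 'text-align: left'},
    {'v': 'TRA:CAVR...;TRB:CASS...', 'f': 'TRA:CAVR...; TRB:CASS...', 's': 'text-align: left'},
    {'v': 42, 'f': '42', 's': 'text-align: right'},
    {'v': 0.0123, 'f': '0.0123', 's': 'text-align: right'},
]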
Example #3
0
    def build_reference(self):
        print "Creating new reference folder at %s" % self.out_dir
        os.mkdir(self.out_dir)
        print "...done\n"

        print "Writing genome FASTA file into reference folder..."
        new_genome_fasta = os.path.join(self.out_dir, cr_constants.REFERENCE_FASTA_PATH)
        os.mkdir(os.path.dirname(new_genome_fasta))
        self.write_genome_fasta(new_genome_fasta)
        print "...done\n"

        print "Computing hash of genome FASTA file..."
        fasta_hash = cr_utils.compute_hash_of_file(new_genome_fasta)
        print "...done\n"

        print "Writing genes GTF file into reference folder..."
        new_gene_gtf = os.path.join(self.out_dir, cr_constants.REFERENCE_GENES_GTF_PATH)
        os.mkdir(os.path.dirname(new_gene_gtf))
        self.write_genome_gtf(new_gene_gtf)
        print "...done\n"

        print "Computing hash of genes GTF file..."
        gtf_hash = cr_utils.compute_hash_of_file(new_gene_gtf)
        print "...done\n"

        print "Writing genes index file into reference folder (may take over 10 minutes for a 3Gb genome)..."
        new_gene_index = os.path.join(self.out_dir, cr_constants.REFERENCE_GENES_INDEX_PATH)
        os.mkdir(os.path.dirname(new_gene_index))
        self.write_genome_gene_index(new_gene_index, new_gene_gtf, new_genome_fasta)
        print "...done\n"

        print "Writing genome metadata JSON file into reference folder..."
        metadata = {
            cr_constants.REFERENCE_GENOMES_KEY: self.genomes,
            cr_constants.REFERENCE_NUM_THREADS_KEY: int(math.ceil(float(self.mem_gb) / 8.0)),
            cr_constants.REFERENCE_MEM_GB_KEY: self.mem_gb,
            cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
            cr_constants.REFERENCE_GTF_HASH_KEY: gtf_hash,
            cr_constants.REFERENCE_INPUT_FASTA_KEY: [os.path.basename(x) for x in self.in_fasta_fns],
            cr_constants.REFERENCE_INPUT_GTF_KEY: [os.path.basename(x) for x in self.in_gtf_fns],
            cr_constants.REFERENCE_VERSION_KEY: self.ref_version,
            cr_constants.REFERENCE_MKREF_VERSION_KEY: self.mkref_version,
        }
        new_metadata_json = os.path.join(self.out_dir, cr_constants.REFERENCE_METADATA_FILE)
        with open(new_metadata_json, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(metadata), f, sort_keys=True, indent=4)
        print "...done\n"

        print "Generating STAR genome index (may take over 8 core hours for a 3Gb genome)..."
        new_star_path = os.path.join(self.out_dir, cr_constants.REFERENCE_STAR_PATH)
        star = STAR(new_star_path)
        star.index_reference_with_mem_gb(new_genome_fasta, new_gene_gtf,
                                         num_threads=self.num_threads,
                                         mem_gb=self.mem_gb)
        print "...done.\n"

        print ">>> Reference successfully created! <<<\n"
        print "You can now specify this reference on the command line:"
        print "cellranger --transcriptome=%s ..." % self.out_dir
Example #4
0
    def save_gem_class_json(self, base_dir):
        json_file_path = MultiGenomeAnalysis.json_path(base_dir)
        cr_io.makedirs(os.path.dirname(json_file_path), allow_existing=True)
        with open(json_file_path, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(self.result),
                      f,
                      indent=4,
                      sort_keys=True)
Example #5
0
    def report_summary_json(self, filename, summary_json_paths, barcode_summary_h5_path,
                            recovered_cells, cell_bc_seqs):
        """ summary_json_paths: paths to summary jsons containing total_reads and *_conf_mapped_reads_frac
            barcode_summary_h5_path: path to barcode summary h5 file
        """
        d = self.report(summary_json_paths,
                        barcode_summary_h5_path=barcode_summary_h5_path,
                        recovered_cells=recovered_cells,
                        cell_bc_seqs=cell_bc_seqs)
        with open(filename, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(d), f, indent=4, sort_keys=True)
Example #6
0
def merge_by_barcode(in_filenames, r1_out_file, r2_out_file, bcs_out_file,
                     paired_end):
    barcodes = set()

    # Note: The filehandle cache precludes the use of compressed files
    file_cache = tk_cache.FileHandleCache(mode='r', open_func=open)
    heap = []

    key_func = vdj_utils.fastq_barcode_sort_key

    for filename in in_filenames:
        try:
            fastq = tk_fasta.read_generator_fastq(file_cache.get(filename),
                                                  paired_end=paired_end)
            first_readpair = fastq.next()

            key = key_func(first_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)

            heapq.heappush(heap, (key, first_readpair, filename))

        except StopIteration:
            pass

    while len(heap) > 0:
        # Get the minimum item and write it.
        key, readpair, in_filename = heapq.heappop(heap)

        fastq = tk_fasta.read_generator_fastq(file_cache.get(in_filename),
                                              paired_end=paired_end)

        tk_fasta.write_read_fastq(r1_out_file, *readpair[0:3])
        if paired_end:
            tk_fasta.write_read_fastq(r2_out_file, *readpair[3:6])

        # Get the next item from the source file we just wrote from
        # If that file is out of items, then we leave that one out
        try:
            next_readpair = fastq.next()

            key = key_func(next_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)

            heapq.heappush(heap, (key, next_readpair, in_filename))

        except StopIteration:
            pass

    json.dump(tk_safe_json.json_sanitize(list(barcodes)), bcs_out_file)
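The loop above is a k-way merge: the heap is seeded with the first record of every input FASTQ, and each iteration pops the smallest record, writes it, and pushes the next record from that same file. A minimal, self-contained sketch of the same pattern over plain sorted lists (no FASTQ or filehandle-cache machinery):

import heapq


def kway_merge(sorted_lists):
    # Seed the heap with (value, which_list, position) for each list's head.
    heap = []
    for idx, lst in enumerate(sorted_lists):
        if lst:
            heapq.heappush(heap, (lst[0], idx, 0))
    # Pop the global minimum, then push the next element from the same list.
    while heap:
        value, idx, pos = heapq.heappop(heap)
        yield value
        if pos + 1 < len(sorted_lists[idx]):
            heapq.heappush(heap, (sorted_lists[idx][pos + 1], idx, pos + 1))

# list(kway_merge([[1, 4, 9], [2, 3], [5]])) == [1, 2, 3, 4, 5, 9]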
Example #7
0
def build_web_summary_html(filename, sample_properties, sample_data, pipeline,
                           template_dir=None, alerts_output_filename=None):
    view = build_web_summary_json(sample_properties, sample_data, pipeline)

    if not view:
        return

    with open(filename, 'w') as f:
        f.write(template.convert_webshim_json_to_html(view, pipeline, template_dir=template_dir))

    # Write raised alerts to a json file
    if alerts_output_filename is not None:
        with open(alerts_output_filename, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(view.get('alarms', [])), f, indent=4, sort_keys=True)
Example #8
0
def convert_webshim_json_to_html(data, pipeline, template_dir=None):
    if template_dir is None:
        template_dir = DEFAULT_TEMPLATE_DIR
    loader = jinja2.FileSystemLoader(template_dir)
    env = jinja2.Environment(loader=loader,
                             trim_blocks=True,
                             lstrip_blocks=True,
                             variable_start_string='[[',
                             variable_end_string=']]')
    env.globals['include_file'] = lambda name: loader.get_source(env, name)[0]
    template_html = get_template_for_pipeline(pipeline, data)
    template = env.get_template(template_html)
    compressed_data = lz_string.compressToEncodedURIComponent(
        json.dumps(tk_safe_json.json_sanitize(data)))
    return template.render(data=data,
                           js_compressed_data=compressed_data).encode('utf-8')
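The environment above swaps jinja2's default {{ }} variable delimiters for [[ ]], presumably so the rendered HTML can itself contain literal {{ }} for client-side templating. A minimal sketch of the same delimiter configuration (standalone, not the webshim's actual template):

import jinja2

env = jinja2.Environment(variable_start_string='[[',
                         variable_end_string=']]')
template = env.from_string('<h1>[[ title ]]</h1>')
html = template.render(title='Web summary')  # '<h1>Web summary</h1>'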
Example #9
0
def main(args, outs):
    exclusions = {}
    for filename in args.barcode_exclusions:
        if filename is None or not os.path.isfile(filename):
            continue
        with open(filename, "r") as infile:
            data = json.load(infile)
        reason = data["label"]
        for species, barcode_data in data["data"].iteritems():
            if species not in exclusions:
                exclusions[species] = {}
            for barcode, metric in barcode_data.iteritems():
                if barcode in exclusions[species]:
                    # This barcode was already excluded by another file
                    continue
                exclusions[species][barcode] = [reason, metric]

    with open(outs.excluded_barcodes, "w") as outfile:
        json.dump(json_sanitize(exclusions), outfile, indent=4, sort_keys=True)
Example #10
0
def split(args):
    # Write BAM comments to json file
    bam_comment_fn = martian.make_path('bam_comments.json')
    with open(bam_comment_fn, 'w') as f:
        json.dump(args.bam_comments, f)

    # Write library info to a file
    libraries_fn = martian.make_path('libraries.json')
    with open(libraries_fn, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(args.library_info),
                  f,
                  indent=4,
                  sort_keys=True)

    chunks = []
    for chunk_genome_input, tags, gem_group, library_type, library_id in itertools.izip_longest(
            args.genome_inputs, args.tags, args.gem_groups, args.library_types,
            args.library_ids):

        gem_group_str = str(gem_group)
        if gem_group_str in args.skip_translate and library_type in args.skip_translate[
                gem_group_str]:
            this_skip_translate = args.skip_translate[gem_group_str][
                library_type]
        else:
            this_skip_translate = True

        chunks.append({
            'chunk_genome_input': chunk_genome_input,
            'chunk_tags': tags,
            'gem_group': gem_group,
            'library_type': library_type,
            'library_id': library_id,
            'library_info_json': libraries_fn,
            'bam_comments_json': bam_comment_fn,
            'skip_translate': this_skip_translate,
            '__mem_gb': 4,
        })
    join = {
        '__mem_gb': 12,
    }
    return {'chunks': chunks, 'join': join}
Example #11
0
def _plot_differential_expression(chart,
                                  analysis,
                                  clustering=None,
                                  diff_expr=None,
                                  original_cluster_sizes=None):
    n_clusters = clustering.clusters.max()

    # Get the union of top DE genes
    top_genes = set()

    # Limit the number of entries in the DE table
    n_genes = int(
        np.floor(
            float(ws_gex_constants.MAX_DE_TABLE_ENTRIES) / (n_clusters**2)))
    if n_genes < 1:
        n_genes = 1
    elif n_genes > ws_gex_constants.MAX_TOP_N_GENES:
        n_genes = ws_gex_constants.MAX_TOP_N_GENES

    cols = [
        {
            'type': 'string',
            'label': 'Gene ID'
        },
        {
            'type': 'string',
            'label': 'Gene name'
        },
    ]

    for i in xrange(n_clusters):
        # Filter genes by mean count and sort by log2 fold-change, descending
        means = diff_expr.data[:, 0 + 3 * i]
        log2fcs = diff_expr.data[:, 1 + 3 * i]

        keep_indices = np.flatnonzero(
            means >= ws_gex_constants.TOP_DE_GENES_MIN_MEAN)
        top_gene_indices = keep_indices[log2fcs[keep_indices].argsort()[::-1]][:n_genes]

        for j in top_gene_indices:
            top_genes.add(analysis.matrix.int_to_feature_id(j))

        cols.append({
            'type': 'number',
            'label': 'L2FC',
            'title': 'Log2 fold-change in cluster %d vs other cells' % (i + 1)
        })
        cols.append({
            'type': 'number',
            'label': 'p-value',
            'title': 'Adjusted p-value of differential expression in cluster %d' % (i + 1)
        })

    rows = []
    for gene_id in top_genes:
        i = analysis.matrix.feature_id_to_int(gene_id)
        gene_name = analysis.matrix.feature_id_to_name(gene_id)

        row = [gene_id, gene_name]
        for j in xrange(n_clusters):
            log2fc = diff_expr.data[i, 1 + (3 * j)]
            adj_p_value = diff_expr.data[i, 2 + (3 * j)]

            if log2fc <= 0 or adj_p_value >= ws_gex_constants.PVALUE_DEEMPHASIS_CUTOFF:
                style = '#DDD'
            else:
                style = '#000'

            row.append({
                'v': tk_safe_json.json_sanitize(log2fc),
                'f': format_value(log2fc, '%.2f'),
                's': style
            })
            row.append({
                'v': tk_safe_json.json_sanitize(adj_p_value),
                'f': format_value(adj_p_value, '%.0e'),
                's': style
            })

        rows.append(row)

    # Sort by log2fc, descending, in first cluster
    if n_clusters > 0:
        rows = sorted(rows, key=lambda row: row[2]['v'], reverse=True)

    chart['table'].update({'rows': rows, 'cols': cols})
    return chart
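The per-cluster gene count above is MAX_DE_TABLE_ENTRIES / n_clusters**2, clamped to [1, MAX_TOP_N_GENES]: the union of top genes can contribute up to n_genes * n_clusters rows, and each row carries a column pair per cluster, so total entries scale with n_genes * n_clusters**2. A small worked sketch with assumed constant values (the real ones live in ws_gex_constants):

import numpy as np

# Illustrative values only; not the actual ws_gex_constants.
MAX_DE_TABLE_ENTRIES = 10000
MAX_TOP_N_GENES = 50


def genes_per_cluster(n_clusters):
    n_genes = int(np.floor(float(MAX_DE_TABLE_ENTRIES) / (n_clusters ** 2)))
    return min(max(n_genes, 1), MAX_TOP_N_GENES)

# genes_per_cluster(5)   -> 50  (capped at MAX_TOP_N_GENES)
# genes_per_cluster(20)  -> 25
# genes_per_cluster(120) -> 1   (floored at 1)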
Example #12
0
    def save_summary_json(self, filename):
        with open(filename, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(self.summary),
                      f,
                      indent=4,
                      sort_keys=True)
Example #13
0
def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    summary_df_parts = []
    umi_summary_df_parts = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)

        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)
        summary_df_parts.append(
            pd.read_csv(chunk_out.summary_tsv,
                        header=0,
                        index_col=None,
                        sep='\t',
                        dtype={
                            'component': int,
                            'num_reads': int,
                            'num_pairs': int,
                            'num_umis': int
                        }))

        umi_summary_df_parts.append(
            pd.read_csv(chunk_out.umi_summary_tsv,
                        header=0,
                        index_col=None,
                        sep='\t',
                        dtype={
                            'umi_id': int,
                            'reads': int,
                            'min_umi_reads': int,
                            'contigs': str
                        }))

    summary_df = pd.concat(summary_df_parts, ignore_index=True)
    umi_summary_df = pd.concat(umi_summary_df_parts, ignore_index=True)

    cr_utils.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        subprocess.check_call('samtools faidx %s' % outs.contig_fasta,
                              shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_utils.concatenate_files(outs.contig_fastq, contig_fastqs)

    if summary_df is not None:
        summary_df.to_csv(outs.summary_tsv, header=True, index=False, sep='\t')
    if umi_summary_df is not None:
        umi_summary_df.to_csv(outs.umi_summary_tsv,
                              header=True,
                              index=False,
                              sep='\t')

    if contig_bams:
        tk_bam.merge(outs.contig_bam, contig_bams, threads=args.__threads)
        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_utils.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f,
                  indent=4,
                  sort_keys=True)
Example #14
0
def save_cell_barcodes_json(barcodes, filename):
    with open(filename, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(sorted(list(barcodes))),
                  f,
                  indent=4,
                  sort_keys=True)
Example #15
0
def join(args, outs, chunk_defs, chunk_outs):
    outs.reads, outs.read2s, outs.tags = [], [], []
    outs.gem_groups, outs.library_types, outs.library_ids, outs.read_groups = [], [], [], []

    for chunk_out in chunk_outs:
        outs.reads += [read for read in chunk_out.reads]
        outs.read2s += [read2 for read2 in chunk_out.read2s]
        outs.tags += [tags for tags in chunk_out.tags]
        outs.gem_groups += [gem_group for gem_group in chunk_out.gem_groups]
        outs.library_types += [lt for lt in chunk_out.library_types]
        outs.library_ids += [li for li in chunk_out.library_ids]
        outs.read_groups += [
            read_group for read_group in chunk_out.read_groups
        ]

    # Ensure that we have non-zero reads
    if not outs.reads:
        martian.exit(
            "No reads found. Check the input fastqs and/or the chemistry definition"
        )
    # Ensure consistency of BAM comments
    assert all(chunk_out.bam_comments == chunk_outs[0].bam_comments
               for chunk_out in chunk_outs)
    outs.bam_comments = chunk_outs[0].bam_comments

    # Write barcode counts (merged by library_type)
    bc_counters = BarcodeCounter.merge_by(
        [co.barcode_counts for co in chunk_outs],
        [cd.library_type for cd in chunk_defs],
        args.barcode_whitelist, outs.gem_groups)
    with open(outs.barcode_counts, 'w') as f:
        tk_safe_json.dump_numpy(bc_counters, f)

    # Write feature counts
    feature_counts = None
    for chunk_def, chunk_out in itertools.izip(chunk_defs, chunk_outs):
        with open(chunk_out.feature_counts) as f:
            chunk_counts = np.asarray(json.load(f), dtype=int)
            if feature_counts is None:
                feature_counts = chunk_counts
            else:
                feature_counts += chunk_counts

    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    outs.align = cr_utils.select_alignment_params(args.align)

    # Group reporters by library type
    outs.chunked_reporter = None
    reporter_groups = defaultdict(list)
    for chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        if not chunk_out.reads:
            continue
        chunk_lib_types = set(lt for lt in chunk_out.library_types)
        assert len(chunk_lib_types) == 1
        lib_type = list(chunk_lib_types)[0]
        reporter_groups[lib_type].append(chunk_out.chunked_reporter)

    # Merge reporters and prefix JSON keys by library type
    summary = {}
    for lib_type, reporters in reporter_groups.iteritems():
        j = cr_report.merge_reporters(reporters).to_json()

        prefix = rna_library.get_library_type_metric_prefix(lib_type)
        j_prefixed = dict((prefix + k, v) for k, v in j.iteritems())

        summary.update(j_prefixed)

    # Use a temporary reporter to generate the metadata (w/o a prefix)
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_chemistry_metadata(args.chemistry_def)
    summary.update(tmp_reporter.to_json())

    # Write summary JSON
    with open(outs.summary, 'w') as f:
        tk_safe_json.dump_numpy(summary, f, pretty=True)
Example #16
0
def join(args, outs, chunk_defs, chunk_outs):

    version = martian.get_pipelines_version()

    with open(args.summary) as f:
        summary = json.load(f)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()
        barcode_seqs = mc.get_barcodes()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    # make attrs for user-added columns in aggr csv
    extra_attrs = get_custom_aggr_columns(args.sample_defs)
    # track original library/gem info
    library_map = cr_matrix.make_library_map_aggr(args.gem_group_index)
    extra_attrs.update(library_map)

    # Merge raw matrix
    raw_matrix = cr_matrix.merge_matrices(args.raw_matrices_h5)
    raw_matrix.save_h5_file(outs.raw_matrix_h5, extra_attrs=extra_attrs)

    genomes = raw_matrix.get_genomes()

    # Create barcode summary HDF5 file w/ GEX data for the barcode rank plot
    with h5py.File(outs.barcode_summary_h5, 'w') as f:
        cr_io.create_hdf5_string_dataset(f, cr_constants.H5_BC_SEQUENCE_COL, raw_matrix.bcs)

        gex_bc_counts = raw_matrix.view().select_features_by_type(lib_constants.GENE_EXPRESSION_LIBRARY_TYPE).sum(axis=0).astype('uint64')
        genome_key = genomes[0] if len(genomes) == 1 else lib_constants.MULTI_REFS_PREFIX
        f.create_dataset('_%s_transcriptome_conf_mapped_deduped_barcoded_reads' % genome_key,
                         data=gex_bc_counts)

    rna_matrix.save_mex(raw_matrix, outs.raw_matrix_mex, version)
    del raw_matrix

    # Merge filtered matrix
    filt_mat = cr_matrix.merge_matrices(args.filtered_matrices_h5)
    filt_mat.save_h5_file(outs.filtered_matrix_h5, extra_attrs=extra_attrs)

    # Summarize the matrix across library types and genomes
    for lib_type in lib_types:
        libtype_prefix = rna_library.get_library_type_metric_prefix(lib_type)

        if rna_library.has_genomes(lib_type):
            genomes = filt_mat.get_genomes()
        else:
            genomes = [None]

        mat_lib = filt_mat.view().select_features_by_type(lib_type)

        for genome in genomes:
            if genome is None:
                mat = mat_lib
                genome_idx = None
            else:
                mat = mat_lib.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)

            # Select barcodes passing filter for this (lib_type, genome)
            filtered_bcs = MoleculeCounter.get_filtered_barcodes(barcode_info,
                                                                 library_info,
                                                                 barcode_seqs,
                                                                 genome_idx=genome_idx,
                                                                 library_type=lib_type)
            mat = mat.select_barcodes_by_seq(filtered_bcs)

            median_features = np.median(mat.count_ge(axis=0,
                                                     threshold=cr_constants.MIN_COUNTS_PER_GENE))
            median_counts = np.median(mat.sum(axis=0))
            genome_prefix = genome if genome is not None else lib_constants.MULTI_REFS_PREFIX

            prefixes = (libtype_prefix, genome_prefix)
            if genome is not None:
                flt_reads = summary['%s%s_flt_mapped_reads' % prefixes]
                raw_reads = summary['%s%s_raw_mapped_reads' % prefixes]
                frac_reads_in_cells = tk_stats.robust_divide(flt_reads, raw_reads)

                summary['%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % prefixes] = frac_reads_in_cells

            summary.update({
                '%s%s_filtered_bcs_median_counts' % prefixes: median_counts,
                '%s%s_filtered_bcs_median_unique_genes_detected' % prefixes: median_features,
            })

        # Compute frac reads in cells across all genomes
        prefixes = [(libtype_prefix, g) for g in genomes if g is not None]
        if len(prefixes) == 0:
            prefixes = [(libtype_prefix, lib_constants.MULTI_REFS_PREFIX)]
        flt_reads = sum(summary['%s%s_flt_mapped_reads' % p] for p in prefixes)
        raw_reads = sum(summary['%s%s_raw_mapped_reads' % p] for p in prefixes)

        frac_reads_in_cells = tk_stats.robust_divide(flt_reads, raw_reads)
        summary['%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % (
            libtype_prefix, lib_constants.MULTI_REFS_PREFIX)] = frac_reads_in_cells


    # Write MEX format (do it last because it converts the matrices to COO)
    rna_matrix.save_mex(filt_mat, outs.filtered_matrix_mex, version)

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary), f, indent=4, sort_keys=True)
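tk_stats.robust_divide, used above for the reads-in-cells fractions, is assumed to behave like plain float division that tolerates a zero denominator instead of raising; a hypothetical stand-in with that assumed behavior:

def robust_divide(numerator, denominator):
    # Assumed semantics: float division, NaN rather than ZeroDivisionError.
    if denominator == 0:
        return float('nan')
    return float(numerator) / float(denominator)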
Example #17
0
def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Build the feature reference
    if args.reference_path:
        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)
    else:
        feature_ref = rna_feature_ref.FeatureReference.empty()

    # Setup feature barcode extraction
    feature_extractor = rna_feature_ref.FeatureExtractor(
        feature_ref, use_feature_types=[args.library_type])

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    num_libraries = len(args.library_info)
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        num_libraries=num_libraries)

    # Determine if barcode sequences need to be reverse complemented.
    with FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                     None, None) as bc_check_rc:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist, True)
        barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                      bc_check_rc.in_iter)

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def,
                            args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(
                r2_reader.in_iter,
                cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    # Record feature counts:
    feature_counts = np.zeros(feature_ref.get_num_features(), dtype=int)

    # If this library type has no feature barcodes, make the reader a NOOP
    if feature_extractor.has_features_to_extract():
        feature_reads = FastqFeatureReader(args.read_chunks, feature_extractor,
                                           args.reads_interleaved, r1_length,
                                           r2_length)
    else:
        feature_reads = FastqReader(None, None, None, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads,
                     feature_reads)

    read1_writer = ChunkedFastqWriter(outs.reads,
                                      args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s,
                                          args.reads_per_file,
                                          compression=COMPRESSION)

    tag_writer = None
    if not args.augment_fastq:
        tag_writer = ChunkedFastqWriter(outs.tags,
                                        args.reads_per_file,
                                        compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter,
                                        args.chunk_initial_reads):
        # Downsample
        if random.random() > args.chunk_subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction, feature_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads cause issues with the STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        lib_idx = [
            i for i, x in enumerate(args.library_info)
            if x['library_id'] == args.library_id
        ][0]
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              lib_idx,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        feat_raw_bc = None
        feat_proc_bc = None
        feat_qual = None
        feat_ids = None

        if feature_extraction:
            if feature_extraction.barcode:
                feat_raw_bc = feature_extraction.barcode
                feat_qual = feature_extraction.qual

            if len(feature_extraction.ids) > 0:
                feat_proc_bc = feature_extraction.barcode
                feat_ids = ';'.join(feature_extraction.ids)

                # If we hit a single feature ID, count its frequency
                if len(feature_extraction.ids) == 1:
                    feature_counts[feature_extraction.indices[0]] += 1

        if feat_raw_bc:
            fastq_header1.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                  feat_raw_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                  feat_qual)
        if feat_ids:
            fastq_header1.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG,
                                  feat_proc_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

        if args.augment_fastq:
            read1_writer.write(
                (fastq_header1.to_string(), rna_read[1], rna_read[2]))
        else:
            read1_writer.write((rna_read[0], rna_read[1], rna_read[2]))
            tag_writer.write((fastq_header1.to_string(), '', ''))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            if feat_raw_bc:
                fastq_header2.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                      feat_raw_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                      feat_qual)
            if feat_ids:
                fastq_header2.set_tag(
                    cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

            if args.augment_fastq:
                read2_writer.write(
                    (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))
            else:
                read2_writer.write((rna_read2[0], rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    if not args.augment_fastq:
        tag_writer.close()
    bc_counter.close()

    # Write feature BC read counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()

        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []

        if args.augment_fastq:
            outs.tags = []
        else:
            outs.tags = tag_writer.get_out_paths(len(outs.tags))

        libraries = args.library_info
        library = [
            li for li in libraries if li['library_id'] == args.library_id
        ][0]

        outs.gem_groups = [library['gem_group']] * len(outs.reads)
        outs.library_types = [library['library_type']] * len(outs.reads)
        outs.library_ids = [library['library_id']] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.tags = []
        outs.gem_groups = []
        outs.library_types = []
        outs.library_ids = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    assert args.augment_fastq or len(outs.tags) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
Example #18
0
def join(args, outs, chunk_defs, chunk_outs):
    # compute invariants on input data
    input_genomes = set()
    input_features = set()
    input_bc_counts = {}
    input_feature_counts = {}
    input_num_gem_groups = 0

    for sample_def in args.input_sample_defs:
        library_id = sample_def['library_id']
        with MoleculeCounter.open(sample_def[cr_constants.AGG_H5_FIELD],
                                  'r') as mc:
            input_genomes.update(mol_counter_genomes(mc))
            input_features.update(mol_counter_features_id_type(mc))
            gem_groups = mc.get_gem_groups()
            input_num_gem_groups += len(gem_groups)

            mol_gem_group = mc.get_column('gem_group')

            mol_barcode_idx = mc.get_column('barcode_idx')
            for gg in gem_groups:
                input_bc_counts[(library_id, gg)] = np.zeros(
                    len(mc.get_ref_column('barcodes')))
                bc_idx, counts = np.unique(
                    mol_barcode_idx[mol_gem_group == gg], return_counts=True)
                input_bc_counts[(library_id, gg)][bc_idx] = counts
            del mol_barcode_idx

            mol_feature_idx = mc.get_column('feature_idx')
            for gg in gem_groups:
                input_feature_counts[(library_id, gg)] = np.zeros(
                    len(mc.feature_reference.feature_defs))
                feature_idx, counts = np.unique(
                    mol_feature_idx[mol_gem_group == gg], return_counts=True)
                input_feature_counts[(library_id, gg)][feature_idx] = counts
            del mol_feature_idx

    # compute invariants on output
    output_matrix = cr_matrix.CountMatrix.load_h5_file(
        args.merged_raw_gene_bc_matrices_h5)
    output_genomes = set(output_matrix.get_genomes())
    output_features = set(count_matrix_features_id_type(output_matrix))
    output_bc_counts = {}
    output_feature_counts = {}
    output_gem_index = cr_matrix.get_gem_group_index(
        args.merged_raw_gene_bc_matrices_h5)
    output_num_gem_groups = len(output_gem_index)

    for gg in output_gem_index:
        library_id, old_gg = output_gem_index[gg]
        matrix_gg = output_matrix.select_barcodes_by_gem_group(gg)
        output_bc_counts[(library_id, old_gg)] = matrix_gg.get_counts_per_bc()
        output_feature_counts[(library_id,
                               old_gg)] = matrix_gg.get_counts_per_feature()

    exit_message = (
        'An internal problem in the aggr pipeline has been detected '
        'that might lead to incorrect results. Please report this '
        'problem to support@10xgenomics.com.')

    if input_genomes != output_genomes:
        martian.log_info(
            'Genomes differ between input molecule files and aggregated matrix'
        )
        martian.exit(exit_message)
    if input_features != output_features:
        martian.log_info(
            'Features differ between input molecule files and aggregated matrix'
        )
        martian.exit(exit_message)
    if input_num_gem_groups != output_num_gem_groups:
        martian.log_info(
            'Number of GEM groups differs between input molecule files and aggregated matrix'
        )
        martian.exit(exit_message)
    for lib_gg in input_bc_counts.keys():
        if len(input_bc_counts[lib_gg]) != len(output_bc_counts[lib_gg]):
            martian.log_info(
                'Barcode list for library {}, GEM group {} has different length '
                'in aggregated output compared to input.'.format(
                    lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)
        if np.any(input_bc_counts[lib_gg] < output_bc_counts[lib_gg]):
            martian.log_info(
                'Barcode(s) in library {}, GEM group {} have higher UMI counts '
                'in aggregated output compared to inputs'.format(
                    lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)
        if len(input_feature_counts[lib_gg]) != len(
                output_feature_counts[lib_gg]):
            martian.log_info(
                'Feature list for library {}, GEM group {} has different length '
                'in aggregated output compared to input.'.format(
                    lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)
        if np.any(
                input_feature_counts[lib_gg] < output_feature_counts[lib_gg]):
            martian.log_info(
                'Feature(s) in library {}, GEM group {} have higher UMI counts '
                'in aggregated output compared to inputs'.format(
                    lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)

    summary = {
        'genomes_present': list(input_genomes),
        'num_features_in_ref': len(input_features),
        'num_gem_groups': input_num_gem_groups,
    }

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f,
                  indent=4,
                  sort_keys=True)
Example #19
0
def filter_barcodes(args, outs):
    random.seed(0)
    np.random.seed(0)

    matrices = cr_matrix.GeneBCMatrices.load_h5(args.matrices_h5)

    summary = {}

    total_diversity = len(matrices.matrices.values()[-1].bcs)

    if args.cell_barcodes is not None:
        method_name = cr_constants.FILTER_BARCODES_MANUAL
    elif args.force_cells is not None:
        method_name = cr_constants.FILTER_BARCODES_FIXED_CUTOFF
    else:
        method_name = cr_constants.FILTER_BARCODES_ORDMAG

    summary['total_diversity'] = total_diversity
    summary['filter_barcodes_method'] = method_name

    # Initialize filtered matrices object
    filtered_matrices = cr_matrix.GeneBCMatrices(
        matrices.matrices.keys(),
        [m.genes for m in matrices.matrices.values()],
        [m.bcs for m in matrices.matrices.values()][0])

    # Get unique gem groups
    unique_gem_groups = sorted(list(set(args.gem_groups)))

    # Get per-gem group cell load
    if args.recovered_cells is not None:
        gg_recovered_cells = int(
            float(args.recovered_cells) / float(len(unique_gem_groups)))
    else:
        gg_recovered_cells = cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP

    if args.force_cells is not None:
        gg_force_cells = int(
            float(args.force_cells) / float(len(unique_gem_groups)))

    filtered_metrics = []
    filtered_bcs = []

    # Track filtered barcodes for each genome
    bcs_per_genome = collections.defaultdict(list)

    # Filter each genome's matrix
    for genome, matrix in matrices.matrices.iteritems():
        filtered_metrics = []
        filtered_bcs = []

        # Filter each gem group individually
        for gem_group in unique_gem_groups:
            gg_matrix = matrix.select_barcodes_by_gem_group(gem_group)
            if method_name == cr_constants.FILTER_BARCODES_ORDMAG:
                gg_total_diversity = len(gg_matrix.bcs)
                gg_bc_counts = gg_matrix.get_reads_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_ordmag(
                    gg_bc_counts, gg_recovered_cells, gg_total_diversity)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)
            elif method_name == cr_constants.FILTER_BARCODES_MANUAL:
                with open(args.cell_barcodes) as f:
                    cell_barcodes = json.load(f)
                gg_filtered_bcs, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_manual(
                    gg_matrix, cell_barcodes)
            elif method_name == cr_constants.FILTER_BARCODES_FIXED_CUTOFF:
                gg_bc_counts = gg_matrix.get_reads_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_fixed_cutoff(
                    gg_bc_counts, gg_force_cells)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)
            else:
                martian.exit("Unsupported BC filtering method: %s" %
                             method_name)

            if msg is not None:
                martian.log_info(msg)

            filtered_metrics.append(gg_filtered_metrics)
            filtered_bcs.extend(gg_filtered_bcs)

            bcs_per_genome[genome].extend(gg_filtered_bcs)

        # Merge metrics over all gem groups
        txome_summary = cr_stats.merge_filtered_metrics(filtered_metrics)

        # Append method name to metrics
        summary.update({
            ('%s_%s_%s' % (genome, key, method_name)): txome_summary[key] \
            for (key,_) in txome_summary.iteritems()})

        txome_filtered_matrix = matrix.select_barcodes_by_seq(filtered_bcs)
        filtered_matrices.matrices[genome] = txome_filtered_matrix
        summary['%s_filtered_bcs' % genome] = txome_summary['filtered_bcs']
        summary['%s_filtered_bcs_cv' %
                genome] = txome_summary['filtered_bcs_cv']

    # Re-compute various metrics on the filtered matrices
    matrix_summary = matrices.report(
        summary_json_paths=[args.raw_fastq_summary, args.attach_bcs_summary],
        barcode_summary_h5_path=args.barcode_summary,
        recovered_cells=args.recovered_cells,
        cell_bc_seqs=[
            mat.bcs for mat in filtered_matrices.matrices.itervalues()
        ])

    # Write summary json
    combined_summary = matrix_summary.copy()
    combined_summary.update(summary)
    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(combined_summary),
                  f,
                  indent=4,
                  sort_keys=True)

    # Write the filtered barcodes file
    write_filtered_barcodes(outs.filtered_barcodes, bcs_per_genome)

    return filtered_matrices
Example #20
0
def build_reference_fasta_from_fasta(fasta_path, reference_path,
                                     reference_name, ref_version,
                                     mkref_version):
    """Create cellranger-compatible vdj reference files from a
       V(D)J segment FASTA file.
    """

    seen_features = set()
    seen_ids = set()
    features = []

    print 'Checking FASTA entries...'

    with open(fasta_path) as f:
        for header, sequence in cr_utils.get_fasta_iter(f):
            feat = parse_fasta_entry(header, sequence)

            # Enforce unique feature IDs
            if feat.feature_id in seen_ids:
                raise ValueError(
                    'Duplicate feature ID found in input FASTA: %d.' %
                    feat.feature_id)
            # Sanity check values
            if ' ' in feat.region_type:
                raise ValueError('Spaces not allowed in region type: "%s"' %
                                 feat.region_type)
            if ' ' in feat.gene_name:
                raise ValueError('Spaces not allowed in gene name: "%s"' %
                                 feat.gene_name)
            if ' ' in feat.record_id:
                raise ValueError('Spaces not allowed in record ID: "%s"' %
                                 feat.record_id)

            key = get_duplicate_feature_key(feat)
            if key in seen_features:
                print 'Warning: Skipping duplicate entry for %s (%s, %s).' % (
                    feat.display_name, feat.region_type, feat.record_id)
                continue

            # Strip Ns from termini
            seq = feat.sequence
            if 'N' in seq:
                print 'Warning: Feature %s contains Ns. Stripping from the ends.' % \
                    str((feat.display_name, feat.record_id, feat.region_type))
                seq = seq.strip('N')

            if len(seq) == 0:
                print 'Warning: Feature %s is all Ns. Skipping.' % \
                    str((feat.display_name, feat.record_id, feat.region_type))
                continue

            # Warn on features we couldn't classify properly
            if feat.chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
                print 'Warning: Unknown chain type for: %s. Expected name to be in %s. Skipping.' % \
                (str((feat.display_name, feat.record_id, feat.region_type)),
                 str(tuple(vdj_constants.VDJ_CHAIN_TYPES)))
                continue

            seen_ids.add(feat.feature_id)
            seen_features.add(key)

            # Update the sequence since we may have modified it
            feat_dict = feat._asdict()
            feat_dict.update({'sequence': seq})
            new_feat = VdjAnnotationFeature(**feat_dict)
            features.append(new_feat)
    print '...done.\n'

    print 'Writing sequences...'
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    with open(get_vdj_reference_fasta(reference_path), 'w') as out_fasta:
        for feat in features:
            out_fasta.write(convert_vdj_feature_to_fasta_entry(feat) + '\n')
    print '...done.\n'

    print 'Computing hash of input FASTA file...'
    fasta_hash = cr_utils.compute_hash_of_file(fasta_path)
    print '...done.\n'

    print 'Writing metadata JSON file into reference folder...'
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: None,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: os.path.basename(fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY: None,
        cr_constants.REFERENCE_VERSION_KEY: ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: mkref_version,
        cr_constants.REFERENCE_TYPE_KEY: vdj_constants.REFERENCE_TYPE,
    }
    with open(
            os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE),
            'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata),
                  json_file,
                  sort_keys=True,
                  indent=4)
    print '...done.\n'
Example #21
0
def write_json_from_dict(input_dict, out_file_name):
    with open(out_file_name, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(input_dict),
                  f,
                  indent=4,
                  sort_keys=True)
Example #22
0
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    cell_barcodes = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))

    barcode_contigs = vdj_annot.load_cell_contigs_from_json(
        args.annotations, args.vdj_reference_path, group_key='barcode')

    # From CDR sequence to sequence id
    sequences = {}
    # From clonotype (tuple of CDR ids) to clonotype id
    clonotypes = {}

    # From barcode to clonotype id
    bc_clonotype_assignments = {}

    # First pass: Just keep track of observed CDR3s
    for contig_list in barcode_contigs:

        # This will be a tuple of sequences like "TRA_<cdr seq>"
        barcode_clonotype_tuple = contig_list.clonotype_tuple(
            require_productive=not args.use_non_productive,
            require_full_len=True,
            require_high_conf=True)

        # Give unique numerical ids to the CDR3 sequences
        if barcode_clonotype_tuple:
            for cdr_seq in barcode_clonotype_tuple:
                sequences.setdefault(cdr_seq, len(sequences))

    # From sequence id to CDR sequence
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.iteritems()}

    # Do a second pass to potentially use non-full length contigs with a valid CDR3.
    for contig_list in barcode_contigs:
        if args.use_non_full_len:
            barcode_clonotype_tuple = []

            for c in contig_list.contigs():
                (_, cl_seq) = c.clonotype_seq()
                # If this contig has a CDR3 and we can infer the gene type of
                # that CDR3 (either based on the contig itself or based on
                # other full-length contigs that had this CDR3), then add it
                # to the clonotype tuple.
                if cl_seq in sequences:
                    # this will rescue contigs that have a chain and CDR3 assigned
                    # but aren't full length
                    barcode_clonotype_tuple.append(cl_seq)
        else:
            barcode_clonotype_tuple = contig_list.clonotype_tuple(
                require_productive=(not args.use_non_productive),
                require_full_len=True,
                require_high_conf=True)
        barcode_clonotype = tuple(
            sorted(list(set([sequences[s] for s in barcode_clonotype_tuple]))))

        if barcode_clonotype:
            clonotype_id = clonotypes.setdefault(barcode_clonotype,
                                                 len(clonotypes))
            bc_clonotype_assignments[contig_list.name] = clonotype_id

    # From clonotype id to tuple of CDRs
    clonotype_ids = {
        clonotype_id: clonotype_tuple
        for clonotype_tuple, clonotype_id in clonotypes.iteritems()
    }

    out_clonotypes = vdj_annot.report_clonotypes(reporter, 'raw',
                                                 cell_barcodes, clonotype_ids,
                                                 sequence_ids, barcode_contigs,
                                                 bc_clonotype_assignments)

    with open(outs.clonotype_assignments, 'w') as out_file:
        tk_safe_json.dump_numpy(tk_safe_json.json_sanitize(out_clonotypes),
                                out_file,
                                pretty=True)

    # Add clonotype assignments to contig annotations
    del barcode_contigs
    with open(args.annotations) as f:
        all_contigs = vdj_annot.load_contig_list_from_json(
            f, args.vdj_reference_path)

    vdj_annot.label_contigs_with_consensus(out_clonotypes, all_contigs, 'raw')

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    with open(outs.contig_annotations_csv, 'w') as out_file:
        vdj_annot.save_contig_list_csv(out_file,
                                       all_contigs,
                                       write_inferred=False)

    with open(outs.contig_annotations_pickle, 'w') as out_file:
        cPickle.dump(all_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Write filtered contig annotations
    with open(outs.filtered_contig_annotations_csv, 'w') as out_file:
        filtered_contigs = filter(lambda x: x.high_confidence and x.is_cell,
                                  all_contigs)
        vdj_annot.save_contig_list_csv(out_file,
                                       filtered_contigs,
                                       write_inferred=False)

    # Set a default value for paired clonotype diversity so that it is
    # present in the metric summary CSV even when there are no paired cells
    # or we are running in de novo mode
    paired_diversity_metric = reporter._get_metric_attr(
        'vdj_paired_clonotype_diversity', MULTI_REFS_PREFIX, 'raw')
    if not paired_diversity_metric.d:
        paired_diversity_metric.add(None, 0)

    reporter.report_summary_json(outs.summary)
Example #23
def main(args, outs):
    np.random.seed(0)

    LogPerf.mem()

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()

        metrics_in = mc.get_all_metrics()
        metrics_out = copy.deepcopy(metrics_in)

        # Compute subsampling rate and approximate new total readpair count
        frac_reads_kept = np.array(args.frac_reads_kept, dtype=float)
        total_reads_in = mc.get_raw_read_pairs_per_library()
        total_reads_out = total_reads_in * frac_reads_kept
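        # For example (hypothetical numbers): with two libraries,
        # total_reads_in = [1000000, 2000000] and frac_reads_kept = [0.5, 1.0]
        # give total_reads_out = [500000., 2000000.] (element-wise product).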

        for lib_idx, _ in enumerate(library_info):
            metrics_out[cr_mol_counter.LIBRARIES_METRIC][str(
                lib_idx)][cr_mol_counter.
                          DOWNSAMPLED_READS_METRIC] = total_reads_out[lib_idx]

        # downsample molecule info
        chunk = slice(args.chunk_start, args.chunk_start + args.chunk_len)
        mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
        mol_read_pairs = mc.get_column_lazy('count')[chunk]

        mol_rate = frac_reads_kept[mol_library_idx]
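        # Each molecule now carries its library's subsampling probability; its
        # read-pair count is thinned below with an independent binomial draw.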
        del mol_library_idx

        new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)
        del mol_read_pairs
        del mol_rate

        keep_mol = np.flatnonzero(new_read_pairs)
        new_read_pairs = new_read_pairs[keep_mol]

        mol_gem_group = mc.get_column_lazy('gem_group')[chunk][keep_mol]
        mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk][keep_mol]
        mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk][keep_mol]

        # Assert that gem groups start at 1 and are contiguous
        gem_groups = sorted(set(lib['gem_group'] for lib in library_info))
        assert(min(gem_groups) == 1 and \
               np.all(np.diff(np.array(gem_groups,dtype=int)) == 1))

        feature_ref = mc.get_feature_ref()

        # Compute matrix dimensions
        # Get the range of possible barcode indices for each gem group.
        gg_barcode_idx_start = np.zeros(1 + len(gem_groups), dtype=int)
        gg_barcode_idx_len = np.zeros(1 + len(gem_groups), dtype=int)
        for gg_str, idx_range in sorted(
                args.gem_group_barcode_ranges.iteritems(),
                key=lambda kv: int(kv[0])):
            gg = int(gg_str)
            gg_barcode_idx_start[gg] = idx_range[0]
            gg_barcode_idx_len[gg] = idx_range[1] - idx_range[0]

        num_bcs = gg_barcode_idx_len.sum()
        num_features = feature_ref.get_num_features()

        print 'downsampled'
        LogPerf.mem()

        # Convert molecule barcode indices into matrix barcode indices
        # The molecule info barcode_idx is in this space:
        #  [W_0, W_1, ...] where W_i is distinct original whitelist i.
        # The matrix is in, e.g., this space:
        #  [w_0-1, w_1-2, w_0-3, ...] where w_i-j is a copy of whitelist i for gem group j.
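        # Worked example (hypothetical): two gem groups with whitelists of
        # length 4 and gem_group_barcode_ranges = {'1': [0, 4], '2': [4, 8]}.
        # Then gg_barcode_idx_start = [0, 0, 4] and cumsum(gg_barcode_idx_len)
        # = [0, 4, 8]; a molecule in gem group 2 with barcode_idx 5 maps to
        # matrix index (5 - 4) + 4 = 5, i.e. the second barcode of the
        # gem-group-2 block.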

        # Return to the original whitelist index
        mol_barcode_idx -= gg_barcode_idx_start.astype(
            np.uint64)[mol_gem_group]

        # Offset by the cumulative whitelist length up to a barcode's gem group
        gg_barcode_matrix_start = np.cumsum(gg_barcode_idx_len).astype(
            np.uint64)
        mol_barcode_idx += gg_barcode_matrix_start[mol_gem_group - 1]

        ones = np.ones(len(mol_barcode_idx),
                       dtype=cr_matrix.DEFAULT_DATA_DTYPE)
        umi_matrix = sp_sparse.coo_matrix(
            (ones, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        print 'created umi matrix'
        LogPerf.mem()

        # Create a read-count matrix so we can summarize reads per barcode
        read_matrix = sp_sparse.coo_matrix(
            (new_read_pairs, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        del ones
        del mol_feature_idx
        del mol_barcode_idx
        del new_read_pairs

        # Get all barcodes strings for the raw matrix
        barcode_seqs = mc.get_barcodes()

        print len(barcode_seqs), len(gem_groups)
        print 'creating barcode strings'
        LogPerf.mem()

        barcodes = []
        for gg in gem_groups:
            idx_start = gg_barcode_idx_start[gg]
            idx_end = idx_start + gg_barcode_idx_len[gg]
            gg_bcs = np.array([
                cr_utils.format_barcode_seq(bc, gg)
                for bc in barcode_seqs[idx_start:idx_end]
            ])
            barcodes.append(gg_bcs)
        barcodes = np.concatenate(barcodes)
        barcodes.flags.writeable = False

        print 'created barcode strings'
        LogPerf.mem()

        # Get mapped reads per barcode per (library, genome)
        read_summary = {}
        read_matrix = CountMatrix(feature_ref, barcodes, read_matrix)
        read_matrix.m = read_matrix.m.tocsc(copy=True)
        read_summary = summarize_read_matrix(read_matrix, library_info,
                                             barcode_info, barcode_seqs)
        del read_matrix

        print 'created read matrix'
        LogPerf.mem()
        # Construct the raw UMI matrix
        raw_umi_matrix = CountMatrix(feature_ref, barcodes, umi_matrix)
        raw_umi_matrix.save_h5_file(outs.raw_matrix_h5)
        outs.raw_nnz = raw_umi_matrix.m.nnz

        # Construct the filtered UMI matrix
        filtered_bcs = MoleculeCounter.get_filtered_barcodes(
            barcode_info, library_info, barcode_seqs)
        filtered_umi_matrix = raw_umi_matrix.select_barcodes_by_seq(
            filtered_bcs)
        filtered_umi_matrix.save_h5_file(outs.filtered_matrix_h5)
        outs.filtered_nnz = filtered_umi_matrix.m.nnz

        print 'created filtered umi matrix'
        LogPerf.mem()

        summary = {
            'read_summary': read_summary,
            'mol_metrics': metrics_out,
        }

        with open(outs.chunk_summary, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(summary),
                      f,
                      indent=4,
                      sort_keys=True)

    # Don't write MEX from chunks.
    outs.raw_matrices_mex = None
    outs.filtered_matrices_mex = None
Example #24
def join(args, outs, chunk_defs, chunk_outs):

    # Pass through the matrix chunks and nnz counts
    outs.raw_matrices_h5 = [o.raw_matrix_h5 for o in chunk_outs]
    outs.raw_nnz = sum(o.raw_nnz for o in chunk_outs)
    outs.filtered_matrices_h5 = [o.filtered_matrix_h5 for o in chunk_outs]
    outs.filtered_nnz = sum(o.filtered_nnz for o in chunk_outs)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    summary = {
        'frac_reads_kept': chunk_defs[0].frac_reads_kept,
        'num_cells_by_library': chunk_defs[0].num_cells,
    }

    # Merge read summary metrics
    read_summary = defaultdict(int)
    for filename in [co.chunk_summary for co in chunk_outs]:
        with open(filename) as f:
            d = json.load(f)
            for k in d['read_summary'].iterkeys():
                read_summary[k] += d['read_summary'][k]
    summary.update(read_summary)

    # Get summary metrics
    with open(chunk_outs[0].chunk_summary) as f:
        mol_metrics = json.load(f)['mol_metrics']
    chem_keys = [
        k for k in mol_metrics.iterkeys() if k.startswith('chemistry')
    ]
    for k in chem_keys:
        summary[k] = mol_metrics[k]
    print json.dumps(mol_metrics, indent=4, sort_keys=True)

    # Report normalization metrics
    all_batches = OrderedDict()

    # These are all per-library-type
    min_frac_reads_kept = np.ones(len(lib_types), dtype='float')
    total_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_ds_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_cells = np.zeros(len(lib_types), dtype='uint64')

    for lib_type_idx, lib_type in enumerate(lib_types):
        lib_inds = [
            i for i, lib in enumerate(library_info)
            if lib['library_type'] == lib_type
        ]
        for lib_idx in lib_inds:
            aggr_id = library_info[lib_idx]['aggr_id']
            old_gg = library_info[lib_idx]['old_gem_group']
            batch = aggr_id + ('-%d' % old_gg if old_gg > 1 else '')
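            # e.g. (hypothetical) aggr_id 'sampleA' with old_gem_group 2 yields
            # batch 'sampleA-2'; with old_gem_group 1 it stays 'sampleA'.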
            all_batches[batch] = None

            n_cells = summary['num_cells_by_library'][lib_idx]
            total_cells[lib_type_idx] += n_cells

            lib_metrics = mol_metrics[cr_mol_counter.LIBRARIES_METRIC][str(
                lib_idx)]
            raw_read_pairs = lib_metrics[cr_mol_counter.TOTAL_READS_METRIC]
            mapped_read_pairs = lib_metrics[cr_mol_counter.USABLE_READS_METRIC]
            ds_read_pairs = lib_metrics[
                cr_mol_counter.DOWNSAMPLED_READS_METRIC]

            total_raw_read_pairs[lib_type_idx] += raw_read_pairs
            total_ds_raw_read_pairs[lib_type_idx] += ds_read_pairs

            frac_reads_kept = summary['frac_reads_kept'][lib_idx]
            min_frac_reads_kept[lib_type_idx] = min(
                min_frac_reads_kept[lib_type_idx], frac_reads_kept)

            pre_norm_raw_rppc = tk_stats.robust_divide(raw_read_pairs, n_cells)
            pre_norm_mapped_rppc = tk_stats.robust_divide(
                mapped_read_pairs, n_cells)

            # Prefix with batch and library type
            if lib_type.lower().startswith(
                    rna_library.CUSTOM_LIBRARY_TYPE_PREFIX.lower()):
                lib_prefix = rna_library.CUSTOM_LIBRARY_TYPE_PREFIX + '_'
            else:
                lib_prefix = rna_library.get_library_type_metric_prefix(
                    lib_type)

            p = (batch, lib_prefix)
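            # The resulting keys look like (hypothetical values)
            # 'sampleA-2_ANTIBODY_frac_reads_kept' for batch 'sampleA-2' and
            # lib_prefix 'ANTIBODY_'.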
            summary.update({
                '%s_%sfrac_reads_kept' % p:
                frac_reads_kept,
                '%s_%spre_normalization_raw_reads_per_filtered_bc' % p:
                pre_norm_raw_rppc,
                '%s_%spre_normalization_cmb_reads_per_filtered_bc' % p:
                pre_norm_mapped_rppc,
            })
    summary['batches'] = all_batches.keys()

    for lib_type_idx, lib_type in enumerate(lib_types):
        mean_rppc = tk_stats.robust_divide(total_raw_read_pairs[lib_type_idx],
                                           total_cells[lib_type_idx])
        ds_mean_rppc = tk_stats.robust_divide(
            total_ds_raw_read_pairs[lib_type_idx], total_cells[lib_type_idx])

        p = rna_library.get_library_type_metric_prefix(lib_type)
        summary.update({
            '%spre_normalization_total_reads' % p:
            total_raw_read_pairs[lib_type_idx],
            '%spost_normalization_total_reads' % p:
            total_ds_raw_read_pairs[lib_type_idx],
            '%sfiltered_bcs_transcriptome_union' % p:
            total_cells[lib_type_idx],
            '%spre_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p:
            mean_rppc,
            '%spost_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p:
            ds_mean_rppc,
            '%slowest_frac_reads_kept' % p:
            min_frac_reads_kept[lib_type_idx],
        })

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f,
                  indent=4,
                  sort_keys=True)
Example #25
def build_reference_fasta_from_ensembl(gtf_paths, transcripts_to_remove_path,
                                       genome_fasta_path, reference_path,
                                       reference_name, ref_version,
                                       mkref_version):
    """Create cellranger-compatible vdj reference files from a list of ENSEMBL-like GTF files.

    Input files are concatenated; no attempt is made to merge or reconcile
    information across them. Providing the files in a different order may
    change the output in cases where there are multiple entries with the same
    transcript id and the same feature type (e.g. V-region).
    """

    transcripts = collections.defaultdict(list)

    if transcripts_to_remove_path:
        with open(transcripts_to_remove_path) as f:
            rm_transcripts = set([line.strip() for line in f.readlines()])
    else:
        rm_transcripts = set()

    # Note: We cannot symlink here because some filesystems in the wild
    #       do not support symlinks.
    print 'Copying genome reference sequence...'
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    tmp_genome_fa_path = os.path.join(reference_path, 'genome.fasta')
    cr_utils.copy(genome_fasta_path, tmp_genome_fa_path)
    print '...done.\n'

    print 'Indexing genome reference sequence...'
    tk_subproc.check_call(['samtools', 'faidx', tmp_genome_fa_path])
    print '...done.\n'

    print 'Loading genome reference sequence...'
    genome_fasta = pysam.FastaFile(tmp_genome_fa_path)
    print '...done.\n'

    print 'Computing hash of genome FASTA file...'
    fasta_hash = cr_utils.compute_hash_of_file(tmp_genome_fa_path)
    print '...done.\n'

    for gtf in gtf_paths:
        print 'Reading GTF {}'.format(gtf)

        for line_no, entry in enumerate(get_gtf_iter(open(gtf))):
            if entry.feature not in [
                    ENSEMBL_FIVE_PRIME_UTR_FEATURE, ENSEMBL_CDS_FEATURE
            ]:
                continue
            entry = parse_attributes(entry)
            transcript_id = entry.attributes.get('transcript_id')
            transcript_biotype = entry.attributes.get('transcript_biotype')
            gene_biotype = entry.attributes.get('gene_biotype')
            gene_name = entry.attributes.get('gene_name')

            # Skip irrelevant biotypes
            if transcript_biotype not in ENSEMBL_VDJ_BIOTYPES and gene_biotype not in ENSEMBL_VDJ_BIOTYPES:
                continue

            # Skip blacklisted transcripts
            if transcript_id in rm_transcripts:
                continue

            # Warn and skip if transcript_id missing
            if transcript_id is None:
                print 'Warning: Entry on row %d has no transcript_id' % line_no
                continue

            # Warn and skip if gene_name missing
            if gene_name is None:
                print 'Warning: Transcript %s on row %d has biotype %s but no gene_name. Skipping.' % (
                    transcript_id, line_no, transcript_biotype)
                continue

            # Infer region type from biotype
            if transcript_biotype in ENSEMBL_VDJ_BIOTYPES:
                vdj_feature = infer_ensembl_vdj_feature_type(
                    entry.feature, transcript_biotype)
            else:
                vdj_feature = infer_ensembl_vdj_feature_type(
                    entry.feature, gene_biotype)

            # Warn and skip if region type could not be inferred
            if vdj_feature is None:
                print 'Warning: Transcript %s has biotype %s. Could not infer VDJ gene type. Skipping.' % (
                    transcript_id, transcript_biotype)
                continue

            # Features that share a transcript_id and feature type are presumably exons
            # so keep them together.
            transcripts[(transcript_id, vdj_feature)].append(entry)

        print '...done.\n'

    print 'Computing hash of genes GTF files...'
    digest = hashlib.sha1()
    # concatenate all the hashes into a string and then hash that string
    digest.update(
        reduce(lambda x, y: x + y,
               [cr_utils.compute_hash_of_file(gtf) for gtf in gtf_paths]))
    gtf_hash = digest.hexdigest()
    print '...done.\n'

    print 'Fetching sequences...'
    out_fasta = open(get_vdj_reference_fasta(reference_path), 'w')

    feature_id = 1
    seen_features = set()

    for (transcript_id, region_type), regions in transcripts.iteritems():
        if not all(r.chrom == regions[0].chrom for r in regions):
            chroms = sorted(list(set([r.chrom for r in regions])))
            print 'Warning: Transcript %s spans multiple contigs: %s. Skipping.' % (
                transcript_id, str(chroms))
            continue

        if not all(r.strand == regions[0].strand for r in regions):
            print 'Warning: Transcript %s spans multiple strands. Skipping.' % transcript_id
            continue

        chrom = regions[0].chrom
        strand = regions[0].strand
        ens_gene_name = standardize_ensembl_gene_name(
            regions[0].attributes['gene_name'])
        transcript_id = regions[0].attributes['transcript_id']

        if chrom not in genome_fasta:
            print 'Warning: Transcript %s is on contig "%s" which is not in the provided reference fasta. Skipping.' % (
                transcript_id, chrom)
            continue

        # Build sequence
        regions.sort(key=lambda r: r.start)
        seq = ''
        for region in regions:
            # GTF coordinates are 1-based
            start, end = int(region.start) - 1, int(region.end)
            seq += genome_fasta.fetch(chrom, start, end)

        # Revcomp if transcript on reverse strand
        if strand == '-':
            seq = tk_seq.get_rev_comp(seq)

        # Strip Ns from termini
        if 'N' in seq:
            print 'Warning: Feature %s contains Ns. Stripping from the ends.' % str(
                (ens_gene_name, transcript_id, region_type))
            seq = seq.strip('N')

        if len(seq) == 0:
            print 'Warning: Feature %s is all Ns. Skipping.' % str(
                (ens_gene_name, transcript_id, region_type))
            continue

        # Infer various attributes from the Ensembl gene name
        record_id = transcript_id
        gene_name = ens_gene_name
        display_name = make_display_name(gene_name=gene_name, allele_name=None)
        chain = infer_ensembl_vdj_chain(gene_name)
        chain_type = infer_ensembl_vdj_chain_type(gene_name)
        # Ensembl doesn't encode alleles
        allele_name = '00'

        # Disallow spaces in these fields
        if ' ' in region_type:
            raise ValueError('Spaces not allowed in region type: "%s"' %
                             region_type)
        if ' ' in gene_name:
            raise ValueError('Spaces not allowed in gene name: "%s"' %
                             gene_name)
        if ' ' in record_id:
            raise ValueError('Spaces not allowed in record ID: "%s"' %
                             record_id)

        # Warn on features we couldn't classify properly
        if chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
            print ('Warning: Could not infer chain type for: %s. ' + \
                'Expected the first two characters of the gene name to be in %s. Feature skipped.') % \
                (str((gene_name, record_id, region_type)),
                 str(tuple(vdj_constants.VDJ_CHAIN_TYPES)))
            continue

        if region_type in vdj_constants.VDJ_C_FEATURE_TYPES and chain in vdj_constants.CHAINS_WITH_ISOTYPES:
            isotype = infer_ensembl_isotype(ens_gene_name)
        else:
            isotype = None

        feature = VdjAnnotationFeature(
            feature_id=feature_id,
            record_id=record_id,
            display_name=display_name,
            gene_name=gene_name,
            region_type=region_type,
            chain_type=chain_type,
            chain=chain,
            isotype=isotype,
            allele_name=allele_name,
            sequence=seq,
        )

        # Don't add duplicate entries
        feat_key = get_duplicate_feature_key(feature)
        if feat_key in seen_features:
            print 'Warning: Skipping duplicate entry for %s (%s, %s).' % (
                display_name, region_type, record_id)
            continue
        seen_features.add(feat_key)

        feature_id += 1

        out_fasta.write(convert_vdj_feature_to_fasta_entry(feature) + '\n')
    print '...done.\n'

    print 'Deleting copy of genome fasta...'
    os.remove(tmp_genome_fa_path)
    os.remove(tmp_genome_fa_path + '.fai')
    print '...done.\n'

    print 'Writing metadata JSON file into reference folder...'
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY:
        reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY:
        fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY:
        gtf_hash,
        cr_constants.REFERENCE_INPUT_FASTA_KEY:
        os.path.basename(genome_fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY:
        ','.join([os.path.basename(gtf_path) for gtf_path in gtf_paths]),
        cr_constants.REFERENCE_VERSION_KEY:
        ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY:
        mkref_version,
        cr_constants.REFERENCE_TYPE_KEY:
        vdj_constants.REFERENCE_TYPE,
    }
    with open(
            os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE),
            'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata),
                  json_file,
                  sort_keys=True,
                  indent=4)
    print '...done.\n'
Example #26
def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    if len(chunk_outs) == 0:
        # No input reads
        # Create empty BAM file
        with open(outs.contig_bam, 'w') as f:
            pass
        outs.contig_bam_bai = None
        # Create empty contig FASTA
        with open(outs.contig_fasta, 'w') as f:
            pass
        outs.contig_fasta_fai = None
        # Create empty contig FASTQ
        with open(outs.contig_fastq, 'w') as f:
            pass
        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    summary_tsvs = []
    umi_summary_tsvs = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)

        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)

        summary_tsvs.append(chunk_out.summary_tsv)
        umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_io.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        tk_subproc.check_call('samtools faidx %s' % outs.contig_fasta,
                              shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_io.concatenate_files(outs.contig_fastq, contig_fastqs)

    if len(summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if len(umi_summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.umi_summary_tsv,
                                         umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        #  risks hitting the filehandle limit.
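        # Sketch of the loop below with a hypothetical MERGE_BAMS_N of 2:
        #   [a, b, c] -> merge(a, b) = m0 -> [c, m0] -> merge(c, m0) = m1 -> [m1]
        # so each pass consumes the head of the list and appends one merged BAM.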
        n_merged = 0

        while len(contig_bams) > 1:
            to_merge = contig_bams[0:MERGE_BAMS_N]

            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1

            print "Merging %d BAMs into %s ..." % (len(to_merge), tmp_bam)
            tk_bam.merge(tmp_bam, to_merge, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for in_bam in to_merge:
                if os.path.basename(in_bam).startswith('merged-'):
                    cr_io.remove(in_bam)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(to_merge):] + [tmp_bam]

        if os.path.basename(contig_bams[0]).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_io.move(contig_bams[0], outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_io.copy(contig_bams[0], outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_io.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f,
                  indent=4,
                  sort_keys=True)
Example #27
def filter_barcodes(args, outs):
    random.seed(0)
    np.random.seed(0)

    correction_data = pd.read_csv(args.barcode_correction_csv)
    raw_matrix = cr_matrix.CountMatrix.load_h5_file(args.matrices_h5)
    if np.isin(rna_library.ANTIBODY_LIBRARY_TYPE,
               correction_data.library_type):
        matrix, metrics_to_report, removed_bcs_df = remove_bcs_with_high_umi_corrected_reads(
            correction_data, raw_matrix)
        ### Report all identified aggregate barcodes, together with their reads,
        ### UMI-corrected reads, fraction of corrected reads, and fraction of total reads
        removed_bcs_df.to_csv(outs.aggregate_barcodes)
        summary = metrics_to_report
    else:
        matrix = raw_matrix
        summary = {}

    if args.cell_barcodes is not None:
        method = FilterMethod.MANUAL
    elif args.force_cells is not None:
        method = FilterMethod.TOP_N_BARCODES
    else:
        method = FilterMethod.ORDMAG_NONAMBIENT

    summary['total_diversity'] = matrix.bcs_dim
    summary['filter_barcodes_method'] = get_filter_method_name(method)

    # Get unique gem groups
    unique_gem_groups = sorted(list(set(args.gem_groups)))

    # Get per-gem group cell load
    if args.recovered_cells is not None:
        gg_recovered_cells = int(
            float(args.recovered_cells) / float(len(unique_gem_groups)))
    else:
        gg_recovered_cells = cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP
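    # e.g. (hypothetical) recovered_cells=6000 split across 3 gem groups
    # gives gg_recovered_cells = 2000 per gem group.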

    if args.force_cells is not None:
        gg_force_cells = int(
            float(args.force_cells) / float(len(unique_gem_groups)))

    # Only use gene expression matrix for cell calling
    gex_matrix = matrix.view().select_features_by_type(
        lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

    # Make initial cell calls for each genome separately
    genomes = gex_matrix.get_genomes()

    # (gem_group, genome) => dict
    filtered_metrics_groups = OrderedDict()
    # (gem_group, genome) => list of barcode strings
    filtered_bcs_groups = OrderedDict()

    for genome in genomes:
        genome_matrix = gex_matrix.select_features_by_genome(genome)

        # Make initial cell calls for each gem group individually
        for gem_group in unique_gem_groups:

            gg_matrix = genome_matrix.select_barcodes_by_gem_group(gem_group)

            if method == FilterMethod.ORDMAG or \
               method == FilterMethod.ORDMAG_NONAMBIENT:
                gg_total_diversity = gg_matrix.bcs_dim
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_ordmag(
                    gg_bc_counts, gg_recovered_cells, gg_total_diversity)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            elif method == FilterMethod.MANUAL:
                with open(args.cell_barcodes) as f:
                    cell_barcodes = json.load(f)
                gg_filtered_bcs, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_manual(
                    gg_matrix, cell_barcodes)

            elif method == FilterMethod.TOP_N_BARCODES:
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_fixed_cutoff(
                    gg_bc_counts, gg_force_cells)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            else:
                martian.exit("Unsupported BC filtering method: %s" % method)

            if msg is not None:
                martian.log_info(msg)

            filtered_metrics_groups[(gem_group, genome)] = gg_filtered_metrics
            filtered_bcs_groups[(gem_group, genome)] = gg_filtered_bcs

    # Do additional cell calling
    outs.nonambient_calls = None

    if method == FilterMethod.ORDMAG_NONAMBIENT:
        # We need the full gene expression matrix instead of just a view
        full_gex_matrix = matrix.select_features_by_type(
            lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

        # Track these for recordkeeping
        eval_bcs_arrays = []
        umis_per_bc_arrays = []
        loglk_arrays = []
        pvalue_arrays = []
        pvalue_adj_arrays = []
        nonambient_arrays = []
        genome_call_arrays = []

        # Do it by gem group, but agnostic to genome
        for gg in unique_gem_groups:
            gg_matrix = full_gex_matrix.select_barcodes_by_gem_group(gg)

            # Take union of initial cell calls across genomes
            gg_bcs = sorted(
                list(
                    reduce(set.union, [
                        set(bcs)
                        for group, bcs in filtered_bcs_groups.iteritems()
                        if group[0] == gg
                    ])))

            result = cr_cell.find_nonambient_barcodes(gg_matrix, gg_bcs)
            if result is None:
                print 'Failed at attempt to call non-ambient barcodes in GEM group %s' % gg
                continue

            # Assign a genome to the cell calls by argmax genome counts
            genome_counts = []
            for genome in genomes:
                genome_counts.append(gg_matrix.view() \
                                     .select_features_by_genome(genome) \
                                     .select_barcodes(result.eval_bcs) \
                                     .get_counts_per_bc())
            genome_counts = np.column_stack(genome_counts)
            genome_calls = np.array(genomes)[np.argmax(genome_counts, axis=1)]
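            # e.g. (hypothetical) genomes = ['GRCh38', 'mm10'] and a barcode
            # with counts [120, 3] is called as 'GRCh38' (the argmax column).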

            umis_per_bc = gg_matrix.get_counts_per_bc()

            eval_bcs_arrays.append(np.array(gg_matrix.bcs)[result.eval_bcs])
            umis_per_bc_arrays.append(umis_per_bc[result.eval_bcs])
            loglk_arrays.append(result.log_likelihood)
            pvalue_arrays.append(result.pvalues)
            pvalue_adj_arrays.append(result.pvalues_adj)
            nonambient_arrays.append(result.is_nonambient)
            genome_call_arrays.append(genome_calls)

            # Update the lists of cell-associated barcodes
            for genome in genomes:
                eval_bc_strs = np.array(gg_matrix.bcs)[result.eval_bcs]
                filtered_bcs_groups[(gg, genome)].extend(
                    eval_bc_strs[(genome_calls == genome)
                                 & (result.is_nonambient)])

        if len(eval_bcs_arrays) > 0:
            nonambient_summary = pd.DataFrame(
                OrderedDict([
                    ('barcode', np.concatenate(eval_bcs_arrays)),
                    ('umis', np.concatenate(umis_per_bc_arrays)),
                    ('ambient_loglk', np.concatenate(loglk_arrays)),
                    ('pvalue', np.concatenate(pvalue_arrays)),
                    ('pvalue_adj', np.concatenate(pvalue_adj_arrays)),
                    ('nonambient', np.concatenate(nonambient_arrays)),
                    ('genome', np.concatenate(genome_call_arrays)),
                ]))
            nonambient_summary.to_csv(outs.nonambient_calls)

    # Record all filtered barcodes
    genome_filtered_bcs = defaultdict(set)
    filtered_bcs = set()
    for (gem_group, genome), bcs in filtered_bcs_groups.iteritems():
        genome_filtered_bcs[genome].update(bcs)
        filtered_bcs.update(bcs)

    # Combine initial-cell-calling metrics
    for genome in genomes:
        # Merge metrics over all gem groups for this genome
        txome_metrics = [
            v for k, v in filtered_metrics_groups.iteritems() if k[1] == genome
        ]
        txome_summary = cr_stats.merge_filtered_metrics(txome_metrics)

        # Append method name to metrics
        summary.update({
            ('%s_%s_%s' % (genome,
                           key,
                           get_filter_method_name(method))): txome_summary[key] \
            for (key,_) in txome_summary.iteritems()})

        summary['%s_filtered_bcs' % genome] = len(genome_filtered_bcs[genome])

        # NOTE: This metric only applies to the initial cell calls
        summary['%s_filtered_bcs_cv' %
                genome] = txome_summary['filtered_bcs_cv']

    # Deduplicate and sort filtered barcode sequences
    # Sort by (gem_group, barcode_sequence)
    barcode_sort_key = lambda x: cr_utils.split_barcode_seq(x)[::-1]
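    # e.g. 'AAACCTGAGAAACCAT-1' -> key (1, 'AAACCTGAGAAACCAT'), assuming
    # split_barcode_seq returns (sequence, gem_group).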

    for genome, bcs in genome_filtered_bcs.iteritems():
        genome_filtered_bcs[genome] = sorted(list(set(bcs)),
                                             key=barcode_sort_key)
    filtered_bcs = sorted(list(set(filtered_bcs)), key=barcode_sort_key)

    # Re-compute various metrics on the filtered matrix
    reads_summary = cr_utils.merge_jsons_as_dict(
        [args.raw_fastq_summary, args.attach_bcs_summary])
    matrix_summary = rna_report_mat.report_genomes(
        matrix,
        reads_summary=reads_summary,
        barcode_summary_h5_path=args.barcode_summary,
        recovered_cells=args.recovered_cells,
        cell_bc_seqs=genome_filtered_bcs)

    # Write metrics json
    combined_summary = matrix_summary.copy()
    combined_summary.update(summary)
    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(combined_summary),
                  f,
                  indent=4,
                  sort_keys=True)

    # Write the filtered barcodes file
    write_filtered_barcodes(outs.filtered_barcodes, genome_filtered_bcs)

    # Select cell-associated barcodes
    filtered_matrix = matrix.select_barcodes_by_seq(filtered_bcs)

    return filtered_matrix
Example #28
def join(args, outs, chunk_defs, chunk_outs):
    # Merge tallies
    data = None
    for chunk in chunk_outs:
        with open(chunk.metrics) as f:
            chunk_data = cPickle.load(f)
        if data is None:
            data = chunk_data
        else:
            for k, v in data.iteritems():
                data[k] += chunk_data[k]

    # Compute metrics for each subsampling rate
    summary = {}

    with MoleculeCounter.open(args.molecule_info, 'r') as mc:
        genomes = sorted(
            set(
                f.tags.get('genome', '')
                for f in mc.feature_reference.feature_defs))
        lib_types = sorted(set(lib['library_type'] for lib in mc.library_info))
        lib_type_map = dict((lt, idx) for (idx, lt) in enumerate(lib_types))
    cell_bcs_by_genome = get_cell_associated_barcodes(genomes,
                                                      args.filtered_barcodes)

    # Give each cell-associated barcode an integer index
    cell_bcs = sorted(list(cell_bcs_by_genome['']))
    cell_bc_to_int = {bc: i for i, bc in enumerate(cell_bcs)}

    subsample_info = chunk_defs[0].subsample_info if len(
        chunk_defs) > 0 else []

    for i, task in enumerate(subsample_info):
        lib_type = task['library_type']
        lib_type_idx = lib_type_map[lib_type]
        ss_type = task['subsample_type']
        ss_depth = task['target_read_pairs_per_cell']

        if rna_library.has_genomes(lib_type):
            genome_ints = list(range(data['umis_per_bc'].shape[1]))
        else:
            genome_ints = [0]

        # Per-genome metrics
        for g in genome_ints:
            if not data['lib_type_genome_any_reads'][lib_type_idx, g]:
                continue
            genome = genomes[g]

            # Only compute on cell-associated barcodes for this genome.
            # This only matters when there are multiple genomes present.
            cell_inds = np.array(
                sorted(cell_bc_to_int[bc]
                       for bc in cell_bcs_by_genome[genome]))

            median_umis_per_cell = np.median(data['umis_per_bc'][i, g,
                                                                 cell_inds])
            summary[make_metric_name('subsampled_filtered_bcs_median_counts',
                                     lib_type, genome, ss_type,
                                     ss_depth)] = median_umis_per_cell

            median_features_per_cell = np.median(
                data['features_det_per_bc'][i, g, cell_inds])
            summary[make_metric_name(
                'subsampled_filtered_bcs_median_unique_genes_detected',
                lib_type, genome, ss_type,
                ss_depth)] = median_features_per_cell

            dup_frac = compute_dup_frac(data['read_pairs'][i, g],
                                        data['umis'][i, g])
            summary[make_metric_name('subsampled_duplication_frac', lib_type,
                                     genome, ss_type, ss_depth)] = dup_frac

        # Whole-dataset duplication frac
        all_read_pairs = np.sum(data['read_pairs'][i, :])
        all_umis = np.sum(data['umis'][i, :])
        dup_frac = compute_dup_frac(all_read_pairs, all_umis)
        summary[make_metric_name('subsampled_duplication_frac', lib_type,
                                 lib_constants.MULTI_REFS_PREFIX, ss_type,
                                 ss_depth)] = dup_frac

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f,
                  indent=4,
                  sort_keys=True)