Exemplo n.º 1
0
    def build_from_mol_counter(molecule_counter, subsample_rate=1.0,
                               subsample_result=None):
        """ Build a GeneBCMatrices object from the molecules of a MoleculeCounter.

            Args:
                molecule_counter - Provides the barcode whitelist, barcode length,
                                   gem groups, reference columns and molecule iterator.
                subsample_rate (float) - Forwarded unchanged to
                                   molecule_counter.get_molecule_iter().
                subsample_result (dict) - Optional. When not None, the sum of
                                   mol.reads over the iterated molecules is stored
                                   under the 'mapped_reads' key.

            Returns:
                The GeneBCMatrices with one add() call per iterated molecule. """

        # Rebuild the barcode sequences used by the original matrices
        whitelist = cr_utils.load_barcode_whitelist(molecule_counter.get_barcode_whitelist())
        # Use the first whitelist entry's length when the counter reports a falsy length
        bc_length = molecule_counter.get_barcode_length() or len(whitelist[0])
        barcode_seqs = cr_utils.format_barcode_seqs(whitelist, molecule_counter.get_gem_groups())

        # Rebuild Gene tuples (only id and name are populated) from the ref columns
        ref_gene_ids = molecule_counter.get_ref_column('gene_ids')
        genome_ids = molecule_counter.get_ref_column('genome_ids')
        ref_gene_names = molecule_counter.get_ref_column('gene_names')
        gene_tuples = []
        for gene_id, gene_name in itertools.izip(ref_gene_ids, ref_gene_names):
            gene_tuples.append(cr_constants.Gene(gene_id, gene_name, None, None, None))
        genes = cr_utils.split_genes_by_genomes(gene_tuples, genome_ids)

        matrices = GeneBCMatrices(genome_ids, genes, barcode_seqs)

        # Accumulate the read total while filling the matrices
        total_reads = 0
        molecule_iter = molecule_counter.get_molecule_iter(bc_length, subsample_rate=subsample_rate)
        for molecule in molecule_iter:
            matrices.add(molecule.genome, molecule.gene_id, molecule.barcode)
            total_reads += molecule.reads

        if subsample_result is None:
            return matrices

        subsample_result['mapped_reads'] = total_reads
        return matrices
Exemplo n.º 2
0
    def load(group):
        """ Rebuild a GeneBCMatrix from the attributes of a group.

            Args:
                group - Object exposing the cr_constants.H5_* attributes, each of
                        which has a read() method (presumably an h5 group; confirm
                        against the caller).

            Returns:
                A GeneBCMatrix whose .m is a scipy CSC matrix built from the
                stored data / indices / indptr arrays and shape.

            Gene names fall back to the gene ids when the group has no
            gene-names attribute. """

        def read_attr(attr_name):
            # Every stored field is fetched the same way: attribute lookup, then read()
            return getattr(group, attr_name).read()

        gene_ids = list(read_attr(cr_constants.H5_GENE_IDS_ATTR))

        if hasattr(group, cr_constants.H5_GENE_NAMES_ATTR):
            gene_names = list(read_attr(cr_constants.H5_GENE_NAMES_ATTR))
        else:
            gene_names = gene_ids

        assert len(gene_ids) == len(gene_names)

        genes = []
        for gene_id, gene_name in itertools.izip(gene_ids, gene_names):
            genes.append(cr_constants.Gene(gene_id, gene_name, None, None, None))

        barcodes = list(read_attr(cr_constants.H5_BCS_ATTR))
        matrix = GeneBCMatrix(genes, barcodes)

        shape = read_attr(cr_constants.H5_MATRIX_SHAPE_ATTR)
        data = read_attr(cr_constants.H5_MATRIX_DATA_ATTR)
        indices = read_attr(cr_constants.H5_MATRIX_INDICES_ATTR)
        indptr = read_attr(cr_constants.H5_MATRIX_INDPTR_ATTR)

        # Sanity check: indptr must never decrease (catches overflow bugs)
        indptr_steps = np.diff(indptr)
        assert np.all(indptr_steps >= 0)

        matrix.m = sp_sparse.csc_matrix((data, indices, indptr), shape=shape)
        return matrix
Exemplo n.º 3
0
def load_snps(filename):
    """ Read a JSON file of SNPs and wrap each entry as a Gene tuple.

        Each entry is converted with str() and used as the Gene id; the name is
        the empty string and the remaining Gene fields are None. """
    # HACK: SNPs are stored as Gene tuples so the GeneBCMatrices code can be reused
    with open(filename, 'r') as snp_file:
        snp_entries = json.load(snp_file)

    snp_genes = []
    for entry in snp_entries:
        snp_genes.append(cr_constants.Gene(str(entry), '', None, None, None))
    return snp_genes
Exemplo n.º 4
0
    def load_genes_from_h5_group(group):
        """ Load only the gene list from an h5 group.

            Returns a list of Gene tuples with id and name populated and the
            remaining fields set to None. When the group has no gene-names
            attribute, each gene's name is its id. """
        ids = list(getattr(group, cr_constants.H5_GENE_IDS_ATTR).read())

        names = ids
        if hasattr(group, cr_constants.H5_GENE_NAMES_ATTR):
            names = list(getattr(group, cr_constants.H5_GENE_NAMES_ATTR).read())

        assert len(ids) == len(names)

        return [
            cr_constants.Gene(gene_id, gene_name, None, None, None)
            for gene_id, gene_name in itertools.izip(ids, names)
        ]
Exemplo n.º 5
0
 def load_mtx(genome_dir):
     """ Load a GeneBCMatrix from a directory of barcodes.tsv, genes.tsv and matrix.mtx.

         Args:
             genome_dir (str) - Directory containing the three files.

         Returns:
             A GeneBCMatrix whose genes are Gene tuples carrying only an id (first
             column of genes.tsv), whose barcodes are the first column of
             barcodes.tsv, and whose .m is the matrix returned by mmread.

         Raises:
             IOError - If any of the three required files does not exist. """
     barcodes_tsv = os.path.join(genome_dir, "barcodes.tsv")
     genes_tsv = os.path.join(genome_dir, "genes.tsv")
     matrix_mtx = os.path.join(genome_dir, "matrix.mtx")
     for filepath in [barcodes_tsv, genes_tsv, matrix_mtx]:
         if not os.path.exists(filepath):
             raise IOError("Required file not found: %s" % filepath)

     # Select column 0 explicitly instead of calling squeeze(): squeeze() turns a
     # one-row file into a 0-d array, which cannot be iterated or used as a list
     # of barcodes. Indexing the column always yields a 1-D array.
     barcodes = pd.read_csv(barcodes_tsv, delimiter='\t', header=None, usecols=[0]).values[:, 0]
     gene_ids = pd.read_csv(genes_tsv, delimiter='\t', header=None, usecols=[0]).values[:, 0]
     genes = [cr_constants.Gene(gene_id, None, None, None, None) for gene_id in gene_ids]

     matrix = sp_io.mmread(matrix_mtx)
     gbm = GeneBCMatrix(genes, barcodes)
     gbm.m = matrix
     return gbm
Exemplo n.º 6
0
    def load_gtf(self, in_gtf_fn, fasta_parser=None):
        """ Build transcript and gene records from the exon rows of a GTF file.

            Args:
                in_gtf_fn - GTF path, passed to self.gtf_reader_iter().
                fasta_parser - Optional object providing
                    get_transcript_gc_content(transcript). When None, every GC
                    content in the result is None.

            Returns:
                (transcripts, genes) where
                transcripts - dict mapping transcript id to
                    Transcript(gene, length, gc_content, intervals);
                genes - list of Gene(id, name, length, gc_content, intervals) in
                    order of first appearance in the file. A gene's length and
                    GC content are the medians over its transcripts, and it has
                    one interval per chromosome spanning all of its exons. """
        transcripts = {}
        # Ordered so the returned gene list follows first appearance in the GTF
        gene_to_transcripts = collections.OrderedDict()

        for row, is_comment, properties in self.gtf_reader_iter(in_gtf_fn):
            if is_comment:
                continue

            chrom, _, annotation, start, end, _, strand, _, _ = row

            # Only exon rows contribute to transcripts and genes
            if annotation != "exon":
                continue

            # GTF coordinates are 1-based inclusive; shifting only the start
            # gives a 0-based half-open interval
            start = int(start) - 1
            end = int(end)
            length = abs(end - start)
            transcript_id = properties['transcript_id']
            gene_id = properties['gene_id']
            gene_name = properties.get('gene_name', gene_id)
            gene = cr_constants.Gene(gene_id, gene_name, None, None, None)

            if transcript_id not in transcripts:
                transcripts[transcript_id] = cr_constants.Transcript(
                    gene, None, None, [])

            if gene not in gene_to_transcripts:
                gene_to_transcripts[gene] = set()

            # A transcript id must always map to the same gene
            assert transcripts[transcript_id].gene == gene
            transcripts[transcript_id].intervals.append(
                cr_constants.Interval(chrom, start, end, length, strand))
            gene_to_transcripts[gene].add(transcript_id)

        # Transcript length and GC content
        transcript_lengths = {}
        transcript_gc_contents = {}
        for transcript_id, transcript in transcripts.iteritems():
            transcript_lengths[transcript_id] = sum(
                [interval.length for interval in transcript.intervals])
            if fasta_parser is not None:
                transcript_gc_contents[
                    transcript_id] = fasta_parser.get_transcript_gc_content(
                        transcript)

        # Gene length, GC content and start + end positions
        genes = []
        for gene, transcript_ids in gene_to_transcripts.iteritems():
            length = np.median([
                transcript_lengths[transcript_id]
                for transcript_id in transcript_ids
            ])
            # Without a fasta_parser no GC content was computed, so report None
            # rather than indexing the empty dict
            if fasta_parser is not None:
                gc_content = np.median([
                    transcript_gc_contents[transcript_id]
                    for transcript_id in transcript_ids
                ])
            else:
                gc_content = None

            transcript_intervals = []
            for transcript_id in transcript_ids:
                transcript_intervals += transcripts[transcript_id].intervals
            # groupby only merges adjacent items, so sort by the grouping key first
            transcript_intervals.sort(key=lambda interval: interval.chrom)

            intervals = []
            for chrom, chrom_intervals_iter in itertools.groupby(
                    transcript_intervals, lambda interval: interval.chrom):
                chrom_intervals = list(chrom_intervals_iter)
                start = min([interval.start for interval in chrom_intervals])
                end = max([interval.end for interval in chrom_intervals])
                interval = cr_constants.Interval(chrom, start, end,
                                                 end - start, None)
                intervals.append(interval)

            gene = cr_constants.Gene(gene.id, gene.name, length, gc_content,
                                     intervals)
            genes.append(gene)

            # Re-point each transcript at the fully populated gene record
            for transcript_id in transcript_ids:
                transcripts[transcript_id] = cr_constants.Transcript(
                    gene, transcript_lengths[transcript_id],
                    transcript_gc_contents.get(transcript_id),
                    transcripts[transcript_id].intervals)

        return transcripts, genes
Exemplo n.º 7
0
 def select_genes(self, gene_indices):
     """ Return a new GeneBCMatrix restricted to the genes at gene_indices.

         The new genes keep only the first two fields of each selected gene
         (the remaining Gene fields are None), the barcode list is copied, and
         the matrix rows are taken from self.m with the same indices. """
     gene_table = np.array(self.genes)
     picked_rows = gene_table[gene_indices]

     new_genes = []
     for fields in picked_rows:
         new_genes.append(cr_constants.Gene(fields[0], fields[1], None, None, None))

     subset = GeneBCMatrix(new_genes, list(self.bcs))
     subset.m = self.m[gene_indices, :]
     return subset