def from_peaks_bed(peaks_path, genomes):
    """Build a FeatureReference from a BED file of peaks.

    Args:
        peaks_path (str): Path to peaks bed file. Can be None.
        genomes: Genomes on which the peaks were defined; these appear as
            prefixes in peak names.

    Returns:
        FeatureReference with 'genome' and 'derivation' tag keys.
    """
    all_tag_keys = ['genome', 'derivation']
    feature_defs = []

    if peaks_path:
        # Encode each BED row (contig, start, stop, ...) as "contig:start-stop".
        with open(peaks_path, 'rU') as pf:
            peak_ids = ["{}:{}-{}".format(*row.strip("\n").split("\t"))
                        for row in pf]

        for peak_id in peak_ids:
            genome = get_genome_from_contig(peak_id, genomes)
            fd = FeatureDef(index=len(feature_defs),
                            id=peak_id,
                            name=peak_id,
                            feature_type=lib_constants.ATACSEQ_LIBRARY_TYPE,
                            tags={'genome': genome, 'derivation': ''})
            feature_defs.append(fd)

    return FeatureReference(feature_defs, all_tag_keys)
def from_motif_list(motif_list):
    """Build a FeatureReference from a list of motif names.

    Args:
        motif_list (list): list of motif names. Can be None.

    Returns:
        FeatureReference with 'genome' and 'derivation' tag keys.
    """
    all_tag_keys = ['genome', 'derivation']
    feature_defs = []

    # Treat None the same as an empty list.
    for motif in (motif_list or []):
        fd = FeatureDef(
            index=len(feature_defs),
            id=motif,
            name=get_name_from_motif(motif),
            feature_type=lib_constants.ATACSEQ_LIBRARY_DERIVED_TYPE,
            tags={'genome': get_genome_from_motif(motif),
                  'derivation': 'POOL'})
        feature_defs.append(fd)

    return FeatureReference(feature_defs, all_tag_keys)
def _update_feature_ref(self):
    """Rebuild self.feature_ref so it matches the current feature mask."""
    kept = np.flatnonzero(self.feature_mask)
    source_defs = self.matrix.feature_ref.feature_defs
    self.feature_ref = FeatureReference(
        feature_defs=[source_defs[i] for i in kept],
        all_tag_keys=self.matrix.feature_ref.all_tag_keys)
def from_bed_and_motifs(peaks_path, motif_list, genomes):
    """Build a FeatureReference from a peaks BED file plus a motif list.

    Args:
        peaks_path (str): bed file of peaks. Can be None.
        motif_list (list): list of motif names. Can be None.
        genomes: Genomes for which the peaks and motifs are identified.

    Returns:
        FeatureReference containing peak features followed by motif features.
    """
    all_tag_keys = ['genome', 'derivation']
    feature_defs = []

    # Peaks first: encode each BED row as "contig:start-stop".
    if peaks_path:
        with open(peaks_path, 'rU') as pf:
            peak_ids = ["{}:{}-{}".format(*row.strip("\n").split("\t"))
                        for row in pf]
        for peak_id in peak_ids:
            feature_defs.append(FeatureDef(
                index=len(feature_defs),
                id=peak_id,
                name=peak_id,
                feature_type=lib_constants.ATACSEQ_LIBRARY_TYPE,
                tags={'genome': get_genome_from_contig(peak_id, genomes),
                      'derivation': ''}))

    # Then motifs, appended after the peaks.
    if motif_list:
        for motif in motif_list:
            feature_defs.append(FeatureDef(
                index=len(feature_defs),
                id=motif,
                name=get_name_from_motif(motif),
                feature_type=lib_constants.ATACSEQ_LIBRARY_DERIVED_TYPE,
                tags={'genome': get_genome_from_motif(motif),
                      'derivation': 'POOL'}))

    return FeatureReference(feature_defs, all_tag_keys)
def from_transcriptome_and_csv(gene_ref_path, feature_def_filename):
    """Create a FeatureReference from a transcriptome reference and a
    feature barcode definition CSV.

    Args:
        gene_ref_path (str): Path to transcriptome reference. Can be None.
        feature_def_filename (str): Path to Feature Definition CSV file. Can be None.

    Returns:
        FeatureReference
    """
    feature_defs = []
    all_tag_keys = ['genome']

    genomes = cr_utils.get_reference_genomes(gene_ref_path)

    if gene_ref_path is not None:
        gene_index = cr_reference.GeneIndex.load_pickle(
            cr_utils.get_reference_genes_index(gene_ref_path))

        # One FeatureDef per gene in the transcriptome index.
        for gene in gene_index.genes:
            genome = cr_utils.get_genome_from_str(gene.id, genomes)
            feature_defs.append(FeatureDef(
                index=len(feature_defs),
                id=gene.id,
                name=gene.name,
                feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                tags={'genome': genome}))

    if feature_def_filename is not None:
        csv_feature_defs, csv_tag_keys = parse_feature_def_file(
            feature_def_filename, index_offset=len(feature_defs))

        # Any CRISPR 'target_gene_id' field must match a transcriptome entry.
        check_crispr_target_gene(csv_feature_defs, feature_defs)

        feature_defs.extend(csv_feature_defs)
        all_tag_keys.extend(csv_tag_keys)

    return FeatureReference(feature_defs, all_tag_keys)
def build_feature_ref(gene_ids, gene_names, genome_index):
    """Build a gene-expression FeatureReference.

    Args:
        gene_ids: sequence of gene id strings.
        gene_names: sequence of gene name strings, parallel to gene_ids.
        genome_index: dict keyed by genome name. When it holds a single
            genome, every feature is tagged with it; otherwise each gene id
            is expected to carry its genome as a '_'-separated prefix.

    Returns:
        FeatureReference with a 'genome' tag per feature.
    """
    single_genome = None
    if len(genome_index) == 1:
        # next(iter(...)) rather than .keys()[0]: dict views are not
        # indexable under Python 3, and this is equivalent in Python 2.
        single_genome = next(iter(genome_index))

    # Single loop; only the genome tag differs between the two cases.
    feature_defs = []
    for idx, (gene_id, gene_name) in enumerate(zip(gene_ids, gene_names)):
        genome = single_genome if single_genome is not None \
            else gene_id.split('_')[0]
        feature_defs.append(FeatureDef(
            index=idx,
            id=gene_id,
            name=gene_name,
            feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
            tags={'genome': genome}))

    return FeatureReference(feature_defs, ['genome'])
def select_features(self, indices):
    """Return a new CountMatrix restricted to the given feature indices.

    Each retained FeatureDef is renumbered so its index matches the
    feature's new row position in the resulting matrix.
    """
    renumbered_defs = []
    for new_idx, old_idx in enumerate(indices):
        fd = self.feature_ref.feature_defs[old_idx]
        renumbered_defs.append(FeatureDef(index=new_idx,
                                          id=fd.id,
                                          name=fd.name,
                                          feature_type=fd.feature_type,
                                          tags=fd.tags))

    new_ref = FeatureReference(
        feature_defs=renumbered_defs,
        all_tag_keys=self.feature_ref.all_tag_keys)

    return CountMatrix(feature_ref=new_ref,
                       bcs=self.bcs,
                       matrix=self.m[indices, :])
def from_v3_mtx(genome_dir):
    """Load a CountMatrix from a matrix-v3 MEX directory (gzipped files).

    Raises:
        IOError: if any of the three required files is missing.
    """
    barcodes_tsv = os.path.join(genome_dir, "barcodes.tsv.gz")
    features_tsv = os.path.join(genome_dir, "features.tsv.gz")
    matrix_mtx = os.path.join(genome_dir, "matrix.mtx.gz")

    for required in (barcodes_tsv, features_tsv, matrix_mtx):
        if not os.path.exists(required):
            raise IOError("Required file not found: %s" % required)

    barcodes = pd.read_csv(barcodes_tsv, delimiter='\t', header=None,
                           usecols=[0]).values.squeeze()
    features = pd.read_csv(features_tsv, delimiter='\t', header=None)

    # Columns of features.tsv: id, name, feature_type.
    feature_defs = [FeatureDef(idx, row[0], row[1], row[2], [])
                    for idx, (_, row) in enumerate(features.iterrows())]
    feature_ref = FeatureReference(feature_defs, [])

    return CountMatrix(feature_ref, barcodes, sp_io.mmread(matrix_mtx))
def from_legacy_mtx(genome_dir):
    """Load a CountMatrix from a legacy (uncompressed) MEX directory.

    Raises:
        IOError: if any of the three required files is missing.
    """
    barcodes_tsv = os.path.join(genome_dir, "barcodes.tsv")
    genes_tsv = os.path.join(genome_dir, "genes.tsv")
    matrix_mtx = os.path.join(genome_dir, "matrix.mtx")

    for required in (barcodes_tsv, genes_tsv, matrix_mtx):
        if not os.path.exists(required):
            raise IOError("Required file not found: %s" % required)

    barcodes = pd.read_csv(barcodes_tsv, delimiter='\t', header=None,
                           usecols=[0]).values.squeeze()
    genes = pd.read_csv(genes_tsv, delimiter='\t', header=None,
                        usecols=[0]).values.squeeze()

    # Legacy matrices have only gene ids; names are absent.
    feature_defs = [FeatureDef(idx, gene_id, None, "Gene Expression", [])
                    for idx, gene_id in enumerate(genes)]
    feature_ref = FeatureReference(feature_defs, [])

    return CountMatrix(feature_ref, barcodes, sp_io.mmread(matrix_mtx))
def load_feature_ref_from_h5_group(group):
    """Load just the FeatureRef from an h5py.Group."""
    return FeatureReference.from_hdf5(group[h5_constants.H5_FEATURE_REF_ATTR])
def from_legacy_v1_h5(cls, h5_file):
    """Create a CountMatrix from a legacy h5py.File (format version 1)

    V1 files store one CSC matrix per genome group. Each genome's genes
    are stacked into a single feature space and the per-genome barcode
    columns are remapped into one shared barcode space.
    """
    # Per-genome accumulators, concatenated after the loop.
    genome_arrays = []
    gene_id_arrays = []
    gene_name_arrays = []
    bc_idx_arrays = []
    feat_idx_arrays = []
    data_arrays = []

    # Map barcode string to column index in new matrix
    barcode_map = OrderedDict()

    # Construct a genome-concatenated matrix and FeatureReference
    for genome_idx, genome in enumerate(h5_file.keys()):
        g = h5_file[genome]

        # Genes stacked so far across previously-visited genomes.
        n_genes = sum(len(x) for x in gene_id_arrays)

        # Offset the row (gene) indices by the number of genes seen so far
        feat_idx_arrays.append(g['indices'][:] + n_genes)

        # Offset the col (barcode) indices by the number of nonzero elements seen so far
        # Map barcode (column) indices to a single unique barcode space
        barcodes = g['barcodes'][:]
        for bc in barcodes:
            if bc not in barcode_map:
                barcode_map[bc] = len(barcode_map)

        # Column index of each of this genome's barcodes in the merged space.
        remapped_col_inds = np.fromiter(
            (barcode_map[bc] for bc in barcodes),
            count=len(barcodes),
            dtype='uint64',
        )

        indptr = g['indptr'][:]
        # CSC invariant: one indptr entry per column plus a terminal entry.
        assert len(indptr) == 1 + len(remapped_col_inds)

        if genome_idx == 0:
            # For the first set of barcodes encountered, there should
            # be no change in their new indices.
            assert np.array_equal(remapped_col_inds,
                                  np.arange(len(indptr) - 1))

        # Convert from CSC to COO by expanding the indptr array out
        nz_elems_per_bc = np.diff(indptr)
        assert len(nz_elems_per_bc) == len(g['barcodes'])

        bc_idx = np.repeat(remapped_col_inds, nz_elems_per_bc)
        assert len(bc_idx) == len(g['indices'])
        assert len(bc_idx) == len(g['data'])

        bc_idx_arrays.append(bc_idx)
        data_arrays.append(g['data'][:])
        gene_id_arrays.append(g['genes'][:])
        gene_name_arrays.append(g['gene_names'][:])
        # Tag every gene of this group with its genome name.
        genome_arrays.append(np.repeat(genome, len(g['genes'])))

    genomes = np.concatenate(genome_arrays)
    gene_ids = np.concatenate(gene_id_arrays)
    gene_names = np.concatenate(gene_name_arrays)

    # Construct FeatureReference
    # NOTE: itertools.izip is Python 2 only; this module targets Python 2.
    feature_defs = []
    for (gene_id, gene_name, genome) in itertools.izip(gene_ids, gene_names, genomes):
        feature_defs.append(
            FeatureDef(
                index=len(feature_defs),
                id=gene_id,
                name=gene_name,
                feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                tags={'genome': genome}))
    feature_ref = FeatureReference(feature_defs, ['genome'])

    # COO triplets for the merged matrix.
    i = np.concatenate(feat_idx_arrays)
    j = np.concatenate(bc_idx_arrays)
    data = np.concatenate(data_arrays)

    assert (type(barcode_map) == OrderedDict)
    # OrderedDict preserves first-seen barcode order, so list position
    # matches the remapped column index.
    barcodes = barcode_map.keys()

    matrix = sp_sparse.csc_matrix((data, (i, j)),
                                  shape=(len(gene_ids), len(barcodes)))

    return CountMatrix(feature_ref, barcodes, matrix)
def join(args, outs, chunk_defs, chunk_outs):
    """Martian join: assemble the feature-barcode matrix and analysis h5/csv.

    Combines the filtered peak matrix with the optional TF matrix, attaches
    peak annotations as feature tags, and copies the per-factorization
    analysis results (clustering, tsne, enrichment) into the outputs.
    """
    # Nothing to do without a peak matrix or any reduction results.
    if args.filtered_peak_bc_matrix is None or not args.reduction_summary[
            'h5'].keys():
        outs.analysis = None
        outs.analysis_csv = None
        outs.feature_bc_matrix = None
        return

    # Make the FBM
    # build joint Peak + TF count matrix for single genomes
    # combine peak annotations for single genome analysis
    peak_annotation = None
    if args.peak_annotation:
        annotations = pd.read_csv(args.peak_annotation,
                                  sep='\t')[['gene', 'peak_type']]
        # NaN -> empty string so downstream splits yield [''].
        annotations = annotations.replace(np.nan, '', regex=True)
        annotations = annotations.values.astype(str).tolist()
        peak_annotation = []
        for row in annotations:
            # Parallel ';'-separated lists: gene names and their peak types.
            genes = row[0].split(";")
            annotation = row[1].split(";")
            promoter = []
            nearby_gene = []
            assert len(annotation) == len(genes)
            for en, kind in enumerate(annotation):
                # Promoter genes are a subset of nearby genes.
                if kind == 'promoter':
                    promoter += [genes[en]]
                nearby_gene += [genes[en]]
            peak_annotation += [[';'.join(promoter), ';'.join(nearby_gene)]]

    fbm = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix)
    mapping = None
    if args.filtered_tf_bc_matrix:
        # combine matrices, ensure the barcodes are same and ordered the same way
        tf_matrix = cr_matrix.CountMatrix.load_h5_file(
            args.filtered_tf_bc_matrix)
        assert (fbm.bcs == tf_matrix.bcs).all()
        if peak_annotation is not None:
            # Peak features get real annotation values; TF features get the
            # same tag keys with no values so the joined reference is uniform.
            fbm.feature_ref = FeatureReference.addtags(
                fbm.feature_ref, ['promoter', 'nearby_gene'], peak_annotation)
            tf_matrix.feature_ref = FeatureReference.addtags(
                tf_matrix.feature_ref, ['promoter', 'nearby_gene'])
        combined_feature_defs = FeatureReference.join(fbm.feature_ref,
                                                      tf_matrix.feature_ref)
        # TF rows are stacked below the peak rows.
        combined_matrix = vstack([fbm.m, tf_matrix.m])
        # explicit map linking rows in diffexp to combined matrix
        mapping = np.zeros((tf_matrix.features_dim, 2))
        for x in range(tf_matrix.features_dim):
            mapping[x, 0] = x
            mapping[x, 1] = x + fbm.features_dim
        fbm = cr_matrix.CountMatrix(combined_feature_defs, fbm.bcs,
                                    combined_matrix)
    fbm.save_h5_file(outs.feature_bc_matrix,
                     sw_version=martian.get_pipelines_version())

    # Pytables doesn't support variable len strings, so use h5py first
    with h5.File(outs.feature_bc_matrix, 'r') as matrix, \
            h5.File(outs.analysis, 'w') as out:
        # TODO: copy the first group; fixme when we have a key
        # NOTE: .keys()[0] is a Python 2 idiom (list, not a view).
        name = matrix.keys()[0]
        matrix.copy(matrix[name], out, name='matrix')

    # Pick the default factorization if available, else the first one found.
    factorizations = args.reduction_summary['h5'].keys()
    USE_FACTORIZATION = DEFAULT_FACTORIZATION if DEFAULT_FACTORIZATION in factorizations else factorizations[
        0]
    with tables.open_file(outs.analysis, 'a') as out:
        for summary, key in zip([
                args.reduction_summary, args.clustering_summary,
                args.tsne_summary, args.enrichment_analysis_summary
        ], [USE_FACTORIZATION, 'clustering', 'tsne', 'enrichment']):
            if summary is None or not summary:
                continue
            print(key, summary)
            data_h5 = summary['h5'][USE_FACTORIZATION]
            # Merge this summary's HDF5 tree into the analysis file.
            with tables.open_file(data_h5, 'r') as indata:
                indata.copy_children(indata.root, out.root, recursive=True)
            dirname = os.path.join(outs.analysis_csv, key)
            cr_io.copytree(summary['csv'][USE_FACTORIZATION], dirname)

    # if mapping is present (single genome case), so is the coloring matrix
    if mapping is not None:
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('feature_DE_map', data=mapping)
        args.coerce_strings()
        tf_propZ_matrix = np.loadtxt(args.tf_propZ_matrix)
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('diffexp_coloring_matrix',
                               data=tf_propZ_matrix)
def get_feature_ref(self):
    """Read the FeatureReference stored in this object's HDF5 file."""
    fref_group = self.h5[h5_constants.H5_FEATURE_REF_ATTR]
    return FeatureReference.from_hdf5(fref_group)
def open(filename, mode, feature_ref=None, barcodes=None, library_info=None, barcode_info=None):
    """Open a molecule info object.

    Args:
        filename (str): Filename to open or create
        mode (str): 'r' for reading, 'w' for writing.
        feature_ref (FeatureReference): Required when mode is 'w'.
        barcodes (list of str): All possible barcode sequences. Required when mode is 'w'.
        library_info (list of dict): Library metadata. Required when mode is 'w'.
        barcode_info (BarcodeInfo): Per-barcode metadata.

    Returns:
        MoleculeInfo: A new object

    Raises:
        ValueError: required write-mode argument missing, or the file's
            format version is not CURR_FILE_VERSION.
    """
    assert mode == 'r' or mode == 'w'

    mc = MoleculeCounter()

    if mode == 'w':
        if feature_ref is None:
            raise ValueError('Feature reference must be specified when opening a molecule info object for writing')
        if barcodes is None:
            raise ValueError('Barcodes must be specified when opening a molecule info object for writing')
        if library_info is None:
            raise ValueError('Library info must be specified when opening a molecule info object for writing')
        if barcode_info is None:
            raise ValueError('Barcode info must be specified when opening a molecule info object for writing')

        mc.h5 = h5py.File(filename, 'w')
        # FIX: the original wrote FILE_VERSION_KEY twice; once is enough.
        cr_io.set_hdf5_attr(mc.h5, FILE_VERSION_KEY, CURR_FILE_VERSION)
        cr_io.set_hdf5_attr(mc.h5, h5_constants.H5_FILETYPE_KEY, MOLECULE_H5_FILETYPE)
        mc.h5.create_group(METRICS_GROUP_NAME)

        # Write feature reference
        fref_group = mc.h5.create_group(h5_constants.H5_FEATURE_REF_ATTR)
        feature_ref.to_hdf5(fref_group)

        # Write barcodes
        # If there are multiple barcode lengths, use the largest for the numpy dtype.
        # (generator form works on Python 2 and 3; np.max(map(...)) fails on 3)
        max_barcode_len = max(len(bc) for bc in barcodes)
        barcode_dtype = np.dtype('S%d' % max_barcode_len)
        mc.h5.create_dataset('barcodes',
                             data=np.fromiter(barcodes, barcode_dtype, count=len(barcodes)),
                             compression=HDF5_COMPRESSION)

        # Write library info
        lib_info_json = json.dumps(library_info, indent=4, sort_keys=True)
        cr_io.create_hdf5_string_dataset(mc.h5, 'library_info', [lib_info_json])

        # Write barcode info
        g = mc.h5.create_group(BARCODE_INFO_GROUP_NAME)
        MoleculeCounter.save_barcode_info(barcode_info, g)

        # Create empty per-molecule datasets
        for name, col_type in MOLECULE_INFO_COLUMNS.iteritems():
            mc.columns[name] = mc.h5.create_dataset(name, (0,),
                                                    maxshape=(None,),
                                                    dtype=col_type,
                                                    compression=HDF5_COMPRESSION,
                                                    chunks=(HDF5_CHUNK_SIZE,))

    elif mode == 'r':
        mc.h5 = h5py.File(filename, 'r')

        try:
            mc.file_version = mc.h5.attrs[FILE_VERSION_KEY]
        except (AttributeError, KeyError):
            # FIX: h5py raises KeyError for a missing attribute, so the
            # original AttributeError-only handler could never take the
            # v1 fallback path; keep AttributeError for compatibility.
            mc.file_version = 1  # V1 doesn't have version field

        if mc.file_version < CURR_FILE_VERSION:
            raise ValueError('The molecule info HDF5 file (format version %d) was produced by an older version of Cell Ranger. Reading these files is unsupported.' % mc.file_version)
        if mc.file_version > CURR_FILE_VERSION:
            # FIX: "an newer" -> "a newer" in the error message.
            raise ValueError('The molecule info HDF5 file (format version %d) was produced by a newer version of Cell Ranger. Reading these files is unsupported.' % mc.file_version)

        for key in mc.h5.keys():
            if key in MOLECULE_INFO_COLUMNS:
                mc.columns[key] = mc.h5[key]
            elif key in MOLECULE_REF_COLUMNS:
                mc.ref_columns[key] = mc.h5[key]
            elif key == h5_constants.H5_FEATURE_REF_ATTR:
                mc.feature_reference = FeatureReference.from_hdf5(mc.h5[key])
            elif key == METRICS_GROUP_NAME \
                    or key == BARCODE_INFO_GROUP_NAME:
                pass
            else:
                raise AttributeError("Unrecognized dataset key: %s" % key)

        # Load library info
        mc.library_info = json.loads(cr_io.read_hdf5_string_dataset(mc.h5['library_info'])[0])

    return mc