def cis_sites():
    """Return two example CIS sites: CIS1 on chromosome 1, CIS2 on chromosome 4.

    Both sites are on strand 1 and carry empty metadata. A fresh list with
    fresh objects is built on every call.
    """
    site_chr1 = CisSite(
        id='CIS1',
        chromosome='1',
        position=182408172,
        strand=1,
        metadata=frozendict())
    site_chr4 = CisSite(
        id='CIS2',
        chromosome='4',
        position=132408091,
        strand=1,
        metadata=frozendict())
    return [site_chr1, site_chr4]
def cis_insertions():
    """Return two example insertions that are each tagged with a CIS id.

    The ``cis_id`` metadata entry links INS1 to CIS1 and INS2 to CIS2. Both
    insertions are on strand 1 and have a support of 2.
    """
    # 1000 bp upstream of Trp53bp2.
    upstream_trp53bp2 = Insertion(
        id='INS1',
        chromosome='1',
        position=182408172,
        strand=1,
        support=2,
        metadata=frozendict({'cis_id': 'CIS1'}))

    # Different chromosome.
    other_chromosome = Insertion(
        id='INS2',
        chromosome='4',
        position=77843175,
        strand=1,
        support=2,
        metadata=frozendict({'cis_id': 'CIS2'}))

    return [upstream_trp53bp2, other_chromosome]
def insertions():
    """Return three example insertions without any metadata.

    Reference gene locations used to pick the positions:

    * Trp53bp2: chromosome 1, 182,409,172-182,462,432.
    * Myh9: chromosome 15, 77,760,587-77,842,175.

    All insertions are on strand 1 and have a support of 2.
    """
    # 1000 bp upstream of Trp53bp2.
    upstream_trp53bp2 = Insertion(
        id='INS1',
        chromosome='1',
        position=182408172,
        strand=1,
        support=2,
        metadata=frozendict())

    # 2000 bp downstream of Myh9.
    downstream_myh9 = Insertion(
        id='INS2',
        chromosome='15',
        position=77758587,
        strand=1,
        support=2,
        metadata=frozendict())

    # Different chromosome.
    other_chromosome = Insertion(
        id='INS3',
        chromosome='4',
        position=77843175,
        strand=1,
        support=2,
        metadata=frozendict())

    return [upstream_trp53bp2, downstream_myh9, other_chromosome]
def _annotate_insertion(self, insertion):
    """Yield copies of ``insertion`` annotated with each overlapping gene.

    Every window in ``self._windows`` is applied to the insertion's
    chromosome, position and strand, and the features the applied window
    overlaps in ``self._trees`` are collected as unique
    ``(gene_id, gene_name, window name)`` hits. When ``self._blacklist`` is
    set, hits whose gene name is in the blacklist are discarded.

    Args:
        insertion: Record exposing ``chromosome``, ``position``, ``strand``
            and ``metadata`` attributes plus a ``_replace`` method.

    Yields:
        One copy of the insertion per remaining hit, with ``gene_id``,
        ``gene_name`` and (only when the window has a name) ``window``
        merged into its metadata. When no hits remain, the original
        insertion is yielded unchanged.
    """
    trees = self._trees

    # Gather the unique hits over all windows.
    hits = set()
    for window in self._windows:
        region = window.apply(insertion.chromosome, insertion.position,
                              insertion.strand)
        hits.update({(feature['gene_id'], feature['gene_name'], window.name)
                     for feature in region.get_overlap(trees)})

    # Drop blacklisted genes (matched on gene name).
    blacklist = self._blacklist
    if blacklist is not None:
        hits = {hit for hit in hits if hit[1] not in blacklist}

    # Without any overlap the insertion is passed through untouched.
    if not hits:
        yield insertion
        return

    # Emit one annotated copy per overlapping gene/window combination.
    for gene_id, gene_name, window_name in hits:
        annotation = {'gene_id': gene_id, 'gene_name': gene_name}
        if window_name is not None:
            annotation['window'] = window_name

        merged = toolz.merge(insertion.metadata, annotation)
        yield insertion._replace(metadata=frozendict(merged))
def assign_strand(cis_sites, insertions, mapping, min_homogeneity=0.75):
    """Assign each CIS site the consensus strand of its insertions.

    Args:
        cis_sites: Iterable of CIS site records exposing ``id``, ``metadata``
            and ``_replace``.
        insertions: Iterable of insertion records exposing ``id`` and
            ``strand``.
        mapping: Lookup from CIS id to the ids of the insertions that belong
            to that CIS.
        min_homogeneity: Minimum fraction of insertions that must share the
            consensus strand; below this the strand is set to NaN.

    Yields:
        A copy of each CIS site with ``strand`` replaced by the sign of the
        mean insertion strand (or NaN), and with ``strand_mean`` and
        ``strand_homogeneity`` merged into its metadata.
    """
    insertion_by_id = {ins.id: ins for ins in insertions}

    for site in cis_sites:
        # Strands of the insertions that make up this CIS.
        member_ids = mapping[site.id]
        cis_strands = np.array(
            [insertion_by_id[member_id].strand for member_id in member_ids])

        # The consensus strand is the sign of the mean strand; homogeneity
        # is the fraction of insertions that agree with that consensus.
        mean_strand = np.mean(cis_strands)
        consensus = np.sign(mean_strand)
        n_agreeing = np.sum(cis_strands == consensus)
        homogeneity = n_agreeing / len(cis_strands)

        # Too heterogeneous: signal "no specific strand" with a nan.
        strand = np.nan if homogeneity < min_homogeneity else consensus

        # Existing metadata is kept; the strand statistics are added on top.
        merged = toolz.merge(site.metadata, {
            'strand_mean': mean_strand,
            'strand_homogeneity': homogeneity
        })

        yield site._replace(strand=strand, metadata=frozendict(merged))
def _to_insertion(ref, pos, strand, ends, id_=None, **kwargs):
    """Build an ``Insertion`` whose support is its number of unique ends.

    Args:
        ref: Value stored as the insertion's chromosome.
        pos: Value stored as the insertion's position.
        strand: Value stored as the insertion's strand.
        ends: Sized iterable of hashable values; its length becomes the
            ``depth`` metadata entry and its number of distinct values
            becomes ``depth_unique``.
        id_: Optional insertion id.
        **kwargs: Extra metadata entries. These are merged last, so a
            ``depth`` or ``depth_unique`` keyword overrides the computed
            value (and ``depth_unique`` then also determines the support).

    Returns:
        The constructed ``Insertion``.
    """
    depth_stats = {'depth': len(ends), 'depth_unique': len(set(ends))}
    metadata = toolz.merge(depth_stats, kwargs)

    return Insertion(
        id=id_,
        chromosome=ref,
        position=pos,
        strand=strand,
        support=metadata['depth_unique'],
        metadata=frozendict(metadata))
def _annotate_insertions(self, insertions, cis_map):
    """Yield insertions annotated with the genes assigned to their CIS.

    Args:
        insertions: Iterable of insertion records whose metadata contains a
            ``cis_id`` entry.
        cis_map: Lookup from CIS id to a collection of
            ``(gene_name, gene_id)`` pairs. A CIS id that is absent is
            treated as having no genes.

    Yields:
        For an insertion whose CIS has genes, one copy per gene with
        ``gene_id`` and ``gene_name`` merged into its metadata. For an
        insertion without genes, the insertion itself. In both cases the
        ``cis_id`` entry is removed from the yielded metadata when
        ``self._drop_cis_id`` is set.
    """
    for insertion in insertions:
        genes = cis_map.get(insertion.metadata['cis_id'], set())

        if len(genes) == 0:
            # Nothing to annotate; at most strip the CIS id.
            if self._drop_cis_id:
                stripped = dict(insertion.metadata)
                stripped.pop('cis_id')
                yield insertion._replace(metadata=frozendict(stripped))
            else:
                yield insertion
            continue

        for gene_name, gene_id in genes:
            gene_fields = {'gene_id': gene_id, 'gene_name': gene_name}
            annotated = toolz.merge(insertion.metadata, gene_fields)

            if self._drop_cis_id:
                annotated.pop('cis_id')

            yield insertion._replace(metadata=frozendict(annotated))
def from_frame(cls, df):
    """Yield one ``cls`` instance for every row of a dataframe.

    The frame is first passed to ``cls.check_frame``. Columns that are not
    listed by ``cls._non_metadata_fields()`` are collected into each
    instance's ``metadata`` (values rejected by ``_not_nan`` are left out);
    the remaining columns are passed to ``cls`` as keyword arguments.

    Args:
        df: Dataframe to convert (presumably a pandas DataFrame, given the
            use of ``columns`` and ``itertuples``).

    Yields:
        Instances of ``cls``, one per row.

    Raises:
        ValueError: If, for a row, the fields left after removing the
            metadata columns and the index do not match the basic fields.
    """
    cls.check_frame(df)

    required = cls._non_metadata_fields()
    required_set = set(required)
    extra_columns = list(set(df.columns) - required_set)

    for record in df.itertuples():
        values = record._asdict()

        # Move every non-basic column out of the row and into the metadata.
        extras = {}
        for column in extra_columns:
            extras[column] = values.pop(column)
        metadata = frozendict(toolz.valfilter(_not_nan, extras))

        # The row index is not a field of the object.
        values.pop('Index', None)

        present = set(values.keys())
        if required_set != present:
            missing_fields = required_set - present
            raise ValueError('Missing required fields ({})'.format(
                ', '.join(missing_fields)))

        yield cls(metadata=metadata, **values)