def test_edge_cases_mapping(self):
    """Map sites placed on the first and last residue of NM_01.

    Gene T gets three isoforms: the full NM_01 and two shorter ones.
    Both sites are defined on NM_01 and four mapped sites are expected.
    """
    isoforms = [
        #                                 123456789
        Protein(refseq='NM_01', sequence='AXAXAYAYA'),
        # C-terminal part was trimmed
        Protein(refseq='NM_02', sequence='AXAXA'),
        # N-terminal part was trimmed
        Protein(refseq='NM_03', sequence='AYAYA'),
    ]
    db.session.add(Gene(name='T', isoforms=isoforms))
    db.session.commit()

    proteins_by_refseq = create_key_model_dict(Protein, 'refseq')
    mapper = SiteMapper(
        proteins_by_refseq,
        lambda site: f'{site.position}{site.residue}'
    )

    # both sites are defined on NM_01: one at its first
    # position and the other one at its last position
    edge_sites = {
        'site at N-terminus edge': ('T', 'NM_01', 1, '^AX', 'A', 2),
        'site at C-terminus edge': ('T', 'NM_01', 9, 'YA$', 'A', 2),
    }
    column_names = [
        'gene', 'refseq', 'position', 'sequence', 'residue',
        'left_sequence_offset'
    ]
    sites = DataFrame.from_dict(data=edge_sites, orient='index')
    sites.columns = column_names

    mapped_sites = mapper.map_sites_by_sequence(sites)

    assert len(mapped_sites) == 4
def kinase_classification(path='data/regphos_kinome_scraped_clean.txt'):
    """Attach kinases to kinase groups as listed in the given TSV file.

    Kinases and groups are looked up by lowercased name; those which are
    missing from the lookups are created while parsing.

    Returns:
        list of newly created kinase groups
    """
    known_kinases = create_key_model_dict(Kinase, 'name', True)
    known_groups = create_key_model_dict(KinaseGroup, 'name', True)

    new_groups = []

    print('Loading protein kinase groups:')

    header = [
        'No.', 'Kinase', 'Group', 'Family', 'Subfamily', 'Gene.Symbol',
        'gene.clean', 'Description', 'group.clean'
    ]

    def parser(line):
        # note that the subfamily is often absent
        _group, family, subfamily = line[2:5]

        # the 'gene.clean' [6] fits better to the names
        # of kinases used in all other data files
        kinase_name = line[6]

        # 'group.clean' is not atomic and is redundant with respect to
        # family and subfamily. This check assures that in case of a change
        # the maintainer would be able to spot the inconsistency easily
        expected_clean = family + '_' + subfamily if subfamily else family
        assert line[8] == expected_clean

        kinase_key = kinase_name.lower()
        if kinase_key not in known_kinases:
            known_kinases[kinase_key] = Kinase(
                name=kinase_name,
                protein=get_preferred_gene_isoform(kinase_name)
            )

        # the 'family' corresponds to 'group' in the all other files
        group_key = family.lower()
        if group_key not in known_groups:
            created_group = KinaseGroup(name=family)
            known_groups[group_key] = created_group
            new_groups.append(created_group)

        known_groups[group_key].kinases.append(known_kinases[kinase_key])

    parse_tsv_file(path, parser, header)

    return new_groups
def test_mapping(self):
    """Map sites across isoforms, with and without the 'gene' column.

    One site is defined on an existing isoform (NM_01) and another one
    on an isoform which is deleted from the database before mapping (NM_03).
    """
    isoforms_a = [
        # the full isoform of gene A
        Protein(refseq='NM_01', sequence='AAAAAAAAAXAA'),
        # a trimmed isoform of gene A
        Protein(refseq='NM_02', sequence='AAAXAA'),
    ]
    gene_a = Gene(name='A', isoforms=isoforms_a)

    isoforms_b = [
        Protein(refseq='NM_03', sequence='BBBBBBBBBYBB'),
        Protein(refseq='NM_04', sequence='BBBYBB'),
    ]
    gene_b = Gene(name='B', isoforms=isoforms_b)

    db.session.add_all([gene_a, gene_b])

    # whoops, NM_03 has been accidentally removed (!)
    lost_isoform = Protein.query.filter_by(refseq='NM_03').one()
    db.session.delete(lost_isoform)
    db.session.commit()

    proteins_by_refseq = create_key_model_dict(Protein, 'refseq')
    mapper = SiteMapper(
        proteins_by_refseq,
        lambda site: f'{site.position}{site.residue}'
    )

    rows = {
        'good site A': ('A', 'NM_01', 10, 'AXA', 'X', 1),
        'lost isoform': ('B', 'NM_03', 10, 'BYB', 'Y', 1)
    }
    sites = DataFrame.from_dict(data=rows, orient='index')
    sites.columns = [
        'gene', 'refseq', 'position', 'sequence', 'residue',
        'left_sequence_offset'
    ]

    mapped_sites = mapper.map_sites_by_sequence(sites)
    sites_by_isoform = group_by_isoform(mapped_sites)

    # one from NM_01 (defined), from NM_02 (mapped), from NM_04 (mapped)
    assert len(mapped_sites) == 3
    assert set(sites_by_isoform) == {'NM_01', 'NM_02', 'NM_04'}

    for refseq in ('NM_01', 'NM_02'):
        assert sites_by_isoform[refseq].residue == 'X'

    assert sites_by_isoform['NM_01'].position == 10
    assert sites_by_isoform['NM_02'].position == 4

    site_on_nm_04 = sites_by_isoform['NM_04']
    assert site_on_nm_04.residue == 'Y'
    assert site_on_nm_04.position == 4

    # will the mapping to NM_02 still work if we remove 'gene' column?
    sites.drop(columns=['gene'], inplace=True)

    mapped_sites = mapper.map_sites_by_sequence(sites)
    sites_by_isoform = group_by_isoform(mapped_sites)

    assert len(mapped_sites) == 2
    assert set(sites_by_isoform) == {'NM_01', 'NM_02'}
def pathways(path='data/hsapiens.pathways.NAME.gmt'):
    """Loads pathways from given '.gmt' file.

    New genes may be created and should automatically be added
    to the session with pathways as those have a relationship.

    Returns:
        list of created pathways
    """
    known_genes = create_key_model_dict(Gene, 'name', lowercase=True)

    new_pathways = []
    new_genes = []

    def parser(data):
        """Parse a single line of a GMT file with pathway descriptions.

        Args:
            data: a list of subsequent columns from a single line of GMT file

                For example::

                    ['CORUM:5419', 'HTR1A-GPR26 complex', 'GPR26', 'HTR1A']

                Only gene sets with identifiers starting with 'GO'
                or 'REAC' are accepted.

        Raises:
            Exception: if the gene set identifier has another prefix
        """
        gene_set_name = data[0]
        # Entry description can be empty
        entry_description = data[1].strip()

        pathway_genes = []

        for gene_name in (name.strip() for name in data[2:]):
            name_lower = gene_name.lower()
            try:
                gene = known_genes[name_lower]
            except KeyError:
                gene = Gene(name=gene_name)
                known_genes[name_lower] = gene
                new_genes.append(gene)
            pathway_genes.append(gene)

        pathway = Pathway(description=entry_description, genes=pathway_genes)

        # the numeric part follows the 'GO:' / 'REAC:' prefix
        if gene_set_name.startswith('GO'):
            pathway.gene_ontology = int(gene_set_name[3:])
        elif gene_set_name.startswith('REAC'):
            pathway.reactome = int(gene_set_name[5:])
        else:
            raise Exception(f'Unknown gene set name: "{gene_set_name}"')

        # collect the pathway; without this the function
        # would always return an empty list
        new_pathways.append(pathway)

    parse_tsv_file(path, parser)

    print(len(new_genes), 'new genes created')

    return new_pathways
def proteins(self):
    """Return the refseq -> protein mapping, fetching it on first use.

    The mapping is fetched lazily as not all uses of importer require
    proteins in place. The fetched mapping is stored in `self._proteins`,
    so that subsequent calls reuse it instead of querying the database
    again.
    """
    if not self._proteins:
        self._proteins = create_key_model_dict(
            Protein, 'refseq',
            options=load_only('refseq', 'sequence', 'id')
        )
    return self._proteins
def __init__(self):
    """Build the lookups and fetch or create the database objects for the import.

    Reads `self.source_name` and `self.site_types`.
    """
    print(f'Preparing {self.source_name} sites importer...')

    self.issues_counter = Counter()

    # caching proteins and kinases allows for much faster
    # import later on, though it takes some time to cache
    self.known_kinases = create_key_model_dict(
        Kinase, 'name', lowercase=True
    )
    self.known_groups = create_key_model_dict(
        KinaseGroup, 'name', lowercase=True
    )

    sites_loading = joinedload(Site.sources).joinedload('*')
    self.known_sites = create_key_model_dict(
        Site, ['protein_id', 'position', 'residue'],
        options=sites_loading
    )

    proteins_loading = (
        load_only('refseq', 'sequence', 'id')
        .joinedload(Protein.gene)
        .joinedload(Gene.isoforms)
        .load_only('refseq')
    )
    self.proteins = create_key_model_dict(
        Protein, 'refseq', options=proteins_loading
    )

    # create site types
    novel_types = []
    types_by_name = {}
    for name in set(self.site_types):
        site_type, is_new = get_or_create(SiteType, name=name)
        if is_new:
            novel_types.append(site_type)
        types_by_name[site_type.name] = site_type

    self.novel_site_types = novel_types
    self.site_types_map = types_by_name

    self.source, _ = get_or_create(SiteSource, name=self.source_name)

    print(f'{self.source_name} importer ready.')
def domains_types(path='data/interpro.xml.gz'):
    """Set the `type` of known InterPro domains from a gzipped InterPro XML file.

    Entries with an id which is not among the known domain accessions
    are ignored. Nothing is returned; the domains are modified in place.
    """
    import gzip
    from xml.etree import ElementTree

    print('Loading extended InterPro annotations:')

    domains = create_key_model_dict(InterproDomain, 'accession')

    with gzip.open(path) as interpro_file:
        root = ElementTree.parse(interpro_file).getroot()

    for entry in tqdm(root.findall('interpro')):
        accession = entry.get('id')

        # entries describing domains which we do not have are of no use
        if accession not in domains:
            continue

        domains[accession].type = entry.get('type')
def kinase_mappings(path='data/curated_kinase_IDs.txt'):
    """Create kinases from `kinase_name gene_name` mappings.

    For each kinase a `preferred isoform` of given gene will be used.

    If given kinase already is in the database and has an isoform
    associated, the association will be superseded with the new one.
    The same applies to a kinase listed in the file more than once:
    the later row updates the kinase created for the earlier one.

    Rows for which no isoform could be found are reported and skipped.

    Returns:
        list of created kinases
    """
    known_kinases = create_key_model_dict(Kinase, 'name')

    new_kinases = []

    def parser(line):
        kinase_name, gene_name = line

        protein = get_preferred_gene_isoform(gene_name)

        if not protein:
            print(
                'No isoform for %s kinase mapped to %s gene!' %
                (kinase_name, gene_name)
            )
            return

        if kinase_name in known_kinases:
            kinase = known_kinases[kinase_name]
            if kinase.protein and kinase.protein != protein:
                print(
                    'Overriding kinase-protein association for '
                    '%s kinase. Old isoform: %s; new isoform: %s.'
                    % (kinase_name, kinase.protein.refseq, protein.refseq)
                )
            kinase.protein = protein
        else:
            kinase = Kinase(name=kinase_name, protein=protein)
            # register the new kinase, so that a repeated row updates
            # this object instead of creating a duplicated kinase
            known_kinases[kinase_name] = kinase
            new_kinases.append(kinase)

    parse_tsv_file(path, parser)

    return new_kinases
def __init__(self, proteins, repr_site):
    """Store the given arguments and load the gene lookup.

    Args:
        proteins: kept as `self.proteins`
            (presumably a refseq -> protein mapping; verify against callers)
        repr_site: kept as `self.repr_site`
            (presumably a callable describing a site; verify against callers)
    """
    self.proteins, self.repr_site = proteins, repr_site
    # genes keyed by their name
    self.genes = create_key_model_dict(Gene, 'name')
    self.has_gene_names = self.already_warned = None
def domains(path='data/biomart_protein_domains_20072016.txt'):
    """Load occurrences of InterPro domains in proteins from a TSV file.

    Rows are skipped when the protein (matched by refseq) is not known,
    when the row carries no domain data or when the chromosome given in
    the row differs from the chromosome of the protein's gene.

    An occurrence which is similar to a domain that the protein already
    has extends that domain instead of creating a new one.

    Returns:
        list of newly created domains
    """
    proteins = get_proteins()

    print('Loading domains:')

    interpro_domains = create_key_model_dict(InterproDomain, 'accession')

    new_domains = []
    skipped = 0
    wrong_length = 0
    not_matching_chrom = []

    header = [
        'Ensembl Gene ID', 'Ensembl Transcript ID', 'Ensembl Protein ID',
        'Chromosome Name', 'Gene Start (bp)', 'Gene End (bp)',
        'RefSeq mRNA [e.g. NM_001195597]', 'Interpro ID',
        'Interpro Short Description', 'Interpro Description',
        'Interpro end', 'Interpro start'
    ]

    def parser(line):
        nonlocal skipped, wrong_length

        try:
            protein = proteins[line[6]]  # by refseq
        except KeyError:
            skipped += 1
            return

        # If there is no data about the domains, skip this record
        if len(line) == 7:
            return

        try:
            assert len(line) == 12
        except AssertionError:
            print(line, len(line))

        start, end = int(line[11]), int(line[10])

        # is start lower than end?
        assert start < end

        accession = line[7]

        # according to:
        # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC29841/#__sec2title
        assert accession.startswith('IPR')

        # TODO: the assertion fails for some domains: what to do?
        # assert end <= protein.length
        if end > protein.length:
            wrong_length += 1

        if line[3] != protein.gene.chrom:
            skipped += 1
            not_matching_chrom.append(line)
            return

        if accession not in interpro_domains:
            interpro_domains[accession] = InterproDomain(
                accession=line[7],  # Interpro Accession
                short_description=line[8],  # Interpro Short Description
                description=line[9],  # Interpro Description
            )

        interpro = interpro_domains[accession]

        def covers_enough(existing):
            # at least 75% of common coverage
            # for shorter occurrence of domain
            common = min(existing.end, end) - max(existing.start, start)
            shorter = min(len(existing), end - start)
            return common / shorter > 0.75

        # select similar domain occurrences with criteria being:
        # the same interpro id and sufficient common coverage
        similar_domains = [
            existing
            for existing in protein.domains
            if existing.interpro == interpro and covers_enough(existing)
        ]

        if not similar_domains:
            new_domains.append(
                Domain(
                    interpro=interpro,
                    protein=protein,
                    start=start,
                    end=end
                )
            )
            return

        try:
            assert len(similar_domains) == 1
        except AssertionError:
            print(similar_domains)

        # extend the first of similar domains to span both occurrences
        extended = similar_domains[0]
        extended.start = min(extended.start, start)
        extended.end = max(extended.end, end)

    parse_tsv_file(path, parser, header)

    print(
        'Domains loaded,', skipped, 'proteins skipped.',
        'Domains exceeding proteins length:', wrong_length,
        'Domains skipped due to not matching chromosomes:',
        len(not_matching_chrom)
    )

    return new_domains
def proteins_and_genes(path='data/protein_data.tsv'):
    """Create proteins and genes based on data in a given file.

    A gene which already exists is reused; its name, chromosome and
    strand are replaced (with a message printed) if the file says
    otherwise. A protein which already exists is skipped, and so is
    each repeated occurrence of a refseq id in the file.

    Returns:
        a view of created (new) proteins
    """
    # TODO where does the tsv file come from?
    print('Creating proteins and genes:')

    genes = create_key_model_dict(Gene, 'name', lowercase=True)
    known_proteins = get_proteins()

    proteins = {}

    coordinates_to_save = [
        ('txStart', 'tx_start'),
        ('txEnd', 'tx_end'),
        ('cdsStart', 'cds_start'),
        ('cdsEnd', 'cds_end')
    ]

    allowed_strands = ['+', '-']

    # a list storing refseq ids which occur at least twice in the file
    with_duplicates = []
    potentially_empty_genes = set()

    header = [
        'bin', 'name', 'chrom', 'strand', 'txStart', 'txEnd',
        'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds',
        'score', 'name2', 'cdsStartStat', 'cdsEndStat', 'exonFrames'
    ]

    columns = tuple(
        header.index(column) for column, _ in coordinates_to_save
    )
    coordinates_names = [
        attribute for _, attribute in coordinates_to_save
    ]

    def parser(line):
        # use name2 (fourth column from the end)
        name = line[-4]

        strand = line[3]
        assert strand in allowed_strands

        gene_data = {
            'name': name,
            'chrom': line[2][3:],    # remove chr prefix
            'strand': strand == '+'
        }

        gene_key = name.lower()

        if gene_key in genes:
            gene = genes[gene_key]
            for key, value in gene_data.items():
                previous = getattr(gene, key)
                if previous != value:
                    print(f'Replacing {gene} {key} with {value} (previously: {previous})')
                    setattr(gene, key, value)
        else:
            gene = Gene(**gene_data)
            genes[gene_key] = gene

        # load protein
        refseq = line[1]

        # if protein is already in database no action is required
        if refseq in known_proteins:
            return

        # do not allow duplicates
        if refseq in proteins:
            with_duplicates.append(refseq)
            potentially_empty_genes.add(gene)
            return

        # from this line there is no processing of duplicates allowed
        assert refseq not in proteins

        protein_data = {'refseq': refseq, 'gene': gene}

        coordinate_values = [
            int(value)
            for index, value in enumerate(line)
            if index in columns
        ]
        protein_data.update(zip(coordinates_names, coordinate_values))

        proteins[refseq] = Protein(**protein_data)

    parse_tsv_file(path, parser, header)

    cnt = sum(
        1 for gene in potentially_empty_genes
        if len(gene.isoforms) == 1
    )
    print('Duplicated that are only isoforms for gene:', cnt)
    print('Duplicated rows detected:', len(with_duplicates))

    return proteins.values()