def build_gene(self):
    """
    Fill ``self.cpdb_gene`` with ``CellPhoneDBGene`` records.

    Every entity in ``self._entities`` is expanded to its members (see
    ``members`` below). For each member one record is created per
    identifier returned by the ``'uniprot'`` -> ``'ensembl'`` mapping; the
    ``'uniprot'`` -> ``'genesymbol'`` translation is used both as
    ``gene_name`` and ``hgnc_symbol``. Members without any ``'ensembl'``
    mapping result yield no record.
    """

    def members(entity):
        # The components of the complexes are added too; the original
        # author was not sure this is necessary but considered it harmless.
        return (
            entity.components
            if hasattr(entity, 'components') else
            (entity, )
        )

    self.cpdb_gene = set()

    for entity in self._entities:

        for member in members(entity):

            symbol = mapping.map_name0(member, 'uniprot', 'genesymbol')

            for ensembl_id in mapping.map_name(member, 'uniprot', 'ensembl'):

                record = CellPhoneDBGene(
                    gene_name=symbol,
                    uniprot=member,
                    hgnc_symbol=symbol,
                    ensembl=ensembl_id,
                )
                self.cpdb_gene.add(record)
def _icellnet_get_components(line, idx):
    """
    Translate selected fields of ``line`` with the ``'genesymbol'`` ->
    ``'uniprot'`` mapping.

    :param line: Indexable record; the items at positions ``idx`` are
        passed to the mapping.
    :param idx: Iterable of positions within ``line``.
    :return: List of the truthy mapping results, in the order of ``idx``;
        fields for which the mapping gives a falsy value are left out.
    """

    components = []

    for i in idx:

        uniprot = mapping.map_name0(line[i], 'genesymbol', 'uniprot')

        if uniprot:

            components.append(uniprot)

    return components
def get_stoichiometry(rec):
    """
    Return the members of a record as a tuple of ``'genesymbol'`` ->
    ``'uniprot'`` mapping results.

    :param rec: Mapping with a ``'stoichiometry'`` key holding a
        semicolon separated string.
    :return: When ``rec['stoichiometry']`` is falsy, whatever
        ``get_uniprots(rec)`` returns; otherwise a tuple with one mapping
        result per semicolon separated item (unmapped items are kept as
        they come back from the mapping, they are not filtered).
    """

    stoichiometry = rec['stoichiometry']

    if not stoichiometry:

        return get_uniprots(rec)

    uniprots = []

    for genesymbol in stoichiometry.split(';'):

        uniprots.append(
            mapping.map_name0(genesymbol, 'genesymbol', 'uniprot')
        )

    return tuple(uniprots)
def _cellphonedb_get_entity(name, complexes):
    """
    Resolve a name to a tuple of entities.

    :param name: The label to resolve.
    :param complexes: Container of known complexes keyed by name; a hit
        here short-circuits everything else.
    :return: ``(complexes[name], )`` if ``name`` is a key of
        ``complexes``. Otherwise the processed name wrapped in a one
        element tuple if it is a string, or the value returned by
        ``_cellphonedb_hla`` as it is if it is not a string.
    """

    if name in complexes:

        return (complexes[name], )

    label = name

    # keep only the second colon separated field
    if ':' in label:

        label = label.split(':')[1]

    # labels with underscore go through the 'name-entry' -> 'name' mapping
    if '_' in label:

        label = mapping.map_name0(label, 'name-entry', 'name')

    # anything not recognised as UniProt is tried as a gene symbol;
    # if that gives nothing, the label is kept unchanged
    if not uniprot_input.is_uniprot(label):

        uniprot = mapping.map_name0(label, 'genesymbol', 'uniprot')

        if uniprot:

            label = uniprot

    label = _cellphonedb_hla(label)

    if isinstance(label, common.basestring):

        return (label, )

    return label
def genesymbols(self):
    """
    Sorted labels of the components.

    :return: A sorted list with one item for each key of
        ``self.components``: the result of the ``'uniprot'`` ->
        ``'genesymbol'`` mapping where that is truthy, the key itself
        otherwise.
    """

    labels = [
        mapping.map_name0(uniprot, 'uniprot', 'genesymbol') or uniprot
        for uniprot in self.components.keys()
    ]
    labels.sort()

    return labels
def get_id_name(entity):
    """
    Return an ``(id, name)`` pair for an entity.

    :param entity: Any object; its ``__str__()`` value is used as the ID.
    :return: Tuple of the ID and a name. If the ID contains the substring
        ``'COMPLEX'`` the name is the ID itself, otherwise it is the
        result of the ``'uniprot'`` -> ``'uniprot-entry'`` mapping.
    """

    id_ = entity.__str__()

    if 'COMPLEX' in id_:

        return id_, id_

    return id_, mapping.map_name0(id_, 'uniprot', 'uniprot-entry')
def stoichiometry_str_genesymbols(self):
    """
    Stoichiometry of the components as a semicolon separated string.

    The items of ``self.components`` are visited sorted by key. Each key
    contributes its label as many times as its value (the count) says;
    the label is the ``'uniprot'`` -> ``'genesymbol'`` mapping result or,
    if that is falsy, the key itself.

    :return: The labels joined by ``';'``.
    """

    by_key = sorted(
        iteritems(self.components),
        key=lambda item: item[0],
    )

    labels = []

    for uniprot, cnt in by_key:

        label = mapping.map_name0(uniprot, 'uniprot', 'genesymbol') or uniprot
        # repeat the label according to the count of the component
        labels.extend((label, ) * cnt)

    return ';'.join(labels)
def build_protein(self):
    """
    Fill ``self.cpdb_protein`` with ``CellPhoneDBProtein`` records.

    One record is created for each member of each entity in
    ``self._entities``. The boolean fields are derived from the classes
    returned by ``self.intercell.classes_by_entity`` and from membership
    in the ``'Integrins'`` annotation; the description and tag fields are
    always empty strings.
    """

    integrins = annot.db.annots['Integrins']

    # fields which carry no information here, always set to empty string
    blank_fields = dict.fromkeys(
        (
            'secreted_desc',
            'secreted_highlight',
            'receptor_desc',
            'other',
            'other_desc',
            'tags',
            'tags_reason',
            'tags_description',
        ),
        '',
    )

    self.cpdb_protein = set()

    for entity in self._entities:

        # The components of the complexes are added too; the original
        # author was not sure this is necessary but considered it harmless.
        members = (
            entity.components
            if hasattr(entity, 'components') else
            (entity, )
        )

        for member in members:

            classes = self.intercell.classes_by_entity(member)

            record = CellPhoneDBProtein(
                uniprot=member.__str__(),
                protein_name=mapping.map_name0(
                    member,
                    'uniprot',
                    'uniprot-entry',
                ),
                transmembrane='transmembrane' in classes,
                peripheral='cell_surface' in classes,
                secreted='secreted' in classes,
                receptor='receptor' in classes,
                integrin=member in integrins,
                **blank_fields
            )
            self.cpdb_protein.add(record)
def zhong2015_annotations():
    """
    From 10.1111/nyas.12776 (PMID 25988664).

    Downloads the table from the ``'zhong2015'`` URL, skips its first
    line and processes the rest as tab separated records: the first field
    goes through the ``'genesymbol'`` -> ``'uniprot'`` mapping, the third
    field is a short type code.

    :return: ``defaultdict`` of sets; keys are the mapping results, values
        are sets of ``Zhong2015Annotation`` tuples with the spelled out
        type. Records without a truthy mapping result are skipped; a type
        code missing from the table below raises ``KeyError``.
    """

    Zhong2015Annotation = collections.namedtuple(
        'Zhong2015Annotation',
        ['type'],
    )

    # short codes used in the table -> human readable labels
    type_labels = {
        'i': 'iCAM',
        'm': 'matrix adhesion',
        'ag': 'axonal guidance',
        'aj': 'adherens junction',
        'c': 'cell-cell adhesion',
        'fa': 'focal adhesion',
        'tj': 'tight junction',
        'my': 'myelin interactions',
    }

    url = urls.urls['zhong2015']['url']
    c = curl.Curl(url, silent=False, large=True)

    # the first line is discarded
    next(c.result)

    annotations = collections.defaultdict(set)

    for line in c.result:

        fields = line.strip().split('\t')
        uniprot = mapping.map_name0(fields[0], 'genesymbol', 'uniprot')

        if not uniprot:

            continue

        annotations[uniprot].add(
            Zhong2015Annotation(type=type_labels[fields[2]])
        )

    return annotations
def _bootstrap(self, identifier, id_type, entity_type, taxon):
    """
    Work out and store the identifier, ID type, entity type and taxon.

    Missing pieces are filled in step by step, then the final values are
    written to ``self.identifier``, ``self.id_type``, ``self.entity_type``
    and ``self.taxon``.

    :param identifier: The identifier; may be replaced by a translated
        one (label or ``'mir-pre'`` ID types).
    :param id_type: ID type or a falsy value to have it guessed.
    :param entity_type: Entity type or a falsy value to have it derived.
    :param taxon: Taxon; if falsy, the ``'default_organism'`` setting
        is used.
    """

    # complexes: both types are fixed, the taxon comes from the complex
    # itself if it carries one
    if self._is_complex(identifier):

        entity_type = id_type = 'complex'

        if hasattr(identifier, 'ncbi_tax_id'):

            taxon = identifier.ncbi_tax_id

    if not taxon:

        taxon = settings.get('default_organism')

    # entity type implied by the ID type
    if (
        not entity_type and
        id_type and
        id_type in self._id_type_to_entity_type
    ):

        entity_type = self._id_type_to_entity_type[id_type]

    # no ID type provided: try to guess it
    if not id_type:

        id_type, entity_type = mapping.guess_type(
            identifier,
            entity_type = entity_type,
        )

    # still no ID type: fall back to gene symbol, unless the entity type
    # is known to be something else than protein
    if not id_type and (not entity_type or entity_type == 'protein'):

        id_type, entity_type = 'genesymbol', 'protein'

    # label type IDs are replaced by the corresponding identifier
    if id_type in self._label_types:

        from_label = mapping.id_from_label0(
            label = identifier,
            label_id_type = id_type,
            ncbi_tax_id = taxon,
        )

        if from_label and from_label != identifier:

            id_type = mapping.mapper.label_type_to_id_type[id_type]
            identifier = from_label

    # 'mir-pre' identifiers are translated to 'mirbase' where possible
    if id_type == 'mir-pre':

        mirbase_id = mapping.map_name0(
            identifier,
            id_type,
            'mirbase',
            ncbi_tax_id = taxon,
        )

        if mirbase_id and mirbase_id != identifier:

            identifier = mirbase_id
            id_type = 'mirbase'

    if not entity_type:

        entity_type = self._get_entity_type(identifier)

    self.identifier = identifier
    self.id_type = id_type
    self.entity_type = entity_type
    self.taxon = taxon
def topdb_annotations(ncbi_tax_id = 9606):
    """
    Membrane topology annotations from the XML at the ``'topdb'`` URL.

    The XML is parsed incrementally; each completed ``TOPDB`` element is
    one record. Records of other organisms, records without UniProt cross
    reference and records where none of the accessions can be mapped are
    skipped.

    :param ncbi_tax_id: Only records whose ``Organism`` text translates
        to this value (by ``taxonomy.latin_name_to_ncbi_tax_id``) are
        processed.
    :return: ``defaultdict`` of sets; keys are the mapped UniProt IDs,
        values are sets of ``TopdbAnnotation`` tuples, one for each
        combination of membrane and topology region location of the
        record. ``membrane`` or ``topology`` is ``None`` if the record
        has none; ``tmregions`` and ``score`` are 0 if the record has no
        ``Topology`` element.
    """

    TopdbAnnotation = collections.namedtuple(
        'TopdbAnnotation',
        ['membrane', 'topology', 'score', 'tmregions'],
    )

    result = collections.defaultdict(set)

    url = urls.urls['topdb']['url']

    c = curl.Curl(
        url,
        large = True,
        default_mode = 'rb',
        silent = False,
    )

    # processed elements wait here until they are cleared;
    # a deque makes dropping the oldest ones cheap
    used_elements = collections.deque()

    try:

        parser = etree.iterparse(c.fileobj, events = ('start', 'end'))
        # the first event is consumed without processing
        next(parser)

        for ev, elem in parser:

            if ev != 'end' or elem.tag != 'TOPDB':

                continue

            used_elements.append(elem)

            # removing used elements to keep memory low; done here, before
            # any `continue`, so skipped records are cleaned up as well;
            # only the oldest 1000 are cleared, never the current element
            if len(used_elements) > 2000:

                for _ in range(1000):

                    used_elements.popleft().clear()

            organism = elem.find('Organism').text

            if (
                organism not in taxonomy.latin_name_to_ncbi_tax_id or
                taxonomy.latin_name_to_ncbi_tax_id[organism] != ncbi_tax_id
            ):

                continue

            tag_uniprots = elem.find('./CrossRef/UniProt')

            if tag_uniprots is None:

                continue

            mapped = (
                mapping.map_name0(
                    tag_ac.text,
                    'uniprot',
                    'uniprot',
                    ncbi_tax_id = ncbi_tax_id,
                )
                for tag_ac in tag_uniprots.findall('AC')
            )
            # accessions which can not be mapped give a falsy value;
            # these must not become keys in the result
            uniprots = set(uniprot for uniprot in mapped if uniprot)

            if not uniprots:

                continue

            membranes = set(
                mem
                for tag_mem in elem.findall('Membrane')
                # a tag without text has nothing to split
                if tag_mem.text
                for mem in tag_mem.text.split(';')
            )

            ntm = 0
            score = 0
            topologies = ()
            tag_topo = elem.find('Topology')

            if tag_topo is not None:

                ntm = int(tag_topo.find('Numtm').attrib['Count'])
                score = int(tag_topo.find('Reliability').text)
                topologies = set(
                    tag_reg.attrib['Loc']
                    for tag_reg in tag_topo.findall('./Regions/Region')
                )

            # make sure the product below yields at least one combination
            if not membranes:

                membranes = (None,)

            if not topologies:

                topologies = (None,)

            for topology, membrane, uniprot in itertools.product(
                topologies,
                membranes,
                uniprots,
            ):

                result[uniprot].add(
                    TopdbAnnotation(
                        membrane = membrane,
                        topology = topology,
                        tmregions = ntm,
                        score = score,
                    )
                )

    finally:

        # closing the XML, also if the parsing failed
        c.fileobj.close()

    return result
def opm_annotations(organism = 9606):
    """
    Membrane annotations from the CSV tables at the ``'opm'`` URLs.

    Each record of the ``'proteins'`` table is translated to a UniProt ID
    by its name: first several variants of the name are tried with the
    ``'protein-name'`` -> ``'uniprot'`` mapping, then the gene symbol like
    words found in the name with the ``'genesymbol'`` -> ``'uniprot'``
    mapping. Records which can not be translated are skipped.

    :param organism: Key in ``taxonomy.phosphoelm_taxids``; if found
        there, only records with the matching ``species_name_cache`` are
        processed, otherwise the records are not filtered by organism.
    :return: ``defaultdict`` of sets; keys are UniProt IDs, values are
        sets of ``OpmAnnotation`` tuples. ``transmembrane`` is ``True``
        if the type of the record is named ``'Transmembrane'`` in the
        ``'types'`` table.
    """

    reparentheses = re.compile(r'\((.*)\)')
    regenesymbol = re.compile(r' ([A-Z0-9]{3,}) ')

    def get_dict(name):
        # downloads one of the lookup tables: `id` -> `name`

        url = urls.urls['opm'][name]
        c = curl.Curl(url, large = True, silent = False)
        data = csv.DictReader(c.result, delimiter = ',')

        return dict((rec['id'], rec['name']) for rec in data)

    def first_uniprot(candidates, id_type):
        # the first truthy mapping result among the candidates, or None

        for candidate in candidates:

            uniprot = mapping.map_name0(candidate, id_type, 'uniprot')

            if uniprot:

                return uniprot

        return None

    OpmAnnotation = collections.namedtuple(
        'OpmAnnotation',
        ['membrane', 'family', 'transmembrane'],
    )

    result = collections.defaultdict(set)

    organism_name = (
        taxonomy.phosphoelm_taxids[organism]
        if organism in taxonomy.phosphoelm_taxids else
        None
    )

    types = get_dict('types')
    # NOTE(review): this table is downloaded but not used below, the
    # family label is taken from the `family_name_cache` field instead
    families = get_dict('families')

    url = urls.urls['opm']['proteins']
    c = curl.Curl(url, silent = False, large = True)
    data = csv.DictReader(c.result, delimiter = ',')

    for rec in data:

        if organism_name and rec['species_name_cache'] != organism_name:

            continue

        name = rec['name']

        # variants of the name: complete, up to the first parenthesis,
        # up to the first comma, and the content of the parentheses
        names = [
            name,
            name.split('(')[0],
            name.split(',')[0],
        ]

        m = reparentheses.search(name)

        if m:

            names.append(m.groups()[0])

        genesymbols = regenesymbol.findall(name)

        # the gene symbols are looked up themselves, not the name variant
        # tried last
        uniprot = (
            first_uniprot(names, 'protein-name') or
            first_uniprot(genesymbols, 'genesymbol')
        )

        if not uniprot:

            continue

        result[uniprot].add(
            OpmAnnotation(
                membrane = rec['membrane_name_cache'],
                family = rec['family_name_cache'],
                transmembrane = types[rec['type_id']] == 'Transmembrane',
            )
        )

    return result