Пример #1
0
    def build_gene(self):
        """
        Fills ``self.cpdb_gene`` with ``CellPhoneDBGene`` records.

        Entities having a ``components`` attribute contribute each of their
        components, any other entity contributes itself. For every
        component one record is created for each identifier returned by the
        'uniprot' to 'ensembl' mapping; the result of the 'uniprot' to
        'genesymbol' mapping is used both as ``gene_name`` and
        ``hgnc_symbol``.
        """

        self.cpdb_gene = set()

        for entity in self._entities:

            # the components of the complexes are added too; it is not
            # clear whether this is necessary, but it should do no harm
            components = getattr(entity, 'components', (entity, ))

            for comp in components:

                name = mapping.map_name0(comp, 'uniprot', 'genesymbol')
                ensembl_genes = mapping.map_name(comp, 'uniprot', 'ensembl')

                self.cpdb_gene.update(
                    CellPhoneDBGene(
                        gene_name=name,
                        uniprot=comp,
                        hgnc_symbol=name,
                        ensembl=ensembl,
                    )
                    for ensembl in ensembl_genes
                )
Пример #2
0
def _icellnet_get_components(line, idx):

    return [
        uniprot
        for uniprot in (mapping.map_name0(line[i], 'genesymbol', 'uniprot')
                        for i in idx) if uniprot
    ]
Пример #3
0
    def get_stoichiometry(rec):
        """
        Components of a record according to its stoichiometry field.

        If ``rec['stoichiometry']`` is empty, the result of
        ``get_uniprots(rec)`` is returned. Otherwise the field is split by
        semicolons and each item is translated from 'genesymbol' to
        'uniprot'; the translations are returned unfiltered as a tuple.
        """

        stoichiometry = rec['stoichiometry']

        if stoichiometry:

            return tuple(
                mapping.map_name0(genesymbol, 'genesymbol', 'uniprot')
                for genesymbol in stoichiometry.split(';')
            )

        return get_uniprots(rec)
Пример #4
0
def _cellphonedb_get_entity(name, complexes):

    if name in complexes:
        return (complexes[name], )

    if ':' in name:
        name = name.split(':')[1]

    if '_' in name:
        name = mapping.map_name0(name, 'name-entry', 'name')

    if not uniprot_input.is_uniprot(name):
        uniprot = mapping.map_name0(name, 'genesymbol', 'uniprot')
        name = uniprot or name

    name = _cellphonedb_hla(name)

    return (name, ) if isinstance(name, common.basestring) else name
Пример #5
0
 def genesymbols(self):
     """
     Sorted list of labels for the components.

     Every key of ``self.components`` is translated from 'uniprot' to
     'genesymbol'; where the translation gives nothing, the key itself
     is kept.
     """

     labels = [
         mapping.map_name0(uniprot, 'uniprot', 'genesymbol') or uniprot
         for uniprot in self.components.keys()
     ]
     labels.sort()

     return labels
Пример #6
0
        def get_id_name(entity):
            """
            Identifier and name of an entity.

            The identifier is the string form of ``entity``. If it contains
            'COMPLEX', it serves as the name too; otherwise the name is the
            translation of the identifier from 'uniprot' to
            'uniprot-entry'.

            :return:
                Tuple of the identifier and the name.
            """

            id_ = entity.__str__()

            if 'COMPLEX' in id_:

                return id_, id_

            return id_, mapping.map_name0(id_, 'uniprot', 'uniprot-entry')
Пример #7
0
    def stoichiometry_str_genesymbols(self):
        """
        Semicolon separated string of the component labels.

        The items of ``self.components`` are processed sorted by key; each
        key is translated from 'uniprot' to 'genesymbol' (falling back to
        the key itself if there is no translation) and the label is
        repeated as many times as the value belonging to the key.
        """

        by_uniprot = sorted(
            iteritems(self.components),
            key=lambda comp_cnt: comp_cnt[0],
        )

        labels = []

        for uniprot, cnt in by_uniprot:

            label = (
                mapping.map_name0(uniprot, 'uniprot', 'genesymbol') or
                uniprot
            )
            labels.extend((label, ) * cnt)

        return ';'.join(labels)
Пример #8
0
    def build_protein(self):
        """
        Fills ``self.cpdb_protein`` with ``CellPhoneDBProtein`` records.

        Entities having a ``components`` attribute contribute each of their
        components, any other entity contributes itself. The boolean fields
        are derived from the classes ``self.intercell.classes_by_entity``
        returns for the component and from membership in the 'Integrins'
        annotation; the description and tag fields are empty strings.
        """

        integrins = annot.db.annots['Integrins']

        self.cpdb_protein = set()

        for entity in self._entities:

            # the components of the complexes are added too; it is not
            # clear whether this is necessary, but it should do no harm
            components = (
                entity.components
                    if hasattr(entity, 'components') else
                (entity, )
            )

            for comp in components:

                classes = self.intercell.classes_by_entity(comp)
                uniprot_id = comp.__str__()
                entry_name = mapping.map_name0(
                    comp,
                    'uniprot',
                    'uniprot-entry',
                )

                record = CellPhoneDBProtein(
                    uniprot=uniprot_id,
                    protein_name=entry_name,
                    transmembrane='transmembrane' in classes,
                    peripheral='cell_surface' in classes,
                    secreted='secreted' in classes,
                    secreted_desc='',
                    secreted_highlight='',
                    receptor='receptor' in classes,
                    receptor_desc='',
                    integrin=comp in integrins,
                    other='',
                    other_desc='',
                    tags='',
                    tags_reason='',
                    tags_description='',
                )

                self.cpdb_protein.add(record)
Пример #9
0
def zhong2015_annotations():
    """
    From 10.1111/nyas.12776 (PMID 25988664).

    Reads the tab separated table behind ``urls.urls['zhong2015']['url']``.
    The first line is discarded; in every further line the first field is
    translated from 'genesymbol' to 'uniprot' and the third field is a
    type code looked up in the table of type names below.

    :return:
        A ``defaultdict`` of sets with the translated identifiers as keys
        and ``Zhong2015Annotation`` named tuples (single field: ``type``)
        as set members. Lines without a translation are ignored.
    """

    Zhong2015Annotation = collections.namedtuple(
        'Zhong2015Annotation',
        ['type'],
    )

    # type codes of the table and their long names
    type_names = {
        'i': 'iCAM',
        'm': 'matrix adhesion',
        'ag': 'axonal guidance',
        'aj': 'adherens junction',
        'c': 'cell-cell adhesion',
        'fa': 'focal adhesion',
        'tj': 'tight junction',
        'my': 'myelin interactions',
    }

    annotations = collections.defaultdict(set)

    c = curl.Curl(urls.urls['zhong2015']['url'], silent=False, large=True)

    # dropping the first line (presumably the header)
    next(c.result)

    for line in c.result:

        fields = line.strip().split('\t')
        uniprot = mapping.map_name0(fields[0], 'genesymbol', 'uniprot')

        if not uniprot:

            continue

        annotations[uniprot].add(
            Zhong2015Annotation(type=type_names[fields[2]])
        )

    return annotations
Пример #10
0
    def _bootstrap(self, identifier, id_type, entity_type, taxon):
        """
        Works out the identifier, ID type, entity type and taxon of the
        entity and stores them in the attributes of the same names.

        The steps, in this order:

        * If ``self._is_complex(identifier)`` holds, both types become
          'complex' and the ``ncbi_tax_id`` attribute of the identifier,
          if it has one, overrides ``taxon``.
        * An empty ``taxon`` falls back to the 'default_organism' setting.
        * A missing entity type is looked up by ID type in
          ``self._id_type_to_entity_type``.
        * A missing ID type is guessed by ``mapping.guess_type``; if it is
          still missing and the entity type is empty or 'protein', the
          types become 'genesymbol' and 'protein'.
        * If the ID type is among ``self._label_types``, the label is
          translated to an identifier by ``mapping.id_from_label0``; if
          after this the ID type is 'mir-pre', translation to 'mirbase' is
          attempted. A translation is accepted only if it is non-empty and
          differs from the current identifier.
        * An entity type still missing is obtained from
          ``self._get_entity_type``.
        """

        if self._is_complex(identifier):

            entity_type = id_type = 'complex'
            taxon = getattr(identifier, 'ncbi_tax_id', taxon)

        if not taxon:

            taxon = settings.get('default_organism')

        if (
            not entity_type and
            id_type and
            id_type in self._id_type_to_entity_type
        ):

            entity_type = self._id_type_to_entity_type[id_type]

        if not id_type:

            id_type, entity_type = mapping.guess_type(
                identifier,
                entity_type = entity_type,
            )

        if not id_type and (not entity_type or entity_type == 'protein'):

            id_type, entity_type = 'genesymbol', 'protein'

        if id_type in self._label_types:

            from_label = mapping.id_from_label0(
                label = identifier,
                label_id_type = id_type,
                ncbi_tax_id = taxon,
            )

            if from_label and from_label != identifier:

                id_type = mapping.mapper.label_type_to_id_type[id_type]
                identifier = from_label

            if id_type == 'mir-pre':

                mirbase_id = mapping.map_name0(
                    identifier,
                    id_type,
                    'mirbase',
                    ncbi_tax_id = taxon,
                )

                if mirbase_id and mirbase_id != identifier:

                    identifier = mirbase_id
                    id_type = 'mirbase'

        if not entity_type:

            entity_type = self._get_entity_type(identifier)

        self.identifier = identifier
        self.id_type = id_type
        self.entity_type = entity_type
        self.taxon = taxon
Пример #11
0
def topdb_annotations(ncbi_tax_id = 9606):
    """
    Extracts membrane and topology annotations from the TOPDB XML.

    Iterates the ``TOPDB`` elements of the XML behind
    ``urls.urls['topdb']['url']``. Records are skipped if their
    ``Organism`` is not known to ``taxonomy.latin_name_to_ncbi_tax_id`` or
    belongs to another taxon, if they have no ``CrossRef/UniProt`` element,
    or if none of their UniProt ACs can be mapped.

    :param int ncbi_tax_id:
        NCBI Taxonomy ID of the organism to keep.

    :return:
        A ``defaultdict`` of sets keyed by the mapped UniProt IDs. The sets
        contain ``TopdbAnnotation`` named tuples (``membrane``,
        ``topology``, ``score``, ``tmregions``), one for every combination
        of the topologies and membranes of the record; where a record has
        no membrane or no topology, the corresponding field is ``None``.
    """

    TopdbAnnotation = collections.namedtuple(
        'TopdbAnnotation',
        ['membrane', 'topology', 'score', 'tmregions'],
    )

    result = collections.defaultdict(set)

    url = urls.urls['topdb']['url']
    c = curl.Curl(
        url,
        large = True,
        default_mode = 'rb',
        silent = False,
    )

    parser = etree.iterparse(c.fileobj, events = ('start', 'end'))

    try:

        # the first event is consumed up front; its element is not used
        next(parser)

        # queue of the processed elements, the oldest ones are cleared
        # first; a deque gives constant time removal from the left
        used_elements = collections.deque()

        for ev, elem in parser:

            if ev == 'end' and elem.tag == 'TOPDB':

                used_elements.append(elem)

                organism = elem.find('Organism').text

                if (
                    organism not in taxonomy.latin_name_to_ncbi_tax_id or
                    taxonomy.latin_name_to_ncbi_tax_id[organism] !=
                        ncbi_tax_id
                ):
                    continue

                tag_uniprots = elem.find('./CrossRef/UniProt')

                if tag_uniprots is None:
                    continue

                mapped = (
                    mapping.map_name0(
                        tag_ac.text,
                        'uniprot',
                        'uniprot',
                        ncbi_tax_id = ncbi_tax_id,
                    )
                    for tag_ac in tag_uniprots.findall('AC')
                )
                # ACs without a mapping are dropped, otherwise the
                # annotations would be stored under an empty key
                uniprots = set(uniprot for uniprot in mapped if uniprot)

                if not uniprots:
                    continue

                membranes = set(
                    mem
                    for tag_mem in elem.findall('Membrane')
                    for mem in tag_mem.text.split(';')
                )

                ntm = 0
                score = 0
                topologies = ()
                tag_topo = elem.find('Topology')

                if tag_topo is not None:

                    ntm = int(tag_topo.find('Numtm').attrib['Count'])
                    score = int(tag_topo.find('Reliability').text)

                    topologies = set(
                        tag_reg.attrib['Loc']
                        for tag_reg in tag_topo.findall('./Regions/Region')
                    )

                # placeholders to have at least one combination below
                if not membranes:
                    membranes = (None,)

                if not topologies:
                    topologies = (None,)

                for topology, membrane, uniprot in itertools.product(
                    topologies,
                    membranes,
                    uniprots,
                ):

                    result[uniprot].add(
                        TopdbAnnotation(
                            membrane = membrane,
                            topology = topology,
                            tmregions = ntm,
                            score = score,
                        )
                    )

            # removing used elements to keep memory low
            if len(used_elements) > 2000:

                for _ in range(1000):

                    used_elements.popleft().clear()

    finally:

        # closing the XML, also if the processing failed
        c.fileobj.close()

    del c

    return result
Пример #12
0
def opm_annotations(organism = 9606):
    """
    Collects membrane annotations from the OPM protein table.

    The records of the CSV behind ``urls.urls['opm']['proteins']`` are
    matched to UniProt IDs by their ``name`` field: first the full name,
    the part before the first parenthesis, the part before the first comma
    and the content of the parentheses are tried as 'protein-name'; if none
    of these gives a result, the gene symbol like words found in the name
    are tried as 'genesymbol'. Records without a match are skipped.

    :param int organism:
        Taxon ID, translated to a species name by
        ``taxonomy.phosphoelm_taxids``; if it is not in that table, records
        are not filtered by species.

    :return:
        A ``defaultdict`` of sets keyed by UniProt IDs, the sets containing
        ``OpmAnnotation`` named tuples (``membrane``, ``family``,
        ``transmembrane``).
    """

    reparentheses = re.compile(r'\((.*)\)')
    regenesymbol  = re.compile(r' ([A-Z0-9]{3,}) ')


    def get_dict(name):
        """
        Downloads one of the OPM lookup tables and returns it as a dict
        of the ``name`` column keyed by the ``id`` column.
        """

        url = urls.urls['opm'][name]
        c = curl.Curl(url, large = True, silent = False)
        data = csv.DictReader(c.result, delimiter = ',')

        return dict((rec['id'], rec['name']) for rec in data)


    OpmAnnotation = collections.namedtuple(
        'OpmAnnotation',
        ['membrane', 'family', 'transmembrane'],
    )


    result = collections.defaultdict(set)

    organism_name = (
        taxonomy.phosphoelm_taxids[organism]
            if organism in taxonomy.phosphoelm_taxids else
        None
    )

    types = get_dict('types')
    # NOTE(review): this table is downloaded but not used below, the
    # family name is taken from the `family_name_cache` column instead
    families = get_dict('families')

    url = urls.urls['opm']['proteins']
    c = curl.Curl(url, silent = False, large = True)

    data = csv.DictReader(c.result, delimiter = ',')

    for rec in data:

        if organism_name and rec['species_name_cache'] != organism_name:

            continue

        name = rec['name']

        # name variants to try as protein names, in this order
        names = [
            name,
            name.split('(')[0],
            name.split(',')[0],
        ]

        m = reparentheses.search(name)

        if m:
            names.append(m.groups()[0])

        genesymbols = regenesymbol.findall(name)

        uniprot = None

        for this_name in names:
            uniprot = mapping.map_name0(this_name, 'protein-name', 'uniprot')

            if uniprot:
                break

        if not uniprot:
            # fallback: the gene symbol like words from the name; each
            # candidate itself is looked up here
            for gs in genesymbols:
                uniprot = mapping.map_name0(gs, 'genesymbol', 'uniprot')

                if uniprot:
                    break

        if not uniprot:
            continue

        result[uniprot].add(
            OpmAnnotation(
                membrane = rec['membrane_name_cache'],
                family = rec['family_name_cache'],
                transmembrane = types[rec['type_id']] == 'Transmembrane',
            )
        )

    return result