Example #1
def ramilowski_interactions(putative = False):
    """
    Downloads and processes ligand-receptor interactions from
    Supplementary Table 2 of Ramilowski 2015.
    
    Returns a list of lists with ligand and receptor gene symbols,
    references and resources as elements.
    """

    c = curl.Curl(urls.urls['rami']['url'], silent = False, large = True)
    xlsname = c.fname
    del(c)
    raw = inputs_common.read_xls(xlsname, 'All.Pairs')[1:]

    return [
        [
            r[1],
            r[3],
            r[13].replace(' ', ''), # references
            ';'.join(filter(len, itertools.chain(r[5:11], [r[15]])))
        ]
        for r in raw
        if r[15] != 'EXCLUDED not ligand' and (
            putative or r[15] != 'putative'
        )
    ]

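A minimal usage sketch for the function above; the `pypath.inputs.ramilowski2015` module path is an assumption and may differ between pypath versions:

# Hypothetical usage; the module path below is an assumption.
from pypath.inputs import ramilowski2015

pairs = ramilowski2015.ramilowski_interactions(putative = False)
# Each record: [ligand symbol, receptor symbol, references, resources].
ligand, receptor, references, resources = pairs[0]
print(ligand, receptor)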
Example #2
def cspa_cell_types(organism = 9606):
    """
    Returns CSPA (Cell Surface Protein Atlas) abundance values as a
    nested mapping: cell type -> UniProt ID -> value (float, or None
    where the cell is not numeric).
    """

    sheets = {
        'Human': 'Table_E',
        'Mouse': 'Table_F',
    }

    str_organism = taxonomy.taxids[organism].capitalize()

    url = urls.urls['cspa']['url_s1']
    c = curl.Curl(url, large = True, silent = False)
    xlsname = c.fname
    del(c)
    raw = inputs_common.read_xls(xlsname, sheets[str_organism])

    result = collections.defaultdict(dict)

    cell_types = raw[0][1:]

    for row in raw[1:]:

        for uniprot in mapping.map_name(row[0], 'uniprot', 'uniprot'):

            for col, cell_type in enumerate(cell_types):

                value = row[col + 1]

                result[cell_type][uniprot] = (
                    float(value)
                        if common.is_float(value) else
                    None
                )

    return result
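The returned mapping is nested: cell type -> UniProt ID -> abundance. A hedged traversal sketch; the `pypath.inputs.cspa` module path is an assumption:

# Hypothetical usage; the module path below is an assumption.
from pypath.inputs import cspa

by_cell_type = cspa.cspa_cell_types(organism = 9606)

for cell_type, proteins in by_cell_type.items():
    # Proteins with a numeric abundance value in this cell type.
    detected = {u for u, v in proteins.items() if v is not None}
    print(cell_type, len(detected))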
Example #3
def almen2009_annotations():
    """
    Protein annotations from Almén et al. 2009: main class, classes and
    transmembrane predictions (Phobius, SOSUI, TMHMM) by UniProt ID.
    """

    resep = re.compile(r'[;/]')

    Almen2009Annotation = collections.namedtuple(
        'Almen2009Annotation',
        [
            'mainclass',
            'classes',
            'phobius_secreted',
            'phobius_transmembrane',
            'sosui_transmembrane',
            'tmhmm_transmembrane',
        ]
    )

    url = urls.urls['almen2009']['url']
    
    c = curl.Curl(url, silent = False, large = True)
    
    xls = c.fileobj
    xlsfile = xls.name
    xls.close()
    tbl = inputs_common.read_xls(xlsfile, sheet = 'Data')[1:]
    
    result = collections.defaultdict(set)
    
    for row in tbl:
        
        uniprots = mapping.map_name(row[0], 'ipi', 'uniprot')
        
        mainclass = row[2]
        classes = row[3].replace('KInase', 'Kinase')
        classes = tuple(sorted(resep.split(classes)))
        phobius_transmembrane = int(float(row[5]))
        phobius_secreted = row[6] == 'Y'
        sosui_transmembrane = int(float(row[8])) if row[8] != 'ERROR' else 0
        tmhmm_transmembrane = int(float(row[10]))
        
        for uniprot in uniprots:
            
            result[uniprot].add(
                Almen2009Annotation(
                    mainclass = mainclass,
                    classes = classes,
                    phobius_secreted = phobius_secreted,
                    phobius_transmembrane = phobius_transmembrane,
                    sosui_transmembrane = sosui_transmembrane,
                    tmhmm_transmembrane = tmhmm_transmembrane,
                )
            )
    
    return result
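Each UniProt ID maps to a set of `Almen2009Annotation` tuples. A hedged usage sketch; the `pypath.inputs.almen2009` module path is an assumption:

# Hypothetical usage; the module path below is an assumption.
from pypath.inputs import almen2009

annotations = almen2009.almen2009_annotations()

# Collect proteins that Phobius predicts to be secreted.
secreted = {
    uniprot
    for uniprot, annots in annotations.items()
    if any(a.phobius_secreted for a in annots)
}
print(len(secreted))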
Example #4
def get_li2012():
    """
    Downloads and reads the supplementary data of Li 2012.
    Returns the table as a list of lists.
    """

    url = urls.urls['li2012']['url']
    c = curl.Curl(url, silent=False, large=True)
    xls = c.fileobj
    xlsfile = xls.name
    xls.close()
    tbl = inputs_common.read_xls(xlsfile, sheet='File S1')

    # Keep the first seven columns of each data row and drop rows whose
    # last kept column is empty. Returning a list (rather than a lazy
    # `filter` object) matches the docstring.
    tbl = [l[:7] for l in tbl[2:]]

    return [l for l in tbl if len(l[-1]) > 0]
Example #5
def rolland_hi_ii_14():
    """
    Loads the HI-II-14 unbiased interactome from the large scale
    screening of Rolland 2014.
    Yields interactions as lists of identifiers.
    """
    url = urls.urls['hiii14']['url']
    c = curl.Curl(url, silent=False, large=True)
    xlsname = c.fileobj.name
    c.fileobj.close()
    tbl = inputs_common.read_xls(xlsname, sheet='2G')

    for row in tbl[1:]:

        yield [c.split('.')[0] for c in row]
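Because the function is a generator, materialize it with `list` when the whole interactome is needed at once; the `pypath.inputs.huri` module path is an assumption:

# Hypothetical usage; the module path below is an assumption.
from pypath.inputs import huri

interactions = list(huri.rolland_hi_ii_14())
print(len(interactions), interactions[0])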
Example #6
def rolland_hi_ii_14():
    """
    Loads the HI-II-14 unbiased interactome from the large scale
    screening of Rolland 2014.
    Yields interactions as lists of identifiers.
    """

    xlsname = cell.cell_supplementary(
        supp_url=urls.urls['hiii14']['url'],
        article_url=urls.urls['hiii14']['article_url'],
    )
    tbl = inputs_common.read_xls(xlsname, sheet='2G')

    for row in tbl[1:]:

        yield [c.split('.')[0] for c in row]
Example #7
def cspa_annotations(organism = 9606):
    """
    Cell Surface Protein Atlas (CSPA) annotations. Returns a dict with
    UniProt IDs as keys and sets of CspaAnnotation tuples as values.
    """

    CspaAnnotation = collections.namedtuple(
        'CspaAnnotation',
        [
            'high_confidence',
            'n_cell_types',
            'tm',
            'gpi',
            'uniprot_cell_surface',
        ],
    )

    sheets = {
        'Human': 'Table A',
        'Mouse': 'Table B',
    }

    str_organism = taxonomy.taxids[organism].capitalize()

    url = urls.urls['cspa']['url_s2']
    c = curl.Curl(url, large = True, silent = False)
    xlsname = c.fname
    del(c)
    raw = inputs_common.read_xls(xlsname, sheets[str_organism])[1:]

    result = collections.defaultdict(set)

    for row in raw:

        for uniprot in mapping.map_name(row[1], 'uniprot', 'uniprot'):

            result[uniprot].add(
                CspaAnnotation(
                    high_confidence = 'high confidence' in row[2],
                    n_cell_types = int(float(row[9])),
                    tm = int(float(row[11])),
                    gpi = int(float(row[12])),
                    uniprot_cell_surface = row[13] == 'yes',
                )
            )

    return dict(result)
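A hedged filtering sketch over the returned annotations; the `pypath.inputs.cspa` module path is an assumption:

# Hypothetical usage; the module path below is an assumption.
from pypath.inputs import cspa

annotations = cspa.cspa_annotations(organism = 9606)

# High confidence surface proteins seen in at least five cell types.
surface = {
    uniprot
    for uniprot, annots in annotations.items()
    if any(a.high_confidence and a.n_cell_types >= 5 for a in annots)
}
print(len(surface))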
Example #8
def matrisome_annotations(organism=9606):
    """
    Downloads MatrisomeDB 2.0, a database of extracellular matrix proteins.
    Returns dict where keys are UniProt IDs and values are tuples of
    classes, subclasses and notes.
    """

    MatrisomeAnnotation = collections.namedtuple(
        'MatrisomeAnnotation', ['mainclass', 'subclass', 'subsubclass'])

    tax_names = {
        10090: ('Murine', 'mm'),
        9606: ('Human', 'hs'),
    }

    url = urls.urls['matrisome']['url_xls'] % tax_names[organism]
    c = curl.Curl(url, large=True, silent=False)
    xlsname = c.fname
    del c
    raw = inputs_common.read_xls(xlsname)[1:]

    result = collections.defaultdict(set)

    for r in raw:

        uniprots = set(r[7].split(':'))
        uniprots.discard('')

        if not uniprots:
            continue

        uniprots = mapping.map_names(uniprots, 'uniprot', 'uniprot')

        for uniprot in uniprots:

            result[uniprot].add(
                MatrisomeAnnotation(
                    mainclass=r[0].strip(),
                    subclass=r[1].strip(),
                    subsubclass=r[10].strip() or None,
                ))

    return dict(result)
Example #9
def embrace_raw():
    """
    Returns Supplementary Table S11 from 10.1016/j.isci.2019.10.026
    (Sheikh et al. 2019) as a list of tuples.
    """

    path = cell_input.cell_supplementary(
        supp_url=urls.urls['embrace']['url'],
        article_url=urls.urls['embrace']['article'],
    )

    content = inputs_common.read_xls(path)

    EmbraceRawRecord = collections.namedtuple('EmbraceRawRecord', content[0])

    return [
        EmbraceRawRecord(*(line[:2] + [int(float(n)) for n in line[2:]]))
        for line in content[1:]
    ]
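The pattern above derives the record type from the sheet's header row. A self-contained sketch of the same technique on made-up data (all names hypothetical):

import collections

# Header row as read from a spreadsheet; names must be valid identifiers.
header = ['gene', 'uniprot', 'count_a', 'count_b']
rows = [['ACE2', 'Q9BYF1', '12.0', '3.0']]

Record = collections.namedtuple('Record', header)

records = [
    Record(*(line[:2] + [int(float(n)) for n in line[2:]]))
    for line in rows
]
print(records[0].count_a)  # 12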
Example #10
def kinasedotcom_annotations():
    """
    Downloads and processes kinase annotations from kinase.com.
    """

    KinasedotcomAnnotation = collections.namedtuple(
        'KinasedotcomAnnotation', ['group', 'family', 'subfamily'])
    KinasedotcomAnnotation.__new__.__defaults__ = (None, )

    def add_record(uniprot, rec, offset=2):

        if rec[offset].strip():
            result[uniprot].add(
                KinasedotcomAnnotation(
                    group=rec[offset].strip(),
                    family=rec[offset + 1].strip(),
                    subfamily=rec[offset + 2].strip() or None,
                ))

    url = urls.urls['kinome']['url']
    c = curl.Curl(url, large=True, silent=False)
    xlsf = c.fileobj
    xlsname = xlsf.name
    xlsf.close()
    tbl = inputs_common.read_xls(xlsname)

    result = collections.defaultdict(set)

    for rec in tbl:

        uniprots = mapping.map_name(rec[23].strip(), 'genesymbol', 'uniprot')

        for uniprot in uniprots:

            add_record(uniprot, rec)

            if rec[12].strip():

                add_record(uniprot, rec, offset=12)

    return result
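Rows may carry a second classification block (hence the second `add_record` call with `offset=12`), so one kinase can accumulate several annotation tuples. A hedged usage sketch; the `pypath.inputs.kinasedotcom` module path is an assumption:

# Hypothetical usage; the module path below is an assumption.
from pypath.inputs import kinasedotcom

annotations = kinasedotcom.kinasedotcom_annotations()

# Kinases with more than one group/family/subfamily annotation.
multi = {u: a for u, a in annotations.items() if len(a) > 1}
print(len(multi))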
Example #11
def surfaceome_annotations():
    """
    Downloads the "In silico human surfaceome".
    Returns a dict with UniProt IDs as keys and tuples of surfaceome
    prediction score, class and subclasses as values (columns B, N, S
    and T of table S3).
    """

    url = urls.urls['surfaceome']['url']
    c = curl.Curl(url, large=True, silent=False)
    xlsname = c.fname
    del c
    raw = inputs_common.read_xls(xlsname, 'in silico surfaceome only')[2:]

    return {
        uniprot: (
            float(r[13]),  # score
            r[18] or None,  # class
            set(r[19].replace('KInase', 'Kinase').split(';'))
                if r[19] else set(),  # subclass
        )
        for r in raw
        for uniprot in mapping.map_name(r[1], 'uniprot', 'uniprot')
    }
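A hedged usage sketch; the module path and the score threshold are assumptions for illustration:

# Hypothetical usage; the module path below is an assumption.
from pypath.inputs import surfaceome

surf = surfaceome.surfaceome_annotations()

# Proteins with a surfaceome prediction score of at least 0.9.
high_scoring = {
    uniprot
    for uniprot, (score, cls, subclasses) in surf.items()
    if score >= 0.9
}
print(len(high_scoring))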
Example #12
def icellnet_interactions():
    """
    Downloads the ICELLNET ligand-receptor interaction table and yields
    IcellnetRecord tuples.
    """

    url = urls.urls['icellnet']['url']

    c = curl.Curl(url, silent=False, large=True)

    xls = c.fileobj
    xlsfile = xls.name
    xls.close()
    tbl = inputs_common.read_xls(xlsfile)

    for line in tbl[1:]:

        references = _icellnet_get_references(line)
        resources = _icellnet_get_resources(line)

        if resources:
            references.extend([r for r in resources if r.isdigit()])
            resources = [r for r in resources if not r.isdigit()]

        ligand_components = _icellnet_get_components(line, (0, 1))
        receptor_components = _icellnet_get_components(line, (2, 3, 4))

        ligand = _icellnet_get_entity(ligand_components, references)
        receptor = _icellnet_get_entity(receptor_components, references)

        yield IcellnetRecord(
            ligand=ligand,
            receptor=receptor,
            family=line[6].strip() or None,
            subfamily=line[7].strip() or None,
            classification=([
                cls.strip().replace('.', '').capitalize()
                for cls in line[8].split('/')
            ] if line[8].strip() else None),
            resources=resources,
            references=references,
        )
Example #13
def wojtowicz2020_raw():
    """
    Returns Supplementary Table S4 from 10.1016/j.cell.2020.07.025
    (Wojtowicz et al. 2020) as a list of tuples.
    """

    path = cell_input.cell_supplementary(
        supp_url=urls.urls['wojtowicz2020']['url'],
        article_url=urls.urls['wojtowicz2020']['article'],
    )

    content = inputs_common.read_xls(path)

    fields = content.pop(0)
    fields = [re.sub('[- ]', '_', f.lower()) for f in fields]

    Wojtowicz2020RawRecord = collections.namedtuple(
        'Wojtowicz2020RawRecord',
        fields,
    )

    # Zero-based columns 6..16 hold numeric measurements: convert those
    # to float, keep everything else as string.
    return [
        Wojtowicz2020RawRecord(*(
            float(f) if 5 < i < 17 else f
            for i, f in enumerate(line)
        ))
        for line in content
    ]
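The header sanitization above turns spreadsheet column titles into valid namedtuple field names. A self-contained check with made-up titles:

import collections
import re

fields = ['Gene Name', 'UniProt-ID', 'Max RFU']
# Lowercase, then replace spaces and hyphens, as in wojtowicz2020_raw.
fields = [re.sub('[- ]', '_', f.lower()) for f in fields]

Rec = collections.namedtuple('Rec', fields)
print(Rec._fields)  # ('gene_name', 'uniprot_id', 'max_rfu')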
Example #14
def baccin2019_interactions(ncbi_tax_id=9606):

    recamel = re.compile(r'(.+?)([A-Z][a-z])')

    def camel_to_snake(value):

        return recamel.sub(
            lambda m: m.group(1).lower() + '_' + m.group(2),
            value.strip(),
        ).lower()

    def id_translate(mouse_gs):

        uniprots = mapping.map_name(
            mouse_gs,
            'genesymbol',
            'uniprot',
            10090,
        )

        if ncbi_tax_id != 10090:
            # `homology` is instantiated below, before the main loop,
            # whenever the target organism is not mouse.
            uniprots = set(
                itertools.chain(*(
                    homology.translate(uniprot)
                    for uniprot in uniprots
                ))
            )

        return uniprots

    def raw_to_uniprots(raw):

        components = raw.split('&')

        return set(
            itertools.product(*(id_translate(comp) for comp in components)))

    def get_partners(components, sources, references):

        return {
            comp[0]
                if len(comp) == 1 else
            intera.Complex(
                components=comp,
                sources=sources,
                references=references,
            )
            for comp in components
        }

    Baccin2019Interaction = collections.namedtuple('Baccin2019Interaction', [
        'ligand',
        'receptor',
        'correct',
        'ligand_location',
        'ligand_category',
        'resources',
        'references',
    ])

    source_names = {
        'Baccin': 'Baccin2019',
        'Ramilowski': 'Ramilowski2015',
    }

    url = urls.urls['baccin2019']['url']
    c = curl.Curl(url, silent=False, large=True)
    data = inputs_common.read_xls(c.fileobj.name, sheet='SuppTable3')

    result = []

    if ncbi_tax_id != 10090:
        homology = homology_mod.ProteinHomology(
            target=ncbi_tax_id,
            source=10090,
        )

    for rec in data[3:]:

        if rec[4].strip().lower() == 'incorrect':
            continue

        ligand_components = raw_to_uniprots(rec[1])

        if not ligand_components:
            continue

        receptor_components = raw_to_uniprots(rec[2])

        if not receptor_components:
            continue

        sources = {'Baccin2019', rec[3].strip()}
        sources = {
            source_names[s] if s in source_names else s
            for s in sources
        }

        references = {
            _ref
            for _ref in (ref.strip().replace('.0', '')
                         for ref in rec[7].split(',')) if _ref.isdigit()
        }

        ligands = get_partners(ligand_components, sources, references)
        receptors = get_partners(receptor_components, sources, references)

        for ligand, receptor in itertools.product(ligands, receptors):
            result.append(
                Baccin2019Interaction(
                    ligand=ligand,
                    receptor=receptor,
                    correct=rec[4].strip(),
                    ligand_location=camel_to_snake(rec[5]),
                    ligand_category=camel_to_snake(rec[6]),
                    resources=sources,
                    references=references,
                ))

    return result
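A self-contained check of the `camel_to_snake` helper's regex behavior (illustrative inputs only):

import re

recamel = re.compile(r'(.+?)([A-Z][a-z])')

def camel_to_snake(value):
    # Insert an underscore at each internal CamelCase boundary,
    # then lowercase the whole string.
    return recamel.sub(
        lambda m: m.group(1).lower() + '_' + m.group(2),
        value.strip(),
    ).lower()

print(camel_to_snake('MembraneBound'))     # membrane_bound
print(camel_to_snake('SecretedMolecule'))  # secreted_molecule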
Example #15
def kirouac2010_interactions():
    """
    Returns a list of ligand-receptor gene symbol pairs (tuples).
    """

    rename = re.compile(r'[A-Z]{2}[A-Z0-9][-A-Z0-9]*')
    rerange = re.compile(r'([0-9])-([0-9])')
    reslash = re.compile(r'.*?([A-Z0-9]{1,3}/[/A-Z0-9]+)')

    def get_names(s):

        names = set()
        prev = None

        for n in s.split():

            m = rename.findall(n)

            if m:
                prev = m
                m = reslash.match(n)

                if m:
                    for post in m.groups()[0].split('/'):
                        for pre in prev:
                            names.add('%s%s' % (pre, post))

                else:
                    m = rerange.match(n)

                    if m:
                        intv = m.groups()

                        for post in range(int(intv[0]), int(intv[1]) + 1):
                            for pre in prev:
                                names.add('%s%u' % (pre, post))

                    else:
                        names.update(prev)

            prev = None

        return names

    init_url = urls.urls['kirouac2010']['init_url']
    req_headers = [
        ('User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) '
         'Gecko/20100101 Firefox/68.0'),
    ]
    url = urls.urls['kirouac2010']['url']

    c00 = curl.Curl(url, call=False, process=False)

    if (not os.path.exists(c00.cache_file_name)
            or os.path.getsize(c00.cache_file_name) == 0):
        _log('Kirouac 2010 download: requesting website cookie.')

        c0 = curl.Curl(
            init_url,
            silent=True,
            large=False,
            req_headers=req_headers,
            follow=False,
            cache=False,
        )

        cookies = []

        if hasattr(c0, 'resp_headers'):
            for hdr in c0.resp_headers:
                if hdr.lower().startswith(b'set-cookie'):
                    cookie = hdr.split(b':')[1].split(b';')[0].strip()

                    if cookie not in cookies:
                        cookies.append(cookie.decode('ascii'))

            cookies = '; '.join(cookies)

            req_headers.append('Cookie: %s' % cookies)

            _log('Response header: %s' % str(c0.resp_headers))
            _log('Cookies: %s' % str(cookies))
            _log('Request header: %s' % str(req_headers))

        os.remove(c00.cache_file_name)

    c = curl.Curl(
        url,
        silent=False,
        large=True,
        req_headers=req_headers,
    )
    xlsname = c.fname
    del c
    tbl = inputs_common.read_xls(xlsname, sheet='S12')

    result = []

    for r in tbl[2:]:
        namesA = get_names(r[0])
        namesB = get_names(r[1])

        result.extend(list(itertools.product(namesA, namesB)))

    return result
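The cookie handling above parses raw `Set-Cookie` response headers (bytes). A self-contained sketch of the same parsing on a made-up header list:

# Illustrative only: parse Set-Cookie headers into a Cookie request header.
resp_headers = [
    b'Set-Cookie: JSESSIONID=abc123; Path=/; HttpOnly',
    b'Content-Type: application/vnd.ms-excel',
]

cookies = []

for hdr in resp_headers:
    if hdr.lower().startswith(b'set-cookie'):
        cookie = hdr.split(b':')[1].split(b';')[0].strip()
        if cookie not in cookies:
            cookies.append(cookie.decode('ascii'))

print('Cookie: %s' % '; '.join(cookies))  # Cookie: JSESSIONID=abc123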
Example #16
def embrace_raw():
    """
    Returns Supplementary Table S11 from 10.1016/j.isci.2019.10.026
    (Sheikh et al. 2019) as a list of tuples.
    """

    url = urls.urls['embrace']['url']
    c_nocall = curl.Curl(
        url,
        call=False,
        setup=False,
        process=False,
        silent=True,
    )
    c_nocall.get_cache_file_name()
    path = c_nocall.cache_file_name

    init_url = urls.urls['embrace']['article']
    req_headers = []

    if not os.path.exists(path):

        cookies = {}

        for step in range(3):

            c_init = curl.Curl(
                init_url,
                silent=True,
                large=True,
                cache=False,
                follow=False,
                req_headers=req_headers + ['user-agent: curl/7.69.1'],
                bypass_url_encoding=True,
                retries=1,
                empty_attempt_again=False,
            )

            new_cookies = dict(
                tuple(
                    h.decode().split(':')[1]
                        .split(';')[0]
                        .strip()
                        .split('=', maxsplit = 1)
                )
                for h in c_init.resp_headers
                if h.lower().startswith(b'set-cookie')
            )
            cookies.update(new_cookies)
            # Drop the load balancer session cookie.
            _ = cookies.pop('__cflb', None)

            for h in c_init.resp_headers:

                if h.lower().startswith(b'location'):

                    init_url = h.decode().split(':', maxsplit=1)[1].strip()

            req_headers = ([
                'Cookie: %s' % '; '.join(
                    '%s=%s' % cookie
                    for cookie in iteritems(cookies)
                )
            ] if cookies else [])

            _log('HTTP %u; location: `%s`, cookies: `%s`.' % (
                c_init.status,
                init_url,
                req_headers[0] if req_headers else '',
            ))

            if c_init.status != 302:

                break

    c_table = curl.Curl(
        url,
        silent=False,
        large=True,
        empty_attempt_again=False,
        req_headers=req_headers + ['user-agent: curl/7.69.1'],
    )
    path = c_table.cache_file_name
    c_table.fileobj.close()

    content = inputs_common.read_xls(path)

    EmbraceRawRecord = collections.namedtuple('EmbraceRawRecord', content[0])

    return [
        EmbraceRawRecord(*(line[:2] + [int(float(n)) for n in line[2:]]))
        for line in content[1:]
    ]