Example #1
 def tissues_x_proteins(self, normalized=True, tissues=None):
     '''
     Downloads the expression of all proteins for all tissues.
     The result is a dict of dicts holding the expression values
     of each protein, grouped by sample.
     '''
     self.get_tissues()
     tissues_selected = set([
         t['TISSUE_ID'] for t in self.tissues
         if tissues is None or t['TISSUE_ID'] in tissues
     ]) - self.tissues_loaded
     prg = progress.Progress(
         len(tissues_selected),
         'Downloading expression data',
         1,
         percent=False)
     for tis in tissues_selected:
         prg.step()
         sys.stdout.write('Querying tissue %s\n' % tis)
         sys.stdout.flush()
         self.get_proteins(tis)
         if not hasattr(self.result, 'read'):
             sys.stdout.write('\tFailed: %s\n' % tis)
             sys.stdout.flush()
         else:
             self.tissues_loaded.add(tis)
             self.get_expression(normalized)
             if tis not in self.samples:
                 self.samples[tis] = []
             self.samples[tis] = uniq_list(self.samples[tis] + list(
                 self.current_samples))
             self.current_samples = set([])
     prg.terminate()
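
All of these examples follow the same three-step pattern around `progress.Progress`: create it with the total number of work units, call `step()` once per unit, and `terminate()` at the end. A minimal sketch of that pattern, assuming the module is importable as `pypath.share.progress` (older pypath releases expose it as `pypath.progress`):

from pypath.share import progress

items = ['a', 'b', 'c']

# total, name and update interval, mirroring the calls in the examples
prg = progress.Progress(len(items), 'Processing items', 1, percent=False)

for item in items:
    # ... do the per-item work here ...
    prg.step()       # advance by one unit; step(n) advances by n

prg.terminate()      # finish and close the progress display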
Example #2
 def smiles2chembl(self, smiles):
     self.result = {}
     prg = progress.Progress(total=len(smiles),
                             name='Translating SMILES',
                             interval=1)
     for sml in smiles:
         url = self.chembl_url.format(sml)
         c = curl.Curl(url, large=False)
         result = c.result
         self.result[sml] = []
         if result is not None:
             try:
                 data = json.loads(result)
                 for d in data['compounds']:
                     this_smile = d['smiles']
                     this_chembl = d['chemblId']
                     # if this_smile == sml:
                     self.result[sml].append(this_chembl)
             except ValueError:
                 soup = bs4.BeautifulSoup(result, 'html.parser')
                 compounds = soup.find_all('compound')
                 if compounds is not None:
                     for compound in compounds:
                         this_smile = compound.find('smiles').text
                         this_chembl = compound.find('chemblid').text
                         # if this_smile == sml:
                         self.result[sml].append(this_chembl)
         prg.step()
     prg.terminate()
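
A hypothetical call, where `u` stands for the owning instance (assumed to define `chembl_url` as a format string); the SMILES below is aspirin's, used purely as an illustrative input:

u.smiles2chembl(['CC(=O)Oc1ccccc1C(=O)O'])
print(u.result)   # e.g. {'CC(=O)Oc1ccccc1C(=O)O': ['CHEMBL25']}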
Example #3
File: gsea.py  Project: rfour92/pypath
 def load_collection(self,
                     collname,
                     id_type='entrez',
                     map_ids=True,
                     cachedir='cache'):
     if os.path.exists(os.path.join(cachedir, 'gsea-%s.pickle' % collname)):
         self.load([collname])
         return None
     url = self.collections[collname]['urls'][id_type]
     data = dataio.curl(
         url,
         req_headers=self.session,
         silent=False,
         cache=False,
         write_cache=True)
     data = data.split('\n')
     names = []
     prg = progress.Progress(len(data), 'Loading gene sets', 1)
     for line in (l.split('\t') for l in data if len(l) > 0):
         prg.step()
         setname = line[0].strip()
         self.write_set(line[2:], setname, id_type, map_ids)
         self.get_desc(setname)
         names.append(setname)
     prg.terminate()
     self.groups[collname] = set(names)
     self.save([collname], cachedir=cachedir)
Example #4
File: go.py  Project: rfour92/pypath
def annotate(graph, organism=9606, aspects=('C', 'F', 'P')):
    """
    Adds Gene Ontology annotations to the nodes of a graph.
    
    :param igraph.Graph graph:
        Any ``igraph.Graph`` object with uniprot IDs
        in its ``name`` vertex attribute.
    """

    aspects = aspects if type(aspects) in {list, tuple} else (aspects, )

    graph.vs['go'] = [{
        'C': set(),
        'F': set(),
        'P': set()
    } for _ in xrange(graph.vcount())]

    terms, annot = dataio.go_annotations_goa(organism=organism)

    prg = progress.Progress(graph.vcount(), 'Loading GO annotations', 9)

    for v in graph.vs:

        prg.step()

        for asp in aspects:

            if v['name'] in annot[asp]:

                v['go'][asp] = annot[asp][v['name']]

    prg.terminate()
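
A usage sketch for `annotate`; it needs network access for the GOA download, and the vertex names below are illustrative UniProt accessions:

import igraph

g = igraph.Graph()
g.add_vertices(2)
g.vs['name'] = ['P00533', 'P04637']   # UniProt IDs in the `name` attribute

annotate(g, organism=9606)

# every vertex now carries one set of GO terms per aspect
print(g.vs[0]['go']['P'])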
Example #5
File: chembl.py  Project: rfour92/pypath
 def compounds_targets_mechanism(self,
                                 id_list,
                                 id_type='uniprot',
                                 domains=False,
                                 pred_bind_d=False,
                                 activities=False,
                                 pchembl=False,
                                 one_query=False,
                                 client_side=False):
     if id_type == 'uniprot':
         compound_lookup = True
         id_list = self.get_chembl_uniprots(id_list)
     self.result = []
     id_list = id_list if type(id_list) is list else [id_list]
     if one_query:
         query_thread = threading.Thread(
             target=self.compound_target_mechanism,
             args=[id_list],
             kwargs={
                 'id_type': id_type,
                 'domains': domains,
                 'pred_bind_d': pred_bind_d,
                 'activities': activities,
                 'pchembl': pchembl
             })
         query_thread.daemon = True
         query_thread.start()
         sys.stdout.write('\n')
         sys.stdout.flush()
         while query_thread.is_alive():
             self.mysql.print_status()
             time.sleep(1)
         self.mysql_ready()
         if client_side:
             self.result = list(self.result)
     else:
         prg = progress.Progress(total=len(id_list),
                                 name='Sending queries',
                                 interval=5)
         qids = []
         for identifier in id_list:
             prg.step()
             qids.append(
                 self.compound_target_mechanism(identifier,
                                                id_type=id_type,
                                                domains=domains,
                                                pred_bind_d=pred_bind_d,
                                                activities=activities,
                                                pchembl=pchembl,
                                                wait=False))
         prg.terminate()
         self.mysql_ready(qids)
         for qid in qids:
             self.result += list(self.mysql.get_result(qid))
Example #6
    def _make_df_igraph(
            self,
            unique_pairs = True,
            extra_node_attrs = None,
            extra_edge_attrs = None,
        ):
        """
        See docs at method ``make_df``.
        """

        self._log('Creating data frame from `legacy.main.PyPath` object.')

        result = []

        self.pa.genesymbol_labels()

        self.extra_node_attrs = extra_node_attrs or self.extra_node_attrs
        self.extra_edge_attrs = extra_edge_attrs or self.extra_edge_attrs

        dtypes = (
            self.default_dtypes_uniquepairs
                if unique_pairs else
            self.default_dtypes_bydirs
        )

        header = self.get_header(unique_pairs = unique_pairs)

        prg = progress.Progress(
            total = self.graph.ecount(),
            name = 'Creating table',
            interval = 31
        )

        for e in self.graph.es:

            # adding default fields
            lines = (
                self._process_edge_uniquepairs_igraph(e)
                    if unique_pairs else
                self._process_edge_bydirection_igraph(e)
            )

            result.extend(lines)

            prg.step()

        prg.terminate()

        self.df = pd.DataFrame(result, columns = header)
        self.df = self.df.astype(dtypes)
Example #7
 def inchikey2anything(self, target, lst):
     self.result = {}
     target = str(target) if type(target) is int else self.name_dict[target]
     prg = progress.Progress(total=len(lst),
                             name='Translating InChIKeys',
                             interval=1)
     for inchik in lst:
         url = self.inchi_stem % inchik
         c = curl.Curl(url, large=False)
         result = c.result
         if result is not None:
             data = json.loads(result)
             self.result[inchik] = [
                 d['src_compound_id'] for d in data if d['src_id'] == target
             ]
         prg.step()
     prg.terminate()
Example #8
def get_pubmeds(pmids):

    pmids = [str(pmid) for pmid in pmids]
    url = urls.urls['pubmed-eutils']['url']
    cache = len(pmids) < 10
    data = {}
    prg = progress.Progress(len(pmids) // 100 + 1,
                            'Retrieving data from NCBI e-utils',
                            1,
                            percent=False)

    for offset in xrange(0, len(pmids), 100):
        prg.step()
        post = {
            'id': ','.join(pmids[offset:offset + 100]),
            'retmode': 'json',
            'db': 'pubmed'
        }

        for i in xrange(3):
            try:
                c = curl.Curl(
                    url,
                    silent=False,
                    cache=cache,
                    post=post,
                    override_post=True,
                )
                res = c.result
                data = dict(
                    [(k, v) for k, v in iteritems(json.loads(res)['result'])] +
                    [(k, v) for k, v in iteritems(data)])

                break

            except ValueError:
                sys.stdout.write('\t:: Error in JSON, retry %u\n' % i)
                sys.stdout.flush()

    prg.terminate()

    return data
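
A usage sketch; the PMIDs are placeholders. Note that caching is enabled only for queries of fewer than ten IDs:

records = get_pubmeds(['21045205', '20538566'])
# `records` maps each PMID (as a string) to its e-utils summary record
print(sorted(records.keys()))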
Example #9
 def connectivity_search(self,
                         id_list,
                         id_type,
                         parameters=(1, 0, 0, 0, 0, 1, 0)):
     '''
     `parameters` is a sequence of the search parameters A-G as
     described at https://www.ebi.ac.uk/unichem/info/widesearchInfo;
     the H parameter is appended automatically and must be 1 to
     process the result, e.g. [1, 0, 0, 0, 0, 1, 0, 1].
     '''
     # copy before appending, so the default is never mutated across calls
     parameters = list(parameters) + [1]
     parameters = [str(i) for i in parameters]
     self.result = {}
     if id_type == 'inchikey':
         id_type = ''
         method = 'key_search'
     elif id_type == 'smiles':
         self.result = None
         return None
     else:
         id_type = str(
             id_type) if type(id_type) is int else self.name_dict[id_type]
         id_type = '%s/' % id_type
         method = 'cpd_search'
     prg = progress.Progress(total=len(id_list),
                             name='Connectivity search',
                             interval=1)
     for i in id_list:
         prg.step()
         url = self.cpd_search.format(method, i, id_type,
                                      '/'.join(parameters))
         c = curl.Curl(url, large=False)
         result = c.result
         self.result[i] = []
         if result is not None:
             data = json.loads(result)
             for k, v in iteritems(data):
                 for j in range(1, len(v)):
                     self.result[i].append(v[j][0])
         self.result[i] = list(set(self.result[i]))
     prg.terminate()
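
A hypothetical call, with `u` standing for the UniChem client instance; the InChIKey below is aspirin's, used as an illustrative input:

u.connectivity_search(
    ['BSYNRYMUTXBXSQ-UHFFFAOYSA-N'],
    'inchikey',
    parameters=[1, 0, 0, 0, 0, 1, 0],
)
print(u.result)   # {inchikey: [IDs of connected compounds]}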
Example #10
 def translate(self, source, target, lst):
     if source == 'inchikey':
         self.inchikey2anything(target, lst)
         return None
     if source == 'smiles':
         self.smiles2chembl(lst)
         return None
     self.result = {}
     source = str(source) if type(source) is int else self.name_dict[source]
     target = str(target) if type(target) is int else self.name_dict[target]
     prg = progress.Progress(total=len(lst),
                             name='Translating compound identifiers',
                             interval=1)
     for comp in lst:
         url = '/'.join([self.url_stem, comp, source, target])
         c = curl.Curl(url, large=False)
         result = c.result
         self.result[comp] = []
         if result is not None:
             data = json.loads(result)
             for d in data:
                 self.result[comp].append(d['src_compound_id'])
         prg.step()
     prg.terminate()
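
A hypothetical call on the same instance; source and target may be UniChem numeric source IDs or names resolvable through `name_dict` (the names below are assumptions about that mapping):

u.translate('chembl', 'drugbank', ['CHEMBL25'])
print(u.result)   # {'CHEMBL25': [matching DrugBank IDs]}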
Example #11
File: pfam.py  Project: rfour92/pypath
def get_pfam(uniprots=None, organism=9606):

    if uniprots is None:

        uniprots = uniprot_input.all_uniprots(
            organism=organism,
            swissprot=True,
        )

    u_pfam = {}
    pfam_u = {}

    if uniprots is not None:

        prg = progress.Progress(
            len(uniprots) // 30 + 1,
            'Downloading data from UniProt',
            1,
        )
        data_all = []

        for i in xrange(0, len(uniprots), 30):

            to = i + 30
            thisPart = uniprots[i:to]
            thisPart = ' OR '.join(['accession:%s' % u for u in thisPart])
            get = {
                'query': thisPart,
                'format': 'tab',
                'columns': 'id,database(Pfam)'
            }
            for j in xrange(3):
                c = curl.Curl(urls.urls['uniprot_basic']['url'], get=get)
                data = c.result
                if data is not None:
                    break
            if data is None:
                return None, None
            data = data.split('\n')
            del data[0]
            del data[-1]
            data_all += data
            prg.step()

        prg.terminate()

    else:

        organism = taxonomy.ensure_ncbi_tax_id(organism)

        if not organism:

            return None, None

        organismQuery = 'organism:%u AND reviewed:yes' % organism
        get = {
            'query': organismQuery,
            'format': 'tab',
            'columns': 'id,database(Pfam)'
        }

        for j in xrange(3):

            c = curl.Curl(
                urls.urls['uniprot_basic']['url'],
                get=get,
                silent=False,
                outf='uniprot-pfam-%u.tab' % organism,
            )
            data_all = c.result
            if data_all is not None:
                break

        if data_all is None:
            return None, None

        data_all = data_all.split('\n')
        del data_all[0]

    for l in data_all:

        l = l.split('\t')

        pfams = re.sub(';$', '', l[1]).strip()
        pfams = pfams.split(';') if pfams else []

        if l[0] not in u_pfam:

            u_pfam[l[0]] = []

        u_pfam[l[0]] += pfams

        for pfam in pfams:

            if pfam not in pfam_u:
                pfam_u[pfam] = []

            pfam_u[pfam].append(l[0])

    return u_pfam, pfam_u
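
A usage sketch; with `uniprots=None` the function first fetches all SwissProt accessions for the organism, then queries UniProt in batches of 30:

u_pfam, pfam_u = get_pfam(organism=9606)
print(len(u_pfam), 'proteins,', len(pfam_u), 'Pfam domains')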
Example #12
File: kegg.py  Project: rfour92/pypath
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.
    Returns list of interactions.
    """

    positive_terms = {'activation', 'expression'}
    negative_terms = {'inhibition', 'repression'}
    transc_terms = {'expression', 'repression'}
    mechanism_terms = {
        'phosphorylation',
        'binding/association',
        'dissociation',
        'ubiquitination',
        'dephosphorylation',
        'glycosylation',
        'state change',
        'methylation',
    }
    direct_terms = {'indirect effect'}

    KeggInteraction = collections.namedtuple(
        'KeggInteraction',
        [
            'id_a',
            'id_b',
            'effect',
            'pathway',
            'mechanism',
            'is_direct',
            'transcriptional',
        ],
    )

    rehsa = re.compile(r'.*(hsa[0-9]+).*')
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent=True)
    htmllst = c.result
    lstsoup = bs4.BeautifulSoup(htmllst, 'html.parser')

    for a in lstsoup.find_all('a', href=True):
        m = rehsa.match(a['href'])

        if m:
            hsa_list.append((m.groups(0)[0], a.text))

    prg = progress.Progress(len(hsa_list),
                            'Processing KEGG Pathways',
                            1,
                            percent=False)

    for hsa, pw in hsa_list:

        prg.step()
        c = curl.Curl(urls.urls['kegg_pws']['kgml_url_2'] % hsa,
                      silent=True,
                      req_headers=req_hdrs)
        kgml = c.result
        kgmlsoup = bs4.BeautifulSoup(kgml, 'html.parser')
        entries = {}

        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')

            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        uentries = dict([(eid,
                          common.uniq_list(
                              common.flat_list([
                                  mapping.map_name(gn,
                                                   'genesymbol',
                                                   'uniprot',
                                                   strict=True) for gn in gns
                              ]))) for eid, gns in iteritems(entries)])

        for rel in kgmlsoup.find_all('relation'):

            subtypes = {st.attrs['name'] for st in rel.find_all('subtype')}

            if (rel.attrs['entry1'] in uentries
                    and rel.attrs['entry2'] in uentries and subtypes):

                is_direct = 'indirect effect' not in subtypes
                effect = (
                    'inhibition' if negative_terms & subtypes else
                    'activation' if positive_terms & subtypes else
                    'unknown'
                )
                mechanism = ';'.join(mechanism_terms & subtypes)
                transcriptional = bool(transc_terms & subtypes)

                for u1 in uentries[rel.attrs['entry1']]:

                    for u2 in uentries[rel.attrs['entry2']]:

                        interactions.append(
                            KeggInteraction(
                                id_a=u1,
                                id_b=u2,
                                effect=effect,
                                pathway=pw,
                                mechanism=mechanism,
                                is_direct=is_direct,
                                transcriptional=transcriptional,
                            ))

    prg.terminate()

    return common.uniq_list(interactions)
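
A usage sketch; the function takes no arguments and returns a deduplicated list of `KeggInteraction` named tuples:

interactions = kegg_interactions()

for ia in interactions[:5]:
    print(ia.id_a, ia.id_b, ia.effect, ia.mechanism, ia.pathway)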
Example #13
File: pfam.py  Project: rfour92/pypath
def get_pfam_regions(
    uniprots=[],
    pfams=[],
    keepfile=False,
    dicts='both',
):

    url = urls.urls['pfam_up']['url']
    outf = url.split('/')[-1]
    urlmd5 = common.md5(url)
    if not os.path.exists(settings.get('cachedir')):
        os.makedirs(settings.get('cachedir'))
    cachefile = os.path.join(settings.get('cachedir'), urlmd5 + '-' + outf)
    u_pfam = {}
    pfam_u = {}
    uniprots = set(uniprots)
    pfams = set(pfams)

    if not os.path.exists(cachefile):
        sys.stdout.write(
            '\t:: Downloading data from %s' %
            url.replace('http://', '').replace('ftp://', '').split('/')[0])
        sys.stdout.flush()
        if hasattr(urllib, 'urlretrieve'):
            urllib.urlretrieve(url, cachefile)
        else:
            urllib.request.urlretrieve(url, cachefile)
        sys.stdout.write('\n')

    with open(cachefile, 'rb') as f:
        f.seek(-4, 2)
        gzsize = struct.unpack('<I', f.read())[0]
        prg = progress.Progress(gzsize, 'Processing Pfam domains', 11)

    # 'rt': read as text, so fields compare equal to the str sets above
    with gzip.open(cachefile, 'rt') as f:

        for l in f:

            prg.step(len(l))
            l = l.strip().split()

            if l[0] in uniprots or l[4] in pfams:

                if dicts in ['uniprot', 'both']:

                    if l[0] not in u_pfam:
                        u_pfam[l[0]] = {}
                    if l[4] not in u_pfam[l[0]]:
                        u_pfam[l[0]][l[4]] = []
                    u_pfam[l[0]][l[4]].append({
                        'isoform': int(l[1]),
                        'start': int(l[5]),
                        'end': int(l[6])
                    })

                if dicts in ['pfam', 'both']:

                    if l[4] not in pfam_u:
                        pfam_u[l[4]] = {}
                    if l[0] not in pfam_u[l[4]]:
                        pfam_u[l[4]][l[0]] = []
                    pfam_u[l[4]][l[0]].append({
                        'isoform': int(l[1]),
                        'start': int(l[5]),
                        'end': int(l[6])
                    })

    prg.terminate()
    if not keepfile:
        os.remove(cachefile)
    if dicts == 'uniprot':
        return u_pfam
    elif dicts == 'pfam':
        return pfam_u
    else:
        return u_pfam, pfam_u
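
The progress total above comes from the gzip trailer: the last four bytes of a single-member gzip file hold the uncompressed size modulo 2**32 as a little-endian unsigned integer, so the bar can be sized without decompressing first. The same trick in isolation:

import struct

def gzip_uncompressed_size(path):
    with open(path, 'rb') as f:
        f.seek(-4, 2)                     # 4 bytes before end of file
        return struct.unpack('<I', f.read(4))[0]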
Example #14
def pathwaycommons_interactions(
    resources=None,
    types=None,
    by_interaction=False,
    version=12,
):

    interactions = collections.defaultdict(set) if by_interaction else []

    types = common.to_set(types)

    resources = {
        res.lower()
        for res in (common.to_list(resources) or (
            pc_res.name for pc_res in pathwaycommons_resources))
    }

    prg = progress.Progress(
        len(resources),
        'Processing PathwayCommons',
        1,
        percent=False,
    )

    url = urls.urls['pwcommons']['url']

    for resource in pathwaycommons_resources:

        if not resources & {resource.pc_label, resource.name.lower()}:

            continue

        prg.step()
        _version = min(resource.version, version)
        resource_url = url % (_version, _version, resource.pc_label)
        c = curl.Curl(resource_url, silent=False, large=True)

        for l in c.result:

            if hasattr(l, 'decode'):

                l = l.decode('ascii')

            l = l.strip('\n\r').split('\t')

            if not types or l[1] in types:

                if by_interaction:

                    a_b = (l[0], l[1], l[2])
                    b_a = (l[2], l[1], l[0])

                    directed = l[1] in pathwaycommons_directed_types

                    key = (b_a if (a_b not in interactions and not directed
                                   and b_a in interactions) else a_b)

                    interactions[key].add(
                        PathwayCommonsInteraction(*key,
                                                  resource=resource.name))

                else:

                    l.append(resource.name)
                    interactions.append(PathwayCommonsInteraction(*l))

    prg.terminate()

    return interactions
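
A usage sketch; resource names are matched case-insensitively against `pathwaycommons_resources`, so 'reactome' below is an assumption about the entries of that list:

by_key = pathwaycommons_interactions(resources='reactome', by_interaction=True)
# keys are (id_a, interaction_type, id_b) triplets; values are sets of
# PathwayCommonsInteraction records, one per contributing resource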
Example #15
    def resource_to_relationships_graph(
        self,
        graph,
    ) -> None:
        """
        Convert a PyPath igraph object into list of BEL relationships.
        """

        self._log('Building bel graph from PyPath object (igraph graph).')

        edges = graph.es
        prg = progress.Progress(
            len(edges),
            'Building bel graph from PyPath object (igraph graph).',
            1,
        )
        for edge in edges:
            prg.step()
            directions = edge['dirs']

            for direction in (directions.straight, directions.reverse):

                if not directions.dirs[direction]:
                    # this direction does not exist
                    continue

                dir_sources = directions.get_dir(direction, sources=True)

                if self.only_sources and not dir_sources & self.only_sources:
                    # this direction not provided
                    # in the currently enabled set of sources
                    continue

                predicates = set()

                activation, inhibition = (directions.get_sign(direction,
                                                              sources=True))

                if self._check_sign(activation):
                    predicates.add(pc.DIRECTLY_INCREASES)

                if self._check_sign(inhibition):
                    predicates.add(pc.DIRECTLY_DECREASES)

                if not predicates:
                    # use `regulates` if sign is unknown
                    predicates.add(pc.REGULATES)

                source = self._protein(direction[0])
                target = self._protein(direction[1])
                evid_cits = self._references(edge, direction)

                for (predicate,
                     (evid, cits)) in itertools.product(predicates, evid_cits):

                    for cit in cits:

                        self.bel_graph.add_qualified_edge(
                            source,
                            target,
                            relation=predicate,
                            citation=cit,
                            evidence='OmniPath',
                        )
                        self.bel_graph.add_qualified_edge(
                            source,
                            target,
                            relation=predicate,
                            citation=cit,
                            evidence=evid,
                        )

            if not self._has_direction(directions):
                # add an undirected relationship
                # if no direction available

                evid_cits = self._references(edge, 'undirected')
                source = self._protein(directions.nodes[0])
                target = self._protein(directions.nodes[1])

                for evid, cits in evid_cits:

                    for cit in cits:

                        self.bel_graph.add_association(
                            source,
                            target,
                            citation=cit,
                            evidence='OmniPath',
                        )
                        self.bel_graph.add_association(
                            source,
                            target,
                            citation=cit,
                            evidence=evid,
                        )

        prg.terminate()
        self._log('Building bel graph from PyPath object finished.')
Example #16
File: chembl.py  Project: rfour92/pypath
 def compounds_targets(self,
                       id_list,
                       id_type='uniprot',
                       assay_types=['B', 'F'],
                       relationship_types=['D', 'H'],
                       compound_props=[],
                       domains=False,
                       pred_bind_d=False,
                       action_type=False,
                       activities=False,
                       pchembl=False,
                       one_query=False,
                       client_side=False):
     '''
     Queries ChEMBL targets for the given compounds. With
     ``one_query=False`` each ID is sent as a separate MySQL query;
     this performs better when a single batch query would require
     a disk temporary table (disk_tmp_table).
     '''
     if id_type == 'uniprot':
         compound_lookup = True
         id_list = self.get_chembl_uniprots(id_list)
     self.result = []
     id_list = id_list if type(id_list) is list else [id_list]
     if one_query:
         query_thread = threading.Thread(target=self.compound_target,
                                         args=[id_list],
                                         kwargs={
                                             'id_type': id_type,
                                             'assay_types': assay_types,
                                             'relationship_types':
                                             relationship_types,
                                             'compound_props':
                                             compound_props,
                                             'domains': domains,
                                             'pred_bind_d': pred_bind_d,
                                             'action_type': action_type,
                                             'activities': activities,
                                             'pchembl': pchembl
                                         })
         query_thread.daemon = True
         query_thread.start()
         sys.stdout.write('\n')
         sys.stdout.flush()
         while query_thread.is_alive():
             self.mysql.print_status()
             time.sleep(1)
         self.mysql_ready()
         if client_side:
             self.result = list(self.result)
     else:
         prg = progress.Progress(total=len(id_list),
                                 name='Starting queries',
                                 interval=5)
         qids = []
         for identifier in id_list:
             prg.step()
             qids.append(
                 self.compound_target(identifier,
                                      id_type=id_type,
                                      assay_types=assay_types,
                                      relationship_types=relationship_types,
                                      compound_props=compound_props,
                                      domains=domains,
                                      pred_bind_d=pred_bind_d,
                                      action_type=action_type,
                                      activities=activities,
                                      pchembl=pchembl,
                                      wait=False))
         prg.terminate()
         self.mysql_ready(qids)
         for qid in qids:
             self.result.extend(list(self.mysql.get_result(qid)))
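
A hypothetical call, with `c` standing for a Chembl instance holding a configured MySQL connection; the UniProt IDs are illustrative:

c.compounds_targets(
    ['P00533', 'P04637'],
    id_type='uniprot',
    assay_types=['B', 'F'],
    one_query=False,       # one MySQL query per ID, collected via qids
)
print(len(c.result))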
Example #17
File: signor.py  Project: rfour92/pypath
def signor_pathways(**kwargs):
    """
    Obtains pathway annotations from Signor.
    """

    url = urls.urls['signor']['list_url']
    baseurl = urls.urls['signor']['all_url_new']

    proteins_pathways = {}
    interactions_pathways = {}

    c = curl.Curl(url, silent=True)

    soup = bs4.BeautifulSoup(c.result, 'html.parser')

    pathway_names = [(opt['value'], opt.text)
                     for opt in soup.find('select', {
                         'name': 'pathway_list'
                     }).findAll('option')]

    prg = progress.Progress(len(pathway_names),
                            'Downloading data from Signor',
                            1,
                            percent=False)

    for short, full in pathway_names:

        prg.step()

        if not short:

            continue

        binary_data = [(b'pathway_list', short.encode('ascii')),
                       (b'submit', b'Download')]

        c_pw = curl.Curl(
            baseurl,
            silent=True,
            binary_data=binary_data,
            encoding='utf-8',
        )

        #csv.DictReader(c_pw.result)

        sep = '@#@#@'
        lines = inputs_common.csv_sep_change(c_pw.result, '\t',
                                             sep).split('\n')[1:]

        data = list(
            filter(lambda l: len(l) > 6,
                   map(lambda l: l.strip().split(sep), lines)))

        proteins_pathways[full] = set()
        interactions_pathways[full] = set()

        for row in data:

            for uniprot1, uniprot2 in itertools.product(
                    mapping.map_name(row[4], 'uniprot', 'uniprot'),
                    mapping.map_name(row[8], 'uniprot', 'uniprot'),
            ):

                proteins_pathways[full].add(uniprot1)
                proteins_pathways[full].add(uniprot2)

                interactions_pathways[full].add((uniprot1, uniprot2))

    prg.terminate()

    return proteins_pathways, interactions_pathways
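
A usage sketch; both returned dicts are keyed by the full pathway name:

proteins_pw, interactions_pw = signor_pathways()

for pathway in list(proteins_pw)[:3]:
    print(pathway,
          len(proteins_pw[pathway]), 'proteins,',
          len(interactions_pw[pathway]), 'interactions')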
Example #18
File: kegg.py  Project: jgray7700/pypath
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.
    Returns list of interactions.
    """

    rehsa = re.compile(r'.*(hsa[0-9]+).*')
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent = True)
    htmllst = c.result
    lstsoup = bs4.BeautifulSoup(htmllst, 'html.parser')

    for a in lstsoup.find_all('a', href = True):
        m = rehsa.match(a['href'])

        if m:
            hsa_list.append((m.groups(0)[0], a.text))

    prg = progress.Progress(
        len(hsa_list), 'Processing KEGG Pathways', 1, percent = False)

    for hsa, pw in hsa_list:

        prg.step()
        c = curl.Curl(urls.urls['kegg_pws']['kgml_url_2'] % hsa,
                      silent = True,
                      req_headers = req_hdrs)
        kgml = c.result
        kgmlsoup = bs4.BeautifulSoup(kgml, 'html.parser')
        entries = {}

        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')

            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        uentries = dict([(eid, common.uniq_list(
            common.flat_list([
                mapping.map_name(
                    gn, 'genesymbol', 'uniprot', strict = True) for gn in gns
            ]))) for eid, gns in iteritems(entries)])

        for rel in kgmlsoup.find_all('relation'):
            st = rel.find('subtype')

            if (
                rel.attrs['entry1'] in uentries and
                rel.attrs['entry2'] in uentries and
                st and
                'name' in st.attrs
            ):
                for u1 in uentries[rel.attrs['entry1']]:
                    for u2 in uentries[rel.attrs['entry2']]:
                        interactions.append((u1, u2, st.attrs['name'], pw))

    prg.terminate()

    return common.uniq_list(interactions)
Example #19
def intact_interactions(
        miscore = 0.6,
        organism = 9606,
        complex_expansion = False,
        only_proteins = False,
        only_ids = False,
    ):
    """
    only_proteins : bool
        Keep only records of protein-protein interactions.
    only_ids : bool
        Load only the identifiers of interacting pairs
        (smaller memory footprint).
    """

    id_types = {
        'uniprotkb': 'uniprot',
    }

    IntactInteraction = collections.namedtuple(
        'IntactInteraction',
        (
            'id_a',
            'id_b',
            'id_type_a',
            'id_type_b',
            'pubmeds',
            'methods',
            'mi_score',
            'isoform_a',
            'isoform_b',
        ),
    )
    IntactInteraction.__new__.__defaults__ = (None,) * 7


    def get_id_type(field):

        id_type = None if field == '-' else field.split(':')[0]

        return id_types[id_type] if id_type in id_types else id_type


    def get_id(field):

        if field == '-':

            return None, None

        else:

            uniprot, isoform = _try_isoform(
                field.split(':')[1].replace('"', '')
            )

            uniprot = uniprot.split('-')[0]

            return uniprot, isoform


    def get_taxon(field):

        return (
            0
                if field == '-' else
            field.split('|')[0].split(':')[1].split('(')[0]
        )


    results = []
    url = urls.urls['intact']['mitab']

    if type(organism) is int:
        organism = '%u' % organism

    c = curl.Curl(
        url,
        silent = False,
        large = True,
        files_needed = ['intact.txt'],
    )

    data = c.result['intact.txt']
    size = c.sizes['intact.txt']
    prg = progress.Progress(size, 'Reading IntAct MI-tab file', 99)

    for lnum, l in enumerate(data):

        prg.step(len(l))

        if lnum == 0:

            continue

        l = l.strip('\n\r ').split('\t')

        taxon_a = get_taxon(l[9])
        taxon_b = get_taxon(l[10])

        if (
            (
                organism is None or (
                    taxon_a == organism and
                    taxon_b == organism
                )
            ) and (
                complex_expansion or
                'expansion' not in l[15]
            )
        ):

            # finding mi-score and author
            sc = '0'
            au = '0'

            for s in l[14].split('|'):

                if s.startswith('intact-miscore'):
                    sc = s.split(':')[1]

                if s.startswith('author'):
                    au = s.split(':')[1]

            # filtering for mi-score
            if float(sc) < miscore:

                continue

            id_type_a = get_id_type(l[0])
            id_type_b = get_id_type(l[1])

            if (
                only_proteins and not (
                    id_type_a == 'uniprot' and
                    id_type_b == 'uniprot'
                )
            ):

                continue

            id_a, isoform_a = get_id(l[0])
            id_b, isoform_b = get_id(l[1])

            key = tuple(sorted((id_a, id_b)))

            pubmeds = set(
                ref[1] for ref in (
                    ref.split(':')
                    for ref in l[8].split('|')
                )
                if ref[0] == 'pubmed'
            )
            methods = set(
                met.split('(')[1].strip(')"')
                for met in l[6].split('|')
            )

            results.append(
                IntactInteraction(
                    id_a = id_a,
                    id_b = id_b,
                    id_type_a = id_type_a,
                    id_type_b = id_type_b,
                    pubmeds = pubmeds,
                    methods = methods,
                    mi_score = sc,
                    isoform_a = isoform_a,
                    isoform_b = isoform_b,
                )
            )

    prg.terminate()

    return results
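
A usage sketch; raising `miscore` tightens the confidence filter, and `only_proteins=True` drops records where either side is not a UniProt ID:

hits = intact_interactions(miscore=0.75, organism=9606, only_proteins=True)

print(len(hits), 'interactions')
print(hits[0].id_a, hits[0].id_b, hits[0].mi_score)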