def annotate(graph, organism=9606, aspects=('C', 'F', 'P')):
    """
    Adds Gene Ontology annotations to the nodes of a graph.

    :param igraph.Graph graph:
        Any ``igraph.Graph`` object with uniprot IDs
        in its ``name`` vertex attribute.
    """

    aspects = aspects if type(aspects) in {list, tuple} else (aspects, )

    graph.vs['go'] = [
        {'C': set(), 'F': set(), 'P': set()}
        for _ in xrange(graph.vcount())
    ]

    terms, annot = dataio.go_annotations_goa(organism=organism)

    prg = progress.Progress(graph.vcount(), 'Loading GO annotations', 9)

    for v in graph.vs:
        prg.step()

        for asp in aspects:
            if v['name'] in annot[asp]:
                v['go'][asp] = annot[asp][v['name']]

    prg.terminate()
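# Illustrative usage sketch: builds a minimal igraph object with UniProt IDs
# in the `name` vertex attribute and annotates it with `annotate` above.
# The accession numbers are arbitrary examples and
# `dataio.go_annotations_goa` needs network access.
def _example_annotate():

    import igraph

    g = igraph.Graph()
    g.add_vertices(2)
    g.vs['name'] = ['P00533', 'P04637']

    annotate(g, organism=9606, aspects='P')

    # each vertex now carries a dict of GO term sets keyed by aspect
    return g.vs[0]['go']['P']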
def tissues_x_proteins(self, normalized=True, tissues=None):
    '''
    Downloads the expression of all proteins for all tissues.
    The result is a dict of dicts holding the expression values
    of each protein, grouped by samples.
    '''

    self.get_tissues()

    tissues_selected = set([
        t['TISSUE_ID'] for t in self.tissues
        if tissues is None or t['TISSUE_ID'] in tissues
    ]) - self.tissues_loaded

    prg = progress.Progress(
        len(tissues_selected),
        'Downloading expression data',
        1,
        percent=False)

    for tis in tissues_selected:
        prg.step()
        sys.stdout.write('Querying tissue %s\n' % tis)
        sys.stdout.flush()
        self.get_proteins(tis)

        if not hasattr(self.result, 'read'):
            sys.stdout.write('\tFailed: %s\n' % tis)
            sys.stdout.flush()
        else:
            self.tissues_loaded.add(tis)
            self.get_expression(normalized)

            if tis not in self.samples:
                self.samples[tis] = []

            self.samples[tis] = uniqList(self.samples[tis] +
                                         list(self.current_samples))
            self.current_samples = set([])

    prg.terminate()
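# Illustrative usage sketch: downloads expression data for two tissues only,
# on an already constructed instance of the enclosing ProteomicsDB client
# class (the client requires API credentials; constructor details are not
# shown here). The BTO tissue identifiers are arbitrary examples.
def _example_tissues_x_proteins(client):

    client.tissues_x_proteins(
        normalized=True,
        tissues={'BTO:0000142', 'BTO:0000759'},
    )

    # sample identifiers collected per successfully loaded tissue
    return client.samples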
def load_collection(self,
                    collname,
                    id_type='entrez',
                    map_ids=True,
                    cachedir='cache'):

    if os.path.exists(os.path.join(cachedir, 'gsea-%s.pickle' % collname)):
        self.load([collname])
        return None

    url = self.collections[collname]['urls'][id_type]
    data = dataio.curl(url,
                       req_headers=self.session,
                       silent=False,
                       cache=False,
                       write_cache=True)
    data = data.split('\n')
    names = []
    prg = progress.Progress(len(data), 'Loading gene sets', 1)

    for line in (l.split('\t') for l in data if len(l) > 0):
        prg.step()
        setname = line[0].strip()
        self.write_set(line[2:], setname, id_type, map_ids)
        self.get_desc(setname)
        names.append(setname)

    prg.terminate()
    self.groups[collname] = set(names)
    self.save([collname], cachedir=cachedir)
def smiles2chembl(self, smiles):

    self.result = {}
    prg = progress.Progress(total=len(smiles),
                            name='Translating SMILES',
                            interval=1)

    for sml in smiles:
        url = self.chembl_url.format(sml)
        c = curl.Curl(url, large=False)
        result = c.result
        self.result[sml] = []

        if result is not None:
            try:
                data = json.loads(result)

                for d in data['compounds']:
                    this_smile = d['smiles']
                    this_chembl = d['chemblId']
                    # if this_smile == sml:
                    self.result[sml].append(this_chembl)

            except ValueError:
                soup = bs4.BeautifulSoup(result)
                compounds = soup.find_all('compound')

                if compounds is not None:
                    for compound in compounds:
                        this_smile = compound.find('smiles').text
                        this_chembl = compound.find('chemblid').text
                        # if this_smile == sml:
                        self.result[sml].append(this_chembl)

        prg.step()

    prg.terminate()
def translate(self, source, target, lst):

    if source == 'inchikey':
        self.inchikey2anything(target, lst)
        return None

    if source == 'smiles':
        self.smiles2chembl(lst)
        return None

    self.result = {}
    source = str(source) if type(source) is int else self.name_dict[source]
    target = str(target) if type(target) is int else self.name_dict[target]
    prg = progress.Progress(
        total=len(lst), name='Translating compound identifiers', interval=1)

    for comp in lst:
        url = '/'.join([self.url_stem, comp, source, target])
        c = curl.Curl(url, large=False)
        result = c.result
        self.result[comp] = []

        if result is not None:
            data = json.loads(result)

            for d in data:
                self.result[comp].append(d['src_compound_id'])

        prg.step()

    prg.terminate()
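# Illustrative usage sketch: translating compound identifiers with an already
# constructed instance of the enclosing UniChem client class. The resource
# names 'chembl' and 'drugbank' are assumed to be valid keys of `name_dict`;
# numeric UniChem source IDs can be passed instead of names.
def _example_translate(unichem_client):

    unichem_client.translate('chembl', 'drugbank', ['CHEMBL25'])

    # translated identifiers are collected per input identifier
    return unichem_client.result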
def load_uniprot_mappings(self, ac_types=None, bi=False, ncbi_tax_id=None):

    ncbi_tax_id = self.get_tax_id(ncbi_tax_id)
    tables = self.tables[ncbi_tax_id]
    ac_types = (
        list(ac_types)
        if ac_types is not None
        else list(self.name_types.keys())
    )

    # creating empty MappingTable objects:
    for ac_typ in ac_types:
        tables[(ac_typ, 'uniprot')] = MappingTable(ac_typ,
                                                   'uniprot',
                                                   'protein',
                                                   ac_typ,
                                                   None,
                                                   ncbi_tax_id,
                                                   None,
                                                   log=self.ownlog)

    # attempting to load them from pickle;
    # iterate over a copy so loaded types can be removed from the list:
    for ac_typ in list(ac_types):
        md5ac = common.md5((ac_typ, 'uniprot', bi, ncbi_tax_id))
        cachefile = os.path.join('cache', md5ac)

        if self.cache and os.path.isfile(cachefile):
            tables[(ac_typ, 'uniprot')].mapping = \
                pickle.load(open(cachefile, 'rb'))
            ac_types.remove(ac_typ)
            tables[(ac_typ, 'uniprot')].mid = md5ac

    # loading the remaining ones from the big UniProt mapping file:
    if len(ac_types) > 0:
        url = urls.urls['uniprot_idmap_ftp']['url']
        c = curl.Curl(url, silent=False, large=True)
        prg = progress.Progress(c.size, "Processing ID conversion list", 99)

        for l in c.result:
            prg.step(len(l))
            l = l.decode('ascii').strip().split('\t')

            for ac_typ in ac_types:
                if len(l) > 2 and self.name_types[ac_typ] == l[1]:
                    other = l[2].split('.')[0]

                    # check the key actually used ('other'), not l[2],
                    # so existing lists are not overwritten
                    if other not in tables[(ac_typ, 'uniprot')].mapping['to']:
                        tables[(ac_typ, 'uniprot')].mapping['to'][other] = []

                    tables[(ac_typ, 'uniprot')].mapping['to'][other].\
                        append(l[0].split('-')[0])

                    if bi:
                        uniprot = l[0].split('-')[0]

                        if uniprot not in \
                                tables[(ac_typ, 'uniprot')].mapping['from']:
                            tables[(ac_typ, 'uniprot')].\
                                mapping['from'][uniprot] = []

                        tables[(ac_typ, 'uniprot')].mapping['from'][uniprot].\
                            append(other)

        prg.terminate()

        if self.cache:
            for ac_typ in ac_types:
                # use the same cache key as above so the pickle
                # can be found on the next run
                md5ac = common.md5((ac_typ, 'uniprot', bi, ncbi_tax_id))
                cachefile = os.path.join('cache', md5ac)
                pickle.dump(tables[(ac_typ, 'uniprot')].mapping,
                            open(cachefile, 'wb'))
def compounds_targets_mechanism(self,
                                id_list,
                                id_type='uniprot',
                                domains=False,
                                pred_bind_d=False,
                                activities=False,
                                pchembl=False,
                                one_query=False,
                                client_side=False):

    if id_type == 'uniprot':
        compound_lookup = True
        id_list = self.get_chembl_uniprots(id_list)

    self.result = []
    id_list = id_list if type(id_list) is list else [id_list]

    if one_query:
        query_thread = threading.Thread(
            target=self.compound_target_mechanism,
            args=[id_list],
            kwargs={
                'id_type': id_type,
                'domains': domains,
                'pred_bind_d': pred_bind_d,
                'activities': activities,
                'pchembl': pchembl
            })
        query_thread.daemon = True
        query_thread.start()
        sys.stdout.write('\n')
        sys.stdout.flush()

        while query_thread.is_alive():
            self.mysql.print_status()
            time.sleep(1)

        self.mysql_ready()

        if client_side:
            self.result = list(self.result)

    else:
        prg = progress.Progress(total=len(id_list),
                                name='Sending queries',
                                interval=5)
        qids = []

        for identifier in id_list:
            prg.step()
            qids.append(
                self.compound_target_mechanism(identifier,
                                               id_type=id_type,
                                               domains=domains,
                                               pred_bind_d=pred_bind_d,
                                               activities=activities,
                                               pchembl=pchembl,
                                               wait=False))

        prg.terminate()
        self.mysql_ready(qids)

        for qid in qids:
            self.result += list(self.mysql.get_result(qid))
def load_go(graph, aspect=['C', 'F', 'P']):
    '''
    @graph : igraph.Graph
        Any igraph.Graph object with uniprot IDs in its `name`
        vertex attribute.
    '''

    aspect = aspect if type(aspect) is list else [aspect]

    graph.vs['go'] = [{'C': [], 'F': [], 'P': []} for _ in graph.vs]

    go = dataio.get_go_goa()
    prg = progress.Progress(graph.vcount(), 'Loading GO annotations', 9)

    for v in graph.vs:
        prg.step()

        for asp in aspect:
            if v['name'] in go[asp]:
                v['go'][asp] = go[asp][v['name']]

    prg.terminate()
def inchikey2anything(self, target, lst):

    self.result = {}
    target = str(target) if type(target) is int else self.name_dict[target]
    prg = progress.Progress(
        total=len(lst), name='Translating InChIKeys', interval=1)

    for inchik in lst:
        url = self.inchi_stem % inchik
        c = curl.Curl(url, large=False)
        result = c.result

        if result is not None:
            data = json.loads(result)
            self.result[inchik] = [
                d['src_compound_id'] for d in data if d['src_id'] == target
            ]

        prg.step()

    prg.terminate()
def connectivity_search(self,
                        id_list,
                        id_type,
                        parameters=(1, 0, 0, 0, 0, 1, 0)):
    '''
    `parameters` is a list of parameters A-H as described in
    https://www.ebi.ac.uk/unichem/info/widesearchInfo
    '''

    # work on a copy so the default argument is never mutated;
    # the H parameter must be 1 to process the result
    parameters = list(parameters) + [1]
    parameters = [str(i) for i in parameters]
    self.result = {}

    if id_type == 'inchikey':
        id_type = ''
        method = 'key_search'
    elif id_type == 'smiles':
        self.result = None
        return None
    else:
        id_type = (str(id_type)
                   if type(id_type) is int
                   else self.name_dict[id_type])
        id_type = '%s/' % id_type
        method = 'cpd_search'

    prg = progress.Progress(total=len(id_list),
                            name='Connectivity search',
                            interval=1)

    for i in id_list:
        prg.step()
        url = self.cpd_search.format(method, i, id_type,
                                     '/'.join(parameters))
        c = curl.Curl(url, large=False)
        result = c.result
        self.result[i] = []

        if result is not None:
            data = json.loads(result)

            for k, v in iteritems(data):
                for j in range(1, len(v)):
                    self.result[i].append(v[j][0])

        self.result[i] = list(set(self.result[i]))

    prg.terminate()
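# Illustrative usage sketch: a connectivity search by InChIKey on an already
# constructed instance of the enclosing UniChem client class. The InChIKey
# below (aspirin) is only an example and the query needs network access;
# the default A-H parameters are used.
def _example_connectivity_search(unichem_client):

    unichem_client.connectivity_search(
        ['BSYNRYMUTXBXSQ-UHFFFAOYSA-N'],
        id_type='inchikey',
    )

    # matching compound identifiers, deduplicated per query ID
    return unichem_client.result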
def read_mapping_mysql(self, param):

    if param.mysql is None:
        self.ownlog.msg(2, 'No MySQL parameters given.', 'ERROR')
        return {"o": {}, "i": {}}

    tax_filter = ("" if param.ncbi_tax_id is None else
                  "AND %s = %u" % (param.ncbi_tax_id, self.ncbi_tax_id))
    query = """
        SELECT %s AS one, %s AS two
        FROM %s
        WHERE %s IS NOT NULL AND %s IS NOT NULL %s""" % (
        param.fieldOne, param.fieldTwo, param.tableName, param.fieldOne,
        param.fieldTwo, tax_filter)

    try:
        param.mysql.run_query(query)
    except _mysql.Error as e:
        self.ownlog.msg(2, "MySQL error: %s\nFAILED QUERY: %s" % (e, query),
                        'ERROR')
        return {"o": {}, "i": {}}

    total = len(param.mysql.result) + 1
    prg = progress.Progress(total=total, name="Processing data", interval=42)
    mapping_o = {}
    mapping_i = {}

    for rr in param.mysql.result:
        if rr["one"] not in mapping_o:
            mapping_o[rr["one"]] = []
        if rr["two"] not in mapping_i:
            mapping_i[rr["two"]] = []

        mapping_o[rr["one"]].append(rr["two"])
        mapping_i[rr["two"]].append(rr["one"])
        prg.step()

    self.mapping["to"] = mapping_o
    self.cleanDict(self.mapping["to"])

    if param.bi:
        self.mapping["from"] = mapping_i
        self.cleanDict(self.mapping["from"])

    prg.terminate()
def compounds_targets(self,
                      id_list,
                      id_type='uniprot',
                      assay_types=['B', 'F'],
                      relationship_types=['D', 'H'],
                      compound_props=[],
                      domains=False,
                      pred_bind_d=False,
                      action_type=False,
                      activities=False,
                      pchembl=False,
                      one_query=False,
                      client_side=False):
    '''
    Looks up the compound-target relationships for the given identifiers.
    With `one_query = False` each ID is queried by a separate MySQL query;
    better performance is expected this way in case the batch query would
    require a disk temporary table.
    '''

    if id_type == 'uniprot':
        compound_lookup = True
        id_list = self.get_chembl_uniprots(id_list)

    self.result = []
    id_list = id_list if type(id_list) is list else [id_list]

    if one_query:
        query_thread = threading.Thread(
            target=self.compound_target,
            args=[id_list],
            kwargs={
                'id_type': id_type,
                'assay_types': assay_types,
                'relationship_types': relationship_types,
                'compound_props': compound_props,
                'domains': domains,
                'pred_bind_d': pred_bind_d,
                'action_type': action_type,
                'activities': activities,
                'pchembl': pchembl
            })
        query_thread.daemon = True
        query_thread.start()
        sys.stdout.write('\n')
        sys.stdout.flush()

        while query_thread.is_alive():
            self.mysql.print_status()
            time.sleep(1)

        self.mysql_ready()

        if client_side:
            self.result = list(self.result)

    else:
        prg = progress.Progress(total=len(id_list),
                                name='Starting queries',
                                interval=5)
        qids = []

        for identifier in id_list:
            prg.step()
            qids.append(
                self.compound_target(identifier,
                                     id_type=id_type,
                                     assay_types=assay_types,
                                     relationship_types=relationship_types,
                                     compound_props=compound_props,
                                     domains=domains,
                                     pred_bind_d=pred_bind_d,
                                     action_type=action_type,
                                     activities=activities,
                                     pchembl=pchembl,
                                     wait=False))

        prg.terminate()
        self.mysql_ready(qids)

        for qid in qids:
            self.result.extend(list(self.mysql.get_result(qid)))
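# Illustrative usage sketch: querying compound-target relationships for two
# proteins given by UniProt IDs, on an already constructed instance of the
# enclosing ChEMBL MySQL client class. The UniProt IDs are arbitrary examples
# and a configured MySQL connection is assumed.
def _example_compounds_targets(chembl_client):

    chembl_client.compounds_targets(
        ['P00533', 'P04637'],
        pchembl=True,
    )

    # one row per compound-target relationship
    return chembl_client.result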
def progress_setup(self):

    if not self.silent and self.progress is None and not self.debug:
        self.progress = progress.Progress(name=self.title,
                                          interval=1,
                                          status='initializing curl')
def resource_to_relationships_graph(
        self,
        graph,
    ) -> None:
    """
    Converts a PyPath igraph object into a list of BEL relationships.
    """

    self._log('Building bel graph from PyPath object (igraph graph).')

    edges = graph.es
    prg = progress.Progress(
        len(edges),
        'Building bel graph from PyPath object (igraph graph).',
        1,
    )

    for edge in edges:
        prg.step()

        directions = edge['dirs']

        for direction in (directions.straight, directions.reverse):
            if not directions.dirs[direction]:
                # this direction does not exist
                continue

            dir_sources = directions.get_dir(direction, sources = True)

            if self.only_sources and not dir_sources & self.only_sources:
                # this direction is not provided
                # in the currently enabled set of sources
                continue

            predicates = set()

            activation, inhibition = (
                directions.get_sign(direction, sources = True)
            )

            if self._check_sign(activation):
                predicates.add(pc.DIRECTLY_INCREASES)

            if self._check_sign(inhibition):
                predicates.add(pc.DIRECTLY_DECREASES)

            if not predicates:
                # use `regulates` if the sign is unknown
                predicates.add(pc.REGULATES)

            source = self._protein(direction[0])
            target = self._protein(direction[1])

            evid_cits = self._references(edge, direction)

            for (
                predicate,
                (evid, cits),
            ) in itertools.product(predicates, evid_cits):

                for cit in cits:
                    self.bel_graph.add_qualified_edge(
                        source,
                        target,
                        relation = predicate,
                        citation = cit,
                        evidence = 'OmniPath',
                    )
                    self.bel_graph.add_qualified_edge(
                        source,
                        target,
                        relation = predicate,
                        citation = cit,
                        evidence = evid,
                    )

        if not self._has_direction(directions):
            # add an undirected relationship
            # if no direction is available
            evid_cits = self._references(edge, 'undirected')
            source = self._protein(directions.nodes[0])
            target = self._protein(directions.nodes[1])

            for evid, cits in evid_cits:
                for cit in cits:
                    self.bel_graph.add_association(
                        source,
                        target,
                        citation = cit,
                        evidence = 'OmniPath',
                    )
                    self.bel_graph.add_association(
                        source,
                        target,
                        citation = cit,
                        evidence = evid,
                    )

    prg.terminate()
    self._log('Building bel graph from PyPath object finished.')
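# Illustrative usage sketch: converting a pypath network to BEL, assuming an
# already constructed converter instance of the enclosing class and a pypath
# object `pa` whose igraph network is available as `pa.graph`. These names are
# assumptions based on the method signature, not a verified API reference.
def _example_resource_to_relationships_graph(bel_converter, pa):

    bel_converter.resource_to_relationships_graph(pa.graph)

    # the relationships accumulate in the converter's BEL graph
    return bel_converter.bel_graph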
def make_df(
        self,
        unique_pairs = True,
        extra_node_attrs = None,
        extra_edge_attrs = None,
    ):
    """
    Creates a data frame from the network. By default UniProt IDs,
    Gene Symbols, source databases, literature references,
    directionality and sign information and interaction type are
    included.

    Args:
    -----
    :param bool unique_pairs:
        If `True` each line corresponds to a unique pair of molecules,
        and all directionality and sign information is covered in
        other columns. If `False`, the order of the `A` and `B` IDs
        corresponds to the direction while the sign is covered in
        further columns.
    :param dict extra_node_attrs:
        Additional node attributes to be included in the exported
        table. Keys are column names used in the header while values
        are names of vertex attributes. Values might also be methods
        which will then be called on each vertex. These should return
        strings, otherwise their result will be converted to string.
        In the header the `_A` and `_B` suffixes will be appended to
        the column names so the values can be assigned to the A and B
        side interaction partners.
    :param dict extra_edge_attrs:
        Additional edge attributes to be included in the exported
        table. Keys are column names used in the header while values
        are names of edge attributes or callables accepting an edge
        as their single argument.
    """

    result = []

    self.pa.genesymbol_labels()
    self.extra_node_attrs = extra_node_attrs or self.extra_node_attrs
    self.extra_edge_attrs = extra_edge_attrs or self.extra_edge_attrs

    suffix_a = 'A' if unique_pairs else 'source'
    suffix_b = 'B' if unique_pairs else 'target'

    dtypes = (
        self.default_dtypes_uniquepairs
            if unique_pairs else
        self.default_dtypes_bydirs
    )

    header = copy.copy(
        self.default_header_uniquepairs
            if unique_pairs else
        self.default_header_bydirs
    )
    header += self.extra_edge_attrs.keys()
    header += [
        '%s_%s' % (x, suffix_a)
        for x in self.extra_node_attrs.keys()
    ]
    header += [
        '%s_%s' % (x, suffix_b)
        for x in self.extra_node_attrs.keys()
    ]

    prg = progress.Progress(
        total = self.graph.ecount(),
        name = 'Creating table',
        interval = 31,
    )

    for e in self.graph.es:
        # adding default fields
        lines = (
            self.process_edge_uniquepairs(e)
                if unique_pairs else
            self.process_edge_bydirection(e)
        )

        result.extend(lines)

        prg.step()

    prg.terminate()

    self.df = pd.DataFrame(result, columns = header)
    self.df = self.df.astype(dtypes)
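# Illustrative usage sketch: exporting the network of an already built pypath
# object to a pandas data frame, assuming the enclosing export class has been
# instantiated with that pypath object (constructor details not shown). The
# 'label' vertex attribute is assumed to hold gene symbols after
# `genesymbol_labels()`.
def _example_make_df(exporter):

    exporter.make_df(
        unique_pairs = False,
        extra_node_attrs = {'GeneSymbol': 'label'},
    )

    # the table is stored on the instance as a pandas DataFrame
    return exporter.df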
def read_mapping_file(self, param, ncbi_tax_id=None):

    ncbi_tax_id = self.get_tax_id(ncbi_tax_id)

    if param.__class__.__name__ != "FileMapping":
        self.ownlog.msg(2, "Invalid parameter for read_mapping_file()",
                        'ERROR')
        return {}

    if (not os.path.exists(param.input) and
            not hasattr(mapping_input, param.input)):
        return {}

    if hasattr(mapping_input, param.input):
        toCall = getattr(mapping_input, param.input)
        inputArgs = param.inputArgs if hasattr(param, 'inputArgs') else {}
        infile = list(toCall(**inputArgs))
        total = sum([sys.getsizeof(i) for i in infile])
    else:
        infile = codecs.open(param.input, encoding='utf-8', mode='r')
        total = os.path.getsize(param.input)

    prg = progress.Progress(total=total, name="Reading from file",
                            interval=18)
    lnum = 0
    lsum = 0
    mapping_o = {}
    mapping_i = {}

    for line in infile:
        if len(line) == 0:
            continue

        if lnum == 0 and param.header != 0:
            lnum += 1
            continue

        if type(line) is list:
            prg.step(sys.getsizeof(line))
        else:
            line = line.decode('utf-8')
            prg.step(len(line))
            line = line.rstrip().split(param.separator)

        if len(line) > max([param.oneCol, param.twoCol]):
            if line[param.oneCol] not in mapping_o:
                mapping_o[line[param.oneCol]] = []

            mapping_o[line[param.oneCol]].append(line[param.twoCol])

            if param.bi:
                if line[param.twoCol] not in mapping_i:
                    mapping_i[line[param.twoCol]] = []

                mapping_i[line[param.twoCol]].append(line[param.oneCol])

        lnum += 1

    if hasattr(infile, 'close'):
        infile.close()

    self.mapping["to"] = mapping_o
    self.cleanDict(self.mapping["to"])

    if param.bi:
        self.mapping["from"] = mapping_i
        self.cleanDict(self.mapping["from"])

    prg.terminate()