def update_lspci():
    """Regenerate the lspci.tsv resource file.

    Combines two sources: TAS statements (for LSPCI member references) and
    the pre-sorted LSP compound names table (for display names). Only LSPCI
    entries that both appear as TAS statement subjects and have a name are
    written out.
    """
    from indra.sources import tas
    # Collect member (namespace, id) pairs per LSPCI, restricted to
    # compounds that actually appear as TAS statement subjects.
    tas_processor = tas.process_from_web(affinity_class_limit=10)
    members_by_lspci = defaultdict(set)
    for statement in tas_processor.statements:
        refs = statement.subj.db_refs
        if 'LSPCI' not in refs:
            continue
        lspci_id = refs.get('LSPCI')
        for ns, ref_id in refs.items():
            if ns not in {'TEXT', 'LSPCI'}:
                members_by_lspci[lspci_id].add((ns, ref_id))
    # The names table is pre-sorted by priority, so the first row seen for
    # each LSPCI wins; setdefault keeps that first assignment.
    names_df = pandas.read_csv('lsp_compound_names.csv',
                               dtype={'lspci_id': str})
    names_by_lspci = {}
    for _, table_row in names_df.iterrows():
        names_by_lspci.setdefault(table_row['lspci_id'], table_row['name'])
    # Combine the two sources, keeping only entries that have a name.
    rows = [['lspcid', 'name', 'members']]
    for lspci_id, member_set in members_by_lspci.items():
        if lspci_id in names_by_lspci:
            member_str = '|'.join(
                sorted('%s:%s' % pair for pair in member_set))
            rows.append([lspci_id, names_by_lspci[lspci_id], member_str])
    write_unicode_csv(get_resource_path('lspci.tsv'), rows, delimiter='\t')
def find_drugs_for_genes(search_terms, drug_gene_stmts=None):
    """Return list of drugs targeting at least one gene from a list of genes

    Parameters
    ----------
    search_terms : list of :py:class:`emmaa.priors.SearchTerm`
        List of search terms for genes
    drug_gene_stmts : Optional[list]
        Drug-target statements to look up drugs in. If not given, TAS
        statements are downloaded from the web.

    Returns
    -------
    drug_terms : list of :py:class:`emmaa.priors.SearchTerm`
        List of search terms of drugs targeting at least one of the input
        genes
    """
    if not drug_gene_stmts:
        drug_gene_stmts = tas.process_from_web().statements
    seen_names = set()
    drug_terms = []
    for term in search_terms:
        # Only gene search terms are looked up; others are skipped.
        if term.type != 'gene':
            continue
        for drug in get_drugs_for_gene(drug_gene_stmts,
                                       term.db_refs['HGNC']):
            # Deduplicate drugs by name across all genes.
            if drug.name not in seen_names:
                seen_names.add(drug.name)
                drug_terms.append(drug)
    return sorted(drug_terms, key=lambda d: d.name)
def test_processor():
    """Smoke-test TAS web processing: statements exist, match the expected
    human-gene statement count, and all carry at least one evidence."""
    processor = process_from_web(affinity_class_limit=10)
    assert processor
    assert processor.statements
    stmt_count = len(processor.statements)
    # This is the total number of statements about human genes
    assert stmt_count == 1123724, stmt_count
    assert all(len(stmt.evidence) >= 1 for stmt in processor.statements), \
        'Some statements lack any evidence'
def find_drugs_for_genes(node_list):
    """Return list of drugs targeting gene nodes."""
    # TAS statements are fetched fresh from the web on every call.
    tas_statements = tas.process_from_web().statements
    seen_names = set()
    drug_terms = []
    for node in node_list:
        # Only HGNC-grounded nodes are looked up.
        if not node.startswith('HGNC:'):
            continue
        hgnc_id = node.split(':')[1]
        for drug in get_drugs_for_gene(tas_statements, hgnc_id):
            # Deduplicate drugs by name across all nodes.
            if drug.name not in seen_names:
                seen_names.add(drug.name)
                drug_terms.append(drug)
    return sorted(drug_terms, key=lambda d: d.name)
def _get_statements(self):
    """Fetch, expand, and deduplicate TAS statements.

    The settings used here are justified as follows:
    - only affinities that indicate binding are included
      (affinity_class_limit=2)
    - only agents that have some kind of a name available are included;
      ones that would get just an ID as a name are not (named_only=True)
    - full standardization is not required, thereby allowing the set of
      drugs to be extracted for which we have a name from ChEMBL,
      HMS-LINCS, or DrugBank (standardized_only=False)
    """
    from indra.sources import tas
    logger.info('Processing TAS from web')
    processor = tas.process_from_web(affinity_class_limit=2,
                                     named_only=True,
                                     standardized_only=False)
    logger.info('Expanding evidences and deduplicating')
    expanded_stmts = list(_expanded(processor.statements))
    unique_stmts, _ = extract_duplicates(expanded_stmts,
                                         KeyFunc.mk_and_one_ev_src)
    return unique_stmts
def make_search_terms(self, drug_gene_stmts=None):
    """Generate search terms from the gene list.

    For each gene in self.gene_list a gene search term is created, and
    drug search terms for drugs targeting that gene are appended,
    deduplicated by drug name. The result is stored on
    self.search_terms and returned.

    Parameters
    ----------
    drug_gene_stmts : Optional[list]
        Drug-target statements to look up drugs in. If not given, TAS
        statements are downloaded from the web.
    """
    if not drug_gene_stmts:
        drug_gene_stmts = tas.process_from_web().statements
    drug_names_seen = set()
    terms = []
    for gene in self.gene_list:
        agent = agent_from_gene_name(gene)
        # Gene search term
        terms.append(SearchTerm(type='gene', name=agent.name,
                                search_term=f'"{agent.name}"',
                                db_refs={'HGNC': agent.db_refs['HGNC'],
                                         'UP': agent.db_refs['UP']}))
        # Drug search terms, deduplicated by drug name
        for drug_term in get_drugs_for_gene(drug_gene_stmts,
                                            agent.db_refs['HGNC']):
            if drug_term.name not in drug_names_seen:
                drug_names_seen.add(drug_term.name)
                terms.append(drug_term)
    self.search_terms = terms
    return terms
# NOTE(review): fragment — `drug_list` and `fh` are defined earlier in the
# original file (this loop presumably sits inside a `with open(...) as fh:`
# block); formatting below is reconstructed from a whitespace-mangled source.
for compound in drug_list:
    # Write one tab-separated row per compound: id, name, source label.
    fh.write(
        '%s\t%s\t%s\n' % (compound[0], compound[1],
                          'INDRA (text mining/databases)'))

# Gene symbols mapped to text strings known to be misgrounded to them.
misgrounding_map = {
    'CTSL': ['MEP'],
    'CTSB': ['APPs'],
    'FURIN': ['pace', 'Fur']
}

if __name__ == '__main__':
    db = get_db('primary')
    db_curations = get_curations(db=db)
    tp = tas.process_from_web()
    #targets = ['TMPRSS2', 'ACE2', 'FURIN', 'CTSB', 'CTSL']
    targets = [
        'PIKFYVE', 'INPP5E', 'PIK3C2A', 'PIK3C2B', 'PIK3C2G', 'PI4K2A',
        'PI4K2B', 'PI4KB', 'EHD3', 'PIK3C3'
    ]
    all_stmts = []
    all_ev_counts = {}
    # Load pre-assembled CTD/DrugBank/TAS statements from a local pickle.
    with open('ctd_drugbank_tas_pikfyve.pkl', 'rb') as f:
        all_ctd_stmts = pickle.load(f)
    all_ctd_stmts = filter_neg(all_ctd_stmts)
    for target in targets:
        stmts = get_statements(target)
        fname = '%s.html' % target
        # Keep only CTD statements involving the current target gene.
        ctd_stmts = ac.filter_gene_list(all_ctd_stmts, [target],
                                        policy='one')
        # NOTE(review): loop body continues beyond this fragment.
        stmts += ctd_stmts
# NOTE(review): fragment — these first lines are the tail of a function
# (bare `return stmts[0]` below implies an enclosing def, likely
# `choose_best_stmt`); formatting reconstructed from a mangled source.
normalize_drug(stmt.subj)
# Prefer statements whose subject has the fewest grounding issues and the
# shortest name (ties broken by name length).
stmts = sorted(stmt_group,
               key=lambda x: (len(score_drug(x.subj)), len(x.subj.name)))
if len(stmt_group) > 1:
    print('Choosing: %s (%s) from' % (stmts[0].subj,
                                      score_drug(stmts[0].subj)))
    for stmt in stmts:
        print(stmt.subj, score_drug(stmt.subj))
    print()
return stmts[0]

if __name__ == '__main__':
    tp = tas.process_from_web(affinity_class_limit=2, named_only=True,
                              standardized_only=False)
    # Group statements by (compound LSPCI id, target name) so one best
    # statement can be chosen per drug-target pair.
    grouped = defaultdict(list)
    for stmt in tp.statements:
        grouped[(stmt.subj.db_refs['LSPCI'], stmt.obj.name)].append(stmt)
    opt_stmts = []
    for (lspci, obj_name), stmts in grouped.items():
        opt_stmt = choose_best_stmt(stmts)
        opt_stmts.append(opt_stmt)
    # Pickle the filtered statements into the package resources folder.
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir, 'resources', 'tas_stmts_filtered.pkl')
    with open(fname, 'wb') as fh:
        pickle.dump(opt_stmts, fh)
# NOTE(review): fragment — the loop and `return` below are the tail of a
# function (likely an evidence-source filter) whose def is outside this
# view; formatting reconstructed from a mangled source.
for stmt in stmts:
    # Keep only evidences not coming from the excluded sources; drop the
    # statement entirely if no evidence remains.
    new_ev = [e for e in stmt.evidence if e.source_api not in sources]
    if not new_ev:
        continue
    stmt.evidence = new_ev
    new_stmts.append(stmt)
return new_stmts

if __name__ == '__main__':
    # Loading preliminary data structures
    db = get_db('primary')
    db_curations = get_curations(db=db)
    tas_processor = tas.process_from_web()
    # List of entities that are not of interest to get INDRA Statements
    # e.g., ATP, oxygen
    with open('black_list.txt', 'r') as fh:
        black_list = {line.strip() for line in fh.readlines()}
    with open('minerva_disease_map_indra_ids.csv', 'r') as fh:
        groundings = [line.strip().split(',') for line in fh.readlines()]
    with open('../../grounding_map.json', 'r') as fh:
        grounding_map = json.load(fh)
    #####################
    # Querying for and assembling statements
    all_stmts = []
    # NOTE(review): loop body is cut off — it continues beyond this
    # fragment in the original file.
    for db_ns, db_id, name in groundings:
# NOTE(review): fragment — this first part is the tail of a function
# (bare `return reg_stmts` implies an enclosing def; `channel` and
# `reg_agent` are its parameters); formatting reconstructed from a
# mangled source.
reg_stmts = []
for stmt in stmts_by_channel[channel]:
    # Keep negative-regulation statements whose subject matches the
    # regulator agent of interest.
    if isinstance(stmt, (Inhibition, DecreaseAmount)):
        if stmt.subj.name == reg_agent.name:
            reg_stmts.append(stmt)
return reg_stmts


def assemble_html(stmts, fname_key):
    # Render the given statements into an HTML page named after fname_key.
    ha = HtmlAssembler(stmts)
    ha.make_model()
    ha.save_model('%s.html' % fname_key)


if __name__ == '__main__':
    tp = tas.process_from_web(affinity_class_limit=10)
    neg_regs = defaultdict(dict)
    non_neg_regs = defaultdict(dict)
    # Index known negative regulators of each channel by agent key.
    for channel, (stmts, _, _) in stmts_by_channel.items():
        stmts = [
            s for s in stmts
            if isinstance(s, (Inhibition, DecreaseAmount))
        ]
        stmts = [s for s in stmts if s.obj.name == channel]
        for stmt in stmts:
            neg_regs[channel][get_key(stmt.subj)] = stmt.subj
    # NOTE(review): the original (mangled) source does not show whether
    # this loop is nested inside the channel loop above; placed at
    # script level here — confirm against the original file. The inner
    # condition filters TAS statements to sub-micromolar affinity
    # classes; its body is cut off at the end of this fragment.
    for stmt in tp.statements:
        if stmt.obj.name in neg_regs:
            if stmt.evidence[0].annotations['class_min'] in \
                    {'100nM < Kd < 1uM', 'Kd < 100nM'}: