def test_filter_by_curation():
    """Exercise ac.filter_by_curation under both curation policies.

    Builds a small corpus of three statements, attaches curations (keyed by
    pa_hash / source_hash, mimicking DB curation records), and checks:
      * 'any' policy: one incorrect curation drops the whole statement;
      * 'all' policy: a statement is dropped only when every evidence is
        curated incorrect;
      * a 'correct' curation cancels an incorrect one on the same evidence;
      * update_belief=True promotes belief to 1 for correctly-curated
        statements.

    NOTE(review): the asserts after the update_belief=False call rely on
    filter_by_curation mutating new_st1.evidence IN PLACE — statement order
    in this test is load-bearing. st1/st2/st3 are module-level fixtures not
    visible in this chunk.
    """
    # Copy the fixture so the in-place evidence mutation below does not
    # leak into other tests; give it a second, distinct evidence.
    new_st1 = deepcopy(st1)
    new_ev = Evidence(text='a -> b', source_api='new')
    new_st1.evidence.append(new_ev)
    stmts_in = [new_st1, st2, st3]
    assert len(new_st1.evidence) == 2
    assert all(st.belief != 1 for st in stmts_in)
    # Minimal stand-in for a DB curation record.
    Curation = namedtuple('Curation', ['pa_hash', 'source_hash', 'tag'])
    # cur1/cur2: incorrect tags on evidence 0 and 1 of new_st1;
    # cur3: correct tag on evidence 0 (cancels cur1); cur4: correct on st2.
    cur1 = Curation(new_st1.get_hash(),
                    new_st1.evidence[0].get_source_hash(), 'grounding')
    cur2 = Curation(new_st1.get_hash(),
                    new_st1.evidence[1].get_source_hash(), 'wrong_relation')
    cur3 = Curation(new_st1.get_hash(),
                    new_st1.evidence[0].get_source_hash(), 'correct')
    cur4 = Curation(st2.get_hash(),
                    st2.evidence[0].get_source_hash(), 'correct')
    # With 'any' policy it is enough to have one incorrect curation
    any_incorrect_one_cur = ac.filter_by_curation(stmts_in, [cur1], 'any')
    assert len(any_incorrect_one_cur) == 2
    assert new_st1 not in any_incorrect_one_cur
    # With 'all' policy all evidences have to be curated
    all_incorrect_one_cur = ac.filter_by_curation(stmts_in, [cur1], 'all')
    assert len(all_incorrect_one_cur) == 3, len(all_incorrect_one_cur)
    assert new_st1 in all_incorrect_one_cur
    all_incorrect_two_cur = ac.filter_by_curation(
        stmts_in, [cur1, cur2], 'all')
    assert len(all_incorrect_two_cur) == 2
    assert new_st1 not in all_incorrect_two_cur
    # Correct curation cancels out incorrect
    assert len(new_st1.evidence) == 2
    correct_incorrect = ac.filter_by_curation(
        stmts_in, [cur1, cur2, cur3, cur4], 'all', update_belief=False)
    assert len(correct_incorrect) == 3, len(correct_incorrect)
    assert new_st1 in correct_incorrect
    # new_st1.evidence[1] should be filtered out because there's only incorrect
    # curation(cur2), new_st1.evidence[0] stays because correct cancels out
    # incorrect (cur1, cur3)
    assert len(new_st1.evidence) == 1
    assert new_st1.evidence[0].source_api == 'assertion'
    assert all(st.belief != 1 for st in correct_incorrect)
    # Optionally update belief to 1 for correct curation
    new_belief = ac.filter_by_curation(
        stmts_in, [cur1, cur2, cur3, cur4], 'all', update_belief=True)
    assert new_belief[0].belief == 1
    assert new_belief[1].belief == 1
    # Third statement has no 'correct' curation, so belief is unchanged.
    assert new_belief[2].belief == 0.7
def get_statements(target):
    """Fetch DB statements for *target* and run the standard cleanup.

    Pipeline: fetch from DB, drop misgrounded agents, preassemble,
    apply DB curations, then filter negative results.

    NOTE: the TAS source is currently disabled — only DB statements
    are retrieved here.
    """
    stmts = get_db_stmts(target)
    stmts = filter_misgrounding(target, stmts)
    stmts = ac.run_preassembly(stmts)
    stmts = ac.filter_by_curation(stmts, db_curations)
    return filter_neg(stmts)
def _filter_stmts(self, stmts):
    """This is an internal function that is applied to filter statements.

    In general, this does nothing, but some sub classes may want to limit
    the statements that are presented. This is applied to both the complete
    statements list (retrieved by `get_statements`) and the sample (gotten
    through `get_sample`).

    This override filters out statements/evidence flagged by curations.

    NOTE(review): both `filter_by_curation` and `curs` are free names not
    defined in this chunk — presumably a module-level import and a
    module-level curation list; confirm they exist at the call site.
    """
    stmts = filter_by_curation(stmts, curations=curs)
    return stmts
def get_statements(target):
    """Retrieve and assemble statements for *target* with count summaries.

    Combines TAS and DB statements, removes misgroundings, preassembles,
    and applies DB curations.

    Returns
    -------
    tuple
        (stmts, ev_counts, source_counts) where ev_counts maps statement
        hash -> number of evidences and source_counts maps statement
        hash -> per-source evidence counts.
    """
    tas_stmts = get_tas_stmts(target)
    db_stmts = get_db_stmts(target)
    stmts = filter_misgrounding(target, tas_stmts + db_stmts)
    stmts = ac.run_preassembly(stmts)
    stmts = ac.filter_by_curation(stmts, db_curations)
    ev_counts = {}
    source_counts = {}
    # Single pass: record evidence totals and per-source tallies together.
    for stmt in stmts:
        stmt_hash = stmt.get_hash()
        ev_counts[stmt_hash] = len(stmt.evidence)
        per_source = get_source_counts_dict()
        for ev in stmt.evidence:
            per_source[ev.source_api] += 1
        source_counts[stmt_hash] = per_source
    return stmts, ev_counts, source_counts
def assemble_statements(kinase, stmts, curs):
    """Run assembly steps on statements.

    Parameters
    ----------
    kinase : str
        Key used to look up CTD statements (presumably a kinase gene
        name — confirm against `ctd_stmts_by_gene`) and to name the
        output pickle.
    stmts : list
        INDRA Statements to assemble.
    curs : list
        Curations passed to `ac.filter_by_curation`.

    Returns
    -------
    list
        The assembled statements; also pickled to data/assembled/<kinase>.pkl.
    """
    # Remove unary statements and ones with many agents
    stmts = [stmt for stmt in stmts
             if (1 < len(stmt.real_agent_list()) < 4)]
    stmts = replace_ctd(stmts, ctd_stmts_by_gene.get(kinase, []))
    # We do this at this point to make sure we capture the original DB
    # hashes before modifying statements to allow lookup
    for stmt in stmts:
        # Hash is invariant across the statement's evidences: compute once.
        prior_hash = stmt.get_hash()
        for ev in stmt.evidence:
            ev.annotations['prior_hash'] = prior_hash
    stmts = fix_invalidities(stmts)
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.filter_human_only(stmts)
    stmts = ac.filter_by_curation(stmts, curations=curs)
    stmts = unify_lspci(stmts)
    stmts = remove_contradictions(stmts)
    # Rename chemicals
    logger.info('Renaming chemicals')
    for stmt in stmts:
        for agent in stmt.real_agent_list():
            if agent.db_refs.get('CHEBI') and len(agent.name) > 25:
                rename_chemical(agent)
    # Remove long names
    logger.info('Removing statements with long names')
    stmts = [
        stmt for stmt in stmts
        if all(len(a.name) < 20 for a in stmt.real_agent_list())
    ]
    # Lazy %-style args: formatting is deferred until the record is emitted.
    logger.info('%d statements remaining', len(stmts))
    # Remove microRNAs
    logger.info('Removing microRNA statements')
    stmts = [
        stmt for stmt in stmts
        if not any('miR' in a.name for a in stmt.real_agent_list())
    ]
    logger.info('%d statements remaining', len(stmts))
    stmts = add_source_urls(stmts)
    with open('data/assembled/%s.pkl' % kinase, 'wb') as fh:
        pickle.dump(stmts, fh)
    return stmts
def filter_incorrect_curations(stmts):
    """Return *stmts* with statements/evidence flagged incorrect by the
    module-level DB curations removed."""
    return ac.filter_by_curation(stmts, curations=db_curations)
# NOTE(review): fragment — this code references names (subj, obj, gilda_subj,
# stmts, file, subj_set, obj_set, normalized_df) that are bound earlier,
# presumably inside an enclosing per-file loop; confirm against full source.
# Ground the object string with Gilda; fall back to 'NA' when no match.
gilda_obj = gilda.ground(obj)
gilda_obj = gilda_obj[0].term.entry_name if gilda_obj else 'NA'
# Record the raw and normalized subject/object pair for the CSV report.
normalized_df.append({
    'Subject': subj,
    'Normalized subject': gilda_subj,
    'Object': obj,
    'Normalized object': gilda_obj
})
# Downloading statements using INDRA REST API
idrp = idr.get_statements(subject=gilda_subj, object=gilda_obj)
stmts = stmts + idrp.statements
# Filtering out the indirect INDRA statements
#indra_stmts = ac.filter_direct(stmts)
indra_stmts = ac.run_preassembly(stmts, run_refinement=False)
# Drop statements flagged incorrect by DB curations, then restrict complexes
# to the subject/object sets and re-deduplicate.
indra_filtered = ac.filter_by_curation(indra_stmts,
                                       curations=db_curations)
indra_op_filtered = filter_complex_statements(indra_filtered,
                                              subj_set, obj_set)
indra_op_filtered = ac.run_preassembly(indra_op_filtered,
                                       run_refinement=False)
# Emit an HTML report of the assembled statements.
html_assembler(indra_op_filtered,
               os.path.join(OUTPUT, file + '_indra_report.html'))
# normalized_df was accumulated as a list of dicts; convert to a DataFrame
# (the name is rebound here) and write the normalization report.
normalized_df = pd.DataFrame(normalized_df)
normalized_df.to_csv(
    os.path.join(INPUT, file, file + '_normalized_names.csv'))
indra_db_stmts = list(stmts_by_hash.values()) # Filtering out the indirect INDRA statements indra_db_stmts = ac.filter_direct(indra_db_stmts) # Fetch omnipath database biomolecular interactions and # process them into INDRA statements op = process_from_web() # Filter statements which are not ligands/receptors from # OmniPath database op_filtered = filter_op_stmts(op.statements, full_ligand_set, receptor_genes_go) op_filtered = ac.filter_direct(op_filtered) op_filtered = ac.filter_by_curation(op_filtered, curations=db_curations) # Merge omnipath/INDRA statements and run assembly indra_op_stmts = ac.run_preassembly(indra_db_stmts + op_filtered, run_refinement=False) # Filter incorrect curations indra_op_filtered = filter_incorrect_curations(indra_op_stmts) # Filter complex statements indra_op_filtered = filter_complex_statements(indra_op_filtered, full_ligand_set, receptor_genes_go) # We do this again because when removing complex members, we # end up with more duplicates indra_op_filtered = ac.run_preassembly(indra_op_filtered,
# Load the shared grounding map used for regrounding statements.
with open('../../grounding_map.json', 'r') as fh:
    grounding_map = json.load(fh)

#####################
# Querying for and assembling statements
all_stmts = []
for db_ns, db_id, name in groundings:
    if db_id in black_list:
        print('Skipping %s in black list' % name)
        continue
    print('Looking up %s' % name)
    db_stmts = get_db_stmts_by_grounding(db_ns, db_id)
    # TAS only covers human genes, so query it for HGNC groundings only.
    tas_stmts = get_tas_stmts(db_ns, db_id) if db_ns == 'HGNC' else []
    stmts = db_stmts + tas_stmts
    # BUG FIX: the filtered result was previously assigned to a misspelled
    # name ('smts') and silently discarded, so curations were never applied.
    stmts = ac.filter_by_curation(stmts, db_curations)
    stmts = reground_stmts(stmts, grounding_map, misgrounding_map)
    all_stmts += stmts
all_stmts = make_unique_hashes(all_stmts)
all_stmts = ac.run_preassembly(all_stmts)

########################################
# Dump results
with open('disease_map_indra_stmts_full.pkl', 'wb') as fh:
    pickle.dump(all_stmts, fh)
stmts_to_json_file(all_stmts, 'disease_map_indra_stmts_full.json')
filtered_stmts = filter_prior_all(all_stmts, groundings)
with open('disease_map_indra_stmts_filtered.pkl', 'wb') as fh:
    pickle.dump(filtered_stmts, fh)