def assemble_pysb(stmts, data_genes, out_file):
    """Return an assembled PySB model.

    The statements are run through ``preprocess_stmts``, cached to disk with
    and without evidence, and assembled with a ``PysbAssembler``. The model
    is then passed to ``set_context`` and ``add_observables``, saved to
    ``out_file`` and pickled to ``korkut_pysb.pkl`` in the working directory.

    Parameters
    ----------
    stmts : list
        Statements handed to ``preprocess_stmts``.
    data_genes :
        Forwarded unchanged to ``preprocess_stmts``.
    out_file : str
        Path the model is saved to. Its extension-less stem is the prefix
        of the cached statement pickles.

    Returns
    -------
    The ``model`` attribute of the assembler.
    """
    prefix = os.path.splitext(out_file)[0]
    stmts = preprocess_stmts(stmts, data_genes)

    # These are the statements that actually enter the assembler, so this is
    # where they are cached and where index cards are produced.
    ac.dump_statements(stmts, '%s_before_pa.pkl' % prefix)
    assemble_index_cards(stmts, 'output/index_cards')

    # Strip evidence (in place) from every statement and from the statements
    # it supports / is supported by, then cache the lighter version.
    for stmt in stmts:
        for target in [stmt] + stmt.supports + stmt.supported_by:
            target.evidence = []
    ac.dump_statements(stmts, '%s_no_evidence.pkl' % prefix)

    # Assemble the model, then set context and add observables.
    assembler = PysbAssembler()
    assembler.add_statements(stmts)
    assembler.make_model(reverse_effects=False)
    set_context(assembler)
    add_observables(assembler.model)

    # Persist the model both through the assembler and as a pickle.
    assembler.save_model(out_file)
    with open('korkut_pysb.pkl', 'wb') as pkl_handle:
        pickle.dump(assembler.model, pkl_handle)
    return assembler.model
def filter(stmts, cutoff, filename):
    """Filter statements by belief, top level and directness, then dump them.

    Parameters
    ----------
    stmts : list
        Statements to filter.
    cutoff :
        Belief cutoff forwarded to ``ac.filter_belief``.
    filename : str
        File the filtered statements are dumped to.

    Returns
    -------
    list
        The statements that passed all three filters.
    """
    believed = ac.filter_belief(stmts, cutoff)
    top_level = ac.filter_top_level(believed)
    direct = ac.filter_direct(top_level)
    ac.dump_statements(direct, filename)
    return direct
def get_indradb_pa_stmts():
    """Get preassembled INDRA Stmts for PMC articles from INDRA DB.

    DEPRECATED. Get Raw Statements instead.

    PMCIDs are queried in batches of 1000. Each processed batch is dumped to
    ``batch_<index>.pkl`` and the combined statements are dumped to
    ``cord19_pmc_stmts.pkl``.

    Returns
    -------
    list
        The statement JSON objects collected from all processed batches
        (the JSON, not the deserialized statements).
    """
    batch_size = 1000
    paper_refs = [('pmcid', pmcid) for pmcid in get_ids('pmcid')]
    all_jsons = []
    t_start = time.time()
    batches = batch_iter(paper_refs, batch_size)
    for batch_ix, paper_batch in enumerate(batches):
        # NOTE(review): batches 0-5 are skipped unconditionally; this looks
        # like a leftover from resuming an interrupted run - confirm.
        if batch_ix <= 5:
            continue
        papers = list(paper_batch)
        print("Querying DB for statements for %d papers" % batch_size)
        t_batch = time.time()
        result = get_statement_jsons_from_papers(papers)
        batch_elapsed = time.time() - t_batch
        # Only the JSON values are needed; the hash keys are dropped.
        batch_jsons = list(result['statements'].values())
        print("Returned %d stmts in %f sec" %
              (len(batch_jsons), batch_elapsed))
        ac.dump_statements(stmts_from_json(batch_jsons),
                           'batch_%02d.pkl' % batch_ix)
        all_jsons += batch_jsons
    elapsed = time.time() - t_start
    print("Total time: %f sec, %d papers" % (elapsed, len(paper_refs)))
    ac.dump_statements(stmts_from_json(all_jsons), 'cord19_pmc_stmts.pkl')
    return all_jsons
def run_assembly(stmts, filename):
    """Run the assembly pipeline on statements and dump the result.

    Steps, in order: grounding mapping, grounded-only filter, human-only
    filter, gene-list filter against the module-level ``gene_names``
    (policy ``'one'``, families allowed), sequence mapping and preassembly
    (all levels returned, pool size 4).

    Parameters
    ----------
    stmts : list
        Statements to assemble.
    filename : str
        File the assembled statements are dumped to.

    Returns
    -------
    list
        The assembled statements.
    """
    grounded = ac.filter_grounded_only(ac.map_grounding(stmts))
    human = ac.filter_human_only(grounded)
    relevant = ac.filter_gene_list(human, gene_names, 'one',
                                   allow_families=True)
    site_mapped = ac.map_sequence(relevant)
    assembled = ac.run_preassembly(site_mapped, return_toplevel=False,
                                   poolsize=4)
    ac.dump_statements(assembled, filename)
    return assembled
def combine_all_stmts(pkl_list, output_file):
    """Concatenate pickled statement lists and save them as pickle and JSON.

    Parameters
    ----------
    pkl_list : list of str
        Pickle files to load, in order, with ``ac.load_statements``.
    output_file : str
        File the combined statements are dumped to. The JSON copy is
        written next to it, with everything after the last ``.`` replaced
        by ``json``.

    Returns
    -------
    list
        All loaded statements, in load order.
    """
    combined = [stmt
                for pkl_path in pkl_list
                for stmt in ac.load_statements(pkl_path)]
    ac.dump_statements(combined, output_file)

    # Derive the JSON file name from the pickle file name.
    stem = output_file.rsplit('.', maxsplit=1)[0]
    json_path = stem + '.json'
    serialized = stmts_to_json(combined)
    with open(json_path, 'wt') as json_handle:
        json.dump(serialized, json_handle, indent=2)
    return combined
def assemble_ras_pathway(fname, reader):
    """Read the pathway text, extract statements and draw the pathway map.

    The statements are dumped to ``ras_pathway.pkl`` and passed to
    ``draw_graph`` under the name ``ras_pathway``.

    Parameters
    ----------
    fname : str
        Text file containing the pathway description.
    reader : str
        Which reader to use: ``'reach'`` or ``'trips'``.

    Raises
    ------
    ValueError
        If ``reader`` is neither ``'reach'`` nor ``'trips'``.
    """
    # Reject unknown readers before doing any work; otherwise `stmts` would
    # never be assigned and the failure would surface as an unrelated
    # UnboundLocalError further down.
    if reader not in ('reach', 'trips'):
        raise ValueError("Unknown reader %r; expected 'reach' or 'trips'"
                         % (reader,))

    # Make original pathway map
    with open(fname, 'rt') as fh:
        txt = fh.read()
    if reader == 'reach':
        stmts = process_reach(txt)
    else:
        stmts = process_trips(txt, reread=True)
    ac.dump_statements(stmts, 'ras_pathway.pkl')
    draw_graph(stmts, 'ras_pathway')
def read_extra_sources(out_file):
    """Read Sparser, R3, TRIPS and Phosphosite statements and dump them.

    Parameters
    ----------
    out_file : str
        File the combined statements are dumped to.

    Returns
    -------
    list
        TRIPS, Sparser, R3 and Phosphosite statements, concatenated in
        that order.
    """
    # The sources are read in this order; only the concatenation order
    # below differs.
    from_sparser = process_sparser.read_stmts(process_sparser.base_folder)
    # Only the first of the R3 active-forms files is read.
    from_r3 = process_r3.read_stmts(process_r3.active_forms_files[0])
    from_trips = process_trips.read_stmts(process_trips.base_folder)
    from_phosphosite = read_phosphosite.read_phosphosite_owl(
        read_phosphosite.phosphosite_owl_file)

    combined = from_trips + from_sparser + from_r3 + from_phosphosite
    ac.dump_statements(combined, out_file)
    return combined
def assemble_extension(fname_orig, fname, reader):
    """Read the original text plus an extension and draw the extended map.

    The two texts are joined with a newline and processed together. The
    statements are dumped to ``ras_pathway_extension.pkl`` and passed to
    ``draw_graph`` under the name ``ras_pathway_extension``.

    Parameters
    ----------
    fname_orig : str
        Text file containing the original pathway description.
    fname : str
        Text file containing the extension text.
    reader : str
        Which reader to use: ``'reach'`` or ``'trips'``.

    Raises
    ------
    ValueError
        If ``reader`` is neither ``'reach'`` nor ``'trips'``.
    """
    # Reject unknown readers before doing any work; otherwise `stmts` would
    # never be assigned and the failure would surface as an unrelated
    # UnboundLocalError further down.
    if reader not in ('reach', 'trips'):
        raise ValueError("Unknown reader %r; expected 'reach' or 'trips'"
                         % (reader,))

    with open(fname_orig, 'rt') as fh:
        orig_txt = fh.read()
    with open(fname, 'rt') as fh:
        extension_txt = fh.read()
    txt = '\n'.join([orig_txt, extension_txt])
    if reader == 'reach':
        stmts = process_reach(txt)
    else:
        stmts = process_trips(txt, reread=True)
    ac.dump_statements(stmts, 'ras_pathway_extension.pkl')
    draw_graph(stmts, 'ras_pathway_extension')
def run_assembly(stmts, save_file):
    """Run the full assembly and filtering pipeline and dump the result.

    Steps, in order: grounding mapping, grounded-only filter, human-only
    filter, family expansion, gene-list filter against the module-level
    ``gene_names`` (policy ``'one'``), sequence mapping, preassembly (all
    levels returned), belief filter at 0.95, top-level filter, direct
    filter and enzyme-kinase filter.

    Parameters
    ----------
    stmts : list
        Statements to assemble.
    save_file : str
        File the assembled statements are dumped to.

    Returns
    -------
    list
        The assembled and filtered statements.
    """
    # Steps that take only the statements are applied in sequence.
    for step in (ac.map_grounding, ac.filter_grounded_only,
                 ac.filter_human_only, ac.expand_families):
        stmts = step(stmts)
    stmts = ac.filter_gene_list(stmts, gene_names, 'one')
    stmts = ac.map_sequence(stmts)
    stmts = ac.run_preassembly(stmts, return_toplevel=False)
    stmts = ac.filter_belief(stmts, 0.95)
    for step in (ac.filter_top_level, ac.filter_direct,
                 ac.filter_enzyme_kinase):
        stmts = step(stmts)
    ac.dump_statements(stmts, save_file)
    return stmts
def make_cyjs_network(results, model, stmts):
    """Assemble and save a CyJS network of the statements found on paths.

    The statements are filtered to the UUIDs that appear as keys in the
    path statements, dumped to ``output/korkut_cyjs_model.pkl``, assembled
    with a ``CyJSAssembler`` in the ``SKMEL28_SKIN`` CCLE context and saved
    under ``output/korkut_model``.

    Parameters
    ----------
    results, model :
        Forwarded to ``get_path_stmts`` together with ``stmts``.
    stmts : list
        Statements to select the path statements from.
    """
    path_stmts = get_path_stmts(results, model, stmts)
    # The gene list is computed but not used below; the call is kept so the
    # function does exactly what it did before.
    get_path_genes(path_stmts)

    # Flatten the per-path UUID keys into one list to use as the filter.
    uuids_on_paths = [uuid for path in path_stmts for uuid in path.keys()]
    selected = ac.filter_uuid_list(stmts, uuids_on_paths)
    ac.dump_statements(selected, 'output/korkut_cyjs_model.pkl')

    assembler = CyJSAssembler(selected)
    assembler.make_model()
    assembler.set_CCLE_context(['SKMEL28_SKIN'])
    assembler.save_json('output/korkut_model')
def get_indra_phos_stmts():
    """Fetch, assemble and cache (de)phosphorylation statements.

    Phosphorylation and Dephosphorylation statements are fetched with
    ``by_gene_role_type``, grounding-mapped, family-expanded, filtered to
    grounded ones and sequence-mapped (dumped to
    ``sources/indra_phos_sitemap.pkl``). They are then preassembled and
    filtered to human, specific genes and dumped to
    ``sources/indra_phos_stmts.pkl``.

    Returns
    -------
    list
        The final filtered statements.
    """
    raw = by_gene_role_type(stmt_type='Phosphorylation')
    raw += by_gene_role_type(stmt_type='Dephosphorylation')

    # Families are expanded before site mapping.
    site_mapped = raw
    for step in (ac.map_grounding, ac.expand_families,
                 ac.filter_grounded_only, ac.map_sequence):
        site_mapped = step(site_mapped)
    ac.dump_statements(site_mapped, 'sources/indra_phos_sitemap.pkl')

    assembled = ac.run_preassembly(
        site_mapped, poolsize=4, save='sources/indra_phos_stmts_pre.pkl')
    human = ac.filter_human_only(assembled)
    genes_only = ac.filter_genes_only(human, specific_only=True)
    ac.dump_statements(genes_only, 'sources/indra_phos_stmts.pkl')
    return genes_only
def run_preassembly(statements, hierarchies):
    """Filter, preassemble and de-conflict statements for the demo.

    Intermediate (unfiltered) statements are dumped to
    ``pi_mtg_demo_unfiltered.pkl`` and the final top-level statements to
    ``pi_mtg_demo.pkl``.

    Parameters
    ----------
    statements : list
        Statements to process.
    hierarchies :
        Hierarchies handed to the ``Preassembler``.

    Returns
    -------
    list
        Top-level statements with contradictions removed.
    """
    print('%d total statements' % len(statements))

    # Map, cache, then keep only sufficiently grounded statements.
    mapped = map_onto(statements)
    ac.dump_statements(mapped, 'pi_mtg_demo_unfiltered.pkl')
    grounded = ac.filter_grounded_only(mapped, score_threshold=0.7)

    # Keep statements whose UN groundings all match one of these concepts
    # by suffix.
    un_concepts = ['conflict', 'food_security', 'flooding',
                   'food_production', 'human_migration', 'drought',
                   'food_availability', 'market', 'food_insecurity']
    on_topic = ac.filter_by_db_refs(grounded, 'UN', un_concepts,
                                    policy='all', match_suffix=True)

    # assume_polarity is called for its effect on the statements; its
    # return value is not used.
    assume_polarity(on_topic)
    with_polarity = filter_has_polarity(on_topic)

    # Make a Preassembler with the Eidos and TRIPS ontology
    preassembler = Preassembler(hierarchies, with_polarity)
    belief_engine = BeliefEngine()

    # Combine duplicates and set prior probabilities
    unique_stmts = preassembler.combine_duplicates()
    print('%d unique statements' % len(unique_stmts))
    belief_engine.set_prior_probs(unique_stmts)

    # Combine related statements and set hierarchy probabilities
    related_stmts = preassembler.combine_related(return_toplevel=False)
    belief_engine.set_hierarchy_probs(related_stmts)

    # Restrict to top-level statements; the preassembler is pointed at them
    # before contradictions are searched for.
    top_level = ac.filter_top_level(related_stmts)
    preassembler.stmts = top_level
    print('%d top-level statements' % len(top_level))
    contradictions = preassembler.find_contradicts()
    consistent = remove_contradicts(top_level, contradictions)
    ac.dump_statements(consistent, 'pi_mtg_demo.pkl')
    return consistent
def get_indra_reg_act_stmts():
    """Return regulation/activity statements, using a pickle cache if present.

    If ``sources/indra_reg_act_stmts.pkl`` can be loaded, its content is
    returned directly. Otherwise Activation, Inhibition and ActiveForm
    statements are fetched with ``by_gene_role_type``, grounding-mapped,
    filtered to grounded ones, preassembled, filtered to human, specific
    genes, and the result is written to that cache file.

    Returns
    -------
    list
        The cached or newly assembled statements.
    """
    cache_file = 'sources/indra_reg_act_stmts.pkl'
    try:
        return ac.load_statements(cache_file)
    except Exception as err:
        # Best effort: any failure to read the cache falls back to
        # rebuilding it. Unlike a bare `except`, this lets
        # KeyboardInterrupt and SystemExit propagate, and the reason for
        # the fallback is reported instead of silently discarded.
        print("Could not load %s (%s); rebuilding from INDRA DB"
              % (cache_file, err))

    stmts = []
    for stmt_type in ('Activation', 'Inhibition', 'ActiveForm'):
        print("Getting %s statements from INDRA DB" % stmt_type)
        stmts += by_gene_role_type(stmt_type=stmt_type)
    stmts = ac.map_grounding(stmts, save='sources/indra_reg_act_gmap.pkl')
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.run_preassembly(stmts, poolsize=4,
                               save='sources/indra_reg_act_pre.pkl')
    stmts = ac.filter_human_only(stmts)
    stmts = ac.filter_genes_only(stmts, specific_only=True)
    ac.dump_statements(stmts, cache_file)
    return stmts
def assemble_pysb(stmts, data_genes, out_file):
    """Return an assembled PySB model.

    The statements are run through ``preprocess_stmts``, written out as an
    equivalent SIF model, cached to disk with and without evidence, and
    assembled with a ``PysbAssembler``. The model is then passed to
    ``set_context`` and ``add_observables``, saved to ``out_file`` and
    pickled to ``korkut_pysb.pkl`` in the working directory.

    Parameters
    ----------
    stmts : list
        Statements handed to ``preprocess_stmts``.
    data_genes :
        Forwarded unchanged to ``preprocess_stmts``.
    out_file : str
        Path the model is saved to. Its extension-less stem is the prefix
        of the SIF file and of the cached statement pickles.

    Returns
    -------
    The ``model`` attribute of the assembler.
    """
    prefix = os.path.splitext(out_file)[0]
    stmts = preprocess_stmts(stmts, data_genes)

    # Write a SIF model of the same statements, useful for making direct
    # comparisons in pathfinding.
    sif_assembler = SifAssembler(stmts)
    sif_assembler.make_model(use_name_as_key=True, include_mods=True,
                             include_complexes=True)
    sif_text = sif_assembler.print_model(include_unsigned_edges=True)
    with open('%s_pysb.sif' % prefix, 'wt') as sif_handle:
        sif_handle.write(sif_text)

    # These are the statements that actually enter the assembler, so this is
    # where they are cached and where index cards are produced.
    ac.dump_statements(stmts, '%s_before_pa.pkl' % prefix)
    assemble_index_cards(stmts, 'output/index_cards')

    # Strip evidence (in place) from every statement and from the statements
    # it supports / is supported by, then cache the lighter version.
    for stmt in stmts:
        for target in [stmt] + stmt.supports + stmt.supported_by:
            target.evidence = []
    ac.dump_statements(stmts, '%s_no_evidence.pkl' % prefix)

    # Assemble the model, then set context and add observables.
    assembler = PysbAssembler()
    assembler.add_statements(stmts)
    assembler.make_model(reverse_effects=False)
    set_context(assembler)
    add_observables(assembler.model)

    # Persist the model both through the assembler and as a pickle.
    assembler.save_model(out_file)
    with open('korkut_pysb.pkl', 'wb') as pkl_handle:
        pickle.dump(assembler.model, pkl_handle)
    return assembler.model
def dump_raw_stmts(tr_dicts, stmt_file):
    """Dump all raw stmts in INDRA DB for a given set of TextRef IDs.

    Parameters
    ----------
    tr_dicts : dict of text ref information
        Keys are text ref IDs (ints) mapped to dictionaries of text ref
        metadata.
    stmt_file : str
        Path to file to dump pickled raw statements.

    Returns
    -------
    list of stmts
        Raw INDRA Statements retrieved from the INDRA DB.
    """
    # Fetch the raw statements for the text refs, then cache them.
    raw_stmts = get_raw_stmts(tr_dicts)
    ac.dump_statements(raw_stmts, stmt_file)
    return raw_stmts
def preprocess_db_stmts(stmts, output_file, filter_stmt_site):
    """Grounding-map database statements, deduplicate them and dump them.

    Parameters
    ----------
    stmts : list
        Statements taken from the database.
    output_file : str
        File the resulting statements are dumped to.
    filter_stmt_site : bool
        If true, keep only statements with both a residue and a position.

    Returns
    -------
    list
        One statement per distinct deep hash, ordered by hash, optionally
        restricted to statements with site information.
    """
    print("Mapping grounding")
    mapped = ac.map_grounding(stmts)

    print("Sorting and filtering")
    # Eliminate exact duplicates: sort by deep hash (the sort is stable, so
    # the first occurrence stays first) and keep the head of every group.
    def hash_of(pair):
        return pair[0]

    hashed = sorted(((s.get_hash(shallow=False), s) for s in mapped),
                    key=hash_of)
    unique = [next(group)[1]
              for _, group in itertools.groupby(hashed, key=hash_of)]

    if filter_stmt_site:
        # Filter to statements with residue and position
        selected = [s for s in unique if s.residue and s.position]
    else:
        selected = unique
    ac.dump_statements(selected, output_file)
    return selected
def assemble_correction(fname_orig, fname, reader):
    """Apply a line-based correction to the original text and draw the map.

    Lines of the correction file starting with ``<`` name a line to remove
    from the original text. Lines starting with ``>`` name a line to
    append. In both cases the first two characters are dropped. The
    corrected text is processed, the statements are dumped to
    ``ras_pathway_correction.pkl`` and passed to ``draw_graph`` under the
    name ``ras_pathway_correction``.

    Parameters
    ----------
    fname_orig : str
        Text file containing the original pathway description.
    fname : str
        Text file containing the correction.
    reader : str
        Which reader to use: ``'reach'`` or ``'trips'``.

    Raises
    ------
    ValueError
        If ``reader`` is neither ``'reach'`` nor ``'trips'``, or if a line
        marked for removal is not present in the original text.
    """
    # Reject unknown readers before doing any work; otherwise `stmts` would
    # never be assigned and the failure would surface as an unrelated
    # UnboundLocalError further down.
    if reader not in ('reach', 'trips'):
        raise ValueError("Unknown reader %r; expected 'reach' or 'trips'"
                         % (reader,))

    # Read correction
    with open(fname_orig, 'rt') as fh:
        orig_txt = [ln.strip() for ln in fh]
    with open(fname, 'rt') as fh:
        correct_txt = [ln.strip() for ln in fh]
    for ln in correct_txt:
        if ln.startswith('<'):
            # list.remove raises ValueError when the line is absent.
            orig_txt.remove(ln[2:])
        elif ln.startswith('>'):
            orig_txt.append(ln[2:])
    txt = '\n'.join(orig_txt)
    if reader == 'reach':
        stmts = process_reach(txt)
    else:
        stmts = process_trips(txt, reread=True)
    ac.dump_statements(stmts, 'ras_pathway_correction.pkl')
    draw_graph(stmts, 'ras_pathway_correction')
def get_statements(self, output_file=None):
    """Get the full set of model statements including extra statements.

    The statements are collected by flattening the two-level mapping
    returned by ``self.get_stmts_by_group()`` and are also stored on
    ``self.statements``. Optionally dumps a pickle of statements to given
    output file.

    Parameters
    ----------
    output_file : str
        File to save the statements. Nothing is saved when None.

    Returns
    -------
    list of INDRA Statements
    """
    flattened = []
    for stmts_by_line in self.get_stmts_by_group().values():
        for stmt_list in stmts_by_line.values():
            flattened.extend(stmt_list)
    self.statements = flattened

    # Dump the statements
    if output_file is not None:
        ac.dump_statements(self.statements, output_file)
    return self.statements
def build_prior(gene_names):
    """Build a corpus of prior Statements from PC and BEL.

    BEL, Pathway Commons (BioPAX) and Phosphosite statements are each
    dumped to their own file, named through ``prefixed_pkl``. Nothing is
    returned.

    Parameters
    ----------
    gene_names : list
        Gene names handed to the ``GeneNetwork``.
    """
    network = GeneNetwork(gene_names, basen)

    # Read BEL Statements
    ac.dump_statements(network.get_bel_stmts(filter=False),
                       prefixed_pkl('bel'))

    # Read Pathway Commons Statements from the selected databases
    all_biopax = network.get_biopax_stmts(
        database_filter=['reactome', 'kegg', 'pid'])
    # Drop statements that have any evidence from a blacklisted source
    blacklisted = set(biopax_blacklist)
    kept_biopax = [
        stmt for stmt in all_biopax
        if not any(ev.source_id in blacklisted for ev in stmt.evidence)
    ]
    ac.dump_statements(kept_biopax, prefixed_pkl('biopax'))

    # Read Phosphosite Statements
    ac.dump_statements(read_phosphosite_owl(phosphosite_owl_file),
                       prefixed_pkl('phosphosite'))
def test_dump_stmts():
    """Check that a dumped statement list loads back unchanged."""
    pkl_path = '_test.pkl'
    ac.dump_statements([st1], pkl_path)
    reloaded = ac.load_statements(pkl_path)
    assert len(reloaded) == 1
    assert reloaded[0].equals(st1)
def test_dump_stmts():
    """Round-trip one statement through dump_statements/load_statements."""
    target = '_test.pkl'
    originals = [st1]
    ac.dump_statements(originals, target)
    restored = ac.load_statements(target)
    # Exactly one statement comes back and it equals the one written.
    assert (len(restored) == 1)
    first_restored = restored[0]
    assert (first_restored.equals(st1))
def main(args):
    """Match correlated gene pairs against INDRA statements and write reports.

    For every unique pair, statements are looked up in both directions in a
    nested dict built from the loaded statements. Connections are written
    to LaTeX files (all connections, and those with negative correlation)
    and to CSV files (connections, negative connections, unexplained
    pairs). All output file names start with ``args.outbasename``.

    Parameters
    ----------
    args :
        Parsed command-line arguments. The attributes read here are
        ``ceres_file``, ``geneset_file``, ``corr_file``, ``strict``,
        ``outbasename``, ``recalc``, ``ll``, ``ul``, ``statements_in`` and
        ``statements_out``.
    """
    uniq_pairs, all_hgnc_ids, fsort_corrs = \
        get_correlations(args.ceres_file, args.geneset_file, args.corr_file,
                         args.strict, args.outbasename, args.recalc,
                         args.ll, args.ul)

    # Get statements from file or from database that contain any gene from
    # provided list as set
    if args.statements_in:
        # Get statements from file
        stmts_all = set(ac.load_statements(args.statements_in))
    elif args.geneset_file:
        # Use api to get statements. NOT the same as querying for each ID
        # NOTE(review): `gene_filter_list` is not assigned in this function;
        # it must exist at module level for this branch to work - confirm.
        stmts_all = dnf.dbc_load_statements(gene_filter_list)
    else:
        # if there is no gene set file, restrict to gene ids in
        # correlation data
        stmts_all = dnf.dbc_load_statements(list(all_hgnc_ids))

    # Dump statements to pickle file if output name has been given
    if args.statements_out:
        ac.dump_statements(stmts=stmts_all, fname=args.statements_out)

    # Get nested dicts from statements
    nested_dict_statements = dnf.nested_dict_gen(stmts_all)

    dir_conn_pairs = []
    dir_neg_conn_pairs = []
    unexplained = []
    npairs = len(uniq_pairs)
    ev_fltr = 0

    # The context manager guarantees both LaTeX files are flushed and closed
    # even when processing a pair raises.
    with open(args.outbasename + '_connections_latex.tex', 'w') as f_con, \
            open(args.outbasename + '_neg_conn_latex.tex', 'w') as f_neg_c:
        logger.info('Looking for connections between %i pairs' % npairs)
        # Loop through the unique pairs
        for pair in uniq_pairs:
            # Each pair holds two ids and one float (the correlation); the
            # first float found is split off and the rest are the ids.
            pl = list(pair)
            for li in pl:
                if _is_float(li):
                    correlation = li
                    fmt_corr = '{0:.04}'.format(correlation)
                    break
            pl.remove(correlation)
            id1, id2 = pl

            # Look up statements in both directions. A missing or empty
            # entry at either level counts as "no connection"; `.get` is
            # chained only after the first level is known to be non-empty.
            directed_hits = []
            for subj, obj in ((id1, id2), (id2, id1)):
                by_obj = nested_dict_statements.get(subj)
                stmts = by_obj.get(obj) if by_obj else None
                if stmts:
                    directed_hits.append((subj, obj, stmts))

            if not directed_hits:
                # Neither direction has statements: count as unexplained
                unexplained.append([id1, id2, correlation])
                continue

            # Section header with a link to the DepMap correlation plot.
            new_pair = r'\section{{{}, {}: {}}}'.format(id1, id2, fmt_corr) \
                + '\n' + \
                r'See correlation plot \href{{' \
                r'https://depmap.org/portal/interactive/?xDataset=Avana' \
                r'&xFeature={}&yDataset=Avana&yFeature={}&colorDataset=' \
                r'lineage&colorFeature=all&filterDataset=context' \
                r'&filterFeature=&regressionLine=false' \
                r'&statisticsTable=false' \
                r'&associationTable=true&plotOnly=false}}{{here}}'.format(
                    id1, id2) + '\n\n'
            f_con.write(new_pair)
            if correlation < 0:
                f_neg_c.write(new_pair)

            # Forward direction (subj=id1, obj=id2) is reported first.
            for subj, obj, stmts in directed_hits:
                logger.info('Found connection between %s and %s' %
                            (subj, obj))
                dir_conn_pairs.append((subj, obj, correlation, stmts))
                output = dnf.latex_output(subj=subj, obj=obj,
                                          corr=correlation,
                                          ev_len_fltr=ev_fltr, stmts=stmts,
                                          ignore_str='parent')
                f_con.write(output)
                if correlation < 0:
                    dir_neg_conn_pairs.append(
                        (subj, obj, correlation, stmts))
                    f_neg_c.write(output)

    csv_outputs = (('_connections.csv', dir_conn_pairs),
                   ('_neg_conn.csv', dir_neg_conn_pairs),
                   ('_unexplained.csv', unexplained))
    for suffix, rows in csv_outputs:
        with open(args.outbasename + suffix, 'w', newline='') as csvf:
            wrtr = csv.writer(csvf, delimiter=',')
            wrtr.writerows(rows)
from os.path import abspath, dirname, join
from indra.tools import assemble_corpus as ac
from indra.databases import hgnc_client
from indra.assemblers.indranet import IndraNetAssembler
from indra.sources import indra_db_rest as idr


if __name__ == '__main__':
    # The Gordon et al. statements are located relative to this script.
    stmts_path = join(dirname(abspath(__file__)), '..', '..', '..',
                      'covid-19', 'stmts')
    gordon_stmts = ac.load_statements(
        join(stmts_path, 'gordon_ndex_stmts.pkl'))

    # Get human interactors of viral proteins from Gordon et al.: collect
    # the HGNC ID of every agent that has one.
    hgnc_ids = []
    for stmt in gordon_stmts:
        for ag in stmt.agent_list():
            if ag is None or 'HGNC' not in ag.db_refs:
                continue
            hgnc_ids.append(ag.db_refs['HGNC'])
    hgnc_names = [hgnc_client.get_hgnc_name(hgnc_id) for hgnc_id in hgnc_ids]

    # Query the INDRA DB REST API once per gene name and pool the results.
    stmts = [stmt
             for gene in hgnc_names
             for stmt in idr.get_statements(agents=[gene]).statements]
    ac.dump_statements(stmts, 'gordon_ppi_stmts.pkl')
def build_prior(genes, out_file):
    """Get unfiltered statements for a gene network and dump them.

    Parameters
    ----------
    genes : list
        Genes handed to the ``GeneNetwork``.
    out_file : str
        File the statements are dumped to.

    Returns
    -------
    list
        The statements returned by the gene network.
    """
    prior_stmts = GeneNetwork(genes).get_statements(filter=False)
    ac.dump_statements(prior_stmts, out_file)
    return prior_stmts
# The file in which the preassembled statements will be saved pre_stmts_file = prefixed_pkl('preassembled') if reassemble: # Load various files that were previously produced sources = [ 'indradb', 'trips', 'bel', 'biopax', 'phosphosite', 'r3', 'sparser' ] stmts = [] for source in sources: stmts += ac.load_statements(prefixed_pkl(source)) stmts = ac.filter_no_hypothesis(stmts) # Fix grounding and filter to grounded entities and for proteins, # filter to the human ones stmts = ac.map_grounding(stmts) stmts = ac.filter_grounded_only(stmts) stmts = ac.filter_human_only(stmts) # Combinatorially expand protein families stmts = ac.expand_families(stmts) # Apply a strict filter to statements based on the gene names stmts = ac.filter_gene_list(stmts, gene_names, 'all') # Fix errors in references to protein sequences stmts = ac.map_sequence(stmts) # Run preassembly and save result stmts = ac.run_preassembly(stmts, return_toplevel=False) ac.dump_statements(stmts, pre_stmts_file) # Load the preassembled statements stmts = ac.load_statements(pre_stmts_file) # Run assembly into a PySB model assemble_pysb.assemble_pysb(stmts, gene_names, contextualize=True)
def main(args):
    """Match correlated gene pairs against the INDRA network and report.

    The function parses the filter and input options from ``args``, loads
    or builds the nested statement dict and the directed graph, and then
    calls ``loop_body(args)`` once per gene pair. Pairs come from knockout
    correlation data, BRCA dependency data, both, or random sampling.
    Finally a summary and the explanation tables are written to files whose
    names start with ``args.outbasename``.

    ``loop_body`` communicates through module-level globals (``id1``,
    ``id2``, ``dataset_dict``, the counters and the result lists), which is
    why they are declared ``global`` and assigned here.

    Parameters
    ----------
    args :
        Parsed command-line arguments.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    FileNotFoundError
        If neither a belief score dict nor a nested dict is provided.
    SystemExit
        If ``--cell-line-filter`` or ``--explained-set`` is given a wrong
        number of arguments.
    """
    global any_expl, any_expl_not_sr, common_parent, ab_expl_count, \
        directed_im_expl_count, both_im_dir_expl_count, \
        any_axb_non_sr_expl_count, sr_expl_count, \
        shared_regulator_only_expl_count, explanations_of_pairs, \
        unexplained, explained_nested_dict, id1, id2, \
        nested_dict_statements, dataset_dict, avg_corr, dir_node_set, \
        nx_dir_graph, explained_set, part_of_explained, sr_explanations, \
        any_expl_ign_sr

    if args.cell_line_filter and not len(args.cell_line_filter) > 2:
        logger.info('Filtering to provided cell lines in correlation '
                    'calculations.')
        cell_lines = _parse_cell_filter(*args.cell_line_filter)
        assert len(cell_lines) > 0
    elif args.cell_line_filter and len(args.cell_line_filter) > 2:
        sys.exit('Argument --cell-line-filter only takes one or two arguments')
    # No cell line dictionary and rnai data and filtering is requested
    # NOTE(review): this branch is unreachable, because a one-element filter
    # is already handled by the first branch above. Left unchanged so that
    # runs which currently succeed do not start exiting - confirm intent.
    elif args.cell_line_filter and len(args.cell_line_filter) == 1 and \
            args.rnai_data_file:
        sys.exit('Need a translation dictionary if RNAi data is provided and '
                 'filter is requested')
    else:
        # Should be empty only when --cell-line-filter is not provided
        logger.info('No cell line filter provided. Using all cell lines in '
                    'correlation calculations.')
        cell_lines = []

    # Parse "explained genes"
    if args.explained_set and len(args.explained_set) == 2:
        explained_set = _parse_explained_genes(
            gene_set_file=args.explained_set[0],
            check_column=args.explained_set[1])
        logger.info('Loading "explained pairs."')
    elif args.explained_set and len(args.explained_set) != 2:
        sys.exit('Argument --explained-set takes exactly two arguments: '
                 '--explained-set <file> <column name>')

    # Check if belief dict is provided
    if not args.belief_score_dict and not args.nested_dict_in:
        logger.error('Belief dict must be provided through the `-b ('
                     '--belief-score-dict)` argument if no nested dict '
                     'of statements with belief score is provided through the '
                     '`-ndi (--nested-dict-in)` argument.')
        raise FileNotFoundError

    # Get dict of {hash: belief score}
    belief_dict = None  # ToDo use api to query belief scores if not loaded
    if args.belief_score_dict:
        if args.belief_score_dict.endswith('.json'):
            belief_dict = _json_open(args.belief_score_dict)
        elif args.belief_score_dict.endswith('.pkl'):
            belief_dict = _pickle_open(args.belief_score_dict)

    args_dict = _arg_dict(args)
    npairs = 0

    filter_settings = {
        'gene_set_filter': args.gene_set_filter,
        'strict': args.strict,
        'cell_line_filter': cell_lines,
        # The translation dict is the optional second filter argument
        'cell_line_translation_dict':
            _pickle_open(args.cell_line_filter[1])
            if args.cell_line_filter and len(args.cell_line_filter) == 2
            else None,
        'margin': args.margin,
        'filter_type': (args.filter_type if args.filter_type else None)
    }

    output_settings = {'dump_unique_pairs': args.dump_unique_pairs,
                       'outbasename': args.outbasename}

    # Parse CRISPR and/or RNAi data
    if args_dict.get('crispr') or args_dict.get('rnai'):
        if not filter_settings['filter_type'] and \
                args.crispr_data_file and \
                args.rnai_data_file:
            logger.info('No merge filter set. Output will be intersection of '
                        'the two data sets.')
        elif filter_settings.get('filter_type'):
            logger.info('Using filter type "%s"' %
                        filter_settings['filter_type'])

        master_corr_dict, all_hgnc_ids, stats_dict = \
            dnf.get_combined_correlations(dict_of_data_sets=args_dict,
                                          filter_settings=filter_settings,
                                          output_settings=output_settings)

        # Count pairs in merged correlation dict and dump it
        npairs = dnf._dump_master_corr_dict_to_pairs_in_csv(
            fname=args.outbasename + '_merged_corr_pairs.csv',
            nest_dict=master_corr_dict)

        if args.gene_set_filter:
            gene_filter_list = None
            if args_dict.get('crispr') and not args_dict.get('rnai'):
                gene_filter_list = dnf._read_gene_set_file(
                    gf=filter_settings['gene_set_filter'],
                    data=pd.read_csv(args_dict['crispr']['data'],
                                     index_col=0, header=0))
            elif args_dict.get('rnai') and not args_dict.get('crispr'):
                gene_filter_list = dnf._read_gene_set_file(
                    gf=filter_settings['gene_set_filter'],
                    data=pd.read_csv(args_dict['rnai']['data'],
                                     index_col=0, header=0))
            elif args_dict.get('crispr') and args_dict.get('rnai'):
                # With both data sets, keep genes present in both
                gene_filter_list = \
                    set(dnf._read_gene_set_file(
                        gf=filter_settings['gene_set_filter'],
                        data=pd.read_csv(args_dict['crispr']['data'],
                                         index_col=0, header=0))) & \
                    set(dnf._read_gene_set_file(
                        gf=filter_settings['gene_set_filter'],
                        data=pd.read_csv(args_dict['rnai']['data'],
                                         index_col=0, header=0)))
            assert gene_filter_list is not None
        else:
            gene_filter_list = None
    else:
        stats_dict = None

    # LOADING INDRA STATEMENTS
    # Get statements from file or from database that contain any gene from
    # provided list as set unless you're already loading a pre-calculated
    # nested dict and/or precalculated directed graph.
    if not (args.light_weight_stmts or args.nested_dict_in):
        if args.statements_in:
            # Get statements from file
            stmts_all = set(ac.load_statements(args.statements_in))
        # Use api to get statements. _NOT_ the same as querying for each ID
        else:
            if args.gene_set_filter:
                stmts_all = dnf.dbc_load_statements(gene_filter_list)
            else:
                # if there is no gene set file, restrict to gene ids in
                # input data
                stmts_all = dnf.dbc_load_statements(list(all_hgnc_ids))

        # Dump statements to pickle file if output name has been given
        if args.statements_out:
            logger.info('Dumping read raw statements')
            ac.dump_statements(stmts=stmts_all, fname=args.statements_out)

    # Get nested dicts from statements
    if args.light_weight_stmts:
        hash_df = pd.read_csv(args.light_weight_stmts, delimiter='\t')
        nested_dict_statements = dnf.nested_hash_dict_from_pd_dataframe(
            hash_df)
    elif args.nested_dict_in:
        nested_dict_statements = _pickle_open(args.nested_dict_in)
    else:
        nested_dict_statements = dnf.dedupl_nested_dict_gen(
            stmts_all, belief_dict)
        if args.nested_dict_out:
            _dump_it_to_pickle(fname=args.nested_dict_out,
                               pyobj=nested_dict_statements)

    # Get directed simple graph
    if args.directed_graph_in:
        with open(args.directed_graph_in, 'rb') as rpkl:
            nx_dir_graph = pkl.load(rpkl)
    else:
        # Create directed graph from statement dict
        nx_dir_graph = dnf.nx_directed_graph_from_nested_dict_2layer(
            nest_d=nested_dict_statements, belief_dict=belief_dict)
        # Save as pickle file
        if args.directed_graph_out:
            _dump_it_to_pickle(fname=args.directed_graph_out,
                               pyobj=nx_dir_graph)
    dir_node_set = set(nx_dir_graph.nodes)

    # LOOP THROUGH THE UNIQUE CORRELATION PAIRS, MATCH WITH INDRA NETWORK
    any_expl = 0  # Count if any explanation per (A,B) correlation found
    any_expl_not_sr = 0  # Count any explanation, excluding when shared
    # regulator is the only explanation
    any_expl_ign_sr = 0  # Count any explanation, ignoring shared regulator
    # explanations
    common_parent = 0  # Count if common parent found per set(A,B)
    part_of_explained = 0  # Count pairs part the "explained set"
    ab_expl_count = 0  # Count A-B/B-A as one per set(A,B)
    directed_im_expl_count = 0  # Count any A->X->B,B->X->A as one per set(A,B)
    any_axb_non_sr_expl_count = 0  # Count if shared target found per set(A,B)
    sr_expl_count = 0  # Count if shared regulator found per set(A,B)
    shared_regulator_only_expl_count = 0  # Count if only shared regulator found
    explanations_of_pairs = []  # Saves all non shared regulator explanations
    sr_explanations = []  # Saves all shared regulator explanations
    unexplained = []  # Unexplained correlations
    skipped = 0

    # The explained nested dict: (1st key = subj, 2nd key = obj, 3rd key =
    # connection type or correlation).
    #
    # directed: any A->B or B->A
    # undirected: any of complex, selfmodification, parent
    # x_is_intermediary: A->X->B or B->X->A
    # x_is_downstream: A->X<-B
    # x_is_upstream: A<-X->B
    #
    # d[subj][obj] = {correlation: {gene_set1: corr, gene_set2: corr, ...},
    #                 directed: [(stmt/stmt hash, belief score)],
    #                 undirected: [(stmt/stmt hash, belief score)],
    #                 common_parents: [list of parents]
    #                 x_is_intermediary: [(X, belief rank)],
    #                 x_is_downstream: [(X, belief rank)],
    #                 x_is_upstream: [(X, belief rank)]}
    #
    # Then in javascript you can for example do:
    # if SUBJ_is_subj_dict.obj.direct.length <-- should return zero if []
    #
    # Used to get: directed graph
    # 1. all nodes of directed graph -> 1st dropdown
    # 2. dir -> undir graph -> jsons to check all corr neighbors -> 2nd dropdown
    # 3. jsons to check if connection is direct or intermediary

    # Using the following loop structure for counter variables:
    # a = 2
    # def for_loop_body():
    #     global a
    #     a += 1
    #
    # Then loop like:
    # if dict:
    #     for pairs in dict:
    #         for_loop_body(args)
    # elif random:
    #     for random pair:
    #         for_loop_body(args)

    explained_nested_dict = dnf.create_nested_dict()

    # `and` binds tighter than `or`, so the knockout test is evaluated on
    # its own first. Written inline as `rnai or crispr and not brca`, the
    # "knockout only" branch was taken whenever RNAi data was given, even
    # with BRCA dependencies.
    has_knockout_data = args_dict.get('rnai') or args_dict.get('crispr')

    # Loop rnai and/or crispr only
    if has_knockout_data and not args.brca_dependencies:
        logger.info('Gene pairs generated from DepMap knockout screening data '
                    'sets')
        logger.info('Looking for connections between %i pairs' % (
            npairs if npairs > 0 else args.max_pairs))
        for outer_id, do in master_corr_dict.items():
            for inner_id, dataset_dict in do.items():
                if len(dataset_dict.keys()) == 0:
                    skipped += 1
                    if args.verbosity:
                        logger.info('Skipped outer_id=%s and inner_id=%s' %
                                    (outer_id, inner_id))
                    continue

                id1, id2 = outer_id, inner_id
                loop_body(args)

    # Loop rnai and/or crispr AND BRCA cell line dependencies
    elif has_knockout_data and args.brca_dependencies:
        logger.info('Gene pairs generated from combined knockout screens. '
                    'Output data will incluide BRCA cell line dependency\n'
                    'data as well as correlation data from knockout screens.')
        logger.info('Looking for connections between %i pairs' % (
            npairs if npairs > 0 else args.max_pairs))

        # Load BRCA dependency data
        brca_data_set = pd.read_csv(args.brca_dependencies, header=0)
        depend_in_breast_genes = brca_data_set.drop(
            axis=1, labels=['Url Label', 'Type'])[
            brca_data_set['Type'] == 'gene']
        genes = set(depend_in_breast_genes['Gene/Compound'].values)

        for outer_id, do in master_corr_dict.items():
            for inner_id, knockout_dict in do.items():
                if len(knockout_dict.keys()) == 0:
                    skipped += 1
                    if args.verbosity:
                        logger.info('Skipped outer_id=%s and inner_id=%s' %
                                    (outer_id, inner_id))
                    continue

                id1, id2 = outer_id, inner_id
                dataset_dict = {}
                gene1_data = []
                gene2_data = []

                # Get BRCA dep data
                if id1 in genes:
                    for row in depend_in_breast_genes[
                            depend_in_breast_genes['Gene/Compound'] ==
                            id1].iterrows():
                        gene1_data.append((row[1]['Dataset'],
                                           row[1]['T-Statistic'],
                                           row[1]['P-Value']))
                if id2 in genes:
                    for row in depend_in_breast_genes[
                            depend_in_breast_genes['Gene/Compound'] ==
                            id2].iterrows():
                        gene2_data.append((row[1]['Dataset'],
                                           row[1]['T-Statistic'],
                                           row[1]['P-Value']))

                dataset_dict[id1] = gene1_data
                dataset_dict[id2] = gene2_data

                # Both entries hold the plain value or None. A stray
                # trailing comma used to store the crispr entry as a
                # 1-tuple, unlike the rnai entry.
                dataset_dict['crispr'] = (knockout_dict['crispr']
                                          if knockout_dict.get('crispr')
                                          else None)
                dataset_dict['rnai'] = (knockout_dict['rnai']
                                        if knockout_dict.get('rnai')
                                        else None)

                # Without BRCA data for either gene, pass the knockout data
                # through unchanged
                if id1 not in genes and id2 not in genes:
                    dataset_dict = knockout_dict

                # Run loop body
                loop_body(args)

    # loop brca dependency ONLY
    elif args.brca_dependencies and not has_knockout_data:
        logger.info(
            'Gene pairs generated from BRCA gene enrichment data only.')
        brca_data_set = pd.read_csv(args.brca_dependencies, header=0)
        depend_in_breast_genes = brca_data_set.drop(
            axis=1, labels=['Url Label', 'Type'])[
            brca_data_set['Type'] == 'gene']
        genes = set(depend_in_breast_genes['Gene/Compound'].values)
        # Number of unordered pairs, n*(n-1)/2, computed without
        # materializing every combination
        npairs = len(genes) * (len(genes) - 1) // 2
        logger.info('Looking for connections between %i pairs' % (
            npairs if npairs > 0 else args.max_pairs))
        for id1, id2 in itt.combinations(genes, 2):
            gene1_data = []
            gene2_data = []
            # For each non-diagonal pair in file, insert in dataset_dict:
            # geneA, geneB,
            # dataset for A, dataset for B,
            # T-stat for A, T-stat for B,
            # P-value for A, P-value
            for row in depend_in_breast_genes[
                    depend_in_breast_genes['Gene/Compound'] ==
                    id1].iterrows():
                gene1_data.append((row[1]['Dataset'],
                                   row[1]['T-Statistic'],
                                   row[1]['P-Value']))

            for row in depend_in_breast_genes[
                    depend_in_breast_genes['Gene/Compound'] ==
                    id2].iterrows():
                gene2_data.append((row[1]['Dataset'],
                                   row[1]['T-Statistic'],
                                   row[1]['P-Value']))
            # dataset_dict = {id1:
            #                 [(dataset1, T-stat1, P-value1),
            #                  (dataset2, T-stat2, P-value2)],
            #                 id2:
            #                  [(..., ...)],
            #                  ...}
            dataset_dict = {id1: gene1_data, id2: gene2_data}
            loop_body(args)

    # loop random pairs from data set
    elif args_dict.get('sampling_gene_file'):
        logger.info('Gene pairs generated at random from %s' %
                    args_dict['sampling_gene_file'])
        with open(args_dict['sampling_gene_file'], 'r') as fi:
            rnd_gene_set = [l.strip() for l in fi.readlines()]

        npairs = args.max_pairs
        dataset_dict = None
        logger.info('Looking for connections between %i pairs' % (
            npairs if npairs > 0 else args.max_pairs))
        for _ in range(npairs):
            id1, id2 = _rnd_pair_gen(rnd_gene_set)
            assert not isinstance(id1, list)
            loop_body(args)

    # Build the human-readable summary; it is logged and also written to
    # the summary file at the end.
    long_string = ''
    long_string += '-' * 63 + '\n'
    long_string += 'Summary for matching INDRA network to correlation pairs:'\
                   + '\n\n'
    long_string += '> Total number of correlation pairs checked: %i' % npairs\
                   + '\n'
    if args.verbosity:
        long_string += '> Skipped %i empty doublets in corr dict\n' % skipped

    long_string += '> Total correlations unexplained: %i' % len(unexplained)\
                   + '\n'
    long_string += '> Total correlations explained: %i' % any_expl + '\n'
    long_string += '> Total correlations explained, ignoring shared ' \
                   'regulator: %i' % any_expl_ign_sr + '\n'
    long_string += '> Total correlations explained, excluding shared ' \
                   'regulator (total - shared only): %i' % \
                   (any_expl - shared_regulator_only_expl_count) + '\n'
    long_string += '> %i correlations have an explanation involving a ' \
                   'common parent' % common_parent + '\n'
    if args.explained_set:
        long_string += '> %i gene pairs were considered explained as part ' \
                       'of the "explained set"' % part_of_explained + '\n'
    long_string += '> %i explanations involving direct connection or ' \
                   'complex' % ab_expl_count + '\n'
    long_string += '> %i correlations have a directed explanation ' \
                   'involving an intermediate node (A->X->B/A<-X<-B)' \
                   % directed_im_expl_count + '\n'
    long_string += '> %i correlations have an explanation involving an ' \
                   'intermediate node excluding shared regulators' % \
                   any_axb_non_sr_expl_count + '\n'
    long_string += '> %i correlations have an explanation involving a ' \
                   'shared regulator (A<-X->B)' % sr_expl_count + '\n'
    long_string += '> %i correlations have shared regulator as only ' \
                   'explanation' % shared_regulator_only_expl_count + '\n\n'

    if stats_dict and (stats_dict.get('rnai') or stats_dict.get('crispr')):
        long_string += 'Statistics of input data:' + '\n\n'
    if stats_dict and stats_dict.get('rnai'):
        long_string += ' RNAi data ' + '\n'
        long_string += ' -----------' + '\n'
        long_string += '> mean: %f\n' % stats_dict['rnai']['mean']
        long_string += '> SD: %f\n' % stats_dict['rnai']['sigma']
        long_string += '> lower bound: %.3f*SD = %.4f\n' % (
            args_dict['rnai']['ll'],
            args_dict['rnai']['ll'] * stats_dict['rnai']['sigma'])
        if args_dict['rnai']['ul']:
            long_string += '> upper bound: %.3f*SD = %.4f\n\n' % (
                args_dict['rnai']['ul'],
                args_dict['rnai']['ul'] * stats_dict['rnai']['sigma'])
    if stats_dict and stats_dict.get('crispr'):
        long_string += ' CRISPR data ' + '\n'
        long_string += ' -------------' + '\n'
        long_string += '> mean: %f\n' % stats_dict['crispr']['mean']
        long_string += '> SD: %f\n' % stats_dict['crispr']['sigma']
        long_string += '> lower bound: %.3f*SD = %.4f\n' % (
            args_dict['crispr']['ll'],
            args_dict['crispr']['ll'] * stats_dict['crispr']['sigma'])
        if args_dict['crispr']['ul']:
            long_string += '> upper bound: %.3f*SD = %.4f\n\n' % (
                args_dict['crispr']['ul'],
                args_dict['crispr']['ul'] * stats_dict['crispr']['sigma'])
    long_string += '-' * 63 + '\n\n'

    logger.info('\n' + long_string)

    # Here create directed graph from explained nested dict
    nx_expl_dir_graph = dnf.nx_directed_graph_from_nested_dict_3layer(
        nest_d=explained_nested_dict)

    if not args.no_web_files:
        # 'explained_nodes' are used to produce first drop down
        explained_nodes = list(nx_expl_dir_graph.nodes)
        logger.info('Dumping json "explainable_ids.json" for first dropdown.')
        _dump_it_to_json(args.outbasename + '_explainable_ids.json',
                         explained_nodes)

        # Get undir graph and save each neighbor lookup as json for 2nd
        # dropdown
        nx_expl_undir_graph = nx_expl_dir_graph.to_undirected()
        dnf.nx_undir_to_neighbor_lookup_json(
            expl_undir_graph=nx_expl_undir_graph,
            outbasename=args.outbasename)

    # Easiest way to check if pairs are explained or not is to loop explained
    # dict. Skip shared regulators.
    _dump_nest_dict_to_csv(
        fname=args.outbasename + '_explained_correlations.csv',
        nested_dict=explained_nested_dict,
        header=['gene1', 'gene2', 'meta_data'],
        excl_sr=True)

    _dump_it_to_pickle(fname=args.outbasename + '_explained_nest_dict.pkl',
                       pyobj=explained_nested_dict)
    headers = ['subj', 'obj', 'type', 'X', 'meta_data']
    _dump_it_to_csv(fname=args.outbasename + '_explanations_of_pairs.csv',
                    pyobj=explanations_of_pairs, header=headers)
    _dump_it_to_csv(
        fname=args.outbasename + '_explanations_of_shared_regulators.csv',
        pyobj=sr_explanations, header=headers)
    # The unexplained table only uses the first three header columns
    _dump_it_to_csv(fname=args.outbasename + '_unexpl_correlations.csv',
                    pyobj=unexplained, header=headers[:-2])
    with open(args.outbasename + '_script_summary.txt', 'w') as fo:
        fo.write(long_string)
    return 0
site = '%s_%s%s' % (stmt.sub.name, stmt.residue, stmt.position) regulons[kinase].add(site) rows = [] for kinase, sites in regulons.items(): rows.append([kinase, 'Description'] + [s for s in sites]) with open(filename, 'wt') as f: csvwriter = csv.writer(f, delimiter='\t') csvwriter.writerows(rows) if __name__ == '__main__': reload = False if reload: phos_stmts = \ get_phosphorylation_stmts('../work/gsea_sites.rnk') ac.dump_statements(phos_stmts, '../work/phospho_stmts.pkl') else: phos_stmts = ac.load_statements('../work/phospho_stmts.pkl') regulons_from_stmts(phos_stmts, '../work/kinase_regulons.gmt') #kinases = get_kinase_counts(phos_stmts) target_list = get_stmt_subject_object(phos_stmts, 'SUBJECT') # Get all Tubulin child nodes as the source list source_list = [('FPLX', 'Tubulin')] tubulin_ag = Agent('Tubulin', db_refs={'FPLX': 'Tubulin'}) ex = Expander(bio_ontology) for ag_ns, ag_id in ex.get_children(tubulin_ag, ns_filter=None): #if ag_ns == 'HGNC':
'--ctd_stmts', help='Path to CTD statements pkl file', required=True) parser.add_argument('-f', '--output_file', help='Output file for combined pkl', required=True) args = parser.parse_args() # Load everything logger.info('Loading statements from pickle files') with open(args.old_mm, 'rb') as f: old_mm_emmaa_stmts = pickle.load(f) old_mm_stmts = [es.stmt for es in old_mm_emmaa_stmts] if args.new_cord: new_cord_stmts = ac.load_statements(args.new_cord) else: new_cord_stmts = None drug_stmts = ac.load_statements(args.drug_stmts) gordon_stmts = ac.load_statements(args.gordon_stmts) virhostnet_stmts = ac.load_statements(args.virhostnet_stmts) ctd_stmts = ac.load_statements(args.ctd_stmts) other_stmts = drug_stmts + gordon_stmts + virhostnet_stmts + ctd_stmts combined_stmts = make_model_stmts(old_mm_stmts, other_stmts, new_cord_stmts) # Dump new pickle ac.dump_statements(combined_stmts, args.output_file)
def build_prior(genes, out_file):
    """Fetch BioPAX-derived prior statements for a gene set and save them.

    Parameters
    ----------
    genes
        Passed unchanged as the first argument of ``GeneNetwork``
        (presumably a list of gene names; verify against the caller).
    out_file
        Destination handed to ``ac.dump_statements``.

    Returns
    -------
    The statements returned by ``GeneNetwork.get_biopax_stmts``.
    """
    # The network is always created under the fixed name 'dna_damage_prior'.
    network = GeneNetwork(genes, 'dna_damage_prior')
    # Only the BioPAX query is active here; the disabled alternative was
    # network.get_statements(filter=False).
    prior_stmts = network.get_biopax_stmts(filter=False)
    ac.dump_statements(prior_stmts, out_file)
    return prior_stmts
def build_prior(genes, file_prefix):
    """Fetch BioPAX-derived prior statements and dump them to a pickle.

    Parameters
    ----------
    genes
        Passed unchanged as the first argument of ``GeneNetwork``
        (presumably a list of gene names; verify against the caller).
    file_prefix
        Used twice: as the second argument of ``GeneNetwork`` and, with
        ``.pkl`` appended, as the path given to ``ac.dump_statements``.

    Returns
    -------
    The statements returned by ``GeneNetwork.get_biopax_stmts``.
    """
    network = GeneNetwork(genes, file_prefix)
    # Only the BioPAX query is active here; the disabled alternative was
    # network.get_statements(filter=False).
    prior_stmts = network.get_biopax_stmts(filter=False)
    pickle_path = '%s.pkl' % file_prefix
    ac.dump_statements(prior_stmts, pickle_path)
    return prior_stmts
def assemble_pysb(stmts, data_genes, contextualize=False):
    """Filter and link Statements, then assemble and pickle PySB models.

    The Statements are filtered, run through two MechLinker passes and
    pruned of inconsequential modifications/activities until the count
    stops shrinking. The result is dumped as 'pysb_stmts', extended with
    the drug target Statements, assembled into a generic model and
    pickled as 'pysb_model'. All paths go through ``prefixed_pkl``.

    Parameters
    ----------
    stmts
        INDRA Statements to assemble.
    data_genes
        Passed to ``ac.filter_gene_list`` (with policy 'all') and to the
        contextualization helpers.
    contextualize
        If False (default), stop after the generic model. If True, also
        build, dump and pickle one model per entry of the module-level
        ``cell_lines``.

    Returns
    -------
    None
        All results are written to disk.
    """
    def _pickle_model(pysb_model, name):
        # Write one assembled model to its prefixed pickle path.
        with open(prefixed_pkl(name), 'wb') as fh:
            pickle.dump(pysb_model, fh)

    # Narrow down the INDRA Statements that go into the model
    selected = ac.filter_by_type(stmts, Complex, invert=True)
    selected = ac.filter_direct(selected)
    selected = ac.filter_belief(selected, 0.95)
    selected = ac.filter_top_level(selected)
    # Strip the extraneous supports/supported_by links; the return value is
    # ignored, so this relies on strip_supports modifying its argument.
    strip_supports(selected)
    selected = ac.filter_gene_list(selected, data_genes, 'all')
    selected = ac.filter_enzyme_kinase(selected)
    selected = ac.filter_mod_nokinase(selected)
    selected = ac.filter_transcription_factor(selected)

    # First linker pass: simplify activity types and modifications
    simplifier = MechLinker(selected)
    simplifier.gather_explicit_activities()
    simplifier.reduce_activities()
    simplifier.gather_modifications()
    simplifier.reduce_modifications()
    normalized = normalize_active_forms(simplifier.statements)

    # Second linker pass: replace activations where possible and require
    # active forms
    linker = MechLinker(normalized)
    linker.gather_explicit_activities()
    linker.replace_activations()
    linker.require_active_forms()

    # Remove inconsequential PTMs and activities repeatedly, stopping as
    # soon as a round no longer reduces the number of Statements.
    previous_count = len(linker.statements)
    still_shrinking = True
    while still_shrinking:
        linker.statements = ac.filter_inconsequential_mods(
            linker.statements, get_mod_whitelist())
        linker.statements = ac.filter_inconsequential_acts(
            linker.statements, get_mod_whitelist())
        current_count = len(linker.statements)
        still_shrinking = current_count < previous_count
        previous_count = current_count

    # Save the Statements before the drug target Statements are added
    model_stmts = linker.statements
    ac.dump_statements(model_stmts, prefixed_pkl('pysb_stmts'))
    model_stmts += get_drug_target_statements()

    # Generic (non-contextualized) model
    pa = PysbAssembler()
    pa.add_statements(model_stmts)
    _pickle_model(pa.make_model(), 'pysb_model')

    # Everything below runs only when contextualization was requested
    if contextualize:
        # Cell lines in this tuple reuse the generic Statements and their
        # models are not contextualized.
        no_data_lines = ('COLO858', 'K2', 'MMACSF', 'MZ7MEL', 'WM1552C')
        for cell_line in cell_lines:
            has_data = cell_line not in no_data_lines
            if has_data:
                line_stmts = contextualize_stmts(model_stmts, cell_line,
                                                 data_genes)
            else:
                line_stmts = model_stmts
            pa = PysbAssembler()
            pa.add_statements(line_stmts)
            line_model = pa.make_model()
            if has_data:
                contextualize_model(line_model, cell_line, data_genes)
            ac.dump_statements(line_stmts,
                               prefixed_pkl('pysb_stmts_%s' % cell_line))
            _pickle_model(line_model, 'pysb_model_%s' % cell_line)