def generate_model(model_name):
    """Generate a simple model for end-to-end testing using natural language."""
    tp = trips.process_text('BRAF activates MAP2K1. '
                            'Active MAP2K1 activates MAPK1.')
    indra_stmts = tp.statements
    # NOTE(review): other call sites pass a list of SearchTerms as the third
    # argument; the bare string is kept here to preserve behavior -- confirm.
    emmaa_stmts = [EmmaaStatement(stmt, datetime.datetime.now(), 'MAPK1')
                   for stmt in indra_stmts]
    # Create a CXAssembled model, upload to NDEx and retrieve key
    #cxa = CxAssembler(indra_stmts)
    #cxa.make_model()
    #ndex_id = cxa.upload_model(private=False)
    config_dict = {
        'ndex': {'network': 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'},
        'search_terms': [{'db_refs': {'HGNC': '20974'},
                          'name': 'MAPK1',
                          'search_term': 'MAPK1',
                          'type': 'gene'}]}
    emmaa_model = EmmaaModel(model_name, config_dict)
    emmaa_model.add_statements(emmaa_stmts)
    return emmaa_model, config_dict
def test_model_json():
    """Test the json structure and content of EmmaaModel.to_json() output."""
    indra_stmts = [
        Activation(Agent('BRAF', db_refs={'HGNC': '20974'}),
                   Agent('MAP2K1'),
                   evidence=[Evidence(text='BRAF activates MAP2K1.')]),
        Activation(Agent('MAP2K1',
                         activity=ActivityCondition('activity', True)),
                   Agent('MAPK1'),
                   evidence=[Evidence(text='Active MAP2K1 activates MAPK1.')])]
    st = SearchTerm('gene', 'MAP2K1', db_refs={}, search_term='MAP2K1')
    emmaa_stmts = [EmmaaStatement(stmt, datetime.datetime.now(), [st])
                   for stmt in indra_stmts]
    config_dict = {
        'ndex': {'network': 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'},
        'search_terms': [{'db_refs': {'HGNC': '20974'},
                          'name': 'MAPK1',
                          'search_term': 'MAPK1',
                          'type': 'gene'}]}
    emmaa_model = EmmaaModel('test', config_dict)
    emmaa_model.add_statements(emmaa_stmts)
    emmaa_model_json = emmaa_model.to_json()
    # Test json structure
    assert emmaa_model_json['name'] == 'test'
    assert isinstance(emmaa_model_json['stmts'], list)
    assert emmaa_model_json['ndex_network'] == \
        'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'
    # Test config
    assert emmaa_model_json['search_terms'][0]['type'] == 'gene'
    assert emmaa_model_json['search_terms'][0]['db_refs'] == {'HGNC': '20974'}
    # Test json statements (an exact duplicate of the first evidence
    # assertion was removed).
    assert 'BRAF activates MAP2K1.' == \
        emmaa_model_json['stmts'][0]['stmt']['evidence'][0]['text']
    assert 'Active MAP2K1 activates MAPK1.' == \
        emmaa_model_json['stmts'][1]['stmt']['evidence'][0]['text']
    assert emmaa_model_json['stmts'][0]['stmt']['subj']['name'] == 'BRAF'
    assert emmaa_model_json['stmts'][1]['stmt']['subj']['name'] == 'MAP2K1'
    assert emmaa_model_json['stmts'][1]['stmt']['obj']['name'] == 'MAPK1'
    # Need hashes to be strings so that javascript can read them
    assert isinstance(
        emmaa_model_json['stmts'][0]['stmt']['evidence'][0]['source_hash'],
        str)
def create_model(relevance=None, paper_ids=None):
    """Return a two-statement test EmmaaModel, optionally with a relevance filter."""
    indra_stmts = [
        Activation(Agent('BRAF', db_refs={'HGNC': '1097'}),
                   Agent('MAP2K1', db_refs={'HGNC': '6840'}),
                   evidence=[Evidence(text='BRAF activates MAP2K1.',
                                      source_api='assertion',
                                      text_refs={'TRID': '1234'})]),
        Activation(Agent('MAP2K1', db_refs={'HGNC': '6840'},
                         activity=ActivityCondition('activity', True)),
                   Agent('MAPK1', db_refs={'HGNC': '6871'}),
                   evidence=[Evidence(text='Active MAP2K1 activates MAPK1.',
                                      source_api='assertion',
                                      text_refs={'TRID': '2345'})])]
    st = SearchTerm('gene', 'MAP2K1', db_refs={}, search_term='MAP2K1')
    emmaa_stmts = [
        EmmaaStatement(indra_stmts[0], datetime.datetime.now(), [st],
                       {'internal': True, 'curated': False}),
        EmmaaStatement(indra_stmts[1], datetime.datetime.now(), [st],
                       {'internal': True, 'curated': True})]
    config_dict = {
        'ndex': {'network': 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'},
        'search_terms': [{'db_refs': {'HGNC': '20974'}, 'name': 'MAPK1',
                          'search_term': 'MAPK1', 'type': 'gene'}],
        'human_readable_name': 'Test Model',
        'test': {
            'statement_checking': {'max_path_length': 5, 'max_paths': 1},
            'test_corpus': 'simple_tests',
            'mc_types': ['pysb', 'pybel', 'signed_graph', 'unsigned_graph']},
        'assembly': [
            {'function': 'filter_no_hypothesis'},
            {'function': 'map_grounding'},
            {'function': 'filter_grounded_only'},
            {'function': 'filter_human_only'},
            {'function': 'map_sequence'},
            {'function': 'run_preassembly',
             'kwargs': {'return_toplevel': False}}]}
    # Optionally append a relevance filtering step to the assembly pipeline.
    if relevance:
        config_dict['assembly'].append(
            {'function': 'filter_relevance', 'kwargs': {'policy': relevance}})
    emmaa_model = EmmaaModel('test', config_dict, paper_ids)
    emmaa_model.add_statements(emmaa_stmts)
    return emmaa_model
def test_load_model():
    """Check that a model saved to the mock S3 bucket can be loaded back."""
    # Local imports are recommended when using moto
    from emmaa.model import EmmaaModel
    # setup_bucket is called for its side effect of populating the mock
    # bucket; the returned client was never used, so it is not bound.
    setup_bucket(add_model=True)
    em = EmmaaModel.load_from_s3('test', bucket='test_bucket')
    assert isinstance(em, EmmaaModel)
    assert len(em.stmts) == 2, len(em.stmts)
    assert em.name == 'test'
def load_from_statements(cls, model_name, mode='local', date=None,
                         bucket=EMMAA_BUCKET_NAME):
    """Create a manager from a model's assembled statements stored on S3."""
    config = load_config_from_s3(model_name, bucket=bucket)
    # Pick the paper ID dump for the given date, otherwise the latest one.
    if date:
        prefix = f'papers/{model_name}/paper_ids_{date}'
    else:
        prefix = f'papers/{model_name}/paper_ids_'
    paper_key = find_latest_s3_file(bucket, prefix, 'json')
    paper_ids = load_json_from_s3(bucket, paper_key) if paper_key else None
    model = EmmaaModel(model_name, config, paper_ids)
    # Loading assembled statements to avoid reassembly
    stmts, fname = get_assembled_statements(model_name, date, bucket)
    model.assembled_stmts = stmts
    model.date_str = strip_out_date(fname, 'datetime')
    return cls(model, mode=mode)
def upload_prior(ctype, config, gene_names):
    """Load pickled prior statements for a model type and push them to NDEx."""
    prior_path = f'../models/{ctype}/prior_stmts.pkl'
    with open(prior_path, 'rb') as fh:
        stmts = pickle.load(fh)
    estmts = get_emmaa_statements(stmts, gene_names)
    model = EmmaaModel(ctype, config)
    model.add_statements(estmts)
    model.update_to_ndex()
def test_model_extend():
    """Extending a model should only add statements with novel evidence."""
    evidences = [Evidence(pmid='1234', text=txt, source_api='x')
                 for txt in ('abcd', 'abcde', 'abcd')]
    indra_sts = [Phosphorylation(None, Agent('a'), evidence=ev)
                 for ev in evidences]
    emmaa_sts = [EmmaaStatement(st, datetime.datetime.now(), [])
                 for st in indra_sts]
    em = EmmaaModel('x', {'search_terms': [], 'ndex': {'network': None}})
    em.add_statements([emmaa_sts[0]])
    # The third statement duplicates the first, so only one of the two is kept.
    em.extend_unique(emmaa_sts[1:])
    assert len(em.stmts) == 2
    # A statement about a different agent is genuinely new.
    new_stmt = EmmaaStatement(
        Phosphorylation(None, Agent('b'), evidence=evidences[0]),
        datetime.datetime.now(), [])
    em.extend_unique([new_stmt])
    assert len(em.stmts) == 3
def make_model(self):
    """Make an EmmaaModel and upload it along with the config to S3."""
    config = self.make_config()
    em = EmmaaModel(self.name, config)
    em.stmts = self.stmts
    # Record the freshly created NDEx network UUID in the config before
    # persisting both the config and the model.
    config['ndex'] = {'network': em.upload_to_ndex()}
    save_config_to_s3(self.name, config)
    em.save_to_s3()
def upload_prior(ctype, config):
    """Load pickled prior statements, wrap them and upload the model to NDEx."""
    with open(f'models/{ctype}/prior_stmts.pkl', 'rb') as fh:
        stmts = pickle.load(fh)
    estmts = [EmmaaStatement(stmt, datetime.datetime.now(), [])
              for stmt in stmts]
    model = EmmaaModel(ctype, config)
    model.add_statements(estmts)
    model.upload_to_ndex()
def get_statements(self, mode='all', batch_size=100):
    """Return EMMAA Statements for this prior's literature set.

    Parameters
    ----------
    mode : 'all' or 'distilled'
        The 'distilled' mode makes sure that the "best", non-redundant
        set of raw statements are found across potentially redundant text
        contents and reader versions. The 'all' mode doesn't do such
        distillation but is significantly faster.
    batch_size : Optional[int]
        Determines how many PMIDs to fetch statements for in each
        iteration. Default: 100.

    Returns
    -------
    list of EmmaaStatement
        A list of EMMAA Statements corresponding to extractions from the
        subset of literature defined by this prior's search terms.
    """
    # Serve cached statements if they were already assembled earlier.
    if self.stmts:
        return self.stmts
    terms_to_pmids = EmmaaModel.search_pubmed(
        search_terms=self.search_terms, date_limit=None)
    # Invert the mapping so each PMID lists the terms that retrieved it.
    pmids_to_terms = defaultdict(list)
    for term, pmids in terms_to_pmids.items():
        for pmid in pmids:
            pmids_to_terms[pmid].append(term)
    pmids_to_terms = dict(pmids_to_terms)
    raw_statements_by_pmid = get_raw_statements_for_pmids(
        set(pmids_to_terms.keys()), mode=mode, batch_size=batch_size)
    # All statements from this run share a single retrieval timestamp.
    timestamp = datetime.datetime.now()
    for pmid, stmts in raw_statements_by_pmid.items():
        self.stmts.extend(
            EmmaaStatement(stmt, timestamp, pmids_to_terms[pmid],
                           {'internal': True})
            for stmt in stmts)
    return self.stmts
def model_to_tests(model_name, upload=True, bucket=EMMAA_BUCKET_NAME):
    """Assemble a model from S3 and turn its statements into a test corpus."""
    em = EmmaaModel.load_from_s3(model_name, bucket=bucket)
    em.run_assembly()
    # Only statements with all agents present become checking tests.
    tests = [StatementCheckingTest(stmt) for stmt in em.assembled_stmts
             if all(stmt.agent_list())]
    date_str = make_date_str()
    test_description = (
        f'These tests were generated from the {em.human_readable_name} '
        f'on {date_str[:10]}')
    test_dict = {'test_data': {'description': test_description},
                 'tests': tests}
    if upload:
        save_pickle_to_s3(test_dict, bucket,
                          f'tests/{model_name}_tests_{date_str}.pkl')
    return test_dict
def create_upload_model(model_name, full_name, indra_stmts, ndex_id=None):
    """Make and upload an EMMAA model from a list of INDRA Statements.

    Parameters
    ----------
    model_name : str
        Short name of the model to use on S3.
    full_name : str
        Human-readable model name to use in EMMAA dashboard.
    indra_stmts : list of indra.statement
        INDRA Statements to be used to populate the EMMAA model.
    ndex_id : str
        UUID of the network corresponding to the model on NDex. If provided,
        the NDex network will be updated with the latest model content.
        If None (default), a new network will be created and the UUID stored
        in the model config files on S3.
    """
    emmaa_stmts = to_emmaa_stmts(indra_stmts, datetime.datetime.now(), [])
    # Get updated CX content for the INDRA Statements
    cxa = CxAssembler(indra_stmts)
    cx_str = cxa.make_model()
    # If we don't have an NDex ID, create network and upload to Ndex
    if ndex_id is None:
        ndex_id = cxa.upload_model(private=False)
        print(f'NDex ID for {model_name} is {ndex_id}.')
    # If the NDEx ID is provided, update the existing network
    else:
        ndex_client.update_network(cx_str, ndex_id)
    # Create the config dictionary
    config_dict = {'ndex': {'network': ndex_id}, 'search_terms': []}
    # Create EMMAA model
    emmaa_model = EmmaaModel(model_name, config_dict)
    emmaa_model.add_statements(emmaa_stmts)
    # Upload model to S3
    emmaa_model.save_to_s3()
    # Upload the config JSON exactly once (the original uploaded the same
    # object twice in a row).
    s3_client = boto3.client('s3')
    config_json = json.dumps(config_dict)
    s3_client.put_object(Body=config_json.encode('utf8'),
                         Key='models/%s/config.json' % model_name,
                         Bucket='emmaa')
def update_cancer(cancer_type):
    """Update the model for the given cancer.

    A JSON config file must be present for the given cancer type, located
    in the models/<cancer_type>/config.json.

    Parameters
    ----------
    cancer_type : str
        A short string which is the name of the cancer, and corresponds
        to a directory in the models directory, as described above.
    """
    print(cancer_type)
    with open(f'models/{cancer_type}/prior_stmts.pkl', 'rb') as fh:
        stmts = pickle.load(fh)
    # Use a context manager so the config file handle is closed; the
    # original json.load(open(...)) leaked the file object.
    with open(f'models/{cancer_type}/config.json', 'r') as fh:
        config = json.load(fh)
    em = EmmaaModel(cancer_type, config)
    ess = [EmmaaStatement(st, datetime.datetime.now(), []) for st in stmts]
    em.add_statements(ess)
    em.save_to_s3()
def create_upload_model(model_name, indra_stmts, config_file):
    """Make and upload an EMMAA model from a list of INDRA Statements.

    Parameters
    ----------
    model_name : str
        Name of the model to use on S3.
    indra_stmts : list of indra.statement
        INDRA Statements to be used to populate the EMMAA model.
    config_file : str
        Path to the local config.json file.
    """
    emmaa_stmts = to_emmaa_stmts(indra_stmts, datetime.datetime.now(), [],
                                 {'internal': True})
    # Load config information
    with open(config_file, 'rt') as f:
        config_json = json.load(f)
    # If there is no ndex entry in the config, create a new network and
    # update the config file with the NDex network ID
    if 'ndex' not in config_json:
        cxa = CxAssembler(indra_stmts)
        # The CX string is only needed inside the assembler for upload, so
        # the return value is not bound.
        cxa.make_model()
        ndex_id = cxa.upload_model(private=False)
        print(f'NDex ID for {model_name} is {ndex_id}.')
        config_json['ndex'] = {'network': ndex_id}
        updated_config_file = f'{config_file}.updated'
        with open(updated_config_file, 'wt') as f:
            json.dump(config_json, f, indent=2)
    # If the NDEx ID is provided we don't need to update the existing
    # network because this will occur as part of the model assembly/update
    # procedure on EMMAA itself.
    # Create EMMAA model
    emmaa_model = EmmaaModel(model_name, config_json)
    emmaa_model.add_statements(emmaa_stmts)
    # Upload model to S3
    emmaa_model.save_to_s3()
    # Upload config JSON; save_config_to_s3 manages its own S3 access, so
    # the unused boto3 client the original created here has been removed.
    save_config_to_s3(model_name, config_json)
def make_model(self, estmts, upload_to_s3=False):
    """Return, and optionally upload to S3 an initial EMMAA Model.

    Parameters
    ----------
    estmts : list of emmaa.statement.EmmaaStatement
        A list of prior EMMAA Statements to initialize the model with.
    upload_to_s3 : Optional[bool]
        If True, the model and the config are uploaded to S3, otherwise
        the model object is just returned without upload. Default: False

    Returns
    -------
    emmaa.model.EmmaaModel
        The EMMAA Model object constructed from the generated config and
        the given EMMAA Statements.
    """
    # Local import -- presumably to avoid a circular import at module
    # load time; confirm before moving it to the top of the file.
    from emmaa.model import EmmaaModel
    config = self.make_config(upload_to_s3=upload_to_s3)
    model = EmmaaModel(name=self.name, config=config)
    model.add_statements(estmts)
    if upload_to_s3:
        model.save_to_s3()
    return model
'6840@HGNC&type=Activation&format=html'), ('Active MAP2K1 activates MAPK1.', 'https://db.indra.bio/statements/from_agents?subject=6840@HGNC&object=' '6871@HGNC&type=Activation&format=html')] } processed_link = '<a href="https://db.indra.bio/statements/from_agents?'\ 'subject=1097@HGNC&object=6840@HGNC&type=Activation&format=html" '\ 'target="_blank" class="status-link">'\ 'BRAF activates MAP2K1.</a>' query_not_appl = { 2413475507: [('Query is not applicable for this model', 'https://emmaa.readthedocs.io/en/latest/dashboard/response_codes.html')] } # Create a new ModelManager for tests instead of depending on S3 version test_model = EmmaaModel.load_from_s3('test') test_mm = ModelManager(test_model) def test_load_model_manager_from_s3(): mm = load_model_manager_from_s3('test') assert isinstance(mm, ModelManager) def test_format_results(): results = [('test', query_object, 'pysb', test_response, datetime.now())] formatted_results = format_results(results) assert len(formatted_results) == 1 assert formatted_results[0]['model'] == 'test' assert formatted_results[0]['query'] == simple_query assert formatted_results[0]['mc_type'] == 'pysb'
def update_model_manager_on_s3(model_name, bucket=EMMAA_BUCKET_NAME):
    """Rebuild a model's ModelManager from S3 and store it back."""
    loaded_model = EmmaaModel.load_from_s3(model_name, bucket=bucket)
    mm = ModelManager(loaded_model)
    save_model_manager_to_s3(model_name, mm, bucket=bucket)
    return mm
def test_filter_relevance():
    """Check assembly output with no filter, prior_one and prior_all filters."""
    config_dict = {
        'ndex': {'network': 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'},
        'search_terms': [{'db_refs': {'HGNC': '20974'},
                          'name': 'MAPK1',
                          'search_term': 'MAPK1',
                          'type': 'gene'}]}
    indra_stmts = [
        Activation(Agent('BRAF', db_refs={'HGNC': '20974'}),
                   Agent('MAP2K1'),
                   evidence=[Evidence(text='BRAF activates MAP2K1.',
                                      source_api='assertion')]),
        Activation(Agent('MAP2K1',
                         activity=ActivityCondition('activity', True)),
                   Agent('MAPK1'),
                   evidence=[Evidence(text='Active MAP2K1 activates '
                                           'MAPK1.',
                                      source_api='assertion')])]
    st = SearchTerm('gene', 'MAP2K1', db_refs={}, search_term='MAP2K1')
    emmaa_stmts = [EmmaaStatement(stmt, datetime.datetime.now(), [st])
                   for stmt in indra_stmts]

    def assembled(cfg):
        # Build a fresh model with the given config and return its
        # assembled statements.
        model = EmmaaModel('test', cfg)
        model.extend_unique(emmaa_stmts)
        model.run_assembly()
        return model.assembled_stmts

    # Try no filter first
    stmts = assembled(config_dict)
    assert len(stmts) == 2, stmts
    # Next do a prior_one filter
    config_dict['assembly'] = {'filter_relevance': 'prior_one'}
    stmts = assembled(config_dict)
    assert len(stmts) == 1, stmts
    assert stmts[0].obj.name == 'MAPK1'
    # Next do a prior_all filter
    config_dict['assembly'] = {'filter_relevance': 'prior_all'}
    assert len(assembled(config_dict)) == 0
import argparse

from emmaa.model import EmmaaModel
from emmaa.model_tests import ModelManager, save_model_manager_to_s3


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Script to update ModelManager stored on Amazon S3.')
    parser.add_argument('-m', '--model', help='Model name', required=True)
    args = parser.parse_args()
    # Load the model, refresh its NDEx network, persist its assembled
    # statements, then store the updated manager back on S3.
    model = EmmaaModel.load_from_s3(args.model)
    mm = ModelManager(model, mode='s3')
    mm.model.update_to_ndex()
    mm.save_assembled_statements()
    save_model_manager_to_s3(args.model, mm)
def run_model_tests_from_s3(model_name, upload_mm=True, upload_results=True,
                            upload_stats=True, registered_queries=True,
                            db=None):
    """Run a given set of tests on a given model, both loaded from S3.

    After loading both the model and the set of tests, model/test overlap
    is determined using a ScopeTestConnector and tests are run.

    Parameters
    ----------
    model_name : str
        Name of EmmaaModel to load from S3.
    upload_mm : Optional[bool]
        Whether to upload a model manager instance to S3 as a pickle file.
        Default: True
    upload_results : Optional[bool]
        Whether to upload test results to S3 in JSON format. Can be set
        to False when running tests. Default: True
    upload_stats : Optional[bool]
        Whether to upload latest statistics about model and a test.
        Default: True
    registered_queries : Optional[bool]
        If True, registered queries are fetched from the database and
        executed, the results are then saved to the database.
        Default: True
    db : Optional[emmaa.db.manager.EmmaaDatabaseManager]
        If given over-rides the default primary database.

    Returns
    -------
    emmaa.model_tests.ModelManager
        Instance of ModelManager containing the model data, list of
        applied tests and the test results.
    emmaa.analyze_test_results.StatsGenerator
        Instance of StatsGenerator containing statistics about model
        and test.
    """
    model = EmmaaModel.load_from_s3(model_name)
    test_corpus = model.test_config.get('test_corpus',
                                        'large_corpus_tests.pkl')
    tests = load_tests_from_s3(test_corpus)
    mm = ModelManager(model)
    if upload_mm:
        save_model_manager_to_s3(model_name, mm)
    test_manager = TestManager([mm], tests)
    test_manager.make_tests(ScopeTestConnector())
    test_manager.run_tests()
    results_json_dict = mm.results_to_json()
    results_json_str = json.dumps(results_json_dict, indent=1)
    # Optionally upload test results to S3
    if upload_results:
        client = get_s3_client(unsigned=False)
        date_str = make_date_str()
        result_key = f'results/{model_name}/results_{date_str}.json'
        logger.info(f'Uploading test results to {result_key}')
        client.put_object(Bucket='emmaa', Key=result_key,
                          Body=results_json_str.encode('utf8'))
    latest_round = TestRound(results_json_dict)
    sg = StatsGenerator(model_name, latest_round=latest_round)
    sg.make_stats()
    # Optionally upload statistics to S3
    if upload_stats:
        sg.save_to_s3()
    if registered_queries:
        qm = QueryManager(db=db, model_managers=[mm])
        qm.answer_registered_queries(model_name)
    return (mm, sg)
def get_indirect_stmts(corpus):
    """Load indirect statements from a BEL corpus and preassemble them."""
    cpath = os.path.join(indra.__path__[0], os.pardir, 'data',
                         f'{corpus}_corpus.bel')
    bp = bel.process_belscript(cpath)
    # Keep only statements whose first evidence is not marked as direct.
    indirect_stmts = [st for st in bp.statements
                      if not st.evidence[0].epistemics.get('direct')]
    return ac.run_preassembly(indirect_stmts, return_toplevel=False)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--corpus', default='large')
    parser.add_argument('--mode', default='dump')
    args = parser.parse_args()
    indirect_stmts = get_indirect_stmts(args.corpus)
    tests = [StatementCheckingTest(stmt) for stmt in indirect_stmts]
    if args.mode == 'dump':
        # Serialize the test corpus for later runs.
        with open(f'{args.corpus}_corpus_tests.pkl', 'wb') as f:
            pickle.dump(tests, f)
    elif args.mode == 'run':
        ctypes = ['rasmodel']
        models = [EmmaaModel(ctype) for ctype in ctypes]
        tm = TestManager(models, tests)
        tm.make_tests(ScopeTestConnector())
        tm.run_tests()
        print(tm.test_results)