def test_run_query(self):
    """Runs clinical and genomic nodes through ``run_query`` and checks the hits."""
    # Rebuild the engine so that its set of all sample ids in the database
    # includes the documents posted by the unit test setUp.
    self.me = MatchEngine(self.db)

    # clinical node: diagnosis plus age criterion
    clinical_node = {
        'type': 'clinical',
        'value': {
            'ONCOTREE_PRIMARY_DIAGNOSIS': 'Adrenal Gland',
            'AGE_NUMERICAL': '>=18'
        }
    }
    result, matches = self.me.run_query(clinical_node)

    # exactly one sample is returned and it is the fixture sample
    assert len(result) == 1, len(result)
    assert self.sample_id in result

    # genomic node with a negated ('!') gene criterion
    negated_node = {'type': 'genomic', 'value': {'HUGO_SYMBOL': '!BRAF'}}
    result, matches = self.me.run_query(negated_node)

    # nine samples are returned and the fixture sample is not among them
    assert len(result) == 9, len(result)
    assert self.sample_id not in result

    # after loading the v2 genomic fixtures, match documents carry extra fields
    self.add_genomic_v2()
    gene_node = {'type': 'genomic', 'value': {'HUGO_SYMBOL': 'WHSC1'}}
    result, matches = self.me.run_query(gene_node)
    first_match = matches[0]
    assert 'actionability' in first_match
    assert first_match['mmr_status'] == 'Proficient (MMR-P / MSS)'
def test_extract_hr_status(self):
    """Checks the hormone receptor statuses parsed from match clause 5 of the first step."""
    engine = MatchEngine(get_db())
    clause = trial['treatment_list']['step'][0]['match'][5]
    parser = ParseMatchTree(engine.create_match_tree(clause))

    statuses = parser.extract_hr_status()

    # order is irrelevant, so both sides are sorted before comparing
    expected = sorted(['HER2 Negative', 'ER Negative', 'PR Positive'])
    assert sorted(statuses) == expected
def test_extract_signatures(self):
    """Checks the first two signature lists parsed from match clause 3 of the first step."""
    engine = MatchEngine(get_db())
    clause = trial['treatment_list']['step'][0]['match'][3]
    graph = engine.create_match_tree(clause)

    signatures = ParseMatchTree(graph).extract_signatures()

    # element 0 must contain the MMR value, element 1 the MSI value
    assert 'MMR-D' in signatures[0]
    assert 'MSI-H' in signatures[1]
def test_extract_hr_status(self):
    """Verifies ``extract_hr_status`` on match clause 5 of the first treatment step."""
    match_clause = trial['treatment_list']['step'][0]['match'][5]
    match_graph = MatchEngine(get_db()).create_match_tree(match_clause)

    hr_statuses = ParseMatchTree(match_graph).extract_hr_status()

    # compare as sorted lists so the extraction order does not matter
    wanted = ['HER2 Negative', 'ER Negative', 'PR Positive']
    assert sorted(hr_statuses) == sorted(wanted)
def run_matchengine():
    """
    Runs the matchengine against the application database.

    Builds a MatchEngine on the connection returned by ``database.get_db()``
    and calls its ``find_trial_matches`` method. The function takes no
    arguments and returns nothing.
    """
    engine = MatchEngine(database.get_db())
    engine.find_trial_matches()
def trial_insert(items):
    """
    Adds the derived ``_summary``, ``_suggest`` and ``_elasticsearch`` fields
    to every trial document in ``items``.

    :param items: Iterable of trial documents; each one is modified in place
    :return: The same ``items`` object that was passed in
    """
    db = database.get_db()

    for trial_doc in items:
        logging.info("trial inserted")

        # a fresh engine is built for every document
        engine = MatchEngine(db)
        _, trial_tree = engine.create_trial_tree(trial_doc, no_validate=True)

        # per-trial accumulators filled while walking the tree
        genomic, clinical, other = {}, {}, {}

        for node in trial_tree.nodes():
            attrs = trial_tree.node[node]

            # nodes without an id are ignored
            if 'node_id' not in attrs:
                continue
            node_id = attrs['node_id']

            # multi-level nodes (right now only match) carry their own tree
            if 'match_tree' in attrs:
                match_tree = attrs['match_tree']
                for sub_node in match_tree:
                    sub_attrs = match_tree.node[sub_node]
                    if sub_attrs['type'] == 'genomic':
                        insert_data_genomic(genomic, sub_attrs['value'], node_id)
                    if sub_attrs['type'] == 'clinical':
                        insert_data_clinical(clinical, sub_attrs['value'], node_id)

            # every node with an id is also recorded in ``other``
            insert_data_other(trial_tree, node_id, node, other)

        # the summary is stored first because Autocomplete reads item['_summary']
        trial_doc['_summary'] = Summary(clinical, genomic, other, trial_tree).create_summary(trial_doc)

        suggest, elasticsearch, primary_tumor_types = Autocomplete(trial_doc).add_autocomplete()
        trial_doc['_suggest'] = suggest
        trial_doc['_elasticsearch'] = elasticsearch
        trial_doc['_summary']['primary_tumor_types'] = primary_tumor_types

    return items
def test_extract_variants(self):
    """
    Checks the variant buckets returned by ``extract_variants`` for match
    clauses 0, 1, 2 and 4 of the first step of the module-level ``trial``.
    """
    engine = MatchEngine(get_db())
    clauses = trial['treatment_list']['step'][0]['match']

    def variants_of(index):
        # build the match tree for one clause and parse its variants
        graph = engine.create_match_tree(clauses[index])
        return ParseMatchTree(graph).extract_variants()

    # clause 0: three variants and one wildtype
    found = variants_of(0)
    assert 'BRAF V600E' in found['variants']
    assert 'BRAF V600K' in found['variants']
    assert 'KRAS any' in found['variants']
    assert 'EGFR wt' in found['wts']
    assert len(found['variants']) == 3
    assert len(found['wts']) == 1

    # clause 1: one CNV, one SV and one exclusion
    found = variants_of(1)
    assert 'PTEN CNV' in found['cnvs']
    assert 'BRCA1 SV' in found['svs']
    assert 'BRAF V600' in found['exclusions']
    assert len(found['variants']) == 0
    assert len(found['cnvs']) == 1
    assert len(found['svs']) == 1
    assert len(found['wts']) == 0
    assert len(found['exclusions']) == 1

    # clause 2: the variant appears only as an exclusion
    found = variants_of(2)
    assert 'BRAF V600E' not in found['variants']
    assert len(found['variants']) == 0
    assert len(found['wts']) == 0
    assert 'BRAF V600E' in found['exclusions']

    # clause 4: a mix of every bucket
    found = variants_of(4)
    assert 'BRAF V600K' in found['variants']
    assert 'EGFR any' in found['variants']
    assert len(found['variants']) == 2
    assert 'PTEN CNV' in found['cnvs']
    assert len(found['cnvs']) == 1
    assert 'KRAS' in found['exclusions']
    assert 'NRAS' in found['exclusions']
    assert len(found['exclusions']) == 2
    assert 'NTRK1 wt' in found['wts']
    assert len(found['wts']) == 1
def test_trial_summary(self):
    """
    Checks that ``Summary.create_summary`` emits every expected field, both
    for the full trial document and after its source fields are removed.
    """
    import copy

    # Work on a private copy: this test deletes keys, and doing that on the
    # shared module-level ``on_trial`` fixture would break a second run of
    # this test (KeyError on ``del``) and leak into other tests.
    trial_doc = copy.deepcopy(on_trial)
    trial_doc['_genomic'] = {'hugo_symbol': [{'value': 'TEST'}]}
    trial_doc['_clinical'] = {'disease_status': [{'value': ['TEST']}]}
    other = {'management_group': [{'value': 'TEST'}]}

    # create trial tree
    me = MatchEngine(get_db())
    status, trial_tree = me.create_trial_tree(trial_doc, no_validate=True)

    # validate summary
    summary = Summary(trial_doc['_clinical'], trial_doc['_genomic'], other, trial_tree)
    item = summary.create_summary(trial_doc)
    status_fields = [
        'drugs', 'genes', 'tumor_types', 'sponsor', 'phase_summary',
        'accrual_goal', 'investigator', 'age_summary', 'protocol_number',
        'disease_status', 'nct_number', 'disease_center', 'short_title',
        'hormone_receptor_status'
    ]
    for field in status_fields:
        assert field in item, self._debug(item, field)
        # these fields are allowed to be present but empty
        if field not in ['dfci_investigator', 'hormone_receptor_status']:
            assert item[field], '%s| %s' % (field, item)

    # remove all source fields and validate that the summary will not error
    for key in ('age', 'phase', 'nct_id', 'protocol_no', 'principal_investigator',
                'cancer_center_accrual_goal_upper', 'site_list', 'sponsor_list',
                'drug_list', 'staff_list'):
        del trial_doc[key]

    status, trial_tree = me.create_trial_tree(trial_doc, no_validate=True)
    summary = Summary(trial_doc['_clinical'], trial_doc['_genomic'], other, trial_tree)
    item = summary.create_summary(trial_doc)
    for field in status_fields:
        assert field in item, self._debug(item, field)
def test_extract_cancer_types(self):
    """
    Checks ``extract_cancer_types`` for a specific diagnosis and for the
    ``_SOLID_`` and ``_LIQUID_`` diagnosis values.
    """
    def cancer_types_of(tree_doc):
        # every example gets its own engine, graph and parser
        engine = MatchEngine(get_db())
        graph = engine.create_match_tree(tree_doc)
        return ParseMatchTree(graph).extract_cancer_types()

    # a single named diagnosis
    types = cancer_types_of(match_tree_example)
    assert sorted(types['diagnoses']) == sorted(['Ocular Melanoma']), types['diagnoses']
    expanded = ['Ocular Melanoma', 'Uveal Melanoma', 'Conjunctival Melanoma']
    assert sorted(types['cancer_types_expanded']) == sorted(expanded), types['cancer_types_expanded']
    assert sorted(types['excluded_cancer_types']) == [], types['excluded_cancer_types']
    assert types['primary_cancer_types'] == ['Eye'], types['primary_cancer_types']

    # the solid tumor value
    types = cancer_types_of(match_tree_example2)
    assert sorted(types['diagnoses']) == sorted(['_SOLID_']), types['diagnoses']
    assert 'Acute Lymphoid Leukemia' not in types['cancer_types_expanded'], types['cancer_types_expanded']
    assert sorted(types['excluded_cancer_types']) == [], types['excluded_cancer_types']
    assert types['primary_cancer_types'] == ['All Solid Tumors'], types['primary_cancer_types']

    # the liquid tumor value
    types = cancer_types_of(match_tree_example3)
    assert sorted(types['diagnoses']) == sorted(['_LIQUID_']), types['diagnoses']
    assert 'Acute Lymphoid Leukemia' in types['cancer_types_expanded'], types['cancer_types_expanded']
    assert sorted(types['excluded_cancer_types']) == [], types['excluded_cancer_types']
    assert types['primary_cancer_types'] == ['All Liquid Tumors'], types['primary_cancer_types']
def test_trial_summary(self):
    """
    Validates the fields produced by ``Summary.create_summary``, first on the
    complete trial document and then after its source fields are deleted.
    """
    import copy

    # The shared module-level ``on_trial`` fixture must not be mutated: the
    # ``del`` statements below would raise KeyError on a second run and would
    # change what other tests see. Everything here operates on a deep copy.
    local_trial = copy.deepcopy(on_trial)
    local_trial['_genomic'] = {'hugo_symbol': [{'value': 'TEST'}]}
    local_trial['_clinical'] = {'disease_status': [{'value': ['TEST']}]}
    other = {'management_group': [{'value': 'TEST'}]}

    db = get_db()
    me = MatchEngine(db)

    def summarize():
        # rebuild the trial tree and the summary from the current document
        status, trial_tree = me.create_trial_tree(local_trial, no_validate=True)
        summary = Summary(local_trial['_clinical'], local_trial['_genomic'], other, trial_tree)
        return summary.create_summary(local_trial)

    status_fields = [
        'drugs', 'genes', 'tumor_types', 'sponsor', 'phase_summary',
        'accrual_goal', 'investigator', 'age_summary', 'protocol_number',
        'disease_status', 'nct_number', 'disease_center', 'short_title',
        'hormone_receptor_status'
    ]

    # validate summary of the complete document
    item = summarize()
    for field in status_fields:
        assert field in item, self._debug(item, field)
        # these fields may legitimately be empty
        if field not in ['dfci_investigator', 'hormone_receptor_status']:
            assert item[field], '%s| %s' % (field, item)

    # remove all source fields and validate that the summary will not error
    removed_keys = [
        'age', 'phase', 'nct_id', 'protocol_no', 'principal_investigator',
        'cancer_center_accrual_goal_upper', 'site_list', 'sponsor_list',
        'drug_list', 'staff_list'
    ]
    for key in removed_keys:
        del local_trial[key]

    item = summarize()
    for field in status_fields:
        assert field in item, self._debug(item, field)
def test_ms_status(self):
    """Checks that ``mmr_status`` and ``ms_status`` genomic entries reach the summary."""
    on_trial['_clinical'] = {}
    engine = MatchEngine(get_db())

    cases = [('mmr_status', 'MMR-Proficient'), ('ms_status', 'MSI-H')]
    for field, value in cases:
        # one genomic entry at a time, then rebuild the tree and the summary
        on_trial['_genomic'] = {field: [{'value': value}]}
        _, trial_tree = engine.create_trial_tree(on_trial, no_validate=True)
        builder = Summary(on_trial['_clinical'], on_trial['_genomic'], {}, trial_tree)
        item = builder.create_summary(on_trial)
        assert field in item, self._debug(item, field)
def test_extract_genes(self):
    """Checks the gene list parsed from match clauses 0 and 2 of the first step."""
    engine = MatchEngine(get_db())
    clauses = trial['treatment_list']['step'][0]['match']

    def genes_of(index):
        # build the match tree for one clause and parse its genes
        return ParseMatchTree(engine.create_match_tree(clauses[index])).extract_genes()

    # clause 0: exactly BRAF and KRAS
    genes = genes_of(0)
    assert 'BRAF' in genes, genes
    assert 'KRAS' in genes, genes
    assert 'EGFR' not in genes
    assert 'test' not in genes, genes
    assert len(genes) == 2, genes

    # clause 2: BRAF must not be reported
    genes = genes_of(2)
    assert 'BRAF' not in genes, genes
def __init__(self, item):
    """
    Sets up the state used to build ElasticSearch autocomplete data.

    :param item: Trial document; the following keys are read:
        - _summary: Summary created for the trial
        - treatment_list: Nested dictionary containing all match criteria
    """
    self.summary = item['_summary']
    self.treatment_list = item['treatment_list']

    # accumulators, filled while the match trees are parsed
    variant_buckets = ('variants', 'wts', 'svs', 'cnvs', 'exclusions')
    self.vdict = dict((bucket, []) for bucket in variant_buckets)
    self.genes = []
    self.cancer_type_dict = None

    # engine used to turn match clauses into match trees
    self.m = MatchEngine(get_db())
def _get_signatures(self, item):
    """
    Collects hormone receptor statuses and signature values from every match
    clause of the trial into ``self.mmr``, ``self.ms``, ``self.sigs`` and
    ``self.hr``.

    :param item: Trial document
    """
    engine = MatchEngine(get_db())

    def collect(level):
        # parse the first match clause of a step, arm or dose level, if any
        if 'match' not in level:
            return
        parser = ParseMatchTree(engine.create_match_tree(level['match'][0]))
        signatures = parser.extract_signatures()
        self.mmr.extend(signatures[0])
        self.ms.extend(signatures[1])
        self.sigs.extend(signatures[2])
        self.hr.extend(parser.extract_hr_status())

    # walk step -> arm -> dose level, visiting each level in document order
    for step in item['treatment_list']['step']:
        collect(step)
        if 'arm' not in step:
            continue
        for arm in step['arm']:
            collect(arm)
            if 'dose_level' not in arm:
                continue
            for dose in arm['dose_level']:
                collect(dose)
def _resolve_export_target(outpath, json_format):
    """
    Works out the export file format and the extension-less output path.

    :param outpath: Requested output path, or a falsy value for the default
    :param json_format: When true, JSON is used regardless of the extension
    :return: Tuple ``(file_format, base_path)``; ``file_format`` is ``'json'``
        or ``'csv'``
    """
    import os

    if outpath:
        # splitext only strips the final extension, so relative paths such as
        # './results.json' and directories containing dots stay intact
        base_path, extension = os.path.splitext(outpath)
    else:
        base_path, extension = './results', ''

    if json_format:
        file_format = 'json'
    else:
        file_format = extension[1:]
        # anything other than a recognized extension falls back to csv
        if file_format not in ('json', 'csv'):
            file_format = 'csv'

    return file_format, base_path


def match(args):
    """
    Matches all trials in database to patients

    :param args: Parsed command line arguments. The attributes read are:
        - mongo_uri: passed to ``get_db`` and ``export_results``
        - daemon: when true, the matchengine is re-run every 24 hours and
          never exports; when false it runs once, exports and returns
        - json_format: when true, results are exported as JSON
        - outpath: optional output path; its extension selects the format
    """
    db = get_db(args.mongo_uri)

    while True:
        me = MatchEngine(db)
        me.find_trial_matches()

        if args.daemon:
            time.sleep(86400)  # sleep for 24 hours
            continue

        # single run: export the results and stop
        file_format, outpath = _resolve_export_target(args.outpath, args.json_format)
        export_results(args.mongo_uri, file_format, outpath)
        break
def _get_signatures(self, item):
    """
    Collects hormone receptor statuses and signature values from every match
    clause of the trial into ``self.mmr``, ``self.ms`` and ``self.hr``.

    :param item: Trial document
    """
    engine = MatchEngine(get_db())

    def harvest(level):
        # parse the first match clause of a step, arm or dose level, if any
        if 'match' not in level:
            return
        parser = ParseMatchTree(engine.create_match_tree(level['match'][0]))
        signatures = parser.extract_signatures()
        self.mmr.extend(signatures[0])
        self.ms.extend(signatures[1])
        self.hr.extend(parser.extract_hr_status())

    # walk step -> arm -> dose level, visiting each level in document order
    for step in item['treatment_list']['step']:
        harvest(step)
        if 'arm' not in step:
            continue
        for arm in step['arm']:
            harvest(arm)
            if 'dose_level' not in arm:
                continue
            for dose in arm['dose_level']:
                harvest(dose)
def test_extract_cancer_types(self):
    """
    Verifies the cancer type dictionary returned by ``extract_cancer_types``
    for three example match trees.
    """
    def parse(example):
        # a new engine and parser are created for every example tree
        graph = MatchEngine(get_db()).create_match_tree(example)
        return ParseMatchTree(graph).extract_cancer_types()

    # example 1: a named diagnosis is expanded and mapped to its primary type
    result = parse(match_tree_example)
    assert sorted(result['diagnoses']) == sorted(['Ocular Melanoma']), result['diagnoses']
    assert sorted(result['cancer_types_expanded']) == sorted([
        'Ocular Melanoma', 'Uveal Melanoma', 'Conjunctival Melanoma'
    ]), result['cancer_types_expanded']
    assert sorted(result['excluded_cancer_types']) == [], result['excluded_cancer_types']
    assert result['primary_cancer_types'] == ['Eye'], result['primary_cancer_types']

    # example 2: _SOLID_ does not expand to a leukemia
    result = parse(match_tree_example2)
    assert sorted(result['diagnoses']) == sorted(['_SOLID_']), result['diagnoses']
    assert 'Acute Lymphoid Leukemia' not in result['cancer_types_expanded'], \
        result['cancer_types_expanded']
    assert sorted(result['excluded_cancer_types']) == [], result['excluded_cancer_types']
    assert result['primary_cancer_types'] == ['All Solid Tumors'], result['primary_cancer_types']

    # example 3: _LIQUID_ does expand to a leukemia
    result = parse(match_tree_example3)
    assert sorted(result['diagnoses']) == sorted(['_LIQUID_']), result['diagnoses']
    assert 'Acute Lymphoid Leukemia' in result['cancer_types_expanded'], \
        result['cancer_types_expanded']
    assert sorted(result['excluded_cancer_types']) == [], result['excluded_cancer_types']
    assert result['primary_cancer_types'] == ['All Liquid Tumors'], result['primary_cancer_types']
class TestMatchEngine(TestSetUp):
    """Tests for MatchEngine YAML validation, criteria preparation and tree building."""

    # Engine under test. The tests below call methods on self.me, so it is
    # presumably assigned by the parent setUp -- TODO confirm.
    me = None

    def setUp(self):
        """Runs the parent setUp, then loads the clinical and genomic fixtures."""
        super(TestMatchEngine, self).setUp()

        # add clinical collection: one extra document per special diagnosis value
        self.db.clinical.insert_many([{
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': onc,
            'SAMPLE_ID': self.sample_id,
            'VITAL_STATUS': 'alive',
            'DFCI_MRN': self.mrn,
            'REPORT_DATE': self.static_date,
            'BIRTH_DATE': self.static_date,
            'GENDER': 'Male'
        } for onc in ['_LIQUID_', '_SOLID_']])

        self.add_clinical()
        self.add_genomic()

    def tearDown(self):
        """Drops the clinical and genomic collections."""
        self.db.clinical.drop()
        self.db.genomic.drop()

    def test_validate_yaml_format(self):
        """Format validation returns status 1 for 00-000.yml and 0 for 00-001.yml."""
        # Read yaml file
        test_inp = read_file(os.path.join(YAML_DIR, '00-000.yml'))

        # Parse yaml document, The status should be 1 as validation should fail
        status, data = self.me.validate_yaml_format(test_inp)
        assert status == 1

        # Parse yaml document, The status should be 0 as validation should pass
        test_inp = read_file(os.path.join(YAML_DIR, '00-001.yml'))
        status, data = self.me.validate_yaml_format(test_inp)
        assert status == 0

        # Assert the protocol id is correctly annotated
        assert data['protocol_no'] == '00-001'
        # written as a plain 1: the former literal 00001 is a syntax error on Python 3
        assert data['protocol_id'] == 1

    def test_validate_yaml_data(self):
        """Schema validation passes for 00-001.yml and reports the missing protocol_id of 00-002.yml."""
        # Read yaml file
        test_inp = read_file(os.path.join(YAML_DIR, '00-001.yml'))
        status, data = self.me.validate_yaml_format(test_inp)

        # Validate the schema for yaml document
        errors = self.me.validate_yaml_data(data)

        # Assert that there are no errors
        assert len(errors) == 0

        test_inp = read_file(os.path.join(YAML_DIR, '00-002.yml'))
        status, data = self.me.validate_yaml_format(test_inp)

        # Assert Schema check fails
        errors = self.me.validate_yaml_data(data)
        assert errors['protocol_id'][0] == 'required field'

        # assert we don't need a match clause at root.
        test_inp = read_file(os.path.join(YAML_DIR, '00-003.yml'))
        status, data = self.me.validate_yaml_format(test_inp)
        assert status == 0

    def test_run_query(self):
        """Runs clinical and genomic nodes through ``run_query`` and checks the hits."""
        # reinstantiate MatchEngine so that the set of all sample ids in the
        # database includes the documents that were posted by the unit test setUp
        self.me = MatchEngine(self.db)

        # define a clinical node
        node = {
            'type': 'clinical',
            'value': {
                'ONCOTREE_PRIMARY_DIAGNOSIS': 'Adrenal Gland',
                'AGE_NUMERICAL': '>=18'
            }
        }

        # run query
        result, matches = self.me.run_query(node)

        # assert exactly one sample is returned, the fixture sample
        assert len(result) == 1, len(result)
        assert self.sample_id in result

        # define a genomic criteria
        node = {'type': 'genomic', 'value': {'HUGO_SYMBOL': '!BRAF'}}

        # run query
        result, matches = self.me.run_query(node)

        # assert nine samples are returned, without the fixture sample
        assert len(result) == 9, len(result)
        assert self.sample_id not in result

        # match documents built from the v2 genomic fixtures carry extra fields
        self.add_genomic_v2()
        node = {'type': 'genomic', 'value': {'HUGO_SYMBOL': 'WHSC1'}}
        result, matches = self.me.run_query(node)
        assert 'actionability' in matches[0]
        assert matches[0]['mmr_status'] == 'Proficient (MMR-P / MSS)'

    def test_prepare_clinical_criteria(self):
        """Checks the mongo query built from age and oncotree diagnosis criteria."""
        onc = 'ONCOTREE_PRIMARY_DIAGNOSIS'
        oncname = 'ONCOTREE_PRIMARY_DIAGNOSIS_NAME'

        def first_key(mapping):
            # dict.keys() is not indexable on Python 3; this works on 2 and 3
            return list(mapping)[0]

        # create clinical criteria
        item = {'AGE_NUMERICAL': '>=18', onc: 'Melanoma'}

        # convert to mongo query
        c = self.me.prepare_clinical_criteria(item)

        # assert 'Melanoma' expands to 8 diagnosis names
        assert len(c[oncname]['$in']) == 8

        # check the BIRTH_DATE mongo query uses a less-than-or-equal search
        assert first_key(c['BIRTH_DATE']) == '$lte'

        # check "!"
        item = {'AGE_NUMERICAL': '<=18', onc: '!Melanoma'}
        c = self.me.prepare_clinical_criteria(item)
        assert first_key(c['BIRTH_DATE']) == '$gte'
        assert len(c[oncname]['$nin']) == 8

        # check with a list of diagnoses
        item = {
            'AGE_NUMERICAL': '>=18',
            onc: ['!Melanoma', '!Glioblastoma', 'Pheochromocytoma', 'Astrocytoma']
        }
        c = self.me.prepare_clinical_criteria(item)
        assert first_key(c['BIRTH_DATE']) == '$lte'
        assert '$nin' in c[oncname]
        assert '$in' in c[oncname]
        assert len(c[oncname]['$in']) == 2
        assert len(c[oncname]['$nin']) == 12

        # check _SOLID_ && _LIQUID_
        liquid_item = {'AGE_NUMERICAL': '>=18', onc: '_LIQUID_'}
        solid_item = {'AGE_NUMERICAL': '>=18', onc: '_SOLID_'}
        liqc = self.me.prepare_clinical_criteria(liquid_item)
        solc = self.me.prepare_clinical_criteria(solid_item)
        assert len(liqc[oncname]['$in']) == 51, len(liqc[oncname]['$in'])
        assert len(solc[oncname]['$in']) == 561, len(solc[oncname]['$in'])

    def test_prepare_genomic_criteria(self):
        """Checks the mongo query built from a negated gene plus protein change."""
        # create a genomic criteria
        item = {'HUGO_SYMBOL': '!KRAS', 'PROTEIN_CHANGE': 'p.V600E'}

        # convert to mongo query
        c, neg, _ = self.me.prepare_genomic_criteria(item)

        # check ! symbol: the gene is queried positively and the negation is flagged
        assert c['$and'][0]['TRUE_HUGO_SYMBOL']['$eq'] == 'KRAS'
        assert neg is True

        # check protein change
        assert c['$and'][0]['TRUE_PROTEIN_CHANGE']['$eq'] == 'p.V600E'

        # check wildtype
        assert '$or' in c['$and'][1], c
        assert c['$and'][1]['$or'] == [{'WILDTYPE': False}, {'WILDTYPE': {'$exists': False}}]

    def test_sv(self):
        """Checks the mongo query built for a structural variant criterion."""
        # create a genomic criteria
        item = {'HUGO_SYMBOL': 'KRAS', 'VARIANT_CATEGORY': 'SV'}

        # convert to mongo query
        c, neg, _ = self.me.prepare_genomic_criteria(item)

        # check sv
        assert c['$and'][0]['VARIANT_CATEGORY']['$eq'] == 'SV'

        # check wildtype
        assert '$or' in c['$and'][1], c
        assert c['$and'][1]['$or'] == [{'WILDTYPE': False}, {'WILDTYPE': {'$exists': False}}]

    def test_create_trial_tree(self):
        """Checks the node and match tree counts of the trial tree for 00-004.yml."""
        def count_match_trees(tree):
            return sum(1 for n in tree.nodes() if 'match_tree' in tree.node[n])

        # parse yaml file and create tree.
        test_inp = read_file(os.path.join(YAML_DIR, '00-004.yml'))
        status, trial_tree = self.me.create_trial_tree(test_inp)

        # assert it was successful.
        assert status == 0

        # check that we have 2 match trees.
        assert count_match_trees(trial_tree) == 2

        # parse the same yaml file again and create tree.
        test_inp = read_file(os.path.join(YAML_DIR, '00-004.yml'))
        status, trial_tree = self.me.create_trial_tree(test_inp)

        # check that we have 4 nodes.
        assert len(list(trial_tree.nodes())) == 4

        # assert we still have 2 match trees.
        assert count_match_trees(trial_tree) == 2

    def test_create_match_tree(self):
        """Checks size, edges and node contents of both match trees of 00-004.yml."""
        # parse yaml file and create trial tree.
        test_inp = read_file(os.path.join(YAML_DIR, '00-004.yml'))
        status, trial_tree = self.me.create_trial_tree(test_inp)

        # expectations, indexed by the order in which match trees are encountered
        number_of_nodes = [2, 10]
        edges = [
            [(1, 2)],
            [(1, 2), (2, 3), (2, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9), (4, 10)]
        ]

        i = -1
        for n in trial_tree.nodes():
            if 'match_tree' not in trial_tree.node[n]:
                continue

            i += 1
            g = trial_tree.node[n]['match_tree']
            assert g is not None

            # Check if tree contain correct number of nodes
            assert g.number_of_nodes() == number_of_nodes[i]

            # Check edges are correctly created
            assert list(nx.dfs_edges(g)) == edges[i]

            # The root node is always of type 'match'
            if i == 0:
                assert g.node[1]['type'] == 'match'
                assert g.node[2]['type'] == 'clinical'
                assert g.node[2]['value'] == {'disease_status': ['Advanced']}

            elif i == 1:
                assert g.node[1]['type'] == 'match'
                assert g.node[2]['type'] == 'and'
                assert g.node[3]['type'] == 'genomic'
                assert g.node[3]['value'] == {
                    'hugo_symbol': 'IDH1',
                    'wildcard_protein_change': 'p.R132',
                    'variant_category': 'Mutation'
                }
                assert g.node[4]['type'] == 'or'

                # nodes 5..10 are the clinical children of the 'or' node
                diagnoses = [
                    '_SOLID_', 'Diffuse Glioma', 'Encapsulated Glioma',
                    'Cholangiocarcinoma', 'Acute Myeloid Leukemia', 'Myelodysplasia'
                ]
                for node_id, diagnosis in enumerate(diagnoses, 5):
                    assert g.node[node_id]['type'] == 'clinical'
                    assert g.node[node_id]['value'] == {
                        'age_numerical': '>=18',
                        'oncotree_primary_diagnosis': diagnosis
                    }

    def test_search_oncotree_diagnosis(self):
        """Checks that an ``$eq`` diagnosis query is expanded to an ``$in`` list."""
        oncotree = build_oncotree()

        # Glioblastoma
        c = {'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': {'$eq': 'Glioblastoma'}}
        c['ONCOTREE_PRIMARY_DIAGNOSIS_NAME'] = self.me._search_oncotree_diagnosis(oncotree, c)
        conc = c['ONCOTREE_PRIMARY_DIAGNOSIS_NAME']
        assert '$in' in conc, conc
        assert conc['$in'] == [
            'Small Cell Glioblastoma', 'Gliosarcoma', 'Glioblastoma Multiforme', 'Glioblastoma'
        ], conc

        # Melanoma
        c = {'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': {'$eq': 'Melanoma'}}
        conc = self.me._search_oncotree_diagnosis(oncotree, c)
        assert '$in' in conc, conc
        assert conc['$in'] == [
            'Melanoma', 'Congenital Nevus', 'Genitourinary Mucosal Melanoma',
            'Cutaneous Melanoma', 'Melanoma of Unknown Primary', 'Desmoplastic Melanoma',
            'Lentigo Maligna Melanoma', 'Acral Melanoma'
        ]
def setUp(self):
    """
    Resets the test database and builds the in-memory fixture data.

    Descriptions of test patients

    1: >18, Adrenal Gland, Female, BRAF F346R Mutation
    2: >18, Melanoma, Female, EGFR L858R Mutation
    3: >18, Melanoma, Female, EGFR F346A Mutation
    4: >18, Melanoma, Female, EGFR F346B Mutation
    5: >18, Melanoma, Female, EGFR F000F Mutation
    6: >0.5 && <18, Melanoma, Male, EGFR SV
    7: >0.5 && <18, Glioblastoma, Male, EGFR CNV Hetero del
    8: >0.5 && <18, Glioblastoma, Male, EGFR CNV Gain
    9: >0.5 && <18, Glioblastoma, Male, EGFR CNV Homo del
    10: <0.5, Glioblastoma, Male, EGFR CNV High amp

    Descriptions of test trials

    00-001.yml: dose: EGFR L858R && >=18/_SOLID_
    00-002.yml: arm: EGFR L858R && >=18/_SOLID_
    00-003.yml: step: EGFR L858R && >=18/_SOLID_
    00-004.yml dose: EGFR L858R && >=18/_SOLID_
    00-005.yml 2 doses: EGFR L858R && >=18/_SOLID_
    00-006.yml exon: !13
    """
    self.db = get_db(None)

    # Start from empty collections. "oplog" and "response" are separate
    # entries: without the comma between them Python joins the two literals
    # into "oplogresponse" and neither collection is dropped.
    for res in ["clinical", "dashboard", "filter", "genomic", "hipaa", "match",
                "normalize", "oplog", "response", "statistics", "status", "team",
                "trial", "trial_match", "user"]:
        self.db.drop_collection(res)

    self.me = MatchEngine(self.db)

    self.trials = {}

    # identifiers: one fixed patient followed by nine random ones
    self.clinical_id = ObjectId()
    self.mrn = 'TCGA-BH-A1FR'
    self.sample_id = 'TCGA-OR-A5J1'
    self.mrns = [self.mrn] + [self.__random_id() for _ in range(9)]
    self.sample_ids = [self.sample_id] + [self.__random_id() for _ in range(9)]
    self.clinical_ids = [self.clinical_id] + [ObjectId() for _ in range(9)]
    self.static_date = dt.datetime.today()

    # clinical collection
    self.oncotree_diagnoses = ['Adrenal Gland'] + ['Melanoma'] * 5 + ['Glioblastoma'] * 4
    self.genders = ['Female'] * 5 + ['Male'] * 5

    # ages, expressed as birth dates relative to the static date
    adult = self.static_date - dt.timedelta(days=365*19)
    child = self.static_date - dt.timedelta(days=365*10)
    infant = self.static_date - dt.timedelta(days=30*4)
    self.ages = [adult] * 5 + [child] * 4 + [infant]

    self.clinical = [{
        '_id': clinical_id,
        'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': diagnosis,
        'SAMPLE_ID': sample_id,
        'VITAL_STATUS': 'alive',
        'MRN': mrn,
        'REPORT_DATE': self.static_date,
        'BIRTH_DATE': age,
        'GENDER': gender
    } for diagnosis, gender, age, clinical_id, sample_id, mrn in zip(
        self.oncotree_diagnoses, self.genders, self.ages,
        self.clinical_ids, self.sample_ids, self.mrns)]

    # genomic collection: one alteration per patient, in patient order
    self.genes = ['BRAF'] + ['EGFR'] * 9
    self.protein_changes = ['p.F346R', 'p.L858R', 'p.F346A', 'p.F346B', 'p.F000F',
                            None, None, None, None, None]
    self.variant_categories = ['MUTATION'] * 5 + ['SV', 'CNV', 'CNV', 'CNV', 'CNV']
    self.wildtypes = [False] * 10
    self.cnv_calls = [None, None, None, None, None, None,
                      'Heterozygous deletion', 'Gain',
                      'Homozygous deletion', 'High level amplification']

    self.genomic = [{
        'TRUE_VARIANT_CLASSIFICATION': 'In_Frame_Del',
        'TRUE_PROTEIN_CHANGE': protein_change,
        'VARIANT_CATEGORY': variant_category,
        'CHROMOSOME': 'chr3',
        'POSITION': 178952085,
        'TRUE_STRAND': '+',
        'WILDTYPE': wildtype,
        'CLINICAL_ID': _id,
        'CNV_CALL': cnv_call,
        'TRUE_HUGO_SYMBOL': gene,
        'SAMPLE_ID': sample_id,
        'TRUE_TRANSCRIPT_EXON': 19
    } for protein_change, variant_category, wildtype, cnv_call, gene, _id, sample_id in zip(
        self.protein_changes, self.variant_categories, self.wildtypes,
        self.cnv_calls, self.genes, self.clinical_ids, self.sample_ids
    )]

    # test trials
    self.test_trials = ['00-001', '00-002', '00-003']

    # demo match results
    pnos = ['00-001', '00-001', '00-001', '00-002', '00-002', '00-002']
    mlevels = ['arm', 'arm', 'arm', 'dose', 'dose', 'dose']
    iids = ['1', '2', '3', '4', '5', '6']
    galts = ['Alt1', 'Alt2', 'Alt2', 'Alt3', 'Alt3', 'Alt3']
    self.matches = [{
        'mrn': 'SAMPLE1',
        'sample_id': 'SAMPLE1-ID',
        'protocol_no': protocol_no,
        'match_level': match_level,
        'internal_id': internal_id,
        'genomic_alteration': genomic_alteration
    } for protocol_no, match_level, internal_id, genomic_alteration in zip(
        pnos, mlevels, iids, galts
    )]
class Autocomplete:
    """Builds ElasticSearch autocomplete ("suggest") and search data for one trial."""

    def __init__(self, item):
        """
        Creates data for ElasticSearch's autocomplete index

        :param item: Trial info; the following keys are read:
            - treatment_list: Nested dictionary containing all match criteria
            - _summary: Summary created for the trial
        """
        self.summary = item['_summary']
        self.treatment_list = item['treatment_list']

        # accumulators, filled by _extract_data_from_match
        self.vdict = {
            'variants': [],
            'wts': [],
            'svs': [],
            'cnvs': [],
            'exclusions': []
        }
        self.genes = []
        self.cancer_type_dict = None

        self.m = MatchEngine(get_db())

    @staticmethod
    def _get_cancer_type_weight(cancer_type, hierarchy='default'):
        """
        Sets the weights for ElasticSearch autocompletion on cancer types. Cancer type terms are split
        so that autocomplete suggestions will populate regardless of which word in the multi-word cancer type
        string is initially input. Higher weighted terms will populate the top of the autocomplete dropdown list.

        :param cancer_type: Text to display in the autocomplete dropdown list.
        :param hierarchy: Weight class of the text: 'primary', 'default' or 'bucket'.
            The two "All ... Tumors" values always use 'bucket'.
        :return: Dictionary specifying ElasticSearch rules.
        """
        weight_dict = {'primary': 10, 'default': 5, 'bucket': 20}

        if cancer_type in ('All Solid Tumors', 'All Liquid Tumors'):
            hierarchy = 'bucket'

        # the full text plus each of its words longer than three characters
        long_words = [word for word in cancer_type.split() if len(word) > 3]
        return {
            'input': list(set([cancer_type] + long_words)),
            'output': cancer_type,
            'weight': weight_dict[hierarchy]
        }

    @staticmethod
    def _get_variants_weight(variant, esrule='variants'):
        """
        Sets the weights for ElasticSearch autocompletion on gene variants. Higher weighted terms
        will populate the top of the autocomplete dropdown list.

        :param variant: Text to display in the autocomplete dropdown list.
        :param esrule: Type of variant: 'variants', 'wts', 'svs' or 'cnvs'. This determines the weight.
        :return: Dictionary specifying ElasticSearch rules.
        """
        weight_dict = {'variants': 1, 'wts': 5, 'svs': 3, 'cnvs': 3}
        return {'input': variant, 'weight': weight_dict[esrule]}

    @staticmethod
    def _get_investigator_suggest(investigator, dfci_investigator):
        """
        Creates a list of investigators from the _summary field of the trial collection

        :param investigator: Investigator name; "Last, First" is displayed as "First Last"
        :param dfci_investigator: None, or a dictionary that may hold 'first_name' and 'last_name'
        :return: List of suggestion dictionaries with 'input' and 'output' keys
        """
        iin = []
        iout = ''
        ispl = [i.strip() for i in investigator.split(',')]
        if len(ispl) == 1:
            iin = [ispl[0]]
            iout = investigator
        elif len(ispl) >= 2:
            iin = [ispl[0], ispl[1]]
            iout = '%s %s' % (ispl[1], ispl[0])

        dfci_in = []
        dfci_out = ''
        if dfci_investigator is not None and 'first_name' in dfci_investigator:
            dfci_in.append(dfci_investigator['first_name'].strip())
            dfci_out += dfci_investigator['first_name'].strip()

        if dfci_investigator is not None and 'last_name' in dfci_investigator:
            dfci_in.append(dfci_investigator['last_name'].strip())
            dfci_out += ' %s' % dfci_investigator['last_name'].strip()

        inv_suggest = [{'input': [i for i in iin if i != ''], 'output': iout}]

        # the second entry is only added when it is non-empty and differs from the first
        if dfci_out != iout and dfci_out != '':
            inv_suggest.append({'input': dfci_in, 'output': dfci_out.strip()})

        return inv_suggest

    @staticmethod
    def _get_tumor_types_search(ct_suggest):
        """
        Maps special cancer type text output to the values stored in the ElasticSearch index.

        :param ct_suggest: List of cancer type suggestion dictionaries (each with an 'output' key).
        :return: Cancer type text stored in the ElasticSearch index, which we will query.
        """
        tts = []
        for ct in ct_suggest:
            if 'output' in ct and ct['output'] == 'All Solid Tumors':
                tts.append('_SOLID_')
            elif 'output' in ct and ct['output'] == 'All Liquid Tumors':
                tts.append('_LIQUID_')
            else:
                tts.append(ct['output'])

        return tts

    def _extract_data_from_match(self, match):
        """
        Extract Cancer Type, Gene, and Variant data from the given match tree

        Genes and variants are appended to the running lists, whereas
        ``self.cancer_type_dict`` is replaced on every call.
        NOTE(review): only the cancer types of the last processed match are
        therefore kept -- confirm this is intended.

        :param match: Match clause passed to ``create_match_tree``
        """
        g = self.m.create_match_tree(match)
        pmt = ParseMatchTree(g)

        self.cancer_type_dict = pmt.extract_cancer_types()
        self.genes.extend(pmt.extract_genes())

        vdict_tmp = pmt.extract_variants()
        # items() instead of the Python 2-only iteritems(); works on both versions
        for k, v in self.vdict.items():
            v.extend(vdict_tmp[k])

    def add_autocomplete(self):
        """
        Walks the treatment list (steps, arms and dose levels), parses the first
        match clause found at each level and builds the autocomplete data.

        :return: Tuple of three elements: the dictionary of suggesters, the
            dictionary of searchable values, and the result of
            ``parse_primary_cancer_types`` on the primary cancer types
        """
        for step in self.treatment_list['step']:
            if 'match' in step:
                self._extract_data_from_match(step['match'][0])
            if 'arm' in step:
                for arm in step['arm']:
                    if 'match' in arm:
                        self._extract_data_from_match(arm['match'][0])
                    if 'dose_level' in arm:
                        for dose in arm['dose_level']:
                            if 'match' in dose:
                                self._extract_data_from_match(dose['match'][0])

        # no match clause anywhere in the trial: fall back to empty cancer types
        if self.cancer_type_dict is None:
            self.cancer_type_dict = {
                'diagnoses': [],
                'primary_cancer_types': [],
                'cancer_types_expanded': [],
                'excluded_cancer_types': []
            }

        # primary cancer types first, then the remaining expanded types
        primary_types = self.cancer_type_dict['primary_cancer_types']
        weighted_cancer_types = [
            self._get_cancer_type_weight(ct, hierarchy='primary') for ct in primary_types
        ]
        for ct in set(self.cancer_type_dict['cancer_types_expanded']) - set(primary_types):
            weighted_cancer_types.append(self._get_cancer_type_weight(ct, hierarchy='default'))

        # de-duplicated, weighted entries per variant bucket
        weighted_variants = {}
        for key in ['variants', 'cnvs', 'svs', 'wts']:
            weighted_variants[key] = [
                self._get_variants_weight(v, esrule=key) for v in set(self.vdict[key])
            ]

        suggestors = {
            "cancer_type_suggest": weighted_cancer_types,
            "hugo_symbol_suggest": {
                "input": list(set(self.genes))
            },
            # entries ending in 'any' are not offered as variant suggestions
            "variant_suggest": [
                i for i in weighted_variants['variants'] if not i['input'].endswith('any')
            ],
            "wildtype_suggest": weighted_variants['wts'],
            "cnv_suggest": weighted_variants['cnvs'],
            "sv_suggest": weighted_variants['svs'],
            "protocol_no_suggest": {
                'input': self.summary['protocol_number']
            },
            "disease_center_suggest": {
                'input': [
                    i.replace('(', '').replace(')', '')
                    for i in self.summary['disease_center'].split()
                ],
                'output': self.summary['disease_center']
            },
            'disease_status_suggest': {
                'input': self.summary['disease_status']
            },
            'drug_suggest': {
                'input': [i.title() for i in self.summary['drugs']]
            },
            'investigator_suggest': self._get_investigator_suggest(
                self.summary['investigator'], self.summary['dfci_investigator']),
            'mmr_status_suggest': {
                'input': self.summary['mmr_status'] + self.summary['ms_status']
            },
            'nct_number_suggest': {
                'input': self.summary['nct_number']
            }
        }

        searchers = {
            "tumor_types": list(set(self._get_tumor_types_search(weighted_cancer_types))),
            "genes": list(set(self.genes)),
            "variants": list(set([i['input'] for i in weighted_variants['variants']])),
            "wildtype_genes": list(set([i['input'] for i in weighted_variants['wts']])),
            "cnv_genes": list(set([i['input'] for i in weighted_variants['cnvs']])),
            "sv_genes": list(set([i['input'] for i in weighted_variants['svs']])),
            "exclusion_genes": list(set(self.vdict['exclusions'])),
            "protocol_no": self.summary["protocol_number"],
            "drugs": self.summary["drugs"],
            "age": self.summary["age_summary"],
            "phase": self.summary["phase_summary"],
            "disease_status": self.summary["disease_status"],
            "nct_number": self.summary["nct_number"],
            "disease_center": self.summary["disease_center"],
            "mmr_status": self.summary["mmr_status"],
            "ms_status": self.summary["ms_status"],
            "mutational_signatures": self.summary["mutational_signatures"],
            "investigator": [i['output'] for i in suggestors['investigator_suggest']],
            "short_title": self.summary["short_title"]
        }

        return suggestors, searchers, parse_primary_cancer_types(
            self.cancer_type_dict['primary_cancer_types'])
class Autocomplete:
    """
    Builds the ElasticSearch autocomplete ("suggest") and search payloads
    for a single trial from its summary and its treatment list.
    """

    def __init__(self, item):
        """
        Creates data for ElasticSearch's autocomplete index

        :param item: Trial info:
            - treatment_list: Nested dictionary containing all match criteria
            - _summary: Summary object created by the API
        """
        self.summary = item['_summary']
        self.treatment_list = item['treatment_list']

        # Variant strings accumulated over every match tree that is visited.
        self.vdict = {
            'variants': [],
            'wts': [],
            'svs': [],
            'cnvs': [],
            'exclusions': []
        }
        self.genes = []

        # Stays None until a match tree has been parsed.
        self.cancer_type_dict = None
        self.m = MatchEngine(get_db())

    @staticmethod
    def _get_cancer_type_weight(cancer_type, hierarchy='default'):
        """
        Sets the weights for ElasticSearch autocompletion on cancer types.
        Cancer type terms are split so that autocomplete suggestions will
        populate regardless of which word in the multi-word cancer type
        string is initially input. Higher weighted terms will populate the
        top of the autocomplete dropdown list.

        :param cancer_type: Text to display in the autocomplete dropdown list.
        :param hierarchy: Weight class of the text: 'primary', 'default' or
            'bucket'. The two "All ... Tumors" buckets always use 'bucket'.
        :return: Dictionary with 'input', 'output' and 'weight' keys.
        :raises KeyError: If hierarchy is not a known weight class.
        """
        weight_dict = {'primary': 10, 'default': 5, 'bucket': 20}

        if cancer_type in ('All Solid Tumors', 'All Liquid Tumors'):
            hierarchy = 'bucket'

        # Only words longer than three characters become extra inputs, next
        # to the full cancer type string itself.
        long_words = [word for word in cancer_type.split() if len(word) > 3]

        return {
            'input': list(set([cancer_type] + long_words)),
            'output': cancer_type,
            'weight': weight_dict[hierarchy]
        }

    @staticmethod
    def _get_variants_weight(variant, esrule='variants'):
        """
        Sets the weights for ElasticSearch autocompletion on gene variants.
        Higher weighted terms will populate the top of the autocomplete
        dropdown list.

        :param variant: Text to display in the autocomplete dropdown list.
        :param esrule: Type of variant: 'variants', 'wts', 'svs' or 'cnvs'.
        :return: Dictionary with 'input' and 'weight' keys.
        :raises KeyError: If esrule is not a known variant type.
        """
        weight_dict = {
            'variants': 1,
            'wts': 5,
            'svs': 3,
            'cnvs': 3
        }
        return {'input': variant, 'weight': weight_dict[esrule]}

    @staticmethod
    def _get_investigator_suggest(investigator, dfci_investigator):
        """
        Creates a list of investigators from the _summary field of the
        trial collection

        :param investigator: Investigator name. A "Last, First" value is
            displayed as "First Last"; a value without a comma is displayed
            unchanged.
        :param dfci_investigator: None, or a dictionary that may hold
            'first_name' and 'last_name'.
        :return: List of dictionaries with 'input' and 'output' keys. The
            DFCI investigator is only added when it has a non-blank name
            that differs from the displayed investigator name.
        """
        name_parts = [part.strip() for part in investigator.split(',')]
        if len(name_parts) == 1:
            iin = [name_parts[0]]
            iout = investigator
        else:
            iin = [name_parts[0], name_parts[1]]
            iout = '%s %s' % (name_parts[1], name_parts[0])

        dfci_in = []
        if dfci_investigator is not None:
            for key in ('first_name', 'last_name'):
                if key in dfci_investigator:
                    dfci_in.append(dfci_investigator[key].strip())

        # Blank name parts are dropped, as is done for the investigator
        # inputs below.
        dfci_in = [part for part in dfci_in if part != '']
        dfci_out = ' '.join(dfci_in)

        inv_suggest = [{
            'input': [part for part in iin if part != ''],
            'output': iout
        }]

        # Compare the same normalized text that is emitted as 'output'.
        # Comparing a value with a leading space (last name only) would
        # report a duplicate of the investigator as a second suggestion.
        if dfci_out != '' and dfci_out != iout:
            inv_suggest.append({
                'input': dfci_in,
                'output': dfci_out
            })

        return inv_suggest

    @staticmethod
    def _get_tumor_types_search(ct_suggest):
        """
        Maps special cancer type text output to the values stored in the
        ElasticSearch index.

        :param ct_suggest: List of cancer type suggestion dictionaries.
        :return: Cancer type text stored in the ElasticSearch index, which
            we will query. Entries without an 'output' key are skipped.
        """
        special_values = {
            'All Solid Tumors': '_SOLID_',
            'All Liquid Tumors': '_LIQUID_'
        }

        tts = []
        for ct in ct_suggest:
            if 'output' not in ct:
                continue
            tts.append(special_values.get(ct['output'], ct['output']))

        return tts

    def _extract_data_from_match(self, match):
        """
        Extract Cancer Type, Gene, and Variant data from the given match tree

        :param match: Match clause passed to MatchEngine.create_match_tree.
        """
        g = self.m.create_match_tree(match)
        pmt = ParseMatchTree(g)

        # NOTE(review): the cancer types are replaced on every call while
        # genes and variants accumulate, so only the last visited match
        # contributes cancer types -- confirm this is intended.
        self.cancer_type_dict = pmt.extract_cancer_types()
        self.genes.extend(pmt.extract_genes())

        vdict_tmp = pmt.extract_variants()
        # .items() instead of .iteritems() so this runs on Python 2 and 3.
        for k, v in self.vdict.items():
            v.extend(vdict_tmp[k])

    def add_autocomplete(self):
        """
        Walks the treatment list and builds the ElasticSearch payloads for
        this trial. Only the first match clause of each step, arm and dose
        level is parsed.

        :return: Tuple of (suggestors, searchers, primary cancer types as
            returned by parse_primary_cancer_types).
        """
        for step in self.treatment_list['step']:
            if 'match' in step:
                self._extract_data_from_match(step['match'][0])

            if 'arm' in step:
                for arm in step['arm']:
                    if 'match' in arm:
                        self._extract_data_from_match(arm['match'][0])

                    if 'dose_level' in arm:
                        for dose in arm['dose_level']:
                            if 'match' in dose:
                                self._extract_data_from_match(dose['match'][0])

        # No match clause was found anywhere: fall back to empty lists.
        if self.cancer_type_dict is None:
            self.cancer_type_dict = {
                'diagnoses': [],
                'primary_cancer_types': [],
                'cancer_types_expanded': [],
                'excluded_cancer_types': []
            }

        primary_cancer_types = self.cancer_type_dict['primary_cancer_types']
        expanded_only = (set(self.cancer_type_dict['cancer_types_expanded'])
                         - set(primary_cancer_types))

        # Primary cancer types first, then the expanded types that are not
        # already primary.
        weighted_cancer_types = [
            self._get_cancer_type_weight(ct, hierarchy='primary')
            for ct in primary_cancer_types
        ]
        weighted_cancer_types.extend(
            self._get_cancer_type_weight(ct, hierarchy='default')
            for ct in expanded_only
        )

        weighted_variants = {}
        for key in ['variants', 'cnvs', 'svs', 'wts']:
            weighted_variants[key] = [
                self._get_variants_weight(v, esrule=key)
                for v in set(self.vdict[key])
            ]

        suggestors = {
            "cancer_type_suggest": weighted_cancer_types,
            "hugo_symbol_suggest": {
                "input": list(set(self.genes))
            },
            # Variants ending in 'any' are left out of the suggestions but
            # remain in the "variants" searcher below.
            "variant_suggest": [
                i for i in weighted_variants['variants']
                if not i['input'].endswith('any')
            ],
            "wildtype_suggest": weighted_variants['wts'],
            "cnv_suggest": weighted_variants['cnvs'],
            "sv_suggest": weighted_variants['svs'],
            "protocol_no_suggest": {
                'input': self.summary['protocol_number']
            },
            "disease_center_suggest": {
                'input': [
                    i.replace('(', '').replace(')', '')
                    for i in self.summary['disease_center'].split()
                ],
                'output': self.summary['disease_center']
            },
            'disease_status_suggest': {
                'input': self.summary['disease_status']
            },
            'drug_suggest': {
                'input': [i.title() for i in self.summary['drugs']]
            },
            'investigator_suggest': self._get_investigator_suggest(
                self.summary['investigator'],
                self.summary['dfci_investigator']),
            'mmr_status_suggest': {
                'input': self.summary['mmr_status'] + self.summary['ms_status']
            },
            'nct_number_suggest': {
                'input': self.summary['nct_number']
            }
        }

        searchers = {
            "tumor_types": list(set(self._get_tumor_types_search(weighted_cancer_types))),
            "genes": list(set(self.genes)),
            "variants": list(set([i['input'] for i in weighted_variants['variants']])),
            "wildtype_genes": list(set([i['input'] for i in weighted_variants['wts']])),
            "cnv_genes": list(set([i['input'] for i in weighted_variants['cnvs']])),
            "sv_genes": list(set([i['input'] for i in weighted_variants['svs']])),
            "exclusion_genes": list(set(self.vdict['exclusions'])),
            "protocol_no": self.summary["protocol_number"],
            "drugs": self.summary["drugs"],
            "age": self.summary["age_summary"],
            "phase": self.summary["phase_summary"],
            "disease_status": self.summary["disease_status"],
            "nct_number": self.summary["nct_number"],
            "disease_center": self.summary["disease_center"],
            "mmr_status": self.summary["mmr_status"],
            "ms_status": self.summary["ms_status"],
            "mutational_signatures": self.summary["mutational_signatures"],
            "investigator": [i['output'] for i in suggestors['investigator_suggest']],
            "short_title": self.summary["short_title"]
        }

        return suggestors, searchers, parse_primary_cancer_types(
            self.cancer_type_dict['primary_cancer_types'])