def build_trial_elasticsearch_fields(items):
    """
    Create the '_summary', '_suggest' and '_elasticsearch' fields on trial documents.

    Each trial document is modified in place: a trial tree is built for it, the
    criteria found on the tree are grouped into genomic / clinical / other
    buckets, and those buckets feed the Summary and Autocomplete builders.

    :param items: Iterable of trial documents (dict-like). Each one must carry a
        'protocol_no' key, which is read for logging.
    :return: None
    """
    # get database connection.
    db = database.get_db()

    # loop over each item.
    for item in items:

        # build tree. The status half of the returned pair is not used here.
        me = MatchEngine(db)
        _, trial_tree = me.create_trial_tree(item, no_validate=True)

        # look at every node.
        genomic = {}
        clinical = {}
        other = {}
        for n in trial_tree.nodes():
            node = trial_tree.nodes[n]

            # nodes without an id cannot be attributed to anything; skip them.
            if 'node_id' not in node:
                continue
            node_id = node['node_id']

            # look for multi-level nodes (right now it is only match).
            if 'match_tree' in node:

                # compress categories.
                mt = node['match_tree']
                for x in mt:
                    criterion = mt.nodes[x]
                    if criterion['type'] == 'genomic':
                        insert_data_genomic(genomic, criterion['value'], node_id)
                    if criterion['type'] == 'clinical':
                        insert_data_clinical(clinical, criterion['value'], node_id)

            # add the other nodes.
            insert_data_other(trial_tree, node_id, n, other)

        # create _summary, _suggest, and _elasticsearch fields
        summary = Summary(clinical, genomic, other, trial_tree)
        item['_summary'] = summary.create_summary(item)
        autocomplete = Autocomplete(item)
        (item['_suggest'],
         item['_elasticsearch'],
         item['_summary']['primary_tumor_types']) = autocomplete.add_autocomplete()

        # Lazy %-formatting: no TypeError when protocol_no is not a str.
        logging.info("trial inserted %s", item['protocol_no'])

    # Read-only lookup: logging must not insert keys into the documents.
    logging.info("inserting trials to elasticsearch %s",
                 [trial.get("protocol_no") for trial in items])
def test_extract_hr_status(self):
    """Check the hormone receptor statuses extracted from match index 5 of the first step."""
    engine = MatchEngine(get_db())
    criteria = trial['treatment_list']['step'][0]['match'][5]
    graph = engine.create_match_tree(criteria)
    statuses = ParseMatchTree(graph).extract_hr_status()

    # Order is irrelevant, so both sides are sorted before comparing.
    expected = ['HER2 Negative', 'ER Negative', 'PR Positive']
    assert sorted(statuses) == sorted(expected)
def test_extract_signatures(self):
    """Check that match index 3 of the first step yields 'MMR-D' and 'MSI-H' signatures."""
    engine = MatchEngine(get_db())
    criteria = trial['treatment_list']['step'][0]['match'][3]
    graph = engine.create_match_tree(criteria)
    signatures = ParseMatchTree(graph).extract_signatures()

    # Element 0 and element 1 of the result are checked separately.
    first_group = signatures[0]
    second_group = signatures[1]
    assert 'MMR-D' in first_group
    assert 'MSI-H' in second_group
def test_trial_summary(self):
    """
    Check that Summary.create_summary emits every expected field, both for the
    full fixture and after most source fields have been deleted from it.
    """
    on_trial['_genomic'] = {'hugo_symbol': [{'value': 'TEST'}]}
    on_trial['_clinical'] = {'disease_status': [{'value': ['TEST']}]}
    other = {'management_group': [{'value': 'TEST'}]}

    # create trial tree
    engine = MatchEngine(get_db())

    def summarize():
        # Rebuild the tree and the summary from the current state of the fixture.
        _, tree = engine.create_trial_tree(on_trial, no_validate=True)
        builder = Summary(on_trial['_clinical'], on_trial['_genomic'], other, tree)
        return builder.create_summary(on_trial)

    # validate summary
    item = summarize()
    status_fields = [
        'drugs', 'genes', 'tumor_types', 'sponsor', 'phase_summary',
        'accrual_goal', 'investigator', 'age_summary', 'protocol_number',
        'disease_status', 'nct_number', 'disease_center', 'short_title',
        'hormone_receptor_status',
    ]
    allowed_empty = ['dfci_investigator', 'hormone_receptor_status']
    for field in status_fields:
        assert field in item, self._debug(item, field)
        if field in allowed_empty:
            continue
        assert item[field], '%s| %s' % (field, item)

    # remove all fields and validate that the summary will not error
    removable = (
        'age', 'phase', 'nct_id', 'protocol_no', 'principal_investigator',
        'cancer_center_accrual_goal_upper', 'site_list', 'sponsor_list',
        'drug_list', 'staff_list',
    )
    for key in removable:
        del on_trial[key]

    item = summarize()
    for field in status_fields:
        assert field in item, self._debug(item, field)
def test_extract_genes(self):
    """Check the gene lists extracted from match index 0 and match index 2 of the first step."""
    engine = MatchEngine(get_db())
    matches = trial['treatment_list']['step'][0]['match']

    # First match: exactly BRAF and KRAS are expected.
    graph = engine.create_match_tree(matches[0])
    genes = ParseMatchTree(graph).extract_genes()
    assert 'BRAF' in genes, genes
    assert 'KRAS' in genes, genes
    assert 'EGFR' not in genes
    assert 'test' not in genes, genes
    assert len(genes) == 2, genes

    # Third match: BRAF must not be reported.
    graph = engine.create_match_tree(matches[2])
    genes = ParseMatchTree(graph).extract_genes()
    assert 'BRAF' not in genes, genes
def __init__(self, item):
    """
    Set up the state used to build ElasticSearch autocomplete data for a trial.

    :param item: Trial document; its '_summary' and 'treatment_list' entries
        are read here.
    """
    self.summary = item['_summary']
    self.treatment_list = item['treatment_list']

    # One accumulator list per variant category.
    categories = ('variants', 'wts', 'svs', 'cnvs', 'exclusions')
    self.vdict = {category: [] for category in categories}

    self.genes = []
    self.cancer_type_dict = {}

    # Match engine used to turn match criteria into match trees.
    self.m = MatchEngine(get_db())
def _get_signatures(self, item):
    """
    Creates hormone receptor status and mutational signature summary lists

    Walks every step, arm and dose level of the trial's treatment list and, for
    each one that carries a 'match' entry, extends self.mmr, self.ms, self.sigs
    and self.hr with what is extracted from the first element of that match list.

    :param item: Trial document
    """
    m = MatchEngine(get_db())

    def collect(node):
        # Shared by all three nesting levels; only match[0] is parsed.
        if 'match' not in node:
            return
        g = m.create_match_tree(node['match'][0])
        pmt = ParseMatchTree(g)
        signatures = pmt.extract_signatures()
        self.mmr.extend(signatures[0])
        self.ms.extend(signatures[1])
        self.sigs.extend(signatures[2])
        self.hr.extend(pmt.extract_hr_status())

    for step in item['treatment_list']['step']:
        collect(step)
        if 'arm' not in step:
            continue
        for arm in step['arm']:
            collect(arm)
            if 'dose_level' not in arm:
                continue
            for dose in arm['dose_level']:
                collect(dose)
def test_ms_status(self):
    """Check that 'mmr_status' and 'ms_status' genomic criteria each surface in the summary."""
    on_trial['_clinical'] = {}
    on_trial['_genomic'] = {'mmr_status': [{'value': 'MMR-Proficient'}]}

    engine = MatchEngine(get_db())

    def summarize():
        # Build a fresh tree and summary from the fixture's current genomic data.
        _, tree = engine.create_trial_tree(on_trial, no_validate=True)
        builder = Summary(on_trial['_clinical'], on_trial['_genomic'], {}, tree)
        return builder.create_summary(on_trial)

    result = summarize()
    assert 'mmr_status' in result, self._debug(result, 'mmr_status')

    on_trial['_genomic'] = {'ms_status': [{'value': 'MSI-H'}]}
    result = summarize()
    assert 'ms_status' in result, self._debug(result, 'ms_status')
def test_extract_cancer_types(self):
    """Check extract_cancer_types against a specific-diagnosis, a solid and a liquid example tree."""

    def parse(example):
        # A fresh engine is created for every example.
        engine = MatchEngine(get_db())
        graph = engine.create_match_tree(example)
        return ParseMatchTree(graph).extract_cancer_types()

    # Specific diagnosis.
    found = parse(match_tree_example)
    assert sorted(found['diagnoses']) == sorted([
        'Ocular Melanoma'
    ]), found['diagnoses']
    assert sorted(found['cancer_types_expanded']) == sorted([
        'Ocular Melanoma', 'Uveal Melanoma', 'Conjunctival Melanoma'
    ]), found['cancer_types_expanded']
    assert sorted(found['excluded_cancer_types']) == [], found['excluded_cancer_types']
    assert found['primary_cancer_types'] == ['Eye'], found['primary_cancer_types']

    # Solid tumor bucket.
    found = parse(match_tree_example2)
    assert sorted(found['diagnoses']) == sorted(['_SOLID_']), found['diagnoses']
    assert 'Acute Lymphoid Leukemia' not in found['cancer_types_expanded'], \
        found['cancer_types_expanded']
    assert sorted(found['excluded_cancer_types']) == [], found['excluded_cancer_types']
    assert found['primary_cancer_types'] == ['All Solid Tumors'], found['primary_cancer_types']

    # Liquid tumor bucket.
    found = parse(match_tree_example3)
    assert sorted(found['diagnoses']) == sorted(['_LIQUID_']), found['diagnoses']
    assert 'Acute Lymphoid Leukemia' in found['cancer_types_expanded'], \
        found['cancer_types_expanded']
    assert sorted(found['excluded_cancer_types']) == [], found['excluded_cancer_types']
    assert found['primary_cancer_types'] == ['All Liquid Tumors'], found['primary_cancer_types']
def test_extract_variants(self):
    """Check the variant buckets extracted from match indices 0, 1, 2 and 4 of the first step."""
    engine = MatchEngine(get_db())

    def variants_for(index):
        # Parse one match entry of the first step and return its variant buckets.
        criteria = trial['treatment_list']['step'][0]['match'][index]
        graph = engine.create_match_tree(criteria)
        return ParseMatchTree(graph).extract_variants()

    found = variants_for(0)
    assert 'BRAF V600E' in found['variants']
    assert 'BRAF V600K' in found['variants']
    assert 'KRAS any' in found['variants']
    assert 'EGFR wt' in found['wts']
    assert len(found['variants']) == 3
    assert len(found['wts']) == 1

    found = variants_for(1)
    assert 'PTEN CNV' in found['cnvs']
    assert 'BRCA1 SV' in found['svs']
    assert 'BRAF V600' in found['exclusions']
    assert len(found['variants']) == 0
    assert len(found['cnvs']) == 1
    assert len(found['svs']) == 1
    assert len(found['wts']) == 0
    assert len(found['exclusions']) == 1

    found = variants_for(2)
    assert 'BRAF V600E' not in found['variants']
    assert len(found['variants']) == 0
    assert len(found['wts']) == 0
    assert 'BRAF V600E' in found['exclusions']

    found = variants_for(4)
    assert 'BRAF V600K' in found['variants']
    assert 'EGFR any' in found['variants']
    assert len(found['variants']) == 2
    assert 'PTEN CNV' in found['cnvs']
    assert len(found['cnvs']) == 1
    assert 'KRAS' in found['exclusions']
    assert 'NRAS' in found['exclusions']
    assert len(found['exclusions']) == 2
    assert 'NTRK1 wt' in found['wts']
    assert len(found['wts']) == 1
class Autocomplete:
    """Builds the ElasticSearch suggest and search payloads for one trial document."""

    def __init__(self, item):
        """
        Creates data for ElasticSearch's autocomplete index

        :param item: Trial info:
            - treatment_list: Nested dictionary containing all match criteria
            - _summary: Summary object created by the API
        """
        self.summary = item['_summary']
        self.treatment_list = item['treatment_list']
        self.vdict = {
            'variants': [],
            'wts': [],
            'svs': [],
            'cnvs': [],
            'exclusions': []
        }
        self.genes = []
        self.cancer_type_dict = dict()
        self.m = MatchEngine(get_db())

    @staticmethod
    def _get_cancer_type_weight(cancer_type, hierarchy='default'):
        """
        Sets the weights for ElasticSearch autocompletion on cancer types.
        Cancer type terms are split so that autocomplete suggestions will populate
        regardless of which word in the multi-word cancer type string is initially input.
        Only words longer than three characters are added as extra inputs.
        Higher weighted terms will populate the top of the autocomplete dropdown list.

        :param cancer_type: Text to display in the autocomplete dropdown list.
        :param hierarchy: Weight class ('primary', 'default' or 'bucket'). The two
            "All ... Tumors" buckets always use 'bucket'.
        :return: Dictionary specifying ElasticSearch rules.
        :raises KeyError: if hierarchy is not a known weight class.
        """
        weight_dict = {'primary': 10, 'default': 5, 'bucket': 20}

        if cancer_type in ('All Solid Tumors', 'All Liquid Tumors'):
            hierarchy = 'bucket'

        return {
            'input': list(set([cancer_type] + [i for i in cancer_type.split() if len(i) > 3])),
            'output': cancer_type,
            'weight': weight_dict[hierarchy]
        }

    @staticmethod
    def _get_variants_weight(variant, esrule='variants'):
        """
        Sets the weights for ElasticSearch autocompletion on gene variants.
        Higher weighted terms will populate the top of the autocomplete dropdown list.

        :param variant: Text to display in the autocomplete dropdown list.
        :param esrule: Type of variant ('variants', 'wts', 'svs' or 'cnvs').
        :return: Dictionary specifying ElasticSearch rules.
        :raises KeyError: if esrule is not one of the known variant types.
        """
        weight_dict = {
            'variants': 1,
            'wts': 5,
            'svs': 3,
            'cnvs': 3
        }
        return {'input': variant, 'weight': weight_dict[esrule]}

    @staticmethod
    def _get_investigator_suggest(investigator, dfci_investigator):
        """
        Creates a list of investigator suggestions from the _summary field of the trial collection

        :param investigator: Investigator name; "Last, First" is displayed as "First Last".
        :param dfci_investigator: None, or a dict that may hold 'first_name' and 'last_name'.
        :return: List with the suggestion for `investigator`, followed by one for
            `dfci_investigator` when it yields a different, non-empty display name.
        """
        ispl = [i.strip() for i in investigator.split(',')]
        if len(ispl) == 1:
            iin = [ispl[0]]
            iout = investigator
        else:
            # str.split always yields at least one element, so this is len >= 2.
            iin = [ispl[0], ispl[1]]
            iout = '%s %s' % (ispl[1], ispl[0])

        # Collect the non-empty name parts; None values are treated as absent.
        dfci_in = []
        if dfci_investigator is not None:
            for key in ('first_name', 'last_name'):
                if key in dfci_investigator:
                    part = (dfci_investigator[key] or '').strip()
                    if part:
                        dfci_in.append(part)
        dfci_out = ' '.join(dfci_in)

        inv_suggest = [{
            'input': [i for i in iin if i != ''],
            'output': iout
        }]

        # Compare the normalised names: the old code compared an unstripped string
        # (leading space when only a last name was present) and so emitted a
        # duplicate suggestion.
        if dfci_out and dfci_out != iout.strip():
            inv_suggest.append({
                'input': dfci_in,
                'output': dfci_out
            })

        return inv_suggest

    @staticmethod
    def _get_tumor_types_search(ct_suggest):
        """
        Maps special cancer type text output to the values stored in the ElasticSearch index.

        :param ct_suggest: Cancer type suggestions; each must have an 'output' key.
        :return: Cancer type text stored in the ElasticSearch index, which we will query.
        :raises KeyError: if a suggestion has no 'output' key.
        """
        tts = []
        for ct in ct_suggest:
            output = ct['output']
            if output == 'All Solid Tumors':
                tts.append('_SOLID_')
            elif output == 'All Liquid Tumors':
                tts.append('_LIQUID_')
            else:
                tts.append(output)
        return tts

    def _extract_data_from_match(self, match):
        """
        Extract Cancer Type, Gene, and Variant data from the given match tree
        and merge it into this instance's accumulators.

        :param match: Match criteria passed to MatchEngine.create_match_tree.
        """
        g = self.m.create_match_tree(match)
        pmt = ParseMatchTree(g)

        # Merge cancer types per key, keeping first-seen order and no duplicates.
        for key, value_list in pmt.extract_cancer_types().items():
            known = self.cancer_type_dict.setdefault(key, [])
            for value in value_list:
                if value not in known:
                    known.append(value)

        self.genes.extend(pmt.extract_genes())

        vdict_tmp = pmt.extract_variants()
        for k, v in self.vdict.items():
            v.extend(vdict_tmp[k])

    def _iter_first_matches(self):
        """Yield the first 'match' entry of every step, arm and dose level, in document order."""
        for step in self.treatment_list['step']:
            if 'match' in step:
                yield step['match'][0]
            if 'arm' not in step:
                continue
            for arm in step['arm']:
                if 'match' in arm:
                    yield arm['match'][0]
                if 'dose_level' not in arm:
                    continue
                for dose in arm['dose_level']:
                    if 'match' in dose:
                        yield dose['match'][0]

    def add_autocomplete(self):
        """
        Iterates through the steps, arms and dose levels of the treatment list,
        collects cancer types, genes and variants from them, and builds the
        ElasticSearch payloads for this trial.

        :return: Tuple of (suggestors, searchers, primary cancer types), where
            suggestors is the dict of autocomplete suggestions, searchers is the
            dict of searchable values, and the third element is the result of
            parse_primary_cancer_types on the collected primary cancer types.
        """
        for match in self._iter_first_matches():
            self._extract_data_from_match(match)

        # No match contributed cancer types: fall back to empty lists.
        if not self.cancer_type_dict:
            self.cancer_type_dict = {
                'diagnoses': [],
                'primary_cancer_types': [],
                'cancer_types_expanded': [],
                'excluded_cancer_types': []
            }

        # Primary types first, then expanded types not already listed as primary.
        weighted_cancer_types = []
        for ct in self.cancer_type_dict['primary_cancer_types']:
            suggestion = self._get_cancer_type_weight(ct, hierarchy='primary')
            weighted_cancer_types.append(suggestion)

        for ct in set(self.cancer_type_dict['cancer_types_expanded']) - set(
                self.cancer_type_dict['primary_cancer_types']):
            suggestion = self._get_cancer_type_weight(ct, hierarchy='default')
            weighted_cancer_types.append(suggestion)

        weighted_variants = {}
        for key in ['variants', 'cnvs', 'svs', 'wts']:
            weighted_variants[key] = [
                self._get_variants_weight(v, esrule=key) for v in set(self.vdict[key])
            ]

        suggestors = {
            "cancer_type_suggest": weighted_cancer_types,
            "hugo_symbol_suggest": {"input": list(set(self.genes))},
            # "<gene> any" entries are searchable but are not offered as suggestions.
            "variant_suggest": [i for i in weighted_variants['variants'] if not i['input'].endswith('any')],
            "wildtype_suggest": weighted_variants['wts'],
            "cnv_suggest": weighted_variants['cnvs'],
            "sv_suggest": weighted_variants['svs'],
            "protocol_no_suggest": {'input': self.summary['protocol_number']},
            "disease_center_suggest": {
                'input': [i.replace('(', '').replace(')', '') for i in self.summary['disease_center'].split()],
                'output': self.summary['disease_center']
            },
            'disease_status_suggest': {'input': self.summary['disease_status']},
            'drug_suggest': {'input': [i.title() for i in self.summary['drugs']]},
            'investigator_suggest': self._get_investigator_suggest(self.summary['investigator'],
                                                                   self.summary['dfci_investigator']),
            'mmr_status_suggest': {'input': self.summary['mmr_status'] +
                                   self.summary['ms_status'] +
                                   self.summary['mutational_signatures']},
            'nct_number_suggest': {'input': self.summary['nct_number']}
        }

        searchers = {
            "tumor_types": list(set(self._get_tumor_types_search(weighted_cancer_types))),
            "genes": list(set(self.genes)),
            "variants": list(set([i['input'] for i in weighted_variants['variants']])),
            "wildtype_genes": list(set([i['input'] for i in weighted_variants['wts']])),
            "cnv_genes": list(set([i['input'] for i in weighted_variants['cnvs']])),
            "sv_genes": list(set([i['input'] for i in weighted_variants['svs']])),
            "exclusion_genes": list(set(self.vdict['exclusions'])),
            "protocol_no": self.summary["protocol_number"],
            "drugs": self.summary["drugs"],
            "age": self.summary["age_summary"],
            "phase": self.summary["phase_summary"],
            "disease_status": self.summary["disease_status"],
            "nct_number": self.summary["nct_number"],
            "disease_center": self.summary["disease_center"],
            "mmr_status": self.summary["mmr_status"] + self.summary['mutational_signatures'],
            "ms_status": self.summary["ms_status"],
            "mutational_signatures": self.summary["mutational_signatures"],
            "investigator": [i['output'] for i in suggestors['investigator_suggest']],
            "short_title": self.summary["short_title"]
        }

        return suggestors, searchers, parse_primary_cancer_types(self.cancer_type_dict['primary_cancer_types'])