Пример #1
0
def build_trial_elasticsearch_fields(items):
    """
    Create the '_summary', '_suggest' and '_elasticsearch' fields on trial documents.

    For each trial the trial tree is built with MatchEngine, its nodes are
    sorted into genomic, clinical and other criteria, and those are passed
    to Summary and Autocomplete. Every document is modified in place.

    :param items: Iterable of trial documents (dicts).
    :return: None
    """
    # get database connection.
    db = database.get_db()

    # protocol numbers are collected while looping so that `items` is only
    # traversed once and the documents are not touched just for logging.
    protocol_numbers = []

    # loop over each item.
    for item in items:

        # build tree.
        me = MatchEngine(db)
        _, trial_tree = me.create_trial_tree(item, no_validate=True)

        # look at every node.
        genomic = {}
        clinical = {}
        other = {}
        for n in trial_tree.nodes():
            node = trial_tree.nodes[n]

            # nodes without an id cannot be attributed to anything; skip them.
            if 'node_id' not in node:
                continue

            node_id = node['node_id']

            # look for multi-level nodes (right now it is only match).
            if 'match_tree' in node:
                # compress categories.
                mt = node['match_tree']
                for x in mt:
                    criterion = mt.nodes[x]
                    if criterion['type'] == 'genomic':
                        insert_data_genomic(genomic, criterion['value'], node_id)
                    elif criterion['type'] == 'clinical':
                        insert_data_clinical(clinical, criterion['value'], node_id)

            # add the other nodes.
            insert_data_other(trial_tree, node_id, n, other)

        # create _summary, _suggest, and _elasticsearch fields
        summary = Summary(clinical, genomic, other, trial_tree)
        item['_summary'] = summary.create_summary(item)

        autocomplete = Autocomplete(item)
        item['_suggest'], item['_elasticsearch'], item['_summary']['primary_tumor_types'] = \
            autocomplete.add_autocomplete()

        # .get + lazy %-formatting: a trial without (or with a non-string)
        # protocol number must not abort the whole batch at the log call.
        protocol_no = item.get('protocol_no')
        protocol_numbers.append(protocol_no)
        logging.info("trial inserted %s", protocol_no)

    logging.info("inserting trials to elasticsearch %s", protocol_numbers)
Пример #2
0
    def test_extract_hr_status(self):
        """The match clause at index 5 of the first step yields the expected hormone receptor statuses."""
        engine = MatchEngine(get_db())
        clause = trial['treatment_list']['step'][0]['match'][5]
        parser = ParseMatchTree(engine.create_match_tree(clause))
        statuses = parser.extract_hr_status()

        # order is irrelevant, only the set of statuses matters
        expected = ['HER2 Negative', 'ER Negative', 'PR Positive']
        assert sorted(statuses) == sorted(expected)
Пример #3
0
    def test_extract_signatures(self):
        """The match clause at index 3 of the first step yields MMR-D and MSI-H signatures."""
        engine = MatchEngine(get_db())
        clause = trial['treatment_list']['step'][0]['match'][3]
        parser = ParseMatchTree(engine.create_match_tree(clause))
        extracted = parser.extract_signatures()

        # the first two entries of the result are checked, in that order
        first, second = extracted[0], extracted[1]
        assert 'MMR-D' in first
        assert 'MSI-H' in second
Пример #4
0
    def test_trial_summary(self):
        """
        Check that Summary.create_summary yields every expected summary field,
        first for the full trial fixture and then again after several
        top-level fields have been removed from it.
        """
        import copy

        # Work on a private copy: the deletions further down previously hit
        # the shared module-level fixture, so anything using it afterwards
        # (including a second run of this test) saw a stripped trial.
        trial_doc = copy.deepcopy(on_trial)
        trial_doc['_genomic'] = {'hugo_symbol': [{'value': 'TEST'}]}
        trial_doc['_clinical'] = {'disease_status': [{'value': ['TEST']}]}
        other = {'management_group': [{'value': 'TEST'}]}

        # create trial tree
        db = get_db()
        me = MatchEngine(db)
        _, trial_tree = me.create_trial_tree(trial_doc, no_validate=True)

        # validate summary
        summary = Summary(trial_doc['_clinical'],
                          trial_doc['_genomic'],
                          other,
                          trial_tree)
        item = summary.create_summary(trial_doc)
        status_fields = ['drugs', 'genes', 'tumor_types', 'sponsor',
                         'phase_summary', 'accrual_goal', 'investigator', 'age_summary', 'protocol_number',
                         'disease_status', 'nct_number', 'disease_center', 'short_title',
                         'hormone_receptor_status']

        for field in status_fields:
            assert field in item, self._debug(item, field)

            # these fields are allowed to be present but empty
            if field not in ['dfci_investigator', 'hormone_receptor_status']:
                assert item[field], '%s| %s' % (field, item)

        # remove all fields and validate that the summary will not error
        removable_fields = (
            'age',
            'phase',
            'nct_id',
            'protocol_no',
            'principal_investigator',
            'cancer_center_accrual_goal_upper',
            'site_list',
            'sponsor_list',
            'drug_list',
            'staff_list',
        )
        for key in removable_fields:
            trial_doc.pop(key, None)

        _, trial_tree = me.create_trial_tree(trial_doc, no_validate=True)
        summary = Summary(trial_doc['_clinical'],
                          trial_doc['_genomic'],
                          other,
                          trial_tree)
        item = summary.create_summary(trial_doc)

        for field in status_fields:
            assert field in item, self._debug(item, field)
Пример #5
0
    def test_extract_genes(self):
        """Gene extraction on match clauses 0 and 2 of the first step."""
        engine = MatchEngine(get_db())

        def genes_for(clause):
            # build the match tree for one clause and pull the genes out of it
            return ParseMatchTree(engine.create_match_tree(clause)).extract_genes()

        genes = genes_for(trial['treatment_list']['step'][0]['match'][0])
        for expected in ('BRAF', 'KRAS'):
            assert expected in genes, genes
        assert 'EGFR' not in genes
        assert 'test' not in genes, genes
        assert len(genes) == 2, genes

        genes = genes_for(trial['treatment_list']['step'][0]['match'][2])
        assert 'BRAF' not in genes, genes
Пример #6
0
    def __init__(self, item):
        """
        Prepare the accumulators used to build ElasticSearch autocomplete data.

        :param item: Trial document; two keys are read:
                    - _summary:       stored as self.summary
                    - treatment_list: stored as self.treatment_list
        """
        self.summary = item['_summary']
        self.treatment_list = item['treatment_list']

        # one independent, initially empty list per variant category
        variant_categories = ('variants', 'wts', 'svs', 'cnvs', 'exclusions')
        self.vdict = {category: [] for category in variant_categories}

        self.genes = []
        self.cancer_type_dict = {}
        self.m = MatchEngine(get_db())
Пример #7
0
    def _get_signatures(self, item):
        """
        Collect hormone receptor statuses and signature values for a trial.

        Visits every step, each arm of a step and each dose level of an arm.
        Wherever match criteria are present, the first match clause is parsed
        and the results are appended to self.mmr, self.ms, self.sigs and
        self.hr.

        :param item: Trial document; item['treatment_list']['step'] is read.
        """

        m = MatchEngine(get_db())

        def collect(node):
            # Parse the first match clause of a step/arm/dose level, if any.
            # A missing or empty 'match' entry contributes nothing.
            matches = node.get('match')
            if not matches:
                return
            pmt = ParseMatchTree(m.create_match_tree(matches[0]))
            signatures = pmt.extract_signatures()
            self.mmr.extend(signatures[0])
            self.ms.extend(signatures[1])
            self.sigs.extend(signatures[2])
            self.hr.extend(pmt.extract_hr_status())

        for step in item['treatment_list']['step']:
            collect(step)
            for arm in step.get('arm') or []:
                collect(arm)
                for dose in arm.get('dose_level') or []:
                    collect(dose)
Пример #8
0
    def test_ms_status(self):
        """The summary exposes 'mmr_status' and 'ms_status' for the respective genomic criteria."""
        on_trial['_clinical'] = {}
        on_trial['_genomic'] = {'mmr_status': [{'value': 'MMR-Proficient'}]}
        engine = MatchEngine(get_db())

        def summarize():
            # rebuild the tree and the summary from the current state of on_trial
            _, tree = engine.create_trial_tree(on_trial, no_validate=True)
            builder = Summary(on_trial['_clinical'], on_trial['_genomic'], {}, tree)
            return builder.create_summary(on_trial)

        result = summarize()
        assert 'mmr_status' in result, self._debug(result, 'mmr_status')

        on_trial['_genomic'] = {'ms_status': [{'value': 'MSI-H'}]}
        result = summarize()
        assert 'ms_status' in result, self._debug(result, 'ms_status')
Пример #9
0
    def test_extract_cancer_types(self):
        """Cancer type extraction for a specific diagnosis, the solid bucket and the liquid bucket."""

        def cancer_types_for(tree):
            # a fresh engine is used for every example tree
            engine = MatchEngine(get_db())
            return ParseMatchTree(engine.create_match_tree(tree)).extract_cancer_types()

        # specific diagnosis
        found = cancer_types_for(match_tree_example)
        expected_diagnoses = ['Ocular Melanoma']
        assert sorted(found['diagnoses']) == sorted(expected_diagnoses), found['diagnoses']

        expected_expanded = ['Ocular Melanoma', 'Uveal Melanoma', 'Conjunctival Melanoma']
        assert sorted(found['cancer_types_expanded']) == sorted(expected_expanded), \
            found['cancer_types_expanded']

        assert sorted(found['excluded_cancer_types']) == [], found['excluded_cancer_types']
        assert found['primary_cancer_types'] == ['Eye'], found['primary_cancer_types']

        # all solid tumors
        found = cancer_types_for(match_tree_example2)
        assert sorted(found['diagnoses']) == sorted(['_SOLID_']), found['diagnoses']
        assert 'Acute Lymphoid Leukemia' not in found['cancer_types_expanded'], \
            found['cancer_types_expanded']
        assert sorted(found['excluded_cancer_types']) == [], found['excluded_cancer_types']
        assert found['primary_cancer_types'] == ['All Solid Tumors'], found['primary_cancer_types']

        # all liquid tumors
        found = cancer_types_for(match_tree_example3)
        assert sorted(found['diagnoses']) == sorted(['_LIQUID_']), found['diagnoses']
        assert 'Acute Lymphoid Leukemia' in found['cancer_types_expanded'], \
            found['cancer_types_expanded']
        assert sorted(found['excluded_cancer_types']) == [], found['excluded_cancer_types']
        assert found['primary_cancer_types'] == ['All Liquid Tumors'], found['primary_cancer_types']
Пример #10
0
    def test_extract_variants(self):
        """Variant extraction on match clauses 0, 1, 2 and 4 of the first step."""
        engine = MatchEngine(get_db())

        def variants_for(index):
            # parse the match clause at the given position of the first step
            clause = trial['treatment_list']['step'][0]['match'][index]
            return ParseMatchTree(engine.create_match_tree(clause)).extract_variants()

        def expect(found, category, members, size=None):
            # every member must be present; the size is only checked when given
            for member in members:
                assert member in found[category]
            if size is not None:
                assert len(found[category]) == size

        found = variants_for(0)
        expect(found, 'variants', ['BRAF V600E', 'BRAF V600K', 'KRAS any'])
        expect(found, 'wts', ['EGFR wt'])
        expect(found, 'variants', [], size=3)
        expect(found, 'wts', [], size=1)

        found = variants_for(1)
        expect(found, 'cnvs', ['PTEN CNV'])
        expect(found, 'svs', ['BRCA1 SV'])
        expect(found, 'exclusions', ['BRAF V600'])
        expect(found, 'variants', [], size=0)
        expect(found, 'cnvs', [], size=1)
        expect(found, 'svs', [], size=1)
        expect(found, 'wts', [], size=0)
        expect(found, 'exclusions', [], size=1)

        found = variants_for(2)
        assert 'BRAF V600E' not in found['variants']
        expect(found, 'variants', [], size=0)
        expect(found, 'wts', [], size=0)
        expect(found, 'exclusions', ['BRAF V600E'])

        found = variants_for(4)
        expect(found, 'variants', ['BRAF V600K', 'EGFR any'], size=2)
        expect(found, 'cnvs', ['PTEN CNV'], size=1)
        expect(found, 'exclusions', ['KRAS', 'NRAS'], size=2)
        expect(found, 'wts', ['NTRK1 wt'], size=1)
Пример #11
0
class Autocomplete:
    """Builds the ElasticSearch suggestion and search fields for one trial document."""

    def __init__(self, item):
        """
        Creates data for ElasticSearch's autocomplete index

        :param item: Trial document; two keys are read:
                    - treatment_list: Nested dictionary containing all match criteria
                    - _summary:       Summary dictionary previously stored on the trial
        """
        self.summary = item['_summary']
        self.treatment_list = item['treatment_list']

        self.vdict = {
            'variants': [],
            'wts': [],
            'svs': [],
            'cnvs': [],
            'exclusions': []
        }
        self.genes = []
        self.cancer_type_dict = dict()
        self.m = MatchEngine(get_db())

    @staticmethod
    def _get_cancer_type_weight(cancer_type, hierarchy='default'):
        """
        Sets the weights for ElasticSearch autocompletion on cancer types. Cancer type terms
        are split so that autocomplete suggestions will populate regardless of which word in the
        multi-word cancer type string is initially input. Higher weighted terms will populate the
        top of the autocomplete dropdown list.

        :param cancer_type: Text to display in the autocomplete dropdown list.
        :param hierarchy: Weight class: 'primary', 'default' or 'bucket'. The two
                          "All ... Tumors" buckets are always weighted as 'bucket'.
        :return: Dictionary specifying ElasticSearch rules.
        :raises KeyError: If hierarchy is not one of the known weight classes.
        """

        weight_dict = {'primary': 10, 'default': 5, 'bucket': 20}
        if cancer_type in ('All Solid Tumors', 'All Liquid Tumors'):
            hierarchy = 'bucket'

        # the full text plus every word longer than three characters
        long_words = [word for word in cancer_type.split() if len(word) > 3]
        return {
            'input': list(set([cancer_type] + long_words)),
            'output': cancer_type,
            'weight': weight_dict[hierarchy]
        }

    @staticmethod
    def _get_variants_weight(variant, esrule='variants'):
        """
        Sets the weights for ElasticSearch autocompletion on gene variants. Higher weighted terms will populate the
        top of the autocomplete dropdown list.

        :param variant: Text to display in the autocomplete dropdown list.
        :param esrule: Type of variant: 'variants', 'wts', 'svs' or 'cnvs'.
        :return: Dictionary specifying ElasticSearch rules.
        :raises KeyError: If esrule is not one of the known variant types.
        """

        weight_dict = {
            'variants': 1,
            'wts': 5,
            'svs': 3,
            'cnvs': 3
        }
        return {'input': variant, 'weight': weight_dict[esrule]}

    @staticmethod
    def _get_investigator_suggest(investigator, dfci_investigator):
        """
        Creates the investigator suggestions from the _summary field of the trial collection

        :param investigator: Investigator name, either a single name or "Last, First".
                             None is treated like an empty string.
        :param dfci_investigator: Optional dictionary with 'first_name' and/or 'last_name'.
        :return: List with one suggestion for the investigator and, when the DFCI
                 investigator has a different non-empty display name, a second one.
        """

        if investigator is None:
            investigator = ''

        iin = []
        iout = ''
        ispl = [i.strip() for i in investigator.split(',')]
        if len(ispl) == 1:
            iin = [ispl[0]]
            iout = investigator
        elif len(ispl) >= 2:
            # "Last, First" is displayed as "First Last"
            iin = [ispl[0], ispl[1]]
            iout = '%s %s' % (ispl[1], ispl[0])

        dfci_in = []
        if dfci_investigator is not None:
            for key in ('first_name', 'last_name'):
                name = (dfci_investigator.get(key) or '').strip()
                if name:
                    dfci_in.append(name)

        # Joining the cleaned parts gives the display name without stray
        # whitespace, so the duplicate check below compares like with like
        # even when only one of the two names is present.
        dfci_out = ' '.join(dfci_in)

        inv_suggest = [{
            'input': [i for i in iin if i != ''],
            'output': iout
        }]
        if dfci_out != iout and dfci_out != '':
            inv_suggest.append({
                'input': dfci_in,
                'output': dfci_out
            })

        return inv_suggest

    @staticmethod
    def _get_tumor_types_search(ct_suggest):
        """
        Maps special cancer type text output to the values stored in the ElasticSearch index.

        :param ct_suggest: Cancer type suggestions; the 'output' of each one is read.
        :return: Cancer type text stored in the ElasticSearch index, which we will query.
        :raises KeyError: If a suggestion has no 'output' entry.
        """

        index_values = {
            'All Solid Tumors': '_SOLID_',
            'All Liquid Tumors': '_LIQUID_'
        }
        return [index_values.get(ct['output'], ct['output']) for ct in ct_suggest]

    def _iter_match_clauses(self):
        """
        Yield the first match clause of every step, arm and dose level that has one.

        Order: a step, then each of its arms, each arm followed by its dose
        levels. Missing or empty 'match', 'arm' and 'dose_level' entries are
        skipped.
        """

        for step in self.treatment_list['step']:
            if step.get('match'):
                yield step['match'][0]

            for arm in step.get('arm') or []:
                if arm.get('match'):
                    yield arm['match'][0]

                for dose in arm.get('dose_level') or []:
                    if dose.get('match'):
                        yield dose['match'][0]

    def _extract_data_from_match(self, match):
        """
        Extract Cancer Type, Gene, and Variant data from the given match tree

        Results are accumulated on self.cancer_type_dict (without duplicates),
        self.genes and self.vdict.

        :param match: One match clause of the treatment list.
        """

        g = self.m.create_match_tree(match)
        pmt = ParseMatchTree(g)
        for key, value_list in pmt.extract_cancer_types().items():
            known = self.cancer_type_dict.setdefault(key, [])
            for value in value_list:
                # keep first-seen order, drop repeats
                if value not in known:
                    known.append(value)
        self.genes.extend(pmt.extract_genes())
        vdict_tmp = pmt.extract_variants()
        for k, v in self.vdict.items():
            v.extend(vdict_tmp[k])

    def add_autocomplete(self):
        """
        Builds the autocomplete and search data for the trial.

        Every match clause found in the treatment list is parsed first; the
        collected cancer types, genes and variants are then combined with the
        trial summary.

        :return: Tuple of three elements:
                 - dictionary of suggestion entries,
                 - dictionary of search fields,
                 - result of parse_primary_cancer_types for the primary cancer types.
        """

        for match in self._iter_match_clauses():
            self._extract_data_from_match(match)

        # guarantee every key read below exists, also when no match clause
        # was found or the parser returned only some of the keys
        for key in ('diagnoses', 'primary_cancer_types',
                    'cancer_types_expanded', 'excluded_cancer_types'):
            self.cancer_type_dict.setdefault(key, [])

        primary_cancer_types = self.cancer_type_dict['primary_cancer_types']

        weighted_cancer_types = [
            self._get_cancer_type_weight(ct, hierarchy='primary')
            for ct in primary_cancer_types
        ]
        # expanded types that are not already listed as primary
        for ct in set(self.cancer_type_dict['cancer_types_expanded']) - set(primary_cancer_types):
            weighted_cancer_types.append(self._get_cancer_type_weight(ct, hierarchy='default'))

        weighted_variants = {}
        for key in ['variants', 'cnvs', 'svs', 'wts']:
            weighted_variants[key] = [
                self._get_variants_weight(v, esrule=key) for v in set(self.vdict[key])
            ]

        suggestors = {
            "cancer_type_suggest": weighted_cancer_types,
            "hugo_symbol_suggest": {"input": list(set(self.genes))},
            # "<gene> any" entries are searchable but not offered as suggestions
            "variant_suggest": [i for i in weighted_variants['variants'] if not i['input'].endswith('any')],
            "wildtype_suggest": weighted_variants['wts'],
            "cnv_suggest": weighted_variants['cnvs'],
            "sv_suggest": weighted_variants['svs'],
            "protocol_no_suggest": {'input': self.summary['protocol_number']},
            "disease_center_suggest": {
                'input': [i.replace('(', '').replace(')', '') for i in self.summary['disease_center'].split()],
                'output': self.summary['disease_center']
            },
            'disease_status_suggest': {'input': self.summary['disease_status']},
            'drug_suggest': {'input': [i.title() for i in self.summary['drugs']]},
            'investigator_suggest': self._get_investigator_suggest(self.summary['investigator'],
                                                                   self.summary.get('dfci_investigator')),
            'mmr_status_suggest': {'input': self.summary['mmr_status'] + self.summary['ms_status'] + self.summary['mutational_signatures']},
            'nct_number_suggest': {'input': self.summary['nct_number']}
        }

        searchers = {
            "tumor_types": list(set(self._get_tumor_types_search(weighted_cancer_types))),
            "genes": list(set(self.genes)),
            "variants": list({i['input'] for i in weighted_variants['variants']}),
            "wildtype_genes": list({i['input'] for i in weighted_variants['wts']}),
            "cnv_genes": list({i['input'] for i in weighted_variants['cnvs']}),
            "sv_genes": list({i['input'] for i in weighted_variants['svs']}),
            "exclusion_genes": list(set(self.vdict['exclusions'])),
            "protocol_no": self.summary["protocol_number"],
            "drugs": self.summary["drugs"],
            "age": self.summary["age_summary"],
            "phase": self.summary["phase_summary"],
            "disease_status": self.summary["disease_status"],
            "nct_number": self.summary["nct_number"],
            "disease_center": self.summary["disease_center"],
            "mmr_status": self.summary["mmr_status"] + self.summary['mutational_signatures'],
            "ms_status": self.summary["ms_status"],
            "mutational_signatures": self.summary["mutational_signatures"],
            "investigator": [i['output'] for i in suggestors['investigator_suggest']],
            "short_title": self.summary["short_title"]
        }

        return suggestors, searchers, parse_primary_cancer_types(primary_cancer_types)