Пример #1
0
    def test_run_query(self):
        """Run one clinical and two genomic queries and check what comes back."""

        # Rebuild the MatchEngine here so that the set of sample ids it sees
        # includes the documents posted by the unit test setUp.
        self.me = MatchEngine(self.db)

        # clinical criteria: exactly one sample is returned and it is the test sample
        clinical_node = {
            'type': 'clinical',
            'value': {
                'ONCOTREE_PRIMARY_DIAGNOSIS': 'Adrenal Gland',
                'AGE_NUMERICAL': '>=18'
            }
        }
        found, _ = self.me.run_query(clinical_node)
        assert len(found) == 1, len(found)
        assert self.sample_id in found

        # genomic criteria '!BRAF': nine samples are returned, none is the test sample
        not_braf_node = {'type': 'genomic', 'value': {'HUGO_SYMBOL': '!BRAF'}}
        found, _ = self.me.run_query(not_braf_node)
        assert len(found) == 9, len(found)
        assert self.sample_id not in found

        # after self.add_genomic_v2(), inspect the first match of a WHSC1 query
        self.add_genomic_v2()
        whsc1_node = {'type': 'genomic', 'value': {'HUGO_SYMBOL': 'WHSC1'}}
        _, hits = self.me.run_query(whsc1_node)
        first_hit = hits[0]
        assert 'actionability' in first_hit
        assert first_hit['mmr_status'] == 'Proficient (MMR-P / MSS)'
Пример #2
0
    def test_extract_hr_status(self):
        """Check the hormone receptor statuses parsed from match entry 5 of the first step."""

        expected = ['HER2 Negative', 'ER Negative', 'PR Positive']

        engine = MatchEngine(get_db())
        criteria = trial['treatment_list']['step'][0]['match'][5]
        parser = ParseMatchTree(engine.create_match_tree(criteria))
        statuses = parser.extract_hr_status()

        # order is not significant, so both sides are sorted before comparing
        assert sorted(statuses) == sorted(expected)
Пример #3
0
    def test_extract_signatures(self):
        """Check the signature lists parsed from match entry 3 of the first step."""

        engine = MatchEngine(get_db())
        criteria = trial['treatment_list']['step'][0]['match'][3]
        parser = ParseMatchTree(engine.create_match_tree(criteria))
        signatures = parser.extract_signatures()

        # element 0 must contain MMR-D and element 1 must contain MSI-H
        assert 'MMR-D' in signatures[0]
        assert 'MSI-H' in signatures[1]
Пример #4
0
    def test_extract_hr_status(self):
        """Check the hormone receptor statuses parsed from match entry 5 of the first step."""

        expected = ['HER2 Negative', 'ER Negative', 'PR Positive']

        engine = MatchEngine(get_db())
        criteria = trial['treatment_list']['step'][0]['match'][5]
        parser = ParseMatchTree(engine.create_match_tree(criteria))
        statuses = parser.extract_hr_status()

        # order is not significant, so both sides are sorted before comparing
        assert sorted(statuses) == sorted(expected)
Пример #5
0
    def test_extract_signatures(self):
        """Check the signature lists parsed from match entry 3 of the first step."""

        engine = MatchEngine(get_db())
        criteria = trial['treatment_list']['step'][0]['match'][3]
        parser = ParseMatchTree(engine.create_match_tree(criteria))
        signatures = parser.extract_signatures()

        # element 0 must contain MMR-D and element 1 must contain MSI-H
        assert 'MMR-D' in signatures[0]
        assert 'MSI-H' in signatures[1]
Пример #6
0
def run_matchengine():
    """
    Runs the match engine against the database returned by database.get_db()

    Takes no arguments and returns nothing; it builds a MatchEngine on that
    database and calls its find_trial_matches method.
    """

    engine = MatchEngine(database.get_db())
    engine.find_trial_matches()
Пример #7
0
def run_matchengine():
    """
    Runs the match engine against the database returned by database.get_db()

    Takes no arguments and returns nothing; it builds a MatchEngine on that
    database and calls its find_trial_matches method.
    """

    engine = MatchEngine(database.get_db())
    engine.find_trial_matches()
Пример #8
0
def trial_insert(items):
    """
    Adds the _summary, _suggest and _elasticsearch fields to every trial.

    :param items: Iterable of trial documents; each one is updated in place
    :return: The same items object that was passed in
    """

    connection = database.get_db()

    for trial_doc in items:

        logging.info("trial inserted")

        # a MatchEngine is created for each trial and builds its tree
        engine = MatchEngine(connection)
        _, trial_tree = engine.create_trial_tree(trial_doc, no_validate=True)

        # criteria gathered while walking the tree
        genomic, clinical, other = {}, {}, {}
        for n in trial_tree.nodes():
            attrs = trial_tree.node[n]

            # nodes that carry no node_id are ignored
            if 'node_id' not in attrs:
                continue
            node_id = attrs['node_id']

            # a nested match tree contributes genomic and clinical criteria
            if 'match_tree' in attrs:
                mt = attrs['match_tree']
                for x in mt:
                    entry = mt.node[x]
                    if entry['type'] == 'genomic':
                        insert_data_genomic(genomic, entry['value'], node_id)
                    if entry['type'] == 'clinical':
                        insert_data_clinical(clinical, entry['value'], node_id)

            # every node with an id is also passed to insert_data_other
            insert_data_other(trial_tree, node_id, n, other)

        # _summary is stored first because Autocomplete is built from the trial
        # after it has been set
        summary = Summary(clinical, genomic, other, trial_tree)
        trial_doc['_summary'] = summary.create_summary(trial_doc)

        suggest, elastic, primary_types = Autocomplete(trial_doc).add_autocomplete()
        trial_doc['_suggest'] = suggest
        trial_doc['_elasticsearch'] = elastic
        trial_doc['_summary']['primary_tumor_types'] = primary_types

    return items
Пример #9
0
    def test_extract_variants(self):
        """Check the variant categories parsed from match entries 0, 1, 2 and 4 of the first step."""

        engine = MatchEngine(get_db())

        def parse(index):
            # build the match tree for one match entry and extract its variants
            criteria = trial['treatment_list']['step'][0]['match'][index]
            return ParseMatchTree(engine.create_match_tree(criteria)).extract_variants()

        # match entry 0
        found = parse(0)
        assert 'BRAF V600E' in found['variants']
        assert 'BRAF V600K' in found['variants']
        assert 'KRAS any' in found['variants']
        assert 'EGFR wt' in found['wts']
        assert len(found['variants']) == 3
        assert len(found['wts']) == 1

        # match entry 1
        found = parse(1)
        assert 'PTEN CNV' in found['cnvs']
        assert 'BRCA1 SV' in found['svs']
        assert 'BRAF V600' in found['exclusions']
        assert len(found['variants']) == 0
        assert len(found['cnvs']) == 1
        assert len(found['svs']) == 1
        assert len(found['wts']) == 0
        assert len(found['exclusions']) == 1

        # match entry 2
        found = parse(2)
        assert 'BRAF V600E' not in found['variants']
        assert len(found['variants']) == 0
        assert len(found['wts']) == 0
        assert 'BRAF V600E' in found['exclusions']

        # match entry 4
        found = parse(4)
        assert 'BRAF V600K' in found['variants']
        assert 'EGFR any' in found['variants']
        assert len(found['variants']) == 2
        assert 'PTEN CNV' in found['cnvs']
        assert len(found['cnvs']) == 1
        assert 'KRAS' in found['exclusions']
        assert 'NRAS' in found['exclusions']
        assert len(found['exclusions']) == 2
        assert 'NTRK1 wt' in found['wts']
        assert len(found['wts']) == 1
Пример #10
0
def trial_insert(items):
    """
    Adds the _summary, _suggest and _elasticsearch fields to every trial.

    :param items: Iterable of trial documents; each one is updated in place
    :return: The same items object that was passed in
    """

    connection = database.get_db()

    for trial_doc in items:

        logging.info("trial inserted")

        # a MatchEngine is created for each trial and builds its tree
        engine = MatchEngine(connection)
        _, trial_tree = engine.create_trial_tree(trial_doc, no_validate=True)

        # criteria gathered while walking the tree
        genomic, clinical, other = {}, {}, {}
        for n in trial_tree.nodes():
            attrs = trial_tree.node[n]

            # nodes that carry no node_id are ignored
            if 'node_id' not in attrs:
                continue
            node_id = attrs['node_id']

            # a nested match tree contributes genomic and clinical criteria
            if 'match_tree' in attrs:
                mt = attrs['match_tree']
                for x in mt:
                    entry = mt.node[x]
                    if entry['type'] == 'genomic':
                        insert_data_genomic(genomic, entry['value'], node_id)
                    if entry['type'] == 'clinical':
                        insert_data_clinical(clinical, entry['value'], node_id)

            # every node with an id is also passed to insert_data_other
            insert_data_other(trial_tree, node_id, n, other)

        # _summary is stored first because Autocomplete is built from the trial
        # after it has been set
        summary = Summary(clinical, genomic, other, trial_tree)
        trial_doc['_summary'] = summary.create_summary(trial_doc)

        suggest, elastic, primary_types = Autocomplete(trial_doc).add_autocomplete()
        trial_doc['_suggest'] = suggest
        trial_doc['_elasticsearch'] = elastic
        trial_doc['_summary']['primary_tumor_types'] = primary_types

    return items
Пример #11
0
    def test_trial_summary(self):
        """
        Check that the trial summary contains every expected field, both for
        the full trial and after a set of top-level keys has been deleted.
        """

        on_trial['_genomic'] = {'hugo_symbol': [{'value': 'TEST'}]}
        on_trial['_clinical'] = {'disease_status': [{'value': ['TEST']}]}
        other = {'management_group': [{'value': 'TEST'}]}

        engine = MatchEngine(get_db())

        def summarize():
            # rebuild the trial tree and the summary from the current on_trial
            _, tree = engine.create_trial_tree(on_trial, no_validate=True)
            builder = Summary(on_trial['_clinical'], on_trial['_genomic'], other, tree)
            return builder.create_summary(on_trial)

        # validate summary
        item = summarize()
        status_fields = [
            'drugs', 'genes', 'tumor_types', 'sponsor', 'phase_summary',
            'accrual_goal', 'investigator', 'age_summary', 'protocol_number',
            'disease_status', 'nct_number', 'disease_center', 'short_title',
            'hormone_receptor_status'
        ]
        may_be_empty = ['dfci_investigator', 'hormone_receptor_status']

        for field in status_fields:
            assert field in item, self._debug(item, field)

            # every field outside may_be_empty must also hold a truthy value
            if field in may_be_empty:
                continue
            assert item[field], '%s| %s' % (field, item)

        # remove these keys and validate that the summary will not error
        removed_keys = (
            'age', 'phase', 'nct_id', 'protocol_no', 'principal_investigator',
            'cancer_center_accrual_goal_upper', 'site_list', 'sponsor_list',
            'drug_list', 'staff_list'
        )
        for key in removed_keys:
            del on_trial[key]

        item = summarize()

        for field in status_fields:
            assert field in item, self._debug(item, field)
Пример #12
0
    def test_extract_cancer_types(self):
        """Check the cancer type dictionary parsed from three example match trees."""

        def parse(example):
            # each example is parsed with its own MatchEngine instance
            engine = MatchEngine(get_db())
            return ParseMatchTree(engine.create_match_tree(example)).extract_cancer_types()

        # first example: a single diagnosis
        info = parse(match_tree_example)

        diagnoses = info['diagnoses']
        assert sorted(diagnoses) == sorted(['Ocular Melanoma']), diagnoses

        expanded = info['cancer_types_expanded']
        assert sorted(expanded) == sorted([
            'Ocular Melanoma', 'Uveal Melanoma', 'Conjunctival Melanoma'
        ]), expanded

        excluded = info['excluded_cancer_types']
        assert sorted(excluded) == [], excluded

        primary = info['primary_cancer_types']
        assert primary == ['Eye'], primary

        # second example: _SOLID_
        info = parse(match_tree_example2)

        diagnoses = info['diagnoses']
        assert sorted(diagnoses) == sorted(['_SOLID_']), diagnoses

        expanded = info['cancer_types_expanded']
        assert 'Acute Lymphoid Leukemia' not in expanded, expanded

        excluded = info['excluded_cancer_types']
        assert sorted(excluded) == [], excluded

        primary = info['primary_cancer_types']
        assert primary == ['All Solid Tumors'], primary

        # third example: _LIQUID_
        info = parse(match_tree_example3)

        diagnoses = info['diagnoses']
        assert sorted(diagnoses) == sorted(['_LIQUID_']), diagnoses

        expanded = info['cancer_types_expanded']
        assert 'Acute Lymphoid Leukemia' in expanded, expanded

        excluded = info['excluded_cancer_types']
        assert sorted(excluded) == [], excluded

        primary = info['primary_cancer_types']
        assert primary == ['All Liquid Tumors'], primary
Пример #13
0
    def test_trial_summary(self):
        """
        Check that the trial summary contains every expected field, both for
        the full trial and after a set of top-level keys has been deleted.
        """

        on_trial['_genomic'] = {'hugo_symbol': [{'value': 'TEST'}]}
        on_trial['_clinical'] = {'disease_status': [{'value': ['TEST']}]}
        other = {'management_group': [{'value': 'TEST'}]}

        engine = MatchEngine(get_db())

        def summarize():
            # rebuild the trial tree and the summary from the current on_trial
            _, tree = engine.create_trial_tree(on_trial, no_validate=True)
            builder = Summary(on_trial['_clinical'], on_trial['_genomic'], other, tree)
            return builder.create_summary(on_trial)

        # validate summary
        item = summarize()
        status_fields = [
            'drugs', 'genes', 'tumor_types', 'sponsor', 'phase_summary',
            'accrual_goal', 'investigator', 'age_summary', 'protocol_number',
            'disease_status', 'nct_number', 'disease_center', 'short_title',
            'hormone_receptor_status'
        ]
        may_be_empty = ['dfci_investigator', 'hormone_receptor_status']

        for field in status_fields:
            assert field in item, self._debug(item, field)

            # every field outside may_be_empty must also hold a truthy value
            if field in may_be_empty:
                continue
            assert item[field], '%s| %s' % (field, item)

        # remove these keys and validate that the summary will not error
        removed_keys = (
            'age', 'phase', 'nct_id', 'protocol_no', 'principal_investigator',
            'cancer_center_accrual_goal_upper', 'site_list', 'sponsor_list',
            'drug_list', 'staff_list'
        )
        for key in removed_keys:
            del on_trial[key]

        item = summarize()

        for field in status_fields:
            assert field in item, self._debug(item, field)
Пример #14
0
    def test_ms_status(self):
        """Check that mmr_status and ms_status genomic criteria each appear in the summary."""

        on_trial['_clinical'] = {}
        on_trial['_genomic'] = {'mmr_status': [{'value': 'MMR-Proficient'}]}
        engine = MatchEngine(get_db())

        def summarize():
            # rebuild the trial tree and the summary from the current on_trial
            _, tree = engine.create_trial_tree(on_trial, no_validate=True)
            builder = Summary(on_trial['_clinical'], on_trial['_genomic'], {}, tree)
            return builder.create_summary(on_trial)

        item = summarize()
        assert 'mmr_status' in item, self._debug(item, 'mmr_status')

        # swap the genomic criteria and summarize again
        on_trial['_genomic'] = {'ms_status': [{'value': 'MSI-H'}]}
        item = summarize()
        assert 'ms_status' in item, self._debug(item, 'ms_status')
Пример #15
0
    def test_extract_genes(self):
        """Check the genes parsed from match entries 0 and 2 of the first step."""

        engine = MatchEngine(get_db())

        def parse(index):
            # build the match tree for one match entry and extract its genes
            criteria = trial['treatment_list']['step'][0]['match'][index]
            return ParseMatchTree(engine.create_match_tree(criteria)).extract_genes()

        # match entry 0: exactly BRAF and KRAS
        found = parse(0)
        assert 'BRAF' in found, found
        assert 'KRAS' in found, found
        assert 'EGFR' not in found
        assert 'test' not in found, found
        assert len(found) == 2, found

        # match entry 2: BRAF must be absent
        found = parse(2)
        assert 'BRAF' not in found, found
Пример #16
0
    def test_extract_genes(self):
        """Check the genes parsed from match entries 0 and 2 of the first step."""

        engine = MatchEngine(get_db())

        def parse(index):
            # build the match tree for one match entry and extract its genes
            criteria = trial['treatment_list']['step'][0]['match'][index]
            return ParseMatchTree(engine.create_match_tree(criteria)).extract_genes()

        # match entry 0: exactly BRAF and KRAS
        found = parse(0)
        assert 'BRAF' in found, found
        assert 'KRAS' in found, found
        assert 'EGFR' not in found
        assert 'test' not in found, found
        assert len(found) == 2, found

        # match entry 2: BRAF must be absent
        found = parse(2)
        assert 'BRAF' not in found, found
Пример #17
0
    def __init__(self, item):
        """
        Prepares the state used to build ElasticSearch's autocomplete data

        :param item: Trial info; two keys are read from it:
                    - _summary:       stored as self.summary
                    - treatment_list: stored as self.treatment_list
        """
        self.summary = item['_summary']
        self.treatment_list = item['treatment_list']

        # one independent empty list per variant category
        categories = ('variants', 'wts', 'svs', 'cnvs', 'exclusions')
        self.vdict = {category: [] for category in categories}

        self.genes = []
        self.cancer_type_dict = None
        self.m = MatchEngine(get_db())
Пример #18
0
    def test_ms_status(self):
        """Check that mmr_status and ms_status genomic criteria each appear in the summary."""

        on_trial['_clinical'] = {}
        on_trial['_genomic'] = {'mmr_status': [{'value': 'MMR-Proficient'}]}
        engine = MatchEngine(get_db())

        def summarize():
            # rebuild the trial tree and the summary from the current on_trial
            _, tree = engine.create_trial_tree(on_trial, no_validate=True)
            builder = Summary(on_trial['_clinical'], on_trial['_genomic'], {}, tree)
            return builder.create_summary(on_trial)

        item = summarize()
        assert 'mmr_status' in item, self._debug(item, 'mmr_status')

        # swap the genomic criteria and summarize again
        on_trial['_genomic'] = {'ms_status': [{'value': 'MSI-H'}]}
        item = summarize()
        assert 'ms_status' in item, self._debug(item, 'ms_status')
Пример #19
0
    def _get_signatures(self, item):
        """
        Creates hormone receptor status and mutational signature summary lists

        Walks every step of the treatment list, every arm of a step and every
        dose level of an arm. For each of those that has a 'match' key, only
        its first match entry is parsed, and the results are appended to
        self.mmr, self.ms, self.sigs and self.hr.

        :param item: Trial document
        """

        m = MatchEngine(get_db())

        def collect(level):
            # Parse the first match entry of one step/arm/dose level, if any,
            # and append what it yields to the summary lists.
            if 'match' not in level:
                return
            pmt = ParseMatchTree(m.create_match_tree(level['match'][0]))
            signatures = pmt.extract_signatures()
            self.mmr.extend(signatures[0])
            self.ms.extend(signatures[1])
            self.sigs.extend(signatures[2])
            self.hr.extend(pmt.extract_hr_status())

        for step in item['treatment_list']['step']:
            collect(step)

            if 'arm' not in step:
                continue

            for arm in step['arm']:
                collect(arm)

                if 'dose_level' in arm:
                    for dose in arm['dose_level']:
                        collect(dose)
Пример #20
0
def match(args):
    """
    Matches all trials in database to patients

    :param args: Parsed command line arguments. The attributes read here are:
                 - mongo_uri:   passed to get_db and to export_results
                 - daemon:      Boolean flag; when true, runs the matchengine once
                                per 24 hours and never exports
                 - json_format: when true, forces the 'json' output format
                 - outpath:     optional output path; its extension selects the
                                format when it is 'json' or 'csv'
    """
    # local import so this function does not depend on the file's import block
    import os

    db = get_db(args.mongo_uri)

    while True:
        me = MatchEngine(db)
        me.find_trial_matches()

        # as a nightly automated daemon, sleep for a day and run again
        if args.daemon:
            time.sleep(86400)   # sleep for 24 hours
            continue

        # Split the extension off the output path. os.path.splitext only looks
        # at the last path component, so './results.json' or 'out.dir/results'
        # keep their directory part; splitting the whole path on '.' did not.
        if args.outpath:
            outpath, extension = os.path.splitext(args.outpath)
        else:
            outpath, extension = './results', ''
        extension = extension[1:]   # drop the leading dot

        # choose output file format; anything but json/csv falls back to csv
        if args.json_format:
            file_format = 'json'
        elif extension in ('json', 'csv'):
            file_format = extension
        else:
            file_format = 'csv'

        # export results, then exit because this was a one-off run
        export_results(args.mongo_uri, file_format, outpath)
        break
Пример #21
0
    def _get_signatures(self, item):
        """
        Creates hormone receptor status and mutational signature summary lists

        Walks every step of the treatment list, every arm of a step and every
        dose level of an arm. For each of those that has a 'match' key, only
        its first match entry is parsed, and the results are appended to
        self.mmr, self.ms and self.hr.

        :param item: Trial document
        """

        m = MatchEngine(get_db())

        def collect(level):
            # Parse the first match entry of one step/arm/dose level, if any,
            # and append what it yields to the summary lists.
            if 'match' not in level:
                return
            pmt = ParseMatchTree(m.create_match_tree(level['match'][0]))
            signatures = pmt.extract_signatures()
            self.mmr.extend(signatures[0])
            self.ms.extend(signatures[1])
            self.hr.extend(pmt.extract_hr_status())

        for step in item['treatment_list']['step']:
            collect(step)

            if 'arm' not in step:
                continue

            for arm in step['arm']:
                collect(arm)

                if 'dose_level' in arm:
                    for dose in arm['dose_level']:
                        collect(dose)
Пример #22
0
    def test_extract_variants(self):
        """Check the variant categories parsed from match entries 0, 1, 2 and 4 of the first step."""

        engine = MatchEngine(get_db())

        def parse(index):
            # build the match tree for one match entry and extract its variants
            criteria = trial['treatment_list']['step'][0]['match'][index]
            return ParseMatchTree(engine.create_match_tree(criteria)).extract_variants()

        # match entry 0
        found = parse(0)
        assert 'BRAF V600E' in found['variants']
        assert 'BRAF V600K' in found['variants']
        assert 'KRAS any' in found['variants']
        assert 'EGFR wt' in found['wts']
        assert len(found['variants']) == 3
        assert len(found['wts']) == 1

        # match entry 1
        found = parse(1)
        assert 'PTEN CNV' in found['cnvs']
        assert 'BRCA1 SV' in found['svs']
        assert 'BRAF V600' in found['exclusions']
        assert len(found['variants']) == 0
        assert len(found['cnvs']) == 1
        assert len(found['svs']) == 1
        assert len(found['wts']) == 0
        assert len(found['exclusions']) == 1

        # match entry 2
        found = parse(2)
        assert 'BRAF V600E' not in found['variants']
        assert len(found['variants']) == 0
        assert len(found['wts']) == 0
        assert 'BRAF V600E' in found['exclusions']

        # match entry 4
        found = parse(4)
        assert 'BRAF V600K' in found['variants']
        assert 'EGFR any' in found['variants']
        assert len(found['variants']) == 2
        assert 'PTEN CNV' in found['cnvs']
        assert len(found['cnvs']) == 1
        assert 'KRAS' in found['exclusions']
        assert 'NRAS' in found['exclusions']
        assert len(found['exclusions']) == 2
        assert 'NTRK1 wt' in found['wts']
        assert len(found['wts']) == 1
Пример #23
0
    def __init__(self, item):
        """
        Prepares the state used to build ElasticSearch's autocomplete data

        :param item: Trial info; two keys are read from it:
                    - _summary:       stored as self.summary
                    - treatment_list: stored as self.treatment_list
        """
        self.summary = item['_summary']
        self.treatment_list = item['treatment_list']

        # one independent empty list per variant category
        categories = ('variants', 'wts', 'svs', 'cnvs', 'exclusions')
        self.vdict = {category: [] for category in categories}

        self.genes = []
        self.cancer_type_dict = None
        self.m = MatchEngine(get_db())
Пример #24
0
    def test_extract_cancer_types(self):
        """Check the cancer type dictionary parsed from three example match trees."""

        def parse(example):
            # each example is parsed with its own MatchEngine instance
            engine = MatchEngine(get_db())
            return ParseMatchTree(engine.create_match_tree(example)).extract_cancer_types()

        # first example: a single diagnosis
        info = parse(match_tree_example)

        diagnoses = info['diagnoses']
        assert sorted(diagnoses) == sorted(['Ocular Melanoma']), diagnoses

        expanded = info['cancer_types_expanded']
        assert sorted(expanded) == sorted([
            'Ocular Melanoma', 'Uveal Melanoma', 'Conjunctival Melanoma'
        ]), expanded

        excluded = info['excluded_cancer_types']
        assert sorted(excluded) == [], excluded

        primary = info['primary_cancer_types']
        assert primary == ['Eye'], primary

        # second example: _SOLID_
        info = parse(match_tree_example2)

        diagnoses = info['diagnoses']
        assert sorted(diagnoses) == sorted(['_SOLID_']), diagnoses

        expanded = info['cancer_types_expanded']
        assert 'Acute Lymphoid Leukemia' not in expanded, expanded

        excluded = info['excluded_cancer_types']
        assert sorted(excluded) == [], excluded

        primary = info['primary_cancer_types']
        assert primary == ['All Solid Tumors'], primary

        # third example: _LIQUID_
        info = parse(match_tree_example3)

        diagnoses = info['diagnoses']
        assert sorted(diagnoses) == sorted(['_LIQUID_']), diagnoses

        expanded = info['cancer_types_expanded']
        assert 'Acute Lymphoid Leukemia' in expanded, expanded

        excluded = info['excluded_cancer_types']
        assert sorted(excluded) == [], excluded

        primary = info['primary_cancer_types']
        assert primary == ['All Liquid Tumors'], primary
Пример #25
0
class TestMatchEngine(TestSetUp):
    """
    Unit tests for MatchEngine: yaml validation, clinical/genomic query preparation, query execution,
    and trial/match tree construction.
    """

    # global vars.
    me = None

    def setUp(self):
        """
        Runs the parent setUp, inserts one _LIQUID_ and one _SOLID_ clinical document, then loads the
        clinical and genomic fixtures through add_clinical / add_genomic.
        """
        super(TestMatchEngine, self).setUp()

        # add clinical collection
        self.db.clinical.insert_many([{
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': onc,
            'SAMPLE_ID': self.sample_id,
            'VITAL_STATUS': 'alive',
            'DFCI_MRN': self.mrn,
            'REPORT_DATE': self.static_date,
            'BIRTH_DATE': self.static_date,
            'GENDER': 'Male'
        } for onc in ['_LIQUID_', '_SOLID_']])

        self.add_clinical()
        self.add_genomic()

    def tearDown(self):
        """Drops the clinical and genomic collections populated by setUp."""
        self.db.clinical.drop()
        self.db.genomic.drop()

    def test_validate_yaml_format(self):
        """validate_yaml_format returns status 1 for 00-000.yml and status 0 plus parsed data for 00-001.yml."""

        # Read yaml file
        test_inp = read_file(os.path.join(YAML_DIR, '00-000.yml'))

        # Parse yaml document, The status should be 1 as validation should fail
        status, data = self.me.validate_yaml_format(test_inp)
        assert status == 1

        # Parse yaml document, The status should be 0 as validation should pass
        test_inp = read_file(os.path.join(YAML_DIR, '00-001.yml'))
        status, data = self.me.validate_yaml_format(test_inp)
        assert status == 0

        # Assert the protocol id is correctly annotated
        assert data['protocol_no'] == '00-001'
        # plain integer literal: the former `00001` spelling is a syntax error on Python 3
        assert data['protocol_id'] == 1

    def test_validate_yaml_data(self):
        """validate_yaml_data returns no errors for 00-001.yml and a 'required field' error for 00-002.yml."""

        # Read yaml file
        test_inp = read_file(os.path.join(YAML_DIR, '00-001.yml'))
        status, data = self.me.validate_yaml_format(test_inp)

        # Validate the schema for yaml document
        errors = self.me.validate_yaml_data(data)

        # Assert that there are no errors
        assert len(errors) == 0

        test_inp = read_file(os.path.join(YAML_DIR, '00-002.yml'))
        status, data = self.me.validate_yaml_format(test_inp)

        # Assert Schema check fails
        errors = self.me.validate_yaml_data(data)
        assert errors['protocol_id'][0] == 'required field'

        # assert we don't need a match clause at root.
        test_inp = read_file(os.path.join(YAML_DIR, '00-003.yml'))
        status, data = self.me.validate_yaml_format(test_inp)
        assert status == 0

    def test_run_query(self):
        """run_query returns the expected sample ids for clinical and genomic nodes."""

        # reinstantiate MatchEngine so that the set of all sample ids in the database includes the documents that were
        # posted by the unit test setUp
        self.me = MatchEngine(self.db)

        # define a clinical node
        node = {'type': 'clinical', 'value': {'ONCOTREE_PRIMARY_DIAGNOSIS': 'Adrenal Gland', 'AGE_NUMERICAL': '>=18'}}

        # run query
        result, matches = self.me.run_query(node)

        # assert exactly 1 sample matches, and that it is the fixture sample
        assert len(result) == 1, len(result)
        assert self.sample_id in result

        # define a genomic criteria
        node = {'type': 'genomic', 'value': {'HUGO_SYMBOL': '!BRAF'}}

        # run query
        result, matches = self.me.run_query(node)

        # assert 9 samples match the negated criteria, and that the fixture sample is excluded
        assert len(result) == 9, len(result)
        assert self.sample_id not in result

        # the v2 genomic fixture should expose the extra match fields
        self.add_genomic_v2()
        node = {'type': 'genomic', 'value': {'HUGO_SYMBOL': 'WHSC1'}}
        result, matches = self.me.run_query(node)
        assert 'actionability' in matches[0]
        assert matches[0]['mmr_status'] == 'Proficient (MMR-P / MSS)'

    def test_prepare_clinical_criteria(self):
        """prepare_clinical_criteria expands diagnoses and converts age limits into BIRTH_DATE comparisons."""

        onc = 'ONCOTREE_PRIMARY_DIAGNOSIS'
        oncname = 'ONCOTREE_PRIMARY_DIAGNOSIS_NAME'

        # create clinical criteria
        item = {'AGE_NUMERICAL': '>=18', onc: 'Melanoma'}

        # convert to mongo query
        c = self.me.prepare_clinical_criteria(item)

        # assert 'Melanoma' expands to 8 diagnosis names
        assert len(c[oncname]['$in']) == 8

        # check the BIRTH_DATE mongo query uses a "less than or equal" search
        # (list(...) instead of .keys()[0] so the lookup also works with Python 3 dict views)
        assert list(c['BIRTH_DATE'])[0] == '$lte'

        # check "!"
        item = {'AGE_NUMERICAL': '<=18', onc: '!Melanoma'}
        c = self.me.prepare_clinical_criteria(item)
        assert list(c['BIRTH_DATE'])[0] == '$gte'
        assert len(c[oncname]['$nin']) == 8

        # check with a list of diagnoses
        item = {'AGE_NUMERICAL': '>=18', onc: ['!Melanoma', '!Glioblastoma', 'Pheochromocytoma', 'Astrocytoma']}
        c = self.me.prepare_clinical_criteria(item)
        assert list(c['BIRTH_DATE'])[0] == '$lte'
        assert '$nin' in c[oncname]
        assert '$in' in c[oncname]
        assert len(c[oncname]['$in']) == 2
        assert len(c[oncname]['$nin']) == 12

        # check _SOLID_ && _LIQUID_
        liquid_item = {'AGE_NUMERICAL': '>=18', onc: '_LIQUID_'}
        solid_item = {'AGE_NUMERICAL': '>=18', onc: '_SOLID_'}
        liqc = self.me.prepare_clinical_criteria(liquid_item)
        solc = self.me.prepare_clinical_criteria(solid_item)
        assert len(liqc[oncname]['$in']) == 51, len(liqc[oncname]['$in'])
        assert len(solc[oncname]['$in']) == 561, len(solc[oncname]['$in'])

    def test_prepare_genomic_criteria(self):
        """prepare_genomic_criteria handles negation, protein change and the wildtype clause."""

        # create a genomic criteria
        item = {'HUGO_SYMBOL': '!KRAS', 'PROTEIN_CHANGE': 'p.V600E'}

        # convert to mongo query
        c, neg, _ = self.me.prepare_genomic_criteria(item)

        # check ! symbol
        assert c['$and'][0]['TRUE_HUGO_SYMBOL']['$eq'] == 'KRAS'
        assert neg is True

        # check protein change
        assert c['$and'][0]['TRUE_PROTEIN_CHANGE']['$eq'] == 'p.V600E'

        # check wildtype
        assert '$or' in c['$and'][1], c
        assert c['$and'][1]['$or'] == [{'WILDTYPE': False}, {'WILDTYPE': {'$exists': False}}]

    def test_sv(self):
        """prepare_genomic_criteria passes the SV variant category through and adds the wildtype clause."""

        # create a genomic criteria
        item = {'HUGO_SYMBOL': 'KRAS', 'VARIANT_CATEGORY': 'SV'}

        # convert to mongo query
        c, neg, _ = self.me.prepare_genomic_criteria(item)

        # check sv
        assert c['$and'][0]['VARIANT_CATEGORY']['$eq'] == 'SV'

        # check wildtype
        assert '$or' in c['$and'][1], c
        assert c['$and'][1]['$or'] == [{'WILDTYPE': False}, {'WILDTYPE': {'$exists': False}}]

    def test_create_trial_tree(self):
        """create_trial_tree on 00-004.yml succeeds and yields 4 nodes, 2 of which carry a match tree."""

        # parse yaml file and create tree.
        test_inp = read_file(os.path.join(YAML_DIR, '00-004.yml'))
        status, trial_tree = self.me.create_trial_tree(test_inp)

        # assert it was successful.
        assert status == 0

        # check that we have 2 match trees.
        cnt = 0
        for n in trial_tree.nodes():
            if 'match_tree' in trial_tree.node[n]:
                cnt += 1
        assert cnt == 2

        # parse the same yaml file a second time and create the tree again.
        test_inp = read_file(os.path.join(YAML_DIR, '00-004.yml'))
        status, trial_tree = self.me.create_trial_tree(test_inp)

        # check that we have 4 nodes.
        assert len(list(trial_tree.nodes())) == 4

        # assert we still have 2 match trees.
        cnt = 0
        for n in trial_tree.nodes():
            if 'match_tree' in trial_tree.node[n]:
                cnt += 1
        assert cnt == 2

    def test_create_match_tree(self):
        """Each match tree built from 00-004.yml has the expected node count, DFS edges and node payloads."""

        # parse yaml file and create trial tree.
        test_inp = read_file(os.path.join(YAML_DIR, '00-004.yml'))
        status, trial_tree = self.me.create_trial_tree(test_inp)

        # walk every match tree in the trial; expectations are indexed by encounter order.
        i = -1
        number_of_nodes = [2, 10]
        edges = [[(1, 2)], [(1, 2), (2, 3), (2, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9), (4, 10)]]
        for n in trial_tree.nodes():
            if 'match_tree' in trial_tree.node[n]:
                i += 1
                g = trial_tree.node[n]['match_tree']
                assert g is not None

                # Check if tree contain correct number of nodes
                assert g.number_of_nodes() == number_of_nodes[i]

                # Check edges are correctly created
                assert list(nx.dfs_edges(g)) == edges[i]

                # first match tree: a match root with a single clinical child
                if i == 0:
                    assert g.node[1]['type'] == 'match'
                    assert g.node[2]['type'] == 'clinical'
                    assert g.node[2]['value'] == {'disease_status': ['Advanced']}

                # second match tree: match -> and -> (genomic, or -> six clinical nodes)
                elif i == 1:
                    assert g.node[1]['type'] == 'match'
                    assert g.node[2]['type'] == 'and'
                    assert g.node[3]['type'] == 'genomic'
                    assert g.node[3]['value'] == {'hugo_symbol': 'IDH1', 'wildcard_protein_change': 'p.R132', 'variant_category': 'Mutation'}
                    assert g.node[4]['type'] == 'or'
                    assert g.node[5]['type'] == 'clinical'
                    assert g.node[5]['value'] == {'age_numerical': '>=18', 'oncotree_primary_diagnosis': '_SOLID_'}
                    assert g.node[6]['type'] == 'clinical'
                    assert g.node[6]['value'] == {'age_numerical': '>=18', 'oncotree_primary_diagnosis': 'Diffuse Glioma'}
                    assert g.node[7]['type'] == 'clinical'
                    assert g.node[7]['value'] == {'age_numerical': '>=18', 'oncotree_primary_diagnosis': 'Encapsulated Glioma'}
                    assert g.node[8]['type'] == 'clinical'
                    assert g.node[8]['value'] == {'age_numerical': '>=18', 'oncotree_primary_diagnosis': 'Cholangiocarcinoma'}
                    assert g.node[9]['type'] == 'clinical'
                    assert g.node[9]['value'] == {'age_numerical': '>=18', 'oncotree_primary_diagnosis': 'Acute Myeloid Leukemia'}
                    assert g.node[10]['type'] == 'clinical'
                    assert g.node[10]['value'] == {'age_numerical': '>=18', 'oncotree_primary_diagnosis': 'Myelodysplasia'}

    def test_search_oncotree_diagnosis(self):
        """_search_oncotree_diagnosis expands a diagnosis into an '$in' list of oncotree names."""

        # Glioblastoma
        c = {'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': {'$eq': 'Glioblastoma'}}
        oncotree = build_oncotree()
        c['ONCOTREE_PRIMARY_DIAGNOSIS_NAME'] = self.me._search_oncotree_diagnosis(oncotree, c)
        conc = c['ONCOTREE_PRIMARY_DIAGNOSIS_NAME']
        assert '$in' in conc, conc
        assert conc['$in'] == ['Small Cell Glioblastoma', 'Gliosarcoma', 'Glioblastoma Multiforme', 'Glioblastoma'], conc

        # Melanoma
        c = {'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': {'$eq': 'Melanoma'}}
        conc = self.me._search_oncotree_diagnosis(oncotree, c)
        assert '$in' in conc, conc
        assert conc['$in'] == [
            'Melanoma', 'Congenital Nevus', 'Genitourinary Mucosal Melanoma', 'Cutaneous Melanoma',
            'Melanoma of Unknown Primary', 'Desmoplastic Melanoma', 'Lentigo Maligna Melanoma', 'Acral Melanoma'
        ]
Пример #26
0
    def setUp(self):
        """
        Resets the test database and builds the in-memory clinical, genomic and match fixtures.

        Descriptions of test patients

        1: >18, Adrenal Gland, Female, BRAF F346R Mutation
        2: >18, Melanoma, Female, EGFR L858R Mutation
        3: >18, Melanoma, Female, EGFR F346A Mutation
        4: >18, Melanoma, Female, EGFR F346B Mutation
        5: >18, Melanoma, Female, EGFR F000F Mutation
        6: >0.5 && <18, Melanoma, Male, EGFR SV
        7: >0.5 && <18, Glioblastoma, Male, EGFR CNV Hetero del
        8: >0.5 && <18, Glioblastoma, Male, EGFR CNV Gain
        9: >0.5 && <18, Glioblastoma, Male, EGFR CNV Homo del
        10: <0.5, Glioblastoma, Male, EGFR CNV High amp

        Descriptions of test trials
        00-001.yml: dose: EGFR L858R && >=18/_SOLID_
        00-002.yml: arm: EGFR L858R && >=18/_SOLID_
        00-003.yml: step: EGFR L858R && >=18/_SOLID_
        00-004.yml dose: EGFR L858R && >=18/_SOLID_
        00-005.yml 2 doses: EGFR L858R && >=18/_SOLID_
        00-006.yml exon: !13
        """

        self.db = get_db(None)

        # Every name needs its own comma: adjacent string literals are concatenated, which previously
        # merged "oplog" and "response" into "oplogresponse" and left both collections undropped.
        for res in ["clinical", "dashboard", "filter", "genomic", "hipaa", "match", "normalize", "oplog",
                    "response", "statistics", "status", "team", "trial", "trial_match", "user"]:
            self.db.drop_collection(res)

        self.me = MatchEngine(self.db)

        self.trials = {}
        self.clinical_id = ObjectId()
        self.mrn = 'TCGA-BH-A1FR'
        self.sample_id = 'TCGA-OR-A5J1'
        # the first patient uses the fixed ids above; the other nine get generated ids
        self.mrns = [self.mrn] + [self.__random_id() for _ in range(9)]
        self.sample_ids = [self.sample_id] + [self.__random_id() for _ in range(9)]
        self.clinical_ids = [self.clinical_id] + [ObjectId() for _ in range(9)]
        self.static_date = dt.datetime.today()

        # clinical collection
        self.oncotree_diagnoses = ['Adrenal Gland'] + ['Melanoma'] * 5 + ['Glioblastoma'] * 4
        self.genders = ['Female'] * 5 + ['Male'] * 5

        # ages (birth dates relative to static_date: ~19 years, ~10 years, ~4 months)
        adult = self.static_date - dt.timedelta(days=365*19)
        child = self.static_date - dt.timedelta(days=365*10)
        infant = self.static_date - dt.timedelta(days=30*4)
        self.ages = [adult] * 5 + [child] * 4 + [infant]

        self.clinical = [{
            '_id': clinical_id,
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': diagnosis,
            'SAMPLE_ID': sample_id,
            'VITAL_STATUS': 'alive',
            'MRN': mrn,
            'REPORT_DATE': self.static_date,
            'BIRTH_DATE': age,
            'GENDER': gender
        } for diagnosis, gender, age, clinical_id, sample_id, mrn in zip(
            self.oncotree_diagnoses, self.genders, self.ages, self.clinical_ids, self.sample_ids, self.mrns)]

        # genomic collection (one record per patient, aligned by position with the clinical lists)
        self.genes = ['BRAF'] + ['EGFR'] * 9
        self.protein_changes = ['p.F346R', 'p.L858R', 'p.F346A', 'p.F346B', 'p.F000F', None, None, None, None, None]
        self.variant_categories = ['MUTATION'] * 5 + ['SV', 'CNV', 'CNV', 'CNV', 'CNV']
        self.wildtypes = [False] * 10
        self.cnv_calls = [None, None, None, None, None, None,
                          'Heterozygous deletion', 'Gain', 'Homozygous deletion', 'High level amplification']
        self.genomic = [{
            'TRUE_VARIANT_CLASSIFICATION': 'In_Frame_Del',
            'TRUE_PROTEIN_CHANGE': protein_change,
            'VARIANT_CATEGORY': variant_category,
            'CHROMOSOME': 'chr3',
            'POSITION': 178952085,
            'TRUE_STRAND': '+',
            'WILDTYPE': wildtype,
            'CLINICAL_ID': _id,
            'CNV_CALL': cnv_call,
            'TRUE_HUGO_SYMBOL': gene,
            'SAMPLE_ID': sample_id,
            'TRUE_TRANSCRIPT_EXON': 19
        } for protein_change, variant_category, wildtype, cnv_call, gene, _id, sample_id in zip(
            self.protein_changes, self.variant_categories, self.wildtypes,
            self.cnv_calls, self.genes, self.clinical_ids, self.sample_ids
        )]

        # test trials
        self.test_trials = ['00-001', '00-002', '00-003']

        # demo match results
        pnos = ['00-001', '00-001', '00-001', '00-002', '00-002', '00-002']
        mlevels = ['arm', 'arm', 'arm', 'dose', 'dose', 'dose']
        iids = ['1', '2', '3', '4', '5', '6']
        galts = ['Alt1', 'Alt2', 'Alt2', 'Alt3', 'Alt3', 'Alt3']
        self.matches = [{
            'mrn': 'SAMPLE1',
            'sample_id': 'SAMPLE1-ID',
            'protocol_no': protocol_no,
            'match_level': match_level,
            'internal_id': internal_id,
            'genomic_alteration': genomic_alteration
        } for protocol_no, match_level, internal_id, genomic_alteration in zip(
            pnos, mlevels, iids, galts
        )]
Пример #27
0
class Autocomplete:
    """Builds the ElasticSearch autocomplete ("suggestors") and search ("searchers") data for one trial."""

    def __init__(self, item):
        """
        Creates data for ElasticSearch's autocomplete index

        :param item: Trial info:
                    - treatment_list: Nested dictionary containing all match criteria
                    - _summary:       Summary object created by the API
        """
        self.summary = item['_summary']
        self.treatment_list = item['treatment_list']

        # variant strings collected from every processed match clause, grouped by category
        self.vdict = {
            'variants': [],
            'wts': [],
            'svs': [],
            'cnvs': [],
            'exclusions': []
        }
        self.genes = []
        self.cancer_type_dict = None
        self.m = MatchEngine(get_db())

    @staticmethod
    def _get_cancer_type_weight(cancer_type, hierarchy='default'):
        """
        Sets the weights for ElasticSearch autocompletion on cancer types. Cancer type terms
        are split so that autocomplete suggestions will populate regardless of which word in the
        multi-word cancer type string is initially input. Higher weighted terms will populate the
        top of the autocomplete dropdown list.

        :param cancer_type: Text to display in the autocomplete dropdown list.
        :param hierarchy: Weight class of the text: 'primary', 'default' or 'bucket'.
        :return: Dictionary specifying ElasticSearch rules ('input', 'output', 'weight').
        """

        weight_dict = {'primary': 10, 'default': 5, 'bucket': 20}

        # the two catch-all buckets always use the 'bucket' weight, whatever the caller asked for
        if cancer_type in ('All Solid Tumors', 'All Liquid Tumors'):
            hierarchy = 'bucket'

        # full text plus every word longer than three characters, de-duplicated
        inputs = set([cancer_type] + [word for word in cancer_type.split() if len(word) > 3])

        return {
            'input': list(inputs),
            'output': cancer_type,
            'weight': weight_dict[hierarchy]
        }

    @staticmethod
    def _get_variants_weight(variant, esrule='variants'):
        """
        Sets the weights for ElasticSearch autocompletion on gene variants. Higher weighted terms will populate the
        top of the autocomplete dropdown list.

        :param variant: Text to display in the autocomplete dropdown list.
        :param esrule: Type of variant ('variants', 'wts', 'svs' or 'cnvs'); selects the weight.
        :return: Dictionary specifying ElasticSearch rules ('input', 'weight').
        """

        weight_dict = {'variants': 1, 'wts': 5, 'svs': 3, 'cnvs': 3}
        return {'input': variant, 'weight': weight_dict[esrule]}

    @staticmethod
    def _get_investigator_suggest(investigator, dfci_investigator):
        """
        Creates a list of investigator suggestions from the _summary field of the trial collection

        :param investigator: Investigator name, either a single name or comma separated ("Last, First").
        :param dfci_investigator: None, or a dictionary that may hold 'first_name' and/or 'last_name'.
        :return: List of dictionaries with 'input' (search terms) and 'output' (display text). The
                 DFCI investigator is added as a second entry only when its display text is non-empty
                 and differs from the first entry's.
        """

        name_parts = [part.strip() for part in investigator.split(',')]
        if len(name_parts) == 1:
            terms = [name_parts[0]]
            display = investigator
        else:
            # "Last, First[, ...]" is displayed as "First Last"
            terms = name_parts[:2]
            display = '%s %s' % (name_parts[1], name_parts[0])

        inv_suggest = [{'input': [term for term in terms if term != ''], 'output': display}]

        dfci_terms = []
        if dfci_investigator is not None:
            for key in ('first_name', 'last_name'):
                if key in dfci_investigator:
                    dfci_terms.append(dfci_investigator[key].strip())

        # Drop empty name parts so the display text never carries a stray leading space and the
        # input list never carries empty terms (the same filtering applied to the first entry).
        dfci_terms = [term for term in dfci_terms if term != '']
        dfci_display = ' '.join(dfci_terms)

        if dfci_display != '' and dfci_display != display:
            inv_suggest.append({'input': dfci_terms, 'output': dfci_display})

        return inv_suggest

    @staticmethod
    def _get_tumor_types_search(ct_suggest):
        """
        Maps special cancer type text output to the values stored in the ElasticSearch index.

        :param ct_suggest: List of cancer type suggestion dictionaries, each with an 'output' text.
        :return: List of cancer type texts stored in the ElasticSearch index, which we will query.
        """

        tts = []
        for ct in ct_suggest:
            if 'output' in ct and ct['output'] == 'All Solid Tumors':
                tts.append('_SOLID_')
            elif 'output' in ct and ct['output'] == 'All Liquid Tumors':
                tts.append('_LIQUID_')
            else:
                tts.append(ct['output'])

        return tts

    def _extract_data_from_match(self, match):
        """
        Extract Cancer Type, Gene, and Variant data from the given match tree

        :param match: Match clause passed to MatchEngine.create_match_tree.
        """

        g = self.m.create_match_tree(match)
        pmt = ParseMatchTree(g)

        # Replaced on every call, whereas genes and variants accumulate, so this reflects only the
        # most recently processed match. NOTE(review): confirm this is intended.
        self.cancer_type_dict = pmt.extract_cancer_types()
        self.genes.extend(pmt.extract_genes())

        vdict_tmp = pmt.extract_variants()
        # items() rather than the Python-2-only iteritems(); the loop only extends the stored lists
        for k, v in self.vdict.items():
            v.extend(vdict_tmp[k])

    def add_autocomplete(self):
        """
        Iterates through the step, arm and dose_level entries of the treatment list, extracts the data
        of the first match clause found at each level, and builds the autocomplete and search documents.

        :return: Tuple of (suggestors dictionary, searchers dictionary, result of
                 parse_primary_cancer_types applied to the primary cancer types)
        """

        for step in self.treatment_list['step']:
            if 'match' in step:
                self._extract_data_from_match(step['match'][0])

            if 'arm' in step:
                for arm in step['arm']:
                    if 'match' in arm:
                        self._extract_data_from_match(arm['match'][0])

                    if 'dose_level' in arm:
                        for dose in arm['dose_level']:
                            if 'match' in dose:
                                self._extract_data_from_match(dose['match'][0])

        # no match clause anywhere in the trial: fall back to empty cancer type data
        if self.cancer_type_dict is None:
            self.cancer_type_dict = {
                'diagnoses': [],
                'primary_cancer_types': [],
                'cancer_types_expanded': [],
                'excluded_cancer_types': []
            }

        primary_cancer_types = self.cancer_type_dict['primary_cancer_types']

        weighted_cancer_types = []
        for ct in primary_cancer_types:
            suggestion = self._get_cancer_type_weight(ct, hierarchy='primary')
            weighted_cancer_types.append(suggestion)

        # expanded cancer types that are not already listed as primary get the default weight
        for ct in set(self.cancer_type_dict['cancer_types_expanded']) - set(primary_cancer_types):
            suggestion = self._get_cancer_type_weight(ct, hierarchy='default')
            weighted_cancer_types.append(suggestion)

        weighted_variants = {}
        for key in ['variants', 'cnvs', 'svs', 'wts']:
            weighted_variants[key] = []
            for v in set(self.vdict[key]):
                suggestion = self._get_variants_weight(v, esrule=key)
                weighted_variants[key].append(suggestion)

        suggestors = {
            "cancer_type_suggest": weighted_cancer_types,
            "hugo_symbol_suggest": {"input": list(set(self.genes))},
            # variants whose text ends in 'any' are left out of the suggestions
            "variant_suggest": [i for i in weighted_variants['variants'] if not i['input'].endswith('any')],
            "wildtype_suggest": weighted_variants['wts'],
            "cnv_suggest": weighted_variants['cnvs'],
            "sv_suggest": weighted_variants['svs'],
            "protocol_no_suggest": {'input': self.summary['protocol_number']},
            "disease_center_suggest": {
                'input': [i.replace('(', '').replace(')', '') for i in self.summary['disease_center'].split()],
                'output': self.summary['disease_center']
            },
            'disease_status_suggest': {'input': self.summary['disease_status']},
            'drug_suggest': {'input': [i.title() for i in self.summary['drugs']]},
            'investigator_suggest': self._get_investigator_suggest(self.summary['investigator'],
                                                                   self.summary['dfci_investigator']),
            'mmr_status_suggest': {'input': self.summary['mmr_status'] + self.summary['ms_status']},
            'nct_number_suggest': {'input': self.summary['nct_number']}
        }

        searchers = {
            "tumor_types": list(set(self._get_tumor_types_search(weighted_cancer_types))),
            "genes": list(set(self.genes)),
            "variants": list(set([i['input'] for i in weighted_variants['variants']])),
            "wildtype_genes": list(set([i['input'] for i in weighted_variants['wts']])),
            "cnv_genes": list(set([i['input'] for i in weighted_variants['cnvs']])),
            "sv_genes": list(set([i['input'] for i in weighted_variants['svs']])),
            "exclusion_genes": list(set(self.vdict['exclusions'])),
            "protocol_no": self.summary["protocol_number"],
            "drugs": self.summary["drugs"],
            "age": self.summary["age_summary"],
            "phase": self.summary["phase_summary"],
            "disease_status": self.summary["disease_status"],
            "nct_number": self.summary["nct_number"],
            "disease_center": self.summary["disease_center"],
            "mmr_status": self.summary["mmr_status"],
            "ms_status": self.summary["ms_status"],
            "mutational_signatures": self.summary["mutational_signatures"],
            "investigator": [i['output'] for i in suggestors['investigator_suggest']],
            "short_title": self.summary["short_title"]
        }

        return suggestors, searchers, parse_primary_cancer_types(primary_cancer_types)
Пример #28
0
class Autocomplete:

    def __init__(self, item):
        """
        Prepares the containers used to build ElasticSearch's autocomplete index for one trial

        :param item: Trial info:
                    - treatment_list: Nested dictionary containing all match criteria
                    - _summary:       Summary object created by the API
        """
        self.summary = item['_summary']
        self.treatment_list = item['treatment_list']

        # one accumulator list per variant category
        categories = ('variants', 'wts', 'svs', 'cnvs', 'exclusions')
        self.vdict = {category: [] for category in categories}

        self.genes = list()
        self.cancer_type_dict = None
        self.m = MatchEngine(get_db())

    @staticmethod
    def _get_cancer_type_weight(cancer_type, hierarchy='default'):
        """
        Sets the weights for ElasticSearch autocompletion on cancer types. Cancer type terms
        are split so that autocomplete suggestions will populate regardless of which word in the
        multi-word cancer type string is initially input. Higher weighted terms will populate the
        top of the autocomplete dropdown list.

        :param cancer_type: Text to display in the autocomplete dropdown list.
        :param hierarchy: Weight to give the text.
        :return: Dictionary specifying ElasticSearch rules.
        """

        weight_dict = {'primary': 10, 'default': 5, 'bucket': 20}
        if cancer_type == 'All Solid Tumors' or cancer_type == 'All Liquid Tumors':
            hierarchy = 'bucket'

        return {
            'input': list(set([cancer_type] + [i for i in cancer_type.split() if len(i) > 3])),
            'output': cancer_type,
            'weight': weight_dict[hierarchy]
        }

    @staticmethod
    def _get_variants_weight(variant, esrule='variants'):
        """
        Sets the weights for ElasticSearch autocompletion on gene variants. Higher weighted terms will populate the
        top of the autocomplete dropdown list.

        :param variant: Text to display in the autocomplete dropdown list.
        :param esrule: Type of variant. This will determine the ElasticSearch parameters.
        :return: Dictionary specifying ElasticSearch rules.
        """

        weight_dict = {
            'variants': 1,
            'wts': 5,
            'svs': 3,
            'cnvs': 3
        }
        return {'input': variant, 'weight': weight_dict[esrule]}

    @staticmethod
    def _get_investigator_suggest(investigator, dfci_investigator):
        """
        Creates a list of investigators from the _summary field of the trial collection
        """

        iin = []
        iout = ''
        ispl = [i.strip() for i in investigator.split(',')]
        if len(ispl) == 1:
            iin = [ispl[0]]
            iout = investigator
        elif len(ispl) >= 2:
            iin = [ispl[0], ispl[1]]
            iout = '%s %s' % (ispl[1], ispl[0])

        dfci_in = []
        dfci_out = ''
        if dfci_investigator is not None and 'first_name' in dfci_investigator:
            dfci_in.append(dfci_investigator['first_name'].strip())
            dfci_out += dfci_investigator['first_name'].strip()
        if dfci_investigator is not None and 'last_name' in dfci_investigator:
            dfci_in.append(dfci_investigator['last_name'].strip())
            dfci_out += ' %s' % dfci_investigator['last_name'].strip()

        inv_suggest = [{
            'input': [i for i in iin if i != ''],
            'output': iout
        }]
        if dfci_out != iout and dfci_out != '':
            inv_suggest.append({
                'input': dfci_in,
                'output': dfci_out.strip()
            })

        return inv_suggest

    @staticmethod
    def _get_tumor_types_search(ct_suggest):
        """
        Maps special cancer type text output to the values stored in the ElasticSearch index.

        :param ct_suggest: Cancer type text to display.
        :return: Cancer type text stored in th ElasticSearch index, which we will query.
        """

        tts = []
        for ct in ct_suggest:
            if 'output' in ct and ct['output'] == 'All Solid Tumors':
                tts.append('_SOLID_')
            elif 'output' in ct and ct['output'] == 'All Liquid Tumors':
                tts.append('_LIQUID_')
            else:
                tts.append(ct['output'])

        return tts

    def _extract_data_from_match(self, match):
        """
        Extract Cancer Type, Gene, and Variant data from the given match tree

        :param match: Match clause passed to MatchEngine.create_match_tree.
        """

        graph = self.m.create_match_tree(match)
        parser = ParseMatchTree(graph)

        # Replaced on every call, whereas genes and variants accumulate, so this reflects only the
        # most recently processed match. NOTE(review): confirm this is intended.
        self.cancer_type_dict = parser.extract_cancer_types()
        self.genes.extend(parser.extract_genes())

        extracted = parser.extract_variants()
        # items() rather than the Python-2-only iteritems(); the loop only extends the stored lists
        for category, collected in self.vdict.items():
            collected.extend(extracted[category])

    def add_autocomplete(self):
        """
        Assemble the autocomplete suggestion fields and the search fields for this trial.

        Every step of the treatment list is visited together with its arms and their dose levels; the
        first entry of each 'match' list found there is passed to ``_extract_data_from_match``. The cancer
        types and variants stored on the instance are then weighted and combined with fields of the
        trial summary.

        :return: Tuple of three items: the suggestors dictionary, the searchers dictionary and the result
                 of ``parse_primary_cancer_types`` applied to the primary cancer types
        """

        # only the first element of a 'match' list is inspected, at step, arm and dose level alike
        for step in self.treatment_list['step']:
            if 'match' in step:
                self._extract_data_from_match(step['match'][0])

            if 'arm' not in step:
                continue

            for arm in step['arm']:
                if 'match' in arm:
                    self._extract_data_from_match(arm['match'][0])

                if 'dose_level' not in arm:
                    continue

                for dose in arm['dose_level']:
                    if 'match' in dose:
                        self._extract_data_from_match(dose['match'][0])

        # fall back to an empty structure (one fresh list per key) when no cancer types were recorded
        if self.cancer_type_dict is None:
            self.cancer_type_dict = {
                field: []
                for field in ('diagnoses', 'primary_cancer_types', 'cancer_types_expanded', 'excluded_cancer_types')
            }

        # primary cancer types come first; expanded types that are not primary follow with the default hierarchy
        weighted_cancer_types = [
            self._get_cancer_type_weight(name, hierarchy='primary')
            for name in self.cancer_type_dict['primary_cancer_types']
        ]
        non_primary = (set(self.cancer_type_dict['cancer_types_expanded'])
                       - set(self.cancer_type_dict['primary_cancer_types']))
        weighted_cancer_types.extend(
            self._get_cancer_type_weight(name, hierarchy='default') for name in non_primary
        )

        # one list of weighted suggestions per variant category, duplicates within a category removed
        weighted_variants = {
            rule: [self._get_variants_weight(variant, esrule=rule) for variant in set(self.vdict[rule])]
            for rule in ('variants', 'cnvs', 'svs', 'wts')
        }

        unique_genes = list(set(self.genes))

        # variant inputs ending in 'any' are left out of the suggestions but stay in the search list below
        specific_variants = [entry for entry in weighted_variants['variants'] if not entry['input'].endswith('any')]

        summary = self.summary
        suggestors = {
            "cancer_type_suggest": weighted_cancer_types,
            "hugo_symbol_suggest": {"input": unique_genes},
            "variant_suggest": specific_variants,
            "wildtype_suggest": weighted_variants['wts'],
            "cnv_suggest": weighted_variants['cnvs'],
            "sv_suggest": weighted_variants['svs'],
            "protocol_no_suggest": {'input': summary['protocol_number']},
            "disease_center_suggest": {
                # each whitespace-separated word of the disease center, stripped of parentheses
                'input': [word.replace('(', '').replace(')', '') for word in summary['disease_center'].split()],
                'output': summary['disease_center']
            },
            'disease_status_suggest': {'input': summary['disease_status']},
            'drug_suggest': {'input': [drug.title() for drug in summary['drugs']]},
            'investigator_suggest': self._get_investigator_suggest(summary['investigator'],
                                                                   summary['dfci_investigator']),
            'mmr_status_suggest': {'input': summary['mmr_status'] + summary['ms_status']},
            'nct_number_suggest': {'input': summary['nct_number']}
        }

        def _unique_inputs(entries):
            # distinct 'input' values of a list of weighted suggestions
            return list(set(entry['input'] for entry in entries))

        searchers = {
            "tumor_types": list(set(self._get_tumor_types_search(weighted_cancer_types))),
            "genes": list(set(self.genes)),
            "variants": _unique_inputs(weighted_variants['variants']),
            "wildtype_genes": _unique_inputs(weighted_variants['wts']),
            "cnv_genes": _unique_inputs(weighted_variants['cnvs']),
            "sv_genes": _unique_inputs(weighted_variants['svs']),
            "exclusion_genes": list(set(self.vdict['exclusions']))
        }

        # search fields copied straight from the summary: (searcher key, summary key)
        summary_fields = (
            ("protocol_no", "protocol_number"),
            ("drugs", "drugs"),
            ("age", "age_summary"),
            ("phase", "phase_summary"),
            ("disease_status", "disease_status"),
            ("nct_number", "nct_number"),
            ("disease_center", "disease_center"),
            ("mmr_status", "mmr_status"),
            ("ms_status", "ms_status"),
            ("mutational_signatures", "mutational_signatures"),
        )
        for search_key, summary_key in summary_fields:
            searchers[search_key] = summary[summary_key]

        searchers["investigator"] = [entry['output'] for entry in suggestors['investigator_suggest']]
        searchers["short_title"] = summary["short_title"]

        parsed_primary = parse_primary_cancer_types(self.cancer_type_dict['primary_cancer_types'])
        return suggestors, searchers, parsed_primary