def test_get_match_paths(self):
    self.me.trials = dict()
    for file in glob.glob('./matchengine/tests/data/ctml_boolean_cases/*.json'):
        with open(file) as f:
            data = json.load(f)
            trial = [data]
            self.me.trials[file] = trial
    with open("./matchengine/tests/data/get_match_paths_expected.json") as f:
        test_cases = json.load(f)
    for trial in self.me.trials:
        filename = os.path.basename(trial)
        me_trial = self.me.trials[trial]
        match_tree = create_match_tree(self.me,
                                       MatchClauseData(match_clause=me_trial,
                                                       internal_id='123',
                                                       code='456',
                                                       coordinating_center='The Death Star',
                                                       status='Open to Accrual',
                                                       parent_path=ParentPath(()),
                                                       match_clause_level=MatchClauseLevel('arm'),
                                                       match_clause_additional_attributes={},
                                                       is_suspended=True,
                                                       protocol_no='12-345'))
        match_paths = list(get_match_paths(match_tree))
        for test_case, match_path in zip(test_cases[filename], match_paths):
            for test_case_criteria_idx, test_case_criteria in enumerate(test_case["criteria_list"]):
                match_path_criteria = match_path.criteria_list[test_case_criteria_idx]
                assert test_case_criteria["depth"] == match_path_criteria.depth
                for inner_test_case_criteria, inner_match_path_criteria in zip(test_case_criteria["criteria"],
                                                                               match_path_criteria.criteria):
                    assert nested_object_hash(inner_test_case_criteria) == nested_object_hash(
                        inner_match_path_criteria)
def test_comparable_dict(self):
    assert nested_object_hash({}) == nested_object_hash({})
    assert nested_object_hash({"1": "1", "2": "2"}) == nested_object_hash({"2": "2", "1": "1"})
    assert nested_object_hash({"1": [{}, {2: 3}], "2": "2"}) == nested_object_hash({"2": "2", "1": [{2: 3}, {}]})
    assert nested_object_hash({"1": [{'set': {1, 2, 3}}, {2: 3}], "2": "2"}) == nested_object_hash(
        {"2": "2", "1": [{2: 3}, {'set': {3, 1, 2}}]})
    assert nested_object_hash({
        1: {
            2: [
                {
                    3: 4,
                    5: {6, 7}
                }
            ]
        },
        "4": [9, 8]
    }) != nested_object_hash({
        1: {
            2: [
                {
                    3: 4,
                    9: {6, 7}
                }
            ]
        },
        "4": [9, 8]
    })
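# A minimal sketch (not the library's actual implementation) of how an
# order-insensitive nested hash like nested_object_hash could behave, matching the
# assertions above: dict items, set members, and list elements are hashed
# individually and their digests are sorted before being combined, so contents
# matter but ordering does not. Names here are illustrative only.
import hashlib


def _order_insensitive_hash(obj) -> str:
    if isinstance(obj, dict):
        # pair each key digest with its value digest, then sort the pairs
        parts = sorted(_order_insensitive_hash(k) + _order_insensitive_hash(v)
                       for k, v in obj.items())
    elif isinstance(obj, (list, tuple, set, frozenset)):
        parts = sorted(_order_insensitive_hash(item) for item in obj)
    else:
        parts = [repr((type(obj).__name__, obj))]
    return hashlib.sha256("".join(parts).encode("utf-8")).hexdigest()


# e.g. these two agree, mirroring test_comparable_dict:
assert _order_insensitive_hash({"1": [{}, {2: 3}], "2": "2"}) == \
       _order_insensitive_hash({"2": "2", "1": [{2: 3}, {}]})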
def test_create_match_tree(self):
    self.me.trials = dict()
    for file in glob.glob('./matchengine/tests/data/ctml_boolean_cases/*.json'):
        with open(file) as f:
            data = json.load(f)
            trial = [data]
            self.me.trials[file] = trial
    with open('./matchengine/tests/data/create_match_tree_expected.json') as f:
        test_cases = json.load(f)
    for trial in self.me.trials:
        me_trial = self.me.trials[trial]
        match_tree = create_match_tree(self.me,
                                       MatchClauseData(match_clause=me_trial,
                                                       internal_id='123',
                                                       code='456',
                                                       coordinating_center='The Death Star',
                                                       status='Open to Accrual',
                                                       parent_path=ParentPath(()),
                                                       match_clause_level=MatchClauseLevel('arm'),
                                                       match_clause_additional_attributes={},
                                                       protocol_no='12-345',
                                                       is_suspended=True))
        test_case = test_cases[os.path.basename(trial)]
        assert len(test_case["nodes"]) == len(match_tree.nodes)
        for test_case_key in test_case.keys():
            if test_case_key == "nodes":
                for node_id, node_attrs in test_case[test_case_key].items():
                    graph_node = match_tree.nodes[int(node_id)]
                    assert len(node_attrs) == len(graph_node)
                    assert nested_object_hash(node_attrs) == nested_object_hash(graph_node)
            else:
                for test_item, graph_item in zip(test_case[test_case_key], getattr(match_tree, test_case_key)):
                    for idx, test_item_part in enumerate(test_item):
                        assert test_item_part == graph_item[idx]
def hash(self) -> str:
    if self._hash is None:
        self._hash = nested_object_hash({
            "query": [criteria.criteria for criteria in self.criteria_list]
        })
    return self._hash
def raw_query_hash(self):
    if self._raw_query_hash is None:
        if not self.is_finalized:
            raise Exception("Query node is not finalized")
        else:
            self._raw_query_hash = nested_object_hash(self.extract_raw_query())
    return self._raw_query_hash
def hash(self) -> str:
    if self._hash is None:
        # "_tmp1"/"_tmp2" are fixed placeholder keys; since nested_object_hash is
        # order-insensitive, the same query parts hash identically regardless of order.
        self._hash = nested_object_hash({
            "_tmp1": [query_part.hash() for query_part in self.query_parts],
            '_tmp2': self.exclusion
        })
    return self._hash
def pre_process_trial_matches(self, trial_match: TrialMatch) -> Dict:
    """
    Build the required fields of a trial_match document from the clinical doc,
    the match reason, and the trial's match clause data.
    """
    new_trial_match = dict()
    clinical_doc = self.cache.docs[trial_match.match_reason.clinical_id]
    new_trial_match.update(self.format_trial_match_k_v(clinical_doc))
    new_trial_match['clinical_id'] = clinical_doc['_id']
    new_trial_match.update({
        'match_level': trial_match.match_clause_data.match_clause_level,
        'internal_id': trial_match.match_clause_data.internal_id,
        'reason_type': trial_match.match_reason.reason_name,
        'q_depth': trial_match.match_reason.depth,
        'q_width': trial_match.match_reason.width,
        'code': trial_match.match_clause_data.code,
        'trial_curation_level_status': 'closed' if trial_match.match_clause_data.is_suspended else 'open',
        'trial_summary_status': trial_match.match_clause_data.status,
        'coordinating_center': trial_match.match_clause_data.coordinating_center,
        'show_in_ui': trial_match.match_reason.show_in_ui,
        'query_hash': trial_match.match_criterion.hash()
    })

    # add trial fields except for extras
    new_trial_match.update({
        k: v
        for k, v in trial_match.trial.items()
        if k not in {'treatment_list', '_summary', 'status', '_elasticsearch', 'match'}
    })

    new_trial_match.update({
        'match_path': '.'.join([str(item) for item in trial_match.match_clause_data.parent_path])
    })

    # combo_coord is a composite hash over the query, the match path, and the trial identifier
    new_trial_match['combo_coord'] = nested_object_hash({
        'query_hash': new_trial_match['query_hash'],
        'match_path': new_trial_match['match_path'],
        self.match_criteria_transform.trial_identifier: new_trial_match[
            self.match_criteria_transform.trial_identifier]
    })

    new_trial_match['is_disabled'] = False
    new_trial_match.pop("_updated", None)
    new_trial_match.pop("last_updated", None)
    new_trial_match.pop("_id", None)
    return new_trial_match
async def run_query_task(matchengine: MatchEngine, task, worker_id):
    if matchengine.debug:
        log.info((f"Worker: {worker_id}, protocol_no: {task.trial['protocol_no']} got new QueryTask, "
                  f"{matchengine._task_q.qsize()} tasks left in queue"))
    try:
        results: Dict[ClinicalID, List[MatchReason]] = await matchengine.run_query(task.query, task.clinical_ids)
    except Exception as e:
        results = dict()
        log.error(f"ERROR: Worker: {worker_id}, error: {e}")
        log.error(f"TRACEBACK: {traceback.print_tb(e.__traceback__)}")
        if e.__class__ in (AutoReconnect, CursorNotFound, ServerSelectionTimeoutError):
            # transient MongoDB errors: requeue the task so it is retried
            matchengine.task_q.put_nowait(task)
            matchengine.task_q.task_done()
        else:
            matchengine.loop.stop()
            log.error(f"ERROR: Worker: {worker_id}, error: {e}")
            log.error(f"TRACEBACK: {traceback.print_tb(e.__traceback__)}")

    try:
        by_sample_id = defaultdict(list)
        matchengine.results_transformer(results)
        if not results:
            matchengine.matches.setdefault(task.match_clause_data.protocol_no, dict())
        for _, sample_results in results.items():
            for result in sample_results:
                matchengine.queue_task_count += 1
                if matchengine.queue_task_count % 1000 == 0 and matchengine.debug:
                    log.info(f"Trial match count: {matchengine.queue_task_count}")
                match_context_data = TrialMatch(task.trial,
                                                task.match_clause_data,
                                                task.match_path,
                                                task.query,
                                                result,
                                                matchengine.starttime)

                # generate required fields on the trial match doc first, then
                # allow the user to extend trial_match objects in plugin functions
                new_match_proto = matchengine.pre_process_trial_matches(match_context_data)
                match_document = matchengine.create_trial_matches(match_context_data, new_match_proto)

                # generate sort_order and hash fields after all other fields are added
                sort_order = get_sort_order(matchengine.config['trial_match_sorting'], match_document)
                match_document['sort_order'] = sort_order
                to_hash = {key: match_document[key] for key in match_document if key not in {'hash', 'is_disabled'}}
                match_document['hash'] = nested_object_hash(to_hash)
                match_document['_me_id'] = matchengine.run_id.hex

                matchengine.matches.setdefault(task.trial['protocol_no'], dict()).setdefault(
                    match_document['sample_id'], list()).append(match_document)
                by_sample_id[match_document['sample_id']].append(match_document)
    except Exception as e:
        matchengine.loop.stop()
        log.error(f"ERROR: Worker: {worker_id}, error: {e}")
        log.error(f"TRACEBACK: {traceback.print_tb(e.__traceback__)}")
        raise e

    matchengine.task_q.task_done()
def query_node_transform(self, query_node: QueryNode) -> NoReturn:
    """
    If a trial curation key/value requires alteration to a separate AND clause in the mongo query,
    do that here. Used to modify a query part dependent on another query part.
    """
    # If a trial curation calls for a structural variant but does NOT have the structured SV data field
    # FUSION_PARTNER_HUGO_SYMBOL, then the extended_attributes query is done using a regex search of the
    # free-text STRUCTURAL_VARIANT_COMMENT field on the patient's extended_attributes document.
    whole_query = query_node.extract_raw_query()

    # encode as full search criteria
    if 'STRUCTURAL_VARIANT_COMMENT' in whole_query:
        for do_not_render_part_name in ['TRUE_HUGO_SYMBOL', 'FUSION_PARTNER_HUGO_SYMBOL']:
            do_not_render_part = query_node.get_query_part_by_key(do_not_render_part_name)
            if do_not_render_part is not None:
                do_not_render_part.render = False
        gene = whole_query.get('TRUE_HUGO_SYMBOL')
        sv_part = query_node.get_query_part_by_key('STRUCTURAL_VARIANT_COMMENT')
        if 'STRUCTURED_SV' in whole_query:
            sv_part.mcq_invalidating = True
            sv_part.render = False
        else:
            sv_part.set_query_attr(
                'STRUCTURAL_VARIANT_COMMENT',
                re.compile(rf"(.*\W{gene}\W.*)|(^{gene}\W.*)|(.*\W{gene}$)", re.IGNORECASE))

    # Structured SV hugo symbol pairs map as follows:
    #   blank-GENE    -> Intergenic
    #   GENE-blank    -> Intergenic
    #   GENE1-GENE1   -> GENE1-GENE1 / Intragenic
    #   GENE1-GENE2   -> GENE1-GENE2
    elif 'STRUCTURED_SV' in whole_query:
        sv_info_part = query_node.get_query_part_by_key('STRUCTURED_SV')
        sv_info_part.render = False
        left = query_node.get_query_part_value_by_key('TRUE_HUGO_SYMBOL', None)
        right = query_node.get_query_part_value_by_key('FUSION_PARTNER_HUGO_SYMBOL', None)
        for do_not_render_part_name in ['TRUE_HUGO_SYMBOL', 'FUSION_PARTNER_HUGO_SYMBOL']:
            do_not_render_part = query_node.get_query_part_by_key(do_not_render_part_name)
            if do_not_render_part is not None:
                do_not_render_part.render = False
        left_query = build_structured_sv_query(left, right, 'LEFT-RIGHT')
        right_query = build_structured_sv_query(left, right, 'RIGHT-LEFT')
        new_query = ({'$or': [left_query, right_query]}
                     if nested_object_hash(left_query) != nested_object_hash(right_query)
                     else left_query)
        query_node.add_query_part(QueryPart(new_query, sv_info_part.negate, True, False))

    # if a signature curation is passed, do not query TRUE_HUGO_SYMBOL
    if {
        'UVA_STATUS', 'TABACCO_STATUS', 'POLE_STATUS',
        'TEMOZOLOMIDE_STATUS', 'MMR_STATUS', 'APOBEC_STATUS'
    }.intersection(set(whole_query.keys())):
        gene_part = query_node.get_query_part_by_key('TRUE_HUGO_SYMBOL')
        if gene_part is not None:
            gene_part.render = False
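# Illustration only (hypothetical gene symbol, not taken from any real curation): for a
# curation with TRUE_HUGO_SYMBOL "NTRK1" and no structured SV data, the free-text branch
# above compiles a pattern equivalent to the one below, which requires the gene symbol to
# appear as a whole word somewhere in the STRUCTURAL_VARIANT_COMMENT text, case-insensitively.
import re

_example_gene = "NTRK1"  # hypothetical value
_example_pattern = re.compile(
    rf"(.*\W{_example_gene}\W.*)|(^{_example_gene}\W.*)|(.*\W{_example_gene}$)",
    re.IGNORECASE)

assert _example_pattern.match("Fusion detected: NTRK1-TPM3 rearrangement")
assert _example_pattern.match("ntrk1 fusion, partner unknown")
assert _example_pattern.match("Involves NTRK2 only") is None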
def hash(self) -> str:
    if self._hash is None:
        self._hash = nested_object_hash(self.query)
    return self._hash