def get_approved_evidence_levels(graphkb_conn: GraphKBConnection) -> List[Ontology]:
    """
    Fetch the EvidenceLevel records named in APPROVED_EVIDENCE_LEVELS

    Args:
        graphkb_conn: the graphkb api connection object

    Returns:
        the records returned by the EvidenceLevel query
    """
    # one clause per source: the level must belong to that source AND have one of its listed names
    per_source_clauses = [
        {
            'AND': [
                {'source': {'target': 'Source', 'filters': {'name': source_name}}},
                {'name': level_names, 'operator': 'IN'},
            ]
        }
        for source_name, level_names in APPROVED_EVIDENCE_LEVELS.items()
    ]
    query_body = {'target': 'EvidenceLevel', 'filters': {'OR': per_source_clauses}}
    return graphkb_conn.query(query_body)
def get_therapeutic_associated_genes(graphkb_conn: GraphKBConnection) -> Set[str]:
    """
    Collect the record IDs of the features attached to therapeutic statements

    A feature is collected when it is itself a statement condition, or when it is
    reference1/reference2 of a condition whose class name ends in 'Variant'.
    Statements whose reviewStatus equals FAILED_REVIEW_STATUS are skipped.

    Args:
        graphkb_conn: the graphkb api connection object

    Returns:
        the set of feature record IDs
    """
    relevance_terms = get_terms_set(graphkb_conn, BASE_THERAPEUTIC_TERMS)
    query_body = {
        'target': 'Statement',
        'filters': {'relevance': sorted(relevance_terms)},
        'returnProperties': [
            'conditions.@rid',
            'conditions.@class',
            'conditions.reference1.@class',
            'conditions.reference1.@rid',
            'conditions.reference2.@class',
            'conditions.reference2.@rid',
            'reviewStatus',
        ],
    }
    statements = graphkb_conn.query(query_body)

    gene_rids = set()
    for statement in statements:
        if statement['reviewStatus'] == FAILED_REVIEW_STATUS:
            continue

        for condition in statement['conditions']:
            condition_class = condition['@class']

            if condition_class == 'Feature':
                gene_rids.add(condition['@rid'])
                continue

            if not condition_class.endswith('Variant'):
                continue

            # a variant can point at up to two features; either reference may be empty
            for reference_key in ('reference1', 'reference2'):
                reference = condition[reference_key]
                if reference and reference['@class'] == 'Feature':
                    gene_rids.add(reference['@rid'])

    return gene_rids
def get_statements_from_variants(
    graphkb_conn: GraphKBConnection, variants: List[Record]
) -> List[Statement]:
    """
    Given a list of variant records from GraphKB, return all the related statements

    Statements whose reviewStatus equals FAILED_REVIEW_STATUS are left out of the result.

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        variants (list.<dict>): list of variant records

    Returns:
        list.<dict>: list of Statement records from graphkb
    """
    return_props = BASE_RETURN_PROPERTIES + ['sourceId', 'source.name', 'source.displayName']
    # request the generic properties of every linked record on the statement
    for linked_field in ('conditions', 'subject', 'evidence', 'relevance', 'evidenceLevel'):
        return_props.extend(f'{linked_field}.{prop}' for prop in GENERIC_RETURN_PROPERTIES)
    return_props.append('reviewStatus')

    query_body = {
        'target': 'Statement',
        'filters': {'conditions': convert_to_rid_list(variants), 'operator': 'CONTAINSANY'},
        'returnProperties': return_props,
    }

    kept = []
    for statement in graphkb_conn.query(query_body):
        if statement['reviewStatus'] != FAILED_REVIEW_STATUS:
            kept.append(statement)
    return kept
def get_alternatives(graphkb_conn: GraphKBConnection, record_id: str) -> List[Dict]:
    """
    Run a 'similarTo' query for a single record ID, with an empty treeEdges list

    Args:
        graphkb_conn: the graphkb api connection object
        record_id: the record ID used as the query target

    Returns:
        the records returned by the query
    """
    similar_to_query = {
        'target': [record_id],
        'queryType': 'similarTo',
        'treeEdges': [],
    }
    return graphkb_conn.query(similar_to_query)
def get_gene_information(
    graphkb_conn: GraphKBConnection, gene_names: Iterable[str]
) -> List[IprGene]:
    """
    Create the Gene Info object for upload to IPR with the other report information

    Args:
        graphkb_conn: the graphkb api connection object
        gene_names: the gene names to build rows for

    Returns:
        one row per gene that has at least one true flag; each row holds the gene
        name plus only the flags that are true
    """
    logger.info('fetching variant related genes list')
    variant_query = {
        'target': 'Variant',
        'returnProperties': ['@class', 'reference1', 'reference2'],
    }

    cancer_related: Set[str] = set()
    fusion_partners: Set[str] = set()
    small_mutations: Set[str] = set()

    for variant in graphkb_conn.query(variant_query):
        first_ref = variant['reference1']
        cancer_related.add(first_ref)

        second_ref = variant['reference2']
        if second_ref:
            # a variant with two references flags both of them as fusion partners
            cancer_related.add(second_ref)
            fusion_partners.add(first_ref)
            fusion_partners.add(second_ref)
        elif variant['@class'] == 'PositionalVariant':
            small_mutations.add(first_ref)

    gene_flags: Dict[str, Set[str]] = {
        'cancerRelated': cancer_related,
        'knownFusionPartner': fusion_partners,
        'knownSmallMutation': small_mutations,
    }

    logger.info('fetching oncogenes list')
    gene_flags['oncogene'] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn))
    logger.info('fetching tumour supressors list')
    gene_flags['tumourSuppressor'] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn))
    logger.info('fetching therapeutic associated genes lists')
    gene_flags['therapeuticAssociated'] = get_therapeutic_associated_genes(graphkb_conn)

    result = []
    for gene_name in gene_names:
        equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name))
        row = IprGene({'name': gene_name})

        # keep the uploaded JSON small: false flags are never written to the row
        has_flag = False
        for flag, flagged_rids in gene_flags.items():
            if equivalent & flagged_rids:
                row[flag] = True
                has_flag = True

        if has_flag:
            result.append(row)

    return result
def get_preferred_drug_representation(graphkb_conn: GraphKBConnection, drug_record_id: str) -> Dict:
    """
    Given a Drug record, follow its linked records to find the preferred
    representation by following alias, deprecating, and cross reference links

    Args:
        graphkb_conn: the graphkb api connection object
        drug_record_id: record ID of the drug to find alternatives for

    Returns:
        the alternative record that ranks first under generate_ontology_preference_key
    """
    # map each source record ID to its sort value for use by the preference key
    source_preference = {}
    for source in graphkb_conn.query({'target': 'Source', 'returnProperties': ['sort', '@rid']}):
        source_preference[source['@rid']] = source['sort']

    def preference_key(rec):
        return generate_ontology_preference_key(rec, source_preference)

    candidates = list(get_alternatives(graphkb_conn, drug_record_id))
    candidates.sort(key=preference_key)
    return candidates[0]
def get_preferred_gene_name(graphkb_conn: GraphKBConnection, record_id: str) -> str:
    """
    Given some Feature record ID return the preferred gene name

    Args:
        graphkb_conn: the graphkb api connection object
        record_id: record ID of the Feature to find the gene name for

    Returns:
        the displayName of the best ranked linked gene record, or the displayName
        of the input record when no linked gene record is found
    """
    record = graphkb_conn.get_record_by_id(record_id)
    biotype = record.get('biotype', '')
    expanded = graphkb_conn.query({'target': [record_id], 'neighbors': 3})[0]

    edge_types = [
        'out_AliasOf',
        'in_AliasOf',
        'in_DeprecatedBy',
        'out_CrossReferenceOf',
        'in_CrossReferenceOf',
    ]
    if biotype != 'gene':
        # for non-gene features the ElementOf links are also followed, and checked first
        edge_types.insert(0, 'out_ElementOf')

    candidates = []
    for edge_type in edge_types:
        # the linked record is on the opposite end of the edge from the input record
        far_end = 'out' if edge_type.startswith('in') else 'in'
        for edge in expanded.get(edge_type, []):
            linked = edge[far_end]
            if linked.get('biotype') == 'gene':
                candidates.append(linked)

    if not candidates:
        # fallback to the input displayName
        return record['displayName']

    def rank(gene):
        # False sorts before True, so each entry is a property to avoid
        return (
            gene['deprecated'],
            bool(gene['dependency']),
            '_' in gene['name'],
            gene['name'].startswith('ens'),
        )

    # min returns the first of equally ranked records, the same pick as a stable sort
    return min(candidates, key=rank)['displayName']
for match in variant_matches: print(variant_name, 'will match', match['displayName']) # return properties should be customized to the users needs return_props = (BASE_RETURN_PROPERTIES + ['sourceId', 'source.name', 'source.displayName'] + [f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES] + [f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES] + [f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES] + [f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES] + [f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES]) statements = graphkb_conn.query({ 'target': 'Statement', 'filters': { 'conditions': convert_to_rid_list(variant_matches), 'operator': 'CONTAINSANY' }, 'returnProperties': return_props, }) for statement in statements[:5]: print( statement['relevance']['displayName'], statement['subject']['displayName'], statement['source']['displayName'] if statement['source'] else '', ) BASE_THERAPEUTIC_TERMS = 'therapeutic efficacy' therapeutic_terms = get_term_tree(graphkb_conn, BASE_THERAPEUTIC_TERMS,
def create_section_html(
    graphkb_conn: GraphKBConnection,
    gene_name: str,
    sentences_by_statement_id: Dict[str, str],
    statements: Dict[str, Statement],
    exp_variants: List[IprVariant],
) -> str:
    """
    Generate HTML for a gene section of the comments

    Args:
        graphkb_conn: the graphkb api connection object
        gene_name: the gene the section is written for
        sentences_by_statement_id: sentence text keyed by statement ID
        statements: statement records keyed by statement ID
        exp_variants: variants passed to display_variants together with the gene name

    Returns:
        the section HTML, or an empty string when display_variants returns nothing
        for this gene
    """
    output = [f'<h2>{gene_name}</h2>']

    sentence_categories: Dict[str, str] = {}
    for statement_id, sentence in sentences_by_statement_id.items():
        relevance_rid = statements[statement_id]['relevance']['@rid']
        sentence_categories[sentence] = categorize_relevance(
            graphkb_conn,
            relevance_rid,
            RELEVANCE_BASE_TERMS + [('resistance', ['no sensitivity'])],
        )

    # get the entrez gene description
    entrez_gene_query = {
        'target': 'Feature',
        'filters': {
            'AND': [
                {'source': {'target': 'Source', 'filters': {'name': 'entrez gene'}}},
                {'name': gene_name},
                {'biotype': 'gene'},
            ]
        },
    }
    genes = sorted(graphkb_conn.query(entrez_gene_query), key=generate_ontology_preference_key)

    variants_text = display_variants(gene_name, exp_variants)
    if not variants_text:
        # sections that are not linked to an experimental variant are left out;
        # this can happen when co-occurring statements were collected
        return ''

    if genes:
        best_gene = genes[0]
        if best_gene.get('description', ''):
            # only the first two sentences of the description are shown
            description = '. '.join(best_gene['description'].split('. ')[:2])
            source_id = best_gene['sourceId']
            output.append(
                f' <blockquote class="entrez_description" cite="{ENTREZ_GENE_URL}/{source_id}">'
                f' {description}. </blockquote> <p> {variants_text} </p> '
            )

    # paragraph order: diagnostic, biological, therapeutic/prognostic, everything else, resistance
    section_index = {
        'diagnostic': 0,
        'biological': 1,
        'therapeutic': 2,
        'prognostic': 2,
        'resistance': 4,
    }
    sections: List[Set[str]] = [set() for _ in range(5)]
    for sentence, category in sentence_categories.items():
        sections[section_index.get(category, 3)].add(sentence)

    # every section gets a paragraph, including the ones with no sentences
    for section in sections:
        content = '. '.join(sorted(section))
        output.append(f'<p>{content}</p>')

    return '\n'.join(output)
def annotate_variant(
    graphkb_conn: GraphKBConnection, raw_variant_name: str, include_unmatched: bool = False
) -> List[Dict[str, str]]:
    """
    Match a variant name against GraphKB and return one row per related statement

    Args:
        graphkb_conn: the graphkb api connection object
        raw_variant_name: the variant name as given; it is passed through
            convert_aa_3to1 before matching
        include_unmatched: when True, also return a row for a variant whose feature
            was not found, or whose matches have no statements

    Returns:
        a list of row dicts. Rows for statements hold the variant, the matched
        variant names and the statement details. A variant that is skipped or
        fails to match gets a single row with an 'error' message instead.
    """

    def join_display_names(records) -> str:
        # sorted so the joined text does not depend on the order records were returned in
        return ';'.join(sorted(rec['displayName'] for rec in records))

    results = []
    variant_name = convert_aa_3to1(raw_variant_name)

    if 'c.*' in variant_name:
        results.append(
            {
                'variant': raw_variant_name,
                'error': f'skipping unsupported notation: {variant_name}',
            }
        )
        return results

    print(f'processing: {variant_name}')

    try:
        variant_matches = match_positional_variant(graphkb_conn, variant_name)
    except FeatureNotFoundError:
        if include_unmatched:
            results.append({'variant': raw_variant_name})
        return results
    except Exception as err:
        # deliberate best-effort: the failure is recorded in the row and the caller carries on
        results.append({'variant': raw_variant_name, 'error': str(err)})
        return results

    if not variant_matches:
        return results

    print(f'{variant_name} matches {len(variant_matches)} variant records')

    # return properties should be customized to the users needs
    return_props = BASE_RETURN_PROPERTIES + ['sourceId', 'source.name', 'source.displayName']
    for linked_field in ('conditions', 'subject', 'evidence', 'relevance', 'evidenceLevel'):
        return_props.extend(f'{linked_field}.{prop}' for prop in GENERIC_RETURN_PROPERTIES)
    return_props.append('reviewStatus')

    # the query returns a list of statements, so the cast target is a list type
    statements = typing.cast(
        List[Statement],
        graphkb_conn.query(
            {
                'target': 'Statement',
                'filters': {
                    'conditions': convert_to_rid_list(variant_matches),
                    'operator': 'CONTAINSANY',
                },
                'returnProperties': return_props,
            }
        ),
    )

    if not statements:
        if include_unmatched:
            results.append(
                {
                    'variant_matches': join_display_names(variant_matches),
                    'variant': raw_variant_name,
                }
            )
        return results

    print(f'{variant_name} matches {len(statements)} statements')

    # identical for every statement row, so it is built once
    matched_names = join_display_names(variant_matches)

    for statement in statements:
        source = statement['source']
        results.append(
            {
                'variant_matches': matched_names,
                'variant': raw_variant_name,
                'statement.relevance': statement['relevance']['displayName'],
                'statement.@rid': statement['@rid'],
                'statement.subject': statement['subject']['displayName'],
                'statement.source': source['displayName'] if source else '',
                'statement.evidence': join_display_names(statement['evidence']),
                'statement.conditions': join_display_names(statement['conditions']),
                'statement.evidence_level': join_display_names(statement['evidenceLevel'] or []),
                'statement.review_status': statement['reviewStatus'],
                # NOTE(review): therapeutic_terms is not defined in this function;
                # presumably it is read from module scope - confirm it is set before this runs
                'is_therapeutic': statement['relevance']['@rid'] in therapeutic_terms,
            }
        )

    return results