예제 #1
0
def get_therapeutic_associated_genes(graphkb_conn: GraphKBConnection) -> Set[str]:
    """
    Collect the record IDs of Features linked to therapeutic statements

    Statements are selected by relevance, using the term set that
    get_terms_set returns for BASE_THERAPEUTIC_TERMS. Statements whose
    reviewStatus equals FAILED_REVIEW_STATUS are ignored.

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object

    Returns:
        set.<str>: record IDs of Feature conditions, plus record IDs of the
            Feature references (reference1/reference2) of Variant conditions
    """
    relevance_terms = get_terms_set(graphkb_conn, BASE_THERAPEUTIC_TERMS)
    query = {
        'target': 'Statement',
        'filters': {'relevance': sorted(relevance_terms)},
        'returnProperties': [
            'conditions.@rid',
            'conditions.@class',
            'conditions.reference1.@class',
            'conditions.reference1.@rid',
            'conditions.reference2.@class',
            'conditions.reference2.@rid',
            'reviewStatus',
        ],
    }
    feature_rids = set()

    for statement in graphkb_conn.query(query):
        if statement['reviewStatus'] == FAILED_REVIEW_STATUS:
            continue
        for condition in statement['conditions']:
            condition_class = condition['@class']
            if condition_class == 'Feature':
                # the condition itself is the gene/feature
                feature_rids.add(condition['@rid'])
                continue
            if not condition_class.endswith('Variant'):
                continue
            # variants point at their feature(s) through reference1/reference2
            for reference_key in ('reference1', 'reference2'):
                reference = condition[reference_key]
                if reference and reference['@class'] == 'Feature':
                    feature_rids.add(reference['@rid'])
    return feature_rids
예제 #2
0
def get_approved_evidence_levels(
        graphkb_conn: GraphKBConnection) -> List[Ontology]:
    """
    Fetch the EvidenceLevel records listed in APPROVED_EVIDENCE_LEVELS

    APPROVED_EVIDENCE_LEVELS maps a source name to the evidence level names
    approved for that source. One (source AND name IN names) clause is built
    per source and the clauses are OR-ed together in a single query.

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object

    Returns:
        the result of the EvidenceLevel query
    """
    per_source_filters = [
        {
            'AND': [
                {'source': {'target': 'Source', 'filters': {'name': source_name}}},
                {'name': level_names, 'operator': 'IN'},
            ]
        }
        for source_name, level_names in APPROVED_EVIDENCE_LEVELS.items()
    ]
    query = {'target': 'EvidenceLevel', 'filters': {'OR': per_source_filters}}
    return graphkb_conn.query(query)
예제 #3
0
def get_statements_from_variants(
    graphkb_conn: GraphKBConnection, variants: List[Record]
) -> List[Statement]:
    """
    Fetch the statements that have any of the given variants as a condition

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        variants (list.<dict>): list of variant records

    Returns:
        list.<dict>: Statement records from graphkb, excluding those whose
            reviewStatus equals FAILED_REVIEW_STATUS
    """
    # the generic properties are requested for each of these linked records
    linked_props = [
        f'{link}.{prop}'
        for link in ('conditions', 'subject', 'evidence', 'relevance', 'evidenceLevel')
        for prop in GENERIC_RETURN_PROPERTIES
    ]
    return_props = (
        BASE_RETURN_PROPERTIES
        + ['sourceId', 'source.name', 'source.displayName']
        + linked_props
        + ['reviewStatus']
    )
    query = {
        'target': 'Statement',
        'filters': {'conditions': convert_to_rid_list(variants), 'operator': 'CONTAINSANY'},
        'returnProperties': return_props,
    }

    passing = []
    for statement in graphkb_conn.query(query):
        if statement['reviewStatus'] != FAILED_REVIEW_STATUS:
            passing.append(statement)
    return passing
예제 #4
0
def get_alternatives(graphkb_conn: GraphKBConnection,
                     record_id: str) -> List[Dict]:
    """
    Run a 'similarTo' query for a single record

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        record_id (str): the record ID to start from

    Returns:
        list.<dict>: the records returned by the query
    """
    similar_to_query = {
        'target': [record_id],
        'queryType': 'similarTo',
        # no tree edges are passed to the query
        'treeEdges': [],
    }
    return graphkb_conn.query(similar_to_query)
예제 #5
0
def get_gene_information(
    graphkb_conn: GraphKBConnection, gene_names: Iterable[str]
) -> List[IprGene]:
    """
    Create the Gene Info object for upload to IPR with the other report information

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        gene_names (iterable.<str>): names of the genes to build rows for

    Returns:
        list.<IprGene>: one row per gene that has at least one flag set. Each
            row holds the gene name plus only the flags that are true.
    """
    logger.info('fetching variant related genes list')
    variants = graphkb_conn.query(
        {'target': 'Variant', 'returnProperties': ['@class', 'reference1', 'reference2']},
    )

    cancer_related: Set[str] = set()
    fusion_partners: Set[str] = set()
    small_mutations: Set[str] = set()
    gene_flags: Dict[str, Set[str]] = {
        'cancerRelated': cancer_related,
        'knownFusionPartner': fusion_partners,
        'knownSmallMutation': small_mutations,
    }

    for variant in variants:
        first, second = variant['reference1'], variant['reference2']
        cancer_related.add(first)
        if second:
            # a second reference marks both features as fusion partners
            cancer_related.add(second)
            fusion_partners.update((first, second))
        elif variant['@class'] == 'PositionalVariant':
            small_mutations.add(first)

    logger.info('fetching oncogenes list')
    gene_flags['oncogene'] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn))
    logger.info('fetching tumour supressors list')
    gene_flags['tumourSuppressor'] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn))
    logger.info('fetching therapeutic associated genes lists')
    gene_flags['therapeuticAssociated'] = get_therapeutic_associated_genes(graphkb_conn)

    result = []

    for gene_name in gene_names:
        equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name))
        raised_flags = [flag for flag, rids in gene_flags.items() if equivalent & rids]
        if not raised_flags:
            continue

        # make smaller JSON to upload since all default to false already
        row = IprGene({'name': gene_name})
        for flag in raised_flags:
            row[flag] = True
        result.append(row)

    return result
예제 #6
0
def get_preferred_gene_name(graphkb_conn: GraphKBConnection,
                            record_id: str) -> str:
    """
    Given some Feature record ID return the preferred gene name

    Gene records linked to the input record are collected and ranked. The
    best-ranked candidate is one that is not deprecated, has no dependency,
    has no underscore in its name and whose name does not start with 'ens'.

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        record_id (str): the Feature record ID

    Returns:
        str: displayName of the best-ranked linked gene, or the displayName of
            the input record when no linked gene is found
    """
    record = graphkb_conn.get_record_by_id(record_id)
    is_gene = record.get('biotype', '') == 'gene'
    expanded = graphkb_conn.query({'target': [record_id], 'neighbors': 3})[0]

    # (edge list on the expanded record, end of the edge holding the other record)
    traversals = []
    if not is_gene:
        # for non-gene features also consider the genes they are an element of
        traversals.append(('out_ElementOf', 'in'))
    traversals.extend([
        ('out_AliasOf', 'in'),
        ('in_AliasOf', 'out'),
        ('in_DeprecatedBy', 'out'),
        ('out_CrossReferenceOf', 'in'),
        ('in_CrossReferenceOf', 'out'),
    ])

    candidates = []
    for edge_type, far_end in traversals:
        for edge in expanded.get(edge_type, []):
            linked = edge[far_end]
            if linked.get('biotype') == 'gene':
                candidates.append(linked)

    def preference(gene):
        # False sorts before True, so each element pushes a less desirable trait later
        return (
            gene['deprecated'],
            bool(gene['dependency']),
            '_' in gene['name'],
            gene['name'].startswith('ens'),
        )

    candidates.sort(key=preference)
    if not candidates:
        # fallback to the input displayName
        return record['displayName']
    return candidates[0]['displayName']
예제 #7
0
def get_preferred_drug_representation(graphkb_conn: GraphKBConnection,
                                      drug_record_id: str) -> Dict:
    """
    Pick the preferred record among the alternatives of a Drug record

    The alternatives come from get_alternatives and are ranked with
    generate_ontology_preference_key, which is given the 'sort' value of
    every Source record keyed by the source record ID.

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        drug_record_id (str): record ID of the drug

    Returns:
        dict: the first alternative after sorting by the preference key
    """
    sources = graphkb_conn.query({
        'target': 'Source',
        'returnProperties': ['sort', '@rid']
    })
    source_preference = {}
    for source in sources:
        source_preference[source['@rid']] = source['sort']

    def preference_key(rec):
        return generate_ontology_preference_key(rec, source_preference)

    alternatives = get_alternatives(graphkb_conn, drug_record_id)
    ranked = sorted(alternatives, key=preference_key)
    return ranked[0]
예제 #8
0
from graphkb import GraphKBConnection
from graphkb.constants import BASE_RETURN_PROPERTIES, GENERIC_RETURN_PROPERTIES
from graphkb.match import match_positional_variant
from graphkb.util import convert_to_rid_list
from graphkb.vocab import get_term_tree

# Connection settings for the GraphKB API used by this example.
# NOTE(review): the user/password values are masked placeholders as shown
# here; real credentials need to be supplied before running.
GKB_API_URL = 'https://pori-demo.bcgsc.ca/graphkb-api/api'
GKB_USER = '******'
GKB_PASSWORD = '******'

# Create the connection (with use_global_cache turned off) and authenticate.
graphkb_conn = GraphKBConnection(GKB_API_URL, use_global_cache=False)
graphkb_conn.login(GKB_USER, GKB_PASSWORD)

# Find the GraphKB variant records that the input variant notation matches.
variant_name = 'KRAS:p.G12D'
variant_matches = match_positional_variant(graphkb_conn, variant_name)

# Report each matched record by its display name.
for match in variant_matches:
    print(variant_name, 'will match', match['displayName'])

# Properties to request on each returned record: the base properties, the
# source fields, and the generic properties of each linked record
# (conditions, subject, evidence, relevance, evidenceLevel).
# Return properties should be customized to the user's needs.
return_props = (BASE_RETURN_PROPERTIES +
                ['sourceId', 'source.name', 'source.displayName'] +
                [f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES] +
                [f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES] +
                [f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES] +
                [f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES] +
                [f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES])

statements = graphkb_conn.query({
    'target': 'Statement',
    'filters': {
예제 #9
0
def genes() -> List[Dict]:
    """
    Log in to GraphKB and fetch the gene information for a fixed list of names

    The login uses the IPR_USER and IPR_PASS environment variables.

    Returns:
        list.<dict>: the rows returned by get_gene_information
    """
    connection = GraphKBConnection()
    user = os.environ['IPR_USER']
    password = os.environ['IPR_PASS']
    connection.login(user, password)

    gene_names = ['kras', 'cdkn2a', 'blargh-monkeys', 'ewsr1']
    return get_gene_information(connection, gene_names)
예제 #10
0
def summarize(
    graphkb_conn: GraphKBConnection,
    matches: Sequence[KbMatch],
    disease_name: str,
    variants: List[IprVariant],
) -> str:
    """
    Given a list of GraphKB matches generate a text summary to add to the report

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        matches: the matches; each one provides 'kbStatementId' and 'variant'
        disease_name (str): disease name passed to get_term_tree
        variants: the experimental variants; each one provides a 'key'

    Returns:
        str: HTML with a header line followed by one section per gene, joined
            by newlines. Sections are ordered by decreasing statement count.
    """
    variants_by_keys = {variant['key']: variant for variant in variants}

    # statement ID -> keys of the experimental variants matched to it
    variant_keys_by_statement_ids: Dict[str, Set[str]] = {}
    for match in matches:
        statement_id = match['kbStatementId']
        variant_key = match['variant']
        variant_keys_by_statement_ids.setdefault(statement_id, set()).add(variant_key)

    exp_variants_by_statements: Dict[str, List[IprVariant]] = {
        statement_id: [variants_by_keys[key] for key in keys]
        for statement_id, keys in variant_keys_by_statement_ids.items()
    }

    disease_terms = get_term_tree(graphkb_conn, disease_name, ontology_class='Disease')
    disease_matches = convert_to_rid_set(disease_terms)

    # get details for statements
    templates: Dict[str, List[Statement]] = {}
    statements: Dict[str, Statement] = {}
    for match in matches:
        rid = match['kbStatementId'].replace('#', '')
        response = graphkb_conn.request(f'/statements/{rid}?neighbors=1')
        statement = response['result']

        templates.setdefault(statement['displayNameTemplate'], []).append(statement)
        statements[statement['@rid']] = statement

    # aggregate similar sentences
    sentences = {}
    for template, group in templates.items():
        aggregated = aggregate_statements(graphkb_conn, template, group, disease_matches)
        sentences.update(aggregated)

    # section statements by genes
    statements_by_genes = section_statements_by_genes(
        graphkb_conn, list(statements.values()))

    output: List[str] = [
        '<h3>The comments below were automatically generated from matches to GraphKB and have not been manually reviewed</h3>'
    ]

    # sections with the most statements come first
    ordered_sections = sorted(
        statements_by_genes.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )
    for section, statement_rids in ordered_sections:
        # de-duplicate the experimental variants of this section by key
        exp_variants = {}
        variant_lists = [exp_variants_by_statements[r] for r in statement_rids]
        for variant_list in variant_lists:
            for variant in variant_list:
                exp_variants[variant['key']] = variant

        section_sentences = {r: sentences[r] for r in statement_rids}
        section_statements = {r: statements[r] for r in statement_rids}
        section_html = create_section_html(
            graphkb_conn,
            section,
            section_sentences,
            section_statements,
            list(exp_variants.values()),
        )
        output.append(section_html)

    return '\n'.join(output)
예제 #11
0
def create_section_html(
    graphkb_conn: GraphKBConnection,
    gene_name: str,
    sentences_by_statement_id: Dict[str, str],
    statements: Dict[str, Statement],
    exp_variants: List[IprVariant],
) -> str:
    """
    Generate HTML for a gene section of the comments

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        gene_name (str): the gene the section is about; used as the heading
        sentences_by_statement_id: the sentence for each statement record ID
        statements: the statement records by record ID; must contain every
            key of sentences_by_statement_id
        exp_variants: the experimental variants passed to display_variants

    Returns:
        str: the section HTML, or an empty string when display_variants
            produces no text for this gene
    """
    variants_text = display_variants(gene_name, exp_variants)
    if not variants_text:
        # exclude sections where they are not linked to an experimental
        # variant. this can occur when there are co-occurent statements
        # collected. Checked before anything else so that no GraphKB requests
        # are made for a section that is going to be discarded.
        return ''

    output = [f'<h2>{gene_name}</h2>']

    # sentence -> relevance category; when two statements share a sentence the
    # category of the last one wins
    category_terms = RELEVANCE_BASE_TERMS + [('resistance', ['no sensitivity'])]
    sentence_categories: Dict[str, str] = {}
    for statement_id, sentence in sentences_by_statement_id.items():
        relevance = statements[statement_id]['relevance']['@rid']
        sentence_categories[sentence] = categorize_relevance(
            graphkb_conn, relevance, category_terms)

    # get the entrez gene description
    genes = sorted(
        graphkb_conn.query(
            {
                'target': 'Feature',
                'filters': {
                    'AND': [
                        {'source': {'target': 'Source', 'filters': {'name': 'entrez gene'}}},
                        {'name': gene_name},
                        {'biotype': 'gene'},
                    ]
                },
            },
        ),
        key=generate_ontology_preference_key,
    )

    # NOTE(review): the variants text is only emitted together with the gene
    # description, so it is dropped when no description is found -- confirm
    # this is intended
    if genes and genes[0].get('description', ''):
        # keep at most the first two sentences of the description
        description = '. '.join(genes[0]['description'].split('. ')[:2])
        sourceId = genes[0]['sourceId']

        output.append(f'''
<blockquote class="entrez_description" cite="{ENTREZ_GENE_URL}/{sourceId}">
    {description}.
</blockquote>
<p>
    {variants_text}
</p>
''')

    def sentences_in(*categories: str) -> Set[str]:
        return {s for (s, c) in sentence_categories.items() if c in categories}

    named_categories = ('diagnostic', 'biological', 'therapeutic', 'prognostic', 'resistance')

    # One paragraph per group, in display order. Every sentence has exactly one
    # category, so the groups are disjoint and no sentence can repeat. A group
    # with no sentences still produces an (empty) paragraph.
    paragraphs = [
        sentences_in('diagnostic'),
        sentences_in('biological'),
        sentences_in('therapeutic', 'prognostic'),
        {s for (s, c) in sentence_categories.items() if c not in named_categories},
        sentences_in('resistance'),
    ]
    for paragraph in paragraphs:
        content = '. '.join(sorted(paragraph))
        output.append(f'<p>{content}</p>')
    return '\n'.join(output)
예제 #12
0
def test_login_ok():
    """Logging in with the GRAPHKB_USER/GRAPHKB_PASS credentials sets a token."""
    connection = GraphKBConnection()
    user = os.environ['GRAPHKB_USER']
    password = os.environ['GRAPHKB_PASS']
    connection.login(user, password)
    assert connection.token is not None
예제 #13
0
def conn():
    """Return a GraphKBConnection logged in with GRAPHKB_USER/GRAPHKB_PASS."""
    connection = GraphKBConnection()
    user = os.environ['GRAPHKB_USER']
    password = os.environ['GRAPHKB_PASS']
    connection.login(user, password)
    return connection
예제 #14
0
def _join_display_names(records) -> str:
    """Return the sorted displayName values of the records joined by ';'."""
    return ';'.join(sorted(record['displayName'] for record in records))


def annotate_variant(graphkb_conn: GraphKBConnection,
                     raw_variant_name: str,
                     include_unmatched: bool = False) -> List[Dict[str, str]]:
    """
    Match a variant against GraphKB and list the statements it is a condition of

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        raw_variant_name (str): the variant notation as given in the input; it
            is passed through convert_aa_3to1 before matching
        include_unmatched (bool): also emit a row when the feature is not
            found or when no statements match

    Returns:
        list.<dict>: one row per matching statement. Instead of statement
            rows the list may hold a single row with an 'error' message
            (unsupported notation or a failure while matching), a single row
            with only the variant fields (include_unmatched), or nothing.
    """
    results = []
    variant_name = convert_aa_3to1(raw_variant_name)

    if 'c.*' in variant_name:
        results.append({
            'variant': raw_variant_name,
            'error': f'skipping unsupported notation: {variant_name}',
        })
        return results

    print(f'processing: {variant_name}')

    try:
        variant_matches = match_positional_variant(graphkb_conn, variant_name)
    except FeatureNotFoundError:
        if include_unmatched:
            results.append({'variant': raw_variant_name})
        return results
    except Exception as err:
        # best-effort: report the failure on the row rather than aborting
        results.append({'variant': raw_variant_name, 'error': str(err)})
        return results

    if variant_matches:
        print(f'{variant_name} matches {len(variant_matches)} variant records')

    # return properties should be customized to the users needs
    return_props = (
        BASE_RETURN_PROPERTIES
        + ['sourceId', 'source.name', 'source.displayName']
        + [f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES]
        + [f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES]
        + [f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES]
        + [f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES]
        + [f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES]
        + ['reviewStatus']
    )

    # the query yields a list of statements, so the cast names the list type
    statements = typing.cast(
        List[Statement],
        graphkb_conn.query({
            'target': 'Statement',
            'filters': {
                'conditions': convert_to_rid_list(variant_matches),
                'operator': 'CONTAINSANY',
            },
            'returnProperties': return_props,
        }),
    )
    if not statements:
        if include_unmatched:
            results.append({
                'variant_matches': _join_display_names(variant_matches),
                'variant': raw_variant_name,
            })
        return results
    print(f'{variant_name} matches {len(statements)} statements')

    # identical for every row, so computed once outside the loop
    matched_names = _join_display_names(variant_matches)

    for statement in statements:
        source = statement['source']
        results.append({
            'variant_matches': matched_names,
            'variant': raw_variant_name,
            'statement.relevance': statement['relevance']['displayName'],
            'statement.@rid': statement['@rid'],
            'statement.subject': statement['subject']['displayName'],
            # source may be empty on a statement
            'statement.source': source['displayName'] if source else '',
            'statement.evidence': _join_display_names(statement['evidence']),
            'statement.conditions': _join_display_names(statement['conditions']),
            # evidenceLevel may be empty on a statement
            'statement.evidence_level': _join_display_names(statement['evidenceLevel'] or []),
            'statement.review_status': statement['reviewStatus'],
            # therapeutic_terms is defined outside this function
            'is_therapeutic': statement['relevance']['@rid'] in therapeutic_terms,
        })
    return results
예제 #15
0
)
# GraphKB credentials; both default to 'colab_demo'.
parser.add_argument('--graphkb_user',
                    default='colab_demo',
                    help='The username for logging in to GraphKB')
parser.add_argument('--graphkb_pass',
                    default='colab_demo',
                    help='The password for logging in to GraphKB')
# Flag (off by default) to also output variants without matching statements.
parser.add_argument(
    '--include_unmatched',
    default=False,
    action='store_true',
    help='Include lines for variants that did not match any statements',
)
args = parser.parse_args()

# Connect and log in with the parsed arguments.
# NOTE(review): args.graphkb_url and args.inputs are presumably declared by
# parser arguments above this point -- not visible here, confirm.
graphkb_conn = GraphKBConnection(args.graphkb_url, use_global_cache=True)
graphkb_conn.login(args.graphkb_user, args.graphkb_pass)

# read the input files
# Each file is parsed as tab-separated text and tagged with its base file
# name in a 'filename' column before all of them are concatenated.
inputs = []
for filename in args.inputs:
    print(f'reading: {filename}')
    temp_df = pd.read_csv(filename, sep='\t')
    temp_df['filename'] = os.path.basename(filename)
    inputs.append(temp_df)
input_df = pd.concat(inputs)


# generate the variant list df
def get_variant(row):
    if not pd.isnull(row['ANN[*].HGVS_P']):