def load(obo_file, logger=None):
    """
    Load OBO file into a networkx graph.

    :param obo_file: OBO definition file.
    :param logger: Python `logging` logger instance.
    :return: `networkx.MultiDiGraph`
    """
    try:
        hpo_network = obonet.read_obo(obo_file)
        # return nx.MultiDiGraph(hpo_network.subgraph(['HP:0000118'] + list(nx.ancestors(hpo_network, 'HP:0000118'))))
    except (FileNotFoundError, PermissionError) as e:
        if logger is not None:
            logger.critical(e)
        else:
            sys.stderr.write(str(e))
        sys.exit(1)

    # roots for non-phenotype nodes
    non_phenotypes = {
        'mortality_aging': 'HP:0040006',
        'mode_of_inheritance': 'HP:0000005',
        'clinical_modifier': 'HP:0012823',
        'frequency': 'HP:0040279',
        'clinical_course': 'HP:0031797',
    }

    # remove non-phenotype branches; obonet edges point child -> parent,
    # so a branch's descendants are its networkx "ancestors"
    for _, hpo_id in non_phenotypes.items():
        if hpo_id in hpo_network.nodes:
            children = nx.ancestors(hpo_network, hpo_id)
            hpo_network.remove_nodes_from([hpo_id] + list(children))

    return hpo_network
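# Usage sketch (path and logger are illustrative; `hp.obo` comes from
# https://hpo.jax.org/app/download/ontology):
#
#   import logging
#   hpo_network = load('hp.obo', logger=logging.getLogger(__name__))
#   print(hpo_network.number_of_nodes())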
def likelihood_moldx(input_file, output_file=None, k_phenotype_groups=1000):
    """
    :param input_file: The file path to a file containing three columns.
        [ID\tkey=value\thpoid|hpoid|hpoid]
    :param output_file: The file path to an output file containing the predicted probabilities
    :param k_phenotype_groups: The number of phenotype groups to use for encoding phenotypes.
        The CLI version of phenopy allows for one of [1000, 1500]
    """
    try:
        obo_file = config.get('hpo', 'obo_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO OBO file found in the configuration file. See "hpo:obo_file" parameter.')
        sys.exit(1)

    try:
        disease_to_phenotype_file = config.get('hpo', 'disease_to_phenotype_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO annotated dataset file found in the configuration file.'
            ' See "hpo:disease_to_phenotype_file" parameter.')
        sys.exit(1)

    logger.info(f'Loading HPO OBO file: {obo_file}')
    hpo_network, alt2prim, _ = generate_annotated_hpo_network(
        obo_file,
        disease_to_phenotype_file,
    )

    # parse input records
    input_records = parse_input(input_file, hpo_network, alt2prim)
    record_ids = [record["record_id"] for record in input_records]
    phenotypes = [record["terms"] for record in input_records]

    # predict likelihood of molecular diagnosis
    positive_probabilities = predict_likelihood_moldx(
        phenotypes,
        phenotype_groups=None,
        hpo_network=hpo_network,
        alt2prim=alt2prim,
        k_phenotype_groups=k_phenotype_groups,
    )

    if output_file is None:
        output_file = "phenopy.likelihood_moldx.txt"

    try:
        with open(output_file, "w") as f:
            for sample_id, probability in zip(record_ids, positive_probabilities):
                f.write(f"{sample_id}\t{probability}\n")
    except IOError as e:
        sys.exit(f"Something went wrong writing the probabilities to file: {e}")
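# Input format sketch (tab-separated; second column is "." when there is no
# metadata; values are illustrative):
#
#   SAMPLE-1    .                  HP:0001263|HP:0001290
#   SAMPLE-2    age=12.0;sex=male  HP:0001250
#
#   likelihood_moldx('input.tsv')  # writes phenopy.likelihood_moldx.txt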
def export_phenotype_hpoa_with_no_parents(phenotype_hpoa_file, phenotype_hpoa_no_parents_file, hpo_network, logger=None):
    """
    Load the disease-to-phenotype annotations from
    https://hpo.jax.org/app/download/annotation, filter out the parent terms
    for each disease, and dump the result to phenotype_hpoa_no_parents_file.

    :param phenotype_hpoa_file: Phenotypes to diseases file.
    :param phenotype_hpoa_no_parents_file: Phenotypes to diseases file with parents removed.
    :param hpo_network: The HPO networkx object.
    :param logger: Python `logging` logger instance.
    :return: None
    """
    try:
        with open(phenotype_hpoa_file, 'r') as tsv_fh:
            # skip the comment lines
            [next(tsv_fh) for _ in range(4)]
            df = pd.read_csv(
                tsv_fh,
                sep='\t',
            )
    except (FileNotFoundError, PermissionError) as e:
        if logger is not None:
            logger.critical(e)
        else:
            sys.stderr.write(str(e))
        sys.exit(1)

    no_parents_df = df.copy()
    for disease_id, annotations in df.groupby('#DatabaseID'):
        # only keep terms that are present in the network
        termlist = [
            node for node in annotations['HPO_ID'].tolist()
            if node in hpo_network.nodes()
        ]
        termlist = remove_parents(termlist, hpo_network)
        parent_idx = annotations.loc[~annotations['HPO_ID'].isin(termlist)].index
        no_parents_df.drop(parent_idx, inplace=True)

    try:
        no_parents_df.to_csv(phenotype_hpoa_no_parents_file, sep='\t', index=False)
    except PermissionError as e:
        if logger is not None:
            logger.critical(e)
        else:
            sys.stderr.write(str(e))
        sys.exit(1)
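# Usage sketch (file names are illustrative; phenotype.hpoa comes from
# https://hpo.jax.org/app/download/annotation):
#
#   export_phenotype_hpoa_with_no_parents(
#       'phenotype.hpoa', 'phenotype.hpoa.noparents', hpo_network)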
def parse_input(input_file, hpo_network, alt2prim):
    """
    Parse a tab-delimited input file of records:
    [record_id\tkey=value;key=value (or ".")\thpoid|hpoid|hpoid]
    """
    try:
        with open(input_file, 'r') as input_fh:
            reader = csv.reader(filter(lambda l: not l.startswith('#'), input_fh),
                                delimiter='\t')
            records = []
            for line in reader:
                # process terms with convert and filter first
                terms = []
                for term_id in line[2].split('|'):
                    # convert alternate ids to primary
                    if term_id in alt2prim:
                        term_id = alt2prim[term_id]
                    # filter out terms not in the hpo network
                    if term_id not in hpo_network.nodes():
                        continue
                    terms.append(term_id)

                record = {
                    'record_id': line[0],
                    'terms': remove_parents(terms, hpo_network),
                    'weights': {},
                    **dict(
                        item.split('=')
                        for item in line[1].split(';') if line[1] != '.')
                }
                # assign new weights here ex. Sex weights (similar to the age weights).
                records.append(record)

    except (FileNotFoundError, PermissionError) as e:
        logger.critical(
            f'Provided input file could not be loaded or does not exist: {e}')
        sys.exit(1)
    except ValueError:
        logger.critical(
            f'Unable to parse input file, invalid line number: {reader.line_num}:{input_file}')
        sys.exit(1)

    return records
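# A record parsed from the hypothetical line
# "SAMPLE-1\tage=12.0\tHP:0001263|HP:0001290" would look like:
#
#   {'record_id': 'SAMPLE-1', 'terms': [...], 'weights': {}, 'age': '12.0'}
#
# (key=value pairs from the second column are merged into the record as-is).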
def request_mimid_info(mimid):
    """
    Request the description of a MIM id from the OMIM API.
    """
    access = "entry?"

    api_key = os.getenv("OMIM_API_KEY")
    if api_key is None:
        api_key = config.get("omim", "omim_api_key")

    payload = {
        "mimNumber": mimid,
        "include": "text",
        "format": "json",
        "apiKey": api_key,
    }

    r = requests.get(OMIM_API_URL + access, params=payload)
    if r.status_code == 200:
        return r
    else:
        logger.critical(
            f"OMIM request failed with status {r.status_code}. "
            "Please set the omim_api_key in your phenopy.ini config file")
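# Usage sketch (requires a valid OMIM API key via the OMIM_API_KEY
# environment variable or the [omim] section of phenopy.ini; the mim number
# is illustrative):
#
#   response = request_mimid_info('100100')
#   if response is not None:
#       data = response.json()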
def read_records_file(records_file, no_parents=False, hpo_network=None, logger=None):
    """
    Parse input file for patient descriptions into a list of dictionaries.

    :param records_file: path to the records file to parse
    :param no_parents: remove parent nodes
    :param hpo_network: hpo network to use in removing parents
    :param logger: logger object to use in reporting errors
    :return: list of dictionaries
    """
    try:
        with open(records_file) as records_fh:
            reader = csv.reader(records_fh, delimiter='\t')
            records = []
            for line in reader:
                if line[0].startswith('#'):
                    continue

                dict_ = {
                    'sample': line[0],
                    'age': parse(line[1], what='age'),
                    'gender': parse(line[1], what='sex'),
                    'terms': parse(line[2], what='HPO'),
                }

                if no_parents is True and hpo_network is not None:
                    dict_['terms'] = remove_parents(dict_['terms'], hpo_network)

                records.append(dict_)

        return records

    except (FileNotFoundError, PermissionError) as e:
        if logger is not None:
            logger.critical(e)
        else:
            sys.stderr.write(str(e))
        sys.exit(1)
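# Expected records file layout (tab-delimited; values are illustrative):
#
#   #sample    metadata               HPO ids
#   SAMPLE-1   age=12.0;sex=female    HP:0001263|HP:0001290
#
#   records = read_records_file('records.tsv', no_parents=True,
#                               hpo_network=hpo_network)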
def _load_hpo_network(obo_file, terms_to_genes, annotations_count, custom_annotations_file, hpo_network_file=None):
    """
    Load and process phenotypes to genes and obo files if we don't have a
    processed network already.
    """
    # The user can set hpo_network_file in $HOME/.phenopy/phenopy.ini; its
    # default is an empty string, which fails the os.path.exists check below.
    if hpo_network_file is None:
        hpo_network_file = config.get('hpo', 'hpo_network_file')

    if not os.path.exists(hpo_network_file):
        # load and process hpo network
        logger.info(f'Loading HPO OBO file: {obo_file}')
        hpo_network = load_obo(obo_file, logger=logger)
        hpo_network = process(hpo_network, terms_to_genes, annotations_count,
                              custom_annotations_file, logger=logger)

        # save a cache of the processed network
        cache(hpo_network, hpo_network_file)
    # the default hpo_network.pickle file was found
    else:
        try:
            hpo_network = restore(hpo_network_file)
        except (FileNotFoundError, PermissionError, IsADirectoryError) as e:
            logger.critical(
                f'{hpo_network_file} is not a valid path to a pickled hpo_network file.\n'
                f'In your $HOME/.phenopy/phenopy.ini, please set hpo_network_file'
                f'=/path/to/hpo_network.pickle OR leave it empty, which is the default.')
            raise e

    return hpo_network
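# A minimal sketch of the caching behavior (paths are hypothetical):
#
#   hpo_network = _load_hpo_network('hp.obo', terms_to_genes,
#                                   annotations_count, None)
#   # a second call restores the pickled network named by phenopy.ini's
#   # hpo_network_file instead of re-processing the OBO file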
def predict_likelihood_moldx(phenotypes, phenotype_groups=None, hpo_network=None, alt2prim=None, k_phenotype_groups=1000):
    """
    Predicts the likelihood of molecular diagnosis given a set of phenotypes.

    :param phenotypes: A list of phenotypes or a list of lists of phenotypes.
    :param phenotype_groups: <optional> A dictionary of phenotype to phenotype group mappings.
    :param hpo_network: <optional> The hpo networkx object.
    :param alt2prim: <optional> A dictionary of alternate phenotype ids to primary phenotype ids.
        (must be given if hpo_network is provided)
    :param k_phenotype_groups: <optional> An integer that represents the number of phenotype groups to use.
    :return: An array of probabilities for the positive class.
    """
    # load the HPO network and alt2prim mapping from the config if not provided
    if hpo_network is None or alt2prim is None:
        try:
            obo_file = config.get('hpo', 'obo_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO OBO file found in the configuration file. See "hpo:obo_file" parameter.')
            raise
        try:
            disease_to_phenotype_file = config.get('hpo', 'disease_to_phenotype_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO annotated dataset file found in the configuration file.'
                ' See "hpo:disease_to_phenotype_file" parameter.')
            raise

        logger.info(f'Loading HPO OBO file: {obo_file}')
        hpo_network, alt2prim, _ = generate_annotated_hpo_network(
            obo_file,
            disease_to_phenotype_file,
        )

    if phenotype_groups is None:
        phenotype_groups = read_phenotype_groups()

    # validate the requested k against the phenotype_groups dictionary
    try:
        phenotype_groups[list(phenotype_groups)[0]][f"k{k_phenotype_groups}"]
    except KeyError:
        logger.critical(
            "The value for k_phenotype_groups was not valid. "
            "Please use a valid k from the phenotype_groups dictionary.")
        raise

    encoded_phenotypes = encode_phenotypes(phenotypes, phenotype_groups,
                                           hpo_network, alt2prim,
                                           k=k_phenotype_groups)
    model = joblib.load(config['models']['likelihood.model'])
    probabilities = model.predict_proba(encoded_phenotypes)
    return probabilities[:, 1]
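# Usage sketch (HPO ids are illustrative; phenotypes may be a flat list or
# one list per record):
#
#   probabilities = predict_likelihood_moldx(
#       [['HP:0001263', 'HP:0001290'], ['HP:0001250']])
#   # one positive-class probability per input record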
def score(input_file, output_file='-', records_file=None, annotations_file=None, custom_disease_file=None, ages_distribution_file=None, self=False, summarization_method='BMWA', scoring_method='HRSS', threads=1):
    """
    Scores the similarity of the provided HPO annotated entries (see format below)
    against an HPO annotated dataset. By default, scoring happens against diseases
    annotated by the HPO group. See https://hpo.jax.org/app/download/annotation.

    Phenopy also supports scoring the product of provided entries (see "--product")
    or scoring against a custom records dataset (see "--records-file").

    :param input_file: File with HPO annotated entries, one per line (see format below).
    :param output_file: File path where to store the results. [default: - (stdout)]
    :param records_file: An entity-to-phenotype annotation file in the same format as
        "input_file". This file, if provided, is used to score entries in the
        "input_file" against entries here. [default: None]
    :param annotations_file: An entity-to-phenotype annotation file in the same format as
        "input_file". This file, if provided, is used to add information content to the
        network. [default: None]
    :param custom_disease_file: entity annotation file for ranking diseases/genes
    :param ages_distribution_file: Phenotypes age summary stats file containing
        phenotype HPO id, mean_age, and std. [default: None]
    :param self: Score entries in the "input_file" against itself.
    :param summarization_method: The method used to summarize the HRSS matrix.
        Supported values are best match average (BMA), best match weighted average
        (BMWA), and maximum (maximum). [default: BMWA]
    :param scoring_method: Either HRSS or Resnik
    :param threads: Number of parallel processes to use. [default: 1]
    """
    try:
        obo_file = config.get('hpo', 'obo_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO OBO file found in the configuration file. See "hpo:obo_file" parameter.')
        sys.exit(1)

    if custom_disease_file is None:
        try:
            disease_to_phenotype_file = config.get('hpo', 'disease_to_phenotype_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO annotated dataset file found in the configuration file.'
                ' See "hpo:disease_to_phenotype_file" parameter.')
            sys.exit(1)
    else:
        logger.info(f"using custom disease annotation file: {custom_disease_file}")
        disease_to_phenotype_file = custom_disease_file

    logger.info(f'Loading HPO OBO file: {obo_file}')
    hpo_network, alt2prim, disease_records = generate_annotated_hpo_network(
        obo_file,
        disease_to_phenotype_file,
        annotations_file=annotations_file,
        ages_distribution_file=ages_distribution_file,
    )

    # parse input records
    input_records = parse_input(input_file, hpo_network, alt2prim)

    # create an instance of the scorer class
    try:
        scorer = Scorer(hpo_network, summarization_method=summarization_method,
                        scoring_method=scoring_method)
    except ValueError as e:
        logger.critical(f'Failed to initialize scoring class: {e}')
        sys.exit(1)

    if self:
        score_records = input_records
        scoring_pairs = half_product(len(score_records), len(score_records))
    else:
        if records_file:
            score_records = parse_input(records_file, hpo_network, alt2prim)
        else:
            score_records = disease_records

        scoring_pairs = itertools.product(
            range(len(input_records)),
            range(len(score_records)),
        )

    results = scorer.score_records(input_records, score_records, scoring_pairs, threads)

    with open_or_stdout(output_file) as output_fh:
        output_fh.write('\t'.join(['#query', 'entity_id', 'score']))
        output_fh.write('\n')
        for result in results:
            output_fh.write('\t'.join(str(column) for column in result))
            output_fh.write('\n')
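# Usage sketch (file name is illustrative; writes "#query entity_id score"
# rows for each input record against the HPO disease annotations):
#
#   score('cases.tsv', output_file='-', summarization_method='BMWA',
#         scoring_method='HRSS', threads=4)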
def annotate(hpo_network, phenotype_to_diseases, num_diseases_annotated, alt2prim, annotations_file=None, ages_distribution_file=None, phenotype_disease_frequencies=None):
    """
    Annotates the HPO network: adds information content, depth, age
    distributions, and disease-frequency weights to each node, and cleans the
    synonym tags.

    :param hpo_network: `networkx.MultiDiGraph` to annotate.
    :param phenotype_to_diseases: Dictionary mapping HPO terms to diseases.
    :param num_diseases_annotated: Number of diseases with HPO annotations.
    :param alt2prim: The dict of alternate terms to canonical terms.
    :param annotations_file: A list of custom annotation files, in the same format as
        tests/data/test.score-long.txt
    :param ages_distribution_file: Path to phenotypes ages distribution file.
    :param phenotype_disease_frequencies: dictionary of phenotype to disease frequencies
    :return: `networkx.MultiDiGraph`
    """
    # Before calculating information content, check for annotations_file and load
    custom_annos = None
    if annotations_file is not None:
        custom_annos = {}
        for record in parse_input(annotations_file, hpo_network, alt2prim):
            for term_id in record['terms']:
                if term_id not in custom_annos:
                    custom_annos[term_id] = []
                custom_annos[term_id].append(record['record_id'])

    # make ages distributions
    ages = None
    if ages_distribution_file is not None:
        try:
            ages = make_age_distributions(ages_distribution_file)
            logger.info(
                f'Adding custom phenotype age distributions to HPO nodes from file: {ages_distribution_file}')
        except (FileNotFoundError, PermissionError) as e:
            logger.critical(
                f'Specified phenotype ages file could not be loaded or does not exist: {e}')
            sys.exit(1)

    for node_id, data in hpo_network.nodes(data=True):
        # annotate with information content value
        hpo_network.nodes[node_id]['ic'] = calculate_information_content(
            node_id,
            hpo_network,
            phenotype_to_diseases,
            num_diseases_annotated,
            custom_annos,
        )
        # annotate with phenotype age distribution
        hpo_network.nodes[node_id]['disease_weights'] = {}

        if ages is not None and node_id in ages.index:
            hpo_network.nodes[node_id]['age_dist'] = ages.loc[node_id]['age_dist']

        # add the disease_frequency weights as attributes to the node
        # (setdefault guards against a KeyError when 'weights' was never initialized)
        if phenotype_disease_frequencies is not None:
            if node_id in phenotype_disease_frequencies:
                for disease_id, frequency in phenotype_disease_frequencies[node_id].items():
                    hpo_network.nodes[node_id].setdefault('weights', {}).setdefault(
                        'disease_frequency', {})[disease_id] = frequency

        # annotate with depth value
        # hard-coding origin node for now
        origin = 'HP:0000001'
        hpo_network.nodes[node_id]['depth'] = nx.shortest_path_length(
            hpo_network,
            node_id,
            origin,
        )

        # clean synonyms
        synonyms = []
        try:
            for synonym in data['synonym']:
                synonyms.append(synonym)
            hpo_network.nodes[node_id]['synonyms'] = re.findall(r'"(.*?)"', ','.join(synonyms))
        except KeyError:
            # pass if no synonym tags in the node
            pass

    return hpo_network
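# After annotation each node carries, among others (term is illustrative):
#
#   hpo_network.nodes['HP:0001263']['ic']     # information content
#   hpo_network.nodes['HP:0001263']['depth']  # path length to HP:0000001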
def score(query_hpo_file, records_file=None, query_name='SAMPLE', obo_file=None, pheno2genes_file=None, threads=1, agg_score='BMA', no_parents=False, custom_annotations_file=None, output_file=None):
    """
    Scores a case's HPO terms against all genes' associated HPO terms.

    :param query_hpo_file: File with case HPO terms, one per line.
    :param records_file: One record per line, tab delimited. First column record unique identifier,
        second column pipe separated list of HPO identifier (HP:0000001).
    :param query_name: Unique identifier for the query file.
    :param obo_file: OBO file from https://hpo.jax.org/app/download/ontology.
    :param pheno2genes_file: Phenotypes to genes from https://hpo.jax.org/app/download/annotation.
    :param threads: Number of parallel processes to use.
    :param agg_score: The aggregation method to use for summarizing the similarity matrix between
        two term sets. Must be one of {'BMA', 'maximum'}.
    :param no_parents: If provided, scoring is done by only using the most informative nodes.
        All parent nodes are removed.
    :param custom_annotations_file: A custom entity-to-phenotype annotation file in the same format as
        tests/data/test.score-product.txt
    :param output_file: filepath where to store the results.
    """
    if agg_score not in {'BMA', 'maximum', }:
        logger.critical(
            'agg_score must be one of {BMA, maximum}.')
        sys.exit(1)

    if obo_file is None:
        try:
            obo_file = config.get('hpo', 'obo_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO OBO file provided and no "hpo:obo_file" found in the configuration file.')
            sys.exit(1)

    if pheno2genes_file is None:
        try:
            pheno2genes_file = config.get('hpo', 'pheno2genes_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO pheno2genes_file file provided and no "hpo:pheno2genes_file" found in the configuration file.')
            sys.exit(1)

    try:
        with open(query_hpo_file, 'r') as case_fh:
            case_hpo = case_fh.read().splitlines()
    except (FileNotFoundError, PermissionError) as e:
        logger.critical(e)
        sys.exit(1)

    # load phenotypes to genes associations
    terms_to_genes, genes_to_terms, annotations_count = load_p2g(
        pheno2genes_file, logger=logger)

    # load hpo network
    hpo_network = _load_hpo_network(
        obo_file, terms_to_genes, annotations_count, custom_annotations_file)

    # create an instance of the scorer class
    scorer = Scorer(hpo_network)

    # multiprocessing objects
    manager = Manager()
    lock = manager.Lock()

    if no_parents is True:
        case_hpo = remove_parents(case_hpo, hpo_network)

    if records_file:
        # score and output case hpo terms against all entities in the records file
        logger.info(
            f'Scoring HPO terms from file: {query_hpo_file} against entities in: {records_file}')

        records = read_records_file(records_file, no_parents, hpo_network, logger=logger)

        # include the case itself so it is scored as well
        records[query_name] = case_hpo

        if not output_file:
            sys.stdout.write('\t'.join(['#query', 'entity_id', 'score']))
            sys.stdout.write('\n')
            with Pool(threads) as p:
                p.starmap(scorer.score_pairs,
                          [(records, [(query_name, record) for record in records],
                            lock, agg_score, i, threads) for i in range(threads)])
        else:
            with Pool(threads) as p:
                scored_results = p.starmap(
                    scorer.score_pairs,
                    [(records, [(query_name, record) for record in records],
                      lock, agg_score, i, threads, False) for i in range(threads)])

            scored_results = [item for sublist in scored_results for item in sublist]
            scored_results_df = pd.DataFrame(
                data=scored_results, columns='#query,entity_id,score'.split(','))
            scored_results_df = scored_results_df.sort_values(by='score', ascending=False)
            scored_results_df.to_csv(output_file, sep='\t', index=False)
            logger.info('Scoring completed')
            logger.info(f'Writing results to file: {output_file}')

    else:
        # score and output case hpo terms against each gene's associated set of hpo terms
        logger.info(f'Scoring case HPO terms from file: {query_hpo_file}')

        # add the case terms to the genes_to_terms dict
        genes_to_terms[query_name] = case_hpo

        if not output_file:
            sys.stdout.write('\t'.join(['#query', 'gene', 'score']))
            sys.stdout.write('\n')
            # iterate over each cross-product and score the pair of records
            with Pool(threads) as p:
                p.starmap(scorer.score_pairs,
                          [(genes_to_terms, [(query_name, gene) for gene in genes_to_terms],
                            lock, agg_score, i, threads) for i in range(threads)])
        else:
            with Pool(threads) as p:
                scored_results = p.starmap(
                    scorer.score_pairs,
                    [(genes_to_terms, [(query_name, gene) for gene in genes_to_terms],
                      lock, agg_score, i, threads, False) for i in range(threads)])

            scored_results = [item for sublist in scored_results for item in sublist]
            scored_results_df = pd.DataFrame(
                data=scored_results, columns='#query,gene,score'.split(','))
            scored_results_df = scored_results_df.sort_values(by='score', ascending=False)
            scored_results_df.to_csv(output_file, sep='\t', index=False)
            logger.info('Scoring completed')
            logger.info(f'Writing results to file: {output_file}')
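# Usage sketch (hypothetical file of one HPO id per line):
#
#   score('case_terms.txt', query_name='CASE-1', agg_score='BMA', threads=4)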
def score_product(records_file, obo_file=None, pheno2genes_file=None, threads=1, agg_score='BMA', no_parents=False, custom_annotations_file=None):
    """
    Scores the cartesian product of HPO terms from a list of unique records
    (cases, genes, diseases, etc).

    :param records_file: One record per line, tab delimited. First column record unique identifier,
        second column pipe separated list of HPO identifier (HP:0000001).
    :param obo_file: OBO file from https://hpo.jax.org/app/download/ontology.
    :param pheno2genes_file: Phenotypes to genes from https://hpo.jax.org/app/download/annotation.
    :param threads: Multiprocessing threads to use [default: 1].
    :param agg_score: The aggregation method to use for summarizing the similarity matrix between
        two term sets. Must be one of {'BMA', 'maximum'}.
    :param no_parents: If provided, scoring is done by only using the most informative nodes.
        All parent nodes are removed.
    :param custom_annotations_file: A custom entity-to-phenotype annotation file in the same format as
        tests/data/test.score-product.txt
    """
    if agg_score not in {'BMA', 'maximum', }:
        logger.critical(
            'agg_score must be one of {BMA, maximum}.')
        sys.exit(1)

    if obo_file is None:
        try:
            obo_file = config.get('hpo', 'obo_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO OBO file provided and no "hpo:obo_file" found in the configuration file.')
            sys.exit(1)

    if pheno2genes_file is None:
        try:
            pheno2genes_file = config.get('hpo', 'pheno2genes_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO pheno2genes_file file provided and no "hpo:pheno2genes_file" found in the configuration file.')
            sys.exit(1)

    # load phenotypes to genes associations
    terms_to_genes, _, annotations_count = load_p2g(
        pheno2genes_file, logger=logger)

    # load hpo network
    hpo_network = _load_hpo_network(
        obo_file, terms_to_genes, annotations_count, custom_annotations_file)

    # read_records_file logs and exits on file errors
    records = read_records_file(records_file, no_parents, hpo_network, logger=logger)

    logger.info(f'Scoring product of records from file: {records_file}')

    # create an instance of the scorer class
    scorer = Scorer(hpo_network)

    # create records product (materialized as a list so it can be pickled
    # for the worker processes)
    records_product = list(itertools.product(records.keys(), repeat=2))

    # iterate over each cross-product and score the pair of records
    manager = Manager()
    lock = manager.Lock()
    with Pool(threads) as p:
        p.starmap(scorer.score_pairs, [(records, records_product, lock, agg_score, i, threads)
                                       for i in range(threads)])
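# Usage sketch (pairwise scores for every record pair are streamed to stdout):
#
#   score_product('records.tsv', threads=4, agg_score='BMA')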