def test_test_similarity(self):
    """ check that test_similarity gives the expected P-value estimates
    """

    # Each case is (probands, expected P-value, permitted deviation). The
    # cases are run in order, with 1000 simulations and resnik scoring.
    cases = [
        # Two probands with relatively rare terms. The same person is
        # picked twice, which does not matter for this example. The
        # reference population only holds three individuals, so the chance
        # of drawing two individuals whose HPO similarity is as rare as
        # this pair is 1 in 3, and the estimate should land near 0.33.
        (["person_03", "person_03"], 0.33, 0.04),

        # Two individuals who do not share terms, so two random probands
        # will share terms to at least the same extent effectively every
        # time. The largest P-value the test can currently give is
        # n/(n + 1), where n is the number of iterations. n + 1 is used
        # because, under the null hypothesis, the observed similarity
        # belongs in the simulated distribution.
        (["person_01", "person_03"], 0.999, 0.03),
    ]

    for probands, expected, tolerance in cases:
        estimate = test_similarity(self.hpo_graph, self.hpo_terms, probands,
            n_sims=1000, score_type="resnik")
        self.assertLess(abs(estimate - expected), tolerance)
def analyse_genes(hpo_graph, hpo_by_proband, probands_by_gene, output_path, iterations, score_type):
    """ tests genes to see if their probands share HPO terms more than by chance.

    Writes a tab-separated table with a header line, then one line per gene
    (in sorted order) giving the gene and its P-value. Genes with fewer than
    two probands, or for which test_similarity returns None, are left out.

    Args:
        hpo_graph: ICSimilarity object for the HPO term graph, with
            information on how many times each term has been used across all
            probands.
        hpo_by_proband: dictionary of HPO terms per proband
        probands_by_gene: dictionary of genes, to the probands who have
            variants in those genes.
        output_path: path to file to write the results to, or an already
            open writable object such as sys.stdout. A file opened from a
            path is closed before returning; a caller-supplied object is
            flushed but left open.
        iterations: number of iterations to run.
        score_type: passed unchanged to test_similarity as its final
            argument (presumably the name of the similarity score to use,
            e.g. "resnik" - confirm against test_similarity).
    """

    # Sometimes output_path is actually sys.stdout, other times it is a path.
    # open() raises TypeError when given a file-like object rather than a
    # path, which is how the two cases are told apart.
    try:
        output = open(output_path, "w")
        owns_output = True
    except TypeError:
        output = output_path
        owns_output = False

    try:
        output.write("hgnc\thpo_similarity_p_value\n")

        for gene in sorted(probands_by_gene):
            probands = probands_by_gene[gene]

            # similarity between probands needs at least two of them
            if len(probands) < 2:
                continue

            p_value = test_similarity(hpo_graph, hpo_by_proband, probands,
                iterations, score_type)

            if p_value is None:
                continue

            output.write("{0}\t{1}\n".format(gene, p_value))
    finally:
        # Only close what was opened here, and do so even if a gene fails
        # part way through. Closing a caller's stream (e.g. sys.stdout) would
        # break every later write to it, so borrowed streams are only flushed.
        if owns_output:
            output.close()
        else:
            output.flush()
def analyse_genes(hpo_graph, hpo_by_proband, probands_by_gene, output_path, iterations, score_type):
    """ tests genes to see if their probands share HPO terms more than by chance.

    Calls check_terms_in_graph on the graph and the per-proband terms before
    anything is opened or written. Then writes a tab-separated table with a
    header line, followed by one line per gene (in sorted order) giving the
    gene and its P-value. Genes with fewer than two probands, or for which
    test_similarity returns None, are left out.

    Args:
        hpo_graph: ICSimilarity object for the HPO term graph, with
            information on how many times each term has been used across all
            probands.
        hpo_by_proband: dictionary of HPO terms per proband
        probands_by_gene: dictionary of genes, to the probands who have
            variants in those genes.
        output_path: path to file to write the results to, or an already
            open writable object such as sys.stdout. A file opened from a
            path is closed before returning; a caller-supplied object is
            flushed but left open.
        iterations: number of iterations to run.
        score_type: passed unchanged to test_similarity as its final
            argument (presumably the name of the similarity score to use,
            e.g. "resnik" - confirm against test_similarity).
    """

    check_terms_in_graph(hpo_graph, hpo_by_proband)

    # Sometimes output_path is actually sys.stdout, other times it is a path.
    # open() raises TypeError when given a file-like object rather than a
    # path, which is how the two cases are told apart.
    try:
        output = open(output_path, "w")
        owns_output = True
    except TypeError:
        output = output_path
        owns_output = False

    try:
        output.write("hgnc\thpo_similarity_p_value\n")

        for gene in sorted(probands_by_gene):
            probands = probands_by_gene[gene]

            # similarity between probands needs at least two of them
            if len(probands) < 2:
                continue

            p_value = test_similarity(hpo_graph, hpo_by_proband, probands,
                iterations, score_type)

            if p_value is None:
                continue

            output.write("{0}\t{1}\n".format(gene, p_value))
    finally:
        # Only close what was opened here, and do so even if a gene fails
        # part way through. Closing a caller's stream (e.g. sys.stdout) would
        # break every later write to it, so borrowed streams are only flushed.
        if owns_output:
            output.close()
        else:
            output.flush()