def main(argv):
    """Command-line entry point: diff two go-stats JSON files and write reports.

    Options:
      -c/--current   URL of the current go-stats JSON
      -p/--previous  URL of the previous go-stats JSON
      -o/--orep      output directory for the JSON/TSV change reports
    """
    current_stats_url = ''
    previous_stats_url = ''
    output_rep = ''

    if len(argv) < 3:
        print_help()
        sys.exit(2)

    try:
        opts, argv = getopt.getopt(argv, "c:p:o:",
                                   ["current=", "previous=", "orep="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print_help()
            sys.exit()
        elif opt in ("-c", "--current"):
            current_stats_url = arg
        elif opt in ("-p", "--previous"):
            previous_stats_url = arg
        # BUG FIX: was ("-o", "-orep"); getopt reports the registered long
        # option as "--orep", so the long form could never match and the
        # output directory was silently left empty.
        elif opt in ("-o", "--orep"):
            output_rep = arg

    # Normalize the output directory path and make sure it exists.
    if not output_rep.endswith("/"):
        output_rep += "/"
    if not os.path.exists(output_rep):
        os.mkdir(output_rep)

    output_json = output_rep + "go-stats-changes.json"
    output_tsv = output_rep + "go-stats-changes.tsv"
    print("Will write stats changes to " + output_json + " and " + output_tsv)

    current_stats = utils.fetch(current_stats_url).json()
    previous_stats = utils.fetch(previous_stats_url).json()

    json_changes = compute_changes(current_stats, previous_stats)
    # NOTE(review): the two reference-id lists are passed as None here —
    # presumably alter_annotation_changes tolerates that; confirm against
    # its signature.
    json_changes = alter_annotation_changes(current_stats, previous_stats,
                                            None, None, json_changes)

    print("Saving Stats to <" + output_json + "> ...")
    utils.write_json(output_json, json_changes)
    print("Done.")

    print("Saving Stats to <" + output_tsv + "> ...")
    tsv_changes = create_text_report(json_changes)
    utils.write_text(output_tsv, tsv_changes)
    print("Done.")
def load_taxon_map():
    """Populate the global ``taxon_map`` ({taxon id: label}) from the fallback URL.

    Returns True when the map was fetched (HTTP 200) and passes a sanity
    check (human taxon '9606' maps to 'Homo sapiens'); False otherwise.
    """
    global taxon_map
    print(
        "Using ", taxon_map_fallback_url,
        " (created from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip) as a fallback to get taxon { id, label }"
    )
    data = utils.fetch(taxon_map_fallback_url)
    if data is None or data.status_code != 200:
        return False
    taxon_map = json.loads(data.content)
    # BUG FIX: the expected label was garbled ('H**o sapiens'); NCBI taxon
    # 9606 is "Homo sapiens", so the sanity check could never succeed.
    check = '9606' in taxon_map and taxon_map['9606'] == 'Homo sapiens'
    return check
def main(argv):
    """Release-pipeline orchestrator: computes GO stats (with and without
    protein binding), reference/PMID lists, ontology changes and annotation
    changes, then writes every report file into the output directory.

    Options (all required — see the ``len(argv) < 16`` guard: 8 options,
    each with a value):
      -g/--golrurl  GOLr base URL
      -s/--pstats   URL of the previous go-stats JSON
      -n/--pnstats  URL of the previous go-stats (no protein binding) JSON
      -c/--cobo     URL of the current ontology OBO
      -p/--pobo     URL of the previous ontology OBO
      -r/--ref      URL of the previous references list (TSV)
      -o/--orep     output directory
      -d/--date     release date
    """
    golr_url = ''
    previous_stats_url = ''
    previous_stats_no_pb_url = ''
    current_obo_url = ''
    previous_obo_url = ''
    previous_references_url = ''
    output_rep = ''
    release_date = ''

    # NOTE(review): leftover debug print of the argument count.
    print(len(argv))
    if len(argv) < 16:
        print_help()
        sys.exit(2)

    try:
        opts, argv = getopt.getopt(argv, "g:s:n:c:p:o:d:r:", [
            "golrurl=", "pstats=", "pnstats=", "cobo=", "pobo=", "orep=",
            "date=", "ref="
        ])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print_help()
            sys.exit()
        elif opt in ("-g", "--golrurl"):
            golr_url = arg
            # GOLr base URL must end with a slash for later concatenations.
            if not golr_url.endswith("/"):
                golr_url = golr_url + "/"
        elif opt in ("-s", "--pstats"):
            previous_stats_url = arg
        elif opt in ("-n", "--pnstats"):
            previous_stats_no_pb_url = arg
        elif opt in ("-c", "--cobo"):
            current_obo_url = arg
        elif opt in ("-p", "--pobo"):
            previous_obo_url = arg
        elif opt in ("-r", "--ref"):
            previous_references_url = arg
        elif opt in ("-o", "--orep"):
            output_rep = arg
        elif opt in ("-d", "--date"):
            release_date = arg

    # Normalize the output directory path and make sure it exists.
    if not output_rep.endswith("/"):
        output_rep += "/"
    if not os.path.exists(output_rep):
        os.mkdir(output_rep)

    # actual names of the files to be generated - can change here if needed
    output_stats = output_rep + "go-stats.json"
    output_stats_no_pb = output_rep + "go-stats-no-pb.json"
    output_references = output_rep + "go-references.tsv"
    output_pmids = output_rep + "go-pmids.tsv"
    output_pubmed_pmids = output_rep + "GO.uid"
    output_ontology_changes = output_rep + "go-ontology-changes.json"
    output_ontology_changes_tsv = output_rep + "go-ontology-changes.tsv"
    output_stats_summary = output_rep + "go-stats-summary.json"
    output_annotation_changes = output_rep + "go-annotation-changes.json"
    output_annotation_changes_tsv = output_rep + "go-annotation-changes.tsv"
    output_annotation_changes_no_pb = output_rep + "go-annotation-changes_no_pb.json"
    output_annotation_changes_no_pb_tsv = output_rep + "go-annotation-changes_no_pb.tsv"

    # 1 - Executing go_stats script
    print("\n\n1a - EXECUTING GO_STATS SCRIPT (INCLUDING PROTEIN BINDING)...\n")
    json_stats = go_stats.compute_stats(golr_url, release_date)
    print("DONE.")

    print("\n\n1b - EXECUTING GO_STATS SCRIPT (EXCLUDING PROTEIN BINDING)...\n")
    # Third argument presumably toggles "exclude protein binding" —
    # TODO confirm against go_stats.compute_stats.
    json_stats_no_pb = go_stats.compute_stats(golr_url, release_date, True)
    print("DONE.")

    print(
        "\n\n1c - EXECUTING GO_STATS SCRIPT (RETRIEVING PREVIOUS REFERENCES LIST)...\n"
    )
    # Previous references file is a TSV; keep only the first column (the ids).
    previous_references_ids = utils.fetch(previous_references_url).text
    previous_references_ids = previous_references_ids.split("\n")
    previous_references_ids = list(
        map(lambda x: x.split("\t")[0], previous_references_ids))
    print("DONE.")

    print(
        "\n\n1d - EXECUTING GO_STATS SCRIPT (CREATING CURRENT REFERENCES LIST)...\n"
    )
    references = go_stats.get_references()
    references_lines = []
    for k, v in references.items():
        references_lines.append(k + "\t" + str(v))
    current_references_ids = list(
        map(lambda x: x.split("\t")[0], references_lines))
    # PMID subset: keep "PMID:..." lines; pmids_ids strips the "PMID:" prefix.
    pmids_lines = list(filter(lambda x: "PMID:" in x, references_lines))
    pmids_ids = list(map(lambda x: x.split("\t")[0].split(":")[1], pmids_lines))
    utils.write_text(output_references, "\n".join(references_lines))
    utils.write_text(output_pmids, "\n".join(pmids_lines))
    utils.write_text(output_pubmed_pmids, "\n".join(pmids_ids))
    print("DONE.")

    # 2 - Executing go_ontology_changes script
    print("\n\n2 - EXECUTING GO_ONTOLOGY_CHANGES SCRIPT...\n")
    json_onto_changes = go_ontology_changes.compute_changes(
        current_obo_url, previous_obo_url)
    utils.write_json(output_ontology_changes, json_onto_changes)
    tsv_onto_changes = go_ontology_changes.create_text_report(json_onto_changes)
    utils.write_text(output_ontology_changes_tsv, tsv_onto_changes)
    print("DONE.")

    # 3 - Executing go_annotation_changes script
    print(
        "\n\n3a - EXECUTING GO_ANNOTATION_CHANGES SCRIPT (INCLUDING PROTEIN BINDING)...\n"
    )
    previous_stats = utils.fetch(previous_stats_url).json()
    json_annot_changes = go_annotation_changes.compute_changes(
        json_stats, previous_stats)
    print("DONE.")

    print(
        "\n\n3b - EXECUTING GO_ANNOTATION_CHANGES SCRIPT (EXCLUDING PROTEIN BINDING)...\n"
    )
    previous_stats_no_pb = utils.fetch(previous_stats_no_pb_url).json(
    )  # WE STILL NEED TO CORRECT THAT: 1 FILE OR SEVERAL FILE ? IF SEVERAL, ONE MORE PARAMETER
    json_annot_no_pb_changes = go_annotation_changes.compute_changes(
        json_stats_no_pb, previous_stats_no_pb)
    print("DONE.")

    # 4 - Refining go-stats with ontology stats
    print("\n\n4 - EXECUTING GO_REFINE_STATS SCRIPT...\n")
    merged_annotations_diff = utils.merge_dict(json_stats, json_annot_changes)
    json_annot_changes = merged_annotations_diff

    # Build an "ontology" section from the current ontology summary plus the
    # per-category change counts computed in step 2.
    ontology = json_onto_changes["summary"]["current"].copy()
    del ontology["release_date"]
    ontology["changes_created_terms"] = json_onto_changes["summary"][
        "changes"]["created_terms"]
    ontology["changes_valid_terms"] = json_onto_changes["summary"]["changes"][
        "valid_terms"]
    ontology["changes_obsolete_terms"] = json_onto_changes["summary"][
        "changes"]["obsolete_terms"]
    ontology["changes_merged_terms"] = json_onto_changes["summary"]["changes"][
        "merged_terms"]
    ontology["changes_biological_process_terms"] = json_onto_changes[
        "summary"]["changes"]["biological_process_terms"]
    ontology["changes_molecular_function_terms"] = json_onto_changes[
        "summary"]["changes"]["molecular_function_terms"]
    ontology["changes_cellular_component_terms"] = json_onto_changes[
        "summary"]["changes"]["cellular_component_terms"]

    # Re-shape json_stats so the refined ontology section replaces the raw one.
    json_stats = {
        "release_date": json_stats["release_date"],
        "ontology": ontology,
        "annotations": json_stats["annotations"],
        "taxa": json_stats["taxa"],
        "bioentities": json_stats["bioentities"],
        "references": json_stats["references"]
    }

    print("\n4a - SAVING GO-STATS...\n")
    utils.write_json(output_stats, json_stats)
    print("DONE.")

    json_stats_no_pb = {
        "release_date": json_stats_no_pb["release_date"],
        "ontology": ontology,
        "annotations": json_stats_no_pb["annotations"],
        "taxa": json_stats_no_pb["taxa"],
        "bioentities": json_stats_no_pb["bioentities"],
        "references": json_stats_no_pb["references"]
    }

    print("\n4b - SAVING GO-STATS-NO-PB...\n")
    utils.write_json(output_stats_no_pb, json_stats_no_pb)
    print("DONE.")

    # Derive a "B" (protein binding) evidence count per model organism as the
    # difference between the with-PB and no-PB "F" (molecular function) counts.
    annotations_by_reference_genome = json_stats["annotations"][
        "by_model_organism"]
    for taxon in annotations_by_reference_genome:
        for ecode in annotations_by_reference_genome[taxon]["by_evidence"]:
            annotations_by_reference_genome[taxon]["by_evidence"][ecode][
                "B"] = json_stats["annotations"]["by_model_organism"][taxon][
                    "by_evidence"][ecode]["F"] - json_stats_no_pb[
                        "annotations"]["by_model_organism"][taxon][
                            "by_evidence"][ecode]["F"]
        for ecode in annotations_by_reference_genome[taxon][
                "by_evidence_cluster"]:
            annotations_by_reference_genome[taxon]["by_evidence_cluster"][
                ecode]["B"] = json_stats["annotations"]["by_model_organism"][
                    taxon]["by_evidence_cluster"][ecode][
                        "F"] - json_stats_no_pb["annotations"][
                            "by_model_organism"][taxon]["by_evidence_cluster"][
                                ecode]["F"]

    # Per-reference-genome bioentity stats (empty dict when the taxon is
    # absent from the filtered-taxon cluster).
    bioentities_by_reference_genome = {}
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        bioentities_by_reference_genome[key] = json_stats["bioentities"][
            "by_filtered_taxon"]["cluster"][key] if key in json_stats[
                "bioentities"]["by_filtered_taxon"]["cluster"] else {}

    # TODO: we don't have a way to filter on bioentity documents without direct annotations to PB ?
    # for btype in bioentities_by_reference_genome[key]:
    #     val = json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"][key]["F"] if (key in json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"] and "F" in json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"][key]) else 0
    #     bioentities_by_reference_genome[key][btype]["B"] = bioentities_by_reference_genome[key][btype]["F"] - val

    references_by_reference_genome = {}
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        references_by_reference_genome[key] = json_stats["references"]["all"][
            "by_filtered_taxon"][key] if key in json_stats["references"][
                "all"]["by_filtered_taxon"] else {}

    pmids_by_reference_genome = {}
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        pmids_by_reference_genome[key] = json_stats["references"]["pmids"][
            "by_filtered_taxon"][key] if key in json_stats["references"][
                "pmids"]["by_filtered_taxon"] else {}

    # This is to modify the structure of the annotation changes based on recent requests
    print("\n4c - SAVING GO-ANNOTATION-CHANGES...\n")
    json_annot_changes = go_annotation_changes.alter_annotation_changes(
        json_stats, previous_stats, current_references_ids,
        previous_references_ids, json_annot_changes)
    utils.write_json(output_annotation_changes, json_annot_changes)
    tsv_annot_changes = go_annotation_changes.create_text_report(
        json_annot_changes)
    utils.write_text(output_annotation_changes_tsv, tsv_annot_changes)
    print("DONE.")

    print("\n4d - SAVING GO-ANNOTATION-NO-PB-CHANGES...\n")
    json_annot_no_pb_changes = go_annotation_changes.alter_annotation_changes(
        json_stats_no_pb, previous_stats_no_pb, current_references_ids,
        previous_references_ids, json_annot_no_pb_changes)
    utils.write_json(output_annotation_changes_no_pb, json_annot_no_pb_changes)
    tsv_annot_changes_no_pb = go_annotation_changes.create_text_report(
        json_annot_no_pb_changes)
    utils.write_text(output_annotation_changes_no_pb_tsv,
                     tsv_annot_changes_no_pb)
    print("DONE.")

    # Condensed summary combining with-PB, no-PB and change data.
    json_stats_summary = {
        "release_date": json_stats["release_date"],
        "ontology": ontology,
        "annotations": {
            "total": json_stats["annotations"]["total"],
            "total_no_pb": json_stats_no_pb["annotations"]["total"],
            "total_pb":
            json_stats["annotations"]["total"] -
            json_stats_no_pb["annotations"]["total"],
            "by_aspect": {
                "P": json_stats["annotations"]["by_aspect"]["P"],
                "F": json_stats["annotations"]["by_aspect"]["F"],
                "C": json_stats["annotations"]["by_aspect"]["C"],
                # "B" = protein-binding share of the F aspect.
                "B":
                json_stats["annotations"]["by_aspect"]["F"] -
                json_stats_no_pb["annotations"]["by_aspect"]["F"]
            },
            "by_bioentity_type_cluster":
            json_stats["annotations"]["by_bioentity_type"]["cluster"],
            "by_bioentity_type_cluster_no_pb":
            json_stats_no_pb["annotations"]["by_bioentity_type"]["cluster"],
            "by_qualifier": json_stats["annotations"]["by_qualifier"],
            "by_evidence_cluster":
            json_stats["annotations"]["by_evidence"]["cluster"],
            "by_evidence_cluster_no_pb":
            json_stats_no_pb["annotations"]["by_evidence"]["cluster"],
            "by_model_organism": annotations_by_reference_genome
        },
        "taxa": {
            "total": json_stats["taxa"]["total"],
            "filtered": json_stats["taxa"]["filtered"],
        },
        "bioentities": {
            "total": json_stats["bioentities"]["total"],
            "total_no_pb": json_stats_no_pb["bioentities"]["total"],
            "by_type_cluster": json_stats["bioentities"]["by_type"]["cluster"],
            "by_type_cluster_no_pb":
            json_stats_no_pb["bioentities"]["by_type"]["cluster"],
            "by_model_organism": bioentities_by_reference_genome
        },
        "references": {
            "all": {
                "total": json_stats["references"]["all"]["total"],
                "total_no_pb": json_stats_no_pb["references"]["all"]["total"],
                "added":
                json_annot_changes["summary"]["changes"]["references"]
                ["added"],
                "removed":
                json_annot_changes["summary"]["changes"]["references"]
                ["removed"],
                "by_model_organism": references_by_reference_genome
            },
            "pmids": {
                "total": json_stats["references"]["pmids"]["total"],
                "total_no_pb":
                json_stats_no_pb["references"]["pmids"]["total"],
                "added":
                json_annot_changes["summary"]["changes"]["pmids"]["added"],
                "removed":
                json_annot_changes["summary"]["changes"]["pmids"]["removed"],
                "by_model_organism": pmids_by_reference_genome
            }
        },
    }

    # removing by_reference_genome.by_evidence
    for gen in json_stats_summary["annotations"]["by_model_organism"]:
        del json_stats_summary["annotations"]["by_model_organism"][gen][
            "by_evidence"]

    print("\n4e - SAVING GO-STATS-SUMMARY...\n")
    utils.write_json(output_stats_summary, json_stats_summary)
    print("DONE.")

    # Indicate all processes finished
    print("SUCCESS.")
def compute_changes(current_obo_url, previous_obo_url):
    """Diff two published GO OBO archives and return a JSON-style report.

    The report has a "summary" (current/previous/changes counts) and
    "detailed_changes" (per-namespace lists of created, obsoleted, merged
    terms plus meta/xref/relation changes).
    """
    # The new published OBO archive
    print("Loading current GO ontology (" + current_obo_url + ")...")
    currentgo = OBO_Parser(utils.fetch(current_obo_url).text)

    # A previously published OBO archive
    print("Loading previous GO ontology (" + previous_obo_url + ")...")
    oldgo = OBO_Parser(utils.fetch(previous_obo_url).text)

    # New GO Terms: present now, absent before; grouped by namespace.
    created = {}
    created_count = 0
    for id, newterm in currentgo.get_terms().items():
        if not oldgo.has_term(id):
            if newterm.namespace not in created:
                created[newterm.namespace] = []
            created[newterm.namespace].append({"id": id, "name": newterm.name})
            created_count += 1
    print(str(created_count) + " terms created since last revision")

    # Merged GO Terms: gone from the current ontology but reachable as an
    # alternate id of a surviving term.
    merged = {}
    merged_list = []
    merged_count = 0
    for id, oldterm in oldgo.get_terms().items():
        if not currentgo.has_term(id):
            if oldterm.namespace not in merged:
                merged[oldterm.namespace] = []
            alts = currentgo.get_alternate_terms(id)
            if len(alts) > 0:
                merged[oldterm.namespace].append({
                    "current": alts[0],
                    "previous": {
                        "id": id,
                        "name": oldterm.name
                    }
                })
                merged_count += 1
                # merged_list is consulted below to exclude merged terms from
                # the obsoleted list (indentation reconstructed — presumed
                # inside the alternates check; verify against history).
                merged_list.append(oldterm.id)
    print(str(merged_count) + " terms merged since last revision")

    # Obsoleted GO Terms: gone from the current ontology and NOT merged.
    obsoleted = {}
    obsoleted_count = 0
    new_terms = currentgo.get_terms()
    for id, oldterm in oldgo.get_terms().items():
        if id not in new_terms:
            if oldterm.namespace not in obsoleted:
                obsoleted[oldterm.namespace] = []
            if oldterm.id not in merged_list:
                obsoleted[oldterm.namespace].append({
                    "id": id,
                    "name": oldterm.name
                })
                obsoleted_count += 1
    print(str(obsoleted_count) + " terms obsoleted since last revision")

    # Existing GO Terms with structural changes (is_a, part_of, has_part etc)
    relations_changes = {}
    structural_count = 0
    structural_total_count = 0
    for id, newterm in currentgo.get_terms().items():
        if oldgo.has_term(id):
            oldterm = oldgo.get_term(id)
            if not newterm.structural_equals(oldterm):
                if newterm.namespace not in relations_changes:
                    relations_changes[newterm.namespace] = []
                reasons = {}
                for key, reason in newterm.explain_structural_differences(
                        oldterm).items():
                    reasons[key] = {
                        "current": reason['current'],
                        "previous": reason['previous']
                    }
                relations_changes[newterm.namespace].append({
                    "id": id,
                    "name": newterm.name,
                    "changes": reasons
                })
                structural_count += 1
                structural_total_count += len(reasons)
    print(
        str(structural_count) + " terms relation changes since last revision")

    # Existing GO Terms with cross reference changes
    xrefs_changes = {}
    xrefs_count = 0
    xrefs_total_count = 0
    for id, newterm in currentgo.get_terms().items():
        if oldgo.has_term(id):
            oldterm = oldgo.get_term(id)
            if not newterm.xrefs_equals(oldterm):
                if newterm.namespace not in xrefs_changes:
                    xrefs_changes[newterm.namespace] = []
                reasons = {}
                for key, reason in newterm.explain_xrefs_differences(
                        oldterm).items():
                    reasons[key] = {
                        "current": reason['current'],
                        "previous": reason['previous']
                    }
                xrefs_changes[newterm.namespace].append({
                    "id": id,
                    "name": newterm.name,
                    "changes": reasons
                })
                xrefs_count += 1
                # Unlike the other passes, counts individual xref diffs.
                xrefs_total_count += newterm.count_xrefs_differences(oldterm)
    print(str(xrefs_count) + " terms xrefs changes since last revision")

    # Existing GO Terms with meta changes (synonyms, NO XREFS, definition, etc)
    meta_noxrefs_changes = {}
    meta_noxrefs_count = 0
    meta_noxrefs_total_count = 0
    for id, newterm in currentgo.get_terms().items():
        if oldgo.has_term(id):
            oldterm = oldgo.get_term(id)
            # Second argument False presumably means "exclude xrefs" —
            # TODO confirm against OBO_Parser term API.
            if not newterm.meta_equals(oldterm, False):
                if newterm.namespace not in meta_noxrefs_changes:
                    meta_noxrefs_changes[newterm.namespace] = []
                reasons = {}
                for key, reason in newterm.explain_meta_differences(
                        oldterm, False).items():
                    reasons[key] = {
                        "current": reason['current'],
                        "previous": reason['previous']
                    }
                meta_noxrefs_changes[newterm.namespace].append({
                    "id": id,
                    "name": newterm.name,
                    "changes": reasons
                })
                meta_noxrefs_count += 1
                meta_noxrefs_total_count += len(reasons)
    print(
        str(meta_noxrefs_count) +
        " terms meta (NO XREFS) changes since last revision")

    # Release dates come from the OBO header "data-version" field, which may
    # look like "releases/YYYY-MM-DD" — keep only the part after the slash.
    release_date = currentgo.header['data-version']
    release_date = release_date[release_date.index("/") +
                                1:] if "/" in release_date else release_date
    last_date = oldgo.header['data-version']
    last_date = last_date[last_date.index("/") +
                          1:] if "/" in last_date else last_date

    print("Creating JSON report...")
    report = {}
    report["summary"] = {
        "current": {
            "release_date": release_date,
            "valid_terms": len(currentgo.get_terms(TermState.VALID)),
            "obsolete_terms": len(currentgo.get_terms(TermState.OBSOLETED)),
            "merged_terms": len(currentgo.get_merged_terms(TermState.ANY)),
            "biological_process_terms":
            len(currentgo.get_terms_in("biological_process")),
            "molecular_function_terms":
            len(currentgo.get_terms_in("molecular_function")),
            "cellular_component_terms":
            len(currentgo.get_terms_in("cellular_component")),
            "meta_statements":
            currentgo.count_all_metas(TermState.VALID, False),
            "cross_references": currentgo.count_all_xrefs(TermState.VALID),
            "terms_relations": currentgo.count_all_structurals()
        },
        "previous": {
            "release_date": last_date,
            "valid_terms": len(oldgo.get_terms(TermState.VALID)),
            "obsolete_terms": len(oldgo.get_terms(TermState.OBSOLETED)),
            "merged_terms": len(oldgo.get_merged_terms(TermState.ANY)),
            "biological_process_terms":
            len(oldgo.get_terms_in("biological_process")),
            "molecular_function_terms":
            len(oldgo.get_terms_in("molecular_function")),
            "cellular_component_terms":
            len(oldgo.get_terms_in("cellular_component")),
            "meta_statements": oldgo.count_all_metas(TermState.VALID, False),
            "cross_references": oldgo.count_all_xrefs(TermState.VALID),
            "terms_relations": oldgo.count_all_structurals()
        },
        "changes": {
            "created_terms": created_count,
            "valid_terms":
            len(currentgo.get_terms(TermState.VALID)) -
            len(oldgo.get_terms(TermState.VALID)),
            "obsolete_terms": obsoleted_count,
            "merged_terms": merged_count,
            "biological_process_terms":
            len(currentgo.get_terms_in("biological_process")) -
            len(oldgo.get_terms_in("biological_process")),
            "molecular_function_terms":
            len(currentgo.get_terms_in("molecular_function")) -
            len(oldgo.get_terms_in("molecular_function")),
            "cellular_component_terms":
            len(currentgo.get_terms_in("cellular_component")) -
            len(oldgo.get_terms_in("cellular_component")),
            "meta_statements": meta_noxrefs_total_count,
            "meta_statements_by_term": meta_noxrefs_count,
            "cross_references": xrefs_total_count,
            "cross_references_by_term": xrefs_count,
            "relations": structural_total_count,
            "relations_by_term": structural_count
        }
    }
    report["detailed_changes"] = {
        "created_terms": created,
        "obsolete_terms": obsoleted,
        "merged_terms": merged,
        "meta_statements": meta_noxrefs_changes,
        "cross_references": xrefs_changes,
        "relations": relations_changes
    }
    print("JSON report created.")
    return report
def main(argv):
    """GMT-generation entry point: builds per-taxon GMT files (by aspect and
    evidence group), plus slim-filtered variants for a few GO slims.

    Options:
      -g/--golrurl  GOLr base URL
      -o/--orep     output directory
      -s/--slim     base URL where the slim OBO files live
    """
    golr_base_url = ''
    output_rep = ''
    slim_base_url = ''

    if len(argv) < 6:
        print_help()
        sys.exit(2)
    try:
        opts, argv = getopt.getopt(argv, "g:o:s:",
                                   ["golrurl=", "orep=", "slim="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print_help()
            sys.exit()
        elif opt in ("-g", "--golrurl"):
            golr_base_url = arg
            # Base URLs must end with a slash for later concatenations.
            if not golr_base_url.endswith("/"):
                golr_base_url = golr_base_url + "/"
        elif opt in ("-o", "--orep"):
            output_rep = arg
        elif opt in ("-s", "--slim"):
            slim_base_url = arg
            if not slim_base_url.endswith("/"):
                slim_base_url = slim_base_url + "/"

    # Normalize the output directory path and make sure it exists.
    if not output_rep.endswith("/"):
        output_rep += "/"
    if not os.path.exists(output_rep):
        os.mkdir(output_rep)

    print("\n1 - Creating ontology map...")
    ontology_map = create_ontology_map(golr_base_url)
    print("Ontology map created with ", len(ontology_map), " terms")

    # Slims fetched from slim_base_url; hard-coded list for now.
    slims = ["goslim_agr.obo", "goslim_generic.obo", "goslim_chembl.obo"]
    print("\n2 - Loading ", len(slims),
          " slims to create the slim-specific GMTs...")
    slim_obos = {}
    for slim in slims:
        response = utils.fetch(slim_base_url + slim)
        obo = OBO_Parser(response.text)
        slim_obos[slim] = obo
    print("Slims loaded: ", len(slim_obos))

    # taxa = utils.REFERENCE_GENOME_IDS
    # Hard-coded to human and mouse for now (see commented line above).
    taxa = ["NCBITaxon:9606", "NCBITaxon:10090"]
    print("\n3 - Creating the GMTs for ", len(taxa), " taxa")
    for taxon in taxa:
        taxon_id = taxon.split(":")[1]
        gmt_taxon = gmt(ontology_map, golr_base_url, taxon)
        output = output_rep + taxon_id
        # Full (un-slimmed) GMT files: one per aspect/evidence-group with data.
        for aspect in gmt_taxon:
            for evgroup in gmt_taxon[aspect]:
                if len(gmt_taxon[aspect][evgroup]) > 0:
                    utils.write_text(
                        output + "-" + aspect.lower() + "-" +
                        evgroup.lower() + ".gmt", gmt_taxon[aspect][evgroup])
        # Slim-filtered GMT files: restrict to each slim's valid terms.
        for slim_obo in slim_obos:
            oterms = slim_obos[slim_obo].get_terms(TermState.VALID)
            terms = oterms.keys()
            gmt_taxon_slim = filter_slim(gmt_taxon, terms)
            slim_key = slim_obo.replace(".obo", "")
            for aspect in gmt_taxon_slim:
                for evgroup in gmt_taxon_slim[aspect]:
                    if len(gmt_taxon_slim[aspect][evgroup]) > 0:
                        utils.write_text(
                            output + "-" + slim_key + "-" + aspect.lower() +
                            "-" + evgroup.lower() + ".gmt",
                            gmt_taxon_slim[aspect][evgroup])