def nanopub_stats(ctx, input_fn):
    """Collect statistics on nanopub file

    input_fn can be json, jsonl or yaml and additionally gzipped

    Prints a JSON report with the number of nanopubs, the total number of
    assertions, the number of subject-only assertions (no relation), the
    number of assertions whose object starts with an opening parenthesis
    (nested) and a count of assertions per relation.
    """

    counts = {
        "nanopubs": 0,
        "assertions": {"total": 0, "subject_only": 0, "nested": 0, "relations": {}},
    }
    assertion_counts = counts["assertions"]
    relation_counts = {}

    # Raw string: "\s" and "\(" are invalid escapes in a normal string literal.
    # Compiled once because it is applied to every assertion.
    nested_object = re.compile(r"\s*\(")

    for nanopub_doc in bnf.read_nanopubs(input_fn):
        # Entries without a "nanopub" key are not counted at all
        if "nanopub" not in nanopub_doc:
            continue

        assertions = nanopub_doc["nanopub"]["assertions"]
        counts["nanopubs"] += 1
        assertion_counts["total"] += len(assertions)

        for assertion in assertions:
            # A missing relation key is treated like an explicit None
            # (subject-only) instead of raising KeyError.
            relation = assertion.get("relation")
            if relation is None:
                assertion_counts["subject_only"] += 1
                continue

            # "or ''" keeps re from failing on a missing/None object
            if nested_object.match(assertion.get("object") or ""):
                assertion_counts["nested"] += 1

            relation_counts[relation] = relation_counts.get(relation, 0) + 1

    # Order by relation name but KEEP the counts: sorted() on the dict itself
    # would return only the relation names and throw the tallies away.
    assertion_counts["relations"] = dict(sorted(relation_counts.items()))

    print("DumpVar:\n", json.dumps(counts, indent=4))
def reformat(ctx, input_fn, output_fn):
    """Reformat between JSON, YAML, JSONLines formats

    \b
    input_fn:
        If input fn has *.gz, will be read as a gzip file

    \b
    output_fn:
        If output fn has *.gz, will be written as a gzip file
        If output fn has *.jsonl*, will be written as a JSONLines file
        If output fn has *.json*, will be written as a JSON file
        If output fn has *.yaml* or *.yml*, will be written as a YAML file
    """

    # Acquire the output handle before entering the try block: if this call
    # fails there is nothing to close, and the original error propagates
    # instead of being masked by a NameError raised from the cleanup code.
    (
        out_fh,
        yaml_flag,
        jsonl_flag,
        json_flag,
    ) = bel.nanopub.files.create_nanopubs_fh(output_fn)

    try:
        # YAML and JSON are written as one document, so nanopubs have to be
        # collected first; JSONLines can be streamed one nanopub per line.
        collect_docs = yaml_flag or json_flag
        docs = []

        # bnf.read_nanopubs is handed the filename and does its own reading,
        # so no separate input file handle is opened here.
        for nanopub_doc in bnf.read_nanopubs(input_fn):
            if collect_docs:
                docs.append(nanopub_doc)
            elif jsonl_flag:
                out_fh.write("{}\n".format(json.dumps(nanopub_doc)))

        if yaml_flag:
            yaml.dump(docs, out_fh)
        elif json_flag:
            json.dump(docs, out_fh, indent=4)

    finally:
        out_fh.close()
def pipeline(
    ctx,
    input_fn,
    db_save,
    db_delete,
    output_fn,
    rules,
    species,
    namespace_targets,
    version,
    api,
    config_fn,
):
    """BEL Pipeline - BEL Nanopubs into BEL Edges

    This will process BEL Nanopubs into BEL Edges by validating, orthologizing (if requested),
    canonicalizing, and then computing the BEL Edges based on the given rule_set.

    \b
    input_fn:
        If input fn has *.gz, will be read as a gzip file
        If input fn has *.jsonl*, will be parsed as a JSONLines file
        If input fn has *.json*, will be parsed as a JSON file
        If input fn has *.yaml* or *.yml*, will be parsed as a YAML file

    \b
    output_fn:
        If output fn has *.gz, will be written as a gzip file
        If output fn has *.jsonl*, will be written as a JSONLines file
        If output fn has *.json*, will be written as a JSON file
        If output fn has *.yaml* or *.yml*, will be written as a YAML file
        If output fn has *.jgf, will be written as JSON Graph Formatted file
    """

    if config_fn:
        config = bel.db.Config.merge_config(ctx.config, override_config_fn=config_fn)
    else:
        config = ctx.config

    # Command line values arrive as strings: namespace_targets is JSON text,
    # rules is a comma separated list (spaces are ignored).
    if namespace_targets:
        namespace_targets = json.loads(namespace_targets)
    if rules:
        rules = rules.replace(" ", "").split(",")

    # Configuration - will return the first truthy result in list else the default option
    namespace_targets = utils.first_true(
        [namespace_targets, config["bel"]["lang"].get("canonical")], None
    )
    rules = utils.first_true(
        [rules, config["bel"]["nanopub"].get("pipeline_edge_rules", False)], False
    )
    # NOTE(review): api and version are resolved here but not used further in
    # this function - kept so missing config sections still surface as before.
    api = utils.first_true(
        [api, config["bel_api"]["servers"].get("api_url", None)], None
    )
    version = utils.first_true(
        [version, config["bel"]["lang"].get("default_bel_version", None)], None
    )

    n = bnn.Nanopub()

    json_flag, jsonl_flag, yaml_flag, jgf_flag = False, False, False, False
    all_bel_edges = []
    fout = None
    edgestore_handle = None

    try:
        if db_save or db_delete:
            arango_client = bel.db.arangodb.get_client()
            if db_delete:
                bel.db.arangodb.delete_database(arango_client, "edgestore")
            edgestore_handle = bel.db.arangodb.get_edgestore_handle(arango_client)

        if not db_save:
            # Edges go to a file. The format is detected whenever we are not
            # saving to the database - including the db_delete-only case, which
            # previously skipped detection and silently discarded every edge.
            if re.search("ya?ml", output_fn):
                yaml_flag = True
            elif "jsonl" in output_fn:
                jsonl_flag = True
            elif "json" in output_fn:
                json_flag = True
            elif "jgf" in output_fn:
                jgf_flag = True

            if jgf_flag:
                # bnf.edges_to_jgf is given the filename and writes the file
                # itself; do not hold a second open handle on the same path.
                pass
            elif "gz" in output_fn:
                fout = gzip.open(output_fn, "wt")
            else:
                fout = open(output_fn, "wt")

        nanopub_cnt = 0
        with timy.Timer() as timer:
            for nanopub_doc in bnf.read_nanopubs(input_fn):
                nanopub_cnt += 1
                if nanopub_cnt % 100 == 0:
                    timer.track(f"{nanopub_cnt} Nanopubs processed into Edges")

                bel_edges = n.bel_edges(
                    nanopub_doc,
                    namespace_targets=namespace_targets,
                    orthologize_target=species,
                    rules=rules,
                )

                if db_save:
                    bel.edge.edges.load_edges_into_db(edgestore_handle, edges=bel_edges)
                elif jsonl_flag:
                    # JSONLines is streamed: one line per nanopub's edges
                    fout.write("{}\n".format(json.dumps(bel_edges)))
                else:
                    # Every other format is written as one document at the end
                    all_bel_edges.extend(bel_edges)

        if db_save:
            pass
        elif yaml_flag:
            # yaml.dump (not the non-existent yaml.dumps) writes to the stream,
            # matching how reformat() produces YAML output.
            yaml.dump(all_bel_edges, fout)
        elif json_flag:
            fout.write("{}\n".format(json.dumps(all_bel_edges)))
        elif jgf_flag:
            bnf.edges_to_jgf(output_fn, all_bel_edges)

    finally:
        if fout:
            fout.close()