def export(): """Create export data file.""" from biomappings.resources import load_mappings, load_predictions, load_false_mappings here = os.path.abspath(os.path.dirname(__file__)) path = os.path.join(here, os.pardir, os.pardir, 'docs', '_data', 'summary.yml') true_mappings = load_mappings() false_mappings = load_false_mappings() rv = { 'positive': _get_counter(true_mappings), 'negative': _get_counter(false_mappings), 'predictions': _get_counter(load_predictions()), 'contributors': _get_contributors(itt.chain(true_mappings, false_mappings)), } rv.update({ f'{k}_mapping_count': sum(e['count'] for e in rv[k]) for k in ('positive', 'negative', 'predictions') }) rv.update({ f'{k}_prefix_count': len(set(itt.chain.from_iterable((e['source'], e['target']) for e in rv[k]))) for k in ('positive', 'negative', 'predictions') }) with open(path, 'w') as file: yaml.safe_dump(rv, file, indent=2)
def export(): """Create export data file.""" from biomappings.resources import load_mappings, load_predictions, load_false_mappings from biomappings.utils import DATA path = os.path.join(DATA, "summary.yml") true_mappings = load_mappings() false_mappings = load_false_mappings() rv = { "positive": _get_counter(true_mappings), "negative": _get_counter(false_mappings), "predictions": _get_counter(load_predictions()), "contributors": _get_contributors(itt.chain(true_mappings, false_mappings)), } rv.update({ f"{k}_mapping_count": sum(e["count"] for e in rv[k]) for k in ("positive", "negative", "predictions") }) rv.update({ f"{k}_prefix_count": len( set( itt.chain.from_iterable( (e["source"], e["target"]) for e in rv[k]))) for k in ("positive", "negative", "predictions") }) with open(path, "w") as file: yaml.safe_dump(rv, file, indent=2)
def update_biomappings():
    """Update mappings from the Biomappings project."""
    # NOTE: the import locations for the INDRA helpers below are assumptions
    # based on how these functions are used elsewhere in INDRA
    from indra.databases import mesh_client
    from indra.databases.identifiers import get_ns_id_from_identifiers
    from indra.resources import get_resource_path, load_resource_json
    from indra.util import write_unicode_csv
    from biomappings.resources import load_mappings, load_predictions

    # We now construct a mapping dict of these mappings
    biomappings = defaultdict(list)
    curated = load_mappings()
    predictions = load_predictions()
    exclude_ns = {'kegg.pathway', 'depmap', 'ccle', 'reactome'}
    for mappings, mapping_type in ((curated, 'curated'),
                                   (predictions, 'predicted')):
        for mapping in mappings:
            # We skip anything that isn't an exact match
            if mapping['relation'] != 'skos:exactMatch':
                continue
            # Skip excluded namespaces that aren't relevant here
            if mapping['source prefix'] in exclude_ns or \
                    mapping['target prefix'] in exclude_ns:
                continue
            # We only accept curated mappings for NCIT
            if mapping_type == 'predicted' and \
                    (mapping['source prefix'] == 'ncit'
                     or mapping['target prefix'] == 'ncit'):
                continue
            source_ns, source_id = get_ns_id_from_identifiers(
                mapping['source prefix'], mapping['source identifier'])
            target_ns, target_id = get_ns_id_from_identifiers(
                mapping['target prefix'], mapping['target identifier'])
            # We only take real xrefs, not refs within a given ontology
            if source_ns == target_ns:
                continue
            biomappings[(source_ns, source_id, mapping['source name'])].append(
                (target_ns, target_id, mapping['target name']))
            biomappings[(target_ns, target_id, mapping['target name'])].append(
                (source_ns, source_id, mapping['source name']))

    def _filter_ncit(values):
        # If a term maps to both NCIT and other namespaces, drop the
        # NCIT mappings and keep the rest
        if len(values) > 1 and 'NCIT' in {v[0] for v in values}:
            return [v for v in values if v[0] != 'NCIT']
        else:
            return values

    mesh_mappings = {k: _filter_ncit(v) for k, v in biomappings.items()
                     if k[0] == 'MESH'}
    non_mesh_mappings = {k: [vv for vv in v if vv[0] != 'MESH']
                         for k, v in biomappings.items()
                         if k[0] != 'MESH'}
    rows = []
    for k, v in non_mesh_mappings.items():
        for vv in v:
            rows.append(list(k + vv))
    rows = sorted(rows, key=lambda x: x[1])
    write_unicode_csv(get_resource_path('biomappings.tsv'), rows,
                      delimiter='\t')

    # We next look at mappings to MeSH from EFO/HP/DOID.
    # Keys of biomappings are (ns, id, name) triples, so we index by
    # (ns, id) pairs to check whether an entry is already mapped.
    mapped_ns_ids = {(ns, ns_id) for ns, ns_id, _ in biomappings}
    for ns in ['efo', 'hp', 'doid']:
        for entry in load_resource_json('%s.json' % ns):
            db, db_id, name = ns.upper(), entry['id'], entry['name']
            if (db, db_id) in mapped_ns_ids:
                continue
            # We first need to decide if we prioritize another namespace
            xref_dict = {xr['namespace']: xr['id']
                         for xr in entry.get('xrefs', [])}
            if 'MESH' in xref_dict or 'MSH' in xref_dict:
                mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
                # Only descriptor-level MeSH terms (D-prefixed) are used
                if not mesh_id.startswith('D'):
                    continue
                mesh_name = mesh_client.get_mesh_name(mesh_id)
                if not mesh_name:
                    continue
                key = ('MESH', mesh_id, mesh_name)
                # BFO terms carry a BFO: prefix in their IDs, which we strip
                # and use as a separate namespace
                if db_id.startswith('BFO'):
                    db_to_use = 'BFO'
                    db_id_to_use = db_id[4:]
                else:
                    db_to_use = db
                    db_id_to_use = db_id
                if key not in mesh_mappings:
                    mesh_mappings[key] = [(db_to_use, db_id_to_use, name)]
                else:
                    mesh_mappings[key].append((db_to_use, db_id_to_use, name))
    rows = []
    for k, v in mesh_mappings.items():
        for vv in v:
            rows.append(list(k + vv))
    rows = sorted(rows, key=lambda x: (x[1], x[2], x[3]))
    write_unicode_csv(get_resource_path('mesh_mappings.tsv'), rows,
                      delimiter='\t')
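# The two TSV exports above write six-column rows: a (namespace, identifier,
# name) key triple followed by a mapped (namespace, identifier, name) triple.
# A hypothetical consumer illustrating that layout (the function name and
# path argument are ours, not part of INDRA):
def read_mappings_tsv(path):
    """Read a mappings TSV back into a (ns, id, name) -> [triples] dict."""
    import csv
    mappings = defaultdict(list)
    with open(path) as fh:
        for row in csv.reader(fh, delimiter='\t'):
            key = tuple(row[:3])      # (namespace, identifier, name)
            value = tuple(row[3:6])   # (namespace, identifier, name)
            mappings[key].append(value)
    return mappings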
def charts():
    """Make charts."""
    import matplotlib.pyplot as plt
    import seaborn as sns

    miriam_validator = MiriamValidator()
    true_mappings = load_mappings()
    true_graph = _graph_from_mappings(true_mappings, include=["skos:exactMatch"])
    component_node_sizes, component_edge_sizes, component_densities, component_number_prefixes = (
        [],
        [],
        [],
        [],
    )
    prefix_list = []
    components_with_duplicate_prefixes = []
    incomplete_components = []
    n_duplicates = []
    for component in tqdm(nx.connected_components(true_graph), desc="Iterating components"):
        component = true_graph.subgraph(component)
        node_size = component.number_of_nodes()
        edge_size = component.number_of_edges()
        nodes_data = {
            curie: {
                "link": miriam_validator.get_url(data["prefix"], data["identifier"]),
                **data,
            }
            for curie, data in sorted(component.nodes(data=True), key=itemgetter(0))
        }
        component_node_sizes.append(node_size)
        component_edge_sizes.append(edge_size)
        if node_size > 2:
            component_densities.append(nx.density(component))
        # A component on n nodes is incomplete if it has fewer than
        # n * (n - 1) / 2 edges; record its missing (complement) edges
        if node_size > 2 and edge_size < (node_size * (node_size - 1) / 2):
            incomplete_components_edges = []
            for u, v in sorted(nx.complement(component.copy()).edges()):
                if u > v:
                    u, v = v, u
                incomplete_components_edges.append({
                    "source": {"curie": u, **nodes_data[u]},
                    "target": {"curie": v, **nodes_data[v]},
                })
            incomplete_components_edges = sorted(
                incomplete_components_edges, key=lambda d: d["source"]["curie"]
            )
            incomplete_components.append({
                "nodes": nodes_data,
                "edges": incomplete_components_edges,
            })
        prefixes = [true_graph.nodes[node]["prefix"] for node in component]
        prefix_list.extend(prefixes)
        unique_prefixes = len(set(prefixes))
        component_number_prefixes.append(unique_prefixes)
        _n_duplicates = len(prefixes) - unique_prefixes
        n_duplicates.append(_n_duplicates)
        if _n_duplicates:
            components_with_duplicate_prefixes.append(nodes_data)

    with open(os.path.join(DATA, "incomplete_components.yml"), "w") as file:
        yaml.safe_dump(incomplete_components, file)
    with open(os.path.join(DATA, "components_with_duplicate_prefixes.yml"), "w") as file:
        yaml.safe_dump(components_with_duplicate_prefixes, file)

    fig, axes = plt.subplots(2, 3, figsize=(10.5, 6.5))
    _countplot_list(component_node_sizes, ax=axes[0][0])
    axes[0][0].set_yscale("log")
    axes[0][0].set_title("Size (Nodes)")
    _countplot_list(component_edge_sizes, ax=axes[0][1])
    axes[0][1].set_yscale("log")
    axes[0][1].set_title("Size (Edges)")
    axes[0][1].set_ylabel("")
    sns.kdeplot(component_densities, ax=axes[0][2])
    axes[0][2].set_xlim([0.0, 1.0])
    # axes[0][2].set_yscale('log')
    axes[0][2].set_title("Density ($|V| > 2$)")
    axes[0][2].set_ylabel("")
    _countplot_list(component_number_prefixes, ax=axes[1][0])
    axes[1][0].set_title("Number Prefixes")
    axes[1][0].set_yscale("log")
    # has duplicate prefix in component
    _countplot_list(n_duplicates, ax=axes[1][1])
    axes[1][1].set_yscale("log")
    axes[1][1].set_ylabel("")
    axes[1][1].set_title("Number Duplicate Prefixes")
    axes[1][2].axis("off")

    path = os.path.join(IMG, "components.png")
    print("saving to", path)
    plt.tight_layout()
    plt.savefig(path, dpi=300)
    plt.close(fig)

    fig, axes = plt.subplots(1, 2, figsize=(8, 3.5))
    sns.countplot(y=prefix_list, ax=axes[0],
                  order=[k for k, _ in Counter(prefix_list).most_common()])
    axes[0].set_xscale("log")
    axes[0].set_title("Prefix Frequency")
    relations = [m["relation"] for m in true_mappings]
    sns.countplot(y=relations, ax=axes[1],
                  order=[k for k, _ in Counter(relations).most_common()])
    axes[1].set_xscale("log")
    axes[1].set_title("Relation Frequency")
    path = os.path.join(IMG, "summary.png")
    print("saving to", path)
    plt.tight_layout()
    plt.savefig(path, dpi=300)
    plt.close(fig)
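# ``charts()`` relies on the module-private ``_countplot_list`` helper, which
# is not shown in this file. A minimal sketch of what it plausibly does,
# assuming it wraps ``seaborn.countplot`` over a list of integers with the
# categories ordered numerically (an assumption, not the packaged helper):
def _countplot_list(data, ax):
    """Draw a countplot of integer values in increasing order (sketch)."""
    import seaborn as sns

    sns.countplot(x=data, ax=ax, order=sorted(set(data)))
    ax.set_ylabel("Count")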
def get_true_graph(
    include: Optional[Sequence[str]] = None,
    exclude: Optional[Sequence[str]] = None,
) -> nx.Graph:
    """Get a graph of the true mappings."""
    return _graph_from_mappings(load_mappings(), include=include, exclude=exclude)
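# Example usage when run as a script: build the exact-match graph and report
# its size (a quick sketch, not part of the package API).
if __name__ == "__main__":
    _graph = get_true_graph(include=["skos:exactMatch"])
    print(
        _graph.number_of_nodes(),
        "nodes across",
        nx.number_connected_components(_graph),
        "components",
    )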