Example 1
def export():
    """Create export data file."""
    from biomappings.resources import load_mappings, load_predictions, load_false_mappings

    here = os.path.abspath(os.path.dirname(__file__))
    path = os.path.join(here, os.pardir, os.pardir, 'docs', '_data', 'summary.yml')

    true_mappings = load_mappings()
    false_mappings = load_false_mappings()
    rv = {
        'positive': _get_counter(true_mappings),
        'negative': _get_counter(false_mappings),
        'predictions': _get_counter(load_predictions()),
        'contributors': _get_contributors(itt.chain(true_mappings, false_mappings)),
    }
    rv.update({
        f'{k}_mapping_count': sum(e['count'] for e in rv[k])
        for k in ('positive', 'negative', 'predictions')
    })
    rv.update({
        f'{k}_prefix_count': len(set(itt.chain.from_iterable((e['source'], e['target']) for e in rv[k])))
        for k in ('positive', 'negative', 'predictions')
    })
    with open(path, 'w') as file:
        yaml.safe_dump(rv, file, indent=2)
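Note: the private helper _get_counter is not shown in this listing. A minimal sketch, assuming it only aggregates mappings by their (source prefix, target prefix) pair (the code above relies on each entry exposing "source", "target", and "count" keys), might look like this; it is an illustration, not the actual implementation:

from collections import Counter


def _get_counter(mappings):
    """Count mappings by (source prefix, target prefix) pair (hypothetical sketch)."""
    counter = Counter(
        (mapping["source prefix"], mapping["target prefix"])
        for mapping in mappings
    )
    return [
        {"source": source, "target": target, "count": count}
        for (source, target), count in counter.most_common()
    ]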
Example 2
def export():
    """Create export data file."""
    from biomappings.resources import load_mappings, load_predictions, load_false_mappings
    from biomappings.utils import DATA

    path = os.path.join(DATA, "summary.yml")

    true_mappings = load_mappings()
    false_mappings = load_false_mappings()
    rv = {
        "positive": _get_counter(true_mappings),
        "negative": _get_counter(false_mappings),
        "predictions": _get_counter(load_predictions()),
        "contributors": _get_contributors(itt.chain(true_mappings, false_mappings)),
    }
    rv.update({
        f"{k}_mapping_count": sum(e["count"] for e in rv[k])
        for k in ("positive", "negative", "predictions")
    })
    rv.update({
        f"{k}_prefix_count": len(set(
            itt.chain.from_iterable((e["source"], e["target"]) for e in rv[k])
        ))
        for k in ("positive", "negative", "predictions")
    })
    with open(path, "w") as file:
        yaml.safe_dump(rv, file, indent=2)
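For context, a hypothetical consumer of the generated summary.yml could read the aggregate counts back as follows; the key names come directly from the dictionary built above, and the file location reuses the DATA constant imported in this example:

import os

import yaml

from biomappings.utils import DATA

with open(os.path.join(DATA, "summary.yml")) as file:
    summary = yaml.safe_load(file)

# Aggregate counts written by export()
print(summary["positive_mapping_count"])
print(summary["predictions_prefix_count"])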
Example 3
def update_biomappings():
    """Update mappings from the BioMappings project."""
    from indra.databases import mesh_client
    from indra.databases.identifiers import get_ns_id_from_identifiers
    from biomappings.resources import load_mappings, load_predictions

    # We now construct a mapping dict of these mappings
    biomappings = defaultdict(list)
    mappings = load_mappings()
    predictions = load_predictions()
    exclude_ns = {'kegg.pathway', 'depmap', 'ccle', 'reactome'}
    for mapping_list, mapping_type in ((mappings, 'curated'),
                                       (predictions, 'predicted')):
        for mapping in mapping_list:
            # We skip anything that isn't an exact match
            if mapping['relation'] != 'skos:exactMatch':
                continue
            # Skip excluded namespaces that aren't relevant here
            if (mapping['source prefix'] in exclude_ns
                    or mapping['target prefix'] in exclude_ns):
                continue
            # We only accept curated mappings for NCIT
            if mapping_type == 'predicted' and (
                    mapping['source prefix'] == 'ncit'
                    or mapping['target prefix'] == 'ncit'):
                continue
            source_ns, source_id = get_ns_id_from_identifiers(
                mapping['source prefix'], mapping['source identifier'])
            target_ns, target_id = get_ns_id_from_identifiers(
                mapping['target prefix'], mapping['target identifier'])
            # We only take real xrefs, not refs within a given ontology
            if source_ns == target_ns:
                continue
            biomappings[(source_ns, source_id, mapping['source name'])].append(
                (target_ns, target_id, mapping['target name']))
            biomappings[(target_ns, target_id, mapping['target name'])].append(
                (source_ns, source_id, mapping['source name']))

    def _filter_ncit(values):
        if len(values) > 1 and 'NCIT' in {v[0] for v in values}:
            return [v for v in values if v[0] != 'NCIT']
        else:
            return values

    mesh_mappings = {k: _filter_ncit(v) for k, v in biomappings.items()
                     if k[0] == 'MESH'}
    non_mesh_mappings = {k: [vv for vv in v if vv[0] != 'MESH']
                         for k, v in biomappings.items()
                         if k[0] != 'MESH' and k[1] != 'MESH'}
    rows = []
    for k, v in non_mesh_mappings.items():
        for vv in v:
            rows.append(list(k + vv))
    rows = sorted(rows, key=lambda x: x[1])
    write_unicode_csv(get_resource_path('biomappings.tsv'), rows,
                      delimiter='\t')

    # We next look at mappings to MeSH from EFO/HP/DOID
    for ns in ['efo', 'hp', 'doid']:
        for entry in load_resource_json('%s.json' % ns):
            db, db_id, name = ns.upper(), entry['id'], entry['name']
            if (db, db_id) in biomappings:
                continue
            # We first need to decide if we prioritize another namespace
            xref_dict = {xr['namespace']: xr['id']
                         for xr in entry.get('xrefs', [])}
            if 'MESH' in xref_dict or 'MSH' in xref_dict:
                mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
                if not mesh_id.startswith('D'):
                    continue
                mesh_name = mesh_client.get_mesh_name(mesh_id)
                if not mesh_name:
                    continue
                key = ('MESH', mesh_id, mesh_name)
                if db_id.startswith('BFO'):
                    db_to_use = 'BFO'
                    db_id_to_use = db_id[4:]
                else:
                    db_to_use = db
                    db_id_to_use = db_id
                if key not in mesh_mappings:
                    mesh_mappings[key] = [(db_to_use, db_id_to_use,
                                           entry['name'])]
                else:
                    mesh_mappings[key].append((db_to_use, db_id_to_use,
                                               entry['name']))

    rows = []
    for k, v in mesh_mappings.items():
        for vv in v:
            rows.append(list(k + vv))
    rows = sorted(rows, key=lambda x: (x[1], x[2], x[3]))
    write_unicode_csv(get_resource_path('mesh_mappings.tsv'), rows,
                      delimiter='\t')
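Both TSV files written above contain six columns per row: the source (namespace, identifier, name) triple followed by the target triple, with no header row. A small hypothetical reader (read_mapping_rows is not part of the original code) illustrates the layout:

import csv


def read_mapping_rows(path):
    """Yield (source, target) triples from a six-column mapping TSV (sketch)."""
    with open(path) as file:
        for row in csv.reader(file, delimiter='\t'):
            source_ns, source_id, source_name, target_ns, target_id, target_name = row
            yield (source_ns, source_id, source_name), (target_ns, target_id, target_name)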
Example 4
def get_true_graph() -> nx.Graph:
    """Get a graph of the true mappings."""
    return _graph_from_mappings(load_mappings())
Example 5
def charts():
    """Make charts."""
    import matplotlib.pyplot as plt
    import seaborn as sns

    miriam_validator = MiriamValidator()
    true_mappings = load_mappings()
    true_graph = _graph_from_mappings(true_mappings,
                                      include=["skos:exactMatch"])

    component_node_sizes = []
    component_edge_sizes = []
    component_densities = []
    component_number_prefixes = []
    prefix_list = []
    components_with_duplicate_prefixes = []
    incomplete_components = []
    n_duplicates = []
    for component in tqdm(nx.connected_components(true_graph),
                          desc="Iterating components"):
        component = true_graph.subgraph(component)
        node_size = component.number_of_nodes()
        edge_size = component.number_of_edges()

        nodes_data = {
            curie: {
                "link": miriam_validator.get_url(data["prefix"], data["identifier"]),
                **data,
            }
            for curie, data in sorted(component.nodes(data=True), key=itemgetter(0))
        }

        component_node_sizes.append(node_size)
        component_edge_sizes.append(edge_size)
        if node_size > 2:
            component_densities.append(nx.density(component))
        if node_size > 2 and edge_size < (node_size * (node_size - 1) / 2):
            incomplete_components_edges = []
            for u, v in sorted(nx.complement(component.copy()).edges()):
                if u > v:
                    u, v = v, u
                incomplete_components_edges.append({
                    "source": {"curie": u, **nodes_data[u]},
                    "target": {"curie": v, **nodes_data[v]},
                })
            incomplete_components_edges = sorted(
                incomplete_components_edges, key=lambda d: d["source"]["curie"]
            )
            incomplete_components.append({
                "nodes": nodes_data,
                "edges": incomplete_components_edges,
            })

        prefixes = [true_graph.nodes[node]["prefix"] for node in component]
        prefix_list.extend(prefixes)
        unique_prefixes = len(set(prefixes))
        component_number_prefixes.append(unique_prefixes)
        _n_duplicates = len(prefixes) - unique_prefixes
        n_duplicates.append(_n_duplicates)
        if _n_duplicates:
            components_with_duplicate_prefixes.append(nodes_data)

    with open(os.path.join(DATA, "incomplete_components.yml"), "w") as file:
        yaml.safe_dump(incomplete_components, file)
    with open(os.path.join(DATA, "components_with_duplicate_prefixes.yml"),
              "w") as file:
        yaml.safe_dump(components_with_duplicate_prefixes, file)

    fig, axes = plt.subplots(2, 3, figsize=(10.5, 6.5))

    _countplot_list(component_node_sizes, ax=axes[0][0])
    axes[0][0].set_yscale("log")
    axes[0][0].set_title("Size (Nodes)")

    _countplot_list(component_edge_sizes, ax=axes[0][1])
    axes[0][1].set_yscale("log")
    axes[0][1].set_title("Size (Edges)")
    axes[0][1].set_ylabel("")

    sns.kdeplot(component_densities, ax=axes[0][2])
    axes[0][2].set_xlim([0.0, 1.0])
    # axes[0][2].set_yscale('log')
    axes[0][2].set_title("Density ($|V| > 2$)")
    axes[0][2].set_ylabel("")

    _countplot_list(component_number_prefixes, ax=axes[1][0])
    axes[1][0].set_title("Number Prefixes")
    axes[1][0].set_yscale("log")
    # has duplicate prefix in component

    _countplot_list(n_duplicates, ax=axes[1][1])
    axes[1][1].set_yscale("log")
    axes[1][1].set_ylabel("")
    axes[1][1].set_title("Number Duplicate Prefixes")

    axes[1][2].axis("off")

    path = os.path.join(IMG, "components.png")
    print("saving to", path)
    plt.tight_layout()
    plt.savefig(path, dpi=300)
    plt.close(fig)

    fig, axes = plt.subplots(1, 2, figsize=(8, 3.5))
    sns.countplot(y=prefix_list,
                  ax=axes[0],
                  order=[k for k, _ in Counter(prefix_list).most_common()])
    axes[0].set_xscale("log")
    axes[0].set_title("Prefix Frequency")

    relations = [m["relation"] for m in true_mappings]
    sns.countplot(y=relations,
                  ax=axes[1],
                  order=[k for k, _ in Counter(relations).most_common()])
    axes[1].set_xscale("log")
    axes[1].set_title("Relation Frequency")

    path = os.path.join(IMG, "summary.png")
    print("saving to", path)
    plt.tight_layout()
    plt.savefig(path, dpi=300)
    plt.close(fig)
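The _countplot_list helper used throughout this example is not shown. A minimal sketch, assuming it simply wraps seaborn's countplot so the integer values appear in ascending order, would be:

import seaborn as sns


def _countplot_list(data, ax):
    """Draw a count plot for a list of integers on the given axes (hypothetical sketch)."""
    sns.countplot(x=data, ax=ax, order=sorted(set(data)))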
Example 6
def get_true_graph(include: Optional[Sequence[str]] = None,
                   exclude: Optional[Sequence[str]] = None) -> nx.Graph:
    """Get a graph of the true mappings."""
    return _graph_from_mappings(load_mappings(),
                                include=include,
                                exclude=exclude)
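The shared helper _graph_from_mappings is not included in these examples. A minimal sketch, assuming each mapping dict carries the "source/target prefix", "source/target identifier", "source/target name", and "relation" keys seen above, and that nodes are keyed by CURIE with prefix/identifier/name data attributes (as Example 5 expects):

from typing import Optional, Sequence

import networkx as nx


def _graph_from_mappings(
    mappings,
    include: Optional[Sequence[str]] = None,
    exclude: Optional[Sequence[str]] = None,
) -> nx.Graph:
    """Build an undirected graph of mappings, keyed by CURIE (hypothetical sketch)."""
    graph = nx.Graph()
    for mapping in mappings:
        relation = mapping["relation"]
        # Optionally restrict to, or filter out, specific relation types
        if include is not None and relation not in include:
            continue
        if exclude is not None and relation in exclude:
            continue
        source = f"{mapping['source prefix']}:{mapping['source identifier']}"
        target = f"{mapping['target prefix']}:{mapping['target identifier']}"
        graph.add_node(source, prefix=mapping["source prefix"],
                       identifier=mapping["source identifier"],
                       name=mapping["source name"])
        graph.add_node(target, prefix=mapping["target prefix"],
                       identifier=mapping["target identifier"],
                       name=mapping["target name"])
        graph.add_edge(source, target, relation=relation)
    return graph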