Exemplo n.º 1
0
def neo4j_node_summary(config, address, username, password, output=None):
    """
    Get a summary of all the nodes in a Neo4j database.
    \f

    For every distinct value of the ``category`` node property, counts the
    nodes per identifier prefix (the part of ``id`` before the first ``:``).

    Parameters
    ----------
    config:
        CLI configuration object (not used by this function)
    address: str
        Address handed to ``GraphDatabase.driver``
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the ``|``-separated report; when ``None`` the report
        is echoed to the terminal instead

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    bolt_driver = GraphDatabase.driver(address, auth=(username, password))

    category_query = """
    MATCH (x) RETURN DISTINCT x.category AS category
    """

    frequency_query = """
    MATCH (x) WHERE x.category = {category} OR {category} IN x.category
    RETURN DISTINCT
        {category} AS category,
        split(x.id, ':')[0] AS prefix,
        COUNT(*) AS frequency
    ORDER BY category, frequency DESC;
    """

    categories = set()
    rows = []

    try:
        # Records are consumed while the session is still open: a result
        # must not be relied upon after its session has been closed.
        with bolt_driver.session() as session:
            for record in session.run(category_query):
                category = record['category']
                if isinstance(category, str):
                    categories.add(category)
                elif isinstance(category, (list, set, tuple)):
                    # Multi-valued category property: count each value
                    categories.update(category)
                elif category is None:
                    continue
                else:
                    error('Unrecognized value for node.category: {}'.format(category))

        with click.progressbar(categories, length=len(categories)) as bar:
            for category in bar:
                with bolt_driver.session() as session:
                    for record in session.run(frequency_query, category=category):
                        rows.append({
                            'category': record['category'],
                            'prefix': record['prefix'],
                            'frequency': record['frequency'],
                        })
    finally:
        # Release the driver's connections even if a query fails
        bolt_driver.close()

    # Passing ``columns`` fixes the column order and keeps an empty result
    # from raising KeyError on column selection.
    df = pd.DataFrame(rows, columns=['category', 'prefix', 'frequency'])

    if output is None:
        click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))
Exemplo n.º 2
0
def dump(config, inputs, output, input_type, output_type, mapping, preserve):
    """\b
    Transforms a knowledge graph from one representation to another
    """
    # inputs / input_type  -> handed to load_transformer to build the source graph
    # output / output_type -> handed to transform_and_save for the result
    # mapping              -> optional name of a pickled mapping, resolved
    #                         through get_file_path; None skips the mapping step
    # preserve             -> forwarded unchanged to map_graph
    if not is_writable(output):
        error(f'Cannot write to {output}')

    t = load_transformer(inputs, input_type)
    if mapping is not None:
        path = get_file_path(mapping)
        # SECURITY NOTE: pickle.load can execute arbitrary code while
        # deserializing. Only use mapping files from a trusted source.
        with click.open_file(path, 'rb') as f:
            d = pickle.load(f)
        # The file is closed as soon as it has been read; the mapping
        # itself does not need the handle.
        click.echo('Performing mapping: ' + mapping)
        map_graph(G=t.graph, mapping=d, preserve=preserve)
    transform_and_save(t, output, output_type)
Exemplo n.º 3
0
def neo4j_download(config: dict, address: str, username: str, password: str,
                   output: str, output_type: str, subject_label: str,
                   object_label: str, edge_label: str, directed: bool,
                   page_size: int, stop_after: int):
    """
    Download nodes and edges from Neo4j database.
    \f

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)
    output_type: str
        The output type (``csv``, by default)
    subject_label: str
        The label for subject node in an association
    object_label: str
        The label for object node in an association
    edge_label: str
        The label for the edge in an association
    directed: bool
        Whether or not the edge is supposed to be directed (``true``, by default)
    stop_after: int
        The max number of edges to fetch. The limit is checked after each
        page, so the final page may overshoot it.
    page_size: int
        The page size to use while fetching associations from Neo4j (``10000``, by default).
        Falls back to ``1000`` when ``None`` or not positive.

    """
    if not is_writable(output):
        # Last resort: try to create the file. Only I/O failures (or a
        # non-path ``output``) count as "not writable".
        try:
            with open(output, 'w+'):
                pass
        except (OSError, TypeError):
            error(f'Cannot write to {output}')

    output_transformer = get_transformer(output_type)()
    G = output_transformer.graph

    driver = http_gdb(address, username=username, password=password)

    # Labels cannot be passed as query parameters, so they are spliced into
    # the pattern (backtick-quoted). A non-string label means "any label".
    subject_label = ':`{}`'.format(subject_label) if isinstance(
        subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(
        object_label, str) else ''
    edge_label = ':`{}`'.format(edge_label) if isinstance(edge_label,
                                                          str) else ''

    if directed:
        query = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_label,
                                                  object_label)
    else:
        query = 'match (n{})-[e{}]-(m{})'.format(subject_label, edge_label,
                                                 object_label)

    results = driver.query('{} return count(*)'.format(query))
    # count(*) yields a single row; treat a missing row as "nothing found"
    size = next((row[0] for row in results), 0)
    click.echo("SIZE: {}".format(size))

    if size == 0:
        click.echo('No records found.')
        return

    click.echo('Using cypher query: {} return n, e, m'.format(query))

    # Honour the caller's page size; previously it was unconditionally
    # overwritten with 1000. A zero/negative step would break range().
    if page_size is None or page_size < 1:
        page_size = 1_000
    skip_flag = False

    # NOTE(review): the paged query has no ORDER BY, so page contents are
    # presumably not guaranteed to be stable between requests; confirm.
    with click.progressbar(
            range(0, size, page_size),
            label='Downloading {} many edges'.format(size)) as bar:
        for i in bar:
            q = '{} return n, e, m skip {} limit {}'.format(
                query, i, page_size)
            results = driver.query(q)
            for n, e, m in results:
                subject_attr = n['data']
                object_attr = m['data']
                edge_attr = e['data']

                if 'id' not in subject_attr or 'id' not in object_attr:
                    # Warn only once, however many records are skipped
                    if not skip_flag:
                        click.echo(
                            'Skipping records that have no id attribute')
                        skip_flag = True
                    continue

                s = subject_attr['id']
                o = object_attr['id']

                # Fall back to Neo4j metadata for missing properties
                if 'edge_label' not in edge_attr:
                    edge_attr['edge_label'] = e['metadata']['type']

                if 'category' not in subject_attr:
                    subject_attr['category'] = n['metadata']['labels']

                if 'category' not in object_attr:
                    object_attr['category'] = m['metadata']['labels']

                if s not in G:
                    G.add_node(s, **subject_attr)
                if o not in G:
                    G.add_node(o, **object_attr)

                G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

            # ``>=`` so that reaching the limit exactly does not trigger
            # one more page download.
            if stop_after is not None and G.number_of_edges() >= stop_after:
                break

    output_transformer.save(output, extension=output_type)
Exemplo n.º 4
0
def neo4j_edge_summary(config: dict,
                       address: str,
                       username: str,
                       password: str,
                       output: str = None):
    """
    Get a summary of all the edges in a Neo4j database.
    \f

    For every ordered pair of node categories, counts the relationships
    between them grouped by edge type and by the identifier prefixes of
    both end nodes.

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    http_driver = http_gdb(address, username=username, password=password)

    query = """
    MATCH (x) RETURN DISTINCT x.category AS category
    """

    records = http_driver.query(query)
    categories = set()

    for record in records:
        category = record[0]
        if isinstance(category, str):
            categories.add(category)
        elif isinstance(category, (list, set, tuple)):
            # Multi-valued category property: count each value
            categories.update(category)
        elif category is None:
            continue
        else:
            error('Unrecognized value for node.category: {}'.format(category))

    categories = list(categories)

    query = """
    MATCH (n)-[r]-(m)
    WHERE
        (n.category = {category1} OR {category1} IN n.category) AND
        (m.category = {category2} OR {category2} IN m.category)
    RETURN DISTINCT
        {category1} AS subject_category,
        {category2} AS object_category,
        type(r) AS edge_type,
        split(n.id, ':')[0] AS subject_prefix,
        split(m.id, ':')[0] AS object_prefix,
        COUNT(*) AS frequency
    ORDER BY subject_category, object_category, frequency DESC;
    """

    combinations = [(c1, c2) for c1 in categories for c2 in categories]

    rows = []
    with click.progressbar(combinations, length=len(combinations)) as bar:
        for category1, category2 in bar:
            # category1 must be bound to the subject category; it was
            # previously bound to category2, so only same-category pairs
            # were ever queried.
            records = http_driver.query(query,
                                        params={
                                            'category1': category1,
                                            'category2': category2
                                        })
            for r in records:
                # Columns follow the RETURN clause order.
                # NOTE(review): edge_type (r[2]) is returned by the query
                # but is not part of the report.
                rows.append({
                    'subject_category': r[0],
                    'object_category': r[1],
                    'subject_prefix': r[3],
                    'object_prefix': r[4],
                    'frequency': r[5]
                })

    # Passing ``columns`` fixes the column order and keeps an empty result
    # from raising KeyError on column selection.
    df = pd.DataFrame(rows, columns=[
        'subject_category', 'subject_prefix', 'object_category',
        'object_prefix', 'frequency'
    ])

    if output is None:
        # Lift pandas' display limits so the full report is printed
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))
Exemplo n.º 5
0
def neo4j_node_summary(config: dict,
                       address: str,
                       username: str,
                       password: str,
                       output: str = None):
    """
    Get a summary of all the nodes in a Neo4j database.
    \f

    For every distinct value of the ``category`` node property, counts the
    nodes per identifier prefix (the part of ``id`` before the first ``:``).

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    http_driver = http_gdb(address, username=username, password=password)

    query = """
    MATCH (x) RETURN DISTINCT x.category AS category
    """

    records = http_driver.query(query)
    categories = set()
    for record in records:
        category = record[0]
        if isinstance(category, str):
            categories.add(category)
        elif isinstance(category, (list, set, tuple)):
            # Multi-valued category property: count each value
            categories.update(category)
        elif category is None:
            continue
        else:
            error('Unrecognized value for node.category: {}'.format(category))

    # The category is sent as a query parameter instead of being formatted
    # into the Cypher text, so values containing quotes cannot break (or
    # alter) the statement. Being loop-invariant, the text is built once.
    query = """
    MATCH (x) WHERE x.category = {category} OR {category} IN x.category
    RETURN DISTINCT
        {category} AS category,
        split(x.id, ':')[0] AS prefix,
        COUNT(*) AS frequency
    ORDER BY category, frequency DESC;
    """

    rows = []
    with click.progressbar(categories, length=len(categories)) as bar:
        for category in bar:
            records = http_driver.query(query, params={'category': category})
            for record in records:
                # Columns follow the RETURN clause order
                rows.append({
                    'category': record[0],
                    'prefix': record[1],
                    'frequency': record[2]
                })

    # Passing ``columns`` fixes the column order and keeps an empty result
    # from raising KeyError on column selection.
    df = pd.DataFrame(rows, columns=['category', 'prefix', 'frequency'])

    if output is None:
        click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))
Exemplo n.º 6
0
def neo4j_download(config, page_size, stop_after, subject_label, object_label, edge_type, address, username, password, output, output_type):
    """
    Download nodes and edges from a Neo4j database into a graph file.
    \f

    Parameters
    ----------
    config:
        CLI configuration object (not used by this function)
    page_size: int
        Number of edges to request per query. Falls back to ``1000`` when
        ``None`` or not positive.
    stop_after: int
        The max number of edges to fetch. The limit is checked after each
        page, so the final page may overshoot it. ``None`` means no limit.
    subject_label: str
        Label required on the subject node (any label when not a string)
    object_label: str
        Label required on the object node (any label when not a string)
    edge_type: str
        Relationship type required on the edge (any type when not a string)
    address: str
        Address handed to ``http_gdb``
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Path of the file to write; the transformer is picked from
        ``get_type(output)``
    output_type: str
        Not used by this function

    """
    if not is_writable(output):
        # Last resort: try to create the file. Only I/O failures (or a
        # non-path ``output``) count as "not writable".
        try:
            with open(output, 'w+'):
                pass
        except (OSError, TypeError):
            error(f'Cannot write to {output}')

    output_transformer = get_transformer(get_type(output))()
    G = output_transformer.graph

    driver = http_gdb(address, username=username, password=password)

    # Labels/types cannot be passed as query parameters, so they are spliced
    # into the pattern (backtick-quoted).
    subject_label = ':`{}`'.format(subject_label) if isinstance(subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(object_label, str) else ''
    edge_type = ':`{}`'.format(edge_type) if isinstance(edge_type, str) else ''

    match = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_type, object_label)

    results = driver.query('{} return count(*)'.format(match))

    click.echo('Using cyper query: {} return n, e, m'.format(match))

    # First column of the first row; defaults to 0 so ``size`` is always
    # bound even if the count query returns no rows.
    size = next((row[0] for row in results), 0)

    if size == 0:
        click.echo('No data available')
        # Plain return instead of quit(): the latter is an interactive
        # helper installed by ``site`` and would also kill a calling process.
        return

    # Honour the caller's page size; previously it was unconditionally
    # overwritten with 1000. A zero/negative step would break range().
    if page_size is None or page_size < 1:
        page_size = 1_000

    skip_flag = False

    with click.progressbar(range(0, size, page_size), label='Downloading {} many edges'.format(size)) as bar:
        for i in bar:
            q = '{} return n, e, m skip {} limit {}'.format(match, i, page_size)
            results = driver.query(q)

            for n, e, m in results:
                subject_attr = n['data']
                object_attr = m['data']
                edge_attr = e['data']

                if 'id' not in subject_attr or 'id' not in object_attr:
                    # Warn only once, however many records are skipped
                    if not skip_flag:
                        click.echo('Skipping records that have no id attribute')
                        skip_flag = True
                    continue

                s = subject_attr['id']
                o = object_attr['id']

                # Fall back to Neo4j metadata for missing properties
                if 'edge_label' not in edge_attr:
                    edge_attr['edge_label'] = e['metadata']['type']

                if 'category' not in subject_attr:
                    subject_attr['category'] = n['metadata']['labels']

                if 'category' not in object_attr:
                    object_attr['category'] = m['metadata']['labels']

                if s not in G:
                    G.add_node(s, **subject_attr)
                if o not in G:
                    G.add_node(o, **object_attr)

                G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

            # ``>=`` so that reaching the limit exactly does not trigger
            # one more page download.
            if stop_after is not None and G.number_of_edges() >= stop_after:
                break

    output_transformer.save(output)
Exemplo n.º 7
0
def neo4j_edge_summary(config, address, username, password, output=None):
    """
    Get a summary of all the edges in a Neo4j database.
    \f

    For every ordered pair of node categories, counts the relationships
    between them grouped by edge type and by the identifier prefixes of
    both end nodes.

    Parameters
    ----------
    config:
        CLI configuration object (not used by this function)
    address: str
        Address handed to ``GraphDatabase.driver``
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the ``|``-separated report; when ``None`` the report
        is echoed to the terminal instead

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    bolt_driver = GraphDatabase.driver(address, auth=(username, password))

    category_query = """
    MATCH (x) RETURN DISTINCT x.category AS category
    """

    edge_query = """
    MATCH (n)-[r]-(m)
    WHERE
        (n.category = {category1} OR {category1} IN n.category) AND
        (m.category = {category2} OR {category2} IN m.category)
    RETURN DISTINCT
        {category1} AS subject_category,
        {category2} AS object_category,
        type(r) AS edge_type,
        split(n.id, ':')[0] AS subject_prefix,
        split(m.id, ':')[0] AS object_prefix,
        COUNT(*) AS frequency
    ORDER BY subject_category, object_category, frequency DESC;
    """

    categories = set()
    rows = []

    try:
        # Records are consumed while the session is still open: a result
        # must not be relied upon after its session has been closed.
        with bolt_driver.session() as session:
            for record in session.run(category_query):
                category = record['category']
                if isinstance(category, str):
                    categories.add(category)
                elif isinstance(category, (list, set, tuple)):
                    # Multi-valued category property: count each value
                    categories.update(category)
                elif category is None:
                    continue
                else:
                    error('Unrecognized value for node.category: {}'.format(category))

        categories = list(categories)
        combinations = [(c1, c2) for c1 in categories for c2 in categories]

        with click.progressbar(combinations, length=len(combinations)) as bar:
            for category1, category2 in bar:
                with bolt_driver.session() as session:
                    records = session.run(edge_query, category1=category1, category2=category2)

                    # NOTE(review): edge_type is returned by the query but
                    # is not part of the report.
                    for r in records:
                        rows.append({
                            'subject_category': r['subject_category'],
                            'object_category': r['object_category'],
                            'subject_prefix': r['subject_prefix'],
                            'object_prefix': r['object_prefix'],
                            'frequency': r['frequency'],
                        })
    finally:
        # Release the driver's connections even if a query fails
        bolt_driver.close()

    # Passing ``columns`` fixes the column order and keeps an empty result
    # from raising KeyError on column selection.
    df = pd.DataFrame(rows, columns=['subject_category', 'subject_prefix', 'object_category', 'object_prefix', 'frequency'])

    if output is None:
        # Lift pandas' display limits so the full report is printed
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))