Пример #1
0
    def __init__(self,
                 graph=None,
                 host=None,
                 ports=None,
                 username=None,
                 password=None,
                 **args):
        """
        Initialize the transformer and, when possible, its Neo4j HTTP driver.

        Parameters
        ----------
        graph:
            Graph object forwarded unchanged to the parent initializer
        host:
            Host name used to build the HTTP URI when ``ports`` is given
        ports:
            Mapping of protocol name to port; only the ``'http'`` entry is
            read. When ``None``, host and port are read from the ``neo4j``
            section of ``config.yml`` in the current working directory
        username:
            Username passed to the HTTP driver
        password:
            Password passed to the HTTP driver
        **args:
            Accepted but not used by this initializer

        """
        super(NeoTransformer, self).__init__(graph)

        # Stays None when no HTTP port can be determined
        self.http_driver = None

        if ports is None:
            # No explicit ports given: read host and port from config.yml
            with open('config.yml', 'r') as ymlfile:
                # safe_load only builds plain Python objects; a bare
                # yaml.load() can instantiate arbitrary objects and is
                # rejected without an explicit Loader by PyYAML >= 6
                cfg = yaml.safe_load(ymlfile)

            neo4j_cfg = cfg['neo4j']
            if 'http_port' in neo4j_cfg:
                http_uri = "http://{}:{}".format(neo4j_cfg['host'],
                                                 neo4j_cfg['http_port'])
                logging.debug(
                    "Initializing http driver with URI: {}".format(http_uri))
                self.http_driver = http_gdb(http_uri,
                                            username=username,
                                            password=password)
        elif 'http' in ports:
            http_uri = "http://{}:{}".format(host, ports['http'])
            self.http_driver = http_gdb(http_uri,
                                        username=username,
                                        password=password)
Пример #2
0
 def __init__(self,
              graph: nx.MultiDiGraph = None,
              uri: str = None,
              username: str = None,
              password: str = None):
     """
     Create a NeoTransformer backed by an HTTP driver.

     ``graph`` is handed to the parent initializer; ``uri``, ``username``
     and ``password`` are passed straight through to ``http_gdb``.
     """
     super(NeoTransformer, self).__init__(graph)
     # Define the attribute up front so it exists before the driver is built
     self.http_driver = None
     credentials = {'username': username, 'password': password}
     self.http_driver = http_gdb(uri, **credentials)
Пример #3
0
 def __init__(self,
              graph=None,
              host=None,
              port=None,
              username=None,
              password=None):
     """
     Create a NeoTransformer whose HTTP driver targets ``http://<host>:<port>``.

     ``graph`` is handed to the parent initializer; ``username`` and
     ``password`` are passed through to ``http_gdb``.
     """
     super(NeoTransformer, self).__init__(graph)
     # Define the attribute up front so it exists before the driver is built
     self.http_driver = None
     credentials = {'username': username, 'password': password}
     self.http_driver = http_gdb(f'http://{host}:{port}', **credentials)
Пример #4
0
def test_neo_to_graph_download():
    """
    Download a slice of a Neo4j graph into a JsonTransformer graph.

    The unconditional ``return`` right below disables this test: none of the
    statements after it are executed. The remaining body is kept as a record
    of the intended flow (count matching edges, then page through them and
    copy nodes and edges into the transformer's graph).
    """
    return

    def as_cypher_label(value):
        # Backtick-quoted ``:label`` fragment, or nothing when no filter is set
        return ':`{}`'.format(value) if isinstance(value, str) else ''

    stop_after = 100

    output_transformer = JsonTransformer()
    G = output_transformer.graph

    driver = http_gdb('http://localhost:7474', username='', password='')

    subject_label = as_cypher_label('gene')
    object_label = as_cypher_label(None)
    edge_type = as_cypher_label(None)

    match = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_type,
                                              object_label)

    results = driver.query('{} return count(*)'.format(match))

    print('Using cyper query: {} return n, e, m'.format(match))

    # Only the first row of the count query matters
    for row in results:
        (size,) = row
        break

    if size == 0:
        print('No data available')
        quit()

    page_size = 1_000

    skip_flag = False

    for offset in range(0, size, page_size):
        page_query = '{} return n, e, m skip {} limit {}'.format(
            match, offset, page_size)
        page = driver.query(page_query)

        for subject_node, edge, object_node in page:
            subject_attr = subject_node['data']
            object_attr = object_node['data']
            edge_attr = edge['data']

            # Both endpoints need an id to be usable as graph node keys
            if not ('id' in subject_attr and 'id' in object_attr):
                if not skip_flag:
                    print('Skipping records that have no id attribute')
                    skip_flag = True
                continue

            s, o = subject_attr['id'], object_attr['id']

            # Fall back to Neo4j metadata for missing edge label / categories
            if 'edge_label' not in edge_attr:
                edge_attr['edge_label'] = edge['metadata']['type']

            for attrs, node in ((subject_attr, subject_node),
                                (object_attr, object_node)):
                if 'category' not in attrs:
                    attrs['category'] = node['metadata']['labels']

            for node_id, attrs in ((s, subject_attr), (o, object_attr)):
                if node_id not in G:
                    G.add_node(node_id, **attrs)

            G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

        # The limit is only checked between pages, so it can be overshot
        if stop_after is not None and G.number_of_edges() > stop_after:
            break
Пример #5
0
def neo4j_download(config: dict, address: str, username: str, password: str,
                   output: str, output_type: str, subject_label: str,
                   object_label: str, edge_label: str, directed: bool,
                   page_size: int, stop_after: int):
    """
    Download nodes and edges from Neo4j database.
    \f

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)
    output_type: str
        The output type (``csv``, by default)
    subject_label: str
        The label for subject node in an association
    object_label: str
        The label for object node in an association
    edge_label: str
        The label for the edge in an association
    directed: bool
        Whether or not the edge is supposed to be directed (``true``, by default)
    page_size: int
        The page size to use while fetching associations from Neo4j.
        ``None`` or a non-positive value falls back to ``1000``
    stop_after: int
        The max number of edges to fetch; checked after each page, so the
        final graph may hold up to one page more than this

    """
    if not is_writable(output):
        try:
            # Probe writability by creating (or truncating) the target file
            with open(output, 'w+'):
                pass
        except (OSError, TypeError, ValueError):
            # Only failures of open() itself are reported; a bare except
            # would also swallow KeyboardInterrupt and SystemExit
            error(f'Cannot write to {output}')

    output_transformer = get_transformer(output_type)()
    G = output_transformer.graph

    driver = http_gdb(address, username=username, password=password)

    # Turn each optional label into a backtick-quoted Cypher ``:label``
    # fragment, or an empty string when no filter was requested
    subject_label = ':`{}`'.format(subject_label) if isinstance(
        subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(
        object_label, str) else ''
    edge_label = ':`{}`'.format(edge_label) if isinstance(edge_label,
                                                          str) else ''

    if directed:
        query = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_label,
                                                  object_label)
    else:
        query = 'match (n{})-[e{}]-(m{})'.format(subject_label, edge_label,
                                                 object_label)

    results = driver.query('{} return count(*)'.format(query))
    # First column of the first row; an empty result is treated as zero
    # matches instead of raising IndexError
    size = next((row[0] for row in results), 0)
    click.echo("SIZE: {}".format(size))

    if size == 0:
        click.echo('No records found.')
        return

    click.echo('Using cypher query: {} return n, e, m'.format(query))

    # Honour the caller's page size (it used to be overwritten with 1000);
    # guard against values that would make range() fail or never advance
    if page_size is None or page_size <= 0:
        page_size = 1_000
    skip_flag = False

    with click.progressbar(
            range(0, size, page_size),
            label='Downloading {} many edges'.format(size)) as bar:
        for offset in bar:
            q = '{} return n, e, m skip {} limit {}'.format(
                query, offset, page_size)
            results = driver.query(q)
            for n, e, m in results:
                subject_attr = n['data']
                object_attr = m['data']
                edge_attr = e['data']

                # Both endpoints need an id to serve as graph node keys
                if 'id' not in subject_attr or 'id' not in object_attr:
                    if not skip_flag:
                        # Report the problem once, not once per record
                        click.echo(
                            'Skipping records that have no id attribute')
                        skip_flag = True
                    continue

                s = subject_attr['id']
                o = object_attr['id']

                # Fall back to Neo4j metadata for a missing edge label or
                # missing node categories
                if 'edge_label' not in edge_attr:
                    edge_attr['edge_label'] = e['metadata']['type']

                if 'category' not in subject_attr:
                    subject_attr['category'] = n['metadata']['labels']

                if 'category' not in object_attr:
                    object_attr['category'] = m['metadata']['labels']

                if s not in G:
                    G.add_node(s, **subject_attr)
                if o not in G:
                    G.add_node(o, **object_attr)

                G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

            if stop_after is not None and G.number_of_edges() > stop_after:
                break

    output_transformer.save(output, extension=output_type)
Пример #6
0
def neo4j_edge_summary(config: dict,
                       address: str,
                       username: str,
                       password: str,
                       output: str = None):
    """
    Get a summary of all the edges in a Neo4j database.
    \f

    For every ordered pair of node categories found in the database, counts
    the edges between them grouped by edge type and by the CURIE prefix of
    the subject and object ids.

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    http_driver = http_gdb(address, username=username, password=password)

    query = """
    MATCH (x) RETURN DISTINCT x.category AS category
    """

    records = http_driver.query(query)
    categories = set()

    # ``category`` may be stored as a single string or as a collection
    for record in records:
        category = record[0]
        if isinstance(category, str):
            categories.add(category)
        elif isinstance(category, (list, set, tuple)):
            categories.update(category)
        elif category is None:
            continue
        else:
            error('Unrecognized value for node.category: {}'.format(category))

    categories = list(categories)

    query = """
    MATCH (n)-[r]-(m)
    WHERE
        (n.category = {category1} OR {category1} IN n.category) AND
        (m.category = {category2} OR {category2} IN m.category)
    RETURN DISTINCT
        {category1} AS subject_category,
        {category2} AS object_category,
        type(r) AS edge_type,
        split(n.id, ':')[0] AS subject_prefix,
        split(m.id, ':')[0] AS object_prefix,
        COUNT(*) AS frequency
    ORDER BY subject_category, object_category, frequency DESC;
    """

    combinations = [(c1, c2) for c1 in categories for c2 in categories]

    rows = []
    with click.progressbar(combinations, length=len(combinations)) as bar:
        for category1, category2 in bar:
            # Bind each parameter to its own loop variable; binding both to
            # category2 would only ever query same-category pairs
            records = http_driver.query(query,
                                        params={
                                            'category1': category1,
                                            'category2': category2
                                        })
            for r in records:
                # r[2] (edge_type) is returned by the query but is not part
                # of the report columns
                rows.append({
                    'subject_category': r[0],
                    'object_category': r[1],
                    'subject_prefix': r[3],
                    'object_prefix': r[4],
                    'frequency': r[5]
                })

    # Passing ``columns`` fixes the column order and also yields a valid
    # (empty) frame when no rows were found, where selecting the columns
    # afterwards would raise KeyError
    df = pd.DataFrame(rows,
                      columns=[
                          'subject_category', 'subject_prefix',
                          'object_category', 'object_prefix', 'frequency'
                      ])

    if output is None:
        # Lift pandas' display limits so the full report is printed
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))
Пример #7
0
def neo4j_node_summary(config: dict,
                       address: str,
                       username: str,
                       password: str,
                       output: str = None):
    """
    Get a summary of all the nodes in a Neo4j database.
    \f

    For every node category found in the database, counts the nodes in that
    category grouped by the CURIE prefix of their id.

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    http_driver = http_gdb(address, username=username, password=password)

    query = """
    MATCH (x) RETURN DISTINCT x.category AS category
    """

    records = http_driver.query(query)
    categories = set()
    # ``category`` may be stored as a single string or as a collection
    for record in records:
        category = record[0]
        if isinstance(category, str):
            categories.add(category)
        elif isinstance(category, (list, set, tuple)):
            categories.update(category)
        elif category is None:
            continue
        else:
            error('Unrecognized value for node.category: {}'.format(category))

    # The category is sent as a query parameter rather than spliced into the
    # Cypher text, so values containing quotes can neither break nor alter
    # the statement. Same placeholder style as neo4j_edge_summary's query.
    query = """
    MATCH (x) WHERE x.category = {category} OR {category} IN x.category
    RETURN DISTINCT
        {category} AS category,
        split(x.id, ':')[0] AS prefix,
        COUNT(*) AS frequency
    ORDER BY category, frequency DESC;
    """

    rows = []
    with click.progressbar(categories, length=len(categories)) as bar:
        for category in bar:
            records = http_driver.query(query, params={'category': category})
            for record in records:
                rows.append({
                    'category': record[0],
                    'prefix': record[1],
                    'frequency': record[2]
                })

    # Passing ``columns`` fixes the column order and also yields a valid
    # (empty) frame when no rows were found, where selecting the columns
    # afterwards would raise KeyError
    df = pd.DataFrame(rows, columns=['category', 'prefix', 'frequency'])

    if output is None:
        click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))
Пример #8
0
def neo4j_download(config, page_size, stop_after, subject_label, object_label, edge_type, address, username, password, output, output_type):
    """
    Download nodes and edges from a Neo4j database and save them to a file.
    \f

    Parameters
    ----------
    config:
        Not used by this function
    page_size:
        Number of records fetched per query. ``None`` or a non-positive
        value falls back to ``1000``
    stop_after:
        Stop fetching once the graph holds more than this many edges
        (checked after each page); ``None`` disables the limit
    subject_label:
        Label that subject nodes must have, or a non-string for no filter
    object_label:
        Label that object nodes must have, or a non-string for no filter
    edge_type:
        Relationship type edges must have, or a non-string for no filter
    address:
        Address passed to ``http_gdb``
    username:
        Username for authentication
    password:
        Password for authentication
    output:
        Path of the file to write; also determines the transformer used
    output_type:
        Not used by this function (the type is derived from ``output``)

    """
    if not is_writable(output):
        try:
            # Probe writability by creating (or truncating) the target file
            with open(output, 'w+'):
                pass
        except (OSError, TypeError, ValueError):
            # Only failures of open() itself are reported; a bare except
            # would also swallow KeyboardInterrupt and SystemExit
            error(f'Cannot write to {output}')

    output_transformer = get_transformer(get_type(output))()
    G = output_transformer.graph

    driver = http_gdb(address, username=username, password=password)

    # Turn each optional label into a backtick-quoted Cypher ``:label``
    # fragment, or an empty string when no filter was requested
    subject_label = ':`{}`'.format(subject_label) if isinstance(subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(object_label, str) else ''
    edge_type = ':`{}`'.format(edge_type) if isinstance(edge_type, str) else ''

    match = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_type, object_label)

    results = driver.query('{} return count(*)'.format(match))

    click.echo('Using cyper query: {} return n, e, m'.format(match))

    # First column of the first row; an empty result is treated as zero
    # matches instead of leaving ``size`` unbound
    size = next((row[0] for row in results), 0)

    if size == 0:
        click.echo('No data available')
        # Plain return instead of quit(): nothing is saved either way, but
        # a caller using this as a library function is not terminated
        return

    # Honour the caller's page size (it used to be overwritten with 1000);
    # guard against values that would make range() fail or never advance
    if page_size is None or page_size <= 0:
        page_size = 1_000

    skip_flag = False

    with click.progressbar(range(0, size, page_size), label='Downloading {} many edges'.format(size)) as bar:
        for offset in bar:
            q = '{} return n, e, m skip {} limit {}'.format(match, offset, page_size)
            results = driver.query(q)

            for n, e, m in results:
                subject_attr = n['data']
                object_attr = m['data']
                edge_attr = e['data']

                # Both endpoints need an id to serve as graph node keys
                if 'id' not in subject_attr or 'id' not in object_attr:
                    if not skip_flag:
                        # Report the problem once, not once per record
                        click.echo('Skipping records that have no id attribute')
                        skip_flag = True
                    continue

                s = subject_attr['id']
                o = object_attr['id']

                # Fall back to Neo4j metadata for a missing edge label or
                # missing node categories
                if 'edge_label' not in edge_attr:
                    edge_attr['edge_label'] = e['metadata']['type']

                if 'category' not in subject_attr:
                    subject_attr['category'] = n['metadata']['labels']

                if 'category' not in object_attr:
                    object_attr['category'] = m['metadata']['labels']

                if s not in G:
                    G.add_node(s, **subject_attr)
                if o not in G:
                    G.add_node(o, **object_attr)

                G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

            if stop_after is not None and G.number_of_edges() > stop_after:
                break

    output_transformer.save(output)