def neo4j_node_summary(config, address, username, password, output=None):
    """
    Get a summary of all the nodes in a Neo4j database.

    \f
    Collects every distinct value of the ``category`` node property and, for
    each one, counts the matching nodes grouped by the prefix of their ``id``
    (the text before the first ``:``). The report is echoed when ``output``
    is ``None`` and written as a ``|``-separated file otherwise.

    Parameters
    ----------
    config:
        Not used by this function
    address:
        Address passed to ``GraphDatabase.driver``
    username:
        Username for authentication
    password:
        Password for authentication
    output:
        Where to write the report (echoed when ``None``)

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    columns = ['category', 'prefix', 'frequency']
    rows = []

    bolt_driver = GraphDatabase.driver(address, auth=(username, password))
    try:
        category_query = """
        MATCH (x) RETURN DISTINCT x.category AS category
        """
        categories = set()
        with bolt_driver.session() as session:
            # Consume the result while the session is still open.
            for record in session.run(category_query):
                category = record['category']
                if isinstance(category, str):
                    categories.add(category)
                elif isinstance(category, (list, set, tuple)):
                    # Multi-valued category: each member counts on its own.
                    categories.update(category)
                elif category is None:
                    continue
                else:
                    error('Unrecognized value for node.category: {}'.format(category))

        # NOTE(review): ``{category}`` is the legacy Cypher parameter syntax;
        # newer Neo4j servers expect ``$category`` -- confirm the target version.
        summary_query = """
        MATCH (x) WHERE x.category = {category} OR {category} IN x.category
        RETURN DISTINCT
            {category} AS category,
            split(x.id, ':')[0] AS prefix,
            COUNT(*) AS frequency
        ORDER BY category, frequency DESC;
        """
        with click.progressbar(categories, length=len(categories)) as bar:
            for category in bar:
                with bolt_driver.session() as session:
                    for record in session.run(summary_query, category=category):
                        rows.append({name: record[name] for name in columns})
    finally:
        # Release the connection pool even if a query fails.
        bolt_driver.close()

    # Passing ``columns`` keeps the frame well-formed (and ordered) even when
    # no rows were collected; selecting columns afterwards raised KeyError on
    # an empty frame.
    df = pd.DataFrame(rows, columns=columns)

    if output is None:
        click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))
def dump(config, inputs, output, input_type, output_type, mapping, preserve):
    """\b
    Transforms a knowledge graph from one representation to another

    \f
    Parameters
    ----------
    config:
        Not used by this function
    inputs:
        Input source(s) handed to ``load_transformer``
    output:
        Destination handed to ``transform_and_save``; checked for
        writability first
    input_type:
        Input format handed to ``load_transformer``
    output_type:
        Output format handed to ``transform_and_save``
    mapping:
        Optional name of a pickled mapping, resolved with ``get_file_path``;
        when given, the graph is passed through ``map_graph`` before saving
    preserve:
        Forwarded to ``map_graph`` as ``preserve``

    """
    if not is_writable(output):
        error(f'Cannot write to {output}')

    t = load_transformer(inputs, input_type)

    if mapping is not None:
        path = get_file_path(mapping)
        # SECURITY: unpickling can execute arbitrary code. Only load mapping
        # files that come from a trusted source.
        with click.open_file(path, 'rb') as f:
            d = pickle.load(f)
        click.echo('Performing mapping: ' + mapping)
        map_graph(G=t.graph, mapping=d, preserve=preserve)

    transform_and_save(t, output, output_type)
def neo4j_download(config: dict, address: str, username: str, password: str,
                   output: str, output_type: str, subject_label: str,
                   object_label: str, edge_label: str, directed: bool,
                   page_size: int, stop_after: int):
    """
    Download nodes and edges from Neo4j database.

    \f
    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output
    output_type: str
        The output type; selects the transformer and the saved extension
    subject_label: str
        The label for subject node in an association
    object_label: str
        The label for object node in an association
    edge_label: str
        The label for the edge in an association
    directed: bool
        Whether or not the edge is supposed to be directed
    stop_after: int
        The max number of edges to fetch (no limit when ``None``)
    page_size: int
        The page size to use while fetching associations from Neo4j;
        falls back to ``1000`` when missing or not positive

    """
    if not is_writable(output):
        # Fall back to probing: try to create (and truncate) the file.
        try:
            with open(output, 'w+'):
                pass
        except (OSError, TypeError, ValueError):
            error(f'Cannot write to {output}')

    output_transformer = get_transformer(output_type)()
    G = output_transformer.graph

    driver = http_gdb(address, username=username, password=password)

    # A label filter is only added for values that are actually strings.
    subject_label = ':`{}`'.format(subject_label) if isinstance(subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(object_label, str) else ''
    edge_label = ':`{}`'.format(edge_label) if isinstance(edge_label, str) else ''

    if directed:
        query = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_label, object_label)
    else:
        query = 'match (n{})-[e{}]-(m{})'.format(subject_label, edge_label, object_label)

    results = driver.query('{} return count(*)'.format(query))
    counts = [x[0] for x in results]
    # An empty count result is treated like a count of zero instead of
    # raising IndexError.
    size = counts[0] if counts else 0
    click.echo("SIZE: {}".format(size))

    if size == 0:
        click.echo('No records found.')
        return

    click.echo('Using cypher query: {} return n, e, m'.format(query))

    # Honour the caller's page size; the previous hard-coded 1_000 silently
    # discarded the argument. Keep 1_000 only as a fallback.
    if page_size is None or page_size <= 0:
        page_size = 1_000

    skip_flag = False
    limit_reached = False
    # NOTE(review): paging with skip/limit and no ORDER BY relies on the
    # server returning rows in a stable order -- confirm this is acceptable.
    with click.progressbar(
            range(0, size, page_size),
            label='Downloading {} many edges'.format(size)) as bar:
        for i in bar:
            q = '{} return n, e, m skip {} limit {}'.format(query, i, page_size)
            results = driver.query(q)

            for n, e, m in results:
                subject_attr = n['data']
                object_attr = m['data']
                edge_attr = e['data']

                if 'id' not in subject_attr or 'id' not in object_attr:
                    # Only announce the skipping once.
                    if not skip_flag:
                        click.echo('Skipping records that have no id attribute')
                        skip_flag = True
                    continue

                s = subject_attr['id']
                o = object_attr['id']

                # Fill in missing attributes from the Neo4j metadata.
                if 'edge_label' not in edge_attr:
                    edge_attr['edge_label'] = e['metadata']['type']

                if 'category' not in subject_attr:
                    subject_attr['category'] = n['metadata']['labels']

                if 'category' not in object_attr:
                    object_attr['category'] = m['metadata']['labels']

                if s not in G:
                    G.add_node(s, **subject_attr)
                if o not in G:
                    G.add_node(o, **object_attr)

                G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

                # ``>=`` so that at most ``stop_after`` edges are kept.
                if stop_after is not None and G.number_of_edges() >= stop_after:
                    limit_reached = True
                    break

            # Leave the paging loop too; otherwise every remaining page is
            # still requested after the limit was hit.
            if limit_reached:
                break

    output_transformer.save(output, extension=output_type)
def neo4j_edge_summary(config: dict, address: str, username: str,
                       password: str, output: str = None):
    """
    Get a summary of all the edges in a Neo4j database.

    \f
    For every ordered pair of node categories, counts the relationships
    between nodes of those categories, grouped by relationship type and by
    the prefix of each node's ``id`` (the text before the first ``:``).

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (echoed when ``None``)

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    http_driver = http_gdb(address, username=username, password=password)

    query = """
    MATCH (x) RETURN DISTINCT x.category AS category
    """
    records = http_driver.query(query)

    categories = set()
    for record in records:
        category = record[0]
        if isinstance(category, str):
            categories.add(category)
        elif isinstance(category, (list, set, tuple)):
            # Multi-valued category: each member counts on its own.
            categories.update(category)
        elif category is None:
            continue
        else:
            error('Unrecognized value for node.category: {}'.format(category))

    categories = list(categories)

    query = """
    MATCH (n)-[r]-(m)
    WHERE
        (n.category = {category1} OR {category1} IN n.category) AND
        (m.category = {category2} OR {category2} IN m.category)
    RETURN DISTINCT
        {category1} AS subject_category,
        {category2} AS object_category,
        type(r) AS edge_type,
        split(n.id, ':')[0] AS subject_prefix,
        split(m.id, ':')[0] AS object_prefix,
        COUNT(*) AS frequency
    ORDER BY subject_category, object_category, frequency DESC;
    """

    combinations = [(c1, c2) for c1 in categories for c2 in categories]

    rows = []
    with click.progressbar(combinations, length=len(combinations)) as bar:
        for category1, category2 in bar:
            # Bind each placeholder to its own category. ``category1`` was
            # previously bound to ``category2``, so only self-pairs were
            # ever queried.
            records = http_driver.query(query, params={
                'category1': category1,
                'category2': category2
            })
            for r in records:
                # Positional columns follow the RETURN clause above.
                # NOTE(review): r[2] (edge_type) is returned by the query but
                # not included in the report -- confirm that is intended.
                rows.append({
                    'subject_category': r[0],
                    'object_category': r[1],
                    'subject_prefix': r[3],
                    'object_prefix': r[4],
                    'frequency': r[5]
                })

    # Passing ``columns`` fixes the column order and keeps the frame valid
    # when no rows were found (selecting columns afterwards raised KeyError
    # on an empty frame).
    df = pd.DataFrame(rows, columns=[
        'subject_category', 'subject_prefix', 'object_category',
        'object_prefix', 'frequency'
    ])

    if output is None:
        # Lift pandas' display limits so that the whole report is echoed.
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):
            click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))
def neo4j_node_summary(config: dict, address: str, username: str,
                       password: str, output: str = None):
    """
    Get a summary of all the nodes in a Neo4j database.

    \f
    For every distinct node category, counts the matching nodes grouped by
    the prefix of their ``id`` (the text before the first ``:``).

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (echoed when ``None``)

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    http_driver = http_gdb(address, username=username, password=password)

    query = """
    MATCH (x) RETURN DISTINCT x.category AS category
    """
    records = http_driver.query(query)

    categories = set()
    for record in records:
        category = record[0]
        if isinstance(category, str):
            categories.add(category)
        elif isinstance(category, (list, set, tuple)):
            # Multi-valued category: each member counts on its own.
            categories.update(category)
        elif category is None:
            continue
        else:
            error('Unrecognized value for node.category: {}'.format(category))

    # The category is passed as a query parameter instead of being pasted
    # into the Cypher text: a value containing a quote used to break the
    # query (and allowed injection). The query is now loop-invariant.
    query = """
    MATCH (x) WHERE x.category = {category} OR {category} IN x.category
    RETURN DISTINCT
        {category} AS category,
        split(x.id, ':')[0] AS prefix,
        COUNT(*) AS frequency
    ORDER BY category, frequency DESC;
    """

    rows = []
    with click.progressbar(categories, length=len(categories)) as bar:
        for category in bar:
            records = http_driver.query(query, params={'category': category})
            for record in records:
                # Positional columns follow the RETURN clause above.
                rows.append({
                    'category': record[0],
                    'prefix': record[1],
                    'frequency': record[2]
                })

    # Passing ``columns`` fixes the column order and keeps the frame valid
    # when no rows were found (selecting columns afterwards raised KeyError
    # on an empty frame).
    df = pd.DataFrame(rows, columns=['category', 'prefix', 'frequency'])

    if output is None:
        click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))
def neo4j_download(config, page_size, stop_after, subject_label, object_label,
                   edge_type, address, username, password, output,
                   output_type):
    """
    Download nodes and edges from Neo4j database.

    \f
    Parameters
    ----------
    config:
        Not used by this function
    page_size:
        Number of associations requested per query; falls back to ``1000``
        when missing or not positive
    stop_after:
        The max number of edges to fetch (no limit when ``None``)
    subject_label:
        Label used to filter the subject node (ignored unless a string)
    object_label:
        Label used to filter the object node (ignored unless a string)
    edge_type:
        Relationship type used to filter the edge (ignored unless a string)
    address:
        Address passed to ``http_gdb``
    username:
        Username for authentication
    password:
        Password for authentication
    output:
        Where to write the output; also passed to ``get_type`` to pick the
        transformer
    output_type:
        Not used by this function

    """
    if not is_writable(output):
        # Fall back to probing: try to create (and truncate) the file.
        try:
            with open(output, 'w+'):
                pass
        except (OSError, TypeError, ValueError):
            error(f'Cannot write to {output}')

    output_transformer = get_transformer(get_type(output))()
    G = output_transformer.graph

    driver = http_gdb(address, username=username, password=password)

    # A label filter is only added for values that are actually strings.
    subject_label = ':`{}`'.format(subject_label) if isinstance(subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(object_label, str) else ''
    edge_type = ':`{}`'.format(edge_type) if isinstance(edge_type, str) else ''

    match = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_type, object_label)

    results = driver.query('{} return count(*)'.format(match))

    click.echo('Using cyper query: {} return n, e, m'.format(match))

    # First column of the first row; an empty result is treated as zero
    # instead of leaving ``size`` unbound.
    size = next((row[0] for row in results), 0)

    if size == 0:
        click.echo('No data available')
        # Plain return instead of ``quit()``, which is a helper installed by
        # the ``site`` module and terminates the whole interpreter.
        return

    # Honour the caller's page size; the previous hard-coded 1_000 silently
    # discarded the argument. Keep 1_000 only as a fallback.
    if page_size is None or page_size <= 0:
        page_size = 1_000

    skip_flag = False
    limit_reached = False
    # NOTE(review): paging with skip/limit and no ORDER BY relies on the
    # server returning rows in a stable order -- confirm this is acceptable.
    with click.progressbar(
            range(0, size, page_size),
            label='Downloading {} many edges'.format(size)) as bar:
        for i in bar:
            q = '{} return n, e, m skip {} limit {}'.format(match, i, page_size)
            results = driver.query(q)

            for n, e, m in results:
                subject_attr = n['data']
                object_attr = m['data']
                edge_attr = e['data']

                if 'id' not in subject_attr or 'id' not in object_attr:
                    # Only announce the skipping once.
                    if not skip_flag:
                        click.echo('Skipping records that have no id attribute')
                        skip_flag = True
                    continue

                s = subject_attr['id']
                o = object_attr['id']

                # Fill in missing attributes from the Neo4j metadata.
                if 'edge_label' not in edge_attr:
                    edge_attr['edge_label'] = e['metadata']['type']

                if 'category' not in subject_attr:
                    subject_attr['category'] = n['metadata']['labels']

                if 'category' not in object_attr:
                    object_attr['category'] = m['metadata']['labels']

                if s not in G:
                    G.add_node(s, **subject_attr)
                if o not in G:
                    G.add_node(o, **object_attr)

                G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

                # ``>=`` so that at most ``stop_after`` edges are kept.
                if stop_after is not None and G.number_of_edges() >= stop_after:
                    limit_reached = True
                    break

            # Leave the paging loop too; otherwise every remaining page is
            # still requested after the limit was hit.
            if limit_reached:
                break

    output_transformer.save(output)
def neo4j_edge_summary(config, address, username, password, output=None):
    """
    Get a summary of all the edges in a Neo4j database.

    \f
    For every ordered pair of node categories, counts the relationships
    between nodes of those categories, grouped by relationship type and by
    the prefix of each node's ``id`` (the text before the first ``:``).
    The report is echoed when ``output`` is ``None`` and written as a
    ``|``-separated file otherwise.

    Parameters
    ----------
    config:
        Not used by this function
    address:
        Address passed to ``GraphDatabase.driver``
    username:
        Username for authentication
    password:
        Password for authentication
    output:
        Where to write the report (echoed when ``None``)

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    columns = [
        'subject_category', 'subject_prefix', 'object_category',
        'object_prefix', 'frequency'
    ]
    rows = []

    bolt_driver = GraphDatabase.driver(address, auth=(username, password))
    try:
        category_query = """
        MATCH (x) RETURN DISTINCT x.category AS category
        """
        categories = set()
        with bolt_driver.session() as session:
            # Consume the result while the session is still open.
            for record in session.run(category_query):
                category = record['category']
                if isinstance(category, str):
                    categories.add(category)
                elif isinstance(category, (list, set, tuple)):
                    # Multi-valued category: each member counts on its own.
                    categories.update(category)
                elif category is None:
                    continue
                else:
                    error('Unrecognized value for node.category: {}'.format(category))

        categories = list(categories)

        # NOTE(review): ``{name}`` is the legacy Cypher parameter syntax;
        # newer Neo4j servers expect ``$name`` -- confirm the target version.
        summary_query = """
        MATCH (n)-[r]-(m)
        WHERE
            (n.category = {category1} OR {category1} IN n.category) AND
            (m.category = {category2} OR {category2} IN m.category)
        RETURN DISTINCT
            {category1} AS subject_category,
            {category2} AS object_category,
            type(r) AS edge_type,
            split(n.id, ':')[0] AS subject_prefix,
            split(m.id, ':')[0] AS object_prefix,
            COUNT(*) AS frequency
        ORDER BY subject_category, object_category, frequency DESC;
        """

        combinations = [(c1, c2) for c1 in categories for c2 in categories]

        with click.progressbar(combinations, length=len(combinations)) as bar:
            for category1, category2 in bar:
                with bolt_driver.session() as session:
                    records = session.run(
                        summary_query,
                        category1=category1,
                        category2=category2)
                    for r in records:
                        # NOTE(review): ``edge_type`` is returned by the
                        # query but not included in the report -- confirm
                        # that is intended.
                        rows.append({name: r[name] for name in columns})
    finally:
        # Release the connection pool even if a query fails.
        bolt_driver.close()

    # Passing ``columns`` fixes the column order and keeps the frame valid
    # when no rows were found (selecting columns afterwards raised KeyError
    # on an empty frame).
    df = pd.DataFrame(rows, columns=columns)

    if output is None:
        # Lift pandas' display limits so that the whole report is echoed.
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):
            click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))