Exemplo n.º 1
0
def load_and_merge(yaml_file: str) -> Transformer:
    """Read every source named in a KGX config YAML and merge them.

    Each entry under ``config['target']`` is loaded into its own
    transformer (file-based types are parsed, ``neo4j`` is loaded from the
    database), the resulting graphs are merged, and the merged graph is
    written out when the config has a ``destination`` section.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        kgx.Transformer: The transformer holding the merged graph.

    """
    config = parse_load_config(yaml_file)
    loaded: List = []

    # Build one transformer per configured source.
    for name, source in config['target'].items():
        logging.info("Loading {}".format(name))
        source_type = source['type']

        if source_type in get_file_types():
            # File-backed source: parse every listed file into one transformer.
            file_transformer = get_transformer(source_type)()
            for path in source['filename']:
                file_transformer.parse(path, input_format='tsv')
            loaded.append(file_transformer)
            continue

        if source_type == 'neo4j':
            neo_transformer = NeoTransformer(None, source['uri'],
                                             source['username'],
                                             source['password'])
            neo_transformer.load()
            loaded.append(neo_transformer)
            continue

        # Unknown source types are reported and skipped.
        logging.error("type {} not yet supported".format(source_type))

    # Combine all loaded graphs into a single one.
    merged_transformer = Transformer()
    graphs = [t.graph for t in loaded]
    merged_transformer.merge_graphs(graphs)
    merged_transformer.report()

    # Nothing more to do when no destination is configured.
    if 'destination' not in config:
        return merged_transformer

    destination = config['destination']
    dest_type = destination['type']
    if dest_type in ('csv', 'tsv', 'ttl', 'json', 'tar'):
        writer_cls = get_transformer(dest_type)
        writer = writer_cls(merged_transformer.graph)
        writer.save(destination['filename'], extension=dest_type)
    elif dest_type == 'neo4j':
        writer = NeoTransformer(merged_transformer.graph,
                                uri=destination['uri'],
                                username=destination['username'],
                                password=destination['password'])
        writer.save_with_unwind()
    else:
        logging.error(
            "type {} not yet supported for KGX load-and-merge operation.".
            format(dest_type))

    return merged_transformer
Exemplo n.º 2
0
def test_neo_to_graph_transform():
    """
    Load a graph from Neo4j, print its report, and save it through a
    PandasTransformer to ``neo_graph.csv`` inside ``target_dir``.
    """
    neo = NeoTransformer(host='localhost', port='7474', username='******', password='******')
    neo.load()
    neo.report()

    # Hand the loaded graph to a PandasTransformer for writing.
    pandas_transformer = PandasTransformer(neo.graph)
    output_path = os.path.join(target_dir, "neo_graph.csv")
    pandas_transformer.save(output_path)
Exemplo n.º 3
0
def test_neo_to_graph_transform():
    """
    Load from Neo4j with default connection settings, print the report,
    and save the result through a PandasTransformer.
    """
    neo = NeoTransformer()
    neo.load()
    neo.report()

    # The transformer itself (not its graph) is passed on here.
    pandas_transformer = PandasTransformer(neo)
    pandas_transformer.save("target/neo_graph.csv")
Exemplo n.º 4
0
def test_neo_to_graph_transform():
    """
    Load a graph from the default Neo4j instance, print its report, and
    save it through a PandasTransformer to ``neo_graph.csv`` in ``target_dir``.
    """
    connection = dict(uri=DEFAULT_NEO4J_URL,
                      username=DEFAULT_NEO4J_USERNAME,
                      password=DEFAULT_NEO4J_PASSWORD)
    neo = NeoTransformer(**connection)
    neo.load()
    neo.report()

    # Hand the loaded graph to a PandasTransformer for writing.
    pandas_transformer = PandasTransformer(neo.graph)
    output_path = os.path.join(target_dir, "neo_graph.csv")
    pandas_transformer.save(output_path)
Exemplo n.º 5
0
def test_neo_to_graph_transform():
    """
    Load from Neo4j and save through a PandasTransformer.

    The test is currently disabled: it returns immediately, so none of the
    statements after the ``return`` are executed.
    """
    # Early exit keeps this test from contacting a Neo4j instance.
    return

    neo = NeoTransformer(host='localhost', port='7474', username='', password='')
    neo.load()
    neo.report()
    pandas_transformer = PandasTransformer(neo.graph)
    pandas_transformer.save("target/neo_graph.csv")
Exemplo n.º 6
0
                    help='A filter that can be applied to nodes')
parser.add_argument('--edge_filter',
                    action='append',
                    help='A filter that can be applied to edges')
parser.add_argument('--uri',
                    help='URI/URL for Neo4j (including port)',
                    default='localhost:7474')
parser.add_argument('--username', help='username', default='neo4j')
parser.add_argument('--password', help='password', default='demo')
args = parser.parse_args()

# Initialize NeoTransformer
nt = NeoTransformer(None,
                    uri=args.uri,
                    username=args.username,
                    password=args.password)

# Filters arrive as KEY=VALUE strings. Split on the first '=' only, so a
# value may itself contain '='. The value is wrapped in a one-element set
# literal: set(v) on a string would yield a set of its single characters.
# The options are None when not given on the command line, hence `or []`.
for f in args.node_filter or []:
    k, v = f.split('=', 1)
    nt.set_node_filter(k, {v})

for f in args.edge_filter or []:
    k, v = f.split('=', 1)
    nt.set_edge_filter(k, {v})

# Read from Neo4j with the given filter constraints (if any)
nt.load()
nt.report()
Exemplo n.º 7
0
                    action='append',
                    help='A filter that can be applied to node and/or edges')
parser.add_argument('--host',
                    help='host to connect with Neo4j',
                    default='localhost')
parser.add_argument('--bolt_port',
                    help='Bolt port to connect with Neo4j',
                    default='7687')
parser.add_argument('--username',
                    help='username (default: neo4j)',
                    default='neo4j')
parser.add_argument('--password',
                    help='password (default: demo)',
                    default='demo')
args = parser.parse_args()

# Initialize NeoTransformer
n = NeoTransformer(None, args.host, {'bolt': args.bolt_port}, args.username,
                   args.password)

# Filters arrive as KEY=VALUE strings. `args.filter` is None when the option
# was not given, so `or []` covers both the missing and the empty case.
# The loop variable is not called `filter`, to avoid shadowing the builtin,
# and the split is limited to the first '=' so values may contain '='.
for filter_spec in args.filter or []:
    k, v = filter_spec.split('=', 1)
    # Set filters
    n.set_filter(k, v)

# Read from Neo4j with the given filter constraints (if any)
n.load()
n.report()
Exemplo n.º 8
0
def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
    """Load and merge sources defined in the config YAML.

    All file-based sources are checked for existence up front. Each source
    is then loaded into its own transformer, and its graph is passed to
    ``generate_graph_stats`` with ``<key>_stats.yaml`` as the stats file
    name. The graphs are merged, stats for the merged graph are generated,
    and the merged graph is written to every configured destination.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        networkx.MultiDiGraph: The merged graph.

    Raises:
        FileNotFoundError: If a file listed for a file-based source is
            missing or is not a regular file.

    """
    gm = GraphMerge()
    config = parse_load_config(yaml_file)
    transformers: List = []

    # make sure all files exist before we start load
    for key, target in config['target'].items():
        logging.info("Checking that file exist for {}".format(key))
        if target['type'] in get_file_types():
            for f in target['filename']:
                # isfile() is already False for paths that do not exist,
                # so a separate exists() check is not needed.
                if not os.path.isfile(f):
                    # Format the message here: exception constructors do not
                    # interpolate extra positional arguments.
                    raise FileNotFoundError(
                        "File {} for transform {}  in yaml file {} "
                        "doesn't exist! Dying.".format(f, key, yaml_file))

    # read all the sources defined in the YAML
    for key, target in config['target'].items():
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            transformer = get_transformer(target['type'])()
            for f in target['filename']:
                transformer.parse(f, input_format='tsv')
            transformer.graph.name = key
            transformers.append(transformer)
        elif target['type'] == 'neo4j':
            transformer = NeoTransformer(None, target['uri'],
                                         target['username'],
                                         target['password'])
            transformer.load()
            transformer.graph.name = key
            transformers.append(transformer)
        else:
            logging.error("type {} not yet supported".format(target['type']))
            # No transformer was created for this source; skip the stats
            # step so it does not run on a stale or unbound transformer.
            continue
        stats_filename = f"{key}_stats.yaml"
        generate_graph_stats(transformer.graph, key, stats_filename)

    # merge all subgraphs into a single graph
    merged_graph = gm.merge_all_graphs([x.graph for x in transformers])
    merged_graph.name = 'merged_graph'
    generate_graph_stats(merged_graph, merged_graph.name,
                         "merged_graph_stats.yaml")

    # write the merged graph
    if 'destination' in config:
        for destination in config['destination'].values():
            if destination['type'] == 'neo4j':
                destination_transformer = NeoTransformer(
                    merged_graph,
                    uri=destination['uri'],
                    username=destination['username'],
                    password=destination['password'])
                destination_transformer.save_with_unwind()
            elif destination['type'] in get_file_types():
                destination_transformer = get_transformer(
                    destination['type'])(merged_graph)
                destination_transformer.save(destination['filename'],
                                             extension=destination['type'])
            else:
                logging.error(
                    "type {} not yet supported for KGX load-and-merge operation."
                    .format(destination['type']))

    return merged_graph
Exemplo n.º 9
0
def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
    """Load and merge sources defined in the config YAML.

    All file-based sources are checked for existence up front. Each source
    is then loaded into its own transformer, with the optional ``filters``
    and ``operations`` from its config applied, and its graph is passed to
    ``generate_graph_stats``. The graphs are merged, stats for the merged
    graph are generated, and the merged graph is written to every
    configured destination.

    A file-based source that fails while loading is logged and skipped; it
    is not merged and no stats are generated for it.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        networkx.MultiDiGraph: The merged graph.

    Raises:
        FileNotFoundError: If a file listed for a file-based source is
            missing or is not a regular file.

    """
    config = parse_load_config(yaml_file)
    transformers: List = []

    # make sure all files exist before we start load
    for key, target in config['target'].items():
        logging.info("Checking that file exist for {}".format(key))
        if target['type'] in get_file_types():
            for f in target['filename']:
                # isfile() is already False for paths that do not exist,
                # so a separate exists() check is not needed.
                if not os.path.isfile(f):
                    # Format the message here: exception constructors do not
                    # interpolate extra positional arguments.
                    raise FileNotFoundError(
                        "File {} for transform {}  in yaml file {} "
                        "doesn't exist! Dying.".format(f, key, yaml_file))

    # read all the sources defined in the YAML
    for key, target in config['target'].items():
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            # Track the file being parsed so a failure can be reported
            # accurately, even when it happens before the first parse.
            current_file = None
            try:
                transformer = get_transformer(target['type'])()
                if target['type'] in {'tsv', 'neo4j'} and 'filters' in target:
                    apply_filters(target, transformer)
                for current_file in target['filename']:
                    transformer.parse(current_file, input_format='tsv')
                    transformer.graph.name = key
                if 'operations' in target:
                    apply_operations(target, transformer)
                transformers.append(transformer)
            except Exception:
                # Best effort: log with traceback and move on to the next
                # source. Catching Exception (not a bare except) lets
                # KeyboardInterrupt/SystemExit propagate.
                logging.exception("Failed loading {} (last file: {})".format(
                    key, current_file))
                # Skip stats: the transformer is incomplete or unbound.
                continue
        elif target['type'] == 'neo4j':
            transformer = NeoTransformer(None, target['uri'],
                                         target['username'],
                                         target['password'])
            if 'filters' in target:
                apply_filters(target, transformer)
            transformer.load()
            if 'operations' in target:
                apply_operations(target, transformer)
            transformers.append(transformer)
            transformer.graph.name = key
        else:
            logging.error("type {} not yet supported".format(target['type']))
            # No transformer was created for this source; skip the stats
            # step so it does not run on a stale or unbound transformer.
            continue
        stats_filename = f"{key}_stats.yaml"
        generate_graph_stats(transformer.graph, key, stats_filename)

    # merge all subgraphs into a single graph
    merged_graph = merge_all_graphs([x.graph for x in transformers])
    merged_graph.name = 'merged_graph'
    generate_graph_stats(merged_graph, merged_graph.name,
                         "merged_graph_stats.yaml", ['provided_by'],
                         ['provided_by'])

    # write the merged graph
    if 'destination' in config:
        for destination in config['destination'].values():
            if destination['type'] == 'neo4j':
                destination_transformer = NeoTransformer(
                    merged_graph,
                    uri=destination['uri'],
                    username=destination['username'],
                    password=destination['password'])
                destination_transformer.save()
            elif destination['type'] in get_file_types():
                destination_transformer = get_transformer(
                    destination['type'])(merged_graph)
                mode = 'w:gz' if destination['type'] in {'tsv'} else None
                if destination['type'] in {'nt', 'nt.gz', 'ttl'}:
                    destination_transformer.set_property_types(PROPERTY_TYPES)
                destination_transformer.save(destination['filename'],
                                             output_format=destination['type'],
                                             mode=mode)
            else:
                logging.error(
                    "type {} not yet supported for KGX load-and-merge operation."
                    .format(destination['type']))

    return merged_graph