def load_transformer(input_paths: List[str], input_type: str = None) -> Transformer:
    """
    Creates a transformer for the appropriate file type and loads the data
    into it from file.

    Parameters
    ----------
    input_paths: List[str]
        Paths to the input files; all inputs must share one file type
    input_type: str
        The input file type; inferred from the paths when None

    Returns
    -------
    Transformer
        A transformer with all inputs parsed into its graph
    """
    if input_type is None:
        input_types = [get_type(i) for i in input_paths]
        # All inputs must agree on a single file type; the previous loop
        # compared every entry against element 0 (including itself) — a
        # set-based check states the invariant directly.
        if len(set(input_types)) > 1:
            error(
                """
            Each input file must have the same file type.
            Try setting the --input-type parameter to enforce a single
            type.
            """
            )
        input_type = input_types[0]

    transformer_constructor = get_transformer(input_type)
    if transformer_constructor is None:
        error('Inputs do not have a recognized type: ' + str(get_file_types()))

    t = transformer_constructor()
    for i in input_paths:
        t.parse(i, input_type)
    t.report()
    return t
def build_transformer(path: str, input_type: str = None) -> Transformer:
    """
    Build an empty transformer suited to the file at the given path.

    Parameters
    ----------
    path: str
        Path to a file
    input_type: str
        The file type; inferred from the path when None

    Returns
    -------
    Transformer
        A new, unpopulated transformer for that file type
    """
    file_type = get_type(path) if input_type is None else input_type
    constructor = get_transformer(file_type)
    if constructor is None:
        error('File does not have a recognized type: ' + str(get_file_types()))
    return constructor()
def transform_and_save(t: Transformer, output_path: str, output_type: str = None):
    """
    Creates a transformer with the appropriate file type from the given
    transformer, and applies that new transformation and saves to file.

    Parameters
    ----------
    t: Transformer
        The source transformer whose graph is to be serialized
    output_path: str
        Path of the output file
    output_type: str
        The output file type; inferred from the path when None
    """
    if output_type is None:
        output_type = get_type(output_path)

    output_transformer = get_transformer(output_type)
    if output_transformer is None:
        error('Output does not have a recognized type: ' + str(get_file_types()))

    # Fix: the keyword was previously misspelled 'extention', so the value
    # never reached save() as its 'extension' argument; sibling call sites
    # in this file pass extension=<type>.
    kwargs = {'extension': output_type}

    w = output_transformer(t.graph)
    result_path = w.save(output_path, **kwargs)

    if result_path is not None and os.path.isfile(result_path):
        click.echo("File created at: " + result_path)
    elif os.path.isfile(output_path):
        click.echo("File created at: " + output_path)
    else:
        error("Could not create file.")
def load_and_merge(yaml_file: str) -> Transformer:
    """Load and merge sources defined in the config YAML.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        kgx.Transformer: The merged transformer that contains the merged graph.

    """
    config = parse_load_config(yaml_file)
    transformers: List = []

    # Build one transformer per source listed in the YAML.
    for key, target in config['target'].items():
        logging.info("Loading {}".format(key))
        source_type = target['type']
        if source_type in get_file_types():
            # loading from a file
            file_transformer = get_transformer(source_type)()
            for filename in target['filename']:
                file_transformer.parse(filename, input_format='tsv')
            transformers.append(file_transformer)
        elif source_type == 'neo4j':
            neo_transformer = NeoTransformer(None, target['uri'],
                                             target['username'],
                                             target['password'])
            neo_transformer.load()
            transformers.append(neo_transformer)
        else:
            logging.error("type {} not yet supported".format(source_type))

    # Collapse all subgraphs into a single merged graph.
    merged_transformer = Transformer()
    merged_transformer.merge_graphs([x.graph for x in transformers])
    merged_transformer.report()

    # Optionally write the merged graph out.
    if 'destination' in config:
        destination = config['destination']
        if destination['type'] in ['csv', 'tsv', 'ttl', 'json', 'tar']:
            out = get_transformer(destination['type'])(merged_transformer.graph)
            out.save(destination['filename'], extension=destination['type'])
        elif destination['type'] == 'neo4j':
            out = NeoTransformer(merged_transformer.graph,
                                 uri=destination['uri'],
                                 username=destination['username'],
                                 password=destination['password'])
            out.save_with_unwind()
        else:
            logging.error(
                "type {} not yet supported for KGX load-and-merge operation.".
                format(destination['type']))

    return merged_transformer
def load_and_merge(config: dict, load_config):
    """
    Load nodes and edges from files and KGs, as defined in a config YAML, and merge them into a single graph.
    The merge happens in-memory. This merged graph can then be written to a local/remote Neo4j instance OR be serialized into a file.
    \f

    .. note::
        Everything here is driven by the ``load_config`` YAML.

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    load_config:
        Path to a KGX compatible config YAML

    """
    with open(load_config, 'r') as YML:
        cfg = yaml.load(YML, Loader=yaml.FullLoader)

    transformers = []
    for key in cfg['target']:
        target = cfg['target'][key]
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            transformer = get_transformer(target['type'])()
            transformer.parse(target['filename'])
            transformers.append(transformer)
        elif target['type'] == 'neo4j':
            transformer = kgx.NeoTransformer(None, target['uri'],
                                             target['username'],
                                             target['password'])
            # TODO: support filters
            transformer.load()
            transformers.append(transformer)
        else:
            logging.error(
                "type {} not yet supported for KGX load-and-merge operation.".
                format(target['type']))

    merged_transformer = Transformer()
    merged_transformer.merge_graphs([x.graph for x in transformers])

    destination = cfg['destination']
    if destination['type'] in ['csv', 'tsv', 'ttl', 'json', 'tar']:
        # Fix: the destination transformer was previously constructed without
        # the merged graph, so an EMPTY graph was serialized to file; sibling
        # implementations in this file pass the merged graph here.
        destination_transformer = get_transformer(destination['type'])(
            merged_transformer.graph)
        destination_transformer.save(destination['filename'])
    elif destination['type'] == 'neo4j':
        destination_transformer = kgx.NeoTransformer(
            merged_transformer.graph,
            uri=destination['uri'],
            username=destination['username'],
            password=destination['password'])
        destination_transformer.save_with_unwind()
    else:
        logging.error(
            "type {} not yet supported for KGX load-and-merge operation.".
            format(destination['type']))
def transform(config: dict, inputs: List[str], input_type: str, output: str,
              output_type: str, mapping: str, preserve: bool):
    """
    Transform a Knowledge Graph from one serialization form to another.
    \f

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    inputs: List[str]
        A list of files that contains nodes/edges
    input_type: str
        The input type
    output: str
        The output file
    output_type: str
        The output type
    mapping: str
        A mapping file (TSV) for remapping node identifiers
    preserve: bool
        Whether to preserve old identifiers before remapping

    """
    # load
    input_transformer = load_transformer(inputs, input_type)
    if mapping is not None:
        # remap: the mapping file is a two-column TSV of (old id, new id)
        mapping_dictionary = {}
        with open(mapping) as M:
            for line in M:
                element = line.rstrip().split('\t')
                mapping_dictionary[element[0]] = element[1]
        logging.info('Performing remapping based on {}'.format(mapping))
        map_graph(input_transformer.graph, mapping=mapping_dictionary,
                  preserve=preserve)
    # save
    output_transformer = get_transformer(output_type)
    if output_transformer is None:
        # Fix: execution previously continued past this error and crashed
        # with an opaque "TypeError: 'NoneType' object is not callable";
        # fail fast with an explicit exception instead.
        logging.error('Output does not have a recognized type: '
                      + str(get_file_types()))
        raise ValueError('Output does not have a recognized type: '
                         + str(get_file_types()))
    w = output_transformer(input_transformer.graph)
    w.save(output, extension=output_type)
def _apply_target_filters(target: dict, transformer) -> None:
    """Apply any node/edge filters defined on a target to its transformer."""
    if 'filters' in target:
        filters = target['filters']
        node_filters = filters['node_filters'] if 'node_filters' in filters else {}
        edge_filters = filters['edge_filters'] if 'edge_filters' in filters else {}
        for k, v in node_filters.items():
            transformer.set_node_filter(k, set(v))
        for k, v in edge_filters.items():
            transformer.set_edge_filter(k, set(v))
        logging.info(f"with node filters: {node_filters}")
        logging.info(f"with edge filters: {edge_filters}")


def load_and_merge(config: dict, load_config):
    """
    Load nodes and edges from files and KGs, as defined in a config YAML, and merge them into a single graph.
    The merge happens in-memory. This merged graph can then be written to a local/remote Neo4j instance OR be serialized into a file.
    \f

    .. note::
        Everything here is driven by the ``load_config`` YAML.

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    load_config:
        Path to a KGX compatible config YAML

    """
    with open(load_config, 'r') as YML:
        cfg = yaml.load(YML, Loader=yaml.FullLoader)

    transformers = []
    for key in cfg['target']:
        target = cfg['target'][key]
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            transformer = get_transformer(target['type'])()
            if target['type'] in {'tsv', 'neo4j'}:
                # currently supporting filters only for TSV and Neo4j
                _apply_target_filters(target, transformer)
            for f in target['filename']:
                transformer.parse(f, input_format=target['type'])
            transformers.append(transformer)
        elif target['type'] == 'neo4j':
            transformer = kgx.NeoTransformer(None, target['uri'],
                                             target['username'],
                                             target['password'])
            # The identical filter-application code was previously duplicated
            # inline here; both branches now share one helper.
            _apply_target_filters(target, transformer)
            transformer.load()
            transformers.append(transformer)
        else:
            # Fix: this message string was previously broken in two mid-literal.
            logging.error(
                "type {} not yet supported for KGX load-and-merge operation.".
                format(target['type']))

    merged_graph = merge_all_graphs([x.graph for x in transformers])

    destination = cfg['destination']
    if destination['type'] in ['csv', 'tsv', 'ttl', 'json', 'tar']:
        destination_transformer = get_transformer(
            destination['type'])(merged_graph)
        destination_transformer.save(destination['filename'])
    elif destination['type'] == 'neo4j':
        destination_transformer = kgx.NeoTransformer(
            merged_graph,
            uri=destination['uri'],
            username=destination['username'],
            password=destination['password'])
        destination_transformer.save()
    else:
        logging.error(
            "type {} not yet supported for KGX load-and-merge operation.".
            format(destination['type']))
Parameters ---------- config: dict A dictionary containing the configuration for kgx.cli debug: bool Whether to print debug messages """ config.debug = debug if debug: logging.basicConfig(level=logging.DEBUG) @cli.command('node-summary') @click.argument('filepath', type=click.Path(exists=True), required=True) @click.option('--input-type', type=click.Choice(get_file_types())) @click.option('--max-rows', '-m', type=int, help='The maximum number of rows to return') @click.option('--output', '-o', type=click.Path(exists=False)) @pass_config def node_summary(config: dict, filepath: str, input_type: str, max_rows: int, output: str): """ Loads and summarizes a knowledge graph node set, where the input is a file. \f Parameters ---------- config: dict
def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
    """Load and merge sources defined in the config YAML.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        networkx.MultiDiGraph: The merged graph.

    """
    gm = GraphMerge()
    config = parse_load_config(yaml_file)
    transformers: List = []

    # make sure all files exist before we start load
    for key in config['target']:
        target = config['target'][key]
        logging.info("Checking that file exist for {}".format(key))
        if target['type'] in get_file_types():
            for f in target['filename']:
                if not os.path.exists(f) or not os.path.isfile(f):
                    # NOTE(review): the placeholders here are never
                    # interpolated — f/key/yaml_file are passed as extra
                    # exception args, not via .format(); confirm intent.
                    raise FileNotFoundError(
                        "File {} for transform {} in yaml file {} "
                        "doesn't exist! Dying.", f, key, yaml_file)

    # read all the sources defined in the YAML
    for key in config['target']:
        target = config['target'][key]
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            transformer = get_transformer(target['type'])()
            for f in target['filename']:
                # NOTE(review): input_format is hard-coded to 'tsv' even when
                # target['type'] is another file type — confirm intended.
                transformer.parse(f, input_format='tsv')
            transformer.graph.name = key
            transformers.append(transformer)
        elif target['type'] == 'neo4j':
            transformer = NeoTransformer(None, target['uri'],
                                         target['username'],
                                         target['password'])
            transformer.load()
            transformers.append(transformer)
            transformer.graph.name = key
        else:
            logging.error("type {} not yet supported".format(target['type']))
        # Per-source stats file, named after the target key; relies on
        # `transformer` having been bound by one of the branches above.
        stats_filename = f"{key}_stats.yaml"
        generate_graph_stats(transformer.graph, key, stats_filename)

    # merge all subgraphs into a single graph
    merged_graph = gm.merge_all_graphs([x.graph for x in transformers])
    merged_graph.name = 'merged_graph'
    generate_graph_stats(merged_graph, merged_graph.name,
                         f"merged_graph_stats.yaml")

    # write the merged graph
    if 'destination' in config:
        for _, destination in config['destination'].items():
            if destination['type'] == 'neo4j':
                destination_transformer = NeoTransformer(
                    merged_graph,
                    uri=destination['uri'],
                    username=destination['username'],
                    password=destination['password'])
                destination_transformer.save_with_unwind()
            elif destination['type'] in get_file_types():
                destination_transformer = get_transformer(
                    destination['type'])(merged_graph)
                destination_transformer.save(destination['filename'],
                                             extension=destination['type'])
            else:
                logging.error(
                    "type {} not yet supported for KGX load-and-merge operation."
                    .format(destination['type']))
    return merged_graph
def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
    """Load and merge sources defined in the config YAML.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        networkx.MultiDiGraph: The merged graph.

    """
    config = parse_load_config(yaml_file)
    transformers: List = []

    # make sure all files exist before we start load
    for key in config['target']:
        target = config['target'][key]
        logging.info("Checking that file exist for {}".format(key))
        if target['type'] in get_file_types():
            for f in target['filename']:
                if not os.path.exists(f) or not os.path.isfile(f):
                    # Fix: the placeholders were previously never
                    # interpolated — the values were passed as extra
                    # exception args instead of via .format().
                    raise FileNotFoundError(
                        "File {} for transform {} in yaml file {} "
                        "doesn't exist! Dying.".format(f, key, yaml_file))

    # read all the sources defined in the YAML
    for key in config['target']:
        target = config['target'][key]
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            try:
                transformer = get_transformer(target['type'])()
                if target['type'] in {'tsv', 'neo4j'}:
                    if 'filters' in target:
                        apply_filters(target, transformer)
                for f in target['filename']:
                    transformer.parse(f, input_format='tsv')
                transformer.graph.name = key
                if 'operations' in target:
                    apply_operations(target, transformer)
                transformers.append(transformer)
            except Exception:
                # Fix: was a bare `except:`, which also swallows
                # SystemExit/KeyboardInterrupt.
                logging.error("Failed loading {}".format(f))
        elif target['type'] == 'neo4j':
            transformer = NeoTransformer(None, target['uri'],
                                         target['username'],
                                         target['password'])
            if 'filters' in target:
                apply_filters(target, transformer)
            transformer.load()
            if 'operations' in target:
                apply_operations(target, transformer)
            transformers.append(transformer)
            transformer.graph.name = key
        else:
            logging.error("type {} not yet supported".format(target['type']))
        # NOTE(review): when the target type is unsupported, `transformer`
        # is stale (previous iteration) or unbound on the first iteration —
        # confirm whether stats should be skipped in that case.
        stats_filename = f"{key}_stats.yaml"
        generate_graph_stats(transformer.graph, key, stats_filename)

    # merge all subgraphs into a single graph
    merged_graph = merge_all_graphs([x.graph for x in transformers])
    merged_graph.name = 'merged_graph'
    generate_graph_stats(merged_graph, merged_graph.name,
                         "merged_graph_stats.yaml",
                         ['provided_by'], ['provided_by'])

    # write the merged graph
    if 'destination' in config:
        for _, destination in config['destination'].items():
            if destination['type'] == 'neo4j':
                destination_transformer = NeoTransformer(
                    merged_graph,
                    uri=destination['uri'],
                    username=destination['username'],
                    password=destination['password'])
                destination_transformer.save()
            elif destination['type'] in get_file_types():
                destination_transformer = get_transformer(
                    destination['type'])(merged_graph)
                # presumably an archive mode string ('w:gz' = gzip) passed
                # through to the writer for TSV output — confirm downstream
                mode = 'w:gz' if destination['type'] in {'tsv'} else None
                if destination['type'] in {'nt', 'nt.gz', 'ttl'}:
                    # RDF serializations get explicit property types
                    destination_transformer.set_property_types(PROPERTY_TYPES)
                destination_transformer.save(destination['filename'],
                                             output_format=destination['type'],
                                             mode=mode)
            else:
                logging.error(
                    "type {} not yet supported for KGX load-and-merge operation."
                    .format(destination['type']))
    return merged_graph