Пример #1
0
def load_transformer(input_paths:List[str], input_type:str=None) -> Transformer:
    """
    Create a transformer for the given input files and parse each file into it.

    Parameters
    ----------
    input_paths: List[str]
        Paths of the files to load.
    input_type: str
        The type used to look up the transformer; it is also passed to
        ``parse`` for every path. When ``None``, it is derived by calling
        ``get_type`` on each path, and all paths must yield the same type.

    Returns
    -------
    Transformer
        The transformer after every path has been parsed and ``report()``
        has been called on it.
    """
    if input_type is None:
        input_types = [get_type(path) for path in input_paths]
        # Check uniformity once so the mismatch is reported at most one time,
        # rather than once per offending file.
        if any(found != input_types[0] for found in input_types):
            error(
                """
                Each input file must have the same file type.
                Try setting the --input-type parameter to enforce a single
                type.
                """
            )
        # With no input paths there is nothing to derive the type from, so
        # input_type stays None (same as before).
        if input_types:
            input_type = input_types[0]

    transformer_constructor = get_transformer(input_type)

    if transformer_constructor is None:
        error('Inputs do not have a recognized type: ' + str(get_file_types()))

    t = transformer_constructor()
    for path in input_paths:
        t.parse(path, input_type)

    t.report()

    return t
Пример #2
0
def build_transformer(path:str, input_type:str=None) -> Transformer:
    """
    Instantiate the transformer registered for a file's type.

    When ``input_type`` is ``None`` the type is looked up from ``path`` with
    ``get_type``. ``error`` is called when ``get_transformer`` returns no
    constructor for that type. The constructor is called without arguments
    and its result is returned.
    """
    resolved_type = get_type(path) if input_type is None else input_type
    transformer_cls = get_transformer(resolved_type)
    if transformer_cls is None:
        error('File does not have a recognized type: ' + str(get_file_types()))
    return transformer_cls()
Пример #3
0
def transform_and_save(t:Transformer, output_path:str, output_type:str=None):
    """
    Wrap the graph of ``t`` in a transformer of the appropriate output type
    and save it to file.

    When ``output_type`` is ``None`` it is derived from ``output_path`` with
    ``get_type``. After saving, the path of the created file is echoed;
    ``error`` is called when neither the path returned by ``save`` nor
    ``output_path`` exists as a file.
    """
    target_type = output_type if output_type is not None else get_type(output_path)

    writer_cls = get_transformer(target_type)
    if writer_cls is None:
        error('Output does not have a recognized type: ' + str(get_file_types()))

    writer = writer_cls(t.graph)
    # NOTE(review): the keyword really is spelled 'extention' here; confirm it
    # against the save() signature before "correcting" it.
    result_path = writer.save(output_path, extention=target_type)

    # Prefer the path reported by save(); fall back to the requested path.
    if result_path is not None and os.path.isfile(result_path):
        created_path = result_path
    elif os.path.isfile(output_path):
        created_path = output_path
    else:
        created_path = None

    if created_path is None:
        error("Could not create file.")
    else:
        click.echo("File created at: " + created_path)
Пример #4
0
def load_and_merge(yaml_file: str) -> Transformer:
    """Load every source listed in the config YAML and merge their graphs.

    Each entry under ``target`` is loaded either from its files or from
    Neo4j, depending on its ``type``; entries of any other type are logged
    as errors and skipped. If the config has a ``destination`` entry, the
    merged graph is also written there.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        kgx.Transformer: The merged transformer that contains the merged graph.

    """
    config = parse_load_config(yaml_file)
    loaded: List = []

    # Load each configured source into its own transformer.
    for name, source in config['target'].items():
        logging.info("Loading {}".format(name))
        source_type = source['type']
        if source_type in get_file_types():
            file_transformer = get_transformer(source_type)()
            for filename in source['filename']:
                # NOTE(review): input_format is fixed to 'tsv' regardless of
                # the source type; confirm this is intended.
                file_transformer.parse(filename, input_format='tsv')
            loaded.append(file_transformer)
        elif source_type == 'neo4j':
            neo_transformer = NeoTransformer(None, source['uri'],
                                             source['username'],
                                             source['password'])
            neo_transformer.load()
            loaded.append(neo_transformer)
        else:
            logging.error("type {} not yet supported".format(source_type))

    # Combine the individual graphs into one.
    merged_transformer = Transformer()
    merged_transformer.merge_graphs([item.graph for item in loaded])
    merged_transformer.report()

    # Writing the result is optional.
    if 'destination' not in config:
        return merged_transformer

    destination = config['destination']
    destination_type = destination['type']
    if destination_type in ('csv', 'tsv', 'ttl', 'json', 'tar'):
        writer_cls = get_transformer(destination_type)
        writer = writer_cls(merged_transformer.graph)
        writer.save(destination['filename'], extension=destination_type)
    elif destination_type == 'neo4j':
        writer = NeoTransformer(
            merged_transformer.graph,
            uri=destination['uri'],
            username=destination['username'],
            password=destination['password'])
        writer.save_with_unwind()
    else:
        logging.error(
            "type {} not yet supported for KGX load-and-merge operation.".
            format(destination_type))

    return merged_transformer
Пример #5
0
def load_and_merge(config: dict, load_config):
    """
    Load nodes and edges from files and KGs, as defined in a config YAML, and merge them into a single graph.
    The merge happens in-memory. This merged graph can then be written to a local/remote Neo4j instance
    OR be serialized into a file.
    \f

    .. note::
        Everything here is driven by the ``load_config`` YAML.

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli (not read by
        this function)
    load_config: str
        Path to the YAML file. Its ``target`` mapping lists the sources to
        load and its ``destination`` entry says where the merged graph goes.

    """

    # NOTE(review): yaml.load with FullLoader is kept as-is; prefer
    # yaml.safe_load if this config file can come from an untrusted source.
    with open(load_config, 'r') as YML:
        cfg = yaml.load(YML, Loader=yaml.FullLoader)

    transformers = []
    for key in cfg['target']:
        target = cfg['target'][key]
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            transformer = get_transformer(target['type'])()
            transformer.parse(target['filename'])
            transformers.append(transformer)
        elif target['type'] == 'neo4j':
            transformer = kgx.NeoTransformer(None, target['uri'],
                                             target['username'],
                                             target['password'])
            # TODO: support filters
            transformer.load()
            transformers.append(transformer)
        else:
            logging.error(
                "type {} not yet supported for KGX load-and-merge operation.".
                format(target['type']))

    merged_transformer = Transformer()
    merged_transformer.merge_graphs([x.graph for x in transformers])

    destination = cfg['destination']
    if destination['type'] in ['csv', 'tsv', 'ttl', 'json', 'tar']:
        # Hand the merged graph to the writer; constructing it without a
        # graph would save something other than the merge result.
        destination_transformer = get_transformer(destination['type'])(
            merged_transformer.graph)
        destination_transformer.save(destination['filename'])
    elif destination['type'] == 'neo4j':
        destination_transformer = kgx.NeoTransformer(
            merged_transformer.graph,
            uri=destination['uri'],
            username=destination['username'],
            password=destination['password'])
        destination_transformer.save_with_unwind()
    else:
        logging.error(
            "type {} not yet supported for KGX load-and-merge operation.".
            format(destination['type']))
Пример #6
0
def transform(config: dict, inputs: List[str], input_type: str, output: str,
              output_type: str, mapping: str, preserve: bool):
    """
    Transform a Knowledge Graph from one serialization form to another.
    \f

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    inputs: List[str]
        A list of files that contains nodes/edges
    input_type: str
        The input type
    output: str
        The output file
    output_type: str
        The output type
    mapping: str
        A mapping file (TSV) for remapping node identifiers. The first two
        tab-separated columns of each line are used; blank lines are
        skipped, and lines with a single column are skipped with a warning.
    preserve: bool
        Whether to preserve old identifiers before remapping

    """
    # load
    input_transformer = load_transformer(inputs, input_type)

    if mapping is not None:
        # remap
        mapping_dictionary = {}
        with open(mapping) as M:
            for line in M:
                element = line.rstrip().split('\t')
                if len(element) < 2:
                    # A blank or single-column line has no second column to
                    # map to; skip it instead of failing with IndexError.
                    if element[0]:
                        logging.warning(
                            'Skipping malformed mapping line: {}'.format(
                                element[0]))
                    continue
                mapping_dictionary[element[0]] = element[1]
        logging.info('Performing remapping based on {}'.format(mapping))
        map_graph(input_transformer.graph,
                  mapping=mapping_dictionary,
                  preserve=preserve)

    # save
    output_transformer = get_transformer(output_type)
    if output_transformer is None:
        logging.error('Output does not have a recognized type: ' +
                      str(get_file_types()))
        # There is no constructor to call, so stop here rather than failing
        # with "'NoneType' object is not callable".
        return
    w = output_transformer(input_transformer.graph)
    w.save(output, extension=output_type)
Пример #7
0
def load_and_merge(config: dict, load_config):
    """
    Load nodes and edges from files and KGs, as defined in a config YAML, and merge them into a single graph.
    The merge happens in-memory. This merged graph can then be written to a local/remote Neo4j instance
    OR be serialized into a file.
    \f

    .. note::
        Everything here is driven by the ``load_config`` YAML.

    Parameters
    ----------
    config: dict
        Not read by this function.
    load_config: str
        Path to the YAML file. Its ``target`` mapping lists the sources to
        load (optionally with ``filters``) and its ``destination`` entry
        says where the merged graph is written.
    """
    # NOTE(review): yaml.load with FullLoader is kept as-is; prefer
    # yaml.safe_load if this config file can come from an untrusted source.
    with open(load_config, 'r') as YML:
        cfg = yaml.load(YML, Loader=yaml.FullLoader)

    loaded = []
    for key, target in cfg['target'].items():
        logging.info("Loading {}".format(key))
        source_type = target['type']
        if source_type in get_file_types():
            # loading from a file
            file_transformer = get_transformer(source_type)()
            # currently supporting filters only for TSV and Neo4j
            if source_type in {'tsv', 'neo4j'}:
                _apply_target_filters(target, file_transformer)
            for filename in target['filename']:
                file_transformer.parse(filename, input_format=source_type)
            loaded.append(file_transformer)
        elif source_type == 'neo4j':
            neo_transformer = kgx.NeoTransformer(None, target['uri'],
                                                 target['username'],
                                                 target['password'])
            _apply_target_filters(target, neo_transformer)
            neo_transformer.load()
            loaded.append(neo_transformer)
        else:
            logging.error(
                "type {} not yet supported for KGX load-and-merge operation.".
                format(source_type))

    merged_graph = merge_all_graphs([item.graph for item in loaded])

    destination = cfg['destination']
    destination_type = destination['type']
    if destination_type in ('csv', 'tsv', 'ttl', 'json', 'tar'):
        writer_cls = get_transformer(destination_type)
        writer = writer_cls(merged_graph)
        writer.save(destination['filename'])
    elif destination_type == 'neo4j':
        writer = kgx.NeoTransformer(
            merged_graph,
            uri=destination['uri'],
            username=destination['username'],
            password=destination['password'])
        writer.save()
    else:
        logging.error(
            "type {} not yet supported for KGX load-and-merge operation.".
            format(destination_type))


def _apply_target_filters(target, transformer):
    """Set the node and edge filters listed under ``target['filters']``, if any."""
    if 'filters' not in target:
        return
    filters = target['filters']

    # A missing section means "no filters of that kind".
    node_filters = {}
    if 'node_filters' in filters:
        node_filters = filters['node_filters']
    edge_filters = {}
    if 'edge_filters' in filters:
        edge_filters = filters['edge_filters']

    for name, values in node_filters.items():
        transformer.set_node_filter(name, set(values))
    for name, values in edge_filters.items():
        transformer.set_edge_filter(name, set(values))
    logging.info(f"with node filters: {node_filters}")
    logging.info(f"with edge filters: {edge_filters}")
Пример #8
0
    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    debug: bool
        Whether to print debug messages

    """
    config.debug = debug
    if debug:
        logging.basicConfig(level=logging.DEBUG)


@cli.command('node-summary')
@click.argument('filepath', type=click.Path(exists=True), required=True)
@click.option('--input-type', type=click.Choice(get_file_types()))
@click.option('--max-rows',
              '-m',
              type=int,
              help='The maximum number of rows to return')
@click.option('--output', '-o', type=click.Path(exists=False))
@pass_config
def node_summary(config: dict, filepath: str, input_type: str, max_rows: int,
                 output: str):
    """
    Loads and summarizes a knowledge graph node set, where the input is a file.
    \f

    Parameters
    ----------
    config: dict
Пример #9
0
def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
    """Load and merge sources defined in the config YAML.

    All files named by file-type targets are checked for existence before
    anything is loaded. Each target is then loaded, per-target graph stats
    are written to ``<key>_stats.yaml``, the graphs are merged, stats for
    the merged graph are written to ``merged_graph_stats.yaml``, and the
    merged graph is written to every configured destination.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        networkx.MultiDiGraph: The merged graph.

    Raises:
        FileNotFoundError: If a file listed for a file-type target is not an
            existing regular file.

    """
    gm = GraphMerge()
    config = parse_load_config(yaml_file)
    transformers: List = []

    # make sure all files exist before we start load
    for key in config['target']:
        target = config['target'][key]
        logging.info("Checking that file exist for {}".format(key))
        if target['type'] in get_file_types():
            for f in target['filename']:
                # isfile() is already False for paths that do not exist.
                if not os.path.isfile(f):
                    # Format the message here; passing the values as extra
                    # exception arguments leaves the placeholders unfilled.
                    raise FileNotFoundError(
                        "File {} for transform {}  in yaml file {} "
                        "doesn't exist! Dying.".format(f, key, yaml_file))

    # read all the sources defined in the YAML
    for key in config['target']:
        target = config['target'][key]
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            transformer = get_transformer(target['type'])()
            for f in target['filename']:
                # NOTE(review): input_format is fixed to 'tsv' regardless of
                # the target type; confirm this is intended.
                transformer.parse(f, input_format='tsv')
                transformer.graph.name = key
            transformers.append(transformer)
        elif target['type'] == 'neo4j':
            transformer = NeoTransformer(None, target['uri'],
                                         target['username'],
                                         target['password'])
            transformer.load()
            transformers.append(transformer)
            transformer.graph.name = key
        else:
            logging.error("type {} not yet supported".format(target['type']))
            # Nothing was loaded for this target: `transformer` is either
            # unbound or left over from the previous target, so skip stats.
            continue
        stats_filename = f"{key}_stats.yaml"
        generate_graph_stats(transformer.graph, key, stats_filename)

    # merge all subgraphs into a single graph
    merged_graph = gm.merge_all_graphs([x.graph for x in transformers])
    merged_graph.name = 'merged_graph'
    generate_graph_stats(merged_graph, merged_graph.name,
                         "merged_graph_stats.yaml")

    # write the merged graph
    if 'destination' in config:
        for destination in config['destination'].values():
            if destination['type'] == 'neo4j':
                destination_transformer = NeoTransformer(
                    merged_graph,
                    uri=destination['uri'],
                    username=destination['username'],
                    password=destination['password'])
                destination_transformer.save_with_unwind()
            elif destination['type'] in get_file_types():
                destination_transformer = get_transformer(
                    destination['type'])(merged_graph)
                destination_transformer.save(destination['filename'],
                                             extension=destination['type'])
            else:
                logging.error(
                    "type {} not yet supported for KGX load-and-merge operation."
                    .format(destination['type']))

    return merged_graph
Пример #10
0
def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
    """Load and merge sources defined in the config YAML.

    All files named by file-type targets are checked for existence before
    anything is loaded. Each target is then loaded, with its ``filters`` and
    ``operations`` applied when present, and per-target graph stats are
    written to ``<key>_stats.yaml``. A file target that fails to load is
    logged and left out of the merge. The merged graph gets its own stats
    file and is written to every configured destination.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        networkx.MultiDiGraph: The merged graph.

    Raises:
        FileNotFoundError: If a file listed for a file-type target is not an
            existing regular file.

    """
    config = parse_load_config(yaml_file)
    transformers: List = []

    # make sure all files exist before we start load
    for key in config['target']:
        target = config['target'][key]
        logging.info("Checking that file exist for {}".format(key))
        if target['type'] in get_file_types():
            for f in target['filename']:
                # isfile() is already False for paths that do not exist.
                if not os.path.isfile(f):
                    # Format the message here; passing the values as extra
                    # exception arguments leaves the placeholders unfilled.
                    raise FileNotFoundError(
                        "File {} for transform {}  in yaml file {} "
                        "doesn't exist! Dying.".format(f, key, yaml_file))

    # read all the sources defined in the YAML
    for key in config['target']:
        target = config['target'][key]
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            # Track the file being parsed so a failure can name it; before
            # the first file is reached the target key is reported instead.
            current_file = None
            try:
                transformer = get_transformer(target['type'])()
                if target['type'] in {'tsv', 'neo4j'}:
                    if 'filters' in target:
                        apply_filters(target, transformer)
                for current_file in target['filename']:
                    # NOTE(review): input_format is fixed to 'tsv' regardless
                    # of the target type; confirm this is intended.
                    transformer.parse(current_file, input_format='tsv')
                    transformer.graph.name = key
                if 'operations' in target:
                    apply_operations(target, transformer)
                transformers.append(transformer)
            except Exception:
                # Catch Exception (not a bare except) so Ctrl-C and
                # SystemExit still propagate; keep the traceback in the log.
                logging.exception("Failed loading {}".format(
                    key if current_file is None else current_file))
                # The target is not part of the merge and `transformer` may
                # be unbound, so do not generate stats for it.
                continue
        elif target['type'] == 'neo4j':
            transformer = NeoTransformer(None, target['uri'],
                                         target['username'],
                                         target['password'])
            if 'filters' in target:
                apply_filters(target, transformer)
            transformer.load()
            if 'operations' in target:
                apply_operations(target, transformer)
            transformers.append(transformer)
            transformer.graph.name = key
        else:
            logging.error("type {} not yet supported".format(target['type']))
            # Nothing was loaded for this target: `transformer` is either
            # unbound or left over from the previous target, so skip stats.
            continue
        stats_filename = f"{key}_stats.yaml"
        generate_graph_stats(transformer.graph, key, stats_filename)

    # merge all subgraphs into a single graph
    merged_graph = merge_all_graphs([x.graph for x in transformers])
    merged_graph.name = 'merged_graph'
    generate_graph_stats(merged_graph, merged_graph.name,
                         "merged_graph_stats.yaml", ['provided_by'],
                         ['provided_by'])

    # write the merged graph
    if 'destination' in config:
        for destination in config['destination'].values():
            if destination['type'] == 'neo4j':
                destination_transformer = NeoTransformer(
                    merged_graph,
                    uri=destination['uri'],
                    username=destination['username'],
                    password=destination['password'])
                destination_transformer.save()
            elif destination['type'] in get_file_types():
                destination_transformer = get_transformer(
                    destination['type'])(merged_graph)
                mode = 'w:gz' if destination['type'] in {'tsv'} else None
                if destination['type'] in {'nt', 'nt.gz', 'ttl'}:
                    destination_transformer.set_property_types(PROPERTY_TYPES)
                destination_transformer.save(destination['filename'],
                                             output_format=destination['type'],
                                             mode=mode)
            else:
                logging.error(
                    "type {} not yet supported for KGX load-and-merge operation."
                    .format(destination['type']))

    return merged_graph