Пример #1
0
def main(input_path, output_path, biolink_model_only):
    """
    Rewrite node categories and edge labels of a graph and save the result.

    The input file is parsed with the transformer matching its file type.
    Every CURIE-like value (one containing ``:``) found in a node's
    ``category`` collection or in an edge's ``edge_label`` contributes its
    lower-cased prefix as a key of the module-level ``ontologies`` mapping;
    each key is then populated through ``get_ontology``. Afterwards the
    CURIE-like values are replaced by whatever ``get_term`` returns for
    them, and the graph is written to ``output_path``.

    Parameters
    ----------
    input_path:
        Path of the file to read.
    output_path:
        Path of the file to write.
    biolink_model_only:
        Forwarded unchanged as the second argument of ``get_term``.
    """
    reader = get_transformer(get_type(input_path))()
    writer = get_transformer(get_type(output_path))()
    reader.parse(input_path)
    graph = reader.graph

    def remember_prefix(term):
        # The two-way unpack is deliberate: it fails loudly if the value
        # returned by make_curie contains no ':' separator.
        prefix, _ = make_curie(term).lower().rsplit(':', 1)
        ontologies[prefix] = None

    # First pass: collect every prefix that occurs in the graph.
    for _, attrs in graph.nodes(data=True):
        if 'category' not in attrs:
            continue
        if not isinstance(attrs['category'], (tuple, list, set)):
            continue
        for category in attrs['category']:
            if ':' in category:
                remember_prefix(category)

    for _, _, attrs in graph.edges(data=True):
        if 'edge_label' in attrs and ':' in attrs['edge_label']:
            remember_prefix(attrs['edge_label'])

    print(ontologies)

    # Only values are replaced here, so iterating the mapping is safe.
    for prefix in ontologies:
        print(prefix)
        ontologies[prefix] = get_ontology(prefix)

    # Second pass: replace node categories.
    with click.progressbar(graph.nodes(data=True)) as bar:
        for _, attrs in bar:
            if 'category' not in attrs:
                attrs['category'] = ['named thing']
                continue
            categories = attrs['category']
            if not isinstance(categories, (list, set, tuple)):
                # Non-collection categories are left exactly as they are.
                continue
            mapped = [
                get_term(make_curie(entry), biolink_model_only)
                for entry in categories if ':' in entry
            ]
            untouched = [entry for entry in categories if ':' not in entry]
            # Drop terms for which get_term returned None.
            attrs['category'] = [
                term.replace('_', ' ')
                for term in mapped + untouched
                if term is not None
            ]

    # Second pass: replace edge labels.
    with click.progressbar(graph.edges(data=True)) as bar:
        for _, _, attrs in bar:
            if 'edge_label' in attrs and ':' in attrs['edge_label']:
                term = get_term(make_curie(attrs['edge_label']),
                                biolink_model_only)
                attrs['edge_label'] = term
                attrs['valid_edge_label'] = (
                    bmt.get_predicate(term) is not None)
            # Covers both a missing label and a label get_term mapped to None.
            if attrs.get('edge_label') is None:
                attrs['edge_label'] = 'related_to'
            attrs['edge_label'] = attrs['edge_label'].replace(' ', '_')

    writer.graph = graph
    print('Saving to {}'.format(output_path))
    writer.save(output_path)
Пример #2
0
def validate(config: dict, path: str, output: str, output_dir: str,
             format: str):
    """
    Run KGX validation on an input file to check for BioLink Model compliance.
    \f

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
        (not read by this function)
    path: str
        Path to input file
    output: str
        Path to output file; it is opened in write mode, so existing
        content is replaced
    output_dir:
        Path to a directory (not read by this function)
    format:
        The input format; when falsy, the transformer is chosen from the
        type detected for ``path`` instead

    """
    if format:
        t = get_transformer(format)()
    else:
        t = get_transformer(get_type(path))()
    t.parse(path, input_format=format)
    validator = Validator()
    errors = validator.validate(t.graph)
    # Use a context manager so the report file is flushed and closed even
    # when write_report raises.
    with open(output, 'w') as report:
        validator.write_report(errors, report)
Пример #3
0
def build_transformer(path:str, input_type:str=None) -> Transformer:
    """
    Instantiate the transformer registered for a file type.

    When ``input_type`` is None the type is derived from ``path`` with
    ``get_type``. If ``get_transformer`` yields None for the type,
    ``error`` is called with the list returned by ``get_file_types``
    before the constructor is invoked.
    """
    resolved_type = get_type(path) if input_type is None else input_type
    transformer_cls = get_transformer(resolved_type)
    if transformer_cls is None:
        error('File does not have a recognized type: ' + str(get_file_types()))
    return transformer_cls()
Пример #4
0
def load_transformer(input_paths:List[str], input_type:str=None) -> Transformer:
    """
    Build a single transformer and parse every input file into it.

    When ``input_type`` is None, the type of each path is detected with
    ``get_type``; ``error`` is called for every path whose type differs
    from the first one, and the first detected type is then used. If no
    transformer is registered for the type, ``error`` is called with the
    list of known file types. After all inputs are parsed, the
    transformer's ``report`` method is invoked and the transformer is
    returned.
    """
    if input_type is None:
        detected_types = [get_type(path) for path in input_paths]
        for detected in detected_types:
            if detected_types[0] != detected:
                error(
                """
                Each input file must have the same file type.
                Try setting the --input-type parameter to enforce a single
                type.
                """
                )
        # With no inputs there is nothing to detect; input_type stays None.
        if detected_types:
            input_type = detected_types[0]

    constructor = get_transformer(input_type)
    if constructor is None:
        error('Inputs do not have a recognized type: ' + str(get_file_types()))

    transformer = constructor()
    for path in input_paths:
        transformer.parse(path, input_type)

    transformer.report()
    return transformer
Пример #5
0
def transform_and_save(t:Transformer, output_path:str, output_type:str=None):
    """
    Save the graph held by ``t`` through a transformer of the output type.

    The output type defaults to the one detected from ``output_path``. A
    transformer of that type is constructed around ``t.graph`` and its
    ``save`` method is called with the output path and the type passed as
    the ``extention`` keyword. The path reported back by ``save`` is
    echoed if it names an existing file; otherwise ``output_path`` is
    echoed if it exists; otherwise ``error`` is called.
    """
    resolved_type = get_type(output_path) if output_type is None else output_type

    writer_cls = get_transformer(resolved_type)
    if writer_cls is None:
        error('Output does not have a recognized type: ' + str(get_file_types()))

    writer = writer_cls(t.graph)
    # NOTE: the keyword is spelled 'extention' on purpose; it is the name
    # this function has always passed to save().
    result_path = writer.save(output_path, extention=resolved_type)

    # Prefer the path reported by save(); fall back to the requested one.
    if result_path is not None and os.path.isfile(result_path):
        created = result_path
    elif os.path.isfile(output_path):
        created = output_path
    else:
        created = None

    if created is None:
        error("Could not create file.")
    else:
        click.echo("File created at: " + created)
Пример #6
0
def validate(config, path, input_type, output_dir, record_size):
    """
    Validate the graph parsed from ``path`` and log failures per error type.

    ``output_dir`` is created if needed. For every entry of the
    validator's ``error_dict`` a ``<error type>.log`` file (spaces in the
    error type replaced by underscores) is opened in append mode inside
    ``output_dir``; a timestamp header is written, followed by one line
    per failure. Two-element failures are written as node lines,
    three-element failures as edge lines, and any other length is
    ignored. Finally a per-type count, or 'No errors found', is echoed.

    ``config`` and ``input_type`` are accepted but not read here.
    """
    os.makedirs(output_dir, exist_ok=True)

    validator = Validator(record_size)

    transformer = get_transformer(get_type(path))()
    transformer.parse(path)
    validator.validate(transformer.graph)

    for error_type, failures in validator.error_dict.items():
        log_name = error_type.replace(' ', '_') + '.log'
        with click.open_file(os.path.join(output_dir, log_name), 'a+') as log:
            log.write('--- {} ---\n'.format(datetime.now()))
            for failure in failures:
                if len(failure) == 2:
                    node, message = failure
                    if message is None:
                        line = 'node({})\n'.format(node)
                    else:
                        line = 'node({}):\t{}\n'.format(node, message)
                elif len(failure) == 3:
                    source, target, message = failure
                    if message is None:
                        line = 'edge({}, {})\n'.format(source, target)
                    else:
                        line = 'edge({}, {}):\t{}\n'.format(
                            source, target, message)
                else:
                    # Records of any other length are not logged.
                    continue
                log.write(line)

    if validator.error_dict == {}:
        click.echo('No errors found')
        return

    for error_type, failures in validator.error_dict.items():
        click.echo('{} - {}'.format(error_type, len(failures)))
Пример #7
0
def merge(inputs, output):
    """
    Parse several knowledge graph files into one graph, clique-merge it,
    and save it.

    A transformer is constructed for the output path and for every input
    path; an ``Exception`` is raised if an input has no transformer. The
    first input transformer's graph becomes the shared graph, and every
    later transformer is pointed at that same graph before parsing its
    file. The combined graph is passed through ``clique_merge`` and the
    result is saved to ``output``.

    NOTE(review): the original documentation states that merging uses
    `same_as` edges and `same_as` node properties, that the result has no
    `same_as` edges, and that clique leaders carry all equivalent
    identifiers in `same_as`. That is behaviour of ``clique_merge`` and
    cannot be confirmed from this function alone.
    """
    writer = get_transformer(get_type(output))()

    # Build every reader up front so an unsupported input fails before
    # anything is parsed.
    readers = []
    for path in inputs:
        reader_cls = get_transformer(get_type(path))
        if reader_cls is None:
            raise Exception('No transformer for {}'.format(path))
        readers.append(reader_cls())

    shared_graph = None
    for reader, path in zip(readers, inputs):
        if shared_graph is not None:
            reader.graph = shared_graph
        else:
            shared_graph = reader.graph
        reader.parse(path)

    writer.graph = shared_graph
    writer.graph = clique_merge(writer.graph)
    writer.save(output)
Пример #8
0
def validate(config, path, output, output_dir):
    """
    Validate the graph parsed from ``path`` and record any errors.

    When the validator reports no errors, 'No errors found' is echoed.
    Otherwise the errors are handed to ``append_errors_to_file`` for
    ``output`` and, if ``output_dir`` is not None, also to
    ``append_errors_to_files`` for that directory. Both calls receive the
    same timestamp, taken right after validation.

    ``config`` is accepted but not read here.
    """
    transformer = get_transformer(get_type(path))()
    transformer.parse(path)

    validator = Validator()
    validator.validate(transformer.graph)

    timestamp = datetime.now()

    if len(validator.errors) == 0:
        click.echo('No errors found')
        return

    append_errors_to_file(output, validator.errors, timestamp)
    if output_dir is not None:
        append_errors_to_files(output_dir, validator.errors, timestamp)
Пример #9
0
def neo4j_download(config, page_size, stop_after, subject_label, object_label, edge_type, address, username, password, output, output_type):
    """
    Download edges matching a pattern from Neo4j and save them to ``output``.

    A ``match (n)-[e]->(m)`` query is built, optionally restricted by
    ``subject_label``, ``object_label`` and ``edge_type`` (each is applied
    only when it is a string). The matching rows are counted, then fetched
    page by page with ``skip``/``limit`` and added to the graph of the
    output transformer, which is finally saved.

    Parameters
    ----------
    config:
        Not read by this function.
    page_size:
        Number of rows requested per query. Falls back to 1000 when it is
        None or smaller than 1.
    stop_after:
        When not None, paging stops after the first page that leaves the
        graph with more than this many edges.
    subject_label, object_label, edge_type:
        Optional label/type filters for the match pattern.
    address, username, password:
        Connection details handed to ``http_gdb``.
    output:
        Path of the file to write; its file type selects the transformer.
    output_type:
        Not read by this function.
    """
    if not is_writable(output):
        try:
            # Probe writability by creating/truncating the target file.
            with open(output, 'w+'):
                pass
        except OSError:
            # Only I/O failures mean "not writable"; a bare except would
            # also swallow KeyboardInterrupt and SystemExit.
            error(f'Cannot write to {output}')

    output_transformer = get_transformer(get_type(output))()
    G = output_transformer.graph

    driver = http_gdb(address, username=username, password=password)

    subject_label = ':`{}`'.format(subject_label) if isinstance(subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(object_label, str) else ''
    edge_type = ':`{}`'.format(edge_type) if isinstance(edge_type, str) else ''

    match = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_type, object_label)

    results = driver.query('{} return count(*)'.format(match))

    click.echo('Using cyper query: {} return n, e, m'.format(match))

    # Default to zero so an empty count result is reported as "no data"
    # instead of failing on an unbound name.
    size = 0
    for count, in results:
        size = count
        break

    if size == 0:
        click.echo('No data available')
        quit()

    # Honour the caller's page size; previously it was always overwritten.
    if page_size is None or page_size < 1:
        page_size = 1_000

    skip_flag = False

    with click.progressbar(range(0, size, page_size), label='Downloading {} many edges'.format(size)) as bar:
        for offset in bar:
            q = '{} return n, e, m skip {} limit {}'.format(match, offset, page_size)
            results = driver.query(q)

            for n, e, m in results:
                subject_attr = n['data']
                object_attr = m['data']
                edge_attr = e['data']

                if 'id' not in subject_attr or 'id' not in object_attr:
                    # Announce the skip only once, not per record.
                    if not skip_flag:
                        click.echo('Skipping records that have no id attribute')
                        skip_flag = True
                    continue

                s = subject_attr['id']
                o = object_attr['id']

                # Fall back to Neo4j metadata when the KGX-style
                # properties are absent.
                if 'edge_label' not in edge_attr:
                    edge_attr['edge_label'] = e['metadata']['type']

                if 'category' not in subject_attr:
                    subject_attr['category'] = n['metadata']['labels']

                if 'category' not in object_attr:
                    object_attr['category'] = m['metadata']['labels']

                if s not in G:
                    G.add_node(s, **subject_attr)
                if o not in G:
                    G.add_node(o, **object_attr)

                G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

            # Checked once per page, so the final edge count may exceed
            # stop_after by up to one page.
            if stop_after is not None and G.number_of_edges() > stop_after:
                break

    output_transformer.save(output)