Example #1
0
def load_graph(ontology_list, data, def_format='n3', cache=True):
    """Load (or build) the combined ontology + data graph, with disk caching.

    :param ontology_list: list of ontology file paths.
    :param data: path to the data file.
    :param def_format: input format, 'n3' or 'csv'.
    :param cache: if True, reuse/write a cached graph keyed by input checksum.
    :return: the graph object, or None when def_format is unsupported.
    """
    def filter_valid_files(paths):
        # Keep only files matching the requested format; CSV mode also
        # accepts tab-separated (.tsv) files.
        if def_format == 'csv':
            filter_fn = lambda p: p.endswith('.csv') or p.endswith('.tsv')
        else:
            filter_fn = lambda p: p.endswith(def_format)
        return filter(filter_fn, paths)

    logger.info('Calculating data checksum')
    paths = ontology_list + [data]
    md5 = _md5_checksum(filter_valid_files(paths))

    # Cache file name derives from the checksum of the inputs, so any
    # change to the input files invalidates the cache automatically.
    cached_fn = '.%s' % md5
    g = None
    if cache and os.path.exists(cached_fn):
        logger.info('Loading cached graph structure')
        g = _load_cached_graph(cached_fn)
    else:
        logger.info('Building graph structure')
        if def_format == 'n3':
            g = rdf(paths, def_format=def_format)
        elif def_format == 'csv':
            g = csv(ontology_list, data)
        # Bug fix: never cache a None graph (unsupported def_format) —
        # otherwise every subsequent run would load None from the cache.
        if cache and g is not None:
            _save_graph_to_cache(g, cached_fn)
    return g
Example #2
0
def run(kwargs, cli=False):
    """Run the full Hedwig pipeline: build the graph, induce and validate
    rules, then write the reports.

    :param kwargs: configuration dict (score function, output paths, ...).
    :param cli: True when invoked from the command line; enables console
        logging and printing reports when no output file is given.
    :return: list of (target, rules) pairs from the learner.
    """
    # Honor the verbosity flag only for CLI invocations; keep the logger
    # unset (delegate to parent config) when used as a library.
    if cli:
        logger.setLevel(logging.DEBUG if kwargs['verbose'] else logging.INFO)
    else:
        logger.setLevel(logging.NOTSET)

    logger.info('Starting Hedwig')
    start = time.time()
    start_date = datetime.now().isoformat()

    graph = build_graph(kwargs)

    logger.info('Building the knowledge base')
    score_func = getattr(scorefunctions, kwargs['score'])
    kb = ExperimentKB(graph, score_func, instances_as_leaves=kwargs['leaves'])

    validator = Validate(kb,
                         significance_test=significance.apply_fisher,
                         adjustment=getattr(adjustment, kwargs['adjust']))

    rules_per_target = run_learner(kwargs, kb, validator)
    rules_report = generate_rules_report(kwargs, rules_per_target)

    time_taken = time.time() - start
    logger.info('Finished in %d seconds' % time_taken)

    logger.info('Outputting results')  # typo fix: was 'Outputing'

    # Optionally dump the examples covered by each ruleset as JSON.
    if kwargs['covered']:
        with open(kwargs['covered'], 'w') as f:
            examples = Rule.ruleset_examples_json(rules_per_target)
            f.write(json.dumps(examples, indent=2))

    parameters_report = _parameters_report(kwargs, start_date, time_taken)
    rules_out_file = kwargs['output']
    if rules_out_file:
        with open(rules_out_file, 'w') as f:
            if rules_out_file.endswith('json'):
                f.write(
                    Rule.to_json(rules_per_target, show_uris=kwargs['uris']))
            else:
                f.write(parameters_report)
                f.write(rules_report)
    elif cli:
        # Parenthesized print works under both Python 2 and Python 3
        # (the original py2-only 'print x' statement breaks on py3).
        print(parameters_report)
        print(rules_report)

    return rules_per_target
Example #3
0
def load_graph(ontology_list, data, def_format='n3', cache=True):
    """Return the input graph, reusing an on-disk cache when available.

    The cache file name is an MD5 checksum of the relevant input files,
    so the cached graph is invalidated whenever the inputs change.
    """
    def _relevant(paths):
        # Select only files whose extension matches the requested format;
        # CSV mode also accepts .tsv files.
        if def_format == 'csv':
            keep = lambda p: p.endswith('.csv') or p.endswith('.tsv')
        else:
            keep = lambda p: p.endswith(def_format)
        return filter(keep, paths)

    logger.info('Calculating data checksum')
    all_paths = ontology_list + [data]
    checksum = _md5_checksum(_relevant(all_paths))

    cached_fn = '.%s' % checksum
    if cache and os.path.exists(cached_fn):
        logger.info('Loading cached graph structure')
        return _load_cached_graph(cached_fn)

    logger.info('Building graph structure')
    g = None
    if def_format == 'n3':
        g = rdf(all_paths, def_format=def_format)
    elif def_format == 'csv':
        g = csv(ontology_list, data)
    if cache:
        _save_graph_to_cache(g, cached_fn)
    return g
Example #4
0
def run(kwargs, cli=False):
    """Execute the Hedwig workflow and return the induced rules per target.

    `kwargs` is the configuration dict; `cli` toggles console logging and
    printing of the reports when no output file is configured.
    """
    if not cli:
        logger.setLevel(logging.NOTSET)
    else:
        logger.setLevel(logging.DEBUG if kwargs['verbose'] else logging.INFO)

    logger.info('Starting Hedwig')
    t0 = time.time()
    started_at = datetime.now().isoformat()

    graph = build_graph(kwargs)

    logger.info('Building the knowledge base')
    scoring = getattr(scorefunctions, kwargs['score'])
    kb = ExperimentKB(graph, scoring, instances_as_leaves=kwargs['leaves'])

    validator = Validate(
        kb,
        significance_test=significance.apply_fisher,
        adjustment=getattr(adjustment, kwargs['adjust']),
    )

    rules_per_target = run_learner(kwargs, kb, validator)
    rules_report = generate_rules_report(kwargs, rules_per_target)

    elapsed = time.time() - t0
    logger.info('Finished in %d seconds' % elapsed)

    logger.info('Outputing results')

    covered_path = kwargs['covered']
    if covered_path:
        with open(covered_path, 'w') as fp:
            examples = Rule.ruleset_examples_json(rules_per_target)
            fp.write(json.dumps(examples, indent=2))

    parameters_report = _parameters_report(kwargs, started_at, elapsed)
    out_path = kwargs['output']
    if out_path:
        with open(out_path, 'w') as fp:
            if out_path.endswith('json'):
                fp.write(Rule.to_json(rules_per_target,
                                      show_uris=kwargs['uris']))
            else:
                fp.write(parameters_report)
                fp.write(rules_report)
    elif cli:
        print(parameters_report)
        print(rules_report)

    return rules_per_target
Example #5
0
def run_learner(kwargs, kb, validator):
    """Induce rules for each target class (or for ranks) and validate them.

    :param kwargs: configuration dict (learner type, beam width, support,
        depth, adjustment method, ...).
    :param kb: the ExperimentKB knowledge base.
    :param validator: significance validator, applied only when the target
        is discrete.
    :return: list of (target, rules) pairs, one per learned target.
    """
    if kb.is_discrete_target():
        # Learn either for every class value, or only the requested target.
        targets = kb.class_values if not kwargs['target'] else [
            kwargs['target']
        ]
    else:
        # Continuous (ranked) target: a single unnamed learning run.
        targets = [None]

    # Fix: removed the dead local `rules_report = ''` (never read).
    rules_per_target = []

    for target in targets:
        if target:
            logger.info('Starting learner for target \'%s\'' % target)
        else:
            logger.info('Ranks detected - starting learner.')

        learner_cls = {
            'heuristic': HeuristicLearner,
            'optimal': OptimalLearner
        }[kwargs['learner']]
        # NOTE(review): similarity threshold is hard-coded to 0.9 here.
        learner = learner_cls(kb,
                              n=kwargs['beam'],
                              min_sup=int(kwargs['support'] * kb.n_examples()),
                              target=target,
                              depth=kwargs['depth'],
                              sim=0.9,
                              use_negations=kwargs['negations'],
                              optimal_subclass=kwargs['optimalsubclass'])
        rules = learner.induce()

        # Only discrete targets go through significance testing.
        if kb.is_discrete_target():
            if kwargs['adjust'] == 'fdr':
                logger.info('Validating rules, FDR = %.3f' % kwargs['FDR'])
            elif kwargs['adjust'] == 'fwer':
                logger.info('Validating rules, alpha = %.3f' % kwargs['alpha'])
            rules = validator.test(rules,
                                   alpha=kwargs['alpha'],
                                   q=kwargs['FDR'])

        rules_per_target.append((target, rules))

    return rules_per_target
Example #6
0
def run_learner(kwargs, kb, validator):
    """Run the configured learner once per target and collect the rules.

    Returns a list of (target, rules) pairs; for a non-discrete (ranked)
    target a single pair with target None is produced.
    """
    if not kb.is_discrete_target():
        targets = [None]
    elif kwargs['target']:
        targets = [kwargs['target']]
    else:
        targets = kb.class_values

    rules_report = ''
    results = []

    for target in targets:
        if target:
            logger.info('Starting learner for target \'%s\'' % target)
        else:
            logger.info('Ranks detected - starting learner.')

        # Dispatch on the configured learner implementation.
        chosen_cls = {'heuristic': HeuristicLearner,
                      'optimal': OptimalLearner}[kwargs['learner']]
        learner = chosen_cls(
            kb,
            n=kwargs['beam'],
            min_sup=int(kwargs['support'] * kb.n_examples()),
            target=target,
            depth=kwargs['depth'],
            sim=0.9,
            use_negations=kwargs['negations'],
            optimal_subclass=kwargs['optimalsubclass'],
        )
        rules = learner.induce()

        if kb.is_discrete_target():
            adjust = kwargs['adjust']
            if adjust == 'fdr':
                logger.info('Validating rules, FDR = %.3f' % kwargs['FDR'])
            elif adjust == 'fwer':
                logger.info('Validating rules, alpha = %.3f' % kwargs['alpha'])
            rules = validator.test(rules, alpha=kwargs['alpha'],
                                   q=kwargs['FDR'])

        results.append((target, rules))

    return results