import copy
import threading
import time
from os import path, remove

# NOTE: the project-internal imports below are reconstructed from the names
# used in this file; the exact module paths are assumptions.
from logger import Logger
from config import Config
from triple_set import TripleSet
from rule import Rule, RuleReader
from rule_engine import RuleEngine
from path_sampler import PathSampler
from result_set import ResultSet
from hits_at_k import HitsAtK
from score_tree import ScoreTree


def current_milli_time():
    # assumed helper, used throughout this file
    return int(round(time.time() * 1000))


class Evaluation(object):

    def __init__(self, datasets='WN18'):
        Rule.set_application_mode()
        self.config = Config.load_eval_config(datasets)
        self.training_set, self.validation_set, self.test_set = \
            TripleSet(), TripleSet(), TripleSet()
        self.training_set.read_triples(self.config['path_training'])
        self.validation_set.read_triples(self.config['path_valid'])
        self.test_set.read_triples(self.config['path_test'])
        self.result_set = ResultSet(self.config['path_prediction'],
                                    self.config['path_prediction'], True, 10)

    def eval(self, is_test_set=True, path_extend=False):
        if path_extend:
            self.result_set = ResultSet(self.config['path_prediction_ext'],
                                        self.config['path_prediction_ext'], True, 10)
        elif not is_test_set:
            self.result_set = ResultSet(self.config['path_eval_predict'],
                                        self.config['path_eval_predict'], True, 10)
        hitsAtK = HitsAtK()
        # filtered evaluation: candidates that appear as correct triples in any
        # of the three splits are filtered out before ranking
        hitsAtK.filter_sets.append(self.training_set)
        hitsAtK.filter_sets.append(self.validation_set)
        hitsAtK.filter_sets.append(self.test_set)
        score_set = self.test_set if is_test_set else self.validation_set
        if path_extend:
            score_set = self.validation_set
        self.__compute_scores(self.result_set, score_set, hitsAtK)
        print('hits@1\thits@3\thits@10')
        total = hitsAtK.counter_head + hitsAtK.counter_tail
        h1 = (hitsAtK.hits_adn_head_filtered[0] + hitsAtK.hits_adn_tail_filtered[0]) / total
        h3 = (hitsAtK.hits_adn_head_filtered[2] + hitsAtK.hits_adn_tail_filtered[2]) / total
        h10 = (hitsAtK.hits_adn_head_filtered[9] + hitsAtK.hits_adn_tail_filtered[9]) / total
        print('{:.4f}\t{:.4f}\t{:.4f}'.format(h1, h3, h10))

    def __compute_scores(self, result_set, gold, hitsAtK):
        for triple in gold.triples:
            head_candidates = result_set.get_head_candidates(str(triple))
            hitsAtK.evaluate_head(head_candidates, triple)
            tail_candidates = result_set.get_tail_candidates(str(triple))
            hitsAtK.evaluate_tail(tail_candidates, triple)
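# Usage sketch (not part of the original source): how Evaluation is driven.
# It assumes the WN18 eval config and the prediction files it references
# already exist on disk.
def _demo_evaluation():
    evaluation = Evaluation(datasets='WN18')
    evaluation.eval(is_test_set=True)    # filtered hits@1/3/10 on the test set
    evaluation.eval(is_test_set=False)   # the same metrics on the validation set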
# NOTE: a method of the prediction driver class; it assumes the owning class
# stores self.cfg, self.log (as in Learning.__init__ below) and self.datasets.
def prediction(self, use_valid_set=False, extend=False):
    # the flag was originally named valid_set, which was immediately shadowed
    # by the TripleSet below, so the elif branch never saw the caller's value
    training_set, test_set, valid_set = TripleSet(), TripleSet(), TripleSet()
    training_set.read_triples(self.cfg['path_training'])
    test_set.read_triples(self.cfg['path_test'])
    valid_set.read_triples(self.cfg['path_valid'])
    path_rules_used = self.cfg['path_rules']
    # for path_rules_used in self.cfg['path_rules']:
    start_time = current_milli_time()
    tmp_path = path_rules_used.split('/')
    path_output_used = 'predictions/{}/{}'.format(
        self.datasets, tmp_path[2].replace('rule', 'predict'))
    self.log.info('rules learning: {}'.format(path_rules_used))
    self.log.info('output learning: {}'.format(path_output_used))
    rules = RuleReader(path_rules_used).read()
    if extend:
        rules_ext = RuleReader(self.cfg['path_rules_ext']).read()
        rules.extend(rules_ext)
        path_output_used = 'predictions/{}/ext_{}'.format(
            self.datasets, tmp_path[2].replace('rule', 'predict'))
        test_set, valid_set = valid_set, test_set
    elif use_valid_set:
        # rank candidates for the validation split instead of the test split
        path_output_used = 'predictions/{}/predict_valid_1000.txt'.format(self.datasets)
        test_set, valid_set = valid_set, test_set
    rules_size = len(rules)
    print('*** read {} rules from file {}'.format(rules_size, path_rules_used))
    rule_engine = RuleEngine(path_output_used,
                             self.cfg['unseen_nagative_examples'])  # (sic: key name as in the config)
    rule_engine.apply_rules_arx(rules, training_set, test_set, valid_set,
                                self.cfg['top_k_output'])
    print('* evaluated {} rules to propose candidates for {} * 2 completion tasks'.format(
        rules_size, len(test_set.triples)))
    print('* finished in {} ms.'.format(current_milli_time() - start_time))
    self.log.info('finished in {} s.'.format((current_milli_time() - start_time) // 1000))
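# Pipeline sketch (not part of the original source): how prediction() connects
# learning and evaluation. `Prediction` is a hypothetical wrapper class that
# owns the method above and is constructed like Learning below.
def _demo_prediction_pipeline():
    predictor = Prediction(dataset='WN18')    # hypothetical owner of prediction()
    predictor.prediction()                    # rank candidates for the test split
    predictor.prediction(use_valid_set=True)  # or for the validation split
    Evaluation(datasets='WN18').eval(is_test_set=True)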
class Learning(object):

    def __init__(self, dataset='WN18'):
        self.log = Logger.get_log_cate('learning.txt', 'Learning')
        self.cfg = Config.load_learning_config(dataset)
        self.log.info('****************************start new section*************************************')
        self.log.info('initialize learning {}'.format(current_milli_time()))
        self.triple_set = TripleSet()
        self.triple_set.read_triples(self.cfg['path_training'])

    def train(self):
        triple_set = self.triple_set
        index_start_time = current_milli_time()
        self.log.info('training with config {}'.format(self.cfg))
        path_sampler = PathSampler(triple_set)
        path_counter, batch_counter = 0, 0
        mine_cyclic_not_acyclic = False
        all_useful_rules = [set()]
        snapshot_index, rule_size_cyclic, rule_size_acyclic = 0, 0, 0
        last_cyclic_coverage, last_acyclic_coverage = 0.0, 0.0
        self.log.info('indexing dataset: {}'.format(self.cfg['path_training']))
        self.log.info('time elapsed: {} ms'.format(current_milli_time() - index_start_time))
        snapshots_at = self.cfg['snapshots_at']
        dataset = self.cfg['dataset']
        start_time = current_milli_time()
        while True:
            batch_previously_found_rules, batch_new_useful_rules, batch_rules = 0, 0, 0
            rule_size = rule_size_cyclic if mine_cyclic_not_acyclic else rule_size_acyclic
            useful_rules = all_useful_rules[rule_size]
            elapsed_seconds = (current_milli_time() - start_time) // 1000
            # snapshot the rules after t seconds while learning
            if elapsed_seconds > snapshots_at[snapshot_index]:
                total_rule = sum(len(_rules) for _rules in all_useful_rules)
                snapshot_file = 'learning_rules/{}/rule_{}.txt'.format(
                    dataset, snapshots_at[snapshot_index])
                snapshot_index += 1
                self.log.info('snapshot_rules: {} in file {}'.format(total_rule, snapshot_file))
                # deep-copy so the writer thread sees a consistent state while
                # mining continues
                snapshot_rules = copy.deepcopy(all_useful_rules)
                thread_snapshot = threading.Thread(
                    target=self.process_snapshot_rule,
                    args=(snapshot_rules, snapshot_file))
                thread_snapshot.start()
                print('created snapshot {} after {} seconds'.format(snapshot_index, elapsed_seconds))
                if snapshot_index == len(snapshots_at):
                    print('*************************done learning*********************************')
                    thread_snapshot.join()
                    return 0
            # batch learning
            batch_start_time = current_milli_time()
            while True:
                if current_milli_time() - batch_start_time > self.cfg['batch_time']:
                    break
                path_counter += 1
                path = path_sampler.sample_path(rule_size + 2, mine_cyclic_not_acyclic)
                if path is not None and path.is_valid():
                    rule = Rule()
                    rule.init_from_path(path)
                    gen_rules = rule.get_generalizations(mine_cyclic_not_acyclic)
                    for r in gen_rules:
                        if r.is_trivial():
                            continue
                        batch_rules += 1
                        if r not in useful_rules:
                            r.compute_scores(triple_set)
                            if (r.confidence >= self.cfg['threshold_confidence'] and
                                    r.correctly_predicted >= self.cfg['threshold_correct_predictions']):
                                batch_new_useful_rules += 1
                                useful_rules.add(r)
                        else:
                            batch_previously_found_rules += 1
            batch_counter += 1
            str_type = 'CYCLIC' if mine_cyclic_not_acyclic else 'ACYCLIC'
            print('=====> batch [{} {}] {} (sampled {} paths) *****'.format(
                str_type, rule_size + 1, batch_counter, path_counter))
            # a batch can finish without any useful rules; avoid dividing by zero
            if batch_new_useful_rules + batch_previously_found_rules > 0:
                current_coverage = batch_previously_found_rules / (
                    batch_new_useful_rules + batch_previously_found_rules)
            else:
                current_coverage = 0.0
            print('=====> fraction of previously seen rules within useful rules in this batch: {} '
                  '(new rules = {}, previously seen rules = {}, all batch rules = {})'.format(
                      current_coverage, batch_new_useful_rules,
                      batch_previously_found_rules, batch_rules))
            print('=====> stored rules: {}'.format(len(useful_rules)))
            if mine_cyclic_not_acyclic:
                last_cyclic_coverage = current_coverage
            else:
                last_acyclic_coverage = current_coverage
            # once most useful rules of this size have been seen before, the
            # search is saturated and the rule size is increased
            if current_coverage > self.cfg['saturation'] and batch_previously_found_rules > 1:
                rule_size += 1
                if mine_cyclic_not_acyclic:
                    rule_size_cyclic = rule_size
                else:
                    rule_size_acyclic = rule_size
                print('=========================================================')
                print('=====> increasing rule size of {} rules to {}'.format(str_type, rule_size + 1))
                self.log.info('increasing rule size of {} rules to {} after {} s'.format(
                    str_type, rule_size + 1, (current_milli_time() - start_time) // 1000))
                all_useful_rules.append(set())
            mine_cyclic_not_acyclic = not mine_cyclic_not_acyclic
            if mine_cyclic_not_acyclic and rule_size_cyclic + 1 > self.cfg['max_length_cylic']:  # (sic: key name as in the config)
                mine_cyclic_not_acyclic = False

    def process_snapshot_rule(self, rules, file):
        # write all rule sets to a fresh snapshot file
        if path.exists(file):
            remove(file)
        with open(file, 'w') as output_stream:
            for set_rule in rules:
                for rule in set_rule:
                    print(rule, file=output_stream)

    def process_snapshot_rule_exis_file(self, rules, file):
        # append the rule sets to an existing snapshot file
        with open(file, 'a+') as output_stream:
            for set_rule in rules:
                for rule in set_rule:
                    print(rule, file=output_stream)

    def train_with_batch(self, batch_triple, batch_time=100):
        is_connected, new_triple = self.triple_set.add_batch_triple(batch_triple)
        if is_connected:
            triple_set = self.triple_set
            path_sampler = PathSampler(triple_set)
            index_start_time = current_milli_time()
            self.log.info('train_with_batch triple_set: {}, new_triple: {}'.format(
                len(triple_set.triples), len(new_triple.triples)))
            path_counter, batch_counter = 0, 0
            mine_cyclic_not_acyclic = False
            all_useful_rules = [set()]
            snapshot_index, rule_size_cyclic, rule_size_acyclic = 0, 0, 0
            last_cyclic_coverage, last_acyclic_coverage = 0.0, 0.0
            self.log.info('indexing dataset: {}'.format(self.cfg['path_training']))
            self.log.info('time elapsed: {} ms'.format(current_milli_time() - index_start_time))
            dataset = self.cfg['dataset']
            start_time = current_milli_time()
            while True:
                batch_previously_found_rules, batch_new_useful_rules, batch_rules = 0, 0, 0
                rule_size = rule_size_cyclic if mine_cyclic_not_acyclic else rule_size_acyclic
                useful_rules = all_useful_rules[rule_size]
                elapsed_seconds = (current_milli_time() - start_time) // 1000
                # a single snapshot is written once batch_time seconds have passed
                if elapsed_seconds > batch_time:
                    total_rule = sum(len(_rules) for _rules in all_useful_rules)
                    snapshot_file = 'learning_rules/{}/rule_extend_{}.txt'.format(dataset, 800)
                    self.log.info('***************************************************************')
                    self.log.info('**snapshot_rules: {} in file {}'.format(total_rule, snapshot_file))
                    self.log.info('***************************************************************')
                    snapshot_rules = copy.deepcopy(all_useful_rules)
                    thread_snapshot = threading.Thread(
                        target=self.process_snapshot_rule,
                        args=(snapshot_rules, snapshot_file))
                    thread_snapshot.start()
                    print('created snapshot {} after {} seconds'.format(total_rule, elapsed_seconds))
                    print('*************************done learning*********************************')
                    thread_snapshot.join()
                    return 0
                batch_start_time = current_milli_time()
                while True:
                    if current_milli_time() - batch_start_time > self.cfg['batch_time']:
                        break
                    path_counter += 1
                    # sample only paths that touch the newly added triples
                    path = path_sampler.sample_batch_path(rule_size + 2, new_triple,
                                                          mine_cyclic_not_acyclic)
                    if path is not None and path.is_valid():
                        rule = Rule()
                        rule.init_from_path(path)
                        gen_rules = rule.get_generalizations(mine_cyclic_not_acyclic)
                        for r in gen_rules:
                            if r.is_trivial():
                                continue
                            batch_rules += 1
                            if r not in useful_rules:
                                r.compute_scores(triple_set)
                                if (r.confidence >= self.cfg['threshold_confidence'] and
                                        r.correctly_predicted >= self.cfg['threshold_correct_predictions']):
                                    batch_new_useful_rules += 1
                                    useful_rules.add(r)
                            else:
                                batch_previously_found_rules += 1
                batch_counter += 1
                str_type = 'CYCLIC' if mine_cyclic_not_acyclic else 'ACYCLIC'
                print('=====> batch [{} {}] {} (sampled {} paths) *****'.format(
                    str_type, rule_size + 1, batch_counter, path_counter))
                # a batch can finish without any useful rules; avoid dividing by zero
                if batch_new_useful_rules + batch_previously_found_rules > 0:
                    current_coverage = batch_previously_found_rules / (
                        batch_new_useful_rules + batch_previously_found_rules)
                else:
                    current_coverage = 0.0
                print('=====> fraction of previously seen rules within useful rules in this batch: {} '
                      '(new rules = {}, previously seen rules = {}, all batch rules = {})'.format(
                          current_coverage, batch_new_useful_rules,
                          batch_previously_found_rules, batch_rules))
                print('=====> stored rules: {}'.format(len(useful_rules)))
                if mine_cyclic_not_acyclic:
                    last_cyclic_coverage = current_coverage
                else:
                    last_acyclic_coverage = current_coverage
                if current_coverage > self.cfg['saturation'] and batch_previously_found_rules > 1:
                    rule_size += 1
                    if mine_cyclic_not_acyclic:
                        rule_size_cyclic = rule_size
                    else:
                        rule_size_acyclic = rule_size
                    print('=========================================================')
                    print('=====> increasing rule size of {} rules to {}'.format(str_type, rule_size + 1))
                    self.log.info('increasing rule size of {} rules to {} after {} s'.format(
                        str_type, rule_size + 1, (current_milli_time() - start_time) // 1000))
                    all_useful_rules.append(set())
                mine_cyclic_not_acyclic = not mine_cyclic_not_acyclic
                if mine_cyclic_not_acyclic and rule_size_cyclic + 1 > self.cfg['max_length_cylic']:
                    mine_cyclic_not_acyclic = False

    def train_with_edge(self, triple):
        is_connected, new_triple = self.triple_set.add_edge_triple(triple)
        if is_connected:
            triple_set = self.triple_set
            path_sampler = PathSampler(triple_set)
            index_start_time = current_milli_time()
            self.log.info('train_with_edge triple_set: {}, new_triple: {}'.format(
                len(triple_set.triples), new_triple))
            path_counter, batch_counter = 0, 0
            mine_cyclic_not_acyclic = False
            all_useful_rules = [set()]
            snapshot_index, rule_size_cyclic, rule_size_acyclic = 0, 0, 0
            last_cyclic_coverage, last_acyclic_coverage = 0.0, 0.0
            self.log.info('indexing dataset: {}'.format(self.cfg['path_training']))
            self.log.info('time elapsed: {} ms'.format(current_milli_time() - index_start_time))
            dataset = self.cfg['dataset']
            start_time = current_milli_time()
            while True:
                batch_previously_found_rules, batch_new_useful_rules, batch_rules = 0, 0, 0
                rule_size = rule_size_cyclic if mine_cyclic_not_acyclic else rule_size_acyclic
                useful_rules = all_useful_rules[rule_size]
                elapsed_seconds = (current_milli_time() - start_time) // 1000
                # a single short mining round: snapshot after one second and append
                # to the existing extension file
                if elapsed_seconds > 1:
                    total_rule = sum(len(_rules) for _rules in all_useful_rules)
                    snapshot_file = 'learning_rules/{}/rule_extend_{}.txt'.format(dataset, 20)
                    self.log.info('***************************************************************')
                    self.log.info('**snapshot_rules: {} in file {}'.format(total_rule, snapshot_file))
                    self.log.info('***************************************************************')
                    snapshot_rules = copy.deepcopy(all_useful_rules)
                    thread_snapshot = threading.Thread(
                        target=self.process_snapshot_rule_exis_file,
                        args=(snapshot_rules, snapshot_file))
                    thread_snapshot.start()
                    print('created snapshot {} after {} seconds'.format(total_rule, elapsed_seconds))
                    print('*************************done learning*********************************')
                    thread_snapshot.join()
                    return 0
                batch_start_time = current_milli_time()
                while True:
                    if current_milli_time() - batch_start_time > self.cfg['batch_time']:
                        break
                    path_counter += 1
                    # sample only paths through the newly added edge
                    path = path_sampler.sample_triple(rule_size + 2, new_triple,
                                                      mine_cyclic_not_acyclic)
                    if path is not None and path.is_valid():
                        rule = Rule()
                        rule.init_from_path(path)
                        gen_rules = rule.get_generalizations(mine_cyclic_not_acyclic)
                        for r in gen_rules:
                            if r.is_trivial():
                                continue
                            batch_rules += 1
                            if r not in useful_rules:
                                r.compute_scores(triple_set)
                                if (r.confidence >= 0.45 and  # self.cfg['threshold_confidence']
                                        r.correctly_predicted >= self.cfg['threshold_correct_predictions']):
                                    batch_new_useful_rules += 1
                                    useful_rules.add(r)
                            else:
                                batch_previously_found_rules += 1
                batch_counter += 1
                str_type = 'CYCLIC' if mine_cyclic_not_acyclic else 'ACYCLIC'
                print('=====> batch [{} {}] {} (sampled {} paths) *****'.format(
                    str_type, rule_size + 1, batch_counter, path_counter))
                if batch_new_useful_rules + batch_previously_found_rules != 0:
                    current_coverage = batch_previously_found_rules / (
                        batch_new_useful_rules + batch_previously_found_rules)
                else:
                    current_coverage = 0
                print('=====> fraction of previously seen rules within useful rules in this batch: {} '
                      '(new rules = {}, previously seen rules = {}, all batch rules = {})'.format(
                          current_coverage, batch_new_useful_rules,
                          batch_previously_found_rules, batch_rules))
                print('=====> stored rules: {}'.format(len(useful_rules)))
                if mine_cyclic_not_acyclic:
                    last_cyclic_coverage = current_coverage
                else:
                    last_acyclic_coverage = current_coverage
                if current_coverage > self.cfg['saturation'] and batch_previously_found_rules > 1:
                    rule_size += 1
                    if mine_cyclic_not_acyclic:
                        rule_size_cyclic = rule_size
                    else:
                        rule_size_acyclic = rule_size
                    print('=========================================================')
                    print('=====> increasing rule size of {} rules to {}'.format(str_type, rule_size + 1))
                    self.log.info('increasing rule size of {} rules to {} after {} s'.format(
                        str_type, rule_size + 1, (current_milli_time() - start_time) // 1000))
                    all_useful_rules.append(set())
                mine_cyclic_not_acyclic = not mine_cyclic_not_acyclic
                if mine_cyclic_not_acyclic and rule_size_cyclic + 1 > self.cfg['max_length_cylic']:
                    mine_cyclic_not_acyclic = False
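# Usage sketch (not part of the original source): driving the Learning class.
# train() runs until the last snapshot in cfg['snapshots_at'] is written; the
# batch/edge variants incrementally extend an already-trained rule set.
# `more_triples` and `one_triple` are hypothetical inputs in whatever format
# TripleSet.add_batch_triple / add_edge_triple expect.
def _demo_learning():
    learner = Learning(dataset='WN18')
    learner.train()  # writes learning_rules/WN18/rule_<t>.txt snapshots
    # incremental updates (hypothetical inputs):
    # learner.train_with_batch(more_triples, batch_time=100)
    # learner.train_with_edge(one_triple)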
# NOTE: a method of RuleEngine (instantiated in prediction() above).
def apply_rules_arx(self, rules, training_set, test_set, validation_set, k):
    print('* applying rules')
    relation_to_rules = self.create_ordered_rule_index(rules)
    print('* set up index structure covering rules for {} different relations'.format(
        len(relation_to_rules)))
    filter_set = TripleSet()
    filter_set.add_triple_set(training_set)
    filter_set.add_triple_set(test_set)
    filter_set.add_triple_set(validation_set)
    print('* constructed filter set with {} triples'.format(len(filter_set.triples)))
    if len(filter_set.triples) == 0:
        print('WARNING: using empty filter set!')
    # prepare the data structures used as a cache for questions that recur
    # start iterating over the test cases
    counter, current_time, start_time = 0, 0, current_milli_time()
    ScoreTree.set_lower_bound(k)
    ScoreTree.set_upper_bound(ScoreTree.lower_bound)
    ScoreTree.set_epsilon(0.0001)
    for triple in test_set.triples:
        if counter % 100 == 0:
            print('* (# {}) trying to guess the tail/head of {}'.format(counter, triple))
            current_time = current_milli_time()
            print('Elapsed (s) = {}'.format((current_time - start_time) // 1000))
            start_time = current_milli_time()
        relation = triple.relation
        head = triple.head
        tail = triple.tail
        tail_question, head_question = (relation, head), (relation, tail)
        k_tail_tree = ScoreTree()
        k_head_tree = ScoreTree()
        if relation in relation_to_rules:
            relevant_rules = relation_to_rules.get(relation)
            # rules are applied in descending confidence order until the score
            # tree has collected enough candidates (fine() == True)
            for rule in relevant_rules:
                if not k_tail_tree.fine():
                    tail_candidates = rule.compute_tail_results(head, training_set)
                    f_tail_candidates = self.__get_filtered_entities(
                        filter_set, test_set, triple, tail_candidates, True)
                    k_tail_tree.add_values(rule.get_applied_confidence(), f_tail_candidates)
                else:
                    break
            for rule in relevant_rules:
                if not k_head_tree.fine():
                    head_candidates = rule.compute_head_results(tail, training_set)
                    f_head_candidates = self.__get_filtered_entities(
                        filter_set, test_set, triple, head_candidates, False)
                    k_head_tree.add_values(rule.get_applied_confidence(), f_head_candidates)
                else:
                    break
        k_tail_candidates, k_head_candidates = {}, {}
        k_tail_tree.get_as_linked_map(k_tail_candidates)
        k_head_tree.get_as_linked_map(k_head_candidates)
        top_k_tail_candidates = self.__sort_by_value(k_tail_candidates, k)
        top_k_head_candidates = self.__sort_by_value(k_head_candidates, k)
        counter += 1
        # candidates are written on a worker thread, but the immediate join()
        # makes this effectively synchronous
        writer = threading.Thread(
            target=self.__process_write_top_k_candidates,
            args=(triple, test_set, top_k_tail_candidates, top_k_head_candidates))
        writer.start()
        writer.join()
    print('* done with rule application')
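# Illustrative sketch (not the actual ScoreTree implementation): the candidate
# aggregation above keeps, for each entity, the confidence of the best rule
# that predicted it, and returns the k highest-scored entities. ScoreTree
# additionally breaks ties using the next-best rules, which this plain-dict
# version omits. `rule_predictions` is a hypothetical input.
def _top_k_by_max_confidence(rule_predictions, k):
    """rule_predictions: iterable of (confidence, candidate_entities) pairs,
    ordered by descending rule confidence."""
    scores = {}
    for confidence, candidates in rule_predictions:
        for entity in candidates:
            # the first (highest-confidence) rule wins; later rules cannot override
            scores.setdefault(entity, confidence)
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:k]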