def print_parameter(self, file=None):
    parameters = self.__dict__
    print_time_info('Parameter settings:', dash_top=True, file=file)
    for key, value in parameters.items():
        if type(value) in {int, float, str, bool}:
            print('\t%s:' % key, value, file=file)
    print('---------------------------------------', file=file)
def save_sim_matrix(self, device):
    # Get the similarity matrix of the current model
    self.gnn_channel.eval()
    sim_train = self.gnn_channel.predict(
        self.loaded_data.train_sr_ent_seeds_ori, self.loaded_data.train_tg_ent_seeds_ori)
    sim_valid = self.gnn_channel.predict(
        self.loaded_data.valid_sr_ent_seeds, self.loaded_data.valid_tg_ent_seeds)
    sim_test = self.gnn_channel.predict(
        self.loaded_data.test_sr_ent_seeds, self.loaded_data.test_tg_ent_seeds)
    get_hits(sim_test, print_info=True, device=device)
    print_time_info('Best result on the test set', dash_top=True)
    sim_train = sim_train.cpu().numpy()
    sim_valid = sim_valid.cpu().numpy()
    sim_test = sim_test.cpu().numpy()

    def save_sim(sim, comment):
        if sim.shape[0] > 20000:
            partial_sim = sort_and_keep_indices(sim, device)
            partial_sim_t = sort_and_keep_indices(sim.T, device)
            np.save(str(self.log_dir / ('%s_sim.npy' % comment)), partial_sim)
            np.save(str(self.log_dir / ('%s_sim_t.npy' % comment)), partial_sim_t)
        else:
            np.save(str(self.log_dir / ('%s_sim.npy' % comment)), sim)

    save_sim(sim_train, 'train')
    save_sim(sim_valid, 'valid')
    save_sim(sim_test, 'test')
    print_time_info(
        "Model configs and predictions saved to directory: %s." % str(self.log_dir))
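# A minimal sketch of reading back the predictions written by save_sim_matrix().
# For graphs with <= 20000 source entities the file holds the dense similarity
# matrix; for larger graphs it holds the partial representation produced by
# sort_and_keep_indices(), which load_partial_sim() (used further below) decodes.
# The log-directory name here is illustrative, not a path guaranteed by the repo.
# sim_test = np.load('./log/grid_search_Literal_dbp_wd/test_sim.npy')
# print('test similarity shape:', sim_test.shape)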
def save_model(self):
    save_path = self.log_dir / 'model.pt'
    state_dict = self.gnn_channel.state_dict()
    state_dict = OrderedDict(
        filter(lambda x: x[1].layout != torch.sparse_coo, state_dict.items()))
    torch.save(state_dict, str(save_path))
    print_time_info("Model is saved to directory: %s." % str(self.log_dir))
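# A hypothetical companion method for restoring the checkpoint written by
# save_model(); it is only a sketch and not part of the repo. Because the sparse
# tensors were filtered out before saving, the state dict is partial and has to be
# loaded with strict=False.
def load_model(self, device):
    save_path = self.log_dir / 'model.pt'
    state_dict = torch.load(str(save_path), map_location=device)
    self.gnn_channel.load_state_dict(state_dict, strict=False)
    self.gnn_channel.eval()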
def init_log(self, log_dir):
    log_dir = Path(log_dir)
    self.log_dir = log_dir
    if log_dir.exists():
        rmtree(str(log_dir), ignore_errors=True)
        print_time_info("Warning! Forcibly removed directory %s." % (str(log_dir)))
    log_dir.mkdir()
    comment = log_dir.name
    with open(log_dir / 'parameters.txt', 'w') as f:
        print_time_info(comment, file=f)
        self.print_parameter(f)
def performance_svc(train_data, train_label, test_data, test_label, test_sims, device, C):
    clf = SVC(kernel='linear', C=C, gamma='auto')
    clf.fit(train_data, train_label)
    prediction = clf.predict(test_data)
    print_time_info('Classification accuracy: %f.' %
                    (np.sum(prediction == test_label) / len(test_label)))
    weight = clf.coef_.reshape(-1, 1)  # shape = [sim_num, 1]
    test_sims = ensemble_sims_with_weight(test_sims, weight)
    top_lr, top_rl, mr_lr, mr_rl, mrr_lr, mrr_rl = get_hits(
        test_sims, print_info=False, device=device)
    top1 = (top_lr[0] + top_rl[0]) / 2
    return top1, weight
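# A self-contained sketch (synthetic data only) of the idea behind performance_svc:
# fit a linear SVC on per-channel similarity features and read the learned channel
# weights from clf.coef_. The array shapes and the 1:4 positive/negative ratio are
# illustrative assumptions, not the repo's actual sampling scheme.
def _svc_weight_demo():
    rng = np.random.RandomState(0)
    sim_num = 4                             # one feature per similarity channel
    pos = rng.rand(1000, sim_num) + 0.5     # aligned pairs: higher similarities
    neg = rng.rand(4000, sim_num)           # random, mostly unaligned pairs
    data = np.concatenate([pos, neg], axis=0)
    label = np.concatenate([np.ones(len(pos)), np.zeros(len(neg))])
    clf = SVC(kernel='linear', C=1e-5, gamma='auto')
    clf.fit(data, label)
    weight = clf.coef_.reshape(-1, 1)       # shape = [sim_num, 1]
    print('learned channel weights:', weight.reshape(-1))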
def load_value(self, value_seqs, value_embedding_cache_path, id2value_cache_path):
    if value_embedding_cache_path.exists() and id2value_cache_path.exists():
        value_embedding = np.load(value_embedding_cache_path)
        with open(id2value_cache_path, 'r', encoding='utf8', errors='ignore') as f:
            id2value = json.load(f)
        print_time_info("Loaded value embedding from %s." % value_embedding_cache_path)
        print_time_info("Loaded values from %s." % id2value_cache_path)
    else:
        value_embedding, id2value = self.encode_value(value_seqs)
        np.save(value_embedding_cache_path, value_embedding)
        with open(id2value_cache_path, 'w', encoding='utf8', errors='ignore') as f:
            json.dump(id2value, f, ensure_ascii=False)
    assert len(value_embedding) == len(id2value)
    return value_embedding, id2value
def read_file(path, parse_func):
    num = -1
    with open(path, 'r', encoding='utf8') as f:
        line = f.readline().strip()
        if line.isdigit():
            num = int(line)
        else:
            f.seek(0)
        lines = f.readlines()
    lines = parse_func(lines)
    if len(lines) != num and num >= 0:
        print_time_info('File %s is corrupted, data_num: %d/%d.' % (path, num, len(lines)))
        raise ValueError()
    return lines
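# A hypothetical parse_func for read_file(). The actual on-disk format is defined
# elsewhere in the repo, so this simply assumes tab-separated integer pairs, one
# pair per line; the path in the commented call is likewise illustrative.
def parse_id_pairs(lines):
    return [tuple(int(x) for x in line.strip().split('\t')) for line in lines]

# seeds = read_file('./bin/DWY100k/dbp_wd/entity_seeds.txt', parse_id_pairs)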
def partial_get_hits(sim, top_k=(1, 10), kg='source', print_info=True):
    if isinstance(sim, np.ndarray):
        sim = torch.from_numpy(sim)
    top_lr, mr_lr, mrr_lr = topk(sim, top_k, device=device)
    if print_info:
        print_time_info('For each %s:' % kg, dash_top=True)
        print_time_info('MR: %.2f; MRR: %.2f%%.' % (mr_lr, mrr_lr))
        for i in range(len(top_lr)):
            print_time_info('Hits@%d: %.2f%%' % (top_k[i], top_lr[i]))
    return top_lr, mr_lr, mrr_lr
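# Illustrative call, assuming `topk` and `device` are available in the enclosing
# module as above; the 100x100 similarity matrix is synthetic.
# top_lr, mr_lr, mrr_lr = partial_get_hits(np.random.rand(100, 100), top_k=(1, 10))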
def _load_seeds(directory, train_seeds_ratio, load_new_seed_split):
    train_data_path = directory / 'train_entity_seeds.txt'
    valid_data_path = directory / 'valid_entity_seeds.txt'
    test_data_path = directory / 'test_entity_seeds.txt'
    entity_seeds = read_seeds(directory / 'entity_seeds.txt')
    if load_new_seed_split:
        train_data_path = directory / 'hard_split' / 'train_entity_seeds.txt'
        valid_data_path = directory / 'hard_split' / 'valid_entity_seeds.txt'
        test_data_path = directory / 'hard_split' / 'test_entity_seeds.txt'
        print_time_info("Loading the adversarially split train/valid/test set from %s." % str(directory / 'hard_split'))
        train_entity_seeds = read_seeds(train_data_path)
        valid_entity_seeds = read_seeds(valid_data_path)
        test_entity_seeds = read_seeds(test_data_path)
    elif train_data_path.exists() and valid_data_path.exists() and test_data_path.exists():
        print_time_info("Loading the pre-split train/valid/test set from %s." % str(directory))
        train_entity_seeds = read_seeds(train_data_path)
        valid_entity_seeds = read_seeds(valid_data_path)
        test_entity_seeds = read_seeds(test_data_path)
    else:
        test_sr_ids_path = directory / ('test_sr_ids_%d.txt' % int(train_seeds_ratio * 100))
        if not test_sr_ids_path.exists():
            print_time_info("Randomly splitting the train/valid set from %s." % str(directory))
            tmp_entity_seeds = [seed for seed in entity_seeds]
            random.shuffle(tmp_entity_seeds)
            train_entity_seeds = tmp_entity_seeds[:int(len(entity_seeds) * train_seeds_ratio)]
            valid_entity_seeds = tmp_entity_seeds[int(len(entity_seeds) * train_seeds_ratio):]
            test_entity_seeds = valid_entity_seeds
            test_sr_ent_set = set(x[0] for x in test_entity_seeds)
            with open(test_sr_ids_path, 'w', encoding='utf8') as f:
                for idx in test_sr_ent_set:
                    f.write(str(idx) + '\n')
        else:
            print_time_info('Loading the previously randomly split data set.')
            with open(test_sr_ids_path, 'r', encoding='utf8') as f:
                test_sr_ent_set = [int(line.strip()) for line in f.readlines()]
            test_sr_ent_set = set(test_sr_ent_set)
            train_entity_seeds = [seed for seed in entity_seeds if seed[0] not in test_sr_ent_set]
            valid_entity_seeds = [seed for seed in entity_seeds if seed[0] in test_sr_ent_set]
            test_entity_seeds = valid_entity_seeds
    return train_entity_seeds, valid_entity_seeds, test_entity_seeds, entity_seeds
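# Directory layout expected by _load_seeds(), summarised from the paths above; the
# seed-file format itself is defined by read_seeds() elsewhere in the repo.
# <directory>/entity_seeds.txt                      full seed list (always read)
# <directory>/train|valid|test_entity_seeds.txt     optional pre-split seeds
# <directory>/hard_split/*_entity_seeds.txt         used when load_new_seed_split=True
# <directory>/test_sr_ids_<ratio>.txt               cached random split of source ids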
def grid_search(
        log_comment, data_set, layer_num, device,
        load_new_seed_split=False, save_model=False,
        l2_regularization_range=(0, 1e-4, 1e-3),
        learning_rate_range=(1e-3, 4e-3, 7e-3),
):
    # attribute + GCN literal: current best hit@1 42.90 at epoch 100 with (0.006, 0, 0)
    # BERT digit channel: 17% at (0.006, 0, 0.0001)
    att_conf = AttConf()
    att_conf.set_channel(log_comment)
    att_conf.set_epoch_num(100)
    att_conf.set_nega_sample_num(25)
    att_conf.layer_num = layer_num
    att_conf.set_log_comment(log_comment)
    att_conf.set_load_new_seed_split(load_new_seed_split)
    att_conf.init('./bin/%s' % data_set, device)
    data_set = data_set.split('/')[-1]

    best_hit_1 = 0
    best_epoch_num = 0
    best_parameter = (0, 0)
    if not os.path.exists('./cache_log'):
        os.mkdir('./cache_log')
    if not os.path.exists('./log'):
        os.mkdir('./log')
    for l2 in tqdm(l2_regularization_range):
        att_conf.set_l2_regularization(l2)
        for learning_rate in learning_rate_range:
            att_conf.set_learning_rate(learning_rate)
            if layer_num == 2:
                att_conf.init_log('./cache_log/%s_%s_%s_%s' % (att_conf.log_comment, data_set, str(l2), str(learning_rate)))
            else:
                att_conf.init_log('./cache_log/%s_%s_%s_%s_%d' % (att_conf.log_comment, data_set, str(l2), str(learning_rate), layer_num))
            hit_at_1, epoch_num = att_conf.train(device)
            if hit_at_1 > best_hit_1:
                best_hit_1 = hit_at_1
                best_epoch_num = epoch_num
                best_parameter = (learning_rate, l2)
            print_time_info("Current best hit@1 %.2f at %d epoch with %s" % (best_hit_1, best_epoch_num, str(best_parameter)))
    print_time_info("The best hit@1 %.2f at %d epoch with %s" % (best_hit_1, best_epoch_num, str(best_parameter)))

    # Retrain with the best hyper-parameters and save the final predictions
    att_conf.set_learning_rate(best_parameter[0])
    att_conf.set_l2_regularization(best_parameter[1])
    if load_new_seed_split:
        if layer_num == 2:
            att_conf.init_log('./log/grid_search_hard_%s_%s' % (att_conf.log_comment, data_set))
        else:
            att_conf.init_log('./log/grid_search_hard_%s_%s_%d' % (att_conf.log_comment, data_set, layer_num))
    else:
        if layer_num == 2:
            att_conf.init_log('./log/grid_search_%s_%s' % (att_conf.log_comment, data_set))
        else:
            att_conf.init_log('./log/grid_search_%s_%s_%d' % (att_conf.log_comment, data_set, layer_num))
    att_conf.train(device)
    att_conf.save_sim_matrix(device)
    if save_model:
        att_conf.save_model()
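# Illustrative invocation of the grid search. The channel name, dataset path, and
# device string follow the patterns used above ('./bin/<data_set>', 'DWY100k/...'),
# but they are assumptions about how the script is actually launched.
# grid_search('Literal', 'DWY100k/dbp_wd', layer_num=2, device='cuda:0',
#             load_new_seed_split=False, save_model=True)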
def init(self, directory, device):
    set_random_seed()
    self.directory = Path(directory)
    self.loaded_data = LoadData(
        self.train_seeds_ratio, self.directory, self.nega_sample_num,
        name_channel=self.name_channel,
        attribute_channel=self.attribute_value_channel,
        digit_literal_channel=self.digit_attribute_channel or self.literal_attribute_channel,
        load_new_seed_split=self.load_new_seed_split, device=device)
    self.sr_ent_num = self.loaded_data.sr_ent_num
    self.tg_ent_num = self.loaded_data.tg_ent_num
    self.att_num = self.loaded_data.att_num

    # Init the graph adjacency matrix
    print_time_info('Begin preprocessing the adjacency matrix')
    self.channels = {}
    edges_sr = torch.tensor(self.loaded_data.triples_sr)[:, :2]
    edges_tg = torch.tensor(self.loaded_data.triples_tg)[:, :2]
    edges_sr = torch.unique(edges_sr, dim=0)
    edges_tg = torch.unique(edges_tg, dim=0)
    if self.name_channel:
        self.channels['name'] = {
            'edges_sr': edges_sr,
            'edges_tg': edges_tg,
            'sr_ent_embed': self.loaded_data.sr_embed,
            'tg_ent_embed': self.loaded_data.tg_embed,
        }
    if self.structure_channel:
        self.channels['structure'] = {'edges_sr': edges_sr, 'edges_tg': edges_tg}
    if self.attribute_value_channel:
        self.channels['attribute'] = {
            'edges_sr': edges_sr,
            'edges_tg': edges_tg,
            'att_num': self.loaded_data.att_num,
            'attribute_triples_sr': self.loaded_data.attribute_triples_sr,
            'attribute_triples_tg': self.loaded_data.attribute_triples_tg,
            'value_embedding': self.loaded_data.value_embedding,
        }
    if self.literal_attribute_channel:
        self.channels['attribute'] = {
            'edges_sr': edges_sr,
            'edges_tg': edges_tg,
            'att_num': self.loaded_data.literal_att_num,
            'attribute_triples_sr': self.loaded_data.literal_triples_sr,
            'attribute_triples_tg': self.loaded_data.literal_triples_tg,
            'value_embedding': self.loaded_data.literal_value_embedding,
        }
    if self.digit_attribute_channel:
        self.channels['attribute'] = {
            'edges_sr': edges_sr,
            'edges_tg': edges_tg,
            'att_num': self.loaded_data.digit_att_num,
            'attribute_triples_sr': self.loaded_data.digital_triples_sr,
            'attribute_triples_tg': self.loaded_data.digital_triples_tg,
            'value_embedding': self.loaded_data.digit_value_embedding,
        }
    print_time_info('Finished preprocessing the adjacency matrix')
def ensemble_partial_sim_matrix(data_set, svm=False, device='cpu'):
    def partial_get_hits(sim, top_k=(1, 10), kg='source', print_info=True):
        if isinstance(sim, np.ndarray):
            sim = torch.from_numpy(sim)
        top_lr, mr_lr, mrr_lr = topk(sim, top_k, device=device)
        if print_info:
            print_time_info('For each %s:' % kg, dash_top=True)
            print_time_info('MR: %.2f; MRR: %.2f%%.' % (mr_lr, mrr_lr))
            for i in range(len(top_lr)):
                print_time_info('Hits@%d: %.2f%%' % (top_k[i], top_lr[i]))
        return top_lr, mr_lr, mrr_lr

    def load_partial_sim_list(sim_path_list):
        sim = None
        shape = None
        for sim_path in tqdm(sim_path_list):
            target, sim_matrix_shape = load_partial_sim(sim_path)
            if shape is None:
                shape = sim_matrix_shape
            else:
                assert shape == sim_matrix_shape
            if sim is not None:
                assert sim.shape == target.shape
                sim = sim + target
            else:
                sim = target
        sim = sim / len(sim_path_list)
        return sim

    data_set = data_set.split('DWY100k/')[1]
    # Init the sim lists
    model_name_list = ['Literal', 'Structure', 'Digital', 'Name']
    sim_path_list = [
        "./log/grid_search_%s_%s/test_sim.npy" % (model, data_set)
        for model in model_name_list
    ]
    sim_t_path_list = [
        "./log/grid_search_%s_%s/test_sim_t.npy" % (model, data_set)
        for model in model_name_list
    ]
    if not svm:
        partial_get_hits(load_partial_sim_list(sim_path_list), kg='source')
        partial_get_hits(load_partial_sim_list(sim_t_path_list), kg='target')
        print_time_info('-------------------------------------')
        return

    def svm_ensemble(train_sim_path_list, valid_sim_path_list, test_sim_path_list, T=False):
        positive_data = []  # shape = [sim_num, size]
        negative_data = []  # shape = [sim_num, size * ratio]
        sim_num = len(train_sim_path_list)
        size = 30000
        negative_indice = np.random.randint(low=0, high=size, size=(4 * sim_num * size, 2))
        negative_indice = [(x, y) for x, y in negative_indice if x != y]
        for sim_path in tqdm(train_sim_path_list, desc='Load train sims'):
            sim, _ = load_partial_sim(sim_path)
            assert size == sim.shape[0]
            positive_data.append([sim[i, i] for i in range(size)])
            negative_data.append([sim[x, y] for x, y in negative_indice])
        positive_data = np.asarray(positive_data).T  # shape = [size, sim_num]
        negative_data = np.asarray(negative_data).T  # shape = [size * ratio, sim_num]
        print(positive_data.shape)
        print(negative_data.shape)

        valid_sims = []
        for sim_path in tqdm(valid_sim_path_list, desc='Load valid sims'):
            sim = np.load(sim_path)
            if T:
                sim = sim.T
            valid_sims.append(np.expand_dims(sim, -1))
        valid_sims = np.concatenate(valid_sims, axis=-1)  # shape = [size, size, sim_num]

        data = np.concatenate([positive_data, negative_data], axis=0)
        label = [1 for _ in range(len(positive_data))] + [0 for _ in range(len(negative_data))]
        label = np.asarray(label)

        C_range = [1e-6, 1e-5]  # [1e-1, 1, 10, 1000]
        best_C = 0
        best_top1 = 0
        best_weight = None
        for C in tqdm(C_range, desc='Fitting SVM'):
            clf = SVC(kernel='linear', C=C, gamma='auto')
            clf.fit(data, label)
            weight = clf.coef_.reshape(-1, 1)
            tmp_valid_sims = np.dot(valid_sims, weight)
            tmp_valid_sims = np.squeeze(tmp_valid_sims, axis=-1)
            top_lr, mr_lr, mrr_lr = partial_get_hits(-tmp_valid_sims, print_info=False)
            top1 = top_lr[0]
            if top1 > best_top1:
                best_top1 = top1
                best_weight = weight
                best_C = C
        print('Best C=%f' % best_C)
        print('Best weight', best_weight.reshape(-1))

        target_sim = None
        for idx, sim_path in tqdm(enumerate(test_sim_path_list), desc='Testing'):
            if target_sim is None:
                target_sim = best_weight[idx][0] * load_partial_sim(sim_path)[0]
            else:
                target_sim += best_weight[idx][0] * load_partial_sim(sim_path)[0]
        kg = 'source' if not T else 'target'
        partial_get_hits(-target_sim, kg=kg)

    train_sim_path_list = [
        "./log/grid_search_%s_%s/train_sim.npy" % (model, data_set)
        for model in model_name_list
    ]
    train_sim_t_path_list = [
        "./log/grid_search_%s_%s/train_sim_t.npy" % (model, data_set)
        for model in model_name_list
    ]
    valid_sim_path_list = [
        "./log/grid_search_%s_%s/valid_sim.npy" % (model, data_set)
        for model in model_name_list
    ]
    test_sim_path_list = [
        "./log/grid_search_%s_%s/test_sim.npy" % (model, data_set)
        for model in model_name_list
    ]
    test_sim_t_path_list = [
        "./log/grid_search_%s_%s/test_sim_t.npy" % (model, data_set)
        for model in model_name_list
    ]
    svm_ensemble(train_sim_path_list, valid_sim_path_list, test_sim_path_list, T=False)
    svm_ensemble(train_sim_t_path_list, valid_sim_path_list, test_sim_t_path_list, T=True)
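# Illustrative usage of the ensemble: average the per-channel test similarities
# (svm=False) or learn per-channel SVM weights first (svm=True). The dataset name
# is an assumption following the 'DWY100k/...' convention used above.
# ensemble_partial_sim_matrix('DWY100k/dbp_wd', svm=False, device='cpu')
# ensemble_partial_sim_matrix('DWY100k/dbp_wd', svm=True, device='cpu')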