Example #1
 def print_parameter(self, file=None):
     parameters = self.__dict__
     print_time_info('Parameter settings:', dash_top=True, file=file)
     for key, value in parameters.items():
         if type(value) in {int, float, str, bool}:
             print('\t%s:' % key, value, file=file)
     print('---------------------------------------', file=file)
Example #2
    def save_sim_matrix(self, device):
        # Get the similarity matrix of the current model
        self.gnn_channel.eval()
        sim_train = self.gnn_channel.predict(
            self.loaded_data.train_sr_ent_seeds_ori,
            self.loaded_data.train_tg_ent_seeds_ori)
        sim_valid = self.gnn_channel.predict(
            self.loaded_data.valid_sr_ent_seeds,
            self.loaded_data.valid_tg_ent_seeds)
        sim_test = self.gnn_channel.predict(self.loaded_data.test_sr_ent_seeds,
                                            self.loaded_data.test_tg_ent_seeds)
        get_hits(sim_test, print_info=True, device=device)
        print_time_info('Best result on the test set', dash_top=True)
        sim_train = sim_train.cpu().numpy()
        sim_valid = sim_valid.cpu().numpy()
        sim_test = sim_test.cpu().numpy()

        def save_sim(sim, comment):
            if sim.shape[0] > 20000:
                partial_sim = sort_and_keep_indices(sim, device)
                partial_sim_t = sort_and_keep_indices(sim.T, device)
                np.save(str(self.log_dir / ('%s_sim.npy' % comment)),
                        partial_sim)
                np.save(str(self.log_dir / ('%s_sim_t.npy' % comment)),
                        partial_sim_t)
            else:
                np.save(str(self.log_dir / ('%s_sim.npy' % comment)), sim)

        save_sim(sim_train, 'train')
        save_sim(sim_valid, 'valid')
        save_sim(sim_test, 'test')
        print_time_info(
            "Model configs and predictions saved to directory: %s." %
            str(self.log_dir))
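
For matrices with more than 20,000 rows, only a sorted partial view is persisted via the repo helper sort_and_keep_indices, which is not shown in this section. The sketch below is one plausible reading of it (keep the top-k column indices and scores per row); the k=500 cutoff and the unused device argument are assumptions, not the repository's actual implementation.

import numpy as np

def sort_and_keep_indices_sketch(sim, device, k=500):
    # Hypothetical stand-in for sort_and_keep_indices: for every row, keep
    # the indices of the k largest entries together with their scores, so an
    # N x M dense matrix shrinks to two N x k arrays. `device` is accepted
    # only to mirror the call site; this numpy sketch does not use it.
    top_idx = np.argsort(-sim, axis=1)[:, :k]           # k best columns per row
    top_val = np.take_along_axis(sim, top_idx, axis=1)  # their similarity scores
    return np.stack([top_idx.astype(sim.dtype), top_val], axis=-1)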
Example #3
 def save_model(self):
     save_path = self.log_dir / 'model.pt'
     state_dict = self.gnn_channel.state_dict()
     state_dict = OrderedDict(
         filter(lambda x: x[1].layout != torch.sparse_coo,
                state_dict.items()))
     torch.save(state_dict, str(save_path))
     print_time_info("Model is saved to directory: %s." % str(self.log_dir))
Example #4
 def init_log(self, log_dir):
     log_dir = Path(log_dir)
     self.log_dir = log_dir
     if log_dir.exists():
         rmtree(str(log_dir), ignore_errors=True)
         print_time_info("Warning! Forced remove directory %s." %
                         (str(log_dir)))
     log_dir.mkdir()
     comment = log_dir.name
     with open(log_dir / 'parameters.txt', 'w') as f:
         print_time_info(comment, file=f)
         self.print_parameter(f)
Example #5
 def performance_svc(train_data, train_label, test_data, test_label,
                     test_sims, device, C):
     clf = SVC(kernel='linear', C=C, gamma='auto')
     clf.fit(train_data, train_label)
     prediction = clf.predict(test_data)
     print_time_info('Classification accuracy: %f.' %
                     (np.sum(prediction == test_label) / len(test_label)))
     weight = clf.coef_.reshape(-1, 1)  # shape = [sim_num, 1]
     test_sims = ensemble_sims_with_weight(test_sims, weight)
     top_lr, top_rl, mr_lr, mr_rl, mrr_lr, mrr_rl = get_hits(
         test_sims, print_info=False, device=device)
     top1 = (top_lr[0] + top_rl[0]) / 2
     return top1, weight
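
ensemble_sims_with_weight is a repo helper not shown in this section. A plausible sketch, consistent with how the SVM weight is applied to valid_sims in Example #12 (stack the per-channel matrices, then take a weighted sum over the channel axis); treat it as an assumption rather than the repository's actual code:

import numpy as np

def ensemble_sims_with_weight_sketch(sims, weight):
    # sims: list of [n_sr, n_tg] per-channel similarity matrices;
    # weight: [sim_num, 1] linear-SVM coefficients.
    stacked = np.stack(sims, axis=-1)             # shape = [n_sr, n_tg, sim_num]
    return np.squeeze(stacked @ weight, axis=-1)  # shape = [n_sr, n_tg]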
Example #6
 def load_value(self, value_seqs, value_embedding_cache_path, id2value_cache_path):
     if value_embedding_cache_path.exists() and id2value_cache_path.exists():
         value_embedding = np.load(value_embedding_cache_path)
         with open(id2value_cache_path, 'r', encoding='utf8', errors='ignore') as f:
             id2value = json.load(f)
         print_time_info("Loaded value embedding from %s." % value_embedding_cache_path)
         print_time_info("Loaded values from %s." % id2value_cache_path)
     else:
         value_embedding, id2value = self.encode_value(value_seqs)
         np.save(value_embedding_cache_path, value_embedding)
         with open(id2value_cache_path, 'w', encoding='utf8', errors='ignore') as f:
             json.dump(id2value, f, ensure_ascii=False)
     assert len(value_embedding) == len(id2value)
     return value_embedding, id2value
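
A usage sketch (the cache paths and the loader instance are illustrative): the first call encodes the value sequences and fills the cache; later calls load straight from the .npy/.json files.

from pathlib import Path

cache_dir = Path('./cache')
value_embedding, id2value = loader.load_value(
    value_seqs,
    cache_dir / 'value_embedding.npy',
    cache_dir / 'id2value.json')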
Example #7
def read_file(path, parse_func):
    num = -1
    with open(path, 'r', encoding='utf8') as f:
        line = f.readline().strip()
        if line.isdigit():
            num = int(line)
        else:
            f.seek(0)
        lines = f.readlines()

    lines = parse_func(lines)

    if len(lines) != num and num >= 0:
        print_time_info('File: %s is corrupted, data_num: %d/%d.' %
                        (path, num, len(lines)))
        raise ValueError()
    return lines
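
read_file delegates all parsing to parse_func, so a usage sketch needs one. Assuming a tab-separated pair file such as the seed files in Example #9 (the file name is illustrative):

def parse_pairs(lines):
    # Each line holds one tab-separated (source_id, target_id) pair.
    return [tuple(int(x) for x in line.strip().split('\t')) for line in lines]

seeds = read_file('entity_seeds.txt', parse_pairs)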
Example #8
 def partial_get_hits(sim, top_k=(1, 10), kg='source', print_info=True,
                      device='cpu'):
     if isinstance(sim, np.ndarray):
         sim = torch.from_numpy(sim)
     top_lr, mr_lr, mrr_lr = topk(sim, top_k, device=device)
     if print_info:
         print_time_info('For each %s:' % kg, dash_top=True)
         print_time_info('MR: %.2f; MRR: %.2f%%.' % (mr_lr, mrr_lr))
         for i in range(len(top_lr)):
             print_time_info('Hits@%d: %.2f%%' % (top_k[i], top_lr[i]))
     return top_lr, mr_lr, mrr_lr
Example #9
def _load_seeds(directory, train_seeds_ratio, load_new_seed_split):
    train_data_path = directory / 'train_entity_seeds.txt'
    valid_data_path = directory / 'valid_entity_seeds.txt'
    test_data_path = directory / 'test_entity_seeds.txt'
    entity_seeds = read_seeds(directory / 'entity_seeds.txt')
    if load_new_seed_split:
        train_data_path = directory / 'hard_split' / 'train_entity_seeds.txt'
        valid_data_path = directory / 'hard_split' / 'valid_entity_seeds.txt'
        test_data_path = directory / 'hard_split' / 'test_entity_seeds.txt'
        print_time_info("Loading adversarially-splitted train/valid/test set from %s." % str(directory / 'hard_split'))
        train_entity_seeds = read_seeds(train_data_path)
        valid_entity_seeds = read_seeds(valid_data_path)
        test_entity_seeds = read_seeds(test_data_path)
    elif train_data_path.exists() and valid_data_path.exists() and test_data_path.exists():
        print_time_info("Loading pre-splitted train/valid/test set from %s." % str(directory))
        train_entity_seeds = read_seeds(train_data_path)
        valid_entity_seeds = read_seeds(valid_data_path)
        test_entity_seeds = read_seeds(test_data_path)
    else:
        test_sr_ids_path = directory / ('test_sr_ids_%d.txt' % int(train_seeds_ratio * 100))
        if not test_sr_ids_path.exists():
            print_time_info("Randomly split train/valid set from %s." % str(directory))
            tmp_entity_seeds = list(entity_seeds)
            random.shuffle(tmp_entity_seeds)
            train_entity_seeds = tmp_entity_seeds[:int(len(entity_seeds) * train_seeds_ratio)]
            valid_entity_seeds = tmp_entity_seeds[int(len(entity_seeds) * train_seeds_ratio):]
            test_entity_seeds = valid_entity_seeds
            test_sr_ent_set = set(x[0] for x in test_entity_seeds)
            with open(test_sr_ids_path, 'w', encoding='utf8') as f:
                for idx in test_sr_ent_set:
                    f.write(str(idx) + '\n')
        else:
            print_time_info('Loading previously randomly split data set.')
            with open(test_sr_ids_path, 'r', encoding='utf8') as f:
                test_sr_ent_set = [int(line.strip()) for line in f.readlines()]
                test_sr_ent_set = set(test_sr_ent_set)
            train_entity_seeds = [seed for seed in entity_seeds if seed[0] not in test_sr_ent_set]
            valid_entity_seeds = [seed for seed in entity_seeds if seed[0] in test_sr_ent_set]
            test_entity_seeds = valid_entity_seeds
    return train_entity_seeds, valid_entity_seeds, test_entity_seeds, entity_seeds
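
read_seeds is another repo helper that is not shown here. A minimal sketch built on read_file from Example #7, assuming the tab-separated pair format sketched there:

def read_seeds(path):
    return read_file(path, lambda lines: [
        tuple(int(x) for x in line.strip().split('\t')) for line in lines])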
Example #10
def grid_search(
        log_comment,
        data_set,
        layer_num,
        device,
        load_new_seed_split=False,
        save_model=False,
        l2_regularization_range=(0, 1e-4, 1e-3),
        learning_rate_range=(1e-3, 4e-3, 7e-3),
):
    # attribute + gcn literal:  Current best hit@1 42.90 at 100 epoch with (0.006, 0, 0)
    # BERT digit channel, 17% at (0.006, 0, 0.0001)

    att_conf = AttConf()
    att_conf.set_channel(log_comment)
    att_conf.set_epoch_num(100)
    att_conf.set_nega_sample_num(25)
    att_conf.layer_num = layer_num
    att_conf.set_log_comment(log_comment)
    att_conf.set_load_new_seed_split(load_new_seed_split)
    att_conf.init('./bin/%s' % data_set, device)

    data_set = data_set.split('/')[-1]
    best_hit_1 = 0
    best_epoch_num = 0
    best_parameter = (0, 0)
    if not os.path.exists('./cache_log'):
        os.mkdir('./cache_log')
    if not os.path.exists('./log'):
        os.mkdir('./log')

    for l2 in tqdm(l2_regularization_range):
        att_conf.set_l2_regularization(l2)
        for learning_rate in learning_rate_range:
            att_conf.set_learning_rate(learning_rate)
            if layer_num == 2:
                att_conf.init_log('./cache_log/%s_%s_%s_%s' %
                                  (att_conf.log_comment, data_set, str(l2),
                                   str(learning_rate)))
            else:
                att_conf.init_log('./cache_log/%s_%s_%s_%s_%d' %
                                  (att_conf.log_comment, data_set, str(l2),
                                   str(learning_rate), layer_num))
            hit_at_1, epoch_num = att_conf.train(device)
            if hit_at_1 > best_hit_1:
                best_hit_1 = hit_at_1
                best_epoch_num = epoch_num
                best_parameter = (learning_rate, l2)
        print_time_info("Current best hit@1 %.2f at %d epoch with %s" %
                        (best_hit_1, best_epoch_num, str(best_parameter)))
    print_time_info("The best hit@1 %.2f at %d epoch with %s" %
                    (best_hit_1, best_epoch_num, str(best_parameter)))
    att_conf.set_learning_rate(best_parameter[0])
    att_conf.set_l2_regularization(best_parameter[1])
    if load_new_seed_split:
        if layer_num == 2:
            att_conf.init_log('./log/grid_search_hard_%s_%s' %
                              (att_conf.log_comment, data_set))
        else:
            att_conf.init_log('./log/grid_search_hard_%s_%s_%d' %
                              (att_conf.log_comment, data_set, layer_num))
    else:
        if layer_num == 2:
            att_conf.init_log('./log/grid_search_%s_%s' %
                              (att_conf.log_comment, data_set))
        else:
            att_conf.init_log('./log/grid_search_%s_%s_%d' %
                              (att_conf.log_comment, data_set, layer_num))
    att_conf.train(device)
    att_conf.save_sim_matrix(device)
    if save_model:
        att_conf.save_model()
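
A call sketch (the channel name and dataset path are illustrative: 'Literal' matches the channel names used in Example #12, and the dataset must live under ./bin/ because of att_conf.init above):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
grid_search('Literal', 'DWY100k/dbp_wd', layer_num=2, device=device,
            load_new_seed_split=False, save_model=True)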
Example #11
    def init(self, directory, device):
        set_random_seed()
        self.directory = Path(directory)
        self.loaded_data = LoadData(
            self.train_seeds_ratio,
            self.directory,
            self.nega_sample_num,
            name_channel=self.name_channel,
            attribute_channel=self.attribute_value_channel,
            digit_literal_channel=self.digit_attribute_channel
            or self.literal_attribute_channel,
            load_new_seed_split=self.load_new_seed_split,
            device=device)
        self.sr_ent_num = self.loaded_data.sr_ent_num
        self.tg_ent_num = self.loaded_data.tg_ent_num
        self.att_num = self.loaded_data.att_num

        # Init graph adjacency matrix
        print_time_info('Begin preprocessing adjacency matrix')
        self.channels = {}

        edges_sr = torch.tensor(self.loaded_data.triples_sr)[:, :2]
        edges_tg = torch.tensor(self.loaded_data.triples_tg)[:, :2]
        edges_sr = torch.unique(edges_sr, dim=0)
        edges_tg = torch.unique(edges_tg, dim=0)

        if self.name_channel:
            self.channels['name'] = {
                'edges_sr': edges_sr,
                'edges_tg': edges_tg,
                'sr_ent_embed': self.loaded_data.sr_embed,
                'tg_ent_embed': self.loaded_data.tg_embed,
            }
        if self.structure_channel:
            self.channels['structure'] = {
                'edges_sr': edges_sr,
                'edges_tg': edges_tg
            }
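        # Note: the three attribute variants below all write to the same
        # 'attribute' key, so at most one of them should be enabled at a time.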
        if self.attribute_value_channel:
            self.channels['attribute'] = {
                'edges_sr': edges_sr,
                'edges_tg': edges_tg,
                'att_num': self.loaded_data.att_num,
                'attribute_triples_sr': self.loaded_data.attribute_triples_sr,
                'attribute_triples_tg': self.loaded_data.attribute_triples_tg,
                'value_embedding': self.loaded_data.value_embedding
            }
        if self.literal_attribute_channel:
            self.channels['attribute'] = {
                'edges_sr': edges_sr,
                'edges_tg': edges_tg,
                'att_num': self.loaded_data.literal_att_num,
                'attribute_triples_sr': self.loaded_data.literal_triples_sr,
                'attribute_triples_tg': self.loaded_data.literal_triples_tg,
                'value_embedding': self.loaded_data.literal_value_embedding
            }
        if self.digit_attribute_channel:
            self.channels['attribute'] = {
                'edges_sr': edges_sr,
                'edges_tg': edges_tg,
                'att_num': self.loaded_data.digit_att_num,
                'attribute_triples_sr': self.loaded_data.digital_triples_sr,
                'attribute_triples_tg': self.loaded_data.digital_triples_tg,
                'value_embedding': self.loaded_data.digit_value_embedding
            }
        print_time_info('Finished preprocessing adjacency matrix')
Example #12
def ensemble_partial_sim_matrix(data_set, svm=False, device='cpu'):
    def partial_get_hits(sim, top_k=(1, 10), kg='source', print_info=True):
        if isinstance(sim, np.ndarray):
            sim = torch.from_numpy(sim)
        top_lr, mr_lr, mrr_lr = topk(sim, top_k, device=device)
        if print_info:
            print_time_info('For each %s:' % kg, dash_top=True)
            print_time_info('MR: %.2f; MRR: %.2f%%.' % (mr_lr, mrr_lr))
            for i in range(len(top_lr)):
                print_time_info('Hits@%d: %.2f%%' % (top_k[i], top_lr[i]))
        return top_lr, mr_lr, mrr_lr

    def load_partial_sim_list(sim_path_list):
        sim = None
        shape = None
        for sim_path in tqdm(sim_path_list):
            target, sim_matrix_shape = load_partial_sim(sim_path)
            if shape is None:
                shape = sim_matrix_shape
            else:
                assert shape == sim_matrix_shape
            if sim is not None:
                assert sim.shape == target.shape
                sim = sim + target
            else:
                sim = target
        sim = sim / len(sim_path_list)
        return sim

    data_set = data_set.split('DWY100k/')[1]

    # init sim_list
    model_name_list = ['Literal', 'Structure', 'Digital', "Name"]
    sim_path_list = [
        "./log/grid_search_%s_%s/test_sim.npy" % (model, data_set)
        for model in model_name_list
    ]
    sim_t_path_list = [
        "./log/grid_search_%s_%s/test_sim_t.npy" % (model, data_set)
        for model in model_name_list
    ]
    if not svm:
        partial_get_hits(load_partial_sim_list(sim_path_list), kg='source')
        partial_get_hits(load_partial_sim_list(sim_t_path_list), kg='target')
        print_time_info('-------------------------------------')
        return

    def svm_ensemble(train_sim_path_list,
                     valid_sim_path_list,
                     test_sim_path_list,
                     T=False):
        positive_data = []  # shape = [sim_num, size]
        negative_data = []  # shape = [sim_num, size * ratio]
        sim_num = len(train_sim_path_list)

        size = 30000
        negative_indice = np.random.randint(low=0,
                                            high=size,
                                            size=(4 * sim_num * size, 2))
        negative_indice = [(x, y) for x, y in negative_indice if x != y]
        for sim_path in tqdm(train_sim_path_list, desc='Load train sims'):
            sim, _ = load_partial_sim(sim_path)
            assert size == sim.shape[0]
            positive_data.append([sim[i, i] for i in range(size)])
            negative_data.append([sim[x, y] for x, y in negative_indice])

        positive_data = np.asarray(positive_data).T  # shape = [size, sim_num]
        negative_data = np.asarray(
            negative_data).T  # shape = [size * ratio, sim_num]
        print(positive_data.shape)
        print(negative_data.shape)

        valid_sims = []
        for sim_path in tqdm(valid_sim_path_list, desc='Load valid sims'):
            sim = np.load(sim_path)
            if T:
                sim = sim.T
            valid_sims.append(np.expand_dims(sim, -1))
        valid_sims = np.concatenate(valid_sims,
                                    axis=-1)  # shape = [size, size, sim_num]

        data = np.concatenate([positive_data, negative_data], axis=0)
        label = np.asarray(
            [1] * len(positive_data) + [0] * len(negative_data))

        C_range = [1e-6, 1e-5]  #[1e-1, 1, 10, 1000]
        best_C = 0
        best_top1 = 0
        best_weight = None
        for C in tqdm(C_range, desc='Fitting SVM'):
            clf = SVC(kernel='linear', C=C, gamma='auto')
            clf.fit(data, label)
            weight = clf.coef_.reshape(-1, 1)
            tmp_valid_sims = np.dot(valid_sims, weight)
            tmp_valid_sims = np.squeeze(tmp_valid_sims, axis=-1)
            top_lr, mr_lr, mrr_lr = partial_get_hits(-tmp_valid_sims,
                                                     print_info=False)
            top1 = top_lr[0]
            if top1 > best_top1:
                best_top1 = top1
                best_weight = weight
                best_C = C
        print('Best C=%f' % best_C)
        print('Best weight', best_weight.reshape(-1))
        target_sim = None
        for idx, sim_path in tqdm(enumerate(test_sim_path_list),
                                  desc='Testing'):
            if target_sim is None:
                target_sim = best_weight[idx][0] * load_partial_sim(
                    sim_path)[0]
            else:
                target_sim += best_weight[idx][0] * load_partial_sim(
                    sim_path)[0]
        kg = 'source' if not T else 'target'
        partial_get_hits(-target_sim, kg=kg)

    train_sim_path_list = [
        "./log/grid_search_%s_%s/train_sim.npy" % (model, data_set)
        for model in model_name_list
    ]
    train_sim_t_path_list = [
        "./log/grid_search_%s_%s/train_sim_t.npy" % (model, data_set)
        for model in model_name_list
    ]
    valid_sim_path_list = [
        "./log/grid_search_%s_%s/valid_sim.npy" % (model, data_set)
        for model in model_name_list
    ]
    test_sim_path_list = [
        "./log/grid_search_%s_%s/test_sim.npy" % (model, data_set)
        for model in model_name_list
    ]
    test_sim_t_path_list = [
        "./log/grid_search_%s_%s/test_sim_t.npy" % (model, data_set)
        for model in model_name_list
    ]
    svm_ensemble(train_sim_path_list,
                 valid_sim_path_list,
                 test_sim_path_list,
                 T=False)
    svm_ensemble(train_sim_t_path_list,
                 valid_sim_path_list,
                 test_sim_t_path_list,
                 T=True)
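
A call sketch (the dataset path is illustrative; it must contain 'DWY100k/' because of the split at the top of the function):

# Plain averaging of the four channel similarity matrices:
ensemble_partial_sim_matrix('./bin/DWY100k/dbp_wd', svm=False, device='cpu')
# Linear-SVM weighted ensemble instead:
ensemble_partial_sim_matrix('./bin/DWY100k/dbp_wd', svm=True, device='cpu')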