Пример #1
0
    def re_read_context_and_negatives(self):
        """Reload the stored contexts and negatives for every name in ``self.names``.

        For each name, the four context containers at that position are
        emptied and refilled (through ``self.read_dict``) from their pickles
        under ``self.output_path``; afterwards the negatives container is
        emptied and refilled the same way.
        """
        log_text(self.log_path,
                 "...... Reading Data for Offline Batch Generation ......")

        # (attribute holding the per-name containers, pickle file template,
        #  label template handed to load_data), in reload order.
        context_sources = (
            ("context_heads", "%s_context_head.pickle",
             "self.%s_context_head"),
            ("context_head_relations", "%s_context_head_relation.pickle",
             "self.%s_context_head_relation"),
            ("context_tail_relations", "%s_context_tail_relation.pickle",
             "self.%s_context_tail_relation"),
            ("context_tails", "%s_context_tail.pickle",
             "self.%s_context_tail"),
        )
        negative_sources = (
            ("negatives", "%s_negatives.pickle", "self.%s_negatives"),
        )

        for position, name in enumerate(self.names):
            # Contexts are all cleared before any of them is reloaded; the
            # negatives are cleared and reloaded only after that.
            for group in (context_sources, negative_sources):
                for attribute, _, _ in group:
                    getattr(self, attribute)[position].clear()
                for attribute, file_template, label_template in group:
                    self.read_dict(
                        getattr(self, attribute)[position],
                        load_data(self.output_path + file_template % name,
                                  self.log_path, label_template % name))
    def re_sampling(self):
        """Drop the current entity/negative samples and draw new ones.

        Empties the entity and negatives containers at positions 0-2, then
        reruns ``context_sampling`` followed by ``negative_sampling``.
        """
        containers = ("entity_heads", "entity_head_relations",
                      "entity_tail_relations", "entity_tails", "negatives")
        for position in range(3):
            for attribute in containers:
                getattr(self, attribute)[position].clear()

        steps = (
            ("...... Context Sampling ......", "context_sampling"),
            ("...... Negative Sampling ......", "negative_sampling"),
        )
        for banner, method_name in steps:
            log_text(self.log_path, banner)
            getattr(self, method_name)()
 def read_dataset(self):
     """Read the train/valid/test triple files and pickle what was read.

     Each ``<input_path><name>.txt`` file is read line by line until EOF or
     the first empty line. The first three whitespace-separated tokens of a
     line are taken as head, relation and tail; they are appended to the
     string triples and, after id generation, to the id triples. Both
     triple dicts are dumped per split, the id maps are dumped once at the
     end, and the per-split triple counts are stored on ``self``.
     """
     splits = (
         ("train", self.string_train_triples, self.id_train_triples),
         ("valid", self.string_validate_triples, self.id_validate_triples),
         ("test", self.string_test_triples, self.id_test_triples),
     )
     triple_counts = []
     for name, string_triple, id_triple in splits:
         file_path = self.input_path + name + ".txt"
         log_text(self.log_path, "reading file %s" % file_path)
         triples_read = 0
         with open(file_path) as data_reader:
             # readline() returns "" at EOF, which ends the iteration.
             for line in iter(data_reader.readline, ""):
                 if line in ("\n", "\r\n", "\r"):
                     # An empty line terminates the file early.
                     break
                 fields = line.split()
                 head = fields[0]
                 relation = fields[1]
                 tail = fields[2]
                 string_triple["heads"].append(head)
                 string_triple["relations"].append(relation)
                 string_triple["tails"].append(tail)
                 id_triple["id_heads"].append(
                     self.entity_id_generation(head))
                 id_triple["id_relations"].append(
                     self.relation_id_generation(relation))
                 id_triple["id_tails"].append(
                     self.entity_id_generation(tail))
                 triples_read += 1
             dump_data(string_triple,
                       self.output_path + "string_%s_triples.pickle" % name,
                       self.log_path, "string_%s_triples" % name)
             dump_data(id_triple,
                       self.output_path + "id_%s_triples.pickle" % name,
                       self.log_path, "id_%s_triples" % name)
         triple_counts.append(triples_read)

     for mapping, file_name, label in (
             (self.entity2id, "entity2id.pickle", "self.entity2id"),
             (self.relation2id, "relation2id.pickle", "self.relation2id")):
         dump_data(mapping, self.output_path + file_name, self.log_path,
                   label)

     (self.num_of_train_triples, self.num_of_validate_triples,
      self.num_of_test_triples) = triple_counts
    def run_funcs(self):
        """Run the sampling pipeline, announcing each stage in the log.

        Stages, in order: ``read_data``, ``entity_classification``,
        ``context_sampling``, ``negative_sampling``; ``result_validation``
        follows only when ``self.print_results_for_validation`` is truthy.
        """
        pipeline = (
            ("...... Reading Data for Context and Negatives Sampling ......",
             "read_data"),
            ("...... Entity Classification ......", "entity_classification"),
            ("...... Context Sampling ......", "context_sampling"),
            ("...... Negative Sampling ......", "negative_sampling"),
        )
        for banner, method_name in pipeline:
            log_text(self.log_path, banner)
            getattr(self, method_name)()

        if not self.print_results_for_validation:
            return
        log_text(self.log_path, "...... Result Validation ......")
        self.result_validation()
 def test(self, model):
     """Run ``model.test_calc`` over all test triples and log the metrics.

     The test triple indices are served in order, in batches of
     ``self.test_batch_size``. For each batch the head, relation and tail
     ids are looked up in ``self.id_test_triples`` and passed to
     ``model.test_calc`` together with the train triple tensor and the
     four-slot result tensor. Raw and filtered mean rank and hit@n are then
     logged. Returns ``None``.
     """
     train_triple_tensor = load_data(
         self.output_path + "train_triple_tensor.pickle", self.log_path,
         "train_triple_tensor").to(self.device)
     test_dataloader = DataLoader(
         MyDataset(self.num_of_test_triples), self.test_batch_size, False)
     # [mean_rank, hit_n, filtered_mean_rank, filtered_hit_n]
     # NOTE(review): presumably test_calc accumulates into this tensor in
     # place -- confirm against Model.test_calc.
     test_result = torch.zeros(4).to(self.device)
     log_text(self.log_path,
              "number of test triples: %d" % self.num_of_test_triples)

     def id_tensor(key, indices):
         # Gather the ids stored under `key` for the given triple indices.
         return torch.tensor([
             self.id_test_triples[key][index] for index in indices
         ]).to(self.device)

     processed = 0
     for test_batch in test_dataloader:
         if processed % 1000 == 0:
             print("%d test triples processed" % processed)
         processed += self.test_batch_size
         model.test_calc(self.n_of_hit, test_result, train_triple_tensor,
                         id_tensor("id_heads", test_batch),
                         id_tensor("id_relations", test_batch),
                         id_tensor("id_tails", test_batch))

     triple_total = float(self.num_of_test_triples)
     log_text(self.log_path,
              "raw mean rank: %f" % (test_result[0].item() / triple_total))
     # Hit counts are normalised by twice the number of test triples.
     log_text(
         self.log_path,
         "raw hit@%d: %f%%" % (self.n_of_hit, 100. * test_result[1].item() /
                               (2. * triple_total)))
     log_text(
         self.log_path,
         "filtered mean rank: %f" % (test_result[2].item() / triple_total))
     log_text(
         self.log_path, "filtered hit@%d: %f%%" %
         (self.n_of_hit, 100. * test_result[3].item() / (2. * triple_total)))
 def statistics(self):
     """Log the dataset size figures and pickle them as ``statistics.pickle``.

     The dumped dict also carries three per-split entity counts, which are
     written as ``None`` here.
     """
     # (log template, attribute name); the attribute name doubles as the
     # key under which the value is stored in the dumped dict.
     figures = (
         ("number of train triples: %d", "num_of_train_triples"),
         ("number of validate triples: %d", "num_of_validate_triples"),
         ("number of test triples: %d", "num_of_test_triples"),
         ("number of entities: %d", "num_of_entities"),
         ("number of relations: %d", "num_of_relations"),
     )
     for template, attribute in figures:
         log_text(self.log_path, template % getattr(self, attribute))

     summary = dict(
         (attribute, getattr(self, attribute)) for _, attribute in figures)
     for pending_key in ("num_of_train_entities", "num_of_validate_entities",
                         "num_of_test_entities"):
         summary[pending_key] = None

     dump_data(summary, self.output_path + "statistics.pickle",
               self.log_path, "statistics")
Пример #7
0
    def train(self):
        """Train the model with periodic validation, LR halving and early stop.

        Builds the model and an Adam optimizer, records the initial
        validation loss, then runs up to ``self.num_of_epochs`` epochs over
        shuffled batches of train entities. Every ``self.re_sampling_freq``
        epochs (except epoch 0) contexts and negatives are re-sampled and
        re-read. Every ``self.validation_freq`` epochs the validation loss is
        recomputed:

        * an improvement resets the patience counter and snapshots the
          embeddings;
        * otherwise the counter grows, and once it reaches ``self.patience``
          either training stops (when ``self.early_stop_patience`` is 1,
          after dumping the best embeddings) or the best embeddings are
          restored, the learning rate is halved, a fresh optimizer is built
          and ``self.early_stop_patience`` is decremented.

        Every ``self.output_freq`` epochs the model and the best embeddings
        are written out. ``self.learning_rate`` and
        ``self.early_stop_patience`` are modified in place.
        """
        model = Model(self.result_path, self.log_path, self.entity_dimension,
                      self.relation_dimension, self.num_of_entities,
                      self.num_of_relations, self.norm, self.device)
        if self.continue_learning:
            model.input()
        model.to(self.device)
        optimizer = torch.optim.Adam(model.parameters(), self.learning_rate)
        PrintGPUStatus.print_gpu_status("after the initialization of model")

        self.offline_batch_retrieve = OfflineBatchRetrieve(
            self.names, self.dataset)

        def remember_optimal_embeddings():
            # Keep copies of the current weights as the best seen so far.
            self.optimal_entity_embeddings = (
                model.entity_embeddings.weight.data.clone())
            self.optimal_relation_embeddings = (
                model.relation_embeddings.weight.data.clone())

        def dump_optimal_embeddings():
            # Persist the best embeddings from CPU copies.
            dump_data(self.optimal_entity_embeddings.to("cpu"),
                      self.result_path + "optimal_entity_embedding.pickle",
                      self.log_path, "self.optimal_entity_embeddings")
            dump_data(self.optimal_relation_embeddings.to("cpu"),
                      self.result_path + "optimal_relation_embedding.pickle",
                      self.log_path, "self.optimal_relation_embeddings")

        current_validate_loss = self.validate(model)
        log_text(self.log_path,
                 "initial loss (validation): %f" % current_validate_loss)
        optimal_validate_loss = current_validate_loss
        remember_optimal_embeddings()

        entity_set = MyDataset(self.num_of_train_entities)
        entity_loader = DataLoader(entity_set, self.batch_size, True)
        patience_count = 0
        for epoch in range(self.num_of_epochs):
            epoch_loss = 0.
            if epoch != 0 and epoch % self.re_sampling_freq == 0:
                self.context_and_negatives.re_sampling()
                self.offline_batch_retrieve.re_read_context_and_negatives()
            for entity_id_batch in entity_loader:
                model.normalize()
                optimizer.zero_grad()
                entity_batch = [
                    self.train_entities[entity_id.item()]
                    for entity_id in entity_id_batch
                ]
                head_batch, tail_batch, both_batch = (
                    self.offline_batch_retrieve.batch_classification(
                        "train", entity_batch))
                batch_loss = self.loss_compute("train", model, head_batch,
                                               tail_batch, both_batch)
                batch_loss.backward()
                optimizer.step()
                # Accumulate a plain Python number: adding the tensor itself
                # would keep autograd history alive across the whole epoch.
                epoch_loss += batch_loss.item()
            log_text(
                self.log_path,
                "\r\nepoch " + str(epoch) + ": , loss: " + str(epoch_loss))
            if epoch % self.validation_freq == 0:
                current_validate_loss = self.validate(model)
                if current_validate_loss < optimal_validate_loss:
                    log_text(
                        self.log_path, "optimal validate loss: " +
                        str(optimal_validate_loss) + " -> " +
                        str(current_validate_loss))
                    patience_count = 0
                    optimal_validate_loss = current_validate_loss
                    remember_optimal_embeddings()
                else:
                    patience_count += 1
                    log_text(
                        self.log_path, "early stop patience: " +
                        str(self.early_stop_patience) + ", patience count: " +
                        str(patience_count) + ", current validate loss: " +
                        str(current_validate_loss) +
                        ", optimal validate loss: " +
                        str(optimal_validate_loss))
                    if patience_count == self.patience:
                        if self.early_stop_patience == 1:
                            # Out of learning-rate reductions: save and stop.
                            dump_optimal_embeddings()
                            break
                        log_text(
                            self.log_path,
                            "learning rate: " + str(self.learning_rate) +
                            " -> " + str(self.learning_rate / 2))
                        self.learning_rate = self.learning_rate / 2
                        # Roll back to the best weights and restart Adam with
                        # the reduced learning rate.
                        model.entity_embeddings.weight.data = (
                            self.optimal_entity_embeddings.clone())
                        model.relation_embeddings.weight.data = (
                            self.optimal_relation_embeddings.clone())
                        optimizer = torch.optim.Adam(model.parameters(),
                                                     lr=self.learning_rate)
                        patience_count = 0
                        self.early_stop_patience -= 1
            if (epoch + 1) % self.output_freq == 0:
                model.output()
                dump_optimal_embeddings()
        # NOTE(review): "%f" needs self.test(model) to return a number; if it
        # returns None this line raises TypeError -- confirm against test().
        print("test loss: %f" % self.test(model))
    def result_validation(self):
        """Load every pickled result under ``self.output_path`` and log it.

        The files are visited in four groups (read data, head/tail relation
        maps, entity contexts, other results), each preceded by a heading in
        the log.
        """
        names = ["train", "valid", "test"]

        def show(file_name):
            # Load one pickle (with an empty label) and write it to the log.
            log_text(
                self.log_path,
                load_data(self.output_path + file_name, self.log_path, ""))

        def show_per_name(templates):
            # For every split, show each templated file in the given order.
            for name in names:
                for template in templates:
                    show(template % name)

        log_text(self.log_path, "......Result of Reading Data......")
        show_per_name(("string_%s_triples.pickle", "id_%s_triples.pickle"))
        for file_name in ("entity2id.pickle", "relation2id.pickle"):
            show(file_name)

        log_text(self.log_path,
                 "......Result of Head Relation to Tail and Reserve......")
        show_per_name(("%s_head_relation_to_tail.pickle",
                       "%s_tail_relation_to_head.pickle"))

        log_text(self.log_path,
                 "......Result of Entity Context Extraction......")
        show_per_name((
            "%s_head_context_head.pickle",
            "%s_head_context_relation.pickle",
            "%s_head_context_statistics.pickle",
            "%s_tail_context_relation.pickle",
            "%s_tail_context_tail.pickle",
            "%s_tail_context_statistics.pickle",
        ))

        log_text(self.log_path, "......Other Results......")
        for file_name in ("statistics.pickle", "train_triple_tensor.pickle"):
            show(file_name)
    def run_functions(self):
        """Run the whole preprocessing pipeline between Start/End log markers.

        Stages, in order: ``read_dataset``,
        ``head_relation_to_tail_and_reverse``, ``context_process``, then
        ``train_triple_tensor_generation`` and ``statistics``;
        ``result_validation`` follows only when
        ``self.print_results_for_validation`` is truthy.
        """
        log_text(self.log_path,
                 "\r\n---------------------Start-------------------------")

        # (heading written to the log, methods run under that heading)
        stages = (
            ("...... Reading Data ......", ("read_dataset", )),
            ("...... Head Relation to Tail and the Reverse ......",
             ("head_relation_to_tail_and_reverse", )),
            ("...... Entity Context Extraction ......", ("context_process", )),
            ("...... Other Operations ......",
             ("train_triple_tensor_generation", "statistics")),
        )
        for heading, method_names in stages:
            log_text(self.log_path, heading)
            for method_name in method_names:
                getattr(self, method_name)()

        if self.print_results_for_validation:
            log_text(self.log_path, "...... Result Validation ......")
            self.result_validation()

        log_text(self.log_path,
                 "---------------------End-------------------------")
Пример #10
0
def dump_data(obj, path, log_path, obj_name):
    """Pickle ``obj`` to ``path``, logging the operation first.

    Args:
        obj: the object to serialise with ``pickle.dump``.
        path: destination file; it is created or overwritten.
        log_path: passed to ``log_text`` as the log destination.
        obj_name: name shown for ``obj`` in the log message.
    """
    log_text(log_path, "dumping %s to %s" % (obj_name, path))
    # A pickle stream is binary data: text mode can corrupt it on platforms
    # that translate newlines and is rejected outright on Python 3.
    with open(path, "wb") as writer:
        pickle.dump(obj, writer)
    def train(self):
        """Train the model, validating by mean rank with LR halving/early stop.

        Builds the model and an Adam optimizer, records the initial
        validation mean rank, then runs up to ``self.num_of_epochs`` epochs
        over shuffled batches of train entities. Each batch is split by
        ``batch_classification`` into head-only, tail-only and both-context
        entities; the loss is the negated model output summed over the
        non-empty groups. Every ``self.re_sampling_freq`` epochs (except
        epoch 0) contexts and negatives are re-sampled and re-read. Every
        ``self.validation_freq`` epochs the mean rank is recomputed:

        * an improvement (lower rank) resets the patience counter and
          snapshots the embeddings;
        * otherwise the counter grows, and once it reaches ``self.patience``
          either training stops (when ``self.early_stop_patience`` is 1,
          after dumping the best embeddings) or the best embeddings are
          restored, the learning rate is halved, a fresh optimizer is built
          and ``self.early_stop_patience`` is decremented.

        Every ``self.output_freq`` epochs the model and the best embeddings
        are written out; ``self.test(model)`` runs at the end.
        ``self.learning_rate`` and ``self.early_stop_patience`` are modified
        in place.
        """
        model = Model(self.result_path, self.log_path, self.entity_dimension,
                      self.relation_dimension, self.num_of_entities,
                      self.num_of_relations, self.norm, self.device)
        if self.continue_learning:
            model.input()
        model.to(self.device)
        optimizer = torch.optim.Adam(model.parameters(), self.learning_rate)
        PrintGPUStatus.print_gpu_status("after the initialization of model")

        self.offline_batch_retrieve = OfflineBatchRetrieve(
            self.names, self.dataset)
        retrieve = self.offline_batch_retrieve

        def remember_optimal_embeddings():
            # Keep copies of the current weights as the best seen so far.
            self.optimal_entity_embeddings = (
                model.entity_embeddings.weight.data.clone())
            self.optimal_relation_embeddings = (
                model.relation_embeddings.weight.data.clone())

        def dump_optimal_embeddings():
            # Persist the best embeddings from CPU copies.
            dump_data(self.optimal_entity_embeddings.to("cpu"),
                      self.result_path + "optimal_entity_embedding.pickle",
                      self.log_path, "self.optimal_entity_embeddings")
            dump_data(self.optimal_relation_embeddings.to("cpu"),
                      self.result_path + "optimal_relation_embedding.pickle",
                      self.log_path, "self.optimal_relation_embeddings")

        current_mean_rank = self.validate(model)
        log_text(self.log_path,
                 "initial mean rank (validation): %f" % current_mean_rank)
        optimal_mean_rank = current_mean_rank
        remember_optimal_embeddings()

        entity_set = MyDataset(self.num_of_train_entities)
        entity_loader = DataLoader(entity_set, self.batch_size, True)
        patience_count = 0
        for epoch in range(self.num_of_epochs):
            epoch_loss = 0.
            if epoch != 0 and epoch % self.re_sampling_freq == 0:
                self.context_and_negatives.re_sampling()
                self.offline_batch_retrieve.re_read_context_and_negatives()
            for entity_id_batch in entity_loader:
                model.normalize()
                optimizer.zero_grad()
                entity_batch = [
                    self.train_entities[entity_id.item()]
                    for entity_id in entity_id_batch
                ]
                head_loss, tail_loss, both_loss, batch_loss = 0., 0., 0., 0.
                head_batch, tail_batch, both_batch = (
                    retrieve.batch_classification("train", entity_batch))
                if len(head_batch) > 0:
                    # Entities with a head context only: tail slots are None.
                    head_head, head_relation = retrieve.head_context_retrieve(
                        "train", head_batch)
                    negative_head_batch = retrieve.negative_retrieves(
                        "train", head_batch)
                    head_batch = torch.LongTensor(head_batch)
                    head_loss = -1. * model(
                        head_batch.to(self.device),
                        head_head.to(self.device),
                        head_relation.to(self.device),
                        None, None,
                        negative_head_batch.to(self.device))
                if len(tail_batch) > 0:
                    # Entities with a tail context only: head slots are None.
                    tail_relation, tail_tail = retrieve.tail_context_retrieve(
                        "train", tail_batch)
                    negative_tail_batch = retrieve.negative_retrieves(
                        "train", tail_batch)
                    tail_batch = torch.LongTensor(tail_batch)
                    tail_loss = -1. * model(
                        tail_batch.to(self.device),
                        None, None,
                        tail_relation.to(self.device),
                        tail_tail.to(self.device),
                        negative_tail_batch.to(self.device))
                if len(both_batch) > 0:
                    # Entities with both contexts.
                    both_head, both_head_relation = (
                        retrieve.head_context_retrieve("train", both_batch))
                    both_tail_relation, both_tail = (
                        retrieve.tail_context_retrieve("train", both_batch))
                    negative_both_batch = retrieve.negative_retrieves(
                        "train", both_batch)
                    both_batch = torch.LongTensor(both_batch)
                    both_loss = -1. * model(
                        both_batch.to(self.device),
                        both_head.to(self.device),
                        both_head_relation.to(self.device),
                        both_tail_relation.to(self.device),
                        both_tail.to(self.device),
                        negative_both_batch.to(self.device))
                batch_loss += head_loss + tail_loss + both_loss
                batch_loss.backward()
                optimizer.step()
                # Accumulate a plain Python number: adding the tensor itself
                # would keep autograd history alive across the whole epoch.
                epoch_loss += batch_loss.item()
            log_text(
                self.log_path,
                "\r\nepoch " + str(epoch) + ": , loss: " + str(epoch_loss))
            if epoch % self.validation_freq == 0:
                current_mean_rank = self.validate(model)
                if current_mean_rank < optimal_mean_rank:
                    log_text(
                        self.log_path, "optimal average raw mean rank: " +
                        str(optimal_mean_rank) + " -> " +
                        str(current_mean_rank))
                    patience_count = 0
                    optimal_mean_rank = current_mean_rank
                    remember_optimal_embeddings()
                else:
                    patience_count += 1
                    log_text(
                        self.log_path,
                        "early stop patience: " +
                        str(self.early_stop_patience) +
                        ", patience count: " + str(patience_count) +
                        ", current rank: " + str(current_mean_rank) +
                        ", best rank: " + str(optimal_mean_rank))
                    if patience_count == self.patience:
                        if self.early_stop_patience == 1:
                            # Out of learning-rate reductions: save and stop.
                            dump_optimal_embeddings()
                            break
                        log_text(
                            self.log_path,
                            "learning rate: " + str(self.learning_rate) +
                            " -> " + str(self.learning_rate / 2))
                        self.learning_rate = self.learning_rate / 2
                        # Roll back to the best weights and restart Adam with
                        # the reduced learning rate.
                        model.entity_embeddings.weight.data = (
                            self.optimal_entity_embeddings.clone())
                        model.relation_embeddings.weight.data = (
                            self.optimal_relation_embeddings.clone())
                        optimizer = torch.optim.Adam(model.parameters(),
                                                     lr=self.learning_rate)
                        patience_count = 0
                        self.early_stop_patience -= 1
            if (epoch + 1) % self.output_freq == 0:
                model.output()
                dump_optimal_embeddings()
        self.test(model)
    def run_functions(self):
        """Log the run configuration, then read the data and train.

        Every setting is written to the log on its own line between the
        Start and End markers; ``read_data`` and ``train`` run after the
        settings have been logged.
        """
        log_text(self.log_path,
                 "\r\n---------------------Start-------------------------")

        # (log template, attribute holding the value), in log order.
        settings = (
            ("dataset: %s", "dataset"),
            ("head_context_size: %d", "head_context_size"),
            ("tail_context_size: %d", "tail_context_size"),
            ("negative_batch_size: %d", "negative_batch_size"),
            ("number of epochs: %d", "num_of_epochs"),
            ("batch size: %d", "batch_size"),
            ("norm: %d", "norm"),
            ("learning rate: %f", "learning_rate"),
            ("device: %s", "device"),
            ("continue learning: %s", "continue_learning"),
            ("entity dimension: %d", "entity_dimension"),
            ("relation dimension: %d", "relation_dimension"),
            ("patience: %d", "patience"),
            ("early stop patience: %d", "early_stop_patience"),
            ("output frequency: %d", "output_freq"),
            ("validation batch size: %d", "validation_batch_size"),
            ("test batch size: %d", "test_batch_size"),
            ("hit@: %d", "n_of_hit"),
        )
        for template, attribute in settings:
            log_text(self.log_path, template % getattr(self, attribute))

        self.read_data()
        self.train()

        log_text(self.log_path,
                 "---------------------End-------------------------")
    def train(self):
        entity_set = MyDataset(self.num_of_train_entities)
        entity_loader = DataLoader(entity_set, self.batch_size, True)
        batch_process = BatchProcess(
            self.train_entities, self.train_head_entities,
            self.train_tail_entities, self.train_both_entities,
            self.head_context_head, self.head_context_relation,
            self.head_context_statistics, self.tail_context_relation,
            self.tail_context_tail, self.tail_context_statistics,
            self.head_context_size, self.tail_context_size,
            self.num_of_train_entities, self.negative_batch_size, self.device)
        model = Model(self.result_path, self.log_path, self.entity_dimension,
                      self.relation_dimension, self.num_of_entities,
                      self.num_of_relations, self.norm, self.device)
        if self.continue_learning:
            model.input()
        model.to(self.device)
        optimizer = torch.optim.Adam(model.parameters(), self.learning_rate)
        current_mean_rank = self.validate(model)
        log_text(self.log_path,
                 "initial mean rank (validation): %f" % current_mean_rank)
        optimal_mean_rank = current_mean_rank
        self.optimal_entity_embeddings = model.entity_embeddings.weight.data.clone(
        )
        self.optimal_relation_embeddings = model.relation_embeddings.weight.data.clone(
        )

        patience_count = 0
        for epoch in range(self.num_of_epochs):
            epoch_loss = 0.
            count = 0
            for entity_id_batch in entity_loader:
                if count % 200 == 0:
                    print "%d batches processed " % count + time.strftime(
                        '%m-%d-%Y %H:%M:%S', time.localtime(time.time()))
                count += 1
                model.normalize()
                optimizer.zero_grad()
                entity_id_batch = entity_id_batch.tolist()
                entity_batch = [
                    self.train_entities[entity_id]
                    for entity_id in entity_id_batch
                ]
                head_loss, tail_loss, both_loss, batch_loss = 0., 0., 0., 0.
                head_batch, tail_batch, both_batch = batch_process.batch_classification(
                    entity_batch)
                if len(head_batch) > 0:
                    head_head, head_relation = batch_process.head_context_process(
                        head_batch)
                    negative_head_batch = batch_process.negative_batch_generation(
                        head_batch)
                    head_batch = torch.LongTensor(head_batch)
                    head_loss = -1. * model(
                        head_batch.to(self.device), head_head.to(self.device),
                        head_relation.to(self.device), None, None,
                        negative_head_batch.to(self.device))
                if len(tail_batch) > 0:
                    tail_relation, tail_tail = batch_process.tail_context_process(
                        tail_batch)
                    negative_tail_batch = batch_process.negative_batch_generation(
                        tail_batch)
                    tail_batch = torch.LongTensor(tail_batch)
                    tail_loss = -1. * model(
                        tail_batch.to(self.device), None, None,
                        tail_relation.to(self.device), tail_tail.to(
                            self.device), negative_tail_batch.to(self.device))
                if len(both_batch) > 0:
                    both_head, both_head_relation = batch_process.head_context_process(
                        both_batch)
                    both_tail_relation, both_tail = batch_process.tail_context_process(
                        both_batch)
                    negative_both_batch = batch_process.negative_batch_generation(
                        both_batch)
                    both_batch = torch.LongTensor(both_batch)
                    both_loss = -1. * model(
                        both_batch.to(self.device), both_head.to(self.device),
                        both_head_relation.to(self.device),
                        both_tail_relation.to(self.device),
                        both_tail.to(self.device),
                        negative_both_batch.to(self.device))

                batch_loss += head_loss + tail_loss + both_loss
                batch_loss.backward()
                optimizer.step()
                epoch_loss += batch_loss
            log_text(
                self.log_path,
                "\r\nepoch " + str(epoch) + ": , loss: " + str(epoch_loss))
            current_mean_rank = self.validate(model)
            if current_mean_rank < optimal_mean_rank:
                log_text(
                    self.log_path, "optimal average raw mean rank: " +
                    str(optimal_mean_rank) + " -> " + str(current_mean_rank))
                patience_count = 0
                optimal_mean_rank = current_mean_rank
                self.optimal_entity_embeddings = model.entity_embeddings.weight.data.clone(
                )
                self.optimal_relation_embeddings = model.relation_embeddings.weight.data.clone(
                )
            else:
                patience_count += 1
                log_text(
                    self.log_path,
                    "early stop patience: " + str(self.early_stop_patience) +
                    ", patience count: " + str(patience_count) +
                    ", current rank: " + str(current_mean_rank) +
                    ", best rank: " + str(optimal_mean_rank))
                if patience_count == self.patience:
                    if self.early_stop_patience == 1:
                        dump_data(
                            self.optimal_entity_embeddings.to("cpu"),
                            self.result_path +
                            "optimal_entity_embedding.pickle", self.log_path,
                            "self.optimal_entity_embeddings")
                        dump_data(
                            self.optimal_relation_embeddings.to("cpu"),
                            self.result_path +
                            "optimal_relation_embedding.pickle", self.log_path,
                            "self.optimal_relation_embeddings")
                        break
                    log_text(
                        self.log_path,
                        "learning rate: " + str(self.learning_rate) + " -> " +
                        str(self.learning_rate / 2))
                    self.learning_rate = self.learning_rate / 2
                    model.entity_embeddings.weight.data = self.optimal_entity_embeddings.clone(
                    )
                    model.relation_embeddings.weight.data = self.optimal_relation_embeddings.clone(
                    )
                    optimizer = torch.optim.Adam(model.parameters(),
                                                 lr=self.learning_rate)
                    patience_count = 0
                    self.early_stop_patience -= 1
            if epoch % self.output_freq == 0:
                model.output()
                dump_data(self.optimal_entity_embeddings.to("cpu"),
                          self.result_path + "optimal_entity_embedding.pickle",
                          self.log_path, "self.optimal_entity_embeddings")
                dump_data(
                    self.optimal_relation_embeddings.to("cpu"),
                    self.result_path + "optimal_relation_embedding.pickle",
                    self.log_path, "self.optimal_relation_embeddings")
        self.test(model)
    def train(self):
        """Train the model on batches served by ``OfflineBatchRetrieve``.

        A ``Model`` is built from the dimensions stored on ``self``; when
        ``self.continue_learning`` is set, ``model.input()`` is called first
        (presumably to restore previously saved parameters -- confirm in
        ``Model``).  Every epoch iterates over shuffled indices into
        ``self.train_entities``.  Each batch of entities is split into three
        groups ("head", "tail", "both") by ``batch_classification``; each
        non-empty group contributes ``-1 * model(...)`` to the batch loss.

        Every ``self.re_sampling_freq`` epochs (except epoch 0) the contexts
        and negatives are re-sampled and re-read.  After every
        ``self.output_freq`` epochs ``model.output()`` is called.

        Side effects: sets ``self.offline_batch_retrieve`` and writes one log
        line per epoch with the summed loss.
        """
        model = Model(self.result_path, self.log_path, self.entity_dimension,
                      self.relation_dimension, self.num_of_entities,
                      self.num_of_relations, self.norm, self.device)
        if self.continue_learning:
            model.input()
        model.to(self.device)
        optimizer = torch.optim.Adam(model.parameters(), self.learning_rate)
        PrintGPUStatus.print_gpu_status("after the initialization of model")

        self.offline_batch_retrieve = OfflineBatchRetrieve(
            self.names, self.dataset)
        retrieve = self.offline_batch_retrieve

        # The loader yields batches of *indices*; they are mapped to entity
        # ids through self.train_entities below.
        # NOTE(review): assuming DataLoader is torch.utils.data.DataLoader,
        # the third positional argument is ``shuffle``.
        entity_set = MyDataset(self.num_of_train_entities)
        entity_loader = DataLoader(entity_set, self.batch_size, True)

        for epoch in range(self.num_of_epochs):
            epoch_loss = 0.
            if epoch != 0 and epoch % self.re_sampling_freq == 0:
                # Draw fresh contexts/negatives, then reload them into the
                # retriever so the following batches use the new samples.
                self.context_and_negatives.re_sampling()
                retrieve.re_read_context_and_negatives()
            for entity_id_batch in entity_loader:
                model.normalize()
                optimizer.zero_grad()
                entity_batch = [
                    self.train_entities[entity_id.item()]
                    for entity_id in entity_id_batch
                ]
                head_loss, tail_loss, both_loss, batch_loss = 0., 0., 0., 0.
                head_batch, tail_batch, both_batch = retrieve.batch_classification(
                    "train", entity_batch)
                if len(head_batch) > 0:
                    # Only head context is passed; the tail slots are None.
                    head_head, head_relation = retrieve.head_context_retrieve(
                        "train", head_batch)
                    negative_head_batch = retrieve.negative_retrieves(
                        "train", head_batch)
                    head_batch = torch.LongTensor(head_batch)
                    head_loss = -1. * model(
                        head_batch.to(self.device), head_head.to(self.device),
                        head_relation.to(self.device), None, None,
                        negative_head_batch.to(self.device))
                if len(tail_batch) > 0:
                    # Only tail context is passed; the head slots are None.
                    tail_relation, tail_tail = retrieve.tail_context_retrieve(
                        "train", tail_batch)
                    negative_tail_batch = retrieve.negative_retrieves(
                        "train", tail_batch)
                    tail_batch = torch.LongTensor(tail_batch)
                    tail_loss = -1. * model(
                        tail_batch.to(self.device), None, None,
                        tail_relation.to(self.device),
                        tail_tail.to(self.device),
                        negative_tail_batch.to(self.device))
                if len(both_batch) > 0:
                    # Both head and tail context are passed.
                    both_head, both_head_relation = retrieve.head_context_retrieve(
                        "train", both_batch)
                    both_tail_relation, both_tail = retrieve.tail_context_retrieve(
                        "train", both_batch)
                    negative_both_batch = retrieve.negative_retrieves(
                        "train", both_batch)
                    both_batch = torch.LongTensor(both_batch)
                    both_loss = -1. * model(
                        both_batch.to(self.device), both_head.to(self.device),
                        both_head_relation.to(self.device),
                        both_tail_relation.to(self.device),
                        both_tail.to(self.device),
                        negative_both_batch.to(self.device))
                batch_loss += head_loss + tail_loss + both_loss
                if not torch.is_tensor(batch_loss):
                    # All three groups were empty, so batch_loss is still the
                    # initial float and there is nothing to back-propagate.
                    continue
                batch_loss.backward()
                optimizer.step()
                # Accumulate a plain Python number: adding the tensor itself
                # would keep autograd history alive for the whole epoch.
                epoch_loss += batch_loss.item()
            log_text(
                self.log_path,
                "\r\nepoch " + str(epoch) + ": , loss: " + str(epoch_loss))
            if (epoch + 1) % self.output_freq == 0:
                model.output()
    def result_validation(self):
        """Write the content of every pickled intermediate result to the log.

        For each name in ``self.names`` the entity-classification files, the
        context-sampling files and the negative-sampling file are loaded from
        ``self.output_path`` with ``load_data`` and passed to ``log_text``.
        Finally ``statistics.pickle`` is logged.  Files are visited section
        by section, and within a section name by name.
        """

        def log_pickle(file_name, obj_name=""):
            # load_data always receives self.log_path as its log target; the
            # negatives section used to pass self.output_path there by
            # mistake.
            log_text(
                self.log_path,
                load_data(self.output_path + file_name, self.log_path,
                          obj_name))

        sections = (
            ("...... Result of Entity Classification ......",
             ("%s_entities.pickle",
              "%s_head_entities.pickle",
              "%s_tail_entities.pickle",
              "%s_both_entities.pickle")),
            ("...... Result of Context Sampling ......",
             ("%s_context_head.pickle",
              "%s_context_head_relation.pickle",
              "%s_context_tail_relation.pickle",
              "%s_context_tail.pickle")),
            ("...... Result of Negative Sampling ......",
             ("%s_negatives.pickle",)),
        )
        for title, file_templates in sections:
            log_text(self.log_path, title)
            for name in self.names:
                for file_template in file_templates:
                    log_pickle(file_template % name)

        log_text(self.log_path, "...... Other Results ......")
        log_pickle("statistics.pickle", "statistics")
    def run_functions(self):
        """Run the whole pipeline: log the configuration, prepare, read, train.

        The configuration values stored on ``self`` are written to the log
        first, one line each.  Then ``prepare_context_and_negatives``,
        ``read_data`` and ``train`` are called in that order, each preceded
        by a banner line in the log.
        """

        def emit(message):
            log_text(self.log_path, message)

        emit("\r\n---------------------Start-------------------------")

        # (format string, attribute name) pairs, in the order they are
        # logged.  Attributes are looked up lazily, right before logging.
        settings = (
            ("dataset: %s", "dataset"),
            ("head_context_size: %d", "head_context_size"),
            ("tail_context_size: %d", "tail_context_size"),
            ("negative_batch_size: %d", "negative_batch_size"),
            ("number of epochs: %d", "num_of_epochs"),
            ("batch size: %d", "batch_size"),
            ("norm: %d", "norm"),
            ("learning rate: %f", "learning_rate"),
            ("device: %s", "device"),
            ("continue learning: %s", "continue_learning"),
            ("entity dimension: %d", "entity_dimension"),
            ("relation dimension: %d", "relation_dimension"),
            ("output frequency: %d", "output_freq"),
        )
        for template, attribute in settings:
            emit(template % getattr(self, attribute))

        emit("...... Context and Negatives Preparation ......")
        self.prepare_context_and_negatives()

        emit("...... Reading Data for ISWC Training ......")
        self.read_data()

        emit("...... ISWC Training ......")
        self.train()

        emit("---------------------End-------------------------")
Пример #17
0
def load_data(path, log_path, obj_name):
    """Load and return the object pickled in the file at ``path``.

    Args:
        path: location of the pickle file to read.
        log_path: passed to ``log_text`` as the log target.
        obj_name: label used only in the log message.

    Returns:
        The unpickled object.
    """
    log_text(log_path, "loading data from %s to %s" % (path, obj_name))
    # Pickle streams are binary: text mode fails under Python 3 and can
    # corrupt the stream through newline translation on Windows.
    with open(path, "rb") as reader:
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files produced by this project.
        return pickle.load(reader)