Example #1
    def load_model(self, model_name):

        # Fail fast if MODEL_DUMP_PATH is unset; os.environ.get() would
        # return None and break the string concatenation.
        train_name = os.environ['MODEL_DUMP_PATH'] + self.get_file_name(
            extension='')

        # Issue vectorization.
        vectorized_issue = get_vectorized_issue(
            self.kwargs["corpus"],
            self.kwargs["collection"],
            self.kwargs["glove_size"],
            attention_vector=self.kwargs["attention_vector"],
            categorical=True,
            column=self.kwargs["column"])

        # Training dataset from vectorized issues.
        raw_data = vectorized_issue.attention_vector_raw_data

        raw_data_post = data.get_dataset_other_categorical(
            raw_data, self.kwargs["num_samples"], self.kwargs["balanced"],
            self.kwargs["num_cat"], self.kwargs["column"])
        data_train, data_test, input_vocab = data.get_dataset_tree_categorical(
            raw_data_post,
            max_sentence_length=self.kwargs["max_input"],
            column=self.kwargs["column"],
            train_size=self.kwargs["train_porcent"])

        # Word embeddings.
        embeddings = data.create_embeddings_tree(vectorized_issue, input_vocab,
                                                 train_name)
        model = dy.Model()
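        # The train/test splits are only built above to obtain input_vocab;
        # they are not needed when merely restoring trained weights.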
        data_train = None
        data_test = None
        model_tree = TreeLstmCategorical(
            model,
            data_train,
            data_test,
            embeddings,
            train_name,
            update_embeddings=self.kwargs["update_embeddings"],
            hidden_dim=self.kwargs["hidden_size"],
            attention_size=self.kwargs["attention_size"],
            batch_size=self.kwargs["batch_size"],
            learning_rate=self.kwargs["learning_rate"],
            patience=self.kwargs["patience"],
            attention=self.kwargs["attention"],
            num_cat=self.kwargs["num_cat"])
        # Clean up the intermediate vocab/embedding dumps before restoring
        # the trained weights.
        if os.path.exists(train_name + "input_vocab.txt"):
            os.remove(train_name + "input_vocab.txt")
        if os.path.exists(train_name + "input_embeddings.npy"):
            os.remove(train_name + "input_embeddings.npy")
        model_tree.model.populate(train_name)
        return model_tree
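
A minimal sketch of the DyNet save/populate round trip that load_model relies on, assuming dynet is importable as dy; the parameter shape and dump path are illustrative. populate() restores weights in place, so the parameter collection must be rebuilt with the same architecture before loading.

import dynet as dy

pc = dy.Model()                        # dy.Model is DyNet's ParameterCollection
W = pc.add_parameters((10, 100))       # define the architecture
pc.save("/tmp/tree_lstm.model")        # written after training (illustrative path)

pc2 = dy.Model()
W2 = pc2.add_parameters((10, 100))     # same shapes, declared in the same order
pc2.populate("/tmp/tree_lstm.model")   # restores the trained weights in place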
Example #2
    def run(self, model_name):

        # Fail fast if MODEL_DUMP_PATH is unset; os.environ.get() would
        # return None and break the string concatenation.
        train_name = os.environ['MODEL_DUMP_PATH'] + self.get_file_name(
            extension='')

        # Issue vectorization.
        vectorized_issue = get_vectorized_issue(
            self.kwargs["corpus"],
            self.kwargs["collection"],
            self.kwargs["glove_size"],
            attention_vector=self.kwargs["attention_vector"],
            categorical=True,
            column=self.kwargs["column"])

        # Training dataset from vectorized issues.
        raw_data = vectorized_issue.attention_vector_raw_data

        raw_data_post = data.get_dataset_other_categorical(
            raw_data, self.kwargs["num_samples"], self.kwargs["balanced"],
            self.kwargs["num_cat"], self.kwargs["column"])
        data_train, data_test, input_vocab = data.get_dataset_tree_categorical(
            raw_data_post,
            max_sentence_length=self.kwargs["max_input"],
            column=self.kwargs["column"],
            train_size=self.kwargs["train_porcent"])

        embeddings = data.create_embeddings_tree(vectorized_issue, input_vocab,
                                                 train_name)

        model = dy.Model()
        model_tree = TreeLstmCategorical(
            model,
            data_train,
            data_test,
            embeddings,
            train_name,
            update_embeddings=self.kwargs["update_embeddings"],
            hidden_dim=self.kwargs["hidden_size"],
            attention_size=self.kwargs["attention_size"],
            batch_size=self.kwargs["batch_size"],
            learning_rate=self.kwargs["learning_rate"],
            patience=self.kwargs["patience"],
            attention=self.kwargs["attention"],
            corpus=self.kwargs["corpus"],
            num_cat=self.kwargs["num_cat"],
        )

        model_tree.fit()
        self.results = model_tree.evaluate()
        return self.results
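
run() reads every hyperparameter from self.kwargs. A hypothetical wiring sketch: the keys are exactly those read above, but all values are placeholders (only corpus, collection, and glove_size echo Example #5 below).

kwargs = {
    "corpus": "eclipse",
    "collection": "new_eclipse_duplicate_det_task",
    "glove_size": 100,
    "attention_vector": True,    # placeholder value
    "column": "description",
    "num_samples": 10000,
    "balanced": True,
    "num_cat": 5,
    "max_input": 300,
    "train_porcent": 0.8,        # key spelled as in the code above
    "update_embeddings": False,
    "hidden_size": 150,
    "attention_size": 100,
    "batch_size": 32,
    "learning_rate": 0.001,
    "patience": 5,
    "attention": True,
}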
Example #3
    def load_model(self, model_name):
        # Issue vectorization.
        vectorized_issue = get_vectorized_issue(self.kwargs["corpus"],
                                                self.kwargs["collection"],
                                                self.kwargs["glove_size"])

        # Training dataset from vectorized issues.
        raw_data = vectorized_issue.attention_vector_raw_data

        data_train, data_test, input_vocab = data.get_dataset_tree(
            raw_data, max_sentence_length=self.kwargs["max_input"])

        # Word embeddings.
        embeddings = data.create_embeddings_tree(vectorized_issue, input_vocab,
                                                 self.model_dump_path)
        model = dy.Model()
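        # The train/test splits are only built above to obtain input_vocab;
        # they are not needed when merely restoring trained weights.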
        data_train = None
        data_test = None
        model_tree = TreeLstm(
            model,
            data_train,
            data_test,
            embeddings,
            self.model_dump_path,
            update_embeddings=self.kwargs["update_embeddings"],
            hidden_dim=self.kwargs["hidden_size"],
            attention_size=self.kwargs["attention_size"],
            batch_size=self.kwargs["batch_size"],
            learning_rate=self.kwargs["learning_rate"],
            patience=self.kwargs["patience"],
            attention=self.kwargs["attention"])
        if os.path.exists(self.model_dump_path + "input_vocab.txt"):
            os.remove(self.model_dump_path + "input_vocab.txt")
        if os.path.exists(self.model_dump_path + "input_embeddings.npy"):
            os.remove(self.model_dump_path + "input_embeddings.npy")
        model_tree.model.populate(self.model_dump_path)
        return model_tree
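
The exists-then-remove pairs above can race if another process deletes a file between the check and the removal. An equivalent standard-library sketch, with model_dump_path as a placeholder argument:

import contextlib
import os

def cleanup_embedding_dumps(model_dump_path):
    # Remove the intermediate dumps if present; ignore files already gone.
    for suffix in ("input_vocab.txt", "input_embeddings.npy"):
        with contextlib.suppress(FileNotFoundError):
            os.remove(model_dump_path + suffix)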
Example #4
    def run(self):
        # Issue vectorization.
        vectorized_issue = get_vectorized_issue(self.kwargs["corpus"],
                                                self.kwargs["collection"],
                                                self.kwargs["glove_size"])

        # Training dataset from vectorized issues.
        raw_data = vectorized_issue.attention_vector_raw_data

        data_train, data_test, input_vocab = data.get_dataset_tree(
            raw_data, max_sentence_length=self.kwargs["max_input"])

        # Word embeddings.
        embeddings = data.create_embeddings_tree(vectorized_issue, input_vocab,
                                                 self.model_dump_path)

        model = dy.Model()

        tree_model = TreeLstm(
            model,
            data_train,
            data_test,
            embeddings,
            self.model_dump_path,
            update_embeddings=self.kwargs["update_embeddings"],
            hidden_dim=self.kwargs["hidden_size"],
            attention_size=self.kwargs["attention_size"],
            batch_size=self.kwargs["batch_size"],
            learning_rate=self.kwargs["learning_rate"],
            patience=self.kwargs["patience"],
            attention=self.kwargs["attention"],
            corpus=self.kwargs["corpus"])

        tree_model.fit()
        self.results = tree_model.evaluate()
        return self.results
Example #5
    label_map = {1: 0, -1: 1}
    initial_time = time()

    TRAIN_NAME = 'description'
    output_dir = './resultados/' + TRAIN_NAME + "/"

    # Create ./resultados/<TRAIN_NAME>/ (and any missing parents).
    os.makedirs(output_dir, exist_ok=True)

    GLOVE_SIZE = 100

    # Issue vectorization.
    vectorized_issue = get_vectorized_issue("eclipse",
                                            "new_eclipse_duplicate_det_task",
                                            GLOVE_SIZE)

    # A vocabulary that is built on the fly.
    input_vocab = OTFVocab()

    # Bucket the data (but the parsed data is saved directly).
    parsed_training_data = []
    skipped = 0
    total = 0
    dir_path = os.path.dirname(os.path.realpath(__file__))

    start = time()

    # Training dataset from vectorized issues.
    data = vectorized_issue.constituency_trees_raw_data
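
OTFVocab is project-specific and its definition is not shown here. A minimal sketch of what an on-the-fly vocabulary usually does, assuming it assigns the next free id to each unseen token while the corpus is parsed:

class OTFVocabSketch:
    """Hypothetical stand-in for OTFVocab: ids are assigned on first sight."""

    def __init__(self):
        self.w2i = {}

    def __getitem__(self, token):
        # Grow the vocabulary the first time a token appears.
        if token not in self.w2i:
            self.w2i[token] = len(self.w2i)
        return self.w2i[token]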