def load_model(self, model_name):
    train_name = os.environ.get('MODEL_DUMP_PATH') + self.get_file_name(extension='')

    # Vectorize the issues.
    vectorized_issue = get_vectorized_issue(
        self.kwargs["corpus"],
        self.kwargs["collection"],
        self.kwargs["glove_size"],
        attention_vector=self.kwargs["attention_vector"],
        categorical=True,
        column=self.kwargs["column"])

    # Build the training sets from the vectorized issues.
    raw_data = vectorized_issue.attention_vector_raw_data
    raw_data_post = data.get_dataset_other_categorical(
        raw_data,
        self.kwargs["num_samples"],
        self.kwargs["balanced"],
        self.kwargs["num_cat"],
        self.kwargs["column"])
    data_train, data_test, input_vocab = data.get_dataset_tree_categorical(
        raw_data_post,
        max_sentence_length=self.kwargs["max_input"],
        column=self.kwargs["column"],
        train_size=self.kwargs["train_porcent"])

    # Obtain the word embeddings.
    embeddings = data.create_embeddings_tree(vectorized_issue, input_vocab, train_name)

    model = dy.Model()
    # The datasets are not needed when only loading a trained model.
    data_train = None
    data_test = None
    model_tree = TreeLstmCategorical(
        model,
        data_train,
        data_test,
        embeddings,
        train_name,
        update_embeddings=self.kwargs["update_embeddings"],
        hidden_dim=self.kwargs["hidden_size"],
        attention_size=self.kwargs["attention_size"],
        batch_size=self.kwargs["batch_size"],
        learning_rate=self.kwargs["learning_rate"],
        patience=self.kwargs["patience"],
        attention=self.kwargs["attention"],
        num_cat=self.kwargs["num_cat"])

    # Remove the temporary vocabulary and embedding files written by
    # create_embeddings_tree before restoring the saved parameters.
    if os.path.exists(train_name + "input_vocab.txt"):
        os.remove(train_name + "input_vocab.txt")
    if os.path.exists(train_name + "input_embeddings.npy"):
        os.remove(train_name + "input_embeddings.npy")
    print(train_name)
    model_tree.model.populate(train_name)
    return model_tree
def run(self, model_name):
    train_name = os.environ.get('MODEL_DUMP_PATH') + self.get_file_name(extension='')

    # Vectorize the issues.
    vectorized_issue = get_vectorized_issue(
        self.kwargs["corpus"],
        self.kwargs["collection"],
        self.kwargs["glove_size"],
        attention_vector=self.kwargs["attention_vector"],
        categorical=True,
        column=self.kwargs["column"])

    # Build the training sets from the vectorized issues.
    raw_data = vectorized_issue.attention_vector_raw_data
    raw_data_post = data.get_dataset_other_categorical(
        raw_data,
        self.kwargs["num_samples"],
        self.kwargs["balanced"],
        self.kwargs["num_cat"],
        self.kwargs["column"])
    data_train, data_test, input_vocab = data.get_dataset_tree_categorical(
        raw_data_post,
        max_sentence_length=self.kwargs["max_input"],
        column=self.kwargs["column"],
        train_size=self.kwargs["train_porcent"])

    # Obtain the word embeddings.
    embeddings = data.create_embeddings_tree(vectorized_issue, input_vocab, train_name)

    model = dy.Model()
    model_tree = TreeLstmCategorical(
        model,
        data_train,
        data_test,
        embeddings,
        train_name,
        update_embeddings=self.kwargs["update_embeddings"],
        hidden_dim=self.kwargs["hidden_size"],
        attention_size=self.kwargs["attention_size"],
        batch_size=self.kwargs["batch_size"],
        learning_rate=self.kwargs["learning_rate"],
        patience=self.kwargs["patience"],
        attention=self.kwargs["attention"],
        corpus=self.kwargs["corpus"],
        num_cat=self.kwargs["num_cat"])

    model_tree.fit()
    self.results = model_tree.evaluate()
    return self.results
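For reference, a minimal usage sketch of the two categorical methods above. It assumes, hypothetically, that they live in a runner class named TreeLstmCategoricalExperiment that stores its constructor keyword arguments in self.kwargs and provides get_file_name(); the kwargs keys mirror the ones the methods read, the concrete values are illustrative only, and MODEL_DUMP_PATH must point at the directory where the trained parameters are dumped.

import os

os.environ.setdefault('MODEL_DUMP_PATH', './model_dumps/')   # assumed dump directory

experiment = TreeLstmCategoricalExperiment(    # hypothetical name for the runner class
    corpus="eclipse",
    collection="new_eclipse_duplicate_det_task",
    glove_size=100,
    attention_vector=True,
    column="severity",                         # illustrative target column
    num_samples=10000,
    balanced=True,
    num_cat=5,
    max_input=300,
    train_porcent=0.8,
    update_embeddings=False,
    hidden_size=150,
    attention_size=100,
    batch_size=32,
    learning_rate=0.001,
    patience=5,
    attention=True)

results = experiment.run("tree_lstm_categorical")            # trains and evaluates
model_tree = experiment.load_model("tree_lstm_categorical")  # later: reload the dumped parameters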
def load_model(self, model_name):
    # Issue vectorization.
    vectorized_issue = get_vectorized_issue(self.kwargs["corpus"],
                                            self.kwargs["collection"],
                                            self.kwargs["glove_size"])

    # Training dataset from vectorized issues.
    raw_data = vectorized_issue.attention_vector_raw_data
    data_train, data_test, input_vocab = data.get_dataset_tree(
        raw_data, max_sentence_length=self.kwargs["max_input"])

    # Word embeddings.
    embeddings = data.create_embeddings_tree(vectorized_issue, input_vocab,
                                             self.model_dump_path)

    model = dy.Model()
    data_train = None
    data_test = None
    model_tree = TreeLstm(
        model,
        data_train,
        data_test,
        embeddings,
        self.model_dump_path,
        update_embeddings=self.kwargs["update_embeddings"],
        hidden_dim=self.kwargs["hidden_size"],
        attention_size=self.kwargs["attention_size"],
        batch_size=self.kwargs["batch_size"],
        learning_rate=self.kwargs["learning_rate"],
        patience=self.kwargs["patience"],
        attention=self.kwargs["attention"])

    if os.path.exists(self.model_dump_path + "input_vocab.txt"):
        os.remove(self.model_dump_path + "input_vocab.txt")
    if os.path.exists(self.model_dump_path + "input_embeddings.npy"):
        os.remove(self.model_dump_path + "input_embeddings.npy")
    model_tree.model.populate(self.model_dump_path)
    return model_tree
def run(self):
    # Issue vectorization.
    vectorized_issue = get_vectorized_issue(self.kwargs["corpus"],
                                            self.kwargs["collection"],
                                            self.kwargs["glove_size"])

    # Training dataset from vectorized issues.
    raw_data = vectorized_issue.attention_vector_raw_data
    data_train, data_test, input_vocab = data.get_dataset_tree(
        raw_data, max_sentence_length=self.kwargs["max_input"])

    # Word embeddings.
    embeddings = data.create_embeddings_tree(vectorized_issue, input_vocab,
                                             self.model_dump_path)

    model = dy.Model()
    tree_model = TreeLstm(
        model,
        data_train,
        data_test,
        embeddings,
        self.model_dump_path,
        update_embeddings=self.kwargs["update_embeddings"],
        hidden_dim=self.kwargs["hidden_size"],
        attention_size=self.kwargs["attention_size"],
        batch_size=self.kwargs["batch_size"],
        learning_rate=self.kwargs["learning_rate"],
        patience=self.kwargs["patience"],
        attention=self.kwargs["attention"],
        corpus=self.kwargs["corpus"])

    tree_model.fit()
    self.results = tree_model.evaluate()
    return self.results
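A similar hedged sketch for the plain TreeLstm runner above: the class name (TreeLstmExperiment) and the way model_dump_path is set are assumptions, only the kwargs keys and the run()/load_model() flow come from the code itself.

experiment = TreeLstmExperiment(        # hypothetical name for the runner class
    corpus="eclipse",
    collection="new_eclipse_duplicate_det_task",
    glove_size=100,
    max_input=300,
    update_embeddings=False,
    hidden_size=150,
    attention_size=100,
    batch_size=32,
    learning_rate=0.001,
    patience=5,
    attention=True)

experiment.model_dump_path = './model_dumps/tree_lstm'   # assumed dump prefix
results = experiment.run()                       # fit + evaluate in one pass
model_tree = experiment.load_model("tree_lstm")  # later: restore the trained weights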
# Map the labels {1, -1} onto the class indices {0, 1}.
label_map = {1: 0, -1: 1}
initial_time = time()

TRAIN_NAME = 'description'
output_dir = './resultados/' + TRAIN_NAME + "/"
if not os.path.exists('./resultados/'):
    os.makedirs('./resultados/')
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

GLOVE_SIZE = 100

# Vectorize the issues.
vectorized_issue = get_vectorized_issue("eclipse", "new_eclipse_duplicate_det_task", GLOVE_SIZE)

# A vocabulary that is built on the fly.
input_vocab = OTFVocab()

# Bucket the data (the parsed data is saved directly).
parsed_training_data = []
skipped = 0
total = 0
dir_path = os.path.dirname(os.path.realpath(__file__))
start = time()

# Training data from the vectorized issues: raw constituency trees.
data = vectorized_issue.constituency_trees_raw_data
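The loop that fills parsed_training_data lies outside this snippet; the sketch below only shows the general shape such a loop could take, assuming a hypothetical parse_tree(raw_tree, input_vocab) helper, and is not the script's actual implementation.

# Hypothetical continuation, for orientation only.
for raw_tree in data:
    total += 1
    try:
        # parse_tree stands in for whatever converts a raw constituency tree
        # into the structure the TreeLSTM consumes while growing input_vocab.
        parsed_training_data.append(parse_tree(raw_tree, input_vocab))
    except ValueError:
        skipped += 1   # malformed trees are counted and dropped
print("Parsed %d trees (%d skipped) in %.1f s" % (total - skipped, skipped, time() - start))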