def calculate_weight(self, x_test, y_test):
    """Compute one ensemble weight per loaded model and persist the list.

    Models are (re)loaded through ``self.load_models()`` and each entry of
    ``self.list_model`` receives a weight chosen by ``self.w_model``:

    * ``'f1'`` / ``'precision'``: weighted-average F1 / precision on the
      test set.
    * ``'f1_min'`` / ``'precision_min'``: F1 / precision reported for
      ``self.min_label`` in the classification report.
    * any other value: the score stays 0.

    When ``self.nb_dataset == 1`` or ``self.w_model is None`` every model
    gets weight 1; in that case no prediction or metric is computed, since
    the score would be discarded anyway.

    A score of exactly 0 is replaced by 1e-11 so a model never carries a
    zero weight.

    Args:
        x_test: inputs handed to each model's ``predict``.
        y_test: 2-D array; the argmax along axis 1 is used as the true
            class index (presumably one-hot encoded labels).

    Side effects:
        Appends to ``self.list_weight`` (existing entries are kept) and
        writes the list to ``list_weight.pickle`` in ``self.model_folder``.
    """
    self.load_models()
    y_true = np.argmax(y_test, axis=1)
    uniform = self.nb_dataset == 1 or self.w_model is None

    for model in self.list_model:
        if uniform:
            # The metric would be overwritten by 1, so skip the inference.
            weight = 1
        else:
            y_pred = np.argmax(model.predict(x_test), axis=1)
            weight = 0
            if self.w_model == 'f1':
                weight = f1_score(y_true, y_pred, average='weighted')
            elif self.w_model == 'precision':
                weight = precision_score(y_true, y_pred, average='weighted')
            elif self.w_model in ('f1_min', 'precision_min'):
                report = classification_report(y_true,
                                               y_pred,
                                               target_names=self.label,
                                               output_dict=True)
                metric = 'f1-score' if self.w_model == 'f1_min' else 'precision'
                weight = float(report[self.min_label][metric])
            if weight == 0:
                # Keep a tiny positive weight instead of an exact zero.
                weight = 0.00000000001
        self.list_weight.append(weight)

    save_object(os.path.join(self.model_folder, "list_weight.pickle"),
                self.list_weight)
def get_dataset(self, train_skip=1):
    '''
    Build (or load from the dump folder) the dataset for ``self.task`` and
    return it with the train split subsampled.

    The full dataset is produced by 'loader.load_task' with train_skip=1 and
    cached on disk, so the same cache file serves every value of
    'train_skip'; subsampling is applied after loading.

    Args:
        train_skip: step used when slicing the train split. Every
            'train_skip'-th train sample is kept, i.e. roughly N/train_skip
            of the N available samples. Useful for large graphs.
    Returns:
        X: dict - with keys 'train', 'test'. The train entry is the
            subsampled train data, the test entry is returned untouched.
        y: dict - with keys 'train', 'test', where y[key][i] is the label
            of X[key][i] for the given task.
    '''
    cache_path = join(self.dump_folder,
                      f'{self.task}_dataset_{self.pivot_time}.data')

    if not os.path.exists(cache_path):
        # Cache miss: generate the complete dataset once and store it.
        full_X, full_y = loader.load_task(self.graph_nx,
                                          self.task,
                                          train_skip=1,
                                          pivot_time=self.pivot_time,
                                          test_size=self.test_size)
        save_object((full_X, full_y), cache_path)
    else:
        full_X, full_y = load_object(cache_path)

    # Only the train split is thinned out; the test split stays complete.
    sampled_X = {'train': full_X['train'][::train_skip],
                 'test': full_X['test']}
    sampled_y = {'train': full_y['train'][::train_skip],
                 'test': full_y['test']}
    return sampled_X, sampled_y
def fit(self, label):
    """Map every label to its position in ``label`` and persist the mapping.

    Args:
        label: sequence of labels. If a label occurs more than once, the
            last position wins.

    Side effects:
        Sets ``self.label_to_int`` and writes it to ``label_to_int.pickle``
        in ``self.feature_extraction_folder``.
    """
    self.label_to_int = {
        name: position for position, name in enumerate(label)
    }
    mapping_path = os.path.join(self.feature_extraction_folder,
                                "label_to_int.pickle")
    save_object(mapping_path, self.label_to_int)
def find_optimal_cutoff(self, x_test, y_test):
    """Derive a decision cutoff from the weighted ensemble prediction.

    Every model in ``self.list_model`` predicts on ``x_test``; the
    predictions are averaged using ``self.list_weight`` as weights. Column 1
    of that average and the true class indices are passed to ``F1``, whose
    result becomes ``self.cutoff``.

    Args:
        x_test: inputs handed to each model's ``predict``.
        y_test: 2-D array; the argmax along axis 1 is used as the true
            class index (presumably one-hot encoded labels).

    Side effects:
        Overwrites ``self.cutoff`` and writes it to ``cutoff.pickle`` in
        ``self.model_folder``.
    """
    # NOTE(review): F1 is defined elsewhere; presumably it returns the
    # threshold that maximises the F1 score -- confirm.
    self.load_models()

    predictions = [model.predict(x_test) for model in self.list_model]
    weighted_total = sum(
        scores * weight
        for scores, weight in zip(predictions, self.list_weight))
    ensemble_scores = weighted_total / sum(self.list_weight)

    true_classes = np.argmax(y_test, axis=1)
    self.cutoff = F1(true_classes, ensemble_scores[:, 1])

    cutoff_path = os.path.join(self.model_folder, "cutoff.pickle")
    save_object(cutoff_path, self.cutoff)
def fit_transform(self, templates):
    """Fit the preprocessing if needed, then encode ``templates``.

    ``self.fit`` is only called when no analyser is set yet. After the
    transformation, ``self.maxlen`` is written to ``maxlen.pickle`` in
    ``self.feature_extraction_folder``.

    Args:
        templates: the raw templates to encode.

    Returns:
        The encoded templates. When the configured encoding is
        ``"embedding"``, a tuple ``(encoded, embedding_matrix)`` is returned
        instead; the matrix is instantiated from ``templates`` if it does
        not exist yet.
    """
    if self.analyser is None:
        self.fit(templates)

    encoded = self.transform(templates)

    maxlen_path = os.path.join(self.feature_extraction_folder,
                               "maxlen.pickle")
    save_object(maxlen_path, self.maxlen)

    if self.config_dict["encoding"] != "embedding":
        return encoded

    if self.embedding_matrix is None:
        self.embedding_matrix = self.instantiate_embedding_matrix(templates)
    return encoded, self.embedding_matrix
def calculate_pivot_time(self):
    '''
    Return the pivot time that splits the graph's edges into train and test
    according to 'self.test_size'.

    Results are memoised on disk in 'ratio2pivot.dict' (inside the dump
    folder), keyed by test size, so 'get_pivot_time' runs only for test
    sizes that have not been seen before.

    Returns:
        time step representing the pivot time step
    '''
    cache_path = join(self.dump_folder, 'ratio2pivot.dict')
    cached = load_object(cache_path) if os.path.exists(cache_path) else {}

    if self.test_size in cached:
        return cached[self.test_size]

    # Unknown ratio: compute it, remember it, and rewrite the cache file.
    pivot = get_pivot_time(self.graph_nx, self.test_size)
    cached[self.test_size] = pivot
    save_object(cached, cache_path)
    return pivot
def fit(self, templates):
    """Fit the analyser/vocabulary if missing and update ``self.maxlen``.

    When either ``self.vocabulary`` or ``self.analyser`` is falsy, a
    ``TfidfVectorizer`` configured with the n-gram range from
    ``self.config_dict`` is fitted on ``templates``; its analyser and
    vocabulary are stored on the instance and written to
    ``analyzer.pickle`` / ``vocabulary.pickle`` in
    ``self.feature_extraction_folder``.

    In every case ``self.maxlen`` is raised to the length of the longest
    item returned by ``self.feature_engineering(templates)`` if that is
    larger than the current value.

    Args:
        templates: the raw templates to fit on.
    """
    already_fitted = self.vocabulary and self.analyser
    if not already_fitted:
        ngram_bounds = (self.config_dict["min_n_gram"],
                        self.config_dict["max_n_gram"])
        tfidf = TfidfVectorizer(ngram_range=ngram_bounds,
                                lowercase=True,
                                stop_words=None,
                                min_df=1)
        tfidf.fit(templates)
        self.analyser = tfidf.build_analyzer()
        self.vocabulary = tfidf.vocabulary_

        analyser_path = os.path.join(self.feature_extraction_folder,
                                     "analyzer.pickle")
        save_object(analyser_path, self.analyser)
        vocabulary_path = os.path.join(self.feature_extraction_folder,
                                       "vocabulary.pickle")
        save_object(vocabulary_path, self.vocabulary)

    engineered = self.feature_engineering(templates)
    longest = max(len(item) for item in engineered)
    self.maxlen = max(longest, self.maxlen)
def fit(self,
        x_train,
        y_train,
        x_test,
        y_test,
        trainable=True,
        custom=True,
        batch_size=32,
        epochs=5):
    """Train ``self.nb_dataset`` models and derive the ensemble settings.

    Steps, in order:

    1. Call ``self.create_name_fig(trainable, custom)`` and
       ``self.new_model()``.
    2. Collect the distinct labels of ``y_train``: ``self.label`` lists them
       in order of first appearance and ``self.min_label`` is the label with
       the fewest samples. Both are pickled into ``self.model_folder``.
    3. Fit the feature extraction. With the ``"embedding"`` encoding and
       ``custom=True`` it is fitted on ``x_train`` only and the resulting
       embedding matrix is kept; otherwise it is fitted on ``x_train`` and
       ``x_test`` stacked together. Both splits are then transformed.
    4. Fit the label encoder on ``self.label`` and encode ``y_test``.
    5. Build the ``Model`` wrapper and call its ``fit`` once per dataset
       produced by ``generate_data``.
    6. Compute the model weights and, when ``self.cutoff == 'F1'``, the
       optimal cutoff.

    Args:
        x_train: training templates.
        y_train: training labels (raw, not encoded).
        x_test: test templates.
        y_test: test labels (raw, not encoded).
        trainable: forwarded to ``Model`` and ``create_name_fig``.
        custom: when true and the encoding is ``"embedding"``, the embedding
            matrix comes from fitting on ``x_train``.
        batch_size: forwarded to ``model.fit``.
        epochs: forwarded to ``model.fit``.
    """
    self.create_name_fig(trainable, custom)
    self.new_model()

    # np.unique returns the sorted labels, the index of each label's first
    # occurrence, and the number of occurrences.
    label, indexes, counts_elements = np.unique(y_train,
                                                return_counts=True,
                                                return_index=True)
    self.min_label = label[np.argmin(counts_elements)]
    # Sorting the first-occurrence indexes gives the labels in the order
    # they first appear in y_train.
    self.label = [y_train[index] for index in sorted(indexes)]
    save_object(os.path.join(self.model_folder, "label.pickle"), self.label)
    save_object(os.path.join(self.model_folder, "min_label.pickle"),
                self.min_label)

    self.feature_extraction = PreprocessingTemplate(
        self.config_dict, self.feature_extraction_folder)
    if self.config_dict["encoding"] == "embedding" and custom:
        _, self.embedding_matrix = self.feature_extraction.fit_transform(
            x_train)
    else:
        # The stacked corpus is only needed on this path, so build it here
        # rather than unconditionally.
        corpus = np.hstack((x_train, x_test))
        self.feature_extraction.fit_transform(corpus)
    x_train = self.feature_extraction.transform(x_train)
    x_test = self.feature_extraction.transform(x_test)

    self.encoder = OneHotEncoding(self.feature_extraction_folder)
    self.encoder.fit(self.label)
    y_test = self.encoder.transform(y_test)
    self.n_out = len(label)

    # Hyper-parameters handed to Model; the key set depends on the model type.
    if self.type_model == 'LSTM':
        params = {
            'units_size1': 256,
            'units_sizes': [128],
            'dense_size1': 500,
            'dense_size2': [300],
            'dropout': 0.004
        }
    else:
        params = {
            'filter_sizes': [1, 2],
            'nb_filter': 1024,
            'dense_size1': 400,
            'dense_size2': [250],
            'dropout': 0.001
        }

    model = Model(self.type_model,
                  vocabulary_size=len(self.feature_extraction.vocabulary),
                  embedding_dim=self.config_dict['embedding_dim'],
                  embedding_matrix=self.embedding_matrix,
                  trainable=trainable,
                  params=params,
                  filename=self.model_folder)

    # One training run per generated dataset; `i` identifies the run.
    # NOTE(review): generate_data is defined elsewhere; presumably it
    # resamples the training set according to methode/threshold -- confirm.
    for i in range(self.nb_dataset):
        x_temp, y_temp = generate_data(x_train, y_train, self.methode,
                                       self.threshold)
        y_temp = self.encoder.transform(y_temp)
        model.fit(x_train=x_temp,
                  y_train=y_temp,
                  x_test=x_test,
                  y_test=y_test,
                  batch_size=batch_size,
                  epochs=epochs,
                  i=i)

    self.calculate_weight(x_test=x_test, y_test=y_test)
    if self.cutoff == 'F1':
        self.find_optimal_cutoff(x_test=x_test, y_test=y_test)