def run(self, data):
    super().run(data)  # do not remove this!

    # preprocessing - scaling X
    scaler = preprocessing.StandardScaler()
    scaler = scaler.fit(data[self._key_in_train_x])
    temp = dict()
    temp['train_X_scaled'] = scaler.transform(data[self._key_in_train_x])
    temp['test_X_scaled'] = scaler.transform(data[self._key_in_test_x])

    # Train model for criticality
    self._model_crit, self._model_auc = PredictorWrapperClassification.train(
        temp['train_X_scaled'], temp['test_X_scaled'],
        data[self._key_in_train_crit], data[self._key_in_test_crit],
        data['meta_dataset_name'] + "_crit_pred", self._reload_if_existing)

    # select critical samples based on the model
    data[self._key_out_train_x], data[self._key_out_train_crit] = \
        self._crit_get_varargs(temp['train_X_scaled'],
                               data[self._key_in_train_rul])
    data[self._key_out_test_x], data[self._key_out_test_crit] = \
        self._crit_get_varargs(temp['test_X_scaled'],
                               data[self._key_in_test_rul])

    # plot RUL histograms for the complete and the critical test population
    Logging.log("RUL histogram of complete test population")
    max_rul = data[self._key_in_test_rul].max()
    Visual.plot_hist(data[self._key_in_test_rul], max_x=max_rul)

    Logging.log("RUL histogram of sub test population labeled critical")
    Visual.plot_hist(data[self._key_out_test_crit], max_x=max_rul)

    # metrics
    metrics = dict()
    metrics['model_auc'] = self._model_auc

    return data, metrics
def run(self, data):
    super().run(data)  # do not remove this!

    threshold = 1

    # predict criticality (risk regression)
    self._model_risk, self._model_risk_rmse = PredictorWrapperRegression.train(
        data["train_X_scaled_crit_bounded_scaled_top"],
        data["test_X_scaled_crit_bounded_scaled_top"],
        np.array(data["train_risc"]), np.array(data["test_risc"]),
        data['meta_dataset_name'] + "_risc_pred", self._reload_if_existing)

    below_threshold = np.array(data["train_risc"] < threshold)
    above_threshold = np.array(data["train_risc"] >= threshold)
    n_above = sum(above_threshold)
    n_below = sum(below_threshold)
    Logging().log(
        'there are {} samples above and {} below the threshold'.format(
            n_above, n_below))

    n_above_percentage_to_be_balanced = n_below / (n_above + n_below)
    rand_nums = np.random.choice(
        [1, 0],
        size=(n_above + n_below, ),
        p=[
            n_above_percentage_to_be_balanced,
            (1 - n_above_percentage_to_be_balanced)
        ])
    Logging().log(
        'picked randomly a proportion of {0:.3} from all {1} samples'.format(
            n_above_percentage_to_be_balanced, (n_above + n_below)))
    # parentheses are required here, since & binds tighter than ==
    above_threshold_picked = (rand_nums == 1) & above_threshold

    # Plot predictions vs. true risk
    Visual().plot_scatter(
        self._model_risk.predict(
            data["test_X_scaled_crit_bounded_scaled_top"]),
        data["test_risc"])
    Visual().plot_scatter(
        self._model_risk.predict(
            data["train_X_scaled_crit_bounded_scaled_top"]),
        data["train_risc"])

    # aggregate metrics
    metrics = dict()
    metrics['model_rmse'] = self._model_risk_rmse

    return data, metrics
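# Illustrative sketch (not part of the pipeline above; `risk` and `threshold` are placeholders):
# the balancing step in run() randomly thins the above-threshold samples with probability
# n_below / (n_above + n_below), then intersects the random picks with the above-threshold mask.
import numpy as np

risk = np.random.rand(1000) * 2
threshold = 1
below_threshold = risk < threshold
above_threshold = risk >= threshold
n_below, n_above = below_threshold.sum(), above_threshold.sum()

p_pick = n_below / (n_above + n_below)
rand_nums = np.random.choice([1, 0], size=risk.shape[0], p=[p_pick, 1 - p_pick])

# parentheses matter: `&` binds tighter than `==`
above_threshold_picked = (rand_nums == 1) & above_threshold
print(n_above, above_threshold_picked.sum())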
def train(X_train, X_test, y_train, y_test, model_filename, reload_if_existing, modeltype="RF"): """ trains and evaluates a model based on the given data :param X_train: Features for training, expected to a numpy.ndarray. :param X_test: Features for testing, expected to a numpy.ndarray. :param y_train: Labels for training. Expected to an one-dimesional array. :param y_test: Labels for testing. Expected to an one-dimesional array. :param model_filename: Filename of model when serialized to disk :param reload_if_existing: Boolean indicating if model should be restored from disk if existing. :param modeltype: modeltype to train (RF, SVC or LRCV). RF is recommended since being fast to train and non- linear - therefore usually yielding the best results. :return: """ if reload_if_existing is False or Path( model_filename).exists() is False: Logging().log("training {}. ".format(modeltype)) if modeltype is "RF": param_grid = { 'max_depth': [3, 5, 10, 15, 20], 'n_estimators': [3, 5, 10, 20] } clf = GridSearchCV(RandomForestRegressor(n_jobs=-1), param_grid) mdl = clf.fit(X_train, y_train) # output model quality rmse = RMSE.score(y_test, mdl.predict(X_test)) Logging().log("Mean squared error: {0:.3}".format(rmse)) # save model to file with open(model_filename, 'wb') as f: pickle.dump((mdl, rmse), f) else: Logging().log("restoring model from {}".format(model_filename)) with open(model_filename, 'rb') as fid: mdl, rmse = pickle.load(fid) return mdl, rmse
def _scale(self, train_df):
    '''
    centers the data around 0 and scales it

    :param train_df: dataframe that contains the training data
    :return: dataframe with an additional column scaled_FEATURE_X per feature, containing the scaled values
    :return: trained_scalers: dictionary keyed by column name - per feature it stores the scaler object that
             is needed in the testing phase to perform identical scaling
    '''
    Logging().log("Scaling Features...")

    trained_scalers = {}
    for col in train_df.columns:

        # 1. consider only relevant columns
        if HeatmapConvolutionTrainer.relevant_columns(col):
            continue

        # 2. standard scaler - fall back to a (n, 1) reshape if the 1-D input is rejected
        scaler = preprocessing.StandardScaler()
        try:
            scaler = scaler.fit(train_df[col])
        except:
            scaler.fit(train_df[col].reshape(-1, 1))
        try:
            train_df['scaled_' + col] = scaler.transform(train_df[col])
        except:
            train_df['scaled_' + col] = scaler.transform(
                train_df[col].reshape(-1, 1))
        trained_scalers[col] = copy.deepcopy(scaler)

    return train_df, trained_scalers
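# Illustrative sketch (toy DataFrame `df` is an assumption): why the try/except in _scale()
# exists - StandardScaler expects a 2-D array, so a single pandas column has to be reshaped
# to (n_samples, 1) before fitting and transforming.
import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({'FEATURE_0': [1.0, 2.0, 3.0, 4.0]})
scaler = preprocessing.StandardScaler()
scaler.fit(df['FEATURE_0'].values.reshape(-1, 1))
df['scaled_FEATURE_0'] = scaler.transform(
    df['FEATURE_0'].values.reshape(-1, 1)).ravel()
print(df)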
def run(self, data_in, extract_frame_override=False, train_dff=None):
    super().run(data_in)  # do not remove this!
    Logging.log("Training da heat...")

    # 1. transform to df and keep critical
    if not extract_frame_override:
        train_df = self._extract_critical_data_frame(data_in)
        train_df[self._field_in_train_cluster_id] = data_in[
            self._field_in_train_cluster_id]
    else:
        train_df = train_dff

    for cluster_id in list(
            train_df["train_cluster_id"].unique()):  # per cluster own model
        if self._test_mode and not (cluster_id == 3):
            continue
        print("\n\n TRAINING CLUSTER: " + str(cluster_id))
        cur_train_df = train_df[
            train_df[self._field_in_train_cluster_id] == cluster_id]

        # 2. scale data and remove outliers
        output_dfs, trained_scalers = self._preprocess_data(
            cur_train_df, self._remove_empty_features,
            self._nr_outlier_iterations, self._outlier_window_size,
            self._outlier_std_threshold)
        data_in["CL_" + str(cluster_id) + "_" +
                self._field_out_train_model_trained_scalers] = trained_scalers

        # 3. Train the model
        model_per_feature = self._build_heat_map_parallel(output_dfs)
        data_in["CL_" + str(cluster_id) + "_" +
                self._field_out_train_model_grid_area] = self._grid_area

        # 4. Store the models
        data_in["CL_" + str(cluster_id) + "_" +
                self._field_out_train_model] = model_per_feature

    # 5. empty metrics
    metrics = dict()

    return data_in, metrics
def _visualize_feature_series(self, train_df, cluster_ids):
    '''
    plots each feature of the reduced training set with the color of its assigned cluster

    :param train_df: dataframe of prepared features (only critical values!)
    :param cluster_ids: array of cluster ids corresponding to the reduced rows
    '''
    seaborn.set(style='ticks')
    train_df["train_cluster_id"] = cluster_ids

    for col in train_df.columns:
        if not col.startswith("FEATURE"):
            continue
        Logging().log("CURRENT -> " + col)
        _order = list(set(cluster_ids))
        fg = seaborn.FacetGrid(data=train_df, hue='train_cluster_id',
                               hue_order=_order, aspect=1.61)
        fg.map(plt.scatter, 'RISK', col).add_legend()
        plt.show()
def _kmeans_parse(self, data):
    '''
    extract all required parameters for k-means clustering

    :param data: 2D array containing features in shape
                 array([[f1, f2, f3, f4, ...],
                        [f1, f2, f3, f4, ...],
                        [f1, f2, f3, f4, ...],
                        ...])
    :return n_samples: Number of input examples
    :return n_features: Number of features per example
    :return n_clusters: Number of expected target clusters
    '''
    expected_cluster_number = self._algorithm_params[0]
    np.random.seed(42)

    n_samples, n_features = data.shape
    n_clusters = expected_cluster_number  # number of clusters
    Logging().log("n_clusters: %d, \t n_samples %d, \t n_features %d" %
                  (n_clusters, n_samples, n_features))

    return n_samples, n_features, n_clusters
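# Illustrative sketch (assumes scikit-learn's KMeans is the clustering backend; the exact call
# used elsewhere in the pipeline may differ): the n_clusters value derived in _kmeans_parse()
# would typically feed a k-means fit like this.
import numpy as np
from sklearn.cluster import KMeans

data = np.random.rand(100, 4)   # placeholder feature matrix
n_clusters = 3                  # e.g. the value returned by _kmeans_parse
cluster_ids = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(data)
print(cluster_ids[:10])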
def run_stage(self, key, data=None):
    """
    Runs a single pipeline stage.

    :param key: The key referencing the stage
    :param data: data dictionary
    :return: the data dictionary and metrics produced by the stage
    """
    # current pipeline node
    pipelineNode = self._stages[key]
    Logging().log("Running Stage: {}".format(pipelineNode.__class__.__name__))

    # always update data from previous stage
    if data is None:
        data = self._data

    # run stage
    self._data, self._metrics = pipelineNode.run(data)

    return self._data, self._metrics
def run(self, data):
    super().run(data)  # do not remove this!

    # determine feature weights
    pf = PolyFitter()

    # create temporary dictionary for data that is not passed to the next stage
    temp = dict()

    X = data['train_X_scaled_crit_bounded_scaled']
    n_samples = X.shape[0]
    # features are zero indexed, so the number of features equals the index of the new feature
    n_idx_random = X.shape[1]
    rand_feature = np.random.randn(n_samples)
    temp['t_X_s_c_b_s_enhanced'] = np.c_[X, rand_feature]  # hstack

    data['meta_feature_weights'] = pf.get_weights(
        temp['t_X_s_c_b_s_enhanced'], data['train_risc'])

    # select top features
    if self._select_above_rand:
        Logging().log("selecting features above the random feature")

        # restore model from file if it exists
        model_filename = data['meta_dataset_name'] + "_" + self._model_filename
        if self._reload_if_existing is False or Path(
                model_filename).exists() is False:
            data['meta_feature_indices'] = pf.get_feature_idices_above_rand(
                data['meta_feature_weights'], n_idx_random=n_idx_random)

            # save model to file
            with open(model_filename, 'wb') as f:
                pickle.dump(data['meta_feature_indices'], f)
        else:
            Logging().log("restoring model from {}".format(model_filename))
            with open(model_filename, 'rb') as fid:
                data['meta_feature_indices'] = pickle.load(fid)

        Logging().log("selected {} features from {}".format(
            len(data['meta_feature_indices']), n_idx_random))
    else:
        Logging().log("selecting top {} features".format(
            self._n_top_features))
        data['meta_feature_indices'] = pf.get_top_feature_idices(
            data['meta_feature_weights'], self._n_top_features)

    data[self._field_out_train_X_scaled_crit_bounded_scaled_top] = \
        pf.get_top_features(
            data[self._field_in_train_X_scaled_crit_bounded_scaled],
            data['meta_feature_indices'])
    data[self._field_out_test_X_scaled_crit_bounded_scaled_top] = \
        pf.get_top_features(
            data[self._field_in_test_X_scaled_crit_bounded_scaled],
            data['meta_feature_indices'])

    # aggregate metrics
    metrics = dict()  # empty

    return data, metrics
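# Illustrative sketch of the "above random" selection idea in run(): append a random probe
# column, compute a weight per feature, and keep only features whose weight exceeds the
# probe's weight. PolyFitter's internals are not shown here, so a random forest's
# feature_importances_ stands in for pf.get_weights; data and names are placeholders.
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X = rng.rand(300, 6)
y = X[:, 0] * 2 + X[:, 1] + rng.rand(300) * 0.1   # only the first two features matter

n_idx_random = X.shape[1]                 # index the probe will get
X_enhanced = np.c_[X, rng.randn(300)]     # append the random probe feature

weights = RandomForestRegressor(n_estimators=50, random_state=0).fit(
    X_enhanced, y).feature_importances_
feature_indices = [i for i in range(n_idx_random)
                   if weights[i] > weights[n_idx_random]]
print(feature_indices)                    # features 0 and 1 should be among those selected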
def _outlier_removal(self, train_df, remove_empty, nr_iterations,
                     split_windows, std_threshold):
    '''
    outliers are removed from the training dataframe per feature by windowing and removing all values per
    window that are further away than std_threshold times the standard deviation

    :param train_df: dataframe that contains the training data
    :param remove_empty: boolean - if true, empty features are removed
    :param nr_iterations: number of iterations that are repeated to remove outliers per window
    :param split_windows: data is split into split_windows equal-length windows between the minimal risk and 1
    :param std_threshold: data that is further away than std_threshold * std of the feature is removed
    :return: output_dfs: list of dataframes, each having a column scaled_FEATURE_X that is now outlier-free
             and a column RISK which is the risk for that feature at its row
    '''
    if not self._remove_outliers:
        print("Outlier removal disabled!")

    # 1. Initialize
    output_dfs = []
    iteration = range(nr_iterations)
    first = True

    # Per feature and window
    for col in train_df.columns:

        # 2. only scaled features are considered
        if HeatmapConvolutionTrainer.scaled_relevant_columns(col):
            continue
        result_df = train_df.sort_values("RISK")

        # 3. iterate multiple times over the windows,
        #    removing outliers on each iteration
        for i in iteration:
            sub_dfs = []
            indices = []
            rs = 0

            # 4. iterate over windows
            for r in np.linspace(result_df["RISK"].min(), 1, split_windows):
                sub_df = result_df[(rs <= result_df["RISK"])
                                   & (r > result_df["RISK"])]
                if self._remove_outliers:
                    sub_df = sub_df[((sub_df[col] - sub_df[col].mean()) /
                                     sub_df[col].std()).abs() < std_threshold]
                sub_dfs.append(sub_df)
                rs = r
            result_df = pd.concat(sub_dfs)

        # 5. Merge result to common dataframe
        output_dfs.append(result_df[["RISK", col]])

        # 6. Remove empty
        if (remove_empty and len(result_df[col].unique()) < 2):
            continue

        # 7. Plot results
        if self._visualize_outlier:
            Logging().log("Standard deviation before outlier removal: " +
                          str(train_df[col].std()))
            Visual().plot_scatter(train_df["RISK"], train_df[col])
            Logging().log("Standard deviation after outlier removal: " +
                          str(result_df[col].std()))
            Visual().plot_scatter(result_df["RISK"], result_df[col])

    return output_dfs
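# Illustrative sketch (toy data; column names RISK / scaled_FEATURE_0 mirror the ones used
# above): the core of _outlier_removal() - within each RISK window, drop rows whose feature
# value deviates from the window mean by more than std_threshold standard deviations.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'RISK': rng.rand(500),
                   'scaled_FEATURE_0': rng.randn(500)})
df.loc[::50, 'scaled_FEATURE_0'] = 10.0          # inject a few outliers

std_threshold, split_windows, rs = 2.5, 10, 0
sub_dfs = []
for r in np.linspace(df['RISK'].min(), 1, split_windows):
    sub = df[(rs <= df['RISK']) & (df['RISK'] < r)]
    sub = sub[((sub['scaled_FEATURE_0'] - sub['scaled_FEATURE_0'].mean()) /
               sub['scaled_FEATURE_0'].std()).abs() < std_threshold]
    sub_dfs.append(sub)
    rs = r
clean_df = pd.concat(sub_dfs)
print(len(df), len(clean_df))                    # fewer rows after removal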
def _build_heat_map(self, output_dfs):
    '''
    using convolution, a heat map is generated per feature for each point of a 2D array of risk vs.
    feature value

    :param output_dfs: list of dataframes, each having a column scaled_FEATURE_X (outlier-free and scaled)
                       and a column RISK which is the risk for that feature at its row
    :return: a dictionary that contains the feature name as key and its 2D heatmap as value
    '''
    dimensions = {}
    for feature_df in output_dfs:  # each output_df has one risk and one value column
        Logging().log("Processing Feature: " + feature_df.columns[1])

        # Test mode
        if self._test_mode and (feature_df.columns[1] == "scaled_FEATURE_5"):
            print("Testing, thus break now!")
            break

        try:
            values = np.empty(len(feature_df))
            values.fill(1)

            # Assign X, Y, Z
            X = feature_df.RISK.as_matrix()
            Y = feature_df[feature_df.columns[1]].as_matrix()
            Z = values

            # create x-y points to be used in a heatmap of identical size
            risk_min = 0
            risk_max = 1
            feature_min = min([
                rm for rm in [df[df.columns[1]].min() for df in output_dfs]
                if not math.isnan(rm)
            ])
            feature_max = max([
                rm for rm in [df[df.columns[1]].max() for df in output_dfs]
                if not math.isnan(rm)
            ])
            xi = np.linspace(risk_min, risk_max, self._grid_area)
            yi = np.linspace(feature_min, feature_max, self._grid_area)

            # Z is a matrix of interpolated (!) x-y values
            zi = griddata((X, Y), Z, (xi[None, :], yi[:, None]),
                          method=self._interpol_method)
            zmin = 0
            zmax = 1
            zi[(zi < zmin) | (zi > zmax)] = None

            # Convolve each point with a Gaussian kernel giving the heat value at point (xi, yi) being Z.
            # Advantage: keep horizontal and vertical influence
            grid_cur = np.nan_to_num(zi)

            # Smooth with a Gaussian kernel
            kernel = Gaussian2DKernel(stddev=self._std_gaus,
                                      x_size=self._kernel_size,
                                      y_size=self._kernel_size)
            grad = scipy_convolve(grid_cur, kernel, mode='same',
                                  method='direct')

            # no constant/zero values shall be allowed
            # first: horizontal interpolation up to the edge
            for r in range(len(grad)):
                # per dimension get first and last nonzero value
                cur_line = grad[:, r]
                nonzeros = numpy.where(cur_line > 0.0001)[0]
                if list(nonzeros):
                    a = 20
                    # fill from 0 to nonzeros[0]
                    v = numpy.average(cur_line[nonzeros[0]:(nonzeros[0] + a)])
                    replacement = numpy.linspace(0, v, nonzeros[0] +
                                                 a)[:(nonzeros[0])]
                    grad[:len(replacement), r] = replacement

                    # fill from nonzeros[-1] to len(grid)-1
                    v = numpy.average(cur_line[nonzeros[-1] - a:(nonzeros[-1])])
                    replacement = numpy.linspace(
                        0, v, len(cur_line) - nonzeros[-1])[::-1]
                    grad[nonzeros[-1]:, r] = replacement

            # vertical interpolation up to the edge
            for r in range(len(grad)):
                # per dimension get first and last nonzero value
                cur_line = grad[r, :]
                nonzeros = numpy.where(cur_line > 0.0001)[0]
                if list(nonzeros):
                    a = 20
                    # fill from 0 to nonzeros[0]
                    v = numpy.average(cur_line[nonzeros[0]:(nonzeros[0] + a)])
                    replacement = numpy.linspace(0, v, nonzeros[0] +
                                                 a)[:(nonzeros[0])]
                    grad[r, :len(replacement)] = replacement

                    # fill from nonzeros[-1] to len(grid)-1
                    v = numpy.average(cur_line[nonzeros[-1] - a:(nonzeros[-1])])
                    replacement = numpy.linspace(
                        0, v, len(cur_line) - nonzeros[-1] + 1)[::-1]
                    grad[r, nonzeros[-1] - 1:] = replacement

            # Store the model in memory
            dimensions[feature_df.columns[1]] = [
                copy.deepcopy(np.absolute(grad)),
                copy.deepcopy(xi),
                copy.deepcopy(yi)
            ]

            if self._visualize_heatmap:
                fig, (ax_orig, ax_mag) = plt.subplots(1, 2)
                ax_orig.imshow(grid_cur[::-1, ::-1], cmap='RdYlGn')
                ax_orig.set_title('Original')
                # colormaps: https://matplotlib.org/examples/color/colormaps_reference.html
                ax_mag.imshow(np.absolute(grad)[::-1, ::-1], cmap='RdYlGn')
                ax_mag.set_title('Heat')
                fig.show()
                plt.show()

        except:
            Logging().log("Heat map generation failed - skipping feature")
            # traceback.print_exc()
            dimensions[feature_df.columns[1]] = None

    return dimensions, xi
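# Illustrative sketch of the heatmap idea in _build_heat_map(): scatter (risk, feature) points
# onto a regular grid via griddata, then smear each point with a Gaussian so every grid cell
# receives a "heat" value. Here scipy.ndimage.gaussian_filter stands in for the astropy
# Gaussian2DKernel + scipy_convolve combination used above; all data is synthetic.
import numpy as np
from scipy.interpolate import griddata
from scipy.ndimage import gaussian_filter

rng = np.random.RandomState(0)
risk = rng.rand(200)                       # X: risk in [0, 1]
feature = risk + 0.1 * rng.randn(200)      # Y: a feature loosely correlated with risk
z = np.ones(200)                           # Z: every observed point carries weight 1

grid_area = 100
xi = np.linspace(0, 1, grid_area)
yi = np.linspace(feature.min(), feature.max(), grid_area)
zi = griddata((risk, feature), z, (xi[None, :], yi[:, None]), method='linear')

heat = gaussian_filter(np.nan_to_num(zi), sigma=3)   # smoothed grid -> heat map
print(heat.shape, heat.max())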
def _build_one_heat_map(self, feature_df, risk_min, feature_min, feature_max,
                        fine_tune=-1):
    Logging().log("Processing Feature: " + feature_df.columns[1])

    if fine_tune == -1:
        try:
            values = np.empty(len(feature_df))
            values.fill(1)

            # Assign X, Y, Z
            X = feature_df.RISK.as_matrix()
            Y = feature_df[feature_df.columns[1]].as_matrix()
            Z = values

            # create x-y points to be used in a heatmap of identical size
            risk_min = feature_df.RISK.min()
            risk_max = 1
            xi = np.linspace(risk_min, risk_max, self._grid_area)
            yi = np.linspace(feature_min, feature_max, self._grid_area)

            # Z is a matrix of interpolated (!) x-y values
            zi = griddata((X, Y), Z, (xi[None, :], yi[:, None]),
                          method=self._interpol_method)
            zmin = 0
            zmax = 1
            zi[(zi < zmin) | (zi > zmax)] = None

            # Convolve each point with a Gaussian kernel giving the heat value at point (xi, yi) being Z.
            # Advantage: keep horizontal and vertical influence
            grid_cur = np.nan_to_num(zi)

            # Smooth with a Gaussian kernel
            kernel = Gaussian2DKernel(stddev=self._std_gaus,
                                      x_size=self._kernel_size,
                                      y_size=self._kernel_size)
            grad = scipy_convolve(grid_cur, kernel, mode='same',
                                  method='direct')

            # horizontal interpolation
            for r in range(len(grad)):
                # per dimension get first and last nonzero value
                cur_line = grad[:, r]
                nonzeros = numpy.where(cur_line > 0.0001)[0]
                if list(nonzeros):
                    a = 4
                    # fill from 0 to nonzeros[0]
                    v = numpy.average(cur_line[nonzeros[0]:(nonzeros[0] + a)])
                    replacement = numpy.linspace(0, v, nonzeros[0] +
                                                 a)[:(nonzeros[0])]
                    grad[:len(replacement), r] = replacement

                    # fill from nonzeros[-1] to len(grid)-1
                    v = numpy.average(cur_line[nonzeros[-1] - a:(nonzeros[-1])])
                    replacement = numpy.linspace(
                        0, v, len(cur_line) - nonzeros[-1])[::-1]
                    grad[nonzeros[-1]:, r] = replacement

            # Store the model in memory
            feature_name = feature_df.columns[1]
            result = [
                feature_name,
                [
                    copy.deepcopy(np.absolute(grad)),
                    copy.deepcopy(xi),
                    copy.deepcopy(yi)
                ], grid_cur
            ]

        except:
            feature_name = feature_df.columns[1]
            Logging().log(str(feature_df.columns[1]) + ": Feature skipped")
            result = [feature_name, None, None]

    else:
        # restrict the training data to a risk segment for the fine-tuned models
        if fine_tune == 0:
            feature_df = feature_df[feature_df["RISK"] < 0.5]  # changed here
        if fine_tune == 1:
            feature_df = feature_df[feature_df["RISK"] > 0.25]
            feature_df = feature_df[feature_df["RISK"] < 0.75]
        if fine_tune == 2:
            feature_df = feature_df[feature_df["RISK"] > 0.5]

        try:
            feature_df = self._remove_one_outlier(feature_df,
                                                  feature_df.columns[1])
            values = np.empty(len(feature_df))
            values.fill(1)

            # Assign X, Y, Z
            X = feature_df.RISK.as_matrix()
            Y = feature_df[feature_df.columns[1]].as_matrix()
            Z = values

            risk_min = feature_df.RISK.min()
            risk_max = 1
            xi = np.linspace(risk_min, risk_max, self._grid_area)
            yi = np.linspace(feature_min, feature_max, self._grid_area)

            # Z is a matrix of interpolated (!) x-y values
            zi = griddata((X, Y), Z, (xi[None, :], yi[:, None]),
                          method=self._interpol_method)
            zmin = 0
            zmax = 1
            zi[(zi < zmin) | (zi > zmax)] = None

            # Convolve each point with a Gaussian kernel giving the heat value at point (xi, yi) being Z.
            # Advantage: keep horizontal and vertical influence
            grid_cur = np.nan_to_num(zi)

            # Smooth with a Gaussian kernel
            kernel = Gaussian2DKernel(stddev=self._std_gaus,
                                      x_size=self._kernel_size,
                                      y_size=self._kernel_size)
            grad = scipy_convolve(grid_cur, kernel, mode='same',
                                  method='direct')

            # vertical interpolation up to the edge
            for r in range(len(grad)):
                # per dimension get first and last nonzero value
                cur_line = grad[:, r]
                nonzeros = numpy.where(cur_line > 0.0001)[0]
                if list(nonzeros):
                    a = 4
                    # fill from 0 to nonzeros[0]
                    v = numpy.average(cur_line[nonzeros[0]:(nonzeros[0] + a)])
                    replacement = numpy.linspace(0, v, nonzeros[0] +
                                                 a)[:(nonzeros[0])]
                    grad[:len(replacement), r] = replacement

                    # fill from nonzeros[-1] to len(grid)-1
                    v = numpy.average(cur_line[nonzeros[-1] - a:(nonzeros[-1])])
                    replacement = numpy.linspace(
                        0, v, len(cur_line) - nonzeros[-1])[::-1]
                    grad[nonzeros[-1]:, r] = replacement

            # Store the model in memory
            feature_name = feature_df.columns[1]
            result = [
                "fine_" + str(fine_tune) + "_" + feature_name,
                [
                    copy.deepcopy(np.absolute(grad)),
                    copy.deepcopy(xi),
                    copy.deepcopy(yi)
                ], grid_cur
            ]

        except:
            # traceback.print_exc()
            feature_name = feature_df.columns[1]
            result = [feature_name, None, None]

    return result
def run(self, data):
    super().run(data)  # do not remove this!

    # temporary dictionary - use this for all data that is not referenced by self._field
    temp = dict()

    # Assign and model risk
    rul_percentile_value = np.percentile(data[self._field_in_train_rul_crit],
                                         self._rul_percentile)
    Logging().log("any rul value larger than {0:.1f} will be dropped.".format(
        rul_percentile_value))

    indices_train = np.array(
        data[self._field_in_train_rul_crit] <= rul_percentile_value)
    # _bounded = only samples with RUL in percentile
    temp["train_rul_crit_bounded"] = data[
        self._field_in_train_rul_crit][indices_train]
    temp["train_X_scaled_crit_bounded"] = data[
        self._field_in_train_X_scaled_crit][indices_train]

    indices_test = np.array(
        data[self._field_in_test_rul_crit] <= rul_percentile_value)
    temp["test_rul_crit_bounded"] = data[
        self._field_in_test_rul_crit][indices_test]
    temp["test_X_scaled_crit_bounded"] = data[
        self._field_in_test_X_scaled_crit][indices_test]

    scaler = preprocessing.StandardScaler()
    scaler = scaler.fit(temp["train_X_scaled_crit_bounded"])
    data[self._field_out_train_X_scaled_crit_bounded_scaled] = \
        scaler.transform(temp["train_X_scaled_crit_bounded"])
    data[self._field_out_test_X_scaled_crit_bounded_scaled] = \
        scaler.transform(temp["test_X_scaled_crit_bounded"])

    # first, we calculate the risk for all critical samples based on the RUL
    rul_min = np.min(temp["train_rul_crit_bounded"])
    rul_max = np.max(temp["train_rul_crit_bounded"])
    Logging().log(
        "max RUL in bounded training dataset is {} (RISK = 1), min is {} (RISK = 0)."
        .format(rul_max, rul_min))

    data[self._field_out_train_risc] = self._get_risc_target(
        temp["train_rul_crit_bounded"])
    data[self._field_out_test_risc] = self._get_risc_target(
        temp["test_rul_crit_bounded"])

    Visual().plot_scatter(temp["train_rul_crit_bounded"],
                          data[self._field_out_train_risc])

    # for field in ["train", "test", "valid"]:
    #     field_real = "rul_" + field
    #     if field_real in data:
    #         data["risk_" + field] = self._get_risc_target(data[field_real])
    #         Visual().plot_scatter(data[field_real], data["risk_" + field])

    # metrics
    metrics = dict()  # empty metrics

    return data, metrics
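# Illustrative sketch (toy RUL array; the mapping inside _get_risc_target is not reproduced
# here because its implementation is not shown): the bounding step in run() keeps only samples
# whose RUL lies within the configured percentile.
import numpy as np

rul = np.random.randint(1, 400, size=1000)
rul_percentile = 95
rul_percentile_value = np.percentile(rul, rul_percentile)
bounded = rul[rul <= rul_percentile_value]
print(rul_percentile_value, len(rul), len(bounded))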
def run(self, data_in): super().run(data_in) # do not remove this! Logging.log("Testing da heat...") # 1. transform to df and keep critical test_df = self._extract_critical_data_frame(data_in) # 2. assign cluster id, add column with id test_df = self._assign_cluster(data_in, test_df) test_df["predicted_rul"] = -1 test_df["predicted_risk"] = -1 abs_max_rul = test_df["RUL"].max() # 217 segment_thrshld = 0.33 * abs_max_rul if self._enable_all_print: print("THE MAXIMUM RUL IN THE DATA SET IS " + str(abs_max_rul)) # 3. extract current relevant data - do this for all and append for object_id in list(test_df["id"].unique()): all_feature_sum = False cur_df1 = test_df[test_df['id'] == object_id] print("Current: OBJECT ID: " + str(object_id)) timestamp_gap = 0 # PER Cluster need to shift incoming data else I cannot sum it up last_ts = 0 expected_rul = 99999999 all_feature_favorites = [] for cluster_id in list(cur_df1["cluster_id"].unique()): if self._test_mode and not (cluster_id == 3): continue Logging.log("--------> Eval: CLUSTER ID: " + str(cluster_id)) cur_df2 = cur_df1[cur_df1['cluster_id'] == cluster_id] cnt = 0 cur_df3 = cur_df2.sort_values("RUL", ascending=False) # per object predict only the maximal first = True for i in range(len(cur_df3)): # 0. parallelize only estimate last one current_test_df = cur_df3 if not first: continue if first: first = False Logging.log("--------> Eval: RUL RANGE: " + str(current_test_df["RUL"].max()) + " to " + str(current_test_df["RUL"].min())) # 1. OPTIMIERUNG - nehme nicht alles sondern nur die maximal letzten 120 (ansonsten verzerrt weil ich ja nur bis 200 gelernt hab) dist = current_test_df["RUL"].max( ) - current_test_df["RUL"].min() if dist > segment_thrshld: if self._enable_all_print: print( "SHORTENED RUL AREA !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! " ) thrshld = current_test_df["RUL"].min( ) + segment_thrshld current_test_df = current_test_df[ current_test_df["RUL"] < thrshld] # 4. run tester for this data frame and add column predicted try: skip = skip_features[int(cluster_id)] except: skip = [] # 5. shift the input curve to align with the one processed next if last_ts != 0: cur_ts = current_test_df["TS"].max() timestamp_gap = cur_ts - last_ts # 6. store last Timestamp for shifting if it is more urgent if current_test_df["RUL"].min() < expected_rul: expected_rul = current_test_df["RUL"].min() predicted_risk, predicted_rul, m, all_feature_sum, per_feature_sum, feature_favorites = self._predict_RUL( data_in, current_test_df, cluster_id, all_feature_sum, skip, timestamp_gap, expected_rul) all_feature_favorites += feature_favorites # VARIANTE 1 - weighted average mit 1/x print("USING WEIGHTED AVERAGE") total_amount = 0 total_count = 0 for feat in all_feature_favorites: weight = 1 / feat total_count += (weight * feat) total_amount += weight wAvg = total_count / total_amount predicted_risk = wAvg predicted_rul = (predicted_risk - 1) / m print("\n->>>>>> Estimated predicted RUL FINAL FINAL: " + str(predicted_rul) + "\nUPDATE RISK: " + str(predicted_risk)) # 7. wenn mehr als 2 features kleiner 0.53 sind dann nehme average dieser rego = [a for a in all_feature_favorites if a < 0.53] if len(rego) > 2: predicted_risk = numpy.average(rego) predicted_rul = (predicted_risk - 1) / m print("Estimated predicted RUL UPDATED: " + str(predicted_rul) + "\nUPDATE RISK: " + str(predicted_risk)) # 5. 
result should be at location of test_df WHERE current_test_df["RUL"].min() test_df = test_df.set_value(current_test_df.index[-1], "predicted_risk", predicted_risk) test_df = test_df.set_value(current_test_df.index[-1], "predicted_rul", predicted_rul) # 6. store last Timestamp for shifting if it is more urgent if current_test_df["TS"].max() > last_ts: last_ts = current_test_df["TS"].max() # 3. store to file if self._write_csv: cnt += 1 object_id = str(object_id) cluster_id = str(cluster_id) # 5. metrics metrics = {} return data_in, metrics
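# Illustrative sketch (toy feature_favorites list): the "VARIANT 1" aggregation above is a
# 1/x-weighted average of the per-feature risk favorites, which reduces to their harmonic
# mean and therefore leans toward the smaller (more urgent) values.
import numpy as np

all_feature_favorites = [0.4, 0.5, 0.9]
weights = [1 / f for f in all_feature_favorites]
w_avg = sum(w * f for w, f in zip(weights, all_feature_favorites)) / sum(weights)
print(w_avg, len(all_feature_favorites) / sum(weights))   # identical values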
def train(X_train, X_test, y_train, y_test, model_filename, reload_if_existing, modeltype="RF", cv_measure="roc_auc_score"): """ trains and evaluates a model based on the given data :param X_train: Features for training, expected to a numpy.ndarray. :param X_test: Features for testing, expected to a numpy.ndarray. :param y_train: Labels for training. Expected to an one-dimesional array. :param y_test: Labels for testing. Expected to an one-dimesional array. :param model_filename: Filename of model when serialized to disk :param reload_if_existing: Boolean indicating if model should be restored from disk if existing. :param modeltype: modeltype to train (RF, SVC or LRCV). RF is recommended since being fast to train and non- linear - therefore usually yielding the best results. :param cv_measure: possible cv_measure are ['accuracy', 'precision', 'recall', 'roc_auc'] :return: """ if reload_if_existing is False or Path( model_filename).exists() is False: Logging().log("training {}. ".format(modeltype)) if modeltype is "LRCV": Logging().log("Optimizing for {}...".format(cv_measure)) lr = LogisticRegressionCV( Cs=[0.001, 0.01, 0.1, 1], cv=5, penalty='l1', scoring=cv_measure, # Changed from auROCWeighted solver='liblinear', tol=0.001, n_jobs=mp.cpu_count()) mdl = lr.fit(X_train, y_train) Logging().log("cross validated {0} (train) is {1:.3}".format( cv_measure, max(np.mean(mdl.scores_[1], axis=0)))) # get CV train metrics elif modeltype is "SVC": # after ~2h of training: cross validated roc_auc=0.511 on rex clf = SVC() mdl = clf.fit(X_train, y_train) elif modeltype is "RF": # after ~2h of training: cross validated roc_auc=0.511 on rex param_grid = { 'max_depth': [3, 5, 10, 15, 20], 'n_estimators': [3, 5, 10, 20] } clf = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid) mdl = clf.fit(X_train, y_train) # output model quality cross_val_res = cross_val_score(mdl, X_test, y_test, scoring='roc_auc') auc_test = np.mean(cross_val_res) Logging().log( "cross validated AUC (test) is {0:.3}".format(auc_test)) # save model to file with open(model_filename, 'wb') as f: pickle.dump((mdl, auc_test), f) else: Logging().log("restoring model from {}".format(model_filename)) with open(model_filename, 'rb') as fid: (mdl, auc_test) = pickle.load(fid) return mdl, auc_test