import numpy as np

from donut import complete_timestamp, standardize_kpi


def get_data():
    # `raw_data` is assumed to be loaded elsewhere at module scope, with a
    # KPI column 'value1' and a point-wise anomaly label column 'label1'.
    values = []
    labels = []
    timestamp = []
    for i in range(len(raw_data['value1'])):
        values.append(raw_data['value1'][i])
        labels.append(raw_data['label1'][i])
        timestamp.append(i)
    values, labels, timestamp = (np.array(values), np.array(labels),
                                 np.array(timestamp))

    # Complete the timestamp, and obtain the missing point indicators.
    timestamp, missing, (values, labels) = \
        complete_timestamp(timestamp, (values, labels))

    # Split the training and testing data.
    test_portion = 0.2
    test_n = int(len(values) * test_portion)
    train_values, test_values = values[:-test_n], values[-test_n:]
    train_labels, test_labels = labels[:-test_n], labels[-test_n:]
    train_missing, test_missing = missing[:-test_n], missing[-test_n:]

    # Standardize the training and testing data, excluding labeled anomalies
    # and missing points from the mean/std estimation.
    train_values, mean, std = standardize_kpi(
        train_values, excludes=np.logical_or(train_labels, train_missing))
    test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)

    return (train_values, train_labels, train_missing, mean, std,
            test_values, test_labels, test_missing)
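# A minimal usage sketch for get_data(), assuming `raw_data` is a dict-like
# table such as a pandas DataFrame. The synthetic series below is purely
# illustrative and not part of the original script.
import numpy as np
import pandas as pd

raw_data = pd.DataFrame({
    'value1': np.sin(np.linspace(0, 50, 2000)),          # toy KPI signal
    'label1': np.zeros(2000, dtype=np.int32),            # no known anomalies
})

(train_values, train_labels, train_missing, mean, std,
 test_values, test_labels, test_missing) = get_data()
print(train_values.shape, test_values.shape)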
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras as K
from tfsnippet.modules import Sequential
from tqdm import trange

from donut import complete_timestamp, standardize_kpi
# `DonutModel` and `QuietDonutTrainer` are project-local (presumably thin
# wrappers around donut's `Donut` and `DonutTrainer`); their imports are kept
# implicit here.


def fit(self, X: pd.DataFrame):
    with self.device:
        # Reset all results from the last run to avoid reusing variables.
        self.means, self.stds, self.tf_sessions, self.models = [], [], [], []
        for col_idx in trange(len(X.columns)):
            col = X.columns[col_idx]
            tf_session = tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True))
            timestamps = X.index
            # Fill gaps by interpolation, then back-fill any leading NaNs.
            features = X.loc[:, col].interpolate().bfill().values
            labels = pd.Series(0, index=X.index)
            timestamps, _, (features, labels) = \
                complete_timestamp(timestamps, (features, labels))
            missing = np.isnan(X.loc[:, col].values)
            _, mean, std = standardize_kpi(
                features, excludes=np.logical_or(labels, missing))

            with tf.variable_scope('model') as model_vs:
                model = DonutModel(
                    h_for_p_x=Sequential([
                        K.layers.Dense(100,
                                       kernel_regularizer=K.regularizers.l2(0.001),
                                       activation=tf.nn.relu),
                        K.layers.Dense(100,
                                       kernel_regularizer=K.regularizers.l2(0.001),
                                       activation=tf.nn.relu),
                    ]),
                    h_for_q_z=Sequential([
                        K.layers.Dense(100,
                                       kernel_regularizer=K.regularizers.l2(0.001),
                                       activation=tf.nn.relu),
                        K.layers.Dense(100,
                                       kernel_regularizer=K.regularizers.l2(0.001),
                                       activation=tf.nn.relu),
                    ]),
                    x_dims=self.x_dims,
                    z_dims=5,
                )

            trainer = QuietDonutTrainer(model=model, model_vs=model_vs,
                                        max_epoch=self.max_epoch,
                                        batch_size=self.batch_size,
                                        valid_batch_size=self.batch_size,
                                        missing_data_injection_rate=0.0,
                                        lr_anneal_factor=1.0)
            with tf_session.as_default():
                trainer.fit(features, labels, missing, mean, std,
                            valid_portion=0.25)

            # Keep one fitted model (and its session/statistics) per column.
            self.means.append(mean)
            self.stds.append(std)
            self.tf_sessions.append(tf_session)
            self.models.append(model)
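# A hedged sketch of the scoring pass that would pair with fit() above: it
# reuses the per-column sessions, means and stds collected during training.
# `predict_scores` is not part of the original class, and donut's standard
# `DonutPredictor` is assumed here to accept the models stored by fit().
import pandas as pd

from donut import DonutPredictor, standardize_kpi


def predict_scores(self, X: pd.DataFrame):
    scores = []
    for col_idx, col in enumerate(X.columns):
        features = X.loc[:, col].interpolate().bfill().values
        # Standardize with the train-time statistics of this column.
        features, _, _ = standardize_kpi(features,
                                         mean=self.means[col_idx],
                                         std=self.stds[col_idx])
        with self.tf_sessions[col_idx].as_default():
            predictor = DonutPredictor(self.models[col_idx])
            scores.append(predictor.get_score(features))
    return scores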
import os

import numpy as np
import pandas as pd


def generate_score(number):
    # Read the raw data.
    data_dir_path = ('C:/Users/Administrator/Downloads/research/donut-master'
                     '/SMD/data_concat/data-' + number + '.csv')
    data = np.array(pd.read_csv(data_dir_path, header=None), dtype=np.float64)
    tag_dir_path = './SMD/test_label/machine-' + number + '.csv'
    tag = np.array(pd.read_csv(tag_dir_path, header=None), dtype=np.int32)
    # The first half of the concatenated series is unlabeled training data.
    labels = np.append(np.zeros(int(len(data) / 2)), tag)

    # Pick one column.
    values = data[:, 1]
    timestamp = np.arange(len(data)) + 1

    # If there is no label, simply use all zeros.
    # labels = np.zeros_like(values, dtype=np.int32)

    from donut import complete_timestamp, standardize_kpi

    # Complete the timestamp, and obtain the missing point indicators.
    timestamp, missing, (values, labels) = \
        complete_timestamp(timestamp, (values, labels))

    # Split the training and testing data.
    test_portion = 0.5
    test_n = int(len(values) * test_portion)
    train_values, test_values = values[:-test_n], values[-test_n:]
    train_labels, test_labels = labels[:-test_n], labels[-test_n:]
    train_missing, test_missing = missing[:-test_n], missing[-test_n:]

    # Standardize the training and testing data, excluding labeled anomalies
    # and missing points from the mean/std estimation.
    train_values, mean, std = standardize_kpi(
        train_values, excludes=np.logical_or(train_labels, train_missing))
    test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)

    import tensorflow as tf
    from donut import Donut
    from tensorflow import keras as K
    from tfsnippet.modules import Sequential

    # We build the entire model within the scope of `model_vs`; it should hold
    # exactly all the variables of `model`, including the variables created by
    # the Keras layers.
    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=120,
            z_dims=5,
        )

    from donut import DonutTrainer, DonutPredictor

    trainer = DonutTrainer(model=model, model_vs=model_vs)
    predictor = DonutPredictor(model)
    with tf.Session().as_default():
        # `missing` must be passed between the labels and `mean`; the original
        # call omitted it, shifting `mean`/`std` into the wrong arguments.
        trainer.fit(train_values, train_labels, train_missing, mean, std)
        test_score = predictor.get_score(test_values, test_missing)

    if not os.path.exists('./score'):
        os.makedirs('./score')
    np.save('./score/' + number + '.npy', test_score)
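# A minimal driver sketch for generate_score(). The identifiers follow SMD's
# 'machine-<group>-<index>' naming used in the paths above; the short list
# here is illustrative only.
if __name__ == '__main__':
    for number in ['1-1', '1-2', '1-3']:
        generate_score(number)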
from datetime import datetime

import numpy as np
import pandas as pd

from donut import complete_timestamp, standardize_kpi

args = parser.parse_args()

# Load the training data.
data = pd.read_csv(args.train_data, skiprows=[0, 1], header=None)
data = data.dropna()
values = data[2].values
# Column 3 carries labels, but this script ignores them and treats every
# point as unlabeled.
labels = np.zeros_like(values, dtype=np.int32)
date_str = data[0].values
date = [datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in date_str]
date_delta = [x - date[0] for x in date]
# Integer minutes since the first sample; complete_timestamp expects integer
# timestamps on a regular interval.
timestamp = [x.days * 1440 + x.seconds // 60 for x in date_delta]

# Complete the timestamp, and obtain the missing point indicators.
timestamp, missing, (values, labels) = \
    complete_timestamp(timestamp, (values, labels))

# Split the training and testing data.
test_portion = 0.3
test_n = int(len(values) * test_portion)
train_values, test_values = values[:-test_n], values[-test_n:]
train_labels, test_labels = labels[:-test_n], labels[-test_n:]
train_missing, test_missing = missing[:-test_n], missing[-test_n:]

# Standardize the training and testing data.
train_values, mean, std = standardize_kpi(
    train_values, excludes=np.logical_or(train_labels, train_missing))
test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)
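# The construction of `parser` precedes this snippet and is elided; judging
# from the use of `args.train_data`, it presumably looks something like this
# sketch (flag name and help text inferred from usage, not confirmed by the
# source):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train_data',
                    help='training CSV: column 0 holds "%Y-%m-%d %H:%M:%S" '
                         'datetimes, column 2 the KPI values')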
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from tensorflow import keras as K
from tfsnippet.modules import Sequential

from donut import (Donut, DonutTrainer, DonutPredictor, complete_timestamp,
                   iterative_masked_reconstruct, standardize_kpi)
# `ah`, `ch`, `my_func_float` and `my_func_int` are project-local helpers,
# imported elsewhere in the surrounding project.


def vae_donut(ts_obj, window_size, mcmc_iteration, latent_dim,
              gaussian_window_size, step_size,
              plot_reconstruction=False, plot_anomaly_score=False):
    # The authors use window_size = 120 and mcmc_iteration = 10.
    # https://github.com/kratzert/finetune_alexnet_with_tensorflow/issues/8
    tf.reset_default_graph()

    start = time.time()

    # If there are missing time steps, we DO NOT fill them with NaNs, because
    # donut will replace them with 0s via complete_timestamp; see line 6 in
    # https://github.com/NetManAIOps/donut/blob/master/donut/preprocessing.py
    timestamp = ts_obj.dataframe["timestamp"].values
    values = ts_obj.dataframe["value"].values
    labels = np.zeros_like(ts_obj.dataframe["value"].values, dtype=np.int32)

    # Complete the timestamp, and obtain the missing point indicators
    # (missing points are replaced with 0s).
    # donut cannot handle this date format for some reason, so substitute a
    # minute-frequency range of the same length:
    if ts_obj.dateformat == "%Y-%m":
        rng = pd.date_range('2000-01-01', periods=len(values), freq='T')
        timestamp, missing, (values, labels) = complete_timestamp(
            rng, (values, labels))
    else:
        timestamp, missing, (values, labels) = complete_timestamp(
            timestamp, (values, labels))

    # Standardize the data, excluding labeled and missing points.
    values, mean, std = standardize_kpi(
        values, excludes=np.logical_or(labels, missing))

    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=window_size,
            z_dims=latent_dim,
        )

    trainer = DonutTrainer(model=model, model_vs=model_vs)
    predictor = DonutPredictor(model)

    with tf.Session().as_default():
        trainer.fit(values, labels, missing, mean, std)
        score = predictor.get_score(values, missing)
        # Stride the series into overlapping windows: if the time series is
        # [1, 2, 3, 4, ...] and the window size is 3, this gives us
        # [[1, 2, 3], [2, 3, 4], ...].
        ts_strided = ah.as_sliding_window(values, window_size)
        ts_strided = my_func_float(np.array(ts_strided, dtype=np.float32))
        missing_strided = ah.as_sliding_window(missing, window_size)
        missing_strided = my_func_int(
            np.array(missing_strided, dtype=np.int32))

        x = model.vae.reconstruct(
            iterative_masked_reconstruct(reconstruct=model.vae.reconstruct,
                                         x=ts_strided,
                                         mask=missing_strided,
                                         iter_count=mcmc_iteration,
                                         back_prop=False))

        # `x` is a :class:`tfsnippet.stochastic.StochasticTensor`, from which
        # you may derive many useful outputs, for example:
        # x.tensor.eval()                              # the `x` samples
        # x.log_prob(group_ndims=0).eval()             # element-wise log p(x|z)
        # x.distribution.log_prob(ts_strided).eval()   # reconstruction probability
        # x.distribution.mean.eval(), x.distribution.std.eval()  # p(x|z) stats
        tensor_reconstruction_probabilities = \
            x.distribution.log_prob(ts_strided).eval()

        # Because of the way striding works, take all scores from the first
        # window, then the last point/score of each remaining window.
        reconstruction_probabilities = list(
            tensor_reconstruction_probabilities[0])
        for i in range(1, len(tensor_reconstruction_probabilities)):
            slide = tensor_reconstruction_probabilities[i]
            reconstruction_probabilities.append(slide[-1])

    if ts_obj.miss:
        ref_date_range = ch.get_ref_date_range(ts_obj.dataframe,
                                               ts_obj.dateformat,
                                               ts_obj.timestep)
        gaps = ref_date_range[~ref_date_range.isin(
            ts_obj.dataframe["timestamp"])]
        filled_df = ch.fill_df(ts_obj.dataframe, ts_obj.timestep,
                               ref_date_range, "fill_nan")
        filled_df["reconstruction_probabilities"] = \
            reconstruction_probabilities
        # Remove NaNs so the scores line up with the observed points again.
        filled_df = filled_df.dropna()
        reconstruction_probabilities = list(
            filled_df["reconstruction_probabilities"].values)

    reconstruction_probabilities = [abs(item)
                                    for item in reconstruction_probabilities]

    anomaly_scores = ah.determine_anomaly_scores_error(
        reconstruction_probabilities,
        np.zeros_like(reconstruction_probabilities),
        ts_obj.get_length(), gaussian_window_size, step_size)

    end = time.time()

    if plot_reconstruction:
        plt.subplot(211)
        # See lines 98-100 of
        # https://github.com/NetManAIOps/donut/blob/master/donut/prediction.py
        plt.title("Negative of Reconstruction Probabilities")
        plt.plot(reconstruction_probabilities)
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(), color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    if plot_anomaly_score:
        plt.subplot(211)
        plt.title("Anomaly Scores")
        plt.plot(anomaly_scores)
        plt.ylim([.998, 1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(), color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    return {"Anomaly Scores": anomaly_scores,
            "Time": end - start,
            "Reconstruction Probabilities": reconstruction_probabilities}
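# A minimal invocation sketch for vae_donut(). window_size=120 and
# mcmc_iteration=10 are the authors' defaults quoted in the comments above;
# the remaining values and the `ts_obj` time-series wrapper are assumptions
# of this example, supplied by the surrounding project.
results = vae_donut(ts_obj, window_size=120, mcmc_iteration=10, latent_dim=5,
                    gaussian_window_size=128, step_size=64,
                    plot_anomaly_score=True)
print("runtime (s):", results["Time"])
print("num scores:", len(results["Anomaly Scores"]))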