def test_to_dict():
    """Serialise an incrementally built tree and a batch-built tree.

    The dictionary of the batch-built tree is written to ``tree.json``.
    """
    # Tree built one point at a time, containing repeated coordinates.
    incremental = rrcf.RCTree()
    coords = ([0., 0.], [0., 0.], [0., 0.], [0., 1.])
    for label, coord in enumerate(coords):
        incremental.insert_point(coord, index=label)
    obj = incremental.to_dict()

    # Tree built from a batch in which row 5 duplicates row 2.
    X = np.random.randn(10, 3)
    X[5] = X[2]
    obj = rrcf.RCTree(X).to_dict()

    with open('tree.json', 'w') as outfile:
        json.dump(obj, outfile)
def construct_force(self) -> List:
    """
    Forest creator

    Builds a list of trees from random, non-overlapping samples of the rows
    of ``self.data_array``. Trees are added a whole batch at a time, so the
    returned list may hold more than ``self.number_trees`` trees.

    :return:
        - list forest: A list of the trees that are created randomly
    :raises ValueError: when there are fewer rows than ``self.tree_size``
    """
    n_rows = self.number_of_rows
    leaves_per_tree = self.tree_size
    trees_per_batch = n_rows // leaves_per_tree
    if trees_per_batch == 0:
        raise ValueError("Please check the tree_size. It seems that the number of the "
                         "samples (rows) is less than the size of the tree")

    forest = []
    while len(forest) < self.number_trees:
        # Each row of ``batch`` is the set of row indices for one tree.
        batch = np.random.choice(n_rows,
                                 size=(trees_per_batch, leaves_per_tree),
                                 replace=False)
        for row_ids in batch:
            forest.append(rrcf.RCTree(self.data_array[row_ids],
                                      index_labels=row_ids))
    return forest
def generate_forest(self):
    """Store ``self.num_trees`` freshly created, empty trees in ``self.forest``."""
    self.forest = [rrcf.RCTree() for _ in range(self.num_trees)]
def test_print():
    """Smoke-test printing of a leaf and of the root of a small tree."""
    tree = rrcf.RCTree()
    labelled_points = ((0, [0., 0.]), (1, [0., 0.]), (3, [0., 1.]))
    for label, coord in labelled_points:
        tree.insert_point(coord, index=label)
    first_leaf = list(tree.leaves.values())[0]
    print(first_leaf)
    print(tree.root)
def fit_batch(self, points):
    '''
    Creates a rrcf with num_trees trees from random samples of size tree_size
    from a batch set of points.

    Sets ``self.num_points``, ``self.dimension`` and ``self.forest``, and
    records the sampled row indices of tree ``i`` in ``self.ixs[i]``.

    Parameters:
        points: the points from which to create the rrcf. np.ndarray of size (n x d)

    Raises:
        ValueError: if ``points`` is not two-dimensional
    '''
    # Validate explicitly: an ``assert`` is stripped under ``python -O`` and
    # ``is 2`` compares identity rather than value.
    shape = getattr(points, "shape", None)
    if shape is None or len(shape) != 2:
        raise ValueError("Input points must have shape (n x d)")

    self.num_points = shape[0]
    self.dimension = shape[1]

    # scale mean and variance of points
    scaled_points = preprocessing.scale(points)

    forest = []
    while len(forest) < self.num_trees:
        tree_index = len(forest)
        # Sample tree_size distinct row indices uniformly from the point set
        ixs = np.random.choice(self.num_points, self.tree_size, replace=False)
        forest.append(rrcf_base.RCTree(scaled_points[ixs], index_labels=ixs))
        self.ixs[tree_index] = ixs
    self.forest = forest
def get_avgcodisp(instances):
    """Stream ``instances`` through a forest and average CoDisp per shingle.

    Uses the module-level ``num_trees``, ``tree_size`` and ``shingle_size``.
    Returns a dict mapping each shingle index to the CoDisp of that point
    averaged over all trees.
    """
    forest = [rrcf.RCTree() for _ in range(num_trees)]
    # Rolling window over the input
    windows = rrcf.shingle(instances, size=shingle_size)

    avg_codisp = {}
    for index, point in enumerate(windows):
        for tree in forest:
            # Keep the tree bounded: evict the oldest point first (FIFO)
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)
            tree.insert_point(point, index=index)
            # Accumulate this tree's share of the average
            share = tree.codisp(index) / num_trees
            avg_codisp[index] = avg_codisp.get(index, 0) + share
    return avg_codisp
def robust_random_cut(self, sketch_vector):
    """Flag points whose average CoDisp lies above the 70th percentile.

    A forest of at least 50 trees is built from random, non-overlapping
    samples of 256 rows of ``sketch_vector``. Each point's CoDisp is then
    averaged over the trees that contain it.

    :param sketch_vector: indexable collection of points; it is indexed with
        integer index arrays, so presumably an np.ndarray -- TODO confirm
    :return: boolean pandas Series indexed 0..n-1, True where the average
        CoDisp exceeds the 0.70 quantile
    :raises ValueError: if there are fewer than 256 points
    """
    num_trees = 50
    tree_size = 256
    n = len(sketch_vector)
    sample_size_range = (n // tree_size, tree_size)
    # With zero trees per batch the sampling loop below would never end.
    if sample_size_range[0] == 0:
        raise ValueError(
            "Need at least {} points to build a tree, got {}".format(
                tree_size, n))

    forest = []
    while len(forest) < num_trees:
        # Select random subsets of points uniformly from point set
        ixs = np.random.choice(n, size=sample_size_range, replace=False)
        # Add sampled trees to forest
        forest.extend(
            rrcf.RCTree(sketch_vector[ix], index_labels=ix) for ix in ixs)

    # Compute average CoDisp; ``index`` counts the trees holding each point
    avg_codisp = pd.Series(0.0, index=np.arange(n))
    index = np.zeros(n)
    for tree in forest:
        codisp = pd.Series(
            {leaf: tree.codisp(leaf) for leaf in tree.leaves})
        avg_codisp[codisp.index] += codisp
        np.add.at(index, codisp.index.values, 1)
    avg_codisp /= index

    predicted = avg_codisp > avg_codisp.quantile(0.70)
    return predicted
def create_forest(num_trees):
    """Return a list of ``num_trees`` freshly created, empty trees."""
    return [rrcf.RCTree() for _ in range(num_trees)]
def test_from_dict():
    """Rebuild a tree from ``tree.json`` and check that no leaf was lost."""
    expected_leaves = 10
    with open('tree.json', 'r') as infile:
        payload = json.load(infile)

    # Exercise both loading paths: in-place load and the class constructor.
    in_place = rrcf.RCTree()
    in_place.load_dict(payload)
    rebuilt = rrcf.RCTree.from_dict(payload)

    # Ensure we didn't drop any duplicate leaves
    assert len(rebuilt.leaves) == expected_leaves
def test_insert_depth():
    """Leaf depths must stay non-negative after forgetting a point."""
    tree = rrcf.RCTree()
    coords = ([0., 0.], [0., 0.], [0., 0.], [0., 1.])
    for label, coord in enumerate(coords):
        tree.insert_point(coord, index=label)
    tree.forget_point(index=3)
    depths = [leaf.d for leaf in tree.leaves.values()]
    assert min(depths) >= 0
def robust_random_cut(self, sketch_vector):
    """Score sketches with a streaming forest and report on the top 5%.

    ``sketch_vector`` is sorted by its ``graphid`` column (descending), its
    ``sketch`` column is scaled and streamed through 50 trees of at most
    about 32 leaves, and a classification report against the ``anomaly``
    column is printed.

    :return: ``(pred_rrcf, disp)`` -- the boolean Series of scores above the
        0.95 quantile, and the Series of average CoDisp scores
    """
    num_trees = 50
    shingle_size = 1
    tree_size = 32

    ordered = sketch_vector.sort_values(by='graphid', ascending=False)
    scaled = preprocessing.scale(ordered['sketch'].tolist())

    forest = [rrcf.RCTree() for _ in range(num_trees)]

    scores = {}
    for index, point in enumerate(rrcf.shingle(scaled, size=shingle_size)):
        if index % 50 == 0:
            print("Index: ", index)
        running = 0
        for tree in forest:
            # Evict the oldest point once the tree exceeds its size (FIFO)
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)
            tree.insert_point(point, index=index)
            # Each tree contributes an equal share of the average
            running += tree.codisp(index) / num_trees
        scores[index] = running

    disp = pd.Series(list(scores.values()))
    pred_rrcf = disp > disp.quantile(0.95)
    print(
        metrics.classification_report(np.array(ordered['anomaly']),
                                      pred_rrcf))
    return pred_rrcf, disp
def __init__(self, number_of_trees=40, train_size=500, queue_size=500,
             last_scores_size=8000, small_window_size=10):
    """Create an empty forest and the bookkeeping containers.

    :param number_of_trees: number of empty trees placed in ``self.forest``
    :param train_size: stored as ``self.train_size``
    :param queue_size: stored as ``self.queue_size``
    :param last_scores_size: stored as ``self.last_scores_size``
    :param small_window_size: stored as ``self.small_window_size``

    NOTE(review): how the stored sizes are used is defined by the other
    methods of the class, which are not visible here.
    """
    super().__init__()
    self.forest = [rrcf.RCTree() for _ in range(number_of_trees)]
    self.train_size = train_size
    self.current_index = 0
    self.queue_size = queue_size
    # Initialised once (the original assigned this attribute twice).
    self.anomaly_scores_queue = []
    self.points_to_add_in_future = []
    self.last_scores_size = last_scores_size
    self.small_window_size = small_window_size
    self.queue = []
def find_anomalies(input):
    """Score a sequence of records with a streaming random cut forest.

    Each record's ``'value'`` is inserted into 40 trees of at most about 256
    leaves (oldest point evicted first). The CoDisp of the new point is
    averaged over the trees.

    :param input: sized, indexable sequence of mappings with ``'value'`` and
        ``'timestamp'`` keys
    :return: list of dicts with keys ``'value'``, ``'timestamp'``,
        ``'isAnomaly'`` (average CoDisp > 40) and ``'codisp'``, one per
        input record and in input order
    """
    if len(input) == 0:
        return []

    # Set tree parameters
    num_trees = 40
    tree_size = 256
    codisp_threshold = 40

    forest = [rrcf.RCTree() for _ in range(num_trees)]
    values = [record['value'] for record in input]

    avg_codisp = {}
    for index, point in enumerate(values):
        avg_codisp[index] = 0
        for tree in forest:
            # If tree is above permitted size, drop the oldest point (FIFO)
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)
            tree.insert_point(point, index=index)
            # Average the CoDisp of the new point among all trees
            avg_codisp[index] += tree.codisp(index) / num_trees

    output = []
    for index, record in enumerate(input):
        codisp = avg_codisp[index]
        output.append({
            'value': record['value'],
            'timestamp': record['timestamp'],
            'isAnomaly': codisp > codisp_threshold,
            'codisp': codisp,
        })
    return output
def init(df, param):
    """Build a random cut forest from selected columns of ``df``.

    :param df: DataFrame holding the data; rows are the points
    :param param: mapping that may contain
        ``param['options']['params']['num_trees']`` (default 15),
        ``param['options']['params']['tree_size']`` (default 30),
        ``param['target_variables']`` and ``param['feature_variables']``
        (column names; targets first, then features)
    :return: list of trees; trees are added a batch at a time, so the list
        may hold more than ``num_trees`` trees
    :raises ValueError: if ``df`` has fewer rows than ``tree_size``
    """
    # Set model parameters
    n_rows = len(df)
    num_trees = 15
    tree_size = 30
    overrides = param.get('options', {}).get('params', {})
    if 'num_trees' in overrides:
        num_trees = int(overrides['num_trees'])
    if 'tree_size' in overrides:
        tree_size = int(overrides['tree_size'])

    # Build a fresh list so the caller's ``param`` lists are not mutated.
    variables = (list(param.get('target_variables', []))
                 + list(param.get('feature_variables', [])))

    # Convert data to nparray
    data = df[variables].to_numpy().astype(float)

    # Derive the sample shape only after the overrides have been applied.
    sample_size_range = (n_rows // tree_size, tree_size)
    # With zero trees per batch the sampling loop below would never end.
    if num_trees > 0 and sample_size_range[0] == 0:
        raise ValueError(
            "Need at least {} rows to build a tree, got {}".format(
                tree_size, n_rows))

    # Create the random cut forest
    forest = []
    while len(forest) < num_trees:
        # Select random subsets of points uniformly
        ixs = np.random.choice(n_rows, size=sample_size_range, replace=False)
        # Add sampled trees to forest
        forest.extend(rcf.RCTree(data[ix], index_labels=ix) for ix in ixs)
    return forest
def rrcf_calc(dfs):
    """Compute and plot average CoDisp for the first 90 days of ``dfs[0]``.

    Only the ``phase_dif`` column of ``dfs[0]`` is used, restricted to the
    90 days after its earliest index value. A forest of at least 6000 trees
    of 1000 points each is built from random samples. The per-point average
    CoDisp is printed and passed to ``plot_anomaly_score``.

    :param dfs: sequence whose first element is a DataFrame with a
        ``phase_dif`` column; its index must support adding a
        ``pd.Timedelta`` (presumably a DatetimeIndex -- TODO confirm)
    :raises ValueError: if the 90-day window holds fewer than 1000 points
    """
    print(dfs)

    df_merged = dfs[0][['phase_dif']]
    min_date = df_merged.index.min()
    df_merged = df_merged[:min_date + pd.Timedelta(days=90)]
    print(df_merged)

    num_points = df_merged.shape[0]
    print("num_points: " + str(num_points))

    num_trees = 6000
    tree_size = 1000
    sample_size_range = (num_points // tree_size, tree_size)
    # With zero trees per batch the sampling loop below would never end.
    if sample_size_range[0] == 0:
        raise ValueError(
            "Need at least {} points to build a tree, got {}".format(
                tree_size, num_points))

    forest = []
    while len(forest) < num_trees:
        print(len(forest))
        indices = np.random.choice(num_points,
                                   size=sample_size_range,
                                   replace=False)
        forest.extend(
            rrcf.RCTree(df_merged.iloc[ix], index_labels=ix)
            for ix in indices)

    # Average CoDisp over the trees that actually contain each point
    avg_codisp = pd.Series(0.0, index=np.arange(num_points))
    n_owning_trees = np.zeros(num_points)
    for tree in forest:
        codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
        avg_codisp[codisp.index] += codisp
        np.add.at(n_owning_trees, codisp.index.values, 1)
    avg_codisp /= n_owning_trees
    print(avg_codisp)

    plot_anomaly_score(dfs[0], avg_codisp)
def robust_random_cut_batch(self, sketch):
    """Batch-score sketches and report on the top 5% by average CoDisp.

    A forest of at least 50 trees is built from random, non-overlapping
    samples of 256 rows of the ``sketch`` column. A classification report
    against the ``anomaly`` column is printed.

    :param sketch: DataFrame-like object with ``sketch`` and ``anomaly``
        columns
    :return: ``(pred_rrcf, avg_codisp)`` -- the boolean Series of scores
        above the 0.95 quantile, and the Series of average CoDisp scores
    :raises ValueError: if there are fewer than 256 sketches
    """
    sketch_vector = np.array(sketch['sketch'].tolist())

    num_trees = 50
    tree_size = 256
    n = len(sketch_vector)
    sample_size_range = (n // tree_size, tree_size)
    # With zero trees per batch the sampling loop below would never end.
    if sample_size_range[0] == 0:
        raise ValueError(
            "Need at least {} points to build a tree, got {}".format(
                tree_size, n))

    forest = []
    while len(forest) < num_trees:
        # Select random subsets of points uniformly from point set
        ixs = np.random.choice(n, size=sample_size_range, replace=False)
        forest.extend(
            rrcf.RCTree(sketch_vector[ix], index_labels=ix) for ix in ixs)

    # Compute average CoDisp; ``index`` counts the trees holding each point
    avg_codisp = pd.Series(0.0, index=np.arange(n))
    index = np.zeros(n)
    for tree in forest:
        codisp = pd.Series(
            {leaf: tree.codisp(leaf) for leaf in tree.leaves})
        avg_codisp[codisp.index] += codisp
        np.add.at(index, codisp.index.values, 1)
    avg_codisp /= index

    pred_rrcf = avg_codisp > avg_codisp.quantile(0.95)
    # Labels live on the input frame; ``sketch_vector`` is a plain array
    # and cannot be indexed by column name.
    print(
        metrics.classification_report(np.array(sketch['anomaly']),
                                      pred_rrcf))
    return pred_rrcf, avg_codisp
import numpy as np
import rrcf

# Shared fixtures: a seeded random batch of n points in d dimensions, the
# tree built from it, and five point indices taken from a shuffled deck.
np.random.seed(0)
n = 100
d = 3
X = np.random.randn(n, d)
tree = rrcf.RCTree(X)
deck = np.arange(n, dtype=int)
np.random.shuffle(deck)
indexes = deck[:5]


def test_batch():
    """Verify leaf counts and bounding boxes of the batch-built tree."""
    # Check stored bounding boxes and leaf counts after instantiating from batch
    branches = []
    # Collect every node visited by map_branches into ``branches``
    tree.map_branches(tree.root, op=tree._get_nodes, stack=branches)
    leafcount = tree._count_leaves(tree.root)
    assert (leafcount == n)
    for branch in branches:
        # The stored count must equal the number of leaves counted below it
        leafcount = tree._count_leaves(branch)
        assert (leafcount == branch.n)
        # The stored box must equal the freshly computed one
        bbox = tree.get_bbox(branch)
        assert (bbox == branch.b).all()


def test_forget_batch():
    """Forget the five sampled points from the shared tree."""
    # Check stored bounding boxes and leaf counts after forgetting points
    # NOTE(review): no assertions are visible here -- the remainder of this
    # test may lie outside this view.
    for index in indexes:
        forgotten = tree.forget_point(index)
def stream_anomaly_scores(self, points, window_size, new_forest = False):
    '''
    Computes anomaly scores for all points in a stream by computing the average
    collusive displacement. The assumption is that each point in the stream is only
    observed sequentially. Higher scores indicate a higher displacement and thus a
    higher likelihood of anomaly.

    A new forest of empty trees is created when no forest exists yet or when
    new_forest is True; otherwise the existing forest keeps being updated and
    indexing continues from self.num_points.

    Parameters:
        points: the stream of point on which to calculate anomaly scores
        window_size: the window size in which to ingest points. points are mapped as
            a n-dimensional window, where n = window_size
        new_forest: boolean that identifies whether to create a new forest or not

    Returns:
        anomaly_scores: pandas Series with index of points and average collusive
            displacement (anomaly score) for each point. The Series is allocated
            with points.shape[0] entries; entries for which the shingle
            generator yields no window keep their initial value 0.0.

    Raises:
        ValueError: if a point cannot be inserted into a tree
    '''
    # create a new empty forest if none exists or a fresh one is requested
    if self.forest is None or new_forest:
        self.num_points = 0
        forest = []
        for i in range(self.num_trees):
            forest.append(rrcf_base.RCTree())
            self.ixs[i] = []
        self.forest = forest

    # create rolling window of size window_size
    points_gen = rrcf_base.shingle(points, size=window_size)

    # calculate streaming anomaly scores
    initial_index = self.num_points
    avg_codisp = pd.Series(
        0.0,
        index=np.arange(initial_index, initial_index + points.shape[0]))

    for offset, point in enumerate(points_gen):
        index = initial_index + offset
        for tree_idx, tree in enumerate(self.forest):
            # If tree is above permitted size, drop the oldest point (FIFO)
            # TODO: forget oldest point or another random point with prob
            if len(tree.leaves) >= self.tree_size:
                forget_index = min(self.ixs[tree_idx])
                tree.forget_point(forget_index)
                self.ixs[tree_idx] = np.delete(
                    self.ixs[tree_idx],
                    np.argwhere(self.ixs[tree_idx] == forget_index))

            # Insert the new point into the tree. The original built a
            # ValueError here without raising it, so a failed insert was
            # silently ignored and only surfaced later in codisp().
            try:
                tree.insert_point(point, index=index)
            except Exception as err:
                raise ValueError(
                    'failure for point {} at index {}'.format(point, index)
                ) from err
            self.ixs[tree_idx] = np.append(self.ixs[tree_idx], index)

            # Accumulate codisp of the new point; averaged over trees on return
            avg_codisp[index] += tree.codisp(index)

        self.num_points += 1

    return avg_codisp / self.num_trees
def __init__(self, stream, window_size, max_size, view, alpha=1, beta=1,
             gamma=1, data_stream=False, freq=False, num_trees=50,
             max_depth=256, seed=None):
    '''
    :stream: a Stream object to mine
    :window_size: the size of a window to use—determines the max size of snippets
    :max_size, view, alpha, beta, gamma: forwarded unchanged to the parent
        constructor (which is also called with save_output=False)
    :data_stream: when True, set up the binned data-stream baseline
        (implemented only for stream names starting with 'darpa' or 'chicago')
    :freq: stored as self.freq
    :num_trees: number of empty rrcf trees to create
    :max_depth: stored as self.tree_size
    :seed: passed to np.random.seed and stored as self.seed
    '''
    print('Running anomaly version.')
    super().__init__(stream,
                     window_size,
                     max_size,
                     view=view,
                     alpha=alpha,
                     beta=beta,
                     gamma=gamma,
                     save_output=False)

    print('Random seed {}.'.format(seed))
    self.seed = seed

    # tree parameters for rrcf
    np.random.seed(seed)
    self.index = 0
    self.num_trees = num_trees
    self.tree_size = max_depth
    self.data_stream = data_stream
    self.freq = freq

    # create a forest of empty trees for rrcf
    self.forest = [rrcf.RCTree() for _ in range(self.num_trees)]

    self.anomaly_scores = list()
    self.score_times = list()
    self.score_snippets = list()

    if self.data_stream:
        # NOTE: hardcoded for DARPA IP and Chicago (the two datasets used in the paper).
        print('Running data stream version of P')
        self.occ_intervals = defaultdict(set)

        # paper uses 60 measurement periods
        if stream.name.startswith('darpa'):
            bin_width = 87725 / 60
        elif stream.name.startswith('chicago'):
            gt_name = '../data/{}.txt'.format(self.stream.name)
            # Close the file deterministically instead of leaking the handle.
            with open(gt_name, 'r') as gt_file:
                lines = gt_file.readlines()
            ts = int(lines[0].strip().split(',')[-1])
            te = int(lines[-1].strip().split(',')[-1])
            width = te - ts
            print('Width = te - ts = {} - {} = {}'.format(te, ts, width))
            # The measured width is only printed; the bin width itself
            # uses a hard-coded stream length.
            bin_width = 7773420 / 60
        else:
            print(
                'Baseline not implemented for datasets other than DARPA IP and Chicago Bike because the baseline falsely assumes that the stream length is known a priori.'
            )
            sys.exit(1)

        # Upper edges of the first 59 measurement periods
        self.bins = list()
        for i in range(1, 60):
            self.bins.append(0 + (bin_width * i))
def test_from_dict():
    """Load ``tree.json`` through both the instance and the class API."""
    with open('tree.json', 'r') as infile:
        payload = json.load(infile)

    # In-place load into an existing, empty tree
    in_place = rrcf.RCTree()
    in_place.load_dict(payload)

    # Alternate constructor
    tree = rrcf.RCTree.from_dict(payload)
marker="v", markersize=10, label='iForest') auc_all[j, 1] = metrics.roc_auc_score(y, alg_scores) print('\n******RRCF*******\n') num_trees = 2500 tree_size = 256 forest = [] while len(forest) < num_trees: # Select random subsets of points uniformly from point set ixs = np.random.choice(n, size=(n // tree_size, tree_size), replace=False) # Add sampled trees to forest trees = [rrcf.RCTree(X[ix], index_labels=ix) for ix in ixs] forest.extend(trees) # Compute average CoDisp avg_codisp = pd.Series(0.0, index=np.arange(n)) index = np.zeros(n) for tree in forest: codisp = pd.Series( {leaf: tree.codisp(leaf) for leaf in tree.leaves}) avg_codisp[codisp.index] += codisp np.add.at(index, codisp.index.values, 1) avg_codisp /= index alg_scores = avg_codisp fpr_alg, tpr_alg, thresholds_alg = metrics.roc_curve(y, alg_scores,
def anomaly_detect2(df, flagdf, num_trees, tree_size):
    """Flag and blank out the top 2% CoDisp outliers of every column.

    For each column of ``df`` that is not entirely NaN, the linearly
    interpolated values are paired with a constant zero column and scored
    with a batch random cut forest. Rows whose average CoDisp exceeds the
    0.98 quantile get ``+2`` added in ``flagdf`` and are set to NaN in
    ``df``, both in that same column.

    :param df: DataFrame of series to screen (modified in place)
    :param flagdf: DataFrame of flags with the same columns and row order
        as ``df`` (modified in place)
    :param num_trees: minimum number of trees per column
    :param tree_size: number of points sampled for each tree
    :return: ``(df, flagdf)``
    :raises ValueError: if a column has to be scored but ``df`` has fewer
        rows than ``tree_size``
    """
    n = df.shape[0]
    codisps = {}

    for i in range(df.shape[1]):
        print('var' + str(i))
        varname = df.columns[i]
        xseries = df.iloc[:, i].interpolate(method='linear')

        if all(np.isnan(xseries)):
            continue
        # Interpolation cannot fill a leading NaN; copy the second value.
        # Positional access so that this works for any index type.
        if np.isnan(xseries.iloc[0]):
            xseries.iloc[0] = xseries.iloc[1]

        # Pair the values with a constant zero column to get 2-D points
        xseries = pd.concat([
            xseries,
            pd.Series(np.repeat(0, df.shape[0]), index=xseries.index)
        ], axis=1).to_numpy()

        sample_size_range = (n // tree_size, tree_size)
        # With zero trees per batch the sampling loop below would never end.
        if num_trees > 0 and sample_size_range[0] == 0:
            raise ValueError(
                "Need at least {} rows to build a tree, got {}".format(
                    tree_size, n))

        # Construct forest
        forest = []
        while len(forest) < num_trees:
            # Select random subsets of points uniformly
            ixs = np.random.choice(n, size=sample_size_range, replace=False)
            # Add sampled trees to forest
            forest.extend(
                rrcf.RCTree(xseries[ix], index_labels=ix) for ix in ixs)

        # Compute average CoDisp; ``index`` counts the trees per point
        avg_codisp = pd.Series(0.0, index=np.arange(n))
        index = np.zeros(n)
        for tree in forest:
            codisp = pd.Series(
                {leaf: tree.codisp(leaf) for leaf in tree.leaves})
            avg_codisp[codisp.index] += codisp
            np.add.at(index, codisp.index.values, 1)
        avg_codisp /= index

        codisps[varname] = avg_codisp

    for c in list(codisps.keys()):
        avg_codisp = codisps[c]

        # get top 2% of anomaly scores; flag those points with +2
        avg_codisp_df = pd.DataFrame(avg_codisp, columns=['score'])
        thresh = float(avg_codisp_df['score'].quantile(0.98))
        outl_inds_bool = avg_codisp_df.loc[:, 'score'] > thresh
        outl_inds_int = outl_inds_bool[outl_inds_bool].index
        outl_vals = flagdf.loc[flagdf.index[outl_inds_int], c]
        flagdf.loc[flagdf.index[outl_inds_int], c] = outl_vals + 2
        # Blank the outliers in the column they belong to (the original
        # used the stale loop variable and always hit the last column).
        df.loc[df.index[outl_inds_int], c] = np.nan

    return (df, flagdf)
import numpy as np
import rrcf

# Shared fixtures. Statement order matters because every step draws from
# the seeded global NumPy random state.
np.random.seed(0)
n = 100
d = 3
X = np.random.randn(n, d)
# Copy of X whose last ten rows are identical (all ones)
Z = np.copy(X)
Z[90:, :] = 1
tree = rrcf.RCTree(X)
duplicate_tree = rrcf.RCTree(Z)
tree_seeded = rrcf.RCTree(random_state=0)
duplicate_tree_seeded = rrcf.RCTree(random_state=np.random.RandomState(0))
deck = np.arange(n, dtype=int)
np.random.shuffle(deck)
indexes = deck[:5]


def test_batch():
    """Stored leaf counts and bounding boxes must match recomputed ones."""
    visited = []
    tree.map_branches(tree.root, op=tree._get_nodes, stack=visited)

    # The whole tree must account for every input point.
    assert tree._count_leaves(tree.root) == n

    for node in visited:
        assert tree._count_leaves(node) == node.n
        recomputed_box = tree.get_bbox(node)
        assert (recomputed_box == node.b).all()
# Set forest parameters
num_trees = 1000
tree_size = 256
n = df.shape[0]  # (11183, 6)
sample_size_range = (n // tree_size, tree_size)
# With zero trees per batch the sampling loop below would never end.
if sample_size_range[0] == 0:
    raise ValueError(
        "Need at least {} rows to build a tree, got {}".format(tree_size, n))

# Construct forest
forest = []
while len(forest) < num_trees:
    # Select random subsets of points uniformly
    ixs = np.random.choice(n, size=sample_size_range, replace=False)
    # Add sampled trees to forest
    trees = [rrcf.RCTree(ndf[ix], index_labels=ix) for ix in ixs]
    forest.extend(trees)

# Compute average CoDisp; ``index`` counts the trees holding each point
avg_codisp = pd.Series(0.0, index=np.arange(n))
index = np.zeros(n)
for tree in forest:
    codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
    avg_codisp[codisp.index] += codisp
    np.add.at(index, codisp.index.values, 1)
avg_codisp /= index

fig, ax1 = plt.subplots(figsize=(10, 5))
tree_size = 256 #256 for i in range(len(df.columns)): varname = df.columns[i] xseries = df.iloc[:, i].interpolate(method='linear').to_numpy() if all(np.isnan(xseries)): continue if np.isnan(xseries[0]): xseries[0] = xseries[1] #create a forest of empty trees forest = [] for _ in range(num_trees): tree = rrcf.RCTree() forest.append(tree) #create rolling window points = rrcf.shingle(xseries, size=shingle_size) avg_codisp = {} for index, point in enumerate(points): if index % 2000 == 0: # if index > 16000: print('point' + str(index)) # if index == 17920: # raise ValueError('a') for tree in forest: #drop the oldest point (FIFO) if tree is too big if len(tree.leaves) > tree_size:
def _init_modeling(self):
    """Clean the initial training data with rrcf, then pre-train the model.

    Reads ``initial_training_data.csv``. Rows from position 80000 onward
    are streamed through a forest of ``self.num_trees`` trees; a row whose
    average CoDisp has an absolute z-score above 3 (relative to all scores
    stored so far) is replaced by the mean of the five preceding rows.
    ``self.mfr`` is then fitted incrementally on three lagged ``target``
    values, and the trailing values are stored in
    ``self.previous_target_3`` and ``self.previous_train_batch``.
    """
    network = pd.read_csv('initial_training_data.csv',
                          index_col='date',
                          parse_dates=['date'])

    self.forest = []
    for _ in range(self.num_trees):
        self.forest.append(rrcf.RCTree())

    train_len = len(network)
    train_start = 80000
    self.idx = 0
    print("start!")

    for index in range(train_start, train_len):
        point = float(network[index:index + 1].values)  # get one by one

        # Start this point's accumulator once. The original tested
        # ``index`` but initialised key ``self.idx``, which reset the sum
        # for every tree and kept only the last tree's contribution.
        self.avg_codisp[self.idx] = 0
        for tree in self.forest:
            if len(tree.leaves) > self.tree_size:
                tree.forget_point(self.idx - self.tree_size)
            tree.insert_point(point, index=self.idx)
            self.avg_codisp[self.idx] += tree.codisp(
                self.idx) / self.num_trees

        # avg_codisp is the mean, over trees, of how strongly each tree
        # considers this point an anomaly
        scores = np.array(list(self.avg_codisp.values()))
        mean = scores.mean()
        std = scores.std()
        z = (self.avg_codisp[self.idx] - mean) / std
        self.idx += 1

        if z > 3.0 or z < -3.0:  # if abs(z-score) is over 3.0
            # replace the value with the mean of the previous 5 rows
            network.iloc[index] = network[index - 5:index].mean()

    print("init_modeling에서 trainign 시작")
    for i in range(7 + train_start, train_len):
        X_train = pd.Series()
        X_train['prev1'] = float(network[i - 7:i - 6]['target'].values)
        X_train['prev2'] = float(network[i - 6:i - 5]['target'].values)
        X_train['prev3'] = float(network[i - 5:i - 4]['target'].values)
        y_train = (network[i:i + 1]['target'].values)
        self.mfr.partial_fit(X_train.values.reshape(1, -1), y_train)
    print("train 완료")

    # Keep the trailing values for use by later code
    self.previous_target_3['prev3'] = float(
        network[train_len - 8:train_len - 7]['target'].values)
    self.previous_target_3['prev2'] = float(
        network[train_len - 7:train_len - 6]['target'].values)
    self.previous_target_3['prev1'] = float(
        network[train_len - 6:train_len - 5]['target'].values)
    self.previous_train_batch = network[train_len - 5:train_len]['target'].values
    print('endebded')
def anomaly_detect(df, flagdf, num_trees, shingle_size, tree_size):
    """Flag and blank out the top 2% CoDisp outliers of every column.

    Each column of ``df`` that is not entirely NaN is linearly interpolated
    and streamed, as rolling windows of ``shingle_size`` values, through a
    forest of ``num_trees`` trees holding at most about ``tree_size``
    points. Windows whose average collusive displacement exceeds the 0.98
    quantile get ``+2`` added in ``flagdf`` and are set to NaN in ``df``,
    both in that same column.

    :param df: DataFrame of series to screen (modified in place)
    :param flagdf: DataFrame of flags with the same columns and row order
        as ``df`` (modified in place)
    :param num_trees: number of trees in each per-column forest
    :param shingle_size: length of the rolling window
    :param tree_size: tree size above which the oldest point is dropped
    :return: ``(df, flagdf)``
    """
    codisps = {}

    for i in range(df.shape[1]):
        print('var' + str(i))
        varname = df.columns[i]
        xseries = df.iloc[:, i].interpolate(method='linear').to_numpy()

        if all(np.isnan(xseries)):
            continue
        # Interpolation cannot fill a leading NaN; copy the second value.
        if np.isnan(xseries[0]):
            xseries[0] = xseries[1]

        # create a forest of empty trees
        forest = [rrcf.RCTree() for _ in range(num_trees)]

        # create rolling window
        points = rrcf.shingle(xseries, size=shingle_size)

        avg_codisp = {}
        for index, point in enumerate(points):
            if index % 2000 == 0:
                print('point' + str(index))

            for tree in forest:
                # drop the oldest point (FIFO) if tree is too big
                if len(tree.leaves) > tree_size:
                    tree.forget_point(index - tree_size)

                tree.insert_point(point, index=index)

                # compute collusive displacement on the inserted point
                new_codisp = tree.codisp(index)

                # take the average codisp across all trees; that's anomaly score
                if index not in avg_codisp:
                    avg_codisp[index] = 0
                avg_codisp[index] += new_codisp / num_trees

        codisps[varname] = avg_codisp

    for c in list(codisps.keys()):
        avg_codisp = codisps[c]

        # get top 2% of anomaly scores; flag those points with +2
        avg_codisp_df = pd.DataFrame.from_dict(avg_codisp, orient='index',
                                               columns=['score'])
        thresh = float(avg_codisp_df['score'].quantile(0.98))
        outl_inds_bool = avg_codisp_df.loc[:, 'score'] > thresh
        outl_inds_int = outl_inds_bool[outl_inds_bool].index
        outl_vals = flagdf.loc[flagdf.index[outl_inds_int], c]
        flagdf.loc[flagdf.index[outl_inds_int], c] = outl_vals + 2
        # Blank the outliers in the column they belong to (the original
        # used the stale loop variable and always hit the last column).
        df.loc[df.index[outl_inds_int], c] = np.nan

    return (df, flagdf)
import numpy as np
import rrcf

# Shared fixtures. Statement order matters because every step draws from
# the seeded global NumPy random state.
np.random.seed(0)
n = 100
d = 3
X = np.random.randn(n, d)
# Copy of X whose last ten rows are identical (all ones)
Z = np.copy(X)
Z[90:, :] = 1
tree = rrcf.RCTree(X)
duplicate_tree = rrcf.RCTree(Z)
deck = np.arange(n, dtype=int)
np.random.shuffle(deck)
indexes = deck[:5]


def test_batch():
    """Stored leaf counts and bounding boxes must match recomputed ones."""
    visited = []
    tree.map_branches(tree.root, op=tree._get_nodes, stack=visited)

    # The whole tree must account for every input point.
    assert tree._count_leaves(tree.root) == n

    for node in visited:
        assert tree._count_leaves(node) == node.n
        recomputed_box = tree.get_bbox(node)
        assert (recomputed_box == node.b).all()
print("{}:{} plotted".format(i, col)) # Set forest parameters num_trees = 100 tree_size = 256 sample_size_range = (n_train // tree_size, tree_size) # Construct forest forest = [] while len(forest) < num_trees: # Select random subsets of points uniformly ixs = np.random.choice(n_train, size=sample_size_range, replace=False) # Add sampled trees to forest trees = [rrcf.RCTree(ndf_train[ix], index_labels=ix) for ix in ixs] forest.extend(trees) print("Forest constructed") """ # Compute average CoDisp avg_codisp = pd.Series(0.0, index=np.arange(n_train)) index = np.zeros(n_train) for tree in forest: codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves}) avg_codisp[codisp.index] += codisp np.add.at(index, codisp.index.values, 1) avg_codisp /= index""" # Create a dict to store anomaly score of each point avg_codisp = np.zeros(ndf_test.shape[0])
def main():
    """Train a forest on one X-Plane dataset and score another in parallel.

    Steps: load the test CSV and scale it by 0.8, plot every test column to
    ``<dataset_test>/<column>.png``, build a forest of at least 100 trees of
    256 points from the first 7540 training rows, score the test chunks
    with ``test_func`` in a process pool, and save the stacked scores to
    ``<dataset_test>/result.png``.
    """
    dataset_test = "xplane_1473"
    df_test = pd.read_csv('datasets/{}.csv'.format(dataset_test),
                          delimiter='|')
    ndf_test = df_test.to_numpy()
    ndf_test[...] *= 0.8  # distortion step: scale all test values
    print("Test type: {}".format(ndf_test.dtype))

    dataset_train = "xplane_7540"
    df_train = pd.read_csv('datasets/{}.csv'.format(dataset_train),
                           delimiter='|')
    n_train = 7540
    ndf_train = df_train.to_numpy()
    print("Train type: {}".format(ndf_train.dtype))
    ndf_train = ndf_train[:n_train]

    if not os.path.isdir(dataset_test):
        os.mkdir(dataset_test)

    start_plot = time()
    for i, col in enumerate(df_test.columns):
        plt.title(col)
        plt.xlabel('time')
        plt.ylabel('value')
        plt.plot(ndf_test[:, i])
        plt.savefig('{}/{}.png'.format(dataset_test, col))
        plt.clf()
        print("{}:{} plotted".format(i, col))
    end_plot = time()

    # Set forest parameters
    num_trees = 100
    tree_size = 256
    sample_size_range = (n_train // tree_size, tree_size)

    # Construct forest
    start_forest = time()
    forest = []
    while len(forest) < num_trees:
        # Select random subsets of points uniformly
        ixs = np.random.choice(n_train, size=sample_size_range, replace=False)
        # Add sampled trees to forest
        trees = [rrcf.RCTree(ndf_train[ix], index_labels=ix) for ix in ixs]
        forest.extend(trees)
    print("Forest constructed")
    end_forest = time()

    cores = cpu_count()
    start_test = time()
    # Drop leading rows so the remainder splits evenly across the cores
    ndf_test = ndf_test[(ndf_test.shape[0] % cores):]
    # NOTE(review): ``[1:]`` discards the first chunk -- confirm intended.
    ndf_splitted = np.split(ndf_test, cores, axis=0)[1:]

    # Pool.map accepts a single iterable; starmap unpacks one argument
    # tuple per chunk. The context manager shuts the workers down.
    # NOTE(review): assumes test_func(forest, n_train, num_trees, chunk),
    # matching the positional arguments of the original call -- confirm.
    jobs = [(forest, n_train, num_trees, chunk) for chunk in ndf_splitted]
    with Pool(cores) as pool:
        mapped = pool.starmap(test_func, jobs)
    avg_codisp = np.vstack(mapped)
    end_test = time()

    print("Finished...\nPlot: {}\nForest: {}\nTest: {}".format(
        end_plot - start_plot,
        end_forest - start_forest,
        end_test - start_test))

    fig, ax1 = plt.subplots(figsize=(10, 5))
    color = 'tab:blue'
    plt.title("AVG CoDisp")
    ax1.tick_params(axis='y', labelcolor=color, labelsize=12)
    ax1.set_ylim(0, 100)
    plt.xlabel('sec/100')
    plt.ylabel('value')
    plt.plot(avg_codisp)
    plt.savefig('{}/result.png'.format(dataset_test))