示例#1
0
def test_to_dict():
    """Serialise two trees to dicts and dump the batch-built one to ``tree.json``."""
    # Tree grown point by point; three of the four points are identical.
    tree = rrcf.RCTree()
    for label, coords in enumerate(([0., 0.], [0., 0.], [0., 0.], [0., 1.])):
        tree.insert_point(coords, index=label)
    obj = tree.to_dict()

    # Tree built from a batch in which row 5 duplicates row 2.
    X = np.random.randn(10, 3)
    X[5] = X[2]
    tree = rrcf.RCTree(X)
    obj = tree.to_dict()

    # Only the second dict is written out.
    with open('tree.json', 'w') as outfile:
        json.dump(obj, outfile)
示例#2
0
    def construct_force(self) -> List:
        """ Forest creator

        Builds trees from random samples of ``self.data_array``. Every draw
        yields ``number_of_rows // tree_size`` index sets of ``tree_size``
        rows each (sampled without replacement), one tree is built per index
        set, and draws repeat until the forest holds at least
        ``self.number_trees`` trees.

        :return:
                - list forest: A list of the trees that are created randomly
        :raises ValueError: when ``number_of_rows // tree_size`` is zero
        """
        trees_per_draw = self.number_of_rows // self.tree_size
        if trees_per_draw == 0:
            raise ValueError("Please check the tree_size. It seems that the number of the "
                             "samples (rows) is less than the size of the tree")

        draw_shape = (trees_per_draw, self.tree_size)
        forest = []
        while len(forest) < self.number_trees:
            # One row of ``index_sets`` per tree in this draw.
            index_sets = np.random.choice(self.number_of_rows, size=draw_shape,
                                          replace=False)
            for row_ids in index_sets:
                forest.append(rrcf.RCTree(self.data_array[row_ids],
                                          index_labels=row_ids))
        return forest
示例#3
0
 def generate_forest(self):
     """Store a list of ``self.num_trees`` empty ``rrcf.RCTree`` objects in ``self.forest``."""
     self.forest = [rrcf.RCTree() for _ in range(self.num_trees)]
示例#4
0
def test_print():
    """Smoke test: printing a leaf and the root of a small tree must not fail."""
    tree = rrcf.RCTree()
    for label, coords in ((0, [0., 0.]), (1, [0., 0.]), (3, [0., 1.])):
        tree.insert_point(coords, index=label)
    first_leaf = list(tree.leaves.values())[0]
    print(first_leaf)
    print(tree.root)
    def fit_batch(self, points):
        '''
            Creates a rrcf with num_trees trees from random samples of size tree_size from a batch set of points

            Parameters:
                points:   the points from which to create the rrcf. np.ndarray of size (n x d)

            Raises:
                ValueError: if ``points`` has no two-dimensional ``shape``

            Side effects:
                sets ``self.num_points``, ``self.dimension`` and ``self.forest``, and
                records the sampled row indices of tree ``i`` in ``self.ixs[i]``
        '''

        # Validate explicitly: an ``assert`` disappears under ``python -O`` and
        # ``is 2`` tests object identity rather than equality.
        shape = getattr(points, "shape", None)
        if shape is None or len(shape) != 2:
            raise ValueError("Input points must have shape (n x d)")
        self.num_points = shape[0]
        self.dimension = shape[1]

        # scale mean and variance of points
        scaled_points = preprocessing.scale(points)

        # Sampling without replacement guarantees a tree never receives the
        # same row twice.
        forest = []
        for tree_index in range(self.num_trees):
            # Select a random subset of points uniformly from the point set
            ixs = np.random.choice(self.num_points, self.tree_size, replace=False)
            # Build the tree from that sample and remember which rows it owns
            forest.append(rrcf_base.RCTree(scaled_points[ixs], index_labels=ixs))
            self.ixs[tree_index] = ixs
        self.forest = forest
示例#6
0
def get_avgcodisp(instances):
    """Stream ``instances`` through a forest and return the mean CoDisp per shingle.

    Uses the module-level ``num_trees``, ``shingle_size`` and ``tree_size``.
    The result maps each shingle's position in the stream to its CoDisp
    averaged over all trees.
    """
    forest = [rrcf.RCTree() for _ in range(num_trees)]
    avg_codisp = {}

    # rrcf.shingle yields the rolling windows one at a time.
    for index, point in enumerate(rrcf.shingle(instances, size=shingle_size)):
        for tree in forest:
            # Once the tree exceeds its size budget, forget the point that
            # was inserted tree_size steps ago (FIFO).
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)
            tree.insert_point(point, index=index)
            # Accumulate this tree's share of the average.
            avg_codisp.setdefault(index, 0)
            avg_codisp[index] += tree.codisp(index) / num_trees
    return avg_codisp
示例#7
0
    def robust_random_cut(self, sketch_vector):
        """Flag anomalous points with a batch-built robust random cut forest.

        At least 50 trees of 256 points each are built from random index
        samples drawn without replacement. Each point's CoDisp is averaged
        over the trees that contain it, and points whose average exceeds the
        70% quantile are flagged.

        Args:
            sketch_vector: collection of points supporting ``len()`` and
                indexing with an integer index array (presumably a 2-D numpy
                array - confirm against callers).

        Returns:
            pandas.Series of bool indexed ``0..n-1``; True marks a point whose
            average CoDisp is above the 70% quantile. Points that no tree
            sampled have a NaN average and therefore compare as False.

        Raises:
            ValueError: if there are fewer points than the tree size (256).
        """
        num_trees = 50
        tree_size = 256
        n = len(sketch_vector)

        trees_per_draw = n // tree_size
        if trees_per_draw == 0:
            # An empty draw never grows the forest, so the loop below would
            # otherwise spin forever.
            raise ValueError(
                "Need at least {} points to build a tree, got {}".format(
                    tree_size, n))
        sample_size_range = (trees_per_draw, tree_size)

        forest = []
        while len(forest) < num_trees:
            # Select random subsets of points uniformly from point set
            ixs = np.random.choice(n, size=sample_size_range, replace=False)
            # Add sampled trees to forest
            forest.extend(
                rrcf.RCTree(sketch_vector[ix], index_labels=ix) for ix in ixs)

        # Compute average CoDisp: sum per point, then divide by the number of
        # trees that actually contained the point.
        avg_codisp = pd.Series(0.0, index=np.arange(n))
        index = np.zeros(n)
        for tree in forest:
            codisp = pd.Series(
                {leaf: tree.codisp(leaf)
                 for leaf in tree.leaves})
            avg_codisp[codisp.index] += codisp
            np.add.at(index, codisp.index.values, 1)
        avg_codisp /= index

        predicted = avg_codisp > avg_codisp.quantile(0.70)
        return predicted
示例#8
0
    def create_forest(num_trees):
        """Return a list of ``num_trees`` newly constructed, empty ``rrcf.RCTree`` objects."""
        return [rrcf.RCTree() for _ in range(num_trees)]
示例#9
0
def test_from_dict():
    """Rebuild a tree from ``tree.json`` and check that all 10 leaves survive."""
    expected_leaves = 10
    with open('tree.json', 'r') as infile:
        obj = json.load(infile)

    # Exercise the in-place loader on an empty tree first ...
    in_place = rrcf.RCTree()
    in_place.load_dict(obj)

    # ... then the class-level constructor, whose result is the one checked.
    tree = rrcf.RCTree.from_dict(obj)
    # Duplicate points must not have been collapsed or dropped.
    assert len(tree.leaves) == expected_leaves
示例#10
0
def test_insert_depth():
    """After forgetting a point, no remaining leaf may report a negative ``d``."""
    tree = rrcf.RCTree()
    # Three identical points followed by one distinct point.
    for label in range(3):
        tree.insert_point([0., 0.], index=label)
    tree.insert_point([0., 1.], index=3)
    tree.forget_point(index=3)
    depths = [leaf.d for leaf in tree.leaves.values()]
    assert min(depths) >= 0
示例#11
0
    def robust_random_cut(self, sketch_vector):
        """Score ``sketch_vector`` with a streaming robust random cut forest.

        Rows are sorted by ``graphid`` (descending) and their ``sketch``
        values are standardised, then fed one at a time through 50 trees.
        Once a tree holds more than 32 leaves, the point inserted 32 steps
        earlier is forgotten before the next insert. Scores above the 95%
        quantile are flagged and a classification report against the
        ``anomaly`` column is printed.

        Returns:
            (pred_rrcf, disp): boolean Series of flags and the Series of
            average CoDisp scores, both in processing order.
        """
        # Set tree parameters
        num_trees = 50
        shingle_size = 1  #args.win_size
        tree_size = 32

        sketch_vector = sketch_vector.sort_values(by='graphid',
                                                  ascending=False)
        scaled = preprocessing.scale(sketch_vector['sketch'].tolist())

        forest = [rrcf.RCTree() for _ in range(num_trees)]

        # Anomaly score of each point, keyed by its position in the stream.
        avg_codisp = {}
        for index, point in enumerate(rrcf.shingle(scaled, size=shingle_size)):
            # Progress output every 50 points.
            if index % 50 == 0:
                print("Index: ", index)
            for tree in forest:
                # Drop the oldest point (FIFO) when the tree is over budget.
                if len(tree.leaves) > tree_size:
                    tree.forget_point(index - tree_size)
                tree.insert_point(point, index=index)
                new_codisp = tree.codisp(index)
                # Accumulate this tree's share of the average.
                avg_codisp.setdefault(index, 0)
                avg_codisp[index] += new_codisp / num_trees

        disp = pd.Series(list(avg_codisp.values()))
        pred_rrcf = disp > disp.quantile(0.95)
        report = metrics.classification_report(
            np.array(sketch_vector['anomaly']), pred_rrcf)
        print(report)
        return pred_rrcf, disp
 def __init__(self, number_of_trees=40, train_size=500,
              queue_size=500,
              last_scores_size=8000,
              small_window_size=10,):
     """Create the forest of empty trees and the empty bookkeeping buffers.

     Args:
         number_of_trees: number of empty ``rrcf.RCTree`` objects to create.
         train_size: stored as ``self.train_size``.
         queue_size: stored as ``self.queue_size``.
         last_scores_size: stored as ``self.last_scores_size``.
         small_window_size: stored as ``self.small_window_size``.
     """
     super().__init__()
     self.forest = [rrcf.RCTree() for _ in range(number_of_trees)]

     # Configuration, kept exactly as passed in.
     self.train_size = train_size
     self.queue_size = queue_size
     self.last_scores_size = last_scores_size
     self.small_window_size = small_window_size

     # Mutable state; every buffer starts empty. ``anomaly_scores_queue``
     # used to be assigned twice, it is now initialised once.
     self.current_index = 0
     self.queue = []
     self.anomaly_scores_queue = []
     self.points_to_add_in_future = []
示例#13
0
def find_anomalies(input):
    """Score a sequence of readings with a streaming robust random cut forest.

    Each reading's ``'value'`` is inserted into 40 trees in order. Once a
    tree holds more than 256 leaves, the point inserted 256 steps earlier is
    forgotten first. A reading's score is its CoDisp averaged over all trees.

    Args:
        input: sequence of mappings, each with ``'value'`` and
            ``'timestamp'`` keys. (The name shadows the builtin but is kept
            for keyword-argument compatibility.)

    Returns:
        list of dicts, one per reading and in input order, with keys
        ``'value'``, ``'timestamp'``, ``'isAnomaly'`` (True when the average
        CoDisp is above 40) and ``'codisp'``.
    """
    # Set tree parameters
    num_trees = 40
    tree_size = 256

    forest = [rrcf.RCTree() for _ in range(num_trees)]

    avg_codisp = {}
    for index, record in enumerate(input):
        point = record['value']
        for tree in forest:
            # If tree is above permitted size, drop the oldest point (FIFO)
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)
            tree.insert_point(point, index=index)
            # Accumulate this tree's share of the average CoDisp.
            avg_codisp.setdefault(index, 0)
            avg_codisp[index] += tree.codisp(index) / num_trees

    output = []
    for index, record in enumerate(input):
        codisp = avg_codisp[index]
        output.append({
            'value': record['value'],
            'timestamp': record['timestamp'],
            'isAnomaly': codisp > 40,
            'codisp': codisp,
        })
    return output
示例#14
0
def init(df,param):
    """Build a random cut forest from selected columns of ``df``.

    Args:
        df: pandas DataFrame holding the data, one sample per row.
        param: mapping that may contain
            ``'target_variables'`` / ``'feature_variables'`` - lists of
            column names to use (targets first, then features), and
            ``'options' -> 'params'`` with optional ``'num_trees'``
            (default 15) and ``'tree_size'`` (default 30).

    Returns:
        list of trees, at least ``num_trees`` long; each tree is built from
        ``tree_size`` rows sampled without replacement.

    Raises:
        ValueError: if ``tree_size`` is not positive, or if trees are
            requested but ``df`` has fewer than ``tree_size`` rows.
    """
    # Set model parameters
    num_trees = 15
    tree_size = 30
    if 'options' in param and 'params' in param['options']:
        overrides = param['options']['params']
        if 'num_trees' in overrides:
            num_trees = int(overrides['num_trees'])
        if 'tree_size' in overrides:
            tree_size = int(overrides['tree_size'])

    # Copy the column lists so the caller's ``param`` is never mutated.
    variables = []
    if 'target_variables' in param:
        variables = list(param['target_variables'])
    if 'feature_variables' in param:
        variables.extend(param['feature_variables'])

    # Convert data to nparray
    data = df[variables].to_numpy().astype(float)

    n_rows = len(df)
    if tree_size <= 0:
        raise ValueError("tree_size must be positive, got {}".format(tree_size))
    # The sample shape has to be derived AFTER the overrides are applied,
    # otherwise a user-supplied tree_size is silently ignored.
    sample_size_range = (n_rows // tree_size, tree_size)
    if num_trees > 0 and sample_size_range[0] == 0:
        # An empty draw never grows the forest; the loop would never end.
        raise ValueError(
            "Need at least tree_size={} rows, got {}".format(tree_size, n_rows))

    # Create the random cut forest
    forest = []
    while len(forest) < num_trees:
        # Select random subsets of points uniformly
        ixs = np.random.choice(n_rows, size=sample_size_range,
                               replace=False)
        # Add sampled trees to forest
        forest.extend(rcf.RCTree(data[ix], index_labels=ix) for ix in ixs)
    return forest
示例#15
0
def rrcf_calc(dfs):
    """Compute batch RRCF anomaly scores for ``phase_dif`` and plot them.

    Only the first frame in ``dfs`` is used, restricted to the rows from its
    earliest index value up to 90 days later. At least 6000 trees of 1000
    points each are built from random samples drawn without replacement;
    each point's CoDisp is averaged over the trees that contain it and the
    result is handed to ``plot_anomaly_score``.

    Args:
        dfs: sequence whose first element is a DataFrame with a
            ``phase_dif`` column (presumably on a DatetimeIndex - confirm).

    Raises:
        ValueError: if the 90-day window holds fewer than 1000 rows.
    """
    print(dfs)
    df_merged = dfs[0][['phase_dif']]
    min_date = df_merged.index.min()
    df_merged = df_merged[:min_date + pd.Timedelta(days=90)]
    print(df_merged)
    num_points = df_merged.shape[0]
    print("num_points: " + str(num_points))
    num_trees = 6000
    tree_size = 1000

    trees_per_draw = num_points // tree_size
    if trees_per_draw == 0:
        # An empty draw never grows the forest; the loop would never end.
        raise ValueError(
            "Need at least {} points to build a tree, got {}".format(
                tree_size, num_points))
    sample_size_range = (trees_per_draw, tree_size)

    forest = []
    while len(forest) < num_trees:
        print(len(forest))
        indices = np.random.choice(num_points,
                                   size=sample_size_range,
                                   replace=False)
        forest.extend(
            rrcf.RCTree(df_merged.iloc[ix], index_labels=ix) for ix in indices)

    # Sum CoDisp per point, then divide by how many trees held the point.
    avg_codisp = pd.Series(0.0, index=np.arange(num_points))
    n_owning_trees = np.zeros(num_points)
    for tree in forest:
        codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
        avg_codisp[codisp.index] += codisp
        np.add.at(n_owning_trees, codisp.index.values, 1)
    avg_codisp /= n_owning_trees
    print(avg_codisp)
    plot_anomaly_score(dfs[0], avg_codisp)
示例#16
0
    def robust_random_cut_batch(self, sketch):
        """Score ``sketch`` with a batch-built robust random cut forest.

        At least 50 trees of 256 points each are built from random samples
        of the ``sketch`` column, drawn without replacement. Each point's
        CoDisp is averaged over the trees that contain it, points above the
        95% quantile are flagged, and a classification report against the
        ``anomaly`` column is printed.

        Args:
            sketch: DataFrame with a ``sketch`` column (one vector per row)
                and an ``anomaly`` column of ground-truth labels.

        Returns:
            (pred_rrcf, avg_codisp): boolean Series of flags and the Series
            of average CoDisp scores, both indexed ``0..n-1``.

        Raises:
            ValueError: if there are fewer points than the tree size (256).
        """
        sketch_vector = np.array(sketch['sketch'].tolist())
        num_trees = 50
        tree_size = 256
        n = len(sketch_vector)

        trees_per_draw = n // tree_size
        if trees_per_draw == 0:
            # An empty draw never grows the forest; the loop would never end.
            raise ValueError(
                "Need at least {} points to build a tree, got {}".format(
                    tree_size, n))
        sample_size_range = (trees_per_draw, tree_size)

        forest = []
        while len(forest) < num_trees:
            # Select random subsets of points uniformly from point set
            ixs = np.random.choice(n, size=sample_size_range, replace=False)
            forest.extend(
                rrcf.RCTree(sketch_vector[ix], index_labels=ix) for ix in ixs)

        # Compute average CoDisp over the trees that contain each point.
        avg_codisp = pd.Series(0.0, index=np.arange(n))
        index = np.zeros(n)
        for tree in forest:
            codisp = pd.Series(
                {leaf: tree.codisp(leaf)
                 for leaf in tree.leaves})
            avg_codisp[codisp.index] += codisp
            np.add.at(index, codisp.index.values, 1)
        avg_codisp /= index

        pred_rrcf = avg_codisp > avg_codisp.quantile(0.95)

        # The labels live on the input frame; ``sketch_vector`` is a plain
        # numpy array and cannot be indexed by column name.
        print(
            metrics.classification_report(np.array(sketch['anomaly']),
                                          pred_rrcf))
        return pred_rrcf, avg_codisp
示例#17
0
import numpy as np
import rrcf

# Shared fixtures for the tests below. The statement order matters: every
# call draws from the global NumPy RNG seeded here.
np.random.seed(0)
n = 100  # number of points
d = 3  # dimensions per point
X = np.random.randn(n, d)
# Tree built in one go from the whole batch.
tree = rrcf.RCTree(X)

# Shuffle the point indices and keep the first five as the ones to forget.
deck = np.arange(n, dtype=int)
np.random.shuffle(deck)
indexes = deck[:5]


def test_batch():
    """Verify the leaf counts and bounding boxes stored on the batch-built tree."""
    nodes = []
    tree.map_branches(tree.root, op=tree._get_nodes, stack=nodes)
    # The root has to account for every point of the batch.
    assert tree._count_leaves(tree.root) == n
    for node in nodes:
        # Cached values must agree with a fresh recomputation.
        assert tree._count_leaves(node) == node.n
        assert (tree.get_bbox(node) == node.b).all()


def test_forget_batch():
    """Forget each of the pre-selected ``indexes`` from the shared tree."""
    for point_id in indexes:
        tree.forget_point(point_id)
    def stream_anomaly_scores(self, points, window_size, new_forest = False):
        '''
        Computes anomaly scores for all points in a stream by computing the average
        collusive displacement. The assumption is that each point in the stream is only observed
        sequentially. Higher scores indicate a higher displacement and thus a
        higher likelihood of anomaly. A new forest, starting with the first point in the
        stream, is created when no forest exists yet or when new_forest is True; the
        window size of an existing forest is not checked here.

        Parameters:
            points:         the stream of point on which to calculate anomaly scores
            window_size:    the window size in which to ingest points. points are mapped as a
                            n-dimensional window, where n = window_size
            new_forest:     boolean that identifies whether to create a new forest or not

        Returns:
            anomaly_scores: pandas Series with index of points and average collusive
                            displacement (anomaly score) for each point. The Series has one
                            entry per input point; entries for which no window was produced
                            keep their initial value of 0.

        Raises:
            ValueError:     if a window cannot be inserted into a tree
        '''

        # create a new empty forest if none exists or a fresh one was requested
        if self.forest is None or new_forest:
            self.num_points = 0
            self.forest = [rrcf_base.RCTree() for _ in range(self.num_trees)]
            for i in range(self.num_trees):
                self.ixs[i] = []

        # create rolling window of size window_size
        points_gen = rrcf_base.shingle(points, size=window_size)

        # calculate streaming anomaly scores; indices continue from the
        # points already seen by the forest
        initial_index = self.num_points
        avg_codisp = pd.Series(0.0, index=np.arange(initial_index, initial_index + points.shape[0]))
        for index, point in enumerate(points_gen, start=initial_index):
            for tree_idx, tree in enumerate(self.forest):
                # If tree is above permitted size, drop the oldest point (FIFO)
                # TODO: forget oldest point or another random point with prob
                if len(tree.leaves) >= self.tree_size:
                    forget_index = min(self.ixs[tree_idx])
                    tree.forget_point(forget_index)
                    self.ixs[tree_idx] = np.delete(self.ixs[tree_idx], np.argwhere(self.ixs[tree_idx] == forget_index))
                # Insert the new point into the tree
                try:
                    tree.insert_point(point, index=index)
                    self.ixs[tree_idx] = np.append(self.ixs[tree_idx], index)
                except Exception as err:
                    # The error used to be constructed but never raised, so a
                    # failed insert surfaced later as an unrelated lookup error.
                    raise ValueError('failure for point {} at index {}'.format(point, index)) from err
                # Compute codisp on the new point and take the average among all trees
                avg_codisp[index] += tree.codisp(index)

            self.num_points += 1
        return avg_codisp / self.num_trees
    def __init__(self,
                 stream,
                 window_size,
                 max_size,
                 view,
                 alpha=1,
                 beta=1,
                 gamma=1,
                 data_stream=False,
                 freq=False,
                 num_trees=50,
                 max_depth=256,
                 seed=None):
        '''
        :stream: a Stream object to mine
        :window_size: the size of a window to use—determines the max size of snippets
        :max_size: forwarded to the parent constructor
        :view: forwarded to the parent constructor
        :alpha: forwarded to the parent constructor
        :beta: forwarded to the parent constructor
        :gamma: forwarded to the parent constructor
        :data_stream: when true, set up the measurement-period bins (DARPA / Chicago only)
        :freq: stored as ``self.freq``
        :num_trees: number of empty rrcf trees to create
        :max_depth: stored as ``self.tree_size``
        :seed: seed passed to ``np.random.seed`` (global NumPy RNG)
        '''
        print('Running anomaly version.')
        super().__init__(stream,
                         window_size,
                         max_size,
                         view=view,
                         alpha=alpha,
                         beta=beta,
                         gamma=gamma,
                         save_output=False)

        # tree parameters for rrcf
        print('Random seed {}.'.format(seed))
        self.seed = seed
        np.random.seed(seed)
        self.index = 0
        self.num_trees = num_trees
        self.tree_size = max_depth
        self.data_stream = data_stream
        self.freq = freq
        # create a forest of empty trees for rrcf
        self.forest = [rrcf.RCTree() for _ in range(self.num_trees)]

        self.anomaly_scores = list()
        self.score_times = list()
        self.score_snippets = list()

        if self.data_stream:  # NOTE: hardcoded for DARPA IP and Chicago (the two datasets used in the paper).
            print('Running data stream version of P')
            self.occ_intervals = defaultdict(set)
            # paper uses 60 measurement periods
            if stream.name.startswith('darpa'):
                bin_width = 87725 / 60
            elif stream.name.startswith('chicago'):
                gt_name = '../data/{}.txt'.format(self.stream.name)
                # Close the file deterministically instead of leaking the handle.
                with open(gt_name, 'r') as gt_file:
                    lines = gt_file.readlines()
                ts = int(lines[0].strip().split(',')[-1])
                te = int(lines[-1].strip().split(',')[-1])
                width = te - ts
                print('Width = te - ts = {} - {} = {}'.format(te, ts, width))
                # NOTE(review): the width read from the file is only printed;
                # the bin width itself is a hard-coded constant.
                bin_width = 7773420 / 60
            else:
                print(
                    'Baseline not implemented for datasets other than DARPA IP and Chicago Bike because the baseline falsely assumes that the stream length is known a priori.'
                )
                sys.exit(1)
            # 59 interior bin edges: bin_width, 2 * bin_width, ..., 59 * bin_width
            self.bins = [bin_width * i for i in range(1, 60)]
示例#20
0
文件: test_rrcf.py 项目: valeman/rrcf
def test_from_dict():
    """Load ``tree.json`` and rebuild a tree through both loading APIs."""
    with open('tree.json', 'r') as infile:
        payload = json.load(infile)
    # In-place load into an empty tree.
    tree = rrcf.RCTree()
    tree.load_dict(payload)
    # Construction straight from the dict.
    tree = rrcf.RCTree.from_dict(payload)
                 marker="v",
                 markersize=10,
                 label='iForest')
        auc_all[j, 1] = metrics.roc_auc_score(y, alg_scores)

        print('\n******RRCF*******\n')
        num_trees = 2500
        tree_size = 256
        forest = []
        while len(forest) < num_trees:
            # Select random subsets of points uniformly from point set
            ixs = np.random.choice(n,
                                   size=(n // tree_size, tree_size),
                                   replace=False)
            # Add sampled trees to forest
            trees = [rrcf.RCTree(X[ix], index_labels=ix) for ix in ixs]
            forest.extend(trees)

        # Compute average CoDisp
        avg_codisp = pd.Series(0.0, index=np.arange(n))
        index = np.zeros(n)
        for tree in forest:
            codisp = pd.Series(
                {leaf: tree.codisp(leaf)
                 for leaf in tree.leaves})
            avg_codisp[codisp.index] += codisp
            np.add.at(index, codisp.index.values, 1)
        avg_codisp /= index
        alg_scores = avg_codisp
        fpr_alg, tpr_alg, thresholds_alg = metrics.roc_curve(y,
                                                             alg_scores,
示例#22
0
def anomaly_detect2(df, flagdf, num_trees, tree_size):
    """Flag and blank out per-column outliers using batch-built RRCF forests.

    For every column of ``df`` that is not entirely NaN, gaps are linearly
    interpolated and the series is paired with a constant zero column. At
    least ``num_trees`` trees of ``tree_size`` points are built from random
    samples drawn without replacement, and the CoDisp of each point is
    averaged over the trees that contain it. Points whose average lies above
    the column's 98% quantile get 2 added to the matching cell of ``flagdf``
    and are set to NaN in ``df``.

    Args:
        df: DataFrame of numeric series; modified in place.
        flagdf: DataFrame of flags with the same columns as ``df``
            (presumably also the same row order - confirm); modified in place.
        num_trees: minimum number of trees per column.
        tree_size: number of points per tree.

    Returns:
        tuple ``(df, flagdf)`` - the same objects that were passed in.

    Raises:
        ValueError: if trees are requested for a column but ``df`` has fewer
            than ``tree_size`` rows.
    """
    n = df.shape[0]
    codisps = {}

    for i in range(df.shape[1]):
        print('var' + str(i))
        varname = df.columns[i]

        xseries = df.iloc[:, i].interpolate(method='linear')

        if all(np.isnan(xseries)):
            continue
        # Linear interpolation cannot fill a leading gap; copy the second
        # value. Positional access keeps this correct for any index type.
        if np.isnan(xseries.iloc[0]):
            xseries.iloc[0] = xseries.iloc[1]

        # Pair the values with a constant zero column.
        xseries = pd.concat([
            xseries,
            pd.Series(np.repeat(0, df.shape[0]), index=xseries.index)
        ],
                            axis=1).to_numpy()

        sample_size_range = (n // tree_size, tree_size)
        if num_trees > 0 and sample_size_range[0] == 0:
            # An empty draw never grows the forest; the loop would never end.
            raise ValueError(
                "Need at least tree_size={} rows, got {}".format(tree_size, n))

        # Construct forest
        forest = []
        while len(forest) < num_trees:
            # Select random subsets of points uniformly
            ixs = np.random.choice(n, size=sample_size_range, replace=False)
            # Add sampled trees to forest
            forest.extend(
                rrcf.RCTree(xseries[ix], index_labels=ix) for ix in ixs)

        # Compute average CoDisp over the trees that contain each point.
        avg_codisp = pd.Series(0.0, index=np.arange(n))
        index = np.zeros(n)
        for tree in forest:
            codisp = pd.Series(
                {leaf: tree.codisp(leaf)
                 for leaf in tree.leaves})
            avg_codisp[codisp.index] += codisp
            np.add.at(index, codisp.index.values, 1)
        avg_codisp /= index
        codisps[varname] = avg_codisp

    for c, avg_codisp in codisps.items():
        # get top 2% of anomaly scores; flag those points with +2
        thresh = avg_codisp.quantile(0.98)
        outl_inds_bool = avg_codisp > thresh
        # The score index is 0..n-1, so these labels are row positions.
        outl_inds_int = outl_inds_bool[outl_inds_bool].index
        flag_rows = flagdf.index[outl_inds_int]
        flagdf.loc[flag_rows, c] = flagdf.loc[flag_rows, c] + 2

        # Blank the outliers in the column being processed. The previous
        # code reused the stale loop variable and always hit the last column.
        df.loc[df.index[outl_inds_int], c] = np.nan

    return (df, flagdf)
示例#23
0
import numpy as np
import rrcf

# Shared fixtures for the tests below. The statement order matters: every
# random draw comes from the global NumPy RNG seeded here.
np.random.seed(0)
n = 100  # number of points
d = 3  # dimensions per point
X = np.random.randn(n, d)
# Z is X with its last ten rows replaced by the same point (all ones),
# i.e. a data set containing duplicates.
Z = np.copy(X)
Z[90:, :] = 1

# Trees built in one go from each batch.
tree = rrcf.RCTree(X)
duplicate_tree = rrcf.RCTree(Z)

# Empty trees given an explicit random_state: once as an int, once as a
# RandomState instance.
tree_seeded = rrcf.RCTree(random_state=0)
duplicate_tree_seeded = rrcf.RCTree(random_state=np.random.RandomState(0))

# Shuffle the point indices and keep the first five.
deck = np.arange(n, dtype=int)
np.random.shuffle(deck)
indexes = deck[:5]

def test_batch():
    """Check leaf counts and bounding boxes stored on the batch-built tree."""
    collected = []
    tree.map_branches(tree.root, op=tree._get_nodes, stack=collected)
    # Every point of the batch must be reachable from the root.
    total_leaves = tree._count_leaves(tree.root)
    assert total_leaves == n
    for node in collected:
        # Stored count and box must equal freshly computed ones.
        assert tree._count_leaves(node) == node.n
        assert (tree.get_bbox(node) == node.b).all()
示例#24
0
# Set forest parameters
num_trees = 1000
tree_size = 256
n = df.shape[0]  # (11183, 6)
# Each draw yields n // tree_size index sets of tree_size points each.
sample_size_range = (n // tree_size, tree_size)

# Construct forest
# NOTE(review): if n < tree_size every draw is empty and this loop never ends.
forest = []
while len(forest) < num_trees:
    # Select random subsets of points uniformly
    ixs = np.random.choice(n, size=sample_size_range, replace=False)
    trees = list()
    for ix in ixs:
        # Rows of ndf selected for this tree, labelled by their row index.
        T = ndf[ix]
        trees.append(rrcf.RCTree(T, index_labels=ix))
    # Add sampled trees to forest
    #trees = [rrcf.RCTree(ndf[ix], index_labels=ix)
    #        for ix in ixs]
    forest.extend(trees)

# Compute average CoDisp: sum the CoDisp per point over all trees, count how
# many trees contained each point, then divide.
avg_codisp = pd.Series(0.0, index=np.arange(n))
index = np.zeros(n)
for tree in forest:
    codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
    avg_codisp[codisp.index] += codisp
    np.add.at(index, codisp.index.values, 1)
avg_codisp /= index

fig, ax1 = plt.subplots(figsize=(10, 5))
示例#25
0
tree_size = 256  #256

for i in range(len(df.columns)):
    varname = df.columns[i]

    xseries = df.iloc[:, i].interpolate(method='linear').to_numpy()

    if all(np.isnan(xseries)):
        continue
    if np.isnan(xseries[0]):
        xseries[0] = xseries[1]

    #create a forest of empty trees
    forest = []
    for _ in range(num_trees):
        tree = rrcf.RCTree()
        forest.append(tree)

    #create rolling window
    points = rrcf.shingle(xseries, size=shingle_size)

    avg_codisp = {}
    for index, point in enumerate(points):
        if index % 2000 == 0:
            # if index > 16000:
            print('point' + str(index))
        # if index == 17920:
        #     raise ValueError('a')
        for tree in forest:
            #drop the oldest point (FIFO) if tree is too big
            if len(tree.leaves) > tree_size:
示例#26
0
    def _init_modeling(self):
        """Warm up the anomaly forest and the regressor from the initial CSV.

        Reads ``initial_training_data.csv`` and streams the rows from
        position 80000 onwards through ``self.num_trees`` fresh trees. A row
        whose average CoDisp lies more than 3 standard deviations from the
        mean of all scores so far is replaced by the mean of the 5 preceding
        rows. ``self.mfr`` is then fitted incrementally on three lagged
        ``target`` values, and the most recent targets are stored for later
        prediction.

        Reads ``self.num_trees``, ``self.tree_size``, ``self.avg_codisp``,
        ``self.mfr`` and ``self.previous_target_3``; sets ``self.forest``,
        ``self.idx`` and ``self.previous_train_batch``.
        """
        network = pd.read_csv('initial_training_data.csv',
                              index_col='date',
                              parse_dates=['date'])

        self.forest = [rrcf.RCTree() for _ in range(self.num_trees)]

        train_len = len(network)
        train_start = 80000
        self.idx = 0

        print("start!")

        for index in range(train_start, train_len):
            # get one by one; ``.item()`` requires exactly one element
            point = float(network[index:index + 1].values.item())

            for tree in self.forest:
                if len(tree.leaves) > self.tree_size:
                    tree.forget_point(self.idx - self.tree_size)

                tree.insert_point(point, index=self.idx)

                # Scores are keyed by self.idx (the tree label), not by the
                # row position. Testing ``index`` here reset the running sum
                # for every tree, leaving only the last tree's share.
                if self.idx not in self.avg_codisp:
                    self.avg_codisp[self.idx] = 0
                self.avg_codisp[self.idx] += tree.codisp(
                    self.idx) / self.num_trees

            # avg_codisp is the mean of how anomalous each tree considers
            # this point
            scores = np.array(list(self.avg_codisp.values()))
            mean = scores.mean()
            std = scores.std()

            z = (self.avg_codisp[self.idx] - mean) / std
            self.idx += 1

            if z > 3.0 or z < -3.0:
                # if abs(z-score) is over 3.0
                # replace the value with the mean of the 5 preceding rows
                network.iloc[index] = network[index - 5:index].mean()

        print("init_modeling에서 anomaly detection 완료")

        print("init_modeling에서 trainign 시작")
        for i in range(7 + train_start, train_len):
            # Features are the targets 7, 6 and 5 rows before row i.
            X_train = np.array([[
                float(network[i - 7:i - 6]['target'].values.item()),
                float(network[i - 6:i - 5]['target'].values.item()),
                float(network[i - 5:i - 4]['target'].values.item()),
            ]])
            y_train = (network[i:i + 1]['target'].values)
            self.mfr.partial_fit(X_train, y_train)
        print("train 완료")

        # NOTE(review): training uses prev1 = oldest lag (i-7) while here
        # prev3 is the oldest row - confirm the intended ordering.
        self.previous_target_3['prev3'] = float(
            network[train_len - 8:train_len - 7]['target'].values.item())
        self.previous_target_3['prev2'] = float(
            network[train_len - 7:train_len - 6]['target'].values.item())
        self.previous_target_3['prev1'] = float(
            network[train_len - 6:train_len - 5]['target'].values.item())
        self.previous_train_batch = network[train_len -
                                            5:train_len]['target'].values

        print('endebded')
def anomaly_detect(df, flagdf, num_trees, shingle_size, tree_size):
    """Flag and blank out per-column outliers using streaming RRCF forests.

    For every column of ``df`` that is not entirely NaN, gaps are linearly
    interpolated and rolling windows of ``shingle_size`` values are streamed
    through ``num_trees`` trees. Once a tree holds more than ``tree_size``
    leaves, the window inserted ``tree_size`` steps earlier is forgotten
    first. Windows whose average CoDisp lies above the column's 98% quantile
    get 2 added to ``flagdf`` and are set to NaN in ``df``, at the row whose
    position equals the window's position in the stream.

    Args:
        df: DataFrame of numeric series; modified in place.
        flagdf: DataFrame of flags with the same columns as ``df``
            (presumably also the same row order - confirm); modified in place.
        num_trees: number of trees per column.
        shingle_size: length of the rolling window.
        tree_size: leaf budget per tree.

    Returns:
        tuple ``(df, flagdf)`` - the same objects that were passed in.
    """
    codisps = {}

    for i in range(df.shape[1]):
        print('var' + str(i))
        varname = df.columns[i]

        # Copy so the first-value fix below never writes into df's buffer.
        xseries = df.iloc[:, i].interpolate(method='linear').to_numpy(copy=True)

        if all(np.isnan(xseries)):
            continue
        # Linear interpolation cannot fill a leading gap; copy the second value.
        if np.isnan(xseries[0]):
            xseries[0] = xseries[1]

        # create a forest of empty trees
        forest = [rrcf.RCTree() for _ in range(num_trees)]

        # create rolling window
        points = rrcf.shingle(xseries, size=shingle_size)

        avg_codisp = {}
        for index, point in enumerate(points):
            if index % 2000 == 0:
                print('point' + str(index))
            for tree in forest:
                # drop the oldest point (FIFO) if tree is too big
                if len(tree.leaves) > tree_size:
                    tree.forget_point(index - tree_size)

                tree.insert_point(point, index=index)

                # compute collusive displacement on the inserted point
                new_codisp = tree.codisp(index)

                # take the average codisp across all trees; that's anomaly score
                avg_codisp.setdefault(index, 0)
                avg_codisp[index] += new_codisp / num_trees

        codisps[varname] = avg_codisp

    for c, avg_codisp in codisps.items():
        scores = pd.Series(avg_codisp, dtype=float)
        if scores.empty:
            # No window was scored for this column; nothing to flag.
            continue

        # get top 2% of anomaly scores; flag those points with +2
        thresh = scores.quantile(0.98)
        outl_inds_int = scores.index[scores > thresh]
        flag_rows = flagdf.index[outl_inds_int]
        flagdf.loc[flag_rows, c] = flagdf.loc[flag_rows, c] + 2

        # Blank the outliers in the column being processed. The previous
        # code reused the stale loop variable and always hit the last column.
        df.loc[df.index[outl_inds_int], c] = np.nan

    return (df, flagdf)
示例#28
0
import numpy as np
import rrcf

# Shared module-level fixtures used by the tests below.
# Seed NumPy's global RNG so the random draws made here are repeatable.
# NOTE(review): presumably rrcf.RCTree construction also draws from np.random,
# in which case the order of the statements below affects every later value;
# confirm before reordering anything.
np.random.seed(0)
# Number of sample points.
n = 100
# Number of coordinates per point.
d = 3
# n-by-d matrix of standard-normal samples.
X = np.random.randn(n, d)
# Copy of X whose rows from index 90 onward are all set to 1, so the last
# rows of Z are identical (duplicated) points.
Z = np.copy(X)
Z[90:, :] = 1

# One tree built in batch from the random data, one from the data containing
# duplicate points.
tree = rrcf.RCTree(X)
duplicate_tree = rrcf.RCTree(Z)

# Shuffle the integers 0..n-1 in place and keep the first five as a random
# selection of point indices.
deck = np.arange(n, dtype=int)
np.random.shuffle(deck)
indexes = deck[:5]


def test_batch():
    """Check the batch-built tree's stored leaf counts and bounding boxes.

    Uses the module-level ``tree`` and ``n``. Every node collected through
    ``map_branches`` must report a stored leaf count (``.n``) and bounding
    box (``.b``) equal to values recomputed from the tree.
    """
    # Collect nodes into a flat list via the tree's own traversal helper.
    collected = []
    tree.map_branches(tree.root, op=tree._get_nodes, stack=collected)

    # The full tree must contain exactly one leaf per input point.
    assert (tree._count_leaves(tree.root) == n)

    for node in collected:
        # Stored leaf count must match a fresh recount below this node.
        assert (tree._count_leaves(node) == node.n)
        # Stored bounding box must match the recomputed one element-wise.
        assert (tree.get_bbox(node) == node.b).all()

示例#29
0
    print("{}:{} plotted".format(i, col))

# Set forest parameters
num_trees = 100
tree_size = 256
# Shape of one sampling round: (trees built per round, points per tree).
sample_size_range = (n_train // tree_size, tree_size)

# With fewer than tree_size training rows each sampling round would produce
# zero trees, and the construction loop below would never terminate.
if sample_size_range[0] == 0:
    raise ValueError(
        "n_train ({}) must be at least tree_size ({}) to sample a tree".format(
            n_train, tree_size))

# Construct forest
forest = []
# Each round adds sample_size_range[0] trees at once, so the forest may end
# up holding more than num_trees trees.
while len(forest) < num_trees:

    # Select random subsets of points uniformly (no index repeats in a round)
    ixs = np.random.choice(n_train, size=sample_size_range, replace=False)

    # Add sampled trees to forest; each tree keeps the original row indices
    # as its leaf labels.
    trees = [rrcf.RCTree(ndf_train[ix], index_labels=ix) for ix in ixs]
    forest.extend(trees)

print("Forest constructed")

# Disabled reference code: average CoDisp over the training points.
# # Compute average CoDisp
# avg_codisp = pd.Series(0.0, index=np.arange(n_train))
# index = np.zeros(n_train)
# for tree in forest:
#     codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
#     avg_codisp[codisp.index] += codisp
#     np.add.at(index, codisp.index.values, 1)
# avg_codisp /= index

# Create an array to store the anomaly score of each test point
# (one slot per row of ndf_test).
avg_codisp = np.zeros(ndf_test.shape[0])
示例#30
0
def main():
    dataset_test = "xplane_1473"
    df_test = pd.read_csv('datasets/{}.csv'.format(dataset_test),
                          delimiter='|')
    ndf_test = df_test.to_numpy()
    ndf_test[...] *= 0.8  # bozma kısmı

    print("Test type: {}".format(ndf_test.dtype))

    dataset_train = "xplane_7540"
    df_train = pd.read_csv('datasets/{}.csv'.format(dataset_train),
                           delimiter='|')
    n_train = 7540
    ndf_train = df_train.to_numpy()
    print("Train type: {}".format(ndf_train.dtype))

    ndf_train = ndf_train[:n_train]
    # ndf_train = ndf_train.astype(np.float16)

    if not os.path.isdir(dataset_test):
        os.mkdir(dataset_test)

    start_plot = time()
    for i, col in enumerate(df_test.columns):
        plt.title(col)
        plt.xlabel('time')
        plt.ylabel('value')
        plt.plot(ndf_test[:, i])
        plt.savefig('{}/{}.png'.format(dataset_test, col))
        plt.clf()
        print("{}:{} plotted".format(i, col))

    end_plot = time()
    # Set forest parameters
    num_trees = 100
    tree_size = 256
    sample_size_range = (n_train // tree_size, tree_size)

    # Construct forest
    start_forest = time()
    forest = []
    while len(forest) < num_trees:
        # Select random subsets of points uniformly
        ixs = np.random.choice(n_train, size=sample_size_range, replace=False)

        # Add sampled trees to forest
        trees = [rrcf.RCTree(ndf_train[ix], index_labels=ix) for ix in ixs]
        forest.extend(trees)

    print("Forest constructed")
    end_forest = time()
    # Create a dict to store anomaly score of each point
    # avg_codisp = np.zeros(ndf_test.shape[0])

    cores = cpu_count()
    # create the multiprocessing pool
    pool = Pool(cores)

    start_test = time()
    ndf_test = ndf_test[(ndf_test.shape[0] %
                         cores):]  # to split equally, remove elements
    ndf_splitted = np.split(ndf_test, cores, axis=0)[1:]
    mapped = pool.map(test_func, forest, n_train, num_trees, ndf_splitted)
    avg_codisp = np.vstack(mapped)
    end_test = time()

    print("Finished...\nPlot:   {}\nForest: {}\n{}Test:   {}")
    fig, ax1 = plt.subplots(figsize=(10, 5))
    color = 'tab:blue'
    plt.title("AVG CoDisp")
    ax1.tick_params(axis='y', labelcolor=color, labelsize=12)
    ax1.set_ylim(0, 100)
    plt.xlabel('sec/100')
    plt.ylabel('value')
    plt.plot(avg_codisp)
    plt.savefig('{}/result.png'.format(dataset_test))