Example #1
0
    def simple_distance_based(self, data_table, cols, d_function, dmin, fmin):
        # Flag a row as an outlier when more than fmin of all points lie
        # further than dmin away, measured with the given distance function.
        print('Calculating simple distance-based criterion.')

        # Drop incomplete rows and normalize the selected columns.
        normalized = util.normalize_dataset(
            data_table.dropna(axis=0, subset=cols), cols)

        # Build the pairwise distance table between all instances.
        self.distances = self.distance_table(normalized, cols, d_function)

        row_count = len(normalized.index)
        outlier_flags = []
        for row in tqdm(range(row_count)):
            # Fraction of points that lie beyond dmin from this row.
            distant = sum(1 for dist in self.distances.iloc[row, :].tolist()
                          if dist > dmin)
            # Outlier when that fraction exceeds the minimum frequency.
            outlier_flags.append(float(distant) / row_count > fmin)
        flag_frame = pd.DataFrame(outlier_flags,
                                  index=normalized.index,
                                  columns=['simple_dist_outlier'])
        data_table = pd.concat([data_table, flag_frame], axis=1)
        # Free the cached distance table once scoring is done.
        del self.distances
        return data_table
Example #2
0
    def apply_pca(data_table: pd.DataFrame, cols: List[str], number_comp: int) -> pd.DataFrame:
        """
        Fit PCA on the selected columns and append the principal components to the dataframe as new columns named
        'pca_X', where X is the component number starting at 1. The data must be free of missing values (i.e. NaN)
        in the selected columns, so drop such rows before calling.

        :param data_table: Dataframe holding the data to transform.
        :param cols: Columns used to fit and transform the principal components.
        :param number_comp: Number of components to compute; at most the number of columns.
        :return: Dataframe with the original data plus the new component columns.
        """

        # Normalize first so every column contributes comparably to the fit.
        normalized = util.normalize_dataset(data_table, cols)

        # Fit the PCA model on the normalized columns.
        pca = PCA(n_components=number_comp)
        pca.fit(normalized[cols])

        # Project the normalized data and attach one column per component.
        components = pca.transform(normalized[cols])
        for idx in range(number_comp):
            data_table['pca_' + str(idx + 1)] = components[:, idx]
        return data_table
Example #3
0
    def local_outlier_factor(self, data_table: pd.DataFrame, cols: List[str],
                             d_function: str, k: int) -> pd.DataFrame:
        """
        Compute the local outlier factor (LOF) for every row of the data table. Inspired by
        https://github.com/damjankuznar/pylof/blob/master/lof.py but tailored towards the distance metrics and data
        structures used here.

        :param data_table: DataFrame to calculate the lof for.
        :param cols: Columns used to compute distances between rows.
        :param d_function: Distance function to use. By now only 'euclidean' is supported.
        :param k: Number of neighboring points considered.
        :return: Original DataFrame with a new 'lof' column added.
        """

        print("Calculating local outlier factor.")

        # Drop incomplete rows and normalize the selected columns.
        norm_table = util.normalize_dataset(
            data_table.dropna(axis=0, subset=cols), cols)
        # Build the pairwise distance table between all instances.
        self.distances = self.create_distance_table(norm_table, cols,
                                                    d_function)

        # One LOF score per row.
        lof_scores = [
            self.local_outlier_factor_instance(row, k)
            for row in tqdm(range(len(norm_table.index)))
        ]
        lof_frame = pd.DataFrame(lof_scores,
                                 index=norm_table.index,
                                 columns=['lof'])
        data_table = pd.concat([data_table, lof_frame], axis=1)
        # Free the cached distance table once scoring is done.
        del self.distances
        return data_table
Example #4
0
    def local_outlier_factor(self, data_table, cols, d_function, k):
        # Inspired by https://github.com/damjankuznar/pylof/blob/master/lof.py
        # but tailored towards the distance metrics and data structures used here.

        print("Calculating local outlier factor.")

        # Drop incomplete rows and normalize the selected columns.
        normalized = util.normalize_dataset(
            data_table.dropna(axis=0, subset=cols), cols)
        # For every data point, find its k nearest neighbors and the
        # distances to them.
        self.neighbor_distances, self.neighbors = self.k_nearest_neighbors(
            normalized, cols, k, d_function)

        lof_scores = []
        for row in range(len(normalized.index)):
            # Lightweight progress report every 100 rows.
            if row % 100 == 0:
                print(f'Completed {row} steps for LOF.')
            lof_scores.append(self.local_outlier_factor_instance(row, k))
        lof_frame = pd.DataFrame(lof_scores,
                                 index=normalized.index,
                                 columns=['lof'])
        data_table = pd.concat([data_table, lof_frame], axis=1)
        # Free the neighbor caches once scoring is done.
        del self.neighbors
        del self.neighbor_distances
        return data_table
Example #5
0
    def determine_pc_explained_variance(self, data_table, cols):
        # Fit a PCA with one component per column on the normalized data
        # and report how much variance each component explains.
        normalized = util.normalize_dataset(data_table, cols)

        self.pca = PCA(n_components=len(cols))
        self.pca.fit(normalized[cols])
        return self.pca.explained_variance_ratio_
Example #6
0
    def local_outlier_factor(self, data_table, cols, d_function, k):
        # Inspired on https://github.com/damjankuznar/pylof/blob/master/lof.py
        # but tailored towards the distance metrics and data structures used here.

        # Drop incomplete rows and normalize the selected columns.
        normalized = util.normalize_dataset(data_table.dropna(axis=0, subset=cols), cols)
        # Build the pairwise distance table between all instances.
        self.distances = self.distance_table(normalized, cols, d_function)

        # One LOF score per row.
        lof_scores = [self.local_outlier_factor_instance(row, k)
                      for row in range(len(normalized.index))]
        lof_frame = pd.DataFrame(lof_scores, index=normalized.index, columns=['lof'])
        data_table = pd.concat([data_table, lof_frame], axis=1)
        return data_table
Example #7
0
    def apply_pca(self, data_table, cols, number_comp):
        # Normalize first so every column contributes comparably to the fit.
        normalized = util.normalize_dataset(data_table, cols)

        # Fit the PCA model on the normalized columns.
        self.pca = PCA(n_components=number_comp)
        self.pca.fit(normalized[cols])

        # Project the normalized data.
        components = self.pca.transform(normalized[cols])

        # Attach one 'pca_X' column per component (X starts at 1).
        for idx in range(number_comp):
            data_table['pca_' + str(idx + 1)] = components[:, idx]

        return data_table
Example #8
0
    def determine_pc_explained_variance(data_table: pd.DataFrame, cols: List[str]) -> List[float]:
        """
        Fit a PCA with one component per selected column and return the explained variance ratios. The data must be
        free of missing values (i.e. NaN) in the selected columns, so drop such rows before calling.

        :param data_table: Dataframe with the data to fit PCA on.
        :param cols: Columns in data_table used for fitting the PCA.
        :return: Explained variance ratio of each principal component.
        """

        # Normalize first so every column contributes comparably to the fit.
        normalized = util.normalize_dataset(data_table, cols)

        # Fit the PCA and return the per-component explained variance.
        pca = PCA(n_components=len(cols))
        pca.fit(normalized[cols])
        return pca.explained_variance_ratio_
Example #9
0
    def simple_distance_based(self, data_table: pd.DataFrame, cols: List[str],
                              d_function: str, d_min: float,
                              f_min: float) -> pd.DataFrame:
        """
        Detect outliers with a simple distance-based rule: a row is flagged when more than f_min of all points lie
        further than d_min away from it. A new binary column 'simple_dist_outlier' is added.

        :param data_table: Data to detect outliers in.
        :param cols: Columns used to compute distances between rows.
        :param d_function: Distance function to use for calculating the distance between points.
        :param d_min: Minimum distance to count points as neighbours.
        :param f_min: Proportion of all data points from which a point is counted as an outlier.
        :return: Original data with a new binary column named 'simple_dist_outlier'.
        """

        print('Calculating simple distance-based criterion.')

        # Drop incomplete rows and normalize the selected columns.
        normalized = util.normalize_dataset(
            data_table.dropna(axis=0, subset=cols), cols)
        # Build the pairwise distance table between all instances.
        distances = self.create_distance_table(normalized, cols, d_function)

        row_count = len(normalized.index)
        outlier_flags = []
        for row in tqdm(range(row_count)):
            # Fraction of points that lie beyond d_min from this row.
            distant = sum(1 for dist in distances.iloc[row, :].tolist()
                          if dist > d_min)
            # Outlier when that fraction exceeds the minimum frequency.
            outlier_flags.append(float(distant) / row_count > f_min)
        flag_frame = pd.DataFrame(outlier_flags,
                                  index=normalized.index,
                                  columns=['simple_dist_outlier'])
        data_table = pd.concat([data_table, flag_frame], axis=1)
        return data_table