Example #1
    def findInterval(self, x_t, pb=None):
        """
           Finds the interval associated with a time series, given the
           probability returned by the classifier at time step t.

           INPUTS :
                x_t : the incomplete time series observed so far
                pb  : precomputed probability (only used when self.feat is set)

           OUTPUTS :
                interval index of x_t
        """
        # we could use binary search for better perf
        t_current = len(x_t)
        # predict probability
        if self.fears:
            probadf = self.handle_my_classifier(t_current,
                                                transform_to_format_fears(
                                                    numpy_to_df(x_t)),
                                                proba=True)
            proba = probadf['ProbNone1'].values[0]
        elif self.feat:
            proba = pb

        else:
            probadf = self.classifiers[t_current].predict_proba(
                x_t.reshape(1, -1))
            proba = probadf[0][1]  # TODO: verify this index

        # search for interval given probability
        ths = self.thresholds[t_current]
        for i, e in enumerate(sorted(ths, reverse=False)):
            if (proba <= e):
                return self.nbIntervals - i
        return 1
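
The lookup above scans the thresholds in ascending order and returns the interval whose cut point first exceeds the probability, so interval 1 holds the most confident predictions. A minimal standalone sketch of that mapping, with made-up thresholds for nbIntervals = 4:

    # Dummy thresholds: three cut points split [0, 1] into four intervals.
    thresholds = [0.75, 0.5, 0.25]      # stored in descending order
    nbIntervals = 4

    def interval_of(proba):
        for i, e in enumerate(sorted(thresholds)):   # ascending scan
            if proba <= e:
                return nbIntervals - i               # low proba -> high interval index
        return 1                                     # above every threshold

    assert interval_of(0.9) == 1
    assert interval_of(0.6) == 2
    assert interval_of(0.1) == 4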
Example #2

    def findInterval(self, x_t, pb=None):
        """
           This function finds interval associated with a timeseries given its
           probability

           INPUTS :
                proba : probability given by the classifier at time step t

           OUTPUTS :
                cluster of x_t
        """
        # we could use binary search for better perf
        t_current = len(x_t)
        # predict probability
        if self.fears:
            probadf = self.handle_my_classifier(t_current,
                                                transform_to_format_fears(
                                                    numpy_to_df(x_t)),
                                                proba=True)
            proba = probadf['ProbNone1'].values[0]
        elif self.feat:
            proba = pb
        else:
            probadf = self.classifiers[t_current].predict_proba(
                x_t.reshape(1, -1))
            proba = probadf[0][1]  # TODO: verify this index

        # Apply calibration to the single probability vector
        proba = self.calibrate_vector(self.calibrations_grids[t_current], proba)
        # Find the interval (here: the cluster)
        ret = self.clusterings[t_current].predict(np.array([proba]))
        return ret[0]  # predict returns an array of one element
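
This second variant replaces the threshold scan with a clustering lookup: the probability vector is calibrated (calibrate_vector and calibrations_grids are helpers of the class, not shown here) and assigned to the nearest centroid. A hedged sketch with sklearn's KMeans, skipping the calibration step:

    import numpy as np
    from sklearn.cluster import KMeans

    # Stand-in for the predict_proba vectors of the validation set.
    proba_vectors = np.random.default_rng(0).random((100, 2))
    kmeans = KMeans(n_clusters=4, n_init=10).fit(proba_vectors)

    proba = np.array([0.2, 0.8])        # one probability vector at time t
    cluster = kmeans.predict(proba.reshape(1, -1))[0]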
Example #3
    def computeThresholdsAndindices(self, X_val):
        """
           This procedure computes thresholds and indices of data associatied
           to each interval.

           INPUTS :
                X_val : validation data

           OUTPUTS :
                thresholds : dictionary of thresholds for each time step.
                indices  : indices associated to each time step and each interval
        """

        _, t = X_val.shape

        # Predict classes
        if self.fears:
            predictions = self.handle_my_classifier(
                t, transform_to_format_fears(X_val), proba=True)
            predictions = predictions.values
        elif self.feat:
            with open(
                    op.join(self.folderRealData, self.dataset,
                            'ep_probas_' + str(t) + '.pkl'), 'rb') as inp:
                predictions = pickle.load(inp)
        else:
            predictions = self.classifiers[t].predict_proba(X_val)

        # Sort indices by decreasing probability. We always save the full
        # prediction vector and take the last component for the binary case.
        sortedProbabilities = [(i, val) for i, val in zip(
            np.argsort(predictions[:, 1])[::-1],
            sorted(predictions[:, 1], reverse=True))]

        # equal-frequency binning
        frequence = len(sortedProbabilities) // self.nbIntervals
        # compute thresholds
        thresholds = []
        indices = [[idx[0] for idx in sortedProbabilities[0:frequence]]]
        for i in range(1, self.nbIntervals):
            thresholds.append(sortedProbabilities[i * frequence][1])
            if (i == self.nbIntervals - 1):
                # the last interval absorbs the remainder of the division
                indices.append(
                    [idx[0] for idx in sortedProbabilities[i * frequence:]])
            else:
                indices.append([
                    idx[0]
                    for idx in sortedProbabilities[i * frequence:(i + 1) *
                                                   frequence]
                ])

        return thresholds, indices
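
The split above is plain equal-frequency binning: sort the positive-class probabilities in decreasing order, cut after every frequence observations, and record the probability at each cut as a threshold. A self-contained sketch on dummy scores:

    import numpy as np

    probas = np.array([0.9, 0.1, 0.8, 0.3, 0.7, 0.2, 0.6, 0.4, 0.55, 0.45])
    nbIntervals = 3

    order = np.argsort(probas)[::-1]           # indices, highest proba first
    ranked = sorted(probas, reverse=True)
    frequence = len(probas) // nbIntervals     # 3 observations per interval

    thresholds = [ranked[i * frequence] for i in range(1, nbIntervals)]
    indices = [list(order[i * frequence:(i + 1) * frequence])
               for i in range(nbIntervals - 1)]
    indices.append(list(order[(nbIntervals - 1) * frequence:]))  # remainder
    # thresholds == [0.6, 0.4]; the last interval holds 4 observations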
Example #4

    def compute_P_yhat_y_gammak(self, X_val, Y_val, timestep, indicesData):
        """
           This function computes P_t(ŷ | y, γ_k).

           INPUTS :
                X_val, Y_val : validation data
                timestep     : time step reached
                indicesData  : indices of data associated with each interval / time step

           OUTPUTS :
                probabilities : P_t(ŷ | y, γ_k)
        """

        # initialise probabilities to 0
        probabilities = {(gamma_k, y, y_hat): 0
                         for y in self.labels for y_hat in self.labels
                         for gamma_k in range(self.nbIntervals)}


        # Iterate over intervals
        for gamma_k in range(self.nbIntervals):

            indices_gamma_k = indicesData[gamma_k]
            # Subset of the validation set falling in interval gamma_k
            X_val_ck = X_val.loc[indices_gamma_k, :]

            if (X_val_ck.shape[0] > 0):
                if self.fears:
                    predictions = self.handle_my_classifier(
                        timestep,
                        transform_to_format_fears(X_val_ck.iloc[:, :timestep]))
                elif self.feat:
                    with open(op.join(self.folderRealData, self.dataset,
                                      'ep_preds_' + str(timestep) + '.pkl'),
                              'rb') as inp:
                        predictions = list(pickle.load(inp))
                        predictions = [predictions[ii] for ii in indices_gamma_k]
                else:
                    predictions = self.classifiers[timestep].predict(
                        X_val_ck.iloc[:, :timestep])

                for y_hat, y in zip(predictions, Y_val.loc[indices_gamma_k]):
                    # count occurrences
                    probabilities[gamma_k, y, y_hat] += 1
        # normalize
        for gamma_k, y, y_hat in probabilities.keys():
            Y_val_gamma = Y_val.loc[indicesData[gamma_k]]

            # number of observations in gamma_k with label y
            sizeCluster_gamma = len(Y_val_gamma[Y_val_gamma == y])

            if (sizeCluster_gamma != 0):
                probabilities[gamma_k, y, y_hat] /= sizeCluster_gamma

        return probabilities
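
The count-and-normalize scheme in miniature, for one interval (gamma_k = 0) and hypothetical labels: each (gamma_k, y, y_hat) cell counts predictions y_hat among validation series of true label y in the interval, then is divided by the number of series of label y there:

    labels = [0, 1]
    y_true = [0, 0, 1, 1, 1]      # true labels of the series in interval 0
    y_pred = [0, 1, 1, 1, 0]      # classifier predictions at time step t

    probabilities = {(0, y, y_hat): 0 for y in labels for y_hat in labels}
    for y_hat, y in zip(y_pred, y_true):
        probabilities[0, y, y_hat] += 1
    for (g, y, y_hat) in probabilities:
        n_y = sum(1 for v in y_true if v == y)
        if n_y != 0:
            probabilities[g, y, y_hat] /= n_y
    # probabilities[0, 1, 1] == 2/3, and rows sum to 1 for each true label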
Example #5

    def computeClusteringAndIndices(self, X_val, t):
        """
           This procedure computes the clustering model and the indices of the
           data associated with each cluster.

           INPUTS :
                X_val : validation data

           OUTPUTS :
                thresholds : always None for this clustering variant.
                indices  : indices of the validation data in each cluster
        """
        print("Computing grouping clusterings")

        # t is recomputed from the data shape, overriding the argument
        _, t = X_val.shape

        # Predict classes
        if self.fears:
            predictions = self.handle_my_classifier(
                t, transform_to_format_fears(X_val), proba=True)
            predictions = predictions.values
        elif self.feat:
            with open(op.join(self.folderRealData, self.dataset,
                              'ep_probas_' + str(t) + '.pkl'), 'rb') as inp:
                predictions = pickle.load(inp)
        else:
            predictions = self.classifiers[t].predict_proba(X_val)

        # Calibrate along each dimension
        self.calibrations_grids[t] = self.calibration(predictions, 3)
        predictions = self.calibrate(self.calibrations_grids[t], predictions)

        # TODO: tune the KMeans hyperparameters
        print("Ready to cluster proba vectors")
        kmeans_model = KMeans(n_clusters=self.nbIntervals, init='k-means++',
                              n_init=10, max_iter=3000, tol=0.0001).fit(predictions)

        self.clusterings[t] = kmeans_model

        indices = [[] for _ in range(self.nbIntervals)]
        clusters = kmeans_model.labels_

        for index, cluster in enumerate(clusters):
            indices[cluster].append(index)
        thresholds = None

        return thresholds, indices
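
A compact sketch of the fit-time pipeline this method implements: cluster the predict_proba vectors, then bucket the validation rows by their assigned cluster (the calibration step is omitted and the data is synthetic):

    import numpy as np
    from sklearn.cluster import KMeans

    proba_vectors = np.random.RandomState(0).rand(60, 2)   # stand-in for predict_proba
    nbIntervals = 3

    kmeans_model = KMeans(n_clusters=nbIntervals, n_init=10).fit(proba_vectors)
    indices = [[] for _ in range(nbIntervals)]
    for index, cluster in enumerate(kmeans_model.labels_):
        indices[cluster].append(index)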
Example #6
    def computeThresholdsAndindices(self, X_val):
        """
           This procedure computes thresholds and indices of data associatied
           to each interval.

           INPUTS :
                X_val : validation data

           OUTPUTS :
                thresholds : dictionary of thresholds for each time step.
                indices  : indices associated to each time step and each interval
        """

        _, t = X_val.shape
        # Predict classes
        if self.fears:
            predictions = self.handle_my_classifier(
                t, transform_to_format_fears(X_val), proba=True)
            predictions = predictions.values  # TODO: select the ProbNone1 column
        elif self.feat:
            with open(
                    'RealData/' + self.dataset + '/ep_probas_' + str(t) +
                    '.pkl', 'rb') as inp:
                predictions = pickle.load(inp)
        else:
            predictions = self.classifiers[t].predict_proba(X_val)
        # Sort according to probabilities
        if self.feat:
            sortedProbabilities = [(i, val) for i, val in zip(
                np.argsort(predictions)[::-1], sorted(predictions,
                                                      reverse=True))]
        else:
            sortedProbabilities = [(i, val) for i, val in zip(
                np.argsort(predictions[:, 1])[::-1],
                sorted(predictions[:, 1], reverse=True))]
        # equal-frequency binning
        frequence = len(sortedProbabilities) // self.nbIntervals
        # compute thresholds
        thresholds = []
        for i in range(1, self.nbIntervals):
            thresholds.append(sortedProbabilities[i * frequence][1])
        return thresholds
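
These thresholds are exactly what findInterval (Example #1) scans: nbIntervals - 1 cut points, in descending order, splitting the probability axis into equal-frequency intervals. A hypothetical round trip (the trigger object is illustrative, not from the source):

    # thresholds computed on validation data at time step t...
    # trigger.thresholds[t] = trigger.computeThresholdsAndindices(X_val)
    # ...are later scanned at inference time on a partial series x_t:
    # interval = trigger.findInterval(x_t)    # 1 = most confident interval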
Example #7
    def compute_P_yhat_y_gammak(self, X_val, Y_val, timestep):
        """
           This function computes P_t(ŷ/y,c_k)


           INPUTS :
                X_val, Y_val : valdiation data
                timestep     : timestep reached
                indicesData  : indices of data associated to each interval / timestep

           OUTPUTS :
                probabilities : P_t(ŷ/y,gamma_k)

        """

        # initialise probabilities to 0
        probabilities = {(gamma_k, y, y_hat): 0
                         for y in self.labels for y_hat in self.labels
                         for gamma_k in range(self.nbIntervals)}
        rec = self.recodedTS.loc[X_val.index.values, :]
        # Iterate over intervals
        for gamma_k in range(self.nbIntervals):

            # recoded value gamma_k + 1 marks the series lying in interval gamma_k
            indices_gamma_k = rec[rec[timestep - 1] == gamma_k + 1].index.values

            # Subset of the validation set falling in interval gamma_k
            X_val_ck = X_val.loc[indices_gamma_k, :]
            Y_val_ck = Y_val.loc[indices_gamma_k]
            if (len(Y_val_ck) > 0):
                if self.fears:
                    predictions = self.handle_my_classifier(
                        timestep,
                        transform_to_format_fears(X_val_ck.iloc[:, :timestep]))
                elif self.feat:
                    with open('RealData/' + self.dataset + '/ep_preds_' +
                              str(timestep) + '.pkl', 'rb') as inp:
                        predictions = pickle.load(inp)
                        predictions = [
                            predictions[ii] for ii in indices_gamma_k
                        ]
                else:
                    predictions = self.classifiers[timestep].predict(
                        X_val_ck.iloc[:, :timestep])
                for y_hat, y in zip(predictions, Y_val_ck):
                    # count occurrences
                    probabilities[gamma_k, y, y_hat] += 1
        # normalize
        for gamma_k, y, y_hat in probabilities.keys():
            indices_gamma_k = rec[rec[timestep - 1] == gamma_k + 1].index.values
            Y_val_gamma = Y_val.loc[indices_gamma_k]

            # number of observations in gamma_k with label y
            sizeCluster_gamma = len(Y_val_gamma[Y_val_gamma == y])
            if (sizeCluster_gamma != 0):
                probabilities[gamma_k, y, y_hat] /= sizeCluster_gamma
        return probabilities
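
Unlike Example #4, which receives the per-interval indices as an argument, this variant recovers interval membership from self.recodedTS, which appears to store one row per series and, per time step, the 1-based id of the interval the series falls in. A toy illustration of that filter with a hypothetical recoded table:

    import pandas as pd

    # Hypothetical recoded table: columns are time steps, cells are
    # 1-based interval ids.
    rec = pd.DataFrame({0: [1, 2, 1], 1: [3, 2, 1]}, index=[10, 11, 12])

    timestep, gamma_k = 2, 0
    indices_gamma_k = rec[rec[timestep - 1] == gamma_k + 1].index.values
    # array([12]): only series 12 is in interval 0 (recoded value 1) at t = 2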
Example #8
    def compute_P_yhat_y_ck(self, X_val, Y_val, timestep):
        """
           This function computes P_t(ŷ/y,c_k)


           INPUTS :
                X_val, Y_val : valdiation data
                timestep     : timestep reached

           OUTPUTS :
                probabilities : probabilities of label y given a cluster ck.
        """

        ############## INITS
        # clusters associated with the time series
        # TODO: rename variables & function names (ids vs clusters, etc.)
        clusters_data = self.clustering.predict(X_val)

        # initialise probabilities to 0
        probabilities = {(c_k, y, y_hat): 0 for y in self.labels
                         for y_hat in self.labels for c_k in self.clusters}

        # for each cluster, collect the indices of the data belonging to it
        indices_data_cluster = {c_k: [] for c_k in self.clusters}

        for index, value in enumerate(clusters_data):
            indices_data_cluster[value].append(index)

        ############## OCCURRENCES
        for c_k in self.clusters:

            indices_ck = indices_data_cluster[c_k]

            # Subset of Validation set in cluster C_k
            X_val_ck = X_val.iloc[indices_ck]

            if (len(indices_ck) > 0):
                # predict labels for this subset
                if self.fears:
                    predictions = self.handle_my_classifier(
                        timestep,
                        transform_to_format_fears(X_val_ck.iloc[:, :timestep]))
                elif self.feat:
                    with open('RealData/' + self.dataset + '/ep_preds_' +
                              str(timestep) + '.pkl', 'rb') as inp:
                        predictions = pickle.load(inp)
                        predictions = [predictions[ii] for ii in indices_ck]
                else:
                    predictions = self.classifiers[timestep].predict(
                        X_val_ck.iloc[:, :timestep])

                for y_hat, y in zip(predictions, Y_val.iloc[indices_ck]):
                    # count occurrences
                    probabilities[c_k, y, y_hat] += 1

        ############## NORMALIZATION KNOWING Y
        for c_k, y, y_hat in probabilities.keys():

            # subset ck
            Y_val_ck = Y_val.iloc[indices_data_cluster[c_k]]
            # number of observations in this subset that have label y
            sizeCluster_y = len(Y_val_ck[Y_val_ck == y])
            if sizeCluster_y != 0:
                probabilities[c_k, y, y_hat] /= sizeCluster_y

        return probabilities
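
A useful property to check on the returned table: for every cluster c_k and true label y that actually occurs in that cluster, the probabilities over predicted labels should sum to 1. A small sanity check, assuming the probabilities dict and the labels/clusters collections built above:

    def check_normalization(probabilities, clusters, labels, tol=1e-9):
        # each non-empty (c_k, y) row of P(y_hat | y, c_k) must sum to 1
        for c_k in clusters:
            for y in labels:
                row = sum(probabilities[c_k, y, y_hat] for y_hat in labels)
                assert row == 0 or abs(row - 1.0) < tol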