def findInterval(self, x_t, pb=None):
    """
    This function finds the interval associated with a time series given its probability

    INPUTS :
        x_t : the time series observed up to the current time step
        pb : probability given by the classifier at time step t (used when self.feat is set)

    OUTPUTS :
        interval of x_t
    """
    # Note: a binary search over the sorted thresholds would be faster.
    t_current = len(x_t)

    # predict the probability of the positive class
    if self.fears:
        probadf = self.handle_my_classifier(
            t_current, transform_to_format_fears(numpy_to_df(x_t)), proba=True)
        proba = probadf['ProbNone1'].values[0]
    elif self.feat:
        proba = pb
    else:
        probadf = self.classifiers[t_current].predict_proba(x_t.reshape(1, -1))
        proba = probadf[0][1]  # positive-class probability (to verify)

    # search for the interval containing this probability
    ths = self.thresholds[t_current]
    for i, e in enumerate(sorted(ths)):
        if proba <= e:
            return self.nbIntervals - i
    return 1
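# A minimal standalone sketch of the threshold lookup above, assuming ascending
# thresholds over binary positive-class probabilities; `_find_interval_demo` is
# a hypothetical helper, not part of the class.
def _find_interval_demo(proba, thresholds, nb_intervals):
    # The first threshold (in ascending order) that bounds the probability
    # from above determines the interval, counted down from nb_intervals.
    for i, e in enumerate(sorted(thresholds)):
        if proba <= e:
            return nb_intervals - i
    return 1  # proba exceeds every threshold: highest-probability interval

# With 3 intervals delimited by thresholds [0.33, 0.66]:
#   _find_interval_demo(0.2, [0.33, 0.66], 3) -> 3
#   _find_interval_demo(0.5, [0.33, 0.66], 3) -> 2
#   _find_interval_demo(0.9, [0.33, 0.66], 3) -> 1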
def findInterval(self, x_t, pb=None):
    """
    This function finds the cluster associated with a time series given its probability

    INPUTS :
        x_t : the time series observed up to the current time step
        pb : probability given by the classifier at time step t (used when self.feat is set)

    OUTPUTS :
        cluster of x_t
    """
    t_current = len(x_t)

    # predict the probability of the positive class
    if self.fears:
        probadf = self.handle_my_classifier(
            t_current, transform_to_format_fears(numpy_to_df(x_t)), proba=True)
        proba = probadf['ProbNone1'].values[0]
    elif self.feat:
        proba = pb
    else:
        probadf = self.classifiers[t_current].predict_proba(x_t.reshape(1, -1))
        proba = probadf[0][1]  # positive-class probability (to verify)

    # Apply calibration to the single probability vector
    proba = self.calibrate_vector(self.calibrations_grids[t_current], proba)

    # Find the interval (here: the cluster) of the calibrated vector
    ret = self.clusterings[t_current].predict(np.array([proba]))
    return ret
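# A minimal sketch of the cluster lookup above with plain scikit-learn KMeans,
# skipping the calibration step; the probability vectors are illustrative.
import numpy as np
from sklearn.cluster import KMeans

val_probas = np.array([[0.9, 0.1], [0.8, 0.2], [0.2, 0.8], [0.1, 0.9]])
km = KMeans(n_clusters=2, n_init=10).fit(val_probas)
# predict() expects a 2-D array, hence the wrapping of the single vector:
cluster = km.predict(np.array([[0.15, 0.85]]))[0]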
def computeThresholdsAndindices(self, X_val):
    """
    This procedure computes the thresholds and the indices of the data associated with each interval.

    INPUTS :
        X_val : validation data

    OUTPUTS :
        thresholds : probability thresholds delimiting the intervals at this time step.
        indices : indices of the validation data falling in each interval.
    """
    _, t = X_val.shape

    # Predict class probabilities
    if self.fears:
        predictions = self.handle_my_classifier(
            t, transform_to_format_fears(X_val), proba=True)
        predictions = predictions.values
    elif self.feat:
        with open(op.join(self.folderRealData, self.dataset,
                          'ep_probas_' + str(t) + '.pkl'), 'rb') as inp:
            predictions = pickle.load(inp)
    else:
        predictions = self.classifiers[t].predict_proba(X_val)

    # Sort according to the positive-class probabilities (descending).
    # We always save the full prediction vector and take its last component
    # in the binary case.
    sortedProbabilities = [(i, val) for i, val in zip(
        np.argsort(predictions[:, 1])[::-1],
        sorted(predictions[:, 1], reverse=True))]

    # equal-frequency binning
    frequence = len(sortedProbabilities) // self.nbIntervals

    # compute thresholds and the indices falling in each interval
    thresholds = []
    indices = [[idx[0] for idx in sortedProbabilities[0:frequence]]]
    for i in range(1, self.nbIntervals):
        thresholds.append(sortedProbabilities[i * frequence][1])
        if i == self.nbIntervals - 1:
            # the last interval also absorbs the remainder of the division
            indices.append(
                [idx[0] for idx in sortedProbabilities[i * frequence:]])
        else:
            indices.append([
                idx[0] for idx in
                sortedProbabilities[i * frequence:(i + 1) * frequence]
            ])
    return thresholds, indices
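# A standalone sketch of the equal-frequency thresholding above on toy
# positive-class probabilities; `probas` and `nb_intervals` are illustrative.
import numpy as np

probas = np.array([0.95, 0.2, 0.7, 0.4, 0.85, 0.1])
nb_intervals = 3
order = np.argsort(probas)[::-1]            # indices, highest probability first
sorted_probas = probas[order]               # descending probabilities
frequence = len(probas) // nb_intervals     # 2 observations per interval
# Thresholds are the probabilities sitting at the interval boundaries:
thresholds = [sorted_probas[i * frequence] for i in range(1, nb_intervals)]
# -> [0.7, 0.2]; interval 0 holds {0.95, 0.85}, interval 1 holds {0.7, 0.4}, ...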
def compute_P_yhat_y_gammak(self, X_val, Y_val, timestep, indicesData):
    """
    This function computes P_t(ŷ/y, gamma_k)

    INPUTS :
        X_val, Y_val : validation data
        timestep : time step reached
        indicesData : indices of the data associated with each interval / time step

    OUTPUTS :
        probabilities : P_t(ŷ/y, gamma_k)
    """
    # initialise probabilities to 0
    probabilities = {(gamma_k, y, y_hat): 0
                     for y in self.labels
                     for y_hat in self.labels
                     for gamma_k in range(self.nbIntervals)}

    # Iterate over intervals
    for gamma_k in range(self.nbIntervals):
        indices_gamma_k = indicesData[gamma_k]
        # Subset of the validation set falling in interval gamma_k
        X_val_ck = X_val.loc[indices_gamma_k, :]
        if X_val_ck.shape[0] > 0:
            if self.fears:
                predictions = self.handle_my_classifier(
                    timestep,
                    transform_to_format_fears(X_val_ck.iloc[:, :timestep]))
            elif self.feat:
                with open(op.join(self.folderRealData, self.dataset,
                                  'ep_preds_' + str(timestep) + '.pkl'),
                          'rb') as inp:
                    predictions = list(pickle.load(inp))
                predictions = [predictions[ii] for ii in indices_gamma_k]
            else:
                predictions = self.classifiers[timestep].predict(
                    X_val_ck.iloc[:, :timestep])
            # count the occurrences of each (gamma_k, y, y_hat) triple
            for y_hat, y in zip(predictions, Y_val.loc[indices_gamma_k]):
                probabilities[gamma_k, y, y_hat] += 1

    # normalize by the number of observations of true class y in gamma_k
    for gamma_k, y, y_hat in probabilities.keys():
        Y_val_gamma = Y_val.loc[indicesData[gamma_k]]
        sizeCluster_gamma = len(Y_val_gamma[Y_val_gamma == y])
        if sizeCluster_gamma != 0:
            probabilities[gamma_k, y, y_hat] /= sizeCluster_gamma
    return probabilities
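# A toy sketch of the count-and-normalize estimate of P_t(ŷ|y, gamma_k) above,
# for a single interval and binary labels; all values are illustrative.
y_true = [1, 1, 0, 1]   # true labels of the validation series in gamma_k
y_pred = [1, 0, 0, 1]   # classifier predictions at this time step
labels = [0, 1]
counts = {(y, y_hat): 0 for y in labels for y_hat in labels}
for y_hat, y in zip(y_pred, y_true):
    counts[y, y_hat] += 1
# Normalize by the number of observations with true class y in the interval:
probs = {(y, y_hat): c / max(1, y_true.count(y))
         for (y, y_hat), c in counts.items()}
# probs[1, 1] == 2/3, probs[1, 0] == 1/3, probs[0, 0] == 1.0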
def computeClusteringAndIndices(self, X_val, t):
    """
    This procedure computes the clustering models and the indices of the data
    associated with each cluster.

    INPUTS :
        X_val : validation data

    OUTPUTS :
        thresholds : always None here (kept for interface compatibility with
                     the threshold-based variant).
        indices : indices of the validation data falling in each cluster.
    """
    print("Computing grouping clusterings")
    _, t = X_val.shape

    # Predict class probabilities
    if self.fears:
        predictions = self.handle_my_classifier(
            t, transform_to_format_fears(X_val), proba=True)
        predictions = predictions.values
    elif self.feat:
        with open(op.join(self.folderRealData, self.dataset,
                          'ep_probas_' + str(t) + '.pkl'), 'rb') as inp:
            predictions = pickle.load(inp)
    else:
        predictions = self.classifiers[t].predict_proba(X_val)

    # Calibrate the probability vectors along each dimension
    self.calibrations_grids[t] = self.calibration(predictions, 3)
    predictions = self.calibrate(self.calibrations_grids[t], predictions)

    # TODO: tune the KMeans parameters
    print("Ready to cluster proba vectors")
    kmeans_model = KMeans(n_clusters=self.nbIntervals, init='k-means++',
                          n_init=10, max_iter=3000, tol=0.0001).fit(predictions)
    self.clusterings[t] = kmeans_model

    # group the validation indices by the cluster they were assigned to
    indices = [[] for _ in range(self.nbIntervals)]
    for index, cluster in enumerate(kmeans_model.labels_):
        indices[cluster].append(index)

    thresholds = None
    return thresholds, indices
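# A minimal sketch of the clustering step above with plain scikit-learn KMeans,
# skipping the per-dimension calibration; the probability vectors are toys.
import numpy as np
from sklearn.cluster import KMeans

probas = np.array([[0.9, 0.1], [0.85, 0.15], [0.1, 0.9], [0.2, 0.8]])
km = KMeans(n_clusters=2, init='k-means++', n_init=10).fit(probas)
indices = [[] for _ in range(2)]
for index, cluster in enumerate(km.labels_):
    indices[cluster].append(index)
# Each sublist now holds the validation-row indices assigned to one cluster.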
def computeThresholdsAndindices(self, X_val):
    """
    This procedure computes the thresholds delimiting the intervals.

    INPUTS :
        X_val : validation data

    OUTPUTS :
        thresholds : probability thresholds for this time step.
    """
    _, t = X_val.shape

    # Predict class probabilities
    if self.fears:
        predictions = self.handle_my_classifier(
            t, transform_to_format_fears(X_val), proba=True)
        predictions = predictions.values  # TODO: select the 'ProbNone1' column
    elif self.feat:
        with open('RealData/' + self.dataset + '/ep_probas_' + str(t) + '.pkl',
                  'rb') as inp:
            predictions = pickle.load(inp)
    else:
        predictions = self.classifiers[t].predict_proba(X_val)

    # Sort according to the positive-class probabilities (descending)
    if self.feat:
        sortedProbabilities = [(i, val) for i, val in zip(
            np.argsort(predictions)[::-1],
            sorted(predictions, reverse=True))]
    else:
        sortedProbabilities = [(i, val) for i, val in zip(
            np.argsort(predictions[:, 1])[::-1],
            sorted(predictions[:, 1], reverse=True))]

    # equal-frequency binning
    frequence = len(sortedProbabilities) // self.nbIntervals

    # compute thresholds
    thresholds = []
    for i in range(1, self.nbIntervals):
        thresholds.append(sortedProbabilities[i * frequence][1])
    return thresholds
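# A small sketch contrasting the two sort branches above, assuming the `feat`
# branch stores 1-D positive-class probabilities while the default branch
# stores full (n_samples, n_classes) matrices; toy values only.
import numpy as np

preds_feat = np.array([0.2, 0.9, 0.5])                        # already 1-D
preds_full = np.array([[0.8, 0.2], [0.1, 0.9], [0.5, 0.5]])
pos_feat = preds_feat                                         # used directly
pos_full = preds_full[:, 1]                                   # last column (binary case)
assert np.allclose(pos_feat, pos_full)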
def compute_P_yhat_y_gammak(self, X_val, Y_val, timestep):
    """
    This function computes P_t(ŷ/y, gamma_k)

    INPUTS :
        X_val, Y_val : validation data
        timestep : time step reached

    OUTPUTS :
        probabilities : P_t(ŷ/y, gamma_k)
    """
    # initialise probabilities to 0
    probabilities = {(gamma_k, y, y_hat): 0
                     for y in self.labels
                     for y_hat in self.labels
                     for gamma_k in range(self.nbIntervals)}

    rec = self.recodedTS.loc[X_val.index.values, :]

    # Iterate over intervals
    for gamma_k in range(self.nbIntervals):
        indices_gamma_k = rec[rec[timestep - 1] == gamma_k + 1].index.values
        # Subset of the validation set falling in interval gamma_k
        X_val_ck = X_val.loc[indices_gamma_k, :]
        Y_val_ck = Y_val.loc[indices_gamma_k]
        if len(Y_val_ck) > 0:
            if self.fears:
                predictions = self.handle_my_classifier(
                    timestep,
                    transform_to_format_fears(X_val_ck.iloc[:, :timestep]))
            elif self.feat:
                with open('RealData/' + self.dataset + '/ep_preds_' +
                          str(timestep) + '.pkl', 'rb') as inp:
                    predictions = pickle.load(inp)
                predictions = [predictions[ii] for ii in indices_gamma_k]
            else:
                predictions = self.classifiers[timestep].predict(
                    X_val_ck.iloc[:, :timestep])
            # count the occurrences of each (gamma_k, y, y_hat) triple
            for y_hat, y in zip(predictions, Y_val_ck):
                probabilities[gamma_k, y, y_hat] += 1

    # normalize by the number of observations of true class y in gamma_k
    for gamma_k, y, y_hat in probabilities.keys():
        indices_gamma_k = rec[rec[timestep - 1] == gamma_k + 1].index.values
        Y_val_gamma = Y_val.loc[indices_gamma_k]
        sizeCluster_gamma = len(Y_val_gamma[Y_val_gamma == y])
        if sizeCluster_gamma != 0:
            probabilities[gamma_k, y, y_hat] /= sizeCluster_gamma
    return probabilities
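# A toy sketch of the recoded-series lookup above, assuming recodedTS stores,
# for each series (row) and time step (column), the 1-based interval it falls
# in; the frame below is illustrative.
import pandas as pd

recodedTS = pd.DataFrame({0: [1, 2, 1], 1: [2, 2, 1]})    # rows: series, cols: time steps
timestep, gamma_k = 2, 1                                  # interval id gamma_k + 1 = 2
indices_gamma_k = recodedTS[recodedTS[timestep - 1] == gamma_k + 1].index.values
# -> array([0, 1]): series 0 and 1 are in interval 2 at time step 2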
def compute_P_yhat_y_ck(self, X_val, Y_val, timestep):
    """
    This function computes P_t(ŷ/y, c_k)

    INPUTS :
        X_val, Y_val : validation data
        timestep : time step reached

    OUTPUTS :
        probabilities : probabilities of predicting ŷ given the true label y
                        and the cluster c_k.
    """
    ############## INITS
    # clusters associated with the time series
    # TODO: rename the variables & functions (ids or clusters, etc.)
    clusters_data = self.clustering.predict(X_val)

    # initialise probabilities to 0
    probabilities = {(c_k, y, y_hat): 0
                     for y in self.labels
                     for y_hat in self.labels
                     for c_k in self.clusters}

    # for each cluster, collect the indices of the data belonging to it
    indices_data_cluster = {c_k: [] for c_k in self.clusters}
    for index, value in enumerate(clusters_data):
        indices_data_cluster[value].append(index)

    ############## OCCURRENCES
    for c_k in self.clusters:
        indices_ck = indices_data_cluster[c_k]
        # Subset of the validation set in cluster c_k
        X_val_ck = X_val.iloc[indices_ck]
        if len(indices_ck) > 0:
            # predict labels for this subset
            if self.fears:
                predictions = self.handle_my_classifier(
                    timestep,
                    transform_to_format_fears(X_val_ck.iloc[:, :timestep]))
            elif self.feat:
                with open('RealData/' + self.dataset + '/ep_preds_' +
                          str(timestep) + '.pkl', 'rb') as inp:
                    predictions = pickle.load(inp)
                predictions = [predictions[ii] for ii in indices_ck]
            else:
                predictions = self.classifiers[timestep].predict(
                    X_val_ck.iloc[:, :timestep])
            # count the occurrences of each (c_k, y, y_hat) triple
            for y_hat, y in zip(predictions, Y_val.iloc[indices_ck]):
                probabilities[c_k, y, y_hat] += 1

    ############## NORMALIZATION KNOWING Y
    for c_k, y, y_hat in probabilities.keys():
        # labels of the observations in cluster c_k
        Y_val_ck = Y_val.iloc[indices_data_cluster[c_k]]
        # number of observations in this subset with true label y
        sizeCluster_y = len(Y_val_ck[Y_val_ck == y])
        if sizeCluster_y != 0:
            probabilities[c_k, y, y_hat] /= sizeCluster_y
    return probabilities
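# A compact toy run of the estimate above for one cluster c_k with binary
# labels; all values are illustrative.
import pandas as pd

Y_val = pd.Series([1, 0, 1, 1])
indices_ck = [0, 2, 3]                    # members of cluster c_k
y_pred_ck = [1, 1, 0]                     # predictions at this time step
Y_val_ck = Y_val.iloc[indices_ck]
counts = {}
for y_hat, y in zip(y_pred_ck, Y_val_ck):
    counts[y, y_hat] = counts.get((y, y_hat), 0) + 1
size_y1 = len(Y_val_ck[Y_val_ck == 1])    # 3 members with true label 1
p_hat = counts[1, 1] / size_y1            # P(ŷ=1 | y=1, c_k) == 2/3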