def outlierDetection(samples_x, samples_y_aggregation):
    '''
    Detect outliers with a leave-one-out check over every sample.

    For each sample, a diagnostic model is built with
    gp_create_model.createModel from all the *other* samples, and the
    held-out sample is then predicted with gp_prediction.predict. The sample
    is reported as an outlier when its aggregated y value differs from the
    predicted mu by more than 2.33 * sigma.

    Parameters:
        samples_x: sequence of sample inputs; must support slicing and `+`
            concatenation (e.g. a list).
        samples_y_aggregation: sequence of aggregated y values, indexed in
            parallel with samples_x; same slicing/concatenation requirement.

    Returns:
        A list with one dict per detected outlier, with keys "samples_idx",
        "expected_mu", "expected_sigma" and "difference" (how far the
        deviation exceeds the 2.33 * sigma threshold), or None when no
        outlier is found (including when samples_x is empty).
    '''
    outliers = []
    for samples_idx, sample_x in enumerate(samples_x):
        # Diagnostic model: fitted on everything except the sample under test.
        diagnostic_regressor_gp = gp_create_model.createModel(
            samples_x[:samples_idx] + samples_x[samples_idx + 1:],
            samples_y_aggregation[:samples_idx] + samples_y_aggregation[samples_idx + 1:])
        mu, sigma = gp_prediction.predict(sample_x, diagnostic_regressor_gp['model'])

        # 2.33 is the z-score for 98% confidence level
        deviation = abs(samples_y_aggregation[samples_idx] - mu)
        threshold = 2.33 * sigma
        if deviation > threshold:
            outliers.append({
                "samples_idx": samples_idx,
                "expected_mu": mu,
                "expected_sigma": sigma,
                "difference": deviation - threshold,
            })

    # Callers receive None, not an empty list, when nothing was flagged.
    return outliers or None
def _outlierDetection_threaded(inputs):
    '''
    Evaluate a single sample for being an outlier (leave-one-out).

    `inputs` is a three-element sequence
    (samples_idx, samples_x, samples_y_aggregation). A progress line is
    written to stderr, a diagnostic model is built from every sample except
    the one at samples_idx, and that sample is then predicted from the model.

    Returns a dict with keys "samples_idx", "expected_mu", "expected_sigma"
    and "difference" when the sample's aggregated y value deviates from the
    predicted mu by more than 2.33 * sigma; otherwise returns None.
    '''
    target_idx, xs, ys = inputs
    progress_msg = "[%s] DEBUG: Evaluating %dth of %d samples\n"\
        % (os.path.basename(__file__), target_idx + 1, len(xs))
    sys.stderr.write(progress_msg)

    # Create a diagnostic regression model which removes the sample that we want to evaluate
    remaining_x = xs[0:target_idx] + xs[target_idx + 1:]
    remaining_y = ys[0:target_idx] + ys[target_idx + 1:]
    diagnostic_model = gp_create_model.createModel(remaining_x, remaining_y)
    mu, sigma = gp_prediction.predict(xs[target_idx], diagnostic_model['model'])

    # 2.33 is the z-score for 98% confidence level
    deviation = abs(ys[target_idx] - mu)
    threshold = 2.33 * sigma
    # Written as "not >" (rather than "<=") so a NaN comparison still yields None.
    if not deviation > threshold:
        return None

    return {
        "samples_idx": target_idx,
        "expected_mu": mu,
        "expected_sigma": sigma,
        "difference": deviation - threshold,
    }