def queryLowLikelihoodInstances(self, drop_instances, num_instances):
    """Queue the num_instances instances with the lowest likelihood.

    The selected instances are registered as 'low_likelihood'
    annotation queries; their integer ids are returned.
    """
    if num_instances == 0:
        return []
    candidates = self.getSelectedInstancesDataframe(drop_instances)
    # Lowest likelihood first (ascending, in-place sort).
    matrix_tools.sort_data_frame(candidates, 'likelihood', True, True)
    candidates = candidates.head(num_instances)
    self.addAnnotationQueries('low_likelihood', 'low', candidates)
    return [int(i) for i in candidates.index.values]
def queryUncertainInstances(self, drop_instances, num_instances):
    """Queue the num_instances instances with the highest entropy.

    The selected instances are registered as 'uncertain' annotation
    queries; their integer ids are returned.
    """
    if num_instances == 0:
        return []
    candidates = self.getSelectedInstancesDataframe(drop_instances)
    # Highest entropy first (descending, in-place sort).
    matrix_tools.sort_data_frame(candidates, 'entropy', False, True)
    candidates = candidates.head(num_instances)
    self.addAnnotationQueries('uncertain', 'low', candidates)
    return [int(i) for i in candidates.index.values]
def _compute_features_scoring_ranking(self):
    """Build self.features_scores and set each feature's rank per
    scoring function.

    Fix: the original iterated with ``enumerate`` but never used the
    index; the dict is now built with a comprehension.
    """
    # Store values / pvalues.
    self.features_scores = {
        feature_id: FeatureScoring(feature_id, self.scores,
                                   self.scoring_func)
        for feature_id in self.instances.features.ids
    }
    # Store ranks: sort self.scores in place by each scoring function
    # (descending) and record the resulting position of every feature.
    for func, _ in self.scoring_func:
        matrix_tools.sort_data_frame(self.scores, func, False, True)
        for rank, feature_id in enumerate(self.scores.index.values):
            self.features_scores[feature_id].set_rank(func, rank)
def ndcg(ground_truth, scores, pos_label=1):
    """Discounted cumulative gain of the positive instances, normalized.

    Instances are ranked by decreasing score; each positive instance
    at rank r contributes 2**-r. The result is divided by the sum of
    2**-i over all ranks.

    NOTE(review): the normalizer sums over every rank, not only over
    the number of positives — confirm this is the intended definition.
    """
    df = pd.DataFrame({'scores': scores,
                       'ground_truth': ground_truth,
                       'index': [0] * len(scores)})
    # Rank by decreasing score; 'index' becomes the rank (0 = best).
    matrix_tools.sort_data_frame(df, 'scores', False, True)
    df.loc[:, 'index'] = range(len(scores))
    positives = df.loc[df.loc[:, 'ground_truth'] == pos_label, :]
    score = sum(pow(2, -row['index']) for _, row in positives.iterrows())
    ideal_score = sum(pow(2, -i) for i in range(len(scores)))
    return score / ideal_score
def generateQueries(self):
    """Queue annotation queries for the instances whose predicted
    probability lies in [proba_min, proba_max], preferring those
    closest to the 0.5 decision boundary."""
    unsure_df = matrix_tools.extract_rows_with_thresholds(
        self.predictions, self.proba_min, self.proba_max,
        'predicted_proba', deepcopy=True)
    # Replace the probability by its distance to the boundary,
    # then sort so that the most uncertain instances come first.
    unsure_df['predicted_proba'] = abs(unsure_df['predicted_proba'] - 0.5)
    matrix_tools.sort_data_frame(unsure_df, 'predicted_proba', True, True)
    if self.num_annotations is not None:
        if len(unsure_df) > self.num_annotations:
            unsure_df = unsure_df.head(n=self.num_annotations)
    for instance_id, row in unsure_df.iterrows():
        self.annotation_queries.append(
            self.generateQuery(instance_id, row['predicted_proba'],
                               None, None))
def getFamiliesBarplot(annotations_id, iteration, label):
    """Return (as JSON) a bar plot of the number of instances per
    family for the given annotations and label.

    iteration is either the string 'None' or an integer-valued string
    giving the maximum iteration to consider.
    """
    iter_max = None if iteration == 'None' else int(iteration)
    family_counts = annotations_db_tools.getFamiliesCounts(
        session, annotations_id, iter_max=iter_max, label=label)
    families = list(family_counts.keys())
    df = pd.DataFrame({'families': families,
                       'counts': [family_counts[f] for f in families]})
    # Alphabetical order on the x axis.
    matrix_tools.sort_data_frame(df, 'families', ascending=True,
                                 inplace=True)
    barplot = BarPlot(list(df['families']))
    dataset = PlotDataset(list(df['counts']), 'Num. Instances')
    dataset.set_color(colors_tools.get_label_color(label))
    barplot.add_dataset(dataset)
    return jsonify(barplot.to_json())
def extractAlerts(self, predictions_monitoring):
    """Return the predictions above the detection threshold, sorted
    by decreasing predicted probability."""
    threshold = self.alerts_conf.detection_threshold
    alerts = matrix_tools.extract_rows_with_thresholds(
        predictions_monitoring.predictions, threshold, None,
        'predicted_proba')
    # inplace=False: sort_data_frame returns the sorted frame.
    return matrix_tools.sort_data_frame(alerts, 'predicted_proba',
                                        False, False)
def generateQueries(self):
    """Queue annotation queries combining the distance to the decision
    boundary and the neighbours score, weighted by self.delta."""
    predicted_scores = self.predictions['scores']
    if len(predicted_scores) == 0:
        return
    # Normalize |score| to [0, 1] by the largest absolute score.
    abs_scores = abs(predicted_scores)
    boundary_scores = abs_scores / max(abs_scores)
    neighbours_scores = self.computeNeighboursScores()
    # Convex combination of the two criteria.
    global_scores = (self.delta * boundary_scores
                     + (1 - self.delta) * neighbours_scores)
    queries_df = pd.DataFrame(
        {'scores': predicted_scores,
         'boundary_scores': boundary_scores,
         'neighbours_scores': neighbours_scores,
         'global_scores': global_scores},
        index=self.predictions.index)
    # Smallest global score first; keep the top num_annotations.
    matrix_tools.sort_data_frame(queries_df, 'global_scores', True, True)
    for index, row in queries_df.head(n=self.num_annotations).iterrows():
        self.annotation_queries.append(
            self.generateQuery(index, row['scores'], None, None))
def getSortedFeatures(experiment_id, criterion):
    """Return (as JSON) the features of an experiment sorted by the
    given criterion, with formatted values and p-values.

    criterion is a column of scores.csv, or one of the special values
    'alphabet' (alphabetical order, no values) and 'null_variance'
    (only the features whose variance is zero).

    Fix: removed a dead ``values = None`` assignment in the 'alphabet'
    branch (the response hard-codes None).
    """
    exp = updateCurrentExperiment(experiment_id)
    scoring_filename = path.join(exp.output_dir(), 'scores.csv')
    scores = pd.read_csv(scoring_filename, header=0, index_col=0)
    pvalues = None
    if criterion == 'alphabet':
        features = sorted(scores.index.values.tolist())
        user_ids = get_feature_user_ids(session, features)
        return jsonify({'features': features,
                        'values': None,
                        'pvalues': None,
                        'user_ids': user_ids})
    if criterion == 'null_variance':
        selection = scores.loc[:, 'variance'] == 0
        scores = scores.loc[selection, :]
        criterion = 'variance'
    else:
        matrix_tools.sort_data_frame(scores, criterion, False, True)
    features = scores.index.values.tolist()
    values = ['%.2f' % v for v in scores[criterion].tolist()]
    pvalues_col = '_'.join([criterion, 'pvalues'])
    if pvalues_col in scores.columns:
        pvalues = ['%.2E' % Decimal(v)
                   for v in scores[pvalues_col].tolist()]
    user_ids = get_feature_user_ids(session, features)
    return jsonify({'features': features,
                    'values': values,
                    'pvalues': pvalues,
                    'user_ids': user_ids})
def generateFamiliesScoresTables(self, classifier=None):
    """Build, for each family, the table of its instances sorted by
    increasing classifier score.

    With classifier=None, returns a dict with one list of tables per
    classifier ('lr' and 'nb'); otherwise returns the list for the
    given classifier.

    Fix: the original guard tested ``self.scores[col].shape[0] > 0``
    (the length of the whole prediction column) instead of whether any
    row is predicted as the current family; the guard now checks the
    boolean mask itself.
    """
    if classifier is None:
        return {'lr': self.generateFamiliesScoresTables('lr'),
                'nb': self.generateFamiliesScoresTables('nb')}
    prediction_col = classifier + '_prediction'
    score_col = classifier + '_score'
    families_scores = []
    for family in self.lr_class_labels:
        selection = self.scores[prediction_col] == family
        if selection.any():
            # inplace=False: keep self.scores untouched.
            family_scores = matrix_tools.sort_data_frame(
                self.scores.loc[selection], score_col, True, False)
        else:
            family_scores = pd.DataFrame(
                columns=self.scores.columns.values)
        families_scores.append(family_scores)
    return families_scores
def sortInstances(self):
    """Reorder self.instances_ids and self.distances jointly by
    increasing distance."""
    ids_as_str = [str(instance_id) for instance_id in self.instances_ids]
    df = pd.DataFrame({'distance': self.distances}, index=ids_as_str)
    matrix_tools.sort_data_frame(df, 'distance', True, True)
    self.instances_ids = [int(i) for i in df.index.values]
    self.distances = df.distance.tolist()
def display(self, directory):
    """Write the model coefficients to model_coefficients.csv in the
    given directory, sorted by decreasing mean absolute value.

    Side effect: self.coef_summary is sorted in place.
    """
    matrix_tools.sort_data_frame(self.coef_summary, 'abs_mean',
                                 False, True)
    output_filename = path.join(directory, 'model_coefficients.csv')
    with open(output_filename, 'w') as f:
        self.coef_summary.to_csv(f, index_label='feature')
def finalComputations(self):
    # Sort self.predictions in place by increasing predicted
    # probability (ascending=True, inplace=True).
    matrix_tools.sort_data_frame(
        self.predictions, 'predicted_proba', True, True)
def generateQueriesFromScores(self):
    """Select the instances to annotate from the lr/nb scores tables.

    If there are fewer annotation queries than families, the instances
    with the lowest score of one classifier (alternating between 'lr'
    and 'nb' across iterations) are queried. Otherwise, families are
    visited round-robin, alternating uncertain ('lr') and anomalous
    ('nb') picks per family, falling back to the most likely instance
    according to the logistic regression when a family has no
    candidate left.

    Fix: ``DataFrame.set_value`` is deprecated and was removed in
    pandas 1.0; the supported scalar setter ``.at[row, col] = value``
    is used instead. Logic is otherwise unchanged.
    """
    assert np.array_equal(self.lr_class_labels, self.nb_class_labels)
    lr_predicted_proba_df = self.generateLrPredictedProbaDataFrame()
    num_families = len(self.lr_class_labels)
    self.annotation_queries = []
    # There are fewer annotation queries than the number of families
    if self.num_annotations <= num_families:
        # Alternate between the two classifiers across iterations.
        if self.iteration.iteration_number % 2 == 0:
            classifier = 'lr'
        else:
            classifier = 'nb'
        matrix_tools.sort_data_frame(self.scores,
                                     classifier + '_score', True, True)
        selected_instances = \
            self.scores.index.tolist()[:self.num_annotations]
        for instance_id in selected_instances:
            query = self.generateQuery(instance_id, 0, None, None)
            self.annotation_queries.append(query)
        return
    # Otherwise: round-robin over the families.
    num_uncertain = [0] * num_families
    num_anomalous = [0] * num_families
    families_scores = self.generateFamiliesScoresTables()
    num_annotations = 0
    stop = False
    selected_instances = []
    while not stop:
        for i, family in enumerate(list(self.lr_class_labels)):
            # Balance uncertain (lr) and anomalous (nb) picks.
            if num_uncertain[i] <= num_anomalous[i]:
                classifier = 'lr'
                num_uncertain[i] += 1
            else:
                classifier = 'nb'
                num_anomalous[i] += 1
            scores = families_scores[classifier][i]
            selected_rows = scores.loc[scores['queried'] == False]  # noqa: E712
            if len(selected_rows) > 0:
                query = selected_rows.index.tolist()[0]
            else:
                # No anomalous or uncertain instances available for
                # annotation. Select the most likely instance according
                # to the logistic regression output.
                self.conf.logger.debug(
                    family + ': no anomalous, no uncertain instances')
                selected_rows = lr_predicted_proba_df.loc[
                    lr_predicted_proba_df['queried'] == False]  # noqa: E712
                selected_rows = matrix_tools.sort_data_frame(
                    selected_rows, family, False, False)
                selection = selected_rows.index.tolist()
                # Break condition - there is no instance left in the
                # unlabelled pool.
                if len(selection) == 0:
                    stop = True
                    break
                else:
                    query = selection[0]
            # Add annotation query and set queried = True
            num_annotations += 1
            selected_instances.append(query)
            for c in ['nb', 'lr']:
                predicted_class = self.scores.loc[query,
                                                  c + '_prediction']
                predicted_class_index = np.where(
                    self.lr_class_labels == predicted_class)[0][0]
                families_scores[c][predicted_class_index].at[
                    query, 'queried'] = True
            self.scores.at[query, 'queried'] = True
            lr_predicted_proba_df.at[query, 'queried'] = True
            # Break condition - self.num_annotations instances have
            # been queried.
            if num_annotations >= self.num_annotations:
                stop = True
                break
    for instance_id in selected_instances:
        query = self.generateQuery(instance_id, 0, None, None)
        self.annotation_queries.append(query)