Пример #1
0
 def queryLowLikelihoodInstances(self, drop_instances, num_instances):
     """Register and return the ``num_instances`` first low-likelihood queries.

     The candidate frame comes from ``getSelectedInstancesDataframe`` and is
     sorted on its ``likelihood`` column before the head is taken.

     Args:
         drop_instances: forwarded unchanged to
             ``getSelectedInstancesDataframe``.
         num_instances: how many instances to query; ``0`` short-circuits.

     Returns:
         The selected instance ids (frame index values) converted to ``int``.
     """
     if num_instances == 0:
         return []
     candidates = self.getSelectedInstancesDataframe(drop_instances)
     # Trailing booleans are presumably (ascending=True, inplace=True) --
     # confirm against matrix_tools.sortDataFrame. The result is not
     # captured, so the frame is expected to be reordered in place.
     matrix_tools.sortDataFrame(candidates, 'likelihood', True, True)
     selected = candidates.head(num_instances)
     self.addAnnotationQueries('low_likelihood', 'low', selected)
     return [int(instance_id)
             for instance_id in selected.index.values.tolist()]
Пример #2
0
 def queryUncertainInstances(self, drop_instances, num_instances):
     """Register and return the ``num_instances`` first uncertain queries.

     The candidate frame comes from ``getSelectedInstancesDataframe`` and is
     sorted on its ``entropy`` column before the head is taken.

     Args:
         drop_instances: forwarded unchanged to
             ``getSelectedInstancesDataframe``.
         num_instances: how many instances to query; ``0`` short-circuits.

     Returns:
         The selected instance ids (frame index values) converted to ``int``.
     """
     if num_instances == 0:
         return []
     candidates = self.getSelectedInstancesDataframe(drop_instances)
     # Trailing booleans are presumably (ascending=False, inplace=True) --
     # confirm against matrix_tools.sortDataFrame. The result is not
     # captured, so the frame is expected to be reordered in place.
     matrix_tools.sortDataFrame(candidates, 'entropy', False, True)
     selected = candidates.head(num_instances)
     self.addAnnotationQueries('uncertain', 'low', selected)
     return [int(instance_id)
             for instance_id in selected.index.values.tolist()]
 def generateAnnotationQueries(self):
     """Append annotation queries for predictions inside the probability band.

     Rows of ``self.predictions`` are filtered through
     ``matrix_tools.extractRowsWithThresholds`` using ``self.proba_min`` and
     ``self.proba_max``, ordered by their distance to 0.5, optionally
     truncated to ``self.num_annotations`` rows, and turned into queries
     appended to ``self.annotation_queries``.
     """
     column = 'predicted_proba'
     candidates = matrix_tools.extractRowsWithThresholds(
         self.predictions, self.proba_min, self.proba_max, column,
         deepcopy=True)
     # The column is overwritten with |p - 0.5|, so the value handed to
     # generateAnnotationQuery below is that distance, not the raw
     # probability.
     candidates[column] = abs(candidates[column] - 0.5)
     # Presumably (ascending=True, inplace=True) -- confirm against
     # matrix_tools.sortDataFrame.
     matrix_tools.sortDataFrame(candidates, column, True, True)
     limit = self.num_annotations
     if limit is not None and len(candidates) > limit:
         candidates = candidates.head(n=limit)
     for instance_id, row in candidates.iterrows():
         query = self.generateAnnotationQuery(instance_id, row[column],
                                              None, None)
         self.annotation_queries.append(query)
Пример #4
0
def getFamiliesBarplot(experiment_id, iteration, label):
    """Return a JSON bar plot of the number of instances per family.

    Args:
        experiment_id: forwarded to ``annotations_db_tools.getFamiliesCounts``.
        iteration: upper iteration bound; the literal string ``'None'`` is
            mapped to ``None`` before it is forwarded as ``iteration_max``.
        label: label forwarded to the counts query and used to pick the
            dataset colour.

    Returns:
        Whatever ``jsonify`` produces for the bar plot's ``toJson()`` output.
    """
    iteration_max = None if iteration == 'None' else iteration
    family_counts = annotations_db_tools.getFamiliesCounts(
        session, experiment_id, iteration_max=iteration_max, label=label)

    families = list(family_counts.keys())
    counts = [family_counts[family] for family in families]
    df = pd.DataFrame({'families': families, 'counts': counts})
    # Order the bars alphabetically by family name.
    matrix_tools.sortDataFrame(df, 'families', ascending=True, inplace=True)

    barplot = BarPlot(list(df['families']))
    dataset = PlotDataset(list(df['counts']), 'Num. Instances')
    dataset.setColor(colors_tools.getLabelColor(label))
    barplot.addDataset(dataset)
    return jsonify(barplot.toJson())
Пример #5
0
 def extractAlerts(self, predictions_monitoring):
     """Return the alert rows extracted from the monitored predictions.

     Rows are selected by ``matrix_tools.extractRowsWithThresholds`` on the
     ``predicted_proba`` column, with ``alerts_conf.detection_threshold`` as
     the first bound and ``None`` as the second (presumably "at least the
     threshold, no upper bound" -- confirm against the helper), and then
     sorted on that same column.

     Args:
         predictions_monitoring: object exposing a ``predictions`` frame.

     Returns:
         The frame returned by ``matrix_tools.sortDataFrame``.
     """
     column = 'predicted_proba'
     threshold = self.alerts_conf.detection_threshold
     above_threshold = matrix_tools.extractRowsWithThresholds(
         predictions_monitoring.predictions, threshold, None, column)
     # Presumably (ascending=False, inplace=False): the sorted frame is the
     # return value here rather than an in-place side effect.
     return matrix_tools.sortDataFrame(above_threshold, column, False, False)
Пример #6
0
 def generateAnnotationQueries(self):
     """Append annotation queries ranked by a boundary/neighbour score mix.

     The global score of each instance is
     ``delta * boundary + (1 - delta) * neighbours`` where ``boundary`` is
     the absolute predicted score divided by the largest absolute score and
     ``neighbours`` comes from ``computeNeighboursScores``. The first
     ``self.num_annotations`` rows after sorting on the global score are
     turned into queries appended to ``self.annotation_queries``.

     Returns ``None`` immediately when there are no predicted scores.
     """
     raw_scores = self.predictions['scores']
     if len(raw_scores) == 0:
         return
     magnitudes = abs(raw_scores)
     # Normalise by the largest magnitude.
     boundary = magnitudes / max(magnitudes)
     neighbours = self.computeNeighboursScores()
     combined = self.delta * boundary + (1 - self.delta) * neighbours
     columns = {
         'scores': raw_scores,
         'boundary_scores': boundary,
         'neighbours_scores': neighbours,
         'global_scores': combined,
     }
     ranking = pd.DataFrame(data=columns, index=self.predictions.index)
     # Presumably (ascending=True, inplace=True) -- confirm against
     # matrix_tools.sortDataFrame.
     matrix_tools.sortDataFrame(ranking, 'global_scores', True, True)
     ranking = ranking.head(n=self.num_annotations)
     for instance_id, row in ranking.iterrows():
         query = self.generateAnnotationQuery(instance_id, row['scores'],
                                              None, None)
         self.annotation_queries.append(query)
Пример #7
0
 def generateFamiliesScoresTables(self, classifier=None):
     """Build one score table per family for a classifier.

     Args:
         classifier: ``'lr'`` or ``'nb'``; when ``None`` the tables of both
             classifiers are built.

     Returns:
         With a classifier: a list with one frame per entry of
         ``self.lr_class_labels``, holding the rows of ``self.scores`` whose
         ``<classifier>_prediction`` equals that family, sorted on
         ``<classifier>_score``. When ``self.scores`` has no rows, each
         entry is an empty frame with the same columns.
         Without a classifier: ``{'lr': [...], 'nb': [...]}``.
     """
     if classifier is None:
         return {name: self.generateFamiliesScoresTables(name)
                 for name in ('lr', 'nb')}

     tables = []
     for family in list(self.lr_class_labels):
         predictions = self.scores[classifier + '_prediction']
         if predictions.shape[0] > 0:
             family_rows = self.scores.loc[
                 self.scores[classifier + '_prediction'] == family]
             # Presumably (ascending=True, inplace=False): the sorted frame
             # is taken from the return value.
             table = matrix_tools.sortDataFrame(
                 family_rows, classifier + '_score', True, False)
         else:
             table = pd.DataFrame(columns=self.scores.columns.values)
         tables.append(table)
     return tables
Пример #8
0
    def generateQueriesFromScores(self):
        """Fill ``self.annotation_queries`` from the lr / nb score tables.

        Two regimes:

        * ``num_annotations <= number of families``: ``self.scores`` is
          sorted on the score of a single classifier (``lr`` on even
          iteration numbers, ``nb`` on odd ones) and the first
          ``num_annotations`` instances are queried.
        * otherwise: families are visited round-robin, alternating between
          the ``lr`` and ``nb`` tables of each family, and the first
          not-yet-queried instance of the table is picked. When a table is
          exhausted, the not-yet-queried instance ranked first on the
          family's column of the lr predicted-probability frame is used
          instead. Selection stops once ``num_annotations`` instances have
          been picked or no unqueried instance is left.

        Side effects: resets ``self.annotation_queries`` and sets the
        ``queried`` flag to ``True`` in ``self.scores``, in the per-family
        tables and in the lr predicted-probability frame.

        Raises:
            AssertionError: if the lr and nb class labels differ.
        """
        assert(np.array_equal(self.lr_class_labels, self.nb_class_labels))
        lr_predicted_proba_df = self.generateLrPredictedProbaDataFrame()
        num_families = len(self.lr_class_labels)
        self.annotation_queries = []

        # There are fewer annotation queries than the number of families
        if self.num_annotations <= num_families:
            if self.iteration.iteration_number % 2 == 0:
                classifier = 'lr'
            else:
                classifier = 'nb'
            matrix_tools.sortDataFrame(
                self.scores, classifier + '_score', True, True)
            selected_instances = self.scores.index.tolist()[
                :self.num_annotations]
            for instance_id in selected_instances:
                query = self.generateAnnotationQuery(
                    instance_id, 0, None, None)
                self.annotation_queries.append(query)
            return

        # Without any family the round-robin loop below would never run its
        # body and therefore never reach a stop condition.
        if num_families == 0:
            return

        # Otherwise
        num_uncertain = [0] * num_families
        num_anomalous = [0] * num_families
        families_scores = self.generateFamiliesScoresTables()
        num_annotations = 0
        stop = False
        selected_instances = []
        while not stop:
            for i, family in enumerate(list(self.lr_class_labels)):
                # Alternate between the two classifiers for each family.
                if num_uncertain[i] <= num_anomalous[i]:
                    classifier = 'lr'
                    num_uncertain[i] += 1
                else:
                    classifier = 'nb'
                    num_anomalous[i] += 1
                scores = families_scores[classifier][i]
                selected_rows = scores.loc[scores['queried'] == False]
                if len(selected_rows) > 0:
                    query = selected_rows.index.tolist()[0]
                else:
                    # No anomalous or uncertain instances available for annotation
                    # Select the most likely instance according to the logistic regression output
                    self.conf.logger.debug(
                        family + ': no anomalous, no uncertain instances')
                    selected_rows = lr_predicted_proba_df.loc[
                        lr_predicted_proba_df['queried'] == False]
                    selected_rows = matrix_tools.sortDataFrame(
                        selected_rows, family, False, False)
                    selection = selected_rows.index.tolist()
                    # Break condition - There is no instance left in the unlabelled pool
                    if len(selection) == 0:
                        stop = True
                        break
                    query = selection[0]
                # Add annotation query and set queried = True
                num_annotations += 1
                selected_instances.append(query)
                for c in ['nb', 'lr']:
                    predicted_class = self.scores.loc[query, c + '_prediction']
                    predicted_class_index = np.where(
                        self.lr_class_labels == predicted_class)[0][0]
                    # DataFrame.set_value no longer exists in pandas >= 1.0;
                    # .at is the scalar label-based setter replacing it.
                    families_scores[c][predicted_class_index].at[
                        query, 'queried'] = True
                self.scores.at[query, 'queried'] = True
                lr_predicted_proba_df.at[query, 'queried'] = True
                # Break condition - self.num_annotations instances have been queried
                if num_annotations >= self.num_annotations:
                    stop = True
                    break
        for instance_id in selected_instances:
            query = self.generateAnnotationQuery(instance_id, 0, None, None)
            self.annotation_queries.append(query)
Пример #9
0
 def display(self, directory):
     """Write the coefficient summary to ``model_coefficients.csv``.

     Args:
         directory: folder in which the CSV file is created (overwritten if
             it already exists).

     ``self.coef_summary`` is sorted on ``abs_mean`` as a side effect before
     being dumped, with the index column labelled ``feature``.
     """
     csv_path = path.join(directory, 'model_coefficients.csv')
     with open(csv_path, 'w') as csv_file:
         # Presumably (ascending=False, inplace=True) -- confirm against
         # matrix_tools.sortDataFrame.
         matrix_tools.sortDataFrame(self.coef_summary, 'abs_mean', False,
                                    True)
         self.coef_summary.to_csv(csv_file, index_label='feature')
Пример #10
0
 def sortInstances(self):
     """Reorder ``instances_ids`` and ``distances`` together by distance.

     Both attributes are rewritten: ``instances_ids`` as a list of ``int``
     and ``distances`` as a plain list, in the order produced by sorting on
     the ``distance`` column.
     """
     data = {'distance': self.distances}
     # Ids go through str for the index and back through int afterwards.
     labels = [str(instance_id) for instance_id in self.instances_ids]
     by_distance = pd.DataFrame(data, index=labels)
     # Presumably (ascending=True, inplace=True) -- confirm against
     # matrix_tools.sortDataFrame.
     matrix_tools.sortDataFrame(by_distance, 'distance', True, True)
     self.instances_ids = [int(label)
                           for label in by_distance.index.values.tolist()]
     self.distances = by_distance['distance'].tolist()
Пример #11
0
 def finalComputations(self):
     """Sort ``self.predictions`` on its ``predicted_proba`` column."""
     # The two trailing booleans are presumably ``ascending`` and ``inplace``
     # (TODO confirm against matrix_tools.sortDataFrame). The return value
     # is discarded, so the call is relied on to reorder the frame itself.
     matrix_tools.sortDataFrame(self.predictions, 'predicted_proba', True,
                                True)