def stase_metrics(df):
    """Compute STASE metrics for a given dataset.

    The two dimensions of ``df.shape`` are logged as the number of FILES
    (rows) and ANTIVIRUS (columns). Every stacked value of ``df`` is counted
    to obtain the label clusters, then per-row and per-column statistics are
    gathered through ``rows_stats`` and ``cols_stats``.

    Args:
        df: two-dimensional table exposing ``shape`` and ``stack()``
            (presumably a pandas DataFrame of labels -- confirm with callers).

    Returns:
        A 4-tuple ``(metrics, apps, avs, clusters)`` where ``metrics`` is a
        ``pd.Series`` keyed by metric name, ``apps`` is the result of
        ``rows_stats(df)``, ``avs`` is the result of ``cols_stats(df)`` and
        ``clusters`` holds the occurrence count of each stacked value.
    """
    file_count, av_count = df.shape
    logging.info("FILES: {0}, ANTIVIRUS: {1}".format(file_count, av_count))

    clusters = df.stack().value_counts()
    logging.info("clustering DONE")

    label_count = clusters.size
    logging.info("LABELS: {0}".format(label_count))

    apps = rows_stats(df)
    logging.info("rowstats computation DONE")

    avs = cols_stats(df)
    logging.info("colstats computation DONE")

    # Metrics are inserted one by one so the index keeps this exact order.
    metrics = pd.Series()

    equiponderance, equiponderance_idx = ouroboros(avs['positives'], True)
    metrics['equiponderance'] = equiponderance
    metrics['equiponderance_idx'] = equiponderance_idx

    metrics['exclusivity'] = avs['alones'].sum() / file_count
    metrics['recognition'] = apps['positives'].mean() / av_count
    metrics['synchronicity'] = np.mean(avs['overlap'])

    extra_labels = label_count - 1
    extra_av_positives = avs['positives'].sum() - 1
    metrics['genericity'] = 1 - extra_labels / extra_av_positives

    uniformity, uniformity_idx = ouroboros(clusters, True)
    metrics['uniformity'] = uniformity
    metrics['uniformity_idx'] = uniformity_idx

    # Divergence and consensuality are normalised by the same quantity.
    extra_app_positives = apps['positives'].sum() - file_count
    metrics['divergence'] = (
        (apps['distincts'].sum() - file_count) / extra_app_positives
    )
    metrics['consensuality'] = (
        (apps['max'].sum() - file_count) / extra_app_positives
    )

    known_resemblance = apps['resemblance'].dropna()
    metrics['resemblance'] = np.mean(known_resemblance)

    metrics['avs'] = av_count
    metrics['apps'] = file_count
    metrics['labels'] = label_count
    logging.info("metrics computation DONE")

    return metrics, apps, avs, clusters
def test_against_gini(self):
    """Print ouroboros next to the Gini value for every test case.

    For each entry of ``self.cases`` the array is passed to ``ouroboros``
    and to ``self._gini``; both results are printed on one row, formatted
    with two decimals. Nothing is asserted here.
    """
    header_template = "{0:<15} || {1:<s} | {2:<4s}"
    row_template = "{0:<15} || {1:<4.2f} | {2:<4.2f}"

    print(header_template.format("ARRAY", "OURO", "GINI"))
    for values, _expected_index, _expected_indice in self.cases:
        ouroboros_value = ouroboros(values)
        gini_value = self._gini(values)
        # Render the array as a compact comma-separated list.
        rendered = ','.join(map(str, values))
        print(row_template.format(rendered, ouroboros_value, gini_value))
def test_standalone(self):
    """Check ``ouroboros(array, True)`` against every expected pair.

    Each entry of ``self.cases`` is an ``(array, index, indice)`` triple.
    The first returned value must match ``index`` to 2 decimal places and
    the second must equal ``indice`` exactly.
    """
    for values, expected_index, expected_indice in self.cases:
        failure_note = "Array: {0}".format(values)
        actual_index, actual_indice = ouroboros(values, True)
        self.assertAlmostEqual(
            actual_index, expected_index, places=2, msg=failure_note
        )
        self.assertEqual(actual_indice, expected_indice, msg=failure_note)