def run(self):
    """Run the whole pipeline: ingest logs, map scores to severities, save.

    Steps, in order:
      1. ``initialize()`` and reset ``self.dfs``.
      2. Call ``ingestLogs()`` when ``self.logPaths`` is still ``None``.
      3. Check that every feature/label has a ``_<name>`` handler on this
         object; ``Msg.abort`` is called for each one that is missing.
      4. Build one data frame per feature and per label through its handler.
      5. Replace each label's score column with the severity returned by
         ``Cvss.getSeverityV2`` (when ``cvssVersion == 2``) or
         ``Cvss.getSeverityV3`` (any other version).
      6. Save every feature, call ``saveData()`` and, if ``scaleDataFlag``
         is set, ``scaleData()``.

    Returns:
        Always ``0``.
    """
    self.initialize()
    self.dfs = {}
    if self.logPaths is None:
        self.ingestLogs()

    # Verify the handlers up front so a missing one is reported before any
    # log is parsed.
    contexts = list(self.features.keys()) + list(self.labels.keys())
    for context in contexts:
        handlerName = "_{0}".format(context)
        if handlerName not in dir(self):
            Msg.abort(
                "Missing expected class method handler for {0}".format(
                    context))

    for feature in self.features:
        handler = getattr(self, "_{0}".format(feature))
        self.dfs[feature] = handler(self.logPaths["features"][feature])

    for label in self.labels:
        handler = getattr(self, "_{0}".format(label))
        labelDf = handler(self.logPaths["labels"][label])
        self.dfs[label] = labelDf
        scoreColumn = self.labelMap[label]
        if self.cvssVersion == 2:
            toSeverity = Cvss.getSeverityV2
        else:
            toSeverity = Cvss.getSeverityV3
        labelDf[scoreColumn] = labelDf[scoreColumn].apply(toSeverity)

    for feature in self.features:
        self.saveDataFeature(feature, self.dfs[feature])
    self.saveData()

    if self.scaleDataFlag:
        self.scaleData()
    return 0
def run(self):
    """Turn the score CSV at ``self.inputDataPath`` into an Excel workbook.

    The workbook written to ``self.outputDataPath`` has three sheets:

    * ``Data``    - a running row counter in column ``K`` followed by one
                    column per CSV column, in sorted column-name order.
    * ``Chart``   - a scatter chart with one series per data column,
                    plotted against the ``K`` counter.
    * ``Summary`` - mean, median, std, var, min and max per column.

    The chart ranges are derived from the number of rows actually written,
    and one series is added for every data column, so the chart follows
    the size of the input instead of a fixed 100-row / 3-column window.
    The parsed frame is kept on ``self.df`` and echoed via ``Msg.raw``.
    """
    self.df = pd.read_csv(self.inputDataPath)

    wb = xlsxwriter.Workbook(self.outputDataPath)
    fmtBold = wb.add_format({'bold': True})
    wsSummary = wb.add_worksheet("Summary")
    wsData = wb.add_worksheet("Data")
    wsChart = wb.add_worksheet("Chart")
    wsSummary.set_column("A:I", 18)
    wsData.set_column("A:I", 18)

    classifiers = sorted(self.df.columns)
    header = ["K"] + classifiers

    # --- Data sheet: bold header row, then one row per CSV record.
    for col, title in enumerate(header):
        wsData.write(0, col, title, fmtBold)
    beginRow = 1
    endRow = beginRow  # exclusive: first row after the last data row
    for _, record in self.df.iterrows():
        wsData.write(endRow, 0, endRow)
        for col, classifier in enumerate(classifiers, 1):
            wsData.write(endRow, col, record[classifier])
        endRow += 1

    # --- Chart sheet: ranges use xlsxwriter's list form
    # [sheet, first_row, first_col, last_row, last_col] (zero-indexed).
    chart = wb.add_chart({"type": "scatter"})
    if endRow > beginRow:
        lastRow = endRow - 1
        categories = ["Data", beginRow, 0, lastRow, 0]
        for col in range(1, len(header)):
            chart.add_series({
                "name": ["Data", 0, col],
                "categories": categories,
                "values": ["Data", beginRow, col, lastRow, col],
            })
    # xlsxwriter option keys are lowercase; "Name" would be ignored.
    chart.set_x_axis({"name": "KFold"})
    chart.set_y_axis({"name": "Score"})
    wsChart.insert_chart("A1", chart)

    # --- Summary sheet: one row of statistics per data column.
    statNames = ["Classifier", "Mean", "Median", "Std", "Var", "Min", "Max"]
    for col, title in enumerate(statNames):
        wsSummary.write(0, col, title, fmtBold)
    for rowIdx, classifier in enumerate(classifiers, 1):
        scores = self.df[classifier]
        # Number.asFloat(value, 3) presumably limits the value to three
        # decimals - confirm against the helper.
        summary = [
            Number.asFloat(np.mean(scores), 3),
            Number.asFloat(np.median(scores), 3),
            Number.asFloat(np.std(scores), 3),
            Number.asFloat(np.var(scores), 3),
            np.min(scores),
            np.max(scores),
        ]
        wsSummary.write(rowIdx, 0, classifier, fmtBold)
        for col, value in enumerate(summary, 1):
            wsSummary.write(rowIdx, col, value)

    wb.close()
    Msg.raw(self.df)
def scaleData(self):
    """Rescale the feature columns of every CSV listed in ``self.paths``.

    Each file is read, its columns from index 3 onward are passed through
    a ``RobustScaler`` (refit per file), and the result is written back to
    the same path. Column 0 (its name stripped of surrounding whitespace)
    and columns 1-2 are copied through unscaled.
    """
    scaler = preprocessing.RobustScaler()
    for path in self.paths:
        dfData = pd.read_csv(path)

        header = dfData.columns.values
        labelName = header[0].strip()
        joinNames = header[1:3]
        featureNames = header[3:]

        orderedNames = [labelName]
        orderedNames.extend(joinNames.tolist())
        orderedNames.extend(featureNames.tolist())

        # Rebuild the frame column group by column group so only the
        # feature block is replaced by its scaled values.
        df = pd.DataFrame(columns=orderedNames)
        df[labelName] = dfData[labelName].values
        df[joinNames] = dfData[joinNames].values
        df[featureNames] = scaler.fit_transform(dfData[featureNames])

        Msg.show("Rescaling {0}".format(path))
        df.to_csv(path, index=False, float_format='%.2f')
def build(self, inputDataPath):
    """Cross-validate every configured classifier on one data set.

    Args:
        inputDataPath: path handed to ``self.ingestData``, which is
            expected to populate ``self.dfFeatures`` and ``self.dfLabel``.

    Returns:
        dict mapping classifier name to the list of per-fold scores
        returned by ``cross_val_score``. Classifiers are evaluated in
        sorted name order.
    """
    Msg.raw("Analyzing <-> {0}".format(inputDataPath))
    self.ingestData(inputDataPath)

    folds = KFold(n_splits=self.kFolds,
                  shuffle=self.randomizeDataFlag,
                  random_state=None)
    samples = self.dfFeatures.values
    targets = self.dfLabel.values.astype(dtype="int64")

    scoreKeeper = {}
    for name, classifier in sorted(self.classifiers.items()):
        foldScores = cross_val_score(classifier, samples, targets, cv=folds)
        scoreKeeper.setdefault(name, []).extend(foldScores)
    return scoreKeeper
def saveDataFeature(self, featureName, featureDf):
    """Join one feature frame with every label frame and save each as CSV.

    Args:
        featureName: feature key; used for the alias lookup and the
            output file name.
        featureDf: data frame for that feature. Its column names are
            replaced in place with the output of ``_prepareColumns``.

    For each label, a deep copy of ``self.dfs[label]`` gets its columns
    renamed the same way, is inner-joined with the feature frame on
    ``sessionId``/``projectName``, and is written to
    ``<outputDir>/<label>-<feature>.csv``. Every written path is appended
    to ``self.paths``.
    """
    joinCols = ["sessionId", "projectName"]
    featureDf.columns = self._prepareColumns(
        featureDf.columns.values.tolist(),
        self.aliases[featureName],
        joinCols)

    for labelName in self.labels:
        csvPath = "{0}/{1}-{2}.csv".format(
            self.outputDir, labelName, featureName)

        # Work on a copy so the stored label frame keeps its column names.
        labelDf = self.dfs[labelName].copy(deep=True)
        labelDf.columns = self._prepareColumns(
            labelDf.columns.values.tolist(),
            self.aliases[labelName],
            joinCols)

        merged = labelDf.merge(featureDf, how="inner", on=joinCols)
        Msg.show("Saving {0}/{1} -> {2}".format(
            labelName, featureName, csvPath))
        self.paths.append(csvPath)
        merged.to_csv(csvPath, index=False, float_format='%.2f')
def initialize(self):
    """Load the JSON configuration and prepare per-run state.

    Reads ``self.cfgPath`` and stores its ``features``, ``labels`` and
    ``aliases`` entries on the instance. ``self.labelMap`` maps each label
    to the first element of its configured list. The output directory is
    created via ``Dir.make`` and ``self.paths`` is reset to an empty list.
    """
    Msg.show("Initializing")
    Msg.show("CVSS scoring version: {0}".format(self.cvssVersion))
    Msg.show("Scale data: {0}".format(self.scaleDataFlag))

    cfg = File.read(path=self.cfgPath, asJsonFlag=True)
    self.features = cfg["features"]
    self.labels = cfg["labels"]
    self.aliases = cfg["aliases"]
    self.labelMap = {label: self.labels[label][0] for label in self.labels}

    Dir.make(self.outputDir)
    self.paths = []

    if not self.excludeMissingDataFlag:
        Msg.show("Replacing missing data using '{0}' method".format(
            self.replaceMissingDataMethod))
    else:
        Msg.show("Excluding rows with missing data")
def ingestLogData(self, logPaths, name, attributes):
    """Read a set of JSON logs into one data frame, one row per log.

    Args:
        logPaths: iterable of log file paths, each read as JSON.
        name: label used in the progress message.
        attributes: list of keys to extract from every log.
            ``"sessionId"`` and ``"projectName"`` are inserted at the
            front when absent. NOTE: the insert happens in place, so the
            caller's list is modified (kept for backward compatibility).

    Returns:
        pandas.DataFrame built from the extracted samples. Rows with
        missing values are dropped when ``excludeMissingDataFlag`` is set;
        otherwise missing values are filled with the column median
        (``replaceMissingDataMethod == "median"``) or the column mean.
    """
    Msg.show("Ingesting {0} logs".format(name))

    # Checked in this order so that, when both are missing, the list ends
    # up as [sessionId, projectName, ...].
    for joinKey in ("projectName", "sessionId"):
        if joinKey not in attributes:
            attributes.insert(0, joinKey)

    samples = []
    for logPath in logPaths:
        results = File.read(path=logPath, asJsonFlag=True)
        # Presumably picks `attributes` out of `results`, using None for
        # absent keys - confirm against Data.getDictByKeys.
        samples.append(Data.getDictByKeys(results, attributes, None))
    df = pd.DataFrame(samples)

    if self.excludeMissingDataFlag:
        df.dropna(inplace=True)
    elif self.replaceMissingDataMethod == "median":
        # numeric_only=True: string columns (e.g. the join keys) would
        # otherwise raise TypeError on pandas >= 2.0.
        df.fillna(df.median(numeric_only=True), inplace=True)
    else:
        df.fillna(df.mean(numeric_only=True), inplace=True)
    return df
def run(self):
    """Run ``self.cycles`` rounds of cross-validation and save the scores.

    Each cycle calls ``self.build(self.inputDataPath)`` and appends the
    returned per-classifier scores to a running collection. The collected
    scores become a data frame with one column per classifier, which is
    echoed via ``Msg.raw`` and written to ``self.outputDataPath`` as CSV.
    """
    self.initialize()

    # Give every classifier its own list. Passing one shared `[]` to a
    # helper risks all keys aliasing the same list object.
    scoreKeeper = {name: [] for name in self.classifiers.keys()}

    for i in range(0, self.cycles):
        Msg.show("Cycle {0} of {1}".format(i + 1, self.cycles))
        scores = self.build(self.inputDataPath)
        for classifier, values in scores.items():
            # setdefault tolerates a name that was not pre-seeded above.
            scoreKeeper.setdefault(classifier, []).extend(values)

    df = pd.DataFrame.from_dict(scoreKeeper)
    Msg.raw(df)
    df.to_csv(self.outputDataPath, index=False, float_format='%.2f')
    Msg.show("Saved results -> {0}".format(self.outputDataPath))
def initialize(self):
    """Announce start-up and create the directory of ``outputDataPath``."""
    Msg.show("Initializing")
    outputDir = File.getDirectory(self.outputDataPath)
    Dir.make(outputDir)