def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag): from sklearn import cluster import plotter clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=13) else: clusteringType = cluster.AgglomerativeClustering(n_clusters=13) print "Performing experiemnt # 3: Clustering score into two clusters " versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions # print "Sanitized versions that will be used in study ", len(sanitizedVersions) # print "Sanitized versions ..." , sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions) # print "zzzz", len(NonZero_sanitizedVersionsWithScore) ### dyumping scores ... brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1] # print "lalalaa ", onlyTheNonZeroSanitizedVScores # strOfScoresToDump="" # for elem in onlyTheNonZeroSanitizedVScores: # strOfScoresToDump = strOfScoresToDump + str(elem) + "," + "\n" ### # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump) reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType.fit(reshapedNonZerSanitizedScores) labelsFroVersions = clusteringType.labels_ if clusterFlag: centroids = clusteringType.cluster_centers_ print "And the centroids are .... 
", centroids NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions ) ##### plotting clusters start # low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore) # low_cluster_x = [ 22.35294118 for x in low_cluster_y] # hig_cluster_x = [ 50.82030058 for x in high_cluster_y] # plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y) ##### plottign clusters end else: print "No centroids for Aggolomerative clustering" NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions ) print "And the labels are .... " print len(labelsFroVersions) cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores) silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels) print "Silhouette average---> ", silhouette_avg ############################## themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv" IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels, False)
def experiemnt_select_classifier(dbFileName, meanFlag, outputStrParam, clusterFlag, scoreTypeParam): from sklearn import cluster from sklearn.metrics import silhouette_samples, silhouette_score, v_measure_score, adjusted_mutual_info_score, completeness_score scoreListToret = [] print "Performing experiemnt: Select Classifier" clusters = [x for x in xrange(100) if x > 1] #clusters=[2] for clsuter_cnt in clusters: print "this is iteration #", clsuter_cnt versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard( dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions( versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions( sanitizedVersions) #print "zzzz", len(NonZero_sanitizedVersionsWithScore) brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[ 0], brokenDict[1] #print "lalalaa ", onlyTheNonZeroSanitizedVScores reshapedNonZerSanitizedScores = np.reshape( onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=clsuter_cnt) else: clusteringType = cluster.AgglomerativeClustering( n_clusters=clsuter_cnt) cluster_labels = clusteringType.fit_predict( reshapedNonZerSanitizedScores) scores = 0 if scoreTypeParam == 0: scores = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels) score_type = 'Silhouette' # elif scoreTypeParam==1: # scores = v_measure_score(reshapedNonZerSanitizedScores, cluster_labels) # score_type='V-measure' # elif scoreTypeParam==2: # scores = adjusted_mutual_info_score(reshapedNonZerSanitizedScores, cluster_labels) # score_type='adjusted mutual info' # elif scoreTypeParam==3: # scores = completeness_score(reshapedNonZerSanitizedScores, cluster_labels) # score_type='completeness' score_combo = (clsuter_cnt, scores) 
scoreListToret.append(score_combo) print "::::: score_type={}, For n_clusters ={}, The clustering _score is ={} :::::".format( score_type, clsuter_cnt, scores) return scoreListToret
def experiemnt_select_classifier(dbFileName, meanFlag, outputStrParam, clusterFlag, scoreTypeParam): from sklearn import cluster from sklearn.metrics import ( silhouette_samples, silhouette_score, v_measure_score, adjusted_mutual_info_score, completeness_score, ) scoreListToret = [] print "Performing experiemnt: Select Classifier" clusters = [x for x in xrange(100) if x > 1] # clusters=[2] for clsuter_cnt in clusters: print "this is iteration #", clsuter_cnt versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions( sanitizedVersions ) # print "zzzz", len(NonZero_sanitizedVersionsWithScore) brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1] # print "lalalaa ", onlyTheNonZeroSanitizedVScores reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=clsuter_cnt) else: clusteringType = cluster.AgglomerativeClustering(n_clusters=clsuter_cnt) cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores) scores = 0 if scoreTypeParam == 0: scores = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels) score_type = "Silhouette" # elif scoreTypeParam==1: # scores = v_measure_score(reshapedNonZerSanitizedScores, cluster_labels) # score_type='V-measure' # elif scoreTypeParam==2: # scores = adjusted_mutual_info_score(reshapedNonZerSanitizedScores, cluster_labels) # score_type='adjusted mutual info' # elif scoreTypeParam==3: # scores = completeness_score(reshapedNonZerSanitizedScores, cluster_labels) # score_type='completeness' score_combo = (clsuter_cnt, scores) 
scoreListToret.append(score_combo) print "::::: score_type={}, For n_clusters ={}, The clustering _score is ={} :::::".format( score_type, clsuter_cnt, scores ) return scoreListToret
def experiemnt_one(dbFileName, meanFlag, outputStrParam): print "Performing experiment # 1" #import correlation as corr_ versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions( versionAndCodeQualityDict, meanFlag) print "Sanitized versions that will be used in study ", len( sanitizedVersions) #print "Sanitized versions ..." , sanitizedVersions sanitizedVersionsWithScore = sanityCheck.getVulnerbailityScoreOfSelectedVersions( sanitizedVersions) ''' Stats on risk score-->len=721, median=51.1111111111, mean=38.0255199862, max=53.3333333333, min=0.0, ''' riskStatus = sanityCheck.getVulnerbailityScoreStatus( sanitizedVersionsWithScore) if meanFlag: threshold = riskStatus[0] ## first returned index is mean else: threshold = riskStatus[1] ############################## sanitizedVersions_CQ = sanitizedVersions ####### high vScore versions started high_CQ_dict = utility.getHighVScoreVersions_CQ(sanitizedVersionsWithScore, sanitizedVersions_CQ, threshold) high_vScore_Dict = utility.getHighVScoreVersions_VScore( sanitizedVersionsWithScore, threshold) print "high_vscore_versions ", len(high_vScore_Dict) ####### high vScore versions ended ####### low vScore versions started low_CQ_dict = utility.getLowVScoreVersions_CQ(sanitizedVersionsWithScore, sanitizedVersions_CQ, threshold) low_vScore_Dict = utility.getLowVScoreVersions_VScore( sanitizedVersionsWithScore, threshold) print "len_vscore_versions ", len(low_vScore_Dict) ####### low vScore versions ended ##### dumpin time ### three ways: first by dumping all highs then all lows themegaFile_Seperated = outputStrParam + "_" + "all-CQ-HL-Seperated.csv" IO_.dumpIntoFileByHighAndLow(themegaFile_Seperated, high_CQ_dict, low_CQ_dict) ### three ways : second by dumping as it si themegaFile_All = outputStrParam + "_" + "all-CQ-HL.csv" IO_.dumpIntoFile(themegaFile_All, sanitizedVersions_CQ, sanitizedVersionsWithScore, threshold, False) 
LGR.performLogiRegression(themegaFile_All)
def experiemnt_two(dbFileName, meanFlag, outputStrParam ): print "Performing experiemnt # 2" #import correlation as corr_ versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag) print "Sanitized versions that will be used in study ", len(sanitizedVersions) #print "Sanitized versions ..." , sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions) ''' Stats on risk score (non-zero elemnts)-->len=549, median=51.1111111111, mean=49.9387976503, max=53.3333333333, min=15.0 ''' ############################## sanitizedVersions_CQ = sanitizedVersions riskStatus = sanityCheck.getVulnerbailityScoreStatus(NonZero_sanitizedVersionsWithScore) if meanFlag: threshold = riskStatus[0] ## first returned index is mean else: threshold = riskStatus[1] ####### high vScore versions started high_CQ_dict = utility.getHighVScoreVersions_CQ( NonZero_sanitizedVersionsWithScore , sanitizedVersions_CQ , threshold) high_vScore_Dict = utility.getHighVScoreVersions_VScore(NonZero_sanitizedVersionsWithScore, threshold) print "non zero high_vscore_versions ", len(high_vScore_Dict) ####### high vScore versions ended ####### low vScore versions started low_CQ_dict = utility.getLowVScoreVersions_CQ( NonZero_sanitizedVersionsWithScore , sanitizedVersions_CQ , threshold) low_vScore_Dict = utility.getLowVScoreVersions_VScore(NonZero_sanitizedVersionsWithScore, threshold) print "non zero len_vscore_versions ", len(low_vScore_Dict) ####### low vScore versions ended ##### dumpin time ### three ways: first by dumping all highs then all lows themegaFile_Seperated = outputStrParam + "_" + "non_zero_all-CQ-HL-Seperated.csv" IO_.dumpIntoFileByHighAndLow( themegaFile_Seperated, high_CQ_dict, low_CQ_dict ) ### three ways : second by dumping as it si themegaFile_All = outputStrParam + "_" + "non_zero_all-CQ-HL.csv" IO_.dumpIntoFile( 
themegaFile_All,sanitizedVersions_CQ , NonZero_sanitizedVersionsWithScore, threshold, False ) LGR.performLogiRegression(themegaFile_All)
def experiemnt_correlation(dbFileName, meanFlag, outputStrParam, clusterFlag): import correlation from sklearn import cluster clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=2) else: clusteringType = cluster.AgglomerativeClustering(n_clusters=2) print "Performing experiemnt # Correlation: Clustering score into two clusters " versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions( versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions #print "Sanitized versions that will be used in study ", len(sanitizedVersions) #print "Sanitized versions ..." , sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions( sanitizedVersions) #print "zzzz", len(NonZero_sanitizedVersionsWithScore) brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[ 0], brokenDict[1] #print "lalalaa ", onlyTheNonZeroSanitizedVScores reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType.fit(reshapedNonZerSanitizedScores) labelsFroVersions = clusteringType.labels_ if clusterFlag: centroids = clusteringType.cluster_centers_ print "And the centroids are .... ", centroids NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) else: print "No centroids for Aggolomerative clustering" NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) #print "And the labels are .... 
" #print labelsFroVersions #print "versionDictWithLabels" #print len(versionDictWithLabels) onlyHighV_Scores_Dict = utility.getH_Scores_ForCorr( NonZer_Santized_versionDictWithLabels, NonZero_sanitizedVersionsWithScore) correlation.performCorrBasedOnIndiMetrics(onlyHighV_Scores_Dict, sanitizedVersions_CQ)
def experiemnt_correlation(dbFileName, meanFlag, outputStrParam, clusterFlag):
    """Correlation experiment: cluster the non-zero vulnerability scores into
    two clusters, then correlate the high-score cluster's scores with the
    individual code-quality metrics.

    dbFileName     -- database path handed to DEFT
    meanFlag       -- forwarded to sanityCheck.getCodeQualityofVersions
    outputStrParam -- unused in this function; kept for signature parity
    clusterFlag    -- True: KMeans (exposes centroids), False: AgglomerativeClustering
    """
    import correlation
    from sklearn import cluster

    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=2)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=2)
    print "Performing experiemnt # Correlation: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    # print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    # print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    # print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    # getVScoreList returns parallel lists: index 0 the version IDs,
    # index 1 the scores (per the receiving variable names below).
    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
    # print "lalalaa ", onlyTheNonZeroSanitizedVScores
    # sklearn expects a 2-D feature matrix, hence the (-1, 1) reshape.
    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        # Only KMeans has cluster_centers_; Agglomerative does not.
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
    # print "And the labels are .... "
    # print labelsFroVersions
    # print "versionDictWithLabels"
    # print len(versionDictWithLabels)
    # Keep only the high-score cluster's versions, then correlate their
    # scores with each code-quality metric (presumably — TODO confirm
    # getH_Scores_ForCorr semantics against the utility module).
    onlyHighV_Scores_Dict = utility.getH_Scores_ForCorr(
        NonZer_Santized_versionDictWithLabels, NonZero_sanitizedVersionsWithScore
    )
    correlation.performCorrBasedOnIndiMetrics(onlyHighV_Scores_Dict, sanitizedVersions_CQ)
def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag): from sklearn import cluster import plotter clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=13) else: clusteringType = cluster.AgglomerativeClustering(n_clusters=13) print "Performing experiemnt # 3: Clustering score into two clusters " versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions( versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions #print "Sanitized versions that will be used in study ", len(sanitizedVersions) #print "Sanitized versions ..." , sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions( sanitizedVersions) #print "zzzz", len(NonZero_sanitizedVersionsWithScore) ### dyumping scores ... brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[ 0], brokenDict[1] #print "lalalaa ", onlyTheNonZeroSanitizedVScores #strOfScoresToDump="" #for elem in onlyTheNonZeroSanitizedVScores: # strOfScoresToDump = strOfScoresToDump + str(elem) + "," + "\n" ### #IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump) reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType.fit(reshapedNonZerSanitizedScores) labelsFroVersions = clusteringType.labels_ if clusterFlag: centroids = clusteringType.cluster_centers_ print "And the centroids are .... 
", centroids NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) ##### plotting clusters start #low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore) #low_cluster_x = [ 22.35294118 for x in low_cluster_y] #hig_cluster_x = [ 50.82030058 for x in high_cluster_y] #plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y) ##### plottign clusters end else: print "No centroids for Aggolomerative clustering" NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) print "And the labels are .... " print len(labelsFroVersions) cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores) silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels) print "Silhouette average---> ", silhouette_avg ############################## themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv" IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels, False)
print "------------------" print "Correlating vScore and minor violations " _corr_score = doCorrelation(minor_violationsCntList, vScoreList) print "Pearson:{}, P-P:{}, Spaeramn:{}, S-P:{}, MIC:{}, Non-Linearity:{}".format( _corr_score[0], _corr_score[1], _corr_score[2], _corr_score[3], _corr_score[4], _corr_score[5]) ############################CORRRELATION ZONE ############################## import sanityCheck, utility dbFileName = "/Users/akond/Documents/Spring-2016/CSC522/OSSAndroidAppDataset/androSec.db" import DataExtractionFromTables as DEFT #import correlation as corr_ versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions_CQ = sanityCheck.getCodeQualityofVersions( versionAndCodeQualityDict) print "Sanitized versions that will be used in study ", len( sanitizedVersions_CQ) #print "Sanitized versions ..." , sanitizedVersions ##### all the vulnerability versions started sanitizedVersionsWithScore = sanityCheck.getVulnerbailityScoreOfSelectedVersions( sanitizedVersions_CQ) ### call for correlation #performCorrBasedOnIndiMetrics(sanitizedVersionsWithScore, sanitizedVersions_CQ) ##### all the vulnerability versions ended ####### high vScore versions started medianScore = 51.11111 high_CQ_dict = utility.getHighVScoreVersions_CQ(sanitizedVersionsWithScore, sanitizedVersions_CQ, medianScore)
print "Pearson:{}, P-P:{}, Spaeramn:{}, S-P:{}, MIC:{}, Non-Linearity:{}".format(_corr_score[0], _corr_score[1], _corr_score[2], _corr_score[3], _corr_score[4], _corr_score[5]) _corr_score = [] print "------------------" print "Correlating vScore and minor violations " _corr_score = doCorrelation( minor_violationsCntList, vScoreList) print "Pearson:{}, P-P:{}, Spaeramn:{}, S-P:{}, MIC:{}, Non-Linearity:{}".format(_corr_score[0], _corr_score[1], _corr_score[2], _corr_score[3], _corr_score[4], _corr_score[5]) ############################CORRRELATION ZONE ############################## import sanityCheck, utility dbFileName="/Users/akond/Documents/Spring-2016/CSC522/OSSAndroidAppDataset/androSec.db" import DataExtractionFromTables as DEFT #import correlation as corr_ versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions_CQ = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict) print "Sanitized versions that will be used in study ", len(sanitizedVersions_CQ) #print "Sanitized versions ..." , sanitizedVersions ##### all the vulnerability versions started sanitizedVersionsWithScore = sanityCheck.getVulnerbailityScoreOfSelectedVersions(sanitizedVersions_CQ) ### call for correlation #performCorrBasedOnIndiMetrics(sanitizedVersionsWithScore, sanitizedVersions_CQ) ##### all the vulnerability versions ended ####### high vScore versions started medianScore = 51.11111 high_CQ_dict = utility.getHighVScoreVersions_CQ( sanitizedVersionsWithScore , sanitizedVersions_CQ , medianScore) high_vScore_Dict = utility.getHighVScoreVersions_VScore(sanitizedVersionsWithScore, medianScore) ### call for correlation