def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag): from sklearn import cluster import plotter clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=13) else: clusteringType = cluster.AgglomerativeClustering(n_clusters=13) print "Performing experiemnt # 3: Clustering score into two clusters " versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions # print "Sanitized versions that will be used in study ", len(sanitizedVersions) # print "Sanitized versions ..." , sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions) # print "zzzz", len(NonZero_sanitizedVersionsWithScore) ### dyumping scores ... brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1] # print "lalalaa ", onlyTheNonZeroSanitizedVScores # strOfScoresToDump="" # for elem in onlyTheNonZeroSanitizedVScores: # strOfScoresToDump = strOfScoresToDump + str(elem) + "," + "\n" ### # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump) reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType.fit(reshapedNonZerSanitizedScores) labelsFroVersions = clusteringType.labels_ if clusterFlag: centroids = clusteringType.cluster_centers_ print "And the centroids are .... ", centroids NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions ) ##### plotting clusters start # low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore) # low_cluster_x = [ 22.35294118 for x in low_cluster_y] # hig_cluster_x = [ 50.82030058 for x in high_cluster_y] # plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y) ##### plottign clusters end else: print "No centroids for Aggolomerative clustering" NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions ) print "And the labels are .... " print len(labelsFroVersions) cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores) silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels) print "Silhouette average---> ", silhouette_avg ############################## themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv" IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels, False)
def experiemnt_mobilesoft(dbFileName, outputStrParam): from sklearn import cluster import plotter clusteringType = cluster.AgglomerativeClustering(n_clusters=5) print "Performing experiemnt # Mobilesoft" versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getMobilesoftCodeQualityVersions( versionAndCodeQualityDict, 1.00) sanitizedVersions_CQ = sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getAllVulnerbailityScoreOfSelectedVersions( sanitizedVersions) brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[ 0], brokenDict[1] # strOfScoresToDump="" # for elem in onlyTheNonZeroSanitizedVScores: # strOfScoresToDump = strOfScoresToDump + str(elem) + "," + "\n" # # ## # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump) reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType.fit(reshapedNonZerSanitizedScores) labelsFroVersions = clusteringType.labels_ print "No centroids for Aggolomerative clustering" NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) print "And the labels are .... " print len(labelsFroVersions) cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores) silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels) print "Silhouette average---> ", silhouette_avg # clusteringType = cluster.KMeans(n_clusters=5) # clusteringType.fit(reshapedNonZerSanitizedScores) # centroids = clusteringType.cluster_centers_ # print "And the centroids are .... ", centroids ############################## themegaFile_All = outputStrParam + "_" + "cluster_Headered_1407.csv" IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels) '''
def experiemnt_correlation(dbFileName, meanFlag, outputStrParam, clusterFlag): import correlation from sklearn import cluster clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=2) else: clusteringType = cluster.AgglomerativeClustering(n_clusters=2) print "Performing experiemnt # Correlation: Clustering score into two clusters " versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions( versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions #print "Sanitized versions that will be used in study ", len(sanitizedVersions) #print "Sanitized versions ..." , sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions( sanitizedVersions) #print "zzzz", len(NonZero_sanitizedVersionsWithScore) brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[ 0], brokenDict[1] #print "lalalaa ", onlyTheNonZeroSanitizedVScores reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType.fit(reshapedNonZerSanitizedScores) labelsFroVersions = clusteringType.labels_ if clusterFlag: centroids = clusteringType.cluster_centers_ print "And the centroids are .... ", centroids NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) else: print "No centroids for Aggolomerative clustering" NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) #print "And the labels are .... " #print labelsFroVersions #print "versionDictWithLabels" #print len(versionDictWithLabels) onlyHighV_Scores_Dict = utility.getH_Scores_ForCorr( NonZer_Santized_versionDictWithLabels, NonZero_sanitizedVersionsWithScore) correlation.performCorrBasedOnIndiMetrics(onlyHighV_Scores_Dict, sanitizedVersions_CQ)
def experiemnt_correlation(dbFileName, meanFlag, outputStrParam, clusterFlag): import correlation from sklearn import cluster clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=2) else: clusteringType = cluster.AgglomerativeClustering(n_clusters=2) print "Performing experiemnt # Correlation: Clustering score into two clusters " versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions # print "Sanitized versions that will be used in study ", len(sanitizedVersions) # print "Sanitized versions ..." , sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions) # print "zzzz", len(NonZero_sanitizedVersionsWithScore) brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1] # print "lalalaa ", onlyTheNonZeroSanitizedVScores reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType.fit(reshapedNonZerSanitizedScores) labelsFroVersions = clusteringType.labels_ if clusterFlag: centroids = clusteringType.cluster_centers_ print "And the centroids are .... ", centroids NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions ) else: print "No centroids for Aggolomerative clustering" NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions ) # print "And the labels are .... " # print labelsFroVersions # print "versionDictWithLabels" # print len(versionDictWithLabels) onlyHighV_Scores_Dict = utility.getH_Scores_ForCorr( NonZer_Santized_versionDictWithLabels, NonZero_sanitizedVersionsWithScore ) correlation.performCorrBasedOnIndiMetrics(onlyHighV_Scores_Dict, sanitizedVersions_CQ)
def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag): from sklearn import cluster import plotter clusteringType = None if clusterFlag: clusteringType = cluster.KMeans(n_clusters=13) else: clusteringType = cluster.AgglomerativeClustering(n_clusters=13) print "Performing experiemnt # 3: Clustering score into two clusters " versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName) sanitizedVersions = sanityCheck.getCodeQualityofVersions( versionAndCodeQualityDict, meanFlag) sanitizedVersions_CQ = sanitizedVersions #print "Sanitized versions that will be used in study ", len(sanitizedVersions) #print "Sanitized versions ..." , sanitizedVersions NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions( sanitizedVersions) #print "zzzz", len(NonZero_sanitizedVersionsWithScore) ### dyumping scores ... brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore) onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[ 0], brokenDict[1] #print "lalalaa ", onlyTheNonZeroSanitizedVScores #strOfScoresToDump="" #for elem in onlyTheNonZeroSanitizedVScores: # strOfScoresToDump = strOfScoresToDump + str(elem) + "," + "\n" ### #IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump) reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1)) clusteringType.fit(reshapedNonZerSanitizedScores) labelsFroVersions = clusteringType.labels_ if clusterFlag: centroids = clusteringType.cluster_centers_ print "And the centroids are .... ", centroids NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) ##### plotting clusters start #low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore) #low_cluster_x = [ 22.35294118 for x in low_cluster_y] #hig_cluster_x = [ 50.82030058 for x in high_cluster_y] #plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y) ##### plottign clusters end else: print "No centroids for Aggolomerative clustering" NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel( onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions) print "And the labels are .... " print len(labelsFroVersions) cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores) silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels) print "Silhouette average---> ", silhouette_avg ############################## themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv" IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels, False)