def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag):
    """Experiment #3: cluster the non-zero vulnerability scores of sanitized
    versions into 13 clusters and dump the labelled versions to a CSV.

    dbFileName     -- database file handed to DEFT.getValuesFrom_CodingStandard
    meanFlag       -- forwarded to sanityCheck.getCodeQualityofVersions
    outputStrParam -- prefix of the output CSV file name
    clusterFlag    -- True -> KMeans(13), False -> AgglomerativeClustering(13)

    Side effects: prints progress, centroids (KMeans only) and the silhouette
    score; writes "<outputStrParam>_culsterified_non_zero_all-CQ-HL.csv".
    """
    from sklearn import cluster
    import plotter

    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=13)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=13)

    # NOTE(review): the message says "two clusters" but n_clusters=13 above -- confirm which is intended.
    print "Performing experiemnt # 3: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    # print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    # print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    # print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    ### dyumping scores ...

    # brokenDict[0] = version ids, brokenDict[1] = their vulnerability scores
    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
    # print "lalalaa ", onlyTheNonZeroSanitizedVScores

    # strOfScoresToDump=""
    # for elem in onlyTheNonZeroSanitizedVScores:
    #  strOfScoresToDump = strOfScoresToDump + str(elem) +  "," + "\n"

    ###
    # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)

    # scores become a single-feature column vector for the clusterer
    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
        ##### plotting clusters start
        # low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore)
        # low_cluster_x = [ 22.35294118 for x in low_cluster_y]
        # hig_cluster_x = [ 50.82030058 for x in high_cluster_y]
        # plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y)
        ##### plottign clusters end
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
    print "And the labels are .... "
    print len(labelsFroVersions)
    # NOTE(review): fit_predict re-fits the model a second time just to get
    # labels for the silhouette score; for KMeans these labels may differ
    # from labelsFroVersions computed above.
    cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels)
    print "Silhouette average---> ", silhouette_avg

    ##############################
    themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels, False)
def experiemnt_one(dbFileName, meanFlag, outputStrParam):
    """Experiment #1: split sanitized versions into high/low vulnerability
    groups around a mean or median threshold, dump both groups to CSVs and
    run a logistic regression on the combined dump.

    dbFileName     -- database file handed to DEFT.getValuesFrom_CodingStandard
    meanFlag       -- True -> threshold on riskStatus[0] (mean),
                      False -> threshold on riskStatus[1]
    outputStrParam -- prefix of the two output CSV file names
    """
    print "Performing experiment # 1"
    #import correlation as corr_
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(
        versionAndCodeQualityDict, meanFlag)
    print "Sanitized versions that will be used in study ", len(
        sanitizedVersions)
    #print "Sanitized versions ..." , sanitizedVersions
    sanitizedVersionsWithScore = sanityCheck.getVulnerbailityScoreOfSelectedVersions(
        sanitizedVersions)
    '''
	Stats on risk score-->len=721, median=51.1111111111,  mean=38.0255199862, max=53.3333333333, min=0.0,
	'''

    riskStatus = sanityCheck.getVulnerbailityScoreStatus(
        sanitizedVersionsWithScore)
    if meanFlag:
        threshold = riskStatus[0]  ## first returned index is mean
    else:
        threshold = riskStatus[1]  ## presumably the median -- confirm in getVulnerbailityScoreStatus

    ##############################
    sanitizedVersions_CQ = sanitizedVersions

    #######  high vScore versions started

    high_CQ_dict = utility.getHighVScoreVersions_CQ(sanitizedVersionsWithScore,
                                                    sanitizedVersions_CQ,
                                                    threshold)
    high_vScore_Dict = utility.getHighVScoreVersions_VScore(
        sanitizedVersionsWithScore, threshold)
    print "high_vscore_versions ", len(high_vScore_Dict)
    #######  high vScore versions ended

    #######  low vScore versions started
    low_CQ_dict = utility.getLowVScoreVersions_CQ(sanitizedVersionsWithScore,
                                                  sanitizedVersions_CQ,
                                                  threshold)
    low_vScore_Dict = utility.getLowVScoreVersions_VScore(
        sanitizedVersionsWithScore, threshold)
    print "len_vscore_versions ", len(low_vScore_Dict)
    #######  low vScore versions ended
    ##### dumpin time
    ### three ways: first by dumping all highs then all lows
    themegaFile_Seperated = outputStrParam + "_" + "all-CQ-HL-Seperated.csv"
    IO_.dumpIntoFileByHighAndLow(themegaFile_Seperated, high_CQ_dict,
                                 low_CQ_dict)

    ### three ways : second by dumping as it si
    themegaFile_All = outputStrParam + "_" + "all-CQ-HL.csv"
    IO_.dumpIntoFile(themegaFile_All, sanitizedVersions_CQ,
                     sanitizedVersionsWithScore, threshold, False)
    LGR.performLogiRegression(themegaFile_All)
# Example #3  (scraped-snippet separator; original text "Пример #3")
# 0
def mobilesoft_cart(fileNameParam, fileToWriteP):
    testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
    trainData = testAndTrainData[0]
    testData = testAndTrainData[1]
    selected_training_data = pca_mobilesoft.getPCAedFeatures(trainData)
    print "Size of selected training data : ", np.shape(selected_training_data)
    print "=" * 50

    dict_of_results = param_exp_classifier.runCART(selected_training_data,
                                                   testData, 0.90)
    reportStr = param_exp_analysis.analyzeThis(dict_of_results)
    IO_.writeStrToFile(fileToWriteP, reportStr)
def speedup_random_forest(fileNameParam, fileToWriteP):
  """PCA-reduce the training features, run the random-forest parameter
  sweep (module-level runRandomForest) and write the analysis report
  to fileToWriteP."""
  testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
  trainData = testAndTrainData[0]
  testData  = testAndTrainData[1]
  #print trainData
  selected_training_data = pca_mobilesoft.getPCAedFeatures(trainData)
  print "Size of selected training data : ", np.shape(selected_training_data)
  print "="*50

  dict_of_results = runRandomForest(selected_training_data, testData)
  reportStr = param_exp_analysis.analyzeThis(dict_of_results)
  IO_.writeStrToFile(fileToWriteP, reportStr)
def experiemnt_two(dbFileName, meanFlag, outputStrParam ):
	"""Experiment #2: same high/low split as experiment #1, but restricted
	to versions with NON-ZERO vulnerability scores.  Dumps the groups to
	CSVs and runs a logistic regression on the combined dump.
	"""
	print "Performing experiemnt # 2"



	#import correlation as corr_
	versionAndCodeQualityDict =  DEFT.getValuesFrom_CodingStandard(dbFileName)
	sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
	print "Sanitized versions that will be used in study ", len(sanitizedVersions)
	#print "Sanitized versions ..." , sanitizedVersions
	NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)

	'''
	Stats on risk score (non-zero elemnts)-->len=549, median=51.1111111111,  mean=49.9387976503, max=53.3333333333, min=15.0	
	'''

	############################## 
	sanitizedVersions_CQ = sanitizedVersions


	riskStatus = sanityCheck.getVulnerbailityScoreStatus(NonZero_sanitizedVersionsWithScore)
	if meanFlag:       
	 threshold = riskStatus[0]   ## first returned index is mean 
	else: 
	 threshold = riskStatus[1]  


	#######  high vScore versions started  

	high_CQ_dict = utility.getHighVScoreVersions_CQ( NonZero_sanitizedVersionsWithScore , sanitizedVersions_CQ , threshold)
	high_vScore_Dict = utility.getHighVScoreVersions_VScore(NonZero_sanitizedVersionsWithScore, threshold)
	print "non zero high_vscore_versions ", len(high_vScore_Dict)
	#######  high vScore versions ended   


	#######  low vScore versions started  
	low_CQ_dict = utility.getLowVScoreVersions_CQ( NonZero_sanitizedVersionsWithScore , sanitizedVersions_CQ , threshold)
	low_vScore_Dict = utility.getLowVScoreVersions_VScore(NonZero_sanitizedVersionsWithScore, threshold)
	print "non zero  len_vscore_versions ", len(low_vScore_Dict)
	#######  low vScore versions ended   
	##### dumpin time 
	### three ways: first by dumping all highs then all lows 
	themegaFile_Seperated = outputStrParam + "_" + "non_zero_all-CQ-HL-Seperated.csv"
	IO_.dumpIntoFileByHighAndLow( themegaFile_Seperated, high_CQ_dict, low_CQ_dict )

	### three ways : second by dumping as it si 
	themegaFile_All = outputStrParam + "_" + "non_zero_all-CQ-HL.csv"
	IO_.dumpIntoFile( themegaFile_All,sanitizedVersions_CQ , NonZero_sanitizedVersionsWithScore, threshold, False )
	LGR.performLogiRegression(themegaFile_All)  
def mobilesoft_cart(fileNameParam, fileToWriteP):
  """Run the CART parameter experiment on a hand-picked feature subset
  (indexVector below) and write the analysis report to fileToWriteP.

  NOTE(review): duplicate of the earlier mobilesoft_cart definition; in a
  single module the later definition wins -- confirm which one is intended.
  """
  # hard-coded column indices of the features kept for the experiment
  indexVector = [0, 5, 10, 12, 13, 18, 19, 20]
  testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
  trainData = testAndTrainData[0]
  testData  = testAndTrainData[1]
  selected_training_data = createMobileSoftFeatures(trainData, indexVector)
  print "Size of selected training data : ", np.shape(selected_training_data)
  print "="*50
  print "Glimpse at  selected features (10th entry): \n", selected_training_data.iloc[9, :]
  print "="*50
  print "Glimpse at  labels (10th entry): \n", testData.iloc[9]
  print "="*50
  dict_of_results = param_exp_classifier.runCART(selected_training_data, testData, 0.90)
  reportStr = param_exp_analysis.analyzeThis(dict_of_results)
  IO_.writeStrToFile(fileToWriteP, reportStr)
def experiemnt_mobilesoft(dbFileName, outputStrParam):
    """Mobilesoft experiment: agglomerative clustering (5 clusters) of ALL
    vulnerability scores of mobilesoft-quality versions; prints the
    silhouette score and dumps labelled versions to a headered CSV.
    """
    from sklearn import cluster
    import plotter
    clusteringType = cluster.AgglomerativeClustering(n_clusters=5)

    print "Performing experiemnt # Mobilesoft"
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getMobilesoftCodeQualityVersions(
        versionAndCodeQualityDict, 1.00)
    sanitizedVersions_CQ = sanitizedVersions

    # NOTE(review): despite the "NonZero" name this uses getAllVulnerbaility...
    # (zero scores included) -- confirm the naming is just stale.
    NonZero_sanitizedVersionsWithScore = sanityCheck.getAllVulnerbailityScoreOfSelectedVersions(
        sanitizedVersions)

    # brokenDict[0] = version ids, brokenDict[1] = their vulnerability scores
    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[
        0], brokenDict[1]

    # strOfScoresToDump=""
    # for elem in onlyTheNonZeroSanitizedVScores:
    #   strOfScoresToDump = strOfScoresToDump + str(elem) +  "," + "\n"
    #
    # ##
    # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)

    # single-feature column vector for the clusterer
    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores,
                                               (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    print "No centroids for Aggolomerative clustering"
    NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
        onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
    print "And the labels are .... "
    print len(labelsFroVersions)
    # NOTE(review): fit_predict re-fits the model a second time just to
    # compute the silhouette score.
    cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores,
                                      cluster_labels)
    print "Silhouette average---> ", silhouette_avg

    # clusteringType = cluster.KMeans(n_clusters=5)
    # clusteringType.fit(reshapedNonZerSanitizedScores)
    # centroids = clusteringType.cluster_centers_
    # print "And the centroids are .... ", centroids
    ##############################
    themegaFile_All = outputStrParam + "_" + "cluster_Headered_1407.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ,
                                 NonZer_Santized_versionDictWithLabels)
# ''' -- stray unmatched triple-quote left over from snippet extraction; commented out so the file stays parseable
def experiemnt_gaussian_naive_bayes(fileNameParam):
    """For every feature count k = 1..original_cols, select the k best
    features and run Gaussian Naive Bayes over training splits
    0.1, 0.2, ..., 0.9."""
    testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
    print "This is 'experiemnt_gaussian_naive_bayes' "

    # settign up train data
    trainData = testAndTrainData[0]
    original_rows = trainData.shape[0]
    original_cols = trainData.shape[1]
    print "Size of  training data : rows: {}, columns: {}".format(
        original_rows, original_cols)

    # settign up test data
    testData = testAndTrainData[1]
    for selCount in xrange(original_cols):
        count_ = selCount + 1
        # count_ ranges 1..original_cols, so this guard is always true
        if count_ <= original_cols:
            slected_training_data = giveSelectedTrainingData(
                trainData, testData, count_)
            print "#################  No. of features to work with={}  ############".format(
                count_)
            print "Size of selected training data : ", slected_training_data.shape
            # fractions 0.1 .. 0.9 used as training-set sizes
            emperiemntSplitters = [
                float(x) / float(10) for x in xrange(10) if x > 0
            ]
            for elem in emperiemntSplitters:
                #print "Training size: {} %".format(float(elem*100))
                exp_x_classifiers.runGNB(slected_training_data, testData, elem)
def getData(fileNameParam):
    """Load the dataset via IO_ and return the (trainData, testData) pair."""
    split = IO_.giveTestAndTrainingData(fileNameParam)
    return split[0], split[1]
def experiemnt_random_forest(fileNameParam, fileToWriteP):
  """Run the random-forest parameter sweep on the full (unreduced) training
  data and write the analysis report to fileToWriteP."""
  testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
  print "This is 'experiemnt_random_forest' "  
  
  # settign up train data 
  trainData = testAndTrainData[0]
  original_rows = trainData.shape[0]
  original_cols =  trainData.shape[1] 
  print "Size of  training data : rows: {}, columns: {}".format( original_rows , original_cols )
  
  # settign up test data 
  testData = testAndTrainData[1]   
  dict_of_results = param_exp_classifier.runRandomForest(trainData, testData)
  reportStr = param_exp_analysis.analyzeThis(dict_of_results)
  IO_.writeStrToFile(fileToWriteP, reportStr)
# Example #11  (scraped-snippet separator; original text "Пример #11")
# 0
def experiemnt_CART(fileNameParam):
    """Select all original_cols features and run the CART parameter sweep
    over training splits 0.1, 0.2, ..., 0.9."""
    import exp_x_classifiers, IO_
    testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
    print "This is 'experiemnt_CART' "

    # settign up train data
    trainData = testAndTrainData[0]
    original_rows = trainData.shape[0]
    original_cols = trainData.shape[1]
    print "Size of  training data : rows: {}, columns: {}".format(
        original_rows, original_cols)

    # settign up test data
    testData = testAndTrainData[1]
    #  for selCount in xrange(original_cols):
    #    count_ = selCount + 1
    #    if count_ < original_cols:
    # selecting k = original_cols keeps every feature; the call is kept for
    # symmetry with the other experiments
    slected_training_data = giveSelectedTrainingData(trainData, testData,
                                                     original_cols)
    print "#################  No. of features to work with={}  ############".format(
        original_cols)
    print "Size of selected training data : ", slected_training_data.shape
    emperiemntSplitters = [float(x) / float(10) for x in xrange(10) if x > 0]
    for elem in emperiemntSplitters:
        #print "Training size: {} %".format(float(elem*100))
        param_exp_classifier.runCART(slected_training_data, testData, elem)
def experiemnt_random_forest(fileNameParam):
  """For every feature count k = 1..original_cols, select the k best
  features and run the random forest over training splits 0.1 .. 0.9."""
  import exp_x_classifiers , IO_ 
  testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
  #print testAndTrainData
  print "This is 'experiemnt_random_forest' "  
  
  # settign up train data 
  trainData = testAndTrainData[0]
  #print trainData
  original_rows = trainData.shape[0]
  original_cols =  trainData.shape[1] 
  print "Size of  training data : rows: {}, columns: {}".format( original_rows , original_cols )
  
  # settign up test data 
  testData = testAndTrainData[1]   
  #print testData  
  for selCount in xrange(original_cols):
    count_ = selCount + 1 
    # count_ ranges 1..original_cols, so this guard is always true
    if count_ <= original_cols:      
      slected_training_data = giveSelectedTrainingData(trainData, testData, count_ ) 
      print "#################  No. of features to work with={}  ############".format(count_)
      print "Size of selected training data : ", slected_training_data.shape
      emperiemntSplitters=[float(x)/float(10) for x in xrange(10) if x > 0] 
      for elem in emperiemntSplitters:
	  #print "Training size: {} %".format(float(elem*100))
	  exp_x_classifiers.runRandomForest(slected_training_data, testData, elem)
def getData(fileNameParam):
  """Read fileNameParam through IO_ and hand back the (train, test) pair."""
  pair = IO_.giveTestAndTrainingData(fileNameParam)
  trainPart, testPart = pair[0], pair[1]
  return trainPart, testPart
# Example #14  (scraped-snippet separator; original text "Пример #14")
# 0
def experiemnt_random_forest(fileNameParam, fileToWriteP):
    """Run the random-forest parameter sweep on the full training data and
    write the analysis report to fileToWriteP.

    NOTE(review): duplicate of earlier experiemnt_random_forest definitions;
    the later definition wins in a single module -- confirm intent.
    """
    testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
    print "This is 'experiemnt_random_forest' "

    # settign up train data
    trainData = testAndTrainData[0]
    original_rows = trainData.shape[0]
    original_cols = trainData.shape[1]
    print "Size of  training data : rows: {}, columns: {}".format(
        original_rows, original_cols)

    # settign up test data
    testData = testAndTrainData[1]
    dict_of_results = param_exp_classifier.runRandomForest(trainData, testData)
    reportStr = param_exp_analysis.analyzeThis(dict_of_results)
    IO_.writeStrToFile(fileToWriteP, reportStr)
def experiment_mobilesoft_knn(fileNameParam):
    """Run KNN (0.90 training split) on PCA-reduced training features."""
    testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
    trainData = testAndTrainData[0]
    testData = testAndTrainData[1]
    #print trainData
    selected_training_data = getPCAedFeatures(trainData)
    print "Size of selected training data : ", np.shape(selected_training_data)
    print "=" * 50
    exp_x_classifiers.runKNN(selected_training_data, testData, 0.90)
    print "=" * 50
def runRandomForest(trainDataParam, testDataParam):
  """Grid-search RandomForestClassifier over the parameter lists below.

  Every combination is cross-validated (10 folds) via
  perform_cross_validation and the result tuple is stored in a dict keyed
  by the underscore-joined parameter values.

  Returns: dict mapping "eti_crit_maxdepth_maxleaf_bootstrap_mwfratleaf_"
  style keys -> cross-validation result tuple.
  """
  res_combo_dict ={}
  #n_estimators_list=[500]
  n_estimators_list             = [75, 80, 85]
  criterion_list                = ['gini', 'entropy']
  #max_features_list             = ['auto', 'sqrt', 'log2', None]
  max_depth_list                = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, None]
  max_leaf_nodes_list           = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, None]
  bootstrap_list                = [True, False]
  #min_samples_split_list        = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
  #oob_score_list                = [True, False]
  min_weight_fraction_leaf_list = [0.1, 0.2, 0.3, 0.4, 0.5] # cannot be more than 0.50


  ### setting the aprameters : test purpose
#  n_estimators_list=[50, 50000]
#  criterion_list = ['gini', 'entropy']
#  max_features_list=['auto',  None]
#  max_depth_list = [1, 1000 ]
#  max_leaf_nodes_list = [None, 5, 1000] # in our datset only 549 legit samples so should eb limited to 549
#  bootstrap_list=[True, False]
#  min_samples_split_list = [1,  1000]  # in our datset only 549 legit samples so should eb limited to 549
#  oob_score_list=[True, False]
#  min_weight_fraction_leaf_list=[0.0,  0.5] # must be between 0.0 and 0.50
#  warm_start_list=[True, False]
  ###

  for eti in n_estimators_list:
    for crit in criterion_list:
        for max_depth_ in max_depth_list:
          for max_leaf in max_leaf_nodes_list:
            for bootstrap_ in bootstrap_list:
                  for mwfratleaf in min_weight_fraction_leaf_list:
                      ## display params:
                      # n_jobs  has been set to -1 to use all the cores avialable , not part fo an experiemnt
                      print "##########"
                      print "n_estimators={}, criterion={}, max_dept={}, max_leaf_nodes={}".format(eti, crit, max_depth_, max_leaf  )
                      print "bootstrap={},  min-wt-frac={}".format(bootstrap_, mwfratleaf )
                      key_str_1 = str(eti) + "_" + crit + "_"  + str(max_depth_) + "_" + str(max_leaf) + "_"
                      key_str_2 = str(bootstrap_) + "_" + str(mwfratleaf) + "_"
                      key_for_dict = key_str_1 + key_str_2
                      ## fire up the model
                      with IO_.duration():
                        theRndForestModel = RandomForestClassifier(
                                                            n_estimators=eti, criterion=crit,
                                                            max_depth=max_depth_,
                                                            min_weight_fraction_leaf=mwfratleaf,
                                                            max_leaf_nodes=max_leaf, bootstrap=bootstrap_
                                                            )
                        res_tuple = perform_cross_validation(theRndForestModel, trainDataParam, testDataParam, 10)
                        res_combo_dict[key_for_dict] = res_tuple
                      print "##########"
  return res_combo_dict
# Example #17  (scraped-snippet separator; original text "Пример #17")
# 0
def createcombinatorialFiles(dirParam, doc_count_param, dir_to_write_Param):
  """For every ordered pair of "*dump" files in dirParam, pre-process the
  two token lists together and write the filtered pair to a .tsv in
  dir_to_write_Param.

  NOTE(review): the inner loop also pairs each file with itself and
  re-reads elem2's file on every iteration (O(n^2) reads) -- confirm both
  are intended.
  """
  all_file_names = os.listdir(dirParam)
  valid_file_names = [x for x in all_file_names if x.endswith("dump")]
  for elem1 in valid_file_names:
    elem1_fileName= dirParam +   "/"  + elem1     
    
    elem1_tokns =  IO_.readFile(elem1_fileName) 
    #print "Count of tokens for {} is {}".format(elem1, len(elem1_tokns))

    for elem2 in valid_file_names:
        elem2_fileName= dirParam +   "/"  + elem2   

        elem2_tokns =   IO_.readFile(elem2_fileName)            
        file_name_to_write = elem1 + "_" + elem2 + ".tsv"
        print "Cmparing {} and {}".format(elem1, elem2)   
        print "Cmparing {} and {}".format(len(elem1_tokns), len(elem2_tokns))           
        bothfiletokens = pre_process_tokens(elem1_tokns, elem2_tokns, doc_count_param)   
        filtered_elem1_tokns  = bothfiletokens[0]
        filtered_elem2_tokns = bothfiletokens[1]
        IO_.writeTokensToFile(dir_to_write_Param, file_name_to_write, filtered_elem1_tokns, filtered_elem2_tokns)
def experiment_mobilesoft_knn(fileNameParam, indexVector):
    """Run KNN (0.90 training split) on the feature columns selected by
    indexVector; prints a glimpse of the 10th row for sanity checking."""
    testAndTrainData = IO_.giveTestAndTrainingData(fileNameParam)
    trainData = testAndTrainData[0]
    testData = testAndTrainData[1]
    #print trainData
    selected_training_data = createMobileSoftFeatures(trainData, indexVector)
    print "Size of selected training data : ", np.shape(selected_training_data)
    print "=" * 50
    print "Glimpse at  selected features (10th entry): \n", selected_training_data.iloc[
        9, :]
    print "=" * 50
    print "Glimpse at  labels (10th entry): \n", testData.iloc[9]
    print "=" * 50
    exp_x_classifiers.runKNN(selected_training_data, testData, 0.90)
    print "=" * 50
def runSVM(fileNamaParam, trainizingSizeParam):
  """Train an RBF SVM on a trainizingSizeParam fraction of the data and
  evaluate it (via evalClassifier) on the remainder.

  NOTE(review): uses sklearn's pre-0.18 `cross_validation` module -- this
  code targets an old sklearn version.
  """
  # what percent will you use ? 
  testSplitSize = 1.0 - trainizingSizeParam
  testAndTrainData = IO_.giveTestAndTrainingData(fileNamaParam)
  trainData = testAndTrainData[0]
  testData = testAndTrainData[1]
  ### classification   
  ## get the test and training sets   
  featureSpace_train, featureSpace_test, vScore_train, vScore_test = cross_validation.train_test_split(trainData, testData, test_size=testSplitSize, random_state=0) 
  ## fire up the model 
  theSVMModel = svm.SVC(kernel='rbf', C=1).fit(featureSpace_train, vScore_train)   
  thePredictedScores = theSVMModel.predict(featureSpace_test)
  #print "The original vector: "
  #print vScore_test
  #print "The predicted score vector: "
  #print thePredictedScores
  evalClassifier(vScore_test, thePredictedScores) 
# Example #20  (scraped-snippet separator; original text "Пример #20")
# 0
def runCART(fileNamaParam, trainizingSizeParam):
    """Train a decision tree (CART) on a trainizingSizeParam fraction of the
    data and evaluate it (via evalClassifier) on the remainder."""
    # what percent will you use ?
    testSplitSize = 1.0 - trainizingSizeParam
    testAndTrainData = IO_.giveTestAndTrainingData(fileNamaParam)
    trainData = testAndTrainData[0]
    testData = testAndTrainData[1]
    ### classification
    ## get the test and training sets
    featureSpace_train, featureSpace_test, vScore_train, vScore_test = cross_validation.train_test_split(
        trainData, testData, test_size=testSplitSize, random_state=0)
    ## fire up the model
    # NOTE(review): variable is named theQDAModel but it is a DecisionTree
    # (CART) -- name is a copy-paste leftover.
    theQDAModel = DecisionTreeClassifier()
    theQDAModel.fit(featureSpace_train, vScore_train)
    thePredictedScores = theQDAModel.predict(featureSpace_test)
    #print "The original vector: "
    #print vScore_test
    #print "The predicted score vector: "
    #print thePredictedScores
    evalClassifier(vScore_test, thePredictedScores)
                                             max_leaf_nodes=10000)

        mae_for_param_combo_2 = perform_cross_validation(
            the_Model_2, trainingData, testData, cv_param)[1]
        t2 = time.time()
        time_for_param_comb_2 = t2 - t1
        mae_list_2.append(mae_for_param_combo_2)
        time_list_2.append(time_for_param_comb_2)

    mae_a12_ = a12_utility.doSlowA12(mae_list_1, mae_list_2)
    time_a12_ = a12_utility.doSlowA12(time_list_2, time_list_1)
    print "MAE  comaprison: is default worse than 'best combo' ?", mae_a12_
    print "time comaprison: is 'best' combo slower than default ?", time_a12_


# Script entry: run the four classifier timing tests (RF, KNN, SVM, CART)
# on the 13-cluster non-zero dataset, timing each with IO_.duration().
datasetFileName = "13_NonZeroDataset_Aggolo.csv"
iterations = 10000
cv_param = 5  # cross-validation folds
print "========== Random Forest =========="
with IO_.duration():
    runRFTest(datasetFileName, iterations, cv_param)
print "========== KNN =========="
with IO_.duration():
    runknnTest(datasetFileName, iterations, cv_param)
print "========== SVM =========="
with IO_.duration():
    runsvmTest(datasetFileName, iterations, cv_param)
print "========== CART =========="
with IO_.duration():
    runCARTTest(datasetFileName, iterations, cv_param)
# Example #22  (scraped-snippet separator; original text "Пример #22")
# 0

def giveSelectedTrainingData(trainParam, testParam, no_of_chices_param):
    """Reduce trainParam to its no_of_chices_param best features.

    Features are scored with a chi-squared univariate test against
    testParam (used as the label vector); returns the transformed
    feature matrix.
    """
    from sklearn.feature_selection import SelectKBest, chi2

    selector = SelectKBest(chi2, k=no_of_chices_param)
    return selector.fit_transform(trainParam, testParam)


####### Open loggger ####
# Redirect stdout to a log file for the duration of the experiment so all
# the `print` output lands in param_exp_random_forest_500_two_folds.txt.
old_stdout = sys.stdout
output_file_name = "param_exp_random_forest_500_two_folds.txt"
log_file = open(output_file_name, "w")
sys.stdout = log_file

print "Started at: ", IO_.giveTimeStamp()
fileNameParam = "13_NonZeroDataset_Aggolo.csv"
fileToWrite = "param_exp_combo_report_500_two_folds.csv"
experiemnt_random_forest(fileNameParam, fileToWrite)
#experiemnt_SVM(fileNameParam)
#experiemnt_KNN(fileNameParam)
#experiemnt_CART(fileNameParam)

print "Done ;-)"
print "Ended at: ", IO_.giveTimeStamp()

#### close logger
sys.stdout = old_stdout
log_file.close()
  for value_for_one_classifier in valueListParam: 
    comparer = value_for_one_classifier  
    comparees = [x for x in valueListParam if x!=value_for_one_classifier] 
    print "---"
    for comparee_item in comparees: 
      #print "comparer: {}, comapree: {}".format(comparer, comparee_item)  
      a12_results = a12_utility.doSlowA12(comparer, comparee_item)
      print "----->", a12_results   

####### Open loggger ####
# Redirect stdout to a log file, run the classifiers `count` times on the
# 2-cluster dataset and apply A12 effect-size tests to the accuracy and
# mean-absolute-error lists.
old_stdout = sys.stdout
output_file_name="a12_res_2Clusters.txt"
log_file = open( output_file_name,  "w")
sys.stdout = log_file  

print "Started at: ", IO_.giveTimeStamp()
count=10000
file_="2Clusters_NonZeroDataset_Aggolo.csv"
all_accu_moea = runs(count, file_)
all_acuu = all_accu_moea[0]
all_moea = all_accu_moea[1]
#print "**************** Hypo. tests for Accuracy  ****************"
#stat_hypo_test_(all_acuu)
#print "**************** Hypo. tests for Mean Abs. Error  ****************"
#stat_hypo_test_(all_moea)
print "**************** A12 tests for Accuracy  ****************"
stat_a12_test_(all_acuu)
print "**************** A12 tests for Mean Abs. Error  ****************"
stat_a12_test_(all_moea)
print "Ended at: ", IO_.giveTimeStamp() 
    mae_for_param_combo_2 =  perform_cross_validation(the_Model_2, trainingData, testData, cv_param)[1]
    t2 = time.time()  
    time_for_param_comb_2 =  t2 - t1
    mae_list_2.append(mae_for_param_combo_2) 
    time_list_2.append(time_for_param_comb_2)    
    
    
    
  mae_a12_ = a12_utility.doSlowA12(mae_list_1, mae_list_2)    
  time_a12_ = a12_utility.doSlowA12(time_list_2, time_list_1)   
  print "MAE  comaprison: is default worse than 'best combo' ?", mae_a12_  
  print "time comaprison: is 'best' combo slower than default ?", time_a12_      



# Script entry: run the four classifier timing tests (RF, KNN, SVM, CART)
# on the 13-cluster non-zero dataset, timing each with IO_.duration().
datasetFileName="13_NonZeroDataset_Aggolo.csv"
iterations=10000
cv_param = 5  # cross-validation folds
print "========== Random Forest =========="
with IO_.duration():
  runRFTest(datasetFileName, iterations, cv_param)
print "========== KNN =========="  
with IO_.duration():
  runknnTest(datasetFileName, iterations, cv_param) 
print "========== SVM =========="    
with IO_.duration():
  runsvmTest(datasetFileName, iterations, cv_param) 
print "========== CART =========="    
with IO_.duration():
  runCARTTest(datasetFileName, iterations, cv_param)   
def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag):
    """Experiment #3 (duplicate of the earlier definition): cluster non-zero
    vulnerability scores into 13 clusters and dump labelled versions to CSV.

    clusterFlag -- True -> KMeans(13), False -> AgglomerativeClustering(13).
    NOTE(review): the progress message says "two clusters" but n_clusters=13
    -- confirm which is intended.
    """
    from sklearn import cluster
    import plotter
    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=13)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=13)

    print "Performing experiemnt # 3: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(
        versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    #print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    #print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(
        sanitizedVersions)
    #print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    ### dyumping scores ...

    # brokenDict[0] = version ids, brokenDict[1] = their vulnerability scores
    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[
        0], brokenDict[1]
    #print "lalalaa ", onlyTheNonZeroSanitizedVScores

    #strOfScoresToDump=""
    #for elem in onlyTheNonZeroSanitizedVScores:
    #  strOfScoresToDump = strOfScoresToDump + str(elem) +  "," + "\n"

    ###
    #IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)

    # single-feature column vector for the clusterer
    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores,
                                               (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
        ##### plotting clusters start
        #low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore)
        #low_cluster_x = [ 22.35294118 for x in low_cluster_y]
        #hig_cluster_x = [ 50.82030058 for x in high_cluster_y]
        #plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y)
        ##### plottign clusters end
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
    print "And the labels are .... "
    print len(labelsFroVersions)
    # NOTE(review): fit_predict re-fits the model a second time just to
    # compute the silhouette score.
    cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores,
                                      cluster_labels)
    print "Silhouette average---> ", silhouette_avg

    ##############################
    themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ,
                                 NonZer_Santized_versionDictWithLabels, False)
def runRandomForest(trainDataParam, testDataParam):
  res_combo_dict ={}  
#  ### setting the aprameters 
  n_estimators_list=[500]
  #n_estimators_list=[10, 50, 100, 500]
  criterion_list = ['gini', 'entropy']
  max_features_list=['auto', 'sqrt', 'log2', None]
  max_depth_list = [5, 15,  50, None ]
  max_leaf_nodes_list = [None,  25, 50, 75] # in our datset only 549 legit samples so should eb limited to 549 
  bootstrap_list=[True, False] 
  min_samples_split_list = [1, 25, 50,  100] # in our datset only 549 legit samples so should eb limited to 549 
  oob_score_list=[True, False]
  min_weight_fraction_leaf_list=[0.0, 0.2, 0.3, 0.4] # must be between 0.0 and 0.50 
  warm_start_list=[True, False]
#  ###   
  
  ### setting the aprameters : test purpose 
#  n_estimators_list=[50, 50000]
#  criterion_list = ['gini', 'entropy']
#  max_features_list=['auto',  None]
#  max_depth_list = [1, 1000 ]
#  max_leaf_nodes_list = [None, 5, 1000] # in our datset only 549 legit samples so should eb limited to 549 
#  bootstrap_list=[True, False] 
#  min_samples_split_list = [1,  1000]  # in our datset only 549 legit samples so should eb limited to 549 
#  oob_score_list=[True, False]
#  min_weight_fraction_leaf_list=[0.0,  0.5] # must be between 0.0 and 0.50 
#  warm_start_list=[True, False]
  ###     
  
  for eti in n_estimators_list:
    for crit in criterion_list:
      for maxfeat in max_features_list: 
        for max_depth_ in max_depth_list:
          for max_leaf in max_leaf_nodes_list:
            for bootstrap_ in bootstrap_list:
              for min_sample in min_samples_split_list: 
                if bootstrap_==False:
                  oob_score_list=[False, False]      
                for oob_ in oob_score_list:    
                  for mwfratleaf in min_weight_fraction_leaf_list: 
                    for warm_start_ in warm_start_list:  
                      ## display params: 
                      # n_jobs  has been set to -1 to use all the cores avialable , not part fo an experiemnt 
                      print "##########"
                      print "n_estimators={}, criterion={}, max_features={}, max_dept={}, max_leaf_nodes={}".format(eti, crit, maxfeat, max_depth_, max_leaf  )
                      print "bootstrap={}, min-sample-split={}, oob_score={}, min-wt-frac={}, warm-start={}".format(bootstrap_, min_sample, oob_, mwfratleaf, warm_start_ ) 
                      key_str_1 = str(eti) + "_" + crit + "_" + str(maxfeat) + "_" + str(max_depth_) + "_" + str(max_leaf) + "_" 
                      key_str_2 = str(bootstrap_) + "_" + str(min_sample) + "_" + str(oob_) + "_" + str(mwfratleaf) + "_" +str(warm_start_) 
                      key_for_dict = key_str_1 + key_str_2 
                      ## fire up the model 
                      with IO_.duration():
                        theRndForestModel = RandomForestClassifier( 
                                                            n_estimators=eti, criterion=crit, 
                                                            max_depth=max_depth_, min_samples_split=min_sample, 
                                                            max_features=maxfeat, min_weight_fraction_leaf=mwfratleaf,  
                                                            max_leaf_nodes=max_leaf, bootstrap=bootstrap_, 
                                                            oob_score=oob_, n_jobs=-1 , warm_start=warm_start_ 
                                                            )
                        res_tuple = perform_cross_validation(theRndForestModel, trainDataParam, testDataParam, 2) 
                        res_combo_dict[key_for_dict] = res_tuple
                      print "##########" 
  return res_combo_dict