Exemplo n.º 1
0
def compareLists(defective_vals_for_feature, non_defective_vals_for_feature,
                 feature_=''):
    """Summarize and statistically compare two lists of feature values.

    Prints median/mean/count for each list, then a one-sided
    Mann-Whitney U test (defective > non-defective) and Cliff's delta.

    Args:
        defective_vals_for_feature: iterable of numeric values.
        non_defective_vals_for_feature: iterable of numeric values.
        feature_: label used in the final output line. New optional
            parameter: the original body referenced an undefined global
            `feature_`, which raised NameError at the final print.
    """
    # Materialize once instead of re-listing the inputs at every use.
    defective = list(defective_vals_for_feature)
    non_defective = list(non_defective_vals_for_feature)
    print('=' * 25)
    print("Defective values [MEDIAN]:{}, [MEAN]:{}, [COUNT]:{}".format(
        np.median(defective), np.mean(defective), len(defective)))
    print("Non Defective values [MEDIAN]:{}, [MEAN]:{}, [COUNT]:{}".format(
        np.median(non_defective), np.mean(non_defective),
        len(non_defective)))
    try:
        TS, p = stats.mannwhitneyu(defective, non_defective,
                                   alternative='greater')
    except ValueError:
        # mannwhitneyu raises ValueError when all values are identical;
        # report "no difference" in that case.
        TS, p = 0.0, 1.0
    cliffs_delta = cliffsDelta.cliffsDelta(defective, non_defective)
    print('*' * 25)
    print('Feature:{}, pee value:{}, cliffs:{}'.format(feature_, p,
                                                       cliffs_delta))
    print('*' * 25)
    print('=' * 50)
Exemplo n.º 2
0
def B_into_table():
    """Print one LaTeX-style table row per curriculum length for
    experiment B.

    For the hard-coded task 'Up3', compares ordered-curriculum accuracy
    against random-curriculum accuracy at each length in [1, 2, 5, 10]:
    mean accuracies, Cliff's delta, Wilcoxon rank-sum p-value, and the
    improvement ratio relative to the task's baseline accuracy.
    """
    dict_b = data_warehouse.dictB
    lengths = [1, 2, 5, 10]
    # namelst=['Rotate_down2','Rotate_up2','Down3','Left3','Right3','Up3']
    name = 'Up3'
    # Per-length accuracy samples, scaled to percentages.
    order_acclst = multiply_100(dict_b[name + 'order'])
    random_acclst = multiply_100(dict_b[name + 'randomwrong'])

    # Baseline (no-curriculum) accuracy per task.
    baselines = {'Rotate_down2': 88.28, 'Rotate_up2': 88.84,
                 'Down3': 71.54, 'Left3': 83.7,
                 'Right3': 83.15, 'Up3': 75.38}
    baseline = baselines[name]

    avg_ordered = []
    avg_random = []
    diffs = []
    for idx in range(len(lengths)):
        ordered_sample = order_acclst[idx]
        random_sample = random_acclst[idx]
        mean_ordered = averagenum(ordered_sample)
        mean_random = averagenum(random_sample)
        avg_ordered.append(mean_ordered)
        avg_random.append(mean_random)
        delta, _magnitude = cliffsDelta.cliffsDelta(ordered_sample,
                                                    random_sample)
        _stat, p_value = st.ranksums(random_sample, ordered_sample)
        print('%.2f&%.2f(%.2f&%.4f)&%.2f' % (
            mean_random, mean_ordered, delta, p_value,
            (mean_ordered - baseline) / (mean_random - baseline)))
        diffs.append(mean_ordered - mean_random)
    return
Exemplo n.º 3
0
def A_into_table_discuss():
    """Print a single table row for the discussion of experiment A.

    For the hard-coded task 'Up3', compares one ordered-curriculum
    accuracy sample against one random-curriculum sample: mean
    accuracies, Cliff's delta, rank-sum p-value and an improvement
    ratio relative to the task's baseline accuracy.

    NOTE(review): this compares order_acclst[1] against
    random_acclst[2] — different indices, presumably two different
    curriculum lengths for the discussion section; confirm this
    asymmetry is intentional.
    """
    dictA = data_warehouse.dictA
    # Curriculum lengths (only their count matters elsewhere; unused here).
    lengthlst=[1,2,5,10]
    #namelst=['Rotate_down2','Rotate_up2','Down2','Down3','Left3','Right3','Rotate_left2','Up3']
    # Baseline (no-curriculum) accuracy per task.
    origin_dct={'Rotate_down2':81.81,'Rotate_up2':84.98,'Down2':84.78,'Down3':61.91,
                'Left3':72.42,'Right3':82.09,'Rotate_left2':84.04,'Up3':68.59}
    
    name='Up3'
    order_acclst=dictA[name+'order']
    #randomwrong
    #orderwrong
    random_acclst=dictA[name+'random']

    # Scale both accuracy lists to percentages.
    order_acclst= multiply_100(order_acclst)
    random_acclst= multiply_100(random_acclst)
    #print order_acclst
    #namelst=['Rotate_down2','Rotate_up2','Down2','Down3',
    #'Left3','Right3','Rotate_left2','Up3']
    
    #x =lengthlst
    y_order_acc=[]
    y_random_acc=[]
    y_diff=[]
    
    avgorder=averagenum(order_acclst[1])
    y_order_acc.append(avgorder)
    origin=origin_dct[name]
    avgrandom=averagenum(random_acclst[2])
    # Effect size and significance between the two chosen samples.
    d, res = cliffsDelta.cliffsDelta(order_acclst[1],random_acclst[2])
    sta,p_value =st.ranksums(random_acclst[2], order_acclst[1])
    y_random_acc.append(avgrandom)
    # random & ordered (delta & p) & improvement-over-baseline ratio.
    print('%.2f&%.2f(%.2f&%.4f)&%.2f'% (avgrandom, avgorder,d,p_value, (avgorder-origin)/(avgrandom-origin)))
    #print('average order_acc:%.4f'% averagenum(order_acclst[i]))
    #y_diff.append(averagenum(order_acclst[1])-averagenum(random_acclst[2]))
    return
Exemplo n.º 4
0
    def test_nonoverlapping(self):  # marco's test
        """Non-overlapping samples must give a 'large' effect with
        delta close to -1 (second sample shifted far above the first)."""
        sample_a = [10, 20, 20, 20, 30, 30, 30, 40, 50, 100]
        shift = 110
        sample_b = [v + shift for v in [10, 20, 30, 40, 40, 50]]

        d, magnitude = cliffsDelta.cliffsDelta(sample_a, sample_b)
        self.assertEqual(magnitude, 'large')
        self.assertAlmostEqual(d, -1, 2)
Exemplo n.º 5
0
    def test_tim(self):
        """Growing multiplicative gaps should move the effect size from
        negligible through small up to medium."""
        base = range(8)
        expected = ['negligible', 'negligible', 'small', 'small', 'medium']
        observed = []
        for ratio in (1.01, 1.1, 1.21, 1.5, 2):
            scaled = [v * ratio for v in base]
            _d, magnitude = cliffsDelta.cliffsDelta(base, scaled)
            observed.append(magnitude)

        self.assertEqual(expected, observed)
Exemplo n.º 6
0
    def test_small_difference(self):
        """Reference data from Marco Torchiano's R effsize package
        (https://github.com/mtorchiano/effsize)."""
        treatment = [10, 10, 20, 20, 20, 30, 30, 30, 40, 50]
        control = [10, 20, 30, 40, 40, 50]

        delta, magnitude = cliffsDelta.cliffsDelta(treatment, control)
        # R's effsize reports: delta estimate -0.25 ('small'),
        # 95% CI (-0.7265846, 0.3890062).
        self.assertEqual("small", magnitude)
        self.assertEqual(delta, -0.25)
def RQ2_stat_test(lags_dict, report_dict):
    """(RQ2) Compare update lags between the latest and the supported
    lineage, per severity level and downstream-propagation hop.

    Buckets `lags_day` values by severity ('high'/'medium'/'low'),
    hop (1..4, capped at 4) and lineage ('latest'/'supported'), prints
    a Kruskal-Wallis H test and Cliff's delta per bucket, then a
    median comparison per bucket.

    Args:
        lags_dict: vul id -> {'package': {'lineage_freshness': lineage},
            'client': [{'downstream_propagation': int,
                        'lags_day': number or None}, ...]}
        report_dict: vul id -> {'severity': 'high'|'medium'|'low'}
    """
    lags = dict()
    # Index 0 is unused; hops run 1..4 with everything deeper capped at 4.
    lags['high'] = [{'latest': [], 'supported': []} for _ in range(5)]
    lags['medium'] = [{'latest': [], 'supported': []} for _ in range(5)]
    lags['low'] = [{'latest': [], 'supported': []} for _ in range(5)]

    for vul in lags_dict:
        severity = report_dict[vul]['severity']
        lineage = lags_dict[vul]['package']['lineage_freshness']
        for client in lags_dict[vul]['client']:
            hop = client['downstream_propagation'] if client[
                'downstream_propagation'] < 4 else 4
            # Fixed: was `== None` (PEP 8 / E711); identity test intended.
            if client['lags_day'] is None:
                continue
            lags[severity][hop][lineage].append(client['lags_day'])

    print('=' * 50)
    print(
        '(RQ2) Statistical test (Kruskal-Wallis) and effect size (Cliff\'s delta) for lags in the latest lineage and the supported lineage'
    )
    print('=' * 50)
    print(
        'Severity, Down Prop., Kruskal-Wallis H statistic, p-value, effect size'
    )
    for severity in ['high', 'medium', 'low']:
        for hop in range(1, 5):
            # NOTE(review): `break` abandons all deeper hops as soon as one
            # empty 'latest' bucket appears — confirm `continue` was not
            # intended here.
            if len(lags[severity][hop]['latest']) == 0:
                break
            hop_lineage_test = stats.kruskal(lags[severity][hop]['latest'],
                                             lags[severity][hop]['supported'])
            delta, res = cliffsDelta.cliffsDelta(
                lags[severity][hop]['latest'],
                lags[severity][hop]['supported'])
            # Fourth column reports significance at alpha = 0.001.
            print('%s, %d, %f, %s, %.2f' %
                  (severity, hop, hop_lineage_test[0],
                   'True' if hop_lineage_test[1] < 0.001 else 'False', delta))
    print('=' * 50)
    for severity in ['high', 'medium', 'low']:
        for hop in range(1, 5):
            # NOTE(review): np.median of an empty bucket yields nan (with a
            # RuntimeWarning) and then falls into the 'LL < SL' branch, as
            # does exact equality — verify that is acceptable.
            mid_ll = np.median(lags[severity][hop]['latest'])
            mid_sl = np.median(lags[severity][hop]['supported'])
            if mid_ll > mid_sl:
                print(severity, hop, 'LL > SL')
            else:
                print(severity, hop, 'LL < SL')
Exemplo n.º 8
0
def performStatCompa(file_name, mon_par):
    df_ = pd.read_csv(file_name)
    print df_.head
    months = np.unique(df_['MONTH'].tolist())

    features = df_.columns
    dropcols = ['MONTH', 'FILE_NAME', 'Unnamed: 12']
    smell_names = [x_ for x_ in features if x_ not in dropcols]

    for mon_ in months:
        if mon_ == mon_par:
            mon_df = df_[df_['MONTH'] == mon_]
            for smell_name in smell_names:
                smelly_files = np.unique(
                    mon_df[mon_df[smell_name] > 0]['FILE_NAME'].tolist())
                non_smelly_files = np.unique(
                    mon_df[mon_df[smell_name] <= 0]['FILE_NAME'].tolist())

                smelly_loc_list = [
                    sum(1 for line_ in file_) for file_ in smelly_files
                ]
                non_smelly_loc_list = [
                    sum(1 for line_ in file_) for file_ in non_smelly_files
                ]

                print '=' * 50
                print "Smelly Size [MEDIAN]:{}, [MEAN]:{}".format(
                    np.median(smelly_loc_list), np.mean(smelly_loc_list))
                print "Non Smelly Size [MEDIAN]:{}, [MEAN]:{}".format(
                    np.median(non_smelly_loc_list),
                    np.mean(non_smelly_loc_list))
                if (np.mean(smelly_loc_list) != np.mean(non_smelly_loc_list)):
                    TS, p = stats.mannwhitneyu(smelly_loc_list,
                                               non_smelly_loc_list,
                                               alternative='greater')
                else:
                    TS, p = 0.0, 1.0
                cliffs_delta = cliffsDelta.cliffsDelta(smelly_loc_list,
                                                       non_smelly_loc_list)
                print 'Feature:{}, pee value:{}, cliffs:{}'.format(
                    smell_name, p, cliffs_delta)
                print '=' * 50
Exemplo n.º 9
0
def statisticalTest(scoreDict):
    """Run Mann-Whitney U tests and Cliff's deltas for rule- vs.
    cluster-based scores and write both result tables to Excel.

    Relies on module-level `results_folder`, `iters` and `subSample`
    for the output file names.

    Args:
        scoreDict: maps score names (e.g. 'rule-db-mcc') to sequences
            of per-run scores.
    """
    pairs = {
        'db-mcc' : ('rule-db-mcc', 'cluster-db-mcc'),
        'db-precision' : ('rule-db-precision', 'cluster-db-precision'),
        'tma-mcc' : ('rule-tma-mcc', 'cluster-tma-mcc'),
        'tma-precision' : ('rule-tma-precision', 'cluster-tma-precision'),
        'im-mcc' : ('rule-im-mcc', 'cluster-im-mcc'),
        'im-precision' : ('rule-im-precision', 'cluster-im-precision'),
    }
    uDict = {}
    effectDict = {}

    for name, pair in pairs.items():
        stat, p = mannwhitneyu(np.array(scoreDict[pair[0]]), np.array(scoreDict[pair[1]]))
        uDict[name] = (stat, p)
        effectsize, res = cliffsDelta.cliffsDelta(scoreDict[pair[0]], scoreDict[pair[1]])
        effectDict[name] = (effectsize, res)

    # Fixed: `to_excel` returns None, so the original `uDf = ...` and
    # `effectDf = ...` assignments were dead bindings; keep only the
    # side effect of writing the files.
    pd.DataFrame(data=uDict).T.to_excel(
        os.path.join(results_folder,
                     f'utestresults{iters}iters{subSample}percent.xlsx'))
    pd.DataFrame(data=effectDict).T.to_excel(
        os.path.join(results_folder,
                     f'effectresults{iters}iters{subSample}percent.xlsx'))
Exemplo n.º 10
0
                                             1][feature_]
        non_defective_vals_for_feature = df2read[df2read['SMELL_FLAG'] ==
                                                 0][feature_]
        '''
           summary time
           '''
        print 'THE FEATURE IS:', feature_
        print '=' * 25
        print "Smelly file values [MEDIAN]:{}, [MEAN]:{}".format(
            np.median(list(defective_vals_for_feature)),
            np.mean(list(defective_vals_for_feature)))
        print "Non smelly values [MEDIAN]:{}, [MEAN]:{}".format(
            np.median(list(non_defective_vals_for_feature)),
            np.mean(list(non_defective_vals_for_feature)))

        if feature_ == 'OWNER_LINES':
            TS, p = stats.mannwhitneyu(list(non_defective_vals_for_feature),
                                       list(defective_vals_for_feature),
                                       alternative='greater')
        else:
            TS, p = stats.mannwhitneyu(list(defective_vals_for_feature),
                                       list(non_defective_vals_for_feature),
                                       alternative='greater')

        cliffs_delta = cliffsDelta.cliffsDelta(
            list(defective_vals_for_feature),
            list(non_defective_vals_for_feature))
        print 'Feature:{}, P:{}, cliffs:{}'.format(feature_, p, cliffs_delta)
        print '=' * 50
    print '*' * 100
print "Ended at:", giveTimeStamp()
Exemplo n.º 11
0
      df2read['defect_status'] == 0][feature_]
  '''
 summary time
 '''
  print "Defective values stats: \n", defective_vals_for_feature.describe(
  )
  print "Non defective values stats: \n", non_defective_vals_for_feature.describe(
  )
  def_val, non_def_val = list(
      defective_vals_for_feature), list(
          non_defective_vals_for_feature)
  if ((np.unique(def_val) == np.unique(non_def_val)
       ) == False):
      TS_g, p_g = stats.mannwhitneyu(
          def_val, non_def_val, alternative='greater')
      cliffs_delta_g = cliffsDelta.cliffsDelta(
          def_val, non_def_val)
      print '[GREATER:TEST] pee value:{}, cliffs:{}'.format(
          p_g, cliffs_delta_g)
      print '=' * 50
      TS_l, p_l = stats.mannwhitneyu(def_val,
                                     non_def_val,
                                     alternative='less')
      cliffs_delta_l = cliffsDelta.cliffsDelta(
          def_val, non_def_val)
      print '[SMALLER:TEST] pee value:{}, cliffs:{}'.format(
          p_l, cliffs_delta_l)
      print '=' * 50
  '''
 all data summary
 '''
  data_for_feature = df2read[feature_]
# Reputation of authors whose gists have no smell at all.
# (Relies on `gists` and `authorsOfSmellyGists` defined earlier in the file.)
authorsOfCleanGists = [
    x.reputation for x in gists if x.withAtleastOneSmell == False
]

# Two-sided Mann-Whitney U: do smelly-gist and clean-gist authors differ
# in reputation?
stat, p = mannwhitneyu(authorsOfSmellyGists, authorsOfCleanGists)

alpha = 0.05
if p > alpha:
    print('Two sets of authors are same')
else:
    print(
        'Authors with at least one smell in gists are different than authors with no smell in gists'
    )

# Report the effect size (delta, magnitude) of any difference.
print('however, this difference is, ', end='')
print(cliffsDelta.cliffsDelta(authorsOfCleanGists, authorsOfSmellyGists))

# Hard-coded precision/recall numbers from a manual inspection of the
# GitHub Gist sample, one row per security smell.
print('=== GITHUB GIST MANUAL INSPECTION ===')
pt2 = PrettyTable()

pt2.field_names = ['smell', 'occurence', 'precision', 'recall']
pt2.add_row(['shell_injection', '86', '1.00', '1.00'])
pt2.add_row(['assert_used', '79', '1.00', '1.00'])
pt2.add_row(['empty_password', '2', '1.00', '1.00'])
pt2.add_row(['exec_used', '4', '1.00', '1.00'])
pt2.add_row(['debug_true', '2', '1.00', '1.00'])
pt2.add_row(['hardcoded_interface', '3', '1.00', '1.00'])
pt2.add_row(['hardcoded_secret', '31', '0.94', '0.91'])
pt2.add_row(['hardcoded_sql', '3', '1.00', '1.00'])
pt2.add_row(['hardcoded_tmp', '3', '1.00', '1.00'])
pt2.add_row(['no_integrity_check', '7', '0.43', '0.75'])
Exemplo n.º 13
0
 def test_negligible(self):  #Marco
     # Marco Torchiano's effsize example: heavily overlapping samples
     # whose Cliff's delta is about -0.0667 (a negligible effect).
     x1 = [10, 20, 20, 20, 30, 30, 30, 40, 50, 100]
     x2 = [10, 20, 30, 40, 40, 50]
     d, res = cliffsDelta.cliffsDelta(x1, x2)
     self.assertAlmostEqual(-0.06667, d, 4)