def compute_pearson_and_spearman_r(A, B, n_pool, n_test):
    assert A.shape[0] == n_pool + n_test
    A_diag = np.diag(A)[:n_pool].tolist()
    B_diag = np.diag(B)[:n_pool].tolist()

    A_pool_test = A[:n_pool][:, n_pool:]
    B_pool_test = B[:n_pool][:, n_pool:]
    A_offdiag = np.reshape(A_pool_test, -1).tolist()
    B_offdiag = np.reshape(B_pool_test, -1).tolist()

    pr_diag, pr_diag_p = pr(A_diag, B_diag)
    pr_offdiag, pr_offdiag_p = pr(A_offdiag, B_offdiag)

    spr_diag, spr_diag_p = spr(A_diag, B_diag)
    spr_offdiag, spr_offdiag_p = spr(A_offdiag, B_offdiag)
    return pr_diag, pr_offdiag, spr_diag, spr_offdiag, pr_diag_p, pr_offdiag_p, spr_diag_p, spr_offdiag_p
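
# --- Usage sketch (not part of the original snippet) ---
# A minimal, hypothetical call to compute_pearson_and_spearman_r. It assumes
# that np, pr and spr are numpy, scipy.stats.pearsonr and scipy.stats.spearmanr,
# as their usage above suggests; the random matrices merely stand in for the
# (n_pool + n_test) x (n_pool + n_test) score matrices the function expects.
import numpy as np
from scipy.stats import pearsonr as pr, spearmanr as spr

_n_pool, _n_test = 8, 4
_rng = np.random.default_rng(0)
_A = _rng.random((_n_pool + _n_test, _n_pool + _n_test))
_B = _A + 0.1 * _rng.random(_A.shape)
print(compute_pearson_and_spearman_r(_A, _B, _n_pool, _n_test))
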
def correlation(df):
    corrp1 = np.corrcoef(df['Spots Number'], df['Flares Number'])
    cp1 = 'The Pearson correlation between the number of sunspots and the number of flares is {:.2f}'.format(
        corrp1[0][1])
    #print (cp1)
    corrp2 = np.corrcoef(df['Spots Month Mean'], df['F Month Mean'])
    cp2 = 'The Pearson correlation between the monthly mean of sunspots and of flares is {:.2f}'.format(
        corrp2[0][1])
    #print (cp2)
    corrp3 = np.corrcoef(df['Spots Month Mean'], df['F Month Maximum'])
    cp3 = 'The Pearson correlation between the monthly mean of sunspots and the monthly maximum of flares is {:.2f}'.format(
        corrp3[0][1])
    #print (cp3)

    corrs1 = spr(df['Spots Number'], df['Flares Number'])
    cs1 = 'The Spearman correlation between the number of sunspots and the number of flares is {:.2f}'.format(
        corrs1[0])
    #print (cs1)
    corrs2 = spr(df['Spots Month Mean'], df['F Month Mean'])
    cs2 = 'The Spearman correlation between the monthly mean of sunspots and of flares is {:.2f}'.format(
        corrs2[0])
    #print (cs2)
    corrs3 = spr(df['Spots Month Mean'], df['F Month Maximum'])
    cs3 = 'The Spearman correlation between the monthly mean of sunspots and the monthly maximum of flares is {:.2f}'.format(
        corrs3[0])
    #print (cs3)

    corrk1 = kdl(df['Spots Number'], df['Flares Number'])
    ck1 = 'The Kendall correlation between the number of sunspots and the number of flares is {:.2f}'.format(
        corrk1[0])
    #print (ck1)
    corrk2 = kdl(df['Spots Month Mean'], df['F Month Mean'])
    ck2 = 'The Kendall correlation between the monthly mean of sunspots and of flares is {:.2f}'.format(
        corrk2[0])
    #print (ck2)
    corrk3 = kdl(df['Spots Month Mean'], df['F Month Maximum'])
    ck3 = 'The Kendall correlation between the monthly mean of sunspots and the monthly maximum of flares is {:.2f}'.format(
        corrk3[0])
    #print (ck3)

    return [cp1, cp2, cp3, cs1, cs2, cs3, ck1, ck2, ck3]
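
# --- Usage sketch (not part of the original snippet) ---
# correlation() expects a DataFrame with the columns referenced above and the
# aliases np, spr and kdl; this assumes they are numpy, scipy.stats.spearmanr
# and scipy.stats.kendalltau. The sunspot/flare numbers below are made up.
import numpy as np
import pandas as pd
from scipy.stats import spearmanr as spr, kendalltau as kdl

_demo_df = pd.DataFrame({
    'Spots Number': [10, 30, 55, 80, 120],
    'Flares Number': [1, 4, 9, 15, 22],
    'Spots Month Mean': [12.0, 28.5, 50.1, 77.3, 110.8],
    'F Month Mean': [0.8, 3.2, 8.1, 14.6, 20.9],
    'F Month Maximum': [2, 7, 15, 25, 40],
})
for _line in correlation(_demo_df):
    print(_line)
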
def evaluate_spearman_corr(gt, pred, ref='TotalScore'):
  """Spearman’s ρ rank correlation statistic between features and gt aesthetic score.
  First we get common elements to compare and order them by image file name.
  ref: either 'aesthetic score' if its mlsp method or 'TotalScore' if it's AADB dataset
  """
  imgs_to_eval = list(set(gt.ImageFile) & set(pred.ImageFile))
  gt = gt[gt['ImageFile'].isin(imgs_to_eval)].sort_values(by=['ImageFile'])
  pred = pred[pred['ImageFile'].isin(imgs_to_eval)].sort_values(by=['ImageFile'])

  for col in pred.columns[1:]:
    attr_gt = gt.loc[:,ref] 
    attr_pred = pred.loc[:,col]
    rho,pval = spr(attr_gt,attr_pred)
    print("{}: rho: {} at p value: {}".format(col, rho, pval))    
def plot_corr(gt, pred, name, ref='TotalScore'):
  """Plot datapoints and regression to see correlation between features and gt aesthetic score.
  First we get common elements to compare and order them by image file name.
  ref: either 'aesthetic score' if its mlsp method or 'TotalScore' if it's AADB dataset
  """
  imgs_to_eval = list(set(gt.ImageFile) & set(pred.ImageFile))
  gt = gt[gt['ImageFile'].isin(imgs_to_eval)].sort_values(by=['ImageFile'])
  pred = pred[pred['ImageFile'].isin(imgs_to_eval)].sort_values(by=['ImageFile'])

  for col in pred.columns[1:]:
    attr_gt = gt.loc[:,ref] 
    attr_pred = pred.loc[:,col]
    df = pd.DataFrame({"AestheticScore": attr_gt, col: attr_pred})
    sns_scatter_plot = sns.jointplot(x="AestheticScore", y=col, data=df, kind="reg")
    rho, pval = spr(attr_gt,attr_pred)
    sns_scatter_plot.savefig("./eval/" + name + "_" + col + "_" + "{:.4f}".format(rho) + ".png")
    plt.close()
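
# --- Usage sketch (not part of the original snippets) ---
# Both helpers expect DataFrames whose first column is 'ImageFile'; the
# ground-truth frame also needs the `ref` column ('TotalScore' by default).
# This assumes pd, sns, plt and spr are pandas, seaborn, matplotlib.pyplot and
# scipy.stats.spearmanr, and that an ./eval directory exists before plot_corr
# is called. The scores below are invented.
import pandas as pd

_gt = pd.DataFrame({'ImageFile': ['a.jpg', 'b.jpg', 'c.jpg', 'd.jpg'],
                    'TotalScore': [0.2, 0.5, 0.7, 0.9]})
_pred = pd.DataFrame({'ImageFile': ['a.jpg', 'b.jpg', 'c.jpg', 'd.jpg'],
                      'ColorHarmony': [0.1, 0.4, 0.8, 0.6]})
evaluate_spearman_corr(_gt, _pred)
# plot_corr(_gt, _pred, name='demo')  # would write ./eval/demo_ColorHarmony_<rho>.png
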
Example #5
                            #                         try:
                            df_dwd2 = transform_to_bools(
                                df_dwd2, percentile_level)
                            df_rea2 = transform_to_bools(
                                df_rea2, percentile_level)
    #                         except Exception as msg:
    #                             print(msg)
                        cmn_vals1 = df_dwd1.loc[cmn_idx].values.ravel()
                        cmn_vals2 = df_dwd2.loc[cmn_idx].values.ravel()

                        cmn_rea1 = df_rea1.loc[cmn_idx].values.ravel()
                        cmn_rea2 = df_rea2.loc[cmn_idx].values.ravel()
                        #                 np.nansum(df_dwd1)
                        #                 df_dwd1.max()
                        try:
                            spr_corr = spr(cmn_vals1, cmn_vals2)[0]
                            prs_corr = prs(cmn_vals1, cmn_vals2)[0]
                            sep_dist = distance_sorted[ix2]

                            spr_corr_rea = spr(cmn_rea1, cmn_rea2)[0]
                            prs_corr_rea = prs(cmn_rea1, cmn_rea2)[0]
            #             sep_dist_rea = distance_sorted[ix2]
                        except Exception as msg:
                            print(msg)

                        if np.isnan(spr_corr):
                            print('corr_is_nan')
                        df_distance_corr.loc[stn_id,
                                             'sep_dist_%s' % _id2] = sep_dist
                        df_distance_corr.loc[stn_id,
                                             'pears_corr_%s' % _id2] = prs_corr
                        df_distance_corr.loc[stn_id,
                                             'spear_corr_%s' % _id2] = spr_corr
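
# --- Sketch of an undefined helper (not part of the original snippet) ---
# transform_to_bools is called above but not defined in this fragment. From
# its usage it plausibly turns a precipitation series into a 0/1 indicator
# series by thresholding at the given percentile; a minimal guess, assuming
# percentile_level is given in percent (e.g. 99):
def transform_to_bools(df, percentile_level):
    # mark values above the per-column percentile threshold with 1, else 0
    thr = df.quantile(percentile_level / 100.0)
    return (df > thr).astype(int)
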
Example #6
import pandas as pd
from scipy.stats import spearmanr as spr
from sys import argv

script,prefix = argv
prefix = str(prefix)
# example Usage
# python getCorrelation.py imgListTestNewRegression_

groundTruth = pd.read_csv(prefix+'.csv')
predictAttr = pd.read_csv(prefix+'predict.csv')
assert (groundTruth.columns == predictAttr.columns).all()
assert groundTruth.shape == predictAttr.shape
assert pd.Series.equals(groundTruth.ImageFile,predictAttr.ImageFile)

for col in groundTruth.columns[1:]:
    attrGT = groundTruth.loc[:,col]
    attrP = predictAttr.loc[:,col]
    rho,pval = spr(attrGT,attrP)
    print "For {} rho: {} at p value: {}".format(col,rho,pval)

Example #7
import os
import sys
from scipy.stats import spearmanr as spr
import numpy as np
#SCIsdatabase = '~/IQA_CNN/SCIs/DistortedImages'
for x in range(10,100):
    gamma = float(x)/1000
    os.system('libsvm/svm-scale -l -1 -u 1 -s allrange live_train.txt > train_scale')
    os.system('libsvm/svm-train  -s 3 -g {0} -c 2048 -b 1 -q train_scale allmodel'.format(gamma))
    #print str(x)

    if len(sys.argv)>1 and  sys.argv[1]=='retest':
        with open('live_test.txt') as fp :
            lines = fp.readlines()
            lines = map(lambda x : x.replace('\n',''),lines)
            image_info = map(lambda x : x.split(' '),lines)
            fp.close()
        os.system('rm live_test_score.txt')
        cmds = map(lambda x : './brisquequality -im '+x[0]+' >> live_test_score.txt',image_info)
        for i,cm in enumerate(cmds):
            print(i)
            os.system(cm)
        #map(lambda x :os.system(x),cmds)

    #image_name = map(lambda x : '_'.join(['cim'+str(x/49+1),str((x-x/49*49)/7+1),str((x-x/49*49)%7+1)])+'.bmp',range(0,980))
    test_score = np.loadtxt('live_test_score.txt')
    mos_score = np.loadtxt('live_mos.txt')
    sp_v  = spr(test_score,mos_score)
    print(sp_v)
    fp = open('live_svm_parms.txt','a')
    fp.write('{0} : spearmanr : {1}\n'.format(gamma,sp_v))
    fp.close()
Example #8
        
        dwd_pcp = dwd_hdf5_de.get_pandas_dataframe(stn_id).dropna()
        in_df_rea6_stn = in_df_rea6.loc[:, stn_id].dropna()
        
        cmn_idx = dwd_pcp.index.intersection(in_df_rea6_stn.index)
        #break

        df_dwd1 = resampleDf(dwd_pcp.loc[cmn_idx,:], temp_agg)
        df_rea1 = resampleDf(in_df_rea6_stn.loc[cmn_idx], temp_agg)
        
        if df_dwd1.size > 0:
            if test_for_extremes:
                df_dwd1 = transform_to_bools(df_dwd1, percentile_level)
                df_rea1 = transform_to_bools(df_rea1, percentile_level)

            spr_corr_dwd_rea = spr(df_dwd1.values.ravel(), df_rea1.values.ravel())[0]
            prs_corr_dwd_rea = prs(df_dwd1.values.ravel(), df_rea1.values.ravel())[0]


        df_distance_corr.loc[stn_id,
                             'prs_corr_dwd_rea'] = prs_corr_dwd_rea
        df_distance_corr.loc[stn_id,
                             'spr_corr_dwd_rea'] = spr_corr_dwd_rea


    # all stns
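
# --- Sketch of an undefined helper (not part of the original snippet) ---
# resampleDf is called above but not defined in this fragment. Given the
# precipitation context, it likely aggregates a datetime-indexed series or
# frame to the temporal resolution temp_agg (e.g. '60min') by summing;
# a minimal guess:
def resampleDf(df, temp_agg):
    # sum precipitation over each aggregation window
    return df.resample(temp_agg).sum()
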
Example #9
        _file = '../data/testing/test.pkl'
        gt_file = 'imgListTestNewRegression_.csv'

    assert(exists(_file))
    data = joblib.load(_file)
    groundTruth = pd.read_csv(gt_file, header=0, delimiter=',')
    n = groundTruth.shape[0]
    predAtt = pd.DataFrame(index=groundTruth.index, columns=groundTruth.columns)
    x = data[0]
    y_true = data[1]

    model = model2(weights_path=weights_file)
    y_predict = model.predict(x, batch_size=batch_size, verbose=1)

    attrs = ['BalacingElements', 'ColorHarmony', 'Content', 'DoF',
             'Light', 'MotionBlur', 'Object', 'RuleOfThirds', 'VividColor', 'Repetition', 'Symmetry', 'score']
    for i, attr in enumerate(attrs):
        attr_true = y_true[attr]
        attr_predict = y_predict[i]
        rho, p_value = spr(attr_true, attr_predict)
        error = mse(attr_true, attr_predict)
        print("for {} the spr correlation: {} with p value {} and error value: {}".format(attr, rho, p_value, error))

        attr_predict = pd.Series(y_predict[i].reshape(n))
        predAtt[attr] = attr_predict.values
  
    predAtt['ImageFile'] = groundTruth['ImageFile']
    predAtt.to_csv(gt_file[0:-4]+'_predict.csv', index=False)


def compare_pws_prim_netw_indicator_correlations(args):
    '''
     For each pws station, find the neighboring prim_netw station and
     intersect both stations' time series. For the given probabilistic
     percentage threshold, find the corresponding ppt_thr from the CDF of
     each station separately, make all values boolean (1 if above the
     threshold, else 0) and calculate the Spearman rank correlation between
     the two stations.

     Add the result to a new dataframe and return it.

    '''
    (path_to_prim_netw_data_hdf5, in_prim_netw_df_coords_utm32,
     path_pws_ppt_df_hdf5, in_pws_df_coords_utm32, all_pws_ids,
     prim_netw_points_tree, prim_netw_stns_ids, df_results_correlations,
     neighbor_to_chose, val_thr_percent, min_req_ppt_vals) = args

    # get all pws and prim_netw data
    HDF5_pws = HDF5(infile=path_pws_ppt_df_hdf5)

    HDF5_prim_netw = HDF5(infile=path_to_prim_netw_data_hdf5)

    alls_stns_len = len(all_pws_ids)
    # to count number of stations

    # iterating through pws ppt stations
    for ppt_stn_id in all_pws_ids:

        print('\n**\n remaining pws stations: %d/%d**\n' %
              (alls_stns_len, len(all_pws_ids)))

        # reduce number of remaining stations
        alls_stns_len -= 1
        try:
            # read first pws station
            try:
                pws_ppt_stn1_orig = HDF5_pws.get_pandas_dataframe(ppt_stn_id)

            except Exception as msg:
                print('error reading pws', msg)

            pws_ppt_stn1_orig = pws_ppt_stn1_orig[
                pws_ppt_stn1_orig < max_ppt_thr]

            # select df with period
            pws_ppt_season = select_df_within_period(pws_ppt_stn1_orig,
                                                     start=start_date,
                                                     end=end_date)

            # drop all index with nan values
            pws_ppt_season.dropna(axis=0, inplace=True)

            if pws_ppt_season.size > min_req_ppt_vals:

                # find distance to all prim_netw stations, sort them, select
                # minimum
                (xpws, ynetamto) = (in_pws_df_coords_utm32.loc[ppt_stn_id,
                                                               'X'],
                                    in_pws_df_coords_utm32.loc[ppt_stn_id,
                                                               'Y'])

                # This finds the index of neighbours

                distances, indices = prim_netw_points_tree.query(np.array(
                    [xpws, ynetamto]),
                                                                 k=2)

                stn_2_prim_netw = prim_netw_stns_ids[
                    indices[neighbor_to_chose]]

                min_dist_ppt_prim_netw = np.round(distances[neighbor_to_chose],
                                                  2)

                if min_dist_ppt_prim_netw <= min_dist_thr_ppt:

                    # check if prim_netw station is near, select and read
                    # prim_netw stn
                    try:
                        df_prim_netw_orig = HDF5_prim_netw.get_pandas_dataframe(
                            stn_2_prim_netw)
                    except Exception as msg:
                        print('error reading prim_netw', msg)

                    df_prim_netw_orig.dropna(axis=0, inplace=True)

                    # select only data within same range
                    df_prim_netw_orig = select_df_within_period(
                        df_prim_netw_orig, pws_ppt_season.index[0],
                        pws_ppt_season.index[-1])

                    # ===============================================
                    # Check neighboring prim_netw stations
                    # ===============================================
                    # for the prim_netw station, neighboring the pws
                    # get id, coordinates and distances of prim_netw
                    # neighbor
                    (xprim_netw, yprim_netw) = (
                        in_prim_netw_df_coords_utm32.loc[stn_2_prim_netw, 'X'],
                        in_prim_netw_df_coords_utm32.loc[stn_2_prim_netw, 'Y'])

                    distances_prim_netw, indices_prim_netw = (
                        prim_netw_points_tree.query(np.array(
                            [xprim_netw, yprim_netw]),
                                                    k=5))
                    # +1 to get neighbor not same stn
                    stn_near_prim_netw = prim_netw_stns_ids[indices_prim_netw[
                        neighbor_to_chose + 1]]

                    min_dist_prim_netw_prim_netw = np.round(
                        distances_prim_netw[neighbor_to_chose + 1], 2)

                    try:
                        # read the neighboring prim_netw station

                        try:
                            df_prim_netw_ngbr = HDF5_prim_netw.get_pandas_dataframe(
                                stn_near_prim_netw)
                        except Exception as msg:
                            print('error reading prim_netw', msg)

                        df_prim_netw_ngbr.dropna(axis=0, inplace=True)
                        # select only data within same range
                        df_prim_netw_ngbr = select_df_within_period(
                            df_prim_netw_ngbr, pws_ppt_season.index[0],
                            pws_ppt_season.index[-1])
                    except Exception:
                        raise Exception

                    # calculate Indicator correlation between
                    # prim_netw-prim_netw
                    if min_dist_prim_netw_prim_netw < min_dist_thr_ppt:

                        cmn_idx = pws_ppt_season.index.intersection(
                            df_prim_netw_ngbr.index).intersection(
                                df_prim_netw_orig.index)

                        if cmn_idx.size > min_req_ppt_vals:

                            df_prim_netw_cmn_season = df_prim_netw_orig.loc[
                                cmn_idx, :]

                            df_pws_cmn_season = pws_ppt_season.loc[cmn_idx, :]

                            df_prim_netw_ngbr_season = df_prim_netw_ngbr.loc[
                                cmn_idx, :]

                            assert (df_prim_netw_cmn_season.isna().sum().
                                    values[0] == 0)
                            assert (
                                df_pws_cmn_season.isna().sum().values[0] == 0)
                            assert (df_prim_netw_ngbr_season.isna().sum().
                                    values[0] == 0)

                            #======================================
                            # select only upper tail of values of both dataframes
                            #======================================
                            val_thr_float = val_thr_percent / 100
                            # this will calculate the EDF of pws
                            # station
                            pws_cdf_x, pws_cdf_y = get_cdf_part_abv_thr(
                                df_pws_cmn_season.values.ravel(), -0.1)
                            # find ppt value corresponding to quantile
                            # threshold
                            pws_ppt_thr_per = pws_cdf_x[np.where(
                                pws_cdf_y >= val_thr_float)][0]

                            # this will calculate the EDF of prim_netw
                            # station
                            prim_netw_cdf_x, prim_netw_cdf_y = get_cdf_part_abv_thr(
                                df_prim_netw_cmn_season.values.ravel(), -0.1)

                            # find ppt value corresponding to quantile
                            # threshold
                            prim_netw_ppt_thr_per = prim_netw_cdf_x[np.where(
                                prim_netw_cdf_y >= val_thr_float)][0]

                            #                         print('\n****transform values to booleans*****\n')
                            # if Xi > Ppt_thr then 1 else 0
                            df_pws_cmn_Bool = (df_pws_cmn_season >
                                               pws_ppt_thr_per).astype(int)

                            df_prim_netw_cmn_Bool = (
                                df_prim_netw_cmn_season >
                                prim_netw_ppt_thr_per).astype(int)

                            # calculate spearman correlations of booleans 1, 0

                            bool_spr_corr = np.round(
                                spr(df_prim_netw_cmn_Bool.values.ravel(),
                                    df_pws_cmn_Bool.values.ravel())[0], 2)

                            #======================================
                            # select only upper tail both dataframes
                            #=====================================

                            prim_netw2_cdf_x, prim_netw2_cdf_y = (
                                get_cdf_part_abv_thr(
                                    df_prim_netw_ngbr_season.values, -0.1))

                            # get prim_netw2 ppt thr from cdf
                            prim_netw2_ppt_thr_per = prim_netw2_cdf_x[np.where(
                                prim_netw2_cdf_y >= val_thr_float)][0]

                            df_prim_netw2_cmn_Bool = (
                                df_prim_netw_ngbr_season >
                                prim_netw2_ppt_thr_per).astype(int)

                            # calculate spearman correlations of booleans
                            # 1, 0

                            bool_spr_corr_prim_netw = np.round(
                                spr(df_prim_netw_cmn_Bool.values.ravel(),
                                    df_prim_netw2_cmn_Bool.values.ravel())[0],
                                2)

                            # check if the indicator correlation between
                            # pws and prim_netw is higher than between
                            # prim_netw and its prim_netw neighbour;
                            # if yes, keep the pws station

                            if True:
                                # bool_prs_corr >= bool_spr_corr_prim_netw:

                                print('+++keeping pws+++')

                                #==================================
                                # append the result to df_correlations
                                #==================================
                                #                                     df_results_correlations.loc[
                                #                                         ppt_stn_id,
                                #                                         'lon'] = lon_stn_pws
                                #                                     df_results_correlations.loc[
                                #                                         ppt_stn_id,
                                #                                         'lat'] = lat_stn_pws
                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'Distance to neighbor'] = min_dist_ppt_prim_netw

                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'prim_netw neighbor ID'] = stn_2_prim_netw

                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'prim_netw-prim_netw neighbor ID'] = stn_near_prim_netw

                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'Distance prim_netw-prim_netw neighbor'] = min_dist_prim_netw_prim_netw

                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'pws_%s_Per_ppt_thr' %
                                    val_thr_percent] = pws_ppt_thr_per

                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'prim_netw_%s_Per_ppt_thr' %
                                    val_thr_percent] = prim_netw_ppt_thr_per

                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'Bool_Spearman_Correlation_pws_prim_netw'] = bool_spr_corr
                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'Bool_Spearman_Correlation_prim_netw_prim_netw'] = bool_spr_corr_prim_netw
                            else:
                                pass
#                                 print('---Removing pws---')
#
#                                 df_results_correlations.loc[
#                                     ppt_stn_id,
#                                     'Bool_Pearson_Correlation_pws_prim_netw'
#                                 ] = bool_prs_corr
#                                 df_results_correlations.loc[
#                                     ppt_stn_id,
#                                     'Bool_Pearson_Correlation_prim_netw_prim_netw'
#                                 ] = bool_prs_corr_prim_netw

                        else:
                            print('not enough data')
    #                         print('\n********\n ADDED DATA TO DF RESULTS')
                    else:
                        pass
                        # print('After intersecting dataframes not enough data')
                else:
                    pass
                    # print('prim_netw Station is near but not enough data')
            else:
                pass
                # print('\n********\n prim_netw station is not near')

        except Exception as msg:
            print('error while finding neighbours ', msg)

            continue

    df_results_correlations.dropna(how='all', inplace=True)

    return df_results_correlations
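
# --- Illustration (not part of the original code) ---
# A compact, self-contained version of the indicator-correlation step that
# compare_pws_prim_netw_indicator_correlations applies to each station pair:
# take each series' precipitation value at the chosen upper-tail percentile,
# turn both series into 0/1 exceedance indicators and compute Spearman's rho.
# It stands in for the undefined helpers (get_cdf_part_abv_thr and friends)
# and only sketches the idea, not the project's actual implementation.
import numpy as np
from scipy.stats import spearmanr as spr

def indicator_spearman(vals_a, vals_b, val_thr_percent):
    vals_a = np.asarray(vals_a, dtype=float)
    vals_b = np.asarray(vals_b, dtype=float)
    thr_a = np.nanpercentile(vals_a, val_thr_percent)
    thr_b = np.nanpercentile(vals_b, val_thr_percent)
    bool_a = (vals_a > thr_a).astype(int)
    bool_b = (vals_b > thr_b).astype(int)
    return spr(bool_a, bool_b)[0]
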