def predict(self, clf, train=None, test=None, binary=False):
    """Make predictions for the ordinal or binary task."""
    predictions = clf.predict(test.X)
    if not binary:
        predictions = self.rescale_predictions(clf, train.y,
                                               clf.predict(train.X),
                                               predictions)
    test_y = test.y
    if binary:
        prob_predict = clf.predict_proba(test.X)
        test_y = self.binarize(test.y)
        print('accuracy (bin) = ', accuracy_score(predictions, test_y),
              file=sys.stderr)
        print('tau (bin) = ', kendalltau(predictions, test_y),
              file=sys.stderr)
        # correlation for the ordinal task
        print('r (ord) = ', pearsonr(prob_predict[:, 0], test.y),
              file=sys.stderr)
        print('tau (ord) = ', kendalltau(prob_predict[:, 0], test.y),
              file=sys.stderr)
    else:
        print('r (ord) = ', pearsonr(predictions, test_y), file=sys.stderr)
        print('tau (ord) = ', kendalltau(predictions, test_y),
              file=sys.stderr)
        # score for the binary task
        print('accuracy (bin) = ',
              accuracy_score(self.binarize(predictions), self.binarize(test_y)),
              file=sys.stderr)
        print('tau (bin) = ',
              kendalltau(self.binarize(predictions), self.binarize(test_y)),
              file=sys.stderr)
    for a, b in zip(test_y, predictions):
        print(a, b)
Example #2
def npccf(x, y, method="spearmanr", min_lag=-10, max_lag=10):
    """ Compute cross correlation of time series x and y from min_lag to max_lag 
    (based on nonparametric correlation). r(lag) = corr(x[t-lag], y[t]).
    
    Parameters
    ----------
    x: time series
    y: time series
    method: "spearmanr" or "kendalltau"
    min_lag : int, default -10
    max_lag : int, default 10

    Returns
    -------
    a dictionary with keys "corrs" (correlation coefficient corresponding to the lags),
    "lags" (corresponding lags), "lb" (lower bound) and "ub" (upper bound).
    """
    n1 = len(x)
    n2 = len(y)
    assert (n1 == n2
            ), "The length of time series x and time series y must be equal!"
    assert (min_lag <= max_lag), "min_lag must be less than or equal to max_lag!"
    nlags = max_lag - min_lag + 1
    corrs = np.empty(nlags)
    if method == "spearmanr":
        for k, lag in enumerate(range(min_lag, (max_lag + 1))):
            if lag == 0:
                corrs[k] = spearmanr(x, y)[0]
            if lag < 0:
                corrs[k] = spearmanr(x[(-lag):], y[:lag])[0]
            if lag > 0:
                corrs[k] = spearmanr(x[:(-lag)], y[lag:])[0]
    elif method == "kendalltau":
        for k, lag in enumerate(range(min_lag, (max_lag + 1))):
            if lag == 0:
                corrs[k] = kendalltau(x, y)[0]
            if lag < 0:
                corrs[k] = kendalltau(x[(-lag):], y[:lag])[0]
            if lag > 0:
                corrs[k] = kendalltau(x[:(-lag)], y[lag:])[0]
    else:
        raise ValueError("The method %s is not supported." % method)
    return {
        "corrs": corrs,
        "lags": range(min_lag, (max_lag + 1)),
        "lb": np.repeat(-1 / np.sqrt(n1), nlags),
        "ub": np.repeat(1 / np.sqrt(n1), nlags)
    }
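A quick way to sanity-check npccf is synthetic data with a known lag. In this sketch (assuming numpy and scipy.stats.spearmanr are imported, as the function requires) y trails x by 3 steps, so the peak correlation should land at lag = 3:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=500)
y = np.roll(x, 3) + 0.1 * rng.normal(size=500)

res = npccf(x, y, method="spearmanr", min_lag=-5, max_lag=5)
best_lag = res["lags"][int(np.argmax(res["corrs"]))]
print(best_lag)  # expected: 3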
Example #3
    def update(self, es, **kwargs):
        if es.countiter < 2:
            self.initialize(es)
            self.fit = es.fit.fit
        else:
            ft1, ft2 = self.fit[int(self.index_to_compare)], self.fit[int(np.ceil(self.index_to_compare))]
            ftt1, ftt2 = es.fit.fit[(es.popsize - 1) // 2], es.fit.fit[int(np.ceil((es.popsize - 1) / 2))]
            pt2 = self.index_to_compare - int(self.index_to_compare)
            # ptt2 = (es.popsize - 1) / 2 - (es.popsize - 1) // 2  # not in use
            s = 0
            if 1 < 3:  # always-true toggle: this branch variant is active
                s += pt2 * sum(es.fit.fit <= self.fit[int(np.ceil(self.index_to_compare))])
                s += (1 - pt2) * sum(es.fit.fit < self.fit[int(self.index_to_compare)])
                s -= es.popsize / 2.
                s *= 2. / es.popsize  # the range was popsize, is 2
            elif 11 < 3:  # compare ft with median of ftt
                s += self.index_to_compare - sum(self.fit <= es.fit.fit[es.popsize // 2])
                s *= 2 / es.popsize  # the range was popsize, is 2
            else:  # compare ftt j-index of ft
                s += (1 - pt2) * np.sign(ft1 - ftt1)
                s += pt2 * np.sign(ft2 - ftt1)
            self.s = (1 - self.c) * self.s + self.c * s
            es.sigma *= np.exp(self.s / self.damp)
        # es.more_to_write.append(10**(self.s))

        #es.more_to_write.append(10**((2 / es.popsize) * (sum(es.fit.fit < self.fit[int(self.index_to_compare)]) - (es.popsize + 1) / 2)))
        # # es.more_to_write.append(10**(self.index_to_compare - sum(self.fit <= es.fit.fit[es.popsize // 2])))
        # # es.more_to_write.append(10**(np.sign(self.fit[int(self.index_to_compare)] - es.fit.fit[es.popsize // 2])))
        if 11 < 3:  # always-false: disabled debug branch
            import scipy.stats.stats as stats
            zkendall = stats.kendalltau(list(es.fit.fit) + list(self.fit),
                                        len(es.fit.fit) * [0] + len(self.fit) * [1])[0]
            es.more_to_write.append(10**zkendall)
        self.fit = es.fit.fit
Example #4
 def get_metrics(self, y, yhat, name):
     mse = self.compute_mse(y, yhat)
     pearson = pearsonr(y, yhat)[0][0]
     kendall = kendalltau(y, yhat)[0]
     spearman = spearmanr(y, yhat)[0]
     return {"lat": self.lat, "lon": self.lon, "model": name, "mse": mse,
             "pearson": pearson, "kendall": kendall, "spearman": spearman}
Example #5
def valid(val_loader, model, args, funcs=[]):
    if not callable(getattr(model, "predict", None)):
        assert callable(getattr(model, "compare", None))
        corrs, funcs_res = zip(*[
            pairwise_valid(val_loader, model, pv_seed, funcs)
            for pv_seed in getattr(args, "pairwise_valid_seeds", [1, 12, 123])
        ])
        funcs_res = np.mean(funcs_res, axis=0)
        logging.info("pairwise: {}".format(corrs))
        # return np.mean(corrs), true_accs, p_scores, funcs_res
        return np.mean(corrs), funcs_res

    model.eval()
    all_scores = []
    true_accs = []
    for step, (archs, accs, _) in enumerate(val_loader):
        scores = list(model.predict(archs).cpu().data.numpy())
        all_scores += scores
        true_accs += list(accs)

    if args.save_predict is not None:
        with open(args.save_predict, "wb") as wf:
            pickle.dump((true_accs, all_scores), wf)

    corr = stats.kendalltau(true_accs, all_scores).correlation
    funcs_res = [func(true_accs, all_scores) for func in funcs]
    return corr, funcs_res
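The .correlation attribute read above comes from the result object scipy.stats.kendalltau returns; a tiny illustration:

from scipy import stats

res = stats.kendalltau([1, 2, 3, 4], [1, 3, 2, 4])
print(res.correlation, res.pvalue)  # the same values as res[0] and res[1]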
Example #6
def batch_corr(x, y, method="pearsonr", ngap=1):
    """ 
    Compute correlation on streaming sequence x and y (batch method)
    
    x: time series
    y: time series
    method: "pearsonr", "spearmanr" or "kendalltau"
    ngap: output correlation every ngap observations.
    """
    n1 = len(x)
    n2 = len(y)
    assert (n1 == n2
            ), "The length of time series x and time series y must be equal!"
    # use NaN so indexes skipped by ngap are not left as uninitialized garbage
    corrs = np.full(n1, np.nan)
    if method == "pearsonr":
        for i in range(0, len(x), ngap):
            corrs[i] = pearsonr(x[:(i + 1)], y[:(i + 1)])[0]
    elif method == "spearmanr":
        for i in range(0, len(x), ngap):
            corrs[i] = spearmanr(x[:(i + 1)], y[:(i + 1)])[0]
    elif method == "kendalltau":
        for i in range(0, len(x), ngap):
            corrs[i] = kendalltau(x[:(i + 1)], y[:(i + 1)])[0]
    else:
        raise ValueError(
            ('The method "%s" is not supported. Please specify one of '
             'the following options: "pearsonr", "spearmanr" or "kendalltau"')
            % method)
    return corrs
Example #7
def test_xp(true_scores, predict_scores):
    true_inds = np.argsort(true_scores)[::-1]
    true_scores = np.array(true_scores)
    reorder_true_scores = true_scores[true_inds]
    predict_scores = np.array(predict_scores)
    reorder_predict_scores = predict_scores[true_inds]
    ranks = np.argsort(reorder_predict_scores)[::-1]
    num_archs = len(ranks)
    # calculate precision at each point
    cur_inds = np.zeros(num_archs)
    passed_set = set()
    for i_rank, rank in enumerate(ranks):
        cur_inds[i_rank] = (cur_inds[i_rank - 1] if i_rank > 0 else 0) + \
                           int(i_rank in passed_set) + int(rank <= i_rank)
        passed_set.add(rank)
    patks = cur_inds / (np.arange(num_archs) + 1)
    THRESH = 100
    p_corrs = []
    for prec in [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]:
        k = np.where(patks[THRESH:] >= prec)[0][0] + THRESH
        arch_inds = ranks[:k][ranks[:k] < k]
        # stats.kendalltau(arch_inds, np.arange(len(arch_inds)))
        p_corrs.append(
            (k, float(k) / num_archs, len(arch_inds), prec,
             stats.kendalltau(reorder_true_scores[arch_inds],
                              reorder_predict_scores[arch_inds]).correlation))
    return p_corrs
Example #8
def correlations_weighted_unweighted(labels):
    # load network
    print('weighted vs unweighted')
    name = '_'.join(labels)
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    # read counts with zeros
    wikipedia_u = load_graph("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")
    correlations_weighted_pagerank = {}
    for label in labels:
        for damping in [0.8, 0.85, 0.9]:
            correlations_values = {}
            key_weighted = label + "_page_rank_weighted_" + str(damping)
            pagerank_weighted = wikipedia.vertex_properties[key_weighted]
            key_unweighted = "page_rank" + str(damping)
            pagerank_unweighted = wikipedia_u.vertex_properties[key_unweighted]
            print('pearson')
            p = pearsonr(pagerank_weighted.a, pagerank_unweighted.a)
            print(p)
            correlations_values['pearson'] = p
            print('spearmanr')
            s = spearmanr(pagerank_weighted.a, pagerank_unweighted.a)
            print(s)
            correlations_values['spearmanr'] = s
            print('kendalltau')
            k = kendalltau(pagerank_weighted.a, pagerank_unweighted.a)
            print(k)
            correlations_values['kendalltau'] = k
            correlations_weighted_pagerank[label + str(damping)] = correlations_values

    write_pickle(HOME+'output/correlations/correlations_pagerank_weightedvsunweighted'+name+'.obj', correlations_weighted_pagerank)
Example #9
def kendall_rank_correlation(item_item, user_user, itemknn, wrmf):
    """
    Find/display kendall_rank_correlation between each of the recommendation methods in the
    input parameters
    :param item_item: list having top k shows through item-item
    :param user_user: list having top k shows through user-user
    :param itemknn: list having top k shows through MyMediaLite standard library's itemknn method
    :param wrmf: list having top k shows through MyMediaLite standard library's wrmf method
    :return: nothing
    """
    recommend_types = [item_item, user_user, itemknn, wrmf]
    k_r_correlation = np.zeros((4, 4))
    for i in range(4):
        for j in range(4):
            if j >= i:
                k_r_correlation[i][j] = kendalltau(recommend_types[i],
                                                   recommend_types[j])[0]
            else:
                k_r_correlation[i][j] = k_r_correlation[j][i]

    print("\n*** Kendall Rank correlation coefficient ***")
    table_labels = ["Item_Item", "User_User", "ItemKNN", "WRMF"]
    print("{0:^11s}{1:^11s}{2:^11s}{3:^11s}{4:^11s}".format("", *table_labels))
    for i in range(4):
        print("{0:11s}".format(table_labels[i]), end='')
        for j in range(4):
            print("{0:^11.5f}".format(k_r_correlation[i][j]), end='')
        print()
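A minimal sketch with made-up top-5 recommendation lists (the show IDs are hypothetical); the diagonal of the printed table is always 1.0, since each list is perfectly concordant with itself:

item_item = [1, 2, 3, 4, 5]
user_user = [1, 3, 2, 4, 5]
itemknn = [5, 4, 3, 2, 1]
wrmf = [2, 1, 3, 5, 4]
kendall_rank_correlation(item_item, user_user, itemknn, wrmf)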
Example #10
def correlations_ground_truth():
    print('ground truth')
    # load network
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering.xml.gz")
    # read counts with zeros
    article_counts = pd.read_csv(TMP+'article_counts.tsv', sep='\t')
    cor = {}
    for damping in [0.8, 0.9]:
        page_rank = pagerank(wikipedia, damping=damping)
        wikipedia.vertex_properties['page_rank_'+str(damping)] = page_rank
        page_rank_values = list()
        counts = list()
        correlations_values = {}
        for index, row in article_counts.iterrows():
            counts.append(float(row['counts']))
            page_rank_values.append(page_rank[wikipedia.vertex(int(row['target_article_id']))])
        print('pearson')
        p = pearsonr(page_rank_values, counts)
        print(p)
        correlations_values['pearson'] = p
        print('spearmanr')
        s = spearmanr(page_rank_values, counts)
        print(s)
        correlations_values['spearmanr'] = s
        print('kendalltau')
        k = kendalltau(page_rank_values, counts)
        print(k)
        correlations_values['kendalltau'] = k
        cor['page_rank_'+str(damping)] = correlations_values
    write_pickle(HOME+'output/correlations/correlations_pagerank.obj', cor)
Example #11
def test_goodness(model, vocab):
    """Tests the model on its ability to create a goodness ranking for a category.
    Method: get spearman (rank) correlation between the predicted and the actual ranking.
    
    This method is using data from De Deyne et al. (2008)"""
    d = dedeyne_etal_goodness.get_goodness_rankings()
    results = {category: dict() for category in d}
    categories = set(d.keys()) & vocab
    for category in categories:
        exemplars = set(d[category]) & vocab
        sorted_exemplars = [
            b for a, b in sorted([(model.similarity(category, ex), ex) for ex in exemplars], reverse=True)
        ]
        predicted_ranking = []
        actual_ranking = []
        for exemplar in exemplars:
            actual_ranking.append(d[category].index(exemplar))
            predicted_ranking.append(sorted_exemplars.index(exemplar))
        results[category]["spearman"] = spearmanr(predicted_ranking, actual_ranking)
        results[category]["kendall"] = kendalltau(predicted_ranking, actual_ranking)
        results[category]["num_items"] = len(exemplars)
    avg_spearman = float(sum(abs(results[cat]["spearman"][0]) for cat in categories)) / len(categories)
    avg_kendall = float(sum(abs(results[cat]["kendall"][0]) for cat in categories)) / len(categories)
    results["overall"] = dict()
    results["overall"]["avg_spearman"] = avg_spearman
    results["overall"]["avg_kendall"] = avg_kendall
    return results
Example #12
def batch_corr(x, y, method="pearsonr", ngap=1):
    """ 
    Compute correlation on streaming sequence x and y (batch method)
 
    Parameters
    ----------   
    x: time series
    y: time series
    method: determin which type of correlation is computed. Accept method is 
        "pearsonr", "spearmanr" or "kendalltau"
    ngap: output correlation every ngap observations
    
    Returns
    -------
    corrs: correlations computed at selected time indexes. The selected time indexes are ngap-1, 2*ngap-1, ...
    t: selected time indexes
    """
    n1 = len(x)
    n2 = len(y)
    assert (n1 == n2),"The length of time series x and time series y must be equal!"
    corrs = np.empty(n1//ngap)
    if method == "pearsonr":
        for i in range(ngap-1, n1, ngap):
            corrs[(i+1)/ngap - 1] = pearsonr(x[:(i+1)], y[:(i+1)])[0]
    elif method == "spearmanr":
        for i in range(ngap-1, n1, ngap):
            corrs[(i+1)/ngap - 1] = spearmanr(x[:(i+1)], y[:(i+1)])[0] 
    elif method == "kendalltau":
        for i in range(ngap-1, n1, ngap):
            corrs[(i+1)/ngap -1] = kendalltau(x[:(i+1)], y[:(i+1)])[0] 
    else:
        raise ValueError(('The method "%s" is not supported. Please specify one of ' 
        'the following options: "pearsonr", "spearmanr" or "kendalltau"') % method)
    t = range(ngap-1, n1, ngap)
    return corrs, t
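Sketch usage on two related noisy series, reporting every 50 observations (assumes numpy and the scipy.stats functions the code above relies on):

import numpy as np

rng = np.random.default_rng(1)
x = rng.normal(size=300)
y = x + rng.normal(size=300)

corrs, t = batch_corr(x, y, method="pearsonr", ngap=50)
for ti, ci in zip(t, corrs):
    print(ti, round(ci, 3))  # should hover around 1/sqrt(2) ~ 0.71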
Example #13
def correlations_speed(cur, variable1, variable2, table):
    """
    Correlation of 2 variables (including scatter plot)
    """
    x = select(cur, variable1, table)
    y = select(cur, variable2, table)

    # Scatterplot
    #    mpl.style.use('ggplot')
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel("Gap")
    ax.set_ylabel("Sentiment magnitude")
    fig.suptitle('Correlation funding gap and sentiment magnitude')
    plt.scatter(x, y)
    plt.show()

    # Pearson correlation and p-value
    p_corr_speed_length = pearsonr(x, y)
    print("Pearson: ", p_corr_speed_length)

    # Spearman correlation and p-value
    s_corr_speed_length = spearmanr(x, y)
    print("Spearman: ", s_corr_speed_length)

    # Kendall correlation and p-value
    k_corr_speed_length = kendalltau(x, y)
    print("Kendall: ", k_corr_speed_length)
Example #14
def KendallsTau(GroundAvgInfs, AlgoAvgInfs, PageRanks, OutDegree):

    import numpy as np
    from scipy import stats
    
    Ground_rank = sorted(range(len(GroundAvgInfs)), key=lambda i: GroundAvgInfs[i])[-len(GroundAvgInfs):]
    Algo_rank = sorted(range(len(AlgoAvgInfs)), key=lambda i: AlgoAvgInfs[i])[-len(AlgoAvgInfs):]
    Pagerank_rank = sorted(range(len(PageRanks)), key=lambda i: PageRanks[i])[-len(PageRanks):]
    OutDeg_rank = sorted(range(len(OutDegree)), key=lambda i: OutDegree[i])[-len(OutDegree):]

    Ground_rank.reverse()
    Pagerank_rank.reverse()
    Algo_rank.reverse()
    OutDeg_rank.reverse()

    positions = list(range(453))  # hard-coded number of ranked items (453)

    dictionary = dict(zip(Ground_rank,positions))

    Ground_rankings = [dictionary[i] for i in Ground_rank]
    Page_rankings = [dictionary[i] for i in Pagerank_rank]
    Algo_rankings = [dictionary[i] for i in Algo_rank]
    OutDeg_rankings = [dictionary[i] for i in OutDeg_rank]

    '''
    Below is a full implementation of Kendall's tau, in case you want to know how it works
    concord = np.zeros(453)
    discord = np.zeros(453)

    for ii in Page_rankings:
        for jj in Page_rankings:
            if jj > ii:
                if Page_rankings[ii] < Page_rankings[jj]:
                    concord[ii] = concord[ii] + 1
                elif Page_rankings[ii] > Page_rankings[jj]:
                    discord[ii] = discord[ii] + 1
                
    concord_total = np.sum(concord)
    discord_total = np.sum(discord)
    
    kendall_tau = (concord_total-discord_total)/(concord_total+discord_total)
    '''
    AlgoTau, p_value = stats.kendalltau(Ground_rankings,Algo_rankings)
    PageTau, Pp_value = stats.kendalltau(Ground_rankings,Page_rankings)
    OutTau, Op_value = stats.kendalltau(Ground_rankings,OutDeg_rankings)
    
    return AlgoTau, PageTau, OutTau
Example #15
def get_spearman_and_kendalltau_correlations(top_n_aspects: int = 10):
    correlations = {}

    for reviews_path in settings.BING_LIU_ASPECT_DATASETS_PATHS:
        dataset_name = basename(reviews_path).split('.')[0]
        print(f'\nDataset to analyze: {dataset_name}')

        # get freq aspects from Bing Liu manually created datasets
        aspects_freq_manual_assignment = get_aspect_frequency_ranking(reviews_path=reviews_path, top_n=top_n_aspects)
        print(f'TOP{top_n_aspects} Manually extracted aspects: {aspects_freq_manual_assignment}')

        # get aspects from RST + PageRank
        aspects_from_rst_based_on_pagerank = get_aspects_rankings_from_rst(
            [
                aspects_graph_path
                for aspects_graph_path
                in ASPECTS_GRAPH_PATHS
                if dataset_name in aspects_graph_path
            ][0],
            aspects_freq_manual_assignment
        )

        aspects_from_rst_based_on_pagerank_top = get_aspect_ranking_based_on_rst_and_pagerank(
            [
                aspects_graph_path
                for aspects_graph_path
                in ASPECTS_GRAPH_PATHS
                if dataset_name in aspects_graph_path
            ][0],
            top_n_aspects
        )

        print(f'RST aspects (ranked): {aspects_from_rst_based_on_pagerank}')
        print(f'RST aspects (top {top_n_aspects}): {aspects_from_rst_based_on_pagerank_top}')

        aspects_freq_manual_assignment_ranking, aspects_from_rst_based_on_pagerank_ranking = create_rankings(
            aspects_freq_manual_assignment, aspects_from_rst_based_on_pagerank)

        spearman_correlation = stats.spearmanr(
            aspects_freq_manual_assignment_ranking, aspects_from_rst_based_on_pagerank_ranking)
        print(f'{dataset_name}, Spearman correlation of ranking: {spearman_correlation}')

        kendalltau_correlation = stats.kendalltau(
            aspects_freq_manual_assignment_ranking, aspects_from_rst_based_on_pagerank_ranking)
        print(f'{dataset_name}, Kendall Tau correlation of ranking: {kendalltau_correlation}')

        aspects_manual = set(aspects_freq_manual_assignment)
        aspects_rst = set(aspects_from_rst_based_on_pagerank_top)

        correlations[dataset_name] = {
            'Spearman Correlation': spearman_correlation[0],
            'Spearman p-value': spearman_correlation[1],
            'Kendall Tau Correlation': kendalltau_correlation[0],
            'Kendall Tau p-value': kendalltau_correlation[1],
            'Jaccard': len(aspects_manual.intersection(aspects_rst))/len(aspects_manual.union(aspects_rst)),
            'Recall': len(aspects_manual.intersection(aspects_rst)) / len(aspects_manual),
            'Precision': len(aspects_manual.intersection(aspects_rst)) / len(aspects_rst)
        }
    return correlations
Example #16
def similarity(v1, v2):
    v1 = np.array(v1)
    v2 = np.array(v2)
    pcc = pearsonr(v1, v2)[0]
    cos = cosine(v1, v2)  # scipy's cosine() is a distance: 1 - cosine similarity
    spc = spearmanr(v1, v2)[0]
    kdt = kendalltau(v1, v2)[0]
    return (pcc, cos, spc, kdt)
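Quick check with proportional vectors; since scipy's cosine() is a distance, the second element is near 0 when the vectors are aligned:

print(similarity([1, 2, 3, 4], [2, 4, 6, 8]))
# -> roughly (1.0, 0.0, 1.0, 1.0)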
Example #17
def compute_distance(self, recommendation_list_1, recommendation_list_2):
    if self.distance == 'kendalltau':
        distance, p_value = stats.kendalltau(recommendation_list_1,
                                             recommendation_list_2)
    elif self.distance == 'weighted_kendalltau':
        distance, p_value = stats.weightedtau(recommendation_list_1,
                                              recommendation_list_2)
    else:
        raise ValueError('Unsupported distance: %s' % self.distance)
    return distance
Example #18
 def similarity_tf(v1, v2):
     # _pcc, _cos, _SA, _x, _y, _len and sess are TensorFlow tensors and the
     # session created outside this snippet
     ret_pcc, ret_cos, ret_SA = sess.run([_pcc, _cos, _SA],
                                         feed_dict={
                                             _x: v1,
                                             _y: v2,
                                             _len: len(v1)
                                         })
     ret_spc = spearmanr(v1, v2)[0]
     ret_kdt = kendalltau(v1, v2)[0]
     return ret_pcc, ret_cos, ret_spc, ret_kdt, ret_SA
Example #19
def corr(x, y, method="pearsonr"):
    """ 
    Compute pearson correlation on time series x and y
    """
    if method == "pearsonr":
        return pearsonr(x, y)[0]
    elif method == "spearmanr":
        return spearmanr(x, y)[0]
    elif method == "kendalltau":
        return kendalltau(x, y)[0]
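One-line usage, assuming the scipy.stats functions are imported:

print(corr([1, 2, 3, 4], [1, 2, 4, 3], method="kendalltau"))  # ~0.667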
Example #20
    def _correlation_test(self,
                          parents=None,
                          method=0,
                          corr_alpha=0.05):

        """correlation test of X,Y
        apply pearson, spearman and kendall's tau-b correlation

        parameters
        ----------

        method               -> 0 for pearson correlation
                                1 for spearman correlation
                                2 for kendall's tau-b correlation

                                defaults to 0 (pearson); spearman is often
                                preferred for non-gaussian data.

        corr_alpha           -> significance alpha for the two-tailed
                                correlation test, default 0.05.

        attributes
        ----------

        _corr_parents        -> correlation parents. nested dict.
        """

        # initialize parents
        if parents is None:
            parents, _, _len_node = self._set_default_parents()
        # initialize correlation parents
        _corr_parents = defaultdict(dict)
        # loop over all target variables
        for i in range(self.N):
            _corr_parents[i] = list()
            # return X,Y for parents
            X, Y, _len_node = self._set_parents_matrix(self.data, i, parents)
            if _len_node == 0:
                _corr_parents[i] = []
            else:
                # calculate correlation coefficient and p value
                # for each column in X and Y
                for j in range(_len_node):
                    # corr and p value for X column and Y for three methods.
                    if method == 0:
                        corr_result = pearsonr(X[:, j], Y)
                    elif method == 1:
                        corr_result = spearmanr(X[:, j], Y)
                    elif method == 2:
                        corr_result = kendalltau(X[:, j], Y)
                    # if significant, append selected link.
                    if corr_result[1] < corr_alpha:
                        _corr_parents[i].append(parents[i][j])

        return _corr_parents
Example #21
def calculate_kendall_correlation(
        score_function,
        aggregation_function,
        path_ground_truth_list: str = PATH_GROUND_TRUTH_LIST,
        path_argument_list: str = PATH_ARGUMENT_LIST):
    """
    This function calls the score function and computes the with the results the ranking and then calculates the
    kendall tau value with the baseline ranking
    :param score_function: Functions which computes the score value like jacards similarity
    :param aggregation_function: This function collects the max, min, average oder sum value for an argument
    :param most_premises_function: If set to true score will be calculated with number of premises
    :param random_score_function: If set to true score will be drawn of uniform distribution
    :param path_ground_truth_list:
    :param path_argument_list:
    :return: kendall tau value and dictionary with tau values for all conclusions
    """
    kendall_tau_results = []
    # Calculate score values with different score function
    score_results = calculate_score(
        score_function,
        path_ground_truth_list=path_ground_truth_list,
        path_argument_list=path_argument_list)
    # Aggregate results with min, max, sum and average method
    aggregated_score_results = calculate_aggregation_with(
        aggregation_function, score_results)
    # Calculate score values with uniform correlation
    if score_function == calculate_random_score:
        random.seed(114)  # 12 15 17s
        for conclusion_id in aggregated_score_results.keys():
            for argument_id in aggregated_score_results[conclusion_id].keys():
                aggregated_score_results[conclusion_id][
                    argument_id] = random.uniform(0, 1)
    score_ranking = generate_ranking_from_aggregation(aggregated_score_results)
    # Collect baseline ranking from ground-truth-list.csv
    baseline_ranking = collect_baseline_ranking(path_ground_truth_list)
    baseline_ranking_dict = {}
    for rank in baseline_ranking:
        baseline_ranking_dict[rank[0]] = rank[1]
    # Calculate tau values and collect values in dict
    tau_conclusion_dict = {}
    for conclusion_id in score_ranking:
        ranking = score_ranking[conclusion_id]
        baseline_list = []
        scores_list = []
        for argument_id in ranking.keys():
            baseline_list.append(baseline_ranking_dict[argument_id])
            scores_list.append(ranking[argument_id])
        tau, p_value = stats.kendalltau(baseline_list, scores_list)
        if math.isnan(tau):
            tau = 0.0
        kendall_tau_results.append(tau)
        tau_conclusion_dict[conclusion_id] = tau
    return round(sum(kendall_tau_results) / len(kendall_tau_results),
                 2), tau_conclusion_dict
Example #22
def get_corelation(X, Y):
    # Compute kendall and pearson correlation and return
    assert (len(X) == len(Y)), "X and Y must have same length"
    assert len(X) > 1, "Both X and Y must have at least 2 elements"

    correlation = {}
    cc, p_value = ss.pearsonr(X, Y)  #+[0.0] assume that FD=0 when MS=0
    correlation['pearson'] = {'corr': cc, 'p-value': p_value}
    cc, p_value = ss.kendalltau(X, Y)  #+[0.0] assume that FD=0 when MS=0
    correlation['kendall'] = {'corr': cc, 'p-value': p_value}
    return correlation
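Sketch usage, assuming import scipy.stats as ss as the ss. prefix implies:

metrics = get_corelation([1, 2, 3, 4, 5], [2, 1, 4, 3, 5])
print(metrics["pearson"]["corr"], metrics["kendall"]["corr"])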
Example #23
def valid_epoch(logger, val_loader, model, cfg, funcs=[]):
    model.eval()
    all_scores = []
    true_accs = []
    for _, (archs, accs) in enumerate(val_loader):
        scores = list(model.predict(archs).cpu().data.numpy())
        all_scores += scores
        true_accs += list(accs)

    corr = stats.kendalltau(true_accs, all_scores).correlation
    funcs_res = [func(true_accs, all_scores) for func in funcs]
    return corr, funcs_res
Example #24
def custom_scatter(x, y, ax):
    rho, rhoval = pearsonr(x, y)
    try:
        tau, tauval = kendalltau(x, y)
    except OverflowError:
        tau = -10
        tauval = -10
    rhoval = pformatting(rhoval)
    tauval = pformatting(tauval)
    ax.annotate(r'$\rho$' + ': p={} ({})'.format(np.round(rho, 2), rhoval) +
                '\n' + r'$\tau$' +
                ': p={} ({})'.format(np.round(tau, 2), tauval),
                xy=(0.05, 0.8),
                fontsize=11,
                xycoords='axes fraction')
Example #25
def compute_means(raw_df, quantity_df, quantity_label, axis):

    mean_df = pd.DataFrame(columns=[quantity_label])
    mean_df[quantity_label] = quantity_df["Quantity"]
    mean_df["Arithmetic Mean"] = raw_df.mean(axis=axis)
    mean_df["Geometric Mean"] = raw_df.apply(geo_mean, axis=axis)
    mean_df["Median"] = raw_df.mean(axis=axis)
    cols = list(raw_df.columns.values)
    raw_df["v"] = raw_df[cols].count(axis=1)
    m = np.mean(raw_df['v'])
    raw_df['w'] = raw_df['v'] / (raw_df['v'] + m)
    raw_df['r'] = np.mean(raw_df[cols], axis=1)
    c = np.mean(raw_df[cols].values.flatten())
    raw_df['b'] = raw_df['w'] * raw_df['r'] + (1 - raw_df['w']) * c
    raw_df = raw_df.drop(['v', 'w', 'r'], axis=1)
    mean_df["Bayesian Mean"] = raw_df["b"]
    mean_df = mean_df.fillna(0)

    corr_df = pd.DataFrame(columns=[
        "Rho (Arithmetic Mean)",
        "Rho (Geometric Mean)",
        "Rho (Bayesian Mean)",
        "Rho (Median)",
        "Tau (Arithmetic Mean)",
        "Tau (Geometric Mean)",
        "Tau (Bayesian Mean)",
        "Tau (Median)",
    ])

    corr_vals = {}  # plain dict collecting the correlation columns
    quantity_column = mean_df[quantity_label]
    columns = mean_df.drop(quantity_label, axis=1).columns
    for col in columns:
        mean_column = mean_df[col]
        pearson_corr_val = pearsonr(quantity_column, mean_column)[0]
        if math.isnan(pearson_corr_val):
            pearson_corr_val = 0
        kendall_corr_val = kendalltau(quantity_column, mean_column)[0]
        if math.isnan(kendall_corr_val):
            kendall_corr_val = 0
        corr_vals[f'Rho ({col})'] = pearson_corr_val
        corr_vals[f'Tau ({col})'] = kendall_corr_val
    # DataFrame.append was removed in pandas 2.0; pd.concat is the portable spelling
    corr_df = pd.concat([corr_df, pd.DataFrame([corr_vals])], ignore_index=True)

    return corr_df, mean_df
Example #26
def test_xk(true_scores, predict_scores):
    true_inds = np.argsort(true_scores)[::-1]
    true_scores = np.array(true_scores)
    reorder_true_scores = true_scores[true_inds]
    predict_scores = np.array(predict_scores)
    reorder_predict_scores = predict_scores[true_inds]
    ranks = np.argsort(reorder_predict_scores)[::-1]
    num_archs = len(ranks)
    patks = []
    for ratio in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]:
        k = int(num_archs * ratio)
        if k < 1:
            # skip ratios that select no architectures (avoids a zero division)
            continue
        p = len(np.where(ranks[:k] < k)[0]) / float(k)
        arch_inds = ranks[:k][ranks[:k] < k]
        patks.append(
            (k, ratio, len(arch_inds), p,
             stats.kendalltau(reorder_true_scores[arch_inds],
                              reorder_predict_scores[arch_inds]).correlation))
    return patks
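With random predictions, P@K should hover around the ratio itself and the per-slice Kendall tau around 0 (or NaN when too few architectures land in the slice); a sketch assuming numpy and scipy.stats as above:

import numpy as np

rng = np.random.default_rng(0)
true_scores = rng.random(1000).tolist()
predict_scores = rng.random(1000).tolist()
for k, ratio, n_top, p, tau in test_xk(true_scores, predict_scores):
    print(k, ratio, n_top, round(p, 3), tau)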
Example #27
def pairwise_valid(val_loader, model, seed=None, funcs=[]):
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
    model.eval()
    true_accs = []
    all_archs = []
    for step, (archs, accs) in enumerate(val_loader):
        all_archs += list(archs)
        true_accs += list(accs[:, -1])

    num_valid = len(true_accs)
    pseudo_scores = np.zeros(num_valid)
    indexes = model.argsort_list(all_archs, batch_size=512)
    pseudo_scores[indexes] = np.arange(num_valid)

    corr = stats.kendalltau(true_accs, pseudo_scores).correlation
    funcs_res = [func(true_accs, pseudo_scores) for func in funcs]
    return corr, funcs_res
Example #28
def _ktau_union(orig_run, rep_run, trim_thresh=TRIM_THRESH, pbar=False):
    """
    Helper function returning a generator that yields Kendall's tau Union (KTU) for all topics.

    @param orig_run: The original run.
    @param rep_run: The reproduced/replicated run.
    @param trim_thresh: Threshold values for the number of documents to be compared.
    @param pbar: Boolean value indicating if progress bar should be printed.
    @return: Generator with KTU values.
    """

    generator = tqdm(rep_run.items()) if pbar else rep_run.items()

    for topic, docs in generator:
        orig_docs = list(orig_run.get(topic).keys())[:trim_thresh]
        rep_docs = list(rep_run.get(topic).keys())[:trim_thresh]
        union = list(sorted(set(orig_docs + rep_docs)))
        orig_idx = [union.index(doc) for doc in orig_docs]
        rep_idx = [union.index(doc) for doc in rep_docs]
        yield topic, kendalltau(orig_idx, rep_idx).correlation
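A tiny sketch with a single topic; each run maps topic -> an ordered {doc_id: score} dict, and TRIM_THRESH and tqdm are assumed to be defined/imported as in the snippet's project:

orig_run = {"t1": {"d1": 3.0, "d2": 2.0, "d3": 1.0}}
rep_run = {"t1": {"d1": 3.0, "d3": 2.0, "d2": 1.0}}
for topic, ktu in _ktau_union(orig_run, rep_run):
    print(topic, ktu)  # Kendall's tau over the union of the two document lists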
Example #29
def corr(x, y, method="pearsonr"):
    """ 
    Compute pearson correlation on time series x and y
    
    Parameters
    ----------
    x: time series
    y: time series
    
    Returns
    -------
    corr: correlation between x and y
    """
    if method == "pearsonr":
        corr = pearsonr(x, y)[0]
    elif method == "spearmanr":
        corr = spearmanr(x, y)[0]
    elif method == "kendalltau":
        corr = kendalltau(x, y)[0]
    return corr
Example #30
def test_xk(true_scores, predict_scores):
    true_inds = np.argsort(true_scores)[::-1]
    true_scores = np.array(true_scores)
    reorder_true_scores = true_scores[true_inds]
    predict_scores = np.array(predict_scores)
    reorder_predict_scores = predict_scores[true_inds]
    ranks = np.argsort(reorder_predict_scores)[::-1]
    num_archs = len(ranks)
    patks = []
    for ratio in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]:
        k = int(num_archs * ratio)
        if k < 1:
            continue
        p = len(np.where(ranks[:k] < k)[0]) / float(k)
        arch_inds = ranks[:k][ranks[:k] < k]
        # [#samples, #samples/#total_samples, models in top-K, P@K%, Kendall-Tau]
        patks.append((k, ratio, len(arch_inds), p, stats.kendalltau(
            reorder_true_scores[arch_inds],
            reorder_predict_scores[arch_inds]).correlation))
    return patks
Example #31
                                        values = line.split()
                                        map = float(values[2])
                                        predictedqrelMap.append(map)
                                    retval = p.wait()

                                predictionMapResult = predicted_location_base + str(percentage) + '_bpref.txt'
                                tmp = ""

                                for val in predictedqrelMap:
                                    tmp = tmp + str(val) + ","
                                text_file = open(predictionMapResult, "w")
                                text_file.write(tmp)
                                text_file.close()
                                #exit(0)

                                tau, p_value = kendalltau(originalqrelMap, predictedqrelMap)
                                predictedqrelMap = [] # cleaning it for next trains_percenatge
                                list.append(tau)  # note: 'list' is a list variable (shadowing the builtin) defined earlier in the full script
                        protocol_result[protocol] = list

            #print len(training_variation)
            plt.subplot(subplot_loc[var])
            '''plt.plot(x_labels_set, protocol_result['SAL'], '-r', label='SAL', linewidth=2.0)
            #print protocol_result['SAL']
            plt.plot(x_labels_set, protocol_result['CAL'], '-b', label='CAL', linewidth=2.0)
            plt.plot(x_labels_set, protocol_result['SPL'], '-g', label='SPL', linewidth=2.0)
            '''

            plt.plot(x_labels_set, protocol_result['SAL'], '-r', marker='o', label='SAL', linewidth=1.0)
            plt.plot(x_labels_set, protocol_result['CAL'], '-b', marker='^', label='CAL', linewidth=1.0)
            plt.plot(x_labels_set, protocol_result['SPL'], '-g', marker='s', label='SPL', linewidth=1.0)
Example #32
def test_nasbench(nasbench_search_space):
    import numpy as np
    from scipy import stats
    from aw_nas.btcs import nasbench_101
    from aw_nas.evaluator.arch_network import PointwiseComparator
    from aw_nas.rollout.compare import CompareRollout

    ss = nasbench_search_space
    # construct controller
    controller = nasbench_101.NasBench101Controller(ss, device="cuda")
    compare_controller = nasbench_101.NasBench101CompareController(
        ss, device="cuda", rollout_type="compare")
    # construct evaluator
    evaluator = nasbench_101.NasBench101Evaluator(None, None, None)

    # test random sample
    _ = ss.random_sample()

    # test controller.sample
    rollouts = controller.sample(n=20)
    # test genotype
    print(rollouts[0].genotype)

    # test evaluator.evaluate_rollout
    rollouts = evaluator.evaluate_rollouts(rollouts, False)
    print(rollouts)

    evaluator.rollout_type = "compare"
    c_rollouts = compare_controller.sample(n=4)
    print(c_rollouts[0].genotype)

    # test evaluator.evaluate_rollout for compare rollouts
    c_rollouts = evaluator.evaluate_rollouts(c_rollouts, False)
    print(c_rollouts)

    # test nb101-gcn embedder
    comparator = PointwiseComparator(ss,
                                     arch_embedder_type="nb101-gcn",
                                     arch_embedder_cfg={"hid_dim": 96})
    comparator_2 = PointwiseComparator(ss,
                                       arch_embedder_type="nb101-gcn",
                                       arch_embedder_cfg={"hid_dim": 96})
    pred_scores = comparator.predict([r.arch for r in rollouts])
    pred_scores_2 = comparator_2.predict([r.arch for r in rollouts])
    label_scores = [r.perf["reward"] for r in rollouts]
    corr_init_1 = stats.kendalltau(label_scores,
                                   pred_scores.cpu().data.numpy()).correlation
    corr_init_2 = stats.kendalltau(
        label_scores,
        pred_scores_2.cpu().data.numpy()).correlation
    # compare_scores = comparator.compare([r.rollout_1.arch for r in c_rollouts],
    #                                     [r.rollout_2.arch for r in c_rollouts])

    # try training for several epochs using update_predict
    true_scores = np.random.rand(len(rollouts))
    for i_step in range(5):
        loss = comparator.update_predict([r.arch for r in rollouts],
                                         true_scores)
        print("update predict {}: {:.4f}".format(i_step, loss))

    # try training for several epochs using update_compare
    # construct compare rollouts between every pair in rollouts
    c_rollouts_2 = [
        CompareRollout(rollout_1=rollouts[i], rollout_2=rollouts[j])
        for i in range(len(rollouts)) for j in range(i)
    ]
    better_lst = [
        label_scores[j] > label_scores[i] for i in range(len(rollouts))
        for j in range(i)
    ]
    for i_step in range(5):
        loss = comparator_2.update_compare_rollouts(c_rollouts_2, better_lst)
        print("update compare {}: {:.4f}".format(i_step, loss))

    # test after training
    pred_scores_after = comparator.predict([r.arch for r in rollouts])
    pred_scores_2_after = comparator_2.predict([r.arch for r in rollouts])
    corr_after_1 = stats.kendalltau(
        label_scores,
        pred_scores_after.cpu().data.numpy()).correlation
    corr_after_2 = stats.kendalltau(
        label_scores,
        pred_scores_2_after.cpu().data.numpy()).correlation
    print("True accs: ", label_scores)
    print(
        "PREDICT: before training: {} (corr {:.3f}); after training: {} (corr {:.3f})"
        .format(pred_scores, corr_init_1, pred_scores_after, corr_after_1))
    print(
        "COMPARE: before training: {} (corr {:.3f}); after training: {} (corr {:.3f})"
        .format(pred_scores_2, corr_init_2, pred_scores_2_after, corr_after_2))
Example #33
            for row in results:
                counts.append(float(row[1]))
                page_rank_values.append(pagerank[wikipedia.vertex(int(row[0]))])
            #for index, row in df.iterrows():
            #    counts.append(float(row['counts']))
            #    page_rank_values.append(pagerank[wikipedia.vertex(int(row['target_article_id']))])
            print('pearson')
            p = pearsonr(page_rank_values, counts)
            print(p)
            correlations['pearson'] = p
            print('spearmanr')
            s = spearmanr(page_rank_values, counts)
            print(s)
            correlations['spearmanr'] = s
            print('kendalltau')
            k = kendalltau(page_rank_values, counts)
            print(k)
            correlations['kendalltau'] = k
            correlations_sem_sim_weighted_pagerank[key]=correlations
        cor[kk]=correlations_sem_sim_weighted_pagerank


    write_pickle(HOME+'output/correlations/correlations_pagerank_without_zeros'+network_name+'.obj', cor)


def map_to_hyp_indicies(vocab, l):
    ids = list()
    for v in l.values:
        ids.append(vocab[str(v)])
    return ids
Example #34
def correlations_zeros(labels, consider_zeros=True, clickstream_data='', struct=False):
    #load network
    print(struct)
    name = '_'.join(labels)
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    #read counts with zeros
    if consider_zeros:
        article_counts  =  pd.read_csv(TMP+clickstream_data+'article_counts.tsv', sep='\t')
        print(TMP+clickstream_data+'article_counts.tsv')
        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8,0.85,0.9]:
                key = label+"_page_rank_weighted_"+str(damping)
                pagerank = wikipedia.vertex_properties[key]
                page_rank_values = list()
                counts = list()
                correlations_values = {}
                for index, row in article_counts.iterrows():
                    counts.append(float(row['counts']))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row['target_article_id']))])
                print('pearson')
                p = pearsonr(page_rank_values, counts)
                print(p)
                correlations_values['pearson'] = p
                print('spearmanr')
                s = spearmanr(page_rank_values, counts)
                print(s)
                correlations_values['spearmanr'] = s
                print('kendalltau')
                k = kendalltau(page_rank_values, counts)
                print(k)
                correlations_values['kendalltau'] = k
                correlations_weighted_pagerank[key] = correlations_values
                correlations_weighted_pagerank[key]=correlations_values

        write_pickle(HOME+'output/correlations/'+clickstream_data+'correlations_pagerank_'+name+'.obj', correlations_weighted_pagerank)
    else:
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        conn = db._create_connection()
        cursor = conn.cursor()
        # wikipedia  graph  structural statistics

        results = None
        try:
            if clickstream_data != '':

                results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()
            else:
                results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived_en_201501 c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()

        except MySQLdb.Error as e:
            print('error retrieving xy coord for all links %s (%d)' % (e.args[1], e.args[0]))
        print('after sql load')


        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8,0.85,0.9]:
                key = label+"_page_rank_weighted_"+str(damping)
                pagerank = wikipedia.vertex_properties[key]
                correlations={}
                counts=[]
                page_rank_values=[]
                for row in results:
                    counts.append(float(row[1]))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row[0]))])
                print('pearson')
                p = pearsonr(page_rank_values, counts)
                print(p)
                correlations['pearson'] = p
                print('spearmanr')
                s = spearmanr(page_rank_values, counts)
                print(s)
                correlations['spearmanr'] = s
                print('kendalltau')
                k = kendalltau(page_rank_values, counts)
                print(k)
                correlations['kendalltau'] = k
                correlations_weighted_pagerank[key]=correlations



        write_pickle(HOME+'output/correlations/'+clickstream_data+'correlations_pagerank_without_zeros'+name+'.obj', correlations_weighted_pagerank)
Example #35
def main():

    finaldatafile = "finaldata.json"
    finalData = None
    try:
        with open(finaldatafile) as data_file:
            finalData = json.load(data_file)
    except Exception:
        print("Run analysis")
        exit()


    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]
        j = 0
        issueCallgraphValueForStats = []
        callGraphValueForStats = []

        issueSizeValueForStats = []
        classSizeValueForStats = []
        
        issueForModel = []
        callGraphForModel = []
        classSizeForModel = []

        for key in issuescore:
            if key in cgscore:
                j+=1
                issueCallgraphValueForStats.append(issuescore[key])
                callGraphValueForStats.append(cgscore[key])

        for key in issuescore:
            if key in classSize:
                issueSizeValueForStats.append(issuescore[key])
                classSizeValueForStats.append(classSize[key])
                
        for key in issuescore:
            if key in classSize:
                if key in cgscore:
                    issueForModel.append(issuescore[key])
                    callGraphForModel.append(cgscore[key])
                    classSizeForModel.append(classSize[key])

        if j>3:
            spearmanCorrelationCoefficient, spearmanpvalue = spearmanr(issueCallgraphValueForStats,callGraphValueForStats)
            kendalltauCorrelationCoefficient, kendalltaupvalue = kendalltau(issueCallgraphValueForStats,callGraphValueForStats)
            kstestdissueValueForStats, kstestpvalueissueValueForStats = kstest([issuescore[key] for key in issuescore],"norm")
            kstestdcgValueForGraph, kstestpvaluecgValueForGraph = kstest([cgscore[key] for key in cgscore],"norm")

            spearmanCorrelationCoefficient2, spearmanpvalue2 = spearmanr(issueSizeValueForStats,classSizeValueForStats)
            kendalltauCorrelationCoefficient2, kendalltaupvalue2 = kendalltau(issueSizeValueForStats,classSizeValueForStats)
            kstestdchissueSizeValueForStats, kstestpvaluechissueSizeValueForStats = kstest([issuescore[key] for key in issuescore],"norm")
            kstestdclassSizeValueForStats, kstestpvalueclassSizeValueForStats = kstest([classSize[key] for key in classSize],"norm")

            print(appliName)
            print("--- API Call <> Issue")
            print(" "*8 + "Spearman rho correlation coefficient = " + str(spearmanCorrelationCoefficient))
            print(" "*8 + "Spearman p-value = " + str(spearmanpvalue))
            print(" "*8 + "Kendall Tau = " + str(kendalltauCorrelationCoefficient))
            print(" "*8 + "Kendall p-value = " + str(kendalltaupvalue))
            print(" "*8 + "KS Test D = " + str(kstestdissueValueForStats))
            print(" "*8 + "KS p-value = " + str(kstestpvalueissueValueForStats))
            print(" "*8 + "KS Test D = " + str(kstestdcgValueForGraph))
            print(" "*8 + "KS p-value = " + str(kstestpvaluecgValueForGraph))
            print(" "*8 + "dataset size =" + str(j))
            print("--- Class Size <> Issue")
            print(" "*8 + "Spearman rho correlation coefficient = " + str(spearmanCorrelationCoefficient2))
            print(" "*8 + "Spearman p-value = " + str(spearmanpvalue2))
            print(" "*8 + "Kendall Tau = " + str(kendalltauCorrelationCoefficient2))
            print(" "*8 + "Kendall p-value = " + str(kendalltaupvalue2))
            print(" "*8 + "KS Test D = " + str(kstestdchissueSizeValueForStats))
            print(" "*8 + "KS p-value = " + str(kstestpvaluechissueSizeValueForStats))
            print(" "*8 + "KS Test D = " + str(kstestdclassSizeValueForStats))
            print(" "*8 + "KS p-value = " + str(kstestpvalueclassSizeValueForStats))

            y = issueForModel
            X = np.array([callGraphForModel,classSizeForModel]).transpose()
            X = list([list(i) for i in X])
            model = sm.OLS(y, X)
            results = model.fit()
            print(results.summary(yname="issues", xname =("APIcalls", "ClassSize")))

        else:
            print("FAILURE : " + appliName)

    print("|" * 80)
    print("-" * 80)
    print("-" * 80)
    print("|" * 80)

    issueForGlobalModel = []
    callGraphForGlobalModel = []
    classSizeForGlobalModel = []
    issueGlobalCallgraphValueForStats = []
    callGlobalGraphValueForStats = []
    NOissueGlobalCallgraphValueForStats = []
    issueGlobalSizeValueForStats = []
    classGlobalSizeValueForStats = []

    anova1issue = []
    anova2issue = []
    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]
        for key in issuescore:
            if key in classSize:
                if key in cgscore:
                    issueForGlobalModel.append(issuescore[key])
                    callGraphForGlobalModel.append(cgscore[key])
                    classSizeForGlobalModel.append(classSize[key])

        for key in issuescore:
            if key in cgscore:
                j+=1
                issueGlobalCallgraphValueForStats.append(issuescore[key])
                callGlobalGraphValueForStats.append(cgscore[key])
            else:
                NOissueGlobalCallgraphValueForStats.append(issuescore[key])

        for key in cgscore:
            if key in issuescore:
                anova1issue.append(cgscore[key])
            else:
                anova2issue.append(cgscore[key])


        for key in issuescore:
            if key in classSize:
                issueGlobalSizeValueForStats.append(issuescore[key])
                classGlobalSizeValueForStats.append(classSize[key])


    spearmanGlobalCorrelationCoefficient, spearmanpvalueGlobal = spearmanr(issueGlobalCallgraphValueForStats,callGlobalGraphValueForStats)
    kendalltauGlobalCorrelationCoefficient, kendalltaupvalueGlobal = kendalltau(issueGlobalCallgraphValueForStats,callGlobalGraphValueForStats)

    spearmanGlobalCorrelationCoefficient2, spearmanpvalue2Global = spearmanr(issueGlobalSizeValueForStats,classGlobalSizeValueForStats)
    kendalltauGlobalCorrelationCoefficient2, kendalltaupvalue2Global = kendalltau(issueGlobalSizeValueForStats,classGlobalSizeValueForStats)


    fvalueanova1, pvalueanova1 = f_oneway(issueGlobalCallgraphValueForStats, NOissueGlobalCallgraphValueForStats)

    fvalueanova2, pvalueanova2 = f_oneway(anova1issue, anova2issue)

    print(len(NOissueGlobalCallgraphValueForStats))
    print("--- Correlation : API Call <> Issue")
    print(" "*8 + "Spearman rho correlation coefficient = " + str(spearmanGlobalCorrelationCoefficient))
    print(" "*8 + "Spearman p-value = " + str(spearmanpvalueGlobal))
    print(" "*8 + "Kendall Tau = " + str(kendalltauGlobalCorrelationCoefficient))
    print(" "*8 + "Kendall p-value = " + str(kendalltaupvalueGlobal))
    print(" "*8 + "ANOVA F-value = " + str(fvalueanova1))
    print(" "*8 + "ANOVA p-value = " + str(pvalueanova1))
    print(" "*8 + "ANOVA F-value = " + str(fvalueanova2))
    print(" "*8 + "ANOVA p-value = " + str(pvalueanova2))
    print("--- Correlation : Class Size <> Issue")
    print(" "*8 + "Spearman rho correlation coefficient = " + str(spearmanGlobalCorrelationCoefficient2))
    print(" "*8 + "Spearman p-value = " + str(spearmanpvalue2Global))
    print(" "*8 + "Kendall Tau = " + str(kendalltauGlobalCorrelationCoefficient2))
    print(" "*8 + "Kendall p-value = " + str(kendalltaupvalue2Global))


    print("_"*80)
    print("_"*80)
    print("-- GLOBAL OLS --")
    y = issueForGlobalModel
    X = np.array([callGraphForGlobalModel,classSizeForGlobalModel]).transpose()
    X = list([list(i) for i in X])
    X = sm.add_constant(X,prepend=False)
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary(yname="issues", xname =("APIcalls", "ClassSize", "const")))


    print("API CALLS only")
    X = callGraphForGlobalModel
    X = sm.add_constant(X,prepend=False)
    model2 = sm.OLS(y, X)
    results = model2.fit()
    print(results.summary(yname="issues",xname =["APIcalls","const"]))
    print("Size only")
    X = classSizeForGlobalModel
    X = sm.add_constant(X,prepend=False)
    model3 = sm.OLS(y, X)
    results = model3.fit()
    print(results.summary(yname="issues",xname =["ClassSize","const"]))