def compareDistributions(bins_type, nb_exp, folder="../resultData/features_on_films"):
    '''
    Compare the distribution of each numeric feature across different draws
    of roughly nb_exp films.

    The end goal is to find the smallest nb_exp for which the distribution is
    stable, which indicates how far the computation of the distance between a
    film and its controls can be parallelised.

    For every feature in featuresNumeriques, the files of `folder` whose name
    contains both the feature and `bins_type`, and whose third
    underscore-separated token is strictly between 0.75*nb_exp and
    1.25*nb_exp, are compared pairwise with a two-sample Kolmogorov-Smirnov
    test. The p-values are printed in summary form and pickled to
    "pvalues_<bins_type>_<nb_exp>" inside `folder`.

    :param bins_type: str - substring of the file names; when it equals
        'quantile' the second element of each pickled object is compared
    :param nb_exp: number - target draw size used to select files
    :param folder: str - directory holding the pickled feature files
    :return: None
    '''

    def _iteration(name):
        # The iteration index is the last "_"-separated token, minus extension.
        return int(name.split('_')[-1].split('.')[0])

    def _load(name):
        # NOTE(review): pickle must only be used on trusted result files.
        # Binary mode is required for pickles on Python 3 and on Windows.
        with open(os.path.join(folder, name), 'rb') as handle:
            return pickle.load(handle)

    pvalues = defaultdict(list)
    for feature in featuresNumeriques:
        print(feature)
        # A real list (not a lazy filter object) because it is scanned once
        # per file in the nested loop below.
        candidates = [
            name for name in os.listdir(folder)
            if feature in name and bins_type in name
            and 0.75 * nb_exp < int(name.split('_')[2]) < 1.25 * nb_exp
        ]
        for file_ in candidates:
            iter_ = _iteration(file_)
            print('---- {}'.format(iter_))
            # Only compare with later iterations so each pair is tested once.
            later_files = [name for name in candidates if _iteration(name) > iter_]
            if not later_files:
                continue

            # Loaded once per file instead of once per pair.
            l1 = _load(file_)
            for other_file in later_files:
                print(_iteration(other_file))
                l2 = _load(other_file)
                if bins_type == 'quantile':
                    ks, pval = ks_2samp(l1[1], l2[1])
                else:
                    ks, pval = ks_2samp(l1, l2)

                pvalues[feature].append(pval)

    for feature in featuresNumeriques:
        print('{} {} {}'.format(feature,
                                np.mean(pvalues[feature]),
                                scoreatpercentile(pvalues[feature], 90)))

    output_path = os.path.join(folder, 'pvalues_{}_{}'.format(bins_type, nb_exp))
    with open(output_path, 'wb') as handle:
        pickle.dump(pvalues, handle)
示例#2
0
def different_expression(expression='data/sample_expr',
                         path_to_genes='data/genes_set.txt',
                         cutoff=0.05,
                         save=None):
    """
    Determine whether specified genes differ in expression between 2 classes. Use Kolmogorov-Smirnov criterion.
    NOTE: takes a dataframe where genes are columns and observations are rows;
    the class (phenotype) of each observation is read from the 'Description' column.
    :param expression: str or dataframe - path to expression data in csv, or the data itself
    :param path_to_genes: str - path to subset of genes where each gene occupy 1 row
    :param cutoff: float - p-value threshold
    :param save: str - save fig in 'save' location if provided
    :return: boolean - whether expression of genes is altered
    :raises ValueError: if none of the listed genes is a column of the data
    """
    # Load data
    if not is_dataframe(expression):
        expression = pd.read_csv(expression)
    with open(path_to_genes, 'r') as source:
        genes = {line.strip() for line in source if line.strip()}

    phenotype = expression.loc[:, 'Description']

    # Correlation between class (phenotype) and expression of each gene.
    # 'Description' itself is excluded: correlating the class with itself
    # would add a spurious 1.0 to both samples compared below.
    gene_columns = [column for column in expression.columns if column != 'Description']
    correlations = {gene: pearsonr(expression.loc[:, gene], phenotype)[0]
                    for gene in gene_columns}

    corrs_all = [correlations[gene] for gene in gene_columns]
    corrs_gene_set = [correlations[gene] for gene in gene_columns if gene in genes]
    if not corrs_gene_set:
        raise ValueError("None of the genes listed in {!r} is present in the "
                         "expression data".format(path_to_genes))

    # Plot functions
    plot_curves(corrs_all, corrs_gene_set, save)
    # Compute p-value
    p_value = ks_2samp(corrs_gene_set, corrs_all)[1]

    return bool(p_value <= cutoff)
示例#3
0
def KS_Testing(Databases, conditions):
    """
    Print, for each metric column, the running percentage of pairwise
    two-sample KS tests whose p-value is at least 0.05.

    For every table in `Databases` and every label in `conditions`, each of
    the columns AppEn, SampEn, DFA, HFD and SD_ratio is read, all pairs of
    its rows are formed, and a KS test is run on the histogram counts of the
    two members of each pair. Outcomes accumulate in one list shared by all
    tables, labels and columns, so the printed percentage is cumulative.

    :param Databases: iterable of tables supporting ``table[[column]]``
        (presumably pandas DataFrames - confirm with the caller)
    :param conditions: iterable of labels; only printed, never used to
        select data
    :return: None - results are printed
    """

    columns = ["AppEn", "SampEn", "DFA", "HFD", "SD_ratio"]
    ks_test = list()

    for Data in Databases:
        for cond in conditions:
            print("Base de datos: ", cond)
            for col in columns:
                metric = np.array(Data[[col]])
                print("Métrica: ", col)

                # Iterate over every pair; the previous index loop stopped
                # one short and silently dropped the last combination.
                for first, second in combinations(metric, 2):
                    # NOTE(review): `.all()` collapses each row to a single
                    # boolean before it is histogrammed, so the test cannot
                    # see the actual values. Kept as-is because the intended
                    # input is unclear - confirm and fix.
                    X = np.histogram(np.array(first).all(), bins='auto')
                    Y = np.histogram(np.array(second).all(), bins='auto')
                    p_val = stats.ks_2samp(X[0], Y[0], alternative='two-sided')[1]

                    if p_val < 0.05:
                        ks_test.append(0)
                    elif p_val >= 0.05:
                        # p == 0.05 used to fall through both branches.
                        ks_test.append(1)
                    # A NaN p-value matches neither branch and is skipped.

                # Computed once per column; NaN when no test has been run yet
                # instead of failing on an unbound variable.
                if ks_test:
                    prob = np.sum(ks_test) / len(ks_test) * 100
                else:
                    prob = float('nan')
                print("Porcentaje de Similitud {} %".format(prob))
            print("\n")
示例#4
0
def calc_kolmogorov_smirnov(columns, prior_data, target_data):
    """Map each column name to its two-sample Kolmogorov-Smirnov statistic.

    Columns are processed in sorted order; for each one the values found
    under that key in `prior_data` and `target_data` are compared and only
    the KS statistic (not the p-value) is kept.
    """
    return {
        name: ks_2samp(prior_data[name], target_data[name])[0]
        for name in sorted(columns)
    }
示例#5
0
文件: GUI.py 项目: anton-shikov/HW_6
def GSEA(geo_ID, gene_list):
    """Run a KS-based gene-set test on a GEO series and return the p-value.

    The series is fetched with GEOparse (cached in the current directory).
    Each sample gets phenotype 1 when its description contains "control" and
    0 otherwise. Every column of the expression table after the first three
    is correlated with that phenotype, and the correlations of the columns
    found in `gene_list` are compared with the correlations of all columns
    using a two-sample Kolmogorov-Smirnov test.

    :param geo_ID: GEO accession passed to GEOparse.get_GEO
    :param gene_list: container of gene identifiers forming the gene set
    :return: str - the KS p-value converted to a string
    :raises ValueError: from ks_2samp when no column of the expression table
        belongs to `gene_list`
    """
    gse = GEOparse.get_GEO(geo=geo_ID, destdir="./")
    expression = gse.pivot_samples('VALUE').T

    # Built once: it does not depend on the gene being processed.
    phenotype = [
        1 if "control" in row["description"] else 0
        for _, row in gse.phenotype_data.iterrows()
    ]

    all_corr_set = []
    genes_corr_set = []
    for position, gene in enumerate(expression):
        # The first three columns are skipped, as the original code did.
        # NOTE(review): the reason is not visible here - confirm.
        if position < 3:
            continue
        correlation = np.corrcoef([phenotype, list(expression[gene])])[0, 1]
        all_corr_set.append(correlation)
        if gene in gene_list:
            genes_corr_set.append(correlation)

    p_value = ks_2samp(genes_corr_set, all_corr_set)[1]
    return str(p_value)
def compute_correlations_ks(mirnas):
    """Compute Pearson, Spearman, and KS for 5p/3p expr data of miRs.

    Prints a tab-separated table to standard output: one header line, then
    one line per miRNA that has more than one entry. For each such miRNA
    the values under "5p" and "3p" are compared. Trailing '-' characters
    are stripped from the miRNA name before printing.

    :param mirnas: mapping of miRNA name to a mapping holding the "5p" and
        "3p" value sequences
    :return: None
    """
    header = ("miRNA", "Pearson_r", "Pearson_pval", "Spearman_rho",
              "Spearman_pval", "KS_D", "KS_pval")
    # Single print call so the output is identical on Python 2 and 3.
    print("\t".join(header))

    for mir in mirnas:
        arms = mirnas[mir]
        if len(arms) > 1:
            five_prime, three_prime = arms["5p"], arms["3p"]
            pears_r, pears_pval = pearsonr(five_prime, three_prime)
            spear_rho, spear_pval = spearmanr(five_prime, three_prime)
            ks_d, ks_pval = ks_2samp(five_prime, three_prime)
            mir_name = mir.rstrip('-')
            print("{0}\t{1:f}\t{2:f}\t{3:f}\t{4:f}\t{5:f}\t{6:f}".format(
                mir_name, pears_r, pears_pval, spear_rho, spear_pval, ks_d,
                ks_pval))
示例#7
0
    def pushButton_clicked(self):
        """Run the KS gene-set test for the values typed in the form.

        Reads the GEO accession from `lineEdit` and a whitespace-separated
        gene list from `textEdit`, downloads the series, correlates every
        expression column after the first three with the control/non-control
        phenotype, and writes the KS p-value (three decimals) to `label_3`.
        """
        # Read the text entered in the file-name field and the genes field.
        gse_acc = self.lineEdit.text()
        # Turn the gene text into a list.
        genes = self.textEdit.toPlainText().split()
        # Load the series by accession.
        gse = GEOparse.get_GEO(geo=gse_acc, destdir="./")
        # Expression matrix with one column per gene, one row per sample.
        expression = gse.pivot_samples('VALUE').T

        # Phenotype list: a sample whose description contains the word
        # "control" is treated as a control and assigned 1, otherwise 0.
        experiments = pd.DataFrame({
            i: {"Type": 1 if "control" in row["description"] else 0}
            for i, (idx, row) in enumerate(gse.phenotype_data.iterrows())
        }).T
        phen = list(experiments['Type'])

        # Correlation of every column with the phenotype; the first three
        # columns are skipped.
        all_corr_set = []
        genes_corr_set = []
        for position, column in enumerate(expression, start=1):
            if position <= 3:
                continue

            correlation = np.corrcoef([phen, list(expression[column])])[0, 1]
            all_corr_set.append(correlation)
            if column in genes:
                genes_corr_set.append(correlation)

        # p-value of the Kolmogorov-Smirnov test: gene set vs. all genes.
        p_value = ks_2samp(genes_corr_set, all_corr_set)[1]
        # Show it in the window.
        self.label_3.setText('{:.3f}'.format(p_value))
示例#8
0
文件: GUI.py 项目: anton-shikov/HW_6
def GSEA(geo_ID, gene_list):
    """Return, as a string, the KS p-value of a gene set within a GEO series.

    Steps:
      1. Download the series `geo_ID` with GEOparse into the current directory.
      2. Label each sample 1 if its description contains "control", else 0.
      3. For every column of the expression table except the first three,
         compute the correlation coefficient between labels and expression.
      4. Compare the correlations of the columns listed in `gene_list` with
         those of all columns via a two-sample Kolmogorov-Smirnov test.

    :param geo_ID: GEO accession passed to GEOparse.get_GEO
    :param gene_list: container of gene identifiers forming the gene set
    :return: str - the KS p-value converted to a string
    :raises ValueError: from ks_2samp when `gene_list` matches no column
    """
    gse = GEOparse.get_GEO(geo=geo_ID, destdir="./")
    expression = gse.pivot_samples('VALUE').T

    # Sample labels are the same for every gene, so compute them only once.
    labels = []
    for _, row in gse.phenotype_data.iterrows():
        labels.append(1 if "control" in row["description"] else 0)

    all_corr_set = []
    genes_corr_set = []
    for index, gene in enumerate(expression):
        if index < 3:
            # First three columns are deliberately ignored, as before.
            # NOTE(review): the rationale is not visible here - confirm.
            continue
        corr_matrix = np.corrcoef([labels, list(expression[gene])])
        all_corr_set.append(corr_matrix[0, 1])
        if gene in gene_list:
            genes_corr_set.append(corr_matrix[0, 1])

    p_value = ks_2samp(genes_corr_set, all_corr_set)[1]
    return str(p_value)
示例#9
0
    pearsonsWnt = list()

    for wnt1 in wntGenes:
        if wnt1 in geneDict:
            for wnt2 in wntGenes:
                if wnt2 in geneDict and wnt1 != wnt2:
                    corr, pVal = pearsonr(geneDict[wnt1], geneDict[wnt2])
                    if (corr == 1.0):
                        corr = 0.99999
                    pearsonsWnt.append(math.atanh(corr))

    # END Non Specific Wnt Pearson Correlations-----------------------------------------------

    # Determine if this is statistically significant------------------------------------------

    ks, pVal = ks_2samp(pearsons, pearsonsWnt)
    sigFile.write(file + " " + str(pVal) + "\n")

    # BUILD HISTOGRAM-------------------------------------------------------------------------

    if pVal < (0.05 / 769.0):
        try:
            # the histogram of the random data
            plt.hist(pearsons, 75, density=True, facecolor='b', alpha=0.25)
            n, bins, patches = plt.hist(pearsonsWnt,
                                        75,
                                        density=True,
                                        facecolor='g',
                                        alpha=0.25)

            plt.xlabel('Correlation')
示例#10
0
    sns.distplot(df[df["source"] == "aa12"][type_to_check], label="aa12", kde=False, rug=True)
    sns.distplot(df[df["source"] == "aa1"][type_to_check], label="aa1", kde=False, rug=True)
    ax = sns.distplot(df[df["source"] == "aa2"][type_to_check], label="aa2", kde=False, rug=True)
    ax.set(xlabel='Total length')
    plt.legend()
    sns.despine(offset=10, trim=True)
    path_to_read = "/Users/alessandrozonta/PycharmProjects/astar/output/"
    if save_figure is True:
        plt.savefig("{}/total_length_astar.pdf".format(path_to_read))
    else:
        plt.show()
    plt.close()


    total = []
    total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa012"]['fitness']).pvalue)
    total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa01"]['fitness']).pvalue)
    total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa02"]['fitness']).pvalue)
    total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa0"]['fitness']).pvalue)
    total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa12"]['fitness']).pvalue)
    total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa1"]['fitness']).pvalue)
    total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa2"]['fitness']).pvalue)
    total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa"]['fitness']).pvalue)

    # small p -> two different distributions

    logger.info(total)
    logger.info(np.mean(np.array(total)))
    logger.info(np.std(np.array(total)))

    total = []
示例#11
0
    def ks_feature_distribution(self, threshold=0.1, show_plots=True):
        """
        Uses the Kolmogorov-Smirnov test to see if the distributions in the training and test sets are similar.

        A column is reported when its p-value is at most 0.05 and the absolute
        KS statistic exceeds `threshold`.

        Credit: https://www.kaggle.com/nanomathias/distribution-of-test-vs-training-data#1.-t-SNE-Distribution-Overview

        Parameters
        ----------
        threshold : float, optional
            KS statistic threshold, by default 0.1

        show_plots : bool, optional
            True to show histograms of feature distributions, by default True

        Returns
        -------
        DataFrame
            Columns that are significantly different in the train and test set,
            sorted by decreasing statistic, or None when no column qualifies.

        Raises
        ------
        ValueError
            If `x_test` is None.

        Examples
        --------
        >>> data.ks_feature_distribution()
        >>> data.ks_feature_distribution(threshold=0.2)
        """

        if self.x_test is None:
            raise ValueError(
                "Data must be split into train and test set. Please set the `x_test` variable."
            )

        report_info = technique_reason_repo["stats"]["dist_compare"]["ks"]

        diff_data = []
        diff_df = None

        for col in tqdm(self.x_train.columns):
            statistic, pvalue = ks_2samp(
                self.x_train[col].values, self.x_test[col].values
            )

            if pvalue <= 0.05 and np.abs(statistic) > threshold:
                diff_data.append(
                    {
                        "feature": col,
                        "p": np.round(pvalue, 5),
                        "statistic": np.round(np.abs(statistic), 2),
                    }
                )

        if diff_data:
            diff_df = pd.DataFrame(diff_data).sort_values(
                by=["statistic"], ascending=False
            )

            if show_plots:
                n_cols = 4
                # Ceiling division: no extra empty row for multiples of n_cols.
                n_rows = int(np.ceil(len(diff_df) / n_cols))

                _, ax = plt.subplots(n_rows, n_cols, figsize=(40, 8 * n_rows))
                # subplots returns a 2-D grid when n_rows > 1; flatten it so
                # every feature gets its own axes instead of a whole row.
                axes = np.ravel(ax)

                for axis, (_, row) in zip(axes, diff_df.iterrows()):
                    train_values = self.x_train.loc[:, row.feature]
                    test_values = self.x_test.loc[:, row.feature]

                    # Both histograms share symmetric bins around zero.
                    extreme = np.max(
                        np.abs(train_values.tolist() + test_values.tolist())
                    )
                    bins = np.arange(-extreme, extreme, 0.25)

                    train_values.swifter.apply(np.log1p).hist(
                        ax=axis,
                        alpha=0.6,
                        label="Train",
                        density=True,
                        bins=bins,
                    )

                    # Previously mislabelled "Train", which made the legend
                    # show two identical entries.
                    test_values.swifter.apply(np.log1p).hist(
                        ax=axis,
                        alpha=0.6,
                        label="Test",
                        density=True,
                        bins=bins,
                    )

                    axis.set_title(f"Statistic = {row.statistic}, p = {row.p}")
                    axis.set_xlabel(f"Log({row.feature})")
                    axis.legend()

                plt.tight_layout()
                plt.show()

            if self.report is not None:
                self.report.report_technique(report_info, [])

        return diff_df
示例#12
0
def KS2samp_normtest(x,alpha):
    """Compare `x` with a random normal sample using a two-sample KS test.

    A sample of len(x) values is drawn from a normal distribution with the
    mean and standard deviation of `x`, and compared with `x`. Because that
    reference sample is random, the result varies between calls unless the
    NumPy random seed is fixed. A one-line verdict is printed.

    :param x: sequence of numbers to test
    :param alpha: significance level; p-values above it print "dist is normal"
    :return: the p-value of the KS test
    """
    normal_samp = np.random.normal(np.mean(x), np.std(x), len(x))
    KS, pKS = st.ks_2samp(x, normal_samp)
    verdict = "dist is normal" if pKS > alpha else "dist is NOT normal"
    # Single formatted argument: same output on Python 2 and Python 3.
    print("{}; KStest pval= {}".format(verdict, round(pKS, 2)))
    return pKS
示例#13
0
 def compute(train_scores, validate_scores):
     """
     Return the two-sample KS p-value between the two score collections.

     train_scores / validate_scores: predicted scores on the train and
     validate sets respectively.
     """
     outcome = stats.ks_2samp(train_scores, validate_scores)
     return outcome.pvalue
示例#14
0
def ks_test(observation_pdf, pdf):
    """Return the two-sample KS p-value between two flattened arrays.

    Both arguments are reshaped to one dimension and handed to ks_2samp as
    samples; only the p-value is returned.
    NOTE(review): the names suggest density values, but ks_2samp treats its
    inputs as raw samples - confirm this is intended.
    """
    flat_observed = observation_pdf.reshape(-1)
    flat_reference = pdf.reshape(-1)
    return stats.ks_2samp(flat_observed, flat_reference)[1]
    plt.savefig("{}/total_length_random_walk.pdf".format(path_to_read))
    # plt.show()
    plt.close()

    to_check = ["fitness", "no_overlapping", "direction"]

    for c in to_check:
        for f in fit:
            here_list = copy.deepcopy(fit)
            here_list.remove(f)

            total = []
            # for el in here_list:
            total.append(
                stats.ks_2samp(
                    df[df["source"] == "fitness_no_visited_seed_pd0_"][c],
                    df[df["source"] ==
                       "fitness_no_visited_seed_pd1_"][c]).pvalue)
            logger.info(f)
            logger.info(total)
            logger.info(np.mean(np.array(total)))
            logger.info(np.std(np.array(total)))
            logger.info("---")

        logger.info("------------------------")

    #
    # path = "/Users/alessandrozonta/Desktop/output_random_walk/"
    # folders = sorted_nicely(glob.glob("{}*/".format(path)))
    #
    # for f in folders:
    #     name_folder = f.split("/")[-1]
示例#16
0
    # plt.close()

    # ax = sns.boxplot(x="source", y="direction", data=df)
    # ax.set(xlabel='sources', ylabel='directions')
    # sns.despine(offset=10, trim=True)
    # path_to_read = "/Users/alessandrozonta/PycharmProjects/NEAT/output/"
    # plt.savefig("{}/directions_neat.pdf".format(path_to_read))
    # # plt.show()
    # plt.close()

    to_check = ["fitness", "no_overlapping", "direction"]

    for c in to_check:
        total = []
        total.append(
            stats.ks_2samp(df[df["source"] == "neat2"][c],
                           df[df["source"] == "neat012"][c]).pvalue)
        total.append(
            stats.ks_2samp(df[df["source"] == "neat2"][c],
                           df[df["source"] == "neat01"][c]).pvalue)
        total.append(
            stats.ks_2samp(df[df["source"] == "neat2"][c],
                           df[df["source"] == "neat02"][c]).pvalue)
        total.append(
            stats.ks_2samp(df[df["source"] == "neat2"][c],
                           df[df["source"] == "neat0"][c]).pvalue)
        total.append(
            stats.ks_2samp(df[df["source"] == "neat2"][c],
                           df[df["source"] == "neat12"][c]).pvalue)
        total.append(
            stats.ks_2samp(df[df["source"] == "neat2"][c],
                           df[df["source"] == "neat1"][c]).pvalue)
def ks_test(x):
    """Return the KS statistic between the first and second half of `x`.

    The sequence is split at len(x) // 2; for an odd length the second half
    receives the extra element. Floor division keeps the split index an
    integer on Python 3, where `/` would produce a float and slicing would
    raise TypeError.

    :param x: sliceable sequence of numbers with at least two elements
    :return: the two-sample Kolmogorov-Smirnov statistic
    :raises ValueError: from ks_2samp when either half is empty
    """
    midpoint = len(x) // 2
    test_values, ref_values = x[:midpoint], x[midpoint:]
    return ks_2samp(test_values, ref_values)[0]
示例#18
0
	
	# Specific Wnt Pearson Correlations-------------------------------------------------------
	
	pearsonsPairedWnt = list()
	for pair in wntPairs:
		if pair[0] in geneDict and pair[1] in geneDict:
			corr, pVal = pearsonr(geneDict[pair[0]], geneDict[pair[1]])
			if corr == 1.0:
				corr = 0.99999
			pearsonsPairedWnt.append(math.atanh(corr))

	# END Specific Wnt Pearson Correalations--------------------------------------------------

	# Determine if this is statistically significant------------------------------------------

	ks, pValNon = ks_2samp(pearsons, pearsonsWnt)
	ks, pValSpec = ks_2samp(pearsons, pearsonsPairedWnt)
	sigFile.write(file + " " + str(pValNon) + "," + str(pValSpec) + "\n")
	print(file + " " + str(pValNon) + "," + str(pValSpec) + "\n")

	# BUILD HISTOGRAM-------------------------------------------------------------------------

	if pValNon < (0.05 / 769.0) or pValSpec < (0.05 / 769.0):
		print("Number of Pairwise Wnt Data Points: " + str(len(pearsonsPairedWnt)))
		try:
			# the histogram of the random data
			plt.hist(pearsons, 75, density=True, range = [-2,2], facecolor='b', alpha=0.25)			# Blue Random Background Data
			plt.hist(pearsonsWnt, 75, density=True, range = [-2,2], facecolor='g', alpha=0.25)		# Green Non specific Wnt Data
			plt.hist(pearsonsPairedWnt, 75, density = True, range = [-2,2], facecolor='r', alpha=0.25)	# Red specific Wnt Data
		
			plt.xlabel('Correlation')
    # real_names = ["RWFBNV", "RWFB", "neat02", "aa"]
    # for name in real_names:
    #     ax = sns.distplot(df[df["source"] == name]["total_length"], label=name, kde=False, rug=True)
    # ax.set(xlabel='total length')
    # plt.legend()
    # sns.despine(offset=10, trim=True)
    # plt.savefig("{}/total_length_neat.pdf".format(path_to_read))
    # plt.show()
    # plt.close()

    to_check = ["fitness", "no_overlapping", "direction"]

    for c in to_check:
        total = []
        total.append(
            stats.ks_2samp(df[df["source"] == "neat02"][c],
                           df[df["source"] == "aa"][c]).pvalue)
        total.append(
            stats.ks_2samp(df[df["source"] == "neat02"][c],
                           df[df["source"] == "RWFB"][c]).pvalue)
        total.append(
            stats.ks_2samp(df[df["source"] == "neat02"][c],
                           df[df["source"] == "RWFBNV"][c]).pvalue)
        logger.info(total)
        logger.info(np.mean(np.array(total)))
        logger.info(np.std(np.array(total)))

        total = []
        total.append(
            stats.ks_2samp(df[df["source"] == "aa"][c],
                           df[df["source"] == "RWFB"][c]).pvalue)
        total.append(
示例#20
0
        numsamples=500
        samp_size=10
        alpha=0.05
        print "\n",lith[f],round(np.mean(curRh),0)
        smd.append(SMD_analysis(curUCS,numsamples,samp_size,alpha))
        curax=plt.gca()
        if f==len(lith)-1:
            curax.set_xlabel("UCS, MPa")
        plt.setp(curax.get_yticklabels(), visible=False)
        plt.tight_layout()

    

    for f in range(len(lith)):
        cursmd=smd[f]
        print ""
        for g in range(f+1,len(lith)):
            comsmd=smd[g]

            T,pT=st.ttest_ind(cursmd,comsmd)
            KS,pKS=st.ks_2samp(cursmd,comsmd)
            print [lith[f], lith[g], round(pT,3), round(pKS,3)]
                          
                          
plt.show()