Exemplo n.º 1
0
def save_enrichment(x):
    """Run enrichment on every cluster of six result files and export the hits.

    Steps, in order:

    1. Write the indexed list returned by ``gp.get_library_name('Human')`` to
       ``gensets.txt`` (one ``"<index> <name>"`` line per library).
    2. For each of the six ``gcn-*`` / ``gae-*`` CSV files under ``x``, load
       the clusters with ``read_file_2`` and call ``gp.enrichr`` on element 2
       of every cluster record, using library number 53 as the gene set.
    3. Tag each result table with the model number and the cluster key,
       keep rows whose ``P-value`` is below 0.05 and write them to
       ``enrich-cluster/full-results.xlsx`` (sheet ``sheet1``).

    Args:
        x: Directory (used as a path prefix) that contains the six CSV files.
    """
    libraries = gp.get_library_name('Human')

    with open('gensets.txt', 'w') as f:
        for index, name in enumerate(libraries):
            f.write("%s %s\n" % (index, name))
    lib = libraries[53]

    files = [(1, x + "/gcn-hom-hom.csv"), (2, x + "/gcn-hom-onto.csv"),
             (3, x + "/gcn-onto-onto.csv"), (4, x + "/gae-hom-hom.csv"),
             (5, x + "/gae-hom-onto.csv"), (6, x + "/gae-onto-onto.csv")]

    # Collect the per-cluster tables and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    frames = []
    for key, file in files:
        print(file)
        cluster_data = read_file_2(file)
        for i in cluster_data:
            try:
                enr = gp.enrichr(gene_list=list(cluster_data[i][2]),
                                 gene_sets=lib,
                                 organism='Human',
                                 cutoff=0.05).results
            except Exception as err:
                # Stay best-effort, but skip the cluster: falling through
                # would either hit an undefined `enr` or re-append the
                # previous cluster's table under the wrong labels.
                print("enrichment failed for %s cluster %s: %s"
                      % (file, i, err))
                continue
            enr['model'] = key
            enr['cluster'] = i
            frames.append(enr)

    if frames:
        df = pd.concat(frames)
        df = df[df['P-value'] < 0.05]
    else:
        # Nothing succeeded: still produce an (empty) workbook.
        df = pd.DataFrame()

    # The context manager saves and closes the workbook; ExcelWriter.save()
    # no longer exists in pandas 2.0.
    with pd.ExcelWriter('enrich-cluster/full-results.xlsx') as writer:
        df.to_excel(writer, sheet_name="sheet1")
Exemplo n.º 2
0
def scatter_stats(perms):
    """Collect regression r-values between pairs of result files over permutations.

    For each permutation, 50 gene-set keys are sampled at random from
    ``../enrich_red/selected_genesets.csv``.  For every pair in the
    module-level ``files`` sequence (``pair[0]`` and ``pair[1]`` are file
    stems under ``../enrich_red/``), the values stored under each sampled key
    in both files are passed to ``res`` in both argument orders, the
    resulting (x, y) points are fitted with ``spy.stats.linregress`` and the
    fit's ``rvalue`` is appended to the list for ``"<pair[0]>-<pair[1]>"``.
    The collected lists are written to ``../perms2/<lib>.csv`` with
    ``write_file``.

    Args:
        perms: Number of random permutations to run.

    Raises:
        KeyError: If a pair in ``files`` has no entry in the result table, or
            a sampled key is missing from one of the pair's files.
    """
    pair_names = (
        "gae-hom-hom-gae-hom-onto",
        "gae-hom-hom-gae-onto-onto",
        "gae-onto-onto-jcd-onto",
        "gae-hom-hom-jcd-hom",
        "gae-hom-hom-jcd-onto",
        "gae-hom-onto-gae-onto-onto",
        "gae-hom-onto-jcd-hom",
        "gae-hom-onto-jcd-onto",
        "gae-onto-onto-jcd-hom",
        "jcd-hom-jcd-onto",
    )
    r_values = {name: [] for name in pair_names}

    # The input files do not change between permutations, so read each of
    # them once up front instead of once per permutation.
    geneset_keys = list(
        read_file_2("../enrich_red/selected_genesets.csv").keys())
    loaded_pairs = [
        (pair,
         read_file_2("../enrich_red/" + pair[0] + ".csv"),
         read_file_2("../enrich_red/" + pair[1] + ".csv"))
        for pair in files
    ]

    for perm in range(perms):

        print("********** permutation no {} *********".format(perm))

        sampled_keys = random.sample(geneset_keys, 50)

        for pair, first, second in loaded_pairs:
            print("file is {}".format(pair))

            points = []
            # Both files are indexed by the same sampled key, so one loop
            # over the keys replaces the two per-permutation dict copies.
            for key in sampled_keys:
                first_values = list(first[key])
                second_values = list(second[key])

                points.extend(res(first_values, second_values))
                points.extend(res(second_values, first_values))

            xs, ys = zip(*points)

            linreg = spy.stats.linregress(xs, ys)

            r_values[pair[0] + "-" + pair[1]].append(linreg.rvalue)

    write_file("../perms2/" + lib + ".csv", r_values)
Exemplo n.º 3
0
def one_correlation():
    """Show one scatter plot with a fitted line for every entry of ``files``.

    The selected gene sets are read from
    ``../enrich_red/selected_genesets.csv``.  For each file stem in the
    module-level ``files`` sequence, the entries of the selected table and of
    ``../enrich_red/<stem>.csv`` are paired positionally, passed to ``res``
    with ``top_n`` fixed at 20, and the resulting (x, y) points are plotted
    together with the line fitted by ``spy.stats.linregress``.  Each figure
    is displayed with ``plt.show()``.
    """
    top_n = 20
    selected = read_file_2("../enrich_red/selected_genesets.csv")

    for stem in files:
        similar = read_file_2("../enrich_red/" + stem + ".csv")

        # Keys of the two tables are matched by iteration order, not by name.
        points = []
        for selected_key, similar_key in zip(selected, similar):
            points.extend(
                res(list(selected[selected_key]),
                    list(similar[similar_key]),
                    top_n))

        xs, ys = zip(*points)

        fig, ax = plt.subplots()
        ax.scatter(xs, ys)

        linreg = spy.stats.linregress(xs, ys)
        slope = linreg.slope.item()
        fitted = [linreg.intercept + slope * value for value in xs]
        plt.plot(xs, fitted, 'r')

        # Leave one standard deviation of head-room above the largest value
        # on each axis.
        x_bounds = (min(xs), max(xs) + statistics.stdev(xs))
        y_bounds = (min(ys), max(ys) + statistics.stdev(ys))
        plt.xlim(*x_bounds)
        plt.ylim(*y_bounds)

        plt.ylabel("Corresponding P-value")
        plt.xlabel("Actual P-value")

        # Invisible rectangle used as a handle so the legend can carry text.
        blank_handle = Rectangle((0, 0), 1, 1,
                                 fc="w", fill=False,
                                 edgecolor='none', linewidth=0)
        # NOTE(review): the first label says "R2" but shows linreg.rvalue
        # (r, not r squared) - confirm which one is intended.
        legend_labels = ('R2 = %0.2f' % linreg.rvalue,
                         'Slope = %0.2f' % linreg.slope)
        ax.legend([blank_handle, blank_handle], legend_labels)

        plt.title("Correlation plot for " + stem)
        plt.show()
Exemplo n.º 4
0
def stat_correlation(perms, top_n):
    """Collect regression r-values against the selected gene sets over permutations.

    For each permutation, 50 gene-set keys are sampled at random from
    ``../enrich_red/selected_genesets.csv``.  For every file stem in the
    module-level ``files`` sequence, the values stored under each sampled key
    in the selected table and in ``../enrich_red/<stem>.csv`` are passed to
    ``res`` together with ``top_n``; the resulting (x, y) points are fitted
    with ``spy.stats.linregress`` and the fit's ``rvalue`` is appended to the
    list for that stem.  The collected lists are written to
    ``../perms/<lib>.csv`` with ``write_file``.

    Args:
        perms: Number of random permutations to run.
        top_n: Forwarded unchanged as the third argument of ``res``.

    Raises:
        KeyError: If a stem in ``files`` has no entry in the result table, or
            a sampled key is missing from that stem's file.
    """
    r_values = {
        "gae-hom-hom": [],
        "gae-hom-onto": [],
        "gae-onto-onto": [],
        "jcd-hom": [],
        "jcd-onto": [],
    }

    # The inputs do not change between permutations: read the selected gene
    # sets once (previously twice per permutation) and each result file once
    # (previously once per permutation).
    selected_all = read_file_2("../enrich_red/selected_genesets.csv")
    geneset_keys = list(selected_all.keys())
    similar_tables = [
        (stem, read_file_2("../enrich_red/" + stem + ".csv"))
        for stem in files
    ]

    for perm in range(perms):

        print("********** permutation no {} *********".format(perm))

        sampled_keys = random.sample(geneset_keys, 50)

        for stem, similar_all in similar_tables:
            print("file is {}".format(stem))

            points = []
            # Both tables are indexed by the same sampled key.
            for key in sampled_keys:
                points.extend(
                    res(list(selected_all[key]),
                        list(similar_all[key]),
                        top_n))

            xs, ys = zip(*points)

            linreg = spy.stats.linregress(xs, ys)

            r_values[stem].append(linreg.rvalue)

    write_file("../perms/" + lib + ".csv", r_values)
Exemplo n.º 5
0
def distribution(confidence=0.95):
    """Print mean and t-based confidence interval for every method's values.

    Reads ``../perms2/GO_Biological_Process_2018.csv`` with ``read_file_2``
    and, for each method in it, computes the mean, a two-sided confidence
    interval for the mean (lower bound clipped at 0) and calls ``qq_plot``
    with the values, the method name and ``../qqplots/qq-<method>.png``.
    The summary table is printed both as a DataFrame and as LaTeX.

    Args:
        confidence: Confidence level of the interval; defaults to 0.95.

    Raises:
        statistics.StatisticsError: If a method has fewer than two values
            (raised by ``statistics.stdev``).
    """
    mthds = read_file_2("../perms2/GO_Biological_Process_2018.csv")

    # Probability mass in each tail of the two-sided interval.
    tail = (1 - confidence) / 2

    columns = ['Method', 'PT. Est', 'lower CI', 'upper CI']
    rows = []

    for i in mthds:
        values = mthds[i]
        n = len(values)

        mean = statistics.mean(values)
        std = statistics.stdev(values)

        # Degrees of freedom follow the sample size (n - 1) instead of being
        # fixed at 23, which was only right for exactly 24 values.
        t_score = abs(t.ppf(tail, n - 1))
        margin = t_score * std / math.sqrt(n)

        # The lower bound is clipped at 0.
        lower_ci = max(mean - margin, 0)
        upper_ci = mean + margin

        qq_plot(pd.Series(values), i, "../qqplots/qq-" + i + ".png")

        rows.append(
            [i, round(mean, 3),
             round(lower_ci, 3),
             round(upper_ci, 3)])

    df = pd.DataFrame.from_records(rows, columns=columns)

    print(df)
    print(df.to_latex(index=True))
Exemplo n.º 6
0
    sample = read_file("enrich_red/gae-hom-hom.csv")
    red = read_file("enrich_red/selected_genesets.csv")
    temp = {}
    for i in red:
        if i in sample:
            temp[i] = red[i]
    write_file("enrich_red/selected_genesets.csv", temp)


# reduce_genesets()

# Library name at index 53 of the list returned by gp.get_library_name('Human').
lib = gp.get_library_name('Human')[53]

# One tuple per result file: element 0 is the file stem under enrich_red/,
# element 2 selects the ranking file in the loop below; element 1 is only
# referenced by the commented-out neighbour lookup in that loop.
files = [("gae-hom-hom", 1, 1), ("gae-hom-onto", 1, 2),
         ("gae-onto-onto", 2, 3), ("jcd-hom-hom", 1, 4),
         ("jcd-onto-onto", 2, 5)]

# Forward slashes: the previous "data\ms-project\data-description.csv"
# relied on the invalid escape sequences "\m" and "\d" (a SyntaxWarning on
# Python 3.12+) and only resolved as a path on Windows.
data_desc = read_file_2("data/ms-project/data-description.csv")

for i in files:
    file_name = "enrich_red/" + i[0] + ".csv"
    file = read_file(file_name)
    # if i[1] == 1:
    #     neigh = read_file_2("data/ms-project/neig_len_hom.csv")
    # else:
    #     neigh = read_file_2("data/ms-project/neig_len_onto.csv")

    if i[2] == 1:
        rank = read_file_2(
            "ranking_results/ -- GAE -- Homology -- Homology.csv")
    elif i[2] == 2:
        rank = read_file_2(
            "ranking_results/ -- GAE -- Homology -- Ontology.csv")