예제 #1
0
def make_snp_venn(venn_grouped_info, selected_comparison, save_path):
    """Draw a Venn diagram of variant positions per phenotype and save it as PNG.

    Args:
        venn_grouped_info: Iterable of mappings, each holding a ``'sets'``
            collection of sample names and a ``'components'`` collection of
            variant positions. It is traversed twice.
        selected_comparison: Mapping of phenotype name to the samples that
            make up that phenotype.
        save_path: Prefix to which ``'variants/snp_venn_fig.png'`` is appended
            by plain string concatenation.
    """
    # 'Other' is inserted first so it keeps its position in the diagram.
    variants_by_group = {'Other': set()}
    for group_name in selected_comparison:
        variants_by_group[group_name] = set()

    # Every sample that belongs to at least one selected phenotype.
    compared_samples = set()
    for members in selected_comparison.values():
        compared_samples.update(members)

    # First pass: record all samples seen and fill the phenotype groups.
    seen_samples = set()
    for entry in venn_grouped_info:
        entry_sets = entry['sets']
        seen_samples.update(entry_sets)
        for group_name, members in selected_comparison.items():
            if any(member in entry_sets for member in members):
                variants_by_group[group_name].update(entry['components'])

    # Second pass: variants carried by samples outside every phenotype.
    leftover_samples = seen_samples - compared_samples
    for entry in venn_grouped_info:
        entry_sets = entry['sets']
        if any(sample in entry_sets for sample in leftover_samples):
            variants_by_group['Other'].update(entry['components'])

    venn(variants_by_group)

    savefig(save_path + 'variants/snp_venn_fig.png', format="png")
예제 #2
0
def main():
    """Build per-organism "true" and "false" index sets and plot their overlap.

    For every organism in ``args.orglist`` (space separated) three CSV files
    sharing the prefix ``args.iopath + organism + args.condition + '/' +
    args.fname`` are read: ``_conditions.csv``, ``_gc_tab.csv`` and
    ``_ngc_tab.csv``. Entries listed in the conditions table are masked out of
    the two tab tables, a 'Majority' column is obtained through
    ``addMajorityCol`` and four index sets (TP, TN, FP, FN) are derived from it.

    Two Venn diagrams are written under ``args.iopath``
    ('venn_TN_TP.png' and 'venn_FN_FP.png') and the indices common to all
    organisms' true sets are printed.

    Returns:
        dict: organism name -> set stored in ``truesets``.
    """
    args = options()
    # NOTE(review): 'verbose' is assigned but never read in this function.
    verbose = args.verbose

    truesets = {}
    falsesets = {}

    for o in args.orglist.split(' '):
        # The file name is split on '_' and fixed positions are parsed as ints.
        # This does not depend on the organism, so it yields the same values
        # on every iteration.
        tmpstr = args.fname.split('_')

        # NOTE(review): Nens and isStochW are parsed but not used below.
        Nens = int(tmpstr[1])
        Ngcs = int(tmpstr[3])
        Nngcs = int(tmpstr[5])
        isStochW = int(tmpstr[8])

        fname = args.iopath + o + args.condition + '/' + args.fname

        dfc = pandas.read_csv(fname + '_conditions.csv')
        gdf = pandas.read_csv(fname + '_gc_tab.csv', index_col=0)
        ngdf = pandas.read_csv(fname + '_ngc_tab.csv', index_col=0)

        # For each row of the conditions table collect two lists of values:
        # columns 1..Ngcs and the following Nngcs columns.
        # The index labels are used as positions for iloc, which matches only
        # for the default 0..n-1 index produced by read_csv here.
        gcs = []
        ngcs = []
        for i in dfc.index:
            gcs.append(list(dfc.iloc[i, 1:(1 + Ngcs)].values))
            ngcs.append(list(dfc.iloc[i,
                                      (1 + Ngcs):(1 + Ngcs + Nngcs)].values))

        # Column i of the mask is True for rows whose index label is NOT in
        # ngcs[i]; where() then keeps those cells and sets the rest to NaN.
        ngdf_mask = ngdf.copy()
        for i in range(len(ngcs)):
            ngdf_mask.iloc[:, i] = ~ngdf.iloc[:, i].index.isin(ngcs[i])
        ngdf_masked = ngdf.where(ngdf_mask, np.nan)

        # Same masking for the gc table, driven by gcs.
        gdf_mask = gdf.copy()
        for i in range(len(gcs)):
            gdf_mask.iloc[:, i] = ~gdf.iloc[:, i].index.isin(gcs[i])
        gdf_masked = gdf.where(gdf_mask, np.nan)

        # Work on copies; presumably addMajorityCol adds the 'Majority' column
        # in place, since that column is read right afterwards — TODO confirm.
        gdf_masked_maj = gdf_masked.copy()
        addMajorityCol(gdf_masked_maj)
        ngdf_masked_maj = ngdf_masked.copy()
        addMajorityCol(ngdf_masked_maj)

        # Index labels split by majority value: gc table 1 -> TP, 0 -> FP;
        # ngc table 0 -> TN, 1 -> FN.
        TP = set(gdf_masked_maj[gdf_masked_maj['Majority'] == 1].index)
        TN = set(ngdf_masked_maj[ngdf_masked_maj['Majority'] == 0].index)
        FP = set(gdf_masked_maj[gdf_masked_maj['Majority'] == 0].index)
        FN = set(ngdf_masked_maj[ngdf_masked_maj['Majority'] == 1].index)

        # NOTE(review): '^' is the symmetric difference, so a label present in
        # both operands is dropped. Confirm a union ('|') was not intended.
        truesets[o] = TP ^ TN
        falsesets[o] = FP ^ FN

    # Labels present in the true set of every organism.
    alltptn = set.intersection(*list(truesets.values()))
    print('\n'.join(alltptn))
    venn.venn(truesets)
    plt.savefig(args.iopath + 'venn_TN_TP.png')

    venn.venn(falsesets)
    plt.savefig(args.iopath + 'venn_FN_FP.png')

    return truesets
예제 #3
0
 def plot_ner_cooccurence_venndiagram(self):
     """Plot a Venn diagram of the sentences in which each NER label occurs."""
     # Map every ner_id to the set of sentence ids it appears in.
     sentences_by_label = (
         self.ner_df.groupby('ner_id')
         .apply(lambda group: set(group['sentence_id']))
         .to_dict()
     )

     # Re-key from label ids to label names for the ids that are present.
     for label_id, label_name in self.id2ner.items():
         if label_id not in sentences_by_label:
             continue
         sentences_by_label[label_name] = sentences_by_label.pop(label_id)

     venn(sentences_by_label)
예제 #4
0
    def draw_venn(self):
        """Show a five-set Venn diagram of the wallpaper item collections."""
        local_checker = self.__local_wallpaper_checker

        # Keys are inserted in the order the diagram should list them.
        dct = {}
        dct['local_items'] = set(local_checker.local_items)
        dct['backup_items'] = set(local_checker.backup_items)
        dct['local_subs'] = set(local_checker.subscribed_items)
        dct['network_subs'] = set(self.__network_wallpaper_checker.subscription)
        dct['network_deleted'] = set(
            self.__deleted_wallpaper_checker.network_deleted_items)

        venn(dct)

        plt.draw()
        plt.show()
예제 #5
0
def plotVenns(df, subs, mycmap, oname):
    """Draw one Venn diagram per entry of ``subs``, stacked vertically, and save.

    Args:
        df: Indexable by the elements of ``subs``; ``df[c]`` is handed to
            ``venn.venn`` as the data of that panel.
        subs: Sequence of keys. Each key is also looked up in the
            module-level ``mylabels`` to obtain the panel title.
        mycmap: Colour map forwarded to ``venn.venn``.
        oname: Output path passed to ``plt.savefig``.
    """
    # squeeze=False always returns a 2-D array of Axes. Without it a
    # single-entry ``subs`` yields a bare Axes object and ``axes[i]`` fails.
    fig, axes = plt.subplots(len(subs), 1,
                             figsize=(5, 4 * len(subs)),
                             squeeze=False)
    for i, c in enumerate(subs):
        ax = axes[i, 0]
        ax.set_title(mylabels[c])
        venn.venn(df[c],
                  cmap=mycmap,
                  ax=ax,
                  fontsize=6,
                  legend_loc="best")
    fig.tight_layout(pad=0.1)
    plt.savefig(oname)
예제 #6
0
    def vennplot(self, data, title, filename):
        """Render ``data`` as a titled Venn diagram and save it as a PNG.

        The image is written to ``<self.outdir>/<filename>.png`` and the
        current pyplot figure is closed afterwards.
        """
        sns.set(rc={'figure.figsize': (12, 12)})
        sns.set_style("ticks")

        fig, ax = plt.subplots()
        sns.despine(fig=fig, ax=ax)
        venn(data, ax=ax)
        ax.set_title(title, fontsize=35, fontweight='bold')

        target = os.path.join(self.outdir, "{}.png".format(filename))
        fig.savefig(target)
        plt.close()
def plot_peptide_overlap(peptide_dict, labels):
    """Plot the overlap of peptide sets and save it as SVG and PNG.

    Two to five sets are drawn with ``venn``, exactly six with ``pseudovenn``.
    For any other count a message is printed and nothing is drawn, but the
    titled figure is still saved.
    """
    fig, ax = plt.subplots(figsize=(10, 10))

    n_sets = len(peptide_dict)
    if n_sets == 6:
        pseudovenn(peptide_dict, cmap="viridis", ax=ax)
    elif 2 <= n_sets <= 5:
        venn(peptide_dict, cmap="viridis", ax=ax)
    else:
        print("No Peptide Venn Diagram plotted due to invalid number of samples.")
        print("Venn Diagrams require between 2 and 6 samples.")

    ax.set_title("Peptide Overlap", fontsize=20)
    ax.legend(labels=labels, fontsize=15, loc='best', bbox_to_anchor=[1.1, 1])

    fig.savefig("Peptide_overlap_venn.svg")
    fig.savefig("Peptide_overlap_venn.png", bbox_inches="tight")
예제 #8
0
 def plot():
     """Save Venn diagrams of the up- and down-regulated ids per gene set.

     Reads ``a_dict``, ``id_column`` and ``output_prefix`` from the enclosing
     scope. An id counts as "up" when its log2FC column is above zero and as
     "down" when it is below zero.
     """
     up, down = {}, {}
     for name, genes_ddf in sorted(a_dict.items()):
         frame = genes_ddf.df
         ids = frame[id_column]
         fold_change = frame[genes_ddf.venn_annotator["log2FC"]]
         up[name] = set(ids[fold_change > 0])
         down[name] = set(ids[fold_change < 0])

     # One 4x4 figure per direction, written next to the output prefix.
     for groups, suffix in ((up, ".up.png"), (down, ".down.png")):
         plt.figure(figsize=(4, 4))
         venn.venn(groups)
         plt.savefig(str(output_prefix) + suffix, dpi=72)
예제 #9
0
def plotVenns4(df, subs, mycmap, oname):
    """Draw up to four Venn diagrams on a 2x2 grid and save the figure.

    Args:
        df: Indexable by the elements of ``subs``; ``df[c]`` is the data of
            one panel.
        subs: Sequence of at most four keys, each also looked up in the
            module-level ``mylabels`` for the panel title.
        mycmap: Colour map forwarded to ``venn.venn``.
        oname: Output path passed to ``plt.savefig``.
    """
    # Panels are filled row by row: top-left, top-right, bottom-left, ...
    grid_cells = ((0, 0), (0, 1), (1, 0), (1, 1))
    fig, axes = plt.subplots(2, 2)
    for position, key in enumerate(subs):
        cell = grid_cells[position]
        axes[cell].set_title(mylabels[key])
        venn.venn(df[key],
                  cmap=mycmap,
                  ax=axes[cell],
                  fontsize=6,
                  legend_loc="best")
    fig.tight_layout(pad=0.1)
    plt.savefig(oname)
예제 #10
0
def main():
    """Merge clonotype tables given on the command line and plot their overlap.

    Each path in ``sys.argv[1:]`` is read as a comma-separated file. The
    sample name is the part of the base name before the first '_'. A paired
    clonotype id is built from the v_gene/cdr3/j_gene columns of both chains,
    the tables are outer-merged on that id and written as a tab-separated
    '<samples>.xls' file. A Venn diagram of the paired ids per sample is saved
    as '<samples>.venn.png'.
    """
    venn_dict = {}  # sample name -> set of paired clonotype ids
    merged = pd.DataFrame()
    sample_names = []

    for path in sys.argv[1:]:
        sample_name = os.path.basename(path).split("_")[0]
        print("read file: {}".format(sample_name))
        sample_names.append(sample_name)

        table = pd.read_csv(path, sep=",", header=0)

        # Build the per-chain ids, then the paired id from those two.
        tra_columns = ['v_gene', 'cdr3', 'j_gene']
        trb_columns = ['v_gene.1', 'cdr3.1', 'j_gene.1']
        table['clonotype_tra_id'] = table[tra_columns].apply('_'.join, axis=1)
        table['clonotype_trb_id'] = table[trb_columns].apply('_'.join, axis=1)
        pair_columns = ['clonotype_tra_id', 'clonotype_trb_id']
        table['clonotype_pair_id'] = table[pair_columns].apply("_".join, axis=1)

        # Keep the pair id plus the two value columns, suffixed per sample.
        table = table[['clonotype_pair_id', 'clonotype_id', 'proportion']]
        table = table.rename(columns={
            'clonotype_id': 'clonotype_id_' + sample_name,
            'proportion': 'proportion_' + sample_name,
        })

        merged = table if merged.empty else pd.merge(
            merged, table, on='clonotype_pair_id', how='outer')

        # The first table seen for a sample name wins.
        pair_ids = set(table['clonotype_pair_id'].tolist())
        if sample_name not in venn_dict:
            venn_dict[sample_name] = pair_ids

    out_stem = "_".join(sample_names)
    merged.to_csv(out_stem + ".xls", sep="\t", index=False)
    print("write xls file: {}.xls".format(out_stem))

    venn(venn_dict)
    plt.savefig(out_stem + ".venn.png", dpi=300)
    print("draw plot: {}.venn.png".format(out_stem))
예제 #11
0
def plot_venn(sets, path):
    """Save a Venn diagram of ``sets`` to ``path``.

    With fewer than two sets nothing is plotted and a notice is printed.
    """
    if len(sets) <= 1:
        print(f'plot_venn: No sets to intersect for {path}')
        return

    figure = venn(sets).get_figure()
    figure.savefig(path)
def generate_report(providers, conn):
    """Plot how the IOC sets of selected vendors overlap and save the image.

    IOCs are fetched through ``getIOCs`` for every provider, but only
    Fortinet, Palo Alto, FarsightSecurity and IID (shown as 'Infoblox') are
    drawn. The figure is written to 'images/IOCs_overlap.png'.
    """
    ioc_sets = {
        provider: getIOCs(conn, 'provider = "{}"'.format(provider))
        for provider in providers
    }

    #['AISCOMM' 'CrowdStrike' 'Cyber threat coalition' 'EmergingThreats' 'FarsightSecurity' 'Fortinet' 'IID' 'Palo Alto' 'SURBL']

    # (provider key, legend label) pairs in display order. AISCOMM,
    # CrowdStrike, SURBL and Cyber threat coalition are deliberately left out.
    shown = (
        ('Fortinet', 'Fortinet'),
        ('Palo Alto', 'Palo Alto'),
        ('FarsightSecurity', 'FarsightSecurity'),
        ('IID', 'Infoblox'),
    )
    display_sets = {}
    for provider, legend_label in shown:
        if provider in providers:
            display_sets[legend_label] = ioc_sets[provider]

    plt.figure(figsize=(4, 4))
    venn(display_sets)
    plt.title('Vendor IOCs overlap on active threats')
    plt.savefig('images/IOCs_overlap.png')
    logging.info('Generated images/IOCs_overlap.png successfully')
예제 #13
0
    def plot_ner_cooccurence_venndiagram(self):
        """Show a Venn diagram of the sentence ids in which each ner_id occurs."""
        ner_df = self.ner_df

        # ner_id -> set of sentence ids containing that label.
        sentences_by_ner = {}
        for ner_id in ner_df['ner_id'].unique():
            rows = ner_df[ner_df['ner_id'] == ner_id]
            sentences_by_ner[ner_id] = set(rows['sentence_id'].unique().tolist())

        venn(sentences_by_ner)
        plt.show()
예제 #14
0
    def plot_ner_cooccurence_venndiagram(self):
        """Plot a percentage Venn diagram of sentences per NER group.

        The padding token and the "O" label are excluded. Labels are grouped
        by their text after the first two characters, and the sentence ids of
        all labels in a group are pooled.
        """
        vocab = self.ner_vocab.vocab.copy()
        for excluded in (pad_token, "O"):
            vocab.pop(excluded)

        # label id -> group name (label text without its two-character prefix)
        group_of_id = {id_: label[2:] for label, id_ in vocab.items()}

        all_groups = defaultdict(set)
        for ner_id, group in group_of_id.items():
            matching = self.ner_df[self.ner_df["ner_id"] == ner_id]
            all_groups[group] |= set(matching["sentence_id"])

        venn(all_groups, fmt="{percentage:.1f}%", cmap="plasma", fontsize=10)
예제 #15
0
def figure_bigmec_unsuccessful_coverage_venn():
    """Plot a Venn diagram of the BGC types of unsuccessful bigmec constructions.

    Reads the pathway summary CSV, keeps the rows whose "Success" value is 0,
    splits each "BGC type" on '/' and groups the types into T1PKS,
    TransAT-PKS, NRPS and Other. The figure is saved as SVG and the group
    sizes are printed.
    """
    summary = pd.read_csv("../Data/constructed_pathways/summary.csv", index_col=0)
    failed = summary.loc[summary["Success"] == 0, :]

    # One record per BGC: a True flag for every type it lists.
    records = []
    for _, row in failed.iterrows():
        bgc_type = row["BGC type"]
        if isinstance(bgc_type, float):
            # A float here means the type is missing (NaN); skip the row.
            continue
        record = {type_name: True for type_name in bgc_type.split("/")}
        record["BGC"] = row["BGC"]
        records.append(record)

    flags = pd.DataFrame(records).fillna(False)

    print(flags.sum())

    named = ["T1PKS", "transAT-PKS-like", "transAT-PKS", "NRPS-like", "NRPS", "PKS-like", "BGC"]
    remaining = [column for column in flags.columns if column not in named]

    grouped = pd.DataFrame()
    grouped["T1PKS"] = flags["T1PKS"]
    grouped["TransAT-PKS"] = flags[["transAT-PKS-like", "transAT-PKS"]].sum(axis=1).astype(bool)
    grouped["NRPS"] = flags[["NRPS-like", "NRPS"]].sum(axis=1).astype(bool)
    grouped["Other"] = flags[remaining].sum(axis=1).astype(bool)
    grouped.index = flags["BGC"]

    dic = {
        group: set(grouped[grouped[group]].index)
        for group in ("T1PKS", "TransAT-PKS", "NRPS", "Other")
    }
    venn.venn(dic)
    plt.savefig("../Figures/bigmec_venn_unsuccessful.svg")

    print("N total successful: ", len(failed))
    for key, value in dic.items():
        print(key, len(value))
예제 #16
0
    def plot_ner_cooccurence_venndiagram(self):
        """Show a Venn diagram of sentence ids for the label ids 2, 3, 4 and 5."""
        # (legend name, ner_id) in the order the sets are drawn.
        label_ids = (("n_drug", 2), ("drug", 3), ("group", 4), ("brand", 5))

        sentence_sets = {}
        for legend_name, ner_id in label_ids:
            matches = self.ner_df.loc[self.ner_df['ner_id'] == ner_id,
                                      'sentence_id']
            sentence_sets[legend_name] = set(matches.tolist())

        venn(sentence_sets)
        plt.show()
예제 #17
0
    def plot_ner_cooccurence_venndiagram(self):
        """Show a Venn diagram of sentence ids for the label ids 1 to 4.

        Label id 0 is not part of the diagram.
        """
        label_ids = {"group": 1, "drug_n": 2, "drug": 3, "brand": 4}

        sentence_sets = {
            legend_name: set(
                self.ner_df[self.ner_df["ner_id"] == ner_id]["sentence_id"])
            for legend_name, ner_id in label_ids.items()
        }

        venn(sentence_sets)
        plt.show()
    def plot_protein_overlap(self, save=False):
        """Plot the overlap of the sets in ``self.protein_dict``.

        Two to five sets are drawn with ``venn``, exactly six with
        ``pseudovenn``; any other count only prints a notice. With
        ``save=True`` the current figure is written as SVG and PNG.
        """
        fig, ax = plt.subplots(figsize=(10, 10))

        n_sets = len(self.protein_dict)
        if n_sets == 6:
            pseudovenn(self.protein_dict, cmap="viridis", ax=ax)
        elif 2 <= n_sets <= 5:
            venn(self.protein_dict, cmap="viridis", ax=ax)
        else:
            print(
                "No Protein Venn Diagram plotted due to invalid number of samples."
            )
            print("Venn Diagrams require between 2 and 6 samples.")

        ax.legend(labels=self.labels, fontsize=15, loc="best",
                  bbox_to_anchor=[1.1, 1])
        ax.set_title("Protein Overlap", fontsize=20)

        if not save:
            return
        plt.savefig("Protein_overlap_venn.svg")
        plt.savefig("Protein_overlap_venn.png", bbox_inches="tight")
예제 #19
0
    def plot_ner_cooccurence_venndiagram(self):
        """Plot a four-set Venn diagram of the sentences containing each label.

        For every sentence id in ``self.data_df`` the rows of ``self.ner_df``
        are counted per label id (1-4). The resulting table is stored on
        ``self.counter_dict`` as ``{sentence_id: {label_id: count}}``. A
        sentence belongs to a label's set when that count is at least one.

        Raises:
            KeyError: if ``ner_df`` holds a label id outside 1-4, or if
                ``id2ner`` does not yield the names 'drug', 'drug_n',
                'group' and 'brand' for the ids 1-4.
        """
        counter_dict = {}
        for sentence_id in self.data_df["sentence_id"].unique():
            counts = {1: 0, 2: 0, 3: 0, 4: 0}
            sentence_rows = self.ner_df.loc[self.ner_df['sentence_id'] ==
                                            sentence_id]
            # Iterate the column directly instead of positional row lookups.
            for ner in sentence_rows['ner_id']:
                counts[ner] += 1
            counter_dict[sentence_id] = counts
        self.counter_dict = counter_dict

        # A single pass over the table is enough. The previous outer loop over
        # the label ids was shadowed by the inner loop variable and only
        # repeated the same work four times.
        sentences_by_label = {}
        for sentence, id_dict in counter_dict.items():
            for label, count in id_dict.items():
                bucket = sentences_by_label.setdefault(self.id2ner[label],
                                                       set())
                if count >= 1:
                    bucket.add(sentence)

        venn({
            "drug": sentences_by_label['drug'],
            "drug_n": sentences_by_label['drug_n'],
            "group": sentences_by_label['group'],
            "brand": sentences_by_label['brand']
        })
예제 #20
0
    def plot_ner_cooccurence_venndiagram(self):
        """Plot a Venn diagram of the sentences containing each entity label.

        Label ids map to sets as 2 -> brand, 3 -> drug, 4 -> drug_n and
        5 -> group. Each set holds the ids of the sentences that contain at
        least one entity of that label.

        Raises:
            AssertionError: if the distinct ids in ``ner_df['ner_id']`` are
                not exactly 2, 3, 4 and 5.
        """
        ner_sentences = self.ner_df["sentence_id"].tolist()
        ner_ids = self.ner_df["ner_id"].tolist()

        # make sure ner_ids are only ner_ids. The distinct ids are sorted
        # first because set iteration order is not guaranteed.
        assert sorted(set(ner_ids)) == [2, 3, 4,
                                        5], 'There are unwanted ner_id\'s!'

        label_of_id = {2: 'brand', 3: 'drug', 4: 'drug_n', 5: 'group'}
        sentence_sets = {label: set() for label in label_of_id.values()}

        for sent_id, ner_id in zip(ner_sentences, ner_ids):
            label = label_of_id.get(ner_id)
            # Unknown ids are ignored here; the assertion above reports them.
            if label is not None:
                sentence_sets[label].add(sent_id)

        venn(sentence_sets)
예제 #21
0
    def plot_ner_cooccurence_venndiagram(self):
        """Show a titled Venn diagram of the sentences each NER label occurs in."""
        ner_df = self.ner_df

        # label name -> set of sentence ids that contain the label
        venn_data = {}
        for ner_id in ner_df['ner_id'].unique():
            is_label = ner_df['ner_id'] == ner_id
            sentence_ids = ner_df[is_label]['sentence_id'].unique()
            venn_data[self.id2ner[ner_id]] = set(sentence_ids)

        axes = venn(venn_data)
        axes.set_title('NER labels co-occur in sentences')
        plt.show()
예제 #22
0
    def plot_ner_cooccurence_venndiagram(self):
        """Show a titled Venn diagram of sentence ids grouped by NER label name."""
        label_column = self.ner_df['ner_id']

        # label name -> set of sentence ids that contain the label
        venn_data = {}
        for ner_id in label_column.unique():
            label_rows = self.ner_df[self.ner_df['ner_id'] == ner_id]
            venn_data[self.id2ner[ner_id]] = set(label_rows['sentence_id'].unique())

        axes = venn(venn_data)
        axes.set_title('NER DISTRIBUTION BETWEEN SENTENCES')
        # The figure is shown through its own handle rather than pyplot.
        axes.get_figure().show()
def venn_diagram(df_NP, taxonomy_Double):
    """
    Create a Venn diagram of the taxonomies plants, bacteria, animals and fungi.

    Args:
        df_NP: Data frame with a ``taxonomy`` column. Entries that contain
            'double' are left out; the others are converted to lists.
        taxonomy_Double: List of taxonomy lists for the 'double' entries. It
            is appended to the lists taken from ``df_NP``.

    Saves the diagram as 'output_data/Venn-Diagram.png' and prints a
    completion message.
    """
    taxonomy_Single = [
        list(tax) for tax in df_NP.taxonomy if 'double' not in tax
    ]
    taxonomy_All = taxonomy_Single + taxonomy_Double
    plants = set()
    bacteria = set()
    animals = set()
    fungi = set()
    # For every taxonomy list that mentions a kingdom, the ``index`` attribute
    # of each of its elements is added to that kingdom's set.
    # NOTE(review): if the elements are plain strings, ``tax.index`` is the
    # bound ``str.index`` method rather than a row identifier. Confirm which
    # value was meant to be collected.
    for tax_list in taxonomy_All:
        if "plants" in tax_list:
            for tax in tax_list:
                plants.add(tax.index)
        if "bacteria" in tax_list:
            for tax in tax_list:
                bacteria.add(tax.index)
        if "animals" in tax_list:
            for tax in tax_list:
                animals.add(tax.index)
        if "fungi" in tax_list:
            for tax in tax_list:
                fungi.add(tax.index)
    dic_for_venn = {
        "plants": plants,
        "bacteria": bacteria,
        "animals": animals,
        "fungi": fungi
    }
    # NOTE(review): 'fig' is assigned but not used afterwards.
    fig = venn.venn(dic_for_venn)
    plt.title("venn-diagram from the taxonomy of aglycons")
    plt.savefig("output_data/Venn-Diagram.png")
    print("VENN DIAGRAM DONE")
예제 #24
0
    with open(filename, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['Tag', 'Count'])
        for tag, count in zip(tags, counts):
            writer.writerow([tag, count])


def print_tags(tags_map):
    """Print one '<tag>: <number of track ids>' line per entry of ``tags_map``."""
    for tag_value, track_ids in tags_map.items():
        how_many = len(track_ids)
        print('{}: {}'.format(tag_value, how_many))


if __name__ == '__main__':
    data, total = read_data()
    df = pd.read_csv('results/stats_intersected.csv')
    quads = {1: 'I', 2: 'II', 3: 'III', 4: 'IV'}

    # Start with an empty track set for every quadrant name.
    tracks = {}
    for quad_name in quads.values():
        tracks[quad_name] = set()

    # Pool the tracks of all tags assigned to each quadrant.
    for quad_i, quad_name in quads.items():
        quad_tags = df[df.Quad == quad_i].Tags
        for tag in quad_tags:
            tracks[quad_name] |= data[tag]
        print(quad_name, len(tracks[quad_name]))

    venn.venn(tracks)
    plt.savefig('results/quads.png', bbox_inches='tight')
    # NOTE(review): the title is set after savefig, so the saved PNG has no
    # title and only the interactive window shows it. Confirm this is intended.
    plt.title('Tracks ')
    plt.show()

    # print('Total: {}'.format(total))
    # tags, counts = get_top_tags(data)
    # export_stats(tags, counts)
예제 #25
0
def single_venn():
    """Show a Venn diagram of the module-level ``dois`` with a fixed palette."""
    palette = ['r', 'g', 'b', 'y', 'purple']
    venn(dois, cmap=palette)
    plt.show()
예제 #26
0
        columns=rename_map).set_index(index_names))
    # and return as series
    return df['id']


# In[8]:

# probably stick with venn diagram here
# Figure size and plain white style for the Venn diagram below.
sns.set({'figure.figsize': (8, 10)})
sns.set_style('white')

# get only sample lists from gene expression and mutation
labels = ['gene expression', 'mutation']
label_map = {l: sample_lists[l] for l in labels}

# Two-set Venn diagram of the samples in each data type.
venn(label_map)
plt.title('TCGA sample intersections, gene expression data', size=14)

# ### Count overlap between gene expression, methylation, and mutation datasets

# In[9]:

sns.set_style('white')
# 'labels' is rebound here to cover all four data types.
labels = ['gene expression', 'mutation', '27k methylation', '450k methylation']
samples = [sample_lists[l] for l in labels]

upset_series = series_from_samples(samples, labels)
# Notebook-style expression: the 20 smallest non-zero entries are computed,
# but the result is only displayed in an interactive session.
upset_series[upset_series != 0].sort_values().head(20)

# In[10]:
예제 #27
0
    'Schmidt et al. 2016', 'Peebo et al. 2015', 'Li et al. 2014',
    'Valgepea et al. 2013'
],
                   rotation=90)
# ax.set_xticks(rotation=75)
# Label the y axis and save the figure built above.
ax.set_ylabel('total number of\nproteins quantified')
plt.savefig('../../figures/intersections_venn_summed.pdf', bbox_inches='tight')

# %%
# Venn Diagram
fig = plt.figure()

# Citation label -> set of unique gene names recorded for that dataset.
datasets = {
    'Schmidt et al. 2016':
    set(data[data['dataset'] == 'schmidt_2016']['gene_name'].unique()),
    'Peebo et al. 2015':
    set(data[data['dataset'] == 'peebo_2015']['gene_name'].unique()),
    'Li et al. 2014':
    set(data[data['dataset'] == 'li_2014']['gene_name'].unique()),
    'Valgepea et al. 2013':
    set(data[data['dataset'] == 'valgepea_2013']['gene_name'].unique())
}
# One colour per dataset, in the same order as the dictionary above.
venn(datasets,
     cmap=[
         colors['light_blue'], colors['green'], colors['purple'], colors['red']
     ])

plt.savefig('../../figures/figS3_intersections_venn.pdf', bbox_inches='tight')

# %%
    def look_for_diagnostic_sqeuences(self):
        """Search the ITS2 count table for sequences diagnostic of a host group.

        First, every sequence with a non-zero count in all samples of a host
        group is collected per group and the groups are plotted as a Venn
        diagram. Second, the search is narrowed to sequences that are found in
        all samples of one group and in no sample of any other group; that
        result is printed.

        Result:
        In the end there are very few sequences that are found in common with
        all samples of a host group. This approach will not work. Single
        sequences cannot be diagnostic.
        """
        counts = self.counts_df_with_host
        groups = self.group_to_sample_list_dict
        seq_names = list(counts)
        tot = len(seq_names)

        # Pass 1: sequences present in every sample of a group.
        host_group_to_seq_dd = defaultdict(set)
        for svd_group, sample_list in groups.items():
            group_samples = set(sample_list)
            for i, seq in enumerate(seq_names):
                print(f"{svd_group}:{i}/{tot}")
                present = counts[seq]
                present = present[present != 0]
                if group_samples.issubset(present.index.values):
                    host_group_to_seq_dd[svd_group].add(seq)

        venn_obj = venn(host_group_to_seq_dd)
        venn_obj.set_title("Venn of sequnces found in all\nsamples of a given host group")
        plt.savefig('/home/humebc/projects/tara/tara_full_dataset_processing/host_diagnostic_ITS2/venn_plot.png' )

        # A looser variant was tried earlier: a sequence was kept for a group
        # when the samples holding it overlapped the group's samples
        # (len(intersection) > 1). It was dropped because a Venn diagram is
        # not really right for what we want to show here.

        # Pass 2: sequences present in every sample of this group and in no
        # sample of any other group.
        host_group_to_seq_dd = defaultdict(set)
        for svd_group, sample_list in groups.items():
            group_samples = set(sample_list)
            other_sample_lists = [
                samples for name, samples in groups.items() if name != svd_group
            ]
            for seq in seq_names:
                present = counts[seq]
                present = present[present != 0]
                if not group_samples.issubset(present.index.values):
                    continue
                holders = set(present.index.values)
                if any(holders.intersection(set(others))
                       for others in other_sample_lists):
                    continue
                host_group_to_seq_dd[svd_group].add(seq)

        print("Sequences that are unique diagnostic of the host group:")
        print(host_group_to_seq_dd)
예제 #29
0
        cfg.manifest_url,
        header=7,  # skip first 6 lines
        index_col=0)
    manifest_df.to_csv(cfg.methylation_manifest)

# Notebook-style preview of the first 5 rows and columns; the value is only
# displayed in an interactive session.
manifest_df.iloc[:5, :5]

# In[5]:

# look at overlap of probes in manifest and probes in TCGA dataset
# all probes in TCGA should be in manifest, but we just want to make sure
manifest_probes = manifest_df.index
tcga_probes = tcga_methylation_df.index

sns.set_style('white')
venn({'manifest': set(manifest_probes), 'tcga': set(tcga_probes)})
plt.title('Probe overlap between Illumina manifest and TCGA dataset')

# In[6]:

# get probe type (type I or type II) for TCGA probes from manifest
# The merge joins on the index of both frames and uses pandas' default join
# type. 'CHR' is cast to str and both added columns are renamed.
tcga_methylation_df = (tcga_methylation_df.merge(
    manifest_df[['Infinium_Design_Type', 'CHR']],
    left_index=True,
    right_index=True).astype({
        'CHR': 'str'
    }).rename(
        columns={
            'Infinium_Design_Type': 'probe_type',
            'CHR': 'chromosome'
        }))
예제 #30
0
'''
This program helps visualize data with multiple characteristics.
An example would be the answers to a multiple-choice survey.
The data is displayed in the form of a Venn diagram.
'''

import matplotlib.pyplot as plt
from venn import venn

# Five example sets over the integers 0..16: A holds every value, B to E hold
# the multiples of 2, 3, 4 and 5 respectively.
numbers = {
    "A": {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
    "B": {0, 2, 4, 6, 8, 10, 12, 14, 16},
    "C": {0, 3, 6, 9, 12, 15},
    "D": {0, 4, 8, 12, 16},
    "E": {0, 5, 10, 15}

}

# Region labels are formatted as percentages with one decimal place.
venn(numbers, fmt="{percentage:.1f}%", fontsize=8, legend_loc="upper right")

plt.show()