def make_snp_venn(venn_grouped_info, selected_comparison, save_path):
    """Plot a Venn diagram of variant positions per phenotype and save it as PNG.

    Each key of ``selected_comparison`` gets one set.  It collects the
    ``'components'`` of every entry in ``venn_grouped_info`` whose ``'sets'``
    contain at least one of that phenotype's samples.  Samples that appear in
    ``venn_grouped_info`` but under no phenotype feed an extra ``'Other'`` set.

    The figure is written to ``save_path + 'variants/snp_venn_fig.png'``
    (plain string concatenation, so ``save_path`` needs its own separator).
    """
    experiment_venn_data = {'Other': set()}
    experiment_venn_data.update(
        {phenotype: set() for phenotype in selected_comparison}
    )

    # Every sample that belongs to at least one selected phenotype.
    comparison_sample_list = set()
    for pheno_samples in selected_comparison.values():
        comparison_sample_list.update(pheno_samples)

    all_samples = set()
    for venn_comparison in venn_grouped_info:
        members = venn_comparison['sets']
        all_samples.update(members)
        for phenotype, pheno_samples in selected_comparison.items():
            if any(sample in members for sample in pheno_samples):
                experiment_venn_data[phenotype].update(
                    venn_comparison['components']
                )

    # Second pass: it needs the complete set of samples gathered above.
    other_samples = all_samples - comparison_sample_list
    for venn_comparison in venn_grouped_info:
        members = venn_comparison['sets']
        if any(sample in members for sample in other_samples):
            experiment_venn_data['Other'].update(venn_comparison['components'])

    venn(experiment_venn_data)
    savefig(save_path + 'variants/snp_venn_fig.png', format="png")
def main():
    """Build per-organism "true" and "false" index sets and plot them as Venn diagrams.

    For every organism in ``args.orglist`` (split on single spaces) three CSV
    files sharing the prefix ``iopath + organism + condition + '/' + fname``
    are read, the two tables are masked, a ``'Majority'`` column is evaluated
    and four index sets (TP, TN, FP, FN) are derived.  The intersection of all
    organisms' true sets is printed, and two figures are saved under
    ``args.iopath``.

    Returns:
        dict mapping each organism to ``TP ^ TN``.
    """
    args = options()
    verbose = args.verbose  # read but not used anywhere in this function
    truesets = {}
    falsesets = {}
    for o in args.orglist.split(' '):
        # The file name carries integer fields at '_'-separated positions
        # 1, 3, 5 and 8.  Nens and isStochW are parsed but not used below.
        tmpstr = args.fname.split('_')
        Nens = int(tmpstr[1])
        Ngcs = int(tmpstr[3])
        Nngcs = int(tmpstr[5])
        isStochW = int(tmpstr[8])
        fname = args.iopath + o + args.condition + '/' + args.fname
        dfc = pandas.read_csv(fname + '_conditions.csv')
        gdf = pandas.read_csv(fname + '_gc_tab.csv', index_col=0)
        ngdf = pandas.read_csv(fname + '_ngc_tab.csv', index_col=0)
        gcs = []
        ngcs = []
        # Per conditions row: columns 1..Ngcs go to gcs, the following Nngcs
        # columns go to ngcs.
        # NOTE(review): index labels are used as positions for iloc, which
        # assumes the default 0..n-1 index -- confirm.
        for i in dfc.index:
            gcs.append(list(dfc.iloc[i, 1:(1 + Ngcs)].values))
            ngcs.append(list(dfc.iloc[i, (1 + Ngcs):(1 + Ngcs + Nngcs)].values))
        # Column i of the mask is True for row labels NOT listed in ngcs[i].
        ngdf_mask = ngdf.copy()
        for i in range(len(ngcs)):
            ngdf_mask.iloc[:, i] = ~ngdf.iloc[:, i].index.isin(ngcs[i])
        # Keep values where the mask is True; everything else becomes NaN.
        ngdf_masked = ngdf.where(ngdf_mask, np.nan)
        # Same masking for the gc table, using gcs.
        gdf_mask = gdf.copy()
        for i in range(len(gcs)):
            gdf_mask.iloc[:, i] = ~gdf.iloc[:, i].index.isin(gcs[i])
        gdf_masked = gdf.where(gdf_mask, np.nan)
        # addMajorityCol is defined elsewhere; presumably it adds the
        # 'Majority' column in place, since its return value is ignored and
        # that column is read right after -- TODO confirm.
        gdf_masked_maj = gdf_masked.copy()
        addMajorityCol(gdf_masked_maj)
        ngdf_masked_maj = ngdf_masked.copy()
        addMajorityCol(ngdf_masked_maj)
        # gc table: Majority == 1 -> TP, Majority == 0 -> FP.
        # ngc table: Majority == 0 -> TN, Majority == 1 -> FN.
        TP = set(gdf_masked_maj[gdf_masked_maj['Majority'] == 1].index)
        TN = set(ngdf_masked_maj[ngdf_masked_maj['Majority'] == 0].index)
        FP = set(gdf_masked_maj[gdf_masked_maj['Majority'] == 0].index)
        FN = set(ngdf_masked_maj[ngdf_masked_maj['Majority'] == 1].index)
        # '^' is the symmetric difference: labels present in both operands
        # are dropped, not merged.
        truesets[o] = TP ^ TN
        falsesets[o] = FP ^ FN
    # Labels that are in the true set of every organism.
    alltptn = set.intersection(*list(truesets.values()))
    # str.join requires the index labels to be strings.
    print('\n'.join(alltptn))
    venn.venn(truesets)
    plt.savefig(args.iopath + 'venn_TN_TP.png')
    venn.venn(falsesets)
    plt.savefig(args.iopath + 'venn_FN_FP.png')
    return truesets
def plot_ner_cooccurence_venndiagram(self):
    """Plot a Venn diagram displaying how the NER labels co-occur in sentences."""

    def _sentence_ids(group):
        # All sentence ids that appear in one ner_id group.
        return set(group['sentence_id'])

    # {label id: {sentence1, sentence2, ...}}
    sentences_by_label = (
        self.ner_df.groupby('ner_id').apply(_sentence_ids).to_dict()
    )

    # Replace the label ids that occur in the data with their label names.
    for label_id, label_name in self.id2ner.items():
        if label_id in sentences_by_label:
            sentences_by_label[label_name] = sentences_by_label.pop(label_id)

    venn(sentences_by_label)
def draw_venn(self):
    """Draw and show a Venn diagram of the item collections held by the checkers."""
    local_checker = self.__local_wallpaper_checker
    # (diagram label, collection) in the order the sets are drawn.
    sources = (
        ('local_items', local_checker.local_items),
        ('backup_items', local_checker.backup_items),
        ('local_subs', local_checker.subscribed_items),
        ('network_subs', self.__network_wallpaper_checker.subscription),
        ('network_deleted', self.__deleted_wallpaper_checker.network_deleted_items),
    )
    venn({label: set(items) for label, items in sources})
    plt.draw()
    plt.show()
def plotVenns(df, subs, mycmap, oname):
    """Draw one Venn diagram per entry of ``subs``, stacked vertically, and save them.

    Args:
        df: Indexable object; ``df[c]`` is handed to ``venn.venn`` for every
            ``c`` in ``subs``.
        subs: Sized sequence of keys.  Each key is also looked up in the
            module-level ``mylabels`` to title its subplot.
        mycmap: Colormap forwarded to ``venn.venn``.
        oname: Output path passed to ``savefig``.
    """
    # squeeze=False keeps ``axes`` two-dimensional for any number of rows.
    # With the default, a single subplot comes back as a bare Axes object and
    # indexing it raises a TypeError.
    fig, axes = plt.subplots(
        len(subs), 1, figsize=(5, 4 * len(subs)), squeeze=False
    )
    for ax, c in zip(axes[:, 0], subs):
        ax.set_title(mylabels[c])
        venn.venn(df[c], cmap=mycmap, ax=ax, fontsize=6, legend_loc="best")
    fig.tight_layout(pad=0.1)
    # Save the figure created here rather than whatever is "current".
    fig.savefig(oname)
    return
def vennplot(self, data, title, filename):
    """Render ``data`` as a Venn diagram and save it as ``<filename>.png`` in ``self.outdir``.

    Note: the seaborn figure size and the "ticks" style are set globally.
    """
    sns.set(rc={'figure.figsize': (12, 12)})
    sns.set_style("ticks")

    figure, axis = plt.subplots()
    sns.despine(fig=figure, ax=axis)
    venn(data, ax=axis)
    axis.set_title(title, fontsize=35, fontweight='bold')

    target = os.path.join(self.outdir, "{}.png".format(filename))
    figure.savefig(target)
    plt.close()
def plot_peptide_overlap(peptide_dict, labels):
    """Plot the overlap of the peptide sets and save it as SVG and PNG.

    Two to five sets are drawn with ``venn``, exactly six with ``pseudovenn``.
    For any other count a notice is printed, but the (then empty) figure is
    still titled, given a legend and saved.
    """
    fig, ax = plt.subplots(figsize=(10, 10))

    n_samples = len(peptide_dict)
    if n_samples == 6:
        pseudovenn(peptide_dict, cmap="viridis", ax=ax)
    elif 2 <= n_samples <= 5:
        venn(peptide_dict, cmap="viridis", ax=ax)
    else:
        print("No Peptide Venn Diagram plotted due to invalid number of samples.")
        print("Venn Diagrams require between 2 and 6 samples.")

    ax.set_title("Peptide Overlap", fontsize=20)
    ax.legend(labels=labels, fontsize=15, loc='best', bbox_to_anchor=[1.1, 1])

    fig.savefig("Peptide_overlap_venn.svg")
    fig.savefig("Peptide_overlap_venn.png", bbox_inches="tight")
def plot():
    """Save two Venn diagrams: ids with positive and with negative log2FC.

    Reads ``a_dict``, ``id_column`` and ``output_prefix`` from the enclosing
    scope.  Entries of ``a_dict`` are processed in sorted order.  The figures
    are written to ``<output_prefix>.up.png`` and ``<output_prefix>.down.png``.
    """
    up = {}
    down = {}
    for name, genes_ddf in sorted(a_dict.items()):
        frame = genes_ddf.df
        stable_ids = frame[id_column]
        log2fc = frame[genes_ddf.venn_annotator["log2FC"]]
        up[name] = set(stable_ids[log2fc > 0])
        down[name] = set(stable_ids[log2fc < 0])

    # Same drawing recipe for both directions, "up" first.
    for suffix, id_sets in ((".up.png", up), (".down.png", down)):
        plt.figure(figsize=(4, 4))
        venn.venn(id_sets)
        plt.savefig(str(output_prefix) + suffix, dpi=72)
def plotVenns4(df, subs, mycmap, oname):
    """Draw up to four Venn diagrams on a 2x2 grid and save the figure to ``oname``.

    Panels are filled row by row.  Each key ``c`` of ``subs`` is titled with
    the module-level ``mylabels[c]``.  A fifth entry raises ``IndexError``.
    """
    fig, axes = plt.subplots(2, 2)
    for position, c in enumerate(subs):
        row, col = divmod(position, 2)
        panel = axes[row, col]
        panel.set_title(mylabels[c])
        venn.venn(df[c], cmap=mycmap, ax=panel, fontsize=6, legend_loc="best")
    fig.tight_layout(pad=0.1)
    plt.savefig(oname)
    return
def main():
    """Merge per-sample clonotype tables and draw a Venn diagram of shared clonotypes.

    Every command-line argument is a comma-separated file with a header row
    and the columns ``v_gene``, ``cdr3``, ``j_gene``, ``v_gene.1``,
    ``cdr3.1``, ``j_gene.1``, ``clonotype_id`` and ``proportion``.  The sample
    name is the part of the file's base name before the first ``_``.

    Outputs, both named after the joined sample names:
        * ``<names>.xls``: tab-separated outer merge on ``clonotype_pair_id``.
        * ``<names>.venn.png``: Venn diagram of each sample's pair ids.
    """
    input_paths = sys.argv[1:]
    if not input_paths:
        # With no inputs there is nothing to merge and venn() cannot draw.
        sys.exit(
            "usage: {} SAMPLE_file.csv [SAMPLE_file.csv ...]".format(sys.argv[0])
        )

    venn_dict = {}  # sample name -> set of clonotype pair ids
    # None marks "nothing merged yet".  Testing DataFrame.empty instead would
    # also be true for a header-only first file and silently drop its columns.
    big_DataFrame = None
    sample_name_list = []
    for each in input_paths:
        sample_name = os.path.basename(each).split("_")[0]
        print("read file: {}".format(sample_name))
        sample_name_list.append(sample_name)

        df1 = pd.read_csv(each, sep=",", header=0)

        # Build alpha-chain, beta-chain and paired clonotype identifiers.
        df1['clonotype_tra_id'] = df1[['v_gene', 'cdr3', 'j_gene']].apply('_'.join, axis=1)
        df1['clonotype_trb_id'] = df1[['v_gene.1', 'cdr3.1', 'j_gene.1']].apply('_'.join, axis=1)
        df1['clonotype_pair_id'] = df1[[
            'clonotype_tra_id', 'clonotype_trb_id'
        ]].apply("_".join, axis=1)

        # Keep the merge key plus the two per-sample columns, and suffix the
        # latter with the sample name so they stay distinct after merging.
        df1 = df1[['clonotype_pair_id', 'clonotype_id', 'proportion']]
        df1 = df1.rename(columns={
            'clonotype_id': 'clonotype_id_' + sample_name,
            'proportion': 'proportion_' + sample_name,
        })

        if big_DataFrame is None:
            big_DataFrame = df1
        else:
            big_DataFrame = pd.merge(big_DataFrame,
                                     df1,
                                     on='clonotype_pair_id',
                                     how='outer')

        # setdefault: the first file seen for a sample name wins.
        venn_dict.setdefault(sample_name, set(df1['clonotype_pair_id'].tolist()))

    out_prefix = "_".join(sample_name_list)
    big_DataFrame.to_csv(out_prefix + ".xls", sep="\t", index=False)
    print("write xls file: {}.xls".format(out_prefix))

    venn(venn_dict)
    plt.savefig(out_prefix + ".venn.png", dpi=300)
    print("draw plot: {}.venn.png".format(out_prefix))
def plot_venn(sets, path):
    """Save a Venn diagram of ``sets`` to ``path`` when there are at least two sets.

    With zero or one set nothing is drawn; a notice naming ``path`` is
    printed instead.
    """
    if len(sets) <= 1:
        print(f'plot_venn: No sets to intersect for {path}')
        return
    figure = venn(sets).get_figure()
    figure.savefig(path)
def generate_report(providers, conn):
    """Plot the overlap of IOC sets for a fixed selection of vendors.

    IOCs are fetched through ``getIOCs`` for every entry of ``providers``.
    Only Fortinet, Palo Alto, FarsightSecurity and IID (labelled "Infoblox")
    are drawn, and each only when it occurs in ``providers``.  The figure is
    saved to ``images/IOCs_overlap.png``.
    """
    ioc_sets = {
        provider: getIOCs(conn, 'provider = "{}"'.format(provider))
        for provider in providers
    }

    # (label on the diagram, provider name) in drawing order.  Other
    # providers (AISCOMM, CrowdStrike, SURBL, Cyber threat coalition, ...)
    # are fetched above but intentionally not displayed.
    displayed = (
        ('Fortinet', 'Fortinet'),
        ('Palo Alto', 'Palo Alto'),
        ('FarsightSecurity', 'FarsightSecurity'),
        ('Infoblox', 'IID'),
    )
    display_sets = {
        label: ioc_sets[provider]
        for label, provider in displayed
        if provider in providers
    }

    plt.figure(figsize=(4, 4))
    venn(display_sets)
    plt.title('Vendor IOCs overlap on active threats')
    plt.savefig('images/IOCs_overlap.png')
    logging.info('Generated images/IOCs_overlap.png successfully')
def plot_ner_cooccurence_venndiagram(self):
    """Show a Venn diagram of the sentences in which each NER id occurs.

    The sets are keyed by the raw ``ner_id`` values, in order of first
    appearance in ``self.ner_df``.
    """
    ner_ids = self.ner_df['ner_id']
    sentences_by_id = {
        ner_id: set(
            self.ner_df[ner_ids == ner_id]['sentence_id'].unique().tolist()
        )
        for ner_id in ner_ids.unique()
    }
    venn(sentences_by_id)
    plt.show()
def plot_ner_cooccurence_venndiagram(self):
    """Plot a Venn diagram of how the NER label groups co-occur in sentences.

    Labels are grouped by their name with the first two characters removed,
    so labels that differ only in that prefix share one set.
    """
    vocab = self.ner_vocab.vocab.copy()
    # Drop the padding token and the "O" label (KeyError if either is absent).
    vocab.pop(pad_token)
    vocab.pop("O")

    # label id -> group name
    group_of_id = {id_: label[2:] for label, id_ in vocab.items()}

    sentences_by_group = defaultdict(set)
    for ner_id, group in group_of_id.items():
        rows = self.ner_df[self.ner_df["ner_id"] == ner_id]
        sentences_by_group[group] |= set(rows["sentence_id"])

    venn(sentences_by_group, fmt="{percentage:.1f}%", cmap="plasma", fontsize=10)
def figure_bigmec_unsuccessful_coverage_venn():
    """Create the Venn figure of BGC types among unsuccessful bigmec constructions.

    Reads ``../Data/constructed_pathways/summary.csv``, keeps the rows whose
    ``Success`` column equals 0, splits each ``BGC type`` on ``/`` and groups
    the types into T1PKS, TransAT-PKS, NRPS and Other.  The diagram is saved
    to ``../Figures/bigmec_venn_unsuccessful.svg``; the number of
    unsuccessful rows and the size of every group are printed.
    """
    filename = "../Data/constructed_pathways/summary.csv"
    df = pd.read_csv(filename, index_col=0)
    df_unsuccessful = df.loc[df["Success"] == 0, :]

    # One record per BGC: {"<type>": True, ..., "BGC": <identifier>}.
    dict_list = []
    for _, row in df_unsuccessful.iterrows():
        # A missing "BGC type" is read as NaN (a float); skip those rows.
        if isinstance(row["BGC type"], float):
            continue
        dic = {k: True for k in row["BGC type"].split("/")}
        dic["BGC"] = row["BGC"]
        dict_list.append(dic)

    df_venn = pd.DataFrame(dict_list).fillna(False)
    print(df_venn.sum())

    def has_any(columns):
        # Row-wise flag: True where at least one of the given type columns is
        # set.  Types that never occurred have no column and count as unset.
        present = [c for c in columns if c in df_venn.columns]
        return df_venn[present].sum(axis=1).astype(bool)

    # "PKS-like" is listed here so that it is excluded from "Other"; it is
    # not part of any plotted group.
    cols = ["T1PKS", "transAT-PKS-like", "transAT-PKS", "NRPS-like", "NRPS",
            "PKS-like", "BGC"]
    other_columns = [x for x in df_venn.columns if x not in cols]

    groups = {
        "T1PKS": ["T1PKS"],
        "TransAT-PKS": ["transAT-PKS-like", "transAT-PKS"],
        "NRPS": ["NRPS-like", "NRPS"],
        "Other": other_columns,
    }
    bgc_ids = df_venn["BGC"]
    dic = {
        label: set(bgc_ids[has_any(columns)])
        for label, columns in groups.items()
    }

    venn.venn(dic)
    plt.savefig("../Figures/bigmec_venn_unsuccessful.svg")

    # This counts the Success == 0 rows, so label it accordingly.
    print("N total unsuccessful: ", len(df_unsuccessful))
    for key, value in dic.items():
        print(key, len(value))
def plot_ner_cooccurence_venndiagram(self):
    """Show a Venn diagram of the sentence ids tagged with NER ids 2 to 5."""
    # Diagram label -> ner_id, in drawing order.
    label_ids = {"n_drug": 2, "drug": 3, "group": 4, "brand": 5}
    ner_column = self.ner_df['ner_id']
    venn({
        label: set(self.ner_df.loc[ner_column == ner_id, 'sentence_id'].tolist())
        for label, ner_id in label_ids.items()
    })
    plt.show()
def plot_ner_cooccurence_venndiagram(self):
    """Show a Venn diagram of the sentence ids tagged with NER ids 1 to 4.

    NER id 0 is not part of the diagram.
    """
    # Diagram label -> ner_id, in drawing order.
    label_ids = {"group": 1, "drug_n": 2, "drug": 3, "brand": 4}
    venn({
        label: set(self.ner_df[self.ner_df["ner_id"] == ner_id]["sentence_id"])
        for label, ner_id in label_ids.items()
    })
    plt.show()
def plot_protein_overlap(self, save=False):
    """Plot the overlap of the sets in ``self.protein_dict``.

    Two to five sets are drawn with ``venn``, exactly six with ``pseudovenn``.
    For any other count only a notice is printed.  A legend built from
    ``self.labels`` and a title are always added.  When ``save`` is true the
    current figure is written as SVG and PNG.
    """
    fig, ax = plt.subplots(figsize=(10, 10))

    n_samples = len(self.protein_dict)
    if n_samples == 6:
        pseudovenn(self.protein_dict, cmap="viridis", ax=ax)
    elif 2 <= n_samples <= 5:
        venn(self.protein_dict, cmap="viridis", ax=ax)
    else:
        print(
            "No Protein Venn Diagram plotted due to invalid number of samples."
        )
        print("Venn Diagrams require between 2 and 6 samples.")

    ax.legend(labels=self.labels,
              fontsize=15,
              loc="best",
              bbox_to_anchor=[1.1, 1])
    ax.set_title("Protein Overlap", fontsize=20)

    if not save:
        return
    plt.savefig("Protein_overlap_venn.svg")
    plt.savefig("Protein_overlap_venn.png", bbox_inches="tight")
def plot_ner_cooccurence_venndiagram(self):
    """Plot a Venn diagram of the sentences in which each NER label occurs.

    Side effect: sets ``self.counter_dict`` to
    ``{sentence_id: {1: n, 2: n, 3: n, 4: n}}``, the number of rows of
    ``self.ner_df`` per NER id for every sentence id in ``self.data_df``.

    Raises:
        KeyError: if ``self.ner_df`` holds a ``ner_id`` outside 1-4 for a
            known sentence, or if ``self.id2ner`` does not map ids 1-4 to
            ``'drug'``, ``'drug_n'``, ``'group'`` and ``'brand'``.
    """
    label_ids = (1, 2, 3, 4)

    counter_dict = {
        sentence_id: dict.fromkeys(label_ids, 0)
        for sentence_id in self.data_df["sentence_id"].unique()
    }

    # One pass over the entity table instead of filtering it once per sentence.
    # Entities whose sentence is not in data_df are ignored.
    for sentence_id, ner in zip(self.ner_df['sentence_id'],
                                self.ner_df['ner_id']):
        counts = counter_dict.get(sentence_id)
        if counts is not None:
            counts[ner] += 1
    self.counter_dict = counter_dict

    # Start every label with an empty set so that a label without any
    # sentence is still plotted instead of raising KeyError.
    sentences_by_label = {self.id2ner[label]: set() for label in label_ids}
    for sentence_id, counts in counter_dict.items():
        for label, count in counts.items():
            if count >= 1:
                sentences_by_label[self.id2ner[label]].add(sentence_id)

    venn({
        "drug": sentences_by_label['drug'],
        "drug_n": sentences_by_label['drug_n'],
        "group": sentences_by_label['group'],
        "brand": sentences_by_label['brand'],
    })
def plot_ner_cooccurence_venndiagram(self):
    """Plot a Venn diagram of the sentences containing each entity group.

    NER ids 2, 3, 4 and 5 are drawn as ``brand``, ``drug``, ``drug_n`` and
    ``group``.

    Raises:
        AssertionError: if the ids present in ``self.ner_df`` are not exactly
            2, 3, 4 and 5.
    """
    label_of_id = {2: 'brand', 3: 'drug', 4: 'drug_n', 5: 'group'}

    ner_sentences = self.ner_df["sentence_id"].tolist()
    ner_ids = self.ner_df["ner_id"].tolist()

    # Sentence ids that contain at least one entity of each group.
    sentences = {label: set() for label in label_of_id.values()}
    for sent_id, ner_id in zip(ner_sentences, ner_ids):
        label = label_of_id.get(ner_id)
        if label is not None:
            sentences[label].add(sent_id)

    # Compare as sets: the element order of list(set(...)) is an
    # implementation detail and must not decide whether the check passes.
    assert set(ner_ids) == set(label_of_id), 'There are unwanted ner_id\'s!'

    venn(sentences)
def plot_ner_cooccurence_venndiagram(self):
    """Show a titled Venn diagram of the sentences in which each NER label occurs."""
    ner_column = self.ner_df['ner_id']
    venn_data = {
        self.id2ner[ner_id]: set(
            self.ner_df.loc[ner_column == ner_id, 'sentence_id'].unique()
        )
        for ner_id in ner_column.unique()
    }

    axes = venn(venn_data)
    axes.set_title('NER labels co-occur in sentences')
    plt.show()
def plot_ner_cooccurence_venndiagram(self):
    """Show a titled Venn diagram of how NER labels are spread over sentences."""
    ner_column = self.ner_df['ner_id']

    venn_data = {}
    for ner_id in ner_column.unique():
        matching = self.ner_df.loc[ner_column == ner_id, 'sentence_id']
        venn_data[self.id2ner[ner_id]] = set(matching.unique())

    axes = venn(venn_data)
    axes.set_title('NER DISTRIBUTION BETWEEN SENTENCES')
    # Display through the figure that owns the returned axes.
    axes.get_figure().show()
def venn_diagram(df_NP, taxonomy_Double):
    """Save a Venn diagram of the taxonomies plants, bacteria, animals and fungi.

    Args:
        df_NP: Data frame with all identical aglycon structures in one row;
            its ``taxonomy`` column holds the taxonomies of each row.  Rows
            whose taxonomy contains ``'double'`` are skipped here.
        taxonomy_Double: List of taxonomy lists holding the original
            taxonomies of the 'double' entries.

    Every taxonomy list counts as one entry.  An entry belongs to each of the
    four sets whose name occurs in its list.  The figure is saved to
    ``output_data/Venn-Diagram.png``.
    """
    taxonomy_Single = [
        list(tax) for tax in df_NP.taxonomy if 'double' not in tax
    ]
    taxonomy_All = taxonomy_Single + taxonomy_Double

    dic_for_venn = {
        "plants": set(),
        "bacteria": set(),
        "animals": set(),
        "fungi": set(),
    }
    # Identify each entry by its position in taxonomy_All.  Adding
    # ``tax.index`` would store the bound str method of every taxonomy
    # string, which is not an entry identifier and distorts the counts.
    for entry_id, tax_list in enumerate(taxonomy_All):
        for taxonomy, members in dic_for_venn.items():
            if taxonomy in tax_list:
                members.add(entry_id)

    venn.venn(dic_for_venn)
    plt.title("venn-diagram from the taxonomy of aglycons")
    plt.savefig("output_data/Venn-Diagram.png")
    print("VENN DIAGRAM DONE")
with open(filename, 'w') as fp: writer = csv.writer(fp) writer.writerow(['Tag', 'Count']) for tag, count in zip(tags, counts): writer.writerow([tag, count]) def print_tags(tags_map): for tag_value, track_ids in tags_map.items(): print('{}: {}'.format(tag_value, len(track_ids))) if __name__ == '__main__': data, total = read_data() df = pd.read_csv('results/stats_intersected.csv') quads = {1: 'I', 2: 'II', 3: 'III', 4: 'IV'} tracks = {quad_name: set() for quad_name in quads.values()} for quad_i, quad_name in quads.items(): for tag in df[df.Quad == quad_i].Tags: tracks[quad_name] |= data[tag] print(quad_name, len(tracks[quad_name])) venn.venn(tracks) plt.savefig('results/quads.png', bbox_inches='tight') plt.title('Tracks ') plt.show() # print('Total: {}'.format(total)) # tags, counts = get_top_tags(data) # export_stats(tags, counts)
def single_venn():
    """Show a Venn diagram of the module-level ``dois`` sets with a fixed palette."""
    palette = ['r', 'g', 'b', 'y', 'purple']
    venn(dois, cmap=palette)
    plt.show()
columns=rename_map).set_index(index_names)) # and return as series return df['id'] # In[8]: # probably stick with venn diagram here sns.set({'figure.figsize': (8, 10)}) sns.set_style('white') # get only sample lists from gene expression and mutation labels = ['gene expression', 'mutation'] label_map = {l: sample_lists[l] for l in labels} venn(label_map) plt.title('TCGA sample intersections, gene expression data', size=14) # ### Count overlap between gene expression, methylation, and mutation datasets # In[9]: sns.set_style('white') labels = ['gene expression', 'mutation', '27k methylation', '450k methylation'] samples = [sample_lists[l] for l in labels] upset_series = series_from_samples(samples, labels) upset_series[upset_series != 0].sort_values().head(20) # In[10]:
'Schmidt et al. 2016', 'Peebo et al. 2015', 'Li et al. 2014', 'Valgepea et al. 2013' ], rotation=90) # ax.set_xticks(rotation=75) ax.set_ylabel('total number of\nproteins quantified') plt.savefig('../../figures/intersections_venn_summed.pdf', bbox_inches='tight') # %% # Venn Diagram fig = plt.figure() datasets = { 'Schmidt et al. 2016': set(data[data['dataset'] == 'schmidt_2016']['gene_name'].unique()), 'Peebo et al. 2015': set(data[data['dataset'] == 'peebo_2015']['gene_name'].unique()), 'Li et al. 2014': set(data[data['dataset'] == 'li_2014']['gene_name'].unique()), 'Valgepea et al. 2013': set(data[data['dataset'] == 'valgepea_2013']['gene_name'].unique()) } venn(datasets, cmap=[ colors['light_blue'], colors['green'], colors['purple'], colors['red'] ]) plt.savefig('../../figures/figS3_intersections_venn.pdf', bbox_inches='tight') # %%
def look_for_diagnostic_sqeuences(self):
    """
    Search the ITS2 count table for sequences that are diagnostic of a host group.

    Step 1: for every group, collect the sequences that are found (non-zero
    count) in every sample of that group, and plot the collections as a Venn
    diagram.

    Step 2: narrow this down to sequences found in all samples of one group
    and in no sample of any other group, and print the result.

    Result: In the end there are very few sequences that are found in common
    with all samples of a host group. This approach will not work. Single
    sequences cannot be diagnostic.
    """
    counts = self.counts_df_with_host

    def samples_containing(seq):
        # Index labels (samples) in which the sequence has a non-zero count.
        column = counts[seq]
        return set(column[column != 0].index.values)

    # Step 1: sequences present in all samples of a group.
    host_group_to_seq_dd = defaultdict(set)
    for svd_group, sample_list in self.group_to_sample_list_dict.items():
        sequences = list(counts)
        tot = len(sequences)
        group_samples = set(sample_list)
        for i, seq in enumerate(sequences):
            print(f"{svd_group}:{i}/{tot}")
            if group_samples <= samples_containing(seq):
                host_group_to_seq_dd[svd_group].add(seq)

    venn_obj = venn(host_group_to_seq_dd)
    venn_obj.set_title("Venn of sequnces found in all\nsamples of a given host group")
    plt.savefig('/home/humebc/projects/tara/tara_full_dataset_processing/host_diagnostic_ITS2/venn_plot.png' )

    # Step 2: a Venn diagram is not really right for what we want to show,
    # so search directly for sequences that are in all samples of one group
    # and in none of the samples of the other groups.
    host_group_to_seq_dd = defaultdict(set)
    for svd_group, sample_list in self.group_to_sample_list_dict.items():
        group_samples = set(sample_list)
        other_sample_sets = [
            set(other_list)
            for other_group, other_list in self.group_to_sample_list_dict.items()
            if other_group != svd_group
        ]
        for seq in list(counts):
            present = samples_containing(seq)
            if not group_samples <= present:
                continue
            if any(present & other for other in other_sample_sets):
                continue
            host_group_to_seq_dd[svd_group].add(seq)

    print("Sequences that are unique diagnostic of the host group:")
    print(host_group_to_seq_dd)
cfg.manifest_url, header=7, # skip first 6 lines index_col=0) manifest_df.to_csv(cfg.methylation_manifest) manifest_df.iloc[:5, :5] # In[5]: # look at overlap of probes in manifest and probes in TCGA dataset # all probes in TCGA should be in manifest, but we just want to make sure manifest_probes = manifest_df.index tcga_probes = tcga_methylation_df.index sns.set_style('white') venn({'manifest': set(manifest_probes), 'tcga': set(tcga_probes)}) plt.title('Probe overlap between Illumina manifest and TCGA dataset') # In[6]: # get probe type (type I or type II) for TCGA probes from manifest tcga_methylation_df = (tcga_methylation_df.merge( manifest_df[['Infinium_Design_Type', 'CHR']], left_index=True, right_index=True).astype({ 'CHR': 'str' }).rename( columns={ 'Infinium_Design_Type': 'probe_type', 'CHR': 'chromosome' }))
'''
Visualize data with multiple characteristics as a Venn diagram.

An example would be multiple-choice survey answers.
'''
import matplotlib.pyplot as plt
from venn import venn

# One set per label: the multiples of 1, 2, 3, 4 and 5 within 0..16.
numbers = {
    label: set(range(0, 17, step))
    for label, step in zip("ABCDE", (1, 2, 3, 4, 5))
}

venn(numbers, fmt="{percentage:.1f}%", fontsize=8, legend_loc="upper right")
plt.show()