def plot_intersection(ins_dict, save_fig=False):
    """
    Visualize an upset plot displaying the number of unique subjects found
    simultaneously in each combination of instruments.

    Parameters
    ----------
    ins_dict: dictionary
    save_fig: bool
    """
    ins_names = list(ins_dict.keys())
    list_comb = sum([
        list(map(list, combinations(ins_names, i + 1)))
        for i in range(len(ins_names))
    ], [])

    list_uniquesubj = []
    for lc in list_comb:
        list_uniquesubj.append([set(ins_dict[n].index) for n in lc])

    int_counts = list(map(_count_intersection, list_uniquesubj))
    inter_plot = from_memberships(list_comb, data=int_counts)
    plot(inter_plot, show_counts='%d', element_size=50,
         orientation='horizontal')
    if save_fig:
        plt.savefig(os.path.join(ut.out_folder, 'intersection_plot'),
                    format='pdf')
    else:
        plt.show()
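# The `_count_intersection` helper used above is not part of this snippet.
# A minimal sketch, assuming it simply returns the number of elements common
# to every set it receives:
def _count_intersection(list_of_sets):
    """Return the size of the intersection of all sets in `list_of_sets`."""
    if not list_of_sets:
        return 0
    return len(set.intersection(*list_of_sets))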
def plot_species_intersections(self, color, ignore_counts=0,
                               orientation='horizontal'):
    memberships = []
    data = []
    species_groups, _ = self.orthogroups_sets()
    for k in species_groups:
        memberships.append(k)
        data.append(len(set(species_groups[k])))
    structured_data = from_memberships(memberships, data=data)
    species_dict = {'P8084_finalAssembly': 'P.betacei',
                    'P_cactorum_10300': 'P.cactorum',
                    'P_infestans_RefSeq': 'P. infestans',
                    'P_palmivora_LILI_trCDS': 'P.palmivora',
                    'P_parasitica_INRA310': 'P.parasitica',
                    'P_ramorum_Pr102': 'P.ramorum',
                    'P_sojae_V3': 'P.sojae'}
    new_names = [species_dict[old_name]
                 for old_name in structured_data.index.names]
    structured_data.index.names = new_names
    structured_data = structured_data[structured_data > ignore_counts].copy()
    p = plot(structured_data, orientation=orientation, show_counts=True,
             facecolor=color, element_size=40)
    return p
def prepare_intersection_data(data: CardLiveData, type_value: str) -> upsetplot.UpSet:
    """
    Prepare the CardLiveData to generate intersection plots, specifically
    convert it into an UpSet object containing all intersections and
    cardinalities.

    :param data: a CardLiveData object from which the rgi_parser is called
    :param type_value: the category in RGI to plot set memberships for
    :return: an upsetplot.UpSet object containing the intersections and
        category memberships for creating a plotly-based UpSet plot
    """
    totals_df = data.rgi_parser.get_column_values(data_type=type_value,
                                                  values_name='categories',
                                                  drop_duplicates=True)
    totals_df = totals_df.dropna()

    category_sets = totals_df.reset_index().groupby('filename')\
        .agg(lambda x: tuple(x)).applymap(list)
    category_sets = category_sets['categories']\
        .apply(lambda x: sorted(x)).sort_values().apply(tuple)
    category_sets = category_sets.value_counts()

    # convert to upset data
    upset_data = upsetplot.from_memberships(category_sets.index,
                                            category_sets.values)
    upset_data = upsetplot.UpSet(upset_data, sort_by='cardinality')
    return upset_data
def create_plot(gnps_task, metadata_column, metadata_terms, intensity_threshold):
    data_df = _get_task_df(gnps_task)

    metadata_terms = set(metadata_terms)

    INTENSITY_THRESHOLD = float(intensity_threshold)
    data_df = data_df[data_df["featurearea"] > INTENSITY_THRESHOLD]

    membership = []
    grouped_df = data_df.groupby("featureid")
    for group_df in grouped_df:
        try:
            groups = set(group_df[1][metadata_column])
            groups = list(groups & metadata_terms)
            membership.append(groups)
        except:
            print("ERROR")
            raise

    upset_data_df = from_memberships(membership)
    plotting_object = plot(upset_data_df,
                           subset_size="count",
                           sort_by="cardinality",
                           orientation="horizontal",
                           show_counts=True)

    uuid_save = str(uuid.uuid4())
    pyplot.savefig("./output/{}.svg".format(uuid_save))

    return [html.Img(src="/plot/{}".format(uuid_save))]
def getLevels(R, L, k):
    n = 1
    while True:
        tempR = []
        tempL = []
        upsetD = []
        for i in range(len(R[n])):
            for j in range(len(R[1])):
                if checkExists(R[1][j], R[n][i]) == False:  # Fix this to work with lists
                    intersectionTID = intersection(L[n][i], L[1][j])
                    if len(intersectionTID) >= k:
                        if n == 1:
                            tempR.append([R[n][i], R[1][j]])
                        else:
                            tempR.append(R[n][i] + [R[1][j]])
                        tempL.append(intersectionTID)
        if len(tempR) == 0:
            return
        R.append(tempR)
        L.append(tempL)
        R[n + 1], L[n + 1] = checkDuplicates(R[n + 1], L[n + 1])
        for i in range(len(L[n + 1])):
            upsetD.append(len(L[n + 1][i]))
        print("\nLevel ", n + 1, "--> Number of itemsets = ", len(R[n + 1]))
        # print(R[n+1])
        print("\n")
        upset = from_memberships(R[n + 1], data=upsetD)
        plot(upset)
        pyplot.show()
        n += 1
def test_from_memberships_with_data(data, ndim):
    memberships = [[],
                   ['hello'],
                   ['world'],
                   ['hello', 'world']]
    out = from_memberships(memberships, data=data)
    assert out is not data  # make sure frame is copied
    if hasattr(data, 'loc') and np.asarray(data).dtype.kind in 'ifb':
        # but not deepcopied when possible
        if LooseVersion(pd.__version__) > LooseVersion('0.35'):
            assert out.values.base is np.asarray(data).base
    if ndim == 1:
        assert isinstance(out, pd.Series)
    else:
        assert isinstance(out, pd.DataFrame)
    assert_frame_equal(pd.DataFrame(out).reset_index(drop=True),
                       pd.DataFrame(data).reset_index(drop=True))

    no_data = from_memberships(memberships=memberships)
    assert_index_equal(out.index, no_data.index)

    with pytest.raises(ValueError, match='length'):
        from_memberships(memberships[:-1], data=data)
def run(self, output):
    dcount = 0
    dbstr = " ".join(self.dbs)
    if os.path.exists(output + ".raw.tab"):
        print("Starting from previous task")
        with open(output + ".raw.tab", 'r') as input:
            for l in input:
                s = l.rstrip().split()
                self.counter[s[0]] = int(s[1])
    else:
        with sp.Popen(f'{self.meryl} print venn {dbstr}', shell=True,
                      stdout=sp.PIPE, bufsize=1,
                      universal_newlines=True) as sf:
            for h in sf.stdout:
                s = h.split()
                self.counter[s[1]] += 1
                dcount += 1
                if dcount % 10000000 == 0:
                    print(f'Progress: {dcount}')

        # print out raw data
        with open(output + ".raw.tab", 'w') as out:
            for w in sorted(self.counter, key=self.counter.get, reverse=True):
                out.write(f'{w}\t{self.counter[w]}\n')
        print("Created raw output file")

    # Prepare membership df
    array = list()
    data = list()
    for k, v in self.counter.items():
        tlist = list()
        for i, e in enumerate(self.dbs):
            if int(k) & (1 << int(i)):
                # The bit is set, add the file name
                tlist.append(basename(e).split('.')[0])
        array.append(tlist)
        data.append(v)

    # Plot things out
    dataset = upsetplot.from_memberships(array, data=data)
    print(dataset)
    upset = upsetplot.UpSet(dataset, sort_by='cardinality',
                            show_percentages=True)
    upset.plot()
    plt.savefig(output + ".pdf")
def plot_upset(ax):
    data = np.array([795., 27., 182., 7.])
    # plt.rcParams.update({'font.size': fontsize})
    example = from_memberships(
        [[' TP53 WT', ' MDM4 WT'],
         [' TP53 WT', ' MDM4 amp.'],
         [' TP53 mutant', ' MDM4 WT'],
         [' TP53 mutant', ' MDM4 amp.']],
        data=data)
    intersections, matrix, shading, totals = plot(example, with_lines=True,
                                                  show_counts=True,
                                                  element_size=50)
    plt.ylabel('Number of patients')
def upset(index):
    selection = clusters[np.where(sets[:, index] > 0)]
    items, counts = np.unique(selection, return_counts=True)
    subset = from_memberships(items, counts)
    sub_classes = np.unique([item for sublist in items for item in sublist])
    print("Root Class: ", unique_clusters[index])
    print("# Papers: ", len(selection))
    print("# Labels: ", len(sub_classes))
    print("# Classes: ", len(items))
    if len(items) > 40 or len(sub_classes) > 20:
        print("Too many items")
    else:
        plot(subset)
def test_from_contents_vs_memberships(data, typ, id_column):
    contents = OrderedDict([('cat1', typ(['aa', 'bb', 'cc'])),
                            ('cat2', typ(['cc', 'dd'])),
                            ('cat3', typ(['ee']))])
    # Note that ff is not present in contents
    data_df = pd.DataFrame(data, index=['aa', 'bb', 'cc', 'dd', 'ee', 'ff'])
    baseline = from_contents(contents, data=data_df, id_column=id_column)
    # compare from_contents to from_memberships
    expected = from_memberships(memberships=[{'cat1'},
                                             {'cat1'},
                                             {'cat1', 'cat2'},
                                             {'cat2'},
                                             {'cat3'},
                                             []],
                                data=data_df)
    assert_series_equal(baseline[id_column].reset_index(drop=True),
                        pd.Series(['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
                                  name=id_column))
    assert_frame_equal(baseline.drop([id_column], axis=1), expected)
def run():
    fig = plt.figure(figsize=(8, 8))
    data = np.array([795., 27., 182., 7.])
    plt.rcParams.update({'font.size': 14})
    example = from_memberships(
        [[' TP53 WT', ' MDM4 WT'],
         [' TP53 WT', ' MDM4 amp.'],
         [' TP53 mutant', ' MDM4 WT'],
         [' TP53 mutant', ' MDM4 amp.']],
        data=data)
    intersections, matrix, shading, totals = plot(example, fig=fig,
                                                  with_lines=True,
                                                  show_counts=True,
                                                  element_size=50)
    plt.ylabel('Number of patients',
               fontdict=dict(weight='bold', fontsize=16))
    filename = join(saving_dir, 'upset_MDM4_TP53.png')
    plt.savefig(filename)
def plot_clades_intersections(self, color):
    memberships = []
    data = []
    clades_groups, _ = self.orthogroups_sets_clades()
    for k in clades_groups:
        memberships.append(k)
        data.append(len(set(clades_groups[k])))
    structured_data = from_memberships(memberships, data=data)
    p = plot(structured_data, orientation='vertical', show_counts=True,
             facecolor=color, element_size=100)
    return p
def load_venn_from_fredy(venn_file_name: str, base_name: str = 'geno'):
    """
    From a venn file generated by fredy such as:
        #Venn:
        0000 111875
        1000 346
        0100 357
        1100 272
        0010 398
        1010 0
        0110 31
        1110 107
        0001 362
        1001 1
        0101 9
        1101 89
        0011 199
        1011 76
        0111 54
        1111 2057
    generate data usable by upsetplot:
        venn_data = from_memberships(
            [[],
             ['geno1'],
             ['geno2'],
             ['geno1', 'geno2'],
             ['geno3'],
             ['geno1', 'geno3'],
             ...
             ],
            data=[111875, 346, 357, 272, 398, 0, ...]
        )
    """
    with open(venn_file_name) as venn_file:
        members = []     # arrays of memberships; `abundances` holds the corresponding counts
        abundances = []
        for line in venn_file.readlines():
            if line[0] == '#':
                continue
            sline = line.strip().split()
            members.append(membership_line_to_array(sline[0], base_name))
            abundances.append(int(sline[1]))
    return from_memberships(members, data=abundances)
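# `membership_line_to_array` is defined elsewhere in the original module. A
# minimal sketch, assuming each '1' at position i of the bit string marks the
# presence of the set named `base_name` + str(i + 1), as in the docstring example:
def membership_line_to_array(bits: str, base_name: str = 'geno'):
    """Convert a venn bit string such as '1100' into ['geno1', 'geno2']."""
    return [f'{base_name}{i + 1}' for i, bit in enumerate(bits) if bit == '1']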
def plot_graph(res, path):
    """From upset_plot data, plot upset plots and store the corresponding data."""
    path_figures = f"{path}/figures"
    import os
    os.makedirs(path_figures, exist_ok=True)
    for typ_res, dic in res.items():
        liste_cats = sorted(dic.keys())
        data_out = []
        for cat in liste_cats:
            data_out.append(dic[cat])
        example = from_memberships(liste_cats, data=data_out)
        plot(example)
        pyplot.savefig(f"{path_figures}/{typ_res}.png")
    print(f" figures stored in '{path_figures}/'")
    path_upset = f"{path}/data_upset.json"
    write_json_file(path_upset, [liste_cats, data_out])
    print(f" output file in upset plot format stored in '{path_upset}'")
def make_upsetplot(WD, name, data, title):
    """Function to make an UpSetPlot.
    Needs these three other functions: similarity_count(), get_clusters(), get_sub_clusters().

    ARGS:
    WD (str) -- the working directory to save the result.
    name (str) -- name of the file to save.
    data -- the dictionary containing the organisms as keys and the
        genes/reactions/others to treat for the UpSetPlot.
    title (str) -- title of the graph.
    """
    clusters = get_clusters(list(data.keys()))
    [clusters.insert(0, [key]) for key in data.keys()]
    count = []
    log = ""
    for c in clusters:
        others = list(data.keys())
        listInter = []
        for x in c:
            others.remove(x)
            listInter.append(set(data[x]))
        cluster_data, sim_count = similiraty_count(data, listInter, others)
        count.append(sim_count)
        for i in c:
            log += i + " "
        log += " (" + str(sim_count) + ") :\n"
        for i in cluster_data:
            log += utils.cobra_compatibility(str(i)) + "\n"
        log += "\n------\n\n"
    utils.write_file(WD, name + ".log", log)
    my_upsetplot = from_memberships(clusters, count)
    plot(my_upsetplot, show_counts='%d', totals_plot_elements=3)
    plt.suptitle(title)
    plt.savefig(WD + name + ".pdf")
    plt.show()
def generate_upset_plot(results, label):
    """Display the upset plot.

    :param results: dictionary in which the models' results are stored
    :param label: string, name of the label of interest
    """
    models_names = GetModelsNames(results)  # Retrieve the names of all the models in the study
    somme = {}
    # Generate every combination of models
    for p in itertools.chain(*(itertools.combinations(models_names, long)
                               for long in range(1, 4))):
        somme[p] = 0
    # For each combination, count how many instances belong to the set
    # described by that combination
    for comb in somme.keys():
        models_to_have = list(comb)
        for instance in results.keys():
            flag_ok = True
            for model in models_names:
                # If the current model is expected but did not predict the given label, reject
                if model in models_to_have and results[instance][model] != label:
                    flag_ok = False
                # If the current model is not expected but did predict the given label, reject
                if model not in models_to_have and results[instance][model] == label:
                    flag_ok = False
            if flag_ok == True:
                somme[comb] += 1
    c, d = ([], [])
    for comb in somme.keys():
        # Clean up the model names and build the arguments for from_memberships
        comb_net = []
        for cc in list(comb):
            comb_net.append(cc.split('/')[-1].replace('.txt', ''))
        c.append(comb_net)
        d.append(somme[comb])
    # Draw the upset plot
    diagram = from_memberships(c, data=d)
    plot(diagram)
    plt.show()
def printList(R, L, T, k):
    tempR = []
    tempL = []
    upsetR = []
    upsetD = []
    for i in range(len(R[0])):
        if len(L[0][i]) >= k:
            tempR.append(R[0][i])
            tempL.append(L[0][i])
            upsetR.append([R[0][i]])
            upsetD.append(len(L[0][i]))
    R.append(tempR)
    L.append(tempL)
    upset = from_memberships(upsetR, data=upsetD)
    print("For support threshold k =", k, "\n")
    print("Level 1 --> Number of itemsets =", len(R[1]))
    # print(R[1])
    print("\n")
    plot(upset)
    pyplot.show()
    getLevels(R, L, k)
peaks_i = peak_df_list[i]
peaks_i['sorted_samples'] = ''
rows_now = peaks_i.shape[0]
for j in list(range(rows_now)):
    sample_list = peaks_i.at[j, 'sample_reps']
    sample_array = np.unique(sample_list.split(','))
    sample_sorted = sorted(sample_array)
    sample_str = ",".join(sample_sorted)
    peaks_i.at[j, 'sorted_samples'] = sample_str
summary_peaks_i = peaks_i[['sorted_samples', 'count']].groupby(
    ['sorted_samples'], as_index=False).sum()
summary_peak_df_list.append(summary_peaks_i)

# construct data in appropriate format for upsetplot, and plot
for i in list(range(len(summary_peak_df_list))):
    df_i = summary_peak_df_list[i]
    # Get group name
    basename = os.path.basename(peak_file_list[i])
    group_name = basename.rsplit(".", -1)[0]
    file_name = group_name + ".consensus_peaks.pdf"
    categories = df_i.shape[0]
    cat_list = []
    for j in list(range(categories)):
        summary_sample = df_i.at[j, 'sorted_samples'].split(',')
        cat_list.append(summary_sample)
    # Plot
    peak_counts = upsetplot.from_memberships(cat_list, data=df_i['count'])
    upsetplot.plot(peak_counts)
    plt.savefig(os.path.join(args.outpath, file_name))
    plt.show()
def FindERG(data, depth=2, sort_num=20, verbose=False):
    '''
    Find endogenous reference genes.

    Parameters
    ----------
    data : pandas.DataFrame
        DataFrame of data points with each entry in the form
        ['gene_id', 'sample1', ...]
    depth : int
        Accuracy of the endogenous reference gene search; must be larger than 2.
        The larger the number, the fewer genes are screened out and the higher
        the accuracy.
    sort_num : int
        The size of the endogenous reference gene filter.
        When the sample is large, it is recommended to increase the value.
    verbose : bool
        Make the function noisy, writing times and results.

    Returns
    -------
    result : list
        a list of endogenous reference genes
    '''
    lp = []
    if verbose:
        import time, datetime
        start = time.time()
    if depth == 1:
        print('the depth must be larger than 2')
        return
    if len(data.columns) <= 2:
        print('the number of samples must be larger than 2')
        return
    if depth > (len(data.columns) - 1):
        print('depth larger than samples')
        return
    count = 0
    result = []  # result
    bucket_size = 1000
    for i in itertools.combinations(data.columns[0:depth], 2):
        count = count + 1
        test = data.replace(0, np.nan).dropna()
        last_std = pd.DataFrame()
        for k in range(0, len(data), bucket_size):
            test1 = test[i[0]].iloc[k:k + bucket_size]
            test2 = test[i[1]].iloc[k:k + bucket_size]
            data_len = len(test1.values)
            table1 = np.array(test1.values.tolist() * data_len).reshape(
                data_len, data_len)
            table2 = pd.DataFrame(table1.T / table1)
            table2.index = test1.index
            table4 = np.array(test2.values.tolist() * data_len).reshape(
                data_len, data_len)
            table5 = pd.DataFrame(table4.T / table4)
            table5.index = test1.index
            table6 = (table2 - table5).std()
            table6.index = test1.index
            l_std = table6.sort_values()[0:sort_num]
            if k == 0:
                last_std = l_std
            else:
                last_std = pd.concat([last_std, l_std])
                last_std = last_std.sort_values()[0:sort_num]
        testlist = list(last_std.index)
        # print(testlist)
        lp.append(testlist)
        # print(lllll)
        if count == 1:
            result = testlist
        if count > 1:
            result = list(set(testlist).intersection(set(result)))
    # Venn
    example = from_memberships(lp, data=range(len(lp)))
    if verbose:
        end = time.time()
        print("calculate time:%.2fs" % (end - start))
        print(result)
    if depth > 2:
        plot(example)
    return result
def intersect(data, upset_plot=False):
    """A function that returns all possible distinct intersections and
    generates an upset plot.

    Parameters
    ----------
    data = pandas dataframe
    upset_plot = boolean

    Returns
    -------
    df_final = dataframe with list of matches for each comparison and counts
    upset = data formatted to generate upset plots
    """
    # convert data column names to strings
    col_names = []
    for i in data.columns:
        col_names.append(str(i))
    data.columns = col_names

    # total groups
    n = len(col_names)

    # generate all possible combinations for intersection analysis
    comb_list = []
    for i in range(2, n + 1):
        comb_list.append(list(combinations(col_names, i)))

    # find all unique elements and drop na
    unique_elem = []
    tot_elements = []
    for i in col_names:
        unique_elem.append(set(data[i].dropna().to_list()))
    for i in range(len(unique_elem)):
        tot_elements.append([col_names[i], len(unique_elem[i])])
    print("Total unique number of items", tot_elements)

    # make dictionary for unique elements
    dict_ = {}
    for i in range(len(col_names)):
        dict_.update({col_names[i]: unique_elem[i]})

    # intersect data, find distinct sets, drop na
    list_intersect = []
    for i in comb_list:
        for j in i:
            if len(j) == 2:
                func_1 = "set(data['{x}'].dropna().to_list()).intersection(data['{y}'].dropna().to_list())".format(x=j[0], y=j[1])
                inter = eval(func_1)
                dict_adj = []
                for i, k in dict_.items():
                    if i != j[0] and i != j[1]:
                        dict_adj.append(k)
                for i in dict_adj:
                    unique = inter - i
                    inter = unique
                list_intersect.append([j, list(inter), len(list(inter))])
            else:
                func_2 = "set(data['{x}'].dropna().to_list()).intersection(data['{y}'].dropna().to_list())".format(x=j[0], y=j[1])
                cond = "i != j[0] and i != j[1]"
                for _ in range(2, len(j)):
                    decor_1 = ".intersection(data['{z}'].dropna().to_list())".format(z=j[_])
                    decor_2 = " and i != j[{x}]".format(x=_)
                    func_2 = func_2 + decor_1
                    cond = cond + decor_2
                inter = eval(func_2)
                dict_adj = []
                for i, k in dict_.items():
                    if eval(cond):
                        dict_adj.append(k)
                for i in dict_adj:
                    unique = inter - i
                    inter = unique
                list_intersect.append([j, list(inter), len(list(inter))])

    # obtain elements found only in individual datasets
    for j in range(len(col_names)):
        for i in list_intersect:
            if col_names[j] in set(i[0]):
                unique_elem[j] = unique_elem[j] - set(i[1])
        unique_elem[j] = list(unique_elem[j])

    # create dataframe for elements found only in individual datasets
    df_1 = pd.DataFrame(col_names)
    df_1[1] = unique_elem
    df_1[2] = [len(i) for i in unique_elem]

    # combine intersect data and unique elements found within individual sets
    df_2 = pd.DataFrame(list_intersect)
    df_3 = pd.concat([df_1, df_2])
    df_3.columns = ["Intersection", "Match", "Counts"]
    df_3 = df_3.reset_index(drop=True)

    # generate data structure for upset plot
    upset = df_3.drop("Match", axis=1)
    lst_1 = df_3["Intersection"].to_list()
    lst_2 = df_3["Intersection"].to_list()
    for i in range(len(col_names)):
        lst_1[i] = [lst_2[i]]
    upset = from_memberships(lst_1, data=upset["Counts"])

    # make upset plot
    if upset_plot == True:
        plot(upset)

    return df_3, upset
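# A minimal sketch (not part of the original source) of the same exclusive-
# intersection idea expressed with plain set algebra instead of eval-built
# expressions: for each combination, keep only elements shared by every member
# column and absent from every non-member column.
from itertools import combinations

def exclusive_intersections(sets_by_name):
    """sets_by_name: dict mapping column name -> set of elements."""
    names = list(sets_by_name)
    result = {}
    for r in range(2, len(names) + 1):
        for combo in combinations(names, r):
            # elements present in every member of the combination
            inter = set.intersection(*(sets_by_name[c] for c in combo))
            # remove elements that also appear in any non-member column
            for other in names:
                if other not in combo:
                    inter -= sets_by_name[other]
            result[combo] = inter
    return result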
# print('Filtering reads for those mapping to small transcript intersections...')
# filt = set([
#     frozenset(memb_set)
#     for memb_set, count in memb_set_count.items()
#     if count < 10
# ])
# plot_read_to_transcripts = {
#     read: transcripts
#     for read, transcripts in read_to_transcripts.items()
#     if frozenset(transcripts) not in filt
# }
# print('done.')
# else:
plot_read_to_transcripts = read_to_transcripts

memb = from_memberships(plot_read_to_transcripts.values())
upsetplot.plot(memb, subset_size='count', show_counts=True,
               sort_by='cardinality')
out_f = '{}.upset.png'.format(out_pref)
print('Plotting Upset to ', out_f)
plt.savefig(out_f, format='png')

n_multimap_viral_endo = sum([
    1
    for transcripts in read_to_transcripts.values()
    if viral_gene in set(transcripts) and len(transcripts) > 1
])
n_mapped_endo = sum([
    1
    for transcripts in read_to_transcripts.values()
    if len(set(transcripts) & set(endo_trans)) > 1
])
def test_from_memberships_no_data(typ):
    with pytest.raises(ValueError, match='at least one set'):
        from_memberships([])
    with pytest.raises(ValueError, match='at least one set'):
        from_memberships([[], []])
    with pytest.raises(ValueError, match='strings'):
        from_memberships([[1]])
    with pytest.raises(ValueError, match='strings'):
        from_memberships([[1, 'str']])
    with pytest.raises(TypeError):
        from_memberships([1])

    out = from_memberships([typ([]),
                            typ(['hello']),
                            typ(['world']),
                            typ(['hello', 'world']),
                            ])
    exp = pd.DataFrame([[False, False, 1],
                        [True, False, 1],
                        [False, True, 1],
                        [True, True, 1]],
                       columns=['hello', 'world', 'ones']
                       ).set_index(['hello', 'world'])['ones']
    assert isinstance(exp.index, pd.MultiIndex)
    assert_series_equal(exp, out)

    # test sorting by name
    out = from_memberships([typ(['hello']), typ(['world'])])
    exp = pd.DataFrame([[True, False, 1],
                        [False, True, 1]],
                       columns=['hello', 'world', 'ones']
                       ).set_index(['hello', 'world'])['ones']
    assert_series_equal(exp, out)
    out = from_memberships([typ(['world']), typ(['hello'])])
    exp = pd.DataFrame([[False, True, 1],
                        [True, False, 1]],
                       columns=['hello', 'world', 'ones']
                       ).set_index(['hello', 'world'])['ones']
    assert_series_equal(exp, out)