def load_data_lda(relative_sizes, sample_size, iterations, namespace, num_topics):
    """Load pickled LDA perplexity results for every relative size / iteration.

    Reads the per-group evaluation files ``adj_lda_<num_topics>_{a,b,all}.p``
    for each (relative size, iteration) sample.  Each file stores a
    log2-perplexity exponent, converted here to a perplexity via
    ``2 ** -exponent`` (``np.exp2``).

    Returns:
        (data_a, data_b, data_all): three dicts mapping
        relative size -> list of perplexities (one entry per iteration).
    """
    base_path = build_basepath("samples", namespace, sample_size)
    data_a = {}    # rel_size -> [perplexities] for group a
    data_b = {}    # rel_size -> [perplexities] for group b
    data_all = {}  # rel_size -> [perplexities] for both groups combined
    # One (file suffix, target dict) pair per evaluation file, replacing
    # three copy-pasted load blocks in the original.
    targets = (("a", data_a), ("b", data_b), ("all", data_all))
    for rel_size, i, adj_path in rel_size_iteration_loop(
            relative_sizes, iterations, base_path):
        for suffix, data in targets:
            series = data.setdefault(rel_size, [])
            # True LDA Eval (no adj): pickle holds the log2-perplexity exponent.
            file_path = (adj_path + "eval/adj_lda_" + str(num_topics)
                         + "_" + suffix + ".p")
            with open(file_path, "rb") as f:
                perpl_exponent = pickle.load(f)
            series.append(np.exp2(-perpl_exponent))
    return data_a, data_b, data_all
def load_inference_data(namespace, sample_size, num_topics,
                        rel_sizes=None, iters=None):
    """Load topic-group inference results and aggregate them for plotting.

    Fix/generalization: the original read the module-level globals
    ``relative_sizes`` and ``iterations`` as free variables (a hidden
    dependency).  They can now be passed explicitly via ``rel_sizes`` /
    ``iters``; when omitted, the legacy global-based behavior is kept, so
    existing callers are unaffected.

    Returns:
        x_values: sorted relative sizes (the x axis).
        data_1: (red_y, purple_y, blue_y) cumulative proportions per x.
        data_2: (a_prop, shared_prop, b_prop) dicts keyed by x.
    """
    if rel_sizes is None:
        rel_sizes = relative_sizes  # legacy fallback to module global
    if iters is None:
        iters = iterations  # legacy fallback to module global
    base_path = build_basepath("samples", namespace, sample_size)
    inference_data = load_topic_group_data(base_path, rel_sizes, iters,
                                           num_topics)
    # X-values are the relative sizes present in the loaded data
    x_values = sorted(inference_data.keys())
    # Plot 1 data: cumulative stacked line values
    red_y = []
    purple_y = []
    blue_y = []
    # Plot 2 data: per-x stats for each group (a / b / shared)
    a_prop = {}
    b_prop = {}
    shared_prop = {}
    for x in x_values:
        deviation_stats = deviation_stats_inference(inference_data[x])
        print(deviation_stats)
        # Plot 1 - cumulative so lines stack: red = a, purple = a + shared,
        # blue = a + b + shared (deviation_stats order is (a, b, shared),
        # first element of each entry is the proportion used here).
        blue_y.append(deviation_stats[0][0] + deviation_stats[1][0]
                      + deviation_stats[2][0])
        purple_y.append(deviation_stats[0][0] + deviation_stats[2][0])
        red_y.append(deviation_stats[0][0])
        # Plot 2 - keep the full per-group stats entries
        a_prop[x], b_prop[x], shared_prop[x] = deviation_stats
    data_1 = (red_y, purple_y, blue_y)
    data_2 = (a_prop, shared_prop, b_prop)
    return x_values, data_1, data_2
def plot_corpus_dictionary(ds_name, namespace):
    """Plot the vocabulary split (a-exclusive / shared / b-exclusive) as a
    single normalized horizontal stacked bar, saved as PNG and PDF."""
    plt.rc('text', usetex=True)
    plt.style.use('seaborn-paper')
    with open("../../dumps/"+ds_name+".dump", "r") as f_dump:
        print("Loading Dump: ../../dumps/"+ds_name+".dump")
        data = json.load(f_dump)
    global_dict = data["global_dict"]
    out_path = build_basepath("plots", namespace, subfolder=ds_name)
    # Count unique words per group (helper also dumps them to out_path)
    a_uniq, b_uniq, shared = extract_unique_words(global_dict, out_path)
    # Normalize the three counts to proportions of the whole vocabulary
    total = a_uniq + b_uniq + shared
    a_uniq, b_uniq, shared = a_uniq / total, b_uniq / total, shared / total
    # Single-row stacked bar via overdrawing: full-width blue first, then
    # purple up to a+shared, then red up to a -> three visible segments.
    fig, ax = plt.subplots(figsize=(5.7, 0.7))
    ax.get_yaxis().set_visible(False)
    ax.set_xticks([tick / 10 for tick in range(11)])
    ax.barh([0], [1], align='edge', color='blue', height=1,
            label=namespace["tag_coll_b"] + " exclusive")
    ax.barh([0], [a_uniq + shared], align='edge', color='purple', height=1,
            label="shared")
    ax.barh([0], [a_uniq], align='edge', color='red', height=1,
            label=namespace["tag_coll_a"] + " exclusive")
    ax.set_xlabel('Proportion')
    # Reverse the three legend entries so they read red / purple / blue
    handles, labels = ax.get_legend_handles_labels()
    lgd = ax.legend(handles=handles[::-1], labels=labels[::-1],
                    bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=3,
                    mode="expand", borderaxespad=0.)
    # NOTE(review): "ditionary" looks like a typo for "dictionary", but the
    # file name is kept as-is so any downstream references stay valid.
    fig.savefig(out_path+"ditionary_dist.png", bbox_extra_artists=(lgd,),
                bbox_inches='tight')
    fig.savefig(out_path + "ditionary_dist.pdf", bbox_extra_artists=(lgd,),
                bbox_inches='tight', format="pdf")
def plot_triple_stack(relative_sizes, sample_size, iterations, namespace, num_topics):
    """Load LDA perplexity results for each topic count and render the
    stacked perplexity plots (adjusted per group and combined)."""
    # Folder that receives the generated plots
    plot_path = build_basepath("plots", namespace, sample_size,
                               appendix="-" + str(iterations) + "i")
    # Collect one result dict per topic count for groups a, b and combined
    data_a_triple, data_b_triple, data_all_triple = [], [], []
    for topics in num_topics:
        part_a, part_b, part_all = load_data_lda(
            relative_sizes, sample_size, iterations, namespace, topics)
        data_a_triple.append(part_a)
        data_b_triple.append(part_b)
        data_all_triple.append(part_all)
    # Render perplexity plots
    plot_triple_stack_perpl_adj(data_a_triple, data_b_triple, data_all_triple,
                                namespace, plot_path, "lda_all")
    plot_triple_stack_perpl_all(data_all_triple, namespace, plot_path,
                                "lda_all")
def plot_vocab_mismatch(relative_sizes, sample_size, iterations, namespace,
                        format="png"):
    """Plot mean +/- std of per-document vocabulary mismatch per rel. size.

    Loads the pickled per-document mismatch proportions for both groups,
    prints the per-x stats to stdout, and saves one combined mean/std band
    plot as both PNG and PDF.

    NOTE(review): ``format`` is kept for interface compatibility but unused —
    the figure is always written as .png and .pdf.
    """
    plt.style.use('seaborn-paper')
    # Init plot folders
    plot_path = build_basepath("plots", namespace, sample_size,
                               appendix="-" + str(iterations) + "i")
    # Load data: rel_size -> list of per-document mismatch proportions
    base_path = build_basepath("samples", namespace, sample_size)
    data_a = {}
    data_b = {}
    for rel_size, i, adj_path in rel_size_iteration_loop(relative_sizes,
                                                         iterations,
                                                         base_path):
        # One pass per group file instead of two copy-pasted blocks
        for suffix, data in (("a", data_a), ("b", data_b)):
            with open(adj_path + "eval/mismatch_" + suffix + ".p",
                      "rb") as f:
                prob_mismatch_per_doc = pickle.load(f)
            data.setdefault(rel_size, []).extend(prob_mismatch_per_doc)
    # Set x values
    x_values = sorted(data_a.keys())
    # ----------------
    # Plot Means + Std
    # ----------------
    means_a, std_a = zip(*[calculate_deviation_stats(data_a[x])
                           for x in x_values])
    means_b, std_b = zip(*[calculate_deviation_stats(data_b[x])
                           for x in x_values])

    def _print_stats(tag, means, stds):
        # Command line output: "<x> - <mean> - <std>" per relative size
        print(tag + " Vocab Mismatch")
        for x, mean, std in zip(x_values, means, stds):
            print(str(x) + " - " + str(mean) + " - " + str(std))

    _print_stats(namespace["tag_coll_a"], means_a, std_a)
    print("")  # blank separator line, as in the original output
    _print_stats(namespace["tag_coll_b"], means_b, std_b)

    plt.rc('text', usetex=True)
    # Plot both groups on one set of axes
    fig, ax = plt.subplots(figsize=(5.7, 3))

    def _plot_band(means, stds, color, tag):
        # Mean line plus dashed +/- std envelope with shaded fill
        upper = np.array(means) + stds
        lower = np.array(means) - stds
        ax.fill_between(x_values, upper, lower, alpha=0.3, facecolor=color)
        ax.plot(x_values, means, color=color, linewidth=2.0,
                label=tag + " mean")
        ax.plot(x_values, upper, '--', color=color, linewidth=0.5,
                label=tag + " std")
        ax.plot(x_values, lower, '--', color=color, linewidth=0.5)

    _plot_band(means_a, std_a, "red", namespace["tag_coll_a"])
    _plot_band(means_b, std_b, "blue", namespace["tag_coll_b"])

    lgd = ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=4,
                    mode="expand", borderaxespad=0.)
    ax.set_xlim(0.1, 0.9)
    ax.set_ylim(0, 1)
    # Set axis labels
    ax.set_xlabel("Percentage of '" + namespace["tag_coll_a"]
                  + "' documents in the training corpus")
    # Fixed typo in the user-facing label ("Missmatch" -> "Mismatch")
    ax.set_ylabel("Proportion of Vocabulary Mismatch in a Test Document")
    fig.savefig(plot_path + "vocab_miss_mean.png", bbox_extra_artists=(lgd,),
                bbox_inches='tight')
    fig.savefig(plot_path + "vocab_miss_mean.pdf", bbox_extra_artists=(lgd,),
                bbox_inches='tight', format="pdf")
def plot_topic_group_distribution_triple(relative_sizes, sample_size,
                                         iterations, namespace, num_topics):
    """Render two three-panel figures (one panel per topic count): the topic
    distribution plot and the topic/corpus ratio plot, saved as PNG + PDF.
    Also emits one LaTeX table per topic count while loading."""
    plt.style.use('seaborn-paper')
    plt.rc('text', usetex=True)
    plot_path = build_basepath("plots", namespace, sample_size,
                               appendix="-" + str(iterations) + "i")
    # Load inference results for every topic count
    data_1_all = []
    data_2_all = []
    x_values = []
    for topics in num_topics:
        x_values, data_1, data_2 = load_inference_data(namespace, sample_size,
                                                       topics)
        data_1_all.append(data_1)
        data_2_all.append(data_2)
        output_latex_table(x_values, data_2, topics, namespace)

    # ---- Plot 1: topic distribution, one panel per topic count ----
    # f, axarr = plt.subplots(3, figsize=(2.2, 7), sharex=True, sharey=True)
    f, axarr = plt.subplots(1, 3, figsize=(9, 2), sharey=True)
    for panel_ax, topics, panel_data in zip(axarr, num_topics, data_1_all):
        plot_topic_dist(x_values, panel_data, panel_ax)
        panel_ax.set_title(str(topics) + ' Topics')
        #panel_ax.set_xlim(0.1, 0.9)
        panel_ax.set_ylim(0, 1)
    # Legend lives on the middle panel; reorder the first three entries
    legend_ax = axarr[1]
    handles, labels = legend_ax.get_legend_handles_labels()
    handles = [handles[2], handles[1], handles[0]]
    labels = [labels[2], labels[1], labels[0]]
    lgd = legend_ax.legend(handles=handles, labels=labels,
                           bbox_to_anchor=(0., 1.13, 1., .102), loc=3,
                           ncol=3, mode="expand", borderaxespad=0.)
    # axarr[-1].set_xlabel("Perc. of '" + namespace["tag_coll_a"] + "' docs in training corpus")
    axarr[1].set_xlabel("Percentage of group's documents in the trainings corpus")
    axarr[0].set_ylabel("Proportion")
    f.savefig(plot_path + "lda_all_topic_dist_triple.png",
              bbox_extra_artists=(lgd,), bbox_inches='tight')
    # NOTE(review): duplicated "_triple" in the PDF name looks unintended,
    # but is preserved so existing references keep resolving.
    f.savefig(plot_path + "lda_all_topic_dist_triple_triple.pdf",
              bbox_extra_artists=(lgd,), bbox_inches='tight', format="pdf")

    # ---- Plot 2: topic/corpus ratio, one panel per topic count ----
    # f2, axarr2 = plt.subplots(1,3, figsize=(5, 7), sharex=True, sharey=True)
    f2, axarr2 = plt.subplots(1, 3, figsize=(9, 2), sharey=True)
    for panel_ax, topics, panel_data in zip(axarr2, num_topics, data_2_all):
        plot_topic_ratio(x_values, panel_data, panel_ax)
        panel_ax.set_title(str(topics) + ' Topics')
        panel_ax.set_xlim(0.1, 0.9)
        panel_ax.set_ylim(0, 1)
    lgd2 = axarr2[1].legend(bbox_to_anchor=(0., 1.13, 1., .102), loc=3,
                            ncol=2, mode="expand", borderaxespad=0.)
    #axarr2[-1].set_xlabel("Percentage of group's documents in the trainings corpus")
    axarr2[0].set_ylabel("Topic/Corpus Ratio")
    axarr2[1].set_xlabel("Percentage of group's documents in the trainings corpus")
    f2.savefig(plot_path + "lda_all_topic_ratio_triple.png",
               bbox_extra_artists=(lgd2,), bbox_inches='tight')
    f2.savefig(plot_path + "lda_all_topic_ratio_triple_triple.pdf",
               bbox_extra_artists=(lgd2,), bbox_inches='tight', format="pdf")
def plot_corpus_stats(namespace, format="png", *args, **kwargs):
    """Generate corpus statistic plots for every data set in the namespace.

    For each data set this writes a plain-text stat overview, word-frequency
    plots (global and per group), a combined document-length CDF, a
    percentual length-difference CDF and a difference-count plot; for
    non-wiki corpora it additionally renders source-distribution plots and a
    pairwise source matrix.  Extra ``*args`` / ``**kwargs`` are forwarded to
    ``load_dump_data``.

    NOTE(review): the parameter name ``format`` shadows the builtin; kept
    unchanged for interface compatibility.
    """
    plt.style.use('seaborn-paper')
    plt.rc('text', usetex=True)
    data_sets = namespace["data_sets"]
    for data_set in data_sets:
        # if data_set["path"].endswith("cleaned.tsv"):
        #     print("Skipped: "+data_set["path"])
        #     continue
        file_path = data_set["path"]
        # Plot folder is named after the file's basename without extension
        base_plotpath = build_basepath("plots", namespace,
                                       subfolder=".".join(file_path.strip().split("/")[-1].split(".")[:-1]))
        a_lens, b_lens, len_diffs, len_diffs_perc,\
        global_dict, uniq_words_a, uniq_words_b = load_dump_data(file_path=file_path,
                                                                 formatted=data_set["formatted"],
                                                                 index_a=data_set["index_a"],
                                                                 index_b=data_set["index_b"],
                                                                 *args, **kwargs)
        # Plain-text summary of the corpus statistics
        write_stat_overview(base_plotpath + "stats.txt", namespace, a_lens,
                            b_lens, len_diffs, len_diffs_perc, global_dict,
                            uniq_words_a, uniq_words_b)
        # Word Frequency Plots (global vocabulary, log-log rank/frequency)
        fig, ax = plt.subplots(figsize=(5.7,3))
        ax = plt.gca()
        ax.set_xlabel("Word Rank")
        file_name = "word_frequency_global"
        ax.set_xscale("log")
        ax.set_yscale("log")
        plot_word_frequency(ax, global_dict, key="freq_global", linewidth=2)
        fig.tight_layout()
        fig.savefig(base_plotpath + file_name + "." + format, format=format)
        # Per-group word frequencies (group a red, group b blue)
        fig, ax = plt.subplots(figsize=(5.7,3))
        ax = plt.gca()
        ax.set_xlabel("Word Rank")
        ax.set_xscale("log")
        ax.set_yscale("log")
        file_name = "word_frequency_" + namespace["tag_coll_a"] + "_" + namespace["tag_coll_b"]
        plot_word_frequency(ax, global_dict, key="freq_a", color="r", linewidth=2, label= namespace["tag_coll_a"] + " Words")
        plot_word_frequency(ax, global_dict, key="freq_b", color="b", linewidth=2, label= namespace["tag_coll_b"] + " Words")
        ax.legend()
        fig.tight_layout()
        fig.savefig(base_plotpath + file_name + "." + format, format=format)
        fig.savefig(base_plotpath + file_name + ".pdf", format="pdf")
        plt.clf()
        # Dead code the author disabled by wrapping it in a no-op string
        # literal; kept verbatim.
        ''' Disabled
        # Document Length Distribution
        plt.figure()
        file_name = namespace["tag_coll_a"] + "_distribution_max1000"
        title = namespace["tag_coll_a"].upper() + " length distribution"
        plot_histogram(a_lens, mode="a", bin_to=1000)
        plt.xlabel("Document Length")
        # plt.title(title)
        plt.savefig(base_plotpath+file_name+"."+format,format=format)

        plt.figure()
        file_name = namespace["tag_coll_b"] + "_distribution_max1000"
        title = namespace["tag_coll_b"].upper() + " length distribution"
        plot_histogram(b_lens, mode="b", bin_to=1000)
        plt.xlabel("Document Length")
        # plt.title(title)
        plt.savefig(base_plotpath + file_name + "." + format, format=format)

        # Length Difference Distribution
        plt.figure()
        file_name = "diff_distribution_sbs1000"
        title = namespace["tag_coll_a"].upper()+"/"+namespace["tag_coll_b"].upper()+" length difference distribution"
        plot_diff_histogram(len_diffs, display_mode="sbs", namespace=namespace, bin_from=-1000, bin_to=1000, bin_gran=100)
        plt.xlabel("Pairwise length difference")
        # plt.title(title)
        plt.savefig(base_plotpath + file_name + "." + format, format=format)

        # Document Length CDF
        plt.figure()
        file_name = namespace["tag_coll_a"] + "_cdf_log"
        title = namespace["tag_coll_a"].upper() + " length CDF"
        plot_cdf(a_lens, mode="a")
        # plt.title(title)
        plt.xlabel("Document Length")
        plt.xscale("log")
        plt.yscale("log")
        plt.savefig(base_plotpath+file_name+"."+format,format=format)

        plt.figure()
        file_name = namespace["tag_coll_b"] + "_cdf_log"
        title = namespace["tag_coll_b"].upper() + " length CDF"
        plot_cdf(b_lens, mode="b")
        plt.xlabel("Document Length")
        # plt.title(title)
        plt.xscale("log")
        plt.yscale("log")
        plt.savefig(base_plotpath+file_name+"."+format,format=format)

        # Length Difference CDF
        plt.figure()
        file_name = "diffs_cdf"
        title = namespace["tag_coll_a"].upper()+"/"+namespace["tag_coll_b"].upper()+" length difference CDF"
        plot_diff_cdf(len_diffs, namespace=namespace)
        # plt.title(title)
        plt.xlabel("Pairwise length difference")
        plt.xscale("log")
        plt.yscale("log")
        plt.savefig(base_plotpath+file_name+"."+format,format=format)

        plt.figure()
        file_name = namespace["tag_coll_a"] + "_cdf_log_cut"
        title = namespace["tag_coll_a"].upper() + " length CDF"
        plot_cdf(a_lens, mode="a", cutoff=[50])
        # plt.title(title)
        plt.xlabel("Document Length")
        plt.xscale("log")
        plt.yscale("log")
        plt.savefig(base_plotpath+file_name+"."+format,format=format)

        plt.figure()
        file_name = namespace["tag_coll_b"] + "_cdf_log_cut"
        title = namespace["tag_coll_b"].upper() + " length CDF"
        plot_cdf(b_lens, mode="b", cutoff=[50])
        # plt.title(title)
        plt.xlabel("Document Length")
        plt.xscale("log")
        plt.yscale("log")
        plt.savefig(base_plotpath+file_name+"."+format,format=format)
        '''
        # Combined document-length CDF for both groups (log-log, cutoff 50)
        plt.figure(figsize=(5.7,3))
        file_name = namespace["tag_coll_a"] + "_" + namespace["tag_coll_b"] + "_cdf_log_cut"
        plot_cdf(b_lens, mode="b", cutoff=[50],label=True)
        plot_cdf(a_lens, mode="a", label=True)
        plt.xscale("log")
        plt.yscale("log")
        plt.legend(loc="lower right")
        ax = plt.gca()
        ax.set_xlabel("Document Length")
        plt.tight_layout()
        plt.savefig(base_plotpath + file_name + "." + format, format=format)
        plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")
        plt.clf()
        # Percentual Length Difference CDF
        plt.figure(figsize=(5.7,3))
        file_name = "diffs_cdf_perc_total"
        title = namespace["tag_coll_a"].upper()+"/"+namespace["tag_coll_b"].upper()+" Percentual length difference CDF"
        plot_diff_cdf(len_diffs_perc, namespace=namespace, at_x=[1])
        # plt.title(title)
        plt.xlim([0, 3])
        plt.xlabel("Pairwise percentual length difference")
        plt.tight_layout()
        plt.savefig(base_plotpath+file_name+"."+format,format=format)
        plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")
        plt.clf()
        # Another disabled variant kept as a no-op string literal.
        '''
        plt.figure()
        file_name = "diffs_cdf_perc_total_threshold"
        title = namespace["tag_coll_a"].upper() + "/" + namespace[
            "tag_coll_b"].upper() + " Percentual length difference CDF"
        plot_diff_cdf(len_diffs_perc, namespace=namespace)
        # plt.title(title)
        plt.xlim([0, 1])
        plt.xlabel("Pairwise percentual length difference")
        plt.savefig(base_plotpath + file_name + "." + format, format=format)
        '''
        # Difference Frequency Distribution (cut to [-2, 2])
        plt.figure(figsize=(5.7,3))
        file_name = "diff_counts_cut-2"
        title = namespace["tag_coll_a"].upper()+"/"+namespace["tag_coll_b"].upper()+" overhead cases"
        plot_counts(len_diffs_perc, namespace, cutoff=[-2, 2])
        plt.xlim([-0.5, 2])
        # plt.title(title)
        plt.tight_layout()
        plt.savefig(base_plotpath+file_name+"."+format,format=format)
        plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")
        plt.close("all")
        # Source plots are only generated for non-wiki corpora
        if namespace["mode"] != "wiki":
            sources_a, sources_b, sources_pair = load_sources(file_path,namespace)
            # Source Frequency Distribution
            plt.figure(figsize=(5.7,3))
            file_name = "source_dist_"+namespace["tag_coll_a"]
            plot_sources_dist(sources_a,namespace)
            plt.savefig(base_plotpath + file_name + "." + format, format=format)
            plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")
            plt.clf()
            plt.figure(figsize=(5.7,3))
            file_name = "source_dist_" + namespace["tag_coll_b"]
            plot_sources_dist(sources_b, namespace)
            plt.savefig(base_plotpath + file_name + "." + format, format=format)
            plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")
            plt.clf()
            # Pairwise source matrix (square aspect, high-dpi PNG)
            plt.figure(figsize=(5.7,5))
            plt.axes().set_aspect('equal')
            file_name = "source_dist_pairs"
            plot_pair_matrix(sources_a, sources_b, sources_pair, namespace)
            plt.savefig(base_plotpath + file_name + "." + format, format=format, dpi=600)
            plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")