def pc_topics_heatmap(fig_name, dendo_settings, paper_range=None):
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  miner, graph, lda_model, vocab = get_graph_lda_data()
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conference_topics = {}
  for conference in mysql.get_conferences():
    if conference.id not in p_committees or conference.id not in p_conferences:
      continue
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    topics = np.array([0] * lda_model.n_topics)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if (paper_range is not None) and (int(year) not in paper_range):
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        if len(author_ids.intersection(committee)) == 0:
          continue
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference.id] = topics
  heatmap_arr = []
  valid_conferences = []
  for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
    tot = sum(conference_topics[conference_id])
    if tot > 0:
      valid_conferences.append(conference_id)
      dist = [top / tot for top in conference_topics[conference_id]]
      heatmap_arr.append(dist)
  row_labels = ["%2d" % ind for ind in range(lda_model.n_topics)]
  col_labels = [c.acronym for c in mysql.get_conferences() if c.id in valid_conferences]
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, col_labels,
                     "figs/v2/pc/%s.png" % fig_name, dendo_settings)
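

# --- Illustrative sketch (added for clarity; not part of the original module). ---
# Several functions in this file nest the same index_by_year helper: it groups
# (id, year) tuples into {year: set(ids)}. The stand-alone copy below, fed
# hypothetical inputs, shows the shape of its output.
def _index_by_year_sketch():
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  grouped = index_by_year([("p1", "2009"), ("p2", "2009"), ("p3", "2010")])
  # grouped == {"2009": set(["p1", "p2"]), "2010": set(["p3"])}
  return grouped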
def pc_topics_heatmap(year_range=None):
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  max_len = 21
  start = 1993
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conference_topics = {}
  for conference in mysql.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    topics = np.array([0] * lda_model.n_topics)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if (year_range is not None) and (int(year) not in year_range):
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        if author_ids.intersection(committee):
          continue
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference.id] = topics
  heatmap_arr = []
  for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
    tot = sum(conference_topics[conference_id])
    dist = [top / tot for top in conference_topics[conference_id]]
    heatmap_arr.append(dist)
  row_labels = range(lda_model.n_topics)
  column_labels = [c.acronym for c in mysql.get_conferences()]
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                     "figs/pc/pc_heatmap_7topics.png")
def conference_diversity(fig_name, dend_settings, paper_range=None):
  miner, graph, lda_model, vocab = get_graph_lda_data()
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  valid_conferences = []
  for conference_id, papers in conferences.items():
    topics = np.array([0] * lda_model.n_topics)
    for tup in yearize(papers).items():
      if paper_range and tup[0] not in paper_range:
        continue
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    if sum(topics) > 0:
      conference_topics[conference_id] = percent_sort(topics)
      conference_heatmaps[conference_id] = topics
      valid_conferences.append(conference_id)
  row_labels = [str(ind) + "-" + name for ind, name in zip(range(lda_model.n_topics), TOPICS)]
  # row_labels = ["%2d" % ind for ind in range(lda_model.n_topics)]
  column_labels = [c.acronym for c in mysql.get_conferences() if c.id in valid_conferences]
  # Heatmap
  heatmap_arr = []
  for conference_id in sorted(conference_heatmaps.keys(), key=lambda x: int(x)):
    tot = sum(conference_heatmaps[conference_id])
    dist = [top / tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  report(lda_model, vocab, 15)
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
  #                    "figs/v2/diversity/%s_dend.png" % fig_name, dend_settings)
  make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
               "figs/v2/diversity/%s.png" % fig_name)
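

# --- Illustrative sketch (added; counts are hypothetical). ---
# Each heatmap row above is a venue's raw topic-count vector scaled to a
# distribution by dividing by its total, exactly as dist is built from
# conference_heatmaps.
def _topic_distribution_sketch():
  counts = [10.0, 30.0, 60.0]
  tot = sum(counts)
  # -> [0.1, 0.3, 0.6], which sums to 1 and becomes one row of the heatmap
  return [c / tot for c in counts]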
def pc_bias_table():
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  graph = cite_graph(GRAPH_CSV)
  max_len = 21
  start = 1993
  max_len = 5
  start = 2009
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conf_year_scores = {}
  for conference in mysql.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if int(year) < start:  # year keys are strings, so compare as ints
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      comm_papers = 0
      non_comm_papers = 0
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        if author_ids.intersection(committee):
          comm_papers += 1
        else:
          non_comm_papers += 1
      year_scores[int(year)] = 0 if not comm_papers else int(
          round(comm_papers * 100 / (comm_papers + non_comm_papers)))
    conf_year_scores[conference.acronym] = year_scores
  header = ["conf"] + [str(start + i) for i in xrange(max_len)]
  table = PrettyTable(header)
  for conf, year_scores in conf_year_scores.items():
    row = [conf]
    for index in xrange(max_len):
      row.append(year_scores.get(start + index, None))
    table.add_row(row)
  print("```")
  print(table)
  print("```")
  exit()
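

# --- Worked sketch (added; counts are hypothetical). ---
# Each year cell in pc_bias_table is the percentage of accepted papers with at
# least one PC author, e.g. 12 PC papers out of 12 + 28 accepted gives 30 (%).
def _pc_share_sketch(comm_papers=12, non_comm_papers=28):
  return int(round(comm_papers * 100.0 / (comm_papers + non_comm_papers)))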
def conference_evolution_2(paper_range, figname):
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  for conference_id, papers in conferences.items():
    topics = np.array([0] * lda_model.n_topics)
    for tup in yearize(papers).items():
      if tup[0] not in paper_range:
        continue
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference_id] = percent_sort(topics)
    conference_heatmaps[conference_id] = topics
  n_top_words = 10
  # Heatmap
  heatmap_arr = []
  column_labels = []
  for conference_id, conf in zip(sorted(conference_heatmaps.keys(), key=lambda x: int(x)),
                                 mysql.get_conferences()):
    tot = sum(conference_heatmaps[conference_id])
    if tot == 0:
      continue
    column_labels.append(conf.acronym)
    dist = [top / tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  fig, ax = plt.subplots()
  heatmap_arr = np.array(heatmap_arr)
  heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
  plt.ylabel("Conferences")
  plt.xlabel("Topics")
  row_labels = range(lda_model.n_topics)
  ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
  ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
  ax.set_xticklabels(row_labels, minor=False)
  ax.set_yticklabels(column_labels, minor=False)
  plt.savefig("figs/diversity/heatmap_7topics.png")
  plt.clf()
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
  #                    "figs/diversity/dend_heatmap_7topics.png")
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                     "figs/evolution/%s.png" % figname)
def pc_paper_count_table():
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  graph = cite_graph(GRAPH_CSV)
  start = 2009
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  header = ["conf", "# Accepted", "# from PC", "Percentage"]
  table = PrettyTable(header)
  all_papers, all_pc_papers = 0, 0
  for conference in mysql.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    comm_papers = 0
    tot_papers = 0
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if int(year) < start:
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        if author_ids.intersection(committee):
          comm_papers += 1
        tot_papers += 1
    table.add_row([conference.acronym, tot_papers, comm_papers,
                   int(round(100 * comm_papers / tot_papers, 0))])
    all_papers += tot_papers
    all_pc_papers += comm_papers
  table.add_row(["all", all_papers, all_pc_papers,
                 int(round(100 * all_pc_papers / all_papers, 0))])
  print("```")
  print(table)
  print("```")
def conference_diversity():
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  for conference_id, papers in conferences.items():
    topics = np.array([0] * lda_model.n_topics)
    for tup in yearize(papers).items():
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference_id] = percent_sort(topics)
    conference_heatmaps[conference_id] = topics
  # fig, ax = plt.subplots()
  bar_vals = []
  colors = []
  width = 0.75
  plts = []
  x_axis = np.arange(1, len(conference_topics.keys()) + 1)
  # x_axis = [c.acronym for c in mysql.get_conferences()]
  y_offset = np.array([0] * len(conference_topics.keys()))
  colors_dict = {}
  for index in range(7):
    bar_val = []
    color = []
    for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
      topic = conference_topics[conference_id][index]
      colors_dict[topic[0]] = get_color(topic[0])
      color.append(colors_dict[topic[0]])
      bar_val.append(topic[1])
    plts.append(plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
    y_offset = np.add(y_offset, bar_val)
  plt.ylabel("Topic Coverage %")
  plt.xlabel("Conferences")
  plt.xticks(x_axis + width / 2, [c.acronym for c in mysql.get_conferences()])
  plt.yticks(np.arange(0, 101, 10))
  plt.ylim([0, 101])
  # Legends
  patches = []
  for topic, color in colors_dict.items():
    patches.append(mpatches.Patch(color=color, label='Topic %s' % str(topic)))
  plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05),
             ncol=6, fontsize=7)
  plt.savefig("figs/diversity/conference_diversity_7topics.png")
  plt.clf()
  n_top_words = 10
  # Heatmap
  heatmap_arr = []
  for conference_id in sorted(conference_heatmaps.keys(), key=lambda x: int(x)):
    tot = sum(conference_heatmaps[conference_id])
    dist = [top / tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  fig, ax = plt.subplots()
  heatmap_arr = np.array(heatmap_arr)
  heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
  plt.ylabel("Conferences")
  plt.xlabel("Topics")
  # row_labels = range(lda_model.n_topics)
  t_names = ["Testing", "Applications", "Program Analysis", "Tools and Projects",
             "Defect Analysis", "Modeling", "Maintenance"]
  row_labels = [str(ind) + "-" + name for ind, name in zip(range(lda_model.n_topics), t_names)]
  column_labels = [c.acronym for c in mysql.get_conferences()]
  ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
  ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
  ax.set_xticklabels(row_labels, minor=False)
  ax.set_yticklabels(column_labels, minor=False)
  plt.savefig("figs/diversity/heatmap_7topics.png")
  plt.clf()
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
  #                    "figs/diversity/dend_heatmap_7topics.png")
  make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
               "figs/diversity/heatmap2.png")
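

# --- Stacking sketch (added; values are hypothetical). ---
# The stacked bars above are drawn series by series: each new series is plotted
# with bottom=y_offset, then y_offset grows by that series' values, so segments
# pile up to 100% coverage per conference.
def _stacked_offsets_sketch():
  series = [[40, 10], [30, 50], [30, 40]]  # coverage of 3 topics for 2 venues
  y_offset = [0, 0]
  bottoms = []
  for vals in series:
    bottoms.append(list(y_offset))
    y_offset = [o + v for o, v in zip(y_offset, vals)]
  # bottoms == [[0, 0], [40, 10], [70, 60]]
  return bottoms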
def pc_bias():
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]
  colors = ['r', 'g', 'b', 'y']
  graph = cite_graph(GRAPH_CSV)
  width = 0.5
  space = 0.3
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  max_len = 21
  low = 1
  high = max_len * (len(legit_conferences) * width + space) + 1
  delta = (high - low) / max_len
  x_axis = np.arange(low, high, delta)
  x_ticks = np.arange(1993, 1993 + max_len)
  conf_index = 0
  patches = []
  for conference in mysql.get_conferences():
    if conference.acronym not in legit_conferences:
      continue
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    y_axis = []
    # x_axis = np.arange(1, len(year_committees.keys()) + 1)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      papers = year_papers.get(year, None)
      if papers is None:
        y_axis.append(0)
        continue
      committee = year_committees[year]
      comm_papers = 0
      non_comm_papers = 0
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        if author_ids.intersection(committee):
          comm_papers += 1
        else:
          non_comm_papers += 1
      year_scores[year] = (comm_papers, non_comm_papers)
      percent = 0 if not comm_papers else comm_papers * 100 / (comm_papers + non_comm_papers)
      y_axis.append(percent)
    y_axis = np.lib.pad(y_axis, (max_len - len(y_axis), 0), 'constant', constant_values=0)
    plt.bar(x_axis + conf_index * width, y_axis, width=width, color=colors[conf_index])
    patches.append(mpatches.Patch(color=colors[conf_index], label=conference.acronym))
    conf_index += 1
  plt.xlabel("Year")
  plt.ylabel("% of papers by PC")
  plt.xticks(x_axis + len(legit_conferences) * width / 2, [str(y)[2:] for y in x_ticks])
  # plt.yticks(np.arange(0, 100, 10))
  # plt.title(conference.acronym)
  plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05),
             ncol=len(legit_conferences), fontsize=7)
  plt.savefig("figs/pc/pc.png")
  plt.clf()
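

# --- Layout sketch (added; mirrors the constants used in pc_bias above). ---
# With 4 conferences, width = 0.5 and space = 0.3, each year group spans
# 4 * 0.5 + 0.3 = 2.3 x-units, so the 21 groups start at 1, 3.3, 5.6, ... and
# the bar for conference k inside a group sits at group_start + k * width.
def _grouped_bar_positions_sketch(n_groups=21, n_series=4, width=0.5, space=0.3, low=1.0):
  group_span = n_series * width + space
  group_starts = [low + g * group_span for g in range(n_groups)]
  return [[start + k * width for start in group_starts] for k in range(n_series)]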
def conference_evolution():
  legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]
  non_legit_conferences = ["GPCE", "FASE"]
  TOP_TOPIC_COUNT = 7
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  f, subplts = plt.subplots(3, 3)
  f.tight_layout()
  y_counter = -1
  x_counter = 0
  for conf_index, conference in enumerate(mysql.get_conferences()):
    # if conference.acronym not in legit_conferences: continue
    if conference.acronym in non_legit_conferences:
      continue
    y_counter += 1
    if y_counter > 2:
      x_counter += 1
      y_counter = 0
    year_topics = {}
    year_heatmaps = {}
    for year, papers in yearize(conferences[conference.id]).items():
      topics = np.array([0] * lda_model.n_topics)
      for paper_id in papers:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
      year_heatmaps[year] = topics
      year_topics[year] = percent_sort(topics)
    width = 0.8
    x_axis = np.arange(1, len(year_topics.keys()) + 1)
    # x_axis = [c.acronym for c in mysql.get_conferences()]
    y_offset = np.array([0] * len(year_topics.keys()))
    colors_dict = {}
    for index in range(TOP_TOPIC_COUNT):
      bar_val, color = [], []
      for year in sorted(year_topics.keys(), key=lambda x: int(x)):
        topic = year_topics[year][index]
        colors_dict[topic[0]] = get_color(topic[0])
        color.append(colors_dict[topic[0]])
        bar_val.append(topic[1])
      subplts[x_counter, y_counter].bar(x_axis, bar_val, width, color=color, bottom=y_offset)
      y_offset = np.add(y_offset, bar_val)
    # subplts[x_counter, y_counter].set_ylabel("Topic Coverage %")
    # subplts[x_counter, y_counter].set_xlabel("Conferences")
    if len(year_topics.keys()) <= 14:
      subplts[x_counter, y_counter].set_xticks(x_axis + width / 2)
      subplts[x_counter, y_counter].set_xticklabels(
          [str(y)[2:] for y in sorted(year_topics.keys(), key=lambda x: int(x))], fontsize=7)
    else:
      subplts[x_counter, y_counter].set_xticks(
          np.arange(1, len(year_topics.keys()) + 1, 2) + width / 2)
      subplts[x_counter, y_counter].set_xticklabels(
          [str(y)[2:] for index, y in enumerate(sorted(year_topics.keys(), key=lambda x: int(x)))
           if index % 2 == 0], fontsize=7)
    subplts[x_counter, y_counter].set_yticks(np.arange(0, 101, 20))
    subplts[x_counter, y_counter].set_ylim([0, 101])
    subplts[x_counter, y_counter].set_title(conference.acronym)
  # Legends
  patches = []
  labels = []
  for topic in xrange(lda_model.n_topics):
    patches.append(mpatches.Patch(color=get_color(topic)))
    labels.append('Topic %s' % str(topic))
  f.legend(handles=patches, labels=labels, loc='upper center', bbox_to_anchor=(0.5, 0.04),
           ncol=12, fontsize=7)
  plt.savefig("figs/evolution/evolution_7topics.png")
  plt.clf()
  n_top_words = 10
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
def yearize(paps):
  paps = sorted(paps, key=lambda tup: tup[1], reverse=True)
  pap_dict = {}
  for pap in paps:
    year_paps = pap_dict.get(int(pap[1]), [])
    year_paps.append(pap[0])
    pap_dict[int(pap[1])] = year_paps
  return OrderedDict(sorted(pap_dict.items(), key=lambda t: t[0]))
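

# --- Usage sketch for yearize (added; paper ids and years are hypothetical). ---
# yearize turns (paper_id, year) tuples into an OrderedDict keyed by int(year)
# in ascending order, with the paper ids of each year collected into a list.
def _yearize_sketch():
  grouped = yearize([("p1", "2009"), ("p2", "2010"), ("p3", "2009")])
  # grouped.keys() == [2009, 2010]; grouped[2009] holds "p1" and "p3"
  return grouped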
def pc_heatmap_delta(fig_name, title=None, paper_range=None):
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  miner, graph, lda_model, vocab = get_graph_lda_data()
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conference_topics = {}
  pc_conference_topics = {}
  for conference in mysqldb.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    topics = np.array([0] * lda_model.n_topics)
    pc_topics = np.array([0] * lda_model.n_topics)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if (paper_range is not None) and (int(year) not in paper_range):
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        paper_topics = miner.documents[paper_id].topics_count
        if len(author_ids.intersection(committee)) != 0:
          pc_topics = np.add(pc_topics, paper_topics)
        topics = np.add(topics, paper_topics)
    pc_conference_topics[conference.id] = pc_topics
    conference_topics[conference.id] = topics
  heatmap_arr = []
  valid_conferences = []
  for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
    tot = sum(conference_topics[conference_id])
    pc_tot = sum(pc_conference_topics[conference_id])
    if tot <= 0 or pc_tot <= 0:
      continue
    valid_conferences.append(conference_id)
    dist = [top / tot for top in conference_topics[conference_id]]
    pc_dist = [top / pc_tot for top in pc_conference_topics[conference_id]]
    # heatmap_arr.append([round(pc_d - d, 2) for d, pc_d in zip(dist, pc_dist)])
    heatmap_arr.append([int(round(100 * (pc_d - d) / d, 0)) for d, pc_d in zip(dist, pc_dist)])
    # heatmap_arr.append([round(d / pc_d, 2) for d, pc_d in zip(dist, pc_dist)])
  np.savetxt("temp.csv", np.transpose(np.array(heatmap_arr)), delimiter=",")
  # HeatMap
  # row_labels = ["%2d" % ind for ind in range(lda_model.n_topics)]
  row_labels = TOPICS_ALL
  col_labels = [c.acronym for c in mysqldb.get_conferences() if c.id in valid_conferences]
  heatmap_arr = np.transpose(np.array(heatmap_arr, np.int))
  plt.figure(figsize=(4, 3))
  cmap = mpl.colors.ListedColormap(['red', 'lightsalmon', 'white', 'palegreen', 'lime'])
  bounds = [-20, -12, -5, 5, 12, 20]
  # bounds = [-0.2, -0.12, -0.05, 0.05, 0.12, 0.2]
  norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
  # df = pd.DataFrame(heatmap_arr, columns=col_labels, index=row_labels)
  cax = plt.matshow(heatmap_arr, interpolation='nearest', cmap=cmap, norm=norm)
  for (i, j), z in np.ndenumerate(heatmap_arr):
    plt.text(j, i, abs(z), ha='center', va='center', fontsize=11)
  # ticks = [-0.2, -0.1, 0, 0.1, 0.2]
  ticks = [-20, -10, 0, 10, 20]
  plt.colorbar(cax, cmap=cmap, norm=norm, boundaries=bounds, ticks=ticks)
  plt.xticks(np.arange(len(list(col_labels))), list(col_labels), rotation="vertical")
  plt.yticks(np.arange(len(list(row_labels))), list(row_labels))
  if title is None:
    title = "Topic Distribution Delta between papers by PC and all papers"
  plt.title(title, y=1.2)
  plt.savefig("figs/v3/%s/pc/%s.png" % (THE.permitted, fig_name), bbox_inches='tight')
  plt.clf()
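

# --- Worked sketch (added; shares are hypothetical). ---
# Each cell in pc_heatmap_delta is the relative difference, in percent, between
# the PC's topic share and the venue's overall topic share:
# 100 * (pc_share - overall_share) / overall_share, rounded to an int.
def _pc_delta_sketch(overall_share=0.20, pc_share=0.23):
  # 100 * 0.03 / 0.20 -> 15, i.e. PC members over-publish this topic by ~15%
  return int(round(100 * (pc_share - overall_share) / overall_share, 0))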
def get_n_topics():
  if THE.permitted == "journals":
    return 7
  if THE.permitted == "all":
    return 11


STOP_WORDS = text.ENGLISH_STOP_WORDS.union(
    ['software', 'engineering', 'paper', 'study', 'based', 'results', 'approach',
     'case', 'workshop', 'international', 'research', 'conference', 'introduction',
     'editors', 'article', 'issue', 'month', 'copyright', 'special', 'used', 'using',
     'use', 'studies', 'review', 'editorial', 'report', 'book', 'ieee', 'published',
     'science', 'column', 'author', 'proposed', 'icse', 'article', 'year', 'articles',
     'page', '2000', '2004', 'papers', 'computer', 'held', 'editor'])

CONFERENCES = [venue.acronym for venue in mysqldb.get_conferences()]

# Config
THE = O()
THE.permitted = "all"


def is_not_none(s):
  return s and s != 'None'


def harmonic_dist(n):
  dist = [1 / i for i in range(1, n + 1)]
  total = sum(dist)
  return [d / total for d in dist]
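

# --- Worked sketch (added; assumes true division, as harmonic_dist intends). ---
# For n = 3 the unnormalized weights are [1, 1/2, 1/3]; their sum is 11/6, so
# the normalized distribution is [6/11, 3/11, 2/11] ~= [0.545, 0.273, 0.182].
def _harmonic_dist_sketch():
  raw = [1.0 / i for i in range(1, 4)]
  total = sum(raw)
  return [w / total for w in raw]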
def is_true(val):
  return val in [True, 'True', 'true']


def shorter_names(name):
  name_map = {
      "SOFTWARE": "S/W",
      "SIGSOFT": "NOTES",
      "MODELS": "MDLS"
  }
  if name in name_map:
    return name_map[name]
  return name


CONFERENCES = [shorter_names(conf.acronym) for conf in mysqldb.get_conferences()]


@Memoized
def retrieve_graph_lda_data():
  """
  Fetch stored metadata
  :return:
  """
  graph_file = 'cache/%s/%s/graph.pkl' % (THE.version, THE.permitted)
  vectorizer_file = 'cache/%s/%s/vectorizer.pkl' % (THE.version, THE.permitted)
  doc_2_vec_file = 'cache/%s/%s/doc_2_vec.pkl' % (THE.version, THE.permitted)
  documents_file = 'cache/%s/%s/documents.pkl' % (THE.version, THE.permitted)
  lda_model_file = 'cache/%s/%s/lda_model.pkl' % (THE.version, THE.permitted)
  vocabulary_file = 'cache/%s/%s/vocabulary.pkl' % (THE.version, THE.permitted)
  if os.path.isfile(graph_file) and os.path.isfile(vectorizer_file) \