Пример #1
0
def pc_topics_heatmap(fig_name, dendo_settings, paper_range=None):
    """Plot a dendrogram heatmap of LDA topic distributions for papers
    co-authored by at least one program-committee member, one column
    per conference.

    :param fig_name: base name of the figure written to figs/v2/pc/.
    :param dendo_settings: settings object forwarded to make_dendo_heatmap.
    :param paper_range: optional container of years (ints); when given,
        only papers from those years are counted.
    """
    def index_by_year(tups):
        # Group (member, year) tuples into {year: set(members)}.
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    miner, graph, lda_model, vocab = get_graph_lda_data()
    p_conferences = graph.get_papers_by_venue()
    p_committees = graph.get_committee_by_conference()
    conference_topics = {}
    for conference in mysql.get_conferences():
        # Skip conferences lacking committee or paper data.
        if conference.id not in p_committees or conference.id not in p_conferences:
            continue
        year_committees = index_by_year(p_committees[conference.id])
        year_papers = index_by_year(p_conferences[conference.id])
        year_scores = {}
        topics = np.array([0] * lda_model.n_topics)
        for year in sorted(year_committees.keys(), key=lambda y: int(y)):
            if (paper_range is not None) and (int(year) not in paper_range):
                continue
            papers = year_papers.get(year, None)
            if papers is None:
                year_scores[int(year)] = None
                continue
            committee = year_committees[year]
            for paper_id in papers:
                paper = graph.paper_nodes[paper_id]
                author_ids = set(paper.author_ids.strip().split(","))
                # Only count papers with at least one PC-member author.
                if len(author_ids.intersection(committee)) == 0:
                    continue
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        conference_topics[conference.id] = topics
    heatmap_arr = []
    valid_conferences = []
    for conference_id in sorted(conference_topics.keys(),
                                key=lambda x: int(x)):
        tot = sum(conference_topics[conference_id])
        # BUG FIX: a zero-total conference previously still appended a row
        # to heatmap_arr (dividing by zero) while being left out of
        # valid_conferences, misaligning heatmap rows with column labels.
        # Skip it entirely instead.
        if tot == 0:
            continue
        valid_conferences.append(conference_id)
        dist = [top / tot for top in conference_topics[conference_id]]
        heatmap_arr.append(dist)
    row_labels = ["%2d" % ind for ind in range(lda_model.n_topics)]
    col_labels = [
        c.acronym for c in mysql.get_conferences() if c.id in valid_conferences
    ]
    make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, col_labels,
                       "figs/v2/pc/%s.png" % fig_name, dendo_settings)
Пример #2
0
def pc_topics_heatmap(year_range=None):
    """Build a dendrogram heatmap of topic distributions per conference,
    counting only papers with NO program-committee author.

    :param year_range: optional container of years (ints); when given,
        only papers from those years are counted.
    """
    def index_by_year(tups):
        # Group (member, year) tuples into {year: set(members)}.
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    # 7-topic LDA; alpha/beta presumably tuned elsewhere -- TODO confirm.
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    max_len = 21  # unused in this function
    start = 1993  # unused in this function
    p_conferences = graph.get_papers_by_venue()
    p_committees = graph.get_committee_by_conference()
    conference_topics = {}
    for conference in mysql.get_conferences():
        year_committees = index_by_year(p_committees[conference.id])
        year_papers = index_by_year(p_conferences[conference.id])
        year_scores = {}  # written below but never read
        topics = np.array([0] * lda_model.n_topics)
        for year in sorted(year_committees.keys(), key=lambda y: int(y)):
            if (year_range is not None) and (int(year) not in year_range):
                continue
            papers = year_papers.get(year, None)
            if papers is None:
                year_scores[int(year)] = None
                continue
            committee = year_committees[year]
            for paper_id in papers:
                paper = graph.paper_nodes[paper_id]
                author_ids = set(paper.author_ids.strip().split(","))
                # Papers WITH a PC-member author are skipped, so the topic
                # totals cover non-PC papers only.
                if author_ids.intersection(committee):
                    continue
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        conference_topics[conference.id] = topics
    heatmap_arr = []
    for conference_id in sorted(conference_topics.keys(),
                                key=lambda x: int(x)):
        tot = sum(conference_topics[conference_id])
        # NOTE(review): divides even when tot == 0, and `/` may truncate
        # under Python 2 integer semantics -- verify intended.
        dist = [top / tot for top in conference_topics[conference_id]]
        heatmap_arr.append(dist)
    row_labels = range(lda_model.n_topics)
    column_labels = [c.acronym for c in mysql.get_conferences()]
    make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                       "figs/pc/pc_heatmap_7topics.png")
Пример #3
0
def conference_diversity(fig_name, dend_settings, paper_range=None):
  """Render a topics-by-conferences heatmap of topic coverage.

  :param fig_name: base name of the figure written to figs/v2/diversity/.
  :param dend_settings: dendrogram settings (currently unused; the
      make_dendo_heatmap call below is commented out).
  :param paper_range: optional container of years; when given, only papers
      from those years are counted.
  """
  miner, graph, lda_model, vocab = get_graph_lda_data()
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  valid_conferences = []
  for conference_id, papers in conferences.items():
    topics = np.array([0] * lda_model.n_topics)
    for tup in yearize(papers).items():
      # tup is (year, paper_ids).
      if paper_range and tup[0] not in paper_range: continue
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    # Keep only conferences with at least one counted paper.
    if sum(topics) > 0:
      conference_topics[conference_id] = percent_sort(topics)
      conference_heatmaps[conference_id] = topics
      valid_conferences.append(conference_id)
  row_labels = [str(ind) + "-" + name for ind, name in zip(range(lda_model.n_topics), TOPICS)]
  # row_labels = ["%2d" % ind for ind in range(lda_model.n_topics)]
  column_labels = [c.acronym for c in mysql.get_conferences() if c.id in valid_conferences]
  # Heatmap
  heatmap_arr = []
  for conference_id in sorted(conference_heatmaps.keys(), key=lambda x: int(x)):
    tot = sum(conference_heatmaps[conference_id])
    dist = [top / tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  report(lda_model, vocab, 15)
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
  #                    "figs/v2/diversity/%s_dend.png" % fig_name, dend_settings)
  make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/v2/diversity/%s.png" % fig_name)
Пример #4
0
def pc_topics_heatmap(fig_name, dendo_settings, paper_range=None):
  """Plot a dendrogram heatmap of LDA topic distributions for papers
  co-authored by at least one program-committee member, one column per
  conference.

  :param fig_name: base name of the figure written to figs/v2/pc/.
  :param dendo_settings: settings object forwarded to make_dendo_heatmap.
  :param paper_range: optional container of years (ints); when given, only
      papers from those years are counted.
  """
  def index_by_year(tups):
    # Group (member, year) tuples into {year: set(members)}.
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  miner, graph, lda_model, vocab = get_graph_lda_data()
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conference_topics = {}
  for conference in mysql.get_conferences():
    # Skip conferences lacking committee or paper data.
    if conference.id not in p_committees or conference.id not in p_conferences:
      continue
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    topics = np.array([0] * lda_model.n_topics)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if (paper_range is not None) and (int(year) not in paper_range):
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        # Only count papers with at least one PC-member author.
        if len(author_ids.intersection(committee)) == 0:
          continue
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference.id] = topics
  heatmap_arr = []
  valid_conferences = []
  for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
    tot = sum(conference_topics[conference_id])
    # BUG FIX: a zero-total conference previously still appended a row to
    # heatmap_arr (dividing by zero) while being left out of
    # valid_conferences, misaligning heatmap rows with column labels.
    # Skip it entirely instead.
    if tot == 0:
      continue
    valid_conferences.append(conference_id)
    dist = [top / tot for top in conference_topics[conference_id]]
    heatmap_arr.append(dist)
  row_labels = ["%2d" % ind for ind in range(lda_model.n_topics)]
  col_labels = [c.acronym for c in mysql.get_conferences() if c.id in valid_conferences]
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, col_labels, "figs/v2/pc/%s.png" % fig_name, dendo_settings)
Пример #5
0
def pc_topics_heatmap(year_range=None):
  """Build a dendrogram heatmap of topic distributions per conference,
  counting only papers with NO program-committee author.

  :param year_range: optional container of years (ints); when given, only
      papers from those years are counted.
  """
  def index_by_year(tups):
    # Group (member, year) tuples into {year: set(members)}.
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  # 7-topic LDA; alpha/beta presumably tuned elsewhere -- TODO confirm.
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  max_len = 21  # unused in this function
  start = 1993  # unused in this function
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conference_topics = {}
  for conference in mysql.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}  # written below but never read
    topics = np.array([0] * lda_model.n_topics)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if (year_range is not None) and (int(year) not in year_range):
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        # Papers WITH a PC-member author are skipped, so the topic totals
        # cover non-PC papers only.
        if author_ids.intersection(committee):
          continue
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference.id] = topics
  heatmap_arr = []
  for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
    tot = sum(conference_topics[conference_id])
    # NOTE(review): divides even when tot == 0, and `/` may truncate under
    # Python 2 integer semantics -- verify intended.
    dist = [top / tot for top in conference_topics[conference_id]]
    heatmap_arr.append(dist)
  row_labels = range(lda_model.n_topics)
  column_labels = [c.acronym for c in mysql.get_conferences()]
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/pc/pc_heatmap_7topics.png")
Пример #6
0
def pc_bias_table():
    """Print a PrettyTable (wrapped in markdown code fences) of the
    percentage of PC-authored papers per conference per year, for a
    5-year window starting at 2009.

    NOTE: calls exit() at the end, terminating the process.
    """
    def index_by_year(tups):
        # Group (member, year) tuples into {year: set(members)}.
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    graph = cite_graph(GRAPH_CSV)
    max_len = 21
    start = 1993
    # The later assignments win: a 5-year window starting at 2009.
    max_len = 5
    start = 2009
    p_conferences = graph.get_papers_by_venue()
    p_committees = graph.get_committee_by_conference()
    conf_year_scores = {}
    for conference in mysql.get_conferences():
        year_committees = index_by_year(p_committees[conference.id])
        year_papers = index_by_year(p_conferences[conference.id])
        year_scores = {}
        for year in sorted(year_committees.keys(), key=lambda y: int(y)):
            # BUG FIX: `year` is a string key (note the int(year) casts
            # below and the int() sort key), so the original `year < start`
            # comparison against an int never filtered anything under
            # Python 2 ordering rules. Cast before comparing, matching
            # pc_paper_count_table.
            if int(year) < start: continue
            papers = year_papers.get(year, None)
            if papers is None:
                year_scores[int(year)] = None
                continue
            committee = year_committees[year]
            comm_papers = 0
            non_comm_papers = 0
            for paper_id in papers:
                paper = graph.paper_nodes[paper_id]
                author_ids = set(paper.author_ids.strip().split(","))
                # A paper counts as a PC paper when any author is on the
                # committee for that year.
                if author_ids.intersection(committee):
                    comm_papers += 1
                else:
                    non_comm_papers += 1
            year_scores[int(year)] = 0 if not comm_papers else int(
                round(comm_papers * 100 / (comm_papers + non_comm_papers)))
        conf_year_scores[conference.acronym] = year_scores
    header = ["conf"] + [str(start + i) for i in xrange(max_len)]
    table = PrettyTable(header)
    for conf, year_scores in conf_year_scores.items():
        row = [conf]
        for index in xrange(max_len):
            row.append(year_scores.get(start + index, None))
        table.add_row(row)
    print("```")
    print(table)
    print("```")
    exit()
Пример #7
0
def conference_evolution_2(paper_range, figname):
    """Plot per-conference topic distributions (restricted to paper_range)
    as a pcolor heatmap, print the top words per topic, and save a
    dendrogram heatmap under figs/evolution/.

    :param paper_range: container of years; only papers in these years count.
    :param figname: base name for the dendrogram heatmap figure.
    """
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    # 7-topic LDA; alpha/beta presumably tuned elsewhere -- TODO confirm.
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    conferences = graph.get_papers_by_venue()
    conference_topics = {}
    conference_heatmaps = {}
    for conference_id, papers in conferences.items():
        topics = np.array([0] * lda_model.n_topics)
        for tup in yearize(papers).items():
            # tup is (year, paper_ids).
            if tup[0] not in paper_range:
                continue
            for paper_id in tup[1]:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        conference_topics[conference_id] = percent_sort(topics)
        conference_heatmaps[conference_id] = topics
    n_top_words = 10
    #Heatmap
    heatmap_arr = []
    column_labels = []
    # NOTE(review): pairing sorted conference ids with mysql.get_conferences()
    # positionally assumes both iterate in the same order -- verify.
    for conference_id, conf in zip(
            sorted(conference_heatmaps.keys(), key=lambda x: int(x)),
            mysql.get_conferences()):
        tot = sum(conference_heatmaps[conference_id])
        if tot == 0: continue  # skip conferences with no counted papers
        column_labels.append(conf.acronym)
        dist = [top / tot for top in conference_heatmaps[conference_id]]
        heatmap_arr.append(dist)
    fig, ax = plt.subplots()
    heatmap_arr = np.array(heatmap_arr)
    heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
    plt.ylabel("Conferences")
    plt.xlabel("Topics")
    row_labels = range(lda_model.n_topics)
    ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
    ax.set_xticklabels(row_labels, minor=False)
    ax.set_yticklabels(column_labels, minor=False)
    plt.savefig("figs/diversity/heatmap_7topics.png")
    plt.clf()
    # Print the n_top_words highest-weight words for each topic.
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
    # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/dend_heatmap_7topics.png")
    make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                       "figs/evolution/%s.png" % figname)
Пример #8
0
def pc_bias_table():
  """Print a PrettyTable (wrapped in markdown code fences) of the
  percentage of PC-authored papers per conference per year, for a 5-year
  window starting at 2009.

  NOTE: calls exit() at the end, terminating the process.
  """
  def index_by_year(tups):
    # Group (member, year) tuples into {year: set(members)}.
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  graph = cite_graph(GRAPH_CSV)
  max_len = 21
  start = 1993
  # The later assignments win: a 5-year window starting at 2009.
  max_len = 5
  start = 2009
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conf_year_scores = {}
  for conference in mysql.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      # BUG FIX: `year` is a string key (note the int(year) casts below
      # and the int() sort key), so the original `year < start` comparison
      # against an int never filtered anything under Python 2 ordering
      # rules. Cast before comparing, matching pc_paper_count_table.
      if int(year) < start: continue
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      comm_papers = 0
      non_comm_papers = 0
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        # A paper counts as a PC paper when any author is on the committee
        # for that year.
        if author_ids.intersection(committee):
          comm_papers += 1
        else:
          non_comm_papers += 1
      year_scores[int(year)] = 0 if not comm_papers else int(round(comm_papers * 100 / (comm_papers + non_comm_papers)))
    conf_year_scores[conference.acronym] = year_scores
  header = ["conf"] + [str(start + i) for i in xrange(max_len)]
  table = PrettyTable(header)
  for conf, year_scores in conf_year_scores.items():
    row = [conf]
    for index in xrange(max_len):
      row.append(year_scores.get(start+index, None))
    table.add_row(row)
  print("```")
  print(table)
  print("```")
  exit()
Пример #9
0
def pc_paper_count_table():
    """Print a table of accepted papers vs. papers with at least one PC
    author, per conference and overall, for years >= 2009."""
    def index_by_year(tups):
        # Group (member, year) tuples into {year: set(members)}.
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    graph = cite_graph(GRAPH_CSV)
    start = 2009
    p_conferences = graph.get_papers_by_venue()
    p_committees = graph.get_committee_by_conference()
    header = ["conf", "# Accepted", "# from PC", "Percentage"]
    table = PrettyTable(header)
    all_papers, all_pc_papers = 0, 0
    for conference in mysql.get_conferences():
        year_committees = index_by_year(p_committees[conference.id])
        year_papers = index_by_year(p_conferences[conference.id])
        year_scores = {}  # written below but never read
        comm_papers = 0
        tot_papers = 0
        for year in sorted(year_committees.keys(), key=lambda y: int(y)):
            if int(year) < start: continue
            papers = year_papers.get(year, None)
            if papers is None:
                year_scores[int(year)] = None
                continue
            committee = year_committees[year]
            for paper_id in papers:
                paper = graph.paper_nodes[paper_id]
                author_ids = set(paper.author_ids.strip().split(","))
                # A paper counts as a PC paper when any author is on the
                # committee for that year.
                if author_ids.intersection(committee):
                    comm_papers += 1
                tot_papers += 1
        # NOTE(review): raises ZeroDivisionError if a conference has no
        # papers after 2009; `/` may also truncate under Python 2 -- verify.
        table.add_row([
            conference.acronym, tot_papers, comm_papers,
            int(round(100 * comm_papers / tot_papers, 0))
        ])
        all_papers += tot_papers
        all_pc_papers += comm_papers
    table.add_row([
        "all", all_papers, all_pc_papers,
        int(round(100 * all_pc_papers / all_papers, 0))
    ])
    print("```")
    print(table)
    print("```")
Пример #10
0
def pc_paper_count_table():
  """Print a markdown-fenced PrettyTable comparing, per conference, the
  number of accepted papers against those with at least one PC author
  (years >= 2009), followed by an aggregate "all" row."""
  def index_by_year(tups):
    # {year: set(first components)} built from (member, year) tuples.
    indexed = {}
    for tup in tups:
      indexed.setdefault(tup[1], set()).add(tup[0])
    return indexed

  graph = cite_graph(GRAPH_CSV)
  start = 2009
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  table = PrettyTable(["conf", "# Accepted", "# from PC", "Percentage"])
  all_papers, all_pc_papers = 0, 0
  for conference in mysql.get_conferences():
    committees_by_year = index_by_year(p_committees[conference.id])
    papers_by_year = index_by_year(p_conferences[conference.id])
    missing_years = {}  # kept for parity with sibling functions; never read
    pc_count, total = 0, 0
    for year in sorted(committees_by_year.keys(), key=lambda y: int(y)):
      if int(year) < start:
        continue
      year_paper_ids = papers_by_year.get(year, None)
      if year_paper_ids is None:
        missing_years[int(year)] = None
        continue
      committee = committees_by_year[year]
      for pid in year_paper_ids:
        authors = set(graph.paper_nodes[pid].author_ids.strip().split(","))
        # Any committee author makes this a PC paper.
        if authors.intersection(committee):
          pc_count += 1
        total += 1
    table.add_row([conference.acronym, total, pc_count, int(round(100 * pc_count / total, 0))])
    all_papers += total
    all_pc_papers += pc_count
  table.add_row(["all", all_papers, all_pc_papers, int(round(100 * all_pc_papers / all_papers, 0))])
  print("```")
  print(table)
  print("```")
Пример #11
0
def conference_diversity(fig_name, dend_settings, paper_range=None):
    """Render a topics-by-conferences heatmap of topic coverage.

    :param fig_name: base name of the figure written to figs/v2/diversity/.
    :param dend_settings: dendrogram settings (currently unused; the
        make_dendo_heatmap call below is commented out).
    :param paper_range: optional container of years; when given, only
        papers from those years are counted.
    """
    miner, graph, lda_model, vocab = get_graph_lda_data()
    conferences = graph.get_papers_by_venue()
    conference_topics = {}
    conference_heatmaps = {}
    valid_conferences = []
    for conference_id, papers in conferences.items():
        topics = np.array([0] * lda_model.n_topics)
        for tup in yearize(papers).items():
            # tup is (year, paper_ids).
            if paper_range and tup[0] not in paper_range: continue
            for paper_id in tup[1]:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        # Keep only conferences with at least one counted paper.
        if sum(topics) > 0:
            conference_topics[conference_id] = percent_sort(topics)
            conference_heatmaps[conference_id] = topics
            valid_conferences.append(conference_id)
    row_labels = [
        str(ind) + "-" + name
        for ind, name in zip(range(lda_model.n_topics), TOPICS)
    ]
    # row_labels = ["%2d" % ind for ind in range(lda_model.n_topics)]
    column_labels = [
        c.acronym for c in mysql.get_conferences() if c.id in valid_conferences
    ]
    # Heatmap
    heatmap_arr = []
    for conference_id in sorted(conference_heatmaps.keys(),
                                key=lambda x: int(x)):
        tot = sum(conference_heatmaps[conference_id])
        dist = [top / tot for top in conference_heatmaps[conference_id]]
        heatmap_arr.append(dist)
    report(lda_model, vocab, 15)
    # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
    #                    "figs/v2/diversity/%s_dend.png" % fig_name, dend_settings)
    make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                 "figs/v2/diversity/%s.png" % fig_name)
Пример #12
0
def conference_diversity():
  """Plot topic diversity across all conferences: a stacked bar chart of
  per-conference topic coverage, a topics-vs-conferences pcolor heatmap,
  the top words per topic, and a second heatmap via make_heatmap."""
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  # 7-topic LDA; alpha/beta presumably tuned elsewhere -- TODO confirm.
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  for conference_id, papers in conferences.items():
    topics = np.array([0] * lda_model.n_topics)
    for tup in yearize(papers).items():
      # tup is (year, paper_ids); all years are counted here.
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference_id] = percent_sort(topics)
    conference_heatmaps[conference_id] = topics
  #fig, ax = plt.subplots()
  bar_vals = []
  colors = []
  width = 0.75
  plts = []
  x_axis = np.arange(1, len(conference_topics.keys())+1)
  #x_axis = [c.acronym for c in mysql.get_conferences()]
  y_offset = np.array([0]*len(conference_topics.keys()))
  colors_dict = {}
  # Stacked bars: one layer per topic rank, stacked via y_offset.
  for index in range(7):
    bar_val = []
    color = []
    for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
      topic = conference_topics[conference_id][index]
      colors_dict[topic[0]] = get_color(topic[0])
      color.append(colors_dict[topic[0]])
      bar_val.append(topic[1])
    plts.append(plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
    y_offset = np.add(y_offset, bar_val)
  plt.ylabel("Topic Coverage %")
  plt.xlabel("Conferences")
  plt.xticks(x_axis+width/2, [c.acronym for c in mysql.get_conferences()])
  plt.yticks(np.arange(0, 101, 10))
  plt.ylim([0,101])
  #Legends
  patches = []
  for topic, color in colors_dict.items():
    patches.append(mpatches.Patch(color=color, label='Topic %s' % str(topic)))
  plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=6, fontsize=7)
  plt.savefig("figs/diversity/conference_diversity_7topics.png")
  plt.clf()
  n_top_words = 10
  #Heatmap
  heatmap_arr = []
  for conference_id in sorted(conference_heatmaps.keys(), key=lambda x: int(x)):
    tot = sum(conference_heatmaps[conference_id])
    # NOTE(review): divides even when tot == 0 -- verify all conferences
    # have at least one paper.
    dist = [top/tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  fig, ax = plt.subplots()
  heatmap_arr = np.array(heatmap_arr)
  heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
  plt.ylabel("Conferences")
  plt.xlabel("Topics")
  # row_labels = range(lda_model.n_topics)
  t_names= ["Testing", "Applications", "Program Analysis", "Tools and Projects",
            "Defect Analysis", "Modeling", "Maintenance"]
  row_labels = [str(ind) + "-" + name for ind, name in zip(range(lda_model.n_topics), t_names)]
  column_labels = [c.acronym for c in mysql.get_conferences()]
  ax.set_xticks(np.arange(heatmap_arr.shape[1])+0.5, minor=False)
  ax.set_yticks(np.arange(heatmap_arr.shape[0])+0.5, minor=False)
  ax.set_xticklabels(row_labels, minor=False)
  ax.set_yticklabels(column_labels, minor=False)
  plt.savefig("figs/diversity/heatmap_7topics.png")
  plt.clf()
  # Print the n_top_words highest-weight words for each topic.
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/dend_heatmap_7topics.png")
  make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/heatmap2.png")
Пример #13
0
def pc_bias():
    """Plot, for ICSE/MSR/FSE/ASE, the yearly percentage of papers with at
    least one PC author as grouped bars, saved to figs/pc/pc.png."""
    def index_by_year(tups):
        # Group (member, year) tuples into {year: set(members)}.
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]
    colors = ['r', 'g', 'b', 'y']
    graph = cite_graph(GRAPH_CSV)
    width = 0.5
    space = 0.3
    p_conferences = graph.get_papers_by_venue()
    p_committees = graph.get_committee_by_conference()
    max_len = 21
    low = 1
    high = max_len * (len(legit_conferences) * width + space) + 1
    delta = (high - low) / max_len
    x_axis = np.arange(low, high, delta)
    # X tick labels span 21 years starting at 1993.
    x_ticks = np.arange(1993, 1993 + max_len)
    conf_index = 0
    patches = []
    for conference in mysql.get_conferences():
        # Only the four conferences listed above are plotted.
        if conference.acronym not in legit_conferences: continue
        year_committees = index_by_year(p_committees[conference.id])
        year_papers = index_by_year(p_conferences[conference.id])
        year_scores = {}
        y_axis = []
        #x_axis = np.arange(1, len(year_committees.keys())+1)
        for year in sorted(year_committees.keys(), key=lambda y: int(y)):
            papers = year_papers.get(year, None)
            if papers is None:
                # No papers recorded for this committee year.
                y_axis.append(0)
                continue
            committee = year_committees[year]
            comm_papers = 0
            non_comm_papers = 0
            for paper_id in papers:
                paper = graph.paper_nodes[paper_id]
                author_ids = set(paper.author_ids.strip().split(","))
                # A paper counts as a PC paper when any author is on the
                # committee for that year.
                if author_ids.intersection(committee):
                    comm_papers += 1
                else:
                    non_comm_papers += 1
            year_scores[year] = (comm_papers, non_comm_papers)
            percent = 0 if not comm_papers else comm_papers * 100 / (
                comm_papers + non_comm_papers)
            y_axis.append(percent)
        # Left-pad with zeros so every conference spans max_len years.
        y_axis = np.lib.pad(y_axis, (max_len - len(y_axis), 0),
                            'constant',
                            constant_values=0)
        plt.bar(x_axis + conf_index * width,
                y_axis,
                width=width,
                color=colors[conf_index])
        patches.append(
            mpatches.Patch(color=colors[conf_index], label=conference.acronym))
        conf_index += 1
    plt.xlabel("Year")
    plt.ylabel("% of papers by PC")
    plt.xticks(x_axis + len(legit_conferences) * width / 2,
               [str(y)[2:] for y in x_ticks])
    #plt.yticks(np.arange(0, 100, 10))
    #plt.title(conference.acronym)
    plt.legend(handles=patches,
               loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               ncol=len(legit_conferences),
               fontsize=7)
    plt.savefig("figs/pc/pc.png")
    plt.clf()
Пример #14
0
def conference_evolution():
  """Plot a 3x3 grid of stacked-bar subplots showing, per conference, how
  topic coverage evolves over the years; save under figs/evolution/ and
  print the top words per topic."""
  legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]  # unused (filter commented out below)
  non_legit_conferences = ["GPCE", "FASE"]
  TOP_TOPIC_COUNT = 7
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  # 7-topic LDA; alpha/beta presumably tuned elsewhere -- TODO confirm.
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  f, subplts = plt.subplots(3, 3)
  f.tight_layout()
  # (x_counter, y_counter) walk the 3x3 subplot grid row by row.
  y_counter = -1
  x_counter = 0
  for conf_index, conference in enumerate(mysql.get_conferences()):
    # if conference.acronym not in legit_conferences: continue
    if conference.acronym in non_legit_conferences: continue
    y_counter += 1
    if y_counter > 2:
      x_counter += 1
      y_counter = 0
    year_topics = {}
    year_heatmaps = {}
    for year, papers in yearize(conferences[conference.id]).items():
      topics = np.array([0]*lda_model.n_topics)
      for paper_id in papers:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
      year_heatmaps[year] = topics
      year_topics[year] = percent_sort(topics)
    width = 0.8
    x_axis = np.arange(1, len(year_topics.keys()) + 1)
    # x_axis = [c.acronym for c in mysql.get_conferences()]
    y_offset = np.array([0] * len(year_topics.keys()))
    colors_dict={}
    # Stacked bars: one layer per topic rank, stacked via y_offset.
    for index in range(TOP_TOPIC_COUNT):
      bar_val, color = [], []
      for year in sorted(year_topics.keys(), key=lambda x:int(x)):
        topic = year_topics[year][index]
        colors_dict[topic[0]] = get_color(topic[0])
        color.append(colors_dict[topic[0]])
        bar_val.append(topic[1])
      subplts[x_counter, y_counter].bar(x_axis, bar_val, width, color=color, bottom=y_offset)
      y_offset = np.add(y_offset, bar_val)
    # subplts[x_counter, y_counter].set_ylabel("Topic Coverage %")
    #subplts[x_counter, y_counter].set_xlabel("Conferences")
    # For long year ranges, label every other year to avoid crowding.
    if len(year_topics.keys()) <= 14:
      subplts[x_counter, y_counter].set_xticks(x_axis + width / 2)
      subplts[x_counter, y_counter].set_xticklabels([str(y)[2:] for y in sorted(year_topics.keys(), key=lambda x: int(x))], fontsize=7)
    else:
      subplts[x_counter, y_counter].set_xticks(np.arange(1, len(year_topics.keys()) + 1, 2) + width / 2)
      subplts[x_counter, y_counter].set_xticklabels([str(y)[2:] for index, y in enumerate(sorted(year_topics.keys(), key=lambda x: int(x))) if index%2 == 0], fontsize=7)

    subplts[x_counter, y_counter].set_yticks(np.arange(0, 101, 20))
    subplts[x_counter, y_counter].set_ylim([0,101])
    subplts[x_counter, y_counter].set_title(conference.acronym)
  # Legends
  patches = []
  labels = []
  for topic in xrange(lda_model.n_topics):
    patches.append(mpatches.Patch(color=get_color(topic)))
    labels.append('Topic %s' % str(topic))
  f.legend(handles=patches, labels=labels, loc='upper center', bbox_to_anchor=(0.5, 0.04), ncol=12, fontsize=7)
  plt.savefig("figs/evolution/evolution_7topics.png")
  plt.clf()
  n_top_words = 10
  # Print the n_top_words highest-weight words for each topic.
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Пример #15
0
def conference_evolution():
    """Plot, per conference, a stacked bar chart of yearly topic coverage.

    Fits a 7-topic LDA on the citation-graph corpus, then draws a 3x3 grid
    of subplots (one per conference, skipping GPCE and FASE). Each year is a
    stacked bar of the top TOP_TOPIC_COUNT topic percentages, colored per
    topic. Saves the figure to figs/evolution/evolution_7topics.png and
    prints the 10 most probable words of each topic.

    NOTE(review): `xrange` below implies this snippet targets Python 2.
    """
    legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]  # unused: filter below is commented out
    non_legit_conferences = ["GPCE", "FASE"]
    TOP_TOPIC_COUNT = 7
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    # LDA hyper-parameters mirror the other snippets in this file.
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    conferences = graph.get_papers_by_venue()
    f, subplts = plt.subplots(3, 3)
    f.tight_layout()
    # (x_counter, y_counter) walk the 3x3 subplot grid row by row.
    y_counter = -1
    x_counter = 0
    for conf_index, conference in enumerate(mysql.get_conferences()):
        # if conference.acronym not in legit_conferences: continue
        if conference.acronym in non_legit_conferences: continue
        y_counter += 1
        if y_counter > 2:
            x_counter += 1
            y_counter = 0
        year_topics = {}
        year_heatmaps = {}
        for year, papers in yearize(conferences[conference.id]).items():
            # Accumulate topic counts over every paper of this year.
            topics = np.array([0] * lda_model.n_topics)
            for paper_id in papers:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
            year_heatmaps[year] = topics
            year_topics[year] = percent_sort(topics)
        width = 0.8
        x_axis = np.arange(1, len(year_topics.keys()) + 1)
        # x_axis = [c.acronym for c in mysql.get_conferences()]
        y_offset = np.array([0] * len(year_topics.keys()))
        colors_dict = {}
        # One stacked layer per rank: the index-th largest topic of each year.
        for index in range(TOP_TOPIC_COUNT):
            bar_val, color = [], []
            for year in sorted(year_topics.keys(), key=lambda x: int(x)):
                topic = year_topics[year][index]
                colors_dict[topic[0]] = get_color(topic[0])
                color.append(colors_dict[topic[0]])
                bar_val.append(topic[1])
            subplts[x_counter, y_counter].bar(x_axis,
                                              bar_val,
                                              width,
                                              color=color,
                                              bottom=y_offset)
            y_offset = np.add(y_offset, bar_val)
        # subplts[x_counter, y_counter].set_ylabel("Topic Coverage %")
        #subplts[x_counter, y_counter].set_xlabel("Conferences")
        # Label every year when there are few; every other year otherwise.
        if len(year_topics.keys()) <= 14:
            subplts[x_counter, y_counter].set_xticks(x_axis + width / 2)
            subplts[x_counter, y_counter].set_xticklabels([
                str(y)[2:]
                for y in sorted(year_topics.keys(), key=lambda x: int(x))
            ],
                                                          fontsize=7)
        else:
            subplts[x_counter, y_counter].set_xticks(
                np.arange(1,
                          len(year_topics.keys()) + 1, 2) + width / 2)
            subplts[x_counter, y_counter].set_xticklabels([
                str(y)[2:] for index, y in enumerate(
                    sorted(year_topics.keys(), key=lambda x: int(x)))
                if index % 2 == 0
            ],
                                                          fontsize=7)

        subplts[x_counter, y_counter].set_yticks(np.arange(0, 101, 20))
        subplts[x_counter, y_counter].set_ylim([0, 101])
        subplts[x_counter, y_counter].set_title(conference.acronym)
    # Legends
    patches = []
    labels = []
    for topic in xrange(lda_model.n_topics):
        patches.append(mpatches.Patch(color=get_color(topic)))
        labels.append('Topic %s' % str(topic))
    f.legend(handles=patches,
             labels=labels,
             loc='upper center',
             bbox_to_anchor=(0.5, 0.04),
             ncol=12,
             fontsize=7)
    plt.savefig("figs/evolution/evolution_7topics.png")
    plt.clf()
    # Print the 10 most probable words per topic for manual inspection.
    n_top_words = 10
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Пример #16
0
def pc_bias():
  """Bar chart of the yearly share of papers co-authored by the PC.

  For ICSE, MSR, FSE and ASE, computes per year the percentage of papers
  with at least one program-committee co-author and plots grouped bars over
  a fixed 21-slot axis starting at 1993. Saves to figs/pc/pc.png.
  """
  def index_by_year(tups):
    # Group first tuple elements (ids) into sets keyed by the year (tup[1]).
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]
  colors = ['r', 'g', 'b', 'y']
  graph = cite_graph(GRAPH_CSV)
  width = 0.5
  space = 0.3
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  # Fixed x layout: 21 year slots, each wide enough for 4 bars plus spacing.
  max_len = 21
  low = 1
  high = max_len * (len(legit_conferences)*width + space) + 1
  delta = (high - low)/max_len
  x_axis = np.arange(low, high, delta)
  x_ticks = np.arange(1993, 1993+max_len)
  conf_index = 0
  patches = []
  for conference in mysql.get_conferences():
    if conference.acronym not in legit_conferences: continue
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    y_axis = []
    #x_axis = np.arange(1, len(year_committees.keys())+1)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      papers = year_papers.get(year,None)
      if papers is None:
        # PC existed but no papers recorded for this year.
        y_axis.append(0)
        continue
      committee = year_committees[year]
      comm_papers = 0
      non_comm_papers = 0
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        if author_ids.intersection(committee):
          comm_papers += 1
        else:
          non_comm_papers += 1
      year_scores[year] = (comm_papers, non_comm_papers)
      # NOTE(review): `xrange` elsewhere implies Python 2, where this `/` is
      # a floor (integer percent); under Python 3 it would be a float.
      percent = 0 if not comm_papers else comm_papers*100/(comm_papers+non_comm_papers)
      y_axis.append(percent)
    # Left-pad with zeros so every conference spans all 21 year slots.
    y_axis = np.lib.pad(y_axis, (max_len-len(y_axis), 0), 'constant', constant_values=0)
    plt.bar(x_axis+conf_index*width, y_axis, width=width, color=colors[conf_index])
    patches.append(mpatches.Patch(color=colors[conf_index], label=conference.acronym))
    conf_index += 1
  plt.xlabel("Year")
  plt.ylabel("% of papers by PC")
  plt.xticks(x_axis + len(legit_conferences)*width/2, [str(y)[2:] for y in x_ticks])
  #plt.yticks(np.arange(0, 100, 10))
  #plt.title(conference.acronym)
  plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=len(legit_conferences), fontsize=7)
  plt.savefig("figs/pc/pc.png")
  plt.clf()
Пример #17
0
def conference_diversity():
    """Stacked-bar and heatmap views of topic diversity across conferences.

    Fits a 7-topic LDA, accumulates topic counts per conference, then:
    (1) plots each conference as a stacked bar of its ranked topic
    percentages (figs/diversity/conference_diversity_7topics.png);
    (2) plots a conference x topic heatmap of topic shares
    (figs/diversity/heatmap_7topics.png) plus a second one via make_heatmap;
    and prints the top-10 words per topic.
    """
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    conferences = graph.get_papers_by_venue()
    conference_topics = {}
    conference_heatmaps = {}
    for conference_id, papers in conferences.items():
        # Sum topic counts over every paper of the conference (all years).
        topics = np.array([0] * lda_model.n_topics)
        for tup in yearize(papers).items():
            for paper_id in tup[1]:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        conference_topics[conference_id] = percent_sort(topics)
        conference_heatmaps[conference_id] = topics
    #fig, ax = plt.subplots()
    bar_vals = []
    colors = []
    width = 0.75
    plts = []
    x_axis = np.arange(1, len(conference_topics.keys()) + 1)
    #x_axis = [c.acronym for c in mysql.get_conferences()]
    y_offset = np.array([0] * len(conference_topics.keys()))
    colors_dict = {}
    # One stacked layer per rank: the index-th largest topic per conference.
    for index in range(7):
        bar_val = []
        color = []
        for conference_id in sorted(conference_topics.keys(),
                                    key=lambda x: int(x)):
            topic = conference_topics[conference_id][index]
            colors_dict[topic[0]] = get_color(topic[0])
            color.append(colors_dict[topic[0]])
            bar_val.append(topic[1])
        plts.append(
            plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
        y_offset = np.add(y_offset, bar_val)
    plt.ylabel("Topic Coverage %")
    plt.xlabel("Conferences")
    # NOTE(review): assumes mysql.get_conferences() order matches the
    # numerically sorted conference ids used above — confirm.
    plt.xticks(x_axis + width / 2,
               [c.acronym for c in mysql.get_conferences()])
    plt.yticks(np.arange(0, 101, 10))
    plt.ylim([0, 101])
    #Legends
    patches = []
    for topic, color in colors_dict.items():
        patches.append(
            mpatches.Patch(color=color, label='Topic %s' % str(topic)))
    plt.legend(handles=patches,
               loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               ncol=6,
               fontsize=7)
    plt.savefig("figs/diversity/conference_diversity_7topics.png")
    plt.clf()
    n_top_words = 10
    #Heatmap
    heatmap_arr = []
    for conference_id in sorted(conference_heatmaps.keys(),
                                key=lambda x: int(x)):
        tot = sum(conference_heatmaps[conference_id])
        # NOTE(review): a venue with zero accumulated topic counts would make
        # this divide by zero — assumes every venue has at least one paper.
        dist = [top / tot for top in conference_heatmaps[conference_id]]
        heatmap_arr.append(dist)
    fig, ax = plt.subplots()
    heatmap_arr = np.array(heatmap_arr)
    heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
    plt.ylabel("Conferences")
    plt.xlabel("Topics")
    # row_labels = range(lda_model.n_topics)
    # Human-readable topic names; order must match the LDA topic indices.
    t_names = [
        "Testing", "Applications", "Program Analysis", "Tools and Projects",
        "Defect Analysis", "Modeling", "Maintenance"
    ]
    row_labels = [
        str(ind) + "-" + name
        for ind, name in zip(range(lda_model.n_topics), t_names)
    ]
    column_labels = [c.acronym for c in mysql.get_conferences()]
    # Despite the names, row_labels (topics) go on x and column_labels
    # (conferences) on y, matching the axis labels above.
    ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
    ax.set_xticklabels(row_labels, minor=False)
    ax.set_yticklabels(column_labels, minor=False)
    plt.savefig("figs/diversity/heatmap_7topics.png")
    plt.clf()
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
    # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/dend_heatmap_7topics.png")
    make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                 "figs/diversity/heatmap2.png")
Пример #18
0
        return name_map[name]
    return name


def yearize(paps):
    """Group paper ids by integer year, in ascending year order.

    `paps` is an iterable of (paper_id, year) tuples. Within each year the
    paper ids keep their original input order.
    """
    grouped = {}
    for pap in paps:
        grouped.setdefault(int(pap[1]), []).append(pap[0])
    return OrderedDict(sorted(grouped.items()))


# Shortened display acronyms for every venue in the database.
CONFERENCES = [
    shorter_names(conf.acronym) for conf in mysqldb.get_conferences()
]


@Memoized
def retrieve_graph_lda_data():
    """Build cache-file paths for the stored graph/LDA artifacts.

    Paths are keyed by THE.version and THE.permitted.
    NOTE(review): this snippet appears truncated — only the path
    construction is visible; the load/return logic is missing.
    """
    graph_file = 'cache/%s/%s/graph.pkl' % (THE.version, THE.permitted)
    vectorizer_file = 'cache/%s/%s/vectorizer.pkl' % (THE.version,
                                                      THE.permitted)
    doc_2_vec_file = 'cache/%s/%s/doc_2_vec.pkl' % (THE.version, THE.permitted)
    documents_file = 'cache/%s/%s/documents.pkl' % (THE.version, THE.permitted)
    lda_model_file = 'cache/%s/%s/lda_model.pkl' % (THE.version, THE.permitted)
Пример #19
0
def pc_heatmap_delta(fig_name, title=None, paper_range=None):
  """Heatmap of per-topic % delta between PC-authored papers and all papers.

  For each conference, accumulates topic counts over all papers and over the
  subset co-authored by that year's program committee, then plots
  100 * (pc_share - overall_share) / overall_share per topic as a discrete
  red/white/green heatmap saved to figs/v3/<THE.permitted>/pc/<fig_name>.png.
  Also dumps the raw delta matrix to temp.csv.

  :param fig_name: file name (without extension) for the saved figure.
  :param title: optional plot title; a default caption is used when None.
  :param paper_range: optional iterable of years (ints); others are skipped.
  """
  def index_by_year(tups):
    # Group first tuple elements (ids) into sets keyed by the year (tup[1]).
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  miner, graph, lda_model, vocab = get_graph_lda_data()
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conference_topics = {}
  pc_conference_topics = {}
  for conference in mysqldb.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    topics = np.array([0] * lda_model.n_topics)
    pc_topics = np.array([0] * lda_model.n_topics)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if (paper_range is not None) and (int(year) not in paper_range):
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        paper_topics = miner.documents[paper_id].topics_count
        # Count towards PC topics only when a PC member co-authored the paper.
        if len(author_ids.intersection(committee)) != 0:
          pc_topics = np.add(pc_topics, paper_topics)
        topics = np.add(topics, paper_topics)
    pc_conference_topics[conference.id] = pc_topics
    conference_topics[conference.id] = topics
  heatmap_arr = []
  valid_conferences = []
  for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
    tot = sum(conference_topics[conference_id])
    pc_tot = sum(pc_conference_topics[conference_id])
    if tot <= 0 or pc_tot <= 0:
      continue  # no papers (or no PC papers) in range for this venue
    valid_conferences.append(conference_id)
    dist = [top / tot for top in conference_topics[conference_id]]
    pc_dist = [top / pc_tot for top in pc_conference_topics[conference_id]]
    # Relative delta in percent. NOTE(review): assumes d > 0 for every topic,
    # otherwise this divides by zero — confirm all topics are represented.
    heatmap_arr.append([int(round(100 * (pc_d - d) / d, 0)) for d, pc_d in zip(dist, pc_dist)])

  np.savetxt("temp.csv", np.transpose(np.array(heatmap_arr)), delimiter=",")
  # HeatMap
  row_labels = TOPICS_ALL
  col_labels = [c.acronym for c in mysqldb.get_conferences() if c.id in valid_conferences]
  # BUG FIX: `np.int` (a deprecated alias of builtin int) was removed in
  # NumPy 1.24; use the builtin int, which is what the alias always meant.
  heatmap_arr = np.transpose(np.array(heatmap_arr, int))
  plt.figure(figsize=(4, 3))
  cmap = mpl.colors.ListedColormap(['red', 'lightsalmon', 'white', 'palegreen', 'lime'])
  bounds = [-20, -12, -5, 5, 12, 20]
  norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
  cax = plt.matshow(heatmap_arr, interpolation='nearest', cmap=cmap, norm=norm)
  for (i, j), z in np.ndenumerate(heatmap_arr):
    # Magnitude only in the cell text; the sign is conveyed by the color.
    plt.text(j, i, abs(z), ha='center', va='center', fontsize=11)
  ticks = [-20, -10, 0, 10, 20]
  plt.colorbar(cax, cmap=cmap, norm=norm, boundaries=bounds, ticks=ticks)
  plt.xticks(np.arange(len(list(col_labels))), list(col_labels), rotation="vertical")
  plt.yticks(np.arange(len(list(row_labels))), list(row_labels))
  if title is None:
    title = "Topic Distribution Delta between papers by PC and all papers"
  plt.title(title, y=1.2)
  plt.savefig("figs/v3/%s/pc/%s.png" % (THE.permitted, fig_name), bbox_inches='tight')
  plt.clf()
Пример #20
0
def get_n_topics():
  """Return the LDA topic count for the active corpus.

  'journals' uses 7 topics, 'all' uses 11; any other THE.permitted value
  yields None (the original fall-through behavior).
  """
  return {"journals": 7, "all": 11}.get(THE.permitted)


# Domain/venue boilerplate terms excluded from the vocabulary, on top of
# sklearn's English stop-word list. NOTE(review): 'article' appears twice in
# this list; harmless, since union() produces a set.
STOP_WORDS = text.ENGLISH_STOP_WORDS.union(['software', 'engineering', 'paper', 'study', 'based',
                                            'results', 'approach', 'case', 'workshop', 'international', 'research',
                                            'conference', 'introduction', 'editors', 'article', 'issue', 'month',
                                            'copyright', 'special', 'used', 'using', 'use', 'studies', 'review',
                                            'editorial', 'report', 'book', 'ieee', 'published', 'science', 'column',
                                            'author', 'proposed', 'icse', 'article', 'year', 'articles', 'page', '2000',
                                            '2004', 'papers', 'computer', 'held', 'editor'])

# Acronyms of all conferences in the database.
CONFERENCES = [venue.acronym for venue in mysqldb.get_conferences()]

# Config
# Global configuration object; `permitted` selects the corpus ("all" here).
THE = O()
THE.permitted = "all"


def is_not_none(s):
  """Truthiness test that also rejects the literal string 'None'.

  Mirrors `s and s != 'None'`: a falsy input is returned unchanged;
  otherwise returns the bool of whether s differs from the string 'None'.
  """
  if not s:
    return s
  return s != 'None'


def harmonic_dist(n):
  """Return the first n harmonic weights 1/1, 1/2, ..., 1/n normalized to sum to 1."""
  weights = []
  total = 0.0
  for k in range(1, n + 1):
    w = 1 / k
    weights.append(w)
    total += w
  return [w / total for w in weights]
Пример #21
0
def is_true(val):
  """True when *val* equals the boolean True or the strings 'True'/'true'."""
  return any(val == truthy for truthy in (True, 'True', 'true'))


def shorter_names(name):
  """Map a few long venue acronyms to compact display labels; others pass through."""
  replacements = {
      "SOFTWARE": "S/W",
      "SIGSOFT": "NOTES",
      "MODELS": "MDLS",
  }
  return replacements.get(name, name)


# Shortened display acronyms for every venue in the database.
CONFERENCES = [shorter_names(conf.acronym) for conf in mysqldb.get_conferences()]


@Memoized
def retrieve_graph_lda_data():
  """
  Fetch stored metadata
  :return:
  """
  graph_file = 'cache/%s/%s/graph.pkl' % (THE.version, THE.permitted)
  vectorizer_file = 'cache/%s/%s/vectorizer.pkl' % (THE.version, THE.permitted)
  doc_2_vec_file = 'cache/%s/%s/doc_2_vec.pkl' % (THE.version, THE.permitted)
  documents_file = 'cache/%s/%s/documents.pkl' % (THE.version, THE.permitted)
  lda_model_file = 'cache/%s/%s/lda_model.pkl' % (THE.version, THE.permitted)
  vocabulary_file = 'cache/%s/%s/vocabulary.pkl' % (THE.version, THE.permitted)
  if os.path.isfile(graph_file) and os.path.isfile(vectorizer_file) \
Пример #22
0
def conference_evolution_2(paper_range, figname):
  """Dendrogram-heatmap of per-conference topic shares over a year range.

  Fits a 7-topic LDA, accumulates topic counts per conference restricted to
  years in `paper_range`, renders an intermediate heatmap
  (figs/diversity/heatmap_7topics.png), prints the top-10 words per topic,
  and writes a dendrogram heatmap to figs/evolution/<figname>.png.

  :param paper_range: collection of (int) years to include.
  :param figname: output file name (without extension) for the dendro heatmap.
  """
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  for conference_id, papers in conferences.items():
    # Sum topic counts over papers whose year falls inside paper_range.
    topics = np.array([0] * lda_model.n_topics)
    for tup in yearize(papers).items():
      if tup[0] not in paper_range:
        continue
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference_id] = percent_sort(topics)
    conference_heatmaps[conference_id] = topics
  n_top_words = 10
  #Heatmap
  heatmap_arr = []
  column_labels = []
  # NOTE(review): zip() assumes mysql.get_conferences() is ordered the same
  # way as the numerically sorted conference ids — confirm.
  for conference_id, conf in zip(sorted(conference_heatmaps.keys(), key=lambda x: int(x)), mysql.get_conferences()):
    tot = sum(conference_heatmaps[conference_id])
    if tot == 0: continue  # venue has no papers in the requested range
    column_labels.append(conf.acronym)
    dist = [top/tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  fig, ax = plt.subplots()
  heatmap_arr = np.array(heatmap_arr)
  heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
  plt.ylabel("Conferences")
  plt.xlabel("Topics")
  row_labels = range(lda_model.n_topics)
  ax.set_xticks(np.arange(heatmap_arr.shape[1])+0.5, minor=False)
  ax.set_yticks(np.arange(heatmap_arr.shape[0])+0.5, minor=False)
  ax.set_xticklabels(row_labels, minor=False)
  ax.set_yticklabels(column_labels, minor=False)
  # NOTE(review): overwrites the same path used by conference_diversity().
  plt.savefig("figs/diversity/heatmap_7topics.png")
  plt.clf()
  # Print the 10 most probable words per topic for manual inspection.
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/dend_heatmap_7topics.png")
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/evolution/%s.png"%figname)