Пример #1
0
def retrieve_graph_lda_data():
  """
  Load the cached graph, miner state, LDA model and vocabulary.

  The cache is only used when all six pickle files under
  ``cache/<THE.permitted>/`` exist; if any one is missing, everything is
  produced again through ``store_graph_lda_data()``.

  NOTE(review): unpickling can execute arbitrary code, so the cache directory
  must only ever hold files written by this project.

  :return: tuple ``(miner, graph, lda_model, vocab)``
  """
  graph_file = 'cache/%s/graph.pkl' % THE.permitted
  vectorizer_file = 'cache/%s/vectorizer.pkl' % THE.permitted
  doc_2_vec_file = 'cache/%s/doc_2_vec.pkl' % THE.permitted
  documents_file = 'cache/%s/documents.pkl' % THE.permitted
  lda_model_file = 'cache/%s/lda_model.pkl' % THE.permitted
  vocabulary_file = 'cache/%s/vocabulary.pkl' % THE.permitted
  cache_files = (graph_file, vectorizer_file, doc_2_vec_file,
                 documents_file, lda_model_file, vocabulary_file)
  if all(os.path.isfile(path) for path in cache_files):
    # Pickle streams are binary: text mode fails on Python 3 and can corrupt
    # the data on platforms that translate line endings.
    with open(graph_file, 'rb') as f:
      graph = pkl.load(f)
    miner = Miner(graph)
    with open(vectorizer_file, 'rb') as f:
      miner.vectorizer = pkl.load(f)
    with open(doc_2_vec_file, 'rb') as f:
      miner.doc_2_vec = joblib.load(f)
    with open(documents_file, 'rb') as f:
      miner.documents = pkl.load(f)
    with open(lda_model_file, 'rb') as f:
      lda_model = pkl.load(f)
    with open(vocabulary_file, 'rb') as f:
      vocab = pkl.load(f)
  else:
    miner, graph, lda_model, vocab = store_graph_lda_data()
  return miner, graph, lda_model, vocab
Пример #2
0
def retrieve_graph_lda_data():
  """
  Load the cached graph, miner state, LDA model and vocabulary.

  The cache is only used when all six pickle files under
  ``cache/<THE.version>/<THE.permitted>/`` exist; if any one is missing,
  everything is produced again through ``store_graph_lda_data()``.

  NOTE(review): unpickling can execute arbitrary code, so the cache directory
  must only ever hold files written by this project.

  :return: tuple ``(miner, graph, lda_model, vocab)``
  """
  graph_file = 'cache/%s/%s/graph.pkl' % (THE.version, THE.permitted)
  vectorizer_file = 'cache/%s/%s/vectorizer.pkl' % (THE.version, THE.permitted)
  doc_2_vec_file = 'cache/%s/%s/doc_2_vec.pkl' % (THE.version, THE.permitted)
  documents_file = 'cache/%s/%s/documents.pkl' % (THE.version, THE.permitted)
  lda_model_file = 'cache/%s/%s/lda_model.pkl' % (THE.version, THE.permitted)
  vocabulary_file = 'cache/%s/%s/vocabulary.pkl' % (THE.version, THE.permitted)
  cache_files = (graph_file, vectorizer_file, doc_2_vec_file,
                 documents_file, lda_model_file, vocabulary_file)
  if all(os.path.isfile(path) for path in cache_files):
    # Pickle streams are binary: text mode fails on Python 3 and can corrupt
    # the data on platforms that translate line endings.
    with open(graph_file, 'rb') as f:
      graph = cPkl.load(f)
    miner = Miner(graph)
    with open(vectorizer_file, 'rb') as f:
      miner.vectorizer = cPkl.load(f)
    with open(doc_2_vec_file, 'rb') as f:
      miner.doc_2_vec = joblib.load(f)
    with open(documents_file, 'rb') as f:
      miner.documents = cPkl.load(f)
    with open(lda_model_file, 'rb') as f:
      lda_model = cPkl.load(f)
    with open(vocabulary_file, 'rb') as f:
      vocab = cPkl.load(f)
  else:
    miner, graph, lda_model, vocab = store_graph_lda_data()
  return miner, graph, lda_model, vocab
Пример #3
0
def get_top_papers():
    """
    Print, for each of the 7 LDA topics, its four highest-ranked papers.

    Every paper is filed under the topic holding the largest entry of its
    ``topics_count``; papers whose largest entry is zero are left out. Within
    a topic the papers are ordered by ``local_cites`` (descending), ties being
    broken by the remaining tuple fields.
    """
    n_topics = 7
    citation_graph = cite_graph(GRAPH_CSV)
    topic_miner = Miner(citation_graph)
    lda_model, vocab = topic_miner.lda(7,
                                       n_iter=100,
                                       alpha=0.847433736937,
                                       beta=0.763774618977)
    ranked = dict((topic_id, []) for topic_id in range(n_topics))
    for paper_id, paper in citation_graph.paper_nodes.items():
        counts = topic_miner.documents[paper_id].topics_count
        if max(counts) != 0:
            dominant = counts.argmax()
            record = (paper.local_cites, paper.title, paper.authors,
                      paper.year)
            # Each record is stored wrapped in a one-element list.
            ranked[dominant].append([record])
    for topic_id in range(n_topics):
        ranked[topic_id] = sorted(ranked[topic_id], reverse=True)[:4]
        print("***", topic_id, "***")
        for wrapped in ranked[topic_id]:
            cites, title, authors, year = wrapped[0]
            print(cites, year + " - " + title + ", " + authors)
Пример #4
0
def retrieve_graph_lda_data():
    """
    Load the cached graph, miner state, LDA model and vocabulary.

    The cache is only used when all six pickle files under
    ``cache/<THE.version>/<THE.permitted>/`` exist; if any one is missing,
    everything is produced again through ``store_graph_lda_data()``.

    NOTE(review): unpickling can execute arbitrary code, so the cache
    directory must only ever hold files written by this project.

    :return: tuple ``(miner, graph, lda_model, vocab)``
    """
    graph_file = 'cache/%s/%s/graph.pkl' % (THE.version, THE.permitted)
    vectorizer_file = 'cache/%s/%s/vectorizer.pkl' % (THE.version,
                                                      THE.permitted)
    doc_2_vec_file = 'cache/%s/%s/doc_2_vec.pkl' % (THE.version, THE.permitted)
    documents_file = 'cache/%s/%s/documents.pkl' % (THE.version, THE.permitted)
    lda_model_file = 'cache/%s/%s/lda_model.pkl' % (THE.version, THE.permitted)
    vocabulary_file = 'cache/%s/%s/vocabulary.pkl' % (THE.version,
                                                      THE.permitted)
    cache_files = (graph_file, vectorizer_file, doc_2_vec_file,
                   documents_file, lda_model_file, vocabulary_file)
    if all(os.path.isfile(path) for path in cache_files):
        # Pickle streams are binary: text mode fails on Python 3 and can
        # corrupt the data on platforms that translate line endings.
        with open(graph_file, 'rb') as f:
            graph = cPkl.load(f)
        miner = Miner(graph,
                      permitted=THE.permitted,
                      ignores=THE.IGNORE_VENUES[THE.version])
        with open(vectorizer_file, 'rb') as f:
            miner.vectorizer = cPkl.load(f)
        with open(doc_2_vec_file, 'rb') as f:
            miner.doc_2_vec = joblib.load(f)
        with open(documents_file, 'rb') as f:
            miner.documents = cPkl.load(f)
        with open(lda_model_file, 'rb') as f:
            lda_model = cPkl.load(f)
        with open(vocabulary_file, 'rb') as f:
            vocab = cPkl.load(f)
    else:
        miner, graph, lda_model, vocab = store_graph_lda_data()
    return miner, graph, lda_model, vocab
Пример #5
0
def get_graph_lda_data(iterations=ITERATIONS):
    """
    Build the citation graph and run LDA over it.

    :param iterations: forwarded to ``Miner.lda`` as ``n_iter``
    :return: tuple ``(miner, graph, lda_model, vocab)``
    """
    citation_graph = cite_graph(GRAPH_CSV)
    topic_miner = Miner(citation_graph)
    lda_settings = dict(n_iter=iterations, alpha=ALPHA, beta=BETA)
    model, vocabulary = topic_miner.lda(N_TOPICS, **lda_settings)
    return topic_miner, citation_graph, model, vocabulary
Пример #6
0
def lda_topics():
  """
  Run an 11-topic LDA over the citation graph and print, for every topic,
  its 15 highest-weighted vocabulary words (heaviest first).
  """
  citation_graph = cite_graph(GRAPH_CSV)
  topic_miner = Miner(citation_graph)
  model, vocabulary = topic_miner.lda(11, n_iter=100, alpha=0.22359, beta=0.53915)
  top_k = 15
  words = np.array(vocabulary)
  for topic_id, word_weights in enumerate(model.topic_word_):
    # argsort is ascending, so the reversed tail slice yields the top_k
    # largest weights in descending order.
    heaviest_first = np.argsort(word_weights)[:-(top_k + 1):-1]
    print('Topic {}: {}'.format(topic_id, ', '.join(words[heaviest_first])))
Пример #7
0
def get_graph_lda_data():
    """
    Build the citation graph and run LDA over it with the module settings.

    The miner is created with ``THE.permitted`` and the venue ignore list
    selected by ``THE.version``.

    :return: tuple ``(miner, graph, lda_model, vocab)``
    """
    citation_graph = cite_graph(GRAPH_CSV)
    ignored_venues = THE.IGNORE_VENUES[THE.version]
    topic_miner = Miner(citation_graph, THE.permitted, ignored_venues)
    model, vocabulary = topic_miner.lda(get_n_topics(),
                                        n_iter=ITERATIONS,
                                        alpha=ALPHA,
                                        beta=BETA,
                                        stop_words=STOP_WORDS)
    return topic_miner, citation_graph, model, vocabulary
Пример #8
0
def super_author(fig_prefix="super_author", top_percent=1.00):
    """
    Print how many selected authors cover each number of LDA topics.

    An author covers a topic when at least one of their papers has a
    ``topics_count`` entry for that topic of ``TOPIC_THRESHOLD`` or more.
    Only authors contained in ``top_authors(graph, top_percent)`` are counted.

    NOTE(review): the function returns right after printing, so the bar-chart
    and topic-word code after the ``return`` never runs and ``fig_prefix`` is
    effectively unused -- confirm whether the early exit is intentional.

    :param fig_prefix: file-name prefix used by the unreachable plotting code
    :param top_percent: forwarded to ``top_authors`` to select the authors
    :return: None
    """
    citation_graph = cite_graph(GRAPH_CSV)
    topic_miner = Miner(citation_graph)
    lda_model, vocab = topic_miner.lda(7,
                                       n_iter=100,
                                       alpha=0.847433736937,
                                       beta=0.763774618977)
    papers_by_author = citation_graph.get_papers_by_authors()
    selected = top_authors(citation_graph, top_percent)
    coverage = {}
    for author_id, papers in papers_by_author.items():
        if author_id in selected:
            # One 0/1 flag per topic; set once any paper reaches the threshold.
            flags = [0] * lda_model.n_topics
            for paper_id, _, __ in papers:
                counts = topic_miner.documents[paper_id].topics_count
                for topic_id, count in enumerate(counts):
                    if count >= TOPIC_THRESHOLD:
                        flags[topic_id] = 1
            coverage[author_id] = sum(flags)
    plt.figure(figsize=(8, 2), dpi=100)
    # Histogram: number of covered topics -> number of authors.
    histogram = Counter(sorted(coverage.values(), reverse=True))
    bar_x = sorted(histogram.keys())
    bar_y = [histogram[key] for key in bar_x]
    print(bar_x, bar_y)
    return
    # ---- unreachable from here on (see NOTE in the docstring) ----
    fig, ax = plt.subplots()
    ax.bar(bar_x, bar_y, 2 / 3, color='blue', align='center')
    topic_ticks = np.arange(1, lda_model.n_topics + 1)
    ax.set_xticks(topic_ticks)
    ax.set_xticklabels(np.arange(1, lda_model.n_topics + 1))
    plt.xlabel("Topics")
    plt.ylabel("Authors Count")
    plt.savefig("figs/super_author/%s_bar.png" % fig_prefix)
    plt.clf()
    top_k = 10
    for topic_id, word_weights in enumerate(lda_model.topic_word_):
        heaviest_first = np.argsort(word_weights)[:-(top_k + 1):-1]
        print('Topic {}: {}'.format(
            topic_id, ', '.join(np.array(vocab)[heaviest_first])))
Пример #9
0
def lda_topics():
    """
    Run an 11-topic LDA over the citation graph and print, for every topic,
    its 15 highest-weighted vocabulary words (heaviest first).
    """
    citation_graph = cite_graph(GRAPH_CSV)
    topic_miner = Miner(citation_graph)
    model, vocabulary = topic_miner.lda(11,
                                        n_iter=100,
                                        alpha=0.22359,
                                        beta=0.53915)
    top_k = 15
    words = np.array(vocabulary)
    for topic_id, word_weights in enumerate(model.topic_word_):
        # argsort is ascending, so the reversed tail slice yields the top_k
        # largest weights in descending order.
        heaviest_first = np.argsort(word_weights)[:-(top_k + 1):-1]
        print('Topic {}: {}'.format(topic_id,
                                    ', '.join(words[heaviest_first])))
Пример #10
0
def topic_evolution():
  """
  Plot a stacked bar chart of topic shares per publication year and print
  the top words of every topic.

  The per-paper ``topics_count`` vectors are summed by ``paper.year``, each
  yearly sum is passed through ``percent_sort``, and one bar layer is drawn
  per rank position. The figure is written to
  "figs/topic_evolution/topic_evolution_7_gib.png".
  """
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  paper_nodes = graph.paper_nodes
  topics_map = {}
  n_topics = lda_model.n_topics
  # Accumulate the topic counts of all papers published in the same year.
  for paper_id, paper in paper_nodes.items():
    document = miner.documents[paper_id]
    year_topics = topics_map.get(paper.year, np.array([0]*n_topics))
    topics_map[paper.year] = np.add(year_topics, document.topics_count)
  yt_map = {}
  # presumably percent_sort returns (topic, percent) pairs ordered by share;
  # the code below reads element [0] as the topic and [1] as the bar height
  # -- TODO confirm against percent_sort.
  for year, t_count in topics_map.items():
    yt_map[year] = percent_sort(t_count)
  width = 0.8
  plts = []
  x_axis = np.arange(1, len(yt_map.keys()) + 1)
  # x_axis = [c.acronym for c in mysql.get_conferences()]
  y_offset = np.array([0] * len(yt_map.keys()))
  colors_dict = {}
  TOP_TOPIC_COUNT = 7
  # One stacked layer per rank position; y_offset carries the running height
  # of the layers already drawn for every year.
  for index in range(TOP_TOPIC_COUNT):
    bar_val, color = [], []
    for year in sorted(yt_map.keys(), key=lambda x: int(x)):
      topic = yt_map[year][index]
      colors_dict[topic[0]] = get_color(topic[0])
      color.append(colors_dict[topic[0]])
      bar_val.append(topic[1])
    plts.append(plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
    y_offset = np.add(y_offset, bar_val)
  plt.ylabel("Topic %")
  plt.xlabel("Year")
  # Tick labels drop the first two characters of the year (e.g. "2009" -> "09").
  plt.xticks(x_axis + width/2, [str(y)[2:] for y in sorted(yt_map.keys(), key=lambda x: int(x))])
  plt.yticks(np.arange(0, 101, 10))
  plt.ylim([0, 101])
  # Legends
  patches = []
  squares = []  # filled below but not read again in this function
  names = []  # never filled; only referenced by the commented-out lines
  t_names = ["Testing", "Applications", "Program Analysis", "Tools and Projects",
             "Defect Analysis", "Modeling", "Maintenance"]
  # NOTE(review): patches follow colors_dict iteration order while t_names is
  # a fixed list, so the legend pairs them purely by position -- confirm the
  # names line up with the topic ids.
  for index, (topic, color) in enumerate(colors_dict.items()):
    print(topic)
    patches.append(mpatches.Patch(color=color, label='Topic %s' % str(topic)))
    squares.append(plts[index][0])
    # names.append('Topic %s' % str(topic))
    # names.append(t_names[index])
  # plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=7, fontsize=9)
  plt.legend(tuple(patches), tuple(t_names), loc='upper center', bbox_to_anchor=(0.5, 1.14), ncol=4, fontsize=11, handlelength=0.7)
  plt.savefig("figs/topic_evolution/topic_evolution_7_gib.png")
  plt.clf()
  n_top_words = 10
  # Print the n_top_words highest-weighted words of every topic, heaviest first.
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Пример #11
0
def super_author(fig_prefix="super_author" ,top_percent=1.00):
  """
  Print how many selected authors cover each number of LDA topics.

  An author covers a topic when at least one of their papers has a
  ``topics_count`` entry for that topic of ``TOPIC_THRESHOLD`` or more.
  Only authors contained in ``top_authors(graph, top_percent)`` are counted.

  NOTE(review): the function returns right after printing, so the bar-chart
  and topic-word code after the ``return`` never runs and ``fig_prefix`` is
  effectively unused -- confirm whether the early exit is intentional.

  :param fig_prefix: file-name prefix used by the unreachable plotting code
  :param top_percent: forwarded to ``top_authors`` to select the authors
  :return: None
  """
  citation_graph = cite_graph(GRAPH_CSV)
  topic_miner = Miner(citation_graph)
  lda_model, vocab = topic_miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  papers_by_author = citation_graph.get_papers_by_authors()
  selected = top_authors(citation_graph, top_percent)
  coverage = {}
  for author_id, papers in papers_by_author.items():
    if author_id in selected:
      # One 0/1 flag per topic; set once any paper reaches the threshold.
      flags = [0] * lda_model.n_topics
      for paper_id, _, __ in papers:
        counts = topic_miner.documents[paper_id].topics_count
        for topic_id, count in enumerate(counts):
          if count >= TOPIC_THRESHOLD:
            flags[topic_id] = 1
      coverage[author_id] = sum(flags)
  plt.figure(figsize=(8, 2), dpi=100)
  # Histogram: number of covered topics -> number of authors.
  histogram = Counter(sorted(coverage.values(), reverse=True))
  bar_x = sorted(histogram.keys())
  bar_y = [histogram[key] for key in bar_x]
  print(bar_x, bar_y)
  return
  # ---- unreachable from here on (see NOTE in the docstring) ----
  fig, ax = plt.subplots()
  ax.bar(bar_x, bar_y, 2/3, color='blue', align='center')
  topic_ticks = np.arange(1, lda_model.n_topics + 1)
  ax.set_xticks(topic_ticks)
  ax.set_xticklabels(np.arange(1, lda_model.n_topics + 1))
  plt.xlabel("Topics")
  plt.ylabel("Authors Count")
  plt.savefig("figs/super_author/%s_bar.png"%fig_prefix)
  plt.clf()
  top_k = 10
  for topic_id, word_weights in enumerate(lda_model.topic_word_):
    heaviest_first = np.argsort(word_weights)[:-(top_k + 1):-1]
    print('Topic {}: {}'.format(topic_id, ', '.join(np.array(vocab)[heaviest_first])))
Пример #12
0
def conference_evolution_2(paper_range, figname):
    """
    Draw topic heatmaps per venue for the papers of the given years.

    For every venue the ``topics_count`` vectors of its papers whose year is
    in ``paper_range`` are summed and normalised by their total. A plain
    heatmap is written to "figs/diversity/heatmap_7topics.png" (fixed name,
    independent of ``figname``) and a dendrogram heatmap is produced through
    ``make_dendo_heatmap`` at "figs/evolution/<figname>.png". The top words of
    every topic are printed as well.

    :param paper_range: container of years; membership is tested with ``in``
    :param figname: file-name stem for the dendrogram heatmap
    """
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    conferences = graph.get_papers_by_venue()
    conference_topics = {}  # filled below but not read again in this function
    conference_heatmaps = {}
    for conference_id, papers in conferences.items():
        topics = np.array([0] * lda_model.n_topics)
        # yearize(papers) is iterated as (year, paper ids) pairs.
        for tup in yearize(papers).items():
            if tup[0] not in paper_range:
                continue
            for paper_id in tup[1]:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        conference_topics[conference_id] = percent_sort(topics)
        conference_heatmaps[conference_id] = topics
    n_top_words = 10
    #Heatmap
    heatmap_arr = []
    column_labels = []
    # NOTE(review): venue ids (sorted numerically) are paired with
    # mysql.get_conferences() purely by position -- this assumes both come in
    # the same order; confirm.
    for conference_id, conf in zip(
            sorted(conference_heatmaps.keys(), key=lambda x: int(x)),
            mysql.get_conferences()):
        tot = sum(conference_heatmaps[conference_id])
        # Venues without any counted topic mass are left out of the heatmap.
        if tot == 0: continue
        column_labels.append(conf.acronym)
        dist = [top / tot for top in conference_heatmaps[conference_id]]
        heatmap_arr.append(dist)
    fig, ax = plt.subplots()
    heatmap_arr = np.array(heatmap_arr)
    heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
    plt.ylabel("Conferences")
    plt.xlabel("Topics")
    row_labels = range(lda_model.n_topics)
    # Ticks are shifted by 0.5 from the integer cell positions.
    ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
    ax.set_xticklabels(row_labels, minor=False)
    ax.set_yticklabels(column_labels, minor=False)
    plt.savefig("figs/diversity/heatmap_7topics.png")
    plt.clf()
    # Print the n_top_words highest-weighted words of every topic.
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
    # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/dend_heatmap_7topics.png")
    make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                       "figs/evolution/%s.png" % figname)
Пример #13
0
def pc_topics_heatmap(year_range=None):
    """
    Draw a dendrogram heatmap of topic distributions per conference.

    For every conference and every year that has a committee, the
    ``topics_count`` vectors of that year's papers are summed, skipping papers
    that have at least one author on the committee of the same year. Each
    conference's sum is normalised by its total and handed to
    ``make_dendo_heatmap`` ("figs/pc/pc_heatmap_7topics.png").

    Conferences whose total is zero are left out (their normalisation would
    divide by zero), and every heatmap row is labelled with the acronym of the
    conference it was computed for.

    :param year_range: optional container of integer years; when given, only
        years contained in it are considered
    """
    def index_by_year(tups):
        # Group the first tuple element by the second one (the year).
        y_comm = {}
        for tup in tups:
            y_comm.setdefault(tup[1], set()).add(tup[0])
        return y_comm

    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    p_conferences = graph.get_papers_by_venue()
    p_committees = graph.get_committee_by_conference()
    conference_topics = {}
    acronyms = {}
    for conference in mysql.get_conferences():
        acronyms[conference.id] = conference.acronym
        year_committees = index_by_year(p_committees[conference.id])
        year_papers = index_by_year(p_conferences[conference.id])
        topics = np.array([0] * lda_model.n_topics)
        for year in sorted(year_committees.keys(), key=lambda y: int(y)):
            if (year_range is not None) and (int(year) not in year_range):
                continue
            papers = year_papers.get(year, None)
            if papers is None:
                continue
            committee = year_committees[year]
            for paper_id in papers:
                paper = graph.paper_nodes[paper_id]
                author_ids = set(paper.author_ids.strip().split(","))
                # Skip papers with an author on this year's committee.
                if author_ids.intersection(committee):
                    continue
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        conference_topics[conference.id] = topics
    heatmap_arr = []
    column_labels = []
    for conference_id in sorted(conference_topics.keys(),
                                key=lambda x: int(x)):
        counts = conference_topics[conference_id]
        tot = sum(counts)
        if tot == 0:
            # Nothing counted for this conference: no distribution to show.
            continue
        # float() keeps this a true division on Python 2 as well.
        heatmap_arr.append([float(top) / tot for top in counts])
        # Label taken by id so it always matches the row appended above.
        column_labels.append(acronyms[conference_id])
    row_labels = range(lda_model.n_topics)
    make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                       "figs/pc/pc_heatmap_7topics.png")
Пример #14
0
def pc_topics_heatmap(year_range=None):
  """
  Draw a dendrogram heatmap of topic distributions per conference.

  For every conference and every year that has a committee, the
  ``topics_count`` vectors of that year's papers are summed, skipping papers
  that have at least one author on the committee of the same year. Each
  conference's sum is normalised by its total and handed to
  ``make_dendo_heatmap`` ("figs/pc/pc_heatmap_7topics.png").

  Conferences whose total is zero are left out (their normalisation would
  divide by zero), and every heatmap row is labelled with the acronym of the
  conference it was computed for.

  :param year_range: optional container of integer years; when given, only
      years contained in it are considered
  """
  def index_by_year(tups):
    # Group the first tuple element by the second one (the year).
    y_comm = {}
    for tup in tups:
      y_comm.setdefault(tup[1], set()).add(tup[0])
    return y_comm

  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conference_topics = {}
  acronyms = {}
  for conference in mysql.get_conferences():
    acronyms[conference.id] = conference.acronym
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    topics = np.array([0] * lda_model.n_topics)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if (year_range is not None) and (int(year) not in year_range):
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        # Skip papers with an author on this year's committee.
        if author_ids.intersection(committee):
          continue
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference.id] = topics
  heatmap_arr = []
  column_labels = []
  for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
    counts = conference_topics[conference_id]
    tot = sum(counts)
    if tot == 0:
      # Nothing counted for this conference: no distribution to show.
      continue
    # float() keeps this a true division on Python 2 as well.
    heatmap_arr.append([float(top) / tot for top in counts])
    # Label taken by id so it always matches the row appended above.
    column_labels.append(acronyms[conference_id])
  row_labels = range(lda_model.n_topics)
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/pc/pc_heatmap_7topics.png")
Пример #15
0
def conference_evolution_2(paper_range, figname):
  """
  Draw topic heatmaps per venue for the papers of the given years.

  For every venue the ``topics_count`` vectors of its papers whose year is
  in ``paper_range`` are summed and normalised by their total. A plain
  heatmap is written to "figs/diversity/heatmap_7topics.png" (fixed name,
  independent of ``figname``) and a dendrogram heatmap is produced through
  ``make_dendo_heatmap`` at "figs/evolution/<figname>.png". The top words of
  every topic are printed as well.

  :param paper_range: container of years; membership is tested with ``in``
  :param figname: file-name stem for the dendrogram heatmap
  """
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  conference_topics = {}  # filled below but not read again in this function
  conference_heatmaps = {}
  for conference_id, papers in conferences.items():
    topics = np.array([0] * lda_model.n_topics)
    # yearize(papers) is iterated as (year, paper ids) pairs.
    for tup in yearize(papers).items():
      if tup[0] not in paper_range:
        continue
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference_id] = percent_sort(topics)
    conference_heatmaps[conference_id] = topics
  n_top_words = 10
  #Heatmap
  heatmap_arr = []
  column_labels = []
  # NOTE(review): venue ids (sorted numerically) are paired with
  # mysql.get_conferences() purely by position -- this assumes both come in
  # the same order; confirm.
  for conference_id, conf in zip(sorted(conference_heatmaps.keys(), key=lambda x: int(x)), mysql.get_conferences()):
    tot = sum(conference_heatmaps[conference_id])
    # Venues without any counted topic mass are left out of the heatmap.
    if tot == 0: continue
    column_labels.append(conf.acronym)
    dist = [top/tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  fig, ax = plt.subplots()
  heatmap_arr = np.array(heatmap_arr)
  heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
  plt.ylabel("Conferences")
  plt.xlabel("Topics")
  row_labels = range(lda_model.n_topics)
  # Ticks are shifted by 0.5 from the integer cell positions.
  ax.set_xticks(np.arange(heatmap_arr.shape[1])+0.5, minor=False)
  ax.set_yticks(np.arange(heatmap_arr.shape[0])+0.5, minor=False)
  ax.set_xticklabels(row_labels, minor=False)
  ax.set_yticklabels(column_labels, minor=False)
  plt.savefig("figs/diversity/heatmap_7topics.png")
  plt.clf()
  # Print the n_top_words highest-weighted words of every topic.
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/dend_heatmap_7topics.png")
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/evolution/%s.png"%figname)
Пример #16
0
def get_top_papers():
  """
  Print, for each of the 7 LDA topics, its four highest-ranked papers.

  Every paper is filed under the topic holding the largest entry of its
  ``topics_count``; papers whose largest entry is zero are left out. Within
  a topic the papers are ordered by ``local_cites`` (descending), ties being
  broken by the remaining tuple fields.
  """
  n_topics = 7
  citation_graph = cite_graph(GRAPH_CSV)
  topic_miner = Miner(citation_graph)
  lda_model, vocab = topic_miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  ranked = dict((topic_id, []) for topic_id in range(n_topics))
  for paper_id, paper in citation_graph.paper_nodes.items():
    counts = topic_miner.documents[paper_id].topics_count
    if max(counts) != 0:
      dominant = counts.argmax()
      record = (paper.local_cites, paper.title, paper.authors, paper.year)
      # Each record is stored wrapped in a one-element list.
      ranked[dominant].append([record])
  for topic_id in range(n_topics):
    ranked[topic_id] = sorted(ranked[topic_id], reverse=True)[:4]
    print("***", topic_id, "***")
    for wrapped in ranked[topic_id]:
      cites, title, authors, year = wrapped[0]
      print(cites, year + " - " + title + ", " + authors)
Пример #17
0
def predict_venues(estimators,
                   is_independent=IS_INDEPENDENT_VENUE,
                   n_folds=5,
                   n_topics=N_TOPICS,
                   alpha=ALPHA,
                   beta=BETA,
                   n_iter=100,
                   min_tfidf_score=0.1,
                   tfidf_top=100,
                   random_state=RANDOM_STATE):
    """
    Cross-validate venue predictors on bag-of-words and LDA topic features.

    For every fold the papers get three attributes before the estimators run:
    ``vectorized`` (term counts restricted to the ``tfidf_top`` terms with the
    highest mean TF-IDF on the training split), ``topics_count`` (the LDA
    document-topic row) and ``topics_score`` (that row divided by its sum).
    The metrics of each ``(predictor, preprocessor)`` pair are collected per
    fold, and their ``Metrics.avg_score`` is printed at the end.

    :param estimators: iterable of ``(predictor, preprocessor)`` callables;
        ``predictor(preprocessor, train_x, train_y, test_x, test_y)`` must
        return ``(predicted, metrics)``
    :param is_independent: forwarded to ``get_papers_and_groups``
    :param n_folds: forwarded to ``split``
    :param n_topics: number of LDA topics
    :param alpha: LDA alpha; a falsy value selects ``50.0 / n_topics``
    :param beta: LDA eta; a falsy value selects ``0.01``
    :param n_iter: LDA iterations
    :param min_tfidf_score: TF-IDF values below this are zeroed before ranking
    :param tfidf_top: number of top TF-IDF terms kept in ``vectorized``
    :param random_state: seed handed to the LDA model
    """
    def make_key(pred, pre_proc):
        return "%s - %s" % (pred.__name__, pre_proc.__name__)

    def attach_features(papers, vectorized_docs, topic_rows, top_indices):
        # Store the reduced count vector and the topic features on each paper.
        for paper, vectorized, topics in zip(papers, vectorized_docs,
                                             topic_rows):
            paper.vectorized = vectorized[top_indices]
            paper.topics_count = topics
            sum_t = sum(topics)
            # Guard against an all-zero topic row.
            sum_t = sum_t if sum_t else 0.00001
            paper.topics_score = [float(t / sum_t) for t in topics]

    graph = cite_graph(GRAPH_CSV)
    # NOTE(review): the miner is not used below; the construction is kept in
    # case it initialises state on the graph -- confirm and drop if not.
    miner = Miner(graph)
    papers, groups = get_papers_and_groups(graph,
                                           is_independent=is_independent)
    metrics_map = {
        make_key(predictor, preprocessor): []
        for predictor, preprocessor in estimators
    }
    # Resolve the hyper-parameter defaults once; the fallback alpha follows
    # the requested number of topics rather than the module constant.
    alpha = alpha if alpha else 50.0 / n_topics
    beta = beta if beta else 0.01
    for index, (train_x, train_y, test_x,
                test_y) in enumerate(split(papers, groups, n_folds=n_folds)):
        print("#### Iteration %d" % (index + 1))
        # TSNE
        process_embeddings(index, train_x, test_x)
        # Count Vectorizer
        vectorizer = CountVectorizer(stop_words=STOP_WORDS,
                                     token_pattern=TOKEN_PATTERN)
        train_docs = [paper.raw for paper in train_x]
        test_docs = [paper.raw for paper in test_x]
        train_vectorized = vectorizer.fit_transform(train_docs).toarray()
        test_vectorized = vectorizer.transform(test_docs).toarray()
        # TFIDF: rank terms by mean score on the training split only.
        tfidf_transformer = TfidfTransformer()
        tfidf_matrix = tfidf_transformer.fit_transform(
            train_vectorized).toarray()
        tfidf_matrix[tfidf_matrix < min_tfidf_score] = 0
        tfidf_means = np.mean(tfidf_matrix, axis=0)
        tfidf_top_indices = np.argsort(tfidf_means)[::-1][:tfidf_top]
        # LDA-DE
        lda_model = lda.LDA(n_topics=n_topics,
                            alpha=alpha,
                            eta=beta,
                            n_iter=n_iter,
                            random_state=random_state)
        train_transformed = lda_model.fit_transform(train_vectorized)
        test_transformed = lda_model.transform(test_vectorized)
        # Putting it together
        attach_features(train_x, train_vectorized, train_transformed,
                        tfidf_top_indices)
        attach_features(test_x, test_vectorized, test_transformed,
                        tfidf_top_indices)
        for predictor, preprocessor in estimators:
            key = make_key(predictor, preprocessor)
            print(key)
            _, metrics = predictor(preprocessor, train_x, train_y, test_x,
                                   test_y)
            metrics_map[key].append(metrics)
    for predictor, preprocessor in estimators:
        key = make_key(predictor, preprocessor)
        print("### " + key)
        measures = Metrics.avg_score(metrics_map[key])
        print(measures)
Пример #18
0
def get_graph_lda_data(iterations=ITERATIONS):
  """
  Build the citation graph and run LDA over it.

  :param iterations: forwarded to ``Miner.lda`` as ``n_iter``
  :return: tuple ``(miner, graph, lda_model, vocab)``
  """
  citation_graph = cite_graph(GRAPH_CSV)
  topic_miner = Miner(citation_graph)
  lda_settings = dict(n_iter=iterations, alpha=ALPHA, beta=BETA)
  model, vocabulary = topic_miner.lda(N_TOPICS, **lda_settings)
  return topic_miner, citation_graph, model, vocabulary
Пример #19
0
def get_graph_lda_data():
  """
  Build the citation graph and run LDA over it with the module settings.

  The miner is created with ``THE.permitted``.

  :return: tuple ``(miner, graph, lda_model, vocab)``
  """
  citation_graph = cite_graph(GRAPH_CSV)
  topic_miner = Miner(citation_graph, THE.permitted)
  model, vocabulary = topic_miner.lda(
      get_n_topics(),
      n_iter=ITERATIONS,
      alpha=ALPHA,
      beta=BETA,
      stop_words=STOP_WORDS)
  return topic_miner, citation_graph, model, vocabulary
Пример #20
0
def conference_evolution():
  """
  Plot, per conference, a stacked bar chart of topic shares per year.

  Conferences are laid out row by row on a 3x3 grid of subplots; those whose
  acronym is in ``non_legit_conferences`` are skipped. The figure is written
  to "figs/evolution/evolution_7topics.png" and the top words of every topic
  are printed afterwards.

  NOTE(review): the grid has nine cells, so more than nine plotted
  conferences would index outside ``subplts``.
  """
  legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]
  non_legit_conferences = ["GPCE", "FASE"]
  TOP_TOPIC_COUNT = 7
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  f, subplts = plt.subplots(3, 3)
  f.tight_layout()
  plotted = 0
  for conference in mysql.get_conferences():
    # if conference.acronym not in legit_conferences: continue
    if conference.acronym in non_legit_conferences: continue
    # Fill the grid row by row: three subplots per row.
    x_counter, y_counter = divmod(plotted, 3)
    plotted += 1
    ax = subplts[x_counter, y_counter]
    year_topics = {}
    for year, papers in yearize(conferences[conference.id]).items():
      topics = np.array([0]*lda_model.n_topics)
      for paper_id in papers:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
      # presumably (topic, percent) pairs ordered by share -- the code below
      # reads [0] as the topic id and [1] as the bar height; TODO confirm.
      year_topics[year] = percent_sort(topics)
    sorted_years = sorted(year_topics.keys(), key=lambda x: int(x))
    width = 0.8
    x_axis = np.arange(1, len(sorted_years) + 1)
    y_offset = np.array([0] * len(sorted_years))
    # One stacked layer per rank position; y_offset is the running height.
    for index in range(TOP_TOPIC_COUNT):
      bar_val, color = [], []
      for year in sorted_years:
        topic = year_topics[year][index]
        color.append(get_color(topic[0]))
        bar_val.append(topic[1])
      ax.bar(x_axis, bar_val, width, color=color, bottom=y_offset)
      y_offset = np.add(y_offset, bar_val)
    # Tick labels drop the first two characters of the year; with more than
    # 14 years only every second year is labelled.
    if len(sorted_years) <= 14:
      ax.set_xticks(x_axis + width / 2)
      ax.set_xticklabels([str(y)[2:] for y in sorted_years], fontsize=7)
    else:
      ax.set_xticks(np.arange(1, len(sorted_years) + 1, 2) + width / 2)
      ax.set_xticklabels([str(y)[2:] for index, y in enumerate(sorted_years) if index%2 == 0], fontsize=7)

    ax.set_yticks(np.arange(0, 101, 20))
    ax.set_ylim([0,101])
    ax.set_title(conference.acronym)
  # Legends
  patches = []
  labels = []
  # range() instead of xrange() so the function also runs on Python 3.
  for topic in range(lda_model.n_topics):
    patches.append(mpatches.Patch(color=get_color(topic)))
    labels.append('Topic %s' % str(topic))
  f.legend(handles=patches, labels=labels, loc='upper center', bbox_to_anchor=(0.5, 0.04), ncol=12, fontsize=7)
  plt.savefig("figs/evolution/evolution_7topics.png")
  plt.clf()
  n_top_words = 10
  # Print the n_top_words highest-weighted words of every topic.
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Пример #21
0
def topic_evolution():
    """
    Plot a stacked bar chart of topic shares per publication year and print
    the top words of every topic.

    The per-paper ``topics_count`` vectors are summed by ``paper.year``, each
    yearly sum is passed through ``percent_sort``, and one bar layer is drawn
    per rank position. The figure is written to
    "figs/topic_evolution/topic_evolution_7_gib.png".
    """
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    paper_nodes = graph.paper_nodes
    topics_map = {}
    n_topics = lda_model.n_topics
    # Accumulate the topic counts of all papers published in the same year.
    for paper_id, paper in paper_nodes.items():
        document = miner.documents[paper_id]
        year_topics = topics_map.get(paper.year, np.array([0] * n_topics))
        topics_map[paper.year] = np.add(year_topics, document.topics_count)
    yt_map = {}
    # presumably percent_sort returns (topic, percent) pairs ordered by share;
    # the code below reads element [0] as the topic and [1] as the bar height
    # -- TODO confirm against percent_sort.
    for year, t_count in topics_map.items():
        yt_map[year] = percent_sort(t_count)
    width = 0.8
    plts = []
    x_axis = np.arange(1, len(yt_map.keys()) + 1)
    # x_axis = [c.acronym for c in mysql.get_conferences()]
    y_offset = np.array([0] * len(yt_map.keys()))
    colors_dict = {}
    TOP_TOPIC_COUNT = 7
    # One stacked layer per rank position; y_offset carries the running height
    # of the layers already drawn for every year.
    for index in range(TOP_TOPIC_COUNT):
        bar_val, color = [], []
        for year in sorted(yt_map.keys(), key=lambda x: int(x)):
            topic = yt_map[year][index]
            colors_dict[topic[0]] = get_color(topic[0])
            color.append(colors_dict[topic[0]])
            bar_val.append(topic[1])
        plts.append(
            plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
        y_offset = np.add(y_offset, bar_val)
    plt.ylabel("Topic %")
    plt.xlabel("Year")
    # Tick labels drop the first two characters of the year
    # (e.g. "2009" -> "09").
    plt.xticks(
        x_axis + width / 2,
        [str(y)[2:] for y in sorted(yt_map.keys(), key=lambda x: int(x))])
    plt.yticks(np.arange(0, 101, 10))
    plt.ylim([0, 101])
    # Legends
    patches = []
    squares = []  # filled below but not read again in this function
    names = []  # never filled; only referenced by the commented-out lines
    t_names = [
        "Testing", "Applications", "Program Analysis", "Tools and Projects",
        "Defect Analysis", "Modeling", "Maintenance"
    ]
    # NOTE(review): patches follow colors_dict iteration order while t_names
    # is a fixed list, so the legend pairs them purely by position -- confirm
    # the names line up with the topic ids.
    for index, (topic, color) in enumerate(colors_dict.items()):
        print(topic)
        patches.append(
            mpatches.Patch(color=color, label='Topic %s' % str(topic)))
        squares.append(plts[index][0])
        # names.append('Topic %s' % str(topic))
        # names.append(t_names[index])
    # plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=7, fontsize=9)
    plt.legend(tuple(patches),
               tuple(t_names),
               loc='upper center',
               bbox_to_anchor=(0.5, 1.14),
               ncol=4,
               fontsize=11,
               handlelength=0.7)
    plt.savefig("figs/topic_evolution/topic_evolution_7_gib.png")
    plt.clf()
    n_top_words = 10
    # Print the n_top_words highest-weighted words of every topic, heaviest
    # first.
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Пример #22
0
def conference_evolution():
    """
    Plot, per conference, how the LDA topic mix changes from year to year.

    Builds the citation graph from GRAPH_CSV, fits a 7-topic LDA model via
    ``Miner.lda`` and draws one stacked-bar subplot per conference (one bar
    per year) on a 3x3 grid. Bar segments come from the first
    TOP_TOPIC_COUNT entries of ``percent_sort`` for that year's summed topic
    counts. The figure is saved to ``figs/evolution/evolution_7topics.png``
    and the top words of every topic are printed afterwards.

    :return: None
    """
    excluded_conferences = ["GPCE", "FASE"]
    TOP_TOPIC_COUNT = 7
    GRID_COLUMNS = 3
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(TOP_TOPIC_COUNT,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    conferences = graph.get_papers_by_venue()
    f, subplts = plt.subplots(3, 3)
    f.tight_layout()
    width = 0.8
    plot_index = 0
    for conference in mysql.get_conferences():
        if conference.acronym in excluded_conferences:
            continue
        # Fill the grid row by row. The 3x3 grid holds at most nine
        # conferences; a tenth one raises IndexError here.
        row, column = divmod(plot_index, GRID_COLUMNS)
        plot_index += 1
        axis = subplts[row, column]

        # Sum the per-paper topic counts of every year of this conference.
        year_topics = {}
        for year, papers in yearize(conferences[conference.id]).items():
            topics = np.zeros(lda_model.n_topics, dtype=int)
            for paper_id in papers:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
            year_topics[year] = percent_sort(topics)

        years = sorted(year_topics, key=int)
        x_axis = np.arange(1, len(years) + 1)
        y_offset = np.zeros(len(years), dtype=int)
        # One bar layer per rank: layer `index` stacks every year's
        # index-th entry of percent_sort on top of the previous layers.
        for index in range(TOP_TOPIC_COUNT):
            bar_val, color = [], []
            for year in years:
                topic = year_topics[year][index]
                color.append(get_color(topic[0]))
                bar_val.append(topic[1])
            axis.bar(x_axis, bar_val, width, color=color, bottom=y_offset)
            y_offset = np.add(y_offset, bar_val)

        # Label every year, or every second year when there are more than 14.
        if len(years) <= 14:
            tick_positions, tick_years = x_axis, years
        else:
            tick_positions, tick_years = x_axis[::2], years[::2]
        # NOTE(review): the width / 2 shift presumably centres ticks under
        # left-aligned bars - confirm against the matplotlib version in use.
        axis.set_xticks(tick_positions + width / 2)
        axis.set_xticklabels([str(y)[2:] for y in tick_years], fontsize=7)

        axis.set_yticks(np.arange(0, 101, 20))
        axis.set_ylim([0, 101])
        axis.set_title(conference.acronym)

    # Legends: one colour patch per topic of the model.
    topic_ids = list(range(lda_model.n_topics))
    patches = [mpatches.Patch(color=get_color(topic)) for topic in topic_ids]
    labels = ['Topic %s' % str(topic) for topic in topic_ids]
    f.legend(handles=patches,
             labels=labels,
             loc='upper center',
             bbox_to_anchor=(0.5, 0.04),
             ncol=12,
             fontsize=7)
    plt.savefig("figs/evolution/evolution_7topics.png")
    plt.clf()

    # Print the n_top_words highest-weighted vocabulary entries per topic.
    n_top_words = 10
    vocab_arr = np.array(vocab)
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = vocab_arr[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Пример #23
0
def conference_diversity():
  """
  Plot how the LDA topics are spread over the conferences.

  Builds the citation graph from GRAPH_CSV, fits a 7-topic LDA model via
  ``Miner.lda`` and sums the topic counts of every paper per conference.
  Three figures are written: a stacked bar chart
  (``figs/diversity/conference_diversity_7topics.png``), a heatmap of each
  conference's normalised topic counts
  (``figs/diversity/heatmap_7topics.png``) and, through ``make_heatmap``,
  the transposed heatmap (``figs/diversity/heatmap2.png``). The top words
  of every topic are printed as well.

  :return: None
  """
  TOP_TOPIC_COUNT = 7
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(TOP_TOPIC_COUNT, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  for conference_id, papers in conferences.items():
    # Sum the topic counts of all papers of the conference, across years.
    topics = np.zeros(lda_model.n_topics, dtype=int)
    for _year, paper_ids in yearize(papers).items():
      for paper_id in paper_ids:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference_id] = percent_sort(topics)
    conference_heatmaps[conference_id] = topics

  conference_ids = sorted(conference_topics, key=int)
  # NOTE(review): bars and heatmap rows follow the numeric id order above,
  # while the labels follow mysql.get_conferences() order - confirm that the
  # two orders (and lengths) agree.
  acronyms = [c.acronym for c in mysql.get_conferences()]

  # Stacked bars: layer `index` holds every conference's index-th entry of
  # percent_sort, drawn on top of the previous layers.
  width = 0.75
  x_axis = np.arange(1, len(conference_ids) + 1)
  y_offset = np.zeros(len(conference_ids), dtype=int)
  colors_dict = {}
  for index in range(TOP_TOPIC_COUNT):
    bar_val = []
    color = []
    for conference_id in conference_ids:
      topic = conference_topics[conference_id][index]
      shade = get_color(topic[0])
      colors_dict[topic[0]] = shade
      color.append(shade)
      bar_val.append(topic[1])
    plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset)
    y_offset = np.add(y_offset, bar_val)
  plt.ylabel("Topic Coverage %")
  plt.xlabel("Conferences")
  plt.xticks(x_axis + width / 2, acronyms)
  plt.yticks(np.arange(0, 101, 10))
  plt.ylim([0, 101])
  # Legends
  patches = [mpatches.Patch(color=color, label='Topic %s' % str(topic))
             for topic, color in colors_dict.items()]
  plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=6, fontsize=7)
  plt.savefig("figs/diversity/conference_diversity_7topics.png")
  plt.clf()

  # Heatmap: one row per conference holding its topic counts divided by
  # their total.
  heatmap_rows = []
  for conference_id in conference_ids:
    # Normalise in float: the counts are accumulated as integers, and
    # int / int would floor every share to 0 under Python 2.
    counts = np.asarray(conference_heatmaps[conference_id], dtype=float)
    total = counts.sum()
    # A conference whose counts sum to zero keeps an all-zero row instead of
    # dividing by zero.
    heatmap_rows.append(counts / total if total else counts)
  heatmap_arr = np.array(heatmap_rows)
  _, ax = plt.subplots()
  ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
  plt.ylabel("Conferences")
  plt.xlabel("Topics")
  t_names = ["Testing", "Applications", "Program Analysis", "Tools and Projects",
             "Defect Analysis", "Modeling", "Maintenance"]
  row_labels = [str(ind) + "-" + name for ind, name in zip(range(lda_model.n_topics), t_names)]
  column_labels = acronyms
  ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
  ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
  ax.set_xticklabels(row_labels, minor=False)
  ax.set_yticklabels(column_labels, minor=False)
  plt.savefig("figs/diversity/heatmap_7topics.png")
  plt.clf()

  # Print the n_top_words highest-weighted vocabulary entries per topic.
  n_top_words = 10
  vocab_arr = np.array(vocab)
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = vocab_arr[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
  # Transposed, so topics become rows and conferences become columns.
  make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/heatmap2.png")
Пример #24
0
def conference_diversity():
    """
    Plot how the LDA topics are spread over the conferences.

    Builds the citation graph from GRAPH_CSV, fits a 7-topic LDA model via
    ``Miner.lda`` and sums the topic counts of every paper per conference.
    Three figures are written: a stacked bar chart
    (``figs/diversity/conference_diversity_7topics.png``), a heatmap of each
    conference's normalised topic counts
    (``figs/diversity/heatmap_7topics.png``) and, through ``make_heatmap``,
    the transposed heatmap (``figs/diversity/heatmap2.png``). The top words
    of every topic are printed as well.

    :return: None
    """
    TOP_TOPIC_COUNT = 7
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(TOP_TOPIC_COUNT,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    conferences = graph.get_papers_by_venue()
    conference_topics = {}
    conference_heatmaps = {}
    for conference_id, papers in conferences.items():
        # Sum the topic counts of all papers of the conference, across years.
        topics = np.zeros(lda_model.n_topics, dtype=int)
        for _year, paper_ids in yearize(papers).items():
            for paper_id in paper_ids:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        conference_topics[conference_id] = percent_sort(topics)
        conference_heatmaps[conference_id] = topics

    conference_ids = sorted(conference_topics, key=int)
    # NOTE(review): bars and heatmap rows follow the numeric id order above,
    # while the labels follow mysql.get_conferences() order - confirm that
    # the two orders (and lengths) agree.
    acronyms = [c.acronym for c in mysql.get_conferences()]

    # Stacked bars: layer `index` holds every conference's index-th entry of
    # percent_sort, drawn on top of the previous layers.
    width = 0.75
    x_axis = np.arange(1, len(conference_ids) + 1)
    y_offset = np.zeros(len(conference_ids), dtype=int)
    colors_dict = {}
    for index in range(TOP_TOPIC_COUNT):
        bar_val = []
        color = []
        for conference_id in conference_ids:
            topic = conference_topics[conference_id][index]
            shade = get_color(topic[0])
            colors_dict[topic[0]] = shade
            color.append(shade)
            bar_val.append(topic[1])
        plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset)
        y_offset = np.add(y_offset, bar_val)
    plt.ylabel("Topic Coverage %")
    plt.xlabel("Conferences")
    plt.xticks(x_axis + width / 2, acronyms)
    plt.yticks(np.arange(0, 101, 10))
    plt.ylim([0, 101])
    # Legends
    patches = [
        mpatches.Patch(color=color, label='Topic %s' % str(topic))
        for topic, color in colors_dict.items()
    ]
    plt.legend(handles=patches,
               loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               ncol=6,
               fontsize=7)
    plt.savefig("figs/diversity/conference_diversity_7topics.png")
    plt.clf()

    # Heatmap: one row per conference holding its topic counts divided by
    # their total.
    heatmap_rows = []
    for conference_id in conference_ids:
        # Normalise in float: the counts are accumulated as integers, and
        # int / int would floor every share to 0 under Python 2.
        counts = np.asarray(conference_heatmaps[conference_id], dtype=float)
        total = counts.sum()
        # A conference whose counts sum to zero keeps an all-zero row
        # instead of dividing by zero.
        heatmap_rows.append(counts / total if total else counts)
    heatmap_arr = np.array(heatmap_rows)
    _, ax = plt.subplots()
    ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
    plt.ylabel("Conferences")
    plt.xlabel("Topics")
    t_names = [
        "Testing", "Applications", "Program Analysis", "Tools and Projects",
        "Defect Analysis", "Modeling", "Maintenance"
    ]
    row_labels = [
        str(ind) + "-" + name
        for ind, name in zip(range(lda_model.n_topics), t_names)
    ]
    column_labels = acronyms
    ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
    ax.set_xticklabels(row_labels, minor=False)
    ax.set_yticklabels(column_labels, minor=False)
    plt.savefig("figs/diversity/heatmap_7topics.png")
    plt.clf()

    # Print the n_top_words highest-weighted vocabulary entries per topic.
    n_top_words = 10
    vocab_arr = np.array(vocab)
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = vocab_arr[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
    # Transposed, so topics become rows and conferences become columns.
    make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                 "figs/diversity/heatmap2.png")