Example #1
def reputation(only_first=False):
    if only_first:
        print("## First Authors Only")
    papers = read_papers()
    submissions = format_conf_acceptance(papers)
    author_map = top_authors(cite_graph(GRAPH_CSV))
    accepteds, rejecteds = [], []
    for conf_id, papers in submissions.items():
        accepted, rejected = [], []
        for paper in papers:
            for i, author in enumerate(paper.authors):
                if only_first and i > 0: break
                cites = 0
                if author in author_map:
                    cites = author_map[author][1]
                if paper.decision == 'accept':
                    accepted.append(cites)
                else:
                    rejected.append(cites)
        print("#### %s" % conf_id)
        print("**Accepted** => Med: %0.2f, IQR: %0.2f, Min: %d, Max: %d" %
              (Metrics.median(accepted), Metrics.iqr(accepted), min(accepted),
               max(accepted)))
        print("**Rejected** => Med: %0.2f, IQR: %0.2f, Min: %d, Max: %d" %
              (Metrics.median(rejected), Metrics.iqr(rejected), min(rejected),
               max(rejected)))
        accepteds += accepted
        rejecteds += rejected
    print("#### All")
    print("**Accepted** => Med: %0.2f, IQR: %0.2f, Min: %d, Max: %d" %
          (Metrics.median(accepteds), Metrics.iqr(accepteds), min(accepteds),
           max(accepteds)))
    print("**Rejected** => Med: %0.2f, IQR: %0.2f, Min: %d, Max: %d" %
          (Metrics.median(rejecteds), Metrics.iqr(rejecteds), min(rejecteds),
           max(rejecteds)))
Example #2
def paper_and_author_growth(min_year=1992, max_year=2015):
  graph = cite_graph(GRAPH_CSV)
  year_authors_map = OrderedDict()
  year_papers_map = OrderedDict()
  for _, paper in graph.get_paper_nodes(permitted=THE.permitted).items():
    year = int(paper.year)
    if not (min_year < year <= max_year): continue  # half-open range: keeps years in (min_year, max_year]
    authors = paper.authors.split(",")
    year_authors_map[year] = year_authors_map.get(year, set([])).union(authors)
    year_papers_map[year] = year_papers_map.get(year, 0) + 1
  x_axis = []
  papers = []
  authors = []
  seen = set(year_authors_map[sorted(year_authors_map.keys())[0]])
  f = open("figs/v3/%s/paper_author_count.csv" % THE.permitted, "wb")
  f.write("Year, # Papers, # Authors\n")
  for key in sorted(year_authors_map.keys())[1:]:
    x_axis.append(key)
    papers.append(year_papers_map[key])
    new_authors = set(year_authors_map[key]).difference(seen)
    authors.append(len(new_authors))
    seen = seen.union(set(year_authors_map[key]))
    f.write("%d, %d, %d\n" % (key, year_papers_map[key], len(new_authors)))
  plt.plot(x_axis, papers)
  plt.plot(x_axis, authors)
  legends = ['Papers', 'Authors']
  plt.legend(legends, loc='upper left')
  plt.title('Growth of Papers and Authors')
  plt.xlabel("Year")
  plt.ylabel(" Count")
  plt.savefig("figs/v3/%s/paper_author_count.png" % THE.permitted)
  plt.clf()
  f.close()
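Note: seeding `seen` with the earliest year means each later year contributes only first-time authors. A self-contained toy run of that pattern:

year_authors = {2010: {"a", "b"}, 2011: {"b", "c"}, 2012: {"c", "d", "e"}}
years = sorted(year_authors.keys())
seen = set(year_authors[years[0]])                  # seed with the earliest year
for year in years[1:]:
    new_authors = year_authors[year] - seen         # authors never seen before
    seen = seen | year_authors[year]
    print("%d: %d new" % (year, len(new_authors)))  # 2011: 1 new, 2012: 2 new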
Example #3
def get_top_papers():
    n_topics = 7
    top_papers = {}
    for index in range(n_topics):
        top_papers[index] = []
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    for paper_id, paper in graph.paper_nodes.items():
        topics = miner.documents[paper_id].topics_count
        # if int(paper.year) < 2009: continue
        if max(topics) == 0:
            continue
        topic = topics.argmax()
        # cites = len(paper.cites.split(",")) if paper.cites else 0
        cites = paper.local_cites
        # each entry is a one-element list holding the tuple; unwrapped below via paper[0]
        top_papers[topic].append([(cites, paper.title, paper.authors,
                                   paper.year)])
    for index in range(n_topics):
        top_papers[index] = sorted(top_papers[index], reverse=True)[:4]
        print("***", index, "***")
        for paper in top_papers[index]:
            paper = paper[0]
            print(paper[0], paper[-1] + " - " + paper[1] + ", " + paper[2])
Example #4
def paper_bar():
    graph = cite_graph(GRAPH_CSV)
    conferences = graph.get_papers_by_venue()
    start = 2001
    end = 2012
    year_count = {}
    for year in range(start, end + 1):
        year_count[year] = 0
    for conference_id, papers in conferences.items():
        for tup in papers:
            count = year_count.get(int(tup[1]), None)
            if count is None: continue
            year_count[int(tup[1])] += 1
    bar_x, bar_y = [], []
    for year, count in year_count.items():
        bar_x.append(year)
        bar_y.append(count)
    fig = plt.figure(figsize=(8, 3))
    plt.bar(bar_x, bar_y, color='blue', align='center')
    plt.xlim([start - 1, end + 1])
    plt.xticks(bar_x, rotation=45)
    plt.ylim(300, 800)
    plt.xlabel('Year')
    plt.ylabel('# of Papers')
    plt.savefig("figs/paper_count.png", bbox_inches='tight')
    plt.clf()
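Note: the year tally above can be written more compactly with collections.Counter; an equivalent sketch, assuming the same (paper_id, year, ...) tuple layout:

from collections import Counter

def count_papers_by_year(conferences, start, end):
    # tup[1] holds the paper's year; keep only years inside [start, end].
    counts = Counter(int(tup[1]) for papers in conferences.values()
                     for tup in papers if start <= int(tup[1]) <= end)
    years = list(range(start, end + 1))
    return years, [counts[y] for y in years]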
Example #5
File: ist.py Project: ai-se/citemap
def paper_bar():
  graph = cite_graph(GRAPH_CSV)
  conferences = graph.get_papers_by_venue()
  start = 2001
  end = 2015
  year_count = {}
  for year in range(start, end + 1):
    year_count[year] = 0
  for conference_id, papers in conferences.items():
    for tup in papers:
      count = year_count.get(int(tup[1]), None)
      if count is None: continue
      year_count[int(tup[1])] += 1
  bar_x, bar_y = [], []
  for year, count in year_count.items():
    bar_x.append(year)
    bar_y.append(count)
  fig = plt.figure(figsize=(8, 3))
  plt.bar(bar_x, bar_y, color='blue', align='center')
  plt.xlim([start - 1, end + 1])
  plt.xticks(bar_x, rotation=45)
  plt.ylim(300, 1100)
  plt.xlabel('Year')
  plt.ylabel('# of Papers')
  plt.savefig("figs/v2/paper_count.png", bbox_inches='tight')
  plt.clf()
Example #6
def reputation(only_first=False):
  if only_first:
    print("## First Authors Only")
  papers = read_papers()
  submissions = format_conf_acceptance(papers)
  author_map = top_authors(cite_graph(GRAPH_CSV))
  accepteds, rejecteds = [], []
  for conf_id, papers in submissions.items():
    accepted, rejected = [], []
    for paper in papers:
      for i, author in enumerate(paper.authors):
        if only_first and i > 0: break
        cites = 0
        if author in author_map:
          cites = author_map[author][1]
        if paper.decision == 'accept':
          accepted.append(cites)
        else:
          rejected.append(cites)
    print("#### %s" % conf_id)
    print("**Accepted** => Med: %0.2f, IQR: %0.2f, Min: %d, Max: %d" %
          (Metrics.median(accepted), Metrics.iqr(accepted), min(accepted), max(accepted)))
    print("**Rejected** => Med: %0.2f, IQR: %0.2f, Min: %d, Max: %d" %
          (Metrics.median(rejected), Metrics.iqr(rejected), min(rejected), max(rejected)))
    accepteds += accepted
    rejecteds += rejected
  print("#### All")
  print("**Accepted** => Med: %0.2f, IQR: %0.2f, Min: %d, Max: %d" %
        (Metrics.median(accepteds), Metrics.iqr(accepteds), min(accepteds), max(accepteds)))
  print("**Rejected** => Med: %0.2f, IQR: %0.2f, Min: %d, Max: %d" %
        (Metrics.median(rejecteds), Metrics.iqr(rejecteds), min(rejecteds), max(rejecteds)))
Example #7
def get_graph_lda_data(iterations=ITERATIONS):
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(N_TOPICS,
                                 n_iter=iterations,
                                 alpha=ALPHA,
                                 beta=BETA)
    return miner, graph, lda_model, vocab
Example #8
def get_graph_lda_data():
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph, THE.permitted, THE.IGNORE_VENUES[THE.version])
    lda_model, vocab = miner.lda(get_n_topics(),
                                 n_iter=ITERATIONS,
                                 alpha=ALPHA,
                                 beta=BETA,
                                 stop_words=STOP_WORDS)
    return miner, graph, lda_model, vocab
Example #9
def super_author(fig_prefix="super_author", top_percent=1.00):
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    authors = graph.get_papers_by_authors()
    author_topics = {}
    tops = top_authors(graph, top_percent)
    for author_id, papers in authors.items():
        if author_id not in tops:
            continue
        topics = [0] * lda_model.n_topics
        for paper_id, _, __ in papers:
            document = miner.documents[paper_id]
            for index, topic_count in enumerate(document.topics_count):
                if topic_count >= TOPIC_THRESHOLD:
                    topics[index] = 1
        author_topics[author_id] = sum(topics)
    vals = sorted(author_topics.values(), reverse=True)
    # x_axis = range(1, len(vals) + 1)
    # plt.ylabel("Topic Count")
    # plt.xlabel("Author ID")
    # plt.title("Super Author")
    # plt.ylim(min(vals)-1, max(vals)+1)
    # plt.plot(x_axis, vals)
    # plt.savefig("figs/super_author/%s.png"%fig_prefix)
    # plt.clf()
    fig = plt.figure(figsize=(8, 2), dpi=100)
    counter = Counter()
    for val in vals:
        counter[val] += 1
    bar_x = []
    bar_y = []
    for key in sorted(counter.keys()):
        bar_x.append(key)
        bar_y.append(counter[key])
    print(bar_x, bar_y)
    return  # NOTE: early return (debug residue); the plotting code below never runs
    fig, ax = plt.subplots()
    width = 2.0 / 3  # float literal: 2 / 3 is 0 under Python 2 integer division
    ax.bar(bar_x, bar_y, width, color='blue', align='center')
    ax.set_xticks(np.arange(1, lda_model.n_topics + 1))
    ax.set_xticklabels(np.arange(1, lda_model.n_topics + 1))
    # for i, v in zip(bar_x,bar_y):
    #   ax.text(i, v + 0.25, str(v), color='red', fontweight='bold', fontsize=11, horizontalalignment='center')
    plt.xlabel("Topics")
    plt.ylabel("Authors Count")
    # plt.ylim(min(bar_y) - 1, max(bar_y) + 1)
    plt.savefig("figs/super_author/%s_bar.png" % fig_prefix)
    plt.clf()
    n_top_words = 10
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Example #10
def lda_topics():
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(11, n_iter=100, alpha=0.22359, beta=0.53915)
  # lda_model, vocab = miner.lda(11, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  n_top_words = 15
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
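Note: the slice np.argsort(topic_dist)[:-(n_top_words + 1):-1] returns the indices of the n_top_words largest weights in descending order; a small demo:

import numpy as np

dist = np.array([0.1, 0.4, 0.05, 0.3, 0.15])
n = 3
top = np.argsort(dist)[:-(n + 1):-1]  # reversed tail of the ascending argsort
print(top)        # [1 3 4] -> indices of the three largest weights
print(dist[top])  # [0.4  0.3  0.15], descending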
Example #11
def retrieve_graph():
    graph_file = 'cache/%s/%s/graph.pkl' % (THE.version, THE.permitted)
    if os.path.isfile(graph_file):
        with open(graph_file, 'rb') as f:  # binary mode to match the 'wb' dump below
            graph = cPkl.load(f)
    else:
        graph = cite_graph(GRAPH_CSV)
        with open(graph_file, 'wb') as f:
            cPkl.dump(graph, f, cPkl.HIGHEST_PROTOCOL)
    return graph
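Note: the load-or-build-then-dump pattern generalizes to a small caching helper; a sketch using the stdlib pickle module (hypothetical helper, not from the project):

import os
import pickle

def cached(path, build):
    # Return the object pickled at `path`, building and caching it if absent.
    if os.path.isfile(path):
        with open(path, 'rb') as f:  # pickles are binary; always read with 'rb'
            return pickle.load(f)
    obj = build()
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
    return obj

# usage (names from the examples above): graph = cached('cache/graph.pkl', lambda: cite_graph(GRAPH_CSV))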
Example #12
def lda_topics():
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(11, n_iter=100, alpha=0.22359, beta=0.53915)
    # lda_model, vocab = miner.lda(11, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
    n_top_words = 15
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Example #13
def retrieve_graph():
  graph_file = 'cache/%s/graph.pkl' % THE.permitted
  if os.path.isfile(graph_file):
    with open(graph_file, 'rb') as f:  # binary mode to match the 'wb' dump below
      graph = pkl.load(f)
  else:
    graph = cite_graph(GRAPH_CSV)
    with open(graph_file, 'wb') as f:
      pkl.dump(graph, f, pkl.HIGHEST_PROTOCOL)
  return graph
Example #14
def retrieve_graph(graph_file=GRAPH_CSV, from_cache=True):
  cached = 'cache/graph.pkl'
  if os.path.isfile(cached) and from_cache:
    with open(cached, 'rb') as f:  # binary mode to match the 'wb' dump below
      graph = cPkl.load(f)
  else:
    graph = cite_graph(graph_file)
    with open(cached, 'wb') as f:
      cPkl.dump(graph, f, cPkl.HIGHEST_PROTOCOL)
  return graph
Example #15
def topic_evolution():
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  paper_nodes = graph.paper_nodes
  topics_map = {}
  n_topics = lda_model.n_topics
  for paper_id, paper in paper_nodes.items():
    document = miner.documents[paper_id]
    year_topics = topics_map.get(paper.year, np.array([0]*n_topics))
    topics_map[paper.year] = np.add(year_topics, document.topics_count)
  yt_map = {}
  for year, t_count in topics_map.items():
    yt_map[year] = percent_sort(t_count)
  width = 0.8
  plts = []
  x_axis = np.arange(1, len(yt_map.keys()) + 1)
  # x_axis = [c.acronym for c in mysql.get_conferences()]
  y_offset = np.array([0] * len(yt_map.keys()))
  colors_dict = {}
  TOP_TOPIC_COUNT = 7
  for index in range(TOP_TOPIC_COUNT):
    bar_val, color = [], []
    for year in sorted(yt_map.keys(), key=lambda x: int(x)):
      topic = yt_map[year][index]
      colors_dict[topic[0]] = get_color(topic[0])
      color.append(colors_dict[topic[0]])
      bar_val.append(topic[1])
    plts.append(plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
    y_offset = np.add(y_offset, bar_val)
  plt.ylabel("Topic %")
  plt.xlabel("Year")
  plt.xticks(x_axis + width/2, [str(y)[2:] for y in sorted(yt_map.keys(), key=lambda x: int(x))])
  plt.yticks(np.arange(0, 101, 10))
  plt.ylim([0, 101])
  # Legends
  patches = []
  squares = []
  names = []
  t_names = ["Testing", "Applications", "Program Analysis", "Tools and Projects",
             "Defect Analysis", "Modeling", "Maintenance"]
  for index, (topic, color) in enumerate(colors_dict.items()):
    print(topic)
    patches.append(mpatches.Patch(color=color, label='Topic %s' % str(topic)))
    squares.append(plts[index][0])
    # names.append('Topic %s' % str(topic))
    # names.append(t_names[index])
  # plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=7, fontsize=9)
  plt.legend(tuple(patches), tuple(t_names), loc='upper center', bbox_to_anchor=(0.5, 1.14), ncol=4, fontsize=11, handlelength=0.7)
  plt.savefig("figs/topic_evolution/topic_evolution_7_gib.png")
  plt.clf()
  n_top_words = 10
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
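Note: the bottom=y_offset accumulation is the standard matplotlib recipe for stacked bars: each series is drawn on top of the running sum of the series below it. A minimal self-contained sketch:

import numpy as np
import matplotlib.pyplot as plt

x = np.arange(1, 4)
series = [np.array([30, 40, 20]), np.array([50, 30, 60]), np.array([20, 30, 20])]
y_offset = np.zeros(len(x))
for vals in series:
    plt.bar(x, vals, 0.8, bottom=y_offset)  # stack on top of what is already drawn
    y_offset = y_offset + vals
plt.savefig("stacked_demo.png")
plt.clf()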
Example #16
def predict_venues(estimators, is_independent=IS_INDEPENDENT_VENUE,
                   n_folds=5, n_topics=N_TOPICS, alpha=ALPHA, beta=BETA,
                   n_iter=100, min_tfidf_score=0.1, tfidf_top=100, random_state=RANDOM_STATE):
  def make_key(pred, pre_proc):
    return "%s - %s" % (pred.__name__, pre_proc.__name__)

  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  papers, groups = get_papers_and_groups(graph, is_independent=is_independent)
  metrics_map = {make_key(predictor, preprocessor): [] for predictor, preprocessor in estimators}
  for index, (train_x, train_y, test_x, test_y) in enumerate(split(papers, groups, n_folds=n_folds)):
    print("#### Iteration %d" % (index + 1))
    # TSNE
    process_embeddings(index, train_x, test_x)
    # Count Vectorizer
    vectorizer = CountVectorizer(stop_words=STOP_WORDS, token_pattern=TOKEN_PATTERN)
    train_docs = [paper.raw for paper in train_x]
    test_docs = [paper.raw for paper in test_x]
    train_vectorized = vectorizer.fit_transform(train_docs).toarray()
    test_vectorized = vectorizer.transform(test_docs).toarray()
    # TFIDF
    tfidf_transformer = TfidfTransformer()
    tfidf_matrix = tfidf_transformer.fit_transform(train_vectorized).toarray()
    tfidf_matrix[tfidf_matrix < min_tfidf_score] = 0
    tfidf_means = np.mean(tfidf_matrix, axis=0)
    tfidf_top_indices = np.argsort(tfidf_means)[::-1][:tfidf_top]
    # LDA-DE
    alpha = alpha if alpha else 50.0 / N_TOPICS  # float literal avoids Python 2 integer division
    beta = beta if beta else 0.01
    lda_model = lda.LDA(n_topics=n_topics, alpha=alpha, eta=beta, n_iter=n_iter, random_state=random_state)
    train_transformed = lda_model.fit_transform(train_vectorized)
    # Putting it together
    for i, (vectorized, topics) in enumerate(zip(train_vectorized, train_transformed)):
      train_x[i].vectorized = vectorized[tfidf_top_indices]
      train_x[i].topics_count = topics
      sum_t = sum(topics)
      sum_t = sum_t if sum_t else 0.00001
      train_x[i].topics_score = [np.float(t / sum_t) for t in topics]
    test_transformed = lda_model.transform(test_vectorized)
    for i, (vectorized, topics) in enumerate(zip(test_vectorized, test_transformed)):
      test_x[i].vectorized = vectorized[tfidf_top_indices]
      test_x[i].topics_count = topics
      sum_t = sum(topics)
      sum_t = sum_t if sum_t else 0.00001
      test_x[i].topics_score = [t / sum_t for t in topics]
    for predictor, preprocessor in estimators:
      key = make_key(predictor, preprocessor)
      print(key)
      predicted, metrics = predictor(preprocessor, train_x, train_y, test_x, test_y)
      metrics_map[key].append(metrics)
  for predictor, preprocessor in estimators:
    key = make_key(predictor, preprocessor)
    print("### " + key)
    measures = Metrics.avg_score(metrics_map[key])
    print(measures)
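Note: the TFIDF block keeps only the tfidf_top columns with the highest mean score across training documents; the selection idiom in isolation, on toy data:

import numpy as np

tfidf_matrix = np.array([[0.0, 0.5, 0.2],
                         [0.1, 0.4, 0.0]])
tfidf_matrix[tfidf_matrix < 0.1] = 0       # zero out weak scores
means = np.mean(tfidf_matrix, axis=0)      # per-term mean over documents
top_indices = np.argsort(means)[::-1][:2]  # indices of the 2 strongest terms
print(top_indices)                         # [1 2]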
Example #17
def super_author(fig_prefix="super_author" ,top_percent=1.00):
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  authors = graph.get_papers_by_authors()
  author_topics = {}
  tops = top_authors(graph, top_percent)
  for author_id, papers in authors.items():
    if author_id not in tops:
      continue
    topics = [0]*lda_model.n_topics
    for paper_id, _, __ in papers:
      document = miner.documents[paper_id]
      for index, topic_count in enumerate(document.topics_count):
        if topic_count >= TOPIC_THRESHOLD:
          topics[index] = 1
    author_topics[author_id] = sum(topics)
  vals = sorted(author_topics.values(), reverse=True)
  # x_axis = range(1, len(vals) + 1)
  # plt.ylabel("Topic Count")
  # plt.xlabel("Author ID")
  # plt.title("Super Author")
  # plt.ylim(min(vals)-1, max(vals)+1)
  # plt.plot(x_axis, vals)
  # plt.savefig("figs/super_author/%s.png"%fig_prefix)
  # plt.clf()
  fig = plt.figure(figsize=(8, 2), dpi=100)
  counter = Counter()
  for val in vals:
    counter[val] += 1
  bar_x = []
  bar_y = []
  for key in sorted(counter.keys()):
    bar_x.append(key)
    bar_y.append(counter[key])
  print(bar_x, bar_y)
  return  # NOTE: early return (debug residue); the plotting code below never runs
  fig, ax = plt.subplots()
  width = 2.0 / 3  # float literal: 2/3 is 0 under Python 2 integer division
  ax.bar(bar_x, bar_y, width, color='blue', align='center')
  ax.set_xticks(np.arange(1,lda_model.n_topics+1))
  ax.set_xticklabels(np.arange(1,lda_model.n_topics+1))
  # for i, v in zip(bar_x,bar_y):
  #   ax.text(i, v + 0.25, str(v), color='red', fontweight='bold', fontsize=11, horizontalalignment='center')
  plt.xlabel("Topics")
  plt.ylabel("Authors Count")
  # plt.ylim(min(bar_y) - 1, max(bar_y) + 1)
  plt.savefig("figs/super_author/%s_bar.png"%fig_prefix)
  plt.clf()
  n_top_words = 10
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Example #18
def pc_bias_table():
    def index_by_year(tups):
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    graph = cite_graph(GRAPH_CSV)
    max_len = 5   # dead assignments (max_len = 21, start = 1993) removed
    start = 2009
    p_conferences = graph.get_papers_by_venue()
    p_committees = graph.get_committee_by_conference()
    conf_year_scores = {}
    for conference in mysql.get_conferences():
        year_committees = index_by_year(p_committees[conference.id])
        year_papers = index_by_year(p_conferences[conference.id])
        year_scores = {}
        for year in sorted(year_committees.keys(), key=lambda y: int(y)):
            if int(year) < start: continue  # year keys are strings; compare numerically
            papers = year_papers.get(year, None)
            if papers is None:
                year_scores[int(year)] = None
                continue
            committee = year_committees[year]
            comm_papers = 0
            non_comm_papers = 0
            for paper_id in papers:
                paper = graph.paper_nodes[paper_id]
                author_ids = set(paper.author_ids.strip().split(","))
                if author_ids.intersection(committee):
                    comm_papers += 1
                else:
                    non_comm_papers += 1
            year_scores[int(year)] = 0 if not comm_papers else int(
                round(comm_papers * 100 / (comm_papers + non_comm_papers)))
        conf_year_scores[conference.acronym] = year_scores
    header = ["conf"] + [str(start + i) for i in xrange(max_len)]
    table = PrettyTable(header)
    for conf, year_scores in conf_year_scores.items():
        row = [conf]
        for index in xrange(max_len):
            row.append(year_scores.get(start + index, None))
        table.add_row(row)
    print("```")
    print(table)
    print("```")
    exit()
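Note: index_by_year inverts a list of (id, year) tuples into {year: set_of_ids}; an equivalent refactor using dict.setdefault, with a toy run:

def index_by_year(tups):
    y_comm = {}
    for member_id, year in tups:
        y_comm.setdefault(year, set()).add(member_id)
    return y_comm

print(index_by_year([("p1", "2009"), ("p2", "2009"), ("p3", "2010")]))
# {'2009': set(['p1', 'p2']), '2010': set(['p3'])} -- set order may vary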
Example #19
def conference_evolution_2(paper_range, figname):
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    conferences = graph.get_papers_by_venue()
    conference_topics = {}
    conference_heatmaps = {}
    for conference_id, papers in conferences.items():
        topics = np.array([0] * lda_model.n_topics)
        for tup in yearize(papers).items():
            if tup[0] not in paper_range:
                continue
            for paper_id in tup[1]:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        conference_topics[conference_id] = percent_sort(topics)
        conference_heatmaps[conference_id] = topics
    n_top_words = 10
    #Heatmap
    heatmap_arr = []
    column_labels = []
    for conference_id, conf in zip(
            sorted(conference_heatmaps.keys(), key=lambda x: int(x)),
            mysql.get_conferences()):
        tot = sum(conference_heatmaps[conference_id])
        if tot == 0: continue
        column_labels.append(conf.acronym)
        dist = [top / tot for top in conference_heatmaps[conference_id]]
        heatmap_arr.append(dist)
    fig, ax = plt.subplots()
    heatmap_arr = np.array(heatmap_arr)
    heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
    plt.ylabel("Conferences")
    plt.xlabel("Topics")
    row_labels = range(lda_model.n_topics)
    ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
    ax.set_xticklabels(row_labels, minor=False)
    ax.set_yticklabels(column_labels, minor=False)
    plt.savefig("figs/diversity/heatmap_7topics.png")
    plt.clf()
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
    # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/dend_heatmap_7topics.png")
    make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                       "figs/evolution/%s.png" % figname)
Example #20
def pc_topics_heatmap(year_range=None):
    def index_by_year(tups):
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    max_len = 21
    start = 1993
    p_conferences = graph.get_papers_by_venue()
    p_committees = graph.get_committee_by_conference()
    conference_topics = {}
    for conference in mysql.get_conferences():
        year_committees = index_by_year(p_committees[conference.id])
        year_papers = index_by_year(p_conferences[conference.id])
        year_scores = {}
        topics = np.array([0] * lda_model.n_topics)
        for year in sorted(year_committees.keys(), key=lambda y: int(y)):
            if (year_range is not None) and (int(year) not in year_range):
                continue
            papers = year_papers.get(year, None)
            if papers is None:
                year_scores[int(year)] = None
                continue
            committee = year_committees[year]
            for paper_id in papers:
                paper = graph.paper_nodes[paper_id]
                author_ids = set(paper.author_ids.strip().split(","))
                if author_ids.intersection(committee):
                    continue
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        conference_topics[conference.id] = topics
    heatmap_arr = []
    for conference_id in sorted(conference_topics.keys(),
                                key=lambda x: int(x)):
        tot = sum(conference_topics[conference_id])
        dist = [top / tot for top in conference_topics[conference_id]]
        heatmap_arr.append(dist)
    row_labels = range(lda_model.n_topics)
    column_labels = [c.acronym for c in mysql.get_conferences()]
    make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                       "figs/pc/pc_heatmap_7topics.png")
Example #21
def pc_bias_table():
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  graph = cite_graph(GRAPH_CSV)
  max_len = 5   # dead assignments (max_len = 21, start = 1993) removed
  start = 2009
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conf_year_scores = {}
  for conference in mysql.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if int(year) < start: continue  # year keys are strings; compare numerically
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      comm_papers = 0
      non_comm_papers = 0
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        if author_ids.intersection(committee):
          comm_papers += 1
        else:
          non_comm_papers += 1
      year_scores[int(year)] = 0 if not comm_papers else int(round(comm_papers * 100 / (comm_papers + non_comm_papers)))
    conf_year_scores[conference.acronym] = year_scores
  header = ["conf"] + [str(start + i) for i in xrange(max_len)]
  table = PrettyTable(header)
  for conf, year_scores in conf_year_scores.items():
    row = [conf]
    for index in xrange(max_len):
      row.append(year_scores.get(start+index, None))
    table.add_row(row)
  print("```")
  print(table)
  print("```")
  exit()
Example #22
def pc_paper_count_table():
    def index_by_year(tups):
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    graph = cite_graph(GRAPH_CSV)
    start = 2009
    p_conferences = graph.get_papers_by_venue()
    p_committees = graph.get_committee_by_conference()
    header = ["conf", "# Accepted", "# from PC", "Percentage"]
    table = PrettyTable(header)
    all_papers, all_pc_papers = 0, 0
    for conference in mysql.get_conferences():
        year_committees = index_by_year(p_committees[conference.id])
        year_papers = index_by_year(p_conferences[conference.id])
        year_scores = {}
        comm_papers = 0
        tot_papers = 0
        for year in sorted(year_committees.keys(), key=lambda y: int(y)):
            if int(year) < start: continue
            papers = year_papers.get(year, None)
            if papers is None:
                year_scores[int(year)] = None
                continue
            committee = year_committees[year]
            for paper_id in papers:
                paper = graph.paper_nodes[paper_id]
                author_ids = set(paper.author_ids.strip().split(","))
                if author_ids.intersection(committee):
                    comm_papers += 1
                tot_papers += 1
        table.add_row([
            conference.acronym, tot_papers, comm_papers,
            int(round(100 * comm_papers / tot_papers, 0))
        ])
        all_papers += tot_papers
        all_pc_papers += comm_papers
    table.add_row([
        "all", all_papers, all_pc_papers,
        int(round(100 * all_pc_papers / all_papers, 0))
    ])
    print("```")
    print(table)
    print("```")
Example #23
def pc_topics_heatmap(year_range=None):
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  max_len = 21
  start = 1993
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conference_topics = {}
  for conference in mysql.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    topics = np.array([0] * lda_model.n_topics)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if (year_range is not None) and (int(year) not in year_range):
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        if author_ids.intersection(committee):
          continue
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference.id] = topics
  heatmap_arr = []
  for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
    tot = sum(conference_topics[conference_id])
    dist = [top / tot for top in conference_topics[conference_id]]
    heatmap_arr.append(dist)
  row_labels = range(lda_model.n_topics)
  column_labels = [c.acronym for c in mysql.get_conferences()]
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/pc/pc_heatmap_7topics.png")
Example #24
def conference_evolution_2(paper_range, figname):
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  for conference_id, papers in conferences.items():
    topics = np.array([0] * lda_model.n_topics)
    for tup in yearize(papers).items():
      if tup[0] not in paper_range:
        continue
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference_id] = percent_sort(topics)
    conference_heatmaps[conference_id] = topics
  n_top_words = 10
  #Heatmap
  heatmap_arr = []
  column_labels = []
  for conference_id, conf in zip(sorted(conference_heatmaps.keys(), key=lambda x: int(x)), mysql.get_conferences()):
    tot = sum(conference_heatmaps[conference_id])
    if tot == 0: continue
    column_labels.append(conf.acronym)
    dist = [top/tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  fig, ax = plt.subplots()
  heatmap_arr = np.array(heatmap_arr)
  heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
  plt.ylabel("Conferences")
  plt.xlabel("Topics")
  row_labels = range(lda_model.n_topics)
  ax.set_xticks(np.arange(heatmap_arr.shape[1])+0.5, minor=False)
  ax.set_yticks(np.arange(heatmap_arr.shape[0])+0.5, minor=False)
  ax.set_xticklabels(row_labels, minor=False)
  ax.set_yticklabels(column_labels, minor=False)
  plt.savefig("figs/diversity/heatmap_7topics.png")
  plt.clf()
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/dend_heatmap_7topics.png")
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/evolution/%s.png"%figname)
Example #25
File: ist.py Project: ai-se/citemap
def pc_paper_count_table():
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  graph = cite_graph(GRAPH_CSV)
  start = 2009
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  header = ["conf", "# Accepted", "# from PC", "Percentage"]
  table = PrettyTable(header)
  all_papers, all_pc_papers = 0, 0
  for conference in mysql.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    comm_papers = 0
    tot_papers = 0
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if int(year) < start: continue
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        if author_ids.intersection(committee):
          comm_papers += 1
        tot_papers += 1
    table.add_row([conference.acronym, tot_papers, comm_papers, int(round(100 * comm_papers / tot_papers, 0))])
    all_papers += tot_papers
    all_pc_papers += comm_papers
  table.add_row(["all", all_papers, all_pc_papers, int(round(100 * all_pc_papers / all_papers, 0))])
  print("```")
  print(table)
  print("```")
Example #26
File: ist.py Project: ai-se/citemap
def print_top_authors(file_name, top_percent=None, min_year=None):
  graph = cite_graph(GRAPH_CSV)
  tops = top_authors(graph, top_percent=top_percent, min_year=min_year)
  author_papers = graph.get_papers_by_authors()
  top_tups = []
  for author_id, author in graph.author_nodes.items():
    if author_id in tops:
      papers = author_papers.get(author_id, None)
      if papers is None: continue
      total_cites = 0
      counts = 0
      for paper_tup in papers:
        if min_year is not None and int(paper_tup[1]) < min_year: continue
        paper_id = paper_tup[0]
        total_cites += graph.paper_nodes[paper_id].local_cites
        counts += 1
      top_tups.append((author.name, counts, total_cites))
  top_tups = sorted(top_tups, key=lambda x: x[-1], reverse=True)
  with open(file_name, "wb") as f:
    for top_tup in top_tups:
      f.write(str(top_tup))
      f.write("\n")
Example #27
def print_top_authors(top_percent=None, min_year=None):
    graph = cite_graph(GRAPH_CSV)
    tops = top_authors(graph, top_percent=top_percent, min_year=min_year)
    author_papers = graph.get_papers_by_authors()
    top_tups = []
    for author_id, author in graph.author_nodes.items():
        if author_id in tops:
            papers = author_papers.get(author_id, None)
            if papers is None: continue
            total_cites = 0
            counts = 0
            for paper_tup in papers:
                if min_year is not None and int(paper_tup[1]) < min_year:
                    continue
                paper_id = paper_tup[0]
                total_cites += graph.paper_nodes[paper_id].local_cites
                counts += 1
            top_tups.append((author.name, counts, total_cites))
    top_tups = sorted(top_tups, key=lambda x: x[-1], reverse=True)
    with open("temp_all.txt", "wb") as f:
        for top_tup in top_tups:
            f.write(str(top_tup))
            f.write("\n")
Example #28
def get_top_papers():
  n_topics = 7
  top_papers = {}
  for index in range(n_topics):
    top_papers[index] = []
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  for paper_id, paper in graph.paper_nodes.items():
    topics = miner.documents[paper_id].topics_count
    # if int(paper.year) < 2009: continue
    if max(topics) == 0:
      continue
    topic = topics.argmax()
    # cites = len(paper.cites.split(",")) if paper.cites else 0
    cites = paper.local_cites
    top_papers[topic].append([(cites, paper.title, paper.authors, paper.year)])
  for index in range(n_topics):
    top_papers[index] = sorted(top_papers[index], reverse=True)[:4]
    print("***", index, "***")
    for paper in top_papers[index]:
      paper = paper[0]
      print(paper[0], paper[-1] + " - " + paper[1] + ", " + paper[2])
Example #29
def paper_bar(start=1992, end=2016):
  print("PAPER BAR for %s" % THE.permitted)
  graph = cite_graph(GRAPH_CSV)
  venues = graph.get_papers_by_venue(permitted=THE.permitted)
  year_count = {}
  for year in range(start, end + 1):
    year_count[year] = 0
  for conference_id, papers in venues.items():
    for tup in papers:
      count = year_count.get(int(tup[1]), None)
      if count is None: continue
      year_count[int(tup[1])] += 1
  bar_x, bar_y = [], []
  for year, count in year_count.items():
    bar_x.append(year)
    bar_y.append(count)
  plt.figure(figsize=(8, 3))
  plt.bar(bar_x, bar_y, color='blue', align='center')
  plt.xlim([start - 1, end + 1])
  plt.xticks(bar_x, rotation=45)
  plt.xlabel('Year')
  plt.ylabel('# of Papers')
  plt.savefig("figs/v3/%s/paper_count.png" % THE.permitted, bbox_inches='tight')
  plt.clf()
Example #30
def conference_evolution():
    legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]
    non_legit_conferences = ["GPCE", "FASE"]
    TOP_TOPIC_COUNT = 7
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    conferences = graph.get_papers_by_venue()
    f, subplts = plt.subplots(3, 3)
    f.tight_layout()
    y_counter = -1
    x_counter = 0
    for conf_index, conference in enumerate(mysql.get_conferences()):
        # if conference.acronym not in legit_conferences: continue
        if conference.acronym in non_legit_conferences: continue
        y_counter += 1
        if y_counter > 2:
            x_counter += 1
            y_counter = 0
        year_topics = {}
        year_heatmaps = {}
        for year, papers in yearize(conferences[conference.id]).items():
            topics = np.array([0] * lda_model.n_topics)
            for paper_id in papers:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
            year_heatmaps[year] = topics
            year_topics[year] = percent_sort(topics)
        width = 0.8
        x_axis = np.arange(1, len(year_topics.keys()) + 1)
        # x_axis = [c.acronym for c in mysql.get_conferences()]
        y_offset = np.array([0] * len(year_topics.keys()))
        colors_dict = {}
        for index in range(TOP_TOPIC_COUNT):
            bar_val, color = [], []
            for year in sorted(year_topics.keys(), key=lambda x: int(x)):
                topic = year_topics[year][index]
                colors_dict[topic[0]] = get_color(topic[0])
                color.append(colors_dict[topic[0]])
                bar_val.append(topic[1])
            subplts[x_counter, y_counter].bar(x_axis,
                                              bar_val,
                                              width,
                                              color=color,
                                              bottom=y_offset)
            y_offset = np.add(y_offset, bar_val)
        # subplts[x_counter, y_counter].set_ylabel("Topic Coverage %")
        #subplts[x_counter, y_counter].set_xlabel("Conferences")
        if len(year_topics.keys()) <= 14:
            subplts[x_counter, y_counter].set_xticks(x_axis + width / 2)
            subplts[x_counter, y_counter].set_xticklabels(
                [str(y)[2:] for y in sorted(year_topics.keys(), key=lambda x: int(x))],
                fontsize=7)
        else:
            subplts[x_counter, y_counter].set_xticks(
                np.arange(1, len(year_topics.keys()) + 1, 2) + width / 2)
            subplts[x_counter, y_counter].set_xticklabels(
                [str(y)[2:] for index, y in enumerate(sorted(year_topics.keys(), key=lambda x: int(x)))
                 if index % 2 == 0],
                fontsize=7)

        subplts[x_counter, y_counter].set_yticks(np.arange(0, 101, 20))
        subplts[x_counter, y_counter].set_ylim([0, 101])
        subplts[x_counter, y_counter].set_title(conference.acronym)
    # Legends
    patches = []
    labels = []
    for topic in xrange(lda_model.n_topics):
        patches.append(mpatches.Patch(color=get_color(topic)))
        labels.append('Topic %s' % str(topic))
    f.legend(handles=patches,
             labels=labels,
             loc='upper center',
             bbox_to_anchor=(0.5, 0.04),
             ncol=12,
             fontsize=7)
    plt.savefig("figs/evolution/evolution_7topics.png")
    plt.clf()
    n_top_words = 10
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Example #31
def get_graph_lda_data(iterations=ITERATIONS):
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(N_TOPICS, n_iter=iterations, alpha=ALPHA, beta=BETA)
  return miner, graph, lda_model, vocab
Example #32
def get_graph_lda_data():
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph, THE.permitted)
  lda_model, vocab = miner.lda(get_n_topics(), n_iter=ITERATIONS, alpha=ALPHA, beta=BETA, stop_words=STOP_WORDS)
  return miner, graph, lda_model, vocab
Example #33
def conference_diversity():
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  for conference_id, papers in conferences.items():
    topics = np.array([0] * lda_model.n_topics)
    for tup in yearize(papers).items():
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference_id] = percent_sort(topics)
    conference_heatmaps[conference_id] = topics
  #fig, ax = plt.subplots()
  bar_vals = []
  colors = []
  width = 0.75
  plts = []
  x_axis = np.arange(1, len(conference_topics.keys())+1)
  #x_axis = [c.acronym for c in mysql.get_conferences()]
  y_offset = np.array([0]*len(conference_topics.keys()))
  colors_dict = {}
  for index in range(7):
    bar_val = []
    color = []
    for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
      topic = conference_topics[conference_id][index]
      colors_dict[topic[0]] = get_color(topic[0])
      color.append(colors_dict[topic[0]])
      bar_val.append(topic[1])
    plts.append(plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
    y_offset = np.add(y_offset, bar_val)
  plt.ylabel("Topic Coverage %")
  plt.xlabel("Conferences")
  plt.xticks(x_axis+width/2, [c.acronym for c in mysql.get_conferences()])
  plt.yticks(np.arange(0, 101, 10))
  plt.ylim([0,101])
  #Legends
  patches = []
  for topic, color in colors_dict.items():
    patches.append(mpatches.Patch(color=color, label='Topic %s' % str(topic)))
  plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=6, fontsize=7)
  plt.savefig("figs/diversity/conference_diversity_7topics.png")
  plt.clf()
  n_top_words = 10
  #Heatmap
  heatmap_arr = []
  for conference_id in sorted(conference_heatmaps.keys(), key=lambda x: int(x)):
    tot = sum(conference_heatmaps[conference_id])
    dist = [top/tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  fig, ax = plt.subplots()
  heatmap_arr = np.array(heatmap_arr)
  heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
  plt.ylabel("Conferences")
  plt.xlabel("Topics")
  # row_labels = range(lda_model.n_topics)
  t_names= ["Testing", "Applications", "Program Analysis", "Tools and Projects",
            "Defect Analysis", "Modeling", "Maintenance"]
  row_labels = [str(ind) + "-" + name for ind, name in zip(range(lda_model.n_topics), t_names)]
  column_labels = [c.acronym for c in mysql.get_conferences()]
  ax.set_xticks(np.arange(heatmap_arr.shape[1])+0.5, minor=False)
  ax.set_yticks(np.arange(heatmap_arr.shape[0])+0.5, minor=False)
  ax.set_xticklabels(row_labels, minor=False)
  ax.set_yticklabels(column_labels, minor=False)
  plt.savefig("figs/diversity/heatmap_7topics.png")
  plt.clf()
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/dend_heatmap_7topics.png")
  make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/heatmap2.png")
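Note: the pcolor heatmap with cell-centered tick labels, reduced to its core (self-contained sketch with random data):

import numpy as np
import matplotlib.pyplot as plt

data = np.random.rand(3, 5)                    # rows: conferences, cols: topics
fig, ax = plt.subplots()
ax.pcolor(data, cmap=plt.cm.Reds)
ax.set_xticks(np.arange(data.shape[1]) + 0.5)  # +0.5 centers labels on cells
ax.set_yticks(np.arange(data.shape[0]) + 0.5)
ax.set_xticklabels(range(data.shape[1]))
ax.set_yticklabels(["A", "B", "C"])
plt.savefig("heatmap_demo.png")
plt.clf()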
Example #34
def pc_bias():
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]
  colors = ['r', 'g', 'b', 'y']
  graph = cite_graph(GRAPH_CSV)
  width = 0.5
  space = 0.3
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  max_len = 21
  low = 1
  high = max_len * (len(legit_conferences)*width + space) + 1
  delta = (high - low)/max_len
  x_axis = np.arange(low, high, delta)
  x_ticks = np.arange(1993, 1993+max_len)
  conf_index = 0
  patches = []
  for conference in mysql.get_conferences():
    if conference.acronym not in legit_conferences: continue
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    y_axis = []
    #x_axis = np.arange(1, len(year_committees.keys())+1)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      papers = year_papers.get(year,None)
      if papers is None:
        y_axis.append(0)
        continue
      committee = year_committees[year]
      comm_papers = 0
      non_comm_papers = 0
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        if author_ids.intersection(committee):
          comm_papers += 1
        else:
          non_comm_papers += 1
      year_scores[year] = (comm_papers, non_comm_papers)
      percent = 0 if not comm_papers else comm_papers*100/(comm_papers+non_comm_papers)
      y_axis.append(percent)
    y_axis = np.lib.pad(y_axis, (max_len-len(y_axis), 0), 'constant', constant_values=0)
    plt.bar(x_axis+conf_index*width, y_axis, width=width, color=colors[conf_index])
    patches.append(mpatches.Patch(color=colors[conf_index], label=conference.acronym))
    conf_index += 1
  plt.xlabel("Year")
  plt.ylabel("% of papers by PC")
  plt.xticks(x_axis + len(legit_conferences)*width/2, [str(y)[2:] for y in x_ticks])
  #plt.yticks(np.arange(0, 100, 10))
  #plt.title(conference.acronym)
  plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=len(legit_conferences), fontsize=7)
  plt.savefig("figs/pc/pc.png")
  plt.clf()
Example #35
def predict_venues(estimators,
                   is_independent=IS_INDEPENDENT_VENUE,
                   n_folds=5,
                   n_topics=N_TOPICS,
                   alpha=ALPHA,
                   beta=BETA,
                   n_iter=100,
                   min_tfidf_score=0.1,
                   tfidf_top=100,
                   random_state=RANDOM_STATE):
    def make_key(pred, pre_proc):
        return "%s - %s" % (pred.__name__, pre_proc.__name__)

    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    papers, groups = get_papers_and_groups(graph,
                                           is_independent=is_independent)
    metrics_map = {
        make_key(predictor, preprocessor): []
        for predictor, preprocessor in estimators
    }
    for index, (train_x, train_y, test_x,
                test_y) in enumerate(split(papers, groups, n_folds=n_folds)):
        print("#### Iteration %d" % (index + 1))
        # TSNE
        process_embeddings(index, train_x, test_x)
        # Count Vectorizer
        vectorizer = CountVectorizer(stop_words=STOP_WORDS,
                                     token_pattern=TOKEN_PATTERN)
        train_docs = [paper.raw for paper in train_x]
        test_docs = [paper.raw for paper in test_x]
        train_vectorized = vectorizer.fit_transform(train_docs).toarray()
        test_vectorized = vectorizer.transform(test_docs).toarray()
        # TFIDF
        tfidf_transformer = TfidfTransformer()
        tfidf_matrix = tfidf_transformer.fit_transform(
            train_vectorized).toarray()
        tfidf_matrix[tfidf_matrix < min_tfidf_score] = 0
        tfidf_means = np.mean(tfidf_matrix, axis=0)
        tfidf_top_indices = np.argsort(tfidf_means)[::-1][:tfidf_top]
        # LDA-DE
        alpha = alpha if alpha else 50.0 / N_TOPICS  # float literal avoids Python 2 integer division
        beta = beta if beta else 0.01
        lda_model = lda.LDA(n_topics=n_topics,
                            alpha=alpha,
                            eta=beta,
                            n_iter=n_iter,
                            random_state=random_state)
        train_transformed = lda_model.fit_transform(train_vectorized)
        # Putting it together
        for i, (vectorized,
                topics) in enumerate(zip(train_vectorized, train_transformed)):
            train_x[i].vectorized = vectorized[tfidf_top_indices]
            train_x[i].topics_count = topics
            sum_t = sum(topics)
            sum_t = sum_t if sum_t else 0.00001
            train_x[i].topics_score = [np.float(t / sum_t) for t in topics]
        test_transformed = lda_model.transform(test_vectorized)
        for i, (vectorized,
                topics) in enumerate(zip(test_vectorized, test_transformed)):
            test_x[i].vectorized = vectorized[tfidf_top_indices]
            test_x[i].topics_count = topics
            sum_t = sum(topics)
            sum_t = sum_t if sum_t else 0.00001
            test_x[i].topics_score = [t / sum_t for t in topics]
        for predictor, preprocessor in estimators:
            key = make_key(predictor, preprocessor)
            print(key)
            predicted, metrics = predictor(preprocessor, train_x, train_y,
                                           test_x, test_y)
            metrics_map[key].append(metrics)
    for predictor, preprocessor in estimators:
        key = make_key(predictor, preprocessor)
        print("### " + key)
        measures = Metrics.avg_score(metrics_map[key])
        print(measures)
Example #36
def conference_evolution():
  legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]
  non_legit_conferences = ["GPCE", "FASE"]
  TOP_TOPIC_COUNT = 7
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  f, subplts = plt.subplots(3, 3)
  f.tight_layout()
  y_counter = -1
  x_counter = 0
  for conf_index, conference in enumerate(mysql.get_conferences()):
    # if conference.acronym not in legit_conferences: continue
    if conference.acronym in non_legit_conferences: continue
    y_counter += 1
    if y_counter > 2:
      x_counter += 1
      y_counter = 0
    year_topics = {}
    year_heatmaps = {}
    for year, papers in yearize(conferences[conference.id]).items():
      topics = np.array([0]*lda_model.n_topics)
      for paper_id in papers:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
      year_heatmaps[year] = topics
      year_topics[year] = percent_sort(topics)
    width = 0.8
    x_axis = np.arange(1, len(year_topics.keys()) + 1)
    # x_axis = [c.acronym for c in mysql.get_conferences()]
    y_offset = np.array([0] * len(year_topics.keys()))
    colors_dict={}
    for index in range(TOP_TOPIC_COUNT):
      bar_val, color = [], []
      for year in sorted(year_topics.keys(), key=lambda x:int(x)):
        topic = year_topics[year][index]
        colors_dict[topic[0]] = get_color(topic[0])
        color.append(colors_dict[topic[0]])
        bar_val.append(topic[1])
      subplts[x_counter, y_counter].bar(x_axis, bar_val, width, color=color, bottom=y_offset)
      y_offset = np.add(y_offset, bar_val)
    # subplts[x_counter, y_counter].set_ylabel("Topic Coverage %")
    #subplts[x_counter, y_counter].set_xlabel("Conferences")
    if len(year_topics.keys()) <= 14:
      subplts[x_counter, y_counter].set_xticks(x_axis + width / 2)
      subplts[x_counter, y_counter].set_xticklabels([str(y)[2:] for y in sorted(year_topics.keys(), key=lambda x: int(x))], fontsize=7)
    else:
      subplts[x_counter, y_counter].set_xticks(np.arange(1, len(year_topics.keys()) + 1, 2) + width / 2)
      subplts[x_counter, y_counter].set_xticklabels([str(y)[2:] for index, y in enumerate(sorted(year_topics.keys(), key=lambda x: int(x))) if index%2 == 0], fontsize=7)

    subplts[x_counter, y_counter].set_yticks(np.arange(0, 101, 20))
    subplts[x_counter, y_counter].set_ylim([0,101])
    subplts[x_counter, y_counter].set_title(conference.acronym)
  # Legends
  patches = []
  labels = []
  for topic in xrange(lda_model.n_topics):
    patches.append(mpatches.Patch(color=get_color(topic)))
    labels.append('Topic %s' % str(topic))
  f.legend(handles=patches, labels=labels, loc='upper center', bbox_to_anchor=(0.5, 0.04), ncol=12, fontsize=7)
  plt.savefig("figs/evolution/evolution_7topics.png")
  plt.clf()
  n_top_words = 10
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
Example #37
def conference_diversity():
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    conferences = graph.get_papers_by_venue()
    conference_topics = {}
    conference_heatmaps = {}
    for conference_id, papers in conferences.items():
        topics = np.array([0] * lda_model.n_topics)
        for tup in yearize(papers).items():
            for paper_id in tup[1]:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        conference_topics[conference_id] = percent_sort(topics)
        conference_heatmaps[conference_id] = topics
    #fig, ax = plt.subplots()
    bar_vals = []
    colors = []
    width = 0.75
    plts = []
    x_axis = np.arange(1, len(conference_topics.keys()) + 1)
    #x_axis = [c.acronym for c in mysql.get_conferences()]
    y_offset = np.array([0] * len(conference_topics.keys()))
    colors_dict = {}
    for index in range(7):
        bar_val = []
        color = []
        for conference_id in sorted(conference_topics.keys(),
                                    key=lambda x: int(x)):
            topic = conference_topics[conference_id][index]
            colors_dict[topic[0]] = get_color(topic[0])
            color.append(colors_dict[topic[0]])
            bar_val.append(topic[1])
        plts.append(
            plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
        y_offset = np.add(y_offset, bar_val)
    plt.ylabel("Topic Coverage %")
    plt.xlabel("Conferences")
    plt.xticks(x_axis + width / 2,
               [c.acronym for c in mysql.get_conferences()])
    plt.yticks(np.arange(0, 101, 10))
    plt.ylim([0, 101])
    #Legends
    patches = []
    for topic, color in colors_dict.items():
        patches.append(
            mpatches.Patch(color=color, label='Topic %s' % str(topic)))
    plt.legend(handles=patches,
               loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               ncol=6,
               fontsize=7)
    plt.savefig("figs/diversity/conference_diversity_7topics.png")
    plt.clf()
    n_top_words = 10
    #Heatmap
    heatmap_arr = []
    for conference_id in sorted(conference_heatmaps.keys(),
                                key=lambda x: int(x)):
        tot = sum(conference_heatmaps[conference_id])
        # Float division keeps the distribution from truncating to zeros
        # under Python 2 integer division.
        dist = [top / float(tot) for top in conference_heatmaps[conference_id]]
        heatmap_arr.append(dist)
    fig, ax = plt.subplots()
    heatmap_arr = np.array(heatmap_arr)
    heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
    plt.ylabel("Conferences")
    plt.xlabel("Topics")
    # row_labels = range(lda_model.n_topics)
    t_names = [
        "Testing", "Applications", "Program Analysis", "Tools and Projects",
        "Defect Analysis", "Modeling", "Maintenance"
    ]
    row_labels = [
        str(ind) + "-" + name
        for ind, name in zip(range(lda_model.n_topics), t_names)
    ]
    column_labels = [c.acronym for c in mysql.get_conferences()]
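    # After pcolor, the x axis runs over topics (columns of heatmap_arr)
    # and the y axis over conferences (rows), so the topic names go on x
    # and the conference acronyms on y.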
    ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
    ax.set_xticklabels(row_labels, minor=False)
    ax.set_yticklabels(column_labels, minor=False)
    plt.savefig("figs/diversity/heatmap_7topics.png")
    plt.clf()
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
    # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels, "figs/diversity/dend_heatmap_7topics.png")
    make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                 "figs/diversity/heatmap2.png")
Example No. 38
0
def topic_evolution():
    graph = cite_graph(GRAPH_CSV)
    miner = Miner(graph)
    lda_model, vocab = miner.lda(7,
                                 n_iter=100,
                                 alpha=0.847433736937,
                                 beta=0.763774618977)
    paper_nodes = graph.paper_nodes
    topics_map = {}
    n_topics = lda_model.n_topics
    for paper_id, paper in paper_nodes.items():
        document = miner.documents[paper_id]
        year_topics = topics_map.get(paper.year, np.array([0] * n_topics))
        topics_map[paper.year] = np.add(year_topics, document.topics_count)
    yt_map = {}
    for year, t_count in topics_map.items():
        yt_map[year] = percent_sort(t_count)
    width = 0.8
    plts = []
    x_axis = np.arange(1, len(yt_map.keys()) + 1)
    # x_axis = [c.acronym for c in mysql.get_conferences()]
    y_offset = np.array([0] * len(yt_map.keys()))
    colors_dict = {}
    TOP_TOPIC_COUNT = 7
    for index in range(TOP_TOPIC_COUNT):
        bar_val, color = [], []
        for year in sorted(yt_map.keys(), key=lambda x: int(x)):
            topic = yt_map[year][index]
            colors_dict[topic[0]] = get_color(topic[0])
            color.append(colors_dict[topic[0]])
            bar_val.append(topic[1])
        plts.append(
            plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
        y_offset = np.add(y_offset, bar_val)
    plt.ylabel("Topic %")
    plt.xlabel("Year")
    plt.xticks(
        x_axis + width / 2,
        [str(y)[2:] for y in sorted(yt_map.keys(), key=lambda x: int(x))])
    plt.yticks(np.arange(0, 101, 10))
    plt.ylim([0, 101])
    # Legends
    patches = []
    t_names = [
        "Testing", "Applications", "Program Analysis", "Tools and Projects",
        "Defect Analysis", "Modeling", "Maintenance"
    ]
    # Iterate topics in sorted order so that each patch is paired with
    # its own name; plain dict iteration order would not guarantee that
    # the patches and t_names line up.
    for topic in sorted(colors_dict.keys()):
        patches.append(
            mpatches.Patch(color=colors_dict[topic], label=t_names[topic]))
    plt.legend(handles=patches,
               loc='upper center',
               bbox_to_anchor=(0.5, 1.14),
               ncol=4,
               fontsize=11,
               handlelength=0.7)
    plt.savefig("figs/topic_evolution/topic_evolution_7_gib.png")
    plt.clf()
    n_top_words = 10
    for index, topic_dist in enumerate(lda_model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        print('Topic {}: {}'.format(index, ', '.join(topic_words)))
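
The topics_count attribute on each mined document is assumed to be a vector of per-topic token counts for that document. With the fitted lda model used above, it could be approximated from the model's doc_topic_ distribution along these lines (doc_lengths is a hypothetical per-document token count, not part of the original code):

import numpy as np

def approx_topics_count(lda_model, doc_index, doc_lengths):
    # doc_topic_[i] is the topic probability distribution of document i;
    # scaling by the document's token count approximates per-topic counts.
    probs = lda_model.doc_topic_[doc_index]
    return np.rint(probs * doc_lengths[doc_index]).astype(int)
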
Example No. 39
0
def pc_bias():
    def index_by_year(tups):
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]
    colors = ['r', 'g', 'b', 'y']
    graph = cite_graph(GRAPH_CSV)
    width = 0.5
    space = 0.3
    p_conferences = graph.get_papers_by_venue()
    p_committees = graph.get_committee_by_conference()
    max_len = 21
    low = 1
    high = max_len * (len(legit_conferences) * width + space) + 1
    delta = (high - low) / max_len
    x_axis = np.arange(low, high, delta)
    x_ticks = np.arange(1993, 1993 + max_len)
    conf_index = 0
    patches = []
    for conference in mysql.get_conferences():
        if conference.acronym not in legit_conferences: continue
        year_committees = index_by_year(p_committees[conference.id])
        year_papers = index_by_year(p_conferences[conference.id])
        year_scores = {}
        y_axis = []
        #x_axis = np.arange(1, len(year_committees.keys())+1)
        for year in sorted(year_committees.keys(), key=lambda y: int(y)):
            papers = year_papers.get(year, None)
            if papers is None:
                y_axis.append(0)
                continue
            committee = year_committees[year]
            comm_papers = 0
            non_comm_papers = 0
            for paper_id in papers:
                paper = graph.paper_nodes[paper_id]
                author_ids = set(paper.author_ids.strip().split(","))
                if author_ids.intersection(committee):
                    comm_papers += 1
                else:
                    non_comm_papers += 1
            year_scores[year] = (comm_papers, non_comm_papers)
            # 100.0 forces float division (both counts are ints).
            percent = 0 if not comm_papers else comm_papers * 100.0 / (
                comm_papers + non_comm_papers)
            y_axis.append(percent)
        y_axis = np.pad(y_axis, (max_len - len(y_axis), 0),
                        'constant',
                        constant_values=0)
        plt.bar(x_axis + conf_index * width,
                y_axis,
                width=width,
                color=colors[conf_index])
        patches.append(
            mpatches.Patch(color=colors[conf_index], label=conference.acronym))
        conf_index += 1
    plt.xlabel("Year")
    plt.ylabel("% of papers by PC")
    plt.xticks(x_axis + len(legit_conferences) * width / 2,
               [str(y)[2:] for y in x_ticks])
    #plt.yticks(np.arange(0, 100, 10))
    #plt.title(conference.acronym)
    plt.legend(handles=patches,
               loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               ncol=len(legit_conferences),
               fontsize=7)
    plt.savefig("figs/pc/pc.png")
    plt.clf()
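
A quick sanity check of index_by_year() on the (id, year) tuple format that get_papers_by_venue() and get_committee_by_conference() appear to return (the ids and years below are made up):

def index_by_year(tups):
    y_comm = {}
    for entity_id, year in tups:
        y_comm.setdefault(year, set()).add(entity_id)
    return y_comm

assert index_by_year([("a1", "1999"), ("a2", "1999"), ("a3", "2000")]) == \
    {"1999": {"a1", "a2"}, "2000": {"a3"}}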