def identify():
    """Guess a gender for every author name and persist the results.

    First names are looked up via from_db(); unresolved names fall back to
    the gender_guesser detector.  Only confident guesses (keys containing
    "female"/"male", which covers "mostly_female"/"mostly_male" too) are
    written back via mysql.update_genders(); "unknown" and "andy"
    (androgynous) results are skipped.
    """
    d = gender_guesser.detector.Detector(case_sensitive=False)
    authors = mysql.get_authors()
    counts = {}
    for a_id, node in authors.items():
        if node.name:
            f_name = get_first_name(node.name)
            g = from_db(f_name)
            if g is None:
                g = d.get_gender(f_name)
            # setdefault+append instead of `counts.get(g, []) + [node]`,
            # which rebuilt the whole list on every author (O(n^2)).
            counts.setdefault(g, []).append(node)
            node.f_name = f_name
    author_genders = []
    for key, nodes in counts.items():
        # NOTE: "female" must be tested before "male" — "male" is a
        # substring of "female", so the order of these checks matters.
        if "female" in key:
            gender = "f"
        elif "male" in key:
            gender = "m"
        else:
            continue
        for node in nodes:
            author_genders.append((gender, int(node.id)))
    mysql.update_genders(author_genders)
def get_author_genders():
    """Return {author_id: gender} for every author whose gender is set.

    Authors with a falsy gender (None / empty) are omitted entirely.
    """
    return {a_id: node.gender
            for a_id, node in mysqldb.get_authors().items()
            if node.gender}
def get_authors_by_h_index(save_file, top_count=100):
    """Print the `top_count` authors ranked by a precomputed h-index.

    save_file: path to a pickle of {author_id: h_index_score}.
    Output: one line per author — rank, name, score — highest score first.
    """
    # Pickle data must be read in binary mode; text mode can corrupt the
    # stream (and fails outright on Python 3).
    with open(save_file, "rb") as f:
        author_data = cPkl.load(f)
    authors = mysql.get_authors()
    h_index_results = sorted(
        [(a_id, authors[a_id]['name'], score)
         for a_id, score in author_data.items()],
        key=lambda x: x[2], reverse=True)
    for i, result in enumerate(h_index_results[:top_count]):
        print(i + 1, result[1], result[2])
def get_author_genders():
    """Return {author_id: gender} for all authors, "u" (unknown) if unset."""
    return {a_id: (node.gender if node.gender else "u")
            for a_id, node in mysql.get_authors().items()}
def print_top_author_names(file_name):
    """Print the names of the 100 top-scoring authors in a page-rank pickle.

    file_name: path to a pickle of {author_id: score}.
    """
    authors = mysql.get_authors()
    # BUG FIX: the old code built a list of bare name strings and sorted it
    # with key=lambda x: x[1], i.e. by each name's SECOND CHARACTER.  The
    # commented-out tuple showed the intent: rank by score, descending.
    # Sort the (author_id, score) pairs by score, then map to names.
    ranked = sorted(open_pkl(file_name).items(),
                    key=lambda x: x[1], reverse=True)
    pr_results = [authors[a_id]['name'] for a_id, _score in ranked]
    print(pr_results[:100])
def plot_damp_top_authors(folder, damps, top, min_year, plot_author_count=20, show_legend=True):
    """Plot page-rank scores of the top cited authors across damping factors.

    For each damping factor in `damps`, loads
    figs/<version>/<permitted>/authors/<folder>/page_rank_<damp>.pkl
    ({author_id: score}) and draws one line over the top
    `plot_author_count` most-cited authors (since `min_year`), ordered by
    their mean score across all damping factors.  Saves the figure to
    figs/<version>/<permitted>/authors/damp_<folder>.png.
    """
    graph = cite_graph(GRAPH_CSV)
    top_authors = most_cited_authors(graph, top, min_year)[:plot_author_count]
    x_axis = range(1, plot_author_count + 1)
    top_author_ids = np.array([a[0] for a in top_authors])
    folder_path = "figs/%s/%s/authors/%s" % (THE.version, THE.permitted, folder)
    palette = np.array(sns.color_palette("hls", plot_author_count))
    legends = []
    y_axes = []
    # Accumulates each author's score summed over all damping factors; used
    # only for ordering, so the sum works as well as the mean.
    means = np.array([0.0] * plot_author_count)
    plt.figure(figsize=(8, 2))
    for damp in damps:
        file_name = "%s/page_rank_%0.2f.pkl" % (folder_path, damp)
        # Pickle files must be opened in binary mode.
        with open(file_name, "rb") as f:
            pr_scores = cPkl.load(f)
        y_axis = np.array([pr_scores[a] for a in top_author_ids])
        y_axes.append(y_axis)
        means += y_axis
    # Re-order authors left-to-right by descending aggregate score.
    indices = np.argsort(means)[::-1]
    top_author_ids = top_author_ids[indices]
    sns.set_style("white")
    for i, y_axis in enumerate(y_axes):
        plt.plot(x_axis, y_axis[indices], c=palette[i])
        legends.append("%0.2f" % damps[i])
    if show_legend:
        plt.legend(legends, bbox_to_anchor=(-0.1, 1.15, 1.15, 0.2),
                   loc="lower left", mode="expand", borderaxespad=0, ncol=10)
    fig_name = "figs/%s/%s/authors/damp_%s.png" % (THE.version, THE.permitted, folder)
    plt.ylabel("Page Rank Score", fontsize=14)
    plt.xlabel("Author ID", fontsize=14)
    plt.xticks(x_axis, top_author_ids, rotation='vertical')
    plt.title("Page Rank Score for top %d cited author with varying damping factors" % plot_author_count)
    plt.savefig(fig_name, bbox_inches='tight')
    plt.clf()
def from_file(file_name, delimiter='$|$'):
    """Build a citation Graph from a delimited paper dump.

    The first line of `file_name` is a header whose lowercased fields become
    paper-node attribute names; every following line is one paper.  Builds:
      * paper nodes (indexed by id, and by ref_id when present),
      * "author" edges (author -> paper),
      * "collaborator" edges (one per co-author pair, with a count),
      * "cites" edges between papers resolved via ref_id,
    and annotates each paper with "local_cites" (citations from within this
    dump).  PC membership is attached from mysqldb.

    NOTE(review): the decode/encode round-trip below is Python 2 style —
    it strips invalid UTF-8 bytes while keeping byte strings throughout.
    """
    paper_nodes = {}
    author_edges = {}
    cite_edges = {}
    collaborator_edges = {}
    author_nodes = mysqldb.get_authors()
    ref_nodes = {}

    def add_collaborator_edges(authors):
        # One undirected "collaborator" edge per author pair, keyed by the
        # sorted id pair so (a, b) and (b, a) share a single counter.
        if len(authors) <= 1:
            return
        for i in range(len(authors)):
            for j in range(i + 1, len(authors)):
                low, high = min(authors[i].id, authors[j].id), max(
                    authors[i].id, authors[j].id)
                key = low + "-" + high
                e = collaborator_edges.get(key, None)
                if e is None:
                    e = Edge(source=low, target=high, edge_type="collaborator", count=1)
                else:
                    e.count += 1
                collaborator_edges[key] = e

    with open(file_name, 'rb') as f:
        column_names = f.readline().strip().lower().split(delimiter)
        # Iterate the file lazily instead of f.readlines(): avoids
        # materializing the entire dump in memory at once.
        for line in f:
            line = line.decode('utf-8', 'ignore').encode("utf-8")
            columns = line.strip().split(delimiter)
            paper_node = Node()
            for name, val in zip(column_names, columns):
                paper_node[name] = val
            paper_node["type"] = "paper"
            if paper_node.ref_id:
                ref_nodes[paper_node.ref_id] = paper_node
            paper_nodes[paper_node.id] = paper_node
            paper_authors = []
            for author_id, author in zip(
                    columns[AUTHOR_ID_INDEX].split(","),
                    columns[AUTHOR_NAME_INDEX].split(",")):
                author_node = author_nodes[author_id]
                paper_authors.append(author_node)
                edge = Edge(source=author_node.id, target=paper_node.id,
                            edge_type="author")
                author_edges[edge.id] = edge
            add_collaborator_edges(paper_authors)
    cited_counts = {}
    for paper_id, paper in paper_nodes.items():
        if not paper.ref_id:
            continue
        references = paper.cites
        if not references:
            continue
        source = ref_nodes[paper.ref_id]
        for ref_id in references.split(","):
            # Skip references to papers outside this dump.
            if not ref_nodes.get(ref_id, None):
                continue
            target = ref_nodes[ref_id]
            cited_counts[target.id] = cited_counts.get(target.id, 0) + 1
            edge = Edge(source=source.id, target=target.id, edge_type="cites")
            cite_edges[edge.id] = edge
    for paper_id, paper in paper_nodes.items():
        paper["local_cites"] = cited_counts.get(paper_id, 0)
    graph = Graph()
    graph.paper_nodes = paper_nodes
    graph.author_nodes = author_nodes
    graph.author_edges = author_edges
    graph.cite_edges = cite_edges
    graph.collaborator_edges = collaborator_edges
    graph.add_pc_membership(mysqldb.get_pc_membership())
    return graph
def from_file(file_name, delimiter='$|$'):
    """Build a citation Graph from a delimited paper dump.

    The first line of `file_name` is a header whose lowercased fields become
    paper-node attribute names; every following line is one paper.  Builds:
      * paper nodes (indexed by id, and by ref_id when present),
      * "author" edges (author -> paper),
      * "collaborator" edges (one per co-author pair, with a count),
      * "cites" edges between papers resolved via ref_id,
    and annotates each paper with "local_cites" (citations from within this
    dump).  PC membership is attached from mysqldb.

    NOTE(review): the decode/encode round-trip below is Python 2 style —
    it strips invalid UTF-8 bytes while keeping byte strings throughout.
    """
    paper_nodes = {}
    author_edges = {}
    cite_edges = {}
    collaborator_edges = {}
    author_nodes = mysqldb.get_authors()
    ref_nodes = {}

    def add_collaborator_edges(authors):
        # One undirected "collaborator" edge per author pair, keyed by the
        # sorted id pair so (a, b) and (b, a) share a single counter.
        if len(authors) <= 1:
            return
        for i in range(len(authors)):
            for j in range(i + 1, len(authors)):
                low, high = min(authors[i].id, authors[j].id), max(
                    authors[i].id, authors[j].id)
                key = low + "-" + high
                e = collaborator_edges.get(key, None)
                if e is None:
                    e = Edge(source=low, target=high, edge_type="collaborator", count=1)
                else:
                    e.count += 1
                collaborator_edges[key] = e

    with open(file_name, 'rb') as f:
        column_names = f.readline().strip().lower().split(delimiter)
        # Iterate the file lazily instead of f.readlines(): avoids
        # materializing the entire dump in memory at once.
        for line in f:
            line = line.decode('utf-8', 'ignore').encode("utf-8")
            columns = line.strip().split(delimiter)
            paper_node = Node()
            for name, val in zip(column_names, columns):
                paper_node[name] = val
            paper_node["type"] = "paper"
            if paper_node.ref_id:
                ref_nodes[paper_node.ref_id] = paper_node
            paper_nodes[paper_node.id] = paper_node
            paper_authors = []
            for author_id, author in zip(
                    columns[AUTHOR_ID_INDEX].split(","),
                    columns[AUTHOR_NAME_INDEX].split(",")):
                author_node = author_nodes[author_id]
                paper_authors.append(author_node)
                edge = Edge(source=author_node.id, target=paper_node.id,
                            edge_type="author")
                author_edges[edge.id] = edge
            add_collaborator_edges(paper_authors)
    cited_counts = {}
    for paper_id, paper in paper_nodes.items():
        if not paper.ref_id:
            continue
        references = paper.cites
        if not references:
            continue
        source = ref_nodes[paper.ref_id]
        for ref_id in references.split(","):
            # Skip references to papers outside this dump.
            if not ref_nodes.get(ref_id, None):
                continue
            target = ref_nodes[ref_id]
            cited_counts[target.id] = cited_counts.get(target.id, 0) + 1
            edge = Edge(source=source.id, target=target.id, edge_type="cites")
            cite_edges[edge.id] = edge
    for paper_id, paper in paper_nodes.items():
        paper["local_cites"] = cited_counts.get(paper_id, 0)
    graph = Graph()
    graph.paper_nodes = paper_nodes
    graph.author_nodes = author_nodes
    graph.author_edges = author_edges
    graph.cite_edges = cite_edges
    graph.collaborator_edges = collaborator_edges
    graph.add_pc_membership(mysqldb.get_pc_membership())
    return graph