def get_and_graph(): schools_tree = es.get_tree("http://www.gla.ac.uk/schools/") ns = 'http://exslt.org/regular-expressions' path = '//div[@class="row standardContent"]//a[re:match(@href, "schools/[A-Za-z]+/")]' a_elems = schools_tree.xpath(path, namespaces={'re':ns}) base_url = "http://www.gla.ac.uk" urls = [] names = [] for a in a_elems: staff_page_url = base_url + a.get("href") + "staff/" urls.append(staff_page_url) school_name = a.text names.append(school_name) school_names_urls = zip(names, urls) print school_names_urls #remove SOCS as done already, physics for now cause it's huge for tup in school_names_urls[:]: if "Physics" in tup[0]: school_names_urls.remove(tup) # For each school for name, url in school_names_urls[10:]: print name, url if "Humanities" in name: name = "School of Humanities" author_name_urls = es.get_author_name_urls(name, url) # write these to file for safe keeping # ALREADY BEING DONE BY ES #with open("../nameurls/" + name + ".txt", 'w') as f: # json.dump(author_name_urls) coauthor_dict = es.get_coauthors_dict(author_name_urls, name) # extract just names from name urls and put in list #author_names = [author_name for author_name, author_url in author_name_urls] # Put names in Title First Name Last Name order for paper_id, data in coauthor_dict.items(): authors = data["authors"] newauthors = [(anu[0].split(", ")[1] + " " + anu[0].split(", ")[0], anu[1]) for anu in authors] coauthor_dict[paper_id]["authors"] = newauthors # Do the same for author_name_urls # TODO is this necessary? Because we're checking against urls - could even just give gm the urls author_name_urls = [(anu[0].split(", ")[1] + " " + anu[0].split(", ")[0], anu[1]) for anu in author_name_urls] # now make graph gm = gfd.GraphMaker() gm.populate_graph(coauthor_dict, author_name_urls) gm.add_metrics() gm.add_just_school_community() gm.write_to_file("../newestgraphs/" + name + ".json")
def get_enlighten_data():
    """Gets data for each school from Enlighten using enlighten_scraper.

    Returns a dict keyed by school name; each value is a tuple
    (data_dict, author_name_urls) where data_dict is a dictionary keyed by
    paper id with paper metadata as values, and author_name_urls is a list
    of (name, enlighten url) pairs for the authors in the school.
    """
    school_data = {}
    for schoolname, schoolurl in es.get_school_name_urls():
        author_name_urls = es.get_author_name_urls(schoolname, schoolurl)
        data_dict = es.get_coauthors_dict(author_name_urls, schoolname)
        school_data[schoolname] = (data_dict, author_name_urls)
    return school_data
a_elems = schools_tree.xpath(path, namespaces={'re':ns}) base_url = "http://www.gla.ac.uk" urls = [] names = [] for a in a_elems: staff_page_url = base_url + a.get("href") + "staff/" urls.append(staff_page_url) school_name = a.text names.append(school_name) school_names_urls = zip(names, urls) print school_names_urls start_index = int(sys.argv[1]) end_index = int(sys.argv[2]) # TODO REMOVE SLICING # THIS is temporary to just do the schools we haven't done yet for schl_name, schl_url in school_names_urls[start_index:end_index]: author_name_urls = es.get_author_name_urls(schl_url, schl_name) titles_dict = es.get_titles_dict(author_name_urls) stats = cs.Stats(authors_dict, name) stats.write_to_file("stats_results/stats_test.txt")