def graphs_from_files(): filenames = ["Adam Smith Business School", "Dental School", "School of Chemistry", "School of Critical Studies", "School of Culture and Creative Arts", "School of Education" ] schools_tree = es.get_tree("http://www.gla.ac.uk/schools/") ns = 'http://exslt.org/regular-expressions' path = '//div[@class="row standardContent"]//a[re:match(@href, "schools/[A-Za-z]+/")]' a_elems = schools_tree.xpath(path, namespaces={'re':ns}) base_url = "http://www.gla.ac.uk" urls = [] names = [] for a in a_elems: staff_page_url = base_url + a.get("href") + "staff/" urls.append(staff_page_url) school_name = a.text names.append(school_name) school_names_urls = zip(names, urls) print school_names_urls for name, url in school_names_urls: if name in filenames: with open("../coauthor_data/" + name + ".txt") as f: d = json.load(f) staff_names = es.get_names(url) gm = gfd.GraphMaker(d, staff_names) gm.write_to_file(name + " graph")
def scrape_and_make(data_dict):
    """Build the collaboration graph and the author->keywords index.

    For every publication in ``data_dict`` (title -> info dict with
    "authors" and "keywords"), adds each author as a graph vertex, links
    co-authors, and accumulates each author's (stemmed, filtered)
    keywords in ``authorkw_dict`` keyed by the author's unique id.

    Side effects: writes the graph to "newcsgraph" via GraphMaker and
    persists ``authorkw_dict`` into the shelve file "authorkw.db".

    Returns:
        (clean_authorkw, graph) -- the cleaned author-keyword dict and
        the GraphMaker's graph object.
    """
    authorkw_dict = {}
    names = es.get_names("http://www.gla.ac.uk/schools/computing/staff/")
    gm = gfd.GraphMaker(data_dict, names)
    for title, info in data_dict.items():
        authors = info["authors"]
        # Drop empty keyword entries before stemming.
        keyword_list = [kw for kw in info["keywords"] if kw != ""]
        stemmed_kw_list = stem_kwlist(keyword_list)
        # Filter out stemmed scraping artefacts ("abstract available" boilerplate).
        stemmed_kw_list = [kw for kw in stemmed_kw_list if "abstract avail" not in kw]
        for author in authors:
            # Each author is a (name, unique id) pair; the id is the dict key.
            gm.add_vertex(author)
            author_name = author[0]
            # Encoded to a byte string so it can serve as a shelve key
            # (data currently loaded from JSON gives unicode ids).
            author_id = author[1].encode("utf-8")
            if author_id not in authorkw_dict:
                # BUG FIX: store a *copy* of the keyword list. Previously every
                # first-seen co-author of the same paper shared one list object,
                # so a later .extend() for one author silently grew the keyword
                # lists of all of them.
                authorkw_dict[author_id] = {"name": author_name,
                                            "keywords": list(stemmed_kw_list)}
            else:
                authorkw_dict[author_id]["keywords"].extend(stemmed_kw_list)
        # Link all co-authors of this publication.
        gm.add_links(title, authors)
    # NOTE(review): .copy() is shallow -- clean_keywords can still mutate the
    # inner per-author dicts; confirm that is intended before relying on it.
    clean_authorkw = clean_keywords(authorkw_dict.copy())
    gm.write_to_file("newcsgraph")
    # Persist the (uncleaned) author-keyword index.
    she = shelve.open("authorkw.db")
    she.update(authorkw_dict)
    she.close()
    return (clean_authorkw, gm.get_graph())
def graph_me_up():
    """Load the saved School of Computing Science coauthor data, build
    its collaboration graph, write it to "cslatest", and return it."""
    with open("../coauthor_data/School of Computing Science.txt") as f:
        coauthor_data = json.load(f)
    staff = es.get_names("http://www.gla.ac.uk/schools/computing/staff/")
    maker = gfd.GraphMaker(coauthor_data, staff)
    graph = maker.get_graph()
    maker.write_to_file("cslatest")
    return graph