Example #1
import json

import es    # project-local scraping helpers (get_tree, get_names); actual import path may differ
import gfd   # project-local module providing GraphMaker; actual import path may differ


def graphs_from_files():
	filenames = ["Adam Smith Business School",
				"Dental School",
				"School of Chemistry",
				"School of Critical Studies",
				"School of Culture and Creative Arts",
				"School of Education"
				]

	schools_tree = es.get_tree("http://www.gla.ac.uk/schools/")
	ns = 'http://exslt.org/regular-expressions'
	path = '//div[@class="row standardContent"]//a[re:match(@href, "schools/[A-Za-z]+/")]'
	a_elems = schools_tree.xpath(path, namespaces={'re':ns})
	base_url = "http://www.gla.ac.uk"
	urls = []
	names = []

	for a in a_elems:
		staff_page_url = base_url + a.get("href") + "staff/"
		urls.append(staff_page_url)
		school_name = a.text
		names.append(school_name)

	school_names_urls = list(zip(names, urls))  # list() so it can be printed and then iterated
	print(school_names_urls)


	for name, url in school_names_urls:
		if name in filenames:
			with open("../coauthor_data/" + name + ".txt") as f:
				d = json.load(f)

			staff_names = es.get_names(url)
			gm = gfd.GraphMaker(d, staff_names)
			gm.write_to_file(name + " graph")
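
The XPath in Example #1 filters links with lxml's EXSLT regular-expressions extension (the http://exslt.org/regular-expressions namespace). A minimal, self-contained sketch of the same technique, using hypothetical HTML in place of the live page:

import lxml.html

html = """
<div class="row standardContent">
	<a href="/schools/computing/">School of Computing Science</a>
	<a href="/about/">About the university</a>
</div>
"""

tree = lxml.html.fromstring(html)
ns = {'re': 'http://exslt.org/regular-expressions'}
# re:match() returns the regex matches; a non-empty result makes the predicate
# true, so only links whose href matches the pattern are kept.
links = tree.xpath('//a[re:match(@href, "schools/[A-Za-z]+/")]', namespaces=ns)
print([a.get("href") for a in links])  # ['/schools/computing/']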
Example #2
import shelve

import es    # project-local scraping helpers; actual import path may differ
import gfd   # project-local module providing GraphMaker; actual import path may differ


# stem_kwlist and clean_keywords are helpers defined elsewhere in this module.
def scrape_and_make(data_dict):
	authorkw_dict = {}
	names = es.get_names("http://www.gla.ac.uk/schools/computing/staff/")
	# TODO do we need to pass data_dict into gm?
	gm = gfd.GraphMaker(data_dict, names)
	# TODO adding of keywords should happen elsewhere (e.g. a text-processing module)
	#gm.add_kw_to_data()

	for title, info in data_dict.items():
		authors = info["authors"]
		# At this point keywords are a list of phrases (returned that way from es and
		# from keyword extraction). Plan: stem or lemmatize each word in each phrase
		# (or skip that step); join the phrases on "|" so each author has one string
		# that can be searched for substrings; then tokenise that string into
		# individual terms with a regex split (taking "|" into account) and build an
		# inverted index as in search.py.
		keyword_list = info["keywords"]
		# TODO temporary fix: drop empty keyword strings
		keyword_list = [kw for kw in keyword_list if kw != ""]
		stemmed_kw_list = stem_kwlist(keyword_list)
		#stemmed_kw_list = keyword_list[:]
		# TODO temporary fix: drop the stemmed "abstract available" scraping placeholder
		stemmed_kw_list = [kw for kw in stemmed_kw_list if "abstract avail" not in kw]
		# TODO do we need to check whether "authors" is present / non-empty?
		for author in authors:
			# Add a node for this author to the collaboration graph
			gm.add_vertex(author)
			# Make list of keywords into a single string, with phrases separated by a divisor
			# TODO using kw list for now
			#kw_string = "|".join(stemmed_kw_list)
			author_name = author[0]
			# shelve keys must be plain strings (json already gives str in Python 3)
			author_id = author[1]
			# Add author to authorkw_dict / add keywords to existing author
			# TODO using kw list but may become string
			if author_id not in authorkw_dict:
				authorkw_dict[author_id] = {"name": author_name, "keywords": stemmed_kw_list}
			else:
				authorkw_dict[author_id]["keywords"].extend(stemmed_kw_list)
			# NB each author is a (name, unique id) pair; unique id is dict key, name goes as value

		# Add edges to collab graph here
		gm.add_links(title, authors)
		# Potentially add to inv_index here as we are already looping through the data_dict
	
	clean_authorkw = clean_keywords(authorkw_dict.copy())
	# store collab graph (as json?)
	gm.write_to_file("newcsgraph")
	# store authorkw_dict (using shelve for now)
	she = shelve.open("authorkw.db")
	she.update(authorkw_dict)
	she.close()
	# TODO
	return (clean_authorkw, gm.get_graph())
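
stem_kwlist and clean_keywords are defined elsewhere in the project. A plausible minimal sketch of stem_kwlist, assuming NLTK's PorterStemmer and keywords arriving as multi-word phrases (the real implementation may differ):

from nltk.stem.porter import PorterStemmer

def stem_kwlist(keyword_list):
	"""Stem every word of every keyword phrase, keeping the phrases intact."""
	stemmer = PorterStemmer()
	return [" ".join(stemmer.stem(word) for word in phrase.split())
			for phrase in keyword_list]

print(stem_kwlist(["information retrieval", "abstract available"]))
# ['inform retriev', 'abstract avail']

Stemming turns "abstract available" into "abstract avail", which is why the temporary fix above filters on the stemmed form.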
Example #3
import json

import es    # project-local scraping helpers; actual import path may differ
import gfd   # project-local module providing GraphMaker; actual import path may differ


def graph_me_up():
	with open("../coauthor_data/School of Computing Science.txt") as f:
		d = json.load(f)

	names = es.get_names("http://www.gla.ac.uk/schools/computing/staff/")

	gm = gfd.GraphMaker(d, names)

	g = gm.get_graph()

	gm.write_to_file("cslatest")

	return g