def get_shoutouts(self, url):
    """Return the work links found in the ShoutOut section for *url*.

    If the work has a dedicated ShoutOut subpage (e.g. Warhammer40000),
    parse its "wikitext" div.  A subpage with no such div is treated as
    invalid: it is reported on stderr, deleted from the cache, and the
    lookup is retried.  Works without a subpage (e.g.
    MontyPythonsFlyingCircus) fall back to the multitrope page — or the
    main page — and use the first <li> that links to /Main/ShoutOut.

    Returns a list of hrefs that pass self.is_trope and filters.is_work.
    """
    page = self.get_shoutout_page(url)  # fetch once, not twice
    if page:
        soup = BeautifulSoup(page).find("div", {"id": "wikitext"})
        if soup is None:
            # Broken cached subpage: report, purge it, then retry from scratch.
            # (Bug fix: the old `print "...", sys.stderr` printed the repr of
            # the stderr object instead of writing to stderr.)
            print("INVALID PAGE:" + url, file=sys.stderr)
            shoutouturl = add_schema(self.base_url + "ShoutOut/" + url.split('/')[-1])
            print("delete " + shoutouturl)
            self.delete_page(shoutouturl)
            return self.get_shoutouts(url)
    else:
        page = self.get_multitropes(url)
        if not page:
            # An example is MontyPythonsFlyingCircus.
            page = self.get_page(url)
        soup = BeautifulSoup(page)
        items = [item for item in soup.findAll('li')
                 if '/Main/ShoutOut' in str(item)]
        try:
            # The <li> whose /Main/ShoutOut link appears earliest in its markup.
            soup = min(items, key=lambda x: str(x).index('/Main/ShoutOut'))
        except ValueError:
            # No matching <li> at all: fall through with an empty soup.
            soup = BeautifulSoup("")
    links = [link.get('href') for link in soup.find_all('a')]
    links = [link for link in links
             if link and self.is_trope(link) and filters.is_work(link)]
    return links
def replicate(self):
    """Rebuild the database, collect shoutouts, rank works, and print results.

    Pipeline: build the DB, cache the filtered work pages, save each
    work's shoutout edges, run PageRank over the edge list, keep works
    whose incoming references outnumber half their outgoing ones, and
    print the ranking best-first.
    """
    self.make_db()
    self.cache_pages('filteredworks.txt')
    works = self.list_pages()
    for work in works:
        if filters.is_work(work):
            shoutouts = tropes.get_shoutouts(work)
            tropes.save_shoutouts(work, shoutouts)
    rankdict = tropes.pagerank(tropes.list_edges())
    ranked = [(rankdict[k], k) for k in rankdict]
    ranked.sort()
    # Each pair is (score, page).  Bug fix: the filter must count
    # referrers/references of the PAGE — the original passed the float
    # PageRank score to count_referrers/count_references.
    ranked = [(score, page) for (score, page) in ranked
              if self.count_referrers(page) * 2 > self.count_references(page)]
    ranked = list(reversed(ranked))  # highest score first
    self.print_results(ranked)
def replicate(self):
    '''The steps to fully replicate my work.'''
    # Run scrapy runspider crawler.py
    # Filter out pages that aren't works or that redirect.
    self.make_db()
    self.cache_pages('filteredworks.txt')
    works = self.list_pages()
    # Record each work's shoutout edges before ranking.
    for work in works:
        if filters.is_work(work):
            shoutouts = tropes.get_shoutouts(work)
            tropes.save_shoutouts(work, shoutouts)
    rankdict = tropes.pagerank(tropes.list_edges())
    ranked = [(rankdict[k], k) for k in rankdict]
    ranked.sort()
    # ranked holds (score, page) pairs.  Bug fix: filter on the page
    # name — the original passed the float score to
    # count_referrers/count_references.
    ranked = [(score, page) for (score, page) in ranked
              if self.count_referrers(page) * 2 > self.count_references(page)]
    ranked = list(reversed(ranked))  # best-ranked first
    self.print_results(ranked)