示例#1
0
 def get_shoutouts(self, url):
   '''Return the filtered links found in the shout-out listing for url.

   If self.get_shoutout_page(url) yields a page, links are read from its
   <div id="wikitext"> element. When that element is missing, the page is
   reported on stderr, deleted via self.delete_page, and the lookup is
   retried.

   Otherwise the page from self.get_multitropes(url) (or, failing that,
   self.get_page(url)) is searched for the <li> element in which
   '/Main/ShoutOut' appears earliest, and links are read from that element.

   Returns the list of href values accepted by both self.is_trope and
   filters.is_work.
   '''
   marker = '/Main/ShoutOut'
   #Fetch once and reuse the result instead of requesting the page twice.
   page = self.get_shoutout_page(url)
   if page:
     #An example is Warhammer40000.
     soup = BeautifulSoup(page).find("div", {"id": "wikitext"})
     if soup is None:
       #Write the diagnostic to stderr itself; works on Python 2 and 3.
       sys.stderr.write("INVALID PAGE:" + url + "\n")
       shoutouturl = add_schema(self.base_url + "ShoutOut/"
         + url.split('/')[-1])
       print("delete " + shoutouturl)
       self.delete_page(shoutouturl)
       #Retry after deleting the unusable page.
       #NOTE(review): this recurses without bound if get_shoutout_page keeps
       #returning a page lacking the wikitext div after delete_page - confirm.
       return self.get_shoutouts(url)
   else:
     page = self.get_multitropes(url)
     if not page:
       #An example is MontyPythonsFlyingCircus.
       page = self.get_page(url)
     items = [item for item in BeautifulSoup(page).find_all('li')
       if marker in str(item)]
     try:
       #Pick the list item in which the marker occurs earliest.
       soup = min(items, key=lambda item: str(item).index(marker))
     except ValueError:
       #min() raises ValueError when no list item mentions the marker.
       soup = BeautifulSoup("")
   hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
   return [href for href in hrefs
     if href and self.is_trope(href) and filters.is_work(href)]
示例#2
0
 def replicate(self):
   '''Run the whole pipeline: build, collect shout-outs, rank and print.

   Calls self.make_db and self.cache_pages('filteredworks.txt'), saves the
   shout-outs of every listed page accepted by filters.is_work, computes
   tropes.pagerank over tropes.list_edges(), filters the sorted
   (value, key) pairs and hands them to self.print_results in descending
   order.
   '''
   self.make_db()
   self.cache_pages('filteredworks.txt')
   for page in self.list_pages():
     if not filters.is_work(page):
       continue
     found = tropes.get_shoutouts(page)
     tropes.save_shoutouts(page, found)
   rankdict = tropes.pagerank(tropes.list_edges())
   #Pairs are (rankdict value, rankdict key), sorted ascending.
   pairs = sorted((rankdict[key], key) for key in rankdict)
   kept = []
   for value, key in pairs:
     #NOTE(review): the first tuple element (the rankdict value, not the
     #key) is what gets passed to count_referrers/count_references here -
     #confirm that this is intended.
     if self.count_referrers(value) * 2 > self.count_references(value):
       kept.append((value, key))
   #Largest pairs first.
   kept.reverse()
   self.print_results(kept)
示例#3
0
 def replicate(self):
   '''The steps to fully replicate the author's work.

   Steps listed by the author ahead of the code below:
     1. Run scrapy runspider crawler.py
     2. Filter out pages that aren't works or that redirect.

   The code then calls self.make_db and
   self.cache_pages('filteredworks.txt'), saves the shout-outs of every
   listed page accepted by filters.is_work, computes tropes.pagerank over
   tropes.list_edges(), filters the sorted (value, key) pairs and hands
   them to self.print_results in descending order.
   '''
   self.make_db()
   self.cache_pages('filteredworks.txt')
   for page in self.list_pages():
     if not filters.is_work(page):
       continue
     found = tropes.get_shoutouts(page)
     tropes.save_shoutouts(page, found)
   rankdict = tropes.pagerank(tropes.list_edges())
   #Pairs are (rankdict value, rankdict key), sorted ascending.
   pairs = sorted((rankdict[key], key) for key in rankdict)
   kept = []
   for value, key in pairs:
     #NOTE(review): the first tuple element (the rankdict value, not the
     #key) is what gets passed to count_referrers/count_references here -
     #confirm that this is intended.
     if self.count_referrers(value) * 2 > self.count_references(value):
       kept.append((value, key))
   #Largest pairs first.
   kept.reverse()
   self.print_results(kept)