示例#1
0
 def get_books(soup,link=''):
     """Collect book links found in a parsed page.

     Looks up every ``<a>`` tag in ``soup`` whose ``href`` matches
     ``Retriever.BOOK_REGEX``, passes each href together with ``link``
     through ``helpers.make_correct_link`` (presumably to resolve it
     against the page address -- confirm in helpers), and keeps only the
     results accepted by ``helpers.check_local_link`` for
     ``Retriever.SITE_LINK``.

     :param soup: parsed document exposing ``findAll``.
     :param link: value forwarded to ``helpers.make_correct_link`` as
         its first argument; defaults to an empty string.
     :return: the ``items()`` of a dict mapping each accepted link to the
         text of its anchor. When the same link occurs more than once,
         the text of the last matching anchor is kept.
     """
     book_pattern = re.compile(Retriever.BOOK_REGEX)
     found = {}
     for tag in soup.findAll('a', href=book_pattern):
         target = helpers.make_correct_link(link, tag['href'])
         # Skip anything the helper does not accept for the site.
         if not helpers.check_local_link(Retriever.SITE_LINK, target):
             continue
         found[target] = tag.text
     return found.items()
示例#2
0
 def tasks_executing_loop(self):
     """Take the next task from storage and execute it.

     When ``task.need_html`` is set, the task's link is downloaded first.
     The link is checked with ``helpers.check_local_link`` against
     ``'http://lib.ru'`` before downloading, and the ``url`` reported by
     the download result is checked the same way afterwards. If either
     check fails, the event is logged, the task is marked as executed on
     the storage and the method returns without executing it.

     ``execute_task`` receives the download result wrapped in a
     one-element tuple, or an empty tuple when no HTML is needed.
     """
     site = 'http://lib.ru'
     task = self.storage.get_task()
     page = ()
     if task.need_html:
         # Single parenthesised argument: prints the same text whether
         # ``print`` is a statement or a function.
         print('    downloading: %s' % task.link)
         if not helpers.check_local_link(site, task.link):
             logger.write('TM_LibRu: not local link!', notlocallink=task.link)
             self.storage.mark_executed()
             return
         downloaded = self.dm.download(task.link)
         page = (downloaded,)
         # Element [1] of the download result carries the ``url`` that is
         # checked again here (presumably the final address after any
         # redirects -- confirm against the download manager).
         final_url = downloaded[1].url
         if not helpers.check_local_link(site, final_url):
             logger.write('TM_LibRu: not local link!', notlocallink=final_url,link=task.link)
             self.storage.mark_executed()
             return
     self.print_executing(task)
     self.execute_task(task, page)
示例#3
0
 def read_last_modified(self,page):
     """Store the page's ``LAST_MODIFIED`` header value as a ``Refresh`` row.

     Does nothing when ``page.url`` is rejected by
     ``helpers.check_local_link`` for ``'http://lib.ru'`` or when the
     header named by ``LAST_MODIFIED`` is absent from ``page.headers``.
     Otherwise the header is parsed with
     ``DM_LibRu_AddRefreshLinks.TIME_FORMAT`` and written to the first
     ``Refresh`` row whose ``link`` equals ``page.url``; if no such row
     exists, one is created.

     :param page: object exposing ``url`` and ``headers``.
     :return: always ``None``.
     """
     header_name = LAST_MODIFIED
     if not helpers.check_local_link('http://lib.ru', page.url):
         return
     if header_name not in page.headers:
         return
     modified_at = datetime.strptime(page.headers[header_name], DM_LibRu_AddRefreshLinks.TIME_FORMAT)
     matches = Refresh.objects.filter(link=page.url)
     if not matches:
         Refresh.objects.create(link=page.url, last_modified=modified_at)
         return
     # Bind the row once so the assignment and save() act on one object.
     record = matches[0]
     record.last_modified = modified_at
     record.save()