def get_books(soup, link=''):
    # Collect anchors whose href matches the book-page pattern.
    anchors = soup.findAll('a', href=re.compile(Retriever.BOOK_REGEX))
    books = {}
    for anchor in anchors:
        # Resolve a possibly relative href against the page link.
        href = helpers.make_correct_link(link, anchor['href'])
        # Keep only links that stay on the site.
        if helpers.check_local_link(Retriever.SITE_LINK, href):
            books[href] = anchor.text
    return books.items()
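# A minimal usage sketch (an assumption, not part of the project): it presumes
# BeautifulSoup 3 (matching the findAll() API above) and that this module's
# re/Retriever/helpers names are importable; the URL is only illustrative.
import urllib2
from BeautifulSoup import BeautifulSoup

html = urllib2.urlopen('http://lib.ru/').read()
soup = BeautifulSoup(html)
for href, title in get_books(soup, link='http://lib.ru/'):
    print '%s -> %s' % (href, title)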
def tasks_executing_loop(self):
    task = self.storage.get_task()
    if task.need_html:
        print ' downloading: %s' % task.link
        # Skip tasks whose link points outside lib.ru.
        if not helpers.check_local_link('http://lib.ru', task.link):
            logger.write('TM_LibRu: not local link!', notlocallink=task.link)
            self.storage.mark_executed()
            return
        page = (self.dm.download(task.link),)
        # The download may have been redirected; re-check the final URL too.
        url = page[0][1].url
        if not helpers.check_local_link('http://lib.ru', url):
            logger.write('TM_LibRu: not local link!', notlocallink=url, link=task.link)
            self.storage.mark_executed()
            return
    else:
        page = ()
    self.print_executing(task)
    self.execute_task(task, page)
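# A hedged sketch of what helpers.check_local_link is assumed to do in the
# calls above: report whether `link` belongs to the same host as `site`.
# The project's real helper may be stricter (e.g. path prefixes); this is
# only an illustration.
from urlparse import urlparse

def check_local_link(site, link):
    return urlparse(link).netloc == urlparse(site).netloc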
def read_last_modified(self, page):
    field = LAST_MODIFIED
    # Only track Last-Modified for pages that belong to lib.ru.
    if not helpers.check_local_link('http://lib.ru', page.url):
        return
    if field in page.headers:
        new_date = datetime.strptime(page.headers[field],
                                     DM_LibRu_AddRefreshLinks.TIME_FORMAT)
        refreshes = Refresh.objects.filter(link=page.url)
        if refreshes:
            # Update the existing refresh record for this link.
            refreshes[0].last_modified = new_date
            refreshes[0].save()
        else:
            Refresh.objects.create(link=page.url, last_modified=new_date)
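# A hedged sketch of the Refresh model the queries above assume: the `link`
# and `last_modified` field names come from those calls; the field types and
# everything else are guesses, not the project's actual model.
from django.db import models

class Refresh(models.Model):
    link = models.URLField(max_length=255)
    last_modified = models.DateTimeField()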