def correct_book_info(book_info, page_link):
    """Rewrite every link stored on ``book_info`` relative to ``page_link``.

    Each download link and the image link are passed through
    ``make_correct_link`` with ``page_link`` as the base, and ``page_link``
    itself is recorded on the object as ``pagelink``.

    :param book_info: object exposing ``links`` (mapping of format name to
        link), ``image`` and ``pagelink`` attributes.
    :param page_link: address of the page the book info was taken from.
    :returns: the same ``book_info`` object, modified in place.
    """
    # A brand-new dict replaces the old mapping rather than patching it.
    book_info.links = dict(
        (fmt, make_correct_link(page_link, raw_link))
        for fmt, raw_link in book_info.links.items()
    )
    book_info.image = make_correct_link(page_link, book_info.image)
    book_info.pagelink = page_link
    return book_info
def execute(self, html):
    """Parse a magazine page and queue the follow-up tasks.

    The magazine description is read from the page, its cover link is
    resolved against ``self.link`` and ``self.link`` is stored as the
    magazine link.  One ``MagazineIssueTask`` is queued per issue link found
    on the page, followed by a single ``MagazineSavingTask``.

    :param html: page content handed to ``get_soup``.
    :returns: always ``True``.
    """
    soup = get_soup(html)
    base = self.link

    magazine_info = Retriever.get_magazine_info(soup)
    magazine_info.cover = make_correct_link(base, magazine_info.cover)
    magazine_info.link = base

    resolved_issue_links = [
        make_correct_link(base, raw_link)
        for raw_link in Retriever.get_links_magazine_issue(soup)
    ]

    queued = [
        MagazineIssueTask(magazine_info.name, issue_link)
        for issue_link in resolved_issue_links
    ]
    # The saving task goes last, after every issue task.
    queued.append(MagazineSavingTask(magazine_info))
    self.tasks = queued
    return True
def execute(self, html):
    """Parse the genre index and queue one ``GetAllPagesGenre`` per genre.

    Each genre record returned by ``Retriever.get_genres`` has its ``'link'``
    entry rewritten in place so that it is resolved against ``self.link``.

    :param html: page content handed to ``get_soup``.
    :returns: always ``True``.
    """
    genres = Retriever.get_genres(get_soup(html))

    # First pass: fix the links inside the records themselves.
    for record in genres:
        record['link'] = make_correct_link(self.link, record['link'])

    # Second pass: turn the corrected records into tasks.
    queued = []
    for record in genres:
        queued.append(GetAllPagesGenre(record['link'], record['name']))
    self.tasks = queued
    return True
def get_link_pages(soup, page_link):
    """Collect the pagination links found in the pages field of ``soup``.

    The ``td`` field is looked up through ``Retriever.get_field``.  When it
    is missing an empty list is returned.

    :param soup: parsed page.
    :param page_link: address of the page, used as the base for resolving
        each ``href``.
    :returns: list of resolved links, one per anchor except the last.
    """
    cell = Retriever.get_field(soup, u'Страницы:', 'td')
    if not cell:
        return []
    # The final anchor is deliberately skipped (presumably a "next"-style
    # link rather than a page number -- confirm against the site markup).
    page_anchors = cell.findAll('a')[:-1]
    return [make_correct_link(page_link, a['href']) for a in page_anchors]
def get_books(soup, link=''):
    """Return ``(link, title)`` pairs for the book anchors found in ``soup``.

    Anchors are selected by matching their ``href`` against
    ``Retriever.BOOK_REGEX``.  Each ``href`` is resolved against ``link`` and
    kept only if ``helpers.check_local_link`` accepts it for
    ``Retriever.SITE_LINK``.  When several anchors resolve to the same link,
    the text of the last one wins.

    :param soup: parsed page.
    :param link: base address used to resolve the anchors' ``href`` values.
    :returns: the ``items()`` of a dict mapping resolved link to anchor text.
    """
    book_pattern = re.compile(Retriever.BOOK_REGEX)
    found = {}
    for anchor in soup.findAll('a', href=book_pattern):
        absolute = helpers.make_correct_link(link, anchor['href'])
        if not helpers.check_local_link(Retriever.SITE_LINK, absolute):
            continue
        found[absolute] = anchor.text
    return found.items()
def get_links(soup, page_link):
    """Build a mapping of download format to link for a book page.

    Two page layouts are handled:

    * a ``<select id="useropt">`` element is present: every option value is
      taken as a format name and the link is ``page_link/<format>``;
    * otherwise: anchors whose ``href`` ends with ``/b/<number>/<word>`` are
      collected, filtered by ``Retriever.is_available_anchor``, and the format
      is derived from each anchor by ``Retriever.get_format_from_anchor``.

    :param soup: parsed book page.
    :param page_link: address of the book page.
    :returns: dict mapping format name to resolved link.
    """
    links = {}
    selector = soup.find('select', id='useropt')

    if not selector:
        number = Retriever.get_booknumber_from_link(page_link)
        # NOTE(review): ``[A-z]`` also matches the characters between 'Z'
        # and 'a' ([ \ ] ^ _ `); kept as-is to preserve matching behaviour.
        href_pattern = re.compile('/b/%s/[A-z]+$' % number)
        candidates = soup.findAll('a', href=href_pattern)
        for anchor in filter(Retriever.is_available_anchor, candidates):
            fmt = Retriever.get_format_from_anchor(anchor)
            links[fmt] = make_correct_link(page_link, anchor['href'])
        return links

    # Read every option value first, then build the links.
    option_values = [option['value'] for option in selector.findAll('option')]
    for fmt in option_values:
        links[fmt] = make_correct_link(page_link, "%s/%s" % (page_link, fmt))
    return links
def execute(self):
    """Turn the stored feed entry into a ``BookSavingTask``.

    ``self.entry`` is parsed with BeautifulSoup and handed to
    ``Retriever.get_bookinfo``.  When that yields nothing usable, the failure
    is logged and no task is queued (``self.tasks`` is left untouched).
    Otherwise the book's page link is set to ``self.link`` and every download
    link is resolved against it in place.

    :returns: always ``True``, including the failure case.
    """
    parsed_entry = BeautifulSoup(
        self.entry, convertEntities=BeautifulSoup.XML_ENTITIES
    )
    bookinfo = Retriever.get_bookinfo(parsed_entry)
    if not bookinfo:
        logger.write_fail(
            "empty links at entry", entry=parsed_entry, link=self.link
        )
        return True

    bookinfo.pagelink = self.link
    # Iterate over a snapshot of the keys; only the values are replaced.
    for fmt in list(bookinfo.links):
        bookinfo.links[fmt] = make_correct_link(self.link, bookinfo.links[fmt])

    self.tasks = [BookSavingTask(bookinfo)]
    return True
def execute(self, html):
    """Parse a magazine issue page and queue a saving task per article.

    The issue's year and number are derived from ``self.link``.  Every
    article found on the page is stamped with the magazine, year and number,
    and its link is resolved against ``self.link`` and then converted by
    ``Retriever.print_version``.

    :param html: page content handed to ``get_soup``.
    :returns: always ``True``.
    """
    soup = get_soup(html)
    year, number = Retriever.get_issue_year_number_by_link(self.link)
    articles = Retriever.get_issue_articles(soup)

    for item in articles:
        item.magazine = self.magazine
        item.year = year
        item.number = number
        # Resolve first, then switch to the print version of that address.
        item.link = make_correct_link(self.link, item.link)
        item.link = Retriever.print_version(item.link)

    queued = []
    for item in articles:
        queued.append(ArticleSavingTask(item))
    self.tasks = queued
    return True
def execute(self, html):
    """Parse a catalog feed and queue tasks for its entries.

    For an acquisition feed (as decided by ``Retriever.is_acquisition_feed``)
    one ``Entry`` task is queued per feed entry.  For any other feed
    (navigation feed) the catalog link of each entry is extracted, entries
    without a link and links rejected by ``Retriever.is_permitted_link`` are
    dropped, and a ``Page`` task is queued for every remaining link after
    resolving it against ``self.link``.

    :param html: two-item sequence; the first item is the feed data, the
        second item is not used here.
    :returns: always ``True``.
    """
    # The second element is unpacked only to keep the two-item contract.
    data, _page = html
    soup = BeautifulSoup(
        data,
        convertEntities=BeautifulSoup.XML_ENTITIES,
        fromEncoding='utf-8',
    )

    if Retriever.is_acquisition_feed(soup):
        self.tasks = [
            Entry(unicode(entry), self.link)
            for entry in Retriever.get_entries(soup)
        ]
        return True

    # Navigation feed.
    candidates = [
        Retriever.get_catalog_link(entry)
        for entry in Retriever.get_entries(soup)
    ]
    # Identity test: "no link" is exactly ``None``, not anything that
    # merely compares equal to it.
    present = [link for link in candidates if link is not None]
    permitted = [link for link in present if Retriever.is_permitted_link(link)]
    resolved = [make_correct_link(self.link, link) for link in permitted]
    self.tasks = [Page(link) for link in resolved]
    return True
def postprocess_bookinfo(bookinfo, pagelink, tag):
    """Attach page context to ``bookinfo`` and resolve its download links.

    The existing ``links`` mapping is updated in place, ``pagelink`` is
    recorded on the object and ``tag`` is appended to its ``tags`` list.

    :param bookinfo: object exposing ``pagelink``, ``links`` and ``tags``.
    :param pagelink: address of the page the book info was taken from.
    :param tag: value appended to ``bookinfo.tags``.
    :returns: the same ``bookinfo`` object.
    """
    bookinfo.pagelink = pagelink
    # Snapshot the keys; only the values of existing keys are replaced.
    for fmt in list(bookinfo.links):
        bookinfo.links[fmt] = make_correct_link(pagelink, bookinfo.links[fmt])
    bookinfo.tags.append(tag)
    return bookinfo
def execute(self, html):
    """Parse a magazine listing and queue a ``MagazineInfoTask`` per link.

    Links reported by ``Retriever.get_links_on_magazines`` are resolved
    against ``self.link`` before the tasks are created.

    :param html: page content handed to ``get_soup``.
    :returns: always ``True``.
    """
    soup = get_soup(html)

    resolved = []
    for raw_link in Retriever.get_links_on_magazines(soup):
        resolved.append(make_correct_link(self.link, raw_link))

    self.tasks = [MagazineInfoTask(magazine_link) for magazine_link in resolved]
    return True
def execute(self, html):
    """Parse a book listing and queue a ``GetBookInfoTask`` per book link.

    Links reported by ``Retriever.get_book_links`` are resolved against
    ``self.link`` before the tasks are created.

    :param html: page content handed to ``get_soup``.
    :returns: always ``True``.
    """
    soup = get_soup(html)

    resolved = []
    for raw_link in Retriever.get_book_links(soup):
        resolved.append(make_correct_link(self.link, raw_link))

    queued = []
    for book_link in resolved:
        queued.append(GetBookInfoTask(book_link))
    self.tasks = queued
    return True
def execute(self, html):
    """Parse a page of genre links and queue a ``GenreTask`` for each.

    Every link reported by ``Retriever.get_genres_links`` is resolved against
    ``self.link`` and wrapped in a task.

    :param html: page content handed to ``get_soup``.
    :returns: always ``True``.
    """
    soup = get_soup(html)

    queued = []
    for raw_link in Retriever.get_genres_links(soup):
        genre_link = make_correct_link(self.link, raw_link)
        queued.append(GenreTask(genre_link))

    self.tasks = queued
    return True
def get_picture(soup, page_link, id):
    """Return the resolved picture link for a page, or ``None``.

    :param soup: parsed page passed to ``Retriever.get_picture_link``.
    :param page_link: address of the page, used as the base for resolving.
    :param id: not used by this function; kept for interface compatibility.
    :returns: the picture link resolved against ``page_link``, or ``None``
        when no picture link was found.
    """
    raw_link = Retriever.get_picture_link(soup)
    if raw_link:
        return make_correct_link(page_link, raw_link)
    return None
def correct_queue(queue, page_link):
    """Resolve the ``'link'`` entry of every queue item against ``page_link``.

    Items are modified in place; nothing is returned.

    :param queue: iterable of items supporting ``item['link']`` get and set.
    :param page_link: base address used for resolving each link.
    """
    for item in queue:
        resolved = make_correct_link(page_link, item['link'])
        item['link'] = resolved
def make_correct_link_test():
    """Check ``make_correct_link`` against a table of known inputs.

    Each case is ``(base page address, link found on that page, expected
    result)``.  The table covers relative file names, root-relative paths,
    absolute http/https links, bases with a query string or a file name, and
    ``..`` path segments.

    :raises AssertionError: on the first case whose result differs from the
        expected value; the message names the offending inputs.
    """
    # The parenthesised single-argument form prints the same text whether
    # ``print`` is a statement or a function.
    print("helper_tests: make correct link test")

    cases = (
        ('http://ya.ru/a/', 'g.html', 'http://ya.ru/a/g.html'),
        ('http://ya.ru/', 'g.html', 'http://ya.ru/g.html'),
        ('http://ya.ru/a', 'g.html', 'http://ya.ru/a/g.html'),
        ('http://ya.ru/a?set=1', 'g.html', 'http://ya.ru/g.html'),
        ('http://ya.ru/a.php', 'g.html', 'http://ya.ru/g.html'),
        ('http://ya.ru/vehicles/', 'http://www.a.ru/g.html',
         'http://www.a.ru/g.html'),
        ('http://ya.ru/vehicles/', '/a.html', 'http://ya.ru/a.html'),
        ('http://ya.ru/vehicles/', 'http://www.org.ru/a.html',
         'http://www.org.ru/a.html'),
        ('http://ya.ru/vehicles/', 'https://www.org.ru/a.html',
         'https://www.org.ru/a.html'),
        ('http://www.epubbooks.ru/stanza/index.xml', 'xmllastadd.php',
         'http://www.epubbooks.ru/stanza/xmllastadd.php'),
        ('http://www.stanza.epubbooks.ru/stanza/genres/0.xml',
         '../../tpl/download.php?npp=3376',
         'http://www.stanza.epubbooks.ru/tpl/download.php?npp=3376'),
    )

    for base, link, expected in cases:
        actual = make_correct_link(base, link)
        assert actual == expected, (
            "make_correct_link(%r, %r) returned %r, expected %r"
            % (base, link, actual, expected)
        )