Пример #1
0
 def correct_book_info(book_info, page_link):
     """Rewrite the links stored on ``book_info`` using ``page_link``.

     Every value of ``book_info.links`` and the ``book_info.image`` value are
     passed through ``make_correct_link`` with ``page_link`` as first argument,
     and ``book_info.pagelink`` is set to ``page_link``.

     The object is modified in place (``links`` is replaced by a new dict
     with the same keys) and is also returned.
     """
     corrected = dict(
         (fmt, make_correct_link(page_link, href))
         for fmt, href in book_info.links.items()
     )
     book_info.links = corrected
     book_info.image = make_correct_link(page_link, book_info.image)
     book_info.pagelink = page_link
     return book_info
Пример #2
0
 def execute(self, html):
     """Process a magazine page and queue the follow-up tasks.

     The magazine info obtained from ``Retriever.get_magazine_info`` gets its
     ``cover`` passed through ``make_correct_link`` and its ``link`` set to
     ``self.link``.  ``self.tasks`` ends up holding one ``MagazineIssueTask``
     per issue link, followed by a single ``MagazineSavingTask``.

     Always returns True.
     """
     soup = get_soup(html)

     info = Retriever.get_magazine_info(soup)
     info.cover = make_correct_link(self.link, info.cover)
     info.link = self.link

     # First correct every issue link, then build the tasks from them.
     issue_links = []
     for raw_link in Retriever.get_links_magazine_issue(soup):
         issue_links.append(make_correct_link(self.link, raw_link))

     queued = []
     for issue_link in issue_links:
         queued.append(MagazineIssueTask(info.name, issue_link))
     # The saving task for the magazine itself goes last.
     queued.append(MagazineSavingTask(info))

     self.tasks = queued
     return True
Пример #3
0
 def execute(self, html):
     """Process a genre listing page and queue one task per genre.

     Each genre returned by ``Retriever.get_genres`` has its ``'link'`` entry
     replaced by the result of ``make_correct_link(self.link, ...)``; then
     ``self.tasks`` is set to one ``GetAllPagesGenre(link, name)`` per genre.

     Always returns True.
     """
     soup = get_soup(html)
     genres = Retriever.get_genres(soup)

     # Pass 1: correct the links in place.
     for entry in genres:
         entry['link'] = make_correct_link(self.link, entry['link'])

     # Pass 2: build the tasks from the corrected entries.
     queued = []
     for entry in genres:
         queued.append(GetAllPagesGenre(entry['link'], entry['name']))
     self.tasks = queued
     return True
Пример #4
0
 def get_link_pages(soup, page_link):
     """Return the corrected page links found in the pages field of ``soup``.

     The ``td`` is looked up with ``Retriever.get_field``; when it is missing
     (falsy) an empty list is returned.  Otherwise every anchor in it except
     the last one contributes ``make_correct_link(page_link, href)``.
     """
     td = Retriever.get_field(soup, u'Страницы:', 'td')
     if not td:
         return []

     # The final anchor is deliberately skipped
     # (presumably a navigation link rather than a page - TODO confirm).
     page_anchors = td.findAll('a')[:-1]
     return [make_correct_link(page_link, a['href']) for a in page_anchors]
Пример #5
0
 def get_books(soup, link=''):
     """Collect the book links of ``soup`` as ``(href, text)`` pairs.

     Anchors whose ``href`` matches ``Retriever.BOOK_REGEX`` are considered.
     Each href is passed through ``helpers.make_correct_link(link, href)`` and
     kept only if ``helpers.check_local_link(Retriever.SITE_LINK, href)`` is
     truthy.  Hrefs are used as dict keys, so a repeated href keeps the text
     of its last anchor.

     Returns the ``items()`` of that dict.
     """
     book_pattern = re.compile(Retriever.BOOK_REGEX)
     found = {}
     for anchor in soup.findAll('a', href=book_pattern):
         href = helpers.make_correct_link(link, anchor['href'])
         if not helpers.check_local_link(Retriever.SITE_LINK, href):
             continue
         found[href] = anchor.text
     return found.items()
Пример #6
0
 def get_links(soup, page_link):
     """Build a ``{format: link}`` dict of download links for a book page.

     Two page layouts are handled:

     * a ``<select id="useropt">`` is present (and truthy): every option value
       is a format, and its link is ``"<page_link>/<format>"`` passed through
       ``make_correct_link``;
     * otherwise: anchors whose href matches ``/b/<number>/<letters>`` and that
       pass ``Retriever.is_available_anchor`` are used, the format being taken
       from ``Retriever.get_format_from_anchor``.
     """
     links = {}

     select = soup.find('select', id='useropt')
     if select:
         # Read all option values first, then build the links.
         formats = [option['value'] for option in select.findAll('option')]
         for fmt in formats:
             candidate = "%s/%s" % (page_link, fmt)
             links[fmt] = make_correct_link(page_link, candidate)
         return links

     number = Retriever.get_booknumber_from_link(page_link)
     # NOTE(review): the class [A-z] spans ASCII 'A'..'z' and therefore also
     # matches [ \ ] ^ _ and backtick; [A-Za-z] may have been intended - confirm.
     href_pattern = re.compile('/b/%s/[A-z]+$' % number )
     available = [
         anchor
         for anchor in soup.findAll('a', href=href_pattern)
         if Retriever.is_available_anchor(anchor)
     ]
     for anchor in available:
         fmt = Retriever.get_format_from_anchor(anchor)
         links[fmt] = make_correct_link(page_link, anchor['href'])
     return links
Пример #7
0
 def execute(self):
     """Parse ``self.entry`` and queue a saving task for the book it describes.

     When ``Retriever.get_bookinfo`` yields nothing, the failure is logged and
     no task is queued.  Otherwise the book info gets ``pagelink`` set to
     ``self.link``, each of its links is passed through ``make_correct_link``,
     and ``self.tasks`` is set to a single ``BookSavingTask``.

     Returns True in both cases.
     """
     entry = BeautifulSoup(self.entry, convertEntities=BeautifulSoup.XML_ENTITIES)
     bookinfo = Retriever.get_bookinfo(entry)

     if not bookinfo:
         logger.write_fail("empty links at entry", entry=entry, link=self.link)
     else:
         bookinfo.pagelink = self.link
         # Only existing keys are reassigned, so the set of formats is unchanged.
         for fmt, href in bookinfo.links.items():
             bookinfo.links[fmt] = make_correct_link(self.link, href)
         self.tasks = [BookSavingTask(bookinfo)]
     return True
Пример #8
0
 def execute(self, html):
     """Process a magazine issue page and queue a saving task per article.

     Year and number come from ``Retriever.get_issue_year_number_by_link``
     applied to ``self.link``.  Every article from
     ``Retriever.get_issue_articles`` is stamped with the magazine, year and
     number; its link is passed through ``make_correct_link`` and then through
     ``Retriever.print_version``.  ``self.tasks`` becomes one
     ``ArticleSavingTask`` per article.

     Always returns True.
     """
     soup = get_soup(html)
     issue_year, issue_number = Retriever.get_issue_year_number_by_link(self.link)
     articles = Retriever.get_issue_articles(soup)

     for item in articles:
         item.magazine = self.magazine
         item.year = issue_year
         item.number = issue_number
         # Two steps: correct the link first, then convert it with print_version.
         item.link = make_correct_link(self.link, item.link)
         item.link = Retriever.print_version(item.link)

     queued = []
     for item in articles:
         queued.append(ArticleSavingTask(item))
     self.tasks = queued
     return True
Пример #9
0
 def execute(self, html):
     """Process a feed page and queue tasks for its entries or sub-catalogs.

     ``html`` must unpack into two items; only the first (the feed data) is
     used here.

     * Acquisition feed (``Retriever.is_acquisition_feed``): ``self.tasks``
       becomes one ``Entry`` per feed entry, built from the entry's unicode
       text and ``self.link``.
     * Navigation feed: the catalog link of every entry is collected, ``None``
       results and links rejected by ``Retriever.is_permitted_link`` are
       dropped, the rest is passed through ``make_correct_link`` and
       ``self.tasks`` becomes one ``Page`` per remaining link.

     Always returns True.
     """
     # The second item of the pair is not needed in this method.
     data, _unused_page = html
     soup = BeautifulSoup(data, convertEntities=BeautifulSoup.XML_ENTITIES, fromEncoding='utf-8')

     if Retriever.is_acquisition_feed(soup):
         self.tasks = [Entry(unicode(entry), self.link) for entry in Retriever.get_entries(soup)]
         return True

     # Navigation feed.  Each stage is an eager list so the stages run one
     # after another regardless of whether filter()/map() return lists.
     entries = Retriever.get_entries(soup)
     catalog_links = [Retriever.get_catalog_link(entry) for entry in entries]
     present_links = [link for link in catalog_links if link is not None]
     permitted_links = [link for link in present_links if Retriever.is_permitted_link(link)]
     corrected_links = [make_correct_link(self.link, link) for link in permitted_links]
     self.tasks = [Page(link) for link in corrected_links]
     return True
Пример #10
0
 def postprocess_bookinfo(bookinfo, pagelink, tag):
     """Finalize ``bookinfo`` in place and return it.

     Sets ``bookinfo.pagelink`` to ``pagelink``, passes every value of
     ``bookinfo.links`` through ``make_correct_link`` (the dict itself is kept
     and updated key by key), and appends ``tag`` to ``bookinfo.tags``.
     """
     bookinfo.pagelink = pagelink
     # Iterate over a snapshot of the keys while the values are being replaced.
     for fmt in list(bookinfo.links):
         bookinfo.links[fmt] = make_correct_link(pagelink, bookinfo.links[fmt])
     bookinfo.tags.append(tag)
     return bookinfo
Пример #11
0
 def execute(self, html):
     """Process a magazine index page and queue one task per magazine link.

     Links come from ``Retriever.get_links_on_magazines``; each is passed
     through ``make_correct_link(self.link, ...)`` and wrapped in a
     ``MagazineInfoTask``.  The result is stored in ``self.tasks``.

     Always returns True.
     """
     soup = get_soup(html)

     corrected = []
     for raw_link in Retriever.get_links_on_magazines(soup):
         corrected.append(make_correct_link(self.link, raw_link))

     queued = []
     for magazine_link in corrected:
         queued.append(MagazineInfoTask(magazine_link))
     self.tasks = queued
     return True
Пример #12
0
 def execute(self, html):
     """Process a page of book links and queue one task per book.

     Links come from ``Retriever.get_book_links``; each is passed through
     ``make_correct_link(self.link, ...)`` and wrapped in a
     ``GetBookInfoTask``.  The result is stored in ``self.tasks``.

     Always returns True.
     """
     soup = get_soup(html)

     corrected = []
     for raw_link in Retriever.get_book_links(soup):
         corrected.append(make_correct_link(self.link, raw_link))

     queued = []
     for book_link in corrected:
         queued.append(GetBookInfoTask(book_link))
     self.tasks = queued
     return True
Пример #13
0
 def execute(self, html):
     """Process a genre index page and queue one ``GenreTask`` per genre link.

     Links come from ``Retriever.get_genres_links``; each is passed through
     ``make_correct_link(self.link, ...)`` before the task is created.  The
     resulting list is stored in ``self.tasks``.

     Always returns True.
     """
     soup = get_soup(html)

     queued = []
     for raw_link in Retriever.get_genres_links(soup):
         # Correct the link and create its task in the same step.
         queued.append(GenreTask(make_correct_link(self.link, raw_link)))

     self.tasks = queued
     return True
Пример #14
0
 def get_picture(soup, page_link, id):
     """Return the corrected picture link for ``soup``, or None if there is none.

     The raw link comes from ``Retriever.get_picture_link``; a falsy result
     yields None, anything else is passed through
     ``make_correct_link(page_link, ...)``.

     ``id`` is accepted but not used by this function.
     """
     raw_link = Retriever.get_picture_link(soup)
     if raw_link:
         return make_correct_link(page_link, raw_link)
     return None
Пример #15
0
 def correct_queue(queue, page_link):
     """Replace the ``'link'`` entry of every item in ``queue``, in place.

     Each item's ``'link'`` value is passed through
     ``make_correct_link(page_link, ...)`` and stored back under the same key.
     Nothing is returned.
     """
     for item in queue:
         corrected = make_correct_link(page_link, item['link'])
         item['link'] = corrected
Пример #16
0
def make_correct_link_test():
    """Check ``make_correct_link`` against a table of known cases.

    Each case is ``(base, link, expected)``: the result of
    ``make_correct_link(base, link)`` must equal ``expected``.

    Raises:
        AssertionError: on the first case whose result differs; the message
            names the arguments, the actual result and the expected one.
    """
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("helper_tests: make correct link test")

    cases = [
        # Relative link against various forms of base URL.
        ('http://ya.ru/a/', 'g.html', 'http://ya.ru/a/g.html'),
        ('http://ya.ru/', 'g.html', 'http://ya.ru/g.html'),
        ('http://ya.ru/a', 'g.html', 'http://ya.ru/a/g.html'),
        ('http://ya.ru/a?set=1', 'g.html', 'http://ya.ru/g.html'),
        ('http://ya.ru/a.php', 'g.html', 'http://ya.ru/g.html'),
        ('http://www.epubbooks.ru/stanza/index.xml', 'xmllastadd.php',
         'http://www.epubbooks.ru/stanza/xmllastadd.php'),
        # Link starting with a slash.
        ('http://ya.ru/vehicles/', '/a.html', 'http://ya.ru/a.html'),
        # Links that already carry a scheme and host come back unchanged.
        ('http://ya.ru/vehicles/', 'http://www.a.ru/g.html', 'http://www.a.ru/g.html'),
        ('http://ya.ru/vehicles/', 'http://www.org.ru/a.html', 'http://www.org.ru/a.html'),
        ('http://ya.ru/vehicles/', 'https://www.org.ru/a.html', 'https://www.org.ru/a.html'),
        # Parent-directory segments.
        ('http://www.stanza.epubbooks.ru/stanza/genres/0.xml',
         '../../tpl/download.php?npp=3376',
         'http://www.stanza.epubbooks.ru/tpl/download.php?npp=3376'),
    ]

    for base, link, expected in cases:
        actual = make_correct_link(base, link)
        assert actual == expected, (
            "make_correct_link(%r, %r) returned %r, expected %r"
            % (base, link, actual, expected)
        )