def get_menu(self, url=None):
    """Yield chapter dicts from the novel's menu page.

    Each yielded dict has ``source_url`` (absolute URL, built from
    ``self._base_url``) and ``title``.  If the menu page has not been
    fetched yet (``self._novel_html is None``), *url* is fetched and
    parsed first; with neither a cached page nor a *url*, the generator
    produces nothing.
    """
    if self._novel_html is None:
        if not url:
            # NOTE(review): `return False` inside a generator only stops
            # iteration (the False is the StopIteration value, invisible
            # to a plain for-loop) — callers cannot observe it directly.
            return False
        page = parse_url(url, 'utf-8', params=self._params)
        self._novel_html = etree.HTML(page)
    # Skip the first 9 entries — presumably "latest chapters" duplicates
    # at the top of the list; TODO confirm against the site layout.
    hrefs = self._novel_html.xpath('//*[@id="list"]/dd/a/@href')[9:]
    names = self._novel_html.xpath('//*[@id="list"]/dd/a/text()')[9:]
    for href, name in zip(hrefs, names):
        yield {'source_url': self._base_url + href, 'title': name}
def get_info(self, novel_url):
    """Fetch a novel's detail page (gbk-encoded) and return its metadata.

    Caches the parsed page on ``self._novel_html`` for later use, then
    returns a dict with ``image`` (absolute cover URL), ``intro``
    (first intro paragraph) and ``state`` (derived by
    ``self._novel_state`` from the update-info line).
    """
    body = parse_url(novel_url, 'gbk', params=self._params)
    page = etree.HTML(body)
    self._novel_html = page
    cover = page.xpath('//*[@id="fmimg"]/img/@src')[0]
    summary = page.xpath('//*[@id="intro"]/p/text()')[0]
    latest = page.xpath('//*[@id="info"]/p[3]/text()')[0]
    return {
        'image': self._base_url + cover,
        'intro': summary,
        'state': self._novel_state(latest),
    }
def get_info(self, novel_url):
    """Fetch a novel's detail page (utf-8) and return its metadata.

    Caches the parsed page on ``self._novel_html``, then returns a dict
    with ``image`` (absolute cover URL), ``intro`` (third text node of
    the intro element) and ``state``.  The update-info line looks like
    ``label:YYYY-MM-DD hh:mm`` — the date part between the first colon
    and the first space is handed to ``self._novel_state``; TODO confirm
    the exact site format.
    """
    body = parse_url(novel_url, 'utf-8', params=self._params)
    page = etree.HTML(body)
    self._novel_html = page
    cover = page.xpath('//*[@id="fmimg"]/a/img/@src')[0]
    summary = page.xpath('//*[@id="intro"]/text()')[2]
    latest = page.xpath('//*[@id="info"]/p[3]/text()')[0]
    date_part = latest.split(':')[1].split(' ')[0]
    return {
        'image': self._base_url + cover,
        'intro': summary,
        'state': self._novel_state(date_part),
    }
def get_url(self, category):
    """Yield every novel listed under *category* on the xs147 site.

    Each yielded dict has ``source_url``, ``name`` and ``author``
    (leading ``/`` separators stripped from the author text node).
    """
    listing_url = self._xs147.format(category=category)
    page = etree.HTML(parse_url(listing_url, 'utf-8', params=self._params))
    # All three node sets come from the same <li> elements, so the
    # lists stay aligned for zip().
    item = '//*[@id="main"]/div[@class="novelslist"]/div[1]/ul/li'
    hrefs = page.xpath(item + '/a/@href')
    titles = page.xpath(item + '/a/text()')
    writers = page.xpath(item + '/text()')
    for href, title, writer in zip(hrefs, titles, writers):
        yield {
            'source_url': href,
            'name': title,
            'author': writer.lstrip('/'),
        }
def get_url(self, category):
    """Yield every novel under *category* on the biquge site.

    The listing page has two columns (``div.l`` and ``div.r``); both
    are scraped and concatenated, left column first.  Note the author
    sits in ``span[3]`` on the left column but ``span[2]`` on the
    right — presumably the left column carries an extra span; TODO
    confirm against the live markup.
    """
    listing_url = self._biquge.format(category=category)
    page = etree.HTML(parse_url(listing_url, 'gbk', params=self._params))
    left = '//*[@id="newscontent"]/div[@class="l"]/ul/li'
    right = '//*[@id="newscontent"]/div[@class="r"]/ul/li'
    hrefs = (page.xpath(left + '/span[1]/a/@href')
             + page.xpath(right + '/span[1]/a/@href'))
    titles = (page.xpath(left + '/span[1]/a/text()')
              + page.xpath(right + '/span[1]/a/text()'))
    writers = (page.xpath(left + '/span[3]/text()')
               + page.xpath(right + '/span[2]/text()'))
    for href, title, writer in zip(hrefs, titles, writers):
        yield {
            'source_url': href,
            'name': title,
            'author': writer,
        }
def get_chapter(self, chapter_url):
    """Return the chapter's body text as one string.

    Fetches *chapter_url* (gbk-encoded), collects every direct text
    node of the ``#content`` element and joins them without separators.
    """
    body = parse_url(chapter_url, 'gbk', params=self._params)
    page = etree.HTML(body)
    fragments = page.xpath('//*[@id="content"]/text()')
    return ''.join(fragments)