Example #1
class Anicobin(Collector):
    def __init__(self, reporter, waiter, outdir, useragent) -> None:
        super(Anicobin, self).__init__()
        self.reporter: Reporter = reporter
        self.waiter = waiter
        self.outdir = outdir
        self.useragent = useragent
        self.cacher = Cacher(self.outdir)
        self.semaphore = Semaphore(2)

    async def get(self, url):
        filename = urllib.parse.quote(url, safe='') + '.html'
        cache, _ = self.cacher.get(filename)
        if cache:
            html = cache
        else:
            await self.waiter.wait(url)
            print('fetching', url)
            async with aiohttp.request('get',
                                       url,
                                       headers={'user-agent':
                                                self.useragent}) as req:
                content = await req.read()
                html = content.decode(SITE_ENCODING)
                self.cacher.set(filename, html)

        return html

    async def collect(self, base_url, queue_size=3):
        async def f(page):
            print(page)
            html, _ = await self.async_retry(3, self.get,
                                             f'{base_url}?p={page}')
            if not html:
                # All retries failed; stop paging instead of parsing None.
                return False
            result = []
            for post_url in await self.run_in_executor(get_post_urls, html):
                _html = await self.get(post_url)
                urls = get_pict_urls(_html)
                for url in urls:
                    filename = urllib.parse.quote(url, safe='')
                    content, _ = self.cacher.get(filename, binary=True)
                    if not content:
                        await self.add_future(
                            'dlimage', download_file(url, filename,
                                                     self.cacher))

                result.extend(urls)

            return len(result) > 0

        await self.queued_paging(1,
                                 1000,
                                 lambda page: f(page),
                                 queue_size=queue_size)
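
Example #1 (and the two examples below) rely on a Cacher whose implementation is not shown here: get() is expected to return a (content, info) tuple and set() to persist a response under outdir, keyed by a URL-quoted filename. The class below is only a minimal sketch of an interface compatible with those calls; the binary/ext parameters and the metadata file layout are assumptions inferred from how the examples use the cache, not the project's actual code.

import json
import os
from typing import Any, Optional, Tuple


class Cacher:
    """Minimal file-based cache sketch (assumed interface)."""

    def __init__(self, outdir: str) -> None:
        self.outdir = outdir
        os.makedirs(self.outdir, exist_ok=True)

    def get(self,
            filename: str,
            binary: bool = False,
            ext: str = '') -> Tuple[Optional[Any], Optional[dict]]:
        # `ext` is appended to the filename; Example #2 passes ext='' to
        # suppress whatever default extension the real implementation uses.
        path = os.path.join(self.outdir, filename + ext)
        if not os.path.exists(path):
            return None, None
        with open(path, 'rb' if binary else 'r') as f:
            content = f.read()
        info = None
        meta_path = path + '.meta.json'
        if os.path.exists(meta_path):
            with open(meta_path) as f:
                info = json.load(f)
        return content, info

    def set(self,
            filename: str,
            content: Any,
            info: Optional[dict] = None) -> None:
        path = os.path.join(self.outdir, filename)
        with open(path, 'wb' if isinstance(content, bytes) else 'w') as f:
            f.write(content)
        if info is not None:
            with open(path + '.meta.json', 'w') as f:
                json.dump(info, f)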
Example #2
class Keiba(Collector):
    def __init__(self, reporter, waiter, outdir, useragent) -> None:
        super(Keiba, self).__init__()
        self.reporter: Reporter = reporter
        self.waiter = waiter
        self.outdir = outdir
        self.useragent = useragent
        self.cacher = Cacher(self.outdir)
        self.semaphore = Semaphore(2)

    async def get_search_page(
        self,
        n: int,
        options: dict = {
            'pid': str,
            'word': str,
            'track[]': str,
            'start_year': str,
            'start_mon': str,
            'end_year': str,
            'end_mon': str,
            'jyo[]': str,
            'kyori_min': str,
            'kyori_max': str,
            'sort': str,
            'list': str,
        }):
        url = 'https://db.netkeiba.com/'
        pseudo_url = f'{url}?{urllib.parse.urlencode(options)}&page=1'
        filename = urllib.parse.quote(pseudo_url + '.html', safe='')

        cache, _ = self.cacher.get(filename)
        if cache:
            search_result = cache
        else:
            await self.waiter.wait(pseudo_url)
            print('fetching', pseudo_url)
            async with aiohttp.request(
                    'post',
                    url=url,
                    headers={
                        'content-type': 'application/x-www-form-urlencoded',
                        'user-agent': self.useragent
                    },
                    data=urllib.parse.urlencode(options)) as req:
                content = await req.read()
                search_result = content.decode(SITE_ENCODING)
                if str(req.url) == url:
                    self.cacher.set(filename, search_result)
                else:
                    print(f'Warning: redirected to {str(req.url)}')
                    self.cacher.set(
                        urllib.parse.quote(str(req.url), safe='') + '.html',
                        search_result)
                    return None

        if n == 1:
            return search_result
        else:
            data = await self.run_in_executor(get_nextpage_data, search_result)
            data['page'] = str(n)
            pseudo_url = f'{url}?{urllib.parse.urlencode(options)}&page={n}'
            filename = urllib.parse.quote(pseudo_url + '.html', safe='')

            cache, _ = self.cacher.get(filename)
            if cache:
                result = cache
            else:
                await self.waiter.wait(pseudo_url)
                print('fetching', pseudo_url)
                async with aiohttp.request(
                        'post',
                        url=url,
                        headers={
                            'content-type':
                            'application/x-www-form-urlencoded',
                            'user-agent': self.useragent
                        },
                        data=urllib.parse.urlencode(
                            data, encoding=SITE_ENCODING)) as req:
                    try:
                        content = await req.read()
                        result = content.decode(SITE_ENCODING)
                        self.cacher.set(filename, result)
                    except Exception as e:
                        print('get_tail', e)
                        return None
            return result

    async def get_race_page(self, url):
        filename = urllib.parse.quote(url, safe='') + '.html'
        cache, _ = self.cacher.get(filename, ext='')
        if cache:
            html = cache
        else:
            await self.waiter.wait(url)
            print('fetching', url)
            async with aiohttp.request('get',
                                       url,
                                       headers={'user-agent':
                                                self.useragent}) as req:
                content = await req.read()
                html = content.decode(SITE_ENCODING)
                self.cacher.set(filename, html)

        return html

    async def collect(self, year, queue_size=3):
        async def f(page):
            print(page)
            html, _ = await self.async_retry(
                3, self.get_search_page, page, {
                    'pid': 'race_list',
                    'start_year': str(year),
                    'end_year': str(year),
                    'sort': 'date',
                    'list': '100'
                })
            if not html:
                return False
            race_urls = await self.run_in_executor(get_race_urls, html)
            futures = [
                await self.add_future('get_race', self.get_race_page(race_url))
                for race_url in race_urls
            ]
            # A full page (100 races) suggests more pages remain.
            return len(futures) == 100

        await self.queued_paging(1,
                                 1000,
                                 lambda page: f(page),
                                 queue_size=queue_size)

    async def collect_horse(self, year, queue_size=3):
        async def f(page):
            print(page)
            html, error = await self.async_retry(3, self.get_search_page, page,
                                                 {
                                                     'pid': 'horse_list',
                                                     'list': '100',
                                                     'birthyear': year,
                                                 })
            if error:
                print('Warning: max retries exceeded')
                return False
            if not html:
                return False
            # A full page (100 horses) suggests more pages remain.
            return len(await self.run_in_executor(
                get_horse_urls, html)) == 100

        await self.queued_paging(1,
                                 1000,
                                 lambda page: f(page),
                                 queue_size=queue_size)
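
All three collectors also call helpers inherited from the Collector base class (async_retry, run_in_executor, add_future, queued_paging) that are not shown on this page. The sketch below reconstructs a minimal, assumed version of those helpers from the way the examples use them: async_retry returns a (result, error) pair, run_in_executor moves blocking HTML parsing off the event loop, add_future schedules a background task, and queued_paging keeps fetching pages until a page callback returns False. Treat it as an illustration of the contract, not the original implementation.

import asyncio
from typing import Any, Awaitable, Callable, Optional, Tuple


class Collector:
    """Assumed base-class helpers, reconstructed from the call sites above."""

    def __init__(self) -> None:
        self.futures = []

    async def async_retry(self, tries: int, func: Callable[..., Awaitable],
                          *args) -> Tuple[Any, Optional[Exception]]:
        # Await func(*args) up to `tries` times and return (result, error).
        error: Optional[Exception] = None
        for _ in range(tries):
            try:
                return await func(*args), None
            except Exception as e:
                error = e
        return None, error

    async def run_in_executor(self, func: Callable, *args) -> Any:
        # Run blocking work (HTML parsing, etc.) in the default executor.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, func, *args)

    async def add_future(self, name: str, coro: Awaitable) -> asyncio.Task:
        # Schedule a coroutine as a background task and keep a handle to it.
        task = asyncio.ensure_future(coro)
        self.futures.append((name, task))
        return task

    async def queued_paging(self,
                            start: int,
                            end: int,
                            page_fn: Callable[[int], Awaitable[bool]],
                            queue_size: int = 3) -> None:
        # Fetch pages start..end, `queue_size` at a time, and stop as soon
        # as any page callback reports that there is nothing left.
        for head in range(start, end + 1, queue_size):
            pages = range(head, min(head + queue_size, end + 1))
            results = await asyncio.gather(*(page_fn(p) for p in pages))
            if not all(results):
                break
        # Wait for background tasks (downloads, etc.) queued via add_future.
        if self.futures:
            await asyncio.gather(*(task for _, task in self.futures),
                                 return_exceptions=True)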
Example #3
class WearCollector(Collector):
    def __init__(self,
                 reporter: Reporter,
                 waiter: Waiter,
                 outdir: str,
                 useragent: str = ''):
        super(WearCollector, self).__init__()
        self.reporter: Reporter = reporter
        self.waiter = waiter
        self.outdir = outdir
        self.useragent = useragent
        self.cacher = Cacher(self.outdir)
        # Limits the number of concurrent asynchronous connections
        self.semaphore = Semaphore(2)
        # File downloader
        self.downloader = Downloader(self.waiter, self.semaphore,
                                     self.reporter)

    async def download_user_page(self, url: str, page_num):
        url = url + f'?pageno={page_num}'

        # Use the cache if it exists
        filename = urllib.parse.quote(url, safe='') + '.html'
        content, info = self.cacher.get(filename)
        if content and info:
            html = content
            realurl = info.get('realurl')
            self.reporter.report(INFO, f'use cache {url}')
        else:
            await self.waiter.wait(url)
            async with self.semaphore:
                self.reporter.report(INFO, f'fetching {url}', type=NETWORK)
                async with aiohttp.request(
                        'get', url, headers={'user-agent':
                                             self.useragent}) as res:
                    html = await res.text()
                    realurl = str(res.url)
                    self.cacher.set(filename, html, {
                        'status': res.status,
                        'realurl': realurl
                    })

        # Termination condition: a request for page >= 2 that comes back
        # without ?pageno in the final URL means there are no more pages.
        if page_num >= 2 and realurl.count('?pageno') == 0:
            return False
        else:
            for url, data in await self.run_in_executor(parse_user, html):
                await self.add_future(
                    'gallery',
                    self.gallery_collector(url, 1, 501, userdata=data))
            return True

    async def user_collector(self, url: str, pagestart: int, pageend: int):
        await self.queued_paging(
            pagestart, pageend,
            lambda page: self.download_user_page(url, page))

    async def download_gallery_page(self,
                                    url: str,
                                    page_num: int,
                                    userdata=None):
        url = url + f'?pageno={page_num}'
        filename = urllib.parse.quote(url, safe='') + '.html'
        content, info = self.cacher.get(filename)
        if content and info:
            html = content
            realurl = info.get('realurl')
            self.reporter.report(INFO, f'use cache {url}')
        else:
            await self.waiter.wait(url)
            async with self.semaphore:
                self.reporter.report(INFO, f'fetching {url}', type=NETWORK)
                async with aiohttp.request(
                        'get', url, headers={'user-agent':
                                             self.useragent}) as res:
                    html = await res.text()
                    realurl = str(res.url)
                    self.cacher.set(filename, html, {
                        'status': res.status,
                        'realurl': realurl
                    })

        # Termination condition: a request for page >= 2 that comes back
        # without ?pageno in the final URL means there are no more pages.
        if page_num >= 2 and realurl.count('?pageno') == 0:
            return False
        else:
            for url, data in await self.run_in_executor(
                    parse_gallely, html, userdata):
                imagefile = urllib.parse.quote(url, safe='')
                tmp_save(os.path.join(self.outdir, imagefile + '.json'),
                         json.dumps(data))
                imagepath = os.path.join(self.outdir, imagefile)
                if not os.path.exists(imagepath):
                    await self.add_future(
                        'image',
                        self.downloader.download_file(
                            url,
                            imagepath,
                            headers={'user-agent': self.useragent}))
            return True

    async def gallery_collector(self,
                                url: str,
                                pagestart: int,
                                pageend: int,
                                userdata=None):
        await self.queued_paging(
            pagestart, pageend, lambda page: self.download_gallery_page(
                url, page, userdata=userdata))
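
For completeness, here is one way the WearCollector above could be driven end to end. Reporter and Waiter are project classes that are not shown, so the stubs below only provide the methods the collector actually calls (report() and wait()); the INFO/NETWORK constants, the output directory, and the list URL are placeholders, and the module-level helpers (parse_user, parse_gallely, Downloader, tmp_save, and so on) are assumed to be importable from the same module.

import asyncio


INFO = 'INFO'        # placeholder log-level constant
NETWORK = 'NETWORK'  # placeholder report type


class StubReporter:
    def report(self, level, message, type=None):
        print(level, message)


class StubWaiter:
    async def wait(self, url):
        # Simple politeness delay between requests.
        await asyncio.sleep(1)


async def main():
    collector = WearCollector(StubReporter(),
                              StubWaiter(),
                              outdir='./wear_cache',
                              useragent='example-bot/0.1')
    # Page through a (placeholder) user list URL; each user page queues
    # gallery_collector tasks, which in turn queue image downloads.
    await collector.user_collector('https://wear.jp/example-list/', 1, 501)


if __name__ == '__main__':
    asyncio.run(main())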