Пример #1
0
async def fetch(client, url, name, is_web):
    """Fetch a Baidu search-results page for *name*.

    Args:
        client: aiohttp ``ClientSession`` used to issue the request.
        url: search endpoint URL.
        name: query string (novel name).
        is_web: if truthy, use the desktop web-search parameters
            (``wd``/``rn``/``vf_bl``); otherwise the simple ``word`` form.

    Returns:
        The response body (``str``, or ``bytes`` when decoding fails),
        or ``None`` on any error (logged).
    """
    # `async with` is the supported form; the bare `with` usage was
    # deprecated in async_timeout 3.x and removed in 4.0.
    async with async_timeout.timeout(15):
        try:
            headers = {'user-agent': get_random_user_agent()}
            if is_web:
                params = {
                    'wd': name,
                    'ie': 'utf-8',
                    'rn': CONFIG.BAIDU_RN,
                    'vf_bl': 1
                }
            else:
                params = {'word': name}
            async with client.get(url, params=params,
                                  headers=headers) as response:
                # Explicit check instead of `assert`, which is silently
                # stripped when Python runs with -O.
                if response.status != 200:
                    return None
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Charset detection failed — fall back to raw bytes.
                    # (Bare `except:` would also swallow CancelledError.)
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
Пример #2
0
async def fetch(client, url, novels_name):
    """Fetch a so.com (360 search) results page for *novels_name*.

    Args:
        client: aiohttp ``ClientSession`` used to issue the request.
        url: search endpoint URL.
        novels_name: query string (novel name).

    Returns:
        The response body (``str``, or ``bytes`` when decoding fails),
        or ``None`` on any error (logged).
    """
    # `async with` is the supported form; the bare `with` usage was
    # deprecated in async_timeout 3.x and removed in 4.0.
    async with async_timeout.timeout(20):
        try:
            headers = {
                'User-Agent': get_random_user_agent(),
                'Referer': "http://www.so.com/haosou.html?src=home"
            }
            params = {
                'ie': 'utf-8',
                'src': 'noscript_home',
                'shb': 1,
                'q': novels_name,
            }
            async with client.get(url, params=params,
                                  headers=headers) as response:
                # Explicit check instead of `assert`, which is silently
                # stripped when Python runs with -O.
                if response.status != 200:
                    return None
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Charset detection failed — fall back to raw bytes.
                    # (Bare `except:` would also swallow CancelledError.)
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
Пример #3
0
class ZHRankingSpider(Spider):
    """Spider that scrapes Zongheng's ranking page and upserts the
    per-ranking top-ten book lists into MongoDB.

    NOTE(review): `Spider`, `RankingItem`, `NameItem` and `MotorBaseOld`
    are project-defined; `parse` is presumably invoked by the framework
    with a response object exposing `.html` and `.url` — confirm.
    """
    start_urls = ['http://book.zongheng.com/rank.html']

    # Class body executes at import time, so a random User-Agent is
    # resolved eagerly here via the loop (get_random_user_agent is async).
    headers = {
        "User-Agent":
        asyncio.get_event_loop().run_until_complete(get_random_user_agent())
    }
    # Presumably the framework's cap on concurrent requests — confirm.
    concurrency = 3

    async def parse(self, res):
        """Extract every ranking block from the page and save the
        aggregated result document.

        :param res: framework response with ``.html`` (page source)
            and ``.url`` (requested URL).
        """
        result = []
        res_dic = {}

        items_data = await RankingItem.get_items(html=res.html)

        for item in items_data:
            each_book_list = []
            # Only keep the top-ten books of each ranking list.
            for index, value in enumerate(item.book_list[:10]):
                item_data = await NameItem.get_item(html_etree=value)
                # The title lives in `top_name` or `other_name`,
                # whichever the page provided for this entry.
                name = item_data.top_name or item_data.other_name
                each_book_list.append({'num': index + 1, 'name': name})
            data = {
                'title': item.ranking_title,
                'more': item.more,
                'book_list': each_book_list,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }
            result.append(data)

        res_dic['data'] = result
        res_dic['target_url'] = res.url
        res_dic['type'] = "人气榜单"
        res_dic['spider'] = "zongheng"
        await self.save(res_dic)

    async def save(self, res_dic):
        """Upsert the scraped ranking document into the
        ``novels_ranking`` collection, keyed by ``target_url``.
        """
        try:
            motor_db = MotorBaseOld().db
            await motor_db.novels_ranking.update_one(
                {'target_url': res_dic['target_url']}, {
                    '$set': {
                        'data':
                        res_dic['data'],
                        'spider':
                        res_dic['spider'],
                        'type':
                        res_dic['type'],
                        'finished_at':
                        time.strftime("%Y-%m-%d %X", time.localtime())
                    }
                },
                upsert=True)
        except Exception as e:
            # Best-effort persistence: log and swallow DB errors so one
            # failed write does not kill the crawl.
            self.logger.exception(e)
Пример #4
0
async def fetch(client, url):
    """Fetch *url* with a random User-Agent.

    Args:
        client: aiohttp ``ClientSession`` used to issue the request.
        url: page URL to retrieve.

    Returns:
        The response body (``str``, or ``bytes`` when decoding fails),
        or ``None`` on any error (logged).
    """
    # `async with` is the supported form; the bare `with` usage was
    # deprecated in async_timeout 3.x and removed in 4.0.
    async with async_timeout.timeout(10):
        try:
            headers = {'user-agent': get_random_user_agent()}
            async with client.get(url, headers=headers) as response:
                # Explicit check instead of `assert`, which is silently
                # stripped when Python runs with -O.
                if response.status != 200:
                    return None
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Charset detection failed — fall back to raw bytes.
                    # (Bare `except:` would also swallow CancelledError.)
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
Пример #5
0
async def fetch(client, url, novels_name):
    """Fetch a Bing search-results page for *novels_name*.

    Args:
        client: aiohttp ``ClientSession`` used to issue the request.
        url: search endpoint URL.
        novels_name: query string (novel name).

    Returns:
        The response body (``str``, or ``bytes`` when decoding fails),
        or ``None`` on any error (logged).
    """
    # `async with` is the supported form; the bare `with` usage was
    # deprecated in async_timeout 3.x and removed in 4.0.
    async with async_timeout.timeout(20):
        try:
            headers = {
                'user-agent': get_random_user_agent(),
                'referer': "https://www.bing.com/"
            }
            # ensearch=0 keeps results in the Chinese-market index.
            params = {'q': novels_name, 'ensearch': 0}
            async with client.get(url, params=params,
                                  headers=headers) as response:
                # Explicit check instead of `assert`, which is silently
                # stripped when Python runs with -O.
                if response.status != 200:
                    return None
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Charset detection failed — fall back to raw bytes.
                    # (Bare `except:` would also swallow CancelledError.)
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
Пример #6
0
async def get_real_url(client, url):
    """Resolve *url* to its final location with a redirect-following
    HEAD request.

    Args:
        client: aiohttp ``ClientSession`` used to issue the request.
        url: URL to resolve.

    Returns:
        The final URL after redirects (a yarl ``URL``), or ``None`` on
        any error (logged).
    """
    # `async with` is the supported form; the bare `with` usage was
    # deprecated in async_timeout 3.x and removed in 4.0.
    async with async_timeout.timeout(10):
        try:
            headers = {'user-agent': get_random_user_agent()}
            async with client.head(url, headers=headers,
                                   allow_redirects=True) as response:
                # Explicit check instead of `assert`, which is silently
                # stripped when Python runs with -O.
                if response.status != 200:
                    return None
                LOGGER.info('Parse url: {}'.format(response.url))
                # Removed a stale commented-out body-scraping fallback;
                # the HEAD request carries no body to parse anyway.
                return response.url if response.url else None
        except Exception as e:
            LOGGER.exception(e)
            return None
Пример #7
0
class QidianRankingSpider(Spider):
    """Spider that scrapes Qidian's per-category ranking pages and
    upserts the top-ten book lists into MongoDB.

    NOTE(review): `Spider`, `RankingItem`, `NameItem` and `MotorBaseOld`
    are project-defined; `parse` is presumably invoked by the framework
    with a response object exposing `.html` and `.url` — confirm.
    """
    # One start URL per category channel id (chn); ids map to names in
    # `qidian_type` below.
    start_urls = [
        "http://r.qidian.com/?chn=" + str(url)
        for url in [-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]
    ]

    # Class body executes at import time, so a random User-Agent is
    # resolved eagerly here via the loop (get_random_user_agent is async).
    headers = {
        "User-Agent":
        asyncio.get_event_loop().run_until_complete(get_random_user_agent())
    }
    # Presumably the framework's cap on concurrent requests — confirm.
    concurrency = 3
    # Channel id -> human-readable category name (Chinese labels are
    # stored verbatim in the result documents).
    qidian_type = {
        '-1': '全部类别',
        '21': '玄幻',
        '1': '奇幻',
        '2': '武侠',
        '22': '仙侠',
        '4': '都市',
        '15': '职场',
        '6': '军事',
        '5': '历史',
        '7': '游戏',
        '8': '体育',
        '9': '科幻',
        '10': '灵异',
        '12': '二次元',
    }

    async def parse(self, res):
        """Extract every ranking block from one category page and save
        the aggregated result document.

        :param res: framework response with ``.html`` (page source)
            and ``.url`` (requested URL, carrying the ``chn=`` id).
        """
        items_data = await RankingItem.get_items(html=res.html)
        result = []
        res_dic = {}
        for item in items_data:
            each_book_list = []
            # Only keep the top-ten books of each ranking list.
            for index, value in enumerate(item.book_list[:10]):
                item_data = await NameItem.get_item(html_etree=value)
                # The title lives in `top_name` or `other_name`,
                # whichever the page provided for this entry.
                name = item_data.top_name or item_data.other_name
                each_book_list.append({'num': index + 1, 'name': name})
            data = {
                'title': item.ranking_title,
                'more': item.more,
                'book_list': each_book_list,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }
            result.append(data)
        res_dic['data'] = result
        res_dic['target_url'] = res.url
        # Category name recovered from the `chn=` query value in the URL.
        res_dic['type'] = self.qidian_type.get(res.url.split('=')[-1])
        res_dic['spider'] = "qidian"
        await self.save(res_dic=res_dic)

    async def save(self, res_dic):
        # Persist into the database.
        """Upsert the scraped ranking document into the
        ``novels_ranking`` collection, keyed by ``target_url``.
        """
        try:
            motor_db = MotorBaseOld().db
            await motor_db.novels_ranking.update_one(
                {'target_url': res_dic['target_url']}, {
                    '$set': {
                        'data':
                        res_dic['data'],
                        'spider':
                        res_dic['spider'],
                        'type':
                        res_dic['type'],
                        'finished_at':
                        time.strftime("%Y-%m-%d %X", time.localtime())
                    }
                },
                upsert=True)
        except Exception as e:
            # Best-effort persistence: log and swallow DB errors so one
            # failed write does not kill the crawl.
            self.logger.exception(e)