Exemplo n.º 1
0
    async def process(self, response: Response):
        """Fan out follow-up requests once every required field is present.

        Prints the extracted fields, then, only when ``mid``, ``aid``,
        ``urls`` and ``cid`` are all truthy, yields a recommendation request,
        a user-info POST and one download request per entry in ``urls``.

        Args:
            response: The response this item was built from; not read here.

        Yields:
            Request: The follow-up requests described above.
        """
        print(self['urls'])
        print(self['cid'])
        print(self['aid'])
        print(self['mid'])
        print(self.json())

        mid, aid, urls, cid = self['mid'], self['aid'], self['urls'], self['cid']
        if not (mid and aid and urls and cid):
            return

        # Recommended videos
        yield Request(url=recommend_url.format(cid=cid, aid=aid),
                      callback=Recommend)

        # User info
        yield Request(url=user_url,
                      method='POST',
                      data={
                          'csrf': '',
                          'mid': mid
                      },
                      callback=UserInfo,
                      not_filter=True)

        # Download videos
        for order, url in enumerate(urls):
            # Upgrade only a leading plain-HTTP scheme. A blanket
            # str.replace('http', 'https') would turn 'https://' into
            # 'httpss://' and also rewrite any later 'http' in the URL.
            if url.startswith('http://'):
                url = 'https://' + url[len('http://'):]
            yield Request(url=url,
                          meta={'name': aid + '_' + str(order)},
                          callback=Video)
Exemplo n.º 2
0
async def videoinfo(response: Response):
    """Handle one page of a video listing and schedule the follow-ups.

    Does nothing unless the status code is 200 and the JSON ``status`` field
    is truthy. Otherwise it re-reads ``mid`` and ``page`` from the request
    URL, yields a request for the next page (handled by this same callback),
    stores every ``vlist`` entry in ``db['video']`` keyed by ``aid`` and
    yields one ``AVPage`` request per entry.
    """
    if response.status_code != 200 or not response.json()['status']:
        return

    mid, page = re.findall('mid=(.*?)&pagesize=30&tid=0&page=(.*?)&keyword',
                           response.url)[0]
    next_page = int(page) + 1
    yield Request(url=video_url.format(mid, next_page), callback=videoinfo)

    for entry in response.json()['data']['vlist']:
        # Third positional argument is True; presumably pymongo's ``upsert``
        # flag - confirm against the type of ``db``.
        db['video'].update_one({'aid': entry['aid']}, {'$set': entry}, True)
        yield Request(url=av_url.format(aid=entry['aid']), callback=AVPage)
Exemplo n.º 3
0
 async def process(self, response: Response):
     """Schedule the next listing page and one request per listed video.

     Does nothing when the JSON ``status`` field is falsy. Otherwise the
     ``mid`` and ``page`` values are parsed back out of the request URL, the
     next page is requested (callback ``VideoInfo``) and every entry of
     ``data['vlist']`` produces an ``AV`` request.

     Yields:
         Request: The next-page request followed by one request per video.
     """
     payload = response.json()
     if not payload.get('status'):
         return
     data = payload.get('data')
     # Raw string: '\d' in a plain literal is an invalid escape sequence.
     pattern = r'mid=(\d+?)&pagesize=30&tid=0&page=(\d+?)&keyword=&order=pubdate'
     patn = re.findall(pattern, response.url)[0]
     print(patn)
     # No trailing comma after the call: a trailing comma would make this
     # statement yield a 1-tuple ``(Request,)`` instead of the Request.
     yield Request(url=videos_url.format(mid=patn[0],
                                         page=int(patn[1]) + 1),
                   callback=VideoInfo)
     for v in data['vlist']:
         yield Request(url=av_url.format(aid=v.get('aid')), callback=AV)
Exemplo n.º 4
0
def set_init_requests():
    """Generate the seed requests for every mid in ``init_mids``.

    For each seed mid this yields, in order: the first page of the follower
    listing, the first page of the followed listing (both handled by
    ``follow``) and a POST to ``user_url`` handled by ``UserInfo``.
    """
    for seed_mid in init_mids:
        for listing_url in (follower_url, followed_url):
            yield Request(url=listing_url.format(mid=seed_mid, page=1),
                          callback=follow)

        # The same dict object serves as both the form body and form_filter.
        form = {'csrf': None, 'mid': seed_mid}
        yield Request(url=user_url,
                      method='post',
                      data=form,
                      form_filter=form,
                      callback=UserInfo)
Exemplo n.º 5
0
 async def process(self, response: Response):
     """Store a user record and queue that user's relation listings.

     Requires a 200 status code and a truthy JSON ``status``. Every key this
     item already has that also appears in ``data`` is copied over,
     ``level`` is taken from ``data['level_info']['current_level']``, the
     item is written to ``db['user']`` keyed by ``mid``, and first-page
     requests for the followed and follower listings are yielded.
     """
     if response.status_code != 200:
         return
     result = response.json()
     if not result['status']:
         return

     info = result['data']
     for key in self.keys():
         if key in info:
             self[key] = info[key]
     self['level'] = info['level_info']['current_level']

     # Third positional argument is True; presumably pymongo's ``upsert``
     # flag - confirm against the type of ``db``.
     db['user'].update_one({'mid': self['mid']}, {"$set": self.json()}, True)

     for listing_url in (followed_url, follower_url):
         yield Request(url=listing_url.format(mid=info['mid'], page=1),
                       callback=follow)
Exemplo n.º 6
0
 async def process(self, response: Response):
     """Persist the item for this video page and queue its segment downloads.

     Only acts on a 200 response. The numeric id after ``av`` in the response
     URL selects the ``db['video']`` document to update; one download request
     is then yielded per ``durl`` entry decoded from ``self['data']``, named
     ``<aid>_<order>.mp4``.

     Raises:
         IndexError: If the response URL is not a bilibili ``av`` video URL.
     """
     if response.status_code != 200:
         return
     durl = json.loads(self['data'])['durl']
     # Capture digits only. The previous greedy ``av(.*)`` also swallowed a
     # trailing '/' or query string, which made ``int(aid)`` raise and leaked
     # into the download file name.
     aid = re.findall(r'https://www\.bilibili\.com/video/av(\d+)',
                      response.url)[0]
     aid_number = int(aid)
     db['video'].update_one({'aid': aid_number}, {'$set': self.json()})
     print(db['video'].find_one({'aid': aid_number}))
     for d in durl:
         name = str(aid) + '_' + str(d['order']) + '.mp4'
         yield Request(d['url'], meta={'name': name}, callback=download)
Exemplo n.º 7
0
async def follow(response):
    """Walk one page of a relation listing and queue the follow-ups.

    Requires a 200 status code and a JSON ``code`` of 0. The relation kind,
    mid and page number are parsed out of the request URL; the next page of
    the same listing is requested (handled by this callback), then a
    user-info POST is yielded for every entry in ``data['list']``.

    Raises:
        IndexError: If the response URL does not match the relation pattern.
    """
    if response.status_code != 200:
        return
    result = response.json()
    if result['code'] != 0:
        return

    # Raw string: '\?' in a plain literal is an invalid escape sequence.
    relation, mid, page = re.findall(
        r'x/relation/(.*?)\?vmid=(.*?)&pn=(.*?)&ps', response.url)[0]

    if relation == 'followings':
        next_template = followed_url
    elif relation == 'followers':
        next_template = follower_url
    else:
        # Unknown relation kind: no pagination request, as before.
        next_template = None

    if next_template is not None:
        yield Request(url=next_template.format(mid=mid, page=int(page) + 1),
                      callback=follow)

    for data in result['data']['list']:
        form_data = {'csrf': '', 'mid': data['mid']}
        yield Request(url=user_url,
                      method='post',
                      data=form_data,
                      form_filter=form_data,
                      callback=UserInfo)
Exemplo n.º 8
0
 async def process(self, response):
     """Decode the numeric fields, store the book and request its score.

     The class attribute of the first matching ``span`` is passed to
     ``get_nums`` together with each raw counter value; the results replace
     the five counter fields. The image URL gets an ``https:`` prefix and
     loses its last character, the item is written to ``db['qidian']`` keyed
     by ``id``, and a ``BookScore`` request is yielded.
     """
     font_type = response.html.xpath(
         '//div[contains(@class, "book-info")]/p[3]/em[1]/span/@class',
         first=True)

     # Same order as before: each counter is awaited before the next starts.
     counter_fields = ('total_words', 'total_click', 'week_click',
                       'total_recommend', 'week_recommend')
     for field in counter_fields:
         self[field] = await get_nums(font_type, self[field])

     self['img'] = 'https:' + self['img'][:-1]
     print(self.json())

     update_filter = {'id': self['id']}
     update_doc = {"$set": dict(self.json())}
     db['qidian'].update_one(update_filter, update_doc, True)

     score_page = score_url.format(spider.cookies.get('_csrfToken'),
                                   self['id'])
     yield Request(score_page,
                   callback=BookScore,
                   meta={'bookid': self['id']})
Exemplo n.º 9
0
 async def process(self, response: Response):
     """Yield one ``AV`` request per row of the JSON response.

     The element at index 1 of each row is used as the ``aid`` placeholder
     of ``av_url``.
     """
     rows = response.json()
     for row in rows:
         page_url = av_url.format(aid=row[1])
         yield Request(page_url, callback=AV)
Exemplo n.º 10
0

class Video(Model):
    """Model whose ``process`` writes the response body to ``<name>.mp4``."""

    async def process(self, response: Response):
        """Stream the response to disk in 512-byte chunks.

        The base file name comes from the ``name`` entry of the originating
        request's ``meta``. Nothing is written when that entry is missing or
        empty, or when the status code is not 200.
        """
        file_name = response.current_request.meta.get('name')
        if not file_name or response.status_code != 200:
            return

        target = file_name + '.mp4'
        with open(target, 'wb') as out:
            for chunk in response.iter_content(chunk_size=512):
                out.write(chunk)
                out.flush()


# Spider named 'bilibili', created with workers=5.
snake = Spider('bilibili', workers=5)

# Single seed request: page 1 of the video listing for mid 35789774,
# handled by the VideoInfo callback.
snake.init_requests = [
    Request(url=videos_url.format(mid='35789774', page=1), callback=VideoInfo),
]
# NOTE(review): presumably the cap on concurrent requests - confirm against
# the requests_spider documentation.
snake.async_limit = 5


@snake.Middleware('request')
async def test(request):
    """Print the request URL and set its ``Referer`` header.

    URLs under ``https://space.bilibili.com/`` get that same prefix as the
    referer; every other URL gets ``https://bilibili.com/``.
    """
    print(request.url)
    if request.url.startswith('https://space.bilibili.com/'):
        referer = 'https://space.bilibili.com/'
    else:
        referer = 'https://bilibili.com/'
    request.info.update({'headers': {'Referer': referer}})
Exemplo n.º 11
0
    def dumps_nowait(self, item):
        """Pickle ``item`` and hand the resulting bytes to ``put_nowait``."""
        self.put_nowait(pickle.dumps(item))

    def loads_nowait(self):
        """Take one pickled payload from ``get_nowait`` and return it unpickled."""
        return pickle.loads(self.get_nowait())


if __name__ == '__main__':
    # Manual smoke test: push a Request through the queue's pickle round trip
    # and print the attributes of the restored object.
    from requests_spider import Request, Response
    from requests_spider import Model
    from requests_spider import Field


    class m(Model):
        a = Field()
        _b = 111

        async def process(self, response: Response):
            print(response)


    r = Request(url='https://www.baidu.com', callback=m)

    q = Squeue()
    # Extra private attribute, injected to see whether it survives pickling.
    r.__dict__.update({'_love': 11})
    q.dumps_nowait(r)
    # Named ``restored`` rather than ``re`` so the module-level name of the
    # stdlib ``re`` module is not clobbered.
    restored = q.loads_nowait()
    print(restored.__dict__)
Exemplo n.º 12
0
async def test(response):
    """Yield 100000 identical httpbin GET requests, then print the status code.

    Every request uses ``speed`` as its callback. The status code of
    ``response`` is printed only after the generator has been fully consumed.
    """
    target = 'http://www.httpbin.org/get'
    for _ in range(100000):
        yield Request(url=target, callback=speed)
    print(response.status_code)
Exemplo n.º 13
0
from requests_spider import Spider, Request


async def test(response):
    """Emit 100000 httpbin GET requests handled by ``speed``.

    Once the last request has been yielded, the status code of ``response``
    is printed.
    """
    remaining = 100000
    while remaining > 0:
        yield Request(url='http://www.httpbin.org/get', callback=speed)
        remaining -= 1
    print(response.status_code)


async def speed(response):
    """Print the status code of ``response``."""
    status = response.status_code
    print(status)


# Spider named 'test', seeded with one request to httpbin's /status/200
# endpoint; its response is handled by the ``test`` callback.
spider = Spider('test')
spider.init_requests = [Request(url='http://www.httpbin.org/status/200', callback=test)]

# Start the spider only when this file is executed as a script.
if __name__ == '__main__':
    spider.run()
Exemplo n.º 14
0
                name = str(aid) + '_' + str(d['order']) + '.mp4'
                yield Request(d['url'], meta={'name': name}, callback=download)


async def download(response: Response):
    """Write the body of a 200 response to the file named in the request meta.

    The target path is the ``name`` entry of the originating request's
    ``meta``; the body is streamed in 1024-byte chunks and flushed after
    every chunk. Responses with any other status code are ignored.
    """
    if response.status_code != 200:
        return

    name = response.current_request.meta.get('name')
    with open(name, 'wb') as out:
        for chunk in response.iter_content(chunk_size=1024):
            out.write(chunk)
            out.flush()


# Spider named 'bilibili_video', created with workers=4.
spider = Spider('bilibili_video', workers=4)
# NOTE(review): presumably the cap on concurrent requests - confirm against
# the requests_spider documentation.
spider.async_limit = 4
# One seed request per entry of ``mids``: page 1 of that mid's video listing,
# handled by ``videoinfo``.
spider.init_requests = [
    Request(url=video_url.format(mid, 1), callback=videoinfo) for mid in mids
]


@spider.Middleware('request')
async def set_headers(request: Request):
    """Attach Referer and User-Agent headers, wait two seconds, return the request.

    Args:
        request: The outgoing request; its ``info`` mapping is updated in
            place with a ``headers`` entry.

    Returns:
        The same request object.
    """
    # The standard header name is 'User-Agent'. The previous key 'UserAgent'
    # is a different, non-standard header, so the default user agent was
    # still being sent.
    request.info.update({'headers': {'Referer': rf, 'User-Agent': ua}})
    # Fixed two-second pause before each request goes out.
    await asyncio.sleep(2)
    return request


# Start the spider only when this file is executed as a script.
if __name__ == '__main__':
    spider.run()
Exemplo n.º 15
0

class Video(Model):
    """Model whose ``process`` saves the response body under its URL's last segment."""

    async def process(self, response: Response):
        """Write ``response.content`` to a file named after the URL tail.

        The file name is everything after the final ``/`` of the response
        URL; it is printed before the file is written.
        """
        filename = response.url.rsplit('/', 1)[-1]
        print(filename)
        with open(filename, 'wb') as out:
            out.write(response.content)
            out.flush()


# Spider named 'pearvideo', seeded with ten listing requests (x = 0..9);
# ``start`` receives x and ``sort`` receives x * 10.
spider = Spider('pearvideo')
spider.init_requests = [
    Request(
        url=
        'http://www.pearvideo.com/popular_loading.jsp?reqType=1&start={}&sort={}'
        .format(x, x * 10),
        callback=VideoInfo) for x in range(0, 10)
]

# Link-extraction rules: an XPath rule for the "popularembd" anchors and a
# regex rule that routes plain-HTTP .mp4 URLs to the Video model.
spider.rules = [
    XRule(rule='//a[contains(@class, "popularembd")]/@href'),
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    RRule(rule=r'(http://.*?\.mp4)', model=Video)
]


@spider.Middleware('request')
async def set_timeout(request):
    """Request middleware that returns ``request`` unchanged."""
    return request
Exemplo n.º 16
0
import json
from requests_spider import XField, Spider, Model, Response, Request


class Proxy(Model):
    """Model with ``ip`` and ``port`` fields taken from the "odd" table rows."""

    ip = XField(rule='//tr[contains(@class, "odd")]/td[2]', first=False)
    port = XField(rule='//tr[contains(@class, "odd")]/td[3]', first=False)

    async def process(self, response: Response):
        """Append every result of ``self.merge()`` to ``proxy1.txt``.

        Each result is written as one JSON document followed by a newline.
        """
        with open('proxy1.txt', 'a+') as sink:
            sink.writelines(
                json.dumps(record) + '\n' for record in self.merge())


# Spider named 'proxy', created with workers=15.
snake = Spider('proxy', workers=15)
# NOTE(review): presumably restricts crawling to this host - confirm against
# the requests_spider documentation.
snake.domains = ['www.xicidaili.com']
# Seed requests for listing pages 1 through 9, each handled by Proxy.
snake.init_requests = [
    Request(url='http://www.xicidaili.com/nn/{}'.format(x), callback=Proxy) for x in range(1, 10)
]

# NOTE(review): presumably the cap on concurrent requests - confirm.
snake.async_limit = 5

# Start the spider only when this file is executed as a script.
if __name__ == '__main__':
    snake.run()
Exemplo n.º 17
0
                                                self['week_recommend'])
        self['img'] = 'https:' + self['img'][:-1]
        data = self.json()
        print(data)
        db['qidian'].update_one({'id': self['id']},
                                {"$set": dict(self.json())}, True)
        yield Request(score_url.format(spider.cookies.get('_csrfToken'),
                                       self['id']),
                      callback=BookScore,
                      meta={'bookid': self['id']})


# Spider named 'one', created with workers=4.
spider = Spider(name='one', workers=4)
# NOTE(review): presumably the hosts the spider may visit - confirm against
# the requests_spider documentation.
spider.domains = ['book.qidian.com', 'www.qidian.com']
# range(1, 2) produces only page 1, so exactly one seed request is built.
# It has no callback of its own.
spider.init_requests = [
    Request(rank_url.format(1, page)) for page in range(1, 2)
]

# XPath rule selecting the book cover links; matches use the BookInfo callback.
spider.rules = [
    XRule(rule='//div[@class="book-img-box"]/a/@href', callback=BookInfo)
]


@spider.Middleware('request')
async def sleep(request):
    """Request middleware that returns ``request`` unchanged."""
    return request


@spider.Middleware('response')
async def set_error_code(response):