Example #1
File: run.py  Project: yangshimin/spider
    def __init__(self):
        self.url = 'https://passport.weibo.cn/sso/login'
        self.s = requests.Session()
        self.headers = HEADERS
        self.data = DATA
        self.followers_url = 'https://m.weibo.cn/api/container/getSecond?containerid={containerid}_-_FOLLOWERS&page={page}'
        self.weibo_url = 'https://m.weibo.cn/api/container/getIndex?containerid={containerid}_-_WEIBO_SECOND_PROFILE_WEIBO&page_type=03&page={page}'
        self.mongoclient = MongoClient()
        self.redisclient = RedisClient()
Example #2
    def __init__(self):
        self.s = requests.Session()
        self.s.headers.update(headers)
        self.s.proxies = proxies
        self.offset = 0
        self._db = MongoClient()
        # The first request parameter (the payload that gets encrypted)
        self.d = {"rid": None, "offset": 0, "total": "true", "limit": "20", "csrf_token": ""}

        # Store each playlist's URL; Redis would be a better choice for this
        self.seen = set()
Example #3
def main():
    s = Scheduler()
    print('Program is starting...')
    # s.run('1266321801')
    db = MongoClient()
    while True:
        user = db.find_one_flag()
        if user:
            s.run(user['user_id'])
        else:
            print('All users have been crawled')
            break
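The MongoClient used here is a project wrapper, not pymongo.MongoClient itself, and its methods (find_one_flag above, plus find/save/save_first in the Scheduler example below) are not shown in the source. A minimal sketch of what they might look like with pymongo, assuming each user document carries a boolean flag field marking whether it has been crawled; the database and collection names are placeholders:

import pymongo

class MongoClient(object):
    def __init__(self, host='localhost', port=27017):
        # Placeholder database/collection names; the real project defines its own.
        self.users = pymongo.MongoClient(host, port)['weibo']['users']

    def find_one_flag(self):
        # Return one user document that has not been crawled yet (flag == False), or None.
        return self.users.find_one({'flag': False})

    def find(self, user_id):
        return self.users.find_one({'user_id': user_id})

    def save(self, data):
        self.users.update_one({'user_id': data['user_id']}, {'$set': data}, upsert=True)

    def save_first(self, data):
        # Insert a newly discovered user only if it is not already known.
        self.users.update_one({'user_id': data['user_id']}, {'$setOnInsert': data}, upsert=True)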
Example #4
    def __init__(self):
        self.redisdb = RedisClient()
        self.mongodb = MongoClient()
        self.crawl_urls = []
        self.use_proxy = False
        if CRAWL_MODE == 'proxy':
            self.use_proxy = True

        self.start_urls = [
            'https://www.douban.com/group/explore',
            'https://www.douban.com/group/explore/culture',
            'https://www.douban.com/group/explore/travel',
            'https://www.douban.com/group/explore/ent',
            'https://www.douban.com/group/explore/fashion',
            'https://www.douban.com/group/explore/life',
            'https://www.douban.com/group/explore/tech'
        ]
Example #5
File: run.py  Project: yangshimin/spider
class WeiBoSpider(object):
    def __init__(self):
        self.url = 'https://passport.weibo.cn/sso/login'
        self.s = requests.Session()
        self.headers = HEADERS
        self.data = DATA
        self.followers_url = 'https://m.weibo.cn/api/container/getSecond?containerid={containerid}_-_FOLLOWERS&page={page}'
        self.weibo_url = 'https://m.weibo.cn/api/container/getIndex?containerid={containerid}_-_WEIBO_SECOND_PROFILE_WEIBO&page_type=03&page={page}'
        self.mongoclient = MongoClient()
        self.redisclient = RedisClient()

    def get_uid(self):
        response = self.s.post(self.url, headers=self.headers, data=self.data)
        if response.status_code != 200:
            print('Error requesting page:', response.status_code)
            return
        response.encoding = chardet.detect(response.content)['encoding']
        uid = response.json()['data']['uid']
        self.followers_containerid = '100505' + uid
        self.weibo_containerid = '230413' + uid

    def make_followers_kw(self, followers_page=1):
        followers_kw = (self.followers_containerid, followers_page)
        self.get_followers_page(followers_kw)

    def make_weibo_kw(self, weibo_page=1):
        weibo_kw = (self.weibo_containerid, weibo_page)
        self.get_followers_page(weibo_kw)

    def get_followers_page(self, kw):
        if kw[0].startswith('100505'):
            url = self.followers_url.format(containerid=kw[0], page=kw[1])
            info_list = self.s.get(url).json()
            self.parse_followers_info(info_list)
        else:
            url = self.weibo_url.format(containerid=kw[0], page=kw[1])
            info_list = self.s.get(url).json()
            self.parse_weibo_info(info_list)

    def parse_followers_info(self, followers_list):
        res = {}
        current_page = followers_list['cardlistInfo']['page']
        res['flag'] = followers_list['title']
        maxpage = followers_list['maxPage'] + 1
        followers = followers_list['cards']
        for follower in followers:
            user = follower['user']
            res['name'] = user['screen_name']
            res['id'] = user['id']
            res['profile_url'] = user['profile_url']
            res['weibo_count'] = user['statuses_count']
            res['verify_info'] = user.get('verified_reason')
            res['description'] = user['description']
            res['gender'] = '男' if user['gender'] == 'm' else '女'
            res['followers_count'] = user['followers_count']
            res['follow_count'] = user['follow_count']

            print(res)
            self.mongoclient.save_to_mongo(res)

        if current_page < maxpage:
            self.make_followers_kw(followers_page=current_page)

    def parse_weibo_info(self, weibo_list):
        res = {}
        current_page = weibo_list['cardlistInfo']['page']
        blog_total = weibo_list['cardlistInfo']['total']
        print(current_page)
        blog_list = weibo_list['cards']
        if current_page == 2:
            blogs = blog_list[1:]
        else:
            blogs = blog_list
        for blog in blogs:
            blog_info = blog['mblog']
            res['blog_content_url'] = blog['scheme']
            # Handles both "repost with comment" and "repost only" cases
            res['blog_title'] = blog_info['retweeted_status']['page_info'][
                'content1'] if blog_info.get('raw_text') and blog_info[
                    'retweeted_status'].get('page_info') else blog_info['text']
            res['is_privacy'] = blog_info['title']['text']
            res['attitudes_count'] = blog_info['attitudes_count']
            res['create_time'] = blog_info['created_at']
            res['comments_count'] = blog_info['comments_count']
            res['reads_count'] = blog_info['reads_count']
            res['source'] = blog_info['source']
            res['location'] = blog_info.get('page_info').get(
                'page_title') if blog_info.get('page_info') else None

            print(res)
            self.mongoclient.save_to_mongo(res)

        if current_page and (current_page - 1) * 10 < blog_total:
            self.make_weibo_kw(weibo_page=current_page)
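A minimal sketch of how this spider might be driven, assuming HEADERS and DATA in the project's config hold valid login headers and form data (they are not shown in the source):

if __name__ == '__main__':
    spider = WeiBoSpider()
    spider.get_uid()            # log in and derive the followers/weibo containerids
    spider.make_followers_kw()  # crawl the followers list starting from page 1
    spider.make_weibo_kw()      # crawl the user's weibo posts starting from page 1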
Example #6
class wangyimusic_comment(object):
    def __init__(self):
        self.s = requests.Session()
        self.s.headers.update(headers)
        self.s.proxies = proxies
        self.offset = 0
        self._db = MongoClient()
        # The first request parameter (the payload that gets encrypted)
        self.d = {"rid": None, "offset": 0, "total": "true", "limit": "20", "csrf_token": ""}

        # Store each playlist's URL; Redis would be a better choice for this
        self.seen = set()

    def get_params(self):
        iv = '0102030405060708'
        # first_encText = generatestring()
        h_encText = self.aes_encrypt(str(self.d), g, iv)
        h_encText = self.aes_encrypt(h_encText, 'FFFFFFFFFFFFFFFF', iv)
        return h_encText


    # The only variable in the params is the 16-character random string; since we replace it
    # with sixteen 'F's, every call yields the same result. The value below was obtained by
    # replaying the request in the browser console.
    def get_encSecKey(self):
        encSecKey = '257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c'
        return encSecKey


    # The key must be 16 (AES-128), 24 (AES-192), or 32 (AES-256) bytes long; AES-128 is enough here
    def aes_encrypt(self, text, key, iv):
        encryptor = AES.new(key, AES.MODE_CBC, iv)
        # The plaintext must be a multiple of 16 bytes, so pad it up to the next multiple of 16.
        miss_length = 16 - len(text) % 16
        text = text + miss_length * chr(miss_length)
        encrypt_text = encryptor.encrypt(text)
        # Base64-encode the raw ciphertext so it can be sent as a text form field.
        encrypt_text = base64.b64encode(encrypt_text)
        return encrypt_text.decode('utf-8')

    def get_list_page(self, url):

        response = self.s.get(url)
        if response.status_code == 200:
            return response.text
        else:
            print('Error requesting url:', response.status_code)

    def parse_list(self, response):
        page = etree.HTML(response)
        links = page.xpath(".//*[@id='m-pl-container']/li/p[@class='dec']/a/@href")
        for link in links:
            base_songsheet_url = base_list_url + link
            self.add_songsheet_url(base_songsheet_url)
            print(base_songsheet_url)
        next_page = base_list_url + page.xpath(".//*/div[@class='u-page']/a[text()='下一页']/@href")[0]
        if not next_page.endswith('javascript:void(0)'):
            # print("*********************************")
            # print(next_page)
            self.parse_list(self.get_list_page(next_page))

    def get_song(self, url):
        response = self.s.get(url)
        if response.status_code == 200:
            page = etree.HTML(response.text)
            song_sheet = page.xpath(".//*/ul[@class='f-hide']/li/a/@href")
            # self.song_sheet_name = page.xpath('.//*/title/text()')[0]
            for song_url in song_sheet:
                self.item = {}
                self.comment_url = base_list_url + song_url
                self.song_name = self.get_song_name(self.comment_url)
                song_id = self.comment_url.split('=')[1]
                self.d["rid"] = "R_SO_4_" + str(song_id)
                # Reset the paging offset for each new song.
                self.offset = 0
                self.d['offset'] = 0
                # get_comment() already parses and stores the first page of comments.
                comment_dict = self.get_comment(song_id)

                total = comment_dict['total']
                print('first page fetched, total comments:', total)
                while self.offset + 20 < total:
                    self.offset += 20
                    self.d['offset'] = self.offset
                    self.get_comment(song_id)
        else:
            print('Error requesting {url}: {code}'.format(url=url, code=response.status_code))

    def get_comment(self, id):
        response = self.s.post(song_info_url.format(id=id), data={
            'params': self.get_params(),
            'encSecKey': self.get_encSecKey()
        }, cookies={'Cookie': 'appver=1.5.0.75771;'})
        res_json = json.loads(response.text)
        self.parse_comment(res_json, self.item)
        return res_json

    def parse_comment(self, comment_dict, item):
        for comment in comment_dict['comments']:
            item['commentId'] = comment['commentId']
            item['nickname'] = comment['user']['nickname']
            item['content'] = comment['content']
            item['time'] = self.timestamp_to_time(comment['time'])
            item['likedCount'] = comment['likedCount']
            item['song_url'] = self.comment_url

            self._db.save_to_mongo(self.song_name, self.item)
            print(item)
            # item = {}

            # self.parse_comment(comment_dict, self.item)
            # self._db.save_to_mongo(self.song_sheet_name, self.item)

    def timestamp_to_time(self, timestamp):
        time_length = len(str(timestamp))
        if time_length == 13:
            timearr = time.localtime(timestamp // 1000)
        else:
            timearr = time.localtime(timestamp)
        return time.strftime('%Y-%m-%d %H:%M', timearr)

    def add_songsheet_url(self, url):
        if url not in self.seen:
            self.seen.add(url)

    def get_song_name(self, song_url):
        response = self.s.get(song_url)
        page = etree.HTML(response.text)
        song_name = page.xpath('.//*/title/text()')[0].split('-')[0]
        return song_name
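For reference, a self-contained sketch of the same AES-128-CBC + padding + Base64 step working on bytes, as Python 3 and pycryptodome require; the site key below is only a placeholder for the module-level constant g that get_params() uses:

import base64
from Crypto.Cipher import AES  # pip install pycryptodome

def aes_encrypt(text, key, iv='0102030405060708'):
    data = text.encode('utf-8')
    # Pad the plaintext to a multiple of 16 bytes (PKCS#7 style).
    pad = 16 - len(data) % 16
    data += bytes([pad]) * pad
    cipher = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv.encode('utf-8'))
    # Base64-encode the raw ciphertext so it can travel as a form field.
    return base64.b64encode(cipher.encrypt(data)).decode('utf-8')

# Double encryption as in get_params(): first with the site key, then with sixteen 'F's.
site_key = '0123456789abcdef'  # placeholder for the constant g
params = aes_encrypt(aes_encrypt('{"rid": "", "offset": 0}', site_key), 'FFFFFFFFFFFFFFFF')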
Example #7
    def __init__(self):
        self.download = Download()
        self.db = MongoClient()
        self.user_url_list = []
        self.threads = []
Example #8
class Scheduler(object):
    def __init__(self):
        self.download = Download()
        self.db = MongoClient()
        self.user_url_list = []
        self.threads = []

    def run(self, user_id=config.START_ID):
        self.user_start(user_id)

    def user_start(self, user_id):
        user_id = int(user_id)
        results = self.db.find(user_id)
        if (results and results['flag'] == False) or not results:
            index_data = self.get_user_index(user_id)
            if index_data:
                self.get_user_info(user_id)
                self.get_fans(user_id, index_data['user'])
                self.get_followers(user_id, index_data['user'])
            else:
                data = {'user_id': user_id, 'flag': 'Error'}
                self.db.save(data)
        else:
            print(results['user'], 'has already been crawled')

    def get_user_index(self, user_id):
        user_index = 'https://m.weibo.cn/api/container/getIndex?containerid=100505{user_id}'
        url = user_index.format(user_id=user_id)
        response = self.download.get_html(url)
        if response:
            try:
                res_json = json.loads(response)
                if 'userInfo' in res_json.keys():
                    user = res_json['userInfo']['screen_name']
                    user_id = res_json['userInfo']['id']
                    user_url = res_json['userInfo']['profile_url']
                    fans = res_json['userInfo']['followers_count']
                    followers = res_json['userInfo']['follow_count']
                    time = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    data = {
                        'user': user,
                        'user_id': user_id,
                        'user_url': user_url,
                        'fans': fans,
                        'followers': followers,
                        'time': time,
                        'flag': True
                    }
                    print('Crawling ' + user + ', ID: ' + str(user_id))
                    self.db.save(data)
                    return data
            except:
                print('JSON parsing error')
                return None

    def get_user_info(self, user_id):
        user_info = 'https://m.weibo.cn/api/container/getIndex?containerid=230283{user_id}_-_INFO'
        url = user_info.format(user_id=user_id)
        response = self.download.get_html(url)
        if response:
            # pattern = re.compile( r'{"card_type":41,"item_name":"\u6027\u522b","item_content":"(.*?)"}.*?{"card_type":41,"item_name":"\u6240\u5728\u5730","item_content":"(.*?)"}.*?{"card_type":41,"item_name":"\u7b80\u4ecb","item_content":"(.*?)"}.*?{"card_type":41,"item_name":"\u7b49\u7ea7".*?"item_content":"(.*?)".*?{"card_type":41,"item_name":"\u9633\u5149\u4fe1\u7528","item_content":"(.*?)".*?{"card_type":41,"item_name":"\u6ce8\u518c\u65f6\u95f4","item_content":"(.*?)"}',re.S)
            # results = re.search(pattern,response)
            # if results:
            #     sex = results.group(1)
            #     location = results.group(2)
            #     jianjie = results.group(3)
            #     level = results.group(4)
            #     credit = results.group(5)
            #     reg_time = results.group(6)
            sex = ''
            location = ''
            jianjie = ''
            level = ''
            credit = ''
            reg_time = ''

            sex_pattern = re.compile(
                r'{"card_type":41,"item_name":"\\u6027\\u522b","item_content":"(.*?)"}',
                re.S)
            location_pattern = re.compile(
                r'{"card_type":41,"item_name":"\\u6240\\u5728\\u5730","item_content":"(.*?)"}',
                re.S)
            # jianjie_pattern = re.compile(r'{"card_type":41,"item_name":"\\u7b80\\u4ecb","item_content":"(.*?)"}',re.S)
            level_pattern = re.compile(
                r'{"card_type":41,"item_name":"\\u7b49\\u7ea7".*?"item_content":"(.*?)"',
                re.S)
            credit_pattern = re.compile(
                r'{"card_type":41,"item_name":"\\u9633\\u5149\\u4fe1\\u7528","item_content":"(.*?)"',
                re.S)
            reg_time_pattern = re.compile(
                r'{"card_type":41,"item_name":"\\u6ce8\\u518c\\u65f6\\u95f4","item_content":"(.*?)"}',
                re.S)

            sex_res = re.search(sex_pattern, response)
            if sex_res:
                sex = sex_res.group(1).encode('utf8').decode('unicode_escape')

            location_res = re.search(location_pattern, response)
            if location_res:
                location = location_res.group(1).encode('utf8').decode(
                    'unicode_escape')

            # jianjie_res = re.search(jianjie_pattern,response)
            # if jianjie_res:
            #     jianjie = jianjie_res.group(1).encode('utf8').decode('unicode_escape')

            level_res = re.search(level_pattern, response)
            if level_res:
                level = level_res.group(1).encode('utf8').decode(
                    'unicode_escape')

            credit_res = re.search(credit_pattern, response)
            if credit_res:
                credit = credit_res.group(1).encode('utf8').decode(
                    'unicode_escape')

            reg_time_res = re.search(reg_time_pattern, response)
            if reg_time_res:
                reg_time = reg_time_res.group(1).encode('utf8').decode(
                    'unicode_escape')

            data = {
                'user_id': user_id,
                'sex': sex,
                'location': location,
                # 'jianjie':jianjie,
                'level': level,
                'credit': credit,
                'reg_time': reg_time
            }
            self.db.save(data)

    def get_fans(self, user_id, user_name):
        fans = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{user_id}&since_id={since_id}'
        for sid in range(1, 251):
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            print('Crawling fans of ' + user_name + ', page ' + str(sid))
            sleep(0.5)
            url = fans.format(user_id=user_id, since_id=sid)
            print(url)
            response = self.download.get_html(url)
            if response:
                try:
                    res_json = json.loads(response)
                    if 'cards' in res_json.keys():
                        if res_json['cards']:
                            results = res_json['cards'][0]
                            if 'card_group' in results.keys():
                                for res in results['card_group']:
                                    if 'user' in res.keys():
                                        user = res['user']['screen_name']
                                        fans_user_id = res['user']['id']
                                        data = {
                                            'user': user,
                                            'user_id': fans_user_id,
                                            'flag': False
                                        }
                                        self.db.save_first(data)
                        else:
                            print('Crawled ' + str(sid) + ' pages of fans for ' + user_name)
                            break
                except:
                    print('JSON parsing error')

    def get_followers(self, user_id, user_name):
        followers = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{user_id}&page={page}'
        for page in range(1, 11):
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            print('Crawling the following list of ' + user_name + ', page ' + str(page))
            sleep(0.5)
            url = followers.format(user_id=user_id, page=page)
            response = self.download.get_html(url)
            if response:
                try:
                    res_json = json.loads(response)
                    if 'cards' in res_json.keys():
                        if res_json['cards']:
                            results = res_json['cards'][0]
                            if 'card_group' in results.keys():
                                for res in results['card_group']:
                                    if 'user' in res.keys():
                                        user = res['user']['screen_name']
                                        follower_user_id = res['user']['id']
                                        data = {
                                            'user': user,
                                            'user_id': follower_user_id,
                                            'flag': False
                                        }
                                        self.db.save_first(data)
                        else:
                            print('Crawled ' + str(page) + ' pages of the following list for ' + user_name)
                            break
                except:
                    print('JSON parsing error')
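The Download helper used by the Scheduler is not shown in the source. A minimal sketch of what its get_html method might look like, assumed here to be a thin requests wrapper with a retry loop and a mobile User-Agent for the m.weibo.cn API:

import requests

class Download(object):
    def __init__(self):
        # Assumed mobile User-Agent; the real project likely configures its own headers.
        self.headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X)'}

    def get_html(self, url, retries=3):
        # Return the response body as text, or None if every attempt fails.
        for _ in range(retries):
            try:
                response = requests.get(url, headers=self.headers, timeout=10)
                if response.status_code == 200:
                    return response.text
            except requests.RequestException:
                continue
        return None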
Example #9
class GroupCrawler(object):
    def __init__(self):
        self.redisdb = RedisClient()
        self.mongodb = MongoClient()
        self.crawl_urls = []
        self.use_proxy = False
        if CRAWL_MODE == 'proxy':
            self.use_proxy = True

        self.start_urls = [
            'https://www.douban.com/group/explore',
            'https://www.douban.com/group/explore/culture',
            'https://www.douban.com/group/explore/travel',
            'https://www.douban.com/group/explore/ent',
            'https://www.douban.com/group/explore/fashion',
            'https://www.douban.com/group/explore/life',
            'https://www.douban.com/group/explore/tech'
        ]

    def start_requests(self):
        for url in self.start_urls:
            self.redisdb.add_new_url(url)

    def parse_content(self, html, url):
        if not html:
            logging.info('html is empty, from %s' % url)
            self.redisdb.add_new_url(url)
            return None
        bs = BeautifulSoup(html, 'lxml')
        # Find the next URLs to crawl
        links = bs.find_all(
            'a', href=re.compile('https://www.douban.com/group/\d*-*\w*/$'))
        for l in links:
            new_url = l.attrs.get('href', '')
            if new_url and not self.redisdb.is_old_url(new_url):
                self.redisdb.add_new_url(new_url)

        # Parse the fields we want from this group page
        info = {
            'name': '',
            'gid': '',
            'members': 0,
            'created_at': '',
            'owner_name': '',
            'owner_id': ''
        }

        try:
            info['name'] = self.parse_content_name(bs)
            info['gid'] = self.parse_content_gid(bs)
            info['members'] = self.parse_content_members(bs)
            info['created_at'] = self.parse_content_createdat(bs)
            info['owner_id'], info['owner_name'] = self.parse_content_owner(bs)
            logging.info(info)
            try:
                if info['gid']:
                    self.mongodb.save_group(info)
                else:
                    self.redisdb.add_new_url(url)
            except:
                self.redisdb.add_new_url(url)
                logging.info('insert into mongodb error: %s' % info)
                traceback.print_exc()
        except:
            # new_url may be unbound here; check the page URL itself instead.
            if url not in self.start_urls:
                logging.info('parse url %s error' % url)
                self.redisdb.add_new_url(url)
                # traceback.print_exc()

    def parse_content_name(self, bs):
        try:
            return bs.select_one('#group-info > h1').string.strip().encode(
                'utf8')
        except:
            pass
        return ''

    def parse_content_gid(self, bs):
        try:
            group_members = bs.select_one(
                '#content > div.grid-16-8.clearfix > div.aside > div.mod.side-nav > p > a'
            )
            return group_members.attrs.get('href').split('/group/')[1].split(
                '/')[0]
        except:
            pass
        return ''

    def parse_content_members(self, bs):
        try:
            group_members = bs.select_one(
                '#content > div.grid-16-8.clearfix > div.aside > div.mod.side-nav > p > a'
            )
            pattern = re.compile('.*?\((\d+)\)', re.S)
            match = pattern.match(group_members.string.strip())
            return match.groups()[0].encode('utf8')
        except:
            pass
        return 0

    def parse_content_createdat(self, bs):
        try:
            created_at = ''
            for s in bs.select_one(
                    '#content > div.grid-16-8.clearfix > div.article > div.group-board p'
            ).strings:
                created_at += s.strip()

            pattern = re.compile('.*?(\d{4}-\d{2}-\d{2})', re.S)
            match = pattern.match(created_at)
            return match.groups()[0].encode('utf8')
        except:
            pass
        return ''

    def parse_content_owner(self, bs):
        try:
            owner = bs.select_one(
                '#content > div.grid-16-8.clearfix > div.article > div.group-board > p > a'
            )
            owner_id = owner.attrs.get('href').split('/people/')[1].split(
                '/')[0]
            owner_name = owner.string.strip().encode('utf8')
            return owner_id, owner_name
        except:
            pass
        return '', ''

    def crawler(self, iurl):
        url, proxies = iurl
        headers = {'Referer': 'https://www.douban.com/group/explore'}
        content = get_page(url, headers=headers, proxies=proxies)
        self.parse_content(content, url)

    def run(self):
        print('group crawler start running')
        if IS_SERVER:
            self.start_requests()
        pools = Pool(CRAWL_WORKER_THREAD_NUM)
        count = 0
        while True:
            while self.redisdb.url_len:
                if self.redisdb.is_url_lock():
                    logging.info('url pool is locked')
                    continue
                urls = self.redisdb.get_new_urls(CRAWL_WORKER_THREAD_NUM)
                self.redisdb.add_old_urls(urls)
                self.redisdb.url_unlock()

                # Default to no proxy so proxies is always defined for pools.map below.
                proxies = {}
                if CRAWL_MODE in ['proxy', 'mix']:
                    if not (CRAWL_MODE == 'mix' and count % 5 == 0):
                        proxy = self.redisdb.rand_proxy().replace(
                            'https', 'http')
                        proxies = {'https': proxy, 'http': proxy}
                pools.map(self.crawler, [(x, proxies) for x in urls])
                logging.info('waiting for next round')

                count += 1
                if count >= 1000:
                    count = 0
                sleep(CRAWL_WORKER_SLEEP)
            else:
                print('url queue len is: %s' % self.redisdb.url_len)
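The RedisClient queue methods that run() relies on are also not shown. A minimal sketch with redis-py, assuming a Redis list as the pending-URL queue and a set for already-seen URLs (the locking helpers is_url_lock/url_unlock and rand_proxy are omitted; key names are placeholders):

import redis

class RedisClient(object):
    def __init__(self, host='localhost', port=6379):
        self.r = redis.StrictRedis(host, port, decode_responses=True)

    def add_new_url(self, url):
        # Push a URL onto the tail of the pending queue.
        self.r.rpush('douban:new_urls', url)

    def get_new_urls(self, n):
        # Pop up to n URLs from the head of the queue.
        return [self.r.lpop('douban:new_urls') for _ in range(min(n, self.url_len))]

    def add_old_urls(self, urls):
        if urls:
            self.r.sadd('douban:old_urls', *urls)

    def is_old_url(self, url):
        return self.r.sismember('douban:old_urls', url)

    @property
    def url_len(self):
        return self.r.llen('douban:new_urls')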