def parse_water_fall(self, response):
    # prepare data
    data = response.json()
    uid, folder = response.meta['uid'], response.meta['folder']
    # continue to next page
    cursor = data['next_cursor']
    yield scrapy.Request(api.get_water_fall(uid, cursor),
                         callback=self.parse_water_fall,
                         meta=response.meta)
    # yield all videos
    for video in data['list']:
        video, mid = video['page_info'], video['mid']
        video_type = video['object_type']
        match video_type:
            case 'video':
                urls = [video['media_info'][key] for key in self.video_keys]
                url = urls[0] if urls else ''
                self.logger.info(f'{folder} found 1 video (from {response.url})')
                yield WeiboItem(uuid=mid,
                                filename=f'{folder}/{mid}.mp4',
                                file_urls=[url])
            case 'story':
                for i, slide in enumerate(video['slide_cover']['slide_videos']):
                    url = slide['url']
                    yield WeiboItem(uuid=f'{mid}_{i}',
                                    filename=f'{folder}/{mid}_{i}.mp4',
                                    file_urls=[url])
            case _:
                self.logger.warning('Unknown video type "%s".', video_type)
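# None of the snippets in this file include the WeiboItem definition they
# construct, and the field sets differ from project to project. A minimal,
# hypothetical sketch matching the fields parse_water_fall above assumes
# (file_urls/files follow the conventions of Scrapy's built-in FilesPipeline):
import scrapy

class WeiboItem(scrapy.Item):
    uuid = scrapy.Field()
    filename = scrapy.Field()
    file_urls = scrapy.Field()  # read by scrapy.pipelines.files.FilesPipeline
    files = scrapy.Field()      # filled in by the pipeline after download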
def parse_img(self, response):
    item = WeiboItem()
    item['img_url'] = response.url
    item['img_id'] = response.meta['img_id']
    # response.url[-4:] assumes a three-character extension such as '.jpg'
    item['img_path'] = response.meta['dir'] + item['img_id'] + response.url[-4:]
    yield item
def parse_weibo(self, response):
    print(response.xpath('/html/head/title/text()').extract()[0])
    for p in response.xpath('//div[contains(@id,"M")]'):
        item = WeiboItem()  # a fresh item per post; reusing one instance mutates already-yielded items
        item['content'] = p.xpath('./div/span/text()').extract()
        yield item
    yield Request("http://weibo.cn" + response.xpath(
        '//*[@id="pagelist"]/form/div/a/@href').extract()[0],
        cookies=self.cookies,
        callback=self.parse_weibo)
def parse_user(self, response):
    user_item = response.meta['key']
    user_item['weibo_count'] = response.xpath(
        '//span[@class="tc"]/text()').re_first(r'\d+')
    user_item['following_count'] = response.xpath(
        '//a[contains(@href, "follow")]/text()').re_first(r'\d+')
    user_item['follower_count'] = response.xpath(
        '//a[contains(@href, "fans")]/text()').re_first(r'\d+')
    weibos = response.xpath('//div[contains(@id, "M")]')
    for weibo in weibos:
        weibo_item = WeiboItem()
        weibo_item['name'] = user_item['name']
        weibo_item['id'] = weibo.xpath('./@id').extract_first()
        weibo_item['text'] = weibo.xpath(
            './/span[@class="ctt"]/text()').extract_first()
        weibo_item['pictures'] = weibo.xpath(
            './/img[@alt="图片"]/@src').extract()
        weibo_item['videos'] = weibo.xpath(
            './/a[contains(text(), "视频")]/@href').extract_first()
        weibo_item['likes_count'] = weibo.xpath(
            './/a[contains(text(), "赞")]/text()').re_first(r'\d+')
        weibo_item['reposts_count'] = weibo.xpath(
            './/a[contains(text(), "转发")]/text()').re_first(r'\d+')
        weibo_item['comments_count'] = weibo.xpath(
            './/a[contains(text(), "评论")]/text()').re_first(r'\d+')
        yield weibo_item
    yield user_item
def get_user(self, response):
    if response.status == 200:
        data_script = response.xpath('//script/text()').extract()
        for s in data_script:
            if 'FM.view({"ns":"pl.content.followTab.index"' in s:
                selector = js2xml_unescape(s)
                nodes = selector.xpath('//li[@class="follow_item S_line2"]')
                for n in nodes:
                    content = etree.tostring(n, pretty_print=True, encoding='utf8')
                    node = etree.HTML(unescape(content.decode()))
                    username = node.xpath('//li[@class="follow_item S_line2"]//dt/a/@title')[0]
                    href = node.xpath('//li[@class="follow_item S_line2"]//dt/a/@href')[0]
                    follow_c = node.xpath('//span[1]/em[@class="count"]/a/text()')
                    fans_c = node.xpath('//span[2]/em[@class="count"]/a/text()')
                    article_c = node.xpath('//span[3]/em[@class="count"]/a/text()')
                    if follow_c and fans_c and article_c:
                        # Some official accounts hide follower/following data for
                        # privacy and are not collection targets, so they are
                        # excluded outright, e.g. the "同性恋婚姻合法化" topic page:
                        # https://weibo.com/p/1008089993f2f8dc0d7a92bc4a28748d6e8fd0/super_index
                        if int(fans_c[0]) < 3000 and int(follow_c[0]) < 1000 and int(article_c[0]) > 1:
                            # fewer than 3000 fans, fewer than 1000 followings, more than one post
                            item = WeiboItem()
                            item["username"] = username
                            item['href'] = urljoin('https://weibo.com', href)
                            item['md5_index'] = get_md5(item['href'])
                            item['done'] = False
                            yield item
                next_page_url = self._get_next_page(selector)
                if next_page_url:
                    yield scrapy.Request(url=next_page_url, callback=self.get_user)
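# get_md5 above is a project helper whose definition is not shown; a plausible
# minimal implementation (an assumption, not the project's actual code):
import hashlib

def get_md5(text):
    # hex digest of the UTF-8 encoded string, used as a stable index key
    return hashlib.md5(text.encode('utf-8')).hexdigest()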
def parse_weibos(self, response):
    result = json.loads(response.text)
    if result.get('ok') and result.get('data').get('cards'):
        weibos = result.get('data').get('cards')
        for weibo in weibos:
            mblog = weibo.get('mblog')
            if mblog:
                weibo_item = WeiboItem()
                # item field -> key in the mblog payload
                field_map = {
                    'id': 'id',
                    'attitudes_count': 'attitudes_count',
                    'comments_count': 'comments_count',
                    'created_at': 'created_at',
                    'reposts_count': 'reposts_count',
                    'picture': 'original_pic',
                    'pictures': 'pics',
                    'source': 'source',
                    'text': 'text',
                    'raw_text': 'raw_text',
                    'thumbnail': 'thumbnail_pic'
                }
                for field, value in field_map.items():
                    weibo_item[field] = mblog.get(value)
                weibo_item['user'] = response.meta.get('uid')
                yield weibo_item
    # next page of posts
    uid = response.meta.get('uid')
    page = response.meta.get('page') + 1
    yield Request(self.weibo_url.format(uid=uid, page=page),
                  callback=self.parse_weibos,
                  meta={'uid': uid, 'page': page})
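# parse_weibos above expects 'uid' and 'page' in response.meta. A hedged sketch
# of how such a crawl is typically seeded; self.start_users is an assumption,
# while self.weibo_url is the template the callback itself uses:
def start_requests(self):
    for uid in self.start_users:
        yield Request(self.weibo_url.format(uid=uid, page=1),
                      callback=self.parse_weibos,
                      meta={'uid': uid, 'page': 1})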
def parse(self, response):
    # store the result
    item = WeiboItem()
    resp_dict = json.loads(response.text)
    user = resp_dict['data']['user']
    item['uid'] = user['id']                           # user id
    item['screen_name'] = user['screen_name']          # user name
    item['profile_url'] = user['profile_url']          # profile page
    item['follow_count'] = user['follow_count']        # number of accounts followed
    item['followers_count'] = user['followers_count']  # number of followers
    print(user['screen_name'])
    yield item
    # current user id, taken from the request url
    uid = furl(response.url).args['uid']
    # point the containerid at this user's followers list
    self.attention.args['containerid'] = '231051_-_followers_-_{}'.format(uid)
    if uid not in self.data:
        self.data.add(uid)
        yield scrapy.Request(url=self.attention.url,
                             meta={'uid': uid},
                             callback=self.parse_follow)
def parse(self, response):
    for p in response.xpath("//div[@class='c'and @id]"):
        try:
            item = WeiboItem()  # a fresh item per post
            # keep only the CJK characters of the post body
            text_transfrom = "".join(
                p.xpath("./div/text()").re(r'[\u4e00-\u9fa5]'))
            text_fanart = "".join(
                p.xpath("./div/span[@class='ctt']/text()").extract())
            item['text_fanart'] = text_fanart
            item['text_transfrom'] = text_transfrom
            item['like'] = "".join(
                p.xpath("./div/a").re(r'赞\[[0-9]*?\]')).replace(
                    '赞[', '').replace(']', '')
            item['transmit'] = "".join(
                p.xpath("./div/a").re(r'转发\[[0-9]*?\]')).replace(
                    '转发[', '').replace(']', '')
            item['comment'] = "".join(
                p.xpath("./div/a").re(r'评论\[[0-9]*?\]')).replace(
                    '评论[', '').replace(']', '')
            # '来自' separates the timestamp from the client name
            time_from = "".join(
                p.xpath("./div/span[@class='ct']/text()").extract()).split("\xa0来自")
            item['time'] = self.clear_date(time_from[0])
            item['_from'] = time_from[1]
            yield item
        except Exception as e:
            print(e)
            continue
def parse_detail(self, response):
    item = WeiboItem()
    item['content'] = response.xpath(
        '//div[@class="c" and @id="M_"]//span[@class="ctt"]//text()'
    ).extract()
    yield item
def parse_detail(self, response):
    print('-----parse_detail------')
    id = re.search(r'comment/(.*?)\?', response.url).group(1)
    url = response.url
    content = ''.join(response.xpath(
        '//div[@id="M_"]//span[@class="ctt"]//text()').extract())
    print(id, url, content)
    comment_count = response.xpath(
        '//span[@class="pms"]//text()').re_first(r'评论\[(.*?)\]')
    forward_count = response.xpath(
        '//a[contains(., "转发[")]//text()').re_first(r'转发\[(.*?)\]')
    like_count = response.xpath(
        '//a[contains(., "赞[")]//text()').re_first(r'赞\[(.*?)\]')
    # labels: 转发 = reposts, 评论 = comments, 赞 = likes
    print('转发: {} 评论: {} 赞: {}'.format(forward_count, comment_count, like_count))
    posted_at = response.xpath(
        '//div[@id="M_"]//span[@class="ct"]//text()').extract_first(default=None)
    user = response.xpath('//div[@id="M_"]/div/a/text()').extract_first(default=None)
    print(posted_at, user)
    weibo_item = WeiboItem()
    for field in weibo_item.fields:
        try:
            # pick up the local variable with the same name as the item field
            weibo_item[field] = eval(field)
        except NameError:
            self.logger.debug('Field not defined: ' + field)
    yield weibo_item
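# An eval-free variant of the field-collection trick above: look each declared
# field up in locals() instead of eval-ing its name. A hedged sketch, assuming
# it is called at the end of parse_detail as _collect_fields(WeiboItem(), locals()):
def _collect_fields(weibo_item, local_vars):
    for field in weibo_item.fields:
        # only copy fields whose local variable was actually assigned
        if field in local_vars:
            weibo_item[field] = local_vars[field]
    return weibo_item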
def parse_weibo(self, response):
    result = json.loads(response.text)
    if result.get('data') and result.get('data').get('cards'):
        weibo_list = result.get('data').get('cards')
        # item field -> key in the mblog payload
        field_map = {
            'created_at': 'created_at',
            'id': 'id',
            # 'attitudes_count': 'attitudes_count',
        }
        for weibo in weibo_list:
            if weibo.get('mblog'):
                mblog = weibo.get('mblog')
                item = WeiboItem()  # a fresh item per post, not one shared instance
                for k, v in field_map.items():
                    item[k] = mblog[v]
                yield item
    # next page of posts
    uid = response.meta.get('uid')
    page = response.meta.get('page') + 1
    self.log(uid)
    yield scrapy.Request(url=self.weibo_url.format(uid=uid, page=page),
                         callback=self.parse_weibo,
                         meta={'page': page, 'uid': uid})
def parse(self, response):
    data = json.loads(response.text)
    if data and 'data' in data:
        pattern = re.compile(
            '<div.*?list_title_b.*?<a href="(.*?)".*?_blank">(.*?)</a>.*?subinfo S_txt2">(.*?)</span></a>.*?'
            'S_txt2">(.*?)</span>.*?praised S_ficon W_f16">ñ</em><em>(.*?)</em>.*?ficon_'
            'repeat S_ficon W_f16">.*?</em><em>(.*?)</em>.*?forward S_ficon W_f16.*?</em><em>'
            '(.*?)</em>.*?</div>', re.S)
        result = re.findall(pattern, data.get('data'))
        for info in result:
            item = WeiboItem()
            item['content'] = info[1]
            item['author'] = info[2]
            item['publishTime'] = info[3]
            item['repost'] = info[4]
            item['comment'] = info[5]
            item['approve'] = info[6]
            item['address'] = info[0]
            yield item
    if self.offset < 30:
        self.offset += 1
        url = self.base_url.format(self.offset)
        yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
    result = json.loads(response.text)
    if result.get('ok') and result.get('data').get('cards'):
        weibos = result.get('data').get('cards')
        for weibo in weibos:
            myblog = weibo.get('mblog')
            if myblog:
                item = WeiboItem()
                # item field -> key in the mblog payload
                field_map = {
                    'id': 'id',
                    'attitudes_count': 'attitudes_count',
                    'comments_count': 'comments_count',
                    'reposts_count': 'reposts_count',
                    'picture': 'original_pic',
                    # 'pictures': 'pics',
                    'created_at': 'created_at',
                    'source': 'source',
                    'text': 'text',
                    # 'raw_text': 'raw_text',
                    'thumbnail': 'thumbnail_pic',
                }
                for field, attr in field_map.items():
                    item[field] = myblog.get(attr)
                item['user'] = response.meta.get('user_info').get('lfid')
                yield item
def parse(self, response):
    page = response.meta['page'] if 'page' in response.meta else 1
    result = json.loads(response.text)
    cards = result.get("data").get("cards")
    if not cards:
        return
    for card in cards:
        for group in card.get("card_group"):
            user = group.get("user")
            if user:
                weibo = WeiboItem()
                weibo['uid'] = user['id']
                weibo['nikename'] = user['screen_name']
                weibo['faceurl'] = user['profile_image_url']
                yield weibo
    # next page of this fan list (the uid in the containerid is hard-coded)
    request = scrapy.Request(
        "https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_3641513235&since_id=%d" % page,
        callback=self.parse)
    request.meta['page'] = page + 1
    yield request
def parse(self, response):
    text_json = json.loads(response.text)
    # cursor for the next page; kept on self, so this assumes one timeline
    # is crawled sequentially
    self.since_id = text_json.get('data').get('cardlistInfo').get('since_id')
    cards = text_json.get('data').get('cards')
    for it in cards:
        it_son = it.get('mblog')
        if it_son:
            # locals rather than instance attributes: concurrent responses
            # would otherwise overwrite each other's fields
            created_at = it_son['created_at']
            source = it_son['source']
            scheme = it['scheme']
            reposts_count = it_son['reposts_count']
            comments_count = it_son['comments_count']
            attitudes_count = it_son['attitudes_count']
            # strip the HTML markup from the post body
            text = BeautifulSoup(it_son['text'], "html.parser").get_text()
            # short dates like '03-15' lack a year; 2020 is hard-coded here
            if len(created_at) < 6:
                created_at = "2020-" + created_at
            yield WeiboItem(created_at=created_at,
                            text=text,
                            source=source,
                            scheme=scheme,
                            reposts_count=reposts_count,
                            comments_count=comments_count,
                            attitudes_count=attitudes_count,
                            textLength=len(text))
    if not self.since_id:
        return
    yield scrapy.Request("%s%s" % (self.url, str(self.since_id)), callback=self.parse)
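# The parse above strips HTML from the post body via BeautifulSoup; w3lib,
# which Scrapy already depends on, can do the same job. A small illustration
# (the sample string is made up):
from w3lib.html import remove_tags

sample = '<a href="/n/someone">@someone</a> post text'
print(remove_tags(sample))  # -> '@someone post text'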
def parse(self, response):
    js = json.loads(response.text)
    weibo_items = js.get('data').get('cards')
    for weibo_item in weibo_items:
        item = WeiboItem()
        item['time'] = weibo_item.get('mblog').get('created_at')
        item['txt'] = weibo_item.get('mblog').get('text')
        yield item
def parse(self, response):
    selector = Selector(response)
    tweets = selector.xpath('body/div[@class="c" and @id]')
    for tweet in tweets:
        device = 'None'
        pubday = ''
        id = tweet.xpath('div/a/text()').extract_first()  # poster nickname
        content = tweet.xpath('div/span[@class="ctt"]')   # post body
        info = content[0].xpath('string(.)').extract_first()
        coordinates = tweet.xpath('div/a/@href').extract_first()   # location link
        like = re.findall(r'赞\[(\d+)\]', tweet.extract())          # likes
        transfer = re.findall(r'转发\[(\d+)\]', tweet.extract())    # reposts
        comment = re.findall(r'评论\[(\d+)\]', tweet.extract())     # comments
        # timestamp and client; '来自' ("from") separates the two
        others = tweet.xpath('div/span[@class="ct"]/text()').extract_first()
        print(id)
        print(info)
        if others:
            others = others.split('来自')
            pubday = others[0]
            if len(others) > 1:
                device = others[1]
        if like:
            like = int(like[0])
            print(like)
        if transfer:
            transfer = int(transfer[0])
            print(transfer)
        if comment:
            comment = int(comment[0])
            print(comment)
        item = WeiboItem()  # a fresh item per tweet
        item['nickname'] = id
        item['content'] = info
        item['pubday'] = pubday
        item['device'] = device
        item['like'] = like
        item['transfer'] = transfer
        item['comment'] = comment
        yield item
def parse_1(self, response):
    # Start a selenium-driven browser to mimic a human reader and avoid
    # anti-crawling checks.
    self.config_driver()
    print("Open driver...")
    self.driver.get(response.url)
    # Give the page time to finish loading.
    time.sleep(20)
    page = 1
    # The hot-posts feed loads new posts on scroll-down. Scroll in randomized
    # steps at randomized intervals to imitate a human reader. The page
    # occasionally fails to load fully (it happens in normal browsing too,
    # apparently a Weibo-side issue); a manual refresh helps.
    while page <= 30:
        time.sleep(max(random.random(), random.gauss(0.5, 1.0)))
        print(f"scroll-down #{page}...")
        self.driver.execute_script("window.scrollBy(0,{})".format(
            max(0, int(random.gauss(150.0, 25.0)))))
        page += 1
    source = self.driver.page_source
    # Parse the rendered HTML with BeautifulSoup.
    soup = BeautifulSoup(source, 'lxml')
    container = soup.find('div', class_='UG_contents')
    items = container.find_all('div', class_=re.compile('UG_list_*'))
    print("fetched {} records this round".format(len(items)))
    for item_des in items:
        url = '#'
        wb_from = ''
        if item_des.has_attr('href'):
            url = item_des['href']
        elif item_des.find('div', 'vid'):
            if item_des.find('div', 'vid').has_attr('href'):
                url = item_des.find('div', 'vid')['href']
        if len(url) < 2:
            continue
        item = item_des.find('h3', class_=re.compile('list_title_*'))
        emojis = []
        if item:
            item_text = item.text.strip()
            item_emojis = item.find_all('img', alt=True)
            if item_emojis:
                for item_emoji in item_emojis:
                    if item_emoji and item_emoji.has_attr('alt'):
                        emojis.append(item_emoji['alt'])
            weibo_item = WeiboItem()
            weibo_item['url'] = url
            content = remove_multi_space(item_text).strip()
            weibo_item['content'] = content + ''.join(emojis)
            sub_info = item_des.find('span', 'subinfo')
            if sub_info and sub_info.text:
                wb_from = sub_info.text.strip()
            weibo_item['wb_from'] = wb_from
            yield weibo_item
def parse(self, response):
    # the search keyword, recovered from the 'q' query parameter of the url
    catelog = parse.parse_qs(
        parse.urlparse(parse.unquote(response.url)).query)['q']
    item = WeiboItem()
    cardlist = response.xpath(
        "//div[@class='card-feed']/div[@class='content']//li/img/@src"
    ).extract()
    item['catelog'] = catelog
    item['photos'] = cardlist
    yield item
def parse_content(self, response):
    # scrapes a novel chapter page ("下一章" = "next chapter"); reuses
    # WeiboItem as the container
    item = WeiboItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//div[@class="bookname"]/h1/text()').extract_first()
    item['content'] = response.xpath(
        '//div[@id="content"]/text()').extract()
    # extracted but not followed here
    nextpage = response.xpath(
        '//div/a[contains(text(),"下一章")]/@href').extract_first()
    yield item
    print(item['title'])
def parse(self, response):
    depth = response.meta['depth']
    fail_count = response.meta['fail_count']
    responseUrl = response.url
    requestUrl = response.meta['url']
    # Compare the requested and returned urls; a mismatch means a redirect,
    # so request the page again.
    if responseUrl != requestUrl:
        # Retry at most twice, then give up.
        if fail_count <= 1:
            fail_count += 1
            yield scrapy.Request(requestUrl,
                                 callback=self.parse,
                                 method='GET',
                                 dont_filter=True,
                                 meta={'depth': depth,
                                       'url': requestUrl,
                                       'fail_count': fail_count},
                                 headers=self.headers,
                                 cookies=self.cookie)
        else:
            # Record the url that could not be fetched.
            item = FailUrlItem()
            item['fail_url'] = requestUrl
            yield item
    else:
        data = response.body.decode()
        html_etree = self.dataToHtml(data)
        focus_by = html_etree.xpath('//h1/text()')[0]
        focus_li = html_etree.xpath('//div[@class="title W_fb W_autocut "]')
        for focus in focus_li:
            item = WeiboItem()  # a fresh item per followed account
            item['focus_by'] = focus_by
            item['name'] = focus.xpath('.//a/text()')[0]
            focus_url = focus.xpath('.//a[@class="S_txt1"]/@href')
            item['url'] = 'https://weibo.com' + focus_url[0].split('?')[0]
            item['fans_num'] = 10000000  # hard-coded placeholder
            url = item['url'] + '/follow?from=page_100306&wvr=6&mod=headfollow'
            yield item
            yield scrapy.Request(url,
                                 callback=self.parse_focus_list,
                                 method='GET',
                                 dont_filter=True,
                                 meta={'depth': depth,
                                       'url': url,
                                       'fail_count': fail_count},
                                 headers=self.headers,
                                 cookies=self.cookie)
def getItem(self, card):
    for card_group in card:
        item = WeiboItem()
        desc1 = card_group.get("desc1")
        desc2 = card_group.get("desc2")
        id = card_group.get('user').get("id")
        name = card_group.get('user').get("screen_name")
        item['name'] = name
        item['weiboid'] = id
        item["desc1"] = desc1
        item["desc2"] = desc2
        # returns on the first iteration, so only the first entry is used
        return item
def parse4(self, response):
    data = json.loads(response.text)
    # the payload may be empty ("暂无数据" / no data yet)
    for entry in data['data']['data']:
        item = WeiboItem()
        item["userid"] = entry['user']['id']
        item["username"] = entry['user']['screen_name']
        yield item
def item_parse(self, response):
    imgs = response.xpath("//img/@src").extract()
    for img_url in imgs:
        myitem = WeiboItem()  # one item instance per image
        myitem['image_urls'] = [img_url]  # ImagesPipeline expects a list of urls
        yield myitem
    next_link = response.xpath(
        "//div[@class='NewPages']/ul//a/@href").extract()
    if next_link[-1] != '#':
        yield scrapy.Request(
            "http://www.umei.cc/meinvtupian/meinvxiezhen/" + next_link[-1],
            callback=self.item_parse)
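# The 'image_urls' field above follows the convention of Scrapy's built-in
# ImagesPipeline. For downloads to actually happen, the project settings must
# enable the pipeline and set a storage path; a minimal sketch (the directory
# value is an assumption):
# settings.py
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = './images'  # any writable directory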
def parse2(self, response):
    data = json.loads(response.text)
    for entry in data['data']['data']:
        item = WeiboItem()
        item["userid"] = entry['user']['id']
        item["username"] = entry['user']['screen_name']
        self.count += 1
        print(self.count)
        # note: time.sleep blocks Scrapy's event loop; the DOWNLOAD_DELAY
        # setting is the usual way to throttle
        time.sleep(1)
        yield item
def parse_person(self, response):
    """Parse a user's profile information."""
    jsonObj = json.loads(response.text)
    if jsonObj.get('msg'):
        # The API returned an error message: request this url again.
        # dont_filter is needed, or the duplicate filter drops the retry.
        yield Request(url=response.url,
                      headers=self.headers,
                      dont_filter=True,
                      callback=self.parse_person)
        return
    item = WeiboItem()
    # cut the uid out of response.url
    uid = response.url.split('_')[0].split('=')[1][6:]
    item['uid'] = uid
    # walk the cards, picking out the fields of interest
    cards = jsonObj.get('cards') or []
    for something in cards:
        card_group = something.get('card_group') or []
        for card in card_group:
            item_name = card.get('item_name')
            if item_name == '昵称':        # nickname
                item['name'] = card.get('item_content')
            elif item_name == '性别':      # gender
                item['gender'] = card.get('item_content')
            elif item_name == '所在地':    # location
                item['location'] = card.get('item_content')
            elif item_name == '简介':      # bio
                item['intro'] = card.get('item_content')
            elif item_name == '注册时间':  # sign-up date
                item['signin'] = card.get('item_content')
            elif item_name == '学校':      # school
                item['school'] = card.get('item_content')
            elif item_name == '公司':      # company
                item['company'] = card.get('item_content')
    yield item
    # fetch this user's followers
    yield Request(url='{0}?containerid=100505{1}_-_FOLLOWERS'.format(self.getSecond, uid),
                  headers=self.headers,
                  callback=self.parse_followers)
def start_requests(self):
    """
    Starting from one or more seed weibo ids, crawl recursively along the
    repost relation, passing each post's own id to its reposts as the
    parent id.
    """
    for weibo_id in self.start_weibo_id:
        self.weibo_id_set.add(weibo_id)
        item = WeiboItem()
        item['weibo_id'] = weibo_id
        item['father_weibo_id'] = 'ROOT'
        yield scrapy.Request(url=self.weibo_template % (weibo_id, 1),
                             cookies=self.get_random_cookies(),
                             meta={'item': item},
                             callback=self.parse_content)
def parse_info(self, response):
    user = WeiboItem()
    # label on the profile page -> item field
    info_mapping = {
        '昵称': 'screen_name',  # nickname
        '性别': 'gender',       # gender
        '所在地': 'location',   # location
        '生日': 'birthday'      # birthday
    }
    user['uid'] = response.meta['uid']
    for info in response.xpath('//div[@class="item-info-page"]'):
        key = info.xpath('span/text()').extract_first()
        if key in info_mapping:
            user[info_mapping[key]] = info.xpath('p/text()').extract_first()
    return user
def parse(self, response):
    item = WeiboItem()
    conts = []
    datas = response.xpath('//div[@class="c"]/div/span[@class="ctt"]')
    for data in datas:
        conts.append(data.xpath('string(.)').extract())
    item['content'] = conts
    item['times'] = response.xpath('//span[@class="ct"]/text()').extract()
    item['likes'] = response.xpath(
        '//div[@class="c"]/div[2]/a[3]/text()').extract()
    item['comments'] = response.xpath(
        '//div[@class="c"]/div/a[@class="cc"]/text()').extract()
    item['transfer'] = response.xpath(
        '//div[@class="c"]/div[2]/a[4]/text()').extract()
    yield item
def parse_search(self, response):
    res = json.loads(response.text)
    if str(res['ok']) == '1':
        for key in res['data']['cards'][0]['card_group']:
            item = WeiboItem()
            # weibo info
            item['weibo_id'] = key['mblog']['id']
            item['weibo_content'] = self.filt_re.sub('', key['mblog']['text'])
            item['weibo_repost'] = key['mblog']['reposts_count']
            item['weibo_comment'] = key['mblog']['comments_count']
            item['weibo_like'] = key['mblog']['attitudes_count']
            timestr = str(key['mblog']['created_at'])
            ctime = self.get_ctime(timestr)
            item['weibo_ctime'] = ctime
            url_pos = key['scheme'].index("?mblogid")
            item['weibo_url'] = key['scheme'][0:url_pos]
            # user info
            item['user_id'] = str(key['mblog']['user']['id'])
            item['user_name'] = key['mblog']['user']['screen_name']
            item['user_weibo'] = key['mblog']['user']['statuses_count']
            item['user_follow'] = key['mblog']['user']['follow_count']
            item['user_fan'] = key['mblog']['user']['followers_count']
            item['user_verified'] = key['mblog']['user']['verified']
            user_url = key['mblog']['user']['profile_url']
            uurl_pos = user_url.index('?uid=')
            item['user_url'] = user_url[0:uurl_pos]
            item['mediaName'] = self.mediaName
            item['keyword'] = response.meta['keyword']
            # an empty post or a hit on the filter list aborts the whole page
            if item['weibo_content'].strip() == '':
                return
            for word in self.filterword:
                if item['weibo_content'].find(word) != -1:
                    return
            yield item
    else:
        if 'msg' in res.keys():
            print(res['msg'])
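# A minimal, hypothetical skeleton showing how a callback like parse_search is
# wired into a spider. The spider name, keyword, and containerid format are
# assumptions for illustration, not values taken from the snippets above:
from urllib.parse import quote

import scrapy

class WeiboSearchSpider(scrapy.Spider):
    name = 'weibo_search'

    def start_requests(self):
        keyword = 'example'  # placeholder search term
        containerid = quote('100103type=1&q=' + keyword, safe='')
        url = 'https://m.weibo.cn/api/container/getIndex?containerid=' + containerid
        # parse_search is the callback defined above
        yield scrapy.Request(url, callback=self.parse_search,
                             meta={'keyword': keyword})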