def parse_mark(self, response):
    # Tag links live under either div[6] or div[5]; probe div[6] first and
    # fall back to div[5], joining all link texts with commas.
    mark = None
    index = 1
    result = get_select_first_str(
        response, "/html/body/div[5]/div[1]/div[6]/p/a[position()=1]/text()",
        None)
    if result is not None:
        while True:
            r = get_select_first_str(
                response, "/html/body/div[5]/div[1]/div[6]/p/a[position()=" +
                str(index) + "]/text()", None)
            if r is None:
                break
            index = index + 1
            if mark is None:
                mark = r
            else:
                mark = mark + ',' + r
    else:
        result = get_select_first_str(
            response,
            "/html/body/div[5]/div[1]/div[5]/p/a[position()=1]/text()", None)
        index = 1
        if result is not None:
            while True:
                r = get_select_first_str(
                    response, "/html/body/div[5]/div[1]/div[5]/p/a[position()="
                    + str(index) + "]/text()", None)
                if r is None:
                    break
                index = index + 1
                if mark is None:
                    mark = r
                else:
                    mark = mark + ',' + r
    return mark
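# All of the spiders in this section lean on shared helpers
# (get_select_first_str, concat_str, get_source_type, get_joke_type,
# get_news_type, get_joke_page_url_head, get_picture_page_url_head) plus
# `import re` and `import scrapy`, all defined elsewhere in the project.
# A minimal sketch of the two generic helpers, assuming they behave the way
# the call sites here use them; the real implementations may differ:
def get_select_first_str(selector, xpath, default):
    # Return the first XPath match as a string, or `default` when the
    # expression matches nothing. Works for both Response and Selector
    # arguments, since both expose .xpath().
    return selector.xpath(xpath).get(default=default)


def concat_str(label, value):
    # Assumed behavior: None-safe concatenation for the log lines below.
    return label + ('' if value is None else str(value))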
def parse(self, response):
    elements = response.xpath(
        "//div[@class='mBox']//div[@class='bd']//ul[@class='clearfix wenList']//li"
    )
    if len(elements) <= 0:
        print('============================')
        print(response.text)
        return
    for i in range(len(elements)):
        j = i + 1
        print(str(j))
        # The XPath below is absolute, so it is evaluated against the whole
        # document; query the response directly.
        head = ("//div[@class='mBox']//div[@class='bd']"
                "//ul[@class='clearfix wenList']//li[position()=" + str(j) +
                "]")
        href = get_select_first_str(response,
                                    head + "//a[position()=1]//@href", None)
        if href is not None:
            content_url = "http:" + href
            yield response.follow(content_url, callback=self.content)
    # '下一页' is the site's "next page" link text.
    next_url = get_select_first_str(
        response, "//div[@class='mBox']//div[@class='bd']"
        "//div[@class='pager']//a[@class='page' and text()='下一页']/@href",
        None)
    if next_url is not None:
        next_url = "http:" + next_url
        yield response.follow(next_url, callback=self.parse)
def parse(self, response):
    # The first request carries no meta, so fall back to deriving the joke
    # type and the page-URL prefix from the URL itself.
    joke_type = response.meta.get('type')
    if joke_type is None:
        joke_type = get_joke_type(response.url)
    page_url_head = response.meta.get('page_url_head')
    if page_url_head is None:
        page_url_head = get_joke_page_url_head(response.url)
    if page_url_head is None:
        print(response.text)
        return
    elements = response.xpath("//*[@id='main']/div/div[1]/div[2]/ul/li")
    if len(elements) <= 0:
        return
    for i in range(len(elements)):
        j = i + 1
        print(str(j))
        head = "//*[@id='main']/div/div[1]/div[2]/ul/li[position()=" + str(
            j) + "]"
        href = get_select_first_str(response, head + "/div/h2/a/@href", None)
        title = get_select_first_str(response, head + "/div/h2/a/text()",
                                     None)
        if href is not None:
            content_url = "http://www.jdjhj.com" + href
            yield response.follow(content_url,
                                  callback=self.content,
                                  meta={'type': joke_type, 'title': title})
    next_url = get_select_first_str(
        response, "//*[@id='pager']/ul/li/a[text()='下一页']/@href", None)
    if next_url is not None:
        next_url = page_url_head + "/" + next_url
        print(next_url)
        yield scrapy.Request(next_url,
                             callback=self.parse,
                             meta={'type': joke_type,
                                   'page_url_head': page_url_head})
def parse(self, response):
    # Fall back to URL-derived values when the request carries no meta.
    source_type = response.meta.get('type')
    if source_type is None:
        source_type = get_source_type(response.url)
    page_url_head = response.meta.get('page_url_head')
    if page_url_head is None:
        page_url_head = get_picture_page_url_head(response.url)
    elements = response.xpath("//*[@id='bqblist']/a")
    if len(elements) <= 0:
        return
    for i in range(len(elements)):
        j = i + 1
        print(str(j))
        head = "//*[@id='bqblist']/a[position()=" + str(j) + "]"
        href = get_select_first_str(response, head + "/@href", None)
        # title is extracted but not forwarded with the request.
        title = get_select_first_str(response, head + "/@title", None)
        if href is not None:
            content_url = "https://www.fabiaoqing.com" + href
            yield response.follow(content_url,
                                  callback=self.content,
                                  meta={'type': source_type,
                                        'group_url': content_url,
                                        'stage': 'content',
                                        'pictures': [],
                                        'has_error': 'false'})
    next_url = get_select_first_str(
        response, "//*[@id='bqblist']/div[@class='ui pagination menu']"
        "/a[contains(text(), '下一页')]/@href", None)
    if next_url is not None:
        next_url = page_url_head + next_url
        yield response.follow(next_url,
                              callback=self.parse,
                              meta={'type': source_type,
                                    'page_url_head': page_url_head,
                                    'stage': 'page'})
def parse(self, response): type = 0 if response.request.meta['type'] is None: type = get_source_type(response.url) else: type = response.request.meta['type'] elements = response.xpath("/html/body/div[5]/div[1]/ul/li") if len(elements) <= 0: return e = elements[0] for i in range(len(elements)): j = i + 1 print(str(j)) head = "/html/body/div[5]/div[1]/ul/li[position()=" + str(j) + "]" href = get_select_first_str(e, head + "//a[position()=1]//@href", None) if href is not None: content_url = href picture_urls = [] has_error = 'false' yield response.follow(content_url, callback = self.content, meta = {'type':type, 'group_url':content_url, 'stage':'content', 'picture_urls':picture_urls, 'has_error':has_error}) next_url = get_select_first_str(response, "/html/body/div[5]/div[1]/div/a[text()='下一页']/@href", None) if next_url is not None: yield response.follow(next_url, callback = self.parse, meta = {'type':type, 'stage':'page'})
def parse(self, response):
    joke_type = response.meta.get('type')
    if joke_type is None:
        joke_type = get_joke_type(response.url)
    elements = response.xpath(
        "//*[@id='main']/div/div[@class='line1' or @class='line2']")
    if len(elements) <= 0:
        return
    for i in range(len(elements)):
        j = i + 1
        print(str(j))
        head = ("//*[@id='main']/div/div[(@class='line1' or @class='line2')"
                " and position()=" + str(j) + "]")
        href = get_select_first_str(response, head + "/a/@href", None)
        title = get_select_first_str(response, head + "/a/text()", None)
        if href is not None:
            yield response.follow(href,
                                  callback=self.content,
                                  meta={'type': joke_type, 'title': title})
    next_url = get_select_first_str(
        response, "//*[@id='main']/div/div[@class='pgs cl']"
        "/div[@class='pg']/a[text()='下一页']/@href", None)
    if next_url is not None:
        print('###########################')
        print(next_url)
        yield scrapy.Request(next_url,
                             callback=self.parse,
                             meta={'type': joke_type})
def content(self, response):
    item = JokeItem()
    title = response.meta.get('title')
    if title is not None:
        item['title'] = title.strip()
    else:
        item['title'] = None
    # Read the type into its own variable; it must not overwrite `title`.
    joke_type = response.meta.get('type')
    item['type'] = joke_type
    # The body sits in a nested span when styled, otherwise in the bare <p>.
    text = get_select_first_str(
        response, "//*[@id='main']/div/div[1]/div[2]/div[1]/p/span/span",
        None)
    if text is None:
        text = get_select_first_str(
            response, "//*[@id='main']/div/div[1]/div[2]/div[1]/p", None)
    item['text'] = text
    item['crawl_origin'] = '天天搞笑网'
    item['crawl_url'] = response.url
    print(concat_str('笑话标题:', title))
    print(concat_str('内容:', text))
    return item
def parse(self, response):
    source_type = response.meta.get('type')
    if source_type is None:
        source_type = get_source_type(response.url)
    group_url = response.meta.get('group_url')
    pictures = response.meta.get('pictures')
    if pictures is None:
        pictures = []
    has_error = response.meta.get('has_error')
    if has_error is None:
        has_error = 'false'
    title = get_select_first_str(response, "//*[@id='bqb']/div[1]/h1/text()",
                                 None)
    if title is not None:
        title = title.strip()
    images = response.xpath(
        "//*[@id='bqb']//div[1]//div[1]//div//div//div[@class='bqppdiv1']")
    if len(images) == 0:
        has_error = 'true'
    else:
        # Iterate the matched nodes directly so the index cannot drift away
        # from the @class='bqppdiv1' filter used in the selection above.
        for j, image in enumerate(images, 1):
            print(str(j))
            src = get_select_first_str(image, ".//img//@data-original", None)
            description = get_select_first_str(image, ".//img//@title", None)
            pictures.append({'url': src, 'description': description})
    marks = response.xpath('//*[@id="bqb"]/div[1]/div[2]/a')
    mark = ''
    if len(marks) == 0:
        mark = None
    else:
        for i in range(len(marks)):
            j = i + 1
            mark_str = get_select_first_str(
                response, '//*[@id="bqb"]/div[1]/div[2]/a[position()=' +
                str(j) + ']/@title', None)
            if mark_str is not None:
                # '表情包' ("sticker pack") is boilerplate in every tag title.
                mark = mark + mark_str.replace("表情包", "") + ','
        if mark != '':
            mark = mark[:-1]  # drop the trailing comma
    item = PictureItem()
    item['type'] = source_type
    item['title'] = title
    item['mark'] = mark
    item['thumbs_up_times'] = None
    item['crawl_origin'] = '发表情'
    item['crawl_url'] = response.url
    item['group_url'] = group_url
    item['pictures'] = pictures
    item['has_error'] = has_error
    print(concat_str('图片类型:', source_type))
    print(concat_str('图片标题:', title))
    print(concat_str('group_url:', group_url))
    print('has_error:', has_error)
    print('pictures:')
    print(pictures)
    yield item
def content(self, response):
    source_type = response.meta.get('type')
    if source_type is None:
        source_type = get_source_type(response.url)
    group_url = response.meta.get('group_url')
    picture_urls = response.meta.get('picture_urls')
    if picture_urls is None:
        picture_urls = []
    has_error = response.meta.get('has_error')
    if has_error is None:
        has_error = 'false'
    title = get_select_first_str(
        response, "/html/body/div[5]/div[1]/div[1]/h1/text()", None)
    if title is not None:
        title = title.strip()
    mark = self.parse_mark(response)
    thumbs_up = get_select_first_str(
        response, "/html/body/div[5]/div[1]/div[4]/a[1]/i/text()", None)
    thumbs_up_times = None
    if thumbs_up is not None:
        r = re.findall(r'[0-9]\d*', thumbs_up)
        if r:
            thumbs_up_times = int(r[0])
            if '万' in thumbs_up:  # '万' means "ten thousand"
                thumbs_up_times = thumbs_up_times * 10000
    # Image lists appear under one of two layouts; try the tabbed layout
    # first, then the plain list.
    images = response.xpath("//*[@id='txtabbox']/div[2]/ul/li")
    if len(images) == 0:
        images = response.xpath("/html/body/div[5]/div[1]/ul/li")
        if len(images) == 0:
            has_error = 'true'
        else:
            for i in range(len(images)):
                j = i + 1
                print(str(j))
                head = "/html/body/div[5]/div[1]/ul/li[position()=" + str(
                    j) + "]"
                src = get_select_first_str(response,
                                           head + "/img/@data-original", None)
                picture_urls.append(src)
    else:
        for i in range(len(images)):
            j = i + 1
            print(str(j))
            head = "//*[@id='txtabbox']/div[2]/ul/li[position()=" + str(
                j) + "]"
            src = get_select_first_str(response,
                                       head + "//a//img/@data-original", None)
            picture_urls.append(src)
    item = PictureItem()
    item['type'] = source_type
    item['title'] = title
    item['mark'] = mark
    item['thumbs_up_times'] = thumbs_up_times
    item['crawl_origin'] = '微茶'
    item['crawl_url'] = response.url
    item['group_url'] = group_url
    item['picture_urls'] = picture_urls
    item['has_error'] = has_error
    print(concat_str('图片类型:', source_type))
    print(concat_str('图片标题:', title))
    print(concat_str('图片标签:', mark))
    print(concat_str('点赞数量:', str(thumbs_up_times)))
    print(concat_str('group_url:', group_url))
    print('has_error:', has_error)
    print('picture_urls:')
    print(picture_urls)
    yield item
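# The "grab the digits, multiply by 10,000 when the text contains '万'"
# pattern above recurs in several of these parsers (thumbs-up and comment
# counts). A hedged sketch of how it could be factored out; `parse_count` is
# a hypothetical name, not part of the original code:
import re


def parse_count(text):
    # "123" -> 123, "3万" -> 30000 ('万' means ten thousand),
    # None or no digits -> None. Like the inline logic, this keeps only the
    # first run of digits, so "1.5万" still parses as 10000.
    if text is None:
        return None
    digits = re.findall(r'\d+', text)
    if not digits:
        return None
    count = int(digits[0])
    if '万' in text:
        count *= 10000
    return count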
def parse(self, response): elements = response.xpath( "//div[@class='y-wrap']//div[@class='y-box container']//div[@class='y-left index-content']//div[@riot-tag='feedBox']//div[@class='feedBox']//div[@riot-tag='wcommonFeed']//div[@class='wcommonFeed']//ul//li[@ga_event='article_item_click']" ) e = elements[0] for i in range(len(elements)): item = NewsItem() j = i + 1 print(str(j)) item['type'] = get_news_type(response.url) head = "//div[@class='y-wrap']//div[@class='y-box container']//div[@class='y-left index-content']//div[@riot-tag='feedBox']//div[@class='feedBox']//div[@riot-tag='wcommonFeed']//div[@class='wcommonFeed']//ul//li[@ga_event='article_item_click' and position()=" + str( j) + "]" title = get_select_first_str( e, head + "//div[@class='item-inner y-box']//div[@class='rbox-inner']//div[@class='title-box']//a/text()", None) if title != None: item['title'] = title.strip() else: item['title'] = None media_url = get_select_first_str( e, head + "//div[@class='item-inner y-box']//a[@class='lbtn media-avatar']/@href", None) if media_url != None: media_url = 'https://www.toutiao.com' + media_url item['media_url'] = media_url else: item['media_url'] = None media_avatar_img = get_select_first_str( e, head + "//div[@class='item-inner y-box']//a[@class='lbtn media-avatar']//img/@src", None) if media_avatar_img != None: media_avatar_img = 'https:' + media_avatar_img item['media_avatar_img'] = media_avatar_img else: item['media_avatar_img'] = None media_name = get_select_first_str( e, head + "//div[@class='item-inner y-box']//a[@class='lbtn source']/text()", None) if media_name != None: media_name = media_name.replace('\xa0', '') media_name = media_name.replace('⋅', '') item['media_name'] = media_name comment = get_select_first_str( e, head + "//div[@class='item-inner y-box']//a[@class='lbtn comment']/text()", None) if comment == None: item['comment_count'] = None else: r = re.findall('[1-9]\d*', comment) if r != None and r.__len__() > 0: comment_count = int(r[0]) if '万' in comment: comment_count = comment_count * 10000 item['comment_count'] = comment_count else: item['comment_count'] = None article_img = get_select_first_str( e, head + "//div[@class='item-inner y-box']//a[@class='img-wrap']//img/@src", None) if article_img != None: article_img = 'https:' + article_img item['article_img'] = article_img else: item['article_img'] = None article_url = get_select_first_str( e, head + "//div[@class='item-inner y-box']//a[@class='link title']/@href", None) if article_url != None: article_url = 'https://www.toutiao.com' + article_url item['article_url'] = article_url else: item['article_url'] = None item['mark'] = None item['crawl_origin'] = '今日头条' item['crawl_url'] = response.url print(concat_str('文章标题:', title)) print(concat_str('源媒体:', media_url)) print(concat_str('源媒体头像:', media_avatar_img)) print(concat_str('源媒体名称:', media_name)) print(concat_str('评论数:', comment)) print(concat_str('文章图片:', article_img)) print(concat_str('文章url:', article_url)) yield item
def parse(self, response):
    source_type = response.meta.get('type')
    if source_type is None:
        source_type = get_source_type(response.url)
    group_url = response.meta.get('group_url')
    picture_urls = response.meta.get('picture_urls')
    if picture_urls is None:
        picture_urls = []
    has_error = response.meta.get('has_error')
    if has_error is None:
        has_error = 'false'
    title = get_select_first_str(
        response, "/html/body/div[3]/div[3]/div[1]/div[1]/div[1]/h1/text()",
        None)
    if title is not None:
        title = title.strip()
    # Images sit either directly under <center> or nested in child divs.
    images = response.xpath("//*[@id='content']/ul/center/img")
    if len(images) == 0:
        images = response.xpath("//*[@id='content']/ul/center//img")
        if len(images) == 0:
            has_error = 'true'
        else:
            divs = response.xpath("//*[@id='content']/ul/center//div")
            for d in range(len(divs)):
                k = d + 1  # XPath position() is 1-based
                images = response.xpath(
                    "//*[@id='content']/ul/center//div[position()=" + str(k) +
                    "]//img")
                for i in range(len(images)):
                    j = i + 1
                    print(str(j))
                    head = ("//*[@id='content']/ul/center//div[position()=" +
                            str(k) + "]//img[position()=" + str(j) + "]")
                    src = get_select_first_str(response, head + "/@src", None)
                    picture_urls.append(src)
    else:
        for i in range(len(images)):
            j = i + 1
            print(str(j))
            head = "//*[@id='content']/ul/center/img[position()=" + str(
                j) + "]"
            src = get_select_first_str(response, head + "/@src", None)
            picture_urls.append(src)
    item = PictureItem()
    item['type'] = source_type
    item['title'] = title
    item['mark'] = None
    item['thumbs_up_times'] = None
    item['crawl_origin'] = '美头网'
    item['crawl_url'] = response.url
    item['group_url'] = group_url
    item['picture_urls'] = picture_urls
    item['has_error'] = has_error
    print(concat_str('图片类型:', source_type))
    print(concat_str('图片标题:', title))
    print(concat_str('group_url:', group_url))
    print('has_error:', has_error)
    print('picture_urls:')
    print(picture_urls)
    yield item
def content(self, response):
    item = JokeItem()
    item['type'] = '1'
    head = ("//div[@class='th']//div[@class='t2']//div[@class='main']"
            "//div[@class='news_info']")
    title = get_select_first_str(
        response, head + "//div[@class='main_info_top']"
        "//div[@id='main_info_top_right']//div[@class='head_title_2']"
        "//span//a/@title", None)
    if title is not None:
        item['title'] = title.strip()
    else:
        item['title'] = None
    # Collect the tag links one by one and join them with commas.
    mark = None
    index = 1
    while True:
        r = get_select_first_str(
            response, head + "//div[@class='main_info_top']"
            "//div[@id='main_info_top_right']//div[@class='publish_info']"
            "//span//a[position()=" + str(index) + "]/text()", None)
        if r is None:
            break
        index = index + 1
        if mark is None:
            mark = r
        else:
            mark = mark + ',' + r
    item['mark'] = mark
    media_url = get_select_first_str(
        response, head + "//div[@class='main_info_top']"
        "//div[@class='main_info_top_left']//div[@id='head_photo']//a/@href",
        None)
    if media_url is not None:
        media_url = 'http://www.xiaohuabus.com' + media_url
    item['media_url'] = media_url
    media_avatar_img = get_select_first_str(
        response, head + "//div[@class='main_info_top']"
        "//div[@class='main_info_top_left']//div[@id='head_photo']"
        "//a//img/@src", None)
    if media_avatar_img is not None:
        media_avatar_img = 'http://www.xiaohuabus.com' + media_avatar_img
    item['media_avatar_img'] = media_avatar_img
    media_name = get_select_first_str(
        response, head + "//div[@class='main_info_top']"
        "//div[@id='main_info_top_right']//div[@id='head_title']"
        "//div[@class='user_info']//span[@id='yonghuming']//a/text()", None)
    if media_name is not None:
        media_name = media_name.replace('\xa0', '')
        media_name = media_name.replace('⋅', '')
    item['media_name'] = media_name
    thumbs_up = get_select_first_str(
        response, head + "//div[@class='feix']//div[@class='feix_right']"
        "//a[position()=1]//span[position()=2]/text()", None)
    thumbs_up_times = None
    if thumbs_up is not None:
        r = re.findall(r'[1-9]\d*', thumbs_up)
        if r:
            thumbs_up_times = int(r[0])
            if '万' in thumbs_up:  # '万' means "ten thousand"
                thumbs_up_times = thumbs_up_times * 10000
    item['thumbs_up_times'] = thumbs_up_times
    text = get_select_first_str(response,
                                head + "//div[@class='main_info_bottom']",
                                None)
    item['text'] = text
    item['crawl_origin'] = '笑话巴士'
    item['crawl_url'] = response.url
    print(concat_str('标题:', title))
    print(concat_str('源媒体:', media_url))
    print(concat_str('源媒体头像:', media_avatar_img))
    print(concat_str('源媒体名称:', media_name))
    print(concat_str('获赞数:', str(thumbs_up_times)))
    print(concat_str('标签:', mark))
    print(concat_str('内容:', text))
    return item
def parse(self, response): elements = response.xpath( "//div[@class='th']//div[@class='main']//div[@class='main_info']") if len(elements) <= 0: return e = elements[0] for i in range(len(elements)): item = TextItem() j = i + 1 print(str(j)) #item['type'] = get_news_type(response.url) head = "//div[@class='th']//div[@class='main']//div[@class='main_info' and position()=" + str( j) + "]" title = get_select_first_str( e, head + "//div[@class='main_info_top']//div[@id='main_info_top_right']//div[@class='head_title_2']//span//a//h1/text()", None) if title != None: item['title'] = title.strip() else: item['title'] = None mark = None index = 1 while True: r = get_select_first_str( e, head + "//div[@class='main_info_top']//div[@id='main_info_top_right']//div[@id='head_title']//div[@class='publish_info']//span//a[position()=" + str(index) + "]/text()", None) if r == None: break index = index + 1 if mark == None: mark = r else: mark = mark + ',' + r item['mark'] = mark media_url = get_select_first_str( e, head + "//div[@class='main_info_top']//div[@class='main_info_top_left']//div[@id='head_photo']//a/@href", None) if media_url != None: media_url = 'http://www.xiaohuabus.com' + media_url item['media_url'] = media_url else: item['media_url'] = None media_avatar_img = get_select_first_str( e, head + "//div[@class='main_info_top']//div[@class='main_info_top_left']//div[@id='head_photo']//a//img/@src", None) if media_avatar_img != None: media_avatar_img = 'http://www.xiaohuabus.com' + media_avatar_img item['media_avatar_img'] = media_avatar_img else: item['media_avatar_img'] = None media_name = get_select_first_str( e, head + "//div[@class='main_info_top']//div[@id='main_info_top_right']//div[@id='head_title']//div[@class='user_info']//span[@id='yonghuming']//a/text()", None) if media_name != None: media_name = media_name.replace('\xa0', '') media_name = media_name.replace('⋅', '') item['media_name'] = media_name thumbs_up = get_select_first_str( e, head + "//div[@class='feix']//div[@class='feix_right']//a[position()=1]//span[position()=2]/text()", None) thumbs_up_times = None if thumbs_up == None: item['thumbs_up_times'] = None else: r = re.findall('[1-9]\d*', thumbs_up) if r != None and r.__len__() > 0: thumbs_up_times = int(r[0]) if '万' in thumbs_up: thumbs_up_times = thumbs_up_times * 10000 item['thumbs_up_times'] = thumbs_up_times else: item['thumbs_up_times'] = None text = get_select_first_str( e, head + "//div[@class='main_info_bottom']", None) item['text'] = text item['crawl_origin'] = '笑话巴士' item['crawl_url'] = response.url print(concat_str('图片标题:', title)) print(concat_str('源媒体:', media_url)) print(concat_str('源媒体头像:', media_avatar_img)) print(concat_str('源媒体名称:', media_name)) print(concat_str('获赞数:', str(thumbs_up_times))) print(concat_str('标签:', mark)) print(concat_str('文本内容:', text)) yield item next_url = get_select_first_str( response, "//div[@class='th']//div[@class='main']//div[@class='pager']//a[@class='page' and text()='下一页']/@href", None) if next_url is not None: next_url = "http:" + next_url yield response.follow(next_url, callback=self.parse)