def parse(self, response):
    self.driver.get(response.url)
    # time.sleep(5)
    # content = self.driver.page_source
    # print("爬取的内容如下:" + content)
    # selector = Selector(text=content)
    selector = Selector(response)
    # bigTitle = selector.xpath('//div[@class="hd"]/h2/text()').extract()
    # self.getBigTitle(selector)
    # self.getSmallTitle(selector)
    myContent = selector.xpath(
        '//div[@class="WordSection1"]/p[@class="MsoNormal"]/span//text()').extract()
    i = 0
    isTitle = False
    space = "\r\n\n\t"
    space1 = "\r\n"
    content1 = ""
    # self.singleText(content1, i, isTitle, myContent, space)
    for line in myContent:
        if isTitle:
            content1 += "^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + space1
            content1 += "当前的问题是:" + line + space1
            content1 += "^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + space
            isTitle = False
            continue
        if Utils.matchTitle(line):
            i += 1
            if i > 10:
                break
            content1 += "______________________" + space1
            content1 += line + "---------------" + space1
            content1 += "______________________" + space1
            isTitle = True
            continue
        if not isTitle:  # the original `~isTitle` is a bitwise NOT and is always truthy
            l = line
            # for l in line:
            if Utils.matchTitle(l):
                # content1 += line
                content1 += space
                continue
            content1 += l
            endChar = l[-1]
            if Utils.isEndChar(endChar):
                content1 += space
    print(content1)
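The parse() above leans on a Utils helper that is not shown here. As an assumption only (not the original implementation), matchTitle and isEndChar could look something like this, treating numbered headings as titles and common sentence-ending punctuation as end characters:

import re

class Utils:
    # Hypothetical pattern: lines starting with "1." / "1、" / "一、" style numbering count as titles.
    TITLE_RE = re.compile(r'^\s*([0-9]+|[一二三四五六七八九十]+)[..、]')

    @staticmethod
    def matchTitle(line):
        return bool(Utils.TITLE_RE.match(line))

    @staticmethod
    def isEndChar(ch):
        # Hypothetical set of sentence-ending characters.
        return ch in '。!?;.!?;'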
def parse(self, response):
    shop_id = response.url.split('/')[-1].strip()
    shop_url = response.url
    data = Selector(response).xpath('//div[@class="page-main"]')
    main = Selector(response).xpath('//div[@class="market-main"]')
    detail = Selector(response).xpath('//div[@class="market-detail"]')
    detail_other = Selector(response).xpath(
        '//div[@class="market-detail-other Hide"]')
    navigator_div = Selector(response).xpath('//div[@class="breadcrumb"]')

    location = []
    for loc in navigator_div.xpath('b/a/span'):
        location.append(loc.xpath('text()').extract()[0].strip())
    shop_navigation_path = '>'.join(location)
    print(shop_navigation_path)

    shop_name = Selector(response).xpath(
        '//div[@class="shop-name"]/h1/text()').extract()[0].strip()
    print(shop_name)
    shop_district = Selector(response).xpath(
        '//span[@class="region"]/text()').extract()[0].strip()
    print(shop_district)
    shop_address = Selector(response).xpath(
        '//span[@itemprop="street-address"]/text()').extract()[0].strip()
    print(shop_address)

    shop_phone_1 = None
    shop_phone_2 = None
    shop_rank = Selector(response).xpath(
        '//div[@class="comment-rst"]/span/@title').extract()[0].strip()
    shop_taste_score = None
    shop_env_score = None
    shop_service_score = None
    shop_price = Selector(response).xpath(
        '//div[@class="comment-rst"]/dl/dd/text()').extract()[0].strip()
    shop_review = Selector(response).xpath(
        '//div[@class="comment-rst"]/a/span/text()').extract()[0].strip()
    print(shop_phone_1)
    print(shop_rank)
    print(shop_taste_score, shop_env_score, shop_service_score,
          shop_price, shop_review)
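Every field above is read with .extract()[0], which raises IndexError as soon as one node is missing from the page. A small helper along these lines (not part of the original spider) keeps the same extraction but tolerates absent fields:

from scrapy import Selector


def first_or_none(selector, xpath):
    """Return the first stripped match for `xpath`, or None when nothing matches."""
    values = selector.xpath(xpath).extract()
    return values[0].strip() if values else None

# e.g. shop_name = first_or_none(Selector(response), '//div[@class="shop-name"]/h1/text()')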
def detial_parse(self, response): autohomeforumItem = response.meta['autohomeforumItem'] selector = Selector(response) mainBody = selector.xpath("//div[@id='cont_main']") main_topic = mainBody.xpath("./div[@id='maxwrap-maintopic']") detailContent = autohomeforumItem['contents'] topic_text = main_topic.xpath(".//div[contains(@class,'conttxt')]") topic_text = topic_text.xpath("string(.)").extract_first() # 检查是否有ttf cmp = re.compile(",url\('(//.*.ttf)'\) format\('woff'\)") rst = cmp.findall(response.body.decode('utf-8')) if rst: self.loopGet(self.savefont, rst[0]) currentPage = int(re.findall('-(\d+)\.html', response.url)[0]) if currentPage == 1: imageRecognizer = ImageRecognizer(orignText=topic_text, orignFont='temp.ttf') try: topic_text = ' '.join(imageRecognizer.outterCall().replace( '\n', '').split()) detailContent.append( [' '.join(topic_text.replace('\n', '').split()), '楼主']) except Exception as e: print(e) main_replyList = mainBody.xpath("./div[@id='maxwrap-reply']/div") for replyItem in main_replyList: try: tempList = dict() floor = replyItem.xpath(".//button/text()").extract_first() authorId = replyItem.xpath(".//a[@xname='uname']/@href" ).extract_first().split('/')[-2] authorName = replyItem.xpath( ".//a[@xname='uname']/text()").extract_first().strip() replyWho = replyItem.xpath( ".//div[@class='relyhfcon']//a[2]/text()") publistTime = replyItem.xpath( ".//span[@xname='date']/text()").extract_first() tempList['publishTime'] = publistTime if replyWho: # 如果是回复某楼层的,则tempList第一个是内容,第二个是楼层 thisContent = replyItem.xpath( ".//div[@class = 'yy_reply_cont']") thisContent = thisContent.xpath( "string(.)").extract_first() if rst: try: imageRecognizer = ImageRecognizer( orignText=thisContent, orignFont='temp.ttf') thisContent = ' '.join( imageRecognizer.outterCall().replace( '\n', '').split()) tempList['thisContent'] = ' '.join( thisContent.replace('\n', '').split()) tempList['replyWho'] = ' '.join( replyWho.extract_first().replace('\n', '').split()) except Exception as e: print(e) else: thisContent = replyItem.xpath( ".//div[contains(@class,'x-reply')]") thisContent = thisContent.xpath( "string(.)").extract_first() if rst: try: imageRecognizer = ImageRecognizer( orignText=thisContent, orignFont='temp.ttf') thisContent = ' '.join( imageRecognizer.outterCall().replace( '\n', '').split()) tempList['thisContent'] = ' '.join( thisContent.replace('\n', '').split()) tempList['replyWho'] = '楼主' except Exception as e: print(e) tempList['floor'] = floor tempList['authorId'] = authorId tempList['authorName'] = authorName detailContent.append(tempList) except Exception as e: print(e) autohomeforumItem['scrapyTime'] = time.strftime( "%Y-%m-%d %H:%M:%S", time.localtime()) autohomeforumItem['contents'] = detailContent nextPageUrl = selector.xpath("//a[text()='下一页']/@href").extract_first() if nextPageUrl is not None: yield SplashRequest(url=self.baseUrl + nextPageUrl, callback=self.detial_parse, args={ 'wait': 1, 'timeout': 60, 'images': 0 }, meta={'autohomeforumItem': autohomeforumItem}) else: yield autohomeforumItem
def company_info(self, response): # company_data = {'unit_type': response.meta['unit_type'], 'city': '', # 'start_date': '', 'number': '', 'authority': '', 'type_of_registration': '', # 'business_area': '', 'security_number': '', 'capital': '', 'unit_property': '', # 'social_registration': '', 'registered_address': '', 'registered__postal_code': '', # 'business_address': '', 'business_postal_number': '', 'legal_person': '', # 'website': '', # } company_name = Selector(response=response).xpath( '//td[@colspan="3"]')[0].xpath('./a/@title').extract_first() # company_data['company_name'] = company_name # # test = self.r.sadd('title_name1', company_name) # unit_property = Selector(response=response).xpath( # '//td[@style="width: 350px;padding-top: 9px;"]/text()').extract_first() # if unit_property.split(): # unit_property = unit_property.split()[0] # company_data['unit_property'] = unit_property # # capital = Selector(response=response).xpath('//td[@colspan="3"]')[2].xpath('text()').extract_first() # if capital is not None: # if capital != '/': # company_data['capital'] = capital + '万元' # # city = Selector(response=response).xpath('//td[@colspan="3"]')[1].xpath('text()').extract_first() # if city.split(): # city = city.split()[0] # company_data['city'] = city # # start_company_data = Selector(response=response).xpath('//td[@style="width: 230px;padding-top: 9px;"]')[ # 3].xpath('text()').extract_first() # if start_company_data.split(): # start_company_data = start_company_data.split()[0] # company_data['start_date'] = start_company_data # # number = Selector(response=response).xpath('//td[@colspan="3"]')[3].xpath( # 'text()').extract_first() # if number.split(): # number = number.split()[0] # company_data['number'] = number # # authority = Selector(response=response).xpath('//td[@style="width: 230px;padding-top: 9px;"]')[5].xpath( # 'text()').extract_first() # if authority is not None: # authority = authority.split()[0] # if authority != '/': # company_data['authority'] = authority # # type_of_registration = Selector(response=response).xpath('//td[@colspan="5"]')[0].xpath( # 'text()').extract_first() # if type_of_registration.split(): # type_of_registration = type_of_registration.split()[0] # company_data['type_of_registration'] = type_of_registration # # business_area = Selector(response=response).xpath('//td[@colspan="5"]')[1].xpath( # 'text()').extract_first() # if business_area is not None: # business_area = business_area.split()[0] # if business_area != '/': # company_data['business_area'] = business_area # # security_number = Selector(response=response).xpath('//td[@colspan="3"]')[4].xpath( # 'text()').extract_first() # if security_number is not None: # security_number = security_number.split()[0] # if security_number != '/': # company_data['security_number'] = security_number # # social_registration = Selector(response=response).xpath('//td[@style="width: 230px;padding-top: 9px;"]')[ # 9].xpath( # 'text()').extract_first() # if social_registration is not None: # social_registration = social_registration.split()[0] # if social_registration != '/': # company_data['social_registration'] = social_registration # # registered_address = Selector(response=response).xpath('//td[@colspan="3"]')[5].xpath( # 'text()').extract_first() # if registered_address is not None: # registered_address = registered_address.split()[0] # if registered_address != '/': # company_data['registered_address'] = registered_address # # registered__postal_code = Selector(response=response).xpath('//td[@style="width: 
230px;padding-top: 9px;"]')[ # 11].xpath( # 'text()').extract_first() # if registered__postal_code is not None: # registered__postal_code = registered__postal_code.split()[0] # if registered__postal_code != '/': # company_data['registered__postal_code'] = registered__postal_code # # business_address = Selector(response=response).xpath('//td[@colspan="3"]')[5].xpath( # 'text()').extract_first() # if business_address is not None: # business_address = business_address.split()[0] # if business_address != '/': # company_data['business_address'] = business_address # # business_postal_number = Selector(response=response).xpath('//td[@style="width: 230px;padding-top: 9px;"]')[ # 13].xpath( # 'text()').extract_first() # if business_postal_number is not None: # business_postal_number = business_postal_number.split()[0] # if business_postal_number != '/': # company_data['business_postal_number'] = business_postal_number # # legal_person = Selector(response=response).xpath('//td[@colspan="2"]/text()').extract_first() # if legal_person is not None: # legal_person = legal_person.split()[0] # if legal_person != '/': # company_data['legal_person'] = legal_person # # if len(Selector(response=response).xpath('//td[@colspan="5"]')) == 3: # website = Selector(response=response).xpath('//td[@colspan="5"]')[2].xpath( # 'text()').extract_first() # if website.split(): # print(website.split(), 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA') # website = website.split()[0] # if website.startswith('www') or website.startswith('http'): # company_data['website'] = website # print('公司信息', company_data) # yield scrapy.FormRequest(url='tongna', formcompany_data=company_data, callback=self.company_zz) ## 资质件信息 # ability_info_all = Selector(response=response).xpath('//table[@id="table_zz"]') # # print(ability_info_all, company_name) # if ability_info_all: # ability_info_all = ability_info_all[0].xpath('./tbody/tr') # for a in ability_info_all: # info_condition = a.xpath('./td') # # print(len(info_condition), company_name) # ability_data = {'company_name': company_name, 'issuing_authority': '', 'ability_type': '', # 'licence': '', 'grade': '', 'ability_number': '', 'start_date': ''} # ability_type = info_condition[0].xpath('text()').extract_first() # try: # ability_data['ability_type'] = ability_type.split()[0] # # except IndexError: # ability_data['ability_type'] = '' # # try: # licence = info_condition[1].xpath('text()').extract_first() # if licence is not None: # ability_data['licence'] = licence # except IndexError: # pass # # try: # grade = info_condition[2].xpath('text()').extract_first() # if grade is not None: # ability_data['grade'] = grade # except IndexError: # pass # # try: # ability_number = info_condition[3].xpath('text()').extract_first() # if ability_number is not None: # ability_data['ability_number'] = ability_number # except IndexError: # continue # # try: # start_date = info_condition[4].xpath('text()').extract_first() # if start_date is not None: # ability_data['start_date'] = start_date # except IndexError: # pass # # try: # end_date = info_condition[5].xpath('text()').extract_first() # if end_date is not None: # ability_data['end_date'] = end_date # except IndexError: # pass # # try: # issuing_authority = info_condition[6].xpath('text()').extract_first() # if issuing_authority is not None: # ability_data['issuing_authority'] = issuing_authority # except IndexError: # pass # # print('企业资质', ability_data, 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA') # # # yield scrapy.FormRequest(url='tongna', 
formdata=ability_info_all, callback=self.ability_zz) ## 安全证件信息 # ability_info_all2 = Selector(response=response).xpath('//table[@id="table_zz"]') # if len(ability_info_all2) == 2: # safe_ability = ability_info_all2[1].xpath('./tbody/tr') # print('为啥不执行这个安全证件信息????%s' % safe_ability) # for s in safe_ability: # safe_certificates_data = {'company_name': company_name, 'safe_number': '', 'address_certificates': '', # 'start_date_certificates': '', 'type_certificates': ''} # all_safe_td = s.xpath('./td') # safe_number = all_safe_td[0].xpath('text()').extract_first() # if safe_number is not None: # safe_number = safe_number.split()[0] # if safe_number == '无': # continue # safe_certificates_data['safe_number'] = safe_number # else: # continue # # address_certificates = all_safe_td[1].xpath('text()').extract_first() # if address_certificates is not None: # safe_certificates_data['address_certificates'] = address_certificates # # start_date_certificates = all_safe_td[2].xpath('text()').extract_first() # if start_date_certificates is not None: # safe_certificates_data['start_date_certificates'] = start_date_certificates # # type_certificates = all_safe_td[3].xpath('text()').extract_first() # if type_certificates is not None: # safe_certificates_data['type_certificates'] = type_certificates # print('企业安全证件信息', safe_certificates_data, 'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB') # # yield scrapy.FormRequest(url='tongna', formdata=safe_certificates_data, callback=self.ability_zz) # ## 系统相关细信息 # authentication_all = Selector(response=response).xpath('//table[@id="table_sys"]') # if authentication_all: # authentica_tr = authentication_all.xpath('./tbody/tr') # # for a in authentica_tr: # system_data = {'company_name': company_name, 'system_end': '', 'system_name': '', # 'system_start': ''} # d = a.xpath('./td') # system_name = d[0].xpath('text()').extract_first() # if system_name is not None: # # print('系统相关信息---%s----%s' % system_name, type(system_name)) # system_data['system_name'] = system_name # else: # continue # # system_start = d[1].xpath('text()').extract_first() # if system_start is not None: # # print('系统相关信息---%s----%s' % system_start, type(system_start)) # system_data['system_start'] = system_start # # system_end = d[2].xpath('text()').extract_first() # if system_end is not None: # # print('系统相关信息---%s----%s' % system_end, type(system_end)) # system_data['system_end'] = system_end # print('企业系统认证', system_data, 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC') # # # yield scrapy.FormRequest(url='tongna', formdata=system_data, callback=self.ability_zz) # 项目详情 project_performance = Selector( response=response).xpath('//div[@id="tab4"]/table/tbody/tr') print(len(project_performance), project_performance.xpath('./td/text()').extract_first(), company_name) if len(project_performance) != 1: for p in project_performance: project_data = { 'project_name': '', 'project_address': '', 'project_status': '', 'project_capital': '', 'project_start_date': '', 'project_company': '', 'project_complete': '' } easy_info = p.xpath('./td[@align="center"]') if len(easy_info) == 0: pass else: print(len(easy_info), 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA') project_name = easy_info[2].xpath('text()').extract_first() if project_name is not None: project_data['project_name'] = project_name else: continue project_address = easy_info[3].xpath( 'text()').extract_first() if project_address is not None: project_data['project_address'] = project_address project_status = easy_info[4].xpath( 'text()').extract_first() if project_status is not None: 
project_data['project_status'] = project_status project_capital = easy_info[5].xpath( 'text()').extract_first() if project_capital is not None: project_data['project_capital'] = project_capital project_start_date = easy_info[6].xpath( 'text()').extract_first() if project_start_date is not None: project_data['project_start_date'] = project_start_date project_company = easy_info[7].xpath( 'text()').extract_first() if project_company is not None: project_data['project_company'] = project_company project_complete = easy_info[8].xpath( './font/text()').extract_first() if project_complete is not None: project_data['project_complete'] = project_complete content = p.xpath( './td[@colspan="9"]/table[@id="table_report"]/tr') print(project_data)
def parseNews(self, response): charset = tools.detectPageCharset(response.body) if charset is not None: try: response.body.decode(charset) except UnicodeDecodeError: response.body.decode(self.backupCharset) else: logging.log( logging.WARNING, "Can not detect the charset encoding of " + response.url) sel = Selector(response) brandname = response.meta['brandname'] page_type = sel.xpath( '//div[@class="contents"]/div[@id="brand-right"]/div[@class="m-zx-lbox"]' ) page_type2 = sel.xpath( '//div[@class="g-content clearfix m-first"]/div[@class="first clearfix"]/div[@class="g-main fleft"]' ) page_type3 = sel.xpath('//div[@class="g-content clearfix articlePic"]') page_type4 = sel.xpath('//div[@class="centess"]//div[@id="pardynr"]') page_type5 = sel.xpath( '//div[@id="content94"]//div[@class="con2_left_row1"]') if len(page_type) > 0: # page_type 官方发布的新闻页面 title = page_type.xpath('./h1/text()').extract() date = page_type.xpath( './dl[@class="date"]/dt/span[1]/text()').extract() content = page_type.xpath( './div[@class="main"]//p//text()').extract() elif len(page_type2) > 0: # page_type2 编辑写的页面 all_pages = page_type2.xpath( './dl[@class="pages_fullRead"]/dd/a/@href').extract() if len(all_pages) > 0: # 如果包含全页阅读 需重新生成请求 url = "http://www.yoka.com" + all_pages[0] r = Request(url, callback=self.parseNews) r.meta['brandname'] = brandname yield r return else: print 'page type 2 llllllllllllllllllllllll' title = page_type2.xpath( './h1[@class="infoTitle"]/text()').extract() date = page_type2.xpath( './div[@class="infoTime"]/div[@class="time"]/i/text()' ).extract() content = page_type2.xpath( './div[@class="double_quotes"]/div/text()').extract() content.extend( page_type2.xpath( './div[@class="textCon"]//p//text()').extract()) elif len(page_type3) > 0: # page_type3 图片幻灯片的新闻页面 title = page_type3.xpath('./h1[@id="picTitle"]/text()').extract() content = page_type3.xpath( './dl[@class="text"]//dd/text()').extract() # 从URL中提取日期 http://www.yoka.com/fashion/popinfo/2016/0725/pic48495001119565.shtml?source=brand date_pattern = re.compile('(\d{4})/(\d{4})') sresult = re.search(date_pattern, response.url) if sresult is None: date_str = "" else: date_str = sresult.group() index = date_str.index('/') sub = date_str[index:index + 3] date = date_str.replace(sub, sub + '-').replace('/', '-') elif len(page_type4) > 0: # page_type4 老版网站的页面 http://www.yoka.com/fashion/roadshow/2008/082290701.shtml title = page_type4.xpath( './dl[@class="viewtis"]/dt/h1/text()').extract() # 提取日期字符串 如 2008-08-22 11:14 来源: date_str = page_type4.xpath( './dl[@class="viewtis"]/dd/text()').extract() pattern = re.compile('(\d{4}-\d{2}-\d{2})') date = re.search(pattern, date_str[0]).group() content = page_type4.xpath( './div[@id="viewbody"]//p//text()').extract() # 寻找是否有下一页链接 http://www.yoka.com/fashion/popinfo/2009/0922253399.shtml next_page = page_type4.xpath( './div[@id="viewbody"]//span[@class="pagebox_next"]/a/@href' ).extract() if len(next_page) > 0: # 如果有下一页链接, 则需要生成新的请求,交给parseNewsNextPage处理 url = "http://www.yoka.com" + next_page[0] r = Request(url, callback=self.parseNewsNextPage) article = { 'brandname': brandname, 'title': title, 'date': date, 'content': content } r.meta['article'] = article yield r return elif len(page_type5) > 0: # page_type5 老版网页页面: http://www.yoka.com/luxury/watch/2008/060268802.shtml title = page_type5.xpath('./h2/text()').extract() # 提取日期字符串 如:2008-06-02 17:12 来源: date_str = page_type5.xpath('./div[@class="src"]/text()').extract() pattern = re.compile('(\d{4}-\d{2}-\d{2})') date = re.search(pattern, 
date_str[0]).group() content = page_type5.xpath( './div[@class="con"]//p//text()').extract() # 寻找是否有下一页链接 http://www.yoka.com/luxury/watch/2008/060268802.shtml next_page = page_type5.xpath( './div[@class="con"]/p[@align="right"]/a[position()>1 and @style]' ).extract() if len(next_page): url = "http://www.yoka.com" + next_page[0] r = Request(url, callback=self.parseNewsNextPage) article = { 'brandname': brandname, 'title': title, 'date': date, 'content': content } r.meta['article'] = article yield r return else: return item = NewsItem() item['title'] = "".join(title) item['date'] = "".join(date) item['brandname'] = brandname item['content'] = "".join(content) yield item
def parse1(self, response):
    sele = Selector(response)
    title = sele.xpath('//title/text()').extract_first()
    if title:
        # article body text
        Content = ''
        Content_urls = sele.xpath('//ul[@class="ov"]//a/@href').extract()
        Content_urls_list = []
        for Content_url in Content_urls:
            if 'http://www.cankaoxiaoxi.com' in Content_url:
                Content_urls_list.append(Content_url)
        for url in Content_urls_list:
            response1 = requests.get(url)
            soup = Selector(text=response1.text)
            bodys = soup.xpath(
                '//div[@id="ctrlfscont"]//p/text()').extract()
            for body in bodys:
                Content = Content + str(body)
            time.sleep(4)
        if len(Content) < 10:
            bodys = sele.xpath(
                '//div[@id="ctrlfscont"]//p/text()').extract()
            for body in bodys:
                Content = Content + str(body)
        try:
            AgreeCount = sele.xpath(
                '//p[@class="emoji-num"]/text()').extract()[3]
        except:
            AgreeCount = ''
        try:
            DisagreeCount = sele.xpath(
                '//p[@class="emoji-num"]/text()').extract()[0]
        except:
            DisagreeCount = ''
        item = Yuqing_CankaoxiaoxiItem({
            'AuthorID': '',
            'AuthorName': sele.xpath(
                '//span[@id="editor_baidu"]/text()').extract_first(),
            'ArticleTitle': title,
            'SourceArticleURL': response.url,
            'URL': response.url,
            'PublishTime': sele.xpath(
                '//span[@id="pubtime_baidu"]/text()').extract_first(),
            'Crawler': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            'ReadCount': '',
            'CommentCount': '',
            'TransmitCount': '',
            'Content': Content,
            'comments': '',
            'AgreeCount': AgreeCount,
            'DisagreeCount': DisagreeCount,
            'AskCount': '',
            'ParticipateCount': '',
            'CollectionCount': '',
            'Classification': sele.xpath('//div[@class="crumb"]/a/text()').extract()[1],
            'Labels': sele.xpath(
                '//meta[@name="keywords"]/@content').extract_first(),
            'Type': '',
            'RewardCount': ''
        })
        yield item
def parse(self, response):
    sel = Selector(response)
    item = ChanyoujiUser()
    user_data = response.meta['data']
    item['user_id'] = user_data['user_id']

    user_name = sel.xpath(
        '//div[contains(@class, "header-inner")]/h1/text()').extract()
    if user_name:
        item['user_name'] = user_name[0]
    else:
        item['user_name'] = None

    ret = sel.xpath(
        '//div[contains(@class, "header-inner")]/div[1]/text()').extract()
    if ret:
        num_youji = ret[0]
        num = re.compile(r'\d{1,}')
        m1 = num.search(num_youji)
        if m1:
            item['num_notes'] = int(m1.group())

    ret = sel.xpath(
        '//div[contains(@class,"header-inner")]/a/img[contains(@class,"avatar") and @src]/@src').extract()
    if ret:
        item['avatar'] = ret[0]

    ret = sel.xpath('//div[contains(@class, "sns-site")]/p/text()').extract()
    if ret:
        ret = ret[0]
        if u'喜欢她的游记' in ret:
            item['gender'] = 'f'
        elif u'喜欢他的游记' in ret:
            item['gender'] = 'm'

    ret = sel.xpath(
        '//div[contains(@class, "sns-site")]/ul[@class="sns-ico"]/li[contains(@class,"weibo")]/a/@href').extract()
    if ret:
        weibo_url = ret[0]
        item['weibo_url'] = weibo_url
        match = re.search(r'weibo\.com/u/(\d+)/?$', weibo_url)
        if match:
            item['weibo_uid'] = int(match.groups()[0])
        else:
            match = re.search(r'weibo\.com/([^/]+)/?$', weibo_url)
            if match:
                item['weibo_uid'] = match.groups()[0]

    ret = sel.xpath(
        '//div[contains(@class, "sns-site")]/ul[@class="sns-ico"]/li[contains(@class,"douban")]/a/@href').extract()
    if ret:
        douban_url = ret[0]
        item['douban_url'] = douban_url
        match = re.search(r'douban\.com/people/(\d+)/?$', douban_url)
        if match:
            item['douban_uid'] = int(match.groups()[0])

    ret = sel.xpath(
        '//div[contains(@class, "sns-site")]/ul[@class="sns-ico"]/li[contains(@class,"renren")]/a/@href').extract()
    if ret:
        renren_url = ret[0]
        item['renren_url'] = renren_url
        match = re.search(r'renren\.com/(\d+)/profile/?$', renren_url)
        if match:
            item['renren_uid'] = int(match.groups()[0])

    marker = {}
    # look for the Gmaps.map.markers object embedded in the page
    match = re.search(r'Gmaps\.map\.markers\s*=\s*(?=\[)(.+?)(?<=\])', response.body)
    if match:
        try:
            marker_data = json.loads(match.groups()[0])
            for tmp in marker_data:
                lat = float(tmp['lat'])
                lng = float(tmp['lng'])
                mid = tmp['id']
                title = tmp['title'].strip()
                desc = tmp['description']
                match = re.search(r'href\s*="([^"]+)"', desc)
                href = 'http://chanyouji.com' + match.groups()[0] if match else None
                if href:
                    marker[mid] = {'lat': lat, 'lng': lng, 'title': title,
                                   'url': href, 'data_id': mid}
        except (ValueError, KeyError):
            pass

    traveled_list = []
    for data_id in sel.xpath(
            '//ul[@id="attraction_markers_list"]//a[contains(@class,"node") and @data-id]/@data-id').extract():
        data_id = int(data_id)
        if data_id not in marker:
            continue
        traveled_list.append(marker[data_id])
    item['traveled'] = traveled_list

    if not item['traveled']:
        yield item
    else:
        yield Request(url=item['traveled'][0]['url'], callback=self.parse_note,
                      meta={'item': item})
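The lookahead/lookbehind regex that pulls the Gmaps.map.markers array out of the page body is easier to follow in isolation. The sample string below is invented for illustration; only the two regular expressions and the json.loads step come from the spider (note that on Python 3 response.body is bytes, so the real call would need response.text or a decoded body):

import json
import re

sample = ('var page = 1; Gmaps.map.markers = [{"lat": 31.22, "lng": 121.48, "id": 7, '
          '"title": "The Bund", "description": "<a href=\\"/attractions/7\\">The Bund</a>"}]; Gmaps.map.show();')

match = re.search(r'Gmaps\.map\.markers\s*=\s*(?=\[)(.+?)(?<=\])', sample)
if match:
    markers = json.loads(match.groups()[0])
    print(markers[0]['title'])                                                      # The Bund
    print(re.search(r'href\s*="([^"]+)"', markers[0]['description']).groups()[0])  # /attractions/7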
def _parse_description(self, item):
    """Parse or generate meeting description."""
    desc_text = " ".join(
        Selector(text=html.unescape(item["description"])).css(
            "*::text").extract())
    return re.sub(r"\s+", " ", desc_text).strip()
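To see what _parse_description produces, here is the same pipeline run on an invented, HTML-escaped description: html.unescape restores the markup, the *::text selector strips the tags, and the final re.sub collapses runs of whitespace.

import html
import re

from scrapy import Selector

sample = "&lt;p&gt;Budget committee&lt;/p&gt;  \n  (rescheduled)"
text = " ".join(Selector(text=html.unescape(sample)).css("*::text").extract())
print(re.sub(r"\s+", " ", text).strip())   # -> Budget committee (rescheduled)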
def get_user_info(response):
    # sel = Selector(response)
    if 'http://weibo.com/sorry?pagenotfound&' == response.url:
        return None
    user_item = UserInfoItem()
    user_item['url'] = response.url
    user_id = check_value(get_page_conf_info(response, 'oid'))
    user_item['user_id'] = user_id
    page_id = check_value(get_page_conf_info(response, 'page_id'))
    user_item['page_id'] = page_id

    info_div = get_dom_html(response, 'Pl_Official_PersonalInfo__60')
    if info_div:
        info_list = Selector(
            text=info_div).xpath('//li[@class="li_1 clearfix"]')
        for info in info_list:
            info_title = info.xpath(
                './span[contains(@class, "pt_title S_txt")]').xpath(
                    'string(.)').extract_first()
            info_detail = info.xpath('./span[contains(@class, "pt_detail")]'
                                     ).xpath('string(.)').extract()
            info_detail = [info_.strip() for info_ in info_detail if info_]
            if '昵称' in info_title:
                user_item['nick_name'] = check_value(''.join(info_detail))
            elif '真实姓名' in info_title:
                user_item['real_name'] = check_value(''.join(info_detail))
            elif '所在地' in info_title:
                user_item['location'] = check_value(''.join(info_detail))
            elif '性别' in info_title:
                user_item['sex'] = check_value(''.join(info_detail))
            elif '性取向' in info_title:
                user_item['sexual_orientation'] = check_value(
                    ''.join(info_detail))
            elif '感情状况' in info_title:
                user_item['Relationship_status'] = check_value(
                    ''.join(info_detail))
            elif '生日' in info_title:
                user_item['birthday'] = check_value(''.join(info_detail))
            elif '博客' in info_title:
                user_item['blog_address'] = check_value(''.join(info_detail))
            elif '个性域名' in info_title:
                user_item['personal_url'] = check_value(''.join(info_detail))
            elif '简介' in info_title:
                user_item['description'] = check_value(''.join(info_detail))
            elif '注册时间' in info_title:
                user_item['register_date'] = check_value(''.join(info_detail))
            elif '公司' in info_title:
                user_item['company'] = check_value('\n'.join(info_detail))
            elif '大学' in info_title:
                user_item['education'] = check_value('\n'.join(info_detail))
            elif '标签' in info_title:
                user_item['tag'] = check_value('\n'.join(info_detail))
            elif '邮箱' in info_title:
                user_item['mail'] = check_value(''.join(info_detail))
            elif 'QQ' in info_title:
                user_item['qq'] = check_value(''.join(info_detail))
            elif '血型' in info_title:
                user_item['blood_type'] = check_value(''.join(info_detail))
            else:
                print('info div more value!! ' + info_title)
    else:
        raise ValueError('no info div')

    # following / followers / weibo post counts
    num_div = get_dom_html(response, 'Pl_Core_T8CustomTriColumn__56')
    if num_div:
        num_list = Selector(text=num_div).xpath(
            '//td[contains(@class, "S_line")]/a[contains(@class, "t_link S_txt")]')
        for num_ in num_list:
            num_data = num_.xpath('./*[contains(@class, "W_f")]').xpath(
                'string(.)').extract_first()
            num_name = num_.xpath('./span[contains(@class, "S_txt")]').xpath(
                'string(.)').extract_first()
            if not num_data or not num_data.strip().isdigit():
                num_data = -1
            if '关注' in num_name:
                user_item['friends_num'] = num_data
            elif '粉丝' in num_name:
                user_item['fans_num'] = num_data
            elif '微博' in num_name:
                user_item['blog_num'] = num_data
            else:
                print('num div more value!! ' + num_name)
    else:
        raise ValueError('no num div')

    head_div = get_dom_html(response, 'Pl_Official_Headerv6')
    if head_div:
        head_info = Selector(text=head_div).xpath(
            '//a[@class="icon_bed"]/em/@class').extract_first()
        if not head_info:
            user_item['is_v'] = 'nil'
        else:
            user_item['is_v'] = check_value(head_info)
        # avatar image URL
        head_img_url = Selector(text=head_div).xpath(
            '//div[@node-type="photo"]'
            '//img[@class="photo"]/@src').extract_first()
        if head_img_url:
            user_item['head_img_url'] = head_img_url
        else:
            user_item['head_img_url'] = ''
    else:
        raise ValueError('head div error!!')

    level_div = get_dom_html(response, 'Pl_Official_RightGrowNew')
    if level_div:
        level_info = Selector(text=level_div) \
            .xpath('//div[contains(@class,"level_box S_txt")]').xpath('string(.)').extract_first()
        user_item['rank'] = check_value(level_info)
    else:
        raise ValueError('level div error!!')

    user_item['parse_time'] = time.time()
    return user_item
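get_user_info depends on three helpers that are not shown: get_page_conf_info, get_dom_html and check_value. Judging only from how they are called (reading oid/page_id out of the page's $CONFIG script block and returning the HTML of a single FM.view pagelet), plausible minimal versions might look like the sketch below; these are assumptions, not the original code:

import json
import re


def get_page_conf_info(response, key):
    # Assumption: the page defines values such as $CONFIG['oid']='1234567890';
    match = re.search(r"\$CONFIG\['%s'\]='([^']*)'" % key, response.text)
    return match.group(1) if match else None


def get_dom_html(response, domid):
    # Assumption: each pagelet is embedded as an FM.view({"domid": "...", "html": "..."}) call.
    for block in re.findall(r'FM\.view\((\{.*?\})\)', response.text, re.S):
        try:
            data = json.loads(block)
        except ValueError:
            continue
        if domid in data.get('domid', ''):
            return data.get('html', '')
    return None


def check_value(value):
    # Assumption: normalise missing values to an empty string.
    return value.strip() if value else ''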
def extract_item_desc():
    # `response` is taken from the enclosing scope; the tooltip markup is stored
    # inside the title attribute, so it is parsed with a second Selector.
    inner = response.css('a.J-tooltip::attr(title)').extract_first()
    sel = Selector(text=inner)
    item_desc = sel.css('.tooltip-tip::text').get()
    return item_desc
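extract_item_desc parses HTML that is stored inside an attribute value by wrapping it in a second Selector. A self-contained sketch of the same trick, using a made-up fragment (the HTML parser unescapes the entities when the title attribute is read back):

from scrapy import Selector

fragment = ('<a class="J-tooltip" title="&lt;span class=&quot;tooltip-tip&quot;&gt;'
            'Free shipping&lt;/span&gt;">?</a>')
outer = Selector(text=fragment)
inner = Selector(text=outer.css('a.J-tooltip::attr(title)').extract_first())
print(inner.css('.tooltip-tip::text').get())   # -> Free shipping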
def parse_detail(position_queue): positions_info = [] while True: if position_queue.empty(): break url = position_queue.get() proxy["http"] = get_proxy() time.sleep(random.randint(5, 10)) response = requests.get(url, headers=headers, proxies=proxy) time.sleep(random.randint(5, 10)) selector = Selector(text=response.text) apartment = selector.xpath('//div[@class="company"]/text()').extract()[0] # 招聘部门 title = selector.xpath('//div[@class="job-name"]/@title').extract()[0] # 标题 publish_time = selector.xpath('//p[@class="publish_time"]/text()').extract()[0] # 发布时间 publish_time = publish_time.split("发布于拉勾网")[0].strip() job_desc = selector.xpath('//dd[@class="job_bt"]/div').extract()[0].replace( "<div>", '').replace("</div>", '').replace( "<p>", '').replace("</p>", '').strip() # 职位描述 job_advantage = selector.xpath('//dd[@class="job-advantage"]/p/text()').extract()[0] # 职业诱惑 job_addr_list= selector.xpath('//div[@class="work_addr"]').extract()[0] job_addr_list = remove_tags(job_addr_list).split("\n") job_addr_list = [job_addr.strip() for job_addr in job_addr_list if job_addr.strip != "查看地图"] job_addr = "".join(job_addr_list) salary = selector.xpath('//dd[@class="job_request"]/p/span/text()').extract()[0] # 薪资 if '-' in salary: salary_min = salary.split('-')[0] salary_max = salary.split('-')[1] elif '以上' in salary: salary_min = salary.split('以上')[0] salary_max = salary_min city = selector.xpath('//dd[@class="job_request"]/p/span[2]/text()').extract()[0].replace("/", '') # 经验要求 work_experience = selector.xpath('//dd[@class="job_request"]/p/span[3]/text()').extract()[0] if '-' in work_experience: work_experience_min = work_experience.split('-')[0].replace("经验", '') if int(work_experience_min) > 1: work_experience_min = work_experience_min + 'years' else: work_experience_min = work_experience_min + 'year' work_experience_max = work_experience.split('-')[1].replace("年", '').replace("/", '').strip() + 'years' elif '不限' in work_experience: work_experience_min = work_experience.replace("经验不限", "no require") work_experience_max = work_experience_min elif '以下' in work_experience: work_experience_max = work_experience.split("年")[0].replace("经验", "").replace("年", "") if int(work_experience_max) > 1: work_experience_max = work_experience_max + 'years' work_experience_min = work_experience_max else: work_experience_max = work_experience_max + 'year' work_experience_min = work_experience_max elif '应届' in work_experience: work_experience_min = work_experience.replace("经验应届毕业生 /", "graduates") work_experience_max = work_experience_min # 学历 education = selector.xpath('//dd[@class="job_request"]/p/span[4]/text()').extract()[0] if "本科" in education: education = education.split("本")[0].replace("", "undergraduate") elif "大专" in education: education = education.split("大")[0].replace("", "junior_college_student") elif "不限" in education: education = education.replace("学历不限 /", "no require") # 职业类型 job_type = selector.xpath('//dd[@class="job_request"]/p/span[5]/text()').extract()[0] if "全职" in job_type: job_type = job_type.replace("全职", "full time") else: job_type = job_type.replace("实习", "fieldwork") positions_info.append((apartment, title, salary_min, salary_max, city, work_experience_min, work_experience_max, education, job_type, publish_time, job_advantage, job_desc, job_addr)) for position_info in positions_info: try: insert_sql = """ insert into lagou_job_requests (apartment, title, salary_min, salary_max, city, work_experience_min, work_experience_max, education, job_type, publish_time, job_advantage, job_desc, 
job_addr) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """ cursor.execute(insert_sql, (position_info[0], position_info[1], position_info[2], position_info[3], position_info[4], position_info[5], position_info[6], position_info[7], position_info[8], position_info[9], position_info[10], position_info[11], position_info[12])) conn.commit() except: conn.rollback()
# Importing Selector
from scrapy import Selector
# Importing requests to load the HTML data
import requests

# URL of the Jumia home page to scrape
url = "https://www.jumia.com.ng"
# Holding the HTML source as text (Selector(text=...) expects a str, not bytes)
html = requests.get(url).text
# Creating a Selector object
sel = Selector(text=html)

# Printing the total number of elements in the page
print(len(sel.xpath("//*")))
# Selecting all id attributes in the home page
print(sel.xpath("//@id"))
# Searching for a particular class attribute
print(sel.xpath("//p[@class = '-mas -elli2']"))
# Selecting the text of all products in the "-mas -elli2" class
print(sel.xpath('//p[@class = "-mas -elli2"]//text()').extract())
# Searching for the PS4 console image
print(sel.xpath('//div/img[@alt="PS4 Consoles"]').extract())
# Printing out all the courses offered in the home page
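The product-name query can also be written with Scrapy's CSS selector support; the attribute selector below is equivalent to the @class XPath used above:

# CSS equivalent of the product-name XPath query
print(sel.css('p[class="-mas -elli2"]::text').extract())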
from scrapy import Selector
from urllib.request import urlopen

html = urlopen("https://www.pythonparatodos.com.br/formulario.html")
# urlopen returns bytes; Selector(text=...) expects a str, so decode first (UTF-8 assumed)
sel = Selector(text=html.read().decode('utf-8'))
lista = sel.xpath('//input')
terceiro_input = lista[2]  # the third <input>; indexing is zero-based (the original used lista[3])
print(terceiro_input.extract())
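As a follow-up, each selector in lista can be inspected attribute by attribute; on recent Scrapy versions, Selector.attrib exposes the element's attributes as a dict:

# Print the name attribute of every <input> on the form page
for campo in lista:
    print(campo.attrib.get('name'))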
def product_parse(self, response): if len(response.text) < 40000: yield scrapy.Request(url=response.request.url, callback=self.product_parse, dont_filter=True, meta=response.meta) return None item = response.meta['item'] # 商品链接 product_url = response.request.url # 商品ID ProductID = product_url.split('/')[-1].split('.')[0] # 商品链接urlID urlID = product_url.split('/')[-2] # 商品链接urlID urlID = product_url.split('/')[-2] # 店铺名称 try: shop_name = re.findall('shopName":"(.*?)"', response.text)[0] except: try: shop_name = re.findall('"curShopName":.*?>(.*?)</a>"', response.text)[0] except: try: shop_name = response.xpath( ".//div[@class='si-intro-list']/dl[1]/dd/a/text()" ).extract()[0] except: shop_name = None #去掉shopname中的空白字符 shop_name = re.sub(r'\r', '', shop_name) shop_name = re.sub(r'\t', '', shop_name) shop_name = re.sub(r'\n', '', shop_name) shop_name = re.sub(r' ', '', shop_name) # 商品名称 try: p_Name = response.xpath( ".//div[@class='imgzoom-main']/a[@id='bigImg']/img/@alt" ).extract()[0] except: try: p_Name = re.findall('"itemDisplayName":"(.*?)"', response.text)[0] except: p_Name = None #类别 try: X_type = Selector(response).re('"分类":"(.*?)"')[0] except: try: X_type = Selector(response).re( '分类</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: X_type = re.findall('"分类":"(.*?)"', response.text)[0] except: X_type = None # 品牌 try: brand = Selector(response).re('"brandName":"(.*?)"')[0] except: try: brand = Selector(response).re('<li><b>品牌</b>:(.*?)</li>')[0] except: try: brand = re.findall('"brandName":"(.*?)"', response.text)[0] except: brand = None # 去掉品牌括号内容 if brand: if re.findall(r'(.*?)', brand): re_com = re.compile('(.*?)') brand = brand[:0] + re.sub(re_com, '', brand) if brand: if re.findall(r'\(.*?\)', brand): re_cn = re.compile('\(.*?\)') brand = brand[:0] + re.sub(re_cn, '', brand) # 颜色 color = None # 类型,商品型号 try: X_name = Selector(response).re( '型号</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: X_name = re.findall( '型号</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] if X_name == None: X_name = re.findall( '型号</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: X_name = None if X_name: if brand: if brand in X_name: X_name = X_name[:0] + re.sub(brand, '', X_name) X_name = X_name[:0] + re.sub(r'(.*?)', '', X_name) X_name = X_name[:0] + re.sub(r'\(.*?\)', '', X_name) #安装方式 try: install = Selector(response).re('安装方式:(.*?)</li>')[0] except: try: install = Selector(response).re( '安装方式</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: install = re.findall('安装方式:(.*?)</li>', response.text)[0] except: try: install = re.findall( '安装方式</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: install = None #是否可以直饮 try: drink = Selector(response).re('是否直饮:(.*?)</li>')[0] except: try: drink = Selector(response).re( '是否直饮</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: drink = re.findall('是否直饮:(.*?)</li>', response.text)[0] except: try: drink = re.findall( '是否直饮</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: drink = None #滤芯种类 try: kinds = Selector(response).re('滤芯种类:(.*?)</li>')[0] except: try: kinds = Selector(response).re( '滤芯种类</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: kinds = re.findall('滤芯种类:(.*?)</li>', response.text)[0] except: try: kinds = re.findall( '滤芯种类</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: kinds = None #滤芯使用寿命 try: life = Selector(response).re('滤芯寿命:(.*?)</li>')[0] except: try: 
life = Selector(response).re( '滤芯寿命</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: life = re.findall('滤芯寿命:(.*?)</li>', response.text)[0] except: try: life = re.findall( '滤芯寿命</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: life = None #过滤精度 try: precision = Selector(response).re('过滤精度:(.*?)</li>')[0] except: try: precision = Selector(response).re( '过滤精度</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: precision = re.findall('过滤精度:(.*?)</li>', response.text)[0] except: try: precision = re.findall( '过滤精度</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: precision = None # 核心参数 type = '"' soup = BeautifulSoup(response.text, 'lxml') try: ul = soup.find('ul', attrs={'class': 'cnt clearfix'}) li = ul.find_all('li') for i in range(len(li)): type = type[:] + li[i].text if i < len(li) - 1: type = type[:] + ' ' if i == len(li) - 1: type = type[:] + '"' except: try: # 部分核心参数格式更改 div = soup.find('div', class_='prod-detail-container') ul = div.find('ul', attrs={'class': 'clearfix'}) li = ul.find_all('li') for each in li: li_li = each.find_all('li') for i in range(len(li_li)): type = type[:] + li_li[i].text if i < len(li_li) - 1: type = type[:] + ' ' if i == len(li_li) - 1: type = type[:] + '"' except: type = None if type: if len(type) < 2: type = None if type == None: try: parameter_id = Selector(response).re( '"mainPartNumber":"(.*?)"')[0] except: try: parameter_id = re.findall('"mainPartNumber":"(.*?)"', response.text)[0] except: parameter_id = None type = None if parameter_id: try: parameter_id = Selector(response).re( '"mainPartNumber":"(.*?)"')[0] parameter_url = 'https://product.suning.com/pds-web/ajax/itemParameter_%s_R0105002_10051.html' % parameter_id para_response = requests.get(parameter_url).text time.sleep(0.3) eles = re.findall('"snparameterdesc":"(.*?)"', para_response) souls = re.findall('"snparameterVal":"(.*?)"', para_response) try: type = '"' for i in range(len(eles)): type = type[:] + eles[i] + ':' + souls[i] if i < len(eles) - 1: type = type[:] + ' ' if i == len(eles) - 1: type = type[:] + '"' if len(type) < 2: type = None except: type = None if brand == None: try: brand = re.findall( '"snparameterdesc":"品牌","snparameterVal":"(.*?)"', para_response)[0] except: brand = None try: X_name = re.findall( '"snparameterdesc":"型号","snparameterVal":"(.*?)"', para_response)[0] except: X_name = None if X_name: if brand: if brand in X_name: X_name = X_name[:0] + re.sub(brand, '', X_name) X_name = X_name[:0] + re.sub(r'(.*?)', '', X_name) X_name = X_name[:0] + re.sub(r'\(.*?\)', '', X_name) #类别 if X_type == None: try: X_type = re.findall( '"snparameterdesc":"分类","snparameterVal":"(.*?)"', para_response)[0] except: X_type = None #安装方式 if install == None: try: install = re.findall( '"snparameterdesc":"安装方式","snparameterVal":"(.*?)"', para_response)[0] except: install = None #是否直饮 if drink == None: try: drink = re.findall( '"snparameterdesc":"是否直饮","snparameterVal":"(.*?)"', para_response)[0] except: drink = None #滤芯种类 if kinds == None: try: kinds = re.findall( '"snparameterdesc":"滤芯种类","snparameterVal":"(.*?)"', para_response)[0] except: kinds = None #滤芯使用寿命 if life == None: try: life = re.findall( '"snparameterdesc":"滤芯寿命","snparameterVal":"(.*?)"', para_response)[0] except: life = None #过滤精度 if precision == None: try: precision = re.findall( '"snparameterdesc":"过滤精度","snparameterVal":"(.*?)"', para_response)[0] except: precision = None except: pass # 获取相关请求url keyword_url = 
'https://review.suning.com/ajax/getreview_labels/general-000000000' + ProductID + '-' + urlID + '-----commodityrLabels.htm' comment_url = 'https://review.suning.com/ajax/review_satisfy/general-000000000' + ProductID + '-' + urlID + '-----satisfy.htm' price_url = 'https://pas.suning.com/nspcsale_0_000000000' + ProductID + '_000000000' + ProductID + '_' + urlID + '_10_010_0100101_20268_1000000_9017_10106_Z001.html' # 获取印象关键字 try: keyword_response = requests.get(keyword_url).text keyword_text = json.loads( re.findall(r'\((.*?)\)', keyword_response)[0]) keyword_list = keyword_text.get('commodityLabelCountList') key_str = '"' keyword = [] for i in range(len(keyword_list)): key_str = key_str[:] + keyword_list[i].get('labelName') if i < len(keyword_list) - 1: key_str = key_str[:] + ' ' if i == len(keyword_list) - 1: key_str = key_str[:] + '"' keyword.append(key_str) except: keyword = None # 获取评价信息 try: comment_response = requests.get(comment_url).text comment_text = json.loads( re.findall(r'\((.*?)\)', comment_response)[0]) comment_list = comment_text.get('reviewCounts')[0] # 差评 PoorCount = comment_list.get('oneStarCount') twoStarCount = comment_list.get('twoStarCount') threeStarCount = comment_list.get('threeStarCount') fourStarCount = comment_list.get('fourStarCount') fiveStarCount = comment_list.get('fiveStarCount') # 评论数量 CommentCount = comment_list.get('totalCount') # 好评 GoodCount = fourStarCount + fiveStarCount # 中评 GeneralCount = twoStarCount + threeStarCount # 好评度 # 得到百分比取整函数 if CommentCount != 0: goodpercent = round(GoodCount / CommentCount * 100) generalpercent = round(GeneralCount / CommentCount * 100) poorpercent = round(PoorCount / CommentCount * 100) commentlist = [GoodCount, GeneralCount, PoorCount] percent_list = [goodpercent, generalpercent, poorpercent] # 对不满百分之一的判定 for i in range(len(percent_list)): if percent_list[i] == 0 and commentlist[ i] != 0 and CommentCount != 0: percent_list[i] = 1 nomaxpercent = 0 # 定义为累计不是最大百分比数值 # 好评度计算url='http://res.suning.cn/project/review/js/reviewAll.js?v=20170823001' if CommentCount != 0: maxpercent = max(goodpercent, generalpercent, poorpercent) for each in percent_list: if maxpercent != each: nomaxpercent += each GoodRateShow = 100 - nomaxpercent else: GoodRateShow = 100 else: PoorCount = 0 CommentCount = 0 GoodCount = 0 GeneralCount = 0 GoodRateShow = 100 except: PoorCount = 0 CommentCount = 0 GoodCount = 0 GeneralCount = 0 GoodRateShow = 100 # 有关价格 try: price_response = requests.get(price_url).text except requests.RequestException as e: # print(e) time.sleep(2) s = requests.session() s.keep_alive = False s.mount('https://', HTTPAdapter(max_retries=5)) price_response = s.get(price_url).text if len(price_response) > 900: try: price = re.findall('"refPrice":"(.*?)"', price_response)[0] PreferentialPrice = re.findall('"promotionPrice":"(.*?)"', price_response)[0] if len(price) < 1: price = re.findall('"netPrice":"(.*?)"', price_response)[0] if price: if float(price) < float(PreferentialPrice): tt = price price = PreferentialPrice PreferentialPrice = tt except: price = None PreferentialPrice = None else: time.sleep(3) price_response = requests.get(price_url).text if len(price_response) > 900: try: price = re.findall('"refPrice":"(.*?)"', price_response)[0] PreferentialPrice = re.findall('"promotionPrice":"(.*?)"', price_response)[0] if len(price) < 1: price = re.findall('"netPrice":"(.*?)"', price_response)[0] if price: if float(price) < float(PreferentialPrice): tt = price price = PreferentialPrice PreferentialPrice = tt except: price = None 
PreferentialPrice = None else: # 作出失败判断并将url归入重试 price_response = self.retry_price(price_url) if len(price_response) > 500: try: price = re.findall('"refPrice":"(.*?)"', price_response)[0] PreferentialPrice = re.findall( '"promotionPrice":"(.*?)"', price_response)[0] if len(price) < 1: price = re.findall('"netPrice":"(.*?)"', price_response)[0] if price: if float(price) < float(PreferentialPrice): tt = price price = PreferentialPrice PreferentialPrice = tt except: price = None PreferentialPrice = None else: PreferentialPrice = None price = None if kinds: if re.findall(r'\d', kinds) and len(kinds) < 3: level = kinds kinds = None else: level = None else: level = None # 防止出现多个字段出现为空 if p_Name == None and brand == None and type == None: yield None else: source = '苏宁' item['shop_name'] = shop_name item['p_Name'] = p_Name item['X_name'] = X_name item['type'] = type item['price'] = price item['PreferentialPrice'] = PreferentialPrice item['brand'] = brand item['keyword'] = keyword item['PoorCount'] = PoorCount item['CommentCount'] = CommentCount item['GoodCount'] = GoodCount item['GeneralCount'] = GeneralCount item['GoodRateShow'] = GoodRateShow item['install'] = install item['drink'] = drink item['source'] = source item['level'] = level item['kinds'] = kinds item['life'] = life item['precision'] = precision item['color'] = color item['product_url'] = product_url item['ProductID'] = ProductID item['X_type'] = X_type yield item
def parse_detail(self, response): # TODO 主动关闭爬虫问题 try: data = Selector(text=response.body.decode('utf-8')) items = str(data.xpath('string(.)').extract()[0]).replace( '\xa0', '').replace('\u3000', '') WJBT_45 = '' SJ_46 = '' LY_47 = '' ZWBT_48 = '' DKBH_49 = '' ZDBH_50 = '' PMJG_51 = '' GGZRFS_52 = '' GPSJ_53 = '' ZRR_54 = '' ZRF_55 = '' SRR_56 = '' SRF_57 = '' SRDW_58 = '' WZ_59 = '' DKWZ_60 = '' CRMJ_61 = '' YT_62 = '' CJJ_63 = '' BDCQDJH_64 = '' CRHTBH_65 = '' CRHT_66 = '' BGXYBH_67 = '' TDYT_68 = '' SYNX_69 = '' MJ_70 = '' TDMJ_71 = '' ZRJG_72 = '' CRNX_73 = '' TDSYNX_74 = '' BZ_75 = '' GSQ_76 = '' LXDW_77 = '' DWDZ_78 = '' YZBM_79 = '' LXDH_80 = '' LXR_81 = '' DZYJ_82 = '' # TODO 共有字段 reFunction(f'时间:\s*([{self.reStr}]*)\s', LY) # 文件标题 WJBT_45 = response.meta.get('title') # 时间 SJ_46 = data.xpath( '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()' ).extract_first() # 来源 LY_47 = data.xpath( '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()' ).extract_first() # 正文标题 ZWBT_48 = data.xpath( '//div[@class="ztzx_frame_content"]/div[1]/text()' ).extract_first() # 公示期 GSQ_76 = reFunction( f'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[。\s]', items) # 联系单位 LXDW_77 = reFunction( '联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 单位地址 DWDZ_78 = reFunction( '单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 邮政编码 YZBM_79 = reFunction( '邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 联系电话 LXDH_80 = reFunction( '联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 联系人 LXR_81 = reFunction( '联\s*系\s*人:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 电子邮件 DZYJ_82 = reFunction( '电子邮件:([()\w\.:: —\(\)@〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 爬取时间 crawlingTime = time.strftime("%Y-%m-%d", time.localtime()) # 爬取地址url url = response.url # 唯一标识 md5Mark = encrypt_md5(url + WJBT_45 + SJ_46) soup = BeautifulSoup( response.body.decode('utf-8').replace('thead', 'tbody')) table = soup.find('table') htmlTable = htmlTableTransformer() if '国有划拨土地使用权结果公示' in items: table.find_all('tr')[1].extract() tdData = htmlTable.tableTrTdRegulationToList(table) for _ in range(len(list(tdData.values())[0])): # 地块编号 DKBH_49 = tdData.get('地块编号')[_] if tdData.get( '地块编号') else '' # 公开转让方式 GGZRFS_52 = tdData.get('公开转让方式')[_] if tdData.get( '公开转让方式') else '' # 挂牌时间 GPSJ_53 = tdData.get('挂牌')[_] if tdData.get('挂牌') else '' # 受让人 SRR_56 = tdData.get('受让人')[_] if tdData.get('受让人') else '' # 位置 WZ_59 = tdData.get('位置')[_] if tdData.get('位置') else '' # 出让面积(平方米) CRMJ_61 = tdData.get('出让面积')[_] if tdData.get( '出让面积') else '' # 用途 YT_62 = tdData.get('用途')[_] if tdData.get('用途') else '' # 成交价(万元) CJJ_63 = tdData.get('成交价')[_] if tdData.get('成交价') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: 
{e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' % response.url) elif '不动产权登记证号' in items: # 转让方 ZRF_55 = reFunction( '转让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 受让方 SRF_57 = reFunction( '受让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 位置 WZ_59 = reFunction( '宗地位置:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 不动产权登记证号 BDCQDJH_64 = reFunction( '不动产权登记证号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 出让合同编号 CRHTBH_65 = reFunction( '出让合同编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 变更协议编号 BGXYBH_67 = reFunction( '出让合同变更协议编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地用途 TDYT_68 = reFunction( '土地用途:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 使用年限 SYNX_69 = reFunction( '使用年限:\s*([()【】\w\.::—\(\)〔〕\s㎡≤≥《》\-\/\%,;,、\.﹪]*)面\s*积', items) # 面积 MJ_70 = reFunction( '面\s*积:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 转让价格(单价总价) ZRJG_72 = reFunction( '转让价格:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、。\.﹪]*)\s', items) # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) elif '挂牌出让地块的基本情况和规划指标要求' in items: # 宗地编号 ZDBH_50 = reFunction( '宗地编号:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 挂牌时间 GPSJ_53 = reFunction( '挂牌时间为:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s', items).replace('。', '') # 转让人 ZRR_54 = reFunction( '转让人为:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*),', items) # 位置 WZ_59 = reFunction( '宗地坐落:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地用途 TDYT_68 = reFunction( '土地用途:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 面积 MJ_70 = reFunction( '宗地面积:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 出让年限 CRNX_73 = reFunction( '出让年限:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 备注 BZ_75 = reFunction( '备注:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s*二', items) # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) elif '地块基本情况' in items: try: if '备注' not in items: tdData = htmlTable.tableTrTdRegulationToList(table) for _ in range(len(list(tdData.values())[0])): # 宗地编号 ZDBH_50 = tdData.get('宗地编号')[_] if tdData.get( '宗地编号') else '' # 受让单位 SRDW_58 = tdData.get('受让单位')[_] if tdData.get( '受让单位') else '' # 受让人 SRR_56 = tdData.get('竞得人')[_] if tdData.get( '竞得人') else '' # 地块位置 DKWZ_60 = tdData.get('地块位置')[_] if tdData.get( '地块位置') else '' # 土地用途 TDYT_68 = tdData.get('土地用途')[_] if tdData.get( '土地用途') else '' # 成交价(万元) CJJ_63 = tdData.get('成交价(万元)')[_] if tdData.get( '成交价(万元)') else '' # 土地面积(公顷) TDMJ_71 = tdData.get('土地面积(亩)')[_] if tdData.get( '土地面积(亩)') else '' # 出让年限 CRNX_73 = tdData.get('出让年限')[_] if tdData.get( '出让年限') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist( md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ' ).replace('\n', '').replace( '\t', '' ).replace('\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) else: if '竞得人' not in items: for item in [ '宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items) [0].split('宗地编号')[1:] ]: # 宗地编号 ZDBH_50 = reFunction('编号\s*([\w\-]*)\s', item) # 受让单位 SRDW_58 = reFunction( '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 地块位置 DKWZ_60 = reFunction( '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 成交价(万元) CJJ_63 = reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) if reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) else reFunction( '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地用途 TDYT_68 = reFunction( '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地面积(公顷) TDMJ_71 = reFunction( '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 出让年限 CRNX_73 = reFunction( '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 备注 BZ_75 = reFunction( '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', item) if '二' in BZ_75: BZ_75 = '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist( md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ' ).replace('\n', '').replace( '\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) except Exception as e: if '竞得人' not in items: for item in [ '宗地编号' + _ for _ in re.findall( '一([\s\S]*)二、', items)[0].split('宗地编号')[1:] ]: # 宗地编号 ZDBH_50 = reFunction('编号\s*([\w\-]*)\s', item) # 受让单位 SRDW_58 = reFunction( '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 地块位置 DKWZ_60 = reFunction( '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 成交价(万元) CJJ_63 = reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) if reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) else reFunction( '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地用途 TDYT_68 = reFunction( '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地面积(公顷) TDMJ_71 = reFunction( '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 出让年限 CRNX_73 = reFunction( '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 备注 BZ_75 = reFunction( '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', item) if '二' in BZ_75: BZ_75 = '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist( md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ' ).replace('\n', '').replace( '\t', '' ).replace('\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' % response.url) except Exception as e: print(response.url) self.log( f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
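# Every storage branch above repeats the same steps: drop placeholder fields, strip
# commas/newlines/tabs/NBSPs from each value, join with commas and append one line to
# self.pathDetail. A minimal refactoring sketch of that shared step is shown below;
# clean_field and write_csv_row are hypothetical helper names, not part of the
# original spider, and the field order is assumed to follow the csvFile list.
import csv


def clean_field(value):
    # Mirror the spider's cleanup: empty or all-'|' placeholders become '', commas
    # become spaces, control characters and non-breaking spaces are removed.
    if not value or value == '|' * len(value):
        return ''
    for ch in (',', '\n', '\t', '\r', '\xa0'):
        value = value.replace(ch, ' ' if ch == ',' else '')
    return value


def write_csv_row(path, fields):
    # Append one cleaned row; csv.writer handles quoting that the manual join skips.
    with open(path, 'a+', newline='') as fp:
        csv.writer(fp).writerow([clean_field(f) for f in fields])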
def get_blog_list(response, total_page_response=None, ajax_html=None): """ 获取当页的微博文章 :param ajax_html: :param total_page_response: :param response: :return: """ if response: blog_html = get_dom_html(response, 'Pl_Official_MyProfileFeed_') try: sel = Selector(text=blog_html) except Exception as e: raise e blog_div = sel.xpath( '//div[contains(@class, "WB_feed WB_feed_v")]/div[@mid]') blog_user_id = get_page_conf_info(response, 'oid') else: blog_div = Selector(text=ajax_html).xpath('//body/div[@mid]') blog_user_id = get_page_conf_info(total_page_response, 'oid') for blog_ in blog_div: blog_item = BlogItem() blog_item['user_id'] = blog_user_id blog_item['praise_time'] = str(time.time()) mid = blog_.xpath('./@mid').extract_first() blog_item['mid'] = check_value(mid) # 如果是转发的,获得转发的信息 is_forward = blog_.xpath('./@isforward').extract_first() if is_forward and is_forward == '1': blog_item['is_forward'] = 'True' o_mid = blog_.xpath('./@omid').extract_first() blog_item['o_mid'] = o_mid m_info = blog_.xpath('./@minfo').extract_first() tb_info = blog_.xpath('./@tbinfo').extract_first() o_user_id = '' if m_info: m_info_d = m_info.strip().split('&') for x in m_info_d: info_key_value = x.strip().split('=') if info_key_value[0] == 'ru': o_user_id = info_key_value[-1] blog_item['o_user_id'] = o_user_id elif info_key_value[0] == 'rm': if not o_mid: blog_item['o_mid'] = info_key_value[-1] if tb_info: tb_info_d = tb_info.strip().split('&') for x in tb_info_d: tb_key_value = x.strip().split('=') if tb_key_value[0] == 'ouid': if not blog_user_id: blog_item['user_id'] = tb_key_value[-1] elif tb_key_value[0] == 'rouid': if not o_user_id: blog_item['o_user_id'] = tb_key_value[-1] # 获取转发的文章 forward_item = BlogItem() sub_div = blog_.xpath( './div/div[@class="WB_detail"]/div[@class="WB_feed_expand"]' '/div[@node-type="feed_list_forwardContent"]') is_empty = sub_div.xpath('./div[@class="WB_empty"]') # 转发是否已经被删除 if not is_empty: forward_item['is_forward'] = 'False' sub_info_div = sub_div.xpath( './div[@class="WB_info"]/a[contains(@class, "W_fb")]') sub_user_info = sub_info_div.xpath( './@usercard').extract_first() forward_item['user_id'] = '-1' if sub_user_info: x = sub_user_info.strip().split('&') for y in x: z = y.strip().split('=') if z[0] == 'id': forward_item['user_id'] = z[-1] sub_mid_info = sub_info_div.xpath( './@suda-uatrack').extract_first() forward_item['mid'] = '-1' if sub_mid_info: forward_item['mid'] = sub_mid_info.strip().split(':')[-1] # 转发的微博内容 sub_blog_info = sub_div.xpath( './div[@class="WB_text"]').extract_first() forward_item['praise_time'] = str(time.time()) sub_unflod_url, sub_info_dict = get_blog_content_info( sub_blog_info) forward_item['blog_info'] = sub_info_dict['text_list'] forward_item['at_url_list'] = sub_info_dict['at_url_list'] forward_item['at_list'] = sub_info_dict['at_text_list'] forward_item['topic_list'] = sub_info_dict['topic_list'] forward_item['topic_url_list'] = sub_info_dict[ 'topic_url_list'] forward_item['article_url'] = sub_info_dict['article_url_list'][0] \ if sub_info_dict['article_url_list'] else '' forward_item['picture_url'] = sub_info_dict['img_url_list'] # 获得转发的图片 sub_pic_div = sub_div.xpath( './div[@node-type="feed_list_media_prev"]' '//div[@class="media_box"]/ul//img/@src').extract() forward_item['picture_url'] += sub_pic_div forward_item['picture_url'] = turn_to_big_pic( forward_item['picture_url']) # 时间日期,来自 sub_foot_div = sub_div.xpath( './div[contains(@class, "WB_func")]') sub_from_div = sub_foot_div.xpath( './div[contains(@class, "WB_from")]/a') 
forward_item['date_time'] = check_value( sub_from_div[0].xpath('./@title').extract_first()) forward_item['data_from'] = check_value( sub_from_div[1].xpath('string(.)').extract_first( )) if len(sub_from_div) > 1 else '' forward_item['exact_time'] = check_value( sub_from_div[0].xpath('./@date').extract_first()) # 评论、转发、赞 forward_item['forward_num'] = -1 forward_item['prise_num'] = -1 forward_item['comment_num'] = -1 sub_mid = sub_foot_div.xpath( './div[@class="WB_handle W_fr"]/@mid').extract_first() if 'mid' not in forward_item.fields and not forward_item[ 'mid'].isdigit(): forward_item['mid'] = sub_mid sub_num_div = sub_foot_div.xpath( './div[@class="WB_handle W_fr"]//ul/li') for sub_div in sub_num_div: sub_type = sub_div.xpath( './span/a/span//em[@class]/@class').extract_first() sub_num = sub_div.xpath( './span/a/span//em[not(@class)]/text()').extract_first( ) if sub_type and 'ficon_forward' in sub_type: if '转发' in sub_num: forward_item['forward_num'] = '0' elif sub_num.strip().isdigit(): forward_item['forward_num'] = sub_num.strip() else: print('Parse sub forward_num error!! ' + sub_num) elif sub_type and 'ficon_repeat' in sub_type: if '评论' in sub_num: forward_item['comment_num'] = '0' elif sub_num.strip().isdigit(): forward_item['comment_num'] = sub_num.strip() else: print('Parse sub comment_num error!! ' + sub_num) elif sub_type and 'ficon_praised' in sub_type: if '赞' in sub_num: forward_item['prise_num'] = '0' elif sub_num.strip().isdigit(): forward_item['prise_num'] = sub_num.strip() else: print('Parse sub prise_num error!! ' + sub_num) yield sub_unflod_url, forward_item else: blog_item['is_forward'] = 'Forward delete' else: blog_item['is_forward'] = 'False' # head_img_url = blog_.xpath('./div[@node-type="feed_content"]/' # 'div[contains(@class, "WB_face")]//img/@src').extract_first() # blog_item['head_img_url'] = check_value(head_img_url) blog_info = blog_.xpath( './div[@node-type="feed_content"]/' 'div[@class="WB_detail"]/div[contains(@class, "WB_text")]' ).extract_first() # 获得日期时间和来源 date_div = blog_.xpath( './div[@node-type="feed_content"]/' 'div[@class="WB_detail"]/div[contains(@class, "WB_from")]/a') blog_item['date_time'] = check_value( date_div[0].xpath('./@title').extract_first()) blog_item['data_from'] = check_value(date_div[1].xpath('string(.)').extract_first()) \ if len(date_div) > 1 else '' blog_item['exact_time'] = check_value( date_div[0].xpath('./@date').extract_first()) unflod_url, info_dict = get_blog_content_info(blog_info) blog_item['blog_info'] = info_dict['text_list'] blog_item['at_url_list'] = info_dict['at_url_list'] blog_item['at_list'] = info_dict['at_text_list'] blog_item['topic_list'] = info_dict['topic_list'] blog_item['topic_url_list'] = info_dict['topic_url_list'] blog_item['article_url'] = info_dict['article_url_list'][ 0] if info_dict['article_url_list'] else '' blog_item['picture_url'] = info_dict['img_url_list'] media_div = blog_.xpath( './div[@node-type="feed_content"]/' 'div[@class="WB_detail"]//div[@class="media_box"]//img/@src' ).extract() if 'picture_url' in blog_item.fields: blog_item['picture_url'] += media_div else: blog_item['picture_url'] = media_div if blog_item['picture_url']: if response: blog_item['picture_url'] = urljoin_list( response, blog_item['picture_url']) else: blog_item['picture_url'] = urljoin_list( total_page_response, blog_item['picture_url']) blog_item['picture_url'] = turn_to_big_pic(blog_item['picture_url']) # 获取点赞、评论、转发数 blog_item['forward_num'] = -1 blog_item['prise_num'] = -1 blog_item['comment_num'] = -1 foot_div = 
blog_.xpath('./div//ul[contains(@class, "WB_row_line")]/li') for sub_div in foot_div: sub_type = sub_div.xpath( './a/span[@class="pos"]//em[@class]/@class').extract_first() sub_num = sub_div.xpath( './a/span[@class="pos"]//em[not(@class)]/text()' ).extract_first() if sub_type and 'ficon_forward' in sub_type: if '转发' in sub_num: blog_item['forward_num'] = '0' elif sub_num.strip().isdigit(): blog_item['forward_num'] = sub_num.strip() else: print('Paese forward_num error!! ' + sub_num) elif sub_type and 'ficon_repeat' in sub_type: if '评论' in sub_num: blog_item['comment_num'] = '0' elif sub_num.strip().isdigit(): blog_item['comment_num'] = sub_num.strip() else: print('Paese comment_num error!! ' + sub_num) elif sub_type and 'ficon_praised' in sub_type: if '赞' in sub_num: blog_item['prise_num'] = '0' elif sub_num.strip().isdigit(): blog_item['prise_num'] = sub_num.strip() else: print('Paese prise_num error!! ' + sub_num) yield unflod_url, blog_item
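# get_blog_list yields (unfold_url, item) pairs; unfold_url is set when a post is
# truncated and the full text must still be fetched from the "展开全文" ajax endpoint.
# A minimal consumption sketch for a spider callback is shown below; follow_blogs is
# a hypothetical helper name, not part of the original code.
import scrapy


def follow_blogs(response):
    for unfold_url, item in get_blog_list(response):
        if unfold_url:
            # Truncated post: request the full text first; a real callback would merge
            # the ajax result into the item carried in meta before yielding it.
            yield scrapy.Request(unfold_url, meta={'item': item})
        else:
            yield item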
            '', 'AgreeCount': AgreeCount, 'DisagreeCount': DisagreeCount,
            'AskCount': '', 'ParticipateCount': '', 'CollectionCount': '',
            'Classification': sele.xpath('//div[@class="crumb"]/a/text()').extract()[1],
            'Labels': sele.xpath('//meta[@name="keywords"]/@content').extract_first(),
            'Type': '', 'RewardCount': ''
        })
        yield item


if __name__ == '__main__':
    response = requests.get('http://www.cankaoxiaoxi.com/mil/20180814/2310386_2.shtml')
    soup = Selector(text=response.text)
    body = soup.xpath('//div[@id="ctrlfscont"]//p/text()').extract()
    print(body)
def get_blog_content_info(blog_content_html, is_unflod=False): sel = Selector(text=blog_content_html) # 微博文字信息 if is_unflod: blog_text_div = sel.xpath('//body/child::node()').extract() else: blog_text_div = sel.xpath( '//div[contains(@class, "WB_text")]/child::node()').extract() text_list = [] at_url_list = [] at_text_list = [] topic_list = [] topic_url_list = [] article_url_list = [] img_url_list = [] unfold_url = None for child_div in blog_text_div: content_sel = Selector(text=child_div) a_sel = content_sel.xpath('//a') img_sel = content_sel.xpath('//img') if a_sel: a_type = a_sel.xpath( './i/@class | ./span/i/@class').extract_first() # 转发的时候带着图片 if a_type and 'ficon_cd_img' in a_type: action_data = a_sel.xpath('./@action-data').extract_first() uid = '' mid = '' pid = '' short_url = '' if action_data: for x in action_data.split('&'): x_key = x.strip().split('=')[0] x_value = x.strip().split('=')[1] if x_key == 'uid': uid = x_value elif x_key == 'mid': mid = x_value elif x_key == 'pid': pid = x_value elif x_key == 'short_url': short_url = x_value if short_url: img_url_list.append(short_url) elif uid and mid and pid: img_url = 'http://photo.weibo.com/' \ + uid \ + '/wbphotos/large/mid/' \ + mid \ + '/pid/' \ + pid img_url_list.append(img_url) else: print('No img url' + str(a_sel.extract())) text_list.append( check_value(a_sel.xpath('string(.)').extract_first())) elif a_sel.xpath('./@extra-data') and a_sel.xpath( './@extra-data').extract_first() == 'type=atname': at_text = check_value(a_sel.xpath('string(.)').extract_first()) at_text_list.append(at_text) text_list.append(at_text) at_url_list.append(a_sel.xpath('./@href').extract_first()) elif a_sel.xpath('./@extra-data') and a_sel.xpath( './@extra-data').extract_first() == 'type=topic': topic_text = check_value( a_sel.xpath('string(.)').extract_first()) text_list.append(topic_text) topic_list.append(topic_text) topic_url_list.append(a_sel.xpath('./@href').extract_first()) elif a_sel.xpath('./@action-type') and a_sel.xpath( './@action-type').extract_first() == 'fl_unfold': # 获取展开全文的URL,这个URL只用于获取内容的ajax请求 fl_action_data = a_sel.xpath('./@action-data').extract_first() unfold_url = 'http://weibo.com/p/aj/mblog/getlongtext?ajwvr=6&' + fl_action_data elif content_sel.xpath('//a/img'): img_type = content_sel.xpath('//a/img/@type').extract_first() # 如果是表情 if img_type and img_type == 'face': title = img_sel.xpath('./@title').extract_first() src = img_sel.xpath('./@src').extract_first() text = gen_emjo_text(title, src) text_list.append(text) else: text_list.append( check_value( img_sel.xpath('string(.)').extract_first())) elif a_type: a_href = check_value(a_sel.xpath('./@href').extract_first()) if 'ficon_cd_longwb' in a_type: article_url_list.append(a_href) part_text = gen_a_text(a_type, a_href) text_list.append( check_value(a_sel.xpath('string(.)').extract_first()) + part_text) else: print('blogs has more type!! 
' + str(a_sel.extract()) + ' \n' + blog_content_html) elif img_sel: img_type = img_sel.xpath('./@type').extract_first() # 如果是表情 if img_type and img_type == 'face': title = img_sel.xpath('./@title').extract_first() src = img_sel.xpath('./@src').extract_first() text = gen_emjo_text(title, src) text_list.append(text) else: text_list.append( check_value(img_sel.xpath('string(.)').extract_first())) else: text_list.append( check_value(content_sel.xpath('string(.)').extract_first())) return unfold_url, { 'text_list': ''.join(text_list), 'at_url_list': at_url_list, 'at_text_list': at_text_list, 'topic_list': topic_list, 'topic_url_list': topic_url_list, 'article_url_list': article_url_list, 'img_url_list': img_url_list }
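# A small usage sketch for get_blog_content_info: pass it one WB_text fragment and it
# returns the "展开全文" ajax URL (None if absent) plus the parsed text/at/topic/image
# lists. The sample HTML is illustrative only, and the helpers used inside
# (check_value, gen_emjo_text, gen_a_text) must be importable as in the original module.
if __name__ == '__main__':
    sample = ('<div class="WB_text">今天天气不错 '
              '<a extra-data="type=topic" href="/k/weather">#天气#</a> '
              '<a extra-data="type=atname" href="/u/123">@somebody</a></div>')
    unfold_url, info = get_blog_content_info(sample)
    print(unfold_url)                      # None: the sample has no "展开全文" link
    print(info['text_list'])               # concatenated plain text
    print(info['at_text_list'], info['topic_list'])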
def selector(self):
    # Cache the parsed Selector on the instance so repeated calls do not re-parse
    # the numbered HTML.
    if hasattr(self, '_selector'):
        return self._selector
    self._selector = Selector(text=self.numbered_html)
    return self._selector
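# The hand-rolled attribute caching above is what functools.cached_property (Python
# 3.8+) provides. A minimal equivalent sketch, assuming numbered_html is set on the
# instance; NumberedPage is a hypothetical class name used only for illustration.
from functools import cached_property

from scrapy.selector import Selector


class NumberedPage:
    def __init__(self, numbered_html):
        self.numbered_html = numbered_html

    @cached_property
    def selector(self):
        # Parsed once on first access, then stored on the instance like _selector above.
        return Selector(text=self.numbered_html)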
def _get_comment_info(response, blog_id, html_=None, parent_comment_id='0'): comment_div = None if parent_comment_id == '0': json_data = response.text try: json_obj = json.loads(json_data) html_ = json_obj['data']['html'] sel = Selector(text=html_) comment_div = sel.xpath( '//div[@class="list_box"]/div[@class="list_ul"]/div[@node-type="root_comment"]' ) except: print('Parse comment json error!! ') elif html_: sel = Selector(text=html_) comment_div = sel.xpath('//div[@comment_id]') else: raise ValueError('None param html_') if comment_div: for comment_info in comment_div: # 获得根comment的信息 root_comment = CommentItem() root_comment['parent_comment_id'] = str(parent_comment_id) root_comment['blog_id'] = blog_id root_comment['parse_time'] = str(time.time()) comment_id = comment_info.xpath('./@comment_id').extract_first() root_comment['comment_id'] = check_value(comment_id) user_info_div = comment_info.xpath( './div[@class="list_con"]/div[@class="WB_text"]/a[1]') nick_name = user_info_div.xpath('string(.)').extract_first() root_comment['comment_user_nick'] = check_value(nick_name) user_id = user_info_div.xpath('./@usercard').extract_first() id_str = check_value(user_id).split('=') root_comment['comment_user_id'] = id_str[1] if len( id_str) > 1 else '' user_url = user_info_div.xpath('./@href').extract_first() root_comment['comment_user_page'] = response.urljoin(user_url) date_time_div = comment_info.xpath( './div[@class="list_con"]/div[contains(@class, "WB_func")]') date_time = date_time_div.xpath( './div[contains(@class, "WB_from")]').xpath( 'string(.)').extract_first() root_comment['comment_date_time'] = check_value(date_time) praise_num = date_time_div.xpath( './div[contains(@class, "WB_handle")]' '/ul//span[@node-type="like_status"]/em[not(@class)]/text()' ).extract_first() if isinstance(praise_num, int) or praise_num.isdigit(): root_comment['praise_num'] = str(praise_num) elif '赞' in praise_num: root_comment['praise_num'] = '0' else: root_comment['praise_num'] = '-1' comment_content_div = comment_info.xpath( './div[@class="list_con"]/div[@class="WB_text"]' ).extract_first() info_dic = get_comment_content(comment_content_div) root_comment['content'] = info_dic['text_list'] root_comment['at_url_list'] = info_dic['at_url_list'] root_comment['at_name_list'] = info_dic['at_text_list'] root_comment['topic_url_list'] = info_dic['topic_url_list'] root_comment['topic_text_list'] = info_dic['topic_url_list'] root_comment['img_url_list'] = info_dic['img_url_list'] # 获得更多回复的链接 more_replay = None if parent_comment_id == '0': more_replay = comment_info.xpath('./div[@node-type="replywrap"]' '//a[@action-type="click_more_child_comment_big"]/@action-data') \ .extract_first() # 是否有子评论 child_div = comment_info.xpath( './div[@class="list_con"]/div[contains(@class, "list_box_in")]' '/div[@node-type="child_comment"]').extract_first() root_comment['child_comment_ids'] = [] if child_div and parent_comment_id == '0': # and not more_replay: for child_comment in get_child_comment(response, blog_id, child_div, comment_id): root_comment['child_comment_ids'].append( child_comment['comment_id']) yield None, child_comment yield more_replay, root_comment
def parse_pages(self, response): """ 对搜索页中的每条微博信息进行抽取, 如果微博内容中有显示完全有“展开全文”按钮则继续返回一个微博全文的请求, 否则返回item :param response: :return: """ page_json = json.loads(response.body.decode('utf-8')) card_group = [] try: card_group = page_json['data']['cards'][-1]['card_group'] except IndexError as e: if response.meta.get('retry', 0) == 10: logger.error( '账号:[%s],第 %s 页解析微博列表json出错,已重试10次,放弃重试!错误原因:%s,返回信息:%s', response.meta['account'], response.meta['index'], e, page_json) return None else: logger.warning( '账号:[%s],第 %s 页解析微博列表json出错,将重试第 %s 次,错误原因:%s,返回信息:%s', response.meta['account'], response.meta['index'], response.meta.get('retry', 0) + 1, e, page_json) yield scrapy.Request(url=response.url, callback=self.parse_pages, dont_filter=True, meta={ 'index': response.meta['index'], 'retry': response.meta.get('retry', 0) + 1 }) return None for i in card_group: item = WeiboItem() item['weibo_mid'] = int(i['mblog']['mid']) item['user_nick_name'] = i['mblog']['user']['screen_name'] item['user_home_url'] = i['mblog']['user']['profile_url'].split( '?')[0] text = i['mblog']['text'] text_s = Selector(text=text, type='html') item['content'] = text_s.xpath( 'normalize-space(string(.))').extract_first('') item['time'] = self.format_date(i['mblog']['created_at']) item['forwarded_count'] = i['mblog']['reposts_count'] item['comment_count'] = i['mblog']['comments_count'] item['like_count'] = i['mblog']['attitudes_count'] item['weibo_url'] = i['scheme'].split('?')[0] if len(text_s.xpath( '//a[text()="全文"]')) != 0: # 有展开全文按钮,构造全文请求,获取全文 yield scrapy.Request(url=WeiboSpider.FULL_CONTENT_URL % item['weibo_mid'], callback=self.parse_full_content, dont_filter=True, meta={'item': item}) else: yield item # 构造返回该微博评论request comment_url = WeiboSpider.COMMENT_URL % (item['weibo_mid'], 1) yield scrapy.Request( url=comment_url, callback=self.parse_comment, dont_filter=True, meta={ 'mid': item['weibo_mid'], # 微博mid 'index': 1, 'count': 0 }) logger.info('成功获取第%s页的微博信息', response.meta['index'])
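# The "has full text" check in parse_pages reduces to: parse the mblog text fragment
# and look for an <a> whose text is "全文". A standalone sketch of just that step
# (needs_full_content is a hypothetical name):
from scrapy.selector import Selector


def needs_full_content(text_html):
    sel = Selector(text=text_html, type='html')
    return len(sel.xpath('//a[text()="全文"]')) != 0


# needs_full_content('截断的内容...<a>全文</a>')  -> True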
def get_comment_content(comment_div): sel = Selector(text=comment_div) # 微博文字信息 blog_text_div = sel.xpath('//body/div/child::node()').extract() text_list = [] at_url_list = [] at_text_list = [] topic_list = [] topic_url_list = [] img_url_list = [] for child_div in blog_text_div: content_sel = Selector(text=child_div) a_sel = content_sel.xpath('//a') img_sel = content_sel.xpath('//img') if a_sel: a_type = a_sel.xpath( './i/@class | ./span/i/@class').extract_first() # 转发的时候带着图片 if a_type and 'ficon_cd_img' in a_type: action_data = a_sel.xpath('./@action-data').extract_first() uid = '' mid = '' pid = '' short_url = '' if action_data: for x in action_data.split('&'): x_key = x.strip().split('=')[0] x_value = x.strip().split('=')[1] if x_key == 'uid': uid = x_value elif x_key == 'mid': mid = x_value elif x_key == 'pid': pid = x_value elif x_key == 'short_url': short_url = x_value if short_url: img_url_list.append(short_url) elif uid and mid and pid: img_url = 'http://photo.weibo.com/' \ + uid \ + '/wbphotos/large/mid/' \ + mid \ + '/pid/' \ + pid img_url_list.append(img_url) else: print('No img url' + str(a_sel.extract())) text_list.append( check_value(a_sel.xpath('string(.)').extract_first())) elif a_sel.xpath('./@extra-data') and a_sel.xpath( './@extra-data').extract_first() == 'type=atname': at_text = check_value(a_sel.xpath('string(.)').extract_first()) at_text_list.append(at_text) text_list.append(at_text) at_url_list.append(a_sel.xpath('./@href').extract_first()) elif a_sel.xpath('./@extra-data') and a_sel.xpath( './@extra-data').extract_first() == 'type=topic': topic_text = check_value( a_sel.xpath('string(.)').extract_first()) text_list.append(topic_text) topic_list.append(topic_text) topic_url_list.append(a_sel.xpath('./@href').extract_first()) elif content_sel.xpath('//a/img'): img_type = content_sel.xpath('//a/img/@type').extract_first() # 如果是表情 if img_type and img_type == 'face': title = img_sel.xpath('./@title').extract_first() src = img_sel.xpath('./@src').extract_first() text = gen_emjo_text(title, src) text_list.append(text) else: text_list.append( check_value( img_sel.xpath('string(.)').extract_first())) # else: # print('blogs has more type!! ' + str(a_sel.extract()) + ' \n' + comment_div) elif img_sel: img_type = img_sel.xpath('./@type').extract_first() # 如果是表情 if img_type and img_type == 'face': title = img_sel.xpath('./@title').extract_first() src = img_sel.xpath('./@src').extract_first() text = gen_emjo_text(title, src) text_list.append(text) else: text_list.append( check_value(img_sel.xpath('string(.)').extract_first())) else: text_list.append( check_value(content_sel.xpath('string(.)').extract_first())) return { 'text_list': ''.join(text_list), 'at_url_list': at_url_list, 'at_text_list': at_text_list, 'topic_list': topic_list, 'topic_url_list': topic_url_list, 'img_url_list': img_url_list }
def parse_detail(self, response): # TODO 主动关闭爬虫问题 try: data = Selector(text=response.body.decode('utf-8')) items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '') WJBT_27 = '' SJ_28 = '' LY_29 = '' WJBT_30 = '' ZDBH_31 = '' BH_32 = '' DKWZ_33 = '' TDWZ_34 = '' TDMJM_35 = '' TDMJPFM_36 = '' TDYT_37 = '' CJJ_38 = '' JDR_39 = '' GSQ_40 = '' LXDW_41 = '' DWDZ_42 = '' YZBM_43 = '' LXDH_44 = '' # TODO 共有字段 reFunction(f'时间:\s*([{self.reStr}]*)\s', LY) # 文件标题 WJBT_27 = response.meta.get('title') # 时间 SJ_28 = data.xpath('//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()').extract_first() # 来源 LY_29 = data.xpath('//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()').extract_first() # 文件编号 WJBT_30 = data.xpath('//div[@class="ztzx_frame_content"]/div[1]/text()').extract_first() # 公示期 GSQ_40 = reFunction(f'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)。', items) # 联系单位 LXDW_41 = reFunction('联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 单位地址 DWDZ_42 = reFunction('单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 邮政编码 YZBM_43 = reFunction('邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 联系电话 LXDH_44 = reFunction('联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 爬取时间 crawlingTime = time.strftime("%Y-%m-%d", time.localtime()) # 爬取地址url url = response.url # 唯一标识 md5Mark = encrypt_md5(url + WJBT_27 + SJ_28) soup = BeautifulSoup(response.body.decode('utf-8').replace('thead', 'tbody')) table = soup.find('table') htmlTable = htmlTableTransformer() if table: if '竣工时间' in items: try: tdData = htmlTable.tableTrTdUNregulationToList(table) for _ in range(len(list(tdData.values())[0])): # 宗地编号 ZDBH_31 = tdData.get('地块编号')[_] if tdData.get('地块编号') else '' # 地块位置 DKWZ_33 = tdData.get('位置')[_] if tdData.get('位置') else '' # 土地位置 TDWZ_34 = tdData.get('位置')[_] if tdData.get('位置') else '' # 土地面积(亩) TDMJM_35 = tdData.get('出让面积平方米/亩')[_] if tdData.get('出让面积平方米/亩') else '' # 土地面积(平方米) TDMJPFM_36 = tdData.get(list(tdData.keys())[7])[_] if tdData.get(list(tdData.keys())[7]) else '' # 土地用途 TDYT_37 = tdData.get('用途')[_] if tdData.get('用途') else '' # 成交价(万元) CJJ_38 = tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else '' # 竞得人 JDR_39 = tdData.get('受让人')[_] if tdData.get('受让人') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if TDYT_37: # 重复效验通过, 存储数据 csvFile = [ WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33, TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39, GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\r', '').replace( r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield except: for tdData in table.find_all('tr')[2:]: # 宗地编号 ZDBH_31 = tdData.find_all('td')[4].string.strip() # 地块位置 DKWZ_33 = tdData.find_all('td')[5].string.strip() # 土地位置 TDWZ_34 = tdData.find_all('td')[5].string.strip() # 土地面积(亩) TDMJM_35 = tdData.find_all('td')[6].string.strip() # 土地面积(平方米) TDMJPFM_36 = tdData.find_all('td')[7].string.strip() # 土地用途 TDYT_37 = tdData.find_all('td')[8].string.strip() # 
成交价(万元) CJJ_38 = tdData.find_all('td')[9].string.strip() # 竞得人 JDR_39 = tdData.find_all('td')[3].string.strip() # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if TDYT_37: # 重复效验通过, 存储数据 csvFile = [ WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33, TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39, GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\r', '').replace( r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield elif '转让方' not in items: if len(table.find_all('tr')[1].find_all('td')) < 5: table.find_all('tr')[1].extract() table.find_all('tr')[0].find_all('td')[-1].extract() tdData = htmlTable.tableTrTdRegulationToList(table) for _ in range(len(list(tdData.values())[0])): # 宗地编号 ZDBH_31 = tdData.get('宗地编号')[_] if tdData.get('宗地编号') else '' # 编号 BH_32 = tdData.get('编号')[_] if tdData.get('编号') else '' # 地块位置 DKWZ_33 = tdData.get('地块位置')[_] if tdData.get('地块位置') else '' # 土地位置 TDWZ_34 = tdData.get('土地位置')[_] if tdData.get('土地位置') else '' # 土地面积(亩) TDMJM_35 = tdData.get('土地面积(亩)')[_] if tdData.get('土地面积(亩)') else '' # 土地面积(平方米) TDMJPFM_36 = tdData.get('土地面积(平方米)')[_] if tdData.get('土地面积(平方米)') else '' # 土地用途 TDYT_37 = tdData.get('土地用途')[_] if tdData.get('土地用途') else '' # 成交价(万元) CJJ_38 = tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else '' # 竞得人 JDR_39 = tdData.get('竞得人')[_] if tdData.get('竞得人') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if TDYT_37: # 重复效验通过, 存储数据 csvFile = [ WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33, TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39, GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\r', '').replace(r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield elif '地块基本情况' in items: # 宗地编号 ZDBH_31 = reFunction('宗地编号\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 地块位置 DKWZ_33 = reFunction('地块位置\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地面积(亩) TDMJM_35 = reFunction('土地面积\(公顷\)\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地用途 TDYT_37 = reFunction('土地用途\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 成交价(万元) CJJ_38 = reFunction('成交价\(万元\)\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 竞得人 JDR_39 = reFunction('受让单位\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if TDYT_37: # 重复效验通过, 存储数据 csvFile = [ WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33, 
TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39, GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\r', '').replace( r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url) elif '转让方' in items: # 编号 BH_32 = reFunction('不动产权登记证号:([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 地块位置 DKWZ_33 = reFunction('宗地位置:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地面积(平方米) TDMJPFM_36 = reFunction('面\s*积:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地用途 TDYT_37 = reFunction('土地用途:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 成交价(万元) # CJJ_38 # 竞得人 JDR_39 = reFunction('受让方:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if TDYT_37: # 重复效验通过, 存储数据 csvFile = [ WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33, TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39, GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace('\r', '').replace( r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url) except Exception as e: print(response.url) self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
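# md5Mark above is produced by encrypt_md5(url + WJBT_27 + SJ_28), a helper defined
# elsewhere in the project. A plain-hashlib sketch of such a de-duplication
# fingerprint (fingerprint is a hypothetical name, assumed equivalent only):
import hashlib


def fingerprint(*parts):
    return hashlib.md5(''.join(p or '' for p in parts).encode('utf-8')).hexdigest()


# e.g. fingerprint(url, WJBT_27, SJ_28) -> 32-character hex string checked in Redis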
def get_element(path, tree):
    # Parse the raw HTML once, then evaluate the XPath expression against it.
    sel = Selector(text=tree)
    return sel.xpath(path).extract()
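# A quick usage sketch for get_element: the XPath expression comes first, the raw HTML
# string second (the sample HTML is illustrative only).
# get_element('//title/text()', '<html><head><title>demo</title></head></html>')
# -> ['demo']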
def port_sample(sample, schemas=None, extractors=None): """Convert slybot samples made before slybot 0.13 to new format.""" if schemas is None: schemas = {} if extractors is None: extractors = {} container_id = gen_predictable_id(sample.get('id', 1), sample['page_id']) default_annotations = [_create_container('body', container_id)] if not sample.get('annotated_body') and not sample.get('plugins'): sample['plugins'] = { 'annotations-plugin': { 'extracts': default_annotations } } return sample if not sample.get('plugins'): sample['plugins'] = load_annotations(sample.get('annotated_body', u'')) else: repair_ids(sample) sample.pop('annotated_body', None) # Group annotations by type annotations = sample['plugins']['annotations-plugin']['extracts'] try: sel = Selector(text=add_tagids(sample['original_body'])) except KeyError: annotated = sample.get('annotated_body', u'') sample['original_body'] = annotated try: tagged = add_tagids(annotated) except KeyError: tagged = u'' sel = Selector(text=tagged) annotations = port_standard(annotations, sel, sample) standard_annos, generated_annos, variant_annos = [], [], [] for a in annotations: if a.get('generated'): generated_annos.append(a) elif a.get('variants', 0) > 0: variant_annos.append(a) else: standard_annos.append(a) if not annotations: sample['plugins'] = { 'annotations-plugin': { 'extracts': default_annotations } } return sample new_annotations = [] a = find_element(annotations[0], sel) for b in annotations[1:]: b = find_element(b, sel) a = find_common_parent(a, b) parent = a.getparent() container = _create_container( a if parent is None else parent, container_id, selector=sel) new_annotations.append(container) for a in standard_annos: a.pop('variant', None) new_annotations.extend(standard_annos) new_annotations.extend(port_generated(generated_annos, sel)) new_annotations.extend(port_variants(variant_annos, sel)) for a in new_annotations: if not (a.get('item_container') and a.get('container_id')): a['container_id'] = container_id a.pop('tagid', None) or a.pop('data-tagid', None) # Update annotations sample['plugins']['annotations-plugin']['extracts'] = new_annotations sample['version'] = SLYBOT_VERSION schema_id, schemas = guess_schema(sample, schemas) container['schema_id'] = schema_id return sample, schemas
def parseArticle(self, response): body = EncodeUtil.toUnicode(response.body) if False: self.logDao.info(u'访问过多被禁止') else: src_channel = response.meta['src_channel'] sub_channel = response.meta['sub_channel'] title = response.meta['title'] post_user = response.meta['post_user'] tags = response.meta['tags'] post_date = response.meta['post_date'] source_url = response.meta['source_url'] self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url) selector = Selector(text=body) # 得到样式 styleUrls = selector.xpath( '//link[@rel="stylesheet"]/@href').extract() styleList = [] for styleUrl in styleUrls: if styleUrl.startswith('//'): styleUrl = 'http:' + styleUrl # 得到hash作为key styleUrlHash = EncryptUtil.md5(styleUrl) if not self.css.get(styleUrlHash): # 不存在则去下载 并保存 self.css[styleUrlHash] = CssUtil.downLoad(styleUrl) styleList.append(self.css[styleUrlHash]) styles = CssUtil.compressCss(styleList).replace('\'', '"').replace( '\\', '\\\\') # 替换样式里面的链接 styles = CssUtil.clearUrl(styles) content_html = selector.xpath('//*[@class="article"]') backHtml = selector.xpath('//*[@id="backsohucom"]').extract_first( '') if not len(content_html): self.logDao.info(u'不存在内容:' + source_url) return # 去除内部不需要的标签u'<p data-role="editor-name">责任编辑:<span></span></p>' # 完整案例:content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract() content_items = content_html.xpath( '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") and not(boolean(@data-role="editor-name"))]' ) if not len(content_items): self.logDao.info(u'不存在内容:' + source_url) return # 得到纯文本 content_txt = [] for item in content_items: # 文本 allTxt = item.xpath('.//text()').extract() allTxt = ''.join(allTxt).replace('\t', '') # 加入 content_txt.append(allTxt) content_txt = '\n'.join(content_txt) # 组装新的内容标签 outHtml = """<div class="article-page"><article class="article">${++content++}</article></div>""" content_items = content_items.extract() content_items = ''.join(content_items) content_html = outHtml.replace('${++content++}', content_items) content_html = content_html.replace(backHtml, '') selector = Selector(text=content_html) # 解析文档中的所有图片url,然后替换成标识 image_urls = [] imgs = selector.xpath('descendant::img') for img in imgs: # 图片可能放在src 或者data-src image_url_base = img.xpath('@src').extract_first('') if image_url_base.startswith('//'): image_url = 'http:' + image_url_base else: image_url = image_url_base if image_url and image_url.startswith('http'): self.logDao.info(u'得到图片:' + image_url) image_urls.append({ 'url': image_url, }) content_html = content_html.replace( image_url_base, image_url) urlHash = EncryptUtil.md5(source_url.encode('utf8')) self.saveFile(urlHash, body) # 得到hashCode hash_code = self.checkDao.getHashCode(source_url) # 去除 image 的 alt title selector = Selector(text=content_html) imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract() # 处理提示块img的 alt title, 关注//img/@alt|//img/@title for imgAltTitle in imgAltTitles: if imgAltTitle.strip(' '): content_html = content_html.replace(imgAltTitle, '') contentItem = ContentItem() contentItem['content_txt'] = content_txt contentItem['image_urls'] = image_urls contentItem['title'] = title contentItem['source_url'] = source_url contentItem['post_date'] = post_date contentItem['sub_channel'] = sub_channel contentItem['post_user'] = post_user contentItem['tags'] = ','.join(tags) contentItem['styles'] = styles contentItem['content_html'] = content_html contentItem['hash_code'] = hash_code contentItem['info_type'] = 1 
            contentItem['src_source_id'] = 7
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = '搜狐科技'
            return contentItem
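# parseArticle prefixes protocol-relative stylesheet and image URLs ("//host/a.css")
# with "http:". A small helper sketch for that recurring step; normalize_url is a
# hypothetical name, not part of the original spider.
def normalize_url(url, scheme='http'):
    if url.startswith('//'):
        return scheme + ':' + url
    return url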
def parse_neighborhood_info(self, response): basic_info = response.text city_name = response.meta['city_name'] block_name = '>'.join(Selector(text=basic_info).xpath('//div[@class="xiaoquDetailbreadCrumbs"]/div[@class="fl l-txt"]/a/text()').extract()) neighborhood_name='' if len(Selector(text=basic_info).xpath('//div[@class="xiaoquDetailHeader"]/div[@class="xiaoquDetailHeaderContent clear"]/div[@class="detailHeader fl"]/h1[@class="detailTitle"]/text()').extract())>0: neighborhood_name = Selector(text=basic_info).xpath('//div[@class="xiaoquDetailHeader"]/div[@class="xiaoquDetailHeaderContent clear"]/div[@class="detailHeader fl"]/h1[@class="detailTitle"]/text()').extract()[0] neighborhood_addr = Selector(text=basic_info).xpath( '//div[@class="xiaoquDetailHeader"]/div[@class="xiaoquDetailHeaderContent clear"]/div[@class="detailHeader fl"]/div[@class="detailDesc"]/text()').extract()[0] neighborhood_price = '' if len(Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquPrice clear"]//span[@class="xiaoquUnitPrice"]/text()').extract())>0: neighborhood_price = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquPrice clear"]//span[@class="xiaoquUnitPrice"]/text()').extract()[0] neighborhood_year = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][1]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_type = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][2]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_estate = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][3]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_property = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][4]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_company = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][5]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_builds = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][6]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_houses = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][7]/span[@class="xiaoquInfoContent"]/text()').extract()[0] item = LianjiaLoaderItem(item=LianjiaResultItem(), response=response) item.add_value('batch_date', self.batch_date) item.add_value('city_name', city_name) item.add_value('block_name', block_name) item.add_value('neighborhood_name', neighborhood_name) item.add_value('neighborhood_addr', neighborhood_addr) item.add_value('neighborhood_price', neighborhood_price) item.add_value('neighborhood_year', neighborhood_year) item.add_value('neighborhood_type', neighborhood_type) item.add_value('neighborhood_estate', neighborhood_estate) item.add_value('neighborhood_property', neighborhood_property) 
        item.add_value('neighborhood_company', neighborhood_company)
        item.add_value('neighborhood_builds', neighborhood_builds)
        item.add_value('neighborhood_houses', neighborhood_houses)
        item.add_value('table_name', 'spider.lianjia_result')
        yield item.load_item()
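# parse_neighborhood_info repeats one long XPath where only the xiaoquInfoItem index
# changes. A hypothetical helper that factors out the repetition (xiaoqu_info is an
# assumed name; the XPath is copied from the handler above):
def xiaoqu_info(sel, index):
    return sel.xpath(
        '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]'
        '/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][%d]'
        '/span[@class="xiaoquInfoContent"]/text()' % index
    ).extract_first('')


# e.g. neighborhood_year = xiaoqu_info(Selector(text=basic_info), 1)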
def spider(url, place): text = requests.get(url).content.decode("utf-8") sel = Selector(text=text) date_all = sel.xpath('//div[@id="forecast"]/div[@class="detail"]') day_id = 0 for date_item in date_all: if day_id <= 2: date_i = "".join( date_item.xpath( './/div[@class="today"]/table/tbody/tr[1]/td[2]/text()'). extract()).strip() else: date_i = "".join( date_item.xpath( './/div[@class="today"]/table/tbody/tr[1]/td[1]/text()'). extract()).strip() time_all = sel.xpath('//div[@id="hour3"]/div') time_item = time_all[day_id] for item in range(0, 8): time_i = "".join( (time_item.xpath('.//div[@class="row first"]/div/text()') )[item + 1].extract()).strip() temperature = "".join( (time_item.xpath('.//div[@class="row wd"]/div/text()') )[item + 1].extract()).strip() humidity = "".join( (time_item.xpath('.//div[@class="row xdsd"]/div/text()') )[item + 1].extract()).strip() if place == 'shenzhen': existed_data = Temperature.select().where(( Temperature.date == date_i) & (Temperature.time == time_i)) if existed_data: temperature_data = existed_data[0] else: temperature_data = Temperature() elif place == 'guangzhou': existed_data = Temperature2.select().where( (Temperature2.date == date_i) & (Temperature2.time == time_i)) if existed_data: temperature_data = existed_data[0] else: temperature_data = Temperature2() elif place == 'foshan': existed_data = Temperature3.select().where( (Temperature3.date == date_i) & (Temperature3.time == time_i)) if existed_data: temperature_data = existed_data[0] else: temperature_data = Temperature3() elif place == 'dongguan': existed_data = Temperature4.select().where( (Temperature4.date == date_i) & (Temperature4.time == time_i)) if existed_data: temperature_data = existed_data[0] else: temperature_data = Temperature4() temperature_data.date = date_i temperature_data.time = time_i temperature_data.temperature = temperature temperature_data.humidity = humidity temperature_data.save() day_id += 1
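# The four peewee branches in spider() differ only in the model class. A minimal
# refactoring sketch, assuming Temperature..Temperature4 are the same peewee models
# used above and share the date/time fields (PLACE_MODELS and get_or_create_record
# are hypothetical names):
PLACE_MODELS = {
    'shenzhen': Temperature,
    'guangzhou': Temperature2,
    'foshan': Temperature3,
    'dongguan': Temperature4,
}


def get_or_create_record(place, date_i, time_i):
    model = PLACE_MODELS[place]  # raises KeyError instead of silently skipping a place
    existed = model.select().where((model.date == date_i) & (model.time == time_i))
    return existed[0] if existed else model()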
def title_parse(self, response): selector = Selector(response) itemList = selector.xpath( "//div[@id = 'subcontent']/dl[contains(@class,'list_dl') and not(contains(@class,'bluebg'))]" ) for item in itemList: try: autohomeforumItem = AutohomeforumItem() autohomeforumItem['carId'] = re.findall( '-(\d+)-', response.url)[0] autohomeforumItem['iconName'] = item.xpath( "./dt/span/@class").extract_first() autohomeforumItem['title'] = item.xpath( "./dt/a[1]/text()").extract_first() autohomeforumItem['author'] = item.xpath( "./dd[1]/a/text()").extract_first() autohomeforumItem['authorId'] = item.xpath( "./dd[1]/a/@href").extract_first().split('/')[-1] autohomeforumItem['publishTime'] = item.xpath( "./dd[1]/span/text()").extract_first() autohomeforumItem['replyNum'] = item.xpath( "./dd[2]/span[1]/text()").extract_first() autohomeforumItem['clickNum'] = item.xpath( "./dd[2]/span[2]/text()").extract_first() autohomeforumItem['lastReplyer'] = item.xpath( "./dd[3]/a/text()").extract_first() autohomeforumItem['lastReplyTime'] = item.xpath( "./dd[3]/span/text()").extract_first() detialUrl = item.xpath("./dt/a[1]/@href").extract_first() autohomeforumItem['itemId'] = re.findall('/(\d+)-', detialUrl)[0] autohomeforumItem['url'] = self.baseUrl + detialUrl autohomeforumItem['contents'] = list() if self.isSavedInMongodb({ 'carId': autohomeforumItem['carId'], 'itemId': autohomeforumItem['itemId'] }) > 0: logging.warning('{carId}的{itemId}已经保存'.format( carId=autohomeforumItem['carId'], itemId=autohomeforumItem['itemId'])) continue yield SplashRequest( url=self.baseUrl + detialUrl, callback=self.detial_parse, args={ 'wait': 1, 'timeout': 60, 'images': 0 }, meta={'autohomeforumItem': autohomeforumItem}) except Exception as e: print(e) maxNumText = selector.xpath( "//span[@class='fr']/text()").extract_first() try: maxNum = re.findall("(\d+)", maxNumText)[0] except Exception as e: print(e) currentPageNum = selector.xpath( "//span[@class='cur']/text()").extract_first() if int(currentPageNum) < response.meta['page']: nextUrl = re.sub('\d+.html', str(int(currentPageNum) + 1) + '.html', response.url) yield SplashRequest(url=nextUrl, callback=self.title_parse, args={ 'wait': 1, 'timeout': 60, 'images': 0 }, meta=response.meta)
def duck_selector(self, response): base_url = "https://duckduckgo.com/" snippets = response \ .xpath("//div[@class='result results_links results_links_deep web-result ']") \ .extract() itemproc = self.crawler.engine.scraper.itemproc id_person = response.meta['id_person'] base_attr = response.meta['attr'] search = response.meta['search'] num_snippet = response.meta['num_snip'] for snippet in snippets: storage_item = UsmItem() num_snippet = num_snippet + 1 title = Selector(text=snippet).xpath("//div/h2/a/node()").extract() cite = Selector(text=snippet).xpath("//div/a/@href").extract() text = Selector(text=snippet).xpath("//div/a[@class='result__snippet']/node()").extract() if title.__len__() > 0: tmp = "" for text in title: for r in ["<b>", "</b>"]: text = text.replace(r, '') tmp = tmp + text title = tmp else: title = "" if cite.__len__() > 0: cite = cite[0] else: cite = "" if text.__len__() > 0: tmp = "" for txt in title: for r in ["<b>", "</b>"]: txt = txt.replace(r, '') tmp = tmp + txt text = tmp else: text = "" if cite != "": self.log("---------------------------------") self.log("------------TITLE----------------") self.log(title) self.log("------------CITE-----------------") self.log(cite) self.log("------------TEXT-----------------") self.log(text) self.log("-----------ID PERSON-----------------") self.log(id_person) self.log("-----------SEARCH----------------") self.log(search) self.log("--------------ATTR---------------") self.log(base_attr) self.log("-----------ENGINE SEARCH---------") self.log(self.browser) self.log("------------NUMBER SNIPPET-------") self.log(num_snippet) storage_item['title'] = title storage_item['cite'] = cite storage_item['text'] = text storage_item['id_person'] = id_person storage_item['search'] = search storage_item['attr'] = base_attr storage_item['engine_search'] = self.browser storage_item['number_snippet'] = num_snippet itemproc.process_item(storage_item, self)
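# In duck_selector the cleaning loops reuse the `text` name: "for text in title"
# overwrites the extracted snippet body, and the body loop then iterates over `title`
# (by then a plain string), so the stored text ends up wrong. A hedged sketch of the
# intended cleanup (strip_bold is a hypothetical name):
def strip_bold(nodes):
    # Join the extracted nodes and drop the <b> markers DuckDuckGo uses for highlighting.
    joined = ''.join(nodes)
    return joined.replace('<b>', '').replace('</b>', '')


# title = strip_bold(title_nodes); text = strip_bold(text_nodes); cite stays as-is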