def parse_detail(self, response):
    """Parse a land-supply result page and append one CSV row per parcel.

    The GBK-encoded page is flattened to plain text.  Page-level fields
    (titles, publicity period, contact block) are extracted once; the text
    before ``二、`` is then split into parcel sections introduced by either
    ``宗地编号`` or ``地块编号`` and every section is turned into one
    comma-separated line written to ``self.fileDetail``.

    Args:
        response: Detail-page response.  ``response.meta`` has to carry the
            strings ``supllyType``, ``administration``, ``supplyNoticeTitle``
            and ``publishTime``; a missing one aborts the page.

    Yields:
        ``None`` after each row that was written.

    Any exception is logged through ``self.log`` and ends the parsing of
    this page; rows written before the failure are kept.
    """

    def cell(value):
        # extract_first() gives None when the XPath matches nothing; such a
        # value used to make ','.join() raise and lose the whole page.
        if not value:
            return ''
        return value.replace(',', ' ').replace('\n', '').replace('\r', '')

    try:
        data = Selector(text=response.body.decode('gbk'))
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        # Fields shared by every parcel on the page.
        fileTitle = data.xpath(
            '//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()'
        ).extract_first()
        textTitle = data.xpath(
            '//td[@class="fh vat bw f8-0 b1"]/table[1]//tr[1]/td[@align="center"]/text()'
        ).extract_first()
        # Not part of the CSV row, but still required to be present.
        response.meta.get('supllyType').strip()
        administration = response.meta.get('administration').strip()
        supplyNoticeTitle = response.meta.get('supplyNoticeTitle').strip()
        publishTime = response.meta.get('publishTime').strip()

        # Everything from "四、" onwards holds the publicity/contact details.
        tail = reFunction(r'四、[\s\S]*', items)
        publicityPeriod = reFunction(r'公示期:([\s\S]*)三、', tail).strip()
        contactUnit = reFunction(r'联系单位:([\s\S]*)单位地址', tail).strip()
        unitAddr = reFunction(r'单位地址:([\s\S]*)邮政编码', tail).strip()
        postalCode = reFunction(r'邮政编码:([\s\S]*)联系电话', tail).strip()
        contactTel = reFunction(r'联系电话:([\s\S]*)联 系 人', tail).strip()
        contacter = reFunction(r'联 系 人:([\s\S]*)电子邮件', tail).strip()
        email = reFunction(r'电子邮件:([\w\.\@]*)(?:[\S]*)', tail).strip()

        # The parcel sections are introduced by one of two labels; the rest
        # of the layout is the same for both.
        if '宗地编号' in items:
            label = '宗地编号'
        elif '地块编号' in items:
            label = '地块编号'
        else:
            return

        sections = re.findall(r'([\s\S]*)二、', items)[0].split(label)[1:]
        for section in sections:
            item = label + section
            # Parcel number
            parcelNumber = reFunction(
                label + r':(?:\s*)([\s\S]*)地块位置', item).strip()
            # Parcel location
            parcelLocation = reFunction(
                r'地块位置:(?:\s*)([\s\S]*)土地用途:', item).strip()
            # Land use
            landPurpose = reFunction(
                r'土地用途:(?:\s*)([\s\S]*)土地面积\(公顷\)', item).strip()
            # Land area (hectares)
            landArea = reFunction(
                r'土地面积\(公顷\):(?:\s*)([\w}/\.{]*)(?:\s*)', item).strip()
            # Project name
            projectName = reFunction(
                r'项目名称:(?:\s*)([\s\S]*)土地用途明细', item).strip()
            # Transfer term
            transferTimeLimit = reFunction(
                r'出让年限:(?:\s*)([\s\S]*)成交价\(万元\)', item).strip()
            # Transaction price (10k CNY)
            transferPrice = reFunction(
                r'成交价\(万元\):(?:\s*)([\s\S]*)土地用途明细', item).strip()
            # Land-use detail: prefer the labelled area, otherwise fall back
            # to the bare number in front of "受让单位".
            landPurposeDetail = reFunction(
                r'(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位', item).strip()
            if not landPurposeDetail:
                landPurposeDetail = reFunction(
                    r'(?:\s*)([\d\.]*)(?:[\s]*)受让单位', item).strip()
            # Transferee
            transferUnit = reFunction(
                r'受让单位:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
            # Land-use conditions
            lanServiceCondition = reFunction(
                r'土地使用条件:(?:\s*)([\s\S]*)备注', item).strip()
            # Remark
            remark = reFunction(
                r'备注:(?:\s*)([\s\S]*)(?:\s*)[二、]?', item).strip()

            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime())
            url = response.url
            # Row identifier
            md5Mark = encrypt_md5(parcelNumber + publishTime +
                                  parcelLocation + url)
            csvFile = [
                administration, supplyNoticeTitle, publishTime, fileTitle,
                textTitle, projectName, parcelNumber, parcelLocation,
                landPurpose, landArea, transferTimeLimit, transferPrice,
                landPurposeDetail, transferUnit, remark, publicityPeriod,
                contactUnit, unitAddr, postalCode, contactTel, contacter,
                email, lanServiceCondition, crawlingTime, url, md5Mark
            ]
            self.fileDetail.write(','.join(cell(value) for value in csvFile))
            self.fileDetail.write('\n')
            yield
    except Exception as e:
        self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
def parse_detail(self, response):
    """Parse a UTF-8 detail page and append its single record to the CSV file.

    Source and time are taken from the ``content-small-title`` line, the
    five parcel attributes from the cells of the second row of the first
    table.  The row is appended to ``self.pathDetail``.

    When ``DUPLICATE_SWITCH`` is truthy and ``self.redisClient`` already
    knows the record's MD5 mark, ``self.duplicateUrl`` is incremented; once
    that counter reaches 50 the spider is asked to close instead of writing.

    Args:
        response: Detail-page response; ``response.meta['title']`` supplies
            the title.

    Yields:
        ``None`` after the row has been written.

    Any exception is logged (with traceback) and ends the parsing of this
    page.
    """
    try:
        data = Selector(text=response.body.decode('utf-8'))

        def table_cell(position):
            # XPath string(path) concatenates all text below the node at
            # `path`, so markup nested inside the cell is flattened.
            return ''.join(
                data.xpath(
                    f'string(//table[1]/tbody/tr[2]/td[{position}])'
                ).extract())

        # Title
        BT_10 = response.meta.get('title')
        LY = data.xpath(
            '//div[@class="content-small-title"]/text()').extract_first()
        # Source
        LY_11 = reFunction(rf'来源:\s*([{self.reStr}]*)\s', LY)
        # Time
        SJ_12 = reFunction(rf'时间:\s*([{self.reStr}]*)\s', LY)
        # Number
        BH_13 = table_cell(1)
        # Land location
        TDWZ_14 = table_cell(2)
        # Area of the use right
        SYQMJ_15 = table_cell(3)
        # TODO: planned land-use type
        GHYDXZ_16 = table_cell(4)
        # Transfer term
        CRNX_17 = table_cell(5)
        # Crawl date
        crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
        url = response.url
        # Record identifier
        md5Mark = encrypt_md5(url + BT_10 + SJ_12)

        # Count records that were already seen when de-duplication is on.
        if DUPLICATE_SWITCH:
            if self.redisClient.isExist(md5Mark):
                self.duplicateUrl += 1

        if self.duplicateUrl < 50:
            csvFile = [
                BT_10, LY_11, SJ_12, BH_13, TDWZ_14, SYQMJ_15, GHYDXZ_16,
                CRNX_17, crawlingTime, url, md5Mark,
            ]
            cells = []
            for field in csvFile:
                try:
                    # A value made only of '|' separators counts as empty.
                    if field and field != '|' * len(field):
                        cells.append(
                            field.replace(',', ' ').replace('\n', '')
                            .replace('\r', '').replace(r'\xa0', '')
                            .replace('\xa0', ''))
                    else:
                        cells.append('')
                except Exception as e:
                    cells.append('')
                    self.log(
                        f'{getVariableName(field).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            # Every cell, including the last one, is followed by a comma.
            results = ''.join(text + ',' for text in cells)
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results)
                fp.write('\n')
            self.log('数据获取成功', level=logging.INFO)
            yield
        else:
            self.crawler.engine.close_spider(
                self,
                'response msg info %s, job duplicated!' % response.url)
    except Exception as e:
        print(response.url)
        self.log(
            f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
            level=logging.ERROR)
def parse_detail(self, response):
    """Parse an annual land-supply-plan page and append its rows to the CSV.

    Three page layouts are handled, tried in this order:

    1. a table layout (recognised by its category names and ``合计``),
       which yields one row;
    2. a labelled layout with one or more ``总供应面积合计:`` records, which
       yields one row per record;
    3. a free-text fallback where areas are read from
       ``<category> ... 公顷`` phrases, which yields one row.

    Columns that a layout does not provide are written empty.

    Args:
        response: Detail-page response; ``response.meta`` supplies
            ``supplyLandTitle``, ``administration`` and ``publishTime``.

    Yields:
        ``None`` after each row that was written to ``self.fileDetail``.

    Any exception is logged through ``self.log`` and ends the parsing of
    this page; rows written before the failure are kept.
    """
    # Output column order (crawl time, url and md5 mark are appended).
    columns = (
        'administration', 'supplyLandTitle', 'publishTime', 'fileTitle',
        'totalSupplyLand', 'yearSupplyPlan', 'industrialLand',
        'businessLand', 'totalHousionSupply', 'low_rentLand',
        'affordableHousing', 'pengGaiLand', 'low_rentpengGaiLand',
        'pengGaiAffordableHousing', 'pengGaiCommercialHousing',
        'commercialHousing', 'ortherHousingLand', 'publicServiceLand',
        'transportationLand', 'waterAreaLand', 'specialLand',
        'publicRentalLand', 'limitCommercialLand', 'mediumCommercialLand',
        'totalCommercialLand', 'commercialRatio',
    )

    def cell(value):
        # None (e.g. a meta value that was not passed) is written as empty.
        if not value:
            return ''
        return value.replace(',', ' ').replace('\n', '').replace('\r', '')

    def write_row(row):
        crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        url = response.url
        md5Mark = encrypt_md5(row['fileTitle'] + row['totalSupplyLand'] +
                              row['yearSupplyPlan'] + url)
        csvFile = [row[name] for name in columns]
        csvFile += [crawlingTime, url, md5Mark]
        self.fileDetail.write(','.join(cell(value) for value in csvFile))
        self.fileDetail.write('\n')

    try:
        data = Selector(text=response.body.decode('gbk'))
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        row = dict.fromkeys(columns, '')
        # Fields shared by all layouts.
        row['supplyLandTitle'] = response.meta.get('supplyLandTitle')
        row['administration'] = response.meta.get('administration')
        row['publishTime'] = response.meta.get('publishTime')
        # File title
        row['fileTitle'] = data.xpath(
            '//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()'
        ).extract_first()

        if ('公共管理与公共服务用地' in items and '合计' in items
                and '特殊用地' in items and '水域及水利设施用地' in items
                and reFunction(
                    r'经济适用房用[地](?:\s*)([\S\s]*)(?:\s*)棚改用地', items)):
            # --- Layout 1: table -------------------------------------
            # Each category captures the text up to the next category
            # name (supply area / newly added / existing stock).
            row['totalSupplyLand'] = reFunction(
                r'合计(?:\s*)([\d\.]*)(?:\s*)', items)
            row['yearSupplyPlan'] = reFunction(
                r'(\d{4})年度国有建设用地供应计划', items)
            row['industrialLand'] = reFunction(
                r'工矿仓储用地(?:\s*)([\S\s]*)(?:\s*)商服用地', items)
            row['businessLand'] = reFunction(
                r'商服用地(?:\s*)([\S\s]*)(?:\s*)住宅用地', items)
            row['low_rentLand'] = '|'.join(reFunction(
                r'廉租房用地(?:\s*)([\S\s]*)(?:\s*)棚改用地', items))
            row['affordableHousing'] = '|'.join(reFunction(
                r'经济适用房用[地](?:\s*)([\S\s]*)(?:\s*)棚改用地', items))
            row['pengGaiLand'] = '|'.join(reFunction(
                r'棚改用地(?:\s*)([\S\s]*)(?:\s*)经济适用房用', items))
            row['commercialHousing'] = '|'.join(reFunction(
                r'商品房用地(?:\s*)([\S\s]*)(?:\s*)其他用地', items))
            row['ortherHousingLand'] = '|'.join(reFunction(
                r'其他用地(?:\s*)([\S\s]*)(?:\s*)小计', items))
            row['publicServiceLand'] = '|'.join(reFunction(
                r'公共管理与公共服务用地(?:\s*)([\S\s]*)(?:\s*)交通运输用地',
                items))
            row['transportationLand'] = '|'.join(reFunction(
                r'交通运输用地(?:\s*)([\S\s]*)(?:\s*)水域及水利设施用地', items))
            row['waterAreaLand'] = '|'.join(reFunction(
                r'水域及水利设施用地(?:\s*)([\S\s]*)(?:\s*)特殊用地', items))
            row['specialLand'] = '|'.join(reFunction(
                r'特殊用地(?:\s*)([\S\s]*)(?:\s*)合计', items))
            write_row(row)
            yield
        elif ('总供应面积合计:' in items and '各类棚户区改造用地' in items
              and reFunction(
                  r'总供应面积合计:(?:\s*)([\S\s]*)(?:\s*)供应计划年度', items)
              # r'\s+' rather than r'\s*': since Python 3.7 re.split also
              # splits on empty matches, so r'\s*' cuts between every
              # single character instead of between the numbers.
              and len(re.split(r'\s+', reFunction(
                  r'商品住房(?:\s*)([\s\S]*)',
                  reFunction(r'总 量(?:\s*)([\s\S]*)%', items)))) > 3):
            # --- Layout 2: labelled records --------------------------
            records = [
                reFunction(r'([\S\s]*)(?:[\d\.]*%)', '总供应面积合计:' + part)
                for part in items.split('总供应面积合计:')[1:]
            ]
            for item in records:
                row['totalSupplyLand'] = reFunction(
                    r'总供应面积合计:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                row['yearSupplyPlan'] = reFunction(
                    r'供应计划年度:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                row['businessLand'] = reFunction(
                    r'商服用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                row['industrialLand'] = reFunction(
                    r'工矿仓储用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                row['totalHousionSupply'] = reFunction(
                    r'住房供地总量:(?:\s*)([\S\s]*)(?:\s*)其中存量', item)
                # The housing figures follow the header as a whitespace
                # separated sequence; they are mapped by position.
                dataList = re.split(r'\s+', reFunction(
                    r'商品住房(?:\s*)([\s\S]*)',
                    reFunction(r'总 量(?:\s*)([\s\S]*)[%]?', item)))
                row['low_rentLand'] = dataList[0]
                row['affordableHousing'] = dataList[1]
                row['pengGaiLand'] = dataList[2]
                row['low_rentpengGaiLand'] = dataList[3]
                row['pengGaiAffordableHousing'] = dataList[4]
                row['pengGaiCommercialHousing'] = dataList[5]
                # Public rental housing: allocated | transferred area
                row['publicRentalLand'] = dataList[6] + '|' + dataList[7]
                row['limitCommercialLand'] = dataList[8]
                row['mediumCommercialLand'] = dataList[10]
                row['totalCommercialLand'] = dataList[9]
                row['commercialRatio'] = (
                    dataList[11] + '%'
                    if '%' not in dataList[11] else dataList[11])
                row['publicServiceLand'] = reFunction(
                    r'公共管理与服务用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                row['transportationLand'] = reFunction(
                    r'交通运输用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                row['waterAreaLand'] = reFunction(
                    r'水域及水利设施用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                row['specialLand'] = reFunction(
                    r'特殊用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                write_row(row)
                yield
        else:
            # --- Layout 3: free text ---------------------------------
            row['businessLand'] = reFunction(
                r'商服用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
            row['industrialLand'] = reFunction(
                r'工矿仓储用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
            row['totalHousionSupply'] = reFunction(
                r'住宅用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
            row['publicServiceLand'] = reFunction(
                r'公共管理与公共服务用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷',
                items)
            row['transportationLand'] = reFunction(
                r'交通运输用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
            row['waterAreaLand'] = reFunction(
                r'水域及水利设施用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
            row['specialLand'] = reFunction(
                r'特殊用地(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
            write_row(row)
            yield
    except Exception as e:
        self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
def parse_detail(self, response):
    """Parse a land-transaction result page and append its parcels to the CSV.

    Page-level fields (title, time, source, document number, publicity
    period, contact block) are extracted once.  Parcel fields come from one
    of four layouts:

    * a table on a page mentioning ``竣工时间`` (irregular table, with a
      positional row-by-row fallback if the table conversion fails);
    * a table on a page without ``转让方`` (regular table);
    * no table, page mentions ``地块基本情况`` (labelled text);
    * no table, page mentions ``转让方`` (labelled text).

    A row is only written when its land-use field is non-empty.  Before
    each row, if ``self.name`` is in ``DUPLICATE_SWITCH_LIST`` and
    ``self.redisClient`` already knows the page's MD5 mark,
    ``self.duplicateUrl`` is incremented; rows are written only while that
    counter is below 50, and the two text layouts ask the engine to close
    the spider once it is not.

    Args:
        response: Detail-page response; ``response.meta['title']`` supplies
            the file title.

    Yields:
        ``None`` after each row appended to ``self.pathDetail``.

    Any exception is logged (with traceback) and ends the parsing of this
    page.
    """
    # TODO: closing the spider from inside this callback is still unresolved.

    # Characters accepted inside a labelled free-text value.
    plain = r'()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪'
    bracketed = r'()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪'
    # Per-parcel columns, in output order.
    parcel_fields = ('ZDBH_31', 'BH_32', 'DKWZ_33', 'TDWZ_34', 'TDMJM_35',
                     'TDMJPFM_36', 'TDYT_37', 'CJJ_38', 'JDR_39')

    def column_value(columns, index, *names):
        # Value at `index` of the first listed column that exists and is
        # non-empty; '' when none of them is.
        for name in names:
            values = columns.get(name)
            if values:
                return values[index]
        return ''

    try:
        page = response.body.decode('utf-8')
        data = Selector(text=page)
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        def labelled(label, chars=plain, end=r'\s'):
            # Text following `label`, up to the terminator `end`.
            return reFunction(label + '([' + chars + ']*)' + end, items)

        # TODO (original note): shared fields
        # File title
        WJBT_27 = response.meta.get('title')
        # Time
        SJ_28 = data.xpath(
            '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()'
        ).extract_first()
        # Source
        LY_29 = data.xpath(
            '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()'
        ).extract_first()
        # Document number
        WJBT_30 = data.xpath(
            '//div[@class="ztzx_frame_content"]/div[1]/text()'
        ).extract_first()
        # Publicity period
        GSQ_40 = labelled('公示期:', end='。')
        # Contact unit / address / postal code / phone
        LXDW_41 = labelled('联系单位:')
        DWDZ_42 = labelled('单位地址:')
        YZBM_43 = labelled('邮政编码:')
        LXDH_44 = labelled('联系电话:')
        # Crawl date
        crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
        url = response.url
        # Page identifier (shared by every row of the page)
        md5Mark = encrypt_md5(url + WJBT_27 + SJ_28)

        # <thead> is rewritten to <tbody> so header rows are ordinary rows.
        # NOTE(review): no parser is named, so bs4 picks whichever is
        # installed; left unchanged to keep the parse tree as it was.
        soup = BeautifulSoup(page.replace('thead', 'tbody'))
        table = soup.find('table')
        htmlTable = htmlTableTransformer()

        def within_budget():
            # Count an already-seen record, then report whether rows may
            # still be written.
            if self.name in DUPLICATE_SWITCH_LIST:
                if self.redisClient.isExist(md5Mark):
                    self.duplicateUrl += 1
            return self.duplicateUrl < 50

        def write_row(parcel, strip_tabs=False):
            csvFile = [WJBT_27, SJ_28, LY_29, WJBT_30]
            csvFile += [parcel.get(name, '') for name in parcel_fields]
            csvFile += [GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44,
                        crawlingTime, url, md5Mark]
            cells = []
            for field in csvFile:
                try:
                    # A value made only of '|' separators counts as empty.
                    if field and field != '|' * len(field):
                        text = field.replace(',', ' ').replace('\n', '')
                        if strip_tabs:
                            text = text.replace('\t', '')
                        cells.append(
                            text.replace('\r', '').replace(r'\xa0', '')
                            .replace('\xa0', ''))
                    else:
                        cells.append('')
                except Exception as e:
                    cells.append('')
                    self.log(
                        f'{getVariableName(field).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                # Every cell, including the last one, is followed by a comma.
                fp.write(''.join(text + ',' for text in cells))
                fp.write('\n')
            self.log('数据获取成功', level=logging.INFO)

        def stop_spider():
            self.crawler.engine.close_spider(
                self,
                'response msg info %s, job duplicated!' % response.url)

        if table:
            if '竣工时间' in items:
                try:
                    tdData = htmlTable.tableTrTdUNregulationToList(table)
                    for index in range(len(list(tdData.values())[0])):
                        parcel = {
                            # Parcel number
                            'ZDBH_31': column_value(tdData, index, '地块编号'),
                            # Parcel location / land location
                            'DKWZ_33': column_value(tdData, index, '位置'),
                            'TDWZ_34': column_value(tdData, index, '位置'),
                            # Land area (mu)
                            'TDMJM_35': column_value(
                                tdData, index, '出让面积平方米/亩'),
                            # Land area (m2): taken from the 8th column
                            'TDMJPFM_36': column_value(
                                tdData, index, list(tdData.keys())[7]),
                            # Land use
                            'TDYT_37': column_value(tdData, index, '用途'),
                            # Transaction price.  NOTE(review): the two
                            # header spellings are identical here;
                            # presumably one was a full-width-parenthesis
                            # variant - confirm.
                            'CJJ_38': column_value(
                                tdData, index, '成交价(万元)', '成交价(万元)'),
                            # Winning bidder
                            'JDR_39': column_value(tdData, index, '受让人'),
                        }
                        if within_budget() and parcel['TDYT_37']:
                            write_row(parcel)
                            yield
                # Not a bare except: the loop above yields, and a bare
                # except would swallow GeneratorExit and re-run the
                # fallback while the generator is being closed.
                except Exception:
                    # Fallback: read the cells by position, skipping the
                    # two header rows.
                    for tr in table.find_all('tr')[2:]:
                        tds = tr.find_all('td')
                        parcel = {
                            'ZDBH_31': tds[4].string.strip(),
                            'DKWZ_33': tds[5].string.strip(),
                            'TDWZ_34': tds[5].string.strip(),
                            'TDMJM_35': tds[6].string.strip(),
                            'TDMJPFM_36': tds[7].string.strip(),
                            'TDYT_37': tds[8].string.strip(),
                            'CJJ_38': tds[9].string.strip(),
                            'JDR_39': tds[3].string.strip(),
                        }
                        if within_budget() and parcel['TDYT_37']:
                            write_row(parcel)
                            yield
            elif '转让方' not in items:
                # A short second row is a sub-header: drop it together
                # with the last header cell.
                if len(table.find_all('tr')[1].find_all('td')) < 5:
                    table.find_all('tr')[1].extract()
                    table.find_all('tr')[0].find_all('td')[-1].extract()
                tdData = htmlTable.tableTrTdRegulationToList(table)
                for index in range(len(list(tdData.values())[0])):
                    parcel = {
                        'ZDBH_31': column_value(tdData, index, '宗地编号'),
                        'BH_32': column_value(tdData, index, '编号'),
                        'DKWZ_33': column_value(tdData, index, '地块位置'),
                        'TDWZ_34': column_value(tdData, index, '土地位置'),
                        'TDMJM_35': column_value(tdData, index, '土地面积(亩)'),
                        'TDMJPFM_36': column_value(
                            tdData, index, '土地面积(平方米)'),
                        'TDYT_37': column_value(tdData, index, '土地用途'),
                        # NOTE(review): same duplicated spelling as above.
                        'CJJ_38': column_value(
                            tdData, index, '成交价(万元)', '成交价(万元)'),
                        'JDR_39': column_value(tdData, index, '竞得人'),
                    }
                    if within_budget() and parcel['TDYT_37']:
                        write_row(parcel)
                        yield
        elif '地块基本情况' in items:
            parcel = {
                'ZDBH_31': labelled(r'宗地编号\s*', bracketed),
                'DKWZ_33': labelled(r'地块位置\s*', bracketed),
                # The hectare figure is stored in the "area (mu)" column.
                'TDMJM_35': labelled(r'土地面积\(公顷\)\s*', bracketed),
                'TDYT_37': labelled(r'土地用途\s*', bracketed),
                'CJJ_38': labelled(r'成交价\(万元\)\s*', bracketed),
                'JDR_39': labelled(r'受让单位\s*', bracketed),
            }
            if within_budget():
                if parcel['TDYT_37']:
                    write_row(parcel)
                    yield
            else:
                stop_spider()
        elif '转让方' in items:
            parcel = {
                'BH_32': labelled('不动产权登记证号:', bracketed),
                'DKWZ_33': labelled('宗地位置:'),
                'TDMJPFM_36': labelled(r'面\s*积:'),
                'TDYT_37': labelled('土地用途:'),
                # No transaction price on this layout.
                'JDR_39': labelled('受让方:'),
            }
            if within_budget():
                if parcel['TDYT_37']:
                    # This layout additionally strips tab characters.
                    write_row(parcel, strip_tabs=True)
                    yield
            else:
                stop_spider()
    except Exception as e:
        print(response.url)
        self.log(
            f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
            level=logging.ERROR)
def parse_detail(self, response):
    """Parse a land-listing notice page and append one CSV row per parcel.

    The GBK-encoded page is flattened to plain text.  Notice-level fields
    are read from the numbered sections (``四、`` document pick-up,
    ``五、`` deposit deadlines, ``八、`` contact and bank details); the text
    before ``二、`` is split on ``宗地编号`` and every parcel section becomes
    one comma-separated line written to ``self.fileDetail``.  Nothing is
    written when the page contains no ``宗地编号``.

    Date-times of the form ``2020年05月19日09时00分`` are rewritten as
    ``2020-05-19 09:00``; a value that does not match that form is written
    unchanged.

    Args:
        response: Detail-page response.  ``response.meta`` has to carry the
            strings ``noticeType``, ``administration``,
            ``supplyNoticeTitle`` and ``publishTime``; a missing one aborts
            the page.

    Yields:
        ``None`` after each row that was written.

    Any exception is logged through ``self.log`` and ends the parsing of
    this page; rows written before the failure are kept.
    """

    def cell(value):
        # extract_first() gives None when the XPath matches nothing; such a
        # value used to make ','.join() raise.
        if not value:
            return ''
        return value.replace(',', ' ').replace('\n', '').replace('\r', '')

    def to_minute(raw):
        # Normalise one date-time on its own, so one unparsable value no
        # longer forces its sibling back to the raw text as well.
        try:
            return time.strftime(
                "%Y-%m-%d %H:%M", time.strptime(raw, "%Y年%m月%d日%H时%M分"))
        except ValueError:
            return raw

    try:
        data = Selector(text=response.body.decode('gbk'))
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        # Fields shared by every parcel on the page.
        fileTitle = data.xpath(
            '//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()'
        ).extract_first()
        textTitle = data.xpath(
            '//td[@class="fh vat bw f8-0 b1"]/table[1]//tr[1]/td[@align="center"]/text()'
        ).extract_first()
        noticeType = response.meta.get('noticeType').strip()
        administration = response.meta.get('administration').strip()
        supplyNoticeTitle = response.meta.get('supplyNoticeTitle').strip()
        publishTime = response.meta.get('publishTime').strip()

        # Section 4: when and where the transfer documents can be obtained.
        # (Patterns containing \u/\n/\r escapes keep doubled backslashes so
        # the resulting pattern text is unchanged.)
        pickup = reFunction(r'四、[\s\S]*五、', items)
        transferTime = reFunction(
            '申请人可于((?:[\\w\\s\u4e00-\u9fa5]*)至(?:[\\s\\w\u4e00-\u9fa5]*))到',
            pickup).strip()
        transferAddr = reFunction(
            '申请人可于(?:[\\w\\s\u4e00-\u9fa5]*)至(?:[\\s\\w\u4e00-\u9fa5]*)到 ([\\s\\S\\w\u4e00-\u9fa5.\n\r]*出让文件)',
            pickup).strip().replace('获取 挂牌 出让文件', '')

        # Section 5: deposit deadline and bidder-qualification time.
        deadlines = reFunction(r'五、[\s\S]*六、', items)
        depositTime = to_minute(reFunction(
            '保证金的截止时间为([\\w\\s\u4e00-\u9fa5]*)。', deadlines).strip())
        affirmBuyTime = to_minute(reFunction(
            '将在([\\w\\s\u4e00-\u9fa5]*)前确认其竞买资格', deadlines).strip())

        # Section 8: contact and bank details.
        contact = reFunction(r'八、[\s\S]*', items)
        address = reFunction(r'联系地址:([\s\S]*)联 系 人', contact).strip()
        tel = reFunction(r'联系电话:([\s\S]*)开户单位', contact).strip()
        # The contact person sits between the address and the phone number
        # (this used to repeat the phone pattern and duplicate `tel`).
        linkman = reFunction(r'联 系 人:([\s\S]*)联系电话', contact).strip()
        accountOpener = reFunction(
            r'开户单位:([\s\S]*)开户银行', contact).strip()
        depositBank = reFunction(
            r'开户银行:([\s\S]*)银行帐号', contact).strip()
        bankAccount = reFunction(
            r'银行帐号:([\w]*)(?:[\S]*)', contact).strip()

        if '宗地编号' in items:
            sections = re.findall(
                r'([\s\S]*)二、', items)[0].split('宗地编号')[1:]
            for section in sections:
                item = '宗地编号' + section
                # Parcel number
                parcelNumber = reFunction(
                    r'宗地编号:(?:\s*)([\s\S]*)宗地总面积', item).strip()
                # Parcel area
                parcelArea = reFunction(
                    r'宗地总面积:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                # Parcel location
                parcelLocation = reFunction(
                    r'宗地坐落:(?:\s*)([\s\S]*)出让年限', item).strip()
                # Transfer term
                transferTimeLimit = reFunction(
                    r'出让年限:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                # Plot ratio
                plotRatio = reFunction(
                    r'容积率:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                # Building density (%)
                buildingDensity = reFunction(
                    r'建筑密度\(%\):([\s\S]*)绿化率', item).strip()
                # Green ratio (%)
                greenRatio = reFunction(
                    r'绿[地化]率\(%\):([\s\S]*)建筑限高', item).strip()
                # Building height limit (m)
                buldingHP = reFunction(
                    r'建筑限高\(米\):(?:\s*)([\w}{/]*)主要用途', item).strip()
                # Main land use
                landPurpose = reFunction(
                    r'主要用途:(?:\s*)([\w}{/]*)(?:\s*)', item).strip()
                # Investment intensity
                investmentIntensity = reFunction(
                    r'投资强度:(?:\s*)([\w}{/]*)(?:\s*)保证金', item).strip()
                # Deposit
                cashDeposit = reFunction(
                    r'保证金:(?:\s*)([\w}{/]*)(?:\s*)', item).strip()
                # Valuation-report filing number
                evaluateNum = reFunction(
                    r'估价报告备案号(?:\s*)([A-Za-z0-9_}{/]*)(?:\s*)',
                    item).strip()
                # Current land condition
                landCondition = reFunction(
                    '([:\u4e00-\u9fa5 ]*)',
                    reFunction(r'估价报告备案号:([\s\S]*)起始价', item)).strip()
                # TODO: starting price
                startPrice = reFunction(
                    r'起始价:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                # Bid increment
                bidIncrenment = reFunction(
                    r'加价幅度:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                # Listing end / start time.  An unparsable value is kept
                # as-is in its own column (it used to overwrite the deposit
                # and qualification times instead).
                hangOutDeadTime = to_minute(reFunction(
                    r'挂牌[(竞价)]*截止时间:(?:\s*)([\w}/{]*)(?:\s*)',
                    item).strip())
                hangOutStartTime = to_minute(reFunction(
                    r'挂牌[(竞价)]*开始时间:(?:\s*)([\w}/{]*)(?:\s*)',
                    item).strip())
                # Supporting infrastructure
                supportingInfrastructure = reFunction(
                    r'基础设施配套:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                # Whether the land is levelled
                landItact = reFunction(
                    r'是否土地平整[: :](?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                # Sewage facilities
                sewageDisposalFacility = reFunction(
                    r'排污设施状况:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                # Remark
                remark = reFunction(
                    r'备注:([\s\S]*)(?:\s*)', item).strip()

                crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                url = response.url
                # Notice identifier (identical for all parcels of a page)
                md5Mark = encrypt_md5(fileTitle + publishTime +
                                      transferTime + url)
                csvFile = [
                    fileTitle, textTitle, noticeType, administration,
                    supplyNoticeTitle, publishTime, transferTime,
                    transferAddr, depositTime, affirmBuyTime, address, tel,
                    linkman, accountOpener, depositBank, bankAccount,
                    parcelNumber, parcelArea, parcelLocation,
                    transferTimeLimit, plotRatio, buildingDensity,
                    greenRatio, buldingHP, landPurpose, investmentIntensity,
                    cashDeposit, evaluateNum, landCondition, startPrice,
                    bidIncrenment, hangOutDeadTime, hangOutStartTime,
                    supportingInfrastructure, landItact,
                    sewageDisposalFacility, remark, crawlingTime, url,
                    md5Mark,
                    # The row used to end with a '\n' element that was
                    # cleaned to an empty cell; the trailing empty column
                    # is kept so the output format does not change.
                    '',
                ]
                self.fileDetail.write(
                    ','.join(cell(value) for value in csvFile))
                self.fileDetail.write('\n')
                yield
    except Exception as e:
        self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
def parse_detail(self, response):
    """Parse one announcement detail page and append its rows to the detail CSV.

    The page text is checked for four marker phrases, each selecting one
    known page layout.  Table-driven layouts emit one CSV row per table row;
    text-driven layouts emit one row per page or per parcel section.

    This is a generator: it yields ``None`` once for every row that was
    written.  Rows are written through ``_store_detail_row``, which also
    performs the duplicate bookkeeping and asks the engine to close the
    spider once ``self.duplicateUrl`` has reached 50.

    Args:
        response: downloaded detail page; ``response.body`` is decoded as
            UTF-8 and ``response.meta['title']`` fills the file-title column.

    Any exception is logged together with the request URL and traceback
    instead of being propagated.
    """
    # TODO: proactively closing the spider from here is still an open problem.
    try:
        html = response.body.decode('utf-8')
        data = Selector(text=html)
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        # CSV columns in output order; every column starts out empty.
        columns = (
            'WJBT_45', 'SJ_46', 'LY_47', 'ZWBT_48', 'DKBH_49', 'ZDBH_50',
            'PMJG_51', 'GGZRFS_52', 'GPSJ_53', 'ZRR_54', 'ZRF_55', 'SRR_56',
            'SRF_57', 'SRDW_58', 'WZ_59', 'DKWZ_60', 'CRMJ_61', 'YT_62',
            'CJJ_63', 'BDCQDJH_64', 'CRHTBH_65', 'CRHT_66', 'BGXYBH_67',
            'TDYT_68', 'SYNX_69', 'MJ_70', 'TDMJ_71', 'ZRJG_72', 'CRNX_73',
            'TDSYNX_74', 'BZ_75', 'GSQ_76', 'LXDW_77', 'DWDZ_78', 'YZBM_79',
            'LXDH_80', 'LXR_81', 'DZYJ_82',
        )
        record = dict.fromkeys(columns, '')

        # TODO (original note) shared fields: reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
        # File title
        record['WJBT_45'] = response.meta.get('title')
        # Time
        record['SJ_46'] = data.xpath(
            '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()'
        ).extract_first()
        # Source
        record['LY_47'] = data.xpath(
            '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()'
        ).extract_first()
        # Body title
        record['ZWBT_48'] = data.xpath(
            '//div[@class="ztzx_frame_content"]/div[1]/text()'
        ).extract_first()
        # Publicity period
        record['GSQ_76'] = reFunction(
            f'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[。\s]', items)
        # Contact unit
        record['LXDW_77'] = reFunction(
            '联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
        # Unit address
        record['DWDZ_78'] = reFunction(
            '单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
        # Postal code
        record['YZBM_79'] = reFunction(
            '邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
        # Contact telephone
        record['LXDH_80'] = reFunction(
            '联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
        # Contact person
        record['LXR_81'] = reFunction(
            '联\s*系\s*人:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
        # E-mail
        record['DZYJ_82'] = reFunction(
            '电子邮件:([()\w\.:: —\(\)@〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)

        # Crawl date
        crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
        # Crawled URL
        url = response.url
        # Unique mark.  The title/time may be None when the page lacks them;
        # treat them as empty so the concatenation cannot raise TypeError.
        md5Mark = encrypt_md5(
            url + (record['WJBT_45'] or '') + (record['SJ_46'] or ''))

        # 'thead' is rewritten to 'tbody' before the table is parsed.
        soup = BeautifulSoup(html.replace('thead', 'tbody'))
        table = soup.find('table')
        htmlTable = htmlTableTransformer()

        def current_row():
            """Return the CSV cells for the record as it stands right now."""
            return [record[name] for name in columns] + [crawlingTime, url, md5Mark]

        def column_cell(tdData, key, idx):
            """Return row *idx* of table column *key*, or '' if the column is absent/empty."""
            values = tdData.get(key)
            return values[idx] if values else ''

        def regex_parcel_rows():
            """Write one row per parcel section found in the page text."""
            if '竞得人' in items:
                return
            sections = re.findall('一([\s\S]*)二、', items)[0].split('宗地编号')[1:]
            for item in ['宗地编号' + section for section in sections]:
                # Parcel number
                record['ZDBH_50'] = reFunction('编号\s*([\w\-]*)\s', item)
                # Transferee unit
                record['SRDW_58'] = reFunction(
                    '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    item)
                # Parcel location
                record['DKWZ_60'] = reFunction(
                    '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    item)
                # Transaction price; the second pattern is the fallback spelling
                price = reFunction(
                    '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    item)
                record['CJJ_63'] = price if price else reFunction(
                    '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    item)
                # Land use
                record['TDYT_68'] = reFunction(
                    '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    item)
                # Land area (hectares)
                record['TDMJ_71'] = reFunction(
                    '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    item)
                # Transfer term
                record['CRNX_73'] = reFunction(
                    '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    item)
                # Remark; a match containing '二' is discarded
                record['BZ_75'] = reFunction(
                    '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                    item)
                if '二' in record['BZ_75']:
                    record['BZ_75'] = ''
                if self._store_detail_row(current_row(), md5Mark, url):
                    yield

        if '国有划拨土地使用权结果公示' in items:
            # The second table row is dropped before the table is converted.
            table.find_all('tr')[1].extract()
            tdData = htmlTable.tableTrTdRegulationToList(table)
            for idx in range(len(list(tdData.values())[0])):
                # Plot number
                record['DKBH_49'] = column_cell(tdData, '地块编号', idx)
                # Public transfer method
                record['GGZRFS_52'] = column_cell(tdData, '公开转让方式', idx)
                # Listing time
                record['GPSJ_53'] = column_cell(tdData, '挂牌', idx)
                # Transferee
                record['SRR_56'] = column_cell(tdData, '受让人', idx)
                # Location
                record['WZ_59'] = column_cell(tdData, '位置', idx)
                # Transfer area
                record['CRMJ_61'] = column_cell(tdData, '出让面积', idx)
                # Use
                record['YT_62'] = column_cell(tdData, '用途', idx)
                # Transaction price
                record['CJJ_63'] = column_cell(tdData, '成交价', idx)
                if self._store_detail_row(current_row(), md5Mark, url):
                    yield
        elif '不动产权登记证号' in items:
            # Transferor
            record['ZRF_55'] = reFunction(
                '转让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Transferee
            record['SRF_57'] = reFunction(
                '受让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Location
            record['WZ_59'] = reFunction(
                '宗地位置:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Real-estate registration certificate number
            record['BDCQDJH_64'] = reFunction(
                '不动产权登记证号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                items)
            # Transfer contract number
            record['CRHTBH_65'] = reFunction(
                '出让合同编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                items)
            # Amendment agreement number
            record['BGXYBH_67'] = reFunction(
                '出让合同变更协议编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                items)
            # Land use
            record['TDYT_68'] = reFunction(
                '土地用途:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Term of use
            record['SYNX_69'] = reFunction(
                '使用年限:\s*([()【】\w\.::—\(\)〔〕\s㎡≤≥《》\-\/\%,;,、\.﹪]*)面\s*积',
                items)
            # Area
            record['MJ_70'] = reFunction(
                '面\s*积:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Transfer price (unit price / total price)
            record['ZRJG_72'] = reFunction(
                '转让价格:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、。\.﹪]*)\s',
                items)
            if self._store_detail_row(current_row(), md5Mark, url):
                yield
        elif '挂牌出让地块的基本情况和规划指标要求' in items:
            # Parcel number
            record['ZDBH_50'] = reFunction(
                '宗地编号:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                items)
            # Listing time
            record['GPSJ_53'] = reFunction(
                '挂牌时间为:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s',
                items).replace('。', '')
            # Transferor
            record['ZRR_54'] = reFunction(
                '转让人为:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*),', items)
            # Location
            record['WZ_59'] = reFunction(
                '宗地坐落:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                items)
            # Land use
            record['TDYT_68'] = reFunction(
                '土地用途:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                items)
            # Area
            record['MJ_70'] = reFunction(
                '宗地面积:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                items)
            # Transfer term
            record['CRNX_73'] = reFunction(
                '出让年限:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                items)
            # Remark
            record['BZ_75'] = reFunction(
                '备注:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s*二',
                items)
            if self._store_detail_row(current_row(), md5Mark, url):
                yield
        elif '地块基本情况' in items:
            try:
                if '备注' not in items:
                    tdData = htmlTable.tableTrTdRegulationToList(table)
                    for idx in range(len(list(tdData.values())[0])):
                        # Parcel number
                        record['ZDBH_50'] = column_cell(tdData, '宗地编号', idx)
                        # Transferee unit
                        record['SRDW_58'] = column_cell(tdData, '受让单位', idx)
                        # Transferee (the table calls it the winning bidder)
                        record['SRR_56'] = column_cell(tdData, '竞得人', idx)
                        # Parcel location
                        record['DKWZ_60'] = column_cell(tdData, '地块位置', idx)
                        # Land use
                        record['TDYT_68'] = column_cell(tdData, '土地用途', idx)
                        # Transaction price
                        record['CJJ_63'] = column_cell(tdData, '成交价(万元)', idx)
                        # Land area, read from the '土地面积(亩)' column
                        record['TDMJ_71'] = column_cell(tdData, '土地面积(亩)', idx)
                        # Transfer term
                        record['CRNX_73'] = column_cell(tdData, '出让年限', idx)
                        if self._store_detail_row(current_row(), md5Mark, url):
                            yield
                else:
                    yield from regex_parcel_rows()
            except Exception as e:
                # The first attempt failed part-way; retry with the text-based
                # parser.  A second failure reaches the outer handler.
                self.log(
                    f'table parsing failed, falling back to regex, url: {url}, error: {e}',
                    level=logging.WARNING)
                yield from regex_parcel_rows()
    except Exception as e:
        self.log(
            f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
            level=logging.ERROR)


def _store_detail_row(self, csvFile, md5Mark, url):
    """Run the duplicate bookkeeping and append one row to ``self.pathDetail``.

    Args:
        csvFile: cell values of the row, in column order.
        md5Mark: unique mark looked up in ``self.redisClient``.
        url: page URL, used only in the close-spider reason.

    Returns:
        True when the row was written; False when ``self.duplicateUrl`` has
        reached 50 and the engine was asked to close the spider instead.
    """
    # NOTE(review): nesting reconstructed from whitespace-collapsed source --
    # the "< 50" check is assumed to apply to every spider, not only to those
    # listed in DUPLICATE_SWITCH_LIST; confirm against the original file.
    if self.name in DUPLICATE_SWITCH_LIST and self.redisClient.isExist(md5Mark):
        # Already stored: count it as a duplicate.
        self.duplicateUrl += 1
    if self.duplicateUrl >= 50:
        self.crawler.engine.close_spider(
            self, 'response msg info %s, job duplicated!' % url)
        return False

    cells = []
    for value in csvFile:
        try:
            # Empty values and values made only of '|' separators become ''.
            if value and value != '|' * len(value):
                cells.append(
                    value.replace(',', ' ').replace('\n', '').replace(
                        '\t', '').replace('\r', '').replace(
                            r'\xa0', '').replace('\xa0', ''))
            else:
                cells.append('')
        except Exception as e:
            cells.append('')
            self.log(
                f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
    with open(self.pathDetail, 'a+') as fp:
        # Every cell, including the last one, is followed by a comma.
        fp.write(''.join(cell + ',' for cell in cells))
        fp.write('\n')
    self.log(f'数据获取成功', level=logging.INFO)
    return True
def parse_detail(self, response):
    """Resolve the real detail page for a listing and append one CSV row.

    ``response.meta`` supplies ``categorynum`` and ``infoid``.  They are
    posted to the site's redirect service (up to five attempts) to obtain the
    real detail URL, which is then downloaded with ``self.session``.  The page
    is parsed either from its free text (regexes) or from its first table,
    and the resulting row is written to ``self.fileDetail``.

    This is a generator that yields ``None`` once after the row is written.
    Any exception is logged with its traceback instead of being propagated.
    """
    try:
        # NOTE (original author): the data obtained here is incomplete.
        categorynum = response.meta.get('categorynum')
        infoid = response.meta.get('infoid')
        targetUrl = "https://www.cqggzy.com/tiaozhuan.html?infoid=" + infoid + "&categorynum=" + categorynum
        redirectUrl = 'https://www.cqggzy.com/EpointWebBuilderService/getInfoListAndCategoryList.action?cmd=pageRedirect'

        # url/results are initialised up front so a total failure of the
        # retry loop is reported clearly instead of surfacing later as an
        # attribute or unbound-name error.
        url = ''
        results = None
        lastError = None
        for _ in range(5):
            try:
                self.session.get(targetUrl,
                                 headers=self.header,
                                 allow_redirects=False,
                                 timeout=60)
                response_ = self.session.post(
                    redirectUrl,
                    headers=self.header,
                    data={'categorynum': categorynum, 'infoid': infoid},
                    allow_redirects=False,
                    timeout=60)
                custom = response_.json().get('custom')
                # 'custom' is either an absolute URL or a site-relative path.
                url = 'https://www.cqggzy.com' + custom if 'http' not in custom else custom
                results = self.session.get(url,
                                           headers=self.header,
                                           allow_redirects=False,
                                           timeout=60)
                break
            except Exception as e:
                lastError = e
        if results is None:
            raise RuntimeError(
                f'failed to fetch detail page after 5 attempts, infoid: {infoid}, last error: {lastError}')

        html = results.content.decode('utf-8')
        data = Selector(text=html)
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        GGXH_31 = ''
        ZDBH_32 = ''
        DKWZ_33 = ''
        TDYT_34 = ''
        TDMJ_35 = ''
        RJL_36 = ''
        JZMD_37 = ''
        LDL_38 = ''
        BZJ_39 = ''
        DJ_40 = ''
        JRZJMJ_41 = ''
        CRFS_42 = ''
        CRNX_43 = ''
        CJJ_44 = ''
        SRDW_45 = ''
        TDSYTJ_46 = ''
        JYSJ_47 = ''
        CJR_48 = ''
        BZ_49 = ''
        LXDW_50 = ''
        LXDZ_51 = ''
        LXDH_52 = ''
        GSQ_53 = ''

        # Shared fields
        # File title
        WJBT_27 = data.xpath(
            '//*[@class="article-title"]/text()').extract_first()
        # Information time
        XXSJ_28 = reFunction(
            '(\d{4}-\d{1,2}-\d{1,2})',
            data.xpath(
                '//*[@class="info-source"]/text()[1]').extract_first())
        # Body title (TODO in the original: currently just the file title)
        ZWBT_29 = WJBT_27

        soup = BeautifulSoup(html)
        table = soup.find('table')

        def found(pattern):
            """Join every match of *pattern* in the page text with '|'."""
            return '|'.join(re.findall(pattern, items))

        # Pages carrying one of these labels, and pages without any table,
        # are parsed from their text; all other pages from their table.
        if ('土地使用条件' in items or '宗地编号' in items
                or '公告序号' in items or not table):
            # TODO (original): regex-matched pages
            # Announcement serial number
            GGXH_31 = found('公告序号(?:[\s]*)([()\w\.:\-\/\%,、]*)(?:\n)')
            # Parcel number / number
            ZDBH_32 = found(
                f'[宗地](?:[\s]*)编号(?:[\s]*)([{self.reStr}]*)(?:\s)'
            ).replace(':', '')
            # Parcel location, falling back to the land-location label
            DKWZ_33 = found(
                f'地块位置(?:[\s]*)({self.reStr})(?:\n)') or found(
                    '土地(?:[\s]*)坐落(?:[\s]*)([()\w\.:: ≤;≥\-\/\%,、\.]*)(?:\n)')
            # Land use / use
            TDYT_34 = found('[土地]?用途(?:[\s]*)([()\w\.:\-\/\%,、]*)(?:\n)')
            # Land area (m2), falling back to the square-metre label
            TDMJ_35 = found(
                '土地面积\(m2\)(?:[\s]*)([()\w\.:\-\/\%,、\.]*)(?:\n)') or found(
                    f'土地面积(?:\s*)[\((]*平方米[\))]*(?:[\s]*)({self.reStr})(?:\n)')
            # Plot ratio
            RJL_36 = found('容积率(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)')
            # Floor area counted in the plot ratio (m2)
            JRZJMJ_41 = found(
                f'计容建筑面积\(m2\)(?:[\s]*)({self.reStr})(?:\n)')
            # Transfer method
            CRFS_42 = found(
                '出让方式[:]*(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)')
            # Transfer term
            CRNX_43 = found(f'出让年限(?:[\s]*)({self.reStr})(?:\n)')
            # Transaction price, with a fallback label form
            CJJ_44 = found(
                f'成交价(?:[\s]*)[\((]*万元[)\)]*(?:[\s]*)({self.reStr})(?:\n)'
            ) or found(f'成交价:(?:[\s]*)({self.reStr})(?:三)')
            # Transferee unit
            SRDW_45 = found(
                '受让单位(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)')
            # Land-use conditions
            TDSYTJ_46 = found(
                '土地(?:[\s]*)使用(?:[\s]*)条件(?:[\s]*)([()\w\.:: ≤;≥\-\/\%,、\.]*)(?:\n)')
            # Transaction time
            JYSJ_47 = found(
                '交易时间(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)')
            # Buyer
            CJR_48 = found(f'成交人:(?:[\s]*)({self.reStr})(?:二)')
            # Remark
            BZ_49 = found(f'备注:(?:[\s]*)({self.reStr})(?:\n)')
            # Contact address
            LXDZ_51 = found(f'联系地址:(?:[\s]*)([{self.reStr}]*)(?:\s)')
            # Contact telephone
            LXDH_52 = found(
                '联系电话:(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)')
            # Publicity period
            GSQ_53 = found(f'公示时间:(?:[\s]*)([{self.reStr}]*)(?:\n)')
        else:
            htmlTable = htmlTableTransformer()
            tdData = htmlTable.tableTrTdRegulation(table)
            # Parcel number / number
            ZDBH_32 = tdData.get('编号') or tdData.get('地块编号')
            # Parcel location
            DKWZ_33 = tdData.get('地块位置')
            # Land use / use
            TDYT_34 = tdData.get('用途') or tdData.get('土地用途')
            # Land area, trying three column headings in turn
            TDMJ_35 = (tdData.get('地块面积(平方米)')
                       or tdData.get('地块面积(㎡)')
                       or tdData.get('宗地面积(平方米)'))
            # Transfer method
            CRFS_42 = tdData.get('出让方式')
            # Plot ratio
            RJL_36 = tdData.get('容积率')
            # Building density (%)
            JZMD_37 = tdData.get('建筑密度(%)')
            # Green-space ratio (%)
            LDL_38 = tdData.get('绿地率(%)')
            # Reserve price
            DJ_40 = tdData.get('底价(万元)')
            # Deposit
            BZJ_39 = tdData.get('保证金(万元)')
            # Transfer term
            CRNX_43 = tdData.get('出让年限')
            # Transaction price
            CJJ_44 = tdData.get('成交价(万元)') or tdData.get('成交价格(万元)')
            # Buyer
            CJR_48 = tdData.get('成交人')
            # Remark
            BZ_49 = tdData.get('备注')
            # NOTE(review): the four text lookups below are assumed to belong
            # to the table branch (at function level they would overwrite the
            # regex branch's values) -- confirm against the original file.
            # Publicity period
            GSQ_53 = reFunction(
                f'公示期:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)
            # Contact unit (matched on the contact-person label)
            LXDW_50 = reFunction(
                f'联 系 人:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)
            # Contact address
            LXDZ_51 = reFunction(
                f'联系地址:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)
            # Contact telephone
            LXDH_52 = reFunction(
                f'联系电话:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)

        # Crawl time
        crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Crawled URL
        url = url if url else response.url
        # Unique mark; table lookups may have produced None, which must not
        # break the concatenation.
        md5Mark = encrypt_md5(url + (ZDBH_32 or '') + (DKWZ_33 or ''))

        # Store the row
        csvFile = [
            WJBT_27,
            XXSJ_28,
            ZWBT_29,
            GGXH_31,
            ZDBH_32,
            DKWZ_33,
            TDYT_34,
            TDMJ_35,
            RJL_36,
            JZMD_37,
            LDL_38,
            BZJ_39,
            DJ_40,
            JRZJMJ_41,
            CRFS_42,
            CRNX_43,
            CJJ_44,
            SRDW_45,
            TDSYTJ_46,
            JYSJ_47,
            CJR_48,
            BZ_49,
            LXDW_50,
            LXDZ_51,
            LXDH_52,
            GSQ_53,
            crawlingTime,
            url,
            md5Mark,
        ]
        # Missing values (None) are written as empty cells; str.join would
        # otherwise raise TypeError and the whole row would be lost.
        self.fileDetail.write(','.join([
            cell.replace(',', ' ').replace('\n', '').replace('\r', '')
            if cell else '' for cell in csvFile
        ]))
        self.fileDetail.write('\n')
        self.log(f'数据获取成功', level=logging.INFO)
        yield  # TODO
    except Exception as e:
        self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                 level=logging.ERROR)
def parse_detail(self, response):
    """Parse a land-supply contract detail page and append one CSV row.

    The page body is decoded as GBK and flattened to text; each field is the
    text between its own label and the next label.  Four listing-level values
    (``signTime``, ``administration``, ``parcelLocation``, ``totalArea``) come
    from ``response.meta``.  The row is written to ``self.fileDetail``.

    This is a generator that yields ``None`` once after the row is written.
    Any exception is logged instead of being propagated.
    """
    try:
        data = Selector(text=response.body.decode('gbk'))
        items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')

        # Shared fields passed along from the listing page
        signTime = response.meta.get('signTime')
        administration = response.meta.get('administration')
        parcelLocation = response.meta.get('parcelLocation')
        totalArea = response.meta.get('totalArea')

        # Detail page
        # Project name
        projectName = reFunction('项目名称:(?:\s*)([\s\S]*)项目位置', items).strip()
        # Project location
        projectLocation = reFunction('项目位置:(?:\s*)([\s\S]*)面积(公顷)', items).strip()
        # Area (hectares)
        area = reFunction('面积(公顷):(?:\s*)([\s\S]*)土地来源', items).strip()
        # Land source
        landSource = reFunction('土地来源:(?:\s*)([\s\S]*)土地用途', items).strip()
        # Land use
        landPurpose = reFunction('土地用途:(?:\s*)([\s\S]*)供地方式', items).strip()
        # Supply method
        supplyType = reFunction('供地方式:(?:\s*)([\s\S]*)土地使用年限', items).strip()
        # Land-use term
        landUsegeTerm = reFunction('土地使用年限:(?:\s*)([\s\S]*)行业分类', items).strip()
        # Industry classification
        classification = reFunction('行业分类:(?:\s*)([\s\S]*)土地级别', items).strip()
        # Land level.  Anchored on its own label; the previous pattern reused
        # the industry-classification label and therefore captured that field
        # together with this one.
        landLevel = reFunction('土地级别:(?:\s*)([\s\S]*)成交价格', items).strip()
        # Transaction price
        transferPrice = reFunction('成交价格(万元):(?:\s*)([\s\S]*)分期支付约定', items).strip()

        # TODO (original): instalment agreement section
        stagesData = reFunction('分期支付约定:(?:\s*)([\s\S]*)土地使用权人', items).strip()
        # Instalment agreement - agreed payment dates
        paymentDate = '|'.join(
            strfTime(day)
            for day in filter(None, re.findall('\d{4}年\d{2}月\d{2}日', stagesData)))
        # Instalment agreement - instalment number / amount / remark
        issue = ''
        paymentAmount = ''
        remark = ''
        # One instalment per '年' found in the section text.  Row ids start at
        # 9 (per the original author); each instalment is read via XPath.
        for offset in range(len(re.findall('年', stagesData))):
            rowId = offset + 9
            issueCell = data.xpath(f'//*[@id="r-{rowId}-0"]/td[1]/text()').extract_first()
            amountCell = data.xpath(f'//*[@id="r-{rowId}-0"]/td[3]/text()').extract_first()
            remarkCell = data.xpath(f'//*[@id="r-{rowId}-0"]/td[4]/text()').extract_first()
            issue += issueCell + '|' if issueCell else ' '
            paymentAmount += amountCell + '|' if amountCell else ' '
            remark += remarkCell + '|' if remarkCell else ' '

        # TODO (original)
        # Land-use right holder
        landHolder = reFunction('土地使用权人:(?:\s*)([\s\S]*)约定容积率', items).strip()
        # NOTE: the two names below are the reverse of their contents --
        # plotRatioLOWER holds the text after the '上限' (upper limit) label and
        # plotRatioUP the text after '下限' (lower limit).  Names and CSV order
        # are kept as they were so the output layout does not change.
        plotRatioLOWER = reFunction('上限:(?:\s*)([\s\S]*)约定交地时间', items).strip()
        plotRatioUP = reFunction('下限:(?:\s*)([\s\S]*)上限', items).strip()
        # Agreed land-delivery time
        agreedDeliveryTime = strfTime(reFunction('约定交地时间:(?:\s*)([\s\S]*)约定开工时间', items).strip())
        # Agreed construction start time
        agreedStartTime = strfTime(reFunction('约定开工时间:(?:\s*)([\s\S]*)约定竣工时间', items).strip())
        # Agreed completion time
        agreedCompletionTime = strfTime(reFunction('约定竣工时间:(?:\s*)([\s\S]*)实际开工时间', items).strip())
        # Actual construction start time
        actualStartTime = strfTime(reFunction('实际开工时间:(?:\s*)([\s\S]*)实际竣工时间', items).strip())
        # Actual completion time
        actualCompletionTime = strfTime(reFunction('实际竣工时间:(?:\s*)([\s\S]*)批准单位', items).strip())
        # Approving unit
        approvedUnit = reFunction('批准单位:(?:\s*)([\s\S]*)合同签订日期', items).strip()
        # Contract signing date
        contractTime = strfTime(reFunction('合同签订日期:(?:\s*)([\s\S]*)', items).strip())

        # Crawl time
        crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Crawled URL
        url = response.url
        # Unique mark
        md5Mark = encrypt_md5(landSource + projectLocation + projectName + url)

        # The trailing '\n' entry is cleaned to '' below, so every row keeps
        # ending with an empty last cell exactly as before.
        csvFile = [
            administration, parcelLocation, totalArea, landPurpose, signTime,
            projectName, projectLocation, area, landSource, supplyType,
            landUsegeTerm, classification, landLevel, transferPrice, issue,
            paymentDate, paymentAmount, remark, landHolder, plotRatioUP,
            plotRatioLOWER, agreedDeliveryTime, agreedStartTime,
            agreedCompletionTime, actualStartTime, actualCompletionTime,
            approvedUnit, contractTime, crawlingTime, url, md5Mark, '\n',
        ]
        # Store the row.  Values missing from response.meta are None; write
        # them as empty cells instead of letting str.join raise TypeError.
        self.fileDetail.write(','.join([
            cell.replace(',', ' ').replace('\n', '').replace('\r', '')
            if cell else '' for cell in csvFile
        ]))
        self.fileDetail.write('\n')
        yield  # TODO
    except Exception as e:
        self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
tdList = table.tbody.find_all('tr')[1].find_all('td') for _ in range(1, len(tdList) + 1): table.tbody.find_all('tr')[0].insert(number + _, tdList[_ - 1]) tdReplace.extract() table.tbody.find_all('tr')[1].extract() except: pass htmlTable = htmlTableTransformer() tdData = htmlTable.tableTrTdRegulation(table) # 宗地编号 ZDBH_4 = tdData.get('地块编号') # 宗地坐落 ZDZL_5_ = tdData.get('土地位置') if tdData.get('土地位置') else tdData.get( '地块位置/名称') ZDZL_5 = ZDZL_5_.replace( reFunction(f'备注(?:[\s]*)([{reStr}]*)\s', reFunction('一([\s\S]*)二', items)), '') # 面积 MJ_6 = tdData.get('土地面积(m2)') if tdData.get( '土地面积(m2)') else tdData.get('土地面积(平方米)') # 土地用途 TDYT_7 = tdData.get('土地用途') if tdData.get('土地用途') else tdData.get( '规划地性质') # 出让年限 CRNX_8_ = tdData.get(r'出让\u3000年限') if tdData.get( r'出让\u3000年限') else tdData.get('出让年限') CRNX_8 = CRNX_8_ if CRNX_8_ else tdData.get('出让年限(年)') # 容积率 RJL_9 = tdData.get('容积率') # 绿地率 LDL_10 = tdData.get('绿地率') if tdData.get('绿地率') else tdData.get( '绿地率(%)')
def parse_detail(self, response):
    """Parse a sz68.com land listing and append one CSV row.

    Data is gathered from five places:

    1. the announcement document embedded in the page's iframe (downloaded
       separately with ``requests``),
    2. the basic-information, listing-details and bidding-record sections of
       the page itself,
    3. the result-notice service, queried with ``response.meta['id']``,
    4. the attachment links, each downloaded under ``self.dirName``,
    5. listing-level values passed in ``response.meta``.

    The row is written to ``self.fileDetail``.  Any exception, including an
    announcement request that does not return 200, is logged instead of being
    propagated.
    """
    try:
        data = Selector(text=response.body.decode('utf-8'))
        # The announcement lives in iframe "externalframe1", else "externalframe0".
        frameSrc = data.xpath(
            '//iframe[@id="externalframe1"]/@src').extract_first()
        if not frameSrc:
            frameSrc = data.xpath(
                '//iframe[@id="externalframe0"]/@src').extract_first()
        noticeDetail = 'https://www.sz68.com' + frameSrc

        ZWBT = ''
        GGQ = ''
        GPKSSJ = ''
        GPJSSJ = ''
        ZDDM_DKZDBH = ''
        DKWZ = ''
        DKYT = ''
        ZRHYLB = ''
        JZMJ = ''
        TDSYNX = ''
        TDFZXZ = ''
        RJL = ''
        GPQSJ = ''
        JMBZJ = ''
        ZBJJZSJ = ''
        DZ = ''
        DH = ''
        # Result-notice fields stay empty when the result request fails, so
        # the row can still be written.
        ZWBT_A = ''
        FBRQ = ''
        ZDH_C = ''
        JDR = ''
        ZBR_A = ''
        TDYT_A = ''
        TDMJ_C = ''
        JZMJ_B = ''
        QSJ_D = ''
        CJJ_A = ''
        YJL = ''
        ZHLMDJ = ''

        # Values carried over from the listing page
        JYSJ = response.meta.get('JYSJ')
        JYZT = response.meta.get('JYZT')
        ZDH = response.meta.get('ZDH')
        TDWZ = response.meta.get('TDWZ')
        QSJ = response.meta.get('QSJ')
        TDYT = response.meta.get('TDYT')
        TDMJ = response.meta.get('TDMJ')
        targetId = response.meta.get('id')

        # Announcement details
        # NOTE(review): verify=False disables TLS certificate verification
        # for this request; kept as in the original.
        detailData = requests.get(noticeDetail,
                                  headers=self.header,
                                  allow_redirects=False,
                                  timeout=60,
                                  verify=False)
        if detailData.status_code != 200:
            raise IntegrationException(f'获取公告详情失败, url: {noticeDetail}')

        detail = Selector(text=detailData.content.decode('utf-8'))
        items = str(detail.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '').replace('\n',
                                                      '').replace(' ', '')
        # Body title
        ZWBT = ''.join(
            detail.xpath(
                '/html/body/div/p[2]/span//text() | /html/body/p[2]/span//text()|/html/body/p[1]/span//text()'
            ).extract())
        # Announcement period
        GGQ = reFunction('公告期自([\w \-\s]*)[止]?,', items)
        # Listing start time
        GPKSSJ = reFunction(
            '挂牌期自(\d{4}年\d{1,2}月\d{1,2}日)[起]?至(?:\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时)止',
            items)
        # Listing end time
        GPJSSJ = reFunction(
            '挂牌期自(?:\d{4}年\d{1,2}月\d{1,2}日)[起]?至(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时)止',
            items)

        # TODO (original): parse the table on the page
        soup = BeautifulSoup(detailData.text)
        table = soup.find('body').find('div').find('table')
        if not table:
            table = soup.find('table')
        htmlTable = htmlTableTransformer()
        tdData = htmlTable.table_tr_td(table)
        # Parcel code / plot parcel number
        ZDDM_DKZDBH = tdData.get('宗地编号') or tdData.get('地块宗地编号')
        # Parcel number
        ZDH_A = tdData.get('宗地号')
        # Land location
        DKWZ = tdData.get('土地位置')
        # Land use
        DKYT = tdData.get('土地用途')
        # Admitted industry category
        ZRHYLB = tdData.get('准入行业类别')
        # Land area
        TDMJ_A = tdData.get('土地面积(平方米)') or tdData.get('土地面积')
        # Floor area / total floor area
        JZMJ = tdData.get('建筑面积(平方米)') or tdData.get('总建筑面积')
        # Listing starting price
        GPQSJ = tdData.get('挂牌起始价(人民币、万元)')
        # Bidding (tender) deposit
        JMBZJ = tdData.get('竞买(投标)保证金(人民币、万元)')
        # Land-use term
        TDSYNX = tdData.get('土地使用年期')

        if not detail.xpath('//table').extract():
            # No table in the announcement: read the fields from its text.
            # Parcel code / plot parcel number
            ZDDM_DKZDBH = reFunction('宗地编号([\w \-\s]*),', items)
            # Land-use term
            TDSYNX = reFunction('土地使用年[\s期限]*[为]?(\d*年)', items)
            # Current development status of the land
            TDFZXZ = reFunction('土地的发展建设现状:([\S\s]*。)', items)
            # Plot ratio
            RJL = reFunction('容积率[\D]*([\.\d]*)。', items)
            # Land location
            DKWZ = reFunction('宗地位于([\w \s]*),', items)
            # Land use
            DKYT = reFunction('土地用途为([\w \s]*),', items)
        # TODO (original): one more page layout may need parsing.
        # NOTE(review): the three text lookups below are assumed to run for
        # every announcement (with or without a table); the collapsed source
        # does not show their indentation -- confirm against the original.
        # Deposit deadline
        ZBJJZSJ = reFunction(
            '保证金的到账截止时间为(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时\d{1,2}分)',
            items)
        # Address
        DZ = '|'.join(
            re.findall('地址:([\w \.\-\s\/\%,\(\)。 \? \! 、:]*);咨询电话', items))
        # Telephone
        DH = '|'.join(
            re.findall('咨询电话:([\w \.\-\s\/\%,\(\)。 \? \! 、]*)[;。]', items))

        def first_text(xpath):
            """Return the first text node selected by *xpath* on the listing page."""
            return data.xpath(xpath).extract_first()

        # Basic information
        # Transaction method
        JYFS_A = first_text(
            '//div[@class="content_case1"]/div[1]/ul/li[2]/span/text()')
        # Transaction type
        JYLX = first_text(
            '//div[@class="content_case1"]/div[1]/ul/li[1]/span/text()')
        # Parcel
        ZD = first_text('//div[@class="content_case1"]/div[1]/div/text()')
        # Publication time
        FBSJ = first_text(
            '//div[@class="content_case1"]/div[2]/span[2]/text()')
        # Transaction status
        JYZT_A = first_text(
            '//div[@class="content_case1"]/div[2]/span[3]/text()')
        # Winning bidder
        ZBR_24 = first_text(
            '//div[@class="right_first"]/div[1]/div[2]/text()')
        # Transaction price
        CJJ_25 = first_text(
            '//div[@class="right_first"]/div[2]/div[2]/text()')
        # Deposit
        BZJ_26 = first_text(
            '//div[@class="right_first twin"][1]/div[1]/div[2]/text()')
        # Starting price
        QSJ_A = first_text(
            '//div[@class="right_first twin"][1]/div[2]/div[2]/text()')
        # Bid increment
        JJJT_28 = first_text(
            '//div[@class="right_first twin"][2]/div[1]/div[2]/text()')
        # Ceiling price
        FDJ_29 = first_text(
            '//div[@class="right_first twin"][2]/div[2]/div[2]/text()')
        # Bid application deadline
        JMSQJZSJ_30 = first_text(
            '//div[@class="right_first twin"][3]/div[1]/div[2]/text()')
        # Number of bidders
        JMRS_31 = first_text(
            '//div[@class="right_first twin"][3]/div[2]/div[2]/text()')

        # Listing details: twelve spans read by position
        BDdetail = data.xpath(
            '//li[@class="weather_info_ul_item"]/div[2]/span')

        def span_text(position):
            """Return the text of the listing-details span at *position*."""
            return BDdetail[position].xpath('text()').extract_first()

        # Parcel number
        ZDH_B = span_text(0)
        # Land area
        TDMJ_B = span_text(1)
        # Floor area
        JZMJ_A = span_text(2)
        # Plot ratio
        RJL_A = span_text(3)
        # Building coverage
        JZFGL = span_text(4)
        # Building height
        JZGD = span_text(5)
        # Use
        YT = span_text(6)
        # Term of use
        SYNX = span_text(7)
        # District
        QY = span_text(8)
        # Location
        # NOTE(review): WZ is overwritten below by the result notice's
        # location when that request succeeds, so both WZ columns then carry
        # the result value.  Left unchanged because md5Mark depends on it.
        WZ = span_text(9)
        # Green-space ratio
        LDL = span_text(10)
        # Number of floors
        JZLC = span_text(11)

        # Bidding record (first data row only)
        # Bidder
        JMR = first_text('//div[@class="conomy"][1]/table/tr[2]/td[2]/text()')
        # Bid amount
        JMSJ = first_text('//div[@class="conomy"][1]/table/tr[2]/td[3]/text()')
        # Bid time
        CJSJ = first_text('//div[@class="conomy"][1]/table/tr[2]/td[4]/text()')
        # Status
        ZT = first_text('//div[@class="conomy"][1]/table/tr[2]/td[5]/text()')

        # Result notice
        results = requests.post(
            'https://www.sz68.com/tiaim/web/resultdetailbytargetId',
            headers=self.header,
            data={'targetId': targetId},
            allow_redirects=False,
            timeout=60,
            verify=False)
        if results.status_code == 200:
            resultsData = results.json()
            notice = resultsData.get('notice')
            resultText = resultsData.get('fileExtName')
            # Body title
            ZWBT_A = notice.get('NAME')
            # Publication date
            FBRQ = notice.get('PUBLISH_TIME')
            # Parcel number
            ZDH_C = notice.get('DTL_REF_NO')
            # Winning bidder (two label spellings)
            JDR = reFunction('竞得人:([\w \.\-\s\/\%,]*)<', resultText)
            ZBR_A = reFunction('中标人:([\w \.\-\s\/\%,]*)<', resultText)
            # Location
            WZ = reFunction('位置:([\w \.\-\s\/\%,、]*)<', resultText)
            # Land use
            TDYT_A = reFunction('土地用途:([\w \.\-\s\/\%,、]*)<', resultText)
            # Land area
            TDMJ_C = reFunction('土地面积:([\w \.\-\s\/\%,、]*)<', resultText)
            # Floor area
            JZMJ_B = reFunction('建筑面积:([\w \.\-\s\/\%,、]*)<', resultText)
            # Starting price
            QSJ_D = reFunction('起始价:([\w \.\-\s\/\%,、]*)<', resultText)
            # Transaction price
            CJJ_A = reFunction('成交价:([\w \.\-\s\/\%,、]*)<', resultText)
            # Premium rate
            YJL = reFunction('溢价率:([\w \.\-\s\/\%,、]*)<', resultText)
            # Composite floor unit price
            ZHLMDJ = reFunction('综合楼面单价:([\w \.\-\s\/\%,、]*)<', resultText)

        # Attachments: download each one; record the names that succeeded.
        accessory = '土地模块|'
        for link in data.xpath('//div[@class="accessory_link"]/a'):
            rawName = link.xpath(
                'text()[position()=((position() mod 2)=0)]').extract_first()
            fileName = rawName.strip() if rawName else '未知名称'
            try:
                href = link.xpath('@href').extract_first()
                linkPath = self.dirName + f'土地模块_{ZDH}' + fileName
                # A separate name is used for the download so that `response`
                # (and therefore the recorded page URL) is not overwritten.
                fileResponse = requests.get(href,
                                            headers=self.header,
                                            timeout=200)
                with open(linkPath, 'wb') as fp:
                    fp.write(fileResponse.content)
            except Exception as e:
                # Best effort: a failed attachment must not lose the row.
                self.log(f'attachment download failed: {fileName}, error: {e}',
                         level=logging.WARNING)
            else:
                accessory += fileName + '|'

        # Crawl time
        crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Crawled URL
        url = response.url
        md5Mark = encrypt_md5(ZDH + WZ + ZWBT + url)

        # Column order is unchanged (TDSYNX intentionally appears twice, as
        # before); the listing-details floor area now uses JZMJ_A instead of
        # repeating the result notice's JZMJ_B.
        csvFile = [
            JYSJ, JYZT, ZDH, TDWZ, QSJ, TDYT, TDMJ,
            JYFS_A, JYLX, ZD, FBSJ, JYZT_A, ZBR_24, CJJ_25, BZJ_26, QSJ_A,
            JJJT_28, FDJ_29, JMSQJZSJ_30, JMRS_31,
            ZWBT, GGQ, GPKSSJ, GPJSSJ, ZDDM_DKZDBH, ZDH_A, DKWZ, DKYT,
            ZRHYLB, TDMJ_A, JZMJ, TDSYNX, TDFZXZ, RJL, GPQSJ, JMBZJ, TDSYNX,
            ZBJJZSJ, DZ, DH,
            ZDH_B, TDMJ_B, JZMJ_A, RJL_A, JZFGL, JZGD, YT, SYNX, QY, WZ,
            LDL, JZLC,
            JMR, JMSJ, CJSJ, ZT,
            ZWBT_A, FBRQ, ZDH_C, JDR, ZBR_A, WZ, TDYT_A, TDMJ_C, JZMJ_B,
            QSJ_D, CJJ_A, YJL, ZHLMDJ,
            crawlingTime, url, md5Mark, accessory,
        ]
        fileData = []
        for cell in csvFile:
            try:
                fileData.append(
                    cell.replace(',', ' ').replace('\n', '').replace('\r', ''))
            except (AttributeError, TypeError):
                # Non-string values (e.g. None) are written via str().
                fileData.append(str(cell))
        self.fileDetail.write(','.join(fileData))
        self.fileDetail.write('\n')
    except Exception as e:
        self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
def parse_detail(self, response):
    """Parse a land-transfer announcement page and append one CSV row per parcel.

    Announcement-wide fields (title, dates, listing period, contacts, deposit
    and transfer-fee accounts) are extracted once from the flattened page text.
    Parcel fields are read from the page's HTML tables when none of the known
    free-text section markers occurs in the text; otherwise, or when table
    parsing raises, they are read with regexes from the "宗地编号" sections
    found between "一" and "二、".

    Args:
        response: the downloaded detail page; ``response.meta['title']`` is
            used as the file/body title.

    Yields:
        ``None`` once for every row appended to ``self.pathDetail``.
    """
    # TODO: the spider-initiated shutdown (close_spider) is still an open issue.
    try:
        data = Selector(text=response.body.decode('utf-8'))
        items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')
        htmlTable = htmlTableTransformer()

        # Character classes shared by the field regexes (contents kept verbatim).
        wide = r'()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪'
        tight = r'()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪'
        tight_lt = r'()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪'
        acct = r'\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪'
        acct_alt = r'\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪'

        # Per-parcel and account columns default to empty; several are never
        # populated by any branch but still occupy a CSV column.
        ZDBH_7 = DKWZ_8 = ZDWZ_9 = ZDZL_10 = TDYT_11 = GHTDYT_12 = ''
        CRNX_13 = SYNX_14 = PZJGJWH_15 = GHYDMJ_16 = GHMJ_17 = CRMJ_18 = ''
        CRYDMJ_19 = ZDCRMJ_20 = JZMD_21 = RJL_22 = LDL_23 = LDL_24 = ''
        JZKZGD_25 = JZKZZGD_26 = JZXS_27 = TZQD_28 = TDGJBAH_29 = SFSZD_30 = ''
        TDXZTJ_31 = JMBZJ_32 = JMBZJ_72 = QJJ_33 = CRQSJ_34 = JJFD_35 = ''
        SFSZBLJ_36 = ''
        BZJZH_86 = BZJZH_87 = BZJZH_88 = ''
        CRJZH_97 = CRJZH_98 = CRJZH_99 = ''

        def pick(columns, index, *headers):
            """Return element `index` of the first non-empty column among `headers`, else ''."""
            for header in headers:
                column = columns.get(header)
                if column:
                    return column[index]
            return ''

        def first_match(pattern, text):
            """Return the first `re.findall` hit of `pattern` in `text`, or ''."""
            found = re.findall(pattern, text)
            return found[0] if found else ''

        def store_row(required):
            """Do duplicate accounting, then append the current field values as a CSV row.

            Returns True when a row was written (the caller then yields).
            """
            if self.name in DUPLICATE_SWITCH_LIST and self.redisClient.isExist(md5Mark):
                # Fingerprint already known: count it towards the shutdown threshold.
                self.duplicateUrl += 1
            if self.duplicateUrl >= 50:
                self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url)
                return False
            if not required:
                return False
            csvFile = [
                WJBT_1, XXSJ_2, WBT_3, GGBH_4, CRSJ_5, GGNX_6, ZDBH_7, DKWZ_8, ZDWZ_9, ZDZL_10,
                TDYT_11, GHTDYT_12, CRNX_13, SYNX_14, PZJGJWH_15, GHYDMJ_16, GHMJ_17, CRMJ_18,
                CRYDMJ_19, ZDCRMJ_20, JZMD_21, RJL_22, LDL_23, LDL_24, JZKZGD_25, JZKZZGD_26,
                JZXS_27, TZQD_28, TDGJBAH_29, SFSZD_30, TDXZTJ_31, JMBZJ_32, JMBZJ_72, QJJ_33,
                CRQSJ_34, JJFD_35, SFSZBLJ_36, GPKSSJ_37, GPJZSJ_38, HQCRWJSJ_39, TJJMSQSJ_40,
                BZJJZSJ_41, QRJMZGSJ_42, LXDZ_43, LXDH_44, LXR_45, BZJZH_86, BZJZH_87, BZJZH_88,
                CRJZH_97, CRJZH_98, CRJZH_99, crawlingTime, url, md5Mark,
            ]
            cells = []
            for value in csvFile:
                try:
                    # Blank out empty values and values made only of '|' separators.
                    if value and value != '|' * len(value):
                        cells.append(
                            value.replace(',', ' ').replace('\n', '').replace('\t', '').replace('\r', '')
                            .replace(r'\xa0', '').replace('\xa0', ''))
                    else:
                        cells.append('')
                except Exception as e:
                    cells.append('')
                    self.log(f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                             level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                # Every cell, including the last, is followed by a comma.
                fp.write(''.join(cell + ',' for cell in cells))
                fp.write('\n')
            self.log('数据获取成功', level=logging.INFO)
            return True

        # ---- Announcement-wide fields ----
        # File title
        WJBT_1 = response.meta.get('title').strip()
        # Publication date
        XXSJ_2 = reFunction(r'[\d\-]*', data.xpath('//p[@class="sub-cp"]/text()').extract_first())
        # Body title (same as the file title)
        WBT_3 = WJBT_1
        # Announcement number: text of the leading paragraphs that contain "号"
        GGBH_4 = ''.join(data.xpath(
            '//div[@class="substance"]/p[position() <5]/.//*[contains(text(),"号")]/ancestor::p/.//text()'
        ).extract())
        # Transfer date
        CRSJ_5 = reFunction(r'定于\s*([()【】\w\.—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*)[,;,、在]', items)
        # Announcement type
        GGNX_6 = '出让公告'
        # Crawl date
        crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
        # Crawled URL
        url = response.url
        # Page fingerprint used for duplicate detection
        md5Mark = encrypt_md5(url + WJBT_1 + XXSJ_2)

        GPSJ_0 = reFunction(rf'挂牌交易期限:*\s*([{wide}]*)[\s。]', items)
        GPSJ_1 = reFunction(rf'申请人可于:*\s*([{wide}]*)到', items)
        GPSJ = GPSJ_0 if GPSJ_0 else GPSJ_1
        # Listing start / end time
        GPKSSJ_37 = reFunction(rf'挂牌开始时间:*\s*([{wide}]*)\s', items)
        GPJZSJ_38 = reFunction(rf'挂牌截止时间:*\s*([{wide}]*)\s', items)
        if GPSJ:
            # A "<start>至<end>" period overrides the individually matched times.
            period = GPSJ.split('至')
            GPKSSJ_37 = period[0]
            if len(period) > 1:
                GPJZSJ_38 = period[1]
        # Time window for obtaining the transfer documents
        HQCRWJSJ_39 = GPSJ_1
        # Application window, deposit deadline and qualification confirmation
        # all use the same "<date> ... 至 <date>" match.
        date_span = reFunction(
            r'(\d{4}年\d{1,3}月\d{1,3}日(?:[' + wide + r']*)至 \d{4}年\d{1,3}月\d{1,3}日)', items)
        TJJMSQSJ_40 = date_span
        BZJJZSJ_41 = date_span
        QRJMZGSJ_42 = date_span
        # Contact address / phone / person
        LXDZ_43 = reFunction(rf'联系地址:([{wide}]*)\s', items)
        LXDH_44 = reFunction(rf'联系电话:([{wide}]*)\s', items)
        LXR_45 = reFunction(rf'联\s*系\s*人:([{wide}]*)\s', items)

        # ---- Deposit account (BZJZH_86..88) ----
        # NOTE(review): the three branches below do not put account name,
        # number and bank into the same columns; kept as found — confirm
        # against the CSV header.
        ZH_0 = reFunction(
            r'以下账户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪\s]*)[一二三四五六七八九123456789]*', items)
        ZH_1 = reFunction(r'保证金帐户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s*', items)
        try:
            if ZH_0:
                # '+' rather than '*': a pattern that can match the empty
                # string makes re.split cut between every character on 3.7+.
                parts = re.split('[①②③④]+', ZH_0)
                if ZH_0[:2] == '户名':
                    BZJZH_86 = parts[0].replace('户名:', '') if parts[0] else ''
                    BZJZH_87 = '|'.join(re.split(',|,', part)[0] for part in parts[1:])
                    BZJZH_88 = '|'.join(re.split(',|,', part)[-1] for part in parts[1:])
                else:
                    # Each value is guarded by the same pattern it is extracted with.
                    BZJZH_86 = '|'.join(first_match(rf'开\s*户\s*行:*\s*([{acct}]*)\s', part) for part in parts)
                    BZJZH_87 = '|'.join(first_match(rf'户\s*名:*\s*([{acct}]*)\s', part) for part in parts)
                    BZJZH_88 = '|'.join(first_match(rf'账\s*号:*\s*([{acct}]*)\s', part) for part in parts)
            elif ZH_1:
                BZJZH_86 = '|'.join(re.findall(rf'开户[单位名称]*:*\s*([{acct_alt}]*)\s', ZH_1)).replace(';', '')
                BZJZH_87 = '|'.join(re.findall(rf'开\s*户\s*行:*\s*([{acct_alt}]*)\s', ZH_1)).replace(';', '')
                BZJZH_88 = '|'.join(re.findall(rf'帐\s*号:*\s*([{acct_alt}]*)\s', ZH_1)).replace(';', '')
        except Exception as e:
            # Best effort: account details are optional, keep what was parsed.
            self.log(f'保证金账户解析失败, 请求:{response.url}, 错误: {e}', level=logging.DEBUG)

        # ---- Transfer-fee account (CRJZH_97..99) ----
        CR = reFunction(r'出让金帐户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s*', items)
        try:
            if CR:
                # Account holder
                CRJZH_97 = '|'.join(re.findall(rf'开户[单位名称]*:*\s*([{acct_alt}]*)\s', CR)).replace(';', '')
                # Bank
                CRJZH_98 = '|'.join(re.findall(rf'开\s*户\s*行:*\s*([{acct_alt}]*)\s', CR)).replace(';', '')
                # Account number
                CRJZH_99 = '|'.join(re.findall(rf'帐\s*号:*\s*([{acct_alt}]*)\s', CR)).replace(';', '')
        except Exception as e:
            self.log(f'出让金账户解析失败, 请求:{response.url}, 错误: {e}', level=logging.DEBUG)

        # ---- Parcel rows ----
        needs_text_parse = True
        if ('拍卖出让地块的基本情况和规划指标要求' not in items and '备注' not in items
                and '挂牌出让地块的基本情况和规划指标要求' not in items):
            try:
                soup = BeautifulSoup(response.body.decode('utf-8'))
                tables = soup.find_all('table')
                if '规划用途及主要指标' in items:
                    # Two-row header: the first row holds a colspan=4 cell whose
                    # sub-headers sit in the second row. Move the sub-headers
                    # into the first row, then drop the merged cell and the
                    # now-redundant second row.
                    table = soup.find('table')
                    rows = table.tbody.find_all('tr')
                    header_row, sub_row = rows[0], rows[1]
                    merged_cell = header_row.find('td', colspan='4')
                    anchor = header_row.index(merged_cell)
                    for offset, cell in enumerate(sub_row.find_all('td'), start=1):
                        header_row.insert(anchor + offset, cell)
                    merged_cell.extract()
                    sub_row.extract()
                    tdData = htmlTable.tableTrTdChangeToList(table)
                    for i in range(len(list(tdData.values())[0])):
                        # Parcel number
                        ZDBH_7 = pick(tdData, i, '宗地编号')
                        # Transfer area (m2)
                        CRMJ_18 = pick(tdData, i, '土地面积(㎡)')
                        # Plot ratio
                        RJL_22 = pick(tdData, i, '容积率')
                        # Green ratio (%)
                        LDL_24 = pick(tdData, i, '绿地率(%)')
                        # Building coefficient (%)
                        JZXS_27 = pick(tdData, i, '建筑系数(%)')
                        # Bid deposit (10k CNY)
                        JMBZJ_72 = pick(tdData, i, '竞买保证金(万元)')
                        # Starting price (10k CNY)
                        CRQSJ_34 = pick(tdData, i, '挂牌出让起始价(万元)')
                        # Bid increment
                        JJFD_35 = pick(tdData, i, '加价幅度')
                        if store_row(ZDBH_7):
                            yield
                elif len(tables) <= 3:
                    tdsList = {}
                    for table in tables:
                        tdsList.update(htmlTable.tableTrTdRegulationToList(table))
                    for i in range(len(list(tdsList.values())[0])):
                        # Parcel number
                        ZDBH_7 = pick(tdsList, i, '宗地编号')
                        # Plot number / plot name
                        DKWZ_8 = pick(tdsList, i, '地块编号')
                        # Parcel position
                        ZDWZ_9 = pick(tdsList, i, '宗地位置')
                        # Parcel location
                        ZDZL_10 = pick(tdsList, i, '宗地坐落')
                        # Land use
                        TDYT_11 = pick(tdsList, i, '土地用途')
                        # Planned land use
                        GHTDYT_12 = pick(tdsList, i, '规划土地用途')
                        # Transfer term
                        CRNX_13 = pick(tdsList, i, '出让年限')
                        # Use term
                        SYNX_14 = pick(tdsList, i, '使用年限')
                        # Approving authority and document number
                        PZJGJWH_15 = pick(tdsList, i, '批准机关及文号', '批准文号')
                        # Planned land area (m2)
                        GHYDMJ_16 = pick(tdsList, i, '规划用地面积(m2)', '用地面积(㎡)', '规划用地面积(㎡)')
                        # Transfer area (m2)
                        CRMJ_18 = pick(tdsList, i, '出让面积(㎡)')
                        # Planned area (m2)
                        GHMJ_17 = pick(tdsList, i, '规划面积(㎡)')
                        # Transferred land area (m2)
                        CRYDMJ_19 = pick(tdsList, i, '出让用地面积(m2)')
                        # Parcel transfer area
                        ZDCRMJ_20 = pick(tdsList, i, '宗地出让面积')
                        # Building density
                        JZMD_21 = pick(tdsList, i, '建筑密度(%)')
                        # Plot ratio
                        RJL_22 = pick(tdsList, i, '容积率')
                        # Green ratio — previously read the parcel-location
                        # column ('宗地坐落') by mistake.
                        LDL_23 = pick(tdsList, i, '绿地率')
                        # Green ratio (%)
                        LDL_24 = pick(tdsList, i, '绿地率', '绿地率(%)')
                        # Building height limit (m)
                        JZKZGD_25 = pick(tdsList, i, '建筑控制高度(m)')
                        # Building height limit (metres, Chinese unit)
                        JZKZZGD_26 = pick(tdsList, i, '建筑控制高度(米)')
                        # Investment intensity (10k CNY / hectare)
                        TZQD_28 = pick(tdsList, i, '投资强度(万元/公顷)')
                        # Bid deposit
                        JMBZJ_32 = pick(tdsList, i, '竞买保证金')
                        # Starting price (10k CNY)
                        CRQSJ_34 = pick(tdsList, i, '出让起始价')
                        # Bid deposit (10k CNY)
                        JMBZJ_72 = pick(tdsList, i, '竞买保证金(万元)')
                        # Opening price
                        QJJ_33 = pick(tdsList, i, '起始价', '出让起始价')
                        # Bid increment
                        JJFD_35 = pick(tdsList, i, '加价幅度')
                        if store_row(ZDBH_7):
                            yield
                elif len(tables) == 6:
                    # TODO: six-table layout is not handled yet.
                    pass
                needs_text_parse = False
            except Exception as e:
                # `except Exception` (not bare) so that GeneratorExit raised at
                # a `yield` above is not mistaken for a parsing failure.
                self.log(f'表格解析失败, 改用正文正则解析, 请求:{response.url}, 错误: {e}', level=logging.DEBUG)

        if needs_text_parse:
            # In this path the listing times always come from the explicit
            # labels, replacing any "至"-period values set above.
            listing_start = reFunction(rf'挂牌开始时间:*\s*([{wide}]*)\s', items)
            listing_end = reFunction(rf'挂牌截止时间:*\s*([{wide}]*)\s', items)
            sections = re.findall(r'一([\s\S]*)二、', items)[0].split('宗地编号')[1:]
            for item in ['宗地编号' + tail for tail in sections]:
                # Parcel number
                ZDBH_7 = reFunction(r'宗地编号:*\s*([\w\-]*)\s', item).replace('宗地位置', '').replace('地块名称', '')
                # Parcel location
                ZDZL_10 = reFunction(rf'宗地坐落:*\s*([{tight}]*)\s', item)
                # Land use
                TDYT_11 = reFunction(rf'土地用途:*\s*([{tight_lt}]*)\s', item)
                # Transfer term
                CRNX_13 = reFunction(rf'出让年限:*\s*([{tight_lt}]*)\s', item)
                # Parcel transfer area
                ZDCRMJ_20 = reFunction(rf'宗地\s*面积:*\s*([{tight}]*)\s', item)
                # Building density
                JZMD_21 = reFunction(rf'建筑密度\(%\):*\s*([{tight_lt}]*)\s', item)
                # Plot ratio
                RJL_22 = reFunction(rf'容积率:*\s*([{tight_lt}]*)\s', item)
                # Green ratio (%)
                LDL_24 = reFunction(rf'绿化率\(%\):*\s*([{tight_lt}]*)\s', item)
                # Building height limit (metres)
                JZKZZGD_26 = reFunction(rf'建筑限高\(米\):*\s*([{tight_lt}]*)\s', item)
                # Investment intensity
                TZQD_28 = reFunction(rf'投资强度:*\s*([{tight_lt}]*)\s', item)
                # Land valuation filing number
                TDGJBAH_29 = reFunction(rf'土地估价备案号:*\s*([{tight_lt}]*)\s', item)
                # Current land condition
                TDXZTJ_31 = reFunction(rf'土地现状:*\s*([{tight_lt}]*)\s', item)
                # Bid deposit
                JMBZJ_32 = reFunction(rf'保证金:*\s*([{tight_lt}]*)\s', item)
                # Opening price
                QJJ_33 = reFunction(rf'起始价:*\s*([{tight_lt}]*)\s', item)
                # Bid increment
                JJFD_35 = reFunction(rf'加价幅度:*\s*([{tight_lt}]*)\s', item)
                GPKSSJ_37 = listing_start
                GPJZSJ_38 = listing_end
                if store_row(ZDBH_7):
                    yield
    except Exception as e:
        print(response.url)
        self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
# Scratch harness: runs the "备注" (remark) regex against locally saved sample
# pages (./model<ID>.html) and prints what it captures.
# NOTE(review): statement nesting was reconstructed from a whitespace-collapsed
# source; in particular the final print is assumed to sit inside the inner
# loop — confirm against the original file.
for ID in [
        1,
        # 2,
        # 3,
        # 4,
]:
    with open(f'./model{ID}.html', 'r', encoding='utf-8') as fp:
        # datas = Selector(text=fp.read())
        # Flatten the page to plain text and strip NBSP / ideographic spaces.
        data = Selector(
            text=fp.read()).xpath('string(.)').extract()[0].replace(
                '\xa0', '').replace('\u3000', '')
        # print(re.sub(r'\s*', '', data))
        items = data
        # item = reFunction('一、[\s\S]*二、', items)
        # Not referenced by the code below.
        reStr = '[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*'
        # for item in ['宗地编号' + _ for _ in re.findall('一、([\s\S]*)二、', items)[0].split('宗地编号')[1:]]:
        #     print(item)
        print(data)
        # One section per "地块编号" occurrence between "一" and "二、"; the
        # [0] raises IndexError when that span is not found.
        for item in [
                '地块编号' + _
                for _ in re.findall('一([\s\S]*)二、', items)[0].split('地块编号')[1:]
        ]:
            # Named after the plot-number column, but the pattern being tried
            # here is the remark ("备注") one.
            DKBH_54 = reFunction(
                '备注:*\s*([()\w\.::—\¥ (\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item)
            print(f'ID:{ID}最终结果:\n', DKBH_54)
def parse_detail(self, response):
    """Parse a land-transfer result page and append one CSV row per plot.

    Announcement-wide fields (title, dates, publicity period, contacts) are
    extracted once from the flattened page text. Plot fields then come from
    one of three layouts: the first HTML table (text mentions "宗地编号" or
    "土地位置"), per-plot "地块编号" text sections, or a single free-text record.

    Args:
        response: the downloaded detail page; ``response.meta['title']`` is
            used as the file/body title.

    Yields:
        ``None`` once for every row appended to ``self.pathDetail``.
    """
    # TODO: the spider-initiated shutdown (close_spider) is still an open issue.
    try:
        data = Selector(text=response.body.decode('utf-8'))
        items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')
        htmlTable = htmlTableTransformer()

        # Character classes shared by the field regexes (contents kept verbatim).
        wide = r'()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪'
        tight = r'()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪'

        # Per-plot columns default to empty.
        DKBH_54 = DKWZ_55 = TDYT_56 = TDMJ_57 = CRNX_58 = CJJ_59 = ''
        SRDW_60 = TDXZTJ_61 = TDSYTJ_62 = BZ_63 = ''
        # Contact method: no branch populates it, but it keeps its CSV column.
        LXFS_65 = ''

        def pick(columns, index, *headers):
            """Return element `index` of the first non-empty column among `headers`, else ''."""
            for header in headers:
                column = columns.get(header)
                if column:
                    return column[index]
            return ''

        def store_row(required):
            """Do duplicate accounting, then append the current field values as a CSV row.

            Returns True when a row was written (the caller then yields).
            """
            if self.name in DUPLICATE_SWITCH_LIST and self.redisClient.isExist(md5Mark):
                # Fingerprint already known: count it towards the shutdown threshold.
                self.duplicateUrl += 1
            if self.duplicateUrl >= 50:
                self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url)
                return False
            if not required:
                return False
            csvFile = [
                WJBT_48, XXSJ_49, ZWBT_50, GGBH_51, CRSJ_52, GGNX_53, DKBH_54, DKWZ_55, TDYT_56,
                TDMJ_57, CRNX_58, CJJ_59, SRDW_60, TDXZTJ_61, TDSYTJ_62, BZ_63, GSQ_64, LXFS_65,
                DWDZ_66, YZBM_67, LXDH_68, LXR_69, LXDW_77, DZYJ_70, crawlingTime, url, md5Mark,
            ]
            cells = []
            for value in csvFile:
                try:
                    # Blank out empty values and values made only of '|' separators.
                    if value and value != '|' * len(value):
                        cells.append(
                            value.replace(',', ' ').replace('\n', '').replace('\t', '').replace('\r', '')
                            .replace(r'\xa0', '').replace('\xa0', ''))
                    else:
                        cells.append('')
                except Exception as e:
                    cells.append('')
                    self.log(f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                             level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                # Every cell, including the last, is followed by a comma.
                fp.write(''.join(cell + ',' for cell in cells))
                fp.write('\n')
            self.log('数据获取成功', level=logging.INFO)
            return True

        # ---- Announcement-wide fields ----
        # File title
        WJBT_48 = response.meta.get('title').strip()
        # Publication date
        XXSJ_49 = reFunction(r'[\d\-]*', data.xpath('//p[@class="sub-cp"]/text()').extract_first())
        # Body title (same as the file title)
        ZWBT_50 = WJBT_48
        # Announcement number: text of the leading paragraphs that contain "号"
        GGBH_51 = ''.join(data.xpath(
            '//div[@class="substance"]/p[position() <5]/.//*[contains(text(),"号")]/ancestor::p/.//text()'
        ).extract())
        # Transfer date
        CRSJ_52 = reFunction(r'定于\s*([()【】\w\.—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*)[,;,、在]', items)
        # Announcement type
        GGNX_53 = '出让结果'
        # Crawl date
        crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
        # Crawled URL
        url = response.url
        # Page fingerprint used for duplicate detection
        md5Mark = encrypt_md5(url + WJBT_48 + XXSJ_49)
        # Publicity period
        GSQ_64 = reFunction(rf'公示期:*\s*([{wide}]*)[\s。]', items)
        # Contact organisation
        LXDW_77 = reFunction(rf'联系单位:*([{wide}]*)\s', items)
        # Organisation address
        DWDZ_66 = reFunction(rf'单位地址:*([{wide}]*)\s', items)
        # Postal code
        YZBM_67 = reFunction(rf'邮政编码:*([{wide}]*)\s', items)
        # Contact phone
        LXDH_68 = reFunction(rf'联系电话:*([{wide}]*)\s', items)
        # Contact person
        LXR_69 = reFunction(rf'联\s*系\s*人:*([{wide}]*)\s', items)
        # E-mail (same class plus '@')
        DZYJ_70 = reFunction(r'电子邮件:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》@\-\/\%,;,、\.﹪]*)\s', items)

        if '宗地编号' in items or '土地位置' in items:
            # Tabular layout: one row per plot in the first table.
            soup = BeautifulSoup(response.body.decode('utf-8'))
            table = soup.find('table')
            tdData = htmlTable.tableTrTdRegulationToList(table)
            for i in range(len(list(tdData.values())[0])):
                # Plot number
                DKBH_54 = pick(tdData, i, '宗地编号')
                # Plot position
                DKWZ_55 = pick(tdData, i, '宗地位置', '土地位置')
                # Land use
                TDYT_56 = pick(tdData, i, '土地用途', '规划土地用途')
                # Land area
                TDMJ_57 = pick(tdData, i, '土地面积(m2)', '出让土地面积(㎡)')
                # Transfer term
                CRNX_58 = pick(tdData, i, '使用年限', '出让年限')
                # Transaction price
                CJJ_59 = pick(tdData, i, '成交价(万元)', '成交价(人民币)')
                # Transferee
                SRDW_60 = pick(tdData, i, '受让单位', '竞买人(单位)')
                # Land-use conditions
                TDSYTJ_62 = pick(tdData, i, '土地使用条件')
                if store_row(DKWZ_55):
                    yield
        elif '地块编号' in items:
            # Text layout: one section per "地块编号" between "一" and "二、".
            sections = re.findall(r'一([\s\S]*)二、', items)[0].split('地块编号')[1:]
            for item in ['地块编号' + tail for tail in sections]:
                # Plot number
                DKBH_54 = reFunction(r'地块编号:*\s*([\w\-]*)\s', item)
                # Plot position
                DKWZ_55 = reFunction(rf'地块位置:*\s*([{tight}]*)\s', item)
                # Land use
                TDYT_56 = reFunction(rf'土地用途:*\s*([{tight}]*)\s', item)
                # Land area (hectares)
                TDMJ_57 = reFunction(rf'土地面积\(公顷\):*\s*([{tight}]*)\s', item)
                # Transfer term
                CRNX_58 = reFunction(rf'出让年限:*\s*([{tight}]*)\s', item)
                # Transaction price (10k CNY)
                CJJ_59 = reFunction(rf'成交价\(万元\):*\s*([{tight}]*)\s', item)
                # Transferee
                SRDW_60 = reFunction(rf'受让单位:*\s*([{tight}]*)\s', item)
                # Current land condition
                TDXZTJ_61 = reFunction(rf'土地现状:*\s*([{tight}]*)\s', item)
                # Land-use conditions
                TDSYTJ_62 = reFunction(rf'土地使用条件:*\s*([{tight}]*)\s', item)
                # Remark
                BZ_63 = reFunction(rf'备注:*\s*([{tight}]*)\s', item)
                if store_row(DKWZ_55):
                    yield
        else:
            # Single free-text record.
            # Plot position
            DKWZ_55 = reFunction(rf'地理位置:*\s*([{tight}]*)\s', items)
            # Transfer term
            CRNX_58 = reFunction(rf'出让年限:*\s*([{tight}]*)\s', items)
            # Transaction price. The parentheses around "人民币" are matched
            # literally (either width); left unescaped they formed a capture
            # group and the label never matched text that contains them.
            CJJ_59 = reFunction(
                r'成交价格[((]人民币[))]:*\s*([()\w\.::—\¥ (\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Transferee
            SRDW_60 = reFunction(rf'竞得人名称:*\s*([{tight}]*)\s', items)
            # Current land condition
            TDXZTJ_61 = reFunction(rf'土地现状:*\s*([{tight}]*)\s', items)
            if store_row(DKWZ_55):
                yield
    except Exception as e:
        print(response.url)
        self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
def parse_detail(self, response):
    """Resolve the real detail page for a cqggzy.com notice and append one CSV row.

    The listing only provides ``infoid`` / ``categorynum``; the detail URL is
    obtained from the site's redirect service (up to five attempts). Fields
    are then read from the first HTML table when the page looks tabular, and
    from regexes over the numbered text sections otherwise (or when table
    parsing raises).

    Args:
        response: listing response whose ``meta`` carries ``categorynum`` and
            ``infoid``.

    Yields:
        ``None`` once, after the row has been appended to ``self.pathDetail``.
    """
    try:
        # NOTE: data extraction for this site is known to be incomplete.
        categorynum = response.meta.get('categorynum')
        infoid = response.meta.get('infoid')
        targetUrl = "https://www.cqggzy.com/tiaozhuan.html?infoid=" + infoid + "&categorynum=" + categorynum
        redirectUrl = 'https://www.cqggzy.com/EpointWebBuilderService/getInfoListAndCategoryList.action?cmd=pageRedirect'

        results = None
        url = None
        last_error = None
        for _ in range(5):
            try:
                self.session.get(targetUrl, headers=self.header, allow_redirects=False, timeout=60)
                response_ = self.session.post(redirectUrl,
                                              headers=self.header,
                                              data={'categorynum': categorynum, 'infoid': infoid},
                                              allow_redirects=False,
                                              timeout=60)
                custom = response_.json().get('custom')
                # The service returns either an absolute URL or a site-relative path.
                url = custom if 'http' in custom else 'https://www.cqggzy.com' + custom
                results = self.session.get(url, headers=self.header, allow_redirects=False, timeout=60)
                break
            except Exception as e:
                # Retry; remember the cause so a total failure can be reported.
                last_error = e
        if results is None:
            # Previously this fell through and failed on `''.content`.
            self.log(f'详情页数据解析失败, 错误: {last_error}', level=logging.ERROR)
            return

        page = results.content.decode('utf-8')
        data = Selector(text=page)
        items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')

        TDWZ_3 = YT_4 = TDMJ_5 = ZJRJZMJ_6 = ZDJZMD_7 = LDL_9 = CRJKQSJ_8 = ''
        JMBZJ_11 = BH_12 = CYNB_13 = KJMJ_14 = TZQD_15 = CCYQ_16 = BZ_17 = ''
        HQCRWJSJ_18 = HQCRWJDD_19 = BMSJ_20 = BMDD_21 = BZJJZSJ_22 = QRJMZGSJ_23 = ''
        LXDZ_24 = LXDH_25 = LXR_26 = ''

        # ---- Common fields ----
        # File title
        WJBT_1 = data.xpath('//*[@class="article-title"]/text()').extract_first()
        # Publication date
        XXSJ_2 = reFunction(r'(\d{4}-\d{1,2}-\d{1,2})',
                            data.xpath('//*[@class="info-source"]/text()[1]').extract_first())

        # Set when the free-text parser must run; its value is the building
        # density pattern, which differs between the two entry paths.
        density_pattern = None
        if (('总计容建筑面' in items and '序号' in items) or data.xpath('//table')) and '宗地编号' not in items:
            # TODO: tabular layout handling is incomplete.
            soup = BeautifulSoup(page)
            tableMso = soup.find('table', 'MsoTableGrid')
            table = soup.find('table')
            htmlTable = htmlTableTransformer()
            try:
                if tableMso:
                    tdData = htmlTable.table_tr_td(table)
                else:
                    tdData = htmlTable.tableTrTdRegulation(table)
                # Strip each header's own text out of its cell value.
                for key in list(tdData):
                    value = tdData[key]
                    tdData[key] = value.replace(str(key), '') if value else value
                # Land position
                TDWZ_3 = tdData.get('土地位置')
                # Land use
                YT_4 = tdData.get('土地用途') or tdData.get('用途')
                # Land area: try every known header spelling (the old code
                # tested one key and read another).
                TDMJ_5 = tdData.get('土地面积(m)') or tdData.get('土地面积 (m)') or tdData.get('土地面积 (㎡)')
                # Total accountable floor area (m2)
                ZJRJZMJ_6 = tdData.get('总计容建筑面积(㎡)')
                # Maximum building density
                ZDJZMD_7 = tdData.get('最大建筑密度')
                # Green ratio
                LDL_9 = tdData.get('绿地率')
                if not ZDJZMD_7 and not LDL_9:
                    # No dedicated columns: look for the labels inside any cell.
                    for value in tdData.values():
                        if not value:
                            # Empty cells carry nothing and `in None` would raise.
                            continue
                        if '最大建筑密度' in value:
                            ZDJZMD_7 = value.replace('最大建筑密度', '')
                        if '绿地率' in value:
                            LDL_9_ = value.replace('绿地率', '')
                            # A long remainder means the cell holds more than the ratio.
                            LDL_9 = LDL_9_ if len(LDL_9_) < 10 else reFunction(
                                r'绿地率[:]*\s*([()\w\.:: \(\)〔〕≤≥\-\/\%,、\.﹪]*)[;。,]?', value)
                        if '总计容建筑面积' in value:
                            # Belongs in the floor-area column; it used to
                            # overwrite the green ratio.
                            ZJRJZMJ_6 = value.replace('总计容建筑面积(㎡)', '')
                # Starting transfer price (10k CNY)
                CRJKQSJ_8 = tdData.get('出让价款起始价(万元)')
                # Bid deposit (10k CNY)
                JMBZJ_11 = tdData.get('保证金(万元)') or tdData.get('投标、竞买保证金(万元)')
                # Number
                BH_12 = tdData.get('编号')
                # Industry category
                CYNB_13 = tdData.get('产业类别')
                # Buildable area (m2) or plot ratio
                KJMJ_14 = tdData.get('可建面积(㎡)或容积率')
                # Investment intensity (10k CNY / hectare)
                TZQD_15 = tdData.get('投资强度(万元/公顷)')
                # Output requirement (10k CNY / hectare)
                CCYQ_16 = tdData.get('产出要求(万元/公顷)')
                # Remark / "other parcel notes", either embedded in the
                # serial-number cell or in their own column. A table without
                # a '序号' column raises here and falls back to text parsing.
                seq_text = tdData.get('序号')
                BZ_17_ = seq_text.split('备注:')[-1] if '备注' in seq_text else tdData.get('备注:')
                other = seq_text.split('其他需要说明的宗地情况:')[-1] \
                    if '其他需要说明的宗地情况:' in seq_text else tdData.get('其他需要说明的宗地情况:')
                BZ_17 = other if not BZ_17_ else BZ_17_
                section_two = reFunction(r'二、([\s\S]*)三、', items)
                section_three = reFunction(r'三、([\s\S]*)四、', items)
                section_seven = reFunction(r'七、([\s\S]*)', items)
                # When / where the transfer documents can be obtained
                HQCRWJSJ_18 = reFunction(r'竞买申请人可在([\w :\.\-\s\/\%,、]*)。', section_two)
                HQCRWJDD_19 = reFunction(r'网址:([\w :\.\-\s\/\%,、]*)(?:[\)\s]*)', section_two)
                # Registration time
                BMSJ_20 = reFunction(r'竞买申请人可在([\w \.:\-\s\/\%,、]*)\(报名时间\)', section_three)
                # Deposit deadline, also used as qualification-confirmation time
                BZJJZSJ_22 = reFunction(r'竞买保证金到账截止时间为([\w \.:\-\s\/\%,、]*)。', section_three)
                QRJMZGSJ_23 = BZJJZSJ_22
                # Contact address / phone / person
                LXDZ_24 = '|'.join(re.findall(r'联系地址:([\w 、\.:\-\/\%,、()]*)(?:[,\n])', section_seven))
                LXDH_25 = '|'.join(re.findall(r'[联系]*电话[::]([\w 、\.:\-\/\%,、()]*)(?:[\n。])', section_seven))
                LXR_26 = '|'.join(re.findall(r'联系人[::]([\w 、\.:\-\/\%,、()]*)(?:[ ,]*)', section_seven))
            except Exception as e:
                # Table layout not understood: fall back to the text parser.
                self.log(f'表格解析失败, 改用正文正则解析, 错误: {e}', level=logging.DEBUG)
                density_pattern = r'建筑密度\(%\):([\w :\.\-\s\/\%,、]*)(?:\s)'
        else:
            density_pattern = r'建筑密度:([\w :\.\-\s\/\%,、≦;≥]*)(?:\s)'

        if density_pattern is not None:
            # Text layout: one section per "宗地编号" between "一、" and "二、";
            # per-parcel values are accumulated into '|'-separated strings.
            sections = re.findall(r'一、([\s\S]*)二、', items)[0].split('宗地编号')[1:]
            for item in ['宗地编号' + tail for tail in sections]:
                # `or ''` guards against a None left behind by a failed table parse.
                # Land position
                TDWZ_3 = (TDWZ_3 or '') + '|' + reFunction(r'宗地坐落:([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                # Land use (either label)
                YT_4_1 = reFunction(r'主要用途:(?:[\s]*)([\w :\.\- \/\%,、]*)(?:\s)', item)
                YT_4_2 = reFunction(r'土地用途[:](?:[\s]*)([\w ::\.\- \/\%,、]*)(?:\s)', item)
                YT_4 = (YT_4 or '') + '|' + YT_4_1 + YT_4_2
                # Land area: total area if present, else plain parcel area
                total_area = reFunction(r'宗地总面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item)
                TDMJ_5 = (TDMJ_5 or '') + '|' + (
                    total_area if total_area
                    else reFunction(r'宗地面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item))
                # Maximum building density
                ZDJZMD_7 = (ZDJZMD_7 or '') + '|' + reFunction(density_pattern, item)
                # Green ratio
                LDL_9 = (LDL_9 or '') + '|' + reFunction(r'绿地率\(%\)[:]([\w :\.\-\s\/\%,、≤;≥]*)(?:\s)', item)
                # Number
                BH_12 = (BH_12 or '') + '|' + reFunction(r'宗地编号[:]([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                # Investment intensity
                TZQD_15 = (TZQD_15 or '') + '|' + reFunction(r'投资强度[:]([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                # Remark
                BZ_17 = (BZ_17 or '') + '|' + reFunction(r'备注:([\s\S]*)', item)
            # NOTE(review): the page-level fields below are assumed to be
            # computed once after the loop (they depend only on `items`);
            # confirm against the original indentation.
            section_four = reFunction(r'四、([\s\S]*)五、', items)
            section_five = reFunction(r'五、([\s\S]*)六、', items)
            # NOTE(review): '八|七、(...)' alternates between a bare '八' and
            # '七、<rest>'; kept verbatim — confirm whether '(?:八|七)、' was meant.
            contact_section = reFunction(r'八|七、([\s\S]*)', items)
            # When / where the transfer documents can be obtained
            HQCRWJSJ_18 = reFunction(r'申请人可于([\w :\.\-\s\/\%,、]*)到', section_four)
            HQCRWJDD_19 = reFunction(
                r'申请人可于(?:[\w :\.\-\s\/\%,、]*)到([\w :\.\-\s\/\%,、]*)获取', section_four)
            # Registration time
            BMSJ_20 = reFunction(r'申请人可于([\w \.:\-\s\/\%,、]*)到', section_five)
            # Deposit deadline, also used as qualification-confirmation time
            BZJJZSJ_22 = reFunction(r'竞买保证金的截止时间为([\w \d\.:\-\s\/\%,、 ]*)。', section_five)
            QRJMZGSJ_23 = BZJJZSJ_22
            # Contact address / phone / person
            LXDZ_24 = '|'.join(re.findall(r'联系地址:([\w 、\.:\-\/\%,、()]*)(?:[,\n])', contact_section))
            LXDH_25 = '|'.join(re.findall(r'[联系]*电话[::]([\w 、\.:\-\/\%,、()]*)(?:[\n。])', contact_section))
            LXR_26 = '|'.join(re.findall(r'联 系 人[::]([ \w]*)(?:[\n]*)', contact_section))

        # Crawl time
        crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Crawled URL
        url = url if url else response.url
        # Row fingerprint
        md5Mark = encrypt_md5(url)
        csvFile = [
            WJBT_1, XXSJ_2, TDWZ_3, YT_4, TDMJ_5, ZJRJZMJ_6, ZDJZMD_7, LDL_9, CRJKQSJ_8, JMBZJ_11,
            BH_12, CYNB_13, KJMJ_14, TZQD_15, CCYQ_16, BZ_17, HQCRWJSJ_18, HQCRWJDD_19, BMSJ_20,
            BMDD_21, BZJJZSJ_22, QRJMZGSJ_23, LXDZ_24, LXDH_25, LXR_26, crawlingTime, url, md5Mark,
        ]
        cells = []
        for _ in csvFile:
            try:
                # Blank out empty values and values made only of '|' separators.
                if _ and _ != '|' * len(_):
                    cells.append(_.replace(',', ' ').replace('\n', '').replace('\r', ''))
                else:
                    cells.append('')
            except Exception as e:
                cells.append('')
                self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                         level=logging.ERROR)
        with open(self.pathDetail, 'a+') as fp:
            # Every cell, including the last, is followed by a comma.
            fp.write(''.join(cell + ',' for cell in cells))
            fp.write('\n')
        yield
    except Exception as e:
        self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
def parse_detail(self, response):
    """Parse one auction detail page and append its fields as a CSV row.

    The page is flattened to plain text, each ``<label>:`` value is captured
    with a regex built from ``self.reStr``, and the first data row of the
    ``GVLandPrice`` bid table is read through XPath.  The row is appended to
    ``self.pathDetail``.

    Duplicate guard: when ``DUPLICATE_SWITCH`` is truthy and
    ``self.redisClient.isExist(md5Mark)`` is truthy, ``self.duplicateUrl`` is
    incremented.  While that counter is below 50 the row is written; otherwise
    the spider is asked to close.

    Args:
        response: Detail-page response; its body is decoded as UTF-8.

    Yields:
        None, once, after a row has been written.

    Any exception is caught: the URL is printed and the traceback is logged at
    ERROR level, so a malformed page does not abort the crawl.
    """
    try:
        data = Selector(text=response.body.decode('utf-8'))
        # Flatten to text and drop NBSP / ideographic spaces before matching.
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        def labelled(label):
            # Raw f-string: "\s" stays a regex escape instead of an invalid
            # Python string escape; the resulting pattern text is unchanged.
            return reFunction(rf'{label}:\s*([{self.reStr}]*)\s', items)

        def first_bid(cell_path):
            # Cell of the second <tr> (first data row) of the bid table.
            return data.xpath(
                '//*[@id="ctl00_ContentPlaceHolder1_GVLandPrice"]/tr[2]/'
                + cell_path).extract_first()

        # Title
        BT_1 = ''.join(
            data.xpath(
                '//*[@id="ctl00_ContentPlaceHolder1_UpdatePanel2"]/div/span/text()'
            ).extract())
        GGBH_2 = labelled('公告编号')  # announcement number
        DKBH_3 = labelled('地块编号')  # parcel number
        DKMC_4 = labelled('地块名称')  # parcel name
        RJL_5 = labelled('容积率')  # plot ratio
        YDXZ_6 = labelled('用地性质')  # land-use type
        GHMJ_7 = labelled('规划面积')  # planned area
        SJCRMJ_8 = labelled('实际出让面积')  # actual transferred area
        GGFBSJ_9 = labelled('公告发布时间')  # announcement publish time
        BZJJE_10 = labelled('保证金金额')  # deposit amount
        GPQSJ_11 = labelled('挂牌起始价')  # listing start price
        JZBZ_12 = labelled('竞争保障房建设资金起始价')  # affordable-housing fund start price
        ZGXJ_13 = labelled('最高限价')  # price cap
        JJFD_14 = labelled('加价幅度')  # bid increment
        BMKS_15 = labelled('报名开始时间')  # registration start
        BMJZ_16 = labelled('报名截至时间')  # registration deadline
        BJJZ_17 = labelled('报价截至时间')  # bidding deadline
        BZJJZ_18 = labelled('保证金截至时间')  # deposit deadline
        ZSJJKS_19 = labelled('限时竞价开始时间')  # timed-bidding start
        ZXBJ_20 = labelled('最新报价')  # latest bid
        ZXBJ_21 = labelled('最新报价时间')  # latest bid time
        JDZ_22 = labelled('竞得者')  # winner
        ZDJ_23 = labelled('竞得价')  # winning price

        # First row of the bid-history table; each may be None when absent.
        BJLC_24 = first_bid('td[1]/text()')  # bid round
        BJR_25 = first_bid('td[2]/span/text()')  # bidder
        JEBJ_26 = first_bid('td[3]/span/text()')  # bid amount
        DWDJ_27 = first_bid('td[4]/span/text()')  # unit land price
        BJSJ_28 = first_bid('td[5]/text()')  # bid time

        crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
        url = response.url
        # Record fingerprint used by the duplicate guard.
        md5Mark = encrypt_md5(url + BT_1 + GGBH_2)

        if DUPLICATE_SWITCH:
            if self.redisClient.isExist(md5Mark):
                self.duplicateUrl += 1

        if self.duplicateUrl < 50:
            csvFile = [
                BT_1, GGBH_2, DKBH_3, DKMC_4, RJL_5, YDXZ_6, GHMJ_7,
                SJCRMJ_8, GGFBSJ_9, BZJJE_10, GPQSJ_11, JZBZ_12, ZGXJ_13,
                JJFD_14, BMKS_15, BMJZ_16, BJJZ_17, BZJJZ_18, ZSJJKS_19,
                ZXBJ_20, ZXBJ_21, JDZ_22, ZDJ_23, BJLC_24, BJR_25, JEBJ_26,
                DWDJ_27, BJSJ_28, crawlingTime, url, md5Mark,
            ]
            cells = []
            for value in csvFile:
                try:
                    # Empty values and values made only of '|' separators
                    # become an empty column.
                    if value and value != '|' * len(value):
                        cells.append(
                            value.replace(',', ' ').replace('\n', '').replace(
                                '\r', '').replace(r'\xa0', '').replace(
                                    '\xa0', ''))
                    else:
                        cells.append('')
                except Exception as e:
                    # Non-string value: keep the column, leave it empty.
                    cells.append('')
                    self.log(
                        f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            # Every column, including the last, is followed by a comma.
            results = ''.join(cell + ',' for cell in cells)
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results)
                fp.write('\n')
            self.log('数据获取成功', level=logging.INFO)
            yield
        else:
            self.crawler.engine.close_spider(
                self,
                'response msg info %s, job duplicated!' % response.url)
    except Exception as e:
        print(response.url)
        self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                 level=logging.ERROR)
def parse_detail(self, response):
    """Parse a land-transfer notice page and append one CSV row.

    Three layouts are handled, chosen by marker text found in the flattened
    page text:

    * no "宗地编号" and no "配套建筑规划用地": values are read from an HTML
      table via ``htmlTableTransformer`` (with several header spellings);
    * "宗地编号" present: each parcel section is parsed with regexes and the
      values are accumulated, separated by ``'|'``;
    * "配套建筑规划用地" present: a table whose rows 2/3 continue rows 0/1.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original layout.

    Args:
        response: Detail-page response; ``response.meta['title']`` supplies
            the file title and the body is decoded as UTF-8.

    Yields:
        None, once, after the row has been appended to ``self.pathDetail``.

    Any exception (including a missing source/update-time node) is caught;
    the URL is printed and the traceback logged at ERROR level.
    """

    def pick(cells, *keys):
        # Value of the first key with a truthy value; otherwise the value of
        # the last key looked up (same result as chaining ``or``).
        value = None
        for key in keys:
            value = cells.get(key)
            if value:
                break
        return value

    def parcel_field(label, text):
        # Capture the value following "<label>:" inside one parcel section.
        return reFunction(rf'{label}:(?:[\s]*)([{self.reStr}]*)\s', text)

    def merge_continuation_rows(table):
        # Move the cells of rows 2 and 3 behind the last cell of rows 0 and 1
        # respectively, then drop row 2.  Rows are looked up again on every
        # access, exactly as the DOM is mutated step by step.
        def row(index):
            return table.tbody.find_all('tr')[index]

        last0 = row(0).find_all('td')[-1]
        last1 = row(1).find_all('td')[-1]
        number0 = row(0).index(last0)
        number1 = row(1).index(last1)
        tdList2 = row(2).find_all('td')
        tdList3 = row(3).find_all('td')
        for offset, td in enumerate(tdList2, start=1):
            row(0).insert(number0 + offset, td)
        for offset, td in enumerate(tdList3, start=1):
            row(1).insert(number1 + offset, td)
        row(2).extract()

    try:
        html = response.body.decode('utf-8')
        data = Selector(text=html)
        # Flatten to text and drop NBSP / ideographic spaces before matching.
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        # Parcel fields start empty: the regex branch appends to them and the
        # table branches only assign the ones they find.
        ZDBH_4 = ZDZL_5 = MJ_6 = TDYT_7 = CRNX_8 = RJL_9 = ''
        LDL_10 = JZMD_11 = JZXG_12 = JMBZJ_13 = QSJ_14 = ZJFD_15 = ''

        # --- fields common to every layout ---
        WJBT_1 = response.meta.get('title')  # file title
        # Source; raises (and fails the page) when the node is missing.
        WZLY_2 = data.xpath('//div[@class="news_time"]/span[1]/text()'
                            ).extract_first().replace('文章来自:', '')
        # Update time; same failure mode as above.
        GXSJ_3 = data.xpath('//div[@class="news_time"]/span[2]/text()'
                            ).extract_first().replace('更新时间:', '')
        # Remarks
        QTSM_17 = reFunction(rf'备注(?:[\s]*)([{self.reStr}]*)\s', items)

        if '宗地编号' not in items and '配套建筑规划用地' not in items:
            if data.xpath(
                    '//table[@border="0"]') and '主要规划指标' not in items:
                table = BeautifulSoup(html).find('table')
                tdData = htmlTableTransformer().tableTrTdRegulation(table)
                ZDBH_4 = tdData.get('地块编号')  # parcel number
                ZDZL_5 = tdData.get('土地位置')  # location
                MJ_6 = tdData.get('土地面积(平方米)')  # area
                TDYT_7 = tdData.get('土地用途')  # land use
                CRNX_8 = pick(tdData, '出让年限(年)', '出让年限')  # term
                RJL_9 = pick(tdData, '容积率', '容积率(不大于)')  # plot ratio
                LDL_10 = pick(tdData, '绿地率', '绿地率(不小于)')  # green ratio
                JZMD_11 = tdData.get('建筑密度')  # building density
                JZXG_12 = tdData.get('建筑高度')  # height limit
                JMBZJ_13 = pick(tdData, '竞买保证金(万元)', '竞买保证金(元)')  # deposit
                QSJ_14 = tdData.get('起始价(万元)')  # start price
                ZJFD_15 = pick(tdData, '增价幅度(万元)', '加价幅度')  # increment

            if '规划指标要求' in items:
                table = BeautifulSoup(html).find('table')
                # The header has a cell spanning 4 (or 3) columns whose
                # sub-headers sit in the next row; splice them in place.
                tdReplace = table.tbody.find_all('tr')[0].find(
                    'td', colspan='4') or table.tbody.find_all('tr')[0].find(
                        'td', colspan='3')
                try:
                    number = table.tbody.find_all('tr')[0].index(tdReplace)
                    tdList = table.tbody.find_all('tr')[1].find_all('td')
                    for offset, td in enumerate(tdList, start=1):
                        table.tbody.find_all('tr')[0].insert(
                            number + offset, td)
                    tdReplace.extract()
                    table.tbody.find_all('tr')[1].extract()
                except Exception as e:
                    # Best effort: a table without the spanning header is
                    # parsed as-is.  Narrowed from a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    self.log(
                        f'表头合并失败, 请求:{response.url}, 信息: {e}',
                        level=logging.DEBUG)
                tdData = htmlTableTransformer().tableTrTdRegulation(table)
                ZDBH_4 = tdData.get('地块编号')
                ZDZL_5_ = pick(tdData, '土地位置', '地块位置/名称')
                # Strip the remark text of section one out of the location.
                ZDZL_5 = ZDZL_5_.replace(
                    reFunction(rf'备注(?:[\s]*)([{self.reStr}]*)\s',
                               reFunction(r'一([\s\S]*)二', items)), '')
                MJ_6 = pick(tdData, '土地面积(m2)', '土地面积(平方米)')
                TDYT_7 = pick(tdData, '土地用途', '规划地性质')
                # The first key is a raw string (literal backslash-u3000) and
                # is kept for compatibility; the second is the same header
                # with a real ideographic space, which the raw spelling could
                # never match.
                CRNX_8 = pick(tdData, r'出让\u3000年限', '出让\u3000年限',
                              '出让年限', '出让年限(年)')
                RJL_9 = pick(tdData, '容积率', '容积率(不大于)')
                LDL_10 = pick(tdData, '绿地率', '绿地率(%)', '绿地率(不小于)')
                JZMD_11 = pick(tdData, '建筑\u3000密度', '建筑密度',
                               '建筑密度(%)', '建筑\u3000密度(不大于)')
                JZXG_12 = pick(tdData, '建筑限高', '建筑高度(m)', '建筑高度',
                               '建筑限高(不高于)')
                JMBZJ_13 = pick(tdData, '竞买保证金(元)', '竞买保证金(万元)')
                QSJ_14 = pick(tdData, '起始价(元)', '挂牌出让起始价(元)',
                              '起始价(万元)')
                ZJFD_15 = pick(tdData, '增价幅度(万元)', '加价幅度')
                # NOTE(review): compares against '' only, so this retry runs
                # when the transformer yields empty strings, not None.
                if ZJFD_15 == '' and QSJ_14 == '' and JMBZJ_13 == '':
                    table = BeautifulSoup(html).find('table')
                    merge_continuation_rows(table)
                    tdDataCopy = htmlTableTransformer().tableTrTdRegulation(
                        table)
                    JMBZJ_13 = pick(tdDataCopy, '竞买保证金(元)',
                                    '竞买保证金(万元)')
                    QSJ_14 = pick(tdDataCopy, '起始价(元)',
                                  '挂牌出让起始价(元)', '起始价(万元)')
                    ZJFD_15 = pick(tdDataCopy, '增价幅度(万元)', '加价幅度')

            if '标的序号' in items:
                table = BeautifulSoup(html).find('table', border='0')
                tdData = htmlTableTransformer().table_tr_td(table)
                ZDZL_5 = tdData.get('标的位置')
                MJ_6 = pick(tdData, '土地面积', '土地面积(平方米)')
                QSJ_14 = pick(tdData, '起始价(元)', '拍卖参考价(万元)',
                              '起始价(万元)')
                CRNX_8 = pick(tdData, '土地性质(年限)', '出让年限(年)')
        else:
            if '宗地编号' in items:
                # Section one, split into one chunk per parcel.
                section = re.findall(r'一([\s\S]*)二', items)[0]
                for item in [
                        '宗地编号' + chunk
                        for chunk in section.split('宗地编号')[1:]
                ]:
                    ZDBH_4 += '|' + parcel_field('宗地编号', item)
                    ZDZL_5 += '|' + parcel_field('宗地坐落', item)
                    MJ_6 += '|' + parcel_field('宗地面积', item)
                    CRNX_8 += '|' + parcel_field('出让年限', item)
                    RJL_9 += '|' + parcel_field('容积率', item)
                    LDL_10 += '|' + parcel_field(r'绿地率\(%\)', item)
                    JZMD_11 += '|' + parcel_field(r'建筑密度\(%\)', item)
                    JZXG_12 += '|' + parcel_field(r'建筑限高\(米\)', item)
                    JMBZJ_13 += '|' + parcel_field('保证金', item)
                    QSJ_14 += '|' + parcel_field('起始价', item)
                    ZJFD_15 += '|' + parcel_field('加价幅度', item)
                    QTSM_17 += '|' + parcel_field('备注', item)
            if '配套建筑规划用地' in items:
                table = BeautifulSoup(html).find('table')
                merge_continuation_rows(table)
                tdData = htmlTableTransformer().tableTrTdRegulation(table)
                ZDBH_4 = tdData.get('地块编号')
                ZDZL_5 = tdData.get('地块位置/名称')
                MJ_6 = pick(tdData, '配套设施出让面积(m2)', '土地面积(平方米)')
                TDYT_7 = tdData.get('配套建筑规划用地性质')
                CRNX_8 = pick(tdData, '出让年限', '出让年限(年)')
                RJL_9 = pick(tdData, '容积率', '容积率(不大于)')
                LDL_10 = pick(tdData, '公园整体绿地率(%)', '绿地率(不小于)')
                JZMD_11 = tdData.get('公园整体建筑密度(%)')
                JZXG_12 = pick(tdData, '建筑限高', '建筑高度(m)', '建筑高度')
                JMBZJ_13 = pick(tdData, '竞买保证金(元)', '竞买保证金(万元)')
                QSJ_14 = pick(tdData, '起始价(元)',
                              '配套设施用地挂牌出让起始价(元)', '起始价(万元)')
                ZJFD_15 = pick(tdData, '增价幅度(万元)', '加价幅度')

        crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        url = response.url
        md5Mark = encrypt_md5(url)  # record fingerprint

        csvFile = [
            WJBT_1, WZLY_2, GXSJ_3, ZDBH_4, ZDZL_5, MJ_6, TDYT_7, CRNX_8,
            RJL_9, LDL_10, JZMD_11, JZXG_12, JMBZJ_13, QSJ_14, ZJFD_15,
            QTSM_17, crawlingTime, url, md5Mark,
        ]
        cells = []
        for value in csvFile:
            try:
                # Empty values and values made only of '|' separators become
                # an empty column.
                if value and value != '|' * len(value):
                    cells.append(
                        value.replace(',', ' ').replace('\n', '').replace(
                            '\r', '').replace(r'\xa0', '').replace('\xa0', ''))
                else:
                    cells.append('')
            except Exception as e:
                cells.append('')
                self.log(
                    f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                    level=logging.ERROR)
        # Every column, including the last, is followed by a comma.
        results = ''.join(cell + ',' for cell in cells)
        with open(self.pathDetail, 'a+') as fp:
            fp.write(results)
            fp.write('\n')
        self.log('数据获取成功', level=logging.INFO)
        yield
    except Exception as e:
        print(response.url)
        self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                 level=logging.ERROR)
def parse_detail(self, response):
    """Parse a land-supply result page and append one CSV row per record.

    When the flattened page text does not contain "宗地编号", the first HTML
    table is converted to ``{header: [cell, ...]}`` by
    ``htmlTableTransformer.tableTrTdRegulationToList`` and one row is written
    per table line.  Otherwise a single record is extracted with regexes.

    Duplicate guard (per row): when ``DUPLICATE_SWITCH`` is truthy and
    ``self.redisClient.isExist(md5Mark)`` is truthy, ``self.duplicateUrl`` is
    incremented.  While that counter is below 50 the row is written; otherwise
    the spider is asked to close.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original layout.

    Args:
        response: Detail-page response; ``response.meta['title']`` supplies
            the title and the body is decoded as UTF-8.

    Yields:
        None, once for every row written.

    A failure while parsing the table is logged at WARNING level (it used to
    be swallowed silently); any other exception is printed/logged at ERROR.
    """

    def emit_row(fields, md5Mark):
        # Apply the duplicate guard, then append ``fields`` as one CSV line.
        if DUPLICATE_SWITCH:
            if self.redisClient.isExist(md5Mark):
                self.duplicateUrl += 1
        if not self.duplicateUrl < 50:
            self.crawler.engine.close_spider(
                self,
                'response msg info %s, job duplicated!' % response.url)
            return
        cells = []
        for value in fields:
            try:
                # Empty values and values made only of '|' separators become
                # an empty column.
                if value and value != '|' * len(value):
                    cells.append(
                        value.replace(',', ' ').replace('\n', '').replace(
                            '\r', '').replace(r'\xa0', '').replace('\xa0', ''))
                else:
                    cells.append('')
            except Exception as e:
                cells.append('')
                self.log(
                    f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                    level=logging.ERROR)
        # Every column, including the last, is followed by a comma.
        results = ''.join(cell + ',' for cell in cells)
        with open(self.pathDetail, 'a+') as fp:
            fp.write(results)
            fp.write('\n')
        self.log('数据获取成功', level=logging.INFO)
        yield

    try:
        data = Selector(text=response.body.decode('utf-8'))
        # Flatten to text and drop NBSP / ideographic spaces before matching.
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        # Columns the regex layout never fills stay empty.
        PZWH_50 = GDFS_52 = PZSJ_53 = RJL_57 = GYWAFA_58 = ''

        # --- fields common to both layouts ---
        BT_47 = response.meta.get('title')  # title
        LY = data.xpath(
            '//div[@class="content-small-title"]/text()').extract_first()
        LY_55 = reFunction(rf'来源:\s*([{self.reStr}]*)\s', LY)  # source
        LYSJ_48 = reFunction(rf'时间:\s*([{self.reStr}]*)\s', LY)  # time

        htmlTable = htmlTableTransformer()
        if '宗地编号' not in items:
            try:
                soup = BeautifulSoup(response.body.decode('utf-8'))
                table = soup.find_all('table')[0]
                # Drop a leading caption row that is not the header row.
                if not table.tbody.find_all('tr')[0].find_all(
                        text=re.compile("序号|受让人")):
                    table.tbody.find_all('tr')[0].extract()
                tdsData = htmlTable.tableTrTdRegulationToList(table)

                def cell(row, *headers):
                    # Cell ``row`` of the first header that has a non-empty
                    # column; '' when none of the headers is present.
                    for header in headers:
                        column = tdsData.get(header)
                        if column:
                            return column[row]
                    return ''

                # The first column's length defines the number of records.
                for row in range(len(list(tdsData.values())[0])):
                    XH_49 = cell(row, '序号')  # serial number
                    PZWH_50 = cell(row, '批准文号')  # approval number
                    # Land user.  '单位' is only a fallback; a missing '单位'
                    # column now yields '' instead of raising TypeError and
                    # losing the whole page.
                    YDDW_51 = cell(row, '用地单位(受让人)', '受让人') or cell(
                        row, '单位')
                    GDFS_52 = cell(row, '供地方式', '供应方式')  # supply mode
                    PZSJ_53 = cell(row, '批准时间', '签订日期')  # approval date
                    WZ_54 = cell(row, '土地位置', '土地座落', '宗地位置',
                                 '位置')  # location
                    YT_55 = cell(row, '用途', '土地用途', '用途明细')  # use
                    MJ_56 = cell(row, '面积(平方米)', '划拨面积',
                                 '出让/划拨面积', '面积(公顷)')  # area
                    RJL_57 = cell(row, '容积率')  # plot ratio
                    GYWAFA_58 = cell(row, '供应方案文号')  # supply-plan number

                    crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
                    url = response.url
                    # NOTE(review): the fingerprint has no per-row component,
                    # so every row of one page shares it.
                    md5Mark = encrypt_md5(url + BT_47 + LYSJ_48)
                    yield from emit_row([
                        BT_47, LY_55, LYSJ_48, XH_49, PZWH_50, YDDW_51,
                        GDFS_52, PZSJ_53, WZ_54, YT_55, MJ_56, RJL_57,
                        GYWAFA_58, crawlingTime, url, md5Mark,
                    ], md5Mark)
            except Exception as e:
                # Still best effort (the page is skipped), but no longer
                # silent: surface the reason so broken layouts can be found.
                self.log(
                    f'详情页表格解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                    level=logging.WARNING)
        else:
            # Free-text layout: each value sits between two known labels.
            XH_49 = reFunction(rf'宗地编号([{self.reStr}]*)地块位置', items)
            YDDW_51 = reFunction(rf'受让单位([{self.reStr}]*)备注:', items)
            WZ_54 = reFunction(rf'地块位置([{self.reStr}]*)土地用途', items)
            YT_55 = reFunction(rf'土地用途([{self.reStr}]*)土地面积', items)
            MJ_56 = reFunction(rf'土地面积\(公顷\)([{self.reStr}]*)项目名称',
                               items)

            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            url = response.url
            md5Mark = encrypt_md5(url + BT_47 + LYSJ_48)
            yield from emit_row([
                BT_47, LY_55, LYSJ_48, XH_49, PZWH_50, YDDW_51, GDFS_52,
                PZSJ_53, WZ_54, YT_55, MJ_56, RJL_57, GYWAFA_58,
                crawlingTime, url, md5Mark,
            ], md5Mark)
    except Exception as e:
        print(response.url)
        self.log(
            f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
            level=logging.ERROR)
def parse_detail(self, response):
    """Parse a GBK-encoded listing announcement and append one CSV row.

    All values are pulled out of the flattened page text with regexes of the
    form ``<prefix>\\s*([<allowed chars>]*)<terminator>``; the title and the
    publish time come from ``response.meta``.  The row is appended to
    ``self.pathDetail``.

    Args:
        response: Detail-page response; ``response.meta`` must carry
            ``title`` and ``ND``; the body is decoded as GBK.

    Yields:
        None, once, after the row has been written.

    Any exception is caught: the URL is printed and the traceback is logged at
    ERROR level.
    """
    # Character classes shared by the patterns below.  Raw strings keep the
    # regex escapes intact without relying on invalid Python string escapes.
    value_chars = r'()\w\.:: \(\)〔〕㎡≤≥《》\-\/\%,、\.﹪'
    status_chars = r'()\w\.:: ,,、;, \(\)〔〕㎡≤≥《》\-\/\%,、\.﹪'
    venue_chars = r'()\w\.:: \(\)〔〕,,㎡≤≥《》\-\/\%,、\.﹪'
    date_range = r'\d{4}年\d{1,2}月\d{1,2}日至\d{4}年\d{1,2}月\d{1,2}日'
    clock = r'\d{1,2}:\d{1,2}'

    try:
        data = Selector(text=response.body.decode('gbk'))
        # Flatten to text and drop NBSP / ideographic spaces before matching.
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        def after(prefix, terminator):
            # Value made of ``value_chars`` that follows ``prefix`` and is
            # closed by ``terminator`` (both are regex fragments).
            return reFunction(
                rf'{prefix}\s*([{value_chars}]*){terminator}', items)

        WJBT_6 = response.meta.get('title')  # file title
        FBBT_7 = response.meta.get('ND')  # publish time
        # Body title: first <p>, else first <div>, else the file title.
        ZWBT_8 = (data.xpath(
            '//*[@id="dl_news"]/tr/td/table[2]/tr/td/p[1]/b/span/text()'
        ).extract_first() or data.xpath(
            '//*[@id="dl_news"]/tr/td/table[2]/tr/td/div[1]/b/span/text()'
        ).extract_first() or WJBT_6)
        # Announcement type: explicit label, else the title's last 4 chars;
        # anything outside the known set is blanked.
        GGNX_9_ = reFunction(rf'公告类型:\s*([{self.reStr}]*)\s',
                             items) or WJBT_6[-4:]
        GGNX_9 = GGNX_9_ if GGNX_9_ in ['出让公告', '补充公告', '地块公告',
                                        '地块公示'] else ''
        # Parcel location
        DKZL_10 = reFunction(
            rf'地块坐落于\s*([{self.reStr}]*)。', items) or reFunction(
                rf'地块位于\s*([{self.reStr}]*)四至为', items)
        # Boundaries
        SZ_11 = reFunction(rf'四至为:\s*([{self.reStr}]*)。', items)
        # Current land status
        TDXZ_12 = reFunction(
            r'现状为\s*([' + status_chars + r']*)。', items) or reFunction(
                r'[,,,、。]现状\s*([' + status_chars + r']*)。', items)
        CRTDMJ_13 = after('出让土地面积', '[,,]')  # transferred area
        TDYT_14 = after('土地用途为', '[,,]')  # land use
        RJL_15 = after('容积率', '[,,]')  # plot ratio
        JZXS_16 = after('建筑系数', '[,,]')  # building coefficient
        JZMD_17 = after('建筑密度', '[,,]')  # building density
        LDL_18 = after('绿地率[为]*', '[,,]')  # green ratio
        JZXG_19 = after('建筑限高', '[,。,]')  # height limit
        ZRHYNB_20 = after('准入产业类别为', '[,,]')  # admitted industry
        TZQD_21 = after('投资强度', '。')  # investment intensity
        CRNX_22 = after('出让年[限期]为', '[,,]')  # transfer term
        QSJ_23 = after('起始价为', '。').replace('人民币', '')  # start price
        BZJ_24 = after('保证金人民币', '[,,]')  # deposit
        BZJJZSJ_25 = after('保证金到账期限为', '[,,]')  # deposit deadline
        BMJZSJ_26 = after('报名截止时间为', '[,。,]')  # registration deadline
        GGQ_27 = after('公告日期为', '[,,]')  # announcement period
        # Listing time.  NOTE(review): the fallback pattern is identical to
        # the first one as seen here; confirm whether a different comma
        # variant was intended.
        GPSJ_28 = after('挂牌时间自', '[,]') or after('挂牌时间自', '[,]')
        # Listing venue: text after the date range, with or without an
        # AM/PM marker before the clock time.
        GPDD_29 = reFunction(
            '挂牌时间自(?:' + date_range + '[上下午]*' + clock + r')\s*(['
            + venue_chars + ']*)。',
            items).strip(',').strip(',') or reFunction(
                '挂牌时间自(?:' + date_range + clock + r')\s*(['
                + venue_chars + ']*)。', items).strip(',').strip(',')
        ZJFD_30 = after('增价幅度为[人民币]*', '。')  # bid increment
        LXDW_31 = after('联系单位:', r'\s')  # contact organisation
        # Contact person; cut anything from a following phone label on.
        LXR_32 = after('联系人:', r'\s').split('联系电话')[0]
        LXDH_33 = after('联系电话:', r'\s')  # contact phone

        crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        url = response.url
        md5Mark = encrypt_md5(url)  # record fingerprint

        csvFile = [
            WJBT_6, FBBT_7, ZWBT_8, GGNX_9, DKZL_10, SZ_11, TDXZ_12,
            CRTDMJ_13, TDYT_14, RJL_15, JZXS_16, JZMD_17, LDL_18, JZXG_19,
            ZRHYNB_20, TZQD_21, CRNX_22, QSJ_23, BZJ_24, BZJJZSJ_25,
            BMJZSJ_26, GGQ_27, GPSJ_28, GPDD_29, ZJFD_30, LXDW_31, LXR_32,
            LXDH_33, crawlingTime, url, md5Mark,
        ]
        cells = []
        for value in csvFile:
            try:
                # Empty values and values made only of '|' separators become
                # an empty column.
                if value and value != '|' * len(value):
                    cells.append(
                        value.replace(',', ' ').replace('\n', '').replace(
                            '\r', '').replace(r'\xa0', '').replace('\xa0', ''))
                else:
                    cells.append('')
            except Exception as e:
                cells.append('')
                self.log(
                    f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                    level=logging.ERROR)
        # Every column, including the last, is followed by a comma.
        results = ''.join(cell + ',' for cell in cells)
        with open(self.pathDetail, 'a+') as fp:
            fp.write(results)
            fp.write('\n')
        self.log('数据获取成功', level=logging.INFO)
        yield
    except Exception as e:
        print(response.url)
        self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                 level=logging.ERROR)
def parse_detail(self, response): # TODO 主动关闭爬虫问题 try: data = Selector(text=response.body.decode('utf-8')) items = str(data.xpath('string(.)').extract()[0]).replace( '\xa0', '').replace('\u3000', '') GGLX_1 = '' WJBT_2 = '' SJ_3 = '' LY_4 = '' ZWBT_5 = '' ZDBH_6 = '' TDWZ_7 = '' CRMJ_8 = '' LHYD_9 = '' DLYD_10 = '' TDYT_11 = '' CRNX_12 = '' RJL_13 = '' JZMD_14 = '' LDL_15 = '' JZKJ_16 = '' QSJ_17 = '' BZJ_18 = '' JJFD_19 = '' BMRQ_20 = '' GPRQ_21 = '' GPJZSJ_22 = '' BZJDZSJ_23 = '' LXDZ_24 = '' LXR_25 = '' LXDH_26 = '' # TODO 共有字段 reFunction(f'时间:\s*([{self.reStr}]*)\s', LY) # 公告类型 GGLX_1 = '出让公告' # 文件标题 WJBT_2 = response.meta.get('title') # 时间 SJ_3 = data.xpath( '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()' ).extract_first() # 来源 LY_4 = data.xpath( '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()' ).extract_first() # 正文标题 ZWBT_5 = data.xpath( '//div[@class="ztzx_frame_content"]/div[1]/text()' ).extract_first() # 爬取时间 crawlingTime = time.strftime("%Y-%m-%d", time.localtime()) # 爬取地址url url = response.url # 唯一标识 md5Mark = encrypt_md5(url + WJBT_2 + SJ_3) # 报名时间起止日期 BMRQ_20 = reFunction(f'报名申请时间:\s*([\w]*);', items) if reFunction( f'报名申请时间:\s*([\w]*);', items ) else reFunction(f'申请人可于(\w*),向我局提交书面申请', items) if reFunction( f'申请人可于(\w*),向我局提交书面申请', items ) else reFunction(f'申请时间为:(\w*)', items) if reFunction( f'申请时间为:(\w*)', items) else reFunction(f'申请人可于(\w*)到', items) GPTime = reFunction(f'网上挂牌(报价)时间:\s*([\w]*)', items) if reFunction( f'网上挂牌(报价)时间:\s*([\w]*)', items) else reFunction( f'挂牌时间为:\s*([\w]*)', items) try: if GPTime: # 挂牌开始时间 GPRQ_21 = GPTime.split('至')[0] # 挂牌截止时间 GPJZSJ_22 = GPTime.split('至')[1] else: GPRQ_21 = reFunction(f'挂牌时间为:\s*([\s\S]*)', reFunction('六、([\s\S]*)七、', items)) GPJZSJ_22 = reFunction(f'挂牌时间为:\s*([\s\S]*)', reFunction('六、([\s\S]*)七、', items)) except Exception as e: self.log(f'详情页数据挂牌时间解析失败, 请求:{response.url}, 信息: {e}', level=logging.DEBUG) GPRQ_21 = '' GPJZSJ_22 = '' # 保证金到账截止时间 BZJDZSJ_23 = reFunction( 
f'保证金到账截止时间为:\s*([\w]*)', items) if reFunction( f'保证金到账截止时间为:\s*([\w]*)', items) else reFunction( f'保证金交纳截止时间:\s*([\w]*)', items) if reFunction( f'保证金交纳截止时间:\s*([\w]*)', items) else reFunction( f'保证金的截止时间为\s*([\w]*)', items) # 联系地址 LXDZ_24 = reFunction( '联系地址:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', items) if reFunction( f'联系地址:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', items) else reFunction( '单位地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)', items) # 联系人 LXR_25 = reFunction( f'联\s系\s人:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', items) # 联系电话 LXDH_26 = reFunction( f'联系电话:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', items) if '挂牌出让宗地的基本情况和规划指标等要求' not in items and '宗地编号' not in items: # 处理 table 情况 soup = BeautifulSoup(response.body.decode('utf-8')) table = soup.find('table') try: tdReplace = table.tbody.find_all('tr')[0].find( 'td', colspan='4') if table.tbody.find_all('tr')[0].find( 'td', colspan='4') else table.tbody.find_all( 'tr')[0].find('td', colspan="2") number = table.tbody.find_all('tr')[0].index(tdReplace) tdList = table.tbody.find_all('tr')[1].find_all('td') for _ in range(1, len(tdList) + 1): table.tbody.find_all('tr')[0].insert( number + _, tdList[_ - 1]) tdReplace.extract() table.tbody.find_all('tr')[1].extract() except: soup = BeautifulSoup(response.body.decode('utf-8')) table = soup.find('table') tdReplace = table.thead.find_all('tr')[0].find( 'td', colspan='4') if table.thead.find_all('tr')[0].find( 'td', colspan='4') else table.thead.find_all( 'tr')[0].find('td', colspan="2") number = table.thead.find_all('tr')[0].index(tdReplace) tdList = table.thead.find_all('tr')[1].find_all('td') for _ in range(1, len(tdList) + 1): table.thead.find_all('tr')[0].insert( number + _, tdList[_ - 1]) tdReplace.extract() table.thead.find_all('tr')[1].extract() table.tbody.insert( 0, table.thead.find_all('tr')[0]) # 插入 thead 的内容 table.thead.extract() htmlTable = htmlTableTransformer() try: tdData = htmlTable.tableTrTdRegulationToList(table) if not tdData and 
'thead' in items: # 如果没有拿到 则可能存在 thead soup = BeautifulSoup(response.body.decode('utf-8')) table = soup.find('table') tdReplace = table.thead.find_all('tr')[0].find( 'td', colspan='4') if table.thead.find_all('tr')[0].find( 'td', colspan='4') else table.thead.find_all( 'tr')[0].find('td', colspan="2") number = table.thead.find_all('tr')[0].index(tdReplace) tdList = table.thead.find_all('tr')[1].find_all('td') for _ in range(1, len(tdList) + 1): table.thead.find_all('tr')[0].insert( number + _, tdList[_ - 1]) tdReplace.extract() table.thead.find_all('tr')[1].extract() table.tbody.insert( 0, table.thead.find_all('tr')[0]) # 插入 thead 的内容 table.thead.extract() htmlTable = htmlTableTransformer() except: tdData = {} for _ in range(len(list(tdData.values())[0])): # 宗地编号 ZDBH_6 = tdData.get('编号')[_] if tdData.get('编号') else '' # 土地位置 TDWZ_7 = tdData.get('土地位置')[_] if tdData.get( '土地位置') else '' # 出让面积(m2) CRMJ_8_0 = tdData.get('土地面积') CRMJ_8_1 = tdData.get('土地面积(平方米)') CRMJ_8_ = list(filter(None, [CRMJ_8_0, CRMJ_8_1])) CRMJ_8 = CRMJ_8_[0][_] if CRMJ_8_ else '' # 土地用途 TDYT_11 = tdData.get('土地用途')[_] if tdData.get( '土地用途') else '' # 岀让年限 CRNX_12 = tdData.get('出让年限(年)')[_] if tdData.get( '出让年限(年)') else '' # 容积率 RJL_13 = tdData.get('容积率')[_] if tdData.get( '容积率') else tdData.get('容 积 率')[_] if tdData.get( '容 积 率') else '' # 建筑密度 # JZMD_14 # 绿地率 LDL_15 = tdData.get('绿化率')[_] if tdData.get('绿化率') else '' # 建筑空间 JZKJ_16 = tdData.get('控制高度(m)')[_] if tdData.get( '控制高度(m)') else tdData.get('建筑限高(m)')[_] if tdData.get( '建筑限高(m)') else '' # 起始价(万元) QSJ_17 = tdData.get('挂牌起始价(万元)')[_] if tdData.get( '挂牌起始价(万元)') else '' # 保证金(万元) BZJ_18 = tdData.get('竞买保证金(万元)')[_] if tdData.get( '竞买保证金(万元)') else tdData.get( '竞买保证金(万元)')[_] if tdData.get('竞买保证金(万元)') else '' # 竞价幅度(万元) JJFD_19 = tdData.get('増价幅度(万元/次)')[_] if tdData.get( '増价幅度(万元/次)') else '' # 是否需要判断重复 请求 if DUPLICATE_SWITCH: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if 
ZDBH_6 and TDYT_11: # 重复效验通过, 存储数据 csvFile = [ GGLX_1, WJBT_2, SJ_3, LY_4, ZWBT_5, ZDBH_6, TDWZ_7, CRMJ_8, LHYD_9, DLYD_10, TDYT_11, CRNX_12, RJL_13, JZMD_14, LDL_15, JZKJ_16, QSJ_17, BZJ_18, JJFD_19, BMRQ_20, GPRQ_21, GPJZSJ_22, BZJDZSJ_23, LXDZ_24, LXR_25, LXDH_26, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' % response.url) # TODO 判断 elif '挂牌出让宗地的基本情况和规划指标等要求' in items: for item in re.split( '\d、', reFunction('一、挂牌出让宗地的基本情况和规划指标等要求:([\s\S]*)二、', items)): # TODO if not item.strip(): continue # 宗地编号 ZDBH_6 = reFunction( f'^([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)宗地位于', item) # 土地位置 TDWZ_7 = reFunction( f'宗地位于([()\w\.:: \(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item) # 出让面积(m2) CRMJ_8 = reFunction( f'土地出让面积([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item) # 土地用途 TDYT_11 = reFunction( f'宗地规划用途为([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item) # 岀让年限 CRNX_12 = reFunction( f'宗地土地出让年期([()\w\.:: —\(\),〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)。', item) # 容积率 RJL_13 = reFunction( f'容积率([()\w\.:: \(\)%〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item) # 建筑密度 JZMD_14 = reFunction( f'建筑密度([()\w\.:: \(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item) # 绿地率 LDL_15 = reFunction( f'绿地率([()\w\.:: \(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item) # 建筑空间 JZKJ_16 = reFunction( f'建筑空间([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item) # 起始价(万元) QSJ_17 = reFunction( f'本宗地起始价([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item) # 保证金(万元) BZJ_18 = reFunction( f'竞买保证金([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)', item) # 竞价幅度(万元) JJFD_19 = reFunction( f'增价幅度([()\w\.:: 
—\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)', item) # 是否需要判断重复 请求 if DUPLICATE_SWITCH: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if ZDBH_6 and TDYT_11: # 重复效验通过, 存储数据 csvFile = [ GGLX_1, WJBT_2, SJ_3, LY_4, ZWBT_5, ZDBH_6, TDWZ_7, CRMJ_8, LHYD_9, DLYD_10, TDYT_11, CRNX_12, RJL_13, JZMD_14, LDL_15, JZKJ_16, QSJ_17, BZJ_18, JJFD_19, BMRQ_20, GPRQ_21, GPJZSJ_22, BZJDZSJ_23, LXDZ_24, LXR_25, LXDH_26, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) elif '挂牌出让地块基本情况' in items and '宗地编号' in items: for item in [ '宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items) [0].split('宗地编号')[1:] ]: # 宗地编号 ZDBH_6 = reFunction( f'宗地编号为([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item) # 土地位置 TDWZ_7 = reFunction( f'该地块([()\w\.:: —\(\)〔〕%㎡≤≥《》,\-\/\%;、\.﹪]*)。出让面积', item) # 出让面积(m2) CRMJ_8 = reFunction( f'出让面积:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item) # 绿化用地 LHYD_9 = reFunction( f'绿化用地:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item) # 道路用地 DLYD_10 = reFunction( f'道路用地:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item) # 土地用途 TDYT_11 = reFunction( f'用途:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item) # 岀让年限 CRNX_12 = reFunction( f'出让年限:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item) # 容积率 RJL_13 = reFunction( f'容积率:*([()\w\.:: ,—\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item) # 建筑密度 JZMD_14 = reFunction( f'建筑密度:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item) # 绿地率 LDL_15 = reFunction( f'绿地率:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item ) if reFunction( f'绿地率:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item) else reFunction( f'绿地率(%)([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item) # 起始价(万元) QSJ_17 = reFunction( f'起始价为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*),', item) # 保证金(万元) BZJ_18 = reFunction( f'竞买保证金为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*),', item) # 竞价幅度(万元) JJFD_19 = reFunction( f'竞价幅度为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*)。', item) # 是否需要判断重复 请求 if DUPLICATE_SWITCH: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if ZDBH_6 and TDYT_11: # 重复效验通过, 存储数据 csvFile = [ GGLX_1, WJBT_2, SJ_3, LY_4, ZWBT_5, ZDBH_6, TDWZ_7, CRMJ_8, LHYD_9, DLYD_10, TDYT_11, CRNX_12, RJL_13, JZMD_14, LDL_15, JZKJ_16, QSJ_17, BZJ_18, JJFD_19, BMRQ_20, GPRQ_21, GPJZSJ_22, BZJDZSJ_23, LXDZ_24, LXR_25, LXDH_26, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\r', '').replace( r'\xa0', '').replace( 
'\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' % response.url) else: if '宗地编号' in items and '地块基本情况' not in items: for item in [ '宗地编号' + _ for _ in re.findall( '一([\s\S]*)二、', items)[0].split('宗地编号')[1:] ]: # 宗地编号 ZDBH_6 = reFunction( f'宗地编号:*\s*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 土地位置 TDWZ_7 = reFunction( f'宗地坐落:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 出让面积(m2) CRMJ_8 = reFunction( f'宗地\s*总*面积:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 土地用途 TDYT_11 = reFunction( f'土地用途[明细]*:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 岀让年限 CRNX_12 = reFunction( f'出让年限:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 容积率 RJL_13 = reFunction( f'容积率:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 建筑密度 JZMD_14 = reFunction( f'建筑密度\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 绿地率 LDL_15 = reFunction( f'绿[地化]率\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item ) if reFunction( f'绿[地化]率\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item ) else reFunction( f'绿地率(%)\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 建筑空间 JZKJ_16 = reFunction( f'建筑限高\(米\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 起始价(万元) QSJ_17 = reFunction( f'起始价:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 保证金(万元) BZJ_18 = reFunction( f'保证金:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 竞价幅度(万元) JJFD_19 = reFunction( f'加价幅度:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 挂牌开始时间 GPRQ_21 = reFunction( f'挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 挂牌截止时间 GPJZSJ_22 = reFunction( 
f'挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s', item) # 联系地址 LXDZ_24 = reFunction( f'联系地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)', items).split('联')[0] if reFunction( f'联系地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)', items) else '' # 联系人 LXR_25 = reFunction( f'联\s系\s人:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)', items ).split('联')[0] if reFunction( f'联\s系\s人:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)', items) else '' # 联系电话 LXDH_26 = reFunction( f'联系电话:\s*([()\d\.:: \(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)', items) # 是否需要判断重复 请求 if DUPLICATE_SWITCH: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if ZDBH_6 and TDYT_11: # 重复效验通过, 存储数据 csvFile = [ GGLX_1, WJBT_2, SJ_3, LY_4, ZWBT_5, ZDBH_6, TDWZ_7, CRMJ_8, LHYD_9, DLYD_10, TDYT_11, CRNX_12, RJL_13, JZMD_14, LDL_15, JZKJ_16, QSJ_17, BZJ_18, JJFD_19, BMRQ_20, GPRQ_21, GPJZSJ_22, BZJDZSJ_23, LXDZ_24, LXR_25, LXDH_26, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ').replace('\n', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) elif '地块基本情况' in items: # todo soup = BeautifulSoup(response.body.decode('utf-8')) table = soup.find('table') htmlTable = htmlTableTransformer() tdData = htmlTable.tableTrTdRegulationToList(table) for _ in range(len(list(tdData.values())[0])): # 宗地编号 ZDBH_6 = tdData.get('编号')[_] if tdData.get( '编号') else '' # 土地位置 TDWZ_7 = tdData.get('地块位置')[_] if tdData.get( '地块位置') else '' # 出让面积(m2) CRMJ_8 = tdData.get('土地面积(亩)')[_] if tdData.get( '土地面积(亩)') else '' # 土地用途 TDYT_11 = tdData.get('土地用途')[_] if tdData.get( '土地用途') else '' # 是否需要判断重复 请求 if DUPLICATE_SWITCH: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if ZDBH_6 and TDYT_11: # 重复效验通过, 存储数据 csvFile = [ GGLX_1, WJBT_2, SJ_3, LY_4, ZWBT_5, ZDBH_6, TDWZ_7, CRMJ_8, LHYD_9, DLYD_10, TDYT_11, CRNX_12, RJL_13, JZMD_14, LDL_15, JZKJ_16, QSJ_17, BZJ_18, JJFD_19, BMRQ_20, GPRQ_21, GPJZSJ_22, BZJDZSJ_23, LXDZ_24, LXR_25, LXDH_26, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ').replace('\n', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' % response.url) except Exception as e: print(response.url) self.log( f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
def parse_detail(self, response):
    """Parse one land-listing announcement page and append it to the CSV.

    The page text is flattened with XPath ``string(.)`` and mined with
    regular expressions built around ``self.reStr`` (the characters allowed
    inside a captured value).  Two layouts are handled:

    * Text layout (no "主要规划指标" heading, or that heading without any
      ``<table>``): every "宗地编号" paragraph between section "一" and
      section "二" is one parcel.  Per-parcel values are accumulated as
      ``|value`` so that several parcels share a single CSV cell.
    * Table layout ("主要规划指标" heading plus a ``<table>``): values are
      read from the mapping returned by
      ``htmlTableTransformer().tableTrTdRegulation``.

    The finished row is appended to ``self.pathDetail``; afterwards the
    generator yields a single ``None``.

    NOTE(review): ``reFunction``, ``encrypt_md5``, ``htmlTableTransformer``
    and ``getVariableName`` are defined elsewhere.  ``reFunction`` is used
    here as "first match of *pattern* in *text*, empty when nothing
    matches" -- confirm against its definition.

    Args:
        response: the response object handed to this callback; only
            ``response.body`` and ``response.url`` are read.

    Yields:
        ``None`` once, after the row has been written.

    Any exception is caught, the URL is printed and the traceback is logged
    at ERROR level; nothing is raised to the caller.
    """

    def joined(pattern, text):
        """Return every match of *pattern* in *text*, joined with '|'."""
        return '|'.join(re.findall(pattern, text))

    try:
        chars = self.reStr
        data = Selector(text=response.body.decode('utf-8'))
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        # Fields that are not filled by every layout start out empty.
        ZDBH_12 = ZDZMJ_13 = ZDZL_14 = SYNX_15 = CRNX_16 = RJL_17 = ''
        JZMD_18 = LDL_19 = JZXG_20 = TDYT_21 = TZQD_22 = BZJ_23 = ''
        GJBGBAH_24 = QSJ_25 = JJFD_26 = GPKSSJ_27 = GPJZSJ_28 = ''
        LXDH_37 = ''

        # ---- fields shared by every layout ----
        # File title
        WJBT_1 = data.xpath('//div[@class="title"]/h1/text()').extract_first()
        # Publish time (embedded in the first toolbar script)
        FBSJ_2 = reFunction(
            r"(\d{4}年\d{2}月\d{2}日 \d{2}:\d{2})';",
            data.xpath(
                '//div[@class="toolbar"]/script[1]/text()').extract_first())
        # Article source (embedded in the second toolbar script)
        WZLY_3 = reFunction(
            rf"document.write\('文章来源:([{chars}]*)'\);",
            data.xpath(
                '//div[@class="toolbar"]/script[2]/text()').extract_first())
        # Index number
        SYH_4 = data.xpath(
            '//div[@class="xxgk_xl_top"]/ul/li[1]/span/text()'
        ).extract_first()
        # Information category
        XXFL_5 = data.xpath(
            '//div[@class="xxgk_xl_top"]/ul/li[2]/span/text()'
        ).extract_first()
        # Issuing agency
        FBJG_6 = reFunction(
            rf'str_1 = "([{chars}]*)";',
            data.xpath(
                '//div[@class="xxgk_xl_top"]/ul/li[3]/span/script/text()'
            ).extract_first())
        # Issue date
        FBRQ_7 = reFunction(
            rf'str_1 = "([{chars}]*)";',
            data.xpath(
                '//div[@class="xxgk_xl_top"]/ul/li[4]/span/script/text()'
            ).extract_first())
        # Document number
        WH_8 = data.xpath(
            '//div[@class="xxgk_xl_top"]/ul/li[5]/span/text()'
        ).extract_first()
        # Validity flag
        SFYX_9 = reFunction(
            rf"var isok='([{chars}]*)';",
            data.xpath(
                '//div[@class="xxgk_xl_top"]/ul/li[6]/script/text()'
            ).extract_first())
        # Information name
        XXMC_10 = data.xpath(
            '//div[@class="xxgk_xl_top"]/ul/li[7]/span/text()'
        ).extract_first()
        # Body title
        ZWBT_11 = data.xpath(
            '//tr[@class="firstRow"]/td/text()').extract_first()

        # Section "八" (contact / bank details) is needed by several fields.
        section8 = reFunction(r'八、[\s\S]*', items)

        # Only pages with a "主要规划指标" heading are searched for a table.
        table = None
        if '主要规划指标' in items:
            soup = BeautifulSoup(response.body.decode('utf-8'))
            table = soup.find('table')

        if not table:
            # ---- text layout: one "宗地编号" paragraph per parcel ----
            parcels = [
                '宗地编号' + part
                for part in re.findall(
                    r'一([\s\S]*)二', items)[0].split('宗地编号')[1:]
            ]
            if parcels:
                # Contact phone; independent of the parcel, so read once.
                LXDH_37 = reFunction(
                    rf'联系电话:\s*([{chars}]*)\s*开户单位', section8)
            for item in parcels:
                # Parcel number.  Every parcel contributes '|value', so
                # values of successive parcels never run together.
                ZDBH_12 += '|' + (
                    joined(
                        rf'[宗地块]*(?:[\s]*)编号:(?:[\s]*)([{chars}]*)宗地总面积',
                        item)
                    or joined(
                        rf'[宗地块]*(?:[\s]*)编号:(?:[\s]*)([{chars}]*)\s*',
                        item))
                # Total parcel area
                ZDZMJ_13 += '|' + (
                    joined(rf'宗地总面积:(?:[\s]*)([{chars}]*)宗地坐落', item)
                    or joined(rf'宗地总面积:(?:[\s]*)([{chars}]*)\s*', item))
                # Parcel location
                ZDZL_14 += '|' + joined(
                    rf'宗地坐落:(?:[\s]*)([{chars}]*)\s*出让年限', item)
                # Transfer term: keep only the leading "<n>年" part.
                CRNX_16 += '|' + reFunction(
                    r'^[|]*\d{1,3}年',
                    joined(rf'出让年限:(?:[\s]*)([{chars}]*)\s*容积率', item))
                # Plot ratio
                RJL_17 += '|' + joined(
                    rf'容积率:(?:[\s]*)([{chars}]*)\s*建筑密度\(%\)', item)
                # Building density (%)
                JZMD_18 += '|' + joined(
                    rf'建筑密度\(%\):(?:[\s]*)([{chars}]*)\s*绿化率', item)
                # Greening ratio (%)
                LDL_19 += '|' + joined(
                    rf'绿化率\(%\):(?:[\s]*)([{chars}]*)\s*建筑限高', item)
                # Building height limit (m)
                JZXG_20 += '|' + joined(
                    rf'建筑限高\(米\):(?:[\s]*)([{chars}]*)\s*土地用途明细', item)
                # Land-use details
                TDYT_21 += '|' + joined(
                    rf'土地用途明细:(?:[\s]*)([{chars}]*)\s*投资强度', item)
                # Investment intensity
                TZQD_22 += '|' + joined(
                    rf'投资强度:(?:[\s]*)([{chars}]*)\s*保证金', item)
                # Deposit
                BZJ_23 += '|' + joined(
                    rf'保证金:(?:[\s]*)([{chars}]*)\s*估价报告备案号', item)
                # Appraisal-report filing number: leading 10-16 word chars.
                # The quantifier must be written `{10,16}` -- with a space
                # inside the braces `re` treats them as literal text -- and
                # the value is matched before any '|' is prepended, because
                # the pattern is anchored on a word character.
                filing_no = (
                    joined(
                        rf'估价报告备案号(?:[\s]*)([{chars}]*)\s*现状土地条件',
                        item)
                    or joined(
                        rf'估价报告备案号(?:[\s]*)([{chars}]*)\s*起始价', item))
                GJBGBAH_24 += '|' + reFunction(r'^\w{10,16}', filing_no)
                # Starting price
                QSJ_25 += '|' + joined(
                    rf'起始价:(?:[\s]*)([{chars}]*)\s*加价幅度', item)
                # Bid increment
                JJFD_26 += '|' + joined(
                    rf'加价幅度:(?:[\s]*)([{chars}]*)\s*挂牌开始时间', item)
                # Listing start time
                GPKSSJ_27 += '|' + joined(
                    rf'挂牌开始时间:(?:[\s]*)([{chars}]*)\s*挂牌截止时间', item)
                # Listing end time
                GPJZSJ_28 += '|' + joined(
                    rf'挂牌截止时间:(?:[\s]*)([{chars}]*)\s*(?:宗地编号|二)', item)
        else:
            # ---- table layout ----
            # Contact phone from section "八" or "七".  The alternation is
            # grouped so that a bare "八" cannot match on its own.
            LXDH_37 = reFunction(
                rf'联系电话:\s*([{chars}]*)\s',
                reFunction(r'(?:八|七)、[\s\S]*', items))
            htmlTable = htmlTableTransformer()
            tdData = htmlTable.tableTrTdRegulation(table)
            # Parcel number
            ZDBH_12 = tdData.get('地块编号')
            # Listed area (m2)
            ZDZMJ_13 = tdData.get(r'挂牌面积(m2)')
            # Land location
            ZDZL_14 = tdData.get('土地坐落')
            # Usage term
            SYNX_15 = tdData.get('使用年限')
            # Starting price (10k CNY)
            QSJ_25 = tdData.get('起始价(万元)')
            # Land use
            TDYT_21 = tdData.get('土地用途')
            # Deposit (10k CNY)
            BZJ_23 = tdData.get('保证金(万元)')
            # "Main planning indicators" cell; the next four values are
            # picked out of its text.
            ZYGHZB = tdData.get('主要规划指标')
            value_chars = r'[()\w\.:: \(\)〔〕≤≥\-\/\%,、\.﹪]*'
            # Plot ratio
            RJL_17 = reFunction(
                rf'容积率[:]*\s*({value_chars})[;。,]?', ZYGHZB)
            # Building density
            JZMD_18 = reFunction(
                rf'建筑密度[:]*\s*({value_chars})容积率', ZYGHZB)
            # Green-space ratio
            LDL_19 = reFunction(
                rf'绿地率[:]*\s*({value_chars})[;。,]?', ZYGHZB)
            # Building height limit
            JZXG_20 = reFunction(
                rf'建筑限高[:]*\s*({value_chars})[;。,]?', ZYGHZB)

        # ---- procedural sections, shared by every layout ----
        section4 = reFunction(r'四、[\s\S]*五、', items)
        section5 = reFunction(r'五、[\s\S]*六、', items)
        # When the transfer documents can be obtained
        HQCRWJSJ_29 = reFunction(
            rf'申请人可于(?:[\s]*)([{chars}]*)到', section4)
        # Where the transfer documents can be obtained
        HQCRWJDD_30 = reFunction(
            rf'申请人可于(?:[\s]*)(?:[{chars}]*)到\s*([{chars}]*)获取 挂牌',
            section4)
        # Registration time
        BMSJ_31 = reFunction(
            rf'申请人可于(?:[\s]*)([{chars}]*)到', section5)
        # Registration place
        BMDD_32 = reFunction(
            rf'申请人可于(?:[\s]*)(?:[{chars}]*)到\s*([{chars}]*)向我局提交书面申请',
            section5)
        # Deposit deadline
        BZJJZSJ_33 = reFunction(
            rf'截止时间为(?:[\s]*)([{chars}]*)\s*。经审', section5)
        # Time by which bidding qualification is confirmed
        QRJMZGSJ_34 = reFunction(
            rf'我局将在\s*([{chars}]*)\s*前确认其竞买资格', section5)
        # Contact address
        LXDZ_35 = reFunction(
            rf'联系地址:\s*([{chars}]*)\s*联 系', section8)
        # Contact person
        LXR_36 = reFunction(
            rf'联 系\s*人:\s*([{chars}]*)\s*联系电话', section8)
        # Account holder
        KHDW_38 = reFunction(
            rf'开户单位:\s*([{chars}]*)\s*开户银行', section8)
        # Bank
        KHYH_39 = reFunction(
            rf'开户银行:\s*([{chars}]*)\s*银行帐号', section8)
        # Bank account: the leading 17 digits of the captured text
        YHZH_40 = reFunction(
            r'^\d{17}',
            reFunction(rf'银行帐号:\s*([{chars}]*)\s*', section8))

        # Crawl time
        crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Crawled URL
        url = response.url
        # Unique mark of the record
        md5Mark = encrypt_md5(url)

        # ---- serialise and store ----
        csvFile = [
            WJBT_1, FBSJ_2, WZLY_3, SYH_4, XXFL_5, FBJG_6, FBRQ_7, WH_8,
            SFYX_9, XXMC_10, ZWBT_11, ZDBH_12, ZDZMJ_13, ZDZL_14, SYNX_15,
            CRNX_16, RJL_17, JZMD_18, LDL_19, JZXG_20, TDYT_21, TZQD_22,
            BZJ_23, GJBGBAH_24, QSJ_25, JJFD_26, GPKSSJ_27, GPJZSJ_28,
            HQCRWJSJ_29, HQCRWJDD_30, BMSJ_31, BMDD_32, BZJJZSJ_33,
            QRJMZGSJ_34, LXDZ_35, LXR_36, LXDH_37, KHDW_38, KHYH_39,
            YHZH_40, crawlingTime, url, md5Mark,
        ]
        results = ''
        # NOTE(review): the loop variable stays `_` because getVariableName
        # (defined elsewhere) may derive the reported name from it --
        # confirm before renaming.
        for _ in csvFile:
            try:
                # Empty values and cells made only of '|' separators are
                # written as empty cells.
                if _ and _ != '|' * len(_):
                    results += _.replace(',', ' ').replace(
                        '\n', '').replace('\r', '') + ','
                else:
                    results += ','
            except Exception as e:
                # A value that cannot be serialised (e.g. not a string)
                # becomes an empty cell instead of losing the whole row.
                results += ','
                self.log(
                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                    level=logging.ERROR)
        with open(self.pathDetail, 'a+') as fp:
            fp.write(results)
            fp.write('\n')
        self.log('数据获取成功', level=logging.INFO)
        yield
    except Exception as e:
        print(response.url)
        self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                 level=logging.ERROR)
def parse_detail(self, response):
    """Parse one land-supply result page and append its record(s) to the CSV.

    The page text is flattened with XPath ``string(.)``.  Two layouts are
    handled:

    * Table layout (the text contains neither "宗地编号" nor "行政区"):
      the first ``<table>`` is turned into a column mapping by
      ``htmlTableTransformer().tableTrTdRegulationToList`` and every table
      row becomes one CSV row.
    * Key/value layout (otherwise): each field is cut out of the flattened
      text with a regular expression built around ``self.reStr``; the page
      becomes a single CSV row.

    Before a row is stored, ``self.duplicateUrl`` is incremented when
    ``DUPLICATE_SWITCH`` is on and ``self.redisClient.isExist`` reports the
    record mark as known.  While that counter is below 50 the row is
    appended to ``self.pathDetail`` and the generator yields ``None``;
    otherwise the spider is asked to close.

    NOTE(review): ``reFunction``, ``encrypt_md5``, ``htmlTableTransformer``,
    ``getVariableName`` and ``DUPLICATE_SWITCH`` are defined elsewhere.
    ``reFunction`` is used here as "first match of *pattern* in *text*,
    empty when nothing matches" -- confirm against its definition.

    Args:
        response: the response object handed to this callback;
            ``response.body``, ``response.url`` and ``response.meta`` are
            read.

    Yields:
        ``None`` after every stored row.

    Exceptions are caught and logged with a traceback; nothing is raised
    to the caller.
    """

    def store_row(fields, mark):
        """Run the duplicate check and append *fields* as one CSV line.

        Returns True when the row was written, False when the duplicate
        limit was reached and the spider was asked to close instead.
        """
        if DUPLICATE_SWITCH and self.redisClient.isExist(mark):
            # Known record: count it towards the duplicate limit.
            self.duplicateUrl += 1
        if self.duplicateUrl < 50:
            results = ''
            # NOTE(review): the loop variable stays `_` because
            # getVariableName (defined elsewhere) may derive the reported
            # name from it -- confirm before renaming.
            for _ in fields:
                try:
                    # Empty values and cells made only of '|' are written
                    # as empty cells.
                    if _ and _ != '|' * len(_):
                        results += _.replace(',', ' ').replace(
                            '\n', '').replace('\r', '').replace(
                                r'\xa0', '').replace('\xa0', '') + ','
                    else:
                        results += ','
                except Exception as e:
                    # A value that cannot be serialised becomes an empty
                    # cell instead of losing the whole row.
                    results += ','
                    self.log(
                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results)
                fp.write('\n')
            self.log('数据获取成功', level=logging.INFO)
            return True
        self.crawler.engine.close_spider(
            self, 'response msg info %s, job duplicated!' % response.url)
        return False

    try:
        chars = self.reStr
        data = Selector(text=response.body.decode('utf-8'))
        items = str(data.xpath('string(.)').extract()[0]).replace(
            '\xa0', '').replace('\u3000', '')

        # Each layout fills only part of the record; the rest stays empty.
        XZQ_21 = DZJGH_22 = XMMC_23 = XMWZ_24 = MJ_25 = TDLY_26 = ''
        TSYT_27 = GDFS_28 = TDSYNX_29 = HYFL_30 = TDJB_31 = CJJG_32 = ''
        ZFQH_33 = YDZFRQ_34 = YDZFJE_35 = BZ_36 = TDSTQR_37 = ''
        SX_38 = XX_39 = YDJDSJ_40 = YDKGSJ_41 = YDJGSJ_42 = ''
        SJKGSJ_43 = SJJGSJ_44 = PZDW_45 = HTQDRQ_46 = ''

        def current_row():
            """Return the record in CSV column order (current values)."""
            return [
                BT_18, LY_19, SJ_20, XZQ_21, DZJGH_22, XMMC_23, XMWZ_24,
                MJ_25, TDLY_26, TSYT_27, GDFS_28, TDSYNX_29, HYFL_30,
                TDJB_31, CJJG_32, ZFQH_33, YDZFRQ_34, YDZFJE_35, BZ_36,
                TDSTQR_37, SX_38, XX_39, YDJDSJ_40, YDKGSJ_41, YDJGSJ_42,
                SJKGSJ_43, SJJGSJ_44, PZDW_45, HTQDRQ_46, crawlingTime,
                url, md5Mark,
            ]

        # ---- fields shared by both layouts ----
        # Title (handed over by the list page through request meta)
        BT_18 = response.meta.get('title')
        LY = data.xpath(
            '//div[@class="content-small-title"]/text()').extract_first()
        # Source
        LY_19 = reFunction(rf'来源:\s*([{chars}]*)\s', LY)
        # Time
        SJ_20 = reFunction(rf'时间:\s*([{chars}]*)\s', LY)

        htmlTable = htmlTableTransformer()
        if '宗地编号' not in items and '行政区' not in items:
            # ---- table layout ----
            try:
                soup = BeautifulSoup(response.body.decode('utf-8'))
                table = soup.find_all('table')[0]
                # Drop the first row unless it is the header row, i.e.
                # contains "用地单位" or "受让人".
                if not table.tbody.find_all('tr')[0].find_all(
                        text=re.compile("用地单位|受让人")):
                    table.tbody.find_all('tr')[0].extract()
                tdsData = htmlTable.tableTrTdRegulationToList(table)

                def cell(row, *headers):
                    """Value at *row* of the first non-empty column among
                    *headers*; '' when none of them has data."""
                    for header in headers:
                        column = tdsData.get(header)
                        if column:
                            return column[row]
                    return ''

                row_count = len(list(tdsData.values())[0])
                for row in range(row_count):
                    # Project location
                    XMWZ_24 = cell(row, '土地座落', '宗地位置')
                    # Area
                    MJ_25 = cell(row, '出让面积(公顷)', '出让面积', '出让/划拨面积')
                    # Land use
                    TSYT_27 = cell(row, '土地用途', '用途明细')
                    # Supply method
                    GDFS_28 = cell(row, '供应方式')
                    # Land grade
                    TDJB_31 = cell(row, '土地级别')
                    # Deal price
                    CJJG_32 = cell(row, '出让价款', '出让价款(万元)', '出让/划拨价歀')
                    # Land-use-right holder
                    TDSTQR_37 = cell(row, '用地单位', '受让人')
                    # Contract signing date
                    HTQDRQ_46 = cell(row, '签订日期')
                    # Crawl time
                    crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
                    # Crawled URL
                    url = response.url
                    # Unique mark of the record
                    md5Mark = encrypt_md5(url + LY_19 + SJ_20)
                    if store_row(current_row(), md5Mark):
                        yield
            except Exception as e:
                # Table parsing is best effort and never fails the page,
                # but the failure is logged rather than silently dropped
                # so that broken layouts can be diagnosed.
                self.log(
                    f'详情页表格解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                    level=logging.ERROR)
        else:
            # ---- key/value layout: regular-expression extraction ----
            # Administrative district
            XZQ_21 = reFunction(rf'行政区:([{chars}]*)电子监管号', items)
            # Electronic supervision number
            DZJGH_22 = reFunction(rf'电子监管号:([{chars}]*)项目名称', items)
            # Project name
            XMMC_23 = reFunction(
                rf'项目名称:([{chars}]*)项目位置', items) or reFunction(
                    rf'宗地编号([{chars}]*)地块位置', items)
            # Project location.  The closing label "面积(公顷):" must be
            # matched literally: unescaped parentheses would form a capture
            # group and never match the label.  ASCII and full-width
            # punctuation are both accepted.
            XMWZ_24 = reFunction(
                rf'项目位置:([{chars}]*)面积[(\uff08]公顷[)\uff09][:\uff1a]',
                items) or reFunction(
                    rf'地块位置([{chars}]*)土地用途', items)
            # Area
            MJ_25 = reFunction(
                rf'面积\(公顷\):([{chars}]*)土地来源', items) or reFunction(
                    rf'土地面积\(公顷\)([{chars}]*)出让年限', items)
            # Land source
            TDLY_26 = reFunction(rf'土地来源:([{chars}]*)土地用途', items)
            # Land use
            TSYT_27 = reFunction(
                rf'土地用途:([{chars}]*)供地方式', items) or data.xpath(
                    'string(//table/tbody/tr[5]/td[1])').extract_first()
            # Supply method
            GDFS_28 = reFunction(rf'供地方式:([{chars}]*)土地使用年限', items)
            # Land-use term
            TDSYNX_29 = reFunction(
                rf'土地使用年限:([{chars}]*)行业分类', items) or reFunction(
                    rf'出让年限([{chars}]*)成交价\(万元\)', items)
            # Industry category
            HYFL_30 = reFunction(rf'行业分类:([{chars}]*)土地级别', items)
            # Land grade
            TDJB_31 = reFunction(
                rf'土地级别:([{chars}]*)成交价格\(万元\)', items)
            # Deal price
            CJJG_32 = reFunction(
                rf'成交价格\(万元\):([{chars}]*)分期支付约定',
                items) or reFunction(
                    rf'成交价格\(万元\)([{chars}]*)明细用途', items)
            # Instalment agreement: instalment number
            ZFQH_33 = data.xpath(
                '//table/tbody/tr[10]/td[1]/text()').extract_first()
            # Instalment agreement: agreed payment date
            YDZFRQ_34 = data.xpath(
                '//table/tbody/tr[10]/td[2]/text()').extract_first()
            # Instalment agreement: agreed payment amount
            YDZFJE_35 = data.xpath(
                '//table/tbody/tr[10]/td[3]/text()').extract_first()
            # Instalment agreement: remark
            BZ_36 = data.xpath(
                'string(//table/tbody/tr[10]/td[4])').extract_first()
            # Land-use-right holder
            TDSTQR_37 = reFunction(
                rf'土地使用权人:([{chars}]*)约定容积率', items) or reFunction(
                    rf'受让单位([{chars}]*)备注', items)
            # Agreed plot ratio: text after "下限:" (lower limit)
            SX_38 = reFunction(rf'下限:([{chars}]*)上限', items)
            # Agreed plot ratio: text after "上限:" (upper limit)
            XX_39 = reFunction(rf'上限:([{chars}]*)约定交地时间', items)
            # Agreed hand-over time
            YDJDSJ_40 = reFunction(
                rf'约定交地时间:([{chars}]*)约定开工时间', items)
            # Agreed construction start
            YDKGSJ_41 = reFunction(
                rf'约定开工时间:([{chars}]*)约定竣工时间', items)
            # Agreed completion
            YDJGSJ_42 = reFunction(
                rf'约定竣工时间:([{chars}]*)实际开工时间', items)
            # Actual construction start
            SJKGSJ_43 = reFunction(
                rf'实际开工时间:([{chars}]*)实际竣工时间', items)
            # Actual completion
            SJJGSJ_44 = reFunction(
                rf'实际竣工时间:([{chars}]*)批准单位', items)
            # Approving authority
            PZDW_45 = reFunction(rf'批准单位:([{chars}]*)合同签订日期', items)
            # Contract signing date
            HTQDRQ_46 = reFunction(rf'合同签订日期:([{chars}]*)\s', items)
            # Crawl time
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # Crawled URL
            url = response.url
            # Unique mark of the record
            md5Mark = encrypt_md5(url + LY_19 + SJ_20)
            if store_row(current_row(), md5Mark):
                yield
    except Exception as e:
        print(response.url)
        self.log(
            f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
            level=logging.ERROR)