class LawyerWriterPipeline(object):
    """Item pipeline that persists scraped lawyer items.

    An item is written to the temp user-info table only when it carries a
    licence number (``UILawNumber``); practice-field links are written
    independently when the item carries a ``fiil_str`` id list.
    """

    def __init__(self):
        # Data-access object for the temp user tables (project-defined).
        self.userInfoInfoData = UserInfoInfoData()

    def process_item(self, item, spider):
        """Insert the item into the temp tables and pass it through unchanged.

        Items without 'UILawNumber' are forwarded untouched.
        """
        line = dict(item)
        # dict.has_key() was removed in Python 3 -- `in` works on both.
        if 'UILawNumber' in line:
            # UISex is optional on some spiders; default it to NULL.
            line.setdefault('UISex', None)
            self.userInfoInfoData.insert_temp_userinfo(
                (line["UIID"], line["UIPhone"], line["UIName"],
                 line["UIEmail"], line["UIPic"], line["UILawNumber"],
                 line["LawOrg"], line["ProvinceCode"], line["CityCode"],
                 line["Address"], line["UISignature"], line['UISex']))
        if 'fiil_str' in line:
            try:
                for fiid in item['fiil_str']:
                    self.userInfoInfoData.insert_temp_userfield(
                        (str(uuid.uuid1()).replace('-', ''), fiid,
                         line["UIID"]))
            except Exception as e:
                # Best-effort insert: report the failure but keep the item.
                print(e)
        return item

    def close_spider(self, spider):
        # Nothing to clean up; the DAO manages its own connections.
        pass
class SiChuanLawyerSpider(scrapy.Spider):
    """Scrapes lawyer profiles from the Sichuan legal-service portal."""

    name = "sichuan_law_spider"
    start_urls = ["http://fwpt.scsf.gov.cn/lsfw/lsfw.shtml"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '51'  # administrative code for Sichuan
    baseurl = "http://fwpt.scsf.gov.cn/lsfw/lsfwlist.shtml"

    def parse(self, response):
        """Request page 1 of the lawyer list for every city dropdown entry."""
        for item in response.css(".dropdown a"):
            prostr = ''.join(item.xpath("@onclick").extract())
            # onclick looks like sfjdjg('<citycode>') -- strip the wrapper.
            citycode = prostr.replace(u"sfjdjg('", '').replace(u"')", '')
            cityname = ''.join(item.xpath("text()").extract())
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxPageList,
                errback=self.handle_error,
                meta={'areacode': citycode,
                      'cityname': cityname},
                formdata={"page": '1',
                          'fydm': citycode,
                          'kplb': '2'})

    def parseAjaxPageList(self, response):
        """Read the page count from the last pager link and walk every page."""
        pagecount = int(''.join(
            response.xpath(u'//a[last()]/@onclick').extract()).replace(
                u'query(', '').replace(u')', ''))
        # BUG FIX: range(1, pagecount) dropped the final page; pages are
        # numbered 1..pagecount inclusive.
        for page in range(1, pagecount + 1):
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxList,
                errback=self.handle_error,
                meta={'areacode': response.meta['areacode'],
                      'cityname': response.meta['cityname']},
                formdata={"page": str(page),
                          'fydm': response.meta['areacode'],
                          'kplb': '2'})

    def parseAjaxList(self, response):
        """Follow every profile link on a result page."""
        for i in response.xpath(
                "//div[@class='synopsis_N fl']/a/@href").extract():
            detail_url = 'http://fwpt.scsf.gov.cn/' + i
            yield scrapy.Request(
                url=detail_url,
                method="GET",
                dont_filter=True,
                callback=self.parse_detail,
                errback=self.handle_error,
                meta={'cityname': response.meta['cityname']},
            )

    # Detail page
    def parse_detail(self, response):
        """Extract one lawyer profile page into an item dict.

        Returns the item only for well-formed (17-char) licence numbers
        that are not yet stored; otherwise returns None.
        """
        item = {}
        # [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],[UILawNumber],
        # [LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = ''.join(
            response.xpath('/html/body/div[3]/table/tbody/tr[5]/td[4]/text()').
            extract()).replace('\t', '').replace('\n', '')
        # BUG FIX: the old class [3,4,5,6,7,8] also matched a literal comma.
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = ''.join(
            response.css('.font18::text').extract()).replace(
                u'执业证号 (', '').replace(u')', '').replace(u"\xa0", '')
        if item["UILawNumber"] is not None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
                (item["UILawNumber"], )) is None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = ''.join(response.css('.font28::text').extract())
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = ''.join(
                response.css('.lsjjxg3::text').extract()).replace(
                    '\t', '').replace('\n', '')
            item['UIEmail'] = ''.join(
                response.xpath(
                    '/html/body/div[3]/table/tbody/tr[6]/td[4]/text()').
                extract()).replace('\t', '').replace('\n', '')
            item["UISignature"] = None
            item["Address"] = None
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['cityname'])))
            # Portrait download path.
            dirname = 'sichuan'
            head_url = ''.join(
                response.xpath(
                    '/html/body/div[3]/table/tbody/tr[1]/td[1]/img/@src').
                extract())
            # NOTE(review): host sd.12348.gov.cn looks copy-pasted from the
            # Shandong spider -- confirm the correct image host for Sichuan.
            item["UIPic"] = ''.join(
                http_util.downloadImage(
                    ["http://sd.12348.gov.cn/" + head_url],
                    '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        """Log a failed request (works as a scrapy errback)."""
        print("error url is :%s" % result.request.url)
        self.logger.error("error url is :%s" % result.request.url)
class HeBeiLawyerSpider(scrapy.Spider):
    """Scrapes lawyer profiles from the Hebei 12348 search API (JSON)."""

    name = "hebei_law_spider"
    start_urls = ["http://he.12348.gov.cn/skywcm/webpage/search/index.jsp"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '13'  # administrative code for Hebei
    baseurl = "http://he.12348.gov.cn/skywcm/webpage/search/search_do.jsp"

    def parse(self, response):
        """Request page 1 for every city tab.

        The first three <dd> entries are not cities and are skipped.
        """
        for item in response.xpath("//dl[@class='searchTab1']/dd")[3:]:
            # @data looks like {districtcode:'<code>'} -- strip the wrapper.
            citycode = item.xpath('@data').extract_first().replace(
                u"{districtcode:'", "").replace(u"'}", '')
            cityname = item.xpath('a/text()').extract_first()
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxPageList,
                errback=self.handle_error,
                meta={'areacode': citycode,
                      'cityname': cityname},
                formdata={
                    "pageNum": '1',
                    'pageSize': str(self.pagesize),
                    'districtcode': citycode,
                    'type': '2',
                    'businessType': '1',
                    'pkid': '0',
                    't': str(int(time.time()))  # cache-buster
                })

    def parseAjaxPageList(self, response):
        """Read pageCount from the JSON reply and request every page."""
        data = json.loads(response.body_as_unicode())
        pagecount = int(data['pageCount'])
        # BUG FIX: range(1, pagecount) dropped the final page.
        for page in range(1, pagecount + 1):
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxList,
                errback=self.handle_error,
                meta={'cityname': response.meta["cityname"]},
                formdata={
                    "pageNum": str(page),
                    'pageSize': str(self.pagesize),
                    'districtcode': response.meta['areacode'],
                    'type': '2',
                    'businessType': '1',
                    'pkid': '0',
                    't': str(int(time.time()))
                })

    def parseAjaxList(self, response):
        """Convert each JSON record on a result page into an item."""
        data = json.loads(response.body_as_unicode())
        for item in data['datas']:
            item['cityname'] = response.meta['cityname']
            yield self.parse_detail(item)

    # Detail record (JSON dict, not a Response)
    def parse_detail(self, data):
        """Map one JSON lawyer record into an item dict.

        Returns the item only for well-formed (17-char) licence numbers
        that are not yet stored; otherwise returns None.
        """
        item = {}
        # [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],[UILawNumber],
        # [LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        # dict.get replaces the Py2-only has_key() checks.
        uiphone = data.get('cell_phone', '')
        # BUG FIX: the old class [3,4,5,6,7,8] also matched a literal comma.
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = data.get('accountcode')
        if item["UILawNumber"] is not None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
                (item["UILawNumber"], )) is None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = data['user_name']
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = data.get('accountorg')
            item['UIEmail'] = data.get('email')
            item["UISignature"] = None
            item["Address"] = data.get('address')
            # Source encodes male as 1; we store 0 = male, 1 = female.
            item['UISex'] = 0 if data['sex'] == 1 else 1
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (data['cityname'])))
            # Portrait download path.
            # NOTE(review): "shandong" looks copy-pasted from another
            # spider; kept as-is because existing files live there.
            dirname = "shandong"
            item["UIPic"] = ''.join(
                http_util.downloadImage(
                    [data['picImg']],
                    '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
            return item

    def handle_error(self, result, *args, **kw):
        """Log a failed request (works as a scrapy errback)."""
        print("error url is :%s" % result.request.url)
        self.logger.error("error url is :%s" % result.request.url)
class GuangXiLawyerSpider(scrapy.Spider):
    """Scrapes lawyer profiles from the Guangxi 12348 portal.

    BUG FIX: this class was previously named ShanDongLawyerSpider, which
    collided with the real Shandong spider defined later in this module,
    so one of the two class objects was silently shadowed.  Scrapy looks
    spiders up by the ``name`` attribute, which is unchanged.
    """

    name = "guangxi_law_spider"
    start_urls = ["http://gx.12348.gov.cn/lssws/index.jhtml"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '45'  # administrative code for Guangxi
    baseurl = "http://gx.12348.gov.cn/lssws/index_1.jhtml?qkey=mscms.ms.getLSList&args_code={0}"

    def parse(self, response):
        """Request the first result page for every city button.

        The first button is an "all" entry and is skipped.
        """
        for item in response.css(".content-inquiry-city button")[1:]:
            # @q carries the full area code; the city prefix is 4 digits.
            citycode = item.xpath("@q").extract_first()[0:4]
            cityname = item.xpath("text()").extract_first()
            city_href = self.baseurl.format(citycode)
            yield scrapy.FormRequest(
                url=city_href,
                method="GET",
                dont_filter=True,
                callback=self.parsePageList,
                errback=self.handle_error,
                meta={'cityname': cityname,
                      'citycode': citycode},
            )

    def parsePageList(self, response):
        """Derive the page count from the hidden total and walk the pages."""
        pagecountstr = response.css('#totalnum::attr(value)').extract_first()
        # NOTE(review): this formula was inherited as-is and looks like an
        # ad-hoc ceil(); confirm against the site's real paging maths.
        # `//` keeps integer division working on both Python 2 and 3.
        pagecount = (int(pagecountstr) - 1) // (self.pagesize + 1)
        page_next_url = "http://gx.12348.gov.cn/lssws/index_{0}.jhtml?qkey=mscms.ms.getLSList&args_code={1}"
        # BUG FIX: range(1, pagecount) dropped the final page.
        for page in range(1, pagecount + 1):
            yield scrapy.FormRequest(
                url=page_next_url.format(str(page), response.meta['citycode']),
                method="GET",
                dont_filter=True,
                callback=self.parseList,
                errback=self.handle_error,
                meta=response.meta,
            )

    def parseList(self, response):
        """Follow every profile link on a result page."""
        for item in response.css(
                '.search-results-box a::attr(href)').extract():
            # Links are relative ("../..."); rebase onto the site root.
            detail_url = "http://gx.12348.gov.cn" + item.replace('..', '')
            yield scrapy.FormRequest(
                url=detail_url,
                method="GET",
                dont_filter=True,
                callback=self.parse_detail,
                errback=self.handle_error,
                meta={'cityname': response.meta['cityname']},
            )

    # Detail page
    def parse_detail(self, response):
        """Extract one lawyer profile page into an item dict.

        Returns the item only for well-formed (17-char) licence numbers
        that are not yet stored; otherwise returns None.
        """
        item = {}
        # [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],[UILawNumber],
        # [LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = response.css('.zynx::text').extract_first().replace(
            '\t', '').replace('\r', '').replace('\n', '')
        # BUG FIX: the old class [3,4,5,6,7,8] also matched a literal comma.
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = response.css(
            '.zyzh::text').extract_first().replace('\t', '').replace(
                '\r', '').replace('\n', '')
        if item["UILawNumber"] is not None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
                (item["UILawNumber"], )) is None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = response.xpath(
                "//div[@class='row ryjs-top-name']/h3/text()").extract_first(
                ).replace('\t', '').replace('\r', '').replace('\n', '')
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = response.css(
                ".zyjg a::text").extract_first().replace('\t', '').replace(
                    '\r', '').replace('\n', '')
            item['UIEmail'] = None
            item["UISignature"] = None
            # Practice fields come as a comma-separated hidden input.
            item['fiil_str'] = field_info_dic.find_field_by_name(''.join(
                response.css("#ywzc::attr(value)").extract()).split(u","))
            item["Address"] = response.xpath(
                "/html/body/div[1]/div[4]/div/div[2]/div[2]/div[1]/div[5]/span/text()"
            ).extract_first().replace('\t', '').replace('\r', '').replace(
                '\n', '')
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['cityname'])))
            # Portrait download path.
            dirname = 'guangxi'
            headurl = "http://gx.12348.gov.cn" + ''.join(
                response.xpath('//img[@id="img-billid"]/@src').extract()
            ).replace('..', '')
            item["UIPic"] = ''.join(
                http_util.downloadImage(
                    [headurl],
                    '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        """Log a failed request (works as a scrapy errback)."""
        print("error url is :%s" % result.request.url)
        self.logger.error("error url is :%s" % result.request.url)
class HeiLongJiangLawyerSpider(scrapy.Spider):
    """Scrapes lawyer profiles from the Heilongjiang 12348 portal."""

    name = "hlj_law_spider"
    start_urls = [
        "http://hl.12348.gov.cn/gfpt/public/gfpt/ggflfw/wsbs/ls/tolist?dqbm=23"
    ]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '23'  # administrative code for Heilongjiang
    baseurl = "http://hl.12348.gov.cn/gfpt/public/gfpt/ggflfw/wsbs/ls/listlsry"

    def parse(self, response):
        """Request page 1 of the roster for every city in the sidebar."""
        for item in response.css(
                "#shiqu_second li a::attr(onclick)").extract():
            # onclick looks like xzShi('<code>','<name>', ...).
            prostr = item.split(u',')
            citycode = prostr[0].replace(u'xzShi(', '').replace(u"'", '')
            cityname = prostr[1].replace(u"'", '')
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxPageList,
                errback=self.handle_error,
                meta={'citycode': citycode,
                      'cityname': cityname},
                formdata={
                    "dqPage": '1',
                    'countSize': str(self.pagesize),
                    'startSize': '1',
                    'dqbm': citycode,
                    'type': '1',
                    'rymc': u'请输入关键词'
                })

    def parseAjaxPageList(self, response):
        """Parse page 1 of the JSON roster, then request the remaining pages."""
        yieldlist = []
        data = json.loads(response.body_as_unicode())
        pagecount = int(data['countPage'])
        # Page 1's records are in this response already.
        yieldlist.extend(self.parseAjaxList(response))
        # BUG FIX: range(2, pagecount) dropped the final page.
        for page in range(2, pagecount + 1):
            # NOTE(review): startSize = pagesize + page looks wrong (an
            # offset would normally be pagesize*(page-1)+1); kept as-is
            # pending confirmation against the endpoint's paging contract.
            countSize = self.pagesize * page
            startSize = self.pagesize + page
            yieldlist.append(
                scrapy.FormRequest(
                    url=self.baseurl,
                    method="POST",
                    headers={'X-Requested-With': 'XMLHttpRequest'},
                    dont_filter=True,
                    callback=self.parseAjaxList,
                    errback=self.handle_error,
                    meta={'citycode': response.meta['citycode'],
                          'cityname': response.meta['cityname']},
                    formdata={
                        "dqPage": str(page),
                        'countSize': str(countSize),
                        'startSize': str(startSize),
                        'dqbm': response.meta['citycode'],
                        'type': '1',
                        'rymc': u'请输入关键词'
                    }))
        return yieldlist

    def parseAjaxList(self, response):
        """Follow the detail page of every roster entry in the JSON reply."""
        data = json.loads(response.body_as_unicode())
        detail_url = 'http://hl.12348.gov.cn/gfpt/public/gfpt/ggflfw/wsbs/ls/ryDetail?rybm={0}'
        for item in data['lsrylist']:
            yield scrapy.Request(
                url=detail_url.format(item[0]),  # item[0] is the person id
                method="GET",
                dont_filter=True,
                meta={'cityname': response.meta['cityname']},
                errback=self.handle_error,
                callback=self.parse_detail,
            )

    # Detail page
    def parse_detail(self, response):
        """Extract one lawyer profile page into an item dict.

        Returns the item only for well-formed (17-char) licence numbers
        that are not yet stored; otherwise returns None.
        """
        item = {}
        # [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],[UILawNumber],
        # [LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        print(response.url)
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        # BUG FIX: .re('[^s]') kept everything except the LETTER "s";
        # the intent was clearly to drop whitespace, i.e. [^\s].
        uiphone = ''.join(
            response.xpath(
                '/html/body/div[2]/div/div[3]/div/dl/dd/li[6]/text()').re(
                    r'[^\s]')).replace(' ', '').replace('\t', '').replace(
                        '\n', '')
        # BUG FIX: the old class [3,4,5,6,7,8] also matched a literal comma.
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = ''.join(
            response.xpath(
                '/html/body/div[2]/div/div[3]/div/dl/dd/li[3]/text()').re(
                    r'[^\s]')).replace(' ', '').replace('\t', '').replace(
                        '\n', '')
        if item["UILawNumber"] is not None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
                (item["UILawNumber"], )) is None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = ''.join(
                response.xpath(
                    '/html/body/div[2]/div/div[3]/div/dl/dd/h1/text()').re(
                        r'[^\s]')).replace(' ', '').replace('\t', '').replace(
                            '\n', '')
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = ''.join(
                response.xpath(
                    '/html/body/div[2]/div/div[3]/div/dl/dd/li[4]/a/text()').
                re(r'[^\s]')).replace(' ', '').replace('\t', '').replace(
                    '\n', '')
            item['UIEmail'] = None
            item["UISignature"] = ''.join(
                response.xpath('//*[@id="news_content_0"]/text()').re(
                    r'[^\s]')).replace(' ', '').replace('\t', '').replace(
                        '\n', '').replace('\r', '')
            item["Address"] = None
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['cityname'])))
            # Portrait download path.
            dirname = 'hlj'
            headurl = response.xpath(
                '/html/body/div[2]/div/div[3]/div/dl/dt/img/@src'
            ).extract_first()
            item["UIPic"] = ''.join(
                http_util.downloadImage(
                    ["http://hl.12348.gov.cn" + headurl],
                    '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        """Log a failed request (works as a scrapy errback)."""
        print("error url is :%s" % result.request.url)
        self.logger.error("error url is :%s" % result.request.url)
class ZhaoFaLawyerSpider(scrapy.Spider):
    """Scrapes lawyer profiles from findlaw.cn, city by city."""

    name = "zhaofa_lawyer"
    start_urls = ["http://china.findlaw.cn/beijing/lawyer"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()

    def parse(self, response):
        """Resolve every province's city list, then crawl each city page.

        NOTE(review): this uses blocking requests.post() inside a scrapy
        callback, which stalls the reactor while the city lists load.
        Kept as-is because the endpoint is an out-of-band AJAX helper.
        """
        provinceSet = list()
        urlmetedata = list()
        child_city_url = 'http://china.findlaw.cn/area_front/index.php?c=ajax&a=getChildCity'
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
        }
        for item in response.xpath(
                "//select[@id='province']/option/@value").extract():
            if item != '':
                provinceSet.append(item)
        for pro in provinceSet:
            res = requests.post(child_city_url,
                                data={
                                    'areacode': pro,
                                    'profPy': None,
                                    'typeid': '0'
                                },
                                headers=headers)
            data = res.json()
            for key in data['data']:
                try:
                    urlmetedata.append({
                        'url': data['data'][key]['url'],
                        'province': data['data'][key]['province'],
                        'city': data['data'][key]['city']
                    })
                except Exception:
                    # Some entries lack url/province/city; skip them.
                    pass
        for item in urlmetedata:
            yield scrapy.Request(url=item['url'],
                                 meta={
                                     'province': item['province'],
                                     'city': item['city']
                                 },
                                 callback=self.parse_lawyer_next_page,
                                 errback=self.handle_error)

    def parse_lawyer_next_page(self, response):
        """Parse page 1 of a city listing, then request the remaining pages."""
        last_page_url = ''.join(
            response.xpath(
                "//div[@class='common-pagination']/a[last()]/@href").extract())
        # BUG FIX: the generator returned by parse_lawyer_list() was being
        # discarded, so page 1 of every city was never scraped; its
        # requests must be re-yielded here.
        for req in self.parse_lawyer_list(response):
            yield req
        if last_page_url != '':
            pagecount = int(
                re.match(r'.*/p_(?P<page>\d+)/', last_page_url).group('page'))
            # BUG FIX: range(2, pagecount) dropped the final page.
            for page in range(2, int(pagecount) + 1):
                list_url = response.url + '/p_' + str(page)
                yield scrapy.Request(url=list_url,
                                     meta={
                                         'province':
                                         response.meta['province'],
                                         'city': response.meta['city']
                                     },
                                     callback=self.parse_lawyer_list,
                                     errback=self.handle_error)

    def parse_lawyer_list(self, response):
        """Follow every profile link on a listing page."""
        for item in response.css(".sr-list li"):
            detail_url = item.css(".lawyer_name::attr(href)").extract_first()
            yield scrapy.Request(url=detail_url,
                                 meta={
                                     'province': response.meta['province'],
                                     'city': response.meta['city']
                                 },
                                 callback=self.parse_lawyer_item,
                                 errback=self.handle_error)

    def parse_lawyer_item(self, response):
        """Extract one lawyer profile page into an item dict.

        Returns the item only for well-formed (17-char) licence numbers,
        with a phone number, that are not yet stored; otherwise None.
        """
        item = {}
        zhiye = response.xpath(
            "//dl[@class='information_practice information_practice_new']")
        item["UILawNumber"] = ''.join(
            zhiye.xpath(u'dd/span[contains(text(),"律师证编号:")]/text()').extract(
            )).replace(u'执业律师 (律师证编号:', '').replace(u')', '').replace(' ', '')
        uiphone = ''.join(
            response.css('.right_consult_phone a::text').extract())
        # BUG FIX: the old class [3,4,5,6,7,8] also matched a literal comma.
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item["UIPhone"] = None if match_count == 0 else uiphone
        # Skip records whose licence number is already in the database.
        if item["UILawNumber"] is not None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
                (item["UILawNumber"], )) is None and item[
                    "UIPhone"] is not None:
            item["UIName"] = ''.join(
                response.xpath('//h1[@class="lvshi_info_name"]/text()').
                extract()).replace(' ', '').replace(u"律师", '')
            item["LawOrg"] = response.xpath(
                '//p[@class="lvshi_info_add"]/text()').extract_first()
            item["Address"] = ''.join(
                response.css('.information_practice_dd::text').extract(
                )).replace(' ', '')
            item["UIEmail"] = None
            desc = ''.join(
                response.xpath("//p[@class='information_info']/span/text()").
                extract()).replace(u"\xa0", '')
            # Strip residual markup/attributes from the biography text.
            desc = re.sub(
                r'(<a.*?>.*?</a>)|((class|style|color|href)="[^"]*?")|(<.*?>)|(<[/].*?>)',
                '', desc).replace("\r", '').replace("\n", '').replace(' ', '')
            item["UISignature"] = None if desc == '' else desc.replace(
                u"\xa0", '').replace("\t", '').replace("\n", '').replace(
                    ' ', '').replace(u'&', '').replace('...', '')
            item["ProvinceCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['province'])))
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['city'])))
            item["UIID"] = str(uuid.uuid1()).replace('-', '')
            # Portrait download path (src is protocol-relative).
            item["UIPic"] = ''.join(
                http_util.downloadImage([
                    "http:" + ''.join(
                        response.css(
                            '.lvshi_info_pic a img::attr(src)').extract())
                ], '/AppFile/' + item["UIID"] + '/head'))
            item["url"] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        """Log a failed request (works as a scrapy errback)."""
        print("error url is :%s" % result.request.url)
        self.logger.error("error url is :%s" % result.request.url)
class JiangXiLawyerSpider(scrapy.Spider):
    """Scrapes lawyer profiles from the Jiangxi legal-service API (JSON)."""

    name = "jiangxi_law_spider"
    start_urls = ["http://lawnew.jxsf.gov.cn/views/lawyerInfo/findLawyer.jsp"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '38'
    baseurl = "http://lawnew.jxsf.gov.cn/flfw-jx/portEmpLs/queryLSList?{0}"

    def parse(self, response):
        """Request page 1 of the roster for every (hard-coded) Jiangxi city."""
        # The portal has no machine-readable city list, so it is inlined.
        province_city_dic = [
            {'citycode': '3601', 'cityname': u'南昌市'},
            {'citycode': '3602', 'cityname': u'景德镇市'},
            {'citycode': '3603', 'cityname': u'萍乡市'},
            {'citycode': '3604', 'cityname': u'九江市'},
            {'citycode': '3605', 'cityname': u'新余市'},
            {'citycode': '3606', 'cityname': u'鹰潭市'},
            {'citycode': '3607', 'cityname': u'赣州市'},
            {'citycode': '3608', 'cityname': u'吉安市'},
            {'citycode': '3609', 'cityname': u'宜春市'},
            {'citycode': '3610', 'cityname': u'抚州市'},
            {'citycode': '3611', 'cityname': u'上饶市'},
        ]
        for item in province_city_dic:
            news_url = self.baseurl.format("pageSize=" + str(self.pagesize) +
                                           "&pageNum=1&city=" +
                                           item['citycode'] +
                                           "&selInfo=&ywzc=&_=" +
                                           str(int(time.time())))
            yield scrapy.Request(
                url=news_url,
                method="GET",
                dont_filter=True,
                callback=self.parseAjaxPageList,
                errback=self.handle_error,
                meta={'cityname': item['cityname']},
            )

    def parseAjaxPageList(self, response):
        """Read the page count from the JSON reply and request every page."""
        data = json.loads(response.body_as_unicode())
        pagecount = int(data['content']['pages'])
        # BUG FIX: range(1, pagecount) dropped the final page.
        for page in range(1, pagecount + 1):
            # Rewrite pageNum and refresh the cache-buster in the URL.
            page_url = re.sub(r'&pageNum=\d+', '&pageNum=' + str(page),
                              response.url)
            detail_url = re.sub(r'&_=.*', '&_=' + str(int(time.time())),
                                page_url)
            yield scrapy.Request(
                url=detail_url,
                method="GET",
                dont_filter=True,
                callback=self.parseAjaxList,
                errback=self.handle_error,
                meta={'cityname': response.meta['cityname']},
            )

    def parseAjaxList(self, response):
        """Convert each JSON record on a result page into an item."""
        data = json.loads(response.body_as_unicode())['content']['list']
        for item in data:
            yield self.parse_detail(item, response.meta['cityname'])

    # Detail record (JSON dict, not a Response)
    def parse_detail(self, data, cityname):
        """Map one JSON lawyer record into an item dict.

        Returns the item only for well-formed (17-char) licence numbers
        that are not yet stored; otherwise returns None.
        """
        item = {}
        # [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],[UILawNumber],
        # [LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        # dict.get replaces the Py2-only has_key() checks.
        uiphone = data.get('sjhm', '')
        # BUG FIX: the old class [3,4,5,6,7,8] also matched a literal comma.
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = data.get('zyzh')
        if item["UILawNumber"] is not None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
                (item["UILawNumber"], )) is None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = data['lsxm']
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = data.get('swsmc')
            item['UIEmail'] = data.get('dzyx')
            item["UISignature"] = data.get('grjj')
            item["Address"] = None if 'deptAdress' not in data else data[
                'deptAdress'].replace("\r", '').replace("\n", '').replace(
                    ' ', '')
            # Source encodes gender as text; we store 0 = male, 1 = female.
            item['UISex'] = None if 'xb' not in data else (
                0 if data['xb'] == u'男' else 1)
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code((cityname)))
            fiil_str = data.get('ywzcmc')
            if fiil_str is not None:
                item['fiil_str'] = field_info_dic.find_field_by_name(
                    fiil_str.split(u","))
            # Portrait download path.
            dirname = "jiangxi"
            item["UIPic"] = ''.join(
                http_util.downloadImage([
                    'http://lawnew.jxsf.gov.cn/flfw-jx/views/picture/' +
                    data['lszp']
                ], '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
            return item

    def handle_error(self, result, *args, **kw):
        """Log a failed request (works as a scrapy errback)."""
        print("error url is :%s" % result.request.url)
        self.logger.error("error url is :%s" % result.request.url)
class LiaoNingLawyerSpider(scrapy.Spider):
    """Scrapes lawyer profiles from the Liaoning lawyers office portal."""

    name = "liaoning_law_spider"
    start_urls = ["http://218.60.145.124:8080/lnlxoa/govhall/lawyerResult.jsp"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()

    def parse(self, response):
        """Request every listing page.

        The page count (1145) was observed manually at crawl time.
        """
        baseurl = "http://218.60.145.124:8080/lnlxoa/govhall/lawyerResultOne.jsp?pn={0}"
        for i in range(1, 1145):
            yield scrapy.Request(
                url=baseurl.format(str(i)),
                method="GET",
                callback=self.parse_list,
                meta={"dont_redirect": True},
                errback=self.handle_error,
            )

    def parse_list(self, response):
        """Follow every profile link on a listing page."""
        url = 'http://218.60.145.124:8080//lnlxoa/govhall/{0}'
        for i in response.css('.zi11 a::attr(href)'):
            yield scrapy.Request(
                url=url.format(i.extract()),
                method="GET",
                callback=self.parse_detail,
                meta={"dont_redirect": True},
                errback=self.handle_error,
            )

    # Detail page
    def parse_detail(self, response):
        """Extract one lawyer profile page into an item dict.

        Fields are "<label>:<value>" table cells; the value is taken after
        the (mostly full-width) colon.  Returns the item only for
        well-formed (17-char) licence numbers that are not yet stored.
        """
        item = {}
        # [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],[UILawNumber],
        # [LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        table = response.xpath('//div[@class="zi35"]/table')
        uiphone = "".join(
            table.xpath('tr[7]/td/text()').re(r'[^\s]')).split(u'：')[1]
        # BUG FIX: the old class [3,4,5,6,7,8] also matched a literal comma.
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = "".join(
            table.xpath('tr[11]/td[1]/text()').re(r'[^\s]')).split(u'：')[1]
        if item["UILawNumber"] is not None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
                (item["UILawNumber"], )) is None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = "".join(
                table.xpath('tr[1]/td[1]/text()').re(r'[^\s]')).split(
                    u'：')[1]
            item["ProvinceCode"] = ''.join(
                self.areaData.find_area_by_name_return_code((u'辽宁')))
            item['LawOrg'] = "".join(
                table.xpath('tr[2]/td/text()').re(r'[^\s]')).split(u'：')[1]
            # NOTE(review): the email row uses an ASCII colon on the site,
            # unlike the other rows -- kept as-is.
            item['UIEmail'] = "".join(
                table.xpath('tr[14]/td/text()').re(r'[^\s]')).split(':')[1]
            item["UISignature"] = None
            item['FIID'] = None
            item["Address"] = None
            item["CityCode"] = None
            # Portrait download path.
            dirname = 'liaoning'
            item["UIPic"] = ''.join(
                http_util.downloadImage([
                    "http://218.60.145.124:8080/lnlxoa/govhall" +
                    "".join(table.xpath('tr[1]/td[2]/img/@src').re(r'[^\s]'))
                ], '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        """Log a failed request (works as a scrapy errback)."""
        print("error url is :%s" % result.request.url)
        self.logger.error("error url is :%s" % result.request.url)
class ShanDongLawyerSpider(scrapy.Spider):
    """Scrapes lawyer profiles from the Shandong 12348 search API (JSON)."""

    name = "shandong_law_spider"
    start_urls = ["http://www.sd12348.gov.cn/channels/ch00630/"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '37'  # administrative code for Shandong
    baseurl = "http://www.sd12348.gov.cn/sftIDC/select/search.do"

    def parse(self, response):
        """Request page 1 for every city link (the first link is "all")."""
        for item in response.css("#cityDiv ul li a::attr(href)").extract()[1:]:
            # href looks like javascript:changeCitya('<code>','<name>');
            prostr = item.split(u',')
            citycode = prostr[0].replace(u'javascript:changeCitya(',
                                         '').replace(u"'", '')
            cityname = prostr[1].replace(u"'", '').replace(u");", '')
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxPageList,
                errback=self.handle_error,
                meta={
                    'pageSize': str(self.pagesize),
                    'areacode': citycode,
                    'cityname': cityname,
                    'type': 'lawyer',
                    'flag': '0',
                    'status': '0'
                },
                formdata={
                    "page": '1',
                    'pageSize': str(self.pagesize),
                    'areacode': citycode,
                    'type': 'lawyer',
                    'flag': '0',
                    'status': '0'
                })

    def parseAjaxPageList(self, response):
        """Derive the page count from the JSON total and walk every page."""
        data = json.loads(response.body_as_unicode())
        # NOTE(review): this formula was inherited as-is and looks like an
        # ad-hoc ceil(); confirm against the endpoint's real paging maths.
        # `//` keeps integer division working on both Python 2 and 3.
        pagecount = (int(data['totalCount']) - 1) // (self.pagesize + 1)
        # BUG FIX: range(1, pagecount) dropped the final page.
        for page in range(1, pagecount + 1):
            response.meta['page'] = str(page)
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxList,
                errback=self.handle_error,
                meta=response.meta,
                formdata={
                    "page": str(page),
                    'pageSize': str(self.pagesize),
                    'areacode': response.meta['areacode'],
                    'type': 'lawyer',
                    'flag': '0',
                    'status': '0'
                })

    def parseAjaxList(self, response):
        """Request the JSON detail record of every lawyer in the reply."""
        data = json.loads(response.body_as_unicode())
        detail_url = 'http://sd.12348.gov.cn/sftIDC/lawworkmanage/findPersonnelListByid.do?type=lawyer&id={0}'
        for i in data['list']:
            yield scrapy.FormRequest(
                url=detail_url.format(i['id']),
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parse_detail,
                errback=self.handle_error,
                meta={'cityname': response.meta['cityname']},
            )

    # Detail page (JSON body)
    def parse_detail(self, response):
        """Map one JSON detail reply into an item dict.

        Returns the item only for well-formed (17-char) licence numbers
        that are not yet stored; otherwise returns None.
        """
        item = {}
        # [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],[UILawNumber],
        # [LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        data = json.loads(response.body_as_unicode())
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = data['telnum']
        # BUG FIX: the old class [3,4,5,6,7,8] also matched a literal comma.
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = data['licenseno']
        if item["UILawNumber"] is not None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
                (item["UILawNumber"], )) is None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = data['name']
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = data['lawfirmname']
            item['UIEmail'] = None
            item["UISignature"] = data['lawyerinfo']
            item['fiil_str'] = field_info_dic.find_field_by_name(
                data['zhuangchang'])
            item["Address"] = data['lawfirmaddress']
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['cityname'])))
            # Portrait download path.
            dirname = self.name
            item["UIPic"] = ''.join(
                http_util.downloadImage(
                    ["http://sd.12348.gov.cn" + data['logourl']],
                    '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        """Log a failed request (works as a scrapy errback)."""
        print("error url is :%s" % result.request.url)
        self.logger.error("error url is :%s" % result.request.url)
# NOTE(review): stray __init__ at module scope -- it takes `self` but is
# not inside any class, so it appears to be a leftover fragment of a
# pipeline class (cf. LawyerWriterPipeline).  Confirm it is dead code
# before removing.
def __init__(self):
    self.userInfoInfoData = UserInfoInfoData()
class ChinaLineLawyerSpider(scrapy.Spider):
    """Scrapes lawyer profiles from fl900.com."""

    name = "chinaline_law_spider"
    start_urls = ["http://www.fl900.com"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    baseurl = "http://www.fl900.com/lawyer/0-0-{0}.html"

    def parse(self, response):
        """Request every listing page.

        The page count (1551) was observed manually at crawl time.
        """
        for p in range(1, 1551):
            yield scrapy.FormRequest(
                url=self.baseurl.format(str(p)),
                method="GET",
                dont_filter=True,
                callback=self.parseList,
                errback=self.handle_error,
            )

    def parseList(self, response):
        """Follow every profile link, carrying the lawyer's name in meta."""
        for i in response.css(".lawyerlist li a:nth-child(1)"):
            detail_url = 'http://www.fl900.com' + i.xpath(
                "@href").extract_first()
            # The name is only available as the portrait's alt text.
            uname = i.xpath("img/@alt").extract_first()
            yield scrapy.FormRequest(url=detail_url,
                                     method="GET",
                                     dont_filter=True,
                                     callback=self.parse_detail,
                                     errback=self.handle_error,
                                     meta={'uname': uname})

    # Detail page
    def parse_detail(self, response):
        """Extract one lawyer profile page into an item dict.

        Returns the item only for well-formed (17-char) licence numbers
        that are not yet stored; otherwise returns None.
        """
        item = {}
        # [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],[UILawNumber],
        # [LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = ''.join(
            response.xpath('/html/body/div[3]/div[2]/div[2]/li[3]/text()').
            extract()).replace(u'手机：', '').replace(u"\xa0", '')
        # BUG FIX: the old class [3,4,5,6,7,8] also matched a literal comma.
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = ''.join(
            response.xpath(
                '/html/body/div[3]/div[2]/div[1]/ul[1]/li[2]/p[2]/label[1]/text()'
            ).extract()).replace(u'执业证号：', '').replace(u"\xa0", '')
        if item["UILawNumber"] is not None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
                (item["UILawNumber"], )) is None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = response.meta['uname']
            item['LawOrg'] = ''.join(
                response.xpath(
                    '/html/body/div[3]/div[2]/div[1]/ul[1]/li[2]/p[2]/label[2]/text()'
                ).extract()).replace(u'执业机构：', '').replace(u"\xa0", '')
            # NOTE(review): this email xpath looks copy-pasted from the
            # Sichuan spider's page layout -- verify it matches fl900.com.
            item['UIEmail'] = ''.join(
                response.xpath(
                    '/html/body/div[3]/table/tbody/tr[6]/td[4]/text()').
                extract()).replace('\t', '').replace('\n', '')
            item["UISignature"] = ''.join(
                response.xpath(
                    '/html/body/div[3]/div[2]/div[1]/ul[1]/li[2]/p[1]/text()').
                extract()).replace(u"\xa0", '')
            item["Address"] = ''.join(
                response.xpath(
                    '/html/body/div[3]/div[2]/div[1]/ul[1]/li[2]/p[2]/label[3]/text()'
                ).extract()).replace(u'联系地址：', '').replace(u"\xa0", '')
            # "地区：<province> <city>" -- split on the blank.
            pro_city_str = ''.join(
                response.xpath('/html/body/div[3]/div[2]/div[2]/li[1]/text()').
                extract()).replace(u'地区：', '').split(' ')
            item["ProvinceCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (pro_city_str[0])))
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (pro_city_str[1])))
            # No portrait download on this site; use the stock head image.
            item["UIPic"] = '/APPFile/head.jpg'
            # extract() always returns a list, so this branch always runs;
            # the check is kept for parity with the other spiders.
            fiil_str = response.css('.goodat span::text').extract()
            if fiil_str is not None:
                item['fiil_str'] = field_info_dic.find_field_by_name(fiil_str)
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        """Log a failed request (works as a scrapy errback)."""
        print("error url is :%s" % result.request.url)
        self.logger.error("error url is :%s" % result.request.url)
class FaWeiShiLawyerSpider(scrapy.Spider):
    """Crawl lawyer profiles from m.faweishi.com.

    Flow: province index page -> AJAX-paged province listing
    (POST ajax.php with action=get_law) -> per-lawyer detail page.
    Emits item dicts in the shape LawyerWriterPipeline expects.
    """
    name = "faweishi_law_spider"
    start_urls = ["http://m.faweishi.com/lawyer/china/"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    # AJAX endpoint used for listing pagination.
    baseurl = "http://m.faweishi.com/ajax.php"

    def parse(self, response):
        # Second "fenl" block holds the per-province links; the link text
        # is the province name, carried forward in meta for area lookup.
        for item in response.xpath("//div[@class='fenl'][2]/ul/li/a"):
            yield scrapy.Request(url="http://m.faweishi.com" +
                                 item.xpath('@href').extract_first(),
                                 method="GET",
                                 dont_filter=True,
                                 callback=self.parse_province,
                                 errback=self.handle_error,
                                 meta={
                                     'province':
                                     item.xpath('text()').extract_first(),
                                     'start': str(1)
                                 })

    def parse_province(self, response):
        # Returns detail-page requests for the current page plus one
        # follow-up request for the next page; pagination stops when the
        # listing no longer carries a "w" (where-clause) attribute.
        requests_arr = []
        where = response.css('.lawList::attr("w")').extract_first()
        if where != None:
            requests_arr.extend(self.parse_province_list(response))
            # meta['start'] is a stringified page counter, bumped per request.
            start = str(int(response.meta['start']) + 1)
            requests_arr.append(
                scrapy.FormRequest(url=self.baseurl,
                                   method="POST",
                                   headers={
                                       'X-Requested-With': 'XMLHttpRequest',
                                       'Content-Type':
                                       'application/x-www-form-urlencoded'
                                   },
                                   dont_filter=True,
                                   callback=self.parse_province,
                                   errback=self.handle_error,
                                   formdata={
                                       'action': 'get_law',
                                       'start': start,
                                       'where': where
                                   },
                                   meta={
                                       'province': response.meta['province'],
                                       'start': start
                                   }))
        return requests_arr

    def parse_province_list(self, response):
        # Listing hrefs are absolute; just follow each to the detail page.
        for item in response.css('.lawList li a::attr(href)').extract():
            yield scrapy.Request(url=item,
                                 method="GET",
                                 dont_filter=True,
                                 callback=self.parse_detail,
                                 errback=self.handle_error,
                                 meta={'province': response.meta['province']})

    # Detail page
    def parse_detail(self, response):
        item = {}
        # [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],[UILawNumber],[LawOrg],
        # [ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        # .re('[^\s+]') keeps single non-whitespace chars, i.e. it strips
        # all whitespace from the extracted text.
        uiphone = ''.join(
            response.xpath(
                '/html/body/div[1]/div[1]/div/div/div[2]/div[2]/text()').re(
                    '[^\s+]'))
        # NOTE(review): the char class [3,4,5,6,7,8] also contains ',' —
        # a literal comma passes as the second digit of a "phone number".
        match_count = len(re.findall(r'[1][3,4,5,6,7,8][0-9]{9}', uiphone))
        item['UILawNumber'] = ''.join(
            response.xpath(
                "/html/body/div[1]/div[1]/div/div/div[2]/div[3]/text()").re(
                    '[^\s+]'))
        # ''.join never yields None; effective gate is the 17-char licence
        # number plus a DB uniqueness check.
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
                (item["UILawNumber"], )) == None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = ''.join(
                response.xpath(
                    "/html/body/div[1]/div[1]/div/div/div[2]/div[1]/div[1]/text()"
                ).re('[^\s+]')).replace(u'律师', '')
            item['LawOrg'] = ''.join(
                response.xpath(
                    '/html/body/div[1]/div[1]/div/div/div[2]/div[4]/text()').
                re('[^\s+]'))
            # Site exposes no email address.
            item['UIEmail'] = None
            item["UISignature"] = ''.join(
                response.css('#about::text').re('[^\s+]')).replace("\t", '')
            item["Address"] = ''.join(
                response.xpath(
                    '/html/body/div[1]/div[1]/div/div/div[2]/div[5]/text()').
                re('[^\s+]'))
            item["ProvinceCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['province'])))
            # Listing is province-level only; no city granularity available.
            item["CityCode"] = None
            # Practice fields are space-separated after control chars are
            # stripped; mapped to field ids via the shared dictionary.
            fiil_str = ''.join(
                response.xpath(
                    '/html/body/div[1]/div[1]/div/div/div[3]/span/text()').
                extract()).replace('\r', '').replace('\t',
                                                     '').replace('\n', '')
            item['fiil_str'] = field_info_dic.find_field_by_name(
                fiil_str.split(" "))
            # Avatar storage subdirectory for this source site.
            dirname = 'fws'
            head_url = ''.join(
                response.css('.lshil3-1-1 img::attr(src)').extract())
            item["UIPic"] = ''.join(
                http_util.downloadImage([head_url], '/AppFile/' + dirname +
                                        "/" + item["UIID"] + '/head'))
            # Fall back to the default avatar when download fails.
            if item["UIPic"] == '' or item["UIPic"] == None:
                item["UIPic"] = '/APPFile/head.jpg'
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
class FabangLawyerSpider(scrapy.Spider): name = "fabang_lawyer" start_urls = ["http://lawyer.fabang.com"] areaData = AreaData() userInfoInfoData = UserInfoInfoData() def parse(self, response): start_url = "http://lawyer.fabang.com/list/0-0-0-key-1-{0}.html" for page in range(1, 1075): yield scrapy.Request(url=start_url.format(str(page)), method="get", callback=self.parse_lawyer_list, errback=self.handle_error) def parse_lawyer_list(self, response): for detail_html in response.css(".lawyerlist"): detail_url = detail_html.css(".uname::attr(href)").extract_first() yield scrapy.Request(url=detail_url, callback=self.parse_lawyer_item, errback=self.handle_error) def parse_lawyer_item(self, response): item = {} item["UILawNumber"] = ''.join( response.xpath( u'//p[contains(text(),"执业证号:")]/text()').extract()).replace( ' ', '').replace(u'执业证号:', '') uiphone = ''.join( response.xpath( '//strong[@class="mobile"]/text()').extract()).replace( ' ', '') match_count = len(re.findall(r'[1][3,4,5,6,7,8][0-9]{9}', uiphone)) item["UIPhone"] = None if match_count == 0 else uiphone #如果数据库不存在执业证号 if item["UILawNumber"] != None and len( item["UILawNumber"] ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber( (item["UILawNumber"], )) == None and item["UIPhone"] != no: item["UIName"] = ''.join( response.xpath( '//strong[@class="lawyername"]/text()').extract()).replace( ' ', '').replace(u"律师", '') item["LawOrg"] = response.xpath( '//p[@class="jigou"][1]/a/text()').extract_first() item["Address"] = ''.join( response.xpath( u'//p[contains(text(),"地\xa0\xa0\xa0\xa0址:")]/text()'). extract()).replace(' ', '').replace(u'地\xa0\xa0\xa0\xa0址:', '') item["UIEmail"] = ''.join( response.xpath( u'//p[contains(text(),"邮\xa0\xa0\xa0\xa0箱:")]/text()'). extract()).replace(' ', '').replace(u'邮\xa0\xa0\xa0\xa0箱:', '') fiil_str = ''.join( response.xpath(u'//p[contains(text(),"专长领域:")]/text()'). 
extract()).replace(' ', '').replace(u'专长领域:', '') desc = ''.join( response.xpath( "//div[@class='content'][last()]/*").extract()).replace( u"\xa0", '') desc = re.sub( r'(<a.*?>.*?</a>)|((class|style|color|href)="[^"]*?")|(<.*?>)|(<[/].*?>)', '', desc).replace("\r", '').replace("\n", '').replace(' ', '') s_start_index = 0 if desc.index(u'分享到:') == -1 else desc.index( u'分享到:') item["UISignature"] = None if desc == '' else desc[ s_start_index:].replace(u'分享到:', '').replace( u"\xa0", '').replace("\t", '').replace("\n", '').replace( ' ', '').replace(u'&', '').replace('...', '') province_city = response.xpath( '//div[@class="info_nm SG_txtc "]/text()').extract_first( ).replace("\r", '').replace("\n", '').split(" ") item["ProvinceCode"] = ''.join( self.areaData.find_area_by_name_return_code( (province_city[0]))) item["CityCode"] = ''.join( self.areaData.find_area_by_name_return_code( (province_city[1]))) item['fiil_str'] = field_info_dic.find_field_by_name( fiil_str.split(u"\xa0")) item["UIID"] = str(uuid.uuid1()).replace('-', '') item["UIPic"] = ''.join( http_util.downloadImage([ "http://lawyer.fabang.com" + ''.join( response.css( '.info_img_area img::attr(src)').extract()) ], '/AppFile/' + item["UIID"] + '/head')) item["url"] = response.url return item def handle_error(self, result, *args, **kw): print "error url is :%s" % result.request.url self.logger.error("error url is :%s" % result.request.url)