def get_list_info(self, url_page, html_type, auction_type):
    """Crawl one listing page and store every lot that is not saved yet.

    Each ``div.sflistdiv`` node of the page is treated as one lot. A lot whose
    id is already present in ``coll`` for this ``source`` is skipped; every
    other lot is filled in from the list markup and written with
    ``insert_db()``.

    Args:
        url_page: URL of the listing page to download.
        html_type: Value copied to the record's ``html_type``.
        auction_type: Passed through to the ``Auction`` constructor.

    Raises:
        IndexError, AttributeError: When a mandatory node or label (id link,
            region, address, area, floor, price) is missing from a lot. Only
            the title and the auction time are optional. Errors raised by
            the HTTP request are not caught either.
    """
    # A number with optional thousands separators and decimals. Like the
    # pattern it replaces, it needs at least two digits, so a match starts at
    # the same position as before; the difference is that the WHOLE number is
    # taken instead of a truncated first group ("350" used to become "35").
    amount_re = re.compile(r'\d+(?:,?\d+)+(?:\.\d+)?')
    price_query = 'div[@class="f34hong"]/text()'

    response = requests.get(url_page, headers=self.headers)
    html = response.text
    tree = etree.HTML(html)
    for node in tree.xpath('//div[@class="sflistdiv"]'):
        auction = Auction(source, auction_type)
        auction.province = '上海'
        auction.city = '上海'
        auction.html_type = html_type
        auction.source_html = html

        # The id is the last path segment of the lot's detail link.
        auction_id = node.xpath(
            'div[@class="sflistdivn2"]/div[@class="f20hei"]/a/@href'
        )[0].split('/')[-1]
        already_stored = coll.find_one({
            'auction_id': str(auction_id),
            'source': source,
        })
        if already_stored:
            log.info('id已存在,id="{}"'.format(str(auction_id)))
            continue
        auction.auction_id = auction_id

        # The title is optional; fall back to an empty string.
        titles = node.xpath(
            'div[@class="sflistdivn2"]/div[@class="f20hei"]/a/text()')
        auction_name_ = titles[0] if titles else ''

        region = node.xpath(
            'div[@class="sflistdivn2"]/div[@class="sflistban"]/text()')[0]
        auction.region = re.search(' - (.*?)$', region, re.S | re.M).group(1)

        auction_time_ = node.xpath(
            'div[@class="sflistdivn2"]/div[@class="sflisttime"]/text()')[0]
        # encode()/decode() turns lxml's string subclass into a plain str.
        address = node.xpath(
            'div[@class="sflistdivn2"]/div[@class="sflistcan"]/text()'
        )[3].encode().decode()
        auction.auction_name = auction_name_ + address

        try:
            auction_time = re.search('拍卖时间:(.*?)$', auction_time_,
                                     re.S | re.M).group(1)
            auction.auction_time = datetime.datetime.strptime(
                auction_time, "%y.%m.%d")
        except (AttributeError, ValueError):
            # Label missing or date not in the expected format.
            auction.auction_time = None

        # The collected text used to be built and then thrown away; store it
        # on the record like the other crawlers do.
        auction.info = [node.xpath('string(div[@class="sflistdivn2"])')]

        area_ = node.xpath(
            'div[@class="sflistdivn2"]/div[@class="sflistcan"]/span[1]/text()'
        )[0]
        auction.area = re.search('面积:(.*?)$', area_, re.S | re.M).group(1)

        floor = node.xpath(
            'div[@class="sflistdivn2"]/div[@class="sflistcan"]/span[3]/text()'
        )[0]
        auction.floor = re.search('楼层:(.*?)$', floor, re.S | re.M).group(1)

        # An XPath starting with "//" is evaluated from the document root even
        # when called on a child element, so every lot used to receive the
        # first price of the page. Search below the current lot first and keep
        # the old document-wide lookup only as a fallback.
        prices = node.xpath('.//' + price_query) or node.xpath('//' + price_query)
        amount = amount_re.search(prices[0]).group(0).replace(',', '')
        # Multiplied by 10000 as before (presumably the page quotes the price
        # in units of ten thousand - confirm against the site).
        auction.start_auction_price = float(amount) * 10000

        auction.insert_db()
def parse(self, html):
    """Store every property lot linked from a parsed listing page.

    Args:
        html: Parsed listing document; every ``//dl/dd/a/@href`` value is
            treated as the relative URL of one lot.

    For each link the lot id is taken from the URL, lots already known to
    ``check_auction`` are skipped, and the remaining ones are downloaded,
    parsed with regular expressions and written with ``insert_db()``. Any
    failure for a single lot is logged and does not stop the loop.

    ``source`` and ``auction_type`` are read from the enclosing scope; they
    are not parameters of this method.
    """
    for auction_url in html.xpath("//dl/dd/a/@href"):
        try:
            url = 'http://www.shjiapai.cn' + auction_url
            # Raw string: "\d" in a normal literal is an invalid escape
            # sequence; the dot is escaped so it only matches a literal ".".
            auction_id = re.search(r'id/(\d+)\.html', auction_url).group(1)

            # Check for duplicates BEFORE downloading the detail page so that
            # known lots do not cost a request.
            if check_auction(source=source, auction_id=auction_id):
                log.info("数据已存在")
                continue

            auction_res = requests.get(url, headers=self.headers)
            con = auction_res.text

            def cell(pattern):
                # First capture group of *pattern* in the page source.
                return re.search(pattern, con, re.S | re.M).group(1)

            auction = Auction(source=source, auction_type=auction_type)
            auction.source_html = con
            auction.auction_id = auction_id
            auction.auction_name = cell('楼盘名称.*?">(.*?)</td')
            auction.city = '上海'
            auction.html_type = '房产'
            auction.start_auction_price = cell('预计售价.*?">(.*?)</td')
            auction.floor = cell('层.*?">(.*?)楼</td')
            auction.area = cell('户型面积.*?">(.*?)</td')
            auction.build_type = cell('物业类型.*?">(.*?)</td')
            auction.info = cell('其它.*?>(.*?)</div')
            auction.insert_db()
        except Exception:
            # Best effort: one broken lot must not abort the whole page.
            log.error("{}解析失败".format(auction_url))
def crawler_detail_page(self, auction_id, province_name, city_name, type_name, auction_type):
    """Download one lot's detail page and store it as an ``Auction`` record.

    Args:
        auction_id: Lot id; appended to the detail URL and stored on the record.
        province_name: Stored as ``province``.
        city_name: Stored as ``city``.
        type_name: Stored as ``html_type``.
        auction_type: Passed to the ``Auction`` constructor.

    Nothing is caught here: a missing node raises ``IndexError``, a missing
    label ``AttributeError`` and an unparsable date or price ``ValueError``.
    """
    # All absolute paths used below share this prefix.
    panel = '/html/body/div/div[6]/div/div[2]/div[1]'
    detail_url = 'http://www.chinesesfpm.com/index/index/info/biao_id/' + auction_id
    response = requests.get(detail_url)
    dom = etree.HTML(response.text)

    def first_text(path):
        # First result of the XPath query; IndexError when nothing matches.
        return dom.xpath(path)[0]

    record = Auction(source=source, auction_type=auction_type)
    record.auction_id = auction_id
    record.auction_name = first_text(panel + '/div[1]/text()')
    record.html_type = type_name

    # Start time: text after the label, in a Chinese date format.
    time_label = first_text(panel + '/div[2]/div[2]/div[2]/text()')
    time_text = re.search('开始时间: (.*?)$', time_label, re.S | re.M).group(1)
    record.auction_time = datetime.datetime.strptime(
        time_text, "%Y年%m月%d日 %H时%M分%S秒")

    record.province = province_name
    record.city = city_name
    record.info = [
        dom.xpath('string(//*[@id="f4"])'),
        dom.xpath('string(//*[@id="f6"])'),
    ]

    # Starting price; the encode/decode round trip yields a plain str.
    price_label = first_text(panel + '/div[2]/div[2]/div[5]/div[1]/em[3]/text()')
    price_label = price_label.encode('utf-8').decode()
    price_text = re.search('起拍价: ¥(.*)', price_label, re.S | re.M).group(1)
    record.start_auction_price = float(price_text)

    court_label = first_text(panel + '/div[2]/div[2]/div[5]/div[2]/em[1]/text()')
    record.court = re.search('拍卖机构:(.*)', court_label, re.S | re.M).group(1)

    record.source_html = response.text
    record.insert_db()
def detail_parse(auction_res, auction_type, html_type, auction_id):
    """Build an ``Auction`` record from a JSON detail response and store it.

    Args:
        auction_res: Response object whose ``json()`` returns the lot data.
        auction_type: Passed to the ``Auction`` constructor.
        html_type: Stored as ``html_type``; ``'房产'`` and ``'土地'`` also
            select which keys of ``detail`` are read.
        auction_id: Stored as ``auction_id``.

    Raises:
        KeyError: If an expected key is missing from the JSON data.
        ValueError: If ``start_time`` is not ``%Y-%m-%d %H:%M:%S`` or if
            ``location`` does not split into exactly three parts on single
            spaces.
    """
    payload = auction_res.json()
    record = Auction(source=source, auction_type=auction_type)
    record.source_html = payload  # the decoded JSON object, not raw text
    record.html_type = html_type
    record.auction_id = auction_id

    # Fields copied verbatim from the JSON payload.
    copied_fields = (
        ('auction_name', 'object_title'),
        ('start_auction_price', 'start_price'),
        ('assess_value', 'appraise_price'),
        ('earnest_money', 'bond_price'),
        ('court', 'court_name'),
    )
    for attribute, key in copied_fields:
        setattr(record, attribute, payload[key])

    starts_at = payload['start_time']
    place = payload['location']
    record.auction_time = datetime.datetime.strptime(
        starts_at, "%Y-%m-%d %H:%M:%S")
    # Exactly three space separated parts are required here.
    record.province, record.city, record.region = place.split(' ')

    if html_type == '房产':
        detail = payload['detail']
        record.floor = detail['house_floor']
        record.area = detail['gross_floor_area']
    elif html_type == '土地':
        record.area = payload['detail']['l_land_area']

    record.insert_db()
def get_detail_info(self, detail_url, region_name, city_name, province_name, id_, html_type, auction_type):
    """Fetch one lot's detail page, parse it and store the record.

    Args:
        detail_url: URL of the detail page.
        region_name: Stored as ``region``.
        city_name: Stored as ``city``.
        province_name: Stored as ``province``.
        id_: Stored as ``auction_id``.
        html_type: Stored as ``html_type``.
        auction_type: Passed to the ``Auction`` constructor.

    Lots whose status banner contains one of the skip keywords are dropped
    without being stored. Every exception raised while fetching or parsing
    is logged together with the URL and swallowed.
    """
    record = Auction(source, auction_type)
    # Banner keywords for which the lot is not stored at all.
    skipped_states = ('撤回', '以物抵债', '中止', '暂缓', '撤拍', '待确认')
    price_cell = '//*[@id="J_HoverShow"]/tr[{}]/td[1]/span[2]/span/text()'
    try:
        page = s.get(detail_url, headers=self.headers)
        html = page.text
        tree = etree.HTML(html)

        def first(path):
            # First XPath result; IndexError when nothing matches.
            return tree.xpath(path)[0]

        record.region = region_name
        record.auction_id = id_
        record.city = city_name
        record.html_type = html_type
        record.source_html = html
        record.province = province_name
        record.auction_name = first(
            '//div[contains(@class,"pm-main clearfix")]/h1/text()').strip()

        start_text = first(price_cell.format(1)).replace(',', '').replace(' ', '')
        record.start_auction_price = float(start_text)
        deposit_text = first(price_cell.format(2)).replace(',', '').replace(' ', '')
        record.earnest_money = float(deposit_text)
        try:
            # The assessed value is optional.
            record.assess_value = float(first(price_cell.format(3)).replace(',', ''))
        except Exception:
            record.assess_value = None

        record.court = first('//p[@class="subscribe-unit"]/span/a/text()')
        record.contacts = first('//p[@class="subscribe-unit"]/span/em/text()')
        record.phone_number = tree.xpath(
            '//p[@class="subscribe-unit"][2]/span[2]/text()')[1]
        record.info = [
            tree.xpath('string(//*[@id="J_DetailTabMain"]/div[4])'),
            tree.xpath('string(//*[@id="J_DetailTabMain"]/div[5])'),
        ]

        banner = tree.xpath('//h1[@class="bid-fail"]/text()')
        state = banner[0] if banner else ''
        if any(word in state for word in skipped_states):
            return

        if '已结束' in state:
            # Finished lots show the time as a formatted string.
            finished_at = first('//span[@class="countdown J_TimeLeft"]/text()')
            record.auction_time = datetime.datetime.strptime(
                finished_at, "%Y/%m/%d %H:%M:%S")
        else:
            # Otherwise the start time is a timestamp; it is divided by 1000
            # before conversion.
            stamp = first('//li[@id="sf-countdown"]/@data-start')
            record.auction_time = datetime.datetime.fromtimestamp(int(stamp) / 1000)

        record.insert_db()
    except Exception as e:
        log.error('解析错误,url="{}",e="{}"'.format(detail_url, e))
def start_crawler(self):
    """Walk every category in ``type_list`` and hand new lots to ``get_detail``.

    For each category the page count comes from ``self.get_page``; every
    list page is requested as JSON and each entry of its ``ls`` array is
    turned into a partially filled ``Auction``. Entries whose id already
    exists in ``coll`` for this ``source`` are skipped.

    Request failures are logged per page. Parse failures are logged per item,
    so one malformed entry no longer discards the rest of its page.
    """
    for type_num in type_list:
        page_num = self.get_page(type_num.code)
        for page in range(1, int(page_num) + 1):
            url = ('http://auction.jd.com/getJudicatureList.html?page='
                   + str(page) + '&limit=40&childrenCateId=' + type_num.code)
            try:
                response = s.get(url, headers=self.headers)
                html = response.json()
            except Exception:
                log.error('请求错误,url="{}"'.format(url))
                continue

            try:
                items = list(html['ls'])
            except Exception:
                # Payload without a usable "ls" array.
                log.error('解析错误,url="{}"'.format(url))
                continue

            for info in items:
                try:
                    auction_id = str(info['id'])
                    # Look for duplicates first so known lots are not built.
                    if coll.find_one({'auction_id': auction_id, 'source': source}):
                        log.info('id已存在,id="{}"'.format(auction_id))
                        continue

                    auction = Auction(
                        source=source, auction_type=type_num.auction_type)
                    auction.html_type = type_num.html_type
                    auction.auction_name = info['title']  # lot title
                    auction.assess_value = info['assessmentPrice']  # assessed value
                    try:
                        province, city = info['province'], info['city']
                    except KeyError:
                        # If either is missing, both are left empty.
                        province = city = None
                    auction.province = province
                    auction.city = city
                    # Start time; the value is divided by 1000 before conversion.
                    auction.auction_time = datetime.datetime.fromtimestamp(
                        int(info['startTime']) / 1000)
                    # NOTE(review): "currentPrice" is stored as the deposit,
                    # as in the original code - confirm this mapping.
                    auction.earnest_money = info['currentPrice']
                    auction.auction_id = auction_id
                    self.get_detail(auction_id, auction)
                except Exception:
                    # Isolate the failure to this item only.
                    log.error('解析错误,url="{}"'.format(url))
def get_info(self, url):
    """Download one item page and store it as an ``Auction`` record.

    Args:
        url: Item URL; the numeric id after ``/items/`` becomes the
            record's ``auction_id``.

    Pages whose error dialog shows the site's generic failure message are
    ignored. A missing or unparsable deal time is tolerated and leaves
    ``auction_time`` unset; any other missing node raises.

    ``source`` and ``auction_type`` are read from the enclosing scope; they
    are not parameters of this method.
    """
    response = requests.get(url=url, headers=self.headers)
    html = etree.HTML(response.text)
    print(url)

    wrong_list = []
    try:
        wrong_list.append(html.xpath("//div[@class='dialog']/h1/text()")[0])
    except Exception as e:
        # Normal pages have no error dialog.
        print(e)
    if "We're sorry, but something went wrong." in wrong_list:
        return

    title = html.xpath("//div[@class='title']/text()")[0]
    start_price = html.xpath(
        "//table[@class='item-attrs']//tr[1]/td[2]/text()")[0]
    assess_price = html.xpath(
        "//table[@class='item-attrs']//tr[1]/td[4]/text()")[0]
    ensure_price = html.xpath(
        "//table[@class='item-attrs']//tr[1]/td[6]/text()")[0]
    # Raw string so that "\." and "\d" are regex escapes, not string escapes.
    auction_id = re.search(
        r"http://auction\.qdauction\.com/items/(\d+)", url).group(1)

    auction = Auction(source=source, auction_type=auction_type)
    auction.auction_name = title
    auction.start_auction_price = start_price
    auction.assess_value = assess_price
    auction.earnest_money = ensure_price
    auction.auction_id = auction_id
    try:
        deal_time = html.xpath("//tr[@class='deal']/td[4]/text()")[0]
        # Set the time on THIS record. The previous code assigned it to the
        # Auction class, so the record never received it and the value
        # leaked into every other instance.
        auction.auction_time = datetime.datetime.strptime(
            deal_time, "%Y-%m-%d %H:%M:%S")
    except Exception as e:
        print(e)
    auction.source_html = response.text
    auction.city = '青岛'
    auction.html_type = '其他'
    auction.insert_db()
def get_detail(self, id_, auction_time, html_type, auction_type, province, city, region):
    """Fetch a lot's detail page, parse one of three layouts and store it.

    Args:
        id_: Lot id; used in the detail URL and stored as ``auction_id``.
        auction_time: Raw auction time, converted with ``self.get_date``.
        html_type: Stored as ``html_type``.
        auction_type: Passed to ``Auction`` and stored as ``auction_type``.
        province: Stored as ``province``.
        city: Stored as ``city``.
        region: Stored as ``region``.

    The layout is chosen from the page source: pages containing
    ``GetRecord()`` or ``bmnumber()`` share one structure (with a different
    date format and description blocks), everything else uses the third
    structure. Parsing errors and request errors are logged separately and
    swallowed.
    """
    record = Auction(source=source, auction_type=auction_type)
    record.html_type = html_type
    record.auction_type = auction_type
    record.province = province
    record.city = city
    record.region = region
    detail_url = 'http://www1.rmfysszc.gov.cn/Handle/' + id_ + '.shtml'

    def after_colon(label):
        # Text following the first ": " up to the end of that line.
        return re.search(': (.*?)$', label, re.S | re.M).group(1)

    try:
        response = requests.get(detail_url, headers=self.headers)
        html = response.content.decode()
        record.source_html = html
        sections = []
        try:
            tree = etree.HTML(html)

            def first(path):
                # First XPath result; IndexError when nothing matches.
                return tree.xpath(path)[0]

            record_layout = 'GetRecord()' in html
            if record_layout or 'bmnumber()' in html:
                record.auction_name = first('//div[@id="Title"]/h1/text()')
                record.start_auction_price = self.get_float(
                    first('//*[@id="price"]/div[1]/span/text()'))

                assess_text = first(
                    '//*[@id="bg1"]/div[1]/table/tr[1]/td/span[2]/text()')
                if record_layout:
                    # Only this layout tolerates an unparsable assessed value.
                    try:
                        record.assess_value = self.get_float(assess_text)
                    except Exception:
                        record.assess_value = None
                else:
                    record.assess_value = self.get_float(assess_text)

                record.earnest_money = self.get_float(first(
                    '//*[@id="bg1"]/div[1]/table/tr[2]/td/span[2]/text()'))

                published = after_colon(first(
                    '//*[@id="bg1"]/div[1]/table/tr[3]/td/span/text()'))
                date_format = "%Y.%m.%d" if record_layout else "%Y-%m-%d"
                record.announcement_date = datetime.datetime.strptime(
                    published, date_format)

                record.auction_level = after_colon(first(
                    '//*[@id="bg1"]/div[1]/table/tr[4]/td/span/text()'))
                record.court = after_colon(first(
                    '//*[@id="bg1"]/div[2]/table/tr[1]/td/span/text()'))

                block_ids = ('bdjs11', 'jjjl') if record_layout else ('bdjs',)
                for block_id in block_ids:
                    sections.append(tree.xpath(
                        'string(//*[@id="{}"])'.format(block_id)
                    ).encode().decode())

                record.contacts = after_colon(first(
                    '//*[@id="bg1"]/div[2]/table/tr[2]/td/span/text()'))
                record.phone_number = after_colon(first(
                    '//*[@id="bg1"]/div[2]/table/tr[3]/td/span/text()'))
                # NOTE(review): the bmnumber layout takes its description
                # from #bdjs but its build type from #bdjs11, exactly as the
                # original did - confirm this is intended.
                build_type_path = '//*[@id="bdjs11"]/table[1]/tr[2]/td[4]/text()'
            else:
                item = '/html/body/div[6]/table/tr/td/ul/li[{}]/span/text()'
                record.auction_name = first('//*[@id="xmgg"]/div/div[1]/text()')
                record.assess_value = self.get_float(first(item.format(3)))

                published = first(item.format(2))
                try:
                    record.announcement_date = datetime.datetime.strptime(
                        published, "%Y-%m-%d")
                except Exception:
                    # Second accepted date format for this layout.
                    record.announcement_date = datetime.datetime.strptime(
                        published, "%Y/%m/%d")

                record.court = first(item.format(1))
                sections.append(tree.xpath(
                    'string(//*[@id="bdxx"]/div)').encode().decode())
                sections.append(tree.xpath(
                    'string(//*[@id="tjzl"]/div/div[2])').encode().decode())
                record.contacts = first(item.format(4))
                record.phone_number = first(item.format(5))
                build_type_path = '//*[@id="bdxx"]/div/div[2]/table/tr[2]/td[3]/text()'

            # Shared tail of all three layouts.
            record.info = sections
            try:
                record.build_type = first(build_type_path)
            except Exception:
                record.build_type = None
            record.auction_id = id_
            record.auction_time = self.get_date(date=auction_time)
            record.insert_db()
        except Exception as e:
            log.error('解析错误,url="{}",e="{}"'.format(detail_url, e))
    except Exception as e:
        log.error('详情页请求错误,url="{}",e="{}"'.format(detail_url, e))
def get_detail(self, aution_url, aution_id, aution_time, region_name, city_name, html_type, auction_type):
    """Fetch a lot's detail page, parse it and store the record.

    Args:
        aution_url: Detail page URL; URLs containing ``item2`` use the first
            page layout, all others the second one.
        aution_id: Stored as ``auction_id``.
        aution_time: Auction time as ``%Y-%m-%d %H:%M:%S``; ignored when empty.
        region_name: Stored as ``region``.
        city_name: Stored as ``city``.
        html_type: Stored as ``html_type``.
        auction_type: Passed to the ``Auction`` constructor.

    The request itself is not guarded, so its errors propagate to the caller.
    Parsing errors are logged with the URL and the record is not stored.
    """
    # Amount with optional thousands separators and decimals. As with the
    # pattern it replaces, at least two digits are required, so the match
    # starts where it always did - but the whole number is now used instead
    # of a truncated first group ("50000" used to be read as "5000").
    amount_re = re.compile(r'\d+(?:,?\d+)+(?:\.\d+)?')

    def parse_amount(text):
        # First amount in *text* as a float, thousands separators removed.
        return float(amount_re.search(text).group(0).replace(',', ''))

    auction = Auction(source, auction_type)
    response = requests.get(aution_url, headers=self.headers)
    try:
        html = response.text
        tree = etree.HTML(html)

        def first(path):
            # First XPath result; IndexError when nothing matches.
            return tree.xpath(path)[0]

        auction.auction_id = aution_id
        auction.region = region_name
        auction.city = city_name
        auction.source_html = html
        auction.html_type = html_type
        try:
            auction.start_auction_price = float(
                first('//*[@id="Price_Start"]/text()').replace(',', ''))
        except (IndexError, ValueError):
            auction.start_auction_price = None

        if 'item2' in aution_url:
            auction.auction_name = first('//div[@class="d-m-title"]/b/text()')
            auction.auction_level = first(
                '//div[@class="d-m-tb"]/table[1]/tr[1]/td[2]/text()')
            try:
                auction.assess_value = parse_amount(first(
                    '//div[@class="d-m-tb"]/table[1]/tr[4]/td[1]/text()'))
            except (IndexError, AttributeError, ValueError):
                # The assessed value is optional.
                auction.assess_value = None
            auction.earnest_money = parse_amount(first(
                '//div[@class="d-m-tb"]/table[1]/tr[3]/td[2]/text()'))
            court = first('//td[@class="pr7"]/text()')
            auction.court = re.search('法院:(.*?)$', court,
                                      re.S | re.M).group(1)
            auction.contacts = first('//td[@valign="top"]/text()')
            phone_number = first('//td[@colspan="2"]/text()')
            phone_match = re.search('联系电话:(.*?)$', phone_number,
                                    re.S | re.M)
            auction.phone_number = phone_match.group(1) if phone_match else None
            auction.info = [
                tree.xpath(
                    'string(//div[@class="panel-con"]/div[@class="d-block"][2])'),
                tree.xpath(
                    'string(//div[@class="panel-con"]/div[@class="d-article d-article2"][3])'),
            ]
        else:
            item = '/html/body/div[1]/div[7]/div[2]/div[1]/div[2]/div[4]/li[{}]/text()'
            auction.auction_name = first('//div[@class="DivItemName"]/text()')
            auction.auction_level = first(item.format(4))
            try:
                auction.assess_value = parse_amount(first(item.format(5)))
            except (IndexError, AttributeError, ValueError):
                auction.assess_value = None
            auction.earnest_money = parse_amount(first(item.format(6)))
            court = first(item.format(8))
            auction.court = re.search('法院:(.*?)$', court,
                                      re.S | re.M).group(1)
            area = first(item.format(2))
            # Keep the decimals: the old code used only the integer part of
            # the matched "digits.digits" value.
            auction.area = float(re.search(r'\d+\.\d+', area).group(0))
            auction.info = [
                tree.xpath('string(//div[@id="Tab1"])'),
                tree.xpath('string(//div[@class="bootstrap-table"])'),
            ]

        # Identical in both layouts, so it is done once here.
        if aution_time:
            auction.auction_time = datetime.datetime.strptime(
                aution_time, "%Y-%m-%d %H:%M:%S")
        auction.insert_db()
    except Exception as e:
        log.error('解析错误,url="{}",e="{}"'.format(aution_url, e))