def tj_calendar(room_id):
    """Fetch the Tujia check-in calendar for one listing.

    The listing page is requested first; the calendar endpoints are only
    queried when that page loads and its text contains ``房屋描述``. The
    ``productId`` of the first product returned by ``getProducts`` is then
    passed to ``getProductCalendar``.

    Reads the module-level names ``today``, ``two_mon_date`` and
    ``first_day_month`` in addition to ``com_tools``.

    :param room_id: Tujia unit id of the listing.
    :return: dict mapping ``'YYYY-MM-DD'`` strings to the entries of the
        ``checkIn`` list of the calendar response; entries are assigned to
        consecutive days starting at ``first_day_month``. The dict is empty
        when the listing page check fails. When a request or parsing step
        raises, the error is logged and whatever was collected so far is
        returned.
    """
    calendar_dict = dict()
    page_response = com_tools.requests_get(
        com_tools.tj_room_url_tmp % room_id,
        headers=com_tools.tujia_header,
        sleep_=0)
    if not (page_response and u'房屋描述' in page_response.text):
        return calendar_dict

    try:
        products = "https://www.tujia.com/bingo/pc/product/getProducts"
        products_data = {
            "checkInDate": str(today),
            "checkOutDate": str(two_mon_date),
            "unitId": room_id,
            "activityInfo": None,
            "callCenter": True
        }
        products_response = com_tools.requests_post(products, json=products_data)
        product_id = products_response.json()['data']['products'][0]['productId']

        product_calendar = "https://www.tujia.com/bingo/pc/product/getProductCalendar"
        product_calendar_data = {
            "productId": product_id,
            "unitId": room_id
        }
        product_calendar_response = com_tools.requests_post(
            product_calendar, json=product_calendar_data)

        first_day = datetime.datetime.strptime(str(first_day_month), '%Y-%m-%d')
        check_in_days = product_calendar_response.json()['data']['checkIn']
        # The response carries no dates of its own here: the n-th entry is
        # taken to belong to the n-th day counted from first_day_month.
        for offset, check_day in enumerate(check_in_days):
            day = first_day + datetime.timedelta(days=offset)
            calendar_dict[day.strftime('%Y-%m-%d')] = check_day
    except Exception:
        # Best-effort crawl: keep the traceback in the log (same style as the
        # other calendar code in this file) and return what we have.
        logging.exception("tj_calendar failed for room_id=%s", room_id)
    return calendar_dict
def my_rank(place_pinyin, page_index=1, rank_list=None, crawl_num=330, rank=0, my_headers=None):
    """Crawl the Mayi listing ranking for a city or a business district.

    Pages are fetched one after another (recursively) until ``crawl_num``
    entries have been collected or the site stops serving the requested page.

    :param place_pinyin: pinyin name of the city / business district to crawl
    :param page_index: page to fetch; the first page is 1
    :param rank_list: accumulator of ``[listing_id, rank]`` pairs. It is
        replaced by a fresh list whenever ``page_index == 1`` or when it is
        not supplied.
    :param crawl_num: stop once at least this many entries were collected
    :param rank: rank of the last entry collected so far
    :param my_headers: request headers passed through to the HTTP helper
    :return: list of ``[listing_id (str), rank (int)]`` pairs, or ``None``
        when the HTTP helper returns a falsy response for a page
    """
    # A None sentinel instead of a list default: a default list would be
    # shared between calls and mutated whenever page_index != 1.
    if page_index == 1 or rank_list is None:
        rank_list = []

    my_list_url = 'http://www.mayi.com/%s/%s/?map=no' % (place_pinyin, page_index)
    response = com_tools.requests_post(my_list_url, headers=my_headers, proxy=None, sleep_=0)
    if not response:
        # Kept from the original behaviour: a failed request yields None,
        # even if earlier pages were already collected.
        return None

    html = etree.HTML(response.text)
    pg_active = html.xpath('//*[@id="page"]/a[@class="pg-active"]/text()')
    if pg_active != [str(page_index)]:
        # The page marked active is not the one we asked for: no more pages.
        return rank_list

    for dd in html.xpath('//*[@id="searchRoom"]/dd'):
        rank += 1
        rank_list.append([str(dd.xpath("./@data")[0]), rank])

    if len(rank_list) < crawl_num:
        return my_rank(place_pinyin,
                       page_index=page_index + 1,
                       rank_list=rank_list,
                       crawl_num=crawl_num,
                       rank=rank,
                       my_headers=my_headers)
    return rank_list
def calender(self):
    """Collect this listing's booking calendar and pass it to ``self.storage()``.

    Nothing is fetched unless ``self.state == 1``. The data source is chosen
    by ``self.third_type``:

    * ``1``  -- GET ``self.my_price_url`` starting at ``self.start_day``;
      the method re-enters itself with the next month's start day until at
      least 90 days are known or a request adds no new day.
    * ``3``  -- Tujia ``getProducts`` followed by ``getProductCalendar``.
    * ``25`` -- Meituan Phoenix ``calendar/query``.
    * ``6``  -- Xiaozhu ``AJAX_GetLodgeUnitCalendar`` (only when
      ``self.srf_token`` is truthy).
    * ``20`` -- Airbnb China ``calendar_months``.

    Results are written into ``self.calendar_dict`` keyed by date. For
    ``third_type`` 3 the value is the raw ``checkIn`` entry; for the other
    sources it is a dict with the keys ``status``, ``price`` and
    ``vacantCount``.

    Branches 1, 3 and 6 log and swallow any exception. In branches 25 and 20
    a falsy response is skipped, but errors while parsing the payload still
    propagate to the caller (and ``self.storage()`` is then not reached).

    ``self.storage()`` is called at the end of every invocation that does not
    raise; because branch 1 re-enters this method, it runs once per nested
    call.
    """
    if self.state == 1 and self.third_type == 1:
        try:
            params = {
                "roomid": self.third_id,
                "startday": self.start_day,
                "initStock": 1
            }
            cur_resp = com_tools.requests_get(url=self.my_price_url,
                                              headers=self.my_header,
                                              params=params,
                                              proxy=None)
            if cur_resp:
                known_before = len(self.calendar_dict)
                for data_day in cur_resp.json()['data']:
                    date = data_day['date']
                    price = data_day['price']
                    vacant_count = data_day['stock']
                    available = data_day['isRent']  # bookable flag: 0 = no, 1 = yes
                    self.calendar_dict[date] = {
                        'status': available,
                        'price': price,
                        'vacantCount': vacant_count
                    }
                # Fetch the following month only while fewer than 90 days are
                # known AND this request contributed at least one new day;
                # otherwise an empty or repeating response would recurse
                # without bound.
                if known_before < len(self.calendar_dict) < 90:
                    self.start_day = com_tools.get_next_month_today(self.start_day)
                    self.calender()
        except Exception as e:
            logging.exception(e)
    elif self.state == 1 and self.third_type == 3:
        try:
            products = "https://www.tujia.com/bingo/pc/product/getProducts"
            products_data = {
                "checkInDate": str(self.today),
                "checkOutDate": str(self.two_mon_date),
                "unitId": self.third_id,
                "activityInfo": None,
                "callCenter": True
            }
            products_response = com_tools.requests_post(products, json=products_data, sleep_=0)
            product_id = products_response.json()['data']['products'][0]['productId']
            product_calendar = "https://www.tujia.com/bingo/pc/product/getProductCalendar"
            product_calendar_data = {
                "productId": product_id,
                "unitId": self.third_id
            }
            product_calendar_response = com_tools.requests_post(
                product_calendar, json=product_calendar_data, sleep_=0)
            date_start = datetime.datetime.strptime(str(self.first_day_month), '%Y-%m-%d')
            # The n-th checkIn entry is stored under the n-th day counted
            # from self.first_day_month.
            for check_day in product_calendar_response.json()['data']['checkIn']:
                date_str = datetime.datetime.strftime(date_start, '%Y-%m-%d')
                self.calendar_dict[str(date_str)] = check_day
                date_start += datetime.timedelta(days=1)
        except Exception as msg:
            logging.exception(msg)
    elif self.state == 1 and self.third_type == 25:
        zg_header = {
            "Cookie": self.zg_cookie,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            "x-csrf-token": self.rf_token,
        }
        # NOTE(review): the product id is a fixed literal rather than a value
        # taken from this listing -- looks like a leftover; confirm which
        # attribute should be sent here.
        query = {'productId': 1129482}
        zg_calendar_response = com_tools.requests_post(
            "https://phoenix.meituan.com/gw/cprod/api/v1/calendar/query",
            headers=zg_header,
            json=query)
        # Skip a failed request instead of calling .json() on a falsy
        # response, as the other branches do.
        if zg_calendar_response:
            for day_info in zg_calendar_response.json()['data']['dateInfos']:
                date_str = datetime.datetime.strftime(
                    datetime.datetime.strptime(str(day_info['date']), '%Y%m%d'),
                    '%Y-%m-%d')
                try:
                    price = day_info['price'] / 100
                except (KeyError, TypeError):
                    # Missing or non-numeric price is recorded as 0.
                    price = 0
                self.calendar_dict[str(date_str)] = {
                    'status': day_info['openStatus'],
                    'price': price,
                    'vacantCount': day_info['inventoryNum']
                }
    elif self.state == 1 and self.third_type == 6 and self.srf_token:
        try:
            url = "http://cd.xiaozhu.com/ajax.php"
            headers = {
                "xSRF-Token": self.srf_token,
                "Referer": "http://cd.xiaozhu.com/fangzi/%s.html" % self.third_id
            }
            # NOTE(review): startdate / enddate / rand are fixed literals, so
            # this always asks for the same window -- confirm whether they
            # should be derived from the current date.
            params = {
                "op": "AJAX_GetLodgeUnitCalendar",
                "lodgeunitid": self.third_id,
                "startdate": "2018-09-01",
                "enddate": "2018-10-01",
                "editable": "true",
                "calendarCode": "true",
                "rand": "0.3927607474257376"
            }
            calender_response = com_tools.requests_get(url,
                                                       headers=headers,
                                                       params=params,
                                                       sleep_=0,
                                                       proxy=None)
            if calender_response:
                for day_calender in calender_response.json():
                    dt = day_calender['start']
                    available = 1 if day_calender['state'] == 'available' else 0
                    price = day_calender['normalDayPrice']
                    # No stock count is read from this response; the
                    # availability flag doubles as the vacancy count.
                    self.calendar_dict[dt] = {
                        'status': available,
                        'price': price,
                        'vacantCount': available
                    }
        except Exception as e:
            logging.exception(e)
    elif self.state == 1 and self.third_type == 20:
        params = {
            "_format": "with_conditions",
            "count": "3",
            "listing_id": self.third_id,
            "month": self.cur_month,
            "year": self.cur_year,
            "key": "d306zoyjsyarp7ifhu67rjxn52tv0t20",
            "currency": "CNY",
            "locale": "zh"
        }
        calendar_month_url = "https://www.airbnbchina.cn/api/v2/calendar_months"
        page_response = com_tools.requests_get(calendar_month_url,
                                               headers=com_tools.air_header,
                                               params=params,
                                               sleep_=0)
        if page_response:
            for month_info in page_response.json()['calendar_months']:
                for day_state in month_info['days']:
                    dt = day_state['date']
                    available = 1 if bool(day_state['available']) else 0
                    price = day_state['price']['local_price']
                    self.calendar_dict[dt] = {
                        'status': available,
                        'price': price,
                        'vacantCount': available
                    }
    self.storage()
def tj_bc_rank(city, city_id, bc, bc_id, page_index=1, rank_list=None, crawl_num=330, rank=0):
    """Crawl the Tujia listing ranking of one business district.

    Pages of 20 units are requested from ``searchunitfull`` one after another
    (recursively) until ``crawl_num`` entries were collected or a page comes
    back without units.

    :param city: Tujia city name
    :param city_id: Tujia city id
    :param bc: Tujia business-district name
    :param bc_id: Tujia business-district id
    :param page_index: page to fetch; the first page is 1
    :param rank_list: accumulator of ``[unit_id, rank]`` pairs. It is replaced
        by a fresh list whenever ``page_index == 1`` or when it is not
        supplied.
    :param crawl_num: stop once at least this many entries were collected
    :param rank: rank of the last entry collected so far
    :return: list of ``[unit_id (str), rank (int)]`` pairs, or ``None`` when
        the HTTP helper returns a falsy response for a page
    """
    # A None sentinel instead of a list default: a default list would be
    # shared between calls and mutated whenever page_index != 1.
    if page_index == 1 or rank_list is None:
        rank_list = []

    today = datetime.date.today()
    yesterday = today - datetime.timedelta(days=1)
    # NOTE(review): the second date sent is the day *before* today; confirm
    # this is the range the API expects.
    date_value = str(today) + "," + str(yesterday)

    search_url = 'https://client.tujia.com/tmsv4/searchunitfull'
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; vivo X7 Build/LMY47V) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/"
                      "537.36 tujia(hotel/6.60/75 mNet/wifi loc/zh_CN)",
    }
    json_data = {
        "bi": "{\"picturemode\":\"small\"}",
        "parameter": {
            "H5Url": None,
            "VillaChannelThemeId": 0,
            "conditions": [{
                "gType": 1,
                "isHotRecommend": False,
                "isLandmark": False,
                "label": "1人",
                "sentValue": None,
                "type": 8,
                "value": "1"
            }, {
                "gType": 0,
                "isHotRecommend": False,
                "isLandmark": False,
                "label": city,
                "sentValue": None,
                "type": 42,
                "value": city_id
            }, {
                "gType": 2,
                "isHotRecommend": False,
                "isLandmark": False,
                "label": bc,
                "sentValue": None,
                "type": 11,
                "value": bc_id
            }, {
                "gType": 1,
                "isHotRecommend": False,
                "isLandmark": False,
                "label": "5公里以内",
                "sentValue": None,
                "type": 10,
                "value": "5000"
            }, {
                "gType": 4,
                "isHotRecommend": False,
                "isLandmark": False,
                "label": "推荐排序",
                "sentValue": None,
                "type": 48,
                "value": "1"
            }, {
                "gType": 0,
                "isHotRecommend": False,
                "isLandmark": False,
                "label": "",
                "sentValue": None,
                "type": 47,
                "value": date_value
            }],
            "onlyNeedUnitCount": False,
            "pageIndex": page_index,
            "pageSize": 20,
            "returnAllConditions": False,
            "returnNavigations": False
        },
        "client": {
            "appId": "com.tujia.hotel",
            "appVersion": "6.60_75",
            "channelCode": "",
            "devModel": "",
            "devToken": "",
            "devType": 2,
            "locale": "zh-CN",
            "osVersion": "",
            "screenInfo": "",
            "uID": ""
        },
        "code": None,
        "psid": "",
        "type": "searchunitfull",
        "user": None,
        "usid": None
    }

    resp = com_tools.requests_post(search_url, headers=headers, json=json_data, sleep_=0)
    if not resp:
        # Kept from the original behaviour: a failed request yields None,
        # even if earlier pages were already collected.
        return None

    resp_json = resp.json()
    try:
        unit_list = resp_json['content']['list']
    except (KeyError, TypeError):
        # Payload without the expected structure: treat as "no more units".
        unit_list = []
    if not unit_list:
        return rank_list

    for unit in unit_list:
        rank += 1
        rank_list.append([str(unit['unitId']), rank])

    if len(rank_list) < crawl_num:
        return tj_bc_rank(city, city_id, bc, bc_id, page_index + 1,
                          rank_list=rank_list,
                          crawl_num=crawl_num,
                          rank=rank)
    return rank_list
def tj_city_rank(city_id, page_index=0, rank_list=None, crawl_num=330, rank=0):
    """Crawl the Tujia listing ranking of one city.

    Pages of 20 units are requested from ``searchunitfull`` one after another
    (recursively) until ``crawl_num`` entries were collected or a page comes
    back without units.

    :param city_id: Tujia city id
    :param page_index: page to fetch; the first page is 0
    :param rank_list: accumulator of ``[unit_id, rank]`` pairs. It is replaced
        by a fresh list whenever ``page_index == 0`` or when it is not
        supplied.
    :param crawl_num: stop once at least this many entries were collected
    :param rank: rank of the last entry collected so far
    :return: list of ``[unit_id (str), rank (int)]`` pairs, or ``None`` when
        the HTTP helper returns a falsy response for a page
    """
    # A None sentinel instead of a list default: a default list would be
    # shared between calls and mutated whenever page_index != 0.
    if page_index == 0 or rank_list is None:
        rank_list = []

    today = datetime.date.today()
    yesterday = today - datetime.timedelta(days=1)
    # NOTE(review): the second date sent is the day *before* today; confirm
    # this is the range the API expects.
    date_value = str(today) + "," + str(yesterday)

    search_url = 'https://client.tujia.com/tmsv4/searchunitfull'
    headers = {"User-Agent": com_tools.chrom_ua}
    json_data = {
        "bi": "{\"picturemode\":\"small\"}",
        "parameter": {
            "H5Url": None,
            "VillaChannelThemeId": 0,
            "conditions": [{
                "gType": 0,
                "isHotRecommend": False,
                "isLandmark": False,
                "label": None,
                "sentValue": None,
                "type": 47,
                "value": date_value
            }, {
                "gType": 1,
                "isHotRecommend": False,
                "isLandmark": False,
                "label": "1人",
                "sentValue": None,
                "type": 8,
                "value": "1"
            }, {
                "gType": 0,
                "isHotRecommend": False,
                "isLandmark": False,
                "label": None,
                "sentValue": None,
                "type": 42,
                "value": city_id
            }],
            "onlyNeedUnitCount": False,
            "pageIndex": page_index,
            "pageSize": 20,
            "returnAllConditions": True,
            "returnNavigations": True
        },
        "client": {
            "appId": "com.tujia.hotel",
            "appVersion": "6.60_75",
            "channelCode": "",
            "devModel": "",
            "devToken": "",
            "devType": 2,
            "locale": "zh-CN",
            "osVersion": "",
            "screenInfo": "",
            "uID": ""
        },
        "code": None,
        "psid": "",
        "type": "",
        "user": None,
        "usid": None
    }

    resp = com_tools.requests_post(search_url, headers=headers, json=json_data, sleep_=0)
    if not resp:
        # Kept from the original behaviour: a failed request yields None,
        # even if earlier pages were already collected.
        return None

    resp_json = resp.json()
    try:
        unit_list = resp_json['content']['list']
    except (KeyError, TypeError):
        # Same handling as tj_bc_rank: a payload without the expected
        # structure ends the crawl instead of raising.
        unit_list = []
    if not unit_list:
        return rank_list

    for unit in unit_list:
        unit_id = unit['unitId']
        rank += 1
        rank_list.append([str(unit_id), rank])

    if len(rank_list) < crawl_num:
        return tj_city_rank(city_id, page_index + 1,
                            rank_list=rank_list,
                            crawl_num=crawl_num,
                            rank=rank)
    return rank_list