示例#1
0
def tj_calendar(room_id):
    """Build a day-by-day check-in calendar for one Tujia listing.

    The listing page is requested first.  The calendar endpoints are only
    queried when that request succeeded and the page text contains the
    marker u'房屋描述' ("house description").  Any exception raised while
    querying or parsing the calendar endpoints is logged, and whatever has
    been collected up to that point is returned.

    Relies on the module-level names ``today``, ``two_mon_date`` and
    ``first_day_month``.

    :param room_id: Tujia unit id; formatted into the page URL and sent as
        ``unitId`` to both endpoints.
    :return: dict mapping ``'YYYY-MM-DD'`` strings to the entries of the
        ``checkIn`` list returned by ``getProductCalendar``.  The first entry
        is keyed by ``first_day_month`` and every following entry by the next
        calendar day.  Empty when the page check fails.
    """
    calendar_dict = {}
    listing_page = com_tools.requests_get(
        com_tools.tj_room_url_tmp % room_id,
        headers=com_tools.tujia_header,
        sleep_=0)
    if not listing_page or u'房屋描述' not in listing_page.text:
        return calendar_dict

    try:
        # Step 1: resolve the id of the unit's first product.
        products_payload = {
            "checkInDate": str(today),
            "checkOutDate": str(two_mon_date),
            "unitId": room_id,
            "activityInfo": None,
            "callCenter": True
        }
        products_reply = com_tools.requests_post(
            "https://www.tujia.com/bingo/pc/product/getProducts",
            json=products_payload)
        first_product = products_reply.json()['data']['products'][0]
        product_id = first_product['productId']

        # Step 2: fetch that product's calendar.
        calendar_payload = {"productId": product_id, "unitId": room_id}
        calendar_reply = com_tools.requests_post(
            "https://www.tujia.com/bingo/pc/product/getProductCalendar",
            json=calendar_payload)

        # The reply carries no dates of its own here: entries are keyed by
        # position, counting days from first_day_month.
        first_day = datetime.datetime.strptime(str(first_day_month),
                                               '%Y-%m-%d')
        check_in_days = calendar_reply.json()['data']['checkIn']
        for offset, check_day in enumerate(check_in_days):
            day = first_day + datetime.timedelta(days=offset)
            calendar_dict[day.strftime('%Y-%m-%d')] = check_day
    except Exception as msg:
        logging.error(msg)
    return calendar_dict
示例#2
0
def my_rank(place_pinyin,
            page_index=1,
            rank_list=None,
            crawl_num=330,
            rank=0,
            my_headers=None):
    """Crawl the Mayi listing ranking for a city or a business district.

    Pages are fetched one after another, starting at ``page_index``, until
    ``crawl_num`` entries have been collected or the site stops reporting the
    requested page as the active one.

    :param place_pinyin: pinyin of the city / business district to crawl;
        formatted into the list URL.
    :param page_index: page to start from; the first page is 1.
    :param rank_list: list receiving ``[listing_id, rank]`` pairs.  It is
        replaced by a fresh list when ``page_index`` is 1 or when it is
        omitted; otherwise the passed list is extended in place.
    :param crawl_num: stop once at least this many entries are collected.
    :param rank: rank already assigned to the last collected listing; the
        next listing gets ``rank + 1``.
    :param my_headers: request headers passed through to the HTTP helper.
    :return: the list of ``[listing_id, rank]`` pairs, or ``None`` when a
        page request returns a falsy response.
    """
    # A fresh list per call: a list default would be created once and shared
    # by every call that does not start on page 1.
    if page_index == 1 or rank_list is None:
        rank_list = []

    while True:
        my_list_url = 'http://www.mayi.com/%s/%s/?map=no' % (place_pinyin,
                                                             page_index)
        response = com_tools.requests_post(my_list_url,
                                           headers=my_headers,
                                           proxy=None,
                                           sleep_=0)
        if not response:
            # Kept from the original contract: a failed request yields None.
            return None

        html = etree.HTML(response.text)
        pg_active = html.xpath('//*[@id="page"]/a[@class="pg-active"]/text()')
        if pg_active != [str(page_index)]:
            # The pager does not mark the requested page as active, so we
            # have run past the last page.
            return rank_list

        for dd in html.xpath('//*[@id="searchRoom"]/dd'):
            rank += 1
            rank_list.append([str(dd.xpath("./@data")[0]), rank])

        if len(rank_list) >= crawl_num:
            return rank_list
        page_index += 1
示例#3
0
    def calender(self):
        """Fill ``self.calendar_dict`` for this listing, then persist it.

        Nothing is fetched unless ``self.state == 1``.  The platform is
        selected by ``self.third_type``:

        * 1  -> ``self.my_price_url`` endpoint
        * 3  -> tujia.com product calendar
        * 25 -> phoenix.meituan.com calendar query
        * 6  -> xiaozhu.com calendar (only when ``self.srf_token`` is set)
        * 20 -> airbnbchina.cn calendar months

        ``self.storage()`` is always called exactly once at the end.
        """
        if self.state == 1:
            third_type = self.third_type
            if third_type == 1:
                self._calender_my()
            elif third_type == 3:
                self._calender_tujia()
            elif third_type == 25:
                self._calender_zg()
            elif third_type == 6 and self.srf_token:
                self._calender_xiaozhu()
            elif third_type == 20:
                self._calender_airbnb()
        self.storage()

    def _calender_my(self):
        """Collect days from ``self.my_price_url`` until 90 days are known.

        Each request starts at ``self.start_day``; after a request that left
        fewer than 90 days in ``self.calendar_dict``, ``self.start_day`` is
        advanced with ``com_tools.get_next_month_today`` and the next request
        is made.  Errors are logged and end the collection.
        """
        try:
            while True:
                params = {
                    "roomid": self.third_id,
                    "startday": self.start_day,
                    "initStock": 1
                }
                cur_resp = com_tools.requests_get(url=self.my_price_url,
                                                  headers=self.my_header,
                                                  params=params,
                                                  proxy=None)
                if not cur_resp:
                    break
                known_days = len(self.calendar_dict)
                for data_day in cur_resp.json()['data']:
                    self.calendar_dict[data_day['date']] = {
                        # isRent: 0 = not bookable, 1 = bookable
                        'status': data_day['isRent'],
                        'price': data_day['price'],
                        'vacantCount': data_day['stock']
                    }
                if len(self.calendar_dict) >= 90:
                    break
                if len(self.calendar_dict) == known_days:
                    # This request added no new day; asking again would
                    # never reach 90, so stop instead of looping forever.
                    break
                self.start_day = com_tools.get_next_month_today(
                    self.start_day)
        except Exception as e:
            logging.exception(e)

    def _calender_tujia(self):
        """Collect the tujia.com check-in calendar of the first product.

        Entries of the returned ``checkIn`` list are keyed by consecutive
        days starting at ``self.first_day_month``.  Errors are logged.
        """
        try:
            products_data = {
                "checkInDate": str(self.today),
                "checkOutDate": str(self.two_mon_date),
                "unitId": self.third_id,
                "activityInfo": None,
                "callCenter": True
            }
            products_response = com_tools.requests_post(
                "https://www.tujia.com/bingo/pc/product/getProducts",
                json=products_data,
                sleep_=0)
            product_id = products_response.json(
            )['data']['products'][0]['productId']
            product_calendar_data = {
                "productId": product_id,
                "unitId": self.third_id
            }
            product_calendar_response = com_tools.requests_post(
                "https://www.tujia.com/bingo/pc/product/getProductCalendar",
                json=product_calendar_data,
                sleep_=0)
            date_start = datetime.datetime.strptime(
                str(self.first_day_month), '%Y-%m-%d')
            check_in_days = product_calendar_response.json(
            )['data']['checkIn']
            for offset, check_day in enumerate(check_in_days):
                day = date_start + datetime.timedelta(days=offset)
                self.calendar_dict[day.strftime('%Y-%m-%d')] = check_day
        except Exception as msg:
            logging.exception(msg)

    def _calender_zg(self):
        """Collect days from the phoenix.meituan.com calendar query.

        Prices are divided by 100; a missing or non-numeric price becomes 0.
        Request and parsing errors are logged so that the caller still
        reaches ``self.storage()``.
        """
        zg_header = {
            "Cookie": self.zg_cookie,
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            "x-csrf-token": self.rf_token,
        }
        # NOTE(review): productId is a fixed literal and does not depend on
        # self.third_id -- confirm whether this is intentional.
        data = {'productId': 1129482}
        try:
            zg_calendar_response = com_tools.requests_post(
                "https://phoenix.meituan.com/gw/cprod/api/v1/calendar/query",
                headers=zg_header,
                json=data)
            for day_info in zg_calendar_response.json()['data']['dateInfos']:
                date_str = datetime.datetime.strptime(
                    str(day_info['date']), '%Y%m%d').strftime('%Y-%m-%d')
                try:
                    price = day_info['price'] / 100
                except (KeyError, TypeError):
                    price = 0
                self.calendar_dict[date_str] = {
                    'status': day_info['openStatus'],
                    'price': price,
                    'vacantCount': day_info['inventoryNum']
                }
        except Exception as e:
            logging.exception(e)

    def _calender_xiaozhu(self):
        """Collect days from the xiaozhu.com lodge-unit calendar.

        ``status`` and ``vacantCount`` are both 1 when the day's ``state`` is
        ``'available'`` and 0 otherwise.  Errors are logged.
        """
        try:
            url = "http://cd.xiaozhu.com/ajax.php"
            headers = {
                "xSRF-Token":
                self.srf_token,
                "Referer":
                "http://cd.xiaozhu.com/fangzi/%s.html" % self.third_id
            }
            # NOTE(review): startdate/enddate are fixed literals, so the
            # same window is requested on every run -- confirm intent.
            params = {
                "op": "AJAX_GetLodgeUnitCalendar",
                "lodgeunitid": self.third_id,
                "startdate": "2018-09-01",
                "enddate": "2018-10-01",
                "editable": "true",
                "calendarCode": "true",
                "rand": "0.3927607474257376"
            }
            calender_response = com_tools.requests_get(url,
                                                       headers=headers,
                                                       params=params,
                                                       sleep_=0,
                                                       proxy=None)
            if calender_response:
                for day_calender in calender_response.json():
                    available = (1 if day_calender['state'] == 'available'
                                 else 0)
                    self.calendar_dict[day_calender['start']] = {
                        'status': available,
                        'price': day_calender['normalDayPrice'],
                        'vacantCount': available
                    }
        except Exception as e:
            logging.exception(e)

    def _calender_airbnb(self):
        """Collect days from the airbnbchina.cn ``calendar_months`` API.

        Requests three months starting at ``self.cur_month`` /
        ``self.cur_year``.  ``status`` and ``vacantCount`` are both 1 when
        the day's ``available`` value is truthy and 0 otherwise.  Parsing
        errors are not caught here.
        """
        params = {
            "_format": "with_conditions",
            "count": "3",
            "listing_id": self.third_id,
            "month": self.cur_month,
            "year": self.cur_year,
            "key": "d306zoyjsyarp7ifhu67rjxn52tv0t20",
            "currency": "CNY",
            "locale": "zh"
        }
        calendar_month_url = "https://www.airbnbchina.cn/api/v2/calendar_months"
        page_response = com_tools.requests_get(
            calendar_month_url,
            headers=com_tools.air_header,
            params=params,
            sleep_=0)
        if page_response:
            for month_data in page_response.json()['calendar_months']:
                for day_state in month_data['days']:
                    available = 1 if day_state['available'] else 0
                    self.calendar_dict[day_state['date']] = {
                        'status': available,
                        'price': day_state['price']['local_price'],
                        'vacantCount': available
                    }
示例#4
0
def tj_bc_rank(city,
               city_id,
               bc,
               bc_id,
               page_index=1,
               rank_list=None,
               crawl_num=330,
               rank=0):
    """Crawl the Tujia listing ranking inside one business district.

    Pages of 20 units are requested one after another, starting at
    ``page_index``, until ``crawl_num`` entries have been collected or a page
    comes back without units.

    :param city: Tujia city name (sent as a condition label).
    :param city_id: Tujia city id.
    :param bc: Tujia business-district name (sent as a condition label).
    :param bc_id: Tujia business-district id.
    :param page_index: page to start from; the first page is 1.
    :param rank_list: list receiving ``[unit_id, rank]`` pairs.  It is
        replaced by a fresh list when ``page_index`` is 1 or when it is
        omitted; otherwise the passed list is extended in place.
    :param crawl_num: stop once at least this many entries are collected.
    :param rank: rank already assigned to the last collected unit; the next
        unit gets ``rank + 1``.
    :return: the list of ``[unit_id, rank]`` pairs, or ``None`` when a page
        request returns a falsy response.
    """
    # A fresh list per call: a list default would be created once and shared
    # by every call that does not start on page 1.
    if page_index == 1 or rank_list is None:
        rank_list = []

    today = datetime.date.today()
    yesterday = today - datetime.timedelta(days=1)
    # NOTE(review): the range is sent as "today,yesterday", i.e. the second
    # date precedes the first -- confirm this is what the API expects.
    date_value = str(today) + "," + str(yesterday)
    search_url = 'https://client.tujia.com/tmsv4/searchunitfull'
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Linux; Android 5.1.1; vivo X7 Build/LMY47V) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/"
        "537.36 tujia(hotel/6.60/75 mNet/wifi loc/zh_CN)",
    }

    def condition(g_type, label, cond_type, value):
        # One search condition; key order matches the original payload.
        return {
            "gType": g_type,
            "isHotRecommend": False,
            "isLandmark": False,
            "label": label,
            "sentValue": None,
            "type": cond_type,
            "value": value
        }

    json_data = {
        "bi": "{\"picturemode\":\"small\"}",
        "parameter": {
            "H5Url": None,
            "VillaChannelThemeId": 0,
            "conditions": [
                condition(1, "1人", 8, "1"),
                condition(0, city, 42, city_id),
                condition(2, bc, 11, bc_id),
                condition(1, "5公里以内", 10, "5000"),
                condition(4, "推荐排序", 48, "1"),
                condition(0, "", 47, date_value),
            ],
            "onlyNeedUnitCount": False,
            "pageIndex": page_index,
            "pageSize": 20,
            "returnAllConditions": False,
            "returnNavigations": False
        },
        "client": {
            "appId": "com.tujia.hotel",
            "appVersion": "6.60_75",
            "channelCode": "",
            "devModel": "",
            "devToken": "",
            "devType": 2,
            "locale": "zh-CN",
            "osVersion": "",
            "screenInfo": "",
            "uID": ""
        },
        "code": None,
        "psid": "",
        "type": "searchunitfull",
        "user": None,
        "usid": None
    }

    while True:
        json_data["parameter"]["pageIndex"] = page_index
        resp = com_tools.requests_post(search_url,
                                       headers=headers,
                                       json=json_data,
                                       sleep_=0)
        if not resp:
            # Kept from the original contract: a failed request yields None.
            return None

        resp_json = resp.json()
        try:
            unit_list = resp_json['content']['list']
        except (KeyError, TypeError):
            # Payload without the expected structure: treat as "no units".
            unit_list = []

        if not unit_list:
            return rank_list
        for unit in unit_list:
            rank += 1
            rank_list.append([str(unit['unitId']), rank])

        if len(rank_list) >= crawl_num:
            return rank_list
        page_index += 1
示例#5
0
def tj_city_rank(city_id,
                 page_index=0,
                 rank_list=None,
                 crawl_num=330,
                 rank=0):
    """Crawl the Tujia listing ranking of a whole city.

    Pages of 20 units are requested one after another, starting at
    ``page_index``, until ``crawl_num`` entries have been collected or a page
    comes back without units.

    :param city_id: Tujia city id.
    :param page_index: page to start from; the first page is 0.
    :param rank_list: list receiving ``[unit_id, rank]`` pairs.  It is
        replaced by a fresh list when ``page_index`` is 0 or when it is
        omitted; otherwise the passed list is extended in place.
    :param crawl_num: stop once at least this many entries are collected.
    :param rank: rank already assigned to the last collected unit; the next
        unit gets ``rank + 1``.
    :return: the list of ``[unit_id, rank]`` pairs, or ``None`` when a page
        request returns a falsy response.
    """
    # A fresh list per call: a list default would be created once and shared
    # by every call that does not start on page 0.
    if page_index == 0 or rank_list is None:
        rank_list = []

    today = datetime.date.today()
    yesterday = today - datetime.timedelta(days=1)
    # NOTE(review): the range is sent as "today,yesterday", i.e. the second
    # date precedes the first -- confirm this is what the API expects.
    date_value = str(today) + "," + str(yesterday)
    search_url = 'https://client.tujia.com/tmsv4/searchunitfull'
    headers = {"User-Agent": com_tools.chrom_ua}

    def condition(g_type, label, cond_type, value):
        # One search condition; key order matches the original payload.
        return {
            "gType": g_type,
            "isHotRecommend": False,
            "isLandmark": False,
            "label": label,
            "sentValue": None,
            "type": cond_type,
            "value": value
        }

    json_data = {
        "bi": "{\"picturemode\":\"small\"}",
        "parameter": {
            "H5Url": None,
            "VillaChannelThemeId": 0,
            "conditions": [
                condition(0, None, 47, date_value),
                condition(1, "1人", 8, "1"),
                condition(0, None, 42, city_id),
            ],
            "onlyNeedUnitCount": False,
            "pageIndex": page_index,
            "pageSize": 20,
            "returnAllConditions": True,
            "returnNavigations": True
        },
        "client": {
            "appId": "com.tujia.hotel",
            "appVersion": "6.60_75",
            "channelCode": "",
            "devModel": "",
            "devToken": "",
            "devType": 2,
            "locale": "zh-CN",
            "osVersion": "",
            "screenInfo": "",
            "uID": ""
        },
        "code": None,
        "psid": "",
        "type": "",
        "user": None,
        "usid": None
    }

    while True:
        json_data["parameter"]["pageIndex"] = page_index
        resp = com_tools.requests_post(search_url,
                                       headers=headers,
                                       json=json_data,
                                       sleep_=0)
        if not resp:
            # Kept from the original contract: a failed request yields None.
            return None

        resp_json = resp.json()
        try:
            unit_list = resp_json['content']['list']
        except (KeyError, TypeError):
            # Payload without the expected structure: treat as "no units"
            # so the ranks collected so far are still returned.
            unit_list = []

        if not unit_list:
            return rank_list
        for unit in unit_list:
            rank += 1
            rank_list.append([str(unit['unitId']), rank])

        if len(rank_list) >= crawl_num:
            return rank_list
        page_index += 1