def main():
    """
    Crawl the dianping.com Shenzhen ``ch35`` listings, one area code at a time.

    The area codes are visited in shuffled order. For each area the base
    listing page is fetched to decide how many result pages to walk, and
    every page ``<baseurl>p<N>`` is then passed to ``tour_from_url``. A
    random sleep precedes every request.
    """
    # Category codes kept for reference (not used by the crawl below):
    # tourtype = ['g33831', 'g2916', 'g2926', 'g2834', 'g5672', 'g27852', 'g20038', 'g33832']
    area_codes = [
        'r29', 'r31', 'r30', 'r32', 'r12036', 'r12033', 'r34', 'r33', 'r12035'
    ]
    random.shuffle(area_codes)
    print('tourarea {}'.format(area_codes))

    for area in area_codes:
        baseurl = 'http://www.dianping.com/shenzhen/ch35/%s' % area
        time.sleep(random.randint(2, 5))
        doc = pq(html_from_url(baseurl))

        page_count = 0
        if doc('.PageLink').text():
            # A pager is present: the last pager link's data-ga-page value
            # is used as the number of pages.
            page_count = int(doc('.PageLink:last').attr('data-ga-page'))
        elif 0 < doc('#shop-all-list li').length <= 15:
            # No pager, but between 1 and 15 list items: a single page.
            page_count = 1
        else:
            print('maxpage in else: {}'.format(page_count))
            continue
        print('maxpage: {}'.format(page_count))

        for page_no in range(1, page_count + 1):
            page_url = baseurl + 'p' + str(page_no)
            time.sleep(random.randint(5, 10))
            tour_from_url(page_url)
def main():
    """
    Crawl the dianping.com Shenzhen ``ch30`` listings for every combination
    of a category code and an entry of the module-level ``dz_location``.

    Categories are visited in shuffled order. For each combination the base
    listing page is fetched to decide how many result pages to walk, and
    every page ``<baseurl>p<N>`` is passed to ``life_from_url``. A random
    sleep precedes every request.
    """
    categories = [
        'g141', 'g133', 'g2636', 'g20042', 'g142', 'g134', 'g135', 'g140',
        'g144', 'g32732', 'g137', 'g20038', 'g156', 'g20039', 'g20040',
        'g6694', 'g2754', 'g20041', 'g33857', 'g34089', 'g34090'
    ]
    random.shuffle(categories)
    print('lifetype {}'.format(categories))

    for category in categories:
        for region in dz_location:
            baseurl = 'http://www.dianping.com/shenzhen/ch30/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.PageLink').text():
                # Pager present: take the page count from its last link.
                page_count = int(doc('.PageLink:last').attr('data-ga-page'))
            elif 0 < doc('#shop-all-list li').length <= 15:
                # No pager, but between 1 and 15 list items: a single page.
                page_count = 1
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                life_from_url(page_url)
def main():
    """
    Crawl the dianping.com Shenzhen ``ch50`` listings for every combination
    of a category code and a location code from a fixed local list.

    Categories are visited in shuffled order; locations in the order listed.
    The module-level ``dz_location`` is only printed, not iterated. For each
    combination the base listing page is fetched to decide how many result
    pages to walk, and every page ``<baseurl>p<N>`` is passed to
    ``beauty_from_url``. A random sleep precedes every request.
    """
    categories = ['g157', 'g158', 'g33761', 'g183', 'g148', 'g149', 'g2898',
                  'g159', 'g493', 'g2572', 'g123', 'g2790']
    locations = ['r29', 'r1949', 'r7475', 'r1560', 'r12322', 'r1556', 'r1951',
                 'r12321', 'r1559', 'r12225', 'r1557', 'r1573', 'r12324',
                 'r12226', 'r12323', 'r3138', 'r12320', 'r12319', 'r1950']
    random.shuffle(categories)

    print('beautytype {}'.format(categories))
    print('dz_location {}'.format(dz_location))

    for category in categories:
        for region in locations:
            baseurl = 'http://www.dianping.com/shenzhen/ch50/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.PageLink').text():
                # Pager present: take the page count from its last link.
                page_count = int(doc('.PageLink:last').attr('data-ga-page'))
            elif 0 < doc('#shop-all-list li').length <= 15:
                # No pager, but between 1 and 15 list items: a single page.
                page_count = 1
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                beauty_from_url(page_url)
def main():
    """
    Crawl the dianping.com Shenzhen ``ch25`` listings for every combination
    of a category code and an area code.

    Both lists are shuffled before the crawl. For each combination the base
    listing page is fetched to decide how many result pages to walk, and
    every page ``<baseurl>p<N>`` is passed to ``film_from_url``. A random
    sleep precedes every request.
    """
    categories = [
        'g136', 'g25461', 'g33880', 'g33877', 'g33879', 'g33878', 'g33881',
        'g33882'
    ]
    areas = ['r29', 'r31', 'r30', 'r32', 'r12033', 'r12035', 'r34', 'r33']

    random.shuffle(categories)
    random.shuffle(areas)

    print('filmtype {}'.format(categories))
    print('area {}'.format(areas))

    for category in categories:
        for region in areas:
            baseurl = 'http://www.dianping.com/shenzhen/ch25/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.PageLink').text():
                # Pager present: take the page count from its last link.
                page_count = int(doc('.PageLink:last').attr('data-ga-page'))
            elif 0 < doc('#shop-all-list li').length <= 15:
                # No pager, but between 1 and 15 list items: a single page.
                page_count = 1
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                film_from_url(page_url)
def pet_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each element matched by
    ``#shop-all-list>ul>li`` is converted with ``pet_from_li``.

    :param url: address of the listing page to fetch.
    :return: list holding one ``pet_from_li`` result per matched element.
    """
    doc = pq(html_from_url(url))
    shops = []
    for li in doc('#shop-all-list>ul>li'):
        shops.append(pet_from_li(li))
    return shops
def home_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each ``li`` child of the
    ``.shop-list`` element is converted with ``home_from_li``.

    :param url: address of the listing page to fetch.
    :return: list holding one ``home_from_li`` result per ``li`` child.
    """
    doc = pq(html_from_url(url))
    shops = []
    for li in doc('.shop-list').children('li'):
        shops.append(home_from_li(li))
    return shops
def tour_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each ``li`` found under
    ``#shop-all-list`` is converted with ``tour_from_li``.

    :param url: address of the listing page to fetch.
    :return: list holding one ``tour_from_li`` result per ``li`` found.
    """
    doc = pq(html_from_url(url))
    shops = []
    for li in doc('#shop-all-list').find('li'):
        shops.append(tour_from_li(li))
    return shops
def life_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each ``li`` found under
    ``#shop-all-list`` is converted with ``life_from_li``.

    :param url: address of the listing page to fetch.
    :return: list holding one ``life_from_li`` result per ``li`` found.
    """
    doc = pq(html_from_url(url))
    shops = []
    for li in doc('#shop-all-list').find('li'):
        shops.append(life_from_li(li))
    return shops
# Example #9
# 0
def wedding_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each ``li`` child of the
    ``.shop-list`` element is converted with ``wedding_from_li``.

    :param url: address of the listing page to fetch.
    :return: list holding one ``wedding_from_li`` result per ``li`` child.
    """
    doc = pq(html_from_url(url))
    shops = []
    for li in doc('.shop-list').children('li'):
        shops.append(wedding_from_li(li))
    return shops
def film_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each ``li`` found under
    ``#shop-all-list`` is converted with ``film_from_li``.

    :param url: address of the listing page to fetch.
    :return: list holding one ``film_from_li`` result per ``li`` found.
    """
    doc = pq(html_from_url(url))
    shops = []
    for li in doc('#shop-all-list').find('li'):
        shops.append(film_from_li(li))
    return shops
def education_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each element matched by
    ``#shop-all-list>ul>li`` is converted with ``education_from_li``.

    :param url: address of the listing page to fetch.
    :return: list holding one ``education_from_li`` result per matched
        element.
    """
    doc = pq(html_from_url(url))
    shops = []
    for li in doc('#shop-all-list>ul>li'):
        shops.append(education_from_li(li))
    return shops
def sports_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each ``li`` found under
    ``#shop-all-list`` is converted with ``sports_from_li``.

    :param url: address of the listing page to fetch.
    :return: list holding one ``sports_from_li`` result per ``li`` found.
    """
    doc = pq(html_from_url(url))
    shops = []
    for li in doc('#shop-all-list').find('li'):
        shops.append(sports_from_li(li))
    return shops
def baby_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each ``li`` child of the
    ``.shop-list`` element is converted with ``baby_from_li``.

    :param url: address of the listing page to fetch.
    :return: list holding one ``baby_from_li`` result per ``li`` child.
    """
    doc = pq(html_from_url(url))
    shops = []
    for li in doc('.shop-list').children('li'):
        shops.append(baby_from_li(li))
    return shops
def car_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each element matched by
    ``#shop-all-list>ul>li`` is converted with ``car_from_li``.

    :param url: address of the listing page to fetch, for example
        ``http://www.dianping.com/shenzhen/ch65/g180r12320p2``.
    :return: list holding one ``car_from_li`` result per matched element.
    """
    doc = pq(html_from_url(url))
    shops = []
    for li in doc('#shop-all-list>ul>li'):
        shops.append(car_from_li(li))
    return shops
def home_from_url_decoration(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url``; only the children of
    ``.shop-list`` that carry the ``shop-list-item`` class are converted,
    each with ``home_from_item``.

    :param url: address of the listing page to fetch.
    :return: list holding one ``home_from_item`` result per matched child.
    """
    doc = pq(html_from_url(url))
    shops = []
    for entry in doc('.shop-list').children('.shop-list-item'):
        shops.append(home_from_item(entry))
    return shops
def hotel_from_url(url):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url``. When the
    ``.hotelshop-list .no-hotel-block`` element has text, nothing is parsed.
    Otherwise each ``li`` child of ``.hotelshop-list`` is converted with
    ``hotel_from_li``.

    :param url: address of the listing page to fetch, for example
        ``http://www.dianping.com/shenzhen/hotel/r12036p4``.
    :return: list holding one ``hotel_from_li`` result per ``li`` child, or
        ``None`` when the "no hotel" block has text.
    """
    doc = pq(html_from_url(url))
    if doc('.hotelshop-list .no-hotel-block').text():
        return None
    hotels = []
    for li in doc('.hotelshop-list').children('li'):
        hotels.append(hotel_from_li(li))
    return hotels
def food_from_url(url, max_retries=10):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each ``li`` found under
    ``#shop-all-list`` is converted with ``food_from_li``. If any ``li``
    lacks an ``href`` on its first ``.txt>.tit>a`` link, the page is treated
    as incomplete and fetched again. Items converted before the incomplete
    one are converted again on the next attempt.

    Two changes from the earlier implementation:

    * A page containing ``#not-found-tip`` now yields an empty list. It used
      to fetch the page a second time and return the raw HTML string, which
      did not match the list returned on every other path.
    * Retries are a bounded loop. They used to be unbounded recursion, which
      ended in ``RecursionError`` when a page stayed incomplete.

    :param url: address of the listing page to fetch.
    :param max_retries: number of extra fetches allowed after an incomplete
        page, so at most ``max_retries + 1`` fetches are made.
    :return: list holding one ``food_from_li`` result per ``li``; an empty
        list when the page has ``#not-found-tip`` or every attempt was
        incomplete.
    """
    attempts = max_retries + 1
    for _ in range(attempts):
        doc = pq(html_from_url(url))
        if doc('#not-found-tip'):
            return []

        food = []
        for li in doc('#shop-all-list').find('li'):
            href = pq(li)('.txt>.tit>a:first').attr('href')
            print("before e('.txt>.tit>a:first')('href') : {}".format(href))
            if not href:
                print("after e('.txt>.tit>a:first')('href') : {}".format(href))
                # Incomplete page: drop this attempt and fetch again.
                break
            food.append(food_from_li(li))
        else:
            # Every item had a link, so the page was complete.
            return food

    print('food_from_url: giving up on {} after {} attempts'.format(
        url, attempts))
    return []
# Example #18
# 0
def main():
    """
    Crawl the dianping.com Shenzhen ``ch20`` listings for every combination
    of a category code and a location code from a fixed local list.

    Categories are visited in shuffled order; the local location list is
    walked back to front. The module-level ``dz_location`` is shuffled in
    place and printed but not iterated. For each combination the base
    listing page is fetched to decide how many result pages to walk, and
    every page ``<baseurl>p<N>`` is passed to ``shopping_from_url``. A random
    sleep precedes every request.
    """
    categories = [
        'g120', 'g33943', 'g33944', 'g33906', 'g33905', 'g33904', 'g119',
        'g122', 'g121', 'g130', 'g32739', 'g187', 'g235', 'g123', 'g128',
        'g125', 'g27809', 'g27810', 'g27811', 'g27812', 'g26085', 'g124',
        'g127', 'g126', 'g6826', 'g32705', 'g6829', 'g6827', 'g32700', 'g6830',
        'g34124', 'g129', 'g184', 'g33760', 'g33759', 'g2714', 'g26101',
        'g33858', 'g2776', 'g32698', 'g34114', 'g131'
    ]
    # Shorter alternative category list kept for reference:
    # shoppingtype = ['g120', 'g33943', 'g33944', 'g33906', 'g33905', 'g33904', 'g119', 'g122', 'g121', 'g130', 'g32739',
    #                 'g187', 'g235', 'g123', 'g128', 'g125', 'g26085', 'g124', 'g127', 'g126', 'g34124', 'g129', 'g184',
    #                 'g2714', 'g26101', 'g33858', 'g2776', 'g32698', 'g34114', 'g131']
    locations = [
        'r34', 'r8646', 'r1957', 'r1570', 'r12335', 'r8647', 'r8357', 'r8355',
        'r12334', 'r8648', 'r3141', 'r70631'
    ]

    random.shuffle(categories)
    random.shuffle(dz_location)
    print('shoppingtype : {}'.format(categories))
    print('dz_location : {}'.format(dz_location))

    for category in categories:
        for region in reversed(locations):
            baseurl = 'http://www.dianping.com/shenzhen/ch20/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.PageLink').text():
                # Pager present: take the page count from its last link.
                page_count = int(doc('.PageLink:last').attr('data-ga-page'))
            elif 0 < doc('#shop-all-list li').length <= 15:
                # No pager, but between 1 and 15 list items: a single page.
                page_count = 1
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                shopping_from_url(page_url)
# Example #19
# 0
def shopping_from_url(url, max_retries=10):
    """
    Parse every merchant out of the HTML page at ``url``.

    The page is fetched with ``html_from_url`` and each ``li`` found under
    ``#shop-all-list`` is converted with ``shopping_from_li``. If any ``li``
    lacks an ``href`` on its first ``.txt>.tit>a`` link, the page is treated
    as incomplete and fetched again. Items converted before the incomplete
    one are converted again on the next attempt.

    Retries are a bounded loop. They used to be unbounded recursion, which
    ended in ``RecursionError`` when a page stayed incomplete.

    :param url: address of the listing page to fetch.
    :param max_retries: number of extra fetches allowed after an incomplete
        page, so at most ``max_retries + 1`` fetches are made.
    :return: list holding one ``shopping_from_li`` result per ``li``; an
        empty list when every attempt was incomplete.
    """
    attempts = max_retries + 1
    for _ in range(attempts):
        doc = pq(html_from_url(url))
        shopping = []
        for li in doc('#shop-all-list').find('li'):
            href = pq(li)('.txt>.tit>a:first').attr('href')
            print("before e('.txt>.tit>a:first')('href') : {}".format(href))
            if not href:
                print("after e('.txt>.tit>a:first')('href') : {}".format(href))
                # Incomplete page: drop this attempt and fetch again.
                break
            shopping.append(shopping_from_li(li))
        else:
            # Every item had a link, so the page was complete.
            return shopping

    print('shopping_from_url: giving up on {} after {} attempts'.format(
        url, attempts))
    return []
def beauty_from_url(url, max_retries=10):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each element matched by
    ``#shop-all-list ul > li`` is converted with ``beauty_from_li``. If any
    item lacks an ``href`` on its first ``.txt>.tit>a`` link, the page is
    treated as incomplete and fetched again. Items converted before the
    incomplete one are converted again on the next attempt.

    Retries are a bounded loop. They used to be unbounded recursion, which
    ended in ``RecursionError`` when a page stayed incomplete.

    :param url: address of the listing page to fetch, for example
        ``http://www.dianping.com/shenzhen/ch50/g159r64846p1``.
    :param max_retries: number of extra fetches allowed after an incomplete
        page, so at most ``max_retries + 1`` fetches are made.
    :return: list holding one ``beauty_from_li`` result per item; an empty
        list when every attempt was incomplete.
    """
    attempts = max_retries + 1
    for _ in range(attempts):
        doc = pq(html_from_url(url))
        beauty = []
        for li in doc('#shop-all-list ul > li'):
            href = pq(li)('.txt>.tit>a:first').attr('href')
            print("before e('.txt>.tit>a:first')('href') : {}".format(href))
            if not href:
                print("after e('.txt>.tit>a:first')('href') : {}".format(href))
                # Incomplete page: drop this attempt and fetch again.
                break
            beauty.append(beauty_from_li(li))
        else:
            # Every item had a link, so the page was complete.
            return beauty

    print('beauty_from_url: giving up on {} after {} attempts'.format(
        url, attempts))
    return []
def main():
    """
    Crawl the dianping.com Shenzhen ``ch10`` listings for every combination
    of a category code and a location code from a fixed local list.

    Categories are visited in shuffled order; the location list is walked
    back to front. For each combination the base listing page is fetched to
    decide how many result pages to walk, and every page ``<baseurl>p<N>``
    is passed to ``food_from_url``. A random sleep precedes every request.
    """
    # Longer category list kept for reference (not used by the crawl below):
    # foodtype = ['g103', 'g205', 'g733', 'g1947', 'g32728', 'g1953', 'g111', 'g117', 'g1833', 'g241', 'g132', 'g113',
    #             'g33924', 'g225', 'g226', 'g34041', 'g34040', 'g110', 'g32731', 'g3027', 'g3023', 'g34060', 'g3017',
    #             'g4477', 'g32730', 'g208', 'g34061', 'g34063', 'g32729', 'g34065', 'g34062', 'g34064', 'g34066', 'g116',
    #             'g238', 'g24340', 'g254', 'g232', 'g231', 'g253', 'g219', 'g251', 'g508', 'g114', 'g102', 'g4467',
    #             'g4473', 'g4469', 'g115', 'g109', 'g104', 'g112', 'g210', 'g217', 'g1881', 'g221', 'g4509', 'g222',
    #             'g223', 'g4557', 'g118', 'g134', 'g133', 'g247', 'g246', 'g311', 'g6743', 'g1387', 'g26483', 'g26482',
    #             'g26484', 'g252', 'g34014', 'g101', 'g34055', 'g3243', 'g207', 'g106', 'g250', 'g34032', 'g1338',
    #             'g26481', 'g1959', 'g2714', 'g25474', 'g107', 'g34059', 'g1783']
    categories = ['g103', 'g111', 'g117', 'g132', 'g113', 'g110', 'g116',
                  'g219', 'g251', 'g508', 'g114', 'g102', 'g115', 'g109',
                  'g104', 'g112', 'g118', 'g34014', 'g101', 'g34055', 'g3243',
                  'g207', 'g106', 'g250', 'g34032', 'g1338', 'g26481', 'g1959',
                  'g2714', 'g25474', 'g107', 'g34059', 'g1783']
    locations = ['r29', 'r1949', 'r7475', 'r1560', 'r12322', 'r1556', 'r1951',
                 'r12321', 'r1559', 'r12225', 'r1557', 'r1573', 'r12324',
                 'r12226', 'r12323', 'r3138', 'r12320', 'r12319', 'r1950']
    random.shuffle(categories)

    print('foodtype {}'.format(categories))

    for category in categories:
        for region in reversed(locations):
            baseurl = 'http://www.dianping.com/shenzhen/ch10/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.PageLink').text():
                # Pager present: take the page count from its last link.
                page_count = int(doc('.PageLink:last').attr('data-ga-page'))
            elif 0 < doc('#shop-all-list li').length <= 15:
                # No pager, but between 1 and 15 list items: a single page.
                page_count = 1
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                food_from_url(page_url)
def service_from_url(url, max_retries=10):
    """
    Parse every merchant listed on the page at ``url``.

    The page is fetched with ``html_from_url`` and each element matched by
    ``#shop-all-list>ul>li`` is converted with ``service_from_li``. If any
    item lacks an ``href`` on its first ``.txt>.tit>a`` link, the page is
    treated as incomplete and fetched again. Items converted before the
    incomplete one are converted again on the next attempt.

    Retries are a bounded loop. They used to be unbounded recursion, which
    ended in ``RecursionError`` when a page stayed incomplete.

    :param url: address of the listing page to fetch, for example
        ``http://www.dianping.com/shenzhen/ch80/g26085r34p39``.
    :param max_retries: number of extra fetches allowed after an incomplete
        page, so at most ``max_retries + 1`` fetches are made.
    :return: list holding one ``service_from_li`` result per item; an empty
        list when every attempt was incomplete.
    """
    attempts = max_retries + 1
    for _ in range(attempts):
        doc = pq(html_from_url(url))
        service = []
        for li in doc('#shop-all-list>ul>li'):
            href = pq(li)('.txt>.tit>a:first').attr('href')
            print("before e('.txt>.tit>a:first')('href') : {}".format(href))
            if not href:
                print("after e('.txt>.tit>a:first')('href') : {}".format(href))
                # Incomplete page: drop this attempt and fetch again.
                break
            service.append(service_from_li(li))
        else:
            # Every item had a link, so the page was complete.
            return service

    print('service_from_url: giving up on {} after {} attempts'.format(
        url, attempts))
    return []
# Example #23
# 0
def get_address():
    """
    Fill in the ``address`` field of every document in ``db.Wedding``.

    For each document the page at its ``url`` is fetched after a random
    sleep, the text of the ``.road-addr`` element is extracted, and the
    stripped text is stored under ``address`` on the document with the same
    ``number``. The address is an empty string when the element has no text.

    The write uses ``update_one`` in place of the legacy ``update`` call.
    Without ``multi=True`` the legacy call also touched a single document,
    so the stored result is the same.
    NOTE(review): assumes ``db.Wedding`` is a PyMongo 3+ collection - confirm.
    """
    for item in db.Wedding.find():
        item_url = item['url']
        number = item['number']
        print('item url : {}'.format(item_url))
        time.sleep(random.randint(2, 5))
        html = html_from_url(item_url)
        print('html : {}'.format(html))

        # Guard on truthiness so a missing element gives '' rather than
        # calling .strip() on a falsy value.
        road_text = pq(html)('.road-addr').text()
        address = road_text.strip() if road_text else ''
        print('address : {}'.format(address))

        db.Wedding.update_one(
            {"number": number},
            {"$set": {"address": address}},
            upsert=True,
        )
def sports_from_baseurl(baseurl):
    """
    Walk every result page that belongs to the listing at ``baseurl``.

    The base page is fetched after a random sleep to decide how many result
    pages to walk. Each page ``<baseurl>p<N>`` is then passed to
    ``sports_from_url`` after another random sleep. When the base page has
    neither a pager nor between 1 and 15 list items, the page count stays 0
    and no result page is fetched.

    :param baseurl: listing address without the trailing ``p<N>`` part.
    :return: ``None``; the per-page results are not collected.
    """
    time.sleep(random.randint(2, 5))
    doc = pq(html_from_url(baseurl))

    page_count = 0
    if doc('.PageLink').text():
        # Pager present: take the page count from its last link.
        page_count = int(doc('.PageLink:last').attr('data-ga-page'))
    elif 0 < doc('#shop-all-list li').length <= 15:
        # No pager, but between 1 and 15 list items: a single page.
        page_count = 1
    else:
        # Falls through with a page count of 0, so the loop below is a no-op.
        print('maxpage in else: {}'.format(page_count))
    print('maxpage: {}'.format(page_count))

    for page_no in range(1, page_count + 1):
        page_url = baseurl + 'p' + str(page_no)
        time.sleep(random.randint(5, 10))
        sports_from_url(page_url)
def main():
    """
    Crawl the dianping.com Shenzhen ``ch65`` listings for every combination
    of a category code and a location code from a fixed local list.

    Categories and the ``area`` list are shuffled. For each category an area
    list is chosen (``dz_location`` for four specific codes, ``area``
    otherwise) and printed, but the crawl itself iterates the fixed
    four-entry location list. For each combination the base listing page is
    fetched to decide how many result pages to walk, and every page
    ``<baseurl>p<N>`` is passed to ``car_from_url``. A random sleep precedes
    every request.
    """
    categories = [
        'g175', 'g178', 'g34072', 'g176', 'g34087', 'g34073', 'g180', 'g34088',
        'g34074', 'g34075', 'g34076', 'g177', 'g34077'
    ]
    areas = [
        'r29', 'r31', 'r30', 'r32', 'r12036', 'r12033', 'r34', 'r33', 'r12035'
    ]
    locations = ['r8348', 'r8352', 'r33', 'r34']
    random.shuffle(categories)
    random.shuffle(areas)

    print('cartype {}'.format(categories))
    print('area {}'.format(areas))

    for category in categories:
        if category in ['g175', 'g176', 'g180', 'g177']:
            cararea = dz_location
        else:
            cararea = areas
        # Printed for information only; the loop below uses ``locations``.
        print('cararea {}'.format(cararea))

        for region in locations:
            baseurl = 'http://www.dianping.com/shenzhen/ch65/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.PageLink').text():
                # Pager present: take the page count from its last link.
                page_count = int(doc('.PageLink:last').attr('data-ga-page'))
            elif 0 < doc('#shop-all-list li').length <= 15:
                # No pager, but between 1 and 15 list items: a single page.
                page_count = 1
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                car_from_url(page_url)
def main():
    """
    Crawl the dianping.com Shenzhen ``ch75`` listings for every combination
    of a category code and a location code.

    Categories and the ``area`` list are shuffled. Categories ``g2876`` and
    ``g179`` are crawled over the module-level ``dz_location``; all others
    over ``area``. For each combination the base listing page is fetched to
    decide how many result pages to walk, and every page ``<baseurl>p<N>``
    is passed to ``education_from_url``. A random sleep precedes every
    request.
    """
    categories = [
        'g2872', 'g2873', 'g2877', 'g2876', 'g2874', 'g2878', 'g179', 'g260',
        'g32722', 'g34105', 'g33897', 'g33899', 'g33898', 'g34106', 'g34107',
        'g2882'
    ]
    areas = [
        'r29', 'r31', 'r30', 'r32', 'r12036', 'r12033', 'r34', 'r33', 'r12035'
    ]

    random.shuffle(categories)
    random.shuffle(areas)

    print('educationtype {}'.format(categories))
    print('area {}'.format(areas))

    for category in categories:
        if category == 'g2876' or category == 'g179':
            regions = dz_location
        else:
            regions = areas

        for region in regions:
            baseurl = 'http://www.dianping.com/shenzhen/ch75/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.PageLink').text():
                # Pager present: take the page count from its last link.
                page_count = int(doc('.PageLink:last').attr('data-ga-page'))
            elif 0 < doc('#shop-all-list li').length <= 15:
                # No pager, but between 1 and 15 list items: a single page.
                page_count = 1
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                education_from_url(page_url)
def main():
    """
    Crawl the dianping.com Shenzhen ``ch70`` listings for every combination
    of a category code and a location code.

    Categories and the ``area`` list are shuffled. Category ``g125`` is
    crawled over the module-level ``dz_location``; all others over ``area``.
    For each combination the base listing page is fetched to decide how many
    result pages to walk, and every page ``<baseurl>p<N>`` is passed to
    ``baby_from_url``. A random sleep precedes every request.
    """
    categories = [
        'g193', 'g27761', 'g161', 'g27767', 'g188', 'g27762', 'g27763',
        'g2784', 'g258', 'g27768', 'g34117', 'g257', 'g34116', 'g27814',
        'g33797', 'g125', 'g20009', 'g189', 'g33803', 'g33808', 'g27769'
    ]
    areas = [
        'r29', 'r30', 'r31', 'r32', 'r33', 'r34', 'r12033', 'r12035', 'r12036'
    ]

    random.shuffle(categories)
    random.shuffle(areas)
    print('babytype {}'.format(categories))
    print('area {}'.format(areas))

    for category in categories:
        if category == 'g125':
            regions = dz_location
        else:
            regions = areas

        for region in regions:
            baseurl = 'http://www.dianping.com/shenzhen/ch70/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.PageLink').text():
                # Pager present: the last pager link's title attribute is
                # used as the number of pages.
                page_count = int(doc('.PageLink:last').attr('title'))
            elif 0 < doc('.shop-list li').length <= 15:
                # No pager, but between 1 and 15 list items: a single page.
                page_count = 1
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                baby_from_url(page_url)
# Example #28
# 0
def main():
    """
    Refresh stored addresses, then crawl the dianping.com Shenzhen ``ch55``
    listings for every combination of a category code and an area code.

    Both lists are shuffled. ``get_address`` is called once before the crawl
    starts. For each combination the base listing page is fetched to decide
    how many result pages to walk, and every page ``<baseurl>p<N>`` is
    passed to ``wedding_from_url``. A random sleep precedes every request.
    """
    # Longer category list kept for reference (not used by the crawl below):
    # weddingtype = ['g25410', 'g33888', 'g34057', 'g163', 'g6699', 'g6698', 'g162', 'g983', 'g1016', 'g25411', 'g167',
    #                'g1039', 'g27943', 'g34108', 'g191', 'g2814', 'g2816', 'g2818', 'g166', 'g185', 'g6700', 'g164',
    #                'g25412', 'g186', 'g192', 'g6844']
    categories = [
        'g25410', 'g33888', 'g34057', 'g163', 'g162', 'g167', 'g191', 'g166',
        'g185', 'g6700', 'g164', 'g25412', 'g186', 'g192', 'g6844'
    ]
    areas = [
        'r29', 'r31', 'r30', 'r32', 'r12036', 'r12033', 'r34', 'r33', 'r12035'
    ]
    random.shuffle(categories)
    random.shuffle(areas)
    print('weddingtype {}'.format(categories))
    print('area {}'.format(areas))

    get_address()

    for category in categories:
        for region in areas:
            baseurl = 'http://www.dianping.com/shenzhen/ch55/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.PageLink').text():
                # Pager present: the last pager link's title attribute is
                # used as the number of pages.
                page_count = int(doc('.PageLink:last').attr('title'))
            elif 0 < doc('.shop-list li').length <= 15:
                # No pager, but between 1 and 15 list items: a single page.
                page_count = 1
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                wedding_from_url(page_url)
def main():
    """
    Crawl the dianping.com Shenzhen ``ch85`` listings for every combination
    of a category code and a location code.

    Categories and the ``area`` list are shuffled. Categories ``g181`` and
    ``g235`` are crawled over the module-level ``dz_location``; all others
    over ``area``. For each combination the base listing page is fetched to
    decide how many result pages to walk, and every page ``<baseurl>p<N>``
    is passed to ``medical_from_url``. A random sleep precedes every request.
    """
    categories = [
        'g183', 'g181', 'g182', 'g2914', 'g612', 'g235', 'g25148', 'g34050',
        'g34053', 'g257', 'g34051', 'g34046', 'g34052', 'g34054', 'g34049',
        'g34048', 'g2912'
    ]
    areas = [
        'r29', 'r31', 'r30', 'r32', 'r12036', 'r12033', 'r34', 'r33', 'r12035'
    ]

    random.shuffle(categories)
    random.shuffle(areas)

    print('medicaltype {}'.format(categories))
    print('area {}'.format(areas))

    for category in categories:
        if category == 'g181' or category == 'g235':
            regions = dz_location
        else:
            regions = areas

        for region in regions:
            baseurl = 'http://www.dianping.com/shenzhen/ch85/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.PageLink').text():
                # Pager present: take the page count from its last link.
                page_count = int(doc('.PageLink:last').attr('data-ga-page'))
            elif 0 < doc('#shop-all-list li').length <= 15:
                # No pager, but between 1 and 15 list items: a single page.
                page_count = 1
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                medical_from_url(page_url)
def main():
    """
    Crawl the dianping.com Shenzhen ``ch90`` listings for every combination
    of a category code and an entry of the module-level ``dz_location``.

    Categories are visited in the order listed (no shuffling). For each
    combination the base listing page is fetched to decide how many result
    pages to walk. Pages of categories ``g25475`` and ``g32704`` are passed
    to ``home_from_url_decoration``; all other pages go to ``home_from_url``.
    A random sleep precedes every request.
    """
    categories = ['g32704', 'g25475', 'g33867', 'g33876', 'g34035', 'g6827',
                  'g6826', 'g32702', 'g32705']
    print('hometype {}'.format(categories))

    for category in categories:
        for region in dz_location:
            baseurl = 'http://www.dianping.com/shenzhen/ch90/%s%s' % (
                category, region)
            time.sleep(random.randint(2, 5))
            doc = pq(html_from_url(baseurl))

            page_count = 0
            if doc('.pageLink').text():
                # Pager present: the last pager link's title attribute is
                # used as the number of pages.
                page_count = int(doc('.pageLink:last').attr('title'))
                print('maxpage in first if: {}'.format(page_count))
            elif doc('.pages-num>.pages').text() == '':
                # No pager links and an empty pages element: a single page.
                page_count = 1
                print('maxpage in elif: {}'.format(page_count))
            else:
                print('maxpage in else: {}'.format(page_count))
                continue
            print('maxpage: {}'.format(page_count))

            for page_no in range(1, page_count + 1):
                page_url = baseurl + 'p' + str(page_no)
                time.sleep(random.randint(5, 10))
                if category in ['g25475', 'g32704']:
                    print('tp in if : {}'.format(category))
                    home_from_url_decoration(page_url)
                else:
                    print('tp in else : {}'.format(category))
                    home_from_url(page_url)