Пример #1
0
def get_brand(main_brand_list):
    """Collect the sub-brands (and their car models) of every main brand.

    For each main brand the left-tree JSON endpoint is queried and every
    sub-brand found in the response is flattened into one dict that also
    carries the main-brand fields.

    :param main_brand_list: iterable of main-brand dicts; each must provide
        the keys ``id``, ``name``, ``show``, ``num`` and ``url``
    :return: list of sub-brand dicts with the keys ``mainbrand``, ``mainid``,
        ``mainshow``, ``main_url0``, ``mainnum``, ``name``, ``url``, ``num``,
        ``show`` and ``model`` (a list of ``{name, url, show, num}`` dicts)
    """
    brand_list = []
    for main_brand in main_brand_list:
        main_brand_id = main_brand['id']
        main_brand_name = main_brand['name']
        main_brand_show = main_brand['show']
        main_brand_num = main_brand['num']
        main_brand_url = main_brand['url']
        # Endpoint that actually serves the sub-brand tree of one main brand.
        url = ('http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?'
               'tagtype=jingxiaoshang&pagetype=masterbrand&objid=' +
               str(main_brand_id) + '&citycode=beijing%2F&cityid=201')
        data = general_helper.get_json_response(url)
        for group in data['brand'].values():
            for entry in group:
                if 'child' not in entry:
                    continue
                # A present-but-null child list is treated as "no sub-brands".
                sub_brands = entry['child'] or []
                logger.debug("%s,%s,%s,%s", main_brand_name, main_brand_id,
                             main_brand_show, len(sub_brands))
                for sub_brand in sub_brands:
                    brand = {
                        'mainbrand': main_brand_name,
                        'mainid': main_brand_id,
                        'mainshow': main_brand_show,
                        'main_url0': main_brand_url,
                        'mainnum': main_brand_num,
                        'name': sub_brand['name'].decode('utf-8'),
                        'url': sub_brand['url'],
                        'num': sub_brand['num'],
                        # Third path segment of the url is the short name.
                        'show': sub_brand['url'].split('/')[2],
                        'model': [],
                    }
                    brand_list.append(brand)
                    # A sub-brand without a model list is kept with an empty
                    # ``model`` instead of aborting the whole crawl.
                    for model_node in sub_brand.get('child') or []:
                        brand['model'].append({
                            'name': model_node['name'].decode('utf-8'),
                            'url': model_node['url'],
                            'show': model_node['url'].split('/')[2],
                            # Dealer count shown next to the model.
                            'num': model_node['num'],
                        })
    return brand_list
Пример #2
0
def get_brand_serial(main_brand_list):
    """Look up the brands and serials that belong to each main brand.

    Every returned brand dict carries ``main_brand_id``, ``main_brand_name``,
    ``brand_id``, ``brand_name`` and ``serial``, a list of
    ``{serial_id, serial_name}`` dicts.

    :param main_brand_list: main-brand dicts providing ``main_brand_id`` and
        ``main_brand_name``
    :return: list of brand dicts; a brand node without a ``child`` key is
        left out of the result
    """
    url_template = 'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=chexing&pagetype=masterbrand&objid=@main_brand_id'

    def leading_id(node):
        # The numeric id is the first run of digits in the node's url.
        return int(re.search(r'\d+', str(node['url'])).group())

    collected = []
    for main_brand in main_brand_list:
        tree_url = url_template.replace('@main_brand_id',
                                        str(main_brand['main_brand_id']))
        brand_tree = general_helper.get_json_response(tree_url)['brand']
        for group_key in brand_tree:
            for node in brand_tree[group_key]:
                if 'child' not in node:
                    continue
                logger.debug(main_brand['main_brand_name'])
                for brand_node in node['child'] or []:
                    brand = {
                        'main_brand_id': main_brand['main_brand_id'],
                        'main_brand_name': main_brand['main_brand_name'],
                        'brand_id': leading_id(brand_node),
                        'brand_name': brand_node['name'].decode('utf-8'),
                        'serial': [],
                    }
                    if 'child' not in brand_node:
                        continue
                    for serial_node in brand_node['child'] or []:
                        brand['serial'].append({
                            'serial_id': leading_id(serial_node),
                            'serial_name':
                            serial_node['name'].decode('utf-8'),
                        })
                    collected.append(brand)
    return collected
Пример #3
0
def get_all_dealer(brand_list):
    """Walk every brand and resolve the provinces in which it has dealers.

    :param brand_list: sub-brand dicts; the keys ``name``, ``mainbrand``,
        ``mainid``, ``mainshow``, ``show``, ``num`` and ``url`` are read
    :return: None
    :raises Exception: a province record whose ``num`` cannot be converted
        to ``int`` is logged and the error re-raised

    NOTE(review): the original docstring says the data is stored, but no
    persistence happens in this block; several values read below are not
    used here and are presumably consumed by code that is not shown.
    """
    # Python 2 only: make utf-8 the implicit str<->unicode codec.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    for brand in brand_list:  # one iteration per sub-brand
        logger.debug('crawling: %s' % brand['name'])
        mbrandname = brand['mainbrand']
        mbrandid = brand['mainid']
        mbrandshow = brand['mainshow']
        bname = brand['name']
        bshow = brand['show']
        if brand['num'] == 0:  # a count of 0 means the brand has no dealer
            continue
        logger.info("get dealers of %s, %s, %s" %
                    (mbrandname, bname, brand['num'] or 0))
        burl = general_helper.build_url(main_url, brand['url'])
        bhtml = general_helper.get_response(burl)
        bsoup = BeautifulSoup(bhtml, 'lxml')
        try:
            # Preferred source: the region pop-up above the dealer list.
            plist = get_location(bsoup, 'ul', 'layer-txt-list')
        except Exception:
            # Some brands have no region pop-up; fall back to the province
            # lookup.  The error is deliberately not re-raised, otherwise
            # the fallback result could never be used.
            plist = get_province(main_url, bshow)
            logger.critical(
                "%s, %s, %s" %
                (bname, len(plist), ' this brand don\'t have dealer'))
        if not plist:
            # Even the fallback may find no province or municipality.
            continue
        for p in plist:
            pname = p['name']
            purl = p['url']
            pshow = p['show']
            try:
                # Keep the parsed count; it used to be reset to 0 in a
                # ``finally`` clause, which discarded every value.
                pnum = int(p['num'])
            except Exception as e:
                logger.critical(e.message)
                raise
Пример #4
0
def crawl():
    """Run the whole pipeline: main brands -> brands/serials -> cars."""
    logger.debug("getting main brand list")
    brands = get_main_brand()

    logger.debug('getting brand serial list')
    serials = get_brand_serial(brands)

    logger.debug('getting and inserting car list')
    get_and_insert_car(serials)
Пример #5
0
def _parse_car_row(car_row):
    """Extract the trim-level fields from one non-header row of the table.

    :param car_row: ``<tr>`` element of the ``compare_sale`` table
    :return: dict with ``car_id``, ``car_name``, ``car_gear``, ``car_msrp``
        and ``car_sale_year``
    """
    car_id = int(re.search(r'\d+', car_row.attrib['id']).group().strip())
    car_name = str(car_row.xpath('td/a/text()')[0]).strip().decode('utf-8')
    car_gear = str(car_row.xpath('string(td[3])')).strip().decode('utf-8')

    msrp_text = str(
        car_row.xpath('string(td[@class="txt-right"]/span)')).strip()
    msrp_match = re.search(r'(\d+(\.\d+)?)', msrp_text)
    # The matched text is kept as-is; 0.0 marks a row without a price.
    car_msrp = msrp_match.group() if msrp_match else 0.0

    # The sale year is the leading number of the trim name.  Names without
    # one get '' instead of failing on ``None.group()``.
    year_match = re.search(r'^\d+', car_name)
    car_sale_year = year_match.group() if year_match else ''

    return {
        'car_id': car_id,
        'car_name': car_name,
        'car_gear': car_gear,
        'car_msrp': car_msrp,
        'car_sale_year': car_sale_year,
    }


def get_and_insert_car(brand_serial_list):
    """Fetch the trims of every serial and insert them brand by brand.

    Each serial page adds ``serial_spell`` and ``serial_show_name`` to the
    serial data and yields, per trim, ``car_id``, ``car_name``, ``car_gear``,
    ``car_engine``, ``car_msrp`` and ``car_sale_year``.

    :param brand_serial_list: brand dicts providing ``main_brand_id``,
        ``main_brand_name``, ``brand_id``, ``brand_name`` and ``serial``
        (a list of dicts with ``serial_id`` and ``serial_name``)
    :return: None; the rows of each brand are handed to
        ``insert_car_to_db`` once per brand
    """
    serial_url_base = 'http://car.bitauto.com/tree_chexing/sb_@serial_id'
    header_xpath = '//div[@class="section-header header1"]/div/h2/a'
    for brand in brand_serial_list:
        logger.debug('brand: %s' % (brand['brand_name']))
        brand_serial_car_list = []
        for serial in brand['serial']:
            logger.debug('serial: %s' % (serial['serial_name']))
            serial_id = serial['serial_id']
            serial_url = serial_url_base.replace('@serial_id', str(serial_id))
            logger.debug('url: %s' % serial_url)
            content = general_helper.get_response(serial_url)
            html = etree.HTML(content)
            # The header link looks like "/<spell>/"; strip both slashes.
            serial_spell = str(html.xpath(header_xpath + '/@href')[0])[1:-1]
            serial_show_name = str(
                html.xpath(header_xpath + '/text()')[0]).decode('utf-8')

            # Reset per serial so a table without a group header neither
            # fails on an unbound name nor inherits the previous serial's
            # engine label.
            car_engine = u''
            for car_row in html.xpath('//table[@id="compare_sale"]/tbody/tr'):
                if car_row.attrib.get('class') == 'table-tit':
                    # Group header row: applies to the trim rows after it.
                    car_engine = str(
                        car_row.xpath(
                            'normalize-space(th[@class="first-item"])')
                    ).decode('utf-8')
                    continue

                car_fields = _parse_car_row(car_row)
                brand_serial_car = {
                    'main_brand_id': brand['main_brand_id'],
                    'main_brand_name': brand['main_brand_name'],
                    'brand_id': brand['brand_id'],
                    'brand_name': brand['brand_name'],
                    'serial_id': serial['serial_id'],
                    'serial_name': serial['serial_name'],
                    'serial_spell': serial_spell,
                    'serial_show_name': serial_show_name,
                    'car_engine': car_engine,
                }
                brand_serial_car.update(car_fields)
                brand_serial_car_list.append(brand_serial_car)
        insert_car_to_db(brand_serial_car_list)
Пример #6
0
def get_dealer(lurl, location):
    """Scrape every dealer on one brand/region page and insert it.

    :param lurl: URL of the dealer list page of one brand in one region
    :param location: dict describing that brand/region; the keys ``mainid``,
        ``mainbrand``, ``mainshow``, ``bname``, ``bshow``, ``pname`` and
        ``pshow`` are read
    :return: None; one row per dealer is inserted into ``dealer_raw``
    :raises Exception: a missing sales area or a failed insert is logged
        with its context and re-raised
    """
    # Loop-invariant statement, so it is built once.
    sql = u"insert into dealer_raw(" \
          u"`main_brand_id`,`main_brand_name`,`main_brand_show`,`brand_name`,`brand_show`,`province_name`,`province_show`," \
          u"`city_name`,`location_name`,`dealer_type`,`dealer_url`,`dealer_name`,`dealer_id`,`dealer_brand`,`dealer_pro_title`," \
          u"`dealer_pro_url`,`dealer_pro_day`,`dealer_add`,`dealer_tel`,`sale_area`,`url`,`create_time`" \
          u") values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"

    html = general_helper.get_response(lurl)
    soup = BeautifulSoup(html, 'lxml')
    dealerbox = soup.find('div', 'main-inner-section sm dealer-box')
    for dealer in dealerbox.find_all('div', 'row dealer-list'):
        inf = dealer.find('div', 'col-xs-6 left')
        name = inf.find('h6', 'title-4s').find('a')  # title link
        name_html = str(name)
        dtype = name.find('em').string  # dealer type
        durl = re.findall(r'(?<=href=\").*?(?=\")', name_html)[0]  # dealer url
        dname = re.findall(r'(?<=span>).*?(?=<)',
                           name_html)[0].decode('utf-8')  # dealer name
        dealer_id = int(re.findall(r'(?<=com/)\d+(?=/)', str(durl))[0])

        # Brands the dealer mainly sells.
        dpinpai = re.findall(r'(?<=span\>).*?(?=\<)',
                             str(inf.find('p', 'brand')))[0].decode('utf-8')

        promote = inf.find('p', 'promote')
        if promote is not None:  # the dealer has a running price promotion
            promote_link = promote.find('a')
            # Collapse whitespace runs; ``str.replace`` would only replace
            # the literal text "\s+" and never match.
            dpromotetitle = re.sub(r'\s+', u' ',
                                   promote_link.string.decode('utf-8'))
            dpromoteurl = re.findall(
                r'(?<=href=\").*?(?=\")',
                str(promote_link))[0].decode('utf-8')
            dpromoteday = promote.find('span',
                                       'time').string.decode('utf-8')
        else:
            dpromotetitle = None  # promotion headline
            dpromoteurl = None  # url of the promotion news
            dpromoteday = None  # remaining days

        # Dealer address, taken from the span's title attribute.
        add = inf.find('p', 'add').find_all(
            'span', attrs={'title': True})[0].attrs['title'].replace(
                u'\xa0', u'')
        dtel = get_dealer_telephone(dealer_id)  # dealer phone

        tel_box = inf.find('p', 'tel')
        try:
            dsalearea = tel_box.find('span', 'sales-area').string  # sale area
        except Exception:
            logger.error("no sales area: %s, %s, %s, %s, %s, %s", lurl, dname,
                         location['pname'], location['mainbrand'],
                         location['bname'], tel_box)
            raise

        # The text holds the city and the district, separated by a space.
        region_parts = dealer.find(
            'div', 'col-xs-7 middle').p.string.split(' ')
        dcity = region_parts[0]
        dlocation = region_parts[1].replace('&nbsp;', '')

        now_time = general_helper.get_now()
        logger.debug("%s,%s,%s,%s,%s" %
                     (location['pname'], dcity, dlocation, dealer_id, dname))
        params = (location['mainid'], location['mainbrand'],
                  location['mainshow'], location['bname'], location['bshow'],
                  location['pname'], location['pshow'], dcity, dlocation,
                  dtype, durl.decode('utf-8'), dname, dealer_id, dpinpai,
                  dpromotetitle or '', dpromoteurl or '', dpromoteday or '',
                  add, dtel or '', dsalearea, lurl.decode('utf-8'), now_time)
        try:
            mysql.insert(sql, params)
        except Exception:
            # Log the statement and every bound value, then let the caller
            # decide what to do with the failure.
            logger.exception("insert into dealer_raw failed: sql=%s params=%r",
                             sql, params)
            raise