예제 #1
0
def crawl():
    """Run one full dealer crawl and record the run in ``crawl_log``.

    Order of operations:

    1. refresh the provinces via ``get_all_province()``;
    2. load the main brands, then the brands derived from them;
    3. insert a ``crawl_log`` row with ``complete_success = 0``;
    4. split the brand list into four contiguous slices and crawl each slice
       in its own process with ``get_all_dealer``;
    5. update the newest ``crawl_log`` row of this project with the end time
       and the final status.

    :return: None
    """
    # Single source of truth for the project name: the insert and the final
    # update must target the same crawl_log rows.
    project_name = u'易车商家抓取'
    worker_count = 4

    # Fetch all provinces.
    logger.info("start: get all provinces.")
    get_all_province()
    logger.info("finish: get all provinces.")

    # Fetch main brands, then brands.
    logger.info("start: get main brands.")
    main_brand_list = get_main_brand()
    logger.info("finish: get main brands.")

    logger.info("start: get brands.")
    brand_list = get_brand(main_brand_list)
    logger.info("finish: get brands.")

    success = 0
    start_time = general_helper.get_now()
    sql = u"insert into crawl_log (project_name,complete_success,start_time)  values (%s, %s, %s)"
    params = (project_name, success, start_time)
    mysql.insert(sql, params)

    # Floor division keeps the slice bounds integral on Python 2 and 3 alike.
    # The last slice absorbs the remainder, so no brand is dropped.
    total = len(brand_list)
    step = total // worker_count
    bounds = [step * i for i in range(worker_count)] + [total]
    chunks = [
        brand_list[bounds[i]:bounds[i + 1]] for i in range(worker_count)
    ]
    logger.info("brand slice bounds: %s", bounds[1:-1])
    logger.info("brand slice sizes: %s", [len(chunk) for chunk in chunks])

    workers = [
        multiprocessing.Process(target=get_all_dealer, args=(chunk, ))
        for chunk in chunks
    ]
    for worker in workers:
        worker.start()  # launch the child process
    for worker in workers:
        worker.join()  # wait for every child before finishing the log row

    # A child that died from an uncaught exception exits with a non-zero
    # code; only report success when every worker finished cleanly.
    failed = [worker.name for worker in workers if worker.exitcode != 0]
    if failed:
        logger.warning("crawl workers exited abnormally: %s", failed)
    success = 0 if failed else 1

    end_time = general_helper.get_now()
    sql = u"update crawl_log set complete_success = %s, end_time = %s where id = (" \
          u"select id from ( " \
          u"select max(id) as id from crawl_log as a where project_name= %s ) as s)"
    params = (success, end_time, project_name)
    mysql.update(sql, params)
예제 #2
0
def insert_car_to_db(brand_serial_car_list):
    """Bulk-insert car model records into the ``car_data.car`` table.

    Every record is turned into one parameter tuple and the whole batch is
    handed to ``mysql.insert_batch`` in a single call. A falsy ``car_msrp``
    is stored as ``0.0``; all rows share the same ``create_time``.

    :param brand_serial_car_list: iterable of dicts, one per car model, each
        providing the keys read below (``main_brand_id`` ... ``car_sale_year``).
    :return: None
    """
    insert_sql = (
        'INSERT INTO car_data.car ( main_brand_id, main_brand_name, brand_id, brand_name, '
        'serial_id, serial_name, serial_spell, serial_show_name, car_id, car_name, car_gear, '
        'car_engine, car_msrp, car_sale_year, create_time) '
        'VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'
    )
    created_at = general_helper.get_now()

    # Columns copied verbatim from the record, in INSERT column order, up to
    # (but not including) car_msrp, which needs a default.
    plain_keys = ('main_brand_id', 'main_brand_name', 'brand_id',
                  'brand_name', 'serial_id', 'serial_name', 'serial_spell',
                  'serial_show_name', 'car_id', 'car_name', 'car_gear',
                  'car_engine')

    rows = [
        tuple(record[key] for key in plain_keys) +
        (record['car_msrp'] or 0.0, record['car_sale_year'], created_at)
        for record in brand_serial_car_list
    ]

    mysql.insert_batch(insert_sql, rows)
예제 #3
0
def get_all_province():
    """Fetch the region list, store each region, and return the list.

    The province/municipality list shown on the Beijing site is complete, so
    the name of every province and municipality is read from there. Each
    region carries a different set of brands. Originally the regions covered
    by a brand were read from the region pop-up above that brand's dealer
    list, but some brands do not support it; for those, all regions are
    fetched first and then combined with each brand to build the URLs. Every
    region is also written to the database along the way.

    The two approaches are used together and complement each other.

    :return: list of dicts with the keys ``url``, ``name``, ``show``, ``num``
    """
    insert_sql = u"insert into province (`name`,`show`,`url`,`create_time`)\
           values ( %s,%s,%s,%s)"

    page = general_helper.get_response(
        'http://dealer.bitauto.com/beijing/audi/')
    entries = BeautifulSoup(page, 'lxml').find(
        'ul', 'layer-txt-list').find_all('li')

    provinces = []
    for entry in entries:
        markup = str(entry)
        url = re.findall(r'(?<=href=\").*?(?=\">)', markup)[0].decode('utf-8')
        name = re.findall(r'(?<=0\">).*?(?=<)', markup)[0].decode('utf-8')
        # Second "/"-separated piece of the link, used as the short name.
        show = url.split('/')[1].decode('utf-8')
        # Text found between the parentheses of the entry.
        num = re.findall(r'(?<=\().*?(?=\))', markup)[0]

        mysql.insert(insert_sql,
                     (name, show, url, general_helper.get_now()))
        provinces.append({'url': url, 'name': name, 'show': show, 'num': num})
    return provinces
예제 #4
0
# -*- coding: utf-8 -*-
# ----------------------------------------------------------------------------------------------------------------------
# file: province_test
# author: eva
# date: 2018/1/12
# version: 
# description:
# ----------------------------------------------------------------------------------------------------------------------

from utils import general_helper
from utils.commons import mysql


if __name__ == '__main__':
    # Manual smoke test for the `province` table: insert one row, then read
    # back the distinct (name, show) pairs and print how many there are.

    # 1. insert
    now_time = general_helper.get_now()
    sql = u"insert into province (`name`,`show`,`url`,`create_time`)\
               values ( %s,%s,%s, %s)"
    params = ('北京', 'beijing', 'http://beijing.bitauto.com', now_time)
    mysql.insert(sql, params)

    # 2. select -- the query has no placeholders, so no parameters are bound
    sql = u"select distinct `name`,`show` from province"
    records = mysql.select(sql)
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(len(records))


예제 #5
0
def get_dealer(lurl, location):
    """Scrape every dealer on one brand/region page and store each of them.

    Fetches ``lurl``, walks each ``row dealer-list`` entry inside the dealer
    box, extracts the dealer's fields and inserts one ``dealer_raw`` row per
    dealer. Parsing and insert failures are logged and re-raised, so the
    first bad dealer aborts the page.

    NOTE(review): the ``.decode('utf-8')`` calls assume Python 2 byte
    strings -- confirm before running this under Python 3.

    :param lurl: URL of the dealer list page for one brand in one region.
    :param location: dict describing the brand/region being crawled; the keys
        read here are ``mainid``, ``mainbrand``, ``mainshow``, ``bname``,
        ``bshow``, ``pname`` and ``pshow``.
    :return: None
    """
    html = general_helper.get_response(lurl)
    soup = BeautifulSoup(html, 'lxml')
    dealerbox = soup.find('div', 'main-inner-section sm dealer-box')
    dealerlist = dealerbox.find_all('div', 'row dealer-list')

    # Loop-invariant statement, built once.
    sql = u"insert into dealer_raw(" \
          u"`main_brand_id`,`main_brand_name`,`main_brand_show`,`brand_name`,`brand_show`,`province_name`,`province_show`," \
          u"`city_name`,`location_name`,`dealer_type`,`dealer_url`,`dealer_name`,`dealer_id`,`dealer_brand`,`dealer_pro_title`," \
          u"`dealer_pro_url`,`dealer_pro_day`,`dealer_add`,`dealer_tel`,`sale_area`,`url`,`create_time`" \
          u") values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"

    for dealer in dealerlist:
        inf = dealer.find('div', 'col-xs-6 left')
        name = inf.find('h6', 'title-4s').find('a')  # title link
        dtype = name.find('em').string  # dealer type
        durl = re.findall(r'(?<=href=\").*?(?=\")', str(name))[0]  # dealer url
        dname = re.findall(r'(?<=span>).*?(?=<)',
                           str(name))[0].decode('utf-8')  # dealer name
        dealer_id = int(re.findall(r'(?<=com/)\d+(?=/)', str(durl))[0])

        dpinpai = re.findall(r'(?<=span\>).*?(?=\<)',
                             str(inf.find('p', 'brand')))[0].decode(
                                 'utf-8')  # dealer's main brand

        # Optional block: only present when the dealer has a running
        # price-cut promotion.
        promote = inf.find('p', 'promote')
        if promote is not None:
            promote_link = promote.find('a')
            # Collapse whitespace runs to a single space. str.replace() only
            # matches the literal text "\s+", so a regex substitution is
            # required here.
            dpromotetitle = re.sub(r'\s+', u' ',
                                   promote_link.string.decode('utf-8'))
            dpromoteurl = re.findall(r'(?<=href=\").*?(?=\")',
                                     str(promote_link))[0].decode('utf-8')
            dpromoteday = promote.find('span',
                                       'time').string.decode('utf-8')
        else:
            dpromotetitle = None  # title of the running promotion
            dpromoteurl = None  # url of the promotion news
            dpromoteday = None  # days remaining

        # Dealer address, with non-breaking spaces removed.
        add = inf.find('p', 'add').find_all(
            'span', attrs={'title': True})[0].attrs['title'].replace(
                u'\xa0', u'')
        dtel = get_dealer_telephone(dealer_id)  # dealer phone

        try:
            dsalearea = inf.find('p', 'tel').find(
                'span', 'sales-area').string  # sales area
        except Exception:
            logger.exception(
                "cannot read sales area: url=%r dealer=%r province=%r "
                "main_brand=%r brand=%r tel_node=%r", lurl, dname,
                location['pname'], location['mainbrand'], location['bname'],
                inf.find('p', 'tel'))
            raise

        # The middle column text is "<city> <district>"; parse it once.
        middle_parts = dealer.find('div',
                                   'col-xs-7 middle').p.string.split(' ')
        dcity = middle_parts[0]  # city
        dlocation = middle_parts[1].replace('&nbsp;', '')  # district

        now_time = general_helper.get_now()
        logger.debug("%s,%s,%s,%s,%s", location['pname'], dcity, dlocation,
                     dealer_id, dname)
        params = (location['mainid'], location['mainbrand'],
                  location['mainshow'], location['bname'], location['bshow'],
                  location['pname'], location['pshow'], dcity, dlocation,
                  dtype, durl.decode('utf-8'), dname, dealer_id, dpinpai,
                  dpromotetitle or '', dpromoteurl or '', dpromoteday or '',
                  add, dtel or '', dsalearea, lurl.decode('utf-8'), now_time)
        try:
            mysql.insert(sql, params)
        except Exception:
            # params holds every extracted field, so one record is enough to
            # reproduce the failing insert.
            logger.exception("insert into dealer_raw failed: sql=%s params=%r",
                             sql, params)
            raise
예제 #6
0
# -*- coding: utf-8 -*-
# ----------------------------------------------------------------------------------------------------------------------
# file: mysqldb_helper_test
# author: eva
# date: 2018/1/12
# version:
# description:
# ----------------------------------------------------------------------------------------------------------------------

from utils import general_helper
from utils.commons import mysql

if __name__ == '__main__':
    # Manual smoke test for the mysql helper: insert a crawl_log row marked
    # as not finished, then mark the newest row of the same project as done.

    # One definition of the project name, bound as a parameter in both
    # statements instead of being repeated inside the SQL text.
    project_name = u'易车商家抓取'

    # 1. insert
    success = 0
    start_time = general_helper.get_now()
    sql = u"insert into crawl_log (project_name,complete_success,start_time)  VALUES (%s, %s, %s)"
    params = (project_name, success, start_time)
    mysql.insert(sql, params)

    # 2. update the most recent row of this project
    success = 1
    end_time = general_helper.get_now()
    sql = u"update crawl_log set complete_success = %s, end_time = %s where id = (" \
          u"select id from ( " \
          u"select max(id) as id from crawl_log as a where project_name = %s) as s)"
    params = (success, end_time, project_name)
    mysql.update(sql, params)