Example #1
    def __init__(self):
        #        handlers = [
        #            ############ user pages ############
        #            (r'/people/([^/]+)', empty),
        #            ############ wishes ################
        #            (r'/wish/pool', empty),
        #            (r'/wish/([^/]+)', empty),
        #            (r'/wish/([^/]+)/add', empty),
        #            (r'/wish/([^/]+)/edit', empty),
        #            (r'/wish/([^/]+)/delete', empty),
        #            (r'/wish/([^/]+)/reply', empty),
        #            ############ settings ##############
        #            (r'/settings/profile', empty),
        #            (r'/settings/email', empty),
        #            (r'/settings/acount', empty),
        #            ############ notifications #########
        #            (r'/notifications', empty),
        #            (r'/notifications/all', empty)
        #        ]
        routed_handlers = route.get_routes()
        # static files are served from app_config['static_path']
        routed_handlers.append(
            tornado.web.url(r"/static/(.*)", tornado.web.StaticFileHandler,
                            {"path": app_config['static_path']}))
        # catch-all 404 handler must come last
        routed_handlers.append(
            tornado.web.url('/.*?',
                            base.PageNotFoundHandler,
                            name='page_not_found'))
        settings = app_config
        tornado.web.Application.__init__(self, routed_handlers, **settings)
        self.session = ConnectDB()
        self.r = ConnectRDB()
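
The route.get_routes() call above implies a decorator-style registry that collects handler classes into tornado URLSpec entries. The route module itself is not shown on this page, so the following is only a minimal sketch of the interface that call implies; every name other than get_routes is an assumption:

# Hypothetical sketch of the `route` registry used above -- the real
# module is not included here, so this shape is an assumption.
import tornado.web

class route(object):
    _routes = []  # accumulated tornado.web.URLSpec entries

    def __init__(self, pattern, name=None):
        self.pattern = pattern
        self.name = name

    def __call__(self, handler_cls):
        # Register the decorated handler class and return it unchanged.
        self._routes.append(
            tornado.web.url(self.pattern, handler_cls, name=self.name))
        return handler_cls

    @classmethod
    def get_routes(cls):
        return list(cls._routes)

Decorating a handler with @route(r'/wish/pool') would then register it, and route.get_routes() hands the accumulated list to the Application constructor above.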
Example #2
    def __init__(self, database, table):

        db = ConnectDB(database)
#        db.create_new_table(table)

        missing = open("missing", "a+")

        for word in open('words.list'):
            word = word.rstrip()
            print 'Adding word... ', word
            word_xml = GetWord()
            word_dic = word_xml.get_word_xml(word)
            if word_dic is None:
                print '%s Not Found' % word
                missing.write(word + '\n')
                continue
            else:
                #print word_dic
                db.insert_a_dict(word_dic)
        missing.close()
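
GetWord().get_word_xml(word) above evidently fetches a dictionary entry and returns a dict of fields, or None when the word is missing. The class is not shown on this page; a minimal sketch of the implied interface follows, in which the endpoint URL and XML handling are placeholders, not the project's real code:

# Hypothetical sketch of the GetWord helper used above -- the endpoint
# and parsing below are placeholders inferred from the calling code.
import urllib2
import xml.etree.ElementTree as ET

class GetWord(object):
    WORD_API = "http://example.com/dict/%s.xml"  # placeholder endpoint

    def get_word_xml(self, word):
        # Fetch the word's XML entry and flatten it into a dict;
        # return None when there is no usable entry (the caller logs it).
        try:
            xml_text = urllib2.urlopen(self.WORD_API % word).read()
            root = ET.fromstring(xml_text)
        except Exception:
            return None
        return {child.tag: (child.text or '') for child in root}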
Example #3
def SaveData(table_name="",
             brand="",
             series="",
             conf="",
             status="",
             URL_="",
             index=""):
    conf = json.loads(conf)
    for (k, v) in conf.items():
        spaceid = k
        models = v[u"车型名称"]  # "model name" column of the scraped config table
        if models == '-':
            continue
        mth = re.compile(r'(.*)(20\d\d)(.*)')
        y = re.search(mth, models)
        if y:
            year = int(y.group(2))
        else:
            year = 0
        guide_price = v[u"厂商指导价(元)"]  # manufacturer's guide price (yuan)
        #f=re.compile(r'(\d+.\d+)')
        #p=re.search(f,guide_price)
        #guide_price=p.group()
        emission_standard = v[u"环保标准"]  # emission standard
        structure = v[u"车身结构"]  # body structure
        level = v[u"级别"]  # vehicle class
        manufacturer = v[u"厂商"]  # manufacturer
        json_text = json.dumps(v, encoding='utf-8', ensure_ascii=False)
        db = ConnectDB()
        n = db.select(table_name="spider_json", field="spaceid", value=spaceid)
        if n != 0:
            logger.info("spaceid: %s exists " % spaceid)
            continue
        db.insert(table_name=table_name,
                  spaceid=spaceid,
                  brand=brand,
                  series=series,
                  models=models,
                  guide_price=guide_price,
                  level=level,
                  emission_standard=emission_standard,
                  structure=structure,
                  status=status,
                  manufacturer=manufacturer,
                  year=year,
                  index=index,
                  json_text=json_text,
                  URL_=URL_)
        db.dbclose()
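
Every example on this page drives the same small ConnectDB wrapper (select, insert, insertTyre, insertTyre2, dbclose, insert_a_dict). The db.py module is not reproduced here, so the sketch below only guesses at the two methods used in this example; the MySQLdb backend, connection parameters, and SQL are all assumptions:

# Rough sketch of the ConnectDB interface implied by the calls above.
# The real db.py is not shown; the MySQLdb backend and every detail
# below are assumptions for illustration only.
import MySQLdb

class ConnectDB(object):
    def __init__(self, database="spider"):
        self.conn = MySQLdb.connect(host="localhost", user="root",
                                    passwd="", db=database, charset="utf8")
        self.cursor = self.conn.cursor()

    def select(self, table_name="", field="", value=""):
        # Count matching rows; 0 means the record has not been seen yet.
        sql = "SELECT COUNT(*) FROM %s WHERE %s = %%s" % (table_name, field)
        self.cursor.execute(sql, (value,))
        return self.cursor.fetchone()[0]

    def insert(self, table_name="", **fields):
        # Keyword-style insert matching the call in this example.
        cols = ", ".join(fields.keys())
        marks = ", ".join(["%s"] * len(fields))
        sql = "INSERT INTO %s (%s) VALUES (%s)" % (table_name, cols, marks)
        self.cursor.execute(sql, tuple(fields.values()))
        self.conn.commit()

    def dbclose(self):
        self.cursor.close()
        self.conn.close()

The positional variants seen in later examples (insertTyre, insertTyre2, insert_a_dict) are omitted because their column order cannot be inferred from the calls alone.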
Example #4
def giveFirstLatter(first_latter, item):
    fname = first_latter + ".txt"
    path = 'D:/pyLearning/spider-master/spider-master/spider/tmp/' + fname  # record-file path for this letter -- adjust as needed
    fobj = open(path, 'r+')
    fileList = fobj.read().splitlines()
    fobj.close()
    column_tit = re.compile(r'column_tit')
    column_tit = item.find_all("div", attrs={"class": column_tit})
    width = re.compile(r'848px')
    width = item.find_all("td", attrs={"width": width})
    for (brand, width) in zip(column_tit, width):
        print "--" + brand.a.span.text.strip()
        brands = brand.a.span.text.strip()
        logger.info(u"--品牌" + brand.a.span.text + " start!...")
        item_list = re.compile(r'item_list')
        item_list = width.find_all("div", attrs={"class": item_list})
        for i in item_list:
            print "---" + i.a.get("href")
            text = i.a.get("href").strip()
            if text not in fileList:  # if the href isn't recorded yet, process it and write it to the database
                logger.info(u"---model " + i.a.get("href") + " start!...")
                href = url2 + i.a.get("href") + "config.htm"
                logger.info(u"---link " + href)
                obj = GetObj(href)
                html = obj.gethtml()
                while True:
                    if html is not None:
                        coding = obj.getcodeing(html)  # detect the page encoding
                        soup = BeautifulSoup(html,
                                             'html5lib',
                                             from_encoding=coding)
                        base_title = re.compile(r'base_title')
                        base_title = soup.find_all("tr",
                                                   attrs={"id": base_title})
                        soup2 = base_title[0]  # the DOM node that holds base_title
                        col = re.compile(r'col')
                        col = soup2.find_all("td", attrs={"scope": col})  # the td cells
                        for i in col:
                            model = i.a.text.strip()  # the model name
                            logger.info("model " + model)
                            modid = i.get("id")
                            mod = re.compile(r'(mod_)(.*)')
                            carid = re.search(mod, modid)
                            if hasattr(carid, 'group'):
                                carid = carid.group(2)
                                string = "bname_" + carid
                                db = ConnectDB()
                                n = db.select(table_name="carInfo1",
                                              field="vechiclesID",
                                              value=carid)
                                if n != 0:
                                    logger.info("vechiclesID: %s exists " %
                                                carid)
                                    continue
                                series = re.compile(string)
                                series = soup.find("td", attrs={"id": series})
                                if series is not None:  # extracted the series
                                    series = series.a.text.strip()
                                    logger.info("series " + series)
                                else:
                                    logger.error(string + " not found!!!!")
                                    series = "-"

                                string = "type_name_" + carid  #获取到carType
                                carType = re.compile(string)
                                carType = soup.find("td",
                                                    attrs={"id": carType})
                                if not carType is None:
                                    carType = carType.a.text.strip()
                                    logger.info("carType " + carType)
                                else:
                                    logger.error(string + "not found!!!!")
                                    series = "-"

                                string = "m_newseat_" + carid  #获取到peopleNum
                                peopleNum = re.compile(string)
                                peopleNum = soup.find("td",
                                                      attrs={"id": peopleNum})
                                if not peopleNum is None:
                                    peopleNum = peopleNum.text.strip()
                                    logger.info("peopleNum " + peopleNum)
                                else:
                                    logger.error(string + "not found!!!!")
                                    peopleNum = "-"

                                string = "syear_" + carid  #获取到marketTime
                                marketTime = re.compile(string)
                                marketTime = soup.find(
                                    "td", attrs={"id": marketTime})
                                if not marketTime is None:
                                    marketTime = marketTime.text.strip()
                                    logger.info("marketTime " + marketTime)
                                else:
                                    logger.error(string + "not found!!!!")
                                    marketTime = "-"

                                string = "m_disl_working_mpower_" + carid  #获取到engine
                                engine = re.compile(string)
                                engine = soup.find("td", attrs={"id": engine})
                                if not engine is None:
                                    engine = engine.text.strip()
                                    logger.info("engine " + engine)
                                else:
                                    logger.error(string + "not found!!!")
                                    engine = "-"

                                string = "m_mdisl_" + carid
                                displacement = re.compile(string)
                                displacement = soup.find(
                                    "td", attrs={"id": displacement})
                                if displacement is not None:
                                    displacement = displacement.text.strip()
                                    logger.info("displacement " + displacement)
                                else:
                                    logger.error(string + " not found!!!")
                                    displacement = "-"

                                db.insertTyre2("carInfo1", carid, brands,
                                               series, carType, peopleNum,
                                               marketTime, engine,
                                               displacement, first_latter,
                                               model)
                                db.dbclose()
                            else:
                                logger.error(modid + u" could not extract a car id here!")
                                break
                        break
                    else:
                        time.sleep(360)
                        html = obj.gethtml()
                fobj = open(path, 'a+')
                print u'writing ' + first_latter + ' ' + text
                fobj.write(text + '\n')
                fobj.flush()
                fobj.close()
            else:
                logger.info(u"skipping " + i.a.get("href"))
                print u"skipping " + i.a.get("href")
                continue  # otherwise move on to the next entry
    print first_latter + u" done!!!"
    logger.info(first_latter + u" done!!!")
Example #5
# -*- coding: utf-8 -*-
import threading
import re
import time
import sys
import json
from bs4 import BeautifulSoup
from getobj import GetObj
from db import ConnectDB
from common import *
import urllib2
import random
import cookielib
db = ConnectDB()
url = "http://newcar.xcar.com.cn/price/"
url2 = "http://newcar.xcar.com.cn"


def giveFirstLatter(first_latter, item):
    fname = first_latter + ".txt"
    path = 'D:/pyLearning/spider-master/spider-master/spider/tmp/' + fname  # record-file path for this letter -- adjust as needed
    fobj = open(path, 'r+')
    fileList = fobj.read().splitlines()
    fobj.close()
    column_tit = re.compile(r'column_tit')
    column_tit = item.find_all("div", attrs={"class": column_tit})
    width = re.compile(r'848px')
    width = item.find_all("td", attrs={"width": width})
    for (brand, width) in zip(column_tit, width):
        print "--" + brand.a.span.text.strip()
Example #6
def threadSpider(brand, url2):  # brand is the tyre brand being crawled
    fname = brand + ".txt"
    path = "../file/" + fname
    fobj = open(path, 'a+')
    fobj.seek(0)  # 'a+' may position the file pointer at the end; rewind before reading
    fileList = fobj.read().splitlines()
    print fileList
    fobj.close()
    obj = GetObj(url2)
    html = obj.gethtml()
    coding = obj.getcodeing(html)
    soup = BeautifulSoup(html, "html5lib", from_encoding=coding)
    clearfix = re.compile(r'list clearfix')
    clearfix = soup.find_all("div", attrs={"class": clearfix})
    figure = clearfix[1].find_all("a")
    for item in figure:
        flow = item.text.strip()
        streak = re.sub(r'\([^\)]*\)', "", flow)  # extract the tread pattern
        logger.info("streak:" + streak)
        href = item.get("href")
        newUrl = url + href
        obj = GetObj(newUrl)
        html = obj.gethtml()
        coding = obj.getcodeing(html)
        soup = BeautifulSoup(html, "html5lib", from_encoding=coding)
        clearfix = re.compile(r'products clearfix')
        clearfix = soup.find("div", attrs={"class": clearfix})
        clearfix2 = re.compile(r'product clearfix')
        clearfix2 = clearfix.find_all("div", attrs={"class": clearfix2})
        for i in clearfix2:
            name = i.a.get("title")  # the tyre name
            print name
            logger.info("name:" + name)
            href = i.a.get("href")
            print href
            xx = href.split("/")
            xh = xx[2].split(".")
            tyreid = xh[0]
            if href not in fileList:
                fobj = open(path, 'a+')
                print u'writing ' + href
                fobj.write(href + '\n')
                fobj.flush()
                fobj.close()

                db = ConnectDB()
                n = db.select(table_name="tyreinfo", field="tyreID", value=tyreid)
                if n != 0:
                    logger.info("tyreID: %s exists " % tyreid)
                    print tyreid + u" exists"
                    continue
                tyreUrl = url + href
                tyreObj = GetObj(tyreUrl)
                tyreHtml = tyreObj.gethtml()
                tyreSoup = BeautifulSoup(tyreHtml, "html5lib", from_encoding=coding)
                basic = re.compile(r'basic free')
                basic = tyreSoup.find("div", attrs={"class": basic})
                fl = re.compile(r'fl')
                fl = basic.find("span", attrs={"class": fl})

                standard = fl.text.strip()  # the size standard
                logger.info("standard:" + standard)

                dl = basic.find_all("dl")
                loaded = dl[4].dd.text.strip()  # the load index
                #loaded = re.sub(r'\([^\)]*\)',"",loaded)
                logger.info("load:" + loaded)

                speed = dl[5].dd.text.strip()  # the speed rating
                #speed = re.sub(r'\([^\)]*\)',"",speed)
                logger.info("speed:" + speed)

                place = dl[6].dd.text.strip()  # place of origin
                logger.info("place:" + place)

                pi3c = re.compile(r'clearfix pi3c')
                pi3c = basic.find("div", attrs={"class": pi3c})
                pi3c = pi3c.find_all("em")

                wearproof = pi3c[0].text.strip()  # treadwear rating
                #wearproof = ""
                logger.info("wearproof:" + wearproof)

                traction = pi3c[1].text.strip()  # traction rating
                logger.info("traction:" + traction)

                highTemperature = pi3c[2].text.strip()  # temperature rating
                logger.info("highTemperature:" + highTemperature)

                db.insert("tyreinfo", tyreid, brand, streak, name, standard,
                          loaded, speed, wearproof, traction, highTemperature)
                db.dbclose()

            else:
                logger.info(u"skipping " + href)
                print(u"skipping " + href)
                continue
    logger.info("finish:" + brand)
Example #7
def SaveDataToCarInfo(table_name="",
                      brand="",
                      series="",
                      type_name="",
                      conf="",
                      index=""):
    conf = json.loads(conf)
    for (k, v) in conf.items():
        spaceid = k
        if (v.has_key("0")):
            year = v["0"]
            x = re.compile(r'<(.[^>]*)>')
            year = re.sub(x, "", year)
            #x = re.compile(r'')
            year = year.split('.')[0]  #获取的年份
            print year
        else:
            year = "error"
        if (v.has_key("284")):
            peopleNum = v["284"]
            x = re.compile(r'<(.[^>]*)>')
            peopleNum = re.sub(x, "", peopleNum)
            print peopleNum
        else:
            peopleNum = "error"
        if (v.has_key("555")):
            engine = v["555"]
            x = re.compile(r'<(.[^>]*)>')
            engine = re.sub(x, "", engine)
            print engine
        else:
            engine = "error"
        if (v.has_key("287")):
            displacement = v["287"]
            x = re.compile(r'<(.[^>]*)>')
            displacement = re.sub(x, "", displacement)
            print displacement
        else:
            displacement = "error"

        if type_name == u"微型车":
            type = 0
        elif type_name == u"小型车":
            type = 1
        elif type_name == u"紧凑型车":
            type = 2
        elif type_name == u"中型车":
            type = 3
        elif type_name == u"中大型车":
            type = 4
        elif type_name == u"大型车":
            type = 5
        elif type_name == u"SUV":
            type = 6
        elif type_name == u"MPV":
            type = 7
        elif type_name == u"跑车":
            type = 8
        elif type_name == u"皮卡":
            type = 9
        elif type_name == u"微面":
            type = 10
        elif type_name == u"轻客":
            type = 11
        db = ConnectDB()
        n = db.select(table_name="carinfo", field="vechiclesID", value=spaceid)
        if n != 0:
            logger.info("spaceid: %s exists " % spaceid)
            continue
        db.insertTyre(table_name, spaceid, brand, series, type, peopleNum,
                      year, engine, displacement, index)
        db.dbclose()
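
The if/elif ladder above maps Chinese vehicle-class names to numeric codes and is easy to get out of sync. A compact alternative is a lookup table; the names and codes below are taken from the ladder, but the TYPE_CODES dict itself is not part of the original code:

# Alternative to the if/elif ladder above: a single lookup table.
# Names and codes mirror the original chain; -1 marks an unknown class.
TYPE_CODES = {
    u"微型车": 0, u"小型车": 1, u"紧凑型车": 2, u"中型车": 3,
    u"中大型车": 4, u"大型车": 5, u"SUV": 6, u"MPV": 7,
    u"跑车": 8, u"皮卡": 9, u"微面": 10, u"轻客": 11,
}
type_code = TYPE_CODES.get(type_name, -1)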
Example #8
def thrad(type_name, url2):
    #logger.info("name:%s url: %s" % (type_name,url2))
    url2 = url2.encode("utf-8")  # encode as UTF-8
    obj = GetObj(url2)  # build a crawler object
    html = obj.gethtml()  # fetch the page
    coding = obj.getcodeing(html)  # detect the encoding
    soup = BeautifulSoup(html, 'html5lib', from_encoding=coding)

    #print "----------------------------------------------"
    #print type_name
    #print "----------------------------------------------"
    logger.info("start %s...." % type_name)
    content = soup.find("div",
                        attrs={"class": ["tab-content-item",
                                         "current"]})  #find返回的不是列表是文本
    soup = BeautifulSoup(str(content), 'html5lib')  #再返回一个soup对象
    index = soup.find_all('span', attrs={'class': "font-letter"})  #找到字典顺序
    box = soup.find_all(
        'div', attrs={'class': ["uibox-con", "rank-list", "rank-list-pic"]})
    for (index, box) in zip(index, box):
        #for item in box:
        #获取字母分割的DIV 同时获取字母索引
        index = index.text.strip()  #默认删除空白符
        brand_soup = BeautifulSoup(str(box), 'html5lib')  #返回一个soup对象
        brand_html = brand_soup.find_all('dl')
        for brand_item in brand_html:
            #品牌名称
            brand = brand_item.dt.text.strip()  #品牌
            series_html = brand_item.dd
            series_soup = BeautifulSoup(str(series_html),
                                        'html5lib')  #根据<dd>标签找到子目录的soup
            manufacturer_name = series_soup.find_all('div',
                                                     attrs={"class":
                                                            "h3-tit"})  #品牌名称
            ul = series_soup.find_all('ul', attrs={"class": "rank-list-ul"})
            for (manufacturer, ul_tag) in zip(manufacturer_name, ul):
                # extract the manufacturer name
                manufacturer = manufacturer.text
                logger.info("start %s...." % manufacturer)
                logger.debug(ul_tag)
                soup = BeautifulSoup(str(ul_tag), 'html5lib')
                w = re.compile(r's\d+')
                litag = soup.find_all('li', id=w)
                for item in litag:
                    # extract the series name
                    series = item.h4.text
                    db = ConnectDB()  # open a database connection
                    n = db.select(table_name="carinfo",
                                  field="series",
                                  value=series)  # look the series up
                    db.dbclose()  # close the connection
                    if n != 0:
                        logger.info("%s %s %s exists " %
                                    (type_name, brand, series))  # already recorded
                        continue
                    href = item.h4.a.get("href")  # not recorded yet: take its link
                    price = item.div.text  # record the price
                    url_id = href.split("/")[3]  # record the url_id
                    #print "●●%s %s %s" % (series,price,url_id)
                    # build the config-page URL for cars on sale
                    sale_conf_url = "http://car.autohome.com.cn/config/series/%s.html" % url_id
                    # build the config-page URL for discontinued cars
                    stop_sale_conf_url = "http://www.autohome.com.cn/%s/sale.html" % url_id
                    url_dic = {
                        "sale_conf_url": sale_conf_url,
                        "stop_sale_conf_url": stop_sale_conf_url
                    }
                    #threads=[]
                    for (url_name, sale_url) in url_dic.items():
                        # on sale
                        if url_name == "sale_conf_url":
                            status = u"在售"  # "on sale" -- stored verbatim in the DB
                            #print sale_url
                            #def get_josn():
                            log_mess = "%s:%s %s %s %s %s %s %s" % (
                                status, type_name, index, brand, manufacturer,
                                series, price, url_id)
                            obj = GetObj(sale_url)
                            conf = obj.getconf()
                            if conf:
                                #print conf
                                logger.info(log_mess)
                                """SaveData(table_name="spider_json",    #存储到数据库
                                    brand=brand,
                                    series=series,
                                    conf=conf,
                                    status=status,
                                    index=index,
                                    URL_=sale_conf_url,
                                    level=type_name,
                                    manufacturer = manufacturer)"""
                                SaveDataToCarInfo("carinfo", brand, series,
                                                  type_name, conf, index)
                            else:
                                mess = u"no matching configuration found"
                                logger.info("%s %s" % (log_mess, mess))
                                #print mess
                        else:

                            # discontinued
                            #def get_stop_conf():
                            status = u"停售"  # "discontinued" -- stored verbatim in the DB
                            obj = GetObj(sale_url)
                            html = obj.gethtml()
                            coding = obj.getcodeing(html)
                            soup = BeautifulSoup(html,
                                                 'html5lib',
                                                 from_encoding=coding)
                            filter_html = soup.find_all(
                                'div', attrs={"class": "models_nav"})
                            log_mess = "%s:%s %s %s %s %s %s %s" % (
                                status, type_name, index, brand, manufacturer,
                                series, price, url_id)
                            if filter_html:
                                for item in filter_html:
                                    href = item.find('a',
                                                     text=u'参数配置').get("href")  # the "specs & config" link
                                    stop_sale_conf_url_1 = url + href
                                    obj = GetObj(stop_sale_conf_url_1)
                                    conf = obj.getconf()
                                    if conf:
                                        #print conf
                                        logger.info("%s %s" % (log_mess, href))
                                        """SaveData(table_name="spider_json",
                                            brand=brand,
                                            series=series,
                                            conf=conf,
                                            status=status,
                                            index=index,
                                            level=type_name,
                                            URL_=stop_sale_conf_url_1)"""
                                        #print u"在售品牌中的停售车辆"
                                        SaveDataToCarInfo(
                                            "carinfo", brand, series,
                                            type_name, conf, index)
                                    else:
                                        mess = u"no matching configuration found"
                                        logger.info("%s %s %s" %
                                                    (log_mess, mess, href))
                                        #print mess
                            else:
                                mess = u"no matching configuration found"
                                logger.info("%s %s" % (log_mess, mess))
Example #9
def SaveData(table_name="",
             brand="",
             series="",
             conf="",
             status="",
             URL_="",
             index="",
             level="",
             manufacturer=""):
    conf = json.loads(conf)  # parse the JSON into a Python object
    for (k, v) in conf.items():
        spaceid = k
        name = v["567"]
        x = re.compile(r'span>(.*?)<span')
        if name == '-':
            continue
        name1 = re.search(x, name)
        if hasattr(name1, "group"):
            name1 = name1.group(1)
        else:
            x = re.compile(r'</span>(.*?)$')
            name1 = re.search(x, name)
            if hasattr(name1, "group"):
                name1 = name1.group(1)
            else:
                x = re.compile(r'^(.*?)<span')
                name1 = re.search(x, name)
        print name1

        year = v["0"]
        x = re.compile(r'(</span>|>|^)(.*?)($|<|<span)')
        year = re.search(x, year)
        year = year.group(2)

        guide_price = v["219"]
        x = re.compile(
            r'(</span>|>|^)([1-9]\d*.\d*|0.\d*[1-9]\d*)($|<|</span>)')  # extract the decimal price
        temp = re.search(x, guide_price)  # search the string
        if hasattr(temp, "group"):
            guide_price = temp.group(2)
        else:
            guide_price = 0
        print guide_price

        structure = v["281"]
        structure = re.search(re.compile(r'(</span>|>|^)(.*?)($|<|<span)'),
                              structure).group(2)
        print structure

        emission_standard = v["1072"]
        emission_standard = re.search(
            re.compile(r'(</span>|>|^)(.*?)($|<|<span)'),
            emission_standard).group(2)
        print emission_standard
        json_text = json.dumps(v, encoding='utf-8', ensure_ascii=False)
        db = ConnectDB()
        n = db.select(table_name="spider_json", field="spaceid", value=spaceid)
        if n != 0:
            logger.info("spaceid: %s exists " % spaceid)
            continue
        db.insert(table_name=table_name,
                  spaceid=spaceid,
                  brand=brand,
                  series=series,
                  models=name1,
                  guide_price=guide_price,
                  level=level,
                  emission_standard=emission_standard,
                  structure=structure,
                  status=status,
                  manufacturer=manufacturer,
                  year=year,
                  index=index,
                  json_text="",
                  URL_=URL_)
        db.dbclose()
Example #10
File: test.py Project: xjy2017y/myPython
    peopleNum = v["284"]
    x = re.compile(r'<(.[^>]*)>')
    peopleNum = re.sub(x, "", peopleNum)
    print peopleNum

    engine = v["555"]
    x = re.compile(r'<(.[^>]*)>')
    engine = re.sub(x, "", engine)
    print engine

    displacement = v["287"]
    x = re.compile(r'<(.[^>]*)>')
    displacement = re.sub(x, "", displacement)
    print displacement
    db = ConnectDB()
    db.insertTyre("carinfo", spaceid, "", "", 1, peopleNum, year, engine,
                  displacement, "S")
    db.dbclose()
"""for (k,v) in conf_json.items():
    spaceid = k
    name = v["<span class='hs_kw0_configpl'></span><span class='hs_kw1_configpl'></span>"]
    x = re.compile(r'span>(.*?)<span')
    if name == '-':
            continue
    name1 = re.search(x,name)
    if hasattr(name1,"group"):
        name1 = name1.group(1)
    else:
        x = re.compile(r'</span>(.*?)$')
        name1 = re.search(x,name)
Example #11
File: spider.py Project: shancang/spider
def thrad(type_name, url2):
    logger.info("name:%s url: %s" % (type_name, url2))
    url2 = url2.encode("utf-8")
    obj = GetObj(url2)
    html = obj.gethtml()
    coding = obj.getcodeing(html)
    soup = BeautifulSoup(html, 'html5lib', from_encoding=coding)

    #print "----------------------------------------------"
    #print type_name
    #print "----------------------------------------------"
    logger.info("start %s...." % type_name)
    content = soup.find("div", attrs={"class": ["tab-content-item", "current"]})
    soup = BeautifulSoup(str(content), 'html5lib')
    index = soup.find_all('span', attrs={'class': "font-letter"})
    box = soup.find_all('div', attrs={'class': ["uibox-con", "rank-list", "rank-list-pic"]})
    for (index, box) in zip(index, box):
        #for item in box:
        # walk each letter-delimited DIV together with its index letter
        index = index.text.strip()
        brand_soup = BeautifulSoup(str(box), 'html5lib')
        brand_html = brand_soup.find_all('dl')
        for brand_item in brand_html:
            # brand name
            brand = brand_item.dt.text.strip()
            series_html = brand_item.dd
            series_soup = BeautifulSoup(str(series_html), 'html5lib')
            manufacturer_name = series_soup.find_all('div', attrs={"class": "h3-tit"})
            ul = series_soup.find_all('ul', attrs={"class": "rank-list-ul"})
            for (manufacturer, ul_tag) in zip(manufacturer_name, ul):
                # extract the manufacturer name
                manufacturer = manufacturer.text
                logger.info("start %s...." % manufacturer)
                logger.debug(ul_tag)
                soup = BeautifulSoup(str(ul_tag), 'html5lib')
                w = re.compile(r's\d+')
                litag = soup.find_all('li', id=w)
                for item in litag:
                    # extract the series name
                    series = item.h4.text
                    db = ConnectDB()
                    n = db.select(table_name="spider_json", field="series", value=series)
                    db.dbclose()
                    if n != 0:
                        logger.info("%s %s %s exists " % (type_name, brand, series))
                        continue
                    href = item.h4.a.get("href")
                    price = item.div.text
                    url_id = href.split("/")[3]
                    #print "●●%s %s %s" % (series,price,url_id)
                    # build the config-page URL for cars on sale
                    sale_conf_url = "http://car.autohome.com.cn/config/series/%s.html" % url_id
                    # build the config-page URL for discontinued cars
                    stop_sale_conf_url = "http://www.autohome.com.cn/%s/sale.html" % url_id
                    url_dic = {"sale_conf_url": sale_conf_url, "stop_sale_conf_url": stop_sale_conf_url}
                    #threads=[]
                    for (url_name, sale_url) in url_dic.items():
                        # on sale
                        if url_name == "sale_conf_url":
                            status = u"在售"  # "on sale" -- stored verbatim in the DB
                            #print sale_url
                            #def get_josn():
                            log_mess = "%s:%s %s %s %s %s %s %s" % (status, type_name, index, brand, manufacturer, series, price, url_id)
                            obj = GetObj(sale_url)
                            conf = obj.getconf()
                            if conf:
                                #print conf
                                logger.info(log_mess)
                                SaveData(table_name="spider_json",
                                         brand=brand,
                                         series=series,
                                         conf=conf,
                                         status=status,
                                         index=index,
                                         URL_=sale_conf_url)
                            else:
                                mess = u"no matching configuration found"
                                logger.info("%s %s" % (log_mess, mess))
                                #print mess
                        else:
                            # discontinued
                            #def get_stop_conf():
                            status = u"停售"  # "discontinued" -- stored verbatim in the DB
                            obj = GetObj(sale_url)
                            html = obj.gethtml()
                            coding = obj.getcodeing(html)
                            soup = BeautifulSoup(html, 'html5lib', from_encoding=coding)
                            filter_html = soup.find_all('div', attrs={"class": "models_nav"})
                            log_mess = "%s:%s %s %s %s %s %s %s" % (status, type_name, index, brand, manufacturer, series, price, url_id)
                            if filter_html:
                                for item in filter_html:
                                    href = item.find('a', text=u'参数配置').get("href")  # the "specs & config" link
                                    stop_sale_conf_url_1 = url + href
                                    obj = GetObj(stop_sale_conf_url_1)
                                    conf = obj.getconf()
                                    if conf:
                                        #print conf
                                        logger.info("%s %s" % (log_mess, href))
                                        SaveData(table_name="spider_json",
                                                 brand=brand,
                                                 series=series,
                                                 conf=conf,
                                                 status=status,
                                                 index=index,
                                                 URL_=stop_sale_conf_url_1)
                                        # discontinued cars under a brand still on sale
                                    else:
                                        mess = u"no matching configuration found"
                                        logger.info("%s %s %s" % (log_mess, mess, href))
                                        #print mess
                            else:
                                mess = u"no matching configuration found"
                                logger.info("%s %s" % (log_mess, mess))