def GetFirstType(url):
    """Scrape the first-level category links from the page at *url*.

    Returns a dict mapping category name -> href, taken from every <li>
    whose class matches "navcar"; the electric-vehicle entry is skipped.
    """
    crawler = GetObj(url)
    page = crawler.gethtml()
    encoding = crawler.getcodeing(page)
    soup = BeautifulSoup(page, "html5lib", from_encoding=encoding)
    nav_items = soup.find_all("li", attrs={"class": re.compile(r"navcar")})
    return {
        li.a.text: li.a.get("href")
        for li in nav_items
        if li.a.text != u"电动车"
    }
def GetFirstType(url):
    """Collect first-level car-type links from the page at *url*.

    Returns a dict mapping category name -> relative href, taken from
    every <li> whose class matches "navcar"; the electric-vehicle
    category is skipped.
    """
    obj = GetObj(url)                      # crawler wrapper around the URL
    html = obj.gethtml()                   # raw page content
    coding = obj.getcodeing(html)          # detected character encoding
    soup = BeautifulSoup(html, "html5lib", from_encoding=coding)
    m = re.compile(r"navcar")              # class-name pattern for nav entries
    content = soup.find_all("li", attrs={"class": m})
    url1 = {}
    for item in content:
        name = item.a.text                 # text of the child <a>
        if name == u"电动车":              # skip the electric-vehicle category
            continue
        href = item.a.get("href")          # link target (kept relative here)
        url1[name] = href
    return url1
def GetFirstType(url): obj = GetObj(url) #得到一个爬虫对象 html = obj.gethtml() #得到请求的页面内容 coding = obj.getcodeing(html) #得到编码类型 soup = BeautifulSoup( html, "html5lib", from_encoding=coding) #将html内容打开,用html5lib做解析器,用coding进行编码 m = re.compile(r"navcar") #python通过re模块提供对正则表达式的支持,查找“navcar”字段的字符 content = soup.find_all("li", attrs={"class": m}) #搜索文档树, 找到<li>标签并且class = navcar url1 = {} for item in content: #循环 name = item.a.text #获取子节点<a>的内容 if name == u"电动车": #排除掉电动车 中文前加u就是告诉python后面的是个unicode编码,存储时按unicode格式存储。 continue href = item.a.get("href") #获取链接 href = "http:" + href url1[name] = href #存储到数组中 return url1
def firstGetHtml(): obj = GetObj(url); html = obj.gethtml(); coding = obj.getcodeing(html); soup = BeautifulSoup(html,"html5lib",from_encoding=coding) list = re.compile(r'i4') list = soup.find("div",attrs={"id":list}) li = list.find_all("li") for item in li: if(item.a.get("href").strip() != '//'): #排除掉空的url href = url + item.a.get("href") brand = item.a.text.strip() print brand+u"开始爬取" logger.info(brand + u" 开始爬取") t = threading.Thread(target=threadSpider,args=(brand,href)) t.start() while True: if(len(threading.enumerate()) < THARED_NUMBER + 1 ): #threading.enumerate(): 返回一个包含正在运行的线程的list。正在运行指线程启动后、结束前,不包括启动前和终止后的线程。 这里限制线程数不大于6个 break return
def GetFirstTypeAika(url): obj = GetObj(url) #得到一个爬虫对象 html = obj.gethtml() #得到请求的页面内容 coding = obj.getcodeing(html) #得到编码类型 soup = BeautifulSoup( html, "html5lib", from_encoding=coding) #将html内容打开,用html5lib做解析器,用coding进行编码 container = re.compile(r"container") content = soup.find_all("div", attrs={"class": container }) #搜索文档树, 找到<li>标签并且class = navcar for item in content: first_latter = item.div.text.strip() print first_latter logger.info(u"字母" + first_latter + " start!...") if (first_latter >= 'B'): t = threading.Thread(target=giveFirstLatter, args=(first_latter, item)) t.start() while True: if ( len(threading.enumerate()) < THARED_NUMBER + 1 ): #threading.enumerate(): 返回一个包含正在运行的线程的list。正在运行指线程启动后、结束前,不包括启动前和终止后的线程。 这里限制线程数不大于6个 break return
def giveFirstLatter(first_latter, item):
    """Worker thread: crawl every car model under one first-letter section.

    first_latter -- the section's index letter (also names the progress file).
    item         -- BeautifulSoup element holding this letter's brand blocks.

    Already-processed hrefs are read from / appended to a per-letter progress
    file; each model's attributes are scraped from its config page and stored
    in the "carInfo1" table.
    """
    fname = first_latter + ".txt"
    # Progress file for this letter — hard-coded path, must be adapted per host.
    path = 'D:/pyLearning/spider-master/spider-master/spider/tmp/' + fname
    fobj = open(path, 'r+')
    fileList = fobj.read().splitlines()   # hrefs already crawled
    fobj.close()
    column_tit = re.compile(r'column_tit')
    column_tit = item.find_all("div", attrs={"class": column_tit})  # brand headers
    width = re.compile(r'848px')
    width = item.find_all("td", attrs={"width": width})             # matching model cells
    for (brand, width) in zip(column_tit, width):
        print "--" + brand.a.span.text.strip()
        brands = brand.a.span.text.strip()      # brand name for the DB row
        logger.info(u"--品牌" + brand.a.span.text + " start!...")
        item_list = re.compile(r'item_list')
        item_list = width.find_all("div", attrs={"class": item_list})
        for i in item_list:
            print "---" + i.a.get("href")
            text = i.a.get("href").strip()
            # Only crawl models whose href is not yet in the progress file.
            if text not in fileList:
                logger.info(u"---车型" + i.a.get("href") + " start!...")
                href = url2 + i.a.get("href") + "config.htm"   # model config page
                logger.info(u"---链接地址 " + href)
                obj = GetObj(href)
                html = obj.gethtml()
                # Retry loop: if the fetch failed, back off and re-request.
                while (True):
                    if not html is None:
                        coding = obj.getcodeing(html)   # detected encoding
                        soup = BeautifulSoup(html, 'html5lib', from_encoding=coding)
                        base_title = re.compile(r'base_title')
                        base_title = soup.find_all("tr", attrs={"id": base_title})
                        soup2 = base_title[0]           # header row of the spec table
                        col = re.compile(r'col')
                        col = soup2.find_all("td", attrs={"scope": col})  # one <td> per trim
                        # NOTE(review): this loop variable shadows the outer
                        # `for i in item_list` loop variable.
                        for i in col:
                            model = i.a.text.strip()    # trim/model name
                            logger.info("model " + model)
                            modid = i.get("id")
                            # ids look like "mod_<carid>"; group 2 captures the car id.
                            mod = re.compile(r'(mod_)(.*)')
                            carid = re.search(mod, modid)
                            if hasattr(carid, 'group'):
                                carid = carid.group(2)
                                string = "bname_" + carid
                                db = ConnectDB()
                                n = db.select(table_name="carInfo1", field="vechiclesID", value=carid)
                                if n != 0:
                                    # NOTE(review): `db` is never closed on this skip path.
                                    logger.info("vechiclesID: %s exists " % carid)
                                    continue
                                # Series name lives in the cell with id "bname_<carid>".
                                series = re.compile(string)
                                series = soup.find("td", attrs={"id": series})
                                if not series is None:
                                    series = series.a.text.strip()
                                    logger.info("series " + string)
                                else:
                                    logger.error(string + "not found!!!!")
                                    series = "-"
                                string = "type_name_" + carid
                                # Car type from id "type_name_<carid>".
                                carType = re.compile(string)
                                carType = soup.find("td", attrs={"id": carType})
                                if not carType is None:
                                    carType = carType.a.text.strip()
                                    logger.info("carType " + carType)
                                else:
                                    logger.error(string + "not found!!!!")
                                    # NOTE(review): probably meant carType = "-" here,
                                    # not series — confirm before changing.
                                    series = "-"
                                string = "m_newseat_" + carid
                                # Seat count from id "m_newseat_<carid>".
                                peopleNum = re.compile(string)
                                peopleNum = soup.find("td", attrs={"id": peopleNum})
                                if not peopleNum is None:
                                    peopleNum = peopleNum.text.strip()
                                    logger.info("peopleNum " + peopleNum)
                                else:
                                    logger.error(string + "not found!!!!")
                                    peopleNum = "-"
                                string = "syear_" + carid
                                # Market launch time from id "syear_<carid>".
                                marketTime = re.compile(string)
                                marketTime = soup.find( "td", attrs={"id": marketTime})
                                if not marketTime is None:
                                    marketTime = marketTime.text.strip()
                                    logger.info("marketTime " + marketTime)
                                else:
                                    logger.error(string + "not found!!!!")
                                    marketTime = "-"
                                string = "m_disl_working_mpower_" + carid
                                # Engine description from id "m_disl_working_mpower_<carid>".
                                engine = re.compile(string)
                                engine = soup.find("td", attrs={"id": engine})
                                if not engine is None:
                                    engine = engine.text.strip()
                                    logger.info("engine " + engine)
                                else:
                                    logger.error(string + "not found!!!")
                                    engine = "-"
                                string = "m_mdisl_" + carid
                                # Displacement from id "m_mdisl_<carid>".
                                displacement = re.compile(string)
                                displacement = soup.find( "td", attrs={"id": displacement})
                                if not displacement is None:
                                    displacement = displacement.text.strip()
                                    logger.info("displacement " + displacement)
                                else:
                                    logger.error(string + "not found!!!")
                                    displacement = "-"
                                db.insertTyre2("carInfo1", carid, brands, series, carType, peopleNum, marketTime, engine, displacement, first_latter, model)
                                db.dbclose()
                            else:
                                # Could not extract a car id from this cell; stop
                                # processing this spec table.
                                logger.error(modid + u" 该处无法获得汽车id!")
                                break
                        break
                    else:
                        # Fetch failed: wait six minutes, then retry.
                        time.sleep(360)
                        html = obj.gethtml()
                # Mark this model href as done in the progress file.
                fobj = open(path, 'a+')
                print u'写入' + first_latter + ' ' + text
                fobj.write(text + '\n')
                fobj.flush()
                fobj.close()
            else:
                # href already recorded; move on to the next model.
                logger.info(u"跳过" + i.a.get("href"))
                print u"跳过" + i.a.get("href")
                continue
    print first_latter + u"已完成!!!"
    logger.info(first_latter + u"已完成!!!")
def threadSpider(brand,url2): #获取到brand fname = brand+".txt" path ="../file/"+fname fobj = open(path,'a+') fileList = fobj.read().splitlines() print fileList fobj.close() obj = GetObj(url2) html = obj.gethtml() coding = obj.getcodeing(html) soup = BeautifulSoup(html,"html5lib",from_encoding=coding) clearfix = re.compile(r'list clearfix') clearfix = soup.find_all("div",attrs={"class":clearfix}) figure = clearfix[1].find_all("a") for item in figure: flow = item.text.strip() streak = re.sub(r'\([^\)]*\)',"",flow) #获取到花纹 logger.info("streak:" + streak) href = item.get("href") newUrl = url + href obj = GetObj(newUrl) html = obj.gethtml() coding = obj.getcodeing(html); soup = BeautifulSoup(html,"html5lib",from_encoding=coding) clearfix = re.compile(r'products clearfix') clearfix = soup.find("div",attrs={"class":clearfix}) clearfix2 = re.compile(r'product clearfix') clearfix2 = clearfix.find_all("div",attrs={"class":clearfix2}) for i in clearfix2: name = i.a.get("title") #获取到轮胎name print name logger.info("name:" + name) href = i.a.get("href") print href xx= href.split("/") xh = xx[2].split(".") tyreid = xh[0] if href not in fileList: fobj = open(path,'a+') print u'写入'+href fobj.write(href+'\n') fobj.flush() fobj.close() db=ConnectDB() n = db.select(table_name="tyreinfo",field="tyreID",value=tyreid) if n != 0: logger.info("tyreID: %s exists " % tyreid ) print tyreid + u"存在" continue tyreUrl = url + href tyreObj = GetObj(tyreUrl) tyreHtml = tyreObj.gethtml() tyreSoup = BeautifulSoup(tyreHtml,"html5lib",from_encoding=coding) basic = re.compile(r'basic free') basic = tyreSoup.find("div",attrs={"class":basic}) fl = re.compile(r'fl') fl = basic.find("span",attrs={"class":fl}) standard = fl.text.strip() #获取到standard logger.info("standard:" + standard) dl = basic.find_all("dl") loaded = dl[4].dd.text.strip() #获取到load #loaded = re.sub(r'\([^\)]*\)',"",loaded) logger.info("load:" + loaded) speed = dl[5].dd.text.strip() #获取到speed #speed = re.sub(r'\([^\)]*\)',"",speed) 
logger.info("speed:"+speed) place = dl[6].dd.text.strip() logger.info("place:"+place) pi3c = re.compile(r'clearfix pi3c') pi3c = basic.find("div",attrs={"class":pi3c}) pi3c = pi3c.find_all("em") wearproof = pi3c[0].text.strip() #获取到wearproof #wearproof = "" logger.info("wearproof:"+wearproof) traction = pi3c[1].text.strip() #获取到traction logger.info("traction:"+traction) highTemperature = pi3c[2].text.strip() #获取到highTemperature logger.info("highTemperature:"+highTemperature) db.insert("tyreinfo",tyreid,brand,streak,name,standard,loaded,speed,wearproof,traction,highTemperature) db.dbclose() else: logger.info(u"跳过"+href) print(u"跳过"+href) continue logger.info("finish:" + brand)
def thrad(type_name, url2):
    """Worker thread: crawl one autohome category listing and save each
    unseen car series' configuration into the "carinfo" table.

    type_name -- category label (used in log messages and saved rows).
    url2      -- category listing URL (unicode; encoded to UTF-8 below).
    """
    #logger.info("name:%s url: %s" % (type_name,url2))
    url2 = url2.encode("utf-8")         # request layer expects UTF-8 bytes
    obj = GetObj(url2)                  # crawler object
    html = obj.gethtml()                # fetch the listing page
    coding = obj.getcodeing(html)       # detect encoding
    soup = BeautifulSoup(html, 'html5lib', from_encoding=coding)
    #print "----------------------------------------------"
    #print type_name
    #print "----------------------------------------------"
    logger.info("start %s...." % type_name)
    # The currently-active tab holds this category's ranking list.
    content = soup.find("div", attrs={"class": ["tab-content-item", "current"]})
    soup = BeautifulSoup(str(content), 'html5lib')   # re-parse just that tab
    # Letter headers and their matching brand boxes run in parallel.
    index = soup.find_all('span', attrs={'class': "font-letter"})
    box = soup.find_all( 'div', attrs={'class': ["uibox-con", "rank-list", "rank-list-pic"]})
    for (index, box) in zip(index, box):
        #for item in box:
        index = index.text.strip()      # alphabetical index letter
        brand_soup = BeautifulSoup(str(box), 'html5lib')
        brand_html = brand_soup.find_all('dl')   # one <dl> per brand
        for brand_item in brand_html:
            brand = brand_item.dt.text.strip()   # brand name from <dt>
            series_html = brand_item.dd          # <dd> holds the manufacturers
            series_soup = BeautifulSoup(str(series_html), 'html5lib')
            manufacturer_name = series_soup.find_all('div', attrs={"class": "h3-tit"})
            ul = series_soup.find_all('ul', attrs={"class": "rank-list-ul"})
            for (manufacturer, ul_tag) in zip(manufacturer_name, ul):
                manufacturer = manufacturer.text
                logger.info("start %s...." % manufacturer)
                logger.debug(ul_tag)
                soup = BeautifulSoup(str(ul_tag), 'html5lib')
                w = re.compile(r's\d+')          # series <li> ids look like s123
                litag = soup.find_all('li', id=w)
                for item in litag:
                    series = item.h4.text        # series name
                    db = ConnectDB()
                    n = db.select(table_name="carinfo", field="series", value=series)
                    db.dbclose()
                    if n != 0:                   # already stored; skip
                        logger.info("%s %s %s exists " % (type_name, brand, series))
                        continue
                    href = item.h4.a.get("href") # series detail link
                    price = item.div.text        # listed price text
                    url_id = href.split("/")[3]  # numeric id embedded in the href
                    #print "●●%s %s %s" % (series,price,url_id)
                    # Config page for a series still on sale.
                    sale_conf_url = "http://car.autohome.com.cn/config/series/%s.html" % url_id
                    # Config page for a discontinued series.
                    stop_sale_conf_url = "http://www.autohome.com.cn/%s/sale.html" % url_id
                    url_dic = { "sale_conf_url": sale_conf_url, "stop_sale_conf_url": stop_sale_conf_url }
                    #threads=[]
                    for (url_name, sale_url) in url_dic.items():
                        # On-sale branch: config comes straight from the page.
                        if url_name == "sale_conf_url":
                            status = u"在售"
                            #print sale_url
                            #def get_josn():
                            log_mess = "%s:%s %s %s %s %s %s %s" % ( status, type_name, index, brand, manufacturer, series, price, url_id)
                            obj = GetObj(sale_url)
                            conf = obj.getconf()
                            if conf:
                                #print conf
                                logger.info(log_mess)
                                """SaveData(table_name="spider_json", #存储到数据库 brand=brand, series=series, conf=conf, status=status, index=index, URL_=sale_conf_url, level=type_name, manufacturer = manufacturer)"""
                                SaveDataToCarInfo("carinfo", brand, series, type_name, conf, index)
                            else:
                                mess = u"没有找到相关配置"
                                logger.info("%s %s" % (log_mess, mess))
                                #print mess
                        else:
                            # Discontinued branch: follow the "参数配置" nav link first.
                            #def get_stop_conf():
                            status = u"停售"
                            obj = GetObj(sale_url)
                            html = obj.gethtml()
                            coding = obj.getcodeing(html)
                            soup = BeautifulSoup(html, 'html5lib', from_encoding=coding)
                            filter_html = soup.find_all( 'div', attrs={"class": "models_nav"})
                            log_mess = "%s:%s %s %s %s %s %s %s" % ( status, type_name, index, brand, manufacturer, series, price, url_id)
                            if filter_html:
                                for item in filter_html:
                                    href = item.find('a', text=u'参数配置').get("href")
                                    stop_sale_conf_url_1 = url + href
                                    obj = GetObj(stop_sale_conf_url_1)
                                    conf = obj.getconf()
                                    if conf:
                                        #print conf
                                        logger.info("%s %s" % (log_mess, href))
                                        """SaveData(table_name="spider_json", brand=brand, series=series, conf=conf, status=status, index=index, level=type_name, URL_=stop_sale_conf_url_1)"""
                                        #print u"在售品牌中的停售车辆"
                                        SaveDataToCarInfo( "carinfo", brand, series, type_name, conf, index)
                                    else:
                                        mess = u"没有找到相关配置"
                                        logger.info("%s %s %s" % (log_mess, mess, href))
                                        #print mess
                            else:
                                mess = u"没有找到相关配置"
                                logger.info("%s %s" % (log_mess, mess))
import sys #小型车可以 #微型车可以 #紧凑型车可以 #中型车 的人数 #大型车可以 #SUV #MPV # reload(sys) sys.setdefaultencoding('utf8') url = "http://newcar.xcar.com.cn/2365/config.htm" obj = GetObj(url) html = obj.gethtml() c = re.compile(r'(var specIDs =)(\[.*\])') coding = obj.getcodeing(html) #得到编码类型 soup = BeautifulSoup(html, "html5lib", from_encoding=coding) print soup temp = re.search(c, html) if hasattr(temp, 'group'): temp = temp.group(2) else: driver = webdriver.Chrome( executable_path= r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe") # driver = webdriver.Firefox() # 打开网页 driver.get(url) ss = json.loads(temp)
def thrad(type_name,url2):
    """Worker thread: crawl one autohome category listing and save each
    unseen car series' raw configuration into the "spider_json" table.

    type_name -- category label (used in log messages and saved rows).
    url2      -- category listing URL (unicode; encoded to UTF-8 below).
    """
    logger.info("name:%s url: %s" % (type_name,url2))
    url2=url2.encode("utf-8")            # request layer expects UTF-8 bytes
    obj = GetObj(url2)                   # crawler object
    html=obj.gethtml()                   # fetch the listing page
    coding=obj.getcodeing(html)          # detect encoding
    soup=BeautifulSoup(html,'html5lib',from_encoding=coding)
    #print "----------------------------------------------"
    #print type_name
    #print "----------------------------------------------"
    logger.info("start %s...." % type_name)
    # The currently-active tab holds this category's ranking list.
    content=soup.find("div",attrs={"class":["tab-content-item","current"]})
    soup=BeautifulSoup(str(content),'html5lib')   # re-parse just that tab
    # Letter headers and their matching brand boxes run in parallel.
    index = soup.find_all('span',attrs={'class':"font-letter"})
    box = soup.find_all('div',attrs={'class':["uibox-con", "rank-list","rank-list-pic"]})
    for (index,box) in zip(index,box):
        #for item in box:
        index = index.text.strip()       # alphabetical index letter
        brand_soup = BeautifulSoup(str(box),'html5lib')
        brand_html=brand_soup.find_all('dl')   # one <dl> per brand
        for brand_item in brand_html:
            brand = brand_item.dt.text.strip() # brand name from <dt>
            series_html = brand_item.dd        # <dd> holds the manufacturers
            series_soup=BeautifulSoup(str(series_html),'html5lib')
            manufacturer_name=series_soup.find_all('div',attrs={"class":"h3-tit"})
            ul=series_soup.find_all('ul',attrs={"class":"rank-list-ul"})
            for (manufacturer,ul_tag) in zip(manufacturer_name,ul):
                manufacturer=manufacturer.text
                logger.info("start %s...." % manufacturer )
                logger.debug(ul_tag)
                soup=BeautifulSoup(str(ul_tag),'html5lib')
                w=re.compile(r's\d+')          # series <li> ids look like s123
                litag=soup.find_all('li',id=w)
                for item in litag:
                    series=item.h4.text        # series name
                    db=ConnectDB()
                    n=db.select(table_name="spider_json",field="series",value=series)
                    db.dbclose()
                    if n != 0:                 # already stored; skip
                        logger.info("%s %s %s exists " % (type_name,brand, series) )
                        continue
                    href=item.h4.a.get("href") # series detail link
                    price=item.div.text        # listed price text
                    url_id=href.split("/")[3]  # numeric id embedded in the href
                    #print "●●%s %s %s" % (series,price,url_id)
                    # Config page for a series still on sale.
                    sale_conf_url="http://car.autohome.com.cn/config/series/%s.html" % url_id
                    # Config page for a discontinued series.
                    stop_sale_conf_url="http://www.autohome.com.cn/%s/sale.html" % url_id
                    url_dic={"sale_conf_url":sale_conf_url,"stop_sale_conf_url":stop_sale_conf_url}
                    #threads=[]
                    for (url_name,sale_url) in url_dic.items():
                        # On-sale branch: config comes straight from the page.
                        if url_name == "sale_conf_url":
                            status=u"在售"
                            #print sale_url
                            #def get_josn():
                            log_mess="%s:%s %s %s %s %s %s %s" % (status,type_name,index,brand,manufacturer,series,price,url_id)
                            obj=GetObj(sale_url)
                            conf=obj.getconf()
                            if conf:
                                #print conf
                                logger.info(log_mess)
                                SaveData(table_name="spider_json",
                                         brand=brand,
                                         series=series,
                                         conf=conf,
                                         status=status,
                                         index=index,
                                         URL_=sale_conf_url)
                            else:
                                mess= u"没有找到相关配置"
                                logger.info("%s %s" % (log_mess,mess))
                                #print mess
                        else:
                            # Discontinued branch: follow the "参数配置" nav link first.
                            #def get_stop_conf():
                            status=u"停售"
                            obj=GetObj(sale_url)
                            html=obj.gethtml()
                            coding=obj.getcodeing(html)
                            soup=BeautifulSoup(html,'html5lib',from_encoding=coding)
                            filter_html=soup.find_all('div',attrs={"class":"models_nav"})
                            log_mess="%s:%s %s %s %s %s %s %s" % (status,type_name,index,brand,manufacturer,series,price,url_id)
                            if filter_html:
                                for item in filter_html:
                                    href=item.find('a',text=u'参数配置').get("href")
                                    stop_sale_conf_url_1=url+href
                                    obj=GetObj(stop_sale_conf_url_1)
                                    conf=obj.getconf()
                                    if conf:
                                        #print conf
                                        logger.info("%s %s" % (log_mess,href))
                                        SaveData(table_name="spider_json",
                                                 brand=brand,
                                                 series=series,
                                                 conf=conf,
                                                 status=status,
                                                 index=index,
                                                 URL_=stop_sale_conf_url_1)
                                        #print u"在售品牌中的停售车辆"
                                    else:
                                        mess= u"没有找到相关配置"
                                        logger.info("%s %s %s" % (log_mess,mess,href))
                                        #print mess
                            else:
                                mess= u"没有找到相关配置"
                                logger.info("%s %s" % (log_mess,mess))