import re
import time
import uuid

import bs4
from selenium import webdriver
from selenium.webdriver.support.ui import Select

# Project-local helpers; the module paths below are assumptions based on the
# class names used in this file and may need adjusting to the repo layout.
from config import Config
from dao import CNVDDao, GongKongDao
from mylog import MyLog
from spiderutil import SpiderUtil


class ChinaGKSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.gongKongDao = GongKongDao()

    def getUrls(self, page):
        urls = []
        start_url = ("http://www.gongkong.com/select/QueryLink?pageIndex="
                     + str(page) + "&articleForm=productInfo")
        soup = SpiderUtil().getSoup(start_url)
        result = soup.find_all("div", class_="main_r_text")
        print(start_url, " number of urls found on this page: ", len(result))
        self.logger.info(start_url + " number of urls found on this page: " + str(len(result)))
        for i in result:
            urls.append(i.a['href'])
        return urls

    def getData(self, url):
        datas = {}
        datas['detailsLink'] = url
        detailsLink = url
        uuidURL = str(uuid.uuid1())
        # Look the url up in the database; parse it only if it is not stored yet.
        if len(self.gongKongDao.getOne(detailsLink)) == 0:
            self.gongKongDao.insertURL(detailsLink, uuidURL)
            soup = SpiderUtil().getSoup(url)
            tab_text = soup.find("div", class_="tab_text")
            if tab_text is None:
                datas['productName'] = soup.find('div', class_='product_title').h1.getText()
                table = soup.find('table', class_='dqfs1')
                for tr in table.children:
                    if type(tr) == bs4.element.Tag:
                        for td in tr.children:
                            if type(td) == bs4.element.Tag:
                                if td.string == '关键字:':  # "Keywords:"
                                    for t in td.next_siblings:
                                        if type(t) == bs4.element.Tag:
                                            datas['keyWord'] = t.getText().strip().replace("\n", "&&")
                                elif td.string == '产品分类:':  # "Product category:"
                                    for t in td.next_siblings:
                                        if type(t) == bs4.element.Tag:
                                            datas['produceCategory'] = t.getText().strip().replace("\040", "&&")
                                elif td.string == '品牌:':  # "Brand:"
                                    for t in td.next_siblings:
                                        if type(t) == bs4.element.Tag:
                                            datas['brand'] = re.sub(r"(\t|\n|\r|\040)*", "", t.getText())
                datas['produceInfo'] = soup.find('dd', style='overflow: auto; line-height: 22px;').getText()
            else:
                # When the page only carries a title, split it into brand, product
                # name and category, e.g.
                # 'http://www.gongkong.com/ProductSeriesNew/Detail?id=31223&categoryId=808'
                for tab in tab_text.children:
                    if type(tab) == bs4.element.Tag:
                        te = re.sub(r"(\040)*", "", tab.getText())
                        tes = te.split("\xa0\xa0")  # \xa0 is the escape for &nbsp;, as shown by repr()
                        brand = tes[0]
                        productName = tes[1]
                        produceCategory = tes[2]
                        datas['brand'] = brand
                        datas['productName'] = productName
                        datas['produceCategory'] = produceCategory
                        print("product name: " + productName)
                        print("product category: " + produceCategory)
                        print("brand: " + brand)
            self.gongKongDao.insertGongKong(datas, uuidURL)

    # Fetch the concrete model numbers by walking the cascading select boxes.
    def getDetail(self, url, id, type):
        detail = {}
        details = []
        # When running outside the IDE:
        # path = os.getcwd()
        # executable_path = path + "\\chromedriver.exe"
        # print(executable_path)
        # browser = webdriver.Chrome(executable_path)
        browser = webdriver.Chrome()
        browser.get(url)
        categroySelect = Select(browser.find_element_by_id("categorySelect_0"))
        categroy = categroySelect.options[id].text
        if categroy != '-请选择-':  # the "-please select-" placeholder option
            detail['categroy'] = categroy
            categroySelect.select_by_visible_text(categroy.strip())
            time.sleep(Config().getSleepTime())
            brandSelect = Select(browser.find_element_by_id("brandSelect_0"))
            for brand in brandSelect.options:
                if brand.text != '-请选择-':
                    # select the brand in the browser
                    detail['brand'] = brand.text
                    brandSelect.select_by_visible_text(brand.text)
                    time.sleep(Config().getSleepTime())
                    productSelect = Select(browser.find_element_by_id("ProductSelect_0"))
                    for product in productSelect.options:
                        if product.text != '-请选择-':
                            detail['product'] = product.text
                            # select the product series in the browser
                            productSelect.select_by_visible_text(product.text)
                            time.sleep(Config().getSleepTime())
                            modelSelect = Select(browser.find_element_by_id("modelSelect_0"))
                            # store every model option
                            for model in modelSelect.options:
                                if model.text != '-请选择-':
                                    detail['model'] = model.text
                                    # append a copy, because detail is reused across iterations
                                    details.append(dict(detail))
                                    print("category: " + detail['categroy'] + " brand: " + detail['brand']
                                          + " series: " + detail['product'] + " model: " + detail['model'])
                                    self.gongKongDao.insertDetail(detail, type)
        browser.close()
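
# A minimal, hypothetical driver for ChinaGKSpider (not part of the original
# source): the page range is an illustrative assumption only.
def runGongKong(pages=3):
    spider = ChinaGKSpider()
    for page in range(1, pages + 1):
        for url in spider.getUrls(page):
            spider.getData(url)
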
class CNVDSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.cnvdDao = CNVDDao()

    def getUrls(self, num):
        urls = []
        start_url = "http://ics.cnvd.org.cn/?max=20&offset=" + str(num)
        self.logger.info("start page: {}".format(start_url))
        soup = SpiderUtil().getSoup(start_url)
        results = soup.find_all('td', style="text-align:left;padding-left:10px;")
        self.logger.info("{} number of urls found on this page: {}".format(start_url, len(results)))
        for result in results:
            urls.append(result.a['href'])
        return urls

    def getData(self, url):
        soup = SpiderUtil().getSoupByWebDriver(url)
        print(url)
        chname = soup.find("div", class_="blkContainerSblk").h1.getText()
        messageResult = {}
        messageResult['chname'] = chname
        tbody = soup.find("table", class_="gg_detail").tbody
        TRlist = tbody.find_all('tr')
        for trlist in TRlist[:-1]:
            if trlist.td.string == "影响产品":  # "Affected products"
                impact_productSum = ''
                if "impact_product" not in messageResult:
                    messageResult["impact_product"] = []
                for td in trlist.td.next_siblings:
                    if type(td) == bs4.element.Tag:
                        for k in td:
                            impact_product = ''
                            if type(k) == bs4.element.Tag:
                                impact_product = re.sub(r"(\t|\n|\r|\040)*", "", k.getText())
                            else:
                                impact_product = re.sub(r"(\t|\n|\r|\040)*", "", k.string)
                            if impact_product != "":
                                if impact_productSum == '':
                                    impact_productSum = impact_product
                                else:
                                    impact_productSum = impact_productSum + ',' + impact_product
                messageResult['impact_product'].append(impact_productSum)
            else:
                name = trlist.td.string
                if name in Config().getCnvdVulList():
                    codename = Config().getCnvdVulList()[name]
                    for td in trlist.td.next_siblings:
                        if type(td) == bs4.element.Tag:
                            tdText = re.sub(r"(\r|\t|\n|\040)*", "", td.getText())
                            if len(tdText):
                                if codename in messageResult:
                                    # the stored value is a string, so concatenate
                                    messageResult[codename] += tdText
                                else:
                                    messageResult[codename] = tdText
                else:
                    self.logger.warning("url:{}, Chname:{}. Label not collected: {}".format(url, chname, name))
        # Fill the fields that were not found on the page with None.
        for name in Config().getCnvdVulList():
            if Config().getCnvdVulList()[name] not in messageResult:
                messageResult[Config().getCnvdVulList()[name]] = None
        self.cnvdDao.insert(messageResult)

    # Check whether this record has already been crawled,
    # i.e. whether the cnvd-id already exists.
    def isExist(self, cnvd_id):
        records = self.cnvdDao.selectByCNVDId(cnvd_id)
        if len(records) == 1:
            return True   # the record exists
        elif len(records) == 0:
            return False  # the record does not exist
        else:
            self.logger.error("query error: cnvd_id:{}, [ERROR]:list:{}".format(cnvd_id, records))
            return

    def getPageNum(self):
        soup = SpiderUtil().getSoupByWebDriver("http://ics.cnvd.org.cn/")
        step = soup.find_all("a", class_="step")
        pageNum = step[len(step) - 1].get_text()
        return int(pageNum)

    # Crawl all vulnerability records.
    def spiderAll(self):
        pageNum = self.getPageNum()
        # Crawl from the last page backwards.
        for i in range(pageNum)[::-1]:
            urls = self.getUrls(i * 20)
            for url in urls[::-1]:
                u = url.split("/")
                cnvdId = u[len(u) - 1]
                if self.isExist(cnvdId) == False:
                    try:
                        self.getData(url)  # the record is new, so fetch and insert it
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))

    # Update with newly published records.
    def update(self):
        pageNum = self.getPageNum()
        # Update from the first page onwards.
        for i in range(pageNum):
            urls = self.getUrls(i * 20)
            for url in urls:
                u = url.split("/")
                cnvdId = u[len(u) - 1]
                if self.isExist(cnvdId) == False:
                    try:
                        self.getData(url)  # the record is new, so fetch and insert it
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))
                elif self.isExist(cnvdId) == True:
                    return  # the record already exists, so stop
        # Known issue: if an update run is interrupted before it finishes and the
        # update is then re-run, some records in between can be lost.
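
# A minimal entry-point sketch (not part of the original source). The intended
# split, per the methods above: spiderAll() back-fills every record on a first
# run, while update() stops as soon as it meets an already-stored cnvd-id.
if __name__ == "__main__":
    spider = CNVDSpider()
    spider.spiderAll()  # first run: full crawl from the last page backwards
    # spider.update()   # later runs: only fetch records newer than the stored ones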