class SpiderUtil():
    """Shared helpers for fetching a URL and parsing it into a BeautifulSoup tree."""

    def __init__(self):
        # BUG FIX: the method was misspelled "__int__", so Python never ran it
        # as the constructor and self.logger was never set — getSoup() would
        # raise AttributeError on its error path.
        self.logger = MyLog().getLogger()

    def getSoup(self, url):
        """Fetch *url* with urllib, retrying up to Config().getMAX_NUM() times.

        Returns a BeautifulSoup tree on the first successful fetch, or None
        when every attempt fails (the failure is logged).
        """
        req = request.Request(url, headers=Config().getHeader())
        for i in range(Config().getMAX_NUM()):
            try:
                resp = request.urlopen(req)
            except urllib.error.URLError:
                if i < Config().getMAX_NUM() - 1:
                    continue  # retry
                self.logger.error("{}:{}:次之后还是失败".format(
                    url, Config().getMAX_NUM()))
                return
            content = resp.read()
            soup = BeautifulSoup(content, "lxml")
            return soup

    def getSoupByWebDriver(self, url):
        """Fetch *url* through a real browser (Webdriver) and parse the page source.

        Needed for pages that require JavaScript / anti-bot cookies.
        """
        webdriver = Webdriver()
        content = webdriver.getPage_source(url)
        soup = BeautifulSoup(content, "lxml")
        webdriver.close()
        return soup
def __init__(self):
    """Locate the geckodriver binary next to this file and start Firefox."""
    __driver = "geckodriver"
    self.logger = MyLog().getLogger()
    # Resolve the driver path relative to this source file, per OS.
    driverpath = ""
    base_dir = os.path.abspath(os.path.dirname(__file__))
    if platform.system() == "Windows":
        driverpath = os.path.join(base_dir, __driver + ".exe")
    elif platform.system() == "Linux":
        driverpath = os.path.join(base_dir, __driver)
    else:
        # Unsupported platform: log where the driver was expected.
        self.logger.error("浏览器驱动文件出错:未在以下文件夹下"
                          "查找到驱动文件{}:{}".format(
                              __driver, os.path.dirname(driverpath)))
    # Firefox launch options; headless mode is intentionally left disabled.
    options = webdriver.FirefoxOptions()
    self.browser = webdriver.Firefox(firefox_options=options,
                                     executable_path=driverpath)
    # Implicit wait: element lookups allow up to 10s for the page to load.
    self.browser.implicitly_wait(10)
class Webdriver():
    """Thin wrapper around a Selenium-driven Firefox browser."""

    def __init__(self):
        """Find the geckodriver executable next to this file and launch Firefox."""
        __driver = "geckodriver"
        self.logger = MyLog().getLogger()
        driverpath = ""
        location = os.path.abspath(os.path.dirname(__file__))
        if platform.system() == "Windows":
            driverpath = os.path.join(location, __driver + ".exe")
        elif platform.system() == "Linux":
            driverpath = os.path.join(location, __driver)
        else:
            self.logger.error("浏览器驱动文件出错:未在以下文件夹下"
                              "查找到驱动文件{}:{}".format(
                                  __driver, os.path.dirname(driverpath)))
        # Launch options; the headless flag is intentionally left off.
        options = webdriver.FirefoxOptions()
        self.browser = webdriver.Firefox(firefox_options=options,
                                         executable_path=driverpath)
        # Allow element lookups up to 10s for the page to finish loading.
        self.browser.implicitly_wait(10)

    def getPage_source(self, url):
        """Load *url*, refresh once so dynamic content settles, return the HTML."""
        self.browser.get(url)
        self.browser.refresh()
        time.sleep(0.3)
        return self.browser.page_source

    def getCookies(self, url):
        """Load *url* and serialize the browser cookies as 'name=value;...'.

        get_cookies() returns a list of dicts carrying at least
        'name' and 'value' keys; only those two are used here.
        """
        self.browser.get(url)
        self.browser.refresh()
        time.sleep(0.3)
        jar = self.browser.get_cookies()
        return "".join(c['name'] + "=" + c["value"] + ";" for c in jar)

    def close(self):
        """Close the current browser window."""
        self.browser.close()
def __init__(self):
    """Prepare request headers, a logger, and the CNNVD persistence object."""
    config = Config()
    self.header = config.getHeader()
    self.logger = MyLog().getLogger()
    self.cnnvdDao = CnnvdDao()
class CnnvdSpider():
    """Spider for vulnerability entries on www.cnnvd.org.cn."""

    def __init__(self):
        """Prepare request headers, a logger, and the CNNVD DAO."""
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.cnnvdDao = CnnvdDao()

    def spiderAll(self):
        """Crawl every listing page and persist each detail record."""
        start_url = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
        for url in self.getUrls(start_url):
            record = self.getDetailData(url)
            self.cnnvdDao.insert(record)

    def getUrls(self, start_url):
        """Collect every vulnerability detail URL across all listing pages."""
        urls = []
        soup = SpiderUtil().getSoup(start_url)
        total_pages = self.getTotalPage(soup)
        # NOTE(review): the original comment claimed "crawl from the last
        # page", but iteration has always been forward; preserved as-is.
        for page_no in range(total_pages):
            self.getDetailUrls(page_no, urls)
        return urls

    def getTotalPage(self, soup):
        """Derive the page count from the total-record text in the pager div."""
        totalNum = 0
        for text in soup.find('div', class_='page').getText().split("\n"):
            if text != '':
                totalNum = int(re.sub("\D", "", text))  # keep digits only
                break
        if totalNum == 0:
            self.logger.error("getTotalNum Error")
        # 10 records per page; round up when there is a partial page.
        return int(totalNum / 10 + 1) if totalNum % 10 != 0 else int(totalNum / 10)

    def getDetailUrls(self, page, urls):
        """Append the detail links found on listing page *page* to *urls*."""
        url = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=" + str(
            page)
        soup = SpiderUtil().getSoup(url)
        container = soup.find('div', class_='list_list')
        for entry in container.findAll('div', class_='f1'):
            urls.append(entry.a['href'])

    def getDetailData(self, url):
        """Scrape one CNNVD detail page into a flat dict keyed by code names."""
        data = {'detailUrl': url}
        soup = SpiderUtil().getSoup(url)
        details = soup.find('div', class_='detail_xq w770')
        data['chname'] = details.h2.getText()
        # Key/value rows: "label:value" pairs mapped through the config table.
        for li in details.ul:
            if type(li) == bs4.element.Tag:
                texts = re.sub("(\t|\n|\r|\040)*", "", li.getText()).split(":")
                if texts[0] in Config().getCnnvdVulList():
                    codeName = Config().getCnnvdVulList()[texts[0]]
                    data[codeName] = texts[1]
                    print(codeName + ": " + data[codeName])
        # Vulnerability description paragraphs.
        data['vul_description'] = ''
        for para in soup.find('div', class_='d_ldjj').findAll(
                'p', style='text-indent:2em'):
            data['vul_description'] += re.sub("(\t|\n|\r|\040)*", "",
                                              para.getText())
        # Announcement / reference / affected-entity sections.
        for content in soup.findAll('div', class_='d_ldjj m_t_20'):
            title = re.sub("(\t|\n|\r|\040)*", "",
                           content.find('div', class_='title_bt').getText())
            if title in Config().getCnnvdVulList():
                codeName = Config().getCnnvdVulList()[title]
                data[codeName] = ''
                for x in content.findAll('p', style='text-indent:2em'):
                    data[codeName] += re.sub("(\t|\n|\r|\040)*", "",
                                             x.getText())
        return data
def __init__(self):
    """Create the per-instance logger.

    BUG FIX: the method was misspelled ``__int__``, so Python never called it
    as the constructor and ``self.logger`` was never assigned — any later
    ``self.logger`` access would raise AttributeError.
    """
    self.logger = MyLog().getLogger()
def __init__(self):
    """Prepare request headers, a logger, and the gongkong persistence object."""
    config = Config()
    self.header = config.getHeader()
    self.logger = MyLog().getLogger()
    self.gongKongDao = GongKongDao()
class ChinaGKSpider():
    """Spider for industrial-control product data on www.gongkong.com."""

    def __init__(self):
        """Prepare request headers, a logger, and the gongkong DAO."""
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.gongKongDao = GongKongDao()

    def getUrls(self, page):
        """Return the product-detail URLs listed on search-result page *page*."""
        urls = []
        start_url = "http://www.gongkong.com/select/QueryLink?pageIndex=" + str(
            page) + "&articleForm=productInfo"
        soup = SpiderUtil().getSoup(start_url)
        result = soup.findAll("div", class_="main_r_text")
        print(start_url, " 页面获取到的url个数: ", len(result))
        self.logger.info(start_url + " 页面获取到的url个数: " + str(len(result)))
        for i in result:
            urls.append(i.a['href'])
        return urls

    def getData(self, url):
        """Scrape one product page and persist it, skipping already-seen URLs.

        The page comes in two layouts: a full table layout (no ``tab_text``
        div) and a compact title-only layout whose single line is split into
        brand / name / category.
        """
        datas = {}
        datas['detailsLink'] = url
        detailsLink = url
        uuidURL = str(uuid.uuid1())
        # Only parse URLs that are not yet in the database.
        if len(self.gongKongDao.getOne(detailsLink)) == 0:
            self.gongKongDao.insertURL(detailsLink, uuidURL)
            soup = SpiderUtil().getSoup(url)
            tab_text = soup.find("div", class_="tab_text")
            if tab_text is None:  # full table layout
                datas['productName'] = soup.find(
                    'div', class_='product_title').h1.getText()
                table = soup.find('table', class_='dqfs1')
                for tr in table.children:
                    if type(tr) == bs4.element.Tag:
                        for td in tr.children:
                            if type(td) == bs4.element.Tag:
                                if td.string == '关键字:':
                                    for t in td.next_siblings:
                                        if type(t) == bs4.element.Tag:
                                            datas['keyWord'] = t.getText(
                                            ).strip().replace("\n", "&&")
                                elif td.string == '产品分类:':
                                    for t in td.next_siblings:
                                        if type(t) == bs4.element.Tag:
                                            datas[
                                                'produceCategory'] = t.getText(
                                                ).strip().replace(
                                                    "\040", "&&")
                                elif td.string == '品牌:':
                                    for t in td.next_siblings:
                                        if type(t) == bs4.element.Tag:
                                            datas['brand'] = re.sub(
                                                "(\t|\n|\r|\040)*", "",
                                                t.getText())
                datas['produceInfo'] = soup.find(
                    'dd', style='overflow: auto; line-height: 22px;').getText()
            else:
                # Title-only layout, e.g.
                # 'http://www.gongkong.com/ProductSeriesNew/Detail?id=31223&categoryId=808'
                # The title splits on \xa0\xa0 into brand / name / category.
                for tab in tab_text.children:
                    if type(tab) == bs4.element.Tag:
                        te = re.sub("(\040)*", "", tab.getText())
                        tes = te.split("\xa0\xa0")
                        brand = tes[0]
                        productName = tes[1]
                        produceCategory = tes[2]
                        datas['brand'] = brand
                        datas['productName'] = productName
                        datas['produceCategory'] = produceCategory
                        print("产品名字:" + productName)
                        print("产品分类:" + produceCategory)
                        print("品牌:" + brand)
            self.gongKongDao.insertGongKong(datas, uuidURL)

    def getDetail(self, url, id, type):
        """Walk the cascading category/brand/product/model selects on *url*
        and persist every concrete model.

        *id* is the index of the category option to select; *type* is passed
        through to the DAO. (NOTE: both names shadow builtins; kept for
        backward compatibility with keyword callers.)
        """
        detail = {}
        browser = webdriver.Chrome()
        # BUG FIX: the sleeps below called Config.getSleepTime() on the class
        # (missing the instance call), which raises TypeError; every other
        # call site in this project uses Config().getSleepTime().
        try:
            browser.get(url)
            categroySelect = Select(
                browser.find_element_by_id("categorySelect_0"))
            categroy = categroySelect.options[id].text
            if categroy != '-请选择-':
                detail['categroy'] = categroy
                categroySelect.select_by_visible_text(categroy.strip())
                time.sleep(Config().getSleepTime())
                brandSelect = Select(
                    browser.find_element_by_id("brandSelect_0"))
                for brand in brandSelect.options:
                    if brand.text != '-请选择-':
                        detail['brand'] = brand.text
                        brandSelect.select_by_visible_text(brand.text)
                        time.sleep(Config().getSleepTime())
                        productSelect = Select(
                            browser.find_element_by_id("ProductSelect_0"))
                        for product in productSelect.options:
                            if product.text != '-请选择-':
                                detail['product'] = product.text
                                productSelect.select_by_visible_text(
                                    product.text)
                                time.sleep(Config().getSleepTime())
                                modelSelect = Select(
                                    browser.find_element_by_id(
                                        "modelSelect_0"))
                                for model in modelSelect.options:
                                    if model.text != '-请选择-':
                                        detail['model'] = model.text
                                        print("类别: " + detail['categroy'] +
                                              " 品牌:" + detail['brand'] +
                                              " 系列:" + detail['product'] +
                                              " 型号:" + detail['model'])
                                        self.gongKongDao.insertDetail(
                                            detail, type)
        finally:
            # BUG FIX: close the browser even when scraping raises,
            # so driver sessions are not leaked.
            browser.close()
def __init__(self):
    """Prepare request headers, a logger, and the URL-keyword persistence object."""
    config = Config()
    self.header = config.getHeader()
    self.logger = MyLog().getLogger()
    self.urlKeyWordDao = UrlKeyWordDao()
class KeyWordAnalyst():
    """Count keyword frequency across crawled URLs.

    REFACTOR: the original had the same count-filter-sort-report loop pasted
    four times (``dataPreparation``, ``dataPreparationXXX``, ``xx`` twice via
    the By* methods); the shared logic now lives in ``_countKeywords`` and
    ``_report``. Public method names and behavior are unchanged.
    """

    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.urlKeyWordDao = UrlKeyWordDao()

    @staticmethod
    def _countKeywords(dataList, separator, pattern):
        """Split every record's ``url`` on *separator*, drop empty pieces and
        pieces matching *pattern*, and return (key, count) pairs sorted by
        count, descending."""
        dataCount = {}
        for data in dataList:
            for key in data.url.split(separator):
                if pattern.match(key) is None and key != '':
                    dataCount[key] = dataCount.get(key, 0) + 1
        return sorted(dataCount.items(), key=lambda e: e[1], reverse=True)

    def _report(self, sortedItems):
        """Print and debug-log each (key, count) pair."""
        for item in sortedItems:
            print(item)
            self.logger.debug(item)

    def dataPreparation(self):
        """Count dot-separated URL tokens over all stored URLs (filter 1)."""
        dataList = self.urlKeyWordDao.listURL()
        pattern = re.compile(Config().getFilterFile())
        self._report(self._countKeywords(dataList, ".", pattern))

    def dataPreparationXXX(self):
        """Count slash-separated URL tokens over all stored URLs (filter 2)."""
        dataList = self.urlKeyWordDao.listURL()
        pattern = re.compile(Config().getFilterFile2())
        self._report(self._countKeywords(dataList, "/", pattern))

    def dataPreparationByEngine(self, engine):
        """Count URL tokens for records found through search engine *engine*."""
        self.xx(self.urlKeyWordDao.listURLByEngine(engine))

    def dataPreparationBySearchWord(self, searchWord):
        """Count URL tokens for records found for *searchWord*."""
        self.xx(self.urlKeyWordDao.listURLBySearchWord(searchWord))

    def xx(self, dataList):
        """Count slash-separated URL tokens in *dataList* (filter 2)."""
        pattern = re.compile(Config().getFilterFile2())
        self._report(self._countKeywords(dataList, "/", pattern))
def __init__(self):
    """Build a SQLAlchemy session factory, open a session, ensure all
    declarative tables exist, and attach a logger."""
    self.DBSession = sqlalchemy.orm.sessionmaker(bind=MysqlConfig().getEngine())
    self.session = self.DBSession()
    # Create any missing tables for the declarative base's models.
    MysqlConfig().getBaseModel().metadata.create_all(MysqlConfig().getEngine())
    self.logger = MyLog().getLogger()
class CNVDSpider():
    """Spider for ICS vulnerability bulletins on ics.cnvd.org.cn."""

    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.cnvdDao = CNVDDao()

    def getUrls(self, num):
        """Return the bulletin URLs on the listing page starting at offset *num*."""
        urls = []
        start_url = "http://ics.cnvd.org.cn/?max=20&offset=" + str(num)
        self.logger.info("开始页面:{}".format(start_url))
        soup = SpiderUtil().getSoup(start_url)
        results = soup.find_all('td',
                                style="text-align:left;padding-left:10px;")
        self.logger.info("{} 页面获取到的url个数:{}".format(start_url, len(results)))
        for result in results:
            urls.append(result.a['href'])
        return urls

    def getData(self, url):
        """Parse one bulletin page (via the browser) and insert the record."""
        soup = SpiderUtil().getSoupByWebDriver(url)
        print(url)
        chname = soup.find("div", class_="blkContainerSblk").h1.getText()
        messageResult = {}
        messageResult['chname'] = chname
        tbody = soup.find("table", class_="gg_detail").tbody
        TRlist = tbody.find_all('tr')
        for trlist in TRlist[:-1]:
            if trlist.td.string == "影响产品":
                impact_productSum = ''
                # BUG FIX: the guard used the Chinese label "影响产品" as the
                # key, which is never inserted, so the list was re-created
                # (discarding earlier rows) every time; check the real key.
                if "impact_product" not in messageResult:
                    messageResult["impact_product"] = []
                for td in trlist.td.next_siblings:
                    if type(td) == bs4.element.Tag:
                        for k in td:
                            impact_product = ''
                            if type(k) == bs4.element.Tag:
                                impact_product = re.sub(
                                    "(\t|\n|\r|\040)*", "", k.getText())
                            else:
                                impact_product = re.sub(
                                    "(\t|\n|\r|\040)*", "", k.string)
                            if impact_product != "":
                                if impact_productSum == '':
                                    impact_productSum = impact_product
                                else:
                                    impact_productSum = impact_productSum + ',' + impact_product
                messageResult['impact_product'].append(impact_productSum)
            else:
                name = trlist.td.string
                if name in Config().getCnvdVulList():
                    codename = Config().getCnvdVulList()[name]
                    for td in trlist.td.next_siblings:
                        if type(td) == bs4.element.Tag:
                            tdText = re.sub(r"(\r|\t|\n|\040)*", "",
                                            td.getText())
                            if len(tdText):
                                if codename in messageResult:
                                    # BUG FIX: values are stored as str, but
                                    # the old code called .append() on them
                                    # (AttributeError); accumulate like the
                                    # impact_product branch does instead.
                                    messageResult[codename] += ',' + tdText
                                else:
                                    messageResult[codename] = tdText
                else:
                    self.logger.warning("url:{}, Chname:{}。 未收入的标签:{}".format(
                        url, chname, name))
        for name in Config().getCnvdVulList():
            if Config().getCnvdVulList()[name] not in messageResult:
                # BUG FIX: was `NULL`, which is undefined in Python and raised
                # NameError whenever a field was missing; use None.
                messageResult[Config().getCnvdVulList()[name]] = None
        self.cnvdDao.insert(messageResult)

    def isExist(self, cnvd_id):
        """Return True when *cnvd_id* is stored, False when not, None on a
        query anomaly (more than one row)."""
        rows = self.cnvdDao.selectByCNVDId(cnvd_id)
        if len(rows) == 1:
            return True
        elif len(rows) == 0:
            return False
        else:
            self.logger.error("查询出错:cnvd_id:{}, [ERROR]:list:{}".format(
                cnvd_id, rows))
            return

    def getPageNum(self):
        """Read the last pager step on the front page to get the page count."""
        soup = SpiderUtil().getSoupByWebDriver("http://ics.cnvd.org.cn/")
        step = soup.find_all("a", class_="step")
        pageNum = step[len(step) - 1].get_text()
        return int(pageNum)

    def spiderAll(self):
        """Crawl every bulletin, starting from the last page."""
        pageNum = self.getPageNum()
        for i in range(pageNum)[::-1]:
            urls = self.getUrls(i * 20)
            for url in urls[::-1]:
                u = url.split("/")
                cnvdId = u[len(u) - 1]
                # NOTE: `is False` is deliberate — isExist() returns None on a
                # query anomaly, and those URLs must be skipped, not fetched.
                if self.isExist(cnvdId) is False:
                    try:
                        self.getData(url)  # new bulletin: fetch and insert
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))

    def update(self):
        """Fetch new bulletins from page 1 until a known one is seen.

        Known limitation (inherited): if a previous update was interrupted,
        records between the first known bulletin and the interruption point
        are never revisited.
        """
        pageNum = self.getPageNum()
        for i in range(pageNum):
            urls = self.getUrls(i * 20)
            for url in urls:
                u = url.split("/")
                cnvdId = u[len(u) - 1]
                if self.isExist(cnvdId) is False:
                    try:
                        self.getData(url)  # new bulletin: fetch and insert
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))
                elif self.isExist(cnvdId) is True:
                    return  # already stored: everything older is known too
class UrlKeyWordSpider():
    """Collect search-result URLs from Baidu and Google for given keywords."""

    def __init__(self):
        """Prepare request headers, a logger, and the URL-keyword DAO."""
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.urlKeyWordDao = UrlKeyWordDao()

    def getUrlsByBaidu(self, keyWord):
        """Crawl all Baidu result pages for *keyWord* and persist the hits."""
        datas = []
        # Percent-encode the (possibly Chinese) keyword for the query string.
        url = "/s?wd=" + urllib.parse.quote(keyWord)
        self.getByBaidu(url, keyWord, datas)
        self.urlKeyWordDao.insert(datas)

    def getUrlsByGoogle(self, keyWord):
        """Crawl Google result pages for *keyWord* via a browser and persist."""
        datas = []
        start_url = "https://www.google.com.hk/search?q=" + urllib.parse.quote(
            keyWord)
        browser = webdriver.Chrome()
        browser.get(start_url)
        # Keep clicking the next-page link until it disappears.
        while self.isElementExist(browser, '下一页'):
            browser.find_element_by_link_text('下一页').click()
            soup = BeautifulSoup(browser.page_source, "html.parser",
                                 from_encoding='UTF-8')
            self.getDataByGoogle(soup, keyWord, datas)
            time.sleep(Config().getSleepTime())
        browser.close()
        self.urlKeyWordDao.insert(datas)

    def getDataByGoogle(self, soup, keyWord, datas):
        """Extract (url, title) pairs from one Google result page into *datas*."""
        for result in soup.findAll('div', class_='rc'):
            try:
                datas.append({
                    'url': result.find('cite', class_='_Rm').getText(),
                    'urlTitle': result.h3.getText(),
                    'searchEngine': "Google",
                    'searchWord': keyWord,
                })
            except Exception as e:
                self.logger.error("getData获取数据错误:[error]:{}……result:{}".format(
                    e, str(result).replace(u'\xa0', u' ')))

    def getByBaidu(self, url, keyWord, datas):
        """Fetch one Baidu page, harvest it, and recurse into the next page."""
        soup = SpiderUtil().getSoup("https://www.baidu.com" + url)
        self.getDataByBaidu(soup, keyWord, datas)
        nextUrl = self.getNextPageUrl(soup)
        if nextUrl != -1:
            self.getByBaidu(nextUrl, keyWord, datas)

    def getDataByBaidu(self, soup, keyWord, datas):
        """Extract (url, title) pairs from one Baidu result page into *datas*."""
        for result in soup.findAll('div', class_="f13"):
            try:
                datas.append({
                    'url': result.a.getText(),
                    'urlTitle': result.div['data-tools'],
                    'searchEngine': "百度",
                    'searchWord': keyWord,
                })
            except Exception as e:
                self.logger.error("getData获取数据错误:[error]:{}……result:{}".format(
                    e, str(result).replace(u'\xa0', u' ')))

    def getNextPageUrl(self, soup):
        """Return the href of the '下一页>' pager link, or -1 when absent."""
        anchors = soup.find('div', id='page').findAll('a')
        if not anchors:
            return -1
        last = anchors[-1]
        if last.getText() != '下一页>':
            return -1
        return last['href']

    def isElementExist(self, browser, element):
        """True when a link with text *element* exists on the current page."""
        try:
            browser.find_element_by_link_text(element)
            return True
        except:
            return False