def parserLi(li):
    name = li.find_all("a")[1].text
    infoUrl = root_url + li.find_all("a")[1]['href']
    imageUrl = li.find_all("a")[0]['data-original']
    time = li.find("span", {"class": "note"}).text
    log.info("parser li: name:[%s] image:{%s} infoUrl:{%s}", name, imageUrl, infoUrl)
    # Open the detail page
    soup = BeautifulSoup(spider_tool.get_page(infoUrl), 'lxml')
    type = soup.find("div", {"id": "detail-box"}).find_all('dd')[0].a.text
    day_path = work_path + time + "/"
    if not os.path.exists(day_path):
        os.makedirs(day_path)
    # Download the cover image (skip if it is already on disk)
    imagepath = day_path + type + "___" + name + imageUrl[-4:]
    if not os.path.exists(imagepath):
        spider_tool.down_file(imageUrl, imagepath)
    # Open the download page and pull the video link out of it
    downUrl = root_url + soup.find_all(
        "div", {"class": "ui-box border-gray clearfix"})[1].find_all("a")[1].attrs["href"]
    downSoup = BeautifulSoup(spider_tool.get_page(downUrl), 'lxml')
    videoUrl = downSoup.find("div", {"class": "download"}).a.attrs["href"]
    log.info("start down:type:[%s] name:[%s] video:{%s} path:{%s}", type, name,
             videoUrl, day_path + type + "___" + name + videoUrl[-4:])
    # t = spider_tool.download_aria2(videoUrl, type + "___" + name + videoUrl[-4:], day_path)
    entry = {
        "url": videoUrl,
        "name": type + "___" + name + videoUrl[-4:],
        "path": day_path
    }
    aria2_list.append(entry)
    log.info("end down:type:[%s] name:[%s] video:{%s} path:{%s}", type, name,
             videoUrl, day_path + type + "___" + name + videoUrl[-4:])
    # Return the entry so start_spider can read it back from g.value after gevent.joinall
    return entry
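# spider_tool is an external helper module used above but not shown in this
# section. A minimal sketch of the two calls parserLi makes follows, assuming
# a plain requests backend; the class name and implementation details are
# assumptions that only mirror the call sites, not the real module.
import requests

class SpiderToolSketch:
    @staticmethod
    def get_page(url):
        """Fetch url and return its HTML text (assumed contract of spider_tool.get_page)."""
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        resp.raise_for_status()
        return resp.text

    @staticmethod
    def down_file(url, path):
        """Stream a binary resource to path on disk (assumed contract of spider_tool.down_file)."""
        with requests.get(url, stream=True, timeout=60) as resp:
            resp.raise_for_status()
            with open(path, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)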
def search_ipvm(flag=1):
    url = 'https://ipvm.com/'
    html = get_page(url, sflag=flag)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.find_all(class_='title-link-primary')
        # The original referenced an undefined `func`; a tag filter on the
        # data-datetime attribute (read below) is assumed here.
        time = soup.find_all(lambda tag: tag.has_attr('data-datetime'))
        content = soup.find_all(
            class_='article-snippet text-muted m-b-0 hidden-xs-down')
        content1 = soup.find_all(
            class_='article-snippet text-muted hidden-sm-down')
        time_lst = []
        href_lst = []
        title_lst = []
        content_lst = []
        # The lead story uses a different snippet class, so seed the list with it.
        content_lst.append(content1[0].string)
        for tag in titles:
            title_lst.append(tag.string)
            href_lst.append(tag['href'])
        for tag in time:
            time_lst.append(tag['data-datetime'])
        for tag in content:
            content_lst.append(tag.string)
        for i in range(len(href_lst)):
            insert_info(title=title_lst[i], content=content_lst[i],
                        time=time_lst[i], href=href_lst[i], source="ipvm")
    except Exception as a:
        print(a)
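# The feed scrapers in this section all call a module-level get_page(url,
# sflag=...) whose definition is not shown. A minimal sketch of the assumed
# contract follows -- fetch the page and return its HTML, or None on failure.
# The sflag parameter presumably selects a fetch strategy; its exact meaning
# is unknown here, so this sketch merely accepts it.
import requests

def get_page(url, sflag=1):
    """Return the HTML of url, or None on failure (assumed contract)."""
    try:
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None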
def axis(flag=1):
    url = 'https://www.axis.com/en-hk/support/product-security'
    html = get_page(url, sflag=flag)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        li = soup.find_all(class_='field-item')[3]
        li = li.select('li')
        title_lst = []
        href_lst = []
        # The advisory list carries no dates, so stamp entries with the crawl time.
        date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        for tag in li:
            href_lst.append(tag.a['href'])
            title_lst.append(tag.a.string)
        for i in range(len(title_lst)):
            insert_info(title=title_lst[i], content='', time=date,
                        href=href_lst[i], source='axis')
    except Exception as a:
        print(a)
def mcw0(flag=1):
    url = 'https://github.com/mcw0/PoC'
    html = get_page(url, sflag=flag)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find_all('h2')
        date_lst = []
        for tag in title:
            # The element two siblings past each <h2> starts with a YYYY-MM-DD
            # date; take its first child node and keep the first ten characters.
            date_lst.append(list(tag.next_sibling.next_sibling)[0][:10])
        title_lst = []
        href_lst = []
        for tag in title:
            title_lst.append(tag.text)
            # GitHub hrefs are root-relative, so join against the site root
            # rather than the repository URL.
            href_lst.append('https://github.com' + tag.a['href'])
        for i in range(len(title_lst)):
            insert_info(title=title_lst[i], content='', time=date_lst[i],
                        href=href_lst[i], source='bashis')
    except Exception as a:
        print(a)
def cnvd(flag=1):
    url = "http://www.cnvd.org.cn"
    html = get_page(url, sflag=flag)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.select('.t1_tab_b > ul > li > a')
        date = soup.find_all('span', class_='t1_sp_right')
        title_lst = []
        href_lst = []
        date_lst = []
        if titles:
            for tag in titles:
                title_lst.append(tag['title'])
                href_lst.append(url + tag['href'])
        if date:
            for tag in date:
                date_lst.append(tag.string.strip())
        for i in range(len(title_lst)):
            insert_info(title=title_lst[i], content='', time=date_lst[i],
                        href=href_lst[i], source='cnvd')
    except Exception:
        pass
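# insert_info(...) is the shared persistence hook every feed scraper funnels
# into; its implementation is not shown in this section. A minimal sqlite3
# sketch of the assumed signature (title/content/time/href/source keyword
# arguments) follows; the database file, table, and dedupe-on-href behaviour
# are hypothetical.
import sqlite3

def insert_info(title='', content='', time='', href='', source=''):
    """Persist one scraped item, skipping duplicates by href (assumed behaviour)."""
    conn = sqlite3.connect('vuln_news.db')
    conn.execute(
        "CREATE TABLE IF NOT EXISTS info ("
        "title TEXT, content TEXT, time TEXT, href TEXT UNIQUE, source TEXT)")
    conn.execute(
        "INSERT OR IGNORE INTO info (title, content, time, href, source) "
        "VALUES (?, ?, ?, ?, ?)", (title, content, time, href, source))
    conn.commit()
    conn.close()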
def search_anquanke(flag=1):
    url = 'https://www.anquanke.com/'
    html = get_page(url, sflag=flag)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.find_all(class_='title')
        content = soup.find_all(class_='desc hide-in-mobile-device')
        time = soup.find_all(class_='date')
        title_lst = []
        href_lst = []
        content_lst = []
        time_lst = []
        for tag in titles:
            if tag.a:
                title_lst.append(tag.a.string)
                # hrefs start with "/", so drop it before joining onto the base URL.
                href_lst.append(url + tag.a['href'][1:])
        for tag in content:
            content_lst.append(tag.string)
        for tag in time:
            time_lst.append(tag.span.text.strip())
        for i in range(len(href_lst)):
            insert_info(title=title_lst[i], content=content_lst[i],
                        time=time_lst[i], href=href_lst[i], source='anquanke')
    except Exception as e:
        print(e)
def eanqun(flag=1):
    url = "https://www.easyaq.com/daily"
    html = get_page(url, sflag=flag)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.find_all('h3')
        content = soup.find_all('p')
        date = soup.select('div[class="source"] > span > span')
        href_lst = []
        content_lst = []
        title_lst = []
        date_lst = []
        if titles:
            for tag in titles:
                title_lst.append(tag.a.string)
                href_lst.append("https://www.easyaq.com" + tag.a['href'])
        if content:
            for tag in content:
                content_lst.append(tag.string)
        if date:
            for tag in date:
                date_lst.append(tag.string)
        for i in range(len(title_lst)):
            insert_info(title=title_lst[i], content=content_lst[i],
                        time=date_lst[i], href=href_lst[i], source='eanquan')
    except Exception:
        pass
def cert(flag=1):
    url = "http://cert.360.cn"
    html = get_page(url, sflag=flag)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.find_all(class_='news-title')
        date = soup.find_all(class_='news-date')
        # "news-conent" is kept verbatim: the selector must match the site's
        # own class name, misspelled or not.
        content = soup.find_all(class_="news-conent")
        title_lst = []
        href_lst = []
        time_lst = []
        content_lst = []
        for tag in titles:
            title_lst.append(tag.a.string)
            href_lst.append(url + tag.a['href'])
        for tag in date:
            # Keep only the date portion after the first space.
            time_lst.append(tag.string.split(' ')[1].strip())
        for tag in content:
            content_lst.append(tag.string)
        for i in range(len(title_lst)):
            insert_info(title=title_lst[i], content=content_lst[i],
                        time=time_lst[i], href=href_lst[i], source='360cert')
    except Exception:
        pass
def start_spider(url=root_url, info=""):
    t1 = time.time()
    log.info("start spider: %s info:%s thread id:%s", url, info, threading.get_ident())
    soup = BeautifulSoup(spider_tool.get_page(url), 'lxml')
    # Monkey-patch blocking I/O so the spawned greenlets can yield to each other.
    # (gevent recommends calling patch_all() as early in the program as possible.)
    monkey.patch_all()
    g_list = []
    ul_list = soup.find_all('ul', {'class': 'clearfix'})
    for ul_index in range(0, len(ul_list) - 1):
        for li in ul_list[ul_index].find_all('li'):
            try:
                g_list.append(gevent.spawn(parserLi, li))
            except Exception as err:
                log.error(err)
    gevent.joinall(g_list)
    time.sleep(5)
    for g in g_list:
        # Each greenlet's value is the dict returned by parserLi.
        entry = g.value
        log.info("spider_tool.download_aria2(%s,%s,%s)", entry['url'], entry['name'], entry['path'])
        if not exists_check(entry['url'], entry['name'], entry['path']) and is_prod:
            spider_tool.download_aria2(entry['url'], entry['name'], entry['path'])
    t2 = time.time()
    log.info("end spider: elapsed:%s url:%s info:%s thread id:%s",
             (t2 - t1), url, info, threading.get_ident())
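# exists_check(url, name, path) is referenced above but not defined in this
# section. The natural reading of the call site is "skip the aria2 download
# when the target file is already on disk"; a minimal sketch under that
# assumption:
import os

def exists_check(url, name, path):
    """Return True if the download target path/name already exists on disk."""
    return os.path.exists(os.path.join(path, name))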
def start():
    if not os.path.exists(work_path):
        os.makedirs(work_path)
    soup = BeautifulSoup(spider_tool.get_page(root_url), 'lxml')
    ul = soup.find_all('ul', {'class': 'clearfix'})[0]
    for li in ul.find_all('li'):
        try:
            parserLi(li)
        except Exception as err:
            log.error(err)
    for entry in aria2_list:
        spider_tool.download_aria2(entry['url'], entry['name'], entry['path'])
def anquanniu(flag=1):
    url = 'https://www.aqniu.com/'
    html = get_page(url, sflag=flag)
    try:
        # Map Chinese month names to zero-padded month numbers.
        month_map = {
            "一月": '01', "二月": '02', "三月": '03', "四月": '04',
            "五月": '05', "六月": '06', "七月": '07', "八月": '08',
            "九月": '09', "十月": '10', "十一月": '11', "十二月": '12'
        }
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.select('.post > div > div > h4')
        content = soup.select('.post > div > div > p')
        date = soup.select('.post > div > div > div > span[class="date"]')
        href_lst = []
        content_lst = []
        title_lst = []
        date_lst = []
        if titles:
            for tag in titles:
                title_lst.append(tag.a.string)
                href_lst.append(tag.a['href'])
        if content:
            for tag in content:
                content_lst.append(tag.string)
        if date:
            for tag in date:
                # Split the date string on commas and rebuild it as YYYY-MM-DD,
                # mapping the Chinese month name through month_map.
                s = tag.string.split(',')
                year = s[2][1:]
                month = month_map[s[1][1:3]]
                day = s[1][4:]
                date_lst.append('-'.join([year, month, day]))
        for i in range(len(title_lst)):
            insert_info(title=title_lst[i], content=content_lst[i],
                        time=date_lst[i], href=href_lst[i], source='anquanniu')
    except Exception:
        pass
def vdoo(flag=1):
    url = 'https://www.vdoo.com/blog/'
    html = get_page(url, sflag=flag)
    try:
        # Map English month names to zero-padded month numbers.
        month_map = {
            "January": '01', "February": '02', "March": '03', "April": '04',
            "May": '05', "June": '06', "July": '07', "August": '08',
            "September": '09', "October": '10', "November": '11', "December": '12'
        }
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.find_all(class_='posts-group-item-title')
        content = soup.find_all(class_='posts-group-item-snippet')
        # Post links are the <a> tags carrying an inline style attribute.
        href = soup.select('a[style]')
        date = soup.find_all(class_='posts-group-item-date')
        title_lst = []
        content_lst = []
        href_lst = []
        date_lst = []
        for tag in titles:
            title_lst.append(tag.string)
        for tag in content:
            content_lst.append(tag.string)
        for tag in href:
            href_lst.append('https://www.vdoo.com' + tag['href'])
        for tag in date:
            # Rebuild dates like "July 9, 2019" as YYYY-MM-DD.
            s = tag.string.split(',')
            year = s[1][1:]
            month = month_map[s[0].split(' ')[0]]
            day = s[0].split(' ')[1]
            date_lst.append('-'.join([year, month, day]))
        for i in range(len(title_lst)):
            insert_info(title=title_lst[i], content=content_lst[i],
                        time=date_lst[i], href=href_lst[i], source='vdoo')
    except Exception:
        pass
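# For English month names such as vdoo's "July 9, 2019", the hand-rolled
# month_map above could be replaced by datetime.strptime with the %B directive
# (locale-dependent, so this assumes an English locale). The Chinese dates in
# anquanniu still need the explicit mapping. A sketch of the alternative:
import datetime

def parse_en_date(s):
    """'July 9, 2019' -> '2019-07-09'."""
    return datetime.datetime.strptime(s.strip(), '%B %d, %Y').strftime('%Y-%m-%d')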
def hik(flag=1):
    url = 'http://www.hikvision.com/cn/support_list_591.html'
    html = get_page(url, sflag=flag)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        tags = soup.select('li[class="clearfix"]')
        titles = []
        hrefs = []
        times = []
        for tag in tags:
            titles.append(tag.a.string)
            hrefs.append('http://www.hikvision.com/cn/' + tag.a['href'])
            times.append(tag.span.string)
        for i in range(len(titles)):
            insert_info(title=titles[i], time=times[i], href=hrefs[i], source="hik")
    except Exception as a:
        print(a)
def yushi(flag=1):
    url = 'http://cn.uniview.com/Security/Notice/'
    html = get_page(url, sflag=flag)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        tags = soup.select('ul[id="NewsListStyle"]')
        href_lst = []
        title_lst = []
        time_lst = []
        for tag in tags[0].find_all('a'):
            href_lst.append('http://cn.uniview.com' + tag['href'])
            title_lst.append(tag.string)
            # The fourth path segment of the href encodes the date:
            # the first four characters are the year, the last two the month.
            seg = tag['href'].split('/')[3]
            time_lst.append(seg[:4] + '-' + seg[-2:])
        for i in range(len(title_lst)):
            insert_info(title=title_lst[i], time=time_lst[i],
                        href=href_lst[i], source='uniview')
    except Exception as a:
        print(a)