def analysis_rss(self, list_obj):
    logging.info('-- analysis rss --')
    if self.lclass == "intelligence":
        for i in self.content:
            u = {
                "class": self.lclass,
                "title": self.get_value(i, list_obj['response']['title']).strip(),
                "summary": self.get_value(i, list_obj['response']['summary']),
                "publish_time": time.strftime(
                    "%Y-%m-%d %H:%M",
                    self.get_value(i, list_obj['response']['publish_time'])),
                "source": self.get_value(i, list_obj['response']['source']),
                "raw_url": self.get_value(i, list_obj['response']['raw_url'])
            }
            url = u['raw_url']
            uhash = str(md5(url))
            if self.unique_url(url):
                u["rhash"] = uhash
                redis_c.lpush('result', json.dumps(u))
                logging.info('-- push url %s --' % url)
            else:
                logging.info('-- exist url %s --' % url)
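# For reference, a minimal sketch of the list_obj an RSS source might use.
# This is an illustrative assumption, not a shipped config: the spec format
# consumed by get_value depends on its implementation, and the entry keys
# below assume the feed entries look like feedparser output (publish_time
# must resolve to a struct_time, since analysis_rss feeds it to
# time.strftime). It only shows which response keys the method reads.
RSS_LIST_OBJ_EXAMPLE = {
    "response": {
        "title": "title",                    # hypothetical entry field spec
        "summary": "summary",
        "publish_time": "published_parsed",  # struct_time for time.strftime
        "source": "source",
        "raw_url": "link"
    }
}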
async def analysis(self, list_obj):
    logging.info('-- analysis event --')
    self.content = BeautifulSoup(self.content, 'html.parser')
    list_dom = []  # guard against NameError when no pattern type matches
    if list_obj['pattern']['type'] == "list":
        if list_obj['pattern'].get('class'):
            list_dom = self.content.find_all(
                'li', class_=list_obj['pattern']['class'])
        elif list_obj['pattern'].get('selector'):
            list_dom = self.content.select(list_obj['pattern']['selector'])
    if list_obj['pattern']['type'] == "table":
        list_dom = self.content.find_all('tr')
    if list_obj['pattern']['type'] == "h2":
        if list_obj['pattern'].get('class'):
            list_dom = self.content.find_all(
                'h2', class_=list_obj['pattern']['class'])
        else:
            list_dom = self.content.find_all('h2')
    base_url = "%s://%s" % (self.url_info.scheme, self.url_info.netloc)
    for i in list_dom:
        try:
            text = i.get_text()
            base_time = re.search(list_obj['basetime']['pattern'], text).group()
            url_dom = i.find('a')
            try:
                u = url_dom[list_obj['pattern']['title_key']]
            except Exception:
                # fall back to the default href attribute
                u = url_dom['href']
            try:
                # optional "start:end" slice applied to the extracted href
                length = list_obj['pattern']['length'].split(":")
                if length[0] and length[1]:
                    u = u[int(length[0]):int(length[1])]
                elif length[0] and not length[1]:
                    u = u[int(length[0]):]
                elif not length[0] and length[1]:
                    u = u[:int(length[1])]
            except Exception:
                pass
            url = base_url + u
            u = {
                "class": self.lclass,
                "type": self.ltype,
                "url": url,
                "event_type": list_obj['event_type'],
                "basetime": base_time + " 00:00:00"
            }
            if self.unique_url(str(url)):
                redis_c.lpush('target', json.dumps(u))
                logging.info('-- push url %s --' % url)
            else:
                logging.info('-- exist url %s --' % url)
        except Exception:
            # skip entries that don't match the configured pattern
            pass
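# A quick illustration of the optional pattern['length'] slice above: the
# config string "start:end" is split on ":" and applied as a Python slice
# to the extracted href. Mirrors the inline logic; the hrefs below are
# made-up values for demonstration only.
def apply_length(href, length_spec):
    start, end = length_spec.split(":")
    if start and end:
        return href[int(start):int(end)]
    if start:
        return href[int(start):]
    if end:
        return href[:int(end)]
    return href

assert apply_length("./detail?id=42", "1:") == "/detail?id=42"  # strip leading "."
assert apply_length("/news/123.html", ":-5") == "/news/123"     # drop ".html"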
async def analysis_update(self, obj):
    logging.info('-- START ANALYSIS UPDATE DETAIL PAGE --')
    for k, v in obj.items():
        if not k or not v:
            raise Exception('config error')
        self.result[k] = self.get_value(v)
    self.result['class'] = "update"
    logging.info('-- FINISH ANALYSIS DETAIL PAGE --')
    logging.info(self.result)
    redis_c.lpush('result', json.dumps(self.result))
async def analysis(self, obj):
    logging.info('-- START ANALYSIS DETAIL PAGE --')
    try:
        for k, v in obj.items():
            if not k or not v:
                logging.error('-- ERROR %s %s --' % (k, v))
                raise Exception('config error')
            self.result[k] = self.get_value(v)
        self.result['class'] = "event"
        logging.info('-- FINISH ANALYSIS DETAIL PAGE --')
        logging.info(self.result)
        redis_c.lpush('result', json.dumps(self.result))
    except Exception as e:
        logging.error('-- ERROR %s --' % e)
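# unique_url and md5 come from elsewhere in the project; below is a
# plausible sketch of what they might do, assuming redis_c is a redis-py
# client. The set name 'url_set' and the exact dedup scheme are
# assumptions for illustration, not the project's actual implementation.
import hashlib

def md5(s):
    # hex digest of the url, used as rhash / source_hash in the payloads
    return hashlib.md5(s.encode('utf-8')).hexdigest()

def unique_url(url):
    # sadd returns 1 only when the member was not already in the set, so
    # one round trip both records the url and tests whether it is new
    return redis_c.sadd('url_set', md5(url)) == 1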
# -*- coding: utf-8 -*-
import json

from utils import redis_c, load_yaml

# a = {"type": "tc260", "url": "https://www.tc260.org.cn/front/postDetail.html?id=20200527151336"}
a = {"type": "cert", "class": "event"}
# a = {"type": "cnvd", "class": "vul"}
# a = {"type": "cnnvd", "class": "vul"}
# a = {"type": "freebuf", "class": "intelligence"}
# b = {"type": "xz", "class": "intelligence"}
# c = {"type": "seebug", "class": "intelligence"}
# a = {"type": "snyk", "class": "vul"}
# a = {"type": "djbh", "url": "http://www.djbh.net/webdev/web/HomeWebAction.do?p=getXxgg&id=8a8182566ed3d102016fa6d2737f0034", "event_type": "法文法规"}
# a = {"type": "tc260", "url": "https://www.tc260.org.cn/front/postDetail.html?id=20200527151336", "event_type": "法文法规"}

redis_c.lpush("list", json.dumps(a))
# redis_c.lpush("list", json.dumps(a))
# redis_c.lpush("list", json.dumps(b))
# redis_c.lpush("list", json.dumps(c))
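# The consumer side of this queue, sketched for illustration: a worker that
# blocks on the "list" queue and decodes each task. Assumes redis_c is a
# redis-py client (brpop blocks until an item is available); the routing by
# task['class'] is left as a comment because the real dispatcher lives
# elsewhere in the project.
def worker_loop():
    while True:
        _, raw = redis_c.brpop("list")
        task = json.loads(raw)
        # route by task['class']: event / vul / intelligence / update
        print('-- got task %s --' % task)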
def analysis_html(self, list_obj):
    logging.info('-- analysis html --')
    # convert raw markup to BeautifulSoup if it is not already parsed
    if not isinstance(self.content, BeautifulSoup):
        self.content = BeautifulSoup(self.content, 'html.parser')
    if self.lclass == "intelligence":
        try:
            list_dom = []  # guard against NameError when no pattern type matches
            if list_obj['pattern']['type'] == "list":
                if list_obj['pattern'].get('class'):
                    list_dom = self.content.find_all(
                        'div', class_=list_obj['pattern']['class'])
                elif list_obj['pattern'].get('selector'):
                    list_dom = self.content.select(
                        list_obj['pattern']['selector'])
            if list_obj['pattern']['type'] == "table":
                list_dom = self.content.find_all('tr')
            if list_obj['pattern']['type'] == "h2":
                if list_obj['pattern'].get('class'):
                    list_dom = self.content.find_all(
                        'h2', class_=list_obj['pattern']['class'])
                else:
                    list_dom = self.content.find_all('h2')
            self.current_obj = list_obj
            for i in list_dom:
                # the url is the most important field: it drives dedup
                u = {
                    "class": self.lclass,
                    "title": self.get_value(i, list_obj['response']['title']).strip(),
                    "summary": self.get_value(i, list_obj['response']['summary']),
                    "publish_time": self.get_value(i, list_obj['response']['publish_time']),
                    "source": self.get_value(i, list_obj['response']['source']),
                    "raw_url": self.get_value(i, list_obj['response']['raw_url'])
                }
                url = u['raw_url']
                uhash = str(md5(url))
                if self.unique_url(url):
                    u["rhash"] = uhash
                    redis_c.lpush('result', json.dumps(u))
                    logging.info(u)
                    logging.info('-- push url %s --' % url)
                else:
                    logging.info('-- exist url %s --' % url)
        except Exception as e:
            logging.error('-- error %s --' % e)
    if self.lclass == "vul":
        list_dom = []
        if list_obj['pattern']['type'] == "list":
            if list_obj['pattern'].get('class'):
                list_dom = self.content.find_all(
                    'div', class_=list_obj['pattern']['class'])
            elif list_obj['pattern'].get('selector'):
                list_dom = self.content.select(
                    list_obj['pattern']['selector'])
        if list_obj['pattern']['type'] == "table":
            # vul sources keep the entries in the first table's tbody rows
            table = self.content.find_all('table')
            list_dom = table[0].tbody.find_all('tr')
        if list_obj['pattern']['type'] == "h2":
            if list_obj['pattern'].get('class'):
                list_dom = self.content.find_all(
                    'h2', class_=list_obj['pattern']['class'])
            else:
                list_dom = self.content.find_all('h2')
        self.current_obj = list_obj
        for i in list_dom:
            # the url is the most important field: it drives dedup
            u = {
                "class": self.lclass,
                "type": self.ltype,
                "source": self.get_value(i, list_obj['response']['source']),
                "title": self.get_value(i, list_obj['response']['title']).strip(),
                "url": self.get_value(i, list_obj['response']['url']),
            }
            url = u['url']
            uhash = str(md5(url))
            if self.unique_url(url):
                u["rhash"] = uhash
                redis_c.lpush('target', json.dumps(u))
                logging.info('-- push url %s --' % url)
            else:
                logging.info('-- exist url %s --' % url)
    if self.lclass == "update":
        logging.info('-- html update analysis --')
        if list_obj['pattern']['type'] == "h2":
            if list_obj['pattern'].get('class'):
                lists = self.content.find_all(
                    'h2', class_=list_obj['pattern']['class'])
            else:
                lists = self.content.find_all('h2')
        else:
            lists = self.content.select(list_obj['pattern']['selector'])
        self.current_obj = list_obj
        for i in lists:
            u = {
                "class": self.lclass,
                "type": self.ltype,
                "source": self.get_value(i, list_obj['response']['source']),
                "url": self.get_value(i, list_obj['response']['url']),
                "title": self.get_value(i, list_obj['response']['title']).strip(),
            }
            url = u['url']
            uhash = str(md5(url))
            if self.unique_url(url):
                u["rhash"] = uhash
                redis_c.lpush('target', json.dumps(u))
                logging.info('-- push url %s --' % url)
            else:
                logging.info('-- exist url %s --' % url)
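# A hypothetical list_obj for the "intelligence" html branch above, written
# as a Python dict (the repo loads configs via load_yaml, so the real thing
# would live in YAML). Key names mirror what analysis_html reads; the
# selector value is invented, and the get_value spec strings are left as
# placeholders because that format is defined by get_value itself.
HTML_LIST_OBJ_EXAMPLE = {
    "pattern": {
        "type": "list",
        "selector": "div.article-item"  # hypothetical CSS selector
    },
    "response": {
        "title": "<get_value spec>",
        "summary": "<get_value spec>",
        "publish_time": "<get_value spec>",
        "source": "<get_value spec>",
        "raw_url": "<get_value spec>"
    }
}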
def analysis_json(self, list_obj):
    logging.info('-- analysis json --')
    if self.lclass == "intelligence":
        for i in self.content[list_obj['pattern']['selector']]:
            u = {
                "class": self.lclass,
                "raw_url": self.get_value(i, list_obj['response']['raw_url']),
                "title": self.get_value(i, list_obj['response']['title']).strip(),
                "summary": self.get_value(i, list_obj['response']['summary']),
                "publish_time": self.get_value(i, list_obj['response']['publish_time']),
                "source": self.get_value(i, list_obj['response']['source'])
            }
            url = u['raw_url']
            uhash = str(md5(url))
            if self.unique_url(url):
                u["rhash"] = uhash
                redis_c.lpush('result', json.dumps(u))
                logging.info('-- push url %s --' % url)
            else:
                logging.info('-- exist url %s --' % url)
    elif self.lclass == "event":
        for i in self.content['list']:
            url = i[list_obj['pattern']['key']]
            if self.unique_url(url):
                u = {
                    "class": self.lclass,
                    "type": self.ltype,
                    "url": url,
                    "event_type": list_obj['event_type'],
                    # drop the trailing two characters of the raw timestamp
                    "basetime": i[list_obj['basetime']['key']][:-2]
                }
                redis_c.lpush('target', json.dumps(u))
                logging.info('-- push url %s --' % url)
            else:
                logging.info('-- exist url %s --' % url)
    elif self.lclass == "update":
        for i in self.content:
            # logging.info(i)
            # payload structure for component-update intelligence
            u = {
                "class": self.lclass,
                "raw_url": self.get_value(i, list_obj['response']['url']),
                "component": self.get_value(i, list_obj['response']['component']),
                "commit_time": self.get_value(i, list_obj['response']['commit_time']),
                "description": self.get_value(i, list_obj['response']['description']),
                "source": self.get_value(i, list_obj['response']['source']),
                "update_type": self.get_value(i, list_obj['response']['update_type']),
                "cve_id": self.get_value(i, list_obj['response']['cve_id']),
                "version": self.get_value(i, list_obj['response']['version']),
                "level": self.get_value(i, list_obj['response']['level']),
                "source_platform": self.get_value(i, list_obj['response']['source_platform']),
                "commit_user": self.get_value(i, list_obj['response']['commit_user']),
                "update_title": self.get_value(i, list_obj['response']['update_title']),
            }
            url = u['raw_url']
            if self.unique_url(url):
                uhash = str(md5(url))
                u["source_hash"] = uhash
                redis_c.lpush('result', json.dumps(u))
                logging.info('-- push url %s --' % url)
            else:
                logging.info('-- exist url %s --' % url)
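# Shape of the payload the "update" branch pushes to the result queue. The
# field names come straight from the code above; every value here is an
# invented example for illustration only.
UPDATE_RESULT_EXAMPLE = {
    "class": "update",
    "raw_url": "https://example.com/commit/abc123",  # hypothetical
    "component": "example-lib",
    "commit_time": "2020-05-27 15:13",
    "description": "fix buffer overflow in parser",
    "source": "github",
    "update_type": "security",
    "cve_id": "CVE-2020-0000",
    "version": "1.2.3",
    "level": "high",
    "source_platform": "github",
    "commit_user": "example-user",
    "update_title": "security fix",
    "source_hash": "<md5 of raw_url>"
}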
def event_clawer():
    # for i in ['miit', 'cac', 'tc260', 'cert', 'djbh']:
    for i in ['miit', 'cert']:
        data = {'type': i, 'class': 'event'}
        redis_c.lpush("list", json.dumps(data))
def update_clawer():
    print("update start")
    for i in ['github', 'postgresql', 'tsrc']:
        data = {'type': i, 'class': 'update'}
        redis_c.lpush("list", json.dumps(data))
def vul_clawer():
    print("vul start")
    for i in ['cnvd', 'cnnvd']:
        data = {'type': i, 'class': 'vul'}
        redis_c.lpush("list", json.dumps(data))
def intelligence_clawer():
    print("ti start")
    for i in ['anquanke', 'xz', 'doonsec', 'cnvd', 'seebug', 'freebuf']:
        data = {'type': i, 'class': 'intelligence'}
        redis_c.lpush("list", json.dumps(data))
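# A minimal sketch of how these trigger functions could be wired together.
# The actual scheduler (cron, celery beat, etc.) is not shown in this code,
# so this entry point is an assumption for illustration: it simply enqueues
# one round of every crawler class.
if __name__ == '__main__':
    event_clawer()
    update_clawer()
    vul_clawer()
    intelligence_clawer()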