def check():
    """Queue one ``search`` task per configured page and enabled keyword.

    Records the current process id on the ``{'key': 'task'}`` settings
    document, then bails out early (with a log message) when there are no
    enabled keywords, no GitHub account with ``rate_remaining`` above 5, or
    no ``page`` value on the task settings document. Otherwise, for every
    page index below the configured ``page`` and every enabled keyword
    (ordered by ``last`` ascending), a random GitHub account is picked and a
    ``search`` task is scheduled with a delay equal to the number of pending
    plus scheduled huey tasks.
    """
    setting_col.update_one(
        {'key': 'task'},
        {'$set': {'key': 'task', 'pid': os.getpid()}},
        upsert=True)

    query_count = query_col.count({'enabled': True})
    logger.info('需要处理的关键词总数: {}'.format(query_count))
    if not query_count:
        logger.warning('请添加关键词')
        return

    if not github_col.count({'rate_remaining': {'$gt': 5}}):
        logger.error('请配置github账号')
        return

    if not setting_col.count({'key': 'task', 'page': {'$exists': True}}):
        logger.error('请在页面上配置任务参数')
        return

    setting_col.update_one({'key': 'task'}, {'$set': {'pid': os.getpid()}})
    page = int(setting_col.find_one({'key': 'task'}).get('page'))

    for p in range(0, page):
        for query in query_col.find({'enabled': True}).sort('last', ASCENDING):
            # NOTE(review): the pre-check above filters on 'rate_remaining'
            # while this selection filters on 'rate_limit' — confirm that
            # both fields are intended.
            accounts = list(
                github_col.find({"rate_limit": {"$gt": 5}}).sort(
                    'rate_remaining', DESCENDING))
            if not accounts:
                # random.choice() raises IndexError on an empty sequence.
                logger.error('请配置github账号')
                return
            github_account = random.choice(accounts)
            github_username = github_account.get('username')
            github_token = github_account.get('token')
            rate_remaining = github_account.get('rate_remaining')
            logger.info(github_username)
            logger.info(rate_remaining)
            g = Github(
                github_username,
                github_token,
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36')
            search.schedule(
                args=(query, p, g, github_username),
                delay=huey.pending_count() + huey.scheduled_count())
def post(self):
    """Validate, optionally test, and store a DingTalk / WeCom webhook.

    Request arguments: ``webhook`` (required), ``domain``, ``enabled`` and
    ``test``. The webhook must be an ``https`` URL whose host is
    ``oapi.dingtalk.com`` or ``qyapi.weixin.qq.com``; anything else gets a
    status-400 payload.

    When ``test`` is true a markdown test message is posted to the webhook
    and the outcome is returned without saving anything. Otherwise the
    remaining arguments are upserted into ``setting_col`` keyed by the
    webhook URL.

    Returns:
        A ``jsonify`` response with ``status``, ``msg`` and ``result`` keys.
    """
    parser = reqparse.RequestParser()
    parser.add_argument('webhook', type=str, required=True, help='WebHook URL')
    parser.add_argument('domain', type=str, help='System URL Host')
    parser.add_argument('enabled', type=inputs.boolean, default=False, help='Enabled Notice')
    parser.add_argument('test', type=inputs.boolean, default=False, help='Test Notice')
    args = parser.parse_args()

    webhook = args.get('webhook')
    # Parse once; the host doubles as the provider switch below.
    target = urlparse(webhook)
    if target.netloc not in ['oapi.dingtalk.com', 'qyapi.weixin.qq.com'] or target.scheme != 'https':
        data = {'status': 400, 'msg': '错误的 webhook 地址', 'result': []}
        return jsonify(data)

    if args.get('test'):
        markdown_text = '### 规则名称: [WebHook告警测试]({})'.format(args.get('domain'))
        if target.netloc == 'oapi.dingtalk.com':
            test_content = {
                "msgtype": "markdown",
                "markdown": {
                    "title": "GitHub泄露",
                    "text": markdown_text
                },
                "at": {
                    "atMobiles": [],
                    "isAtAll": False
                }
            }
        else:
            test_content = {
                "msgtype": "markdown",
                "markdown": {
                    "content": markdown_text
                }
            }

        network_failure = {'status': 400, 'msg': '发送失败,请检查服务器网络', 'result': []}
        try:
            # Bounded wait so an unreachable webhook cannot hang the request.
            response = requests.post(webhook, json=test_content, timeout=10)
        except requests.RequestException:
            # Connection errors and timeouts raise rather than produce a
            # non-OK response, so report them with the same message.
            return jsonify(network_failure)
        if not response.ok:
            return jsonify(network_failure)

        try:
            errmsg = response.json().get('errmsg')
        except ValueError:
            # Body was not valid JSON; treat it as a missing errmsg.
            errmsg = None
        if errmsg == 'ok':
            data = {'status': 201, 'msg': '已发送,请前往钉钉/企业微信群查看', 'result': []}
        else:
            data = {'status': 400, 'msg': '发送失败,WebHook 响应: {}'.format(errmsg), 'result': []}
        return jsonify(data)

    # 'test' is a request-only flag and must not be persisted.
    del args['test']
    setting_col.update_one({'webhook': webhook}, {'$set': args}, upsert=True)
    result = setting_col.count({'webhook': webhook})
    if result > 0:
        data = {'status': 201, 'msg': '设置成功', 'result': result}
    else:
        data = {'status': 400, 'msg': '设置失败', 'result': result}
    return jsonify(data)
def search(query, page, g, github_username):
    """Fetch one page of GitHub code-search hits for a keyword and store them.

    Args:
        query: keyword document; its ``tag`` and ``keyword`` entries are read.
        page: zero-based result page passed to ``repos.get_page``.
        g: GitHub client used for ``search_code`` and ``get_rate_limit``.
        github_username: account name whose ``rate_remaining`` is refreshed
            in ``github_col`` after the search call.

    For every hit whose sha is not already in ``result_col`` a leakage
    document is built and inserted, unless its link contains a blacklist
    text or its project already has an ``ignore: 1`` record. Hits whose
    project/filepath has no existing ``security: 0`` record are also added
    to the mail and webhook notification lists. On success the keyword
    document is updated and notifications are sent.

    Returns:
        None. Returns early when the search call fails, when the
        ``x-ratelimit-remaining`` header reaches 0, or when processing the
        page raises.
    """
    mail_notice_list = []
    webhook_notice_list = []
    tag = query.get('tag')
    keyword = query.get('keyword')
    logger.info('开始抓取: tag is {} keyword is {}, page is {}'.format(
        tag, keyword, page + 1))

    try:
        repos = g.search_code(query=keyword, sort="indexed", order="desc")
        github_col.update_one({'username': github_username}, {
            '$set': {
                'rate_remaining': int(g.get_rate_limit().search.remaining)
            }
        })
    except Exception as error:
        logger.critical(error)
        logger.critical("触发限制啦")
        return

    try:
        for repo in repos.get_page(page):
            # Heartbeat: record the pid and time of the latest activity.
            setting_col.update_one({'key': 'task'}, {
                '$set': {
                    'key': 'task',
                    'pid': os.getpid(),
                    'last': timestamp()
                }
            }, upsert=True)

            if result_col.count({'_id': repo.sha}):
                continue

            try:
                code = str(repo.content).replace('\n', '')
            except Exception:
                # Best effort: keep the record even without file content.
                code = ''

            leakage = {
                'link': repo.html_url,
                'project': repo.repository.full_name,
                'project_url': repo.repository.html_url,
                '_id': repo.sha,
                'language': repo.repository.language,
                'username': repo.repository.owner.login,
                'avatar_url': repo.repository.owner.avatar_url,
                'filepath': repo.path,
                'filename': repo.name,
                'security': 0,
                'ignore': 0,
                'tag': tag,
                'code': code,
            }
            try:
                leakage['affect'] = get_affect_assets(repo.decoded_content)
            except Exception as error:
                logger.critical('{} {}'.format(error, leakage.get('link')))
                leakage['affect'] = []

            if int(repo.raw_headers.get('x-ratelimit-remaining')) == 0:
                logger.critical('剩余使用次数: {}'.format(
                    repo.raw_headers.get('x-ratelimit-remaining')))
                return

            last_modified = datetime.datetime.strptime(
                repo.last_modified, '%a, %d %b %Y %H:%M:%S %Z')
            leakage['datetime'] = last_modified
            leakage['timestamp'] = last_modified.timestamp()

            in_blacklist = False
            for blacklist in blacklist_col.find({}):
                if blacklist.get('text').lower() in leakage.get('link').lower():
                    logger.warning('{} 包含白名单中的 {}'.format(
                        leakage.get('link'), blacklist.get('text')))
                    in_blacklist = True
            if in_blacklist:
                continue

            if result_col.count({
                    "project": leakage.get('project'),
                    "ignore": 1
            }):
                continue

            if not result_col.count({
                    "project": leakage.get('project'),
                    "filepath": leakage.get("filepath"),
                    "security": 0
            }):
                mail_notice_list.append(
                    '上传时间:{} 地址: <a href={}>{}/{}</a>'.format(
                        leakage.get('datetime'), leakage.get('link'),
                        leakage.get('project'), leakage.get('filename')))
                # full_name comes from the repository object; split on '/'
                # (as crawl does) so a dot in the name does not truncate
                # the label.
                webhook_notice_list.append('[{}/{}]({}) 上传于 {}'.format(
                    leakage.get('project').split('/')[-1],
                    leakage.get('filename'), leakage.get('link'),
                    leakage.get('datetime')))

            try:
                result_col.insert_one(leakage)
                logger.info(leakage.get('project'))
            except errors.DuplicateKeyError:
                logger.info('已存在')
            logger.info('抓取关键字:{} {}'.format(tag, leakage.get('link')))
    except Exception as error:
        # Only exceptions carrying a ``data`` payload are retried with
        # another account. Stringify the payload so the check also matches
        # when it is a dict. Exceptions without ``data`` are logged only
        # and no longer raise AttributeError inside this handler.
        if hasattr(error, 'data') and 'Not Found' not in str(error.data):
            g, github_username = new_github()
            search.schedule(
                args=(query, page, g, github_username),
                delay=huey.pending_count() + huey.scheduled_count())
        logger.critical(error)
        logger.error('抓取: tag is {} keyword is {}, page is {} 失败'.format(
            tag, keyword, page + 1))
        return

    logger.info('抓取: tag is {} keyword is {}, page is {} 成功'.format(
        tag, keyword, page + 1))
    query_col.update_one({'tag': tag}, {
        '$set': {
            'last': int(time.time()),
            'status': 1,
            'reason': '抓取第{}页成功'.format(page),
            'api_total': repos.totalCount,
            'found_total': result_col.count({'tag': tag})
        }
    })

    if setting_col.count({
            'key': 'mail',
            'enabled': True
    }) and mail_notice_list:
        main_content = '<h2>规则名称: {}</h2><br>{}'.format(
            tag, '<br>'.join(mail_notice_list))
        send_mail(main_content)
    logger.info(len(webhook_notice_list))
    webhook_notice(tag, webhook_notice_list)
from pymongo import errors, DESCENDING, ASCENDING from config.database import result_col, query_col, blacklist_col, notice_col, github_col, setting_col, REDIS_HOST, \ REDIS_PORT from utils.date import timestamp from utils.log import logger from utils.notice import mail_notice huey = RedisHuey('hawkeye', host=REDIS_HOST, port=int(REDIS_PORT)) base_path = os.path.split(os.path.realpath(__file__))[0] extract = tldextract.TLDExtract(cache_file='{}/.tld_set'.format(base_path)) if setting_col.count({ 'key': 'task', 'minute': { '$exists': True }, 'page': { '$exists': True } }): minute = int(setting_col.find_one({'key': 'task'}).get('minute')) setting_col.update_one( {'key': 'task'}, {'$set': { 'key': 'task', 'pid': os.getpid(), 'last': timestamp() }}, upsert=True) else:
def get(self):
    """Return leak counters (all time and today) plus crawler engine status.

    An optional ``tag`` request argument restricts every counter to results
    with that tag. Each counter group holds ``total``, ``ignore``
    (``security == 1``) and ``risk`` (``security == 0`` with a ``desc``
    field present). "Today" means ``timestamp >= today_start()``.

    The engine section reports whether the pid stored on the
    ``{'key': 'task'}`` settings document exists, and that document's
    ``last`` value. Without such a document it reports ``False`` and ``0``.

    Returns:
        A ``jsonify`` response with ``status``, ``msg`` and ``result`` keys.
    """
    parser = reqparse.RequestParser()
    parser.add_argument('tag', type=str, help='')
    args = parser.parse_args()
    tag = args.get('tag')

    def _counters(scope):
        # Count total / ignored / risky results inside the given filter.
        return {
            'total': result_col.count(dict(scope)),
            'ignore': result_col.count(dict(scope, security=1)),
            'risk': result_col.count(
                dict(scope, security=0, desc={"$exists": True})),
        }

    scope_all = {'tag': tag} if tag else {}
    scope_today = dict(scope_all, timestamp={'$gte': today_start()})
    total = _counters(scope_all)
    today = _counters(scope_today)

    # Single read: avoids a count()/find_one() race and repeated queries.
    task = setting_col.find_one({'key': 'task'})
    if task:
        pid = task.get('pid')
        # The task document may exist before any pid has been recorded.
        status = psutil.pid_exists(int(pid)) if pid is not None else False
        last = task.get('last')
    else:
        status = False
        last = 0

    engine = {
        'status': status,
        'last': last,
    }
    result = {'all': total, 'today': today, 'engine': engine}
    data = {'status': 200, 'msg': '获取信息成功', 'result': result}
    return jsonify(data)
def crawl(query, page):
    """Scrape one page of Gitee code-search hits for a keyword and store them.

    Args:
        query: keyword document; its ``tag`` and ``keyword`` entries are read.
        page: zero-based page index; the request uses ``pageno = page + 1``.

    Each hit in the ``hits-list`` element becomes a leakage document, which
    is skipped when the same link/datetime or ``_id`` is already stored,
    when its link contains a blacklist text, or when its project has an
    ``ignore: 1`` record. Stored hits whose project/filepath has no existing
    ``security: 0`` record are also queued for mail and webhook
    notification. Afterwards the keyword document is updated and
    notifications are sent.

    Raises:
        Exception: any error raised while fetching or parsing the page is
            logged and re-raised.
    """
    mail_notice_list = []
    webhook_notice_list = []
    search_url = 'https://search.gitee.com/?skin=rec&type=code&q={1}&sort=last_indexed' \
                 '&pageno={0}'
    session = gitee_login()
    tag = query.get('tag')
    keyword = query.get('keyword')
    logger.info('Gitee开始抓取: tag is {} keyword is {}, page is {}'.format(
        tag, keyword, page + 1))

    total_count = 0
    # Use a separate name for the 1-based page number so the ``page``
    # parameter is not shadowed.
    page_no = page + 1
    page_url = search_url.format(page_no, keyword)
    try:
        logger.info("Gitee ------ 启动抓取: {}".format(page_url))
        resp = session.get(page_url)
        logger.info("Gitee 启动抓取: {}".format(page_url))
        tree = etree.HTML(resp.text)
        nodes = tree.xpath('//*[@id="hits-list"]/div[@class="item"]')
        for node in nodes:
            logger.info("Gitee 开始抓取节点")
            total_count += 1
            leakage = {}
            leakage['affect'] = []

            datetime_ = node.xpath(Gitee.DATETIME)[0].text
            # Drop any leading non-digit text; keep everything from the
            # first digit onwards.
            datetime_match = re.match(r"[^\d]*(?P<Date>\d+.*)", datetime_)
            if not datetime_match:
                leakage['datetime'] = _format_time(
                    datetime.datetime.now().date())
            else:
                leakage['datetime'] = _format_time(
                    datetime_match.group("Date"))
            leakage['timestamp'] = leakage.get('datetime').timestamp()

            link_node = node.xpath(Gitee.LINK)[0]
            leakage['link'] = cut_tail(link_node.attrib['href'])
            leakage['filepath'] = link_node.text
            leakage['filename'] = leakage.get("filepath").split("/")[-1]
            leakage['_id'] = _md5(leakage['link'])
            logger.info("Gitee ****** 开始抓取节点 {}".format(
                leakage['datetime']))

            owner_node = node.xpath(Gitee.USERNAME)[0]
            project_username = owner_node.text
            leakage["vendor"] = "GITEE"
            leakage['username'] = project_username.split("/")[0]
            leakage['project'] = project_username
            leakage['project_url'] = cut_tail(owner_node.attrib['href'])
            logger.info("Gitee 抓取到 {}".format(leakage.get("project_url")))

            if result_col.find_one({"link": leakage['link'], "datetime": leakage['datetime']}) or \
                    result_col.find_one({'_id': leakage['_id']}):
                continue

            in_blacklist = False
            for blacklist in blacklist_col.find({}):
                if blacklist.get('text').lower() in leakage.get('link').lower():
                    logger.warning('{} 包含白名单中的 {}'.format(
                        leakage.get('link'), blacklist.get('text')))
                    in_blacklist = True
            if in_blacklist:
                continue

            if result_col.count({
                    "project": leakage.get('project'),
                    "ignore": 1
            }):
                continue

            # On Gitee a hit can be a project only, with no code.
            leakage['avatar_url'] = 'https://gitee.com/logo-black.svg'
            raw_code = gitee_raw_code(leakage['link'])
            leakage['code'] = base64.b64encode(
                raw_code.encode("utf-8")).decode("utf-8")
            try:
                leakage['affect'] = get_affect_assets(raw_code)
            except Exception as error:
                logger.critical('{} {}'.format(error, leakage.get('link')))
                leakage['affect'] = []
            leakage['tag'] = query['tag']

            language_node = node.xpath(Gitee.LANGUAGE)
            if language_node:
                leakage['language'] = language_node[0].text.strip()
            else:
                leakage['language'] = 'Unknow'
            leakage['security'] = 0
            leakage['ignore'] = 0

            if not result_col.count({
                    "project": leakage.get('project'),
                    "filepath": leakage.get("filepath"),
                    "security": 0
            }):
                mail_notice_list.append(
                    '上传时间:{} 地址: <a href={}>{}/{}</a>'.format(
                        leakage.get('datetime'), leakage.get('link'),
                        leakage.get('project'), leakage.get('filename')))
                webhook_notice_list.append('[{}/{}]({}) 上传于 {}'.format(
                    leakage.get('project').split('/')[-1],
                    leakage.get('filename'), leakage.get('link'),
                    leakage.get('datetime')))
            result_col.insert_one(leakage)
            logger.info("Gitee 抓取到的结果: {}".format(
                leakage.get("project_url")))
    except Exception as e:
        # Log before propagating; a bare ``raise`` keeps the traceback.
        logger.error("Gitee error is {}".format(e))
        raise

    logger.info('Gitee抓取: tag is {} keyword is {}, page is {} 成功'.format(
        tag, keyword, page_no))
    query_col.update_one({'tag': tag}, {
        '$set': {
            'last': int(time.time()),
            'status': 1,
            'reason': '抓取第{}页成功'.format(page_no),
            'api_total': total_count,
            'found_total': result_col.count({'tag': tag})
        }
    })

    if setting_col.count({
            'key': 'mail',
            'enabled': True
    }) and mail_notice_list:
        main_content = '<h2>规则名称: {}</h2><br>{}'.format(
            tag, '<br>'.join(mail_notice_list))
        send_mail(main_content)
    logger.info(len(webhook_notice_list))
    webhook_notice(tag, webhook_notice_list)
def run():
    """Run one synchronous GitHub search per enabled keyword, per round.

    Bails out early (with a log message) when there are no enabled keywords,
    no GitHub account with ``rate_remaining`` above 5, or no ``page`` value
    on the ``{'key': 'task'}`` settings document. Otherwise the configured
    ``page`` value is used as the number of rounds. In each round, every
    enabled keyword (ordered by ``last`` ascending) is searched once with a
    randomly chosen account.

    The page searched is the one after the keyword's stored ``page_pre``,
    wrapping to the first page once the last page has been reached. The
    total is taken from the keyword's ``api_total`` or, when that is
    missing, from a live search, and is capped at 1000.
    """
    query_count = query_col.count({'enabled': True})
    logger.info('需要处理的关键词总数: {}'.format(query_count))
    if not query_count:
        logger.warning('请添加关键词')
        return

    if not github_col.count({'rate_remaining': {'$gt': 5}}):
        logger.error('请配置github账号')
        return

    if not setting_col.count({'key': 'task', 'page': {'$exists': True}}):
        logger.error('请在页面上配置任务参数')
        return

    setting_col.update_one({'key': 'task'}, {'$set': {'pid': os.getpid()}})
    page = int(setting_col.find_one({'key': 'task'}).get('page'))

    for _ in range(0, page):
        for query in query_col.find({'enabled': True}).sort('last', 1):
            # NOTE(review): the pre-check above filters on 'rate_remaining'
            # while this selection filters on 'rate_limit' — confirm that
            # both fields are intended.
            accounts = list(
                github_col.find({
                    "rate_limit": {
                        "$gt": 5
                    }
                }).sort('rate_remaining', -1))
            if not accounts:
                # random.choice() raises IndexError on an empty sequence.
                logger.error('请配置github账号')
                return
            github_account = random.choice(accounts)
            github_username = github_account.get('username')
            github_password = github_account.get('password')
            rate_remaining = github_account.get('rate_remaining')
            logger.info(github_username)
            logger.info(rate_remaining)
            g = Github(
                github_username,
                github_password,
                per_page=PER_PAGE,
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
            )

            api_total = query.get('api_total')
            if api_total:
                total = api_total
            else:
                repos = g.search_code(query=query.get('keyword'),
                                      sort="indexed",
                                      order="desc")
                total = repos.totalCount
            if total > 1000:
                total = 1000

            stored_page = query.get('page_pre')
            page_pre = int(stored_page) if stored_page is not None else -1
            # Derive the page count from the page size the client was
            # built with rather than a hard-coded 30.
            page_all = math.ceil(total / PER_PAGE)
            if page_all == 0:
                continue
            if page_pre + 1 >= page_all:
                # Past the last page: wrap around to the first one.
                page_pre = -1
            page_now = page_pre + 1
            search(query, page_now, g, github_username)