def dark_start(target):
    """Run the hidden-link detection pipeline against *target*.

    Selects the log output plugin (per-domain daily file, or console)
    according to the profile, executes the detection life-cycle, then
    releases the database connection and shuts logging down.
    """
    from dark_core.database.mysqlManger import sqlMg
    from hiddenDetect import hiddenlink_obj
    from dark_core.output.console import consoleLog
    from dark_core.output.textFile import fileLog
    from dark_core.output.logging import logger
    from dark_core.profile.profile import pf
    from dark_core.settings.settings import settings
    from dark_core.parser.urlParser import url_object
    from datetime import datetime

    # Configure the logging backend.
    if pf.getLogType() == "True":
        log_dir = settings.get("LOG_FILE_PATH")
        day_stamp = datetime.now().strftime("%Y-%m-%d")
        # Root domain of the page under test; log file is named
        # <domain>_<date>.log inside the configured directory.
        root_domain = url_object(target).getRootDomain
        fileLog.set_file_name("%s%s_%s.log" % (log_dir, root_domain, day_stamp))
        logger.setOutputPlugin(fileLog)
    else:
        logger.setOutputPlugin(consoleLog)

    # Execute the detection.
    scanner = hiddenlink_obj(target)
    scanner.init()
    scanner.run()
    scanner.finsh()  # NOTE(review): 'finsh' matches the hiddenDetect API spelling

    # Close the database connection and the logging module.
    sqlMg.dispose()
    logger.endLogging()
def dark_start(target):
    """Entry point for a hidden-link scan of *target*.

    Wires up logging (file-based when the profile asks for it, console
    otherwise), drives the detector, then tears down DB and logging.
    """
    from dark_core.database.mysqlManger import sqlMg
    from hiddenDetect import hiddenlink_obj
    from dark_core.output.console import consoleLog
    from dark_core.output.textFile import fileLog
    from dark_core.output.logging import logger
    from dark_core.profile.profile import pf
    from dark_core.settings.settings import settings
    from dark_core.parser.urlParser import url_object
    from datetime import datetime

    # Set up the logging module.
    wants_file_log = (pf.getLogType() == 'True')
    if not wants_file_log:
        logger.setOutputPlugin(consoleLog)
    else:
        base_dir = settings.get('LOG_FILE_PATH')
        stamp = datetime.now().strftime('%Y-%m-%d')
        domain = url_object(target).getRootDomain  # root domain of the target page
        # Detection log is written as <domain>_<date>.log.
        log_name = base_dir + domain + '_' + stamp + '.log'
        fileLog.set_file_name(log_name)
        logger.setOutputPlugin(fileLog)

    # Run the detection.
    detector = hiddenlink_obj(target)
    detector.init()
    detector.run()
    detector.finsh()  # NOTE(review): spelling follows the hiddenDetect API

    # Release database connection and stop logging.
    sqlMg.dispose()
    logger.endLogging()
def __init__(self, url):
    """Initialize the HTML parser state for *url*.

    Stores the URL, blanks the document/html caches, and records the
    URL's root domain; a failure to parse the URL is logged rather than
    raised (best-effort initialization, as in the original).
    """
    self.url = url
    self.doc = None   # parsed document, filled in later
    self.html = None  # raw html, filled in later
    try:
        self.root = url_object(url).getRootDomain
    except Exception as msg:  # 'as' form is valid on Py2.6+ and Py3 (was Py2-only 'except E, v')
        # Fixed message typo: 'filed' -> 'failed'.
        logger.error('Html parser initialization failed, please check it! Exception: %s' % msg)
def __init__(self, url):
    """Prepare snapshot file metadata for *url*; nothing is written here."""
    self.url = url
    # Snapshot file name derives from the URL with '/' flattened to '_'.
    flattened = self.url.replace('/', '_')
    self.target = flattened
    self.root_target = url_object(self.url).getDomain
    self.snapshot_path = settings.get('SNAPSHOT_PATH')
    self._file_name = '%s_snapshot.html' % flattened
    # e.g. /tmp/www.kingboxs.com_aaa_snapshot.html
    self._file_path = os.path.join(self.snapshot_path, self._file_name)
    self._initialized = False  # flipped once the snapshot is actually created
def __init__(self, url):
    """Record where the snapshot of *url* will live on disk.

    Only computes names/paths; the snapshot file itself is produced later
    (``_initialized`` tracks that).
    """
    self.url = url
    self.target = self.url.replace('/', '_')  # URL flattened into a file-safe name
    self.root_target = url_object(self.url).getDomain
    self.snapshot_path = settings.get('SNAPSHOT_PATH')
    self._file_name = self.target + '_snapshot.html'
    self._file_path = os.path.join(
        self.snapshot_path,
        self._file_name,
    )  # e.g. /tmp/www.kingboxs.com_aaa_snapshot.html
    self._initialized = False
def get_a_tag_link_list(self):
    """Return the <a> elements pointing off-site (excluding CSS links).

    Walks the links collected by ``get_links_from_doc`` (already made
    absolute upstream) and keeps anchor tags whose href starts with
    "http", does not end in "css", and whose root domain differs from
    this page's root domain (``self.root``).
    """
    external_anchors = []
    for link in self.get_links_from_doc:
        element = link[0]
        if element.tag != 'a':
            continue
        # .get() avoids the "'href' in keys()" membership anti-pattern;
        # None means the attribute is absent, matching the original skip.
        href = element.get("href")
        if href is None:
            continue
        # NOTE(review): endswith("css") presumably means ".css" — confirm.
        if href.startswith("http") and not href.endswith("css"):
            # Keep only anchors whose root domain differs from ours.
            if url_object(href).getRootDomain != self.root:
                external_anchors.append(element)
    return external_anchors
def __init__(self, obj):
    """Set up file paths and state for the HTML report of *obj*'s scan."""
    self.obj = obj  # component whose results will be reported
    self.target = url_object(self.obj.url).getDomain
    # Directory containing this module; the CSS template lives beside it.
    module_dir = os.path.dirname(os.path.realpath(__file__))
    self.reportPath = settings.get("REPORT_PATH")  # root directory for generated reports
    self._initialized = False
    self._html_filepath = module_dir + os.path.sep
    self._style_main_filename = self._html_filepath + "main.css"
    # File pointer, opened lazily.
    self._file = None
    today = datetime.now().strftime("%Y-%m-%d")
    self._file_name = "%s_%s_a.html" % (self.target, today)
    self._file_path = os.path.join(self.reportPath, self._file_name)
def __init__(self, obj):
    """Initialize report generation for *obj*.

    Computes the report's output path (<domain>_<date>_a.html under the
    configured report root) and the location of the bundled CSS file.
    """
    self.obj = obj  # the component the report is generated for
    self.target = url_object(self.obj.url).getDomain
    self.reportPath = settings.get('REPORT_PATH')  # report output root
    self._initialized = False
    # CSS template sits next to this module on disk.
    here = os.path.dirname(os.path.realpath(__file__))
    self._html_filepath = here + os.path.sep
    self._style_main_filename = self._html_filepath + 'main.css'
    self._file = None  # file pointer, assigned when writing begins
    stamp = datetime.now().strftime('%Y-%m-%d')
    self._file_name = self.target + '_' + stamp + '_a.html'
    self._file_path = os.path.join(self.reportPath, self._file_name)
def run(self, max_pages=300):
    """Crawl up to *max_pages* listing pages and collect (domain, title) pairs.

    Each page URL is produced by formatting ``self.url`` with the page
    number. Every anchor under div.info/h3 is rewritten from a
    '/site_*.html' link into a real URL, and its root domain plus the
    anchor text are appended to ``self.urlList``. Malformed entries are
    skipped with a printed note (best-effort, as before).

    ``max_pages`` generalizes the previously hard-coded cap of 300;
    the default preserves the original behavior.
    """
    while self.pageNum < max_pages:
        self.curUrl = self.url % self.pageNum
        print(self.curUrl)  # progress trace
        self.curRequest = Requset(self.curUrl, 10)
        self.curRequest.run()
        doc = self.curRequest.get_doc()
        infoTag = doc.xpath("//div[@class='info']/h3/a")
        for info in infoTag:
            try:
                domainTitle = info.text
                url = info.get("href")
                # Listing links look like '/site_<domain>.html'; rebuild a URL.
                url = url.replace("/site_", "http://").replace(".html", "/")
                domain = url_object(url).getRootDomain
                self.urlList.append((domain, domainTitle))
            except Exception as e:  # 'as' form works on Py2.6+ and Py3
                print("parse_web.rebot_obj.run: %s" % e)
        self.pageNum += 1
def run(self):
    """Iterate listing pages (up to page 300) harvesting root domains.

    For each page: fetch via ``Requset`` with a 10s timeout, parse the
    document, and for every anchor under div.info/h3 rebuild the real
    site URL from its '/site_*.html' href, then append
    (root_domain, anchor_text) to ``self.urlList``. Per-entry failures
    are printed and skipped so one bad row does not abort the crawl.
    """
    while self.pageNum < 300:
        self.curUrl = self.url % self.pageNum
        print(self.curUrl)  # progress trace
        self.curRequest = Requset(self.curUrl, 10)
        self.curRequest.run()
        doc = self.curRequest.get_doc()
        infoTag = doc.xpath("//div[@class='info']/h3/a")
        for info in infoTag:
            try:
                domainTitle = info.text
                url = info.get('href')
                # '/site_<domain>.html' -> 'http://<domain>/'
                url = url.replace("/site_", "http://").replace(".html", "/")
                domain = url_object(url).getRootDomain
                self.urlList.append((domain, domainTitle))
            except Exception as e:  # fixed Py2-only 'except Exception, e' syntax
                print('parse_web.rebot_obj.run: %s' % e)
        self.pageNum += 1