def __init__(self, url, headers={}, threads=10, timeout=60, sleep=2, proxy={}, level=False, cert=None):
    threading.Thread.__init__(self)
    self.settings = {}
    self.settings['threads'] = int(threads)
    self.settings['timeout'] = int(timeout)
    self.settings['sleep'] = int(sleep)
    self.settings['proxy'] = proxy
    self.settings['level'] = level
    self.settings['headers'] = headers
    self.session = Session()
    self.block = []  # set()
    self.cert = cert
    self.url = url
    # Follow redirects once so later requests start from the final resolved URL.
    req = BaseRequest(self.url, proxy=self.settings['proxy'], session=self.session)
    res = req.response()
    self.basereq = req
    self.basereq.url = res.url
    self.website = BaseWebSite(self.basereq.url, proxy=self.settings['proxy'], session=self.session)
    self.ISSTART = True
    self.ReqQueue = queue.Queue()
    self.ResQueue = queue.Queue()
    self.Directory = {}     # directory structure
    self.SubDomain = set()  # discovered subdomains
    self.Page20x = set()
    self.Page30x = set()
    self.Page40x = set()
    self.Page50x = set()
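# Illustrative argument shapes for __init__ (hedged sketch; the exact formats that
# BaseRequest/Session expect are assumptions based on requests-style conventions):
#
#     headers = {"User-Agent": "Mozilla/5.0", "Cookie": "session=abc"}
#     proxy   = {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}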
def run(self):
    pool = ThreadPool(self.settings['threads'])
    self.FLAG = self.settings['timeout']
    try:
        self.request(BaseRequest(self.basereq.url,
                                 headers=self.settings['headers'],
                                 session=self.session,
                                 proxy=self.settings['proxy']))
    except Exception as e:
        print('err', e)
        self.ISSTART = False
        return
    # If no new requests arrive before the countdown runs out, treat the crawl as finished.
    while self.ISSTART and self.FLAG > 0:
        # logging.load('Reload ... Wait for %s' % self.FLAG)
        try:
            req = self.ReqQueue.get(block=False)
            pool.spawn(self.request, req)
        except queue.Empty:
            time.sleep(1)
            self.FLAG -= 1
    self.ISSTART = False
    pool.join()
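# Usage sketch (hypothetical driver code; "Spider" stands in for whatever this class
# is actually named, and request/addreq/urljoin are assumed to be defined elsewhere
# in the class):
#
#     spider = Spider("http://example.com/", threads=10, timeout=60, level=True)
#     spider.start()          # threading.Thread entry point, dispatches to run()
#     spider.join()
#     print(spider.Page20x)   # presumably filled by request() with URLs that returned 2xx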
def parse(self, response):
    content_type = response.headers.get('content-type', 'text')
    # Skip binary responses (image/*, octet-stream); everything else is scraped for links.
    if "image" not in content_type and "octet-stream" not in content_type:
        response = response.text
        urls = set()
        urls = urls.union(set(re.findall(r"""src[\s]*:[\s]*["'](.*?)["']""", response)))
        urls = urls.union(set(re.findall(r"""src[\s]*=[\s]*["'](.*?)["']""", response)))
        urls = urls.union(set(re.findall(r"""href[\s]*:[\s]*["'](.*?)["']""", response)))
        urls = urls.union(set(re.findall(r"""href[\s]*=[\s]*["'](.*?)["']""", response)))
        urls = urls.union(set(re.findall(r"""url[\s]*:[\s]*['"](.*?)['"]""", response)))
        urls = urls.union(set(re.findall(r"""url[\s]*=[\s]*['"](.*?)['"]""", response)))
        urls = urls.union(set(re.findall(r'''['"](/[^/\*'"][A-Za-z0-9\.\\/_-]{1,255})['"]''', response)))
        urls = urls.union(set(re.findall(r"""['"]([A-Za-z0-9\.\\/_-]{1,255}[a-zA-Z]\?[a-zA-Z].*?)['"]""", response)))
        urls = urls.union(set(re.findall(r"""(http[s]?://(?:[-a-zA-Z0-9_]+\.)+[a-zA-Z]+(?::\d+)?(?:/[-a-zA-Z0-9_%./]+)*\??[-a-zA-Z0-9_&%=.]*)""", response)))
        for url in urls:
            if url:
                req = BaseRequest(self.urljoin(url), session=self.session, proxy=self.settings['proxy'])
                self.addreq(req)
        if self.settings['level']:
            # Optionally rebuild <form> submissions as requests.
            posts = []
            for f in re.findall(r"""<form([\s\S]*?)</form>""", response):
                post = {}
                post['action'] = ''.join(re.findall(r"""action[\s]*=[\s]*["'](.*?)["']""", f)) or './'
                post['method'] = ''.join(re.findall(r"""method[\s]*=[\s]*["'](.*?)["']""", f)) or 'POST'
                post['data'] = {}
                for d in re.findall(r"""<input[\s\S]*?>""", f):
                    name = ''.join(re.findall(r"""name[\s]*=[\s]*["'](.*?)["']""", d))
                    value = ''.join(re.findall(r"""value[\s]*=[\s]*["'](.*?)["']""", d))
                    if not value:
                        value = name
                    post['data'].update({name: value})
                posts.append(post)
            for post in posts:
                req = BaseRequest(self.urljoin(post['action']),
                                  method=post['method'],
                                  data=post['data'],
                                  session=self.session,
                                  proxy=self.settings['proxy'])
                self.addreq(req)
def parse(self, response):
    content_type = response.headers.get('content-type', 'text')
    # Only scrape textual responses (HTML, JavaScript); binary content is ignored.
    if 'text' in content_type or 'javascript' in content_type:
        response = response.text
        urls = set()
        # urls = urls.union(set(re.findall(r"""[href|src][\s]*[:=]["'\s]*(.*?)["'\s>]""", response)))
        urls = urls.union(set(re.findall(r"""src=([^'"].*?[^'"])[>\s]""", response)))
        urls = urls.union(set(re.findall(r"""href=([^'"].*?[^'"])[>\s]""", response)))
        urls = urls.union(set(re.findall(r"""src[\s]*:[\s]*["'](.*?)["']""", response)))
        urls = urls.union(set(re.findall(r"""src[\s]*=[\s]*["'](.*?)["']""", response)))
        urls = urls.union(set(re.findall(r"""href[\s]*:[\s]*["'](.*?)["']""", response)))
        urls = urls.union(set(re.findall(r"""href[\s]*=[\s]*["'](.*?)["']""", response)))
        urls = urls.union(set(re.findall(r"""url[\s]*:[\s]*['"](.*?)['"]""", response)))
        urls = urls.union(set(re.findall(r"""url[\s]*=[\s]*['"](.*?)['"]""", response)))
        urls = urls.union(set(re.findall(r'''['"](/[^/\*'"][A-Za-z0-9\.\\/_-]{1,255})['"]''', response)))
        urls = urls.union(set(re.findall(r"""['"]([A-Za-z0-9\.\\/_-]{1,255}[a-zA-Z]\?[a-zA-Z].*?)['"]""", response)))
        urls = urls.union(set(re.findall(r"""(http[s]?://(?:[-a-zA-Z0-9_]+\.)+[a-zA-Z]+(?::\d+)?(?:/[-a-zA-Z0-9_%./]+)*\??[-a-zA-Z0-9_&%=.]*)""", response)))
        for url in urls:
            url = self.urljoin(url)
            if url:
                req = BaseRequest(url, session=self.session, proxy=self.settings['proxy'])
                self.addreq(req)
        if self.settings['level']:
            # Rebuild <form> submissions: k holds the tag attributes, v the form body.
            posts = []
            for k, v in re.findall(r"""<form([\s\S]*?>)([\s\S]*?)</form>""", response):
                post = {}
                post['action'] = ''.join(re.findall(r"""action[\s]*=["'\s]*(.*?)["'\s>]""", k)) or './'
                post['method'] = ''.join(re.findall(r"""method[\s]*=["'\s]*(.*?)["'\s>]""", k)) or 'POST'
                post['data'] = {}
                for d in re.findall(r"""<input([\s\S]*?)>""", v):
                    name = ''.join(re.findall(r"""name[\s]*=["'\s]*(.*?)["'\s>]""", d))
                    value = ''.join(re.findall(r"""value[\s]*=["'\s]*(.*?)["'\s>]""", d))
                    if not value:
                        value = name
                    post['data'].update({name: value})
                posts.append(post)
            for post in posts:
                # print(post)
                req = BaseRequest(self.urljoin(post['action']),
                                  method=post['method'],
                                  data=post['data'],
                                  session=self.session,
                                  proxy=self.settings['proxy'])
                self.addreq(req)
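# Illustrative matches for the extraction regexes above (made-up sample markup,
# shown only to document what each pattern is meant to catch):
#
#     <script src="/static/app.js">        -> /static/app.js
#     <a href='login.php?next=1'>          -> login.php?next=1
#     url: "/api/v1/users"                 -> /api/v1/users
#     http://cdn.example.com/lib.min.js    -> http://cdn.example.com/lib.min.js
#
# Each match is normalised through self.urljoin() and queued as a BaseRequest
# via self.addreq().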