def table_print(tables):  # print a table
    a = DB().query_all("desc %s" % (tables))
    table_top = []
    for x in a:
        table_top.append(x[0])
    b = DB().query_all("select * from %s" % (tables))
    table_lis = []
    for x in b:
        table_lis.append(x)
    print tabulate(table_lis, table_top, tablefmt="grid")
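
# Illustrative only: a minimal, standalone sketch of the tabulate() call used in
# table_print() above, with made-up column names and rows (not taken from the
# tool's database). tabulate(rows, headers, tablefmt="grid") renders an ASCII
# grid with the column names as the header row.
from tabulate import tabulate

example_top = ["id", "url", "ip"]
example_rows = [(1, "example.com", "93.184.216.34"),
                (2, "www.example.com", "93.184.216.34")]
print tabulate(example_rows, example_top, tablefmt="grid")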
def recursion_blast_url(self, tables, t, lis, url_list):
    progress = sys.stdout
    tables = tables.replace('.', '_')
    total = len(lis)  # total number of requests
    fenliang = total / t  # divide the total by the thread count to get the size of each slice
    kaishi = 0
    jiewei = fenliang
    self.simple = simple()
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    for recursion_url in url_list:
        print " URL:" + recursion_url[0] + "-->\033[1;32;1m Send out all the requests Current time: %s \r \033[0m" % (
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        while True:
            list2 = lis[kaishi:jiewei]  # take the current slice of dictionary entries
            t = threading.Thread(
                target=self.simple.recursion_h_get_blast_text,
                args=[recursion_url[0], list2, tables])  # check whether the domain exists; if it does, store it (the table name is the url value)
            t.start()
            blast.xiancheng.append(t)
            if jiewei > total:
                kaishi = 0
                jiewei = fenliang
                break
            else:
                kaishi = kaishi + fenliang
                jiewei = jiewei + fenliang
            time.sleep(0.02)
        sql = "update %s set recursion = 1 where url = '%s'" % (
            tables, recursion_url[0])
        DB().increase(sql)
    for tt in blast.xiancheng:
        tt.join()  # wait for all threads to finish
    print "\033[1;32;1m <--Above the domain name to send complete 0o(^_^)o0 Current time: %s \033[0m \r\n" % (
        datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))
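
# Illustrative only: a standalone sketch of the kaishi/jiewei windowing used in
# recursion_blast_url() above. The dictionary list is cut into slices of
# total / t entries and each slice would be handed to one worker thread; here
# the slices are simply printed. The word list and thread count are made up.
lis = ["www", "mail", "ftp", "dev", "test", "admin", "api"]
t = 3                    # thread count
total = len(lis)
fenliang = total / t     # integer division in Python 2: 7 / 3 == 2
kaishi, jiewei = 0, fenliang
while True:
    print lis[kaishi:jiewei]   # one slice per worker
    if jiewei > total:
        break
    kaishi += fenliang
    jiewei += fenliang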
def handle(url, dangqian_url, tables, domain):
    url_2 = urlparse(url)
    url = url.replace('http://', '').replace('https://', '')
    if url_2.netloc != "":  # does the URL have a netloc?
        if re.search(".%s" % (domain), url_2.netloc) != None:  # a related domain name was found
            print '\033[1;38;1m Get a 1 related domain name %s \033[0m' % (
                url.split('/')[0])
            strinfo = re.compile("_p$")
            tables = strinfo.sub('', tables)
            try:
                ip = socket.gethostbyname(url)
            except Exception, e:
                ip = False
            if ip:
                DB().Domain_storage(tables, url.split('/')[0], ip)  # split('/')[0] keeps everything before the first '/', e.g. baidu.com/s/w/1.asp becomes baidu.com
        return False
def im_url(file_url, tables):
    if not os.path.exists(file_url):
        print '\033[1;31;1m Sorry, the file does not exist. \033[0m';
        exit();
    else:
        im_url_db = DB();
        sql = "select count(table_name) from information_schema.tables where table_name = '%s' and TABLE_SCHEMA = '%s'" % (tables, core.CORE.db)
        if im_url_db.query(sql):
            for url in open(file_url):
                url = url.replace('http://', '').replace('https://', '').replace('\r', '').replace('\n', '');
                im_resolver = dns.resolver.Resolver();
                im_resolver.nameservers = core.default_dns
                try:
                    ip = im_resolver.query(url, 'A')[0];
                except Exception, e:
                    ip = False;
                if ip != False:
                    sql = "select count(id) from %s where url = '%s' " % (tables, url);
                    if not im_url_db.query(sql):
                        sql = "insert into %s values (null,\"%s\",\"%s\",0,0,0,0)" % (tables, url, str(ip));
                        im_url_db.increase(sql);
                        print url, "===>OK", ip;
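
# Illustrative only: the dnspython lookup im_url() relies on, shown on its own.
# The nameserver and hostname below are placeholders; resolver.query(name, 'A')
# returns the A records and the tool keeps the first one as the stored IP.
import dns.resolver

resolver = dns.resolver.Resolver()
resolver.nameservers = ['8.8.8.8']   # placeholder DNS server
try:
    ip = resolver.query("www.example.com", 'A')[0]
except Exception, e:
    ip = False
print ip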
h_crawler = args.crawler;  # simple crawl that only collects <a> tags
picture = random.randint(1, 4);  # pick a random banner
tsk = [];  # threads to wait on
crawler_progress = [];  # crawler threads to wait on
if thread > 500:
    print '\033[1;31;1m Command parse error !!! \033[0m';
    exit();
if __name__ == '__main__':
    hound_db = DB();
    blast = blast();
    if Dictionaries:  # bulk-import dictionary entries
        function.process(hound_db.Dictionaries, Dictionaries);
    elif imurl and len(imurl) == 2:  # import domain names
        function.im_url(imurl[0], imurl[1])
    elif url:
        if picture == 1:
            function.a1();
        elif picture == 2:
            function.a2();
        elif picture == 3:
            function.a3();
        elif picture == 4:
            function.a4();
        lis = hound_db.query_all("select lis from lis");  # fetch all dictionary entries
strinfo = re.compile("_p$") tables = strinfo.sub('', tables) try: ip = socket.gethostbyname(url) except Exception, e: ip = False if ip: DB().Domain_storage(tables, url.split('/')[0], ip) #url.split('/')[0] split('/')[0] 意思是从第一个/ 开始删除 比如 baidu.com/s/w/1.asp 删除后就成了 baidu.com return False elif re.search("^/", url_2.path) != None: #如果一开始是/的话 那么他就会跳转到根目录的 sql = 'select count(*) from php90_cn_p where url like "%' + url_2.path + '%"' if DB().query(sql) > 100: return False else: return domain + url_2.path + url_2.query #不能删 提醒自己 elif re.search("^./", url_2.path) != None: sql = 'select count(*) from php90_cn_p where url like "%' + url_2.path + '%"' if DB().query(sql) > 100: return False else: if len(dangqian_url.split('/')) > 1: dangqian_url2 = dangqian_url.split('/')[-1] strinfo = re.compile("%s$" % (dangqian_url2)) dangqian_url = strinfo.sub('', dangqian_url) #把 最后一个 / 后面内容删掉
# -*- coding=utf-8 -*-
import requests, re, sys, time, threading, Queue;
import dns.resolver
sys.path.append("..")
from mysql.DB import DB;
import core;
db_plus = DB();
q = Queue.Queue(1);
w = Queue.Queue(1);


class simple(object):
    walk = 0;
    walk2 = 0;
    recursion_walk = 0
    """Simple HTTP requests"""

    def __init__(self):
        self.header2 = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0'};

    def h_get_text(self, url):  # fetch the page source
        try:
            r = requests.get(url, headers=self.header2, timeout=10);
            return r.text.encode('utf-8');
        except Exception, e:
            print '\033[1;31;1m' + "Exception: %s Error: %s " % (Exception, e) + '\033[0m';
            return '';

    def h_post_text(self, url, canshu):  # fetch the page source via POST
        try:
picture = random.randint(1, 4);
tsk = [];  # threads to wait on
crawler_progress = [];  # crawler threads to wait on
if h_crawler != None:
    if len(h_crawler) < 2:
        print '\033[1;31;1m Command parse error !!! \033[0m';
        exit();
if thread > 1000:
    print '\033[1;31;1m Command parse error !!! \033[0m';
    exit();
hound_db = DB();
blast = blast();
if Dictionaries:  # bulk-import dictionary entries
    function.process(hound_db.Dictionaries, Dictionaries);
elif url:
    if picture == 1:
        function.a1();
    elif picture == 2:
        function.a2();
    elif picture == 3:
        function.a3();
    elif picture == 4:
        function.a4();
    lis = hound_db.query_all("select lis from lis");  # fetch all dictionary entries
    print "\033[1;35;1m Dictionary--> %i Tools--> hound version--> 0.3 \033[0m \n" % (len(lis));
h_crawler = args.crawler;  # crawler
picture = random.randint(1, 4);  # pick a random banner
tsk = [];  # threads to wait on
crawler_progress = [];  # crawler threads to wait on
if thread > 500:
    print '\033[1;31;1m Command parse error !!! \033[0m';
    exit();
if __name__ == '__main__':
    hound_db = DB();
    blast = blast();
    if Dictionaries:  # bulk-import dictionary entries
        function.process(hound_db.Dictionaries, Dictionaries);
    elif url:
        if picture == 1:
            function.a1();
        elif picture == 2:
            function.a2();
        elif picture == 3:
            function.a3();
        elif picture == 4:
            function.a4();
        lis = hound_db.query_all("select lis from lis");  # fetch all dictionary entries
        print "\033[1;35;1m Dictionary--> %i Tools--> hound version--> 1.0 \033[0m \n" % (len(lis));
def __init__(self, table, url, thread, depth):  # create the crawl table and start crawling
    dangqiangurl = url
    print dangqiangurl, '----->Being crawler ! ^_^'
    depth = int(depth)
    thread = int(thread)
    tables = table + "_p"
    domain = url
    sql = "select count(table_name) from information_schema.tables where table_name = '%s' and TABLE_SCHEMA = '%s'" % (
        tables, CORE.db)
    if not DB().query(sql):  # check whether the table exists; create it if it does not
        sql = """CREATE TABLE IF NOT EXISTS %s (
            id int not null primary key auto_increment,
            url text not null comment 'url',
            domain text not null comment 'yuming',
            state int default 0)DEFAULT CHARSET=utf8""" % (tables)
        DB().increase(sql)  # create the table
    DB().p_url_increase(tables, url, domain)  # store the seed url
    while True:
        now_depth = DB().query(
            "select count(*) from %s where domain = '%s' and state = 1" %
            (tables, domain))
        if now_depth > depth:
            break
        now_depth = DB().query("select count(*) from %s where state = 0" % (tables))
        if now_depth == 0:
            break
        sql = "select url,domain from %s where state = 0 and domain = '%s' limit %i" % (
            tables, domain, thread)
        url = DB().query_all(sql)
        if len(url) > 0:
            for x in url:
                ts = threading.Thread(
                    target=crawler.p_get_text,
                    args=["http://" + x[0], x[0], tables, x[1]])
                ts.start()
                crawler.crawler_progress.append(ts)  # register the thread so we can wait for it
                sql = "update %s set state = 1 where url = '%s'" % (tables, x[0])
                DB().increase(sql)
                time.sleep(0.2)
            for abcd in crawler.crawler_progress:
                abcd.join()  # wait for the threads to finish
            time.sleep(0.5)
        else:
            break
    print dangqiangurl, '<----- OK End of crawler ^_^'
                        args=[tables, A, domain])
                    tkk.start()
                    crawler.crawler_progress.append(tkk)
                    print a_href, '-->Wait for all responses, and do the two processing ^_^'
                    time.sleep(0.2)
        else:
            return False

    @staticmethod
    def if_code(tables, url, domain):
        try:
            code = requests.get("http://" + url, timeout=5).status_code
        except Exception, e:
            code = 404
        if code != 404 and code != 403:  # the domain + file responded
            # clean the URL up a little before storing it
            strinfo = re.compile("/+")
            a_href = strinfo.sub('/', url)  # collapse repeated "/" into one (this also turns http:// into http:/)
            strinfo = re.compile("http:/")
            a_href = strinfo.sub('http://', a_href)  # turn http:/ back into http://
            strinfo = re.compile("/+$")
            a_href = strinfo.sub('', a_href)  # strip any trailing /
            DB().p_url_increase(tables, a_href, domain)  # store it in the DB
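
# Illustrative only: the three re.sub passes from if_code() applied in sequence
# to a made-up messy link, showing why the chain collapses "//", repairs
# "http://" and strips the trailing slash before the URL is stored.
import re

a_href = "http://example.com//foo///bar/"
a_href = re.sub("/+", "/", a_href)            # -> "http:/example.com/foo/bar/"
a_href = re.sub("http:/", "http://", a_href)  # -> "http://example.com/foo/bar/"
a_href = re.sub("/+$", "", a_href)            # -> "http://example.com/foo/bar"
print a_href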
# -*- coding=utf-8 -*-
import requests, re, sys, time, threading, socket
sys.path.append("..")
from mysql.DB import DB
db_plus = DB()


class simple(object):
    walk = 0
    walk2 = 0
    recursion_walk = 0
    """Simple HTTP requests"""

    def __init__(self):
        self.header2 = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0'
        }

    def h_get_text(self, url):  # fetch the page source
        try:
            r = requests.get(url, headers=self.header2, timeout=10)
# -*- coding=utf-8 -*-
import requests, re, sys, time, threading, socket
import dns.resolver
sys.path.append("..")
from mysql.DB import DB
import core
db_plus = DB()


class simple(object):
    walk = 0
    walk2 = 0
    recursion_walk = 0
    """Simple HTTP requests"""

    def __init__(self):
        self.header2 = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0'
        }

    def h_get_text(self, url):  # fetch the page source