Example #1
File: function.py Project: Key20/hound
def table_print(tables):  # print the contents of a table as a grid

    a = DB().query_all("desc %s" % (tables))
    table_top = []

    for x in a:
        table_top.append(x[0])

    b = DB().query_all("select * from %s" % (tables))

    table_lis = []
    for x in b:
        table_lis.append(x)

    print tabulate(table_lis, table_top, tablefmt="grid")
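
table_print pulls the column names from `desc <table>` and the rows from `select *`, then hands both to tabulate. A minimal, self-contained sketch of the same idea without the project's DB wrapper (the headers and rows below are made-up stand-ins), assuming the tabulate package is installed:

from tabulate import tabulate

headers = ["id", "url", "state"]                            # stand-in for the `desc` output
rows = [(1, "example.com", 0), (2, "test.example.com", 1)]  # stand-in for `select *`
print tabulate(rows, headers, tablefmt="grid")              # grid-formatted table, as in table_print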
Example #2
    def recursion_blast_url(self, tables, t, lis, url_list):
        progress = sys.stdout
        tables = tables.replace('.', '_')
        total = len(lis)
        # total number of requests

        fenliang = total / t
        # divide the total by the thread count to get the size of each chunk

        kaishi = 0
        jiewei = fenliang

        self.simple = simple()
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        for recursion_url in url_list:
            print " URL:" + recursion_url[
                0] + "-->\033[1;32;1m  Send out all the requests  Current time:  %s \r \033[0m" % (
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            while True:
                list2 = lis[kaishi:jiewei]
                # take this chunk of the wordlist

                t = threading.Thread(
                    target=self.simple.recursion_h_get_blast_text,
                    args=[recursion_url[0], list2, tables])
                # the target checks whether each candidate domain exists and, if found, stores it; the table name is the url value
                t.start()
                blast.xiancheng.append(t)
                if jiewei > total:
                    kaishi = 0
                    jiewei = fenliang
                    break
                else:
                    kaishi = kaishi + fenliang
                    jiewei = jiewei + fenliang
                time.sleep(0.02)
            sql = "update %s set recursion = 1 where url = '%s'" % (
                tables, recursion_url[0])
            DB().increase(sql)

        for tt in blast.xiancheng:
            tt.join()
            # wait for all threads to finish

        print "\033[1;32;1m  <--Above the domain name to send complete 0o(^_^)o0  Current time: %s \033[0m  \r\n" % (
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))
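
The slicing above (fenliang = total / t, then advancing kaishi and jiewei) splits the wordlist into one chunk per thread. A minimal, self-contained sketch of the same chunking pattern, with hypothetical worker and run_in_chunks names:

import threading

def worker(chunk):
    pass  # placeholder for the real per-chunk request logic

def run_in_chunks(wordlist, thread_count):
    chunk_size = max(1, len(wordlist) // thread_count)  # size of each slice
    threads = []
    for start in range(0, len(wordlist), chunk_size):
        t = threading.Thread(target=worker, args=(wordlist[start:start + chunk_size],))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait for all threads to finish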
Example #3
def handle(url, dangqian_url, tables, domain):

    url_2 = urlparse(url)
    url = url.replace('http://', '').replace('https://', '')

    if url_2.netloc != "":  # check whether the URL has a netloc

        if re.search(".%s" % (domain), url_2.netloc) != None:  #如果相关域名存在的话
            print '\033[1;38;1m  Get a 1 related domain name  %s \033[0m' % (
                url.split('/')[0])
            strinfo = re.compile("_p$")
            tables = strinfo.sub('', tables)

            try:
                ip = socket.gethostbyname(url)
            except Exception, e:
                ip = False
            if ip:
                DB().Domain_storage(tables,
                                    url.split('/')[0], ip)
                # url.split('/')[0] keeps only the text before the first '/', e.g. baidu.com/s/w/1.asp becomes baidu.com

            return False
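
handle() decides whether an absolute URL belongs to the target domain and, if so, resolves and stores it. A condensed sketch of just that check, with a hypothetical related_host name, using the standard urlparse and socket modules (urlparse moved to urllib.parse in Python 3):

import socket
from urlparse import urlparse  # Python 2; use urllib.parse on Python 3

def related_host(link, domain):
    host = urlparse(link).netloc or link.split('/')[0]  # text before the first '/'
    if host == domain or host.endswith("." + domain):
        try:
            return host, socket.gethostbyname(host)  # resolve to confirm it exists
        except socket.error:
            return host, None  # related but does not resolve
    return None, None  # not related to the target domain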
Example #4
def im_url(file_url, tables):
    if not os.path.exists(file_url):
        print '\033[1;31;1m   Sorry, the file does not exist.    \033[0m'
        exit()
    else:
        im_url_db = DB()
        # only import if the target table already exists
        sql = "select count(table_name) from information_schema.tables where table_name = '%s' and TABLE_SCHEMA = '%s'" % (tables, core.CORE.db)
        if im_url_db.query(sql):
            for url in open(file_url):
                # strip the scheme and line endings
                url = url.replace('http://', '').replace('https://', '').replace('\r', '').replace('\n', '')
                im_resolver = dns.resolver.Resolver()
                im_resolver.nameservers = core.default_dns
                try:
                    ip = im_resolver.query(url, 'A')[0]  # resolve the A record
                except Exception, e:
                    ip = False
                if ip != False:
                    sql = "select count(id) from %s where url = '%s' " % (tables, url)
                    if not im_url_db.query(sql):  # skip urls that are already stored
                        sql = "insert into %s values (null,\"%s\",\"%s\",0,0,0,0)" % (tables, url, str(ip))
                        im_url_db.increase(sql)
                        print url, "===>OK", ip
Example #5
File: hound.py Project: wsppt/hound
h_crawler = args.crawler;# simply grab the <a> tags


picture = random.randint(1, 4); # generate a random number



tsk = [];  # threads to wait on
crawler_progress = []; # crawler threads to wait on

if thread > 500:
	print '\033[1;31;1m Command parse error !!! \033[0m';
	exit();

if __name__ == '__main__':
	hound_db = DB();
	blast = blast();
	if Dictionaries: # bulk import dictionaries
		function.process(hound_db.Dictionaries,Dictionaries);
	elif imurl and len(imurl) == 2: # import domains
		function.im_url(imurl[0],imurl[1])
	elif url:
		if picture == 1:
			function.a1();
		elif picture == 2:
			function.a2();
		elif picture == 3:
			function.a3();
		elif picture == 4:
			function.a4();
		lis = hound_db.query_all("select lis from lis"); # fetch all dictionary entries
Example #6
            strinfo = re.compile("_p$")
            tables = strinfo.sub('', tables)

            try:
                ip = socket.gethostbyname(url)
            except Exception, e:
                ip = False
            if ip:
                DB().Domain_storage(tables,
                                    url.split('/')[0], ip)
                # url.split('/')[0] keeps only the text before the first '/', e.g. baidu.com/s/w/1.asp becomes baidu.com

            return False
    elif re.search("^/", url_2.path) != None:  #如果一开始是/的话 那么他就会跳转到根目录的
        sql = 'select count(*) from php90_cn_p where url like "%' + url_2.path + '%"'
        if DB().query(sql) > 100:
            return False
        else:

            return domain + url_2.path + url_2.query

        # do not delete (reminder to self)
    elif re.search("^./", url_2.path) != None:
        sql = 'select count(*) from php90_cn_p where url like "%' + url_2.path + '%"'
        if DB().query(sql) > 100:
            return False
        else:
            if len(dangqian_url.split('/')) > 1:
                dangqian_url2 = dangqian_url.split('/')[-1]
                strinfo = re.compile("%s$" % (dangqian_url2))
                dangqian_url = strinfo.sub('', dangqian_url)  # strip everything after the last '/'
Example #7
# -*- coding=utf-8 -*-
import requests,re,sys,time,threading,Queue;
import dns.resolver
sys.path.append("..")
from mysql.DB import DB;
import core;
db_plus = DB();
q = Queue.Queue(1);
w = Queue.Queue(1);
class simple(object):
	walk = 0;
	walk2 = 0;
	recursion_walk=0
	"""简单的http请求"""
	def __init__(self):
		self.header2 = {
		'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
		'Accept-Encoding':'gzip, deflate, br',
		'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
		'Connection':'keep-alive',
		'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0'};
	def h_get_text(self,url): # fetch the page source
		try:
			r = requests.get(url,headers=self.header2,timeout=10);  
			return r.text.encode('utf-8'); 
		except Exception,e:
			print'\033[1;31;1m'+"Exception: %s  Error: %s " % (Exception,e) +'\033[0m';
			return '';

	def h_post_text(self,url,canshu): # fetch the page source
		try:
Example #8
File: hound.py Project: dmwlei/hound
picture = random.randint(1, 4);



tsk = [];  # threads to wait on
crawler_progress = []; # crawler threads to wait on
if h_crawler != None:
	if len(h_crawler) < 2 :
		print '\033[1;31;1m Command parse error !!! \033[0m';
		exit();
if thread > 1000:
	print '\033[1;31;1m Command parse error !!! \033[0m';
	exit();


hound_db = DB();
blast = blast();
if Dictionaries: # bulk import dictionaries
	function.process(hound_db.Dictionaries,Dictionaries);
elif url:
	if picture == 1:
		function.a1();
	elif picture == 2:
		function.a2();
	elif picture == 3:
		function.a3();
	elif picture == 4:
		function.a4();
	lis = hound_db.query_all("select lis from lis"); # fetch all dictionary entries
	
	print "\033[1;35;1m  Dictionary--> %i Tools--> hound version--> 0.3 \033[0m  \n" % (len(lis));
Example #9
File: hound.py Project: Key20/hound
h_crawler = args.crawler;# crawler


picture = random.randint(1, 4); # generate a random number



tsk = [];  # threads to wait on
crawler_progress = []; # crawler threads to wait on

if thread > 500:
	print '\033[1;31;1m Command parse error !!! \033[0m';
	exit();

if __name__ == '__main__':
	hound_db = DB();
	blast = blast();
	if Dictionaries: # bulk import dictionaries
		function.process(hound_db.Dictionaries,Dictionaries);
	elif url:
		if picture == 1:
			function.a1();
		elif picture == 2:
			function.a2();
		elif picture == 3:
			function.a3();
		elif picture == 4:
			function.a4();
		lis = hound_db.query_all("select lis from lis"); # fetch all dictionary entries
		
		print "\033[1;35;1m  Dictionary--> %i Tools--> hound version--> 1.0 \033[0m  \n" % (len(lis));
Example #10
File: crawler.py Project: uncia/hound
    def __init__(self, table, url, thread, depth):  # create the table
        dangqiangurl = url
        print dangqiangurl, '-----> Crawling ^_^'
        depth = int(depth)
        thread = int(thread)
        tables = table + "_p"
        domain = url

        sql = "select count(table_name) from information_schema.tables where table_name = '%s' and TABLE_SCHEMA = '%s'" % (
            tables, CORE.db)

        if not DB().query(sql):  # check whether the table exists; create it if not
            sql = """CREATE TABLE IF NOT EXISTS %s (
				id int not null primary key auto_increment,
				url text not null comment 'url',
				domain text not null comment 'yuming',
				state int default 0)DEFAULT CHARSET=utf8""" % (tables)

            DB().increase(sql)
            # create the table

        DB().p_url_increase(tables, url, domain)
        # store in the database

        while True:

            now_depth = DB().query(
                "select count(*) from %s where domain = '%s' and state = 1" %
                (tables, domain))

            if now_depth > depth:
                break

            now_depth = DB().query("select count(*) from %s where state = 0" %
                                   (tables))

            if now_depth == 0:
                break

            sql = "select url,domain from %s where state = 0 and domain = '%s' limit %i" % (
                tables, domain, thread)

            url = DB().query_all(sql)

            if len(url) > 0:
                for x in url:

                    ts = threading.Thread(
                        target=crawler.p_get_text,
                        args=["http://" + x[0], x[0], tables, x[1]])
                    ts.start()
                    crawler.crawler_progress.append(ts)  # track the thread so we can wait for it
                    sql = "update %s set state = 1 where url = '%s'" % (tables,
                                                                        x[0])
                    DB().increase(sql)

                    time.sleep(0.2)

                for abcd in crawler.crawler_progress:
                    abcd.join()
                    # wait for the thread to finish
                    time.sleep(0.5)

            else:
                break

        print dangqiangurl, '<----- OK, crawl finished ^_^'
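
The loop above drives the crawl out of the database: rows with state = 0 are pending, rows with state = 1 have been fetched, and the crawl stops once the fetched count for the domain exceeds the depth or nothing is pending. A minimal in-memory sketch of the same pending/visited bookkeeping, with a hypothetical fetch_links callable:

def crawl(seed, max_pages, fetch_links):
    pending = {seed}   # like rows with state = 0
    visited = set()    # like rows with state = 1
    while pending and len(visited) < max_pages:
        url = pending.pop()
        visited.add(url)
        for link in fetch_links(url):  # extract <a href> targets from the page
            if link not in visited:
                pending.add(link)
    return visited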
Example #11
File: crawler.py Project: uncia/hound
                                               args=[tables, A, domain])
                        tkk.start()
                        crawler.crawler_progress.append(tkk)

                    print a_href, '--> Waiting for all responses before the second pass ^_^'
                time.sleep(0.2)

        else:
            return False

    @staticmethod
    def if_code(tables, url, domain):
        try:

            code = requests.get("http://" + url, timeout=5).status_code

        except Exception, e:
            code = 404
        if code != 404 and code != 403:  # the domain + file exists
            # clean the url up a bit before storing it
            strinfo = re.compile("/+")
            a_href = strinfo.sub('/', url)  # collapse repeated '/' (this also turns http:// into http:/)
            strinfo = re.compile("http:/")
            a_href = strinfo.sub('http://', a_href)  # put http:// back
            strinfo = re.compile("/+$")
            a_href = strinfo.sub('', a_href)  # drop any trailing '/'

            DB().p_url_increase(tables, a_href, domain)
            # store in the database
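
The three substitutions above are meant to chain into a single normalization of the url before it is stored. A standalone sketch of that normalization (normalize_url is a hypothetical name):

import re

def normalize_url(url):
    url = re.sub(r"/+", "/", url)              # collapse repeated slashes (http:// becomes http:/)
    url = re.sub(r"^http:/", "http://", url)   # put the scheme back
    return re.sub(r"/+$", "", url)             # strip any trailing slash

# normalize_url("http://example.com//a//b/") -> "http://example.com/a/b"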
Example #12
File: simple.py Project: uncia/hound
# -*- coding=utf-8 -*-
import requests, re, sys, time, threading, socket
sys.path.append("..")
from mysql.DB import DB

db_plus = DB()


class simple(object):

    walk = 0
    walk2 = 0
    recursion_walk = 0
    """简单的http请求"""
    def __init__(self):
        self.header2 = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0'
        }

    def h_get_text(self, url):  # fetch the page source
        try:
            r = requests.get(url, headers=self.header2, timeout=10)
Example #13
File: simple.py Project: Key20/hound
# -*- coding=utf-8 -*-
import requests, re, sys, time, threading, socket
import dns.resolver

sys.path.append("..")
from mysql.DB import DB
import core

db_plus = DB()


class simple(object):
    walk = 0
    walk2 = 0
    recursion_walk = 0
    """简单的http请求"""
    def __init__(self):
        self.header2 = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0'
        }

    def h_get_text(self, url):  # fetch the page source