Exemplo n.º 1
0
def test():
    runque = RedisQueueConnection('scan').conn
    #########runque.flushdb()
    size = runque.qsize()
    t = 0
    cnt = 0
    port = 0
    data = []
    if size == 0:
        return
    tmp = runque.get()
    runque.put(tmp)
    port, iph, ipl = tmp.split(' ')

    print port, size

    raw_input('confirm:')

    while runque.qsize() > 0:
        tmp = runque.get()
        try:
            t += len(tmp.split(' ')[-1].split(','))
            data.append(tmp)
        except:
            pass
        cnt += 1

    f = "china%s_%s_%s.txt" % (port, size, t)
    fp = open(f, 'w')
    for item in data:
        fp.write(item + '\n')

    fp.close()
Exemplo n.º 2
0
def main():
    if len(sys.argv) != 4:
        print "wrong param"
        exit(0)
        

        
    port = int(sys.argv[1])
    cnt = int(sys.argv[2])
    cur = int(sys.argv[3])
    
    ol = 'ret_%s.txt' %  (port)
    f = open(il)
    i = 0
    slist = list()
    lines = f.read().strip().split('\n')
    
    slist = lines[ cur : len(lines) : cnt ]
    scancnt = len(slist)

    tmp = "scanips_%s_%s_%s" % (port, cnt, cur)
    if os.path.isfile(tmp):
        os.unlink(tmp)

    f = open(tmp,'w')
    for ip in slist:
        f.write(ip + '\n')
    f.close()

       
    cmd = scancmd % (port, tmp, ol)
    print cmd
    run(cmd)
    # when done
    # make bitmap to store the scanret and then insert into redis
    print "run done, we collect the ips"
    tmp = open(ol).read().split('\n')[1:-2]
    ips = list()
    for ip in tmp:
        ips.append(ip.split()[3])

    alivecnt = len(ips)
    print "SCAN: %d ALIVE: %d " % (scancnt, alivecnt)
    
    scanque = RedisQueueConnection('scan').conn
    
    ipd = dict()
    for ip in ips:
        h  = calc(ip)
        ht = ip.split('.')[-1]
        if h in ipd:
            ipd[h].append(ht)
        else:
            ipd[h] = [ht]
    for h in ipd:
        i = [port, h, ipd[h]]
        item = pickle.dumps(i)
        scanque.put(item)
    print "Insert into redis done "
    print "Total: %d" % (scanque.qsize())
Exemplo n.º 3
0
def test():
    runque = RedisQueueConnection('scan').conn
    #########runque.flushdb()
    size =  runque.qsize()
    t = 0
    cnt = 0
    port = 0
    data = []
    if size == 0:
        return
    tmp = runque.get()
    runque.put(tmp)
    port, iph, ipl = tmp.split(' ') 

    print port, size

    raw_input('confirm:')

    while runque.qsize() > 0:
        tmp =  runque.get()
        try:
            t += len(tmp.split(' ')[-1].split(','))
            data.append(tmp)
        except:
            pass
        cnt += 1

   
    f = "china%s_%s_%s.txt" % (port, size, t)
    fp = open(f, 'w')
    for item in data:
        fp.write(item + '\n')

    fp.close()
Exemplo n.º 4
0
def insert():
    
    runque = RedisQueueConnection('extracturls').conn
    urls = flist('urlstogetrobots1.txt')[100000:300000]
    print len(urls)
    for url in urls:
        runque.put(url)
Exemplo n.º 5
0
def test():
    runque = RedisQueueConnection('scan').conn

    size = runque.qsize()
    print size
    sleep(1)
    cnt = 0
    if size:
        while cnt < size:
            i = runque.get()
            print i
            runque.put(i)
            cnt += 1

        runque.flushdb()
    exit(0)

    f = open('seeds995k.txt')

    urls = f.read().strip().split('\n')

    if size == 0:
        i = 0
        st = time()
        for url in urls:
            runque.put(url)
Exemplo n.º 6
0
def main():
    if len(sys.argv) != 4:
        print "wrong param"
        exit(0)

    port = int(sys.argv[1])
    cnt = int(sys.argv[2])
    cur = int(sys.argv[3])

    ol = 'ret_%s.txt' % (port)
    f = open(il)
    i = 0
    slist = list()
    lines = f.read().strip().split('\n')

    slist = lines[cur:len(lines):cnt]
    scancnt = len(slist)

    tmp = "scanips_%s_%s_%s" % (port, cnt, cur)
    if os.path.isfile(tmp):
        os.unlink(tmp)

    f = open(tmp, 'w')
    for ip in slist:
        f.write(ip + '\n')
    f.close()

    cmd = scancmd % (port, tmp, ol)
    print cmd
    run(cmd)
    # when done
    # make bitmap to store the scanret and then insert into redis
    print "run done, we collect the ips"
    tmp = open(ol).read().split('\n')[1:-2]
    ips = list()
    for ip in tmp:
        ips.append(ip.split()[3])

    alivecnt = len(ips)
    print "SCAN: %d ALIVE: %d " % (scancnt, alivecnt)

    scanque = RedisQueueConnection('scan').conn

    ipd = dict()
    for ip in ips:
        h = calc(ip)
        ht = ip.split('.')[-1]
        if h in ipd:
            ipd[h].append(ht)
        else:
            ipd[h] = [ht]
    for h in ipd:
        i = [port, h, ipd[h]]
        item = pickle.dumps(i)
        scanque.put(item)
    print "Insert into redis done "
    print "Total: %d" % (scanque.qsize())
Exemplo n.º 7
0
def daemon(tempque):
    """Forward items from tempque to the 'robots' queue, forever.

    When tempque is empty the loop sleeps for a pause chosen once at start
    (random.random() + 0.5 seconds) before checking again. Never returns.
    """
    pause = random.random() + 0.5
    sink = RedisQueueConnection('robots').conn

    while True:
        if tempque.empty():
            sleep(pause)
            continue
        item = tempque.get()
        sink.put(item)
Exemplo n.º 8
0
def inserturls():
    
    runque = RedisQueueConnection('extracturls').conn
    print runque.qsize()
    raw_input('flushdb?')
    runque.flushdb()
    urls = flist('urlstogetip.txt')
    for url in urls:
        runque.put(url)
    
    print runque.qsize()
Exemplo n.º 9
0
def test():
    runque = RedisQueueConnection('running').conn

    size = runque.qsize()
    print size

    exit(0)
    if size == 0:
        f = open('seeds995k.txt')
        for c in f:
            url = c.strip()
            runque.put(url)
Exemplo n.º 10
0
def test():
    runque = RedisQueueConnection('running').conn

    size = runque.qsize()
    print size



    exit(0)
    if size == 0:
        f = open('seeds995k.txt')
        for c in f:
            url = c.strip()
            runque.put(url)
Exemplo n.º 11
0
def test():
    runque = RedisQueueConnection('robots').conn
    #########runque.flushdb()
    size =  runque.qsize()
    item = runque.get()
    runque.put(item)
    print pickle.loads(item)

    print size
    
    return
    raw_input('cofrim')
    s = flist('urlstogetrobots.txt')
    for url in s:
        runque.put(url) 


    print runque.qsize()
Exemplo n.º 12
0
def show(name):
    runque = RedisQueueConnection(name).conn
    cnt = 0
    while cnt < runque.qsize():
        data = runque.get()
        runque.put(data)
        data = pickle.loads(data)
        
        seed =  data['seed']
        data = data['content'].replace('\r', '\n').replace('\n\n','\n').strip()
        if not data:
            continue
        if data.find('<') >= 0:
            #html page
            print seed
            continue
        
        robots = data.split('\n')
        print seed 
        print
        print "\n".join(robots)
        print  
        cnt += 1
Exemplo n.º 13
0
class Daemon:
    """Store downloaded pages in SQLite and feed their links back to the crawl.

    Pickled page records are taken from ``done_que``. For each record the
    page body is zlib-compressed and inserted into the current database
    file, and the URLs found in the body (after filtering) are pushed onto
    the 'extracturls' Redis queue. A new database file is started once the
    tracked size passes ``dbsizelimit``.
    """

    # Schema used for the first database and for every rotated one.
    _CREATE_TABLE_SQL = (
        "create table if not exists mainpages "
        "(id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)"
    )

    def __init__(self, done_que):
        """Open (or create) the database and connect the queues.

        done_que: queue delivering pickled dicts with 'seed', 'headers' and
        'content' keys (see run()).
        """
        self.cnt = 0
        self.showpercounts = 100  # commit and report every N stored pages
        self.dbsize = 0
        self.dbsizelimit = 536870912  # 512M
        self.spend = 0

        #queue for daemon recieve downloaded websites info
        self.done_que = done_que
        #urls queue to put filtered urls extracted from the webpage
        self.urls_que = RedisQueueConnection('extracturls').conn
        self.urlfilter = Filter()

        self.ip = getip()
        # getdbname() also sets self.dbsize to the size of a reused file.
        self.fname = self.getdbname()
        self.conn = self._opendb(self.fname)

        #when recv the ctrl+c signal, run out the extractation jobs and then quit
        self.quit = False

    def _opendb(self, fname):
        """Connect to the SQLite file *fname* and make sure the table exists."""
        conn = sqlite3.connect(fname)
        conn.execute(self._CREATE_TABLE_SQL)
        return conn

    def _shutdown(self):
        """Commit pending rows, close the database and exit the process."""
        try:
            # Rows inserted since the last periodic commit would otherwise
            # be lost when the process exits.
            self.conn.commit()
            self.conn.close()
        except sqlite3.Error as e:
            print(e)
        print("Daemon run done and quit successfuly")
        exit(0)

    def getdbname(self, create=False):
        """Return the path of the database file to use and set self.dbsize.

        create: when True, always return a fresh
        "sitedata_<timestamp>_<ip>.db" name. Otherwise the newest existing
        file in /work/db whose name ends with "<ip>.db" is reused (and
        self.dbsize is set to its size on disk); a fresh name is returned
        only if none is found.
        """
        path = "/work/db"
        tf = "%Y%m%d-%H%M%S"
        pre = "sitedata"
        suf = ".db"

        ip = getip()
        lastname = ""

        if not create:
            findname = "%s%s" % (ip, suf)
            last = 0
            for fname in os.listdir(path):
                if not fname.endswith(findname):
                    continue
                try:
                    fnown = int(time.mktime(time.strptime(fname.split('_')[1], tf)))
                except (IndexError, ValueError, OverflowError):
                    # Name does not follow <pre>_<timestamp>_<ip><suf>;
                    # ignore it instead of aborting start-up.
                    continue
                if fnown > last:
                    last = fnown
                    lastname = fname

        if lastname:
            print("Reuse the last db:  %s" % lastname)
            self.dbsize = os.stat(os.path.join(path, lastname)).st_size
        else:
            #can not found the newest db file (or a new one was requested), so create it
            date = time.strftime(tf, time.localtime())
            lastname = "_".join([pre, date, ip]) + suf
            self.dbsize = 0
            print("Create db:  %s" % lastname)

        return os.path.join(path, lastname)

    def geturls(self, seed, content):
        """Return the filtered URLs found in *content*, or [] on any failure.

        Extraction is best-effort: an ordinary exception from the regex or
        the filter yields an empty list. KeyboardInterrupt is deliberately
        not swallowed so that run() sees the quit signal.
        """
        if not content:
            return []
        try:
            urls = re.findall(self.urlfilter.urlpatern, content)
            return self.urlfilter.filter_urls(seed, urls)
        except Exception:
            return []

    def run(self):
        """Process page records until asked to quit and the queue is empty.

        Ctrl+C sets the quit flag; the loop then keeps draining the queue
        and exits through _shutdown() once it stays empty.
        """
        #backend job,
        sleep(2)
        while True:
            try:
                if self.done_que.empty():
                    if self.quit:
                        #the speed to extract urls is more higher than crawler
                        sleep(1)
                        if not self.done_que.empty():
                            continue
                        self._shutdown()

                    #print "Downloaded queue empty, wait crawler ..."
                    sleep(10)
                    continue

                # NOTE(review): unpickling executes arbitrary code from the
                # payload; only safe while every producer is trusted.
                data = cPickle.loads(self.done_que.get())

                seed = data['seed']
                content = data['content']
                headers = str(data['headers'])

                #put the extracted urls to urls_que
                for url in self.geturls(seed, content):
                    self.urls_que.put(url)

                #use level 1 to compress data , we get enough compress ratio and speed
                gziphtml = sqlite3.Binary(gzip.zlib.compress(content, 1))
                self.dbsize += len(gziphtml) + len(seed) + len(headers)

                self.conn.execute(
                    "insert into mainpages (url,headers,content) values (?,?,?)",
                    (seed, headers, gziphtml))

                self.cnt += 1
                if self.cnt % self.showpercounts == 0:
                    self.conn.commit()

                    print("\n%s\n\tExtract done:%d todo:%d size:%dM" %
                          (self.ip, self.cnt, self.done_que.qsize(), self.dbsize/1024/1024))

                    # Rotate to a fresh file once the tracked size passes the
                    # limit; getdbname(True) resets self.dbsize to 0.
                    if self.dbsize > self.dbsizelimit:
                        self.fname = self.getdbname(True)
                        self.conn.close()
                        self.conn = self._opendb(self.fname)

            except Exception as e:
                print(e)
            except KeyboardInterrupt:
                print("Daemon recv quit singal, waiting for queue empty")
                self.quit = True
Exemplo n.º 14
0
            con = req.content
            #print url, len(con)
            req.close()
        
    except:
        pass
    data =  (url, con)
    cb(data)
   

from time import time


def cb(data):
    """Completion callback that currently does nothing.

    data is unpacked as a (seed, content) pair and then discarded; unpacking
    fails if data does not hold exactly two items.
    """
    (page_seed, page_body) = data

 
# Endless driver loop: take a URL from runque, put it straight back, and
# spawn httpget for it in pool. Every 10 spawns, print the spawn rate since
# start followed by the sizes of runque and robotsque.
cnt = 0
sst = time()  # start of the whole run; the rate below is measured against it
while True:
    url = runque.get()
    runque.put(url)
    st = time()  # not read anywhere in the visible code
    pool.spawn(httpget, url)
    et = time()
    cnt += 1

    if cnt % 10 == 0:
        print cnt / (et-sst) ,runque.qsize(), robotsque.qsize()
Exemplo n.º 15
0
class Daemon:
    """Store downloaded pages in SQLite and feed their links back to the crawl.

    Pickled page records are taken from ``done_que``. For each record the
    page body is zlib-compressed and inserted into the current database
    file, and the URLs found in the body (after filtering) are pushed onto
    the 'extracturls' Redis queue. A new database file is started once the
    tracked size passes ``dbsizelimit``.
    """

    # Schema used for the first database and for every rotated one.
    _CREATE_TABLE_SQL = (
        "create table if not exists mainpages "
        "(id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)"
    )

    def __init__(self, done_que):
        """Open (or create) the database and connect the queues.

        done_que: queue delivering pickled dicts with 'seed', 'headers' and
        'content' keys (see run()).
        """
        self.cnt = 0
        self.showpercounts = 100  # commit and report every N stored pages
        self.dbsize = 0
        self.dbsizelimit = 536870912  # 512M
        self.spend = 0

        #queue for daemon recieve downloaded websites info
        self.done_que = done_que
        #urls queue to put filtered urls extracted from the webpage
        self.urls_que = RedisQueueConnection('extracturls').conn
        self.urlfilter = Filter()

        self.ip = getip()
        # getdbname() also sets self.dbsize to the size of a reused file.
        self.fname = self.getdbname()
        self.conn = self._opendb(self.fname)

        #when recv the ctrl+c signal, run out the extractation jobs and then quit
        self.quit = False

    def _opendb(self, fname):
        """Connect to the SQLite file *fname* and make sure the table exists."""
        conn = sqlite3.connect(fname)
        conn.execute(self._CREATE_TABLE_SQL)
        return conn

    def _shutdown(self):
        """Commit pending rows, close the database and exit the process."""
        try:
            # Rows inserted since the last periodic commit would otherwise
            # be lost when the process exits.
            self.conn.commit()
            self.conn.close()
        except sqlite3.Error as e:
            print(e)
        print("Daemon run done and quit successfuly")
        exit(0)

    def getdbname(self, create=False):
        """Return the path of the database file to use and set self.dbsize.

        create: when True, always return a fresh
        "sitedata_<timestamp>_<ip>.db" name. Otherwise the newest existing
        file in /work/db whose name ends with "<ip>.db" is reused (and
        self.dbsize is set to its size on disk); a fresh name is returned
        only if none is found.
        """
        path = "/work/db"
        tf = "%Y%m%d-%H%M%S"
        pre = "sitedata"
        suf = ".db"

        ip = getip()
        lastname = ""

        if not create:
            findname = "%s%s" % (ip, suf)
            last = 0
            for fname in os.listdir(path):
                if not fname.endswith(findname):
                    continue
                try:
                    fnown = int(time.mktime(time.strptime(fname.split('_')[1], tf)))
                except (IndexError, ValueError, OverflowError):
                    # Name does not follow <pre>_<timestamp>_<ip><suf>;
                    # ignore it instead of aborting start-up.
                    continue
                if fnown > last:
                    last = fnown
                    lastname = fname

        if lastname:
            print("Reuse the last db:  %s" % lastname)
            self.dbsize = os.stat(os.path.join(path, lastname)).st_size
        else:
            #can not found the newest db file (or a new one was requested), so create it
            date = time.strftime(tf, time.localtime())
            lastname = "_".join([pre, date, ip]) + suf
            self.dbsize = 0
            print("Create db:  %s" % lastname)

        return os.path.join(path, lastname)

    def geturls(self, seed, content):
        """Return the filtered URLs found in *content*, or [] on any failure.

        Extraction is best-effort: an ordinary exception from the regex or
        the filter yields an empty list. KeyboardInterrupt is deliberately
        not swallowed so that run() sees the quit signal.
        """
        if not content:
            return []
        try:
            urls = re.findall(self.urlfilter.urlpatern, content)
            return self.urlfilter.filter_urls(seed, urls)
        except Exception:
            return []

    def run(self):
        """Process page records until asked to quit and the queue is empty.

        Ctrl+C sets the quit flag; the loop then keeps draining the queue
        and exits through _shutdown() once it stays empty.
        """
        #backend job,
        sleep(2)
        while True:
            try:
                if self.done_que.empty():
                    if self.quit:
                        #the speed to extract urls is more higher than crawler
                        sleep(1)
                        if not self.done_que.empty():
                            continue
                        self._shutdown()

                    #print "Downloaded queue empty, wait crawler ..."
                    sleep(10)
                    continue

                # NOTE(review): unpickling executes arbitrary code from the
                # payload; only safe while every producer is trusted.
                data = cPickle.loads(self.done_que.get())

                seed = data['seed']
                content = data['content']
                headers = str(data['headers'])

                #put the extracted urls to urls_que
                for url in self.geturls(seed, content):
                    self.urls_que.put(url)

                #use level 1 to compress data , we get enough compress ratio and speed
                gziphtml = sqlite3.Binary(gzip.zlib.compress(content, 1))
                self.dbsize += len(gziphtml) + len(seed) + len(headers)

                self.conn.execute(
                    "insert into mainpages (url,headers,content) values (?,?,?)",
                    (seed, headers, gziphtml))

                self.cnt += 1
                if self.cnt % self.showpercounts == 0:
                    self.conn.commit()

                    print("\n%s\n\tExtract done:%d todo:%d size:%dM" %
                          (self.ip, self.cnt, self.done_que.qsize(), self.dbsize/1024/1024))

                    # Rotate to a fresh file once the tracked size passes the
                    # limit; getdbname(True) resets self.dbsize to 0.
                    if self.dbsize > self.dbsizelimit:
                        self.fname = self.getdbname(True)
                        self.conn.close()
                        self.conn = self._opendb(self.fname)

            except Exception as e:
                print(e)
            except KeyboardInterrupt:
                print("Daemon recv quit singal, waiting for queue empty")
                self.quit = True
Exemplo n.º 16
0
class Crawler:
    """Fetch URLs from the 'running' Redis queue with a pool of greenlets.

    run() keeps taking a URL from the queue, putting it back, and spawning
    a download for it in the pool. Each download reports to cb_httpget(),
    which classifies failures through handle_error() and periodically prints
    a progress line through out().
    """

    def __init__(self, done_que):
        """Set up counters, the run queue and the download pool.

        done_que: queue meant to receive downloaded pages; it is stored but
        not written to by the visible code (the put is commented out in
        cb_httpget()).
        """
        self.showpercounts = 100  # print a progress line every N downloads
        self.timeout = 5
        self.starttime = time()

        self.quit = False

        self.run_que = RedisQueueConnection('running').conn
        self.done_que = done_que
        self.tasks = []
        self.done = 1  # number of spawned downloads; starts at 1

        self.errdone = set()
        self.err = Error()
        self.https_enable = 0

        self.httpget = self.httpget_requests  # down method self.httpget_requests | httpget_curl

        self.poolsize = 100
        self.down_pool = Pool(size=self.poolsize)

        self.totalnettime = 0
        self.totaldownsize = 0

        self.ip = getip()

    #callback function when greenlet of httpget run done
    def cb_httpget(self, data=None):
        """Handle the result tuple (seed, err, headers, content) of a download.

        A falsy *data* (e.g. after a silent timeout) is ignored. A non-empty
        err is routed to handle_error(); otherwise the page is pickled and a
        progress line is printed every ``showpercounts`` downloads.
        """
        if not data:
            return
        seed, err, headers, content = data

        if err:
            self.handle_error(err, seed)
            return

        data = {'seed': seed, 'headers': headers, 'content': content}

        # NOTE(review): the pickled record is currently unused because the
        # put below is disabled.
        dat = cPickle.dumps(data)
        #self.done_que.put_nowait(dat)

        #print "done", seed
        if self.done % self.showpercounts == 0:
            self.out(seed)

    def out(self, seed):
        """Print a one-line progress report (elapsed time, counts, rates)."""
        spendtime = time() - self.starttime
        spendtime = 1 if spendtime == 0 else spendtime
        # Same guard for the network-time rate: avoid dividing by zero
        # before any download time has been recorded.
        nettime = 1 if self.totalnettime == 0 else self.totalnettime
        nowh = str(int(spendtime) // 3600) + ":" if spendtime > 3600 else ""
        now = "%s%02d:%02d" % (nowh, spendtime % 3600 / 60, spendtime % 60)
        print("\n%s\t%s D:%-4d R:%-7d [QPS: %.2f  %.2f]  %s" % (
            self.ip, now, (self.done), self.run_que.qsize(),
            self.done / spendtime, self.done / nettime, str(self.err)))

    def run(self):
        """Spawn downloads until Ctrl+C, then wait for the pool to finish."""
        while self.quit == False:
            try:
                if self.run_que.qsize() == 0:
                    print("run que empty")
                    sleep(10)
                    continue
                url = self.run_que.get()
                self.run_que.put(url)
                #self.down_pool.apply_cb(self.httpget, (url,), callback=self.cb_httpget)
                # spawn is more fast?
                #url = 'http://www.sdust.edu.cn'
                self.down_pool.spawn(self.httpget, url)
                self.done += 1
            except KeyboardInterrupt:
                print("Crawler recv quit singal")
                self.quit = True

        self.down_pool.join()
        print("Crawler over, quit")

    def handle_error(self, e, url):
        """Count the error text *e* for *url* in the matching self.err bucket.

        Buckets are checked in order: DNS failure, connection reset,
        connection timeout ('Max retries' / 'Connection aborted'),
        connection refused, and finally 'others', which is also printed.
        """
        self.err.lasterrurl = url
        # do not record the err url, but record the least err url to show
        # Substring tests use 'in' so that a match at index 0 counts and a
        # missing match (find() == -1) is never mistaken for a hit.
        if 'DNSError' in e:
            self.err.dns += 1
            #self.err.rdns.append(url)
        elif 'reset' in e:  #Connection reset
            self.err.reset += 1
            #self.err.rreset.append(url)
        elif 'Max retries' in e or 'Connection aborted' in e:
            self.err.conntimeout += 1
            #self.err.rconntimeout.append(url)
        elif 'refused' in e:  #Connection refused
            self.err.refuse += 1
            #self.err.rrefuse.append(url)
        else:
            self.err.others += 1
            #self.err.rothers.append(url)
            print("Error %s %s" % (url, e))

    # requests is better than curl in tests
    def httpget_requests(self, url):
        """Download *url* with requests and hand the result to cb_httpget().

        The request runs under a silent 3-second gevent timeout. On success
        the callback receives (url, "", response headers, body); on an
        exception it receives (url, error text, None, None); on timeout it
        receives None. KeyboardInterrupt is re-raised.
        """
        #return data
        data = None
        st = time()
        con = ""
        err = ""
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
        }

        res = None
        done = False
        try:
            with gevent.Timeout(3, False):
                #req.max_redirects = 2
                res = requests.get(url, headers=headers)
                con = res.content
                res.close()
                done = True
        except KeyboardInterrupt:
            raise
        except Exception as exc:
            err = str(exc)
            #as for spawn, no callback , we should call by ourself
            data = (url, err, None, None)

        # Release the connection when the download did not complete (error
        # or silent timeout). Compare against None: a Response is falsy for
        # non-2xx statuses, so a plain truth test would skip the close.
        if not done and res is not None:
            res.close()

        et = time()
        self.totalnettime += (et - st)
        #spawn
        if done:
            data = (url, err, res.headers, con)

        self.cb_httpget(data)
Exemplo n.º 17
0
            con = req.content
            #print url, len(con)
            req.close()

    except:
        pass
    data = (url, con)
    cb(data)


from time import time


def cb(data):
    """Completion callback that currently does nothing.

    data is unpacked as a (seed, content) pair and then discarded; unpacking
    fails if data does not hold exactly two items.
    """
    (page_seed, page_body) = data


# Endless driver loop: take a URL from runque, put it straight back, and
# spawn httpget for it in pool. Every 10 spawns, print the spawn rate since
# start followed by the sizes of runque and robotsque.
cnt = 0
sst = time()  # start of the whole run; the rate below is measured against it
while True:
    url = runque.get()
    runque.put(url)
    st = time()  # not read anywhere in the visible code
    pool.spawn(httpget, url)
    et = time()
    cnt += 1

    if cnt % 10 == 0:
        print cnt / (et - sst), runque.qsize(), robotsque.qsize()
Exemplo n.º 18
0
def getsize(name):
    runque = RedisQueueConnection(name).conn
    print runque.qsize()
    i = runque.get()
    runque.put(i)
    print i