Example #1
def crawl(url, currentDepth, countUrls):

    redisCon = Redis(host=conf.REDIS_HOST,
                      port=conf.REDIS_PORT,
                      password=conf.REDIS_PASSWD)

    try:
        headers = dict()
        headers[HTTP_HEADER.USER_AGENT] = randomUserAgents()

        response = requests.get(url, timeout=10, headers=headers)
        # crawlMsg = 'crawled %s depth: %d count: %d' % (url, currentDepth, countVisitedUrls)
        # logger.log(CUSTOM_LOGGING.SYSINFO, crawlMsg)
        content = response.text

        kb.pageEncoding = response.encoding
        conf.cookie = str(response.cookies.get_dict())
        hashData = hashUrl(url)
        redisCon.sadd('visited', hashData)
        redisCon.lpush('visitedList', url)
        getDB().insert({'url':url, 'depth': currentDepth, 'count':countUrls})

    except Exception, ex:
        logger.log(CUSTOM_LOGGING.ERROR, ex)
        # print traceback.print_exc()
        return
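This example, like the ones below, reduces each URL to a hash via hashUrl before touching the Redis 'visited' structures. The helper itself is not part of these listings; a minimal sketch of what it might look like, assuming a plain MD5 hex digest of the URL (the real project may normalize the URL first), is:

import hashlib

def hashUrl(url):
    # Hypothetical helper: reduce a URL to a fixed-size key so that
    # membership checks against the Redis 'visited' set stay cheap.
    return hashlib.md5(url.encode('utf-8')).hexdigest()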
Example #2
    def crawlerThread():
        global countVisitedUrls

        while visitQueue.qsize() > 0:
            url = visitQueue.get()
            try:
                hashData = hashUrl(url)
                if hashData not in visited:
                    headers[HTTP_HEADER.USER_AGENT] = randomUserAgents()
                    response = requests.get(url, timeout=10, headers=headers)
                    crawlMsg = 'crawled %s depth: %d count: %d' % (url, currentDepth, countVisitedUrls)
                    logger.log(CUSTOM_LOGGING.SYSINFO, crawlMsg)
                    content = response.text

                    kb.pageEncoding = response.encoding
                    conf.cookie = str(response.cookies.get_dict())

                    try:
                        lock.acquire()
                        visited.add(hashData)
                        countVisitedUrls += 1
                        fp.write(url + '\n')
                        lock.release()
                    except Exception, ex:
                        logger.log(CUSTOM_LOGGING.ERROR, ex)
                        if lock.locked():
                            lock.release()
                        continue
                else:
                    continue
Example #3
 def __init__(self):
     self.redisCon = Redis(host=conf.REDIS_HOST,
                           port=conf.REDIS_PORT,
                           password=conf.REDIS_PASSWD)
     self.jobQueue = Queue(connection=self.redisCon)
     map(lambda key: self.redisCon.delete(key), [key for key in self.redisCon.keys() if re.search('visit|rq:', key, re.I)])
     hashData = hashUrl(conf.CRAWL_SITE)
     self.redisCon.lpush('visit', conf.CRAWL_SITE)
     self.redisCon.sadd('visitSet', hashData)
Example #4
 def __init__(self):
     self.redisCon = Redis(host=conf.REDIS_HOST,
                           port=conf.REDIS_PORT,
                           password=conf.REDIS_PASSWD)
     self.jobQueue = Queue(connection=self.redisCon)
     map(lambda key: self.redisCon.delete(key), [
         key for key in self.redisCon.keys()
         if re.search('visit|rq:', key, re.I)
     ])
     hashData = hashUrl(conf.CRAWL_SITE)
     self.redisCon.lpush('visit', conf.CRAWL_SITE)
     self.redisCon.sadd('visitSet', hashData)
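Both constructors above clear any stale visit*/rq:* keys, then seed the work list with conf.CRAWL_SITE. The jobQueue is an RQ queue, so the crawl jobs enqueued later are executed by separate worker processes. A minimal, hypothetical worker start-up on the same default queue might look like this (the connection settings are assumptions):

from redis import Redis
from rq import Queue, Worker

redisCon = Redis(host='127.0.0.1', port=6379)   # assumed connection settings
queue = Queue(connection=redisCon)              # same default queue the dispatcher enqueues into

# Blocks and executes jobs such as crawl(url, depth, count) as they arrive.
Worker([queue], connection=redisCon).work()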
Example #5
def crawl(target):

    visited = set()
    visitSet = set()   # hashes of URLs queued for the current depth
    visitQueue = Queue()
    visitQueue.put(target)
    hashData = hashUrl(target)
    visitSet.add(hashData)
    fp = codecs.open(paths.PENEWORK_ROOT_PATH+'/'+conf.STORE_FILENAME, 'w', 'utf-8')
    lock = Lock()
    headers = dict()

    currentDepth = 0

    def crawlerThread():
        global countVisitedUrls

        while visitQueue.qsize() > 0:
            url = visitQueue.get()
            try:
                hashData = hashUrl(url)
                if hashData not in visited:
                    headers[HTTP_HEADER.USER_AGENT] = randomUserAgents()
                    response = requests.get(url, timeout=10, headers=headers)
                    crawlMsg = 'crawled %s depth: %d count: %d' % (url, currentDepth, countVisitedUrls)
                    logger.log(CUSTOM_LOGGING.SYSINFO, crawlMsg)
                    content = response.text

                    kb.pageEncoding = response.encoding
                    conf.cookie = str(response.cookies.get_dict())

                    try:
                        lock.acquire()
                        visited.add(hashData)
                        countVisitedUrls += 1
                        fp.write(url + '\n')
                        lock.release()
                    except Exception, ex:
                        logger.log(CUSTOM_LOGGING.ERROR, ex)
                        if lock.locked():
                            lock.release()
                        continue
                else:
                    continue

            except Exception, ex:
                logger.log(CUSTOM_LOGGING.ERROR, ex)
                traceback.print_exc()
                continue
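The manual acquire/release around the shared visited set and countVisitedUrls counter has to remember to release the lock on every error path. The same critical section can be written more defensively with the lock as a context manager; a sketch of the equivalent block, using the same names as the example, is:

# The lock is released automatically even if fp.write() raises.
with lock:
    visited.add(hashData)
    countVisitedUrls += 1
    fp.write(url + '\n')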
Example #6
    def start(self):

        initDB()

        countDepth = 0
        countUrls = 0

        while countDepth <= int(conf.CRAWL_DEPTH):

            while True:
                # wait for 10 minutes
                # print 'len visite:', self.redisCon.llen('visit')
                # print 'len visited:', self.redisCon.scard('visited')
                url = self.redisCon.lpop('visit')
                if url:
                    countUrls += 1
                    print 'countDepth:', countDepth, 'countUrls:', countUrls
                    self.jobQueue.enqueue_call(crawl,
                                               args=(url, countDepth,
                                                     countUrls))
                else:
                    self.redisCon.delete('visitSet')
                    break

            while True:
                # wait 30 seconds; on timeout the job queue is empty (except failed jobs)
                keyUrl = self.redisCon.blpop('tmpVisit', timeout=30)
                if keyUrl:
                    url = keyUrl[1]
                    hashData = hashUrl(url)
                    if not self.redisCon.sismember('visited', hashData) and \
                            not self.redisCon.sismember('visitSet', hashData):
                        self.redisCon.lpush('visit', url)
                        self.redisCon.sadd('visitSet', hashData)
                else:
                    break

            countDepth += 1
Example #7
    def start(self):

        initDB()

        countDepth = 0
        countUrls = 0

        while countDepth <= int(conf.CRAWL_DEPTH):

            while True:
                # wait for 10 minutes
                # print 'len visite:', self.redisCon.llen('visit')
                # print 'len visited:', self.redisCon.scard('visited')
                url = self.redisCon.lpop('visit')
                if url:
                    countUrls += 1
                    print 'countDepth:', countDepth, 'countUrls:', countUrls
                    self.jobQueue.enqueue_call(crawl, args=(url, countDepth, countUrls))
                else:
                    self.redisCon.delete('visitSet')
                    break

            while True:
                # wait 30 seconds; on timeout the job queue is empty (except failed jobs)
                keyUrl = self.redisCon.blpop('tmpVisit', timeout=30)
                if keyUrl:
                    url = keyUrl[1]
                    hashData = hashUrl(url)
                    if not self.redisCon.sismember('visited', hashData) and \
                            not self.redisCon.sismember('visitSet', hashData):
                        self.redisCon.lpush('visit', url)
                        self.redisCon.sadd('visitSet', hashData)
                else:
                    break

            countDepth += 1
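The dispatcher above drains 'visit' into RQ jobs for the current depth, then collects newly discovered links from 'tmpVisit' and promotes the unseen ones back into 'visit' for the next depth. The producer side that feeds 'tmpVisit' is not shown in these listings; a hypothetical sketch of it, assuming an extractLinks helper, would be:

def pushDiscoveredLinks(redisCon, content, baseUrl):
    # Hypothetical: every link found on a crawled page goes onto 'tmpVisit';
    # the dispatcher dedupes against 'visited' and 'visitSet' before
    # scheduling it for the next depth.
    for link in extractLinks(content, baseUrl):   # extractLinks is an assumption
        redisCon.lpush('tmpVisit', link)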
Example #8
                    for form in forms:
                        formMsg = '%s has form, url: %s method: %s data: %s' % (url, form[0], form[1], form[2])
                        logger.log(CUSTOM_LOGGING.WARNING, formMsg)
                        try:
                            lock.acquire()
                            fp.write(formMsg + '\n')
                            lock.release()
                        except Exception, ex:
                            logger.log(CUSTOM_LOGGING.ERROR, ex)
                            if lock.locked():
                                lock.release()



    while conf.CRAWL_DEPTH >= currentDepth:

        links = Queue()
        # runThreads(conf.numThreads, crawlerThread)
        crawlerThread()
        visitSet = set()

        while links.qsize() > 0:
            tmpUrl = links.get()
            hashData = hashUrl(tmpUrl)
            if hashData not in visited and hashData not in visitSet:
                visitQueue.put(tmpUrl)
                visitSet.add(hashData)

        currentDepth += 1
    fp.close()
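In the in-memory variant above, runThreads(conf.numThreads, crawlerThread) is commented out and crawlerThread() runs in the main thread. The helper itself is not shown in these listings; a minimal sketch of what it could look like, built on the standard threading module, is:

import threading

def runThreads(numThreads, target):
    # Hypothetical helper: start numThreads workers running the same function
    # and wait until all of them have drained the shared queue.
    threads = [threading.Thread(target=target) for _ in range(numThreads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()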