Пример #1
0
    def run(self):
        """Run the crawl/validate/store cycle forever.

        Every cycle creates a fresh MongoHelper and Validator, calls
        ``validator.run_db()`` and compares the returned count with MINNUM.
        When the count is below MINNUM, every entry of ``parserList`` is
        crawled through ``self.crawl_pool``, the results are flattened,
        de-duplicated, passed through ``validator.run_list()`` and written
        back through the helper. The loop then sleeps for UPDATE_TIME
        seconds and starts over; this method never returns.
        """
        while True:
            print('spider beginning -------')
            sqlHelper = MongoHelper()
            print('validator beginning -------')
            validator = Validator(sqlHelper)
            count = validator.run_db()
            print('validator end ----count=%s' % count)
            if count < MINNUM:
                # map() returns one list of proxy dicts per parser:
                # [[{}, {}], [{}, {}]] -> flatten to [{}, {}, {}, {}].
                crawled = self.crawl_pool.map(self.crawl, parserList)
                proxys = [proxy for batch in crawled for proxy in batch]
                # Use %-formatting: "print 'fmt', value" printed the literal
                # "%s" followed by the value instead of substituting it.
                print('first_proxys--%s' % len(proxys))

                # De-duplicate. Items are sorted so that two equal dicts map
                # to the same tuple regardless of their iteration order.
                unique = set(tuple(sorted(proxy.items())) for proxy in proxys)
                proxys = [dict(items) for items in unique]

                print('spider proxys -------%s' % type(proxys))
                # run_list() returns the checked proxies; entries that are
                # None are dropped before storing.
                proxys = validator.run_list(proxys)
                proxys = [value for value in proxys if value is not None]
                print('end_proxys--%s' % len(proxys))
                for proxy in proxys:
                    exist = sqlHelper.selectOne({
                        'ip': proxy['ip'],
                        'port': proxy['port'],
                        'type': proxy['type'],
                    })
                    if exist:
                        # Already stored: only refresh its timestamp.
                        sqlHelper.update(exist, {'$set': {'updatetime': datetime.datetime.now()}})
                    else:
                        # NOTE(review): this calls update() for a record that
                        # was not found, and filters on ip/port only while the
                        # lookup above also used 'type'. Presumably the helper
                        # upserts - confirm against MongoHelper.update.
                        sqlHelper.update({'ip': proxy['ip'], 'port': proxy['port']}, proxy)
                print('success ip = %s' % sqlHelper.selectCount())
            print('spider end -------')
            time.sleep(UPDATE_TIME)
Пример #2
0
    def run(self):
        """Run the crawl/validate/store cycle forever.

        Every cycle opens a fresh SqliteHelper, calls ``validator.run_db()``
        and compares the first element of its result with MINNUM. When that
        value is below MINNUM, every entry of ``parserList`` is crawled
        through ``self.crawl_pool``, the results are flattened,
        de-duplicated, passed through ``validator.run_list()`` and
        batch-inserted. The helper is closed at the end of every cycle, then
        the loop sleeps for UPDATE_TIME seconds; this method never returns.
        """
        while True:
            logger.info("Start to run spider")
            sqlHelper = SqliteHelper()
            try:
                logger.info('Start to run validator')
                validator = Validator(sqlHelper)
                count = validator.run_db()
                # run_db()'s result is indexed below, so log the same element
                # that the threshold check uses.
                valid_count = count[0]
                logger.info('Finished to run validator, count=%s', valid_count)
                if valid_count < MINNUM:
                    # map() returns one list of proxy dicts per parser:
                    # [[{}, {}], [{}, {}]] -> flatten to [{}, {}, {}, {}].
                    crawled = self.crawl_pool.map(self.crawl, parserList)
                    proxys = [proxy for batch in crawled for proxy in batch]
                    logger.info('first_proxys: %s', len(proxys))

                    # De-duplicate. Items are sorted so that two equal dicts
                    # map to the same tuple regardless of iteration order.
                    unique = set(tuple(sorted(proxy.items())) for proxy in proxys)
                    proxys = [dict(items) for items in unique]
                    logger.info('end_proxy: %s', len(proxys))
                    logger.info('spider proxys: %s', type(proxys))

                    # run_list() returns the proxies after checking.
                    # NOTE(review): its output is stored unfiltered; confirm
                    # batch_insert tolerates any placeholder entries.
                    proxys = validator.run_list(proxys)
                    sqlHelper.batch_insert(sqlHelper.tableName, proxys)

                    logger.info('success ip: %s', sqlHelper.selectCount())
            finally:
                # Previously close() ran only inside the branch above, so the
                # helper opened for this cycle leaked whenever enough proxies
                # were already stored or an exception was raised.
                sqlHelper.close()
            logger.info('Finished to run spider')
            time.sleep(UPDATE_TIME)
Пример #3
0
    def run(self):
        """Run the crawl/validate/store cycle forever.

        Every cycle opens a fresh SqliteHelper, calls ``validator.run_db()``
        and compares the first element of its result with MINNUM. When that
        value is below MINNUM, every entry of ``parserList`` is crawled
        through ``self.crawl_pool``, the results are flattened,
        de-duplicated, passed through ``validator.run_list()`` and
        batch-inserted. The helper is closed at the end of every cycle, then
        the loop sleeps for UPDATE_TIME seconds; this method never returns.
        """
        while True:
            print('spider beginning -------')
            sqlHelper = SqliteHelper()
            try:
                print('validator beginning -------')
                validator = Validator(sqlHelper)
                count = validator.run_db()
                # run_db()'s result is indexed below, so print the same
                # element that the threshold check uses.
                valid_count = count[0]
                print('validator end ----count=%s' % valid_count)
                if valid_count < MINNUM:
                    # map() returns one list of proxy dicts per parser:
                    # [[{}, {}], [{}, {}]] -> flatten to [{}, {}, {}, {}].
                    crawled = self.crawl_pool.map(self.crawl, parserList)
                    proxys = [proxy for batch in crawled for proxy in batch]
                    # Use %-formatting: "print 'fmt', value" printed the
                    # literal "%s" followed by the value.
                    print('first_proxys--%s' % len(proxys))

                    # De-duplicate. Items are sorted so that two equal dicts
                    # map to the same tuple regardless of iteration order.
                    unique = set(tuple(sorted(proxy.items())) for proxy in proxys)
                    proxys = [dict(items) for items in unique]
                    print('end_proxys--%s' % len(proxys))
                    print('spider proxys -------%s' % type(proxys))

                    # run_list() returns the proxies after checking.
                    proxys = validator.run_list(proxys)
                    sqlHelper.batch_insert(sqlHelper.tableName, proxys)

                    print('success ip =%s' % sqlHelper.selectCount())
            finally:
                # Previously close() ran only inside the branch above, so the
                # helper opened for this cycle leaked whenever enough proxies
                # were already stored or an exception was raised.
                sqlHelper.close()
            print('spider end -------')
            time.sleep(UPDATE_TIME)