def run(self): # dbtype = {'Mongo': MongoHelper, 'Sqlite': SqliteHelper} while True: print 'spider beginning -------' # sqlHelper = SqliteHelper() sqlHelper = MongoHelper() print 'validator beginning -------' validator = Validator(sqlHelper) count = validator.run_db() # count = sqlHelper.selectCount() print 'validator end ----count=%s' % count if count < MINNUM: proxys = self.crawl_pool.map(self.crawl, parserList) # 这个时候proxys的格式是[[{},{},{}],[{},{},{}]] proxys_tmp = [] for proxy in proxys: proxys_tmp.extend(proxy) proxys = proxys_tmp print 'first_proxys--%s', len(proxys) # 这个时候proxys的格式是[{},{},{},{},{},{}] # 这个时候开始去重: proxys = [dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])] print 'spider proxys -------%s' % type(proxys) proxys = validator.run_list(proxys) # 这个是检测后的ip地址 proxys = [value for value in proxys if value is not None] print 'end_proxys--%s', len(proxys) for proxy in proxys: exist = sqlHelper.selectOne({'ip': proxy['ip'], 'port': proxy['port'], 'type': proxy['type']}) if exist: sqlHelper.update(exist, {'$set': {'updatetime': datetime.datetime.now()}}) else: sqlHelper.update({'ip': proxy['ip'], 'port': proxy['port']}, proxy) print 'success ip = %s' % sqlHelper.selectCount() print 'spider end -------' time.sleep(UPDATE_TIME)
def run(self):
    """Main spider loop (SQLite backend, logger output).

    Forever: validate stored proxies; when fewer than MINNUM remain,
    crawl all parsers in parallel, flatten and dedupe the results,
    re-validate them, and batch-insert the survivors. Sleeps
    UPDATE_TIME seconds between iterations.
    """
    while True:
        logger.info("Start to run spider")
        sqlHelper = SqliteHelper()
        logger.info('Start to run validator')
        validator = Validator(sqlHelper)
        # run_db() returns a row tuple; count[0] is the proxy count.
        count = validator.run_db()
        # IDIOM: lazy %-args instead of eager '%s' % x -- the message
        # is only rendered if the INFO level is enabled.
        logger.info('Finished to run validator, count=%s', count)
        if count[0] < MINNUM:
            # crawl_pool.map returns one list of proxy dicts per parser:
            # [[{}, {}, {}], [{}, {}, {}], ...] -- flatten to one list.
            proxys = self.crawl_pool.map(self.crawl, parserList)
            proxys_tmp = []
            for proxy in proxys:
                proxys_tmp.extend(proxy)
            proxys = proxys_tmp
            logger.info('first_proxys: %s', len(proxys))
            # Dedupe: dicts are unhashable, so round-trip through
            # tuples of their items.
            proxys = [dict(t) for t in set(tuple(proxy.items()) for proxy in proxys)]
            logger.info('end_proxy: %s', len(proxys))
            logger.info('spider proxys: %s', type(proxys))
            # Re-check each crawled proxy before storing it.
            proxys = validator.run_list(proxys)
            sqlHelper.batch_insert(sqlHelper.tableName, proxys)
            logger.info('success ip: %s', sqlHelper.selectCount())
        # Close the per-iteration connection whether or not we crawled.
        sqlHelper.close()
        logger.info('Finished to run spider')
        time.sleep(UPDATE_TIME)
def run(self): while True: print 'spider beginning -------' sqlHelper = SqliteHelper() print 'validator beginning -------' validator = Validator(sqlHelper) count = validator.run_db() print 'validator end ----count=%s'%count if count[0]< MINNUM: proxys = self.crawl_pool.map(self.crawl,parserList) #这个时候proxys的格式是[[{},{},{}],[{},{},{}]] # print proxys #这个时候应该去重: proxys_tmp = [] for proxy in proxys: proxys_tmp.extend(proxy) proxys = proxys_tmp print 'first_proxys--%s',len(proxys) #这个时候proxys的格式是[{},{},{},{},{},{}] proxys_tmp=None #这个时候开始去重: proxys = [dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])] print 'end_proxys--%s',len(proxys) print 'spider proxys -------%s'%type(proxys) proxys = validator.run_list(proxys)#这个是检测后的ip地址 sqlHelper.batch_insert(sqlHelper.tableName,proxys) print 'success ip =%s'%sqlHelper.selectCount() sqlHelper.close() print 'spider end -------' time.sleep(UPDATE_TIME)