def run(self):
    """Seed start urls for the configured website.

    Reads collector/website from config; 'all' seeds every supported
    site (in the original fixed order), otherwise only the site whose
    Constance key matches is seeded.
    """
    website = tools.getConfValue("collector", "website")
    # Table of (site key, seeding method) in the original execution order.
    handlers = [
        (Constance.YOUKU, self.addYoukuUrl),
        (Constance.TENCENT, self.addTencentUrl),
        (Constance.WANG_YI, self.addWangYiUrl),
        (Constance.PPTV, self.addPPTVUrl),
        (Constance.CCTV, self.addCCTVUrl),
        (Constance.KAN_KAN, self.addKanKanUrl),
        (Constance.TUDOU, self.addTouDouUrl),
        (Constance.V1, self.addV1Url),
        (Constance.KU6, self.addKu6Url),
    ]
    for domain, addUrl in handlers:
        if website == 'all' or website == domain:
            addUrl()
def __inputData(self):
    """Pull one batch of TODO urls from MongoDB into the shared buffer.

    Skips the fetch while the buffer already exceeds collector/max_size.
    Fetched urls are marked DOING so other rounds skip them; when the
    buffer has been empty too many consecutive rounds (isAllHaveDone)
    the collector thread is stopped.
    """
    if len(Collector._urls) > int(
            tools.getConfValue("collector", "max_size")):
        return
    mylock.acquire()  # guard the shared url buffer
    try:
        depth = int(tools.getConfValue("collector", "depth"))
        urlCount = int(tools.getConfValue("collector", "url_count"))
        # Debug mode pins the query to the fixed DEPTH; normal mode takes
        # anything at or shallower than the configured depth.
        if DEBUG:
            condition = {"status": Constance.TODO, "depth": DEPTH}
        else:
            condition = {"status": Constance.TODO, "depth": {"$lte": depth}}
        projection = {"url": 1, "_id": 0, "depth": 1, "description": 1}
        # sort: 1 ascending, -1 descending
        urlsList = list(Collector._db.urls.find(condition, projection)
                        .sort([("depth", 1)]).limit(urlCount))
        Collector._urls.extend(urlsList)
        # Mark the fetched urls as in-progress.
        for url in urlsList:
            Collector._db.urls.update(url,
                                      {'$set': {'status': Constance.DOING}})
        if self.isAllHaveDone():
            self.stop()
    finally:
        # BUGFIX: release in finally so an exception (e.g. a Mongo error)
        # cannot leave the lock held forever and deadlock every consumer.
        mylock.release()
def isAllHaveDone(self):
    """Return True once the url buffer has stayed empty for
    collector/allowed_null_times consecutive checks; any non-empty
    check resets the streak."""
    limit = int(
        tools.getConfValue("collector", 'allowed_null_times'))
    if Collector._urls:
        # Work showed up again: restart the empty-streak counter.
        Collector._nullTimes = 0
        return False
    Collector._nullTimes += 1
    return Collector._nullTimes >= limit
def run(self):
    """Register the per-site seed functions, then invoke every function
    matching the configured website ('all' runs all of them)."""
    website = tools.getConfValue("collector", "website")
    self.registUrlFunc()
    # Each registered entry pairs a seeding callable with its domain key.
    for entry in AddRootUrl._addUrlFuncs:
        addWebUrl, domain = entry[0], entry[1]
        if website == 'all' or website == domain:
            addWebUrl()
import sys
sys.path.append("..")
import utils.tools as tools
from utils.log import log
from base.collector import Collector
from base.root_url import AddRootUrl
from html_parser.parser_control import PaserControl


def init():
    """Create the unique indexes the crawler relies on for url dedup."""
    db = tools.getConnectedDB()
    # Unique index so duplicate urls are rejected at insert time.
    # BUGFIX: ensure_index is deprecated and removed in pymongo 3.x;
    # create_index is idempotent and available in both 2.x and 3.x.
    db.urls.create_index('url', unique=True)
    db.text_info.create_index('url', unique=True)


if __name__ == '__main__':
    log.info("--------begin--------")
    init()

    # Seed the root urls.
    addRootUrl = AddRootUrl()
    addRootUrl.start()

    # The collector keeps the shared url buffer filled.
    coll = Collector()
    coll.start()

    # Spin up the configured number of parser worker threads.
    paserCount = int(tools.getConfValue("html_parser", "parser_count"))
    while paserCount:
        paser = PaserControl()
        paser.start()
        paserCount = paserCount - 1
def __init__(self):
    """Initialize a parser worker thread and cache its config values."""
    super(PaserControl, self).__init__()
    # Shared url source (Collector derives from Singleton elsewhere in
    # this project, so every parser presumably shares one instance).
    self._collector = Collector()
    # Batch size pulled per parse round (from html_parser/url_count).
    self._urlCount = int(tools.getConfValue("html_parser", "url_count"))
    # Seconds to sleep between rounds (from html_parser/sleep_time).
    self._interval = int(tools.getConfValue("html_parser", "sleep_time"))
class Collector(threading.Thread, Singleton):
    """Singleton thread feeding a fixed-size ring buffer of pending urls.

    __inputData() periodically pulls TODO urls from MongoDB into the
    ring (_urls); parser threads drain it through getUrls().
    _readPos/_writePos index the ring and both start at -1 (empty);
    equal positions mean the ring is completely empty.
    """
    _db = tools.getConnectedDB()
    _threadStop = False
    _urls = []       # ring-buffer storage; the list grows up to _maxSize
    _nullTimes = 0   # consecutive rounds the ring stayed empty
    _readPos = -1    # index of the last slot read
    _writePos = -1   # index of the last slot written
    _maxSize = int(tools.getConfValue("collector", "max_size"))
    _interval = int(tools.getConfValue("collector", "sleep_time"))
    _allowedNullTimes = int(
        tools.getConfValue("collector", 'allowed_null_times'))
    _website = tools.getConfValue("collector", "website")
    _depth = int(tools.getConfValue("collector", "depth"))
    _urlCount = int(tools.getConfValue("collector", "url_count"))

    # On startup, reset in-progress tasks back to TODO (currently disabled).
    beginTime = time.time()
    # _db.urls.update({'status':Constance.DOING}, {'$set':{'status':Constance.TODO}}, multi=True)
    endTime = time.time()
    log.debug('update url time' + str(endTime - beginTime))

    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        super(Collector, self).__init__()

    def run(self):
        # Keep refilling the ring until stop() flips the flag.
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        Collector._threadStop = True

    @tools.log_function_time
    def __inputData(self):
        """Fetch up to one batch of TODO urls and write them into the ring."""
        log.debug('buffer size %d' % self.getMaxReadSize())
        log.debug('buffer can write size = %d' % self.getMaxWriteSize())
        if self.getMaxWriteSize() == 0:
            log.debug("collector 已满 size = %d" % self.getMaxReadSize())
            return

        beginTime = time.time()
        # Never fetch more than the ring can currently hold.
        urlCount = Collector._urlCount if Collector._urlCount <= \
            self.getMaxWriteSize() else self.getMaxWriteSize()
        # Build the query: debug mode pins the depth; otherwise take
        # anything at or shallower than the configured depth, optionally
        # restricted to a single website.
        if DEBUG:
            condition = {"status": Constance.TODO, "depth": DEPTH}
        elif Collector._website == 'all':
            condition = {
                "status": Constance.TODO,
                "depth": {"$lte": Collector._depth}
            }
        else:
            websiteId = tools.getWebsiteId(Collector._website)
            condition = {
                "status": Constance.TODO,
                "website_id": websiteId,
                "depth": {"$lte": Collector._depth}
            }
        projection = {
            "url": 1, "_id": 0, "depth": 1,
            "description": 1, "website_id": 1
        }
        # sort: 1 ascending, -1 descending
        urlsList = list(Collector._db.urls.find(condition, projection)
                        .sort([("depth", 1)]).limit(urlCount))
        endTime = time.time()
        log.debug('get url time ' + str(endTime - beginTime) + " size " +
                  str(len(urlsList)))

        # Store into the ring buffer.
        self.putUrls(urlsList)

        # Mark the fetched urls as in-progress so later rounds skip them.
        beginTime = time.time()
        for url in urlsList:
            Collector._db.urls.update(url,
                                      {'$set': {'status': Constance.DOING}})
        endTime = time.time()
        log.debug('update url time ' + str(endTime - beginTime))

        if self.isAllHaveDone():
            self.stop()
            exportData.export()

    def isFinished(self):
        return Collector._threadStop

    def isAllHaveDone(self):
        """Count consecutive empty rounds; True once the limit is hit.

        BUGFIX: the old check `Collector._urls == []` could never be
        true after the first write, because the ring-buffer list never
        shrinks back to []; emptiness is defined by the read/write
        positions, i.e. getMaxReadSize() == 0.
        """
        if self.getMaxReadSize() == 0:
            Collector._nullTimes += 1
            if Collector._nullTimes >= Collector._allowedNullTimes:
                return True
        else:
            Collector._nullTimes = 0
        return False

    def getMaxWriteSize(self):
        """Free slots in the ring (equal positions mean completely empty)."""
        if Collector._readPos == Collector._writePos:
            size = Collector._maxSize
        elif Collector._readPos < Collector._writePos:
            size = Collector._maxSize - (Collector._writePos -
                                         Collector._readPos)
        else:
            size = Collector._readPos - Collector._writePos
        return size

    def getMaxReadSize(self):
        """Number of urls currently stored in the ring."""
        return Collector._maxSize - self.getMaxWriteSize()

    def putUrls(self, urlsList):
        """Append urlsList into the ring, wrapping past _maxSize to the front."""
        urlCount = len(urlsList)
        endPos = urlCount + Collector._writePos + 1
        # Portion that spills past the end must wrap to the front.
        overflowEndPos = endPos - Collector._maxSize
        inPos = endPos if endPos <= Collector._maxSize else Collector._maxSize
        # Number of items that fit before the wrap point.
        urlsListCutPos = inPos - Collector._writePos - 1

        beginTime = time.time()
        mylock.acquire()  # guard ring storage and the write position
        try:
            Collector._urls[Collector._writePos +
                            1:inPos] = urlsList[:urlsListCutPos]
            if overflowEndPos > 0:
                Collector._urls[:overflowEndPos] = urlsList[urlsListCutPos:]
            # BUGFIX: advance the write position while still holding the
            # lock (it was updated after release), so concurrent readers
            # never observe a half-updated ring.
            Collector._writePos += urlCount
            Collector._writePos %= Collector._maxSize
        finally:
            # BUGFIX: release in finally so an exception cannot leave the
            # lock held forever.
            mylock.release()
        log.debug('put url time ' + str(time.time() - beginTime) + " size " +
                  str(len(urlsList)))

    @tools.log_function_time
    def getUrls(self, count):
        """Pop up to `count` urls from the ring in FIFO order."""
        mylock.acquire()  # guard ring storage and the read position
        try:
            urls = []
            count = count if count <= self.getMaxReadSize() \
                else self.getMaxReadSize()
            endPos = Collector._readPos + count + 1
            if endPos > Collector._maxSize:
                # Wrap-around read: tail of the ring, then its head.
                urls.extend(Collector._urls[Collector._readPos + 1:])
                urls.extend(Collector._urls[:endPos % Collector._maxSize])
            else:
                urls.extend(Collector._urls[Collector._readPos + 1:endPos])
            Collector._readPos += len(urls)
            Collector._readPos %= Collector._maxSize
        finally:
            mylock.release()
        return urls
def addHeadlinesListUrl(self):
    """Seed headlines (is.snssdk.com) feed-list urls for every time slice.

    Walks backwards from now toward the configured
    content/content_released_time in steps of content/time_interval,
    and for each step enqueues one 'news_local' feed url and one
    'video' feed url built from the same base parameters.
    """
    baseUrl = 'http://is.snssdk.com/api/news/feed/v46/'
    # Captured mobile-client query parameters; the device/session fields
    # stay fixed, while category / max_behot_time /
    # last_refresh_sub_entrance_interval are rewritten per request below.
    params = {
        'version_code': '5.8.6',
        'app_name': 'news_article',
        'vid': 'B0DB5DD0-FF94-4773-85B1-EFC11132C2A4',
        'device_id': '34633749953',
        'channel': 'App Store',
        'resolution': '1242*2208',
        'aid': 13,
        'ab_version': '91796,89593,83095,89184,87331,93903,94158,94056,93418,93085,92848,93981,31210,94178,93319,94042,92438,93526,93357,94163,94003,92487,87496,93887,87988',
        'ab_feature': 'z1',
        'build_version': '5.9.0.5',
        'openudid': '7064ff7d773ef8efeb5d6a25f62cd3d85035674f',
        'live_sdk_version': '1.3.0',
        'idfv': 'B0DB5DD0-FF94-4773-85B1-EFC11132C2A4',
        'ac': 'WIFI',
        'os_version': '10.1.1',
        'ssmix': 'a',
        'device_platform': 'iphone',
        'iid': 6542551421,
        'ab_client': 'a1,f2,f7,e1',
        'device_type': 'iPhone9,2',
        'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
        'LBS_status': 'deny',
        'category': 'news_local',
        'city': '',
        'concern_id': '',
        'count': 20,
        'cp': '548e4d7f7b1BCq1',
        'detail': 1,
        'image': 1,
        'language': 'zh-Hans-CN',
        'last_refresh_sub_entrance_interval': 1482077184,
        'loc_mode': 0,
        'max_behot_time': 1481063762,
        'refer': 1,
        'strict': 0,
        'tt_from': 'load_more',
        'user_city': '泸州'
    }
    contentReleasedTime = tools.getConfValue('content',
                                             'content_released_time')
    timeInterval = int(tools.getConfValue('content', 'time_interval'))
    # Convert the configured earliest release date into a timestamp.
    contentReleasedTime = tools.dateToTimestamp(contentReleasedTime)
    currentTimestamp = tools.getCurrentTimestamp()
    maxBehotTime = currentTimestamp
    # Step backwards in time until the configured earliest release time,
    # enqueueing one pair of feed urls per slice.
    while maxBehotTime >= contentReleasedTime:
        maxBehotTime -= timeInterval
        # Jitter the refresh timestamp by 1-5 minutes per iteration.
        currentTimestamp = currentTimestamp + random.randint(60, 300)

        # Local (Luzhou) news articles.
        params['category'] = 'news_local'
        params[
            'last_refresh_sub_entrance_interval'] = currentTimestamp  # + random.randint(60, 300)
        params['max_behot_time'] = maxBehotTime
        url = tools.jointUrl(baseUrl, params)
        self.addUrl(url, Constance.NEWS_LOCAL)

        # Video channel.
        params['category'] = 'video'
        params[
            'last_refresh_sub_entrance_interval'] = currentTimestamp  # + random.randint(60, 300)
        params['max_behot_time'] = maxBehotTime
        url = tools.jointUrl(baseUrl, params)
        self.addUrl(url, Constance.VIDEO)
class Collector(threading.Thread, Singleton):
    """Singleton thread that keeps a shared list of pending urls filled
    from MongoDB; parser threads drain it via getUrls()."""
    _db = tools.getConnectedDB()
    _threadStop = False
    _urls = []  # shared FIFO of pending url documents
    _interval = int(tools.getConfValue("collector", "sleep_time"))

    # On startup, reset any in-progress tasks back to TODO.
    _db.urls.update({'status': Constance.DOING},
                    {'$set': {
                        'status': Constance.TODO
                    }},
                    multi=True)

    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        super(Collector, self).__init__()

    def run(self):
        # Keep refilling the buffer until stop() flips the flag.
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        # BUGFIX: this assigned False, which left the run() loop spinning
        # forever; the flag must be set True for the thread to exit
        # (matching the other Collector implementations in this project).
        Collector._threadStop = True

    def __inputData(self):
        """Fetch one batch of TODO urls into _urls and mark them DOING."""
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            return
        mylock.acquire()  # guard the shared url list
        try:
            website = tools.getConfValue("collector", "website")
            depth = int(tools.getConfValue("collector", "depth"))
            urlCount = int(tools.getConfValue("collector", "url_count"))
            # Debug mode pins the depth; otherwise take anything at or
            # shallower than the configured depth, optionally restricted
            # to a single website.
            if DEBUG:
                condition = {"status": Constance.TODO, "depth": DEPTH}
            elif website == 'all':
                condition = {
                    "status": Constance.TODO,
                    "depth": {"$lte": depth}
                }
            else:
                condition = {
                    "status": Constance.TODO,
                    "website_id": tools.getWebsiteId(website),
                    "depth": {"$lte": depth}
                }
            projection = {
                "url": 1, "_id": 0, "depth": 1,
                "description": 1, "website_id": 1
            }
            # sort: 1 ascending, -1 descending
            urlsList = list(Collector._db.urls.find(condition, projection)
                            .sort([("depth", 1)]).limit(urlCount))
            Collector._urls.extend(urlsList)
            # Mark the fetched urls as in-progress.
            for url in urlsList:
                Collector._db.urls.update(
                    url, {'$set': {'status': Constance.DOING}})
        finally:
            # Release in finally so an exception cannot leave the lock held.
            mylock.release()

    def getUrls(self, count):
        """Pop and return up to `count` urls from the head of the list."""
        mylock.acquire()  # guard the shared url list
        try:
            urls = Collector._urls[:count]
            del Collector._urls[:count]
        finally:
            mylock.release()
        return urls
def __inputData(self):
    """Fetch one batch of TODO urls into the shared buffer and mark
    them DOING; once the buffer has stayed empty for too many rounds
    (isAllHaveDone) the collector is stopped and results exported.
    """
    if len(Collector._urls) > int(
            tools.getConfValue("collector", "max_size")):
        log.debug("collector 已满 size = %d" % len(Collector._urls))
        return
    mylock.acquire()  # guard the shared url list
    try:
        website = tools.getConfValue("collector", "website")
        depth = int(tools.getConfValue("collector", "depth"))
        urlCount = int(tools.getConfValue("collector", "url_count"))
        beginTime = time.time()
        # Debug mode pins the depth; otherwise take anything at or
        # shallower than the configured depth, optionally restricted to
        # a single website.
        if DEBUG:
            condition = {"status": Constance.TODO, "depth": DEPTH}
        elif website == 'all':
            condition = {
                "status": Constance.TODO,
                "depth": {"$lte": depth}
            }
        else:
            condition = {
                "status": Constance.TODO,
                "website_id": tools.getWebsiteId(website),
                "depth": {"$lte": depth}
            }
        projection = {
            "url": 1, "_id": 0, "depth": 1,
            "description": 1, "website_id": 1
        }
        # sort: 1 ascending, -1 descending
        urlsList = list(Collector._db.urls.find(condition, projection)
                        .sort([("depth", 1)]).limit(urlCount))
        endTime = time.time()
        log.debug('get url time ' + str(endTime - beginTime) + " size " +
                  str(len(urlsList)))

        beginTime = time.time()
        Collector._urls.extend(urlsList)
        log.debug('put get url time ' + str(time.time() - beginTime) +
                  " size " + str(len(urlsList)))

        # Mark the fetched urls as in-progress.
        beginTime = time.time()
        for url in urlsList:
            Collector._db.urls.update(url,
                                      {'$set': {'status': Constance.DOING}})
        endTime = time.time()
        log.debug('update url time ' + str(endTime - beginTime))

        if self.isAllHaveDone():
            self.stop()
            exportData.export()
    finally:
        # BUGFIX: release in finally so an exception (e.g. a Mongo error)
        # cannot leave the lock held forever and deadlock every consumer.
        mylock.release()
class Collector(threading.Thread, Singleton):
    """Singleton thread that keeps a shared list of pending urls filled
    from MongoDB; parser threads drain it via getUrls().  Stops itself
    (and triggers an export) once the buffer has stayed empty for
    allowed_null_times consecutive rounds."""
    _db = tools.getConnectedDB()
    _threadStop = False
    _urls = []       # shared FIFO of pending url documents
    _nullTimes = 0   # consecutive rounds the buffer stayed empty
    _interval = int(tools.getConfValue("collector", "sleep_time"))

    # On startup, reset any in-progress tasks back to TODO.
    beginTime = time.time()
    _db.urls.update({'status': Constance.DOING},
                    {'$set': {
                        'status': Constance.TODO
                    }},
                    multi=True)
    endTime = time.time()
    log.debug('update url time' + str(endTime - beginTime))

    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        super(Collector, self).__init__()

    def run(self):
        # Keep refilling the buffer until stop() flips the flag.
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        Collector._threadStop = True

    @tools.log_function_time
    def __inputData(self):
        """Fetch one batch of TODO urls into _urls and mark them DOING."""
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            log.debug("collector 已满 size = %d" % len(Collector._urls))
            return
        mylock.acquire()  # guard the shared url list
        try:
            website = tools.getConfValue("collector", "website")
            depth = int(tools.getConfValue("collector", "depth"))
            urlCount = int(tools.getConfValue("collector", "url_count"))
            beginTime = time.time()
            # Debug mode pins the depth; otherwise take anything at or
            # shallower than the configured depth, optionally restricted
            # to a single website.
            if DEBUG:
                condition = {"status": Constance.TODO, "depth": DEPTH}
            elif website == 'all':
                condition = {
                    "status": Constance.TODO,
                    "depth": {"$lte": depth}
                }
            else:
                condition = {
                    "status": Constance.TODO,
                    "website_id": tools.getWebsiteId(website),
                    "depth": {"$lte": depth}
                }
            projection = {
                "url": 1, "_id": 0, "depth": 1,
                "description": 1, "website_id": 1
            }
            # sort: 1 ascending, -1 descending
            urlsList = list(Collector._db.urls.find(condition, projection)
                            .sort([("depth", 1)]).limit(urlCount))
            endTime = time.time()
            log.debug('get url time ' + str(endTime - beginTime) + " size " +
                      str(len(urlsList)))

            beginTime = time.time()
            Collector._urls.extend(urlsList)
            log.debug('put get url time ' + str(time.time() - beginTime) +
                      " size " + str(len(urlsList)))

            # Mark the fetched urls as in-progress.
            beginTime = time.time()
            for url in urlsList:
                Collector._db.urls.update(
                    url, {'$set': {'status': Constance.DOING}})
            endTime = time.time()
            log.debug('update url time ' + str(endTime - beginTime))

            if self.isAllHaveDone():
                self.stop()
                exportData.export()
        finally:
            # BUGFIX: release in finally so an exception (e.g. a Mongo
            # error) cannot leave the lock held forever.
            mylock.release()

    def isFinished(self):
        return Collector._threadStop

    def isAllHaveDone(self):
        """True once _urls has been empty allowed_null_times times in a row."""
        allowedNullTimes = int(
            tools.getConfValue("collector", 'allowed_null_times'))
        if Collector._urls == []:
            Collector._nullTimes += 1
            if Collector._nullTimes >= allowedNullTimes:
                return True
        else:
            Collector._nullTimes = 0
        return False

    @tools.log_function_time
    def getUrls(self, count):
        """Pop and return up to `count` urls from the head of the list."""
        mylock.acquire()  # guard the shared url list
        try:
            urls = Collector._urls[:count]
            del Collector._urls[:count]
        finally:
            mylock.release()
        return urls