def isUuidHandled(uuid):
    """Tell whether ``uuid`` is listed in the stored "uuids" item.

    A falsy ``uuid`` (``None``, empty string, ...) is reported as not handled
    without reading the store.

    Returns:
        True when ``uuid`` is contained in the value read from the "uuids"
        key of the "RunStatus" model, False otherwise.
    """
    if not uuid:
        return False
    handled = cmapi.getItemValue("uuids", [], modelname="RunStatus")
    return uuid in handled
def archiveData(key, datasources):
    """Add ``datasources`` to the archive item stored under ``key``.

    The current value is read from the DatasourceArchive model. A non-empty
    value is extended in place with ``datasources``; an empty one is replaced
    by ``datasources`` itself. The result is saved back under ``key``.
    """
    archived = cmapi.getItemValue(key, [], modelname=DatasourceArchive)
    if not archived:
        archived = datasources
    else:
        archived.extend(datasources)
    cmapi.saveItem(key, archived, modelname=DatasourceArchive)
def updateUuids(uuid):
    """Put ``uuid`` at the front of the stored "uuids" list.

    The list is read from the "uuids" key of the "RunStatus" model, ``uuid``
    is inserted at index 0 and only the first 100 entries are saved back.
    A falsy ``uuid`` is ignored and nothing is read or written.
    """
    if not uuid:
        return
    maxItemCount = 100
    recent = cmapi.getItemValue("uuids", [], modelname="RunStatus")
    recent.insert(0, uuid)
    cmapi.saveItem("uuids", recent[:maxItemCount], modelname="RunStatus")
def getPages(datasources=None, keyname=None):
    """Flatten the pages of several datasources into a single list.

    Args:
        datasources: iterable of dicts, each with a 'source' entry and a
            'pages' list. Ignored when ``keyname`` is given.
        keyname: when truthy, the datasources are read from the LatestItem
            model under this key instead of using ``datasources``.

    Returns:
        A list with every page of every datasource, in order. Each page dict
        is modified in place: its 'source' key is set to the 'source' of the
        datasource it belongs to. Returns an empty list when there are no
        datasources at all (including when neither argument is supplied).
    """
    if keyname:
        datasources = cmapi.getItemValue(keyname, [], modelname=LatestItem)
    pages = []
    # ``datasources`` defaults to None; treat that like "no datasources"
    # instead of failing when trying to iterate over None.
    for datasource in datasources or []:
        for childPage in datasource['pages']:
            childPage['source'] = datasource['source']
            pages.append(childPage)
    return pages
def getWordsConfig():
    """Read the 'words' item and fill in any missing settings.

    Keys that are absent from the stored value are added with these values:
    'stop.patterns' -> [], 'similar' -> {'0': 6}, 'hours.all' -> 24,
    'hours.latest' -> 4, 'psegs' -> []. Keys already present are left as is.

    Returns:
        The (possibly extended) value read from the store.
    """
    config = cmapi.getItemValue('words', {})
    fallbacks = (
        ('stop.patterns', []),
        ('similar', {'0': 6}),
        ('hours.all', 24),
        ('hours.latest', 4),
        # A previous default of ['n', 'ns', 'nr', 'eng'] was disabled in
        # favour of an empty list.
        ('psegs', []),
    )
    for name, fallback in fallbacks:
        if name not in config:
            config[name] = fallback
    return config
def getUrlAdded(url, added):
    """Record a sighting of ``url`` and return its stored 'added' value.

    The entry list is read from the "RunStatus" model under the key given by
    ``_getKeyname()``. If ``_getItem`` finds an entry for ``url`` its 'count'
    is incremented; otherwise a new entry with count 1 and the given
    ``added`` value is appended. The entry's 'updated' stamp is refreshed,
    entries whose 'updated' is not greater than ``dateutil.getHoursAs14(24)``
    are dropped, and the remaining list is saved.

    Returns:
        The 'added' value of the entry for ``url``: the stored one for a
        known URL, or ``added`` for a new one.
    """
    entries = cmapi.getItemValue(_getKeyname(), [], modelname='RunStatus')
    entry = _getItem(entries, url)
    if not entry:
        entry = {'count': 1, 'url': url, 'added': added}
        entries.append(entry)
    else:
        entry['count'] += 1
    entry['updated'] = dateutil.getDateAs14(datetime.datetime.utcnow())

    cutoff14 = dateutil.getHoursAs14(24)
    fresh = [candidate for candidate in entries
             if candidate['updated'] > cutoff14]
    cmapi.saveItem(_getKeyname(), fresh, modelname='RunStatus')
    return entry['added']
def increaseIncomingBandwidth(bytes):
    """Add one fetch of ``bytes`` bytes to the stored bandwidth counters.

    The 'inbandwidth' item of the "RunStatus" model holds:
      * 'all': running totals ('start', 'bytes', 'fetch'),
      * 'tz': timezone name used to build the day key ('US/Pacific' when
        unset),
      * 'current': counters for the current day key ('%Y%m%d' in that
        timezone),
      * 'history': previous 'current' records, newest first, cut to
        'historycount' entries (7 when unset).

    When the day key changes, the old 'current' record is pushed onto
    'history' and a new one is started with this fetch. The item is saved
    back on every call.
    """
    storeKey = 'inbandwidth'
    stats = cmapi.getItemValue(storeKey, {}, modelname='RunStatus')

    # Running totals since the 'start' stamp.
    totals = stats.get('all')
    if not totals:
        totals = {'start': dateutil.getDateAs14(datetime.datetime.utcnow())}
        stats['all'] = totals
    totals['bytes'] = totals.get('bytes', 0) + bytes
    totals['fetch'] = totals.get('fetch', 0) + 1

    # Day key in the configured timezone.
    tzName = stats.get('tz') or 'US/Pacific'
    stats['tz'] = tzName
    utcNow = datetime.datetime.now(tz=pytz.utc)
    dayKey = utcNow.astimezone(pytz.timezone(tzName)).strftime('%Y%m%d')

    today = stats.get('current')
    if today and today.get('key') == dayKey:
        # Same day: just accumulate.
        today['fetch'] += 1
        today['bytes'] += bytes
    else:
        keepCount = stats.get('historycount') or 7
        stats['historycount'] = keepCount
        if today:
            # Day rolled over: archive the finished record, newest first.
            past = stats.get('history') or []
            past.insert(0, today)
            stats['history'] = past[:keepCount]
        stats['current'] = {'key': dayKey, 'bytes': bytes, 'fetch': 1}

    cmapi.saveItem(storeKey, stats, modelname='RunStatus')
def savePageHistory(url):
    """Count a visit of ``url`` in the stored 'page.history' list.

    The first entry whose 'url' equals ``url`` gets its 'count' incremented;
    if there is none, a new entry with count 1 is appended. The entry's
    'updated' stamp is set to the current UTC time ('%Y%m%d%H%M%S'). The list
    is then ordered by 'count' descending, ties broken by 'updated'
    descending, and cut down to the first 200 entries once it holds more
    than 1000, before being saved to the "RunStatus" model.
    """
    maxCount = 1000
    resetCount = 200

    history = cmapi.getItemValue('page.history', [], modelname='RunStatus')
    entry = next((page for page in history if page.get('url') == url), None)
    if entry:
        entry['count'] += 1
    else:
        entry = {'count': 1, 'url': url}
        history.append(entry)
    entry['updated'] = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')

    # One stable sort on (count, updated) gives the same order as sorting by
    # 'updated' first and then by 'count'.
    history.sort(key=lambda page: (page['count'], page['updated']),
                 reverse=True)
    if len(history) > maxCount:
        history = history[:resetCount]
    cmapi.saveItem('page.history', history, modelname='RunStatus')
def isConstantTitle(titleConfig, url, title, sideEffect):
    """Tell whether ``title`` has been seen often enough on the URL's host.

    Per-host records are read from the PageConstantTitle model, keyed by the
    network location of ``url``. Each record maps a title to a dict with
    'c' (occurrence count) and 'u' (last update stamp).

    Args:
        titleConfig: dict-like settings; 'occurrence' (default 1) is the
            count threshold, 'cache.day' (default 7) the age in days at
            which records are purged.
        url: page URL; a falsy value yields False immediately.
        title: the title to look up.
        sideEffect: when truthy, the title's count and stamp are updated and
            the host record is saved.

    Returns:
        True when the count stored *before* this call is at least
        ``titleConfig['occurrence']``, otherwise False.
    """
    if not url:
        return False
    key = urlparse.urlparse(url).netloc
    value = cmapi.getItemValue(key, {}, modelname=PageConstantTitle)
    record = value.get(title) or {}
    count = record.get('c', 0)
    isconstant = count >= titleConfig.get('occurrence', 1)
    if sideEffect:
        nnow = datetime.datetime.utcnow()
        record['c'] = count + 1
        record['u'] = dateutil.getDateAs14(nnow)
        if len(value) > 20:
            cacheDays = titleConfig.get('cache.day', 7)
            # Iterate over a snapshot: entries are deleted from ``value``
            # inside the loop, which is not allowed while iterating a live
            # dict view.
            for ik, iv in list(value.items()):
                if (nnow - dateutil.parseDate14(iv['u'])).days >= cacheDays:
                    del value[ik]
        # Re-add the current title in case the purge above removed it.
        value[title] = record
        cmapi.saveItem(key, value, modelname=PageConstantTitle)
    return isconstant
def _saveNow(datasource, items, keyname):
    """Store ``items`` as the pages of ``datasource`` under ``keyname``.

    The list read from the LatestItem model is first reduced to entries whose
    source 'added' stamp is not older than ``dateutil.getHoursAs14(7 * 24)``.
    The first remaining entry with the same source 'slug' as ``datasource``
    is replaced by ``{'source': datasource, 'pages': items}``; if there is
    none, that dict is appended. The list is then saved back.
    """
    retentionDays = 7
    stored = cmapi.getItemValue(keyname, [], modelname=LatestItem)
    cutoff14 = dateutil.getHoursAs14(retentionDays * 24)
    kept = [entry for entry in stored
            if entry['source']['added'] >= cutoff14]

    replacement = {'source': datasource, 'pages': items}
    for position, existing in enumerate(kept):
        if existing['source'].get('slug') == datasource.get('slug'):
            kept[position] = replacement
            break
    else:
        # No entry for this slug yet.
        kept.append(replacement)
    cmapi.saveItem(keyname, kept, modelname=LatestItem)
def isSourceDeprecated(slug):
    """Tell whether ``slug`` is in the stored list of deprecated sources.

    The list is read from the "RunStatus" model under the key returned by
    ``_getKeyname()``.
    """
    deprecated = cmapi.getItemValue(_getKeyname(), [], modelname="RunStatus")
    return slug in deprecated
def addDeprecatedSource(slug):
    """Add ``slug`` to the stored list of deprecated sources.

    The list lives in the "RunStatus" model under the key returned by
    ``_getKeyname()``. Nothing is written when ``slug`` is already listed.
    """
    deprecated = cmapi.getItemValue(_getKeyname(), [], modelname="RunStatus")
    if slug in deprecated:
        return
    deprecated.append(slug)
    cmapi.saveItem(_getKeyname(), deprecated, modelname="RunStatus")
def getStopWords():
    """Return the value stored under 'words.stop'.

    ``[]`` is passed to ``cmapi.getItemValue`` as its second argument
    (presumably the fallback when nothing is stored).
    """
    stopWords = cmapi.getItemValue('words.stop', [])
    return stopWords
def getDatasourceHistory(slug):
    """Return the DatasourceHistory value for ``slug``.

    The storage key is derived with ``_getDatasourceHistoryKey(slug)``; ``{}``
    is passed as the second argument of ``cmapi.getItemValue`` (presumably
    the fallback when nothing is stored).
    """
    return cmapi.getItemValue(
        _getDatasourceHistoryKey(slug), {}, modelname=DatasourceHistory)
def getPosters():
    """Return the value stored under the key given by ``_getPosterListKey()``.

    ``[]`` is passed to ``cmapi.getItemValue`` as its second argument
    (presumably the fallback when nothing is stored).
    """
    posterKey = _getPosterListKey()
    return cmapi.getItemValue(posterKey, [])
def getEditorFormat():
    """Return the value stored under 'editor.format'.

    ``{}`` is passed to ``cmapi.getItemValue`` as its second argument
    (presumably the fallback when nothing is stored).
    """
    editorFormat = cmapi.getItemValue('editor.format', {})
    return editorFormat
def getFetchTimeout():
    """Return the value stored under '~.fetchtimeout'.

    The module constant ``_FETCH_TIMEOUT`` is passed to
    ``cmapi.getItemValue`` as its second argument (presumably the fallback
    when nothing is stored).
    """
    timeout = cmapi.getItemValue('~.fetchtimeout', _FETCH_TIMEOUT)
    return timeout
def isPageInHistory(url):
    """Tell whether any entry of the stored 'page.history' list has ``url``.

    The list is read from the 'RunStatus' model; an entry matches when its
    'url' value equals ``url``.
    """
    history = cmapi.getItemValue('page.history', [], modelname='RunStatus')
    return any(entry.get('url') == url for entry in history)
def isSourceDeprecated(slug):
    """Tell whether ``slug`` is in the stored list of deprecated sources.

    The list is read from the 'RunStatus' model under the key returned by
    ``_getKeyname()``.
    """
    deprecated = cmapi.getItemValue(_getKeyname(), [], modelname='RunStatus')
    return slug in deprecated
def addDeprecatedSource(slug):
    """Add ``slug`` to the stored list of deprecated sources.

    The list lives in the 'RunStatus' model under the key returned by
    ``_getKeyname()``. Nothing is written when ``slug`` is already listed.
    """
    deprecated = cmapi.getItemValue(_getKeyname(), [], modelname='RunStatus')
    if slug in deprecated:
        return
    deprecated.append(slug)
    cmapi.saveItem(_getKeyname(), deprecated, modelname='RunStatus')
def removeDeprecatedSource(slug):
    """Remove ``slug`` from the stored list of deprecated sources.

    The list lives in the 'RunStatus' model under the key returned by
    ``_getKeyname()``. Nothing is written when ``slug`` is not listed.
    """
    deprecated = cmapi.getItemValue(_getKeyname(), [], modelname='RunStatus')
    if slug not in deprecated:
        return
    deprecated.remove(slug)
    cmapi.saveItem(_getKeyname(), deprecated, modelname='RunStatus')
def getSiteConfig():
    """Return the value stored under 'site'.

    ``{'name': 'Site Name'}`` is passed to ``cmapi.getItemValue`` as its
    second argument (presumably the fallback when nothing is stored).
    """
    fallback = {'name': 'Site Name'}
    return cmapi.getItemValue('site', fallback)
def getWordsRequest(keyname):
    """Fetch the WordsRequest item stored under ``keyname`` and remove it.

    The value is read first, then the item is removed from the WordsRequest
    model, so each stored request is handed out by this function once.

    Returns:
        Whatever ``cmapi.getItemValue`` returned for ``keyname``.
    """
    request = cmapi.getItemValue(keyname, modelname=WordsRequest)
    cmapi.removeItem(keyname, modelname=WordsRequest)
    return request
def getI18N():
    """Return the value stored under 'i18n'.

    ``{'home': 'Home'}`` is passed to ``cmapi.getItemValue`` as its second
    argument (presumably the fallback when nothing is stored).
    """
    fallback = {'home': 'Home'}
    return cmapi.getItemValue('i18n', fallback)
def getUserAgent():
    """Return the value stored under '~.useragent'.

    The module constant ``_USER_AGENT`` is passed to ``cmapi.getItemValue``
    as its second argument (presumably the fallback when nothing is stored).
    """
    userAgent = cmapi.getItemValue('~.useragent', _USER_AGENT)
    return userAgent
def getRobots():
    """Return the value stored under 'robots'.

    ``DEFAULT_ROBOTS`` is passed to ``cmapi.getItemValue`` as its second
    argument (presumably the fallback when nothing is stored).
    """
    robots = cmapi.getItemValue('robots', DEFAULT_ROBOTS)
    return robots
def getDatasources(keyname=None):
    """Return the LatestItem value stored under ``keyname``.

    A falsy ``keyname`` (the default) selects the 'sites' key. ``[]`` is
    passed to ``cmapi.getItemValue`` as its second argument (presumably the
    fallback when nothing is stored).
    """
    return cmapi.getItemValue(keyname or 'sites', [], modelname=LatestItem)
def getDatasourceHistory():
    """Return the DatasourceHistory value stored under the history key.

    The key comes from ``_getDatasourceHistoryKey()``; ``[]`` is passed to
    ``cmapi.getItemValue`` as its second argument (presumably the fallback
    when nothing is stored).
    """
    return cmapi.getItemValue(
        _getDatasourceHistoryKey(), [], modelname=DatasourceHistory)
def getChannelGroups(slug):
    """Return the ChannelGroup value for ``slug``, or the 'default' one.

    The 'default' key is only looked up when the value read for ``slug`` is
    falsy (for example an empty list).
    """
    return (cmapi.getItemValue(slug, [], modelname=ChannelGroup)
            or cmapi.getItemValue('default', [], modelname=ChannelGroup))
def getArchiveConfig():
    """Return the value stored under 'archive'.

    ``{}`` is passed to ``cmapi.getItemValue`` as its second argument
    (presumably the fallback when nothing is stored).
    """
    archiveConfig = cmapi.getItemValue('archive', {})
    return archiveConfig
def removeDeprecatedSource(slug):
    """Remove ``slug`` from the stored list of deprecated sources.

    The list lives in the "RunStatus" model under the key returned by
    ``_getKeyname()``. Nothing is written when ``slug`` is not listed.
    """
    deprecated = cmapi.getItemValue(_getKeyname(), [], modelname="RunStatus")
    if slug not in deprecated:
        return
    deprecated.remove(slug)
    cmapi.saveItem(_getKeyname(), deprecated, modelname="RunStatus")
def getTwitterAccount():
    """Return the value stored under 'twitter.account'.

    ``{}`` is passed to ``cmapi.getItemValue`` as its second argument
    (presumably the fallback when nothing is stored).
    """
    account = cmapi.getItemValue('twitter.account', {})
    return account
def getWordsDict():
    """Return the value stored under 'words.dict'.

    ``[]`` is passed to ``cmapi.getItemValue`` as its second argument
    (presumably the fallback when nothing is stored).
    """
    wordsDict = cmapi.getItemValue('words.dict', [])
    return wordsDict