示例#1
0
def isUuidHandled(uuid):
    """Tell whether *uuid* appears in the "uuids" item of the RunStatus model.

    A falsy uuid (None, empty string, ...) is reported as not handled
    without consulting the store.
    """
    if not uuid:
        return False
    handled = cmapi.getItemValue("uuids", [], modelname="RunStatus")
    return uuid in handled
示例#2
0
def archiveData(key, datasources):
    """Append *datasources* to the archive item stored under *key*.

    The current value is read from the DatasourceArchive model (with [] passed
    as the fallback argument). When that value is empty, *datasources* itself
    becomes the stored value; otherwise it is extended in place. The result is
    written back under the same key.
    """
    archived = cmapi.getItemValue(key, [], modelname=DatasourceArchive)
    if not archived:
        archived = datasources
    else:
        archived.extend(datasources)
    cmapi.saveItem(key, archived, modelname=DatasourceArchive)
示例#3
0
def updateUuids(uuid):
    """Record *uuid* at the front of the "uuids" item in the RunStatus model.

    Nothing happens for a falsy uuid. Only the first 100 entries are saved,
    so the oldest ones fall off the end.
    """
    if not uuid:
        return
    limit = 100
    recent = cmapi.getItemValue("uuids", [], modelname="RunStatus")
    # Newest first: the new uuid goes to index 0 before truncating.
    recent.insert(0, uuid)
    cmapi.saveItem("uuids", recent[:limit], modelname="RunStatus")
示例#4
0
def getPages(datasources=None, keyname=None):
    """Flatten the pages of several datasources into a single list.

    Parameters:
        datasources: iterable of dicts, each with a 'source' entry and a
            'pages' list of dicts. Ignored when *keyname* is truthy.
        keyname: when truthy, the datasources are loaded from the LatestItem
            model under this key instead of using the *datasources* argument.

    Returns:
        A list of the page dicts, in datasource order. Each page dict is
        modified in place: its 'source' entry is set to the 'source' of the
        datasource it came from.

    When neither argument yields any datasources (e.g. both are left at
    their None defaults), an empty list is returned.
    """
    if keyname:
        datasources = cmapi.getItemValue(keyname, [], modelname=LatestItem)
    pages = []
    # Both parameters default to None; treat a missing list as empty rather
    # than failing with a TypeError while iterating None.
    for datasource in datasources or []:
        for childPage in datasource['pages']:
            childPage['source'] = datasource['source']
            pages.append(childPage)
    return pages
示例#5
0
def getWordsConfig():
    """Return the 'words' configuration with missing settings filled in.

    The value is read with cmapi.getItemValue('words', {}). Each setting
    listed below is added with its fallback only when the key is absent;
    existing values are left untouched. The same object is returned.
    """
    config = cmapi.getItemValue('words', {})
    fallbacks = (
        ('stop.patterns', []),
        ('similar', {'0': 6}),
        ('hours.all', 24),
        ('hours.latest', 4),
        # An earlier fallback of ['n', 'ns', 'nr', 'eng'] is disabled.
        ('psegs', []),
    )
    for name, fallback in fallbacks:
        if name not in config:
            config[name] = fallback
    return config
示例#6
0
def getUrlAdded(url, added):
    """Count a sighting of *url* and return the 'added' value kept for it.

    The record list is read from the RunStatus model under the key given by
    _getKeyname(). A record already found for *url* gets its 'count'
    incremented; otherwise a new record is created with count 1 and the
    supplied *added* value. The record's 'updated' stamp is refreshed, records
    whose 'updated' is not later than dateutil.getHoursAs14(24) are dropped,
    and the remaining list is saved.

    Returns the 'added' entry of the record for *url*: the stored one for a
    known URL, the *added* argument for a new one.
    """
    records = cmapi.getItemValue(_getKeyname(), [],
                    modelname='RunStatus')
    record = _getItem(records, url)
    if not record:
        record = {'count': 1, 'url': url, 'added': added}
        records.append(record)
    else:
        record['count'] += 1
    record['updated'] = dateutil.getDateAs14(datetime.datetime.utcnow())
    cutoff = dateutil.getHoursAs14(24)
    fresh = [entry for entry in records if entry['updated'] > cutoff]
    cmapi.saveItem(_getKeyname(), fresh,
                modelname='RunStatus')
    return record['added']
示例#7
0
def increaseIncomingBandwidth(bytes):
    """Add one fetch of *bytes* bytes to the 'inbandwidth' RunStatus item.

    Three parts of the stored dict are maintained:
      * 'all'     - running totals ('bytes', 'fetch') plus a 'start' stamp
                    set when the totals are first created;
      * 'current' - totals for the current day, keyed by the '%Y%m%d' date
                    in the time zone named by 'tz' ('US/Pacific' if unset);
      * 'history' - previous 'current' entries, newest first, truncated to
                    'historycount' entries (7 if unset).
    The updated dict is saved back at the end.
    """
    storeKey = 'inbandwidth'
    stats = cmapi.getItemValue(storeKey, {}, modelname='RunStatus')

    # Running totals.
    total = stats.get('all')
    if not total:
        total = {}
        total['start'] = dateutil.getDateAs14(datetime.datetime.utcnow())
        stats['all'] = total
    total['bytes'] = total.get('bytes', 0) + bytes
    total['fetch'] = total.get('fetch', 0) + 1

    # Time zone that decides where one day ends and the next begins.
    zoneName = stats.get('tz')
    if not zoneName:
        zoneName = 'US/Pacific'
        stats['tz'] = zoneName

    utcNow = datetime.datetime.now(tz=pytz.utc)
    dayKey = utcNow.astimezone(pytz.timezone(zoneName)).strftime('%Y%m%d')

    today = stats.get('current')
    if today and today.get('key') == dayKey:
        # Same day as the stored entry: just accumulate.
        today['fetch'] += 1
        today['bytes'] += bytes
    else:
        keep = stats.get('historycount')
        if not keep:
            keep = 7
            stats['historycount'] = keep
        if today:
            # Day rolled over: push the finished day onto the history.
            past = stats.get('history') or []
            past.insert(0, today)
            stats['history'] = past[:keep]
        stats['current'] = {'key': dayKey, 'bytes': bytes, 'fetch': 1}

    cmapi.saveItem(storeKey, stats, modelname='RunStatus')
示例#8
0
def savePageHistory(url):
    """Record a visit to *url* in the 'page.history' RunStatus item.

    The entry for *url* gets its 'count' incremented (or is created with
    count 1) and its 'updated' stamp set to the current UTC time formatted
    as '%Y%m%d%H%M%S'. The list is then ordered by count, highest first,
    with ties ordered by most recent update. Once it holds more than 1000
    entries it is cut back to the first 200 before being saved.
    """
    MAX_COUNT = 1000
    RESET_COUNT = 200
    history = cmapi.getItemValue('page.history', [], modelname='RunStatus')
    entry = next((page for page in history if page.get('url') == url), None)
    if not entry:
        entry = {'count': 1, 'url': url}
        history.append(entry)
    else:
        entry['count'] += 1
    entry['updated'] = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    # One stable sort on (count, updated) gives the same order as sorting by
    # 'updated' and then by 'count'.
    history.sort(key=lambda page: (page['count'], page['updated']),
                 reverse=True)
    if len(history) > MAX_COUNT:
        history = history[:RESET_COUNT]
    cmapi.saveItem('page.history', history, modelname='RunStatus')
示例#9
0
def isConstantTitle(titleConfig, url, title, sideEffect):
    """Tell whether *title* has already been seen often enough on url's host.

    Occurrence records are kept per host (the netloc of *url*) in the
    PageConstantTitle model, as a dict mapping each title to a record with
    'c' (occurrence count) and 'u' (last-update stamp).

    Parameters:
        titleConfig: dict-like settings; 'occurrence' (default 1) is the
            count at which a title is considered constant, 'cache.day'
            (default 7) the age in days at which records are purged.
        url: page URL; a falsy value yields False without touching the store.
        title: the title to look up.
        sideEffect: when truthy, this occurrence is recorded and saved.

    Returns:
        True when the count stored *before* this call already reaches the
        'occurrence' threshold, otherwise False.
    """
    if not url:
        return False
    netloc = urlparse.urlparse(url).netloc
    key = netloc
    value = cmapi.getItemValue(key, {}, modelname=PageConstantTitle)
    record = value.get(title)
    if not record:
        record = {}
    count = record.get('c', 0)
    isconstant = count >= titleConfig.get('occurrence', 1)
    if sideEffect:
        nnow = datetime.datetime.utcnow()
        record['c'] = count + 1
        record['u'] = dateutil.getDateAs14(nnow)
        if len(value) > 20:
            cacheDays = titleConfig.get('cache.day', 7)
            # Collect the stale keys first: deleting from a dict while
            # iterating over it raises RuntimeError on Python 3.
            expired = [
                ik for ik, iv in value.items()
                if (nnow - dateutil.parseDate14(iv['u'])).days >= cacheDays
            ]
            for ik in expired:
                del value[ik]
        # Stored after the purge so the current title is always kept.
        value[title] = record
        cmapi.saveItem(key, value, modelname=PageConstantTitle)
    return isconstant
示例#10
0
def savePageHistory(url):
    """Bump the visit counter kept for *url* in 'page.history' (RunStatus).

    A matching entry has its 'count' increased by one; when none matches, a
    new entry with count 1 is appended. The entry's 'updated' field is set
    to the current UTC time as '%Y%m%d%H%M%S'. Entries are then sorted by
    'updated' and, in a second stable pass, by 'count' (both descending). A
    list longer than 1000 entries is reduced to its first 200 before saving.
    """
    entries = cmapi.getItemValue('page.history', [], modelname='RunStatus')
    for candidate in entries:
        if candidate.get('url') == url:
            break
    else:
        candidate = None
    if not candidate:
        candidate = {'count': 1, 'url': url}
        entries.append(candidate)
    else:
        candidate['count'] += 1
    stamp = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    candidate['updated'] = stamp
    # Two stable passes: the second (by count) keeps the recency order of
    # the first among entries with equal counts.
    entries.sort(key=lambda item: item['updated'], reverse=True)
    entries.sort(key=lambda item: item['count'], reverse=True)
    MAX_COUNT = 1000
    RESET_COUNT = 200
    if len(entries) > MAX_COUNT:
        entries = entries[:RESET_COUNT]
    cmapi.saveItem('page.history', entries, modelname='RunStatus')
示例#11
0
def _saveNow(datasource, items, keyname):
    """Store *items* as the pages of *datasource* in the LatestItem list.

    The list saved under *keyname* is first reduced to entries whose
    source 'added' value is at or after dateutil.getHoursAs14(7 * 24).
    An entry whose source has the same 'slug' as *datasource* is replaced
    by the new {'source', 'pages'} entry; otherwise the new entry is
    appended. The resulting list is saved back under *keyname*.
    """
    stored = cmapi.getItemValue(keyname, [], modelname=LatestItem)

    days = 7
    cutoff = dateutil.getHoursAs14(days * 24)
    recent = [entry for entry in stored
              if entry['source']['added'] >= cutoff]

    fresh = {
        'source': datasource,
        'pages': items,
    }

    for position, entry in enumerate(recent):
        if entry['source'].get('slug') == datasource.get('slug'):
            # Replace the first entry for this slug in place.
            recent[position] = fresh
            break
    else:
        recent.append(fresh)
    cmapi.saveItem(keyname, recent, modelname=LatestItem)
示例#12
0
def isSourceDeprecated(slug):
    """Tell whether *slug* is in the RunStatus list stored under _getKeyname()."""
    deprecated = cmapi.getItemValue(_getKeyname(), [], modelname="RunStatus")
    return slug in deprecated
示例#13
0
def addDeprecatedSource(slug):
    """Add *slug* to the RunStatus list stored under _getKeyname().

    The list is saved only when *slug* was not already in it.
    """
    deprecated = cmapi.getItemValue(_getKeyname(), [], modelname="RunStatus")
    if slug in deprecated:
        return
    deprecated.append(slug)
    cmapi.saveItem(_getKeyname(), deprecated, modelname="RunStatus")
示例#14
0
def getStopWords():
    """Return the value of the 'words.stop' item, passing [] as the fallback."""
    stopWords = cmapi.getItemValue('words.stop', [])
    return stopWords
示例#15
0
def getDatasourceHistory(slug):
    """Return the DatasourceHistory value for *slug*.

    The storage key is derived from the slug by _getDatasourceHistoryKey();
    {} is passed as the fallback value.
    """
    return cmapi.getItemValue(
        _getDatasourceHistoryKey(slug), {}, modelname=DatasourceHistory)
示例#16
0
def getPosters():
    """Return the item stored under _getPosterListKey(), passing [] as the fallback."""
    listKey = _getPosterListKey()
    return cmapi.getItemValue(listKey, [])
示例#17
0
def getEditorFormat():
    """Return the value of the 'editor.format' item, passing {} as the fallback."""
    editorFormat = cmapi.getItemValue('editor.format', {})
    return editorFormat
示例#18
0
def getFetchTimeout():
    """Return the '~.fetchtimeout' item, passing _FETCH_TIMEOUT as the fallback."""
    timeout = cmapi.getItemValue('~.fetchtimeout', _FETCH_TIMEOUT)
    return timeout
示例#19
0
def isPageInHistory(url):
    """Tell whether any entry of 'page.history' (RunStatus) has this *url*."""
    history = cmapi.getItemValue('page.history', [], modelname='RunStatus')
    return any(entry.get('url') == url for entry in history)
示例#20
0
def isSourceDeprecated(slug):
    """Report membership of *slug* in the list kept under _getKeyname().

    The list is read from the RunStatus model with [] passed as the fallback.
    """
    slugs = cmapi.getItemValue(_getKeyname(), [], modelname='RunStatus')
    return slug in slugs
示例#21
0
def addDeprecatedSource(slug):
    """Append *slug* to the list kept under _getKeyname() in RunStatus.

    A slug already in the list causes no write at all.
    """
    slugs = cmapi.getItemValue(_getKeyname(), [], modelname='RunStatus')
    if slug in slugs:
        return
    slugs.append(slug)
    cmapi.saveItem(_getKeyname(), slugs, modelname='RunStatus')
示例#22
0
def removeDeprecatedSource(slug):
    """Remove *slug* from the list kept under _getKeyname() in RunStatus.

    Only the first occurrence is removed; when the slug is not in the list,
    nothing is saved.
    """
    slugs = cmapi.getItemValue(_getKeyname(), [], modelname='RunStatus')
    if slug not in slugs:
        return
    slugs.remove(slug)
    cmapi.saveItem(_getKeyname(), slugs, modelname='RunStatus')
示例#23
0
def getSiteConfig():
    """Return the 'site' item, passing a placeholder site name as the fallback."""
    fallback = {'name': 'Site Name'}
    return cmapi.getItemValue('site', fallback)
示例#24
0
def getWordsRequest(keyname):
    """Fetch the WordsRequest item stored under *keyname*, then remove it.

    The value read before the removal is returned.
    """
    request = cmapi.getItemValue(keyname, modelname=WordsRequest)
    # The item is removed after it has been read once.
    cmapi.removeItem(keyname, modelname=WordsRequest)
    return request
示例#25
0
def isPageInHistory(url):
    """Return True when 'page.history' (RunStatus) has an entry whose 'url' equals *url*."""
    visited = cmapi.getItemValue('page.history', [], modelname='RunStatus')
    matches = (record.get('url') == url for record in visited)
    return any(matches)
示例#26
0
def getI18N():
    """Return the 'i18n' item, passing {'home': 'Home'} as the fallback."""
    fallback = {'home': 'Home'}
    return cmapi.getItemValue('i18n', fallback)
示例#27
0
def getUserAgent():
    """Return the '~.useragent' item, passing _USER_AGENT as the fallback."""
    userAgent = cmapi.getItemValue('~.useragent', _USER_AGENT)
    return userAgent
示例#28
0
def getPosters():
    """Return the poster list read under the key from _getPosterListKey().

    [] is passed as the fallback value.
    """
    posters = cmapi.getItemValue(_getPosterListKey(), [])
    return posters
示例#29
0
def getRobots():
    """Return the 'robots' item, passing DEFAULT_ROBOTS as the fallback."""
    robots = cmapi.getItemValue('robots', DEFAULT_ROBOTS)
    return robots
示例#30
0
def getDatasources(keyname=None):
    """Return the LatestItem value stored under *keyname*.

    A falsy *keyname* selects the 'sites' key. [] is passed as the fallback
    value.
    """
    effectiveKey = keyname or 'sites'
    return cmapi.getItemValue(effectiveKey, [], modelname=LatestItem)
示例#31
0
def getDatasourceHistory():
    """Return the DatasourceHistory value under _getDatasourceHistoryKey().

    [] is passed as the fallback value.
    """
    return cmapi.getItemValue(
        _getDatasourceHistoryKey(), [], modelname=DatasourceHistory)
示例#32
0
def getChannelGroups(slug):
    """Return the ChannelGroup value for *slug*, or the 'default' one.

    The 'default' item is consulted only when the value found for *slug* is
    empty (falsy).
    """
    groups = cmapi.getItemValue(slug, [], modelname=ChannelGroup)
    if groups:
        return groups
    return cmapi.getItemValue('default', [], modelname=ChannelGroup)
示例#33
0
def getArchiveConfig():
    """Return the value of the 'archive' item, passing {} as the fallback."""
    archiveConfig = cmapi.getItemValue('archive', {})
    return archiveConfig
示例#34
0
def removeDeprecatedSource(slug):
    """Drop *slug* from the RunStatus list stored under _getKeyname().

    The list is saved only when *slug* was in it; the first occurrence is
    the one removed.
    """
    deprecated = cmapi.getItemValue(_getKeyname(), [], modelname="RunStatus")
    if slug not in deprecated:
        return
    deprecated.remove(slug)
    cmapi.saveItem(_getKeyname(), deprecated, modelname="RunStatus")
示例#35
0
def getTwitterAccount():
    """Return the value of the 'twitter.account' item, passing {} as the fallback."""
    account = cmapi.getItemValue('twitter.account', {})
    return account
示例#36
0
def getWordsDict():
    """Return the value of the 'words.dict' item, passing [] as the fallback."""
    wordsDict = cmapi.getItemValue('words.dict', [])
    return wordsDict