def archiveData(timezone):
    """Archive datasource history in one-day buckets, newest bucket first.

    The newest bucket ends at 23:59 of the most recent day whose end
    (23:59 on the current UTC calendar date, minus ``timezone`` hours) is
    not later than the current UTC time.  Walking backwards one day at a
    time, every history item whose 'added' stamp falls inside a bucket is
    passed to ``models.archiveData`` under that bucket's ``%Y%m%d`` label.
    Afterwards only items added after the newest bucket's end are written
    back with ``models.saveDatasourceHistory``; items without an 'added'
    key are neither archived nor kept.

    Args:
        timezone: offset in hours subtracted from the day end before it
            is compared with UTC now (presumably the local UTC offset --
            confirm with callers).
    """
    oneDay = datetime.timedelta(days=1)
    utcNow = datetime.datetime.utcnow()

    # labelEnd names the bucket; cutoffEnd is the same moment shifted by
    # the timezone offset and is what 'added' stamps are compared against.
    labelEnd = datetime.datetime(utcNow.year, utcNow.month, utcNow.day,
                                 23, 59, 0)
    cutoffEnd = labelEnd - datetime.timedelta(hours=timezone)
    if cutoffEnd > utcNow:
        # That day is not over yet, so begin with the previous one.
        labelEnd -= oneDay
        cutoffEnd -= oneDay
    newestCutoff = cutoffEnd

    history = models.getDatasourceHistory()
    pending = history
    while True:
        upper = dateutil.getDateAs14(cutoffEnd)
        pending = [
            source for source in pending
            if 'added' in source and source['added'] <= upper
        ]
        if not pending:
            # Nothing older is left to archive.
            break

        lower = dateutil.getDateAs14(cutoffEnd - oneDay)
        bucket = [
            source for source in pending
            if 'added' in source and source['added'] > lower
        ]
        if bucket:
            models.archiveData(labelEnd.strftime('%Y%m%d'), bucket)

        labelEnd -= oneDay
        cutoffEnd -= oneDay

    # Keep only what was added after the newest archived bucket.
    keepAfter = dateutil.getDateAs14(newestCutoff)
    remaining = [
        source for source in history
        if 'added' in source and source['added'] > keepAfter
    ]
    models.saveDatasourceHistory(remaining)
def getUrlAdded(url, added):
    """Return the 'added' stamp recorded for ``url``, registering it if new.

    The tracking list is loaded from the 'RunStatus' model.  A known url
    has its 'count' incremented; an unknown url is appended with count 1
    and the supplied ``added`` value.  The entry's 'updated' stamp is
    refreshed, entries whose 'updated' stamp is not greater than
    ``dateutil.getHoursAs14(24)`` are discarded, and the list is saved.

    Args:
        url: the url to look up.
        added: stamp stored as 'added' when the url is not yet tracked.

    Returns:
        The 'added' value stored on the entry for ``url``.
    """
    entries = cmapi.getItemValue(_getKeyname(), [], modelname='RunStatus')
    entry = _getItem(entries, url)
    if not entry:
        entry = {'count': 1, 'url': url, 'added': added}
        entries.append(entry)
    else:
        entry['count'] += 1
    entry['updated'] = dateutil.getDateAs14(datetime.datetime.utcnow())

    # Drop entries that have not been touched since the cutoff stamp.
    cutoff = dateutil.getHoursAs14(24)
    fresh = [candidate for candidate in entries
             if candidate['updated'] > cutoff]
    cmapi.saveItem(_getKeyname(), fresh, modelname='RunStatus')
    return entry['added']
def getPage(url):
    """Collect page, alexa/dmoz and pagerank details for ``url``.

    Each fetcher contributes only when it yields something usable: a
    truthy page info, the 'alexa' / 'dmoz' keys present in a truthy alexa
    result, and a pagerank that is >= 0.  If anything was collected, an
    'updated' stamp for the current UTC time is added.

    Returns:
        A dict with whichever of 'page', 'alexa', 'dmoz', 'pagerank' were
        obtained (plus 'updated'), or an empty dict when nothing was.
    """
    info = {}

    page = pageinfo.fetch(url)
    if page:
        info['page'] = page

    alexa = alexainfo.fetch(url)
    if alexa:
        for field in ('alexa', 'dmoz'):
            if field in alexa:
                info[field] = alexa[field]

    rank = pagerankinfo.fetch(url)
    if rank >= 0:
        info['pagerank'] = rank

    if info:
        info['updated'] = dateutil.getDateAs14(datetime.datetime.utcnow())
    return info
def _saveWords(keyname, words, pages):
    """Match each keyword group against ``pages`` and persist the hits.

    Only keyword groups for which ``globalutil.search`` returns a truthy
    result are kept.  Each kept entry records the keywords, the first
    matched page, the number of matches and the value produced by
    ``_getNaturalKeywords``.

    Args:
        keyname: key passed through to ``models.saveWords``.
        words: iterable of keyword groups to search for.
        pages: pages handed to ``globalutil.search``.

    Returns:
        The list of matched word entries that was saved.
    """
    kept = []
    for keywords in words:
        hits = globalutil.search(pages, keywords)
        if not hits:
            continue
        kept.append({
            'keywords': keywords,
            'page': hits[0],
            'size': len(hits),
            'readablekeywords': _getNaturalKeywords(keywords, hits),
        })

    stamp = dateutil.getDateAs14(datetime.datetime.utcnow())
    models.saveWords(keyname, {'updated': stamp, 'words': kept})
    return kept
def increaseIncomingBandwidth(bytes):
    """Record one fetch of ``bytes`` bytes in the 'inbandwidth' status item.

    Three things are maintained in the stored dict:
      * 'all'     -- running totals ('bytes', 'fetch') plus the 'start'
                     stamp taken when the totals were first created;
      * 'current' -- totals for the current day, keyed by the ``%Y%m%d``
                     date in the configured timezone ('tz', defaulting to
                     'US/Pacific');
      * 'history' -- previous 'current' records, newest first, trimmed to
                     'historycount' entries (defaulting to 7).

    Defaults that had to be filled in are written back into the item.

    Args:
        bytes: number of bytes to add to the totals.
    """
    storeKey = 'inbandwidth'
    stats = cmapi.getItemValue(storeKey, {}, modelname='RunStatus')

    # Lifetime totals.
    totals = stats.get('all')
    if not totals:
        totals = {'start': dateutil.getDateAs14(datetime.datetime.utcnow())}
        stats['all'] = totals
    totals['bytes'] = totals.get('bytes', 0) + bytes
    totals['fetch'] = totals.get('fetch', 0) + 1

    # Day key in the configured timezone.
    zoneName = stats.get('tz')
    if not zoneName:
        zoneName = 'US/Pacific'
        stats['tz'] = zoneName
    zonedNow = datetime.datetime.now(tz=pytz.utc).astimezone(
        pytz.timezone(zoneName))
    dayKey = zonedNow.strftime('%Y%m%d')

    today = stats.get('current')
    if today and today.get('key') == dayKey:
        # Still the same day: accumulate into the current record.
        today['fetch'] += 1
        today['bytes'] += bytes
    else:
        keep = stats.get('historycount')
        if not keep:
            keep = 7
            stats['historycount'] = keep
        if today:
            # The day rolled over: move the old record to the history head.
            past = stats.get('history') or []
            past.insert(0, today)
            stats['history'] = past[:keep]
        stats['current'] = {'key': dayKey, 'bytes': bytes, 'fetch': 1}

    cmapi.saveItem(storeKey, stats, modelname='RunStatus')
def isConstantTitle(titleConfig, url, title, sideEffect):
    """Tell whether ``title`` has already been seen often enough on url's host.

    Per-host records are stored under the url's netloc in the
    ``PageConstantTitle`` model as a dict mapping title -> record, where a
    record holds 'c' (occurrence count) and 'u' (last-update stamp).

    Args:
        titleConfig: dict-like settings; 'occurrence' (default 1) is the
            count threshold and 'cache.day' (default 7) the age in days at
            which records are pruned.
        url: page url; a falsy url short-circuits to False.
        title: the title to check.
        sideEffect: when truthy, the occurrence is counted and the updated
            records are saved.

    Returns:
        True if the count stored *before* this call is at least the
        'occurrence' threshold, otherwise False.
    """
    if not url:
        return False

    key = urlparse.urlparse(url).netloc
    value = cmapi.getItemValue(key, {}, modelname=PageConstantTitle)
    record = value.get(title) or {}
    count = record.get('c', 0)
    isconstant = count >= titleConfig.get('occurrence', 1)

    if sideEffect:
        nnow = datetime.datetime.utcnow()
        record['c'] = count + 1
        record['u'] = dateutil.getDateAs14(nnow)

        # Prune only once the host has accumulated more than 20 titles.
        if len(value) > 20:
            maxAgeDays = titleConfig.get('cache.day', 7)
            # Collect the stale keys first: deleting from a dict while
            # iterating over its items() view raises RuntimeError.
            staleTitles = [
                ik for ik, iv in value.items()
                if (nnow - dateutil.parseDate14(iv['u'])).days >= maxAgeDays
            ]
            for ik in staleTitles:
                del value[ik]

        # (Re)insert after pruning so the current title is always kept.
        value[title] = record
        cmapi.saveItem(key, value, modelname=PageConstantTitle)

    return isconstant
def post(self):
    """Resolve each posted keyword to a page and push the result back.

    The JSON request body supplies 'items', 'hash', 'callbackurl' and
    'origin'.  For every item, the first page returned by ``bs.search``
    for the item's title is used (its 'added' stamp comes from
    ``models.getUrlAdded``); when nothing is found a stub carrying only
    the item's own 'added' value is used instead.  Each result is tagged
    with the item's title, rank and added stamp.

    If the hash of the results equals the posted hash, this is logged and
    nothing is sent.  Otherwise the results are posted to the callback
    url and the outcome message is logged and written to the response.
    """
    payload = json.loads(self.request.body)
    items = payload['items']
    previousHash = payload['hash']
    callbackurl = payload['callbackurl']

    stamp = dateutil.getDateAs14(datetime.datetime.utcnow())
    resultItems = []
    for item in items:
        hits = bs.search(item['title'])
        if hits:
            entry = hits[0]
            entry['added'] = models.getUrlAdded(entry['url'], stamp)
        else:
            entry = {'added': item['added']}
        entry['keyword'] = item['title']
        entry['rank'] = item['rank']
        entry['keywordadded'] = item['added']
        resultItems.append(entry)

    contentHash = _calculateHash(resultItems)
    origin = payload['origin']
    if previousHash == contentHash:
        logging.info('No change fetch for %s.' % (origin, ))
        return

    responseData = {
        'origin': origin,
        'items': resultItems,
        'hash': contentHash,
    }
    delivered = networkutil.postData(callbackurl, responseData,
                                     trycount=_CALLBACK_TRYCOUNT,
                                     timeout=_URL_TIMEOUT)
    if delivered:
        template = 'Push %s back to %s.'
    else:
        template = 'Failed to push %s back to %s.'
    message = template % (origin, callbackurl)
    logging.info(message)
    self.response.out.write(message)
def summarizeEvents(eventCriterion, scope, words, pages, twitterAccount):
    """Fold ``words`` into the stored events for ``scope`` and save them.

    The stored events (or a fresh ``{'counter': 0, 'items': []}`` when
    none exist) are first passed to ``_archiveEvents``.  The words are
    then processed in reverse order; whenever ``_summarizeEvent`` yields
    an event, the pages matching that word's keywords are saved with
    ``_saveEventItem``.  Finally the event items are ordered by word size
    (largest first), ties broken by 'updated' (newest first), the
    'updated' stamp is set, and everything is saved.

    Args:
        eventCriterion: settings; only 'expose.pages' is read here.
        scope: identifies which event collection to load and save.
        words: word entries, each with at least a 'keywords' key.
        pages: pages handed to ``globalutil.search``.
        twitterAccount: passed through to ``_saveEventItem``.
    """
    exposePages = eventCriterion['expose.pages']
    events = models.getEvents(scope) or {'counter': 0, 'items': []}
    _archiveEvents(scope, events)

    stamp = dateutil.getDateAs14(datetime.datetime.utcnow())
    for word in reversed(words):
        event = _summarizeEvent(exposePages, scope, events, word, stamp)
        if not event:
            continue
        matchedPages = globalutil.search(pages, word['keywords'])
        _saveEventItem(scope, event['id'], word, stamp, matchedPages,
                       twitterAccount)

    # Two stable passes: the secondary key is sorted first, the primary last.
    eventItems = events['items']
    eventItems.sort(key=lambda entry: entry['updated'], reverse=True)
    eventItems.sort(key=lambda entry: entry['word']['size'], reverse=True)

    events['updated'] = stamp
    models.saveEvents(scope, events)