# NOTE(review): orphaned fragment -- this module-level __init__ duplicates
# MemoryStorage.__init__ below and can never be invoked as a method.  It is
# preserved (properly formatted) so the module interface is unchanged, but it
# is almost certainly leftover from an edit and should be deleted.
def __init__(self, config):
    self._conf = config
    self._hits = []
    self._recenthits = []
    self._sf = StorageFilters()
class MemoryStorage(object):
    """In-memory hit storage backend.

    Keeps every stored hit in ``self._hits`` (a list of plain dicts, see
    :meth:`add_hit`) plus a bounded tail of the most recent hits in
    ``self._recenthits``.  All per-hit filtering predicates come from a
    ``StorageFilters`` instance; each ``filter_*`` method returns a callable
    applied to individual hit dicts (or ``(key, count)`` pairs).
    """

    def __init__(self, config):
        # config mapping; must contain 'recenthits_size'
        self._conf = config
        # all stored hits, oldest first
        self._hits = []
        # bounded tail of self._hits (at most recenthits_size entries)
        self._recenthits = []
        self._sf = StorageFilters()

    def clear_hits(self, days=7):
        """Remove all stored hits.

        NOTE(review): the ``days`` parameter is accepted but ignored --
        every hit is dropped regardless of age.  Kept for interface
        compatibility; confirm the intended semantics before relying on it.
        """
        self._hits = []
        self._recenthits = []

    def add_hit(self, hit):
        """Store a single hit unless its path is blacklisted.

        ``hit`` must expose ``url()``, ``timestamp()``, ``keywords()``,
        ``path()``, ``title()`` and ``source()`` accessors; their values are
        flattened into a plain dict before storage.
        """
        if not self._sf.filter_path(hit.path()):
            # Don't store hits for blacklisted paths
            return
        hitobj = {
            'url': hit.url(),
            'timestamp': hit.timestamp(),
            'keywords': hit.keywords(),
            'path': hit.path(),
            'title': hit.title(),
            'source': hit.source(),
        }
        self._hits.append(hitobj)
        self._recenthits.append(hitobj)
        # Trim the recent-hits tail to the configured size.
        recenthits_size = int(self._conf['recenthits_size'])
        if len(self._recenthits) > recenthits_size:
            self._recenthits = self._recenthits[-recenthits_size:]

    def get_recenthits(self, sources, last_timestamp=0):
        """Return recent hits newer than ``last_timestamp`` matching ``sources``.

        Always returns a list: the previous implementation returned a lazy
        ``filter`` object under Python 3, breaking callers that expected the
        Python 2 list semantics (len(), indexing, repeated iteration).
        """
        ts_ok = self._sf.filter_timestamp(start_time=last_timestamp)
        src_ok = self._sf.filter_sources(sources)
        return [hit for hit in self._recenthits if ts_ok(hit) and src_ok(hit)]

    def list_urls(self, unique=False, start_time=None, end_time=None,
                  minimum_hits=1):
        """Return a list of all the urls.

        Optional parameters:
        unique        Return only unique urls.
        start_time    Return only urls requested after this timestamp.
        end_time      Return only urls requested before this timestamp.
        minimum_hits  Return only urls with at least this amount of hits.

        NOTE(review): ``minimum_hits`` is accepted but never applied here;
        confirm whether callers expect it to be honoured.
        """
        ts_ok = self._sf.filter_timestamp(start_time, end_time)
        urls = [hit['url'] for hit in self._hits if ts_ok(hit)]
        if unique:
            return list(set(urls))
        return urls

    def get_hitcount(self, url, start_time=None, end_time=None):
        """Return the number of hits for a specific url.

        Optional parameters:
        start_time    Count only hits requested after this timestamp.
        end_time      Count only hits requested before this timestamp.
        """
        url_ok = self._sf.filter_url(url)
        ts_ok = self._sf.filter_timestamp(start_time, end_time)
        # sum() over a generator instead of len(filter(...)), which raises
        # TypeError on Python 3 where filter() returns an iterator.
        return sum(1 for hit in self._hits if url_ok(hit) and ts_ok(hit))

    def get_hitcounts(self, start_time=None, end_time=None, minimum_hits=1,
                      qfield='hit_path'):
        """Return a dictionary of hitcounts, format ``{name: count}``.

        Optional parameters:
        start_time    Count only hits requested after this timestamp.
        end_time      Count only hits requested before this timestamp.
        minimum_hits  Return only entries with at least this amount of hits.
        qfield        'hit_url', 'hit_title' or anything else for path.
        """
        ts_ok = self._sf.filter_timestamp(start_time, end_time)
        hits = [hit for hit in self._hits if ts_ok(hit)]
        # Map the query field name onto the hit-dict key; path is the default.
        if qfield == 'hit_url':
            field = 'url'
        elif qfield == 'hit_title':
            field = 'title'
        else:
            field = 'path'
        hitcounts = Counter(hit[field] for hit in hits)
        # filter_hitcount receives (name, count) pairs; entries for which it
        # returns false are dropped.  items() replaces py2-only iteritems().
        count_ok = self._sf.filter_hitcount(minimum_hits)
        return {name: count for name, count in hitcounts.items()
                if count_ok((name, count))}

    def get_keywords(self, url=None, start_time=None, end_time=None,
                     minimum_count=None):
        """Get all keywords and their counts.

        Returns dictionary: ``{keyword: count}``

        Optional parameters:
        url            Count only keywords of hits for this url.
        start_time     Count only hits requested after this timestamp.
        end_time       Count only hits requested before this timestamp.
        minimum_count  Return only keywords seen at least this many times.
        """
        url_ok = self._sf.filter_url(url)
        ts_ok = self._sf.filter_timestamp(start_time, end_time)
        # Incremental Counter.update is O(n) overall, unlike the previous
        # reduce(operator.add, ...) which rebuilt the list per hit (O(n^2)).
        keywords = Counter()
        for hit in self._hits:
            if url_ok(hit) and ts_ok(hit):
                keywords.update(hit['keywords'])
        # Both filters receive (keyword, count) pairs; count filter first,
        # then the config-based keyword filter, matching the original order.
        cnt_ok = self._sf.filter_keywordcount(minimum_count)
        kw_ok = self._sf.filter_keywords(self._conf)
        return {kw: count for kw, count in keywords.items()
                if cnt_ok((kw, count)) and kw_ok((kw, count))}

    def list_searches(self, keyword=None, limit=None):
        """List all the search phrases which contain the given keyword,
        or all phrases if no keyword given.

        Optional parameters:
        keyword  Return only phrases containing this substring.
        limit    Return at most this many phrases.
        """
        phrases = []
        # Unique source strings, first-seen order preserved.
        sources = dict.fromkeys(hit['source'] for hit in self._hits)
        # NOTE(review): assumes the project's HTMLParser provides escape();
        # the stdlib HTMLParser class does not -- confirm the import.
        htmlparser = HTMLParser()
        for source in sources:
            # Search sources look like 'searches...: <phrase>'.
            if source.startswith('searches'):
                qpos = source.find(': ')
                if qpos > 0:
                    phrase = source[qpos + 2:]
                    if keyword is None or keyword in phrase:
                        phrases.append(htmlparser.escape(phrase))
        if limit is not None:
            return phrases[:limit]
        return phrases