def is_changed(self, url, page_hash):
    """Return true if the url has changed since the last time.

    A url that is not in the store is reported as changed. A known url is
    reported as unchanged only when the hamming distance between
    `page_hash` and the stored "page_hash" is below SIMILARITY_THRESHOLD.
    """
    key = canonize(url)
    if key not in self.store:
        # Never seen before: nothing to compare against.
        return True

    stored = self.store.get(key)
    distance = hamming_distance(page_hash, stored["page_hash"])
    # Close enough to the stored hash means "not changed".
    return not (distance < SIMILARITY_THRESHOLD)
def add(self, dmeta):
    """Add a url and its alternatives into seen.

    The alternatives already stored for ``dmeta.url`` are merged with the
    (canonized, non-empty) alternatives carried by ``dmeta``. Every url of
    the merged set is then written to the store with ``dmeta.dhash`` and
    the full merged list as its ``alternatives``.

    :param dmeta: a ``DocumentMetadata`` instance; its ``url``,
        ``alternatives`` and ``dhash`` attributes are read.
    """
    assert isinstance(dmeta, DocumentMetadata)

    # Merge the previously stored alternatives with the current ones.
    prev_entry = self.get(canonize(dmeta.url))
    # ``dict.get`` returns None when the key is missing; fall back to an
    # empty list so the merge below cannot fail on a partial entry.
    prev_alt = (prev_entry.get("alternatives") or []) if prev_entry else []

    current = [canonize(a) for a in dmeta.alternatives if a]
    # ``set.union`` accepts any iterable, so a stored tuple/set of
    # alternatives is merged too. The resulting order is unspecified.
    merged = list(set(current).union(prev_alt))

    for n in merged:
        self.store.add(n, dmeta.dhash, alternatives=merged)
def get(self, url):
    """Look up the data associated with the url.

    The url is canonized before the lookup. The data we keep in seen is a
    counter since the last time the url was fetched, plus the list of its
    alternatives.
    """
    key = canonize(url)
    return self.store.get(key)
def store(self, data):
    """Store new data into a hash, keyed by the canonized url.

    When the status is not 200 and an entry already exists, the stored data
    is not overwritten: only its history is updated. In every case the
    history keeps the last 10 (fetched_time, status) pairs, newest first.
    """
    key = canonize(data.url)
    previous = self.db.get(key)
    latest = (data.fetched_time, data.status)

    if data.status != 200 and previous:
        # Failed fetch of a known url: keep the old payload and only
        # push the new outcome onto its history.
        previous["history"] = [latest] + previous["history"][:9]
        to_store = previous
    else:
        # Successful fetch, or first time we see this url: the new
        # payload replaces whatever was there.
        data.history = [latest]
        if previous:
            data.history += previous["history"][:9]
        to_store = data.info

    # Drop the top-level copies of these fields (presumably because the
    # history already carries them).
    for field in ("fetched_time", "status"):
        to_store.pop(field, None)

    self.db.add(key, to_store)
def delete(self, data):
    """Delete the entry stored under the canonized url of `data`."""
    key = canonize(data.url)
    self.db.delete(key)
def is_new(self, url):
    """Return true if the url is seen for the first time.

    A url counts as new when its canonized form is not in the store.
    """
    return canonize(url) not in self.store
def incr_n(self, url):
    """Increment the `count` data associated to an url and its alternatives.

    The increment is applied to every url listed under the entry's
    "alternatives"; a url that is not in the store is ignored.
    """
    key = canonize(url)
    if key not in self.store:
        return

    entry = self.store.get(key)
    for alternative in entry["alternatives"]:
        self.store.incr_n(alternative)
def delete(self, url):
    """Delete a url and its alternatives.

    Every url listed under the entry's "alternatives" is removed from the
    store; a url that is not in the store is ignored.
    """
    key = canonize(url)
    if key not in self.store:
        return

    entry = self.store.get(key)
    for alternative in entry["alternatives"]:
        self.store.delete(alternative)