Пример #1
0
 def is_changed(self, url, page_hash):
     """Tell whether the page behind ``url`` differs from the stored version.

     Returns True when the canonized url is not in the store, or when the
     hamming distance between ``page_hash`` and the stored ``"page_hash"``
     is not below ``SIMILARITY_THRESHOLD``. Returns False otherwise.
     """
     key = canonize(url)
     if key not in self.store:
         # Nothing stored for this url: report it as changed.
         return True

     stored = self.store.get(key)
     distance = hamming_distance(page_hash, stored["page_hash"])
     return not (distance < SIMILARITY_THRESHOLD)
Пример #2
0
    def add(self, dmeta):
        """Record a document's alternative urls in the store.

        The alternatives already stored for ``dmeta.url`` (if any) are merged
        with the canonized, non-empty entries of ``dmeta.alternatives``. Every
        url in the merged collection is then written to the store together
        with ``dmeta.dhash`` and the full merged list of alternatives.

        Args:
            dmeta: a ``DocumentMetadata`` instance; its ``url``,
                ``alternatives`` and ``dhash`` attributes are read.
        """
        assert (isinstance(dmeta, DocumentMetadata))

        # Merge the previously stored alternatives with the current ones.
        prev_entry = self.get(canonize(dmeta.url))
        # A previous entry whose "alternatives" value is missing or None must
        # contribute nothing instead of breaking the merge with a TypeError.
        prev_alt = (prev_entry.get("alternatives") or []) if prev_entry else []

        merged = {canonize(a) for a in dmeta.alternatives if a}
        merged.update(prev_alt)

        canonized = list(merged)
        for n in canonized:
            self.store.add(n, dmeta.dhash, alternatives=canonized)
Пример #3
0
    def get(self, url):
        """
        Look up the stored data for a url.

        The url is canonized before the lookup. Per the original notes, the
        stored data consists of a counter since the last fetch and the list
        of alternative urls.
        """
        key = canonize(url)
        return self.store.get(key)
Пример #4
0
    def store(self, data):
        """
        Save ``data`` in the db under its canonized url.

        When the status is 200, or nothing is stored yet for the url, the new
        data (``data.info``) replaces the stored entry. Otherwise the stored
        entry is kept and only its history is updated. In both cases the
        history starts with the current (fetched_time, status) pair followed
        by up to 9 previous pairs, so at most 10 are retained.
        """
        key = canonize(data.url)
        previous = self.db.get(key)
        latest = (data.fetched_time, data.status)

        if previous and data.status != 200:
            # Failed fetch of a known url: keep the old payload, extend history.
            previous["history"] = [latest] + previous["history"][:9]
            record = previous
        else:
            # history is assigned before data.info is read, as in the original.
            data.history = [latest]
            if previous:
                data.history += previous["history"][:9]
            record = data.info

        # These two values are kept only inside the history entries.
        for field in ("fetched_time", "status"):
            record.pop(field, None)
        self.db.add(key, record)
Пример #5
0
 def delete(self, data):
     """Remove the db entry stored under the canonized ``data.url``."""
     key = canonize(data.url)
     self.db.delete(key)
Пример #6
0
 def is_new(self, url):
     """Return True when the canonized url is not yet present in the store."""
     return canonize(url) not in self.store
Пример #7
0
 def incr_n(self, url):
     """Call the store's ``incr_n`` for every alternative of a known url.

     Does nothing when the canonized url is not in the store.
     """
     key = canonize(url)
     if key not in self.store:
         return

     entry = self.store.get(key)
     for alternative in entry["alternatives"]:
         self.store.incr_n(alternative)
Пример #8
0
 def delete(self, url):
     """Remove from the store every alternative listed for a known url.

     Does nothing when the canonized url is not in the store.
     """
     key = canonize(url)
     if key not in self.store:
         return

     entry = self.store.get(key)
     for alternative in entry["alternatives"]:
         self.store.delete(alternative)