Example #1
0
 def save_to_db(self, url, html):
     """Store a crawled page in ``crawler_html``, keyed by farmhash64(url).

     The page body is LZMA-compressed before insertion.  Returns True when
     the page is already present or was inserted; re-raises any database
     error other than a duplicate-key conflict.  Logs an error if two
     different URLs collide on the same 64-bit hash.
     """
     urlhash = farmhash.hash64(url)
     sql = 'select url from crawler_html where urlhash=%s'
     d = self.db.get(sql, urlhash)
     if d:
         # Hash already stored: either a re-crawl of the same URL or
         # (rarely) a genuine 64-bit hash collision worth logging.
         if d['url'] != url:
             msg = 'farmhash collision: %s <=> %s' % (url, d['url'])
             self.logger.error(msg)
         return True
     if isinstance(html, str):
         html = html.encode('utf8')
     html_lzma = lzma.compress(html)
     sql = ('insert into crawler_html(urlhash, url, html_lzma) '
            'values(%s, %s, %s)')
     good = False
     try:
         self.db.execute(sql, urlhash, url, html_lzma)
         good = True
     except Exception as e:
         # MySQL error 1062 = duplicate entry: another worker inserted the
         # same urlhash between our SELECT and INSERT -- not a failure.
         # Guard e.args so an argument-less exception cannot mask the
         # original error with an IndexError.
         if e.args and e.args[0] == 1062:
             good = True
         else:
             traceback.print_exc()
             raise  # bare raise preserves the original traceback (was `raise e`)
     return good
Example #2
0
 def save_qo_sql(self, url, html):
     """Store a crawled page (zlib-compressed) keyed by farmhash64(url).

     Returns True when the page is already present or was inserted;
     re-raises any database error other than a duplicate-key conflict.
     """
     urlhash = farmhash.hash64(url)
     # FIX: the placeholder must not be wrapped in quotes -- the driver
     # quotes/escapes parameters itself, so `"%s"` produced a doubly-quoted
     # literal that could never match a stored hash.
     sql = 'select url from create_html where urlhash = %s'
     # NOTE(review): the SELECT targets `create_html` while the INSERT below
     # targets `crawler_html`; one of the two table names looks wrong --
     # verify against the actual schema.
     d = self.db.get(sql, urlhash)
     if d:
         if d['url'] != url:
             msg = 'farmhash collision:%s <=> %s' % (url, d['url'])
             self.logger.error(msg)
         return True
     if isinstance(html, str):
         html = html.encode('utf-8')
     html_zlib = zlib.compress(html)  # compress the page body
     sql = "insert into crawler_html(urlhash,url,html_zlib) values (%s,%s,%s)"
     good = False
     try:
         self.db.execute(sql, urlhash, url, html_zlib)
         good = True
     except Exception as e:
         # MySQL error 1062 = duplicate entry: a concurrent insert beat us.
         if e.args and e.args[0] == 1062:
             good = True
         else:
             traceback.print_exc()
             raise  # bare raise keeps the original traceback (was `raise e`)
     return good
Example #3
0
 def get_entity(schema_name, key, source=True):
     """Fetch an entity document from the ES backend by its hashed key.

     A 404 is tolerated (missing entity); any other error status raises
     EntityError.  Returns the ``_source`` payload by default, or the
     full response body when *source* is False.
     """
     url = '{0}/{1}/entity/{2}'.format(options.es, schema_name,
                                       farmhash.hash64(key))
     resp = requests.get(url)
     if resp.status_code >= 300 and resp.status_code != 404:
         raise EntityError('get entity {0} error: {1}'.format(key, resp.text))
     body = resp.json()
     return body.get('_source') if source else body
Example #4
0
 def transform(self, doc):
     """Feature-hash *doc* into a signed-count vector of length ``self.size``.

     Each token is bucketed at ``(token + salt) % size``; the bucket is
     incremented or decremented according to the parity of the token's
     64-bit farmhash, so collisions tend to cancel out.
     """
     out = np.zeros(self.size, dtype=int)
     salt = self.salt
     size = self.size
     for token in doc:
         sign = 1 if farmhash.hash64(str(token + salt)) % 2 == 0 else -1
         out[(token + salt) % size] += sign
     return out
Example #5
0
        def make_cache_key(f, *args, **kwargs):
            """Build the memoization cache key for *f* called with *args*/*kwargs*.

            Combines the (possibly renamed) function name, the normalized
            call arguments, and a per-function version token, then hashes
            the result with farmhash64.  Bumping the version token (via
            delete_memoized) invalidates all keys for the function.
            """
            # `timeout` may be either a plain number or an object carrying
            # a `cache_timeout` attribute; normalize to the numeric value.
            _timeout = getattr(timeout, 'cache_timeout', timeout)
            fname, version_data = self._memoize_version(f,
                                                        args=args,
                                                        timeout=_timeout)

            #: this should have to be after version_data, so that it
            #: does not break the delete_memoized functionality.
            if callable(make_name):
                altfname = make_name(fname)
            else:
                altfname = fname

            # Normalize positional/keyword args so equivalent calls map to
            # the same key; fall back to raw args when f is not callable.
            if callable(f):
                keyargs, keykwargs = self._memoize_kwargs_to_args(
                    f, *args, **kwargs)
            else:
                keyargs, keykwargs = args, kwargs

            try:
                updated = "{0}{1}{2}{3}".format(altfname, keyargs, keykwargs,
                                                version_data)
            except AttributeError:
                # Fallback for operands whose formatting raises; %-style
                # interpolation is more forgiving here.
                updated = "%s%s%s%s" % (altfname, keyargs, keykwargs,
                                        version_data)

            # Superseded md5/base64 key derivation, kept for reference:
            #cache_key = hashlib.md5()
            #cache_key.update(updated.encode('utf-8'))
            #cache_key = base64.b64encode(cache_key.digest())[:16]
            #cache_key = cache_key.decode('utf-8')
            #cache_key += version_data

            # NOTE(review): version_data is already embedded in `updated`
            # above, so it appears twice in the hashed string -- presumably
            # harmless, but confirm before simplifying.
            return farmhash.hash64('{0}{1}'.format(updated,
                                                   version_data))  #cache_key
Example #6
0
def make_template_fragment_key(fragment_name, vary_on=()):
    """
    Make a cache key for a specific fragment name.

    When *vary_on* values are given, they are joined with underscores and
    folded into the key so distinct variations cache separately.  Returns
    the farmhash64 of the formatted key template.
    """
    if vary_on:
        fragment_name = "%s_" % fragment_name
    # FIX: the original `"_".join(str(vary_on))` joined the *characters of
    # the list's repr* with underscores; each element must be stringified
    # and joined individually.  (Also replaced the mutable default `[]`
    # with an immutable empty tuple -- behavior is unchanged.)
    return farmhash.hash64(TEMPLATE_FRAGMENT_KEY_TEMPLATE %
                           (fragment_name, "_".join(str(v) for v in vary_on)))
Example #7
0
            def make_cache_key(*args, **kwargs):
                """Resolve key_prefix (callable, '%s' template, or literal)
                and return its farmhash64 as the cache key."""
                prefix = (key_prefix() if callable(key_prefix)
                          else key_prefix % request.path if '%s' in key_prefix
                          else key_prefix)
                return farmhash.hash64(prefix)
Example #8
0
 def minhash(self, doc, salts):  # return a list of signature
     """Return the MinHash signature of *doc*: for each salt, the element
     of *doc* whose salted farmhash (mod ``self.col_size``) is smallest.

     Raises AssertionError on an empty *doc* (kept as in the original API).
     """
     assert len(doc) > 0, "empty signature found; hashing aborted"
     retVal = []
     # FIX: bind `salt` as a default argument.  Without it every lambda
     # closes over the comprehension's loop variable, and since the lambdas
     # run only after the comprehension finishes, ALL permutations used the
     # LAST salt (classic late-binding closure bug) -- every signature
     # entry came out identical.
     perms = [
         lambda x, salt=salt: farmhash.hash64(str(x + salt)) % self.col_size
         for salt in salts
     ]
     for perm in perms:
         retVal.append(doc[np.argmin([perm(x) for x in doc])])
     return retVal
Example #9
0
    def __init__(self, tiles=None):
        """Initialize the farm map.

        With *tiles* given, adopt them as-is; otherwise generate a fresh
        20x16 grid of untilled tiles, randomly scattering debris objects
        (tree / weeds / brown_leaves) on ~30% of them.
        """
        # Version stamp derived from the creation time (hashed so it is a
        # compact integer rather than an ISO string).
        self.current_ver = farmhash.hash64(datetime.utcnow().isoformat())

        if tiles is None:
            # FIX: self.tiles was never initialized on this path, so the
            # append below raised AttributeError on every fresh farm.
            self.tiles = []
            # Generate a new farm with some debris
            obj_points = np.random.randint(0, 10, (20, 16))
            for (x, y), value in np.ndenumerate(obj_points):
                tile = Tile(x, y, 'untilled')
                if value == 7:
                    tile.obj = Obj('tree')
                elif value == 6:
                    tile.obj = Obj('weeds')
                elif value == 5:
                    tile.obj = Obj('brown_leaves')
                self.tiles.append(tile)
        else:
            self.tiles = tiles
Example #10
0
 def post(self, schema_name):
     """Create or update an entity under *schema_name*.

     Serializes writers per schema with an ephemeral ZooKeeper node
     (422 if the schema is already locked), validates the payload
     against the schema, stamps ``_meta`` (version bumped from any
     existing entity, millisecond timestamp), appends to the
     entity_history index, then upserts the entity keyed by the
     farmhash64 of its primary-key value.  Raises HTTPError 500 when
     either ES write fails.
     """
     # Lock node path: one lock per schema, so concurrent posts to the
     # same schema cannot interleave their version bumps.
     node = os.path.join(options.root, schema_name)
     try:
         self.application.zk.create(node)
         schema = SchemaHandler.get_schema(schema_name)
         payload = self.get_payload()
         EntityHandler.validate_entity(schema, payload)
         entity = EntityHandler.get_entity(schema_name, payload[schema['pk']])
         if entity is None:
             # First write: start the version counter at 0.
             payload['_meta'] = {
                 'schema': schema_name,
                 'version': 0,
                 'timestamp': int(datetime.datetime.now().timestamp() * 1000)
             }
         else:
             # Update: bump the stored version.
             payload['_meta'] = {
                 'schema': schema_name,
                 'version': entity['_meta']['version'] + 1,
                 'timestamp': int(datetime.datetime.now().timestamp() * 1000)
             }
         # History is written BEFORE the entity itself, so a failed entity
         # PUT still leaves an audit record of the attempted state.
         r = requests.post('{0}/{1}/entity_history'.format(options.es, schema_name), json=payload)
         if r.status_code >= 300:
             logging.error('put entity history error: {0}'.format(r.text))
             raise HTTPError(status_code=500, reason='put entity history error: {0}'.format(r.text))
         # Entity document id = farmhash64 of the primary-key value.
         key = farmhash.hash64(payload[schema['pk']])
         r = requests.put('{0}/{1}/entity/{2}'.format(options.es,
                                                      schema_name,
                                                      key), json=payload)
         if r.status_code >= 300:
             logging.error('put entity error: {0}'.format(r.text))
             raise HTTPError(status_code=500, reason='put entity error: {0}'.format(r.text))
         self.jsonify(code=200, entity=EntityHandler.get_entity(schema_name, payload[schema['pk']]))
     except NodeExistsError:
         # Lock was held by someone else: we did not create the node, so
         # clear it to keep the finally-block from deleting their lock.
         node = None
         raise HTTPError(status_code=422, reason='schema {0} is locked'.format(schema_name))
     finally:
         if node is not None:
             self.application.zk.delete(node)
Example #11
0
def generate_hashes(peaks, fan_value=DEFAULT_FAN_VALUE):
    """Yield ``(hash_hex, time_offset)`` fingerprint pairs from spectral peaks.

    Each anchor peak is paired with up to ``fan_value - 1`` subsequent
    peaks; pairs whose time delta lies within
    [MIN_HASH_TIME_DELTA, MAX_HASH_TIME_DELTA] are hashed as the
    farmhash64 of ``"freq1|freq2|dt"``, rendered as 16 uppercase hex
    digits, and emitted together with the anchor's time offset.
    """
    if PEAK_SORT:
        # Deterministic fingerprints require time-ordered peaks.
        peaks = sorted(peaks, key=itemgetter(1))

    total = len(peaks)
    for i in range(total):
        anchor_freq = peaks[i][IDX_FREQ_I]
        anchor_time = peaks[i][IDX_TIME_J]
        # Fan out to the next fan_value-1 peaks, clipped at the list end.
        for j in range(i + 1, min(i + fan_value, total)):
            other_freq = peaks[j][IDX_FREQ_I]
            t_delta = peaks[j][IDX_TIME_J] - anchor_time
            if MIN_HASH_TIME_DELTA <= t_delta <= MAX_HASH_TIME_DELTA:
                h = farmhash.hash64("%s|%s|%s" % (str(anchor_freq),
                                                  str(other_freq),
                                                  str(t_delta)))
                yield (format(h, '016X'), anchor_time)
Example #12
0
def get_hash_for_key(key):
    """Return the farmhash64 of *key* as a decimal string.

    Non-ASCII characters are silently dropped before hashing.
    """
    ascii_key = key.encode('ascii', 'ignore')
    return str(farmhash.hash64(ascii_key))
Example #13
0
# Earlier leveldb round-trip experiment, kept for reference:
# import leveldb
# db = leveldb.LevelDB('./db')
# db.Put('hello'.encode('utf8'), 'world'.encode('utf8'))
# print(db.Get('hello'.encode('utf8')))

import farmhash

# Smoke test: print the 64-bit farmhash of a sample string.
print(farmhash.hash64('abc'))
Example #14
0
def get_hash_for_key(key):
    """Hash *key* with farmhash64 (non-ASCII stripped) and return it as a string."""
    encoded = key.encode('ascii', 'ignore')
    digest = farmhash.hash64(encoded)
    return str(digest)
Example #15
0
 def _hash_func(self, d):
     """Return the 64-bit farmhash of *d*."""
     return farmhash.hash64(d)
Example #16
0
 def _hashfunc64(self, str_value):
     """Return the 64-bit farmhash digest of *str_value*."""
     return farmhash.hash64(str_value)