def _process_request(self, request, spider):
    """
    Swap cacheable ``meta['splash']['args']`` values for fingerprints.

    Every argument named in ``meta['splash']['cache_args']`` that is
    present in ``meta['splash']['args']`` is replaced by a ``'LOCAL+'``
    prefixed hash of its value. The real value is kept under that
    fingerprint in ``spider.state[self.local_values_key]``, so it is
    stored only once rather than in every queued request, which helps
    with disk queue size. Downloader middleware should restore the
    values from the fingerprints.

    The names of the replaced arguments are recorded in
    ``meta['splash']['_replaced_args']``.

    Requests without ``meta['splash']`` are returned untouched. Requests
    that already carry ``_replaced_args`` trigger a warning and are
    returned unchanged as well.

    Returns the same ``request`` object (its meta is modified in place).
    """
    meta = request.meta
    if 'splash' not in meta:
        return request

    splash_options = meta['splash']
    if '_replaced_args' in splash_options:
        # Re-scheduled requests have been processed already; leave them be.
        # XXX: does it work as expected?
        warnings.warn("Unexpected request.meta['splash']['_replaced_args']")
        return request

    replaced_names = []
    splash_options['_replaced_args'] = replaced_names
    names_to_cache = splash_options.get('cache_args', [])
    splash_args = splash_options.setdefault('args', {})

    for arg_name in names_to_cache:
        if arg_name in splash_args:
            original_value = splash_args[arg_name]
            fingerprint = 'LOCAL+' + json_based_hash(original_value)
            # Looked up on every iteration on purpose: the store is only
            # touched when there is actually something to replace.
            spider.state[self.local_values_key][fingerprint] = original_value
            splash_args[arg_name] = fingerprint
            replaced_names.append(arg_name)

    return request
def _process_request(self, request, spider):
    """
    Swap cacheable ``meta['splash']['args']`` values for fingerprints.

    Every argument named in ``meta['splash']['cache_args']`` that is
    present in ``meta['splash']['args']`` is replaced by a ``'LOCAL+'``
    prefixed hash of its value. The real value is kept under that
    fingerprint in ``spider.state[self.local_values_key]``, so it is
    stored only once rather than in every queued request, which helps
    with disk queue size. Downloader middleware should restore the
    values from the fingerprints.

    The names of the replaced arguments are recorded in
    ``meta['splash']['_replaced_args']``.

    Requests without ``meta['splash']`` are returned untouched. Requests
    that already carry ``_replaced_args`` trigger a warning and are
    returned unchanged as well.

    Returns the same ``request`` object (its meta is modified in place).
    """
    meta = request.meta
    if 'splash' not in meta:
        return request

    splash_options = meta['splash']
    if '_replaced_args' in splash_options:
        # Re-scheduled requests have been processed already; leave them be.
        # XXX: does it work as expected?
        warnings.warn("Unexpected request.meta['splash']['_replaced_args']")
        return request

    replaced_names = []
    splash_options['_replaced_args'] = replaced_names
    names_to_cache = splash_options.get('cache_args', [])
    splash_args = splash_options.setdefault('args', {})

    for arg_name in names_to_cache:
        if arg_name in splash_args:
            original_value = splash_args[arg_name]
            fingerprint = 'LOCAL+' + json_based_hash(original_value)
            # Looked up on every iteration on purpose: the store is only
            # touched when there is actually something to replace.
            spider.state[self.local_values_key][fingerprint] = original_value
            splash_args[arg_name] = fingerprint
            replaced_names.append(arg_name)

    return request
def test_json_based_hash(val1, val2):
    """
    Check that ``json_based_hash`` is repeatable and separates unequal values.

    Pairs whose values compare equal are discarded through ``assume``.
    For the remaining pairs, hashing ``val1`` twice must give the same
    result, and the hashes of ``val1`` and ``val2`` must differ.

    NOTE(review): ``val1`` / ``val2`` are presumably supplied by a
    decorator that is outside this view -- confirm.
    """
    assume(val1 != val2)

    # Same input, two independent calls: the results must match.
    first_digest = json_based_hash(val1)
    repeat_digest = json_based_hash(val1)
    assert first_digest == repeat_digest

    # Unequal inputs must not share a hash.
    left_digest = json_based_hash(val1)
    right_digest = json_based_hash(val2)
    assert left_digest != right_digest