def __init__(self, path='/', base_url=None, query_string=None,
             method='GET', input_stream=None, content_type=None,
             content_length=None, errors_stream=None, multithread=False,
             multiprocess=False, run_once=False, headers=None, data=None,
             environ_base=None, environ_overrides=None, charset='utf-8'):
    """Initialize a WSGI environ builder from request parts.

    Normalizes *path*/*base_url* to URI form, wraps *query_string* and
    *headers* in their container types, and wires *data* into either the
    input stream (string data) or the form/files containers (mapping or
    iterable data).

    :param path: request path; a ``?`` suffix is split off into
        *query_string* when *query_string* is not given.
    :param query_string: a string, a mapping/iterable of pairs, or None.
    :param data: request body; either a string, or a mapping/iterable of
        (key, value) pairs.  Mutually exclusive with *input_stream*.
    :raises TypeError: if both *data* and *input_stream* are provided.

    NOTE(review): several assignments below (``self.content_type``,
    ``self.query_string``, ``self.args``, ``self.form``) presumably hit
    properties defined elsewhere on this class — confirm before
    reordering anything here.
    """
    # Split an inline query string off the path unless one was given
    # explicitly.
    if query_string is None and '?' in path:
        path, query_string = path.split('?', 1)
    self.charset = charset
    # Unicode paths are IRIs; convert to an ASCII-safe URI.
    if isinstance(path, unicode):
        path = iri_to_uri(path, charset)
    self.path = path
    if base_url is not None:
        if isinstance(base_url, unicode):
            base_url = iri_to_uri(base_url, charset)
        else:
            # Byte strings may still contain unsafe characters; fix them.
            base_url = url_fix(base_url, charset)
    self.base_url = base_url
    # A string query string is stored raw; anything else becomes a
    # MultiDict exposed as ``args``.
    if isinstance(query_string, basestring):
        self.query_string = query_string
    else:
        if query_string is None:
            query_string = MultiDict()
        elif not isinstance(query_string, MultiDict):
            query_string = MultiDict(query_string)
        self.args = query_string
    self.method = method
    if headers is None:
        headers = Headers()
    elif not isinstance(headers, Headers):
        headers = Headers(headers)
    self.headers = headers
    self.content_type = content_type
    # Default the error stream to stderr as WSGI servers do.
    if errors_stream is None:
        errors_stream = sys.stderr
    self.errors_stream = errors_stream
    self.multithread = multithread
    self.multiprocess = multiprocess
    self.run_once = run_once
    self.environ_base = environ_base
    self.environ_overrides = environ_overrides
    self.input_stream = input_stream
    self.content_length = content_length
    self.closed = False
    if data:
        # Body data and an explicit input stream are mutually exclusive.
        if input_stream is not None:
            raise TypeError('can\'t provide input stream and data')
        if isinstance(data, basestring):
            self.input_stream = StringIO(data)
            if self.content_length is None:
                self.content_length = len(data)
        else:
            # Mapping/iterable data: file-like values (or tuple/dict
            # specs) become uploaded files, everything else form fields.
            for key, value in _iter_data(data):
                if isinstance(value, (tuple, dict)) or \
                        hasattr(value, 'read'):
                    self._add_file_from_data(key, value)
                else:
                    self.form.setlistdefault(key).append(value)
def add(self, site, url_list, qps=SITE_DEFAULT_QPS, batch_size=JOB_BATCH_SIZE):
    """Enqueue *url_list* for *site*, creating the host entry on first use.

    URLs are normalized, split into batches of *batch_size*, persisted as
    Redis lists, and registered both in the host's pending queue (at this
    job's priority) and in this job's batch list.
    """
    host_key = "host:%s" % site
    if not r.exists(host_key):
        # First sighting of this host: create its hash record and insert
        # it into the host sorted set.
        #
        # Score choice for the sorted set:
        #   * score=now would favor already-known hosts:
        #       r.zadd(host_list_key, now, site)
        #   * score=SITE_DEADZONE_OFFSET (a timestamp in the past) favors
        #     brand-new hosts, which is what we do here.
        fresh_record = {
            'name': site,
            'total_qps': qps,
            'qps': qps,
            'total_urls': 0,
            'total_batches': 0,
            'pending_batches': 0,
        }
        pipe = r.pipeline()
        pipe.hmset(host_key, fresh_record)
        pipe.zadd(host_list_key, host_key, SITE_DEADZONE_OFFSET)
        pipe.execute()

    pending_key = "host:%s:pending:%s" % (site, self.priority)
    job_batches_key = "jobs:jid:%s:batches" % self.id
    job_urls_key = "jobs:jid:%s:urls" % self.id

    for start in range(0, len(url_list), batch_size):
        chunk = url_list[start:start + batch_size]
        chunk_size = len(chunk)
        bid = r.incr("host:%s:nextBatchId" % site)
        batch_key = "host:%s:batch:%s" % (site, bid)
        # Persist the batch as a Redis list; URLs that fail normalization
        # are silently dropped.
        for raw_url in chunk:
            try:
                cleaned = url_fix(urlnorm.norm(raw_url.strip()))
            except (urlnorm.InvalidUrl, UnicodeDecodeError):
                continue
            r.rpush(batch_key, cleaned)
        # In one pipeline:
        #   1. bump site and job counters,
        #   2. append the batch to the host pending list,
        #   3. append the batch to the job batch list.
        counters = r.pipeline()
        counters.hincrby(host_key, "total_urls", chunk_size)
        counters.hincrby(host_key, "total_batches", 1)
        counters.hincrby(host_key, "pending_batches", 1)
        counters.incr(job_urls_key, chunk_size)
        counters.rpush(pending_key, batch_key)
        counters.rpush(job_batches_key, batch_key)
        counters.execute()

    # Restore the host for scheduling if it's been in deadzone.
    resurrect_host(r, host_key)
def add(self, site, url_list, qps=SITE_DEFAULT_QPS, batch_size=JOB_BATCH_SIZE):
    """Enqueue *url_list* for *site*, creating the host entry on first use.

    URLs are normalized, split into batches of *batch_size*, persisted as
    Redis lists, and registered both in the host's pending queue (at this
    job's priority) and in this job's batch list.

    NOTE(review): an identical ``add`` appears to be defined twice in this
    module/class; the later definition wins at import time — confirm the
    duplication is intentional.
    """
    host_table_key = "host:%s" % site
    if not r.exists(host_table_key):
        ## create a new host entry
        host_record = {
            "name": site,
            "total_qps": qps,
            "qps": qps,
            "total_urls": 0,
            "total_batches": 0,
            "pending_batches": 0,
        }
        ## 1.insert host into host table.
        ##
        ## 2.add the hostname into host list (sorted set)
        ##   * score=now, optimized to processing existing host first
        ##     r.zadd(host_list_key, now, site)
        ##   * score=SITE_DEADZONE_OFFSET, optimized to processing new host first
        ##     since the timestamp SITE_DEADZONE_OFFSET is the past
        r.pipeline().hmset(host_table_key, host_record).zadd(
            host_list_key, host_table_key, SITE_DEADZONE_OFFSET
        ).execute()
    host_pending_key = "host:%s:pending:%s" % (site, self.priority)
    job_batch_key = "jobs:jid:%s:batches" % self.id
    job_url_key = "jobs:jid:%s:urls" % self.id
    count = len(url_list)
    for i in range(0, count, batch_size):
        bid = r.incr("host:%s:nextBatchId" % site)
        batch_key = "host:%s:batch:%s" % (site, bid)
        batch_list = url_list[i : i + batch_size]
        # NOTE(review): batch_count is taken before invalid URLs are
        # skipped below, so the counters include dropped URLs — confirm
        # this is intended.
        batch_count = len(batch_list)
        ## add the batch record to DB.  URLs failing normalization are
        ## silently dropped.
        for url in batch_list:
            try:
                url = url_fix(urlnorm.norm(url.strip()))
            except urlnorm.InvalidUrl:
                continue
            except UnicodeDecodeError:
                continue
            r.rpush(batch_key, url)
        ##
        ## 1. update site and job counters.
        ##
        ## 2. update host pending list.
        ##
        ## 3. job batch list.
        ##
        ret = (
            r.pipeline()
            .hincrby(host_table_key, "total_urls", batch_count)
            .hincrby(host_table_key, "total_batches", 1)
            .hincrby(host_table_key, "pending_batches", 1)
            .incr(job_url_key, batch_count)
            .rpush(host_pending_key, batch_key)
            .rpush(job_batch_key, batch_key)
            .execute()
        )
    ## restore the host for scheduling if it's been in deadzone.
    resurrect_host(r, host_table_key)