def fetch(self): """ Download urls via multicurl. Get new tasks from queue. """ m = pycurl.CurlMulti() m.handles = [] # Create curl instances for x in xrange(self.thread_number): curl = pycurl.Curl() m.handles.append(curl) freelist = m.handles[:] # This is infinite cycle # You can break it only from outside code which # iterates over result of this method while True: cached_request = None while len(freelist): # Increase request counter if (self.request_limit is not None and self.counters['request'] >= self.request_limit): logging.debug('Request limit is reached: %s' %\ self.request_limit) if len(freelist) == self.thread_number: yield None else: break else: try: priority, task = self.taskq.get(True, 0.1) except Empty: # If All handlers are free and no tasks in queue # yield None signal if len(freelist) == self.thread_number: yield None else: break else: if not self._preprocess_task(task): continue task.network_try_count += 1 if task.task_try_count == 0: task.task_try_count = 1 if task.task_try_count > self.task_try_limit: logging.debug('Task tries ended: %s / %s' % (task.name, task.url)) self.add_item('too-many-task-tries', task.url) continue if task.network_try_count > self.network_try_limit: logging.debug('Network tries ended: %s / %s' % (task.name, task.url)) self.add_item('too-many-network-tries', task.url) continue #import pdb; pdb.set_trace() if task.grab: grab = task.grab else: # Set up curl instance via Grab interface grab = Grab(**self.grab_config) grab.setup(url=task.url) if self.use_cache and not task.get('disable_cache'): if grab.detect_request_method() == 'GET': url = grab.config['url'] cache_item = self.cache.find_one({'_id': url}) if cache_item: #if url in self.cache: #cache_item = pickle.loads(self.cache[url]) #logging.debug('From cache: %s' % url) cached_request = (grab, grab.clone(), task, cache_item) grab.prepare_request() self.inc_count('request-cache') # break from prepre-request cycle # and go to process-response code break self.inc_count('request-network') if self.proxylist_config: args, kwargs = self.proxylist_config grab.setup_proxylist(*args, **kwargs) curl = freelist.pop() curl.grab = grab curl.grab.curl = curl curl.grab_original = grab.clone() curl.grab.prepare_request() curl.task = task # Add configured curl instance to multi-curl processor m.add_handle(curl) # If there were done network requests if len(freelist) != self.thread_number: while True: status, active_objects = m.perform() if status != pycurl.E_CALL_MULTI_PERFORM: break if cached_request: grab, grab_original, task, cache_item = cached_request url = task.url # or grab.config['url'] grab.fake_response(cache_item['body']) def custom_prepare_response(g): g.response.head = cache_item['head'].encode('utf-8') g.response.body = cache_item['body'].encode('utf-8') g.response.code = cache_item['response_code'] g.response.time = 0 g.response.url = cache_item['url'] g.response.parse('utf-8') g.response.cookies = g.extract_cookies() grab.process_request_result(custom_prepare_response) yield { 'ok': True, 'grab': grab, 'grab_original': grab_original, 'task': task, 'ecode': None, 'emsg': None } self.inc_count('request') while True: queued_messages, ok_list, fail_list = m.info_read() results = [] for curl in ok_list: results.append((True, curl, None, None)) for curl, ecode, emsg in fail_list: results.append((False, curl, ecode, emsg)) for ok, curl, ecode, emsg in results: res = self.process_multicurl_response( ok, curl, ecode, emsg) m.remove_handle(curl) freelist.append(curl) yield res self.inc_count('request') if not 
queued_messages: break m.select(0.5)
def fetch(self): """ Download urls via multicurl. Get new tasks from queue. """ m = pycurl.CurlMulti() m.handles = [] # Create curl instances for x in xrange(self.thread_number): curl = pycurl.Curl() m.handles.append(curl) freelist = m.handles[:] # This is infinite cycle # You can break it only from outside code which # iterates over result of this method while True: cached_request = None while len(freelist): # Increase request counter if (self.request_limit is not None and self.counters['request'] >= self.request_limit): logging.debug('Request limit is reached: %s' %\ self.request_limit) if len(freelist) == self.thread_number: yield None else: break else: try: priority, task = self.taskq.get(True, 0.1) except Empty: # If All handlers are free and no tasks in queue # yield None signal if len(freelist) == self.thread_number: yield None else: break else: if not self._preprocess_task(task): continue task.network_try_count += 1 if task.task_try_count == 0: task.task_try_count = 1 if task.task_try_count > self.task_try_limit: logging.debug('Task tries ended: %s / %s' % ( task.name, task.url)) self.add_item('too-many-task-tries', task.url) continue if task.network_try_count > self.network_try_limit: logging.debug('Network tries ended: %s / %s' % ( task.name, task.url)) self.add_item('too-many-network-tries', task.url) continue #import pdb; pdb.set_trace() if task.grab: grab = task.grab else: # Set up curl instance via Grab interface grab = Grab(**self.grab_config) grab.setup(url=task.url) if self.use_cache and not task.get('disable_cache'): if grab.detect_request_method() == 'GET': url = grab.config['url'] utf_url = url.encode('utf-8') if isinstance(url, unicode) else url if self.cache_key_hash: url_hash = sha1(utf_url).hexdigest() else: url_hash = url cache_item = self.cache.find_one({'_id': url_hash}) if cache_item: #if url in self.cache: #cache_item = pickle.loads(self.cache[url]) logging.debug('From cache: %s' % url) cached_request = (grab, grab.clone(), task, cache_item) grab.prepare_request() self.inc_count('request-cache') # break from prepre-request cycle # and go to process-response code break self.inc_count('request-network') if self.proxylist_config: args, kwargs = self.proxylist_config grab.setup_proxylist(*args, **kwargs) curl = freelist.pop() curl.grab = grab curl.grab.curl = curl curl.grab_original = grab.clone() curl.grab.prepare_request() curl.task = task # Add configured curl instance to multi-curl processor m.add_handle(curl) # If there were done network requests if len(freelist) != self.thread_number: while True: status, active_objects = m.perform() if status != pycurl.E_CALL_MULTI_PERFORM: break if cached_request: grab, grab_original, task, cache_item = cached_request url = task.url# or grab.config['url'] grab.fake_response(cache_item['body']) if self.use_cache_compression: body = zlib.decompress(cache_item['body']) else: body = cache_item['body'].encode('utf-8') def custom_prepare_response(g): g.response.head = cache_item['head'].encode('utf-8') g.response.body = body g.response.code = cache_item['response_code'] g.response.time = 0 g.response.url = cache_item['url'] g.response.parse('utf-8') g.response.cookies = g.extract_cookies() grab.process_request_result(custom_prepare_response) yield {'ok': True, 'grab': grab, 'grab_original': grab_original, 'task': task, 'ecode': None, 'emsg': None} self.inc_count('request') while True: queued_messages, ok_list, fail_list = m.info_read() results = [] for curl in ok_list: results.append((True, curl, None, None)) for curl, ecode, emsg 
in fail_list: results.append((False, curl, ecode, emsg)) for ok, curl, ecode, emsg in results: res = self.process_multicurl_response(ok, curl, ecode, emsg) m.remove_handle(curl) freelist.append(curl) yield res self.inc_count('request') if not queued_messages: break m.select(0.5)
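
# A minimal usage sketch (not part of the original class) of how the
# fetch() generator is consumed. The loop inside fetch() is infinite,
# so the caller must decide when to stop iterating. `run_spider` and
# `handle_result` are hypothetical names used only for illustration.
def run_spider(spider):
    for res in spider.fetch():
        if res is None:
            # None is yielded when all curl handlers are free and the
            # task queue is empty -- a natural point to stop
            break
        # Every other result is a dict with the keys:
        # ok, grab, grab_original, task, ecode, emsg
        spider.handle_result(res)

# For reference, the cache document shape this method expects (an
# assumption, inferred from the fields read in the cached-request
# branch above):
#   {'_id': url, or sha1(url).hexdigest() when cache_key_hash is set,
#    'url': final url of the response,
#    'head': response headers (utf-8 text),
#    'body': response body (utf-8 text, or zlib-compressed bytes when
#            use_cache_compression is set),
#    'response_code': integer HTTP status}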