def publish(self, message):
    """Publish *message* to the AMQP exchange as a persistent JSON message.

    Updates the message metadata, encodes it, and publishes it on
    ``self.exchange_name`` with ``self.queue_out_routing_key``.  On a
    channel- or connection-level AMQP failure the message is cached in
    ``self._cached_messages`` for a later redelivery attempt.

    NOTE(review): the ``yield`` suggests this runs as a Twisted
    inlineCallbacks-style coroutine — confirm the decorator at the
    definition site (outside this view).

    :param message: decoded application message (dict-like) to publish.
    """
    self._message_number_out += 1
    amqp_message_update_meta(message, self.get_meta())
    amqp_msg = amqp_message_encode(message)
    log.debug("Publish message #%s, AMQP message: %s" % (self._message_number_out, amqp_msg))
    properties = BasicProperties(
        app_id=self.app_id,
        content_type='application/json',
        content_encoding='utf-8',
        delivery_mode=2,  # persistent
    )
    try:
        yield self._channel.basic_publish(
            self.exchange_name,
            self.queue_out_routing_key,
            amqp_msg,
            properties=properties,
        )
    except ChannelClosed:
        # Channel-level failure: reopen the channel and keep the
        # message so it can be re-published later.
        self.retry_channel()
        self._cached_messages.append(message)
    except AMQPError:
        # Connection-level failure: re-establish the whole connection.
        self.retry_connect()
        self._cached_messages.append(message)
def fetch_if_new(self, result, task_info):
    """Fetch the task's URI unless a stored result already exists.

    When *result* is truthy, the cached result URL is recorded and
    published directly; otherwise the URI is fetched and the response is
    parsed, saved to disk, and published through a deferred pipeline.

    Fix: the deferred is now returned.  Without it, a caller chaining on
    this method's deferred (``run_task`` adds ``_thread_finished`` after
    this callback) would continue immediately instead of waiting for the
    fetch pipeline to complete — Twisted only pauses the outer chain when
    a callback returns a Deferred.

    :param result: previously stored result record (mapping with a
        ``result_url`` key) or a falsy value when nothing is cached.
    :param task_info: task description; ``job_id`` and ``fetch_uri``
        are read here.
    :returns: the Deferred driving the publish/fetch pipeline.
    """
    job_id = task_info['job_id']
    if result:
        # Reuse the cached result: record its URL and publish it as-is.
        log.debug("Task Result already exists: %s" % job_id)
        file_name = result['result_url']
        self.task_storage.set_jobid_result_url(job_id, file_name)
        dfd = defer.maybeDeferred(self.publish_result, file_name, task_info)
    else:
        dfd = defer.maybeDeferred(self.fetcher.fetch, task_info['fetch_uri'])
        # get file response body
        dfd.addCallbacks(self.parse_response, self.failed,
                         callbackArgs=(job_id, ), errbackArgs=(job_id, ))
        # Save File
        dfd.addCallbacks(self.save_file_content, self.failed,
                         callbackArgs=(job_id, ), errbackArgs=(job_id, ))
        # Callback to URI
        dfd.addCallbacks(self.publish_result, self.failed,
                         callbackArgs=(task_info, ), errbackArgs=(job_id, ))
    return dfd
def _thread_finished(self, _, job_id):
    """Finalize a finished crawl: report its outcome, then free its slot.

    Fixes relative to the original:
    - the docstring was misplaced after the first statement (making it a
      no-op string expression) and described nonexistent parameters;
    - the RETRY branch executed ``del self.job_results[job_id]`` after the
      entry had already been removed by ``pop``, which raised ``KeyError``;
    - ``d`` could be unbound (NameError at ``d.addBoth``) when no branch
      assigned it; success is now the explicit fallback.

    :param _: ignored result of the preceding deferred chain.
    :param job_id: identifier of the finished job.
    """
    log.debug("Task %s finished!" % job_id)
    slot = self.task_slots[job_id]

    def _do_cleanup(_, slot):
        # Retire the worker thread and schedule the next project run.
        thread = self.threads.pop(slot)
        self.finished.append(thread)  # In case of shutdown
        self._wait_for_project(slot)  # add another

    d = None
    if job_id in self.job_results:
        result_type, result_message = self.job_results.pop(job_id)
        if result_type == 'FAILED':
            d = defer.maybeDeferred(self.poller.set_task_failed, job_id, result_message)
            # A definitively failed task will not be retried any more.
            self.retry_counter.pop(job_id, None)
        elif result_type == 'RETRY':
            d = defer.maybeDeferred(self.poller.set_task_retry, job_id, result_message)
    if d is None:
        # No recorded failure/retry for this job: report success.
        d = defer.maybeDeferred(self.poller.set_task_succesfull, job_id, 'task finished successfully!')
    # Clean up the slot regardless of how the status report went.
    d.addBoth(_do_cleanup, slot)
def read(self, queue_obj):
    """Consume one message from *queue_obj*, process it, then ack it.

    :param queue_obj: queue whose ``get()`` yields
        ``(channel, method, properties, body)``.
    """
    ch, method, properties, msg = yield queue_obj.get()
    msg = amqp_message_decode(msg)
    log.debug("Consuming msg %s" % msg)
    self._message_number_in += 1
    self.process_in_message(msg)
    # NOTE(review): time.sleep() blocks the calling thread; the yields
    # suggest this runs inside a Twisted-style coroutine, where a
    # non-blocking delay (e.g. task.deferLater) would be preferable —
    # confirm the execution model before changing.
    time.sleep(self.consume_interval)
    # Ack only after processing, so an unprocessed message is redelivered.
    yield ch.basic_ack(delivery_tag=method.delivery_tag)
    log.debug('Acknowledging message #%s' % self._message_number_in)
def run_task(self, slot, task_info):
    """Start *task_info* in worker *slot*: mark it running, then fetch.

    The resulting deferred is stored in ``self.threads[slot]`` and wired
    to the fetch / failure / cleanup handlers.
    """
    job_id = task_info['job_id']
    log.debug("Running task: %s" % task_info)
    self.poller.set_task_running(job_id)
    # The poller reports whether this URL already has a stored result;
    # fetch_if_new uses that answer to skip a duplicate download.
    deferred = self.poller.check_url_already_fetched(task_info['fetch_uri'])
    self.threads[slot] = deferred
    (deferred
        .addCallback(self.fetch_if_new, task_info)
        .addErrback(self.failed, job_id)
        .addBoth(self._thread_finished, job_id))
def save_file_content(self, content, job_id):
    """Persist a fetched response body to disk and record its location.

    :param content: raw response body bytes.
    :param job_id: job identifier; used as the saved file's base name.
    :returns: the file name the content was stored under.
    :raises NoResponseContent: when *content* is empty.
    :raises InvalidResponseRetry: when the detected extension is not in
        ``self.VALID_RESPONSE_EXT``.
    """
    # @TODO add new service to call periodically failed requests
    # to the callback_uri
    if not content:
        raise NoResponseContent("Response has no body!")
    extension = get_buffer_extension(content)
    if extension not in self.VALID_RESPONSE_EXT:
        raise InvalidResponseRetry("Invalid content type, retry!")
    file_name = '{name}{ext}'.format(name=job_id, ext=extension)
    log.debug("Save file: %s" % file_name)
    destination = os.path.join(self.storage_path, file_name)
    with open(destination, 'wb+') as output:
        output.write(content)
    # Save jobID result URL so later tasks can reuse this result.
    self.task_storage.set_jobid_result_url(job_id, file_name)
    return file_name
def fetch_if_new(self, result, task_info):
    """Fetch the task's URI unless a stored result already exists.

    When *result* is truthy, the cached result URL is recorded and
    published directly; otherwise the URI is fetched and the response is
    parsed, saved to disk, and published through a deferred pipeline.

    Fix: the deferred is now returned.  Without it, a caller chaining on
    this method's deferred (``run_task`` adds ``_thread_finished`` after
    this callback) would continue immediately instead of waiting for the
    fetch pipeline to complete — Twisted only pauses the outer chain when
    a callback returns a Deferred.

    :param result: previously stored result record (mapping with a
        ``result_url`` key) or a falsy value when nothing is cached.
    :param task_info: task description; ``job_id`` and ``fetch_uri``
        are read here.
    :returns: the Deferred driving the publish/fetch pipeline.
    """
    job_id = task_info['job_id']
    if result:
        # Reuse the cached result: record its URL and publish it as-is.
        log.debug("Task Result already exists: %s" % job_id)
        file_name = result['result_url']
        self.task_storage.set_jobid_result_url(job_id, file_name)
        dfd = defer.maybeDeferred(self.publish_result, file_name, task_info)
    else:
        dfd = defer.maybeDeferred(self.fetcher.fetch, task_info['fetch_uri'])
        # get file response body
        dfd.addCallbacks(self.parse_response, self.failed,
                         callbackArgs=(job_id,), errbackArgs=(job_id,))
        # Save File
        dfd.addCallbacks(self.save_file_content, self.failed,
                         callbackArgs=(job_id,), errbackArgs=(job_id,))
        # Callback to URI
        dfd.addCallbacks(self.publish_result, self.failed,
                         callbackArgs=(task_info,), errbackArgs=(job_id,))
    return dfd
def update_tasks(self):
    """Refresh the poller's cached task list from persistent storage."""
    log.debug("Poller > Updating tasks")
    current_tasks = self.task_storage.tasks
    self.tasks_list = current_tasks
def fetch(self, url):
    """Build a Request for *url*, pre-process it, and start the download.

    :param url: URL to download.
    :returns: a deferred firing with the download result.
    """
    log.debug("Fetch URL %s" % url)
    req = Request(url=url)
    # presumably mutates/decorates the request in place — confirm
    # process_request's contract at its definition site.
    self.process_request(req)
    return mustbe_deferred(self.downloader.download_request, req, None)
def update_tasks(self):
    """Refresh the scheduler's task list and log how many tasks exist."""
    log.debug("Scheduler > Updating tasks")
    storage = self.task_storage
    self.tasks_list = storage.tasks
    log.debug("Current tasks count: %s" % len(storage))