Example #1
    def publish(self, message):
        self._message_number_out += 1

        amqp_message_update_meta(message, self.get_meta())
        amqp_msg = amqp_message_encode(message)
        log.debug("Publish message #%s, AMQP message: %s" %
                  (self._message_number_out, amqp_msg))
        properties = BasicProperties(
            app_id=self.app_id,
            content_type='application/json',
            content_encoding='utf-8',
            delivery_mode=2,  # persistent
        )
        try:
            yield self._channel.basic_publish(
                self.exchange_name,
                self.queue_out_routing_key,
                amqp_msg,
                properties=properties,
            )
        except ChannelClosed:
            # the channel went down: reopen it and keep the message for a later retry
            self.retry_channel()
            self._cached_messages.append(message)
        except AMQPError:
            # connection-level failure: reconnect and cache the message for a later retry
            self.retry_connect()
            self._cached_messages.append(message)
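publish() above is written as a generator (it yields the basic_publish Deferred), which only works if the method is driven by defer.inlineCallbacks; the decorator is presumably applied in the original class but cut off by the listing. Below is a minimal sketch of that wiring, plus a hypothetical flush_cached() helper for the _cached_messages buffer that the error handlers fill. Neither PublisherSketch nor flush_cached comes from the original project.

from twisted.internet import defer

class PublisherSketch(object):
    def __init__(self):
        self._cached_messages = []

    @defer.inlineCallbacks
    def publish(self, message):
        # stand-in for `yield self._channel.basic_publish(...)`
        yield defer.succeed(None)
        print("published: %s" % message)

    @defer.inlineCallbacks
    def flush_cached(self):
        # hypothetical helper: re-send anything cached while the channel was down
        while self._cached_messages:
            yield self.publish(self._cached_messages.pop(0))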
Example #2
    def fetch_if_new(self, result, task_info):
        job_id = task_info['job_id']
        if result:
            log.debug("Task Result already exists: %s" % job_id)
            file_name = result['result_url']
            self.task_storage.set_jobid_result_url(job_id, file_name)
            dfd = defer.maybeDeferred(self.publish_result, file_name,
                                      task_info)
        else:
            dfd = defer.maybeDeferred(self.fetcher.fetch,
                                      task_info['fetch_uri'])

            # get file response body
            dfd.addCallbacks(self.parse_response,
                             self.failed,
                             callbackArgs=(job_id, ),
                             errbackArgs=(job_id, ))

            # Save File
            dfd.addCallbacks(self.save_file_content,
                             self.failed,
                             callbackArgs=(job_id, ),
                             errbackArgs=(job_id, ))

            # Callback to URI
            dfd.addCallbacks(self.publish_result,
                             self.failed,
                             callbackArgs=(task_info, ),
                             errbackArgs=(job_id, ))

        return dfd  # hand the Deferred back so the caller's chain waits for the whole pipeline
Example #3
    def _thread_finished(self, _, job_id):
        """
        Called when a crawl task finishes: report the stored result (if any)
        to the poller, then free the slot and wait for the next task.

        :param _: result of the finished Deferred chain (ignored)
        :param job_id: identifier of the finished task
        """
        slot = self.task_slots[job_id]
        log.debug("Task %s finished!" % job_id)

        def _do_cleanup(_, slot):
            thread = self.threads.pop(slot)
            self.finished.append(thread)
            # In case of shutdown
            self._wait_for_project(slot)  # add another

        if job_id in self.job_results:
            # pop() both returns and removes the stored result
            result_type, result_message = self.job_results.pop(job_id)

            if result_type == 'FAILED':
                d = defer.maybeDeferred(self.poller.set_task_failed, job_id,
                                        result_message)
                if job_id in self.retry_counter:
                    del self.retry_counter[job_id]

            elif result_type == 'RETRY':
                d = defer.maybeDeferred(self.poller.set_task_retry, job_id,
                                        result_message)
        else:
            d = defer.maybeDeferred(self.poller.set_task_succesfull, job_id,
                                    'task finished successfully!')

        d.addBoth(_do_cleanup, slot)
Example #4
    def read(self, queue_obj):
        ch, method, properties, msg = yield queue_obj.get()

        msg = amqp_message_decode(msg)
        log.debug("Consuming msg %s" % msg)
        self._message_number_in += 1
        self.process_in_message(msg)
        time.sleep(self.consume_interval)  # NOTE: blocks the reactor for consume_interval seconds
        yield ch.basic_ack(delivery_tag=method.delivery_tag)
        log.debug('Acknowledging message #%s' % self._message_number_in)
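The time.sleep() in read() above stalls the whole Twisted reactor for consume_interval seconds, so no other Deferreds can make progress during the pause. If the intent is simply to throttle consumption, the usual non-blocking equivalent is task.deferLater; the driver below is a sketch with made-up names (consume_with_pause, read_one), not code from the project.

from twisted.internet import defer, reactor, task

@defer.inlineCallbacks
def consume_with_pause(read_one, interval):
    # read_one() is expected to return a Deferred that reads and acks a
    # single message; deferLater pauses without blocking the reactor
    while True:
        yield read_one()
        yield task.deferLater(reactor, interval, lambda: None)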
Example #5
    def run_task(self, slot, task_info):
        job_id = task_info['job_id']

        log.debug("Running task: %s" % task_info)
        self.poller.set_task_running(job_id)

        dfd = self.poller.check_url_already_fetched(task_info['fetch_uri'])
        self.threads[slot] = dfd
        dfd.addCallback(self.fetch_if_new, task_info)
        dfd.addErrback(self.failed, job_id)

        dfd.addBoth(self._thread_finished, job_id)
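To make the callback ordering in run_task() concrete: the result of the "already fetched" check flows into fetch_if_new, any failure in the chain is routed to failed, and the final addBoth runs either way so the slot is always released. The standalone snippet below mimics that shape with throwaway stand-ins (check, fetch_if_new, failed, thread_finished here are local functions, not the project's methods).

from twisted.internet import defer

def check(url):
    return None  # pretend the URL has never been fetched before

def fetch_if_new(result, task):
    if result:
        print("cached result, skipping fetch")
    else:
        print("fetching %s" % task['fetch_uri'])

def failed(failure, job_id):
    print("job %s failed: %s" % (job_id, failure.getErrorMessage()))

def thread_finished(_, job_id):
    print("job %s finished, slot can be released" % job_id)

dfd = defer.maybeDeferred(check, 'http://example.com')
dfd.addCallback(fetch_if_new, {'fetch_uri': 'http://example.com'})
dfd.addErrback(failed, 'job-1')
dfd.addBoth(thread_finished, 'job-1')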
Example #6
    def save_file_content(self, content, job_id):
        # @TODO add new service to call periodically failed requests
        # to the callback_uri
        if not content:
            raise NoResponseContent("Response has no body!")
        ext = get_buffer_extension(content)
        if ext not in self.VALID_RESPONSE_EXT:
            raise InvalidResponseRetry("Invalid content type, retry!")

        file_name = '{name}{ext}'.format(name=job_id, ext=ext)
        log.debug("Save file: %s" % file_name)

        save_path = os.path.join(self.storage_path, file_name)
        with open(save_path, 'wb+') as out_file:
            out_file.write(content)

        # Save jobID result URL
        self.task_storage.set_jobid_result_url(job_id, file_name)
        return file_name
Example #7
    def update_tasks(self):
        log.debug("Poller > Updating tasks")
        self.tasks_list = self.task_storage.tasks
Example #8
    def fetch(self, url):
        log.debug("Fetch URL %s" % url)
        request = Request(url=url)
        self.process_request(request)
        return mustbe_deferred(self.downloader.download_request, request, None)
Example #9
    def update_tasks(self):
        log.debug("Scheduler > Updating tasks")
        self.tasks_list = self.task_storage.tasks
        log.debug("Current tasks count: %s" % len(self.task_storage))