def __init__(self, rabbitmq_url=None, queue=None, routing_key=None,
             exchange="message", exchange_type="direct", log=None,
             max_tasks=5, logging=None):
    """
    Store the broker settings, create the worker pool and pick a logger.

    The arguments correspond to the ``rabbit`` section of the json
    configuration (names are camelCase there, snake_case here)::

        {
            "rabbit": {
                "url": "amqp://rabbit",
                "queue": "test",
                "routingKey": "example.json",
                "exchange": "message",      // optional
                "exchangeType": "direct"    // optional
            }
        }

    :param str rabbitmq_url: optional url to rabbitmq, stored unchanged
    :param str queue: name of the queue; the only mandatory value
    :param str routing_key: routing key for the queue
    :param str exchange: name of the exchange (default ``"message"``)
    :param str exchange_type: type of the exchange (default ``"direct"``)
    :param logging.Logger log: ready-made logger; when given, no new
        logger is set up
    :param int max_tasks: ``max_workers`` of the thread pool created here
    :param logging: optional logging configuration, forwarded as
        ``setup_logging(config=logging)`` when ``log`` is not given
    :raises exceptions.NotConfigured: when ``queue`` is ``None``
    :return:
    """
    # Validate before touching any state so a failed construction
    # leaves no attributes behind.
    if queue is None:
        raise exceptions.NotConfigured("Misssing queue")

    # Connection-related handles all start out unset.
    self._connection = self._channel = self._consumer_tag = None
    self._closing = False

    # Worker pool plus its counters (original sizing note: "2 cores + 1").
    self._max_tasks = max_tasks
    self._tasks_number = 0
    self._executor = ThreadPoolExecutor(max_workers=self._max_tasks)
    self._max_tasks_warning_counter = 0

    # Broker settings are kept exactly as passed in.
    self._rabbitmq_url = rabbitmq_url
    self._queue = queue
    self._routing_key = routing_key
    self._exchange = exchange
    self._exchange_type = exchange_type

    if log is not None:
        self.log = log
    else:
        # Imported lazily: only needed when no logger was supplied.
        from toddler.logging import setup_logging

        setup_kwargs = {} if logging is None else {"config": logging}
        self.log = setup_logging(**setup_kwargs)
def get_documents(search_url, params: dict, nb_rows=600, per_page=100,
                  max_retries=None, retry_delay=0.0):
    """
    Yield documents extracted from the paginated search API.

    Pages of ``per_page`` hits are requested from
    ``<search_url>/search-api/search`` for offsets
    ``0, per_page, 2 * per_page, ...`` below ``nb_rows``. The ``context``
    attribute of each answer is sent back with the next page request.
    Every ``Hit`` element of a page with ``nhits > 0`` is passed through
    ``_extract_hit`` and yielded.

    This is a generator: requests are only made while it is iterated.

    :param str search_url: base url of the search service
    :param dict params: query parameters; copied, then ``language``,
        ``synthesis``, ``hf``, ``start`` and ``context`` are set on the copy
    :param int nb_rows: upper bound (exclusive) for the page offsets
    :param int per_page: page size, also sent as the ``hf`` parameter
    :param max_retries: how many times a page request is retried after a
        non-200 answer; ``None`` (default) retries forever, which is the
        historical behaviour
    :param float retry_delay: seconds to sleep between retries; ``0``
        (default) retries immediately, which is the historical behaviour
    :raises requests.HTTPError: only when ``max_retries`` is set and a
        page still does not answer with status 200 after that many retries
    """
    import time  # function-scope: only used for the optional retry delay

    log = setup_logging()

    params = Dict(**params)
    params.language = "en"
    params.synthesis = "disabled"  # no synthesis
    params.hf = per_page

    # The endpoint is the same for every page, so build it once.
    url = urljoin(search_url, "/search-api/search")

    context = None
    extracted = 0
    for start in range(0, nb_rows, per_page):
        params.start = start
        params.context = context

        # Repeat the request until the page answers with status 200.
        failures = 0
        while True:
            response = requests.get(url, params=params.to_dict())
            if response.status_code == 200:
                break

            failures += 1
            # Report the status actually received, not a fixed "500".
            log.error("Got %d: %s" % (response.status_code, url))
            log.debug(params.to_dict())
            log.debug(response.text)

            if max_retries is not None and failures > max_retries:
                raise requests.HTTPError(
                    "Got %d: %s" % (response.status_code, url),
                    response=response,
                )
            if retry_delay > 0:
                time.sleep(retry_delay)

        doc = BeautifulSoup(response.text.encode("utf8"), ['lxml', 'xml'])
        context = doc.Answer['context']

        if int(doc.Answer['nhits']) > 0:
            for hit in doc.find_all("Hit"):
                # Logged before the increment, so the count is zero-based.
                log.info("Extracted %d document" % extracted)
                extracted += 1
                yield _extract_hit(hit)
def __init__(self, rabbitmq_url, exchange, routing_key, logging=None,
             log=None, **kwargs):
    """
    Store the broker settings and create the scheduling helpers.

    Builds a ``scheduler`` driven by ``time.time`` / ``time.sleep`` and
    constructs an ``AnalysisTaskDelayQueueThread`` for the broker url.
    The thread object is only constructed here; nothing in this method
    starts it.

    :param str rabbitmq_url: url of the rabbitmq broker
    :param str exchange: exchange name, stored unchanged
    :param str routing_key: routing key, stored unchanged
    :param logging: optional logging configuration, forwarded as
        ``setup_logging(config=logging)`` when ``log`` is not given
    :param log: ready-made logger; when given, no new logger is set up
    :param kwargs: accepted and ignored
    """
    self.rabbitmq_url = rabbitmq_url
    self.exchange = exchange
    self.routing_key = routing_key

    self.scheduler = scheduler(time.time, time.sleep)
    self.delay_queue_thread = AnalysisTaskDelayQueueThread(self.rabbitmq_url)

    # A caller-supplied logger wins; nothing else to do in that case.
    if log is not None:
        self.log = log
        return

    # Imported lazily: only needed when no logger was supplied.
    from toddler.logging import setup_logging

    if logging is None:
        self.log = setup_logging()
    else:
        self.log = setup_logging(config=logging)