def _captcha_worker(self, update):
    """Drain the captcha queue and send each captcha image to the chat."""
    while self._pusher_on.is_set():
        with self._lock:
            while self._captcha_queue:
                try:
                    captcha = self._captcha_queue.pop()
                    update.message.reply_photo(photo=captcha[0],
                                               caption=captcha[1])
                except IndexError:
                    # Queue was drained by a concurrent worker; nothing to do.
                    pass
                except Exception:
                    self._log.exception('Captcha worker error')
                shallow_sleep(0.5)
        shallow_sleep(0.5)

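# `shallow_sleep` is used throughout these workers but is not defined in this
# listing. A minimal sketch, assuming it is a thin wrapper around time.sleep
# that naps in short slices so threads polling a stop event stay responsive
# (the signature and `step` parameter below are assumptions, not the
# project's actual helper):
import time

def shallow_sleep(seconds, step=0.1):
    """Sleep in small slices so long naps can be interrupted promptly."""
    # Assumption: slicing the sleep lets the calling loop re-check its
    # Event between naps instead of blocking for the whole duration.
    deadline = time.monotonic() + seconds
    while time.monotonic() < deadline:
        time.sleep(min(step, max(0.0, deadline - time.monotonic())))
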
def get_archive(self, type_, start=None):
    """Get archive from Zone-H by given archive type."""
    domains = _CONF['zoneh']['filters']['domains']
    self._api.init_cookies()
    page_queue = deque([start or START_PAGE])
    while page_queue:
        page_num = page_queue.pop()
        html_page = self._api.get_page(page_num, type_)
        next_page = None
        try:
            for record, next_page in self._parser.get_records(html_page):
                url = record['defaced_url']
                if all([domains, '...' in url, '/' not in url]):
                    # URL was truncated by Zone-H ('...'): fetch the mirror
                    # page to recover the full defaced URL.
                    data = self._get_advanced_data(record['mirror'])
                    record['defaced_url'] = data['defaced_url_full']
                    shallow_sleep(sleep_time())
                yield record
        except exc.HTMLParserCaptchaRequest:
            # Zone-H asked for a captcha: wait until it is solved, then
            # resume scraping from the current page.
            captcha_manager.init_captcha(type_, page_num)
            while captcha.is_active:
                shallow_sleep(1)
            yield from self.get_archive(type_, page_num)
        except exc.HTMLParserCookiesError:
            self._api.init_cookies(force=True)
            shallow_sleep(2)
            yield from self.get_archive(type_, page_num)
        except Exception:
            err_msg = 'Exception during getting record'
            _log.exception(err_msg)
            raise exc.ScraperError(err_msg)
        if next_page:
            page_queue.appendleft(next_page)
        shallow_sleep(sleep_time())

def _pusher_worker(self, update):
    rec_num = 0
    while self._pusher_on.is_set():
        with self._lock:
            while self._processor.push_queue:
                try:
                    record = self._processor.push_queue.pop()
                    rec_num += 1
                    formatter = Formatter(record, rec_num)
                    rec_formatted = formatter.format()
                    keyboard = [[InlineKeyboardButton(
                        'Open mirror', url=formatter.get_mirror_url())]]
                    reply_markup = InlineKeyboardMarkup(keyboard)
                    update.message.reply_html(rec_formatted,
                                              reply_markup=reply_markup)
                except IndexError:
                    # Queue was drained by a concurrent worker; nothing to do.
                    pass
                shallow_sleep(1)
        shallow_sleep(1)

def _run(self):
    """Real thread run method."""
    rec_num = 0
    while self._run_trigger.is_set():
        with self._lock:
            self._log.debug('Captcha is active: %s', captcha.is_active)
            self._log.debug('Captcha is sent: %s', captcha.is_sent)
            if captcha.is_active and not captcha.is_sent:
                self._send_captcha(self._update)
            while self._push_queue:
                try:
                    record = self._push_queue.pop()
                except IndexError:
                    # Queue was drained concurrently; skip the iteration so
                    # we never process an undefined or stale `record`.
                    continue
                rec_num += 1
                self._process_record(record, rec_num)
                shallow_sleep(1)
        shallow_sleep(1)

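# `_run_trigger` above behaves like a threading.Event that is set and cleared
# from outside the worker. A minimal, self-contained sketch of that
# start/stop pattern (the class and method names here are illustrative
# assumptions, not the project's API):
import threading
import time

class StoppableWorker(threading.Thread):
    """Toy worker mirroring the `_run_trigger` pattern used above."""

    def __init__(self):
        super().__init__(daemon=True)
        self._run_trigger = threading.Event()

    def start(self):
        self._run_trigger.set()  # arm the loop before the thread spins up
        super().start()

    def run(self):
        while self._run_trigger.is_set():
            # A real worker would drain its queue here.
            time.sleep(1)

    def stop(self):
        self._run_trigger.clear()  # run() exits on its next is_set() check

# Usage sketch:
#   worker = StoppableWorker(); worker.start(); ...; worker.stop(); worker.join()
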
def _worker(self):
    while self._processor_on.is_set():
        try:
            for record in self._scraper.get_archive(_type=self._arch_type):
                if self._processor_on.is_set() and record not in self.temp_queue:
                    self._log.debug(json.dumps(record))
                    self.temp_queue.appendleft(record)
                    if self._filter.satisfy(record):
                        self.push_queue.appendleft(record)
                else:
                    # Processor was switched off or we reached records already
                    # seen on a previous scan: stop this pass.
                    break
            # Idle until the next rescan, waking up early if switched off.
            time_delta = int(time.time()) + self._rescan_period
            while int(time.time()) < time_delta:
                if not self._processor_on.is_set():
                    break
                shallow_sleep(0.5)
        except Exception as err:
            err_msg = ('Processor thread received error '
                       'during handling scrape records')
            self._log.exception(err_msg)
            raise zoneh.exceptions.ProcessorError(err)

def get_archive(self, _type, start=None):
    page_url = const.ARCHIVE_TYPES[_type]['page']
    domains = _CONF['zoneh']['filters']['domains']
    if not self._session.cookies:
        self._initialize_cookies()
    try:
        page_queue = deque([start or const.START_PAGE])
        while page_queue:
            page_num = page_queue.pop()
            page = self._make_request(page_url.format(page_num=page_num))
            next_page = None
            for record, next_page in self._parser.get_records(page.content):
                url = record['defaced_url']
                if all([domains, '...' in url, '/' not in url]):
                    data = self._get_advanced_data(record['mirror'])
                    record['defaced_url'] = data['defaced_url_full']
                    shallow_sleep(sleep_time())
                yield record
            if next_page:
                page_queue.appendleft(next_page)
            shallow_sleep(sleep_time())
    except zoneh.exceptions.HTMLParserCaptchaRequest:
        self._send_captcha()
        self.got_captcha = True
        self._captcha_page = (_type, page_num)
        while self.got_captcha:
            shallow_sleep(1)
        yield from self.get_archive(_type, page_num)
    except zoneh.exceptions.HTMLParserCookiesError:
        self._purge_cookies()
        self._initialize_cookies()
        yield from self.get_archive(_type, page_num)
    except Exception:
        err_msg = 'Exception during getting record'
        _LOG.exception(err_msg)
        raise zoneh.exceptions.ScraperError(err_msg)
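
# `sleep_time` is also undefined in this listing. Both get_archive variants
# call it between page requests, which suggests a randomized politeness
# delay. A plausible sketch under that assumption (the bounds and defaults
# below are invented for illustration, not the project's configuration):
import random

def sleep_time(min_s=1.0, max_s=5.0):
    """Return a randomized delay in seconds between Zone-H page requests."""
    # Assumption: jittering the delay avoids hitting the site at a fixed,
    # easily throttled rate.
    return random.uniform(min_s, max_s)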