예제 #1
0
    def _get_additional_info(self,
                             first_page: str = None) -> None:
        """ Collect additional info about the query (amount of found
        docs and contexts, link to the graphic) and keep it
        in ``self._add_info``.

        :param first_page: HTML of the first result page. When it is
         falsy the page is requested with a copy of ``self.params``
         where 'lang' is set to 'ru' and 'expand' is removed.
        :return: None.

        Request errors are not handled here and reach the caller.
        Any error raised while extracting the info is only logged;
        in that case ``self._add_info`` is not assigned.
        """
        request_params = self.params.copy()
        request_params['lang'] = 'ru'
        request_params.pop('expand', None)

        if first_page:
            page_source = first_page
        else:
            # No page supplied: fetch it and take the first returned item.
            page_source = creq.get_htmls(RNC_URL, **request_params)[0]

        soup = bs4.BeautifulSoup(page_source, 'lxml')
        content_div = soup.find('div', attrs={'class': 'content'})

        try:
            found_info = Corpus._get_where_query_found(content_div)
            graphic_link = Corpus._get_graphic_url(content_div)
        except Exception as err:
            # Best effort: report the problem and leave the state untouched.
            logger.error("Sth went wrong while "
                         f"getting additional info:\n{err}")
            return

        if graphic_link:
            found_info['graphic_link'] = graphic_link

        self._add_info = found_info
예제 #2
0
    def request_examples(self) -> None:
        """ Download the result pages, parse them and store
        the parsed examples in ``self._data``.

        The request is first checked with ``creq.is_request_correct``;
        a ``creq.BaseRequestError`` raised there is logged and re-raised.
        Any exception raised while parsing is logged and re-raised too.

        :return: None.

        :exception RuntimeError: if the data still exist.
        """
        if self.data:
            logger.error("Tried to request new examples, however data exist")
            raise RuntimeError("Data still exist")

        started_at = time.time()
        try:
            first_page, last_page = creq.is_request_correct(
                RNC_URL, self.p_count, **self.params)
        except creq.BaseRequestError as err:
            logger.error(f"Query = {self.forms_in_query}, "
                         f"{self.p_count}, {self.params}\ne = {err}")
            raise

        # The already downloaded first page is reused only in 'normal'
        # output mode; otherwise the helper is called without arguments.
        logger.debug("Getting additional info from the first RNC page")
        helper_args = (first_page,) if self.out == 'normal' else ()
        self._get_additional_info(*helper_args)
        logger.debug("Additional info received")

        # The first and the last pages are already here,
        # so only the pages between them are requested.
        pages = [first_page]
        if self.p_count > 2:
            logger.debug("Main request")
            middle_pages = creq.get_htmls(
                RNC_URL, 1, self.p_count - 1, **self.params)
            pages = [first_page, *middle_pages, last_page]
            logger.debug("Main request completed")
        elif self.p_count == 2:
            pages.append(last_page)

        logger.debug("Parsing html started")
        try:
            parse_began = time.time()
            examples = self._parse_all_pages(pages)
            parse_ended = time.time()
        except Exception as err:
            logger.error(f"Error while parsing, query = {self.params}\n{err}")
            raise

        logger.debug("Parsing completed")
        logger.info(f"Parsing time: {parse_ended - parse_began:.2f}")
        logger.info(f"Overall time: {parse_ended - started_at:.2f}")
        # Store a copy of the parsed result.
        self._data = examples[:]
예제 #3
0
def test_wrong_params():
    """ Request RNC_URL with ``wrong_params``; the result is discarded. """
    # NOTE(review): there is no assertion here; presumably the request is
    # expected to raise and a marker outside this snippet says so - confirm.
    _ = req.get_htmls(RNC_URL, **wrong_params)
예제 #4
0
def test_wait_some_time():
    """ Request pages with the bounds 0 and 15 and
    check that exactly 15 HTML codes come back.
    """
    expected_amount = 15
    # NOTE: this writes into the shared ``correct_params`` dict,
    # so the new 'lex1' value stays there after the test.
    correct_params['lex1'] = 'я'
    pages = req.get_htmls(RNC_URL, 0, expected_amount, **correct_params)
    assert len(pages) == expected_amount
예제 #5
0
def test_wrong_range():
    """ Calling ``get_htmls`` with the bounds 10 and 0
    (presumably start > stop - confirm) must give nothing.
    """
    pages = req.get_htmls(RNC_URL, 10, 0)
    assert len(pages) == 0