Пример #1
0
    def do_parse(self, feed_url):
        """Open the feed, scroll it down and parse every article found on it.

        The page is scrolled by sending the END key to ``<body>`` until either
        the wait on ``self.height_change`` times out (5 seconds) or one of the
        ``t-link`` elements contains the text 'вчера' (Russian for "yesterday").

        :param feed_url: address of the feed page to open in the driver.
        :return: an empty list when the page has no ``feed__item`` elements,
            otherwise the iterator returned by ``executor.map`` over
            ``self.parse_article`` results.
        """

        self.driver.get(feed_url)

        reached_yesterday = False
        while not reached_yesterday:
            # Jump to the bottom of the page to scroll the feed further down.
            self.driver.find_element_by_tag_name('body').send_keys(Keys.END)

            try:
                WebDriverWait(self.driver, 5).until(self.height_change)
            except TimeoutException:
                # The wait condition was not met in time: stop scrolling.
                break

            try:
                reached_yesterday = any(
                    'вчера' in time_link.text
                    for time_link in self.driver.find_elements_by_class_name('t-link')
                )
            except StaleElementReferenceException:
                logger.error('Time tag not found in articles feed from DTF source.')

        feed_items = self.driver.find_elements_by_class_name('feed__item')
        if not feed_items:
            return list()

        # Leaving the ``with`` block waits for all submitted work to finish,
        # so the returned iterator only yields already computed results.
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            return executor.map(self.parse_article, feed_items)
Пример #2
0
    def parse_article(self, article):
        """Turn one feed item into a dict describing the article.

        :param article: Selenium element of a single feed item.
        :return: ``{'db_article': <Article>}`` when an ``Article`` with the same
            url is already stored; a dict with ``url``, ``picture``, ``header``,
            ``text`` and ``date`` keys when the article page was downloaded and
            has a ``news-item__content`` block; an empty dict otherwise.
        """
        parsed_article = dict()

        # '' is the fallback date used whenever the time cannot be determined.
        article_time = ''
        try:
            article_stamp = article.find_element_by_class_name('time').text
        except NoSuchElementException:
            logger.error('Can\'t find time of the article with text: {}'.format(article.text))
        else:
            try:
                # The slicing below expects a stamp shaped like "HH:MM"; anything
                # else makes int()/replace() raise ValueError, which must not
                # abort parsing of the whole feed.
                hour, minute = int(article_stamp[:2]), int(article_stamp[3:])
                now = datetime.now(pytz.timezone('Europe/Moscow'))
                article_time = now.replace(minute=minute, hour=hour)
            except ValueError:
                logger.error('Can\'t parse time "{}" of the article with text: {}'.format(
                    article_stamp, article.text))

        # Header and link come from the same first <a> element: look it up once.
        try:
            a_tag = article.find_element_by_tag_name('a')
        except NoSuchElementException:
            h2 = ''
            href = ''
            logger.error('Can\'t find h2 for article with text: {}'.format(article.text))
            logger.error('Can\'t find a for article with text: {}'.format(article.text))
        else:
            h2 = a_tag.text
            href = a_tag.get_attribute('href')

        if not href:
            return parsed_article

        try:
            stored_article = Article.objects.get(url=href)
        except Article.DoesNotExist:
            stored_article = None

        if stored_article:
            # we have already stored this article in database and just need to connect it with user
            return {
                'db_article': stored_article
            }

        try:
            sleep(0.5)  # simulation user behavior
            detail = BeautifulSoup(requests.get(href).content, 'lxml')
            body_post = detail.find('div', {'class': 'news-item__content'})
            if body_post:
                picture = None

                full_text = body_post.get_text().strip()
                parsed_article = {
                    'url': href,
                    'picture': picture,
                    'header': h2,
                    'text': full_text,
                    'date': article_time,
                }
            else:
                logger.error("URL {} has no body.".format(href))
        except Exception as ex:
            # Best effort: a failing article page must not stop the other articles.
            logger.error('Error "{}" while trying to open url: {}'.format(ex, href))

        return parsed_article
Пример #3
0
    def test_connection(self):
        """Test if Selenium successfully connected to feed.

        :return: True when the current page has a ``navLogo`` element whose
            ``<a>`` tag links to ``https://shazoo.ru/``, False otherwise.
        """

        try:
            # find_element_* raises NoSuchElementException instead of returning
            # None, so the lookup itself has to be guarded.
            nav_logo = self.driver.find_element_by_class_name('navLogo')
        except NoSuchElementException:
            logger.error('Can\'t find Shazoo logo.')
            return False

        try:
            logo_link = nav_logo.find_element_by_tag_name('a')
        except NoSuchElementException:
            logger.error('Can\'t find a tag inside Shazoo logo.')
            return False

        return logo_link.get_attribute('href') == 'https://shazoo.ru/'
Пример #4
0
def start_parsing(source=None):
    """Run the parser of every selected source and store the parsed articles.

    :param source: label of the single source to parse; when ``None`` all
        ``Source`` records are parsed.
    """

    selected = Source.objects.all() if source is None else Source.objects.filter(label=source)

    for current in selected:
        with factory.create(current.label) as parser:
            if not (parser and parser.test_connection()):
                logger.error('Can\'t connect to source {}!'.format(current.name))
                continue

            logger.info('Successfully connected to source {}'.format(current.name))

            save_to_db(parser.do_parse(current.url), current)
Пример #5
0
def manage_resources(request):
    """Render the source management page or enable/disable a source for the user.

    With POST data the view reads ``source`` (primary key of a ``Source``),
    ``status`` (``'enable'`` enables the source, any other value disables it)
    and the optional ``source_order``, then answers with JSON ``{'ok': bool}``.
    ``ok`` is False when the fields are missing/malformed or the source does
    not exist.

    Without POST data it renders ``manage_source.html`` with the sources of the
    current user (ordered by ``source_order``) and all sources (ordered by name).
    """
    if request.POST:
        try:
            source_id = int(request.POST['source'])
            status = request.POST['status']
        except (KeyError, ValueError):
            # Missing or non-numeric fields mean a malformed request: report
            # failure to the client instead of crashing with a server error.
            logger.error('Malformed request to manage user sources.')
            return JsonResponse({'ok': False})

        source_order = request.POST.get('source_order', None)

        try:
            source = Source.objects.get(pk=source_id)
        except Source.DoesNotExist:
            return JsonResponse({'ok': False})

        if status == 'enable':
            try:
                user_source = UserSources.objects.get(user=request.user, source=source)
            except UserSources.DoesNotExist:
                user_source = UserSources(user=request.user, source=source, source_order=source_order)
            else:
                user_source.source_order = source_order
            # Saved after the try statement (not in ``finally``) so that an
            # unexpected lookup error is not masked by an unbound ``user_source``.
            user_source.save()
        else:
            try:
                UserSources.objects.get(user=request.user, source=source).delete()
            except UserSources.DoesNotExist:
                logger.error('Trying to disable user ({}) source ({}) that is not in DB'.format(
                    request.user.username, source.name))

        return JsonResponse({'ok': True})

    user_sources = Source.objects.filter(usersources__user=request.user).order_by('usersources__source_order')
    all_sources = Source.objects.order_by('name')

    context = {
        'user_sources': user_sources,
        'all_sources': all_sources
    }
    return render(request, 'manage_source.html', context=context)
Пример #6
0
    def parse_article(self, article):
        """Turn one feed item into a dict describing the article.

        Only articles whose ``data-date`` timestamp falls on the current day in
        the Europe/Moscow timezone are processed.

        :param article: Selenium element of a single feed item.
        :return: ``{'db_article': <Article>}`` when an ``Article`` with the same
            url is already stored; a dict with ``url``, ``header``, ``picture``,
            ``text`` and ``date`` keys when the article page was downloaded and
            has a ``content--full`` block; an empty dict otherwise.
        """
        parsed_article = dict()
        moscow_tz = pytz.timezone('Europe/Moscow')

        try:
            article_stamp = int(article.find_element_by_tag_name('time').get_attribute('data-date'))
            article_time = datetime.fromtimestamp(article_stamp, moscow_tz)
            # "Today" has to be taken in the same timezone as the article time;
            # a naive local date differs around midnight on non-Moscow servers.
            is_actual_article = article_time.date() == datetime.now(moscow_tz).date()
        except NoSuchElementException:
            article_time = ''
            is_actual_article = False
            logger.error('Can\'t find time of the article with text: {}'.format(article.text))
        except (TypeError, ValueError):
            # get_attribute() gives None for a missing attribute and the value
            # may be non-numeric; neither should abort parsing of the feed.
            article_time = ''
            is_actual_article = False
            logger.error('Can\'t parse time of the article with text: {}'.format(article.text))

        if not is_actual_article:
            return parsed_article

        try:
            h2 = article.find_element_by_tag_name('h2').text
        except NoSuchElementException:
            h2 = ''
            logger.error('Can\'t find h2 for article with text: {}'.format(article.text))

        try:
            a_tag = article.find_element_by_class_name('t-link')
            href = a_tag.get_attribute('href')
        except NoSuchElementException:
            href = ''
            logger.error('Can\'t find a for article with text: {}'.format(article.text))

        if not href:
            return parsed_article

        try:
            stored_article = Article.objects.get(url=href)
        except Article.DoesNotExist:
            stored_article = None

        if stored_article:
            # we have already stored this article in database and just need to connect it with user
            return {
                'db_article': stored_article
            }

        try:
            sleep(0.5)  # simulation user behavior
            detail = BeautifulSoup(requests.get(href).content, 'lxml')
            body_post = detail.find('div', {'class': 'content--full'})
            if body_post:
                img_wrapper = detail.find('div', {'class': 'andropov_image'})
                # A wrapper without the attribute yields no picture instead of
                # discarding the whole article through a KeyError.
                picture = img_wrapper.attrs.get('data-image-src') if img_wrapper else None

                # remove some web page stuff
                try:
                    body_post = clean_page(body_post, {
                        'div': [
                            'content-counters',
                            'authorCard',
                        ]
                    })

                    figure = body_post.find('figure')
                    if figure:
                        figure.decompose()
                except Exception as ex:
                    logger.error(
                        'Error "{}" while trying to remove unused elements from page: {}'.format(ex, href))

                full_text = body_post.get_text().strip()
                parsed_article = {
                    'url': href,
                    'header': h2,
                    'picture': picture,
                    'text': full_text,
                    'date': article_time,
                }
            else:
                logger.error("URL {} has no body.".format(href))
        except Exception as ex:
            # Best effort: a failing article page must not stop the other articles.
            logger.error('Error "{}" while trying to open url: {}'.format(ex, href))

        return parsed_article
Пример #7
0
    def parse_article(self, article):
        """Turn one feed item into a dict describing the article.

        An item is processed only when the text of its ``item__category``
        element is a five-character "HH:MM" time; that time is combined with
        the current date in the Europe/Moscow timezone.

        :param article: Selenium element of a single feed item.
        :return: ``{'db_article': <Article>}`` when an ``Article`` with the same
            url is already stored; a dict with ``url``, ``header``, ``picture``,
            ``text`` and ``date`` keys when the article page was downloaded and
            has an ``l-col-main`` block; an empty dict otherwise.
        """
        parsed_article = dict()

        article_time = ''
        is_actual_article = False
        try:
            article_stamp = article.find_element_by_class_name(
                'item__category').text
        except NoSuchElementException:
            logger.error(
                'Can\'t find time of the article with text: {}'.format(
                    article.text))
        else:
            if len(article_stamp) == 5:
                try:
                    hour = int(article_stamp[:2])
                    minute = int(article_stamp[3:])
                    article_time = datetime.now(
                        pytz.timezone('Europe/Moscow')).replace(
                            minute=minute, hour=hour)
                    is_actual_article = True
                except ValueError:
                    # Five characters that are not a valid time (for example a
                    # short text label): the item is not treated as actual.
                    is_actual_article = False

        if not is_actual_article:
            return parsed_article

        try:
            h2 = article.find_element_by_class_name('item__title').text
        except NoSuchElementException:
            h2 = ''
            logger.error('Can\'t find h2 for article with text: {}'.format(
                article.text))

        try:
            a_tag = article.find_element_by_class_name('item__link')
            href = a_tag.get_attribute('href')
        except NoSuchElementException:
            href = ''
            logger.error('Can\'t find a for article with text: {}'.format(
                article.text))

        if not href:
            return parsed_article

        try:
            stored_article = Article.objects.get(url=href)
        except Article.DoesNotExist:
            stored_article = None

        if stored_article:
            # we have already stored this article in database and just need to connect it with user
            return {'db_article': stored_article}

        try:
            sleep(0.5)  # simulation user behavior
            detail = BeautifulSoup(requests.get(href).content, 'lxml')
            body_post = detail.find('div', {'class': 'l-col-main'})
            if body_post:
                img = detail.find(
                    'img', {'class': 'article__main-image__image'})
                # An <img> without ``src`` yields no picture instead of
                # discarding the whole article through a KeyError.
                picture = img.attrs.get('src') if img else None

                # remove some web page stuff
                try:
                    body_post = clean_page(
                        body_post, {
                            'div': [
                                'article__header',
                                'article__inline-video',
                                'article__inline-item__link',
                                # Was written with spaces around the hyphen,
                                # which can never match a CSS class name.
                                'article__inline-item__category',
                                'article__inline-item',
                                'pro-anons',
                                'article__authors',
                                'article__tags',
                                'banner',
                                'banner__median_mobile',
                                'article__main-image',
                            ]
                        })
                except Exception as ex:
                    logger.error(
                        'Error "{}" while trying to remove unused elements from page: {}'
                        .format(ex, href))

                full_text = body_post.get_text().strip()
                parsed_article = {
                    'url': href,
                    'header': h2,
                    'picture': picture,
                    'text': full_text,
                    'date': article_time,
                }
            else:
                logger.error("URL {} has no body.".format(href))
        except Exception as ex:
            # Best effort: a failing article page must not stop the other articles.
            logger.error(
                'Error "{}" while trying to open url: {}'.format(ex, href))

        return parsed_article
Пример #8
0
    def parse_article(self, article):
        """Turn one feed item into a dict describing the article.

        Only articles whose ``<time datetime="...">`` value (ISO format) falls
        on the current day are processed.

        :param article: Selenium element of a single feed item.
        :return: ``{'db_article': <Article>}`` when an ``Article`` with the same
            url is already stored; a dict with ``url``, ``header``, ``picture``,
            ``text`` and ``date`` keys when the article page was downloaded and
            has a ``<section class="body">`` block; an empty dict otherwise.
        """
        parsed_article = dict()

        try:
            article_time = article.find_element_by_tag_name(
                'time').get_attribute('datetime')
            article_time = datetime.fromisoformat(article_time)
            # "Today" is taken in the timestamp's own timezone (local time when
            # the timestamp is naive) so both dates are comparable.
            today = datetime.now(article_time.tzinfo).date()
            is_actual_article = article_time.date() == today
        except NoSuchElementException:
            article_time = ''
            is_actual_article = False
            logger.error(
                'Can\'t find time of the article with text: {}'.format(
                    article.text))
        except (TypeError, ValueError):
            # get_attribute() gives None for a missing attribute and the value
            # may not be ISO formatted; neither should abort parsing of the feed.
            article_time = ''
            is_actual_article = False
            logger.error(
                'Can\'t parse time of the article with text: {}'.format(
                    article.text))

        if not is_actual_article:
            return parsed_article

        try:
            # Header text and link both live in the same <h2>: look it up once.
            h2_tag = article.find_element_by_tag_name('h2')
            h2 = h2_tag.text
            href = h2_tag.find_element_by_tag_name('a').get_attribute('href')
        except NoSuchElementException:
            href = ''
            h2 = ''
            logger.error('Can\'t find h2 for article with text: {}'.format(
                article.text))

        if not href:
            return parsed_article

        try:
            stored_article = Article.objects.get(url=href)
        except Article.DoesNotExist:
            stored_article = None

        if stored_article:
            # we have already stored this article in database and just need to connect it with user
            return {'db_article': stored_article}

        try:
            sleep(0.5)  # simulation user behavior
            detail = BeautifulSoup(requests.get(href).content, 'lxml')
            body_post = detail.find('section', {'class': 'body'})
            if body_post:
                picture = None
                img_wrapper = detail.find(
                    'div', {'class': 'entryImageContainer'})
                if img_wrapper:
                    img = img_wrapper.find('img')
                    # An <img> without ``src`` yields no picture instead of
                    # discarding the whole article through a KeyError.
                    picture = img.attrs.get('src') if img else None

                full_text = body_post.get_text().strip()
                parsed_article = {
                    'url': href,
                    'header': h2,
                    'picture': picture,
                    'text': full_text,
                    'date': article_time,
                }
            else:
                logger.error("URL {} has no body.".format(href))
        except Exception as ex:
            # Best effort: a failing article page must not stop the other articles.
            logger.error(
                'Error "{}" while trying to open url: {}'.format(
                    ex, href))

        return parsed_article