def do_parse(self, feed_url):
    """Start the parsing process: load the feed, scroll it to the end and return an iterator of parsed articles."""
    self.driver.get(feed_url)
    # Simulate scrolling down until all of today's articles are loaded
    found_not_actual = False
    while not found_not_actual:
        body = self.driver.find_element_by_tag_name('body')
        body.send_keys(Keys.END)
        try:
            WebDriverWait(self.driver, 5).until(self.height_change)
        except TimeoutException:
            break
        try:
            t_links = self.driver.find_elements_by_class_name('t-link')
            for link in t_links:
                if 'вчера' in link.text:  # 'вчера' means 'yesterday', so older articles have been reached
                    found_not_actual = True
                    break
        except StaleElementReferenceException:
            logger.error('Stale element while reading time tags in the DTF articles feed.')
    html_articles = self.driver.find_elements_by_class_name('feed__item')
    articles = list()
    if html_articles:
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            articles = executor.map(self.parse_article, html_articles)
    return articles
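# NOTE: `height_change` is used as the wait condition in do_parse() above but is not
# defined in this excerpt. Below is a minimal sketch of such an expected-condition
# callable; the `last_height` attribute and the exact comparison are assumptions, not
# necessarily how the project implements it.
def height_change(self, driver):
    """Return True once document.body.scrollHeight differs from the last seen value."""
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height != getattr(self, 'last_height', None):
        self.last_height = new_height
        return True
    return False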
def parse_article(self, article):
    parsed_article = dict()
    try:
        article_stamp = article.find_element_by_class_name('time').text
        article_time = datetime.now(pytz.timezone('Europe/Moscow'))
        article_time = article_time.replace(minute=int(article_stamp[3:]), hour=int(article_stamp[:2]))
    except NoSuchElementException:
        article_time = ''
        logger.error("Can't find time of the article with text: {}".format(article.text))
    try:
        h2 = article.find_element_by_tag_name('a').text
    except NoSuchElementException:
        h2 = ''
        logger.error("Can't find header for article with text: {}".format(article.text))
    try:
        a_tag = article.find_element_by_tag_name('a')
        href = a_tag.get_attribute('href')
    except NoSuchElementException:
        href = ''
        logger.error("Can't find link for article with text: {}".format(article.text))
    if href:
        try:
            article = Article.objects.get(url=href)
        except Article.DoesNotExist:
            article = None
        if not article:
            try:
                sleep(0.5)  # simulate user behavior
                detail = BeautifulSoup(requests.get(href).content, 'lxml')
                body_post = detail.find('div', {'class': 'news-item__content'})
                if body_post:
                    picture = None
                    full_text = body_post.get_text().strip()
                    parsed_article = {
                        'url': href,
                        'picture': picture,
                        'header': h2,
                        'text': full_text,
                        'date': article_time,
                    }
                else:
                    logger.error('URL {} has no body.'.format(href))
            except Exception as ex:
                logger.error('Error "{}" while trying to open url: {}'.format(ex, href))
        else:
            # the article is already stored in the database and only needs to be connected to the user
            parsed_article = {'db_article': article}
    return parsed_article
def test_connection(self):
    """Test whether Selenium successfully connected to the feed."""
    auth_flag = False
    try:
        nav_logo = self.driver.find_element_by_class_name('navLogo')
        logo_link = nav_logo.find_element_by_tag_name('a')
        if logo_link.get_attribute('href') == 'https://shazoo.ru/':
            auth_flag = True
    except NoSuchElementException:
        logger.error("Can't find the Shazoo logo or the link inside it.")
    return auth_flag
def start_parsing(source=None):
    """Fire the parsing process for a single source or, when no label is given, for all sources."""
    if source is None:
        sources = Source.objects.all()
    else:
        sources = Source.objects.filter(label=source)
    for source in sources:
        with factory.create(source.label) as parser:
            if parser and parser.test_connection():
                logger.info('Successfully connected to source {}'.format(source.name))
                articles = parser.do_parse(source.url)
                save_to_db(articles, source)
            else:
                logger.error("Can't connect to source {}!".format(source.name))
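# NOTE: `save_to_db` is called by start_parsing() but is not part of this excerpt.
# A rough sketch under the assumption that Article exposes the same field names as the
# parsed_article dicts built by the parsers plus a `source` foreign key; the user-linking
# step hinted at by the 'db_article' results is intentionally left out.
def save_to_db(articles, source):
    """Persist freshly parsed articles, skipping empty results and already stored ones."""
    for item in articles:
        if not item or 'db_article' in item:
            continue  # nothing parsed, or the article already exists in the database
        Article.objects.create(
            url=item['url'],
            header=item['header'],
            picture=item['picture'],
            text=item['text'],
            date=item['date'],
            source=source,  # assumed foreign key on the Article model
        )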
def manage_resources(request):
    if request.method == 'POST':
        source_id = int(request.POST['source'])
        status = request.POST['status']
        source_order = request.POST.get('source_order', None)
        try:
            source = Source.objects.get(pk=source_id)
            ok = True
        except Source.DoesNotExist:
            ok = False
        else:
            if status == 'enable':
                try:
                    user_source = UserSources.objects.get(user=request.user, source=source)
                    user_source.source_order = source_order
                except UserSources.DoesNotExist:
                    user_source = UserSources(user=request.user, source=source, source_order=source_order)
                user_source.save()
            else:
                try:
                    UserSources.objects.get(user=request.user, source=source).delete()
                except UserSources.DoesNotExist:
                    logger.error('Trying to disable user ({}) source ({}) that is not in the DB'.format(
                        request.user.username, source.name))
        return JsonResponse({'ok': ok})
    else:
        user_sources = Source.objects.filter(usersources__user=request.user).order_by('usersources__source_order')
        all_sources = Source.objects.order_by('name')
        context = {
            'user_sources': user_sources,
            'all_sources': all_sources,
        }
        return render(request, 'manage_source.html', context=context)
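# NOTE: the Source, UserSources and Article models referenced throughout are not shown
# in this excerpt. The sketch below only lists the fields the code above relies on;
# field types, options and relations are assumptions.
from django.conf import settings
from django.db import models


class Source(models.Model):
    name = models.CharField(max_length=255)
    label = models.CharField(max_length=64)
    url = models.URLField()


class UserSources(models.Model):
    user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
    source = models.ForeignKey(Source, on_delete=models.CASCADE)
    source_order = models.IntegerField(null=True, blank=True)


class Article(models.Model):
    url = models.URLField(unique=True)
    header = models.CharField(max_length=255)
    picture = models.URLField(null=True, blank=True)
    text = models.TextField()
    date = models.DateTimeField(null=True, blank=True)
    source = models.ForeignKey(Source, on_delete=models.CASCADE)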
def parse_article(self, article):
    parsed_article = dict()
    try:
        article_stamp = int(article.find_element_by_tag_name('time').get_attribute('data-date'))
        article_time = datetime.fromtimestamp(article_stamp, pytz.timezone('Europe/Moscow'))
        is_actual_article = article_time.date() == datetime.today().date()
    except NoSuchElementException:
        article_time = ''
        is_actual_article = False
        logger.error("Can't find time of the article with text: {}".format(article.text))
    if is_actual_article:
        try:
            h2 = article.find_element_by_tag_name('h2').text
        except NoSuchElementException:
            h2 = ''
            logger.error("Can't find h2 for article with text: {}".format(article.text))
        try:
            a_tag = article.find_element_by_class_name('t-link')
            href = a_tag.get_attribute('href')
        except NoSuchElementException:
            href = ''
            logger.error("Can't find link for article with text: {}".format(article.text))
        if href:
            try:
                article = Article.objects.get(url=href)
            except Article.DoesNotExist:
                article = None
            if not article:
                try:
                    sleep(0.5)  # simulate user behavior
                    detail = BeautifulSoup(requests.get(href).content, 'lxml')
                    body_post = detail.find('div', {'class': 'content--full'})
                    if body_post:
                        img_wrapper = detail.find('div', {'class': 'andropov_image'})
                        picture = img_wrapper.attrs['data-image-src'] if img_wrapper else None
                        # remove page elements that are not part of the article text
                        try:
                            body_post = clean_page(body_post, {
                                'div': [
                                    'content-counters',
                                    'authorCard',
                                ]
                            })
                            if body_post.find('figure'):
                                body_post.find('figure').decompose()
                        except Exception as ex:
                            logger.error('Error "{}" while trying to remove unused elements from page: {}'.format(ex, href))
                        full_text = body_post.get_text().strip()
                        parsed_article = {
                            'url': href,
                            'header': h2,
                            'picture': picture,
                            'text': full_text,
                            'date': article_time,
                        }
                    else:
                        logger.error('URL {} has no body.'.format(href))
                except Exception as ex:
                    logger.error('Error "{}" while trying to open url: {}'.format(ex, href))
            else:
                # the article is already stored in the database and only needs to be connected to the user
                parsed_article = {'db_article': article}
    return parsed_article
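# NOTE: `clean_page` is used by the parsers above and below but is not defined in this
# excerpt. A minimal sketch matching the call sites (a BeautifulSoup node plus a
# {tag: [class, ...]} mapping of elements to drop); the implementation details are an
# assumption.
def clean_page(body_post, unwanted):
    """Decompose every element whose tag and class appear in the `unwanted` mapping."""
    for tag, class_names in unwanted.items():
        for class_name in class_names:
            for element in body_post.find_all(tag, {'class': class_name}):
                element.decompose()
    return body_post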
def parse_article(self, article):
    parsed_article = dict()
    try:
        article_stamp = article.find_element_by_class_name('item__category').text
        # fresh articles carry a 'HH:MM' stamp (5 characters); older ones show a date instead
        is_actual_article = len(article_stamp) == 5
        if is_actual_article:
            article_time = datetime.now(pytz.timezone('Europe/Moscow'))
            article_time = article_time.replace(minute=int(article_stamp[3:]), hour=int(article_stamp[:2]))
    except NoSuchElementException:
        article_time = ''
        is_actual_article = False
        logger.error("Can't find time of the article with text: {}".format(article.text))
    if is_actual_article:
        try:
            h2 = article.find_element_by_class_name('item__title').text
        except NoSuchElementException:
            h2 = ''
            logger.error("Can't find header for article with text: {}".format(article.text))
        try:
            a_tag = article.find_element_by_class_name('item__link')
            href = a_tag.get_attribute('href')
        except NoSuchElementException:
            href = ''
            logger.error("Can't find link for article with text: {}".format(article.text))
        if href:
            try:
                article = Article.objects.get(url=href)
            except Article.DoesNotExist:
                article = None
            if not article:
                try:
                    sleep(0.5)  # simulate user behavior
                    detail = BeautifulSoup(requests.get(href).content, 'lxml')
                    body_post = detail.find('div', {'class': 'l-col-main'})
                    if body_post:
                        img = detail.find('img', {'class': 'article__main-image__image'})
                        picture = img.attrs['src'] if img else None
                        # remove page elements that are not part of the article text
                        try:
                            body_post = clean_page(body_post, {
                                'div': [
                                    'article__header',
                                    'article__inline-video',
                                    'article__inline-item__link',
                                    'article__inline-item__category',
                                    'article__inline-item',
                                    'pro-anons',
                                    'article__authors',
                                    'article__tags',
                                    'banner',
                                    'banner__median_mobile',
                                    'article__main-image',
                                ]
                            })
                        except Exception as ex:
                            logger.error('Error "{}" while trying to remove unused elements from page: {}'.format(ex, href))
                        full_text = body_post.get_text().strip()
                        parsed_article = {
                            'url': href,
                            'header': h2,
                            'picture': picture,
                            'text': full_text,
                            'date': article_time,
                        }
                    else:
                        logger.error('URL {} has no body.'.format(href))
                except Exception as ex:
                    logger.error('Error "{}" while trying to open url: {}'.format(ex, href))
            else:
                # the article is already stored in the database and only needs to be connected to the user
                parsed_article = {'db_article': article}
    return parsed_article
def parse_article(self, article):
    parsed_article = dict()
    try:
        article_time = article.find_element_by_tag_name('time').get_attribute('datetime')
        article_time = datetime.fromisoformat(article_time)
        is_actual_article = article_time.date() == datetime.today().date()
    except NoSuchElementException:
        article_time = ''
        is_actual_article = False
        logger.error("Can't find time of the article with text: {}".format(article.text))
    if is_actual_article:
        try:
            h2 = article.find_element_by_tag_name('h2').text
            h2_a = article.find_element_by_tag_name('h2').find_element_by_tag_name('a')
            href = h2_a.get_attribute('href')
        except NoSuchElementException:
            href = ''
            h2 = ''
            logger.error("Can't find h2 for article with text: {}".format(article.text))
        if href:
            try:
                article = Article.objects.get(url=href)
            except Article.DoesNotExist:
                article = None
            if not article:
                try:
                    sleep(0.5)  # simulate user behavior
                    detail = BeautifulSoup(requests.get(href).content, 'lxml')
                    body_post = detail.find('section', {'class': 'body'})
                    if body_post:
                        picture = None
                        img_wrapper = detail.find('div', {'class': 'entryImageContainer'})
                        if img_wrapper:
                            picture = img_wrapper.find('img').attrs['src'] if img_wrapper.find('img') else None
                        full_text = body_post.get_text().strip()
                        parsed_article = {
                            'url': href,
                            'header': h2,
                            'picture': picture,
                            'text': full_text,
                            'date': article_time,
                        }
                    else:
                        logger.error('URL {} has no body.'.format(href))
                except Exception as ex:
                    logger.error('Error "{}" while trying to open url: {}'.format(ex, href))
            else:
                # the article is already stored in the database and only needs to be connected to the user
                parsed_article = {'db_article': article}
    return parsed_article