def worker(news):
    """Fetch the page for *news*, extract its plain text and outgoing links.

    Args:
        news: a News model instance; only ``news.url`` and ``news.pk`` are read.

    Returns:
        ``(NewsText, url_list)`` on success — an *unsaved* NewsText holding the
        cleaned, prepared text, plus the list of <a> hrefs found in it.
        Returns ``None`` when the page could not be fetched; callers that
        unpack the result must guard against that.
    """
    # NOTE(review): an identical `worker` is defined twice in this file; the
    # later definition shadows this one at import time — confirm which is live.
    html = urlOpen.get_html(news.url)
    print(f"{news.pk} ")  # progress marker (original end='\n' was the default)
    if html:
        text = textParser.get_text_from_html(html)
        # Materialize the link generator *before* tags are stripped below.
        url_list = list(aParser.get_a_from_news_text(news_url=news.url, text=text))
        text = aParser.remove_all_tags(text)
        text = text_prerparer.text_preparer(text)  # sic: module-name typo is upstream
        return NewsText(news=news, text=text), url_list
    return None  # explicit: fetch failed, nothing to parse
def parse_news_text(news_text: NewsText):
    """Extract <a> urls from *news_text*, link them to its news row, then
    strip all tags from the text and mark it parsed.

    Side effects: may create UrlInText rows, adds m2m links, saves *news_text*.
    """
    print(f"{news_text.pk} ", end='\r')  # same-line progress indicator
    for url in aParser.get_a_from_news_text(news_url=news_text.news.url,
                                            text=news_text.text):
        # .first() is one query; the original filter()[:1] + .exists() + [0]
        # issued two queries for every url already in the table.
        url_in_text = UrlInText.objects.filter(url=url).first()
        if url_in_text is None:
            # NOTE(review): lookup-then-create is not atomic — a concurrent
            # worker can insert the same url in between; needs a unique
            # constraint on url (or get_or_create) to be fully safe.
            url_in_text = UrlInText.objects.create(url=url)
        url_in_text.news.add(news_text.news)
    news_text.text = aParser.remove_all_tags(news_text.text)
    news_text.is_parsed = True
    news_text.save()
def worker(news):
    """Download ``news.url``, parse out the article text and its hyperlinks.

    Args:
        news: a News model instance; reads ``news.url`` and ``news.pk``.

    Returns:
        A tuple ``(NewsText, url_list)`` — the NewsText is constructed but not
        saved, ``url_list`` contains the hrefs harvested from the raw text.
        Falls through to ``None`` when the download yields nothing; unpacking
        callers must handle that.
    """
    # NOTE(review): `worker` appears twice in this file with identical bodies;
    # this later definition is the one that wins at import time.
    html = urlOpen.get_html(news.url)
    print(f"{news.pk} ")  # progress marker (end='\n' in the original was redundant)
    if html:
        text = textParser.get_text_from_html(html)
        # Collect links first: remove_all_tags below destroys the <a> markup.
        url_list = list(aParser.get_a_from_news_text(news_url=news.url, text=text))
        text = aParser.remove_all_tags(text)
        text = text_prerparer.text_preparer(text)  # sic: upstream module-name typo
        return NewsText(news=news, text=text), url_list
    return None  # explicit: nothing fetched
def writer(container):
    """Persist one ``(NewsText, url_list)`` pair produced by ``worker``.

    Links every harvested url to the news row (creating UrlInText rows as
    needed), strips remaining tags from the text, flags it parsed and saves.

    Args:
        container: tuple of (unsaved NewsText instance, list of url strings).
    """
    news_text_obj, url_list = container
    for url in url_list:
        # One query via .first(); the original filter()[:1] + .exists() + [0]
        # hit the database twice per already-known url.
        url_in_text = UrlInText.objects.filter(url=url).first()
        if url_in_text is None:
            # objects.create == construct + save, and matches the creation
            # style used in parse_news_text. Lookup-then-create race noted:
            # concurrent writers need a unique constraint on url.
            url_in_text = UrlInText.objects.create(url=url)
        url_in_text.news.add(news_text_obj.news)
    news_text_obj.text = aParser.remove_all_tags(news_text_obj.text)
    news_text_obj.is_parsed = True
    news_text_obj.save()
    print("news_text_id {}".format(news_text_obj.pk))