예제 #1
0
 def worker(news):
     """Download the page behind ``news.url`` and build a ``NewsText`` for it.

     The HTML is fetched with ``urlOpen.get_html``; the text is extracted with
     ``textParser.get_text_from_html``, the links are collected from that
     text, then the text is passed through ``aParser.remove_all_tags`` and
     ``text_prerparer.text_preparer``.

     Args:
         news: Object exposing ``url`` and ``pk``; it is also passed to the
             ``NewsText`` constructor as ``news``.

     Returns:
         A ``(NewsText, url_list)`` tuple, or ``None`` when ``get_html``
         returns a falsy value. The ``NewsText`` instance is only
         constructed here; this function does not call ``save()`` on it.
     """
     html = urlOpen.get_html(news.url)
     # Progress output: the primary key followed by padding spaces.
     print(str(news.pk) + "     ", end='\n')
     if not html:
         # Nothing was fetched; callers get None, exactly as before.
         return None

     text = textParser.get_text_from_html(html)
     # Links are collected before remove_all_tags runs on the text
     # (presumably the extraction relies on the markup still being there).
     url_list = list(
         aParser.get_a_from_news_text(news_url=news.url, text=text)
     )
     text = aParser.remove_all_tags(text)
     text = text_prerparer.text_preparer(text)
     return NewsText(news=news, text=text), url_list
예제 #2
0
def parse_news_text(news_text: NewsText) -> None:
    """Record the links found in ``news_text`` and mark it as parsed.

    For every URL produced by ``aParser.get_a_from_news_text`` a matching
    ``UrlInText`` row is looked up (and created when missing), and
    ``news_text.news`` is added to its ``news`` relation. Afterwards the
    text is replaced by the output of ``aParser.remove_all_tags``,
    ``is_parsed`` is set to ``True`` and the object is saved.

    Args:
        news_text: The ``NewsText`` instance to process; it is modified and
            saved in place.
    """
    # Progress output; the carriage return presumably lets successive
    # lines overwrite each other on a terminal.
    print(str(news_text.pk) + "     ", end='\r')

    found_urls = aParser.get_a_from_news_text(
        news_url=news_text.news.url,
        text=news_text.text,
    )
    for url in found_urls:
        # first() fetches the row (or None) in one query, instead of an
        # exists() check followed by a second indexed fetch.
        url_in_text = UrlInText.objects.filter(url=url).first()
        if url_in_text is None:
            url_in_text = UrlInText.objects.create(url=url)
        url_in_text.news.add(news_text.news)

    news_text.text = aParser.remove_all_tags(news_text.text)
    news_text.is_parsed = True
    news_text.save()
예제 #3
0
 def worker(news):
     """Fetch a news page and prepare its text and outgoing links.

     Steps, in order: ``urlOpen.get_html`` on ``news.url``,
     ``textParser.get_text_from_html`` on the result, link collection via
     ``aParser.get_a_from_news_text``, then ``aParser.remove_all_tags`` and
     ``text_prerparer.text_preparer`` on the text.

     Args:
         news: Object with ``url`` and ``pk`` attributes; handed to
             ``NewsText`` as its ``news`` argument.

     Returns:
         ``(NewsText(news=news, text=text), url_list)`` on success, or
         ``None`` if ``get_html`` returned something falsy. The ``NewsText``
         object is not saved by this function.
     """
     html = urlOpen.get_html(news.url)
     # Progress output: the primary key followed by padding spaces.
     print(str(news.pk) + "     ", end='\n')
     if not html:
         # Explicit form of the original implicit fall-through.
         return None

     text = textParser.get_text_from_html(html)
     # The links must be gathered first: the next step rewrites ``text``
     # through remove_all_tags.
     url_list = list(
         aParser.get_a_from_news_text(news_url=news.url, text=text)
     )
     text = aParser.remove_all_tags(text)
     text = text_prerparer.text_preparer(text)
     return NewsText(news=news, text=text), url_list
예제 #4
0
def parse_news_text(news_text: NewsText) -> None:
    """Link ``news_text.news`` to every URL found in the text, then save.

    Each URL returned by ``aParser.get_a_from_news_text`` is resolved to a
    ``UrlInText`` row (created when none exists) and ``news_text.news`` is
    added to that row's ``news`` relation. Finally the text is run through
    ``aParser.remove_all_tags``, ``is_parsed`` becomes ``True`` and
    ``news_text`` is saved.

    Args:
        news_text: The ``NewsText`` instance to process; mutated and saved.
    """
    # Progress output terminated by a carriage return rather than a newline.
    print(str(news_text.pk) + "     ", end='\r')

    found_urls = aParser.get_a_from_news_text(
        news_url=news_text.news.url,
        text=news_text.text,
    )
    for url in found_urls:
        # A single first() query replaces the exists() + [0] pair, which
        # queried the database twice for every URL already stored.
        url_in_text = UrlInText.objects.filter(url=url).first()
        if url_in_text is None:
            url_in_text = UrlInText.objects.create(url=url)
        url_in_text.news.add(news_text.news)

    news_text.text = aParser.remove_all_tags(news_text.text)
    news_text.is_parsed = True
    news_text.save()
예제 #5
0
 def writer(container):
     """Persist a ``(news_text_obj, url_list)`` pair.

     Every URL in ``url_list`` is resolved to a ``UrlInText`` row (created
     when missing) and ``news_text_obj.news`` is added to its ``news``
     relation. The text is then passed through ``aParser.remove_all_tags``,
     ``is_parsed`` is set to ``True`` and the object is saved.

     Args:
         container: Two-item iterable unpacked as
             ``(news_text_obj, url_list)`` -- presumably the tuple returned
             by ``worker``; confirm against the caller.
     """
     news_text_obj, url_list = container
     for url in url_list:
         # first() returns the row or None in one query, avoiding the
         # exists() check followed by a second indexed fetch.
         url_in_text = UrlInText.objects.filter(url=url).first()
         if url_in_text is None:
             # create() instantiates and saves in one call.
             url_in_text = UrlInText.objects.create(url=url)
         url_in_text.news.add(news_text_obj.news)

     news_text_obj.text = aParser.remove_all_tags(news_text_obj.text)
     news_text_obj.is_parsed = True
     news_text_obj.save()
     # Printed after save(), so pk reflects the stored row.
     print("news_text_id {}".format(news_text_obj.pk))
예제 #6
0
 def writer(container):
     """Store the links and the text carried by ``container``.

     ``container`` is unpacked into ``news_text_obj`` and ``url_list``. For
     each URL a ``UrlInText`` row is fetched or created, and
     ``news_text_obj.news`` is added to its ``news`` relation. Then
     ``news_text_obj`` gets its text run through
     ``aParser.remove_all_tags``, is flagged ``is_parsed = True`` and is
     saved.

     Args:
         container: Two-item iterable ``(news_text_obj, url_list)``; likely
             the result of ``worker`` -- verify against the caller.
     """
     news_text_obj, url_list = container
     for url in url_list:
         # One query per URL: first() yields the existing row or None.
         url_in_text = UrlInText.objects.filter(url=url).first()
         if url_in_text is None:
             # Build and save the missing row in a single call.
             url_in_text = UrlInText.objects.create(url=url)
         url_in_text.news.add(news_text_obj.news)

     news_text_obj.text = aParser.remove_all_tags(news_text_obj.text)
     news_text_obj.is_parsed = True
     news_text_obj.save()
     # Reported only after the object has been saved.
     print("news_text_id {}".format(news_text_obj.pk))