/
async_html_parser.py
115 lines (91 loc) · 3.54 KB
/
async_html_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from utils import DjangoSetup # setup django environment
from db.models import News, NewsText, UrlInText
from prjparser import textParser, urlOpen, aParser
from prjparser import multiproc, text_prerparer
# TODO: merge these classes or otherwise rework the shared logic
class HtmlParser(multiproc.MultiProc):
    """Fetch each unparsed News page and extract its readable text."""

    # Feed: every News row not yet marked as parsed.
    task_manager = News.objects.filter(is_parsed=False).iterator

    @staticmethod
    def worker(news):
        """Download one page; return a NewsText for it, or None on failure."""
        print(str(news.id) + " ", end='\n')
        html = urlOpen.get_html(news.url)
        if not html:
            return None
        extracted = textParser.get_text_from_html(html)
        return NewsText(news=news, text=extracted)

    @staticmethod
    def writer(news_text):
        """Persist the extracted text and flag its News row as parsed."""
        news_text.save()
        parent = news_text.news
        parent.is_parsed = True
        parent.save()
class NewsTextParser(multiproc.MultiProc):
    """Extract all links from stored news texts and record them as UrlInText rows."""

    # Feed: NewsText rows whose links have not been extracted yet.
    task_manager = NewsText.objects.filter(is_parsed=False).iterator

    @staticmethod
    def worker(news_text: NewsText):
        """Collect every link found in the stored text.

        Returns a ``(news_text, url_list)`` pair for :meth:`writer`.
        """
        url_list = list(aParser.get_a_from_news_text(news_url=news_text.news.url,
                                                     text=news_text.text))
        return news_text, url_list

    @staticmethod
    def writer(container):
        """Persist extracted urls, strip tags from the text, mark it parsed."""
        news_text_obj, url_list = container
        for url in url_list:
            # get_or_create replaces the race-prone filter()[:1]/exists()
            # lookup and matches the pattern AsyncHtmlParser.writer uses.
            url_in_text, _created = UrlInText.objects.get_or_create(url=url)
            url_in_text.news.add(news_text_obj.news)
        news_text_obj.text = aParser.remove_all_tags(news_text_obj.text)
        news_text_obj.is_parsed = True
        news_text_obj.save()
        print("news_text_id {}".format(news_text_obj.pk))
# TODO: process only texts whose links have already been extracted. Set a flag to guard against repeated work.
class AsyncTextPreparer(multiproc.MultiProc):
    """Re-run the text preparer over every stored NewsText (best effort)."""

    # NOTE(review): iterates ALL NewsText rows — there is no "already
    # prepared" flag, so every run re-processes everything.
    task_manager = NewsText.objects.iterator

    @staticmethod
    def writer(write_obj):
        """Store the refined text back on its NewsText row."""
        news_text, refined_text = write_obj
        news_text.text = refined_text
        news_text.save()

    @staticmethod
    def worker(news_text):
        """Refine one text; on failure report it and skip (returns None)."""
        try:
            print(news_text.pk)
            refined_text = text_prerparer.text_preparer(news_text.text)
            return news_text, refined_text
        except Exception as exc:
            # Was a bare `except:` that also swallowed KeyboardInterrupt
            # and hid the reason. Keep the best-effort skip, but say why.
            print(news_text)
            print(exc)
            return None
class AsyncHtmlParser(multiproc.MultiProc):
    """Single-pass pipeline: download, extract text, collect links, refine."""

    # Feed: every News row not yet marked as parsed.
    task_manager = News.objects.filter(is_parsed=False).iterator

    @staticmethod
    def worker(news):
        """Fetch one News url and build its NewsText plus outgoing links.

        Returns ``(NewsText, url_list)``, or None when the download fails
        (the row then stays is_parsed=False and is retried on the next run).
        """
        html = urlOpen.get_html(news.url)
        print(str(news.pk) + " ", end='\n')
        if not html:
            return None
        text = textParser.get_text_from_html(html)
        # Collect links before the tags are stripped out of the text.
        url_list = list(aParser.get_a_from_news_text(news_url=news.url, text=text))
        text = aParser.remove_all_tags(text)
        text = text_prerparer.text_preparer(text)
        return NewsText(news=news, text=text), url_list

    @staticmethod
    def writer(container):
        """Persist the text and link rows, then mark the News row parsed."""
        news_text, url_list = container
        news_text.save()
        for url in url_list:
            url_in_text, created = UrlInText.objects.get_or_create(url=url)
            url_in_text.news.add(news_text.news)
        news_text.news.is_parsed = True
        news_text.news.save()
def main():
    """Entry point: run the combined pipeline with 7 worker processes."""
    # Older single-stage runners, kept for manual use:
    # HtmlParser().run()
    # NewsTextParser().run()
    # AsyncTextPreparer().run()
    AsyncHtmlParser(7).run()


if __name__ == "__main__":
    main()