Exemplo n.º 1
0
def main():
    """Fetch the hard-coded start page and print its same-site links.

    The page is downloaded with ``urlOpen.get_html``, passed through
    ``textParser.tags_filter_head_and_script`` and handed to
    ``get_url_and_url_text``.  Every ``(url, text)`` pair whose URL starts
    with the start-page address is printed.
    """
    base_url = "http://gazeta.ru/"
    page = textParser.tags_filter_head_and_script(urlOpen.get_html(base_url))
    for link, caption in get_url_and_url_text(page, base_url):
        if not link.startswith(base_url):
            # Skip links that do not begin with the start-page address.
            continue
        # The format string ends with a newline and print() appends another,
        # so consecutive entries are separated by an empty line.
        print("{} {}\n".format(link, caption))
Exemplo n.º 2
0
def main():
    """Print every link on the start page whose URL begins with that page's URL.

    Steps: download the HTML via ``urlOpen.get_html``, run it through
    ``textParser.tags_filter_head_and_script``, extract ``(url, text)``
    pairs with ``get_url_and_url_text`` and print the matching ones.
    """
    start_page = "http://gazeta.ru/"
    raw_html = urlOpen.get_html(start_page)
    cleaned_html = textParser.tags_filter_head_and_script(raw_html)

    # Lazily keep only the pairs whose URL starts with the start-page address.
    same_site = (
        (href, label)
        for href, label in get_url_and_url_text(cleaned_html, start_page)
        if href.startswith(start_page)
    )
    for href, label in same_site:
        print("{} {}\n".format(href, label))
Exemplo n.º 3
0
def parse(source_url: str):
    """Yield one ``model.NewsData`` per link extracted from *source_url*.

    The page is fetched with ``urlOpen.get_html`` and filtered by
    ``textParser.tags_filter_head_and_script`` before
    ``get_url_and_url_text`` is asked for ``(url, text)`` pairs.

    Args:
        source_url: Address of the page to download; it is also passed to
            ``get_url_and_url_text`` together with the filtered HTML.

    Yields:
        ``model.NewsData`` with ``url`` and ``title`` taken from the pair,
        ``pub_date`` set to the current local time and ``summary`` of None.
    """
    page = urlOpen.get_html(source_url)
    page = textParser.tags_filter_head_and_script(page)
    for link, caption in get_url_and_url_text(page, source_url):
        # The timestamp is taken per item, at the moment it is produced.
        record = model.NewsData(
            url=link,
            title=caption,
            pub_date=datetime.now(),
            summary=None,
        )
        yield record
Exemplo n.º 4
0
 def worker(news):
     """Download one news page and build its ``NewsText`` plus the links in it.

     Args:
         news: Object exposing ``url`` and ``pk``.

     Returns:
         A ``(NewsText, url_list)`` tuple when ``urlOpen.get_html`` returns a
         truthy value, otherwise None.
     """
     page = urlOpen.get_html(news.url)
     print(str(news.pk) + "     ", end='\n')
     if not page:
         return None

     body = textParser.get_text_from_html(page)
     # Links are collected first, while the text still contains its tags.
     links = list(aParser.get_a_from_news_text(news_url=news.url, text=body))
     body = aParser.remove_all_tags(body)
     body = text_prerparer.text_preparer(body)
     return NewsText(news=news, text=body), links
Exemplo n.º 5
0
def parse_news(n=None):
    """Download and store the text of news items not yet marked as parsed.

    Args:
        n: Upper bound used to slice the queryset of unparsed ``News`` rows;
            None applies no bound.

    For each item the HTML is fetched; when it is truthy, the extracted text
    is stored as a ``NewsText`` row and the item is saved with
    ``is_parsed = True``.  Items whose download yields a falsy value are left
    untouched.
    """
    pending = News.objects.filter(is_parsed=False)[:n]
    for item in pending.iterator():
        print(str(item.id) + "     ", end='\n')
        page = urlOpen.get_html(item.url)  # author's timing note: 0.19 - 2.5 s
        if not page:
            continue
        extracted = textParser.get_text_from_html(page)  # author's timing note: 0.0099 - 0.026 s
        NewsText.objects.create(news=item, text=extracted)
        item.is_parsed = True
        item.save()  # author's timing note: 0.004 with atomic and 0.23 without
Exemplo n.º 6
0
def parse_news(n=None):
    """Fetch, extract and persist the text of unparsed ``News`` records.

    Args:
        n: Slice limit applied to the ``is_parsed=False`` queryset; with the
            default of None the queryset is not limited.

    A record is flagged ``is_parsed`` and saved only after its ``NewsText``
    row has been created; records for which ``urlOpen.get_html`` returns a
    falsy value are skipped.
    """
    unparsed = News.objects.filter(is_parsed=False)
    limited = unparsed[:n]

    for entry in limited.iterator():
        print(str(entry.id) + "     ", end='\n')

        # Timing noted by the original author: 0.19 - 2.5 s
        markup = urlOpen.get_html(entry.url)
        if markup:
            # Timing noted by the original author: 0.0099 - 0.026 s
            body = textParser.get_text_from_html(markup)
            NewsText.objects.create(news=entry, text=body)
            entry.is_parsed = True
            # Timing noted by the original author: 0.004 with atomic and 0.23 without
            entry.save()
Exemplo n.º 7
0
 def worker(news):
     """Turn a news item into an unsaved ``NewsText`` and the links found in it.

     Args:
         news: Object exposing ``url`` and ``pk``.

     Returns:
         ``(NewsText(news=news, text=...), url_list)`` if the download
         produced a truthy value; None otherwise.
     """
     markup = urlOpen.get_html(news.url)
     print(str(news.pk) + "     ", end='\n')
     if not markup:
         return None

     tagged_text = textParser.get_text_from_html(markup)
     # Materialise the links before the tags are removed from the text.
     found_urls = list(
         aParser.get_a_from_news_text(news_url=news.url, text=tagged_text)
     )
     cleaned = text_prerparer.text_preparer(aParser.remove_all_tags(tagged_text))
     return NewsText(news=news, text=cleaned), found_urls
Exemplo n.º 8
0
def worker(input_q: JoinableQueue, output: Queue):
    """Consume tasks from *input_q* until the ``"end"`` sentinel arrives.

    For every task the page at ``task.url`` is downloaded and, when the
    download yields a truthy value, run through
    ``textParser.get_text_from_html``.  The task URL is then put on
    *output*, whether or not any HTML was obtained.

    Args:
        input_q: Joinable queue of task objects exposing ``url``; the string
            ``"end"`` tells the worker to stop.
        output: Queue that receives the URL of each processed task.

    Every item taken from *input_q*, including the sentinel, is acknowledged
    with ``task_done()``, so a producer blocked in ``input_q.join()`` is
    released even when processing a task raises.
    """
    from django import db

    # NOTE(review): presumably closes the connection inherited from the parent
    # process so this worker opens its own on first use -- confirm.
    db.connection.close()

    while True:
        task = input_q.get()
        if task == "end":
            # The sentinel was fetched with get() too; acknowledge it so it
            # does not keep join() waiting.
            input_q.task_done()
            break
        try:
            html = urlOpen.get_html(task.url)
            if html:
                # The extracted text is not used here; the call is kept so
                # the worker performs the same work as before.
                textParser.get_text_from_html(html)
        finally:
            # Acknowledge the task even if fetching or parsing failed.
            input_q.task_done()
        output.put(task.url)
    print("exit")
Exemplo n.º 9
0
 def worker(news):
     """Download a news page and wrap its extracted text in a ``NewsText``.

     Args:
         news: Object exposing ``id`` and ``url``.

     Returns:
         An unsaved ``NewsText(news=news, text=...)`` when
         ``urlOpen.get_html`` returns a truthy value, otherwise None.
     """
     print(str(news.id) + "     ", end='\n')
     page = urlOpen.get_html(news.url)
     if not page:
         return None
     extracted = textParser.get_text_from_html(page)
     return NewsText(news=news, text=extracted)
Exemplo n.º 10
0
def parse(source_url: str):
    """Generate ``model.NewsData`` items for the links found at *source_url*.

    Args:
        source_url: Page to download with ``urlOpen.get_html``; also passed
            to ``get_url_and_url_text`` alongside the filtered HTML.

    Yields:
        ``model.NewsData`` built from each ``(url, text)`` pair, with
        ``pub_date`` set to the current local time and ``summary`` of None.
    """
    filtered = textParser.tags_filter_head_and_script(urlOpen.get_html(source_url))
    for href, label in get_url_and_url_text(filtered, source_url):
        # Keyword arguments are assembled first; the timestamp is taken per item.
        fields = {
            "url": href,
            "title": label,
            "pub_date": datetime.now(),
            "summary": None,
        }
        yield model.NewsData(**fields)
Exemplo n.º 11
0
 def worker(news):
     """Build an unsaved ``NewsText`` from the page behind ``news.url``.

     Args:
         news: Object exposing ``id`` and ``url``.

     Returns:
         ``NewsText(news=news, text=...)`` if the download yields a truthy
         value; None when it does not.
     """
     progress_line = str(news.id) + "     "
     print(progress_line, end='\n')

     markup = urlOpen.get_html(news.url)
     if markup:
         body = textParser.get_text_from_html(markup)
         result = NewsText(news=news, text=body)
         return result
     return None