Example #1
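The snippets in this example use several names that are never shown: MAIN_URL, MAIN_URL_PAGE_FROM2, BASE_URL, NUMBER_PAGES and parse_article_pages come from elsewhere in the project. Below is a hedged sketch of the imports and the Article model the code appears to rely on, reconstructed from the calls it makes.

# Sketch only: inferred from the calls below, not taken from the original project.
import json
import sqlite3
import time

import requests
import sqlalchemy
from bs4 import BeautifulSoup
from flask_sqlalchemy import SQLAlchemy
from googletrans import Translator
from sqlalchemy.exc import SQLAlchemyError

db = SQLAlchemy()  # in the original project this object is imported from flask_app.app


class Article(db.Model):
    # columns inferred from the Article(...) constructor calls in this example
    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.Text)
    title_en = db.Column(db.Text)
    text = db.Column(db.Text)
    date = db.Column(db.String(64))
    resource = db.Column(db.String(256))
    url = db.Column(db.String(512))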
def recreate_db():
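    # reflect the current schema so drop_all() removes every existing table, then rebuild it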
    try:
        db.reflect()
        db.drop_all()
    except SQLAlchemyError as e:
        raise ValueError(e)

    db.create_all()
    db.session.commit()
def parse_main_pages():
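    # derive the next primary key from the repr of the newest Article ("... id=N title=...");
    # int() raises ValueError when the table is empty, so max_id falls back to 1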
    try:
        last_article = db.session.query(Article).order_by(
            Article.id.desc()).first()
        max_id_pos_start = str(last_article).find("id=")
        max_id_pos_end = str(last_article).find("title=")
        max_id = str(last_article)[max_id_pos_start + 3:max_id_pos_end - 2]
        max_id = int(max_id) + 1
    except ValueError:
        max_id = 1

    print("max_id", max_id)
    flag_old_news = 0
    n_page = 0
    while flag_old_news != 1:
        n_page = n_page + 1
        article_date = ""
        print("n_page", n_page)
        if n_page == 1:
            url = MAIN_URL

        else:
            url = MAIN_URL_PAGE_FROM2 + str(n_page) + '.html'

        html_page = requests.get(
            url,
            headers={
                "user-agent":
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0)"
                "Gecko/20100101 Firefox/74.0"
            }).text

        soup = BeautifulSoup(html_page, 'html.parser')
        soup = soup.find_all("div", {"class": "st__news-list"})
        all_articles = soup[0].find_all("li", {"class": ""})
        all_articles2 = soup[0].find_all(
            "li", {"class": "st__news  st__hot-news_with-photo"})
        all_articles3 = soup[0].find_all(
            "li", {"class": "st__news st__news_with-photo"})
        for li_link_article in all_articles2:
            all_articles.append(li_link_article)

        for li_link_article2 in all_articles3:
            all_articles.append(li_link_article2)

        print("all_articles", all_articles)

        for article in all_articles:

            print()
            all_url_tag = article.find_all("a")
            # if all_url_tag is None or all_url_tag == "":

            print("all_url_tag", all_url_tag)
            try:
                url_article = all_url_tag[0].get("href")
                url_article = BASE_URL + str(url_article)
                print("url_article", url_article)

            except IndexError:  # the <li> holds no <a> tag, so there is no link to follow
                print("CONTINUE")
                continue

            article_title, article_date, article_text = parse_article_pages(
                url_article)
            article_text = str(article_text).strip()
            print("article_title", article_title)
            if article_title == "":
                continue

            try:
                db.session.rollback()
                if db.session.query(Article.id).filter_by(
                        title=article_title).scalar() is not None:
                    print("Found in db")
                    continue

            except sqlite3.IntegrityError:
                continue
            except sqlalchemy.exc.IntegrityError:
                continue

            resource = "https://euvsdisinfo.eu/"
            print("article_text", article_text)
            print("article_date", article_date)

            translator = Translator()
            try:
                src_lang = translator.translate(article_title).src
            except json.decoder.JSONDecodeError:
                time.sleep(3)
                translator = Translator()
                src_lang = translator.translate(article_title).src

            # REINITIALIZE THE API
            translator = Translator()
            try:
                translated = translator.translate(article_title,
                                                  src=src_lang,
                                                  dest="en")
                article_title_en = translated.text
            except Exception as e:
                print(str(e))
                article_title_en = ""

            print("article_title_en", article_title_en)
            article_text = str(article_text).strip()
            article_date = str(article_date).strip()

            new_article = Article(id=max_id,
                                  title=article_title,
                                  title_en=article_title_en,
                                  text=article_text,
                                  date=article_date,
                                  resource=resource,
                                  url=url_article)

            max_id += 1

            try:
                db.session.add(new_article)
                db.session.commit()
                db.session.flush()
                db.create_all()
            except sqlalchemy.exc.IntegrityError:
                continue
            except sqlalchemy.exc.DataError:
                continue

        if n_page >= 700:
            flag_old_news = 1
def parse_main_pages():
    """parse main pages to get main url and titles"""
    urls_article = []
    n_article = -1
    try:
        last_article = db.session.query(ArticleFakeChecker2).order_by(ArticleFakeChecker2.id.desc()).first()
        max_id_pos_start = str(last_article).find("id=")
        max_id_pos_end = str(last_article).find("title=")
        max_id = str(last_article)[max_id_pos_start + 3: max_id_pos_end - 2]
        max_id = int(max_id) + 1
    except ValueError:
        max_id = 1

    print("max_id", max_id)
    for n_page in range(44, NUMBER_PAGES):
        print("n_page", n_page + 1)
        if n_page + 1 == 1:
            url = MAIN_URL

        else:
            url = MAIN_URL_PAGE_FROM2 + str(n_page + 1) + '/'

        html_page = requests.get(url,
                                 headers={
                                     "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0)"
                                                   "Gecko/20100101 Firefox/74.0"}, verify=False).text
        soup = BeautifulSoup(html_page, 'html.parser')
        all_articles = soup.find_all("article")
        flag_video = 0

        for article in all_articles:
            all_span = article.find_all("span", {"class": "post-category"})
            n_article += 1
            for span in all_span:
                if "Відео" in str(span):
                    flag_video = 1
                    break

            if flag_video == 1:
                flag_video = 0
                continue

            all_a = article.find_all("a")
            url_article = all_a[0].get("href")
            urls_article.append(url_article)
            print()
            article_title, article_date, article_text = parse_article_pages(url_article)
            article_text = str(article_text).strip()
            print("article_title", article_title)
            try:
                db.session.rollback()
                if db.session.query(ArticleFakeChecker2.id).filter_by(title=article_title).scalar() is not None:
                    print("Found in db")
                    continue

            except sqlite3.IntegrityError:
                continue
            except sqlalchemy.exc.IntegrityError:
                continue

            translator = Translator()
            try:
                src_lang = translator.translate(article_title).src
            except json.decoder.JSONDecodeError:
                time.sleep(3)
                translator = Translator()
                src_lang = translator.translate(article_title).src

            # REINITIALIZE THE API
            translator = Translator()
            try:
                translated = translator.translate(article_title, src=src_lang, dest="en")
                article_title_en = translated.text
            except Exception as e:
                print(str(e))
                article_title_en = ""

            resource_end_pos = url_article.find("/uk")
            resource = url_article[:resource_end_pos + 3]
            print("article_text", article_text)

            new_article = ArticleFakeChecker2(id=max_id,
                                              title=article_title,
                                              title_en=article_title_en,
                                              text=article_text,
                                              date=article_date,
                                              resource=resource,
                                              url=url_article)

            max_id += 1

            try:
                db.session.add(new_article)
                db.session.commit()
                db.session.flush()
                db.create_all()
            except sqlalchemy.exc.IntegrityError:
                continue
            except sqlalchemy.exc.DataError:
                continue

        try:
            db.session.rollback()
            db.session.commit()
            db.create_all()
        except sqlalchemy.exc.IntegrityError:
            continue
def parse_all_pages(filename):
    """parse main pages to get all data"""
    with open(filename, "r", encoding="utf-8") as file:
        urls_article = json.load(file)

    try:
        last_article = db.session.query(Article).order_by(
            Article.id.desc()).first()
        max_id_pos_start = str(last_article).find("id=")
        max_id_pos_end = str(last_article).find("title=")
        max_id = str(last_article)[max_id_pos_start + 3:max_id_pos_end - 2]
        max_id = int(max_id) + 1
    except ValueError:
        max_id = 1
    print("max_id", max_id)

    for url in urls_article["urls_explorer"]:
        html_page = requests.get(
            url,
            headers={
                "user-agent":
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0)"
                "Gecko/20100101 Firefox/74.0"
            },
            verify=False).text
        soup = BeautifulSoup(html_page, 'html.parser')

        try:
            all_h = soup.find_all("h1", {"class": "news-full__title"})
            article_title = all_h[0].string
            article_title = str(article_title).strip()
        except Exception as e:
            print(str(e))
            continue
        print()
        print("article_title", article_title)

        try:
            db.session.rollback()
            if db.session.query(Article.id).filter_by(
                    title=article_title).scalar() is not None:
                print("Found in db")
                continue

        except sqlite3.IntegrityError:
            continue
        except sqlalchemy.exc.IntegrityError:
            continue

        try:
            all_div = soup.find_all(
                "div", {"class": "news-full__text io-article-body"})
            article_text = all_div[0]
            article_text = BeautifulSoup(str(article_text).strip(),
                                         "lxml").text
            article_text = str(article_text).strip()

        except Exception as e:
            print(str(e))
            article_text = ""

        print("article_text", article_text)

        try:
            article_date = soup.find_all(
                "time", {"class": "news-full__date--create"})[0].string
            article_date = str(article_date).strip()

        except Exception as e:
            print(str(e))
            article_date = ""

        print("article_date", article_date)
        resource = "https://www.obozrevatel.com/"

        if article_title != "":
            article_title_en = translate_title(article_title)
        else:
            article_title_en = ""

        new_article = Article(id=max_id,
                              title=article_title,
                              title_en=article_title_en,
                              text=article_text,
                              date=article_date,
                              resource=resource,
                              url=url)

        max_id += 1
        print("article_title_en", article_title_en)

        try:
            db.session.rollback()
            db.session.add(new_article)
            db.session.commit()
            db.session.flush()
            db.create_all()
        except sqlalchemy.exc.IntegrityError:
            continue
        except sqlalchemy.exc.DataError:
            continue
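A minimal way to drive the functions above, assuming db is bound to a Flask app object (recent Flask-SQLAlchemy versions need an active application context for db.session; the app import path is an assumption):

# Hypothetical driver; `app` is assumed to live alongside `db` in flask_app.app.
from flask_app.app import app

if __name__ == "__main__":
    with app.app_context():
        recreate_db()        # drop and rebuild the tables
        parse_main_pages()   # crawl the listing pages and store any new articles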
Example #5
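This example writes to an ArticleFakeChecker2 model and calls a translate_title() helper, both defined elsewhere in the project. A hedged sketch of the model, with columns inferred from the constructor calls below:

# Sketch only: ArticleFakeChecker2 appears to mirror the Article model from Example #1.
class ArticleFakeChecker2(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.Text)
    title_en = db.Column(db.Text)
    text = db.Column(db.Text)
    date = db.Column(db.String(64))
    resource = db.Column(db.String(256))
    url = db.Column(db.String(512))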
def init_request():
    db.create_all()
def parse_main_pages(site_name, url_main, url_page2, class_articles,
                     class_title, class_date, class_text):
    """parse pages to get main information"""
    try:
        last_article = db.session.query(ArticleFakeChecker2).order_by(
            ArticleFakeChecker2.id.desc()).first()
        max_id_pos_start = str(last_article).find("id=")
        max_id_pos_end = str(last_article).find("title=")
        max_id = str(last_article)[max_id_pos_start + 3:max_id_pos_end - 2]
        max_id = int(max_id) + 1
    except ValueError:
        max_id = 1

    print("max_id", max_id)
    flag_old_news = 0
    n_page = 0
    while flag_old_news != 1:
        n_page = n_page + 1
        article_date = ""
        print("n_page", n_page)
        if n_page == 1:
            url = url_main

        else:
            url = url_page2 + str(n_page + 1) + '/'

        html_page = requests.get(
            url,
            headers={
                "user-agent":
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0)"
                "Gecko/20100101 Firefox/74.0"
            },
            verify=False).text

        soup = BeautifulSoup(html_page, 'html.parser')
        all_articles = soup.find_all("a", {"class": class_articles})

        for article in all_articles:

            print()
            url_article = article.get("href")

            article_title, article_date, article_text = parse_article_pages(
                url_article, class_title, class_date, class_text)
            article_text = str(article_text).strip()
            print("article_title", article_title)
            try:
                db.session.rollback()
                if db.session.query(ArticleFakeChecker2.id).filter_by(
                        title=article_title).scalar() is not None:
                    print("Found in db")
                    continue

            except sqlite3.IntegrityError:
                continue
            except sqlalchemy.exc.IntegrityError:
                continue

            if site_name == "euvsdisinfo":
                resource = "https://euvsdisinfo.eu/"

                new_article = ArticleFakeChecker2(id=max_id,
                                                  title=article_title,
                                                  title_en=article_title,
                                                  text=article_text,
                                                  date=article_date,
                                                  resource=resource,
                                                  url=url_article)
            elif site_name == "obozrevatel":
                resource = "https://www.obozrevatel.com/"

                if article_title != "":
                    article_title_en = translate_title(article_title)
                else:
                    article_title_en = ""

                new_article = Article(id=max_id,
                                      title=article_title,
                                      title_en=article_title_en,
                                      text=article_text,
                                      date=article_date,
                                      resource=resource,
                                      url=url_article)

            print("article_text", article_text)
            print("article_date", article_date)
            max_id += 1

            try:
                db.session.add(new_article)
                db.session.commit()
                db.session.flush()
                db.create_all()
            except sqlalchemy.exc.IntegrityError:
                continue
            except sqlalchemy.exc.DataError:
                continue

        if str(article_date).split(
                ", ")[-1].strip() == "2018" or n_page >= 700:
            flag_old_news = 1

        try:
            db.session.rollback()
            db.session.commit()
            db.create_all()
        except sqlalchemy.exc.IntegrityError:
            continue
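A sketch of how this parametrized parser might be invoked; every argument value below is an illustrative placeholder rather than a value taken from the original project:

# Hypothetical call; the URLs and CSS class names are placeholders.
parse_main_pages(
    site_name="obozrevatel",
    url_main="https://www.obozrevatel.com/",
    url_page2="https://www.obozrevatel.com/page/",
    class_articles="b-post__link",
    class_title="news-full__title",
    class_date="news-full__date--create",
    class_text="news-full__text io-article-body",
)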
Example #7
from flask_app.app import db 

db.create_all()
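Besides db, this snippet uses several modules whose imports are not shown; MAIN_URL, MAIN_URL_PAGE_FROM2, NUMBER_PAGES, the Article model and parse_article_pages are also assumed to come from the surrounding project. A plausible completion of the import section:

# Sketch only: the remaining imports this snippet appears to need.
import json
import sqlite3
import time

import requests
import sqlalchemy
from bs4 import BeautifulSoup
from googletrans import Translator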
def parse_main_pages():
    """parse main pages to get all data"""
    try:
        last_article = db.session.query(Article).order_by(
            Article.id.desc()).first()
        max_id_pos_start = str(last_article).find("id=")
        max_id_pos_end = str(last_article).find("title=")
        max_id = str(last_article)[max_id_pos_start + 3:max_id_pos_end - 2]
        max_id = int(max_id) + 1
    except ValueError:
        max_id = 1

    n_article = -1

    print("max_id", max_id)
    for n_page in range(500, NUMBER_PAGES):
        print("n_page", n_page + 1)
        if n_page + 1 == 1:
            url = MAIN_URL

        else:
            url = MAIN_URL_PAGE_FROM2 + str(n_page + 1) + '/'

        stop_requests = 0
        flag_bad_request_for_page = 0

        session = requests.Session()
        session.max_redirects = 60

        # keep re-requesting the page until it answers with HTTP 200
        while session.get(url,
                          headers={
                              "user-agent":
                              "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0)"
                              " Gecko/20100101 Firefox/74.0"
                          }).status_code != 200:
            time.sleep(3)

            stop_requests += 1
            if stop_requests == 10:
                print()
                print("error!!!!!!!!!!!!!!!", session.get(url))
                flag_bad_request_for_page = 1
                break

        if flag_bad_request_for_page == 1:
            continue

        html_page = session.get(
            url,
            headers={
                "user-agent":
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0)"
                "Gecko/20100101 Firefox/74.0"
            }).text
        soup = BeautifulSoup(html_page, 'html.parser')

        all_span = soup.find_all("a", {"class": "lbn_link"})
        n_article += 1
        for span in all_span:
            if "Відео" in str(span):

                continue

            url_article = span.get("href")
            print()
            article_title, article_date, article_text = parse_article_pages(
                url_article)
            if article_title is None:
                continue

            article_text = str(article_text).strip()
            print("article_title", article_title)
            try:
                db.session.rollback()
                if db.session.query(Article.id).filter_by(
                        title=article_title).scalar() is not None:
                    print("Found in db")
                    continue

            except sqlite3.IntegrityError:
                continue
            except sqlalchemy.exc.IntegrityError:
                continue

            translator = Translator()
            try:
                src_lang = translator.translate(article_title).src
            except json.decoder.JSONDecodeError:
                time.sleep(3)
                translator = Translator()
                src_lang = translator.translate(article_title).src

            # REINITIALIZE THE API
            translator = Translator()
            try:
                translated = translator.translate(article_title,
                                                  src=src_lang,
                                                  dest="en")
                article_title_en = translated.text
            except Exception as e:
                print(str(e))
                article_title_en = ""

            resource_end_pos = url_article.find("/ua")
            resource = url_article[:resource_end_pos + 3]
            print("article_text", article_text)

            new_article = Article(id=max_id,
                                  title=article_title,
                                  title_en=article_title_en,
                                  text=article_text,
                                  date=article_date,
                                  resource=resource,
                                  url=url_article)

            max_id += 1

            try:
                db.session.add(new_article)
                db.session.commit()
                db.session.flush()
                db.create_all()
            except sqlalchemy.exc.IntegrityError:
                continue

        try:
            db.session.rollback()
            db.session.commit()
            db.create_all()
        except sqlalchemy.exc.IntegrityError:
            continue