def recreate_db():
    try:
        db.reflect()
        db.drop_all()
    except SQLAlchemyError as e:
        raise ValueError(e)
    db.create_all()
    db.session.commit()
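# A minimal usage sketch, assuming the Flask-SQLAlchemy `db` used above is
# bound to an application object named `app` (that name is an assumption and
# is not defined in this section):
#
#     from flask_app.app import app
#     with app.app_context():
#         recreate_db()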
def parse_main_pages():
    """Parse the news listing pages and store new articles in the database."""
    try:
        last_article = db.session.query(Article).order_by(
            Article.id.desc()).first()
        max_id_pos_start = str(last_article).find("id=")
        max_id_pos_end = str(last_article).find("title=")
        max_id = str(last_article)[max_id_pos_start + 3:max_id_pos_end - 2]
        max_id = int(max_id) + 1
    except ValueError:
        max_id = 1
    print("max_id", max_id)
    flag_old_news = 0
    n_page = 0
    while flag_old_news != 1:
        n_page = n_page + 1
        article_date = ""
        print("n_page", n_page)
        if n_page == 1:
            url = MAIN_URL
        else:
            url = MAIN_URL_PAGE_FROM2 + str(n_page) + '.html'
        html_page = requests.get(
            url,
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) "
                              "Gecko/20100101 Firefox/74.0"
            }).text
        soup = BeautifulSoup(html_page, 'html.parser')
        soup = soup.find_all("div", {"class": "st__news-list"})
        # collect plain news items plus both "with photo" variants
        all_articles = soup[0].find_all("li", {"class": ""})
        all_articles2 = soup[0].find_all(
            "li", {"class": "st__news st__hot-news_with-photo"})
        all_articles3 = soup[0].find_all(
            "li", {"class": "st__news st__news_with-photo"})
        for li_link_article in all_articles2:
            all_articles.append(li_link_article)
        for li_link_article2 in all_articles3:
            all_articles.append(li_link_article2)
        print("all_articles", all_articles)
        for article in all_articles:
            print()
            all_url_tag = article.find_all("a")
            print("all_url_tag", all_url_tag)
            try:
                url_article = all_url_tag[0].get("href")
                url_article = BASE_URL + str(url_article)
                print("url_article", url_article)
            except (IndexError, AttributeError):
                print("CONTINUE")
                continue
            article_title, article_date, article_text = parse_article_pages(
                url_article)
            article_text = str(article_text).strip()
            print("article_title", article_title)
            if article_title == "":
                continue
            try:
                db.session.rollback()
                if db.session.query(Article.id).filter_by(
                        title=article_title).scalar() is not None:
                    print("Found in db")
                    continue
            except sqlite3.IntegrityError:
                continue
            except sqlalchemy.exc.IntegrityError:
                continue
            resource = "https://euvsdisinfo.eu/"
            print("article_text", article_text)
            print("article_date", article_date)
            translator = Translator()
            try:
                src_lang = translator.translate(article_title).src
            except json.decoder.JSONDecodeError:
                time.sleep(3)
                translator = Translator()
                src_lang = translator.translate(article_title).src
            # reinitialize the API before the actual translation
            translator = Translator()
            try:
                translated = translator.translate(article_title,
                                                  src=src_lang,
                                                  dest="en")
                article_title_en = translated.text
            except Exception as e:
                print(str(e))
                article_title_en = ""
            print("article_title_en", article_title_en)
            article_text = str(article_text).strip()
            article_date = str(article_date).strip()
            new_article = Article(id=max_id,
                                  title=article_title,
                                  title_en=article_title_en,
                                  text=article_text,
                                  date=article_date,
                                  resource=resource,
                                  url=url_article)
            max_id += 1
            try:
                db.session.add(new_article)
                db.session.commit()
                db.session.flush()
                db.create_all()
            except sqlalchemy.exc.IntegrityError:
                continue
            except sqlalchemy.exc.DataError:
                continue
        if n_page >= 700:
            flag_old_news = 1
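# The max_id blocks above recover the last primary key by parsing the model's
# repr() string, which only works while __repr__ happens to contain "id=" and
# "title=".  A more robust sketch, an assumption rather than the project's
# current code, asks the database for the maximum id directly:
from sqlalchemy import func


def next_article_id(model):
    """Return max(id) + 1 for the given model, or 1 if the table is empty."""
    current_max = db.session.query(func.max(model.id)).scalar()
    return (current_max or 0) + 1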
def parse_main_pages():
    """Parse the main pages to collect article urls and titles."""
    urls_article = []
    n_article = -1
    try:
        last_article = db.session.query(ArticleFakeChecker2).order_by(
            ArticleFakeChecker2.id.desc()).first()
        max_id_pos_start = str(last_article).find("id=")
        max_id_pos_end = str(last_article).find("title=")
        max_id = str(last_article)[max_id_pos_start + 3:max_id_pos_end - 2]
        max_id = int(max_id) + 1
    except ValueError:
        max_id = 1
    print("max_id", max_id)
    for n_page in range(44, NUMBER_PAGES):
        print("n_page", n_page + 1)
        if n_page + 1 == 1:
            url = MAIN_URL
        else:
            url = MAIN_URL_PAGE_FROM2 + str(n_page + 1) + '/'
        html_page = requests.get(
            url,
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) "
                              "Gecko/20100101 Firefox/74.0"
            },
            verify=False).text
        soup = BeautifulSoup(html_page, 'html.parser')
        all_articles = soup.find_all("article")
        flag_video = 0
        for article in all_articles:
            all_span = article.find_all("span", {"class": "post-category"})
            n_article += 1
            for span in all_span:
                # skip video-only posts ("Відео" means "Video")
                if "Відео" in str(span):
                    flag_video = 1
                    break
            if flag_video == 1:
                flag_video = 0
                continue
            all_a = article.find_all("a")
            url_article = all_a[0].get("href")
            urls_article.append(url_article)
            print()
            article_title, article_date, article_text = parse_article_pages(
                url_article)
            article_text = str(article_text).strip()
            print("article_title", article_title)
            try:
                db.session.rollback()
                if db.session.query(ArticleFakeChecker2.id).filter_by(
                        title=article_title).scalar() is not None:
                    print("Found in db")
                    continue
            except sqlite3.IntegrityError:
                continue
            except sqlalchemy.exc.IntegrityError:
                continue
            translator = Translator()
            try:
                src_lang = translator.translate(article_title).src
            except json.decoder.JSONDecodeError:
                time.sleep(3)
                translator = Translator()
                src_lang = translator.translate(article_title).src
            # reinitialize the API before the actual translation
            translator = Translator()
            try:
                translated = translator.translate(article_title,
                                                  src=src_lang,
                                                  dest="en")
                article_title_en = translated.text
            except Exception as e:
                print(str(e))
                article_title_en = ""
            resource_end_pos = url_article.find("/uk")
            resource = url_article[:resource_end_pos + 3]
            print("article_text", article_text)
            new_article = ArticleFakeChecker2(id=max_id,
                                              title=article_title,
                                              title_en=article_title_en,
                                              text=article_text,
                                              date=article_date,
                                              resource=resource,
                                              url=url_article)
            max_id += 1
            try:
                db.session.add(new_article)
                db.session.commit()
                db.session.flush()
                db.create_all()
            except sqlalchemy.exc.IntegrityError:
                continue
            except sqlalchemy.exc.DataError:
                continue
        try:
            db.session.rollback()
            db.session.commit()
            db.create_all()
        except sqlalchemy.exc.IntegrityError:
            continue
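# The resource above is derived by slicing at url_article.find("/uk").  A
# sketch of the same idea with urllib.parse, assuming the resource is meant
# to be the site root; this may differ from the "/uk"-suffixed value the
# project stores, and the helper name is hypothetical:
from urllib.parse import urlsplit


def site_root(article_url):
    """Return "scheme://host/" for an article url."""
    parts = urlsplit(article_url)
    return parts.scheme + "://" + parts.netloc + "/"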
def parse_all_pages(filename):
    """Parse every saved article url from `filename` and store the full data."""
    with open(filename, "r", encoding="utf-8") as file:
        urls_article = json.load(file)
    try:
        last_article = db.session.query(Article).order_by(
            Article.id.desc()).first()
        max_id_pos_start = str(last_article).find("id=")
        max_id_pos_end = str(last_article).find("title=")
        max_id = str(last_article)[max_id_pos_start + 3:max_id_pos_end - 2]
        max_id = int(max_id) + 1
    except ValueError:
        max_id = 1
    print("max_id", max_id)
    for url in urls_article["urls_explorer"]:
        html_page = requests.get(
            url,
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) "
                              "Gecko/20100101 Firefox/74.0"
            },
            verify=False).text
        soup = BeautifulSoup(html_page, 'html.parser')
        try:
            all_h = soup.find_all("h1", {"class": "news-full__title"})
            article_title = all_h[0].string
            article_title = str(article_title).strip()
        except Exception as e:
            print(str(e))
            continue
        print()
        print("article_title", article_title)
        try:
            db.session.rollback()
            if db.session.query(Article.id).filter_by(
                    title=article_title).scalar() is not None:
                print("Found in db")
                continue
        except sqlite3.IntegrityError:
            continue
        except sqlalchemy.exc.IntegrityError:
            continue
        try:
            all_div = soup.find_all(
                "div", {"class": "news-full__text io-article-body"})
            article_text = all_div[0]
            article_text = BeautifulSoup(str(article_text).strip(),
                                         "lxml").text
            article_text = str(article_text).strip()
        except Exception as e:
            print(str(e))
            article_text = ""
        print("article_text", article_text)
        try:
            article_date = soup.find_all(
                "time", {"class": "news-full__date--create"})[0].string
            article_date = str(article_date).strip()
        except Exception as e:
            print(str(e))
            article_date = ""
        print("article_date", article_date)
        resource = "https://www.obozrevatel.com/"
        if article_title != "":
            article_title_en = translate_title(article_title)
        else:
            article_title_en = ""
        new_article = Article(id=max_id,
                              title=article_title,
                              title_en=article_title_en,
                              text=article_text,
                              date=article_date,
                              resource=resource,
                              url=url)
        max_id += 1
        print("article_title_en", article_title_en)
        try:
            db.session.rollback()
            db.session.add(new_article)
            db.session.commit()
            db.session.flush()
            db.create_all()
        except sqlalchemy.exc.IntegrityError:
            continue
        except sqlalchemy.exc.DataError:
            continue
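# translate_title() is called above but not defined in this section.  A
# minimal sketch of what it could look like, assuming the same googletrans
# Translator used elsewhere in this file; the helper name, retry count and
# sleep are assumptions:
def translate_title_sketch(title):
    """Translate an article title to English, returning "" on failure."""
    for _ in range(2):
        try:
            return Translator().translate(title, dest="en").text
        except Exception as error:
            print(str(error))
            time.sleep(3)
    return ""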
def init_request():
    db.create_all()
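# init_request() only creates the tables.  A typical (assumed) way to wire it
# into the Flask app is to run it before the first request; `app` is not
# defined in this section, so this is only a sketch:
#
#     app.before_first_request(init_request)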
def parse_main_pages(site_name, url_main, url_page2, class_articles,
                     class_title, class_date, class_text):
    """Parse listing pages for the given site and store the main information."""
    try:
        last_article = db.session.query(ArticleFakeChecker2).order_by(
            ArticleFakeChecker2.id.desc()).first()
        max_id_pos_start = str(last_article).find("id=")
        max_id_pos_end = str(last_article).find("title=")
        max_id = str(last_article)[max_id_pos_start + 3:max_id_pos_end - 2]
        max_id = int(max_id) + 1
    except ValueError:
        max_id = 1
    print("max_id", max_id)
    flag_old_news = 0
    n_page = 0
    while flag_old_news != 1:
        n_page = n_page + 1
        article_date = ""
        print("n_page", n_page)
        if n_page == 1:
            url = url_main
        else:
            url = url_page2 + str(n_page + 1) + '/'
        html_page = requests.get(
            url,
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) "
                              "Gecko/20100101 Firefox/74.0"
            },
            verify=False).text
        soup = BeautifulSoup(html_page, 'html.parser')
        all_articles = soup.find_all("a", {"class": "b-post__link"})
        for article in all_articles:
            print()
            url_article = article.get("href")
            article_title, article_date, article_text = parse_article_pages(
                url_article, class_title, class_date, class_text)
            article_text = str(article_text).strip()
            print("article_title", article_title)
            try:
                db.session.rollback()
                if db.session.query(ArticleFakeChecker2.id).filter_by(
                        title=article_title).scalar() is not None:
                    print("Found in db")
                    continue
            except sqlite3.IntegrityError:
                continue
            except sqlalchemy.exc.IntegrityError:
                continue
            if site_name == "euvsdisinfo":
                resource = "https://euvsdisinfo.eu/"
                # euvsdisinfo titles are stored unchanged as title_en
                new_article = ArticleFakeChecker2(id=max_id,
                                                  title=article_title,
                                                  title_en=article_title,
                                                  text=article_text,
                                                  date=article_date,
                                                  resource=resource,
                                                  url=url_article)
            elif site_name == "obozrevatel":
                resource = "https://www.obozrevatel.com/"
                if article_title != "":
                    article_title_en = translate_title(article_title)
                else:
                    article_title_en = ""
                new_article = Article(id=max_id,
                                      title=article_title,
                                      title_en=article_title_en,
                                      text=article_text,
                                      date=article_date,
                                      resource=resource,
                                      url=url_article)
            print("article_text", article_text)
            print("article_date", article_date)
            max_id += 1
            try:
                db.session.add(new_article)
                db.session.commit()
                db.session.flush()
                db.create_all()
            except sqlalchemy.exc.IntegrityError:
                continue
            except sqlalchemy.exc.DataError:
                continue
        if str(article_date).split(
                ", ")[-1].strip() == "2018" or n_page >= 700:
            flag_old_news = 1
        try:
            db.session.rollback()
            db.session.commit()
            db.create_all()
        except sqlalchemy.exc.IntegrityError:
            continue
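# The rollback-then-query duplicate check is repeated in every parser above.
# A small helper sketch (the helper name is hypothetical, not project code):
def title_already_stored(model, title):
    """Return True if an article with this title is already in the table."""
    db.session.rollback()
    return db.session.query(model.id).filter_by(
        title=title).scalar() is not None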
from flask_app.app import db

db.create_all()
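# Note: newer Flask-SQLAlchemy releases require create_all() to run inside an
# application context.  A hedged sketch (the `app` import is an assumption):
#
#     from flask_app.app import app, db
#     with app.app_context():
#         db.create_all()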
def parse_main_pages():
    """Parse the main listing pages and store all article data."""
    try:
        last_article = db.session.query(Article).order_by(
            Article.id.desc()).first()
        max_id_pos_start = str(last_article).find("id=")
        max_id_pos_end = str(last_article).find("title=")
        max_id = str(last_article)[max_id_pos_start + 3:max_id_pos_end - 2]
        max_id = int(max_id) + 1
    except ValueError:
        max_id = 1
    n_article = -1
    print("max_id", max_id)
    for n_page in range(500, NUMBER_PAGES):
        print("n_page", n_page + 1)
        if n_page + 1 == 1:
            url = MAIN_URL
        else:
            url = MAIN_URL_PAGE_FROM2 + str(n_page + 1) + '/'
        stop_requests = 0
        flag_bad_request_for_page = 0
        session = requests.Session()
        session.max_redirects = 60
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) "
                          "Gecko/20100101 Firefox/74.0"
        }
        # retry until the page answers with HTTP 200, up to 10 attempts
        while session.get(url, headers=headers).status_code != 200:
            time.sleep(3)
            stop_requests += 1
            if stop_requests == 10:
                print()
                print("error: page not fetched", session.get(url))
                flag_bad_request_for_page = 1
                break
        if flag_bad_request_for_page == 1:
            continue
        html_page = session.get(url, headers=headers).text
        soup = BeautifulSoup(html_page, 'html.parser')
        all_span = soup.find_all("a", {"class": "lbn_link"})
        n_article += 1
        for span in all_span:
            # skip video posts ("Відео" means "Video")
            if "Відео" in str(span):
                continue
            url_article = span.get("href")
            print()
            article_title, article_date, article_text = parse_article_pages(
                url_article)
            if article_title is None:
                continue
            article_text = str(article_text).strip()
            print("article_title", article_title)
            try:
                db.session.rollback()
                if db.session.query(Article.id).filter_by(
                        title=article_title).scalar() is not None:
                    print("Found in db")
                    continue
            except sqlite3.IntegrityError:
                continue
            except sqlalchemy.exc.IntegrityError:
                continue
            translator = Translator()
            try:
                src_lang = translator.translate(article_title).src
            except json.decoder.JSONDecodeError:
                time.sleep(3)
                translator = Translator()
                src_lang = translator.translate(article_title).src
            # reinitialize the API before the actual translation
            translator = Translator()
            try:
                translated = translator.translate(article_title,
                                                  src=src_lang,
                                                  dest="en")
                article_title_en = translated.text
            except Exception as e:
                print(str(e))
                article_title_en = ""
            resource_end_pos = url_article.find("/ua")
            resource = url_article[:resource_end_pos + 3]
            print("article_text", article_text)
            new_article = Article(id=max_id,
                                  title=article_title,
                                  title_en=article_title_en,
                                  text=article_text,
                                  date=article_date,
                                  resource=resource,
                                  url=url_article)
            max_id += 1
            try:
                db.session.add(new_article)
                db.session.commit()
                db.session.flush()
                db.create_all()
            except sqlalchemy.exc.IntegrityError:
                continue
        try:
            db.session.rollback()
            db.session.commit()
            db.create_all()
        except sqlalchemy.exc.IntegrityError:
            continue
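# The manual while-loop above re-requests the page until it returns HTTP 200.
# A sketch of the same idea using requests' built-in retry support; the retry
# count, backoff and status list below are assumptions, not project settings:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_retrying_session(total_retries=10, backoff=3):
    """Build a requests.Session that retries failed GET requests itself."""
    session = requests.Session()
    retry = Retry(total=total_retries, backoff_factor=backoff,
                  status_forcelist=[429, 500, 502, 503, 504])
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.mount("http://", HTTPAdapter(max_retries=retry))
    return session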