import locale
import platform
from datetime import datetime, timedelta

from bs4 import BeautifulSoup


def parse_page(url):
    print('Going to collect headlines')
    # Habr prints dates in Russian, so strptime needs a Russian locale
    if platform.system() == 'Windows':
        locale.setlocale(locale.LC_ALL, 'russian')
    else:
        locale.setlocale(locale.LC_TIME, 'ru_RU')
    html = get_html(url)
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        rows = soup.find_all('li', class_='content-list__item content-list__item_post shortcuts_item')
        for row in rows:
            try:
                url = row.find('h2').find('a')['href']
                title = row.find('h2').find('a').text
                date = row.find('header', class_='post__meta').find('span', class_='post__time').text
                # 'сегодня' ("today") and 'вчера' ("yesterday") match the Russian page text
                if 'сегодня' in date:
                    today = datetime.now()
                    date = date.replace('сегодня', today.strftime('%d %B %Y'))
                elif 'вчера' in date:
                    yesterday = datetime.now() - timedelta(days=1)
                    date = date.replace('вчера', yesterday.strftime('%d %B %Y'))
                try:
                    # 'в' is the Russian preposition in dates like '12 мая 2021 в 10:15'
                    date = datetime.strptime(date, '%d %B %Y в %H:%M')
                except ValueError:
                    date = datetime.now()
                save_news(url, title, date)
            except (AttributeError, TypeError):
                pass

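# Every solution in this section calls a get_html() helper that is not shown
# here. A minimal sketch of what it presumably looks like, built on requests;
# the None-on-failure contract is an assumption inferred from the `if html:`
# checks in the functions above and below.
import requests


def get_html(url):
    # Hypothetical helper: return the page HTML, or None when the request fails
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None
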
def get_habr_snippets():
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        all_news = soup.find('ul', class_='content-list_posts').find_all('li', class_='content-list__item_post')
        for news in all_news:
            title = news.find('a', class_='post__title_link').text
            url = news.find('a', class_='post__title_link')['href']
            published = news.find('span', class_='post__time').text
            print(title, url, published)

def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            news_text = soup.find('div', class_='post__text-html').decode_contents()
            if news_text:
                news.text = news_text
                db.session.add(news)
    db.session.commit()

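# The get_news_content() variants iterate a News model and a db session, and
# the snippet collectors call save_news(); none of these are defined in this
# section. A plausible sketch under Flask-SQLAlchemy follows; the column types,
# lengths, and the duplicate check are assumptions. Note that parse_page()
# above passes (url, title, date) while the other collectors pass
# (title, url, published); the sketch follows the latter order.
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()  # assumed to be bound to the Flask app elsewhere


class News(db.Model):
    # Hypothetical model covering only the columns the snippets touch
    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(255), nullable=False)
    url = db.Column(db.String(500), nullable=False, unique=True)
    published = db.Column(db.DateTime)
    text = db.Column(db.Text)


def save_news(title, url, published):
    # Hypothetical helper: store a snippet unless the URL is already in the table
    exists = News.query.filter(News.url == url).count()
    if not exists:
        db.session.add(News(title=title, url=url, published=published))
        db.session.commit()
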
def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            article = soup.find('div', class_='tm-article-body').decode_contents()
            if article:
                news.text = article
                db.session.add(news)
    db.session.commit()

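# Note: this variant selects div.tm-article-body, the class used by Habr's
# newer page markup, while the post__text-html selector in the other variants
# belongs to the older layout; which one matches depends on the page version
# being scraped.
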
def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            # .decode_contents() returns the HTML instead of plain text
            article = soup.find('div', class_='post__text-html').decode_contents()
            if article:
                news.text = article
                db.session.add(news)
    db.session.commit()

def get_news_snippets():
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        # Parse the HTML into an element tree the library's functions can work with
        soup = BeautifulSoup(html, 'html.parser')
        # Select the page elements we need
        all_news = soup.find("ul", class_="content-list_posts").find_all('li', class_='content-list__item_post')
        for news in all_news:
            title = news.find('a', class_="post__title_link").text  # the headline text
            # the headline link (attributes are accessed like dictionary keys)
            url = news.find('a', class_="post__title_link")["href"]
            published = news.find('span', class_="post__time").text  # the publication time
            published = parse_habr_date(published)
            # Once assembled, write the record to the database
            save_news(title, url, published)

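# Several solutions normalize Habr's human-readable Russian dates, such as
# 'сегодня в 10:15' ("today at 10:15"), 'вчера в 09:00' ("yesterday at 09:00"),
# and '12 мая 2021 в 10:15', through a parse_habr_date() helper (one variant
# calls an analogous date_translate()). The helper is not shown; one way it
# might look, mapping month names by hand instead of switching the locale:
from datetime import datetime, timedelta

# Genitive Russian month names as Habr prints them, mapped to month numbers
MONTHS = {
    'января': 1, 'февраля': 2, 'марта': 3, 'апреля': 4,
    'мая': 5, 'июня': 6, 'июля': 7, 'августа': 8,
    'сентября': 9, 'октября': 10, 'ноября': 11, 'декабря': 12,
}


def parse_habr_date(date_str):
    # Hypothetical helper: convert a Habr date string to a datetime,
    # falling back to now() when the format is unexpected.
    words = date_str.split()
    today = datetime.now()
    try:
        if words[0] == 'сегодня':
            day, month, year = today.day, today.month, today.year
        elif words[0] == 'вчера':
            yesterday = today - timedelta(days=1)
            day, month, year = yesterday.day, yesterday.month, yesterday.year
        else:
            day, month, year = int(words[0]), MONTHS[words[1]], int(words[2])
        hour, minute = map(int, words[-1].split(':'))
        return datetime(year, month, day, hour, minute)
    except (IndexError, KeyError, ValueError):
        return datetime.now()
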
def get_news_snippets():
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if not html:
        return False
    soup = BeautifulSoup(html, "html.parser")
    all_news = soup.find("ul", class_="content-list_posts").find_all("li", class_="content-list__item_post")
    for news in all_news:
        title = news.find("a", class_="post__title_link").text
        url = news.find("a", class_="post__title_link")["href"]
        published = news.find("span", class_="post__time").text
        published = date_translate(published)
        save_news(title, url, published)

def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            # grab the inner HTML of the article body
            news_text = soup.find('div', class_='post__text-html').decode_contents()
            if news_text:
                news.text = news_text  # fill the text column of the news table
                db.session.add(news)  # stage the row for the news table
    db.session.commit()

def get_news_snippets():
    # Snippets are the small news blocks on the page
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        all_news = soup.find('ul', class_='content-list_posts').find_all('li', class_='content-list__item_post')
        for news in all_news:
            title = news.find('a', class_='post__title_link').text
            url = news.find('a', class_='post__title_link')['href']
            published = news.find('span', class_='post__time').text
            published = parse_habr_date(published)
            save_news(title, url, published)

def habr_news_func():
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        all_habr_news = soup.find('ul', class_='content-list_posts').find_all('li', class_='content-list__item_post')
        for habr_news in all_habr_news:
            title = habr_news.find('a', class_='post__title_link').text
            url = habr_news.find('a', class_='post__title_link')['href']
            published = habr_news.find('span', class_='post__time').text
            published = parse_habr_date(published)
            save_news(title, url, published)

def get_text():
    print('Collecting article texts')
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        print(f'{news.id} has no text yet')
        html = get_html(news.url)
        print(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            full = soup.find('div', class_='post__body post__body_full').decode_contents()
            if full:
                news.text = full
                db.session.add(news)
    db.session.commit()

def get_python_news():
    html = get_html("https://www.python.org/blogs/")
    if html:
        soup = BeautifulSoup(html, "html.parser")
        all_news = soup.find('ul', class_='list-recent-posts').find_all('li')
        for news in all_news:
            title = news.find('a').text
            url = news.find('a')['href']
            published = news.find('time')['datetime']
            try:
                published = datetime.strptime(published, '%Y-%m-%d')
            except ValueError:
                published = datetime.now()
            save_news(title, url, published)

def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            # Plain text only:
            # article = soup.find('div', class_='post__text-html').text
            # Full HTML markup:
            article = soup.find('div', class_='post__text-html').decode_contents()
            if article:
                news.text = article
                db.session.add(news)
    db.session.commit()

def get_news_snippets():
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        # Parse the HTML into a soup tree
        soup = BeautifulSoup(html, "html.parser")
        all_news = soup.find("ul", class_="content-list_posts").find_all("li", class_="content-list__item_post")
        for news in all_news:
            title = news.find('a', class_="post__title_link").text
            url = news.find('a', class_="post__title_link")['href']
            published = news.find('span', class_="post__time").text
            published = parse_habr_date(published)
            save_news(title, url, published)

def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, "html.parser")
            try:
                # Use .decode_contents() rather than .text to get the HTML itself
                news_text = soup.find('div', class_='post__text-html').decode_contents()
            except AttributeError:
                print("No <div class='post__text-html'> on the page")
                continue
            if news_text:
                news.text = news_text
                db.session.add(news)
    db.session.commit()

def get_news_content():
    # is_ compares with IS NULL in SQL; the query is iterable even without .all()
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            # decode_contents() yields the page's HTML, not just its text
            news_text = soup.find('div', class_='post__text-html').decode_contents()
            if news_text:
                news.text = news_text
                db.session.add(news)
    db.session.commit()

def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if not html:
            continue
        soup = BeautifulSoup(html, "html.parser")
        article = soup.find("div", class_="post__text-html").decode_contents()
        if not article:
            continue
        news.text = article
        db.session.add(news)
    db.session.commit()

def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        try:
            if html:
                soup = BeautifulSoup(html, 'html.parser')
                # decode_contents() returns the HTML inside the div, not just its text
                article = soup.find('div', class_='post__text-html').decode_contents()
                if article:
                    news.text = article
                    db.session.add(news)
        except AttributeError:
            continue
    db.session.commit()

def get_news_snippets():
    html = get_html('https://habr.com/ru/search/?target_type=posts&q=python&order_by=date')
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        all_news = soup.find('ul', class_='content-list_posts').find_all('li', class_='content-list__item_post')
        for news in all_news:
            try:
                title = news.find('a', class_='post__title_link').text
                url = news.find('a', class_='post__title_link')['href']
                published = news.find('span', class_='post__time').text
            except AttributeError:
                # Some list items use the alternative "preview-data" markup
                title = news.find('a', class_='preview-data__title-link').text
                url = news.find('a', class_='preview-data__title-link')['href']
                published = news.find('span', class_='preview-data__time-published').text
            published = parse_habr_date(published)
            save_news(title, url, published)

def get_news():
    hubs = ['python', 'web_testing', 'it_testing', 'data_engineering', 'bigdata']
    for hub in hubs:
        url = 'https://habr.com/ru/hub/' + hub
        parsehabr(get_html(url), hub)

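# get_news() delegates to a parsehabr() function that is not shown in this
# section; judging by the call, it takes the fetched HTML and the hub name.
# A minimal sketch under that assumption, reusing the selectors and helpers
# from the solutions above (the body is entirely an assumption):
from bs4 import BeautifulSoup


def parsehabr(html, hub):
    # Hypothetical: pull snippets out of one hub page fetched by get_news()
    if not html:
        return
    soup = BeautifulSoup(html, 'html.parser')
    for post in soup.find_all('li', class_='content-list__item_post'):
        link = post.find('a', class_='post__title_link')
        time_tag = post.find('span', class_='post__time')
        if link and time_tag:
            save_news(link.text, link['href'], parse_habr_date(time_tag.text))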