def _get_articles_list(self, keyword, page_num, **kwargs):
    url = self._generic_url.format(keyword, kwargs['year'], page_num)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = []
    article_titles_divs = soup.find_all('div', class_="card_title has_ellipsis")
    article_date_divs = soup.find_all('div', class_="card_date")
    assert len(article_titles_divs) == len(article_date_divs)
    for title_div, date_div in zip(article_titles_divs, article_date_divs):
        # Get article title
        article_title = title_div.text
        # Get article url
        article_url = "https://www.vecer.com%s" % date_div.parent.parent.find_all('a')[0]['href']
        # Get article date
        date = datetime.strptime(date_div.text, '%d.%m.%Y, %H.%M')
        articles.append(
            ShortArticle(keyword=keyword,
                         url=article_url.strip(),
                         title=article_title.strip(),
                         time=date.strftime('%Y-%m-%d'),
                         site_name=self._site_name))
    return articles, False

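# All of these scraper methods build ShortArticle records. The class itself is
# defined elsewhere in the repository; what follows is a minimal sketch of the
# shape implied by the call sites (the field order matches the positional
# calls further below), not the actual definition:
from typing import NamedTuple

class ShortArticle(NamedTuple):
    keyword: str
    url: str
    title: str
    time: str       # '%Y-%m-%d'-formatted date, or the string "None" when unknown
    site_name: str
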
def _get_articles_list(self, keyword, page_num, **kwargs):
    url = self._generic_url.format(keyword, page_num)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = []
    date_divs = soup.find_all(
        'div', class_="arial light-gray inline-block uppercase border-left px1 ml1")
    url_title_divs = soup.find_all('div', class_="clearfix h4 bold roboto-slab mt1")
    assert len(url_title_divs) == len(date_divs)
    for date_div, url_title_div in zip(date_divs, url_title_divs):
        # Get article date
        article_date = "{}, {}".format(
            date_div.find('span', class_='item-date').text,
            date_div.find('span', class_='item-time').text.split(' ')[-1])
        # Get article url
        article_url = "http://www.politika.rs{}".format(url_title_div.find("a")['href'])
        # Get article title
        article_title = url_title_div.text
        date = (datetime.strptime(article_date, '%d.%m.%Y, %H:%M')
                if len(article_date.split(',')) > 1
                else datetime.strptime(article_date, '%d.%m.%Y'))
        if date >= constants.MAX_DATE:
            continue
        # An article older than MIN_DATE signals that iteration can stop
        if date < constants.MIN_DATE:
            return articles, True
        articles.append(
            ShortArticle(keyword=keyword,
                         url=article_url.strip(),
                         title=article_title.strip(),
                         time=date.strftime('%Y-%m-%d'),
                         site_name=self._site_name))
    return articles, False

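# The date-window checks and keyword lists used above and below come from a
# shared `constants` module that is not shown here. A hedged sketch of what it
# presumably contains -- the actual bounds and keywords are placeholders:
from datetime import datetime

MIN_DATE = datetime(2019, 1, 1)     # hypothetical lower bound of the scrape window
MAX_DATE = datetime(2020, 1, 1)     # hypothetical upper bound (exclusive)
keywords = ["keyword1", "keyword2"]          # hypothetical default keyword list
keywords_serbian = ["kljucna rec"]           # hypothetical Serbian keyword list
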
def _get_articles_list(self, keyword: str, page_num: int, **kwargs):
    url = self._generic_url.format(keyword, page_num)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = []
    date_divs = soup.find_all('div', class_="sub-article-info")
    url_title_divs = soup.find_all('a', class_="sub-article group img-lin-grad")
    assert len(url_title_divs) == len(date_divs)
    for date_div, url_title_div in zip(date_divs, url_title_divs):
        # Get article date
        article_date = date_div.text.strip().split('\n')[-1].strip()
        # Get article url
        article_url = "https://novice.svet24.si{}".format(url_title_div['href'])
        # Get article title
        article_title = url_title_div.find('h4').text.strip()
        date = datetime.strptime(article_date, '%d. %b %Y, %H:%M')
        if date >= constants.MAX_DATE or date < constants.MIN_DATE:
            continue
        articles.append(
            ShortArticle(keyword=keyword,
                         url=article_url.strip(),
                         title=article_title.strip(),
                         time=date.strftime('%Y-%m-%d'),
                         site_name=self._site_name))
    return articles, False

def _get_articles_list(self, keyword, page_num, **kwargs):
    url = self._generic_url.format(keyword, kwargs['year'], page_num)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = []
    article_titles = soup.find_all('h2')
    article_date_divs = soup.find_all('div', class_="itemDatePublished")
    assert len(article_titles) == len(article_date_divs)
    for title, date_div in zip(article_titles, article_date_divs):
        article_title = title.text
        article_href = date_div.parent.find('a')['href']
        article_url = ("https://www.slovenskenovice.si%s" % article_href
                       if 'http' not in article_href else article_href)
        date = datetime.strptime(
            date_div.text.split('Objavljeno ')[-1].strip(), '%d.%m.%Y %H:%M')
        articles.append(
            ShortArticle(keyword=keyword,
                         url=article_url.strip(),
                         title=article_title.strip(),
                         time=date.strftime('%Y-%m-%d'),
                         site_name=self._site_name))
    return articles, False

def _get_articles_list(self, keyword, page_num, **kwargs):
    url = self._generic_url.format(keyword, page_num)
    response = requests.get(url)
    if "Nema rezultata za ovu pretragu" in response.content.decode('utf-8'):
        # "No results for this search" -- nothing left to paginate
        return [], True
    article_divs = BeautifulSoup(response.content, 'html.parser').find_all(
        'div', class_='categoryList__details')
    articles = []
    for article_div in article_divs:
        article_date, article_url = article_div.find('ul').find_all('li')
        # The listing shows a relative age in days; convert it to an absolute date
        days = int(article_date.text.split(" ")[-1][:-1])
        date = datetime.today() - timedelta(days=days)
        article_url = "http://www.alo.rs{}".format(article_url.find('a')['href'])
        # Get article title
        article_title = article_div.find('h2').find('span').text.strip()
        if not constants.MIN_DATE <= date < constants.MAX_DATE:
            continue
        articles.append(
            ShortArticle(keyword=keyword,
                         url=article_url.strip(),
                         title=article_title.strip(),
                         time=date.strftime('%Y-%m-%d'),
                         site_name=self._site_name))
    return articles, False

def _get_short_articles(self, lang):
    """
    Return list of articles (ShortArticle objects) within defined range.
    :return:
    """
    super()._get_short_articles(self._site_id.split('-')[0])
    extended = self.extend_short_articles(newspaper="srbija-danas")
    # extend_short_articles yields (url, keyword) pairs; titles are unknown,
    # so a placeholder derived from the URL slug is used
    for e in extended:
        self._articles.append(
            ShortArticle(e[1], e[0], "%s *****" % (e[0].split('/')[-1]),
                         "None", self._site_name))

def _get_articles_list(self, keyword, **kwargs):
    json_response = kwargs['json_response']
    articles = []
    for article in json_response['objects']:
        article_title = article['title'] if article['title'] is not None else ""
        article_url = article['url']
        date = datetime.strptime(article['date_published'], '%Y-%m-%dT%H:%M:%SZ')
        articles.append(
            ShortArticle(keyword=keyword,
                         url=article_url.strip(),
                         title=article_title.strip(),
                         time=date.strftime('%Y-%m-%d'),
                         site_name=self._site_name))
    return articles

def _get_articles_list(self, keyword, page_num, **kwargs):
    url = self._generic_url.format(keyword, page_num)
    response = requests.get(url)
    article_divs = BeautifulSoup(response.content, 'html.parser').find_all(
        'article', class_='o-media')
    articles = []
    for article_div in article_divs:
        article_url = "https://www.srbijadanas.com%s" % article_div.find(
            'a', class_='o-media__link')['href']
        date = article_div.find('time', class_='o-media__date')['datetime']
        date = datetime.utcfromtimestamp(int(date))
        if date >= constants.MAX_DATE:
            continue
        if date < constants.MIN_DATE:
            return articles, True
        article_title = article_div.find('h2').text.strip()
        articles.append(
            ShortArticle(keyword=keyword,
                         url=article_url.strip(),
                         title=article_title.strip(),
                         time=date.strftime('%Y-%m-%d'),
                         site_name=self._site_name))
    return articles, False

def _get_short_articles(self, lang):
    """
    Return list of articles (ShortArticle objects) within defined range.
    :return:
    """
    keywords = constants.keywords_serbian if lang == 'sr' else constants.keywords
    for keyword in keywords:
        logging.info("Keyword: %s" % keyword)
        page_num = 1
        while True:
            logging.info("%d" % page_num)
            articles_list, stop_iteration = self._get_articles_list(keyword, page_num)
            self._articles.extend(articles_list)
            # Stop iteration if article is older than min date
            if stop_iteration:
                break
            page_num += 1
    extended = self.extend_short_articles(newspaper="alo")
    logging.info("Total articles scraped: %d" % len(self._articles))
    logging.info("Number of additional articles: %d" % len(extended))
    # extend_short_articles yields (url, keyword) pairs; titles are unknown,
    # so a placeholder derived from the URL slug is used
    for e in extended:
        self._articles.append(
            ShortArticle(e[1], e[0], "%s *****" % (e[0].split('/')[-1]),
                         "None", self._site_name))

def _get_articles_list(self, keyword: str, page_num: int, **kwargs):
    url = self._generic_url.format(keyword, page_num)
    driver.get(url)
    articles = []
    try:
        WebDriverWait(driver, 3).until(
            lambda x: x.find_elements_by_class_name("card__details"))
        dates = driver.find_elements_by_class_name("card__details")
    except TimeoutException:
        dates = []
    for str_date in dates:
        # Get article url
        article_url = str_date.find_element_by_xpath('../../../..').get_attribute('href')
        # Get article title
        title = str_date.find_element_by_xpath('..').find_element_by_class_name(
            'card__title-inside').text
        # Get article date
        date = (datetime.strptime(str_date.text, '%d.%m.%Y, %H:%M')
                if len(str_date.text.split(',')) > 1
                else datetime.strptime(str_date.text, '%d.%m.%Y'))
        if date >= constants.MAX_DATE:
            continue
        if date < constants.MIN_DATE:
            return articles, True
        articles.append(
            ShortArticle(keyword=keyword,
                         url=article_url,
                         title=title,
                         time=date.strftime('%Y-%m-%d'),
                         site_name=self._site_name))
    return articles, False

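# The Selenium-based variant above reads a module-level `driver` and uses the
# Selenium 3 element API (find_elements_by_class_name). A minimal setup sketch
# under that assumption; the headless option is illustrative, not from the source:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)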