def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db_english)
        if db_helper.data_present(article_url):
            db_helper.close_connection()  # release the connection on the early return too
            return
        # db_helper.insert_article(article_url, Config.kantipur_daily_, category, title, date.text, article)
        db_helper.close_connection()
        Logger.add_log('Scraping : ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
class KantipurDaily:
    def __init__(self, page_url):
        self.page_url = page_url
        self.db_helper = DbHelper()
        self.parent_url = 'https://www.kantipurdaily.com'

    def scrape_article_url(self):
        # Collect article links from the listing page and scrape any that are
        # not already stored.
        soup = get_url_soup(self.page_url)
        article_soup = soup.findAll('div', {'class': 'teaser offset'})
        for data in article_soup:
            url_soup = get_txt_soup(data).find('h2')
            url_soup = get_txt_soup(url_soup).find('a', href=True)
            article_url = url_soup['href']
            if self.db_helper.data_not_present(article_url):
                self.scrape_article_data(article_url)

    def scrape_article_data(self, article_url):
        article_url = self.parent_url + article_url.strip()
        soup = get_url_soup(article_url)
        print(article_url)
        title = soup.find('div', {'class': 'article-header'})
        headline = get_txt_soup(title).find('h1')
        sub_headline = get_txt_soup(title).find('div', {'class': 'sub-headline'})
        title = str(headline.text)
        if sub_headline is not None:
            title = str(headline.text) + '\n' + str(sub_headline.text)
        date = soup.find('time')
        article = soup.find('div', {'class': 'description'})
        scripts = get_txt_soup(article).findAll('script')
        article = article.text
        for script in scripts:
            # .text keeps inline <script> content, so strip it back out.
            script_text = script.text
            if script_text in article:
                article = article.replace(script_text, '')
        self.db_helper.insert_article(article_url, title, date.text, article)
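# The scrapers in this module lean on two helpers, get_url_soup and
# get_txt_soup, that are defined elsewhere in the repo. A minimal sketch,
# assuming they wrap requests and BeautifulSoup (the real implementations
# may differ):
import requests
from bs4 import BeautifulSoup

def get_url_soup(url, timeout=30):
    # Fetch a page and parse the response body.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

def get_txt_soup(tag):
    # Re-parse a tag's markup so it can be searched on its own.
    return BeautifulSoup(str(tag), 'html.parser')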
class EKantipur(object):
    def __init__(self, driver):
        self.driver = driver
        self.df = pd.DataFrame()
        self.db_helper = DbHelper()

    def scrape(self):
        self.skip_add()
        self.scrape_articles()
        self.db_helper.close_connection()

    def skip_add(self):
        # Dismiss the ad overlay before the page content becomes reachable.
        try:
            WebDriverWait(self.driver, 90).until(
                ec.element_to_be_clickable(
                    (By.XPATH, Locators.skip_add_button)))
            button = self.driver.find_element(By.XPATH, Locators.skip_add_button)
            button.click()
        except TimeoutException as e:
            print('timeout')
            Logger.add_error(str(e))

    def scrape_articles(self):
        try:
            soup = get_url_soup(self.driver.current_url)
            tags = soup.findAll('div', {'class': 'total_comments'})
            for url_data in tags:
                # The article URL is embedded in the onclick handler; peel off
                # the surrounding markup with string replacement.
                url_data = str(url_data).replace(
                    '<div class="total_comments" onclick="showFBCommentBox(this,\'', '')
                url_data = str(url_data).replace(
                    '\',\'eng\')"><span class="glyphicon glyphicon-comment"></span></div>', '')
                title, article = scrape_article_data(url_data)
                self.db_helper.insert_article(url_data, title, article)
        except Exception as e:
            Logger.add_error(str(e))
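# Hypothetical setup for the class above; the target URL and the Chrome
# driver choice are assumptions, not part of the original module.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.ekantipur.com')
EKantipur(driver).scrape()
driver.quit()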
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)
        # The caller passes the URL and its category joined by '||||'.
        article_url = article_url.split('||||')
        category = article_url[1]
        article_url = article_url[0]
        if db_helper.data_present(article_url):
            db_helper.close_connection()
            return
        soup = SoupHelper.get_url_soup(article_url)
        title_card = soup.find('div', {'class': 'nws__title--card'})
        title = SoupHelper.get_txt_soup(title_card).find('h2')
        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            db_helper.close_connection()
            return
        date = soup.find('div', {'class': 'post__time'})
        date = SoupHelper.get_txt_soup(date).find('span')
        title = title.text
        date = date.text
        # Reorder 'year month day' into 'month day,year'.
        date = date.split(' ')
        month = date[1]
        day = date[2]
        year = date[0]
        date = str(month) + ' ' + str(day) + ',' + str(year)
        article = soup.find('div', {'class': 'ok__news--wrap'})
        article = SoupHelper.get_txt_soup(article).findAll('p')
        article_text = list()
        for data in article:
            article_text.append(data.text.strip())
        article_text = ''.join(article_text)
        db_helper.insert_article(article_url, Config.online_khabar, category,
                                 title, date, article_text, '।')
        db_helper.close_connection()
        Logger.add_log('Scraping : ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
    except requests.TooManyRedirects:
        Logger.add_error('Redirect Error ' + article_url)
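# Hypothetical call illustrating the '||||' convention above; the URL and
# category values are made up for the example.
scrape_article_data('https://www.onlinekhabar.com/2020/01/123456' + '||||' + 'business')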
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)
        article_url = article_url.split('||||')
        category = article_url[1]
        article_url = article_url[0]
        if db_helper.data_present(article_url):
            db_helper.close_connection()
            return
        soup = SoupHelper.get_url_soup(article_url)
        title = soup.find('div', {'class': 'inner-section cover-news'})
        title = SoupHelper.get_txt_soup(title).find('div', {'class': 'col-sm-12'})
        title = SoupHelper.get_txt_soup(title).find('h1')
        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            db_helper.close_connection()
            return
        title = title.text
        date = soup.find('div', {'class': 'author-location'})
        date = SoupHelper.get_txt_soup(date).find('span')
        # The span reads 'location, day month year'; reorder it into
        # 'month day,year'.
        date = date.text.split(',')
        date = date[1].strip().split(' ')
        month = date[1]
        day = date[0]
        year = date[2]
        date = str(month) + ' ' + str(day) + ',' + str(year)
        article = soup.find('div', {'id': 'newsContent'})
        article = SoupHelper.get_txt_soup(article).findAll('p')
        article_text = list()
        for data in article:
            article_text.append(data.text.strip())
        article_text = ''.join(article_text)
        db_helper.insert_article(
            article_url, Config.nagarik_news,
            Config.nagarik_news_sections_dict.get(category), title, date,
            article_text, '।')
        db_helper.close_connection()
        Logger.add_log('Scraping : ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
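# Worked example of the reshuffle above; the sample string is an assumption
# about the page's 'author-location' format.
raw = 'Kathmandu, 12 January 2020'
parts = raw.split(',')[1].strip().split(' ')            # ['12', 'January', '2020']
formatted = parts[1] + ' ' + parts[0] + ',' + parts[2]  # 'January 12,2020'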
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)
        if db_helper.data_present(article_url):
            db_helper.close_connection()
            return
        soup = SoupHelper.get_url_soup(article_url)
        title = soup.find('div', {'class': 'article-header'})
        # Check for a dead link before dereferencing the header block.
        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            db_helper.close_connection()
            return
        headline = SoupHelper.get_txt_soup(title).find('h1')
        sub_headline = SoupHelper.get_txt_soup(title).find(
            'div', {'class': 'sub-headline'})
        title = str(headline.text)
        if sub_headline is not None:
            title = str(headline.text) + '\n' + str(sub_headline.text)
        date = soup.find('time')
        article = soup.find('div', {'class': 'description'})
        scripts = SoupHelper.get_txt_soup(article).findAll('script')
        article = article.text
        for script in scripts:
            # .text keeps inline <script> content, so strip it back out.
            script_text = script.text
            if script_text in article:
                article = article.replace(script_text, '')
        # Drop the share-widget text that trails the article body.
        article = article.split('Share on Facebook')
        article = article[0]
        temp = article_url.split('/')
        category = temp[3]
        db_helper.insert_article(article_url, Config.kantipur_daily_, category,
                                 title, date.text, article, '।')
        db_helper.close_connection()
        Logger.add_log('Scraping : ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)
        if db_helper.data_present(article_url):
            db_helper.close_connection()
            return
        soup = SoupHelper.get_url_soup(article_url)
        title = soup.find('span', {'class': 'news-big-title'})
        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            db_helper.close_connection()
            return
        title = title.text
        article_text = list()
        article = soup.find('div', {'class': 'editor-box'})
        article = SoupHelper.get_txt_soup(article).findAll('p')
        for data in article:
            article_text.append(data.text)
        article_text = ' '.join(article_text)
        pub_date = soup.find('span', {'class': 'pub-date'})
        pub_date = pub_date.text
        # The date reads like '..., month day, year'; rebuild 'month day,year'.
        month = pub_date.split(',')[1].strip().split(' ')[0]
        day = pub_date.split(',')[1].strip().split(' ')[1]
        year = pub_date.split(',')[2].strip()
        date = str(month) + ' ' + str(day) + ',' + str(year)
        category = article_url.split('/')[3]
        db_helper.insert_article(article_url, Config.setopati, category, title,
                                 date, article_text, '।')
        db_helper.close_connection()
        Logger.add_log('Scraping : ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db_english)
        temp = article_url.split('/')
        category = temp[4]
        if db_helper.data_present(article_url):
            db_helper.close_connection()
            return
        soup = SoupHelper.get_url_soup(article_url)
        title = soup.find('div', {'class': 'col-lg-12'})
        title = SoupHelper.get_txt_soup(title).find('h4')
        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            db_helper.close_connection()
            return
        title = title.text
        date = soup.find('div', {'class': 'date-time'})
        date = SoupHelper.get_txt_soup(date).find('span')
        date = date.text
        # Normalize e.g. 'Sunday, Jan 05, 2020' to '2020-01-05'; needs
        # `from datetime import datetime` at module level.
        date = datetime.strptime(date, '%A, %b %d, %Y')
        date = date.strftime('%Y-%m-%d')
        temp_article = soup.find('div', {'class': 'mn-text'})
        temp_article = SoupHelper.get_txt_soup(temp_article).findAll('p')
        article = list()
        for data in temp_article:
            article.append(data.text.strip())
        article = ' '.join(article)
        db_helper.insert_article(article_url, Config.karobar_daily, category,
                                 title, date, article, '. ')
        db_helper.close_connection()
        Logger.add_log('Scraping : ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
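# Worked example of the conversion above; the sample date string is
# illustrative only.
from datetime import datetime

print(datetime.strptime('Sunday, Jan 05, 2020', '%A, %b %d, %Y')
      .strftime('%Y-%m-%d'))  # -> 2020-01-05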