def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)
        # The URL arrives packed as '<url>||||<category>'; unpack both parts.
        article_url = article_url.split('||||')
        category = article_url[1]
        article_url = article_url[0]
        if db_helper.data_present(article_url):
            return
        soup = SoupHelper.get_url_soup(article_url)
        title_card = soup.find('div', {'class': 'nws__title--card'})
        title = SoupHelper.get_txt_soup(title_card).find('h2')
        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return
        date = soup.find('div', {'class': 'post__time'})
        date = SoupHelper.get_txt_soup(date).find('span')
        title = title.text
        date = date.text
        # Reorder the site's 'YYYY Month DD' stamp into 'Month DD,YYYY'.
        date = date.split(' ')
        month = date[1]
        day = date[2]
        year = date[0]
        date = str(month) + ' ' + str(day) + ',' + str(year)
        article = soup.find('div', {'class': 'ok__news--wrap'})
        article = SoupHelper.get_txt_soup(article).find_all('p')
        article_text = list()
        for data in article:
            article_text.append(data.text.strip())
        article_text = ''.join(article_text)
        db_helper.insert_article(article_url, Config.online_khabar, category,
                                 title, date, article_text, '।')
        db_helper.close_connection()
        Logger.add_log('Scraping: ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
    except requests.TooManyRedirects:
        Logger.add_error('Redirect Error ' + article_url)
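# SoupHelper, DbHelper, Logger and Config are project-local helpers not shown
# here. As a rough sketch of what the scrapers assume SoupHelper does (the
# real implementation may differ): get_url_soup fetches and parses a page,
# and get_txt_soup re-parses a tag's own markup into a fresh soup so nested
# find calls can be chained.
import requests
from bs4 import BeautifulSoup


class SoupHelper:
    @staticmethod
    def get_url_soup(url):
        # Fetch the page; the timeout keeps a dead host from hanging a worker.
        response = requests.get(url, timeout=30)
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def get_txt_soup(tag):
        # Re-parse the tag's markup so the caller gets a standalone soup.
        return BeautifulSoup(str(tag), 'html.parser')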
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)
        # The URL arrives packed as '<url>||||<category>'; unpack both parts.
        article_url = article_url.split('||||')
        category = article_url[1]
        article_url = article_url[0]
        if db_helper.data_present(article_url):
            return
        soup = SoupHelper.get_url_soup(article_url)
        title = soup.find('div', {'class': 'inner-section cover-news'})
        title = SoupHelper.get_txt_soup(title).find('div', {'class': 'col-sm-12'})
        title = SoupHelper.get_txt_soup(title).find('h1')
        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return
        title = title.text
        date = soup.find('div', {'class': 'author-location'})
        date = SoupHelper.get_txt_soup(date).find('span')
        # Reorder the 'DD Month YYYY' part of the stamp into 'Month DD,YYYY'.
        date = date.text.split(',')
        date = date[1].strip().split(' ')
        month = date[1]
        day = date[0]
        year = date[2]
        date = str(month) + ' ' + str(day) + ',' + str(year)
        article = soup.find('div', {'id': 'newsContent'})
        article = SoupHelper.get_txt_soup(article).find_all('p')
        article_text = list()
        for data in article:
            article_text.append(data.text.strip())
        article_text = ''.join(article_text)
        db_helper.insert_article(
            article_url, Config.nagarik_news,
            Config.nagarik_news_sections_dict.get(category), title, date,
            article_text, '।')
        db_helper.close_connection()
        Logger.add_log('Scraping: ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)
        if db_helper.data_present(article_url):
            return
        soup = SoupHelper.get_url_soup(article_url)
        title = soup.find('div', {'class': 'article-header'})
        headline = SoupHelper.get_txt_soup(title).find('h1')
        sub_headline = SoupHelper.get_txt_soup(title).find(
            'div', {'class': 'sub-headline'})
        # A missing <h1> marks a dead link.
        if headline is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return
        title = str(headline.text)
        if sub_headline is not None:
            title = str(headline.text) + '\n' + str(sub_headline.text)
        date = soup.find('time')
        article = soup.find('div', {'class': 'description'})
        # Strip inline <script> bodies that leak into the article text.
        scripts = SoupHelper.get_txt_soup(article).find_all('script')
        article = article.text
        for script in scripts:
            script_text = script.text
            if script_text in article:
                article = article.replace(script_text, '')
        # Everything after the share widget is boilerplate.
        article = article.split('Share on Facebook')
        article = article[0]
        temp = article_url.split('/')
        category = temp[3]
        db_helper.insert_article(article_url, Config.kantipur_daily_, category,
                                 title, date.text, article, '।')
        db_helper.close_connection()
        Logger.add_log('Scraping: ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)
        if db_helper.data_present(article_url):
            return
        soup = SoupHelper.get_url_soup(article_url)
        title = soup.find('span', {'class': 'news-big-title'})
        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return
        title = title.text
        article_text = list()
        article = soup.find('div', {'class': 'editor-box'})
        article = SoupHelper.get_txt_soup(article).find_all('p')
        for data in article:
            article_text.append(data.text)
        article_text = ' '.join(article_text)
        # Rebuild 'Month DD,YYYY' from a 'Weekday, Month DD, YYYY' stamp.
        pub_date = soup.find('span', {'class': 'pub-date'})
        pub_date = pub_date.text
        month = pub_date.split(',')[1].strip().split(' ')[0]
        day = pub_date.split(',')[1].strip().split(' ')[1]
        year = pub_date.split(',')[2].strip()
        date = str(month) + ' ' + str(day) + ',' + str(year)
        category = article_url.split('/')[3]
        db_helper.insert_article(article_url, Config.setopati, category, title,
                                 date, article_text, '।')
        db_helper.close_connection()
        Logger.add_log('Scraping: ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
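# Why index 3 for the category: splitting an absolute URL on '/' puts the
# first path segment at position 3. A quick check with an illustrative URL
# (not taken from the site):
parts = 'https://www.setopati.com/politics/123456'.split('/')
# parts == ['https:', '', 'www.setopati.com', 'politics', '123456']
assert parts[3] == 'politics'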
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db_english)
        temp = article_url.split('/')
        category = temp[4]
        if db_helper.data_present(article_url):
            return
        soup = SoupHelper.get_url_soup(article_url)
        title = soup.find('div', {'class': 'col-lg-12'})
        title = SoupHelper.get_txt_soup(title).find('h4')
        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return
        title = title.text
        date = soup.find('div', {'class': 'date-time'})
        date = SoupHelper.get_txt_soup(date).find('span')
        date = date.text
        # Normalise the verbose stamp to an ISO date.
        date = datetime.strptime(date, '%A, %b %d, %Y')
        date = date.strftime('%Y-%m-%d')
        temp_article = soup.find('div', {'class': 'mn-text'})
        temp_article = SoupHelper.get_txt_soup(temp_article).find_all('p')
        article = list()
        for data in temp_article:
            article.append(data.text.strip())
        article = ' '.join(article)
        db_helper.insert_article(article_url, Config.karobar_daily, category,
                                 title, date, article, '. ')
        db_helper.close_connection()
        Logger.add_log('Scraping: ' + article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
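# For illustration, the strptime/strftime pair above converts the site's
# 'Weekday, Mon DD, YYYY' stamp into ISO form. The input string here is a
# made-up example matching the '%A, %b %d, %Y' pattern, not scraped data:
from datetime import datetime

stamp = 'Friday, Jan 03, 2020'
print(datetime.strptime(stamp, '%A, %b %d, %Y').strftime('%Y-%m-%d'))
# -> 2020-01-03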
def scrape_article_url(self, article_section_url):
    try:
        soup = SoupHelper.get_url_soup(article_section_url)
        article_soup = soup.find_all('h2', {'itemprop': 'headline'})
        for data in article_soup:
            url_soup = SoupHelper.get_txt_soup(data).find('a', href=True)
            article_url = url_soup['href']
            self.url_list.append(article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_section_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_section_url)
def parse(self, response):
    soup = SoupHelper.get_url_soup(response.url)
    title = soup.find('div', {'class': 'about-page-detailing'})
    try:
        sub_title = SoupHelper.get_txt_soup(title).find('h5').text
    except AttributeError:
        sub_title = ''
    try:
        title = SoupHelper.get_txt_soup(title).find('h1').text
    except AttributeError:
        title = ''
    title = title + ' ' + sub_title
    title = title.strip()
    date = soup.find('span', {'class': 'dates'})
    date = SoupHelper.get_txt_soup(date).find('a').text
    date = date.strip()
    url = response.url
    article = soup.find('div', {'class': 'elementor-section-wrap'})
    article = SoupHelper.get_txt_soup(article).find_all('p')
    temp = list()
    for p in article:
        data = p.text
        data = data.strip()
        # Skip paragraphs that are really AdSense loader snippets.
        if '(adsbygoogle = window.adsbygoogle || []).push({});' not in data:
            temp.append(data)
    article = ' '.join(temp)
    article = article.strip()
    temp_df = pd.DataFrame()
    temp_df['url'] = [url]
    temp_df['title'] = [title]
    temp_df['date'] = [date]
    temp_df['article'] = [article]
    temp_df['category'] = [url.split('/')[3]]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    self.df = pd.concat([self.df, temp_df], ignore_index=True)
    self.df.to_csv(Config.nepali_times_spider_output_location, index=False)
def scrape_article_url(self, article_section_url):
    try:
        soup = SoupHelper.get_url_soup(article_section_url)
        article_soup = soup.find_all('div', {'class': 'teaser offset'})
        for data in article_soup:
            url_soup = SoupHelper.get_txt_soup(data).find('h2')
            url_soup = SoupHelper.get_txt_soup(url_soup).find('a', href=True)
            article_url = url_soup['href']
            # Teaser links are relative; prepend the site root.
            article_url = Config.kantipur_daily_url + article_url.strip()
            self.url_list.append(article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_section_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_section_url)
def scrape_article_url(self, article_section_url, category):
    try:
        soup = SoupHelper.get_url_soup(article_section_url)
        article_soup = soup.find_all('div', {'class': 'item__wrap'})
        for data in article_soup:
            url_soup = SoupHelper.get_txt_soup(data).find('a', href=True)
            article_url = url_soup['href']
            article_url = article_url.strip()
            # Pack the mapped category onto the URL for the article scraper.
            article_url = article_url + '||||' + str(
                Config.online_khabar_section_dict.get(category))
            self.url_list.append(article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_section_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_section_url)
def scrape_article_url(self, article_section_url, category):
    try:
        soup = SoupHelper.get_url_soup(article_section_url)
        article_soup = soup.find_all('div', {'class': 'first-on first-list'})
        for data in article_soup:
            url_soup = SoupHelper.get_txt_soup(data).find('h3')
            url_soup = SoupHelper.get_txt_soup(url_soup).find('a', href=True)
            article_url = url_soup['href']
            article_url = Config.nagarik_news_url + article_url.strip()
            article_url = article_url + '||||' + str(category)
            self.url_list.append(article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_section_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_section_url)
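# The '||||' delimiter is an ad-hoc way for the URL scrapers above to pipe
# the category alongside the URL; the article scrapers split it back apart.
# A minimal sketch of the round trip (pack_url/unpack_url are illustrative
# names, not part of the codebase):
def pack_url(article_url, category):
    return article_url + '||||' + str(category)


def unpack_url(packed):
    url, category = packed.split('||||')
    return url, category


url, category = unpack_url(pack_url('https://example.com/news/1', 'sports'))
assert (url, category) == ('https://example.com/news/1', 'sports')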
def scrape_article_url(self, article_section_url):
    try:
        soup = SoupHelper.get_url_soup(article_section_url)
        md_4 = soup.find_all('div', {'class': 'items col-md-4'})
        md_6 = soup.find_all('div', {'class': 'items col-md-6'})
        all_url = list()
        for data in md_4:
            url_soup = SoupHelper.get_txt_soup(data).find('a', href=True)
            all_url.append(url_soup['href'])
        for data in md_6:
            try:
                url_soup = SoupHelper.get_txt_soup(data).find('a', href=True)
                all_url.append(url_soup['href'])
            except TypeError:
                pass
        # Drop navigation links that are not articles.
        junk_urls = ('#', 'https://www.setopati.com',
                     'http://icc.setopati.com/',
                     'https://www.setopati.com/our-team')
        for junk in junk_urls:
            while junk in all_url:
                all_url.remove(junk)
        for article_url in all_url:
            self.url_list.append(article_url)
    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_section_url)
    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_section_url)
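# A hypothetical driver showing how the two halves fit together: a spider
# collects article URLs per section, then each URL is handed to
# scrape_article_data. SetopatiSpider and the section list are illustrative;
# the repository's actual entry point may differ.
if __name__ == '__main__':
    spider = SetopatiSpider()  # hypothetical class owning url_list
    for section_url in ['https://www.setopati.com/politics']:
        spider.scrape_article_url(section_url)
    for article_url in spider.url_list:
        scrape_article_data(article_url)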