def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)

        # The caller passes 'url||||category'; split it back apart.
        url_parts = article_url.split('||||')
        category = url_parts[1]
        article_url = url_parts[0]

        if db_helper.data_present(article_url):
            return

        soup = SoupHelper.get_url_soup(article_url)

        title_card = soup.find('div', {'class': 'nws__title--card'})
        title = SoupHelper.get_txt_soup(title_card).find('h2')

        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return

        date = soup.find('div', {'class': 'post__time'})
        date = SoupHelper.get_txt_soup(date).find('span')

        title = title.text
        date = date.text

        # The raw date reads 'year month day'; rebuild it as 'month day,year'.
        date = date.split(' ')
        month = date[1]
        day = date[2]
        year = date[0]

        date = str(month) + ' ' + str(day) + ',' + str(year)

        article = soup.find('div', {'class': 'ok__news--wrap'})
        article = SoupHelper.get_txt_soup(article).findAll('p')

        article_text = list()
        for data in article:
            article_text.append(data.text.strip())
        article_text = ''.join(article_text)

        db_helper.insert_article(article_url, Config.online_khabar, category,
                                 title, date, article_text, '।')
        db_helper.close_connection()

        Logger.add_log('Scraping : ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)

    except requests.TooManyRedirects:
        Logger.add_error('Redirect Error ' + article_url)
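
These snippets lean on a project SoupHelper that is not shown here. A minimal sketch of what it presumably provides, assuming requests and BeautifulSoup (the real helper may differ):

import requests
from bs4 import BeautifulSoup


class SoupHelper:
    @staticmethod
    def get_url_soup(url):
        # Fetch the page and parse its HTML (assumed behaviour).
        response = requests.get(url, timeout=30)
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def get_txt_soup(tag):
        # Re-parse a tag's markup so find()/findAll() can be chained on it (assumed behaviour).
        return BeautifulSoup(str(tag), 'html.parser')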
Example #2
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)

        # The caller passes 'url||||category'; split it back apart.
        url_parts = article_url.split('||||')
        category = url_parts[1]
        article_url = url_parts[0]

        if db_helper.data_present(article_url):
            return

        soup = SoupHelper.get_url_soup(article_url)

        title = soup.find('div', {'class': 'inner-section cover-news'})
        title = SoupHelper.get_txt_soup(title).find('div',
                                                    {'class': 'col-sm-12'})
        title = SoupHelper.get_txt_soup(title).find('h1')

        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return

        title = title.text

        date = soup.find('div', {'class': 'author-location'})
        date = SoupHelper.get_txt_soup(date).find('span')
        # The date text looks like '…, day month year'; rebuild it as 'month day,year'.
        date = date.text.split(',')
        date = date[1].strip().split(' ')
        month = date[1]
        day = date[0]
        year = date[2]
        date = str(month) + ' ' + str(day) + ',' + str(year)

        article = soup.find('div', {'id': 'newsContent'})
        article = SoupHelper.get_txt_soup(article).findAll('p')

        article_text = list()
        for data in article:
            article_text.append(data.text.strip())
        article_text = ''.join(article_text)

        db_helper.insert_article(
            article_url, Config.nagarik_news,
            Config.nagarik_news_sections_dict.get(category), title, date,
            article_text, '।')
        db_helper.close_connection()

        Logger.add_log('Scraping : ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
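
The first two snippets expect the article link and its category to arrive joined by '||||'. A hypothetical caller (the real URL scraper is not shown):

scrape_article_data('https://example.com/some-article' + '||||' + 'sports')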
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)

        if db_helper.data_present(article_url):
            return

        soup = SoupHelper.get_url_soup(article_url)

        title = soup.find('div', {'class': 'article-header'})

        # Guard against dead links before drilling into the header markup.
        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return

        headline = SoupHelper.get_txt_soup(title).find('h1')
        sub_headline = SoupHelper.get_txt_soup(title).find(
            'div', {'class': 'sub-headline'})

        title = str(headline.text)

        if sub_headline is not None:
            title = str(headline.text) + '\n' + str(sub_headline.text)

        date = soup.find('time')
        article = soup.find('div', {'class': 'description'})

        # Inline <script> text ends up in .text; strip it from the body.
        scripts = SoupHelper.get_txt_soup(article).findAll('script')
        article = article.text

        for script in scripts:
            script_text = script.text
            if script_text in article:
                article = article.replace(script_text, '')

        # Everything after the share widget is boilerplate; keep only the body.
        article = article.split('Share on Facebook')[0]

        # The category is the first path segment of the article URL.
        category = article_url.split('/')[3]

        db_helper.insert_article(article_url, Config.kantipur_daily_, category,
                                 title, date.text, article, '।')
        db_helper.close_connection()

        Logger.add_log('Scraping : ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
Example #4
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)

        if db_helper.data_present(article_url):
            return

        soup = SoupHelper.get_url_soup(article_url)
        title = soup.find('span', {'class': 'news-big-title'})

        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return

        title = title.text

        article_text = list()

        article = soup.find('div', {'class': 'editor-box'})
        article = SoupHelper.get_txt_soup(article).findAll('p')

        for data in article:
            article_text.append(data.text)

        article_text = ' '.join(article_text)

        pub_date = soup.find('span', {'class': 'pub-date'})

        pub_date = pub_date.text

        # pub_date looks like '…, month day, year'; keep only 'month day,year'.
        month = pub_date.split(',')[1].strip().split(' ')[0]
        day = pub_date.split(',')[1].strip().split(' ')[1]
        year = pub_date.split(',')[2].strip()

        date = str(month) + ' ' + str(day) + ',' + str(year)

        category = article_url.split('/')[3]

        db_helper.insert_article(article_url, Config.setopati, category, title,
                                 date, article_text, '।')
        db_helper.close_connection()

        Logger.add_log('Scraping : ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
Example #5
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db_english)

        temp = article_url.split('/')
        category = temp[4]

        if db_helper.data_present(article_url):
            return

        soup = SoupHelper.get_url_soup(article_url)

        title = soup.find('div', {'class': 'col-lg-12'})
        title = SoupHelper.get_txt_soup(title).find('h4')

        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return
        title = title.text

        date = soup.find('div', {'class': 'date-time'})
        date = SoupHelper.get_txt_soup(date).find('span')
        date = date.text
        date = datetime.strptime(date, '%A, %b %d, %Y')
        date = date.strftime('%Y-%m-%d')

        temp_article = soup.find('div', {'class': 'mn-text'})
        temp_article = SoupHelper.get_txt_soup(temp_article).findAll('p')

        article = list()

        for data in temp_article:
            article.append(data.text.strip())

        article = ' '.join(article)

        db_helper.insert_article(article_url, Config.karobar_daily,
                                 category, title, date,
                                 article, '. ')

        db_helper.close_connection()

        Logger.add_log('Scraping : ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
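
Every snippet goes through DbHelper, which is also not shown. A rough sketch of the interface the seven-argument calls above imply, assuming an sqlite3 backing store (table layout and column names are guesses):

import sqlite3


class DbHelper:
    def __init__(self, db_path):
        self.connection = sqlite3.connect(db_path)
        self.connection.execute(
            'CREATE TABLE IF NOT EXISTS articles '
            '(url TEXT PRIMARY KEY, source TEXT, category TEXT, '
            'title TEXT, date TEXT, body TEXT, separator TEXT)')

    def data_present(self, url):
        # True if the article URL has already been stored.
        cursor = self.connection.execute(
            'SELECT 1 FROM articles WHERE url = ?', (url,))
        return cursor.fetchone() is not None

    def insert_article(self, url, source, category, title, date, body, separator):
        self.connection.execute(
            'INSERT OR IGNORE INTO articles VALUES (?, ?, ?, ?, ?, ?, ?)',
            (url, source, category, title, date, body, separator))
        self.connection.commit()

    def close_connection(self):
        self.connection.close()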
Example #6
class KantipurDaily:
    def __init__(self, page_url):
        self.page_url = page_url
        self.db_helper = DbHelper()
        self.parent_url = 'https://www.kantipurdaily.com'

    def scrape_article_url(self):
        soup = get_url_soup(self.page_url)
        article_soup = soup.findAll('div', {'class': 'teaser offset'})
        for data in article_soup:
            url_soup = get_txt_soup(data).find('h2')
            url_soup = get_txt_soup(url_soup).find('a', href=True)
            article_url = url_soup['href']

            if self.db_helper.data_not_present(article_url):
                self.scrape_article_data(article_url)

    def scrape_article_data(self, article_url):
        article_url = self.parent_url + article_url.strip()
        soup = get_url_soup(article_url)

        print(article_url)

        title = soup.find('div', {'class': 'article-header'})
        headline = get_txt_soup(title).find('h1')
        sub_headline = get_txt_soup(title).find('div',
                                                {'class': 'sub-headline'})

        title = str(headline.text)

        if sub_headline is not None:
            title = str(headline.text) + '\n' + str(sub_headline.text)

        date = soup.find('time')
        article = soup.find('div', {'class': 'description'})

        scripts = get_txt_soup(article).findAll('script')
        article = article.text

        for script in scripts:
            script_text = script.text
            if script_text in article:
                article = article.replace(script_text, '')

        self.db_helper.insert_article(article_url, title, date.text, article)
class EKantipur(object):
    def __init__(self, driver):
        self.driver = driver
        self.df = pd.DataFrame()
        self.db_helper = DbHelper()

    def scrape(self):
        self.skip_add()
        self.scrape_articles()
        self.db_helper.close_connection()

    def skip_add(self):
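        # Wait for the landing page's 'skip ad' button to become clickable, then click it.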
        try:

            WebDriverWait(self.driver, 90).until(
                ec.element_to_be_clickable(
                    (By.XPATH, Locators.skip_add_button)))

            button = self.driver.find_element(By.XPATH,
                                              Locators.skip_add_button)
            button.click()

        except TimeoutException as e:
            print('timeout')
            Logger.add_error(str(e))

    def scrape_articles(self):
        try:

            soup = get_url_soup(self.driver.current_url)
            tags = soup.findAll('div', {'class': 'total_comments'})

            for url_data in tags:
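                # The article URL is embedded in the tag's onclick attribute;
                # strip the surrounding markup to recover the bare URL.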
                url_data = str(url_data).replace(
                    '<div class="total_comments" onclick="showFBCommentBox(this,\'',
                    '')
                url_data = str(url_data).replace(
                    '\',\'eng\')"><span class="glyphicon glyphicon-comment"></span></div>',
                    '')
                title, article = scrape_article_data(url_data)
                self.db_helper.insert_article(url_data, title, article)

        except Exception as e:
            Logger.add_error(str(e))
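
A minimal way this class might be driven, assuming Selenium with a local Chrome driver and that scrape_article_data and Locators come from the surrounding project (the real entry point is not shown):

from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://ekantipur.com/')  # assumed landing page
EKantipur(driver).scrape()
driver.quit()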