def read_information(df_web_config):
    """Scrape the India Reuters pages listed in *df_web_config* and
    bulk-insert English-language headlines into the news-feed table.

    Parameters
    ----------
    df_web_config : pandas.DataFrame
        Must expose ``website_link`` and ``website_category`` columns.

    Side effects: inserts rows via ``sql_execute.bulk_insert_news_feeds``
    and reports failures via ``database_log.error_log``; never raises
    (all errors are logged by the outer handler).
    """
    try:
        # Accumulate rows in a plain list and build the DataFrame once at
        # the end: DataFrame.append() was removed in pandas 2.0, and
        # row-by-row appends are quadratic anyway.
        columns = ['website', 'website_link', 'website_category',
                   'news_link', 'header', 'sub_header', 'timestamp']
        collected_rows = []

        for row in df_web_config.itertuples(index=False):
            url_link = row.website_link

            # open with GET method; browser-like User-Agent avoids trivial
            # bot blocking
            resp = requests.get(url_link, headers={'User-Agent': 'Mozilla/5.0'})

            # http_response 200 means OK status
            if resp.status_code == 200:
                # parser
                soup = BeautifulSoup(resp.text, 'html.parser')

                website = 'india_reuters'
                categories = row.website_category
                news_link = ''
                header = ''
                sub_header = ''
                timestamp = ''

                for level_1 in soup.findAll("div", {"class": "news-headline-list"}):
                    for level_2 in level_1.findAll("div", {"class": "story-content"}):
                        for level_3 in level_2.findAll("a"):
                            news_link = level_3['href']
                            header = level_3.text.strip()

                        for level_3 in level_2.findAll("p"):
                            sub_header = level_3.text

                        for level_3 in level_2.findAll("time", {"class": "article-time"}):
                            timestamp = level_3.text.strip()

                        # Keep only stories whose headline is detected as English.
                        if header and detect(header) == "en":
                            collected_rows.append({
                                'website': website,
                                'website_link': url_link,
                                'website_category': categories,
                                'news_link': news_link,
                                'header': header,
                                'sub_header': sub_header,
                                'timestamp': timestamp,
                            })
            else:
                database_log.error_log("India - Loader - india_reuters : read_information", resp.status_code)

        if collected_rows:
            df_news_data = pd.DataFrame(collected_rows, columns=columns)
            df_news_feed_list = [tuple(r) for r in df_news_data[columns].values.tolist()]
            sql_execute.bulk_insert_news_feeds(df_news_feed_list)
        else:
            database_log.error_log("India - Loader - india_reuters : read_information", "no record found")

    except Exception as error:
        database_log.error_log("India - Loader - india_reuters : read_information", error)
# --- Example 2 ---
def load_data_to_tables(url_list):
    """Load previously scraped CSV files and bulk-insert their rows into
    the news-feed table.

    Parameters
    ----------
    url_list : dict
        Maps a CSV base file name to its source URL. Only the file name
        is used here (the CSV is read from ``data_path``); the URL is the
        dict value by construction of the caller's mapping.
    """
    for file_name, url in url_list.items():
        df_news_data = pd.read_csv(data_path + file_name + ".csv")

        # `df.empty is not True` compared identity against the bool
        # singleton; plain truthiness is the correct, idiomatic test.
        if not df_news_data.empty:
            # All files loaded through this path belong to one category.
            df_news_data["website_category"] = "Finance"
            df_news_feed_list = [tuple(r) for r in df_news_data[['website', 'web_url', 'website_category',
                                                                 'article_url', 'header', 'sub_header',
                                                                 'published_date']].values.tolist()]
            sql_execute.bulk_insert_news_feeds(df_news_feed_list)
# --- Example 3 ---
def read_information(df_web_config):
    """Scrape the Mint pages listed in *df_web_config* and bulk-insert
    the extracted headlines into the news-feed table.

    Parameters
    ----------
    df_web_config : pandas.DataFrame
        Must expose ``website_link`` and ``website_category`` columns.

    Side effects: inserts rows via ``sql_execute.bulk_insert_news_feeds``
    and reports failures via ``database_log.error_log``; never raises
    (all errors are logged by the outer handler).
    """
    try:
        # Accumulate rows in a plain list and build the DataFrame once at
        # the end: DataFrame.append() was removed in pandas 2.0, and
        # row-by-row appends are quadratic anyway.
        columns = ['website', 'website_link', 'website_category',
                   'news_link', 'header', 'sub_header', 'timestamp']
        collected_rows = []

        # itertuples() is faster than iterrows() and matches the style of
        # the sibling loaders in this project.
        for row in df_web_config.itertuples(index=False):
            url_link = row.website_link

            # open with GET method; browser-like User-Agent avoids trivial
            # bot blocking
            resp = requests.get(url_link,
                                headers={'User-Agent': 'Mozilla/5.0'})

            # http_response 200 means OK status
            if resp.status_code == 200:
                # parser
                soup = BeautifulSoup(resp.text, 'html.parser')

                website = 'mint'
                categories = row.website_category
                news_link = ''
                header = ''
                sub_header = ''
                timestamp = ''

                # One record is emitted per "headlineSec" block; the inner
                # loops leave the LAST matching anchor/date in place, which
                # mirrors the original extraction behavior.
                for headline_sec in soup.findAll("div", {"class": "headlineSec"}):
                    for headline in headline_sec.find_all('h2',
                                                          {"class": "headline"}):
                        for anchor in headline.find_all('a', href=True):
                            news_link = anchor['href']
                            header = anchor.text

                    for date_block in headline_sec.find_all('span',
                                                            {"class": "fl date"}):
                        for date_span in date_block.find_all('span'):
                            timestamp = date_span.text

                    collected_rows.append({
                        'website': website,
                        'website_link': url_link,
                        'website_category': categories,
                        'news_link': news_link,
                        'header': header,
                        'sub_header': sub_header,
                        'timestamp': timestamp,
                    })
            else:
                database_log.error_log(
                    "India - Loader - mint : read_information",
                    resp.status_code)

        if collected_rows:
            df_news_data = pd.DataFrame(collected_rows, columns=columns)
            df_news_feed_list = [tuple(r) for r in df_news_data[columns].values.tolist()]
            sql_execute.bulk_insert_news_feeds(df_news_feed_list)
        else:
            database_log.error_log("India - Loader - mint : read_information",
                                   "no record found")

    except Exception as error:
        database_log.error_log("India - Loader - mint : read_information",
                               error)