import pandas as pd
import requests
from bs4 import BeautifulSoup
from langdetect import detect

# database_log, sql_execute, and data_path are project-local helpers and
# config assumed to be in scope here; they are not shown in this section.


# Reuters India reader (logged as "india_reuters").
def read_information(df_web_config):
    try:
        # Collect scraped rows in a plain list and build the DataFrame once
        # at the end (DataFrame.append was removed in pandas 2.0, and the
        # original column list was also missing 'news_link').
        news_rows = []
        for row in df_web_config.itertuples(index=False):
            url_link = row.website_link
            # Open with GET method.
            resp = requests.get(url_link, headers={'User-Agent': 'Mozilla/5.0'})
            # HTTP status 200 means OK.
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.text, 'html.parser')
                website = 'india_reuters'
                categories = row.website_category
                news_link = ''
                header = ''
                sub_header = ''
                timestamp = ''
                # Walk the headline list: each story block yields a link,
                # a headline, an optional sub-header, and a timestamp.
                for level_1 in soup.find_all("div", {"class": "news-headline-list"}):
                    for level_2 in level_1.find_all("div", {"class": "story-content"}):
                        for level_3 in level_2.find_all("a"):
                            news_link = level_3['href']
                            header = level_3.text.strip()
                        for level_3 in level_2.find_all("p"):
                            sub_header = level_3.text
                        for level_3 in level_2.find_all("time", {"class": "article-time"}):
                            timestamp = level_3.text.strip()
                        # Keep only English-language headlines.
                        if len(header) > 0 and detect(header) == "en":
                            news_rows.append({
                                'website': website,
                                'website_link': url_link,
                                'website_category': categories,
                                'news_link': news_link,
                                'header': header,
                                'sub_header': sub_header,
                                'timestamp': timestamp,
                            })
            else:
                database_log.error_log(
                    "India - Loader - india_reuters : read_information",
                    resp.status_code)
        df_news_data = pd.DataFrame(news_rows, columns=[
            'website', 'website_link', 'website_category', 'news_link',
            'header', 'sub_header', 'timestamp'])
        if not df_news_data.empty:
            df_news_feed_list = [
                tuple(r) for r in df_news_data[[
                    'website', 'website_link', 'website_category', 'news_link',
                    'header', 'sub_header', 'timestamp']].values.tolist()]
            sql_execute.bulk_insert_news_feeds(df_news_feed_list)
        else:
            database_log.error_log(
                "India - Loader - india_reuters : read_information",
                "no record found")
    except Exception as error:
        database_log.error_log(
            "India - Loader - india_reuters : read_information", error)
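# A minimal usage sketch, assuming the config DataFrame carries the two
# columns this reader touches ('website_link' and 'website_category');
# the URL and category values below are illustrative, not project config.
if __name__ == "__main__":
    demo_config = pd.DataFrame([{
        'website_link': 'https://www.reuters.com/world/india/',  # illustrative
        'website_category': 'Top News',                          # illustrative
    }])
    read_information(demo_config)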
# CSV-based loader: reads one pre-scraped CSV per entry and bulk-inserts it.
def load_data_to_tables(url_list):
    for file_name, url in url_list.items():
        # Only the file-name key is used to locate the CSV; the URL value
        # is carried along but never read here.
        df_news_data = pd.read_csv(data_path + file_name + ".csv")
        if not df_news_data.empty:
            df_news_data["website_category"] = "Finance"
            df_news_feed_list = [
                tuple(r) for r in df_news_data[[
                    'website', 'web_url', 'website_category', 'article_url',
                    'header', 'sub_header', 'published_date']].values.tolist()]
            sql_execute.bulk_insert_news_feeds(df_news_feed_list)
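# Hypothetical usage: the expected CSV columns are website, web_url,
# website_category, article_url, header, sub_header, and published_date;
# the file name and URL below are made up for illustration.
if __name__ == "__main__":
    demo_urls = {"finance_news_dump": "https://example.com/source"}
    load_data_to_tables(demo_urls)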
# Mint reader (a separate loader module in the repo; it reuses the name
# read_information and logs as "mint").
def read_information(df_web_config):
    try:
        # As above, accumulate rows and build the DataFrame once at the end
        # ('news_link' was likewise missing from the original column list).
        news_rows = []
        for _, row in df_web_config.iterrows():
            url_link = row["website_link"]
            # Open with GET method.
            resp = requests.get(url_link, headers={'User-Agent': 'Mozilla/5.0'})
            # HTTP status 200 means OK.
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.text, 'html.parser')
                website = 'mint'
                categories = row["website_category"]
                news_link = ''
                header = ''
                sub_header = ''
                timestamp = ''
                # Each headline section holds a linked headline and a date
                # span; this page exposes no sub-header, so it stays ''.
                for level_1 in soup.find_all("div", {"class": "headlineSec"}):
                    for level_2 in level_1.find_all('h2', {"class": "headline"}):
                        for level_3 in level_2.find_all('a', href=True):
                            news_link = level_3['href']
                            header = level_3.text
                    for level_2 in level_1.find_all('span', {"class": "fl date"}):
                        for level_3 in level_2.find_all('span'):
                            timestamp = level_3.text
                    news_rows.append({
                        'website': website,
                        'website_link': url_link,
                        'website_category': categories,
                        'news_link': news_link,
                        'header': header,
                        'sub_header': sub_header,
                        'timestamp': timestamp,
                    })
            else:
                database_log.error_log(
                    "India - Loader - mint : read_information",
                    resp.status_code)
        df_news_data = pd.DataFrame(news_rows, columns=[
            'website', 'website_link', 'website_category', 'news_link',
            'header', 'sub_header', 'timestamp'])
        if not df_news_data.empty:
            df_news_feed_list = [
                tuple(r) for r in df_news_data[[
                    'website', 'website_link', 'website_category', 'news_link',
                    'header', 'sub_header', 'timestamp']].values.tolist()]
            sql_execute.bulk_insert_news_feeds(df_news_feed_list)
        else:
            database_log.error_log(
                "India - Loader - mint : read_information",
                "no record found")
    except Exception as error:
        database_log.error_log(
            "India - Loader - mint : read_information", error)
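# The sql_execute module is not shown in this section; the sketch below is
# a hypothetical stand-in for its bulk_insert_news_feeds helper, assuming a
# sqlite3 database and a news_feeds table whose seven columns mirror the
# tuples built above. The table and column names here are guesses.
import sqlite3


def bulk_insert_news_feeds_sketch(rows, db_path="news.db"):
    # Each row is a 7-tuple: (website, website_link, website_category,
    # news_link, header, sub_header, timestamp/published_date).
    conn = sqlite3.connect(db_path)
    try:
        conn.executemany(
            "INSERT INTO news_feeds (website, website_link, website_category,"
            " news_link, header, sub_header, timestamp)"
            " VALUES (?, ?, ?, ?, ?, ?, ?)", rows)
        conn.commit()
    finally:
        conn.close()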