def get_company_overview():
    # Daily mention counts for companies tracked in yahoo_stock_companies;
    # ADDTIME shifts the stored timestamps ahead 8 hours (to UTC+8).
    company_df = query_from_db(
        """SELECT ent_text AS company_name,
                  CAST(ADDTIME(news_published_date, '8:00:0') AS DATE) AS published_date,
                  COUNT(*) AS count
           FROM news_db.news_kw_view
           WHERE ent_text IN (SELECT company_name FROM news_db.yahoo_stock_companies)
             AND ent_type = 'ORG'
             AND ADDTIME(news_published_date, '8:00:0') > '2021-01-19'
           GROUP BY ent_text,
                    CAST(ADDTIME(news_published_date, '8:00:0') AS DATE);""")
    return company_df
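# query_from_db is imported from db_func and is not shown in this excerpt.
# A minimal sketch of the assumed contract (SQL string in, pandas DataFrame
# out); the real helper may manage its own connection or differ in details:
def _query_from_db_sketch(sql, connection):
    import pandas as pd
    # pandas.read_sql accepts any DB-API connection, e.g. mysql.connector's.
    return pd.read_sql(sql, connection)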
def get_keywords_by_date():
    # Editorial keywords per article, with timestamps shifted to UTC+8.
    kw_df = query_from_db(
        """SELECT ADDTIME(news_published_date, '8:00:0') AS published_date,
                  news_keywords
           FROM news_db.news_contents
           WHERE news_keywords IS NOT NULL
             AND news_keywords != ''
             AND ADDTIME(news_published_date, '8:00:0') > '2021-01-19';""")
    # Keep only the calendar date for grouping downstream.
    kw_df['published_date'] = kw_df['published_date'].dt.date
    return kw_df
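# news_keywords is assumed to be one delimiter-separated string per article;
# the delimiter itself is an assumption (',' here). A sketch of deriving
# daily keyword counts from get_keywords_by_date():
def _keyword_counts_sketch(kw_df, delim=','):
    exploded = (kw_df.assign(news_keywords=kw_df['news_keywords'].str.split(delim))
                     .explode('news_keywords'))
    return (exploded.groupby(['published_date', 'news_keywords'])
                    .size()
                    .reset_index(name='count'))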
def get_rss_data():
    # Daily feed-item counts per news category.
    count_df = query_from_db(
        """SELECT DATE(nrf.created_at) AS DATE,
                  nc.category_name,
                  COUNT(*) AS COUNT
           FROM news_db.news_rss_feeds nrf
           LEFT JOIN news_db.news_categories nc
                  ON nc.rss_source = nrf.rss_source
           GROUP BY DATE(nrf.created_at), nc.category_name
           ORDER BY DATE(nrf.created_at);""")
    return count_df
def get_fail_parse_data():
    # Daily counts of feed items that were processed but failed to parse.
    fail_df = query_from_db(
        """SELECT DATE(nrf.updated_at) AS DATE,
                  nc.category_name,
                  COUNT(*) AS COUNT
           FROM news_db.news_rss_feeds nrf
           LEFT JOIN news_db.news_categories nc
                  ON nc.rss_source = nrf.rss_source
           WHERE nrf.processed_status = 1
             AND nrf.processed_success = 0
           GROUP BY DATE(nrf.updated_at), nc.category_name
           ORDER BY DATE(nrf.updated_at);""")
    return fail_df
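# get_rss_data() and get_fail_parse_data() return the same
# (DATE, category_name, COUNT) shape, so a per-category parse-failure rate
# can be derived by merging them. A sketch (note the two dates differ in
# meaning: created_at for totals, updated_at for failures):
def _fail_rate_sketch(count_df, fail_df):
    merged = count_df.merge(fail_df, on=['DATE', 'category_name'],
                            how='left', suffixes=('_total', '_fail'))
    merged['COUNT_fail'] = merged['COUNT_fail'].fillna(0)
    merged['fail_rate'] = merged['COUNT_fail'] / merged['COUNT_total']
    return merged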
import logging
import os
import pickle
import sys
import time

import mysql.connector

parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from db_func import query_from_db

start = time.time()
DIR_PATH = os.path.dirname(os.path.realpath(__file__))
FORMAT = '%(asctime)s %(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO,
                    filename=os.path.join(DIR_PATH, 'logs', 'sent_splitter.log'),
                    filemode='a', format=FORMAT)

# Unprocessed articles are handled in batches of 300.
raw_df = query_from_db(
    "SELECT news_id, news FROM news_contents WHERE processed_status = 0 LIMIT 300;")

with open(os.path.join(parent_dir, 'configs', 'server2server.config'), 'rb') as f:
    configs = pickle.load(f)

# Leading author/agency tags at the start of a line, in full- or half-width
# brackets, e.g. （…）, (...), 【…】, 〔…〕, [...], ［…］.
author_header = r'^((（.+?）)|(\(.+?\))|(【.+?】)|(〔.+?〕)|(\[.+?\])|(［.+?］))\s*'
# Promotional footers and related-link lines appended by news outlets,
# e.g. "更多...報導", "延伸閱讀", "責任編輯".
content_footer = r'^更多.*?報導:?$|^更多新聞推薦|^【更多新聞】|^延伸閱讀:|^【延伸閱讀】|^超人氣$|^看更多.*?文章|^更多匯流新聞網報導:|^原始連結|^更多\w+內容:|^《TVBS》提醒您:|^※|^（延伸閱讀:|^相關影音:|^責任編輯:|^☆|^更多\w+相關新聞|►'

mydb = mysql.connector.connect(host=configs['host'], user=configs['user'],
                               passwd=configs['passwd'],
                               database=configs['database'])
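# How the two patterns above are assumed to be applied (the cleaning code
# itself is not part of this excerpt): strip a leading author/agency tag
# from each line and drop lines that are promotional footers.
def _clean_article_sketch(lines):
    import re
    cleaned = []
    for line in lines:
        if re.search(content_footer, line):
            continue  # '更多...報導' / '延伸閱讀' style footer line
        cleaned.append(re.sub(author_header, '', line))
    return cleaned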
            print("Spacy process time {} seconds".format(time.time() - spacy_start))
        except Exception as e:
            logging.error('NLP process Error: {}\n Content ID: {}'.format(e, content_sent_id))
            print('NLP process Error: {}\n Content ID: {}'.format(e, content_sent_id))
            continue  # skip the insert step; the NLP outputs above are undefined
        print("Process record in {} seconds".format(time.time() - process_start))
        insert_start = time.time()
        insert_sentlevel_info(word_sentence_list[0], pos_sentence_list[0],
                              content_sent_id, 'ckip-transformer')
        insert_sentlevel_info(word_sent_list_spacy, word_pos_list_spacy,
                              content_sent_id, 'spacy-transformer',
                              word_dep_list_spacy)
        insert_ner_info(entity_sentence_list[0], content_sent_id, 'ckip-transformer')
        insert_ner_info(entity_sent_list_spacy, content_sent_id, 'spacy-transformer')
        print("Insert record in {} seconds".format(time.time() - insert_start))


raw_df = query_from_db("SELECT * FROM news_db.financial_sent_view LIMIT 100;")
print("Finish load the data in {} seconds".format(time.time() - start))
sent_level_analysis(raw_df)
mydb.close()
logging.info('Finish process {} examples in {} seconds'.format(len(raw_df), time.time() - start))
print('Finish process {} examples in {} seconds'.format(len(raw_df), time.time() - start))
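# The ws/pos/ner drivers and the spaCy pipeline used above are constructed
# elsewhere in the repo. A typical setup with ckip-transformers and spaCy
# looks like this sketch (the exact model names are assumptions):
import spacy
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

ws_driver = CkipWordSegmenter(model="bert-base")
pos_driver = CkipPosTagger(model="bert-base")
ner_driver = CkipNerChunker(model="bert-base")
nlp = spacy.load("zh_core_web_trf")  # transformer-based Chinese pipeline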
import logging
import os
import sys
import time
from os.path import abspath, dirname

import requests

DIR_PATH = dirname(abspath(__file__))
parent_dir = os.path.dirname(DIR_PATH)
sys.path.append(parent_dir)

FORMAT = '%(asctime)s %(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO,
                    filename=os.path.join(DIR_PATH, 'logs', 'yahoo_stock_companies.log'),
                    filemode='a', format=FORMAT)
logger = logging.getLogger('yahoo_stock_logger')

from db_func import query_from_db, bulk_insert_to_db

start = time.time()
stock_df = query_from_db(
    """SELECT stock_category_id, category_name, category_url
       FROM news_db.yahoo_stock_categories
       WHERE valid = 1""")

# Browser-like request headers shared by all category-page fetches.
headers = {
    'user-agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/66.0.3359.181 Safari/537.36'),
    'Connection': 'close'
}

res = []
for idx, (stock_category_id, _, category_url) in stock_df.iterrows():
    r = requests.get(category_url, headers=headers)
    r.encoding = 'big5-hkscs'  # the category pages are Big5-encoded
    web_content = r.text
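# The parsing of web_content is not shown in this excerpt. One plausible
# sketch, assuming the Yahoo category page lists companies as "ticker name"
# cells in an HTML table (the page structure is an assumption):
from bs4 import BeautifulSoup

def _parse_companies_sketch(web_content, stock_category_id):
    soup = BeautifulSoup(web_content, 'html.parser')
    rows = []
    for cell in soup.select('table tr td'):
        parts = cell.get_text(strip=True).split(maxsplit=1)  # e.g. "2330 台積電"
        if len(parts) == 2 and parts[0][:1].isdigit():
            rows.append((stock_category_id, parts[0], parts[1]))
    return rows  # (stock_category_id, stock_ticker, company_name) tuples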
def get_ner_by_date(selected_date):
    # NOTE: selected_date is interpolated into the SQL string here; a
    # parameterized variant is sketched below.
    ner_df = query_from_db(
        """SELECT CAST(nkv.news_published_date AS DATE) AS published_date,
                  nkv.ent_text,
                  nkv.ent_type
           FROM news_db.news_kw_view nkv
           WHERE CAST(nkv.news_published_date AS DATE) = '{}';""".format(selected_date))
    return ner_df
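# With the underlying mysql.connector connection, the same query can bind
# the date as a DB-API parameter instead of string formatting (sketch;
# assumes access to the connection object used by db_func):
def _get_ner_by_date_param_sketch(connection, selected_date):
    sql = ("SELECT CAST(nkv.news_published_date AS DATE) AS published_date, "
           "nkv.ent_text, nkv.ent_type "
           "FROM news_db.news_kw_view nkv "
           "WHERE CAST(nkv.news_published_date AS DATE) = %s;")
    cursor = connection.cursor()
    cursor.execute(sql, (selected_date,))
    rows = cursor.fetchall()
    cursor.close()
    return rows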
    else:
        mydb.commit()
    process_cursor.close()


def title_level_analysis(raw_df):
    for index, (news_id, title) in raw_df.iterrows():
        insert_process_flag(news_id, 'title-words')
        process_start = time.time()
        try:
            word_title_list = ws_driver([title], use_delim=False)
            entity_title_list = ner_driver([title], use_delim=False)
            pos_title_list = pos_driver(word_title_list, use_delim=False)
            print("Ckip process time {} seconds".format(time.time() - process_start))
        except Exception as e:
            logging.error('NLP process Error: {}\n News ID: {}'.format(e, news_id))
            print('NLP process Error: {}\n News ID: {}'.format(e, news_id))
            continue  # skip the insert step; the NLP outputs above are undefined
        print("Process record in {} seconds".format(time.time() - process_start))
        insert_start = time.time()
        insert_title_level_info(word_title_list[0], pos_title_list[0],
                                news_id, 'ckip-transformer')
        insert_ner_info(entity_title_list[0], news_id, 'ckip-transformer')
        print("Insert record in {} seconds".format(time.time() - insert_start))


raw_df = query_from_db("SELECT news_id, news_title FROM news_db.financial_title_view LIMIT 100;")
print("Finish load the data in {} seconds".format(time.time() - start))
title_level_analysis(raw_df)
mydb.close()
logging.info('Finish process {} examples in {} seconds'.format(len(raw_df), time.time() - start))
print('Finish process {} examples in {} seconds'.format(len(raw_df), time.time() - start))
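# insert_process_flag, insert_title_level_info, and insert_ner_info are
# defined elsewhere in the repo. A sketch of what insert_process_flag is
# assumed to do (the table and column names are assumptions):
def _insert_process_flag_sketch(connection, news_id, flag):
    cur = connection.cursor()
    cur.execute("UPDATE news_contents SET process_flag = %s WHERE news_id = %s;",
                (flag, news_id))
    connection.commit()
    cur.close()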
import logging
import os
import sys
import time
from datetime import datetime
from os.path import abspath, dirname

DIR_PATH = dirname(abspath(__file__))
parent_dir = os.path.dirname(DIR_PATH)
sys.path.append(parent_dir)

FORMAT = '%(asctime)s %(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO,
                    filename=os.path.join(DIR_PATH, 'logs', 'stock_prices.log'),
                    filemode='a', format=FORMAT)
logger = logging.getLogger('stock_price_logger')

from db_func import query_from_db, bulk_insert_to_db, insert_to_db

start = time.time()
ticker_df = query_from_db("SELECT stock_ticker FROM news_db.yahoo_stock_companies")
# Resume from a specific ticker when a previous run was interrupted:
# ticker_df = ticker_df.iloc[ticker_df[ticker_df['stock_ticker'] == '2891B'].index.values[0]:]


def date_convertor(date_text):
    # The source data uses the ROC calendar: ROC year + 1911 = Gregorian year.
    temp_y, temp_m, temp_d = date_text.split('/')
    temp_y = str(int(temp_y) + 1911)
    temp = '/'.join([temp_y, temp_m, temp_d])
    return datetime.strptime(temp, '%Y/%m/%d')


# Row layout (column headers from the source data): 日期 (date),
# 成交股數 (shares traded), 成交金額 (trade value), 開盤價 (open),
# 最高價 (high), 最低價 (low), 收盤價 (close), 漲跌價差 (price change),
# 成交筆數 (number of transactions), e.g.
# ['107/01/02', '13,698,944', '499,370,945', '36.45', '36.60', '36.05', '36.55', '+0.10', '3,932']
def format_convertor(stock_ticker, data):
    (cur_date, volumn, total_price, open_price, high_price, low_price,
     close_price, price_diff, transaction) = data
    transaction = int(transaction.replace(',', ''))
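# Usage of date_convertor: ROC year 107 + 1911 = 2018, so:
# >>> date_convertor('107/01/02')
# datetime.datetime(2018, 1, 2, 0, 0)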