Example #1
def get_company_overview():
    # Daily news-mention counts per company: ORG entities whose text matches a
    # name in yahoo_stock_companies, published timestamps shifted by +8 hours,
    # from 2021-01-19 onward.
    company_df = query_from_db(
        """SELECT ent_text as company_name, cast(ADDTIME(news_published_date, '8:00:0') as DATE) as published_date, COUNT(*) as count
		FROM news_db.news_kw_view 
		WHERE ent_text in (SELECT company_name FROM news_db.yahoo_stock_companies) and ent_type = 'ORG' and ADDTIME(news_published_date, '8:00:0') > '2021-01-19'
		GROUP BY ent_text, cast(ADDTIME(news_published_date, '8:00:0') as DATE);""")
    return company_df
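Every example here relies on a shared query_from_db helper that the snippets themselves do not define. A minimal sketch of what such a helper could look like, assuming mysql.connector and pandas; the connection parameters are placeholders, not the project's real ones:

import mysql.connector
import pandas as pd

def query_from_db(query):
    # Hypothetical helper: run a SQL query and return the result as a DataFrame.
    conn = mysql.connector.connect(host='localhost', user='news_user',
                                   passwd='change-me', database='news_db')
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()
        columns = [col[0] for col in cursor.description]
        return pd.DataFrame(rows, columns=columns)
    finally:
        conn.close()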
Example #2
def get_keywords_by_date():
    # News keywords per published date (timestamps shifted by +8 hours),
    # skipping rows with missing or empty keywords, from 2021-01-19 onward.
    kw_df = query_from_db(
        """SELECT ADDTIME(news_published_date, '8:00:0') published_date, news_keywords 
		FROM news_db.news_contents
		WHERE news_keywords is not NULL and news_keywords != '' and ADDTIME(news_published_date, '8:00:0') > '2021-01-19';"""
    )
    kw_df['published_date'] = kw_df['published_date'].dt.date
    return kw_df
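One way the returned frame might be consumed, assuming news_keywords is a comma-separated string (the delimiter is not confirmed by the snippet):

kw_df = get_keywords_by_date()
# Split the keyword string, explode to one keyword per row, count per day.
kw_df['news_keywords'] = kw_df['news_keywords'].str.split(',')
daily_counts = (kw_df.explode('news_keywords')
                     .groupby(['published_date', 'news_keywords'])
                     .size()
                     .reset_index(name='count'))
print(daily_counts.head())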
Example #3
def get_rss_data():
    # Number of collected RSS feed items per day (created_at) and category.
    count_df = query_from_db("""
	SELECT DATE(nrf.created_at) AS DATE, nc.category_name, COUNT(*) AS COUNT
	FROM news_db.news_rss_feeds nrf
	LEFT JOIN news_db.news_categories nc ON nc.rss_source = nrf.rss_source
	GROUP BY DATE(nrf.created_at), nc.category_name ORDER BY DATE(nrf.created_at);"""
                             )
    return count_df
Example #4
def get_fail_parse_data():
    # Number of RSS feed items per day (updated_at) and category that were
    # processed but failed (processed_status = 1, processed_success = 0).
    fail_df = query_from_db("""
		SELECT DATE(nrf.updated_at) AS DATE, nc.category_name, COUNT(*) AS COUNT
		FROM news_db.news_rss_feeds nrf
		LEFT JOIN news_db.news_categories nc ON nc.rss_source = nrf.rss_source
		WHERE nrf.processed_status = 1 and nrf.processed_success = 0
		GROUP BY DATE(nrf.updated_at), nc.category_name ORDER BY DATE(nrf.updated_at);"""
                            )
    return fail_df
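Since get_rss_data and get_fail_parse_data return the same DATE / category_name / COUNT columns, the two frames can be merged to estimate a per-category parse-failure rate. A sketch; note that the totals are grouped by created_at while the failures are grouped by updated_at, so the dates only roughly line up:

total_df = get_rss_data()
fail_df = get_fail_parse_data()
merged = total_df.merge(fail_df, on=['DATE', 'category_name'],
                        how='left', suffixes=('_total', '_failed'))
merged['COUNT_failed'] = merged['COUNT_failed'].fillna(0)
merged['fail_rate'] = merged['COUNT_failed'] / merged['COUNT_total']
print(merged.sort_values('fail_rate', ascending=False).head())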
Example #5
import os
import sys
import time
import logging
import pickle

import mysql.connector

# Add the parent directory to sys.path so db_func can be imported below.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from db_func import query_from_db

start = time.time()
DIR_PATH = os.path.dirname(os.path.realpath(__file__))

FORMAT = '%(asctime)s %(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO,
                    filename=os.path.join(DIR_PATH, 'logs',
                                          'sent_splitter.log'),
                    filemode='a',
                    format=FORMAT)
raw_df = query_from_db(
    "SELECT news_id, news FROM news_contents WHERE processed_status = 0 LIMIT 300;"
)

with open(os.path.join(parent_dir, 'configs', 'server2server.config'),
          'rb') as f:
    configs = pickle.load(f)

author_header = r'^((（.+?）)|(\(.+?\))|(【.+?】)|(〔.+?〕)|(\[.+?\])|(［.+?］))\s*'
content_footer = r'^更多.*?報導:?$|^更多新聞推薦|^【更多新聞】|^延伸閱讀:|^【延伸閱讀】|^超人氣$|^看更多.*?文章|^更多匯流新聞網報導:|^原始連結|^更多\w+內容:|^《TVBS》提醒您:|^※|^（延伸閱讀:|^相關影音:|^責任編輯:|^☆|^更多\w+相關新聞|►'

mydb = mysql.connector.connect(host=configs['host'],
                               user=configs['user'],
                               passwd=configs['passwd'],
                               database=configs['database'])
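The database credentials above are read from a pickled server2server.config that is not shown. A sketch of how such a file could be produced; the values are placeholders, only the key names are taken from the code above:

import pickle

configs = {
    'host': 'db.example.com',   # placeholder
    'user': 'news_user',        # placeholder
    'passwd': 'change-me',      # placeholder
    'database': 'news_db',
}
with open('configs/server2server.config', 'wb') as f:
    pickle.dump(configs, f)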

Example #6
                                                        spacy_start))
        except Exception as e:
            logging.error('NLP process Error: {}\n Content ID: {}'.format(
                e, content_sent_id))
            print('NLP process Error: {}\n Content ID: {}'.format(
                e, content_sent_id))
        print("Process record in {} seconds".format(time.time() -
                                                    process_start))
        insert_start = time.time()
        insert_sentlevel_info(word_sentence_list[0], pos_sentence_list[0],
                              content_sent_id, 'ckip-transformer')
        insert_sentlevel_info(word_sent_list_spacy, word_pos_list_spacy,
                              content_sent_id, 'spacy-transformer',
                              word_dep_list_spacy)
        insert_ner_info(entity_sentence_list[0], content_sent_id,
                        'ckip-transformer')
        insert_ner_info(entity_sent_list_spacy, content_sent_id,
                        'spacy-transformer')
        print("Insert record in {} seconds".format(time.time() - insert_start))


raw_df = query_from_db("SELECT * FROM news_db.financial_sent_view LIMIT 100;")
print("finish load the data in {} seconds".format(time.time() - start))
sent_level_analysis(raw_df)
mydb.close()
logging.info('Finish process {} examples in {} seconds'.format(
    len(raw_df),
    time.time() - start))
print('Finish process {} examples in {} seconds'.format(
    len(raw_df),
    time.time() - start))
Example #7

import os
import sys
import time
import logging
from os.path import dirname, abspath

import requests
DIR_PATH = dirname(abspath(__file__))
parent_dir = os.path.dirname(DIR_PATH)
sys.path.append(parent_dir)

FORMAT = '%(asctime)s %(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO,
                    filename=os.path.join(DIR_PATH, 'logs',
                                          'yahoo_stock_companies.log'),
                    filemode='a',
                    format=FORMAT)
logger = logging.getLogger('yahoo_stock_logger')

from db_func import query_from_db, bulk_insert_to_db
start = time.time()
stock_df = query_from_db(
    """SELECT stock_category_id ,category_name, category_url FROM news_db.yahoo_stock_categories WHERE valid = 1"""
)

### Define the global headers for requests
headers = {
    'user-agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Connection': 'close'
}

res = []
for idx, (stock_category_id, _, category_url) in stock_df.iterrows():
    # print(stock_category_id, category_url)
    r = requests.get(category_url, headers=headers)
    r.encoding = 'big5-hkscs'
    web_content = r.text
Example #8
def get_ner_by_date(selected_date):
    # Named entities (text and type) from news published on the given date.
    # Note: selected_date is interpolated directly into the SQL string.
    ner_df = query_from_db(
        """SELECT CAST(nkv.news_published_date AS DATE) AS published_date, nkv.ent_text, nkv.ent_type 
		FROM news_db.news_kw_view nkv
		WHERE CAST(nkv.news_published_date AS DATE) = '{}';""".format(selected_date))
    return ner_df
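A possible way to summarise the result, for example counting distinct entity texts per entity type on one day (the date value below is only an example):

ner_df = get_ner_by_date('2021-03-01')
# Distinct entity strings per entity type on that day.
type_counts = (ner_df.groupby('ent_type')['ent_text']
                     .nunique()
                     .sort_values(ascending=False))
print(type_counts)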
Example #9
    else:
        mydb.commit()
    process_cursor.close()

def title_level_analysis(raw_df):
    # Run CKIP word segmentation, POS tagging and NER on each news title and
    # insert the results into the database.
    for index, (news_id, title) in raw_df.iterrows():
        insert_process_flag(news_id, 'title-words')
        process_start = time.time()
        try:
            word_title_list = ws_driver([title], use_delim=False)
            entity_title_list = ner_driver([title], use_delim=False)
            pos_title_list = pos_driver(word_title_list, use_delim=False)
            print("Ckip process time {} seconds".format(time.time() - process_start))
      
        except Exception as e:
            logging.error('NLP process Error: {}\n News ID: {}'.format(e, news_id))
            print('NLP process Error: {}\n News ID: {}'.format(e, news_id))
        print("Process record in {} seconds".format(time.time() - process_start))
        insert_start = time.time()
        insert_title_level_info(word_title_list[0], pos_title_list[0], news_id, 'ckip-transformer')
        insert_ner_info(entity_title_list[0], news_id, 'ckip-transformer')
        print("Insert record in {} seconds".format(time.time() - insert_start))
     


raw_df = query_from_db("SELECT news_id, news_title FROM news_db.financial_title_view LIMIT 100;")
print("Finish load the data in {} seconds".format(time.time() - start))
title_level_analysis(raw_df)
mydb.close()
logging.info('Finish process {} examples in {} seconds'.format(len(raw_df), time.time() - start))
print('Finish process {} examples in {} seconds'.format(len(raw_df), time.time() - start))
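ws_driver, pos_driver and ner_driver are not defined in these snippets; given the 'ckip-transformer' tool tag, they are presumably CKIP Transformers drivers. A hedged sketch of how such drivers are typically constructed with the ckip-transformers package (the model name and arguments are assumptions and depend on the package version):

from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

# Assumed driver setup; each driver is then called on a list of texts,
# e.g. ws_driver([title], use_delim=False) as in the snippet above.
ws_driver = CkipWordSegmenter(model="bert-base")
pos_driver = CkipPosTagger(model="bert-base")
ner_driver = CkipNerChunker(model="bert-base")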
Example #10
import os
import sys
import time
import logging
from datetime import datetime
from os.path import dirname, abspath
DIR_PATH = dirname(abspath(__file__))
parent_dir = os.path.dirname(DIR_PATH)
sys.path.append(parent_dir)

FORMAT = '%(asctime)s %(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO,
                    filename=os.path.join(DIR_PATH, 'logs',
                                          'stock_prices.log'),
                    filemode='a',
                    format=FORMAT)
logger = logging.getLogger('stock_price_logger')

from db_func import query_from_db, bulk_insert_to_db, insert_to_db

start = time.time()
ticker_df = query_from_db(
    "SELECT stock_ticker FROM news_db.yahoo_stock_companies")


#ticker_df = ticker_df.iloc[ticker_df[ticker_df['stock_ticker'] == '2891B'].index.values[0]:]
def date_convertor(date_text):
    temp_y, temp_m, temp_d = date_text.split('/')
    temp_y = str(int(temp_y) + 1911)
    temp = '/'.join([temp_y, temp_m, temp_d])
    return datetime.strptime(temp, '%Y/%m/%d')
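date_convertor turns ROC (Minguo) calendar dates such as '107/01/02' into Gregorian datetimes by adding 1911 to the year, for example:

# ROC year 107 + 1911 = 2018, so '107/01/02' maps to 2018-01-02.
print(date_convertor('107/01/02'))  # 2018-01-02 00:00:00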


# Data columns: 日期 (date), 成交股數 (shares traded), 成交金額 (trade value), 開盤價 (open),
# 最高價 (high), 最低價 (low), 收盤價 (close), 漲跌價差 (price change), 成交筆數 (number of trades)
# Sample row: 'data': [['107/01/02', '13,698,944', '499,370,945', '36.45', '36.60', '36.05', '36.55', '+0.10', '3,932']]
def format_convertor(stock_ticker, data):
    cur_date, volumn, total_price, open_price, high_price, low_price, close_price, price_diff, transaction = data
    transaction = int(transaction.replace(',', ''))