def sync_short_borrowed(trading_date):
    """
    Securities available for SBL short sale ("borrowable short")
    - only the current day's data is published
    - the data cut-over time still needs to be confirmed
    """
    dsitem = 'short_borrowed'
    logger = common.get_logger('finance')
    datestr = trading_date.replace('-', '')

    if not has_cache(dsitem, datestr, 'csv'):
        session = common.get_session(False)
        url = 'http://www.twse.com.tw/SBL/TWT96U?response=csv'
        resp = session.get(url)
        ds = resp.text
        line1 = ds[:ds.find('\r\n')]
        match = re.search(r'(\d{3})年(\d{2})月(\d{2})日', line1)
        if match is None:
            logger.error('Cannot extract the date string from the borrowable-short CSV')
            return
        # Convert the ROC calendar year to the Gregorian calendar
        yy = int(match.group(1)) + 1911
        mm = match.group(2)
        dd = match.group(3)
        dsdate = '%04d%s%s' % (yy, mm, dd)
        if dsdate != datestr:
            logger.error('Borrowable-short data date differs from the requested date; data date %s, requested date %s',
                         dsdate, datestr)
            return
        logger.info('Writing borrowable-short data to cache: %s', datestr)
        save_cache(dsitem, datestr, ds, 'csv')

    db_conn = db.get_connection()
    sql = '''
    INSERT INTO `short_sell` (
        trading_date, security_id, borrowed
    ) VALUES (?,?,?)
    '''
    logger.info('Loading borrowable-short data for %s', datestr)
    csv_path = get_cache_path(dsitem, datestr, 'csv')
    # The CSV lays two securities side by side per row; the last column is unused
    col_names = ['sec1', 'vol1', 'sec2', 'vol2', 'unused']
    df = pandas.read_csv(csv_path, sep=',', skiprows=3, header=None, names=col_names)
    cnt = 0
    for index, row in df.iterrows():
        security_id = row['sec1'].strip('="')
        borrowed = int(row['vol1'].replace(',', ''))
        db_conn.execute(sql, (trading_date, security_id, borrowed))
        cnt += 1
        security_id = row['sec2'].strip('="')
        if security_id != '_':
            borrowed = int(row['vol2'].replace(',', ''))
            db_conn.execute(sql, (trading_date, security_id, borrowed))
            cnt += 1
    db_conn.commit()
    db_conn.close()
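
# The ROC-to-Gregorian conversion above (+1911 on the year) recurs in several
# sync functions. A minimal helper sketch that could centralize it; note that
# roc_to_datestr is a hypothetical name, not part of the original module:
def roc_to_datestr(roc_date):
    """Convert an ROC date string like '108年05月29日' to 'YYYYMMDD'."""
    match = re.search(r'(\d{2,3})年(\d{2})月(\d{2})日', roc_date)
    if match is None:
        return None
    return '%04d%s%s' % (int(match.group(1)) + 1911, match.group(2), match.group(3))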
def sync_dataset(dsitem, trading_date):
    """
    Shared flow for synchronizing a dataset
    * HTTP date format: 108/05/29 (ROC calendar)
    * DB and cache date format: 2019-05-29
    """
    logger = common.get_logger('finance')
    dtm = re.match(r'(\d{4})-(\d{2})-(\d{2})', trading_date)
    # Convert the ISO date to the ROC-calendar format used by the HTTP API
    tokens = [str(int(dtm.group(1)) - 1911), dtm.group(2), dtm.group(3)]
    datestr = '/'.join(tokens)
    fmt = 'json'
    this_mod = sys.modules[__name__]

    if has_cache(dsitem, trading_date, fmt):
        # Load the cached dataset
        logger.info('Using the cached %s for %s', dsitem, trading_date)
        dataset = load_cache(dsitem, trading_date, fmt)
    else:
        # Download the dataset
        dataset = None
        repeat = 0
        hookfunc = getattr(this_mod, 'download_' + dsitem)
        while dataset is None and repeat < REPEAT_LIMIT:
            repeat += 1
            if repeat > 1:
                time.sleep(REPEAT_INTERVAL)
            try:
                logger.info('Downloading %s for %s', dsitem, trading_date)
                dataset = hookfunc(datestr)
                logger.info('Saving %s for %s', dsitem, trading_date)
                save_cache(dsitem, trading_date, dataset, fmt)
            except Exception as ex:
                logger.error('Cannot fetch %s for %s (retry: %d, %s)',
                             dsitem, trading_date, repeat, ex)

    if dataset is None:
        return

    # Import into the database
    dbcon = db.get_connection()
    hookfunc = getattr(this_mod, 'import_' + dsitem)
    try:
        hookfunc(dbcon, trading_date, dataset)
        logger.info('Imported %s for %s', dsitem, trading_date)
    except sqlite3.IntegrityError:
        logger.warning('%s for %s has already been imported', dsitem, trading_date)
    except Exception as ex:
        # TODO: not sure ex.args[0] is reliable; needs verification
        logger.error('Cannot import %s for %s (%s)', dsitem, trading_date, ex.args[0])
    dbcon.commit()
    dbcon.close()
def import_dist(csv_date='latest'):
    """
    Import the shareholding distribution table for the given date into the database
    """
    logger = common.get_logger('finance')
    if csv_date == 'latest':
        # Pick the newest dist-YYYYMMDD.csv in the cache directory
        max_date = ''
        cache_dir = os.path.expanduser('~/.twnews/cache/tdcc')
        for filename in os.listdir(cache_dir):
            match = re.match(r'dist-(\d{8}).csv', filename)
            if match is not None and max_date < match.group(1):
                max_date = match.group(1)
        csv_date = max_date

    iso_date = re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', csv_date)
    csv_path = os.path.expanduser('~/.twnews/cache/tdcc/dist-%s.csv' % csv_date)
    if not os.path.isfile(csv_path):
        logger.error('No shareholding distribution file for this date: %s', csv_path)
        return

    db_conn = db.get_connection()
    col_names = [
        'trading_date', 'security_id', 'level',
        'numof_holders', 'numof_stocks', 'percentof_stocks'
    ]
    df = pandas.read_csv(csv_path, skiprows=1, header=None, names=col_names)

    sql_template = '''
    INSERT INTO level%02d (
        trading_date, security_id,
        numof_holders, numof_stocks, percentof_stocks
    ) VALUES (?,?,?,?,?);
    '''
    for index, row in df.iterrows():
        # Each holding level goes into its own levelNN table
        sql = sql_template % row['level']
        db_conn.execute(sql, (
            iso_date,
            row['security_id'],
            row['numof_holders'],
            row['numof_stocks'],
            row['percentof_stocks']
        ))
        if index > 0 and index % 5000 == 0:
            logger.debug('Saved %d shareholding distribution rows for %s', index, iso_date)
    db_conn.commit()
    db_conn.close()
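
# A hypothetical batched variant of the insert loop above: grouping the rows
# by holding level lets executemany() replace the per-row execute(). Sketch
# only, under the same schema assumptions; import_dist_batched is not part of
# the original module:
def import_dist_batched(db_conn, iso_date, df):
    sql_template = '''
    INSERT INTO level%02d (
        trading_date, security_id,
        numof_holders, numof_stocks, percentof_stocks
    ) VALUES (?,?,?,?,?);
    '''
    for level, group in df.groupby('level'):
        rows = [
            (iso_date, row['security_id'], row['numof_holders'],
             row['numof_stocks'], row['percentof_stocks'])
            for _, row in group.iterrows()
        ]
        db_conn.executemany(sql_template % level, rows)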
def sync_margin_trading(trading_date):
    """
    Margin trading (margin purchases and short sales)
    """
    dsitem = 'margin_trading'
    logger = common.get_logger('finance')
    datestr = trading_date.replace('-', '')

    # Cache handling
    if has_cache(dsitem, datestr):
        logger.info('Loading margin trading data for %s', datestr)
        ds = load_cache(dsitem, datestr)
    else:
        logger.info('No cached margin trading data for %s', datestr)
        session = common.get_session(False)
        url = 'http://www.twse.com.tw/exchangeReport/MI_MARGN?response=json&date=%s&selectType=ALL' % datestr
        resp = session.get(url)
        ds = resp.json()
        status = ds['stat']
        # Note: the HTTP status is 200 even on failure, so success must be
        # determined from the JSON 'stat' field.
        # Success: OK
        # Failure: "the query date is later than the latest available date" or
        #          "too many users online, please try again later"
        if status != 'OK':
            logger.error('Cannot fetch margin trading data for %s, reason: %s', datestr, status)
            return
        if len(ds['data']) == 0:
            logger.error('No margin trading data for %s; possibly not settled yet or a non-trading day', datestr)
            return
        logger.info('Saving margin trading data for %s', datestr)
        save_cache(dsitem, datestr, ds)

    db_conn = db.get_connection()
    sql = '''
    INSERT INTO `margin` (
        trading_date, security_id, security_name,
        buying_balance, selling_balance
    ) VALUES (?,?,?,?,?)
    '''
    for detail in ds['data']:
        security_id = detail[0]
        security_name = detail[1].strip()
        buying_balance = int(detail[6].replace(',', ''))
        selling_balance = int(detail[12].replace(',', ''))
        db_conn.execute(sql, (trading_date, security_id, security_name,
                              buying_balance, selling_balance))
        logger.debug('[%s %s] margin purchase balance: %s, short sale balance: %s',
                     security_id, security_name, buying_balance, selling_balance)
    db_conn.commit()
    db_conn.close()
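
# The fetch pattern in sync_margin_trading recurs in the institution, block
# trading, and short-sale sync functions below: TWSE answers HTTP 200 even on
# failure, so success must be read from the JSON 'stat' field and the 'data'
# array. A hypothetical shared helper sketch; fetch_twse_json is not part of
# the original module:
def fetch_twse_json(url, logger, what):
    session = common.get_session(False)
    ds = session.get(url).json()
    if ds.get('stat') != 'OK':
        logger.error('Cannot fetch %s, reason: %s', what, ds.get('stat'))
        return None
    if not ds.get('data'):
        logger.error('No data for %s; possibly not settled yet or a non-trading day', what)
        return None
    return ds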
def sync_institution_trading(trading_date):
    """
    Three major institutional investors
    """
    dsitem = 'institution_trading'
    logger = common.get_logger('finance')
    datestr = trading_date.replace('-', '')

    # Cache handling
    if has_cache(dsitem, datestr):
        logger.info('Loading institutional investor data for %s', datestr)
        ds = load_cache(dsitem, datestr)
    else:
        logger.info('No cached institutional investor data for %s', datestr)
        session = common.get_session(False)
        url = 'http://www.twse.com.tw/fund/T86?response=json&date=%s&selectType=ALL' % datestr
        resp = session.get(url)
        ds = resp.json()
        status = ds['stat']
        # Note: the HTTP status is 200 even on failure, so success must be
        # determined from the JSON 'stat' field.
        # Success: OK
        # Failure: "the query date is later than the latest available date" or
        #          "too many users online, please try again later"
        if status != 'OK':
            logger.error('Cannot fetch institutional investor data for %s, reason: %s', datestr, status)
            return
        logger.info('Saving institutional investor data for %s', datestr)
        save_cache(dsitem, datestr, ds)

    # Import into SQLite
    db_conn = db.get_connection()
    sql = '''
    INSERT INTO `institution` (
        trading_date, security_id, security_name,
        foreign_trend, stic_trend, dealer_trend
    ) VALUES (?,?,?,?,?,?)
    '''
    for detail in ds['data']:
        security_id = detail[0]
        security_name = detail[1].strip()
        # Convert shares to thousand-share units
        foreign_trend = int(detail[4].replace(',', '')) // 1000
        stic_trend = int(detail[10].replace(',', '')) // 1000
        dealer_trend = int(detail[11].replace(',', '')) // 1000
        db_conn.execute(sql, (trading_date, security_id, security_name,
                              foreign_trend, stic_trend, dealer_trend))
        logger.debug('[%s %s] foreign: %s, investment trust: %s, dealer: %s',
                     security_id, security_name,
                     foreign_trend, stic_trend, dealer_trend)
    db_conn.commit()
    db_conn.close()
def sync_dataset(dsitem, trading_date):
    """
    Shared flow for synchronizing a dataset
    """
    logger = common.get_logger('finance')
    datestr = trading_date.replace('-', '')
    fmt = 'csv' if dsitem == 'borrowed' else 'json'
    this_mod = sys.modules[__name__]

    if has_cache(dsitem, datestr, fmt):
        # Load the cached dataset
        logger.info('Using the cached %s for %s', dsitem, trading_date)
        dataset = load_cache(dsitem, datestr, fmt)
    else:
        # Download the dataset
        dataset = None
        repeat = 0
        hookfunc = getattr(this_mod, 'download_' + dsitem)
        while dataset is None and repeat < REPEAT_LIMIT:
            repeat += 1
            if repeat > 1:
                time.sleep(REPEAT_INTERVAL)
            try:
                logger.info('Downloading %s for %s', dsitem, trading_date)
                dataset = hookfunc(datestr)
                logger.info('Saving %s for %s', dsitem, trading_date)
                save_cache(dsitem, datestr, dataset, fmt)
            except Exception as ex:
                # 2019-08-08: retrying here is not effective enough; all 3
                # attempts tend to fail, so a different retry mechanism may be
                # needed.
                logger.error('Cannot fetch %s for %s (retry: %d, %s)',
                             dsitem, trading_date, repeat, ex)

    if dataset is None:
        return

    # Import into the database
    dbcon = db.get_connection()
    hookfunc = getattr(this_mod, 'import_' + dsitem)
    try:
        hookfunc(dbcon, trading_date, dataset)
        logger.info('Imported %s for %s', dsitem, trading_date)
    except sqlite3.IntegrityError:
        logger.warning('%s for %s has already been imported', dsitem, trading_date)
    except Exception as ex:
        # TODO: not sure ex.args[0] is reliable; needs verification
        logger.error('Cannot import %s for %s (%s)', dsitem, trading_date, ex.args[0])
    dbcon.commit()
    dbcon.close()
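
# Regarding the 2019-08-08 note above: with a fixed REPEAT_INTERVAL, all
# retries tend to hit the same transient failure. One alternative is
# exponential backoff; a minimal sketch assuming the same download hook
# signature (download_with_backoff is a hypothetical helper, not part of the
# original module):
def download_with_backoff(hookfunc, datestr, base_delay=2):
    for attempt in range(REPEAT_LIMIT):
        try:
            return hookfunc(datestr)
        except Exception:
            if attempt + 1 < REPEAT_LIMIT:
                time.sleep(base_delay * (2 ** attempt))  # waits 2s, 4s, 8s, ...
    return None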
def sync_short_selled(trading_date):
    """
    SBL short sale balances (shares already borrowed and sold)
    """
    dsitem = 'short_selled'
    logger = common.get_logger('finance')
    datestr = trading_date.replace('-', '')

    # Cache handling
    if has_cache(dsitem, datestr):
        logger.info('Loading SBL short sale data for %s', datestr)
        ds = load_cache(dsitem, datestr)
    else:
        logger.info('No cached SBL short sale data for %s', datestr)
        session = common.get_session(False)
        url = 'http://www.twse.com.tw/exchangeReport/TWT93U?response=json&date=%s' % datestr
        resp = session.get(url)
        ds = resp.json()
        status = ds['stat']
        # Note: the HTTP status is 200 even on failure, so success must be
        # determined from the JSON 'stat' field.
        # Success: OK
        # Failure: "the query date is later than the latest available date" or
        #          "too many users online, please try again later"
        if status != 'OK':
            logger.error('Cannot fetch SBL short sale data for %s, reason: %s', datestr, status)
            return
        if len(ds['data']) == 0:
            logger.error('SBL short sale data for %s not generated yet; possibly not settled or a non-trading day', datestr)
            return
        logger.info('Saving SBL short sale data for %s', datestr)
        save_cache(dsitem, datestr, ds)

    db_conn = db.get_connection()
    # This UPDATE fills rows previously inserted by sync_short_borrowed()
    sql = '''
    UPDATE `short_sell` SET `security_name`=?, `selled`=?
    WHERE `trading_date`=? AND `security_id`=?
    '''
    for detail in ds['data']:
        security_id = detail[0]
        security_name = detail[1].strip()
        # Convert shares to thousand-share units
        balance = int(detail[12].replace(',', '')) // 1000
        if security_id != '':
            db_conn.execute(sql, (security_name, balance, trading_date, security_id))
            logger.debug('[%s %s] SBL short sale balance: %s', security_id, security_name, balance)
    db_conn.commit()
    db_conn.close()
def search_and_list(keyword, channel):
    """
    Search, then list the news titles
    """
    print('Testing search')
    nsearch = NewsSearch(channel, limit=10)
    results = nsearch.by_keyword(keyword).to_dict_list()
    logger = get_logger()
    for (i, result) in enumerate(results):
        try:
            print('{:03d}: {}'.format(i, result['title']))
            print('     date: {}'.format(result['date']))
            print('     link: {}'.format(result['link']))
        except ValueError as ex:
            logger.error('Exception type: %s', type(ex).__name__)
            logger.error(ex)
def main():
    """
    Download the latest shareholding distribution table and load it into the database:
    python3 -m twnews.finance.tdcc

    Rebuild the shareholding distribution database from existing CSV files:
    python3 -m twnews.finance.tdcc rebuild
    """
    action = get_action()
    logger = common.get_logger('finance')
    if action == 'update':
        changed = backup_dist()
        if changed:
            import_dist()
    elif action == 'rebuild':
        rebuild_dist()
    else:
        logger.error('Unrecognized action %s', action)
def search_and_soup(keyword, channel):
    """
    Search, then decompose the news pages
    """
    print('Testing search and soup, searching ...', end='', flush=True)
    logger = get_logger()
    nsearch = NewsSearch(channel, limit=10)
    nsoups = nsearch.by_keyword(keyword).to_soup_list()
    # Overwrite the progress message on the same line
    print('\rTesting search and soup' + ' ' * 20, flush=True)
    for (i, nsoup) in enumerate(nsoups):
        try:
            print('{:03d}: {}'.format(i, nsoup.path))
            print('     reporter: {} / date: {}'.format(nsoup.author(), nsoup.date()))
            print('     title: {}'.format(nsoup.title()))
            print('     {} ...'.format(nsoup.contents(30)), flush=True)
        except ValueError as ex:
            logger.error('Exception type: %s', type(ex).__name__)
            logger.error(ex)
def search_and_soup(keyword, channel):
    """
    Search, then decompose the news pages
    """
    print('Testing search and soup')
    logger = get_logger()
    nsearch = NewsSearch(channel, limit=10)
    nsoups = nsearch.by_keyword(keyword).to_soup_list()
    for (i, nsoup) in enumerate(nsoups):
        try:
            print('{:03d}: {}'.format(i, nsoup.path))
            print('     reporter: {} / date: {}'.format(nsoup.author(), nsoup.date()))
            print('     title: {}'.format(nsoup.title()))
            print('     {} ...'.format(nsoup.contents()[0:30]))
        except Exception as ex:
            logger.error('Exception type: %s', type(ex).__name__)
            logger.error(ex)
def backup_dist(refresh=False):
    """
    Back up the latest shareholding distribution table
    """
    changed = False
    logger = common.get_logger('finance')
    url = 'https://smart.tdcc.com.tw/opendata/getOD.ashx?id=1-5'
    resp = requests.get(url)
    if resp.status_code == 200:
        # Extract the statistics date from the first data row
        csv = resp.text
        dt_beg = csv.find('\n') + 1
        dt_end = csv.find(',', dt_beg)
        csv_date = csv[dt_beg:dt_end]

        csv_dir = os.path.expanduser('~/.twnews/cache/tdcc')
        if not os.path.isdir(csv_dir):
            os.makedirs(csv_dir)

        # Decide whether the local copy needs updating
        csv_file = '{}/dist-{}.csv'.format(csv_dir, csv_date)
        if refresh:
            changed = True
        elif not os.path.isfile(csv_file):
            changed = True
        else:
            sz_local = os.path.getsize(csv_file)
            sz_remote = int(resp.headers['Content-Length'])
            if sz_local != sz_remote:
                changed = True

        # Write the backup file
        if changed:
            with open(csv_file, 'wt') as csvf:
                csvf.write(csv)
            logger.debug('Updated the shareholding distribution table: %s', csv_date)
        else:
            logger.debug('Shareholding distribution table %s already exists; no update needed', csv_date)
    else:
        logger.error('Cannot update the shareholding distribution table')
    return changed
def sync_block_trading(trading_date):
    """
    Block trading
    """
    dsitem = 'block_trading'
    logger = common.get_logger('finance')
    datestr = trading_date.replace('-', '')

    # Cache handling
    if has_cache(dsitem, datestr):
        logger.info('Loading block trading data for %s', datestr)
        ds = load_cache(dsitem, datestr)
    else:
        logger.info('No cached block trading data for %s', datestr)
        session = common.get_session(False)
        url = 'http://www.twse.com.tw/block/BFIAUU?response=json&date=%s&selectType=S' % datestr
        resp = session.get(url)
        ds = resp.json()
        status = ds['stat']
        # Note: the HTTP status is 200 even on failure, so success must be
        # determined from the JSON 'stat' field.
        # Success: OK
        # Failure: "the query date is later than the latest available date" or
        #          "too many users online, please try again later"
        if status != 'OK':
            logger.error('Cannot fetch block trading data for %s, reason: %s', datestr, status)
            return
        if len(ds['data']) == 0:
            logger.error('No block trading data for %s; possibly not settled yet or a non-trading day', datestr)
            return
        logger.info('Saving block trading data for %s', datestr)
        save_cache(dsitem, datestr, ds)

    db_conn = db.get_connection()
    sql = '''
    INSERT INTO `block` (
        trading_date, security_id, security_name,
        tick_rank, tick_type, close, volume, total
    ) VALUES (?,?,?,?,?,?,?,?)
    '''
    tick_rank = {}
    for trade in ds['data']:
        # '總計' marks the grand-total row at the end of the dataset
        if trade[0] == '總計':
            break
        security_id = trade[0]
        security_name = trade[1]
        tick_type = trade[2]
        close = float(trade[3].replace(',', ''))
        volume = int(trade[4].replace(',', ''))
        total = int(trade[5].replace(',', ''))
        # Rank repeated ticks of the same security in order of appearance
        if security_id not in tick_rank:
            tick_rank[security_id] = 1
        else:
            tick_rank[security_id] += 1
        db_conn.execute(sql, (trading_date, security_id, security_name,
                              tick_rank[security_id], tick_type, close, volume, total))
        logger.debug('[%s %s] #%d %s close: %s, shares: %s, amount: %s',
                     security_id, security_name, tick_rank[security_id],
                     tick_type, close, volume, total)
    db_conn.commit()
    db_conn.close()
def sync_etf_net(trading_date):
    """
    ETF premium/discount rates

    Source: https://mis.twse.com.tw/stock/data/all_etf.txt
    {
        "a1": [
            {
                "msgArray": [
                    {
                        "a": "",  security id
                        "b": "",  name
                        "c": "",  units outstanding
                        "d": "",  change in units vs. the previous day
                        "e": "",  trading price
                        "f": "",  net asset value (NAV)
                        "g": "",  premium/discount rate
                        "h": "",  previous day's NAV
                        "i": "",  date
                        "j": "",  time
                        "k": "",  ETF type (1~4)
                    },
                    ...
                ],
                "refURL": "https://www.kgifund.com.tw/ETF/RWD/Introduction.aspx",
                "userDelay": "15000",
                "rtMessage": "OK",
                "rtCode": "0000"
            },
            ...
            {}  <-- the last entry is empty
        ]
    }
    """
    dsitem = 'etf_net'
    logger = common.get_logger('finance')
    datestr = trading_date.replace('-', '')

    # Cache handling
    if has_cache(dsitem, datestr):
        logger.info('Loading the cached ETF premium/discount rates for %s', datestr)
        ds = load_cache(dsitem, datestr)
    else:
        logger.info('No cached ETF premium/discount rates for %s', datestr)
        session = common.get_session(False)
        resp = session.get('https://mis.twse.com.tw/stock/data/all_etf.txt')
        ds = resp.json()
        dsdate = ds['a1'][1]['msgArray'][0]['i']
        if datestr == dsdate:
            logger.info('Saving the ETF premium/discount rates for %s', datestr)
            save_cache(dsitem, datestr, ds)
        else:
            logger.info('Cannot fetch ETF premium/discount rates for %s', datestr)
            return

    # Reshape the source data into a key/value mapping
    etf_dict = {}
    for fund in ds['a1']:
        if 'msgArray' in fund:
            for etf in fund['msgArray']:
                etf_dict[etf['a']] = etf

    # Process in security-id order
    db_conn = db.get_connection()
    sql = '''
    INSERT INTO `etf_offset` (
        trading_date, security_id, security_name,
        close, net, offset
    ) VALUES (?,?,?,?,?,?)
    '''
    for k in sorted(etf_dict.keys()):
        etf = etf_dict[k]
        db_conn.execute(sql, (trading_date, etf['a'], etf['b'],
                              etf['e'], etf['f'], etf['g']))
        logger.debug('%s, %s, %s, %s%%', etf['a'], etf['b'], etf['f'], etf['g'])
    db_conn.commit()
    db_conn.close()
def by_keyword(self, keyword, title_only=False):
    """
    Keyword search
    """
    logger = common.get_logger()
    page = 1
    results = []
    no_more = False
    begin_time = time.time()
    while not no_more and len(results) < self.limit:
        # Build the query
        replacement = {
            'PAGE': page,
            'KEYWORD': urllib.parse.quote_plus(keyword)
        }
        url = Template(self.conf['url']).substitute(replacement)
        # Append the date range, if any
        if self.beg_date is not None:
            url += self.beg_date.strftime(self.conf['begin_date_format'])
            url += self.end_date.strftime(self.conf['end_date_format'])
        # Run the query
        session = common.get_session()
        logger.debug('News search: ' + url)
        resp = session.get(url, allow_redirects=False)
        if resp.status_code == 200:
            logger.debug('Response: 200 OK')
            for (k, v) in resp.headers.items():
                logger.debug('{}: {}'.format(k, v))
            ctype = resp.headers['Content-Type']
            if 'text/html' in ctype:
                self.soup = BeautifulSoup(resp.text, 'lxml')
            if 'application/json' in ctype:
                self.json = resp.json()
        elif resp.status_code == 404:
            logger.debug('Response: 404 Not Found; treated as no more results')
            self.json = None
            no_more = True
        else:
            logger.warning('Response code: {}'.format(resp.status_code))
            break
        # Parse an HTML (soup) result page
        if self.soup is not None:
            result_nodes = self.soup.select(self.conf['result_node'])
            if len(result_nodes) > 0:
                for n in result_nodes:
                    title = self.__parse_title_node(n)
                    if (not title_only) or (keyword in title):
                        link = self.__parse_link_node(n)
                        date_inst = self.__parse_date_node(n)
                        results.append({
                            'title': title,
                            'link': link,
                            'date': date_inst
                        })
                        if len(results) == self.limit:
                            break
            else:
                no_more = True
        # Parse a JSON result page
        if self.json is not None:
            result_nodes = self.json
            for k in self.conf['result_node']:
                result_nodes = result_nodes[k]
            for r in result_nodes:
                title = self.__parse_title_field(r)
                if (not title_only) or (keyword in title):
                    link = self.__parse_link_field(r)
                    date_inst = self.__parse_date_field(r)
                    results.append({
                        'title': title,
                        'link': link,
                        'date': date_inst
                    })
                    if len(results) == self.limit:
                        break
        page += 1

    # De-duplicate, using the link URL as the key
    filtered = []
    for (i, r) in enumerate(results):
        duplicated = False
        for j in range(i):
            if r['link'] == results[j]['link']:
                duplicated = True
                break
        if not duplicated:
            filtered.append(r)
        else:
            logger.warning('Result {} duplicates result {}; news URL {}'.format(i, j, r['link']))
    self.result_list = filtered
    self.pages = page - 1
    self.elapsed = time.time() - begin_time
    return self
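
# The pairwise de-duplication above is O(n^2) in the number of results. A
# one-pass equivalent using a set of seen links; dedup_by_link is a
# hypothetical helper, not part of the original class:
def dedup_by_link(results, logger):
    seen = set()
    filtered = []
    for (i, result) in enumerate(results):
        if result['link'] in seen:
            logger.warning('Result {} is a duplicate; news URL {}'.format(i, result['link']))
        else:
            seen.add(result['link'])
            filtered.append(result)
    return filtered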