def return_url_for_bankuai_stock(self, bankuai, page=1, page_size=10000):
    bankuai_tree = self.__bankuai_tree

    def return_bankuai_code(bankuai_tree, bankuai):
        # the bankuai parameter is a list from the top bankuai to the bottom bankuai, in the format below
        # ['板块','概念板块','AB股票']
        # Parse the url of the bankuai. For bankuais under [概念板块, 地域板块, 行业板块], the digits before
        # the first underscore in the url are the key used to fetch the stocks belonging to that bankuai.
        # e.g. for 板块->概念板块->AB股 the url is list.html#28003498_0_2, and 28003498 is the key for AB股.
        def drill_to_sub_bankuai(bankuai_dict, sub_bankuai):
            if sub_bankuai in bankuai_dict:
                return bankuai_dict[sub_bankuai]
            elif "children" in bankuai_dict and sub_bankuai in bankuai_dict["children"]:
                return bankuai_dict["children"][sub_bankuai]
            else:
                # This error should not be captured by the except block below
                raise RuntimeError(sub_bankuai + " is not found.", "in Eastmoney.py")

        try:
            bankuai_code = re.search(r'#(?P<bankuai_code>\d+)', reduce(drill_to_sub_bankuai, bankuai, bankuai_tree)["url"]).group("bankuai_code")
        except AttributeError:
            # The except block only captures AttributeError: 'NoneType' object has no attribute 'group'
            print_log("The url of [" + ",".join(bankuai) + "] doesn't contain digits.")
            bankuai_code = "-99"
        return bankuai_code

    base_url = "http://hqdigi2.eastmoney.com/EM_Quote2010NumericApplication/index.aspx?type=s&sortType=C&sortRule=-1&jsName=quote_123"
    p_page_size = "&pageSize=%(page_size)s"
    p_page = "&page=%(page)s"
    p_bankuai_code = "&style=%(bankuai_code)s"
    bankuai_url = (base_url + p_page_size + p_page + p_bankuai_code) % {
        "page_size": page_size,
        "page": page,
        "bankuai_code": return_bankuai_code(bankuai_tree, bankuai),
    }
    print_log(bankuai_url)
    return bankuai_url
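# A minimal usage sketch for the method above (hedged: the instance name `e` and the concrete
# code 28003498 are only illustrative, not taken from a real run):
#
#   e = Eastmoney()
#   url = e.return_url_for_bankuai_stock([u'板块', u'概念板块', u'AB股'], page=1, page_size=10000)
#   # url would look like:
#   # http://hqdigi2.eastmoney.com/EM_Quote2010NumericApplication/index.aspx?type=s&sortType=C&sortRule=-1&jsName=quote_123&pageSize=10000&page=1&style=28003498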
def download_log_checker(conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id):
    start_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')
    # get stock ids whose latest attempt has is_download_success = 'N'
    chk_sql = '''
        select t.biz_date, t.stock_id
        from (
            select biz_date, stock_id, is_download_success,
                   row_number() over(partition by biz_date, stock_id order by download_end_time desc nulls last) rankid
            from dw.log_stock_transaction
            where biz_date between '{start_date}' and '{end_date}'
        ) t
        where t.rankid = 1
        and t.is_download_success = 'N'
    '''.format(start_date=start_date_dt, end_date=end_date_dt)
    if not stock_id is None:
        chk_sql = chk_sql + ' and t.stock_id = \'' + stock_id + '\''
    cur = get_cur(conn)
    cur.execute(chk_sql)
    rows = list(cur)
    if len(rows) == 0:
        print_log('All the stocks have been downloaded successfully.')
    else:
        for row in rows:
            error_log(str(row['biz_date']) + ':' + row['stock_id'] + ' failed to download.')
    return len(rows)
def load_log_checker(conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id):
    start_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')
    chk_sql = '''
        select biz_date, stock_id
        from dw.log_stock_transaction
        where biz_date between '{start_date}' and '{end_date}'
        and is_download_success = 'Y'
        and (is_load_success = 'N' or is_load_success is null)
    '''.format(start_date=start_date_dt, end_date=end_date_dt)
    if not stock_id is None:
        chk_sql = chk_sql + ' and stock_id = \'' + stock_id + '\''
    cur = get_cur(conn)
    cur.execute(chk_sql)
    rows = list(cur)
    if len(rows) == 0:
        print_log('All the stocks have been loaded successfully.')
    else:
        for row in rows:
            error_log(str(row['biz_date']) + ':' + row['stock_id'] + ' failed to load.')
    return len(rows)
def save_formatted_data(self):
    # save formatted data into a file, \t as delimiter
    # 9:25:00	50.34	0.15	141	709794	买盘
    with open(self.out_file, 'w') as file:
        file.write(self.stock_trans_object.get_stock_content()[self.stock_id][self.date])
    print_log('Formatted data saved to ' + self.out_file)
def download_to_local(self):
    # save raw data into a local file
    # if the file already exists and its size >= 5KB, it doesn't download again.
    if os.path.exists(self.stock_trans_object.download_file) and os.path.getsize(self.stock_trans_object.download_file) >= 1024 * 5:
        print_log(self.stock_trans_object.download_file + ' already exists.')
    else:
        self.stock_trans_object.download_to_local()
    return self.stock_trans_object.download_file
def delete_existing_records(self):
    del_sql = '''
        delete from dw.stock_transaction
        where stock_id = '{0}'
        and biz_date = '{1}'
    '''.format(self.stock_id, datetime.datetime.strptime(self.date, '%Y%m%d'))
    get_query_result(self.conn, del_sql)
    print_log('Deletion for {0} {1} completed successfully.'.format(self.stock_id, self.date))
def inserter(conn, tabname, colnames, source_type, value, delimiter):
    # this function inserts a value (a single line, or each line in a file) into a specific table
    # tabname: table name
    # colnames: columns in the table
    # source_type: file|str
    # value: when source_type is file, value should be the full path of a file; when source_type is str, value is the value which will be inserted into the table
    # delimiter: delimiter for columns
    column_type_sql = '''select lower(column_name) as column_name, lower(data_type) as data_type
        from information_schema.columns
        where table_schema || '.' || table_name = '{tabname}' '''.format(tabname=tabname)
    rows = get_query_result(conn, column_type_sql)
    sys_col_types = {}
    for row in rows:
        sys_col_types[row['column_name']] = row['data_type']
    types = []
    for colname in colnames.split(','):
        types.append(sys_col_types[colname.strip()])
    if source_type == 'file':
        # insert the rows of a file into the table
        if os.path.exists(value):
            with open(value) as file:
                for row in file:
                    row_splitted = row.strip().split(delimiter)
                    if len(types) == len(row_splitted):
                        out_value = insert_value_formatter(zip(types, row_splitted))
                        ins_sql = 'insert into {0}({1}) values ({2})'.format(tabname, colnames, out_value)
                        sql_result = get_query_result(conn, ins_sql)
                    else:
                        raise RuntimeError('Len of types and value don\'t match [type:{0}, value:{1}]'.format(','.join(types), value))
        else:
            raise RuntimeError('File doesn\'t exist. [{0}]'.format(value))
    elif source_type == 'str':
        # insert a single row into the table
        # a,b,1,2
        if len(types) == len(value.split(delimiter)):
            out_value = insert_value_formatter(zip(types, value.split(delimiter)))
            ins_sql = 'insert into {0}({1}) values ({2})'.format(tabname, colnames, out_value)
            sql_result = get_query_result(conn, ins_sql)
        else:
            raise RuntimeError('Len of types and value don\'t match [type:{0}, value:{1}]'.format(','.join(types), value))
    else:
        raise RuntimeError('Unknown source type [{0}]'.format(source_type))
    conn.commit()
    print_log('Insertion for {0} is done.'.format(tabname))
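# A minimal usage sketch for inserter() (hedged: the column list below is only illustrative;
# the real callers pass the TABLE/COLS constants defined elsewhere in this project):
#
#   # insert one tab-delimited line
#   inserter(conn, 'dw.stock_transaction', 'stock_id, biz_date, price', 'str', '600587\t2015-06-01\t50.34', '\t')
#   # insert every line of a tab-delimited file
#   inserter(conn, 'dw.stock_transaction', 'stock_id, biz_date, price', 'file', '/tmp/600587_20150601.txt', '\t')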
def export_bankuai_stock(self, out_file, in_bk=[]):
    # If the in_bk parameter is not assigned, export all the bankuai stocks
    # in_bk could be [行业板块, 浙江板块] or [行业板块]
    bkst_exception = {}
    out_file = return_new_name_for_existing_file(out_file)
    bkstfile = open(out_file, 'wb')  # open in wb is used to remove the blank lines
    bkstfile_writer = csv.writer(bkstfile, quoting=csv.QUOTE_NONNUMERIC)
    bkst_head = [u'板块', u'子版块', u'板块名称', u'股票代码', u'股票名称']
    bkstfile_writer.writerow(bkst_head)
    for sub_bk in self.__bankuai_tree[u'板块']["children"]:
        if len(in_bk) > 0 and sub_bk != in_bk[0]:
            continue
        print_log("Start to process -->" + sub_bk + "...")
        for dtl_bk in self.__bankuai_tree[u'板块']["children"][sub_bk]["children"]:
            if len(in_bk) > 1 and dtl_bk != in_bk[1]:
                continue
            print_log("Start to process -->" + sub_bk + "-->" + dtl_bk + "...")
            parent_bk = []
            for i in self.return_stock_in_bankuai([u'板块', sub_bk, dtl_bk]):
                bkst = []
                if not isinstance(i, list):
                    parent_bk.append(i)
                else:
                    for j in i:
                        bkst = parent_bk + j
                        try:
                            bkstfile_writer.writerow(bkst)
                        except:
                            if not j[0] in bkst_exception:
                                bkst_exception[j[0]] = j[1]
    bkstfile.close()
    if len(bkst_exception.keys()) > 0:
        print_log("There are " + str(len(bkst_exception.keys())) + " exceptions!")
        for i in bkst_exception:
            print i + bkst_exception[i]
    else:
        print_log("Completed successfully.")
    return bkst_exception
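# A minimal usage sketch for the export method above (hedged: the file names are only
# illustrative):
#
#   e = Eastmoney()
#   # export every bankuai/stock pair
#   e.export_bankuai_stock('bankuai_stock_20150601.csv')
#   # restrict the export to one sub tree, e.g. 行业板块 -> 浙江板块
#   e.export_bankuai_stock('bankuai_stock_zhejiang_20150601.csv', in_bk=[u'行业板块', u'浙江板块'])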
def downloader(queue, conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id, obj_selection=options.obj_selection):
    #-- object list
    obj_mapping = {
        'T': 'Tengxun_stock_transaction',
        'N': 'Netease_stock_transaction',
        'S': 'Sina_stock_transaction',
    }
    if obj_selection is None:
        stock_objects = ['Tengxun_stock_transaction', 'Netease_stock_transaction', 'Sina_stock_transaction']
    else:
        stock_objects = [obj_mapping[o] for o in obj_selection.split('|') if o in obj_mapping]
    print_log('|'.join(stock_objects) + ' selected.')
    iter = len(stock_objects)
    cur_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')
    while cur_date_dt <= end_date_dt:
        #-- stock list
        stocks = get_stock_list(conn, cur_date_dt, stock_id)
        for stock in stocks:
            cur_date_str = cur_date_dt.strftime('%Y%m%d')
            cur_stock_object = stock_objects[iter % len(stock_objects)]  # choose stock object
            while queue.full():
                print_log('=================> queue is full, wait for 1 second...')
                time.sleep(1)
            s = Stock_trans_downloader(queue, conn, cur_stock_object, stock, cur_date_str)
            s.start()
            #s.join()
            print_log('-----> queue size: ' + str(queue.qsize()))
            iter += 1
        cur_date_dt = cur_date_dt + datetime.timedelta(1)
    while not queue.empty():
        print_log('=================> queue is not empty yet, wait for 1 second...')
        time.sleep(1)
def export_bankuai_status(self, out_file, in_bk=[]):
    # If the in_bk parameter is not assigned, export all the bankuais
    # in_bk could be [行业板块]
    bkbk_exception = []
    out_file = return_new_name_for_existing_file(out_file)
    bkbkfile = open(out_file, 'wb')  # open in wb is used to remove the blank lines
    bkbkfile_writer = csv.writer(bkbkfile, quoting=csv.QUOTE_NONNUMERIC)
    bkbk_head = [u'板块', u'子版块', u'板块名称', u'涨跌幅', u'总市值(亿)', u'换手率', u'上涨家数', u'下跌家数', u'领涨股票代码', u'领涨股票', u'领涨股票涨跌幅']
    bkbkfile_writer.writerow(bkbk_head)
    for bk in self.__bankuai_tree[u'板块']["children"]:
        if len(in_bk) > 0 and bk != in_bk[0]:
            continue
        print_log("Start to process -->" + bk + "...")
        parent_bk = []
        for i in self.return_bankuai_in_bankuai([u'板块', bk]):
            bkbk = []
            if not isinstance(i, list):
                parent_bk.append(i)
            else:
                for j in i:
                    bkbk = parent_bk + j
                    try:
                        bkbkfile_writer.writerow(bkbk)
                    except:
                        if j[0] not in bkbk_exception:
                            bkbk_exception.append(j[0])
    bkbkfile.close()
    if len(bkbk_exception) > 0:
        # str() is needed here: len() returns an int and can't be concatenated to a str
        print_log("There are " + str(len(bkbk_exception)) + " exceptions!")
        for i in bkbk_exception:
            print i
    else:
        print_log("Completed successfully.")
    return bkbk_exception
def run(self):
    self.check_row_id_existance()
    self.queue.put(self.getName())
    self.log_load_start()
    self.delete_existing_records()
    try:
        if self.enable_copy:
            print_log('psql copy...')
            psql_copy_from(DB_HOST, DB_NAME, DB_UNAME, 'dw.stock_transaction', self.file, DB_PORT, args=' with (encoding \'GBK\')')
        else:
            print_log('psql insert...')
            inserter(self.conn, TABLE, COLS, 'file', self.file, '\t')
        self.log_load_end(is_success=True)
        print_log('Loading {stock_id} for {date} completes successfully.'.format(stock_id=self.stock_id, date=self.date))
    except:
        traceback.print_exc()
        self.log_load_end(is_success=False)
        raise RuntimeError('Loading {stock_id} for {date} failed.'.format(stock_id=self.stock_id, date=self.date))
    finally:
        queue_name = self.queue.get()
def download_to_file(stocks, stock_obj_name, start_date, end_date, to_file, log_fh, warn_fh):
    #-- iterate stocks, download eod data from the website
    fh = open(to_file, 'a')
    num = 0
    for s in stocks:
        #-- call the method of the stock object to get the content of the url
        try:
            new_class = '%(object)s("%(stock)s", "%(start_date)s", "%(end_date)s")' % {
                'object': stock_obj_name,
                'stock': s,
                'start_date': start_date if stock_obj_name == 'Yahoo_stock' else 'dummy',
                'end_date': end_date if stock_obj_name == 'Yahoo_stock' else 'dummy'
            }
            print_log(new_class)
            while True:  # Infinite loop until the stock download completes successfully
                try:
                    obj = eval(new_class)
                    for k, v in obj.get_stock_content().items():
                        print_log('%(num)s - Writing %(code)s ...' % {'num': num, 'code': k}, log_fh)
                        if re.match(r'pv_none_match', v) or re.match(r'.+"";$', v):  # match empty content from tengxun and sina
                            warn_log('No content fetched for ' + k, warn_fh)
                        else:
                            fh.write(v + '\n')
                            num += 1
                    break
                except:
                    warn_log('Connection lost, retry in 10 seconds ...')
                    time.sleep(10)
        except KeyError:
            warn_log(s[0:2] + ' is not setup in ' + stock_obj_name, warn_fh)
            continue
        except HTTPError:
            # log and skip stocks that couldn't be returned from the yahoo interface
            warn_log('Get content failed when ' + new_class, warn_fh)
            continue
    fh.close()
    print_log('{num} stocks have been written into {file}.'.format(num=num, file=to_file), log_fh)
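# A minimal usage sketch for download_to_file() (hedged: the stock codes, file names and open
# log/warn file handles are only illustrative):
#
#   stocks = ['sh600000', 'sz000001']
#   # start_date/end_date are only honoured by Yahoo_stock; the other objects receive 'dummy'
#   download_to_file(stocks, 'Tengxun_stock', 'dummy', 'dummy', 'data/eod_20150601.txt', log_fh, warn_fh)
#   download_to_file(stocks, 'Yahoo_stock', '20150101', '20150601', 'data/yahoo_eod.txt', log_fh, warn_fh)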
#-- parse input parameters, var assignment
stock_object = {
    'tengxun': 'Tengxun_stock',
    'sina': 'Sina_stock',
    'yahoo': 'Yahoo_stock',
}

# check validation of object class
if not options.object_class in stock_object:
    exit_error('%(entered_object)s is not a valid object, it could be %(valid_objects)s' % {
        'entered_object': options.object_class,
        'valid_objects': '|'.join(stock_object)
    })
else:
    print_log(options.object_class + ' selected.')

# check validation of mode and input file
if not options.mode in ('download', 'load', 'downloadAndLoad'):
    exit_error(options.mode + ' is not recognized, it could be download|load|downloadAndLoad.')
elif not options.file is None and not os.path.exists(options.file):
    exit_error(options.file + ' doesn\'t exist.')

# check validation of start_date and end_date
if options.object_class == 'yahoo' and options.mode == 'download':
    if options.start_date is None or options.end_date is None:
        exit_error('--start_date|-s and --end_date|-e must be specified for yahoo class')
    elif not (re.match("^\d{8}$", options.start_date) and re.match("^\d{8}$", options.end_date)):
        exit_error("Not valid start_date or end_date! [" + options.start_date + "][" + options.end_date + "]")
def load_into_bankuai(db_conn, file, biz_date=None):
    # Source CSV layout:
    # 板块 子版块 板块名称 涨跌幅 总市值(亿) 换手率 上涨家数 下跌家数 领涨股票代码 领涨股票 领涨股票涨跌幅
    # 板块 概念板块 全息技术 3.95% 365.12 11.65 7 1 600288 大恒科技 10.03
    # 板块 概念板块 网络安全 2.95% 818.79 25.61 19 1 002308 威创股份 10.01
    # Target table dw.bankuai:
    # biz_date date not null,
    # bankuai_id integer not null,
    # rise varchar(16),
    # market_value_in_million decimal(12,2),
    # turnover_rate decimal(5,2),
    # num_of_rise integer,
    # num_of_drop integer,
    # leading_stock_id varchar(6),
    # rise_of_leading_stock decimal(10,2),
    # primary key(biz_date, bankuai_id)
    bk_id_dict = {}
    csv_data = []
    v_biz_date = ""
    #-- build dict of bankuai name to bankuai id from db
    select_sql = 'select t.name, t.id from dw.dim_bankuai t'
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_name = db_row["name"].decode("utf-8")
        db_id = db_row["id"]
        bk_id_dict[db_name] = db_id
    print_log("There are %(num)s records read from %(name)s" % {"num": len(bk_id_dict.keys()), "name": 'dw.dim_bankuai'})
    #-- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    for row in csvr:
        bk_name = row[u'板块名称'.encode("gbk")].decode("gbk")
        bk_id = bk_id_dict[bk_name]
        row_dict = {}
        row_dict[bk_id] = {}
        row_dict[bk_id]["rise"] = row[u'涨跌幅'.encode("gbk")].decode("gbk")
        row_dict[bk_id]["market_value_in_million"] = row[u'总市值(亿)'.encode("gbk")]
        row_dict[bk_id]["turnover_rate"] = row[u'换手率'.encode("gbk")]
        row_dict[bk_id]["num_of_rise"] = row[u'上涨家数'.encode("gbk")]
        row_dict[bk_id]["num_of_drop"] = row[u'下跌家数'.encode("gbk")]
        row_dict[bk_id]["leading_stock_id"] = row[u'领涨股票代码'.encode("gbk")]
        row_dict[bk_id]["rise_of_leading_stock"] = row[u'领涨股票涨跌幅'.encode("gbk")]
        csv_data.append(row_dict)
    csvf.close()
    print_log("%(num)s records have been read from %(name)s." % {"num": len(csv_data), "name": file})
    #-- determine biz_date
    if not biz_date is None:
        if re.search(r'\d{8}', biz_date):
            v_biz_date = biz_date
        else:
            raise RuntimeError(biz_date + " is not a valid date format, the date should be like YYYYMMDD.")
    elif re.search(r'.*(?P<date>\d{8})\.csv', file):
        v_biz_date = re.search(r'.*(?P<date>\d{8})\.csv', file).group("date")
    else:
        raise RuntimeError('Can not determine biz_date, please check if the file name has a date included or pass biz_date when calling the function.')
    v_biz_date_dt = datetime.datetime.strptime(v_biz_date, '%Y%m%d')
    #-- delete biz_date from dw.bankuai
    del_sql = 'delete from dw.bankuai where biz_date = \'%(date)s\'' % {'date': v_biz_date_dt}
    cur.execute(del_sql)
    db_conn.commit()
    print_log("Deleted records from dw.bankuai where biz_date = '%(biz_date)s'." % {"biz_date": v_biz_date})
    #-- insert into dw.bankuai
    iter = 0
    for r in csv_data:
        k = r.keys()[0]
        iter += 1
        ins_sql = '''insert into dw.bankuai(
            biz_date,
            bankuai_id,
            rise,
            market_value_in_million,
            turnover_rate,
            num_of_rise,
            num_of_drop,
            leading_stock_id,
            rise_of_leading_stock)
        values(
            '%(biz_date)s',
            %(bankuai_id)s,
            '%(rise)s',
            %(market_value_in_million)s,
            %(turnover_rate)s,
            %(num_of_rise)s,
            %(num_of_drop)s,
            '%(leading_stock_id)s',
            %(rise_of_leading_stock)s
        )''' % {
            'biz_date': v_biz_date_dt,
            'bankuai_id': k,
            'rise': r[k]['rise'],
            'market_value_in_million': r[k]['market_value_in_million'],
            'turnover_rate': r[k]['turnover_rate'],
            'num_of_rise': r[k]['num_of_rise'],
            'num_of_drop': r[k]['num_of_drop'],
            # sometimes eastmoney doesn't return a valid leading stock id but '-'; in that case '000000' is used as an unknown stock id
            'leading_stock_id': r[k]['leading_stock_id'] if r[k]['leading_stock_id'] != '-' else '000000',
            'rise_of_leading_stock': r[k]['rise_of_leading_stock']
        }
        cur.execute(ins_sql)
    db_conn.commit()
    print_log(str(iter) + " inserted into dw.bankuai.")
    print_log("dw.bankuai has been refreshed successfully.")
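# A minimal usage sketch (hedged: the file names and the get_conn() helper are only
# illustrative; biz_date can be omitted when the CSV name already carries a YYYYMMDD date):
#
#   conn = get_conn()  # assumed helper returning a db connection
#   load_into_bankuai(conn, '/data/bankuai_20150601.csv')
#   # or force the business date explicitly
#   load_into_bankuai(conn, '/data/bankuai_latest.csv', biz_date='20150601')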
def insert_into_table(db_field_yaml, stock_obj_name, in_file, conn, log_fh, warn_fh):
    # based on the field mapping between db and object and the db types defined in the yaml,
    # generate the delete sql and insert sql, and fire them to the db
    # this function could be used for any db insert, if the yaml and object are set up properly
    # Yaml example
    # biz_date:
    #   type: date
    #   is_pk: Y
    #   stock_object:
    #     Tengxun_stock: date
    from object_impl.Sina_stock import Sina_stock
    from object_impl.Tengxun_stock import Tengxun_stock
    from object_impl.Yahoo_stock import Yahoo_stock
    db_field_mapping = get_yaml(db_field_yaml)
    tab_name = os.path.basename(db_field_yaml).replace('.yml', '')  # yml file name as table name
    tab_fields = []  # table field names
    tab_pk = []      # table pk
    tab_types = []   # table field types
    obj_attrs = []   # attribute names in stock object
    for k, v in db_field_mapping.items():
        tab_type = v['type']
        obj_attr = v['stock_object'][stock_obj_name]
        if obj_attr != None:  # If None|Null is set for a field in the yml, remove the field from the insertion
            tab_fields.append(k)
            if v['is_pk'] == 'Y':
                tab_pk.append(k)  # pk, delete before insert
            tab_types.append(tab_type)
            obj_attrs.append(obj_attr)
    del_sql = 'delete from {tab_name} where 1=1 '.format(tab_name=tab_name)
    ins_sql = 'insert into {tab_name}({fields}) '.format(tab_name=tab_name, fields=','.join(tab_fields))
    # iterate each row in the file, insert into table
    num = 0
    with open(in_file) as f:
        for row in f.readlines():
            # get_stock_object_from_str is a function which should be available in all the stock objects
            # it accepts the string returned from the website and generates a dict of stock objects
            # the dict is like {stock: {date: object}}
            # dynamically import object module, class name and file name should be identical
            #exec('from object_impl.{object} import {object}'.format(object = stock_obj_name), globals())
            stock_dict = eval('{object}.get_stock_object_from_str(row)'.format(object=stock_obj_name, row=row))
            for stock in stock_dict:  # for the Tengxun or Sina interface, there is just one stock in one stock dict
                for date in stock_dict[stock]:  # for the Tengxun or Sina interface, there is just one date in one stock dict
                    stock_obj = stock_dict[stock][date]  # this object is the stock implementation object
                    # add 'stock_obj.' to the first attr, and concatenate the attrs into a string
                    value_sql = reduce(lambda x, y: (x if re.match(r'stock_obj', x) else 'stock_obj.' + x + ', ') + "stock_obj.{attr_name}, ".format(attr_name=y), obj_attrs)
                    value_sql = value_sql[0:-2]  # remove the last comma and the blank space next to it
                    value_sql = eval(value_sql)  # tuple returned
                    final_value_sql = ''
                    del_where = ''
                    for i, v in enumerate(value_sql):
                        # date and varchar are quoted with single quotes, otherwise no quote, or Null if the value is empty
                        value = "'" + v + "'" if tab_types[i] == 'date' or tab_types[i] == 'varchar' else 'Null' if len(str(v)) == 0 else str(v)
                        final_value_sql = final_value_sql + value + ', '
                        if tab_fields[i] in tab_pk:
                            del_where = del_where + ' and {field}={value}'.format(field=tab_fields[i], value=value)
                    final_value_sql = final_value_sql[0:-2]
                    del_complete_sql = del_sql + del_where
                    ins_complete_sql = ins_sql + ' values( ' + final_value_sql + ')'
                    #print_log('Deleting [{stock},{date}] from {tab_name}...\n {sql}'.format(stock=stock,date=date,tab_name=tab_name,sql=del_complete_sql), log_fh)
                    cur = get_cur(conn)
                    cur.execute(del_complete_sql)
                    cur.execute(ins_complete_sql)
                    print_log('Inserted [{stock},{date}] into {tab_name}.'.format(stock=stock, date=date, tab_name=tab_name), log_fh)
                    num += 1
                    if num % 1000 == 0:
                        conn.commit()
    conn.commit()
    print_log('{num} records have been written into {tab_name}.'.format(num=num, tab_name=tab_name), log_fh)
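# A minimal usage sketch for insert_into_table() (hedged: the yml path, input file and
# get_conn() helper are only illustrative; get_yaml() is assumed to return the mapping as
# a plain dict, e.g.
# {'biz_date': {'type': 'date', 'is_pk': 'Y', 'stock_object': {'Tengxun_stock': 'date', 'Sina_stock': 'date', 'Yahoo_stock': None}}, ...}):
#
#   conn = get_conn()  # assumed helper returning a db connection
#   insert_into_table('conf/dw.stock_eod.yml', 'Tengxun_stock', 'data/eod_20150601.txt', conn, log_fh, warn_fh)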
def load_into_dim_stock(db_conn, file):
    #-- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    codes = {}
    codes_to_update = {}
    codes_to_valid = []
    codes_to_invalid = []
    # 板块 子版块 板块名称 股票代码 股票名称
    # 板块 概念板块 送转预期 600587 新华医疗
    for row in csvr:
        code = row[u'股票代码'.encode("gbk")].decode("gbk")
        name = row[u'股票名称'.encode("gbk")].decode("gbk")
        codes[code] = name
    csvf.close()
    print_log("%(num)s records have been read from %(fname)s." % {"num": len(codes.keys()), "fname": file})
    #---- get id, name from db, search the combination in the csv dict
    # if the id exists but with a different name, update
    # if the id doesn't exist, mark is_valid=N
    select_sql = "select t.id, t.name, t.is_valid from dw.dim_stock t /*where t.is_valid = 'Y'*/"
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_name = db_row["name"].decode("utf-8")
        db_id = db_row["id"]
        db_is_valid = db_row["is_valid"]
        if db_id in codes and db_is_valid == "Y":
            if db_name == codes[db_id]:
                # delete from codes if it's already in the table and the name is not changed
                del codes[db_id]
            else:
                # delete from codes, the codes_to_update dict will be used to update the name
                codes_to_update[db_id] = codes[db_id]
                del codes[db_id]
        elif db_id in codes and db_is_valid == "N":
            codes_to_valid.append("'" + str(db_id) + "'")
            del codes[db_id]
        elif db_is_valid == "N":
            # not in the csv file and it's already invalid in db, do nothing
            pass
        else:
            # not in the csv, but valid in db, mark it invalid
            codes_to_invalid.append("'" + str(db_id) + "'")
    #---- mark stocks is_valid=N
    if len(codes_to_invalid) > 0:
        codes_to_invalid_str = ",".join(codes_to_invalid)
        #print_log("Mark stock ids to invalid: " + codes_to_invalid_str)
        print_log("There are %(num)s stocks to be marked invalid. %(stocks)s" % {"num": len(codes_to_invalid), "stocks": codes_to_invalid_str})
        upd_sql = "update dw.dim_stock t set is_valid = 'N', upd_time = now() where t.id in (%(ids)s)" % {"ids": codes_to_invalid_str}
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stocks need to be marked invalid.")
    #---- mark stocks is_valid=Y
    if len(codes_to_valid) > 0:
        codes_to_valid_str = ",".join(codes_to_valid)
        print_log("There are %(num)s stocks to be marked valid. %(stocks)s" % {"num": len(codes_to_valid), "stocks": codes_to_valid_str})
        upd_sql = "update dw.dim_stock t set is_valid = 'Y', upd_time = now() where t.id in (%(ids)s)" % {"ids": codes_to_valid_str}
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stocks need to be marked valid.")
    #---- update stock names in dim_stock
    if len(codes_to_update.keys()) > 0:
        print_log("There are %(num)s stocks to be updated." % {"num": len(codes_to_update.keys())})
        for id in codes_to_update:
            print_log(id)
            upd_sql = "update dw.dim_stock t set name = '%(name)s', upd_time = now() where t.id = '%(id)s'" % {"id": id, "name": codes_to_update[id]}
            cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stocks need to be updated.")
    #---- insert stocks into dim_stock
    if len(codes.keys()) > 0:
        values = []
        print_log("There are %(num)s stocks to be inserted." % {"num": len(codes.keys())})
        for b in codes:
            print_log(b)
            values.append("('%(id)s', '%(name)s', now(), 'Y')" % {"id": b, "name": codes[b]})
        values_str = ",".join(values)
        ins_sql = "insert into dw.dim_stock(id, name, upd_time, is_valid) values %(values)s" % {"values": values_str}
        cur.execute(ins_sql)
        db_conn.commit()
    else:
        print_log("No new stock ids.")
    print_log("dw.dim_stock has been refreshed successfully.")
def load_into_dim_stock_bankuai(db_conn, file):
    #-- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    bk_st_pairs = []
    bk_st_pairs_dict = {}
    bk_id_dict = {}
    codes_to_valid = []
    codes_to_invalid = []
    # 板块 子版块 板块名称 股票代码 股票名称
    # 板块 概念板块 送转预期 600587 新华医疗
    for row in csvr:
        bk_name = row[u'板块名称'.encode("gbk")].decode("gbk")
        st_id = row[u'股票代码'.encode("gbk")].decode("gbk")
        bk_st_pairs.append([bk_name, st_id])
    csvf.close()
    print_log("%(num)s records have been read from %(fname)s." % {"num": len(bk_st_pairs), "fname": file})
    #---- get bankuai_id from dim_bankuai
    select_sql = "select t.id, t.name from dw.dim_bankuai t"
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_name = db_row["name"].decode("utf-8")
        db_id = db_row["id"]
        bk_id_dict[db_name] = db_id
    #---- convert to dict
    for i in range(len(bk_st_pairs)):
        bk_st_pairs[i][0] = bk_id_dict[bk_st_pairs[i][0]]
        bk_st_pairs[i].append(str(bk_st_pairs[i][0]) + "-" + str(bk_st_pairs[i][1]))  # as PK
        bk_st_pairs_dict[bk_st_pairs[i][2]] = {"bk": bk_st_pairs[i][0], "st": bk_st_pairs[i][1]}
    #---- get bk_id, st_id from db, search the combination in the csv dict
    select_sql = "select t.stock_id, t.bankuai_id, t.is_valid from dw.dim_stock_bankuai t"
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_bk_id = db_row["bankuai_id"]
        db_st_id = db_row["stock_id"]
        db_pk = str(db_bk_id) + "-" + db_st_id
        db_is_valid = db_row["is_valid"]
        if db_pk in bk_st_pairs_dict and db_is_valid == "Y":
            del bk_st_pairs_dict[db_pk]
        elif db_pk in bk_st_pairs_dict and db_is_valid == "N":
            codes_to_valid.append(" ( bankuai_id = " + str(db_bk_id) + " and stock_id = '" + str(db_st_id) + "' ) ")
            del bk_st_pairs_dict[db_pk]
        elif db_is_valid == "N":
            # not in the csv file and it's already invalid in db, do nothing
            pass
        else:
            # not in the csv, but valid in db, mark it invalid
            codes_to_invalid.append(" ( bankuai_id = " + str(db_bk_id) + " and stock_id = '" + str(db_st_id) + "' ) ")
    #---- mark is_valid=N
    if len(codes_to_invalid) > 0:
        codes_to_invalid_str = " or ".join(codes_to_invalid)
        print_log("There are %(num)s stock bankuai combinations to be marked invalid. %(combination)s" % {"num": len(codes_to_invalid), "combination": codes_to_invalid_str})
        upd_sql = "update dw.dim_stock_bankuai t set is_valid = 'N', upd_time = now() where %(combinations)s" % {"combinations": codes_to_invalid_str}
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stock bankuai combinations need to be marked invalid.")
    #---- mark is_valid=Y
    if len(codes_to_valid) > 0:
        codes_to_valid_str = " or ".join(codes_to_valid)
        print_log("There are %(num)s stock bankuai combinations to be marked valid. %(combination)s" % {"num": len(codes_to_valid), "combination": codes_to_valid_str})
        upd_sql = "update dw.dim_stock_bankuai t set is_valid = 'Y', upd_time = now() where %(combinations)s" % {"combinations": codes_to_valid_str}
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stock bankuai combinations need to be marked valid.")
    #---- insert stocks into dim_stock_bankuai
    if len(bk_st_pairs_dict.keys()) > 0:
        values = []
        print_log("There are %(num)s stock bankuai combinations to be inserted." % {"num": len(bk_st_pairs_dict.keys())})
        for pk in bk_st_pairs_dict:
            print_log(pk)
            values.append("('%(stock_id)s', '%(bankuai_id)s', now(), 'Y')" % {"stock_id": bk_st_pairs_dict[pk]["st"], "bankuai_id": bk_st_pairs_dict[pk]["bk"]})
        values_str = ",".join(values)
        ins_sql = "insert into dw.dim_stock_bankuai(stock_id, bankuai_id, upd_time, is_valid) values %(values)s" % {"values": values_str}
        cur.execute(ins_sql)
        db_conn.commit()
    else:
        print_log("No new stock bankuai combination.")
    print_log("dw.dim_stock_bankuai has been refreshed successfully.")
jobs = {
    'download_stock_bankuai': Task_download_stock_bankuai('download_stock_bankuai'),
    'recon_stock_bankuai': Task_recon_stock_bankuai('recon_stock_bankuai'),
    'download_stock_eod': Task_download_stock_eod('download_stock_eod'),
    'download_stock_transaction': Task_download_stock_transaction('download_stock_transaction'),
}
#job_run_seq = ['download_stock_bankuai', 'recon_stock_bankuai', 'download_stock_eod', 'download_stock_transaction']
job_run_seq = ['download_stock_bankuai', 'recon_stock_bankuai', 'download_stock_eod']
job_to_run = []

# determine which jobs need to run
for i, job in enumerate(job_run_seq):
    status = check_job_status(conn, job)
    print_log(job + ' ====> ' + status)
    if status == 'N':
        # once a job has failed, it and the jobs depending on it are added to the to-run list
        job_to_run = job_run_seq[i:]
        break

# add to flow
flow = linear_flow.Flow('Eod loading')
for job in job_to_run:
    flow.add(jobs[job])
engine = taskflow.engines.load(flow)
engine.notifier.register('*', flow_watch)
engine.task_notifier.register('*', task_watch)
try:
    engine.run()
except Exception as ex:
        max_date = m.group("date")
        file_to_recon = file_db_recon[type]["file"].replace("$DATE", max_date)
else:
    if not os.path.isfile(options.in_file):
        error_log("file can't be found! [" + options.in_file + "]")
        exit_process()
    else:
        file_to_recon = options.in_file

#-- building dicts for csv and db
csvf = open(file_to_recon)
csvr = csv.DictReader(csvf)

#-- building dict for csv
# based on the list of recon_fields_in_file, read the corresponding fields in the csv and concatenate them together as a PK
print_log("Start to read %(file)s..." % {"file": file_to_recon})
for row in csvr:
    key = []
    for i in range(len(file_db_recon[type]["recon_fields_in_file"])):
        field = file_db_recon[type]["recon_fields_in_file"][i]
        key.append(row[field.encode("gbk")].decode("gbk"))
    csv_dict["-".join(key)] = ""
print_log("%(num)s records loaded, dict for csv done." % {"num": len(csv_dict.keys())})
csvf.close()

#-- building dict for db
# based on the list of recon_fields_in_db, read the corresponding fields in the db and concatenate them together as a PK
print_log("Start to read db...")
select_sql = file_db_recon[type]["sql"]
cur = get_cur(conn)
def download_to_local(self):
    print_log('Reading data from ' + self.get_url())
    save_file_from_url(self.__download_file, self.get_url())
    print_log('Data saved to ' + self.__download_file)
if not (re.match("^\d{8}$", start_date) and re.match("^\d{8}$", end_date)):
    exit_error("start_date or end_date error! [" + start_date + "][" + end_date + "]")
elif start_date > end_date:
    exit_error("start_date must be smaller than end_date! [" + start_date + "][" + end_date + "]")

# ------------------------------------------- Downloading
if options.mode in ("download", "downloadAndLoad"):
    e = Eastmoney()
    bkbkfile_full_name = Sys_paths.DATA_STOCK_BANKUAI_DAILY + Sys_paths.SEP + "bankuai_" + recent_working_day + ".csv"
    if os.path.exists(bkbkfile_full_name):
        bk_bkbkfile_full_name = bkbkfile_full_name + "." + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        os.rename(bkbkfile_full_name, bk_bkbkfile_full_name)  # rename
        print_log("The original file " + bkbkfile_full_name + " has been renamed to " + bk_bkbkfile_full_name)
    e.export_bankuai_status(bkbkfile_full_name)
    bkstfile_full_name = Sys_paths.DATA_STOCK_BANKUAI_DAILY + Sys_paths.SEP + "bankuai_stock_" + recent_working_day + ".csv"
    if os.path.exists(bkstfile_full_name):
        bk_bkstfile_full_name = bkstfile_full_name + "." + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        os.rename(bkstfile_full_name, bk_bkstfile_full_name)  # rename
        print_log("The original file " + bkstfile_full_name + " has been renamed to " + bk_bkstfile_full_name)
    e.export_bankuai_stock(bkstfile_full_name)

# ------------------------------------------- Loading
if options.mode in ("downloadAndLoad", "load"):
    #-- determine the file to load, $DATE is not replaced yet
    if options.in_file is None:
if not (options.mode in ['download', 'load', 'downloadAndLoad']):
    exit_error(options.mode + ' is not recognized, it could be download|load|downloadAndLoad.')

# check validation of start_date and end_date
if not (re.match("^\d{8}$", options.start_date) and re.match("^\d{8}$", options.end_date)):
    exit_error("Not valid start_date or end_date! [" + options.start_date + "][" + options.end_date + "]")
elif options.start_date > options.end_date:
    exit_error("Start date is greater than end date! [" + options.start_date + "][" + options.end_date + "]")

#-- create queue
queue = Queue(QUEUE_DOWNLOAD_MAX_SIZE)

#-- download stock info from the internet
if options.mode == 'download' or options.mode == 'downloadAndLoad':
    #-- run at most 3 times, in case some stocks failed to download
    for i in ['1st', '2nd', '3rd']:
        print_log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        print_log('downloader running for the {n} time...'.format(n=i))
        print_log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        downloader(queue, conn)
        error_num = download_log_checker(conn)
        if error_num == 0:
            break
        print_log('=================> waiting for 10 seconds to start the next round run...')
        time.sleep(10)
    #-- retried 3 times and still failed, raise a runtime error
    if error_num > 0:
        exit_error('There are {num} stocks failed to download, please check.'.format(num=error_num))
    #queue.task_done()

#-- upsize the queue to speed up data loading
queue = Queue(QUEUE_LOAD_MAX_SIZE)

#-- load stock info into database
if options.mode == 'load' or options.mode == 'downloadAndLoad':
def loader(queue, conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id, merge_before_copy=options.merge_before_copy, enable_copy=options.enable_copy):
    cur_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')
    stock_list_sql = '''
        select row_id, biz_date, stock_id
        from dw.log_stock_transaction
        where biz_date = '{biz_date}'
        and is_download_success = 'Y'
        and (is_load_success = 'N' or is_load_success is null)
    '''
    if not stock_id is None:
        stock_list_sql = stock_list_sql + ' and stock_id = \'' + stock_id + '\''
    cur = get_cur(conn)
    while cur_date_dt <= end_date_dt:
        if merge_before_copy:
            # loading the files one by one into the table takes too much time; to speed this up,
            # all the pieces are merged into one file and the merged file is loaded into the table,
            # which takes less than 5 mins to complete.
            cur_date_str = cur_date_dt.strftime('%Y%m%d')
            working_dir = data_dir + SEP + cur_date_str
            file_merged = os.path.join(working_dir, "file_merged.csv")
            if os.path.exists(file_merged):
                warn_log('Removing old file: ' + file_merged)
                os.remove(file_merged)
            #-- Starting to merge files
            with open(file_merged, "a") as dest:
                i = 0
                for _, _, filenames in os.walk(working_dir):
                    for filename in fnmatch.filter(filenames, "[0-9]*.txt"):
                        with open(os.path.join(working_dir, filename)) as src:
                            shutil.copyfileobj(src, dest)
                        i += 1
            print_log('Merged ' + str(i) + ' files.')
            #-- Deleting records from db
            del_sql = '''delete from dw.stock_transaction where biz_date = '{}' '''.format(cur_date_str)
            get_query_result(conn, del_sql)
            conn.commit()
            print_log('Deletion for biz_date {} completed successfully.'.format(cur_date_str))
            #-- Updating is_load_success to N in the log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'N' where biz_date = '{}' and is_download_success = 'Y' '''.format(cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to N')
            #++++++++ Starting to load the merged file into the table
            psql_copy_from(DB_HOST, DB_NAME, DB_UNAME, 'dw.stock_transaction', file_merged, DB_PORT, args=' with (encoding \'GBK\')')
            print_log('Successfully loaded {} into table.'.format(file_merged))
            #-- Updating is_load_success to Y in the log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'Y' where biz_date = '{}' and is_download_success = 'Y' '''.format(cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to Y')
            #-- Cleaning up working dir
            os.remove(file_merged)
            cur_date_dt = cur_date_dt + datetime.timedelta(1)
        else:
            stock_list_sql_var_replaced = stock_list_sql.format(biz_date=cur_date_dt)
            cur.execute(stock_list_sql_var_replaced)
            rows = list(cur)
            for row in rows:
                row_id = row['row_id']
                biz_date = str(row['biz_date']).replace('-', '')
                stock_id = row['stock_id']
                while queue.full():
                    print_log('=================> queue is full, wait for 1 second...')
                    time.sleep(1)
                s = Stock_trans_loader(queue, conn, row_id, stock_id, biz_date, enable_copy=enable_copy)
                s.start()
                print_log('-----> queue size: ' + str(queue.qsize()))
            conn.commit()
            cur_date_dt = cur_date_dt + datetime.timedelta(1)
    while not queue.empty():
        print_log('=================> queue is not empty yet, wait for 1 second...')
        time.sleep(1)
def save_formatted_data(self): # save formatted data into file, \t as delimiter # 9:25:00 50.34 0.15 141 709794 买盘 with open(self.out_file, 'w') as file: file.write(self.stock_trans_object.get_stock_content()[self.stock_id][self.date]) print_log('Formatted data saved to ' + self.out_file)
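For reference, a small reader for the format written above could look like this (parse_transaction_line is a hypothetical helper; the field meanings are an assumption based on the sample line in the comment: time, price, price change, volume, amount, and buy/sell flag).

def parse_transaction_line(line):
    # Splits one tab-delimited line, e.g. "9:25:00\t50.34\t0.15\t141\t709794\t买盘".
    time_str, price, change, volume, amount, flag = line.rstrip("\n").split("\t")
    return {
        "time": time_str,
        "price": float(price),
        "change": float(change),
        "volume": int(volume),
        "amount": int(amount),
        "flag": flag,
    }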
def load_into_dim_bankuai(db_conn, file, parent_bankuai_ids={u'概念板块': 1, u'地域板块': 2, u'行业板块': 3}): #-- load CSV csvf = open(file) csvr = csv.DictReader(csvf) bankuais = {} invalid_bankuai_ids = [] #---- get parent_bankuai_id, bankuai_name from csv for row in csvr: bankuai = row[u'板块名称'.encode("gbk")].decode("gbk") parent_bankuai = row[u'子版块'.encode("gbk")].decode("gbk") parent_bankuai_id = parent_bankuai_ids[parent_bankuai] bankuais[bankuai] = {} bankuais[bankuai]["parent_bankuai_id"] = parent_bankuai_id #bankuais[bankuai].setdefault("parent_bankuai_id", parent_bankuai_id) csvf.close() print_log("%(num)s records have been read from %(fname)s." % {"num": len(bankuais.keys()), "fname": file}) #---- get parent_bankuai_id, bankuai_name from db, search the combination in the csv dict; if it doesn't exist, add the id to invalid_bankuai_ids select_sql = "select t.parent_bankuai_id, t.name, t.id from dw.dim_bankuai t where t.is_valid = 'Y'" cur = get_cur(db_conn) cur.execute(select_sql) db_rows = list(cur) for db_row in db_rows: db_bankuai = db_row["name"].decode("utf-8") db_parent_bankuai_id = db_row["parent_bankuai_id"] db_id = db_row["id"] if db_bankuai in bankuais: if db_parent_bankuai_id == bankuais[db_bankuai]["parent_bankuai_id"]: #delete from bankuais if it's already in the table and is_valid=Y del bankuais[db_bankuai] else: invalid_bankuai_ids.append(str(db_id)) else: invalid_bankuai_ids.append(str(db_id)) #---- mark invalid bankuais is_valid=N if len(invalid_bankuai_ids) > 0: invalid_bankuai_ids_str = ",".join(invalid_bankuai_ids) print_log("Invalid bankuai ids: " + invalid_bankuai_ids_str) upd_sql = "update dw.dim_bankuai t set is_valid = 'N', upd_time = now() where t.id in (%(ids)s)" % {"ids": invalid_bankuai_ids_str} cur.execute(upd_sql) db_conn.commit() else: print_log("No invalid bankuai ids.") #---- insert new bankuais into dim_bankuai if len(bankuais.keys()) > 0: values = [] print_log("%(num)s bankuais will be inserted." % {"num": len(bankuais.keys())}) for b in bankuais: values.append("('%(name)s', '%(parent_bankuai_id)s', now(), 'Y')" % {"name": b, "parent_bankuai_id": bankuais[b]["parent_bankuai_id"]}) values_str = ",".join(values) ins_sql = "insert into dw.dim_bankuai(name, parent_bankuai_id, upd_time, is_valid) values %(values)s" % {"values": values_str} cur.execute(ins_sql) db_conn.commit() else: print_log("No new bankuai ids.") print_log("dw.dim_bankuai has been refreshed successfully.")
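The insert above builds the VALUES list by string formatting, which requires the bankuai names to be free of quotes; a parameterized alternative (a sketch assuming a psycopg2 cursor; insert_new_bankuais is a hypothetical helper) lets the driver handle the quoting.

def insert_new_bankuais(cur, bankuais):
    # bankuais is the {name: {"parent_bankuai_id": id}} dict built from the CSV above.
    rows = [(name, info["parent_bankuai_id"]) for name, info in bankuais.items()]
    cur.executemany(
        "insert into dw.dim_bankuai(name, parent_bankuai_id, upd_time, is_valid) "
        "values (%s, %s, now(), 'Y')",
        rows,
    )

The caller would still commit on the connection afterwards, as load_into_dim_bankuai does.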
max_date = m.group("date") file_to_recon = file_db_recon[type]["file"].replace("$DATE", max_date) else: if not os.path.isfile(options.in_file): error_log("File can't be found! [" + options.in_file + "]") exit_process() else: file_to_recon = options.in_file #-- building dicts for csv and db csvf = open(file_to_recon) csvr = csv.DictReader(csvf) #-- building dict for csv # based on the list of recon_fields_in_file, read the corresponding fields from the csv and concatenate them into a primary key print_log("Starting to read %(file)s..." % {"file": file_to_recon}) for row in csvr: key = [] for i in range(len(file_db_recon[type]["recon_fields_in_file"])): field = file_db_recon[type]["recon_fields_in_file"][i] key.append(row[field.encode("gbk")].decode("gbk")) csv_dict["-".join(key)] = "" print_log("%(num)s records loaded, dict for csv done." % {"num": len(csv_dict.keys())}) csvf.close() #-- building dict for db # based on the list of recon_fields_in_db, read the corresponding fields from the db and concatenate them into a primary key print_log("Starting to read db...") select_sql = file_db_recon[type]["sql"] cur = get_cur(conn) cur.execute(select_sql)
def load_into_bankuai(db_conn, file, biz_date=None): # 板块 子版块 板块名称 涨跌幅 总市值(亿) 换手率 上涨家数 下跌家数 领涨股票代码 领涨股票 领涨股票涨跌幅 # 板块 概念板块 全息技术 3.95% 365.12 11.65 7 1 600288 大恒科技 10.03 # 板块 概念板块 网络安全 2.95% 818.79 25.61 19 1 002308 威创股份 10.01 # biz_date date not null, # bankuai_id integer not null, # rise varchar(16), # market_value_in_million decimal(12,2), # turnover_rate decimal(5,2), # num_of_rise integer, # num_of_drop integer, # leading_stock_id varchar(6), # rise_of_leading_stock decimal(10,2), # primary key(biz_date, bankuai_id) bk_id_dict = {} csv_data = [] v_biz_date = "" #-- build dict mapping bankuai name to bankuai id from db select_sql = 'select t.name, t.id from dw.dim_bankuai t' cur = get_cur(db_conn) cur.execute(select_sql) db_rows = list(cur) for db_row in db_rows: db_name = db_row["name"].decode("utf-8") db_id = db_row["id"] bk_id_dict[db_name] = db_id print_log("%(num)s records read from %(name)s." % {"num": len(bk_id_dict.keys()), "name": 'dw.dim_bankuai'}) #-- load CSV csvf = open(file) csvr = csv.DictReader(csvf) for row in csvr: bk_name = row[u'板块名称'.encode("gbk")].decode("gbk") bk_id = bk_id_dict[bk_name] row_dict = {} row_dict[bk_id] = {} row_dict[bk_id]["rise"] = row[u'涨跌幅'.encode("gbk")].decode("gbk") row_dict[bk_id]["market_value_in_million"] = row[u'总市值(亿)'.encode("gbk")] row_dict[bk_id]["turnover_rate"] = row[u'换手率'.encode("gbk")] row_dict[bk_id]["num_of_rise"] = row[u'上涨家数'.encode("gbk")] row_dict[bk_id]["num_of_drop"] = row[u'下跌家数'.encode("gbk")] row_dict[bk_id]["leading_stock_id"] = row[u'领涨股票代码'.encode("gbk")] row_dict[bk_id]["rise_of_leading_stock"] = row[u'领涨股票涨跌幅'.encode("gbk")] csv_data.append(row_dict) csvf.close() print_log("%(num)s records have been read from %(name)s." % {"num": len(csv_data), "name": file}) #-- determine biz_date if biz_date is not None: if re.search(r'\d{8}', biz_date): v_biz_date = biz_date else: raise RuntimeError(biz_date + " is not a valid date, the expected format is YYYYMMDD.") elif re.search(r'.*(?P<date>\d{8})\.csv', file): v_biz_date = re.search(r'.*(?P<date>\d{8})\.csv', file).group("date") else: raise RuntimeError('Cannot determine biz_date, please make sure the file name contains a date or pass biz_date explicitly when calling the function.') v_biz_date_dt = datetime.datetime.strptime(v_biz_date, '%Y%m%d') #-- delete biz_date from dw.bankuai del_sql = 'delete from dw.bankuai where biz_date = \'%(date)s\'' % {'date': v_biz_date_dt} cur.execute(del_sql) db_conn.commit() print_log("Deleted records from dw.bankuai where biz_date = '%(biz_date)s'." % {"biz_date": v_biz_date}) #-- insert into dw.bankuai iter = 0 for r in csv_data: k = r.keys()[0] iter += 1 ins_sql = '''insert into dw.bankuai( biz_date, bankuai_id, rise, market_value_in_million, turnover_rate, num_of_rise, num_of_drop, leading_stock_id, rise_of_leading_stock) values( '%(biz_date)s', %(bankuai_id)s, '%(rise)s', %(market_value_in_million)s, %(turnover_rate)s, %(num_of_rise)s, %(num_of_drop)s, '%(leading_stock_id)s', %(rise_of_leading_stock)s )''' % { 'biz_date': v_biz_date_dt, 'bankuai_id': k, 'rise': r[k]['rise'], 'market_value_in_million': r[k]['market_value_in_million'], 'turnover_rate': r[k]['turnover_rate'], 'num_of_rise': r[k]['num_of_rise'], 'num_of_drop': r[k]['num_of_drop'], 'leading_stock_id': r[k]['leading_stock_id'] if r[k]['leading_stock_id'] != '-' else '000000', # Eastmoney sometimes returns '-' instead of a valid leading stock id; use the placeholder '000000' for those rows 'rise_of_leading_stock': r[k]['rise_of_leading_stock'] } cur.execute(ins_sql) db_conn.commit() print_log(str(iter) + " records inserted into dw.bankuai.") print_log("dw.bankuai has been refreshed successfully.")
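Because the delete and the row-by-row inserts above are committed separately, a mid-run failure can leave a biz_date partially loaded; one way to make the reload atomic (a sketch assuming a psycopg2 connection; reload_bankuai_for_date is a hypothetical helper) is to keep the delete and all inserts in one transaction.

def reload_bankuai_for_date(conn, biz_date, csv_data):
    # csv_data: list of one-key dicts keyed by bankuai_id, as assembled above.
    # biz_date may be a datetime.date or a 'YYYYMMDD' string.
    cur = conn.cursor()
    try:
        cur.execute("delete from dw.bankuai where biz_date = %s", (biz_date,))
        for r in csv_data:
            bankuai_id = list(r.keys())[0]
            v = r[bankuai_id]
            # '-' means Eastmoney did not return a leading stock id; use the placeholder.
            leading_stock_id = v["leading_stock_id"] if v["leading_stock_id"] != "-" else "000000"
            cur.execute(
                "insert into dw.bankuai(biz_date, bankuai_id, rise, market_value_in_million, "
                "turnover_rate, num_of_rise, num_of_drop, leading_stock_id, rise_of_leading_stock) "
                "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (biz_date, bankuai_id, v["rise"], v["market_value_in_million"],
                 v["turnover_rate"], v["num_of_rise"], v["num_of_drop"],
                 leading_stock_id, v["rise_of_leading_stock"]),
            )
        conn.commit()
    except Exception:
        conn.rollback()
        raise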
# validate start_date and end_date if not (re.match(r"^\d{8}$", options.start_date) and re.match(r"^\d{8}$", options.end_date)): exit_error("Invalid start_date or end_date! [" + options.start_date + "][" + options.end_date + "]") elif options.start_date > options.end_date: exit_error("Start date is greater than end date! [" + options.start_date + "][" + options.end_date + "]") #-- create queue queue = Queue(QUEUE_DOWNLOAD_MAX_SIZE) #-- download stock info from internet if options.mode == 'download' or options.mode == 'downloadAndLoad': #-- run at most 3 times, in case some stocks fail to download on the first pass for i in ['1st', '2nd', '3rd']: print_log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') print_log('downloader running for the {n} time...'.format(n=i)) print_log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') downloader(queue, conn) error_num = download_log_checker(conn) if error_num == 0: break print_log('=================> waiting 10 seconds before starting the next round...') time.sleep(10) #-- if downloads still fail after 3 attempts, exit with an error if error_num > 0: exit_error('{num} stocks failed to download, please check.'.format(num=error_num)) #queue.task_done()
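The three-pass loop above is written out by hand; the same retry pattern as a small reusable helper (a sketch; run_with_retries and its defaults are illustrative, not part of the script) would be:

import time

def run_with_retries(action, check, max_attempts=3, delay=10):
    # action() does the work; check() returns how many items still failed.
    # Retry until check() reports zero failures or the attempts run out.
    failures = 0
    for attempt in range(1, max_attempts + 1):
        action()
        failures = check()
        if failures == 0:
            return 0
        if attempt < max_attempts:
            time.sleep(delay)
    return failures

With the functions above it could be called as run_with_retries(lambda: downloader(queue, conn), lambda: download_log_checker(conn)); a non-zero return value would then trigger exit_error as before.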
if not (re.match(r"^\d{8}$", start_date) and re.match(r"^\d{8}$", end_date)): exit_error("Invalid start_date or end_date! [" + start_date + "][" + end_date + "]") elif start_date > end_date: exit_error("start_date must not be greater than end_date! [" + start_date + "][" + end_date + "]") #------------------------------------------- Downloading if options.mode in ('download', 'downloadAndLoad'): e = Eastmoney() bkbkfile_full_name = Sys_paths.DATA_STOCK_BANKUAI_DAILY + Sys_paths.SEP + 'bankuai_' + recent_working_day + '.csv' if os.path.exists(bkbkfile_full_name): bk_bkbkfile_full_name = bkbkfile_full_name + "." + datetime.datetime.now().strftime("%Y%m%d%H%M%S") os.rename(bkbkfile_full_name, bk_bkbkfile_full_name) # back up the existing file with a timestamp suffix print_log('The original file ' + bkbkfile_full_name + " has been renamed to " + bk_bkbkfile_full_name) e.export_bankuai_status(bkbkfile_full_name) bkstfile_full_name = Sys_paths.DATA_STOCK_BANKUAI_DAILY + Sys_paths.SEP + 'bankuai_stock_' + recent_working_day + '.csv' if os.path.exists(bkstfile_full_name): bk_bkstfile_full_name = bkstfile_full_name + "." + datetime.datetime.now().strftime("%Y%m%d%H%M%S") os.rename(bkstfile_full_name, bk_bkstfile_full_name) # back up the existing file with a timestamp suffix print_log('The original file ' + bkstfile_full_name + " has been renamed to " + bk_bkstfile_full_name) e.export_bankuai_stock(bkstfile_full_name) #------------------------------------------- Loading if options.mode in ('downloadAndLoad', 'load'): #-- determine file to load ($DATE placeholder not yet replaced) if options.in_file is None: if options.table is None: for tab in table_mapping: