def calRecordNum(trade_dates):
    # count the total number of (date, code) records that will be downloaded
    tot_record_num = 0
    for tmp_date in trade_dates:
        tmp_codes = c.sector("001004", tmp_date)
        tot_record_num += len(tmp_codes.Data) // 2  # Data holds alternating (code, name) pairs
    return tot_record_num
def getStockExistInfo(trade_dates):  # csd can get data easily
    # build a date x code flag table marking on which trade dates each stock exists
    record_buffer = []
    tot_record_num = 0
    for tmp_date in trade_dates:
        tmp_codes = c.sector("001004", tmp_date)
        tmp_codes = [tmp_codes.Data[i] for i in range(0, len(tmp_codes.Data), 2)]  # only keep stock codes
        tmp_record = pd.DataFrame({'code': tmp_codes})
        tmp_record.loc[:, 'date'] = tmp_date
        record_buffer.append(tmp_record)
        tot_record_num += len(tmp_codes)
    stock_record = pd.concat(record_buffer, ignore_index=True)  # DataFrame.append is removed in pandas 2.x
    stock_record.loc[:, 'exist'] = 1
    exist_stock_flag = stock_record.pivot_table(values='exist', index='date', columns='code', aggfunc=np.sum)

    # get the first and last existing date of every stock
    stock_existing_points = pd.DataFrame([], index=exist_stock_flag.columns, columns=['first', 'last'])
    for tmp_code in exist_stock_flag.columns:
        tmp_all_existing_dates = exist_stock_flag.loc[~exist_stock_flag[tmp_code].isnull()].index
        stock_existing_points.loc[tmp_code] = [tmp_all_existing_dates[0], tmp_all_existing_dates[-1]]

    return exist_stock_flag, stock_existing_points, tot_record_num
def updateCSS(max_data_num, start_date='2007-01-01', end_date=''):
    # get trade calendar
    quant_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigQuant))
    if end_date == '':
        end_date = datetime.strftime(datetime.today() - timedelta(1), '%Y-%m-%d')  # yesterday
    sql_statement = "select date from %s where date between '%s' and '%s'" % (calendarTableName, start_date, end_date)
    trade_dates = pd.read_sql(sql_statement, quant_engine)
    trade_dates = trade_dates['date'].values

    # check if the target table exists, and make sure no duplicate data is left in the date range
    sql_statement = "select TABLE_NAME from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='quant' and TABLE_NAME='%s';" % targetTableName
    tmp_result = pd.read_sql(sql_statement, quant_engine)
    if tmp_result.shape[0] > 0:  # table already exists
        write_method = 'append'  # append data, keep the old data
        tmp_conn = pymysql.connect(**ConfigQuant)
        with tmp_conn.cursor() as tmp_cur:
            sql_statement = "delete from %s where date between '%s' and '%s'" % (targetTableName, start_date, end_date)  # avoid duplicated data
            tmp_num = tmp_cur.execute(sql_statement)
            print('deleted duplicate records:', tmp_num)
            sql_statement = "delete from %s where date between '%s' and '%s'" % (adjQuoteTableName, start_date, end_date)  # avoid duplicated data
            tmp_num = tmp_cur.execute(sql_statement)
            print('deleted duplicate records:', tmp_num)
        tmp_conn.commit()
        tmp_conn.close()
    else:
        write_method = 'replace'

    # Choice API: login
    loginResult = c.start("ForceLogin=1", '', mainCallback)
    if loginResult.ErrorCode != 0:
        print("login failed")
        exit()

    # get total record num and check the data quota before downloading
    tot_record_num = calRecordNum(trade_dates)  # c.sector does not consume the data quota
    print('total record num:', tot_record_num)
    if max_data_num < tot_record_num * (len(targetColNames) - 2):
        print('data quota not enough to download all the data')
        raise ValueError

    # loop over trade dates to download stock quotes from the API
    choice_data = pd.DataFrame([], columns=targetColNames)
    dump_data_size = 50000  # dump data into the database every n records
    tmp_index = 0
    current_dump_num = 0
    for tmp_date in trade_dates:
        tmp_codes = c.sector("001004", tmp_date)
        tmp_codes = [tmp_codes.Data[i] for i in range(0, len(tmp_codes.Data), 2)]
        tmp_codes = ','.join(tmp_codes)
        tmp_data = c.css(tmp_codes, choiceFieldName, "TradeDate=%s,AdjustFlag=1" % tmp_date)  # AdjustFlag=1: unadjusted price
        for tmp_c, tmp_quote in tmp_data.Data.items():
            tmp_c = tmp_c[:-3]  # strip the exchange suffix (e.g. '.SZ', '.SH')
            tmp_buffer = [tmp_date, tmp_c]
            tmp_buffer.extend(tmp_quote)
            choice_data.loc[tmp_index] = tmp_buffer
            tmp_index += 1

        tmp_cols = set(targetColNames) - {'date', 'code'}
        for tmp_col in tmp_cols:
            choice_data.loc[:, tmp_col] = choice_data[tmp_col].astype('float')  # make sure data types are consistent

        # dump data into the database by chunk (or on the final trade date)
        if (choice_data.shape[0] > dump_data_size) or (tmp_date == trade_dates[-1]):
            choice_data.loc[:, 'time_stamp'] = datetime.now()
            choice_data.to_sql(targetTableName, quant_engine, index=False, if_exists=write_method)

            # Houfuquan (backward-adjusted) quotes: multiply price columns by the adjustment factor
            for tmp_col in adjColNames:
                choice_data.loc[:, tmp_col] = choice_data[tmp_col] * choice_data['adj_factor']
            choice_data = choice_data.drop('adj_factor', axis=1)
            choice_data.to_sql(adjQuoteTableName, quant_engine, index=False, if_exists=write_method)

            current_dump_num += choice_data.shape[0]
            print('finished dumping records:', current_dump_num)
            write_method = 'append'
            choice_data = pd.DataFrame([], columns=targetColNames)

    c.stop()  # logout
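# Illustrative entry point (a minimal sketch, not part of the original script): it only
# shows how updateCSS might be invoked. The quota value and start date below are
# placeholder assumptions; replace them with the limits of your own Choice data plan.
if __name__ == '__main__':
    updateCSS(max_data_num=1000000, start_date='2007-01-01')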