def replicateLatestRecord(self, no_update_code_list, trade_calendar,
                          target_latest_trade_date):
    """Replicate the most recent target-table rows forward in time.

    For codes that received no new source data, copy each code's record at
    ``target_latest_trade_date`` onto every trade date after that date up to
    today, so the target table stays dense.

    :param no_update_code_list: codes without new source data
    :param trade_calendar: pandas Series of trade-date strings ('YYYY-MM-DD')
    :param target_latest_trade_date: latest date already present in target
    :return: DataFrame of replicated rows (empty if nothing to expand), with
             the time-stamp column always present
    """
    data_tot_result = pd.DataFrame([])
    today_str = datetime.strftime(datetime.now(), '%Y-%m-%d')
    # trade dates newer than the target table but not in the future
    expand_dates = trade_calendar[(trade_calendar > target_latest_trade_date)
                                  & (trade_calendar <= today_str)]
    # guard no_update_code_list too: an empty list would otherwise produce
    # an invalid SQL clause "in ()"
    if (not expand_dates.empty) and no_update_code_list:
        tmp_not_updated_codes = ','.join(
            "'%s'" % x for x in no_update_code_list)
        # get the latest record of each not-updated code from target table
        tmp_state = "select * from `%s` where `%s` in (%s) and `%s` = '%s'" % (
            self.targetTableName, self.codeField, tmp_not_updated_codes,
            self.dateField, target_latest_trade_date)
        tmp_latest_records = readDB(tmp_state, ConfigQuant)
        # expand each latest record up to today
        for tmp_idx in tmp_latest_records.index:
            tmp_record = tmp_latest_records.loc[tmp_idx]
            tmp_expand_records = pd.DataFrame([])
            tmp_expand_records = tmp_expand_records.append(
                [tmp_record] * expand_dates.size)
            tmp_expand_records.loc[:, self.dateField] = expand_dates.tolist()
            data_tot_result = data_tot_result.append(tmp_expand_records)
    # plain column assignment also works on an empty frame
    data_tot_result[self.timeStampField] = datetime.now()
    return data_tot_result
def run(self):
    """Compute time-series indicators for one value series and persist them."""
    self.prepareData()
    # an empty statement means the target table is already up to date
    if self.state == '':
        return
    raw = readDB(self.state, ConfigQuant)
    raw = raw.drop_duplicates([self.dateField]).sort_values(self.dateField)
    # start from the date column, then attach the derived indicators
    derived = pd.DataFrame(raw[self.dateField])
    indicators = priceTechnicalIndicatorTimeSeries(raw[self.valueField],
                                                   self.lags, 'PRICE')
    derived = derived.join(indicators)
    # incremental mode: keep only rows newer than the last update
    if self.last_update_date != '':
        newer = derived[self.dateField] > self.last_update_date
        derived = derived.loc[newer]
    derived['time_stamp'] = datetime.now()
    writeDB(self.targetTableName, derived, ConfigQuant, self.if_exist)
def getStockCategory(self, stock_data, index_data, date_range, stock_area):
    """Attach category labels to stock rows and join the matching index data.

    The join strategy depends on self.categoryField:
      * 'industry' - category read from SQL, joined on (date, code)
      * 'area'     - joined from the supplied stock_area mapping on code
      * 'market'   - board inferred from the stock code prefix
      * 'HS300'    - index joined on date only (single benchmark)
    Clashing index columns get an '_index' suffix; stock columns keep theirs.
    """
    kind = self.categoryField
    index_keys = [self.dateField, kind]
    if kind == 'industry':
        query = 'select `%s`, `%s`, `%s` from %s where %s in (%s)' % (
            self.dateField, self.codeField, kind,
            self.sourceCategoryTableName, self.dateField, date_range)
        industry_map = readDB(query, ConfigQuant)
        stock_data = stock_data.merge(industry_map,
                                      on=[self.dateField, self.codeField])
        stock_data = stock_data.merge(index_data, on=index_keys,
                                      suffixes=['', '_index'])
    elif kind == 'area':
        stock_data = stock_data.merge(stock_area, on=self.codeField)
        stock_data = stock_data.merge(index_data, on=index_keys,
                                      suffixes=['', '_index'])
    elif kind == 'market':
        def board_of(code):
            # Shanghai main board, SME board, growth board, else Shenzhen
            if code[:2] == '60':
                return 'SH'
            if code[:3] == '002':
                return 'SMEB'
            if code[:3] == '300':
                return 'GEB'
            return 'SZ'
        stock_data[kind] = stock_data[self.codeField].apply(board_of)
        stock_data = stock_data.merge(index_data, on=index_keys,
                                      suffixes=['', '_index'])
    elif kind == 'HS300':
        stock_data = stock_data.merge(index_data, on=self.dateField,
                                      suffixes=['', '_index'])
    return stock_data
def getMarketRiseRatio(self, stock_data, date_range, stock_area, date_set):
    """Compute the rising-stock ratio per date within each category.

    For 'all' the ratio is computed across the whole market per date;
    otherwise stocks are first labelled with their category ('industry'
    from SQL, 'area' from stock_area, 'market' from the code prefix) and
    the ratio is computed per (date, category) group via self.riseRatio.
    """
    kind = self.categoryField
    if kind == 'all':
        # whole-market ratio: group by date only
        per_date = stock_data.groupby(self.dateField)[self.retSeriesField]
        return per_date.apply(self.riseRatio).reset_index()
    if kind == 'industry':
        query = 'select `%s`, `%s`, `%s` from %s where %s in (%s)' % (
            self.dateField, self.codeField, kind,
            self.sourceCategoryTableName, self.dateField, date_range)
        labels = readDB(query, ConfigQuant)
        stock_data = stock_data.merge(labels,
                                      on=[self.dateField, self.codeField])
    elif kind == 'area':
        stock_data = stock_data.merge(stock_area, on=self.codeField)
    elif kind == 'market':
        def board_of(code):
            # Shanghai main board, SME board, growth board, else Shenzhen
            if code[:2] == '60':
                return 'SH'
            if code[:3] == '002':
                return 'SMEB'
            if code[:3] == '300':
                return 'GEB'
            return 'SZ'
        stock_data[kind] = stock_data[self.codeField].apply(board_of)
    grouped = stock_data.groupby([self.dateField, kind])[self.retSeriesField]
    return grouped.apply(self.riseRatio).reset_index()
def run(self):
    """Build OHLCV technical indicators for one instrument and persist them."""
    self.prepareData()
    if self.state == '':
        # nothing new to compute
        return
    raw = readDB(self.state, ConfigQuant)
    raw = raw.drop_duplicates([self.dateField]).sort_values(self.dateField)
    raw = raw.fillna(method='ffill')
    result = pd.DataFrame(raw[self.dateField])
    # three indicator families, joined onto the date column in order:
    # price series (MA, MA_DIFF...), OHLCV (ADX, CCI...), and
    # volume (VMA, VMA_DIFF..., OBV, VOLUME_RATIO...)
    parts = (
        priceTechnicalIndicator(raw[self.closeField], self.lags, ''),
        priceTechnicalIndicatorOHLCV(raw[self.openField],
                                     raw[self.highField],
                                     raw[self.lowField],
                                     raw[self.closeField],
                                     raw[self.volumeField]),
        volumeTechnicalIndicators(raw[self.closeField],
                                  raw[self.volumeField], self.lags, 'V'),
    )
    for part in parts:
        result = result.join(part)
    # incremental mode: keep only rows newer than the last update
    if self.last_update_date != '':
        result = result.loc[result[self.dateField] > self.last_update_date]
    result['time_stamp'] = datetime.now()
    writeDB(self.targetTableName, result, ConfigQuant, self.if_exist)
def getStockCategory(self, stock_data, uni_dates, stock_area):
    """Label each stock row with its category column.

    'industry' reads the mapping from SQL for the given dates, 'area'
    joins the supplied stock_area frame on code, 'market' derives the
    board from the stock code prefix, and 'all' leaves the data untouched.
    """
    kind = self.categoryField
    if kind == 'industry':
        quoted = ("'%s'" % d for d in uni_dates)
        date_range = ','.join(quoted)
        query = 'select `%s`, `%s`, `%s` from %s where %s in (%s)' % (
            self.dateField, self.codeField, kind,
            self.sourceCategoryTableName, self.dateField, date_range)
        labels = readDB(query, ConfigQuant)
        stock_data = stock_data.merge(labels,
                                      on=[self.dateField, self.codeField])
    elif kind == 'area':
        stock_data = stock_data.merge(stock_area, on=self.codeField)
    elif kind == 'market':
        def board_of(code):
            # Shanghai main board, SME board, growth board, else Shenzhen
            if code[:2] == '60':
                return 'SH'
            if code[:3] == '002':
                return 'SMEB'
            if code[:3] == '300':
                return 'GEB'
            return 'SZ'
        stock_data[kind] = stock_data[self.codeField].apply(board_of)
    # 'all' (and any other value) needs no labelling
    return stock_data
def runFull(self):
    """Full rebuild of financial-report derived features for every stock.

    Reads the source table in chunks of self.chunkSize codes, combines
    report year and season into a single sortable period key, runs
    self.coreComputation per stock (in a process pool when
    self.isMultiProcess), and writes each chunk to the target table.
    The first chunk honours self.if_exist; later chunks always append.
    """
    # get total code list
    tmp_state = 'select distinct %s from %s' % (self.codeField,
                                                self.sourceTableName)
    code_list = readDB(tmp_state, ConfigQuant).values
    code_list = code_list.T[0]  # flatten the single-column result
    # get trade date list from calendar
    tmp_state = 'select `date` from %s' % self.calendarTableName
    tradedates = readDB(tmp_state, ConfigQuant)['date']
    # number of chunks (ceiling of size / chunkSize)
    loop_num = int(code_list.size / self.chunkSize)
    if code_list.size > loop_num * self.chunkSize:
        loop_num = loop_num + 1
    if self.isMultiProcess:  # use multi processing
        # register pool
        pool = Pool(processes=self.processNum)
        # fetch and process data from sql by chunk
        for i in range(loop_num):
            tmp_code = code_list[i * self.chunkSize:(i + 1) * self.chunkSize]
            tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
            tmp_range = ','.join(tmp_code_str)
            tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                       tmp_range)
            dataO = readDB(tmp_state, ConfigQuant)
            # combine report year and season into one period key (e.g. 201703)
            dataO.loc[:, self.reportPeriodField] = dataO[
                self.yearField] * 100 + dataO[self.seasonField]
            dataO = dataO.drop_duplicates(
                [self.codeField, self.reportPeriodField])
            dataO = dataO.sort_values(
                self.reportPeriodField)  # sort by report period
            # dispatch one task per stock code
            pool_results = []
            for code in tmp_code:
                tmp_data = dataO.loc[dataO[self.codeField] ==
                                     code]  # dataO already sorted
                if tmp_data.empty:
                    continue
                # multiprocessing
                tmp_procs = pool.apply_async(self.coreComputation,
                                             (code, tmp_data, tradedates))
                pool_results.append(tmp_procs)
            # collect results from the process pool
            data_tot_result = pd.DataFrame([])
            for tmp_procs in pool_results:
                data_result = tmp_procs.get()
                data_tot_result = data_tot_result.append(data_result)
            # add timestamp (harmless no-op column on an empty frame)
            data_tot_result['time_stamp'] = datetime.now()
            if data_tot_result.empty:
                continue
            # dump chunk data into sql
            writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'
        # NOTE(review): close() without join(); workers have all been
        # drained via get() above, but confirm shutdown is intended
        pool.close()
    else:  # not multiprocess
        # fetch and process data from sql by chunk
        for i in range(loop_num):
            tmp_code = code_list[i * self.chunkSize:(i + 1) * self.chunkSize]
            tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
            tmp_range = ','.join(tmp_code_str)
            tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                       tmp_range)
            dataO = readDB(tmp_state, ConfigQuant)
            # combine report year and season into one period key
            dataO.loc[:, self.reportPeriodField] = dataO[
                self.yearField] * 100 + dataO[self.seasonField]
            dataO = dataO.drop_duplicates(
                [self.codeField, self.reportPeriodField])
            # dataO = dataO.sort_values(self.reportPeriodField) # sort by report period
            data_tot_result = pd.DataFrame([])
            for code in tmp_code:
                tmp_data = dataO.loc[dataO[self.codeField] == code]
                tmp_data = tmp_data.sort_values(
                    self.reportPeriodField)  # sort by report period
                if tmp_data.empty:
                    continue
                data_result = self.coreComputation(code, tmp_data, tradedates)
                data_tot_result = data_tot_result.append(data_result)
            # add timestamp
            data_tot_result['time_stamp'] = datetime.now()
            if data_tot_result.empty:
                continue
            # dump chunk data into sql
            writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'
def runIncrm(self):
    """Incremental update: recompute features only for stocks with new reports.

    Compares source rows' time stamps and release dates against the target
    table's latest state.  Codes with new or revised reports are fully
    recomputed from the earliest affected date (obsolete target rows are
    deleted first via deleteObsoleteDataFromDB); untouched codes get their
    latest record replicated forward by self.replicateLatestRecord.
    """
    # fetch and process all incremental data from sql
    dataO = readDB(self.state, ConfigQuant)
    dataO.loc[:, self.reportPeriodField] = dataO[
        self.yearField] * 100 + dataO[self.seasonField]
    dataO = dataO.drop_duplicates([self.codeField, self.reportPeriodField])
    # dataO = dataO.sort_values(self.reportPeriodField) # sort by date
    # get calendar
    tmp_state = "select `date` from `%s`;" % self.calendarTableName
    trade_calendar = readDB(tmp_state, ConfigQuant)
    trade_calendar = trade_calendar['date']
    # get latest time stamp for each stock in the target table
    tmp_state = "select `%s`, max(`%s`) as %s from `%s` group by `%s`" % (
        self.codeField, self.timeStampField, self.latestTimeStamp,
        self.targetTableName, self.codeField)
    target_latest_time_stamp = readDB(tmp_state, ConfigQuant)
    target_latest_time_stamp = target_latest_time_stamp.set_index(
        self.codeField)
    # get the latest trade date of the data in target table
    tmp_state = "select max(`%s`) from `%s`" % (self.dateField,
                                                self.targetTableName)
    target_latest_trade_date = readDB(tmp_state, ConfigQuant)
    target_latest_trade_date = target_latest_trade_date.iloc[0, 0]
    # process incremental data
    code_list = dataO[self.codeField].unique()
    data_tot_result = pd.DataFrame([])
    # use multiprocessing to improve computation hours
    if self.isMultiProcess:
        pool = Pool(processes=self.processNum)
        pool_results = []
        pool_data_first_date = []
        no_update_code_list = []
        # build pool
        for code in code_list:
            tmp_data = dataO.loc[dataO[self.codeField] == code]
            tmp_data = tmp_data.sort_values(
                self.reportPeriodField)  # sorted by report period
            if tmp_data.empty:
                continue
            # find the latest time stamp, compare it with the raw data, and
            # delete obsolete target rows if new data exists
            is_new_data = False
            try:
                tmp_target_latest_time_stamp = target_latest_time_stamp.loc[
                    code, self.latestTimeStamp]
                # rows stamped at/after the target's latest write ...
                tmp_data_source_new = tmp_data.loc[tmp_data[
                    self.timeStampField] >= tmp_target_latest_time_stamp]
                # ... plus rows released after the target's latest trade date
                tmp_data_source_unexpanded = tmp_data.loc[tmp_data[
                    self.releaseDateField] > target_latest_trade_date]
                tmp_data_source_new = tmp_data_source_new.append(
                    tmp_data_source_unexpanded)
                tmp_data_source_new = tmp_data_source_new.drop_duplicates(
                    [self.codeField, self.yearField, self.seasonField])
                tmp_data_source_new = tmp_data_source_new.loc[
                    ~tmp_data_source_new[self.releaseDateField].isnull()]
                if not tmp_data_source_new.empty:  # obsolete data
                    data_new_first_data = tmp_data_source_new[
                        self.releaseDateField].min(
                        )  # find the earliest report in new update data
                    if type(
                            data_new_first_data
                    ).__name__ == 'str':  # else data_new_first_data is nan
                        is_new_data = True
                        deleteObsoleteDataFromDB(
                            code, data_new_first_data, self.dateField,
                            self.codeField, self.targetTableName, ConfigQuant
                        )  # delete obsolete data not earlier than the earliest report date
            except KeyError:
                is_new_data = True
                data_new_first_data = '2007-01-01'  # this stock code is new to the target table
            if is_new_data:  # have values updated or completely new
                tmp_result = pool.apply_async(
                    self.coreComputation, (code, tmp_data, trade_calendar))
                pool_results.append(tmp_result)
                # regenerate from the earlier of: first new release date, or
                # the first calendar day after the target's latest date
                data_new_first_data = min(
                    data_new_first_data,
                    trade_calendar[
                        trade_calendar > target_latest_trade_date].iloc[0])
                pool_data_first_date.append(data_new_first_data)
            else:  # no new data from source table to update target table
                no_update_code_list.append(code)
        # get result from the pool
        for tmp_result, tmp_first_date in zip(pool_results,
                                              pool_data_first_date):
            data_result = tmp_result.get()
            data_result = data_result.loc[
                data_result[self.dateField] >=
                tmp_first_date]  # slice data, from the earliest report release date
            # NOTE(review): 'code' here is left over from the dispatch loop
            # above, so this log line may name the wrong stock -- confirm
            print('%s regenerate %d data' % (code, data_result.shape[0]))
            data_tot_result = data_tot_result.append(data_result)
        # replicate the latest records for those not updated codes
        replicate_records = self.replicateLatestRecord(
            no_update_code_list, trade_calendar, target_latest_trade_date)
        data_tot_result = data_tot_result.append(replicate_records)
    else:  # single process
        no_update_code_list = []
        for code in code_list:
            tmp_data = dataO.loc[dataO[self.codeField] == code]
            tmp_data = tmp_data.sort_values(
                self.reportPeriodField)  # sorted by report period
            if tmp_data.empty:
                continue
            # find the latest time stamp, compare it with the raw data, and
            # delete obsolete target rows if new data exists
            has_new_data = False
            try:
                tmp_target_latest_time_stamp = target_latest_time_stamp.loc[
                    code, self.latestTimeStamp]
                tmp_data_source_new = tmp_data[tmp_data[
                    self.timeStampField] >= tmp_target_latest_time_stamp]
                tmp_data_source_unexpanded = tmp_data[tmp_data[
                    self.releaseDateField] > target_latest_trade_date]
                tmp_data_source_new = tmp_data_source_new.append(
                    tmp_data_source_unexpanded)
                # NOTE(review): unlike the multiprocess branch, rows with a
                # null release date are NOT filtered out here -- confirm
                tmp_data_source_new = tmp_data_source_new.drop_duplicates(
                    [self.codeField, self.yearField, self.seasonField])
                if not tmp_data_source_new.empty:  # obsolete data
                    tmp_data_new_first_data = tmp_data_source_new[
                        self.releaseDateField].min(
                        )  # find the earliest report in new update data
                    if type(
                            tmp_data_new_first_data
                    ).__name__ == 'str':  # else tmp_data_new_first_data is nan
                        has_new_data = True
                        deleteObsoleteDataFromDB(
                            code, tmp_data_new_first_data, self.dateField,
                            self.codeField, self.targetTableName, ConfigQuant
                        )  # delete obsolete data later than the earliest report date
            except KeyError:
                has_new_data = True  # this stock code is new to the target table
                tmp_data_new_first_data = '2007-01-01'
            if has_new_data:
                data_result = self.coreComputation(code, tmp_data,
                                                   trade_calendar)
                tmp_data_new_first_data = min(
                    tmp_data_new_first_data,
                    trade_calendar[
                        trade_calendar > target_latest_trade_date].iloc[0])
                data_result = data_result.loc[
                    data_result[self.dateField] >= tmp_data_new_first_data]
                print('%s regenerate %d data' % (code, data_result.shape[0]))
                data_tot_result = data_tot_result.append(data_result)
            else:  # no new data from source table to update target table
                no_update_code_list.append(code)
        # replicate latest records for those not updated codes
        replicate_records = self.replicateLatestRecord(
            no_update_code_list, trade_calendar, target_latest_trade_date)
        data_tot_result = data_tot_result.append(replicate_records)
    if not data_tot_result.empty:
        # add timestamp
        data_tot_result['time_stamp'] = datetime.now()
        # dump chunk data into sql
        writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                self.if_exist)
        self.if_exist = 'append'
def run(self, startDate='2007-01-01'):
    """Compute per-stock excess returns over the matching category index.

    Processes dates in chunks of self.chunkSize: joins stock returns with
    their category index returns (via self.getStockCategory) and writes
    EXCESSIVE_<field> = stock return - index return to the target table.

    :param startDate: first date (exclusive) for a full run, used when
                      self.last_update_date is empty
    """
    self.prepareData()
    if self.stockState == '':  # already the latest data
        return
    # get total date list (horizontally)
    tmp_state = 'select distinct %s from %s' % (self.dateField,
                                                self.sourceStockTableName)
    date_list = readDB(tmp_state, ConfigQuant).values
    date_list = date_list.T[0]
    if self.last_update_date == '':
        date_list = date_list[date_list > startDate]
    else:
        date_list = date_list[date_list > self.last_update_date]
    # SQL `distinct` guarantees no order; sort so chunks are processed in
    # date order (consistent with the sibling ratio `run` implementation)
    date_list = np.sort(date_list)
    # ****************** area is the same for all dates, that means using
    # future data!!! need to be modified
    if self.categoryField == 'area':
        tmp_state = "SELECT `%s`, `%s` FROM `%s`;" % (
            self.codeField, self.categoryField, self.sourceCategoryTableName)
        stock_area = readDB(tmp_state, ConfigQuant)
    else:
        stock_area = None
    # number of chunks (ceiling division)
    loop_num = int(date_list.size / self.chunkSize)
    if date_list.size > loop_num * self.chunkSize:
        loop_num = loop_num + 1
    # fetch and process data from sql by chunk
    for i in range(loop_num):
        tmp_date = date_list[i * self.chunkSize:(i + 1) * self.chunkSize]
        tmp_date_str = list(map(lambda x: "'%s'" % x, tmp_date))
        tmp_range = ','.join(tmp_date_str)
        # read stock return data
        tmp_state = self.stockState + "`%s` in (%s)" % (self.dateField,
                                                        tmp_range)
        dataStockO = readDB(tmp_state, ConfigQuant)
        # read index return data
        tmp_state = self.indexState + "`%s` in (%s)" % (self.dateField,
                                                        tmp_range)
        dataIndexO = readDB(tmp_state, ConfigQuant)
        dataStockO = dataStockO.drop_duplicates(
            [self.dateField, self.codeField])
        # HS300 is a single benchmark, unique per date; other categories
        # are unique per (date, category)
        if self.categoryField == 'HS300':
            dataIndexO = dataIndexO.drop_duplicates(self.dateField)
        else:
            dataIndexO = dataIndexO.drop_duplicates(
                [self.dateField, self.categoryField])
        data_result = self.getStockCategory(dataStockO, dataIndexO,
                                            tmp_range, stock_area)
        # excess return = stock return - category index return
        new_fields = list(
            map(lambda x: 'EXCESSIVE_%s' % x, self.retSeriesField))
        for field in zip(new_fields, self.retSeriesField):
            data_result[field[0]] = data_result[field[1]] - data_result[
                field[1] + '_index']
        tot_fields = [self.dateField, self.codeField]
        tot_fields.extend(new_fields)
        data_result = data_result[tot_fields]
        print(self.targetTableName)
        print(data_result.shape)
        # add timestamp
        data_result.loc[:, 'time_stamp'] = datetime.now()
        # dump chunk data into sql
        writeDB(self.targetTableName, data_result, ConfigQuant,
                self.if_exist)
        self.if_exist = 'append'
def run(self, startDate='2007-01-01'):
    """Compute the per-category rising-stock ratio, date chunk by date chunk.

    Loads stock return rows for self.chunkSize dates at a time, delegates
    the grouping/ratio computation to self.getMarketRiseRatio, renames the
    result columns to RATIO_<field>, and appends each chunk to the target
    table.

    :param startDate: first date (exclusive) for a full run, used when
                      self.last_update_date is empty
    """
    self.prepareData()
    if self.stockState == '':  # already the latest data
        return
    # get total date list (horizontally)
    tmp_state = 'select distinct %s from %s' % (
        self.dateField, self.sourceStockTechIndiTableName)
    date_list = readDB(tmp_state, ConfigQuant).values
    date_list = date_list.T[0]
    if self.last_update_date == '':
        date_list = date_list[date_list > startDate]
    else:
        date_list = date_list[date_list > self.last_update_date]
    # sort (SQL `distinct` gives no order guarantee)
    date_list = np.sort(date_list)
    # ****************** area is the same for all dates, using future
    # data!!! need to be modified
    if self.categoryField == 'area':
        tmp_state = "SELECT `%s`, `%s` FROM `%s`;" % (
            self.codeField, self.categoryField, self.sourceCategoryTableName)
        stock_area = readDB(tmp_state, ConfigQuant)
    else:
        stock_area = None
    # number of chunks (ceiling division)
    loop_num = int(date_list.size / self.chunkSize)
    if date_list.size > loop_num * self.chunkSize:
        loop_num = loop_num + 1
    # fetch and process data from sql by chunk
    for i in range(loop_num):
        tmp_date_set = date_list[i * self.chunkSize:(i + 1) * self.chunkSize]
        tmp_date_str = list(map(lambda x: "'%s'" % x, tmp_date_set))
        tmp_range = ','.join(tmp_date_str)
        # read stock return data
        tmp_state = self.stockState + "`%s` in (%s)" % (self.dateField,
                                                        tmp_range)
        dataStockO = readDB(tmp_state, ConfigQuant)
        dataStockO = dataStockO.drop_duplicates(
            [self.dateField, self.codeField])
        dataStockO.loc[:, self.retSeriesField] = dataStockO[
            self.retSeriesField].astype('float')
        # dataStockO = dataStockO.fillna(0)
        data_result = self.getMarketRiseRatio(dataStockO, tmp_range,
                                              stock_area, tmp_date_set)
        # rename the raw return columns to RATIO_<field>
        new_fields = list(map(lambda x: 'RATIO_%s' % x, self.retSeriesField))
        rename_dict = {}
        for field in zip(new_fields, self.retSeriesField):
            rename_dict[field[1]] = field[0]
        data_result = data_result.rename(columns=rename_dict)
        # add timestamp
        data_result['time_stamp'] = datetime.now()
        # dump chunk data into sql
        writeDB(self.targetTableName, data_result, ConfigQuant,
                self.if_exist)
        self.if_exist = 'append'
def runFull(self):
    """Full rebuild of OHLCV technical indicators, one chunk of codes at a time.

    For each chunk: loads raw price rows, coerces numeric columns to float,
    runs self.coreComputationFull per stock (in a process pool when
    self.isMultiProcess), and writes the chunk to the target table (first
    chunk honours self.if_exist, later ones append).
    """
    # get total code list
    tmp_state = 'select distinct %s from %s' % (self.codeField,
                                                self.sourceTableName)
    code_list = readDB(tmp_state, ConfigQuant).values
    code_list = code_list.T[0]
    # number of chunks (ceiling division)
    loop_num = int(code_list.size / self.chunkSize)
    if code_list.size > loop_num * self.chunkSize:
        loop_num = loop_num + 1
    if self.isMultiProcess:  # use multi processing
        # register pool
        pool = Pool(processes=self.processNum)
        # fetch and process data from sql by chunk
        for i in range(loop_num):
            tmp_code = code_list[i * self.chunkSize:(i + 1) * self.chunkSize]
            tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
            tmp_range = ','.join(tmp_code_str)
            tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                       tmp_range)
            dataO = readDB(tmp_state, ConfigQuant)
            dataO = dataO.drop_duplicates([self.dateField, self.codeField])
            dataO = dataO.sort_values(self.dateField)  # sort by date
            # dataO = dataO.fillna(method='ffill')
            # **** fill with pre-close, not simply forward fill
            for field in [
                    self.openField, self.highField, self.lowField,
                    self.closeField, self.volumeField
            ]:
                dataO.loc[:, field] = dataO[field].astype(
                    'float')  # change data type
            if self.turnoverField == '':
                # no turnover field in source: create an all-nan column.
                # NOTE(review): this also mutates self.turnoverField, so the
                # branch only fires on the first chunk -- confirm intended
                self.turnoverField = 'turnover'
                dataO[self.turnoverField] = np.nan
            # process chunk data
            pool_results = []
            for code in tmp_code:
                tmp_data = dataO.loc[dataO[self.codeField] ==
                                     code]  # dataO already sorted by date
                if tmp_data.empty:
                    continue
                tmp_procs = pool.apply_async(self.coreComputationFull,
                                             (tmp_data, ))
                pool_results.append(tmp_procs)
                # data_result = self.coreComputationFull(tmp_data)
            # get result from the pool
            data_tot_result = pd.DataFrame([])
            for tmp_procs in pool_results:
                data_result = tmp_procs.get()
                data_tot_result = data_tot_result.append(data_result)
            # # trim data for increment
            # if self.last_update_date != '':
            #     data_tot_result = data_tot_result.loc[data_tot_result[self.dateField] > self.last_update_date]
            # add timestamp
            data_tot_result['time_stamp'] = datetime.now()
            if data_tot_result.empty:
                continue
            # dump chunk data into sql
            writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'
    else:
        # fetch and process data from sql by chunk
        for i in range(loop_num):
            tmp_code = code_list[i * self.chunkSize:(i + 1) * self.chunkSize]
            tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
            tmp_range = ','.join(tmp_code_str)
            tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                       tmp_range)
            dataO = readDB(tmp_state, ConfigQuant)
            dataO = dataO.drop_duplicates([self.dateField, self.codeField])
            dataO = dataO.sort_values(self.dateField)  # sort by date
            # dataO = dataO.fillna(method='ffill')
            # **** fill with pre-close, not simply forward fill
            for field in [
                    self.openField, self.highField, self.lowField,
                    self.closeField, self.volumeField
            ]:
                dataO.loc[:, field] = dataO[field].astype(
                    'float')  # change data type
            if self.turnoverField == '':  # no turnover field, fill with all nan
                self.turnoverField = 'turnover'
                dataO[self.turnoverField] = np.nan
            # process chunk data
            data_tot_result = pd.DataFrame([])
            for code in tmp_code:
                tmp_data = dataO.loc[dataO[self.codeField] ==
                                     code]  # dataO already sorted by date
                if tmp_data.empty:
                    continue
                # (inline prototype of the indicator pipeline removed here;
                # superseded by coreComputationFull)
                data_result = self.coreComputationFull(tmp_data)
                data_tot_result = data_tot_result.append(data_result)
            # # trim data for increment
            # if self.last_update_date != '':
            #     data_tot_result = data_tot_result.loc[data_tot_result[self.dateField] > self.last_update_date]
            # add timestamp
            data_tot_result['time_stamp'] = datetime.now()
            if data_tot_result.empty:
                continue
            # dump chunk data into sql
            writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                    self.if_exist)
            self.if_exist = 'append'
def runIncrm(self):
    """Incremental update of OHLCV technical indicators.

    Reads only the incremental slice defined by self.state, recomputes the
    indicators per stock via self.coreComputation (in a process pool when
    self.isMultiProcess), and appends the combined result to the target
    table.
    """
    # fetch and process all incremental data from sql
    dataO = readDB(self.state, ConfigQuant)
    dataO = dataO.drop_duplicates([self.dateField, self.codeField])
    dataO = dataO.sort_values(self.dateField)  # sort by date
    # dataO = dataO.fillna(method='ffill')
    # **** fill with pre-close, not simply forward fill
    # for field in [self.openField, self.highField, self.lowField, self.closeField, self.volumeField]:
    #     dataO.loc[:, field] = dataO[field].astype('float') # change data type
    if self.turnoverField == '':  # no turnover field, fill with all nan
        self.turnoverField = 'turnover'
        dataO[self.turnoverField] = np.nan
    # process incremental data
    code_list = dataO[self.codeField].unique()
    data_tot_result = pd.DataFrame([])
    # use multiprocessing to improve computation hours
    if self.isMultiProcess:
        pool = Pool(processes=self.processNum)
        pool_results = []
        # build pool
        for code in code_list:
            tmp_data = dataO.loc[dataO[self.codeField] ==
                                 code]  # dataO already sorted by date
            if tmp_data.empty:
                continue
            tmp_result = pool.apply_async(self.coreComputation, (tmp_data, ))
            pool_results.append(tmp_result)
        # get result from the pool
        for tmp_result in pool_results:
            data_result = tmp_result.get()
            data_tot_result = data_tot_result.append(data_result)
    else:
        for code in code_list:
            tmp_data = dataO.loc[dataO[self.codeField] ==
                                 code]  # dataO already sorted by date
            if tmp_data.empty:
                continue
            data_result = self.coreComputation(tmp_data)
            data_tot_result = data_tot_result.append(data_result)
        # (inline prototype of the indicator pipeline and its increment
        # trimming removed here; superseded by coreComputation)
    if not data_tot_result.empty:
        # add timestamp
        data_tot_result['time_stamp'] = datetime.now()
        # dump chunk data into sql
        writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                self.if_exist)
        self.if_exist = 'append'
def runFull(self):
    """Full rebuild of time-series and cross-sectional derivative features.

    Time-series features are computed per stock over chunks of codes;
    time-horizon (cross-sectional) features are computed per trade date
    over chunks of dates.  The two result sets are outer-merged on
    (date, code) and written to the target table in one shot.
    """
    # get total code list
    tmp_state = 'select distinct %s from %s' % (self.codeField,
                                                self.sourceTableName)
    code_list = readDB(tmp_state, ConfigQuant).values
    code_list = code_list.T[0]
    # get total trade dates
    tmp_state = "select `%s` from %s" % (self.dateField,
                                         self.calendarTableName)
    trade_calendar = readDB(tmp_state, ConfigQuant)
    trade_calendar = trade_calendar[self.dateField].values
    trade_calendar = trade_calendar[trade_calendar >= self.init_date]
    # number of chunks for time series calculation (ceiling division)
    code_chunk_num = int(code_list.size / self.chunkSize)
    if code_list.size > code_chunk_num * self.chunkSize:
        code_chunk_num = code_chunk_num + 1
    # number of chunks for time horizon calculation
    date_chunk_num = int(trade_calendar.size / self.chunkSize)
    if trade_calendar.size > date_chunk_num * self.chunkSize:
        date_chunk_num = date_chunk_num + 1
    if self.isMultiProcess:  # use multi processing
        # register pool
        pool = Pool(processes=self.processNum)
        # calculate time series derivative features
        data_tot_result_time_series = pd.DataFrame([])
        for i in range(code_chunk_num):
            tmp_code = code_list[i * self.chunkSize:(i + 1) * self.chunkSize]
            tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
            tmp_range = ','.join(tmp_code_str)
            tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                       tmp_range)
            dataO = readDB(tmp_state, ConfigQuant)
            dataO = dataO.drop_duplicates([self.dateField, self.codeField])
            # dataO = dataO.sort_values(self.dateField) # sort by date
            # process chunk data: one task per stock code
            pool_results = []
            for code in tmp_code:
                tmp_data = dataO.loc[dataO[self.codeField] == code]
                tmp_data = tmp_data.sort_values(self.dateField)
                if tmp_data.empty:
                    continue
                # multiprocessing
                tmp_procs = pool.apply_async(self.coreComputationTimeSeries,
                                             (tmp_data, ))
                pool_results.append(tmp_procs)
                # data_result = self.coreComputationFull(tmp_data)
            # get result from the process pool
            for tmp_procs in pool_results:
                data_result = tmp_procs.get()
                data_tot_result_time_series = data_tot_result_time_series.append(
                    data_result)
        # calculate time horizon derivative features
        data_tot_result_time_horizon = pd.DataFrame([])
        for i in range(date_chunk_num):
            tmp_date = trade_calendar[i * self.chunkSize:(i + 1) *
                                      self.chunkSize]
            tmp_date_str = list(map(lambda x: "'%s'" % x, tmp_date))
            tmp_range = ','.join(tmp_date_str)
            tmp_state = self.state + "`%s` in (%s)" % (self.dateField,
                                                       tmp_range)
            dataO = readDB(tmp_state, ConfigQuant)
            dataO = dataO.drop_duplicates([self.dateField, self.codeField])
            # dataO = dataO.sort_values(self.dateField) # sort by date
            # process chunk data: one task per trade date
            pool_results = []
            for single_date in tmp_date:
                tmp_data = dataO.loc[dataO[self.dateField] == single_date]
                if tmp_data.empty:
                    continue
                # multiprocessing
                tmp_procs = pool.apply_async(self.coreComputationTimeHorizon,
                                             (tmp_data, ))
                pool_results.append(tmp_procs)
            # get result from the process pool
            for tmp_procs in pool_results:
                data_result = tmp_procs.get()
                data_tot_result_time_horizon = data_tot_result_time_horizon.append(
                    data_result)
        # combine time series and time horizon features
        data_tot_result = data_tot_result_time_series.merge(
            data_tot_result_time_horizon,
            on=[self.dateField, self.codeField],
            how='outer')
        # add timestamp
        data_tot_result.loc[:, 'time_stamp'] = datetime.now()
        # dump all data into database
        writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                self.if_exist)
        self.if_exist = 'append'
        pool.close()
    else:
        # calculate time series derivative features
        data_tot_result_time_series = pd.DataFrame([])
        for i in range(code_chunk_num):
            tmp_code = code_list[i * self.chunkSize:(i + 1) * self.chunkSize]
            tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
            tmp_range = ','.join(tmp_code_str)
            tmp_state = self.state + "`%s` in (%s)" % (self.codeField,
                                                       tmp_range)
            dataO = readDB(tmp_state, ConfigQuant)
            dataO = dataO.drop_duplicates([self.dateField, self.codeField])
            # dataO = dataO.sort_values(self.dateField) # sort by date
            # process chunk data
            for code in tmp_code:
                tmp_data = dataO.loc[dataO[self.codeField] == code]
                tmp_data = tmp_data.sort_values(self.dateField)
                if tmp_data.empty:
                    continue
                data_result = self.coreComputationTimeSeries(tmp_data)
                data_tot_result_time_series = data_tot_result_time_series.append(
                    data_result)
        # calculate time horizon derivative features
        data_tot_result_time_horizon = pd.DataFrame([])
        for i in range(date_chunk_num):
            tmp_date = trade_calendar[i * self.chunkSize:(i + 1) *
                                      self.chunkSize]
            tmp_date_str = list(map(lambda x: "'%s'" % x, tmp_date))
            tmp_range = ','.join(tmp_date_str)
            tmp_state = self.state + "`%s` in (%s)" % (self.dateField,
                                                       tmp_range)
            dataO = readDB(tmp_state, ConfigQuant)
            dataO = dataO.drop_duplicates([self.dateField, self.codeField])
            # dataO = dataO.sort_values(self.dateField) # sort by date
            # process chunk data
            for single_date in tmp_date:
                tmp_data = dataO.loc[dataO[self.dateField] == single_date]
                if tmp_data.empty:
                    continue
                data_result = self.coreComputationTimeHorizon(tmp_data)
                data_tot_result_time_horizon = data_tot_result_time_horizon.append(
                    data_result)
        # combine time series and time horizon features
        data_tot_result = data_tot_result_time_series.merge(
            data_tot_result_time_horizon,
            on=[self.dateField, self.codeField],
            how='outer')
        # add timestamp
        data_tot_result.loc[:, 'time_stamp'] = datetime.now()
        # dump chunk data into database
        writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                self.if_exist)
        self.if_exist = 'append'
def runIncrm(self):
    """Incrementally compute derivative features and append them to the target table.

    Fetches all incremental rows selected by ``self.state``, computes
    time-series features per stock code and time-horizon (cross-sectional)
    features per trade date — in a process pool when
    ``self.isMultiProcess`` — merges the two feature sets on
    (date, code), truncates to rows strictly newer than
    ``self.last_update_date`` and writes them via ``writeDB``.
    """
    # fetch and process all incremental data from sql
    dataO = readDB(self.state, ConfigQuant)
    dataO = dataO.drop_duplicates([self.dateField, self.codeField])

    # process incremental data
    code_list = dataO[self.codeField].unique()
    trade_calendar = dataO[self.dateField].unique()
    # only dates strictly after the last update are recomputed
    trade_calendar = trade_calendar[trade_calendar > self.last_update_date]

    # ==== calculate features
    if self.isMultiProcess:
        pool = Pool(processes=self.processNum)
        try:
            # time series features: one async task per stock code
            pool_results = []
            for code in code_list:
                tmp_data = dataO.loc[dataO[self.codeField] == code]
                tmp_data = tmp_data.sort_values(self.dateField)  # sort by date
                if tmp_data.empty:
                    continue
                pool_results.append(
                    pool.apply_async(self.coreComputationTimeSeries,
                                     (tmp_data, )))
            # FIX: gather chunks into a list and concat once —
            # DataFrame.append in a loop is quadratic and removed in pandas>=2.0
            ts_chunks = [tmp_result.get() for tmp_result in pool_results]

            # time horizon features: one async task per trade date
            pool_results = []
            for trade_date in trade_calendar:
                tmp_data = dataO.loc[
                    dataO[self.dateField] == trade_date]  # no need to sort date
                if tmp_data.empty:
                    continue
                pool_results.append(
                    pool.apply_async(self.coreComputationTimeHorizon,
                                     (tmp_data, )))
            th_chunks = [tmp_result.get() for tmp_result in pool_results]
        finally:
            # FIX: close()+join() lets workers finish cleanly; the original
            # terminate() kills them abruptly and ran only on success
            pool.close()
            pool.join()
    # single process
    else:
        # time series features
        ts_chunks = []
        for code in code_list:
            tmp_data = dataO.loc[dataO[self.codeField] == code]
            tmp_data = tmp_data.sort_values(self.dateField)  # sort by date
            if tmp_data.empty:
                continue
            ts_chunks.append(self.coreComputationTimeSeries(tmp_data))

        # time horizon features
        th_chunks = []
        for trade_date in trade_calendar:
            tmp_data = dataO.loc[dataO[self.dateField] == trade_date]
            if tmp_data.empty:
                continue
            th_chunks.append(self.coreComputationTimeHorizon(tmp_data))

    # single concat per feature family (empty fallback mirrors the
    # original's pd.DataFrame([]) accumulator when nothing was computed)
    data_tot_result_time_series = (
        pd.concat(ts_chunks) if ts_chunks else pd.DataFrame([]))
    data_tot_result_time_horizon = (
        pd.concat(th_chunks) if th_chunks else pd.DataFrame([]))

    # write to sql (both multiprocess and single process)
    # combine time series and time horizon features
    data_tot_result = data_tot_result_time_series.merge(
        data_tot_result_time_horizon,
        on=[self.dateField, self.codeField],
        how='outer')
    # truncate data to strictly-new rows
    data_tot_result = data_tot_result.loc[
        data_tot_result[self.dateField] > self.last_update_date]

    if not data_tot_result.empty:
        # add timestamp
        data_tot_result['time_stamp'] = datetime.now()
        # dump chunk data into sql
        writeDB(self.targetTableName, data_tot_result, ConfigQuant,
                self.if_exist)
def run(self, startDate='2007-01-01'):
    """Compute category-relative ranking features chunk by chunk and write them.

    Walks the distinct-date list of the basic source table (restricted to
    dates after ``startDate`` on a first run, or after
    ``self.last_update_date`` on increments), and for each date chunk
    ranks both fundamental and technical series within a stock category,
    then dumps the merged result into ``self.targetTableName``.

    Parameters
    ----------
    startDate : str
        Lower bound (exclusive, 'YYYY-MM-DD') used only when no previous
        update exists.
    """
    self.prepareData()

    if self.basicState == '':  # already the latest data
        return

    # get total date list (horizontally)
    tmp_state = 'select distinct %s from %s' % (self.dateField,
                                                self.sourceBasicTableName)
    date_list = readDB(tmp_state, ConfigQuant).values
    date_list = date_list.T[0]
    if self.last_update_date == '':
        date_list = date_list[date_list > startDate]
    else:
        date_list = date_list[date_list > self.last_update_date]

    # get area (if category is area) ****************** area is the same for
    # all dates, that means using future data!!! need to be modified
    if self.categoryField == 'area':
        tmp_state = "SELECT `%s`, `%s` FROM `%s`;" % (
            self.codeField, self.categoryField,
            self.sourceCategoryTableName)
        stock_area = readDB(tmp_state, ConfigQuant)
    else:
        stock_area = None

    # calculate num of loop (ceiling of date_list.size / chunkSize)
    loop_num = int(date_list.size / self.chunkSize)
    if date_list.size > loop_num * self.chunkSize:
        loop_num = loop_num + 1

    # fetch and process data from sql by chunk
    for i in range(loop_num):
        tmp_date = date_list[i * self.chunkSize:(i + 1) * self.chunkSize]
        tmp_date_str = list(map(lambda x: "'%s'" % x, tmp_date))
        tmp_range = ','.join(tmp_date_str)

        # read basic data
        tmp_state = self.basicState + "`%s` in (%s)" % (self.dateField,
                                                        tmp_range)
        dataBasicO = readDB(tmp_state, ConfigQuant)
        # invert fields whose ranking direction is reversed
        for field in self.reverseSeriesField:
            dataBasicO[field] = 1. / dataBasicO[field]

        # read technical data
        tmp_state = self.techState + "`%s` in (%s)" % (self.dateField,
                                                       tmp_range)
        dataTechO = readDB(tmp_state, ConfigQuant)

        # FIX: the original used exec("{0} = {0}.drop_duplicates(...)") in a
        # loop; in Python 3 exec cannot rebind function locals, so the
        # de-duplicated/sorted frames were silently discarded. Assign directly.
        dataBasicO = dataBasicO.drop_duplicates(
            [self.dateField, self.codeField])
        dataBasicO = dataBasicO.sort_values(self.dateField)  # sort by date
        dataTechO = dataTechO.drop_duplicates(
            [self.dateField, self.codeField])
        dataTechO = dataTechO.sort_values(self.dateField)  # sort by date

        # process chunk data
        data_result = pd.DataFrame([])

        # Fundamental data
        uni_dates = dataBasicO[self.dateField].unique()
        dataBasicO = self.getStockCategory(
            dataBasicO, uni_dates, stock_area)  # merge with stock category
        tmp_df = priceOtherIndicatorRanking(dataBasicO, uni_dates,
                                            self.dateField, self.codeField,
                                            self.basicSeriesField,
                                            self.categoryField)  # ranking
        data_result = pd.concat([data_result, tmp_df])

        # Technical data
        uni_dates = dataTechO[self.dateField].unique()
        dataTechO = self.getStockCategory(
            dataTechO, uni_dates, stock_area)  # merge with stock category
        tmp_df = priceOtherIndicatorRanking(dataTechO, uni_dates,
                                            self.dateField, self.codeField,
                                            self.techSeriesField,
                                            self.categoryField)  # ranking
        data_result = data_result.merge(
            tmp_df, on=[self.dateField, self.codeField], how='outer')

        # add timestamp
        data_result['time_stamp'] = datetime.now()

        # dump chunk data into sql; subsequent chunks always append
        writeDB(self.targetTableName, data_result, ConfigQuant,
                self.if_exist)
        self.if_exist = 'append'