import os
import time

import numpy as np
import pandas as pd
import lightgbm as lgb
import tensorflow as tf

# Project-local helpers (cleanse_sample, cleanse_feature, fill_nan,
# replace_invalid, standardize, slice_*_data, process_*, normalize,
# WinP2PlaP, backtest, rmse, RacingPredictor) are assumed to be importable
# from the surrounding package.


def predict(file):
    data = pd.read_csv(file)
    data = cleanse_sample(data, keys=['rdate', 'rid', 'hid'], indices=[])

    # pre-process data
    try:
        modify = pd.read_csv(file.replace('.csv', '_modified.csv'))
    except FileNotFoundError:
        modify = RacingPredictor.pre_process(file, persistent=True)

    # perform standardization
    modify = standardize(modify)

    # slice data
    x_test, y_test = slice_classification_data(modify)

    # prediction: the booster emits one row per race; judging by the
    # indexing below, column j holds the win score of the j-th runner
    clf = lgb.Booster(model_file='lgb_classifier.txt')
    winprob = clf.predict(x_test)

    # normalize the win scores within each race
    data['winprob'] = 0
    i = 0
    groups = data.groupby(['rdate', 'rid'])
    for name, group in groups:
        total = np.sum(winprob[i, 0:len(group)])
        j = 0
        for index, row in group.iterrows():
            # iterrows() yields index labels, so assign with loc rather
            # than writing a mutated row copy back through iloc
            data.loc[index, 'winprob'] = winprob[i, j] / total
            j += 1
        i += 1

    # derive place probability from win probability
    data['plaprob'] = WinP2PlaP(data, wpcol='winprob')

    # fixed-fraction staking: bet only when the expected value clears mthresh
    fixratio = 1 / 10000
    mthresh = 9
    print("Getting win stake...")
    data['winstake'] = fixratio * (data['winprob'] * data['win_t5'] > mthresh)
    print("Getting place stake...")
    data['plastake'] = fixratio * (data['plaprob'] * data['place_t5'] > mthresh)

    data.to_csv('test_result.csv')
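# WinP2PlaP is a project helper not shown in this section; from its call
# signature it maps per-race win probabilities to place (top-3)
# probabilities. The sketch below is a hypothetical stand-in, assuming the
# standard Harville model; the name, model choice, and O(n^3) loop are
# assumptions, and it expects win probabilities strictly between 0 and 1.
def win_to_place_harville(data, wpcol='winprob'):
    """Hypothetical stand-in for WinP2PlaP: Harville top-3 probability."""
    plaprob = pd.Series(0.0, index=data.index)
    for _, group in data.groupby(['rdate', 'rid']):
        p = group[wpcol].to_numpy()
        n = len(p)
        pla = p.copy()                                       # P(finish 1st)
        for i in range(n):
            for j in range(n):
                if j == i:
                    continue
                pla[i] += p[j] * p[i] / (1 - p[j])           # P(finish 2nd)
                for k in range(n):
                    if k in (i, j):
                        continue
                    pla[i] += (p[j] * p[k] / (1 - p[j])
                               * p[i] / (1 - p[j] - p[k]))   # P(finish 3rd)
        plaprob.loc[group.index] = pla
    return plaprob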
def predict(file):
    data = pd.read_csv(file)
    data = cleanse_sample(data, keys=['rdate', 'rid', 'hid'], indices=[])

    # pre-process data
    try:
        modify = pd.read_csv(file.replace('.csv', '_modified.csv'))
    except FileNotFoundError:
        modify = RacingPredictor.pre_process(file, persistent=True)

    # perform standardization
    modify = standardize(modify)

    # slice data
    x_test, y_test = slice_naive_data(modify)

    # prediction
    clf = lgb.Booster(model_file='lgb_classifier.txt')
    winprob = clf.predict(x_test)
    data['winprob'] = winprob[:, 1]
    data['plaprob'] = winprob[:, 1] + winprob[:, 2] + winprob[:, 3]

    fixratio = 5e-3
    mthresh = 1.6
    print("Getting win stake...")
    data['winstake'] = fixratio * (data['winprob'] * data['win_t5'] > mthresh)
    print("Getting place stake...")
    data['plastake'] = fixratio * (data['plaprob'] * data['place_t5'] > mthresh)

    data.to_csv('test_result.csv')
    return data
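# A minimal usage sketch for the LightGBM variant above, kept as a comment
# so the module stays importable. The path is the validation file referenced
# later in this module; treat it as illustrative rather than required:
#
#     result = predict('new_data/test_new.csv')
#     print(result[['rdate', 'rid', 'hid', 'winprob', 'plaprob']].head())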
def pre_process(file, persistent=False):
    """
    Pre-process the data for further operation(s).

    :param file: Path to a csv file.
    :param persistent: A boolean variable indicating whether to make the
        pre-processed data persistent locally.
    :return: The pre-processed DataFrame.
    """
    # create a duplicate of data
    print('start pre-processing...')
    duplicate = pd.read_csv(file)

    # define keys for detecting duplicates
    keys = ['rdate', 'rid', 'hid']
    # define indices of rows to be removed
    indices = []

    # cleanse invalid sample(s)
    print('cleansing invalid samples...')
    duplicate = cleanse_sample(duplicate, keys=keys, indices=indices)

    # define rules for dropping features
    rules = [
        # useless features
        'horsenum', 'rfinishm', 'runpos', 'windist', 'win', 'place',
        r'(rm|p|m|d)\d+',
        # features containing too many NaNs
        'ratechg', 'horseweightchg', 'besttime', 'age', 'priority',
        'lastsix', 'datediff',
        # features which are difficult to process
        'gear', 'pricemoney'
    ]
    # eliminate useless features
    print('eliminating useless features...')
    duplicate = cleanse_feature(duplicate, rules=rules)

    # specify columns to be filled
    columns = ['bardraw', 'finishm', 'exweight', 'horseweight',
               'win_t5', 'place_t5']
    # specify corresponding methods (constant fills)
    methods = [('constant', 4), ('constant', 1e5),
               ('constant', 122.61638888121101),
               ('constant', 1106.368874062333),
               ('constant', 26.101661368452852),
               ('constant', 6.14878956518161)]
    # fill NaN value(s)
    print('filling nans...')
    duplicate = fill_nan(duplicate, columns=columns, methods=methods)

    # specify columns to be replaced
    columns = ['bardraw', 'finishm', 'exweight', 'horseweight']
    # specify schema(s) of replacement: (invalid value, replacement)
    values = [(0, 14), (0, 1e5), (0, 122.61638888121101),
              (0, 1106.368874062333)]
    # replace invalid value(s)
    print('replacing invalid values...')
    duplicate = replace_invalid(duplicate, columns=columns, values=values)

    # convert 'finishm' into 'velocity'
    print('generating velocity...')
    duplicate['velocity'] = 1e4 * duplicate['distance'] / duplicate['finishm']

    # apply target encoding on 'class'
    print('processing class...')
    duplicate = process_class(duplicate)

    # apply target encoding on 'jname' and 'tname'
    print('processing jname and tname...')
    duplicate = process_name(duplicate)

    # apply target encoding on 'venue' and 'course'
    print('processing venue and course...')
    duplicate = process_course(duplicate)

    # apply target encoding on 'track' and 'going'
    print('processing track and going...')
    duplicate = process_going(duplicate)

    # conduct local persistence
    if persistent:
        # set index before saving
        duplicate.set_index('index', inplace=True)
        print('saving result...')
        duplicate.to_csv(file.replace('.csv', '_modified.csv'))

    return duplicate
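# The process_class/process_name/process_course/process_going helpers are
# not shown in this section; the comments above say they apply target
# encoding. A minimal sketch of what such an encoder typically looks like,
# assuming the target is the 'velocity' column generated above; the helper
# name and the smoothing constant are assumptions, not the project's code.
def target_encode(df, column, target='velocity', smoothing=10.0):
    """Hypothetical target encoder: replace each category with a smoothed
    mean of the target, shrunk towards the global mean so that rare
    categories are not over-fitted."""
    global_mean = df[target].mean()
    stats = df.groupby(column)[target].agg(['mean', 'count'])
    weight = stats['count'] / (stats['count'] + smoothing)
    encoding = weight * stats['mean'] + (1 - weight) * global_mean
    # unseen categories fall back to the global mean
    df[column] = df[column].map(encoding).fillna(global_mean)
    return df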
def predict(self):
    # pre-process data
    try:
        modify = pd.read_csv(self.file.replace('.csv', '_modified.csv'))
    except FileNotFoundError:
        modify = self.pre_process()

    # perform standardization
    modify = standardize(modify)

    # slice data
    x_test, y_test = slice_regression_data(modify)

    # get graph
    graph = tf.get_default_graph()

    # session
    with tf.Session(graph=graph) as sess:
        # restore the latest model ('network_graph' sorts after the
        # timestamped checkpoint directories, so the newest model sits
        # at index -2)
        file_list = os.listdir('save/')
        file_list.sort()
        loader = tf.train.import_meta_graph('save/%s/model.meta'
                                            % file_list[-2])

        # get input tensors
        training_tensor = graph.get_tensor_by_name('init/training_1:0')
        input_tensor = graph.get_tensor_by_name('init/input_1:0')
        velocity_tensor = graph.get_tensor_by_name('init/velocity_1:0')

        # get output tensors
        output_tensor = graph.get_tensor_by_name(
            'race_predictor/velocity_output/MatMul:0')
        alpha_tensor = graph.get_tensor_by_name(
            'race_predictor/alpha_output/MatMul:0')

        # get loss tensor
        loss_tensor = graph.get_tensor_by_name('optimizer/velocity_loss:0')

        sess.run(tf.group(tf.global_variables_initializer(),
                          tf.local_variables_initializer()))
        loader.restore(sess,
                       tf.train.latest_checkpoint('save/%s' % file_list[-2]))

        velocity, alpha, loss = sess.run(
            [output_tensor, alpha_tensor, loss_tensor],
            feed_dict={
                training_tensor: False,
                input_tensor: x_test,
                velocity_tensor: y_test
            })

    self.data = cleanse_sample(self.data, ['rdate', 'rid', 'hid'], [])
    self.data = self.data.reset_index(drop=True)
    self.data['p_velocity'] = 0
    self.data['p_rank'] = 0

    # write race-wise predictions and ranks back into the raw data
    output = np.reshape(velocity - alpha, newshape=(-1, ))
    i = 0
    groups = self.data.groupby(['rdate', 'rid'])
    for name, group in groups:
        # only evaluate races from 2019 onwards
        if name[0] < '2019':
            i += len(group)
            continue
        match = output[i:i + len(group)]
        # rank the runners by predicted velocity, fastest first
        rank = np.argsort(match)[::-1]
        rank = np.array(
            [np.where(rank == k)[0][0] + 1 for k in range(len(match))])
        j = 0
        for index, row in group.iterrows():
            self.data.loc[index, 'p_velocity'] = match[j]
            self.data.loc[index, 'p_rank'] = rank[j]
            j += 1
        i += len(group)
        print(group[['rdate', 'rid', 'hid', 'finishm', 'rank', 'velocity']])
        print(match)
        print(rank)

    self.data.to_csv('test_result.csv')
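# The rank computation in predict() above uses an argsort-of-argsort idiom.
# This self-contained demo shows that it assigns rank 1 to the fastest
# predicted velocity, and that a single vectorized scatter produces the
# same result; the function name and toy values are illustrative only.
def _rank_demo():
    match = np.array([15.2, 16.8, 15.9])   # toy predicted velocities
    order = np.argsort(match)[::-1]        # runner indices, fastest first
    rank = np.array([np.where(order == k)[0][0] + 1
                     for k in range(len(match))])
    # vectorized equivalent: scatter 1..n into the positions given by order
    rank_fast = np.empty(len(match), dtype=int)
    rank_fast[order] = np.arange(1, len(match) + 1)
    assert (rank == rank_fast).all()       # both yield [3, 1, 2]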
def predict(self):
    # pre-process data
    try:
        modify = pd.read_csv(self.file.replace('.csv', '_modified.csv'))
    except FileNotFoundError:
        modify = self.pre_process(persistent=True)

    # perform standardization
    modify = standardize(modify)

    # slice data
    x_test, y_test = slice_classification_data(modify)

    # get graph
    graph = tf.get_default_graph()

    # session
    with tf.Session(graph=graph) as sess:
        # restore the latest model
        file_list = os.listdir('save/')
        file_list.sort()
        loader = tf.train.import_meta_graph('save/%s/model.meta'
                                            % file_list[-2])

        # get input tensors
        training_tensor = graph.get_tensor_by_name('init/training_1:0')
        input_tensor = graph.get_tensor_by_name('init/input_1:0')
        win_tensor = graph.get_tensor_by_name('init/win_1:0')

        # get output tensor
        output_tensor = graph.get_tensor_by_name(
            'race_predictor/win_output:0')

        # get loss tensor
        loss_tensor = graph.get_tensor_by_name('optimizer/total_loss:0')

        sess.run(tf.group(tf.global_variables_initializer(),
                          tf.local_variables_initializer()))
        loader.restore(sess,
                       tf.train.latest_checkpoint('save/%s' % file_list[-2]))

        win, loss = sess.run([output_tensor, loss_tensor],
                             feed_dict={
                                 training_tensor: False,
                                 input_tensor: x_test,
                                 win_tensor: y_test
                             })

    self.data = cleanse_sample(self.data, ['rdate', 'rid', 'hid'], [])
    self.data = self.data.reset_index(drop=True)

    # normalize the win scores within each race
    self.data['winprob'] = 0
    i = 0
    groups = self.data.groupby(['rdate', 'rid'])
    for name, group in groups:
        total = np.sum(win[i, 0:len(group)])
        j = 0
        for index, row in group.iterrows():
            self.data.loc[index, 'winprob'] = win[i, j] / total
            j += 1
        i += 1

    # derive place probability from win probability
    self.data['plaprob'] = WinP2PlaP(self.data, wpcol='winprob')

    fixratio = 1 / 10000
    mthresh = 9
    print("Getting win stake...")
    self.data['winstake'] = fixratio * (
        self.data['winprob'] * self.data['win_t5'] > mthresh)
    print("Getting place stake...")
    self.data['plastake'] = fixratio * (
        self.data['plaprob'] * self.data['place_t5'] > mthresh)

    self.data.to_csv('test_result.csv')
def predict(file):
    data = pd.read_csv(file)
    data = cleanse_sample(data, keys=['rdate', 'rid', 'hid'], indices=[])

    # pre-process data
    try:
        modify = pd.read_csv(file.replace('.csv', '_modified.csv'))
    except FileNotFoundError:
        modify = RacingPredictor.pre_process(file, persistent=True)

    # perform standardization
    modify = standardize(modify)

    # slice data
    x_test, y_test = slice_naive_data(modify)

    # get graph
    graph = tf.get_default_graph()

    # session
    with tf.Session(graph=graph) as sess:
        # restore the latest model
        file_list = os.listdir('save/')
        file_list.sort()
        loader = tf.train.import_meta_graph('save/%s/model.meta'
                                            % file_list[-2])

        # get input tensors
        training_tensor = graph.get_tensor_by_name('init/training:0')
        input_tensor = graph.get_tensor_by_name('init/input:0')
        win_tensor = graph.get_tensor_by_name('init/win:0')

        # get output tensor
        output_tensor = graph.get_tensor_by_name(
            'race_predictor/win_output:0')

        # get loss tensor
        loss_tensor = graph.get_tensor_by_name('optimizer/total_loss:0')

        sess.run(tf.group(tf.global_variables_initializer(),
                          tf.local_variables_initializer()))
        loader.restore(sess,
                       tf.train.latest_checkpoint('save/%s' % file_list[-2]))

        prob, loss = sess.run([output_tensor, loss_tensor],
                              feed_dict={
                                  training_tensor: False,
                                  input_tensor: x_test,
                                  win_tensor: y_test
                              })

    # columns 1-3 of the network output are the P(1st), P(2nd) and P(3rd)
    # scores; normalize each within its race, then sum them for place
    data['winprob'] = prob[:, 1]
    data['2ndprob'] = prob[:, 2]
    data['3rdprob'] = prob[:, 3]
    data['winprob'] = data.apply(normalize, axis=1, df=data, key='winprob')
    data['2ndprob'] = data.apply(normalize, axis=1, df=data, key='2ndprob')
    data['3rdprob'] = data.apply(normalize, axis=1, df=data, key='3rdprob')
    data['plaprob'] = data['winprob'] + data['2ndprob'] + data['3rdprob']

    fixratio = 1e-4
    mthresh = 2.5
    print("Getting win stake...")
    data['winstake'] = fixratio * (data['winprob'] * data['win_t5'] > mthresh)
    print("Getting place stake...")
    data['plastake'] = fixratio * (data['plaprob'] * data['place_t5'] > mthresh)

    result = backtest(data, 'winprob', 'plaprob', 'winstake', 'plastake')
    return result
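# normalize is applied row-wise above with the full frame and the column
# name passed as keyword arguments. The sketch below is a hypothetical
# stand-in consistent with that signature, assuming it rescales each score
# so the values within one race sum to 1; only the signature comes from the
# calls above, and the name is suffixed so it does not shadow the real helper.
def _normalize_sketch(row, df, key):
    """Hypothetical stand-in for normalize: per-race probability rescaling."""
    same_race = (df['rdate'] == row['rdate']) & (df['rid'] == row['rid'])
    total = df.loc[same_race, key].sum()
    return row[key] / total if total > 0 else 0.0
# Note: a row-wise apply recomputes the race total for every runner; a
# groupby transform (x / x.groupby([...]).transform('sum')) would do the
# same job in one vectorized pass.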
def train(self):
    # pre-process data
    try:
        modify = pd.read_csv(self.file.replace('.csv', '_modified.csv'))
    except FileNotFoundError:
        modify = RacingPredictor.pre_process(self.file, persistent=True)

    # drop outdated data
    # modify = modify[:][[val > '2015' for val in modify['rdate']]]

    # perform standardization
    modify = standardize(modify)

    # slice data
    x_train, y_train = slice_naive_data(modify)

    # validation set, loaded lazily inside the training loop
    validation = None
    x_test, y_test = None, None

    # generate model
    win = self.model()
    win_summary = tf.summary.histogram('win_summary', win)

    with tf.variable_scope(name_or_scope='optimizer'):
        # loss function
        # total_loss = tf.reduce_mean(tf.reduce_sum(
        #     cross_entropy(self._win, win), axis=-1), name='total_loss')
        total_loss = tf.reduce_mean(rmse(self._win, win), name='total_loss')
        loss_summary = tf.summary.scalar('loss_summary', total_loss)

        # optimizer (run update ops first so batch-norm statistics stay
        # current)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_ops = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate).minimize(total_loss)

    # configuration
    if not os.path.isdir('save'):
        os.mkdir('save')
    config = tf.ConfigProto()

    print('Start training')
    with tf.Session(config=config) as sess:
        # initialization
        sess.run(tf.group(tf.global_variables_initializer(),
                          tf.local_variables_initializer()))

        # saver
        optimal = np.inf
        saver = tf.train.Saver(max_to_keep=5)

        # store the network graph for tensorboard visualization
        writer = tf.summary.FileWriter('save/network_graph', sess.graph)
        merge_op = tf.summary.merge([win_summary, loss_summary])

        # data set
        queue = tf.train.slice_input_producer([x_train, y_train],
                                              num_epochs=self.num_epochs,
                                              shuffle=True)
        x_batch, y_batch = tf.train.batch(queue,
                                          batch_size=self.batch_size,
                                          num_threads=1,
                                          allow_smaller_final_batch=False)

        # enable coordinator
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess, coord)

        try:
            for i in range(self.iterations):
                x, y = sess.run([x_batch, y_batch])
                _, loss, sm = sess.run([train_ops, total_loss, merge_op],
                                       feed_dict={
                                           self.training: True,
                                           self._input: x,
                                           self._win: y
                                       })
                if i % 100 == 0:
                    print('iteration %d: loss = %f' % (i, loss))
                    writer.add_summary(sm, i)
                    writer.flush()
                if i % 500 == 0:
                    if validation is None:
                        # read validation set
                        validation = pd.read_csv('new_data/test_new.csv')
                        validation = cleanse_sample(
                            validation,
                            keys=['rdate', 'rid', 'hid'],
                            indices=[])
                        # slice testing data
                        x_test, y_test = slice_naive_data(
                            standardize(
                                pd.read_csv(
                                    'new_data/test_new_modified.csv')))
                    prob, loss = sess.run([win, total_loss],
                                          feed_dict={
                                              self.training: False,
                                              self._input: x_test,
                                              self._win: y_test
                                          })
                    validation['winprob'] = prob[:, 1]
                    validation['2ndprob'] = prob[:, 2]
                    validation['3rdprob'] = prob[:, 3]
                    validation['winprob'] = validation.apply(
                        normalize, axis=1, df=validation, key='winprob')
                    validation['2ndprob'] = validation.apply(
                        normalize, axis=1, df=validation, key='2ndprob')
                    validation['3rdprob'] = validation.apply(
                        normalize, axis=1, df=validation, key='3rdprob')
                    validation['plaprob'] = (validation['winprob'] +
                                             validation['2ndprob'] +
                                             validation['3rdprob'])

                    fixratio = 5e-4
                    mthresh = 2.5
                    print("Getting win stake...")
                    validation['winstake'] = fixratio * (
                        validation['winprob'] * validation['win_t5']
                        > mthresh)
                    print("Getting place stake...")
                    validation['plastake'] = fixratio * (
                        validation['plaprob'] * validation['place_t5']
                        > mthresh)

                    result = backtest(validation, 'winprob', 'plaprob',
                                      'winstake', 'plastake')

                    # checkpoint on the weighted RMSE of win and place bets
                    # (the 'AverageRMSEpalce' spelling follows backtest's
                    # output keys)
                    score = (0.35 * result['AverageRMSEwin'] +
                             0.65 * result['AverageRMSEpalce'])
                    if score < optimal:
                        optimal = score
                        print('save at iteration %d with average loss of %f'
                              % (i, optimal))
                        saver.save(
                            sess,
                            'save/%s/model' % time.strftime(
                                '%Y-%m-%d_%H-%M-%S',
                                time.localtime(time.time())))
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
            saver.save(
                sess,
                'save/%s/model' % time.strftime(
                    '%Y-%m-%d_%H-%M-%S', time.localtime(time.time())))
            writer.close()
        finally:
            coord.request_stop()
            coord.join(threads)
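# rmse (and the commented-out cross_entropy alternative) are project helpers
# not shown in this section. A minimal sketch consistent with how rmse is
# reduced to a scalar in train() above, assuming a per-sample RMSE over the
# output vector; the axis choice is an assumption, and the name is suffixed
# so it does not shadow the real helper.
def _rmse_sketch(labels, predictions):
    """Hypothetical stand-in for rmse: per-sample root-mean-square error."""
    return tf.sqrt(tf.reduce_mean(tf.square(labels - predictions), axis=-1))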