import logging
import zipfile
from os.path import split, splitext

import dateutil.parser
import pandas as pd
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

# configure_root_logger, extract_horse_name, races_from_bars, training_from_races,
# vwao_from_bars, pandas_to_dicts and convert_types are project helpers assumed to
# be imported from elsewhere in the package.


def upload(args):
    args, path = args
    parse = lambda x: dateutil.parser.parse(x, dayfirst=True)

    try:
        directory, file_name = split(path)
        file_part, ext = splitext(file_name)

        formatter = logging.Formatter('%(asctime)s - ' + file_name + ' - %(levelname)s: %(message)s')
        configure_root_logger(args.logtty, args.logfile, formatter=formatter)
        db = MongoClient(args.host, args.port)[args.db]

        if ext == '.zip':
            logging.info('Reading zipped csv file into memory')
            fin = zipfile.ZipFile(path, 'r').open(file_part + '.csv')
        else:
            logging.info('Reading csv file into memory')
            fin = path
        bars = pd.read_csv(fin, parse_dates=['SCHEDULED_OFF'], date_parser=parse)
        bars.columns = bars.columns.map(lambda x: x.lower())
        bars = bars.rename(columns={'event_id': 'market_id'})
        for col in ['market_id', 'selection_id']:
            bars[col] = bars[col].map(str)  # make sure dtype == str

        # Insert other filters here:
        bars = bars[bars.in_play == 'PE']  # keep pre-event prices only
        bars['selection'] = bars['selection'].map(extract_horse_name)

        races = races_from_bars(bars).reset_index()
        train = training_from_races(races)
        vwao = vwao_from_bars(bars)

        try:
            db[args.races].insert(pandas_to_dicts(races), continue_on_error=True)
        except DuplicateKeyError as e:
            logging.error('Some duplicate keys in %s; If this is a surprise, ABORT! msg=%s' % (db[args.races], e))

        try:
            db[args.train].insert(convert_types(train, {'n_runners': int}), continue_on_error=True)
        except DuplicateKeyError as e:
            logging.error('Some duplicate keys in %s; If this is a surprise, ABORT! msg=%s' % (db[args.train], e))

        try:
            db[args.vwao].insert(vwao, continue_on_error=True)
        except DuplicateKeyError as e:
            logging.error('Some duplicate keys in %s; If this is a surprise, ABORT! msg=%s' % (db[args.vwao], e))

        logging.info('Successfully uploaded to %s' % db)
    except Exception as e:
        logging.critical(e)
        raise
import datetime
import logging
import time
from collections import defaultdict
from functools import reduce
from itertools import product
from multiprocessing import Pool, cpu_count

from pymongo import MongoClient

# configure_root_logger, parse_date, strategy, HistoricalExecutionService,
# price_historical_bets, make_scorecard, market_breakdown, pandas_to_dicts,
# add_scorecard_id_to_dicts and the *_COLL collection names are project
# imports assumed to be available in this module.


def run_backtest(context):
    n_bkt, args, mparams = context

    formatter = logging.Formatter('%(asctime)s - n_bkt=' + str(n_bkt) + ' - %(levelname)s: %(message)s')
    configure_root_logger(args.logtty, args.logfile,
                          MongoClient(args.host, args.port)[args.db][args.logmongo]
                          if args.logmongo is not None else None,
                          formatter=formatter)
    db = MongoClient(args.host, args.port)[args.db]

    # Build the Mongo query from the optional date range and country filter.
    where_clause = defaultdict(lambda: {})
    country, start_date, end_date = 'GB', parse_date(args.start), parse_date(args.end)
    if start_date is not None:
        where_clause['scheduled_off']['$gte'] = start_date
    if end_date is not None:
        where_clause['scheduled_off']['$lte'] = end_date
    if country is not None:
        where_clause['country'] = country
    sorted_races = db[args.train].find(where_clause, sort=[('scheduled_off', 1)], timeout=False)

    exec_services = HistoricalExecutionService(db)
    strat = strategy.Balius(mu=mparams['mu'], sigma=mparams['sigma'], beta=mparams['beta'],
                            tau=mparams['tau'], draw_probability=mparams['draw_prob'],
                            risk_aversion=mparams['risk_aversion'], min_races=mparams['min_races'],
                            max_exposure=mparams['max_exposure'])
    st = time.clock()
    strategy.backtest(exec_services, strat, sorted_races)
    en = time.clock()
    logging.info('Backtest finished in %.2f seconds' % (en - st))

    strat_dict = strat.to_dict()
    strat_id = db[STRATEGIES_COLL].insert(strat_dict)
    logging.info('Strategy serialised to %s with id=%s' % (db[STRATEGIES_COLL], strat_id))

    bets = price_historical_bets(db, exec_services.get_mu_bets()[0])

    scorecard = make_scorecard(bets)
    now = datetime.datetime.utcnow()
    scorecard.update({'params': {'ts': strat_dict['hm']['ts'],
                                 'risk': strat_dict['risk']},
                      'timestamp': now,
                      'run_seconds': en - st,
                      'strategy_id': strat_id})
    scorecard_id = db[SCORECARDS_COLL].insert(scorecard)
    logging.info('Scorecard inserted in %s with id=%s' % (db[SCORECARDS_COLL], scorecard_id))

    db[BETS_COLL].insert(add_scorecard_id_to_dicts(scorecard_id, bets))
    logging.info('Associated bets inserted in %s' % db[BETS_COLL])

    markets = market_breakdown(bets).reset_index()
    markets = pandas_to_dicts(markets, {'n_runners': int})
    db[MARKETS_COLL].insert(add_scorecard_id_to_dicts(scorecard_id, markets))
    logging.info('Associated market breakdown inserted in %s' % db[MARKETS_COLL])
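# For illustration (dates hypothetical): with --start 2012-01-01 and
# --end 2012-03-31, the query built above comes out as
#
#   {'scheduled_off': {'$gte': datetime(2012, 1, 1), '$lte': datetime(2012, 3, 31)},
#    'country': 'GB'}
#
# The defaultdict is what lets both $gte and $lte be set on 'scheduled_off'
# without first creating the nested dict.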
import logging
from os.path import split

import numpy as np
import pandas as pd
from pymongo import MongoClient

# configure_root_logger, parse_horse_name, parse_place and pandas_to_dicts are
# project helpers assumed to be imported from elsewhere in the package.


def upload(args):
    first = lambda x: x.iget(0)  # unused; kept from the original
    args, path = args

    try:
        _, file_name = split(path)

        formatter = logging.Formatter('%(asctime)s - ' + file_name + ' - %(levelname)s: %(message)s')
        configure_root_logger(formatter=formatter)
        db = MongoClient(args.host, args.port)[args.db]

        logging.info('Reading csv file into memory')
        # Columns 0 and 1 (race date and race time) are merged into one datetime column.
        races = pd.read_csv(path, sep='\t', parse_dates=[[0, 1]], dayfirst=True)
        if len(races) <= 2:
            logging.warning('No races in file. Skipping')
            return

        races.rename(columns={'race_date_race_time': 'scheduled_off',
                              'horse_name': 'selection',
                              'place': 'ranking'}, inplace=True)
        races['selection'] = races['selection'].map(parse_horse_name)
        races['ranking'] = races['ranking'].map(parse_place)
        # One document per race: group the per-runner rows by (track, scheduled_off),
        # keep only runners with a valid (non-negative) place and make rankings 0-based.
        races = pd.DataFrame.from_dict(
            [{'course': k[0],
              'scheduled_off': k[1],
              'selection': v['selection'][v.ranking >= 0].tolist(),
              'ranking': (v['ranking'][v.ranking >= 0] - 1).tolist()}
             for k, v in races.groupby(['track', 'scheduled_off'])])

        dtypes = list(races.dtypes[races.dtypes == np.int64].index)
        type_mappers = dict(zip(dtypes, [int] * len(dtypes)))  # currently unused
        db[args.races].insert(pandas_to_dicts(races))

        logging.info('Successfully uploaded to %s' % db)
    except Exception as e:
        logging.critical(e)
        raise
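# A minimal sketch of the reshaping above, on made-up values (column names match
# the real file; everything else is illustrative only):
#
#   rows: track   scheduled_off      selection  ranking
#         Ascot   2013-01-01 14:00   horse_a    1
#         Ascot   2013-01-01 14:00   horse_b    2
#         Ascot   2013-01-01 14:00   horse_c    -1   <- non-finisher, filtered out
#
# becomes the single race document:
#
#   {'course': 'Ascot', 'scheduled_off': Timestamp('2013-01-01 14:00'),
#    'selection': ['horse_a', 'horse_b'], 'ranking': [0, 1]}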
if __name__ == '__main__':
    import argparse
    from multiprocessing import Pool, cpu_count

    parser = argparse.ArgumentParser(description='Uploads Betfair historical data to a MongoDB database')
    parser.add_argument('files', metavar='FILES', type=str, nargs='+', help='zip/csv/pd files to upload')
    parser.add_argument('--host', type=str, action='store', default='localhost',
                        help='MongoDB host (default=localhost)')
    parser.add_argument('--port', type=int, action='store', default=33000,
                        help='MongoDB port (default=33000)')
    parser.add_argument('--db', type=str, action='store', default='betfair', help='db (default=betfair)')
    parser.add_argument('--jobs', type=int, action='store', default=-1, help='how many jobs to use')
    parser.add_argument('--races', type=str, action='store', default='races',
                        help='races collection (default=races)')
    parser.add_argument('--train', type=str, action='store', default='train',
                        help='training set collection (default=train)')
    parser.add_argument('--vwao', type=str, action='store', default='vwao',
                        help='volume-weighted-average-odds (vwao) collection (default=vwao)')
    parser.add_argument('--logfile', type=str, action='store', default=None,
                        help='specifies what log file to use')
    parser.add_argument('--logtty', help='prints logging info to the terminal', action='store_true')
    args = parser.parse_args()

    configure_root_logger(args.logtty, args.logfile)

    if len(args.files) > 1:
        # One worker per file, capped at the CPU count unless --jobs overrides it.
        cpus = min(cpu_count(), len(args.files)) if args.jobs < 0 else args.jobs
        logging.info('Creating a pool with %d worker processes..' % cpus)
        pool = Pool(processes=cpus)
        pool.map(upload, zip([args] * len(args.files), args.files))
    else:
        upload((args, args.files[0]))
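# Example invocation (script and data file names are hypothetical):
#
#   python upload_betfair.py bf_2013_jan.zip bf_2013_feb.csv --port 33000 --logtty
#
# With more than one file, each is handled by its own worker process; log lines
# are tagged per file by the Formatter set up in upload().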
if __name__ == '__main__':
    import argparse
    from multiprocessing import Pool, cpu_count

    parser = argparse.ArgumentParser(description='Uploads horseracebase.com historical data to a MongoDB database')
    parser.add_argument('files', metavar='FILES', type=str, nargs='+', help='zip/csv/pd files to upload')
    parser.add_argument('--host', type=str, action='store', default='localhost',
                        help='MongoDB host (default=localhost)')
    parser.add_argument('--port', type=int, action='store', default=33000,
                        help='MongoDB port (default=33000)')
    parser.add_argument('--db', type=str, action='store', default='betfair', help='db (default=betfair)')
    parser.add_argument('--jobs', type=int, action='store', default=-1, help='how many jobs to use')
    parser.add_argument('--races', type=str, action='store', default='horseracebase',
                        help='races collection (default=horseracebase)')
    args = parser.parse_args()

    configure_root_logger()

    cpus = min(cpu_count(), len(args.files)) if args.jobs < 0 else args.jobs
    logging.info('Creating a pool with %d worker processes..' % cpus)
    pool = Pool(processes=cpus)
    pool.map(upload, zip([args] * len(args.files), args.files))
# [Truncated in the source: this script's argparse setup, including --host, --port,
#  --db, --jobs and the --mu/--sigma/--beta/--tau range arguments. The dangling
#  --draw-prob call below is completed by analogy with its neighbours; the flag
#  name follows the 'draw_prob' key used further down, and the metavar is a guess.]
parser.add_argument('--draw-prob', type=arg_linspace, action='store', default=[DEFAULT_DRAW],
                    metavar='P', help='draw probability (default=%.2f)' % DEFAULT_DRAW)
parser.add_argument('--risk-aversion', type=arg_linspace, action='store', default=[0.1],
                    metavar='RA', help='risk aversion')
parser.add_argument('--min-races', type=arg_linspace, action='store', default=[3], metavar='N',
                    help='minimum no. of races required per horse before betting')
parser.add_argument('--max-exposure', type=arg_linspace, action='store', default=[50],
                    metavar='EXP', help='maximum exposure')
parser.add_argument('--logfile', type=str, action='store', default=None,
                    help='specifies what log file to use')
parser.add_argument('--logmongo', type=str, action='store', default=None,
                    help='specifies what collection to use for logging to MongoDB')
parser.add_argument('--logtty', help='prints logging info to the terminal', action='store_true')
parser.add_argument('train', type=str, action='store', help='training set collection')
args = parser.parse_args()

configure_root_logger(args.logtty, args.logfile,
                      MongoClient(args.host, args.port)[args.db][args.logmongo]
                      if args.logmongo is not None else None)

keys = ['mu', 'sigma', 'beta', 'tau', 'draw_prob', 'risk_aversion', 'min_races', 'max_exposure']
mparams = [args.mu, args.sigma, args.beta, args.tau, args.draw_prob,
           args.risk_aversion, args.min_races, args.max_exposure]
# Each parameter is a list of values; the sweep is their Cartesian product.
n_backtests = reduce(lambda x, y: x * y, map(len, mparams))
logging.info('The specified ranges of parameters yield %d different backtests.' % n_backtests)

packed_args = ((n_bkt, args, dict(zip(keys, values)))
               for n_bkt, values in enumerate(product(*mparams)))
if n_backtests > 1:
    n_processes = min(cpu_count(), n_backtests) if args.jobs < 0 else args.jobs
    logging.info('Creating a pool with %d worker processes..' % n_processes)
    pool = Pool(processes=n_processes)
    pool.map(run_backtest, packed_args)
# [The else-branch, presumably running the single backtest inline, is truncated
#  in the source.]
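# Example sweep (script name hypothetical; arg_linspace is assumed to expand a
# range spec into a list of values):
#
#   python run_backtests.py train --risk-aversion 0.05:0.2:4
#
# Four risk-aversion values against single values for every other parameter
# yield 4 backtests, which the Pool above runs in parallel; each worker tags
# its log lines with its backtest number n_bkt.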
import logging

from pymongo import MongoClient

# configure_root_logger and the VWAO_COLL/RACES_COLL/TRAIN_COLL collection names
# are project imports assumed to be available in this module.

HOST = 'localhost'
PORT = 33000
DB = 'betfair'


def ensure_index(collection, index, unique=False):
    if unique:
        logging.info('Ensuring unique index on %s: %s' % (collection, index))
    else:
        logging.info('Ensuring index on %s: %s' % (collection, index))
    collection.ensure_index(index, unique=unique, drop_dups=unique)


configure_root_logger(True)

db = MongoClient(HOST, PORT)[DB]
logging.info('Initializing indexes in database %s' % db)

ensure_index(db[VWAO_COLL], [('scheduled_off', 1)])
ensure_index(db[VWAO_COLL], [('market_id', 1), ('selection', 1)], unique=True)
ensure_index(db[VWAO_COLL], [('market_id', 1), ('selection_id', 1)], unique=True)
ensure_index(db[RACES_COLL], [('market_id', 1)], unique=True)
ensure_index(db[TRAIN_COLL], [('scheduled_off', 1)])
ensure_index(db[TRAIN_COLL], [('market_id', 1)], unique=True)
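# Note: these unique indexes are what makes re-running the uploaders safe. A
# second upload of the same file hits the unique constraints, and the uploaders'
# insert(..., continue_on_error=True) keeps inserting the remaining documents
# before surfacing the DuplicateKeyError that they catch and log.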