forked from mcobzarenco/betfair-trading
-
Notifications
You must be signed in to change notification settings - Fork 0
/
upload.py
executable file
·196 lines (157 loc) · 8.08 KB
/
upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/python
from __future__ import print_function, division
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)
import logging
from itertools import imap
import dateutil
import numpy as np
import pandas as pd
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
from harb.common import configure_root_logger, TO_BE_PLACED, convert_types, \
pandas_to_dicts, SELECTION_BLACK_LIST, extract_horse_name, get_first
def races_from_bars(bars):
    """Collapse per-selection bars into one summary row per market.

    Side effect: a ``'winners'`` column is added to *bars* in place. It holds
    the selection name on rows where ``win_flag == 1`` and a missing value on
    every other row.

    :param bars: DataFrame with at least the columns ``market_id``,
        ``win_flag``, ``selection``, ``selection_id``, ``country``, ``event``,
        ``course`` and ``scheduled_off``.
    :return: DataFrame indexed by ``market_id``. ``country``, ``event``,
        ``course`` and ``scheduled_off`` are reduced with ``get_first``
        (imported from harb.common; presumably the first value of each group
        -- confirm). ``selection_id`` and ``selection`` are lists of the
        distinct non-null values, and ``winners`` is such a list or ``None``
        when the market has no flagged winner. Rows are not sorted by
        ``scheduled_off``.
    """
    logging.info('Creating races from bars..')

    # Build the column in a single assignment. The previous chained form
    # (bars['winners'].ix[mask] = ...) wrote into an intermediate Series, and
    # pandas does not guarantee that such a write reaches the parent frame.
    bars['winners'] = bars['selection'].where(bars['win_flag'] == 1)

    def distinct(values):
        # Distinct non-null values of one group, in arbitrary (set) order.
        return list(set(values.dropna()))

    def distinct_or_none(values):
        # Same as distinct(), but None when the group has no non-null value.
        present = values.dropna()
        return None if len(present) == 0 else list(set(present))

    agg_dict = {'country': get_first,
                'event': get_first,
                'course': get_first,
                'scheduled_off': get_first,
                'selection_id': distinct,
                'selection': distinct,
                'winners': distinct_or_none}
    return bars.groupby(['market_id']).agg(agg_dict)
def vwao_from_bars(bars):
    """Aggregate bars per runner into volume-weighted-average-odds documents.

    Side effect: a ``'notional'`` column (``volume_matched * odds``) is added
    to *bars* in place.

    Rows without a ``selection`` are dropped, the rest are grouped by
    ``(market_id, selection_id)`` and ``vwao`` is computed as summed notional
    divided by summed matched volume.

    :param bars: DataFrame with at least ``market_id``, ``selection_id``,
        ``selection``, ``volume_matched``, ``odds``, ``country``, ``event``,
        ``course`` and ``scheduled_off``.
    :return: a lazy iterator (``imap``) over the records produced by
        ``pandas_to_dicts``, each rewritten so that ``vwao`` is replaced by
        ``back_prices``, ``lay_prices`` and ``last_price_matched``.
    """
    def as_price_document(record):
        # Each side of the book gets a single depth-1 entry at the VWAO price
        # carrying half of the total matched amount.
        half_matched = record['total_matched'] / 2.0
        price = record['vwao']
        record['back_prices'] = [
            {'amount': half_matched, 'price': price, 'depth': 1, 'type': 'L'},
        ]
        record['lay_prices'] = [
            {'amount': half_matched, 'price': price, 'depth': 1, 'type': 'B'},
        ]
        record['last_price_matched'] = price
        del record['vwao']
        return record

    def float_sum(values):
        # Plain Python float rather than a numpy scalar.
        return float(np.sum(values))

    logging.info('Calculating VWAO from bars..')
    bars['notional'] = bars.volume_matched * bars.odds

    aggregations = {
        'country': get_first,
        'event': get_first,
        'course': get_first,
        'scheduled_off': get_first,
        'notional': float_sum,
        'volume_matched': float_sum,
        'selection': get_first,
    }
    named_runners = bars.dropna(subset=['selection'])
    per_runner = named_runners.groupby(['market_id', 'selection_id']).aggregate(aggregations)
    per_runner = per_runner.rename(columns={'volume_matched': 'total_matched'})
    per_runner['vwao'] = per_runner.notional / per_runner.total_matched
    del per_runner['notional']

    records = pandas_to_dicts(per_runner.reset_index())
    return imap(as_price_document, records)
def _single_market_frame(markets):
    """Rank the runners of a stand-alone market: 0 for winners, 1 otherwise."""
    frame = markets.irow(0).to_dict()
    frame['ranking'] = [int(runner not in frame['winners']) for runner in frame['selection']]
    return frame


def _win_and_place_frames(markets):
    """Rank a win market together with its "to be placed" market.

    Ranks are 0 for the win-market winner, 1 for the other placed runners and
    2 for everyone else. Returns ``[placed, towin]``, or ``None`` when the
    pair cannot be combined.
    """
    placed = markets[markets['event'] == TO_BE_PLACED]
    if len(placed) != 1:
        logging.warning('Skipping events with the same (course, time) with no|multiple "to be placed"; '
                        'market_id: %s' % markets['market_id'].tolist())
        return None
    placed = placed.irow(0).to_dict()
    towin = markets[markets['event'] != TO_BE_PLACED].irow(0).to_dict()
    if len(towin['winners']) > 1:
        return None
    winner = towin['winners'][0]

    # Rank by runner name instead of by list position. Each market's selection
    # list is built independently, so the two lists are not guaranteed to
    # share an order, and a positional copy could attach ranks to the wrong
    # runners.
    rank_by_runner = dict((runner, int(runner not in placed['winners']) + 1)
                          for runner in placed['selection'])
    if winner not in rank_by_runner or set(towin['selection']) != set(rank_by_runner):
        # Previously a winner missing from the place market raised ValueError
        # and aborted the whole file.
        logging.warning('Skipping win/place markets whose runners do not match; '
                        'market_id: %s' % markets['market_id'].tolist())
        return None
    rank_by_runner[winner] = 0

    placed['ranking'] = [rank_by_runner[runner] for runner in placed['selection']]
    towin['ranking'] = [rank_by_runner[runner] for runner in towin['selection']]
    return [placed, towin]


def training_from_races(races):
    """Turn race summaries into training records carrying a ``ranking`` list.

    Races with a missing ``selection`` or ``winners``, with an empty selection
    list, or with any runner in ``SELECTION_BLACK_LIST`` are discarded. The
    remainder is grouped by ``(course, scheduled_off, n_runners)``:

    * a group of one market yields one record (ranking 0 = winner, 1 = other);
    * a group of two markets is treated as a win market plus a "to be placed"
      market and yields two records sharing a 0/1/2 ranking;
    * groups of any other size are skipped.

    :param races: DataFrame with ``market_id``, ``event``, ``course``,
        ``scheduled_off``, ``selection`` (list) and ``winners`` (list or
        missing) columns.
    :return: list of dicts, one per emitted market, each with ``n_runners``
        and a ``ranking`` list aligned with that record's own ``selection``.
    """
    logging.info('Creating training set from races..')
    races = races.dropna(subset=['selection', 'winners'])
    if len(races) == 0:
        return []
    races = races[races['selection'].map(lambda runners: len(runners) > 0)]
    if len(races) == 0:
        return []

    keep = races['selection'].map(
        lambda runners: all(runner not in SELECTION_BLACK_LIST for runner in runners))
    logging.info('%d races removed using the selection black list' % (len(keep) - sum(keep)))
    # Copy so that adding the column below writes to an independent frame
    # rather than to a filtered slice.
    races = races[keep].copy()
    races['n_runners'] = races['selection'].map(len)

    frames = []
    for (_, markets) in races.groupby(['course', 'scheduled_off', 'n_runners']):
        if len(markets) == 1:
            frames.append(_single_market_frame(markets))
        elif len(markets) == 2:
            pair = _win_and_place_frames(markets)
            if pair is not None:
                frames.extend(pair)
        # Groups of three or more markets are ambiguous and are left out.
    return frames
def _read_bars(path):
    """Read the bars CSV at *path* into a DataFrame, unzipping it if needed.

    A ``.zip`` path is expected to contain a member named like the archive
    with a ``.csv`` extension. ``SCHEDULED_OFF`` is parsed as a day-first date.
    """
    # Imported here so the function does not rely on names that are only
    # bound when the module runs as a script.
    import zipfile
    from os.path import basename, splitext
    from dateutil import parser as date_parser

    def parse(text):
        return date_parser.parse(text, dayfirst=True)

    file_part, ext = splitext(basename(path))
    if ext != '.zip':
        logging.info('Reading csv file into memory')
        return pd.read_csv(path, parse_dates=['SCHEDULED_OFF'], date_parser=parse)

    logging.info('Reading zipped csv file into memory')
    archive = zipfile.ZipFile(path, 'r')
    try:
        return pd.read_csv(archive.open(file_part + '.csv'),
                           parse_dates=['SCHEDULED_OFF'], date_parser=parse)
    finally:
        # The archive handle used to be left open.
        archive.close()


def _insert_tolerating_duplicates(collection, documents):
    """Bulk-insert *documents*; log duplicate-key errors instead of raising."""
    try:
        collection.insert(documents, continue_on_error=True)
    except DuplicateKeyError as e:
        logging.error('Some duplicate keys in %s; If this is a surprise, ABORT! msg=%s' % (collection, e))


def upload(args):
    """Load one bars file and store races, training set and VWAO in MongoDB.

    :param args: a ``(namespace, path)`` pair. ``namespace`` supplies
        ``logtty``, ``logfile``, ``host``, ``port``, ``db`` and the collection
        names ``races``, ``train`` and ``vwao``; ``path`` is the csv or zip
        file to read. A single tuple is taken so the function can be used
        with ``Pool.map``.
    :raises Exception: any failure is logged at CRITICAL level with its
        traceback and then re-raised.
    """
    from os.path import basename

    args, path = args
    try:
        file_name = basename(path)
        formatter = logging.Formatter('%(asctime)s - ' + file_name + ' - %(levelname)s: %(message)s')
        configure_root_logger(args.logtty, args.logfile, formatter=formatter)

        client = MongoClient(args.host, args.port)
        try:
            db = client[args.db]

            bars = _read_bars(path)
            bars.columns = bars.columns.map(lambda x: x.lower())
            bars = bars.rename(columns={'event_id': 'market_id'})
            for col in ['market_id', 'selection_id']:
                bars[col] = bars[col].map(str)  # Make sure dtype==str

            # Insert other filters here:
            # NOTE(review): 'PE' presumably marks pre-event rows -- confirm.
            bars = bars[bars.in_play == 'PE']
            bars['selection'] = bars['selection'].map(extract_horse_name)

            races = races_from_bars(bars).reset_index()
            train = training_from_races(races)
            vwao = vwao_from_bars(bars)

            _insert_tolerating_duplicates(db[args.races], pandas_to_dicts(races))
            _insert_tolerating_duplicates(db[args.train], convert_types(train, {'n_runners': int}))
            _insert_tolerating_duplicates(db[args.vwao], vwao)

            logging.info('Successfully uploaded to %s' % db)
        finally:
            # Release the connection; a pool worker may process many files.
            client.close()
    except Exception as e:
        # exc_info keeps the traceback in the log even if the exception is
        # later re-raised in another process.
        logging.critical(e, exc_info=True)
        raise
if __name__ == '__main__':
    # Command-line entry point: parse the options, then upload each file,
    # using a process pool when more than one file is given.
    #
    # These imports bind module-level names; keep zipfile, split and splitext
    # here in case functions in this module look them up as globals.
    import zipfile
    from os.path import split, splitext
    import argparse
    from multiprocessing import Pool, cpu_count

    parser = argparse.ArgumentParser(description='Uploads Betfair historical data to a MongoDB database')
    parser.add_argument('files', metavar='FILES', type=str, nargs='+', help='zip/csv/pd files to upload')
    parser.add_argument('--host', type=str, action='store', default='localhost', help='MongoDB host (default=localhost)')
    parser.add_argument('--port', type=int, action='store', default=33000, help='MongoDB port (default=33000)')
    parser.add_argument('--db', type=str, action='store', default='betfair', help='db (default=betfair)')
    parser.add_argument('--jobs', type=int, action='store', default=-1, help='how many jobs to use')
    parser.add_argument('--races', type=str, action='store', default='races', help='races collection (default=races)')
    parser.add_argument('--train', type=str, action='store', default='train',
                        help='training set collection (default=train)')
    parser.add_argument('--vwao', type=str, action='store', default='vwao',
                        help='volume-weighted-average-odds (vwao) collection (default=vwao)')
    parser.add_argument('--logfile', type=str, action='store', default=None, help='specifies what log file to use')
    parser.add_argument('--logtty', help='prints logging info to the terminal', action='store_true')
    args = parser.parse_args()

    configure_root_logger(args.logtty, args.logfile)

    if len(args.files) > 1:
        # Any value below 1 (including 0, which Pool rejects) means "choose
        # automatically": one worker per file, capped at the CPU count.
        cpus = min(cpu_count(), len(args.files)) if args.jobs < 1 else args.jobs
        logging.info('Creating a pool with %d worker processes..' % cpus)
        pool = Pool(processes=cpus)
        try:
            pool.map(upload, [(args, path) for path in args.files])
        finally:
            # After a clean map() the workers are idle; after a failure this
            # stops the remaining uploads, as exiting the script used to do.
            pool.terminate()
            pool.join()
    else:
        upload((args, args.files[0]))