def best(players, name, threshold=0.1):
    """Return the entry of ``players`` whose spelling is closest to ``name``.

    Similarity is the Jaccard index of the two strings' character n-gram
    sets (bigrams by default, case-insensitive).

    Args:
        players: Iterable of candidate strings.
        name: The string to match.
        threshold: Minimum similarity required to accept the best candidate.

    Returns:
        The best-matching candidate, or ``name`` itself when no candidate
        reaches ``threshold`` (a "Mismatch" warning is printed in that case).
    """

    def str2grams(s, n=2, blank="#", noWordOrder=True):
        # Pad with ``blank`` so leading/trailing characters form n-grams too.
        blanks = blank * (n - 1)
        s = s.upper()
        # With noWordOrder every single space is replaced by the padding.
        s = blanks + (blanks.join(s.split(' ')) if noWordOrder else s) + blanks
        if len(s) < n:
            return {s}
        # ``range`` instead of ``xrange``: the latter does not exist on Python 3.
        return {s[i:i + n] for i in range(len(s) - n + 1)}

    def jaccard(x, y):
        # Force true division; on Python 2 ``int / int`` would truncate the
        # similarity to 0 or 1.
        return float(len(x & y)) / len(x | y)

    # The n-grams of ``name`` do not depend on the candidate: compute once.
    nameGrams = str2grams(name)

    bestPlayer, bestSim = None, 0.0
    for player in players:
        sim = jaccard(str2grams(player), nameGrams)
        if sim > bestSim:
            bestPlayer, bestSim = player, sim
        if sim == 1.0:
            # Identical n-gram sets: nothing can beat this candidate.
            break

    if bestSim < threshold:
        print2(colored("Mismatch: " + str((bestPlayer, name)), "yellow"))
        return name
    return bestPlayer
def cache_function_last_result(
        f,
        msg1="[Evaluation] Reusing previous result of <name>",
        msg2="[Evaluation] Recalculating and memorizing result of <name>"):
    """Wrap ``f`` so that its most recent result is reused for the same arguments.

    Only the last call is remembered. A call counts as a repeat when it passes
    the same number of positional arguments and every argument is the *same
    object* (``is``) as in the remembered call; equality is not considered.

    The remembered call is stored in the module-level ``_last_result_cache``
    under ``f.__name__``.
    NOTE(review): because the key is only the function name, two wrapped
    callables sharing a ``__name__`` (e.g. the same method on two instances)
    share one cache slot -- confirm this is intended.

    Args:
        f: Callable to wrap; it is invoked with positional arguments only.
        msg1: Message printed on a cache hit, ``None`` to stay silent.
            ``<name>`` is replaced by ``f.__name__``.
        msg2: Message printed on a cache miss, ``None`` to stay silent.

    Returns:
        The wrapping function, with ``__name__`` set to ``f.__name__``.
    """
    key = f.__name__
    print2("[Evaluation] Results of %s will be cached." % key)

    def same_objects_on_lists(l1, l2):
        # Identity, not equality: arguments may be large tensors.
        return len(l1) == len(l2) and all(
            e1 is e2 for e1, e2 in zip(l1, l2))

    def caching_function(*args):
        entry = _last_result_cache.get(key)
        # A missing entry must never count as a hit. The previous default of
        # ``([], None)`` matched an empty ``args`` tuple, so the first
        # zero-argument call returned None without ever calling ``f``.
        if entry is not None and same_objects_on_lists(entry[0], args):
            if msg1 is not None:
                print2(msg1.replace("<name>", key))
            return entry[1]

        if msg2 is not None:
            print2(msg2.replace("<name>", key))
        result = f(*args)
        _last_result_cache[key] = (args, result)
        return result

    caching_function.__name__ = key
    return caching_function
def caching_function(*args):
    """Return the remembered result when called again with the same objects.

    Relies on names from the enclosing scope: ``f`` (the wrapped callable),
    ``key`` (its cache key), ``msg1``/``msg2`` (hit/miss messages, ``None`` to
    stay silent), ``same_objects_on_lists`` and ``_last_result_cache``.
    """
    entry = _last_result_cache.get(key)
    # A missing entry must never count as a hit. The previous default of
    # ``([], None)`` matched an empty ``args`` tuple, so the first
    # zero-argument call returned None without ever calling ``f``.
    if entry is not None and same_objects_on_lists(entry[0], args):
        if msg1 is not None:
            print2(msg1.replace("<name>", key))
        return entry[1]

    if msg2 is not None:
        print2(msg2.replace("<name>", key))
    result = f(*args)
    _last_result_cache[key] = (args, result)
    return result
def optimal_h_for_risk(self, ys):
    """Return the decision h used for the risk, given predictive samples ``ys``.

    When ``self.RISK_OPTIMAL_H_NUMERICALLY`` is set, h is found by
    ``optimal_h_numerically`` maximising the negated loss; otherwise
    ``self.optimal_h_bayes_estimator(ys)`` is used. The first 200 characters
    of ``str(h)`` are printed before returning.
    """
    if not self.RISK_OPTIMAL_H_NUMERICALLY:
        h = self.optimal_h_bayes_estimator(ys)
    else:
        print2(
            "[Evaluation] Numerically optimizing h for qRisk. May take time..."
        )
        # Kept as a lambda so that its __name__ stays "<lambda>".
        utility = lambda h, y: -self.loss(h, y)
        optimizer_settings = dict(
            data_mask=self.y_mask,
            start=None,  # could instead start from self.optimal_h_bayes_estimator(ys)
            max_niter=self.EVAL_MAX_NITER,
            tol=self.EVAL_SGD_PREC,
            tol_goal=-1,
            debug=True,
            lr=self.EVAL_LR,
        )
        h = optimal_h_numerically(ys, utility, **optimizer_settings)

    print2("[Evaluation:optimal_h_for_risk] h=%s" % str(h)[:200])
    return h
def optimal_h_for_gain(self, ys):
    """Return the decision h used for the gain, given predictive samples ``ys``.

    When ``self.GAIN_OPTIMAL_H_NUMERICALLY`` is set, h is found by
    ``optimal_h_numerically`` maximising ``self.utility``; otherwise
    ``self.optimal_h_bayes_estimator(ys)`` is used. The first 200 characters
    of ``str(h)`` are printed before returning.
    """
    if not self.GAIN_OPTIMAL_H_NUMERICALLY:
        h = self.optimal_h_bayes_estimator(ys)
    else:
        print2(
            "[Evaluation] Numerically optimizing h for qGain. May take time..."
        )
        optimizer_settings = dict(
            data_mask=self.y_mask,
            start=None,  # could instead start from self.optimal_h_bayes_estimator(ys)
            max_niter=self.EVAL_MAX_NITER,
            tol=self.EVAL_SGD_PREC,
            tol_goal=-1,
            debug=True,
            lr=self.EVAL_LR,
        )
        # A SciPy-based alternative was previously sketched here:
        #   optimal_h_numerically_scipy(ys, self.utility, data_mask=self.y_mask,
        #       max_niter=self.EVAL_MAX_NITER, tol=self.EVAL_SGD_PREC, tol_goal=-1,
        #       lr=self.EVAL_LR, start=None, optimizer="COBYLA",
        #       verbose=True, debug=False, sparse_verbose=True)
        h = optimal_h_numerically(ys, self.utility, **optimizer_settings)

    print2("[Evaluation:optimal_h_for_gain] h=%s" % str(h)[:200])
    return h
Flattents the first two dimensions (samples of y for different thetas) from sample_predictive_y0. """ return flatten_first_two_dims( sample_predictive_y0(qw, qz, nsamples_theta, nsamples_y)) # # Constructing losses and utilities # In[19]: # mask used to select points to the utility-dependent term: use only training data utility_term_mask = training_mask loss, optimal_h_bayes_estimator = losses.LossFactory(**globals()).create(LOSS) print2("> <%s> loss: %s with (analytical/Bayes estimator) h: %s" % (LOSS, loss.__name__, optimal_h_bayes_estimator.__name__)) u = losses.UtilityFactory(**globals()).create(UTIL, loss) print2("> utility: %s" % u.__name__) # In[20]: utility_term_factory = utility_term_estimation.UtilityAggregatorFactory() # # Evaluation # In[21]: train_measures = evaluation.Measures( x, loss,
def optimal_h_numerically_scipy(ys,
                                u,
                                weights=None,
                                utility_aggregator=gain_weighted,
                                data_mask=None,
                                max_niter=10000,
                                tol=1e-4,
                                tol_goal=-1,
                                lr=0.01,
                                start=None,
                                optimizer="COBYLA",
                                verbose=False,
                                debug=False,
                                sparse_verbose=True):
    """
    Using numerical optimization (SciPy) finds optimal h for utility-dependent
    term expressed by utility_aggregator.
    Compatible with optimal_h_numerically.

    Args:
        ys: Samples tensor; the first dimension indexes samples (the initial
            point and the weight normalisation both reduce over dim 0).
        u: Utility function, called as u(h, y).
        weights: Per-sample weights with the shape of ys (default: uniform).
            They are normalised over dim 0; the caller's tensor is left
            untouched.
        utility_aggregator: Function of exactly 3 parameters, called as
            utility_aggregator(utilities, weights, data_mask).
        data_mask: Mask passed to utility_aggregator (default: all ones).
        max_niter, tol, tol_goal, lr: Forwarded to _scipy_minimize.
        start: Initial h. When None or rejected by is_valid, the weighted
            mean of ys over dim 0 is used instead.
        optimizer: Passed to _scipy_minimize as ``method``.
        verbose: Print a one-line summary of the optimisation result.
        debug: Print debugging messages; also forwarded to _scipy_minimize.
        sparse_verbose: Selects a sparse_print-based printer when verbose is
            off (see the NOTE near the end of the body).

    Returns:
        A tensor of shape ys.shape[1:] and dtype ys.dtype holding the
        optimiser's solution ``result["x"]``.
    """
    printv = lambda txt: (print2("[optimal_h_numerically_scipy]" + txt)
                          if verbose else None)
    if sparse_verbose and not verbose:
        printv = lambda txt: sparse_print(
            "optimal_h_numerically_scipy",
            "[optimal_h_numerically_scipy]" + txt, 100)
    printd = lambda txt: (print2("[optimal_h_numerically_scipy]" + txt)
                          if debug else None)

    assert len( signature(utility_aggregator).parameters )==3, \
        "[optimal_h_numerically_scipy] Your utility_aggregator=%s takes wrong number of params! does not support weights?" % utility_aggregator

    env = torch if "cpu" in str(ys.device) else torch.cuda
    if data_mask is None:
        data_mask = (torch.ones(ys.shape[1:]) if len(ys.shape) > 1 else
                     torch.tensor(1)).type(env.ByteTensor)
    if weights is None:
        weights = torch.ones_like(ys)
    # Enforce normalization. Out-of-place on purpose: ``weights /= ...`` would
    # silently rescale the tensor owned by the caller.
    weights = weights / weights.sum(0)
    weights = torch.tensor(tonumpy(weights), requires_grad=False)

    y = torch.tensor(tonumpy(ys), requires_grad=False)

    if start is None:
        h = (y * weights).sum(0)  #start from E(y)
    elif not is_valid(torch.tensor(start)):
        printd("start point is invalid. ignoring!")
        h = (y * weights).sum(0)  #start from E(y)
    else:
        h = start

    # Separate name for the timer: ``start`` is the initial-point parameter.
    t_start = time.time()
    x0 = tonumpy(h).flatten()
    # The optimiser works on a flat vector: reshape it back before calling u
    # and negate, because the optimiser minimises.
    fun = lambda x: -utility_aggregator(
        u(torch.tensor(x.reshape(y.shape[1:]), dtype=y.dtype), y), weights,
        data_mask).item()
    result = _scipy_minimize(fun,
                             x0,
                             method=optimizer,
                             max_niter=max_niter,
                             tol=tol,
                             tol_goal=tol_goal,
                             lr=lr,
                             debug=debug)

    # NOTE(review): because of this guard the sparse_print-based printv
    # selected above (sparse_verbose and not verbose) is never invoked.
    if verbose:
        printv(
            "[%.4f][optimizer=%s ys=%s max_niter=%i tol=%s tol_goal=%s lr=%s u=%s->%s] %s"
            % (time.time() - t_start, optimizer, tuple(ys.shape), max_niter,
               tol, tol_goal, lr, u.__name__, utility_aggregator.__name__,
               str(result).replace("\n", ";")[:200]))
    return torch.tensor(result["x"].reshape(y.shape[1:]), dtype=ys.dtype)
def optimal_h_numerically_ty(ys,
                             u,
                             utility_aggregator=gain,
                             data_mask=None,
                             max_niter=10000,
                             tol=1e-4,
                             tol_goal=-1,
                             lr=0.01,
                             start=None,
                             optimizer=torch.optim.Adam,
                             verbose=False,
                             debug=False,
                             sparse_verbose=True):
    """
    Using numerical optimization finds optimal h for utility-dependent term
    expressed by utility_aggregator.

    Args:
        ys: Samples matrix. The dimensionality should match what
            utility_aggregator takes:
            #y-samples x #theta-samples x data-size.
        u: Utility function u(h, ys) -> utilities matrix
            (the same shape as ys).
        utility_aggregator: A function that calculates the utility-dependent
            term. Should take exactly 2 params: utilities and data_mask.
        data_mask: A mask passed to utility_aggregator (default: all ones).
        max_niter: Maximum number of optimizer steps.
        tol: Stop when the largest absolute change of h between two
            consecutive steps is <= tol.
        tol_goal: Stop when the absolute change of the objective between two
            consecutive steps is <= tol_goal (the default -1 disables this).
        lr: Learning rate passed to the optimizer.
        start: Initial h. When None or rejected by is_valid, the mean of ys
            over its first two dimensions is used instead.
        optimizer: Optimizer factory called as optimizer([h], lr=lr).
        verbose: Print the final summary.
        debug: Print progress roughly every tenth of max_niter iterations.
        sparse_verbose: When verbose is off, route the final summary through
            sparse_print instead of dropping it.

    Returns:
        The optimised tensor h (created with requires_grad=True).
    """
    printv = lambda txt: (print2("[optimal_h_numerically_ty] " + txt)
                          if verbose else None)
    if sparse_verbose and not verbose:
        printv = lambda txt: sparse_print(
            "optimal_h_numerically_ty", "[optimal_h_numerically_ty]" + txt,
            100)
    printd = lambda txt: (print2("[optimal_h_numerically_ty] " + txt)
                          if debug else None)

    assert len( signature(utility_aggregator).parameters )==2, \
        "[optimal_h_numerically_ty] Your utility_aggregator=%s takes wrong number of params! perhaps requires weights?" % utility_aggregator

    env = torch if "cpu" in str(ys.device) else torch.cuda
    if data_mask is None:
        #No data mask provided. Using all data points
        data_mask = (torch.ones(ys.shape[2:]) if len(ys.shape) > 2 else
                     torch.tensor(1)).type(env.ByteTensor)

    y = torch.tensor(tonumpy(ys), requires_grad=False)

    if start is None:
        h = y.mean(0).mean(0).clone().detach().requires_grad_(
            True)  #start from E(y)
    elif not is_valid(torch.tensor(start)):
        printd("start point is invalid. ignoring!")
        h = y.mean(0).mean(0).clone().detach().requires_grad_(
            True)  #start from E(y)
    else:
        h = torch.tensor(tonumpy(start), requires_grad=True)

    # Separate names: ``optimizer`` and ``start`` are parameters.
    opt = optimizer([h], lr=lr)
    prev_h, prev_goal = torch.tensor(tonumpy(h)), float("inf")
    t_start = time.time()

    # ``max_niter // 10`` is 0 for max_niter < 10, which used to raise
    # ZeroDivisionError in the progress check below.
    report_every = max(1, max_niter // 10)

    def report_finish(iteration, goal, step, reason):
        # One summary line shared by all three stopping criteria.
        printv(
            "[%.2f][ys=%s max_niter=%i tol=%s tol_goal=%s lr=%s u=%s->%s] Finished at %i. iter (%s): obj=%.4f max-err=%.8f mean-err=%.8f"
            % (time.time() - t_start, tuple(ys.shape), max_niter, tol,
               tol_goal, lr, u.__name__, utility_aggregator.__name__,
               iteration, reason, goal.item(), step.max(), step.mean()))

    for i in range(max_niter):
        # Minimise the negated aggregated utility.
        goal = -utility_aggregator(u(h, y), data_mask)
        opt.zero_grad()
        goal.backward(retain_graph=False)
        opt.step()

        #check for convergence:
        step = (prev_h - h).abs()
        if step.max() <= tol:
            report_finish(i + 1, goal, step, "tolerance reached")
            break
        if abs(prev_goal - goal.item()) <= tol_goal:
            report_finish(i + 1, goal, step, "objective tolerance reached")
            break
        if i >= max_niter - 1:
            report_finish(i + 1, goal, step, "max number reached")
            break

        if i % report_every == 0:
            printd("[%.2f] iter %i: objective=%.4f err=%.6f" %
                   (time.time() - t_start, i, goal.item(), step.max()))

        # Detached snapshot of h for the next step-size comparison.
        prev_h = torch.tensor(tonumpy(h))
        prev_goal = goal.item()

    return h
Flattents the first two dimensions (samples of y for different thetas) from sample_predictive_y0. """ ys = sample_predictive_y0(data, q_theta, nsamples_theta, nsamples_y) return flatten_first_two_dims(ys) # # Constructing losses and utilities # In[181]: # include all (training) data points in utility-dependent term utility_term_mask = torch.ones(schools_dat["J"]).type(env.ByteTensor) loss, optimal_h_bayes_estimator = losses.LossFactory(**globals()).create(LOSS) print2("> <%s> loss: %s with (analytical/Bayes estimator) h: %s" % (LOSS, loss.__name__, optimal_h_bayes_estimator.__name__)) u = losses.UtilityFactory(**globals()).create(UTIL, loss) print2("> utility: %s" % u.__name__) # In[182]: utility_term_factory = utility_term_estimation.UtilityAggregatorFactory() # # Evaluation # In[183]: measures = evaluation.Measures( torch.tensor(schools_dat["y"], dtype=torch.float32), loss,
Flattents the first two dimensions (samples of y for different thetas) from sample_predictive_y0. """ return flatten_first_two_dims(sample_predictive_y0(qw, qz, nsamples_theta, nsamples_y)) # # Constructing losses and utilities # In[19]: # mask used to select points to the utility-dependent term: use only training data utility_term_mask = training_mask loss, optimal_h_bayes_estimator = losses.LossFactory(**globals()).create(LOSS) print2("> <%s> loss: %s with (analytical/Bayes estimator) h: %s" % (LOSS, loss.__name__, optimal_h_bayes_estimator.__name__)) u = losses.UtilityFactory(**globals()).create(UTIL, loss) print2("> utility: %s" % u.__name__) # In[20]: utility_term_factory = utility_term_estimation.UtilityAggregatorFactory() # # Evaluation # In[21]:
def crawl(sport, year, division, org, game, url, neutral=False):
    """Download one game's tables to CSV files, then parse them.

    Works in two phases, each guarded by a marker ("flag") file inside the
    directory ``os.path.join(data, org, gamename)``:

    1. Unless ``.done`` exists: fetch the game page, follow its box-score and
       play-by-play links, and dump the tables found there to
       ``<name>.csv`` files. ``.done`` is written on success.
    2. Unless ``.parsed`` exists: load the dumped score table and write the
       combined "Box Score - All (Parsed)" and "Play by Play - All (Parsed)"
       files. ``.parsed`` is written on success.

    Any exception raised inside a phase is caught and printed; the function
    itself returns None.

    Args:
        sport, year, division: Substituted into the global ``data`` path
            template; also used in the log line.
        org: Sub-directory of ``data`` that holds this game's directory.
        game: Game identifier; '/' is replaced by '.' to form the directory
            name.
        url: Game URL, resolved against the global ``domain``.
        neutral: When true, a ``.neutral`` flag file is created for the game.
    """
    # NOTE(review): this rebinds the module-level ``data`` to the formatted
    # path, so the template is no longer available afterwards in the same
    # process -- confirm each call runs with a fresh module state.
    global data
    data = data.format(sport, year, division)
    gamename = game.replace('/', '.')

    def readFlag(flag):
        # Side effect: creates the game directory when it is missing
        # (os.mkdir, so the parent directory must already exist).
        if not os.path.exists(os.path.join(data, org, gamename)):
            os.mkdir(os.path.join(data, org, gamename))
        return os.path.exists(os.path.join(data, org, gamename, flag))

    def setFlag(flag):
        # Creates (or truncates) an empty marker file.
        with open(os.path.join(data, org, gamename, flag), 'w') as f:
            pass

    if neutral and not readFlag(".neutral"):
        setFlag(".neutral")

    # Template for every output file of this game; "{}" is the table name.
    filename = os.path.join(data, org, gamename, "{}.csv")

    # Phase 1: download.
    if not readFlag(".done"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))
            gs = parseURL(gamelink)
            sleep(2)

            gamescore = None
            gameinfo = None
            periods = []
            teams = []
            nextPeriod = 0  # index into ``periods`` for per-period box scores

            for table in gs.select("div.header_menu a"):
                # Only box-score and play-by-play links are followed.
                if (
                    table["href"] == "#" or
                    not (
                        table["href"].startswith("/game/box_score") or
                        table["href"].startswith("/game/play_by_play")
                    )
                ):
                    continue

                tablelink = urljoin(domain, table["href"])
                print2("{} \033[4m{}\033[0m".format(table.text.strip(), tablelink))
                ts = parseURL(tablelink)

                # Score and info tables are taken from the first page only.
                # NOTE(review): the nth-of-type indices presumably mirror the
                # site's page layout -- verify against a live page.
                if gamescore is None:
                    gamescore = parseTable(ts.select("table:nth-of-type(1)")[0])
                    dumpTable(
                        gamescore,
                        filename.format("Score")
                    )
                if gameinfo is None:
                    gameinfo = transposeTable(
                        parseTable(ts.select("table:nth-of-type(3)")[0]) +
                        parseTable(ts.select("table:nth-of-type(4)")[0])
                    )
                    dumpTable(
                        gameinfo,
                        filename.format("Info")
                    )
                    # Team names: first cell of score rows 1 and 2; period
                    # labels: header row without its first cell.
                    teams = [gamescore[1][0].text.strip(), gamescore[2][0].text.strip()]
                    periods = [v.text.strip() for v in gamescore[0][1:]]

                if table["href"].startswith("/game/box_score"):
                    if table.text.strip() == "Box Score":
                        sfilename = filename.format("Box Score - {}")
                    else:
                        # Any other box-score link is named after the next
                        # unused period label.
                        sfilename = filename.format(periods[nextPeriod] + " - {}")
                        nextPeriod += 1
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(5)")[0], header=1),
                        sfilename.format(teams[0])
                    )
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(6)")[0], header=1),
                        sfilename.format(teams[1])
                    )
                elif table["href"].startswith("/game/play_by_play"):
                    sfilename = filename.format("Play by Play - {}")
                    # The last period label is skipped (presumably a totals
                    # column -- TODO confirm); tables 6, 8, 10, ... are used.
                    for (i, period) in enumerate(periods[:-1]):
                        dumpTable(
                            parseTable(ts.select("table:nth-of-type({})".format(6 + 2 * i))[0], header=0),
                            sfilename.format(period)
                        )
                sleep(2)

            # Nothing was parsed at all: no usable link on the page.
            if gamescore == gameinfo == None:
                raise Exception("Not a game.")

            setFlag(".done")
            sleep(2)
        except Exception as e:
            # Best effort: report and continue; ``.done`` stays unset.
            print2(colored("Error: ", "red"), e)
        finally:
            print2()

    # Phase 2: parse the dumped CSV files.
    if not readFlag(".parsed"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))
            print2("Parsing...")

            gamescore = loadTable(filename.format("Score"))

            sfilename = filename.format("Box Score - {}")
            teams = [gamescore[1][0], gamescore[2][0]]
            with open(filename.format("Box Score - All (Parsed)"), "w") as af:
                for team in teams:
                    boxScore = parseBoxScore(
                        sfilename.format(team),
                        filename.format("Info"),
                        team,
                        "All"
                    )
                    # The first row is written for the first team only
                    # (presumably the header row -- TODO confirm).
                    rawDumpTable(boxScore[(0 if team == teams[0] else 1):], af)

            sfilename = filename.format("Play by Play - {}")
            periods = gamescore[0][1:]
            with open(filename.format("Play by Play - All (Parsed)"), "w") as af:
                for period in periods[:-1]:
                    playByPlay = parsePlayByPlay(
                        sfilename.format(period),
                        period,
                        filename.format("Info")
                    )
                    # Same as above: first row kept for the first period only.
                    rawDumpTable(playByPlay[(0 if period == periods[0] else 1):], af)

            setFlag(".parsed")
        except Exception as e:
            # Best effort: report and continue; ``.parsed`` stays unset.
            print2(colored("Error: ", "red"), e)
        finally:
            print2()
def __init__(self,
             y,
             loss,
             u,
             sample_predictive_y,
             optimal_h_bayes_estimator=None,
             y_mask=None,
             GAIN_OPTIMAL_H_NUMERICALLY=True,
             RISK_OPTIMAL_H_NUMERICALLY=False,
             EVAL_NSAMPLES_UTILITY_TERM_THETA=1000,
             EVAL_NSAMPLES_UTILITY_TERM_Y=1,
             EVAL_MAX_NITER=10000,
             EVAL_SGD_PREC=0.0001,
             EVAL_LR=0.01,
             EVAL_RESAMPLE_EVERY_TIME=False):
    """
    Args:
        y: Evaluation data.
        loss: A function y x h -> loss used to calculate risks.
        u: A function y x h -> utility used to calculate gains.
        sample_predictive_y: A function that for each data point from y,
            generates samples from predictive posterior.
        optimal_h_bayes_estimator: Analytical estimator of the optimal h,
            called with the samples. When None and at least one of the two
            *_OPTIMAL_H_NUMERICALLY flags is off, both flags are forced on
            and a warning is printed.
        y_mask: A mask selecting data points for evaluation (default: all).
        GAIN_OPTIMAL_H_NUMERICALLY: Find h for the gain numerically.
        RISK_OPTIMAL_H_NUMERICALLY: Find h for the risk numerically.
        EVAL_NSAMPLES_UTILITY_TERM_THETA, EVAL_NSAMPLES_UTILITY_TERM_Y,
        EVAL_MAX_NITER, EVAL_SGD_PREC, EVAL_LR: Stored unchanged on the
            instance under the same names.
        EVAL_RESAMPLE_EVERY_TIME: Can results of sample_predictive_y,
            optimal_h_for_gain and optimal_h_for_risk be cached? When False,
            optimal_h_for_gain, optimal_h_for_risk and
            sample_predictive_posterior are wrapped with
            cache_function_last_result.
    """
    self.y = y

    self.y_mask = y_mask
    if self.y_mask is None:
        print2(
            "[Evaluation] WARNING: using default all data points in evaluation."
        )
        env = torch if "cpu" in str(self.y.device).lower() else torch.cuda
        self.y_mask = torch.ones_like(self.y).type(env.ByteTensor)

    self.loss = loss
    self.utility = u
    self.sample_predictive_y = sample_predictive_y

    self.optimal_h_bayes_estimator = optimal_h_bayes_estimator
    if (self.optimal_h_bayes_estimator is None) and \
       (not GAIN_OPTIMAL_H_NUMERICALLY or not RISK_OPTIMAL_H_NUMERICALLY):
        print2(
            "[Evaluation] WARNING: Optimal decisions h for both Risk and Gain will be obtained numerically."
        )
        # Placeholder so the attribute is always callable.
        self.optimal_h_bayes_estimator = lambda ys: None
        GAIN_OPTIMAL_H_NUMERICALLY, RISK_OPTIMAL_H_NUMERICALLY = True, True

    self.GAIN_OPTIMAL_H_NUMERICALLY = GAIN_OPTIMAL_H_NUMERICALLY
    self.RISK_OPTIMAL_H_NUMERICALLY = RISK_OPTIMAL_H_NUMERICALLY
    self.EVAL_NSAMPLES_UTILITY_TERM_THETA = EVAL_NSAMPLES_UTILITY_TERM_THETA
    self.EVAL_NSAMPLES_UTILITY_TERM_Y = EVAL_NSAMPLES_UTILITY_TERM_Y
    self.EVAL_MAX_NITER = EVAL_MAX_NITER
    self.EVAL_SGD_PREC = EVAL_SGD_PREC
    self.EVAL_LR = EVAL_LR

    # Printed before the caching wrappers are installed, so the summary
    # lists only the configuration attributes. Uses print2 like every other
    # message of this class (it was the only bare ``print``).
    print2("[Evaluation] Configuration: %s" %
           " ".join("%s=%s" % (k, format_value(v))
                    for k, v in vars(self).items()))

    if not EVAL_RESAMPLE_EVERY_TIME:
        # Shadow the methods on the instance with wrappers that reuse the
        # last result when called again with the same argument objects.
        self.optimal_h_for_gain = cache_function_last_result(
            self.optimal_h_for_gain)
        self.optimal_h_for_risk = cache_function_last_result(
            self.optimal_h_for_risk)
        self.sample_predictive_posterior = cache_function_last_result(
            self.sample_predictive_posterior)
) sleep(2) for org in s.select("table a"): orgname = org.text.strip() if filterOrg != None and filterOrg != orgname: continue try: if not os.path.exists(os.path.join(data, orgname)): os.mkdir(os.path.join(data, orgname)) orglink = urljoin(domain, org['href']) print2() log("{} {} {} {} {}".format(sport, year, division, orgname, dumpURL(orglink))) cs = parseURL(orglink) for link in cs.select("#contentarea > a"): if link.text.strip() == "Roster": tq.enqueue( crawlTeam, sport, year, division, org.text.strip(), "Roster", urljoin(domain, link["href"]), 1, at_front=atFront )