def predict_proba(self, X_home, X_away, n_max=20):
    """
    Predict match outcome probabilities.

    Parameters
    ----------
    X_home: 2d array-like, shape (n_samples, n_features).
        Input information to predict estimated goals for home team.
    X_away: 2d array-like, shape (n_samples, n_features).
        Input information to predict estimated goals for away team.
    n_max: int, no less than 0.
        Largest goal difference (in either direction) included when the
        win and lose probabilities are summed.

    Returns
    -------
    p_matrix: 2d array, shape (n_samples, 3).
        Matrix of estimated probabilities. Each row holds the win, draw
        and lose probabilities (in that order) of one match.
    """
    home_rate, away_rate = self._lambda(X_home, X_away)

    # Column vectors, so every sample broadcasts against the row of margins.
    mu_home = home_rate.reshape(-1, 1)
    mu_away = away_rate.reshape(-1, 1)
    steps = np.arange(n_max)

    win = skellam.pmf(steps + 1, mu_home, mu_away).sum(axis=1)       # +1 .. +n_max
    draw = skellam.pmf(0, mu_home, mu_away).sum(axis=1)
    lose = skellam.pmf(steps - n_max, mu_home, mu_away).sum(axis=1)  # -n_max .. -1

    return np.array([win, draw, lose]).transpose()
def calculateStrategicUtilities(self, passedCandidates, passedElectors, MIN_UTIL, iteration):
    """
    Recompute this elector's strategic utilities for one iteration.

    The method counts vote intentions, removes the elector's own vote,
    fills the pairwise tie / pivotality / winner probability matrices from
    Skellam distributions over the remaining vote counts, combines them
    into ``self.pivotalities`` and finally weights utility differences by
    those pivotalities.

    Parameters
    ----------
    passedCandidates: sequence of candidates; only its length and the
        value returned by ``self.chooseCandidate`` are used here.
    passedElectors: forwarded to ``GlobalFuncs.countVoteIntentions``.
    MIN_UTIL: value forced onto the candidate with the lowest sincere utility.
    iteration: 0 seeds the previous utilities from ``self.sincereUtilities``;
        any other value reuses ``self.strategicUtilities``.

    Returns
    -------
    ``self.strategicUtilities`` (the same object as ``self.newUtilitySum``).
    """
    import copy

    nCandidates = len(passedCandidates)
    self.allVotes = GlobalFuncs.countVoteIntentions(passedElectors,
                                                    passedCandidates, iteration)
    self.chosenCandidate = self.chooseCandidate(passedCandidates, iteration)

    # Work on a copy: assigning the same object would also remove this
    # elector's vote from self.allVotes.
    self.othersVotes = copy.copy(self.allVotes)
    self.othersVotes[self.chosenCandidate.ID] = \
        self.othersVotes[self.chosenCandidate.ID] - 1

    for rowIndex in range(nCandidates):
        for colIndex in range(nCandidates):
            if rowIndex == colIndex:
                self.tieProbs[rowIndex, colIndex] = 1
                self.pivotalityProbs[rowIndex, colIndex] = 1
                self.winnerProbs[rowIndex, colIndex] = 1
            else:
                skellamA = self.othersVotes[rowIndex]
                skellamB = self.othersVotes[colIndex]
                # A rate of exactly 0 is replaced by a tiny positive value
                # before it is handed to the Skellam functions.
                if skellamA == 0:
                    skellamA = 10**-100
                if skellamB == 0:
                    skellamB = 10**-100
                self.tieProbs[rowIndex, colIndex] = skellam.pmf(0, skellamA, skellamB)
                self.pivotalityProbs[rowIndex, colIndex] = skellam.pmf(-1, skellamA, skellamB)
                self.winnerProbs[rowIndex, colIndex] = 1 - skellam.cdf(-1, skellamA, skellamB)

    #UNCOMMENT ONLY IN CASE OF PROBLEMS WITH 0 ENTRIES###############
    #for rowIndex in range(0,nCandidates):
    #    for colIndex in range(0,nCandidates):
    #        if math.isnan(self.tieProbs[rowIndex,colIndex]):
    #            self.tieProbs[rowIndex,colIndex] = 0
    #        if math.isnan(self.pivotalityProbs[rowIndex,colIndex]):
    #            self.pivotalityProbs[rowIndex,colIndex] = 0
    #        if math.isnan(self.winnerProbs[rowIndex,colIndex]):
    #            self.winnerProbs[rowIndex,colIndex] = 0
    #################################################################

    for rowIndex in range(nCandidates):
        for colIndex in range(nCandidates):
            if rowIndex != colIndex:
                # Product of all winner probabilities outside this pair's
                # row and column.
                probsWithoutRow = np.delete(self.winnerProbs, rowIndex, 0)
                probsWithoutPair = np.delete(probsWithoutRow, colIndex, 1)
                probsProd = np.prod(probsWithoutPair)
                otherPivsSum = (self.pivotalityProbs[rowIndex, colIndex]
                                + self.winnerProbs[rowIndex, colIndex])
                self.pivotalities[rowIndex, colIndex] = probsProd * otherPivsSum

    if iteration == 0:
        self.previousUtilities = self.sincereUtilities
    else:
        self.previousUtilities = self.strategicUtilities

    for cand in range(nCandidates):
        for otherCand in range(nCandidates):
            utilityDiff = self.previousUtilities[cand] - self.sincereUtilities[otherCand]
            self.newUtilDiff[otherCand] = utilityDiff * self.pivotalities[cand, otherCand]
        self.newUtilitySum[cand] = np.sum(self.newUtilDiff)

    self.newUtilitySum[np.argmin(self.sincereUtilities)] = MIN_UTIL
    self.strategicUtilities = self.newUtilitySum
    return self.strategicUtilities
def checkDiffInGoals(self, data=None):
    """
    Plot the observed home-minus-away goal difference against a Skellam fit.

    ``data`` is passed through ``self._genData``; the result must expose
    ``HomeGoals`` and ``AwayGoals`` columns. The Skellam rates are the means
    of the first two numeric columns of that frame. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    data = self._genData(data)

    # numeric_only reproduces the old pandas default of skipping text columns;
    # .iloc keeps the lookup positional (plain [0] is deprecated on Series).
    col_means = data.mean(numeric_only=True)
    skellam_pred = [
        skellam.pmf(i, col_means.iloc[0], col_means.iloc[1])
        for i in range(-6, 8)
    ]

    # `normed` was removed from matplotlib's hist; `density` is the replacement.
    plt.hist(data[['HomeGoals']].values - data[['AwayGoals']].values,
             range(-6, 8),
             alpha=0.7,
             label='Actual',
             density=True)
    # Markers sit at bin centres, hence the +0.5 shift.
    plt.plot([i + 0.5 for i in range(-6, 8)],
             skellam_pred,
             linestyle='-',
             marker='o',
             label="Skellam",
             color='#CD5C5C')
    plt.legend(loc='upper right', fontsize=13)
    plt.xticks([i + 0.5 for i in range(-6, 8)], [i for i in range(-6, 8)])
    plt.xlabel("Home Goals - Away Goals", size=13)
    plt.ylabel("Proportion of Matches", size=13)
    plt.title("Difference in Goals Scored (Home Team vs Away Team)",
              size=14,
              fontweight='bold')
    plt.ylim([-0.004, 0.26])
    plt.tight_layout()
    plt.show()
def overtime(mu1, mu2, min, up, outcome):
    """
    Probability of an extra-time outcome for team 1.

    Parameters
    ----------
    mu1, mu2: scoring rates of team 1 and team 2; they are scaled by the
        remaining fraction of extra time and by the module-level ``ot_ft``.
    min: current minute. Anything below 90 is treated as minute 90 with a
        level score.
    up: current goal lead of team 1 (ignored when ``min`` < 90).
    outcome: ``"draw"``, ``"lose"``; any other value is treated as a win.

    Returns
    -------
    The probability of the requested outcome.

    Notes
    -----
    Reads ``stoppage_1ot``, ``stoppage_2ot`` and ``ot_ft`` from module scope.
    """
    # Sloppy
    if min < 90:
        aup, amin = 0, 90
    else:
        aup, amin = up, min

    if amin > 120:
        # Extra time is over, so no further goals are possible and the
        # current lead decides the outcome. This is the zero-remaining-time
        # limit of the Skellam expressions below.
        if outcome == "draw":
            return 1.0 if aup == 0 else 0.0
        if outcome == "lose":
            return 1.0 if aup <= -1 else 0.0
        return 1.0 if aup >= 1 else 0.0

    if amin <= 105:
        # 1st extra time
        time_r = (120.0 - amin) + stoppage_1ot + stoppage_2ot
    else:
        # 2nd extra time
        time_r = (120.0 - amin) + stoppage_2ot
    ft = time_r / (30.0 + stoppage_1ot + stoppage_2ot)

    if outcome == "draw":
        p = skellam.pmf(-aup, ft * mu1 * ot_ft, ft * mu2 * ot_ft)
    elif outcome == "lose":
        p = skellam.cdf(-1 - aup, ft * mu1 * ot_ft, ft * mu2 * ot_ft)
    else:
        # Win: evaluated from team 2's perspective with the rates swapped.
        p = skellam.cdf(-1 + aup, ft * mu2 * ot_ft, ft * mu1 * ot_ft)
    return p
def outcome(mu1, mu2, min, up, outcome):
    """
    Probability of a regulation-time outcome for team 1.

    ``min`` is the current minute and ``up`` the current goal lead of team 1.
    ``outcome`` is ``"draw"``, ``"lose"``, or anything else for a win.
    Past minute 90 the result is reported as a certain draw. Reads
    ``stoppage_1reg`` and ``stoppage_2reg`` from module scope.
    """
    # Sloppy
    if min > 90:
        return 1.0 if outcome == "draw" else 0.0

    if min <= 45:
        # 1st half
        time_r = (90.0 - min) + stoppage_1reg + stoppage_2reg
    elif min <= 90:
        # 2nd half
        time_r = (90.0 - min) + stoppage_2reg

    # Fraction of the full match (including stoppage) still to be played.
    ft = time_r / (90.0 + stoppage_1reg + stoppage_2reg)

    if outcome == "draw":
        return skellam.pmf(-up, mu1 * ft, mu2 * ft)
    if outcome == "lose":
        return skellam.cdf(-1 - up, mu1 * ft, mu2 * ft)
    # Win: evaluated from team 2's perspective with the rates swapped.
    return skellam.cdf(-1 + up, mu2 * ft, mu1 * ft)
def expected_result(elo_a, elo_b, winning_margin):
    """
    Expected result for side A, using the two ratings as Skellam rates.

    Returns ``cdf(winning_margin) + 0.5 * pmf(winning_margin) - 0.3``.

    https://en.wikipedia.org/wiki/Elo_rating_system#Mathematical_details
    """
    prob_at_most = skellam.cdf(winning_margin, elo_a, elo_b)
    prob_exact = skellam.pmf(winning_margin, elo_a, elo_b)
    return (prob_at_most + 0.5 * prob_exact) - 0.3
def BuildPoissonModels(hist_data, feature_list, comp_data=None):
    '''
    Build score predictions via (linear) poisson regression.

    Two Poisson GEE models are fitted on ``hist_data``, one per team score,
    with ``feature_list`` as regressors. Fitted values are written back to
    ``hist_data`` as ``team_1_score_pred`` / ``team_2_score_pred``.

    When ``comp_data`` is given, it receives the same two prediction columns
    plus ``team_1_prob``, ``team_tie_prob`` and ``team_2_prob``, derived from
    the Skellam distribution of the predicted score difference.

    Both frames are modified in place.

    Returns ``hist_data`` alone when ``comp_data`` is None, otherwise the
    tuple ``(hist_data, comp_data)``.
    '''
    regressors = " + ".join(feature_list)
    formula_1 = "team_1_score ~ " + regressors
    formula_2 = "team_2_score ~ " + regressors

    # using the GEE package along with independence assumptions to fit the
    # poisson model.
    # NOTE(review): original author was unsure whether this is a maximum
    # likelihood fit - confirm against the statsmodels GEE documentation.
    fam = Poisson()
    ind = Independence()
    model_1 = GEE.from_formula(formula_1,
                               "team_1_score",
                               hist_data,
                               cov_struct=ind,
                               family=fam)
    model_2 = GEE.from_formula(formula_2,
                               "team_2_score",
                               hist_data,
                               cov_struct=ind,
                               family=fam)
    model_1_fit = model_1.fit()
    model_2_fit = model_2.fit()
    print(model_1_fit.summary())

    hist_data['team_1_score_pred'] = model_1_fit.predict(hist_data)
    hist_data['team_2_score_pred'] = model_2_fit.predict(hist_data)

    # return historical data if comp_data wasn't passed.
    if comp_data is None:
        return hist_data

    # prepare comp data
    comp_data['team_1_score_pred'] = model_1_fit.predict(comp_data[feature_list])
    comp_data['team_2_score_pred'] = model_2_fit.predict(comp_data[feature_list])

    # skellam broadcasts over arrays, so the three outcome columns are
    # computed in one call each instead of row by row.
    mu_1 = comp_data['team_1_score_pred'].values
    mu_2 = comp_data['team_2_score_pred'].values
    comp_data['team_1_prob'] = 1 - skellam.cdf(0, mu_1, mu_2)   # diff >= 1
    comp_data['team_tie_prob'] = skellam.pmf(0, mu_1, mu_2)     # diff == 0
    comp_data['team_2_prob'] = skellam.cdf(-1, mu_1, mu_2)      # diff <= -1
    return hist_data, comp_data
def oddspredict2(fixtures, att_params, def_params, hmean, amean):
    """
    Per-fixture value ``P(diff <= -1) + 0.5 * P(diff == 0)``.

    ``fixtures`` is indexed as ``fixtures[j, 0]`` / ``fixtures[j, 1]``; those
    two entries select the attack and defence parameters of each side. The
    Skellam rates are the product of the mean of ``hmean`` and ``amean`` with
    one attack and one defence parameter. ``diff`` is the first side's goals
    minus the second side's goals.
    """
    neutralscore = (hmean + amean) / 2

    def _fixture_value(first, second):
        lamda = neutralscore * att_params[first] * def_params[second]
        mu = neutralscore * att_params[second] * def_params[first]
        below_zero = skellam.cdf(-1, lamda, mu)
        at_zero = skellam.pmf(0, lamda, mu)
        return below_zero + at_zero * 0.5

    return [
        _fixture_value(fixtures[j, 0], fixtures[j, 1])
        for j in range(len(fixtures))
    ]
def calculateProb(home_id, away_id, params, num_teams):
    '''
    Function to calculate the outcome probabilities between two teams

    @param home_id: Home team id
    @param away_id: Away team id
    @param params: Array of parameters, result of the minimization problem.
        params[0] is the intercept and params[1] the home effect; the other
        entries are read at offsets 1 + id and num_teams + 1 + id.
    @param num_teams: Number of teams, used to locate the second parameter
        group inside params
    @return: Array (home_win, draw, home_loss). Win and loss are summed over
        goal differences 1 to 19 only.
    '''
    mu = params[0]
    h = params[1]
    lambda_one = np.exp(mu + h + params[1 + home_id] +
                        params[num_teams + 1 + away_id])
    lambda_two = np.exp(mu + params[1 + away_id] +
                        params[num_teams + 1 + home_id])

    margins = np.arange(1, 20)
    home_win = skellam.pmf(margins, lambda_one, lambda_two).sum()
    home_loss = skellam.pmf(-margins, lambda_one, lambda_two).sum()
    # The draw probability does not depend on the margin; compute it once.
    draw = skellam.pmf(0, lambda_one, lambda_two)
    return np.array((home_win, draw, home_loss))
def sim_season(dataframe, iterations):
    """
    Simulate a season several times and return average points per team.

    Parameters
    ----------
    dataframe: fixtures with columns ``Home``, ``Away``, ``Home xG`` and
        ``Away xG``.
    iterations: number of simulated seasons; must be non-zero because the
        totals are divided by it.

    Returns
    -------
    List of ``(team, average_points)`` tuples sorted by points, highest first.
    """
    # Team names as keys and points (initially 0) as values. Away teams are
    # included so a side that never plays at home can still collect points.
    teams = (dataframe['Home'].unique().tolist()
             + dataframe['Away'].unique().tolist())
    d = dict.fromkeys(teams, 0)

    # Outcome weights depend only on the fixture, so compute them once
    # instead of once per simulated season.
    fixtures = []
    for _, row in dataframe.iterrows():
        h_xg = row['Home xG']
        a_xg = row['Away xG']
        # Prob of home winning by 1-9 goals
        h_win = sum([skellam.pmf(sup, h_xg, a_xg) for sup in range(1, 10)])
        # Prob of away winning by 1-9 goals
        a_win = sum([skellam.pmf(-sup, h_xg, a_xg) for sup in range(1, 10)])
        # Supremacy of 0 is a draw
        draw = skellam.pmf(0, h_xg, a_xg)
        fixtures.append((row['Home'], row['Away'], [h_win, draw, a_win]))

    for _ in range(iterations):
        for home, away, weights in fixtures:
            # Calculate match outcome
            result = random.choices(('Home', 'Draw', 'Away'),
                                    weights=weights)[0]
            # Add 3 points for win, 1 for draw
            if result == 'Home':
                d[home] += 3
            elif result == 'Away':
                d[away] += 3
            else:
                d[home] += 1
                d[away] += 1

    # Average points rather than total
    averages = {team: pts / iterations for team, pts in d.items()}

    # (team, points) tuples sorted by points
    return sorted(averages.items(), key=lambda x: x[1], reverse=True)
def calculateProb(z, home_team, away_team, team_dict, params, num_teams=20):
    '''
    Function to calculate the outcome probabilities between two teams

    @param z: Goal difference
    @param home_team: Home team string
    @param away_team: Away team string
    @param team_dict: Dictionary mapping team names to integers
    @param params: Array of parameters, result of the minimization problem.
        params[0] is the intercept and params[1] the home effect; the other
        entries are read at offsets 1 + id and num_teams + 1 + id.
    @param num_teams: Number of teams, used to locate the second parameter
        group inside params. The default of 20 gives the offset 21 that was
        previously hard-coded.
    @return: Skellam probability of goal difference z
    '''
    home_id = team_dict[home_team]
    away_id = team_dict[away_team]
    mu = params[0]
    h = params[1]
    offset = num_teams + 1
    lambda_one = np.exp(mu + h + params[1 + home_id] + params[offset + away_id])
    lambda_two = np.exp(mu + params[1 + away_id] + params[offset + home_id])
    return skellam.pmf(z, lambda_one, lambda_two)
def logp(self, dist):
    """
    Log-probability of ``dist`` under Skellam(``self.mu1``, ``self.mu2``).

    ``dist`` is truncated to an int and clamped to [-100, 100]. The
    probability is floored at 1e-6 before the logarithm is taken. Results
    are cached in ``self._distribution_memo``, keyed by the clamped value.
    """
    lower, upper = -100, 100
    key = max(lower, min(upper, int(dist)))

    memo = self._distribution_memo
    if key in memo:
        return memo[key]

    prob = skellam.pmf(key, mu1=self.mu1, mu2=self.mu2)
    if prob <= 1e-6:
        # Floor: avoids log(0) and bounds the penalty for unlikely values.
        prob = 1e-6
    result = math.log(prob)
    memo[key] = result
    return result
def oddspredict(fixtures, att_params, def_params, hmean, amean):
    """
    Per-fixture value ``1 - (P(diff < 0) + 0.5 * P(diff == 0))``.

    ``fixtures`` is indexed as ``fixtures[j, 0]`` / ``fixtures[j, 1]``; those
    two entries select the attack and defence parameters of each side. The
    Skellam rates are the product of the mean of ``hmean`` and ``amean`` with
    one attack and one defence parameter. ``diff`` is the first side's goals
    minus the second side's goals.

    Returns a list with one float per fixture.
    """
    resultodds = []
    neutralscore = (hmean + amean) / 2
    for j in range(len(fixtures)):
        first, second = fixtures[j, 0], fixtures[j, 1]
        lamda = neutralscore * att_params[first] * def_params[second]
        mu = neutralscore * att_params[second] * def_params[first]
        # P(diff <= -1) from the cdf replaces the manual pmf sum over
        # -75..-1; half of the draw mass is added on top.
        p_aw = skellam.cdf(-1, lamda, mu) + 0.5 * skellam.pmf(0, lamda, mu)
        resultodds.append(1 - p_aw)
    return resultodds
def likelihoodFn(params, data, num_teams=20):
    '''
    Function to specify the likelihood given a set of parameters and data

    @param params: Array of parameters to use. params[0] is the intercept and
        params[1] the home effect; the other entries are read at offsets
        1 + id and num_teams + 1 + id.
    @param data: 2d array of data to use; columns are home id, away id and
        goal difference
    @param num_teams: Number of teams, used to locate the second parameter
        group inside params. The default of 20 gives the offset 21 that was
        previously hard-coded.
    @return: Negative log-likelihood summed over all rows (0 for no rows)
    '''
    params = np.asarray(params)
    data = np.asarray(data)
    if data.shape[0] == 0:
        return 0

    home_id = data[:, 0].astype(int)
    away_id = data[:, 1].astype(int)
    z = data[:, 2]

    mu = params[0]
    h = params[1]
    offset = num_teams + 1
    # All rows are evaluated in one vectorised call instead of a Python loop.
    lambda_one = np.exp(mu + h + params[1 + home_id] + params[offset + away_id])
    lambda_two = np.exp(mu + params[1 + away_id] + params[offset + home_id])
    return -np.sum(np.log(skellam.pmf(z, lambda_one, lambda_two)))
df_future = df.loc[df['homeGoals'].isnull()] df_past['matchDate'] = pd.to_datetime(df_past['matchDate']) df_past['time_diff'] = (max(df_past['matchDate']) - df_past['matchDate']).dt.days df_past = df_past[[ 'homeTeam', 'homeGoals', 'awayTeam', 'awayGoals', 'time_diff' ]] df_past.head() # ============================================================================= # poisson regression model # ============================================================================= # work out poisson probabilities of goal differences between home and away team of - 8 to plus 8 skellam_pred = [ skellam.pmf(i, df_past['homeGoals'].mean(), df_past['awayGoals'].mean()) for i in range(-8, 8) ] # restructure dataframe by splitting home and away fixtures goal_model_data = pd.concat([ df_past[['homeTeam', 'awayTeam', 'homeGoals']].assign(home=1).rename(columns={ 'homeTeam': 'team', 'awayTeam': 'opponent', 'homeGoals': 'goals' }), df_past[['awayTeam', 'homeTeam', 'awayGoals']].assign(home=0).rename(columns={ 'awayTeam': 'team', 'homeTeam': 'opponent',
def _probGoalsDiff(self, diff, data):
    """
    Skellam probability of a goal difference of ``diff``.

    The two rates are the means of the first and second numeric column of
    ``data`` (a pandas DataFrame), taken by position.
    """
    # One pass over the frame; numeric_only skips text columns and .iloc
    # keeps the lookup positional (plain [0] is deprecated on Series).
    col_means = data.mean(numeric_only=True)
    return skellam.pmf(diff, col_means.iloc[0], col_means.iloc[1])
from scipy.stats import skellam
import numpy as np
import matplotlib.pyplot as plt

# Plot the Skellam pmf for two small rates between its 1% and 99% quantiles.
fig, ax = plt.subplots()
mu1, mu2 = 0.03, 0.02

mean, var, skew, kurt = skellam.stats(mu1, mu2, moments='mvsk')
print(mean, var, skew, kurt)

lower = skellam.ppf(0.01, mu1, mu2)
upper = skellam.ppf(0.99, mu1, mu2)
x = np.arange(lower, upper)  # upper bound itself is excluded

ax.plot(x, skellam.pmf(x, mu1, mu2), 'bo', ms=8, label='skellam pmf')
ax.vlines(x, 0, skellam.pmf(x, mu1, mu2), colors='b', lw=5, alpha=0.5)
plt.show()
import numpy as np
from scipy.stats import skellam
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

# Calculate a few first moments:
mu1, mu2 = 15, 8
mean, var, skew, kurt = skellam.stats(mu1, mu2, moments='mvsk')

# Display the probability mass function (``pmf``):
# (np.arange needs numpy, which this script previously never imported.)
x = np.arange(skellam.ppf(0.01, mu1, mu2),
              skellam.ppf(0.99, mu1, mu2))
ax.plot(x, skellam.pmf(x, mu1, mu2), 'bo', ms=8, label='skellam pmf')
ax.vlines(x, 0, skellam.pmf(x, mu1, mu2), colors='b', lw=5, alpha=0.5)

# Alternatively, the distribution object can be called (as a function)
# to fix the shape and location. This returns a "frozen" RV object holding
# the given parameters fixed.

# Freeze the distribution and display the frozen ``pmf``:
rv = skellam(mu1, mu2)
ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1,
          label='frozen pmf')
ax.legend(loc='best', frameon=False)
plt.show()
'weight': 'bold' }) plt.xticks([i - 0.5 for i in range(1, 9)], [i for i in range(9)]) plt.xlabel("Goals per Match", size=13) plt.ylabel("Proportion of Matches", size=13) plt.title("Number of Goals per Match (EPL Season 02/03 - 13/14 )", size=14, fontweight='bold') plt.ylim([-0.004, 0.4]) plt.tight_layout() plt.show() skellam_pred = [ skellam.pmf(i, Training_Set.mean()[0], Training_Set.mean()[1]) for i in range(-6, 9) ] plt.hist(Training_Set[['FTHG']].values - Training_Set[['FTAG']].values, range(-6, 9), alpha=0.7, label='Actual', normed=True) plt.plot([i + 0.5 for i in range(-6, 9)], skellam_pred, linestyle='-', marker='o', label="Skellam", color='#CD5C5C') plt.legend(loc='upper right', fontsize=13) plt.xticks([i + 0.5 for i in range(-6, 9)], [i for i in range(-6, 9)])
# | | | car rental 1 # | # | car rental 2 rent_1 = 3 return_1 = 3 rent_2 = 4 returns_2 = 2 move_cost = 20 car_reward = 100 gamma = 0.9 M = 20 threshold = 12 # skellam distribution rentals = [[skellam.pmf(x, 3, 3) for x in range(-20, 21, 1)], [skellam.pmf(x, 2, 4) for x in range(-20, 21, 1)]] # poisson distribution rents = [[poisson.pmf(x, mu=3) for x in range(21)], [poisson.pmf(x, mu=4) for x in range(21)]] returns = [[poisson.pmf(x, mu=3) for x in range(21)], [poisson.pmf(x, mu=2) for x in range(21)]] rents_cdf = [[poisson.cdf(x, mu=3) for x in range(21)], [poisson.cdf(x, mu=4) for x in range(21)]] returns_cdf = [[poisson.cdf(x, mu=3) for x in range(21)], [poisson.cdf(x, mu=2) for x in range(21)]] def get_particular_prob(i, i_poss, rent_num): if i_poss == M: partial_sum = 1.0 for x in range(0, i + 1): partial_sum -= rents[rent_num][x] * (returns_cdf[rent_num][x + i_poss - i] - returns[rent_num][x + i_poss - i]) partial_sum -= (1 - rents_cdf[rent_num][i]) * (returns_cdf[rent_num][i_poss])
plt.ylabel("Proportion of Matches",size=13)
plt.title("Number of Goals per Match (Tippeligaen 2012 Season)",size=14,fontweight='bold')
plt.ylim([-0.004, 0.4])
plt.tight_layout()
plt.show()

# Share of home goals vs away goals. (Note that we consider the number of
# goals scored by each team to be independent events (i.e. P(A n B) = P(A) P(B)).
# The difference of two Poisson distributions is called a Skellam distribution.)

# Means of the first two numeric columns, taken once. numeric_only skips text
# columns and .iloc keeps the lookup positional (plain [0] is deprecated on
# Series).
goal_means = norske_resultater.mean(numeric_only=True)
skellam_pred = [skellam.pmf(i, goal_means.iloc[0], goal_means.iloc[1])
                for i in range(-6, 8)]

plt.hist(norske_resultater[['HomeGoals']].values - norske_resultater[['AwayGoals']].values,
         range(-6, 8), alpha=0.7, label='Actual', density=True)
# Markers sit at bin centres, hence the +0.5 shift.
plt.plot([i+0.5 for i in range(-6,8)], skellam_pred,
         linestyle='-', marker='o', label="Skellam", color='#CD5C5C')
plt.legend(loc='upper right', fontsize=13)
plt.xticks([i+0.5 for i in range(-6,8)], [i for i in range(-6,8)])
plt.xlabel("Home Goals - Away Goals",size=13)
plt.ylabel("Proportion of Matches",size=13)
plt.title("Difference in Goals Scored (Home Team vs Away Team)",size=14,fontweight='bold')
plt.ylim([-0.004, 0.26])
plt.tight_layout()
plt.show()
################################################################################ Debug.Print("rescale factor is: " + str(G_RESCALE_FACTOR)) # Initialize mu_A and mu_B randomly mu_A = N_VOTERS_PREF_A = int(np.random.uniform(0, N_VOTERS)) mu_B = N_VOTERS_PREF_B = N_VOTERS - mu_A Debug.Print("mu_A is: " + str(mu_A)) Debug.Print("mu_B is: " + str(mu_B)) #First two alphas are drawn from a skellam distribution with some upward rescale alpha1 = skellam.pmf(0, mu_A, mu_B) #prob n1 == n2 alpha2 = skellam.pmf(-1, mu_A, mu_B) #prob n1 == n2 - 1 alpha1 *= G_RESCALE_FACTOR alpha2 *= G_RESCALE_FACTOR Debug.Print("alpha1 is: " + str(alpha1)) Debug.Print("alpha2 is: " + str(alpha2) + "\n") # Init empty lists of length N_VOTERS_PREF A and B electorsA = [None] * N_VOTERS_PREF_A electorsB = [None] * N_VOTERS_PREF_B n_A = 0 for i in range(0, N_VOTERS_PREF_A):
'size': '14', 'weight': 'bold' }) plt.xticks([i - 0.5 for i in range(1, 9)], [i for i in range(9)]) plt.xlabel("Goals per Match", size=13) plt.ylabel("Proportion of Matches", size=13) plt.title("Number of Goals per Match (EPL 2016/17 Season)", size=14, fontweight='bold') plt.ylim([-0.004, 0.4]) plt.tight_layout() plt.show() # probability of draw between home and away team skellam.pmf(0.0, epl_1617.mean()[0], epl_1617.mean()[1]) # probability of home team winning by one goal skellam.pmf(1, epl_1617.mean()[0], epl_1617.mean()[1]) skellam_pred = [ skellam.pmf(i, epl_1617.mean()[0], epl_1617.mean()[1]) for i in range(-6, 8) ] plt.hist(epl_1617[['HomeGoals']].values - epl_1617[['AwayGoals']].values, range(-6, 8), alpha=0.7, label='Actual', normed=True)
import numpy as np
import seaborn
from scipy.stats import poisson, skellam

## Dataset
ice = pd.read_csv("C:/data/hockey.csv")
display(ice)
ice.columns
ice = ice[['Home','Visitor','G.1','G']]
ice = ice.rename(columns={'G.1': 'Home Goals', 'G': 'Away Goals'})

# Only the two goal columns are numeric; numeric_only keeps the team-name
# columns out of the mean.
goal_means = ice.mean(numeric_only=True)
goal_means
home_mean = goal_means.iloc[0]
away_mean = goal_means.iloc[1]

## using Skellam statistics
### probability of draw between home and away team
skellam.pmf(0, home_mean, away_mean)
### probability of home team winning by one goal
skellam.pmf(1, home_mean, away_mean)
### probability of home team winning by two goals
skellam.pmf(2, home_mean, away_mean)
### probability of home team losing by one goal
skellam.pmf(-1, home_mean, away_mean)

## importing the tools required for the Poisson regression model
import statsmodels.api as sm
import statsmodels.formula.api as smf

ice.head()
# Copy the slice so adding a column below does not write into a view of `ice`.
ice_h = ice[['Home','Visitor','Home Goals']].copy()
ice_h.columns = ['team','opponent','goals']
ice_h['field'] = 'home'
def _skellam_pmf(x, mu0, mu1):
    """
    Probability mass function of the Skellam distribution.

    Thin wrapper around ``scipy.stats.skellam.pmf``: ``mu0`` and ``mu1`` are
    passed as scipy's ``mu1`` and ``mu2`` respectively, with ``loc`` fixed
    at 0.
    """
    return skellam.pmf(x, mu1=mu0, mu2=mu1, loc=0)
def prob_win_change(n):
    """
    Build an ``(n + 1, 4801)`` table of halved Skellam probabilities.

    Row ``i`` holds ``0.5 * skellam.pmf(i, new_mean, new_mean)``, where
    ``new_mean`` is read from module scope and used for both rates.
    """
    table = np.zeros((n + 1, 4801))
    for diff in np.arange(n + 1):
        row = skellam.pmf(diff, new_mean, new_mean) * 0.5
        table[diff, :] = row
    return table
def BuildPoissonXGBTree(hist_data, feature_list, comp_data=None):
    '''
    Build score predictions via (tree based) poisson regression.

    Two xgboost boosters with the ``count:poisson`` objective are trained on
    ``hist_data``, one per team score, with ``feature_list`` as inputs.
    Training-set predictions are written back to ``hist_data`` as
    ``team_1_score_pred`` / ``team_2_score_pred``.

    When ``comp_data`` is given, it receives the same two prediction columns
    plus ``team_1_prob``, ``team_tie_prob`` and ``team_2_prob``, derived from
    the Skellam distribution of the predicted score difference.

    Both frames are modified in place.

    Returns ``hist_data`` alone when ``comp_data`` is None, otherwise the
    tuple ``(hist_data, comp_data)``.
    '''
    dtrain_1 = xgb.DMatrix(data=np.matrix(hist_data[feature_list]),
                           label=np.array(hist_data["team_1_score"]),
                           feature_names=feature_list)
    dtrain_2 = xgb.DMatrix(data=np.matrix(hist_data[feature_list]),
                           label=np.array(hist_data["team_2_score"]),
                           feature_names=feature_list)

    # Both boosters use the same settings; each gets its own dict copy.
    base_params = {
        'max_depth': 2,
        'eta': 0.1,
        'silent': 1,
        'objective': 'count:poisson',
        'nthread': 8,
        'eval_metric': 'poisson-nloglik',
    }
    param_1 = dict(base_params)
    param_2 = dict(base_params)

    # Only the training set is watched during boosting; no test set is
    # evaluated here.
    evallist_1 = [(dtrain_1, 'train')]
    evallist_2 = [(dtrain_2, 'train')]

    num_round = 100
    bst_1 = xgb.train(param_1, dtrain_1, num_round, evallist_1)
    bst_2 = xgb.train(param_2, dtrain_2, num_round, evallist_2)

    hist_data["team_1_score_pred"] = bst_1.predict(dtrain_1)
    hist_data["team_2_score_pred"] = bst_2.predict(dtrain_2)

    if comp_data is None:
        return hist_data

    dcomp = xgb.DMatrix(data=np.matrix(comp_data[feature_list]),
                        feature_names=feature_list)

    # prepare comp data
    comp_data['team_1_score_pred'] = bst_1.predict(dcomp)
    comp_data['team_2_score_pred'] = bst_2.predict(dcomp)

    # skellam broadcasts over arrays, so the three outcome columns are
    # computed in one call each instead of row by row.
    mu_1 = comp_data['team_1_score_pred'].values
    mu_2 = comp_data['team_2_score_pred'].values
    comp_data['team_1_prob'] = 1 - skellam.cdf(0, mu_1, mu_2)   # diff >= 1
    comp_data['team_tie_prob'] = skellam.pmf(0, mu_1, mu_2)     # diff == 0
    comp_data['team_2_prob'] = skellam.cdf(-1, mu_1, mu_2)      # diff <= -1
    return hist_data, comp_data
import numpy as np import seaborn from scipy.stats import poisson,skellam # Gather and manipulate the data epl_1718 = pd.read_csv('EPL DATA 2017-2018.csv') epl_1718 = epl_1718[['HomeTeam','AwayTeam','FTHG','FTAG']] epl_1718 = epl_1718.rename(columns={'FTHG': 'HomeGoals', 'FTAG':'AwayGoals'}) epl_1718.head() # Since we're predicting the last round of matches, we need to remove the last 10 rows epl_1718 = epl_1718[:-10] epl_1718.mean() # Probability of a draw between home and away team skellam.pmf(0.0, epl_1718.mean()[0], epl_1718.mean()[1]) # Probability of Home team winning by one goal skellam.pmf(1, epl_1718.mean()[0], epl_1718.mean()[1]) # Import some more tools for Poisson Regression import statsmodels.api as sm import statsmodels.formula.api as smf # Making the model goal_model_data = pd.concat([epl_1718[['HomeTeam', 'AwayTeam', 'HomeGoals']].assign(home=1).rename( columns = {'HomeTeam':'team', 'AwayTeam':'opponent', 'HomeGoals':'goals'}), epl_1718[['AwayTeam', 'HomeTeam', 'AwayGoals']].assign(home=0).rename( columns = {'AwayTeam':'team', 'HomeTeam':'opponent', 'AwayGoals':'goals'})]) poisson_model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data,
import pandas as pd import matplotlib.pyplot as plt import numpy as np import seaborn from scipy.stats import poisson, skellam import statsmodels.api as sm import statsmodels.formula.api as smf epl_1617 = pd.read_csv("http://www.football-data.co.uk/mmz4281/1617/E0.csv") epl_1617 = epl_1617[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']] epl_1617 = epl_1617.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'}) epl_1617.head() epl_1617 = epl_1617[:-10] epl_1617.mean() skellam.pmf(0.0, epl_1617.mean()[0], epl_1617.mean()[1]) skellam.pmf(1, epl_1617.mean()[0], epl_1617.mean()[1]) goal_model_data = pd.concat([ epl_1617[['HomeTeam', 'AwayTeam', 'HomeGoals']].assign(home=1).rename(columns={ 'HomeTeam': 'team', 'AwayTeam': 'opponent', 'HomeGoals': 'goals' }), epl_1617[['AwayTeam', 'HomeTeam', 'AwayGoals']].assign(home=0).rename(columns={ 'AwayTeam': 'team', 'HomeTeam': 'opponent', 'AwayGoals': 'goals' }) ]) poisson_model = smf.glm(formula="goals ~ home + team + opponent",
'opponent': homeTeam,'home':0}, index=[1])).values[0] team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in [home_goals_avg, away_goals_avg]] return(np.outer(np.array(team_pred[0]), np.array(team_pred[1]))) epl_1617 = pd.read_csv("E0.csv") epl_1617 = epl_1617[['HomeTeam','AwayTeam','FTHG','FTAG']] epl_1617 = epl_1617.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'}) print(epl_1617.head()) epl_1617 = epl_1617[:-10] print(epl_1617.mean()) print(skellam.pmf(0.0, epl_1617.mean()[0], epl_1617.mean()[1])) print(skellam.pmf(1, epl_1617.mean()[0], epl_1617.mean()[1])) goal_model_data = pd.concat([epl_1617[['HomeTeam','AwayTeam','HomeGoals']].assign(home=1).rename( columns={'HomeTeam':'team', 'AwayTeam':'opponent','HomeGoals':'goals'}), epl_1617[['AwayTeam','HomeTeam','AwayGoals']].assign(home=0).rename( columns={'AwayTeam':'team', 'HomeTeam':'opponent','AwayGoals':'goals'})]) poisson_model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data, family=sm.families.Poisson()).fit() print(poisson_model.summary()) a=poisson_model.predict(pd.DataFrame(data={'team': 'Chelsea', 'opponent': 'Sunderland',