Example #1
File: RunModelMLB.py Project: moltypod/bet

import csv
import sqlite3 as lite

import numpy as np
from sklearn import linear_model, metrics, svm
# NB: on sklearn >= 0.18 these live in sklearn.model_selection instead
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle

# Project helpers assumed to be defined elsewhere in the repo:
# standardize_features, normalize_features, add_id_column_to_csv,
# plot_confusion_matrix (sketches of the two scalers appear after the class)

class MLBMakePredictions(object):
    # Applies ML techniques to the scraped MLB data and uses the resulting
    # model to make predictions on unplayed games.
    # (NB: unplayed games live in mlb_db_name, e.g. mlb_team_data_x.db,
    # the database of the CURRENT SEASON, i.e. the season you want to predict.)

    # Imported in the class body so the scraper/prep classes become class
    # attributes, accessible as self.AcquireTeamStats etc.
    from ScrapeMLBTeamStats import AcquireTeamStats
    from ScrapeMLBGameStats import AcquireGameStats
    from PrepareForMLTechMLB import PrepareForML

    def __init__(self, current_season, feature_file, mlb_db_name):
        self.data = np.load(feature_file)
        self.tableau_input_filename = "mlb_tableau_input" + str(
            current_season) + '.csv'
        self.current_season = current_season
        self.X = self.data['X']
        self.y = self.data['y']
        self.mlb_db_name = mlb_db_name

    def __call__(self):
        print "Scraping MLB current season data for update...\n"
        self.acquire_current_season_data(self.current_season)
        # Re-train on the data from 'features.npz'; for the learning curves
        # the training step is performed inside the CV function instead
        print "Algorithm training..."
        self.train_logistic_regression()
        print "OK\n"

        print "Making predictions..."
        self.make_tableau_file(self.game_data_filename, self.datetime_filename)
        print "OK\n"
        add_id_column_to_csv(self.tableau_input_filename)

    # -----------------------ACQUIRE CURRENT SEASON DATA---------------------------
    def acquire_current_season_data(self, current_season):
        # Acquires all data structures needed to make predictions on the
        # current season

        team_data_filename = 'mlb_team_stats_' + str(current_season) + '.csv'
        game_data_filename = 'mlb_game_stats_' + str(current_season) + '.csv'
        datetime_filename = 'mlb_datetime_' + str(current_season) + '.csv'
        db_filename = 'mlb_team_data_' + str(current_season) + '.db'
        feature_filename = 'mlb_' + str(current_season) + '_features.npz'
        # If you want to filter games, uncomment and adjust the corresponding
        # section in the 'AcquireGameStats' class

        # Scrape data for the current season
        print "Scraping Team Stats..."
        mlb_teamdata = self.AcquireTeamStats(current_season, current_season,
                                             current_season,
                                             team_data_filename)
        mlb_teamdata()
        print "OK\n"

        print "Scraping Game Stats..."
        mlb_gamedata = self.AcquireGameStats(current_season, current_season,
                                             current_season,
                                             game_data_filename,
                                             datetime_filename)
        mlb_gamedata()
        print "OK\n"

        print "Preprocessing updated data for Machine Learning..."
        # Prepare for ML predictions
        pml = self.PrepareForML(game_data_filename, db_filename)
        pml.process_raw_data(team_data_filename)
        pml(feature_filename)
        print "OK\n"

        self.datetime_filename = datetime_filename
        self.game_data_filename = game_data_filename

    # -------------------------------------------CREATING CSV FILE FOR TABLEAU----------------------------------

    def make_tableau_file(self, game_data_filename, datetime_filename):
        # Produces a csv file containing predicted and actual game results for
        # the current season; Tableau uses its contents to build the visualizations

        with open(self.tableau_input_filename, 'wb') as writefile:
            tableau_write = csv.writer(writefile)
            tableau_write.writerow([
                'Visitor_Team', 'V_Team_PTS', 'Home_Team', 'H_Team_PTS',
                'True_Result', 'Predicted_Result', 'Confidence', 'Date'
            ])

            with open(game_data_filename, 'rb') as readfile, \
                    open(datetime_filename, 'rb') as readfile2:
                scorereader = csv.reader(readfile)
                scores = [row for row in scorereader][1:]  # drop the header row
                daysreader = csv.reader(readfile2)
                days = [day for day in daysreader]
                if len(scores) != len(days):
                    print("File lengths do not match")
                else:
                    for i in range(len(days)):
                        tableau_content = scores[i][1:]
                        tableau_date = days[i]
                        # Append True_Result (1 = home win); unplayed games
                        # have no scores yet, so skip the label for them
                        try:
                            if int(tableau_content[3]) > int(
                                    tableau_content[1]):
                                tableau_content.append(1)
                            else:
                                tableau_content.append(0)
                        except (ValueError, IndexError):
                            pass
                        # Append 'Predicted_Result' and 'Confidence'
                        prediction_results = self.make_predictions(
                            tableau_content[0], tableau_content[2])
                        tableau_content += list(prediction_results)
                        tableau_content += tableau_date

                        tableau_write.writerow(tableau_content)

    # -----------------------ADD INDEX COLUMN TO CSV FOR TABLEAU ANALYSIS-----------------------
    # 'add_id_column_to_csv' runs after 'make_tableau_file' to add an 'ID'
    # column to the 'tableau_input' file, indexing each game and thus making
    # per-game analysis in Tableau easier.
    # NB: this is the file to use for betting strategies in Tableau.

    # ----------------------------------------ALGORITHMS----------------------------------------
    # Each algorithm comes in two variants: 'instantiate + fit to the data'
    # and 'instantiate only' (the latter is required for k-fold CV)

    # Logistic Regression
    def train_logistic_regression(self, scale_data=False):
        # If you set scale_data=True, do the same in 'make_predictions' so the
        # test data (the current_season features) is scaled identically.
        # Preprocessing step, False by default: applies standardization via
        # 'standardize_features'; use 'normalize_features' for min-max scaling
        # instead (sketches of both appear after the class).
        # Hint: normalization is usually the better choice for SVM
        # (http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf), or try
        # both and compare the results with cross-validation.
        if scale_data:
            self.X = standardize_features(self.X)
        X, y = shuffle(self.X, self.y)
        self.logreg = linear_model.LogisticRegression()
        self.logreg.fit(X, y)

    def instantiate_logistic_regression(self):
        # Only instantiate the logistic regression model without fitting any
        # data; needed in the 'model_evaluation' function
        self.logreg2 = linear_model.LogisticRegression()

    # Radial Basis Function kernel SVM
    def train_rbf_svm(self, scale_data=True):
        # If you set scale_data=True, do the same in 'make_predictions' so the
        # test data (the current_season features) is scaled identically.
        # Preprocessing step, True by default (scaling is always recommended
        # for SVM): applies min-max scaling via 'normalize_features'; use
        # 'standardize_features' for standardization instead.
        # Hint: normalization is usually the better choice for SVM
        # (http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf), or try
        # both and compare the results with cross-validation.
        if scale_data:
            self.X = normalize_features(self.X)
        X, y = shuffle(self.X, self.y)
        self.clf = svm.SVC(probability=True, random_state=None)
        self.clf.fit(X, y)

    def instantiate_rbf_svm(self):
        # Only instantiate the RBF-kernel SVM without fitting any data
        self.clf2 = svm.SVC(probability=True, random_state=None)

    # AdaBoost with decision stumps
    def train_adaboost(self):
        X, y = shuffle(self.X, self.y)
        self.dbt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                      n_estimators=100)
        self.dbt.fit(X, y)

    def instantiate_adaboost(self):
        # Only instantiate the AdaBoost classifier without fitting any data
        self.dbt2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                       n_estimators=100)

    # --------------------------------------------------------------------------------------------------------------------------------------

    # -----------------------MAKE PREDICTIONS (used by the 'make_tableau_file' function)-----------------

    def make_predictions(self, team1, team2, scale_data=False):
        # Using the trained model, returns 1 if it thinks team2 will beat
        # team1, 0 otherwise.
        # Advice: respect the order (V_Team for team1, H_Team for team2) for
        # consistency with the PrepareForML class techniques

        query = 'SELECT * FROM Team_Stats WHERE Team = ?'

        con = lite.connect(self.mlb_db_name)
        with con:
            cur = con.cursor()
            cur.execute(query, (team1, ))
            feature1 = list(cur.fetchone()[2::])
            cur.execute(query, (team2, ))
            feature2 = list(cur.fetchone()[2::])
            feature = np.array(feature2).reshape(
                1, -1) - np.array(feature1).reshape(1, -1)

            if scale_data:
                feature = normalize_features(feature)

            # Make the prediction. Change according to the algorithm you want
            # to use; available classifiers: logreg, clf, dbt (change in 2 places)
            prediction_output = self.logreg.predict(
                feature)  # predicted class label for the feature vector
            prediction_probability = max(
                self.logreg.predict_proba(feature)
                [0])  # probability of "prediction_output"

            return prediction_output[0], prediction_probability
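
    # Example call (hypothetical team names; visitor first, home second):
    #   label, confidence = self.make_predictions('Boston Red Sox',
    #                                             'New York Yankees')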

    # -------------------------------------PARAMETRIC--------------------------------------------

    def cval_score(self):
        # change 'logreg' to 'clf' if you want to perform it on SVM (dbt also available)
        scores = cross_val_score(self.logreg, self.X, self.y, cv=10)
        print scores.mean(), scores.std()

    # Used for learning curves and model evaluation
    def train_test_split(self):
        self.trX, self.teX, self.trY, self.teY = train_test_split(
            self.X, self.y, test_size=0.30, random_state=None)

    # ----------------------------------CROSS VALIDATION AND MODEL EVALUATION-----------------------------

    # The function below takes a model and a scoring function, uses the
    # pre-split dataset (train/test X and y arrays; run the train_test_split
    # function first), iterates through the dataset training on n
    # exponentially spaced subsets, and returns the learning curves.
    # e.g. score_func = metrics.accuracy_score
    def data_size_response(self, model, score_func, prob=True, n_subsets=10):

        # Two lists collecting the train and test errors per subset size
        train_errs, test_errs = [], []
        # Build n_subsets subset sizes, exponentially spaced between e^3 and
        # the full training-set size (linspace returns evenly spaced samples
        # over [start, stop]; trX.shape[0] is the number of training rows)
        subset_sizes = np.exp(
            np.linspace(3, np.log(self.trX.shape[0]), n_subsets)).astype(int)

        # Loop over the subset sizes
        for m in subset_sizes:
            # Fit the model on the first m training rows
            model.fit(self.trX[:m], self.trY[:m])
            if prob:
                # Score the true y against 'predict_proba' outputs, for the
                # train and test sets respectively
                train_err = score_func(self.trY[:m],
                                       model.predict_proba(self.trX[:m]))
                test_err = score_func(self.teY, model.predict_proba(self.teX))
            else:
                # Score the true y against hard 'predict' outputs, for the
                # train and test sets respectively
                train_err = score_func(self.trY[:m],
                                       model.predict(self.trX[:m]))
                test_err = score_func(self.teY, model.predict(self.teX))
            # Print the results for this subset size
            print "training error(accuracy): %.3f test error(accuracy): %.3f subset size: %d" % (
                train_err, test_err, m)
            # Append this subset's results to 'train_errs' and 'test_errs'
            train_errs.append(train_err)
            test_errs.append(test_err)

        # Return the subset sizes and the two error lists
        return subset_sizes, train_errs, test_errs

    # 'plot_response' is the plotting function for visualizing the above
    # response, i.e. the train error and the test error

    # To run the functions above:
    # model = the model to use, e.g. LogisticRegression()
    # score_func = the scoring function to use, e.g. metrics.accuracy_score
    # Declare response = self.data_size_response(model, score_func, prob=True, n_subsets=?)
    # and plot it with the sketch below: self.plot_response(*response)
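
    # The 'plot_response' helper is not shown in this file; a minimal sketch
    # (an assumption, not the project's original) using matplotlib:
    def plot_response(self, subset_sizes, train_errs, test_errs):
        import matplotlib.pyplot as plt
        plt.plot(subset_sizes, train_errs, lw=2, label='training error')
        plt.plot(subset_sizes, test_errs, lw=2, label='test error')
        plt.xscale('log')  # subset sizes are exponentially spaced
        plt.xlabel('subset size')
        plt.ylabel('score')
        plt.title('Model response to dataset size')
        plt.legend(loc='best')
        plt.show()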

    def model_evaluation(self):

        # Replace the classifier with the one you want to evaluate
        # (classifiers: logreg2, clf2, dbt2); run the matching instantiate_*
        # method first
        self.train_test_split()
        y_pred = self.logreg2.fit(self.trX, self.trY).predict(self.teX)
        # Predictions are in columns and actual values in rows
        cm = metrics.confusion_matrix(self.teY, y_pred)
        # NB: the support is the number of occurrences of each class in y_true
        # (i.e. what really happened)
        print metrics.classification_report(self.teY, y_pred)
        print "matthews correlation coefficient: %.2f" % (
            metrics.matthews_corrcoef(self.teY, y_pred))
        accuracy = float(cm[0][0] + cm[1][1]) / cm.sum()
        print "accuracy: %.2f" % (accuracy)
        away_accuracy = float(cm[0][0]) / (cm[0][0] + cm[0][1])
        print "away_accuracy: %.2f" % (away_accuracy)
        home_accuracy = float(cm[1][1]) / (cm[1][1] + cm[1][0])
        print "home_accuracy: %.2f" % (home_accuracy)
        plot_confusion_matrix(cm, title='Confusion matrix')
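

# ------------------------------ASSUMED HELPERS AND EXAMPLE DRIVER------------------------------
# The helpers used above (standardize_features, normalize_features,
# add_id_column_to_csv, plot_confusion_matrix) are defined elsewhere in the
# project. Minimal sketches of the two scalers, assuming plain column-wise
# scaling over non-constant feature columns (the project's own versions may
# differ):


def normalize_features(X):
    # Min-max scaling: rescale each feature column to the [0, 1] range
    X = np.asarray(X, dtype=float)
    return (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))


def standardize_features(X):
    # Z-score standardization: zero mean and unit variance per feature column
    X = np.asarray(X, dtype=float)
    return (X - X.mean(axis=0)) / X.std(axis=0)


# Example driver (a sketch; the season and filenames are assumptions that
# follow the naming scheme in 'acquire_current_season_data'):
if __name__ == '__main__':
    predictor = MLBMakePredictions(2016, 'mlb_2016_features.npz',
                                   'mlb_team_data_2016.db')
    predictor()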