Python read_csv_as_np_array示例，src.read_csv_data.read_csv_as_np_array Python示例

示例#1

0

显示文件

文件： read_data.py 项目： goodspellr/kaggle-ncaa

def make_output_2015():
    tourney_seeds = read_csv_as_np_array('../Data/tourney_seeds_2015.csv',
                                         header=False)
    neutralized_season_data = neutralize_season_data(
        '../Data/regular_season_detailed_results_combined.csv',
        '../Data/tourney_detailed_results.csv',
        '../Data/neutralized_season_data_combined.csv'
    )

    tuning_params = dict(std=0.05)
    output = [['id', 'pred', 'score1', 'score2']]
    for season in ['2015']:
        tourney_teams = np.sort(tourney_seeds[np.where(tourney_seeds[:, 0] == season)[0]][:, 2])
        while len(tourney_teams) > 1:
            team1 = tourney_teams[0]
            tourney_teams= np.delete(tourney_teams, 0)
            for team2 in tourney_teams:
                features = calc_new_features(
                    neutralized_season_data,
                    season,
                    team1,
                    team2,
                    tuning_params
                    )
                prob = features[0]
                score1 = np.round(features[2]).astype(int)
                score2 = np.round(features[3]).astype(int)
                game_id = [season+'_'+team1+'_'+team2, str(prob), str(score1), str(score2)]
                output.append(game_id)
                print game_id

    output_file = open('../Data/out_2015_with_scores.csv', 'wb')
    csv.writer(output_file).writerows(output)

示例#2

0

显示文件

文件： seed.py 项目： goodspellr/kaggle-ncaa

class SeedLookupTable(object):
    """Lazily built lookup of a team's numeric tournament seed per season.

    The seed table is read from CSV when the class is defined; the nested
    ``season -> team -> seed`` dictionary is only built on the first call to
    ``lookup``.
    """

    _table = read_csv_as_np_array('./Data/tourney_seeds_combined.csv',
                                  header=True)
    # Row 0 is the column header; everything after it is data.
    _table_header, _table = _table[0], _table[1:]

    # Filled in by _initialize_dict on first use.
    _dict = None

    # Maps a column name to its position within a row.
    _header_legend = dict(zip(_table_header, range(len(_table_header))))

    @classmethod
    def _initialize_dict(cls):
        """Build the nested ``season -> team -> seed`` dictionary."""
        seeds = cls._dict = {}
        columns = cls._header_legend
        for row in cls._table:
            by_team = seeds.setdefault(row[columns['season']], {})
            team = row[columns['team']]
            seed_code = row[columns['seed']]
            # Only characters 1-2 of the seed field are parsed as the number
            # (presumably a region letter precedes them -- TODO confirm).
            by_team[team] = int(seed_code[1:3])

    @classmethod
    def lookup(cls, season, team):
        """Return the integer seed stored for ``team`` in ``season``.

        Raises ``KeyError`` if the season or the team is not in the table.
        """
        if cls._dict is None:
            cls._initialize_dict()

        by_team = cls._dict[season]
        return by_team[team]

示例#3

0

显示文件

文件： read_data.py 项目： goodspellr/kaggle-ncaa

def make_output():
    tourney_seeds = read_csv_as_np_array('../Data/tourney_seeds.csv',
                                         header=False)
    neutralized_season_data = neutralize_season_data(
        '../Data/regular_season_detailed_results.csv',
        '../Data/tourney_detailed_results.csv',
        '../Data/neutralized_season_data.csv'
    )

    tuning_params = dict(std=0.05)
    output = [['id', 'pred']]
    for season in ['2011', '2012', '2013', '2014']:
        tourney_teams = np.sort(tourney_seeds[np.where(tourney_seeds[:, 0] == season)[0]][:, 2])
        while len(tourney_teams) > 1:
            team1 = tourney_teams[0]
            tourney_teams= np.delete(tourney_teams, 0)
            for team2 in tourney_teams:
                prob = calc_new_features(
                    neutralized_season_data,
                    season,
                    team1,
                    team2,
                    tuning_params
                    )[0]
                game_id = [season+'_'+team1+'_'+team2, str(prob)]
                output.append(game_id)
                print game_id

    output_file = open('../Data/out.csv', 'wb')
    csv.writer(output_file).writerows(output)

示例#4

0

显示文件

文件： read_data.py 项目： goodspellr/kaggle-ncaa

def pick_winners():
    """Turn the scored 2015 predictions into a winner/loser listing.

    Each input row holds a game id of the form ``<season>_<team1>_<team2>``,
    the probability attached to team1, and the two scores. The team with
    probability of at least 0.5 is listed first, together with its
    probability and the scores reordered to match. Team identifiers are
    replaced by ``TeamName.lookup`` results. Output goes to
    ``../Data/game_predictions_with_scores.csv``.
    """
    data = read_csv_as_np_array('../Data/out_2015_with_scores.csv', header=False)
    # NOTE(review): the header names two columns but every row below has
    # five (winner, loser, probability, winner score, loser score). Left
    # unchanged so existing consumers of the file are not affected.
    output = [['winner', 'loser']]
    for line in data:
        team1, team2 = line[0].split('_')[1:3]
        prob = float(line[1])
        score1, score2 = line[2:4]
        if prob >= 0.5:
            result = [TeamName.lookup(team1), TeamName.lookup(team2), prob, score1, score2]
        else:
            # team2 is favoured: swap the teams and scores, flip the probability.
            result = [TeamName.lookup(team2), TeamName.lookup(team1), 1.0 - prob, score2, score1]
        output.append(result)

    # The context manager guarantees the file is flushed and closed.
    with open('../Data/game_predictions_with_scores.csv', 'wb') as output_file:
        csv.writer(output_file).writerows(output)

示例#5

0

显示文件

文件： read_data.py 项目： goodspellr/kaggle-ncaa

def make_output_seeds():
    tourney_seeds = read_csv_as_np_array('../Data/tourney_seeds_2015.csv',
                                         header=False)

    output = [['id', 'pred']]
    for season in ['2015']:
        tourney_teams = np.sort(tourney_seeds[np.where(tourney_seeds[:, 0] == season)[0]][:, 2])
        while len(tourney_teams) > 1:
            team1 = tourney_teams[0]
            tourney_teams= np.delete(tourney_teams, 0)
            for team2 in tourney_teams:
                seed1 = SeedLookupTable.lookup('2015', team1)
                seed2 = SeedLookupTable.lookup('2015', team2)
                prob = 0.5 + 0.03*(seed2 - seed1)
                game_id = [season+'_'+team1+'_'+team2, str(prob)]
                output.append(game_id)
                print game_id

    output_file = open('../Data/out_2015_seed_benchmark.csv', 'wb')
    csv.writer(output_file).writerows(output)

示例#6

0

显示文件

class TeamName(object):
    """Lazily built mapping from the first to the second column of teams.csv.

    The table is read when the class is defined; the dictionary is only
    built on the first call to ``lookup``.
    """

    _table = read_csv_as_np_array('../Data/teams.csv', header=True)
    # Row 0 is the column header; everything after it is data.
    _table_header, _table = _table[0], _table[1:]

    # Filled in by _initialize_dict on first use.
    _dict = None

    # Maps a column name to its position within a row.
    _header_legend = dict(zip(_table_header, range(len(_table_header))))

    @classmethod
    def _initialize_dict(cls):
        """Build the dictionary keyed by column 0 with column 1 as value."""
        names = cls._dict = {}
        for row in cls._table:
            team_name = row[1]
            names[row[0]] = team_name

    @classmethod
    def lookup(cls, team_id):
        """Return the value stored for ``team_id``.

        Raises ``KeyError`` if ``team_id`` is not in the table.
        """
        if cls._dict is None:
            cls._initialize_dict()

        names = cls._dict
        return names[team_id]

示例#7

0

显示文件

文件： seed.py 项目： goodspellr/kaggle-ncaa

class Seed2DDistribution(object):
    """Historical win counts for every pairing of tournament seeds.

    Counts are stored as ``_dict[greater][lesser] = [a, b]`` where the keys
    are zero-padded two-digit seed strings, ``a`` counts games won by the
    numerically greater seed and ``b`` games won by the numerically lesser
    seed. A game between equal seeds adds 0.5 to both.
    """

    _data = read_csv_as_np_array('./Data/tourney_compact_results.csv',
                                 header=True)
    # Row 0 is the column header; everything after it is data.
    _header, _data = _data[0], _data[1:]

    # Maps a column name to its position within a row.
    _header_legend = dict(zip(_header, range(len(_header))))

    # Keep only the rows before the first one whose season is '2011'
    # (assumes rows are ordered by season -- TODO confirm).
    _data_drop_index = np.min(
        np.where(_data[:, _header_legend['season']] == '2011'))
    _data = _data[:_data_drop_index]

    # Filled in by _initialize_dict on first use.
    _dict = None

    @classmethod
    def _initialize_dict(cls):
        """Tally every retained game into the nested count dictionary."""
        counts = cls._dict = {}
        columns = cls._header_legend
        for row in cls._data:
            season = row[columns['season']]
            w_team = row[columns['wteam']]
            l_team = row[columns['lteam']]

            w_seed = SeedLookupTable.lookup(season, w_team)
            l_seed = SeedLookupTable.lookup(season, l_team)

            high_key = "%02d" % max(w_seed, l_seed)
            low_key = "%02d" % min(w_seed, l_seed)

            tally = counts.setdefault(high_key, {}).setdefault(low_key, [0, 0])

            if w_seed == l_seed:
                # Equal seeds: split the win between both slots.
                tally[0] += 0.5
                tally[1] += 0.5
            elif w_seed > l_seed:
                tally[0] += 1
            else:
                tally[1] += 1

    @classmethod
    def lookup(cls, seed1, seed2):
        """Return ``[wins by seed1, wins by seed2]`` for this seed pairing.

        Pairings never seen in the data are inserted with ``[0, 0]``. When
        ``seed1 >= seed2`` the stored list itself is returned; otherwise a
        new list with the two counts swapped is returned.
        """
        if cls._dict is None:
            cls._initialize_dict()

        if seed1 >= seed2:
            high, low = seed1, seed2
        else:
            high, low = seed2, seed1
        g_str = "%02d" % high
        l_str = "%02d" % low

        stored = cls._dict.setdefault(g_str, {}).setdefault(l_str, [0, 0])

        if seed2 > seed1:
            # Stored order is (greater, lesser); flip it to (seed1, seed2).
            return [stored[1], stored[0]]
        return stored

示例#8

0

显示文件

文件： read_data.py 项目： goodspellr/kaggle-ncaa

def compare_with_history():
    """Plot my submitted predictions against actual 2011+ tournament results.

    Builds, for every tournament game from season 2011 onward, the pair
    ``[my prediction, result]`` where the game id is
    ``<season>_<lower team id>_<higher team id>`` and ``result`` is 1 when
    the winning team is the one with the lower id, else 0. Two figures are
    saved: a scatter of result against prediction overlaid with a binned
    average (``../outputs/comparison_history_vs_my.png``), and overlaid
    histograms of the predictions for result 1 and result 0
    (``./comparison_history_vs_my_histogram.png``).

    Raises ``KeyError`` if a game has no entry in the prediction file.
    """
    my_prediction = read_csv_as_np_array('../Data/submission_2015_03_14.csv', header=False)
    results = read_csv_as_np_array('../Data/tourney_compact_results.csv', header=False)

    # Game id (column 0) -> my prediction (column 1) as a float.
    my_dict = dict()
    for row in my_prediction:
        my_dict[row[0]] = row[1].astype(float)

    results_dict = dict()
    for row in results:
        year = row[0]
        if int(year) < 2011:
            continue
        # Column 2 is treated as the winner and column 4 as the loser.
        team1 = row[2]
        team2 = row[4]
        if int(team1) < int(team2):
            results_key = '_'.join([year, team1, team2])
            res = 1
        else:
            results_key = '_'.join([year, team2, team1])
            res = 0
        results_dict[results_key] = [my_dict[results_key], res]

    data = np.array([results_dict[key] for key in sorted(results_dict)])

    # Binned average of the results around each midpoint.
    # NOTE(review): midpoints are 0.1 apart but the window is only +/-0.025,
    # so predictions between windows are left out -- confirm this is intended.
    bins = np.arange(0.05, 1.05, 0.1)
    p = []
    for midpoint in bins:
        ind = np.where(np.logical_and(data[:, 0] >= (midpoint - 0.025), data[:, 0] < (midpoint + 0.025)))[0]
        p.append((np.sum(data[ind, 1])+1.0)/(len(ind)+1.0))
    p = np.array(p)

    plt.scatter(data[:, 0], data[:, 1])
    plt.plot(bins, p)
    plt.ylabel('Result')
    plt.xlabel('My Prediction')
    plt.title("Result vs. My Prediction - 2011-2014")
    plt.savefig('../outputs/comparison_history_vs_my.png')
    plt.close()

    # Both histograms share the same fixed bin edges. The data-dependent
    # edges the code used to compute first were always overwritten, and
    # computing them raised on an empty selection, so they are gone.
    # NOTE(review): newer matplotlib releases replace `normed` with
    # `density`; kept as-is for the matplotlib version this file targets.
    hist_bins = np.linspace(0, 1, 10)
    predictions_when_1 = data[np.flatnonzero(data[:, 1]), 0]
    plt.hist(predictions_when_1, hist_bins, normed=True, alpha=0.5)
    predictions_when_0 = data[np.flatnonzero(1-data[:, 1]), 0]
    plt.hist(predictions_when_0, hist_bins, normed=True, alpha=0.5)
    plt.savefig('./comparison_history_vs_my_histogram.png')
    plt.close()

示例#9

0

显示文件

文件： read_data.py 项目： goodspellr/kaggle-ncaa

def compare_with_net_prophet():
    my_prediction = read_csv_as_np_array('../Data/out_2015.csv', header=False)
    np_prediction = read_csv_as_np_array('../Data/net_prophet_kaggle_submission_public.csv', header=False)
    sb_prediction = read_csv_as_np_array('../Data/out_2015_seed_benchmark.csv', header=False)

    if len(my_prediction) != len(np_prediction):
        raise RuntimeError('something wrong')

    my_dict = dict()
    np_dict = dict()
    sb_dict = dict()
    for i in xrange(len(my_prediction)):
        key = my_prediction[i, 0]
        my_dict[key] = my_prediction[i, 1].astype(float)
        key = np_prediction[i, 0]
        np_dict[key] = np_prediction[i, 1].astype(float)
        key = sb_prediction[i, 0]
        sb_dict[key] = sb_prediction[i, 1].astype(float)

    data = []
    game_id_list = []
    for key in sorted(my_dict.keys()):
        game_id_list.append(key)
        data.append([my_dict[key], np_dict[key], sb_dict[key]])

    data = np.array(data)
    plt.scatter(data[:, 0], data[:, 1])
    plt.ylabel('Net Prophet Prediction')
    plt.xlabel('My Prediction')
    plt.title("Net Prophet's Prediction vs. My Prediction - 2015")
    plt.savefig('./comparison_np_vs_my.png')
    plt.close()

    plt.scatter(data[:, 2], data[:, 0])
    plt.xlabel('Seed Benchmark Prediction')
    plt.xlabel('My Prediction')
    plt.title("My Prediction vs. Seed Benchmark Prediction - 2015")
    plt.savefig('./comparison_my_vs_sb.png')
    plt.close()

    plt.scatter(data[:, 2], data[:, 1])
    plt.ylabel('Net Prophet Prediction')
    plt.xlabel('Seed Benchmark Prediction')
    plt.title("Net Prophet's Prediction vs. Seed Benchmark Prediction - 2015")
    plt.savefig('./comparison_np_vs_sb.png')
    plt.close()

    print 'E[log_loss | Net Prophet Model correct]:'
    print ' ----- My Model:                 ', calc_log_loss(data[:, 0], data[:, 1])
    print ' ----- Seed Benchmark Model:     ', calc_log_loss(data[:, 2], data[:, 1])
    print '\nE[log_loss | My Model correct]:'
    print ' ----- Net Prophet Model:        ', calc_log_loss(data[:, 1], data[:, 0])
    print ' ----- Seed Benchmark Model:     ', calc_log_loss(data[:, 2], data[:, 0])
    print '\nE[log_loss | Seed Benchmark correct]:'
    print ' ----- My Model:                 ', calc_log_loss(data[:, 0], data[:, 2])
    print ' ----- Net Prophet Model:        ', calc_log_loss(data[:, 1], data[:, 2])

    # for weight in np.linspace(0, 1, 21):
    #     print weight, calc_log_loss(data[:, 2], weight*data[:, 0] + (1-weight)*data[:, 1])

    merged_prediction = 0.5*data[:, 0] + 0.5*data[:, 1]
    output = [['id', 'pred']]
    for i, game_id in enumerate(game_id_list):
        output.append([game_id, merged_prediction[i]])

    output_file = open('../Data/out_2015_merged.csv', 'wb')
    csv.writer(output_file).writerows(output)

示例#10

0

显示文件

文件： read_data.py 项目： goodspellr/kaggle-ncaa

def main_2015_03_12():
    tourney_data = read_csv_as_np_array('../Data/tourney_detailed_results.csv',
                                         header=False)

    neutralized_season_data = neutralize_season_data(
        '../Data/regular_season_detailed_results.csv',
        '../Data/tourney_detailed_results.csv',
        '../Data/neutralized_season_data.csv'
    )

    # test on 2011-2014 tourney results
    test_data_min_index = np.min(np.where(tourney_data[:, 0] == '2014')[0])

    train_data = tourney_data

    for std in [0.05]: #  np.linspace(0.01, 0.1, 10):
        tuning_params = dict(std=std)
        #scramble the results to eliminate fact that team1 is always winner in tourney data
        results = np.random.randint(0, 2, len(train_data))

        features = []
        output = [['id', 'pred']]
        for i in xrange(len(train_data)):

            season = train_data[i, 0]
            if results[i] == 1:
                team1 = train_data[i, 2]
                team2 = train_data[i, 4]
            else:
                team1 = train_data[i, 4]
                team2 = train_data[i, 2]

            features.append(
                calc_new_features(
                    neutralized_season_data,
                    season,
                    team1,
                    team2,
                    tuning_params
                    )
            )

            if int(season) >= 2011:
                if int(team1) < int(team2):
                    game_id = [season+'_'+team1+'_'+team2, str(features[-1][0])]
                else:
                    game_id = [season+'_'+team2+'_'+team1, str(1.0-features[-1][0])]

                output.append(game_id)

        # output_file = open('../Data/out.csv', 'wb')
        # csv.writer(output_file).writerows(output)
        features = np.array(features)

        for season in np.unique(train_data[:, 0]):  # ['2011', '2012', '2013', '2014']:
            idx = np.where(train_data[:, 0] == season)[0]
            log_loss = -np.mean(results[idx]*np.log(features[idx, 0])
                           + (1-results[idx])*np.log(1.0 - features[idx, 0]))

            junk = 0.98*(features[idx, 0]-features[idx, 0].min())/(features[idx, 0].max() - features[idx, 0].min()) + 0.01
            junk_log_loss = -np.mean(results[idx]*np.log(junk)
                           + (1-results[idx])*np.log(1.0 - junk))

            print std, season, log_loss, junk_log_loss

示例#11

0

显示文件

文件： read_data.py 项目： goodspellr/kaggle-ncaa

def main_2015_03_08():

    standardized_season_data = standardize_season_data(
        '../Data/regular_season_detailed_results.csv',
        '../Data/tourney_detailed_results.csv',
        '../Data/standardized_season_data.csv'
    )

    tourney_data = read_csv_as_np_array('../Data/tourney_detailed_results.csv',
                                header=False)

    tourney_seeds = read_csv_as_np_array('../Data/tourney_seeds.csv',
                                header=False)

    #win_loss_data = get_win_loss_data()

    # test on 2011-2014 tourney results
    test_data_min_index = np.min(np.where(tourney_data[:, 0] == '2011')[0])

    train_data = tourney_data  # tourney_data[:, test_data_min_index]
    test_data = tourney_data[test_data_min_index:]

    #scramble the results to eliminate fact that team1 is always winner in tourney data
    results = np.random.randint(0, 2, len(train_data))

    features = []
    for i in xrange(len(train_data)):
        if results[i] == 1:
            team1 = train_data[i, 2]
            team2 = train_data[i, 4]
        else:
            team1 = train_data[i, 4]
            team2 = train_data[i, 2]

        features.append(
            calc_features(
                standardized_season_data,
                train_data[i, 0],
                team1,
                team2,
                tourney_seeds=tourney_seeds
                )
        )

    #features = np.array(features)
    # try just using gaussian win prob and team seeds
    features = np.array(features)[:, np.array([-5, -2, -1])]

    # logistic regression
    lr_model = LogisticRegression()
    lr_model.fit(features[:test_data_min_index], results[:test_data_min_index])

    logistic_probability = lr_model.predict_proba(features[test_data_min_index:])[:, 1]
    log_loss = -np.mean(results[test_data_min_index:]*np.log(logistic_probability)
                   + (1-results[test_data_min_index:])*np.log(1.0 - logistic_probability))

    print lr_model.coef_, lr_model.intercept_
    print log_loss


    # try just the seed difference as a feature in a logistic regression
    seed_diff = features[:, -1] - features[:, -2]
    lr_model = LogisticRegression()
    lr_model.fit(seed_diff[:test_data_min_index, np.newaxis], results[:test_data_min_index])

    logistic_probability = lr_model.predict_proba(seed_diff[test_data_min_index:, np.newaxis])[:, 1]
    log_loss = -np.mean(results[test_data_min_index:]*np.log(logistic_probability)
                   + (1-results[test_data_min_index:])*np.log(1.0 - logistic_probability))

    print lr_model.coef_, lr_model.intercept_
    print log_loss

    quick_prob = 0.5+0.03*(features[test_data_min_index:, -1] - features[test_data_min_index:, -2])
    print -np.mean(results[test_data_min_index:]*np.log(quick_prob)
                   + (1-results[test_data_min_index:])*np.log(1.0 - quick_prob))
    plt.scatter(quick_prob, logistic_probability)
    plt.scatter(quick_prob, quick_prob, color='red')
    #plt.scatter(logistic_probability, results[test_data_min_index:])
    plt.show()

    # print 'about to do gmm'
    # gmm_model = mixture.GMM(n_components=2)
    # gmm_model.fit(features[:test_data_min_index])
    # gmm_probs = gmm_model.predict_proba(features[test_data_min_index:])
    #
    # log_loss = -np.mean(
    #     results[test_data_min_index:, np.newaxis]*np.log(gmm_probs)
    #                + (1-results[test_data_min_index:, np.newaxis])*np.log(1.0 - gmm_probs),
    #     axis=0
    # )
    #
    # print log_loss

    print 'done'