def __init__(self, path_root=''):
     """Build the data handler and keep shortcuts to four of its tables.

     Args:
         path_root: Stored on the instance and passed unchanged to
             ``sqlite_data_handler``. Presumably the location of the data
             files -- confirm against that class. Defaults to ``''``.
     """
     self.path_root = path_root

     handler = sqlite_data_handler(self.path_root)
     self.sqlite_data = handler

     # Short aliases for the handler's table attributes.
     self.competition = handler.table_Competitions
     self.team = handler.table_Teams
     self.submission = handler.table_Submissions
     self.competitionhostsegment = handler.table_CompetitionHostSegments
            plt.scatter(points['x'], points['y'], linewidths=0.5, s=10)
        plt.show()

    def describe(self):
        """Print the column-wise median and mean of every cluster.

        Iterates over the unique values found in ``self.labels``. For each
        one, the rows of ``self.data`` whose ``'cluster'`` column equals that
        value are selected and their ``median()`` and ``mean()`` are printed
        to standard output. Returns ``None``.
        """
        for label in np.unique(self.labels):
            members = self.data[self.data['cluster'] == label]
            print("Cluster: ", label)
            # Each statistic is computed only after its heading is printed.
            for heading, statistic in (("Median:", members.median),
                                       ("Mean:", members.mean)):
                print(heading)
                print(statistic())
            # Blank separator between clusters (emits two newlines).
            print("\n")


if __name__ == '__main__':
    data = sqlite_data_handler("./data")
    processor = pp.preProcessor(data)
    users = processor.get_users()

    # Keep only users whose HighestRanking is below 200 and who have more
    # than one submission, restricted to the columns used for clustering.
    selected_rows = (users['HighestRanking'] < 200) & (users['NumSubmissions'] > 1)
    feature_columns = [
        'NumSubmissions', 'NumPosts', 'DaysTillActivity', 'HighestRanking'
    ]
    users = users.loc[selected_rows, feature_columns]

    # The two count columns are handed to pu.log_data (presumably a log
    # transform -- confirm in pu); the summary is printed before the frame
    # is passed through pu.normalize_data.
    users = pu.log_data(users, ['NumSubmissions', 'NumPosts'])
    print(users.describe())
    users = pu.normalize_data(users)

    # Continue with a random 10% sample of the remaining rows.
    users = users.sample(frac=0.1)
# Example #3
# 0
    comps = points.CompetitionId.sort_values().unique()
    m = pd.DataFrame(np.zeros((len(users), len(comps))))
    m.columns = comps
    m.index = users

    def to_matrix(row):
        """Write one points row into the enclosing matrix ``m``.

        The cell at (``row.UserId``, ``row.CompetitionId``) is set to
        ``row.Points`` plus the sum of every value currently stored in that
        user's row of ``m``. Returns ``None``; ``m`` is modified in place.

        NOTE(review): the row sum includes cells that were themselves written
        as running totals, so values compound from one competition to the
        next rather than forming a plain cumulative sum -- confirm this is
        intended.
        """
        user_id = row.UserId
        row_total = sum(m.loc[user_id, ])
        m.loc[user_id, row.CompetitionId] = row.Points + row_total

    points = points.sort_values(['CompetitionId', 'UserId'])
    points.apply(to_matrix, axis=1)
    return m


if __name__ == '__main__':
    db = sqlite_data_handler('data')

    print("Computing Points...")
    points = computePoints(db)

    # Per-competition summary: summed points, number of distinct users and
    # the median user rank multiplier. The result is not used again in the
    # visible part of this script.
    summary_spec = {
        'Points': 'sum',
        'UserId': 'nunique',
        'UserRankMultiplier': 'median',
    }
    per_competition = points.groupby(['CompetitionId'])
    comps = per_competition.agg(summary_spec)

    # TODO: User Comp matrix with aggregated User Counts
    # TODO: 2. aggregated user counts have to be normalized? Mean better?
    m = user_comp_matrix(points)
    print(m)