def __init__(self, path_root=''):
    """Bind this object to an SQLite-backed data source.

    Opens a ``sqlite_data_handler`` rooted at *path_root* and caches
    references to the four tables this object works with.

    Args:
        path_root: Directory prefix passed to the data handler
            (defaults to the current directory).
    """
    self.path_root = path_root
    # One handler instance is shared by all cached table references.
    handler = sqlite_data_handler(path_root)
    self.sqlite_data = handler
    self.competition = handler.table_Competitions
    self.team = handler.table_Teams
    self.submission = handler.table_Submissions
    self.competitionhostsegment = handler.table_CompetitionHostSegments
# NOTE(review): this chunk arrived collapsed onto a single line; the formatting
# below is reconstructed. The first two calls are the tail of a plotting
# method whose `def` line falls outside this view.
plt.scatter(points['x'], points['y'], linewidths=0.5, s=10)
plt.show()

def describe(self):
    """Print per-cluster summary statistics.

    For each distinct label in ``self.labels``, selects the rows of
    ``self.data`` whose 'cluster' column equals that label and prints the
    median and mean of every column.  Assumes ``self.data`` carries a
    'cluster' column assigned by the fitting step — TODO confirm.
    """
    for k in np.unique(self.labels):
        # Rows assigned to cluster k (includes the 'cluster' column itself).
        cluster = self.data[(self.data['cluster'] == k)]
        print("Cluster: ", k)
        print("Median:")
        print(cluster.median())
        print("Mean:")
        print(cluster.mean())
        print("\n")

if __name__ == '__main__':
    # Script entry point: load users, keep active users near the top of the
    # ranking, log-transform the heavily skewed count features, normalise,
    # and subsample before clustering.
    data = sqlite_data_handler("./data")
    processor = pp.preProcessor(data)
    users = processor.get_users()
    # Select users and features to cluster upon
    users = users.loc[
        (users['HighestRanking'] < 200) & (users['NumSubmissions'] > 1),
        ['NumSubmissions', 'NumPosts', 'DaysTillActivity', 'HighestRanking']]
    # Select columns to apply log to
    users = pu.log_data(users, ['NumSubmissions', 'NumPosts'])
    print(users.describe())
    users = pu.normalize_data(users)
    # Select User features
    # Work on a 10% random sample to keep the run cheap.
    users = users.sample(frac=0.1)
# NOTE(review): collapsed single-line chunk; formatting reconstructed.  The
# statements down to `return m` are the tail of a function (presumably
# user_comp_matrix, judging by the call in __main__ below) whose `def` line
# is outside this view; they build and return a user-by-competition matrix.
comps = points.CompetitionId.sort_values().unique()
# Zero-filled matrix: one row per user id, one column per competition id.
m = pd.DataFrame(np.zeros((len(users), len(comps))))
m.columns = comps
m.index = users

def to_matrix(row):
    # Writes this row's points into the (user, competition) cell, plus the
    # sum of the user's entire row so far.  NOTE(review): summing the whole
    # user row — not just this competition's cell — makes each written value
    # cumulative across competitions in the sort order established below;
    # confirm this running-total behavior is intended and not a bug.
    m.loc[row.UserId, row.CompetitionId] = row.Points + sum(m.loc[row.UserId, ])

# Fix the visit order so the cumulative update above is reproducible.
points = points.sort_values(['CompetitionId', 'UserId'])
# apply() is used purely for its side effect on `m`; the return is discarded.
points.apply(to_matrix, axis=1)
return m

if __name__ == '__main__':
    db = sqlite_data_handler('data')
    print("Computing Points...")
    points = computePoints(db)
    # Per-competition aggregates: total points awarded, number of distinct
    # participants, and the median rank multiplier.
    # NOTE(review): `comps` is computed but never used below — dead code or
    # work in progress (see the TODOs)?
    comps = points.groupby(['CompetitionId']).agg({
        'Points': 'sum',
        'UserId': 'nunique',
        'UserRankMultiplier': 'median'
    })
    # TODO: User Comp matrix with aggregated User Counts
    # TODO: 2. aggregated user counts have to be normalized? Mean better?
    m = user_comp_matrix(points)
    print(m)