def get_submission_pieces(submission_id, tournament, round_number, db_manager, filemanager):
    """Get validation, test, and live ids sorted from submission_id

    Parameters:
    -----------
    submission_id : string
        ID of the submission
    tournament : int
        Numerical ID of the tournament the submission belongs to
    round_number : int
        Numerical ID of the competition round of the tournament
    db_manager : DatabaseManager
        DB data access object that has read and write functions to NoSQL DB
    filemanager : FileManager
        S3 Bucket data access object for querying competition datasets

    Returns:
    --------
    validation : ndarray
        Sorted validation ids from submission data
    tests : ndarray
        Sorted test ids from submission data
    live : ndarray
        Sorted live ids from submission data
    """
    # Resolve the submission's S3 key from Postgres, then pull the CSV locally
    s3_file, _ = common.get_filename(db_manager.postgres_db, submission_id)
    local_file = filemanager.download([s3_file])[0]
    data = pd.read_csv(local_file)
    # Partition the submission rows by dataset split, each sorted by id
    val_ids, test_ids, live_ids = get_ids(filemanager, tournament, round_number)
    validation, tests, live = get_sorted_split(data, val_ids, test_ids, live_ids)
    return validation, tests, live
def update_leaderboard(self, submission_id, filemanager):
    """Update the leaderboard with a submission

    Computes the submission's consistency — the percentage of validation
    eras with a logloss better than random guessing — stores it on the
    submission row, and inserts pending originality and concordance rows.

    Parameters:
    ----------
    submission_id : string
        ID of the submission
    filemanager : FileManager
        S3 Bucket data access object for querying competition datasets
    """
    round_number = self.get_round_number(submission_id)

    # Get the tournament data
    extract_dir = filemanager.download_dataset(round_number)
    tournament_data = pd.read_csv(
        os.path.join(extract_dir, "numerai_tournament_data.csv"))

    # Get the user submission
    s3_file, _ = common.get_filename(self.postgres_db, submission_id)
    local_file = filemanager.download([s3_file])[0]
    submission_data = pd.read_csv(local_file)

    validation_data = tournament_data[tournament_data.data_type == "validation"]
    validation_submission_data = submission_data[submission_data.id.isin(
        validation_data.id.values)]
    validation_eras = np.unique(validation_data.era.values)
    num_eras = len(validation_eras)

    # Calculate era loglosses. -ln(0.5) is the logloss of always
    # predicting 0.5, i.e. the random baseline.
    better_than_random_era_count = 0
    for era in validation_eras:
        era_data = validation_data[validation_data.era == era]
        submission_era_data = validation_submission_data[
            validation_submission_data.id.isin(era_data.id.values)]
        # Sort both sides by id so targets align with probabilities
        era_data = era_data.sort_values(["id"])
        submission_era_data = submission_era_data.sort_values(["id"])
        logloss = log_loss(era_data.target.values,
                           submission_era_data.probability.values)
        if logloss < -math.log(0.5):
            better_than_random_era_count += 1
    consistency = better_than_random_era_count / num_eras * 100
    print("Consistency: {}".format(consistency))

    # Update consistency and insert pending originality and concordance into
    # Postgres. Parameterized queries: submission_id arrives from outside,
    # so interpolating it with str.format was an SQL-injection vector.
    cursor = self.postgres_db.cursor()
    try:
        cursor.execute(
            "UPDATE submissions SET consistency=%s WHERE id = %s",
            (consistency, submission_id))
        cursor.execute(
            "INSERT INTO originalities(pending, submission_id) VALUES(TRUE, %s) "
            "ON CONFLICT (submission_id) DO NOTHING;",
            (submission_id,))
        cursor.execute(
            "INSERT INTO concordances(pending, submission_id) VALUES(TRUE, %s) "
            "ON CONFLICT (submission_id) DO NOTHING;",
            (submission_id,))
        self.postgres_db.commit()
    finally:
        # Close the cursor even if an execute/commit raises
        cursor.close()
def get_submission(db_manager, filemanager, submission_id):
    """Gets the submission file from S3

    Parameters:
    -----------
    db_manager: DatabaseManager
        DB data access object that has read and write functions to NoSQL DB
    filemanager: FileManager
        S3 Bucket data access object for querying competition datasets
    submission_id : string
        The ID of the submission

    Returns:
    --------
    submission : ndarray
        2d array of the submission probabilities. First column is sorted
        by ID and second column is sorted by probability. None if the
        submission id is empty or the file cannot be retrieved.
    """
    if not submission_id:
        return None
    s3_filename, _ = common.get_filename(db_manager.postgres_db, submission_id)
    try:
        local_files = filemanager.download([s3_filename])
        if len(local_files) != 1:
            logging.getLogger().info(
                "Error looking for submission {}, found files {}".format(
                    submission_id, local_files))
            return None
        local_file = local_files[0]
    except Exception:
        logging.getLogger().info(
            "Could not get submission {} at S3 path {}".format(
                submission_id, s3_filename))
        return None
    df = pd.read_csv(local_file)
    assert "id" in df.columns, "No id column in submission {}".format(s3_filename)
    assert "probability" in df.columns, "No probability column in submission {}".format(s3_filename)
    df.sort_values("id", inplace=True)
    # DataFrame.as_matrix() was removed in pandas 1.0;
    # Series.to_numpy() is the supported replacement.
    a = df["probability"].to_numpy()
    a_sorted = np.sort(a)
    # make a two-column numpy array: first column is sorted by id; second
    # column is sorted by probability
    a = a.reshape(-1, 1)
    a_sorted = a_sorted.reshape(-1, 1)
    a = np.hstack((a, a_sorted))
    return a
def update_leaderboard(self, submission_id, filemanager):
    """Update the leaderboard with a submission

    Computes the submission's consistency — the percentage of validation
    eras with a logloss below BENCHMARK — stores it on the submission
    row, and inserts a pending concordance row.

    Parameters:
    ----------
    submission_id : string
        ID of the submission
    filemanager : FileManager
        S3 Bucket data access object for querying competition datasets
    """
    print("Calculating consistency for submission_id {}...".format(
        submission_id))
    tournament, round_number, _dataset_path = common.get_round(
        self.postgres_db, submission_id)

    # Get the tournament data
    print("Getting public dataset for round number {}-{}".format(
        tournament, round_number))
    extract_dir = filemanager.download_dataset(tournament, round_number)
    tournament_data = pd.read_csv(
        os.path.join(extract_dir, "numerai_tournament_data.csv"))

    # Get the user submission
    s3_file, _ = common.get_filename(self.postgres_db, submission_id)
    submission_data = filemanager.read_csv(s3_file)

    validation_data = tournament_data[tournament_data.data_type == "validation"]
    validation_submission_data = submission_data[submission_data.id.isin(
        validation_data.id.values)]
    validation_eras = np.unique(validation_data.era.values)
    print(validation_eras)
    num_eras = len(validation_eras)
    # NOTE(review): hard-coded era count; stripped under `python -O`
    assert num_eras == 12

    # Calculate era loglosses
    better_than_random_era_count = 0
    for era in validation_eras:
        era_data = validation_data[validation_data.era == era]
        submission_era_data = validation_submission_data[
            validation_submission_data.id.isin(era_data.id.values)]
        # Fixed misparenthesized assert: the original asserted
        # len(submission_era_data > 0), i.e. the row count of a boolean
        # DataFrame, instead of checking the frame is non-empty.
        assert len(submission_era_data) > 0, "There must be data for every era"
        # Sort both sides by id so targets align with probabilities
        era_data = era_data.sort_values(["id"])
        submission_era_data = submission_era_data.sort_values(["id"])
        logloss = log_loss(era_data[common.TARGETS[tournament]].values,
                           submission_era_data.probability.values)
        if logloss < BENCHMARK:
            better_than_random_era_count += 1
    consistency = better_than_random_era_count / num_eras * 100
    print("Consistency: {}".format(consistency))

    # Update consistency and insert pending concordance into Postgres.
    # Parameterized queries: submission_id arrives from outside, so
    # interpolating it with str.format was an SQL-injection vector.
    cursor = self.postgres_db.cursor()
    try:
        cursor.execute(
            "UPDATE submissions SET consistency=%s WHERE id = %s",
            (consistency, submission_id))
        cursor.execute(
            "INSERT INTO concordances(pending, submission_id) VALUES(TRUE, %s) "
            "ON CONFLICT (submission_id) DO NOTHING;",
            (submission_id,))
        self.postgres_db.commit()
    finally:
        # Close the cursor even if an execute/commit raises
        cursor.close()