예제 #1
0
def get_submission_pieces(submission_id, tournament, round_number, db_manager, filemanager):
    """Fetch a submission and split it into validation, test, and live parts.

    The submission's file name is looked up through the database, the file
    is downloaded with the file manager and parsed as CSV, and the parsed
    data is then partitioned by ``get_sorted_split`` using the ids that
    ``get_ids`` yields for the given tournament and round.

    Parameters:
    -----------
    submission_id : string
        ID of the submission

    tournament :
        Tournament identifier; passed through to ``get_ids`` unchanged

    round_number : int
        Numerical ID of the competition round of the tournament

    db_manager : DatabaseManager
        DB data access object; its ``postgres_db`` attribute is used to
        resolve the submission's file name

    filemanager : FileManager
        S3 Bucket data access object for querying competition datasets

    Returns:
    --------
    validation : ndarray
        Validation part of the submission, as produced by
        ``get_sorted_split``

    tests : ndarray
        Test part of the submission, as produced by ``get_sorted_split``

    live : ndarray
        Live part of the submission, as produced by ``get_sorted_split``
    """
    # Only the file name is needed; the second element of the lookup is unused.
    remote_name, _ = common.get_filename(db_manager.postgres_db, submission_id)

    # A single file is requested, so the first downloaded path is the one.
    downloaded_paths = filemanager.download([remote_name])
    submission_frame = pd.read_csv(downloaded_paths[0])

    ids_validation, ids_test, ids_live = get_ids(
        filemanager, tournament, round_number)
    validation, tests, live = get_sorted_split(
        submission_frame, ids_validation, ids_test, ids_live)
    return validation, tests, live
    def update_leaderboard(self, submission_id, filemanager):
        """Compute a submission's consistency and record it in Postgres.

        Consistency is the percentage of validation eras in which the
        submission's log loss is lower than ``-log(0.5)``, i.e. the log loss
        of always predicting 0.5. After the ``submissions`` row is updated,
        pending rows are inserted into ``originalities`` and
        ``concordances`` unless rows for this submission already exist.

        Parameters:
        ----------
        submission_id : string
            ID of the submission

        filemanager : FileManager
            S3 Bucket data access object for querying competition datasets
        """
        round_number = self.get_round_number(submission_id)

        # Get the tournament data
        extract_dir = filemanager.download_dataset(round_number)
        tournament_data = pd.read_csv(
            os.path.join(extract_dir, "numerai_tournament_data.csv"))

        # Get the user submission
        s3_file, _ = common.get_filename(self.postgres_db, submission_id)
        local_file = filemanager.download([s3_file])[0]
        submission_data = pd.read_csv(local_file)

        # Restrict both frames to the validation rows
        validation_data = tournament_data[tournament_data.data_type ==
                                          "validation"]
        validation_submission_data = submission_data[submission_data.id.isin(
            validation_data.id.values)]
        validation_eras = np.unique(validation_data.era.values)
        num_eras = len(validation_eras)

        # Log loss of a constant 0.5 prediction; an era counts as
        # "better than random" only when it beats this value.
        random_logloss = -math.log(0.5)

        # Calculate era loglosses
        better_than_random_era_count = 0
        for era in validation_eras:
            era_data = validation_data[validation_data.era == era]
            submission_era_data = validation_submission_data[
                validation_submission_data.id.isin(era_data.id.values)]
            # Sort both sides by id so targets and predictions line up.
            era_data = era_data.sort_values(["id"])
            submission_era_data = submission_era_data.sort_values(["id"])
            logloss = log_loss(era_data.target.values,
                               submission_era_data.probability.values)
            if logloss < random_logloss:
                better_than_random_era_count += 1

        consistency = better_than_random_era_count / num_eras * 100

        print("Consistency: {}".format(consistency))

        # Update consistency and insert pending originality and concordance
        # into Postgres. Values are passed as query parameters instead of
        # being formatted into the SQL text, so a submission id containing
        # quotes can neither break nor inject into the statements.
        # NOTE(review): "%s" placeholders assume a DB-API driver using the
        # "format" paramstyle (e.g. psycopg2) - confirm for self.postgres_db.
        submission_key = str(submission_id)
        cursor = self.postgres_db.cursor()
        try:
            cursor.execute(
                "UPDATE submissions SET consistency = %s WHERE id = %s",
                (consistency, submission_key))
            cursor.execute(
                "INSERT INTO originalities(pending, submission_id) "
                "VALUES(TRUE, %s) ON CONFLICT (submission_id) DO NOTHING;",
                (submission_key,))
            cursor.execute(
                "INSERT INTO concordances(pending, submission_id) "
                "VALUES(TRUE, %s) ON CONFLICT (submission_id) DO NOTHING;",
                (submission_key,))
            self.postgres_db.commit()
        finally:
            # Release the cursor even when one of the statements fails.
            cursor.close()
예제 #3
0
def get_submission(db_manager, filemanager, submission_id):
    """Gets the submission file from S3

    Parameters:
    -----------
    db_manager: DatabaseManager
        DB data access object; its ``postgres_db`` attribute is used to look
        up the submission's file name

    filemanager: FileManager
        S3 Bucket data access object for querying competition datasets

    submission_id : string
        The ID of the submission

    Returns:
    --------
    submission : ndarray or None
        2d array of the submission probabilities. First column is sorted by ID
        and second column is sorted by probability. ``None`` is returned when
        ``submission_id`` is empty, when the download raises, or when the
        download does not yield exactly one local file.

    Raises:
    -------
    AssertionError
        If the downloaded CSV has no ``id`` or no ``probability`` column.
    """
    if not submission_id:
        return None

    s3_filename, _ = common.get_filename(db_manager.postgres_db, submission_id)
    try:
        local_files = filemanager.download([s3_filename])
        if len(local_files) != 1:
            logging.getLogger().info("Error looking for submission {}, found files {}".format(submission_id, local_files))
            return None

        local_file = local_files[0]
    except Exception:
        # Best effort: a failed download is reported and treated as
        # "no submission" rather than propagated to the caller.
        logging.getLogger().info("Could not get submission {} at S3 path {}".format(submission_id, s3_filename))
        return None

    df = pd.read_csv(local_file)
    assert "id" in df.columns, "No id column in submission {}".format(s3_filename)
    assert "probability" in df.columns, "No probability column in submission {}".format(s3_filename)

    df.sort_values("id", inplace=True)
    # ``as_matrix()`` was removed in pandas 1.0; ``.values`` yields the same
    # ndarray and is available in old and new pandas versions alike.
    by_id = df["probability"].values
    by_probability = np.sort(by_id)

    # make a two-column numpy array: first column is sorted by id; second
    # column is sorted by probability
    return np.hstack((by_id.reshape(-1, 1), by_probability.reshape(-1, 1)))
    def update_leaderboard(self, submission_id, filemanager):
        """Compute a submission's consistency and record it in Postgres.

        Consistency is the percentage of validation eras in which the
        submission's log loss is lower than ``BENCHMARK``. After the
        ``submissions`` row is updated, a pending row is inserted into
        ``concordances`` unless one already exists for this submission.

        Parameters:
        ----------
        submission_id : string
            ID of the submission

        filemanager : FileManager
            S3 Bucket data access object for querying competition datasets

        Raises:
        -------
        AssertionError
            If the validation data does not contain exactly 12 eras, or if
            the submission has no rows for one of the validation eras.
        """
        print("Calculating consistency for submission_id {}...".format(
            submission_id))
        tournament, round_number, _dataset_path = common.get_round(
            self.postgres_db, submission_id)

        # Get the tournament data
        print("Getting public dataset for round number {}-{}".format(
            tournament, round_number))
        extract_dir = filemanager.download_dataset(tournament, round_number)
        tournament_data = pd.read_csv(
            os.path.join(extract_dir, "numerai_tournament_data.csv"))

        # Get the user submission
        s3_file, _ = common.get_filename(self.postgres_db, submission_id)
        submission_data = filemanager.read_csv(s3_file)

        # Restrict both frames to the validation rows
        validation_data = tournament_data[tournament_data.data_type ==
                                          "validation"]
        validation_submission_data = submission_data[submission_data.id.isin(
            validation_data.id.values)]
        validation_eras = np.unique(validation_data.era.values)
        print(validation_eras)
        num_eras = len(validation_eras)
        assert num_eras == 12

        # Column holding the true labels for this tournament
        target_column = common.TARGETS[tournament]

        # Calculate era loglosses
        better_than_random_era_count = 0
        for era in validation_eras:
            era_data = validation_data[validation_data.era == era]
            submission_era_data = validation_submission_data[
                validation_submission_data.id.isin(era_data.id.values)]
            # Check the row count itself; the previous form compared the
            # whole frame to 0 before taking its length.
            assert len(
                submission_era_data) > 0, "There must be data for every era"
            # Sort both sides by id so targets and predictions line up.
            era_data = era_data.sort_values(["id"])
            submission_era_data = submission_era_data.sort_values(["id"])
            logloss = log_loss(era_data[target_column].values,
                               submission_era_data.probability.values)
            if logloss < BENCHMARK:
                better_than_random_era_count += 1

        consistency = better_than_random_era_count / num_eras * 100

        print("Consistency: {}".format(consistency))

        # Update consistency and insert pending concordance into Postgres.
        # Values are passed as query parameters instead of being formatted
        # into the SQL text, so a submission id containing quotes can
        # neither break nor inject into the statements.
        # NOTE(review): "%s" placeholders assume a DB-API driver using the
        # "format" paramstyle (e.g. psycopg2) - confirm for self.postgres_db.
        submission_key = str(submission_id)
        cursor = self.postgres_db.cursor()
        try:
            cursor.execute(
                "UPDATE submissions SET consistency = %s WHERE id = %s",
                (consistency, submission_key))
            cursor.execute(
                "INSERT INTO concordances(pending, submission_id) "
                "VALUES(TRUE, %s) ON CONFLICT (submission_id) DO NOTHING;",
                (submission_key,))
            self.postgres_db.commit()
        finally:
            # Release the cursor even when one of the statements fails.
            cursor.close()