Пример #1
0
    EXAMPLE_PREDS_COL
)


# download all the things

napi = NumerAPI()

current_round = napi.get_current_round()

# Tournament data changes every week so we specify the round in their name. Training
# and validation data only change periodically, so no need to download them every time.
print('Downloading dataset files...')

Path("./v4").mkdir(parents=False, exist_ok=True)
napi.download_dataset("v4/train.parquet")
napi.download_dataset("v4/validation.parquet")
napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
napi.download_dataset("v4/validation_example_preds.parquet")
napi.download_dataset("v4/features.json")

print('Reading minimal training data')
# read the feature metadata and get a feature set (or all the features)
with open("v4/features.json", "r") as f:
    feature_metadata = json.load(f)
# features = list(feature_metadata["feature_stats"].keys()) # get all the features
# features = feature_metadata["feature_sets"]["small"] # get the small feature set
features = feature_metadata["feature_sets"]["medium"] # get the medium feature set
# read in just those features along with era and target columns
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
# a value of 10 means use every 10th row
downsample_cross_val = 20
downsample_full_train = 2

# if model_selection_loop=True get OOS performance for training_data
# and use that to select best model
# if model_selection_loop=False, just predict on tournament data using existing models and model config
model_selection_loop = True
model_config_name = "advanced_example_model"

napi = NumerAPI()

current_round = napi.get_current_round()

Path("./v4").mkdir(parents=False, exist_ok=True)
napi.download_dataset("v4/train.parquet")
napi.download_dataset("v4/features.json")

print("Entering model selection loop.  This may take awhile.")
if model_selection_loop:
    model_config = {}
    print('reading training_data')
    training_data = pd.read_parquet('v4/train.parquet')

    # keep track of some prediction columns
    ensemble_cols = set()
    pred_cols = set()

    # pick some targets to use
    possible_targets = [
        c for c in training_data.columns if c.startswith("target_")