class TrainAndPredict(luigi.Task):
    """
    Trains a logistic regression classifier on the training data, then
    predicts the targets on the tournament data.

    The default signature of this task is
    ``TrainAndPredict(output_path='./data')``.

    :param: output_path (str):
        path to the directory where the predictions shall be saved to,
        defaults to ``./data``.
    """
    output_path = luigi.Parameter(default='./data/')

    def requires(self):
        """
        Dependencies to be fulfilled prior to execution. This task needs the
        :py:class:`tasks.numerai_fetch_training_data.FetchAndExtractData`
        task that provides the training/tournament data.
        """
        return FetchAndExtractData(output_path=self.output_path)

    def output(self):
        """
        Declares the output of this task: a csv file of the predictions made
        for the given data. The file name contains the current round number
        as reported by the API, so calling this method queries the API.
        """
        self.apc = NumerAPI()
        fn = 'predictions_{0}_LogisticRegression.csv'.format(
            self.apc.get_current_round())
        return luigi.LocalTarget(os.path.join(self.output_path, fn))

    def run(self):
        """
        Trains a model and makes predictions given the data. These are then
        saved to the csv file declared by :py:func:`output`.
        """
        data = self.input()
        out = self.output()

        training_data = pd.read_csv(data['training_data.csv'].path, header=0)
        prediction_data = pd.read_csv(data['tournament_data.csv'].path,
                                      header=0)

        # Every column whose name contains "feature" is used as model input.
        features = [f for f in list(training_data) if "feature" in f]
        X = training_data[features]
        Y = training_data["target"]
        x_prediction = prediction_data[features]
        ids = prediction_data["id"]

        # This is the model that will learn to predict.
        model = linear_model.LogisticRegression(n_jobs=-1)
        model.fit(X, Y)

        # The model returns two columns:
        # [probability of 0, probability of 1].
        # We are just interested in the probability that the target is 1.
        y_prediction = model.predict_proba(x_prediction)
        results = y_prediction[:, 1]
        results_df = pd.DataFrame(data={'probability': results})
        # Index-based join: pairs each id with the prediction of the same row.
        joined = pd.DataFrame(ids).join(results_df)

        # Predictions are computed once and written only to the declared
        # target; no additional copy is dropped into the working directory.
        print("Writing predictions to {0}".format(out.path))
        joined.to_csv(out.path, index=False)
class FetchAndExtractData(luigi.Task):
    """
    Fetches the most recent dataset and extracts the contents to the given
    path if not yet done (default path is ``./data``).

    :param: output_path:
        (relative) path where the data should be written to. Defaults to
        ``./data``. Default signature is
        ``FetchAndExtractData(output_path='./data')``.

    ::

        data
        ├── numerai_dataset_95
        │   ├── example_model.py
        │   ├── example_model.r
        │   ├── example_predictions.csv
        │   ├── numerai_tournament_data.csv
        │   └── numerai_training_data.csv
        └── numerai_dataset_95.zip
    """
    output_path = luigi.Parameter(default='./data/')

    def _current_round(self):
        """
        Creates a fresh API client (kept on ``self.apc``) and returns the
        current round number it reports.
        """
        self.apc = NumerAPI()
        return self.apc.get_current_round()

    def output(self):
        """
        Manages the files to be written and determines their existence. This
        is determined by checking all the listed files below. If any of them
        does not exist, :py:func:`run` is evoked.

        This method only declares the targets; the download itself happens in
        :py:func:`run`.

        :returns:
            A ``dict`` with the following keys:

            * ``zipfile``: original file as downloaded
              (``numerai_dataset_xxx.zip``)
            * ``training_data.csv``: the training data
              (``numerai_training_data.csv``)
            * ``tournament_data.csv``: the tournament data
              (``numerai_tournament_data.csv``)
            * ``example_predictions.csv``: example predictions
              (``example_predictions.csv``)

            Note that ``example_model.py`` and ``example_model.r`` are not
            referenced, as these are of no use for us.
        """
        current_round = self._current_round()
        dataset_name = "numerai_dataset_{0}.zip".format(current_round)
        dataset_dir = "numerai_dataset_{0}".format(current_round)

        # The archive is expected to be extracted into a directory named
        # after the zip file (without extension) next to the zip file.
        dataset_path = os.path.join(self.output_path, dataset_dir)
        training_data_path = os.path.join(dataset_path,
                                          'numerai_training_data.csv')
        tournament_data_path = os.path.join(dataset_path,
                                            'numerai_tournament_data.csv')
        example_data_path = os.path.join(dataset_path,
                                         'example_predictions.csv')

        return {
            'zipfile': luigi.LocalTarget(os.path.join(self.output_path,
                                                      dataset_name)),
            'training_data.csv': luigi.LocalTarget(training_data_path),
            'tournament_data.csv': luigi.LocalTarget(tournament_data_path),
            'example_predictions.csv': luigi.LocalTarget(example_data_path),
        }

    def run(self):
        """
        Downloads the current dataset into ``output_path`` and unzips it.

        :raises RuntimeError:
            if the API client reports that the download did not succeed.
        """
        current_round = self._current_round()
        dataset_name = "numerai_dataset_{0}.zip".format(current_round)

        # Explicit check instead of ``assert``: an assert statement is removed
        # under ``python -O``, which would skip the download call entirely.
        downloaded = self.apc.download_current_dataset(
            dest_path=self.output_path,
            dest_filename=dataset_name,
            unzip=True)
        if not downloaded:
            raise RuntimeError(
                "Downloading {0} to {1} failed".format(dataset_name,
                                                       self.output_path))
def test_get_current_round():
    """The round number reported by a fresh API client is at least 82."""
    assert NumerAPI().get_current_round() >= 82
def test_get_current_round(api: NumerAPI):
    """The round number reported by the supplied API client is at least 82."""
    round_number = api.get_current_round()
    assert round_number >= 82