示例#1
0
class TrainAndPredict(luigi.Task):
    """
    Trains a logistic regression classifier on the training data, then
    predicts the targets on the tournament data.
    The default signature of this task is ``TrainAndPredict(output_path='./data/')``.

    :param: output_path (str):
        path to the directory where the predictions shall be saved to, defaults to
        ``./data/``.
    """
    output_path = luigi.Parameter(default='./data/')

    def requires(self):
        """
        Dependencies to be fulfilled prior to execution. This task needs the
        :py:class:`tasks.numerai_fetch_training_data.FetchAndExtractData` task that provides
        the training/tournament data.
        """
        return FetchAndExtractData(output_path=self.output_path)

    def output(self):
        """
        Describes the output of this task, which is a csv file of the predictions
        made for the given data. The file name contains the current round number
        as reported by the API.

        :returns:
            A ``luigi.LocalTarget`` pointing to
            ``<output_path>/predictions_<round>_LogisticRegression.csv``.
        """
        # The round number is queried from the API on every call.
        self.apc = NumerAPI()
        fn = 'predictions_{0}_LogisticRegression.csv'.format(
            self.apc.get_current_round())
        return luigi.LocalTarget(os.path.join(self.output_path, fn))

    def run(self):
        """
        Trains a model and makes predictions given the data. These are then saved
        to the csv file described by :py:func:`output`.
        """
        data = self.input()
        out = self.output()

        training_data = pd.read_csv(data['training_data.csv'].path, header=0)
        prediction_data = pd.read_csv(data['tournament_data.csv'].path,
                                      header=0)

        # Every column whose name contains "feature" is used as model input
        features = [f for f in list(training_data) if "feature" in f]
        X = training_data[features]
        Y = training_data["target"]
        x_prediction = prediction_data[features]
        ids = prediction_data["id"]

        # This is the model that will learn to predict
        model = linear_model.LogisticRegression(n_jobs=-1)

        # The model is trained on the training data
        model.fit(X, Y)

        # The trained model is now used to make predictions on the
        # tournament data.
        # The model returns two columns: [probability of 0, probability of 1]
        # We are just interested in the probability that the target is 1.
        y_prediction = model.predict_proba(x_prediction)
        results = y_prediction[:, 1]
        results_df = pd.DataFrame(data={'probability': results})
        # ``join`` aligns on the index, pairing each id with its probability
        joined = pd.DataFrame(ids).join(results_df)

        # Predictions are computed and written exactly once, to the target
        # declared by ``output()``.
        print("Writing predictions to {0}".format(out.path))
        joined.to_csv(out.path, index=False)
class FetchAndExtractData(luigi.Task):
    """
    Fetches the most recent dataset and extracts the contents to the given
    path if not yet done (default path is ``./data/``).

    :param: output_path:
        (relative) path where the data should be written to. Defaults to
        ``./data/``. Default signature is
        ``FetchAndExtractData(output_path='./data/')``.

    ::

        data
        ├── numerai_dataset_95
        │   ├── example_model.py
        │   ├── example_model.r
        │   ├── example_predictions.csv
        │   ├── numerai_tournament_data.csv
        │   └── numerai_training_data.csv
        └── numerai_dataset_95.zip

    """
    output_path = luigi.Parameter(default='./data/')

    def output(self):
        """
        Manages the files to be written and determines their existence.
        This is determined by checking all the listed files below. If any
        of them does not exist, :py:func:`run` is invoked.

        Note that this method itself triggers the dataset download through
        the API client.

        :returns:
            A ``dict`` with the following keys:

            * ``zipfile``: original file as downloaded
              (``numerai_dataset_xxx.zip``)
            * ``training_data.csv``: the training data
              (``numerai_training_data.csv``)
            * ``tournament_data.csv``: the tournament data
              (``numerai_tournament_data.csv``)
            * ``example_predictions.csv``: example predictions
              (``example_predictions.csv``)

            Note that ``example_model.py`` and ``example_model.r`` are not referenced,
            as these are of no use for us.

        :raises AssertionError:
            if the download call returns a falsy value.
        """
        self.apc = NumerAPI()

        current_round = self.apc.get_current_round()
        dataset_name = "numerai_dataset_{0}.zip".format(current_round)
        dataset_dir = "numerai_dataset_{0}".format(current_round)

        # The download must not live inside an ``assert`` statement:
        # ``python -O`` strips asserts, which would silently skip the call.
        downloaded = self.apc.download_current_dataset(
            dest_path=self.output_path,
            dest_filename=dataset_name,
            unzip=True)
        if not downloaded:
            # AssertionError is raised so callers see the same exception type
            # an ``assert`` on the return value would produce.
            raise AssertionError(
                "download of {0} failed".format(dataset_name))

        # see numerapi download_current_dataset
        dataset_path = os.path.join(self.output_path, dataset_dir)

        training_data_path = os.path.join(dataset_path,
                                          'numerai_training_data.csv')
        tournament_data_path = os.path.join(dataset_path,
                                            'numerai_tournament_data.csv')
        example_data_path = os.path.join(dataset_path,
                                         'example_predictions.csv')

        out = {
            'zipfile':
            luigi.LocalTarget(os.path.join(self.output_path, dataset_name)),
            'training_data.csv':
            luigi.LocalTarget(training_data_path),
            'tournament_data.csv':
            luigi.LocalTarget(tournament_data_path),
            'example_predictions.csv':
            luigi.LocalTarget(example_data_path)
        }
        print(out)
        return out

    def run(self):
        """
        Resolves the task outputs; the download itself happens inside
        :py:func:`output`.
        """
        # NOTE(review): ``out`` is not used any further in the visible code --
        # confirm whether this method is complete.
        out = self.output()
示例#3
0
def test_get_current_round():
    """Check that a fresh API client reports a round number of at least 82."""
    round_number = NumerAPI().get_current_round()
    assert round_number >= 82
示例#4
0
def test_get_current_round(api: NumerAPI):
    """Check that the provided API client reports a round number of at least 82."""
    round_number = api.get_current_round()
    assert round_number >= 82