Example No. 1
class worker(Config):
    # NOTE: `section.config-variable` in the config_path argument is deprecated in favor of `worker.config_variable`

    ping_interval = FloatParameter(default=1.0,
                                   config_path=dict(
                                       section='core',
                                       name='worker-ping-interval'))
    keep_alive = BoolParameter(default=False,
                               config_path=dict(section='core',
                                                name='worker-keep-alive'))
    count_uniques = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-count-uniques'),
        description='worker-count-uniques means that we will keep a '
        'worker alive only if it has a unique pending task, as '
        'well as having keep-alive true')
    wait_interval = FloatParameter(default=1.0,
                                   config_path=dict(
                                       section='core',
                                       name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)

    max_reschedules = IntParameter(default=1,
                                   config_path=dict(
                                       section='core',
                                       name='worker-max-reschedules'))
    timeout = IntParameter(default=0,
                           config_path=dict(section='core',
                                            name='worker-timeout'))
    task_limit = IntParameter(default=None,
                              config_path=dict(section='core',
                                               name='worker-task-limit'))
    retry_external_tasks = BoolParameter(
        default=False,
        config_path=dict(section='core', name='retry-external-tasks'),
        description='If true, incomplete external tasks will be '
        'retested for completion while Luigi is running.')
    no_install_shutdown_handler = BoolParameter(
        default=False,
        description='If true, the SIGUSR1 shutdown handler will '
        'NOT be installed on the worker')
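Because each parameter declares both a default and a legacy config_path, a value can come from the new [worker] section of luigi.cfg or, for backwards compatibility, from the old [core] key. A minimal usage sketch, assuming luigi is installed and the worker class above is importable; the config values mentioned in the comments are hypothetical:

# Instantiating a Config subclass reads each parameter from luigi.cfg,
# falling back to the declared default. With a hypothetical
#   [worker]
#   ping_interval = 2.0
# section in luigi.cfg, the instance below reports 2.0.
w = worker()
print(w.ping_interval)  # 2.0 from config, otherwise the default 1.0
print(w.keep_alive)     # True only if enabled in config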
Example No. 2
class worker(Config):
    # NOTE: `section.config-variable` in the config_path argument is deprecated in favor of `worker.config_variable`

    ping_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-ping-interval'))
    keep_alive = BoolParameter(default=False,
                               config_path=dict(section='core', name='worker-keep-alive'))
    count_uniques = BoolParameter(default=False,
                                  config_path=dict(section='core', name='worker-count-uniques'),
                                  description='worker-count-uniques means that we will keep a '
                                  'worker alive only if it has a unique pending task, as '
                                  'well as having keep-alive true')
    count_last_scheduled = BoolParameter(default=False,
                                         description='Keep a worker alive only if there are '
                                                     'pending tasks which it was the last to '
                                                     'schedule.')
    wait_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)

    max_reschedules = IntParameter(default=1,
                                   config_path=dict(section='core', name='worker-max-reschedules'))
    timeout = IntParameter(default=0,
                           config_path=dict(section='core', name='worker-timeout'))
    task_limit = IntParameter(default=None,
                              config_path=dict(section='core', name='worker-task-limit'))
    retry_external_tasks = BoolParameter(default=False,
                                         config_path=dict(section='core', name='retry-external-tasks'),
                                         description='If true, incomplete external tasks will be '
                                         'retested for completion while Luigi is running.')
    send_failure_email = BoolParameter(default=True,
                                       description='If true, send e-mails directly from the worker '
                                                   'on failure')
    no_install_shutdown_handler = BoolParameter(default=False,
                                                description='If true, the SIGUSR1 shutdown handler will '
                                                'NOT be installed on the worker')
    check_unfulfilled_deps = BoolParameter(default=True,
                                           description='If true, check for completeness of '
                                           'dependencies before running a task')
Example No. 3
class postgres_cfg_music(Config):
    user = Parameter(visibility=ParameterVisibility.PRIVATE, significant=False)
    password = Parameter(visibility=ParameterVisibility.PRIVATE,
                         significant=False)
    host = Parameter(visibility=ParameterVisibility.PRIVATE, significant=False)
    port = IntParameter(visibility=ParameterVisibility.PRIVATE,
                        significant=False)
    database = Parameter(visibility=ParameterVisibility.PRIVATE,
                         significant=False)
    read_user = Parameter(visibility=ParameterVisibility.PRIVATE,
                          significant=False)
    read_password = Parameter(visibility=ParameterVisibility.PRIVATE,
                              significant=False)
Example No. 4
class ThePayneMixin(SlurmMixin, BaseTask):

    task_namespace = "ThePayne"

    n_steps = IntParameter(
        default=100000,
        config_path=dict(section=task_namespace, name="n_steps")
    )
    n_neurons = IntParameter(
        default=300,
        config_path=dict(section=task_namespace, name="n_neurons")
    )
    weight_decay = FloatParameter(
        default=0.0,
        config_path=dict(section=task_namespace, name="weight_decay")
    )
    learning_rate = FloatParameter(
        default=0.001,
        config_path=dict(section=task_namespace, name="learning_rate")
    )
    training_set_path = Parameter(
        config_path=dict(section=task_namespace, name="training_set_path")
    )
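Since every config_path points at section=task_namespace, all of these parameters can be configured once under a [ThePayne] section and inherited by any concrete task that uses the mix-in. A sketch; TrainThePayne is a hypothetical subclass shown only to illustrate the pattern:

class TrainThePayne(ThePayneMixin):

    def run(self):
        # n_steps, learning_rate, etc. resolve from [ThePayne] in
        # luigi.cfg, or fall back to the defaults declared in the mix-in.
        print(self.n_steps, self.n_neurons, self.learning_rate)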
Example No. 5
class worker(Config):

    ping_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-ping-interval'))
    keep_alive = BoolParameter(default=False,
                               config_path=dict(section='core', name='worker-keep-alive'))
    count_uniques = BoolParameter(default=False,
                                  config_path=dict(section='core', name='worker-count-uniques'),
                                  description='worker-count-uniques means that we will keep a '
                                  'worker alive only if it has a unique pending task, as '
                                  'well as having keep-alive true')
    wait_interval = IntParameter(default=1,
                                 config_path=dict(section='core', name='worker-wait-interval'))
    max_reschedules = IntParameter(default=1,
                                   config_path=dict(section='core', name='worker-max-reschedules'))
    timeout = IntParameter(default=0,
                           config_path=dict(section='core', name='worker-timeout'))
    task_limit = IntParameter(default=None,
                              config_path=dict(section='core', name='worker-task-limit'))
    retry_external_tasks = BoolParameter(default=False,
                                         config_path=dict(section='core', name='retry-external-tasks'),
                                         description='If true, incomplete external tasks will be '
                                         'retested for completion while Luigi is running.')
Example No. 6
class Download(Task):
    import praw
    
    # The version of the model
    version = IntParameter(default=1)

    # At most 500 posts are loaded per class
    limit = IntParameter(default=500)

    # The subreddits used to train the DecisionTree
    subreddits = ["datascience", "gameofthrones"]

    # PRAW requires a Reddit account,
    # including a registered application with client ID and secret
    reddit = praw.Reddit(user_agent="test",
                         client_id="wpaIV3-b3AYOJQ", client_secret="-M_LPtLCpkqlJTCyg--Rg9ePAwg")

    # The LocalTarget for the raw data.
    # The data is stored under
    # "model/<version>/raw.csv"
    def output(self):
        return LocalTarget("model/%d/raw.csv" % self.version)
    
    # The posts are downloaded,
    # converted into a DataFrame,
    # and written to the target as CSV
    def run(self):
        from functools import reduce
        dataset = reduce(lambda p, n: p.append(n), self.fetch_reddit_data())
        with self.output().open("w") as out:
            dataset.to_csv(out, encoding='utf-8', index=False, sep=';')

    def fetch_reddit_data(self):
        from pandas import DataFrame
        for sub in self.subreddits:
            posts = list(self.reddit.subreddit(sub).hot(limit=self.limit))
            relevant = DataFrame([p.__dict__ for p in posts])[['title', 'selftext', "subreddit"]]
            yield relevant
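The task can be driven programmatically with luigi.build. A minimal sketch, assuming the snippet's missing module-level imports (luigi and LocalTarget) are in place:

import luigi

if __name__ == "__main__":
    # local_scheduler=True avoids needing a running luigid instance.
    luigi.build([Download(version=1, limit=100)], local_scheduler=True)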
Example No. 7
class ExtractDataset(ExternalProgramTask):
    
    dataset_version = IntParameter(default=1)
    dataset_name = Parameter(default="dataset")
    
    def requires(self):
        return DownloadDataset(self.dataset_version, self.dataset_name)

    def output(self):
        return LocalTarget("datasets/fruit-images-dataset/%d" % self.dataset_version)

    def program_args(self):
        self.output().makedirs()
        return ["unzip", "-u", "-q",
                "-d", self.output().path,
                self.input().path]
Example No. 8
class DownloadDataset(ExternalProgramTask):

    dataset_version = IntParameter(default=1)
    dataset_name = Parameter(default="dataset")

    base_url = "http://plainpixels.work/resources/datasets"
    file_fomat = "zip"

    def output(self):
        return LocalTarget(
            "/tmp/%s_v%d.%s" %
            (self.dataset_name, self.dataset_version, self.file_fomat))

    def program_args(self):
        url = "%s/%s_v%d.%s" % (self.base_url, self.dataset_name,
                                self.dataset_version, self.file_fomat)
        return ["curl", "-L", "-o", self.output().path, url]
Example No. 9
class ClassifyWhiteDwarfMixin(BaseTask):

    """
    Mix-in class for classifying white dwarfs.
    """

    model_path = Parameter()

    wavelength_regions = ListParameter(
        default=[
            [3860, 3900], # Balmer line
            [3950, 4000], # Balmer line
            [4085, 4120], # Balmer line
            [4320, 4360], # Balmer line
            [4840, 4880], # Balmer line
            [6540, 6580], # Balmer line
            [3880, 3905], # He I/II line
            [3955, 3975], # He I/II line
            [3990, 4056], # He I/II line
            [4110, 4140], # He I/II line
            [4370, 4410], # He I/II line
            [4450, 4485], # He I/II line
            [4705, 4725], # He I/II line
            [4900, 4950], # He I/II line
            [5000, 5030], # He I/II line
            [5860, 5890], # He I/II line
            [6670, 6700], # He I/II line
            [7050, 7090], # He I/II line
            [7265, 7300], # He I/II line
            [4600, 4750], # Molecular C absorption band
            [5000, 5160], # Molecular C absorption band
            [3925, 3940], # Ca H/K line
            [3960, 3975], # Ca H/K line
        ]
    )

    polyfit_order = IntParameter(default=5)
    polyfit_regions = ListParameter(
        default=[
            [3850, 3870],
            [4220, 4245],
            [5250, 5400],
            [6100, 6470],
            [7100, 9000]
        ]
    )
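On the command line a ListParameter is parsed from JSON, and at runtime Luigi freezes the value into nested tuples, so the regions can be iterated directly. A small sketch of the masking pattern a task using this mix-in might apply; mask_regions and its arguments are hypothetical:

import numpy as np

def mask_regions(wavelength, regions):
    # Build a boolean mask that is True inside any [start, end] region.
    mask = np.zeros(wavelength.size, dtype=bool)
    for start, end in regions:
        mask |= (start <= wavelength) & (wavelength <= end)
    return mask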
Example No. 10
class BaselineValidation(Task):

    dataset_version = IntParameter(default=1)
    dataset_name = Parameter(default="dataset")
    config_name = Parameter(default="standard")

    validation_set = "Test"
    baseline_name = "find_round_objects"

    def requires(self):
        yield ExtractDataset(self.dataset_version, self.dataset_name)
        yield Configure(self.config_name)

    def output(self):
        return LocalTarget("baseline/%s.json" % self.baseline_name)

    def run(self):
        dataset = self.input()[0].path
        config = self.input()[1].path
        test_data = build_generator(config, dataset, self.validation_set)
        result = calc_baseline_acc(test_data, dataset, self.validation_set)
        with self.output().open("wb") as f:
            json.dump(result, f)
Example No. 11
class stockfish_cfg(Config):
    depth = IntParameter()
    location = Parameter(visibility=ParameterVisibility.PRIVATE,
                         significant=False)
Example No. 12
class worker(Config):
    # NOTE: `section.config-variable` in the config_path argument is deprecated in favor of `worker.config_variable`

    ping_interval = FloatParameter(default=1.0,
                                   config_path=dict(
                                       section='core',
                                       name='worker-ping-interval'))
    keep_alive = BoolParameter(default=False,
                               config_path=dict(section='core',
                                                name='worker-keep-alive'))
    count_uniques = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-count-uniques'),
        description='worker-count-uniques means that we will keep a '
        'worker alive only if it has a unique pending task, as '
        'well as having keep-alive true')
    count_last_scheduled = BoolParameter(
        default=False,
        description='Keep a worker alive only if there are '
        'pending tasks which it was the last to '
        'schedule.')
    wait_interval = FloatParameter(default=1.0,
                                   config_path=dict(
                                       section='core',
                                       name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)

    max_reschedules = IntParameter(default=1,
                                   config_path=dict(
                                       section='core',
                                       name='worker-max-reschedules'))
    timeout = IntParameter(default=0,
                           config_path=dict(section='core',
                                            name='worker-timeout'))
    task_limit = IntParameter(default=None,
                              config_path=dict(section='core',
                                               name='worker-task-limit'))
    retry_external_tasks = BoolParameter(
        default=False,
        config_path=dict(section='core', name='retry-external-tasks'),
        description='If true, incomplete external tasks will be '
        'retested for completion while Luigi is running.')
    send_failure_email = BoolParameter(
        default=True,
        description='If true, send e-mails directly from the worker '
        'on failure')
    no_install_shutdown_handler = BoolParameter(
        default=False,
        description='If true, the SIGUSR1 shutdown handler will '
        'NOT be installed on the worker')
    check_unfulfilled_deps = BoolParameter(
        default=True,
        description='If true, check for completeness of '
        'dependencies before running a task')
    force_multiprocessing = BoolParameter(
        default=False,
        description='If true, use multiprocessing also when '
        'running with 1 worker')
    task_process_context = Parameter(
        default=None,
        description='If set to a fully qualified class name, the class will '
        'be instantiated with a TaskProcess as its constructor parameter and '
        'applied as a context manager around its run() call, so this can be '
        'used for obtaining high level customizable monitoring or logging of '
        'each individual Task run.')
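The task_process_context hook expects a fully qualified class name whose instances wrap each run() call. A hypothetical context manager that could be referenced there (e.g. task_process_context = mymodule.TaskRunMonitor in the config); the class name and the timing logic are illustrative only:

import time

class TaskRunMonitor:
    def __init__(self, task_process):
        # Receives the TaskProcess as its constructor parameter.
        self.task_process = task_process

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        elapsed = time.time() - self.start
        print("task ran for %.1f seconds" % elapsed)
        return False  # never swallow task exceptions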
Example No. 13
class TrainTheCannonBase(TheCannonMixin):
    """
    A base task for training The Cannon.

    :param label_names:
        A list of label names.
    
    :param order: (optional)
        The polynomial order to use for this model (default: 2).    

    :param regularization: (optional)
        The strength of L1-regularization to apply during training.
    
    :param threads: (optional)
        The number of threads to use (default: 1).
    
    :param plot: (optional)
        A boolean flag to indicate whether to produce post-training quality plots.
    """

    regularization = FloatParameter(default=0.0)
    threads = IntParameter(default=1, significant=False)
    plot = BoolParameter(default=True, significant=False)

    def run(self):
        """ Execute this task. """

        # Load training set labels and spectra.
        labels, dispersion, training_set_flux, training_set_ivar = read_training_set(
            self.input().path)

        # Set the vectorizer.
        # We sort the label names so that luigi doesn't re-train models if we alter the order.
        vectorizer = tc.vectorizer.PolynomialVectorizer(
            sorted(self.label_names), self.order)

        # Initiate model.
        model = tc.model.CannonModel(labels,
                                     training_set_flux,
                                     training_set_ivar,
                                     vectorizer=vectorizer,
                                     dispersion=dispersion,
                                     regularization=self.regularization)

        log.info(f"Training The Cannon model {model}")
        model.train(threads=self.threads)

        output_path = self.output().path
        log.info(f"Writing The Cannon model {model} to disk {output_path}")
        model.write(output_path)

        if self.plot:
            # Plot zeroth and first order coefficients.
            fig = plot.theta(
                model,
                indices=np.arange(1 + len(model.vectorizer.label_names)),
                normalize=False)
            fig.savefig(f"{self.task_id}-theta.png")

            # Plot scatter.
            fig = plot.scatter(model)
            fig.savefig(f"{self.task_id}-scatter.png")

            # Plot one-to-one.
            test_labels, test_cov, test_meta = model.test(
                training_set_flux,
                training_set_ivar,
                initial_labels=model.training_set_labels)
            fig = plot.one_to_one(model, test_labels, cov=test_cov)
            fig.savefig(f"{self.task_id}-one-to-one.png")

    def output(self):
        """ The output of this task. """
        return LocalTarget(
            os.path.join(self.output_base_dir, f"{self.task_id}.pkl"))
Example No. 14
class PredictTrend(luigi.Task):

    dataset_version = DateParameter(default=datetime.date.today())
    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)
    model_name = Parameter(default="LR")
    n_days_to_predict = IntParameter(default=7)

    def requires(self):
        return {
            "model":
            Modeling(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
                self.model_name,
            ),
            "data_transformed":
            DataTransform(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
            ),
        }

    output_folder = os.path.join(output_dir, "prediction")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_prediction_{self.attribute}_w{self.window_size}_{self.model_name}_N{self.n_days_to_predict}_v{self.dataset_version}.csv",
            ))

    def run(self):
        df_date = pd.read_csv(self.input()["data_transformed"].path,
                              index_col="date")
        df_date.index = pd.to_datetime(df_date.index)

        import pickle

        with open(self.input()["model"].path, "rb") as f:
            regr = pickle.load(f)

        df_windows_pred = self.predictWindowing(
            df_date,
            self.attribute,
            regr,
            self.window_size,
            self.n_days_to_predict,
        )

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)

        df_windows_pred.to_csv(self.output().path)

    def predictWindowing(self,
                         df_windows_train,
                         attribute,
                         regr,
                         window,
                         n_days_to_predict=10):

        date_end_train_date = df_windows_train.index[-1].date()
        date_previous_window = date_end_train_date - datetime.timedelta(
            days=window + 1)
        df_test_window = pd.DataFrame(
            df_windows_train[date_previous_window:date_end_train_date]
            [attribute])
        test_window = df_test_window[attribute].values
        start_i = len(test_window)
        X_test_prog = []
        y_pred_prog = []
        for i in range(start_i, start_i + n_days_to_predict):
            # X: |window| preceding samples
            X_test_i = test_window[i - window:i][::-1]
            X_test_prog.append(X_test_i)
            # y: regressor estimate given the |window| preceding samples
            y_i = regr.predict([X_test_i])[0]
            y_pred_prog.append(y_i)
            test_window = np.append(test_window, y_i)

        # DataFrame of X: the |window| preceding samples
        df_pred = pd.DataFrame(
            X_test_prog, columns=[f"v_t-{i}" for i in range(1, window + 1)])
        # y_predicted --> y estimated by the regressor
        df_pred[f"y_pred_{attribute}"] = y_pred_prog

        # Add dates and set as index
        start_pred_date = date_end_train_date + datetime.timedelta(days=1)
        datelist = pd.date_range(start_pred_date, periods=n_days_to_predict)
        df_pred.set_index(datelist, inplace=True)
        df_pred.index.name = "date"

        return df_pred
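predictWindowing forecasts recursively: each one-step-ahead prediction is appended to the window and fed back in as input for the next step. A standalone sketch of that loop with a dummy window-mean regressor; DummyRegressor and the numbers are hypothetical:

import numpy as np

class DummyRegressor:
    def predict(self, X):
        # Naive stand-in for `regr`: predict the mean of the window.
        return [np.mean(X[0])]

window, n_ahead = 3, 5
history = np.array([10.0, 12.0, 14.0, 16.0])
regr = DummyRegressor()
for i in range(len(history), len(history) + n_ahead):
    x = history[i - window:i][::-1]  # most recent sample first
    history = np.append(history, regr.predict([x])[0])
print(history[-n_ahead:])  # the five recursively predicted values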
Example No. 15
class Classify(PySparkTask):
    from datetime import date

    date = DateParameter(default=date.today())
    version = IntParameter(default=1)

    # PySpark parameters
    driver_memory = '1g'
    executor_memory = '2g'
    executor_cores = '2'
    num_executors = '4'
    master = 'local'

    # The tasks *Clean* and *ModelExists*
    # are returned as dependencies
    def requires(self):
        return [ModelExists(self.version), Clean(self.date)]

    # The LocalTarget for the classification result.
    # The data is stored under
    # "daily/<date>/ergebnis.csv"
    def output(self):
        prefix = self.date.strftime("%m-%d-%Y")
        return LocalTarget("daily/%s/ergebnis.csv" % prefix)

    def main(self, sc, *args):
        from pyspark.sql.session import SparkSession
        from pyspark.ml import PipelineModel
        from pyspark.sql.functions import when

        # Initialize the SparkSession
        sql = SparkSession.builder\
            .enableHiveSupport() \
            .config("hive.exec.dynamic.partition", "true") \
            .config("hive.exec.dynamic.partition.mode", "nonstrict") \
            .config("hive.exec.max.dynamic.partitions", "4096") \
            .getOrCreate()

        # Load the cleaned data
        df = sql.read.format("com.databricks.spark.csv") \
                     .option("delimiter", ";") \
                     .option("header", "true") \
                     .load(self.input()[1].path)

        # Load the model previously trained with SparkML
        model = PipelineModel.load(self.input()[0].path)

        # Classify one day's records with the model
        ergebnis = model.transform(df)[["id",
                                        "subreddit",
                                        "probability",
                                        "prediction"]]

        # A little post-processing of the data, since
        # class "1" is named "datascience"
        ergebnis = ergebnis.withColumn("prediction_label",
                                       when(ergebnis.prediction == 1,
                                            "datascience")
                                       .otherwise("gameofthrones"))

        # For simplicity, the DataFrame is
        # converted to a pandas DataFrame.
        # This should be avoided for large data volumes.
        with self.output().open("w") as out:
            ergebnis.toPandas().to_csv(out,
                                       encoding='utf-8',
                                       index=False,
                                       sep=';')
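PySparkTask reads the class attributes above (driver_memory, executor_memory, executor_cores, num_executors, master) and turns them into the corresponding spark-submit options before invoking main(). A minimal sketch for driving the task:

import luigi
from datetime import date

# Runs ModelExists and Clean first, then submits the Spark job.
luigi.build([Classify(date=date.today(), version=1)], local_scheduler=True)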
Example No. 16
class ModelExists(WrapperTask):
    version = IntParameter(default=1)

    def output(self):
        return LocalTarget("model/%d/model" % self.version)
Example No. 17
class PlotFutureTrend(luigi.Task):

    dataset_version = DateParameter(default=datetime.date.today())

    dataset_name = Parameter(default="covidIT")
    attribute = Parameter(default="total_positive")
    window_size = IntParameter(default=7)
    model_name = Parameter(default="LR")
    n_days_to_predict = IntParameter(default=7)

    def requires(self):
        return {
            "data_pred":
            PredictTrend(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
                self.model_name,
                self.n_days_to_predict,
            ),
            "data_transformed":
            DataTransform(
                self.dataset_version,
                self.dataset_name,
                self.attribute,
                self.window_size,
            ),
        }

    output_folder = os.path.join(output_dir, "report_future_trend")

    def output(self):
        return LocalTarget(
            os.path.join(
                self.output_folder,
                f"{self.dataset_name}_future_trend_{self.attribute}_w{self.window_size}_N{self.n_days_to_predict}_{self.model_name}_v{self.dataset_version}.png",
            ))

    def run(self):
        df_windows_pred = pd.read_csv(self.input()["data_pred"].path,
                                      index_col="date")
        df_windows_pred.index = pd.to_datetime(df_windows_pred.index)
        df_date = pd.read_csv(self.input()["data_transformed"].path,
                              index_col="date")
        df_date.index = pd.to_datetime(pd.to_datetime(df_date.index).date)

        fig = self.plotEstimatedTrend(df_date, df_windows_pred, self.attribute)

        Path(self.output_folder).mkdir(parents=True, exist_ok=True)

        fig.savefig(self.output().path)

    def plotEstimatedTrend(
        self,
        df_date,
        df_windows_predicted,
        attribute,
        start_train=None,
        date_end_train=None,
        interval=40,
    ):
        import datetime

        # Starting date of the plot
        start_train = df_date.index[0] if start_train is None else start_train
        # End date of the true label/value of the plot
        date_end_train = (df_date.index[-1]
                          if date_end_train is None else date_end_train)

        start_test = date_end_train.date() + datetime.timedelta(days=1)
        if df_windows_predicted[start_test:].empty:
            # TODO
            raise ValueError

        fig, ax = plt.subplots(figsize=(12, 5))
        ax.grid()
        # Observed trend until training date
        x_date = df_date[start_train:date_end_train].index
        y_train = df_date[start_train:date_end_train][attribute].values
        ax.scatter(x_date, y_train, s=3, color="blue", label=attribute)
        # Predicted future trend
        ax.scatter(
            df_windows_predicted[start_test:].index,
            df_windows_predicted[start_test:][f"y_pred_{attribute}"].values,
            s=4,
            color="orange",
            label=f"{attribute} predicted",
        )
        ax.legend()
        ax.set(xlabel="Date", ylabel=attribute, title=attribute)
        date_form = DateFormatter("%d-%m")
        ax.xaxis.set_major_formatter(date_form)
        ax.xaxis.set_major_locator(mdates.DayLocator(interval=interval))
        return fig