Example #1
class worker(Config):
    # NOTE: `section.config-variable` in the config_path argument is deprecated in favor of `worker.config_variable`

    ping_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-ping-interval'))
    keep_alive = BoolParameter(default=False,
                               config_path=dict(section='core', name='worker-keep-alive'))
    count_uniques = BoolParameter(default=False,
                                  config_path=dict(section='core', name='worker-count-uniques'),
                                  description='worker-count-uniques means that we will keep a '
                                  'worker alive only if it has a unique pending task, as '
                                  'well as having keep-alive true')
    wait_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)

    max_reschedules = IntParameter(default=1,
                                   config_path=dict(section='core', name='worker-max-reschedules'))
    timeout = IntParameter(default=0,
                           config_path=dict(section='core', name='worker-timeout'))
    task_limit = IntParameter(default=None,
                              config_path=dict(section='core', name='worker-task-limit'))
    retry_external_tasks = BoolParameter(default=False,
                                         config_path=dict(section='core', name='retry-external-tasks'),
                                         description='If true, incomplete external tasks will be '
                                         'retested for completion while Luigi is running.')
    no_install_shutdown_handler = BoolParameter(default=False,
                                                description='If true, the SIGUSR1 shutdown handler will '
                                                'NOT be installed on the worker')
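The deprecation note above is easier to see with a concrete override. Below is a minimal sketch (assuming luigi is importable and that this worker Config class is in scope) of setting the preferred [worker] key programmatically, with the deprecated [core] spelling shown commented out for comparison.

from luigi import configuration

config = configuration.get_config()
config.set('worker', 'keep_alive', 'true')         # preferred: worker.keep_alive
# config.set('core', 'worker-keep-alive', 'true')  # deprecated: section.config-variable

print(worker().keep_alive)  # Config subclasses resolve their parameters from the loaded config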
Example #2
class worker(Config):

    ping_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core',
                                                    name='retry-delay'))
    keep_alive = BoolParameter(default=False,
                               config_path=dict(section='core',
                                                name='worker-keep-alive'))
    count_uniques = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-count-uniques'),
        description='worker-count-uniques means that we will keep a '
        'worker alive only if it has a unique pending task, as '
        'well as having keep-alive true')
    wait_interval = IntParameter(default=1,
                                 config_path=dict(section='core',
                                                  name='worker-wait-interval'))
    max_reschedules = IntParameter(default=1,
                                   config_path=dict(
                                       section='core',
                                       name='worker-max-reschedules'))
    timeout = IntParameter(default=0,
                           config_path=dict(section='core',
                                            name='worker-timeout'))
    task_limit = IntParameter(default=None,
                              config_path=dict(section='core',
                                               name='worker-task-limit'))
    retry_external_tasks = BoolParameter(
        default=False,
        config_path=dict(section='core', name='retry-external-tasks'),
        description='If true, incomplete external tasks will be '
        'retested for completion while Luigi is running.')
Example #3
class worker(Config):
    # NOTE: `section.config-variable` in the config_path argument is deprecated in favor of `worker.config_variable`

    ping_interval = FloatParameter(default=1.0,
                                   config_path=dict(
                                       section='core',
                                       name='worker-ping-interval'))
    keep_alive = BoolParameter(default=False,
                               config_path=dict(section='core',
                                                name='worker-keep-alive'))
    count_uniques = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-count-uniques'),
        description='worker-count-uniques means that we will keep a '
        'worker alive only if it has a unique pending task, as '
        'well as having keep-alive true')
    count_last_scheduled = BoolParameter(
        default=False,
        description='Keep a worker alive only if there are '
        'pending tasks which it was the last to '
        'schedule.')
    wait_interval = FloatParameter(default=1.0,
                                   config_path=dict(
                                       section='core',
                                       name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)

    max_reschedules = IntParameter(default=1,
                                   config_path=dict(
                                       section='core',
                                       name='worker-max-reschedules'))
    timeout = IntParameter(default=0,
                           config_path=dict(section='core',
                                            name='worker-timeout'))
    task_limit = IntParameter(default=None,
                              config_path=dict(section='core',
                                               name='worker-task-limit'))
    retry_external_tasks = BoolParameter(
        default=False,
        config_path=dict(section='core', name='retry-external-tasks'),
        description='If true, incomplete external tasks will be '
        'retested for completion while Luigi is running.')
    send_failure_email = BoolParameter(
        default=True,
        description='If true, send e-mails directly from the worker '
        'on failure')
    no_install_shutdown_handler = BoolParameter(
        default=False,
        description='If true, the SIGUSR1 shutdown handler will '
        'NOT be installed on the worker')
    check_unfulfilled_deps = BoolParameter(
        default=True,
        description='If true, check for completeness of '
        'dependencies before running a task')
Example #4
class worker(Config):
    # NOTE: `section.config-variable` in the config_path argument is deprecated in favor of `worker.config_variable`

    ping_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-ping-interval'))
    keep_alive = BoolParameter(default=False,
                               config_path=dict(section='core', name='worker-keep-alive'))
    count_uniques = BoolParameter(default=False,
                                  config_path=dict(section='core', name='worker-count-uniques'),
                                  description='worker-count-uniques means that we will keep a '
                                  'worker alive only if it has a unique pending task, as '
                                  'well as having keep-alive true')
    count_last_scheduled = BoolParameter(default=False,
                                         description='Keep a worker alive only if there are '
                                                     'pending tasks which it was the last to '
                                                     'schedule.')
    wait_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)

    max_reschedules = IntParameter(default=1,
                                   config_path=dict(section='core', name='worker-max-reschedules'))
    timeout = IntParameter(default=0,
                           config_path=dict(section='core', name='worker-timeout'))
    task_limit = IntParameter(default=None,
                              config_path=dict(section='core', name='worker-task-limit'))
    retry_external_tasks = BoolParameter(default=False,
                                         config_path=dict(section='core', name='retry-external-tasks'),
                                         description='If true, incomplete external tasks will be '
                                         'retested for completion while Luigi is running.')
    send_failure_email = BoolParameter(default=True,
                                       description='If true, send e-mails directly from the worker '
                                                   'on failure')
    no_install_shutdown_handler = BoolParameter(default=False,
                                                description='If true, the SIGUSR1 shutdown handler will '
                                                'NOT be installed on the worker')
    check_unfulfilled_deps = BoolParameter(default=True,
                                           description='If true, check for completeness of '
                                           'dependencies before running a task')
    force_multiprocessing = BoolParameter(default=False,
                                          description='If true, use multiprocessing also when '
                                          'running with 1 worker')
    task_process_context = OptionalParameter(default=None,
                                             description='If set to a fully qualified class name, the class will '
                                             'be instantiated with a TaskProcess as its constructor parameter and '
                                             'applied as a context manager around its run() call, so this can be '
                                             'used for obtaining high level customizable monitoring or logging of '
                                             'each individual Task run.')
Example #5
class FetchLichessApiJSON(Task):

    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-raw-games-'
                         f'{self.player}-{self.perf_type}-json.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        import lichess.api
        from lichess.format import JSON
        from pandas import json_normalize
        from calendar import timegm

        self.output().makedirs()

        if self.single_day:
            unix_time_until = timegm((self.since
                                      + timedelta(days=1)).timetuple())
        else:
            unix_time_until = timegm(datetime.today().date().timetuple())
        self.until = int(1000 * unix_time_until)

        unix_time_since = timegm(self.since.timetuple())
        self.since_unix = int(1000 * unix_time_since)

        token = lichess_token().token

        games = lichess.api.user_games(self.player,
                                       since=self.since_unix,
                                       until=self.until,
                                       perfType=self.perf_type,
                                       auth=token,
                                       evals='false',
                                       clocks='false',
                                       moves='false',
                                       format=JSON)

        df = json_normalize([game
                             for game in games],
                            sep='_')

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
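A minimal sketch of running the task above with luigi's local scheduler; it assumes the surrounding module (including the lichess_token config class used in run()) is importable, and the parameter values are purely illustrative.

import luigi

if __name__ == '__main__':
    luigi.build(
        [FetchLichessApiJSON(player='thibault',
                             perf_type='blitz',
                             single_day=True)],
        local_scheduler=True,
    )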
Example #6
class CleanedHeadlines(Task):
    '''
    Loads the article headlines from the AWS instance, if available, and preprocesses the data
    for analysis. Returns a dataframe with pre-processed headlines that can be loaded into the
    Topic Modeling class for additional analysis.

    :input: S3 path to the article headlines
    :output: a local Parquet file with the preprocessed data
    '''
    subset = BoolParameter(default=True)
    requires = Requires()
    article_headlines = Requirement(ArticleHeadlines)
    date = datetime.datetime.now()
    date_suffix = str(date.month) + '_' + str(date.day) + '_' + str(date.year)

    output = TargetOutput(
        target_class=ParquetTarget,
        ext='-' + date_suffix,
        glob="*.parquet",
    )

    def run(self):
        dsk = self.input()['article_headlines'].read_dask(
            dtype={
                "publish_date": "int32",
                "headline_text": "str",
                "headline_id": "str"
            },
            storage_options=dict(requester_pays=True),
        )

        # dsk_df = dsk.compute()
        headlines_concat = "".join(dsk["headline_id"])
        headlines_hash = hash_str(headlines_concat, get_csci_salt()).hex()[:8]
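        # NOTE: headlines_hash is a salted hash of the concatenated headline ids;
        # it is computed here but not written out below.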
        self.output().write_dask(dsk, compression="gzip")

    def print_results(self):
        print(self.output().read_dask().compute())
Example #7
class EstimateStellarLabels(ThePayneMixin):
    
    """
    Use a pre-trained neural network to estimate stellar labels. This should be sub-classed to inherit properties from the type of spectra to be analysed.
    
    :param training_set_path:
        The path where the training set spectra and labels are stored.
        This should be a binary pickle file that contains a dictionary with the following keys:

        - wavelength: an array of shape (P, ) where P is the number of pixels
        - spectra: an array of shape (N, P) where N is the number of spectra and P is the number of pixels
        - labels: an array of shape (L, P) where L is the number of labels and P is the number of pixels
        - label_names: a tuple of length L that contains the names of the labels
    
    :param n_steps: (optional)
        The number of steps to train the network for (default 100000).
    
    :param n_neurons: (optional)
        The number of neurons to use in the hidden layer (default: 300).
    
    :param weight_decay: (optional)
        The weight decay to use during training (default: 0)
    
    :param learning_rate: (optional)
        The learning rate to use during training (default: 0.001).
    """

    max_batch_size = 10_000
    analyze_individual_visits = BoolParameter(default=False)

    def prepare_observation(self):
        """ Prepare the observations for analysis. """

        # When analysing individual visits we read everything; otherwise only the
        # first (stacked) spectrum is needed.
        data_slice = None if self.analyze_individual_visits else [0, 1]
        observation = Spectrum1D.read(
            self.input()["observation"].path,
            data_slice=slice(*data_slice) if data_slice is not None else None
        )

        if "continuum" in self.input():
            continuum_path = self.input()["continuum"]["continuum"].path
            while True:
                with open(continuum_path, "rb") as fp:
                    continuum = pickle.load(fp)

                # If there is a shape mis-match between the observations and the continuum
                # then it likely means that there have been observations taken since the
                # continuum task was run. In this case we need to re-run the continuum
                # normalisation.

                #log.debug(f"Continuum for {self} original shape {continuum.shape}")
                if data_slice is not None:
                    continuum = continuum[slice(*data_slice)]

                #log.debug(f"New shapes {observation.flux.shape} {continuum.shape}")
                
                O = observation.flux.shape[0]
                C = continuum.shape[0]

                # TODO: Consider if this is what we want to be doing..
                if O == C:
                    break

                else:
                    if O > C:
                        log.warn(f"Re-doing continuum for task {self} at runtime")
                    else:
                        log.warn(f"More continuum than observations in {self}?!")
                    
                    os.unlink(continuum_path)
                    self.requires()["continuum"].run()
        else:
            continuum = 1

        normalized_flux = observation.flux.value / continuum
        normalized_ivar = continuum * observation.uncertainty.array * continuum

        return (observation, continuum, normalized_flux, normalized_ivar)


    @slurmify
    def run(self):
        """ Execute this task. """

        # Load the model.
        log.info(f"Loading model for {self}")
        state = testing.load_state(self.input()["model"].path)

        # We can run this in batch mode.
        label_names = state["label_names"]
        tqdm_kwds = dict(total=self.get_batch_size(), desc="The Payne")
        for init, task in tqdm(timer(self.get_batch_tasks()), **tqdm_kwds):
            if task.complete():
                continue
            
            #log.debug(f"Running {task}")
            spectrum, continuum, normalized_flux, normalized_ivar = task.prepare_observation()

            #log.debug(f"Prepared observations for {task}")
            
            p_opt, p_cov, model_flux, meta = testing.test(
                spectrum.wavelength.value,
                normalized_flux,
                normalized_ivar,
                **state
            )

            #log.debug(f"Completed inference on {task}. p_opt has shape {p_opt.shape}")

            results = dict(zip(label_names, p_opt.T))
            # Note: we count the number of label names here in case we are sometimes using
            #       radial velocity determination or not, before we add in the SNR.

            L = len(results)
            # Add in uncertainties on parameters.
            results.update(dict(zip(
                (f"u_{ln}" for ln in label_names),
                np.sqrt(p_cov[:, np.arange(L), np.arange(L)].T)
            )))

            # Add in SNR values for convenience.
            results.update(snr=spectrum.meta["snr"])
            
            # Write AstraSource object.
            if "AstraSource" in task.output():
                #log.debug(f"Writing AstraSource object for {task}")    
                task.output()["AstraSource"].write(
                    spectrum=spectrum,
                    normalized_flux=normalized_flux,
                    normalized_ivar=normalized_ivar,
                    continuum=continuum,
                    model_flux=model_flux,
                    # TODO: Project uncertainties to flux space.
                    model_ivar=None,
                    results_table=Table(results)
                )

            # Write output to database.
            if "database" in task.output():
                #log.debug(f"Writing database output for {task}")
                task.output()["database"].write(results)

            # Trigger this event as complete, and record task duration.
            task.trigger_event_processing_time(time() - init, cascade=True)

        return None


    def output(self):
        """ The output of this task. """
        if self.is_batch_mode:
            return (task.output() for task in self.get_batch_tasks())
        
        return dict(
            database=DatabaseTarget(astradb.ThePayne, self),
            #AstraSource=AstraSource(self)
        )
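The uncertainty bookkeeping in run() above (the square root of each covariance matrix's diagonal, keyed by a u_-prefixed label name) can be seen in isolation in the sketch below; the array shapes and label names are assumptions for illustration only.

import numpy as np

label_names = ("teff", "logg", "fe_h")           # assumed label names
N, L = 4, len(label_names)                       # N spectra, L labels

p_opt = np.random.randn(N, L)                    # best-fit labels, one row per spectrum
p_cov = np.tile(np.eye(L) * 0.01, (N, 1, 1))     # (N, L, L) stack of covariance matrices

results = dict(zip(label_names, p_opt.T))
results.update(dict(zip(
    (f"u_{ln}" for ln in label_names),
    np.sqrt(p_cov[:, np.arange(L), np.arange(L)].T)  # (L, N): per-label 1-sigma uncertainties
)))
# results["u_teff"][i] is the uncertainty on teff for spectrum i.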
Example #8
class FetchLichessApiPGN(Task):

    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-raw-games-'
                         f'{self.player}-{self.perf_type}-pgn.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        import lichess.api
        from lichess.format import PYCHESS
        from pandas import DataFrame, read_pickle
        from calendar import timegm
        from pipeline_import.visitors import EvalsVisitor, ClocksVisitor
        from pipeline_import.visitors import QueenExchangeVisitor
        from pipeline_import.visitors import CastlingVisitor, PositionsVisitor
        from pipeline_import.visitors import PromotionsVisitor, MaterialVisitor

        self.output().makedirs()

        with self.input().open('r') as f:
            json = read_pickle(f, compression=None)
            game_count = len(json)

        if self.single_day:
            unix_time_until = timegm((self.since
                                      + timedelta(days=1)).timetuple())
        else:
            unix_time_until = timegm(datetime.today().date().timetuple())
        self.until = int(1000 * unix_time_until)

        unix_time_since = timegm(self.since.timetuple())
        self.since_unix = int(1000 * unix_time_since)

        token = lichess_token().token

        games = lichess.api.user_games(self.player,
                                       since=self.since_unix,
                                       until=self.until,
                                       perfType=self.perf_type,
                                       auth=token,
                                       clocks='true',
                                       evals='true',
                                       opening='true',
                                       format=PYCHESS)

        visitors = [EvalsVisitor,
                    ClocksVisitor,
                    QueenExchangeVisitor,
                    CastlingVisitor,
                    PromotionsVisitor,
                    PositionsVisitor,
                    MaterialVisitor,
                    ]

        header_infos = []

        counter = 0

        for game in games:
            game_infos = parse_headers(game, visitors)
            header_infos.append(game_infos)

            # progress bar stuff
            counter += 1

            current = f'{game_infos["UTCDate"]} {game_infos["UTCTime"]}'

            current_progress = counter / game_count
            self.set_status_message(f'Parsed until {current} :: '
                                    f'{counter} / {game_count}')
            self.set_progress_percentage(round(current_progress * 100, 2))

        df = DataFrame(header_infos)

        self.set_status_message('Parsed all games')
        self.set_progress_percentage(100)

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
Example #9
class GetEvals(Task):

    local_stockfish = BoolParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-game-evals-'
                         f'{self.player}-{self.perf_type}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle, to_numeric, concat, DataFrame

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:

            def complete(self):
                return True
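            # NOTE: this locally defined complete() is never bound to the task,
            # so it has no effect; the early return below is what ends the run.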

            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)

            return

        stockfish_params = stockfish_cfg()

        df = df[['evaluations', 'eval_depths', 'positions']]

        # explode the two different list-likes separately, then concat
        no_evals = df[~df['evaluations'].astype(bool)]
        df = df[df['evaluations'].astype(bool)]

        no_evals = DataFrame(no_evals['positions'].explode())
        no_evals['positions'] = get_clean_fens(no_evals['positions'])

        evals = df['evaluations'].explode().reset_index(drop=True)
        depths = df['eval_depths'].explode().reset_index(drop=True)
        positions = df['positions'].explode().reset_index(drop=True)
        positions = get_clean_fens(positions)

        sql = """SELECT fen, evaluation, eval_depth
                 FROM position_evals
                 WHERE fen IN %(positions)s;
                 """
        db_evaluations = run_remote_sql_query(sql,
                                              positions=tuple(positions.tolist() + no_evals['positions'].tolist()),  # noqa
                                              )
        positions_evaluated = db_evaluations['fen'].drop_duplicates()

        df = concat([positions, evals, depths], axis=1)

        if self.local_stockfish:

            local_evals = []

            counter = 0
            position_count = len(no_evals['positions'])

            for position in no_evals['positions'].tolist():
                if position in positions_evaluated.values:
                    # position will be dropped later if evaluation is None
                    evaluation = None
                else:
                    sf_eval = get_sf_evaluation(position + ' 0',
                                                stockfish_params.location,
                                                stockfish_params.depth)
                    if sf_eval is not None:
                        evaluation = sf_eval
                local_evals.append(evaluation)

                # progress bar stuff
                counter += 1

                current_progress = counter / position_count
                self.set_status_message(f'Analyzed :: '
                                        f'{counter} / {position_count}')
                self.set_progress_percentage(round(current_progress * 100, 2))

            self.set_status_message(f'Analyzed all {position_count} positions')
            self.set_progress_percentage(100)

            no_evals['evaluations'] = local_evals
            no_evals['eval_depths'] = stockfish_params.depth
            no_evals.dropna(inplace=True)

            df = concat([df, no_evals], axis=0, ignore_index=True)

        df = df[~df['positions'].isin(positions_evaluated)]

        df.rename(columns={'evaluations': 'evaluation',
                           'eval_depths': 'eval_depth',
                           'positions': 'fen'},
                  inplace=True)
        df['evaluation'] = to_numeric(df['evaluation'],
                                      errors='coerce')

        df.dropna(inplace=True)
        df = concat([df, db_evaluations], axis=0, ignore_index=True)

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
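The query above relies on the database driver expanding a bound Python tuple into a SQL IN list. A standalone sketch of the same pattern, assuming a plain psycopg2 connection in place of run_remote_sql_query (connection details are placeholders):

import pandas as pd
import psycopg2

def fetch_known_evals(fens):
    sql = """SELECT fen, evaluation, eval_depth
             FROM position_evals
             WHERE fen IN %(positions)s;
             """
    # psycopg2 adapts the tuple bound to %(positions)s into a parenthesised list,
    # so the FEN strings are never interpolated into the query text by hand.
    with psycopg2.connect(host='localhost', dbname='chess') as conn:
        return pd.read_sql(sql, conn, params={'positions': tuple(fens)})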
Example #10
class OverwriteAwareHiveQueryDataTask(WarehouseMixin, OverwriteOutputMixin,
                                      HiveQueryTask):
    """
    A generalized Data task whose output is a hive table populated from a hive query.
    """

    overwrite_target_partition = BoolParameter(
        significant=False,
        description=
        'Overwrite the target partition, deleting any existing data.  This will not impact other '
        'partitions.  Do not use with incrementally built partitions.',
        default=True)

    @property
    def insert_query(self):
        """The query builder that controls the structure and fields inserted into the new table.  This insert_query()
        is used as part of the query() function below."""
        raise NotImplementedError

    @property
    def hive_partition_task(self):
        """The HivePartitionTask that needs to be generated."""
        raise NotImplementedError

    @property
    def data_modification_sql_text(self):
        """Returns the appropriate SQL text for the chosen overwrite_target_partition strategy."""
        if self.overwrite_target_partition:
            return "OVERWRITE"
        else:
            return "INTO"

    def query(self):  # pragma: no cover
        full_insert_query = """
                    USE {database_name};
                    INSERT {into_or_overwrite} TABLE {table}
                    PARTITION ({partition.query_spec})
                    {insert_query};
                    """.format(
            database_name=hive_database_name(),
            into_or_overwrite=self.data_modification_sql_text,
            table=self.partition_task.hive_table_task.table,
            partition=self.partition,
            insert_query=self.insert_query.strip(),  # pylint: disable=no-member
        )
        return textwrap.dedent(full_insert_query)

    @property
    def partition_task(self):  # pragma: no cover
        """The task that creates the partition used by this job."""
        if not hasattr(self, '_partition_task'):
            self._partition_task = self.hive_partition_task
        return self._partition_task

    @property
    def partition(self):  # pragma: no cover
        """A shorthand for the partition information on the upstream partition task."""
        return self.partition_task.partition  # pylint: disable=no-member

    def output(self):  # pragma: no cover
        output_root = url_path_join(self.warehouse_path,
                                    self.partition_task.hive_table_task.table,
                                    self.partition.path_spec + '/')
        return get_target_from_url(output_root, marker=True)

    def on_success(self):  # pragma: no cover
        """Overload the success method to touch the _SUCCESS file.  Any class that uses a separate Marker file from the
        data file will need to override the base on_success() call to create this marker."""
        self.output().touch_marker()

    def run(self):
        self.remove_output_on_overwrite()
        return super(OverwriteAwareHiveQueryDataTask, self).run()

    def requires(self):  # pragma: no cover
        for requirement in super(OverwriteAwareHiveQueryDataTask,
                                 self).requires():
            yield requirement
        yield self.partition_task
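A hypothetical subclass sketch showing how the two abstract properties above are meant to be filled in; the task and partition class names, the date parameter, and the query text are assumptions, not part of the original.

class DailyEnrollmentSummaryTask(OverwriteAwareHiveQueryDataTask):

    date = DateParameter()

    @property
    def insert_query(self):
        # The SELECT body that produces the rows loaded into the partition.
        return """
            SELECT course_id, COUNT(DISTINCT user_id)
            FROM enrollment_events
            GROUP BY course_id
        """

    @property
    def hive_partition_task(self):
        # The HivePartitionTask (assumed to be defined elsewhere) that creates
        # the partition this query writes into.
        return DailyEnrollmentSummaryPartitionTask(
            date=self.date,
            warehouse_path=self.warehouse_path,
        )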
Example #11
class SuperJobTask(JobTask):

    local = BoolParameter(default=False)

    INPUT_PROTOCOL = RawValueProtocol
    INTERNAL_PROTOCOL = RawValueProtocol
    OUTPUT_PROTOCOL = RawValueProtocol

    @classmethod
    def get_param_values(cls, params, args, kwargs):

        for param_name, param_obj in params:
            if isinstance(param_obj, SuperParameter):
                if param_name not in kwargs:
                    kwargs[param_name] = param_obj.default

        return JobTask.get_param_values(params, args, kwargs)

    def __init__(self, *args, **kwargs):

        super(SuperJobTask, self).__init__(*args, **kwargs)

        self.map_task_num = 800
        self.red_task_num = 800

        self.map_memory = 2000
        self.red_memory = 2000

        self.priority = "NORMAL"

        self.options = dict()
        self.job_confs = dict()

        self.input_protocol = self.INPUT_PROTOCOL()
        self.internal_protocol = self.INTERNAL_PROTOCOL()
        self.output_protocol = self.OUTPUT_PROTOCOL()

    def _get_working_file_path(self, path):

        if not self.local:
            return path.rsplit('/', 1)[-1]
        else:
            return path

    def _output(self, output_path):

        if self.local:

            return LocalTarget(output_path)
        else:

            return HdfsTarget(output_path)

    def _input(self, input_path):

        if self.local:
            return LocalExternalData(input_path)
        else:
            return HadoopExternalData(input_path)

    def reader(self, input_stream):
        """Reader which uses python eval on each part of a tab separated string.
        Yields a tuple of python objects."""
        for input in input_stream:
            yield self.input_protocol.read(input)

    def writer(self, outputs, stdout, stderr=sys.stderr):
        for key, value in outputs:
            print >> stdout, self.output_protocol.write(key, value)

    def mapper(self, item):
        """Re-define to process an input item (usually a line of input data)


        Defaults to identity mapper that sends all lines to the same reducer"""
        key, value = item
        yield key, value

#    mapper = NotImplemented

    def _map_input(self, input_stream):
        """Iterate over input and call the mapper for each item.
           If the job has a parser defined, the return values from the parser will
           be passed as arguments to the mapper.

           If the input is coded output from a previous run, the arguments will be split into key and value."""
        for key, value in self.reader(input_stream):
            mapper_result = self.mapper((key, value))
            if mapper_result:
                for k, v in mapper_result:
                    yield k, v
        if self.final_mapper != NotImplemented:
            for k, v in self.final_mapper():
                yield k, v
        self._flush_batch_incr_counter()

    def _reduce_input(self, inputs, reducer, final=NotImplemented):
        """Iterate over input, collect values with the same key, and call the reducer for each uniqe key."""
        for key, values in groupby(inputs, itemgetter(0)):
            for output in reducer(key, (v[1] for v in values)):
                yield output
        if final != NotImplemented:
            for output in final():
                yield output
        self._flush_batch_incr_counter()

    def _run_mapper(self, stdin=sys.stdin, stdout=sys.stdout):
        """Run the mapper on the hadoop node."""
        self.init_hadoop()
        self.init_mapper()
        outputs = self._map_input((line[:-1] for line in stdin))
        if self.reducer == NotImplemented:
            self.writer(outputs, stdout)
        else:
            self.internal_writer(outputs, stdout)

    def _run_reducer(self, stdin=sys.stdin, stdout=sys.stdout):
        """Run the reducer on the hadoop node."""
        self.init_hadoop()
        self.init_reducer()
        if self.mapper == NotImplemented:
            outputs = self._reduce_input(
                self.reader((line[:-1] for line in stdin)), self.reducer,
                self.final_reducer)
        else:
            outputs = self._reduce_input(
                self.internal_reader((line[:-1] for line in stdin)),
                self.reducer, self.final_reducer)
        self.writer(outputs, stdout)

    def _run_combiner(self, stdin=sys.stdin, stdout=sys.stdout):
        self.init_hadoop()
        self.init_combiner()
        if self.mapper == NotImplemented:
            outputs = self._reduce_input(
                self.reader((line[:-1] for line in stdin)), self.combiner,
                self.final_combiner)
        else:
            outputs = self._reduce_input(
                self.internal_reader((line[:-1] for line in stdin)),
                self.combiner, self.final_combiner)
        self.internal_writer(outputs, stdout)

    def internal_reader(self, input_stream):
        """Reader which uses python eval on each part of a tab separated string.
        Yields a tuple of python objects."""
        for input in input_stream:
            yield self.internal_protocol.read(input)

    def internal_writer(self, outputs, stdout):
        """Writer which outputs the python repr for each item"""
        for key, value in outputs:
            print >> stdout, self.internal_protocol.write(key, value)

    def job_runner(self):
        # We recommend that you define a subclass, override this method and set up your own config
        """ Get the MapReduce runner for this job

        If all outputs are HdfsTargets, the DefaultHadoopJobRunner will be used. Otherwise, the LocalJobRunner which streams all data through the local machine will be used (great for testing).
        """
        if self.local:
            return LocalJobRunner()
        else:
            return SuperHadoopJobRunner(self.options)

    def extra_archives(self):

        return []

    def jobconfs(self):

        jcs = super(SuperJobTask, self).jobconfs()

        idx = [
            i for i, conf in enumerate(jcs)
            if conf.startswith("mapred.job.name=")
        ]

        if idx:
            task_name = self.task_id.replace('(', "").replace(")", "")
            idx = idx[0]
            jcs[idx] = 'mapred.job.name=\"%s\"' % task_name

        custom_setting = set([
            'mapred.map.tasks', "mapred.job.map.capacity",
            "mapred.reduce.tasks", "mapred.job.reduce.capacity",
            "mapred.job.priority", "stream.memory.limit"
        ])

        jcs = [
            conf for conf in jcs if conf.split('=', 1)[0] not in custom_setting
        ]

        jcs.append("mapred.map.tasks=%s" % self.map_task_num)
        jcs.append("mapred.job.map.capacity=%s" % self.map_task_num)
        if self.reducer != NotImplemented:
            jcs.append("mapred.reduce.tasks=%s" % self.red_task_num)
            jcs.append("mapred.job.reduce.capacity=%s" % self.red_task_num)
        else:
            jcs.append("mapred.reduce.tasks=%s" % 0)
            jcs.append("mapred.job.reduce.capacity=%s" % 0)

        jcs.append("mapred.job.priority=%s" % self.priority)
        jcs.append("mapred.map.memory.limit=%s" % self.map_memory)
        jcs.append("mapred.reduce.memory.limit=%s" % self.red_memory)

        for k, v in self.job_confs.iteritems():
            jcs.append("{0}={1}".format(k, v))

        return jcs

    def add_second_sort_support(self, key_field_separator="."):

        self.options[
            "partitioner"] = "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner"
        self.job_confs["stream.num.map.output.key.fields"] = "2"
        self.job_confs["mapred.text.key.partitioner.options"] = "-k1,1"
        self.job_confs["map.output.key.field.separator"] = key_field_separator
        self.job_confs[
            "mapred.output.key.comparator.class"] = "org.apache.hadoop.mapred.lib.KeyFieldBasedComparator"
        self.job_confs["mapred.text.key.comparator.options"] = "-k1,1 -k2,2n"

    def add_newsecond_sort_support(self, key_field_separator="."):

        self.options[
            "partitioner"] = "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner"
        self.job_confs["stream.map.output.field.separator"] = "\t"
        self.job_confs["stream.num.map.output.key.fields"] = "3"

        self.job_confs["map.output.key.field.separator"] = "\t"
        self.job_confs["mapred.text.key.partitioner.options"] = "-k1,1"

        #self.job_confs["mapred.output.key.comparator.class"] = "org.apache.hadoop.mapred.lib.KeyFieldBasedComparator"
        #self.job_confs["mapred.text.key.comparator.options"] = "-k1,1 -k2,2 -k3,3"

    def add_compress_support(self):

        self.job_confs["mapred.compress.map.output"] = "true"
        self.job_confs[
            "mapred.map.output.compression.codec"] = "org.apache.hadoop.io.compress.QuickLzCodec"
        #self.job_confs["mapred.output.compress"] = "true"
        #self.job_confs["mapred.output.compression.code"] = "org.apache.hadoop.io.compress.LzmaCodec"

    def add_combined_input_support(self):

        self.options[
            "inputformat"] = "org.apache.hadoop.mapred.CombineTextInputFormat"

    def add_multiple_output_support(self):

        self.options[
            "outputformat"] = "org.apache.hadoop.mapred.lib.SuffixMultipleTextOutputFormat"

    def set_min_split_size(self, splitsize):
        self.job_confs['mapred.min.split.size'] = splitsize

    def set_max_split_size(self, splitsize):
        self.job_confs['mapred.max.split.size'] = splitsize

    def set_memory_size(self, memory_size):
        self.memory = memory_size
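A hypothetical word-count subclass sketch showing how the pieces above are intended to fit together: tune task counts and helpers in __init__, then supply mapper and reducer. It assumes the RawValueProtocol reader yields (key, line) pairs, that the output protocol can serialise the integer counts, and that the input/output paths are placeholders.

class WordCountTask(SuperJobTask):

    def __init__(self, *args, **kwargs):
        super(WordCountTask, self).__init__(*args, **kwargs)
        self.map_task_num = 100
        self.red_task_num = 10
        self.add_compress_support()              # compress intermediate map output

    def requires(self):
        return self._input('/data/raw/corpus')   # placeholder path

    def output(self):
        return self._output('/data/derived/word_count')

    def mapper(self, item):
        # item is the (key, line) pair produced by reader()
        _, line = item
        for word in line.split():
            yield word, 1

    def reducer(self, key, values):
        yield key, sum(values)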
Example #12
class TrainTheCannonBase(TheCannonMixin):
    """
    A base task for training The Cannon.

    :param label_names:
        A list of label names.
    
    :param order: (optional)
        The polynomial order to use for this model (default: 2).    

    :param regularization: (optional)
        The strength of L1-regularization to apply during training.
    
    :param threads: (optional)
        The number of threads to use (default: 1).
    
    :param plot: (optional)
        A boolean flag to indicate whether to produce post-training quality plots.
    """

    regularization = FloatParameter(default=0.0)
    threads = IntParameter(default=1, significant=False)
    plot = BoolParameter(default=True, significant=False)

    def run(self):
        """ Execute this task. """

        # Load training set labels and spectra.
        labels, dispersion, training_set_flux, training_set_ivar = read_training_set(
            self.input().path, )

        # Set the vectorizer.
        # We sort the label names so that luigi doesn't re-train models if we alter the order.
        vectorizer = tc.vectorizer.PolynomialVectorizer(
            sorted(self.label_names), self.order)

        # Initiate model.
        model = tc.model.CannonModel(labels,
                                     training_set_flux,
                                     training_set_ivar,
                                     vectorizer=vectorizer,
                                     dispersion=dispersion,
                                     regularization=self.regularization)

        log.info(f"Training The Cannon model {model}")
        model.train(threads=self.threads)

        output_path = self.output().path
        log.info(f"Writing The Cannon model {model} to disk {output_path}")
        model.write(output_path)

        if self.plot:
            # Plot zeroth and first order coefficients.
            fig = plot.theta(
                model,
                indices=np.arange(1 + len(model.vectorizer.label_names)),
                normalize=False)
            fig.savefig(f"{self.task_id}-theta.png")

            # Plot scatter.
            fig = plot.scatter(model)
            fig.savefig(f"{self.task_id}-scatter.png")

            # Plot one-to-one.
            test_labels, test_cov, test_meta = model.test(
                training_set_flux,
                training_set_ivar,
                initial_labels=model.training_set_labels)
            fig = plot.one_to_one(model, test_labels, cov=test_cov)
            fig.savefig(f"{self.task_id}-one-to-one.png")

    def output(self):
        """ The output of this task. """
        return LocalTarget(
            os.path.join(self.output_base_dir, f"{self.task_id}.pkl"))
Example #13
class GetEvals(Task):

    local_stockfish = BoolParameter()
    columns = ListParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-game-evals-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle, to_numeric, concat, DataFrame

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:

            def complete(self):
                return True

            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)

            return

        stockfish_params = stockfish_cfg()

        df = df[['evaluations', 'eval_depths', 'positions']]

        positions_evaluated = query_for_column('position_evals', 'fen')

        # explode the two different list-likes separately, then concat
        no_evals = df[~df['evaluations'].astype(bool)]
        df = df[df['evaluations'].astype(bool)]

        evals = df['evaluations'].explode().reset_index(drop=True)
        depths = df['eval_depths'].explode().reset_index(drop=True)
        positions = df['positions'].explode().reset_index(drop=True)
        positions = positions.str.split().str[:-1].str.join(' ')

        df = concat([positions, evals, depths], axis=1)

        if self.local_stockfish:
            no_evals = DataFrame(no_evals['positions'].explode())
            no_evals['positions'] = (no_evals['positions'].str.split()
                                                          .str[:-1]
                                                          .str.join(' '))

            local_evals = []

            counter = 0
            position_count = len(no_evals['positions'])

            for position in no_evals['positions'].tolist():
                if position in positions_evaluated.values:
                    evaluation = None
                else:
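                    # If stockfish returns a falsy result, fall back to the
                    # evaluation from the previous loop iteration.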
                    evaluation = (get_sf_evaluation(position + ' 0',
                                                    stockfish_params.location,
                                                    stockfish_params.depth)
                                  or evaluation)
                local_evals.append(evaluation)

                # progress bar stuff
                counter += 1

                current_progress = counter / position_count
                self.set_status_message(f'Analyzed :: '
                                        f'{counter} / {position_count}')
                self.set_progress_percentage(round(current_progress * 100, 2))

            self.set_status_message(f'Analyzed all {position_count} positions')
            self.set_progress_percentage(100)

            no_evals['evaluations'] = local_evals
            no_evals['eval_depths'] = stockfish_params.depth
            no_evals.dropna(inplace=True)

            df = concat([df, no_evals], axis=0, ignore_index=True)

        df = df[~df['positions'].isin(positions_evaluated)]

        df.rename(columns={'evaluations': 'evaluation',
                           'eval_depths': 'eval_depth',
                           'positions': 'fen'},
                  inplace=True)
        df['evaluation'] = to_numeric(df['evaluation'],
                                      errors='coerce')

        df.dropna(inplace=True)

        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
Example #14
class FetchLichessApiPGN(Task):

    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-raw-games-'
                         f'{self.player}-pgn.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        import lichess.api
        from lichess.format import PYCHESS
        from pandas import DataFrame, read_pickle
        from calendar import timegm
        from pipeline_import.visitors import EvalsVisitor, ClocksVisitor
        from pipeline_import.visitors import QueenExchangeVisitor
        from pipeline_import.visitors import CastlingVisitor, PositionsVisitor
        from pipeline_import.visitors import PromotionsVisitor, MaterialVisitor

        self.output().makedirs()

        with self.input().open('r') as f:
            json = read_pickle(f, compression=None)
            game_count = len(json)

        if self.single_day:
            unix_time_until = timegm((self.since
                                      + timedelta(days=1)).timetuple())
        else:
            unix_time_until = timegm(datetime.today().date().timetuple())
        self.until = int(1000 * unix_time_until)

        unix_time_since = timegm(self.since.timetuple())
        self.since_unix = int(1000 * unix_time_since)

        token = lichess_token().token

        games = lichess.api.user_games(self.player,
                                       since=self.since_unix,
                                       until=self.until,
                                       perfType=self.perf_type,
                                       auth=token,
                                       clocks='true',
                                       evals='true',
                                       opening='true',
                                       format=PYCHESS)

        visitors = [EvalsVisitor,
                    ClocksVisitor,
                    QueenExchangeVisitor,
                    CastlingVisitor,
                    PromotionsVisitor,
                    PositionsVisitor,
                    MaterialVisitor,
                    ]

        visitor_stats = {'clocks': 'clocks',
                         'evaluations': 'evals',
                         'eval_depths': 'eval_depths',
                         'queen_exchange': 'queen_exchange',
                         'castling_sides': 'castling',
                         'has_promotion': 'has_promotion',
                         'promotion_count_white': 'promotion_count_white',
                         'promotion_count_black': 'promotion_count_black',
                         'promotions_white': 'promotions_white',
                         'promotions_black': 'promotions_black',
                         'positions': 'positions',
                         'black_berserked': 'black_berserked',
                         'white_berserked': 'white_berserked',
                         'material_by_move': 'material_by_move',
                         }

        header_infos = []

        counter = 0

        for game in games:
            game_infos = {x: y for x, y in game.headers.items()}
            if game.headers['Variant'] == 'From Position':
                game.headers['Variant'] = 'Standard'
            for visitor in visitors:
                game.accept(visitor(game))
            for k, v in visitor_stats.items():
                game_infos[k] = getattr(game, v)
            game_infos['moves'] = [x.san() for x in game.mainline()]
            header_infos.append(game_infos)

            # progress bar stuff
            counter += 1

            current = f'{game_infos["UTCDate"]} {game_infos["UTCTime"]}'

            current_progress = counter / game_count
            self.set_status_message(f'Parsed until {current} :: '
                                    f'{counter} / {game_count}')
            self.set_progress_percentage(round(current_progress * 100, 2))

        df = DataFrame(header_infos)

        self.set_status_message('Parsed all games')
        self.set_progress_percentage(100)

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
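A minimal sketch of a custom visitor in the same style as the pipeline_import visitors used above: it receives the game in its constructor, walks the mainline through python-chess's visitor protocol, and exposes its statistic as an attribute on the game so it could be collected via visitor_stats. The class and attribute names are illustrative assumptions.

import chess.pgn

class MoveCountVisitor(chess.pgn.BaseVisitor):

    def __init__(self, game):
        self.game = game
        self.move_count = 0

    def visit_move(self, board, move):
        # Called once for every move in the game's mainline.
        self.move_count += 1

    def result(self):
        # Mirror the visitors above by attaching the statistic to the game itself.
        self.game.move_count = self.move_count
        return self.move_count

# Usage, matching the loop in run(): game.accept(MoveCountVisitor(game))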