class worker(Config):
    """Configuration parameters for the worker.

    Several parameters also declare a ``config_path`` pointing at an option
    in the ``core`` section; per the note below that spelling is deprecated.
    """
    # NOTE: `section.config-variable` in the config_path argument is deprecated in favor of `worker.config_variable`

    ping_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-ping-interval'))
    keep_alive = BoolParameter(default=False,
                               config_path=dict(section='core', name='worker-keep-alive'))
    count_uniques = BoolParameter(default=False,
                                  config_path=dict(section='core', name='worker-count-uniques'),
                                  description='worker-count-uniques means that we will keep a '
                                  'worker alive only if it has a unique pending task, as '
                                  'well as having keep-alive true')
    wait_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)

    max_reschedules = IntParameter(default=1,
                                   config_path=dict(section='core', name='worker-max-reschedules'))
    timeout = IntParameter(default=0,
                           config_path=dict(section='core', name='worker-timeout'))
    task_limit = IntParameter(default=None,
                              config_path=dict(section='core', name='worker-task-limit'))
    retry_external_tasks = BoolParameter(default=False,
                                         config_path=dict(section='core', name='retry-external-tasks'),
                                         description='If true, incomplete external tasks will be '
                                         'retested for completion while Luigi is running.')
    # The two literals below are concatenated by the parser, so the first one
    # must end with a space to keep the help text readable.
    no_install_shutdown_handler = BoolParameter(default=False,
                                                description='If true, the SIGUSR1 shutdown handler will '
                                                'NOT be installed on the worker')
class worker(Config):
    """Configuration parameters for the worker.

    Every parameter here is mapped onto an option of the ``core`` section
    through its ``config_path``.
    """

    # NOTE(review): ping_interval is read from the `retry-delay` option, not
    # from an option named after the ping interval -- confirm this is intended.
    ping_interval = FloatParameter(
        default=1.0,
        config_path={'section': 'core', 'name': 'retry-delay'},
    )
    keep_alive = BoolParameter(
        default=False,
        config_path={'section': 'core', 'name': 'worker-keep-alive'},
    )
    count_uniques = BoolParameter(
        default=False,
        config_path={'section': 'core', 'name': 'worker-count-uniques'},
        description=(
            'worker-count-uniques means that we will keep a '
            'worker alive only if it has a unique pending task, as '
            'well as having keep-alive true'
        ),
    )
    wait_interval = IntParameter(
        default=1,
        config_path={'section': 'core', 'name': 'worker-wait-interval'},
    )
    max_reschedules = IntParameter(
        default=1,
        config_path={'section': 'core', 'name': 'worker-max-reschedules'},
    )
    timeout = IntParameter(
        default=0,
        config_path={'section': 'core', 'name': 'worker-timeout'},
    )
    task_limit = IntParameter(
        default=None,
        config_path={'section': 'core', 'name': 'worker-task-limit'},
    )
    retry_external_tasks = BoolParameter(
        default=False,
        config_path={'section': 'core', 'name': 'retry-external-tasks'},
        description=(
            'If true, incomplete external tasks will be '
            'retested for completion while Luigi is running.'
        ),
    )
class worker(Config):
    """Configuration parameters for the worker.

    Several parameters also declare a ``config_path`` pointing at an option
    in the ``core`` section; per the note below that spelling is deprecated.
    """
    # NOTE: `section.config-variable` in the config_path argument is deprecated in favor of `worker.config_variable`

    ping_interval = FloatParameter(
        default=1.0,
        config_path=dict(section='core', name='worker-ping-interval'))
    keep_alive = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-keep-alive'))
    count_uniques = BoolParameter(
        default=False,
        config_path=dict(section='core', name='worker-count-uniques'),
        description='worker-count-uniques means that we will keep a '
                    'worker alive only if it has a unique pending task, as '
                    'well as having keep-alive true')
    count_last_scheduled = BoolParameter(
        default=False,
        description='Keep a worker alive only if there are '
                    'pending tasks which it was the last to '
                    'schedule.')
    wait_interval = FloatParameter(
        default=1.0,
        config_path=dict(section='core', name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)

    max_reschedules = IntParameter(
        default=1,
        config_path=dict(section='core', name='worker-max-reschedules'))
    timeout = IntParameter(
        default=0,
        config_path=dict(section='core', name='worker-timeout'))
    task_limit = IntParameter(
        default=None,
        config_path=dict(section='core', name='worker-task-limit'))
    retry_external_tasks = BoolParameter(
        default=False,
        config_path=dict(section='core', name='retry-external-tasks'),
        description='If true, incomplete external tasks will be '
                    'retested for completion while Luigi is running.')
    # Adjacent string literals are concatenated verbatim, so each fragment
    # that is followed by another one has to end with a space.
    send_failure_email = BoolParameter(
        default=True,
        description='If true, send e-mails directly from the worker '
                    'on failure')
    no_install_shutdown_handler = BoolParameter(
        default=False,
        description='If true, the SIGUSR1 shutdown handler will '
                    'NOT be installed on the worker')
    check_unfulfilled_deps = BoolParameter(
        default=True,
        description='If true, check for completeness of '
                    'dependencies before running a task')
class worker(Config):
    """Configuration parameters for the worker.

    Several parameters also declare a ``config_path`` pointing at an option
    in the ``core`` section; per the note below that spelling is deprecated.
    """
    # NOTE: `section.config-variable` in the config_path argument is deprecated in favor of `worker.config_variable`

    ping_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-ping-interval'))
    keep_alive = BoolParameter(default=False,
                               config_path=dict(section='core', name='worker-keep-alive'))
    count_uniques = BoolParameter(default=False,
                                  config_path=dict(section='core', name='worker-count-uniques'),
                                  description='worker-count-uniques means that we will keep a '
                                  'worker alive only if it has a unique pending task, as '
                                  'well as having keep-alive true')
    count_last_scheduled = BoolParameter(default=False,
                                         description='Keep a worker alive only if there are '
                                         'pending tasks which it was the last to '
                                         'schedule.')
    wait_interval = FloatParameter(default=1.0,
                                   config_path=dict(section='core', name='worker-wait-interval'))
    wait_jitter = FloatParameter(default=5.0)

    max_reschedules = IntParameter(default=1,
                                   config_path=dict(section='core', name='worker-max-reschedules'))
    timeout = IntParameter(default=0,
                           config_path=dict(section='core', name='worker-timeout'))
    task_limit = IntParameter(default=None,
                              config_path=dict(section='core', name='worker-task-limit'))
    retry_external_tasks = BoolParameter(default=False,
                                         config_path=dict(section='core', name='retry-external-tasks'),
                                         description='If true, incomplete external tasks will be '
                                         'retested for completion while Luigi is running.')
    # Adjacent string literals are concatenated verbatim, so each fragment
    # that is followed by another one has to end with a space.
    send_failure_email = BoolParameter(default=True,
                                       description='If true, send e-mails directly from the worker '
                                       'on failure')
    no_install_shutdown_handler = BoolParameter(default=False,
                                                description='If true, the SIGUSR1 shutdown handler will '
                                                'NOT be installed on the worker')
    check_unfulfilled_deps = BoolParameter(default=True,
                                           description='If true, check for completeness of '
                                           'dependencies before running a task')
    force_multiprocessing = BoolParameter(default=False,
                                          description='If true, use multiprocessing also when '
                                          'running with 1 worker')
    task_process_context = OptionalParameter(default=None,
                                             description='If set to a fully qualified class name, the class will '
                                             'be instantiated with a TaskProcess as its constructor parameter and '
                                             'applied as a context manager around its run() call, so this can be '
                                             'used for obtaining high level customizable monitoring or logging of '
                                             'each individual Task run.')
class FetchLichessApiJSON(Task):
    """Fetch a player's games from the Lichess API as JSON and pickle them.

    The games are requested without evals, clocks or moves, flattened into a
    DataFrame with ``json_normalize`` and written to the task's local target.
    """

    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    # The default is computed once, when the class body is executed.
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        """Return the local pickle target named after since/player/perf_type."""
        import os

        target_path = os.path.expanduser(
            f'~/Temp/luigi/{self.since}-raw-games-'
            f'{self.player}-{self.perf_type}-json.pckl'
        )
        return LocalTarget(target_path, format=Nop)

    def run(self):
        """Query the API for the requested window and pickle the result."""
        import lichess.api
        from lichess.format import JSON
        from pandas import json_normalize
        from calendar import timegm

        self.output().makedirs()

        # The window ends one day after `since` for single-day runs,
        # otherwise at today's date.
        if self.single_day:
            window_end = self.since + timedelta(days=1)
        else:
            window_end = datetime.today().date()

        # Both bounds are passed to the API as epoch seconds times 1000.
        self.until = int(1000 * timegm(window_end.timetuple()))
        self.since_unix = int(1000 * timegm(self.since.timetuple()))

        api_token = lichess_token().token

        fetched_games = lichess.api.user_games(
            self.player,
            since=self.since_unix,
            until=self.until,
            perfType=self.perf_type,
            auth=api_token,
            evals='false',
            clocks='false',
            moves='false',
            format=JSON,
        )

        games_frame = json_normalize(list(fetched_games), sep='_')

        with self.output().temporary_path() as temp_output_path:
            games_frame.to_pickle(temp_output_path, compression=None)
class CleanedHeadlines(Task):
    '''
    This class loads the data from the AWS instance if it exists and
    preprocesses the data for analysis

    The class returns a dataframe with pre-processed reviews that can be
    loaded in the Topic Modeling class for additional analysis

    :input : s3 Path to Article Headlines
    :output : Creates a Local Parquet File with the preprocessed data
    '''

    subset = BoolParameter(default=True)

    requires = Requires()
    article_headlines = Requirement(ArticleHeadlines)

    # Evaluated once when the class body runs; the suffix is month_day_year
    # without zero padding.
    date = datetime.datetime.now()
    date_suffix = str(date.month) + '_' + str(date.day) + '_' + str(date.year)

    output = TargetOutput(
        target_class=ParquetTarget,
        ext='-' + date_suffix,
        glob="*.parquet",
    )

    def run(self):
        '''Read the upstream headlines and write them out gzip-compressed.'''
        dsk = self.input()['article_headlines'].read_dask(
            dtype={
                "publish_date": "int32",
                "headline_text": "str",
                "headline_id": "str"
            },
            storage_options=dict(requester_pays=True),
        )

        # A salted hash over every headline_id used to be computed here, but
        # its result was never used; it only forced an extra pass over the
        # column, so it has been dropped.
        self.output().write_dask(dsk, compression="gzip")

    def print_results(self):
        '''Print the computed contents of this task's output.'''
        print(self.output().read_dask().compute())
class EstimateStellarLabels(ThePayneMixin):

    """
    Use a pre-trained neural network to estimate stellar labels. This should be sub-classed to inherit properties from the type of spectra to be analysed.

    :param training_set_path:
        The path where the training set spectra and labels are stored.
        This should be a binary pickle file that contains a dictionary with the following keys:

        - wavelength: an array of shape (P, ) where P is the number of pixels
        - spectra: an array of shape (N, P) where N is the number of spectra and P is the number of pixels
        - labels: an array of shape (L, P) where L is the number of labels and P is the number of pixels
        - label_names: a tuple of length L that contains the names of the labels

    :param n_steps: (optional)
        The number of steps to train the network for (default 100000).

    :param n_neurons: (optional)
        The number of neurons to use in the hidden layer (default: 300).

    :param weight_decay: (optional)
        The weight decay to use during training (default: 0)

    :param learning_rate: (optional)
        The learning rate to use during training (default: 0.001).

    :param analyze_individual_visits: (optional)
        If true, keep every row of the observation; otherwise only the first
        row is analysed (default: False).
    """

    max_batch_size = 10_000
    analyze_individual_visits = BoolParameter(default=False)

    def prepare_observation(self):
        """
        Prepare the observations for analysis.

        :returns:
            A four-length tuple of the observation, the continuum, the
            continuum-normalized flux, and the matching inverse variance.
        """

        # Build the slice object directly. Unpacking ``None`` into ``slice()``
        # raises a TypeError, so the "all visits" case uses ``slice(None)``,
        # which selects every row when used as an index.
        if self.analyze_individual_visits:
            visit_slice = slice(None)
        else:
            visit_slice = slice(0, 1)

        observation = Spectrum1D.read(
            self.input()["observation"].path,
            data_slice=visit_slice
        )

        if "continuum" in self.input():
            continuum_path = self.input()["continuum"]["continuum"].path

            # NOTE(review): this loop only exits once the shapes agree; if a
            # re-run never produces a matching continuum it will not terminate.
            while True:
                # NOTE(review): pickle.load executes arbitrary code from the
                # file; this assumes the continuum file is produced locally.
                with open(continuum_path, "rb") as fp:
                    continuum = pickle.load(fp)

                # If there is a shape mis-match between the observations and the continuum
                # then it likely means that there have been observations taken since the
                # continuum task was run. In this case we need to re-run the continuum
                # normalisation.
                continuum = continuum[visit_slice]

                n_observed = observation.flux.shape[0]
                n_continuum = continuum.shape[0]

                # TODO: Consider if this is what we want to be doing..
                if n_observed == n_continuum:
                    break

                if n_observed > n_continuum:
                    log.warn(f"Re-doing continuum for task {self} at runtime")
                else:
                    log.warn(f"More continuum than observations in {self}?!")

                # Discard the stale file and regenerate it before re-checking.
                os.unlink(continuum_path)
                self.requires()["continuum"].run()
        else:
            # No continuum input: dividing/multiplying by 1 leaves values as-is.
            continuum = 1

        normalized_flux = observation.flux.value / continuum
        normalized_ivar = continuum * observation.uncertainty.array * continuum

        return (observation, continuum, normalized_flux, normalized_ivar)

    @slurmify
    def run(self):
        """
        Execute this task.
        """

        # Load the model.
        log.info(f"Loading model for {self}")
        state = testing.load_state(self.input()["model"].path)

        # We can run this in batch mode.
        label_names = state["label_names"]
        tqdm_kwds = dict(total=self.get_batch_size(), desc="The Payne")
        for init, task in tqdm(timer(self.get_batch_tasks()), **tqdm_kwds):
            if task.complete():
                continue

            spectrum, continuum, normalized_flux, normalized_ivar = task.prepare_observation()

            p_opt, p_cov, model_flux, meta = testing.test(
                spectrum.wavelength.value,
                normalized_flux,
                normalized_ivar,
                **state
            )

            results = dict(zip(label_names, p_opt.T))
            # Note: we count the number of label names here in case we are sometimes using
            # radial velocity determination or not, before we add in the SNR.
            L = len(results)

            # Add in uncertainties on parameters, taken from the diagonal of
            # each covariance matrix.
            results.update(dict(zip(
                (f"u_{ln}" for ln in label_names),
                np.sqrt(p_cov[:, np.arange(L), np.arange(L)].T)
            )))

            # Add in SNR values for convenience.
            results.update(snr=spectrum.meta["snr"])

            # Write AstraSource object.
            if "AstraSource" in task.output():
                task.output()["AstraSource"].write(
                    spectrum=spectrum,
                    normalized_flux=normalized_flux,
                    normalized_ivar=normalized_ivar,
                    continuum=continuum,
                    model_flux=model_flux,
                    # TODO: Project uncertainties to flux space.
                    model_ivar=None,
                    results_table=Table(results)
                )

            # Write output to database.
            if "database" in task.output():
                task.output()["database"].write(results)

            # Trigger this event as complete, and record task duration.
            task.trigger_event_processing_time(time() - init, cascade=True)

        return None

    def output(self):
        """
        The output of this task.

        In batch mode this is a generator over the outputs of each batch
        task; otherwise it is a dictionary of targets.
        """
        if self.is_batch_mode:
            return (task.output() for task in self.get_batch_tasks())

        return dict(
            database=DatabaseTarget(astradb.ThePayne, self),
            #AstraSource=AstraSource(self)
        )
class FetchLichessApiPGN(Task):
    """Fetch a player's games from the Lichess API as PGN and pickle them.

    Each game is passed through ``parse_headers`` with a fixed list of
    visitor classes; the resulting records are stored as a DataFrame.
    """

    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    # The default is computed once, when the class body is executed.
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        """Return the local pickle target named after since/player/perf_type."""
        import os

        target_path = os.path.expanduser(
            f'~/Temp/luigi/{self.since}-raw-games-'
            f'{self.player}-{self.perf_type}-pgn.pckl'
        )
        return LocalTarget(target_path, format=Nop)

    def run(self):
        """Download the games, parse them and report progress along the way."""
        import lichess.api
        from lichess.format import PYCHESS
        from pandas import DataFrame, read_pickle
        from calendar import timegm
        from pipeline_import.visitors import EvalsVisitor, ClocksVisitor
        from pipeline_import.visitors import QueenExchangeVisitor
        from pipeline_import.visitors import CastlingVisitor, PositionsVisitor
        from pipeline_import.visitors import PromotionsVisitor, MaterialVisitor

        self.output().makedirs()

        # The upstream pickle is only used for its length, which serves as
        # the denominator of the progress percentage.
        with self.input().open('r') as f:
            json_games = read_pickle(f, compression=None)
        game_count = len(json_games)

        # The window ends one day after `since` for single-day runs,
        # otherwise at today's date.
        if self.single_day:
            window_end = self.since + timedelta(days=1)
        else:
            window_end = datetime.today().date()

        self.until = int(1000 * timegm(window_end.timetuple()))
        self.since_unix = int(1000 * timegm(self.since.timetuple()))

        api_token = lichess_token().token

        games = lichess.api.user_games(
            self.player,
            since=self.since_unix,
            until=self.until,
            perfType=self.perf_type,
            auth=api_token,
            clocks='true',
            evals='true',
            opening='true',
            format=PYCHESS,
        )

        visitors = [
            EvalsVisitor,
            ClocksVisitor,
            QueenExchangeVisitor,
            CastlingVisitor,
            PromotionsVisitor,
            PositionsVisitor,
            MaterialVisitor,
        ]

        header_infos = []
        for counter, game in enumerate(games, start=1):
            game_infos = parse_headers(game, visitors)
            header_infos.append(game_infos)

            # progress bar stuff
            current = f'{game_infos["UTCDate"]} {game_infos["UTCTime"]}'
            current_progress = counter / game_count
            self.set_status_message(f'Parsed until {current} :: '
                                    f'{counter} / {game_count}')
            self.set_progress_percentage(round(current_progress * 100, 2))

        parsed_frame = DataFrame(header_infos)

        self.set_status_message('Parsed all games')
        self.set_progress_percentage(100)

        with self.output().temporary_path() as temp_output_path:
            parsed_frame.to_pickle(temp_output_path, compression=None)
class GetEvals(Task):
    """Collect position evaluations for the games produced upstream.

    Evaluations already present in the input are combined with those found
    in the ``position_evals`` table and, when ``local_stockfish`` is set,
    with evaluations computed through ``get_sf_evaluation``.
    """

    local_stockfish = BoolParameter()

    def output(self):
        """Return the local pickle target named after since/player/perf_type."""
        import os

        file_location = (f'~/Temp/luigi/{self.since}-game-evals-'
                         f'{self.player}-{self.perf_type}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        """Build the evaluation table and pickle it to the output target."""
        from pandas import read_pickle, to_numeric, concat, DataFrame

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # Nothing to evaluate: pass the empty frame through unchanged.
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)

            return

        stockfish_params = stockfish_cfg()

        df = df[['evaluations', 'eval_depths', 'positions']]

        # explode the two different list-likes separately, then concat
        no_evals = df[~df['evaluations'].astype(bool)]
        df = df[df['evaluations'].astype(bool)]

        no_evals = DataFrame(no_evals['positions'].explode())
        no_evals['positions'] = get_clean_fens(no_evals['positions'])

        evals = df['evaluations'].explode().reset_index(drop=True)
        depths = df['eval_depths'].explode().reset_index(drop=True)
        positions = df['positions'].explode().reset_index(drop=True)
        positions = get_clean_fens(positions)

        sql = """SELECT fen, evaluation, eval_depth
                 FROM position_evals
                 WHERE fen IN %(positions)s;
              """
        db_evaluations = run_remote_sql_query(
            sql,
            positions=tuple(positions.tolist()
                            + no_evals['positions'].tolist()),
        )
        positions_evaluated = db_evaluations['fen'].drop_duplicates()

        df = concat([positions, evals, depths], axis=1)

        if self.local_stockfish:
            # Build the lookup once instead of scanning the array for every
            # position.
            already_evaluated = set(positions_evaluated)

            local_evals = []
            counter = 0
            position_count = len(no_evals['positions'])

            # When the engine returns None the previous evaluation is carried
            # forward. Starting from None keeps the first iteration from
            # reading an unassigned name; a leading None row is dropped by
            # the dropna() below.
            evaluation = None

            for position in no_evals['positions'].tolist():
                if position in already_evaluated:
                    # position will be dropped later if evaluation is None
                    evaluation = None
                else:
                    sf_eval = get_sf_evaluation(position + ' 0',
                                                stockfish_params.location,
                                                stockfish_params.depth)
                    if sf_eval is not None:
                        evaluation = sf_eval

                local_evals.append(evaluation)

                # progress bar stuff
                counter += 1

                current_progress = counter / position_count
                self.set_status_message(f'Analyzed :: '
                                        f'{counter} / {position_count}')
                self.set_progress_percentage(round(current_progress * 100, 2))

            self.set_status_message(f'Analyzed all {position_count} positions')
            self.set_progress_percentage(100)

            no_evals['evaluations'] = local_evals
            no_evals['eval_depths'] = stockfish_params.depth
            no_evals.dropna(inplace=True)

            df = concat([df, no_evals], axis=0, ignore_index=True)

        # Rows already covered by the database are replaced by the database
        # copy appended at the end.
        df = df[~df['positions'].isin(positions_evaluated)]

        df.rename(columns={'evaluations': 'evaluation',
                           'eval_depths': 'eval_depth',
                           'positions': 'fen'},
                  inplace=True)
        df['evaluation'] = to_numeric(df['evaluation'],
                                      errors='coerce')

        df.dropna(inplace=True)
        df = concat([df, db_evaluations], axis=0, ignore_index=True)

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
class OverwriteAwareHiveQueryDataTask(WarehouseMixin, OverwriteOutputMixin, HiveQueryTask):
    """
    Generic data task that fills one partition of a hive table from a hive query.

    Subclasses supply ``insert_query`` and ``hive_partition_task``.
    """

    overwrite_target_partition = BoolParameter(
        default=True,
        significant=False,
        description='Overwrite the target partition, deleting any existing data. This will not impact other '
                    'partitions. Do not use with incrementally built partitions.',
    )

    @property
    def insert_query(self):
        """The SELECT-style body inserted into the table; embedded by query(). Must be overridden."""
        raise NotImplementedError

    @property
    def hive_partition_task(self):
        """The HivePartitionTask this job writes into. Must be overridden."""
        raise NotImplementedError

    @property
    def data_modification_sql_text(self):
        """Return "OVERWRITE" or "INTO" depending on overwrite_target_partition."""
        return "OVERWRITE" if self.overwrite_target_partition else "INTO"

    def query(self):  # pragma: no cover
        """Render the full USE/INSERT statement for the target partition."""
        query_template = """
        USE {database_name};
        INSERT {into_or_overwrite} TABLE {table}
        PARTITION ({partition.query_spec})
        {insert_query};
        """
        rendered_query = query_template.format(
            database_name=hive_database_name(),
            into_or_overwrite=self.data_modification_sql_text,
            table=self.partition_task.hive_table_task.table,
            partition=self.partition,
            insert_query=self.insert_query.strip(),  # pylint: disable=no-member
        )
        return textwrap.dedent(rendered_query)

    @property
    def partition_task(self):  # pragma: no cover
        """The partition-creating task, built once and then reused."""
        if hasattr(self, '_partition_task'):
            return self._partition_task

        self._partition_task = self.hive_partition_task
        return self._partition_task

    @property
    def partition(self):  # pragma: no cover
        """Shortcut to the partition object of the upstream partition task."""
        return self.partition_task.partition  # pylint: disable=no-member

    def output(self):  # pragma: no cover
        """Return a marker-enabled target for the partition's directory in the warehouse."""
        table_name = self.partition_task.hive_table_task.table
        partition_dir = self.partition.path_spec + '/'
        output_root = url_path_join(self.warehouse_path, table_name, partition_dir)
        return get_target_from_url(output_root, marker=True)

    def on_success(self):  # pragma: no cover
        """Touch the _SUCCESS marker once the task has succeeded.

        Any class that keeps its marker file separate from the data file has to
        override the base on_success() so that the marker gets created."""
        self.output().touch_marker()

    def run(self):
        """Clear existing output when overwriting, then run the hive query."""
        self.remove_output_on_overwrite()
        return super(OverwriteAwareHiveQueryDataTask, self).run()

    def requires(self):  # pragma: no cover
        """Yield the parent class's requirements followed by the partition task."""
        for upstream in super(OverwriteAwareHiveQueryDataTask, self).requires():
            yield upstream

        yield self.partition_task
class SuperJobTask(JobTask):
    """Hadoop streaming job task with pluggable line protocols.

    Input, intermediate and output records are (de)serialised through the
    classes named by INPUT_PROTOCOL, INTERNAL_PROTOCOL and OUTPUT_PROTOCOL.
    With ``local=True`` the job uses local targets and LocalJobRunner
    instead of HDFS targets and SuperHadoopJobRunner.

    NOTE: this class uses Python 2 constructs (``print >>``, ``iteritems``).
    """

    local = BoolParameter(default=False)

    INPUT_PROTOCOL = RawValueProtocol
    INTERNAL_PROTOCOL = RawValueProtocol
    OUTPUT_PROTOCOL = RawValueProtocol

    @classmethod
    def get_param_values(cls, params, args, kwargs):
        """Fill in defaults for unset SuperParameters, then defer to the parent.

        The parent implementation is reached through ``super`` so that it is
        bound to ``cls``. Calling ``JobTask.get_param_values`` directly binds
        it to ``JobTask``, so any class-dependent behaviour in the parent
        would see the wrong class.
        """
        for param_name, param_obj in params:
            if isinstance(param_obj, SuperParameter) and param_name not in kwargs:
                kwargs[param_name] = param_obj.default

        return super(SuperJobTask, cls).get_param_values(params, args, kwargs)

    def __init__(self, *args, **kwargs):
        super(SuperJobTask, self).__init__(*args, **kwargs)
        # Defaults that jobconfs() turns into job configuration entries.
        self.map_task_num = 800
        self.red_task_num = 800
        self.map_memory = 2000
        self.red_memory = 2000
        self.priority = "NORMAL"
        self.options = dict()
        self.job_confs = dict()
        self.input_protocol = self.INPUT_PROTOCOL()
        self.internal_protocol = self.INTERNAL_PROTOCOL()
        self.output_protocol = self.OUTPUT_PROTOCOL()

    def _get_working_file_path(self, path):
        """Return ``path`` unchanged when local, otherwise only its last component."""
        if not self.local:
            return path.rsplit('/', 1)[-1]
        else:
            return path

    def _output(self, output_path):
        """Wrap ``output_path`` in a LocalTarget or an HdfsTarget."""
        if self.local:
            return LocalTarget(output_path)
        else:
            return HdfsTarget(output_path)

    def _input(self, input_path):
        """Wrap ``input_path`` in a local or hadoop external-data task."""
        if self.local:
            return LocalExternalData(input_path)
        else:
            return HadoopExternalData(input_path)

    def reader(self, input_stream):
        """Yield each input line decoded by the input protocol."""
        for line in input_stream:
            yield self.input_protocol.read(line)

    def writer(self, outputs, stdout, stderr=sys.stderr):
        """Write each (key, value) pair to ``stdout`` via the output protocol."""
        for key, value in outputs:
            print >> stdout, self.output_protocol.write(key, value)

    def mapper(self, item):
        """Re-define to process an input item (usually a line of input data)

        Defaults to identity mapper that sends all lines to the same reducer"""
        key, value = item
        yield key, value

    # mapper = NotImplemented

    def _map_input(self, input_stream):
        """Iterate over input and call the mapper for each item.

        Each record produced by reader() is handed to the mapper as a
        (key, value) tuple. After the input is exhausted, final_mapper() is
        drained when defined and the batched counters are flushed."""
        for key, value in self.reader(input_stream):
            mapper_result = self.mapper((key, value))
            if mapper_result:
                for k, v in mapper_result:
                    yield k, v

        if self.final_mapper != NotImplemented:
            for k, v in self.final_mapper():
                yield k, v

        self._flush_batch_incr_counter()

    def _reduce_input(self, inputs, reducer, final=NotImplemented):
        """Iterate over input, collect values with the same key, and call the reducer for each unique key."""
        # groupby only merges adjacent items, so inputs must arrive grouped by key.
        for key, values in groupby(inputs, itemgetter(0)):
            for output in reducer(key, (v[1] for v in values)):
                yield output

        if final != NotImplemented:
            for output in final():
                yield output

        self._flush_batch_incr_counter()

    def _run_mapper(self, stdin=sys.stdin, stdout=sys.stdout):
        """Run the mapper on the hadoop node."""
        self.init_hadoop()
        self.init_mapper()
        # line[:-1] drops the last character of each line (the newline).
        outputs = self._map_input((line[:-1] for line in stdin))
        if self.reducer == NotImplemented:
            # Map-only job: mapper output is the final output.
            self.writer(outputs, stdout)
        else:
            self.internal_writer(outputs, stdout)

    def _run_reducer(self, stdin=sys.stdin, stdout=sys.stdout):
        """Run the reducer on the hadoop node."""
        self.init_hadoop()
        self.init_reducer()
        if self.mapper == NotImplemented:
            outputs = self._reduce_input(
                self.reader((line[:-1] for line in stdin)),
                self.reducer, self.final_reducer)
        else:
            outputs = self._reduce_input(
                self.internal_reader((line[:-1] for line in stdin)),
                self.reducer, self.final_reducer)
        self.writer(outputs, stdout)

    def _run_combiner(self, stdin=sys.stdin, stdout=sys.stdout):
        """Run the combiner on the hadoop node."""
        self.init_hadoop()
        self.init_combiner()
        if self.mapper == NotImplemented:
            outputs = self._reduce_input(
                self.reader((line[:-1] for line in stdin)),
                self.combiner, self.final_combiner)
        else:
            outputs = self._reduce_input(
                self.internal_reader((line[:-1] for line in stdin)),
                self.combiner, self.final_combiner)
        self.internal_writer(outputs, stdout)

    def internal_reader(self, input_stream):
        """Yield each intermediate line decoded by the internal protocol."""
        for line in input_stream:
            yield self.internal_protocol.read(line)

    def internal_writer(self, outputs, stdout):
        """Write each (key, value) pair to ``stdout`` via the internal protocol."""
        for key, value in outputs:
            print >> stdout, self.internal_protocol.write(key, value)

    def job_runner(self):
        """Get the MapReduce runner for this job.

        Returns a LocalJobRunner when ``local`` is set, otherwise a
        SuperHadoopJobRunner built from ``self.options``.

        We recommend that you define a subclass, override this method and
        set up your own config.
        """
        if self.local:
            return LocalJobRunner()
        else:
            return SuperHadoopJobRunner(self.options)

    def extra_archives(self):
        """Return the extra archives for the job (none by default)."""
        return []

    def jobconfs(self):
        """Return the parent's job confs with this task's settings applied."""
        jcs = super(SuperJobTask, self).jobconfs()

        # Replace the job name with the task id stripped of parentheses.
        idx = [
            i for i, conf in enumerate(jcs)
            if conf.startswith("mapred.job.name=")
        ]
        if idx:
            task_name = self.task_id.replace('(', "").replace(")", "")
            idx = idx[0]
            jcs[idx] = 'mapred.job.name=\"%s\"' % task_name

        # Drop inherited values for the keys listed here before the task's
        # own settings are appended below.
        custom_setting = set([
            'mapred.map.tasks', "mapred.job.map.capacity",
            "mapred.reduce.tasks", "mapred.job.reduce.capacity",
            "mapred.job.priority", "stream.memory.limit"
        ])
        jcs = [
            conf for conf in jcs
            if conf.split('=', 1)[0] not in custom_setting
        ]

        jcs.append("mapred.map.tasks=%s" % self.map_task_num)
        jcs.append("mapred.job.map.capacity=%s" % self.map_task_num)
        if self.reducer != NotImplemented:
            jcs.append("mapred.reduce.tasks=%s" % self.red_task_num)
            jcs.append("mapred.job.reduce.capacity=%s" % self.red_task_num)
        else:
            jcs.append("mapred.reduce.tasks=%s" % 0)
            jcs.append("mapred.job.reduce.capacity=%s" % 0)
        jcs.append("mapred.job.priority=%s" % self.priority)
        jcs.append("mapred.map.memory.limit=%s" % self.map_memory)
        jcs.append("mapred.reduce.memory.limit=%s" % self.red_memory)

        for k, v in self.job_confs.iteritems():
            jcs.append("{0}={1}".format(k, v))
        return jcs

    def add_second_sort_support(self, key_field_separator="."):
        """Partition on the first key field and compare on fields 1 and 2 (numeric)."""
        self.options["partitioner"] = "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner"
        self.job_confs["stream.num.map.output.key.fields"] = "2"
        self.job_confs["mapred.text.key.partitioner.options"] = "-k1,1"
        self.job_confs["map.output.key.field.separator"] = key_field_separator
        self.job_confs["mapred.output.key.comparator.class"] = "org.apache.hadoop.mapred.lib.KeyFieldBasedComparator"
        self.job_confs["mapred.text.key.comparator.options"] = "-k1,1 -k2,2n"

    def add_newsecond_sort_support(self, key_field_separator="."):
        """Use three tab-separated key fields, partitioned on the first.

        ``key_field_separator`` is accepted but not used by this variant.
        """
        self.options["partitioner"] = "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner"
        self.job_confs["stream.map.output.field.separator"] = "\t"
        self.job_confs["stream.num.map.output.key.fields"] = "3"
        self.job_confs["map.output.key.field.separator"] = "\t"
        self.job_confs["mapred.text.key.partitioner.options"] = "-k1,1"
        #self.job_confs["mapred.output.key.comparator.class"] = "org.apache.hadoop.mapred.lib.KeyFieldBasedComparator"
        #self.job_confs["mapred.text.key.comparator.options"] = "-k1,1 -k2,2 -k3,3"

    def add_compress_support(self):
        """Enable compression of map output with the QuickLz codec."""
        self.job_confs["mapred.compress.map.output"] = "true"
        self.job_confs["mapred.map.output.compression.codec"] = "org.apache.hadoop.io.compress.QuickLzCodec"
        #self.job_confs["mapred.output.compress"] = "true"
        #self.job_confs["mapred.output.compression.code"] = "org.apache.hadoop.io.compress.LzmaCodec"

    def add_combined_input_support(self):
        """Select CombineTextInputFormat as the input format."""
        self.options["inputformat"] = "org.apache.hadoop.mapred.CombineTextInputFormat"

    def add_multiple_output_support(self):
        """Select SuffixMultipleTextOutputFormat as the output format."""
        self.options["outputformat"] = "org.apache.hadoop.mapred.lib.SuffixMultipleTextOutputFormat"

    def set_min_split_size(self, splitsize):
        """Set ``mapred.min.split.size``."""
        self.job_confs['mapred.min.split.size'] = splitsize

    def set_max_split_size(self, splitsize):
        """Set ``mapred.max.split.size``."""
        self.job_confs['mapred.max.split.size'] = splitsize

    def set_memory_size(self, memory_size):
        """Store ``memory_size`` on ``self.memory``."""
        # NOTE(review): nothing in this class reads `self.memory`; jobconfs()
        # uses map_memory / red_memory. Confirm whether those were meant here.
        self.memory = memory_size
class TrainTheCannonBase(TheCannonMixin):

    """
    Base task that trains a model of The Cannon and writes it to disk.

    :param label_names:
        A list of label names.

    :param order: (optional)
        The polynomial order to use for this model (default: 2).

    :param regularization: (optional)
        The strength of L1-regularization to apply during training.

    :param threads: (optional)
        The number of threads to use (default: 1).

    :param plot: (optional)
        A boolean flag to indicate whether to produce post-training quality plots.
    """

    regularization = FloatParameter(default=0.0)
    threads = IntParameter(default=1, significant=False)
    plot = BoolParameter(default=True, significant=False)

    def run(self):
        """
        Train the model, save it, and optionally save diagnostic figures.
        """

        # Read the training set: labels, dispersion, flux and inverse variance.
        training_set = read_training_set(
            self.input().path,
        )
        labels, dispersion, training_set_flux, training_set_ivar = training_set

        # Label names are sorted so that luigi does not re-train a model when
        # only their order changes.
        sorted_label_names = sorted(self.label_names)
        vectorizer = tc.vectorizer.PolynomialVectorizer(
            sorted_label_names,
            self.order
        )

        model = tc.model.CannonModel(
            labels,
            training_set_flux,
            training_set_ivar,
            vectorizer=vectorizer,
            dispersion=dispersion,
            regularization=self.regularization
        )

        log.info(f"Training The Cannon model {model}")
        model.train(threads=self.threads)

        output_path = self.output().path
        log.info(f"Writing The Cannon model {model} to disk {output_path}")
        model.write(output_path)

        if not self.plot:
            return

        # Inside this method the bare name `plot` resolves to the module-level
        # name, not to the `plot` parameter declared on the class.

        # Zeroth and first order coefficients.
        n_theta = 1 + len(model.vectorizer.label_names)
        theta_figure = plot.theta(
            model,
            indices=np.arange(n_theta),
            normalize=False
        )
        theta_figure.savefig(f"{self.task_id}-theta.png")

        # Model scatter.
        scatter_figure = plot.scatter(model)
        scatter_figure.savefig(f"{self.task_id}-scatter.png")

        # One-to-one comparison, testing the model on its own training set.
        test_labels, test_cov, test_meta = model.test(
            training_set_flux,
            training_set_ivar,
            initial_labels=model.training_set_labels
        )
        one_to_one_figure = plot.one_to_one(model, test_labels, cov=test_cov)
        one_to_one_figure.savefig(f"{self.task_id}-one-to-one.png")

    def output(self):
        """
        Return a local target at ``<output_base_dir>/<task_id>.pkl``.
        """
        model_path = os.path.join(self.output_base_dir, f"{self.task_id}.pkl")
        return LocalTarget(model_path)
class GetEvals(Task):
    """Build a table of per-position evaluations from the upstream games.

    The input is a pickled DataFrame with (at least) the list-valued columns
    ``evaluations``, ``eval_depths`` and ``positions``.  Rows that already
    carry evaluations are exploded into one row per position.  When
    ``local_stockfish`` is set, positions of rows without evaluations are
    evaluated through ``get_sf_evaluation``.  Positions returned by
    ``query_for_column('position_evals', 'fen')`` are excluded from the
    result.

    Parameters:
        local_stockfish: evaluate positions lacking evaluations locally.
        columns: the columns (and their order) kept in the output frame.

    NOTE(review): ``self.since`` and ``self.player`` are read in ``output``
    but are not declared in this class body; presumably they are provided
    from outside it (e.g. inherited from the upstream task) -- confirm.
    """

    local_stockfish = BoolParameter()
    columns = ListParameter()

    def output(self):
        """Return the local pickle target holding the evaluations."""
        import os

        file_location = (f'~/Temp/luigi/{self.since}-game-evals-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        """Compute the evaluation table and pickle it to the output."""
        from pandas import read_pickle, to_numeric, concat, DataFrame

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # Nothing to evaluate: pass the empty frame through unchanged so
            # that the output target still exists.
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)
            return

        stockfish_params = stockfish_cfg()

        df = df[['evaluations', 'eval_depths', 'positions']]

        positions_evaluated = query_for_column('position_evals', 'fen')

        # explode the two different list-likes separately, then concat
        has_evals = df['evaluations'].astype(bool)
        no_evals = df[~has_evals]
        df = df[has_evals]

        evals = df['evaluations'].explode().reset_index(drop=True)
        depths = df['eval_depths'].explode().reset_index(drop=True)
        positions = df['positions'].explode().reset_index(drop=True)
        # drop the last whitespace-separated field of every position string
        positions = positions.str.split().str[:-1].str.join(' ')

        df = concat([positions, evals, depths], axis=1)

        if self.local_stockfish:
            no_evals = DataFrame(no_evals['positions'].explode())
            no_evals['positions'] = (no_evals['positions'].str.split()
                                                          .str[:-1]
                                                          .str.join(' '))

            local_evals = []
            counter = 0
            position_count = len(no_evals['positions'])
            # hoisted: the attribute lookup is invariant across the loop
            evaluated_values = positions_evaluated.values
            # Start from None so that a missing engine result for the very
            # first position cannot hit an unbound local; None rows are
            # dropped below.
            evaluation = None

            for position in no_evals['positions'].tolist():
                if position in evaluated_values:
                    evaluation = None
                else:
                    sf_eval = get_sf_evaluation(position + ' 0',
                                                stockfish_params.location,
                                                stockfish_params.depth)
                    # Compare against None explicitly: a falsy-but-valid
                    # result such as 0 must be kept.  When the engine gives
                    # no result the previous evaluation is carried over, as
                    # before.
                    if sf_eval is not None:
                        evaluation = sf_eval

                local_evals.append(evaluation)

                # progress bar stuff
                counter += 1

                current_progress = counter / position_count

                self.set_status_message(f'Analyzed :: '
                                        f'{counter} / {position_count}')
                self.set_progress_percentage(round(current_progress * 100, 2))

            self.set_status_message(f'Analyzed all {position_count} positions')
            self.set_progress_percentage(100)

            no_evals['evaluations'] = local_evals
            no_evals['eval_depths'] = stockfish_params.depth
            no_evals = no_evals.dropna()

            df = concat([df, no_evals], axis=0, ignore_index=True)

        df = df[~df['positions'].isin(positions_evaluated)]

        # Reassign rather than mutate in place: ``df`` is a filtered view of
        # an earlier frame at this point.
        df = df.rename(columns={'evaluations': 'evaluation',
                                'eval_depths': 'eval_depth',
                                'positions': 'fen'})
        df['evaluation'] = to_numeric(df['evaluation'], errors='coerce')

        df = df.dropna()
        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
class FetchLichessApiPGN(Task):
    """Download a player's games from the Lichess API and parse them.

    Every game returned by ``lichess.api.user_games`` is run through a set
    of visitors; the game headers, the visitor results and the mainline
    moves (in SAN) are collected into one DataFrame row per game, and the
    frame is pickled to the output target.

    The input is a pickled object whose length is used as the expected
    number of games for progress reporting only.

    Parameters:
        player: Lichess user name.
        perf_type: value passed to the API as ``perfType``.
        since: first day to fetch.  The default is computed once, when the
            class body is evaluated.
        single_day: if set, fetch only the day ``since``; otherwise fetch up
            to the start of the current day.
    """

    player = Parameter(default='thibault')
    perf_type = Parameter(default='blitz')
    since = DateParameter(default=datetime.today().date() - timedelta(days=1))
    single_day = BoolParameter()

    def output(self):
        """Return the local pickle target holding the parsed games."""
        import os

        file_location = (f'~/Temp/luigi/{self.since}-raw-games-'
                         f'{self.player}-pgn.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        """Fetch, parse and pickle the games."""
        import lichess.api
        from lichess.format import PYCHESS
        from pandas import DataFrame, read_pickle
        from calendar import timegm
        from pipeline_import.visitors import EvalsVisitor, ClocksVisitor
        from pipeline_import.visitors import QueenExchangeVisitor
        from pipeline_import.visitors import CastlingVisitor, PositionsVisitor
        from pipeline_import.visitors import PromotionsVisitor, MaterialVisitor

        self.output().makedirs()

        with self.input().open('r') as f:
            upstream_games = read_pickle(f, compression=None)
            game_count = len(upstream_games)

        # The API is given millisecond timestamps (seconds * 1000).
        if self.single_day:
            unix_time_until = timegm((self.since
                                      + timedelta(days=1)).timetuple())
        else:
            unix_time_until = timegm(datetime.today().date().timetuple())
        self.until = int(1000 * unix_time_until)

        unix_time_since = timegm(self.since.timetuple())
        self.since_unix = int(1000 * unix_time_since)

        token = lichess_token().token

        games = lichess.api.user_games(self.player,
                                       since=self.since_unix,
                                       until=self.until,
                                       perfType=self.perf_type,
                                       auth=token,
                                       clocks='true',
                                       evals='true',
                                       opening='true',
                                       format=PYCHESS)

        visitors = [EvalsVisitor,
                    ClocksVisitor,
                    QueenExchangeVisitor,
                    CastlingVisitor,
                    PromotionsVisitor,
                    PositionsVisitor,
                    MaterialVisitor,
                    ]

        # output column name -> attribute read from the game object after
        # the visitors have run
        visitor_stats = {'clocks': 'clocks',
                         'evaluations': 'evals',
                         'eval_depths': 'eval_depths',
                         'queen_exchange': 'queen_exchange',
                         'castling_sides': 'castling',
                         'has_promotion': 'has_promotion',
                         'promotion_count_white': 'promotion_count_white',
                         'promotion_count_black': 'promotion_count_black',
                         'promotions_white': 'promotions_white',
                         'promotions_black': 'promotions_black',
                         'positions': 'positions',
                         'black_berserked': 'black_berserked',
                         'white_berserked': 'white_berserked',
                         'material_by_move': 'material_by_move',
                         }

        header_infos = []

        counter = 0

        for game in games:
            game_infos = dict(game.headers.items())
            # The header copy above is taken before this rewrite, so the
            # output keeps the original Variant value; only what runs after
            # this point sees 'Standard'.
            if game.headers['Variant'] == 'From Position':
                game.headers['Variant'] = 'Standard'
            for visitor in visitors:
                game.accept(visitor(game))
            game_infos.update((column, getattr(game, attribute))
                              for column, attribute in visitor_stats.items())
            game_infos['moves'] = [x.san() for x in game.mainline()]
            header_infos.append(game_infos)

            # progress bar stuff
            counter += 1

            current = f'{game_infos["UTCDate"]} {game_infos["UTCTime"]}'

            self.set_status_message(f'Parsed until {current} :: '
                                    f'{counter} / {game_count}')
            # game_count comes from the upstream input, not from this API
            # call, so it may be zero or smaller than the number of games
            # actually received: avoid dividing by zero and never report
            # more than 100%.
            if game_count:
                current_progress = counter / game_count
                self.set_progress_percentage(
                    min(round(current_progress * 100, 2), 100))

        df = DataFrame(header_infos)

        self.set_status_message('Parsed all games')
        self.set_progress_percentage(100)

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)