Example No. 1
class AllState(luigi.WrapperTask):
    state = luigi.Parameter()

    def requires(self):
        for year in STATE_YEARS[self.state]:
            yield StateFiles(year=year, state=self.state)
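AllState assumes a module-level STATE_YEARS mapping from each state code to the years for which files exist; the mapping is not shown in this example, so the sketch below is only a hypothetical illustration of its shape:

# Hypothetical illustration of the STATE_YEARS mapping assumed by AllState.
STATE_YEARS = {
    'CA': [2015, 2016, 2017],
    'NY': [2016, 2017],
}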
Example No. 2
class MergeMorphologyBase(luigi.Task):
    """ MergeMorphology base class
    """

    task_name = 'merge_morphology'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    number_of_labels = luigi.IntParameter()
    prefix = luigi.Parameter()
    #
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run_impl(self):
        # get the global config and init configs
        shebang = self.global_config_values()[0]
        self.init(shebang)

        # load the task config
        config = self.get_task_config()

        out_shape = (int(self.number_of_labels), 11)
        out_chunks = (min(int(self.number_of_labels), 100000), 11)
        block_list = vu.blocks_in_volume([out_shape[0]], [out_chunks[0]])

        # create output dataset
        with vu.file_reader(self.output_path) as f:
            f.require_dataset(self.output_key,
                              shape=out_shape,
                              chunks=out_chunks,
                              compression='gzip',
                              dtype='float64')

        # update the config with input and graph paths and keys
        # as well as block shape
        config.update({
            'input_path': self.input_path,
            'input_key': self.input_key,
            'output_path': self.output_path,
            'output_key': self.output_key,
            'out_shape': out_shape,
            'out_chunks': out_chunks
        })

        # prime and run the jobs
        n_jobs = min(len(block_list), self.max_jobs)
        self.prepare_jobs(n_jobs, block_list, config, self.prefix)
        self.submit_jobs(n_jobs, self.prefix)

        # wait till jobs finish and check for job success
        self.wait_for_jobs(self.prefix)
        self.check_jobs(n_jobs, self.prefix)

    # part of the luigi API
    def output(self):
        return luigi.LocalTarget(
            os.path.join(self.tmp_folder,
                         self.task_name + '_%s.log' % self.prefix))
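MergeMorphologyBase leaves the actual job execution (run, global_config_values, prepare_jobs, and so on) to a scheduler-specific mixin. A minimal sketch of a concrete task, assuming a hypothetical LocalTask mixin from the same package supplies that machinery:

# Sketch only: LocalTask is an assumed scheduler mixin that provides run(),
# global_config_values(), prepare_jobs(), submit_jobs(), wait_for_jobs(), etc.
class MergeMorphologyLocal(MergeMorphologyBase, LocalTask):
    """MergeMorphology executed on the local machine."""
    pass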
Example No. 3
class HivePartitionTask(WarehouseMixin, OverwriteOutputMixin, HiveQueryTask):
    """
    Abstract class that represents the metadata associated with a partition in a Hive table.

    Note that all this task does is ensure that the partition is created; it does not populate it with any data, it
    simply runs the DDL commands to create the partition.
    """

    partition_value = luigi.Parameter()

    def query(self):
        if self.overwrite:
            drop_on_overwrite = 'ALTER TABLE `{table}` DROP IF EXISTS PARTITION ({partition.query_spec});'.format(
                table=self.hive_table_task.table, partition=self.partition)
        else:
            drop_on_overwrite = ''

        query_format = """
            USE {database_name};
            {drop_on_overwrite}
            ALTER TABLE `{table}` ADD IF NOT EXISTS PARTITION ({partition.query_spec});
        """

        query = query_format.format(database_name=hive_database_name(),
                                    table=self.hive_table_task.table,
                                    partition=self.partition,
                                    drop_on_overwrite=drop_on_overwrite)

        return textwrap.dedent(query)

    @property
    def hive_table_task(self):
        """Returns a reference to the task that represents the table that this partition is part of."""
        raise NotImplementedError

    @property
    def data_task(self):
        """Returns a luigi task that is used to insert real data into this partition."""
        return None

    @property
    def partition(self):
        """Returns a HivePartition object that represents the partition."""
        return HivePartition(self.hive_table_task.partition_by,
                             self.partition_value)

    @property
    def partition_location(self):
        """Returns the full URL of the partition. This allows data to be written to the partition by external systems"""
        return url_path_join(self.hive_table_task.table_location,
                             self.partition.path_spec + '/')

    def requires(self):
        if self.data_task is not None:
            yield self.data_task
        yield self.hive_table_task

    def output(self):
        # Ugh.  A change in Luigi 1.0.22 (after our 1.0.17 fork) resulted in a change in ApacheHiveCommandClient.table_exists()
        # behavior, so that it throws an exception when checking for a specific partition when the table doesn't exist.
        # This means that HivePartitionTarget.exists() will fail, where before it succeeded even if the table did not exist.
        # So we pass fail_missing_table=False here.  There is no reason to fail on a missing table anyway.
        return HivePartitionTarget(self.hive_table_task.table,
                                   self.partition.as_dict(),
                                   database=hive_database_name(),
                                   fail_missing_table=False)

    def job_runner(self):
        return OverwriteAwareHiveQueryRunner()

    def remove_output_on_overwrite(self):
        # Note that the query takes care of actually removing the old partition.
        if self.overwrite:
            self.attempted_removal = True
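HivePartitionTask is abstract: a concrete subclass must supply hive_table_task and, if the partition should also be populated, data_task. A minimal sketch, where EnrollmentTableTask and EnrollmentDataTask are hypothetical stand-ins for real table and data tasks:

# Hedged sketch; EnrollmentTableTask and EnrollmentDataTask are hypothetical.
class EnrollmentPartitionTask(HivePartitionTask):

    @property
    def hive_table_task(self):
        # The task that creates the Hive table this partition belongs to.
        return EnrollmentTableTask(overwrite=self.overwrite)

    @property
    def data_task(self):
        # Optional: the task that writes the actual data into the partition.
        return EnrollmentDataTask(output_root=self.partition_location)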
Example No. 4
class KickHkmaT0101(MkDir):
    workdir = luigi.Parameter(default='t0101-monetary-base')

    def requires(self):
        extract = ExtractHkma(**self.givedir, urlpath='/T0101.xls')
        yield extract
Example No. 5
class MortarRTask(luigi.Task):
    """
    Luigi Task to run an R script.

    To use this Task in a pipeline, create a subclass that overrides the methods:

    * `rscript`
    * `arguments`

    seealso:: https://help.mortardata.com/technologies/luigi/r_tasks
    """

    # Location where completion tokens are written
    # e.g. s3://my-bucket/my-path
    token_path = luigi.Parameter()

    def output_token(self):
        """
        Luigi Target providing path to a token that indicates
        completion of this Task.

        :rtype: Target:
        :returns: Target for Task completion token
        """
        return target_factory.get_target(
            '%s/%s' % (self.token_path, self.__class__.__name__))

    def output(self):
        """
        The output for this Task. Returns the output token
        by default, so the task only runs if the token does not 
        already exist.

        :rtype: Target:
        :returns: Target for Task completion token
        """
        return [self.output_token()]

    @abc.abstractmethod
    def rscript(self):
        """
        Path to the R script to run, relative to the root of your Mortar project.

        Ex:
            If you have two files in your Mortar project:
                * luigiscripts/my_r_luigiscript.py
                * rscripts/my_r_script.R

            You would return:
                "rscripts/my_r_script.R"

        :rtype: str:
        :returns: Path to your R script relative to the root of your Mortar project. e.g. rscripts/my_r_script.R
        """
        raise RuntimeError(
            "Please implement the rscript method in your MortarRTask to specify which script to run."
        )

    def arguments(self):
        """
        Returns list of arguments to be sent to your R script.

        :rtype: list of str:
        :returns: List of arguments to pass to your R script. Default: []
        """
        return []

    def run(self):
        """
        Run an R script using the Rscript program. Pipes stdout and
        stderr back to the logging facility.
        """
        cmd = self._subprocess_command()
        output = subprocess.Popen(cmd,
                                  shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  bufsize=1)
        for line in iter(output.stdout.readline, b''):
            logger.info(line)
        out, err = output.communicate()
        rc = output.returncode
        if rc != 0:
            raise RuntimeError('%s returned non-zero error code %s' %
                               (self._subprocess_command(), rc))

        target_factory.write_file(self.output_token())

    def _subprocess_command(self):
        return "Rscript %s %s" % (self.rscript(), " ".join(self.arguments()))
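As the docstring says, a pipeline uses MortarRTask by subclassing it and overriding rscript (and usually arguments); a minimal sketch with a hypothetical script path and arguments:

class MyRScriptTask(MortarRTask):

    def rscript(self):
        # Path relative to the root of the Mortar project (hypothetical).
        return 'rscripts/my_r_script.R'

    def arguments(self):
        # Hypothetical arguments passed straight through to Rscript.
        return ['--input', 'data/input.csv', '--output', 'data/output.csv']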
Example No. 6
class ForgotParam(luigi.Task):
    param = luigi.Parameter()

    def run(self):
        pass
Example No. 7
class KickHkmaT030502(MkDir):
    workdir = luigi.Parameter(default='t030502-econ-sector-loans-and-advances')

    def requires(self):
        extract = ExtractHkma(**self.givedir, urlpath='/T030502.xls')
        yield extract
Example No. 8
class InsignificantParameterTask(luigi.Task):
    foo = luigi.Parameter(significant=False)
    bar = luigi.Parameter()
Example No. 9
def testNoValue(self):
    self.assertRaises(ParameterException, lambda: luigi.Parameter(config_path=dict(section="foo", name="bar")).value)
Example No. 10
class ElasticsearchIndexTask(OverwriteOutputMixin, MapReduceJobTask):
    """
    Index a stream of documents in an elasticsearch index.

    This task is intended to do the following:
    * Create a new index that is unique to this task run (all significant parameters).
    * Load all of the documents into this unique index.
    * If the alias is already pointing at one or more indexes, switch it so that it only points at this newly loaded
      index.
    * Delete any indexes that were previously pointed at by the alias, leaving only the newly loaded index.

    """

    host = luigi.Parameter(
        is_list=True,
        config_path={
            'section': 'elasticsearch',
            'name': 'host'
        },
        description=
        'Hostnames for the elasticsearch cluster nodes. They can be specified in any of the formats'
        ' accepted by the elasticsearch-py library. This includes complete URLs such as http://foo.com/, or'
        ' host port pairs such as foo:8000. Note that if you wish to use SSL you should specify a full URL'
        ' and the "https" scheme.')
    timeout = luigi.FloatParameter(
        config_path={
            'section': 'elasticsearch',
            'name': 'timeout'
        },
        significant=False,
        default=60,
        description=
        'Maximum number of seconds to wait when attempting to make connections to the elasticsearch cluster'
        ' before assuming the cluster is not responding and giving up with a timeout error.'
    )
    connection_type = luigi.Parameter(
        config_path={
            'section': 'elasticsearch',
            'name': 'connection_type'
        },
        significant=False,
        default='urllib',
        description=
        'If not specified, default to using urllib3 to make HTTP requests to elasticsearch. The other valid'
        ' value is "aws" which can be used to connect to clusters that are managed by AWS. See'
        ' `AWS elasticsearch service <https://aws.amazon.com/elasticsearch-service/>`_'
    )
    alias = luigi.Parameter(
        description=
        'Name of the alias in elasticsearch that will point to the complete index when loaded. This value '
        ' should match the settings of edx-analytics-data-api.')
    number_of_shards = luigi.Parameter(
        default=None,
        description=
        'Number of `shards <https://www.elastic.co/guide/en/elasticsearch/reference/current/glossary.html'
        '#glossary-shard>`_ to use in the elasticsearch index.')
    throttle = luigi.FloatParameter(
        default=0.1,
        significant=False,
        description=
        'Wait this many seconds between batches of records submitted to the cluster to be indexed. This can'
        ' be used to tune the indexing process, allowing the cluster to successfully "keep up" with the'
        ' loader. Note that often the hadoop cluster can load records much more quickly than the cluster'
        ' can index them, which eventually causes queues to overflow within the elasticsearch cluster.'
    )
    batch_size = luigi.IntParameter(
        default=1000,
        significant=False,
        description=
        'Number of records to submit to the cluster to be indexed in a single request. A small value here'
        ' will result in more, smaller, requests and a larger value will result in fewer, bigger requests.'
    )
    indexing_tasks = luigi.IntParameter(
        default=None,
        significant=False,
        description=
        'Number of parallel processes to use to submit records to be indexed from. The stream of records'
        ' will be divided up evenly among these processes during the indexing procedure.'
    )
    max_attempts = luigi.IntParameter(
        default=10,
        significant=False,
        description=
        'If the elasticsearch cluster rejects a batch of records (usually because it is too busy) the'
        ' indexing process will retry up to this many times before giving up. It uses an exponential back-'
        'off strategy, so a high value here can result in very significant wait times before retrying.'
    )

    # These attributes can be overridden by subclasses, but don't need to be.
    settings = {}
    properties = {}

    def __init__(self, *args, **kwargs):
        super(ElasticsearchIndexTask, self).__init__(*args, **kwargs)

        self.other_reduce_tasks = self.n_reduce_tasks
        if self.indexing_tasks is not None:
            self.n_reduce_tasks = self.indexing_tasks

        self.batch_index = 0
        self.index = self.alias + '_' + str(hash(self.update_id()))
        self.indexes_for_alias = set()

    def init_local(self):
        super(ElasticsearchIndexTask, self).init_local()

        elasticsearch_client = self.create_elasticsearch_client()

        # Find all indexes that are referred to by this alias (currently). These will be deleted after a successful
        # load of the new index.
        aliases = elasticsearch_client.indices.get_aliases(name=self.alias)
        self.indexes_for_alias.update([
            index for index, alias_info in aliases.iteritems()
            if self.alias in alias_info['aliases'].keys()
        ])

        if self.index in self.indexes_for_alias:
            if not self.overwrite:
                raise RuntimeError(
                    'Index {0} is currently in use by alias {1}'.format(
                        self.index, self.alias))
            else:
                # These indexes will be deleted after the alias swap; make sure we don't delete the index we just
                # populated.
                self.indexes_for_alias.remove(self.index)

        if not self.overwrite and len(self.indexes_for_alias) > 1:
            raise RuntimeError(
                'Invalid state, multiple existing indexes ({0}) found for alias {1}'
                .format(', '.join(self.indexes_for_alias), self.alias))

        # In order for the OverwriteOutputMixin to recognize that this task has run we need to let it know. This will
        # allow it to actually check if the task is complete after it is run.
        self.attempted_removal = True

        if elasticsearch_client.indices.exists(index=self.index):
            elasticsearch_client.indices.delete(index=self.index)

        settings = {
            'refresh_interval': -1,
        }
        if self.number_of_shards is not None:
            settings['number_of_shards'] = self.number_of_shards

        if self.settings:
            settings.update(self.settings)

        elasticsearch_client.indices.create(index=self.index,
                                            body={
                                                'settings': settings,
                                                'mappings': {
                                                    self.doc_type: {
                                                        'properties':
                                                        self.properties
                                                    }
                                                }
                                            })

    def create_elasticsearch_client(self):
        """Build an elasticsearch client using the various parameters passed into this task."""
        kwargs = {}
        if self.connection_type == 'aws':
            kwargs['connection_class'] = AwsHttpConnection
        return elasticsearch.Elasticsearch(
            hosts=self.host,
            timeout=self.timeout,
            retry_on_status=(HTTP_CONNECT_TIMEOUT_STATUS_CODE,
                             HTTP_GATEWAY_TIMEOUT_STATUS_CODE),
            retry_on_timeout=True,
            **kwargs)

    def mapper(self, line):
        yield (random.randrange(int(self.n_reduce_tasks)), line.rstrip('\r\n'))

    def reducer(self, _key, lines):
        """
        Given a batch of records, transmit them to the elasticsearch cluster to be indexed.

        There should be one reducer per parallel indexing thread. Controlling the number of reducers is the way to
        control the level of parallelism in the load process.
        """
        elasticsearch_client = self.create_elasticsearch_client()

        document_iterator = self.document_generator(lines)
        first_batch = True
        while True:
            bulk_action_batch = self.next_bulk_action_batch(document_iterator)

            if not bulk_action_batch:
                break

            if not first_batch and self.throttle:
                time.sleep(self.throttle)
            first_batch = False

            if self.send_bulk_action_batch(elasticsearch_client,
                                           bulk_action_batch):
                self.incr_counter('Elasticsearch', 'Committed Batches', 1)

                # Note that each document produces two entries in the bulk_action_batch list.
                num_records = len(bulk_action_batch) / 2
                self.incr_counter('Elasticsearch', 'Records Indexed',
                                  num_records)
            else:
                raise IndexingError(
                    'Batch of records rejected too many times. Aborting.')

        # Luigi requires the reducer to actually return something, so we just return empty strings that are written
        # to a temp file in HDFS that is immediately cleaned up after the job finishes.
        yield ('', '')

    def next_bulk_action_batch(self, document_iterator):
        """
        Read a batch of documents from the iterator and convert them into bulk index actions.

        Elasticsearch expects each document to actually be transmitted on two lines the first of which details the
        action to take, and the second contains the actual document.

        See the `Cheaper in Bulk <https://www.elastic.co/guide/en/elasticsearch/guide/1.x/bulk.html>`_ guide.

        Arguments:
            document_iterator (iterator of dicts):

        Returns: A list of dicts that can be transmitted to elasticsearch using the "bulk" request.
        """
        bulk_action_batch = []
        for raw_data in islice(document_iterator, self.batch_size):
            action, data = elasticsearch.helpers.expand_action(raw_data)
            bulk_action_batch.append(action)
            if data is not None:
                bulk_action_batch.append(data)
        return bulk_action_batch

    def send_bulk_action_batch(self, elasticsearch_client, bulk_action_batch):
        """
        Given a batch of actions, transmit them in bulk to the elasticsearch cluster.

        This method handles back-pressure from the elasticsearch cluster which queues up writes. When the queue is full
        the cluster will start rejecting additional bulk indexing requests. This method implements an exponential
        back-off, allowing the cluster to catch-up with the client.

        Arguments:
            elasticsearch_client (elasticsearch.Elasticsearch): A reference to an elasticsearch client.
            bulk_action_batch (list of dicts): A list of bulk actions followed by their respective documents.

        Raises:
            IndexingError: If a record cannot be indexed by elasticsearch this method assumes that is a fatal error and
                it immediately raises this exception. If we try to transmit a batch repeatedly and it is continually
                rejected by the cluster, this method will give up after `max_attempts` and raise this error.

        Returns: True iff the batch of actions was successfully transmitted to and acknowledged by the elasticsearch
            cluster.
        """
        attempts = 0
        batch_written_successfully = False
        while True:
            try:
                resp = elasticsearch_client.bulk(bulk_action_batch,
                                                 index=self.index,
                                                 doc_type=self.doc_type)
            except TransportError as transport_error:
                if transport_error.status_code not in (
                        REJECTED_REQUEST_STATUS,
                        HTTP_SERVICE_UNAVAILABLE_STATUS_CODE):
                    raise transport_error
            else:
                num_errors = 0
                for raw_data in resp['items']:
                    _op_type, item = raw_data.popitem()
                    successful = 200 <= item.get('status', 500) < 300
                    if not successful:
                        log.error('Failed to index: %s', str(item))
                        num_errors += 1

                if num_errors == 0:
                    batch_written_successfully = True
                    break
                else:
                    raise IndexingError(
                        'Failed to index {0} records. Aborting.'.format(
                            num_errors))

            attempts += 1
            if attempts < self.max_attempts:
                sleep_duration = 2**attempts
                self.incr_counter('Elasticsearch', 'Rejected Batches', 1)
                log.warn(
                    'Batch of records rejected. Sleeping for %d seconds before retrying.',
                    sleep_duration)
                time.sleep(sleep_duration)
            else:
                batch_written_successfully = False
                break

        return batch_written_successfully

    def document_generator(self, lines):
        """
        Given lines of raw text, generates structured documents that will be indexed by elasticsearch.

        The returned document should have roughly the following structure:

            {
                "_id": "(optional) your custom identifier for the document",
                "_source": {
                    "prop0": "you should have one key-value pair for each property and its value"
                }
            }

        Note that you can also specify other "special" fields other than "_id":

        - _index
        - _parent
        - _percolate
        - _routing
        - _timestamp
        - _ttl
        - _type
        - _version
        - _version_type
        - _retry_on_conflict

        The "_source" field is required.

        Arguments:
            lines (iterable of unicode strings): This is the raw data to be indexed.

        Yields:
            dict: The document to index in the format expected by the elasticsearch bulk loading process.
        """
        raise NotImplementedError

    @property
    def doc_type(self):
        """
        Elasticsearch `document type <https://www.elastic.co/guide/en/elasticsearch/guide/current/mapping.html>`_.
        """
        raise NotImplementedError

    def extra_modules(self):
        import urllib3

        packages = [elasticsearch, urllib3]

        return packages

    def jobconfs(self):
        jcs = super(ElasticsearchIndexTask, self).jobconfs()
        jcs.append('mapred.reduce.tasks.speculative.execution=false')
        return jcs

    def update_id(self):
        """A unique identifier for this task instance that is used to determine if it should be run again."""
        return self.task_id

    def output(self):
        return ElasticsearchTarget(client=self.create_elasticsearch_client(),
                                   index=self.alias,
                                   doc_type=self.doc_type,
                                   update_id=self.update_id())

    def commit(self):
        """
        If all documents have been loaded successfully, make the changes visible to users.
        """
        # The ordering of operations here is sensitive.

        elasticsearch_client = self.create_elasticsearch_client()

        # First "refresh" the newly loaded index. We disable refreshes during the load to keep throughput high. This
        # step is necessary to ensure all of the documents are properly indexed and user-visible.
        elasticsearch_client.indices.refresh(index=self.index)

        # Perform an atomic swap of the alias.
        actions = []
        old_indexes = [
            ix for ix in self.indexes_for_alias
            if elasticsearch_client.indices.exists(index=ix)
        ]
        for old_index in old_indexes:
            actions.append(
                {"remove": {
                    "index": old_index,
                    "alias": self.alias
                }})
        actions.append({"add": {"index": self.index, "alias": self.alias}})
        elasticsearch_client.indices.update_aliases({"actions": actions})

        # Update the luigi metadata to indicate that the task ran successfully.
        self.output().touch()

        # Attempt to remove any old indexes that are now no longer user-visible.
        for old_index in old_indexes:
            elasticsearch_client.indices.delete(index=old_index)

    def rollback(self):
        """
        If something goes wrong during the load, attempt to clean up the partially loaded index.
        """
        elasticsearch_client = self.create_elasticsearch_client()
        try:
            if elasticsearch_client.indices.exists(index=self.index):
                elasticsearch_client.indices.delete(index=self.index)
        except Exception:  # pylint: disable=broad-except
            log.exception("Unable to rollback the elasticsearch load.")

    def run(self):
        try:
            super(ElasticsearchIndexTask, self).run()
        except Exception:  # pylint: disable=broad-except
            self.rollback()
            raise
        else:
            self.commit()
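ElasticsearchIndexTask is likewise abstract: subclasses provide doc_type, document_generator, and usually a properties mapping. A minimal sketch, assuming JSON-lines input and hypothetical field names (json is assumed to be imported at module level):

class CourseEnrollmentIndexTask(ElasticsearchIndexTask):
    """Hedged sketch of a concrete index task; the fields are hypothetical."""

    properties = {
        'course_id': {'type': 'string', 'index': 'not_analyzed'},
        'enrollment_count': {'type': 'integer'},
    }

    @property
    def doc_type(self):
        return 'course_enrollment'

    def document_generator(self, lines):
        # Assumes each input line is a JSON record with the fields above.
        for line in lines:
            record = json.loads(line)
            yield {
                '_id': record['course_id'],
                '_source': {
                    'course_id': record['course_id'],
                    'enrollment_count': int(record['enrollment_count']),
                },
            }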
Example No. 11
class HasGlobalParam(luigi.Task):
    x = luigi.Parameter()
    global_bool_param = luigi.BoolParameter(is_global=True, default=False)

    def run(self):
        self.complete = lambda: True

    def complete(self):
        return False


class HasGlobalParamDep(luigi.Task):
    x = luigi.Parameter()

    def requires(self):
        return HasGlobalParam(self.x)

_shared_global_param = luigi.Parameter(is_global=True, default='123')


class SharedGlobalParamA(luigi.Task):
    shared_global_param = _shared_global_param


class SharedGlobalParamB(luigi.Task):
    shared_global_param = _shared_global_param


class BananaDep(luigi.Task):
    x = luigi.Parameter()
    y = luigi.Parameter(default='def')

    def output(self):
Example No. 12
class RayTracingLoop(QueenbeeTask):
    """Run ray-tracing and post-process the results for a point-in-time simulation."""

    # DAG Input parameters
    _input_params = luigi.DictParameter()

    # Task inputs
    @property
    def radiance_parameters(self):
        return self._input_params['radiance_parameters']

    @property
    def metric(self):
        return self._input_params['metric']

    fixed_radiance_parameters = luigi.Parameter(default='-h')

    @property
    def grid(self):
        value = pathlib.Path(self.input()['SplitGrid']['output_folder'].path,
                             self.item['path'])
        return value.as_posix() if value.is_absolute() \
            else pathlib.Path(self.initiation_folder, value).resolve().as_posix()

    @property
    def scene_file(self):
        value = pathlib.Path(self._input_params['octree_file'])
        return value.as_posix() if value.is_absolute() \
            else pathlib.Path(self.initiation_folder, value).resolve().as_posix()

    # get item for loop
    try:
        item = luigi.DictParameter()
    except Exception:
        item = luigi.Parameter()

    @property
    def execution_folder(self):
        return pathlib.Path(self._input_params['simulation_folder'],
                            'results').resolve().as_posix()

    @property
    def initiation_folder(self):
        return pathlib.Path(self._input_params['simulation_folder']).as_posix()

    @property
    def params_folder(self):
        return pathlib.Path(
            self.execution_folder,
            self._input_params['params_folder']).resolve().as_posix()

    def command(self):
        return 'honeybee-radiance raytrace point-in-time scene.oct grid.pts --rad-params "{radiance_parameters}" --rad-params-locked "{fixed_radiance_parameters}" --metric {metric} --output grid.res'.format(
            radiance_parameters=self.radiance_parameters,
            fixed_radiance_parameters=self.fixed_radiance_parameters,
            metric=self.metric)

    def requires(self):
        return {'SplitGrid': SplitGrid(_input_params=self._input_params)}

    def output(self):
        return {
            'result':
            luigi.LocalTarget(
                pathlib.Path(
                    self.execution_folder, '{item_name}.res'.format(
                        item_name=self.item['name'])).resolve().as_posix())
        }

    @property
    def input_artifacts(self):
        return [{
            'name': 'grid',
            'to': 'grid.pts',
            'from': self.grid,
            'optional': False
        }, {
            'name': 'scene_file',
            'to': 'scene.oct',
            'from': self.scene_file,
            'optional': False
        }]

    @property
    def output_artifacts(self):
        return [{
            'name':
            'result',
            'from':
            'grid.res',
            'to':
            pathlib.Path(self.execution_folder, '{item_name}.res'.format(
                item_name=self.item['name'])).resolve().as_posix()
        }]
Example No. 13
class PrepareMovielensData(luigi.Task):
    '''
    Splits the data into training, validation and testing.
    Reindex it according to the okapi needs, i.e., the item and user index starts at 0

    The output contains 4 files:
    testing, training, info, and validation. The info file contains info about the training
    data set (#users, #items, etc.). It is used for testing the model.
    '''
    
    fraction = luigi.Parameter(description="The fraction of data we want to use", default=1.0)

    #remaking ids for okapi: starts 1 (items -1), no gaps, no new items in the test/validation set
    training_users = {"original_id" :0}
    training_items = {"original_id": 0}

    def requires(self):
        return DownloadMovielens()

    def output(self):
        return [luigi.hdfs.HdfsTarget('movielens.testing_{}'.format(self.fraction)),
                luigi.hdfs.HdfsTarget('movielens.training_{}'.format(self.fraction)),
                luigi.hdfs.HdfsTarget('movielens.training.info_{}'.format(self.fraction)),
                luigi.hdfs.HdfsTarget('movielens.validation_{}'.format(self.fraction))]

    def local_output(self):
        return [luigi.file.File(tempfile.gettempdir()+'/movielens.testing_{}'.format(self.fraction)),
                luigi.file.File(tempfile.gettempdir()+'/movielens.training_{}'.format(self.fraction)),
                luigi.file.File(tempfile.gettempdir()+'/movielens.training.info_{}'.format(self.fraction)),
                luigi.file.File(tempfile.gettempdir()+'/movielens.validation_{}'.format(self.fraction))]


    def _get_id(self, original_id, dictionary):
        id = dictionary.get(original_id, len(dictionary))
        dictionary[original_id] = id
        return id

    def run(self):
        '''
        1. 70% entries go to training, others go to memory
        2. from memory, items and users that are in training 33% of items go into validation, 66% go to testing
        '''
        frac = float(self.fraction)
        import random
        random.seed(123)  # fixed seed so that everyone gets the same data splits

        f = self.input().open('r') # this will return a file stream that reads from movielens ratings.dat
        training = self.local_output()[1].open('w')

        hdfs_client = luigi.hdfs.HdfsClient()

        #lets first write training set and store in memory user and item indexes
        testing_validation = []
        cnt = 0
        for line in f:
            if random.random() > frac:
                continue

            user,item,rating,time = line.split("::")
            rating = int(float(rating))

            if random.random() < 0.7: #write to training
                userid = self._get_id(user, self.training_users)
                itemid = self._get_id(item, self.training_items)
                training.write("{0} {1} {2}\n".format(userid, itemid, rating))
                cnt += 1
            else:
                testing_validation.append((user, item, rating))
        training.close() # needed because files are atomic


        #now lets write out the testing and validation
        testing = self.local_output()[0].open('w')
        validation = self.local_output()[3].open('w')
        for u,i,rating in testing_validation:
            if u in self.training_users and i in self.training_items:
                if random.random() < 0.33:
                    validation.write('{0} {1} {2}\n'.format(self.training_users[u], self.training_items[i], rating))
                else:
                    testing.write('{0} {1} {2}\n'.format(self.training_users[u], self.training_items[i], rating))
        testing.close()
        validation.close()
        f.close()

        info = self.local_output()[2].open('w')
        info.write('n_users: {0}, n_items: {1}, n_entries: {2}\n'.format(len(self.training_users), len(self.training_items), cnt))
        info.close()

        hdfs_client.put(self.local_output()[0].path, self.output()[0].path)
        hdfs_client.put(self.local_output()[1].path, self.output()[1].path)
        hdfs_client.put(self.local_output()[2].path, self.output()[2].path)
        hdfs_client.put(self.local_output()[3].path, self.output()[3].path)
Example No. 14
class OkapiTrainModelTask(luigi.hadoop_jar.HadoopJarJobTask):
    '''Trains a model'''

    fraction = luigi.Parameter(description="The fraction of data we want to use", default=1.0)
    model_name = luigi.Parameter(description="The model: {"+" | ".join(methods)+"}")
    out_hdfs = luigi.Parameter(description="Output dir for the task")

    def requires(self):
        #we need to delete a special zookeeper dir because of some strange behaviour
        return PrepareMovielensData(self.fraction)

    def output(self):
        return luigi.hdfs.HdfsTarget(self.out_hdfs)

    def _get_conf(self, section, name):
        return luigi.configuration.get_config().get(section, name)

    def get_computation_class(self):
        if self.model_name in methods:
            return methods[self.model_name]
        else:
            raise ValueError("Not implemented method. Please choose from {" + " | ".join(methods.keys()) + "}")

    def get_input_format(self):
        return 'ml.grafos.okapi.cf.CfLongIdFloatTextInputFormat'

    def get_output_format(self):
        return 'org.apache.giraph.io.formats.IdWithValueTextOutputFormat'

    def get_input(self):
        training = self.input()[1].path
        return training

    def get_output(self):
        return self.out_hdfs

    def run(self):
        self.set_hadoop_classpath()
        DeleteDir(self._get_conf("hadoop", "zookeeper-dir")).run()
        super(OkapiTrainModelTask, self).run()

    def get_libjars(self):
        return [self.giraph_jar(), self.okapi_jar()]

    def set_hadoop_classpath(self):
        '''we need to put our jars into the classpath of the hadoop'''
        hadoop_cp = ':'.join(filter(None, self.get_libjars()))
        if os.environ.get('HADOOP_CLASSPATH', None):
            if not hadoop_cp in os.environ['HADOOP_CLASSPATH']:
                os.environ['HADOOP_CLASSPATH'] = os.environ['HADOOP_CLASSPATH']+":"+hadoop_cp
        else:
            os.environ['HADOOP_CLASSPATH'] = hadoop_cp
        logger.debug("HADOOP_CLASSPATH={0}".format(os.environ['HADOOP_CLASSPATH']))

    def jar(self):
        return self.giraph_jar()

    def main(self):
        return 'org.apache.giraph.GiraphRunner'

    def get_jar(self, group, jarname):
        config = luigi.configuration.get_config()
        jar = config.get(group, jarname)
        if not jar:
            logger.error("You must specify {0} in client.cfg".format(jarname))
            raise RuntimeError("You must specify {0} in client.cfg".format(jarname))
        if not os.path.exists(jar):
            logger.error("Can't find {0} jar: {1}".format(jarname, jar))
            raise RuntimeError("Can't find {0} jar: {1}".format(jarname, jar))
        return jar

    def okapi_jar(self):
        return self.get_jar("okapi", "okapi-jar")

    def giraph_jar(self):
        return self.get_jar("okapi", "giraph-jar")

    def get_custom_arguments(self, info_filename):
        #we check how many items there are in the training set
        f = info_filename.open()
        line = f.readlines()
        f.close()
        maxItems = int(line[0].split(",")[1].split(':')[1])
        return ['-ca', 'minItemId=1',
                '-ca', 'maxItemId='+str(maxItems-1)]

    def get_custom_method_params(self, model_name):
        if model_name == "SGD":
            return ['-mc', 'ml.grafos.okapi.cf.sgd.Sgd$MasterCompute',
                    '-ca', 'iterations=20',
                    '-ca', 'gamma=0.005',
                    '-ca', 'lambda=0.01',
                    '-ca', 'dim=20',
                    '-ca', 'debug=true']
        elif model_name == "ALS":
            return ['-mc', 'ml.grafos.okapi.cf.als.Als$MasterCompute',
                    '-ca', 'iterations=20',
                    '-ca', 'lambda=0.01',
                    '-ca', 'dim=20',
                    '-ca', 'debug=true']
        elif model_name == "SVD":
            return ['-mc', 'ml.grafos.okapi.cf.svd.Svdpp$MasterCompute',
                    '-ca', 'iterations=20',
                    '-ca', 'dim=20',
                    '-ca', 'debug=true']
        else:
            return []

    def args(self):
        return [
            "-libjars", ",".join(self.get_libjars()),
            "-Dmapred.child.java.opts="+self._get_conf('hadoop', 'hadoop-mem'),
            "-Dgiraph.zkManagerDirectory="+self._get_conf('hadoop', 'zookeeper-dir'),
            "-Dgiraph.useSuperstepCounters=false",
            self.get_computation_class(),
            '-eif', self.get_input_format(),
            '-eip', self.get_input(),
            '-vof', self.get_output_format(),
            '-op', self.get_output(),
            '-w', self._get_conf("okapi", "workers"),
            '-ca', "giraph.numComputeThreads="+self._get_conf('okapi', 'threads')] \
            + self.get_custom_arguments(self.input()[2]) \
            + self.get_custom_method_params(self.model_name)
Example No. 15
def testWithDefaultAndMissing(self):
    p = luigi.Parameter(config_path=dict(section="foo", name="bar"), default='blah')
    self.assertEqual('blah', p.value)
Example No. 16
class A(luigi.Task):
    p = luigi.Parameter(config_path=dict(section="foo", name="bar"))
Example No. 17
def testGlobalAndMissing(self):
    p = luigi.Parameter(config_path=dict(section="foo", name="bar"), is_global=True, default='blah')
    self.assertEqual('blah', p.value)
    p.set_global('meh')
    self.assertEqual('meh', p.value)
Example No. 18
class WithDefault(luigi.Task):
    x = luigi.Parameter(default='xyz')
Example No. 19
class HasGlobalParamDep(luigi.Task):
    x = luigi.Parameter()

    def requires(self):
        return HasGlobalParam(self.x)
Example No. 20
class Foo(luigi.Task):
    bar = luigi.Parameter()
    p2 = luigi.IntParameter()
    multi = luigi.Parameter(is_list=True)
    not_a_param = "lol"
Example No. 21
class ExtractHkma(ExtractHttp):
    domain = luigi.Parameter(
        default=
        "http://www.hkma.gov.hk/media/eng/doc/market-data-and-statistics/monthly-statistical-bulletin"
    )
Example No. 22
def testHasDefaultNoValue(self):
    self.assertFalse(luigi.Parameter(config_path=dict(section="foo", name="bar")).has_value)
Example No. 23
class QueryDb(_utils.DataPreparationTask):
    """Make an SQL query and store the results into an output file."""

    query = luigi.Parameter(
        description="The SQL query to perform on the DB"
    )

    args = _utils.ObjectParameter(
        default=(),
        description="The SQL query's positional arguments"
    )

    kwargs = _utils.ObjectParameter(
        default={},
        description="The SQL query's named arguments"
    )

    limit = luigi.parameter.IntParameter(
        default=-1,
        description="The maximum number of rows to fetch. Optional. If -1, "
                    "all rows will be fetched.")

    shuffle = luigi.BoolParameter(
        default=False,
        description="If True, all rows will be shuffled. For debugging and "
                    "exploration purposes. Might impact performance.")

    def output(self):

        return luigi.LocalTarget(
            f'{self.output_dir}/{self.task_id}.csv',
            format=UTF8
        )

    def run(self):

        query = self.build_query()
        rows, columns = self.db_connector.query_with_header(
            query, *self.args, **self.kwargs)
        df = pd.DataFrame(rows, columns=columns)

        df = self.transform(df)

        self.write_output(df)

    def build_query(self):

        query = self.query
        if self.shuffle:
            query += ' ORDER BY RANDOM()'
        if self.minimal_mode and self.limit == -1:
            self.limit = 50
        if self.limit and self.limit != -1:
            query += f' LIMIT {self.limit}'
        return query

    def transform(self, df):
        """Provide a hook for subclasses."""
        return df

    def write_output(self, df):

        with self.output().open('w') as output_stream:
            df.to_csv(output_stream, index=False, header=True)
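Because transform() is an explicit hook, a QueryDb subclass can fix its query and post-process the DataFrame before it is written; a small sketch with a hypothetical table and column:

class QueryRecentOrders(QueryDb):
    """Hedged sketch; the orders table and amount column are hypothetical."""

    query = luigi.Parameter(default='SELECT id, amount, created_at FROM orders')

    def transform(self, df):
        # Drop non-positive amounts before the frame is written to CSV.
        return df[df['amount'] > 0]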
Example No. 24
def testHasDefaultWithBoth(self):
    self.assertTrue(luigi.Parameter(config_path=dict(section="foo", name="bar")).has_value)
Example No. 25
class MortarProjectTask(MortarTask):
    """
    Luigi Task to run a job on the Mortar platform. 
    If the job fails, the task will exit with an error.

    To use this class, define the following section in your Luigi 
    client configuration file:

    ::

        [mortar]
        email: ${MORTAR_EMAIL}
        api_key: ${MORTAR_API_KEY}
        host: api.mortardata.com
        project_name: ${MORTAR_PROJECT_NAME}

    see also:: https://help.mortardata.com/technologies/luigi/mortar_tasks
    """

    # A cluster size of 2 or greater will use a Hadoop cluster.  If there
    # is an idle cluster of cluster_size or greater that cluster will be used.
    # Otherwise a new cluster will be started.
    # A cluster size of 0 will run the Mortar job directly on the Mortar Pig
    # server in local mode (no cluster).
    # All other cluster_size values are invalid.
    cluster_size = luigi.IntParameter(default=2)

    # A single use cluster will be terminated immediately after this
    # Mortar job completes.  Otherwise it will be terminated automatically
    # after being idle for one hour.
    # This option does not apply when running the Mortar job in local mode
    # (cluster_size = 0).
    run_on_single_use_cluster = luigi.BooleanParameter(False)

    # If False, this task will only run on an idle cluster or will
    # start up a new cluster if no idle clusters are found.  If True,
    # this task may run on a cluster that has other jobs already running on it.
    # If run_on_single_use_cluster is True, this parameter will be ignored.
    share_running_cluster = luigi.BooleanParameter(False)

    # Whether a launched Hadoop cluster will take advantage of AWS
    # Spot Pricing (https://help.mortardata.com/technologies/hadoop/spot_instance_clusters)
    # This option does not apply when running in local mode (cluster_size = 0).
    use_spot_instances = luigi.BooleanParameter(True)

    # The Git reference (commit hash or branch name) to use when running
    # this Mortar job.  The default value NO_GIT_REF_FLAG is a flag value
    # that indicates no value was entered as a parameter.  If no value
    # is passed as a parameter the environment value "MORTAR_LUIGI_GIT_REF"
    # is used.  If that is not set the "master" is used.
    git_ref = luigi.Parameter(default=NO_GIT_REF_FLAG)

    # Set to true to receive an email upon completion
    # of this Mortar job.
    notify_on_job_finish = luigi.BooleanParameter(default=False)

    # Interval (in seconds) to poll for job status.
    job_polling_interval = luigi.IntParameter(default=5)

    # Number of retries before giving up on polling.
    num_polling_retries = luigi.IntParameter(default=3)

    # Version of Pig to use.
    pig_version = luigi.Parameter(default='0.12')

    def project(self):
        """
        Override this method to provide the name of 
        the Mortar Project.

        :rtype: str:
        :returns: Your project name, e.g. my-mortar-recsys
        """
        if luigi.configuration.get_config().has_option('mortar',
                                                       'project_name'):
            project_name = luigi.configuration.get_config().get(
                'mortar', 'project_name')
            return project_name
        raise RuntimeError(
            "Please implement the project method or provide a project_name configuration item to return your project name"
        )

    @abc.abstractmethod
    def script(self):
        """
        Override this method to provide the name of 
        the script to run.

        :rtype: str:
        :returns: Script name, e.g. my_pig_script
        """
        raise RuntimeError(
            "Please implement the script method to return your script name")

    @abc.abstractmethod
    def is_control_script(self):
        """
        [DEPRECATED] Whether this job should run a control script.

        :rtype: bool:
        :returns: [DEPRECATED] whether this job should run a control script
        """
        raise RuntimeError("Please implement the is_control_script method")

    def parameters(self):
        """
        This method defines the parameters that Mortar will pass to
        your script when it runs.

        :rtype: dict:
        :returns: dict of parameters to pass, e.g. {'my-param': 'my-value'}. Default: {}
        """
        return {}

    def output(self):
        """
        The output for this Task. Returns the `success_token`
        by default, so the Task only runs if a token indicating success
        has not been previously written.

        :rtype: list of Target:
        :returns: list containing one output, the `success_token`
        """
        return [self.success_token()]

    def token_path(self):
        """
        The MortarProjectTask writes out several "tokens" as it executes, indicating
        whether it is Running and then when it is Complete. These tokens are
        used to ensure that the task is not rerun once it has already completed.

        This method provides the base path where those tokens are written. By default,
        tokens are written to a temporary directory on the file system.

        However, for running in a cluster setting, you should override this method
        to use an S3 path (e.g. s3://my-bucket/my-token-path), 
        ensuring that tokens will be available from any machine.

        :rtype: str:
        :returns: default token path on file system - file://tempdirectory
        """
        # override with S3 path for usage across machines or on clusters
        return 'file://%s' % tempfile.gettempdir()

    @abc.abstractmethod
    def script_output(self):
        """
        List of locations where your script writes output. If your script fails, Luigi
        will clear any output from these locations to ensure that the next run of your
        Task is idempotent.

        :rtype: list of Target:
        :returns: list of Target to clear in case of Task failure
        """
        raise RuntimeError("Please implement the script_output method")

    def running_token(self):
        """
        The MortarProjectTask writes out several "tokens" as it executes to ensure
        idempotence. This method provides the token file that indicates that the job 
        is running.

        By default, it is stored underneath the path provided by the `token_path` method,
        and is named after your class name. So, if your `token_path` is set to 
        `s3://my-bucket/my-folder` and your Task is named FooTask, the token will be:

        `s3://my-bucket/my-folder/FooTask-Running`

        This token will contain the Mortar job_id of the job that is running.

        :rtype: Target:
        :returns: Target for the token that indicates job is running.
        """
        return target_factory.get_target(
            '%s/%s-%s' %
            (self.token_path(), self.__class__.__name__, 'Running'))

    def success_token(self):
        """
        The MortarProjectTask writes out several "tokens" as it executes to ensure
        idempotence. This method provides the token file that indicates that the job 
        has finished successfully. If this token exists, the Task will not be rerun.

        By default, it is stored underneath the path provided by the `token_path` method,
        and is named after your class name. So, if your `token_path` is set to 
        `s3://my-bucket/my-folder` and your Task is named FooTask, the token will be:

        `s3://my-bucket/my-folder/FooTask`

        If you want this Task to be rerun, you should delete that token.

        :rtype: Target:
        :returns: Target for the token that indicates that this Task has succeeded.
        """
        return target_factory.get_target(
            '%s/%s' % (self.token_path(), self.__class__.__name__))

    def run(self):
        """
        Run a Mortar job using the Mortar API.

        This method writes out several "tokens" as it executes to ensure
        idempotence:

        * `running_token`: This token indicates that the job is currently running. If a token
          exists at this path, Luigi will poll the currently running job instead of starting a 
          new one.
        * `success_token`: This token indicates that the job has already completed successfully.
          If this token exists, Luigi will not rerun the task.
        """
        api = self._get_api()
        if self.running_token().exists():
            job_id = self.running_token().open().read().strip()
        else:
            job_id = self._run_job(api)
            # to guarantee idempotence, record that the job is running
            target_factory.write_file(self.running_token(), text=job_id)
        job = self._poll_job_completion(api, job_id)
        final_job_status_code = job.get('status_code')
        # record that the job has finished
        self.running_token().remove()
        if final_job_status_code != jobs.STATUS_SUCCESS:
            for out in self.script_output():
                logger.info(
                    'Mortar script failed: removing incomplete data in %s' %
                    out)
                out.remove()
            raise Exception(
                'Mortar job_id [%s] failed with status_code: [%s], error details: %s'
                % (job_id, final_job_status_code, job.get('error')))
        else:
            target_factory.write_file(self.success_token())
            logger.info('Mortar job_id [%s] completed successfully' % job_id)

    def _git_ref(self):
        """
        Figure out the value to use for the git ref.  Order of precedence is:

        1. git_ref parameter is set.
        2. environment variable MORTAR_LUIGI_GIT_REF is set
        3. master
        """
        if self.git_ref != NO_GIT_REF_FLAG:
            return self.git_ref
        else:
            import os
            env_git_ref = os.environ.get('MORTAR_LUIGI_GIT_REF')
            if env_git_ref:
                return env_git_ref
            else:
                return 'master'

    def _run_job(self, api):
        cluster_type = clusters.CLUSTER_TYPE_SINGLE_JOB if self.run_on_single_use_cluster \
            else clusters.CLUSTER_TYPE_PERSISTENT
        cluster_id = None
        if self.cluster_size == 0:
            # Use local cluster
            cluster_id = clusters.LOCAL_CLUSTER_ID
        elif not self.run_on_single_use_cluster:
            # search for a suitable cluster
            usable_clusters = self._get_usable_clusters(
                api, min_size=self.cluster_size)
            if usable_clusters:
                # grab the largest usable cluster
                largest_cluster = sorted(usable_clusters,
                                         key=lambda c: int(c['size']),
                                         reverse=True)[0]
                logger.info('Using largest running usable cluster with cluster_id [%s], size [%s]' % \
                    (largest_cluster['cluster_id'], largest_cluster['size']))
                cluster_id = largest_cluster['cluster_id']

        if cluster_id:
            job_id = jobs.post_job_existing_cluster(
                api,
                self.project(),
                self.script(),
                cluster_id,
                git_ref=self._git_ref(),
                parameters=self.parameters(),
                notify_on_job_finish=self.notify_on_job_finish,
                is_control_script=self.is_control_script(),
                pig_version=self.pig_version,
                pipeline_job_id=self._get_pipeline_job_id())
        else:
            job_id = jobs.post_job_new_cluster(
                api,
                self.project(),
                self.script(),
                self.cluster_size,
                cluster_type=cluster_type,
                git_ref=self._git_ref(),
                parameters=self.parameters(),
                notify_on_job_finish=self.notify_on_job_finish,
                is_control_script=self.is_control_script(),
                pig_version=self.pig_version,
                use_spot_instances=self.use_spot_instances,
                pipeline_job_id=self._get_pipeline_job_id())
        logger.info('Submitted new job to mortar with job_id [%s]' % job_id)
        return job_id

    def _get_usable_clusters(self, api, min_size=0):
        return [cluster for cluster in clusters.get_clusters(api)['clusters'] \
            if (    (cluster.get('status_code') == clusters.CLUSTER_STATUS_RUNNING)
                and (cluster.get('cluster_type_code') != clusters.CLUSTER_TYPE_SINGLE_JOB)
                and (int(cluster.get('size')) >= min_size)
                and (    len(cluster.get('running_jobs')) == 0
                      or self.share_running_cluster)
               )]

    def _poll_job_completion(self, api, job_id):

        current_job_status = None
        current_progress = None

        exception_count = 0
        while True:
            try:
                # fetch job
                job = jobs.get_job(api, job_id)
                new_job_status = job.get('status_code')

                # check for updated status
                if new_job_status != current_job_status:
                    current_job_status = new_job_status
                    logger.info('Mortar job_id [%s] switched to status_code [%s], description: %s' % \
                        (job_id, new_job_status, self._get_job_status_description(job)))

                # check for updated progress on running job
                if (new_job_status == jobs.STATUS_RUNNING) and (
                        job.get('progress') != current_progress):
                    current_progress = job.get('progress')
                    logger.info('Mortar job_id [%s] progress: [%s%%]' %
                                (job_id, current_progress))

                # final state
                if current_job_status in jobs.COMPLETE_STATUSES:
                    return job
                else:
                    # reset exception count on successful loop
                    exception_count = 0

                    # sleep and continue polling
                    time.sleep(self.job_polling_interval)
            except Exception as e:
                if exception_count < self.num_polling_retries:
                    exception_count += 1
                    logger.info('Failure to get job status for job %s: %s' %
                                (job_id, str(e)))
                    time.sleep(self.job_polling_interval)
                else:
                    raise
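A concrete MortarProjectTask overrides script, is_control_script, script_output, and typically token_path and parameters. A minimal sketch with hypothetical names and S3 locations (S3Target is assumed to be imported from Luigi's S3 contrib module):

class MyRecsysPigTask(MortarProjectTask):
    """Hedged sketch of a concrete Mortar job; script name and paths are hypothetical."""

    def script(self):
        return 'my_pig_script'

    def is_control_script(self):
        return False

    def token_path(self):
        # Shared location so tokens are visible from any machine.
        return 's3://my-bucket/my-token-path'

    def script_output(self):
        # Cleared by run() if the Mortar job fails.
        return [S3Target('s3://my-bucket/output/my_pig_script/')]

    def parameters(self):
        return {'INPUT_PATH': 's3://my-bucket/input/ratings'}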
Example No. 26
def testDefaultList(self):
    p = luigi.Parameter(is_list=True, config_path=dict(section="foo", name="bar"))
    self.assertEqual(('one', 'two', 'three'), p.value)
Example No. 27
class MakeClickTrainData(gokart.TaskOnKart):
    task_namespace = 'redshells.word_item_similarity'
    click_data_task = gokart.TaskInstanceParameter()
    min_user_count = luigi.IntParameter(default=100)  # type: int
    min_item_count = luigi.IntParameter(default=100)  # type: int
    max_item_frequency = luigi.FloatParameter(default=0.05)  # type: float
    user_column_name = luigi.Parameter()  # type: str
    item_column_name = luigi.Parameter()  # type: str
    service_column_name = luigi.Parameter()  # type: str
    output_file_path = luigi.Parameter(default='app/word_item_similarity/clicks_train_data.pkl')  # type: str

    def requires(self):
        return self.click_data_task

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        data = self.load_data_frame(required_columns={self.user_column_name, self.item_column_name, self.service_column_name})
        data = pd.concat([self._make_click_data(grouped) for name, grouped in data.groupby(self.service_column_name)])
        logger.info('dumping...')
        self.dump(data)

    def _make_click_data(self, data: pd.DataFrame):
        logger.info(f'filtering... size={data.shape}')
        data = self._filter_data(data)
        logger.info(f'size={data.shape}')

        data['click'] = 1
        logger.info(f'data size is {data.shape}.')
        logger.info('sampling...')
        negative = self._sample_negative_examples(data)
        logger.info(f'negative samples size is {negative.shape}.')
        logger.info('concatenating...')
        data = pd.concat([data, negative], sort=False)
        return data

    def _sample_negative_examples(self, df: pd.DataFrame) -> pd.DataFrame:
        logger.info('preprocessing...')
        user_ids = df[self.user_column_name].unique()
        item_ids = df[self.item_column_name].unique()
        item2service = dict(zip(df[self.item_column_name].tolist(), df[self.service_column_name].tolist()))
        user2index = dict(zip(user_ids, list(range(len(user_ids)))))
        item2index = dict(zip(item_ids, list(range(len(item_ids)))))
        n_users = len(user_ids)
        n_items = len(item_ids)
        positive_examples = set(list(df[self.user_column_name].apply(user2index.get).values + df[self.item_column_name].apply(item2index.get).values * n_users))
        n_positive_examples = len(positive_examples)
        logger.info('negative sampling...')
        negative_examples = set(np.random.randint(low=0, high=n_users * n_items, size=n_positive_examples * 2))
        logger.info('making unique list...')
        negative_examples = np.array(list(negative_examples - positive_examples))
        logger.info('shuffling...')
        negative_examples = sklearn.utils.shuffle(negative_examples)
        negative_examples = negative_examples[:n_positive_examples]

        logger.info('making data frame...')
        examples = pd.DataFrame(dict(user_id=negative_examples % n_users, item_id=negative_examples // n_users, click=0))
        examples[self.user_column_name] = user_ids[examples[self.user_column_name].values]
        examples[self.item_column_name] = item_ids[examples[self.item_column_name].values]
        examples[self.service_column_name] = examples[self.item_column_name].apply(item2service.get)
        examples.drop_duplicates(inplace=True)
        return examples

    def _filter_data(self, df: pd.DataFrame) -> pd.DataFrame:
        df.drop_duplicates(inplace=True)
        n_users = len(set(df[self.user_column_name]))
        max_item_count = n_users * self.max_item_frequency
        logger.info(f'max_item_count={max_item_count}')
        logger.info(f'min_item_count={self.min_item_count}')
        logger.info(f'min_user_count={self.min_user_count}')
        df = df.groupby(self.item_column_name).filter(lambda xs: self.min_item_count <= len(xs) <= max_item_count)
        df = df.groupby(self.user_column_name).filter(lambda xs: self.min_user_count <= len(xs))
        return df
Example No. 28
def testWithDefault(self):
    p = luigi.Parameter(config_path=dict(section="foo", name="bar"), default='blah')
    self.assertEqual('baz', p.value)  # config overrides default
Example No. 29
class DictFile(luigi.ExternalTask):
    hdfs_path = luigi.Parameter()

    def output(self):
        return luigi.contrib.hdfs.HdfsTarget(self.hdfs_path)
Example No. 30
class StateFiles(luigi.WrapperTask):
    year = luigi.IntParameter()
    state = luigi.Parameter()

    def requires(self):
        return CodeTables(), DataTables(year=self.year, state=self.state)