Example No. 1
class RunTasks(luigi.Task):
    """
    Main entrypoint for this module.
    """
    start_date = luigi.DateSecondParameter(default=(dt.datetime.today() -
                                                    dt.timedelta(days=1)))
    load_date = luigi.DateSecondParameter(default=dt.datetime.today())

    pl = Pipeline("Load2Sites", "RunTasks")

    def output(self):
        return PipelineTasks(self.pl.etl_name, self.pl.task_name,
                             self.load_date.date(), self.start_date.date())

    def run(self):
        try:
            yield [
                TransformLoadSites(start_date=self.start_date,
                                   load_date=self.load_date)
            ]

        except Exception as e:
            sys.exit(e)

        self.pl.log_insert(self.load_date, self.start_date)
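
For context, a minimal driver sketch (the surrounding package layout is not shown in the listing, so this assumes RunTasks is importable from the same module): because start_date and load_date are DateSecondParameters, explicit datetime values can be passed when scheduling programmatically.

import datetime as dt

import luigi

if __name__ == "__main__":
    # Illustrative dates only; by default the task uses yesterday/today.
    luigi.build(
        [RunTasks(start_date=dt.datetime(2021, 1, 1, 0, 0, 0),
                  load_date=dt.datetime(2021, 1, 2, 0, 0, 0))],
        local_scheduler=True,
    )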
Example No. 2
class EvaluateLogisticRegression(luigi.Task):
    date = luigi.DateSecondParameter(default=datetime.now())
    task_complete = False

    def requires(self):
        return [SplitTrainTest(), TrainLogisticRegression(self.date)]

    def run(self):
        df_test = s3.read_parquet(self.input()[0][1].path)
        y_test = df_test.loc[:, "ttj_sub_12"]
        X_test = df_test.drop(["ttj", "ttj_sub_12"], axis="columns")

        lg = s3.read_pickle(self.input()[1].path)
        metrics = evaluate(lg, X_test, y_test)

        model_info_to_db(
            engine=get_db_engine(),
            model=lg,
            metrics=metrics,
            features=X_test.columns.tolist(),
            date=self.date,
            model_path=self.input()[1].path,
            train_data_path=self.input()[0][2].path,
            test_data_path=self.input()[0][3].path,
        )
        # NOTE: Set the task as complete manually. Using the built-in
        # luigi.contrib.postgres.CopyToTable task would be the right approach.
        self.task_complete = True

    def complete(self):
        return self.task_complete
Example No. 3
class GetUserFromOmie(luigi.Task):
    date = luigi.DateSecondParameter()

    def run(self):
        key = OmieAPI().key
        secret = OmieAPI().secret

        headers = {"Content-type": "application/json"}

        data = {
            "call": "ListarUsuarios",
            "app_key": key,
            "app_secret": secret,
            "param": [{
                "pagina": 1,
                "registros_por_pagina": 100
            }]
        }

        response = requests.post(
            'https://app.omie.com.br/api/v1/crm/usuarios/',
            headers=headers,
            json=data)

        with self.output().open('w') as outfile:
            json.dump(response.json(), outfile, indent=4, ensure_ascii=False)

    def output(self):
        path = f"data/users_{str(self.date)}.json"
        return luigi.LocalTarget(path)
Example No. 4
class EvaluateRecommendationsRF(luigi.Task):
    date = luigi.DateSecondParameter(default=datetime.now())
    task_complete = False

    def requires(self):
        return [TrainRandomForest(self.date), EvaluateRandomForest(self.date)]

    def run(self):
        params = yaml.load(open("./conf/base/parameters.yml"),
                           Loader=yaml.FullLoader)["evaluation_params"]

        model = s3.read_pickle(self.input()[0].path)
        model_id, test_path, train_path = get_model_info_by_path(
            self.input()[0].path)

        df_train = s3.read_parquet(train_path)
        df_test = s3.read_parquet(test_path)

        rec_error = get_aggregate_recommendation_error(
            df_train,
            df_test,
            model,
            params["set_size"],
            params["num_recs"],
            params["percent_sample"],
        )

        write_recommendation_eval(get_db_engine(), rec_error, model_id, params)
        self.task_complete = True

    def complete(self):
        return self.task_complete
Example No. 5
class TimestampPartitionMixin(object):
    """
    This mixin provides its task with a formatted date partition value property.

    The partition value is the `date` parameter, formatted by the `partition_date` parameter.

    It can be used by HivePartitionTasks and tasks which invoke downstream HivePartitionTasks.
    """
    date = luigi.DateSecondParameter(
        default=datetime.datetime.utcnow(),
        description='Date/time for the data partition.  Default is UTC now. '
        'Note that though this is a DateParameter, it also supports datetime objects, and so can '
        'be used to create time-based data partitions.',
    )
    partition_format = luigi.Parameter(
        config_path={
            'section': 'course-list',
            'name': 'partition_format'
        },
        default='%Y-%m-%d',
        description=
        'Format string for the course list table partition\'s `date` parameter. '
        'Must result in a filename-safe string, or your partitions will fail to be created.\n'
        'The default value of "%Y-%m-%d" changes daily, and so causes a new course partition to be '
        'created once a day.  For example, use "%Y-%m-%dT%H" to update hourly, though beware of load on '
        'the edX REST API.  See strftime for options.',
    )

    @property
    def partition_value(self):
        """Partition based on the task's date and partition format strings."""
        return unicode(self.date.strftime(self.partition_format))
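
A minimal sketch of how a task might consume the mixin (the class below is hypothetical; the real consumers live in the edX analytics pipeline): partition_value simply applies partition_format to date, yielding a filename-safe partition string.

import luigi


class CourseListPartitionTask(TimestampPartitionMixin, luigi.Task):
    """Hypothetical task that writes into a date-based partition directory."""

    def output(self):
        # With the default partition_format of '%Y-%m-%d', partition_value
        # changes once per day, so one partition is produced per day.
        return luigi.LocalTarget(
            'output/course_list/dt={}/course_list.tsv'.format(self.partition_value))

    def run(self):
        with self.output().open('w') as out:
            out.write('course_id\tcourse_name\n')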
Example No. 6
class Globals(luigi.Config):
    '''
    Global variables. Set using luigi configuration file.
    '''
    # Path to installation of exempliphi
    PIPELINE_ROOT = luigi.Parameter(
        default=os.path.split(os.path.dirname(os.path.realpath(__file__)))[0])
    # Directory to write output to
    OUTPUT_DIR = luigi.Parameter(default=os.path.join(
        os.path.split(os.path.dirname(os.path.realpath(__file__)))[0],
        'output'))
    # Number of cores available in computing environment
    NUM_THREADS = luigi.IntParameter(default=1)
    # Conda environment holding dependencies
    PRIMARY_CONDA_ENV = luigi.Parameter(default='exempliphi')
    # Path to conda executable. Default expects it to be in PATH.
    CONDA_EXE = luigi.Parameter(default='conda')
    # Lower boundary of insert size
    INSERT_SIZE_LB = luigi.IntParameter(
        default=250)  # Defaults were recommendations from Dr. Ken Frey
    # Upper boundary of insert size
    INSERT_SIZE_UB = luigi.IntParameter(default=500)
    # NCBI nt blast database
    NT = luigi.Parameter()
    # Number of workers to use when running pipeline
    NUM_WORKERS = luigi.IntParameter(default=10)
    # What cluster scheduling software do you use? SGE, SLURM, or False for single node
    CLUSTER = luigi.Parameter(default=False)
    # Date of run
    RUN_DATE = luigi.DateSecondParameter(default=datetime.now())
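
As a hedged aside on how these values are typically supplied: luigi.Config subclasses resolve their parameters from the configuration section named after the class, so a [Globals] section in luigi.cfg overrides the defaults above (and must supply NT, which has no default). A small illustrative task that echoes the resolved values:

import luigi


class ShowGlobals(luigi.Task):
    """Hypothetical smoke-test task; prints the configuration actually in effect."""

    def complete(self):
        return False  # always eligible to run

    def run(self):
        g = Globals()  # reads [Globals] from luigi.cfg, falling back to the defaults above
        print(g.PIPELINE_ROOT, g.OUTPUT_DIR, g.NUM_THREADS,
              g.PRIMARY_CONDA_ENV, g.NT, g.RUN_DATE)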
Example No. 7
class All(luigi.WrapperTask):
    timestamp = luigi.DateSecondParameter(default=datetime.datetime.now())

    def run(self):
        print("Running All")

    def requires(self):
        return TransformedDataset()
Example No. 8
class Send_Results_To_MAS(PipelineTask):
    annotation_accession = luigi.Parameter(default='')
    database = luigi.Parameter()
    tool = luigi.Parameter()
    run_time = luigi.DateSecondParameter(default=datetime.now())
    run_locally = True

    def requires(self):
        if self.tool == 'blastp':
            return Blastp(
                annotation_accession=self.annotation_accession,
                database=self.database,
                run_time=self.run_time,
                mas_server=self.mas_server
            )

        elif self.tool == 'hhsearch':
            return HHsearch(
                annotation_accession=self.annotation_accession,
                database=self.database,
                run_time=self.run_time,
                mas_server=self.mas_server
            )

        elif self.tool == 'rpsblast':
            return RPSBlast(
                annotation_accession=self.annotation_accession,
                database=self.database,
                run_time=self.run_time,
                mas_server=self.mas_server
            )

    def out_file_path(self, temp=False):
        return {}

    def out_dir(self, temp=False):
        folder = '{}-temp'.format(self.database) if temp else self.database
        return os.path.join(self.pipeline_out_dir(), self.task_family, self.tool, folder)

    def do_task(self):
        r = requests.post(
            self.mas_server + reverse('upload_results'),
            auth=(self.g.MAS_USERNAME, self.g.MAS_PASSWORD),
            files=[('result', open(self.input()['results'].path))],
            data={
                'tool': self.tool,
                'accession': self.annotation_accession,
                'database': self.database,
                'status': 0
            },
            verify=self.g.MAS_CRT
        )

        if r.status_code != 200:
            raise requests.ConnectionError(
                'Request to post results to MAS server failed (status code = %i):\n%s' % (r.status_code, r.text)
            )
Example No. 9
class HHblits(PipelineTask):
    '''
    Run HHblits on each protein
    '''
    annotation_accession = luigi.Parameter(default='')
    protein_id = luigi.Parameter(default='')
    iterations = luigi.IntParameter(default=3)
    database = luigi.Parameter()
    run_time = luigi.DateSecondParameter()
    tool = 'hhsearch'

    # At this point this is the only database option. The database parameter is for HHSearch!
    uniclust = luigi.Parameter()
    uniclust_cpu = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(HHblits, self).__init__(*args, **kwargs)

        if self.uniclust_cpu:
            self.n_cpu = self.uniclust_cpu

    def out_dir(self, temp=False):
        '''
        Returns the directory which will hold the files output by this task.
        '''
        base_name = '{}_{}'.format(self.task_family, self.annotation_accession)

        folder = '{}-temp'.format(base_name) if temp else base_name
        return os.path.join(self.pipeline_out_dir(), self.task_family, folder)

    def requires(self):
        return Pull_Protein(
            annotation_accession=self.annotation_accession,
            run_time=self.run_time,
            tool=self.tool,
            database=self.database,
            mas_server=self.mas_server
        )

    def out_file_path(self, temp=False):
        name = self.annotation_accession
        return {
            'alignment': os.path.join(self.out_dir(temp), '{}.a3m'.format(name))
        }

    def do_task(self):
        db_path = self.uniclust
        in_file = self.input()['fasta'].path

        self._run_command([
            'hhblits',
            '-i', in_file,
            '-oa3m', self.out_file_path(True)['alignment'],
            '-n', str(self.iterations),
            '-cpu', str(self.n_cpu),
            '-d', db_path
        ])
Example No. 10
class RPSBlast(PipelineTask):
    '''
    Generate rpsblast (CD-Search) results for a given proteome
    '''
    annotation_accession = luigi.Parameter(default='')
    e_value = luigi.FloatParameter(default=0.0001)
    database = luigi.Parameter()
    run_time = luigi.DateSecondParameter()
    tool = 'rpsblast'

    # Available Database Choices
    cdd = luigi.Parameter()
    cdd_cpu = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(RPSBlast, self).__init__(*args, **kwargs)

        if self.database == 'cdd':
            self.n_cpu = self.cdd_cpu

    def requires(self):
        return Pull_Protein(
            annotation_accession=self.annotation_accession,
            run_time=self.run_time,
            tool=self.tool,
            database=self.database,
            mas_server=self.mas_server
        )

    def out_file_path(self, temp=False):
        name = self.annotation_accession
        return {
            'results': os.path.join(self.out_dir(temp), '{}_{}_rpsblast_results.xml'.format(name, self.database))
        }

    def out_dir(self, temp=False):
        folder = '{}-temp'.format(self.database) if temp else self.database
        return os.path.join(self.pipeline_out_dir(), self.task_family, folder)

    def do_task(self):
        if self.database == 'cdd':
            db_path = self.cdd

        else:
            raise ValueError('Invalid database ' + self.database)

        self._run_command([
            'rpsblast',
            '-query', self.input()['fasta'].path,
            '-db', db_path,
            '-evalue', str(self.e_value),
            '-out', self.out_file_path(True)['results'],
            '-outfmt', '5',
            '-num_threads', str(self.n_cpu)
        ])
Example No. 11
class HHsearch(PipelineTask):
    protein_id = luigi.Parameter(default='')
    annotation_accession = luigi.Parameter(default='')
    run_time = luigi.DateSecondParameter()
    tool = 'hhsearch'
    database = luigi.Parameter()

    # Available Database Choices
    pdb = luigi.Parameter()

    pdb_cpu = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(HHsearch, self).__init__(*args, **kwargs)

        if self.pdb_cpu:
            self.n_cpu = self.pdb_cpu

    def out_dir(self, temp=False):
        '''
        Returns the directory which will hold the files output by this task.
        '''
        base_name = '{}_{}'.format(self.task_family, self.annotation_accession)

        folder = '{}-temp'.format(base_name) if temp else base_name
        return os.path.join(self.pipeline_out_dir(), self.task_family, folder)

    def requires(self):
        return HHblits(
            annotation_accession=self.annotation_accession,
            run_time=self.run_time,
            database=self.database,
            mas_server=self.mas_server
        )

    def out_file_path(self, temp=False):
        name = self.annotation_accession

        return {
            'results': os.path.join(self.out_dir(temp), '{}.hhr'.format(name))
        }

    def do_task(self):
        if self.database == 'pdb':
            db_path = self.pdb
        else:
            raise ValueError('Invalid database ' + self.database)

        self._run_command([
            'hhsearch',
            '-i', self.input()['alignment'].path,
            '-d', db_path,
            '-o', self.out_file_path(True)['results'],
            '-cpu', str(self.n_cpu)
        ])
Example No. 12
class CdxIndexer(luigi.contrib.hadoop_jar.HadoopJarJobTask):
    input_file = luigi.Parameter()
    cdx_service = luigi.Parameter()
    # This is used to add a timestamp to the output file, so this task can always be re-run:
    timestamp = luigi.DateSecondParameter(default=datetime.datetime.now())
    meta_flag = ''

    task_namespace = "access.index"

    num_reducers = 5

    def requires(self):
        return CopyToHDFS(input_file=self.input_file,
                          prefix="/9_processing/warcs2cdx/")

    def ssh(self):
        return {
            'host': 'mapred',
            'key_file': '~/.ssh/id_rsa',
            'username': '******'
        }

    def jar(self):
        #        dir_path = os.path.dirname(os.path.realpath(__file__))
        #        return os.path.join(dir_path, "../jars/warc-hadoop-recordreaders-3.0.0-SNAPSHOT-job.jar")
        # Note that when using ssh to submit jobs, this needs to be a JAR on the remote server:
        return "/home/access/github/ukwa-manage/tasks/jars/warc-hadoop-recordreaders-3.0.0-SNAPSHOT-job.jar"

    def main(self):
        return "uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator"

    def args(self):
        return [
            "-Dmapred.compress.map.output=true",
            "-Dmapred.output.compress=true",
            "-Dmapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec",
            "-i",
            self.input(), "-o",
            self.output(), "-r", self.num_reducers, "-w", "-h", "-m",
            self.meta_flag, "-t", self.cdx_service, "-c",
            "CDX N b a m s k r M S V g"
        ]

    def output(self):
        timestamp = self.timestamp.isoformat()
        timestamp = timestamp.replace(':', '-')
        file_prefix = os.path.splitext(os.path.basename(self.input_file))[0]
        return state_file(self.timestamp,
                          'warcs2cdx',
                          '%s-submitted-%s.txt' % (file_prefix, timestamp),
                          on_hdfs=True)
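
A small self-contained illustration of the timestamp handling in output() above: isoformat() emits colons, which are replaced with dashes so the generated state filename stays safe on HDFS.

import datetime

ts = datetime.datetime(2020, 5, 17, 13, 45, 30)
print(ts.isoformat())                    # 2020-05-17T13:45:30
print(ts.isoformat().replace(':', '-'))  # 2020-05-17T13-45-30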
Example No. 13
class snowplow_enriched_upload_data(luigi.Task):
	dataset_date = luigi.DateParameter(default=date.today() - timedelta(days=1))
	# force_run = luigi.BoolParameter()
	_start = luigi.DateSecondParameter(default=datetime.utcnow())
	file_root = luigi.Parameter()

	credentials = GoogleCredentials.get_application_default()

	def run(self):
		client = gcs.GCSClient(oauth_credentials=self.credentials)
		client.put(self.input().path, self.output().path)

	def output(self):
		return gcs.GCSTarget("gs://snowplow_tracker/%s_%s.json.gz" % (self.file_root, self.dataset_date.strftime("%Y%m%d")))
Example No. 14
class TrainRandomForest(luigi.Task):
    date = luigi.DateSecondParameter(default=datetime.now())

    def requires(self):
        return SplitTrainTest(self.date)

    def output(self):
        return S3Target(
            s3.path(S3.MODELS +
                    "{date:%Y/%m/%d/random_forest_T%H%M%S.pkl}".format(
                        date=self.date)),
            client=s3.create_client(),
        )

    def run(self):
        df_train = s3.read_parquet(self.input()[0].path)
        y_train = df_train.loc[:, "ttj_sub_12"]
        X_train = df_train.drop(["ttj", "ttj_sub_12"], axis="columns")

        grid = yaml.load(open("./conf/base/parameters.yml"),
                         Loader=yaml.FullLoader)["rf_small_grid"]
        model = self.train_rf_cv(X_train,
                                 y_train,
                                 scoring_metric="f1",
                                 grid=grid)

        s3.write_pickle(model, self.output().path)

    def train_rf_cv(self, X, y, scoring_metric, grid=dict()):
        """
        Runs grid search on a random forest classifier

        :param X: Feature matrix of training set
        :param y: Target vector of training set
        :param scoring_metric: Single metric from which we choose the best classifier
        :param grid: Cross validation grid
        :return: Best trained model after grid search
        """
        rf = RandomForestClassifier(random_state=0,
                                    n_jobs=-1,
                                    class_weight="balanced")
        rf_grid_search = GridSearchCV(rf,
                                      grid,
                                      scoring=scoring_metric,
                                      cv=5,
                                      refit=True)
        rf_grid_search.fit(X, y)

        return rf_grid_search.best_estimator_
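
For illustration, a hedged, self-contained sketch of the same grid-search pattern; the dictionary below only stands in for the rf_small_grid entry in conf/base/parameters.yml, whose real contents are not shown in the listing.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Illustrative grid; the production values come from parameters.yml.
grid = {"n_estimators": [50, 100], "max_depth": [None, 5]}

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
rf = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight="balanced")
search = GridSearchCV(rf, grid, scoring="f1", cv=5, refit=True)
search.fit(X, y)
print(search.best_params_)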
Example No. 15
class process_raw_snowplow_event_data(luigi.Task):
	dataset_date = luigi.DateParameter(default=date.today() - timedelta(days=1))
	# force_run = luigi.BoolParameter()
	_start = luigi.DateSecondParameter(default=datetime.utcnow())
	file_root = luigi.Parameter()


	def download_s3_file(self, s3_filename):

		local_filename = "/Users/samuel.peltz/etl/%s" % s3_filename

		s3_file_full_path =re.compile("snowplow-enrich-output/enriched/archive/run=" + self.dataset_date.strftime("%Y-%m-%d") +r"-\d{2}-\d{2}-\d{2}/*.")


		try:
			s3.download_file(Bucket=os.environ.get('SP_BUCKET'), Key=s3_file_full_path, Filename=local_filename)
		except Exception as e:
			logger.error("%s - Could not retrieve %s because: %s" % ("download_s3_file()", s3_file_full_path, e))
			raise

		return local_filename

	def list_files(self, sp_bucket):
		files = []
		response = s3.list_objects_v2(Bucket=os.environ.get('SP_BUCKET'))
		while True:
			files.extend([o['Key'] for o in response['Contents']])
			if not response['IsTruncated']:
				break
			response = s3.list_objects_v2(
				Bucket=os.environ.get('SP_BUCKET'),
				ContinuationToken=response['NextContinuationToken'])

		pattern = re.compile(r"snowplow-enrich-output/enriched/archive/run=" + self.dataset_date.strftime("%Y-%m-%d") + r"-\d{2}-\d{2}-\d{2}/part-\d{5}\.*")		

		for thisfile in files:
			if re.match(pattern, thisfile):
				s3.download_file(Bucket=os.environ.get('SP_BUCKET'), Key=thisfile)

		return files

	def output(self):
		return luigi.LocalTarget("/Users/samuel.peltz/etl/%s_%s.json.gz" % (self.file_root, self.dataset_date.strftime("%Y%m%d")))
Example No. 16
class Run_Pipeline_For_Proteins(luigi.WrapperTask):
    input_list = luigi.ListParameter()
    run_time = luigi.DateSecondParameter(default=datetime.now())
    mas_server = luigi.Parameter()

    def requires(self):
        job_array = []
        for search_params in self.input_list:
            if set(search_params) != {'accession', 'tool', 'database'}:
                raise KeyError('Incorrect dict keys for pipeline parameters')

            job_array.append(
                Send_Results_To_MAS(
                    mas_server=self.mas_server,
                    annotation_accession=search_params['accession'],
                    database=search_params['database'],
                    tool=search_params['tool']
                )
            )
        return job_array
Example No. 17
class Pull_Protein(PipelineTask):
    '''
    Pull a single protein through MAS's REST API
    '''
    annotation_accession = luigi.Parameter()
    run_locally = True
    database = luigi.Parameter()
    run_time = luigi.DateSecondParameter()
    tool = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(Pull_Protein, self).__init__(*args, **kwargs)
        self.task_id = '{}_{}_{}'.format(self.get_task_family(), self.run_time, self.annotation_accession)

    def out_file_path(self, temp=False):
        return {
            'fasta': os.path.join(self.out_dir(temp), '%s.faa' % self.annotation_accession)
        }

    def do_task(self):
        # Get protein sequence from MAS's REST API
        response = requests.get(
            self.mas_server + reverse('get_protein', kwargs={'accession': self.annotation_accession}),
            auth=(self.g.MAS_USERNAME, self.g.MAS_PASSWORD),
            verify=self.g.MAS_CRT
        )

        if response.status_code != 200:
            self.logger.error(
                'Response status code = {}. Response text = {}.'.format(response.status_code, response.text)
            )
            raise requests.ConnectionError('Request to get protein sequence from MAS server failed')

        # Write sequence file
        protein_seq = Seq(response.json()['sequence'], IUPAC.IUPACProtein)
        rec = SeqRecord(protein_seq, id=self.annotation_accession, description='')
        SeqIO.write(rec, self.out_file_path(True)['fasta'], 'fasta')
Example No. 18
class snowplow_enriched_insert_data(bigquery.BigqueryLoadTask):
	dataset_date = luigi.DateParameter(default=date.today() - timedelta(days=1))
	# force_run = luigi.BoolParameter()
	_start = luigi.DateSecondParameter(default=datetime.utcnow())
	file_root = luigi.Parameter()

	credentials = GoogleCredentials.get_application_default()
	source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
	write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

	def source_uris(self):
		return [x.path for x in luigi.task.flatten(self.input())]

	def output(self):
		return bigquery.BigqueryTarget(
			"realself-main",
			"snowplow",
			"Events"
			)

	def complete(self):
		return check_partition_modified(
			table="%s.%s" % (self.output().table.dataset_id, self.file_root),
			partition=self.dataset_date.strftime("%Y%m%d"), threshold=60, time_ref=self._start)
Example No. 19
class ExtractNameId(luigi.Task):
    date = luigi.DateSecondParameter()

    def requires(self):
        return [GetUserFromOmie(self.date)]

    def get_name_id(self):
        with self.input()[0].open('r') as json_file:
            users = json.load(json_file)
        users_ids = []
        for cadastro in users['cadastros']:
            users_ids.append({
                'code': cadastro['nCodigo'],
                'name': cadastro['cNome']
            })
        return users_ids

    def run(self):
        df = pd.DataFrame(self.get_name_id())
        df.to_parquet(self.output().path, index=False)

    def output(self):
        path = f"data/users_{str(self.date)}_ids.parquet"
        return luigi.LocalTarget(path)
Example No. 20
    def test_parse(self):
        ds = luigi.DateSecondParameter().parse('2013-02-01T184227')
        self.assertEqual(ds, datetime.datetime(2013, 2, 1, 18, 42, 27))
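
A complementary hedged sketch of the reverse direction: DateSecondParameter serializes with the same '%Y-%m-%dT%H%M%S' format that parse() expects, so values round-trip cleanly.

import datetime

import luigi

param = luigi.DateSecondParameter()
value = param.parse('2013-02-01T184227')
assert value == datetime.datetime(2013, 2, 1, 18, 42, 27)
assert param.serialize(value) == '2013-02-01T184227'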
Example No. 21
class ListDateParamTask(luigi.Task):
    param1 = luigi.Parameter()
    param2 = luigi.DateSecondParameter(default=datetime.now())
    param3 = luigi.Parameter(default=['something'])
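
A hedged usage sketch: on the command line a DateSecondParameter is given in the same '%Y-%m-%dT%H%M%S' form, while in code a plain datetime is passed (the module path in the comment is hypothetical).

from datetime import datetime

# CLI (hypothetical module path):
#   luigi --module my_module ListDateParamTask \
#       --param1 foo --param2 2013-02-01T184227 --local-scheduler

task = ListDateParamTask(param1='foo',
                         param2=datetime(2013, 2, 1, 18, 42, 27))
print(task.param2)  # 2013-02-01 18:42:27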
Example No. 22
class ExtractSitesSummary(luigi.Task):
    """
    Extracts Site Summary data.
    """
    start_date = luigi.DateSecondParameter(default=(dt.datetime.today() -
                                                    dt.timedelta(days=1)))
    load_date = luigi.DateSecondParameter(default=dt.datetime.today())

    extract_performance = ExtractSitesPerformance()
    tmp_files = CleanUpTempFiles()
    pl = Pipeline("LoadSites", "ExtractSitesSummary")

    # This class ensures that the statuses of the APICA checks are up-to-date.
    check_status = CheckStatus()

    # Database connections
    conn = Connection()
    pw_src_cur = conn.pw_src.return_cursor()
    pw_tar_cur = conn.pw_tar.return_cursor()

    def output(self):
        return PipelineTasks(self.pl.etl_name, self.pl.task_name,
                             self.load_date.date(), self.start_date.date())

    def run(self):
        processing_time_span = self.get_processing_timespan(
            self.start_date.date())

        # Remove the previous day's temp files.
        self.tmp_files.remove()

        check_ids = self.get_multi_url_check_ids()

        if len(check_ids) == 0:
            self.inform_luigi_processing_completed()
            return

        for check_id in check_ids:
            url = self.get_url(processing_time_span, check_id[0])
            json_dict_summary_result = self.get_data(url, check_id[0])

            if json_dict_summary_result is not None and len(
                    json_dict_summary_result) > 0:
                self.extract_performance.run(json_dict_summary_result,
                                             self.start_date.date())
            else:
                # Write empty performance data since there is no header info for the check id.
                empty_result = []
                fact_path = self.extract_performance.get_file_path(
                    self.start_date.date(), check_id[0])
                self.save_data(fact_path, empty_result)

            # Save the header results.
            full_path = self.get_file_path(check_id[0])
            self.save_data(full_path, json_dict_summary_result)

        self.pl.tasks_insert(self.load_date, self.start_date)

    def get_processing_timespan(self, start_date=''):

        if start_date == '':
            start_date = (dt.datetime.today() - dt.timedelta(days=1)).date()

        date_time_span = dict()
        date_time_span['start_day'] = start_date
        date_time_span['end_day'] = start_date
        date_time_span['to_hour'] = 'T23:59:59'
        date_time_span['from_hour'] = 'T00:00:00'

        return date_time_span

    def get_multi_url_check_ids(self):
        """
        Obtain the Check IDs that have multiple URLs associated with them.
        :return: List of check IDs
        """

        try:
            check_id_list = self.pw_src_cur.execute(
                var.GET_MULTI_URL_CHECKS).fetchall()

        except Exception as e:
            error_msg = var.GET_MULTI_URL_CHECKS_ERROR_MSG.format(e)
            self.pl.logger.error(error_msg)
            sys.exit(error_msg)

        return check_id_list

    def get_url(self, processing_time_span, check_id):
        """
        Builds the URL.
        :param processing_time_span: Time block for desired data.
        :param check_id: Check Id for the particular client.
        :return: URL that constitutes the Get API call.
        """
        ts_for_api = var.UTC_TIMESPAN.format(processing_time_span['start_day'],
                                             processing_time_span['from_hour'],
                                             processing_time_span['end_day'],
                                             processing_time_span['to_hour'])

        url = var.URL_QUERY_INFO.format(var.BASE_URL, str(check_id),
                                        ts_for_api, var.AUTH_TICKET)
        return url

    def get_data(self, url, check_id):
        """
        Executes the Get API call.
        :param url: URL that constitutes the Get API call.
        :param check_id: Check Id for the particular client.
        :return: Site performance data for the particular client.
        """
        try:
            response = urllib.request.urlopen(url)
        except Exception as e:
            error_msg = "FAILED to obtain data from URL. ERROR MESSAGE: {} --URL: {} ".format(
                e, url)
            self.pl.logger.exception(error_msg)
            self.check_status.determine_check_id_status(check_id)
            return None

        # The HTTP call returns a JSON array of monitors.
        json_list_result = response.read()
        json_dict_result = json.loads(json_list_result.decode('utf-8'))

        if not json_dict_result:
            self.pl.logger.info("No data to process.")
            return None

        return json_dict_result

    def get_file_path(self, check_id):
        prefix = var.SUMMARIES_HEADERS.format(check_id)
        filename = get_filename(self.start_date.date(), prefix, 'json')
        return os.path.join(var.DATA_SITES_PTH, filename)

    def save_data(self, full_path, json_dict_result):
        with open(full_path, 'w') as outfile:
            json.dump(json_dict_result, outfile)
Example No. 23
class TransformLoadSites(luigi.Task):
    start_date = luigi.DateSecondParameter(default=(dt.datetime.today() -
                                                    dt.timedelta(days=1)))
    load_date = luigi.DateSecondParameter(default=dt.datetime.today())

    pl = Pipeline("Load2Sites", "TransformSitesSummary")

    site_manager = SiteManager()
    conn = Connection()
    pw_src_cur = conn.pw_src.return_cursor()
    plat_cur = conn.plat_src.return_cursor()
    pw_tar_cur = conn.pw_tar.return_cursor()
    pw_tar_sa = conn.sa_create_engine(conn.pw_tar_cstr)
    pw_src_conn = conn.pw_src.return_conn()

    def output(self):
        return PipelineTasks(self.pl.etl_name, self.pl.task_name,
                             self.load_date.date(), self.start_date.date())

    def requires(self):
        return ExtractSitesSummary(start_date=self.start_date,
                                   load_date=self.load_date)

    def run(self):
        check_ids = self.get_multi_url_check_ids()

        url_org_id_mappings_df = self.get_url_org_id_mappings()

        for check_id in check_ids:
            # Process Dim information for check id.
            header_data = self.get_data(check_id[0], 'headers')
            if header_data is not None:
                headers = []
                for dict_data in header_data:
                    if self.is_valid_header(dict_data):
                        headers.append(self.transform_header_data(dict_data))
                transformed_headers = pd.DataFrame(headers,
                                                   columns=var.HEADER_COLUMNS)
                is_deleted = self.delete_dim_records(
                    transformed_headers['ResultID'])
                if is_deleted:
                    self.load_data(transformed_headers, var.DIM_SITES_TABLE)

                # Process Fact information for check id.

                performance_data = self.get_data(check_id[0], 'performance')
                performance_results = self.transform_performance_data(
                    performance_data, url_org_id_mappings_df)
                transformed_results = pd.DataFrame(performance_results,
                                                   columns=var.SITES_COLUMNS)
                is_facts_deleted = self.delete_fact_records(
                    transformed_results['ResultID'].iloc[0])
                if is_facts_deleted:
                    self.load_data(transformed_results, var.FACT_SITES_TABLE)

        self.pl.tasks_insert(self.load_date, self.start_date)

    def get_data(self, check_id, file_type):
        """
        Get the data from previously saved json files.
        :param check_id:
        :param file_type: header file or performance file (details)
        :return: Json object.
        """
        prefix = ''
        if file_type == 'headers':
            prefix = 'Summaries-Headers-{}'.format(check_id)
        else:
            prefix = 'SitesPerformance-{}'.format(check_id)

        filename = get_filename(self.start_date.date(), prefix, 'json')
        full_path = os.path.join(var.DATA_SITES_PTH, filename)

        data = []
        with open(full_path) as data_file:
            data = json.load(data_file)

        return data

    def delete_dim_records(self, result_ids):
        """
        If this date is being re-run, the existing Dim records must be deleted.
        :param result_ids: List of Result IDs
        :return: True if the deletion was successful, otherwise False. This is to ensure that the insertion is not
        executed if an error occurs.
        """
        if len(result_ids) == 0:
            return True

        # Build the list for the IN clause.
        in_values = ''
        for rid in result_ids:
            in_values = in_values + "'{}',".format(rid)

        # Drop the last ',' from the string.
        in_values = in_values[0:len(in_values) - 1]

        try:
            self.pw_tar_cur.execute(
                var.DELETE_DIM_SITES.format(in_values)).commit()
        except Exception as e:
            error_msg = "Unable to delete from DimSites table. ResultIDs: {}. ERROR Message: {}"\
                .format(in_values, e)
            self.pl.logger.exception(error_msg)
            return False

        return True

    def delete_fact_records(self, result_id):
        """
        If this date is being re-run, the existing fact records must be deleted.
        :param result_id: Result ID being processed.
        :return: True if the deletion was successful, otherwise False. This is to ensure that the insertion is not
        executed if an error occurs.
        """
        try:
            self.pw_tar_cur.execute(
                var.DELETE_SITES.format(result_id)).commit()
        except Exception as e:
            error_msg = "Unable to delete from FactSites table. ResultID: {}. ERROR Message: {}"\
                .format(result_id, e)
            self.pl.logger.exception(error_msg)
            return False

        return True

    def load_data(self, df, table_name):
        """
        Saves records to the database.
        :param df: The pandas dataframe that is to be saved.
        :param table_name: The table the data is to be saved to.
        :return: True if the records were successfully saved.
        """
        try:
            df.to_sql(table_name,
                      schema='dw',
                      if_exists='append',
                      con=self.pw_tar_sa,
                      index=None)
        except Exception as e:
            err_msg = "ERROR occurred while inserting records into the {} table. Actual message: {}".format(
                table_name, e)
            self.pl.logger.error(err_msg)
            sys.exit(err_msg)

        return True

    def get_multi_url_check_ids(self):
        """
        Obtain the Check IDs that have multiple URLs associated with them.
        :return: List of check IDs
        """

        try:
            check_id_list = self.pw_src_cur.execute(
                var.GET_MULTI_URL_CHECKS).fetchall()

        except Exception as e:
            error_msg = var.GET_MULTI_URL_CHECKS_ERROR_MSG.format(e)
            self.pl.logger.error(error_msg)
            sys.exit(error_msg)

        return check_id_list

    def is_valid_header(self, dict_data):
        """
        Validates the numeric Dim data returned.
        :param dict_data: A row of Dim data.
        :return: True if all numeric fields are integers, otherwise False.
        """
        is_valid = True

        if 'check_id' in dict_data and not type(dict_data['check_id']) is int:
            is_valid = False
        if 'value' in dict_data and not type(dict_data['value']) is int:
            is_valid = False
        if 'result_code' in dict_data and not type(
                dict_data['result_code']) is int:
            is_valid = False
        if 'attempts' in dict_data and not type(dict_data['attempts']) is int:
            is_valid = False

        return is_valid

    def transform_performance_data(self, performance_data,
                                   url_org_id_mappings_df):
        """
        Validates and transforms the Performance data so that it can be saved to the database.
        If check_id, result_id, or time_stamp_utc contains invalid data, the entire set is rejected. If the url
        is not provided, the performance result for that record is not saved.
        :param performance_data: Performance data for a specified Result ID
        :param url_org_id_mappings_df: url to organization mapping.
        :return: Transformed data.
        """

        processed_check_results = []

        data = performance_data[0]
        check_results = data['check_results']

        for check_result in check_results:
            is_valid_check_result = True

            check_id = 0
            if 'check_id' in check_result and type(
                    check_result['check_id']) is int:
                check_id = check_result['check_id']
            else:
                is_valid_check_result = False

            result_id = ''
            if 'result_id' in check_result and check_result[
                    'result_id'] is not None:
                result_id = check_result['result_id'][0:60]
            else:
                is_valid_check_result = False

            time_stamp_utc = ''
            if 'time_stamp_utc' in check_result and check_result[
                    'time_stamp_utc'] is not None:
                time_stamp_utc = check_result['time_stamp_utc']
            else:
                is_valid_check_result = False

            url_results = check_result['url_results']

            if not is_valid_check_result or len(url_results) == 0:
                return processed_check_results

            for url_result in url_results:

                is_valid_record = True
                url_number = -1
                if 'url_number' in url_result and type(
                        url_result['url_number']) is int:
                    url_number = url_result['url_number']

                url_domain_name = ''
                url_hash = 0x00
                if 'url' in url_result and url_result['url'] is not None:
                    url = url_result['url'][0:2083]
                    url_domain_name = self.get_url_name(url)
                    url_domain_name = url_domain_name.lower().strip()
                    url_hash = self.site_manager.get_hash(url_domain_name)
                else:
                    is_valid_record = False

                organization_id = 0
                tenant_id = 0

                if len(url_domain_name) > 0:
                    result = url_org_id_mappings_df[
                        url_org_id_mappings_df['URL'] == url_domain_name]
                    if len(result) > 0:
                        organization_id = result['OrganizationID']
                        tenant_id = result['ParentID']

                elapsed_ms = -1
                if 'elapsed_ms' in url_result and type(
                        url_result['elapsed_ms']) is int:
                    elapsed_ms = url_result['elapsed_ms']

                received_bytes = -1
                if 'received_bytes' in url_result and type(
                        url_result['received_bytes']) is int:
                    received_bytes = url_result['received_bytes']

                http_method = 'Not Provided'
                if 'http_method' in url_result and url_result[
                        'http_method'] is not None:
                    http_method = url_result['http_method']

                http_status_code = -1
                if 'http_status_code' in url_result and type(
                        url_result['http_status_code']) is int:
                    http_status_code = url_result['http_status_code']

                dns_lookup_duration_ms = -1
                if 'dns_lookup_duration_ms' in url_result and type(
                        url_result['dns_lookup_duration_ms']) is int:
                    dns_lookup_duration_ms = url_result[
                        'dns_lookup_duration_ms']

                connect_duration_ms = -1
                if 'connect_duration_ms' in url_result and type(
                        url_result['connect_duration_ms']) is int:
                    connect_duration_ms = url_result['connect_duration_ms']

                send_duration_ms = -1
                if 'send_duration_ms' in url_result and type(
                        url_result['send_duration_ms']) is int:
                    send_duration_ms = url_result['send_duration_ms']

                wait_duration_ms = -1
                if 'wait_duration_ms' in url_result and type(
                        url_result['wait_duration_ms']) is int:
                    wait_duration_ms = url_result['wait_duration_ms']

                receive_duration_ms = -1
                if 'receive_duration_ms' in url_result and type(
                        url_result['receive_duration_ms']) is int:
                    receive_duration_ms = url_result['receive_duration_ms']

                headers = 'Not Provided'
                if 'headers' in url_result and url_result[
                        'headers'] is not None:
                    tmp = str(url_result['headers'])
                    headers = tmp[0:2083]

                multiple_timings = 'Not Provided'
                if 'multiple_timings' in url_result and url_result[
                        'multiple_timings'] is not None:
                    multiple_timings = str(url_result['multiple_timings'])

                if is_valid_record:
                    result = {
                        "CheckID": check_id,
                        "ResultID": result_id,
                        "URLNumber": url_number,
                        "URL": url_domain_name,
                        "URLHash": url_hash,
                        "TenantID": int(tenant_id),
                        "OrganizationID": int(organization_id),
                        "TimestampUTC": time_stamp_utc,
                        "ElapsedMS": elapsed_ms,
                        "ReceivedBytes": received_bytes,
                        "HTTPMethod": http_method,
                        "HTTPStatusCode": http_status_code,
                        "DNSLookupDurationMS": dns_lookup_duration_ms,
                        "ConnectDurationMS": connect_duration_ms,
                        "SendDurationMS": send_duration_ms,
                        "WaitDurationMS": wait_duration_ms,
                        "ReceiveDurationMS": receive_duration_ms,
                        "Headers": headers,
                        "MultipleTimings": multiple_timings
                    }
                    processed_check_results.append(result)

        return processed_check_results

    def transform_header_data(self, dict_data):

        check_id = 0
        if 'check_id' in dict_data:
            check_id = int(dict_data['check_id'])

        result_id = ''
        if 'identifier' in dict_data:
            result_id = dict_data['identifier'][0:60]

        message = ''
        if 'message' in dict_data:
            message = dict_data['message']

        attempts = 0
        if 'attempts' in dict_data:
            attempts = int(dict_data['attempts'])

        result_code = 0
        if 'result_code' in dict_data:
            result_code = int(dict_data['result_code'])

        timestamp_utc = ''
        if 'timestamp_utc' in dict_data:
            timestamp_utc = dict_data['timestamp_utc']

        severity = ''
        if 'severity' in dict_data:
            severity = dict_data['severity']

        value = 0
        if 'value' in dict_data:
            value = dict_data['value']

        unit = ''
        if 'unit' in dict_data:
            unit = dict_data['unit']

        header = {
            "ResultID": result_id,
            "CheckID": check_id,
            "TimeStampUTC": timestamp_utc,
            "Message": message,
            "Attempts": attempts,
            "ResultCode": result_code,
            "Severity": severity,
            "Value": value,
            "Unit": unit
        }

        return header

    def get_url_org_id_mappings(self):
        try:
            url_orgid_mappings_tuples = self.plat_cur.execute(
                var.GET_URL_ORGID_MAPPINGS_WITH_HASH).fetchall()

        except Exception as e:
            error_msg = var.URL_ORGID_MAPPINGS_ERROR_MSG.format(e)
            self.pl.logger.error(error_msg)
            sys.exit(error_msg)

        # Convert to lists.
        mapping_list = []
        for row in url_orgid_mappings_tuples:
            a_row = list()
            a_row.append(row[0])
            a_row.append(row[1])
            a_row.append(row[2])
            mapping_list.append(a_row)

        mappings = pd.DataFrame(mapping_list,
                                columns=var.URL_ORGID_MAPPINGS_COLUMNS)
        return mappings

    def get_url_name(self, url):
        """
        Strip the scheme, port, path, and query from the URL, returning only the hostname.
        """
        if len(url) == 0:
            return ''

        url_info = urllib.parse.urlsplit(url)

        return url_info.hostname.rstrip()
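
A quick self-contained illustration of what get_url_name() returns; the behaviour follows directly from urllib.parse.urlsplit, which exposes the lower-cased hostname without scheme, port, path, or query.

import urllib.parse

url_info = urllib.parse.urlsplit('https://www.Example.com:443/path?x=1')
print(url_info.hostname)  # www.example.com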
Example No. 24
class SplitTrainTest(luigi.Task):
    date = luigi.DateSecondParameter(default=datetime.now())

    def requires(self):
        return CreateModellingTable()

    def output(self):
        return [
            S3Target(s3.path(S3.MODELLING + "train.parquet"),
                     client=s3.create_client()),
            S3Target(s3.path(S3.MODELLING + "test.parquet"),
                     client=s3.create_client()),
            S3Target(
                s3.path(S3.MODELS +
                        "{date:%Y/%m/%d/train_T%H%M%S.parquet}".format(
                            date=self.date)),
                client=s3.create_client(),
            ),
            S3Target(
                s3.path(S3.MODELS +
                        "{date:%Y/%m/%d/test_T%H%M%S.parquet}".format(
                            date=self.date)),
                client=s3.create_client(),
            ),
        ]

    def run(self):
        df_modelling = s3.read_parquet(self.input().path)
        df_train, df_test = self.train_test_split(df_modelling)
        df_train, df_test = self.scale_numeric_feats(df_train, df_test)

        # NOTE: Save both datasets twice.
        # - One set that is tied to a trained model
        # - One set that gets overwritten with the current one
        s3.write_parquet(df_train, self.output()[0].path)
        s3.write_parquet(df_test, self.output()[1].path)
        s3.write_parquet(df_train, self.output()[2].path)
        s3.write_parquet(df_test, self.output()[3].path)

    def train_test_split(self, df_modelling):
        """
        Split the modelling table into training and test sets.
        The most recent year of data is kept as the test set.

        :param df_modelling: Modelling dataframe with an exit_date column
        :return: tuple(df_train, df_test)
        """

        cutoff = df_modelling["exit_date"].max() - np.timedelta64(1, "Y")

        df_train = df_modelling[df_modelling["exit_date"] < cutoff].drop(
            "exit_date", axis="columns")
        df_test = df_modelling[df_modelling["exit_date"] >= cutoff].drop(
            "exit_date", axis="columns")

        # Resample into the training set to reach an 80/20 ratio if the test set is larger
        test_size = round((len(df_train) + len(df_test)) * 0.2)
        if len(df_test) > test_size:
            df_sample = df_test.sample(n=(len(df_test) - test_size),
                                       random_state=1)
            df_test = df_test.drop(df_sample.index)
            df_train = df_train.append(df_sample)

        return df_train, df_test

    def scale_numeric_feats(self, df_train, df_test):
        """
        Scales numeric features of the training and test sets.
        The test set is scaled with the scaler fitted on the training set, to prevent data leakage.

        :param df_train: Training dataframe
        :param df_test: Test dataframe
        :return: tuple(df_train, df_test)
        """
        scaler = MinMaxScaler()

        num_cols = list(df_train.select_dtypes(include=[np.number]))
        scaler.fit(df_train[num_cols])

        df_train[num_cols] = scaler.transform(df_train[num_cols])
        df_test[num_cols] = scaler.transform(df_test[num_cols])
        return df_train, df_test
Example No. 25
class Blastp(PipelineTask):
    '''
    Generate blastp results for a given proteome
    '''
    annotation_accession = luigi.Parameter(default='')
    e_value = luigi.FloatParameter(default=0.01)
    database = luigi.Parameter()
    run_time = luigi.DateSecondParameter()
    tool = 'blastp'

    # Available Database Choices
    swissprot = luigi.Parameter()
    nr = luigi.Parameter()
    internal = luigi.Parameter()

    # specific # CPU to use for each job
    swissprot_cpu = luigi.Parameter()
    nr_cpu = luigi.Parameter()
    internal_cpu = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(Blastp, self).__init__(*args, **kwargs)

        if self.database == 'nr':
            self.n_cpu = self.nr_cpu
        elif self.database == 'swissprot':
            self.n_cpu = self.swissprot_cpu
        elif self.database == 'internal':
            self.n_cpu = self.internal_cpu

    def requires(self):
        return Pull_Protein(
            annotation_accession=self.annotation_accession,
            run_time=self.run_time,
            tool=self.tool,
            database=self.database,
            mas_server=self.mas_server
        )

    def out_file_path(self, temp=False):
        name = self.annotation_accession

        return {
            'results': os.path.join(self.out_dir(temp), '{}_{}_blastp_results.xml'.format(name, self.database))
        }

    def out_dir(self, temp=False):
        folder = '{}-temp'.format(self.database) if temp else self.database
        return os.path.join(self.pipeline_out_dir(), self.task_family, folder)

    def do_task(self):
        if self.database == 'nr':
            db_path = self.nr

        elif self.database == 'swissprot':
            db_path = self.swissprot

        elif self.database == 'internal':
            db_path = self.internal

        else:
            raise ValueError('Invalid database ' + self.database)

        self._run_command([
            'blastp',
            '-query', self.input()['fasta'].path,
            '-db', db_path,
            '-evalue', str(self.e_value),
            '-outfmt', '5',
            '-out', self.out_file_path(True)['results'],
            '-num_threads', str(self.n_cpu)
        ])
Example No. 26
class ExtractCapacities(GomusScraperTask):
    """Extract all capacities from the fetched gomus pages."""

    today = luigi.DateSecondParameter(default=dt.datetime.today())

    popover_pattern = regex.compile(r'''
        <script> \s* \$\("\#info-\d+"\)\.popover\( ( \{ \s*
            (?<elem> \w+ \s* : \s* '(?:\\.|[^\\\'])*' \s*){0}
            (?:(?&elem) , \s*)*
            (?&elem)
        \} ) \); \s* </script>
        ''',
                                    flags=regex.X)

    def output(self):

        return luigi.LocalTarget(f'{self.output_dir}/gomus/capacities.csv',
                                 format=UTF8)

    def requires(self):

        return FetchCapacities(today=self.today)

    def run(self):

        with self.input().open() as input_:
            df_htmls = pd.read_csv(input_)

        capacities = [
            self.extract_capacities(html_path)
            for html_path in self.tqdm(df_htmls['file_path'],
                                       desc="Extracting capacities")
        ]

        df_capacities = pd.DataFrame(columns=[
            'quota_id', 'date', 'time', 'max', 'sold', 'reserved', 'available',
            'last_updated'
        ])
        if capacities:
            df_capacities = pd.concat([df_capacities, *capacities])
            df_capacities['last_updated'] = self.today

        with self.output().open('w') as output:
            df_capacities.to_csv(output, index=False)

    def extract_capacities(self, html_path):

        with open(html_path) as file:
            src = file.read()
        dom: html.HtmlElement = html.fromstring(src)

        quota_id, min_date = self.extract_header(dom)
        logger.debug("Scraping capacities from quota_id=%s for min_date=%s",
                     quota_id, min_date)

        capacities = self.create_zero_data(min_date)

        def load_data(data):
            return pd.DataFrame(
                data,
                columns=[*capacities.index.names, *capacities.columns],
                dtype=object).set_index(capacities.index.names)

        basic_capacities = load_data(self.extract_basic_capacities(dom))
        capacities.update(basic_capacities)

        detailed_capacities = load_data(
            self.extract_detailed_capacities(src, min_date))
        capacities.update(detailed_capacities)

        capacities = capacities.reset_index()
        capacities.insert(0, 'quota_id', quota_id)

        return capacities

    def create_zero_data(self, min_date: dt.date):

        df = pd.DataFrame(columns=['max', 'sold', 'reserved', 'available'])
        dates = [min_date + dt.timedelta(days=days) for days in range(0, 7)]
        times = list(
            self.create_time_range(delta=dt.timedelta(
                minutes=SLOT_LENGTH_MINUTES)))
        return df.reindex(pd.MultiIndex.from_product([dates, times],
                                                     names=['date', 'time']),
                          fill_value=0)

    def extract_header(self, dom: html.HtmlElement):
        """Extract general information from the DOM, e.g. quota ID or date."""
        quota_id = self.parse_int(
            dom, '//body/div[2]/div[2]/div[2]/div/div/ol/li[2]/a/div')
        min_date = self.parse_date(
            dom,
            '//body/div[2]/div[2]/div[3]/div/div[1]/div/div[2]/form/div[2]/'
            'div/div/input/@value')
        return quota_id, min_date

    def extract_basic_capacities(self, dom: html.HtmlElement):
        """
        Extract basic capacity values from the DOM.

        These are the values from the table indicating the availabilities for
        each slot. Generally, this is only a subset of data returned by
        extract_detailed_capacities(). However, in some cases, gomus displays
        (defective) negative values in the table and does not provide details
        about them, so this method is required to record the defect values
        anyway.
        """
        cells = dom.xpath(
            '//body/div[2]/div[2]/div[3]/div/div[2]/div/div[2]/table/tbody/'
            'tr[position()>1]/td[position()>1]')
        for cell in cells:
            datetime = dt.datetime.fromtimestamp(
                int(cell.get('data-timestamp')))
            available = int(cell.text_content().strip())
            yield dict(date=datetime.date(),
                       time=datetime.time(),
                       max=available,
                       available=available)

    def extract_detailed_capacities(self, src: str, min_date: dt.date):
        """Extract capacity details from the hovercards in the HTML source."""
        js_infos = [match[0] for match in self.popover_pattern.findall(src)]
        infos = [js2py.eval_js(f'd = {js}') for js in js_infos]
        for info in infos:
            yield self.extract_capacity(info, min_date)

    def extract_capacity(self, info, min_date):
        """Extract capacity details from a single hovercard info."""
        title: html.HtmlElement = html.fromstring(info['title'])
        content: html.HtmlElement = html.fromstring(info['content'])

        datetime = self.parse_date(title, relative_base=min_date)

        return dict(date=datetime.date(),
                    time=datetime.time(),
                    max=self.parse_int(content, '//tbody[1]/tr[1]/td[2]'),
                    sold=self.parse_int(content, '//tbody[1]/tr[2]/td[2]'),
                    reserved=self.parse_int(content, '//tbody[1]/tr[3]/td[2]'),
                    available=self.parse_int(content, '//tfooter[1]/tr/td[2]'))

    @staticmethod
    def create_time_range(delta: dt.timedelta) -> Iterable[dt.time]:
        assert delta.days == 0
        time = npt.nptime()
        while True:
            yield time
            prev_time = time
            time += delta
            if time <= prev_time:
                break
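
For readers without the nptime dependency, a self-contained sketch of the same wrap-around idea using plain datetimes (npt.nptime supports += timedelta and wraps past midnight, which is what terminates the loop above); the 30-minute step is only an assumption standing in for SLOT_LENGTH_MINUTES.

import datetime as dt


def time_slots(delta: dt.timedelta):
    """Yield times from midnight onwards until the day wraps around."""
    assert delta.days == 0
    anchor = dt.datetime(2000, 1, 1)  # arbitrary anchor day
    current = anchor
    while current.date() == anchor.date():
        yield current.time()
        current += delta


print(len(list(time_slots(dt.timedelta(minutes=30)))))  # 48 half-hour slots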