Example #1
0
    def parse(self):
        """Parse data

        Basic implementation that runs the whole pipeline: locate the
        datafile, read it, and derive secondary data from it.

        Returns:
            Parser: The parsed data
        """
        # Resolve the file path lazily, downloading the datafile if missing
        if self.file_path is None:
            self.file_path = files.path(self.file_key,
                                        file_vars=self.vars,
                                        download_missing=True)

        package, name = self.__module__.rsplit(".", maxsplit=1)
        with timer(f"Finish {name} ({package}) - {self.file_key} in"):
            if self.data_available:
                self.read_data()

            # self.read_data() may have flipped data_available to False
            if not self.data_available:
                log.warn(
                    f"No data found by {self.__class__.__name__} for {self.rundate.strftime(config.FMT_date)} "
                    f"(was looking for {self.file_path})")
                return self

            self.calculate_data()
            dependencies.add(*self.dependencies, label=self.file_key)

        return self
Example #2
0
File: __main__.py  Project: vpuenteg/where
def main():
    """Parse command line options and run the Where analysis

    Does simple parsing of the command line arguments, sets up config files
    and starts the analysis. See the help docstring at the top of the file
    for more information about the workflow.
    """
    # Start logging
    log.init()

    # The rundate may be given either as a day-of-year or as a regular date
    rundate = util.parse_args("doy" if util.check_options("--doy") else "date",
                              doc_module=__name__)
    pipeline = pipelines.get_from_options()
    session = pipelines.get_session(rundate, pipeline)

    # Pretend to empty mailbox
    pretend_to_empty_mailbox()

    # Drop into an interactive session instead of running the pipeline
    if util.check_options("-I", "--interactive"):
        from where.tools import interactive

        interactive.interactive(rundate, pipeline, session)
        return

    # Set up the configuration for a new analysis or update an existing one
    setup.setup_config(rundate, pipeline, session)

    # Record the run and execute the analysis
    setup.add_timestamp(rundate, pipeline, session, "last run")
    with timer(f"Finish pipeline {pipeline.upper()} in"):
        pipelines.run(rundate, pipeline, session)
Example #3
0
 def calculate_data(self):
     """Run every registered calculator on the parsed data.

     Calculators come from `setup_calculators`; each call is logged and timed.
     """
     for calc in self.setup_calculators():
         log.debug(f"Start calculator {calc.__name__} in {self.__module__}")
         timer_text = f"Finish calculator {calc.__name__} ({self.__module__}) in"
         with timer(timer_text, logger=log.debug):
             calc()
Example #4
0
File: _parser.py  Project: vpuenteg/where
    def calculate_data(self):
        """Do simple manipulations on the data after they are read

        Calculators perform simple manipulations of the data after reading,
        so that a parser stays as true a representation of the data file as
        possible. Advanced computations belong in apriori classes or similar.

        To add a calculator, define it in its own method and override the
        `setup_calculators`-method to return a list of all calculators.
        """
        for calc in self.setup_calculators():
            log.debug("Start calculator {} in {}", calc.__name__,
                      self.__module__)
            timer_text = "Finish calculator {} ({}) in".format(
                calc.__name__, self.__module__)
            with timer(timer_text, logger=log.debug):
                calc()
Example #5
0
def main():
    """Parse command line options and run the Where analysis

    Does simple parsing of the command line arguments, sets up config files
    and starts the analysis. See the help docstring at the top of the file
    for more information about the workflow.
    """
    util.check_help_and_version(doc_module=__name__)

    # Start logging at the configured default level
    log.init(config.where.log.default_level.str)
    log.debug(
        f"Use {util.get_python_version()} on process {util.get_pid_and_server()}"
    )

    # Read command line options; rundate may be a day-of-year or a date
    pipeline = pipelines.get_from_options()
    config.read_pipeline(pipeline)
    rundate = util.parse_args("doy" if util.check_options("--doy") else "date",
                              doc_module=__name__)
    session = pipelines.get_session(rundate, pipeline)

    # Pretend to empty mailbox
    pretend_to_empty_mailbox()

    # Drop into an interactive session instead of running the pipeline
    if util.check_options("-I", "--interactive"):
        from where.tools import interactive  # Local import because interactive imports many external packages

        interactive.interactive(rundate, pipeline, session)
        return

    # Set up the configuration for a new analysis or update an existing one
    setup.setup_config(rundate, pipeline, session)

    # Record the run and execute the analysis
    setup.add_timestamp(rundate, pipeline, session, "last run")
    with timer(f"Finish pipeline {pipeline.upper()} in"):
        pipelines.run(rundate, pipeline, session)
Example #6
0
def run(rundate, pipeline, session=""):
    """Run a Where pipeline for a given date and session

    Sets up the session configuration and logging, then runs each stage of
    the pipeline in order. Stages whose dependencies are unchanged are
    skipped (unless forced with -F/--force). Reports and the analysis
    configuration are written at the end.

    Args:
        rundate:   Rundate of analysis.
        pipeline:  Pipeline used for analysis.
        session:   Session in analysis.
    """
    if not setup.has_config(rundate, pipeline, session):
        # NOTE(review): log.fatal presumably aborts execution — confirm
        log.fatal(
            f"No configuration found for {pipeline.upper()} {session} {rundate.strftime(config.FMT_date)}"
        )

    # Set up session config
    config.init(rundate=rundate, tech_name=pipeline, session=session)

    # Set up prefix for console logger and start file logger
    log_cfg = config.where.log
    prefix = f"{pipeline.upper()} {session} {rundate:%Y-%m-%d}"
    log.init(log_level=log_cfg.default_level.str, prefix=prefix)
    if log_cfg.log_to_file.bool:
        log.file_init(
            file_path=files.path("log"),
            log_level=log_cfg.default_level.str,
            prefix=prefix,
            rotation=log_cfg.number_of_log_backups.int,
        )

    # Read which stages to skip from technique configuration file.
    skip_stages = config.tech.get("skip_stages", default="").list

    # Register filekey suffix
    filekey_suffix = config.tech.filekey_suffix.list
    if filekey_suffix:
        config.files.profiles = filekey_suffix

    # Find which stages we will run analysis for
    # TODO: Specify stage_list in config
    stage_list = [s for s in stages(pipeline) if s not in skip_stages]

    # Start file logging and reporting
    reports.report.init(sessions=[session])
    reports.report.start_session(session)
    reports.report.text("header", session.replace("_", " ").title())

    # Update analysis config and file variables
    config.set_analysis(rundate=rundate,
                        tech=pipeline,
                        analysis=pipeline,
                        session=session)
    config.set_file_vars(file_vars())

    # Log the name of the session
    log.blank()  # Empty line for visual clarity
    log.info(f"Start session {session}")
    session_timer = timer(f"Finish session {session} in")
    session_timer.start()

    # Run stages, keep track of previous stage
    # zip pairs each stage with its predecessor (None for the first stage)
    dset = None
    dep_fast = config.where.files.dependencies_fast.bool
    for prev_stage, stage in zip([None] + stage_list, stage_list):

        # Skip stages where no dependencies have changed
        dep_path = files.path("depends", file_vars=dict(stage=stage))
        if not (dependencies.changed(dep_path, fast_check=dep_fast)
                or util.check_options("-F", "--force")):
            log.info(
                f"Not necessary to run {stage} for {pipeline.upper()} {rundate.strftime(config.FMT_date)}"
            )
            continue
        elif dset is None:
            # Create or read dataset (only the first stage actually run)
            # Start from an empty dataset only when nothing has run before
            empty = stage == stage_list[0]
            dset = dataset.Dataset(rundate,
                                   tech=pipeline,
                                   stage=prev_stage,
                                   dataset_name=session,
                                   dataset_id="last",
                                   empty=empty)

        # Report on the stage
        reports.report.start_section(stage)
        reports.report.text("header", stage.replace("_", " ").title())
        if prev_stage:
            log.blank()  # Empty line for visual clarity

        # Set up dependencies. Add dependencies to previous stage and config file
        dependencies.init(dep_path, fast_check=dep_fast)
        dependencies.add(files.path("depends",
                                    file_vars=dict(stage=prev_stage)),
                         label="depends")
        dependencies.add(*config.tech.sources, label="config")

        # Delete old datasets for this stage
        dset.delete_from_file(stage=stage, dataset_id="all")

        # Call the current stage. Skip rest of stages if current stage returns False (compare with is since by
        # default stages return None)
        plugins.call(package_name=__name__,
                     plugin_name=pipeline,
                     part=stage,
                     stage=stage,
                     dset=dset,
                     plugin_logger=log.info)
        dependencies.write()
        if dset.num_obs == 0:
            log.warn(
                f"No observations in dataset after {stage} stage. Exiting pipeline"
            )
            break
    else:  # Only done if loop does not break (all stages finish normally)
        # Publish files for session
        files.publish_files()

    session_timer.end()

    # Store configuration to library
    setup.store_config_to_library(rundate, pipeline, session)

    # Write reports specified in config
    reports.write(rundate, pipeline)

    # Write requirements to file for reproducibility
    util.write_requirements()
Example #7
0
def call_one(package_name,
             plugin_name,
             part=None,
             prefix=None,
             logger=log.time,
             use_timer=True,
             do_report=True,
             **kwargs):
    """Call one plug-in

    If the plug-in is not part of the package an UnknownPluginError is raised.

    If a plug-in registers several functions and `part` is not given, the
    first registered function is called.

    The source file of the plug-in is added to the list of dependencies.

    Args:
        package_name (String):  Name of package containing plug-ins.
        plugin_name (String):   Name of the plug-in, i.e. the module containing the plug-in.
        part (String):          Name of function to call within the plug-in (optional).
        prefix (String):        Prefix of the plug-in name, used if the plug-in name is unknown (optional).
        logger (Function):      Logger from the lib.log package specifying the level of logging to be used (optional).
        use_timer (Boolean):    Whether to time and log the call to the plug-in (optional).
        do_report (Boolean):    Whether to add the call to the plug-in to the report (optional).
        kwargs:                 Named arguments passed on to the plug-in.

    Returns:
        Return value of the plug-in.
    """
    # Look up the Plugin-object in the registry
    plugin_name = load_one(package_name, plugin_name, prefix=prefix)
    if part is None:
        part = "__default__"
    try:
        plugin = _PLUGINS[package_name][plugin_name][part]
    except KeyError:
        raise exceptions.UnknownPluginError(
            f"Plugin '{part}' not found for '{plugin_name}' in '{package_name}'"
        ) from None

    # Add plug-in to report
    if do_report:
        from where.reports import report

        code_kwargs = dict(kwargs)
        if "dset" in code_kwargs:
            code_kwargs["dset"] = code_kwargs["dset"].repr
        code_text = (
            "kwargs = {}\n{} = plugins.call_one('{}', '{}', part='{}', **kwargs)"
            "".format(code_kwargs, plugin_name, package_name, plugin_name,
                      part))
        report.add(
            package_name,
            __plugin__=plugin.name,
            __doc__=plugin.function.__doc__,
            __text__="TODO",
            __code__=code_text,
            **kwargs,
        )

    # Call plug-in, registering its source file as a dependency
    dependencies.add(plugin.file_path, label="plugin")
    time_logger = None
    if logger:
        logger(f"Start {plugin.name} in {package_name}")
        if use_timer:
            time_logger = log.time
    with timer(f"Finish {plugin.name} ({package_name}) in",
               logger=time_logger):
        return plugin.function(**kwargs)
Example #8
0
def run(rundate, pipeline, session=""):
    """Run a Where pipeline for a given date and session

    Sets up configuration and logging, then runs each pipeline stage in
    order, skipping stages whose dependencies are unchanged (unless forced
    with -F/--force). Reports and the analysis configuration are written at
    the end.

    Args:
        rundate:   Rundate of analysis.
        pipeline:  Pipeline used for analysis.
        session:   Session in analysis.
    """
    if not setup.has_config(rundate, pipeline, session):
        # NOTE(review): log.fatal presumably aborts execution — confirm
        log.fatal(
            f"No configuration found for {pipeline.upper()} {session} {rundate.strftime(config.FMT_date)}"
        )

    # Set up tech config and file logging
    config.init(rundate=rundate, tech_name=pipeline, session=session)
    log.file_init(log_path=files.path("log"))

    # Read which stages to skip from technique configuration file.
    skip_stages = config.tech.get("skip_stages", default="").list

    # Register filekey suffix
    filekey_suffix = config.tech.filekey_suffix.list
    if filekey_suffix:
        files.use_filelist_profiles(*filekey_suffix)

    # Find which stages we will run analysis for
    stage_list = [s for s in stages(pipeline) if s not in skip_stages]

    # Start file logging and reporting
    reports.report.init(sessions=[session])
    reports.report.start_session(session)
    reports.report.text("header", session.replace("_", " ").title())

    # Update analysis config and file variables
    config.set_analysis(rundate=rundate,
                        tech=pipeline,
                        analysis=pipeline,
                        session=session)
    config.set_file_vars(file_vars())

    # Log the name of the session
    log.blank()  # Empty line for visual clarity
    log.info(f"Start session {session}")
    session_timer = timer(f"Finish session {session} in")
    session_timer.start()

    # Run stages, keep track of previous stage
    # zip pairs each stage with its predecessor (None for the first stage)
    dep_fast = config.where.files.dependencies_fast.bool
    for prev_stage, stage in zip([None] + stage_list, stage_list):

        # Skip stages where no dependencies have changed
        if not (dependencies.changed(fast_check=dep_fast,
                                     rundate=rundate,
                                     tech=pipeline,
                                     session=session,
                                     stage=stage)
                or util.check_options("-F", "--force")):
            log.info(
                f"Not necessary to run {stage} for {pipeline.upper()} {rundate.strftime(config.FMT_date)}"
            )
            continue

        # Report on the stage
        reports.report.start_section(stage)
        reports.report.text("header", stage.replace("_", " ").title())
        if prev_stage:
            log.blank()  # Empty line for visual clarity

        # Set up dependencies. Add dependencies to previous stage and config file
        dependencies.init(fast_check=dep_fast, session=session, stage=stage)
        dependencies.add(
            files.path("model_run_depends",
                       file_vars=dict(session=session, stage=prev_stage)))
        dependencies.add(*config.tech.sources)

        # Call the current stage. Skip rest of stages if current stage returns False (compare with is since by
        # default stages return None)
        do_next_stage = call(pipeline,
                             stage,
                             rundate=rundate,
                             session=session,
                             prev_stage=prev_stage,
                             stage=stage,
                             logger=log.info)
        dependencies.write()
        if do_next_stage is False:
            break  # TODO, this does not work together with dependencies changed ...

    # Publish files for session
    # NOTE(review): unlike the for-else variant, files are published even
    # when the loop breaks early — presumably intentional here, confirm
    files.publish_files()
    session_timer.end()

    # Store configuration to library
    setup.store_config_to_library(rundate, pipeline, session)

    # Write reports specified in config
    reports.write(rundate, pipeline)

    # Write requirements to file for reproducibility
    util.write_requirements()