Example #1
def get_best_result(task, ti, **kwargs):
    """
    When there are numerous FERRE tasks that are upstream, this
    function will return the primary keys of the task instances that gave
    the best result on a per-observation basis.
    """

    # Get the PKs from upstream.
    pks = []
    log.debug(f"Upstream tasks: {task.upstream_list}")
    for upstream_task in task.upstream_list:
        pks.append(ti.xcom_pull(task_ids=upstream_task.task_id))

    pks = flatten(pks)
    log.debug(f"Getting best initial guess among primary keys {pks}")

    # Need to uniquely identify observations.
    param_bit_mask = bitmask.ParamBitMask()
    bad_grid_edge = (param_bit_mask.get_value("GRIDEDGE_WARN") | param_bit_mask.get_value("GRIDEDGE_BAD"))

    trees = {}
    best_tasks = {}
    for i, pk in enumerate(pks):
        q = session.query(astradb.TaskInstance).filter(astradb.TaskInstance.pk==pk)
        instance = q.one_or_none()

        # q.one_or_none() returns None when no task instance matches this pk.
        if instance is None or instance.output is None:
            log.warning(f"No output found for task instance with pk = {pk}")
            continue

        p = instance.parameters

        # Check that the telescope is the same as what we expect from this task ID.
        # This is a bit of a hack. Let us explain.

        # The "BA" grid does not have a telescope/fiber model, so you can run LCO and APO
        # data through the initial-BA grid. And those outputs go to the "get_best_results"
        # for each of the APO and LCO tasks (e.g., this function).
        # If there is only APO data, then the LCO "get_best_result" will only have one
        # input: the BA results. Then it will erroneously think that's the best result
        # for that source.

        # It's hacky to put this logic in here. It should be in the DAG instead. Same
        # thing for parsing 'telescope' name in the DAG (eg 'APO') from 'apo25m'.
        this_telescope_short_name = p["telescope"][:3].upper()
        expected_telescope_short_name = task.task_id.split(".")[1]
        log.info(f"For instance {instance} we have {this_telescope_short_name} and {expected_telescope_short_name}")
        if this_telescope_short_name != expected_telescope_short_name:
            continue

        try:
            tree = trees[p["release"]]                
        except KeyError:
            tree = trees[p["release"]] = SDSSPath(release=p["release"])
        
        key = "_".join([
            p['release'],
            p['filetype'],
            *[p[k] for k in tree.lookup_keys(p['filetype'])]
        ])
        
        best_tasks.setdefault(key, (np.inf, None))
        
        # TODO: Confirm that this is base10 log. This should also be 'log_reduced_chisq_fit',
        #       according to the documentation.
        log_chisq_fit, *_ = instance.output.log_chisq_fit
        previous_teff, *_ = instance.output.teff
        bitmask_flag, *_ = instance.output.bitmask_flag
        
        log.debug(f"Result {instance} {instance.output} with log_chisq_fit = {log_chisq_fit} and {previous_teff} and {bitmask_flag}")
        
        # Note: If FERRE totally fails then it will assign -999 values to the log_chisq_fit. So we have to
        #       check that the log_chisq_fit is actually sensible!
        #       (Or we should only query task instances where the output is sensible!)
        if log_chisq_fit < 0: # TODO: This is a hack.
            log.debug(f"Skipping result for {instance} {instance.output} as log_chisq_fit = {log_chisq_fit}")
            continue
            
        parsed_header = utils.parse_header_path(p["header_path"])
        
        # Penalise chi-sq in the same way they did for DR17.
        # See github.com/sdss/apogee/python/apogee/aspcap/aspcap.py#L658
        if parsed_header["spectral_type"] == "GK" and previous_teff < 3900:
            log.debug(f"Increasing \chisq because spectral type GK")
            log_chisq_fit += np.log10(10)

        bitmask_flag_logg, bitmask_flag_teff = bitmask_flag[-2:]
        if bitmask_flag_logg & bad_grid_edge:
            log.debug(f"Increasing \chisq because logg flag is bad edge")
            log_chisq_fit += np.log10(5)
            
        if bitmask_flag_teff & bad_grid_edge:
            log.debug(f"Increasing \chisq because teff flag is bad edge")
            log_chisq_fit += np.log10(5)
        
        # Is this the best so far?
        if log_chisq_fit < best_tasks[key][0]:
            log.debug(f"Assigning this output to best task as {log_chisq_fit} < {best_tasks[key][0]}: {pk}")
            best_tasks[key] = (log_chisq_fit, pk)
    
    for key, (log_chisq_fit, pk) in best_tasks.items():
        if pk is None:
            log.warning(f"No good task found for key {key}: ({log_chisq_fit}, {pk})")
        else:
            log.info(f"Best task for key {key} with \chi^2 of {log_chisq_fit:.2f} is primary key {pk}")

    if best_tasks:
        return [pk for (log_chisq_fit, pk) in best_tasks.values() if pk is not None]
    else:
        raise AirflowSkipException(f"no task outputs found from {len(pks)} primary keys")
Example #2
    def execute(self, context):
        """
        Create task instances for all the data model identifiers, which could include
        multiple task instances for each data model identifier set.

        :param context:
            The Airflow context dictionary.
        """

        # Get header information.
        grid_info = utils.parse_grid_information(self.header_paths)

        args = (context["dag"].dag_id, context["task"].task_id, context["run_id"])

        # Get parameters from the parent class initialisation that should also be stored.
        common_task_parameters = self.common_task_parameters()

        pks = []
        trees = {}
        
        for data_model_identifiers in self.data_model_identifiers(context):

            parameters = { **common_task_parameters, **data_model_identifiers }

            release = parameters["release"]
            tree = trees.get(release, None)
            if tree is None:
                trees[release] = tree = SDSSPath(release=release)

            path = tree.full(**parameters)
            
            # Generate initial guess(es).
            initial_guesses = []

            # From headers
            try:
                header = getheader(path)

                teff = safe_read_header(header, ("RV_TEFF", "RVTEFF"))
                logg = safe_read_header(header, ("RV_LOGG", "RVLOGG"))
                fe_h = safe_read_header(header, ("RV_FEH", "RVFEH"))

                # Get information relevant for matching initial guess and grids.
                initial_guesses.append(dict(
                    telescope=parameters["telescope"], # important for LSF information
                    mean_fiber=header["MEANFIB"], # important for LSF information
                    teff=teff,
                    logg=logg,
                    metals=fe_h,
                ))

            except Exception:
                log.exception(f"Unable to load relevant headers from path {path}")
                continue
            
            # Add any other initial guesses? From Gaia? etc?
            for initial_guess in initial_guesses:
                for header_path, _ in utils.yield_suitable_grids(grid_info, **initial_guess):
                    parameters.update(
                        header_path=header_path,
                        initial_teff=np.round(initial_guess["teff"], 0),
                        initial_logg=np.round(initial_guess["logg"], 3),
                        initial_metals=np.round(initial_guess["metals"], 3),
                        initial_log10vdop=np.round(utils.approximate_log10_microturbulence(initial_guess["logg"]), 3),
                        initial_o_mg_si_s_ca_ti=0.0,
                        initial_lgvsini=1.0,  # :eyes:
                        initial_c=0.0,
                        initial_n=0.0,
                    )
                    instance = create_task_instance(*args, parameters)
                    pks.append(instance.pk)
                    
                    log.debug(f"Created {instance} with parameters {parameters}")

        if not pks:
            raise AirflowSkipException("No data model identifiers found for this time period.")

        return pks
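The safe_read_header helper used above is defined elsewhere in the source repository; as a reading aid, here is a hypothetical sketch (an assumption, not the repository's implementation) of the behaviour these call sites rely on: try each candidate FITS keyword in turn and fall back to a default when none is present.

def safe_read_header(header, keys, default=None):
    """Hypothetical helper: return the first of `keys` found in `header`."""
    if isinstance(keys, str):
        keys = (keys,)
    for key in keys:
        try:
            return header[key]
        except KeyError:
            continue
    return default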
Example #3
    def execute(self, context):
        if not os.path.exists(os.path.dirname(self.folder)):
            try:
                os.makedirs(os.path.dirname(self.folder))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        folder = self.folder
        if self.str_files is None:
            self.str_files = common.getUpstreamVariable(self, context)
        if self.str_files is None or len(self.str_files) == 0:
            raise AirflowSkipException("there is not files")
        _nc_files = [x for x in self.str_files if ".nc" in x]
        _files = [
            x for x in _nc_files
            if "{}.nc".format(self.output_type) in x
            and (self.lat is None or "{}_{}".format(self.lat[0], self.lon[0]) in x)
            and (self.year is None or "_{}_".format(self.year) in x)
        ]
        _other_files = [x for x in self.str_files if ".nc" not in x]

        kwargs = self.alg_kwargs
        xarrs = {}
        for _f in _files:

            _xarr = common.readNetCDF(_f)
            if len(_xarr.data_vars) == 0:
                raise AirflowSkipException("No data inside the files ")
            xarrs[os.path.basename(_f)] = _xarr
        kwargs["xarrs"] = xarrs
        kwargs["product"] = self.product
        kwargs["folder"] = folder
        kwargs["other_files"] = _other_files
        algorithm_path = (common.ALGORITHMS_FOLDER + "/" + self.algorithm + "/" +
                          self.algorithm + "_" + str(self.version) + ".py")
        with open(algorithm_path, encoding='utf-8') as algorithm_file:
            exec(algorithm_file.read(), kwargs)
        fns = []

        history = u'Creado con CDCOL con el algoritmo {} y  ver. {}'.format(
            self.algorithm, str(self.version))
        if self.lat is not None and self.year is not None:
            _exp = "{}_{}_{}_{}_{}".format(self.task_id, str(self.algorithm),
                                           self.lat[0], self.lon[0], self.year)
        elif self.lat is not None:
            _exp = "{}_{}_{}_{}_all".format(self.task_id, str(self.algorithm),
                                            self.lat[0], self.lon[0])
        elif self.year is not None:
            _exp = "{}_{}_{}_{}_{}".format(self.task_id, str(self.algorithm),
                                           "All", "All", self.year)
        else:
            _exp = "{}_{}_{}_{}_{}".format(self.task_id, str(self.algorithm),
                                           "All", "All", "All")

        if "output" in kwargs:  #output debería ser un xarray
            #Guardar a un archivo...

            output = kwargs["output"]
            if self.to_tiff:
                filename = folder + "{}_output.tif".format(_exp)
                common.write_geotiff_from_xr(filename, output)
                # Next 4 lines uncommented by Aurelio
                #filename = folder + "{}_output.nc".format(_exp)

                #filename = folder + "{}_{}_{}_{}_{}_output.nc".format(self.task_id, str(self.algorithm),
                #                                                       _fn.split("_")[2], _fn.split("_")[3],
                #                                                       _fn.split("_")[4])
                #common.saveNC(output, filename, history)
                # common.translate_netcdf_to_tiff(self.task_id, str(self.algorithm), self.folder, [filename])
            else:
                filename = folder + "{}_output.nc".format(_exp)
                common.saveNC(output, filename, history)
            fns.append(filename)
        if "outputs" in kwargs:
            if self.to_tiff:
                common.write_geotiff_from_xr(filename, ouput, bands)
                for xa in kwargs["outputs"]:
                    filename = folder + "{}_{}.tif".format(_exp, xa)
                    common.write_geotiff_from_xr(filename,
                                                 kwargs["outputs"][xa])
                    fns.append(filename)
            else:
                for xa in kwargs["outputs"]:
                    filename = folder + "{}_{}.tif".format(_exp, xa)
                    common.saveNC(kwargs["outputs"][xa], filename, history)
                    fns.append(filename)
        if "outputtxt" in kwargs:
            filename = folder + "{}.txt".format(_exp)
            with open(filename, "w") as text_file:
                text_file.write(kwargs["outputtxt"])
            fns.append(filename)
        if "outputxcom" in kwargs:
            fns.append(kwargs["outputxcom"])
        return fns
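The exec() call above runs an external algorithm script with the kwargs dict as its global namespace, so the script sees xarrs, product, folder, other_files, and any alg_kwargs entries as plain names, and it is expected to leave its results behind as output, outputs, outputtxt, or outputxcom. As an illustration only, here is a hypothetical minimal algorithm script honouring that contract; it is not taken from the CDCOL repository and the reduction it performs is arbitrary.

# Hypothetical <algorithm>_<version>.py consumed by the exec() call above.
import xarray as xr

# `xarrs` is the dict of input datasets injected by the operator.
combined = xr.concat(list(xarrs.values()), dim="time")

# Picked up by the "output" branch of the operator (saved as GeoTIFF or NetCDF).
output = combined.mean(dim="time")

# Picked up by the "outputtxt" branch (written to a .txt file).
outputtxt = "processed {} input file(s)".format(len(xarrs))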
Example #4
    def execute(self, context):
        if not os.path.exists(os.path.dirname(self.folder)):
            try:
                os.makedirs(os.path.dirname(self.folder))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        folder = self.folder
        dc = datacube.Datacube(app=self.execID)
        kwargs = self.alg_kwargs
        xanm = "xarr"
        start = time.time()
        bands = []
        print(self.time_ranges)
        if self.product['bands'] is not None and len(self.product['bands']) > 0:
            bands = self.product['bands']
        if isinstance(
                self.time_ranges,
                list) and self.alg_folder == common.COMPLETE_ALGORITHMS_FOLDER:
            i = 0
            for t in self.time_ranges:
                kwargs[xanm + str(i)] = dc.load(product=self.product['name'],
                                                measurements=bands,
                                                longitude=self.lon,
                                                latitude=self.lat,
                                                time=t)
                if len(kwargs[xanm + str(i)].data_vars) == 0:
                    print("ERROR: NO HAY DATOS EN LA ZONA")
                    open(
                        posixpath.join(common.LOGS_FOLDER, self.execID,
                                       self.task_id, "no_data.lock"),
                        "w+").close()
                    raise AirflowSkipException("No hay datos en la zona")
                i += 1
        else:
            kwargs[xanm + str(0)] = dc.load(product=self.product['name'],
                                            measurements=bands,
                                            longitude=self.lon,
                                            latitude=self.lat,
                                            time=self.time_ranges)
            if len(kwargs[xanm + str(0)].data_vars) == 0:
                print("ERROR: NO HAY DATOS EN LA ZONA")
                open(
                    posixpath.join(common.LOGS_FOLDER, self.execID,
                                   self.task_id, "no_data.lock"),
                    "w+").close()
                raise AirflowSkipException("No hay datos en la zona")
        # kwargs[xanm] = dc.load(product=self.product['name'], longitude=self.lon, latitude=self.lat, time=self.time_ranges)

        dc.close()
        end = time.time()
        logging.info('TIEMPO CONSULTA:' + str((end - start)))
        kwargs["product"] = self.product
        kwargs["folder"] = folder
        path = posixpath.join(self.alg_folder, self.algorithm,
                              self.algorithm + "_" + str(self.version) + ".py")
        exec(open(path, encoding='utf-8').read(), kwargs)
        fns = []

        history = u'Creado con CDCOL con el algoritmo {} y  ver. {}'.format(
            self.algorithm, str(self.version))
        if "output" in kwargs:  # output debería ser un xarray
            # Guardar a un archivo...

            output = kwargs["output"]
            if self.to_tiff:
                filename = folder + "{}_{}_{}_{}_{}_output.tif".format(
                    self.task_id, str(self.algorithm),
                    self.lat[0], self.lon[0],
                    re.sub(r'[^\w_.)(-]', '', str(self.time_ranges)))
                common.write_geotiff_from_xr(filename, output)
                #common.saveNC(output, filename, history)
            else:
                filename = folder + "{}_{}_{}_{}_{}_output.nc".format(
                    self.task_id, str(self.algorithm),
                    self.lat[0], self.lon[0],
                    re.sub(r'[^\w_.)(-]', '', str(self.time_ranges)))
                common.saveNC(output, filename, history)
            fns.append(filename)
        if "outputs" in kwargs:

            if self.to_tiff:

                for xa in kwargs["outputs"]:
                    filename = folder + "{}_{}_{}_{}_{}_{}.tif".format(
                        self.task_id, str(self.algorithm), self.lat[0],
                        self.lon[0],
                        re.sub(r'[^\w_.)(-]', '', str(self.time_ranges)), xa)
                    common.write_geotiff_from_xr(filename,
                                                 kwargs["outputs"][xa])
                    fns.append(filename)
            else:
                for xa in kwargs["outputs"]:
                    filename = folder + "{}_{}_{}_{}_{}_{}.nc".format(
                        self.task_id, str(self.algorithm), self.lat[0],
                        self.lon[0],
                        re.sub(r'[^\w_.)(-]', '', str(self.time_ranges)), xa)
                    common.saveNC(kwargs["outputs"][xa], filename, history)
                    fns.append(filename)
        if "outputtxt" in kwargs:
            filename = folder + "{}_{}_{}.txt".format(
                self.lat[0], self.lon[0],
                re.sub(r'[^\w_.)(-]', '', str(self.time_ranges)))
            with open(filename, "w") as text_file:
                text_file.write(kwargs["outputtxt"])
            fns.append(filename)
        if "outputxcom" in kwargs:
            fns.append(kwargs["outputxcom"])
        return fns
Example #5
def trigger_null(context):
    raise AirflowSkipException('Intentionally not doing it')
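A callable with this single (context) parameter is normally handed the Airflow context explicitly. As a hypothetical wiring sketch only (the task id is made up, and this assumes Airflow 2.x inside an existing with DAG(...) block), a task that always skips itself could look like this:

from airflow.operators.python import PythonOperator

# Pass the runtime context through to trigger_null; the raised
# AirflowSkipException marks the task instance as skipped.
skip_task = PythonOperator(
    task_id="intentionally_skip",
    python_callable=lambda **context: trigger_null(context),
)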
Example #6
def check_events_for_skips(events):
    check.list_param(events, 'events', of_type=DagsterEvent)
    skipped = any([e.event_type_value == DagsterEventType.STEP_SKIPPED.value for e in events])
    if skipped:
        raise AirflowSkipException('Dagster emitted skip event, skipping execution in Airflow')
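A hypothetical usage sketch (run_plan_and_collect_events is a made-up placeholder, not a dagster-airflow API): whichever operator produced the DagsterEvent list calls this check so that a Dagster-level skip surfaces as an Airflow skip rather than a success.

def airflow_task_body(**context):
    # Placeholder for however the surrounding operator executes its Dagster
    # plan and gathers the emitted DagsterEvent objects.
    events = run_plan_and_collect_events(context)

    # Raises AirflowSkipException when Dagster skipped any step, so Airflow
    # marks this task instance as skipped.
    check_events_for_skips(events)
    return events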