Example #1
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
        sim_tasks = Task.get_task_of_type(prior_tasks, SNANASimulation, DataPrep)
        classifier_tasks = Task.get_task_of_type(prior_tasks, Classifier)

        def _get_aggregator_dir(base_output_dir, stage_number, agg_name):
            return f"{base_output_dir}/{stage_number}_AGG/{agg_name}"

        tasks = []

        # Check for recalibration, and if so, find that task first
        for agg_name in c.get("AGGREGATION", []):
            config = c["AGGREGATION"][agg_name]
            if config is None:
                config = {}
            options = config.get("OPTS", {})
            mask = config.get("MASK", "")
            mask_sim = config.get("MASK_SIM", "")
            mask_clas = config.get("MASK_CLAS", "")
            recalibration = config.get("RECALIBRATION")
            recal_simtask = None
            recal_aggtask = None
            if recalibration:
                recal_sim = [i for i, s in enumerate(sim_tasks) if s.name == recalibration]

                if len(recal_sim) == 0:
                    Task.fail_config(f"Recalibration sim {recalibration} not in the list of available sims: {[s.name for s in sim_tasks]}")
                elif len(recal_sim) > 1:
                    Task.fail_config(f"Recalibration sim {recalibration} matched multiple sims: {[s.name for s in sim_tasks]}")

                # Move the recal sim task to the front of the queue so it executes first
                recal_sim_index = recal_sim[0]
                recal_simtask = sim_tasks[recal_sim_index]
                sim_tasks.insert(0, sim_tasks.pop(recal_sim_index))

            for sim_task in sim_tasks:
                # Skip sims that fail the masks, but never skip the recalibration sim itself
                if (mask_sim not in sim_task.name or mask not in sim_task.name) and recal_simtask != sim_task:
                    continue

                agg_name2 = f"{agg_name}_{sim_task.name}"
                deps = [
                    c
                    for c in classifier_tasks
                    if mask in c.name and mask_clas in c.name and c.mode == Classifier.PREDICT and c.get_simulation_dependency() == sim_task
                ]
                if len(deps) == 0:
                    deps = [sim_task]

                if recalibration and sim_task != recal_simtask:
                    if recal_aggtask is None:
                        Task.fail_config(f"The aggregator task for {recalibration} has not been made yet. Sam probably screwed up dependency order.")
                    else:
                        deps.append(recal_aggtask)
                a = Aggregator(agg_name2, _get_aggregator_dir(base_output_dir, stage_number, agg_name2), config, deps, options, recal_aggtask)
                if sim_task == recal_simtask:
                    recal_aggtask = a
                Task.logger.info(f"Creating aggregation task {agg_name2} for {sim_task.name} with {a.num_jobs} jobs")
                tasks.append(a)

        return tasks
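For orientation, below is a minimal sketch of the configuration dict `c` that this AGGREGATION parser expects, assuming the pipeline YAML has already been loaded into nested dicts. The key names are taken from the `config.get(...)` calls above; the task names and values are illustrative placeholders, not real tasks.

    # Illustrative AGGREGATION block; task names and values are placeholders.
    c = {
        "AGGREGATION": {
            "AGG_EXAMPLE": {
                "MASK": "DES",                 # substring match on both sim and classifier names
                "MASK_SIM": "",                # substring match on sim names only
                "MASK_CLAS": "",               # substring match on classifier names only
                "RECALIBRATION": "SIM_RECAL",  # optional: sim whose aggregation recalibrates the rest
                "OPTS": {},                    # passed through to the Aggregator task
            },
        },
    }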
Example #2
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):
        agg_tasks = Task.get_task_of_type(prior_tasks, Aggregator)
        lcfit_tasks = Task.get_task_of_type(prior_tasks, SNANALightCurveFit)
        tasks = []

        def _get_merge_output_dir(base_output_dir, stage_number, merge_name,
                                  lcfit_name):
            return f"{base_output_dir}/{stage_number}_MERGE/{merge_name}_{lcfit_name}"

        for name in c.get("MERGE", []):
            num_gen = 0
            config = c["MERGE"].get(name, {})
            if config is None:
                config = {}
            options = config.get("OPTS", {})
            mask = config.get("MASK", "")
            mask_sim = config.get("MASK_SIM", "")
            mask_lc = config.get("MASK_FIT", "")
            mask_agg = config.get("MASK_AGG", "")

            for lcfit in lcfit_tasks:
                if mask and mask not in lcfit.name:
                    continue
                if mask_lc and mask_lc not in lcfit.name:
                    continue
                sim = lcfit.get_dep(SNANASimulation, DataPrep)
                if mask and mask not in sim.name:
                    continue
                if mask_sim and mask_sim not in sim.name:
                    continue

                for agg in agg_tasks:
                    if mask_agg and mask_agg not in agg.name:
                        continue
                    if mask and mask not in agg.name:
                        continue

                    # Check if the sim is the same for both
                    if sim != agg.get_underlying_sim_task():
                        continue
                    num_gen += 1

                    merge_name2 = f"{name}_{lcfit.name}"
                    task = Merger(
                        merge_name2,
                        _get_merge_output_dir(base_output_dir, stage_number,
                                              name, lcfit.name), config,
                        [lcfit, agg], options)
                    Task.logger.info(
                        f"Creating merge task {merge_name2} for {lcfit.name} and {agg.name} with {task.num_jobs} jobs"
                    )
                    tasks.append(task)
            if num_gen == 0:
                Task.fail_config(
                    f"Merger {name} with mask {mask} matched no combination of aggregators and fits"
                )
        return tasks
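Likewise, a hedged sketch of a MERGE block showing the four masks the nested loops above check; all names are placeholders.

    # Illustrative MERGE block; names are placeholders.
    c = {
        "MERGE": {
            "MERGE_EXAMPLE": {
                "MASK": "",      # checked against lcfit, sim, and aggregation names
                "MASK_SIM": "",  # checked against the underlying sim name
                "MASK_FIT": "",  # checked against the light-curve fit name
                "MASK_AGG": "",  # checked against the aggregation name
                "OPTS": {},
            },
        },
    }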
Example #3
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):

        create_cov_tasks = Task.get_task_of_type(prior_tasks, CreateCov)

        def _get_wfit_dir(base_output_dir, stage_number, name):
            return f"{base_output_dir}/{stage_number}_COSMOFIT/WFIT/{name}"

        tasks = []
        key = "WFIT"
        for name in c.get(key, []):
            config = c[key].get(name, {})
            name = f"WFIT_{name}"
            options = config.get("OPTS", {})

            mask = config.get("MASK", "")

            ctasks = [
                ctask for ctask in create_cov_tasks if mask in ctask.name
            ]

            t = WFit(name, _get_wfit_dir(base_output_dir, stage_number, name),
                     ctasks, config, options, global_config)
            Task.logger.info(f"Creating WFit task {name} {t.num_jobs} jobs")
            tasks.append(t)

            if len(create_cov_tasks) == 0:
                Task.fail_config(
                    f"WFit task {name} has no create_cov task to run on!")
        return tasks
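A minimal illustrative WFIT block; only MASK and OPTS are read by the code above, and the entry name is prefixed with WFIT_. The names here are placeholders.

    # Illustrative WFIT block; names are placeholders.
    c = {
        "WFIT": {
            "SN_CMB": {            # becomes task name WFIT_SN_CMB
                "MASK": "ALL",     # substring match against CreateCov task names
                "OPTS": {},
            },
        },
    }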
Example #4
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):

        create_cov_tasks = Task.get_task_of_type(prior_tasks, CreateCov)

        def _get_cosmomc_dir(base_output_dir, stage_number, name):
            return f"{base_output_dir}/{stage_number}_COSMOFIT/COSMOMC/{name}"

        tasks = []
        key = "COSMOMC"
        for cname in c.get(key, []):
            config = c[key].get(cname, {})
            options = config.get("OPTS", {})

            mask = config.get("MASK_CREATE_COV", config.get("MASK", ""))

            # Check if this is static. Could scan the folder, but we don't have all the chains yet.
            # TODO: Update this when I have all the chains
            if options.get("INI") in ["cmb_omw", "cmb_omol"]:
                a = CosmoMC(
                    cname,
                    _get_cosmomc_dir(base_output_dir, stage_number, cname),
                    config, options, global_config)
                Task.logger.info(
                    f"Creating CosmoMC task {cname} for {a.num_jobs} jobs")
                tasks.append(a)

            else:
                for ctask in create_cov_tasks:
                    if mask not in ctask.name:
                        continue
                    name = f"COSMOMC_{cname}_{ctask.name}"
                    a = CosmoMC(name,
                                _get_cosmomc_dir(base_output_dir, stage_number,
                                                 name),
                                config,
                                options,
                                global_config,
                                dependencies=[ctask])
                    Task.logger.info(
                        f"Creating CosmoMC task {name} for {ctask.name} with {a.num_jobs} jobs"
                    )
                    tasks.append(a)

                if len(create_cov_tasks) == 0:
                    Task.fail_config(
                        f"CosmoMC task {cname} has no create_cov task to run on!"
                    )

        return tasks
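An illustrative COSMOMC block follows. The only non-placeholder strings are "cmb_omw" and "cmb_omol", the two INI values the code treats as static (no create_cov dependency); any other INI creates one task per matching CreateCov.

    # Illustrative COSMOMC block; task names are placeholders.
    c = {
        "COSMOMC": {
            "OMW_EXAMPLE": {
                "MASK_CREATE_COV": "",       # falls back to MASK if not given
                "OPTS": {"INI": "cmb_omw"},  # "cmb_omw"/"cmb_omol" skip the create_cov dependency
            },
        },
    }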
Example #5
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):

        biascor_tasks = Task.get_task_of_type(prior_tasks, BiasCor)

        def _get_createcov_dir(base_output_dir, stage_number, name):
            return f"{base_output_dir}/{stage_number}_CREATE_COV/{name}"

        tasks = []
        for cname in c.get("CREATE_COV", []):
            config = c["CREATE_COV"][cname]
            if config is None:
                config = {}
            options = config.get("OPTS", {})
            mask = config.get("MASK", config.get("MASK_BIASCOR", ""))

            for btask in biascor_tasks:
                if mask not in btask.name:
                    continue

                num = len(btask.output["subdirs"])
                for i in range(num):
                    ii = "" if num == 1 else f"_{i + 1}"

                    name = f"{cname}_{btask.name}{ii}"
                    a = CreateCov(name,
                                  _get_createcov_dir(base_output_dir,
                                                     stage_number, name),
                                  config,
                                  options,
                                  global_config,
                                  dependencies=[btask],
                                  index=i)
                    Task.logger.info(
                        f"Creating createcov task {name} for {btask.name} with {a.num_jobs} jobs"
                    )
                    tasks.append(a)

            if len(biascor_tasks) == 0:
                Task.fail_config(
                    f"Create cov task {cname} has no biascor task to run on!")

        return tasks
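An illustrative CREATE_COV block; MASK (or MASK_BIASCOR) is matched against BiasCor task names, and one CreateCov task is generated per biascor output subdirectory. Names are placeholders.

    # Illustrative CREATE_COV block; names are placeholders.
    c = {
        "CREATE_COV": {
            "COV_EXAMPLE": {
                "MASK": "ALL",  # or MASK_BIASCOR; matched against BiasCor task names
                "OPTS": {},     # one CreateCov task is created per biascor subdir
            },
        },
    }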
Example #6
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):
        from pippin.classifiers.factory import ClassifierFactory

        def _get_clas_output_dir(base_output_dir,
                                 stage_number,
                                 sim_name,
                                 fit_name,
                                 clas_name,
                                 index=None,
                                 extra=None):
            sim_name = "" if sim_name is None or fit_name is not None else "_" + sim_name
            fit_name = "" if fit_name is None else "_" + fit_name
            extra_name = "" if extra is None else "_" + extra
            index = "" if index is None else f"_{index}"
            return f"{base_output_dir}/{stage_number}_CLAS/{clas_name}{index}{sim_name}{fit_name}{extra_name}"

        def get_num_ranseed(sim_task, lcfit_task):
            if sim_task is not None:
                return len(sim_task.output["sim_folders"])
            if lcfit_task is not None:
                return len(lcfit_task.output["fitres_dirs"])
            raise ValueError(
                "Classifier dependency has no sim_task or lcfit_task?")

        tasks = []
        lcfit_tasks = Task.get_task_of_type(prior_tasks, SNANALightCurveFit)
        sim_tasks = Task.get_task_of_type(prior_tasks, DataPrep,
                                          SNANASimulation)
        for clas_name in c.get("CLASSIFICATION", []):
            config = c["CLASSIFICATION"][clas_name]
            name = config["CLASSIFIER"]
            cls = ClassifierFactory.get(name)
            options = config.get("OPTS", {})
            if "MODE" not in config:
                Task.fail_config(
                    f"Classifier task {clas_name} needs to specify MODE as train or predict"
                )
            mode = config["MODE"].lower()
            assert mode in ["train", "predict"], "MODE should be either train or predict"
            if mode == "train":
                mode = Classifier.TRAIN
            else:
                mode = Classifier.PREDICT

            # Validate that train is not used on certain classifiers
            if mode == Classifier.TRAIN:
                assert name not in [
                    "PerfectClassifier", "UnityClassifier", "FitProbClassifier"
                ], f"Can not use train mode with {name}"

            needs_sim, needs_lc = cls.get_requirements(options)

            runs = []
            if needs_sim and needs_lc:
                runs = [(l.dependencies[0], l) for l in lcfit_tasks]
            elif needs_sim:
                runs = [(s, None) for s in sim_tasks]
            elif needs_lc:
                runs = [(l.dependencies[0], l) for l in lcfit_tasks]
            else:
                Task.logger.warning(
                    f"Classifier {name} does not need sims or fits. Wat.")

            num_gen = 0
            mask = config.get("MASK", "")
            mask_sim = config.get("MASK_SIM", "")
            mask_fit = config.get("MASK_FIT", "")
            for s, l in runs:

                sim_name = s.name if s is not None else None
                fit_name = l.name if l is not None else None
                matched_sim = True
                matched_fit = True
                # MASK applies to both sim and fit names; MASK_SIM and MASK_FIT are specific
                if mask:
                    matched_sim = matched_sim and mask in sim_name
                if mask_sim:
                    matched_sim = matched_sim and mask_sim in sim_name
                if mask and fit_name is not None:
                    matched_fit = matched_fit and mask in fit_name
                if mask_fit and fit_name is not None:
                    matched_fit = matched_fit and mask_fit in fit_name
                if not matched_fit or not matched_sim:
                    continue
                deps = []
                if s is not None:
                    deps.append(s)
                if l is not None:
                    deps.append(l)

                model = options.get("MODEL")

                # Validate to make sure training samples only have one sim.
                if mode == Classifier.TRAIN:
                    if s is not None:
                        folders = s.output["sim_folders"]
                        assert (
                            len(folders) == 1
                        ), f"Training requires one version of the sim, you have {len(folders)} for sim task {s}. Make sure your training sim doesn't set RANSEED_CHANGE"
                    if l is not None:
                        folders = l.output["fitres_dirs"]
                        assert (
                            len(folders) == 1
                        ), f"Training requires one version of the lcfits, you have {len(folders)} for lcfit task {l}. Make sure your training sim doesn't set RANSEED_CHANGE"
                if model is not None:
                    if "/" in model or "." in model:
                        potential_path = get_output_loc(model)
                        if os.path.exists(potential_path):
                            extra = os.path.basename(
                                os.path.dirname(potential_path))

                            # Nasty duplicate code, TODO fix this
                            indexes = get_num_ranseed(s, l)
                            for i in range(indexes):
                                num = i + 1 if indexes > 1 else None
                                clas_output_dir = _get_clas_output_dir(
                                    base_output_dir,
                                    stage_number,
                                    sim_name,
                                    fit_name,
                                    clas_name,
                                    index=num,
                                    extra=extra)
                                cc = cls(clas_name,
                                         clas_output_dir,
                                         config,
                                         deps,
                                         mode,
                                         options,
                                         index=i,
                                         model_name=extra)
                                Task.logger.info(
                                    f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}"
                                )
                                num_gen += 1
                                tasks.append(cc)

                        else:
                            Task.fail_config(
                                f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}"
                            )
                    else:
                        for t in tasks:
                            if model == t.name:
                                # deps.append(t)
                                extra = t.get_unique_name()

                                assert t.__class__ == cls, f"Model {clas_name} with class {cls} has model {model} with class {t.__class__}, they should match!"

                                indexes = get_num_ranseed(s, l)
                                for i in range(indexes):
                                    num = i + 1 if indexes > 1 else None
                                    clas_output_dir = _get_clas_output_dir(
                                        base_output_dir,
                                        stage_number,
                                        sim_name,
                                        fit_name,
                                        clas_name,
                                        index=num,
                                        extra=extra)
                                    cc = cls(clas_name,
                                             clas_output_dir,
                                             config,
                                             deps + [t],
                                             mode,
                                             options,
                                             index=i)
                                    Task.logger.info(
                                        f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}"
                                    )
                                    num_gen += 1
                                    tasks.append(cc)
                else:

                    indexes = get_num_ranseed(s, l)
                    for i in range(indexes):
                        num = i + 1 if indexes > 1 else None
                        clas_output_dir = _get_clas_output_dir(base_output_dir,
                                                               stage_number,
                                                               sim_name,
                                                               fit_name,
                                                               clas_name,
                                                               index=num)
                        cc = cls(clas_name,
                                 clas_output_dir,
                                 config,
                                 deps,
                                 mode,
                                 options,
                                 index=i)
                        Task.logger.info(
                            f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}"
                        )
                        num_gen += 1
                        tasks.append(cc)

            if num_gen == 0:
                Task.fail_config(
                    f"Classifier {clas_name} with masks |{mask}|{mask_sim}|{mask_fit}| matched no combination of sims and fits"
                )
        return tasks
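A hedged sketch of a CLASSIFICATION block covering the keys this code reads; the classifier class name is one of those mentioned above, everything else is a placeholder.

    # Illustrative CLASSIFICATION block; the classifier class comes from the factory,
    # everything else is a placeholder.
    c = {
        "CLASSIFICATION": {
            "EXAMPLE_CLAS": {
                "CLASSIFIER": "FitProbClassifier",  # resolved via ClassifierFactory.get
                "MODE": "predict",                  # "train" or "predict"
                "MASK": "",                         # substring match on sim and fit names
                "MASK_SIM": "",
                "MASK_FIT": "",
                "OPTS": {},  # may also carry MODEL: a path or the name of a trained classifier task
            },
        },
    }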
Example #7
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):
        merge_tasks = Task.get_task_of_type(prior_tasks, Merger)
        prob_cols = {
            k: v
            for d in [t.output["classifier_merge"] for t in merge_tasks]
            for k, v in d.items()
        }
        classifier_tasks = Task.get_task_of_type(prior_tasks, Classifier)
        tasks = []

        def _get_biascor_output_dir(base_output_dir, stage_number,
                                    biascor_name):
            return f"{base_output_dir}/{stage_number}_BIASCOR/{biascor_name}"

        for name in c.get("BIASCOR", []):
            gname = name
            config = c["BIASCOR"][name]
            options = config.get("OPTS", {})
            deps = []

            # Create dict but swap out the names for tasks
            # do this for key 0 and for muopts
            # modify config directly
            # create copy to start with to keep labels if needed
            config_copy = copy.deepcopy(config)

            # Should return a single classifier task which maps to the desired prob column
            def resolve_classifiers(names):
                task = [c for c in classifier_tasks if c.name in names]
                if len(task) == 0:
                    if len(names) > 1:
                        Task.fail_config(
                            f"CLASSIFIERS {names} do not match any classifiers. If these are prob column names, you must specify only one!"
                        )
                    Task.logger.info(
                        f"CLASSIFIERS {names} matched no classifiers. Checking prob column names instead."
                    )
                    task = [
                        c for c in classifier_tasks
                        if prob_cols[c.name] in names
                    ]
                    if len(task) == 0:
                        choices = [prob_cols[c.name] for c in classifier_tasks]
                        message = f"Unable to resolve classifiers {names} from list of classifiers {classifier_tasks} using either name or prob columns {choices}"
                        Task.fail_config(message)
                    else:
                        task = [task[0]]
                elif len(task) > 1:
                    choices = list(set([prob_cols[c.name] for c in task]))
                    if len(choices) == 1:
                        task = [task[0]]
                    else:
                        Task.fail_config(
                            f"Found multiple classifiers. Please instead specify a column name. Your choices: {choices}"
                        )
                return task[0]  # We only care about the prob column name

            def resolve_merged_fitres_files(name, classifier_name):
                task = [
                    m for m in merge_tasks if m.output["lcfit_name"] == name
                ]
                if len(task) == 0:
                    valid = [m.output["lcfit_name"] for m in merge_tasks]
                    message = f"Unable to resolve merge {name} from list of merge_tasks. There are valid options: {valid}"
                    Task.fail_config(message)
                elif len(task) > 1:
                    message = f"Resolved multiple merge tasks {task} for name {name}"
                    Task.fail_config(message)
                else:
                    if (classifier_name is not None
                            and classifier_name not in task[0].output["classifier_names"]):
                        if prob_cols[classifier_name] not in [
                                prob_cols[n] for n in task[0].output["classifier_names"]
                        ]:
                            Task.logger.warning(
                                f"When constructing Biascor {gname}, merge input {name} does not have classifier {classifier_name}. "
                                f"If this is a spec confirmed sample, or an EXTERNAL task, all good, else check this."
                            )
                    return task[0]

            # Ensure classifiers point to the same prob column
            def validate_classifiers(classifier_names):
                prob_col = []
                for name in classifier_names:
                    col = prob_cols.get(name)
                    if col is None:
                        # Check whether it is instead the prob_col name
                        if name in prob_cols.values():
                            prob_col.append(name)
                        else:
                            Task.fail_config(
                                f"Classifier {name} has no prob column name in {prob_cols}. This should never happen!"
                            )
                    else:
                        prob_col.append(col)
                if len(set(prob_col)) > 1:
                    Task.fail_config(
                        f"Classifiers {classifier_names} map to different probability columns: {prob_cols}, you may need to map them to the same name via MERGE_CLASSIFIERS in the AGGREGATION stage."
                    )
                else:
                    Task.logger.debug(
                        f"Classifiers {classifier_names} map to {prob_col[0]}")

            def resolve_conf(subdict, default=None):
                """ Resolve the sub-dictionary and keep track of all the dependencies """
                deps = []

                # If this is a muopt, allow access to the base config's resolution
                is_base = default is None
                if is_base:
                    default = {}

                # Get the specific classifier
                classifier_names = subdict.get(
                    "CLASSIFIER")  # Specific classifier name
                if classifier_names is not None:
                    classifier_names = ensure_list(classifier_names)
                    validate_classifiers(classifier_names)
                #Task.logger.debug(f"XXX names: {classifier_names}")
                # Only if all classifiers point to the same prob_column should you continue
                classifier_task = None
                if classifier_names is not None:
                    classifier_task = resolve_classifiers(classifier_names)
                #Task.logger.debug(f"XXX tasks: {classifier_task}")
                classifier_dep = classifier_task or default.get(
                    "CLASSIFIER")  # For resolving merge tasks
                if classifier_dep is not None:
                    classifier_dep = classifier_dep.name
                #Task.logger.debug(f"XXX deps: {classifier_dep}")
                if "CLASSIFIER" in subdict:
                    subdict["CLASSIFIER"] = classifier_task
                    if classifier_task is not None:
                        deps.append(classifier_task)
                #Task.logger.debug(f"XXX global deps: {deps}")

                # Get the Ia sims
                simfile_ia = subdict.get("SIMFILE_BIASCOR")
                if is_base and simfile_ia is None:
                    Task.fail_config(
                        f"You must specify SIMFILE_BIASCOR for the default biascor. Supply a simulation name that has a merged output"
                    )
                if simfile_ia is not None:
                    simfile_ia = ensure_list(simfile_ia)
                    simfile_ia_tasks = [
                        resolve_merged_fitres_files(s, classifier_dep)
                        for s in simfile_ia
                    ]
                    deps += simfile_ia_tasks
                    subdict["SIMFILE_BIASCOR"] = simfile_ia_tasks

                # Resolve the cc sims
                simfile_cc = subdict.get("SIMFILE_CCPRIOR")
                if is_base and simfile_cc is None:
                    message = "No SIMFILE_CCPRIOR specified. Hope you're doing a Ia-only analysis"
                    Task.logger.warning(message)
                if simfile_cc is not None:
                    simfile_cc = ensure_list(simfile_cc)
                    simfile_cc_tasks = [
                        resolve_merged_fitres_files(s, classifier_dep)
                        for s in simfile_cc
                    ]
                    deps += simfile_cc_tasks
                    subdict["SIMFILE_CCPRIOR"] = simfile_cc_tasks

                return deps  # Changes to dict are by ref, will modify original

            deps += resolve_conf(config)
            # Resolve the data section
            data_names = config.get("DATA")
            if data_names is None:
                Task.fail_config(
                    "For BIASCOR tasks you need to specify an input DATA which is a mask for a merged task"
                )
            data_names = ensure_list(data_names)
            class_task = config.get("CLASSIFIER")
            class_name = class_task.name if class_task is not None else None
            data_tasks = [
                resolve_merged_fitres_files(s, class_name) for s in data_names
            ]
            deps += data_tasks
            config["DATA"] = data_tasks

            config["PROB_COLS"] = prob_cols

            # Resolve every MUOPT
            muopts = config.get("MUOPTS", {})
            for label, mu_conf in muopts.items():
                deps += resolve_conf(mu_conf, default=config)

            task = BiasCor(
                name,
                _get_biascor_output_dir(base_output_dir, stage_number, name),
                config, deps, options, global_config)
            Task.logger.info(
                f"Creating biascor task {name} with {task.num_jobs} jobs")
            tasks.append(task)

        return tasks
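Finally, an illustrative BIASCOR block with the keys resolved above; every task name is a placeholder for something defined in an earlier stage, DATA and the SIMFILE keys are matched against merge-task lcfit names, and MUOPTS entries are resolved with the same helper as the base keys.

    # Illustrative BIASCOR block; every name is a placeholder for a task from an earlier stage.
    c = {
        "BIASCOR": {
            "BCOR_EXAMPLE": {
                "DATA": ["LCFIT_DATA"],               # resolved against merge task lcfit names
                "SIMFILE_BIASCOR": ["LCFIT_IA_SIM"],  # Ia bias-correction sims (merged outputs)
                "SIMFILE_CCPRIOR": ["LCFIT_CC_SIM"],  # optional core-collapse prior sims
                "CLASSIFIER": "EXAMPLE_CLAS",         # classifier task name or its prob column
                "MUOPTS": {
                    "ALT_BIASCOR": {"SIMFILE_BIASCOR": ["LCFIT_IA_SIM_ALT"]},
                },
                "OPTS": {},
            },
        },
    }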