Exemplo n.º 1
0
def main(cfg_name):
    if not cfg_name:
        print(
            f"{_NAME_}: ERROR: No config file specified; set"
            " _PBENCH_SERVER_CONFIG env variable or use --config <file> on the"
            " command line",
            file=sys.stderr,
        )
        return 2

    try:
        config = PbenchServerConfig(cfg_name)
    except BadConfig as e:
        print(f"{_NAME_}: {e} (config file {cfg_name})", file=sys.stderr)
        return 1

    logger = get_pbench_logger(_NAME_, config)

    qdir, receive_dir = fetch_config_val(config, logger)

    if qdir is None and receive_dir is None:
        return 2

    qdir_md5 = qdirs_check("quarantine", Path(qdir, "md5-002"), logger)
    duplicates = qdirs_check("duplicates", Path(qdir, "duplicates-002"),
                             logger)

    # The following directory holds tarballs that are quarantined because
    # of operational errors on the server. They should be retried after
    # the problem is fixed: basically, move them back into the reception
    # area for 002 agents and wait.
    errors = qdirs_check("errors", Path(qdir, "errors-002"), logger)

    if qdir_md5 is None or duplicates is None or errors is None:
        return 1

    counts = process_tb(config, logger, receive_dir, qdir_md5, duplicates,
                        errors)

    result_string = (f"{config.TS}: Processed {counts.ntotal} entries,"
                     f" {counts.ntbs} tarballs successful,"
                     f" {counts.nquarantined} quarantined tarballs,"
                     f" {counts.ndups} duplicately-named tarballs,"
                     f" {counts.nerrs} errors.")

    logger.info(result_string)

    # prepare and send report
    with tempfile.NamedTemporaryFile(mode="w+t", dir=config.TMP) as reportfp:
        reportfp.write(f"{counts.nstatus}{result_string}\n")
        reportfp.seek(0)

        report = Report(config, _NAME_)
        report.init_report_template()
        try:
            report.post_status(config.timestamp(), "status", reportfp.name)
        except Exception as exc:
            logger.warning("Report post Unsuccesful: '{}'", exc)

    return 0
Exemplo n.º 2
0
    def process_tb(self, tarballs):
        """Process Tarballs For Indexing and creates report

            "tarballs" - List of tarball, it is the second value of
                the tuple returned by collect_tb() """
        res = 0
        idxctx = self.idxctx
        error_code = self.error_code

        tb_deque = deque(sorted(tarballs))

        # At this point, tarballs contains a list of tar balls sorted by size
        # that were available as symlinks in the various 'linksrc' directories.
        idxctx.logger.debug("Preparing to index {:d} tar balls", len(tb_deque))

        try:
            # Now that we are ready to begin the actual indexing step, ensure we
            # have the proper index templates in place.
            idxctx.logger.debug("update_templates [start]")
            idxctx.templates.update_templates(idxctx.es)
        except TemplateError as e:
            res = self.emit_error(idxctx.logger.error,
                                  "TEMPLATE_CREATION_ERROR", e)
        except SigTermException:
            # Re-raise a SIGTERM to avoid it being lumped in with general
            # exception handling below.
            raise
        except Exception:
            idxctx.logger.exception(
                "update_templates [end]: Unexpected template"
                " processing error")
            res = error_code["GENERIC_ERROR"]
        else:
            idxctx.logger.debug("update_templates [end]")
            res = error_code["OK"]

        if not res.success:
            # Exit early if we encounter any errors.
            return res.value

        report = Report(
            idxctx.config,
            self.name,
            es=idxctx.es,
            pid=idxctx.getpid(),
            group_id=idxctx.getgid(),
            user_id=idxctx.getuid(),
            hostname=idxctx.gethostname(),
            version=VERSION,
            templates=idxctx.templates,
        )
        # We use the "start" report ID as the tracking ID for all indexed
        # documents.
        try:
            tracking_id = report.post_status(tstos(idxctx.time()), "start")
        except SigTermException:
            # Re-raise a SIGTERM to avoid it being lumped in with general
            # exception handling below.
            raise
        except Exception:
            idxctx.logger.error("Failed to post initial report status")
            return error_code["GENERIC_ERROR"].value
        else:
            idxctx.set_tracking_id(tracking_id)

        with tempfile.TemporaryDirectory(prefix=f"{self.name}.",
                                         dir=idxctx.config.TMP) as tmpdir:
            idxctx.logger.debug("start processing list of tar balls")
            tb_list = Path(tmpdir, f"{self.name}.{idxctx.TS}.list")
            try:
                with tb_list.open(mode="w") as lfp:
                    # Write out all the tar balls we are processing so external
                    # viewers can follow along from home.
                    for size, controller, tb in tarballs:
                        print(f"{size:20d} {controller} {tb}", file=lfp)

                indexed = Path(tmpdir, f"{self.name}.{idxctx.TS}.indexed")
                erred = Path(tmpdir, f"{self.name}.{idxctx.TS}.erred")
                skipped = Path(tmpdir, f"{self.name}.{idxctx.TS}.skipped")
                ie_filepath = Path(
                    tmpdir, f"{self.name}.{idxctx.TS}.indexing-errors.json")

                # We use a list object here so that when we close over this
                # variable in the handler, the list object will be closed over,
                # but not its contents.
                sigquit_interrupt = [False]

                def sigquit_handler(*args):
                    sigquit_interrupt[0] = True

                sighup_interrupt = [False]

                def sighup_handler(*args):
                    sighup_interrupt[0] = True

                signal.signal(signal.SIGQUIT, sigquit_handler)
                signal.signal(signal.SIGHUP, sighup_handler)
                count_processed_tb = 0

                try:
                    while len(tb_deque) > 0:
                        size, controller, tb = tb_deque.popleft()
                        # Sanity check source tar ball path
                        linksrc_dir = Path(tb).parent
                        linksrc_dirname = linksrc_dir.name
                        count_processed_tb += 1
                        assert linksrc_dirname == self.linksrc, (
                            f"Logic bomb!  tar ball "
                            f"path {tb} does not contain {self.linksrc}")

                        idxctx.logger.info("Starting {} (size {:d})", tb, size)
                        dataset = None
                        ptb = None
                        username = None
                        try:
                            path = os.path.realpath(tb)

                            try:
                                dataset = Dataset.attach(
                                    path=path,
                                    state=States.INDEXING,
                                )
                            except DatasetNotFound:
                                idxctx.logger.warn(
                                    "Unable to locate Dataset {}",
                                    path,
                                )
                            except DatasetTransitionError as e:
                                # TODO: This means the Dataset is known, but not in a
                                # state where we'd expect to be indexing it. So what do
                                # we do with it? (Note: this is where an audit log will
                                # be handy; i.e., how did we get here?) For now, just
                                # let it go.
                                idxctx.logger.warn(
                                    "Unable to advance dataset state: {}",
                                    str(e))
                            else:
                                username = dataset.owner

                            # "Open" the tar ball represented by the tar ball object
                            idxctx.logger.debug("open tar ball")
                            ptb = PbenchTarBall(
                                idxctx,
                                username,
                                path,
                                tmpdir,
                                Path(self.incoming, controller),
                            )

                            # Construct the generator for emitting all actions.  The
                            # `idxctx` dictionary is passed along to each generator so
                            # that it can add its context for error handling to the
                            # list.
                            idxctx.logger.debug("generator setup")
                            if self.options.index_tool_data:
                                actions = ptb.mk_tool_data_actions()
                            else:
                                actions = ptb.make_all_actions()

                            # File name for containing all indexing errors that
                            # can't/won't be retried.
                            with ie_filepath.open(mode="w") as fp:
                                idxctx.logger.debug("begin indexing")
                                try:
                                    signal.signal(signal.SIGINT,
                                                  sigint_handler)
                                    es_res = es_index(
                                        idxctx.es,
                                        actions,
                                        fp,
                                        idxctx.logger,
                                        idxctx._dbg,
                                    )
                                except SigIntException:
                                    idxctx.logger.exception(
                                        "Indexing interrupted by SIGINT, continuing to next tarball"
                                    )
                                    continue
                                finally:
                                    # Turn off the SIGINT handler when not indexing.
                                    signal.signal(signal.SIGINT,
                                                  signal.SIG_IGN)
                        except UnsupportedTarballFormat as e:
                            tb_res = self.emit_error(idxctx.logger.warning,
                                                     "TB_META_ABSENT", e)
                        except BadDate as e:
                            tb_res = self.emit_error(idxctx.logger.warning,
                                                     "BAD_DATE", e)
                        except FileNotFoundError as e:
                            tb_res = self.emit_error(idxctx.logger.warning,
                                                     "FILE_NOT_FOUND_ERROR", e)
                        except BadMDLogFormat as e:
                            tb_res = self.emit_error(idxctx.logger.warning,
                                                     "BAD_METADATA", e)
                        except SigTermException:
                            idxctx.logger.exception(
                                "Indexing interrupted by SIGTERM, terminating")
                            break
                        except Exception as e:
                            tb_res = self.emit_error(idxctx.logger.exception,
                                                     "GENERIC_ERROR", e)
                        else:
                            beg, end, successes, duplicates, failures, retries = es_res
                            idxctx.logger.info(
                                "done indexing (start ts: {}, end ts: {}, duration:"
                                " {:.2f}s, successes: {:d}, duplicates: {:d},"
                                " failures: {:d}, retries: {:d})",
                                tstos(beg),
                                tstos(end),
                                end - beg,
                                successes,
                                duplicates,
                                failures,
                                retries,
                            )
                            tb_res = error_code[
                                "OP_ERROR" if failures > 0 else "OK"]
                        finally:
                            if dataset:
                                try:
                                    dataset.advance(
                                        States.INDEXED if tb_res.
                                        success else States.QUARANTINED)
                                    Metadata.remove(dataset, Metadata.REINDEX)
                                except DatasetTransitionError:
                                    idxctx.logger.exception(
                                        "Dataset state error")

                        try:
                            ie_len = ie_filepath.stat().st_size
                        except FileNotFoundError:
                            # Above operation never made it to actual indexing, ignore.
                            pass
                        except SigTermException:
                            # Re-raise a SIGTERM to avoid it being lumped in with
                            # general exception handling below.
                            raise
                        except Exception:
                            idxctx.logger.exception(
                                "Unexpected error handling"
                                " indexing errors file: {}",
                                ie_filepath,
                            )
                        else:
                            # Success fetching indexing error file size.
                            if ie_len > len(tb) + 1:
                                try:
                                    report.post_status(tstos(end), "errors",
                                                       ie_filepath)
                                except Exception:
                                    idxctx.logger.exception(
                                        "Unexpected error issuing"
                                        " report status with errors: {}",
                                        ie_filepath,
                                    )
                        finally:
                            # Unconditionally remove the indexing errors file.
                            try:
                                os.remove(ie_filepath)
                            except SigTermException:
                                # Re-raise a SIGTERM to avoid it being lumped in with
                                # general exception handling below.
                                raise
                            except Exception:
                                pass
                        # Distinguish failure cases, so we can retry the indexing
                        # easily if possible.  Different `linkerrdest` directories for
                        # different failures; the rest are going to end up in
                        # `linkerrdest` for later retry.
                        controller_path = linksrc_dir.parent

                        if tb_res is error_code["OK"]:
                            idxctx.logger.info(
                                "{}: {}/{}: success",
                                idxctx.TS,
                                controller_path.name,
                                os.path.basename(tb),
                            )
                            # Success
                            with indexed.open(mode="a") as fp:
                                print(tb, file=fp)
                            rename_tb_link(
                                tb, Path(controller_path, self.linkdest),
                                idxctx.logger)
                        elif tb_res is error_code["OP_ERROR"]:
                            idxctx.logger.warning(
                                "{}: index failures encountered on {}",
                                idxctx.TS, tb)
                            with erred.open(mode="a") as fp:
                                print(tb, file=fp)
                            rename_tb_link(
                                tb,
                                Path(controller_path, f"{self.linkerrdest}.1"),
                                idxctx.logger,
                            )
                        elif tb_res in (error_code["CFG_ERROR"],
                                        error_code["BAD_CFG"]):
                            assert False, (
                                f"Logic Bomb!  Unexpected tar ball handling "
                                f"result status {tb_res.value:d} for tar ball {tb}"
                            )
                        elif tb_res.tarball_error:
                            # # Quietly skip these errors
                            with skipped.open(mode="a") as fp:
                                print(tb, file=fp)
                            rename_tb_link(
                                tb,
                                Path(
                                    controller_path,
                                    f"{self.linkerrdest}.{tb_res.value:d}",
                                ),
                                idxctx.logger,
                            )
                        else:
                            idxctx.logger.error(
                                "{}: index error {:d} encountered on {}",
                                idxctx.TS,
                                tb_res.value,
                                tb,
                            )
                            with erred.open(mode="a") as fp:
                                print(tb, file=fp)
                            rename_tb_link(
                                tb,
                                Path(controller_path, self.linkerrdest),
                                idxctx.logger,
                            )
                        idxctx.logger.info(
                            "Finished{} {} (size {:d})",
                            "[SIGQUIT]" if sigquit_interrupt[0] else "",
                            tb,
                            size,
                        )

                        if sigquit_interrupt[0]:
                            break
                        if sighup_interrupt[0]:
                            status, new_tb = self.collect_tb()
                            if status == 0:
                                if not set(new_tb).issuperset(tb_deque):
                                    idxctx.logger.info(
                                        "Tarballs supposed to be in 'TO-INDEX' are no longer present",
                                        set(tb_deque).difference(new_tb),
                                    )
                                tb_deque = deque(sorted(new_tb))
                            idxctx.logger.info(
                                "SIGHUP status (Current tar ball indexed: ({}), Remaining: {}, Completed: {}, Errors_encountered: {}, Status: {})",
                                Path(tb).name,
                                len(tb_deque),
                                count_processed_tb,
                                _count_lines(erred),
                                tb_res,
                            )
                            sighup_interrupt[0] = False
                            continue
                except SigTermException:
                    idxctx.logger.exception(
                        "Indexing interrupted by SIGQUIT, stop processing tarballs"
                    )
                finally:
                    # Turn off the SIGQUIT and SIGHUP handler when not indexing.
                    signal.signal(signal.SIGQUIT, signal.SIG_IGN)
                    signal.signal(signal.SIGHUP, signal.SIG_IGN)
            except SigTermException:
                # Re-raise a SIGTERM to avoid it being lumped in with general
                # exception handling below.
                raise
            except Exception:
                idxctx.logger.exception(error_code["GENERIC_ERROR"].message)
                res = error_code["GENERIC_ERROR"]
            else:
                # No exceptions while processing tar ball, success.
                res = error_code["OK"]
            finally:
                if idxctx:
                    idxctx.dump_opctx()
                idxctx.logger.debug("stopped processing list of tar balls")

                idx = _count_lines(indexed)
                skp = _count_lines(skipped)
                err = _count_lines(erred)

                idxctx.logger.info(
                    "{}.{}: indexed {:d} (skipped {:d}) results,"
                    " {:d} errors",
                    self.name,
                    idxctx.TS,
                    idx,
                    skp,
                    err,
                )

                if err > 0:
                    if skp > 0:
                        subj = (
                            f"{self.name}.{idxctx.TS} - Indexed {idx:d} results, skipped {skp:d}"
                            f" results, w/ {err:d} errors")
                    else:
                        subj = (
                            f"{self.name}.{idxctx.TS} - Indexed {idx:d} results, w/ {err:d}"
                            " errors")
                else:
                    if skp > 0:
                        subj = f"{self.name}.{idxctx.TS} - Indexed {idx:d} results, skipped {skp:d} results"
                    else:
                        subj = f"{self.name}.{idxctx.TS} - Indexed {idx:d} results"

                report_fname = Path(tmpdir, f"{self.name}.{idxctx.TS}.report")
                with report_fname.open(mode="w") as fp:
                    print(subj, file=fp)
                    if idx > 0:
                        print("\nIndexed Results\n===============", file=fp)
                        with indexed.open() as ifp:
                            for line in sorted(ifp):
                                print(line.strip(), file=fp)
                    if err > 0:
                        print(
                            "\nResults producing errors"
                            "\n========================",
                            file=fp,
                        )
                        with erred.open() as efp:
                            for line in sorted(efp):
                                print(line.strip(), file=fp)
                    if skp > 0:
                        print("\nSkipped Results\n===============", file=fp)
                        with skipped.open() as sfp:
                            for line in sorted(sfp):
                                print(line.strip(), file=fp)
                try:
                    report.post_status(tstos(idxctx.time()), "status",
                                       report_fname)
                except SigTermException:
                    # Re-raise a SIGTERM to avoid it being lumped in with general
                    # exception handling below.
                    raise
                except Exception:
                    pass

        return res.value
Exemplo n.º 3
0
)
parsed = parser.parse_args()

try:
    config = PbenchServerConfig(parsed.cfg_name)
except BadConfig as e:
    print(f"{_prog}: {e}", file=sys.stderr)
    sys.exit(1)

# We're going to need the Postgres DB to track dataset state, so setup
# DB access. We don't pass a Logger here, because that introduces lots
# of spurious changes in the gold CLI test output.
init_db(config, None)

hostname = gethostname()
pid = parsed.pid
group_id = parsed.group_id
user_id = parsed.user_id

report = Report(
    config, parsed.name, pid=pid, group_id=group_id, user_id=user_id, hostname=hostname
)
report.init_report_template()
try:
    report.post_status(parsed.timestamp, parsed.doctype, parsed.file_to_index[0])
except Exception:
    status = 1
else:
    status = 0
sys.exit(status)
Exemplo n.º 4
0
def main(cfg_name):
    if not cfg_name:
        print(
            f"{_NAME_}: ERROR: No config file specified; set"
            " _PBENCH_SERVER_CONFIG env variable or use --config <file> on the"
            " command line",
            file=sys.stderr,
        )
        return 2

    try:
        config = PbenchServerConfig(cfg_name)
    except BadConfig as e:
        print(f"{_NAME_}: {e}", file=sys.stderr)
        return 1

    logger = get_pbench_logger(_NAME_, config)

    # We're going to need the Postgres DB to track dataset state, so setup
    # DB access.
    init_db(config, logger)

    # Add a BACKUP and QDIR field to the config object
    config.BACKUP = config.conf.get("pbench-server", "pbench-backup-dir")
    config.QDIR = config.get("pbench-server", "pbench-quarantine-dir")

    # call the LocalBackupObject class
    lb_obj = LocalBackupObject(config)

    # call the S3Config class
    s3_obj = S3Config(config, logger)

    lb_obj, s3_obj = sanity_check(lb_obj, s3_obj, config, logger)

    if lb_obj is None and s3_obj is None:
        return 3

    logger.info("start-{}", config.TS)

    # Initiate the backup
    counts = backup_data(lb_obj, s3_obj, config, logger)

    result_string = (f"Total processed: {counts.ntotal},"
                     f" Local backup successes: {counts.nbackup_success},"
                     f" Local backup failures: {counts.nbackup_fail},"
                     f" S3 upload successes: {counts.ns3_success},"
                     f" S3 upload failures: {counts.ns3_fail},"
                     f" Quarantined: {counts.nquaran}")

    logger.info(result_string)

    prog = Path(sys.argv[0]).name

    # prepare and send report
    with tempfile.NamedTemporaryFile(mode="w+t", dir=config.TMP) as reportfp:
        reportfp.write(
            f"{prog}.{config.timestamp()}({config.PBENCH_ENV})\n{result_string}\n"
        )
        reportfp.seek(0)

        report = Report(config, _NAME_)
        report.init_report_template()
        try:
            report.post_status(config.timestamp(), "status", reportfp.name)
        except Exception:
            pass

    logger.info("end-{}", config.TS)

    return 0
Exemplo n.º 5
0
def main(options):
    if not options.cfg_name:
        print(
            f"{_NAME_}: ERROR: No config file specified; set"
            " _PBENCH_SERVER_CONFIG env variable",
            file=sys.stderr,
        )
        return 1

    try:
        config = PbenchServerConfig(options.cfg_name)
    except BadConfig as e:
        print(f"{_NAME_}: {e}", file=sys.stderr)
        return 2

    logger = get_pbench_logger(_NAME_, config)

    archivepath = config.ARCHIVE

    incoming = config.INCOMING
    incomingpath = config.get_valid_dir_option("INCOMING", incoming, logger)
    if not incomingpath:
        return 3

    results = config.RESULTS
    resultspath = config.get_valid_dir_option("RESULTS", results, logger)
    if not resultspath:
        return 3

    users = config.USERS
    userspath = config.get_valid_dir_option("USERS", users, logger)
    if not userspath:
        return 3

    # Fetch the configured maximum number of days a tar can remain "unpacked"
    # in the INCOMING tree.
    try:
        max_unpacked_age = config.conf.get("pbench-server", "max-unpacked-age")
    except NoOptionError as e:
        logger.error(f"{e}")
        return 5
    try:
        max_unpacked_age = int(max_unpacked_age)
    except Exception:
        logger.error("Bad maximum unpacked age, {}", max_unpacked_age)
        return 6

    # First phase is to find all the tar balls which are beyond the max
    # unpacked age, and which still have an unpacked directory in INCOMING.
    if config._ref_datetime is not None:
        try:
            curr_dt = config._ref_datetime
        except Exception:
            # Ignore bad dates from test environment.
            curr_dt = datetime.utcnow()
    else:
        curr_dt = datetime.utcnow()

    _msg = "Culling unpacked tar balls {} days older than {}"
    if options.dry_run:
        print(
            _msg.format(max_unpacked_age, curr_dt.strftime(_STD_DATETIME_FMT)))
    else:
        logger.debug(_msg, max_unpacked_age,
                     curr_dt.strftime(_STD_DATETIME_FMT))

    actions_taken = []
    errors = 0
    start = pbench.server._time()

    gen = gen_list_unpacked_aged(incomingpath, archivepath, curr_dt,
                                 max_unpacked_age)
    if config._unittests:
        # force the generator and sort the list
        gen = sorted(list(gen))

    for tb_incoming_dir, controller_name in gen:
        act_set = remove_unpacked(
            tb_incoming_dir,
            controller_name,
            resultspath,
            userspath,
            logger,
            options.dry_run,
        )
        unpacked_dir_name = Path(tb_incoming_dir).name
        act_path = Path(controller_name, unpacked_dir_name)
        act_set.set_name(act_path)
        actions_taken.append(act_set)
        if act_set.errors > 0:
            # Stop any further unpacked tar ball removal if an error is
            # encountered.
            break
    end = pbench.server._time()

    # Generate the ${TOP}/public_html prefix so we can strip it from the
    # various targets in the report.
    public_html = os.path.realpath(os.path.join(config.TOP, "public_html"))

    # Write the actions taken into a report file.
    with tempfile.NamedTemporaryFile(mode="w+t",
                                     prefix=f"{_NAME_}.",
                                     suffix=".report",
                                     dir=config.TMP) as tfp:
        duration = end - start
        total = len(actions_taken)
        print(
            f"Culled {total:d} unpacked tar ball directories ({errors:d}"
            f" errors) in {duration:0.2f} secs",
            file=tfp,
        )
        if total > 0:
            print("\nActions Taken:", file=tfp)
        for act_set in sorted(actions_taken, key=lambda a: a.name):
            print(
                f"  - {act_set.name} ({act_set.errors:d} errors,"
                f" {act_set.duration():0.2f} secs)",
                file=tfp,
            )
            for act in act_set.actions:
                assert act.noun.startswith(
                    public_html
                ), f"Logic bomb! {act.noun} not in .../public_html/"
                tgt = Path(act.noun[len(public_html) + 1:])
                if act.verb == "mv":
                    name = tgt.name
                    controller = tgt.parent
                    ex_tgt = controller / f".delete.{name}"
                    print(f"      $ {act.verb} {tgt} {ex_tgt}  # {act.status}",
                          file=tfp)
                else:
                    print(f"      $ {act.verb} {tgt}  # {act.status}",
                          file=tfp)

        # Flush out the report ahead of posting it.
        tfp.flush()
        tfp.seek(0)

        # We need to generate a report that lists all the actions taken.
        report = Report(config, _NAME_)
        report.init_report_template()
        try:
            report.post_status(config.timestamp(),
                               "status" if errors == 0 else "errors", tfp.name)
        except Exception:
            pass
    return errors
Exemplo n.º 6
0
def main(options, name):
    """Main entry point to pbench-index.

       The caller is required to pass the "options" argument with the following
       expected attributes:
           cfg_name              - Name of the configuration file to use
           dump_index_patterns   - Don't do any indexing, but just emit the
                                   list of index patterns that would be used
           dump_templates        - Dump the templates that would be used
           index_tool_data       - Index tool data only
           re_index              - Consider tar balls marked for re-indexing
       All exceptions are caught and logged to syslog with the stacktrace of
       the exception in a sub-object of the logged JSON document.

       Status codes used by es_index and the error below are defined from the
       list below to maintain compatibility with the previous code base when
       pbench-index was a bash script and invoked index-pbench (yes, a bit
       confusing), the predecessor to this program.  The codes were used to
       sort the errors we encountered processing tar balls in to categories
       of retry or not:

            0 - normal, successful exit, no errors
            1 - Operational error while indexing
            2 - Configuration file not specified
            3 - Bad configuration file
            4 - Tar ball does not contain a metadata.log file
            5 - Bad start run date value encountered
            6 - File Not Found error
            7 - Bad metadata.log file encountered
            8 - Error reading a mapping file for Elasticsearch templates
            9 - Error creating one of the Elasticsearch templates
           10 - Bad hostname in a sosreport
           11 - Failure unpacking the tar ball
           12 - generic error, needs to be investigated and can be retried
                after any indexing bugs are fixed.

       Return Values (now a sub-set of the original status codes above):
         0 - Successfully processed all tar balls (errors processing tar
             balls are reported in the logs and in index status reports)
         1 - Failed to process one or more tar balls for unknown reasons
             (see logs)
         2 - Missing configuration file
         3 - Invalid configuration file
         8 - Unable to load and process expected mapping files
         9 - Unable to update index templates in configured Elasticsearch
             instance

        Signal Handlers used to establish different patterns for the three
        behaviors:

        1. Gracefully stop processing tar balls
            - SIGQUIT
            - The current tar ball is indexed until completion, but no other
              tar balls are processed.
            - Handler Behavior:
                - Sets a flag that causes the code flow to break out of the
                  for loop.
                - Does not raise an exception.

        2. Interrupt the current tar ball being indexed, and proceed to the
           next one, if any
            - SIGINT
            - Handler Behavior:
                - try/except/finally placed immediately around the es_index()
                  call so that the signal handler will only be established for
                  the duration of the call.
                - Raises an exception caught by above try/except/finally.
                - The finally clause would take down the signal handler.

        3. Stop processing tar balls immediately and exit gracefully
            - SIGTERM
            - Handler Behavior:
                - Raises an exception caught be a new, outer-most, try/except
                  block that does not have a finally clause (as you don't want
                  any code execution in the finally block).
    """
    _name_suf = "-tool-data" if options.index_tool_data else ""
    _name_re = "-re" if options.re_index else ""
    name = f"{name}{_name_re}{_name_suf}"

    if not options.cfg_name:
        print(
            f"{name}: ERROR: No config file specified; set"
            " _PBENCH_SERVER_CONFIG env variable or"
            " use --config <file> on the command line",
            file=sys.stderr,
        )
        return 2

    idxctx = None
    try:
        idxctx = IdxContext(options, name, _dbg=_DEBUG)
    except (ConfigFileError, ConfigParserError) as e:
        print(f"{name}: {e}", file=sys.stderr)
        return 2
    except BadConfig as e:
        print(f"{name}: {e}", file=sys.stderr)
        return 3
    except JsonFileError as e:
        print(f"{name}: {e}", file=sys.stderr)
        return 8

    if options.dump_index_patterns:
        idxctx.templates.dump_idx_patterns()
        return 0

    if options.dump_templates:
        idxctx.templates.dump_templates()
        return 0

    _re_idx = "RE-" if options.re_index else ""
    if options.index_tool_data:
        # The link source and destination for the operation of this script
        # when it only indexes tool data.
        linksrc = "TO-INDEX-TOOL"
        linkdest = "INDEXED"
    else:
        # The link source and destination for the operation of this script
        # when it indexes run, table-of-contents, and result data.
        linksrc = f"TO-{_re_idx}INDEX"
        linkdest = "TO-INDEX-TOOL"
    # We only ever use a symlink'd error destination for indexing
    # problems.
    linkerrdest = "WONT-INDEX"

    res = 0

    ARCHIVE_rp = idxctx.config.ARCHIVE

    INCOMING_rp = idxctx.config.INCOMING
    INCOMING_path = idxctx.config.get_valid_dir_option("INCOMING", INCOMING_rp,
                                                       idxctx.logger)
    if not INCOMING_path:
        res = 3

    qdir = idxctx.config.get_conf("QUARANTINE", "pbench-server",
                                  "pbench-quarantine-dir", idxctx.logger)
    if not qdir:
        res = 3
    else:
        qdir_path = idxctx.config.get_valid_dir_option("QDIR", Path(qdir),
                                                       idxctx.logger)
        if not qdir_path:
            res = 3

    if res != 0:
        # Exit early if we encounter any errors.
        return res

    idxctx.logger.debug("{}.{}: starting", name, idxctx.TS)

    # find -L $ARCHIVE/*/$linksrc -name '*.tar.xz' -printf "%s\t%p\n" 2>/dev/null | sort -n > $list
    tarballs = []
    try:
        tb_glob = os.path.join(ARCHIVE_rp, "*", linksrc, "*.tar.xz")
        for tb in glob.iglob(tb_glob):
            try:
                rp = Path(tb).resolve(strict=True)
            except OSError:
                idxctx.logger.warning("{} does not resolve to a real path", tb)
                quarantine(qdir, idxctx.logger, tb)
                continue
            controller_path = rp.parent
            controller = controller_path.name
            archive_path = controller_path.parent
            if str(archive_path) != str(ARCHIVE_rp):
                idxctx.logger.warning(
                    "For tar ball {}, original home is not {}", tb, ARCHIVE_rp)
                quarantine(qdir, idxctx.logger, tb)
                continue
            if not Path(f"{rp}.md5").is_file():
                idxctx.logger.warning("Missing .md5 file for {}", tb)
                quarantine(qdir, idxctx.logger, tb)
                # Audit should pick up missing .md5 file in ARCHIVE directory.
                continue
            try:
                # get size
                size = rp.stat().st_size
            except OSError:
                idxctx.logger.warning("Could not fetch tar ball size for {}",
                                      tb)
                quarantine(qdir, idxctx.logger, tb)
                # Audit should pick up missing .md5 file in ARCHIVE directory.
                continue
            else:
                tarballs.append((size, controller, tb))
    except SigTermException:
        # Re-raise a SIGTERM to avoid it being lumped in with general
        # exception handling below.
        raise
    except Exception:
        idxctx.logger.exception("Unexpected error encountered generating list"
                                " of tar balls to process")
        return 12
    else:
        if not tarballs:
            idxctx.logger.info("No tar balls found that need processing")
            return 0

    # We always process the smallest tar balls first.
    tarballs = sorted(tarballs)

    # At this point, tarballs contains a list of tar balls sorted by size
    # that were available as symlinks in the various 'linksrc' directories.
    idxctx.logger.debug("Preparing to index {:d} tar balls", len(tarballs))

    try:
        # Now that we are ready to begin the actual indexing step, ensure we
        # have the proper index templates in place.
        idxctx.logger.debug("update_templates [start]")
        idxctx.templates.update_templates(idxctx.es)
    except TemplateError as e:
        idxctx.logger.error("update_templates [end], error {}", repr(e))
        res = 9
    except SigTermException:
        # Re-raise a SIGTERM to avoid it being lumped in with general
        # exception handling below.
        raise
    except Exception:
        idxctx.logger.exception("update_templates [end]: Unexpected template"
                                " processing error")
        res = 12
    else:
        idxctx.logger.debug("update_templates [end]")
        res = 0

    if res != 0:
        # Exit early if we encounter any errors.
        return res

    report = Report(
        idxctx.config,
        name,
        es=idxctx.es,
        pid=idxctx.getpid(),
        group_id=idxctx.getgid(),
        user_id=idxctx.getuid(),
        hostname=idxctx.gethostname(),
        version=VERSION,
        templates=idxctx.templates,
    )
    # We use the "start" report ID as the tracking ID for all indexed
    # documents.
    try:
        tracking_id = report.post_status(tstos(idxctx.time()), "start")
    except SigTermException:
        # Re-raise a SIGTERM to avoid it being lumped in with general
        # exception handling below.
        raise
    except Exception:
        idxctx.logger.error("Failed to post initial report status")
        return 12
    else:
        idxctx.set_tracking_id(tracking_id)

    with tempfile.TemporaryDirectory(prefix=f"{name}.",
                                     dir=idxctx.config.TMP) as tmpdir:
        idxctx.logger.debug("start processing list of tar balls")
        tb_list = Path(tmpdir, f"{name}.{idxctx.TS}.list")
        try:
            with tb_list.open(mode="w") as lfp:
                # Write out all the tar balls we are processing so external
                # viewers can follow along from home.
                for size, controller, tb in tarballs:
                    print(f"{size:20d} {controller} {tb}", file=lfp)

            indexed = Path(tmpdir, f"{name}.{idxctx.TS}.indexed")
            erred = Path(tmpdir, f"{name}.{idxctx.TS}.erred")
            skipped = Path(tmpdir, f"{name}.{idxctx.TS}.skipped")
            ie_filepath = Path(tmpdir,
                               f"{name}.{idxctx.TS}.indexing-errors.json")

            # We use a list object here so that when we close over this
            # variable in the handler, the list object will be closed over,
            # but not its contents.
            sigquit_interrupt = [False]

            def sigquit_handler(*args):
                sigquit_interrupt[0] = True

            signal.signal(signal.SIGQUIT, sigquit_handler)

            for size, controller, tb in tarballs:
                # Sanity check source tar ball path
                linksrc_dir = Path(tb).parent
                linksrc_dirname = linksrc_dir.name
                assert linksrc_dirname == linksrc, (
                    f"Logic bomb!  tar ball "
                    f"path {tb} does not contain {linksrc}")

                idxctx.logger.info("Starting {} (size {:d})", tb, size)

                ptb = None
                try:
                    # "Open" the tar ball represented by the tar ball object
                    idxctx.logger.debug("open tar ball")
                    ptb = PbenchTarBall(
                        idxctx,
                        os.path.realpath(tb),
                        tmpdir,
                        Path(INCOMING_rp, controller),
                    )

                    # Construct the generator for emitting all actions.  The
                    # `idxctx` dictionary is passed along to each generator so
                    # that it can add its context for error handling to the
                    # list.
                    idxctx.logger.debug("generator setup")
                    if options.index_tool_data:
                        actions = ptb.mk_tool_data_actions()
                    else:
                        actions = ptb.make_all_actions()

                    # File name for containing all indexing errors that
                    # can't/won't be retried.
                    with ie_filepath.open(mode="w") as fp:
                        idxctx.logger.debug("begin indexing")
                        try:
                            signal.signal(signal.SIGINT, sigint_handler)
                            es_res = es_index(idxctx.es, actions, fp,
                                              idxctx.logger, idxctx._dbg)
                        except SigIntException:
                            idxctx.logger.exception(
                                "Indexing interrupted by SIGINT, continuing to next tarball"
                            )
                            continue
                        finally:
                            # Turn off the SIGINT handler when not indexing.
                            signal.signal(signal.SIGINT, signal.SIG_IGN)
                except UnsupportedTarballFormat as e:
                    idxctx.logger.warning("Unsupported tar ball format: {}", e)
                    tb_res = 4
                except BadDate as e:
                    idxctx.logger.warning("Bad Date: {!r}", e)
                    tb_res = 5
                except _filenotfounderror as e:
                    idxctx.logger.warning("No such file: {}", e)
                    tb_res = 6
                except BadMDLogFormat as e:
                    idxctx.logger.warning(
                        "The metadata.log file is curdled in"
                        " tar ball: {}", e)
                    tb_res = 7
                except SosreportHostname as e:
                    idxctx.logger.warning("Bad hostname in sosreport: {}", e)
                    tb_res = 10
                except tarfile.TarError as e:
                    idxctx.logger.error("Can't unpack tar ball into {}: {}",
                                        ptb.extracted_root, e)
                    tb_res = 11
                except SigTermException:
                    idxctx.logger.exception(
                        "Indexing interrupted by SIGTERM, terminating")
                    break
                except Exception as e:
                    idxctx.logger.exception("Other indexing error: {}", e)
                    tb_res = 12
                else:
                    beg, end, successes, duplicates, failures, retries = es_res
                    idxctx.logger.info(
                        "done indexing (start ts: {}, end ts: {}, duration:"
                        " {:.2f}s, successes: {:d}, duplicates: {:d},"
                        " failures: {:d}, retries: {:d})",
                        tstos(beg),
                        tstos(end),
                        end - beg,
                        successes,
                        duplicates,
                        failures,
                        retries,
                    )
                    tb_res = 1 if failures > 0 else 0
                try:
                    ie_len = ie_filepath.stat().st_size
                except _filenotfounderror:
                    # Above operation never made it to actual indexing, ignore.
                    pass
                except SigTermException:
                    # Re-raise a SIGTERM to avoid it being lumped in with
                    # general exception handling below.
                    raise
                except Exception:
                    idxctx.logger.exception(
                        "Unexpected error handling"
                        " indexing errors file: {}",
                        ie_filepath,
                    )
                else:
                    # Success fetching indexing error file size.
                    if ie_len > len(tb) + 1:
                        try:
                            report.post_status(tstos(end), "errors",
                                               ie_filepath)
                        except Exception:
                            idxctx.logger.exception(
                                "Unexpected error issuing"
                                " report status with errors: {}",
                                ie_filepath,
                            )
                finally:
                    # Unconditionally remove the indexing errors file.
                    try:
                        os.remove(ie_filepath)
                    except SigTermException:
                        # Re-raise a SIGTERM to avoid it being lumped in with
                        # general exception handling below.
                        raise
                    except Exception:
                        pass
                # Distinguish failure cases, so we can retry the indexing
                # easily if possible.  Different `linkerrdest` directories for
                # different failures; the rest are going to end up in
                # `linkerrdest` for later retry.
                controller_path = linksrc_dir.parent

                if tb_res == 0:
                    idxctx.logger.info(
                        "{}: {}/{}: success",
                        idxctx.TS,
                        controller_path.name,
                        os.path.basename(tb),
                    )
                    # Success
                    with indexed.open(mode="a") as fp:
                        print(tb, file=fp)
                    rename_tb_link(tb, Path(controller_path, linkdest),
                                   idxctx.logger)
                elif tb_res == 1:
                    idxctx.logger.warning(
                        "{}: index failures encountered on {}", idxctx.TS, tb)
                    with erred.open(mode="a") as fp:
                        print(tb, file=fp)
                    rename_tb_link(
                        tb,
                        Path(controller_path, f"{linkerrdest}.1"),
                        idxctx.logger,
                    )
                elif tb_res in (2, 3):
                    assert False, (
                        f"Logic Bomb!  Unexpected tar ball handling "
                        f"result status {tb_res:d} for tar ball {tb}")
                elif tb_res >= 4 or res <= 11:
                    # # Quietly skip these errors
                    with skipped.open(mode="a") as fp:
                        print(tb, file=fp)
                    rename_tb_link(
                        tb,
                        Path(controller_path, f"{linkerrdest}.{tb_res:d}"),
                        idxctx.logger,
                    )
                else:
                    idxctx.logger.error(
                        "{}: index error {:d} encountered on {}", idxctx.TS,
                        tb_res, tb)
                    with erred.open(mode="a") as fp:
                        print(tb, file=fp)
                    rename_tb_link(tb, Path(controller_path, linkerrdest),
                                   idxctx.logger)
                idxctx.logger.info(
                    "Finished{} {} (size {:d})",
                    "[SIGQUIT]" if sigquit_interrupt[0] else "",
                    tb,
                    size,
                )

                if sigquit_interrupt[0]:
                    break
        except SigTermException:
            # Re-raise a SIGTERM to avoid it being lumped in with general
            # exception handling below.
            raise
        except Exception:
            idxctx.logger.exception("Unexpected setup error")
            res = 12
        else:
            # No exceptions while processing tar ball, success.
            res = 0
        finally:
            if idxctx:
                idxctx.dump_opctx()
            idxctx.logger.debug("stopped processing list of tar balls")

            idx = _count_lines(indexed)
            skp = _count_lines(skipped)
            err = _count_lines(erred)

            idxctx.logger.info(
                "{}.{}: indexed {:d} (skipped {:d}) results,"
                " {:d} errors",
                name,
                idxctx.TS,
                idx,
                skp,
                err,
            )

            if err > 0:
                if skp > 0:
                    subj = (
                        f"{name}.{idxctx.TS} - Indexed {idx:d} results, skipped {skp:d}"
                        f" results, w/ {err:d} errors")
                else:
                    subj = (
                        f"{name}.{idxctx.TS} - Indexed {idx:d} results, w/ {err:d}"
                        " errors")
            else:
                if skp > 0:
                    subj = f"{name}.{idxctx.TS} - Indexed {idx:d} results, skipped {skp:d} results"
                else:
                    subj = f"{name}.{idxctx.TS} - Indexed {idx:d} results"

            report_fname = Path(tmpdir, f"{name}.{idxctx.TS}.report")
            with report_fname.open(mode="w") as fp:
                print(subj, file=fp)
                if idx > 0:
                    print("\nIndexed Results\n===============", file=fp)
                    with indexed.open() as ifp:
                        for line in sorted(ifp):
                            print(line.strip(), file=fp)
                if err > 0:
                    print(
                        "\nResults producing errors"
                        "\n========================",
                        file=fp,
                    )
                    with erred.open() as efp:
                        for line in sorted(efp):
                            print(line.strip(), file=fp)
                if skp > 0:
                    print("\nSkipped Results\n===============", file=fp)
                    with skipped.open() as sfp:
                        for line in sorted(sfp):
                            print(line.strip(), file=fp)
            try:
                report.post_status(tstos(idxctx.time()), "status",
                                   report_fname)
            except SigTermException:
                # Re-raise a SIGTERM to avoid it being lumped in with general
                # exception handling below.
                raise
            except Exception:
                pass

    return res
def main():
    cfg_name = os.environ.get("_PBENCH_SERVER_CONFIG")
    if not cfg_name:
        print(
            f"{_NAME_}: ERROR: No config file specified; set _PBENCH_SERVER_CONFIG env variable or"
            f" use --config <file> on the command line",
            file=sys.stderr,
        )
        return 2

    try:
        config = PbenchServerConfig(cfg_name)
    except BadConfig as e:
        print(f"{_NAME_}: {e}", file=sys.stderr)
        return 1

    logger = get_pbench_logger(_NAME_, config)

    # We're going to need the Postgres DB to track dataset state, so setup
    # DB access.
    init_db(config, logger)

    # add a BACKUP field to the config object
    config.BACKUP = backup = config.conf.get("pbench-server",
                                             "pbench-backup-dir")

    if not backup:
        logger.error(
            "Unspecified backup directory, no pbench-backup-dir config in"
            " pbench-server section")
        return 1

    backuppath = config.get_valid_dir_option("BACKUP", backup, logger)
    if not backuppath:
        return 1

    # instantiate the s3config class
    s3_config_obj = S3Config(config, logger)
    s3_config_obj = sanity_check(s3_config_obj, logger)

    logger.info("start-{}", config.TS)
    start = config.timestamp()

    prog = Path(sys.argv[0]).name

    sts = 0
    # N.B. tmpdir is the pathname of the temp directory.
    with tempfile.TemporaryDirectory() as tmpdir:

        archive_obj = BackupObject("ARCHIVE", config.ARCHIVE, tmpdir, logger)
        local_backup_obj = BackupObject("BACKUP", config.BACKUP, tmpdir,
                                        logger)
        s3_backup_obj = BackupObject("S3", s3_config_obj, tmpdir, logger)

        with tempfile.NamedTemporaryFile(mode="w+t", dir=tmpdir) as reportfp:
            reportfp.write(
                f"{prog}.{config.TS} ({config.PBENCH_ENV}) started at {start}\n"
            )
            if s3_config_obj is None:
                reportfp.write(
                    "\nNOTICE: S3 backup service is inaccessible; skipping"
                    " ARCHIVE to S3 comparison\n\n")

            # FIXME: Parallelize these three ...

            # Create entry list for archive
            logger.debug("Starting archive list creation")
            ar_start = config.timestamp()
            ret_sts = archive_obj.entry_list_creation()
            if ret_sts == Status.FAIL:
                sts += 1
            logger.debug("Finished archive list ({!r})", ret_sts)

            # Create entry list for backup
            logger.debug("Starting local backup list creation")
            lb_start = config.timestamp()
            ret_sts = local_backup_obj.entry_list_creation()
            if ret_sts == Status.FAIL:
                sts += 1
            logger.debug("Finished local backup list ({!r})", ret_sts)

            # Create entry list for S3
            if s3_config_obj is not None:
                logger.debug("Starting S3 list creation")
                s3_start = config.timestamp()
                ret_sts = s3_backup_obj.entry_list_creation()
                if ret_sts == Status.FAIL:
                    sts += 1
                logger.debug("Finished S3 list ({!r})", ret_sts)

            logger.debug("Checking MD5 signatures of archive")
            ar_md5_start = config.timestamp()
            try:
                # Check the data integrity in ARCHIVE (Question 1).
                md5_result_archive = archive_obj.checkmd5()
            except Exception as ex:
                msg = f"Failed to check data integrity of ARCHIVE ({config.ARCHIVE})"
                logger.exception(msg)
                reportfp.write(f"\n{msg} - '{ex}'\n")
                sts += 1
            else:
                if md5_result_archive > 0:
                    # Create a report for failed MD5 results from ARCHIVE (Question 1)
                    archive_obj.report_failed_md5(reportfp)
                    sts += 1
                    logger.debug(
                        "Checking MD5 signature of archive: {} errors",
                        md5_result_archive,
                    )
            logger.debug("Finished checking MD5 signatures of archive")

            logger.debug("Checking MD5 signatures of local backup")
            lb_md5_start = config.timestamp()
            try:
                # Check the data integrity in BACKUP (Question 2).
                md5_result_backup = local_backup_obj.checkmd5()
            except Exception as ex:
                msg = f"Failed to check data integrity of BACKUP ({config.BACKUP})"
                logger.exception(msg)
                reportfp.write(f"\n{msg} - '{ex}'\n")
            else:
                if md5_result_backup > 0:
                    # Create a report for failed MD5 results from BACKUP (Question 2)
                    local_backup_obj.report_failed_md5(reportfp)
                    sts += 1
                    logger.debug(
                        "Checking MD5 signature of local backup: {} errors",
                        md5_result_backup,
                    )
            logger.debug("Finished checking MD5 signatures of local backup")

            # Compare ARCHIVE with BACKUP (Questions 3 and 3a).
            msg = "Comparing ARCHIVE with BACKUP"
            reportfp.write(f"\n{msg}\n{'-' * len(msg)}\n")
            logger.debug("{}: start", msg)
            compare_entry_lists(archive_obj, local_backup_obj, reportfp,
                                logger)
            logger.debug("{}: end", msg)

            if s3_config_obj is not None:
                # Compare ARCHIVE with S3 (Questions 4, 4a, and 4b).
                msg = "Comparing ARCHIVE with S3"
                reportfp.write(f"\n{msg}\n{'-' * len(msg)}\n")
                logger.debug("{}: start", msg)
                compare_entry_lists(archive_obj, s3_backup_obj, reportfp,
                                    logger)
                logger.debug("{}: end", msg)

            if s3_config_obj is None:
                s3_start = "<skipped>"
            reportfp.write(f"\n\nPhases (started):\n"
                           f"Archive List Creation:       {ar_start}\n"
                           f"Local Backup List Creation:  {lb_start}\n"
                           f"S3 List Creation:            {s3_start}\n"
                           f"Archive MD5 Checks:          {ar_md5_start}\n"
                           f"Local Backup MD5 Checks:     {lb_md5_start}\n")

            end = config.timestamp()
            reportfp.write(
                f"\n{prog}.{config.TS} ({config.PBENCH_ENV}) finished at {end}\n"
            )

            # Rewind to the beginning.
            reportfp.seek(0)

            logger.debug("Sending report: start")
            report = Report(config, _NAME_)
            report.init_report_template()
            try:
                report.post_status(config.timestamp(), "status", reportfp.name)
            except Exception:
                pass
            logger.debug("Sending report: end")

    logger.info("end-{}", config.TS)

    return sts