def quarantine(dest, logger, *files):
    """Quarantine problematic tarballs.

    Errors here are fatal, but we log an error message to help diagnose
    problems.
    """
    try:
        os.mkdir(dest)
    except FileExistsError:
        # The directory already exists, ignore.
        pass
    except Exception:
        logger.exception('quarantine {} {!r}: "mkdir -p {}/" failed', dest, files, dest)
        sys.exit(101)

    for afile in files:
        if not os.path.exists(afile) and not os.path.islink(afile):
            continue
        try:
            # If the file we're moving is a tarball, update the dataset
            # state. (If it's the associated MD5 file, skip that.)
            if str(afile).endswith(".tar.xz"):
                try:
                    Dataset.attach(path=afile, state=States.QUARANTINED)
                except DatasetNotFound:
                    logger.exception("quarantine dataset {} not found", afile)
            shutil.move(afile, os.path.join(dest, os.path.basename(afile)))
        except Exception:
            logger.exception(
                'quarantine {} {!r}: "mv {} {}/" failed', dest, files, afile, dest
            )
            sys.exit(102)

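def _example_quarantine_usage(config, logger, tb):
    """Illustrative sketch only (not part of the original module): how
    quarantine() is invoked from backup_data() below when a tar ball fails
    validation. The arguments are assumed to be the server config (providing
    a QDIR quarantine directory), a pbench-style logger, and the tar ball
    symlink path being quarantined."""
    quarantine(config.QDIR, logger, tb)
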
def test_attach_none(self):
    """Test expected failure when we try to attach to a dataset that does
    not exist.
    """
    with pytest.raises(DatasetNotFound):
        Dataset.attach(controller="frodo", name="venus", state=States.UPLOADING)

def test_upload(client, pytestconfig, caplog, server_config):
    filename = "log.tar.xz"
    datafile = Path("./lib/pbench/test/unit/server/fixtures/upload/", filename)
    controller = socket.gethostname()

    with open(f"{datafile}.md5") as md5sum_check:
        md5sum = md5sum_check.read()

    with open(datafile, "rb") as data_fp:
        response = client.put(
            f"{server_config.rest_uri}/upload/ctrl/{controller}",
            data=data_fp,
            headers={"filename": filename, "Content-MD5": md5sum},
        )
    assert response.status_code == 201, repr(response)

    sfilename = secure_filename(filename)
    tmp_d = pytestconfig.cache.get("TMP", None)
    receive_dir = os.path.join(
        tmp_d, "srv", "pbench", "pbench-move-results-receive", "fs-version-002"
    )
    assert os.path.exists(receive_dir), (
        f"receive_dir = '{receive_dir}', filename = '{filename}',"
        f" sfilename = '{sfilename}'"
    )

    dataset = Dataset.attach(controller=controller, path=filename)
    assert dataset is not None
    assert dataset.md5 == md5sum
    assert dataset.controller == controller
    assert dataset.name == "log"
    assert dataset.state == States.UPLOADED

    for record in caplog.records:
        assert record.levelname not in ("WARNING", "ERROR", "CRITICAL")

def process(self, link: str, state: States) -> int:
    """process

    Create Dataset records for pre-existing server tarballs that are in a
    specified filesystem "state" (the link directory in the archive tree),
    in a specified Dataset state.

    Each tarball for which a Dataset record already exists is IGNORED, and
    we don't attempt to advance its state.

    Args:
        link (str): Filesystem "state" link directory (e.g., TO-INDEX)
        state (States): A state enum value representing the desired Dataset
            state.

    Returns:
        int: Status (0 success, 1 failure)
    """
    logger = self.logger
    done = 0
    fail = 0
    ignore = 0
    args = {}
    owner = User.validate_user(self.options.user)

    for tarball in self._collect_tb(link):
        if self.options.verify:
            print(f"Processing {tarball} from {link} -> state {state}")
        try:
            args["path"] = tarball
            try:
                dataset = Dataset.attach(**args)
                if self.options.verify:
                    print(f"Found existing {dataset}: {dataset.state}")
                ignore = ignore + 1
            except DatasetNotFound:
                a = args.copy()
                a["md5"] = open(f"{tarball}.md5").read().split()[0]

                # NOTE: including "state" on attach above would attempt to
                # advance the dataset's state, which we don't want for
                # import, so we add it only here. "owner" would be ignored
                # by attach, but we add it here anyway for clarity.
                a["state"] = state
                a["owner"] = owner
                dataset = Dataset.create(**a)
                print(f"Imported {dataset}: {state}")
                done = done + 1
        except Exception as e:
            # Stringify any exception and report it; then fail.
            logger.exception("Import of dataset {} failed", tarball)
            print(f"{_NAME_}: dataset {tarball} failed with {e}", file=sys.stderr)
            fail = fail + 1

    print(
        f"Imported {done} datasets from {link} with {fail} errors and {ignore} ignored"
    )
    return 1 if fail > 0 else 0

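def _example_import_invocation(importer):
    """Illustrative sketch only (not part of the original module): one call
    of process() that imports any tar balls currently linked under TO-INDEX
    as Datasets in the UPLOADED state. The "importer" object and the
    TO-INDEX/UPLOADED pairing are assumptions for this example; the real
    caller may walk several link directories with their matching States
    values."""
    return importer.process("TO-INDEX", States.UPLOADED)
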
def test_attach_filename(self):
    """Test that we can create a dataset using the full tarball file path."""
    ds1 = Dataset(
        owner="webb", path="/foo/bilbo/rover.tar.xz", state=States.QUARANTINED
    )
    ds1.add()

    ds2 = Dataset.attach(controller="bilbo", name="rover")
    assert ds2.owner == ds1.owner
    assert ds2.controller == ds1.controller
    assert ds2.name == ds1.name
    assert ds2.state == States.QUARANTINED
    assert ds2.md5 is ds1.md5
    assert ds2.id is ds1.id

def test_attach_controller_path(self):
    """Test that we can attach, using controller and name, to a dataset
    created by file path.
    """
    ds1 = Dataset(
        owner="pete",
        path="/foo/frodo/rover.tar.xz",
        state=States.INDEXING,
    )
    ds1.add()

    ds2 = Dataset.attach(controller="frodo", name="rover")
    assert ds2.owner == ds1.owner
    assert ds2.controller == ds1.controller
    assert ds2.name == ds1.name
    assert ds2.state == States.INDEXING
    assert ds2.md5 is ds1.md5
    assert ds2.id is ds1.id

def test_attach_exists(self):
    """Test that we can attach to an existing dataset."""
    ds1 = Dataset(
        owner="drb", controller="frodo", name="fido", state=States.INDEXING
    )
    ds1.add()

    ds2 = Dataset.attach(controller="frodo", name="fido", state=States.INDEXED)
    assert ds2.owner == ds1.owner
    assert ds2.controller == ds1.controller
    assert ds2.name == ds1.name
    assert ds2.state == States.INDEXED
    assert ds2.md5 is ds1.md5
    assert ds2.id is ds1.id

def process_tb(self, tarballs):
    """Process tar balls for indexing and create the report.

    "tarballs" - list of tar balls; the second value of the tuple returned
    by collect_tb().
    """
    res = 0
    idxctx = self.idxctx
    error_code = self.error_code

    tb_deque = deque(sorted(tarballs))

    # At this point, tarballs contains a list of tar balls sorted by size
    # that were available as symlinks in the various 'linksrc' directories.
    idxctx.logger.debug("Preparing to index {:d} tar balls", len(tb_deque))

    try:
        # Now that we are ready to begin the actual indexing step, ensure we
        # have the proper index templates in place.
        idxctx.logger.debug("update_templates [start]")
        idxctx.templates.update_templates(idxctx.es)
    except TemplateError as e:
        res = self.emit_error(idxctx.logger.error, "TEMPLATE_CREATION_ERROR", e)
    except SigTermException:
        # Re-raise a SIGTERM to avoid it being lumped in with general
        # exception handling below.
        raise
    except Exception:
        idxctx.logger.exception(
            "update_templates [end]: Unexpected template processing error"
        )
        res = error_code["GENERIC_ERROR"]
    else:
        idxctx.logger.debug("update_templates [end]")
        res = error_code["OK"]

    if not res.success:
        # Exit early if we encounter any errors.
        return res.value

    report = Report(
        idxctx.config,
        self.name,
        es=idxctx.es,
        pid=idxctx.getpid(),
        group_id=idxctx.getgid(),
        user_id=idxctx.getuid(),
        hostname=idxctx.gethostname(),
        version=VERSION,
        templates=idxctx.templates,
    )
    # We use the "start" report ID as the tracking ID for all indexed
    # documents.
    try:
        tracking_id = report.post_status(tstos(idxctx.time()), "start")
    except SigTermException:
        # Re-raise a SIGTERM to avoid it being lumped in with general
        # exception handling below.
        raise
    except Exception:
        idxctx.logger.error("Failed to post initial report status")
        return error_code["GENERIC_ERROR"].value
    else:
        idxctx.set_tracking_id(tracking_id)

    with tempfile.TemporaryDirectory(
        prefix=f"{self.name}.", dir=idxctx.config.TMP
    ) as tmpdir:
        idxctx.logger.debug("start processing list of tar balls")
        tb_list = Path(tmpdir, f"{self.name}.{idxctx.TS}.list")
        try:
            with tb_list.open(mode="w") as lfp:
                # Write out all the tar balls we are processing so external
                # viewers can follow along from home.
                for size, controller, tb in tarballs:
                    print(f"{size:20d} {controller} {tb}", file=lfp)

            indexed = Path(tmpdir, f"{self.name}.{idxctx.TS}.indexed")
            erred = Path(tmpdir, f"{self.name}.{idxctx.TS}.erred")
            skipped = Path(tmpdir, f"{self.name}.{idxctx.TS}.skipped")
            ie_filepath = Path(
                tmpdir, f"{self.name}.{idxctx.TS}.indexing-errors.json"
            )

            # We use a list object here so that when we close over this
            # variable in the handler, the list object will be closed over,
            # but not its contents.
            sigquit_interrupt = [False]

            def sigquit_handler(*args):
                sigquit_interrupt[0] = True

            sighup_interrupt = [False]

            def sighup_handler(*args):
                sighup_interrupt[0] = True

            signal.signal(signal.SIGQUIT, sigquit_handler)
            signal.signal(signal.SIGHUP, sighup_handler)
            count_processed_tb = 0

            try:
                while len(tb_deque) > 0:
                    size, controller, tb = tb_deque.popleft()

                    # Sanity check the source tar ball path.
                    linksrc_dir = Path(tb).parent
                    linksrc_dirname = linksrc_dir.name
                    count_processed_tb += 1
                    assert linksrc_dirname == self.linksrc, (
                        f"Logic bomb!  tar ball "
tar ball " f"path {tb} does not contain {self.linksrc}") idxctx.logger.info("Starting {} (size {:d})", tb, size) dataset = None ptb = None username = None try: path = os.path.realpath(tb) try: dataset = Dataset.attach( path=path, state=States.INDEXING, ) except DatasetNotFound: idxctx.logger.warn( "Unable to locate Dataset {}", path, ) except DatasetTransitionError as e: # TODO: This means the Dataset is known, but not in a # state where we'd expect to be indexing it. So what do # we do with it? (Note: this is where an audit log will # be handy; i.e., how did we get here?) For now, just # let it go. idxctx.logger.warn( "Unable to advance dataset state: {}", str(e)) else: username = dataset.owner # "Open" the tar ball represented by the tar ball object idxctx.logger.debug("open tar ball") ptb = PbenchTarBall( idxctx, username, path, tmpdir, Path(self.incoming, controller), ) # Construct the generator for emitting all actions. The # `idxctx` dictionary is passed along to each generator so # that it can add its context for error handling to the # list. idxctx.logger.debug("generator setup") if self.options.index_tool_data: actions = ptb.mk_tool_data_actions() else: actions = ptb.make_all_actions() # File name for containing all indexing errors that # can't/won't be retried. with ie_filepath.open(mode="w") as fp: idxctx.logger.debug("begin indexing") try: signal.signal(signal.SIGINT, sigint_handler) es_res = es_index( idxctx.es, actions, fp, idxctx.logger, idxctx._dbg, ) except SigIntException: idxctx.logger.exception( "Indexing interrupted by SIGINT, continuing to next tarball" ) continue finally: # Turn off the SIGINT handler when not indexing. signal.signal(signal.SIGINT, signal.SIG_IGN) except UnsupportedTarballFormat as e: tb_res = self.emit_error(idxctx.logger.warning, "TB_META_ABSENT", e) except BadDate as e: tb_res = self.emit_error(idxctx.logger.warning, "BAD_DATE", e) except FileNotFoundError as e: tb_res = self.emit_error(idxctx.logger.warning, "FILE_NOT_FOUND_ERROR", e) except BadMDLogFormat as e: tb_res = self.emit_error(idxctx.logger.warning, "BAD_METADATA", e) except SigTermException: idxctx.logger.exception( "Indexing interrupted by SIGTERM, terminating") break except Exception as e: tb_res = self.emit_error(idxctx.logger.exception, "GENERIC_ERROR", e) else: beg, end, successes, duplicates, failures, retries = es_res idxctx.logger.info( "done indexing (start ts: {}, end ts: {}, duration:" " {:.2f}s, successes: {:d}, duplicates: {:d}," " failures: {:d}, retries: {:d})", tstos(beg), tstos(end), end - beg, successes, duplicates, failures, retries, ) tb_res = error_code[ "OP_ERROR" if failures > 0 else "OK"] finally: if dataset: try: dataset.advance( States.INDEXED if tb_res. success else States.QUARANTINED) Metadata.remove(dataset, Metadata.REINDEX) except DatasetTransitionError: idxctx.logger.exception( "Dataset state error") try: ie_len = ie_filepath.stat().st_size except FileNotFoundError: # Above operation never made it to actual indexing, ignore. pass except SigTermException: # Re-raise a SIGTERM to avoid it being lumped in with # general exception handling below. raise except Exception: idxctx.logger.exception( "Unexpected error handling" " indexing errors file: {}", ie_filepath, ) else: # Success fetching indexing error file size. 
                        if ie_len > len(tb) + 1:
                            try:
                                report.post_status(tstos(end), "errors", ie_filepath)
                            except Exception:
                                idxctx.logger.exception(
                                    "Unexpected error issuing report status"
                                    " with errors: {}",
                                    ie_filepath,
                                )
                    finally:
                        # Unconditionally remove the indexing errors file.
                        try:
                            os.remove(ie_filepath)
                        except SigTermException:
                            # Re-raise a SIGTERM to avoid it being lumped in
                            # with general exception handling below.
                            raise
                        except Exception:
                            pass

                    # Distinguish failure cases, so we can retry the indexing
                    # easily if possible. Different `linkerrdest` directories
                    # for different failures; the rest are going to end up in
                    # `linkerrdest` for later retry.
                    controller_path = linksrc_dir.parent

                    if tb_res is error_code["OK"]:
                        idxctx.logger.info(
                            "{}: {}/{}: success",
                            idxctx.TS,
                            controller_path.name,
                            os.path.basename(tb),
                        )
                        # Success
                        with indexed.open(mode="a") as fp:
                            print(tb, file=fp)
                        rename_tb_link(
                            tb, Path(controller_path, self.linkdest), idxctx.logger
                        )
                    elif tb_res is error_code["OP_ERROR"]:
                        idxctx.logger.warning(
                            "{}: index failures encountered on {}", idxctx.TS, tb
                        )
                        with erred.open(mode="a") as fp:
                            print(tb, file=fp)
                        rename_tb_link(
                            tb,
                            Path(controller_path, f"{self.linkerrdest}.1"),
                            idxctx.logger,
                        )
                    elif tb_res in (error_code["CFG_ERROR"], error_code["BAD_CFG"]):
                        assert False, (
                            f"Logic Bomb!  Unexpected tar ball handling "
                            f"result status {tb_res.value:d} for tar ball {tb}"
                        )
                    elif tb_res.tarball_error:
                        # Quietly skip these errors.
                        with skipped.open(mode="a") as fp:
                            print(tb, file=fp)
                        rename_tb_link(
                            tb,
                            Path(
                                controller_path,
                                f"{self.linkerrdest}.{tb_res.value:d}",
                            ),
                            idxctx.logger,
                        )
                    else:
                        idxctx.logger.error(
                            "{}: index error {:d} encountered on {}",
                            idxctx.TS,
                            tb_res.value,
                            tb,
                        )
                        with erred.open(mode="a") as fp:
                            print(tb, file=fp)
                        rename_tb_link(
                            tb,
                            Path(controller_path, self.linkerrdest),
                            idxctx.logger,
                        )

                    idxctx.logger.info(
                        "Finished{} {} (size {:d})",
                        "[SIGQUIT]" if sigquit_interrupt[0] else "",
                        tb,
                        size,
                    )

                    if sigquit_interrupt[0]:
                        break
                    if sighup_interrupt[0]:
                        status, new_tb = self.collect_tb()
                        if status == 0:
                            if not set(new_tb).issuperset(tb_deque):
                                idxctx.logger.info(
                                    "Tarballs supposed to be in 'TO-INDEX' are"
                                    " no longer present",
                                    set(tb_deque).difference(new_tb),
                                )
                            tb_deque = deque(sorted(new_tb))
                        idxctx.logger.info(
                            "SIGHUP status (Current tar ball indexed: ({}),"
                            " Remaining: {}, Completed: {},"
                            " Errors_encountered: {}, Status: {})",
                            Path(tb).name,
                            len(tb_deque),
                            count_processed_tb,
                            _count_lines(erred),
                            tb_res,
                        )
                        sighup_interrupt[0] = False
                        continue
            except SigTermException:
                idxctx.logger.exception(
                    "Indexing interrupted by SIGQUIT, stop processing tarballs"
                )
            finally:
                # Turn off the SIGQUIT and SIGHUP handlers when not indexing.
                signal.signal(signal.SIGQUIT, signal.SIG_IGN)
                signal.signal(signal.SIGHUP, signal.SIG_IGN)
        except SigTermException:
            # Re-raise a SIGTERM to avoid it being lumped in with general
            # exception handling below.
            raise
        except Exception:
            idxctx.logger.exception(error_code["GENERIC_ERROR"].message)
            res = error_code["GENERIC_ERROR"]
        else:
            # No exceptions while processing tar balls; success.
res = error_code["OK"] finally: if idxctx: idxctx.dump_opctx() idxctx.logger.debug("stopped processing list of tar balls") idx = _count_lines(indexed) skp = _count_lines(skipped) err = _count_lines(erred) idxctx.logger.info( "{}.{}: indexed {:d} (skipped {:d}) results," " {:d} errors", self.name, idxctx.TS, idx, skp, err, ) if err > 0: if skp > 0: subj = ( f"{self.name}.{idxctx.TS} - Indexed {idx:d} results, skipped {skp:d}" f" results, w/ {err:d} errors") else: subj = ( f"{self.name}.{idxctx.TS} - Indexed {idx:d} results, w/ {err:d}" " errors") else: if skp > 0: subj = f"{self.name}.{idxctx.TS} - Indexed {idx:d} results, skipped {skp:d} results" else: subj = f"{self.name}.{idxctx.TS} - Indexed {idx:d} results" report_fname = Path(tmpdir, f"{self.name}.{idxctx.TS}.report") with report_fname.open(mode="w") as fp: print(subj, file=fp) if idx > 0: print("\nIndexed Results\n===============", file=fp) with indexed.open() as ifp: for line in sorted(ifp): print(line.strip(), file=fp) if err > 0: print( "\nResults producing errors" "\n========================", file=fp, ) with erred.open() as efp: for line in sorted(efp): print(line.strip(), file=fp) if skp > 0: print("\nSkipped Results\n===============", file=fp) with skipped.open() as sfp: for line in sorted(sfp): print(line.strip(), file=fp) try: report.post_status(tstos(idxctx.time()), "status", report_fname) except SigTermException: # Re-raise a SIGTERM to avoid it being lumped in with general # exception handling below. raise except Exception: pass return res.value
def backup_data(lb_obj, s3_obj, config, logger):
    qdir = config.QDIR

    tarlist = glob.iglob(os.path.join(config.ARCHIVE, "*", _linksrc, "*.tar.xz"))
    ntotal = nbackup_success = nbackup_fail = ns3_success = ns3_fail = nquaran = 0

    for tb in sorted(tarlist):
        ntotal += 1

        # Resolve the link.
        try:
            tar = Path(tb).resolve(strict=True)
        except FileNotFoundError:
            logger.error(
                "Tarball link, '{}', does not resolve to a real location", tb
            )
            # Nothing to back up if the link is dangling; skip it.
            continue
        logger.debug("Start backup of {}.", tar)

        # Check that the tarball exists and is a regular file.
        if tar.exists() and tar.is_file():
            pass
        else:
            # The tarball does not exist or is not a regular file.
            quarantine(qdir, logger, tb)
            nquaran += 1
            logger.error(
                "Quarantine: {}, {} does not exist or is not a regular file",
                tb,
                tar,
            )
            continue

        archive_md5 = Path(f"{tar}.md5")

        # Check that the md5 file exists and is a regular file.
        if archive_md5.exists() and archive_md5.is_file():
            pass
        else:
            # The md5 file does not exist or is not a regular file.
            quarantine(qdir, logger, tb)
            nquaran += 1
            logger.error(
                "Quarantine: {}, {} does not exist or is not a regular file",
                tb,
                archive_md5,
            )
            continue

        # Read the md5sum from the md5 file.
        try:
            with archive_md5.open() as f:
                archive_md5_hex_value = f.readline().split(" ")[0]
        except Exception:
            # Could not read the file.
            quarantine(qdir, logger, tb)
            nquaran += 1
            logger.exception("Quarantine: {}, Could not read {}", tb, archive_md5)
            continue

        # Compare the md5sum of the tarball with its md5 file.
        try:
            archive_tar_hex_value = md5sum(tar)
        except Exception:
            # Could not read the file.
            quarantine(qdir, logger, tb)
            nquaran += 1
            logger.exception("Quarantine: {}, Could not read {}", tb, tar)
            continue

        if archive_tar_hex_value != archive_md5_hex_value:
            quarantine(qdir, logger, tb)
            nquaran += 1
            logger.error(
                "Quarantine: {}, md5sum of {} does not match its md5 file {}",
                tb,
                tar,
                archive_md5,
            )
            continue

        resultname = tar.name
        controller_path = tar.parent
        controller = controller_path.name

        try:
            # This tool can't see a dataset until it's been prepared either
            # by server PUT or by pbench-server-prep-shim-002.py; in either
            # case, the Dataset object must already exist.
            dataset = Dataset.attach(controller=controller, path=resultname)
        except DatasetError as e:
            logger.warning(
                "Trouble tracking {}:{}: {}", controller, resultname, str(e)
            )
            dataset = None

        # Handle all the local backup related operations and count the
        # number of successes and failures.
        local_backup_result = backup_to_local(
            lb_obj,
            logger,
            controller_path,
            controller,
            tb,
            tar,
            resultname,
            archive_md5,
            archive_md5_hex_value,
        )

        if local_backup_result == Status.SUCCESS:
            nbackup_success += 1
        elif local_backup_result == Status.FAIL:
            nbackup_fail += 1
        else:
            assert (
                False
            ), f"Impossible situation, local_backup_result = {local_backup_result!r}"

        # Handle all the S3 bucket related operations and count the number
        # of successes and failures.
        s3_backup_result = backup_to_s3(
            s3_obj,
            logger,
            controller_path,
            controller,
            tb,
            tar,
            resultname,
            archive_md5_hex_value,
        )

        if s3_backup_result == Status.SUCCESS:
            ns3_success += 1
        elif s3_backup_result == Status.FAIL:
            ns3_fail += 1
        else:
            assert (
                False
            ), f"Impossible situation, s3_backup_result = {s3_backup_result!r}"

        if local_backup_result == Status.SUCCESS and (
            s3_obj is None or s3_backup_result == Status.SUCCESS
        ):
            # Move the tar ball symlink to its final resting place.
            rename_tb_link(tb, Path(controller_path, _linkdest), logger)
        else:
            # Do nothing when the backup fails, allowing us to retry on a
            # future pass.
            pass

        if dataset:
            Metadata.create(dataset=dataset, key=Metadata.ARCHIVED, value="True")

        logger.debug("End backup of {}.", tar)

    return Results(
        ntotal=ntotal,
        nbackup_success=nbackup_success,
        nbackup_fail=nbackup_fail,
        ns3_success=ns3_success,
        ns3_fail=ns3_fail,
        nquaran=nquaran,
    )

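def _example_backup_driver(lb_obj, s3_obj, config, logger):
    """Illustrative sketch only (not part of the original module): a
    hypothetical driver for backup_data() that summarizes the Results tuple
    it returns. The log wording and the exit-status convention are
    assumptions for this example."""
    counts = backup_data(lb_obj, s3_obj, config, logger)
    logger.info(
        "Processed {} tar balls: {} backed up locally, {} to S3, {} quarantined",
        counts.ntotal,
        counts.nbackup_success,
        counts.ns3_success,
        counts.nquaran,
    )
    return 0 if (counts.nbackup_fail == 0 and counts.ns3_fail == 0) else 1
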
def reindex(controller_name, tb_name, archive_p, incoming_p, dry_run=False):
    """reindex - re-index the given tar ball name.

    This function is responsible for finding the current symlink to the tar
    ball and moving it to the TO-RE-INDEX directory, creating that directory
    if it does not exist.
    """
    assert tb_name.endswith(".tar.xz"), f"invalid tar ball name, '{tb_name}'"

    if not (incoming_p / controller_name / tb_name[:-7]).exists():
        # Can't re-index tar balls that are not unpacked.
        return (controller_name, tb_name, "not-unpacked", "")

    # Construct the controller path object used throughout the rest of this
    # function.
    controller_p = archive_p / controller_name

    # Construct the target path to which all tar ball symlinks will be moved.
    newpath = controller_p.joinpath("TO-RE-INDEX", tb_name)

    paths = []
    _linkdirs = ("TO-INDEX-TOOL", "INDEXED")
    for linkname_p in controller_p.glob(f"*/{tb_name}"):
        # Consider all existing tar ball symlinks.
        if linkname_p.parent.name in ("TO-INDEX", "TO-RE-INDEX"):
            msg = (
                f"WARNING: {linkname_p.parent.name} link already exists for"
                f" {controller_p / tb_name}"
            )
            # Indicate no action taken, and exit early.
            return (controller_name, tb_name, "exists", msg)
        elif linkname_p.parent.name in _linkdirs:
            # One of the expected symlinks; add it for consideration below.
            paths.append(linkname_p)
        elif linkname_p.parent.name.startswith("WONT-INDEX"):
            # One of the expected WONT-INDEX* symlinks, also added for
            # consideration below.
            paths.append(linkname_p)
        # else:
        #     All other symlinks are not considered.

    if not paths:
        # No existing TO-INDEX or TO-RE-INDEX symlink, and no previous
        # indexing symlinks; exit early.
        return (controller_name, tb_name, "noop", "")

    if len(paths) > 1:
        # If we have more than one path, just flag this as a bad state and
        # exit early.
        return (controller_name, tb_name, "badstate", "")

    # At this point we are guaranteed to have only one path.
    assert len(paths) == 1, f"Logic bomb! len(paths) ({len(paths)}) != 1"

    try:
        if not dry_run:
            paths[0].rename(newpath)
            ds = Dataset.attach(controller=controller_name, path=tb_name)
            Metadata.create(dataset=ds, key=Metadata.REINDEX, value="True")
    except DatasetError as exc:
        msg = (
            f"WARNING: unable to set REINDEX metadata for"
            f" {controller_name}:{tb_name}: {str(exc)}"
        )
        res = "error"
    except Exception as exc:
        msg = (
            f"WARNING: failed to rename symlink '{paths[0]}' to"
            f" '{newpath}', '{exc}'"
        )
        res = "error"
    else:
        msg = ""
        res = "succ"
    return (controller_name, tb_name, res, msg)

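def _example_reindex_one(archive_p, incoming_p):
    """Illustrative sketch only (not part of the original module): a single
    dry-run invocation of reindex(), returning its (controller, name, status,
    message) tuple. The controller and tar ball names are placeholder
    assumptions for this example."""
    return reindex(
        "perf44.example.com",
        "pbench-user-benchmark_example_2021.05.01T12.00.00.tar.xz",
        archive_p,
        incoming_p,
        dry_run=True,
    )
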