def subject(self):
    """Subject line for the report's email message.

    The string is assembled on first use from the name of the current
    tier and then cached on the instance as `_subject`.
    """

    try:
        return self._subject
    except AttributeError:
        self._subject = f"[{Tier().name}] CDR Pending Scheduled Jobs"
        return self._subject
def tier(self):
    """
    Identification of which CDR server is running the publishing job

    A `Tier` object is created the first time the value is requested
    and cached on the instance as `_tier` for later calls.
    """

    if hasattr(self, "_tier"):
        return self._tier
    self._tier = Tier()
    return self._tier
def __init__(self, logger=None, recip=None):
    """
    Capture the job's settings and open the database connection.

    Pass:
        logger - the scheduled job's logger (unless testing from
                 the command line); when none is supplied a
                 debug-level "glossifier" logger is created
        recip - optional email address for testing without spamming
                the users
    """

    self.tier = Tier()
    if logger is None:
        logger = cdr.Logging.get_logger("glossifier", level="debug")
    self.logger = logger
    self.recip = recip
    connection = db.connect()
    self.conn = connection
    self.cursor = connection.cursor()
def __init__(self):
    """
    Build the job control object for restoring data on the CDR DEV server.

    The work is done in three steps:
      1. Refuse to run anywhere but on the DEV tier.
      2. Parse the command-line parameters for this job.
      3. Create the objects (logger, database connection, old and
         new data sets) used to do the job's work.

    Raises:
        Exception if the current tier is not DEV
    """

    # 1. Safety check.
    current_tier = Tier().name
    if current_tier != "DEV":
        raise Exception("This script must only be run on the DEV tier.")

    # 2. Get what we need from the command line.
    required = (
        ("--directory", "directory to restore from"),
        ("--user", "user ID"),
        ("--session", "user session"),
    )
    parser = ArgumentParser()
    for flag, description in required:
        parser.add_argument(flag, required=True, help=description)
    parser.add_argument(
        "--skip-content",
        action="store_true",
        help="exclude practice documents from being restored",
    )
    opts = parser.parse_args()

    # 3. Create objects used to do the job's work.
    self._logger = cdr.Logging.get_logger("PushDevData", console=True)
    self._conn = db.connect(user="******")
    self._cursor = self._conn.cursor()
    self._dir = opts.directory
    self._skip_content = opts.skip_content or False
    self._old = cdr_dev_data.Data(self._dir)
    self._new = cdr_dev_data.Data(self._cursor, self._old)
    self._uid = opts.user
    self._session = opts.session
    self._logger.info("session %s", self._session)
    self._logger.info("using data preserved in %s", self._dir)
    self._new_doc_types = []
def tier(self):
    """`Tier` object for the local tier (created once, then cached)."""

    try:
        return self._tier
    except AttributeError:
        self._tier = Tier()
        return self._tier
import cdr
from cdrapi import db as cdrdb
from cdrapi.settings import Tier

# Collect the command-line options: the document type is required; the
# cap on the number of documents, the tier, and the number of leading
# documents to skip are all optional.
parser = argparse.ArgumentParser()
parser.add_argument("doctype")
parser.add_argument("--max-docs", type=int)
parser.add_argument("--tier")
parser.add_argument("--skip", type=int)
opts = parser.parse_args()

# Find the IDs of the documents of the requested type, in ID order.
cursor = cdrdb.connect(user="******", tier=opts.tier).cursor()
query = cdrdb.Query("document d", "d.id")
query.join("doc_type t", "t.id = d.doc_type")
query.where(query.Condition("t.name", opts.doctype))
if opts.max_docs:
    query.limit(opts.max_docs)
rows = query.order("d.id").execute(cursor).fetchall()
if opts.skip:
    rows = rows[opts.skip:]

# Report what we are about to do, then reindex each document, showing
# progress on a single line and any non-empty response on its own line.
where = opts.tier if opts.tier else Tier().name
total = len(rows)
stderr.write("reindexing {} documents on {}\n".format(total, where))
for count, (doc_id,) in enumerate(rows, start=1):
    progress = "\rreindexing CDR{:010d} {:d} of {:d}".format(
        doc_id, count, total)
    stderr.write(progress)
    resp = cdr.reindex("guest", doc_id, tier=opts.tier)
    if resp:
        stderr.write("\n{!r}\n".format(resp))
stderr.write("\n")
return sets def get_members(members): lines = [] for member in members: if isinstance(member, FilterSet): lines.append(f"filter set: {member.name}") else: lines.append(f"filter: {member.title}") return lines parser = ArgumentParser() parser.add_argument("--other_tier", default="PROD") parser.add_argument("--local_tier", default=Tier().name) opts = parser.parse_args() local = get_sets(opts.local_tier) other = get_sets(opts.other_tier) other_names = sorted(other) position = 0 banner = f"Comparing FilterSets between {opts.other_tier} and {opts.local_tier}" for name in sorted(local): while position < len(other_names) and other_names[position] < name: if banner: print(banner) print() banner = None print(other_names[position]) print(f"local name is {name!r}") print(f"other name is {other_names[position]!r}")
#----------------------------------------------------------------------
# Preprocess login creation/configuration script for this tier.
#
# Reads the CreateLogins.sql template, replaces the @@DBOPW@@,
# @@GUESTPW@@, and @@PUBPW@@ placeholders with the passwords for the
# current tier, and writes the result to a time-stamped copy.
#----------------------------------------------------------------------
import time
import cdrpw
from cdrapi.settings import Tier

# The template is handled as bytes so that it is written back exactly
# as it was read, apart from the substituted passwords.
with open("CreateLogins.sql", "rb") as fp:
    script = fp.read()

# Fall back on the default organization if the environment file can't
# be read (deliberately best-effort).
try:
    with open("d:/etc/cdrenv.rc") as fp:
        env = fp.read().strip()
except Exception:
    env = "CBIIT"

tier = Tier().name
dboPw = cdrpw.password(env, tier, "cdr", "CdrSqlAccount")
guestPw = cdrpw.password(env, tier, "cdr", "CdrGuest")
pubPw = cdrpw.password(env, tier, "cdr", "CdrPublishing")

# bytes.replace() only accepts bytes-like arguments, so the placeholders
# are bytes literals and any password returned as a string is encoded
# before it is plugged in.
replacements = (
    (b"@@DBOPW@@", dboPw),
    (b"@@GUESTPW@@", guestPw),
    (b"@@PUBPW@@", pubPw),
)
for placeholder, password in replacements:
    if isinstance(password, str):
        password = password.encode("utf-8")
    script = script.replace(placeholder, password)

now = time.strftime("%Y%m%d%H%M%S")
name = "CreateLogins-%s.sql" % now
with open(name, "wb") as fp:
    fp.write(script)
print("wrote", name)
def tier(self):
    """Run time settings.

    The `Tier` object is instantiated on the first request and stored
    on the instance as `_tier`.
    """

    already_cached = hasattr(self, "_tier")
    if not already_cached:
        self._tier = Tier()
    return self._tier
args = parser.parse_args() opts["thread"] = args.thread if args.debug: opts["level"] = "DEBUG" if args.output: opts["output-dir"] = args.output control = Control(args.session, args.job, args.spec, *args.docs, **opts) try: control.run() control.logger.info("Thread %05d complete", args.thread) for thread in threading.enumerate(): control.logger.info("%s is active", thread) sys.exit(0) except Exception: control.logger.exception("Thread %05d failure", args.thread) sys.exit(1) if __name__ == "__main__": """ Don't invoke `main()` if loaded as a module. """ try: main() except Exception: tier = Tier() logger = tier.get_logger("export-docs") logger.exception("Unable to construct CDR document export controller") sys.exit(1)
def tier(self):
    """Which CDR tier are we using?

    Returns the cached `Tier` object, creating it if this is the
    first time it has been requested.
    """

    try:
        cached = self._tier
    except AttributeError:
        self._tier = Tier()
        cached = self._tier
    return cached
class Settings:
    """Collector for the settings of the CDR Windows server.

    Gathers the Windows version and environment, SQL Server attributes,
    installed Python packages, IIS configuration, MD5 checksums for the
    files in the CDR directories, and the document type definitions,
    and serializes the lot as JSON.
    """

    TIER = Tier()
    HOSTNAMES = TIER.hosts
    LOGFILE = f"{cdr.DEFAULT_LOGDIR}/fetch-tier-settings.log"
    WD = cdr.WORK_DRIVE
    WEBCONFIG_ROOT = f"{WD}:/Inetpub/wwwroot/web.config"
    WEBCONFIG_SECURE = f"{WD}:/Inetpub/wwwroot/cgi-bin/secure/web.config"
    WEBCONFIG_GLOSSIFIER = f"{WD}:/cdr/Glossifier/cgi-bin/web.config"

    def __init__(self, session):
        """Remember the session and collect all of the settings.

        Pass:
            session - saved as `self.session` (not otherwise used here)
        """

        self.session = session

        # The path must be interpolated from the tier object; without
        # the braces the literal "self.TIER.etc/cdrenv.rc" was opened,
        # which never exists, so the fallback was always used.
        # Reading the file is deliberately best-effort.
        try:
            with open(f"{self.TIER.etc}/cdrenv.rc") as fp:
                self.org = fp.read().strip()
        except Exception:
            self.org = "CBIIT"
        self.tier = self.TIER.name
        self.windows = self.get_windows_settings()

    def get_iis_settings(self):
        """Return the web server account, version, and web.config files."""

        return {
            "account": cdr.run_command("whoami").stdout.strip(),
            "version": os.environ.get("SERVER_SOFTWARE"),
            "web.config": {
                "root": self.xmltojson(self.WEBCONFIG_ROOT),
                "secure": self.xmltojson(self.WEBCONFIG_SECURE),
                "glossifier": self.xmltojson(self.WEBCONFIG_GLOSSIFIER),
            }
        }

    def xmltojson(self, path):
        """Parse an XML file into a dictionary keyed by its root tag.

        Pass:
            path - location of the XML file

        Return:
            dictionary with one entry, mapping the root element's tag
            to the nested dictionary built by `extract_node()`
        """

        root = etree.parse(path).getroot()
        return {root.tag: self.extract_node(root)}

    def extract_node(self, node):
        """Recursively convert an XML element to a dictionary.

        Attributes and child elements share one dictionary. A name
        which occurs exactly once maps directly to its value; a name
        which occurs more than once maps to a list of the values.

        Pass:
            node - element to be converted

        Return:
            dictionary of the element's attributes and children
        """

        children = {}
        for key in node.keys():
            children[key] = [node.get(key)]
        for child in node:
            children.setdefault(child.tag, []).append(
                self.extract_node(child))
        for name, values in children.items():
            if len(values) == 1:
                children[name] = values[0]
        return children

    def get_windows_settings(self):
        """Assemble the dictionary of everything we report."""

        winver = sys.getwindowsversion()
        names = "major", "minor", "build", "platform", "service_pack"
        version = {name: getattr(winver, name, "") for name in names}
        settings = {"version": version}
        settings["environ"] = dict(os.environ)

        # Don't fail if the PATH variable is not set at all.
        search_path = os.environ.get("PATH", "")
        settings["search_path"] = [p for p in search_path.split(";") if p]
        settings["mssql"] = self.get_mssql_settings()
        settings["python"] = self.get_python_settings()
        settings["iis"] = self.get_iis_settings()
        settings["files"] = self.get_files()
        settings["doctypes"] = self.get_doctypes()
        return settings

    def get_doctypes(self):
        """Extract valid values and linking elements for each doctype.

        Return:
            dictionary indexed by the `Type` attribute of each
            `CdrGetDocTypeResp` element in CdrDocTypes.xml
        """

        doctypes = {}
        path = f"{self.WD}:/cdr/ClientFiles/CdrDocTypes.xml"
        root = etree.parse(path).getroot()
        for node in root.findall("CdrGetDocTypeResp"):
            key = node.get("Type")
            doctypes[key] = {}
            for child in node:
                if child.tag == "EnumSet":
                    values = [vv.text for vv in child.findall("ValidValue")]
                    doctypes[key][child.get("Node")] = sorted(values)
                elif child.tag == "LinkingElements":
                    # NOTE(review): this looks for "LinkingElements"
                    # children inside a "LinkingElements" element;
                    # confirm whether "LinkingElement" was intended.
                    elems = [
                        e.text for e in child.findall("LinkingElements")
                    ]
                    doctypes[key]["linking-elements"] = sorted(elems)
        return doctypes

    def get_files(self):
        """Build a nested dictionary of checksums for the CDR files."""

        # The Licensee directory was listed twice; walking it a second
        # time produced the same entries, so it is only walked once now.
        directories = (
            "cdr/lib",
            "cdr/Bin",
            "cdr/Build",
            "cdr/ClientFiles",
            "cdr/Glossifier",
            "cdr/Licensee",
            "cdr/Mailers",
            "cdr/Publishing",
            "Inetpub/wwwroot",
        )
        files = {}
        for directory in directories:
            self.walk(files, f"{self.WD}:/{directory}")
        return files

    def walk(self, files, path):
        """Add the checksums for one directory tree to the dictionary.

        Pass:
            files - nested dictionary to be populated
            path - top of the directory tree to be walked
        """

        for dirpath, _dirs, filenames in os.walk(path):
            if "__pycache__" in dirpath:
                continue
            dirpath = dirpath.replace("\\", "/")

            # Descend to the dictionary for this directory, creating
            # levels as needed (the drive portion of the path is skipped).
            directory = files
            for name in dirpath.split("/")[1:]:
                directory = directory.setdefault(name, {})
            for name in filenames:
                self.add_file(dirpath, name, directory)

    def add_file(self, path, name, files):
        """Record the MD5 checksum for a single file.

        Pass:
            path - directory in which the file is found
            name - the file's name
            files - dictionary for that directory, in which the file's
                    lowercase hex digest (or "unreadable") is stored
        """

        try:
            with open(f"{path}/{name}", "rb") as fp:
                md5 = hashlib.md5(fp.read()).hexdigest().lower()
        except Exception:
            md5 = "unreadable"
        files[name] = md5

    def get_python_settings(self):
        """Return the Python version and the installed package versions."""

        env = pkg_resources.Environment()
        settings = dict(python=sys.version)
        for name in env:
            for package in env[name]:
                settings[package.project_name] = package.version
        return settings

    def get_mssql_settings(self):
        """Return the attributes reported by SQL Server's sp_server_info."""

        cursor = db.connect().cursor()
        cursor.execute("EXEC sp_server_info")
        settings = {}
        for _attr_id, attr_name, attr_value in cursor.fetchall():
            settings[attr_name] = attr_value
        return settings

    def serialize(self):
        """Return the collected settings as an indented JSON string."""

        return json.dumps({
            "windows": self.windows,
        }, indent=2)

    def run(self):
        """Write the JSON response, preceded by its content-type header."""

        print(f"Content-type: application/json\n\n{self.serialize()}")
def setUp(self):
    """Log in as the test account and save the session for the tests.

    The password is looked up from the tier settings, and the new
    session is stored on the `Tests` class as `Tests.session`.
    """

    credentials = Tier().password(self.USERNAME)
    Tests.session = Session.create_session(
        self.USERNAME,
        comment="filter testing",
        password=credentials,
    )
class Control(Controller):
    """Logic control center for the script.

    The landing page lists the audio zip files found on the disk.
    Selecting one brings up a review form for its MP3 files, and
    saving the form records the reviewer's decisions. Requests for
    an individual MP3 file or for a workbook of rejected files are
    answered with the raw bytes.
    """

    SUBTITLE = "Glossary Term Audio Review"
    SAVE = "Save"
    BASEDIR = Tier().basedir
    LOGNAME = "GlossaryTermAudioReview"
    ZIPDIR = f"{BASEDIR}/Audio_from_CIPSFTP"
    REVDIR = f"{ZIPDIR}/GeneratedRevisionSheets"
    IGNORE = "__MACOSX"
    NOTEPAT = compile(r"[\r\n]+")
    NAMEPAT = compile(r"(?i)(?P<base>Week_\d{4}_\d\d)(?P<rev>_Rev\d)*.zip")
    REVPAT = compile(r"(?i)_Rev(?P<num>\d+)")
    MAXNOTE = 2040
    MAXFILE = 250
    MAXTERM = 250
    MAXNAME = 250
    PERMISSION = "REVIEW TERM AUDIO"
    FIXNAME_INSTRUCTIONS = [
        "Please correct the name to reflect one of the following formats "
        "or contact programming support staff for assistance.",
        "Week_YYYY_WW.zip or Week_YYYY_WW_RevN.zip",
        "... where 'Y', 'W', and 'N' represent decimal digits.",
    ]

    def run(self):
        """Provide custom routing.

        Requests for a workbook, for an MP3 file, or to save the review
        form are handled here; anything else is passed along to the
        base class. Failures are logged and reported with `bail()`.
        """

        args = self.request, self.name, self.id
        self.logger.debug("request=%s name=%s, id=%s", *args)
        try:
            if self.book:
                return self.send_book()
            if self.mp3:
                return self.send_mp3()
            elif self.request == self.SAVE:
                return self.save()
        except Exception as e:
            self.logger.exception("Failure")
            self.bail(e)
        Controller.run(self)

    def populate_form(self, page):
        """Show the review form for a set, or the set list if none selected.

        The landing page for this script shows the list of audio file sets
        on the disk. If the user selects one of the sets, we draw the form
        for reviewing the audio files in that set.

        Pass:
            page - HTMLPage object on which we draw the form
        """

        # Set the table background to match the rest of the form page.
        rules = ["td, th { background:#e8e8e8; }"]

        # Show the review form for an audio file set if one has been picked.
        if self.audio_set:
            instructions = (
                "Click a hyperlinked mp3 filename to play the sound in "
                "your browser-configured mp3 player (files which have "
                "already been reviewed files are at the bottom of the "
                "list of files.)",
                "Use the radio buttons to approve or reject a file.",
                "When finished, click 'Save' to save any changes to "
                "the database. If all files in the set have been reviewed "
                "and any have been rejected, a spreadsheet containing "
                "rejected terms will be created and displayed on your "
                "workstation. Please save it for future use.",
            )
            page.form.append(page.hidden_field("id", self.audio_set.id))
            fieldset = page.fieldset("Instructions")
            for paragraph in instructions:
                fieldset.append(page.B.P(paragraph))
            page.form.append(fieldset)
            page.form.append(self.audio_set.table)

            # The last rule was missing its opening brace, which made
            # it invalid CSS.
            rules += [
                "td, th { border-color:#888; }",
                "fieldset{ width: 900px; }",
                ".status-buttons { width: 86px; white-space: nowrap; }",
                ".status-buttons input { padding-left: 10px; }",
                "td:last-child { padding: 0 2px; }",
            ]

        # Otherwise, show the list of all the sets on the disk.
        else:
            fieldset = page.fieldset("Instructions")
            instructions = (
                "Click a link to a zip file to review from the table below. "
                "Only those files that have not yet been completely reviewed "
                "are hyperlinked.")
            fieldset.append(page.B.P(instructions))
            page.form.append(fieldset)
            columns = "File name", "Review status", "Date modified"
            columns = [page.B.TH(column) for column in columns]
            table = page.B.TABLE(page.B.TR(*columns))
            for zipfile in self.zipfiles_on_disk:
                table.append(zipfile.row)
            fieldset = page.fieldset("Audio Zip Files")
            fieldset.append(table)
            page.form.append(fieldset)
            rules += [
                "td, th { border-color: #bbb; }",
                "table { width: 95%; }",
            ]
        page.add_css("\n".join(rules))

    def save(self):
        """Save review results and show another form.

        If the user has not completed the review of this set, redisplay
        its review form. Otherwise, go back to the display of all the
        sets on the disk.
        """

        if not self.session.can_do(self.PERMISSION):
            self.bail("User not authorized to review term audio files")
        updates = 0
        for mp3 in self.audio_set.audio_files:
            status = self.fields.getvalue(f"status-{mp3.id}") or "U"
            note = self.fields.getvalue(f"note-{mp3.id}") or ""

            # Collapse runs of line breaks and enforce the size limit.
            note = self.NOTEPAT.sub("\n", note.strip())[:self.MAXNOTE]
            if note != mp3.reviewer_note or status != mp3.review_status:
                mp3.update(status, note)
                updates += 1

        # If there have been any changes, commit them and refresh the set.
        if updates:
            self.logger.info("updated %d mp3 rows", updates)
            self.conn.commit()
        if self.audio_set.done:
            book_name = self.audio_set.close()
            legend = f"Audio Set {self.audio_set.name} Review Complete"
            fieldset = self.form_page.fieldset(legend)
            args = ["All of the audio files in this set have been reviewed. "]
            if book_name:
                url = self.make_url(self.script, book=book_name)
                label = "the workbook for these rejected audio files"
                link = self.form_page.B.A(label, href=url)
                args += [
                    "Some of the audio files were rejected. You can retrieve ",
                    link,
                    ", which can be used for the next round of audio files.",
                ]
            else:
                args.append(
                    "None of the files in the set were rejected, so there "
                    "is no new workbook for a subsequent round of files.")
            paragraph = self.form_page.B.P(*args)
            fieldset.append(paragraph)
            self.form_page.form.append(fieldset)

            # Go back to the list of sets now that this one is finished.
            self.audio_set = None
        else:
            if updates:
                self.subtitle = f"Saved updates for {updates} recording(s)"
        self.show_form()

    def send_book(self):
        """Serve up the new workbook with rejected audio files."""

        with open(f"{self.REVDIR}/{self.book}.xlsx", "rb") as fp:
            book_bytes = fp.read()
        mime_type = f"application/{Excel.MIME_SUBTYPE}"
        self.send_bytes(book_bytes, f"{self.book}.xlsx", mime_type)

    def send_bytes(self, payload, name, mime_type):
        """Return a binary file to the browser.

        Used by `send_book()` and `send_mp3()`.

        Pass:
            payload - the bytes to return
            name - string for the content disposition's filename
            mime_type - standard RFC6838 type/subtype string
        """

        headers = (
            f"Content-Type: {mime_type}",
            f"Content-disposition: inline; filename={name}",
            f"Content-Length: {len(payload):d}",
        )
        for header in headers:
            stdout.buffer.write(header.encode("utf-8"))
            stdout.buffer.write(b"\n")

        # A blank line separates the headers from the body.
        stdout.buffer.write(b"\n")
        stdout.buffer.write(payload)

    def send_mp3(self):
        """Let the reviewer listen to the audio file."""

        query = self.Query("term_audio_mp3 m", "m.mp3_name", "z.filename")
        query.join("term_audio_zipfile z", "z.id = m.zipfile_id")
        query.where(query.Condition("m.id", self.mp3))
        mp3_name, filename = query.execute(self.cursor).fetchone()
        with ZipFile(f"{self.ZIPDIR}/{filename}") as zipfile:
            mp3_bytes = zipfile.read(mp3_name)
        self.send_bytes(mp3_bytes, mp3_name, "audio/mpeg")

    @property
    def audio_set(self):
        """Information about the set of MP3 files being reviewed.

        Identified by file name if one was submitted, otherwise by the
        set's ID. None if neither was provided.
        """

        if not hasattr(self, "_audio_set"):
            self._audio_set = None
            if self.name:
                self._audio_set = AudioSet(self, name=self.name)
            elif self.id:
                self._audio_set = AudioSet(self, id=self.id)
        return self._audio_set

    @audio_set.setter
    def audio_set(self, value):
        """Allow the audio_set to be reset after review is done.

        Pass:
            value - new value for the property (will be None in this case)
        """

        self._audio_set = value

    @property
    def book(self):
        """Name of new workbook with rejected audio files.

        Used by the callback to fetch the new Excel file.
        """

        return self.fields.getvalue("book")

    @property
    def buttons(self):
        """Customize the action list (this isn't a report)."""

        if not self.audio_set:
            return self.ADMINMENU, self.LOG_OUT
        else:
            return self.SAVE, self.ADMINMENU, self.LOG_OUT

    @property
    def id(self):
        """ID of the MP3 file set's row in the database table."""
        return self.fields.getvalue("id")

    @property
    def mp3(self):
        """ID of the MP3 file the reviewer wishes to hear."""
        return self.fields.getvalue("mp3")

    @property
    def name(self):
        """File name for the selected MP3 file set to be reviewed."""
        return self.fields.getvalue("name")

    @property
    def name_counts(self):
        """Index of integers for new MP3 names.

        This is used to prevent name collisions in the event
        there are multiple Spanish names for the same term.
        """

        if not hasattr(self, "_name_counts"):
            self._name_counts = dict()
        return self._name_counts

    @property
    def subtitle(self):
        """String to be displayed under the main banner."""

        if not hasattr(self, "_subtitle"):
            self._subtitle = self.SUBTITLE
        return self._subtitle

    @subtitle.setter
    def subtitle(self, value):
        """Allow the display to be overridden after saving reviews.

        Pass:
            value - new string to be displayed under the banner
        """

        self._subtitle = value

    @property
    def user_id(self):
        """Account ID for the current CDR user."""
        return self.session.user_id

    @property
    def zipfiles(self):
        """Load the complete set of term audio zipfiles from the database.

        Does not include zipfiles which are in the file system but have
        not yet been reviewed. See the `zipfiles_on_disk` property for
        the list of all file in the zipfile directory which match our
        conventional filename pattern for audio zipfiles. This set contains
        zipfiles which are no longer in the audio files directory in the
        file system (because they have been archived by the scheduler
        file sweeper). Only those zipfiles which are still in the file
        system are shown in the list of zipfiles on this script's initial
        page.

        Return:
            dictionary of `ZipFile` objects indexed by lowercase filename
        """

        if not hasattr(self, "_zipfiles"):

            class ZipFiles:
                """ID and name indexes to the term audio zipfiles."""

                def __init__(self, control):
                    """Save the reference to the control object.

                    Pass:
                        control - access to the DB and the HTML builder class
                    """

                    self.__control = control

                @property
                def files(self):
                    """Sequence of `ZipFile` objects."""

                    if not hasattr(self, "_files"):
                        ctrl = self.__control
                        query = ctrl.Query("term_audio_zipfile", "*")
                        rows = query.execute(ctrl.cursor).fetchall()
                        self._files = [self.ZipFile(ctrl, row) for row in rows]
                    return self._files

                @property
                def ids(self):
                    """Dictionary of zipfiles by primary key."""

                    if not hasattr(self, "_ids"):
                        self._ids = {file.id: file for file in self.files}
                    return self._ids

                @property
                def names(self):
                    """Dictionary of zipfiles by file name."""

                    if not hasattr(self, "_names"):
                        self._names = {f.filename: f for f in self.files}
                    return self._names

                class ZipFile:
                    """Information about a single archive of audio files.

                    This is a simpler class than the global `AudioSet` class.
                    That class has information about the audio files in the
                    zip file. This class has just enough information to meet
                    the needs of the page which displays all of the zipfiles.

                    Properties:
                        id - integer primary key for the zipfile record
                        filename - string for the zipfile's name
                        filedate - date/time stamp for the zipfile
                        complete - Boolean indicating whether reviews are done
                    """

                    PROPS = "id", "filename", "filedate", "complete"

                    def __init__(self, control, row):
                        """Capture the caller's information.

                        Pass:
                            control - access to the HTML builder class
                            row - result set row from the SQL query
                        """

                        self.__control = control
                        self.__row = row

                    def __getattr__(self, name):
                        """Return the other properties directly."""
                        return getattr(self.__row, name)

                    def __str__(self):
                        """String for debugging/logging."""

                        if not hasattr(self, "_str"):
                            names = self.PROPS
                            props = [f"{n}={getattr(self, n)}" for n in names]
                            self._str = " ".join(props)
                        return self._str

                    @property
                    def complete(self):
                        """True if all the audio files have been reviewed."""
                        return self.__row.complete == "Y"

            self.__zipfiles = ZipFiles(self)
            self._zipfiles = {}
            for zipfile in self.__zipfiles.files:
                self._zipfiles[zipfile.filename.lower()] = zipfile
        return self._zipfiles

    @property
    def zipfile_names(self):
        """Index by name of all the audio set zipfiles on the disk."""

        if not hasattr(self, "_zipfile_names"):
            self._zipfile_names = {}
            for zipfile in self.zipfiles_on_disk:
                self._zipfile_names[zipfile.key] = zipfile
        return self._zipfile_names

    @property
    def zipfiles_on_disk(self):
        """Zipfiles in the file system.

        The sequence is sorted by review status, then by file name.
        Processing is halted with `bail()` if a file which looks like
        an audio archive has a name which doesn't follow the convention.
        """

        if not hasattr(self, "_zipfiles_on_disk"):

            class DiskFile:
                """A zipfile found in the audio directory."""

                STARTED = "Started"
                UNREVIEWED = "Unreviewed"
                COMPLETED = "Completed"
                STATUS_SORT = {STARTED: 1, UNREVIEWED: 2, COMPLETED: 3}

                def __init__(self, control, entry):
                    """Remember the caller's values.

                    Pass:
                        control - access to the database information
                                  and the HTML builder
                        entry - directory entry object for the file
                    """

                    self.__control = control
                    self.__entry = entry

                def __lt__(self, other):
                    """Sort by status then by filename."""
                    return self.sortkey < other.sortkey

                @property
                def control(self):
                    """Access to the top-level processing object."""
                    return self.__control

                @property
                def datetime(self):
                    """When the file was last modified."""

                    if not hasattr(self, "_datetime"):
                        mtime = self.__entry.stat().st_mtime
                        self._datetime = datetime.fromtimestamp(mtime)
                    return self._datetime

                @property
                def db_info(self):
                    """Information about this file from the database."""

                    if not hasattr(self, "_db_info"):
                        self._db_info = self.control.zipfiles.get(self.key)
                    return self._db_info

                @property
                def name(self):
                    """The file's name as found in the directory."""
                    return self.__entry.name

                @property
                def path(self):
                    """Location of the file, using forward slashes."""

                    if not hasattr(self, "_path"):
                        self._path = self.__entry.path.replace("\\", "/")
                    return self._path

                @property
                def key(self):
                    """Lowercase version of the file's name."""

                    if not hasattr(self, "_key"):
                        self._key = self.name.lower()
                    return self._key

                @property
                def sortkey(self):
                    "Major sort by status, subsort by filename"

                    if not hasattr(self, "_sortkey"):
                        self._sortkey = self.STATUS_SORT[self.status], self.key
                    return self._sortkey

                @property
                def status(self):
                    """Started, Unreviewed, or Completed."""

                    if not hasattr(self, "_status"):
                        if not self.db_info:
                            self._status = self.UNREVIEWED
                        elif self.db_info.complete:
                            self._status = self.COMPLETED
                        else:
                            self._status = self.STARTED
                    return self._status

                @property
                def row(self):
                    """Table row for the list of zipfiles.

                    The file name is a link to the review form unless
                    the review of the set has been completed.
                    """

                    if not hasattr(self, "_row"):
                        B = self.control.HTMLPage.B
                        filename = self.__entry.name
                        if self.status != self.COMPLETED:
                            script = self.control.script

                            # Sets with no database row yet are found
                            # by name, the others by row ID.
                            if self.status == self.UNREVIEWED:
                                params = dict(name=self.name)
                            else:
                                params = dict(id=self.db_info.id)
                            url = self.control.make_url(script, **params)
                            filename = B.A(filename, href=url)
                        filename = B.TD(filename)
                        status = B.TD(self.status, B.CLASS("center"))
                        modified = str(self.datetime)[:19]
                        modified = B.TD(modified, B.CLASS("center"))
                        self._row = B.TR(filename, status, modified)
                    return self._row

            files = []
            for entry in scandir(self.ZIPDIR):
                key = entry.name.lower()
                if key.startswith("week") and key.endswith(".zip"):
                    if self.NAMEPAT.match(entry.name):
                        files.append(DiskFile(self, entry))
                    else:
                        message = f"Found file {entry.name!r}."
                        self.logger.warning(message)
                        self.bail(message, extra=self.FIXNAME_INSTRUCTIONS)
            self._zipfiles_on_disk = sorted(files)
        return self._zipfiles_on_disk
class Control(Controller):
    """Processing logic.

    Retrieves audio zip archives from the SFTP server, after verifying
    that the MP3 paths in each archive match those in its spreadsheet,
    and then copies or moves the originals to a backup directory on
    the SFTP server.
    """

    TIER = Tier()
    SUBTITLE = "Retrieve Audio Files From CIPSFTP Server"
    LOGNAME = "FtpAudio"
    USER = "******"
    WEEK = r"^Week_\d{4}_\d\d(_Rev\d)?"
    FILE = r"\d+_e[ns]\d*"
    SSH_KEY = r"\etc\cdroperator_rsa"
    CDRSTAGING = "/sftp/sftphome/cdrstaging"
    AUDIO_DIR = f"{CDRSTAGING}/ciat/{TIER.name.lower()}/Audio"
    SOURCE_DIR = f"{AUDIO_DIR}/Term_Audio"
    TARGET_DIR = f"{TIER.basedir}/Audio_from_CIPSFTP"
    TRANSFERRED_DIR = f"{AUDIO_DIR}/Audio_Transferred"
    INSTRUCTIONS = (
        "Files which match the pattern Week_YYYY_WW.zip or "
        "Week_YYYY_WW_RevN.zip will be retrieved from the source "
        "directory on the NCI SFTP server and placed in the destination "
        "directory on the Windows CDR server. Then they will be copied "
        "(if running in test mode) or moved to a backup location on the SFTP server "
        "(referred to below as the Transferred directory). By default, "
        "retrieval of a zip file will be skipped if the file already exists "
        "on the Windows CDR server (though this can be overridden). "
        "In test mode, the retrievals will be reported but not "
        "performed. ")
    BUFSIZE = 2**15

    def populate_form(self, page):
        """Add fields to the form.

        Pass:
            page - HTMLPage object to be populated
        """

        fieldset = page.fieldset("Instructions")
        fieldset.append(page.B.P(self.INSTRUCTIONS))
        page.form.append(fieldset)
        fieldset = page.fieldset("Directories")
        fieldset.set("id", "paths")
        fieldset.append(page.text_field("source", value=self.SOURCE_DIR))
        fieldset.append(page.text_field("destination", value=self.TARGET_DIR))
        opts = dict(value=self.TRANSFERRED_DIR)
        fieldset.append(page.text_field("transferred", **opts))
        page.form.append(fieldset)
        fieldset = page.fieldset("Options")
        label = "Keep documents in 'Source' directory"
        opts = dict(value="keep", label=label)
        fieldset.append(page.checkbox("options", **opts))
        opts = dict(value="test", label="Run in test mode")
        fieldset.append(page.checkbox("options", **opts))
        label = "Overwrite files in 'Destination' directory if they already exist"
        opts = dict(value="overwrite", label=label)
        fieldset.append(page.checkbox("options", **opts))
        page.form.append(fieldset)
        page.add_css("fieldset {width:600px} #paths input {width:400px}")

    def build_tables(self):
        """Perform the retrievals and report the processing outcome.

        No files are retrieved if any of the archives fails the MP3
        path checks.

        Return:
            single-column table with one row for each line of the report
        """

        if not self.session.can_do("AUDIO DOWNLOAD"):
            self.bail("Not authorized")
        self.logger.info("Running in %s mode", self.mode)
        lines = [
            f"Processing mode: {self.mode}",
            f"Source directory: {self.source_dir}",
            f"Destination directory: {self.destination_dir}",
            f"Transferred directory: {self.transferred_dir}",
        ]
        if not self.zipfiles:
            lines.append("No zip files found to be transferred")
        else:
            errors = []
            for name in self.zipfiles:
                errors += self.check_mp3_paths(name)
            if errors:
                lines += errors
                lines.append("Retrieval aborted by failed MP3 path checks")
            else:
                for name in self.zipfiles:
                    lines += self.retrieve(name)
        for name in self.rejected:
            lines.append(f"Skipped {name}")
        rows = [[line] for line in lines]
        caption = "Processing Results"
        return self.Reporter.Table(rows, caption=caption)

    def retrieve(self, name):
        """Transfer zipfile if appropriate and possible.

        Pass:
            name - string for the name of the zipfile to transfer

        Return:
            array of strings for the processing results table
        """

        source = f"{self.source_dir}/{name}"
        target = f"{self.destination_dir}/{name}"

        # Nothing is actually fetched in test mode.
        retrieve = not self.test
        if name.lower() in self.already_transferred:
            if self.overwrite:
                line = f"Retrieved {name}, overwriting file at destination"
            else:
                line = f"Skipping {name}, which already exists at destination"
                retrieve = False
        else:
            line = f"Retrieved {name}"
        failed = False
        if retrieve:
            try:
                with self.connection.open_sftp() as sftp:
                    sftp.get(source, target)
            except Exception as e:
                self.logger.exception("Retrieving %s", source)
                line = f"Failed retrieval of {name}: {e}"
                failed = True
            else:
                # Only adjust the permissions of a file which actually
                # arrived; doing so after a failed download could bail
                # out on the missing file and hide the real failure.
                process = run_command(f"fix-permissions {target}")
                if process.stderr:
                    self.bail(f"Unable to fix permissions for {target}",
                              extra=[process.stderr])
        lines = [line]

        # Copy or move the source files to a backup location on the FTP server
        # There are several different scenarios:
        # a) The specific file to be copied already exists
        #    If file already exists in transfer directory first move the
        #    existing file to a backup location (adding time stamp to file
        #    name)
        # b) Running in Test or Live mode
        #    In test mode files are always copied
        #    In live mode files are moved unless option to keep source is
        #    specified
        # c) Setting option to keep files in source directory
        #    In test mode files are always kept in source directory
        #    In live mode files are moved unless option to keep source is
        #    specified
        #
        # File exists   Test/Live   Keep Y/N   Action
        # ---------------------------------------------
        #     N           Test         N       copy
        #     N           Test         Y       copy
        #     N           Live         N       move
        #     N           Live         Y       copy
        #
        #     Y           Test         N       move backup, then copy
        #     Y           Test         Y       move backup, then copy
        #     Y           Live         N       move backup, then move
        #     Y           Live         Y       move backup, then copy
        # ---------------------------------------------------------------------
        # NOTE(review): the directory and file names are interpolated
        # into shell commands without quoting; confirm that the values
        # submitted from the form can be trusted.
        if not failed:
            target = f"{self.transferred_dir}/{name}"
            program = "cp"

            # Check if target file already exists. Move to backup location
            ls_cmd = f"ls {target}"
            stdin, stdout, stderr = self.connection.exec_command(ls_cmd)
            ls_error = stderr.readlines()
            mode_flag = "T" if self.test else "L"

            # File already exists if ls command succeeds
            if not ls_error:
                self.logger.info(
                    f"Found existing file {target.split('/')[-1]}")
                backup = f"{target}-{mode_flag}-{self.stamp}"
                self.logger.info(f"Create backup file {backup.split('/')[-1]}")
                cmd = f"mv {target} {backup}"
                stdin, stdout, stderr = self.connection.exec_command(cmd)
                errors = stderr.readlines()
                if errors:
                    lines.append(f"Errors moving existing file {target}")
                    self.logger.info(errors)

            if not self.test and not self.keep:
                program = "mv"

            cmd = f"{program} {source} {target}"
            stdin, stdout, stderr = self.connection.exec_command(cmd)
            errors = stderr.readlines()
            if errors:
                if self.test:
                    lines.append(f"Errors copying {name} to {target}")
                else:
                    lines.append(f"Errors moving {name} to {target}")
                lines += errors
            elif self.test:
                lines.append(f"Copied {name} to Transferred directory")
                self.logger.info(f"Copied {name} to {target}")
            else:
                action = "Copied" if self.keep else "Moved"
                lines.append(f"{action} {name} to Transferred directory")
                self.logger.info(f"{action} {name} to {target}")
        return lines

    def check_mp3_paths(self, filename):
        """Make sure the spreadsheet and zip file MP3 paths match.

        Also ensures that the paths follow the pattern convention
        established for the audio files.

        Pass:
            filename - string for the name of the zipfile to inspect

        Return:
            Possibly empty sequence of error strings
        """

        with self.connection.open_sftp() as sftp:
            zip_path = f"{self.source_dir}/{filename}"
            with sftp.open(zip_path, bufsize=self.BUFSIZE) as fp:
                zipfile = ZipFile(BytesIO(fp.read()))
        self.logger.info("Verifying MP3 paths in %s", zip_path)
        mp3_paths = set()
        col_paths = set()
        errors = []
        for name in zipfile.namelist():
            normalized = name.lower()
            if "macosx" not in normalized:
                if normalized.endswith(".mp3"):
                    mp3_paths.add(name)
                elif normalized.endswith(".xlsx"):
                    opts = dict(read_only=True, data_only=True)
                    book = load_workbook(BytesIO(zipfile.read(name)), **opts)
                    sheet = book.active

                    # The first row has the column headers; the MP3
                    # path is taken from the fifth column of the rest.
                    headers = True
                    for row in sheet:
                        if headers:
                            headers = False
                        else:
                            try:
                                value = row[4].value
                                if not isinstance(value, str):
                                    errors.append("Missing MP3 path")
                                else:
                                    col_paths.add(value)
                            except Exception:
                                errors.append("Missing MP3 path")
        all_paths = mp3_paths | col_paths
        for path in all_paths:
            if not self.member_pattern.match(path):
                errors.append(f"{filename} has invalid MP3 path format {path}")
        missing = col_paths - mp3_paths
        for path in missing:
            errors.append(f"{filename} does not contain {path}")
        unused = mp3_paths - col_paths
        for path in unused:
            errors.append(f"{filename} has unused MP3 file {path}")
        return errors

    @property
    def connection(self):
        """Connection to the SFTP server."""

        if not hasattr(self, "_connection"):
            self._connection = paramiko.SSHClient()
            policy = paramiko.AutoAddPolicy()
            self._connection.set_missing_host_key_policy(policy)
            pkey = paramiko.RSAKey.from_private_key_file(self.SSH_KEY)
            opts = dict(hostname=self.server, username=self.USER, pkey=pkey)
            self.logger.info("Connecting to %s ...", self.server)
            self._connection.connect(**opts)
            self.logger.info("Connected")
        return self._connection

    @property
    def destination_dir(self):
        """Directory to which we copy the audio zip archives.

        The directory is created if it does not already exist.
        """

        if not hasattr(self, "_destination_dir"):
            directory = self.fields.getvalue("destination")
            if not os.path.exists(directory):
                try:
                    os.mkdir(directory)
                except Exception as e:
                    self.logger.exception("Creating %s", directory)
                    self.bail(e)
            self.logger.info("Destination directory: %s", directory)
            self._destination_dir = directory
        return self._destination_dir

    @property
    def keep(self):
        """If True, don't move files to transferred directory."""
        return "keep" in self.options

    @property
    def mode(self):
        """One of 'test' or 'live' values."""
        return "test" if self.test else "live"

    @property
    def names(self):
        """All the file names found in the source directory."""

        if not hasattr(self, "_names"):

            # List the directory chosen on the form (not the default
            # in the class constant), so that the names found agree
            # with the paths used to check and retrieve the files.
            command = f"ls {self.source_dir}/*"
            self.logger.info("Running %s", command)
            stdin, stdout, stderr = self.connection.exec_command(command)
            self._names = []
            for name in stdout.readlines():
                self._names.append(name.split("/")[-1].strip())
        return self._names

    @property
    def already_transferred(self):
        """Zipfiles which already exist in the destination directory.

        The names in the set are lowercase. Note that the current
        working directory is changed to the destination directory.
        """

        if not hasattr(self, "_already_transferred"):
            os.chdir(self.destination_dir)
            names = glob("*.zip")
            self.logger.info("Destination dir has %s", names)
            self._already_transferred = {name.lower() for name in names}
        return self._already_transferred

    @property
    def options(self):
        """Overrides of runtime defaults."""

        if not hasattr(self, "_options"):
            self._options = self.fields.getlist("options")
        return self._options

    @property
    def overwrite(self):
        """Boolean indicating whether it is OK to overwrite destination files."""
        return "overwrite" in self.options

    @property
    def pattern(self):
        """Files we want will match this regular expression."""

        if not hasattr(self, "_pattern"):

            # WEEK is already anchored at the start of the string. The
            # dot is escaped so that only a real ".zip" suffix matches.
            self._pattern = re.compile(rf"{self.WEEK}\.zip$")
        return self._pattern

    @property
    def member_pattern(self):
        """Members of zip files must match this regular expression."""

        if not hasattr(self, "_member_pattern"):

            # As with `pattern`, the dot before the suffix is escaped.
            expression = rf"{self.WEEK}/{self.FILE}\.mp3$"
            self._member_pattern = re.compile(expression)
        return self._member_pattern

    @property
    def rejected(self):
        """File names which don't match our naming convention.

        We don't have to do anything but reference the `zipfiles`
        property, which takes care of populating both its own
        property and this one.
        """

        if self.zipfiles and not hasattr(self, "_rejected"):
            self.bail("Internal error")
        return self._rejected

    @property
    def server(self):
        """Local name of the SFTP server."""

        if not hasattr(self, "_server"):
            self._server = self.session.tier.hosts["SFTP"].split(".")[0]
        return self._server

    @property
    def source_dir(self):
        """Directory from which we copy the audio zip archives."""

        if not hasattr(self, "_source_dir"):
            self._source_dir = self.fields.getvalue("source")
            self.logger.info("Source directory: %s", self._source_dir)
        return self._source_dir

    @property
    def stamp(self):
        """String used to name the backups of existing transferred files."""

        if not hasattr(self, "_stamp"):
            self._stamp = self.started.strftime("%Y%m%d%H%M%S")
        return self._stamp

    @property
    def test(self):
        """Are we testing the waters instead of running in live mode?"""
        return "test" in self.options

    @property
    def transferred_dir(self):
        """Directory where source files are moved after being transferred."""

        if not hasattr(self, "_transferred_dir"):
            directory = self.fields.getvalue("transferred")
            self.logger.info("Transferred directory: %s", directory)
            self._transferred_dir = directory
        return self._transferred_dir

    @property
    def zipfiles(self):
        """Names of files to be transferred.

        The names which don't match the naming convention are saved
        as a side effect for the `rejected` property.
        """

        if not hasattr(self, "_zipfiles"):
            zipfiles = []
            rejected = []
            for name in self.names:
                if self.pattern.match(name):
                    zipfiles.append(name)
                else:
                    rejected.append(name)
            self._zipfiles = zipfiles
            if not hasattr(self, "_rejected"):
                self._rejected = rejected
            if not zipfiles:
                self.logger.warning("No audio archive files found to transfer")
            else:
                self.logger.info("%d audio archive files found to transfer",
                                 len(zipfiles))
                for name in zipfiles:
                    self.logger.info(name)
            if rejected:
                self.logger.warning("Ignored files: %r", rejected)
        return self._zipfiles
class Terms:
    """
    Glossary term name information used by the glossifier.

    The object collects the published glossary term names from the CDR
    database, and can store them in the `glossifier` table (`save()`),
    push them to the registered Drupal CMS servers (`send()`), and email
    a report of ambiguous name mappings (`report_duplicates()`).
    """

    SERVER = socket.gethostname().split(".")[0]
    SENDER = "cdr@{}.nci.nih.gov".format(SERVER.lower())
    SUBJECT = "DUPLICATE GLOSSARY TERM NAME MAPPINGS ON " + SERVER.upper()
    UNREPORTED = set()  # see OCECDR-4795 set(["tpa", "cab", "ctx", "receptor"])
    GROUP = "glossary-servers"

    def __init__(self, logger=None, recip=None):
        """
        Collect the glossary term information.

        Pass:
          logger - the scheduled job's logger (unless testing from the
                   command line)
          recip - optional email address for testing without spamming the
                  users
        """

        self.tier = Tier()
        self.logger = logger
        self.recip = recip
        if self.logger is None:
            self.logger = cdr.Logging.get_logger("glossifier", level="debug")
        self.conn = db.connect()
        self.cursor = self.conn.cursor()

    def save(self):
        """
        Store the serialized name information in the database.

        The `repr()` of the `names` dictionary is written to the single
        row of the `glossifier` table, and the change is committed.
        """

        names = repr(self.names)
        self.logger.info("saving glossifier names (%d bytes)", len(names))
        self.cursor.execute("""\
UPDATE glossifier
   SET refreshed = GETDATE(),
       terms = ?
 WHERE pk = 1""", names)
        self.conn.commit()

    def send(self):
        """
        Send the glossary information to registered Drupal CMS servers

        Each server in `servers` gets a POST of `data`. Failures are
        logged and collected. If there are any, a notice listing them
        is emailed to `recip` (when set) or to the members of the
        "Developers Notification" group.

        Raises:
          Exception if there are failures to report but no recipients
        """

        failures = []
        success = "Sent glossary to server %r at %s"
        failure = "Failure sending glossary to server %r at %s: %s"
        for alias, base in self.servers.items():
            url = "{}/pdq/api/glossifier/refresh".format(base)
            try:
                response = requests.post(url, json=self.data, auth=self.auth)
                if response.ok:
                    self.logger.info(success, alias, base)
                else:
                    args = alias, base, response.reason
                    self.logger.error(failure, *args)
                    failures.append(args)
            except Exception as e:
                args = alias, base, e
                self.logger.exception(failure, *args)
                failures.append(args)
        if failures:
            group = "Developers Notification"
            if self.recip:
                recips = [self.recip]
            else:
                recips = Job.get_group_email_addresses(group)
            if not recips:
                raise Exception("no recips found for glossary failure message")
            tier = self.tier.name
            subject = "[{}] Failure sending glossary information".format(tier)
            lines = []
            for args in failures:
                lines.append("Server {!r} at {}: {}".format(*args))
            body = "\n".join(lines)
            opts = dict(subject=subject, body=body)
            message = cdr.EmailMessage(self.SENDER, recips, **opts)

            # The notice must actually be sent before we log that it was.
            message.send()
            self.logger.error("send failure notice sent to %r", recips)

    @property
    def auth(self):
        """
        Basic authorization credentials pair for Drupal CMS servers

        Raises:
          Exception if the tier has no password for the PDQ account
        """

        if not hasattr(self, "_auth"):
            password = self.tier.password("PDQ")
            if not password:
                raise Exception("Unable to find PDQ CMS credentials")
            self._auth = "PDQ", password
        return self._auth

    @property
    def concepts(self):
        """
        Dictionary information for the term concepts.

        The property is a dictionary of `Concept` objects, indexed by
        concept document ID, built from the rows of the `query_term_pub`
        table for the English and Spanish definition dictionaries.
        """

        if not hasattr(self, "_concepts"):
            class Concept:
                """
                CDR GlossaryTermConcept document.

                Attributes:
                  - id: integer for the document's CDR ID
                  - dictionaries: English and Spanish dictionaries
                                  for which we have definitions
                """

                def __init__(self, doc_id):
                    self.id = doc_id
                    self.dictionaries = dict(en=set(), es=set())

            self._concepts = {}
            tags = dict(en="TermDefinition", es="TranslatedTermDefinition")
            for lang, tag in tags.items():
                path = "/GlossaryTermConcept/{}/Dictionary".format(tag)
                query = db.Query("query_term_pub", "doc_id", "value")
                query.where(query.Condition("path", path))
                rows = query.execute(self.cursor).fetchall()
                self.logger.debug("fetched %d %s dictionaries",
                                  len(rows), lang)
                for doc_id, dictionary in rows:
                    concept = self._concepts.get(doc_id)
                    if not concept:
                        concept = self._concepts[doc_id] = Concept(doc_id)
                    concept.dictionaries[lang].add(dictionary.strip())
        return self._concepts

    @property
    def data(self):
        """
        JSON-serializable glossary data for the Drupal CMS servers

        JSON can't deal with sets, so we transform the sets of
        dictionaries into plain lists.
        """

        if not hasattr(self, "_data"):
            names = dict()
            for name, docs in self.names.items():
                names[name] = dict()
                for doc_id, languages in docs.items():
                    names[name][doc_id] = dict()
                    for language, dictionaries in languages.items():
                        names[name][doc_id][language] = list(dictionaries)
            self._data = names
        return self._data

    @property
    def extra_names(self):
        """
        Fetch variant names from the external_map table.

        The property is a dictionary, indexed by the keys of `Term.USAGES`,
        of dictionaries mapping a document ID to the list of variant names
        mapped to that document.
        """

        if not hasattr(self, "_extra_names"):
            self._extra_names = {}
            for langcode, usage in Term.USAGES.items():
                query = db.Query("external_map m", "m.value", "m.doc_id")
                query.join("external_map_usage u", "u.id = m.usage")
                query.where(query.Condition("u.name", usage))
                rows = query.execute(self.cursor).fetchall()
                args = len(rows), langcode
                self.logger.debug("fetched %d extra %s names", *args)
                names = {}
                for name, doc_id in rows:
                    names.setdefault(doc_id, []).append(name)
                self._extra_names[langcode] = names
        return self._extra_names

    @property
    def names(self):
        """
        Dictionary of name information used by the glossifier.

        Only unique usage information is included in the returned
        dictionary. Duplicate usage is stored in the `dups` attribute
        as a side effect of this method, so that they can be reported
        via email notification. There are a handful of unreported
        duplicates which CIAT has decided not to eliminate.

        Return:
          nested dictionary indexed by normalized name strings:
            names[normalized-name][doc_id][language] => set of dictionaries
        """

        if not hasattr(self, "_names"):
            self.dups = dict()
            names = dict()
            for key in self.usages:
                name, language, dictionary = key
                ids = list(self.usages[key])
                if len(ids) > 1:
                    if name not in self.UNREPORTED:
                        self.dups[key] = ids
                else:
                    doc_id = ids[0]
                    languages = names.setdefault(name, {})
                    languages = languages.setdefault(doc_id, {})
                    dictionaries = languages.setdefault(language, set())
                    if dictionary is not None:
                        dictionaries.add(dictionary)
            self._names = names
        return self._names

    @property
    def servers(self):
        """
        Servers who receive scheduled updated glossary data

        This property is a dictionary of each server's base URL, indexed
        by a unique alias. The servers are stored in the CDR control
        table. Each server gets a row in the table, with `GROUP` as the
        value of the `grp` column, and a unique alias for the server
        stored in the `name` column. The URL for the server is stored
        in the `val` column. If no servers are found in the table, then
        fetch the DRUPAL CMS with which this tier is associated, and use
        the alias "Primary" for the server.
        """

        if not hasattr(self, "_servers"):
            self._servers = cdr.getControlGroup(self.GROUP)
            if not self._servers:
                server = self.tier.hosts.get("DRUPAL")
                self._servers = dict(Primary="https://{}".format(server))
        return self._servers

    @property
    def usages(self):
        """
        Published glossary term name documents.

        Property value is a dictionary indexed by a tuple containing:
          - normalized term name string
          - language ("en" or "es")
          - dictionary (e.g., "Cancer.gov"; None if no dictionaries
            assigned for this language)

        The values of the dictionaries are sequence of glossary term
        name documents which are found for the tuple's values. In order
        to be usable by the glossifier, each value must be unique (that
        is, the sequence must have exactly one term name doc ID).
        """

        if not hasattr(self, "_usages"):

            # Start with an empty usages dictionary.
            self._usages = {}

            # Get the dictionary of Concept object with dictionary information.
            concepts = self.concepts
            self.logger.debug("fetched %d concepts", len(concepts))

            # Fetch all of the published CDR glossary term documents.
            columns = "v.id", "v.xml", "q.int_val"
            joins = (
                ("pub_proc_doc d", "d.doc_id = v.id",
                 "d.doc_version = v.num"),
                ("pub_proc_cg c", "c.id = v.id", "c.pub_proc = d.pub_proc"),
                ("query_term_pub q", "q.doc_id = v.id"),
            )
            path = "/GlossaryTermName/GlossaryTermConcept/@cdr:ref"
            query = db.Query("doc_version v", *columns)
            for args in joins:
                query.join(*args)
            query.where(query.Condition("q.path", path))
            rows = query.execute(self.cursor).fetchall()
            self.logger.debug("processing %d glossary terms", len(rows))

            # Use the term information to populate the usages dictionary.
            for term_id, doc_xml, concept_id in rows:
                term = Term(self, term_id, doc_xml, concepts.get(concept_id))
                term.record_usages(self._usages)

        return self._usages

    def report_duplicates(self):
        """
        Send a report on duplicate name+language+dictionary mappings.

        The report goes to `recip` (when set) or to the members of the
        "GlossaryDupGroup" group. Nothing is sent if there are no
        duplicates.

        Raises:
          Exception if there are duplicates to report but no recipients
        """

        # The `dups` attribute only exists once `names` has been built,
        # so make sure that has happened before we look at it.
        _ = self.names
        if not self.dups:
            self.logger.error("no duplicates to report")
            return
        if self.recip:
            recips = [self.recip]
        else:
            recips = Job.get_group_email_addresses("GlossaryDupGroup")
        if not recips:
            raise Exception("no recipients found for glossary dup message")
        body = ["The following {:d} sets of ".format(len(self.dups)),
                "duplicate glossary mappings were found in the CDR ",
                "on {}. ".format(self.SERVER.upper()),
                "Mappings for any phrase + language + dictionary must ",
                "be unique. ",
                "Please correct the data so that this requirement is met. ",
                "You may need to look at the External Map Table for ",
                "Glossary Terms to find some of the mappings.\n"]
        template = "\n{} (language={!r} dictionary={!r})\n"
        for key in sorted(self.dups):
            name, language, dictionary = key
            args = name.upper(), language, dictionary
            body.append(template.format(*args))
            for doc_id in self.dups[key]:
                body.append("\tCDR{:010d}\n".format(doc_id))
        body = "".join(body)
        opts = dict(subject=self.SUBJECT, body=body)
        message = cdr.EmailMessage(self.SENDER, recips, **opts)
        message.send()
        self.logger.info("duplicate mapping notification sent to %r", recips)
FS_LOGGER = None # supplied later by the FileSweeper object. # Don't go wild creating output files MAX_OUTPUT_FILES_WITH_ONE_NAME = 5 # Size for read/write BLOCK_SIZE = 4096 # Date constants, YEARS_OLD is max time we'll look back, sanity check DAY_SECS = 86400 YEAR_DAYS = 365.25 YEARS_OLD = 10 LONG_TIME = DAY_SECS * YEAR_DAYS * YEARS_OLD # Where are we running? TIER = Tier().name class FileSweeper(Job): """ Adapter to allow the overall file clean up task to be driven from the CDR scheduler. Required jobParam fields: ConfigFile Full or relative path to configuration file. Optional jobParam fields: TestMode Boolean value. Create output files but delete nothing. (default False) Email Alternate email list for fatal error msgs. If more than one address, use '+' as separator, no spaces.
def tier(self):
    """Tell the caller which CDR server we are using.

    The `Tier` object is created on first use from the "tier" entry
    of `self.opts` and cached, so later calls return the same object.
    """

    # Reuse the object built by an earlier call, if there was one.
    if hasattr(self, "_tier"):
        return self._tier

    requested = self.opts.get("tier")
    self._tier = Tier(requested)
    return self._tier