def CreateFulltextIdsFile(ids_output_file, log_file_name):
    elasticsearch_access_conf = "/usr/local/var/lib/tuelib/Elasticsearch.conf"
    if os.access(elasticsearch_access_conf, os.F_OK):
        util.ExecOrDie("/usr/local/bin/extract_existing_fulltext_ids.sh", [ids_output_file], log_file_name)
    else:
        # Skip the extraction if the configuration is not present
        util.ExecOrDie(util.Which("truncate"), ["-s", "0", log_file_name])
        util.ExecOrDie(util.Which("echo"),
                       ["Skip extraction since " + elasticsearch_access_conf + " not present"],
                       log_file_name)
def DumpTranslationsDB(database, user, password, outfile_name):
    ClearOutFile(outfile_name)
    util.ExecOrDie("/usr/bin/mysqldump",
                   ["--single-transaction", "--database", re.sub('^"|"$', '', database),
                    "--user=" + re.sub('^"|"$', '', user),
                    "--password=" + re.sub('^"|"$', '', password)],
                   outfile_name)
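# Minimal sketch of the ClearOutFile() helper called above -- not the actual implementation.
# Assumption: it merely truncates the dump file so that mysqldump starts writing into an empty file.
def ClearOutFile(outfile_name):
    with open(outfile_name, "w"):
        pass  # Opening with mode "w" truncates the file to zero length.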
def Main():
    if len(sys.argv) != 2:
        util.SendEmail(os.path.basename(sys.argv[0]),
                       "This script needs to be called with an email address as the only argument!\n", priority=1)
        sys.exit(-1)
    util.default_email_recipient = sys.argv[1]

    try:
        config = util.LoadConfigFile()
        ftp_host = config.get("FTP", "host")
        ftp_user = config.get("FTP", "username")
        ftp_passwd = config.get("FTP", "password")
    except Exception as e:
        util.Error("failed to read config file! (" + str(e) + ")")

    # Download data from Crossref:
    log_file_name = CreateLogFileName()
    crossref_xml_file = "/tmp/crossref.xml"
    util.Remove(crossref_xml_file)
    util.ExecOrDie("/usr/local/bin/crossref_downloader", [crossref_xml_file], log_file_name)

    # Upload the XML data to the BSZ FTP server:
    ftp = util.FTPLogin(ftp_host, ftp_user, ftp_passwd)
    try:
        with open(crossref_xml_file, "rb") as xml_file:
            ftp.storbinary("STOR crossref.xml", xml_file)
    except Exception as e:
        util.Error("failed to upload the XML data to the BSZ FTP server! (" + str(e) + ")")
    os.unlink(crossref_xml_file)

    util.SendEmail("Crossref Data Import",
                   "Successfully imported Crossref data and uploaded it to the BSZ FTP server.", priority=5)
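# Hypothetical sketch of the CreateLogFileName() helper called in Main() above; the real
# implementation is not shown here.  Assumptions: the log directory and the naming scheme
# (script name plus a timestamp) are both guesses.
def CreateLogFileName(log_directory="/usr/local/var/log/tuefind"):
    script_name = os.path.splitext(os.path.basename(sys.argv[0]))[0]
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    return log_directory + "/" + script_name + "." + timestamp + ".log"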
def CleanUp(title_data_file, log_file_name):
    # Terminate the temporary Solr instance
    util.ExecOrDie("/usr/local/bin/shutdown_refterm_solr.sh", [], log_file_name)
    # Clean up the temporary title data
    if title_data_file is not None:
        util.Remove(title_data_file)
def DownloadCrossrefData(output_marc_filename):
    util.ExecOrDie("/usr/local/bin/crossref_downloader",
                   ["/usr/local/var/lib/tuelib/crossref_downloader/crossref_journal_list", output_marc_filename],
                   "/proc/self/fd/1")
    # Read the size reported by the marc_size tool; return 0 if it produced no output.
    process = subprocess.Popen(["marc_size", output_marc_filename], stdout=subprocess.PIPE)
    size = process.stdout.readline()
    return int(size) if len(size) > 0 else 0
def StartPipeline(pipeline_script_name, marc_title, conf):
    log_file_name = util.MakeLogFileName(pipeline_script_name, util.GetLogDirectory())
    util.ExecOrDie(pipeline_script_name, [marc_title], log_file_name)

    log_file_name = util.MakeLogFileName("import_into_vufind", util.GetLogDirectory())
    ImportIntoVuFind(conf.get("FileNames", "title_marc_data"),
                     conf.get("FileNames", "authority_marc_data"),
                     log_file_name)
def DumpMongoDB(config, log_file_name="/dev/stderr"):
    # Dump to an intermediate hidden directory that is excluded from backup
    # to prevent saving an inconsistent state
    dump_base_path = config.get("LocalConfig", "dump_base_path")
    dump_root = config.get("LocalConfig", "dump_root")
    intermediate_dump_dir = dump_base_path + '/.' + dump_root
    util.ExecOrDie(util.Which("mongodump"), ["--out=" + intermediate_dump_dir, "--gzip"], log_file_name)
    final_dump_dir = dump_base_path + '/' + dump_root
    if os.path.exists(final_dump_dir) and os.path.isdir(final_dump_dir):
        rmtree(final_dump_dir)
    move(intermediate_dump_dir, final_dump_dir)
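# Note: DumpMongoDB() above calls rmtree() and move() by their bare names; in the original script
# these are presumably provided by a module-level import along the lines of:
#     from shutil import move, rmtree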
def RunPipelineAndImportIntoSolr(pipeline_script_name, marc_title, conf):
    log_file_name = util.MakeLogFileName(pipeline_script_name, util.GetLogDirectory())
    util.ExecOrDie(pipeline_script_name, [marc_title], log_file_name)

    log_file_name = util.MakeLogFileName("import_into_vufind", util.GetLogDirectory())
    ImportIntoVuFind(conf.get("FileNames", "title_marc_data"),
                     conf.get("FileNames", "authority_marc_data"),
                     log_file_name)

    # Write a timestamp file for the last successful Solr import:
    with open(os.open('/usr/local/vufind/public/last_solr_import', os.O_CREAT | os.O_WRONLY, 0o644),
              'w') as output:
        output.write(str(datetime.datetime.now()))
def ImportOADOIsToMongo(update_list, source_directory=None, log_file_name="/dev/stderr"):
    if source_directory is not None:
        os.chdir(source_directory)
    imported_symlinks_directory = os.getcwd() + "/imported"
    for filename in update_list:
        imported_symlink_full_path = imported_symlinks_directory + "/" + filename
        if os.path.islink(imported_symlink_full_path):
            print("Skipping " + filename + " since it has apparently already been imported")
            continue
        print("Importing \"" + filename + "\"")
        util.ExecOrDie(util.Which("import_oadois_to_mongo.sh"), [filename], log_file_name)
        CreateImportedSymlink(filename, imported_symlink_full_path)
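# Hypothetical sketch of the CreateImportedSymlink() helper used above (the real helper may
# differ).  Assumption: it marks a file as imported by placing a symlink to it in the "imported"
# directory, which is exactly what the os.path.islink() check above looks for.
def CreateImportedSymlink(filename, imported_symlink_full_path):
    os.makedirs(os.path.dirname(imported_symlink_full_path), exist_ok=True)
    os.symlink(os.path.abspath(filename), imported_symlink_full_path)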
def Main():
    if len(sys.argv) != 2:
        util.SendEmail(os.path.basename(sys.argv[0]),
                       "This script needs to be called with an email address as the only argument!\n", priority=1)
        sys.exit(-1)
    util.default_email_recipient = sys.argv[1]

    try:
        config = util.LoadConfigFile()
        sftp_host = config.get("SFTP", "host")
        sftp_user = config.get("SFTP", "username")
        sftp_keyfile = config.get("SFTP", "keyfile")
        local_directory = config.get("Upload", "local_directory")
        directory_on_sftp_server = config.get("Upload", "directory_on_sftp_server")
    except Exception as e:
        util.Error("failed to read config file! (" + str(e) + ")")

    # Check the local directory for new data:
    fulltext_files = GetExistingFiles(local_directory)
    dirs_to_transfer = GetFulltextDirectoriesToTransfer(local_directory, fulltext_files)

    # Nothing to do?
    if not dirs_to_transfer:
        util.SendEmail("Transfer Fulltexts", "No directories to transfer", priority=5)
        return

    # Transfer the data:
    util.ExecOrDie("/usr/local/bin/transfer_fulltext.sh",
                   [sftp_host, sftp_user, sftp_keyfile, local_directory, directory_on_sftp_server]
                   + list(dirs_to_transfer))

    # Clean up on the server:
    CleanUpFiles(fulltext_files)

    email_msg_body = ("Found Files:\n\n" + '\n'.join(fulltext_files)
                      + "\n\nTransferred directories:\n\n" + '\n'.join(dirs_to_transfer))
    util.SendEmail("Transfer Fulltexts", email_msg_body, priority=5)
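# Hypothetical sketch of the GetExistingFiles() helper used in Main() above; the actual
# implementation is not shown here.  Assumption: it simply collects the paths of all regular
# files below the local fulltext directory.
def GetExistingFiles(local_directory):
    existing_files = []
    for root, _, files in os.walk(local_directory):
        for name in files:
            existing_files.append(os.path.join(root, name))
    return existing_files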
def Main():
    if len(sys.argv) != 3:
        util.SendEmail(os.path.basename(sys.argv[0]),
                       "This script needs to be called with an email address and the system type!\n", priority=1)
        sys.exit(-1)
    util.default_email_recipient = sys.argv[1]
    system_type = sys.argv[2]
    if system_type not in ("krimdok", "relbib", "ixtheo"):
        util.SendEmail(os.path.basename(sys.argv[0]),
                       "The system_type must be one of {krimdok,relbib,ixtheo}!\n", priority=1)
        sys.exit(-1)
    output_file = "/tmp/collect_solr_stats_data.csv"
    util.Remove(output_file)
    util.ExecOrDie("/usr/local/bin/collect_solr_stats_data", [system_type, output_file],
                   "/usr/local/var/log/tuefind/collect_solr_stats_data.log")
    util.SendEmail("Solr Stats Collector",
                   "Successfully generated Solr statistics and updated Ingo's MySQL database.", priority=5)
def ImportIntoVuFind(title_pattern, authority_pattern, log_file_name):
    vufind_dir = os.getenv("VUFIND_HOME")
    if vufind_dir is None:
        util.Error("VUFIND_HOME not set, cannot start the Solr import!")

    # Import the title data:
    title_index = 'biblio'
    title_matches = sorted(glob.glob(title_pattern), reverse=True)
    if len(title_matches) != 1:
        util.Error("\"" + title_pattern + "\" matched " + str(len(title_matches))
                   + " files! (Should have matched exactly 1 file!)")
    ClearSolrIndex(title_index)
    util.ExecOrDie(vufind_dir + "/import-marc.sh", [title_matches[0]], log_file_name)
    OptimizeSolrIndex(title_index)

    # Import the authority data:
    authority_index = 'authority'
    authority_matches = sorted(glob.glob(authority_pattern), reverse=True)
    if len(authority_matches) != 1:
        util.Error("\"" + authority_pattern + "\" matched " + str(len(authority_matches))
                   + " files! (Should have matched exactly 1 file!)")
    ClearSolrIndex(authority_index)
    util.ExecOrDie(vufind_dir + "/import-marc-auth.sh", [authority_matches[0]], log_file_name)
    OptimizeSolrIndex(authority_index)

    util.ExecOrDie(util.Which("sudo"), ["-u", "solr", "-E", vufind_dir + "/index-alphabetic-browse.sh"],
                   log_file_name)

    # Summarize and rotate the logs:
    util.ExecOrDie("/usr/local/bin/summarize_logs", [vufind_dir + "/import/solrmarc.log", solrmarc_log_summary])
    util.ExecOrDie("/usr/local/bin/log_rotate", [vufind_dir + "/import/", "solrmarc\\.log"])
    util.ExecOrDie("/usr/local/bin/summarize_logs", [log_file_name, import_log_summary])
    util.ExecOrDie("/usr/local/bin/log_rotate",
                   [os.path.dirname(log_file_name), os.path.basename(log_file_name)])
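# Hedged sketches of the ClearSolrIndex()/OptimizeSolrIndex() helpers called above; the actual
# implementations are not shown here.  Assumptions: a local Solr instance on port 8983 and the
# standard update request handler of the named core.
import urllib.request

def ClearSolrIndex(index):
    # Delete all documents from the given core and commit the deletion.
    request = urllib.request.Request(
        "http://localhost:8983/solr/" + index + "/update?commit=true",
        data=b"<delete><query>*:*</query></delete>",
        headers={"Content-Type": "text/xml"})
    urllib.request.urlopen(request).read()

def OptimizeSolrIndex(index):
    # Ask Solr to optimize (merge) the index segments of the given core.
    urllib.request.urlopen("http://localhost:8983/solr/" + index + "/update?optimize=true").read()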
def ExtractRefDataMarcFile(gzipped_tar_archive, output_marc_file, log_file_name):
    util.ExecOrDie("/usr/local/bin/extract_refterm_archive.sh", [gzipped_tar_archive, output_marc_file],
                   log_file_name)
def CompressAndEncryptFile(infile, outfile, archive_password):
    util.ExecOrDie("/usr/bin/7za", ['a', "-p" + archive_password, outfile, infile])
def ExtractOADOIURLs(share_directory, all_dois_file, urls_file, log_file_name):
    print("Extracting URLs for the DOIs in " + all_dois_file)
    util.ExecOrDie(util.Which("extract_oadoi_urls.sh"), [share_directory + '/' + all_dois_file, urls_file],
                   log_file_name)
def Main():
    if len(sys.argv) != 4 and len(sys.argv) != 5:
        util.SendEmail(os.path.basename(sys.argv[0]),
                       "This script needs to be called with an email address, the beacon header file, an output "
                       "path and an optional ppn-filter file as arguments!\n", priority=1)
        sys.exit(-1)
    util.default_email_recipient = sys.argv[1]

    most_recent_authority_filename = GetMostRecentBSZFile(r"^Normdaten-(\d\d\d\d\d\d).mrc$")
    if most_recent_authority_filename is None:
        util.SendEmail("Beacon Generator", "Found no matching authority files!", priority=1)
    most_recent_titles_filename = GetMostRecentBSZFile(r"^GesamtTiteldaten-(\d\d\d\d\d\d).mrc$")
    if most_recent_titles_filename is None:
        util.SendEmail("Beacon Generator", "Found no matching title files!", priority=1)

    # Extract the GND numbers from the 035$a subfield of the MARC authority data for authors:
    gnd_numbers_path = "/tmp/gnd_numbers"
    util.ExecOrDie("/usr/local/bin/extract_person_gnd_numbers", [most_recent_authority_filename],
                   gnd_numbers_path)

    # Count GND references in the title data:
    gnd_counts_filename = "/tmp/gnd_counts"
    if len(sys.argv) == 4:
        util.ExecOrDie("/usr/local/bin/count_gnd_refs",
                       [gnd_numbers_path, most_recent_titles_filename, gnd_counts_filename])
    else:
        util.ExecOrDie("/usr/local/bin/count_gnd_refs",
                       ["--control-number-list=" + sys.argv[4], gnd_numbers_path, most_recent_titles_filename,
                        gnd_counts_filename])

    # Generate a file with a timestamp in the Beacon format:
    timestamp_filename = "/tmp/beacon_timestamp"
    with open(timestamp_filename, "w") as timestamp_file:
        timestamp_file.write("#TIMESTAMP: " + str(datetime.date.today()) + "\n")

    # Now generate the final output (header + counts):
    if not util.ConcatenateFiles([sys.argv[2], timestamp_filename, gnd_counts_filename], sys.argv[3]):
        util.SendEmail("Beacon Generator",
                       "An unexpected error occurred: could not write \"" + sys.argv[3] + "\"!", priority=1)

    # Cleanup of temp files:
    os.unlink(gnd_numbers_path)
    os.unlink(timestamp_filename)
    os.unlink(gnd_counts_filename)

    util.SendEmail("Beacon File Generator", "Successfully created a Beacon file.", priority=5)
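# Hedged sketch of the GetMostRecentBSZFile() helper used by the beacon-generator Main()
# functions; the actual implementation is not shown here.  Assumptions: the BSZ delivery
# directory path is a guess, and the newest file is taken to be the lexicographically last
# match (the date in the file name sorts chronologically).
def GetMostRecentBSZFile(filename_regex, directory="/usr/local/ub_tools/bsz_daten"):
    matches = [filename for filename in os.listdir(directory) if re.match(filename_regex, filename)]
    if not matches:
        return None
    return directory + "/" + sorted(matches)[-1]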
def SetupTemporarySolrInstance(title_data_file, conf, log_file_name):
    # Set up a temporary Solr instance in a ramdisk and import the title data
    util.ExecOrDie("/usr/local/bin/setup_refterm_solr.sh", [title_data_file], log_file_name)
def CreateMatchDB(title_marc_data, log_file_name):
    util.ExecOrDie("/usr/local/bin/create_match_db", [title_marc_data], log_file_name, setsid=False)
def Main():
    if len(sys.argv) != 4 and len(sys.argv) != 5 and len(sys.argv) != 6 \
       or (len(sys.argv) == 6 and not sys.argv[1].startswith("--filter-field=")):
        SendUsageAndExit()

    count_gnd_refs_args = []
    if sys.argv[1].startswith("--filter-field="):
        count_gnd_refs_args.append(sys.argv[1])
        del sys.argv[1]
        if len(sys.argv) != 4 and len(sys.argv) != 5:
            SendUsageAndExit()

    util.default_email_recipient = sys.argv[1]

    most_recent_authority_filename = GetMostRecentBSZFile(r"^Normdaten-(\d\d\d\d\d\d).mrc$")
    if most_recent_authority_filename is None:
        util.SendEmailAndExit("Beacon Generator", "Found no matching authority files!", priority=1)
    most_recent_titles_filename = GetMostRecentBSZFile(r"^GesamtTiteldaten-post-pipeline-(\d\d\d\d\d\d).mrc$")
    if most_recent_titles_filename is None:
        util.SendEmailAndExit("Beacon Generator", "Found no matching title files!", priority=1)

    # Extract the GND numbers from the 035$a subfield of the MARC authority data for authors:
    gnd_numbers_path = "/tmp/gnd_numbers"
    util.ExecOrDie("/usr/local/bin/extract_person_gnd_numbers", [most_recent_authority_filename],
                   gnd_numbers_path)

    # Count GND references in the title data:
    gnd_counts_filename = "/tmp/gnd_counts"
    if len(sys.argv) > 4:
        count_gnd_refs_args.append("--control-number-list=" + sys.argv[4])
    count_gnd_refs_args.extend([gnd_numbers_path, most_recent_titles_filename, gnd_counts_filename])
    util.ExecOrDie("/usr/local/bin/count_gnd_refs", count_gnd_refs_args)

    # Generate a file with a timestamp in the Beacon format:
    timestamp_filename = "/tmp/beacon_timestamp"
    with open(timestamp_filename, "w") as timestamp_file:
        timestamp_file.write("#TIMESTAMP: " + str(datetime.date.today()) + "\n")

    # Now generate the final output (header + counts):
    if not util.ConcatenateFiles([sys.argv[2], timestamp_filename, gnd_counts_filename], sys.argv[3]):
        util.SendEmailAndExit("Beacon Generator",
                              "An unexpected error occurred: could not write \"" + sys.argv[3] + "\"!",
                              priority=1)

    # Cleanup of temp files:
    os.unlink(gnd_numbers_path)
    os.unlink(timestamp_filename)
    os.unlink(gnd_counts_filename)

    util.SendEmailAndExit("Beacon File Generator", "Successfully created a Beacon file.", priority=5)