def main(argv):
    '''Main application access point'''
    options = get_options()

    init_logger('update.log')

    print('Processing Images...')

    # Get a list of images to process
    filepaths = find_images(options.source)

    if len(filepaths) == 0:
        return

    images = []

    # Move images to main archive
    for filepath in filepaths:
        dest = os.path.join(options.destination,
                            os.path.relpath(filepath, options.source))

        # Parse image header
        image_params = create_image_data(filepath)
        image_params['filepath'] = dest
        images.append(image_params)

        directory = os.path.dirname(dest)

        if not os.path.isdir(directory):
            os.makedirs(directory)

        shutil.move(filepath, dest)

    # Add images to the database
    db, cursor = get_db_cursor(options.dbhost, options.dbname,
                               options.dbuser, options.dbpass)
    process_jp2_images(images, options.destination, cursor, True)
    cursor.close()

    print('Finished!')
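# A minimal sketch of how the update script above might be invoked, assuming
# the helper functions it relies on (get_options, init_logger, find_images,
# create_image_data, get_db_cursor, process_jp2_images) and the `os` and
# `shutil` modules are imported near the top of this file.
if __name__ == '__main__':
    import sys
    main(sys.argv)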
"have the proper permissions and try again.") self.shutdown_requested = True try: shutil.move(filepath, dest) except IOError: logging.error("Unable to move files to destination. Is there " "enough free space?") self.shutdown_requested = True # Add to list to send to main database images.append(image_params) # Add valid images to main Database process_jp2_images(images, self.image_archive, self._db) logging.info("Added %d images to database", len(images)) if (len(corrupt) > 0): logging.info("Marked %d images as corrupt", len(corrupt)) def send_email_alert(self, message): """Sends an email notification to the Helioviewer admin(s) when a one of the data sources becomes unreachable.""" # If no server was specified, don't do anything if self.email_server is "": return # import email modules import smtplib
class ImageRetrievalDaemon:
    """Retrieves images from the server as specified"""
    def __init__(self, servers, browse_method, download_method, conf):
        """Set up database connections, directories, data servers, browsers
        and downloaders from the supplied configuration."""
        # MySQL/Postgres info
        self.dbhost = conf.get('database', 'dbhost')
        self.dbname = conf.get('database', 'dbname')
        self.dbuser = conf.get('database', 'dbuser')
        self.dbpass = conf.get('database', 'dbpass')

        # MySQL/Postgres info v2
        self.dbhost_v2 = conf.get('database_v2', 'dbhost_v2')
        self.dbname_v2 = conf.get('database_v2', 'dbname_v2')
        self.dbuser_v2 = conf.get('database_v2', 'dbuser_v2')
        self.dbpass_v2 = conf.get('database_v2', 'dbpass_v2')

        self.downloaders = []

        try:
            self._db, self._cursor = get_db_cursor(self.dbhost, self.dbname,
                                                   self.dbuser, self.dbpass)
        except mysqld.OperationalError:
            logging.error(
                "Unable to access MySQL. Is the database daemon running?")
            self.shutdown()
            self.stop()

        # v2 database
        if self.dbhost_v2 != "" and self.dbname_v2 != "":
            try:
                self._db_v2, self._cursor_v2 = get_db_cursor(
                    self.dbhost_v2, self.dbname_v2, self.dbuser_v2,
                    self.dbpass_v2)
            except mysqld.OperationalError:
                logging.error(
                    "Unable to access MySQL. Is the database daemon "
                    "running (v2)?")
                self.shutdown()
                self.stop()
        else:
            self._db_v2 = None
            self._cursor_v2 = None

        # Email notification
        self.email_server = conf.get('notifications', 'server')
        self.email_from = conf.get('notifications', 'from')
        self.email_to = conf.get('notifications', 'to')

        # Warning flags
        self.sent_diskspace_warning = False

        # Maximum number of simultaneous downloads
        self.max_downloads = conf.getint('network', 'max_downloads')

        # Directories
        self.working_dir = os.path.expanduser(
            conf.get('directories', 'working_dir'))
        self.image_archive = os.path.expanduser(
            conf.get('directories', 'image_archive'))
        self.incoming = os.path.join(self.working_dir, 'incoming')
        self.quarantine = os.path.join(self.working_dir, 'quarantine')
        self.kdu_transcode = os.path.expanduser(
            conf.get('kakadu', 'kdu_transcode'))

        # Check directory permission
        self._init_directories()

        # Load data server, browser, and downloader
        self.servers = self._load_servers(servers)

        self.browsers = []
        self.downloaders = []
        self.queues = []

        # For each server instantiate a browser and one or more downloaders
        for server in self.servers:
            self.browsers.append(self._load_browser(browse_method, server))
            # Use a local name for each queue so the `queue` module itself
            # is not shadowed on later iterations
            q = queue.Queue()
            self.queues.append(q)
            self.downloaders.append([
                self._load_downloader(download_method, q)
                for i in range(self.max_downloads)
            ])

        # Shutdown switch
        self.shutdown_requested = False

    def start(self, starttime=None, endtime=None, backfill=None):
        """Start daemon operation."""
        logging.info("Initializing HVPull")

        date_fmt = "%Y-%m-%d %H:%M:%S"

        # @TODO: Process urls in batches of ~1-500.. this way images start
        # appearing more quickly when filling in large gaps, etc.
        # @TODO: Redo handling of server-specific start time and pause time
        # @TODO: Send email notification when HVPull stops/exits for any
        # reason?

        # Determine starttime and endtime to use
        if backfill is None:
            if starttime is not None:
                starttime = datetime.datetime.strptime(starttime, date_fmt)
            else:
                starttime = self.servers[0].get_starttime()

            # If end time is specified, fill in data from start to end
            if endtime is not None:
                endtime = datetime.datetime.strptime(endtime, date_fmt)
                self.query(starttime, endtime)
                self.sleep()
                return None
            else:
                # Otherwise, first query from start -> now
                now = datetime.datetime.utcnow()
                self.query(starttime, now)
                self.sleep()
        else:
            # Backfill process has been requested. Look for data in the last
            # "backfill" days.
            # In normal operations, only the most recent data from each
            # instrument is ingested. If the pipeline halts for some reason,
            # the regular ingestion process can leave gaps since it looks for
            # data at some fixed time back from now. The backfill process is
            # a less regularly run process that looks much further back for
            # data that may have been missed. It is intended to be a
            # relatively infrequent data ingestion process, and should be run
            # as a cron job.
            starttime = datetime.datetime.utcnow() - datetime.timedelta(
                days=backfill[0])
            endtime = datetime.datetime.utcnow() - datetime.timedelta(
                days=backfill[1])
            self.query(starttime, endtime)
            self.stop()

        # Begin main loop
        while not self.shutdown_requested:
            now = datetime.datetime.utcnow()
            starttime = self.servers[0].get_starttime()

            # get a list of files available
            self.query(starttime, now)
            self.sleep()

        # Shutdown
        self.stop()

    def sleep(self):
        """Sleep for some time before checking again for new images"""
        if self.shutdown_requested:
            return

        logging.info("Sleeping for %d minutes." %
                     (self.servers[0].pause.total_seconds() / 60))
        time.sleep(self.servers[0].pause.total_seconds())

    def stop(self):
        logging.info("Exiting HVPull")
        sys.exit()

    def query(self, starttime, endtime):
        """Query and retrieve data within the specified range.

        Checks for data in the specified range and retrieves any new files.
        After execution is completed, the same range is checked again to see
        if any new files have appeared since the first execution. This
        continues until no new files are found (for xxx minutes?)
        """
        urls = []

        fmt = '%Y-%m-%d %H:%M:%S'
        logging.info("Querying time range %s - %s", starttime.strftime(fmt),
                     endtime.strftime(fmt))

        for browser in self.browsers:
            matches = self.query_server(browser, starttime, endtime)

            if len(matches) > 0:
                urls.append(matches)

        # Remove duplicate files, randomizing to spread load across servers
        if len(urls) > 1:
            urls = self._deduplicate(urls)

        # Filter out files that are already in the database
        new_urls = []

        for url_list in urls:
            filtered = None

            while filtered is None:
                try:
                    filtered = list(filter(self._filter_new, url_list))
                except mysqld.OperationalError:
                    # MySQL has gone away -- try again in 5s
                    logging.warning("Unable to access database to check for "
                                    "file existence. Will try again in 5 "
                                    "seconds.")
                    time.sleep(5)

                    # Try and reconnect
                    # @note: May be a good idea to move the reconnect
                    # functionality to the db module and have it occur for
                    # all queries.
                    try:
                        self._db, self._cursor = get_db_cursor(
                            self.dbhost, self.dbname, self.dbuser,
                            self.dbpass)
                    except:
                        pass

            new_urls.append(filtered)

        # check disk space
        if not self.sent_diskspace_warning:
            self._check_free_space()

        # acquire the data files
        self.acquire(new_urls)

    def query_server(self, browser, starttime, endtime):
        """Queries a single server for new files"""
        # Get a list of directories which may contain new images
        directories = browser.get_directories(starttime, endtime)

        # Get a sorted list of available JP2 files via browser
        files = []

        # Check each remote directory for new files
        for directory in directories:
            if self.shutdown_requested:
                return []

            matches = None
            num_retries = 0

            logging.info('(%s) Scanning %s' % (browser.server.name,
                                               directory))

            # Attempt to read directory contents. Retry for up to 3 * 1440
            # attempts (roughly three days at one attempt per minute), then
            # notify the admin and shut down.
            while matches is None:
                if self.shutdown_requested:
                    return []

                try:
                    matches = browser.get_files(directory, "jp2")
                    files.extend(matches)
                except NetworkError:
                    if num_retries >= 3 * 1440:
                        logging.error("Unable to reach %s. "
Shutting down HVPull.", browser.server.name) msg = "Unable to reach %s. Is the server online?" self.send_email_alert(msg % browser.server.name) self.shutdown() else: msg = "Unable to reach %s. Will try again in 60 seconds." if num_retries > 0: msg += " (retry %d)" % num_retries logging.warning(msg, browser.server.name) time.sleep(60) num_retries += 1 return files def acquire(self, urls): """Acquires all the available files.""" # If no new files are available do nothing if not urls: logging.info("Found no new files.") return n = sum(len(x) for x in urls) # Keep track of progress total = n counter = 0 logging.info("Found %d new files", n) # Download files while n > 0: finished = [] # Download files 100 at a time to avoid blocking shutdown requests # and to allow images to be added to database sooner for i, server in enumerate(list(urls)): for j in range(100): #pylint: disable=W0612 if len(list(server)) > 0: url = server.pop() finished.append(url) counter += 1. self.queues[i].put([ self.servers[i].name, (counter / total) * 100, url ]) n -= 1 for q in self.queues: q.join() self.ingest(finished) if self.shutdown_requested: break def ingest(self, urls): """ Add images to helioviewer data db. (1) Make sure the file exists (2) Make sure the file is 'good', and quarantine if it is not. (3) Apply the ESA JPIP encoding. (4) Ingest (5) Update database to say that the file has been successfully 'ingested'. """ # Get filepaths filepaths = [] images = [] corrupt = [] for url in urls: path = os.path.join( self.incoming, os.path.basename(url)) # @TODO: Better path computation if os.path.isfile(path): filepaths.append(path) # Add to hvpull/Helioviewer.org databases for filepath in filepaths: filename = os.path.basename(filepath) # Parse header and validate metadata try: try: image_params = create_image_data(filepath) except: raise BadImage("HEADER") logging.warn('BadImage("HEADER") error raised') self._validate(image_params) except BadImage as e: logging.warn("Quarantining invalid image: %s", filename) logging.warn("BadImage found; error message= %s", e.get_message()) shutil.move(filepath, os.path.join(self.quarantine, filename)) mark_as_corrupt(self._cursor, filename, e.get_message()) corrupt.append(filename) continue # If everything looks good, move to archive and add to database # print image_params['date'] date_str = image_params['date'].strftime('%Y/%m/%d') # The files must be transcoded in order to work with JHelioviewer. # Therefore, any problem with the transcoding process must raise # an error. try: if image_params['instrument'] == "AIA": self._transcode(filepath, cprecincts=[128, 128]) else: self._transcode(filepath) except KduTranscodeError, e: logging.error("kdu_transcode: " + e.get_message()) # Move to archive if image_params['observatory'] == "Hinode": directory = os.path.join(self.image_archive, image_params['nickname'], date_str, str(image_params['filter1']), str(image_params['filter2'])) else: directory = os.path.join(self.image_archive, image_params['nickname'], date_str, str(image_params['measurement'])) dest = os.path.join(directory, filename) image_params['filepath'] = dest if not os.path.exists(directory): try: os.makedirs(directory) except OSError: logging.error("Unable to create the directory '" + directory + "'. Please ensure that you " "have the proper permissions and try again.") self.shutdown_requested = True try: shutil.move(filepath, dest) except IOError: logging.error("Unable to move files to destination. 
Is there " "enough free space?") self.shutdown_requested = True # Add to list to send to main database images.append(image_params) # Add valid images to main Database process_jp2_images(images, self.image_archive, self._db, self._cursor, True, None, self._cursor_v2) logging.info("Added %d images to database", len(images)) if len(corrupt) > 0: logging.info("Marked %d images as corrupt", len(corrupt))
directory + "'. Please ensure that you " "have the proper permissions and try again.") self.shutdown_requested = True try: shutil.move(filepath, dest) except IOError: logging.error("Unable to move files to destination. Is there " "enough free space?") self.shutdown_requested = True # Add to list to send to main database images.append(image_params) # Add valid images to main Database process_jp2_images(images, self.image_archive, self._db) logging.info("Added %d images to database", len(images)) if (len(corrupt) > 0): logging.info("Marked %d images as corrupt", len(corrupt)) def send_email_alert(self, message): """Sends an email notification to the Helioviewer admin(s) when a one of the data sources becomes unreachable.""" # If no server was specified, don't do anything if self.email_server is "": return # import email modules import smtplib