Example #1
0
def main(argv):
    '''Main application access point'''
    options = get_options()

    init_logger('update.log')

    print('Processing Images...')

    # Get a list of images to process
    filepaths = find_images(options.source)

    if len(filepaths) == 0:
        return

    images = []

    # Move images to main archive
    for filepath in filepaths:
        dest = os.path.join(options.destination,
                            os.path.relpath(filepath, options.source))

        # Parse image header
        image_params = create_image_data(filepath)
        image_params['filepath'] = dest

        images.append(image_params)

        directory = os.path.dirname(dest)

        if not os.path.isdir(directory):
            os.makedirs(directory)

        shutil.move(filepath, dest)

    # Add images to the database
    db, cursor = get_db_cursor(options.dbhost, options.dbname, options.dbuser,
                               options.dbpass)
    process_jp2_images(images, options.destination, cursor, True)
    cursor.close()

    print('Finished!')
def main(argv):
    '''Main application access point'''
    options = get_options()
    
    init_logger('update.log')
    
    print('Processing Images...')
    
    # Get a list of images to process
    filepaths = find_images(options.source)
    
    if len(filepaths) == 0:
        return

    images = []

    # Move images to main archive
    for filepath in filepaths:
        dest = os.path.join(options.destination, 
                            os.path.relpath(filepath, options.source))
        
        # Parse image header
        image_params = sunpy.read_header(filepath)
        image_params['filepath'] = dest
        
        images.append(image_params)

        directory = os.path.dirname(dest)
        
        if not os.path.isdir(directory):
            os.makedirs(directory)

        shutil.move(filepath, dest)
    
    # Add images to the database
    cursor = get_db_cursor(options.dbname, options.dbuser, options.dbpass)
    process_jp2_images(images, options.destination, cursor, True)    
    cursor.close()
    
    print('Finished!')
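
# Note: both main() variants above assume a get_db_cursor() helper. A minimal
# sketch of such a helper, assuming the MySQLdb driver (the signature is
# inferred from the call sites; illustrative only, not the project's code):
import MySQLdb

def get_db_cursor(dbhost, dbname, dbuser, dbpass):
    """Connect to MySQL and return both the connection and a cursor."""
    db = MySQLdb.connect(host=dbhost, db=dbname, user=dbuser, passwd=dbpass)
    return db, db.cursor()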
                                  "have the proper permissions and try again.")
                    self.shutdown_requested = True

            try:
                shutil.move(filepath, dest)
            except IOError:
                logging.error("Unable to move files to destination. Is there "
                              "enough free space?")
                self.shutdown_requested = True


            # Add to list to send to main database
            images.append(image_params)

        # Add valid images to main Database
        process_jp2_images(images, self.image_archive, self._db)

        logging.info("Added %d images to database", len(images))

        if (len(corrupt) > 0):
            logging.info("Marked %d images as corrupt", len(corrupt))

    def send_email_alert(self, message):
        """Sends an email notification to the Helioviewer admin(s) when a
        one of the data sources becomes unreachable."""
        # If no server was specified, don't do anything
        if self.email_server is "":
            return

        # import email modules
        import smtplib
Example #4
0
class ImageRetrievalDaemon:
    """Retrieves images from the server as specified"""
    def __init__(self, servers, browse_method, download_method, conf):
        """Explain."""
        # MySQL/Postgres info
        self.dbhost = conf.get('database', 'dbhost')
        self.dbname = conf.get('database', 'dbname')
        self.dbuser = conf.get('database', 'dbuser')
        self.dbpass = conf.get('database', 'dbpass')
        # MySQL/Postgres info v2
        self.dbhost_v2 = conf.get('database_v2', 'dbhost_v2')
        self.dbname_v2 = conf.get('database_v2', 'dbname_v2')
        self.dbuser_v2 = conf.get('database_v2', 'dbuser_v2')
        self.dbpass_v2 = conf.get('database_v2', 'dbpass_v2')

        self.downloaders = []

        try:
            self._db, self._cursor = get_db_cursor(self.dbhost, self.dbname,
                                                   self.dbuser, self.dbpass)
        except mysqld.OperationalError:
            logging.error(
                "Unable to access MySQL. Is the database daemon running?")
            self.shutdown()
            self.stop()

        # v2 database
        if self.dbhost_v2 != "" and self.dbname_v2 != "":
            try:
                self._db_v2, self._cursor_v2 = get_db_cursor(
                    self.dbhost_v2, self.dbname_v2, self.dbuser_v2,
                    self.dbpass_v2)
            except mysqld.OperationalError:
                logging.error(
                    "Unable to access MySQL. Is the database daemon running (v2)?"
                )
                self.shutdown()
                self.stop()
        else:
            self._db_v2 = None
            self._cursor_v2 = None

        # Email notification
        self.email_server = conf.get('notifications', 'server')
        self.email_from = conf.get('notifications', 'from')
        self.email_to = conf.get('notifications', 'to')

        # Warning flags
        self.sent_diskspace_warning = False

        # Maximum number of simultaneous downloads
        self.max_downloads = conf.getint('network', 'max_downloads')

        # Directories
        self.working_dir = os.path.expanduser(
            conf.get('directories', 'working_dir'))
        self.image_archive = os.path.expanduser(
            conf.get('directories', 'image_archive'))
        self.incoming = os.path.join(self.working_dir, 'incoming')
        self.quarantine = os.path.join(self.working_dir, 'quarantine')
        self.kdu_transcode = os.path.expanduser(
            conf.get('kakadu', 'kdu_transcode'))

        # Check directory permission
        self._init_directories()

        # Load data server, browser, and downloader
        self.servers = self._load_servers(servers)

        self.browsers = []
        self.downloaders = []
        self.queues = []

        # For each server instantiate a browser and one or more downloaders
        for server in self.servers:
            self.browsers.append(self._load_browser(browse_method, server))
            # Use a distinct local name to avoid shadowing the queue module
            q = queue.Queue()
            self.queues.append(q)
            self.downloaders.append([
                self._load_downloader(download_method, q)
                for i in range(self.max_downloads)
            ])

        # Shutdown switch
        self.shutdown_requested = False

    def start(self, starttime=None, endtime=None, backfill=None):
        """Start daemon operation."""
        logging.info("Initializing HVPull")

        date_fmt = "%Y-%m-%d %H:%M:%S"

        # @TODO: Process urls in batches of ~1-500.. this way images start
        # appearing more quickly when filling in large gaps, etc.

        # @TODO: Redo handling of server-specific start time and pause
        # time
        #
        # @TODO: Send email notification when HVpull stops/exits for any reason?

        # Determine starttime and endtime to use
        if backfill is None:
            if starttime is not None:
                starttime = datetime.datetime.strptime(starttime, date_fmt)
            else:
                starttime = self.servers[0].get_starttime()

            # If end time is specified, fill in data from start to end
            if endtime is not None:
                endtime = datetime.datetime.strptime(endtime, date_fmt)
                self.query(starttime, endtime)
                self.sleep()

                return None
            else:
                # Otherwise, first query from start -> now
                now = datetime.datetime.utcnow()
                self.query(starttime, now)
                self.sleep()
        else:
            # Backfill process has been requested.  Look for data in the last
            # "backfill" days. In normal operations, only the most recent
            # data from each instrument is ingested.  If the pipeline halts for
            # some reason, then the regular ingestion process can leave
            # gaps since it looks for data at some fixed time back from now.
            # The backfill process is a less regularly run process that looks
            # much further back for data that may have been missed.  This is
            # intended to be a relatively infrequently run data ingestion
            # process, and should be run as a cron job.
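            # For illustration (assumed values): backfill=[7, 0] re-scans the
            # window from 7 days ago up to now, while backfill=[30, 7] re-scans
            # days 30 through 7 ago to fill any remaining gaps.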
            starttime = datetime.datetime.utcnow() - datetime.timedelta(
                days=backfill[0])
            endtime = datetime.datetime.utcnow() - datetime.timedelta(
                days=backfill[1])
            self.query(starttime, endtime)
            self.stop()

        # Begin main loop
        while not self.shutdown_requested:
            now = datetime.datetime.utcnow()
            starttime = self.servers[0].get_starttime()

            # get a list of files available
            self.query(starttime, now)

            self.sleep()

        # Shutdown
        self.stop()

    def sleep(self):
        """Sleep for some time before checking again for new images"""
        if self.shutdown_requested:
            return

        logging.info("Sleeping for %d minutes." %
                     (self.servers[0].pause.total_seconds() / 60))
        time.sleep(self.servers[0].pause.total_seconds())

    def stop(self):
        logging.info("Exiting HVPull")
        sys.exit()

    def query(self, starttime, endtime):
        """Query and retrieve data within the specified range.

        Checks for data in the specified range and retrieves any new files.
        After execution is completed, the same range is checked again to see
        if any new files have appeared since the first execution. This continues
        until no new files are found (for xxx minutes?)
        """
        urls = []

        fmt = '%Y-%m-%d %H:%M:%S'

        logging.info("Querying time range %s - %s", starttime.strftime(fmt),
                     endtime.strftime(fmt))

        for browser in self.browsers:
            matches = self.query_server(browser, starttime, endtime)

            if len(matches) > 0:
                urls.append(matches)

        # Remove duplicate files, randomizing to spread load across servers
        if len(urls) > 1:
            urls = self._deduplicate(urls)

        # Filter out files that are already in the database
        new_urls = []

        for url_list in urls:
            filtered = None

            while filtered is None:
                try:
                    filtered = list(filter(self._filter_new, url_list))
                except mysqld.OperationalError:
                    # MySQL has gone away -- try again in 5s
                    logging.warning(
                        "Unable to access database to check for file "
                        "existence. Will try again in 5 seconds.")
                    time.sleep(5)

                    # Try and reconnect

                    # @note: May be a good idea to move the reconnect
                    # functionality to the db module and have it occur
                    # for all queries.
                    try:
                        self._db, self._cursor = get_db_cursor(
                            self.dbhost, self.dbname, self.dbuser, self.dbpass)
                    except mysqld.OperationalError:
                        pass

            new_urls.append(filtered)

        # check disk space
        if not self.sent_diskspace_warning:
            self._check_free_space()

        # acquire the data files
        self.acquire(new_urls)

    def query_server(self, browser, starttime, endtime):
        """Queries a single server for new files"""
        # Get a list of directories which may contain new images
        directories = browser.get_directories(starttime, endtime)

        # Get a sorted list of available JP2 files via browser
        files = []

        # Check each remote directory for new files
        for directory in directories:
            if self.shutdown_requested:
                return []

            matches = None
            num_retries = 0

            logging.info('(%s) Scanning %s', browser.server.name, directory)

            # Attempt to read the directory contents, retrying every 60
            # seconds; notify the admin and shut down after ~3 days of failures
            while matches is None:
                if self.shutdown_requested:
                    return []

                try:
                    matches = browser.get_files(directory, "jp2")

                    files.extend(matches)
                except NetworkError:
                    if num_retries >= 3 * 1440:
                        logging.error(
                            "Unable to reach %s. Shutting down HVPull.",
                            browser.server.name)
                        msg = "Unable to reach %s. Is the server online?"
                        self.send_email_alert(msg % browser.server.name)
                        self.shutdown()
                    else:
                        msg = "Unable to reach %s. Will try again in 60 seconds."
                        if num_retries > 0:
                            msg += " (retry %d)" % num_retries
                        logging.warning(msg, browser.server.name)
                        time.sleep(60)
                        num_retries += 1

        return files

    def acquire(self, urls):
        """Acquires all the available files."""
        # If no new files are available do nothing
        if not urls:
            logging.info("Found no new files.")
            return

        n = sum(len(x) for x in urls)

        # Keep track of progress
        total = n
        counter = 0

        logging.info("Found %d new files", n)

        # Download files
        while n > 0:
            finished = []

            # Download files 100 at a time to avoid blocking shutdown requests
            # and to allow images to be added to database sooner
            # `server` here is the list of URLs remaining for one server
            for i, server in enumerate(list(urls)):
                for j in range(100):  #pylint: disable=W0612
                    if len(server) > 0:
                        url = server.pop()

                        finished.append(url)

                        counter += 1.

                        self.queues[i].put([
                            self.servers[i].name, (counter / total) * 100, url
                        ])

                        n -= 1

            for q in self.queues:
                q.join()

            self.ingest(finished)

            if self.shutdown_requested:
                break

    def ingest(self, urls):
        """
        Add images to helioviewer data db.
          (1) Make sure the file exists
          (2) Make sure the file is 'good', and quarantine if it is not.
          (3) Apply the ESA JPIP encoding.
          (4) Ingest
          (5) Update database to say that the file has been successfully
              'ingested'.
        """
        # Get filepaths
        filepaths = []
        images = []
        corrupt = []

        for url in urls:
            path = os.path.join(
                self.incoming,
                os.path.basename(url))  # @TODO: Better path computation
            if os.path.isfile(path):
                filepaths.append(path)

        # Add to hvpull/Helioviewer.org databases
        for filepath in filepaths:
            filename = os.path.basename(filepath)

            # Parse header and validate metadata
            try:
                try:
                    image_params = create_image_data(filepath)
                except Exception:
                    logging.warn('BadImage("HEADER") error raised')
                    raise BadImage("HEADER")
                self._validate(image_params)
            except BadImage as e:
                logging.warn("Quarantining invalid image: %s", filename)
                logging.warn("BadImage found; error message= %s",
                             e.get_message())
                shutil.move(filepath, os.path.join(self.quarantine, filename))
                mark_as_corrupt(self._cursor, filename, e.get_message())
                corrupt.append(filename)
                continue

            # If everything looks good, move to archive and add to database
            # print image_params['date']
            date_str = image_params['date'].strftime('%Y/%m/%d')

            # The files must be transcoded in order to work with JHelioviewer.
            # Therefore, any problem with the transcoding process must raise
            # an error.
            try:
                if image_params['instrument'] == "AIA":
                    self._transcode(filepath, cprecincts=[128, 128])
                else:
                    self._transcode(filepath)
            except KduTranscodeError as e:
                logging.error("kdu_transcode: " + e.get_message())

            # Move to archive
            if image_params['observatory'] == "Hinode":
                directory = os.path.join(self.image_archive,
                                         image_params['nickname'], date_str,
                                         str(image_params['filter1']),
                                         str(image_params['filter2']))
            else:
                directory = os.path.join(self.image_archive,
                                         image_params['nickname'], date_str,
                                         str(image_params['measurement']))

            dest = os.path.join(directory, filename)

            image_params['filepath'] = dest

            if not os.path.exists(directory):
                try:
                    os.makedirs(directory)
                except OSError:
                    logging.error("Unable to create the directory '" +
                                  directory + "'. Please ensure that you "
                                  "have the proper permissions and try again.")
                    self.shutdown_requested = True

            try:
                shutil.move(filepath, dest)
            except IOError:
                logging.error("Unable to move files to destination. Is there "
                              "enough free space?")
                self.shutdown_requested = True

            # Add to list to send to main database
            images.append(image_params)

        # Add valid images to main Database
        process_jp2_images(images, self.image_archive, self._db, self._cursor,
                           True, None, self._cursor_v2)

        logging.info("Added %d images to database", len(images))

        if len(corrupt) > 0:
            logging.info("Marked %d images as corrupt", len(corrupt))
Example #5
0
                                  directory + "'. Please ensure that you "
                                  "have the proper permissions and try again.")
                    self.shutdown_requested = True

            try:
                shutil.move(filepath, dest)
            except IOError:
                logging.error("Unable to move files to destination. Is there "
                              "enough free space?")
                self.shutdown_requested = True

            # Add to list to send to main database
            images.append(image_params)

        # Add valid images to main Database
        process_jp2_images(images, self.image_archive, self._db)

        logging.info("Added %d images to database", len(images))

        if len(corrupt) > 0:
            logging.info("Marked %d images as corrupt", len(corrupt))

    def send_email_alert(self, message):
        """Sends an email notification to the Helioviewer admin(s) when a
        one of the data sources becomes unreachable."""
        # If no server was specified, don't do anything
        if self.email_server is "":
            return

        # import email modules
        import smtplib
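
        # The original is cut off here; a minimal completion sketch using the
        # standard smtplib/email modules (the subject text and single-recipient
        # handling are assumptions):
        from email.mime.text import MIMEText

        msg = MIMEText(message)
        msg['Subject'] = "HVPull: data source unreachable"
        msg['From'] = self.email_from
        msg['To'] = self.email_to

        server = smtplib.SMTP(self.email_server)
        server.sendmail(self.email_from, [self.email_to], msg.as_string())
        server.quit()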