Example #1
    def parse_catalog_path(scope_path: str = None) -> str:
        """
        Attempts to find the catalog file for a given path. It first checks the project data directory for a
        global copy of the catalog (named after the storm parsed from the path), then falls back to a local
        copy (s.CATALOG_FILE_DEFAULT) in the specified scope.

        :param scope_path: The root path of the scope to search for catalog.csv in, or None to default to the
        global, storm non-specific file if one exists ('default.csv')
        :return: The path to the catalog file, including the filename and extension
        :except CatalogNotFoundException: If a suitable catalog file cannot be found in the scope or project dir
        """

        storm_id: Optional[str] = Cataloging._get_storm_from_path(
            scope_path=scope_path)
        catalog_path: str = h.validate_and_expand_path(
            Cataloging.get_catalog_path(storm_id=storm_id))
        alt_catalog_path: str = h.validate_and_expand_path(
            os.path.join(scope_path, s.CATALOG_FILE_DEFAULT))

        # Both candidate paths were already validated and expanded above, so they
        # can be returned directly (os.path.isfile also implies existence)
        if os.path.isfile(catalog_path):
            # The catalog exists somewhere in the global catalog directory
            return catalog_path

        elif os.path.isfile(alt_catalog_path):
            # The catalog was found as catalog.csv in the scope path
            return alt_catalog_path

        else:
            raise CatalogNotFoundException
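
For reference, a minimal, self-contained sketch of the same first-match lookup using only the standard library; the constant and helper names are illustrative stand-ins for the project's h and s modules:

import os

CATALOG_FILE_DEFAULT = 'catalog.csv'  # assumed stand-in for s.CATALOG_FILE_DEFAULT


def find_first_existing(*candidates: str) -> str:
    """Return the first candidate that is an existing file, else raise FileNotFoundError."""
    for path in candidates:
        expanded = os.path.abspath(os.path.expanduser(path))
        if os.path.isfile(expanded):
            return expanded
    raise FileNotFoundError('No catalog found among: %s' % str(candidates))


# Usage: prefer the global copy, then fall back to the scope-local copy
# catalog = find_first_existing('/data/catalogs/florence.csv',
#                               os.path.join('/data/florence', CATALOG_FILE_DEFAULT))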
Example #2
    def get_catalog_path(storm_id: str = None) -> str:
        """
        Get the catalog path as specified in s.py. This will return the absolute path on the local machine including
        the file name and extension (e.g. '/home/psic_user/Poststorm_Imagery/data/catalogs/v1/florence.csv').

        :param storm_id: The id of the storm (usually the name of the storm, lower-cased with '_' instead of spaces)
        :return: The absolute path of where the catalog should be (may not actually exist) including the filename and
        extension
        """

        if storm_id is None:
            return h.validate_and_expand_path(
                os.path.join(s.CATALOG_DATA_PATH, s.CATALOG_FILE_DEFAULT))

        else:
            return h.validate_and_expand_path(
                os.path.join(s.CATALOG_DATA_PATH,
                             s.CATALOG_FILE.replace('${storm_id}', storm_id)))
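
The '${storm_id}' placeholder above is filled with a plain str.replace; the standard library's string.Template performs the same substitution and raises KeyError if the placeholder name is misspelled. A short sketch, assuming a pattern shaped like s.CATALOG_FILE:

from string import Template

CATALOG_FILE = '${storm_id}.csv'  # assumed stand-in for s.CATALOG_FILE

# Equivalent to CATALOG_FILE.replace('${storm_id}', 'florence'), with validation
filename = Template(CATALOG_FILE).substitute(storm_id='florence')
print(filename)  # florence.csv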
Example #3
    def _get_image_from_path(
            scope_path: Union[bytes, str] = None) -> Optional[str]:
        # Return the file's name if the path points to a .jpg image, else None

        scope_path = h.validate_and_expand_path(scope_path)

        image_file: str = os.path.split(scope_path)[1]

        # Match on the extension rather than a substring so that non-image files
        # with '.jpg' in the name (e.g. 'img.jpg.geom') are not treated as images
        if image_file.lower().endswith('.jpg'):
            return image_file
        else:
            return None
Example #4
def main():
    if len(sys.argv) >= 2 and sys.argv[1] in PATHS:
        script = h.validate_and_expand_path(
            os.path.join(SELF_PATH, PATHS[sys.argv[1]]))
        sys.argv.remove(sys.argv[1])
        sys.argv[0] = str(script)

        # Run the chosen sub-command's script in place of this one
        exec(open(sys.argv[0]).read())
    elif len(sys.argv) >= 2 and sys.argv[1] not in ('--help', '-h'):
        # An argument was given, but it is neither a known sub-command nor a help flag
        print('Unknown sub-command "%s".'
              '\nValid sub-commands are %s' % (sys.argv[1], ', '.join(PATHS)))
    else:
        print('Usage: %s [%s]' % (s.ROOT_CMD, '|'.join(PATHS)))
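
The dispatcher above runs the chosen script via exec. The standard library's runpy achieves the same thing while giving the target a clean __main__ namespace; a sketch with a hypothetical PATHS mapping:

import runpy
import sys

# Hypothetical sub-command -> script mapping
PATHS = {'collect': 'collect/collect.py', 'cataloging': 'cataloging/cataloging.py'}


def dispatch() -> None:
    if len(sys.argv) >= 2 and sys.argv[1] in PATHS:
        script = PATHS[sys.argv[1]]
        sys.argv = [script] + sys.argv[2:]  # rewrite argv for the target script
        runpy.run_path(script, run_name='__main__')
    else:
        print('Usage: prog [%s]' % '|'.join(PATHS))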
Example #5
    def _get_archive_from_path(scope_path: Union[bytes, str] = None) -> str:

        scope_path = h.validate_and_expand_path(scope_path)

        path_head, path_tail = os.path.split(scope_path)

        if path_tail == '':
            # The filesystem root was reached without finding an archive name
            raise PathParsingException(objective='the archive name')

        if not ('20' in path_tail and '_' in path_tail) \
                or scope_path == s.DATA_PATH:
            # The current directory does not look like an archive name (archive names
            # contain '20' from the year and an underscore) or is the data path itself

            # Keep recursively checking each parent directory (traverse back through path)
            return Cataloging._get_archive_from_path(scope_path=path_head)

        else:
            return path_tail
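
The same walk-up-the-tree search can be written iteratively with pathlib, under the snippet's assumption that an archive name contains '20' (from the year) and an underscore; the path in the usage comment is hypothetical:

from pathlib import Path


def find_archive_name(start: Path) -> str:
    """Walk from 'start' toward the filesystem root and return the first
    directory name that looks like an archive (contains '20' and '_')."""
    for part in [start] + list(start.parents):
        if '20' in part.name and '_' in part.name:
            return part.name
    raise ValueError('No archive-like directory above %s' % start)


# find_archive_name(Path('/data/Florence/20180919a_jpgs/image.jpg'))  # -> '20180919a_jpgs'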
Example #6
    def _get_archive_from_path(scope_path: Union[bytes, str],
                               storm_id: str) -> str:

        scope_path = h.validate_and_expand_path(scope_path)

        path_head, path_tail = os.path.split(scope_path)

        if path_tail == '':
            # The filesystem root was reached without finding the archive name
            raise PathParsingException(objective='the archive name')

        if os.path.split(path_head)[1].lower() == storm_id:
            # The parent directory is the storm directory, so this is the archive
            return path_tail

        else:
            # Keep recursively checking each parent directory (traverse back through path)
            return Cataloging._get_archive_from_path(scope_path=path_head,
                                                     storm_id=storm_id)
Example #7
    def _get_storm_from_path(scope_path: Union[bytes, str] = None,
                             debug: bool = s.DEFAULT_DEBUG,
                             recurse_count: int = 0) -> Optional[str]:

        if debug:
            print('Looking for storm in path: ' + str(scope_path))

        scope_path = h.validate_and_expand_path(scope_path)

        path_head, path_tail = os.path.split(scope_path)

        if path_head == os.path.splitdrive(scope_path)[1] and path_tail == '':
            # If the filesystem root directory is reached, a storm-specific catalog cannot be found

            raise PathParsingException(objective='the storm name')

        if recurse_count > 10:
            raise RecursionError(
                'Could not find storm in path after 10 iterations!')

        if path_tail[0].islower() or re.search('[._]', path_tail) \
                or scope_path == s.DATA_PATH:
            # If the first character of the directory's name is lower-cased (storms should have capitals)
            # or the directory is actually a file or archive or is the data path

            # Keep recursively checking each directory to match the pattern (traverse back through path)
            return Cataloging._get_storm_from_path(
                scope_path=path_head,
                debug=debug,
                recurse_count=(recurse_count + 1))

        else:
            if debug:
                print('Found storm name (' + str(path_tail) + ') in path: ' +
                      str(scope_path))

            return path_tail
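
A condensed, iterative sketch of the storm heuristic above: walk upward until a directory name starts with a capital letter and contains neither '.' nor '_' (the path in the usage comment is hypothetical):

import re
from pathlib import Path


def find_storm_name(start: Path, max_depth: int = 10) -> str:
    for depth, part in enumerate([start] + list(start.parents)):
        if depth > max_depth:
            raise RecursionError('Could not find storm in path after 10 iterations!')
        name = part.name
        # Storm directories are capitalized and contain no '.' or '_'
        if name and name[0].isupper() and not re.search('[._]', name):
            return name
    raise ValueError('No storm-like directory above %s' % start)


# find_storm_name(Path('/data/Florence/20180919a_jpgs/image.jpg'))  # -> 'Florence'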
Example #8
    'immediately, without waiting on a report to print) (Default: %(default)s).'
)

parser.add_argument(
    '--overwrite',
    '-o',
    action='store_true',
    help=
    'If included, the program will overwrite any existing archive files found in the directory by '
    'the same name (Default: %(default)s).')

# Add custom OPTIONS to the script when running command-line
OPTIONS: argparse.Namespace = parser.parse_args()

# Clean up path input and validate it
DOWNLOAD_PATH = h.validate_and_expand_path(OPTIONS.path)

c = ConnectionHandler()

storms: List[Storm] = c.get_storm_list(OPTIONS.storm)

if len(storms) == 0:  # pragma: no cover
    h.print_error(
        'No storms matched the expression provided for --storm / -s: "' +
        OPTIONS.storm + '"')
    exit(1)

# Only display status report if user requests it, otherwise just start downloads
if not OPTIONS.no_status:

    storm_number: int = 1  # Displayed number associates with storm list
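
For context, a minimal, self-contained argparse sketch of the flags this script reads; the --no_status flag name is inferred from OPTIONS.no_status above and may differ in the real script:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--overwrite', '-o', action='store_true',
                    help='If included, overwrite any existing archive files found in the '
                         'directory by the same name (Default: %(default)s).')
parser.add_argument('--no_status', action='store_true',
                    help='If included, start downloads immediately without printing a '
                         'status report first (Default: %(default)s).')

opts = parser.parse_args(['--overwrite'])
print(opts.overwrite, opts.no_status)  # True False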
Example #9
    def generate_index_from_scope(scope_path: Union[str, bytes] = s.DATA_PATH,
                                  fields_needed: Set = s.DEFAULT_FIELDS.copy(),
                                  save_interval: int = 1000,
                                  require_geom: bool = False,
                                  override_catalog_path: Union[bytes, str,
                                                               None] = None,
                                  debug: bool = s.DEFAULT_DEBUG,
                                  verbosity: int = s.DEFAULT_VERBOSITY,
                                  **kwargs) -> None:
        """
        A function to generate an index of all the data in the scope specified. Does not generate statistics, but
        instead allows for listing the data details based off of each file's attributes. Returns a Generator (an
        iterable object) that can be looped through with a for-loop or similar.

        :param scope_path: The root path to start indexing files from
        :param fields_needed: The fields to include in the catalog (gathered from the local file system)
        :param save_interval: The interval in which to save the data to the disk when accessing the .geom files,
        measured in file access operations. (0 = never save, 1000 = save after every 1,000 files read, etc.)
        :param require_geom: Whether (True) or not (False) to require a .geom file present in search for valid files
        :param override_catalog_path: If set, the program will not search for a catalog, and instead use the path to
        the catalog provided as a string.
        :param debug: Whether (True) or not (False) to override default debug flag and output additional statements
        :param verbosity: The frequency of debug statement output (1 = LOW, 2 = MEDIUM, 3 = HIGH)
        """

        global flag_unsaved_changes  # Include the global variable defined at top of this script

        print(
            'Parsing out current path to determine catalog variables to use ... ',
            end='')

        scope_path = h.validate_and_expand_path(path=scope_path)
        storm_id: Optional[str] = Cataloging._get_storm_from_path(
            scope_path=scope_path, debug=debug)

        if override_catalog_path is None:  # pragma: no cover
            try:
                catalog_path = Cataloging.parse_catalog_path(
                    scope_path=scope_path)
            except CatalogNotFoundException:
                catalog_path = Cataloging.get_catalog_path(storm_id=storm_id)

        else:
            # A catalog path is provided, so no need to search (used for testing)
            catalog_path = override_catalog_path

        print('DONE')

        ##########################################
        # Collect matching files from filesystem #
        ##########################################

        print('Getting a list of all valid images ... ', end='')

        # Get a list of all files starting at the path specified
        files: List[str] = h.all_files_recursively(scope_path,
                                                   unix_sep=True,
                                                   require_geom=require_geom,
                                                   debug=debug,
                                                   verbosity=verbosity,
                                                   **kwargs)

        # Remove all files that are in 'bak' dirs (backup files)
        files = [f for f in files if not os.path.split(f)[0].endswith('bak')]

        if len(files) == 0:
            raise CatalogNoEntriesException(curr_dir=scope_path)

        if debug and verbosity >= 2:

            if verbosity < 3 and len(files) > 10:
                # Print only the first five and last five elements (similar to pandas's DataFrames)
                for i in (list(range(1, 6)) +
                          list(range(len(files) - 4,
                                     len(files) + 1))):

                    # Right-align the file numbers, because why not
                    print(('{:>' + str(len(str(len(files) + 1))) +
                           '}').format(i) + '  ' + files[i - 1])
                    if i == 5:
                        print(('{:>' + str(len(str(len(files) + 1))) +
                               '}').format('...'))

            else:
                file_list_number = 1

                # Print all elements if there are 10 or fewer
                for f in files:

                    # Right-align the file numbers, because why not
                    print(('{:>' + str(len(str(len(files) + 1))) +
                           '}').format(file_list_number) + '  ' + f)
                    file_list_number += 1

        print('DONE')

        ####################################################################
        # Load / generate the table (DataFrame) if it doesn't exist        #
        # and populate with file path, file size, and date image was taken #
        ####################################################################

        current_fields_needed: Set = fields_needed.copy()
        flag_unsaved_changes = False

        catalog: pd.DataFrame

        if not os.path.exists(catalog_path):
            # If the catalog file doesn't exist, create a new one

            print('Parsing out information about images from their paths ... ',
                  end='')

            entries: List[Dict[str, Union[str, int]]] = list()

            for file in files:
                entry: dict = dict()
                entry['file'] = file
                entry['storm_id'] = Cataloging._get_storm_from_path(
                    os.path.join(scope_path, file)).lower()
                entry['archive'] = Cataloging._get_archive_from_path(
                    os.path.join(scope_path, file)).lower()
                entry['image'] = Cataloging._get_image_from_path(
                    os.path.join(scope_path, file))
                entries.append(entry)

            catalog: pd.DataFrame = pd.DataFrame(entries)

            # DataFrame is populated with these fields, so remove them from the needed list
            current_fields_needed -= {'file', 'storm_id', 'archive', 'image'}

            print('DONE')

            if 'size' in current_fields_needed:
                sizes: List[int] = list()

                for i in range(len(files)):

                    print(
                        f'\rGetting size of file {i + 1} of {len(files)} ({round((i / len(files)) * 100, 2)}%) '
                        + '.' * (math.floor(((i + 1) % 9) / 3) + 1),
                        end=' ')
                    sizes.append(
                        os.path.getsize(os.path.join(scope_path, files[i])))

                catalog['size'] = sizes
                flag_unsaved_changes = True
                current_fields_needed.remove('size')

            if 'date' in current_fields_needed:
                dates: List[str] = list()

                for i in range(len(files)):

                    print(
                        f'\rGetting date taken from file {i + 1} of {len(files)} ({round((i / len(files)) * 100, 2)}%) '
                        + '.' * (math.floor(((i + 1) % 9) / 3) + 1),
                        end=' ')
                    dates.append(
                        Cataloging._get_best_date(
                            os.path.join(scope_path, files[i])))

                catalog['date'] = dates
                flag_unsaved_changes = True
                current_fields_needed.remove('date')

            # Create the file in the scope directory
            Cataloging._force_save_catalog(catalog=catalog,
                                           scope_path=scope_path)

        else:

            print(
                'Reading in existing catalog to try and fill in any missing values ... ',
                end='')

            catalog = pd.read_csv(
                catalog_path,
                usecols=lambda col_label: col_label in current_fields_needed)

            if catalog.shape[0] > len(files):
                # If there are fewer images found than listed in the current catalog
                h.print_error(
                    f'Found {catalog.shape[0]} entries in the existing catalog and {len(files)} files in the '
                    f'scope directory. Files are most likely missing or misplaced!'
                )
                exit(1)

            elif catalog.shape[0] < len(files):
                # The number of images in the directory exceeds the amount listed in the current catalog
                h.print_error(
                    'The catalog seems to be missing some entries! Deleting old one and trying again ... '
                )
                os.remove(catalog_path)
                Cataloging.generate_index_from_scope(
                    scope_path=scope_path,
                    fields_needed=fields_needed,
                    save_interval=save_interval,
                    require_geom=require_geom,
                    override_catalog_path=override_catalog_path,
                    debug=debug,
                    verbosity=verbosity,
                    **kwargs)
                exit(0)

            # Remove basic info as it should already exist in the CSV file
            current_fields_needed -= {
                'file', 'storm_id', 'archive', 'image', 'date', 'size'
            }

            print('DONE')

        ##########################################################################################
        # Collect information from the .geom files about latitude and longitude of image corners #
        ##########################################################################################

        if debug and verbosity >= 1:
            print(
                'Basic data is complete! Moving on to .geom specific data ... '
            )

        for field in current_fields_needed:

            # If a column for each field does not exist, create one for each field with all the values as empty strings
            if field not in catalog:
                catalog[field] = ''
                flag_unsaved_changes = True

        stat_files_accessed: int = 0

        # For any remaining fields needed (i.e. ll_lat), look for them in the .geom files
        for i, row in catalog.iterrows():

            dots = math.floor(((i + 1) % 9) / 3) + 1

            print(
                f'\rProcessing .geom attributes of file {i + 1} of {len(files)} '
                f'({round((i / len(files)) * 100, 2)}%) ' + '.' * dots + ' ' *
                (3 - dots),
                end=' ')

            row_fields_needed = current_fields_needed.copy()
            row_fields_existing = set()

            # Remove redundant queries to .geom file if the data is already present in the catalog
            for field in current_fields_needed:
                if (isinstance(row[field], str) and len(row[field]) > 0) \
                        or (not isinstance(row[field], str) and str(row[field]).lower() != "nan"):
                    row_fields_existing.add(field)
                    row_fields_needed.remove(field)

            ending: str
            if debug and verbosity >= 3:
                ending = '\n'
            else:
                ending = '\r'

            if len(row_fields_existing) > 0:
                print(
                    f'Found existing data for {row_fields_existing} ... skipping these fields!',
                    end=ending)

            # Only query the .geom file if there are fields still unfilled
            if len(row_fields_needed) > 0:

                # Look up the fields that are needed and still missing data
                geom_data: Optional[Dict[str, str]] = Cataloging._get_geom_fields(
                    field_id_set=row_fields_needed,
                    file_path=os.path.join(scope_path,
                                           os.path.normpath(row['file'])),
                    debug=debug,
                    verbosity=verbosity)
                stat_files_accessed += 1

                if geom_data is not None:

                    # Store the values in the catalog's respective column by field name, in memory
                    for key, value in geom_data.items():
                        try:
                            catalog.at[i, key] = value
                        except ValueError as e:
                            h.print_error(
                                'The catalog seems to be corrupted or out of date! Deleting old one and '
                                f'trying again ... \nError: {e}')
                            os.remove(catalog_path)
                            Cataloging.generate_index_from_scope(
                                scope_path=scope_path,
                                fields_needed=fields_needed,
                                save_interval=save_interval,
                                require_geom=require_geom,
                                override_catalog_path=override_catalog_path,
                                debug=debug,
                                verbosity=verbosity,
                                **kwargs)
                            exit(0)

                        flag_unsaved_changes = True

            if save_interval > 0 and stat_files_accessed != 0 and stat_files_accessed % save_interval == 0:

                print('\rSaving catalog to disk (' + str(stat_files_accessed) +
                      ' .geom files accessed) ... ',
                      end='')
                Cataloging._force_save_catalog(catalog=catalog,
                                               scope_path=scope_path)

        if debug:
            print('\r')

            if verbosity >= 1:
                print()
                print(catalog)

        # Do a final save of the file
        Cataloging._force_save_catalog(catalog=catalog, scope_path=scope_path)

        print('Saved all existing data successfully!\n')
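
Stripped to its core, generate_index_from_scope builds a list of per-file dicts, turns it into a DataFrame, and then fills the slower columns with periodic checkpoint saves. A self-contained sketch of that pattern (the 'stem' field and its computation are illustrative stand-ins for the .geom lookups):

import os
from typing import Dict, List

import pandas as pd


def build_index(files: List[str], scope: str, save_path: str,
                save_interval: int = 1000) -> pd.DataFrame:
    # One dict per file becomes one row of the catalog
    entries: List[Dict[str, object]] = [
        {'file': f, 'size': os.path.getsize(os.path.join(scope, f))}
        for f in files
    ]
    catalog = pd.DataFrame(entries)
    catalog['stem'] = ''  # pre-create slow-to-fill columns, as the real code does

    accessed = 0
    for i, row in catalog.iterrows():
        # Stand-in for a slow per-file lookup (the real code reads .geom files here)
        catalog.at[i, 'stem'] = os.path.splitext(os.path.basename(row['file']))[0]
        accessed += 1

        if save_interval > 0 and accessed % save_interval == 0:
            catalog.to_csv(save_path, index=False)  # checkpoint partial progress

    catalog.to_csv(save_path, index=False)  # final save
    return catalog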
Example #10
    def _get_geom_fields(field_id_set: Union[Set[str], str], file_path: Union[bytes, str],
                         debug: bool = s.DEFAULT_DEBUG, verbosity: int = s.DEFAULT_VERBOSITY) \
            -> Union[Dict[str, str], str, None]:

        is_single_input = False

        # If only one id is entered (a single string), convert it to a set of one element.
        # Note that the set is mutated as fields are found, so callers should pass a copy
        # if they need to reuse it.
        if isinstance(field_id_set, str):
            field_id_set = {field_id_set}
            is_single_input = True

        # Get the .geom file that corresponds to this file (substitute existing extension for ".geom")
        geom_path = h.validate_and_expand_path(
            re.sub(pattern=r'\.[^.]*$', repl='.geom', string=str(file_path)))

        result: Dict[str, str] = dict()

        if not os.path.exists(geom_path):
            h.print_error('\n\nCould not find .geom file for "' + file_path +
                          '": "' + geom_path + '"')
            for field_id in field_id_set:
                # Since no .geom file was found, fill with nan values
                result[field_id] = np.nan

            return result

        if os.path.getsize(geom_path) == 0:
            h.print_error(
                '\n\nThe .geom file for "' + file_path + '": "' + geom_path +
                '" is empty (0 bytes).\n'
                'Bad file access may have caused this, so check the archive to see if the image and '
                'the .geom files in the archive are the same as the unzipped versions!\n'
            )
            for field_id in field_id_set:
                # Since the .geom file is empty, fill with nan values
                result[field_id] = np.nan

            return result

        if 'geom_checksum' in field_id_set:
            # Generate an md5 hash of the file's bytes to help ensure the correct data is
            # being referenced if compared elsewhere
            hashing = hashlib.md5()
            with open(geom_path, 'rb') as f:
                hashing.update(f.read())
            result['geom_checksum'] = hashing.hexdigest()
            field_id_set.remove('geom_checksum')

        with open(geom_path, 'r') as f:

            for line in f:

                # Stop scanning once every requested field has been found
                if len(field_id_set) == 0:
                    break

                field_id_set_full = field_id_set.copy()
                for field_id in field_id_set_full:
                    value = re.findall(field_id + r':\s+(.*)', line)
                    if len(value) == 1:
                        result[field_id] = str(value[0])
                        field_id_set.remove(field_id)

        if len(field_id_set) > 0:
            h.print_error('\nCould not find any values for fields ' +
                          str(field_id_set) + ' in ' + geom_path)
            for field_id in field_id_set:
                # Fill missing fields with nan values
                result[field_id] = np.nan

            return result

        if debug and verbosity >= 2:
            print('\rFound ' + str(len(result)) + ' value(s) in ' + geom_path,
                  end='')

            if verbosity >= 3:
                print()  # RIP your console if you get here

        if is_single_input and len(result) == 1:
            # Return the first (and only) value as a single string
            return str(list(result.values())[0])

        return result
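
The .geom files are read as plain 'key: value' text; a compact, self-contained sketch of the same regex extraction, run on made-up sample content:

import re
from typing import Dict, Set


def read_fields(text: str, wanted: Set[str]) -> Dict[str, str]:
    """Pull 'field: value' pairs out of .geom-style text for the requested fields."""
    found: Dict[str, str] = {}
    remaining = set(wanted)
    for line in text.splitlines():
        for field in list(remaining):
            match = re.findall(field + r':\s+(.*)', line)
            if len(match) == 1:
                found[field] = match[0]
                remaining.remove(field)
        if not remaining:
            break
    return found


sample = 'll_lat:  33.861\nll_lon:  -78.633\n'
print(read_fields(sample, {'ll_lat', 'll_lon'}))  # {'ll_lat': '33.861', 'll_lon': '-78.633'}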