Exemplo n.º 1
0
def validateOrthosAndFireball(options, fileType, logger):
    '''Validate ortho and fireball files within the current frame range.

    This is expected to be called in parallel for smaller chunks. Lidar
    files will be validated serially. Jpegs get validated when converted
    to tif.

    Parameters:
      options  - parsed options; uses outputFolder, startFrame, stopFrame.
      fileType - 'ortho' or 'fireball'; anything else raises Exception.
      logger   - logger used for progress and error messages.
    Returns True if all files are good (no invalid files were wiped).'''

    badFiles = False
    logger.info("Validating files of type: " + fileType)

    if fileType == 'ortho':
        dataFolder = icebridge_common.getOrthoFolder(options.outputFolder)
    elif fileType == 'fireball':
        dataFolder = icebridge_common.getFireballFolder(options.outputFolder)
    else:
        raise Exception("Unknown file type: " + fileType)

    def _removeQuietly(path):
        # Best-effort delete, equivalent to 'rm -f': never raises, and
        # (unlike os.system) is safe for paths containing spaces or
        # shell metacharacters.
        try:
            os.remove(path)
        except OSError:
            pass

    indexPath = icebridge_common.csvIndexFile(dataFolder)
    if not os.path.exists(indexPath):
        # The issue of what to do when the index does not exist should
        # have been settled by now.
        return (not badFiles)

    # Fetch from disk the set of already validated files, if any
    validFilesList = icebridge_common.validFilesList(options.outputFolder,
                                                     options.startFrame,
                                                     options.stopFrame)
    validFilesSet = set()
    validFilesSet = icebridge_common.updateValidFilesListFromDisk(
        validFilesList, validFilesSet)
    numInitialValidFiles = len(validFilesSet)

    def _isValidated(dataPath, xmlPath):
        # A data/xml pair counts as validated only if both entries are
        # recorded in the valid set AND both files are still on disk.
        return (dataPath in validFilesSet and os.path.exists(dataPath) and
                xmlPath in validFilesSet and os.path.exists(xmlPath))

    def _validateAndRecord(dataPath, xmlPath, checker, wipeMsg, validMsg):
        # Run 'checker' on dataPath. On failure wipe both the data file
        # and its xml (best-effort) and return False; on success record
        # both as validated and return True.
        if not checker(dataPath, logger):
            logger.info(wipeMsg + dataPath + ' ' + xmlPath)
            _removeQuietly(dataPath)
            _removeQuietly(xmlPath)
            return False
        logger.info(validMsg + dataPath)
        validFilesSet.add(dataPath)
        validFilesSet.add(xmlPath)
        return True

    (frameDict, urlDict) = icebridge_common.readIndexFile(indexPath,
                                                          prependFolder=True)
    for frame in frameDict.keys():

        # Only consider frames in the requested range
        if frame < options.startFrame or frame > options.stopFrame:
            continue

        outputPath = frameDict[frame]
        xmlFile = icebridge_common.xmlFile(outputPath)

        if _isValidated(outputPath, xmlFile):
            # Previously validated. Note: this also skips the tfw check
            # below, matching the long-standing behavior of this function.
            continue

        if not _validateAndRecord(outputPath, xmlFile,
                                  icebridge_common.hasValidChkSum,
                                  'Found invalid data. Will wipe: ',
                                  'Valid file: '):
            badFiles = True

        if fileType != 'fireball':
            continue

        # Also validate the tfw file (fireball DEMs only) and its own xml
        tfwFile = icebridge_common.tfwFile(outputPath)
        xmlFile = icebridge_common.xmlFile(tfwFile)
        if _isValidated(tfwFile, xmlFile):
            continue

        if not _validateAndRecord(tfwFile, xmlFile,
                                  icebridge_common.isValidTfw,
                                  'Found invalid tfw. Will wipe: ',
                                  'Valid tfw file: '):
            badFiles = True

    # Write to disk the list of validated files, but only if new
    # validations happened.  First re-read that list, in case a
    # different process modified it in the meantime, such as if two
    # managers are running at the same time.
    numFinalValidFiles = len(validFilesSet)
    if numInitialValidFiles != numFinalValidFiles:
        validFilesSet = icebridge_common.updateValidFilesListFromDisk(
            validFilesList, validFilesSet)
        icebridge_common.writeValidFilesList(validFilesList, validFilesSet)

    return (not badFiles)
def validateOrthosAndFireball(options, fileType, logger):
    '''Validate the ortho and fireball files inside the current frame
    range.  Meant to be run in parallel over smaller chunks; lidar files
    are validated serially and jpegs are validated during conversion to
    tif.  Returns True when no bad files were found.'''

    foundBad = False
    logger.info("Validating files of type: " + fileType)

    folderLookup = {
        'ortho':    icebridge_common.getOrthoFolder,
        'fireball': icebridge_common.getFireballFolder,
    }
    if fileType not in folderLookup:
        raise Exception("Unknown file type: " + fileType)
    dataFolder = folderLookup[fileType](options.outputFolder)

    indexPath = icebridge_common.csvIndexFile(dataFolder)
    if not os.path.exists(indexPath):
        # What to do about a missing index was decided earlier in the
        # pipeline; here just report success.
        return not foundBad

    # Load from disk the set of files already known to be valid, if any.
    validFilesList = icebridge_common.validFilesList(options.outputFolder,
                                                     options.startFrame,
                                                     options.stopFrame)
    validFilesSet = icebridge_common.updateValidFilesListFromDisk(
        validFilesList, set())
    numInitialValidFiles = len(validFilesSet)

    (frameDict, urlDict) = icebridge_common.readIndexFile(indexPath,
                                                          prependFolder=True)
    for frame, outputPath in frameDict.items():

        # Frames outside the requested range are not our business.
        if frame < options.startFrame or frame > options.stopFrame:
            continue

        xmlFile = icebridge_common.xmlFile(outputPath)

        alreadyGood = (outputPath in validFilesSet
                       and os.path.exists(outputPath)
                       and xmlFile in validFilesSet
                       and os.path.exists(xmlFile))
        if alreadyGood:
            # Previously validated, nothing more to do for this frame.
            continue

        if icebridge_common.hasValidChkSum(outputPath, logger):
            logger.info('Valid file: ' + outputPath)
            validFilesSet.add(outputPath)
            validFilesSet.add(xmlFile)
        else:
            logger.info('Found invalid data. Will wipe: ' + outputPath +
                        ' ' + xmlFile)
            os.system('rm -f ' + outputPath)  # will not throw
            os.system('rm -f ' + xmlFile)  # will not throw
            foundBad = True

        if fileType != 'fireball':
            continue

        # Fireball DEMs also carry a tfw file (with its own xml) to check.
        tfwFile = icebridge_common.tfwFile(outputPath)
        xmlFile = icebridge_common.xmlFile(tfwFile)
        tfwGood = (tfwFile in validFilesSet
                   and os.path.exists(tfwFile)
                   and xmlFile in validFilesSet
                   and os.path.exists(xmlFile))
        if tfwGood:
            continue

        if icebridge_common.isValidTfw(tfwFile, logger):
            logger.info('Valid tfw file: ' + tfwFile)
            validFilesSet.add(tfwFile)
            validFilesSet.add(xmlFile)
        else:
            logger.info('Found invalid tfw. Will wipe: ' + tfwFile + ' ' +
                        xmlFile)
            os.system('rm -f ' + tfwFile)  # will not throw
            os.system('rm -f ' + xmlFile)  # will not throw
            foundBad = True

    # Persist the valid-files list only if new validations happened.
    # Re-read it first, in case a different process modified it in the
    # meantime (e.g. two managers running at the same time).
    if numInitialValidFiles != len(validFilesSet):
        validFilesSet = icebridge_common.updateValidFilesListFromDisk(
            validFilesList, validFilesSet)
        icebridge_common.writeValidFilesList(validFilesList, validFilesSet)

    return not foundBad
Exemplo n.º 3
0
def doFetch(options, outputFolder):
    '''The main fetch function.
       Returns the number of failures.'''
    # NOTE(review): relies on module-level names defined elsewhere in this
    # file: 'logger', 'LIDAR_TYPES', 'MAX_IN_ONE_CALL', 'fetchNavData',
    # 'fetchAndParseIndexFile', 'lidarFilesInRange'.

    # Verify that required files exist.
    # NSIDC/Earthdata downloads require ~/.netrc credentials and a
    # ~/.urs_cookies cookie jar for curl.
    home = os.path.expanduser("~")
    if not (os.path.exists(home + '/.netrc')
            and os.path.exists(home + '/.urs_cookies')):
        logger.error(
            'Missing a required authentication file!  See instructions here:\n'
            +
            '    https://nsidc.org/support/faq/what-options-are-available-bulk-'
            + 'downloading-data-https-earthdata-login-enabled')
        # NOTE(review): returns -1 here even though the docstring says
        # "number of failures"; callers must treat a negative value as fatal.
        return -1

    # Base curl command: -n uses .netrc, -L follows redirects, and the
    # cookie options read/write the Earthdata session cookies.
    curlPath = asp_system_utils.which("curl")
    curlOpts = ' -n -L '
    cookiePaths = ' -b ~/.urs_cookies -c ~/.urs_cookies '
    baseCurlCmd = curlPath + curlOpts + cookiePaths

    logger.info('Creating output folder: ' + outputFolder)
    os.system('mkdir -p ' + outputFolder)

    # 'AN' is Antarctica (southern hemisphere); anything else is treated
    # as northern.
    isSouth = (options.site == 'AN')

    if options.type == 'nav':  # Nav fetching is much less complicated
        return fetchNavData(options, outputFolder)

    parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd,
                                             outputFolder)
    if not icebridge_common.fileNonEmpty(parsedIndexPath):
        # Some dirs are weird, both images, fireball dems, and ortho.
        # Just accept whatever there is, but with a warning.
        logger.info('Warning: Missing index file: ' + parsedIndexPath)

    # Store file information in a dictionary
    # - Keep track of the earliest and latest frame
    logger.info('Reading file list from ' + parsedIndexPath)
    try:
        (frameDict, urlDict) = icebridge_common.readIndexFile(parsedIndexPath)
    except:
        # We probably ran into old format index file. Must refetch.
        # NOTE(review): bare 'except' also hides unrelated errors
        # (e.g. KeyboardInterrupt); the refetch below is the best-effort
        # recovery path.
        logger.info('Could not read index file. Try again.')
        options.refetchIndex = True
        parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd,
                                                 outputFolder)
        (frameDict, urlDict) = icebridge_common.readIndexFile(parsedIndexPath)

    if options.stopAfterIndexFetch:
        return 0

    isLidar = (options.type in LIDAR_TYPES)

    allFrames = sorted(frameDict.keys())

    if not isLidar:
        # The lidar frames use a totally different numbering than the image/ortho/dem frames
        firstFrame = icebridge_common.getLargestFrame()  # start big
        lastFrame = icebridge_common.getSmallestFrame()  # start small
        for frameNumber in allFrames:
            if frameNumber < firstFrame:
                firstFrame = frameNumber
            if frameNumber > lastFrame:
                lastFrame = frameNumber

        # With --all-frames, widen the requested range to cover everything
        # present in the index.
        if options.allFrames:
            options.startFrame = firstFrame
            options.stopFrame = lastFrame

    if isLidar:
        # Based on image frames, determine which lidar frames to fetch.
        if options.ignoreMissingLidar and len(frameDict.keys()) == 0:
            # Nothing we can do if this run has no lidar and we are told to continue
            logger.info("Warning: missing lidar, but continuing.")
            lidarsToFetch = set()
        else:
            lidarsToFetch = lidarFilesInRange(frameDict, outputFolder,
                                              options.startFrame,
                                              options.stopFrame)

    # There is always a chance that not all requested frames are available.
    # That is particularly true for Fireball DEMs. Instead of failing,
    # just download what is present and give a warning.
    if options.startFrame not in frameDict and not isLidar:
        logger.info("Warning: Frame " + str(options.startFrame) +
                    " is not found in this flight.")

    if options.stopFrame and (options.stopFrame
                              not in frameDict) and not isLidar:
        logger.info("Warning: Frame " + str(options.stopFrame) +
                    " is not found in this flight.")

    allFilesToFetch = [
    ]  # Files that we will fetch, relative to the current dir.
    allUrlsToFetch = []  # Full url of each file.

    # Loop through all found frames within the provided range
    currentFileCount = 0
    lastFrame = ""
    if len(allFrames) > 0:
        lastFrame = allFrames[len(allFrames) - 1]

    # Which ancillary files accompany each data file depends on the type:
    # fireball DEMs have a tfw; lidar/ortho/fireball all have an xml.
    hasTfw = (options.type == 'fireball')
    hasXml = (isLidar or (options.type == 'ortho') or hasTfw)
    numFetched = 0
    skipCount = 0
    for frame in allFrames:

        # Skip frame outside of range
        if isLidar:
            if frameDict[frame] not in lidarsToFetch:
                continue
        else:
            if ((frame < options.startFrame) or (frame > options.stopFrame)):
                continue

        # Handle the frame skip option
        if options.frameSkip > 0:
            if skipCount < options.frameSkip:
                skipCount += 1
                continue
            skipCount = 0

        filename = frameDict[frame]

        # Some files have an associated xml file. Fireball DEMs also have a tfw file.
        currFilesToFetch = [filename]
        if hasXml:
            currFilesToFetch.append(icebridge_common.xmlFile(filename))
        if hasTfw:
            currFilesToFetch.append(icebridge_common.tfwFile(filename))

        for filename in currFilesToFetch:
            url = os.path.join(urlDict[frame], filename)
            outputPath = os.path.join(outputFolder, filename)
            allFilesToFetch.append(outputPath)
            allUrlsToFetch.append(url)

    # Restrict lidar fetch amount according to the parameter
    if (isLidar and options.maxNumLidarToFetch > 0
            and len(allFilesToFetch) > options.maxNumLidarToFetch):

        # Ensure an even number, to fetch both the lidar file and its xml
        if options.maxNumLidarToFetch % 2 == 1:
            options.maxNumLidarToFetch += 1

        allFilesToFetch = allFilesToFetch[0:options.maxNumLidarToFetch]
        allUrlsToFetch = allUrlsToFetch[0:options.maxNumLidarToFetch]

    icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL,
                                         options.dryRun, outputFolder,
                                         allFilesToFetch, allUrlsToFetch,
                                         logger)

    # Fetch from disk the set of already validated files, if any
    validFilesList = icebridge_common.validFilesList(
        os.path.dirname(outputFolder), options.startFrame, options.stopFrame)
    validFilesSet = set()
    validFilesSet = icebridge_common.updateValidFilesListFromDisk(
        validFilesList, validFilesSet)
    numInitialValidFiles = len(validFilesSet)

    # Verify that all files were fetched and are in good shape
    failedFiles = []
    for outputPath in allFilesToFetch:

        if options.skipValidate:
            continue

        if not icebridge_common.fileNonEmpty(outputPath):
            logger.info('Missing file: ' + outputPath)
            failedFiles.append(outputPath)
            continue

        if icebridge_common.hasImageExtension(outputPath):
            # Deliberately disabled code path, kept for reference.
            if False:
                # This check is just so slow. Turn it off for now.
                # This will impact only the validation of jpegs,
                # as the other files can be validated via the checksum.
                # Jpegs will be validated when converting them to 1 band images
                if outputPath in validFilesSet and os.path.exists(outputPath):
                    #logger.info('Previously validated: ' + outputPath)   # verbose
                    continue
                else:
                    if not icebridge_common.isValidImage(outputPath):
                        logger.info('Found an invalid image. Will wipe it: ' +
                                    outputPath)
                        if os.path.exists(outputPath): os.remove(outputPath)
                        failedFiles.append(outputPath)
                        continue
                    else:
                        logger.info('Valid image: ' + outputPath)
                        validFilesSet.add(outputPath)  # mark it as validated

        # Sanity check: XML files must have the right latitude.
        if icebridge_common.fileExtension(outputPath) == '.xml':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                #logger.info('Previously validated: ' + outputPath) #verbose
                continue
            else:
                if os.path.exists(outputPath):
                    try:
                        # A parse failure is taken as evidence of a
                        # corrupted xml file.
                        latitude = icebridge_common.parseLatitude(outputPath)
                        logger.info('Valid file: ' + outputPath)
                        validFilesSet.add(outputPath)  # mark it as validated
                    except:
                        # Corrupted file
                        logger.info("Failed to parse latitude, will wipe: " +
                                    outputPath)
                        if os.path.exists(outputPath): os.remove(outputPath)
                        failedFiles.append(outputPath)

                    # On a second thought, don't wipe files with wrong latitude, as
                    # next time we run fetch we will have to fetch them again.
                    # Hopefully they will be ignored.
                    #isGood = hasGoodLat(latitude, isSouth)
                    #if not isGood:
                    #    logger.info("Wiping XML file " + outputPath + " with bad latitude " + \
                    #                str(latitude))
                    #    os.remove(outputPath)
                    #    imageFile = icebridge_common.xmlToImage(outputPath)
                    #    if os.path.exists(imageFile):
                    #        logger.info("Wiping TIF file " + imageFile + " with bad latitude " + \
                    #                    str(latitude))
                    #        os.remove(imageFile)

        # Verify the chcksum (applies to the data files themselves, not to
        # their .xml/.tfw companions).
        if hasXml and len(outputPath) >= 4 and outputPath[-4:] != '.xml' \
               and outputPath[-4:] != '.tfw':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                #logger.info('Previously validated: ' + outputPath) # verbose
                continue
            else:
                isGood = icebridge_common.hasValidChkSum(outputPath, logger)
                if not isGood:
                    # Wipe both the data file and its xml so a later fetch
                    # retries them.
                    xmlFile = icebridge_common.xmlFile(outputPath)
                    logger.info('Found invalid data. Will wipe: ' +
                                outputPath + ' ' + xmlFile)
                    if os.path.exists(outputPath): os.remove(outputPath)
                    if os.path.exists(xmlFile): os.remove(xmlFile)
                    failedFiles.append(outputPath)
                    failedFiles.append(xmlFile)
                    continue
                else:
                    logger.info('Valid file: ' + outputPath)
                    validFilesSet.add(outputPath)

        # Fireball tfw files get their own validity check.
        if hasTfw and icebridge_common.fileExtension(outputPath) == '.tfw':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                #logger.info('Previously validated: ' + outputPath)
                continue
            else:
                isGood = icebridge_common.isValidTfw(outputPath, logger)
                if not isGood:
                    xmlFile = icebridge_common.xmlFile(outputPath)
                    logger.info('Found invalid tfw. Will wipe: ' + outputPath +
                                ' ' + xmlFile)
                    if os.path.exists(outputPath): os.remove(outputPath)
                    if os.path.exists(xmlFile): os.remove(xmlFile)
                    failedFiles.append(outputPath)
                    failedFiles.append(xmlFile)
                    continue
                else:
                    logger.info('Valid tfw file: ' + outputPath)
                    validFilesSet.add(outputPath)

    # Write to disk the list of validated files, but only if new
    # validations happened.  First re-read that list, in case a
    # different process modified it in the meantime, such as if two
    # managers are running at the same time.
    numFinalValidFiles = len(validFilesSet)
    if numInitialValidFiles != numFinalValidFiles:
        validFilesSet = \
                      icebridge_common.updateValidFilesListFromDisk(validFilesList, validFilesSet)
        icebridge_common.writeValidFilesList(validFilesList, validFilesSet)

    numFailed = len(failedFiles)
    if numFailed > 0:
        logger.info("Number of files that could not be processed: " +
                    str(numFailed))

    return numFailed
Exemplo n.º 4
0
def fetchAndParseIndexFileAux(isSouth, separateByLat, dayVal, baseCurlCmd,
                              folderUrl, path, fileType):
    '''Retrieve the index file for a folder of data and create
    a parsed version of it that contains frame number / filename pairs.

    Parameters:
      isSouth       - True for Antarctic flights; used to judge latitudes.
      separateByLat - separate GR/AN files sharing frame numbers by latitude.
      dayVal        - 0 for the current day's files, 1 for spillover files
                      from the previous day (see the big-gap logic below).
      baseCurlCmd   - curl command prefix with auth/cookie options.
      folderUrl     - remote folder whose index.html is fetched.
      path          - local path where the html index is written (and wiped).
      fileType      - one of: jpeg, ortho, fireball, lvis, atm1, atm2.
    Returns (frameDict, urlDict) mapping frame number -> filename / url.
    Raises Exception if two files share a frame number (non-lidar types).'''

    # Download the html file
    curlCmd = baseCurlCmd + ' ' + folderUrl + ' > ' + path
    logger.info(curlCmd)
    p = subprocess.Popen(curlCmd, shell=True)
    os.waitpid(p.pid, 0)

    # Find all the file names in the index file and
    #  dump them to a new index file
    logger.info('Extracting file name list from index.html file...')
    with open(path, 'r') as f:
        indexText = f.read()

    # Must wipe this html file. We fetch it too often in different
    # contexts.  If not wiped, the code fails to work in some
    # very rare but real situations.
    if os.path.exists(path):
        os.remove(path)

    # Extract just the file names. The patterns are raw strings (so the
    # \w escapes are valid regex syntax rather than invalid Python string
    # escapes) and literal dots are escaped.
    fileList = []  # ensure initialization
    if fileType == 'jpeg':
        fileList = re.findall(r">[0-9_]*\.JPG", indexText, re.IGNORECASE)
    if fileType == 'ortho':
        fileList = re.findall(r">DMS\w*\.tif<", indexText, re.IGNORECASE)
    if fileType == 'fireball':
        # Fireball DEMs
        fileList = re.findall(r">IODMS\w*DEM\.tif", indexText, re.IGNORECASE)
    if fileType == 'lvis':
        fileList = re.findall(r">ILVIS\w+\.TXT", indexText, re.IGNORECASE)
    if fileType == 'atm1':
        fileList = re.findall(r">ILATM1B[0-9_]*\.ATM4\w+\.qi", indexText,
                              re.IGNORECASE)
        #                      >ILATM1B_20111018_145455.ATM4BT4.qi
        #   or                 >ILATM1B_20091016_165112.atm4cT3.qi
    if fileType == 'atm2':
        # Match ILATM1B_20160713_195419.ATM5BT5.h5
        fileList = re.findall(r">ILATM1B[0-9_]*\.ATM\w+\.h5", indexText,
                              re.IGNORECASE)

    # Get rid of '>' and '<'
    fileList = [fileName.replace(">", "").replace("<", "")
                for fileName in fileList]

    # Some runs, eg, https://n5eil01u.ecs.nsidc.org/ICEBRIDGE/IODMS1B.001/2015.09.24
    # have files for both GR and AN, with same frame number. Those need to be separated
    # by latitude. This is a problem only with orthoimages.
    badXmls = set()
    outputFolder = os.path.dirname(path)
    if separateByLat:
        # Fetch the xml of every file; its latitude tells us which
        # hemisphere the file belongs to.
        allFilesToFetch = []
        allUrlsToFetch = []
        for filename in fileList:
            xmlFile = icebridge_common.xmlFile(filename)
            url = os.path.join(folderUrl, xmlFile)
            outputPath = os.path.join(outputFolder, xmlFile)
            allFilesToFetch.append(outputPath)
            allUrlsToFetch.append(url)

        dryRun = False
        icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL,
                                             dryRun, outputFolder,
                                             allFilesToFetch, allUrlsToFetch,
                                             logger)

        # Mark the bad ones
        for xmlFile in allFilesToFetch:
            latitude = icebridge_common.parseLatitude(xmlFile)
            isGood = hasGoodLat(latitude, isSouth)
            if not isGood:
                badXmls.add(xmlFile)

    elif (fileType == 'ortho' or fileType == 'fireball'):
        # Sometimes there is a large gap in the timestamp. That means orthoimages
        # from previous day are spilling over. If dayVal is 0, we must ignore
        # the spillover images. If dayVal is 1, we must keep the spillover images
        # and igore the others.
        list1 = []
        list2 = []
        isBigGap = False
        prevStamp = -1
        for filename in fileList:
            [imageDateString,
             imageTimeString] = icebridge_common.parseTimeStamps(filename)
            currStamp = float(imageTimeString) / 1000000.0  # hours
            if prevStamp < 0:
                list1.append(filename)
                prevStamp = currStamp
                continue

            # Note that once isBigGap becomes true, it stays true
            # even when the gap gets small again
            if currStamp - prevStamp >= 6:  # six hour gap is a lot
                isBigGap = True
            if not isBigGap:
                list1.append(filename)
            else:
                list2.append(filename)

            prevStamp = currStamp  # for next iteration

        if isBigGap:
            if dayVal == 0:
                fileList = list2[:]  # current day
            else:
                fileList = list1[:]  # spillover from prev day

    # For each entry that matched the regex, record: the frame number and the file name.
    frameDict = {}
    urlDict = {}
    badFiles = []
    for filename in fileList:

        if len(badXmls) > 0:
            # Skip files whose xml showed the wrong hemisphere.
            xmlFile = os.path.join(outputFolder,
                                   icebridge_common.xmlFile(filename))
            if xmlFile in badXmls:
                continue

        frame = icebridge_common.getFrameNumberFromFilename(filename)
        if frame in frameDict.keys():

            # The same frame must not occur twice.
            if fileType not in LIDAR_TYPES:
                logger.error("Error: Found two file names with same frame number: " + \
                             frameDict[frame] + " and " + filename)
                badFiles.append(filename)
                badFiles.append(frameDict[frame])

        # note that folderUrl can vary among orthoimages, as sometimes
        # some of them are in a folder for the next day.
        frameDict[frame] = filename
        urlDict[frame] = folderUrl

    # Wipe them all, to be sorted later
    for badFile in badFiles:
        if os.path.exists(badFile):
            logger.info("Deleting: " + badFile)
            os.remove(badFile)
        xmlFile = icebridge_common.xmlFile(badFile)
        if os.path.exists(xmlFile):
            logger.info("Deleting: " + xmlFile)
            os.remove(xmlFile)

    if len(badFiles) > 0:
        raise Exception("Found files with same frame number")

    return (frameDict, urlDict)
def convertLidarDataToCsv(lidarFolder, startFrame, stopFrame, skipValidate,
                          logger):
    '''Make sure all lidar data is available in a readable text format.
       Returns false if any files failed to convert.'''

    logger.info('Converting LIDAR files...')

    lidarIndexPath = icebridge_common.csvIndexFile(lidarFolder)
    (frameDict, urlDict) = icebridge_common.readIndexFile(lidarIndexPath)

    # validFilesSet / numInitialValidFiles are only defined when
    # skipValidate is False; all later uses are guarded the same way.
    if not skipValidate:
        validFilesList = icebridge_common.validFilesList(
            os.path.dirname(lidarFolder), startFrame, stopFrame)
        validFilesSet = set()
        validFilesSet = icebridge_common.updateValidFilesListFromDisk(
            validFilesList, validFilesSet)
        numInitialValidFiles = len(validFilesSet)

    # Maps frame number -> name of the converted (plain text) file.
    convDict = {}

    # Loop through all files in the folder
    badFiles = False
    for frame in sorted(frameDict.keys()):

        f = frameDict[frame]
        extension = icebridge_common.fileExtension(f)

        # Only interested in a few file types
        if (extension != '.qi') and (extension != '.hdf5') and (extension !=
                                                                '.h5'):
            convDict[frame] = f  # these are already in plain text
            continue

        # Binary lidar formats get converted to a .csv next to the input.
        convDict[frame] = os.path.splitext(f)[0] + '.csv'
        outputPath = os.path.join(lidarFolder, convDict[frame])

        # Handle paths
        fullPath = os.path.join(lidarFolder, f)
        if not os.path.exists(fullPath):
            logger.info("Cannot convert missing file: " + fullPath)
            continue

        # If the input is invalid, wipe both it, its xml, and the output
        # Hopefully there will be a subsquent fetch step where it will get
        # refetched.
        if not icebridge_common.hasValidChkSum(fullPath, logger):
            logger.info("Will wipe invalid file: " + fullPath)
            xmlFile = icebridge_common.xmlFile(fullPath)
            os.system('rm -f ' + fullPath)  # will not throw
            os.system('rm -f ' + xmlFile)  # will not throw
            os.system('rm -f ' + outputPath)  # will not throw
            badFiles = True
            continue

        # Skip existing valid files
        if skipValidate:
            if os.path.exists(outputPath):
                logger.info("File exists, skipping: " + outputPath)
                continue
        else:
            if outputPath in validFilesSet and os.path.exists(outputPath):
                #logger.info('Previously validated: ' + outputPath) # verbose
                continue
            if icebridge_common.isValidLidarCSV(outputPath):
                #logger.info("File exists and is valid, skipping: " + outputPath)
                continue

        # Call the conversion
        logger.info("Process " + fullPath)
        extract_icebridge_ATM_points.main([fullPath])

        # Check the result
        if not icebridge_common.isValidLidarCSV(outputPath):
            logger.error('Failed to parse LIDAR file, will wipe: ' +
                         outputPath)
            os.system('rm -f ' + outputPath)  # will not throw
            badFiles = True
        else:
            if not skipValidate:
                validFilesSet.add(outputPath)  # mark it as validated

    convLidarFile = icebridge_common.getConvertedLidarIndexFile(lidarFolder)

    # Decide whether the converted-files index needs (re)writing.
    willWriteConvFile = False
    if not os.path.exists(convLidarFile):
        willWriteConvFile = True
    else:
        # Bugfix: Sometimes the written converted file has the wrong size, maybe
        # something got interrupted earlier.
        (lidarDictIn,
         dummyUrlDict) = icebridge_common.readIndexFile(convLidarFile)
        if lidarDictIn != convDict:
            willWriteConvFile = True

    if willWriteConvFile:
        logger.info("Writing: " + convLidarFile)
        icebridge_common.writeIndexFile(convLidarFile, convDict, {})

    if not skipValidate:
        # Write to disk the list of validated files, but only if new
        # validations happened.  First re-read that list, in case a
        # different process modified it in the meantime, such as if two
        # managers are running at the same time.
        numFinalValidFiles = len(validFilesSet)
        if numInitialValidFiles != numFinalValidFiles:
            validFilesSet = icebridge_common.updateValidFilesListFromDisk(
                validFilesList, validFilesSet)
            icebridge_common.writeValidFilesList(validFilesList, validFilesSet)

    return (not badFiles)
Exemplo n.º 6
0
def fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder):
    '''Create a list of all files that must be fetched unless done already.

    Downloads the remote HTML index (for one flight day, or for two
    consecutive days when a flight's data is split across them), extracts
    the data file names via fetchAndParseIndexFileAux, and writes a single
    combined CSV index (frame -> filename, frame -> URL) into outputFolder.
    Returns the path of that parsed CSV index; if a non-empty parsed index
    already exists it is kept and returned immediately.

    Side effects: for lidar, options.type is narrowed to the specific lidar
    type that was found, and options.allFrames is set to True.

    Raises Exception when several lidar sources exist but none has a good
    latitude, unless options.ignoreMissingLidar is set.
    '''

    # For AN 20091112, etc, some of the ortho images are stored at the
    # beginning of the next day's flight. Need to sort this out, and
    # it is tricky. More comments within the code.
    fetchNextDay = True

    separateByLat = (options.type == 'ortho'
                     and isInSeparateByLatTable(options.yyyymmdd))
    if separateByLat:
        # Here we won't fetch the next day, we will just separate by latitude within
        # a given day
        fetchNextDay = False

    orthoOrFireball = ((options.type == 'ortho')
                       or (options.type == 'fireball'))

    if fetchNextDay:
        # Normally we fetch for next day only for ortho or fireball. However,
        # for one single special flight, we do it for jpeg too, as then
        # the jpegs are also split.
        if orthoOrFireball or \
           ((options.type == 'jpeg') and twoFlightsInOneDay(options.site, options.yyyymmdd)):
            fetchNextDay = True
        else:
            fetchNextDay = False

    # If we need to parse the next flight day as well, as expected in some runs,
    # we will fetch two html files, but create a single index out of them.
    # dayVal 0 = the requested day, dayVal 1 = the following day.
    dayVals = [0]
    if fetchNextDay:
        dayVals.append(1)

    indexPath = icebridge_common.htmlIndexFile(outputFolder)

    currIndexPath = indexPath
    parsedIndexPath = icebridge_common.csvIndexFile(outputFolder)

    if options.refetchIndex:
        # Wipe any cached indices so they get re-downloaded below.
        os.system('rm -f ' + indexPath)
        os.system('rm -f ' + parsedIndexPath)

    if icebridge_common.fileNonEmpty(parsedIndexPath):
        logger.info('Already have the index file ' + parsedIndexPath +
                    ', keeping it.')
        return parsedIndexPath

    frameDict = {}
    urlDict = {}

    # We need the list of jpeg frames. Sometimes when fetching ortho images,
    # and we have to fetch from the next day, don't fetch unless
    # in the jpeg index.
    if len(dayVals) > 1 and options.type != 'jpeg':
        jpegFolder = icebridge_common.getJpegFolder(
            os.path.dirname(outputFolder))
        jpegIndexPath = icebridge_common.csvIndexFile(jpegFolder)
        (jpegFrameDict,
         jpegUrlDict) = icebridge_common.readIndexFile(jpegIndexPath)

    orthoStamp = {}
    if options.type == 'fireball':
        # This is a bugfix. Ensure that the fireball DEM has not just
        # the same frame number, but also same timestamp as the ortho.
        orthoFolder = icebridge_common.getOrthoFolder(
            os.path.dirname(outputFolder))
        orthoIndexPath = icebridge_common.csvIndexFile(orthoFolder)
        (orthoFrameDict,
         orthoUrlDict) = icebridge_common.readIndexFile(orthoIndexPath)
        for frame in sorted(orthoFrameDict.keys()):
            filename = orthoFrameDict[frame]
            [imageDateString,
             imageTimeString] = icebridge_common.parseTimeStamps(filename)
            orthoStamp[frame] = imageTimeString

    for dayVal in dayVals:

        if len(dayVals) > 1:
            # Keep a separate html file per day so the two fetches do not
            # clobber each other.
            currIndexPath = indexPath + '.day' + str(dayVal)
            if options.refetchIndex:
                os.system('rm -f ' + currIndexPath)

        # Find folderUrl which contains all of the files
        if options.type in LIDAR_TYPES:
            options.allFrames = True  # For lidar, always get all the frames!

            # For lidar, the data can come from one of three sources.
            # Unfortunately sometimes there is more than one source, and then
            # we need to pick by latitude.
            folderUrls = []
            lidar_types = []
            for lidar in LIDAR_TYPES:
                folderUrl = getFolderUrl(
                    options.yyyymmdd,
                    options.year,
                    options.month,
                    options.day,
                    dayVal,  # note here the dayVal
                    options.site,
                    lidar)
                logger.info('Checking lidar URL: ' + folderUrl)
                # NOTE(review): a later copy of this function calls
                # checkIfUrlExists(folderUrl, baseCurlCmd) — confirm which
                # signature the current checkIfUrlExists expects.
                if checkIfUrlExists(folderUrl):
                    logger.info('Found match with lidar type: ' + lidar)
                    folderUrls.append(folderUrl)
                    lidar_types.append(lidar)

            if len(folderUrls) == 0:
                logger.info(
                    'WARNING: Could not find any lidar data for the given date!'
                )

            elif len(folderUrls) == 1:
                # Unique solution
                folderUrl = folderUrls[0]
                options.type = lidar_types[0]

            elif len(folderUrls) >= 2:
                # Multiple solutions. Pick the good one by latitude.
                logger.info("Multiples URLs to search: " +
                            " ".join(folderUrls))
                count = -1
                isGood = False
                for folderUrl in folderUrls:
                    count += 1
                    (localFrameDict, localUrlDict) = \
                                     fetchAndParseIndexFileAux(isSouth,
                                                               separateByLat, dayVal,
                                                               baseCurlCmd, folderUrl,
                                                               currIndexPath,
                                                               lidar_types[count])
                    # Probe just the first file of this source: download its
                    # xml, read the latitude, and decide if the source matches
                    # the requested hemisphere.
                    for frame in sorted(localFrameDict.keys()):
                        filename = localFrameDict[frame]
                        xmlFile = icebridge_common.xmlFile(filename)
                        url = os.path.join(folderUrl, xmlFile)

                        # Download the file
                        # (the xml lands in the current working directory and
                        # is removed right after the latitude probe below)
                        curlCmd = baseCurlCmd + ' ' + url + ' > ' + xmlFile
                        logger.info(curlCmd)
                        p = subprocess.Popen(curlCmd, shell=True)
                        os.waitpid(p.pid, 0)

                        latitude = icebridge_common.parseLatitude(xmlFile)
                        if os.path.exists(xmlFile): os.remove(xmlFile)

                        if hasGoodLat(latitude, isSouth):
                            isGood = True
                            options.type = lidar_types[count]
                            logger.info("Good latitude " + str(latitude) +
                                        ", will use " + folderUrl +
                                        " of type " + lidar_types[count])
                        else:
                            logger.info("Bad latitude " + str(latitude) +
                                        ", will not use " + folderUrl +
                                        " of type " + lidar_types[count])

                        # Stop at first file no matter what
                        break

                    if isGood:
                        break

                if not isGood:
                    if options.type in LIDAR_TYPES and options.ignoreMissingLidar:
                        logger.info("No lidar. None of these URLs are good: " +
                                    " ".join(folderUrls))
                    else:
                        raise Exception("None of these URLs are good: " +
                                        " ".join(folderUrls))

        else:  # Other cases are simpler
            folderUrl = getFolderUrl(
                options.yyyymmdd,
                options.year,
                options.month,
                options.day,
                dayVal,  # note here the dayVal
                options.site,
                options.type)

        logger.info('Fetching from URL: ' + folderUrl)
        (localFrameDict, localUrlDict) = \
                         fetchAndParseIndexFileAux(isSouth,
                                                   separateByLat, dayVal,
                                                   baseCurlCmd, folderUrl,
                                                   currIndexPath, options.type)

        # Append to the main index
        for frame in sorted(localFrameDict.keys()):

            if options.type == 'fireball':
                # This is a bugfix. Ensure that the fireball DEM has not just
                # the same frame number, but also same timestamp as the ortho.
                # Otherwise we may accidentally getting one from next day.
                [imageDateString, imageTimeString] = \
                                  icebridge_common.parseTimeStamps(localFrameDict[frame])
                if frame not in orthoStamp:
                    #logger.info("Missing ortho for fireball: " + localFrameDict[frame])
                    continue
                if abs(int(imageTimeString) - int(orthoStamp[frame])) > 1000:
                    # Apparently a tolerance is needed. Use 10 seconds, so the number 1000.
                    #logger.info("Will not use fireball DEM whose timestamp differs from ortho.")
                    #logger.info("Fireball is: " + localFrameDict[frame])
                    #logger.info("Ortho is:    " + orthoFrameDict[frame])
                    continue

            # Fetch from next day, unless already have a value. And don't fetch
            # frames not in the jpeg index.
            if len(dayVals) > 1 and options.type != 'jpeg':
                if not frame in jpegFrameDict.keys(): continue
                if frame in frameDict.keys(): continue

            frameDict[frame] = localFrameDict[frame]
            urlDict[frame] = localUrlDict[frame]

    # Write the combined index file
    icebridge_common.writeIndexFile(parsedIndexPath, frameDict, urlDict)

    return parsedIndexPath
def fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder):
    '''Create a list of all files that must be fetched unless done already.

    NOTE(review): this is a second, near-duplicate definition of
    fetchAndParseIndexFile in this file; being later, it shadows the
    earlier one at import time. It differs from the earlier copy mainly in
    passing baseCurlCmd to checkIfUrlExists and in using
    universal_newlines with subprocess.Popen. One of the two copies is
    likely stale and should be removed.

    Writes a combined CSV index (frame -> filename, frame -> URL) into
    outputFolder and returns its path. For lidar, may narrow options.type
    to the found lidar type and sets options.allFrames to True.
    '''

    # For AN 20091112, etc, some of the ortho images are stored at the
    # beginning of the next day's flight. Need to sort this out, and
    # it is tricky. More comments within the code. 
    fetchNextDay = True
    
    separateByLat = (options.type == 'ortho' and isInSeparateByLatTable(options.yyyymmdd))
    if separateByLat:
        # Here we won't fetch the next day, we will just separate by latitude within
        # a given day
        fetchNextDay = False

    orthoOrFireball = ( (options.type == 'ortho') or (options.type == 'fireball') )

    if fetchNextDay:
        # Normally we fetch for next day only for ortho or fireball. However,
        # for one single special flight, we do it for jpeg too, as then
        # the jpegs are also split. 
       if orthoOrFireball or \
          ((options.type == 'jpeg') and twoFlightsInOneDay(options.site, options.yyyymmdd)):
           fetchNextDay = True
       else:
           fetchNextDay = False
           
    # If we need to parse the next flight day as well, as expected in some runs,
    # we will fetch two html files, but create a single index out of them.
    dayVals = [0]
    if fetchNextDay:
        dayVals.append(1)

    indexPath       = icebridge_common.htmlIndexFile(outputFolder)
    
    currIndexPath   = indexPath
    parsedIndexPath = icebridge_common.csvIndexFile(outputFolder)

    if options.refetchIndex:
        # Wipe any cached indices so they get re-downloaded below.
        os.system('rm -f ' + indexPath)
        os.system('rm -f ' + parsedIndexPath)

    if icebridge_common.fileNonEmpty(parsedIndexPath):
        logger.info('Already have the index file ' + parsedIndexPath + ', keeping it.')
        return parsedIndexPath
    
    frameDict  = {}
    urlDict    = {}

    # We need the list of jpeg frames. Sometimes when fetching ortho images,
    # and we have to fetch from the next day, don't fetch unless
    # in the jpeg index.
    if len(dayVals) > 1 and options.type != 'jpeg':
        jpegFolder = icebridge_common.getJpegFolder(os.path.dirname(outputFolder))
        jpegIndexPath = icebridge_common.csvIndexFile(jpegFolder)
        (jpegFrameDict, jpegUrlDict) = icebridge_common.readIndexFile(jpegIndexPath)

    orthoStamp = {}
    if options.type == 'fireball':
        # This is a bugfix. Ensure that the fireball DEM has not just
        # the same frame number, but also same timestamp as the ortho.
        orthoFolder = icebridge_common.getOrthoFolder(os.path.dirname(outputFolder))
        orthoIndexPath = icebridge_common.csvIndexFile(orthoFolder)
        (orthoFrameDict, orthoUrlDict) = icebridge_common.readIndexFile(orthoIndexPath)
        for frame in sorted(orthoFrameDict.keys()):
            filename = orthoFrameDict[frame]
            [imageDateString, imageTimeString] = icebridge_common.parseTimeStamps(filename)
            orthoStamp[frame] = imageTimeString
            
    for dayVal in dayVals:

        if len(dayVals) > 1:
            # Keep a separate html file per day so the two fetches do not
            # clobber each other.
            currIndexPath = indexPath + '.day' + str(dayVal)
            if options.refetchIndex:
                os.system('rm -f ' + currIndexPath)
            
        # Find folderUrl which contains all of the files
        if options.type in LIDAR_TYPES:
            options.allFrames = True # For lidar, always get all the frames!
            
            # For lidar, the data can come from one of three sources.
            # Unfortunately sometimes there is more than one source, and then
            # we need to pick by latitude.
            folderUrls = []
            lidar_types = []
            for lidar in LIDAR_TYPES:
                folderUrl = getFolderUrl(options.yyyymmdd, options.year, options.month,
                                         options.day, dayVal, # note here the dayVal
                                         options.site, lidar)
                logger.info('Checking lidar URL: ' + folderUrl)
                if checkIfUrlExists(folderUrl, baseCurlCmd):
                    logger.info('Found match with lidar type: ' + lidar)
                    folderUrls.append(folderUrl)
                    lidar_types.append(lidar)

            if len(folderUrls) == 0:
                logger.info('WARNING: Could not find any lidar data for the given date!')

            elif len(folderUrls) == 1:
                # Unique solution
                folderUrl = folderUrls[0]
                options.type = lidar_types[0]

            elif len(folderUrls) >= 2:
                # Multiple solutions. Pick the good one by latitude.
                logger.info("Multiples URLs to search: " + " ".join(folderUrls))
                count = -1
                isGood = False
                for folderUrl in folderUrls:
                    count += 1
                    (localFrameDict, localUrlDict) = \
                                     fetchAndParseIndexFileAux(isSouth,
                                                               separateByLat, dayVal,
                                                               baseCurlCmd, folderUrl,
                                                               currIndexPath,
                                                               lidar_types[count])
                    # Probe just the first file of this source: download its
                    # xml, read the latitude, and decide if the source matches
                    # the requested hemisphere.
                    for frame in sorted(localFrameDict.keys()):
                        filename = localFrameDict[frame]
                        xmlFile  = icebridge_common.xmlFile(filename)
                        url      = os.path.join(folderUrl, xmlFile)
                                        
                        # Download the file
                        # (the xml lands in the current working directory and
                        # is removed right after the latitude probe below)
                        curlCmd = baseCurlCmd + ' ' + url + ' > ' + xmlFile
                        logger.info(curlCmd)
                        p = subprocess.Popen(curlCmd, shell=True, universal_newlines=True)
                        os.waitpid(p.pid, 0)
                        
                        latitude = icebridge_common.parseLatitude(xmlFile)
                        if os.path.exists(xmlFile): os.remove(xmlFile)

                        if hasGoodLat(latitude, isSouth):
                            isGood = True
                            options.type = lidar_types[count]
                            logger.info("Good latitude " + str(latitude) + ", will use " +
                                        folderUrl + " of type " + lidar_types[count])
                        else:
                            logger.info("Bad latitude " + str(latitude) + ", will not use " +
                                        folderUrl + " of type " + lidar_types[count])
                            
                        # Stop at first file no matter what
                        break

                    if isGood:
                        break

                if not isGood:
                    if options.type in LIDAR_TYPES and options.ignoreMissingLidar:
                        logger.info("No lidar. None of these URLs are good: " +
                                    " ".join(folderUrls))
                    else:
                        raise Exception("None of these URLs are good: " +
                                        " ".join(folderUrls))
                    
        else: # Other cases are simpler
            folderUrl = getFolderUrl(options.yyyymmdd, options.year, options.month,
                                     options.day, dayVal, # note here the dayVal
                                     options.site, options.type)

        logger.info('Fetching from URL: ' + folderUrl)
        (localFrameDict, localUrlDict) = \
                         fetchAndParseIndexFileAux(isSouth,
                                                   separateByLat, dayVal,
                                                   baseCurlCmd, folderUrl,
                                                   currIndexPath, options.type)

        # Append to the main index
        for frame in sorted(localFrameDict.keys()):

            if options.type == 'fireball':
                # This is a bugfix. Ensure that the fireball DEM has not just
                # the same frame number, but also same timestamp as the ortho.
                # Otherwise we may accidentally getting one from next day.
                [imageDateString, imageTimeString] = \
                                  icebridge_common.parseTimeStamps(localFrameDict[frame])
                if frame not in orthoStamp:
                    #logger.info("Missing ortho for fireball: " + localFrameDict[frame])
                    continue
                if abs(int(imageTimeString) - int(orthoStamp[frame])) > 1000:
                    # Apparently a tolerance is needed. Use 10 seconds, so the number 1000.
                    #logger.info("Will not use fireball DEM whose timestamp differs from ortho.")
                    #logger.info("Fireball is: " + localFrameDict[frame])
                    #logger.info("Ortho is:    " + orthoFrameDict[frame])
                    continue
                
            # Fetch from next day, unless already have a value. And don't fetch
            # frames not in the jpeg index.
            if len(dayVals) > 1 and options.type != 'jpeg':
                if not frame in jpegFrameDict.keys(): continue
                if frame in frameDict.keys(): continue
                
            frameDict[frame] = localFrameDict[frame]
            urlDict[frame]   = localUrlDict[frame]
        
    # Write the combined index file
    icebridge_common.writeIndexFile(parsedIndexPath, frameDict, urlDict)
            
    return parsedIndexPath
def doFetch(options, outputFolder):
    '''The main fetch function.

       Fetches and parses the index for options.type, selects the frames in
       the requested range, downloads the data files (plus any associated
       xml/tfw files) in batches, then validates what was fetched and wipes
       invalid files so they can be re-fetched on a later run.

       Returns the number of failures, or -1 if the required Earthdata
       authentication files (~/.netrc, ~/.urs_cookies) are missing.'''
    
    # Verify that required files exist
    home = os.path.expanduser("~")
    if not (os.path.exists(home+'/.netrc') and os.path.exists(home+'/.urs_cookies')):
        logger.error('Missing a required authentication file!  See instructions here:\n' +
                     '    https://nsidc.org/support/faq/what-options-are-available-bulk-' +
                     'downloading-data-https-earthdata-login-enabled')
        return -1
    
    # Base curl command: -n reads credentials from ~/.netrc, -L follows
    # redirects, -b/-c read and write the Earthdata cookie jar.
    curlPath = asp_system_utils.which("curl")
    curlOpts    = ' -n -L '
    cookiePaths = ' -b ~/.urs_cookies -c ~/.urs_cookies '
    baseCurlCmd = curlPath + curlOpts + cookiePaths

    logger.info('Creating output folder: ' + outputFolder)
    os.system('mkdir -p ' + outputFolder)  

    isSouth = (options.site == 'AN')
    
    if options.type == 'nav': # Nav fetching is much less complicated
        return fetchNavData(options, outputFolder)
    
    parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder)
    if not icebridge_common.fileNonEmpty(parsedIndexPath):
        # Some dirs are weird, both images, fireball dems, and ortho.
        # Just accept whatever there is, but with a warning.
        logger.info('Warning: Missing index file: ' + parsedIndexPath)

    # Store file information in a dictionary
    # - Keep track of the earliest and latest frame
    logger.info('Reading file list from ' + parsedIndexPath)
    try:
        (frameDict, urlDict) = icebridge_common.readIndexFile(parsedIndexPath)
    except:
        # NOTE(review): bare except — this also swallows unrelated errors
        # (even KeyboardInterrupt); consider narrowing the exception type.
        # We probably ran into old format index file. Must refetch.
        logger.info('Could not read index file. Try again.')
        options.refetchIndex = True
        parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder)
        (frameDict, urlDict) = icebridge_common.readIndexFile(parsedIndexPath)

    if options.stopAfterIndexFetch:
        return 0
    
    isLidar = (options.type in LIDAR_TYPES)

    allFrames  = sorted(frameDict.keys())
    
    if not isLidar:
        # The lidar frames use a totally different numbering than the image/ortho/dem frames
        firstFrame = icebridge_common.getLargestFrame()    # start big
        lastFrame  = icebridge_common.getSmallestFrame()   # start small
        for frameNumber in allFrames:
            if frameNumber < firstFrame:
                firstFrame = frameNumber
            if frameNumber > lastFrame:
                lastFrame = frameNumber

        if options.allFrames:
            options.startFrame = firstFrame
            options.stopFrame  = lastFrame

    if isLidar:
        # Based on image frames, determine which lidar frames to fetch.
        if options.ignoreMissingLidar and len(frameDict.keys()) == 0:
            # Nothing we can do if this run has no lidar and we are told to continue
            logger.info("Warning: missing lidar, but continuing.")
            lidarsToFetch = set()
        else:
            lidarsToFetch = lidarFilesInRange(frameDict, outputFolder,
                                              options.startFrame, options.stopFrame)
        
    # There is always a chance that not all requested frames are available.
    # That is particularly true for Fireball DEMs. Instead of failing,
    # just download what is present and give a warning. 
    if options.startFrame not in frameDict and not isLidar:
        logger.info("Warning: Frame " + str(options.startFrame) +
                    " is not found in this flight.")
                    
    if options.stopFrame and (options.stopFrame not in frameDict) and not isLidar:
        logger.info("Warning: Frame " + str(options.stopFrame) +
                    " is not found in this flight.")

    allFilesToFetch = [] # Files that we will fetch, relative to the current dir. 
    allUrlsToFetch  = [] # Full url of each file.
    
    # Loop through all found frames within the provided range
    # NOTE(review): currentFileCount, lastFrame and numFetched are set here
    # but never used later in this function — likely leftovers.
    currentFileCount = 0
    lastFrame = ""
    if len(allFrames) > 0:
        lastFrame = allFrames[len(allFrames)-1]

    hasTfw = (options.type == 'fireball')
    hasXml = ( isLidar or (options.type == 'ortho') or hasTfw )
    numFetched = 0
    skipCount  = 0
    for frame in allFrames:

        # Skip frame outside of range
        if isLidar:
            if frameDict[frame] not in lidarsToFetch:
                continue
        else:       
            if ((frame < options.startFrame) or (frame > options.stopFrame) ):
                continue
                
        # Handle the frame skip option
        if options.frameSkip > 0: 
            if skipCount < options.frameSkip:
                skipCount += 1
                continue
            skipCount = 0

        filename = frameDict[frame]
        
        # Some files have an associated xml file. Fireball DEMs also have a tfw file.
        currFilesToFetch = [filename]
        if hasXml: 
            currFilesToFetch.append(icebridge_common.xmlFile(filename))
        if hasTfw: 
            currFilesToFetch.append(icebridge_common.tfwFile(filename))

        for filename in currFilesToFetch:    
            url        = os.path.join(urlDict[frame], filename)
            outputPath = os.path.join(outputFolder, filename)
            allFilesToFetch.append(outputPath)
            allUrlsToFetch.append(url)

    # Restrict lidar fetch amount according to the parameter
    if (isLidar and options.maxNumLidarToFetch > 0 and 
           len(allFilesToFetch) > options.maxNumLidarToFetch):

        # Ensure an even number, to fetch both the lidar file and its xml
        if options.maxNumLidarToFetch % 2 == 1:
            options.maxNumLidarToFetch += 1
        
        allFilesToFetch = allFilesToFetch[0:options.maxNumLidarToFetch]
        allUrlsToFetch  = allUrlsToFetch [0:options.maxNumLidarToFetch]
                
    icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL, options.dryRun,
                                         outputFolder,
                                         allFilesToFetch, allUrlsToFetch, logger)

    # Fetch from disk the set of already validated files, if any
    validFilesList = icebridge_common.validFilesList(os.path.dirname(outputFolder),
                                                     options.startFrame, options.stopFrame)
    validFilesSet = set()
    validFilesSet = icebridge_common.updateValidFilesListFromDisk(validFilesList, validFilesSet)
    numInitialValidFiles = len(validFilesSet)
    
    # Verify that all files were fetched and are in good shape
    failedFiles = []
    for outputPath in allFilesToFetch:

        if options.skipValidate:
            continue
        
        if not icebridge_common.fileNonEmpty(outputPath):
            logger.info('Missing file: ' + outputPath)
            failedFiles.append(outputPath)
            continue

        if icebridge_common.hasImageExtension(outputPath):
            # Deliberately disabled block, kept for reference.
            if False:
                # This check is just so slow. Turn it off for now.
                # This will impact only the validation of jpegs,
                # as the other files can be validated via the checksum.
                # Jpegs will be validated when converting them to 1 band images
                if outputPath in validFilesSet and os.path.exists(outputPath):
                    #logger.info('Previously validated: ' + outputPath)   # verbose
                    continue
                else:
                    if not icebridge_common.isValidImage(outputPath):
                        logger.info('Found an invalid image. Will wipe it: ' + outputPath)
                        if os.path.exists(outputPath): os.remove(outputPath)
                        failedFiles.append(outputPath)
                        continue
                    else:
                        logger.info('Valid image: ' + outputPath)
                        validFilesSet.add(outputPath) # mark it as validated

        # Sanity check: XML files must have the right latitude.
        if icebridge_common.fileExtension(outputPath) == '.xml':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                #logger.info('Previously validated: ' + outputPath) #verbose
                continue
            else:
                if os.path.exists(outputPath):
                    try:
                        latitude = icebridge_common.parseLatitude(outputPath)
                        logger.info('Valid file: ' + outputPath)
                        validFilesSet.add(outputPath) # mark it as validated
                    except:
                        # Corrupted file
                        # NOTE(review): bare except — swallows any error type.
                        logger.info("Failed to parse latitude, will wipe: " + outputPath)
                        if os.path.exists(outputPath): os.remove(outputPath)
                        failedFiles.append(outputPath)

                    # On a second thought, don't wipe files with wrong latitude, as
                    # next time we run fetch we will have to fetch them again.
                    # Hopefully they will be ignored.
                    #isGood = hasGoodLat(latitude, isSouth)
                    #if not isGood:
                    #    logger.info("Wiping XML file " + outputPath + " with bad latitude " + \
                    #                str(latitude))
                    #    os.remove(outputPath)
                    #    imageFile = icebridge_common.xmlToImage(outputPath)
                    #    if os.path.exists(imageFile):
                    #        logger.info("Wiping TIF file " + imageFile + " with bad latitude " + \
                    #                    str(latitude))
                    #        os.remove(imageFile)
                    
        # Verify the checksum (data files only, not .xml/.tfw metadata)
        if hasXml and len(outputPath) >= 4 and outputPath[-4:] != '.xml' \
               and outputPath[-4:] != '.tfw':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                #logger.info('Previously validated: ' + outputPath) # verbose
                continue
            else:
                isGood = icebridge_common.hasValidChkSum(outputPath, logger)
                if not isGood:
                    xmlFile = icebridge_common.xmlFile(outputPath)
                    logger.info('Found invalid data. Will wipe: ' + outputPath + ' ' + xmlFile)
                    if os.path.exists(outputPath): os.remove(outputPath)
                    if os.path.exists(xmlFile):    os.remove(xmlFile)
                    failedFiles.append(outputPath)
                    failedFiles.append(xmlFile)
                    continue
                else:
                    logger.info('Valid file: ' + outputPath)
                    validFilesSet.add(outputPath)

        if hasTfw and icebridge_common.fileExtension(outputPath) == '.tfw':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                #logger.info('Previously validated: ' + outputPath)
                continue
            else:
                isGood = icebridge_common.isValidTfw(outputPath, logger)
                if not isGood:
                    xmlFile = icebridge_common.xmlFile(outputPath)
                    logger.info('Found invalid tfw. Will wipe: ' + outputPath + ' ' + xmlFile)
                    if os.path.exists(outputPath): os.remove(outputPath)
                    if os.path.exists(xmlFile):    os.remove(xmlFile)
                    failedFiles.append(outputPath)
                    failedFiles.append(xmlFile)
                    continue
                else:
                    logger.info('Valid tfw file: ' + outputPath)
                    validFilesSet.add(outputPath)

    # Write to disk the list of validated files, but only if new
    # validations happened.  First re-read that list, in case a
    # different process modified it in the meantime, such as if two
    # managers are running at the same time.
    numFinalValidFiles = len(validFilesSet)
    if numInitialValidFiles != numFinalValidFiles:
        validFilesSet = \
                      icebridge_common.updateValidFilesListFromDisk(validFilesList, validFilesSet)
        icebridge_common.writeValidFilesList(validFilesList, validFilesSet)

    numFailed = len(failedFiles)
    if numFailed > 0:
        logger.info("Number of files that could not be processed: " + str(numFailed))
        
    return numFailed
def fetchAndParseIndexFileAux(isSouth, separateByLat, dayVal,
                              baseCurlCmd, folderUrl, path, fileType):
    '''Retrieve the index file for a folder of data and create
    a parsed version of it that contains frame number / filename pairs.

    Parameters:
      isSouth       - True for Antarctic data; used for latitude validation.
      separateByLat - if True, fetch each file's XML metadata and drop files
                      whose latitude is in the wrong hemisphere (needed when
                      one folder mixes GR and AN files with the same frames).
      dayVal        - 0 keeps the current day's files, 1 keeps the spillover
                      files (only relevant for ortho/fireball with big gaps).
      baseCurlCmd   - curl invocation prefix used for all downloads.
      folderUrl     - remote folder whose index page is fetched.
      path          - local path the html index is downloaded to (wiped after
                      parsing); its directory also receives fetched XMLs.
      fileType      - 'jpeg', 'ortho', 'fireball', 'lvis', 'atm1', or 'atm2'.

    Returns (frameDict, urlDict): frame number -> filename and
    frame number -> folderUrl.  Raises Exception if two non-lidar files
    share a frame number (the offending files are deleted first).'''

    # Download the html index page for this folder
    curlCmd = baseCurlCmd + ' ' + folderUrl + ' > ' + path
    logger.info(curlCmd)
    p = subprocess.Popen(curlCmd, shell=True, universal_newlines=True)
    os.waitpid(p.pid, 0)

    # Find all the file names in the index file and
    #  dump them to a new index file
    logger.info('Extracting file name list from index.html file...')
    with open(path, 'r') as f:
        indexText = f.read()

    # Must wipe this html file. We fetch it too often in different
    # contexts.  If not wiped, the code fails to work in some
    # very rare but real situations.
    if os.path.exists(path):
        os.remove(path)

    # Regex per file type. Raw strings keep '\w' from triggering
    # invalid-escape warnings, and '\.' matches a literal dot instead of
    # any character (the former patterns were slightly too permissive).
    # Example matches:
    #   atm1: >ILATM1B_20111018_145455.ATM4BT4.qi
    #         >ILATM1B_20091016_165112.atm4cT3.qi
    #   atm2: >ILATM1B_20160713_195419.ATM5BT5.h5
    regexForType = {
        'jpeg':     r">[0-9_]*\.JPG",
        'ortho':    r">DMS\w*\.tif<",
        'fireball': r">IODMS\w*DEM\.tif",  # Fireball DEMs
        'lvis':     r">ILVIS\w+\.TXT",
        'atm1':     r">ILATM1B[0-9_]*\.ATM4\w+\.qi",
        'atm2':     r">ILATM1B[0-9_]*\.ATM\w+\.h5",
    }
    fileList = []  # stays empty for unrecognized file types
    if fileType in regexForType:
        fileList = re.findall(regexForType[fileType], indexText, re.IGNORECASE)

    # Get rid of '>' and '<'
    fileList = [entry.replace(">", "").replace("<", "") for entry in fileList]

    # Some runs, eg, https://n5eil01u.ecs.nsidc.org/ICEBRIDGE/IODMS1B.001/2015.09.24
    # have files for both GR and AN, with same frame number. Those need to be separated
    # by latitude. This is a problem only with orthoimages.
    badXmls = set()
    outputFolder = os.path.dirname(path)
    if separateByLat:
        # Fetch the xml metadata of every file so latitudes can be checked
        allFilesToFetch = []
        allUrlsToFetch  = []
        for filename in fileList:
            xmlFile    = icebridge_common.xmlFile(filename)
            url        = os.path.join(folderUrl, xmlFile)
            outputPath = os.path.join(outputFolder, xmlFile)
            allFilesToFetch.append(outputPath)
            allUrlsToFetch.append(url)

        dryRun = False
        icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL,
                                             dryRun, outputFolder,
                                             allFilesToFetch, allUrlsToFetch,
                                             logger)

        # Mark the ones whose latitude is in the wrong hemisphere
        for xmlFile in allFilesToFetch:
            latitude = icebridge_common.parseLatitude(xmlFile)
            if not hasGoodLat(latitude, isSouth):
                badXmls.add(xmlFile)

    elif (fileType == 'ortho' or fileType == 'fireball'):
        # Sometimes there is a large gap in the timestamp. That means orthoimages
        # from previous day are spilling over. If dayVal is 0, we must ignore
        # the spillover images. If dayVal is 1, we must keep the spillover images
        # and ignore the others.
        list1 = []  # files before the first big gap
        list2 = []  # files after the first big gap
        isBigGap = False
        prevStamp = -1
        for filename in fileList:
            [imageDateString, imageTimeString] = icebridge_common.parseTimeStamps(filename)
            currStamp = float(imageTimeString)/1000000.0 # hours
            if prevStamp < 0:
                list1.append(filename)
                prevStamp = currStamp
                continue

            # Note that once isBigGap becomes true, it stays true
            # even when the gap gets small again
            if currStamp - prevStamp >= 6: # six hour gap is a lot
                isBigGap = True
            if not isBigGap:
                list1.append(filename)
            else:
                list2.append(filename)

            prevStamp = currStamp # for next iteration

        if isBigGap:
           if dayVal == 0:
               fileList = list2[:] # current day
           else:
               fileList = list1[:] # spillover from prev day

    # For each entry that matched the regex, record: the frame number and the file name.
    frameDict = {}
    urlDict   = {}
    badFiles  = []
    for filename in fileList:

        if len(badXmls) > 0:
            xmlFile  = os.path.join(outputFolder, icebridge_common.xmlFile(filename))
            if xmlFile in badXmls:
                continue

        frame = icebridge_common.getFrameNumberFromFilename(filename)
        if frame in frameDict:

            # The same frame must not occur twice.
            if fileType not in LIDAR_TYPES:
                logger.error("Error: Found two file names with same frame number: " + \
                             frameDict[frame] + " and " + filename)
                badFiles.append(filename)
                badFiles.append(frameDict[frame])

        # note that folderUrl can vary among orthoimages, as sometimes
        # some of them are in a folder for the next day.
        frameDict[frame] = filename
        urlDict[frame]   = folderUrl

    # Wipe the duplicated files and their xml metadata, to be sorted later
    for badFile in badFiles:
        if os.path.exists(badFile):
            logger.info("Deleting: " + badFile)
            os.remove(badFile)
        xmlFile  = icebridge_common.xmlFile(badFile)
        if os.path.exists(xmlFile):
            logger.info("Deleting: " + xmlFile)
            os.remove(xmlFile)

    if len(badFiles) > 0:
        raise Exception("Found files with same frame number")

    return (frameDict, urlDict)
# Exemplo n.º 10
# 0
def doFetch(options, outputFolder):
    '''Fetch all files for the flight described by options into
    outputFolder, then (unless options.skipValidate) validate them,
    wiping corrupt files so a later pass can re-fetch them.

    Returns the number of files that could not be fetched/validated,
    or -1 if the required Earthdata authentication files are missing.'''

    # Verify that the required Earthdata login files exist
    home = os.path.expanduser("~")
    if not (os.path.exists(home + '/.netrc')
            and os.path.exists(home + '/.urs_cookies')):
        logger.error(
            'Missing a required authentication file!  See instructions here:\n'
            +
            '    https://nsidc.org/support/faq/what-options-are-available-bulk-downloading-data-https-earthdata-login-enabled'
        )
        return -1

    # All downloads go through curl, with cookies for authentication
    curlPath = asp_system_utils.which("curl")
    curlOpts = ' -n -L '
    cookiePaths = ' -b ~/.urs_cookies -c ~/.urs_cookies '
    baseCurlCmd = curlPath + curlOpts + cookiePaths

    logger.info('Creating output folder: ' + outputFolder)
    # Portable replacement for os.system('mkdir -p ...'): no shell, and
    # robust to spaces or metacharacters in the path.
    try:
        os.makedirs(outputFolder)
    except OSError:
        pass  # folder already exists

    isSouth = (options.site == 'AN')
    parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd,
                                             outputFolder)
    if not icebridge_common.fileNonEmpty(parsedIndexPath):
        # Some dirs are weird, both images, dems, and ortho.
        # Just accept whatever there is, but with a warning.
        logger.info('Warning: Missing index file: ' + parsedIndexPath)

    # Store file information in a dictionary
    # - Keep track of the earliest and latest frame
    logger.info('Reading file list from ' + parsedIndexPath)
    try:
        (frameDict, urlDict) = readIndexFile(parsedIndexPath)
    except Exception:  # narrowed from bare except: don't mask SystemExit etc.
        # We probably ran into old format index file. Must refetch.
        logger.info('Could not read index file. Try again.')
        options.refetchIndex = True
        parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd,
                                                 outputFolder)
        (frameDict, urlDict) = readIndexFile(parsedIndexPath)

    allFrames = sorted(frameDict.keys())
    firstFrame = icebridge_common.getLargestFrame()  # start big
    lastFrame = icebridge_common.getSmallestFrame()  # start small
    for frameNumber in allFrames:
        if frameNumber < firstFrame:
            firstFrame = frameNumber
        if frameNumber > lastFrame:
            lastFrame = frameNumber

    if options.allFrames:
        options.startFrame = firstFrame
        options.stopFrame = lastFrame

    # There is always a chance that not all requested frames are available.
    # That is particularly true for Fireball DEMs. Instead of failing,
    # just download what is present and give a warning.
    if options.startFrame not in frameDict:
        logger.info("Warning: Frame " + str(options.startFrame) + \
                    " is not found in this flight.")

    if options.stopFrame and (options.stopFrame not in frameDict):
        logger.info("Warning: Frame " + str(options.stopFrame) + \
                    " is not found in this flight.")

    allFilesToFetch = [
    ]  # Files that we will fetch, relative to the current dir.
    allUrlsToFetch = []  # Full url of each file.

    # Some types come with sidecar files: an xml (metadata), and for
    # DEMs also a tfw (georeference) file.
    hasTfw = (options.type == 'dem')
    hasXml = ((options.type in LIDAR_TYPES) or (options.type == 'ortho')
              or hasTfw)

    # Loop through all found frames within the provided range
    for frame in allFrames:
        if (frame >= options.startFrame) and (frame <= options.stopFrame):

            filename = frameDict[frame]

            currFilesToFetch = [filename]
            if hasXml:
                currFilesToFetch.append(icebridge_common.xmlFile(filename))
            if hasTfw:
                currFilesToFetch.append(icebridge_common.tfwFile(filename))

            for filename in currFilesToFetch:
                url = os.path.join(urlDict[frame], filename)
                outputPath = os.path.join(outputFolder, filename)
                allFilesToFetch.append(outputPath)
                allUrlsToFetch.append(url)

    # Honor the cap on the number of files to fetch, if set
    if options.maxNumToFetch > 0 and len(
            allFilesToFetch) > options.maxNumToFetch:
        allFilesToFetch = allFilesToFetch[0:options.maxNumToFetch]
        allUrlsToFetch = allUrlsToFetch[0:options.maxNumToFetch]

    icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL,
                                         options.dryRun, outputFolder,
                                         allFilesToFetch, allUrlsToFetch,
                                         logger)

    # Verify that all files were fetched and are in good shape
    failedFiles = []
    for outputPath in allFilesToFetch:

        if options.skipValidate: continue

        if not icebridge_common.fileNonEmpty(outputPath):
            logger.info('Missing file: ' + outputPath)
            failedFiles.append(outputPath)
            continue

        if icebridge_common.hasImageExtension(outputPath):
            if not icebridge_common.isValidImage(outputPath):
                logger.info('Found an invalid image. Will wipe it: ' +
                            outputPath)
                if os.path.exists(outputPath): os.remove(outputPath)
                failedFiles.append(outputPath)
                continue
            else:
                logger.info('Valid image: ' + outputPath)

        # Sanity check: XML files must have the right latitude.
        if icebridge_common.fileExtension(outputPath) == '.xml':
            if os.path.exists(outputPath):
                latitude = icebridge_common.parseLatitude(outputPath)
                isGood = hasGoodLat(latitude, isSouth)
                if not isGood:
                    logger.info("Wiping XML file " + outputPath + " with bad latitude " + \
                                str(latitude))
                    os.remove(outputPath)
                    imageFile = icebridge_common.xmlToImage(outputPath)
                    if os.path.exists(imageFile):
                        logger.info("Wiping TIF file " + imageFile + " with bad latitude " + \
                                    str(latitude))
                        os.remove(imageFile)

        # Verify the checksum for data files (sidecar .xml/.tfw excluded)
        if hasXml and len(outputPath) >= 4 and outputPath[-4:] != '.xml' \
               and outputPath[-4:] != '.tfw':
            isGood = icebridge_common.hasValidChkSum(outputPath)
            if not isGood:
                xmlFile = icebridge_common.xmlFile(outputPath)
                logger.info('Found invalid data. Will wipe it: ' + outputPath +
                            ' ' + xmlFile)
                if os.path.exists(outputPath): os.remove(outputPath)
                if os.path.exists(xmlFile): os.remove(xmlFile)
                failedFiles.append(outputPath)
                failedFiles.append(xmlFile)
                continue
            else:
                logger.info('Valid chksum: ' + outputPath)

        if hasTfw and icebridge_common.fileExtension(outputPath) == '.tfw':
            isGood = icebridge_common.isValidTfw(outputPath)
            if not isGood:
                xmlFile = icebridge_common.xmlFile(outputPath)
                logger.info('Found invalid data. Will wipe it: ' + outputPath +
                            ' ' + xmlFile)
                if os.path.exists(outputPath): os.remove(outputPath)
                if os.path.exists(xmlFile): os.remove(xmlFile)
                failedFiles.append(outputPath)
                failedFiles.append(xmlFile)
                continue
            else:
                logger.info('Valid tfw file: ' + outputPath)

    numFailed = len(failedFiles)
    if numFailed > 0:
        logger.info("Number of files that could not be processed: " +
                    str(numFailed))

    return numFailed
# Exemplo n.º 11
# 0
def fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder):
    '''Build the parsed csv index (frame, filename, url) for the run
    described by options, fetching one html index per flight day.

    Side effects on options: may set options.allFrames (always, for
    lidar), may set options.refetchIndex, and for lidar narrows
    options.type to the specific lidar type that was found.
    Returns the path of the parsed csv index file.'''

    # If we need to parse the next flight day as well, as expected in some runs,
    # we will fetch two html files, but create a single index out of them.
    dayVals = [0]
    if options.fetchNextDay:
        dayVals.append(1)
        options.refetchIndex = True  # Force refetch, to help with old archives

    # See if to wipe the index
    if options.type in LIDAR_TYPES:
        filename = 'lidar' + '_index.html'
    else:
        filename = options.type + '_index.html'

    indexPath = os.path.join(outputFolder, filename)
    parsedIndexPath = indexPath + '.csv'

    if options.refetchIndex:
        os.system('rm -f ' + indexPath)
        os.system('rm -f ' + parsedIndexPath)

    # Re-use a previously parsed index when one is already on disk
    if icebridge_common.fileNonEmpty(parsedIndexPath):
        logger.info('Already have the index file ' + parsedIndexPath +
                    ', keeping it.')
        return parsedIndexPath

    frameDict = {}
    urlDict = {}

    # Only orthoimages may mix AN and GR data within one folder
    tryToSeparateByLat = (options.type == 'ortho')

    for dayVal in dayVals:

        if len(dayVals) > 1:
            # NOTE(review): this appends to indexPath cumulatively, so the
            # second iteration uses a '.day0.day1' suffix rather than just
            # '.day1'. The paths stay unique per day so fetching still
            # works, but confirm whether the accumulation is intended.
            indexPath = indexPath + '.day' + str(dayVal)
            if options.refetchIndex:
                os.system('rm -f ' + indexPath)

        # Find folderUrl which contains all of the files
        if options.type in LIDAR_TYPES:
            options.allFrames = True  # For lidar, always get all the frames!

            # For lidar, the data can come from one of three sources.
            # Unfortunately sometimes there is more than one source, and then
            # we need to pick by latitude.
            folderUrls = []
            lidar_types = []
            for lidar in LIDAR_TYPES:
                folderUrl = getFolderUrl(
                    options.year,
                    options.month,
                    options.day + dayVal,  # note here the dayVal
                    options.ext,
                    options.site,
                    lidar)
                logger.info('Checking lidar URL: ' + folderUrl)
                if checkIfUrlExists(folderUrl):
                    logger.info('Found match with lidar type: ' + lidar)
                    folderUrls.append(folderUrl)
                    lidar_types.append(lidar)

            if len(folderUrls) == 0:
                logger.info(
                    'WARNING: Could not find any lidar data for the given date!'
                )

            elif len(folderUrls) == 1:
                # Unique solution
                folderUrl = folderUrls[0]
                options.type = lidar_types[0]

            elif len(folderUrls) >= 2:
                # Multiple solutions. Pick the good one by latitude.
                logger.info("Multiples URLs to search: " +
                            " ".join(folderUrls))
                count = -1
                isGood = False
                for folderUrl in folderUrls:
                    count += 1
                    (localFrameDict, localUrlDict) = \
                                     fetchAndParseIndexFileAux(isSouth, tryToSeparateByLat,
                                                               baseCurlCmd, folderUrl,
                                                               indexPath, lidar_types[count])
                    # Only the first frame's latitude is inspected; the
                    # loop breaks unconditionally after one file.
                    for frame in sorted(localFrameDict.keys()):
                        filename = localFrameDict[frame]
                        xmlFile = icebridge_common.xmlFile(filename)
                        url = os.path.join(folderUrl, xmlFile)

                        # Download the file
                        curlCmd = baseCurlCmd + ' ' + url + ' > ' + xmlFile
                        logger.info(curlCmd)
                        p = subprocess.Popen(curlCmd, shell=True)
                        os.waitpid(p.pid, 0)

                        latitude = icebridge_common.parseLatitude(xmlFile)
                        if os.path.exists(xmlFile): os.remove(xmlFile)

                        if hasGoodLat(latitude, isSouth):
                            isGood = True
                            options.type = lidar_types[count]

                        # Stop at first file no matter what
                        break

                    if isGood:
                        break

                if not isGood:
                    raise Exception("None of these URLs are good: " +
                                    " ".join(folderUrls))

        else:  # Other cases are simpler
            folderUrl = getFolderUrl(
                options.year,
                options.month,
                options.day + dayVal,  # note here the dayVal
                options.ext,
                options.site,
                options.type)

        logger.info('Fetching from URL: ' + folderUrl)
        (localFrameDict, localUrlDict) = \
                         fetchAndParseIndexFileAux(isSouth, tryToSeparateByLat,
                                                   baseCurlCmd, folderUrl,
                                                   indexPath, options.type)

        # Append to the main index
        for frame in sorted(localFrameDict.keys()):
            # If we already have this frame, don't read it from the current day
            # as that's a recipe for mix-up
            if frame not in frameDict.keys():
                frameDict[frame] = localFrameDict[frame]
                urlDict[frame] = localUrlDict[frame]

    # Write the combined index file
    with open(parsedIndexPath, 'w') as f:
        for frame in sorted(frameDict.keys()):
            f.write(
                str(frame) + ', ' + frameDict[frame] + ', ' + urlDict[frame] +
                '\n')

    return parsedIndexPath
# Exemplo n.º 12
# 0
def fetchAndParseIndexFileAux(isSouth, tryToSeparateByLat, baseCurlCmd,
                              folderUrl, path, fileType):
    '''Retrieve the index file for a folder of data and create
    a parsed version of it that contains frame number / filename pairs.

    If tryToSeparateByLat is set and the folder turns out to mix files
    from both hemispheres (same frame number appearing twice), each
    file's XML metadata is fetched and entries whose latitude is in the
    wrong hemisphere (per isSouth) are dropped.

    Returns (frameDict, urlDict): frame number -> filename and
    frame number -> folderUrl.  Raises Exception if duplicate frames
    remain after the latitude separation.'''

    # Download the html index page for this folder
    curlCmd = baseCurlCmd + ' ' + folderUrl + ' > ' + path
    logger.info(curlCmd)
    p = subprocess.Popen(curlCmd, shell=True)
    os.waitpid(p.pid, 0)

    # Find all the file names in the index file and
    #  dump them to a new index file
    logger.info('Extracting file name list from index.html file...')
    with open(path, 'r') as f:
        indexText = f.read()

    # Must wipe this html file. We fetch it too often in different
    # contexts.  If not wiped, the code fails to work in some
    # very rare but real situations.
    if os.path.exists(path): os.remove(path)

    # Regex per file type. Raw strings keep '\w' from triggering
    # invalid-escape warnings, and '\.' matches a literal dot instead of
    # any character (the former patterns were slightly too permissive).
    # Example matches:
    #   atm1: >ILATM1B_20111018_145455.ATM4BT4.qi
    #         >ILATM1B_20091016_165112.atm4cT3.qi
    #   atm2: >ILATM1B_20160713_195419.ATM5BT5.h5
    regexForType = {
        'image': r">[0-9_]*\.JPG",
        'ortho': r">DMS\w*\.tif<",
        'dem':   r">IODMS\w*DEM\.tif",  # Fireball DEMs
        'lvis':  r">ILVIS\w+\.TXT",
        'atm1':  r">ILATM1B[0-9_]*\.ATM4\w+\.qi",
        'atm2':  r">ILATM1B[0-9_]*\.ATM\w+\.h5",
    }
    fileList = []  # stays empty for unrecognized file types
    if fileType in regexForType:
        fileList = re.findall(regexForType[fileType], indexText, re.IGNORECASE)

    # Get rid of '>' and '<'
    fileList = [entry.replace(">", "").replace("<", "") for entry in fileList]

    # Some runs, eg, https://n5eil01u.ecs.nsidc.org/ICEBRIDGE/IODMS1B.001/2015.09.24
    # have files for both GR and AN, with same frame number. Those need to be separated
    # by latitude. This is a problem only with orthoimages.
    haveToSeparateByLat = False
    frameDict = {}
    for filename in fileList:
        if not tryToSeparateByLat: continue
        frame = icebridge_common.getFrameNumberFromFilename2(filename)
        if frame in frameDict:
            haveToSeparateByLat = True
            logger.info("Found a run with files from both AN and GR.")
            logger.info("Files with same frame number: " + frameDict[frame] +
                        " and " + filename)
            logger.info("Need to see which to keep.")
            break
        frameDict[frame] = filename

    badXmls = set()
    outputFolder = os.path.dirname(path)
    if haveToSeparateByLat:
        # Fetch the xml metadata of every file so latitudes can be checked
        allFilesToFetch = []
        allUrlsToFetch = []
        for filename in fileList:
            xmlFile = icebridge_common.xmlFile(filename)
            allUrlsToFetch.append(os.path.join(folderUrl, xmlFile))
            allFilesToFetch.append(os.path.join(outputFolder, xmlFile))

        dryRun = False
        icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL,
                                             dryRun, outputFolder,
                                             allFilesToFetch, allUrlsToFetch,
                                             logger)

        # Mark the bad ones and wipe them too
        for xmlFile in allFilesToFetch:
            latitude = icebridge_common.parseLatitude(xmlFile)
            if not hasGoodLat(latitude, isSouth):
                badXmls.add(xmlFile)
                if os.path.exists(xmlFile): os.remove(xmlFile)

    # For each entry that matched the regex, record: the frame number and the file name.
    frameDict = {}
    urlDict = {}
    for filename in fileList:

        xmlFile = os.path.join(outputFolder,
                               icebridge_common.xmlFile(filename))
        if xmlFile in badXmls:
            continue

        frame = icebridge_common.getFrameNumberFromFilename2(filename)
        if frame in frameDict and haveToSeparateByLat:
            # This time the same frame must not occur twice
            raise Exception("Found two file names with same frame number: " + \
                            frameDict[frame] + " and " + filename)

        frameDict[frame] = filename
        # note that folderUrl can vary among orthoimages, as sometimes
        # some of them are in a folder for the next day.
        urlDict[frame] = folderUrl

    return (frameDict, urlDict)
def convertLidarDataToCsv(lidarFolder, startFrame, stopFrame,
                          skipValidate, logger):
    '''Make sure all lidar data is available in a readable text format.

    Converts .qi/.hdf5/.h5 files in lidarFolder to .csv, records the
    results in the converted-lidar index file, and (unless skipValidate)
    maintains the on-disk list of validated files for frames in
    [startFrame, stopFrame].  Returns False if any files failed to
    convert, True otherwise.'''

    def removeQuietly(filePath):
        # Best-effort delete. Unlike os.system('rm -f ...') this spawns
        # no shell and handles paths with spaces or metacharacters, while
        # still never throwing.
        try:
            os.remove(filePath)
        except OSError:
            pass

    logger.info('Converting LIDAR files...')

    lidarIndexPath = icebridge_common.csvIndexFile(lidarFolder)
    (frameDict, urlDict) = icebridge_common.readIndexFile(lidarIndexPath)

    if not skipValidate:
        validFilesList = icebridge_common.validFilesList(os.path.dirname(lidarFolder),
                                                         startFrame, stopFrame)
        validFilesSet = set()
        validFilesSet = icebridge_common.updateValidFilesListFromDisk(validFilesList, validFilesSet)
        numInitialValidFiles = len(validFilesSet)

    convDict = {}

    # Loop through all files in the folder
    badFiles = False
    for frame in sorted(frameDict.keys()):

        f = frameDict[frame]
        extension = icebridge_common.fileExtension(f)

        # Only interested in a few file types
        if (extension != '.qi') and (extension != '.hdf5') and (extension != '.h5'):
            convDict[frame] = f # these are already in plain text
            continue

        convDict[frame] = os.path.splitext(f)[0] + '.csv'
        outputPath = os.path.join(lidarFolder, convDict[frame])

        # Handle paths
        fullPath = os.path.join(lidarFolder, f)
        if not os.path.exists(fullPath):
            logger.info("Cannot convert missing file: " + fullPath)
            continue

        # If the input is invalid, wipe both it, its xml, and the output.
        # Hopefully there will be a subsequent fetch step where it will get
        # refetched.
        if not icebridge_common.hasValidChkSum(fullPath, logger):
            logger.info("Will wipe invalid file: " + fullPath)
            xmlFile = icebridge_common.xmlFile(fullPath)
            removeQuietly(fullPath)
            removeQuietly(xmlFile)
            removeQuietly(outputPath)
            badFiles = True
            continue

        # Skip existing valid files
        if skipValidate:
            if os.path.exists(outputPath):
                logger.info("File exists, skipping: " + outputPath)
                continue
        else:
            if outputPath in validFilesSet and os.path.exists(outputPath):
                #logger.info('Previously validated: ' + outputPath) # verbose
                continue
            if icebridge_common.isValidLidarCSV(outputPath):
                #logger.info("File exists and is valid, skipping: " + outputPath)
                continue

        # Call the conversion
        logger.info("Process " + fullPath)
        extract_icebridge_ATM_points.main([fullPath])

        # Check the result
        if not icebridge_common.isValidLidarCSV(outputPath):
            logger.error('Failed to parse LIDAR file, will wipe: ' + outputPath)
            removeQuietly(outputPath)
            badFiles = True
        else:
            if not skipValidate:
                validFilesSet.add(outputPath) # mark it as validated

    convLidarFile = icebridge_common.getConvertedLidarIndexFile(lidarFolder)

    # (Re)write the converted index only when it is missing or stale
    willWriteConvFile = False
    if not os.path.exists(convLidarFile):
        willWriteConvFile = True
    else:
        # Bugfix: Sometimes the written converted file has the wrong size, maybe
        # something got interrupted earlier.
        (lidarDictIn, dummyUrlDict) = icebridge_common.readIndexFile(convLidarFile)
        if lidarDictIn != convDict:
            willWriteConvFile = True

    if willWriteConvFile:
        logger.info("Writing: " + convLidarFile)
        icebridge_common.writeIndexFile(convLidarFile, convDict, {})

    if not skipValidate:
        # Write to disk the list of validated files, but only if new
        # validations happened.  First re-read that list, in case a
        # different process modified it in the meantime, such as if two
        # managers are running at the same time.
        numFinalValidFiles = len(validFilesSet)
        if numInitialValidFiles != numFinalValidFiles:
            validFilesSet = icebridge_common.updateValidFilesListFromDisk(validFilesList,
                                                                          validFilesSet)
            icebridge_common.writeValidFilesList(validFilesList, validFilesSet)

    return (not badFiles)