Example #1
def main():
    import dhdt
    import argparse

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")

    args = parser.parse_args()

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    if batch is not None:
        batch.serial(['createOutDirs', args.config])
    else:
        createOutDirs(cfg)
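
The SGE/PBS/local dispatch above is repeated verbatim in every example below. A minimal helper that factors it out, sketched using only the dhdt.SGEProcess and dhdt.PBSProcess classes and the args.submit attribute visible in these snippets (the makeBatch name is illustrative, not part of dhdt):

import dhdt

def makeBatch(args):
    # Return a batch processor for the requested scheduler, or None to run locally.
    # args is assumed to come from a parser built with dhdt.batchParser(),
    # which supplies the 'submit' attribute used throughout these examples.
    if args.submit == 'sge':
        return dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        return dhdt.PBSProcess(args)
    return None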
Example #2
File: mergeData.py  Project: whigg/cryomlt
def main():
    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument('-B',
                        '--display-progress-bar',
                        default=False,
                        action='store_true',
                        help="display a progress bar")

    args = parser.parse_args()
    dhdt.initLog(args)

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    if batch is not None:
        batch.serial(['mergeData', args.config])
    else:
        mergeData(cfg, displayProgress=args.display_progress_bar)
Example #3
def main():
    import dhdt
    import argparse
    import sys
    parser = argparse.ArgumentParser(parents=[dhdt.dhdtLog()])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument('-t',
                        '--total',
                        action='store_true',
                        default=False,
                        help="get total completed")
    parser.add_argument('-a',
                        '--all',
                        action='store_true',
                        default=False,
                        help="display all tiles")
    parser.add_argument('-n',
                        '--no-update',
                        action='store_true',
                        default=False,
                        help="do not update the database")
    parser.add_argument('-o',
                        '--output',
                        metavar='TILELIST',
                        help="write missing tile numbers to file TILELIST")

    args = parser.parse_args()
    dhdt.initLog(args)

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if args.output is not None:
        out = open(args.output, 'w')
    else:
        out = sys.stdout

    tiles = DHDTTiles(cfg)

    if args.all:
        ti = range(len(tiles))
    else:
        ti = iter(tiles)

    if args.no_update:
        ti = tiles.noUpdateIter()

    for t in ti:
        out.write('%d %f\n' % (t, tiles.completed(t)))

    if args.total:
        out.write('total: %.2f%%\n' % (tiles.totalCompleted() * 100.))
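
Example #3 writes either to a named file or to sys.stdout depending on -o, but never closes the file it opens. A small sketch of the same idea with explicit cleanup; openOutput is an illustrative helper, not part of dhdt:

import contextlib
import sys

@contextlib.contextmanager
def openOutput(path):
    # Yield a writable stream: the named file if a path is given, otherwise stdout.
    if path is not None:
        with open(path, 'w') as f:
            yield f
    else:
        yield sys.stdout

# usage with an argparse namespace like the one in Example #3:
# with openOutput(args.output) as out:
#     out.write('total: %.2f%%\n' % (tiles.totalCompleted() * 100.))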
Example #4
def main():
    import dhdt
    import argparse
    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(),
                 dhdt.batchParser(taskfarm=True)])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument(
        '-r',
        '--reprocess-data',
        action='store_true',
        default=False,
        help="process data even if previous run was successful")
    parser.add_argument('--monitor-memory',
                        action='store_true',
                        default=False,
                        help="monitor CPU and memory usage")
    args = parser.parse_args()

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if batch is not None:
        dhdt.initLog(args)
        cmd = [
            'processDataTF', args.config, '-l', args.log_level, '-L',
            args.log_file
        ]
        if args.reprocess_data:
            cmd.append('-r')
        if args.monitor_memory:
            cmd.append('--monitor-memory')
        batch.mpi(cmd)
    else:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        dhdt.initLog(args, mpi_rank=rank)

        processDataTF(cfg,
                      reprocess=args.reprocess_data,
                      monitor=args.monitor_memory)
Example #5
def processMP(cname, np, runtime, reprocess=True, monitor=False):
    # extract run time
    rt = 0
    factor = 1
    tl = runtime.split(':')
    tl.reverse()
    if len(tl) > 3:
        parser.error('cannot parse runtime argument %s' % (runtime))
    for ts in tl:
        try:
            t = int(ts)
        except ValueError:
            parser.error('cannot parse runtime argument %s' % (runtime))
        rt += factor * t
        factor = factor * 60

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(cname)

    # create output directories
    dhdt.createOutDirs(cfg)

    # read the data
    if reprocess or not dhdt.checkStore(cfg):
        dhdt.readData(cfg)

    # process all data
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    pool = multiprocessing.Pool(processes=np)
    signal.signal(signal.SIGINT, original_sigint_handler)

    tiles = list(dhdt.DHDTTiles(cfg, reprocess=reprocess))

    logging.info("processing %d tiles" % len(tiles))

    if len(tiles) > 0:

        f = ProcessData(cname, reprocess=reprocess, monitor=monitor)

        try:
            res = pool.map_async(f, tiles)
            res.get(rt)
        except KeyboardInterrupt:
            pool.terminate()
        else:
            pool.close()
        pool.join()

    # merge data files
    dhdt.mergeData.mergeData(cfg)
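
Example #5 uses a standard trick for keeping a multiprocessing.Pool interruptible: SIGINT is ignored while the pool forks (so the workers inherit the ignore), the handler is restored in the parent, and map_async(...).get(timeout) is used instead of a blocking map() so Ctrl-C reaches the parent. A self-contained sketch of the same pattern with a dummy worker; the names are illustrative, not part of dhdt:

import multiprocessing
import signal

def work(x):
    # stand-in for the ProcessData callable; any picklable callable works
    return x * x

def run(tasks, np=4, timeout=3600):
    # ignore SIGINT while the pool forks so the workers inherit the ignore,
    # then restore the handler in the parent
    original = signal.signal(signal.SIGINT, signal.SIG_IGN)
    pool = multiprocessing.Pool(processes=np)
    signal.signal(signal.SIGINT, original)
    try:
        # map_async + get(timeout) lets KeyboardInterrupt reach the parent;
        # a plain pool.map() would block it
        out = pool.map_async(work, tasks).get(timeout)
    except KeyboardInterrupt:
        pool.terminate()
        out = None
    else:
        pool.close()
    pool.join()
    return out

if __name__ == '__main__':
    print(run(range(10)))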
Example #6
File: readDataTF.py  Project: whigg/cryomlt
def main():
    import argparse
    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(),
                 dhdt.batchParser(taskfarm=True)])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument(
        '-r',
        '--rebuild-store',
        action='store_true',
        default=False,
        help="rebuild data store even though store is newer than the input files")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if batch is not None:
        cmd = ['readDataTF', args.config]
        if args.rebuild_store:
            cmd.append('-r')
        batch.mpi(cmd)
    else:
        readDataTF(cfg, rebuild=args.rebuild_store)
Example #7
def main():

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument(
        '-r',
        '--reprocess-data',
        action='store_true',
        default=False,
        help="process data even if previous run was successful")
    parser.add_argument('--process',
                        '-p',
                        metavar='N',
                        default=0,
                        type=int,
                        help="compute tasks for process N")
    parser.add_argument('--tile-file',
                        '-T',
                        metavar='TFILE',
                        help="get tile IDs from file TFILE")
    parser.add_argument('--monitor-memory',
                        action='store_true',
                        default=False,
                        help="monitor CPU and memory usage")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    processTile = args.process
    if args.tile_file is not None:
        with open(args.tile_file, 'r') as tfile:
            for i, line in enumerate(tfile):
                if i == args.process:
                    processTile = int(line)
                    break
            else:
                parser.error("could not find tile %d in tile file %s" %
                             (args.process, args.tile_file))

    if not args.reprocess_data:
        if dhdt.checkNC(cfg['grid']['output'], processTile):
            logging.info('tile %d has already been successfully processed' %
                         processTile)
            return

    if batch is not None:
        cmd = [
            'processData', '-p',
            str(processTile), args.config, '-l', args.log_level, '-L',
            args.log_file
        ]
        if args.monitor_memory:
            cmd.append('--monitor-memory')
        if args.reprocess_data:
            cmd.append('-r')
        batch.serial(cmd)
    else:
        processData(cfg,
                    processTile,
                    monitor=args.monitor_memory,
                    reprocess=args.reprocess_data)
Example #8
                        metavar='N',
                        default=0,
                        type=int,
                        help="compute tasks for process N")
    parser.add_argument('--tile-file',
                        '-T',
                        metavar='TFILE',
                        help="get tile IDs from file TFILE")
    parser.add_argument('--monitor-memory',
                        action='store_true',
                        default=False,
                        help="monitor CPU and memory usage")
    args = parser.parse_args()
    dhdt.initLog(args)

    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    grid = dhdt.cfg2grid(cfg)

    process = args.process

    radius = numpy.maximum(cfg['dhdt']['lew_where'], cfg['dhdt']['pow_where'])
    margin = cfg['dhdt']['radius'] + radius

    print(grid.bboxGeo(process, margin=margin))
    print(grid.bboxGeo(process, margin=cfg['dhdt']['radius']))

    dataSwath, dataPoca = dhdt.getStore(cfg, mode='r')
    gPoca = dataPoca.getGeoPandas(
        crs=cfg['data']['projection'],
Example #9
def processSGE(cname, batch, reprocess=True, monitor=False, taskFarm=False):
    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(cname)

    # create the output directories
    cmd = ['createOutDirs', cname]
    jid = batch.serial(cmd)

    # read the data
    if cfg['data']['storeType'] == 'multihdf':
        # use multi-file backend
        cmd = ['readDataTF', cname]
        if reprocess:
            cmd.append('-r')
        llgrid = dhdt.cfg2llgrid(cfg)
        mname = os.path.join(cfg['data']['store'], 'meta.json')
        if os.path.exists(mname) and not llgrid.check(mname):
            logging.error('multi-file store settings do not match')
            return
        jid = batch.mpi(cmd,
                        njobs=min(batch.num_processes, llgrid.ncells),
                        wait=jid)
    else:
        cmd = ['readData', cname]
        if reprocess or not dhdt.checkStore(cfg):
            jid = batch.serial(cmd, wait=jid)
        else:
            jid = None

    if taskFarm:
        # process data using task farm
        cmd = ['processDataTF', cname]
        if reprocess:
            cmd.append('-r')
        jid = batch.mpi(cmd, wait=jid)
    else:
        # process data using array jobs, one for each tile
        cmd = ['processData', cname]
        if reprocess:
            cmd.append('-r')
        if monitor:
            cmd.append('--monitor-memory')
        cmd.append('-T')
        cmd.append('')  # dummy tile filename
        tiles = dhdt.DHDTTiles(cfg, reprocess=reprocess)
        jids = []
        iterTiles = iter(tiles)
        allDone = False
        while not allDone:
            # open the tile file
            tFile = tempfile.NamedTemporaryFile(mode='w',
                                                prefix=cfg['grid']['output'] +
                                                "_tiles.",
                                                delete=False)
            nTiles = 0

            while nTiles < batch.max_array_size:
                try:
                    t = next(iterTiles)
                except StopIteration:
                    allDone = True
                    break
                tFile.write("%d\n" % t)
                nTiles += 1

            cmd[-1] = tFile.name
            tFile.close()
            if nTiles > 0:
                jids.append(batch.array(cmd, njobs=nTiles, wait=jid))

        jid = ','.join(jids)
        if len(jid) == 0:
            jid = None

    # merge data files
    jid = batch.serial(['mergeData', cname], wait=jid)
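
Example #9 splits the tile list into temporary files of at most batch.max_array_size entries, one file per array job. The chunking itself can be written more compactly with itertools.islice; a sketch under the same assumptions (writeTileFiles is an illustrative name, not part of dhdt):

import itertools
import tempfile

def writeTileFiles(tiles, chunk_size, prefix='tiles.'):
    # Write tile IDs to temporary files, at most chunk_size per file,
    # and return the list of file names for submission as array jobs.
    it = iter(tiles)
    names = []
    while True:
        chunk = list(itertools.islice(it, chunk_size))
        if not chunk:
            break
        with tempfile.NamedTemporaryFile(mode='w', prefix=prefix,
                                         suffix='.txt', delete=False) as f:
            f.write('\n'.join('%d' % t for t in chunk) + '\n')
            names.append(f.name)
    return names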
Example #10
def main():
    ''' main entry point to code '''

    ########## Config ##########

    # Set the test flag to generate a single Matlab or CSV file and check the config is OK
    test = False

    converter = 'Matlab'  # or 'CSV'

    # Optional bbox in WGS84 coordinates [bottom left to top right]
    #bbox = [81.4,-96.734,81.41,-96.73]
    bbox = None

    # Matlab or CSV file glob pattern
    fileglob = '/media/martin/DATA/Data/MatFiles/Swath/2012/*1B_201202*.mat'

    # Location to store the shards
    storeFolder = '/media/martin/DATA/Data/hdf/swath/2012/'

    # Shard file prefix for easy identification: 'swath_', 'poca_', 'oib_' or anything else
    filePrefix = 'swath_'

    # Set record lengths: large for swath (500k), small for POCA and OIB
    recordLength = 500000
    recordBuffer = 150000  # maximum overshoot above recordLength allowed in a file
    recordMin = 400000  # minimum rows per file; files below this are concatenated

    # Alternative settings for POCA or OIB
    #recordLength = 6000
    #recordBuffer = 2000
    #recordMin = 4000

    ######### Additional DHDT Config ###########

    # This is the dhdt config-parsing code, left unmodified

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument(
        '-r',
        '--rebuild-store',
        action='store_true',
        default=False,
        help=
        "rebuild data store even though store is newer than the input files")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    ############# Start of code ##############

    start = time.time()

    #Define reader
    if converter == 'Matlab':
        reader = dhdt.dataReader.MatlabReader()
    else:
        reader = dhdt.dataReader.CSVReader()

    #Temporary variables
    hold = False
    holdData = None

    #Get file list in order
    fileList = glob.glob(fileglob)
    fileList.sort()

    #Set counters
    ii = 0
    fileCount = len(fileList)

    #Iterate through each file and convert to shards
    for d in fileList:

        ii += 1
        print('Processing file {} of {}: {}'.format(ii, fileCount, d))

        loadData = reader.read(d, bbox)

        if loadData.shape[0] == 0:
            print('Empty File - Skipping')
            if d != fileList[-1]:
                continue

        # Hold the data for appending the next files (if below the minimum record length)
        allData = loadData

        # Determine whether data is being held; if so, concatenate
        if hold:
            if loadData.shape[0] == 0:
                allData = holdData
            else:
                allData = pd.concat([holdData, loadData])

        # Below the minimum record length, so hold the data and continue the loop to append the next files
        if allData.shape[0] < recordMin and d != fileList[-1]:
            hold = True
            holdData = allData
            continue
        else:
            hold = False
            holdData = None

        if allData.shape[0] == 0:
            continue

        #Must now be above minimum record length (or at end of file list)

        #Convert to geo coordinates and project to polar stereographic
        allData = GeoWrapper.convertToGeo(allData, cfg['data']['projection'],
                                          False)
        allData = GeoWrapper.project(allData, cfg['grid']['projection'])
        allData = GeoWrapper.extractXYtoCols(allData, 0)
        allData = allData.drop(['geometry'], axis=1)

        #Create dataframe
        allData = pd.DataFrame(allData)

        #Write counters
        i = 0
        dLength = allData.shape[0]
        j = 0

        #Loop over data to create files of maximum record length
        while i <= dLength:

            increment = recordLength
            if i + recordLength + recordBuffer > dLength:
                increment = recordLength + recordBuffer

            #Take slice of data up to maximum data length
            data = allData[i:i + increment]
            data = data.reset_index().drop(['index'], axis=1)

            #Only do next steps if have data
            if data.shape[0] > 0:

                #Create index
                indexData = DataStoreHelper.createIndex(
                    data, ['lat', 'lon', 'x', 'y', 'startTime'])

                # Create the file name
                fileTime = DataStoreHelper.createFileDateTime(indexData)
                fullPath = storeFolder + filePrefix + fileTime + '_' + str(
                    j) + '.h5'

                #Write data
                store = pd.HDFStore(fullPath,
                                    mode='w',
                                    complevel=9,
                                    complib='blosc')
                store.append('data', data, index=False, data_columns=True)
                store.append('index',
                             indexData,
                             index=False,
                             data_columns=True)
                store.close()

                # remove in-memory data to keep memory usage down
                del data
                del indexData

            i += increment
            j += 1

        # remove in-memory data to keep memory usage down
        del loadData
        del allData
        del holdData

        # Stop early if running a test
        if test:
            if ii >= 1:
                print("Time Taken: {}".format(time.time() - start))
                return

    print("Complete")
    print("Time Taken: {}".format(time.time() - start))