Example 1
def main():
    import dhdt
    import argparse

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")

    args = parser.parse_args()

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    if batch is not None:
        batch.serial(['createOutDirs', args.config])
    else:
        createOutDirs(cfg)
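
Every entry point in these examples repeats the same dispatch on args.submit. A minimal sketch of how that logic could be factored into a shared helper; makeBatch is hypothetical, not part of dhdt, and only reuses the dhdt.SGEProcess/dhdt.PBSProcess classes seen above:

def makeBatch(args):
    """Return a batch processor for the requested scheduler, or None
    to run the work directly in the current process."""
    import dhdt
    if args.submit == 'sge':
        return dhdt.SGEProcess(args)
    if args.submit == 'pbs':
        return dhdt.PBSProcess(args)
    return None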
Example 2
def main():
    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument('-B',
                        '--display-progress-bar',
                        default=False,
                        action='store_true',
                        help="display a progress bar")

    args = parser.parse_args()
    dhdt.initLog(args)

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    if batch is not None:
        batch.serial(['mergeData', args.config])
    else:
        mergeData(cfg, displayProgress=args.display_progress_bar)
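
For reference, a store_true flag such as -B above defaults to False and flips to True only when the flag is passed on the command line; a standalone argparse sketch:

import argparse

p = argparse.ArgumentParser()
p.add_argument('-B', '--display-progress-bar', default=False, action='store_true')
print(p.parse_args([]).display_progress_bar)      # False
print(p.parse_args(['-B']).display_progress_bar)  # True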
Example 3
def main():
    import argparse
    import dhdt

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument(
        '-r',
        '--reprocess-data',
        action='store_true',
        default=False,
        help="process data even if previous run was successful")
    parser.add_argument(
        '-n',
        '--num-processes',
        type=int,
        default=8,
        help="set the number of processes to use (either on a workstation "
        "or for the MPI task farm)")
    parser.add_argument('--monitor-memory',
                        action='store_true',
                        default=False,
                        help="monitor CPU and memory usage")
    parser.add_argument('-T',
                        '--task-farm',
                        action='store_true',
                        default=False,
                        help="use MPI task farm")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    if batch is not None:
        processSGE(args.config,
                   batch,
                   reprocess=args.reprocess_data,
                   monitor=args.monitor_memory,
                   taskFarm=args.task_farm)
    else:
        processMP(args.config,
                  args.num_processes,
                  args.run_time,
                  reprocess=args.reprocess_data,
                  monitor=args.monitor_memory)
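
processMP above fans tile processing out over local worker processes. A minimal sketch of what such a driver might look like, assuming a per-tile processData(cfg, tile) as in Example 6; the Pool wiring is illustrative, not dhdt's actual implementation:

from multiprocessing import Pool

def processOneTile(task):
    # Unpack the (cfg, tile) pair; cfg must be picklable to cross processes
    cfg, tile = task
    processData(cfg, tile)

def runLocal(cfg, tiles, num_processes):
    # Fan the tiles out over a local pool of worker processes
    with Pool(processes=num_processes) as pool:
        pool.map(processOneTile, [(cfg, t) for t in tiles])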
Example 4
def main():
    import argparse
    import dhdt
    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(),
                 dhdt.batchParser(taskfarm=True)])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument(
        '-r',
        '--reprocess-data',
        action='store_true',
        default=False,
        help="process data even if previous run was successful")
    parser.add_argument('--monitor-memory',
                        action='store_true',
                        default=False,
                        help="monitor CPU and memory usage")
    args = parser.parse_args()

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if batch is not None:
        dhdt.initLog(args)
        cmd = [
            'processDataTF', args.config, '-l', args.log_level, '-L',
            args.log_file
        ]
        if args.reprocess_data:
            cmd.append('-r')
        if args.monitor_memory:
            cmd.append('--monitor-memory')
        batch.mpi(cmd)
    else:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        dhdt.initLog(args, mpi_rank=rank)

        processDataTF(cfg,
                      reprocess=args.reprocess_data,
                      monitor=args.monitor_memory)
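
Example 4 runs processDataTF either under a scheduler via batch.mpi(...) or directly under MPI. For illustration, a minimal mpi4py master/worker task farm in the same spirit; this is a sketch assuming at least two MPI ranks, not dhdt's implementation of processDataTF:

from mpi4py import MPI

def taskFarm(tasks, work):
    """Rank 0 hands out tasks on demand; all other ranks process them."""
    comm = MPI.COMM_WORLD
    if comm.Get_rank() == 0:
        # Master: answer each readiness ping with a task, then send a
        # None sentinel to every worker to shut the farm down
        status = MPI.Status()
        for task in tasks:
            comm.recv(source=MPI.ANY_SOURCE, status=status)
            comm.send(task, dest=status.Get_source())
        for _ in range(comm.Get_size() - 1):
            comm.recv(source=MPI.ANY_SOURCE, status=status)
            comm.send(None, dest=status.Get_source())
    else:
        # Worker: signal readiness, process tasks until the sentinel arrives
        while True:
            comm.send(None, dest=0)
            task = comm.recv(source=0)
            if task is None:
                break
            work(task)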
Example 5
def main():
    import argparse
    import dhdt
    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(),
                 dhdt.batchParser(taskfarm=True)])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument(
        '-r',
        '--rebuild-store',
        action='store_true',
        default=False,
        help="rebuild data store even though store is newer than the input files")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if batch is not None:
        cmd = ['readDataTF', args.config]
        if args.rebuild_store:
            cmd.append('-r')
        batch.mpi(cmd)
    else:
        readDataTF(cfg, rebuild=args.rebuild_store)
Example 6
def main():

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument(
        '-r',
        '--reprocess-data',
        action='store_true',
        default=False,
        help="process data even if previous run was successful")
    parser.add_argument('--process',
                        '-p',
                        metavar='N',
                        default=0,
                        type=int,
                        help="compute tasks for process N")
    parser.add_argument('--tile-file',
                        '-T',
                        metavar='TFILE',
                        help="get tile IDs from file TFILE")
    parser.add_argument('--monitor-memory',
                        action='store_true',
                        default=False,
                        help="monitor CPU and memory usage")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    processTile = args.process
    if args.tile_file is not None:
        with open(args.tile_file, 'r') as tfile:
            for i, line in enumerate(tfile):
                if i == args.process:
                    processTile = int(line)
                    break
            else:
                parser.error("could not find tile %d in tile file %s" %
                             (args.process, args.tile_file))

    if not args.reprocess_data:
        if dhdt.checkNC(cfg['grid']['output'], processTile):
            logging.info('tile %d has already been successfully processed' %
                         processTile)
            return

    if batch is not None:
        cmd = [
            'processData', '-p',
            str(processTile), args.config, '-l', args.log_level, '-L',
            args.log_file
        ]
        if args.monitor_memory:
            cmd.append('--monitor-memory')
        if args.reprocess_data:
            cmd.append('-r')
        batch.serial(cmd)
    else:
        processData(cfg,
                    processTile,
                    monitor=args.monitor_memory,
                    reprocess=args.reprocess_data)
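
The tile file read above is plain text with one integer tile ID per line (processTile = int(line)). A quick sketch writing one; the file name is hypothetical:

# Write a tile file in the format the --tile-file lookup expects:
# one integer tile ID per line
with open('tiles.txt', 'w') as f:
    for tile in [3, 7, 42]:
        f.write('%d\n' % tile)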
Example 7
#nearest = gpd2.geometry == nearest_points(point, pts)[1]
#return gpd2[nearest].Place.get_values()[0]

#gpd1['Nearest'] = gpd1.apply(lambda row: near(row.geometry), axis=1)


#def main():
parser = argparse.ArgumentParser(parents=[dhdt.dhdtLog(), dhdt.batchParser()])
parser.add_argument('config', metavar='CFG',
                    help="name of the configuration file")
parser.add_argument('-r', '--rebuild-store', action='store_true', default=False,
                    help="rebuild data store even though store is newer than the input files")
args = parser.parse_args()
dhdt.initLog(args)

if args.submit == 'sge':
    batch = dhdt.SGEProcess(args)
elif args.submit == 'pbs':
    batch = dhdt.PBSProcess(args)
else:
    batch = None

# read the configuration
cfg = dhdt.Config()
cfg.readCfg(args.config)


from shapely.ops import nearest_points  # required by nearest() below


def nearest(row, geom_union, df1, df2, geom1_col='geometry', geom2_col='geometry', src_column=None):
    """Find the nearest point and return the corresponding value from specified column."""
    # Find the geometry that is closest
    nearest = df2[geom2_col] == nearest_points(row[geom1_col], geom_union)[1]
    # Get the corresponding value from df2 (matching is based on the geometry)
    value = df2[nearest][src_column].values[0]
    return value
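
A usage sketch for nearest, mirroring the commented-out lines above; it assumes two GeoDataFrames gpd1 and gpd2 are already loaded, with the values to copy held in a Place column of gpd2:

# Union of all candidate geometries, computed once and reused for every row
pts = gpd2.geometry.unary_union

# Attach the Place of the nearest gpd2 point to each row of gpd1
gpd1['Nearest'] = gpd1.apply(nearest, geom_union=pts, df1=gpd1, df2=gpd2,
                             src_column='Place', axis=1)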
Example 8
def main():
    ''' main entry point to code '''

    ########## Config ##########

    #Set the test flag to process a single Matlab or CSV file and check the config is OK
    test = False

    converter = 'Matlab'  #Or 'CSV'

    #Optional bbox in wgs84 coordinates [bottom left to top right]
    #bbox = [81.4,-96.734,81.41,-96.73]
    bbox = None

    #Matlab or CSV file pattern match
    fileglob = '/media/martin/DATA/Data/MatFiles/Swath/2012/*1B_201202*.mat'

    #Location to store shards
    storeFolder = '/media/martin/DATA/Data/hdf/swath/2012/'

    #Shard prefix for easy identification: use 'swath_', 'poca_', 'oib_' or anything else
    filePrefix = 'swath_'

    #Set record lengths - want large for swath (500k), small for poca and oib
    recordLength = 500000
    recordBuffer = 150000  #Maximum over recordLength to allow in file
    recordMin = 400000  #Minimum to allow in file - will concatenate files if below this

    #Alternate for Poca or OIB
    #recordLength = 6000
    #recordBuffer = 2000
    #recordMin = 4000

    ######### Additional DHDT Config ###########

    #This is dhdt config parsing code. I have not modified

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config',
                        metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument(
        '-r',
        '--rebuild-store',
        action='store_true',
        default=False,
        help="rebuild data store even though store is newer than the input files")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    ############# Start of code ##############

    start = time.time()

    #Define reader
    if converter == 'Matlab':
        reader = dhdt.dataReader.MatlabReader()
    else:
        reader = dhdt.dataReader.CSVReader()

    #Temporary variables
    hold = False
    holdData = None

    #Get file list in order
    fileList = glob.glob(fileglob)
    fileList.sort()

    #Set counters
    ii = 0
    fileCount = len(fileList)

    #Iterate through each file and convert to shards
    for d in fileList:

        ii += 1
        print('Processing file {} of {}: {}'.format(ii, fileCount, d))

        loadData = reader.read(d, bbox)

        if loadData.shape[0] == 0:
            print('Empty File - Skipping')
            #Fall through on the last file so any held data still gets written
            if d != fileList[-1]:
                continue

        #Start with this file's data; any held data from previous files is prepended below
        allData = loadData

        #Determine if data is being held (previous files below minimum record length); if so, concatenate
        if hold:
            if loadData.shape[0] == 0:
                allData = holdData
            else:
                allData = pd.concat([holdData, loadData])

        #Less than minimum record length so hold data and continue loop to append next files
        if allData.shape[0] < recordMin and d != fileList[-1]:
            hold = True
            holdData = allData
            continue
        else:
            hold = False
            holdData = None

        if allData.shape[0] == 0:
            continue

        #Must now be above minimum record length (or at end of file list)

        #Convert to geo coordinates and project to polar stereographic
        allData = GeoWrapper.convertToGeo(allData, cfg['data']['projection'],
                                          False)
        allData = GeoWrapper.project(allData, cfg['grid']['projection'])
        allData = GeoWrapper.extractXYtoCols(allData, 0)
        allData = allData.drop(['geometry'], axis=1)

        #Create dataframe
        allData = pd.DataFrame(allData)

        #Write counters
        i = 0
        dLength = allData.shape[0]
        j = 0

        #Loop over data to create files of up to the maximum record length
        while i <= dLength:

            #Normally slice recordLength rows; if all remaining rows fit within
            #recordLength + recordBuffer, take them in one final slice instead
            #of leaving a small trailing file
            increment = recordLength
            if i + recordLength + recordBuffer > dLength:
                increment = recordLength + recordBuffer

            #Take slice of data up to maximum data length
            data = allData[i:i + increment]
            data = data.reset_index().drop(['index'], axis=1)

            #Only do next steps if have data
            if data.shape[0] > 0:

                #Create index
                indexData = DataStoreHelper.createIndex(
                    data, ['lat', 'lon', 'x', 'y', 'startTime'])

                #Create file name
                fileTime = DataStoreHelper.createFileDateTime(indexData)
                fullPath = storeFolder + filePrefix + fileTime + '_' + str(
                    j) + '.h5'

                #Write data
                store = pd.HDFStore(fullPath,
                                    mode='w',
                                    complevel=9,
                                    complib='blosc')
                store.append('data', data, index=False, data_columns=True)
                store.append('index',
                             indexData,
                             index=False,
                             data_columns=True)
                store.close()

                #remove in-memory data to keep efficient
                del data
                del indexData

            i += increment
            j += 1

        #remove in-memory data to keep efficient
        del loadData
        del allData
        del holdData

        #Stop after the first file when running a test
        if test:
            if ii >= 1:
                print("Time Taken: {}".format(time.time() - start))
                return

    print("Complete")
    print("Time Taken: {}".format(time.time() - start))