def __init__(self, minX, maxX, minY, maxY, logFile=None):
        '''
        :param minX: minimum x of the grid cell extent (map units)
        :param maxX: maximum x of the grid cell extent (map units)
        :param minY: minimum y of the grid cell extent (map units)
        :param maxY: maximum y of the grid cell extent (map units)
        :param logFile: if a log file is specified, the logger writes to that file instead of the terminal
        '''
        if logFile is None:
            logging.basicConfig(
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        else:
            logging.basicConfig(
                filename=logFile,
                filemode='a',
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        sys.excepthook = self.uncaughtErrorHandler

        self.logger = logging.getLogger(__name__)
        self.logger.info('Process started')

        self.client = MalardClient(notebook=False)

        self.minX = minX
        self.maxX = maxX
        self.minY = minY
        self.maxY = maxY
        self.parentDsName = self.config('parentDsName')
        self.outputDataSet = self.config('outputDataSet')
        self.inputDataSet = self.config('inputDataSet')
        self.region = self.config('region')
        self.maskDataSet = self.config('maskDataSet')
        self.withinDataSets = self.config('withinDataSets')
        self.withinDataSetTypes = self.config('withinDataSetTypes')
        self.runName = self.config('runName')

        assert (self.maxX - self.minX) == (self.maxY - self.minY)
        self.size = maxX - minX
        self.dataSet = DataSet(parentDs=self.config('parentDsName'),
                               dataSet=self.config('inputDataSet'),
                               region=self.config('region'))
Example #2
    def __init__(self, logFile=None):
        '''
        :param logFile: if a log file is specified, the logger writes to that file instead of the terminal
        '''
        if logFile is None:
            logging.basicConfig(
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        else:
            logging.basicConfig(
                filename=logFile,
                filemode='a',
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        sys.excepthook = self.uncaughtErrorHandler

        self.logger = logging.getLogger(__name__)
        self.logger.info('Process started')

        #self.parentDsName = self.config('parentDsName')
        self.inputDataSet = DataSet(parentDs=self.config('parentDsName'),
                                    dataSet=self.config('inputDataSet'),
                                    region=self.config('region'))
        #self.region = self.config('region')
        self.runName = self.config('runName')

        self.client = MalardClient(notebook=False)

        self.query_sync = DataSetQuery(self.config('malardSyncURL'),
                                       self.config('malardEnvironmentName'))
        #self.query_async = AsyncDataSetQuery.AsyncDataSetQuery(self.config('malardAsyncURL'), self.config('malardEnvironmentName'), False)
        # get projection
        #self.projection = json.loads(self.client.getProjection(self.parentDsName, self.region))['proj4']
        self.projection = self.client.getProjection(self.inputDataSet).proj4
Example #3
File: DataSets.py (project: whigg/malard)
        with open(out_xyz, 'r') as f:
            f.readline()  # skip first row
            for line in f:
                row = line.split()
                xy.append([float(row[0]), float(row[1])])
                values.append(float(row[2]))
        # remove xyz file
        if deleteTemporaryFiles:
            os.remove(out_xyz)
        return xy, values
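

# A minimal alternative sketch (not part of the original file): the same xyz
# parse with numpy, assuming whitespace-delimited columns and one header row.
def load_xyz_numpy(out_xyz):
    import numpy as np
    arr = np.loadtxt(out_xyz, skiprows=1, ndmin=2)  # skip header; force 2-D
    return arr[:, :2].tolist(), arr[:, 2].tolist()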


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    client = MalardClient()

    parentDataSet = 'cryotempo'
    dataSet = 'GRIS_BaselineC_Q2'
    region = 'greenland'

    inputDs = DataSet(parentDataSet, dataSet, region)

    proj4 = client.getProjection(inputDs).proj4
    print(proj4)

    bb = client.boundingBox(inputDs)

    gridCells = client.gridCells(inputDs, bb)

    minT = datetime.datetime(2011, 3, 1, 0, 0, 0)
Example #4
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 20 09:33:33 2019

@author: jon
"""

from MalardClient.MalardClient import MalardClient
from MalardClient.DataSet import DataSet
from MalardClient.BoundingBox import BoundingBox

client = MalardClient()

ds = DataSet("cryotempo","poca","greenland" )

dsSwath = DataSet("cryotempo","GRIS_BaselineC_Q2","greenland" )

bb = client.boundingBox(ds)

gcs = client.gridCells(ds, bb)

minX=-1600000
maxX=-1500000
minY=-2600000
maxY=-2500000
minT=1298912551
maxT=1298912551

bb = BoundingBox( minX, maxX, minY, maxY, minT, maxT )
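
# Note (hedged, not part of the original file): minT/maxT above are epoch
# seconds, while the other snippets pass datetimes. An equivalent
# datetime-based box, assuming BoundingBox accepts datetimes as it does elsewhere:
from datetime import datetime, timezone

bb_dt = BoundingBox(minX, maxX, minY, maxY,
                    datetime.fromtimestamp(minT, tz=timezone.utc),
                    datetime.fromtimestamp(maxT, tz=timezone.utc))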
Example #5
def main(pub_month, pub_year, loadConfig):

    region = loadConfig["region"]
    parentDataSet = loadConfig["parentDataSet"]
    uncertainty_threshold = loadConfig.get("uncertainty_threshold")
    powerdB = loadConfig["powerdB"]
    coh = loadConfig["coh"]
    dataSetName = loadConfig["resultsetName"]

    pocaParentDataSet = loadConfig["pocaParentDataSet"]
    pocaDataSetName = loadConfig["pocaDataSet"]
    pocaDemDiff = loadConfig["pocaDemDiff"]
    output_path = os.path.join(loadConfig["resultPath"], "pointProduct")
    ensure_dir(output_path)

    malardEnv = loadConfig["MalardEnvironment"]

    client = MalardClient(malardEnv)

    uncDatasetName = "{}_unc".format(
        dataSetName) if uncertainty_threshold is not None else dataSetName
    uncDataSet = DataSet(parentDataSet, uncDatasetName, region)
    dataSet = DataSet(parentDataSet, dataSetName, region)

    pocaDataSet = DataSet(pocaParentDataSet, pocaDataSetName, region)
    pocaDataSet_noDemDiff = DataSet(pocaParentDataSet,
                                    pocaDataSetName.replace("_demDiff", ""),
                                    region)

    projections = [
        'x', 'y', 'time', 'elev', 'powerdB', 'coh', 'demDiff', 'demDiffMad',
        'swathFileId', 'Q_uStd'
    ]
    filters = [{
        'column': 'Q_uStd',
        'op': 'lte',
        'threshold': uncertainty_threshold
    }, {
        'column': 'powerdB',
        'op': 'gte',
        'threshold': powerdB
    }, {
        'column': 'coh',
        'op': 'gte',
        'threshold': coh
    }, {
        'column': 'inRegionMask',
        'op': 'eq',
        'threshold': 1.0
    }]
    filters_poca = [{
        "column": "demDiff",
        "op": "lte",
        "threshold": pocaDemDiff
    }, {
        "column": "demDiff",
        "op": "gte",
        "threshold": -pocaDemDiff
    }, {
        'column': 'inRegionMask',
        'op': 'eq',
        'threshold': 1.0
    }]

    from_dt = datetime(pub_year, pub_month, 1, 0, 0, 0)
    to_dt = from_dt + relativedelta(months=1) - timedelta(seconds=1)

    bb = client.boundingBox(uncDataSet)
    gridcells = client.gridCells(
        uncDataSet,
        BoundingBox(bb.minX, bb.maxX, bb.minY, bb.maxY, from_dt, to_dt))

    proj4 = client.getProjection(uncDataSet).proj4

    print("Number of Gridcells found to process {}".format(len(gridcells)))
    process_start = datetime.now()

    print("MinT={} MaxT={}".format(from_dt, to_dt))
    #Create a shapefile index for each month
    index = s.ShapeFileIndex(output_path, "THEM_POINT", proj4,
                             uncDataSet.region, from_dt)

    for i, gc in enumerate(gridcells):
        gc_start = datetime.now()
        month_gc = BoundingBox(gc.minX, gc.maxX, gc.minY, gc.maxY, from_dt,
                               to_dt)
        queryInfo = client.executeQuery(uncDataSet,
                                        month_gc,
                                        projections=projections,
                                        filters=filters)

        if queryInfo.status == "Success" and not queryInfo.resultFileName.startswith(
                "Error"):

            data = queryInfo.to_df

            # np.empty, not np.array: np.array(len(data), "S5") builds a 0-d
            # array from the integer instead of len(data) elements
            dataSwathStr = np.empty(len(data), "S5")
            dataSwathStr.fill("swath")
            data["swathPoca"] = dataSwathStr
            swath_file_ids = data['swathFileId'].unique()
            pocaInfo = client.executeQuery(pocaDataSet,
                                           gc,
                                           filters=filters_poca)

            pocaDf = pd.DataFrame()
            if pocaInfo.status == "Success" and not pocaInfo.resultFileName.startswith(
                    "Error"):
                pocaDf = pocaInfo.to_df

                if len(pocaDf) > 0:
                    pocaStr = np.empty(len(pocaDf), "S5")
                    pocaStr.fill("poca")
                    pocaDf["swathPoca"] = pocaStr
                    poca_file_ids = pocaDf['swathFileId'].unique()
                    print("Poca points to include {}".format(len(pocaDf)))

                    data = pd.concat([data, pocaDf], sort=False)

            print("Found {} data rows".format(len(data)))
            if len(data) > 0:
                results = client.getSwathNamesFromIds(dataSet, swath_file_ids)
                if len(pocaDf) > 0:
                    try:
                        results.update(
                            client.getSwathNamesFromIds(
                                pocaDataSet_noDemDiff, poca_file_ids))
                    except KeyError:
                        print(
                            "Exception caught while retrieving swathIds for data set {} file ids {}"
                            .format(pocaDataSet_noDemDiff, poca_file_ids))
                        raise

                writePointProduct(output_path, dataSet, month_gc, data, proj4,
                                  results, index)

            client.releaseCacheHandle(pocaInfo.resultFileName)
        else:
            print("Grid Cells skipped X=[{}] Y=[{}] with message [{}] ".format(
                gc.minX, gc.minY, queryInfo.status))
        client.releaseCacheHandle(queryInfo.resultFileName)

        gc_elapsed = (datetime.now() - gc_start).total_seconds()
        print('Processed grid cell [{}] of [{}]. Took=[{}]s'.format(
            i + 1, len(gridcells), gc_elapsed))

    index.close()

    process_elapsed = (datetime.now() - process_start).total_seconds()
    print("Took [{}s] to process".format(process_elapsed))
Example #6
class TimeseriesRun:

    # __conf = {
    #     "outputFileName": "himalayas-mad-tdx2.json",
    #     "inputDataSet": "HimMad2",
    #     "runName": "HimMad2",
    #     "region":"himalayas",
    #     "parentDsName": "mtngla",
    #     "outputPath": "timeseries_results",
    #     "malardEnvironmentName": "DEVv2",
    #     "malardSyncURL": "http://localhost:9000",
    #     "malardAsyncURL": "ws://localhost:9000",
    #     "filters" : [{'column':'power','op':'gt','threshold':10000},{'column':'coh','op':'gt','threshold':0.6}, \
    #                  {'column':'demDiff','op':'lt','threshold':100}, {'column':'demDiffMadNew','op':'lt','threshold':10}, \
    #                  {'column':'demDiff','op':'gt','threshold':-100}, \
    #                  {'column':'refDifference','op':'gt','threshold':-150}, {'column':'refDifference','op':'lt','threshold':150}, \
    #                  {'column':'within_DataSet','op':'gt','threshold':1}]
    # }

    __conf = {
        "outputFileName": "alaska-gridcells-double.json",
        "inputDataSet": "AlaskaMad",
        "runName": "AlaskaMad",
        "region":"alaska",
        "parentDsName": "mtngla",
        "outputPath": "timeseries_results",
        "malardEnvironmentName": "DEVv2",
        "malardSyncURL": "http://localhost:9000",
        "malardAsyncURL": "ws://localhost:9000",
        "filters" : [{'column':'power','op':'gt','threshold':10000},{'column':'coh','op':'gt','threshold':0.6}, \
                     {'column':'demDiff','op':'lt','threshold':100}, {'column':'demDiffMad','op':'lt','threshold':10}, \
                     {'column':'demDiff','op':'gt','threshold':-100}, {'column':'demDiffMad','op':'gt','threshold':-10}, \
                     {'column':'refDifference','op':'gt','threshold':-150}, {'column':'refDifference','op':'lt','threshold':150}]
    }
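
    # Filter semantics (hedged; inferred from how these dicts are passed to
    # executeQuery): each entry presumably keeps rows where
    # <column> <op> <threshold>, e.g. {'column': 'coh', 'op': 'gt',
    # 'threshold': 0.6} keeps rows with coh > 0.6.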

    def __init__(self, logFile=None):
        '''
        :param logFile: if a log file is specified, the logger writes to that file instead of the terminal
        '''
        if logFile is None:
            logging.basicConfig(
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        else:
            logging.basicConfig(
                filename=logFile,
                filemode='a',
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        sys.excepthook = self.uncaughtErrorHandler

        self.logger = logging.getLogger(__name__)
        self.logger.info('Process started')

        #self.parentDsName = self.config('parentDsName')
        self.inputDataSet = DataSet(parentDs=self.config('parentDsName'),
                                    dataSet=self.config('inputDataSet'),
                                    region=self.config('region'))
        #self.region = self.config('region')
        self.runName = self.config('runName')

        self.client = MalardClient(notebook=False)

        self.query_sync = DataSetQuery(self.config('malardSyncURL'),
                                       self.config('malardEnvironmentName'))
        #self.query_async = AsyncDataSetQuery.AsyncDataSetQuery(self.config('malardAsyncURL'), self.config('malardEnvironmentName'), False)
        # get projection
        #self.projection = json.loads(self.client.getProjection(self.parentDsName, self.region))['proj4']
        self.projection = self.client.getProjection(self.inputDataSet).proj4

    def gridcellTimeseries(self,
                           boundingBox,
                           startdate,
                           enddate,
                           interval,
                           weighted=[]):
        filters = self.config('filters')
        self.logger.info("Filtering dataset=%s with criteria %s" %
                         (self.inputDataSet, filters))
        result = self.client.executeQuery(self.inputDataSet,
                                          boundingBox,
                                          projections=[],
                                          filters=filters)
        self.logger.info("Result message: result=%s, message=%s" %
                         (result.status, result.message))

        data = PointDataSet(result.resultFileName, self.projection)
        # release cache of file
        self.client.releaseCacheHandle(result.resultFileName)
        results = {}
        if data.hasData():
            self.logger.info('Data length={}'.format(data.length()))
            r = data.timeSeries(startdate=startdate,
                                enddate=enddate,
                                interval=interval,
                                weighted=weighted)
            results = {**results, **r}
            self.logger.info(results)
        else:
            self.logger.info('No data in file')

        return results

    def timeseriesFromStats(self,
                            startdate,
                            enddate,
                            interval=3,
                            minT=None,
                            maxT=None,
                            minCount=0,
                            save=True,
                            weighted=None):
        self.logger.info("Get run statistics for parentDS=%s runName=%s ..." %
                         (self.inputDataSet.parentDataSet, self.runName))
        stats = self.query_sync.getRunStatistics(
            self.inputDataSet.parentDataSet, self.runName)
        stats = json.loads(stats)
        dfStats = json_normalize(stats)
        if minT is None and maxT is None:
            bbx = self.client.boundingBox(self.inputDataSet)
            minT = bbx.minT
            maxT = bbx.maxT

        for idx, line in dfStats.iterrows():
            if line['statistics.afterGlacierMask'] > minCount:
                minX, maxX = line['gridCell.minX'], line[
                    'gridCell.minX'] + line['gridCell.size']
                minY, maxY = line['gridCell.minY'], line[
                    'gridCell.minY'] + line['gridCell.size']
                self.logger.info("Calculating gridcell minX=%s minY=%s ..." %
                                 (minX, maxX))

                bbx_in = BoundingBox(minX, maxX, minY, maxY, minT, maxT)

                results = self.gridcellTimeseries(bbx_in,
                                                  startdate,
                                                  enddate,
                                                  interval,
                                                  weighted=weighted)
                self.logger.info("Adding timeseries results to stats...")
                for key in results:
                    if isinstance(results[key], list) and not np.isin(
                            key, dfStats.columns):
                        # add a new object-dtype column so a list can be stored per cell
                        dfStats = dfStats.reindex(columns=np.append(
                            dfStats.columns.values, [key]))
                        dfStats[key] = dfStats[key].astype('object')
                    dfStats.at[idx, key] = results[key]

        #size = dfStats['gridCell.size']
        #geometry = [Point(xy) for xy in zip(dfStats['gridCell.minX']+(size/2), dfStats['gridCell.minY']+(size/2))]
        #dfStats = gp.GeoDataFrame(dfStats, crs=self.projection, geometry=geometry)

        if save:
            file = os.path.join(self.config("outputPath"),
                                self.config("outputFileName"))
            self.logger.info("Saving results under file=%s" % file)
            dfStats.to_json(file)

        return dfStats

    def timeseriesFromList(self,
                           gridcells,
                           startdate,
                           enddate,
                           interval=3,
                           minT=None,
                           maxT=None,
                           save=True,
                           weighted=None):

        dfStats = pd.DataFrame(gridcells)

        if minT is None and maxT is None:
            bbx = self.client.boundingBox(self.inputDataSet)
            minT = bbx.minT
            maxT = bbx.maxT

        for idx, line in dfStats.iterrows():

            self.logger.info(
                "Calculating gridcell minX=%s maxX=%s minY=%s maxY=%s minT=%s maxT=%s ..."
                % (line['minX'], line['maxX'], line['minY'], line['maxY'],
                   minT, maxT))
            bbx_in = BoundingBox(line['minX'].item(), line['maxX'].item(),
                                 line['minY'].item(), line['maxY'].item(),
                                 minT, maxT)

            results = self.gridcellTimeseries(bbx_in,
                                              startdate,
                                              enddate,
                                              interval,
                                              weighted=weighted)

            self.logger.info("Adding timesereis results to stats...")
            for key in results:
                if isinstance(results[key], list) and not np.isin(
                        key, dfStats.columns):
                    # add a new object-dtype column so a list can be stored per cell
                    dfStats = dfStats.reindex(columns=np.append(
                        dfStats.columns.values, [key]))
                    dfStats[key] = dfStats[key].astype('object')
                dfStats.at[idx, key] = results[key]

        if save:
            file = os.path.join(self.config("outputPath"),
                                self.config("outputFileName"))
            self.logger.info("Saving results under file=%s" % file)
            dfStats.to_json(file)

        return dfStats

    def timeseriesFromFile(self,
                           file,
                           startdate,
                           enddate,
                           interval=3,
                           minT=None,
                           maxT=None,
                           save=True,
                           weighted=None):
        ''' Calculates timeseries for the grid cells listed in a given input file.

        :param file: text file with one grid cell extent per line, formatted as "minX,maxX,minY,maxY"
        :return: DataFrame of per-gridcell statistics
        '''

        self.logger.info("Start timeseries from file for parentDS=%s ..." %
                         (self.inputDataSet.parentDataSet))
        if minT is None and maxT is None:
            bbx = self.client.boundingBox(self.inputDataSet)
            minT = bbx.minT
            maxT = bbx.maxT

        extents = []
        with open(file) as f:
            for line in f:
                split = line.strip().split(",")
                ext = {
                    'minX': int(split[0]),
                    'maxX': int(split[1]),
                    'minY': int(split[2]),
                    'maxY': int(split[3])
                }
                extents.append(ext)
        stats = self.timeseriesFromList(extents,
                                        startdate=startdate,
                                        enddate=enddate,
                                        interval=interval,
                                        minT=minT,
                                        maxT=maxT,
                                        save=save,
                                        weighted=weighted)

        return stats

    @staticmethod
    def config(name):
        return TimeseriesRun.__conf[name]

    def uncaughtErrorHandler(self, type, value, tb):
        self.logger.error("Uncaught exception", exc_info=(type, value, tb))
Example #7
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 13 09:55:03 2019

@author: jon
"""

import MalardGDAL as mg
from MalardClient.MalardClient import MalardClient
from MalardClient.DataSet import DataSet
from MalardClient.BoundingBox import BoundingBox
from MalardClient.MaskFilter import MaskFilter

from datetime import datetime

client = MalardClient()

ds = DataSet("cryotempo", "swath_c", "greenland")

proj4 = client.getProjection(ds).proj4

print(proj4)

minX = 700000
minY = -2200000
cell_size = 130000

bbox = BoundingBox(minX, minX + cell_size, minY, minY + cell_size,
                   datetime(2011, 2, 1, 0, 0), datetime(2011, 5, 1, 0, 0))

## TODO: These need to be stored in Malard by DataSet and Type.
Example #8
from dateutil.relativedelta import relativedelta

import math
import pandas as pd
import ND as nd

# imports needed by the snippet below
from MalardClient.MalardClient import MalardClient
from MalardClient.DataSet import DataSet


def distance(x1s, y1s, x2s, y2s):

    return [
        math.sqrt((x1 - x2)**2 + (y1 - y2)**2)
        for x1, y1, x2, y2 in zip(x1s, y1s, x2s, y2s)
    ]
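
# Example: a single 3-4-5 right triangle, computed pairwise over the inputs.
assert distance([0.0], [0.0], [3.0], [4.0]) == [5.0]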


client = MalardClient()

ds = DataSet("cryotempo", "poca", "greenland")

dsSwath = DataSet("cryotempo", "GRIS_BaselineC_Q2", "greenland")

ds_oib = DataSet("cryotempo", "oib", "greenland")

filters = [{
    'column': 'coh',
    'op': 'gte',
    'threshold': 0.5
}, {
    'column': 'power',
    'op': 'gte',
    'threshold': 1000.0
}]
Example #9
File: NN2.py (project: whigg/malard)
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 22 19:16:07 2019

@author: jon
"""

from MalardClient.MalardClient import MalardClient
from MalardClient.DataSet import DataSet
from MalardClient.BoundingBox import BoundingBox
from datetime import datetime
from dateutil.relativedelta import relativedelta

import numpy as np

client = MalardClient()

ds = DataSet("cryotempo", "poca", "greenland")

dsSwath = DataSet("cryotempo", "GRIS_BaselineC_Q2", "greenland")

ds_oib = DataSet("cryotempo", "oib", "greenland")

bb = client.boundingBox(ds)

minX = -200000
maxX = -100000
minY = -2400000
maxY = -2300000
minT = datetime(2011, 3, 1, 0, 0, 0)
maxT = datetime(2011, 3, 31, 23, 59, 59)
class MtnGlaGridcellProcess:

    #"referenceDem":"/data/puma1/scratch/DEMs/srtm_test.tif"
    #"referenceDem":"/data/puma1/scratch/mtngla/dems/HMA_TDX_Masked_SRTM_Merged_coreg_aea_clip.tif"

    # HIMALAYAS
    #"runName": "ReadyHim2",
    #"outputDataSet": "Ready8",
    #"parentDsName": "mtngla",
    #"region":"himalayas",
    #"maskDataSet": "RGIv60",
    #"withinDataSets": ["SDCv10", "/data/puma1/scratch/mtngla/dems/Tdx_SRTM_SurfaceSplit.tiff"],
    #"withinDataSetTypes": ["Debris", "DataSet"],
    #"referenceDem": "/data/puma1/scratch/mtngla/dems/HMA_TDX_Masked_SRTM_Merged_coreg_aea_clip.tif",
    #"inputDataSet": "tdx2",
    #
    # "runName": "HimMad2",
    # "outputDataSet": "HimMad2",
    # "parentDsName": "mtngla",
    # "region":"himalayas",

    # ALASKA
    #"runName": "AlaskaRun1",
    #"outputDataSet": "ReadyDataAlaska2",
    #"parentDsName": "mtngla",
    #"region":"alaska",
    #"maskDataSet": "RGIv60",
    #"withinDataSets": ["SDCv10", "/data/puma1/scratch/mtngla/dems/TD_AD_Interp_SurfaceSplit.tiff"],
    #"withinDataSetTypes": ["Debris", "DataSet"],
    #"referenceDem": "/data/puma1/scratch/mtngla/dems/PCR_TdxFilledWithAD_Masked_Polar_Interp_clip.tif",
    #"inputDataSet": "ADwithTDX",


    __conf = {
        "runName": "AlaskaMad",
        "outputDataSet": "AlaskaMad",
        "parentDsName": "mtngla",
        "region":"alaska",
        "maskDataSet": "RGIv60",
        "withinDataSets": ["SDCv10", "/data/puma1/scratch/mtngla/dems/TD_AD_Interp_SurfaceSplit.tiff"],
        "withinDataSetTypes": ["Debris", "DataSet"],
        "referenceDem": "/data/puma1/scratch/mtngla/dems/PCR_TdxFilledWithAD_Masked_Polar_Interp_clip.tif",
        "inputDataSet": "tdx_mad",
        "malardEnvironmentName": "DEVv2",
        "malardSyncURL": "http://localhost:9000",
        "malardAsyncURL": "ws://localhost:9000",
        "filters" : [{'column':'power','op':'gt','threshold':10000},{'column':'coh','op':'gt','threshold':0.6}, \
                     {'column':'demDiff','op':'lt','threshold':100}, {'column':'demDiffMadNew','op':'lt','threshold':10}, \
                     {'column':'demDiff','op':'gt','threshold':-100}]

    }

    def __init__(self, minX, maxX, minY, maxY, logFile=None):
        '''
        :param minX: minimum x of the grid cell extent (map units)
        :param maxX: maximum x of the grid cell extent (map units)
        :param minY: minimum y of the grid cell extent (map units)
        :param maxY: maximum y of the grid cell extent (map units)
        :param logFile: if a log file is specified, the logger writes to that file instead of the terminal
        '''
        if logFile is None:
            logging.basicConfig(
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        else:
            logging.basicConfig(
                filename=logFile,
                filemode='a',
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        sys.excepthook = self.uncaughtErrorHandler

        self.logger = logging.getLogger(__name__)
        self.logger.info('Process started')

        self.client = MalardClient(notebook=False)

        self.minX = minX
        self.maxX = maxX
        self.minY = minY
        self.maxY = maxY
        self.parentDsName = self.config('parentDsName')
        self.outputDataSet = self.config('outputDataSet')
        self.inputDataSet = self.config('inputDataSet')
        self.region = self.config('region')
        self.maskDataSet = self.config('maskDataSet')
        self.withinDataSets = self.config('withinDataSets')
        self.withinDataSetTypes = self.config('withinDataSetTypes')
        self.runName = self.config('runName')

        assert (self.maxX - self.minX) == (self.maxY - self.minY)
        self.size = maxX - minX
        self.dataSet = DataSet(parentDs=self.config('parentDsName'),
                               dataSet=self.config('inputDataSet'),
                               region=self.config('region'))

    def startProcess(self):
        self.logger.info(
            'Starting gridcell: minX=%s, minY=%s, parentDs=%s, inputDataSet=%s, outputDataSet=%s, runName=%s',
            self.minX, self.minY, self.parentDsName, self.inputDataSet,
            self.outputDataSet, self.runName)
        self.defineVariables()
        if os.path.exists(self.maskDataSetFile):
            self.data = self.filter()

            # To Geodata
            self.logger.info('Converting to Geodataset...')
            self.data = self.data.asGeoDataSet()
            self.applyMasks()

            # Calculate elevation difference
            if self.data.hasData():
                raster = RasterDataSet(self.config('referenceDem'))
                assert (self.maxX - self.minX) == (self.maxY - self.minY)
                buffer = (self.maxX - self.minX) * 0.1
                self.data.calculateElevationDifference(raster, buffer=buffer)

                self.addStatistics()
                self.publish()
                self.logger.info("STATISTICS: %s", self.data.getStats())
        else:
            self.logger.info(
                "No valid mask (fp=%s) found for %s, %s, %s, minX=%s, minY=%s, size=%s",
                self.maskDataSetFile, self.maskDataSet, 'Glacier', self.region,
                self.minX, self.minY, self.size)

        # shutdown
        self.logger.info("Finished process for: minX=%s, minY=%s, size=%s",
                         self.minX, self.minY, self.size)
        self.logger.info(
            '------------------------------------------------------------------'
        )
        logging.shutdown()

        # clear variables
        sys.modules[__name__].__dict__.clear()

    def filter(self):
        filters = self.config('filters')
        self.logger.info("Filtering dataset=%s with criteria %s" %
                         (self.inputDataSet, filters))
        result = self.client.executeQuery(self.dataSet, self.bbx, [], filters)
        self.logger.info("Result message: %s, %s" %
                         (result.status, result.message))
        fp = result.resultFileName
        data = PointDataSet(fp, self.projection)
        # release cache of file
        self.client.releaseCacheHandle(result.resultFileName)
        data.addStatistic('%s_filtered' % self.inputDataSet, data.length())
        self.logger.info("Filter %s result count [%d]" %
                         (self.inputDataSet, data.length()))
        return data

    def applyMasks(self):
        # Mask
        self.data.applyMask(self.maskDataSetFile, 'Glacier')

        # Add column if point is inside masks
        for idx, i in enumerate(self.withinDataSetFiles):
            self.data.withinMask(i, self.withinDataSetTypes[idx])

    def addStatistics(self):
        self.logger.info('Adding additional statistics')
        # number of srtm and number of tandemX
        self.data.addStatistic('result_total', self.data.length())
        #stats['result_srtm'] = float(data.loc[data.dataset == 'SRTM', 'dataset'].count())
        #stats['result_tandemx'] = float(data.loc[data.dataset == 'TandemX', 'dataset'].count())
        self.data.addStatistic('result_avgX', self.data.mean('x'))
        self.data.addStatistic('result_avgY', self.data.mean('y'))
        self.data.addStatistic(
            'result_offsetX',
            self.data.getStats()['result_avgX'] - (self.minX +
                                                   (self.size / 2)))
        self.data.addStatistic(
            'result_offsetY',
            self.data.getStats()['result_avgY'] - (self.minY +
                                                   (self.size / 2)))

        # counts per year
        # @TODO do this in glacier years
        years = [x for x in range(self.minT.year, self.maxT.year + 1)]
        for year in years:
            start = datetime.datetime(year, 1, 1, 0, 0)
            end = datetime.datetime(year + 1, 1, 1, 0, 0)
            start = calendar.timegm(start.utctimetuple())
            end = calendar.timegm(end.utctimetuple())
            # count
            keyCount = "result_count_%s" % (year)
            peryear = float(
                self.data.data.loc[(self.data.data.time >= start)
                                   & (self.data.data.time < end)].shape[0])
            self.data.addStatistic(keyCount, peryear)
            # elevation difference
            elevDiff = "result_refDifference_%s" % (year)
            if peryear > 0.0:
                self.data.addStatistic(
                    elevDiff,
                    float(self.data.data.loc[(self.data.data.time >= start) &
                                             (self.data.data.time < end),
                                             'refDifference'].mean()))
            else:
                self.data.addStatistic(elevDiff, 0.0)

    def publish(self, outEnvironment='/data/puma1/scratch/mtngla/ReadyData'):
        # get data as normal pandas dataframe without the geo ref
        data = self.data.getData(geo=False)

        outPath = os.path.join(
            outEnvironment,
            "ReadyData_%s_x%s_y%s.nc" % (self.minX, self.minY, self.size))
        xarr = data.to_xarray()
        xarr.to_netcdf(outPath)

        # publish
        self.logger.info('Publish new dataset...')
        result = self.query_async.publishGridCellPoints(
            self.parentDsName,
            self.outputDataSet, self.region, self.minX, self.minY,
            self.data.min('time'), self.size, outPath, self.projection)
        self.logger.info('Response: %s' % result.json)
        # delete temporary file
        os.remove(outPath)

        # publish stats
        self.logger.info('Publish gridcell statistics...')
        response = self.query_sync.publishGridCellStats(
            self.parentDsName, self.runName, self.minX, self.minY, self.size,
            self.data.getStats())
        self.logger.info('Response: %s' % response)

    def defineVariables(self):
        self.query_sync = DataSetQuery(self.config('malardSyncURL'),
                                       self.config('malardEnvironmentName'))
        self.query_async = AsyncDataSetQuery(
            self.config('malardAsyncURL'),
            self.config('malardEnvironmentName'), False)
        # minT and maxT

        # get projection
        self.projection = self.client.getProjection(self.dataSet).proj4

        #minT and maxT
        bbx = self.client.boundingBox(self.dataSet)
        self.minT = bbx.minT
        self.maxT = bbx.maxT

        self.bbx = BoundingBox(self.minX, self.maxX, self.minY, self.maxY,
                               self.minT, self.maxT)

        # masks
        mGla = self.query_sync.getGridCellMask(self.parentDsName,
                                               self.maskDataSet, 'Glacier',
                                               self.region, self.minX,
                                               self.minY, self.size)
        self.maskDataSetFile = json.loads(mGla)['fileName']

        self.withinDataSetFiles = []
        for i, el in enumerate(self.withinDataSets):
            if os.path.exists(el):
                self.withinDataSetFiles.append(el)
            else:
                mask = self.query_sync.getGridCellMask(
                    self.parentDsName, el, self.withinDataSetTypes[i],
                    self.region, self.minX, self.minY, self.size)
                self.withinDataSetFiles.append(json.loads(mask)['fileName'])

    @staticmethod
    def config(name):
        return MtnGlaGridcellProcess.__conf[name]

    def uncaughtErrorHandler(self, type, value, tb):
        self.logger.error("Uncaught exception", exc_info=(type, value, tb))
class DemDiffMadProcess:

    # ALASKA
    #"runName": "AlaskaRun1",
    #"outputDataSet": "ReadyDataAlaska2",
    #"parentDsName": "mtngla",
    #"region":"alaska",
    #"maskDataSet": "RGIv60",
    #"withinDataSets": ["SDCv10", "/data/puma1/scratch/mtngla/dems/TD_AD_Interp_SurfaceSplit.tiff"],
    #"withinDataSetTypes": ["Debris", "DataSet"],
    #"referenceDem": "/data/puma1/scratch/mtngla/dems/PCR_TdxFilledWithAD_Masked_Polar_Interp_clip.tif",
    #"inputDataSet": "ADwithTDX",

    __conf = {
        "outputDataSet": "tdx4",
        "parentDsName": "mtngla",
        "region": "himalayas",
        "inputDataSet": "tdx2",
        "malardEnvironmentName": "DEVv2",
        "malardSyncURL": "http://localhost:9000",
        "malardAsyncURL": "ws://localhost:9000",
        "buffer": 15000,
        "maskDataSet": "RGIv60",
        "filters": [{
            'column': 'power',
            'op': 'gt',
            'threshold': 10000
        }, {
            'column': 'coh',
            'op': 'gt',
            'threshold': 0.6
        }]
    }

    def __init__(self, minX, maxX, minY, maxY, logFile=None, notebook=False):
        '''
        :param minX: minimum x of the grid cell extent (map units)
        :param maxX: maximum x of the grid cell extent (map units)
        :param minY: minimum y of the grid cell extent (map units)
        :param maxY: maximum y of the grid cell extent (map units)
        :param logFile: if a log file is specified, the logger writes to that file instead of the terminal
        :param notebook: passed through to MalardClient
        '''
        if logFile is None:
            logging.basicConfig(
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        else:
            logging.basicConfig(
                filename=logFile,
                filemode='a',
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        sys.excepthook = self.uncaughtErrorHandler

        self.logger = logging.getLogger(__name__)
        self.logger.info('Process started')

        self.client = MalardClient(notebook=notebook)
        self.query_async = AsyncDataSetQuery(
            self.config('malardAsyncURL'),
            self.config('malardEnvironmentName'), False)

        self.minX = minX
        self.maxX = maxX
        self.minY = minY
        self.maxY = maxY
        assert (self.maxX - self.minX) == (self.maxY - self.minY)
        self.size = maxX - minX
        self.inputDataSet = DataSet(parentDs=self.config('parentDsName'),
                                    dataSet=self.config('inputDataSet'),
                                    region=self.config('region'))
        self.parentDsName = self.config('parentDsName')
        self.outputDataSet = self.config('outputDataSet')
        self.region = self.config('region')
        self.buffer = self.config('buffer')

        self.projection = self.client.getProjection(self.inputDataSet).proj4

        bbx = self.client.boundingBox(self.inputDataSet)
        self.minT = bbx.minT
        self.maxT = bbx.maxT

        # masks
        maskDataSet = self.config('maskDataSet')
        query_sync = DataSetQuery(self.config('malardSyncURL'),
                                  self.config('malardEnvironmentName'))
        mGla = query_sync.getGridCellMask(self.parentDsName, maskDataSet,
                                          'Glacier', self.region, self.minX,
                                          self.minY, self.size)
        self.maskDataSetFile = json.loads(mGla)['fileName']

    def startProcess(self):
        self.logger.info(
            'Starting gridcell: minX=%s, minY=%s, parentDs=%s, inputDataSet=%s, outputDataSet=%s',
            self.minX, self.minY, self.parentDsName, self.inputDataSet,
            self.outputDataSet)
        if os.path.exists(self.maskDataSetFile):
            self.data = self.filter()
            # Calculate elevation difference
            if self.data.hasData():

                # mean absolute deviation of demDiff per (swathFileId, wf_number) group
                self.logger.info('Calculate demDiffMad...')
                self.data.data['demDiffMadNew'] = self.data.data[
                    'demDiff'].groupby([
                        self.data.data['swathFileId'],
                        self.data.data['wf_number']
                    ]).transform('mad')
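                # NOTE (hedged): Series.mad was removed in pandas 2.0; on newer
                # pandas an equivalent transform would be
                #   .transform(lambda s: (s - s.mean()).abs().mean())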
                # delete gridcells outside cell (the ones that are within a buffer zone)
                self.logger.info('Cut down to gridcell...')
                filtered = self.data.data[((self.data.data.x > self.minX) &
                                           (self.data.data.x < self.maxX) &
                                           (self.data.data.y > self.minY) &
                                           (self.data.data.y < self.maxY))]
                self.logger.info(
                    'Count data before cut to gridcell =%s, after cut=%s',
                    self.data.data.shape[0], filtered.shape[0])
                self.data.data = filtered
                if self.data.hasData():
                    self.publish()

            else:
                self.logger.info("No data in result query")

        else:
            self.logger.info("No glacier mask for area.")

        # shutdown
        self.logger.info("Finished process for: minX=%s, minY=%s, size=%s",
                         self.minX, self.minY, self.size)
        self.logger.info(
            '------------------------------------------------------------------'
        )
        logging.shutdown()

        # clear variables
        sys.modules[__name__].__dict__.clear()

    def filter(self):
        filters = self.config('filters')
        self.logger.info("Filtering dataset=%s with criteria %s" %
                         (self.inputDataSet.dataSet, filters))
        minXb = self.minX - self.buffer
        maxXb = self.maxX + self.buffer
        minYb = self.minY - self.buffer
        maxYb = self.maxY + self.buffer

        self.logger.info(
            "Bounding box with buffer: minX=%s maxX=%s, minY=%s, maxY=%s" %
            (minXb, maxXb, minYb, maxYb))

        bbx_in = BoundingBox(minXb, maxXb, minYb, maxYb, self.minT, self.maxT)
        result = self.client.executeQuery(self.inputDataSet,
                                          bbx_in,
                                          projections=[],
                                          filters=filters)
        self.logger.info("Result message: %s, %s" %
                         (result.status, result.message))
        data = PointDataSet(result.resultFileName, self.projection)

        self.logger.info("Data points count: %s" % (data.data.shape[0]))

        # release cache of file
        self.client.releaseCacheHandle(result.resultFileName)
        return data

    def publish(self, outEnvironment='/data/puma1/scratch/mtngla/ReadyData'):
        outPath = os.path.join(
            outEnvironment,
            "ReadyData_%s_x%s_y%s.nc" % (self.minX, self.minY, self.size))
        xarr = self.data.data.to_xarray()
        xarr.to_netcdf(outPath)

        # publish
        self.logger.info('Publish new dataset...')
        result = self.query_async.publishGridCellPoints(
            self.parentDsName,
            self.outputDataSet, self.region, self.minX, self.minY,
            self.data.min('time'), self.size, outPath, self.projection)
        self.logger.info('Response: %s' % result.json)
        # delete temporary file
        os.remove(outPath)

    @staticmethod
    def config(name):
        return DemDiffMadProcess.__conf[name]

    def uncaughtErrorHandler(self, type, value, tb):
        self.logger.error("Uncaught exception", exc_info=(type, value, tb))
class RegressionRun:

    # __conf = {
    #     "outputFileName": "himalayas-gridcells.gpkg",
    #     "inputDataSet": "HimMad2",
    #     "runName": "HimMad2",
    #     "region":"himalayas",
    #     "parentDsName": "mtngla",
    #     "outputPath": "regression_results",
    #     "malardEnvironmentName": "DEVv2",
    #     "malardSyncURL": "http://localhost:9000",
    #     "malardAsyncURL": "ws://localhost:9000",
    #    "filters" : [{'column':'power','op':'gt','threshold':10000},{'column':'coh','op':'gt','threshold':0.6}, \
    #                 {'column':'demDiff','op':'lt','threshold':100}, {'column':'demDiffMadNew','op':'lt','threshold':10}, \
    #                 {'column':'demDiff','op':'gt','threshold':-100}, \
    #                 {'column':'refDifference','op':'gt','threshold':-150}, {'column':'refDifference','op':'lt','threshold':150}, \
    #                 {'column':'within_DataSet','op':'gt','threshold':1}]
    # }

    __conf = {
        "outputFileName": "alaska-gridcells-double.gpkg",
        "inputDataSet": "AlaskaMad",
        "runName": "AlaskaMad",
        "region":"alaska",
        "parentDsName": "mtngla",
        "outputPath": "regression_results",
        "malardEnvironmentName": "DEVv2",
        "malardSyncURL": "http://localhost:9000",
        "malardAsyncURL": "ws://localhost:9000",
        "filters" : [{'column':'power','op':'gt','threshold':10000},{'column':'coh','op':'gt','threshold':0.6}, \
                     {'column':'demDiff','op':'lt','threshold':100}, {'column':'demDiffMadNew','op':'lt','threshold':10}, \
                     {'column':'demDiff','op':'gt','threshold':-100}, \
                     {'column':'refDifference','op':'gt','threshold':-150}, {'column':'refDifference','op':'lt','threshold':150}, \
                     {'column':'within_DataSet','op':'gt','threshold':1}]
    }

    # __conf = {
    #     "outputFileName": "iceland5.gpkg",
    #     "inputDataSet": "tdx",
    #     "runName": "RunIce",
    #     "region":"iceland",
    #     "parentDsName": "mtngla",
    #     "outputPath": "regression_results",
    #     "malardEnvironmentName": "DEVv2",
    #     "malardSyncURL": "http://localhost:9000",
    #     "malardAsyncURL": "ws://localhost:9000",
    #     "filters" : [{'column':'powerScaled','op':'gt','threshold':10000},{'column':'coh','op':'gt','threshold':0.8}, \
    #                   {'column':'demDiff','op':'lt','threshold':200}, {'column':'demDiffMadNew','op':'lt','threshold':40}, \
    #                   ]
    # }

    def __init__(self, logFile=None, notebook=False):
        '''
        :param logFile: if a log file is specified, the logger writes to that file instead of the terminal
        :param notebook: passed through to MalardClient
        '''
        if logFile is None:
            logging.basicConfig(
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        else:
            logging.basicConfig(
                filename=logFile,
                filemode='a',
                format=
                '%(asctime)s, %(threadName)s %(thread)d: %(name)s %(levelname)s %(message)s',
                datefmt='%H:%M:%S',
                level=logging.INFO)
        sys.excepthook = self.uncaughtErrorHandler

        self.logger = logging.getLogger(__name__)
        self.logger.info('Process started')

        #self.parentDsName = self.config('parentDsName')
        self.inputDataSet = DataSet(parentDs=self.config('parentDsName'),
                                    dataSet=self.config('inputDataSet'),
                                    region=self.config('region'))
        #self.region = self.config('region')
        self.runName = self.config('runName')

        self.client = MalardClient(notebook=notebook)

        self.query_sync = DataSetQuery(self.config('malardSyncURL'),
                                       self.config('malardEnvironmentName'))
        #self.query_async = AsyncDataSetQuery.AsyncDataSetQuery(self.config('malardAsyncURL'), self.config('malardEnvironmentName'), False)
        # get projection
        #self.projection = json.loads(self.client.getProjection(self.parentDsName, self.region))['proj4']
        self.projection = self.client.getProjection(self.inputDataSet).proj4

    def gridcellRegression(self,
                           boundingBox,
                           linear=True,
                           robust=True,
                           weighted=None,
                           minCount=10,
                           radius=None,
                           filters=None):
        if filters is None:
            filters = self.config('filters')
        self.logger.info("Filtering dataset=%s with criteria %s" %
                         (self.inputDataSet, filters))

        result = self.client.executeQuery(self.inputDataSet,
                                          boundingBox,
                                          projections=[],
                                          filters=filters)
        #result = self.client.executeQuery(self.inputDataSet, boundingBox, projections=[])

        self.logger.info("Result message: status=%s, message=%s" %
                         (result.status, result.message))
        data = PointDataSet(result.resultFileName, self.projection)
        self.logger.info("Dataset has %s points" % (data.data.shape[0]))

        if radius is not None:
            centerX = boundingBox.minX + (
                abs(boundingBox.maxX - boundingBox.minX) / 2)
            centerY = boundingBox.minY + (
                abs(boundingBox.maxY - boundingBox.minY) / 2)
            self.logger.info("Apply radius with centerX=%s and centerY=%s..." %
                             (centerX, centerY))
            self.logger.info("Before radius count=%s..." %
                             (data.data.shape[0]))
            data.applyRadius(radius=radius, centerX=centerX, centerY=centerY)
            self.logger.info("After radius count=%s..." % (data.data.shape[0]))

        # release cache of file
        self.client.releaseCacheHandle(result.resultFileName)
        results = {}
        if data.data.shape[0] > minCount and data.data['time'].nunique() != 1:
            if linear:
                r = data.linearRegression()
                results = {**results, **r}
            if robust:
                r = data.robustRegression()
                results = {**results, **r}
            if weighted is not None:
                for w in weighted:
                    r = data.weightedRegression(weight=w['weight'],
                                                mask=w['mask_std_dev'])
                    results = {**results, **r}
            self.logger.info(results)
        else:
            self.logger.info("Not enough data in cell (%s points)" %
                             (data.data.shape[0]))

        return results

    def regressionFromStats(self,
                            linear=True,
                            robust=True,
                            weighted=None,
                            minT=None,
                            maxT=None,
                            minCount=50,
                            save=True):
        self.logger.info("Get run statistics for parentDS=%s runName=%s ..." %
                         (self.inputDataSet.parentDataSet, self.runName))
        stats = self.query_sync.getRunStatistics(
            self.inputDataSet.parentDataSet, self.runName)
        stats = json.loads(stats)
        dfStats = json_normalize(stats)
        if minT is None and maxT is None:
            bbx = self.client.boundingBox(self.inputDataSet)
            minT = bbx.minT
            maxT = bbx.maxT

        for idx, line in dfStats.iterrows():
            if line['statistics.afterGlacierMask'] > minCount:
                minX, maxX = line['gridCell.minX'], line[
                    'gridCell.minX'] + line['gridCell.size']
                minY, maxY = line['gridCell.minY'], line[
                    'gridCell.minY'] + line['gridCell.size']

                self.logger.info(
                    "Calculating gridcell minX=%s maxX=%s minY=%s maxY=%s minT=%s maxT=%s ..."
                    % (minX, maxX, minY, maxY, minT, maxT))

                bbx_in = BoundingBox(minX, maxX, minY, maxY, minT, maxT)

                results = self.gridcellRegression(bbx_in,
                                                  linear=linear,
                                                  robust=robust,
                                                  weighted=weighted)
                self.logger.info("Adding regression results to stats...")
                for key in results:
                    if isinstance(results[key], list) and not np.isin(
                            key, dfStats.columns):
                        # add a new object-dtype column so a list can be stored per cell
                        dfStats = dfStats.reindex(columns=np.append(
                            dfStats.columns.values, [key]))
                        dfStats[key] = dfStats[key].astype('object')
                    dfStats.at[idx, key] = results[key]

        size = dfStats['gridCell.size']
        geometry = [
            Point(xy)
            for xy in zip(dfStats['gridCell.minX'] +
                          (size / 2), dfStats['gridCell.minY'] + (size / 2))
        ]
        dfStats = gp.GeoDataFrame(dfStats,
                                  crs=self.projection,
                                  geometry=geometry)

        if save:
            file = os.path.join(self.config("outputPath"),
                                self.config("outputFileName"))
            self.logger.info("Saving results under file=%s" % file)
            dfStats.to_file(file, driver="GPKG")

        return dfStats

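    # A hedged usage sketch (not from the source): regress every grid cell
    # that has run statistics and at least 100 points after glacier masking;
    # `run` is a configured RegressionRun instance and the values are assumed.
    #
    #   dfStats = run.regressionFromStats(linear=True, robust=True,
    #                                     minCount=100, save=True)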
    def regressionFromList(self,
                           gridcells,
                           linear=True,
                           robust=True,
                           weighted=None,
                           minT=None,
                           maxT=None,
                           save=True,
                           radius=None,
                           geometry='point'):

        dfStats = pd.DataFrame(gridcells)

        if minT is None and maxT is None:
            bbx = self.client.boundingBox(self.inputDataSet)
            minT = bbx.minT
            maxT = bbx.maxT

        for idx, line in dfStats.iterrows():

            self.logger.info(
                "Calculating gridcell minX=%s maxX=%s minY=%s maxY=%s minT=%s maxT=%s ..."
                % (line['minX'], line['maxX'], line['minY'], line['maxY'],
                   minT, maxT))
            bbx_in = BoundingBox(line['minX'].item(), line['maxX'].item(),
                                 line['minY'].item(), line['maxY'].item(),
                                 minT, maxT)

            results = self.gridcellRegression(bbx_in,
                                              linear=linear,
                                              robust=robust,
                                              weighted=weighted,
                                              radius=radius)

            self.logger.info("Adding regression results to stats...")
            for key in results:
                if isinstance(results[key], list):
                    # same handling as in regressionFromStats: list-valued
                    # results need an object-dtype column before assignment
                    if key not in dfStats.columns:
                        dfStats = dfStats.reindex(columns=np.append(
                            dfStats.columns.values, [key]))
                        dfStats[key] = dfStats[key].astype('object')
                    dfStats.at[idx, key] = results[key]
                else:
                    dfStats.at[idx, key] = results[key]

        size = dfStats['maxX'] - dfStats['minX']
        if geometry == 'point':
            self.logger.info("Converting to point geometry")
            geometry = [
                Point(xy)
                for xy in zip(dfStats['minX'] + (size / 2), dfStats['minY'] +
                              (size / 2))
            ]
        elif geometry == 'cell':
            self.logger.info("Converted to cell geometry")
            geometry = []
            for idx, line in dfStats.iterrows():
                minX, maxX = line['minX'], line['maxX']
                minY, maxY = line['minY'], line['maxY']
                geometry.append(
                    Polygon([(minX, minY), (minX, maxY), (maxX, maxY),
                             (maxX, minY), (minX, minY)]))
        else:
            raise ValueError(
                "Invalid geometry %r specified. Must be either 'point' or 'cell'"
                % geometry)
        dfStats = gp.GeoDataFrame(dfStats,
                                  crs=self.projection,
                                  geometry=geometry)

        if save:
            file = os.path.join(self.config("outputPath"),
                                self.config("outputFileName"))
            self.logger.info("Saving results under file=%s" % file)
            dfStats.to_file(file, driver="GPKG")

        return dfStats

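    # A hedged usage sketch (not from the source): `gridcells` is a list of
    # dicts with 'minX'/'maxX'/'minY'/'maxY' keys as consumed above; the
    # coordinates are illustrative only.
    #
    #   cells = [{'minX': 0, 'maxX': 100000, 'minY': 0, 'maxY': 100000}]
    #   dfStats = run.regressionFromList(cells, geometry='cell')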
    def regressionFromRaster(self,
                             file,
                             linear=True,
                             robust=True,
                             weighted=None,
                             minT=None,
                             maxT=None,
                             save=True,
                             rasterNoData=-1000000,
                             radius=None,
                             geometry='point'):
        ''' Calculates the regression for cells corresponding to the cells of a given input raster.

        :param file: path to the raster file
        :param radius: if None the exact extent of each raster cell is used, otherwise the points within the given radius around the raster cell centre are used
        :param rasterNoData: raster cells with this value are skipped
        :return: GeoDataFrame with the regression results per cell
        '''
        self.logger.info(
            "Start regression from raster for parentDS=%s runName=%s ..." %
            (self.inputDataSet.parentDataSet, self.runName))
        if minT is None and maxT is None:
            bbx = self.client.boundingBox(self.inputDataSet)
            minT = bbx.minT
            maxT = bbx.maxT

        raster = RasterDataSet(file)

        if radius is None:
            extents = raster.getCellsAsExtent()
        else:
            xy, values = raster.getCenterPoints()
            extents = []
            for i, el in enumerate(xy):
                self.logger.info("Calculating gridcell %s / %s ..." %
                                 (i + 1, len(values)))
                if values[i] != rasterNoData:
                    ext = {
                        'minX': el[0] - radius,
                        'maxX': el[0] + radius,
                        'minY': el[1] - radius,
                        'maxY': el[1] + radius
                    }
                    extents.append(ext)
                    self.logger.info(
                        "Extent with radius=%s is minX=%s maxX=%s minY=%s maxY=%s ..."
                        % (radius, ext['minX'], ext['maxX'], ext['minY'],
                           ext['maxY']))
                else:
                    self.logger.info(
                        "Raster cell=%s has no data (value=%s) and is skipped ..."
                        % (el, rasterNoData))

        stats = self.regressionFromList(extents,
                                        linear=linear,
                                        robust=robust,
                                        weighted=weighted,
                                        minT=minT,
                                        maxT=maxT,
                                        save=save,
                                        radius=radius,
                                        geometry=geometry)

        return stats

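    # A hedged usage sketch (not from the source): drive the regression from a
    # raster, using all points within 500 m of each raster cell centre; the
    # file path and radius are illustrative only.
    #
    #   stats = run.regressionFromRaster('/path/to/reference.tif', radius=500,
    #                                    geometry='point')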
    def regressionFromFile(self,
                           file,
                           linear=True,
                           robust=True,
                           weighted=None,
                           minT=None,
                           maxT=None,
                           save=True,
                           radius=None,
                           geometry='point'):
        ''' Calculates the regression for cells read from a CSV file, one extent per line as minX,maxX,minY,maxY.

        :param file: path to the CSV file of cell extents
        :param radius: if None the exact extent of each cell is used, otherwise the points within the given radius are used
        :return: GeoDataFrame with the regression results per cell
        '''

        self.logger.info("Start regression from file for parentDS=%s ..." %
                         (self.inputDataSet.parentDataSet))
        if minT is None and maxT is None:
            bbx = self.client.boundingBox(self.inputDataSet)
            minT = bbx.minT
            maxT = bbx.maxT

        extents = []
        with open(file) as f:
            for line in f:
                split = line.strip().split(",")
                ext = {
                    'minX': int(split[0]),
                    'maxX': int(split[1]),
                    'minY': int(split[2]),
                    'maxY': int(split[3])
                }
                extents.append(ext)
        stats = self.regressionFromList(extents,
                                        linear=linear,
                                        robust=robust,
                                        weighted=weighted,
                                        minT=minT,
                                        maxT=maxT,
                                        save=save,
                                        radius=radius,
                                        geometry=geometry)

        return stats

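    # A hedged usage sketch (not from the source): the extent file is plain
    # CSV with one cell per line as minX,maxX,minY,maxY (integers), e.g.
    # "0,100000,0,100000"; the file name is illustrative only.
    #
    #   stats = run.regressionFromFile('extents.csv', geometry='cell')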
    @staticmethod
    def config(name):
        return RegressionRun.__conf[name]

    def uncaughtErrorHandler(self, exc_type, exc_value, exc_traceback):
        self.logger.error("Uncaught exception",
                          exc_info=(exc_type, exc_value, exc_traceback))
Example #14
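# NOTE: the beginning of this example is truncated in the source; the lines
# below are the tail of a helper that packs x/y array-likes into a DataFrame.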
    df = pd.DataFrame()
    df['x'] = x
    df['y'] = y

    return df


environmentName = 'DEVv2'

gridCellSize = 100000
resolution = 500

mask_prefix = "SARIN"

client = MalardClient(environmentName, True)

dataSet = DataSet('cryotempo', 'GRIS_BaselineC_Q2', 'greenland')

proj4 = client.getProjection(dataSet).proj4

mask = '/data/puma1/scratch/cryotempo/masks/icesheets.shp' if mask_prefix == "ICE" else '/data/puma1/scratch/cryotempo/sarinmasks/{}_Greenland.shp'.format(
    mask_prefix)

tmpPath = '/home/jon/data/masks/'

bbox = client.boundingBox(dataSet)

gridCells = client.gridCells(dataSet, bbox)

for gc in gridCells: