# Example 1
def metadata_convert(path, bucket=None):
    """Prepare metadata prior to datacube indexing

    Given a directory containing biogeo zones tiles generated by the
    rasterize_vector_file command line, prepares a metadata string with the
    appropriate formatting for indexing in the datacube.
    The other ``metadata_convert`` functions assume that datasets are separated
    in different directories, so that the ``prepare_metadata`` command line
    handles the optional iteration based on this assumption. Because all tiles
    of a country mask are in a same folder, iteration has to be handled by the
    present function.
    The tiles are expected to be in EPSG:4326 crs.

    Args:
        path (str): Path of the directory containing the biogeographic zones tiles
        bucket (str or None): Name of the s3 bucket containing the data. If ``None``
            (default), data are considered to be on a mounted filesystem

    Examples:
        >>> from madmex.ingestion.biogeographic_zones import metadata_convert
        >>> path = '/path/to/mask/dir'
        >>> yaml_str = metadata_convert(path)
        >>> with open('/path/to/metadata_out.yaml', 'w') as dst:
        >>>     dst.write(yaml_str)

    Returns:
        str: The content of the metadata for later writing to file.
    """
    import logging
    logger = logging.getLogger(__name__)

    if bucket is not None:
        file_list = s3.list_files(bucket, path, r'.*_\d+.tif$')
        file_list = [s3.build_rasterio_path(bucket, x) for x in file_list]
    else:
        file_list = glob(os.path.join(path, '*_*.tif'))

    # Load the template once; it is invariant across tiles, so there is no
    # reason to rebuild the Jinja environment for every file
    env = Environment(loader=PackageLoader('madmex', 'templates'))
    template = env.get_template('biogeographic_zones.yaml')

    def build_meta_string(x):
        """Generate the yaml string for a single tile

        Args:
            x (str): The path of the dataset

        Returns:
            str or None: The rendered yaml string, or ``None`` if the tile
                could not be processed (best-effort behavior).
        """
        try:
            with rasterio.open(x) as src:
                crs = src.crs
                bounds = src.bounds
            meta_out = {
                'id': uuid.uuid5(uuid.NAMESPACE_URL, x),
                'll_lat': bounds.bottom,
                'lr_lat': bounds.bottom,
                'ul_lat': bounds.top,
                'ur_lat': bounds.top,
                'll_lon': bounds.left,
                'lr_lon': bounds.right,
                'ul_lon': bounds.left,
                'ur_lon': bounds.right,
                'crs': crs.wkt,
                'band': x,
            }
            return template.render(**meta_out)
        except Exception as e:
            # Best effort: skip unreadable tiles, but leave a trace instead of
            # silently swallowing the error
            logger.warning('Skipping tile %s (%s)', x, e)
            return None

    yaml_list = [build_meta_string(x) for x in file_list]
    yaml_list = [x for x in yaml_list if x is not None]
    return '\n---\n'.join(yaml_list)
# Example 2
 def get_band(suffix):
     """Return the path of the 20m resolution jp2 file matching *suffix*.

     Relies on ``all_files`` and ``bucket`` from the enclosing scope. Raises
     IndexError if no file matches.
     """
     regex = re.compile(r'.*GRANULE/.*/IMG_DATA/R20m/.*%s_20m\.jp2$' % suffix)
     matches = [f for f in all_files if regex.search(f)]
     selected = matches[0]
     if bucket is None:
         return selected
     return s3.build_rasterio_path(bucket, selected)
# Example 3
def metadata_convert(path, bucket=None):
    """Prepare metadata prior to datacube indexing

    Given a directory containing sentinel1 (polarisation VH and VV)
    data preprocessed with snappy, prepares a metadata string with
    the appropriate formatting.

    Args:
        path (str): Path of the directory containing sentinel1 data.
        bucket (str or None): Name of the s3 bucket containing the data. If ``None``
            (default), data are considered to be on a mounted filesystem

    Examples:
        >>> from madmex.ingestion.s1_grd_vh_vv import metadata_convert

        >>> path = '/path/to/s1/dir'
        >>> yaml_str = metadata_convert(path)

        >>> with open('/path/to/metadata_out.yaml', 'w') as dst:
        >>>     dst.write(yaml_str)

    Returns:
        str: The content of the metadata for later writing to file.

    Raises:
        ValueError: If the directory does not contain both a VH and a VV
            filtered tif file.
    """
    if bucket is not None:
        file_list = [
            os.path.basename(x)
            for x in s3.list_files(bucket, path, r'.*filtered\.tif$')
        ]
        path = s3.build_rasterio_path(bucket, path)
    else:
        file_list = [
            os.path.basename(x)
            for x in glob(os.path.join(path, '*filtered.tif'))
        ]
    # Fail with an explicit message rather than a bare IndexError when one of
    # the two expected polarisations is missing
    try:
        pol_vh = [x for x in file_list if '_VH_' in x][0]
        pol_vv = [x for x in file_list if '_VV_' in x][0]
    except IndexError:
        raise ValueError('Directory must contain both a *_VH_* and a *_VV_* '
                         'filtered tif file')
    pol_vh = os.path.join(path, pol_vh)
    pol_vv = os.path.join(path, pol_vv)

    # The acquisition datetime is encoded in the first underscore-delimited
    # token of the filename, optionally with a 'T' separator
    fname = os.path.basename(pol_vh).split("_")[0]
    if 'T' in fname:
        date_str = fname.replace('T', '')
    else:
        date_str = fname
    dt = datetime.strptime(date_str, '%Y%m%d%H%M%S')
    # Read georeferencing information from the VH band (both bands are
    # assumed to share the same grid)
    with rasterio.open(pol_vh) as src:
        crs = src.crs
        bounds = src.bounds
    meta_out = {
        'id': uuid.uuid5(uuid.NAMESPACE_URL, path),
        'll_lat': bounds.bottom,
        'lr_lat': bounds.bottom,
        'ul_lat': bounds.top,
        'ur_lat': bounds.top,
        'll_lon': bounds.left,
        'lr_lon': bounds.right,
        'ul_lon': bounds.left,
        'ur_lon': bounds.right,
        'dt': dt.strftime('%Y-%m-%dT%H:%M:%S'),  # 2018-01-22T17:56:29
        'crs': crs.wkt,
        'pol_vh': pol_vh,
        'pol_vv': pol_vv,
    }
    # Load template
    env = Environment(loader=PackageLoader('madmex', 'templates'))
    template = env.get_template('s1_grd_vh_vv.yaml')
    out = template.render(**meta_out)
    return out
# Example 4
# Namespace mapping of the ESPA metadata schema, shared by all xml lookups
_ESPA_NS = {'ns': 'http://espa.cr.usgs.gov/v2'}


def _espa_text(root, xpath):
    """Return the text content of the first element matching *xpath*."""
    return root.find(xpath, namespaces=_ESPA_NS).text


def _espa_corner(root, location, axis):
    """Return a projected corner coordinate of the scene as a float.

    Args:
        root: Root element of the parsed ESPA metadata xml.
        location (str): Corner location attribute ('UL' or 'LR').
        axis (str): Coordinate axis attribute ('x' or 'y').
    """
    element = root.find(
        'ns:global_metadata/ns:projection_information/'
        'ns:corner_point[@location="%s"]' % location,
        namespaces=_ESPA_NS)
    return float(element.attrib[axis])


def _espa_band_path(root, path, band_name):
    """Return the full path of the file holding *band_name*."""
    fname = root.find('ns:bands/ns:band[@name="%s"]/ns:file_name' % band_name,
                      namespaces=_ESPA_NS).text
    return os.path.join(path, fname)


def metadata_convert(path, bucket=None):
    """Prepare metadata prior to datacube indexing

    Given a directory containing landsat surface reflectance bands and a MLT.txt
    file, prepares a metadata string with the appropriate formatting.

    Args:
        path (str): Path of the directory containing the surface reflectance bands
            and the Landsat metadata file.
        bucket (str or None): Name of the s3 bucket containing the data. If ``None``
            (default), data are considered to be on a mounted filesystem

    Examples:
        >>> from madmex.ingestion.landsat_espa import metadata_convert
        >>> from glob import glob

        >>> scene_list = glob('/path/to/scenes/*')
        >>> yaml_list = [metadata_convert(x) for x in scene_list]

        >>> with open('/path/to/metadata_out.yaml', 'w') as dst:
        >>>     for yaml in yaml_list:
        >>>         dst.write(yaml)
        >>>         dst.write('\n---\n')

    Returns:
        str: The content of the metadata for later writing to file.

    Raises:
        ValueError: If ``path`` is not a directory (filesystem mode) or if a
            unique xml metadata file cannot be identified.
    """
    pattern = re.compile(
        r'[A-Z0-9]{4}_[A-Z0-9]{4}_\d{6}_\d{8}_\d{8}_01_(T1|T2|RT)\.xml')
    if bucket is None:
        # Check that path is a dir and contains appropriate files
        if not os.path.isdir(path):
            raise ValueError('Argument path= is not a directory')
        mtl_file_list = glob(os.path.join(path, '*.xml'))
        # Filter list of xml files with regex (there could be more than one in
        # case some bands have been opened in qgis for example)
        mtl_file_list = [x for x in mtl_file_list if pattern.search(x)]
        if len(mtl_file_list) != 1:
            raise ValueError('Could not identify a unique xml metadata file')
        mtl_file = mtl_file_list[0]
        # Start parsing xml
        root = ET.parse(mtl_file).getroot()
    else:
        file_list = s3.list_files(bucket=bucket, path=path)
        pattern = re.compile(r'.*\.xml$')
        mtl_file_list = [x for x in file_list if pattern.search(x)]
        if len(mtl_file_list) != 1:
            raise ValueError('Could not identify a unique xml metadata file')
        mtl_file = mtl_file_list[0]
        # Read xml as string and generate element tree root
        xml_str = s3.read_file(bucket, mtl_file)
        root = ET.fromstring(xml_str)
        path = s3.build_rasterio_path(bucket, path)

    # Build datetime from date and time (time truncated to whole seconds)
    date_str = _espa_text(root, 'ns:global_metadata/ns:acquisition_date')
    time_str = _espa_text(root, 'ns:global_metadata/ns:scene_center_time')
    dt = '%sT%s' % (date_str, time_str[:8])
    # satellite sensor metadata
    instrument = _espa_text(root, 'ns:global_metadata/ns:instrument')
    satellite = _espa_text(root, 'ns:global_metadata/ns:satellite')
    # Scene corners in projected coordinates
    ulx = _espa_corner(root, 'UL', 'x')
    uly = _espa_corner(root, 'UL', 'y')
    lrx = _espa_corner(root, 'LR', 'x')
    lry = _espa_corner(root, 'LR', 'y')
    utm_zone = int(
        _espa_text(
            root, 'ns:global_metadata/ns:projection_information/'
            'ns:utm_proj_params/ns:zone_code'))
    crs = CRS({'proj': 'utm', 'zone': utm_zone})
    # Get corner coordinates in long lat by transforming from projected values
    p = Proj(crs)
    ul_lon, ul_lat = p(ulx, uly, inverse=True)
    lr_lon, lr_lat = p(lrx, lry, inverse=True)
    ll_lon, ll_lat = p(ulx, lry, inverse=True)
    ur_lon, ur_lat = p(lrx, uly, inverse=True)
    # Sensor specific band name -> generic band name mapping
    bands = LANDSAT_BANDS[instrument]
    # Prepare metadata fields
    meta_out = {
        'id': uuid.uuid5(uuid.NAMESPACE_URL, path),
        'dt': dt,
        'll_lat': ll_lat,
        'lr_lat': lr_lat,
        'ul_lat': ul_lat,
        'ur_lat': ur_lat,
        'll_lon': ll_lon,
        'lr_lon': lr_lon,
        'ul_lon': ul_lon,
        'ur_lon': ur_lon,
        'll_x': ulx,
        'lr_x': lrx,
        'ul_x': ulx,
        'ur_x': lrx,
        'll_y': lry,
        'lr_y': lry,
        'ul_y': uly,
        'ur_y': uly,
        'crs': crs.wkt,
        'blue': _espa_band_path(root, path, bands['blue']),
        'green': _espa_band_path(root, path, bands['green']),
        'red': _espa_band_path(root, path, bands['red']),
        'nir': _espa_band_path(root, path, bands['nir']),
        'swir1': _espa_band_path(root, path, bands['swir1']),
        'swir2': _espa_band_path(root, path, bands['swir2']),
        'qual': _espa_band_path(root, path, 'pixel_qa'),
        'instrument': instrument,
        'platform': satellite,
    }
    # Load template
    env = Environment(loader=PackageLoader('madmex', 'templates'))
    template = env.get_template('landsat_espa.yaml')
    out = template.render(**meta_out)
    return out
# Example 5
def metadata_convert(path, bucket=None):
    """Prepare metadata prior to datacube indexing

    Given a directory containing bioclimatics raster information prepares
    a metadata string with the appropriate formatting for indexing in the
    datacube. The directory is expected to contain one tif file per variable
    (tmax, tmean, tmin) and per month, named ``<variable>_<month_number>.tif``
    (e.g. ``tmax_1.tif`` for January maximum temperature).

    Args:
        path (str): Path of the directory containing temperature and
            precipitation measurements.
        bucket (str or None): Name of the s3 bucket containing the data. If ``None``
            (default), data are considered to be on a mounted filesystem

    Examples:
        >>> from madmex.ingestion.bioclimatics import metadata_convert
        >>> path = '/path/to/bioclim/dir'
        >>> yaml_str = metadata_convert(path)

        >>> with open('/path/to/metadata_out.yaml', 'w') as dst:
        >>>     dst.write(yaml_str)

    Returns:
        str: The content of the metadata for later writing to file.
    """
    if bucket is not None:
        path = s3.build_rasterio_path(bucket, path)
    # Build the 36 (3 variables x 12 months) band paths programmatically
    # instead of spelling each one out; keys match the template placeholders
    # (e.g. tmax_jan, tmean_feb, ...)
    months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    variables = ['tmax', 'tmean', 'tmin']
    band_paths = {
        '%s_%s' % (var, month): os.path.join(path, '%s_%d.tif' % (var, num))
        for num, month in enumerate(months, start=1)
        for var in variables
    }

    # Read georeferencing information from one band; all bands are assumed
    # to share the same grid
    with rasterio.open(band_paths['tmax_jan']) as src:
        crs = src.crs
        bounds = src.bounds
    meta_out = {
        'id': uuid.uuid5(uuid.NAMESPACE_URL, path),
        'll_lat': bounds.bottom,
        'lr_lat': bounds.bottom,
        'ul_lat': bounds.top,
        'ur_lat': bounds.top,
        'll_lon': bounds.left,
        'lr_lon': bounds.right,
        'ul_lon': bounds.left,
        'ur_lon': bounds.right,
        'crs': crs.wkt,
    }
    meta_out.update(band_paths)
    # Load template
    env = Environment(loader=PackageLoader('madmex', 'templates'))
    template = env.get_template('bioclimatics.yaml')
    out = template.render(**meta_out)
    return out