def dfcolstorows(inDf, colField, valField, colFid=None):
    """
    Dataframe like:

        | pop_res |    ind2 | ind3 | id_unit
    0   |     571 | 35.0975 |  123 |    3768
    1   |     938 | 18.2114 |  265 |    3618
    2   |     554 | 44.3149 |   76 |    3788
    3   |     711 | 37.8619 |  134 |    3766
    4   |    1268 | 46.0733 |  203 |    3690

    To:

        | colField | valField
    0   | pop_res  |     571
    1   | ind2     | 35.0975
    2   | ind3     |     123
    3   | id_unit  |    3768
    4   | pop_res  |     938
    5   | ind2     | 18.2114
    6   | ind3     |     265
    7   | id_unit  |    3618
    """

    from gasp.pyt.df.to import merge_df

    newDfs = []
    cols = list(inDf.columns.values)

    if colFid and colFid in cols:
        cols.remove(colFid)

    for col in cols:
        ndf = inDf.copy()

        ndf.drop([c for c in cols if c != col], axis=1, inplace=True)

        ndf[colField] = col

        ndf.rename(columns={col : valField}, inplace=True)

        newDfs.append(ndf)

    res = merge_df(newDfs)

    return res
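
# Usage sketch for dfcolstorows, mirroring the docstring example above. The
# column names and sample values are hypothetical; it assumes pandas and a
# working gasp install (for merge_df).
def _demo_dfcolstorows():
    import pandas as pd

    df = pd.DataFrame({
        'pop_res' : [571, 938], 'ind2'    : [35.0975, 18.2114],
        'ind3'    : [123, 265], 'id_unit' : [3768, 3618]
    })

    # Every column becomes a (colField, valField) pair; pass
    # colFid='id_unit' instead to keep id_unit as a regular column
    return dfcolstorows(df, 'colField', 'valField')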
def same_attr_to_shp(inShps, interestCol, outFolder, basename="data_",
                     resultDict=None):
    """
    For several SHPs with the same field, this program will list all values
    in that field and will create a new SHP for each value, with the
    respective geometries, regardless of the origin SHP.
    """

    import os
    from gasp.pyt       import obj_to_lst
    from gasp.gt.fmshp  import shp_to_obj
    from gasp.pyt.df.to import merge_df
    from gasp.gt.toshp  import df_to_shp

    # Accept a single SHP path or a list of paths
    inShps = obj_to_lst(inShps)

    EXT = os.path.splitext(inShps[0])[1]

    shpDfs = [shp_to_obj(shp) for shp in inShps]

    DF = merge_df(shpDfs, ignIndex=True)
    #DF.dropna(axis=0, how='any', inplace=True)

    uniqueVal = DF[interestCol].unique()

    nShps = [] if not resultDict else {}
    for val in uniqueVal:
        ndf = DF[DF[interestCol] == val]

        KEY = str(val).split('.')[0] if '.' in str(val) else str(val)

        nshp = df_to_shp(ndf, os.path.join(
            outFolder, '{}{}{}'.format(basename, KEY, EXT)
        ))

        if not resultDict:
            nShps.append(nshp)
        else:
            nShps[KEY] = nshp

    return nShps
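
# Usage sketch for same_attr_to_shp. The paths and the 'landuse' column are
# hypothetical; it assumes the input shapefiles all share that column.
def _demo_same_attr_to_shp():
    shps = ['/tmp/parcels_a.shp', '/tmp/parcels_b.shp']

    # One shapefile per distinct 'landuse' value, e.g. /tmp/out/lu_forest.shp;
    # with resultDict=True the result maps each value to its new shapefile
    return same_attr_to_shp(shps, 'landuse', '/tmp/out',
                            basename='lu_', resultDict=True)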
def lst_prod_by_cell_and_year(shp, id_col, year, outshp,
                              platform="Sentinel-2",
                              processingl='Level-2A', epsg=32629):
    """
    Get a list of images:
    * one for each cell in the shp grid;
    * one for each month of one year - the chosen image will be the one
    with the least cloud cover;

    total_images = number_of_cells * number_of_months
    """

    from gasp.gt.fmshp  import shp_to_obj
    from gasp.pyt.df.to import merge_df
    from gasp.gt.toshp  import df_to_shp
    from gasp.g.to      import df_to_geodf
    # NOTE: the import path for lst_prod is assumed; point it to wherever
    # lst_prod is defined in your gasp build
    from gasp.sde.dsn.sat import lst_prod

    # Last day of each month (leap years are ignored)
    months = {
        '01' : '31', '02' : '28', '03' : '31',
        '04' : '30', '05' : '31', '06' : '30',
        '07' : '31', '08' : '31', '09' : '30',
        '10' : '31', '11' : '30', '12' : '31'
    }

    # Open SHP
    grid = shp_to_obj(shp, srs_to=4326)

    def get_grid_id(row):
        # Extract the MGRS tile code from the product title
        # (e.g. ..._T29TNE_... -> '29TNE')
        row['cellid'] = row.title.split('_')[5][1:]

        return row

    # Search for images
    dfs = []
    for idx, cell in grid.iterrows():
        for k in months:
            start = "{}{}01".format(str(year), k)
            end   = "{}{}{}".format(str(year), k, months[k])

            # In January and February 2018 the products were published
            # with the pilot processing level 'Level-2Ap'
            if year == 2018 and processingl == 'Level-2A':
                if k == '01' or k == '02':
                    plevel = 'Level-2Ap'
                else:
                    plevel = processingl
            else:
                plevel = processingl

            prod = lst_prod(
                cell.geometry.wkt, start, end,
                platname=platform, procLevel=plevel
            )

            if not prod.shape[0]:
                continue

            # Get area in km2
            prod = prod.to_crs(epsg=epsg)
            prod['areav'] = prod.geometry.area / 1000000

            # We want only images with more than 70% of data
            # (>= 7000 km2 of a full 100x100 km tile)
            prod = prod[prod.areav >= 7000]

            # Get Cell ID
            prod = prod.apply(lambda x: get_grid_id(x), axis=1)
            # Filter by Cell ID
            prod = prod[prod.cellid == cell[id_col]]

            # Sort by cloud cover and date
            prod = prod.sort_values(
                ['cloudcoverpercentage', 'ingestiondate'],
                ascending=[True, True]
            )

            # Get only the image with less cloud cover
            prod = prod.head(1)

            dfs.append(prod)

    fdf = merge_df(dfs)
    fdf = df_to_geodf(fdf, 'geometry', epsg)

    df_to_shp(fdf, outshp)

    return outshp
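
# Usage sketch for lst_prod_by_cell_and_year. The grid shapefile, its ID
# column and the output path are hypothetical; it assumes the credentials
# required by lst_prod are configured.
def _demo_lst_prod_by_cell_and_year():
    # One Sentinel-2 Level-2A image per cell and per month of 2019,
    # picking the least cloudy product in each case
    return lst_prod_by_cell_and_year(
        '/tmp/s2_grid.shp', 'cellid', 2019, '/tmp/s2_selection.shp',
        platform="Sentinel-2", processingl='Level-2A', epsg=32629
    )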
def dsnsearch_by_cell(GRID_PNT, EPSG, RADIUS, DATA_SOURCE, db, OUTPUT_TABLE):
    """
    Search for data in DSN and other platforms by cell
    """

    import time

    from gasp.gt.fmshp          import shp_to_obj
    from gasp.sql.db            import create_db
    from gasp.sde.dsn.fb.places import places_by_query
    from gasp.g.prj             import df_prj
    from gasp.pyt.df.to         import merge_df
    from gasp.gt.toshp.db       import dbtbl_to_shp
    from gasp.sql.to            import q_to_ntbl, df_to_db

    # Open GRID SHP
    GRID_DF = shp_to_obj(GRID_PNT)
    GRID_DF = df_prj(GRID_DF, 4326) if EPSG != 4326 else GRID_DF

    GRID_DF["lng"]     = GRID_DF.geometry.x.astype(float)
    GRID_DF["lat"]     = GRID_DF.geometry.y.astype(float)
    GRID_DF["grid_id"] = GRID_DF.index

    # GET DATA
    RESULTS = []

    def get_data(row, datasrc):
        if datasrc == 'facebook':
            d = places_by_query(
                {'x' : row.lng, 'y' : row.lat, 'r' : RADIUS}, 4326,
                keyword=None, epsgOut=EPSG, _limit='100',
                onlySearchAreaContained=None
            )
        else:
            raise ValueError(
                '{} as datasource is not a valid value'.format(datasrc))

        if type(d) == int:
            return

        d['grid_id'] = row.grid_id

        RESULTS.append(d)

        time.sleep(5)

    GRID_DF.apply(lambda x: get_data(x, DATA_SOURCE), axis=1)

    RT = merge_df(RESULTS)

    # Create DB
    create_db(db, overwrite=True, api='psql')

    # Send Data to PostgreSQL
    df_to_db(
        db, RT, "{}_data".format(DATA_SOURCE), EPSG, "POINT",
        colGeom='geometry' if 'geometry' in RT.columns.values else 'geom'
    )

    COLS = [
        x for x in RT.columns.values if x != "geometry" and \
        x != 'geom' and x != "grid_id"
    ] + ["geom"]

    GRP_BY_TBL = q_to_ntbl(db, "{}_grpby".format(DATA_SOURCE), (
        "SELECT {cols}, CAST(array_agg(grid_id) AS text) AS grid_id "
        "FROM {dtsrc}_data GROUP BY {cols}"
    ).format(cols=", ".join(COLS), dtsrc=DATA_SOURCE), api='psql')

    dbtbl_to_shp(
        db, GRP_BY_TBL, "geom", OUTPUT_TABLE, api="psql", epsg=EPSG
    )

    return OUTPUT_TABLE
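
# Usage sketch for dsnsearch_by_cell. Paths, SRS, radius and database name
# are hypothetical; it assumes a reachable PostgreSQL/PostGIS server and
# Facebook API credentials configured for places_by_query.
def _demo_dsnsearch_by_cell():
    # Query Facebook places within 500 m of each grid point and export the
    # grouped results (EPSG:3763 coordinates) to a shapefile
    return dsnsearch_by_cell(
        '/tmp/grid_pnt.shp', 3763, 500, 'facebook', 'dsn_search',
        '/tmp/fb_places.shp'
    )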
def df_cols_to_rows(inDf, TO_COLS, col_old_col_name, key_old_col_name,
                    col_mantain):
    """
    Dataframe like:

      | pop_res |    ind2 | ind3 |    ind5 | id_unit | pop_res_int | ind2_int | ind3_int | ind5_int
    0 |     571 | 35.0975 |  123 |  97.373 |    3768 |           2 |        6 |        2 |        7
    1 |     938 | 18.2114 |  265 | 93.4968 |    3618 |           3 |        1 |        5 |        4
    2 |     554 | 44.3149 |   76 | 97.4074 |    3788 |           1 |        7 |        1 |        7
    3 |     711 | 37.8619 |  134 | 96.1429 |    3766 |           2 |        6 |        3 |        6
    4 |    1268 | 46.0733 |  203 | 90.9385 |    3690 |           5 |        7 |        4 |        3

    To:

      | id_unit | id_indicator |   value | cls
    2 |    3768 | pop_res      |     571 |   2
    3 |    3768 | ind2         | 35.0975 |   6
    4 |    3768 | ind3         |     123 |   2
    5 |    3768 | ind5         |  97.373 |   7
    6 |    3618 | pop_res      |     938 |   3
    7 |    3618 | ind2         | 18.2114 |   1
    8 |    3618 | ind3         |     265 |   5
    9 |    3618 | ind5         | 93.4968 |   4
    ...

    Using as parameters:
    data_cols = ['pop_res', 'ind2', 'ind3', 'ind5']
    col_mantain = 'id_unit'
    TO_COLS = {
        # Dict values should have the same length
        'value' : data_cols,
        'cls'   : [i + '_int' for i in data_cols]
    }
    col_old_col_name = 'id_indicator'
    key_old_col_name = 'value'
    """

    from gasp.pyt       import obj_to_lst
    from gasp.pyt.df.to import merge_df

    col_mantain = obj_to_lst(col_mantain)
    newCols = list(TO_COLS.keys())

    newDfs = []
    for i in range(len(TO_COLS[newCols[0]])):
        ndf = inDf.copy(deep=True)

        # Keep the maintained columns plus the i-th column of each group
        DROP_COLS = []
        COLS_MANT = col_mantain.copy()

        for K in TO_COLS:
            COLS_MANT.append(TO_COLS[K][i])

        for col in ndf.columns.values:
            if col not in COLS_MANT:
                DROP_COLS.append(col)

        ndf.drop(DROP_COLS, axis=1, inplace=True)
        ndf.rename(columns={TO_COLS[k][i] : k for k in TO_COLS}, inplace=True)

        ndf[col_old_col_name] = TO_COLS[key_old_col_name][i]

        newDfs.append(ndf)

    outDf = merge_df(newDfs)

    return outDf
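
# Usage sketch for df_cols_to_rows, mirroring the docstring example. The
# sample dataframe is hypothetical; it assumes pandas and a working gasp
# install (for obj_to_lst and merge_df).
def _demo_df_cols_to_rows():
    import pandas as pd

    data_cols = ['pop_res', 'ind2']
    df = pd.DataFrame({
        'pop_res'     : [571, 938], 'ind2'     : [35.0975, 18.2114],
        'id_unit'     : [3768, 3618],
        'pop_res_int' : [2, 3],     'ind2_int' : [6, 1]
    })

    # Pair each value column with its *_int class column; every input row
    # yields one output row per entry in data_cols
    return df_cols_to_rows(
        df, {'value' : data_cols, 'cls' : [c + '_int' for c in data_cols]},
        'id_indicator', 'value', 'id_unit'
    )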