示例#1
0
def get_from_oracle_sql():

    params = get_config_params('../config/database.ini',
                               section='oracle_general')
    parusr = get_config_params('../config/database.ini',
                               section='oracle_redes')
    dsn_tns = cxO.makedsn(**params)
    conn = cxO.connect(parusr['user'], parusr['password'], dsn_tns)

    sql = """SELECT *
    FROM AGREGADOS_MENSUALES
    WHERE ESTACION LIKE '2401%'
    AND VARIABLE LIKE '%PRECIP%TOTAL%'
    """

    results = get_by_query_oracle(conn, sql)
    name_cols = [
        'Station', 'Date', 'Timestamp', 'Flags', 'Observations', 'Variable',
        'Value'
    ]

    if len(results) > 0:
        df_results = pd.DataFrame(results)
        df_results.columns = name_cols
        # df_results.set_index('Date', inplace=True)

        df_results.to_pickle('../results/oracle/redes_2401.pkl')
示例#2
0
def get_data_stations():
    params = get_config_params('../config/database.ini',
                               section='oracle_general')
    parusr = get_config_params('../config/database.ini', section='oracle_sshm')
    dsn_tns = cxO.makedsn(**params)
    conn = cxO.connect(parusr['user'], parusr['password'], dsn_tns)

    stations = [2171, 2181, 1288, 2098, 3519, 3536, 3544, 7427]
    xls_data = pd.ExcelWriter('../results/consulta.xlsx')

    for station in stations:
        sql = """
        SELECT CSMT_FECHA, CSMT_PREC
        FROM SHMT_CUASIRREAL_METEOROLOGIA
        WHERE CSMT_ESTACION_ID = {}
        ORDER BY CSMT_FECHA
        """.format(station)

        results = get_by_query_oracle(conn=conn, sql=sql, header=True)
        df_station = pd.DataFrame(results[0])
        df_station.columns = ['Fecha', 'Precipitacion']
        df_station.set_index('Fecha', inplace=True)
        df_station.to_excel(xls_data, str(station))

    xls_data.save()
示例#3
0
def get_df_data_oracle(df_catalog,
                       backward_period='1D',
                       sensor='0240',
                       current_time=None):
    """
    Returns a dataframe from a query to oracle db based on a zone catalog.

    :param df_catalog:
    :param backward_period:
    :param sensor:
    :param current_time:
    :return:
    """
    start_time, end_time = date_start_end(backward_period=backward_period,
                                          current_time=current_time)

    if sensor != '0070':
        start_time = start_time - pd.DateOffset(days=1)
        end_time = end_time - pd.DateOffset(days=1)

    params = get_config_params('../config/database.ini',
                               section='oracle_general')
    parusr = get_config_params('../config/database.ini', section='oracle_sshm')
    dsn_tns = cxO.makedsn(**params)
    conn = cxO.connect(parusr['user'], parusr['password'], dsn_tns)
    query_table = 'SHMT_CUASIRREAL_METEOROLOGIA'
    name_cols = ['Station', 'Value', 'Count', 'From', 'To']

    sql = query_sentence_oracle(table=query_table,
                                sensor=sensor,
                                backward_period=backward_period,
                                current_time=current_time)

    results = get_by_query_oracle(conn, sql)
    conn.close()

    if len(results) > 0:
        df_results = pd.DataFrame(results)
        df_results.columns = name_cols
        df_results.set_index('Station', inplace=True)
        stations = df_results.index.intersection(df_catalog.index)
        df_results = df_results.loc[stations]
        df_results.loc[stations, 'Name'] = df_catalog.loc[stations, 'nombre']
        df_results.loc[stations, 'Freq'] = df_catalog.loc[stations,
                                                          'frecuencia']
        df_results['Total'] = int((end_time - start_time) / pd.Timedelta('1D'))
        df_results['Gaps'] = 100. * df_results['Count'] / df_results['Total']
        df_results.loc[stations, 'lng'] = df_catalog.loc[stations, 'lng']
        df_results.loc[stations, 'lat'] = df_catalog.loc[stations, 'lat']
        df_results.loc[stations, 'Elevation'] = df_catalog.loc[stations,
                                                               'elevacion']
        df_results.loc[stations, 'AH'] = df_catalog.loc[stations, 'ah']
        df_results.sort_values(['Value'], ascending=False, inplace=True)

        return df_results

    else:
        return None
示例#4
0
def summary_sensors_stations():
    """
    Makes a summary of stations and sensors in order to identify what sensors are
    recording in each station.

    :param table_data:
    :param catalog:
    :type table_data: str
    :type catalog: str
    :return:
    """
    params = get_config_params('../config/database.ini',
                               section='oracle_general')
    parusr = get_config_params('../config/database.ini', section='oracle_sshm')
    dsn_tns = cxO.makedsn(**params)
    conn = cxO.connect(parusr['user'], parusr['password'], dsn_tns)

    tables = {
        'HYDRO': 'SHMT_CUASIRREAL_HIDROLOGIA',
        'METEO': 'SHMT_CUASIRREAL_METEOROLOGIA'
    }
    # cols_catalog = ['ID_ES', 'COD_CATALOGO_ES', 'NOMBRE_ES', 'NOMBRE_FGDA',
    #                 'GRADOS_LATITUD', 'MINUTOS_LATITUD', 'SEGUNDOS_LATITUD', 'DIRECCION_LATITUD',
    #                 'GRADOS_LONGITUD', 'MINUTOS_LONGITUD', 'SEGUNDOS_LONGITUD', 'DIRECCION_LONGITUD']

    xls_summary = pd.ExcelWriter('../results/sshm_summary.xlsx')

    for table in tables:
        sql_catalog = sql_catalog_oracle(table=tables[table])
        results, cols_catalog = get_by_query_oracle(conn=conn,
                                                    sql=sql_catalog,
                                                    header=True)

        df_catalog = pd.DataFrame(results)
        df_catalog.drop(0, axis=1, inplace=True)
        df_catalog.columns = cols_catalog[1:]
        df_catalog.set_index('ID_ES', inplace=True)
        df_catalog.sort_index(inplace=True)
        df_catalog.to_excel(xls_summary, 'CAT_{}'.format(table))

        df_summary = pd.DataFrame(index=df_catalog.index,
                                  columns=dt_sensors_oracle)

        for sensor in dt_sensors_oracle:
            sql_sensors = sql_summary_sensor(tables[table], sensor)
            results_sensor = get_by_query_oracle(conn, sql_sensors)

            if results_sensor is not None:
                list_stations = [i[0] for i in results_sensor]
                df_summary.loc[list_stations, sensor] = 1

            else:
                df_summary.drop(sensor, axis=1, inplace=True)

        df_summary.to_excel(xls_summary, table)

    xls_summary.save()
示例#5
0
def download_all_data(sensor='0240', table='weather_events', multiprocess=False):
    """
    Downloads all data from a table through a query based on sensor and
    stations from presto_data_summary.xlsx file.
    :param sensor:
    :param table:
    :param multiprocess:
    :return:
    """

    xls_stations = pd.ExcelFile('../data/presto_cassandra_summary.xlsx')
    df_summary = xls_stations.parse('Summary', index_col='Station')
    sr_0240 = df_summary[sensor]
    stations_0240 = sr_0240.dropna().index

    params = get_config_params('../config/database.ini', section='prestodb_cassandra')
    conn = prestodb.dbapi.Connection(**params)

    partial_fn = partial(get_sensor_station_data, sensor=sensor, conn=conn, table=table)

    if multiprocess:
        pool = Pool()
        pool.map(partial_fn, stations_0240)
        pool.close()

    else:
        map(partial_fn, stations_0240[:2])
示例#6
0
def download_from_stations(catalog='cassandra'):
    df_data = pd.read_excel('../data/extract_data.xlsx', sheetname='Sibate', index_col='COD_INTERNO')
    stations = df_data.index

    section = 'prestodb_{}'.format(catalog)
    params = get_config_params('../config/database.ini', section=section)
    conn = prestodb.dbapi.Connection(**params)

    for station in stations:
        get_sensor_station_data(station, sensor='0240', conn=conn)
示例#7
0
def summary_sensors_stations(catalog='cassandra', table_data='last_month_observations'):
    """
    Makes a summary of stations and sensors in order to identify what sensors are
    recording in each station.

    :param table_data:
    :param catalog:
    :type table_data: str
    :type catalog: str
    :return:
    """
    section = 'prestodb_{}'.format(catalog)
    params = get_config_params('../config/database.ini', section=section)
    conn = prestodb.dbapi.Connection(**params)

    # query_sensors = get_all_data(conn, table='sensors')
    # df_sensors = pd.DataFrame(data=query_sensors[1:], columns=query_sensors[0])
    # df_sensors.set_index('sensorid', inplace=True)
    # sensors = df_sensors.index

    sensors = ['0240', '0068', '0069', '0070', '0230', '0255', '0103', '0104', '0027', '0239']

    if catalog == 'postgresql':
        sensors = [dt_sensors_postgres[i] for i in sensors]
        table_stations = 'configuration.stations'
        id_stations = 'id_stz'

    else:
        table_stations = 'raw.stations'
        id_stations = 'stationid'

    query_stations = get_all_data(conn, table=table_stations)
    df_stations = pd.DataFrame(data=query_stations[1:], columns=query_stations[0])
    df_stations.set_index(id_stations, inplace=True)
    stations = df_stations.index

    df_stations_sensors = pd.DataFrame(index=pd.Index(stations, name='Station'), columns=sensors)

    for sensor in sorted(sensors):
        print sensor
        results = get_stations_by_sensor(conn, table=table_data, sensor=sensor)
        list_stations = [i[0] for i in results]
        idx_miss = pd.Index(list_stations).difference(df_stations_sensors.index)

        for sta_miss in idx_miss:
            df_stations_sensors.loc[sta_miss] = float('NaN')

        df_stations_sensors.loc[list_stations, sensor] = 1

    xls_output = pd.ExcelWriter('../data/presto_{}_summary.xlsx'.format(catalog))
    # df_sensors.to_excel(xls_output, 'Sensors')
    df_stations.to_excel(xls_output, 'Stations')
    df_stations_sensors.to_excel(xls_output, 'Summary')
    xls_output.save()
示例#8
0
def get_elevations():
    """
    Returns elevations for a set of stations based on a DEM.

    :return:
    """
    params = get_config_params('../config/database.ini',
                               section='prestodb_cassandra')
    conn = prestodb.dbapi.Connection(**params)
    query_table = 'last_month_observations'
    sensor = '0240'

    stations = get_stations_by_sensor(conn=conn,
                                      table=query_table,
                                      sensor=sensor)
    stations = [item for sublist in stations for item in sublist]
    stations.sort()

    df_summary = pd.DataFrame(index=stations,
                              columns=['Freq', 'Start', 'End', 'Count'])
    df_summary.index.name = 'Station'

    for station in stations:
        # print(station)
        results = get_sensor_station_data(station=station,
                                          sensor=sensor,
                                          conn=conn,
                                          table=query_table,
                                          write_csv=False)
        df_station = pd.DataFrame(results)
        sr_station = format_df_data(df_input=df_station, station=station)
        freq_min = calc_freq(sr_station)
        start = sr_station.index.min()
        end = sr_station.index.max()
        count = sr_station.dropna().count()
        df_summary.loc[station] = [freq_min, start, end, count]

    df_summary.to_excel('../results/elevations_cassandra.xlsx')
    return df_summary
示例#9
0
def get_data(station, sensor, table='last_month_observations'):
    """
    Returns the whole data from a given table for the sensor and station given.

    :param station:
    :type station: str
    :param sensor:
    :type sensor: str
    :param table:
    :type table: str
    :return:
    """
    catalog = 'cassandra'
    section = 'prestodb_{}'.format(catalog)
    params = get_config_params('../config/database.ini', section=section)
    conn = prestodb.dbapi.Connection(**params)

    sql = """
    SELECT to_char(event_time + interval '5' hour,'yyyy-mm-dd hh24:mi') event_time, event_value
    FROM {0}
    WHERE station = '{1}'
    AND sensor = '{2}'
    ORDER BY event_time
    """.format(table, station, sensor)

    results = get_by_query_presto(conn=conn, sql=sql)
    name_cols = ['Time', station]

    if len(results) > 0:
        df_results = pd.DataFrame(results)
        df_results.columns = name_cols
        df_results['Time'] = pd.to_datetime(df_results['Time'],
                                            format='%Y-%m-%d %H:%M')
        df_results.set_index('Time', inplace=True)

        df_results[station].to_pickle('../test/{}_{}_lm.pkl'.format(
            sensor, station))
示例#10
0
def get_df_data_presto(df_catalog,
                       backward_period='1D',
                       sensor='0240',
                       section='prestodb_postgresql',
                       table='recent_data',
                       current_time=None):
    """
    Returns a dataframe from a query to prestodb based on a zone catalog.

    It works for any database managed by presto.
    :param df_catalog:
    :param backward_period:
    :param sensor:
    :param section:
    :param table:
    :param current_time:
    :return:
    """
    start_time, end_time = date_start_end(backward_period=backward_period,
                                          current_time=current_time)
    stations = df_catalog.index
    params = get_config_params('../config/database.ini', section=section)
    catalog = params['catalog']
    schema = params['schema']
    conn = prestodb.dbapi.Connection(**params)
    name_cols = ['Station', 'Value', 'Count', 'From', 'To']
    dict_total = {}
    sensor_abb = dt_sensors_abb[sensor]

    for query_station in stations:
        # print query_station
        max_value = df_catalog.loc[
            query_station, 'max {}'.format(dt_sensors_abb[sensor]).lower()]

        if sensor == '0240':
            min_value = 0

        else:
            min_value = df_catalog.loc[
                query_station, 'min {}'.format(dt_sensors_abb[sensor]).lower()]

        if pd.notnull(max_value):
            name = df_catalog.loc[query_station, 'nombre']
            freq = df_catalog.loc[query_station,
                                  'frec_{}'.format(sensor_abb.lower())]
            longitude = df_catalog.loc[query_station, 'lng']
            latitude = df_catalog.loc[query_station, 'lat']
            total_data = (end_time - start_time) / pd.Timedelta(
                '{}M'.format(freq))
            # elevation = df_catalog.loc[query_station, 'elevacion']
            hydro_area = df_catalog.loc[query_station, 'ah']

            sql = query_sentence_presto(station=query_station,
                                        max_value=max_value,
                                        min_value=min_value,
                                        catalog=catalog,
                                        schema=schema,
                                        table=table,
                                        sensor=sensor,
                                        backward_period=backward_period,
                                        current_time=current_time)

            results = get_by_query_presto(conn, sql)
            conn.close()

            if len(results) > 0:
                df_results = pd.DataFrame(results)
                df_results.columns = name_cols
                df_results['Name'] = name
                df_results['Freq'] = freq
                df_results['Total'] = int(total_data)
                df_results['Gaps'] = 100. * df_results.loc[
                    0, 'Count'] / total_data
                df_results['lng'] = longitude
                df_results['lat'] = latitude
                # df_results['Elevation'] = elevation
                df_results['AH'] = hydro_area
                dict_total[query_station] = df_results

    df_total = pd.concat(dict_total)
    df_total.sort_values(['Value'], ascending=False, inplace=True)
    df_total.reset_index(inplace=True, drop=True)
    df_total.set_index(['Station'], inplace=True)
    return df_total