def get_from_oracle_sql(): params = get_config_params('../config/database.ini', section='oracle_general') parusr = get_config_params('../config/database.ini', section='oracle_redes') dsn_tns = cxO.makedsn(**params) conn = cxO.connect(parusr['user'], parusr['password'], dsn_tns) sql = """SELECT * FROM AGREGADOS_MENSUALES WHERE ESTACION LIKE '2401%' AND VARIABLE LIKE '%PRECIP%TOTAL%' """ results = get_by_query_oracle(conn, sql) name_cols = [ 'Station', 'Date', 'Timestamp', 'Flags', 'Observations', 'Variable', 'Value' ] if len(results) > 0: df_results = pd.DataFrame(results) df_results.columns = name_cols # df_results.set_index('Date', inplace=True) df_results.to_pickle('../results/oracle/redes_2401.pkl')
def get_data_stations(): params = get_config_params('../config/database.ini', section='oracle_general') parusr = get_config_params('../config/database.ini', section='oracle_sshm') dsn_tns = cxO.makedsn(**params) conn = cxO.connect(parusr['user'], parusr['password'], dsn_tns) stations = [2171, 2181, 1288, 2098, 3519, 3536, 3544, 7427] xls_data = pd.ExcelWriter('../results/consulta.xlsx') for station in stations: sql = """ SELECT CSMT_FECHA, CSMT_PREC FROM SHMT_CUASIRREAL_METEOROLOGIA WHERE CSMT_ESTACION_ID = {} ORDER BY CSMT_FECHA """.format(station) results = get_by_query_oracle(conn=conn, sql=sql, header=True) df_station = pd.DataFrame(results[0]) df_station.columns = ['Fecha', 'Precipitacion'] df_station.set_index('Fecha', inplace=True) df_station.to_excel(xls_data, str(station)) xls_data.save()
def get_df_data_oracle(df_catalog, backward_period='1D', sensor='0240', current_time=None): """ Returns a dataframe from a query to oracle db based on a zone catalog. :param df_catalog: :param backward_period: :param sensor: :param current_time: :return: """ start_time, end_time = date_start_end(backward_period=backward_period, current_time=current_time) if sensor != '0070': start_time = start_time - pd.DateOffset(days=1) end_time = end_time - pd.DateOffset(days=1) params = get_config_params('../config/database.ini', section='oracle_general') parusr = get_config_params('../config/database.ini', section='oracle_sshm') dsn_tns = cxO.makedsn(**params) conn = cxO.connect(parusr['user'], parusr['password'], dsn_tns) query_table = 'SHMT_CUASIRREAL_METEOROLOGIA' name_cols = ['Station', 'Value', 'Count', 'From', 'To'] sql = query_sentence_oracle(table=query_table, sensor=sensor, backward_period=backward_period, current_time=current_time) results = get_by_query_oracle(conn, sql) conn.close() if len(results) > 0: df_results = pd.DataFrame(results) df_results.columns = name_cols df_results.set_index('Station', inplace=True) stations = df_results.index.intersection(df_catalog.index) df_results = df_results.loc[stations] df_results.loc[stations, 'Name'] = df_catalog.loc[stations, 'nombre'] df_results.loc[stations, 'Freq'] = df_catalog.loc[stations, 'frecuencia'] df_results['Total'] = int((end_time - start_time) / pd.Timedelta('1D')) df_results['Gaps'] = 100. * df_results['Count'] / df_results['Total'] df_results.loc[stations, 'lng'] = df_catalog.loc[stations, 'lng'] df_results.loc[stations, 'lat'] = df_catalog.loc[stations, 'lat'] df_results.loc[stations, 'Elevation'] = df_catalog.loc[stations, 'elevacion'] df_results.loc[stations, 'AH'] = df_catalog.loc[stations, 'ah'] df_results.sort_values(['Value'], ascending=False, inplace=True) return df_results else: return None
def summary_sensors_stations(): """ Makes a summary of stations and sensors in order to identify what sensors are recording in each station. :param table_data: :param catalog: :type table_data: str :type catalog: str :return: """ params = get_config_params('../config/database.ini', section='oracle_general') parusr = get_config_params('../config/database.ini', section='oracle_sshm') dsn_tns = cxO.makedsn(**params) conn = cxO.connect(parusr['user'], parusr['password'], dsn_tns) tables = { 'HYDRO': 'SHMT_CUASIRREAL_HIDROLOGIA', 'METEO': 'SHMT_CUASIRREAL_METEOROLOGIA' } # cols_catalog = ['ID_ES', 'COD_CATALOGO_ES', 'NOMBRE_ES', 'NOMBRE_FGDA', # 'GRADOS_LATITUD', 'MINUTOS_LATITUD', 'SEGUNDOS_LATITUD', 'DIRECCION_LATITUD', # 'GRADOS_LONGITUD', 'MINUTOS_LONGITUD', 'SEGUNDOS_LONGITUD', 'DIRECCION_LONGITUD'] xls_summary = pd.ExcelWriter('../results/sshm_summary.xlsx') for table in tables: sql_catalog = sql_catalog_oracle(table=tables[table]) results, cols_catalog = get_by_query_oracle(conn=conn, sql=sql_catalog, header=True) df_catalog = pd.DataFrame(results) df_catalog.drop(0, axis=1, inplace=True) df_catalog.columns = cols_catalog[1:] df_catalog.set_index('ID_ES', inplace=True) df_catalog.sort_index(inplace=True) df_catalog.to_excel(xls_summary, 'CAT_{}'.format(table)) df_summary = pd.DataFrame(index=df_catalog.index, columns=dt_sensors_oracle) for sensor in dt_sensors_oracle: sql_sensors = sql_summary_sensor(tables[table], sensor) results_sensor = get_by_query_oracle(conn, sql_sensors) if results_sensor is not None: list_stations = [i[0] for i in results_sensor] df_summary.loc[list_stations, sensor] = 1 else: df_summary.drop(sensor, axis=1, inplace=True) df_summary.to_excel(xls_summary, table) xls_summary.save()
def download_all_data(sensor='0240', table='weather_events', multiprocess=False): """ Downloads all data from a table through a query based on sensor and stations from presto_data_summary.xlsx file. :param sensor: :param table: :param multiprocess: :return: """ xls_stations = pd.ExcelFile('../data/presto_cassandra_summary.xlsx') df_summary = xls_stations.parse('Summary', index_col='Station') sr_0240 = df_summary[sensor] stations_0240 = sr_0240.dropna().index params = get_config_params('../config/database.ini', section='prestodb_cassandra') conn = prestodb.dbapi.Connection(**params) partial_fn = partial(get_sensor_station_data, sensor=sensor, conn=conn, table=table) if multiprocess: pool = Pool() pool.map(partial_fn, stations_0240) pool.close() else: map(partial_fn, stations_0240[:2])
def download_from_stations(catalog='cassandra'): df_data = pd.read_excel('../data/extract_data.xlsx', sheetname='Sibate', index_col='COD_INTERNO') stations = df_data.index section = 'prestodb_{}'.format(catalog) params = get_config_params('../config/database.ini', section=section) conn = prestodb.dbapi.Connection(**params) for station in stations: get_sensor_station_data(station, sensor='0240', conn=conn)
def summary_sensors_stations(catalog='cassandra', table_data='last_month_observations'): """ Makes a summary of stations and sensors in order to identify what sensors are recording in each station. :param table_data: :param catalog: :type table_data: str :type catalog: str :return: """ section = 'prestodb_{}'.format(catalog) params = get_config_params('../config/database.ini', section=section) conn = prestodb.dbapi.Connection(**params) # query_sensors = get_all_data(conn, table='sensors') # df_sensors = pd.DataFrame(data=query_sensors[1:], columns=query_sensors[0]) # df_sensors.set_index('sensorid', inplace=True) # sensors = df_sensors.index sensors = ['0240', '0068', '0069', '0070', '0230', '0255', '0103', '0104', '0027', '0239'] if catalog == 'postgresql': sensors = [dt_sensors_postgres[i] for i in sensors] table_stations = 'configuration.stations' id_stations = 'id_stz' else: table_stations = 'raw.stations' id_stations = 'stationid' query_stations = get_all_data(conn, table=table_stations) df_stations = pd.DataFrame(data=query_stations[1:], columns=query_stations[0]) df_stations.set_index(id_stations, inplace=True) stations = df_stations.index df_stations_sensors = pd.DataFrame(index=pd.Index(stations, name='Station'), columns=sensors) for sensor in sorted(sensors): print sensor results = get_stations_by_sensor(conn, table=table_data, sensor=sensor) list_stations = [i[0] for i in results] idx_miss = pd.Index(list_stations).difference(df_stations_sensors.index) for sta_miss in idx_miss: df_stations_sensors.loc[sta_miss] = float('NaN') df_stations_sensors.loc[list_stations, sensor] = 1 xls_output = pd.ExcelWriter('../data/presto_{}_summary.xlsx'.format(catalog)) # df_sensors.to_excel(xls_output, 'Sensors') df_stations.to_excel(xls_output, 'Stations') df_stations_sensors.to_excel(xls_output, 'Summary') xls_output.save()
def get_elevations(): """ Returns elevations for a set of stations based on a DEM. :return: """ params = get_config_params('../config/database.ini', section='prestodb_cassandra') conn = prestodb.dbapi.Connection(**params) query_table = 'last_month_observations' sensor = '0240' stations = get_stations_by_sensor(conn=conn, table=query_table, sensor=sensor) stations = [item for sublist in stations for item in sublist] stations.sort() df_summary = pd.DataFrame(index=stations, columns=['Freq', 'Start', 'End', 'Count']) df_summary.index.name = 'Station' for station in stations: # print(station) results = get_sensor_station_data(station=station, sensor=sensor, conn=conn, table=query_table, write_csv=False) df_station = pd.DataFrame(results) sr_station = format_df_data(df_input=df_station, station=station) freq_min = calc_freq(sr_station) start = sr_station.index.min() end = sr_station.index.max() count = sr_station.dropna().count() df_summary.loc[station] = [freq_min, start, end, count] df_summary.to_excel('../results/elevations_cassandra.xlsx') return df_summary
def get_data(station, sensor, table='last_month_observations'): """ Returns the whole data from a given table for the sensor and station given. :param station: :type station: str :param sensor: :type sensor: str :param table: :type table: str :return: """ catalog = 'cassandra' section = 'prestodb_{}'.format(catalog) params = get_config_params('../config/database.ini', section=section) conn = prestodb.dbapi.Connection(**params) sql = """ SELECT to_char(event_time + interval '5' hour,'yyyy-mm-dd hh24:mi') event_time, event_value FROM {0} WHERE station = '{1}' AND sensor = '{2}' ORDER BY event_time """.format(table, station, sensor) results = get_by_query_presto(conn=conn, sql=sql) name_cols = ['Time', station] if len(results) > 0: df_results = pd.DataFrame(results) df_results.columns = name_cols df_results['Time'] = pd.to_datetime(df_results['Time'], format='%Y-%m-%d %H:%M') df_results.set_index('Time', inplace=True) df_results[station].to_pickle('../test/{}_{}_lm.pkl'.format( sensor, station))
def get_df_data_presto(df_catalog, backward_period='1D', sensor='0240', section='prestodb_postgresql', table='recent_data', current_time=None): """ Returns a dataframe from a query to prestodb based on a zone catalog. It works for any database managed by presto. :param df_catalog: :param backward_period: :param sensor: :param section: :param table: :param current_time: :return: """ start_time, end_time = date_start_end(backward_period=backward_period, current_time=current_time) stations = df_catalog.index params = get_config_params('../config/database.ini', section=section) catalog = params['catalog'] schema = params['schema'] conn = prestodb.dbapi.Connection(**params) name_cols = ['Station', 'Value', 'Count', 'From', 'To'] dict_total = {} sensor_abb = dt_sensors_abb[sensor] for query_station in stations: # print query_station max_value = df_catalog.loc[ query_station, 'max {}'.format(dt_sensors_abb[sensor]).lower()] if sensor == '0240': min_value = 0 else: min_value = df_catalog.loc[ query_station, 'min {}'.format(dt_sensors_abb[sensor]).lower()] if pd.notnull(max_value): name = df_catalog.loc[query_station, 'nombre'] freq = df_catalog.loc[query_station, 'frec_{}'.format(sensor_abb.lower())] longitude = df_catalog.loc[query_station, 'lng'] latitude = df_catalog.loc[query_station, 'lat'] total_data = (end_time - start_time) / pd.Timedelta( '{}M'.format(freq)) # elevation = df_catalog.loc[query_station, 'elevacion'] hydro_area = df_catalog.loc[query_station, 'ah'] sql = query_sentence_presto(station=query_station, max_value=max_value, min_value=min_value, catalog=catalog, schema=schema, table=table, sensor=sensor, backward_period=backward_period, current_time=current_time) results = get_by_query_presto(conn, sql) conn.close() if len(results) > 0: df_results = pd.DataFrame(results) df_results.columns = name_cols df_results['Name'] = name df_results['Freq'] = freq df_results['Total'] = int(total_data) df_results['Gaps'] = 100. * df_results.loc[ 0, 'Count'] / total_data df_results['lng'] = longitude df_results['lat'] = latitude # df_results['Elevation'] = elevation df_results['AH'] = hydro_area dict_total[query_station] = df_results df_total = pd.concat(dict_total) df_total.sort_values(['Value'], ascending=False, inplace=True) df_total.reset_index(inplace=True, drop=True) df_total.set_index(['Station'], inplace=True) return df_total