def download__update_chart_by_parameters(signal, start_date, end_date, limit):
    if signal is None:
        raise PreventUpdate

    logging.info(f'download__update_chart_by_parameters - start_date: {start_date}; '
                 f'end_date: {end_date}; limit: {limit}')

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date or limit is None,
    # then the callback returns an empty object
    if start_date > end_date or limit is None:
        return {'data': [], 'layout': {}, 'frames': []}

    sub_df = __create_sub_df_based_on_parameters(
        # get data from cache
        cache.get('download:df_download'),
        start_date, end_date, limit
    )

    # filter the previous dataframe to get the number of downloaded scenes by date only
    sub_df = sub_df.groupby(['date'])['number'].sum().to_frame('number').reset_index()
    sub_df = sub_df.sort_values(['date'], ascending=True)

    return __get_figure_of_number_of_downloaded_scenes_time_series(
        sub_df, title='Time Series: Number of Downloaded Scenes by Date'
    )
def download__update_information_table(signal, start_date, end_date):
    if signal is None:
        raise PreventUpdate

    logging.info(f'download__update_information_table - start_date: {start_date}; '
                 f'end_date: {end_date}')

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date,
    # then the callback returns an empty object
    if start_date > end_date:
        return []

    sub_df = __create_sub_df_based_on_parameters(
        # get data from cache
        cache.get('download:df_base'),
        start_date, end_date
    )

    # update `Number of downloaded scenes (by range)` data
    df_information.loc[2, 'value'] = len(sub_df.index)
    # update `Number of downloaded assets (by range)` data
    df_information.loc[3, 'value'] = sub_df['nofbi'].sum()

    # return the new information table
    return df_information.to_dict('records')
def scene__update_information_table(signal, start_date, end_date):
    if signal is None:
        raise PreventUpdate

    logging.info(
        f'scene__update_information_table - start_date: {start_date}; end_date: {end_date}'
    )

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date, then the callback returns an empty object
    if start_date > end_date:
        return []

    sub_df = __create_sub_df_based_on_parameters(
        # get data from cache
        cache.get('scene:df_base'),
        start_date, end_date
    )

    # update `Number of items (by range)` data
    df_information.loc[3, 'value'] = len(sub_df.index)
    # update `Number of distinct scenes (by range)` data
    df_information.loc[4, 'value'] = len(sub_df.item_id.unique())

    # return the new information table
    return df_information.to_dict('records')
def execute(self, query, is_transaction=False):
    logging.info('MySQLConnection.execute()')

    try:
        logging.info('MySQLConnection.execute() - query: %s\n', query)

        self.try_to_connect()

        if not is_transaction:
            return read_sql(query, con=self.engine)

        # if it is a transaction, run it inside an engine-managed transaction block
        with self.engine.begin() as connection:
            connection.execute(query)

    except SQLAlchemyError as error:
        # self.rollback()
        error_message = 'An error occurred during query execution'

        logging.error('MySQLConnection.execute() - error.code: %s', error.code)
        logging.error('MySQLConnection.execute() - error.args: %s', error.args)
        logging.error('MySQLConnection.execute() - %s: %s\n', error_message, error)

        error_message += ': ' + str(error.args)

        raise Exception(error_message)

    # the finally block is always executed (whether the try block succeeds or raises)
    finally:
        self.close()
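# A minimal usage sketch of `execute` (the query strings below are hypothetical), assuming
# `read_sql` is pandas.read_sql: a plain SELECT returns a DataFrame, while `is_transaction=True`
# runs the statement inside an engine-managed transaction and returns nothing.
#
#   db = MySQLConnection()
#   df_downloads = db.execute('SELECT * FROM Download;')              # returns a DataFrame
#   db.execute('DELETE FROM Download WHERE sceneId IS NULL;',         # runs as a transaction
#              is_transaction=True)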
def scene__update_map_by_parameters(signal, start_date, end_date):
    if signal is None:
        raise PreventUpdate

    logging.info(
        f'scene__update_map_by_parameters - start_date: {start_date}; end_date: {end_date}'
    )

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date, then the callback returns an empty object
    if start_date > end_date:
        return dicts_to_geojson([])

    # convert the dates from datetime back to str in order to pass the xaxis range to build the figure
    xaxis_range = [
        start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')
    ]

    # get data from cache
    df_sd_ds_ym_long_lat = cache.get('scene:df_sd_ds_ym_long_lat')

    # when the user selects an invalid range, the df is empty,
    # so the callback returns an empty object to avoid an error message
    if len(df_sd_ds_ym_long_lat.index) == 0:
        return dicts_to_geojson([])

    # get a subset of the df according to the selected date range
    sub_df = df_sd_ds_ym_long_lat[__get_logical_date_range(df_sd_ds_ym_long_lat, xaxis_range)]

    # build the geojson object with a list of markers
    return __get_geojson_data(sub_df)
def scene__update_graph_x_number_of_scenes_based_on_date_picker_range(signal, start_date, end_date):
    if signal is None:
        raise PreventUpdate

    logging.info(
        f'update_graph_number_of_scenes - start_date: {start_date}; end_date: {end_date}'
    )

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date, then the callback returns an empty object
    if start_date > end_date:
        return {'data': [], 'layout': {}, 'frames': []}

    # convert the dates from datetime back to str in order to pass the xaxis range to build the figure
    xaxis_range = [
        start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')
    ]

    return get_figure_of_graph_bar_plot_number_of_scenes(
        # get data from cache
        cache.get('scene:df_sd_dataset_year_month'),
        xaxis_range=xaxis_range,
        title='Number of Scenes by Dataset and Year-Month'
    )
def get_figure_of_graph_bar_plot_number_of_scenes(df, xaxis_range=None, title=None):
    figure_height = 800
    df_copy = df.copy()

    logging.info(f'get_figure_of_graph_bar_plot_number_of_scenes - df_copy.head(): \n{df_copy.head()}\n')
    logging.info(f'get_figure_of_graph_bar_plot_number_of_scenes - xaxis_range: {xaxis_range}\n')

    logical_date_range = __get_logical_date_range(df_copy, xaxis_range)

    # build the `data` parameter of `Figure`
    data = []

    # build one `bar` trace per dataset
    for dataset in df_copy['collection'].unique():
        sub_df = df_copy[(df_copy['collection'] == dataset) & logical_date_range]

        hovertext = 'Number of Scenes: ' + sub_df['number'].map(str) + '<br>' + \
                    'Period: ' + sub_df['year_month'].map(str) + '<br>' + \
                    'Dataset: ' + sub_df['collection'].map(str)

        data.append(Bar({
            'x': sub_df['year_month'],
            'y': sub_df['number'],
            'name': dataset,
            'text': sub_df['number'],  # text inside the bar
            'textposition': 'auto',
            'hovertext': hovertext,
        }))

    fig = Figure({
        'data': data,
        'layout': {
            'title': title,
            'xaxis': {'title': 'Period'},
            'yaxis': {'title': 'Number of scenes'},
            'plot_bgcolor': colors['background'],
            'paper_bgcolor': colors['background'],
            'font': {
                'color': colors['text']
            }
        }
    })

    fig.update_layout(
        barmode='group',
        height=figure_height,
        xaxis_tickangle=-45
    )

    return fig
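# A minimal sketch of the input expected by `get_figure_of_graph_bar_plot_number_of_scenes`,
# assuming the cached `scene:df_sd_dataset_year_month` dataframe provides the `collection`,
# `year_month` and `number` columns used above (the collection names and values below are made up):
#
#   df = DataFrame({
#       'collection': ['DATASET_A', 'DATASET_A', 'DATASET_B'],
#       'year_month': ['2020-01', '2020-02', '2020-01'],
#       'number': [10, 25, 7],
#   })
#   fig = get_figure_of_graph_bar_plot_number_of_scenes(
#       df,
#       xaxis_range=['2020-01-01', '2020-12-31'],
#       title='Number of Scenes by Dataset and Year-Month'
#   )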
def __configure_df_user(self):
    logging.info('**************************************************')
    logging.info('* __configure_df_user *')
    logging.info('**************************************************')

    # rename columns
    self.df_user.rename(columns={'userId': 'username'}, inplace=True)
    self.df_user.rename(columns={'addressId': 'address_id'}, inplace=True)
    self.df_user.rename(columns={'fullname': 'name'}, inplace=True)
    self.df_user.rename(columns={'registerDate': 'created_at'}, inplace=True)
    self.df_user.rename(columns={'areaCode': 'ddd'}, inplace=True)
    self.df_user.rename(columns={'company': 'company_name'}, inplace=True)
    self.df_user.rename(columns={'companyType': 'company_type'}, inplace=True)
    self.df_user.rename(columns={'activity': 'company_activity'}, inplace=True)

    # delete unnecessary columns
    del self.df_user['CNPJ_CPF']
    del self.df_user['compCNPJ']
    del self.df_user['fax']
    del self.df_user['userType']
    del self.df_user['userStatus']
    del self.df_user['unblockDate']
    del self.df_user['siape']

    # fix cases
    self.df_user['username'] = self.df_user['username'].str.lower().replace(' ', '_')
    self.df_user['name'] = self.df_user['name'].str.title()

    # drop single quotes so the values can be saved in the postgres db
    self.df_user.replace("'", "", regex=True, inplace=True)

    # remove unnecessary chars
    self.df_user.replace("%", "", regex=True, inplace=True)
    self.df_user.replace(r"\\", "", regex=True, inplace=True)

    self.__configure_df_user__fix_columns_types()

    # if a `created_at` cell is invalid, then copy the value from the previous row
    for i in range(1, len(self.df_user.index)):
        if self.df_user.loc[i, 'created_at'] == '0000-00-00 00:00:00' or \
                self.df_user.loc[i, 'created_at'] == '':
            self.df_user.loc[i, 'created_at'] = self.df_user.loc[i - 1, 'created_at']

    # if username is empty, then fill it with the email data
    self.df_user['username'] = self.df_user.apply(
        lambda row: row['username'] if row['username'] != '' else row['email'],
        axis=1
    )

    self.__fix_duplicated_user()

    # generate INSERT clause for each row
    # self.df_user['insert'] = self.df_user.apply(generate_user_insert_clause, axis=1)

    logging.info(f'df_user: \n{self.df_user.head()}\n')
def __main__get_dfs_configure_dfs_and_save_dfs(self, is_to_get_dfs_from_db=True):
    logging.info('**************************************************')
    logging.info('* main - settings *')
    logging.info('**************************************************')

    if is_to_get_dfs_from_db:
        # delete and recreate `assets` folder
        delete_and_recreate_folder(DATA_PATH)
        logging.info(f'`{DATA_PATH}` folder has been recreated successfully!\n')

        # remove invalid rows from the database, if they exist
        self.__remove_invalid_rows_from_database()

        # get dataframes from the database and save them in CSV files
        self.__get_dfs_from_mysqldb()
        self.__save_dfs()

    # get the saved dataframes
    self.__get_dfs_from_csv_files()

    # configure dataframes
    self.__configure_df_address()
    self.__configure_df_user()
    self.__configure_df_user_address()
    self.__configure_df_location()
    self.__configure_df_download()

    # save a new version of the dataframes after modifications
    self.__save_dfs(address_file_name='address_configured.csv',
                    user_file_name='user_configured.csv',
                    user_address_file_name='user_address_configured.csv',
                    location_file_name='location_configured.csv',
                    download_file_name='download_configured.csv')
def scene__date_picker_range__event__button_submit(n_clicks, start_date, end_date):
    """This event is called after the user clicks the submit button in order to update the cached data."""

    logging.info(
        f'scene__date_picker_range__event__button_submit - n_clicks: {n_clicks}'
    )
    logging.info(
        f'scene__date_picker_range__event__button_submit - start_date: {start_date}; end_date: {end_date}'
    )

    _start_date, _end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date, then the callback prevents the update
    if _start_date > _end_date:
        raise PreventUpdate

    # create df_base and save it in the cache
    df_base = create_df_base(start_date, end_date)
    logging.info(
        'scene__date_picker_range__event__button_submit - df_base has been created!'
    )

    # create auxiliary dfs and save them in the cache
    create_df_sd_dataset_year_month(df_base)
    create_df_sd_ds_ym_long_lat(df_base)
    logging.info(
        'scene__date_picker_range__event__button_submit - auxiliary dfs have been created!'
    )

    return n_clicks
def create_df_sd_ds_ym_long_lat(df_base):
    # group the df by 'collection', 'year_month', 'longitude' and 'latitude' to build the map
    df_sd_ds_ym_long_lat = filter_df_by(
        df_base,
        group_by=['collection', 'year_month', 'longitude', 'latitude'],
        sort_by=['year_month', 'collection', 'longitude', 'latitude']
    )

    logging.info(f'create_df_sd_ds_ym_long_lat - df_sd_ds_ym_long_lat.head(): \n{df_sd_ds_ym_long_lat.head()}\n')

    # save the variable inside the cache
    cache.set('scene:df_sd_ds_ym_long_lat', df_sd_ds_ym_long_lat)

    return df_sd_ds_ym_long_lat
def __get_logical_date_range(df, xaxis_range=None):
    # if there are values, then get a boolean mask according to the selected date range
    if xaxis_range:
        # [:-3] - drop the last 3 chars (the day part), keeping just the year and month
        start_date = xaxis_range[0][:-3]
        end_date = xaxis_range[1][:-3]

        logging.info(f'__get_logical_date_range - start_date: {start_date}')
        logging.info(f'__get_logical_date_range - end_date: {end_date}\n')

        # extract a boolean mask from the original df based on the selected date range
        return ((df['year_month'] >= start_date) & (df['year_month'] <= end_date))
    else:
        raise CatalogDashException('Invalid `xaxis_range`, it is empty!')
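# A small illustration (hypothetical values) of how `__get_logical_date_range` trims the dates:
# a 'YYYY-MM-DD' string minus its last 3 chars becomes 'YYYY-MM', which can then be compared
# lexicographically against the `year_month` column.
#
#   xaxis_range = ['2020-01-01', '2020-06-30']
#   xaxis_range[0][:-3]  # -> '2020-01'
#   xaxis_range[1][:-3]  # -> '2020-06'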
def create_df_sd_dataset_year_month(df_base):
    # group the df by 'collection' and 'year_month' to build the table
    df_sd_dataset_year_month = filter_df_by(
        df_base,
        group_by=['collection', 'year_month'],
        sort_by=['year_month', 'collection'],
        ascending=False
    )

    logging.info(f'create_df_sd_dataset_year_month - df_sd_dataset_year_month.head(): \n{df_sd_dataset_year_month.head()}\n')

    # save the variable inside the cache
    cache.set('scene:df_sd_dataset_year_month', df_sd_dataset_year_month)

    return df_sd_dataset_year_month
def wrapper(*args, **kwargs):
    attempts = 1

    while attempts <= max_retries:
        logging.info(f'max retries to connect: `{max_retries}`.')
        logging.info(f'wait `{wait}` secs.')

        try:
            return function(*args, **kwargs)
        except SQLAlchemyError:
            logging.error(f'trying to reconnect... attempts: `{attempts}`')
            attempts += 1
            sleep(wait)

    logging.error('Max retries exceeded...')
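# `wrapper` closes over `function`, `max_retries` and `wait`, so it presumably lives inside a
# decorator factory. A minimal sketch of that enclosing structure, with the factory name assumed
# (only the variables used above are taken from the source):
#
#   def retry_on_sqlalchemy_error(max_retries=3, wait=5):
#       def decorator(function):
#           def wrapper(*args, **kwargs):
#               ...  # the retry loop shown above
#           return wrapper
#       return decorator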
def create_df_download(df_base):
    # df_download - number of downloaded scenes by user, date and long/lat;
    # this df contains all the columns needed to build the tables and charts
    df_download = filter_df_by(
        df_base,
        count='item_id',
        group_by=['collection', 'satellite_sensor', 'email', 'name', 'date', 'longitude', 'latitude'],
        sort_by=['number'],
        ascending=False
    )

    logging.info(f'create_df_download - df_download.head(): \n{df_download.head()}\n')

    # save the variable inside the cache
    cache.set('download:df_download', df_download)

    return df_download
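# `filter_df_by` is defined elsewhere in the repository; judging only from the call sites above,
# it appears to group the dataframe, count the `count` column into a `number` column and sort the
# result. A rough, assumed pandas equivalent (not the project's actual implementation):
#
#   def filter_df_by(df, count='item_id', group_by=None, sort_by=None, ascending=True):
#       grouped = df.groupby(group_by)[count].count().to_frame('number').reset_index()
#       return grouped.sort_values(sort_by, ascending=ascending)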
def __remove_invalid_rows_from_database(self):
    logging.info('**************************************************')
    logging.info('* __remove_invalid_rows_from_database *')
    logging.info('**************************************************')

    # remove invalid rows from the Download table, if they exist
    self.db_mysql.execute(
        'DELETE FROM Download WHERE CHAR_LENGTH(sceneId) < 10;',
        is_transaction=True
    )

    logging.info('Invalid rows have been removed.\n')
def __recreate_tables_in_the_database(self):
    """Recreate the tables in the PostgreSQL database."""

    logging.info('**************************************************')
    logging.info('* __recreate_tables_in_the_database *')
    logging.info('**************************************************')

    self.db_postgres.init_db()

    logging.info('All tables have been recreated in the database successfully!\n')
def download__update_tables_by_parameters(signal, start_date, end_date, limit):
    if signal is None:
        raise PreventUpdate

    logging.info(f'download__update_tables_by_parameters - start_date: {start_date}; '
                 f'end_date: {end_date}; limit: {limit}')

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date or limit is None,
    # then the callback prevents updating the tables
    if start_date > end_date or limit is None:
        raise PreventUpdate

    # filter the base dataframe based on start date, end date and limit
    sub_df = __create_sub_df_based_on_parameters(
        # get data from cache
        cache.get('download:df_download'),
        start_date, end_date, limit
    )

    # filter the sub dataframe to get the number of downloaded items by collection
    sub_df_nodib_collection = sub_df.groupby(['collection'])['number'] \
        .sum().to_frame('number').reset_index() \
        .sort_values(['number'], ascending=False)

    # filter the sub dataframe to get the number of downloaded items by satellite and sensor
    sub_df_nodib_satellite_sensor = sub_df.groupby(['satellite_sensor'])['number'] \
        .sum().to_frame('number').reset_index() \
        .sort_values(['number'], ascending=False)

    # filter the sub dataframe to get the number of downloaded items by user and date
    sub_df_nodib_user_date = sub_df.groupby(['email', 'name', 'date'])['number'] \
        .sum().to_frame('number').reset_index() \
        .sort_values(['number'], ascending=False)

    # filter the previous dataframe to get the number of downloaded items by date only
    sub_df_nodib_date = sub_df_nodib_user_date.groupby(['date'])['number'] \
        .sum().to_frame('number').reset_index() \
        .sort_values(['number'], ascending=False)

    return sub_df_nodib_collection.to_dict('records'), \
           sub_df_nodib_satellite_sensor.to_dict('records'), \
           sub_df_nodib_date.to_dict('records'), \
           sub_df_nodib_user_date.to_dict('records')
def get_data_from_db(start_date=None, end_date=None):
    """Gets data from the database."""

    # postgres connection
    db = PostgreSQLRegister(database=PGDB_REGISTER)

    # df_base - `number of downloaded files by items` dataframe
    df_base = db.select_from_download_nofbi(start_date, end_date)

    logging.info(f'get_data_from_db - df_base size: {len(df_base.index)}')
    logging.info(
        'get_data_from_db - df_base.head(): \n'
        f"{df_base[['nofbi', 'item_id', 'username', 'date', 'longitude', 'latitude']].head()}\n"
    )

    # save the variable inside the cache
    cache.set('download:df_base', df_base)

    return df_base
def __fix_sequences_in_the_database(self):
    """Fix the table sequences in the PostgreSQL database."""

    logging.info('**************************************************')
    logging.info('* __fix_sequences_in_the_database *')
    logging.info('**************************************************')

    self.db_postgres.fix_sequences()

    logging.info('All sequences have been fixed in the database successfully!\n')
def download__update_map_by_parameters(signal, start_date, end_date, limit):
    if signal is None:
        raise PreventUpdate

    logging.info(f'download__update_map_by_parameters - start_date: {start_date}; '
                 f'end_date: {end_date}; limit: {limit}')

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date or limit is None,
    # then the callback returns an empty object
    if start_date > end_date or limit is None:
        return dicts_to_geojson([])

    sub_df = __create_sub_df_based_on_parameters(
        # get data from cache
        cache.get('download:df_download'),
        start_date, end_date, limit
    )

    # build the geojson object with a list of markers
    return __get_geojson_data(sub_df)
def __main__clear_and_insert_values_in_the_database(self):
    logging.info('**************************************************')
    logging.info('*__main__clear_and_insert_values_in_the_database *')
    logging.info('**************************************************')

    # initialize the database before inserting records
    self.__recreate_tables_in_the_database()

    # insert rows
    self.__insert_df_into_database(self.df_location, df_name='df_location')

    # sort the df by address_id for efficient deletes afterwards
    df_sorted_by_address_id = self.df_user_address.sort_values('address_id')

    self.__insert_df_into_database(df_sorted_by_address_id,
                                   df_name='df_user_address',
                                   insert_column='insert_address')
    self.__insert_df_into_database(self.df_user_address,
                                   df_name='df_user_address',
                                   insert_column='insert_user')
    self.__insert_df_into_database(self.df_download,
                                   df_name='df_download',
                                   chunks=40000)

    # fix sequences after inserting records
    self.__fix_sequences_in_the_database()
def __get_dfs_from_mysqldb(self):
    logging.info('**************************************************')
    logging.info('* __get_dfs_from_mysqldb *')
    logging.info('**************************************************')

    # get the dfs from the database
    self.df_download = self.db_mysql.select_from_download()
    self.df_user = self.db_mysql.select_from_user()
    self.df_address = self.db_mysql.select_from_address()
    self.df_location = self.db_mysql.select_from_location()

    # create an empty df just to avoid errors
    self.df_user_address = DataFrame({'test': [1]})

    logging.info('Dataframes have been loaded successfully.\n')
def __configure_df_location(self):
    logging.info('**************************************************')
    logging.info('* __configure_df_location *')
    logging.info('**************************************************')

    # rename columns
    self.df_location.rename(columns={'timestamp': 'created_at'}, inplace=True)

    # escape single quotes so the values can be saved in the postgres db
    self.df_location.replace("'", "''", regex=True, inplace=True)

    self.__configure_df_location__fix_columns_types()

    # generate INSERT clause for each row
    self.df_location['insert'] = self.df_location.apply(
        generate_location_insert_clause, axis=1)

    logging.info(f'df_location: \n{self.df_location.head()}\n')
def __main_read_dataframes_from_csv_files(self):
    logging.info('**************************************************')
    logging.info('* __main_read_dataframes_from_csv_files *')
    logging.info('**************************************************')

    # read CSV files
    self.__get_dfs_from_csv_files(
        address_file_name='address_configured.csv',
        user_file_name='user_configured.csv',
        user_address_file_name='user_address_configured.csv',
        location_file_name='location_configured.csv',
        download_file_name='download_configured.csv')

    # configure dataframes
    self.__configure_df_address__fix_columns_types()
    self.__configure_df_user__fix_columns_types()
    self.__configure_df_user_address__fix_columns_types()
    self.__configure_df_location__fix_columns_types()
    self.__configure_df_download__fix_columns_types()

    logging.info('Dataframes have been initialized successfully.\n')
def __configure_df_user_address(self):
    logging.info('**************************************************')
    logging.info('* __configure_df_user_address *')
    logging.info('**************************************************')

    # merge dataframes
    self.df_user_address = merge(self.df_user, self.df_address,
                                 how='left', on='address_id')

    self.__fix_duplicated_address_id()

    # generate INSERT clauses for each row
    self.df_user_address['insert_address'] = self.df_user_address.apply(
        generate_address_insert_clause, axis=1)
    self.df_user_address['insert_user'] = self.df_user_address.apply(
        generate_user_insert_clause, axis=1)

    logging.info(f'self.df_user_address: \n{self.df_user_address.head()}\n')
def download__date_picker_range__event__button_submit(n_clicks, start_date, end_date):
    """This event is called after the user clicks the submit button in order to update the cached data."""

    logging.info(f'download__date_picker_range__event__button_submit - n_clicks: {n_clicks}')
    logging.info(f'download__date_picker_range__event__button_submit - start_date: {start_date}; end_date: {end_date}')

    _start_date, _end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date, then the callback prevents the update
    if _start_date > _end_date:
        raise PreventUpdate

    # create df_base and save it in the cache
    df_base = get_data_from_db(start_date, end_date)
    logging.info('download__date_picker_range__event__button_submit - df_base has been created!')

    # create the auxiliary df and save it in the cache
    create_df_download(df_base)
    logging.info('download__date_picker_range__event__button_submit - auxiliary df has been created!')

    return n_clicks
def create_df_base(start_date=None, end_date=None):
    logging.info(f'create_df_base - start_date: {start_date}; end_date: {end_date}')

    df_base = copy_and_organize_df(get_data_from_db(start_date=start_date, end_date=end_date))

    logging.info(f'create_df_base - df_base size: {len(df_base.index)}')
    logging.info(f'create_df_base - df_base.head(): \n{df_base.head()}\n')

    # memory_usage = df_base.memory_usage(index=True).sum()
    # logging.info(f'create_df_base - df_base (df) memory_usage: {bytesto(memory_usage, to="m")} MB\n')

    # save the variable inside the cache
    cache.set('scene:df_base', df_base)

    return df_base
def __save_dfs(self, download_file_name='download.csv', user_file_name='user.csv',
               address_file_name='address.csv', user_address_file_name='user_address.csv',
               location_file_name='location.csv'):
    """Save the dataframes in CSV files."""

    logging.info('**************************************************')
    logging.info('* __save_dfs *')
    logging.info('**************************************************')

    self.df_location.to_csv(DATA_PATH + location_file_name, index=False)
    self.df_user.to_csv(DATA_PATH + user_file_name, index=False)
    self.df_address.to_csv(DATA_PATH + address_file_name, index=False)
    self.df_user_address.to_csv(DATA_PATH + user_address_file_name, index=False)
    self.df_download.to_csv(DATA_PATH + download_file_name, index=False)

    logging.info(
        f'`{download_file_name}`, `{user_file_name}`, `{address_file_name}`, '
        f'`{user_address_file_name}` and `{location_file_name}` files '
        'have been saved successfully!\n')
def __configure_df_address(self):
    logging.info('**************************************************')
    logging.info('* __configure_df_address *')
    logging.info('**************************************************')

    # rename columns
    self.df_address.rename(columns={'addressId': 'address_id'}, inplace=True)

    # delete unnecessary columns
    del self.df_address['addressType']
    del self.df_address['CNPJ_CPF']
    del self.df_address['compCNPJ']
    del self.df_address['digitCNPJ']
    del self.df_address['delivery']
    del self.df_address['payment']
    del self.df_address['userId']

    # fix cases
    self.df_address['street'] = self.df_address['street'].str.title()
    self.df_address['number'] = self.df_address['number'].str.strip()
    self.df_address['city'] = self.df_address['city'].str.title()
    self.df_address['state'] = self.df_address['state'].str.upper()
    self.df_address['country'] = self.df_address['country'].str.title()

    # remove unnecessary chars
    self.df_address.replace("%", "", regex=True, inplace=True)

    # drop single quotes so the values can be saved in the postgres db
    self.df_address.replace("'", "", regex=True, inplace=True)

    self.__configure_df_address__fix_columns_types()

    # generate INSERT clause for each row
    # self.df_address['insert'] = self.df_address.apply(generate_address_insert_clause, axis=1)

    logging.info(f'df_address: \n{self.df_address.head()}\n')