def sql_upload_step_one(
    request: http.HttpRequest,
    workflow: models.Workflow,
    conn: models.SQLConnection,
    run_params: Dict,
):
    """Perform the first step to load a data frame from a SQL connection.

    :param request: Request received.
    :param workflow: Workflow being processed.
    :param conn: Database connection object.
    :param run_params: Dictionary with the additional run parameters.
    :return: Nothing, it creates the new dataframe in the database.
    """
    # Process SQL connection using pandas
    data_frame = _load_df_from_sqlconnection(conn, run_params)

    # Verify the data frame
    pandas.verify_data_frame(data_frame)

    # Store the data frame in the DB.
    # Get frame info with three lists: names, types and is_key
    frame_info = pandas.store_temporary_dataframe(data_frame, workflow)

    # Dictionary to populate gradually throughout the sequence of steps. It
    # is stored in the session.
    request.session['upload_data'] = {
        'initial_column_names': frame_info[0],
        'column_types': frame_info[1],
        'src_is_key_column': frame_info[2],
        'step_1': reverse('dataops:sqlupload_start', kwargs={'pk': conn.id}),
        'log_upload': models.Log.WORKFLOW_DATA_SQL_UPLOAD}
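
# The helper _load_df_from_sqlconnection used above is not shown in this
# section. Below is a minimal sketch of such a loader, assuming the
# SQLConnection fields that appear later in the view context (conn_type,
# conn_driver, db_user, db_password, db_host, db_port, db_name, db_table)
# and that a missing password may arrive through run_params. The name and
# field handling are assumptions for illustration, not the project's code.
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL


def _load_df_from_sqlconnection_sketch(conn, run_params) -> pd.DataFrame:
    """Hypothetical loader: build an SQLAlchemy engine and read the table."""
    drivername = conn.conn_type
    if conn.conn_driver:
        # e.g. 'postgresql' + 'psycopg2' -> 'postgresql+psycopg2'
        drivername = '{0}+{1}'.format(conn.conn_type, conn.conn_driver)

    url = URL.create(
        drivername=drivername,
        username=conn.db_user,
        password=conn.db_password or run_params.get('db_password'),
        host=conn.db_host,
        port=conn.db_port or None,
        database=conn.db_name)

    engine = create_engine(url)
    try:
        # Read the whole table into a data frame.
        return pd.read_sql_table(conn.db_table, engine)
    finally:
        engine.dispose()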
def validate_data_frame(self):
    """Check that the dataframe can be properly stored.

    :return: The cleaned data
    """
    try:
        # Verify the data frame
        pandas.verify_data_frame(self.data_frame)
    except OnTaskDataFrameNoKey as exc:
        self.add_error(None, str(exc))
        return

    # Store the data frame in the DB.
    try:
        # Get frame info with three lists: names, types and is_key
        self.frame_info = pandas.store_temporary_dataframe(
            self.data_frame,
            self.workflow)
    except Exception as exc:
        self.add_error(
            None,
            _('Unable to process file ({0}).').format(str(exc)))
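
# verify_data_frame and OnTaskDataFrameNoKey are project internals not shown
# here. The sketch below illustrates the kind of check implied by the
# exception name: at least one column whose values are unique and non-null
# must exist to act as a key. The names carry a _sketch suffix because the
# real implementation may perform additional checks (types, column names).
import pandas as pd


class OnTaskDataFrameNoKeySketch(Exception):
    """Raised when no column can serve as a unique key (sketch only)."""


def verify_data_frame_sketch(data_frame: pd.DataFrame) -> None:
    """Raise if no column qualifies as a key column."""
    for cname in data_frame.columns:
        column = data_frame[cname]
        if not column.isnull().any() and column.is_unique:
            # Found at least one candidate key column: the frame is storable.
            return
    raise OnTaskDataFrameNoKeySketch(
        'The data frame has no column with unique, non-empty values.')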
def batch_load_df_from_athenaconnection(
    workflow: models.Workflow,
    conn: models.AthenaConnection,
    run_params: Dict,
    log_item: models.Log,
):
    """Batch load a DF from an Athena connection.

    run_params has:
    - aws_secret_access_key: Optional[str] = None
    - aws_session_token: Optional[str] = None
    - table_name: Optional[str] = None
    - key_column_name: Optional[str] = None
    - merge_method: Optional[str] = None

    Example of the underlying pyathena access::

        from pyathena import connect
        from pyathena.pandas_cursor import PandasCursor

        cursor = connect(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            aws_session_token=aws_session_token,
            s3_staging_dir=staging_dir,
            region_name=region_name)

        df = pd.read_sql('SELECT * FROM given_table_name', cursor)
        print(df.describe())
        print(df.head())

    :param workflow: Workflow to store the new data
    :param conn: AthenaConnection object with the connection parameters.
    :param run_params: Dictionary with additional connection parameters
    :param log_item: Log object to reflect the status of the execution
    :return: Nothing.
    """
    staging_dir = 's3://{0}'.format(conn.aws_bucket_name)
    if conn.aws_file_path:
        staging_dir = staging_dir + '/' + conn.aws_file_path

    cursor = connect(
        aws_access_key_id=conn.aws_access_key,
        aws_secret_access_key=run_params['aws_secret_access_key'],
        aws_session_token=run_params['aws_session_token'],
        s3_staging_dir=staging_dir,
        region_name=conn.aws_region_name)

    data_frame = pd.read_sql(
        'SELECT * FROM {0}'.format(run_params['table_name']),
        cursor)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    data_frame = pandas.detect_datetime_columns(data_frame)

    pandas.verify_data_frame(data_frame)

    col_names, col_types, is_key = pandas.store_temporary_dataframe(
        data_frame,
        workflow)

    upload_data = {
        'initial_column_names': col_names,
        'col_types': col_types,
        'src_is_key_column': is_key,
        'rename_column_names': col_names[:],
        'columns_to_upload': [True] * len(col_names),
        'keep_key_column': is_key[:]}

    if not workflow.has_data_frame():
        # Regular load operation
        pandas.store_workflow_table(workflow, upload_data)
        log_item.payload['col_names'] = col_names
        log_item.payload['col_types'] = col_types
        log_item.payload['column_unique'] = is_key
        log_item.payload['num_rows'] = workflow.nrows
        log_item.payload['num_cols'] = workflow.ncols
        log_item.save(update_fields=['payload'])
        return

    # Merge operation
    upload_data['dst_column_names'] = workflow.get_column_names()
    upload_data['dst_is_unique_column'] = workflow.get_column_unique()
    upload_data['dst_unique_col_names'] = [
        cname for idx, cname in enumerate(upload_data['dst_column_names'])
        if upload_data['dst_is_unique_column'][idx]]
    upload_data['src_selected_key'] = run_params['merge_key']
    upload_data['dst_selected_key'] = run_params['merge_key']
    upload_data['how_merge'] = run_params['merge_method']

    dst_df = pandas.load_table(workflow.get_data_frame_table_name())
    src_df = pandas.load_table(workflow.get_data_frame_upload_table_name())

    try:
        pandas.perform_dataframe_upload_merge(
            workflow,
            dst_df,
            src_df,
            upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_data_frame_upload_table_name())
        raise Exception(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    col_names, col_types, is_key = workflow.get_column_info()
    log_item.payload['col_names'] = col_names
    log_item.payload['col_types'] = col_types
    log_item.payload['column_unique'] = is_key
    log_item.payload['num_rows'] = workflow.nrows
    log_item.payload['num_cols'] = workflow.ncols
    log_item.save(update_fields=['payload'])
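
# perform_dataframe_upload_merge is an internal routine of the project. The
# sketch below shows only the core of such a merge, assuming 'how_merge' is a
# pandas merge strategy ('inner', 'outer', 'left', 'right') and that the
# selected key columns identify rows on each side. The real routine also
# handles column renaming, column selection and type reconciliation, which
# are omitted here.
import pandas as pd


def merge_sketch(
    dst_df: pd.DataFrame,
    src_df: pd.DataFrame,
    upload_data: dict,
) -> pd.DataFrame:
    """Merge the source frame into the destination frame on the key columns."""
    return pd.merge(
        dst_df,
        src_df,
        how=upload_data['how_merge'],
        left_on=upload_data['dst_selected_key'],
        right_on=upload_data['src_selected_key'])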
def sqlupload_start(
    request: HttpRequest,
    pk: int,
    workflow: Optional[Workflow] = None,
) -> HttpResponse:
    """Load a data frame using a SQL connection.

    The four-step process will populate the following dictionary with name
    upload_data (divided by the steps in which they are set).

    STEP 1:

    initial_column_names: List of column names in the initial file.
    column_types: List of column types as detected by pandas.
    src_is_key_column: Boolean list with src columns that are unique.
    step_1: URL name of the first step.

    :param request: Web request
    :param pk: Primary key of the SQL connection used
    :param workflow: Workflow being processed
    :return: Creates the upload_data dictionary in the session
    """
    conn = SQLConnection.objects.filter(pk=pk).filter(enabled=True).first()
    if not conn:
        return redirect('dataops:sqlconns_instructor_index')

    form = None
    missing_field = conn.has_missing_fields()
    if missing_field:
        # The connection needs a password to operate
        form = SQLRequestConnectionParam(request.POST or None, instance=conn)

    context = {
        'form': form,
        'wid': workflow.id,
        'dtype': 'SQL',
        'dtype_select': _('SQL connection'),
        'valuerange': range(5) if workflow.has_table() else range(3),
        'prev_step': reverse('dataops:sqlconns_instructor_index'),
        'conn_type': conn.conn_type,
        'conn_driver': conn.conn_driver,
        'db_user': conn.db_user,
        'db_passwd': _('<PROTECTED>') if conn.db_password else '',
        'db_host': conn.db_host,
        'db_port': conn.db_port,
        'db_name': conn.db_name,
        'db_table': conn.db_table}

    if request.method == 'POST' and (not missing_field or form.is_valid()):
        run_params = conn.get_missing_fields(
            form.cleaned_data if form else {})

        # Process SQL connection using pandas
        try:
            data_frame = load_df_from_sqlconnection(conn, run_params)
            # Verify the data frame
            verify_data_frame(data_frame)
        except OnTaskDataFrameNoKey as exc:
            messages.error(request, str(exc))
            return render(request, 'dataops/sqlupload_start.html', context)
        except Exception as exc:
            messages.error(
                request,
                _('Unable to obtain data: {0}').format(str(exc)))
            return render(request, 'dataops/sqlupload_start.html', context)

        # Store the data frame in the DB.
        try:
            # Get frame info with three lists: names, types and is_key
            frame_info = store_temporary_dataframe(data_frame, workflow)
        except Exception:
            messages.error(
                request,
                _('The data from this connection cannot be processed.'))
            return render(request, 'dataops/sqlupload_start.html', context)

        # Dictionary to populate gradually throughout the sequence of steps.
        # It is stored in the session.
        request.session['upload_data'] = {
            'initial_column_names': frame_info[0],
            'column_types': frame_info[1],
            'src_is_key_column': frame_info[2],
            'step_1': reverse(
                'dataops:sqlupload_start',
                kwargs={'pk': conn.id}),
        }

        return redirect('dataops:upload_s2')

    return render(request, 'dataops/sqlupload_start.html', context)
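
# The view above resolves several URL names with reverse/redirect
# ('dataops:sqlupload_start', 'dataops:upload_s2',
# 'dataops:sqlconns_instructor_index'). A minimal sketch of how those names
# could be wired in the app's urls.py, assuming an app_name of 'dataops' and
# view callables with matching names; the real routing module may organize
# these routes differently.
from django.urls import path

from . import views

app_name = 'dataops'

urlpatterns = [
    # Step 1 of the SQL upload; the <int:pk> kwarg matches the reverse() call.
    path(
        'sqlupload_start/<int:pk>/',
        views.sqlupload_start,
        name='sqlupload_start'),
    # Second step of the generic upload sequence (hypothetical view name).
    path('upload_s2/', views.upload_s2, name='upload_s2'),
    # Index of SQL connections shown to instructors (hypothetical view name).
    path(
        'sqlconns_instructor_index/',
        views.sqlconns_instructor_index,
        name='sqlconns_instructor_index'),
]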