def put(
    self,
    request: HttpRequest,
    wid: int,
    format=None,
    workflow: Optional[Workflow] = None,
) -> HttpResponse:
    """Process the put request."""
    # Get the dst_df
    dst_df = load_table(workflow.get_data_frame_table_name())

    serializer = self.serializer_class(data=request.data)
    if not serializer.is_valid():
        return Response(
            serializer.errors,
            status=status.HTTP_400_BAD_REQUEST)

    # Check that the parameters are correct
    how = serializer.validated_data['how']
    if how not in ['left', 'right', 'outer', 'inner']:
        raise APIException(
            _('how must be one of left, right, outer or inner'))

    left_on = serializer.validated_data['left_on']
    if not is_unique_column(dst_df[left_on]):
        raise APIException(
            _('column {0} does not contain a unique key.').format(left_on))

    # Operation has been accepted by the serializer
    src_df = serializer.validated_data['src_df']

    right_on = serializer.validated_data['right_on']
    if right_on not in list(src_df.columns):
        raise APIException(
            _('column {0} not found in data frame').format(right_on))

    if not is_unique_column(src_df[right_on]):
        raise APIException(
            _('column {0} does not contain a unique key.').format(right_on))

    merge_info = {
        'how_merge': how,
        'dst_selected_key': left_on,
        'src_selected_key': right_on,
        'initial_column_names': list(src_df.columns),
        'rename_column_names': list(src_df.columns),
        'columns_to_upload': [True] * len(list(src_df.columns)),
    }

    # Ready to perform the MERGE
    try:
        perform_dataframe_upload_merge(workflow, dst_df, src_df, merge_info)
    except Exception as exc:
        raise APIException(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    # Merge went through.
    return Response(serializer.data, status=status.HTTP_201_CREATED)
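# The view above accepts a column as a merge key only if is_unique_column
# does. A minimal sketch of that test, assuming the usual definition
# (no missing values, no duplicates); the real helper lives elsewhere in
# the code base, so treat this as an approximation.
import pandas as pd


def is_unique_column(column: pd.Series) -> bool:
    """Return True if the column can act as a key: no NaNs, no duplicates."""
    return not column.isnull().any() and column.is_unique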
def upload_step_four(
    request: http.HttpRequest,
    workflow: models.Workflow,
    upload_data: Dict,
) -> http.HttpResponse:
    """Perform the merge operation.

    :param request: Received request
    :param workflow: Workflow being processed
    :param upload_data: Dictionary with all the information about the merge.
    :return: HttpResponse
    """
    # Get the dataframes to merge
    try:
        dst_df = pandas.load_table(workflow.get_data_frame_table_name())
        src_df = pandas.load_table(workflow.get_upload_table_name())
    except Exception:
        return render(
            request,
            'error.html',
            {'message': _('Exception while loading data frame')})

    try:
        pandas.perform_dataframe_upload_merge(
            workflow,
            dst_df,
            src_df,
            upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_upload_table_name())
        col_info = workflow.get_column_info()
        workflow.log(
            request.user,
            models.Log.WORKFLOW_DATA_FAILEDMERGE,
            column_names=col_info[0],
            column_types=col_info[1],
            column_unique=col_info[2],
            error_message=str(exc))
        messages.error(request, _('Merge operation failed. ') + str(exc))
        return redirect(reverse('table:display'))

    col_info = workflow.get_column_info()
    workflow.log(
        request.user,
        upload_data['log_upload'],
        column_names=col_info[0],
        column_types=col_info[1],
        column_unique=col_info[2])
    store_workflow_in_session(request.session, workflow)
    request.session.pop('upload_data', None)
    return redirect(reverse('table:display'))
def put(
    self,
    request: http.HttpRequest,
    wid: int,
    format=None,
    workflow: Optional[models.Workflow] = None,
) -> http.HttpResponse:
    """Process the put request."""
    del wid, format
    # Get the dst_df
    dst_df = pandas.load_table(workflow.get_data_frame_table_name())

    serializer = self.serializer_class(data=request.data)
    if not serializer.is_valid():
        return Response(
            serializer.errors,
            status=status.HTTP_400_BAD_REQUEST)

    src_df = serializer.validated_data['src_df']
    how = serializer.validated_data['how']
    left_on = serializer.validated_data['left_on']
    right_on = serializer.validated_data['right_on']
    error = pandas.validate_merge_parameters(
        dst_df, src_df, how, left_on, right_on)
    if error:
        raise APIException(error)

    # Ready to perform the MERGE
    try:
        pandas.perform_dataframe_upload_merge(
            workflow,
            dst_df,
            src_df,
            {
                'how_merge': how,
                'dst_selected_key': left_on,
                'src_selected_key': right_on,
                'initial_column_names': list(src_df.columns),
                'rename_column_names': list(src_df.columns),
                'columns_to_upload': [True] * len(list(src_df.columns)),
            })
    except Exception as exc:
        raise APIException(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    # Merge went through.
    return Response(serializer.data, status=status.HTTP_201_CREATED)
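# This version delegates to pandas.validate_merge_parameters the checks the
# older view above performed inline. A sketch of those checks, assembled
# from that inline validation and reusing the is_unique_column sketch above;
# the real helper may do more.
from typing import Optional

import pandas as pd


def validate_merge_parameters(
    dst_df: pd.DataFrame,
    src_df: pd.DataFrame,
    how: str,
    left_on: str,
    right_on: str,
) -> Optional[str]:
    """Return an error message, or None if the parameters are usable."""
    if how not in ('left', 'right', 'outer', 'inner'):
        return 'how must be one of left, right, outer or inner'
    if left_on not in list(dst_df.columns):
        return 'column {0} not found in data frame'.format(left_on)
    if not is_unique_column(dst_df[left_on]):
        return 'column {0} does not contain a unique key.'.format(left_on)
    if right_on not in list(src_df.columns):
        return 'column {0} not found in data frame'.format(right_on)
    if not is_unique_column(src_df[right_on]):
        return 'column {0} does not contain a unique key.'.format(right_on)
    return None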
def test_merge_inner(self):
    # Get the workflow
    self.workflow = Workflow.objects.all()[0]

    # Parse the source data frame
    df_dst, df_src = self.parse_data_frames()

    self.merge_info['how_merge'] = 'inner'

    result = perform_dataframe_upload_merge(
        self.workflow,
        df_dst,
        df_src,
        self.merge_info)

    # Load again the workflow data frame
    df_dst = load_table(self.workflow.get_data_frame_table_name())

    # Result must be correct (None)
    self.assertEqual(result, None)
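# Toy illustration (not part of the test suite) of the inner-merge semantics
# the test exercises: a row survives only when its key appears in both
# frames. Frame and column names are invented.
import pandas as pd

df_dst = pd.DataFrame({'sid': [1, 2, 3], 'age': [12, 13, 14]})
df_src = pd.DataFrame({'sid': [2, 3, 4], 'score': [8, 9, 10]})
print(pd.merge(df_dst, df_src, how='inner', on='sid'))
#    sid  age  score
# 0    2   13      8
# 1    3   14      9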
def batch_load_df_from_athenaconnection(
    workflow: models.Workflow,
    conn: models.AthenaConnection,
    run_params: Dict,
    log_item: models.Log,
):
    """Batch load a DF from an Athena connection.

    run_params has:
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    table_name: Optional[str] = None
    merge_key: Optional[str] = None
    merge_method: Optional[str] = None

    from pyathena import connect
    from pyathena.pandas_cursor import PandasCursor

    cursor = connect(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token,
        s3_staging_dir=staging_dir,
        region_name=region_name)
    df = pd.read_sql('SELECT * FROM given_table_name', cursor)
    print(df.describe())
    print(df.head())

    :param workflow: Workflow to store the new data
    :param conn: AthenaConnection object with the connection parameters.
    :param run_params: Dictionary with additional connection parameters
    :param log_item: Log object to reflect the status of the execution
    :return: Nothing.
    """
    staging_dir = 's3://{0}'.format(conn.aws_bucket_name)
    if conn.aws_file_path:
        staging_dir = staging_dir + '/' + conn.aws_file_path

    cursor = connect(
        aws_access_key_id=conn.aws_access_key,
        aws_secret_access_key=run_params['aws_secret_access_key'],
        aws_session_token=run_params['aws_session_token'],
        s3_staging_dir=staging_dir,
        region_name=conn.aws_region_name)

    # pd.read_sql_table requires a SQLAlchemy connectable; with a pyathena
    # DBAPI connection the query has to go through pd.read_sql.
    data_frame = pd.read_sql(
        'SELECT * FROM {0}'.format(run_params['table_name']),
        cursor)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    data_frame = pandas.detect_datetime_columns(data_frame)

    pandas.verify_data_frame(data_frame)

    col_names, col_types, is_key = pandas.store_temporary_dataframe(
        data_frame,
        workflow)

    upload_data = {
        'initial_column_names': col_names,
        'col_types': col_types,
        'src_is_key_column': is_key,
        'rename_column_names': col_names[:],
        'columns_to_upload': [True] * len(col_names),
        'keep_key_column': is_key[:]}

    if not workflow.has_data_frame():
        # Regular load operation
        pandas.store_workflow_table(workflow, upload_data)
        log_item.payload['col_names'] = col_names
        log_item.payload['col_types'] = col_types
        log_item.payload['column_unique'] = is_key
        log_item.payload['num_rows'] = workflow.nrows
        log_item.payload['num_cols'] = workflow.ncols
        log_item.save(update_fields=['payload'])
        return

    # Merge operation
    upload_data['dst_column_names'] = workflow.get_column_names()
    upload_data['dst_is_unique_column'] = workflow.get_column_unique()
    upload_data['dst_unique_col_names'] = [
        cname for idx, cname in enumerate(upload_data['dst_column_names'])
        if upload_data['dst_is_unique_column'][idx]]
    upload_data['src_selected_key'] = run_params['merge_key']
    upload_data['dst_selected_key'] = run_params['merge_key']
    upload_data['how_merge'] = run_params['merge_method']

    dst_df = pandas.load_table(workflow.get_data_frame_table_name())
    src_df = pandas.load_table(workflow.get_data_frame_upload_table_name())

    try:
        pandas.perform_dataframe_upload_merge(
            workflow,
            dst_df,
            src_df,
            upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_data_frame_upload_table_name())
        raise Exception(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    col_names, col_types, is_key = workflow.get_column_info()
    log_item.payload['col_names'] = col_names
    log_item.payload['col_types'] = col_types
    log_item.payload['column_unique'] = is_key
    log_item.payload['num_rows'] = workflow.nrows
    log_item.payload['num_cols'] = workflow.ncols
    log_item.save(update_fields=['payload'])
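# Example run_params for a merge run, limited to the keys this function
# actually reads; every value below is a placeholder, not a real credential
# or table name.
run_params = {
    'aws_secret_access_key': '...',  # secret not stored in the connection
    'aws_session_token': '...',
    'table_name': 'results_table',   # hypothetical Athena table name
    'merge_key': 'sid',              # hypothetical key column in both frames
    'merge_method': 'inner',         # one of left, right, outer, inner
}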
def upload_s4(
    request: HttpRequest,
    workflow: Optional[Workflow] = None,
) -> HttpResponse:
    """Step 4: Show the user the expected effect of the merge and perform it.

    ASSUMES:

    initial_column_names: List of column names in the initial file.
    column_types: List of column types as detected by pandas
    src_is_key_column: Boolean list with src columns that are unique
    step_1: URL name of the first step
    rename_column_names: Modified column names to remove ambiguity when
        merging.
    columns_to_upload: Boolean list denoting the columns in SRC that are
        marked for upload.
    dst_column_names: List of column names in destination frame
    dst_is_unique_column: Boolean list with dst columns that are unique
    dst_unique_col_names: List with the column names that are unique
    dst_selected_key: Key column name selected in DST
    src_selected_key: Key column name selected in SRC
    how_merge: How to merge. One of {left, right, outer, inner}

    :param request: Web request
    :return:
    """
    # Get the dictionary containing the information about the upload
    upload_data = request.session.get('upload_data')
    if not upload_data:
        # If there is no object, someone is trying to jump directly here.
        return redirect('dataops:uploadmerge')

    # Check the type of request that is being processed
    if request.method == 'POST':
        # We are processing a POST request

        # Get the dataframes to merge
        try:
            dst_df = load_table(workflow.get_data_frame_table_name())
            src_df = load_table(workflow.get_data_frame_upload_table_name())
        except Exception:
            return render(
                request,
                'error.html',
                {'message': _('Exception while loading data frame')})

        try:
            perform_dataframe_upload_merge(
                workflow,
                dst_df,
                src_df,
                upload_data)
        except Exception as exc:
            # Nuke the temporary table
            table_queries.delete_table(
                workflow.get_data_frame_upload_table_name())
            col_info = workflow.get_column_info()
            workflow.log(
                request.user,
                Log.WORKFLOW_DATA_FAILEDMERGE,
                column_names=col_info[0],
                column_types=col_info[1],
                column_unique=col_info[2],
                error_message=str(exc))
            messages.error(request, _('Merge operation failed. ') + str(exc))
            return redirect(reverse('table:display'))

        col_info = workflow.get_column_info()
        workflow.log(
            request.user,
            Log.WORKFLOW_DATA_MERGE,
            column_names=col_info[0],
            column_types=col_info[1],
            column_unique=col_info[2])
        store_workflow_in_session(request, workflow)
        request.session.pop('upload_data', None)
        return redirect(reverse('table:display'))

    # We are processing a GET request
    dst_column_names = upload_data['dst_column_names']
    dst_selected_key = upload_data['dst_selected_key']
    src_selected_key = upload_data['src_selected_key']

    # List of final column names
    final_columns = sorted(set().union(
        dst_column_names,
        upload_data['rename_column_names']))

    # Dictionary with (new src column name: (old name, is_uploaded?))
    src_info = {rname: (iname, upload) for (rname, iname, upload) in zip(
        upload_data['rename_column_names'],
        upload_data['initial_column_names'],
        upload_data['columns_to_upload'])}

    # Create the strings to show in the table for each of the rows explaining
    # what is going to be the effect of the update operation over them.
    #
    # There are 8 cases depending on the column name being a key column,
    # in DST, SRC, if SRC is being renamed, and SRC is being loaded.
    #
    # Case 1: The column is the key column used for the merge (skip it)
    #
    # Case 2: in DST, NOT in SRC:
    #         Dst | |
    #
    # Case 3: in DST, in SRC, NOT LOADED
    #         Dst Name | <-- | Src new name (Ignored)
    #
    # Case 4: NOT in DST, in SRC, NOT LOADED
    #         | | Src new name (Ignored)
    #
    # Case 5: in DST, in SRC, Loaded, no rename:
    #         Dst Name (Update) | <-- | Src name
    #
    # Case 6: in DST, in SRC, loaded, rename:
    #         Dst Name (Update) | <-- | Src new name (Renamed)
    #
    # Case 7: NOT in DST, in SRC, loaded, no rename
    #         Dst Name (NEW) | <-- | src name
    #
    # Case 8: NOT in DST, in SRC, loaded, renamed
    #         Dst Name (NEW) | <-- | src name (renamed)
    column_info = []
    for colname in final_columns:
        # Case 1: Skip the keys
        if colname == src_selected_key or colname == dst_selected_key:
            continue

        # Case 2: Column is in DST and left untouched (no counterpart in SRC)
        if colname not in list(src_info.keys()):
            column_info.append((colname, False, ''))
            continue

        # Get old name and if it is going to be loaded
        old_name, to_load = src_info[colname]

        # Column is not going to be loaded anyway
        if not to_load:
            if colname in dst_column_names:
                # Case 3
                column_info.append(
                    (colname, False, colname + _(' (Ignored)')))
            else:
                # Case 4
                column_info.append(('', False, colname + _(' (Ignored)')))
            continue

        # Initial name on the dst data frame
        dst_name = colname
        # Column not present in DST, so it is a new column
        if colname not in dst_column_names:
            dst_name += _(' (New)')
        else:
            dst_name += _(' (Update)')

        src_name = colname
        if colname != old_name:
            src_name += _(' (Renamed)')

        # Cases 5 - 8
        column_info.append((dst_name, True, src_name))

    # Store the value in the request object and update
    request.session['upload_data'] = upload_data

    return render(
        request,
        'dataops/upload_s4.html',
        {
            'prev_step': reverse('dataops:upload_s3'),
            'info': column_info,
            'valuerange': range(5),
            'next_name': 'Finish'})
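# Standalone sketch (invented names, not part of the module) of the
# eight-case labelling implemented above, handy for sanity-checking the
# preview table outside a Django request cycle.
def describe_merge_effect(dst_cols, src_renamed, src_initial, to_upload, key):
    src_info = {
        rname: (iname, upload)
        for rname, iname, upload in zip(src_renamed, src_initial, to_upload)}
    rows = []
    for col in sorted(set(dst_cols) | set(src_renamed)):
        if col == key:  # Case 1: the merge key is skipped
            continue
        if col not in src_info:  # Case 2: untouched DST column
            rows.append((col, False, ''))
            continue
        old_name, to_load = src_info[col]
        if not to_load:  # Cases 3 and 4: SRC column ignored
            rows.append(
                (col if col in dst_cols else '', False, col + ' (Ignored)'))
            continue
        dst_name = col + (' (Update)' if col in dst_cols else ' (New)')
        src_name = col + (' (Renamed)' if col != old_name else '')
        rows.append((dst_name, True, src_name))  # Cases 5 to 8
    return rows


# Example: 'age' is updated, 'score' is new (renamed from 'mark'),
# 'email' is ignored, and the key 'sid' is skipped.
print(describe_merge_effect(
    dst_cols=['sid', 'age', 'email'],
    src_renamed=['sid', 'age', 'score', 'email'],
    src_initial=['sid', 'age', 'mark', 'email'],
    to_upload=[True, True, True, False],
    key='sid'))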
def _execute_plugin(
    workflow,
    plugin_info,
    input_column_names,
    output_column_names,
    output_suffix,
    merge_key,
    plugin_params,
):
    """Execute the run method in the plugin.

    Execute the run method in a plugin with the dataframe from the given
    workflow.

    :param workflow: Workflow object being processed
    :param plugin_info: PluginRegistry object being processed
    :param input_column_names: List of input column names
    :param output_column_names: List of output column names
    :param output_suffix: Suffix that is added to the output column names
    :param merge_key: Key column to use in the merge
    :param plugin_params: Dictionary with the parameters to execute the
        plugin
    :return: Nothing, the result is stored in the log with log_id
    """
    try:
        plugin_instance, msgs = load_plugin(plugin_info.filename)
    except OnTaskServiceException:
        raise Exception(
            ugettext('Unable to instantiate plugin "{0}"').format(
                plugin_info.name))

    # Check that the list of given inputs is consistent: if plugin has a list
    # of inputs, it has to have the same length as the given list.
    if (plugin_instance.get_input_column_names()
            and len(plugin_instance.get_input_column_names())
            != len(input_column_names)):
        raise Exception(
            ugettext(
                'Inconsistent number of inputs when invoking plugin "{0}"',
            ).format(plugin_info.name))

    # Check that the list of given outputs has the same length as the list of
    # outputs proposed by the plugin
    if (plugin_instance.get_output_column_names()
            and len(plugin_instance.get_output_column_names())
            != len(output_column_names)):
        raise Exception(
            ugettext(
                'Inconsistent number of outputs when invoking plugin "{0}"',
            ).format(plugin_info.name))

    # Get the data frame from the workflow
    try:
        df = pandas.load_table(workflow.get_data_frame_table_name())
    except Exception as exc:
        raise Exception(
            ugettext(
                'Exception when retrieving the data frame from workflow: {0}',
            ).format(str(exc)))

    # Set the updated names of the input, output columns, and the suffix
    if not plugin_instance.get_input_column_names():
        plugin_instance.input_column_names = input_column_names
    plugin_instance.output_column_names = output_column_names
    plugin_instance.output_suffix = output_suffix

    # Create a new dataframe with the given input columns, and rename them if
    # needed
    try:
        sub_df = pd.DataFrame(df[input_column_names])
        if plugin_instance.get_input_column_names():
            sub_df.columns = plugin_instance.get_input_column_names()
    except Exception as exc:
        raise Exception(
            ugettext('Error when creating data frame for plugin: {0}').format(
                str(exc)))

    # Try the execution and catch any exception
    try:
        new_df = plugin_instance.run(sub_df, parameters=plugin_params)
    except Exception as exc:
        raise Exception(
            ugettext('Error while executing plugin: {0}').format(str(exc)))

    # If plugin does not return a data frame, flag as error
    if not isinstance(new_df, pd.DataFrame):
        raise Exception(
            ugettext(
                'Plugin executed but did not return a pandas data frame.'))

    # Execution is DONE.
    # Now we have to perform various additional checks.

    # Result has to have the exact same number of rows
    if new_df.shape[0] != df.shape[0]:
        raise Exception(
            ugettext(
                'Incorrect number of rows ({0}) in result data frame.',
            ).format(new_df.shape[0]))

    # Merge key name cannot be part of the output df
    if merge_key in new_df.columns:
        raise Exception(
            ugettext(
                'Column name {0} cannot be in the result data frame.',
            ).format(merge_key))

    # Result column names are consistent
    if set(new_df.columns) != set(plugin_instance.get_output_column_names()):
        raise Exception(ugettext('Incorrect columns in result data frame.'))

    # Add the merge column to the result df
    new_df[merge_key] = df[merge_key]

    # Proceed with the merge
    try:
        new_frame = pandas.perform_dataframe_upload_merge(
            workflow,
            df,
            new_df,
            {
                'how_merge': 'inner',
                'dst_selected_key': merge_key,
                'src_selected_key': merge_key,
                'initial_column_names': list(new_df.columns),
                'rename_column_names': list(new_df.columns),
                'columns_to_upload': [True] * len(list(new_df.columns)),
            })
    except Exception as exc:
        raise Exception(
            ugettext('Error while merging result: {0}.').format(str(exc)))

    if isinstance(new_frame, str):
        raise Exception(
            ugettext('Error while merging result: {0}.').format(new_frame))

    # Update execution time in the plugin
    plugin_info.executed = datetime.now(pytz.timezone(settings.TIME_ZONE))
    plugin_info.save(update_fields=['executed'])

    return True
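# A minimal plugin sketch satisfying the contract _execute_plugin enforces:
# declared input/output columns, and a run() that returns a DataFrame with
# exactly the declared output columns and the same number of rows. The class
# and column names are invented for illustration only.
import pandas as pd


class AddOnePlugin:
    """Toy plugin: adds one to a single numeric input column."""

    def __init__(self):
        self.input_column_names = ['score']
        self.output_column_names = ['score_plus_one']
        self.output_suffix = ''

    def get_input_column_names(self):
        return self.input_column_names

    def get_output_column_names(self):
        return self.output_column_names

    def run(self, df: pd.DataFrame, parameters=None) -> pd.DataFrame:
        # Same number of rows, only the declared output column.
        return pd.DataFrame({'score_plus_one': df['score'] + 1})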
def execute_operation(
    self,
    user,
    workflow: Optional[models.Workflow] = None,
    action: Optional[models.Action] = None,
    payload: Optional[Dict] = None,
    log_item: Optional[models.Log] = None,
):
    """Perform a SQL upload asynchronously.

    Upload/merge the data in the given workflow.

    :param user: User object
    :param workflow: Optional workflow object
    :param action: Empty
    :param payload: has fields:
      - connection_id: PK of the connection object to use
      - dst_key: Key column in the existing dataframe (if any) for merge
      - src_key: Key column in the external dataframe (for merge)
      - how_merge: Merge method: inner, outer, left, right
      - db_password: Encoded password if not stored in the connection
      - db_table: Table name if not stored in the connection
    :param log_item: Optional logitem object.
    :return: Elements to add to the "extend" field. Empty list in this case.
    """
    del action

    # Get the connection
    conn = models.SQLConnection.objects.filter(
        pk=payload['connection_id']).first()
    if not conn:
        msg = _('Incorrect connection identifier.')
        log_item.payload['error'] = msg
        log_item.save(update_fields=['payload'])
        raise ontask.OnTaskException(msg)

    # Get the dataframe from the connection
    src_df = _load_df_from_sqlconnection(conn, payload)

    # Create session
    session = SESSION_STORE()
    session.create()

    workflow = acquire_workflow_access(user, session, workflow.id)
    if not workflow:
        msg = _('Unable to acquire access to workflow.')
        log_item.payload['error'] = msg
        log_item.save(update_fields=['payload'])
        raise ontask.OnTaskException(msg)

    # IDEA: How to deal with failure to acquire access?
    # Include a setting with the time to wait and number of retries?

    if not workflow.has_data_frame():
        # Simple upload
        pandas.store_dataframe(src_df, workflow)
        return []

    # At this point the operation is a merge
    dst_df = pandas.load_table(workflow.get_data_frame_table_name())

    dst_key = payload.get('dst_key')
    if not dst_key:
        msg = _('Missing key column name of existing table.')
        log_item.payload['error'] = msg
        log_item.save(update_fields=['payload'])
        raise ontask.OnTaskException(msg)

    src_key = payload.get('src_key')
    if not src_key:
        msg = _('Missing key column name of new table.')
        log_item.payload['error'] = msg
        log_item.save(update_fields=['payload'])
        raise ontask.OnTaskException(msg)

    how_merge = payload.get('how_merge')
    if not how_merge:
        msg = _('Missing merge method.')
        log_item.payload['error'] = msg
        log_item.save(update_fields=['payload'])
        raise ontask.OnTaskException(msg)

    # Check additional correctness properties in the parameters
    error = pandas.validate_merge_parameters(
        dst_df, src_df, how_merge, dst_key, src_key)
    if error:
        log_item.payload['error'] = error
        log_item.save(update_fields=['payload'])
        raise ontask.OnTaskException(error)

    merge_info = {
        'how_merge': how_merge,
        'dst_selected_key': dst_key,
        'src_selected_key': src_key,
        'initial_column_names': list(src_df.columns),
        'rename_column_names': list(src_df.columns),
        'columns_to_upload': [True] * len(list(src_df.columns))}

    # Perform the merge
    pandas.perform_dataframe_upload_merge(
        workflow,
        dst_df,
        src_df,
        merge_info)

    log_item.payload = merge_info
    log_item.save(update_fields=['payload'])
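# Example payload for a merge run, limited to the fields documented in the
# docstring above; the connection PK and column names are placeholders.
payload = {
    'connection_id': 7,    # hypothetical SQLConnection primary key
    'dst_key': 'sid',      # key column in the existing dataframe
    'src_key': 'sid',      # key column in the external dataframe
    'how_merge': 'inner',  # one of inner, outer, left, right
    'db_password': '...',  # only if not stored in the connection
    'db_table': '...',     # only if not stored in the connection
}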