def add_random_column(
    user,
    workflow: models.Workflow,
    column: models.Column,
    data_frame: pd.DataFrame,
):
    """Add a column whose values are categories spread over the rows.

    The row indexes of the workflow are split by ``_partition`` into as many
    groups as the column has categories; every row in group ``i`` receives
    category ``i``. The resulting column is written into the data frame,
    which is then stored for the workflow.

    :param user: User making the request
    :param workflow: Workflow to add the column
    :param column: Column being added (provides name, data type, position
        and categories)
    :param data_frame: Data frame of the current workflow (modified in place)
    :return: Nothing. The column is added to the workflow.
    :raises services.OnTaskWorkflowStoreError: if the data frame cannot be
        stored; the column is handed over in ``to_delete``.
    """
    # One slot per row, filled in below
    values = [None] * workflow.nrows
    categories = column.get_categories()

    # Split the row indexes in one group per category
    groups = _partition(list(range(workflow.nrows)), len(categories))

    # Every row in the i-th group gets the i-th category
    for position, row_indexes in enumerate(groups):
        for row_idx in row_indexes:
            values[row_idx] = categories[position]

    # Build the series with the pandas type matching the column type
    new_series = pd.Series(
        values,
        dtype=_ontask_type_to_pd_type[column.data_type],
        index=data_frame.index)
    data_frame[column.name] = new_series

    if column.data_type == 'datetime':
        # Datetime columns are localized to the configured time zone
        localized = data_frame[column.name].dt.tz_localize(
            settings.TIME_ZONE)
        data_frame[column.name] = localized

    # Make room for the new column at its position
    workflow.reposition_columns(workflow.ncols + 1, column.position)
    workflow.refresh_from_db()

    # Persist the data frame
    try:
        pandas.store_dataframe(data_frame, workflow)
    except Exception as exc:
        error_message = _('Unable to add the column: {0}').format(str(exc))
        raise services.OnTaskWorkflowStoreError(
            message=error_message,
            to_delete=[column])

    workflow.ncols = workflow.columns.count()
    workflow.save()

    # Record the event
    column.log(user, models.Log.COLUMN_ADD_RANDOM)
def add_formula_column(
    user,
    workflow: models.Workflow,
    column: models.Column,
    operation: str,
    selected_columns: List[models.Column],
):
    """Add a column computed from other columns of the workflow.

    The new column is obtained by applying the function registered in
    ``_op_distrib`` under ``operation`` to the sub data frame made of the
    selected columns.

    :param user: User making the request
    :param workflow: Workflow to add the column
    :param column: Column being added
    :param operation: string denoting the operation (key in ``_op_distrib``)
    :param selected_columns: List of columns selected for the operation.
    :return: Nothing. The column is added to the workflow.
    :raises services.OnTaskWorkflowAddColumn: if the data frame cannot be
        stored; the column is handed over in ``to_delete``.
    """
    # Attach the column to the workflow and persist it
    column.workflow = workflow
    column.is_key = False
    column.save()

    # Load the current data
    table_name = workflow.get_data_frame_table_name()
    df = pandas.load_table(table_name)

    # Compute the new column from the selected operands
    operand_names = [selected.name for selected in selected_columns]
    df[column.name] = _op_distrib[operation](df[operand_names])

    # The column type is derived from the dtype of the computed values
    dtype_name = df[column.name].dtype.name
    column.data_type = pandas.datatype_names.get(dtype_name)

    # Make room for the new column at its position
    workflow.reposition_columns(workflow.ncols + 1, column.position)
    column.save()
    workflow.refresh_from_db()

    # Persist the data frame
    try:
        pandas.store_dataframe(df, workflow)
    except Exception as exc:
        error_message = _('Unable to add column: {0}').format(str(exc))
        raise services.OnTaskWorkflowAddColumn(
            message=error_message,
            to_delete=[column])

    workflow.ncols = workflow.columns.count()
    workflow.save()

    column.log(user, models.Log.COLUMN_ADD_FORMULA)
def override(
    self,
    request: http.HttpRequest,
    wid: int,
    format=None,
    workflow: Optional[models.Workflow] = None,
) -> http.HttpResponse:
    """Replace the data frame of the workflow with the one in the request.

    :param request: Received request object; ``request.data`` is fed to the
        serializer
    :param wid: Workflow ID (unused here)
    :param format: format for the response (unused here)
    :param workflow: Workflow being manipulated (set by decorator)
    :return: 400 response with the serializer errors or with the message of
        the missing-key exception; 201 response with no content on success
    """
    # Neither of these two parameters is needed in the body
    del wid
    del format

    serializer = self.serializer_class(data=request.data)
    if not serializer.is_valid():
        # Flag the error
        return Response(
            serializer.errors,
            status=status.HTTP_400_BAD_REQUEST)

    # The serializer produced a data frame
    new_data_frame = serializer.validated_data['data_frame']
    try:
        # Verify the data frame and store it for the workflow
        pandas.verify_data_frame(new_data_frame)
        pandas.store_dataframe(new_data_frame, workflow)
    except OnTaskDataFrameNoKey as exc:
        return Response(str(exc), status=status.HTTP_400_BAD_REQUEST)
    else:
        # The data changed: refresh the row counters of every action
        for workflow_action in workflow.actions.all():
            workflow_action.update_n_rows_selected()

        return Response(None, status=status.HTTP_201_CREATED)
def perform_dataframe_upload_merge(
    workflow,
    dst_df: pd.DataFrame,
    src_df: pd.DataFrame,
    merge_info: Dict,
):
    """Merge the existing data frame (dst) with a new one (src).

    It combines the two data frames dst_df and src_df and stores its content.

    The combination of dst_df and src_df assumes:

    - dst_df has a set of columns (potentially empty) that do not overlap in
      name with the ones in src_df (dst_df[NO_OVERLAP_DST])

    - dst_df and src_df have a set of columns (potentially empty) that
      overlap in name (dst_df[OVERLAP] and src_df[OVERLAP] respectively)

    - src_df has a set of columns (potentially empty) that do not overlap in
      name with the ones in dst_df (src_df[NO_OVERLAP_SRC])

    The function combines dst_df and src_df following two main steps (in both
    steps, the number of rows processed are derived from the parameter
    merge_info['how_merge']).

    STEP A: A new data frame dst_df_tmp1 is created using the pandas "merge"
    operation between dst_df and src_df[NO_OVERLAP_SRC]. This increases the
    number of columns in dst_df_tmp1 with respect to dst_df by adding the new
    columns from src_df.

    The pseudocode for this step is:

    dst_df_tmp1 = pd.merge(dst_df,
                           src_df[NO_OVERLAP_SRC],
                           how=merge['how_merge'],
                           left_on=merge_info['dst_selected_key'],
                           right_on=merge_info['src_selected_key'])

    STEP B: The data frame dst_df_tmp1 is then updated with the values in
    src_df[OVERLAP].

    :param workflow: Workflow with the data frame
    :param dst_df: Destination dataframe (already stored in DB)
    :param src_df: Source dataframe, stored in temporary table
    :param merge_info: Dictionary with merge options
        - initial_column_names: List of initial column names in src data
          frame.
        - rename_column_names: Columns that need to be renamed in src data
          frame.
        - columns_to_upload: Columns to be considered for the update
        - src_selected_key: Key in the source data frame
        - dst_selected_key: key in the destination (existing) data frame
        - how_merge: How to merge: inner, outer, left or right
        - keep_key_column: (optional) one boolean per renamed column. When
          absent it is computed here and ADDED to the dictionary received.
    :return: None
    :raises Exception: when the merge produces no rows or no key column
    """
    # STEP 1 Rename the column names.
    src_df = src_df.rename(
        columns=dict(zip(
            merge_info['initial_column_names'],
            merge_info['rename_column_names'])))

    # STEP 2 Drop the columns not selected. The renamed frame is a new
    # object, so the frame given by the caller is not modified.
    columns_to_upload = merge_info['columns_to_upload']
    src_df = src_df.drop(
        columns=[
            cname for idx, cname in enumerate(src_df.columns)
            if not columns_to_upload[idx]
        ])

    # If no keep_key_column value is given, initialize each entry to whether
    # the column has unique values. Columns dropped in STEP 2 are no longer
    # in src_df (indexing them would raise KeyError), so they are flagged
    # False. NOTE(review): this assumes the consumer of keep_key_column
    # ignores columns that were not uploaded -- confirm against
    # _update_is_key_field.
    if 'keep_key_column' not in merge_info:
        merge_info['keep_key_column'] = [
            cname in src_df.columns
            and pandas.is_unique_column(src_df[cname])
            for cname in merge_info['rename_column_names']
        ]

    # Get the keys
    src_key = merge_info['src_selected_key']
    dst_key = merge_info['dst_selected_key']

    # STEP 3 Perform the combination
    # Separate the columns in src that overlap from those that do not
    # overlap, but include the key column in both data frames.
    overlap_names = set(dst_df.columns).intersection(src_df.columns)
    src_no_overlap_names = set(src_df.columns).difference(overlap_names)
    src_df_overlap = src_df[list(overlap_names.union({src_key}))]
    src_df_no_overlap = src_df[list(src_no_overlap_names.union({src_key}))]

    # Step A. Perform the merge of non-overlapping columns
    new_df = _perform_non_overlapping_column_merge(
        dst_df,
        src_df_no_overlap,
        merge_info,
        dst_key,
        src_key)

    # Step B. Perform the update with the overlapping columns
    new_df = _perform_overlap_update(
        new_df,
        src_df_overlap,
        dst_key,
        src_key,
        merge_info['how_merge'])

    # If the merge produced a data frame with no rows, flag it as an error to
    # prevent losing data when there is a mistake in the key column
    if new_df.shape[0] == 0:
        raise Exception(gettext(
            'Merge operation produced a result with no rows'))

    # If the merge produced a data frame with no unique columns, flag it as an
    # error to prevent the data frame from propagating without a key column
    if not pandas.has_unique_column(new_df):
        raise Exception(gettext(
            'Merge operation produced a result without any key columns. '
            'Review the key columns in the data to upload.',
        ))

    # Store the result back in the DB
    pandas.store_dataframe(new_df, workflow)

    _update_is_key_field(merge_info, workflow)

    # Recompute all the values of the conditions in each of the actions
    for action in workflow.actions.all():
        action.update_n_rows_selected()
def add_random_column(
    user,
    workflow: models.Workflow,
    column: models.Column,
    data_frame: pd.DataFrame,
):
    """Add the random column to the workflow.

    The row indexes of the workflow are split by ``_partition`` into as many
    groups as the column has categories; every row in group ``i`` receives
    category ``i``. When the column has exactly one category and it is an
    integer N, the categories are first expanded to 1, 2, ..., N.

    :param user: User making the request
    :param workflow: Workflow to add the column
    :param column: Column being added
    :param data_frame: Data frame of the current workflow (modified in place)
    :return: Nothing. The column is added to the workflow.
    :raises services.OnTaskWorkflowIntegerLowerThanOne: if the single integer
        given as category is not larger than 1. Raised before the column is
        saved.
    :raises services.OnTaskWorkflowStoreError: if the data frame cannot be
        stored; the column is handed over in ``to_delete``.
    """
    # Save the column object attached to the form and add additional fields
    column.workflow = workflow
    column.is_key = False

    # Detect the case of a single integer as initial value so that it is
    # expanded. Only the conversion is guarded: a list with several values
    # (even if numeric) is a regular list of categories and is left alone,
    # and errors raised while validating or expanding are not swallowed.
    int_value = None
    try:
        if len(column.categories) == 1:
            int_value = int(column.categories[0])
    except (ValueError, TypeError):
        int_value = None

    if int_value is not None:
        if int_value <= 1:
            raise services.OnTaskWorkflowIntegerLowerThanOne(
                field_name='values',
                message=_('The integer value has to be larger than 1'))
        column.set_categories(list(range(1, int_value + 1)))

    column.save()

    # Empty new column
    new_column = [None] * workflow.nrows

    # Create the random partitions
    partitions = _partition(
        list(range(workflow.nrows)),
        len(column.categories))

    # Assign values to partitions
    for idx, indexes in enumerate(partitions):
        for col_idx in indexes:
            new_column[col_idx] = column.categories[idx]

    # Assign the new column to the data frame
    data_frame[column.name] = new_column

    # Update the positions of the appropriate columns
    workflow.reposition_columns(workflow.ncols + 1, column.position)
    workflow.refresh_from_db()

    # Store the df to DB
    try:
        pandas.store_dataframe(data_frame, workflow)
    except Exception as exc:
        # Chain the original error so the root cause stays in the traceback
        raise services.OnTaskWorkflowStoreError(
            message=_('Unable to add the column: {0}').format(str(exc)),
            to_delete=[column]) from exc

    workflow.ncols = workflow.columns.count()
    workflow.save()

    # Log the event
    column.log(user, models.Log.COLUMN_ADD_RANDOM)
def random_column_add(
    request: HttpRequest,
    workflow: Optional[Workflow] = None,
) -> JsonResponse:
    """Create a column with random values (Modal).

    The field ``column_values`` of the form is either a single integer N
    larger than 1 (the values are then 1, 2, ..., N) or a comma-separated
    list of values. The rows of the workflow are split by ``_partition`` in
    as many groups as values, and each group receives one value.

    :param request: Request object; the form is built from ``request.POST``
    :param workflow: Workflow receiving the column
    :return: JSON response with either the rendered form (``html_form``) or
        an empty redirect (``html_redirect``)
    """
    # Get the workflow element
    if workflow.nrows == 0:
        messages.error(
            request,
            _('Cannot add column to a workflow without data'))
        return JsonResponse({'html_redirect': ''})

    # Form to read/process data
    form = RandomColumnAddForm(data=request.POST or None)

    def _render_form() -> JsonResponse:
        """Return the response carrying the form in its current state."""
        return JsonResponse({
            'html_form': render_to_string(
                'workflow/includes/partial_random_column_add.html',
                {'form': form},
                request=request),
        })

    if request.method != 'POST' or not form.is_valid():
        return _render_form()

    # Get the values and interpret its meaning. This is done BEFORE saving
    # the column so that an invalid value does not leave a column object
    # behind in the workflow.
    column_values = form.cleaned_data['column_values']

    # First, try to see if the field is a valid integer
    try:
        int_value = int(column_values)
    except ValueError:
        int_value = None

    # Compare with None: a plain truth test would treat the integer 0 as
    # "not an integer" and accept it as a single text value.
    if int_value is not None:
        # At this point the field is an integer
        if int_value <= 1:
            form.add_error(
                'values',
                _('The integer value has to be larger than 1'))
            return _render_form()

        intvals = list(range(1, int_value + 1))
    else:
        # At this point the field is a string and the values are the comma
        # separated strings. Items that are empty after removing the
        # surrounding white space are discarded.
        intvals = [
            stripped
            for stripped in (
                valstr.strip() for valstr in column_values.split(','))
            if stripped
        ]
        if not intvals:
            form.add_error(
                'values',
                _('The value has to be a comma-separated list'),
            )
            return _render_form()

    # Save the column object attached to the form and add additional fields
    column = form.save(commit=False)
    column.workflow = workflow
    column.is_key = False

    # Save the instance
    try:
        column = form.save()
        form.save_m2m()
    except IntegrityError:
        form.add_error('name', _('A column with that name already exists'))
        return _render_form()

    # Update the data frame
    df = load_table(workflow.get_data_frame_table_name())

    # Empty new column
    new_column = [None] * workflow.nrows

    # Create the random partitions
    partitions = _partition(list(range(workflow.nrows)), len(intvals))

    # Assign values to partitions
    for idx, indexes in enumerate(partitions):
        for col_idx in indexes:
            new_column[col_idx] = intvals[idx]

    # Assign the new column to the data frame
    df[column.name] = new_column

    # Populate the column type
    column.data_type = pandas_datatype_names.get(
        df[column.name].dtype.name,
    )

    # Update the positions of the appropriate columns
    workflow.reposition_columns(workflow.ncols + 1, column.position)

    column.save()
    workflow.refresh_from_db()

    # Store the df to DB
    try:
        store_dataframe(df, workflow)
    except Exception as exc:
        messages.error(
            request,
            _('Unable to add the column: {0}').format(str(exc)))
        # The data was not stored: do not leave the column object behind
        column.delete()
        return JsonResponse({'html_redirect': ''})

    # Log the event
    Log.objects.register(
        request.user,
        Log.COLUMN_ADD_RANDOM,
        workflow,
        {
            'id': workflow.id,
            'name': workflow.name,
            'column_name': column.name,
            'column_type': column.data_type,
            'value': column_values,
        })

    # The form has been successfully processed
    return JsonResponse({'html_redirect': ''})
def formula_column_add(
    request: HttpRequest,
    workflow: Optional[Workflow] = None,
) -> JsonResponse:
    """Add a formula column.

    The new column is obtained by applying the function registered in
    ``_op_distrib`` under the operation chosen in the form to the sub data
    frame made of the columns selected in the form.

    :param request: Request object; the form is built from ``request.POST``
    :param workflow: Workflow receiving the column
    :return: JSON response with either the rendered form (``html_form``) or
        an empty redirect (``html_redirect``)
    """
    # Get the workflow element
    if workflow.nrows == 0:
        messages.error(
            request,
            _('Cannot add column to a workflow without data'),
        )
        return JsonResponse({'html_redirect': ''})

    # Form to read/process data
    form = FormulaColumnAddForm(
        form_data=request.POST or None,
        operands=_formula_column_operands,
        columns=workflow.columns.all(),
    )

    def _render_form() -> JsonResponse:
        """Return the response carrying the form in its current state."""
        return JsonResponse({
            'html_form': render_to_string(
                'workflow/includes/partial_formula_column_add.html',
                {'form': form},
                request=request,
            ),
        })

    if request.method != 'POST' or not form.is_valid():
        return _render_form()

    # Save the column object attached to the form and add additional fields
    column = form.save(commit=False)
    column.workflow = workflow
    column.is_key = False

    # Save the instance
    try:
        column.save()
        form.save_m2m()
    except IntegrityError:
        form.add_error('name', _('A column with that name already exists'))
        return _render_form()

    # Update the data frame
    df = load_table(workflow.get_data_frame_table_name())

    try:
        # Add the column with the appropriate computation
        operation = form.cleaned_data['op_type']
        cnames = [col.name for col in form.selected_columns]
        df[column.name] = _op_distrib[operation](df[cnames])
    except Exception as exc:
        # Something went wrong in pandas, we need to remove the column
        column.delete()

        # Notify in the form
        form.add_error(
            None,
            _('Unable to add the column: {0}').format(exc),
        )
        return _render_form()

    # Populate the column type
    column.data_type = pandas_datatype_names.get(
        df[column.name].dtype.name)

    # Update the positions of the appropriate columns
    workflow.reposition_columns(workflow.ncols + 1, column.position)

    # Save column and refresh the prefetched related in the workflow
    column.save()
    workflow.refresh_from_db()

    # Store the df to DB
    try:
        store_dataframe(df, workflow)
    except Exception as exc:
        # The operation is an addition (not a clone), and the column must
        # not survive if its data could not be stored.
        messages.error(
            request,
            _('Unable to add the column: {0}').format(str(exc)))
        column.delete()
        return JsonResponse({'html_redirect': ''})

    # Log the event
    Log.objects.register(
        request.user,
        Log.COLUMN_ADD_FORMULA,
        workflow,
        {
            'id': workflow.id,
            'name': workflow.name,
            'column_name': column.name,
            'column_type': column.data_type,
        })

    # The form has been successfully processed
    return JsonResponse({'html_redirect': ''})
def execute_operation(
    self,
    user,
    workflow: Optional[models.Workflow] = None,
    action: Optional[models.Action] = None,
    payload: Optional[Dict] = None,
    log_item: Optional[models.Log] = None,
):
    """Perform a SQL upload asynchronously.

    Upload/merge the data in the given workflow. If the workflow has no data
    frame the data obtained from the connection is stored directly;
    otherwise it is merged with the existing data frame.

    :param user: User object
    :param workflow: Optional workflow object
    :param action: Empty
    :param payload: has fields:
      - connection_id: PK of the connection object to use
      - dst_key: Key column in the existing dataframe (if any) for merge
      - src_key: Key column in the external dataframe (for merge)
      - how_merge: Merge method: inner, outer, left, right
      - db_password: Encoded password if not stored in the connection
      - db_table: Table name if not stored in the connection
    :param log_item: Optional logitem object. When given, errors are
        recorded in its payload and, after a merge, its payload is replaced
        by the merge information.
    :return: Elements to add to the "extend" field. Empty list in this case
    :raises ontask.OnTaskException: if the connection is not found, access
        to the workflow cannot be acquired or a merge parameter is missing
        or invalid.
    """
    del action

    def _abort(message):
        """Record the error in the log item (if any) and raise it."""
        if log_item is not None:
            log_item.payload['error'] = message
            log_item.save(update_fields=['payload'])
        raise ontask.OnTaskException(message)

    # Get the connection
    conn = models.SQLConnection.objects.filter(
        pk=payload['connection_id']).first()
    if not conn:
        _abort(_('Incorrect connection identifier.'))

    # Get the dataframe from the connection
    src_df = _load_df_from_sqlconnection(conn, payload)

    # Create session
    session = SESSION_STORE()
    session.create()

    workflow = acquire_workflow_access(user, session, workflow.id)
    if not workflow:
        _abort(_('Unable to acquire access to workflow.'))

    # IDEA: How to deal with failure to acquire access?
    #       Include a setting with the time to wait and number of retries?

    if not workflow.has_data_frame():
        # Simple upload
        pandas.store_dataframe(src_df, workflow)
        return []

    # At this point the operation is a merge
    dst_df = pandas.load_table(workflow.get_data_frame_table_name())

    dst_key = payload.get('dst_key')
    if not dst_key:
        _abort(_('Missing key column name of existing table.'))

    src_key = payload.get('src_key')
    if not src_key:
        _abort(_('Missing key column name of new table.'))

    how_merge = payload.get('how_merge')
    if not how_merge:
        _abort(_('Missing merge method.'))

    # Check additional correctness properties in the parameters
    error = pandas.validate_merge_parameters(
        dst_df,
        src_df,
        how_merge,
        dst_key,
        src_key)
    if error:
        _abort(error)

    # Every column of the source is uploaded under its own name
    src_column_names = list(src_df.columns)
    merge_info = {
        'how_merge': how_merge,
        'dst_selected_key': dst_key,
        'src_selected_key': src_key,
        'initial_column_names': list(src_column_names),
        'rename_column_names': list(src_column_names),
        'columns_to_upload': [True] * len(src_column_names),
    }

    # Perform the merge
    pandas.perform_dataframe_upload_merge(
        workflow,
        dst_df,
        src_df,
        merge_info)

    if log_item is not None:
        log_item.payload = merge_info
        log_item.save(update_fields=['payload'])

    # Same result as the simple upload: nothing to add to "extend"
    return []
def random_column_add(
    request: HttpRequest,
    workflow: Optional[Workflow] = None,
) -> JsonResponse:
    """Create a column with random values (Modal).

    The rows of the workflow are split by ``_partition`` in as many groups
    as categories the column has, and each group receives one category. When
    the column has exactly one category and it is an integer N larger than
    1, the categories are first expanded to 1, 2, ..., N.

    :param request: Request object; the form is built from ``request.POST``
    :param workflow: Workflow receiving the column
    :return: JSON response with either the rendered form (``html_form``) or
        an empty redirect (``html_redirect``)
    """
    # Get the workflow element
    if workflow.nrows == 0:
        messages.error(
            request,
            _('Cannot add column to a workflow without data'))
        return JsonResponse({'html_redirect': ''})

    # Form to read/process data
    form = RandomColumnAddForm(
        data=request.POST or None,
        workflow=workflow,
        allow_interval_as_initial=True)

    def _render_form() -> JsonResponse:
        """Return the response carrying the form in its current state."""
        return JsonResponse({
            'html_form': render_to_string(
                'workflow/includes/partial_random_column_add.html',
                {'form': form},
                request=request),
        })

    if request.method != 'POST' or not form.is_valid():
        return _render_form()

    # Save the column object attached to the form and add additional fields
    column: Column = form.save(commit=False)
    column.workflow = workflow
    column.is_key = False

    # Save the instance
    try:
        column = form.save()
        form.save_m2m()
    except IntegrityError:
        form.add_error('name', _('A column with that name already exists'))
        return _render_form()

    # Detect the case of a single integer as initial value so that it is
    # expanded. A list with several values (even if numeric) is a regular
    # list of categories and is left alone.
    int_value: Optional[int] = None
    try:
        if len(column.categories) == 1:
            int_value = int(column.categories[0])
    except (ValueError, TypeError):
        int_value = None

    # Compare with None: a plain truth test would let the integer 0 through
    # as if it were a regular category.
    if int_value is not None:
        # At this point the field is an integer
        if int_value <= 1:
            # The column was already saved: remove it so that the form can
            # be submitted again with the same column name.
            column.delete()
            form.add_error(
                'values',
                _('The integer value has to be larger than 1'))
            return _render_form()

        column.set_categories(list(range(1, int_value + 1)))
        column.save()

    # Empty new column
    new_column = [None] * workflow.nrows

    # Create the random partitions
    partitions = _partition(
        list(range(workflow.nrows)),
        len(column.categories))

    # Assign values to partitions
    for idx, indexes in enumerate(partitions):
        for col_idx in indexes:
            new_column[col_idx] = column.categories[idx]

    # Assign the new column to the data frame
    form.data_frame[column.name] = new_column

    # Update the positions of the appropriate columns
    workflow.reposition_columns(workflow.ncols + 1, column.position)
    workflow.refresh_from_db()

    # Store the df to DB
    try:
        store_dataframe(form.data_frame, workflow)
    except Exception as exc:
        messages.error(
            request,
            _('Unable to add the column: {0}').format(str(exc)))
        column.delete()
        return JsonResponse({'html_redirect': ''})

    workflow.ncols = workflow.columns.count()
    workflow.save()

    # Log the event
    column.log(request.user, Log.COLUMN_ADD_RANDOM)

    # The form has been successfully processed
    return JsonResponse({'html_redirect': ''})