Python store_dataframe 예제들, ontask.dataops.pandas.store_dataframe Python 예제들

예제 #1

0

파일 보기

def add_random_column(
    user,
    workflow: models.Workflow,
    column: models.Column,
    data_frame: pd.DataFrame,
):
    """Add a column filled with randomly distributed categories.

    The row positions of the workflow are split with ``_partition`` into as
    many groups as categories the column has, and every row receives the
    category whose index matches the group it belongs to.

    :param user: User making the request
    :param workflow: Workflow to add the column
    :param column: Column being added
    :param data_frame: Data frame of the current workflow (the new column is
        added to it in place)
    :return: Nothing. The column is added to the workflow
    :raises services.OnTaskWorkflowStoreError: If the data frame cannot be
        stored in the DB
    """
    row_count = workflow.nrows
    values = column.get_categories()

    # Split the row positions into one group per category
    groups = _partition(list(range(row_count)), len(values))

    # Every cell starts empty and receives the category of its group
    cells = [None] * row_count
    for group_idx, row_positions in enumerate(groups):
        for row_pos in row_positions:
            cells[row_pos] = values[group_idx]

    # Attach the values to the data frame with the pandas type of the column
    col_name = column.name
    data_frame[col_name] = pd.Series(
        cells,
        dtype=_ontask_type_to_pd_type[column.data_type],
        index=data_frame.index)

    # Datetime columns are localized to the configured time zone
    if column.data_type == 'datetime':
        localized = data_frame[col_name].dt.tz_localize(settings.TIME_ZONE)
        data_frame[col_name] = localized

    # Make room for the new column in the requested position
    workflow.reposition_columns(workflow.ncols + 1, column.position)
    workflow.refresh_from_db()

    # Persist the data frame; a failure is reported with the column marked
    # for deletion
    try:
        pandas.store_dataframe(data_frame, workflow)
    except Exception as exc:
        raise services.OnTaskWorkflowStoreError(
            message=_('Unable to add the column: {0}').format(str(exc)),
            to_delete=[column])

    # Bring the column counter in line with the stored columns
    workflow.ncols = workflow.columns.count()
    workflow.save()

    column.log(user, models.Log.COLUMN_ADD_RANDOM)

예제 #2

0

파일 보기

def add_formula_column(
    user,
    workflow: models.Workflow,
    column: models.Column,
    operation: str,
    selected_columns: List[models.Column],
):
    """Add a column computed from other columns of the workflow.

    The function looked up in ``_op_distrib`` under ``operation`` is applied
    to the sub data frame made of the selected columns and its result becomes
    the content of the new column.

    :param user: User making the request
    :param workflow: Workflow to add the column
    :param column: Column being added
    :param operation: string denoting the operation (key in ``_op_distrib``)
    :param selected_columns: List of columns selected for the operation.
    :return: Nothing. The column is added to the workflow
    :raises services.OnTaskWorkflowAddColumn: If the data frame cannot be
        stored in the DB
    """
    # Attach the column to the workflow and persist it
    column.workflow = workflow
    column.is_key = False
    column.save()

    # Fetch the current content of the workflow table
    data_frame = pandas.load_table(workflow.get_data_frame_table_name())

    # Compute the new values from the operand columns
    operand_names = [operand.name for operand in selected_columns]
    compute = _op_distrib[operation]
    data_frame[column.name] = compute(data_frame[operand_names])

    # The column type is derived from the dtype of the computed values
    column.data_type = pandas.datatype_names.get(
        data_frame[column.name].dtype.name)

    # Make room for the new column in the requested position
    workflow.reposition_columns(workflow.ncols + 1, column.position)
    column.save()
    workflow.refresh_from_db()

    # Persist the data frame; a failure is reported with the column marked
    # for deletion
    try:
        pandas.store_dataframe(data_frame, workflow)
    except Exception as exc:
        raise services.OnTaskWorkflowAddColumn(
            message=_('Unable to add column: {0}').format(str(exc)),
            to_delete=[column])

    # Bring the column counter in line with the stored columns
    workflow.ncols = workflow.columns.count()
    workflow.save()

    column.log(user, models.Log.COLUMN_ADD_FORMULA)

예제 #3

0

파일 보기

파일: api.py 프로젝트: ubc/ontask_b

    def override(
        self,
        request: http.HttpRequest,
        wid: int,
        format=None,
        workflow: Optional[models.Workflow] = None,
    ) -> http.HttpResponse:
        """Replace the data frame of the workflow with the one received.

        :param request: Received request object
        :param wid: Workflow ID (unused here)
        :param format: format for the response (unused here)
        :param workflow: Workflow being manipulated (set by decorator)
        :return: 400 response with the serializer errors or the exception
            text if the data is rejected, 201 response otherwise
        """
        del wid, format

        # Deserialize the payload and reject it if it is not well formed
        payload = self.serializer_class(data=request.data)
        if not payload.is_valid():
            return Response(
                payload.errors,
                status=status.HTTP_400_BAD_REQUEST)

        new_frame = payload.validated_data['data_frame']

        # Verify the frame and store it; a frame without key is a bad request
        try:
            pandas.verify_data_frame(new_frame)
            pandas.store_dataframe(new_frame, workflow)
        except OnTaskDataFrameNoKey as exc:
            return Response(str(exc), status=status.HTTP_400_BAD_REQUEST)

        # The data changed, so refresh the row counters of every action
        for action in workflow.actions.all():
            action.update_n_rows_selected()

        return Response(None, status=status.HTTP_201_CREATED)

예제 #4

0

파일 보기

파일: merge.py 프로젝트: ubc/ontask_b

def perform_dataframe_upload_merge(
    workflow,
    dst_df: pd.DataFrame,
    src_df: pd.DataFrame,
    merge_info: Dict,
):
    """Merge the existing data frame (dst) with a new one (src).

    It combines the two data frames dst_df and src_df and stores its content.

    The combination of dst_df and src_df assumes:

    - dst_df has a set of columns (potentially empty) that do not overlap in
      name with the ones in src_df (dst_df[NO_OVERLAP_DST])

    - dst_df and src_df have a set of columns (potentially empty) that overlap
      in name (dst_df[OVERLAP] and src_df[OVERLAP] respectively)

    - src_df has a set of columns (potentially empty) that do not overlap in
      name with the ones in dst_df (src_df[NO_OVERLAP_SRC])

    The function combines dst_df and src_df following two main steps (in both
    steps, the number of rows processed are derived from the parameter
    merge_info['how_merge']).

    STEP A: A new data frame dst_df_tmp1 is created using the pandas "merge"
    operation between dst_df and src_df[NO_OVERLAP_SRC]. This increases the
    number of columns in dst_df_tmp1 with respect to dst_df by adding the new
    columns from src_df.

    The pseudocode for this step is:

    dst_df_tmp1 = pd.merge(dst_df,
                           src_df[NO_OVERLAP_SRC],
                           how=merge['how_merge'],
                           left_on=merge_info['dst_selected_key'],
                           right_on=merge_info['src_selected_key'])

    STEP B: The data frame dst_df_tmp1 is then updated with the values in
    src_df[OVERLAP].

    :param workflow: Workflow with the data frame
    :param dst_df: Destination dataframe (already stored in DB)
    :param src_df: Source dataframe, stored in temporary table. The object
           received is not modified (the function works on a renamed copy).
    :param merge_info: Dictionary with merge options
           - initial_column_names: List of initial column names in src data
             frame.
           - rename_column_names: Columns that need to be renamed in src data
             frame.
           - columns_to_upload: Columns to be considered for the update
           - src_selected_key: Key in the source data frame
           - dst_selected_key: key in the destination (existing) data frame
           - how_merge: How to merge: inner, outer, left or right
           - keep_key_column: (optional) one boolean per name in
             rename_column_names. If absent, it is added to the dictionary
             with the uniqueness of each uploaded column.
    :return: Nothing
    :raises Exception: If the merge produces a data frame with no rows or
           without any key column (the anomaly is in the message)
    """
    # STEP 1 Rename the column names (rename returns a new data frame).
    src_df = src_df.rename(
        columns=dict(zip(
            merge_info['initial_column_names'],
            merge_info['rename_column_names'])))

    # STEP 2 Drop the columns not selected
    columns_to_upload = merge_info['columns_to_upload']
    src_df = src_df.drop(
        [
            col for idx, col in enumerate(src_df.columns)
            if not columns_to_upload[idx]
        ],
        axis=1)

    # If no keep_key_column value is given, derive it from the uniqueness of
    # each column. Columns dropped in STEP 2 are no longer in src_df, so they
    # are flagged as False instead of being looked up (which would raise a
    # KeyError). The list stays aligned with rename_column_names.
    if 'keep_key_column' not in merge_info:
        merge_info['keep_key_column'] = [
            cname in src_df.columns
            and pandas.is_unique_column(src_df[cname])
            for cname in merge_info['rename_column_names']
        ]

    # Get the keys
    src_key = merge_info['src_selected_key']
    dst_key = merge_info['dst_selected_key']

    # STEP 3 Perform the combination
    # Separate the columns in src that overlap from those that do not
    # overlap, but include the key column in both data frames.
    overlap_names = set(dst_df.columns).intersection(src_df.columns)
    src_no_overlap_names = set(src_df.columns).difference(overlap_names)
    src_df_overlap = src_df[list(overlap_names.union({src_key}))]
    src_df_no_overlap = src_df[list(src_no_overlap_names.union({src_key}))]

    # Step A. Perform the merge of non-overlapping columns
    new_df = _perform_non_overlapping_column_merge(
        dst_df,
        src_df_no_overlap,
        merge_info,
        dst_key,
        src_key)

    # Step B. Perform the update with the overlapping columns
    new_df = _perform_overlap_update(
        new_df,
        src_df_overlap,
        dst_key,
        src_key,
        merge_info['how_merge'])

    # If the merge produced a data frame with no rows, flag it as an error to
    # prevent losing data when there is a mistake in the key column
    if new_df.shape[0] == 0:
        raise Exception(gettext(
            'Merge operation produced a result with no rows'))

    # If the merge produced a data frame with no unique columns, flag it as an
    # error to prevent the data frame from propagating without a key column
    if not pandas.has_unique_column(new_df):
        raise Exception(gettext(
            'Merge operation produced a result without any key columns. '
            + 'Review the key columns in the data to upload.',
        ))

    # Store the result back in the DB
    pandas.store_dataframe(new_df, workflow)

    _update_is_key_field(merge_info, workflow)

    # Recompute all the values of the conditions in each of the actions
    for action in workflow.actions.all():
        action.update_n_rows_selected()

예제 #5

0

파일 보기

파일: column_crud.py 프로젝트: cab938/ontask_b

def add_random_column(
    user,
    workflow: models.Workflow,
    column: models.Column,
    data_frame: pd.DataFrame,
):
    """Add a column filled with randomly distributed categories.

    If the first category of the column can be read as an integer N, the
    categories are replaced by the values 1, 2, ..., N. The row positions are
    then split with ``_partition`` into one group per category and every row
    receives the category of its group.

    :param user: User making the request
    :param workflow: Workflow to add the column
    :param column: Column being added
    :param data_frame: Data frame of the current workflow (the new column is
        added to it in place)
    :return: Nothing. The column is added to the workflow
    :raises services.OnTaskWorkflowIntegerLowerThanOne: If the integer given
        as first category is not larger than 1
    :raises services.OnTaskWorkflowStoreError: If the data frame cannot be
        stored in the DB
    """
    # Attach the column to the workflow
    column.workflow = workflow
    column.is_key = False

    # An integer as first category is shorthand for the range 1..N. When the
    # first category is missing or is not an integer the categories are left
    # untouched.
    # NOTE(review): only the first category is inspected, so a list with
    # several values that starts with an integer is also expanded -- confirm
    # this is intended.
    try:
        upper_bound = int(column.categories[0])
        if upper_bound <= 1:
            raise services.OnTaskWorkflowIntegerLowerThanOne(
                field_name='values',
                message=_('The integer value has to be larger than 1'))
        column.set_categories(list(range(1, upper_bound + 1)))
    except (ValueError, TypeError, IndexError):
        pass

    column.save()

    row_count = workflow.nrows
    # Every cell starts empty
    cells = [None] * row_count
    # Split the row positions into one group per category
    groups = _partition(list(range(row_count)), len(column.categories))

    # Every row receives the category of the group it fell in
    for group_idx, row_positions in enumerate(groups):
        for row_pos in row_positions:
            cells[row_pos] = column.categories[group_idx]

    data_frame[column.name] = cells

    # Make room for the new column in the requested position
    workflow.reposition_columns(workflow.ncols + 1, column.position)
    workflow.refresh_from_db()

    # Persist the data frame; a failure is reported with the column marked
    # for deletion
    try:
        pandas.store_dataframe(data_frame, workflow)
    except Exception as exc:
        raise services.OnTaskWorkflowStoreError(
            message=_('Unable to add the column: {0}').format(str(exc)),
            to_delete=[column])

    # Bring the column counter in line with the stored columns
    workflow.ncols = workflow.columns.count()
    workflow.save()

    column.log(user, models.Log.COLUMN_ADD_RANDOM)

예제 #6

0

파일 보기

def random_column_add(
    request: HttpRequest,
    workflow: Optional[Workflow] = None,
) -> JsonResponse:
    """Create a column with random values (Modal).

    The form field ``column_values`` is either an integer N larger than 1
    (the values are then 1, 2, ..., N) or a comma-separated list of values.
    The rows of the workflow are split with ``_partition`` into one group per
    value and every row receives the value of its group.

    The values are validated before the column is saved, and the column is
    deleted if the data frame cannot be stored, so that a failed request does
    not leave a column behind that would block a retry with the same name.

    :param request: HTTP request (a POST carries the form data)
    :param workflow: Workflow receiving the column
    :return: JSON response with the rendered form (``html_form``) when the
        form has to be shown (again), or a redirection (``html_redirect``)
    """
    def render_form() -> JsonResponse:
        """Return the JSON response carrying the (re)rendered form."""
        return JsonResponse({
            'html_form':
            render_to_string(
                'workflow/includes/partial_random_column_add.html',
                {'form': form},
                request=request),
        })

    # Get the workflow element
    if workflow.nrows == 0:
        messages.error(request,
                       _('Cannot add column to a workflow without data'))
        return JsonResponse({'html_redirect': ''})

    # Form to read/process data
    form = RandomColumnAddForm(data=request.POST or None)

    if request.method != 'POST' or not form.is_valid():
        return render_form()

    # Get the values and interpret its meaning
    column_values = form.cleaned_data['column_values']
    # First, try to see if the field is a valid integer
    try:
        int_value = int(column_values)
    except ValueError:
        int_value = None

    # Compare against None so that the integer 0 is rejected as an integer
    # instead of being taken as the one-element list ['0']
    if int_value is not None:
        if int_value <= 1:
            form.add_error('values',
                           _('The integer value has to be larger than 1'))
            return render_form()

        intvals = list(range(1, int_value + 1))
    else:
        # At this point the field is a string and the values are the comma
        # separated strings. Items with only white space are discarded.
        intvals = [
            valstr.strip() for valstr in column_values.split(',')
            if valstr.strip()
        ]
        if not intvals:
            form.add_error(
                'values',
                _('The value has to be a comma-separated list'),
            )
            return render_form()

    # Save the column object attached to the form and add additional fields
    column = form.save(commit=False)
    column.workflow = workflow
    column.is_key = False

    # Save the instance
    try:
        column = form.save()
        form.save_m2m()
    except IntegrityError:
        form.add_error('name', _('A column with that name already exists'))
        return render_form()

    # Update the data frame
    df = load_table(workflow.get_data_frame_table_name())

    # Empty new column
    new_column = [None] * workflow.nrows
    # Create the random partitions
    partitions = _partition(list(range(workflow.nrows)), len(intvals))

    # Assign values to partitions
    for idx, indexes in enumerate(partitions):
        for col_idx in indexes:
            new_column[col_idx] = intvals[idx]

    # Assign the new column to the data frame
    df[column.name] = new_column

    # Populate the column type
    column.data_type = pandas_datatype_names.get(df[column.name].dtype.name)

    # Update the positions of the appropriate columns
    workflow.reposition_columns(workflow.ncols + 1, column.position)

    column.save()
    workflow.refresh_from_db()

    # Store the df to DB
    try:
        store_dataframe(df, workflow)
    except Exception as exc:
        messages.error(request,
                       _('Unable to add the column: {0}').format(str(exc)))
        # The data was not stored, so the column must not remain in the DB
        column.delete()
        return JsonResponse({'html_redirect': ''})

    # Log the event
    Log.objects.register(
        request.user, Log.COLUMN_ADD_RANDOM, workflow, {
            'id': workflow.id,
            'name': workflow.name,
            'column_name': column.name,
            'column_type': column.data_type,
            'value': column_values
        })

    # The form has been successfully processed
    return JsonResponse({'html_redirect': ''})

예제 #7

0

파일 보기

def formula_column_add(
    request: HttpRequest,
    workflow: Optional[Workflow] = None,
) -> JsonResponse:
    """Add a formula column.

    The new column is the result of applying the operation selected in the
    form (looked up in ``_op_distrib``) to the columns selected in the form.
    The column is deleted again if either the computation or the storage of
    the data frame fails.

    :param request: HTTP request (a POST carries the form data)
    :param workflow: Workflow receiving the column
    :return: JSON response with the rendered form (``html_form``) when the
        form has to be shown (again), or a redirection (``html_redirect``)
    """
    def render_form() -> JsonResponse:
        """Return the JSON response carrying the (re)rendered form."""
        return JsonResponse({
            'html_form':
            render_to_string(
                'workflow/includes/partial_formula_column_add.html',
                {'form': form},
                request=request,
            ),
        })

    # Get the workflow element
    if workflow.nrows == 0:
        messages.error(
            request,
            _('Cannot add column to a workflow without data'),
        )
        return JsonResponse({'html_redirect': ''})

    # Form to read/process data
    form = FormulaColumnAddForm(
        form_data=request.POST or None,
        operands=_formula_column_operands,
        columns=workflow.columns.all(),
    )

    if request.method != 'POST' or not form.is_valid():
        return render_form()

    # Save the column object attached to the form and add additional fields
    column = form.save(commit=False)
    column.workflow = workflow
    column.is_key = False

    # Save the instance
    try:
        column.save()
        form.save_m2m()
    except IntegrityError:
        form.add_error('name', _('A column with that name already exists'))
        return render_form()

    # Update the data frame
    df = load_table(workflow.get_data_frame_table_name())

    try:
        # Add the column with the appropriate computation
        operation = form.cleaned_data['op_type']
        cnames = [col.name for col in form.selected_columns]
        df[column.name] = _op_distrib[operation](df[cnames])
    except Exception as exc:
        # Something went wrong in pandas, we need to remove the column
        column.delete()

        # Notify in the form
        form.add_error(
            None,
            _('Unable to add the column: {0}').format(exc, ),
        )
        return render_form()

    # Populate the column type
    column.data_type = pandas_datatype_names.get(df[column.name].dtype.name)

    # Update the positions of the appropriate columns
    workflow.reposition_columns(workflow.ncols + 1, column.position)

    # Save column and refresh the prefetched related in the workflow
    column.save()
    workflow.refresh_from_db()

    # Store the df to DB
    try:
        store_dataframe(df, workflow)
    except Exception as exc:
        # The message used to say "clone column", which is not what this
        # view does
        messages.error(request,
                       _('Unable to add the column: {0}').format(str(exc)))
        # The data was not stored, so the column must not remain in the DB
        # (same clean up as when the computation fails)
        column.delete()
        return JsonResponse({'html_redirect': ''})

    # Log the event
    Log.objects.register(
        request.user, Log.COLUMN_ADD_FORMULA, workflow, {
            'id': workflow.id,
            'name': workflow.name,
            'column_name': column.name,
            'column_type': column.data_type
        })

    # The form has been successfully processed
    return JsonResponse({'html_redirect': ''})

예제 #8

0

파일 보기

파일: sql_upload.py 프로젝트: ubc/ontask_b

    def execute_operation(
        self,
        user,
        workflow: Optional[models.Workflow] = None,
        action: Optional[models.Action] = None,
        payload: Optional[Dict] = None,
        log_item: Optional[models.Log] = None,
    ):
        """Perform a SQL upload asynchronously.

        The data frame is loaded from the SQL connection. If the workflow has
        no data frame it is simply stored, otherwise it is merged with the
        existing one using the keys and method given in the payload.

        :param user: User object
        :param workflow: Optional workflow object
        :param action: Empty
        :param payload: has fields:
          - connection_id: PK of the connection object to use
          - dst_key: Key column in the existing dataframe (if any) for merge
          - src_key: Key column in the external dataframe (for merge)
          - how_merge: Merge method: inner, outer, left, right
          - db_password: Encoded password if not stored in the connection
          - db_table: Table name if not stored in the connection
        :param log_item: Optional logitem object. When given, it receives the
                 error message if the operation fails, or the merge
                 information if a merge is performed.
        :return: Elements to add to the "extend" field. Empty list in this case
                 Upload/merge the data in the given workflow.
        :raises ontask.OnTaskException: If the connection, the access to the
                 workflow or any of the merge parameters is incorrect
        """
        del action

        def fail(msg):
            """Record the error in the log item (if any) and abort."""
            if log_item is not None:
                log_item.payload['error'] = msg
                log_item.save(update_fields=['payload'])
            raise ontask.OnTaskException(msg)

        # Get the connection
        conn = models.SQLConnection.objects.filter(
            pk=payload['connection_id']).first()
        if not conn:
            fail(_('Incorrect connection identifier.'))

        # Get the dataframe from the connection
        src_df = _load_df_from_sqlconnection(conn, payload)

        # Create session
        session = SESSION_STORE()
        session.create()

        workflow = acquire_workflow_access(user, session, workflow.id)
        if not workflow:
            fail(_('Unable to acquire access to workflow.'))

        # IDEA: How to deal with failure to acquire access?
        #       Include a setting with the time to wait and number of retries?

        if not workflow.has_data_frame():
            # Simple upload
            pandas.store_dataframe(src_df, workflow)
            return []

        # At this point the operation is a merge

        dst_df = pandas.load_table(workflow.get_data_frame_table_name())

        dst_key = payload.get('dst_key')
        if not dst_key:
            fail(_('Missing key column name of existing table.'))

        src_key = payload.get('src_key')
        if not src_key:
            fail(_('Missing key column name of new table.'))

        how_merge = payload.get('how_merge')
        if not how_merge:
            fail(_('Missing merge method.'))

        # Check additional correctness properties in the parameters
        error = pandas.validate_merge_parameters(dst_df, src_df, how_merge,
                                                 dst_key, src_key)
        if error:
            fail(error)

        src_columns = list(src_df.columns)
        merge_info = {
            'how_merge': how_merge,
            'dst_selected_key': dst_key,
            'src_selected_key': src_key,
            'initial_column_names': src_columns,
            # Independent copy, as in the original literal, so that changes
            # to one list do not leak into the other
            'rename_column_names': list(src_columns),
            'columns_to_upload': [True] * len(src_columns)
        }
        # Perform the merge
        pandas.perform_dataframe_upload_merge(workflow, dst_df, src_df,
                                              merge_info)

        if log_item is not None:
            log_item.payload = merge_info
            log_item.save(update_fields=['payload'])

        # Same result as the simple upload: nothing to add to "extend"
        return []

예제 #9

0

파일 보기

파일: column_crud.py 프로젝트: ritotombe/ontask_b

def random_column_add(
    request: HttpRequest,
    workflow: Optional[Workflow] = None,
) -> JsonResponse:
    """Create a column with random values (Modal).

    If the first category of the column can be read as an integer N larger
    than 1, the categories are replaced by the values 1, 2, ..., N. The rows
    of the workflow are split with ``_partition`` into one group per category
    and every row receives the category of its group.

    The column is deleted again if the integer is rejected or if the data
    frame cannot be stored.

    :param request: HTTP request (a POST carries the form data)
    :param workflow: Workflow receiving the column
    :return: JSON response with the rendered form (``html_form``) when the
        form has to be shown (again), or a redirection (``html_redirect``)
    """
    def render_form() -> JsonResponse:
        """Return the JSON response carrying the (re)rendered form."""
        return JsonResponse({
            'html_form':
            render_to_string(
                'workflow/includes/partial_random_column_add.html',
                {'form': form},
                request=request),
        })

    # Get the workflow element
    if workflow.nrows == 0:
        messages.error(request,
                       _('Cannot add column to a workflow without data'))
        return JsonResponse({'html_redirect': ''})

    # Form to read/process data
    form = RandomColumnAddForm(data=request.POST or None,
                               workflow=workflow,
                               allow_interval_as_initial=True)

    if request.method != 'POST' or not form.is_valid():
        return render_form()

    # Save the column object attached to the form and add additional fields
    column: Column = form.save(commit=False)
    column.workflow = workflow
    column.is_key = False

    # Save the instance
    try:
        column = form.save()
        form.save_m2m()
    except IntegrityError:
        form.add_error('name', _('A column with that name already exists'))
        return render_form()

    # Detect the case of a single integer as initial value so that it is
    # expanded
    # NOTE(review): only the first category is inspected, so a list with
    # several values that starts with an integer is also expanded -- confirm
    # this is intended.
    int_value: Optional[int]
    try:
        int_value = int(column.categories[0])
    except (ValueError, TypeError, IndexError):
        int_value = None

    # Compare against None so that the integer 0 is rejected as an integer
    # instead of being kept as the only category
    if int_value is not None:
        if int_value <= 1:
            # The column was already saved; remove it so that it does not
            # stay in the workflow nor block a retry with the same name
            column.delete()
            form.add_error('values',
                           _('The integer value has to be larger than 1'))
            return render_form()

        column.set_categories(list(range(1, int_value + 1)))

    column.save()

    # Empty new column
    new_column = [None] * workflow.nrows
    # Create the random partitions
    partitions = _partition(list(range(workflow.nrows)),
                            len(column.categories))

    # Assign values to partitions
    for idx, indexes in enumerate(partitions):
        for col_idx in indexes:
            new_column[col_idx] = column.categories[idx]

    # Assign the new column to the data frame
    form.data_frame[column.name] = new_column

    # Update the positions of the appropriate columns
    workflow.reposition_columns(workflow.ncols + 1, column.position)
    workflow.refresh_from_db()

    # Store the df to DB
    try:
        store_dataframe(form.data_frame, workflow)
    except Exception as exc:
        messages.error(request,
                       _('Unable to add the column: {0}').format(str(exc)))
        column.delete()
        return JsonResponse({'html_redirect': ''})

    workflow.ncols = workflow.columns.count()
    workflow.save()

    # Log the event
    column.log(request.user, Log.COLUMN_ADD_RANDOM)

    # The form has been successfully processed
    return JsonResponse({'html_redirect': ''})