Example #1
def store_table_in_db(data_frame, pk, table_name, temporary=False):
    """
    Update or create a table in the DB with the data in the data frame. It
    also updates the corresponding column information.

    :param data_frame: Data frame to dump to DB
    :param pk: Corresponding primary key of the workflow
    :param table_name: Table to use in the DB
    :param temporary: Boolean stating whether the table is temporary or
           belongs to an existing workflow.
    :return: If temporary = True, then return a list with three lists:
             - column names
             - column types
             - column is unique
             If temporary = False, return None. All this info is stored in
             the workflow
    """

    if settings.DEBUG:
        print('Storing table ', table_name)

    # get column names
    df_column_names = list(data_frame.columns)

    # if the data frame is temporary, the procedure is much simpler
    if temporary:
        # Check whether each column has unique values per row
        column_unique = are_unique_columns(data_frame)

        # Store the table in the DB
        store_table(data_frame, table_name)

        # Get the column types
        df_column_types = df_column_types_rename(table_name)

        # Return a list with three lists with information about the
        # data frame that will be needed in the next steps
        return [df_column_names, df_column_types, column_unique]

    # We are modifying an existing DF

    # Get the workflow and its columns
    workflow = Workflow.objects.get(id=pk)
    wf_col_names = Column.objects.filter(workflow__id=pk).values_list(
        "name", flat=True)

    # Loop over the columns in the data frame and reconcile the column info
    # with the column objects attached to the WF
    for cname in df_column_names:
        # See if this is a new column
        if cname in wf_col_names:
            # The column already exists in the workflow; nothing to do
            continue

        # Create the new column
        column = Column(
            name=cname,
            workflow=workflow,
            data_type=pandas_datatype_names[data_frame[cname].dtype.name],
            is_key=is_unique_column(data_frame[cname]),
            position=Column.objects.filter(workflow=workflow).count() + 1,
        )
        column.save()

    # Now get the new set of workflow columns
    wf_columns = Column.objects.filter(workflow__id=pk)

    # Reorder the columns in the data frame
    data_frame = data_frame[[x.name for x in wf_columns]]

    # Store the table in the DB
    store_table(data_frame, table_name)

    # Review the column types because some "objects" are stored as booleans
    column_types = df_column_types_rename(table_name)
    for ctype, col in zip(column_types, wf_columns):
        if col.data_type != ctype:
            # If the column type in the DB is different from the one in the
            # object, update
            col.data_type = ctype
            col.save()

    # Update workflow fields and save
    workflow.nrows = data_frame.shape[0]
    workflow.ncols = data_frame.shape[1]
    workflow.set_query_builder_ops()
    workflow.data_frame_table_name = table_name
    workflow.save()

    return None
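
A minimal usage sketch (the data frame, workflow id, and table names below are
made up for illustration; they are not taken from the source):

import pandas as pd

df = pd.DataFrame({'sid': [1, 2, 3], 'grade': [0.8, 0.9, 0.7]})

# Temporary table: no workflow is touched and the column metadata is returned
names, types, unique = store_table_in_db(df, None, '__upload_table',
                                         temporary=True)

# Existing workflow: its Column objects and row/column counters are updated
# in place and the function returns None
store_table_in_db(df, 42, 'workflow_table_42')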
Example #2
def run(request, pk):
    """
    View provided as the first step to execute a plugin.
    :param request: HTTP request received
    :param pk: primary key of the plugin
    :return: Page offering to select the columns with which to invoke the
             plugin
    """

    # Get the workflow and the plugin information
    workflow = get_workflow(request)
    if not workflow:
        return redirect('workflow:index')
    try:
        plugin_info = PluginRegistry.objects.get(pk=pk)
    except PluginRegistry.DoesNotExist:
        return redirect('workflow:index')

    plugin_instance, msgs = load_plugin(plugin_info.filename)
    if plugin_instance is None:
        messages.error(
            request,
            'Unable to instantiate plugin "{0}"'.format(plugin_info.name))
        return redirect('dataops:transform')

    if len(plugin_instance.input_column_names) > 0:
        # The plugin works with a fixed set of columns
        cnames = workflow.columns.all().values_list('name', flat=True)
        if not set(plugin_instance.input_column_names) <= set(cnames):
            # The input columns are not all present in the workflow
            messages.error(
                request,
                'Workflow does not have the correct columns to run this plugin'
            )
            return redirect('dataops:transform')

    # Create the form to select the columns and the corresponding dictionary
    form = SelectColumnForm(request.POST or None,
                            workflow=workflow,
                            plugin_instance=plugin_instance)

    # Set the basic elements in the context
    context = {
        'form': form,
        'output_column_fields': [
            x for x in list(form)
            if x.name.startswith(field_prefix + 'output')
        ],
        'parameters': [
            x for x in list(form)
            if x.name.startswith(field_prefix + 'parameter')
        ],
        'pinstance': plugin_instance,
        'id': workflow.id
    }

    # If it is a GET request or the form is not valid, render the form.
    if request.method == 'GET' or not form.is_valid():
        return render(request, 'dataops/plugin_info_for_run.html', context)

    # The POST is correct; proceed with the execution

    # Get the data frame and select the appropriate columns
    try:
        dst_df = pandas_db.load_from_db(workflow.id)
    except Exception:
        messages.error(request, 'Exception while retrieving the data frame')
        return render(request, 'error.html', {})

    # Take the list of inputs from the form if an empty list was given.
    if not plugin_instance.input_column_names:
        plugin_instance.input_column_names = \
            [c.name for c in form.cleaned_data['columns']]

    # Get the proper subset of the data frame
    sub_df = dst_df[[form.cleaned_data['merge_key']] +
                    plugin_instance.input_column_names]

    # Process the output columns
    for idx, output_cname in enumerate(plugin_instance.output_column_names):
        new_cname = form.cleaned_data[field_prefix + 'output_%s' % idx]
        if form.cleaned_data['out_column_suffix']:
            new_cname += form.cleaned_data['out_column_suffix']
        plugin_instance.output_column_names[idx] = new_cname

    # Pack the parameters
    params = dict()
    for idx, tpl in enumerate(plugin_instance.parameters):
        params[tpl[0]] = form.cleaned_data[field_prefix + 'parameter_%s' % idx]

    # Execute the plugin
    result_df, status = run_plugin(plugin_instance, sub_df,
                                   form.cleaned_data['merge_key'], params)

    if status is not None:
        context['exec_status'] = status

        # Log the event
        logs.ops.put(request.user, 'plugin_execute', workflow, {
            'id': plugin_info.id,
            'name': plugin_info.name,
            'status': status
        })

        return render(request, 'dataops/plugin_execution_report.html', context)

    # Additional checks
    # Result has the same number of rows
    if result_df.shape[0] != dst_df.shape[0]:
        status = 'Incorrect number of rows in result data frame.'
        context['exec_status'] = status

        # Log the event
        logs.ops.put(request.user, 'plugin_execute', workflow, {
            'id': plugin_info.id,
            'name': plugin_info.name,
            'status': status
        })

        return render(request, 'dataops/plugin_execution_report.html', context)

    # Result column names are consistent
    if set(result_df.columns) != \
            set([form.cleaned_data['merge_key']] +
                plugin_instance.output_column_names):
        status = 'Incorrect columns in result data frame.'
        context['exec_status'] = status

        # Log the event
        logs.ops.put(request.user, 'plugin_execute', workflow, {
            'id': plugin_info.id,
            'name': plugin_info.name,
            'status': status
        })

        return render(request, 'dataops/plugin_execution_report.html', context)

    # Proceed with the merge
    try:
        result = ops.perform_dataframe_upload_merge(
            workflow.id, dst_df, result_df, {
                'how_merge': 'left',
                'dst_selected_key': form.cleaned_data['merge_key'],
                'src_selected_key': form.cleaned_data['merge_key'],
                'initial_column_names': list(result_df.columns),
                'rename_column_names': list(result_df.columns),
                'columns_to_upload': [True] * len(list(result_df.columns))
            })
    except Exception as e:
        # Exception.message does not exist in Python 3; use str(e) instead
        context['exec_status'] = str(e)
        return render(request, 'dataops/plugin_execution_report.html', context)

    if isinstance(result, str):
        # Something went wrong
        context['exec_status'] = result
        return render(request, 'dataops/plugin_execution_report.html', context)

    # Get the resulting dataframe
    final_df = pandas_db.load_from_db(workflow.id)

    # Update execution time
    plugin_info.executed = datetime.now(
        pytz.timezone(ontask_settings.TIME_ZONE))
    plugin_info.save()

    # List of pairs (column name, column type) in the result to create the
    # log event
    result_columns = list(zip(list(result_df.columns),
                              pandas_db.df_column_types_rename(result_df)))

    # Log the event
    logs.ops.put(
        request.user, 'plugin_execute', workflow, {
            'id': plugin_info.id,
            'name': plugin_info.name,
            'status': status,
            'result_columns': result_columns
        })

    # Create the table information to show in the report.
    column_info = []
    dst_names = list(dst_df.columns)
    result_names = list(result_df.columns)
    for c in list(final_df.columns):
        if c not in result_names:
            column_info.append((c, ''))
        elif c not in dst_names:
            column_info.append(('', c + ' (New)'))
        else:
            if c == form.cleaned_data['merge_key']:
                column_info.append((c, c))
            else:
                column_info.append((c + ' (Update)', c))

    context['info'] = column_info
    context['key'] = form.cleaned_data['merge_key']
    context['id'] = workflow.id

    # Render the report page with the proper info
    return render(request, 'dataops/plugin_execution_report.html', context)
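
For context, a view like this is reached through the project's URL
configuration; a hypothetical wiring sketch (the path and pattern name are
assumptions, not taken from the source):

from django.urls import path

from . import views

urlpatterns = [
    # 'pk' is the primary key of the PluginRegistry entry handed to run()
    path('plugin/<int:pk>/run/', views.run, name='plugin_run'),
]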
Example #3
File: ops.py  Project: leihuagh/ontask_b
def store_table_in_db(data_frame,
                      pk,
                      table_name,
                      temporary=False,
                      reset_keys=True):
    """
    Update or create a table in the DB with the data in the data frame. It
    also updates the corresponding column information.

    :param data_frame: Data frame to dump to DB
    :param pk: Corresponding primary key of the workflow
    :param table_name: Table to use in the DB
    :param temporary: Boolean stating whether the table is temporary or
           belongs to an existing workflow.
    :param reset_keys: If True, recompute the value of the is_key field from
           scratch
    :return: If temporary = True, then return a list with three lists:
             - column names
             - column types
             - column is unique
             If temporary = False, return None. All this info is stored in
             the workflow
    """

    if settings.DEBUG:
        print('Storing table ', table_name)

    # get column names
    df_column_names = list(data_frame.columns)

    # if the data frame is temporary, the procedure is much simpler
    if temporary:
        # Check whether each column has unique values per row
        column_unique = are_unique_columns(data_frame)

        # Store the table in the DB
        store_table(data_frame, table_name)

        # Get the column types
        df_column_types = df_column_types_rename(table_name)

        # Return a list with three lists with information about the
        # data frame that will be needed in the next steps
        return [df_column_names, df_column_types, column_unique]

    # We are modifying an existing DF

    # Get the workflow and its columns
    workflow = Workflow.objects.get(id=pk)
    wf_cols = workflow.columns.all()

    # Loop over the columns in the workflow to refresh the is_key value. New
    # values may have been added to a column, so this field needs to be
    # reassessed
    for col in wf_cols:
        if reset_keys:
            new_val = is_unique_column(data_frame[col.name])
            if col.is_key and not new_val:
                # Only clear is_key when the column claims to be a key but
                # the values say otherwise. The opposite direction (is_key is
                # False in the column) is ignored, as it may have been set
                # deliberately by the user
                col.is_key = new_val
                col.save()

        # Remove this column name from df_column_names; no further processing
        # is needed.
        df_column_names.remove(col.name)

    # Loop over the remaining columns in the data frame and create the new
    # column objects in the workflow
    for cname in df_column_names:
        # Create the new column
        column = Column(
            name=cname,
            workflow=workflow,
            data_type=pandas_datatype_names[data_frame[cname].dtype.name],
            is_key=is_unique_column(data_frame[cname]),
            position=workflow.columns.count() + 1
        )
        column.save()

    # Get the new set of workflow columns
    wf_columns = workflow.columns.all()

    # Reorder the columns in the data frame
    data_frame = data_frame[[x.name for x in wf_columns]]

    # Store the table in the DB
    store_table(data_frame, table_name)

    # Review the column types because some "objects" are stored as booleans
    column_types = df_column_types_rename(table_name)
    for ctype, col in zip(column_types, wf_columns):
        if col.data_type != ctype:
            # If the column type in the DB is different from the one in the
            # object, update
            col.data_type = ctype
            col.save()

    # Update workflow fields and save
    workflow.nrows = data_frame.shape[0]
    workflow.ncols = data_frame.shape[1]
    workflow.set_query_builder_ops()
    workflow.data_frame_table_name = table_name
    workflow.save()

    return None
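
A short sketch of how the reset_keys flag changes behaviour (the data frame,
workflow id, and table name are illustrative assumptions):

import pandas as pd

df = pd.DataFrame({'sid': [1, 1, 2], 'grade': [0.8, 0.9, 0.7]})

# With reset_keys=True (the default), a column currently marked is_key is
# demoted automatically if its values are no longer unique ('sid' above)
store_table_in_db(df, 42, 'workflow_table_42')

# With reset_keys=False the stored is_key flags are left untouched, e.g. when
# the caller knows the key columns have not changed
store_table_in_db(df, 42, 'workflow_table_42', reset_keys=False)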