예제 #1
0
    def _CheckHierarchies(self):
        """Make sure that each concept instance has no more than one parent."""
        for column in self.column_bundle.GetColumnIterator():
            if not column.parent_ref:
                continue

            # Total values are sentinels rather than real instances, so they
            # are excluded from the parent/child combination counts.
            excluded_totals = {}

            if column.total_val:
                excluded_totals[column.column_id] = column.total_val

            parent_column = self.column_bundle.GetColumnByID(column.parent_ref)

            if parent_column.total_val:
                excluded_totals[column.parent_ref] = parent_column.total_val

            combinations = self.data_container.CombinationCount(
                column.column_id, column.parent_ref, excluded_totals)

            # Any child value paired with more than one distinct parent value
            # violates the hierarchy constraint.
            bad_instances = [pair[0] for pair in combinations if pair[1] > 1]

            if bad_instances:
                raise data_source.DataSourceError(
                    'Instances of column %s have multiple parent values: %s'
                    % (column.column_id, bad_instances))
    def _CheckHierarchies(self):
        """Make sure that each concept instance has no more than one parent.

    For every column that declares a parent_ref, counts the distinct
    (child, parent) pairs per child value in the sqlite table; any child
    value that appears with more than one parent is an error.

    Raises:
      DataSourceError: If a query fails or if any instance has multiple
                       parent values.
    """
        cursor = self.sqlite_connection.cursor()

        # Close the cursor even when an error aborts the check; the previous
        # version leaked it whenever a DataSourceError was raised mid-loop.
        try:
            for column in self.column_bundle.GetColumnIterator():
                if not column.parent_ref:
                    continue

                # Do not count total values as instances
                if column.total_val:
                    where_clause = ('WHERE %s != "%s"' %
                                    (column.column_id, column.total_val))
                else:
                    where_clause = ''

                # Inner SELECT DISTINCT collapses duplicate rows; the outer
                # GROUP BY counts distinct parents per child value.
                query_str = (
                    'SELECT %s, COUNT(*) FROM (SELECT DISTINCT %s, %s '
                    'FROM csv_table %s) GROUP BY %s' %
                    (column.column_id, column.column_id, column.parent_ref,
                     where_clause, column.column_id))

                try:
                    cursor.execute(query_str)
                except sqlite3.OperationalError as e:
                    raise data_source.DataSourceError(
                        'Error executing query: %s\n%s' % (query_str, str(e)))

                error_values = [row[0] for row in cursor if int(row[1]) > 1]

                if error_values:
                    raise data_source.DataSourceError(
                        'Instances of column %s have multiple parent values: %s'
                        % (column.column_id, error_values))
        finally:
            cursor.close()
예제 #3
0
def _HeaderToColumn(header_string):
    """Parse the header string for a column.

  Args:
    header_string: The complete string for the column header

  Returns:
    A DataColumn object populated based on the header data

  Raises:
    DataSourceError: If there are any errors in parsing, e.g. if an unrecognized
                     key is found.
  """
    # The column id must be at least one character long, and cannot contain the
    # characters '[', ']', ';', or whitespace. Optional parameters follow in
    # square brackets as semicolon-separated key=value pairs.
    # (Raw string so the regex escapes are not interpreted by Python.)
    parameters_match = re.match(r'^([^\]\[;\s]+)(?:\[(.*)\]){0,1}$',
                                header_string.strip().replace('"', ''))

    if not parameters_match:
        raise data_source.DataSourceError(
            'Formatting error for header string: %s' % header_string)

    column_id = parameters_match.group(1)
    column = data_source.DataSourceColumn(column_id, internal_parameters={})

    if parameters_match.group(2):
        # Parse the column parameters
        key_value_pairs = parameters_match.group(2).split(';')

        for key_value_pair in key_value_pairs:
            try:
                [key, value] = key_value_pair.split('=')
            except ValueError:
                raise data_source.DataSourceError(
                    'Formatting error for header string: %s' % header_string)

            # Map the key to the appropriate field of the DataSourceColumn object
            if key == 'type':
                if value not in ['date', 'float', 'integer', 'string']:
                    raise data_source.DataSourceError(
                        'Unknown data type for column %s: %s' %
                        (column.column_id, value))

                column.data_type = value
            elif key == 'format':
                column.data_format = value
            elif key == 'concept':
                column.concept_ref = value
            elif key == 'extends':
                column.concept_extension = value
            elif key == 'parent':
                column.parent_ref = value
            elif key == 'slice_role':
                role_value = value.lower()

                if role_value not in ['dimension', 'metric']:
                    # Fixed typo: message previously said 'slice_roll'.
                    raise data_source.DataSourceError(
                        'Unrecognized slice_role in column %s: %s' %
                        (column.column_id, value))
                else:
                    column.slice_role = role_value
            elif key == 'rollup':
                if value.lower() == 'true':
                    column.rollup = True
                elif value.lower() == 'false':
                    column.rollup = False
                else:
                    raise data_source.DataSourceError(
                        'Unrecognized boolean value in column %s: %s' %
                        (column.column_id, value))
            elif key == 'total_val':
                column.total_val = value
            elif key == 'dropif':
                column.internal_parameters['dropif_val'] = value
            elif key == 'zeroif':
                column.internal_parameters['zeroif_val'] = value
            elif key == 'aggregation':
                # value.lower() instead of string.lower(value): the latter was
                # removed in Python 3 and is inconsistent with the rest of
                # this function.
                if value.lower() not in [
                        'sum', 'max', 'min', 'avg', 'count'
                ]:
                    raise data_source.DataSourceError(
                        'Unknown aggregation for column %s: %s' %
                        (column.column_id, value))

                column.internal_parameters['aggregation'] = value
            else:
                raise data_source.DataSourceError(
                    'Unknown parameter for column %s: %s' %
                    (column.column_id, key))
    return column
예제 #4
0
def ConstructColumnBundle(csv_file, verbose=True):
    """Construct a ColumnBundle from the header information in a CSV file.

  Args:
    csv_file: The complete string for the column header
    verbose: Print out extra information to stdout

  Returns:
    A data_source.ColumnBundle object populated based on the CSV header

  Raises:
    DataSourceError: If there are any parsing errors or data
                     inconsistencies
  """
    # Get the first and second rows of the CSV
    header_csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    header_row_values = header_csv_reader.next()
    second_row_values = header_csv_reader.next()
    csv_file.seek(0)

    # Check that second row is properly formatted
    if len(header_row_values) != len(second_row_values):
        raise data_source.DataSourceError(
            'Number of columns in row 2 (%d) does not match number '
            'expected (%d)' % (len(second_row_values), len(header_row_values)))

    column_bundle = data_source.DataSourceColumnBundle()

    for header_element in header_row_values:
        column_bundle.AddColumn(_HeaderToColumn(header_element))

    num_date_columns = 0
    has_metric_column = False
    column_ids = [
        column.column_id for column in column_bundle.GetColumnIterator()
    ]

    # Iterate through columns, populating and refining DataSourceColumn
    # parameters as necessary
    for c, column in enumerate(column_bundle.GetColumnIterator()):
        if verbose:
            print '\nEvaluating column %s' % column.column_id

        # Check data type
        if not column.data_type:
            column.data_type = (data_source.GuessDataType(
                second_row_values[c], column.column_id))

            if verbose:
                print 'Guessing that column %s is of type %s' % (
                    column.column_id, column.data_type)

        # Check slice type
        if not column.slice_role:
            if column.data_type == 'integer' or column.data_type == 'float':
                column.slice_role = 'metric'
            else:
                column.slice_role = 'dimension'

            if verbose:
                print 'Guessing that column %s is a %s' % (column.column_id,
                                                           column.slice_role)

        # Check aggregation
        if column.slice_role == 'metric':
            has_metric_column = True

            if 'aggregation' not in column.internal_parameters:
                column.internal_parameters['aggregation'] = 'SUM'

                if verbose:
                    print 'Guessing that column %s should be aggregated by %s' % (
                        column.column_id,
                        column.internal_parameters['aggregation'])

        # Check parent
        if column.parent_ref:
            if column.parent_ref not in column_ids:
                raise data_source.DataSourceError(
                    'Column %s references a parent not defined in this dataset: %s'
                    % (column.column_id, column.parent_ref))

            parent_column = column_bundle.GetColumnByID(column.parent_ref)

            if not parent_column.rollup:
                parent_column.rollup = True

                if verbose:
                    print(
                        'Making column %s rollup since it is a parent to column %s'
                        % (parent_column.column_id, column.column_id))

        # Check date format and concept
        if column.data_type == 'date':
            num_date_columns += 1

            if not column.data_format:
                column.data_format = (data_source.GuessDateFormat(
                    second_row_values[c]))

            if not column.concept_ref:
                column.concept_ref = (data_source.GuessDateConcept(
                    column.data_format))

            if verbose:
                print(
                    'Guessing that column %s is formatted as %s and '
                    'corresponds to %s' %
                    (column.column_id, column.data_format, column.concept_ref))

    # Warn user if their file will not produce interesting DSPL visualizations
    if num_date_columns == 0:
        warnings.warn('Input file does not have a date column',
                      data_source.DataSourceWarning)

    elif num_date_columns > 1:
        warnings.warn('Input file has more than one date column',
                      data_source.DataSourceWarning)

    if not has_metric_column:
        warnings.warn('Input file does not have any metrics',
                      data_source.DataSourceWarning)

    return column_bundle
    def __init__(self, csv_file, verbose=True):
        """Populate a CSVDataSourceSqlite object based on a CSV file.

    Note that the caller is responsible for closing the csv_file.

    Args:
      csv_file: A file-like object, opened for reading, that has CSV data in it
      verbose: Print out status messages to stdout

    Raises:
      DataSourceError: If CSV isn't properly formatted
    """
        # Scratch directory for the sqlite database file.
        # NOTE(review): not cleaned up here -- presumably a Close() method or
        # destructor removes it; confirm against the rest of the class.
        self.sqlite_dir = tempfile.mkdtemp()
        self.verbose = verbose
        self.column_bundle = csv_utilities.ConstructColumnBundle(
            csv_file, verbose)

        num_columns = self.column_bundle.GetNumColumns()

        # Set up sqlite table to store data
        # One "name type" fragment per CSV column, mapped through the
        # DSPL-to-sqlite type table.
        columns_string = (','.join([
            '%s %s' %
            (column.column_id, _DSPL_TYPE_TO_SQLITE_TYPE[column.data_type])
            for column in self.column_bundle.GetColumnIterator()
        ]))

        if self.verbose:
            print '\nCreating sqlite3 table: %s' % (columns_string)

        self.sqlite_connection = sqlite3.connect(
            os.path.join(self.sqlite_dir, 'db.dat'))
        cursor = self.sqlite_connection.cursor()
        # NOTE(review): DDL and inserts below are built by string
        # interpolation; column ids and cell values come straight from the
        # CSV, so this trusts the input file.
        cursor.execute('create table csv_table (%s)' % (columns_string))

        if self.verbose:
            print 'Adding CSV data to SQLite table'

        body_csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
        # Skip the header row; it was already consumed by
        # ConstructColumnBundle above.
        body_csv_reader.next()

        for r, row in enumerate(body_csv_reader):
            transformed_row_values = []

            # Ignore blank rows
            if row:
                if len(row) != num_columns:
                    # r + 2: reported row numbers are 1-based and include the
                    # header row.
                    raise data_source.DataSourceError(
                        'Number of columns in row %d (%d) does not match number '
                        'expected (%d)' % (r + 2, len(row), num_columns))

                skip_row = False

                for v, row_value in enumerate(row):
                    column = self.column_bundle.GetColumnByOrder(v)

                    # Handle dropif_val and zeroif_val parameters
                    if 'dropif_val' in column.internal_parameters:
                        if row_value == column.internal_parameters[
                                'dropif_val']:
                            skip_row = True
                            break
                    elif 'zeroif_val' in column.internal_parameters:
                        if row_value == column.internal_parameters[
                                'zeroif_val']:
                            row_value = '0'

                    # _CleanDBValue presumably quotes/escapes the value for
                    # direct embedding in the INSERT below -- verify.
                    transformed_row_values.append(
                        _CleanDBValue(row_value, column.data_type))

                if skip_row:
                    continue

                # Add row to sqlite table
                transformed_values_str = ','.join(transformed_row_values)

                try:
                    cursor.execute('insert into csv_table values (%s)' %
                                   (transformed_values_str))
                except sqlite3.OperationalError as e:
                    raise data_source.DataSourceError(
                        'Error putting line %d of input file into database: %s'
                        '\n%s' % (r + 2, transformed_values_str, str(e)))

        if self.verbose:
            print 'Committing transactions\n'

        self.sqlite_connection.commit()

        cursor.close()

        if self.verbose:
            print 'Checking concept hierarchies'

        self._CheckHierarchies()
    def GetTableData(self, query_parameters):
        """Calculate and return the requested table data.

    Uses sqlite to group and aggregate the raw data from the original CSV.

    Args:
      query_parameters: A QueryParameters object

    Returns:
      A TableData object containing the data for the requested table

    Raises:
      DataSourceError: If query against sqlite instance fails
    """
        if query_parameters.query_type == data_source.QueryParameters.CONCEPT_QUERY:
            # This request is for a concept definition table

            # Filter out total values
            where_statements = []

            for column_id in query_parameters.column_ids:
                column = self.column_bundle.GetColumnByID(column_id)

                if column.total_val:
                    where_statements.append(
                        '%s != "%s"' % (column.column_id, column.total_val))
            if where_statements:
                where_clause = 'WHERE ' + ','.join(where_statements)
            else:
                where_clause = ''

            query_str = ('SELECT DISTINCT %s FROM csv_table %s ORDER BY %s' %
                         (','.join(query_parameters.column_ids), where_clause,
                          ','.join(query_parameters.column_ids)))
        elif query_parameters.query_type == data_source.QueryParameters.SLICE_QUERY:
            # This request is for a slice table
            sql_names = []
            dimension_sql_names = []
            where_statements = []

            time_dimension_id = ''

            # Construct a SQL query that selects all parameters (with the necessary
            # aggregations), groups by non-time dimensions, and orders by all the
            # dimensions, with time last.
            for column_id in query_parameters.column_ids:
                column = self.column_bundle.GetColumnByID(column_id)

                if column.total_val:
                    where_statements.append(
                        '%s != "%s"' % (column.column_id, column.total_val))

                if column.slice_role == 'dimension':
                    sql_names.append(column_id)
                    dimension_sql_names.append(column_id)

                    if column.data_type == 'date':
                        time_dimension_id = column_id
                elif column.slice_role == 'metric':
                    sql_names.append(
                        '%s(%s) AS %s' %
                        (column.internal_parameters['aggregation'], column_id,
                         column_id))

            order_sql_names = ([
                d for d in dimension_sql_names if d != time_dimension_id
            ])

            if time_dimension_id:
                order_sql_names.append(time_dimension_id)

            # Handle total values in non-selected columns
            for column in self.column_bundle.GetColumnIterator():
                if column.column_id not in query_parameters.column_ids:
                    if column.total_val:
                        where_statements.append(
                            '%s = "%s"' % (column.column_id, column.total_val))

            if where_statements:
                where_clause = 'WHERE ' + ','.join(where_statements)
            else:
                where_clause = ''

            query_str = (
                'SELECT %s FROM csv_table %s GROUP BY %s ORDER BY %s' %
                (','.join(sql_names), where_clause,
                 ','.join(dimension_sql_names), ','.join(order_sql_names)))
        else:
            raise data_source.DataSourceError('Unknown query type: %s' %
                                              query_parameters.query_type)

        if self.verbose:
            print 'Executing query:\n%s\n' % (query_str)

        # Execute the query against the sqlite backend
        cursor = self.sqlite_connection.cursor()

        try:
            cursor.execute(query_str)
        except sqlite3.OperationalError as e:
            raise data_source.DataSourceError('Error executing query: %s\n%s' %
                                              (query_str, str(e)))

        query_results = []

        for row in cursor:
            query_results.append(list(row))

        cursor.close()

        return data_source.TableData(rows=query_results)
예제 #7
0
    def GetTableData(self, query_parameters):
        """Calculate and return the requested table data.

    Use in-memory data container to group and aggregate the raw data from the
    original CSV.

    Args:
      query_parameters: A QueryParameters object

    Returns:
      A TableData object containing the data for the requested table

    Raises:
      DataSourceError: If query against sqlite instance fails
    """
        query_type = query_parameters.query_type

        if query_type == data_source.QueryParameters.CONCEPT_QUERY:
            # Concept definition table: distinct values, with any declared
            # total values filtered out.
            totals_to_skip = {}

            for requested_id in query_parameters.column_ids:
                requested_column = self.column_bundle.GetColumnByID(
                    requested_id)

                if requested_column.total_val:
                    totals_to_skip[requested_id] = [
                        requested_column.total_val
                    ]

            result_rows = self.data_container.DistinctValues(
                query_parameters.column_ids, totals_to_skip)
        elif query_type == data_source.QueryParameters.SLICE_QUERY:
            # Slice table: select every requested column (aggregating the
            # metrics), group by the dimensions, and order by the dimensions
            # with the time dimension last.
            selected_ids = []
            dimension_ids = []
            metric_ids = []
            aggregations = {}
            selected_totals = {}
            time_dimension = ''

            for requested_id in query_parameters.column_ids:
                requested_column = self.column_bundle.GetColumnByID(
                    requested_id)

                selected_ids.append(requested_id)

                if requested_column.slice_role == 'dimension':
                    dimension_ids.append(requested_id)

                    if requested_column.data_type == 'date':
                        time_dimension = requested_id
                elif requested_column.slice_role == 'metric':
                    metric_ids.append(requested_id)
                    aggregations[requested_id] = (
                        requested_column.internal_parameters['aggregation'])

                if requested_column.total_val:
                    selected_totals[requested_column.column_id] = (
                        requested_column.total_val)

            ordering = [d for d in dimension_ids if d != time_dimension]

            if time_dimension:
                ordering.append(time_dimension)

            # Columns outside the query that declare a total value restrict
            # the rows considered to their total rows.
            unselected_totals = {}

            for column in self.column_bundle.GetColumnIterator():
                if (column.column_id not in query_parameters.column_ids
                        and column.total_val):
                    unselected_totals[column.column_id] = column.total_val

            result_rows = self.data_container.GroupedValues(
                selected_ids, dimension_ids, ordering, aggregations,
                unselected_totals, selected_totals)
        else:
            raise data_source.DataSourceError('Unknown query type: %s' %
                                              query_parameters.query_type)

        return data_source.TableData(rows=result_rows)
예제 #8
0
    def __init__(self, csv_file, verbose=True):
        """Populate a CSVDataSource object based on a CSV file.

    Note that the caller is responsible for closing the csv_file.

    Args:
      csv_file: A file-like object, opened for reading, that has CSV data in it
      verbose: Print out status messages to stdout

    Raises:
      DataSourceError: If CSV isn't properly formatted
    """
        self.verbose = verbose
        self.column_bundle = csv_utilities.ConstructColumnBundle(
            csv_file, verbose)

        column_ids = [
            column.column_id
            for column in self.column_bundle.GetColumnIterator()
        ]
        num_columns = self.column_bundle.GetNumColumns()
        self.data_container = DataContainer(column_ids)

        if self.verbose:
            print 'Reading CSV data'

        body_csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
        body_csv_reader.next()

        for r, row in enumerate(body_csv_reader):
            transformed_row_values = []

            # Ignore blank rows
            if row:
                if len(row) != num_columns:
                    raise data_source.DataSourceError(
                        'Number of columns in row %d (%d) does not match number '
                        'expected (%d)' % (r + 2, len(row), num_columns))

                skip_row = False

                for v, row_value in enumerate(row):
                    column = self.column_bundle.GetColumnByOrder(v)

                    # Handle dropif_val and zeroif_val parameters
                    if 'dropif_val' in column.internal_parameters:
                        if row_value == column.internal_parameters[
                                'dropif_val']:
                            skip_row = True
                            break
                    elif 'zeroif_val' in column.internal_parameters:
                        if row_value == column.internal_parameters[
                                'zeroif_val']:
                            row_value = 0.0

                    if column.data_type == 'integer':
                        typed_row_value = int(row_value)
                    elif column.data_type == 'float':
                        typed_row_value = float(row_value)
                    else:
                        typed_row_value = row_value

                    transformed_row_values.append(typed_row_value)

                if skip_row:
                    continue

                self.data_container.AddRow(transformed_row_values)

        if self.verbose:
            print 'Checking concept hierarchies'

        self._CheckHierarchies()