예제 #1
0
    def test_get_table_level_metadata_date_last_updated_not_empty(self):
        self.engine.execute("""
            INSERT INTO metabase.data_table (data_table_id, file_table_name)
                VALUES (1, 'data.table_test_date_last_updated');

            CREATE TABLE data.table_test_date_last_updated (c1 INT);

            INSERT INTO data.table_test_date_last_updated (c1) VALUES (1);
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        extract._get_table_level_metadata()

        self.engine.execute('DROP TABLE data.table_test_date_last_updated;')

        result = self.engine.execute("""
            SELECT date_last_updated
            FROM metabase.data_table
            WHERE data_table_id = 1
        """).fetchall()

        result_date_last_updated = result[0][0]

        assert isinstance(result_date_last_updated, datetime.datetime)
예제 #2
0
    def test_get_table_level_metadata_num_of_cols_2_cols_2_row(self):
        self.engine.execute("""
            INSERT INTO metabase.data_table (data_table_id, file_table_name)
                VALUES (1, 'data.table_test_n_cols');

            ALTER TABLE data.table_test_n_cols ADD c2 TEXT;

            INSERT INTO data.table_test_n_cols (c1, c2) VALUES (2, 'text');
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        extract._get_table_level_metadata()

        self.engine.execute('DROP TABLE data.table_test_n_cols;')

        result = self.engine.execute("""
            SELECT number_columns, number_rows
            FROM metabase.data_table
            WHERE data_table_id = 1
        """).fetchall()

        result_n_cols_n_rows = result[0]

        assert (2, 2) == result_n_cols_n_rows
예제 #3
0
def test_get_column_level_metadata_type_overrides_numeric(
        setup_module, setup_get_column_level_metadata):
    """Test type overrides when text overrides numeric."""

    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)

    type_overrides = {'c_num': 'text'}
    extract.process_table(categorical_threshold=2,
                          type_overrides=type_overrides)

    engine = setup_module.engine
    results = engine.execute("""
            SELECT
            data_table_id,
            column_name,
            max_length,
            min_length,
            median_length,
            updated_by,
            date_last_updated
        FROM metabase.text_column
    """).fetchall()

    text_columns = (results[0]['column_name'], results[1]['column_name'])

    assert 'c_num' in text_columns
예제 #4
0
def test_get_column_level_metadata_type_overrides_numeric_with_code(
        setup_module, setup_get_column_level_metadata):
    """Test type overrides when code overrides numeric."""

    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)

    type_overrides = {'c_num': 'code'}
    extract.process_table(categorical_threshold=2,
                          type_overrides=type_overrides)

    engine = setup_module.engine
    results = engine.execute("""
        SELECT
            data_table_id,
            column_name,
            code,
            frequency,
            updated_by,
            date_last_updated
        FROM metabase.code_frequency

    """).fetchall()

    code_columns = [i['column_name'] for i in results]
    assert 'c_num' in code_columns
예제 #5
0
    def test_get_column_level_metadata__column_info(self):
        """Test extracting column level metadata into Column Info table."""

        self.engine.execute("""
           INSERT INTO metabase.data_table (data_table_id, file_table_name)
           VALUES (1, 'data.table_1');

           CREATE TABLE data.table_1
               (c_num INT, c_text TEXT, c_code TEXT, c_date DATE);

           INSERT INTO data.table_1 (c_num, c_text, c_code, c_date)
           VALUES
           (1, 'text_1', 'code_1', '2018-01-01'),
           (2, 'text_2', 'code_1', '2018-02-01'),
           (3, 'text_3', 'code_2', '2018-03-02');
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        extract._get_column_level_metadata(categorical_threshold=2)

        # Check column info retuls.
        results = self.engine.execute(
            "SELECT * FROM metabase.column_info").fetchall()

        assert 4 == len(results)
예제 #6
0
def test_get_column_level_metadata_date(setup_module,
                                        setup_get_column_level_metadata):
    """Test extracting date column level metadata."""

    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)
    extract.process_table(categorical_threshold=2)

    engine = setup_module.engine
    results = engine.execute("""
        SELECT
            data_table_id,
            column_name,
            min_date,
            max_date,
            updated_by,
            date_last_updated
        FROM metabase.date_column
    """).fetchall()[0]

    assert 1 == results['data_table_id']
    assert 'c_date' == results['column_name']
    assert datetime.date(2018, 1, 1) == results['min_date']
    assert datetime.date(2018, 3, 2) == results['max_date']
    assert isinstance(results[4], str)
    assert isinstance(results[5], datetime.datetime)
예제 #7
0
def test_get_column_level_metadata_numeric(setup_module,
                                           setup_get_column_level_metadata):
    """Test extracting numeric column level metadata."""

    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)
    extract.process_table(categorical_threshold=2)

    engine = setup_module.engine
    results = engine.execute("""
        SELECT
            data_table_id,
            column_name,
            minimum,
            maximum,
            mean,
            median,
            updated_by,
            date_last_updated
        FROM metabase.numeric_column
    """).fetchall()[0]

    assert 1 == results['data_table_id']
    assert 'c_num' == results['column_name']
    assert 1 == results['minimum']
    assert 3 == results['maximum']
    assert 2 == results['mean']
    assert 2 == results['median']
    assert isinstance(results['updated_by'], str)
    assert isinstance(results['date_last_updated'], datetime.datetime)
예제 #8
0
    def test_get_table_level_metadata_num_of_cols_0_col_raise_error(self):
        """
        The following group of tests share data table `data.table_test_n_cols`:

            - test_get_table_level_metadata_num_of_cols_0_col_raise_error
            - test_get_table_level_metadata_num_of_cols_1_col_0_row_raise_error
            - test_get_table_level_metadata_num_of_cols_1_col_1_row
            - test_get_table_level_metadata_num_of_cols_2_cols_2_row

        `data.table_test_n_cols` will be dropped at the end of the last test
        in this group.

        """
        self.engine.execute("""
            INSERT INTO metabase.data_table (data_table_id, file_table_name)
                VALUES (1, 'data.table_test_n_cols');

            CREATE TABLE data.table_test_n_cols ();
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        with pytest.raises(ValueError):
            extract._get_table_level_metadata()
예제 #9
0
    def test_get_table_level_metadata_num_of_rows_2_rows(self):
        self.engine.execute("""
            INSERT INTO metabase.data_table (data_table_id, file_table_name)
                VALUES (1, 'data.table_test_n_rows');

            INSERT INTO data.table_test_n_rows (c1)
                VALUES (2);
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        extract._get_table_level_metadata()

        self.engine.execute('DROP TABLE data.table_test_n_rows;')

        result = self.engine.execute("""
            SELECT number_rows
            FROM metabase.data_table
            WHERE data_table_id = 1
        """).fetchall()

        result_n_rows = result[0][0]

        assert 2 == result_n_rows
예제 #10
0
def test_empty_table(setup_module, setup_empty_table):
    """Test extracting column level metadata from an empy table."""

    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)

    with pytest.raises(ValueError):
        extract.process_table(categorical_threshold=2)
예제 #11
0
 def test_get_table_name_data_table_id_not_found(self):
     """
     Test the validity of `data_table_id` as an argument to the constructor
     of ExtractMetadata.
     """
     with pytest.raises(ValueError):
         # Will raise error since `metabase.data_table` is empty
         with patch('metabase.extract_metadata.settings', self.mock_params):
             extract_metadata.ExtractMetadata(data_table_id=1)
예제 #12
0
    def test_get_table_name_file_table_name_not_splitable(self):
        self.engine.execute("""
            INSERT INTO metabase.data_table (data_table_id, file_table_name)
                VALUES (1, 'unqualified_table_name');
        """)

        with pytest.raises(ValueError):
            with patch('metabase.extract_metadata.settings', self.mock_params):
                extract_metadata.ExtractMetadata(data_table_id=1)
예제 #13
0
    def test_get_table_name_file_table_name_contain_extra_dot(self):
        self.engine.execute("""
            INSERT INTO metabase.data_table (data_table_id, file_table_name)
                VALUES (1, 'lots.of.dots');
        """)

        with pytest.raises(ValueError):
            with patch('metabase.extract_metadata.settings', self.mock_params):
                extract_metadata.ExtractMetadata(data_table_id=1)
예제 #14
0
def test_get_column_level_metadata_invalid_override(
        setup_module, setup_get_column_level_metadata):
    """Test invalid type override raises error."""

    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)

    type_overrides = {'c_text': 'numeric'}
    with pytest.raises(ValueError):
        extract.process_table(categorical_threshold=2,
                              type_overrides=type_overrides)
예제 #15
0
    def test_get_table_name_one_data_table(self):
        self.engine.execute("""
            INSERT INTO metabase.data_table (data_table_id, file_table_name)
                VALUES (1, 'data.data_table_name');
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        assert (('data', 'data_table_name') == (extract.schema_name,
                                                extract.table_name))
예제 #16
0
    def test_get_table_level_metadata_num_of_cols_1_col_0_row_raise_error(
            self):
        self.engine.execute("""
            INSERT INTO metabase.data_table (data_table_id, file_table_name)
                VALUES (1, 'data.table_test_n_cols');

            ALTER TABLE data.table_test_n_cols ADD c1 INT PRIMARY KEY;
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        with pytest.raises(ValueError):
            extract._get_table_level_metadata()
예제 #17
0
def test_get_column_level_metadata_column_info(
        setup_module, setup_get_column_level_metadata):
    """Test extracting column level metadata into Column Info table."""

    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)

    extract.process_table(categorical_threshold=2)

    # Check if the length of column info results equals to 4 columns.
    engine = setup_module.engine
    results = engine.execute('SELECT * FROM metabase.column_info').fetchall()

    assert 4 == len(results)
예제 #18
0
def test_get_column_level_metadata_date_format_wrong_format_treat_as_text(
        setup_module, setup_date_format):
    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)

    extract.process_table(
        categorical_threshold=0,
        date_format_dict={'c_date_invalid': 'YYYY-MM-DD'},
    )

    engine = setup_module.engine
    result = engine.execute("""
        SELECT * FROM metabase.column_info WHERE column_name = 'c_date_invalid'
    """).fetchall()[0]

    assert 'text' == result['data_type']
예제 #19
0
def test_get_column_level_metadata_code(setup_module,
                                        setup_get_column_level_metadata):
    """Test extracting code column level metadata."""

    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)
    extract.process_table(categorical_threshold=2)

    engine = setup_module.engine
    results = engine.execute("""
        SELECT
            data_table_id,
            column_name,
            code,
            frequency,
            updated_by,
            date_last_updated
        FROM metabase.code_frequency
    """).fetchall()

    assert 3 == len(results)

    assert 1 == results[0]['data_table_id']
    assert 'c_code' == results[0]['column_name']
    assert (results[0]['code'] in ('M', 'F', None))
    assert isinstance(results[0]['updated_by'], str)
    assert isinstance(results[0]['date_last_updated'], datetime.datetime)

    assert 1 == results[1]['data_table_id']
    assert 'c_code' == results[1]['column_name']
    assert (results[1]['code'] in ('M', 'F', None))
    assert isinstance(results[1]['updated_by'], str)
    assert isinstance(results[1]['date_last_updated'], datetime.datetime)

    assert 1 == results[2]['data_table_id']
    assert 'c_code' == results[2]['column_name']
    assert (results[2]['code'] in ('M', 'F', None))
    assert isinstance(results[2]['updated_by'], str)
    assert isinstance(results[2]['date_last_updated'], datetime.datetime)

    frequency_1 = results[0]['code'], results[0]['frequency']
    frequency_2 = results[1]['code'], results[1]['frequency']
    frequency_3 = results[2]['code'], results[2]['frequency']
    all_frequencies = set([frequency_1, frequency_2, frequency_3])
    expected = set([('M', 1), ('F', 2), (None, 1)])

    assert expected == all_frequencies
예제 #20
0
    def test_get_column_level_metadata_code(self):
        """Test extracting code column level metadata."""

        self.engine.execute("""
           INSERT INTO metabase.data_table (data_table_id, file_table_name)
           VALUES (1, 'data.table_1');

           CREATE TABLE data.table_1 (c_code TEXT);

           INSERT INTO data.table_1 (c_code)
            VALUES
                ('M'),
                ('F'),
                ('F');
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        extract._get_column_level_metadata(categorical_threshold=2)

        results = self.engine.execute("""
            SELECT
                data_table_id,
                column_name,
                code,
                frequency,
                updated_by,
                date_last_updated
            FROM metabase.code_frequency
        """).fetchall()

        assert results[0][0] == 1
        assert results[0][1] == 'c_code'
        assert results[0][2] == 'F'
        assert results[0][3] == 2
        assert isinstance(results[0][4], str)
        assert isinstance(results[0][5], datetime.datetime)

        assert results[1][0] == 1
        assert results[1][1] == 'c_code'
        assert results[1][2] == 'M'
        assert results[1][3] == 1
        assert isinstance(results[1][4], str)
        assert isinstance(results[1][5], datetime.datetime)
예제 #21
0
    def test_get_column_level_metadata_numeric(self):
        """Test extracting numeric column level metadata."""

        self.engine.execute("""
           INSERT INTO metabase.data_table (data_table_id, file_table_name)
           VALUES (1, 'data.table_1');

           CREATE TABLE data.table_1
               (c_num INT);

           INSERT INTO data.table_1 (c_num)
           VALUES
           (1),
           (2),
           (3);
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        extract._get_column_level_metadata(categorical_threshold=2)

        # Check Numeric results.
        results = self.engine.execute("""
            SELECT
            data_table_id,
            column_name,
            minimum,
            maximum,
            mean,
            median,
            updated_by,
            date_last_updated
            FROM metabase.numeric_column
        """).fetchall()[0]

        assert results[0] == 1
        assert results[1] == 'c_num'
        assert results[2] == 1
        assert results[3] == 3
        assert results[4] == 2
        assert results[5] == 2
        assert isinstance(results[6], str)
        assert isinstance(results[7], datetime.datetime)
예제 #22
0
    def test_get_column_level_metadata_text(self):
        """Test extracting text column level metadata."""

        self.engine.execute("""
           INSERT INTO metabase.data_table (data_table_id, file_table_name)
           VALUES (1, 'data.table_1');

           CREATE TABLE data.table_1 (c_text TEXT);

           INSERT INTO data.table_1 (c_text)
           VALUES
           ('abc'),
           ('efgh'),
           ('ijklm');
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        extract._get_column_level_metadata(categorical_threshold=2)

        # Check date results.
        results = self.engine.execute("""
            SELECT
            data_table_id,
            column_name,
            max_length,
            min_length,
            median_length,
            updated_by,
            date_last_updated
            FROM metabase.text_column
        """).fetchall()[0]

        assert results[0] == 1
        assert results[1] == 'c_text'
        assert results[2] == 5
        assert results[3] == 3
        assert results[4] == 4
        assert isinstance(results[5], str)
        assert isinstance(results[6], datetime.datetime)
예제 #23
0
def test_get_column_level_metadata_date_format_partly_specified(
        setup_module, setup_date_format):
    """
    If date format is partly specified, Metabase should try to convert columns
    whose date format is provided, and use the default date parser for
    columns whose date format is not provided.
    """

    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)

    extract.process_table(
        categorical_threshold=0,
        date_format_dict={
            'c_date_text': 'YYYY-MM-DD',
            'c_date_text_2': 'YYYY-DD-MM',
        },
    )

    engine = setup_module.engine
    result = engine.execute("""
        SELECT * FROM metabase.date_column WHERE column_name = 'c_date_text';
    """).fetchall()[0]

    assert (datetime.date(2019, 1, 11) == result['min_date'] ==
            result['max_date'])

    result = engine.execute("""
        SELECT * FROM metabase.date_column WHERE column_name = 'c_date_text_2';
    """).fetchall()[0]

    assert (datetime.date(2019, 11, 1) == result['min_date'] ==
            result['max_date'])

    results = engine.execute("""
        SELECT * FROM metabase.column_info WHERE data_type = 'date';
    """).fetchall()

    date_column_names_set = set(v['column_name'] for v in results)

    assert 'c_date_date' in date_column_names_set
예제 #24
0
    def test_get_column_level_metadata_date(self):
        """Test extracting date column level metadata."""

        self.engine.execute("""
           INSERT INTO metabase.data_table (data_table_id, file_table_name)
           VALUES (1, 'data.table_1');

           CREATE TABLE data.table_1
               (c_date DATE);

           INSERT INTO data.table_1 ( c_date)
           VALUES
           ('2018-01-01'),
           ('2018-02-01'),
           ('2018-03-02');
        """)

        with patch('metabase.extract_metadata.settings', self.mock_params):
            extract = extract_metadata.ExtractMetadata(data_table_id=1)

        extract._get_column_level_metadata(categorical_threshold=2)

        # Check date results.
        results = self.engine.execute("""
            SELECT
            data_table_id,
            column_name,
            min_date,
            max_date,
            updated_by,
            date_last_updated
            FROM metabase.date_column
        """).fetchall()[0]

        assert results[0] == 1
        assert results[1] == 'c_date'
        assert results[2] == datetime.date(2018, 1, 1)
        assert results[3] == datetime.date(2018, 3, 2)
        assert isinstance(results[4], str)
        assert isinstance(results[5], datetime.datetime)
예제 #25
0
def test_get_column_level_metadata_date_format_detect_by_default(
        setup_module, setup_date_format):
    """
    If date format is not specified at all, Metabase should try to detect
    which columns are like dates.
    """

    with patch('metabase.extract_metadata.settings', setup_module.mock_params):
        extract = extract_metadata.ExtractMetadata(data_table_id=1)

    extract.process_table(
        categorical_threshold=0,
        date_format_dict={},
    )

    engine = setup_module.engine
    results = engine.execute("""
        SELECT * FROM metabase.column_info WHERE data_type = 'date';
    """).fetchall()

    date_column_names_set = set(v['column_name'] for v in results)

    assert 'c_date_invalid' not in date_column_names_set
    assert {'c_date_text', 'c_date_date'} < date_column_names_set
예제 #26
0
# Update meatabase.data_table with this new table.
max_id = engine.execute(
    'SELECT MAX(data_table_id) FROM metabase.data_table').fetchall()[0][0]
if max_id is None:
    new_id = 1
else:
    new_id = max_id + 1
print("data_table_id is {} for table {}".format(new_id, full_table_name))

engine.execute(
    """
    INSERT INTO metabase.data_table
    (
    data_table_id,
    file_table_name
    )
    VALUES
    (
    %(data_table_id)s,
    %(file_table_name)s
    )
    """, {
        'data_table_id': new_id,
        'file_table_name': full_table_name
    })

# Extract metadata from data.
extract = extract_metadata.ExtractMetadata(data_table_id=new_id)
extract.process_table(categorical_threshold=categorical_threshold)