Example No. 1
def test_invalid_zip_file(postgres, db_conn, tmpdir, mocked_config,
                          monkeypatch):
    """Test that invalid zip files are properly handled."""
    valid_csv_file_path = str(
        tmpdir.join('operator1_with_rat_info_20160701_20160731.csv'))
    invalid_file_zip_path = valid_csv_file_path[:-3] + 'zip'
    with open(valid_csv_file_path, 'w'):
        pass  # create an empty file; the context manager closes it
    os.rename(valid_csv_file_path, invalid_file_zip_path)

    catalog_config_dict = {
        'prospectors': [{
            'file_type': 'operator',
            'paths': [str(tmpdir)],
            'schema_filename': 'OperatorImportSchema.csvs'
        }, {
            'file_type': 'operator',
            'paths': [invalid_file_zip_path],
            'schema_filename': 'OperatorImportSchema_v2.csvs'
        }],
        'perform_prevalidation': True
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)
    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli,
                           obj={'APP_CONFIG': mocked_config},
                           catch_exceptions=False)
    assert result.exit_code == 0
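
The test above fakes a corrupt archive by writing an empty CSV and renaming it to .zip. The is_valid_zip column asserted in the later examples suggests the catalog records a structural check on each archive; a minimal sketch of such a check, using only the standard library (an illustration, not dirbs-catalog's actual internals):

import zipfile

def looks_like_valid_zip(file_path):
    """Return True if the file has a readable zip structure (illustrative sketch)."""
    # is_zipfile() only inspects the magic bytes; testzip() walks every member.
    if not zipfile.is_zipfile(file_path):
        return False
    try:
        with zipfile.ZipFile(file_path) as zf:
            return zf.testzip() is None  # None means no corrupt member was found
    except zipfile.BadZipFile:
        return False

An empty file renamed to .zip, like the one created above, fails the is_zipfile() check immediately.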
Example No. 2
def test_non_unique_path_failure(mocked_config, logger, tmpdir, monkeypatch):
    """Test if same path is defined two or more times; then config parsing fails."""
    # Build a config dict with a duplicate path across two prospectors
    catalog_config_dict = {
        'prospectors': [{
            'file_type': 'operator',
            'paths': [str(tmpdir.join('operator.zip'))],
            'schema_filename': 'OperatorImportSchema_v2.csvs'
        }, {
            'file_type': 'operator',
            'paths': [
                str(tmpdir.join('operator.zip')),
                str(tmpdir.join('operator1.zip'))
            ],
            'schema_filename': 'OperatorImportSchema_v2.csvs'
        }],
        'perform_prevalidation': False
    }

    with pytest.raises(Exception) as ex:
        CatalogConfig(ignore_env=True, **catalog_config_dict)

    assert 'The paths specified in the catalog config are not globally unique' in str(ex.value)
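
The asserted error message implies that CatalogConfig flattens every prospector's paths list and rejects duplicates at parse time. A hypothetical sketch of that validation (the function name and structure are assumptions, not the library's actual code):

def check_paths_globally_unique(prospectors):
    """Raise if any path appears in more than one prospector entry (illustrative only)."""
    seen = set()
    for prospector in prospectors:
        for p in prospector['paths']:
            if p in seen:
                raise ValueError('The paths specified in the catalog config are not globally unique')
            seen.add(p)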
Example No. 3
def test_perform_prevalidation_option(postgres, db_conn, tmpdir, monkeypatch, mocked_config):
    """Test pre-validation is not performed if option is turned off in the config."""
    files_to_zip = ['unittest_data/operator/operator1_with_rat_info_20160701_20160731.csv']
    zip_files_to_tmpdir(files_to_zip, tmpdir)
    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [str(tmpdir.join('operator1_with_rat_info_20160701_20160731.zip'))],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            }
        ],
        'perform_prevalidation': False
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)
    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0
    # This test checks that when pre-validation is disabled, it is skipped during cataloging.
    # The is_valid_format field should be NULL in that scenario, as tested below. The scenario with
    # pre-validation enabled is implicitly tested in the test_all_files_are_harvested test case.
    with db_conn.cursor() as cursor:
        cursor.execute("SELECT is_valid_format FROM data_catalog WHERE filename = "
                       "'operator1_with_rat_info_20160701_20160731.zip'")
        assert cursor.fetchone().is_valid_format is None
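
zip_files_to_tmpdir is a shared helper from the test suite whose body is not shown in these examples. Judging from how it is called, it presumably zips each listed data file into tmpdir under the same base name; a hypothetical sketch of that behaviour:

import os
import zipfile

def zip_files_to_tmpdir(files_to_zip, tmpdir):
    """Zip each relative test-data file into tmpdir as <basename>.zip (assumed behaviour)."""
    here = os.path.abspath(os.path.dirname(__file__))
    for rel_path in files_to_zip:
        src = os.path.join(here, rel_path)
        base, _ = os.path.splitext(os.path.basename(src))
        with zipfile.ZipFile(str(tmpdir.join(base + '.zip')), 'w',
                             compression=zipfile.ZIP_DEFLATED) as zf:
            zf.write(src, os.path.basename(src))  # arcname keeps just the file name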
Example No. 4
def test_non_zip_files_are_not_harvested(postgres, db_conn, tmpdir, mocker, mocked_config, monkeypatch):
    """Test non-zip files are not cataloged."""
    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [str(tmpdir)],
                'schema_filename': 'OperatorImportSchema.csvs'
            },
        ],
        'perform_prevalidation': False
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Mock the os.listdir call to return the unzipped test file alongside a directory entry
    mocker.patch.object(os, 'listdir', return_value=[
        os.path.abspath(os.path.dirname(__file__)),
        'unittest_data/operator/operator1_with_rat_info_20160701_20160731.csv'])

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    with db_conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM data_catalog WHERE filename = "
                       "'operator1_with_rat_info_20160701_20160731.csv'")
        assert cursor.fetchone()[0] == 0
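
The assertion only verifies that the .csv entry never reaches data_catalog. A plausible, purely illustrative harvest-side filter consistent with this behaviour:

def zip_candidates(entries):
    """Keep only .zip entries from a directory listing (illustrative filter, not DIRBS code)."""
    return [entry for entry in entries if entry.lower().endswith('.zip')]

With the mocked os.listdir() above, the .csv entry would be filtered out, so nothing is cataloged and COUNT(*) returns 0.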
Example No. 5
def test_file_specified_explicitly_is_cataloged_correctly(postgres, db_conn, tmpdir, mocked_config, monkeypatch):
    """Test that if file is specified explicitly; it is pre-validated using the correct schema."""
    files_to_zip = ['unittest_data/operator/operator1_with_rat_info_20160701_20160731.csv']
    zip_files_to_tmpdir(files_to_zip, tmpdir)
    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [str(tmpdir)],
                'schema_filename': 'OperatorImportSchema.csvs'
            },
            {
                'file_type': 'operator',
                'paths': [str(tmpdir.join('operator1_with_rat_info_20160701_20160731.zip'))],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            }
        ],
        'perform_prevalidation': True
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)
    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    with db_conn.cursor() as cursor:
        cursor.execute("SELECT is_valid_format FROM data_catalog WHERE filename = "
                       "'operator1_with_rat_info_20160701_20160731.zip'")
        assert cursor.fetchone().is_valid_format
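
Both prospectors match the zipped file here: the first via its parent directory, the second via the explicit path. The passing is_valid_format assertion indicates that the explicit-path prospector, and therefore the v2 schema, won. One way such "most specific wins" selection could be implemented (a sketch under that assumption, not the actual resolution logic):

import os

def pick_prospector(file_path, prospectors):
    """Prefer a prospector naming the file explicitly over one naming its directory (assumed rule)."""
    for prospector in prospectors:
        if file_path in prospector['paths']:
            return prospector
    parent = os.path.dirname(file_path)
    for prospector in prospectors:
        if parent in prospector['paths']:
            return prospector
    return None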
Example No. 6
def test_num_records_uncompressed_size(mocked_config, tmpdir, monkeypatch,
                                       flask_app, api_version):
    """Test import status info in catalog api.

    - num_records: the number of lines in the file minus the header.
    - uncompressed_size_bytes.
    """
    # Locate the valid operator CSV in the unittest data directory
    here = path.abspath(path.dirname(__file__))
    data_dir = path.join(here, 'unittest_data/operator')
    valid_csv_operator_data_file_name = 'operator1_20160701_20160731.csv'
    valid_csv_operator_data_file = path.join(data_dir, valid_csv_operator_data_file_name)

    # create a zip file inside a temp dir
    valid_zip_operator_data_file_path = \
        str(tmpdir.join('operator1_20160701_20160731.zip'))
    with zipfile.ZipFile(valid_zip_operator_data_file_path, 'w',
                         compression=zipfile.ZIP_DEFLATED) as valid_csv_operator_data_file_zfile:
        # zipfile write() method supports an extra argument (arcname) which is the
        # archive name to be stored in the zip file.
        valid_csv_operator_data_file_zfile.write(valid_csv_operator_data_file, valid_csv_operator_data_file_name)

    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [valid_zip_operator_data_file_path],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            }
        ],
        'perform_prevalidation': False
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)
    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    # call APIs
    if api_version == 'v1':
        rv = flask_app.get(url_for('{0}.catalog_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))
        assert data[0]['filename'] == 'operator1_20160701_20160731.zip'
        assert data[0]['num_records'] == 20
        assert data[0]['uncompressed_size_bytes'] == 1066
        assert data[0]['compressed_size_bytes'] == 400
    else:  # api version 2.0
        rv = flask_app.get(url_for('{0}.catalog_get_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))['files']
        assert data[0]['filename'] == 'operator1_20160701_20160731.zip'
        assert data[0]['num_records'] == 20
        assert data[0]['uncompressed_size_bytes'] == 1066
        assert data[0]['compressed_size_bytes'] == 400
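
The asserted numbers follow directly from the archive built above. A minimal sketch of how num_records (line count minus the header) and the two size fields could be derived with the standard library (not necessarily how dirbs-catalog computes them):

import os
import zipfile

def catalog_stats(zip_path):
    """Return (num_records, uncompressed_size_bytes, compressed_size_bytes) for a one-file archive."""
    compressed_size_bytes = os.path.getsize(zip_path)
    with zipfile.ZipFile(zip_path) as zf:
        info = zf.infolist()[0]
        with zf.open(info) as member:
            num_lines = sum(1 for _ in member)  # iterate the member line by line
    return num_lines - 1, info.file_size, compressed_size_bytes  # subtract the header row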
Example No. 7
def test_catalog(per_test_postgres, tmpdir, db_user, mocked_config,
                 monkeypatch):
    """Test catalog works with the security role created based on abstract role."""
    files_to_zip = [
        'unittest_data/operator/operator1_with_rat_info_20160701_20160731.csv'
    ]
    zip_files_to_tmpdir(files_to_zip, tmpdir)

    catalog_config_dict = {
        'prospectors': [{
            'file_type': 'operator',
            'paths': [str(tmpdir.join('operator1_with_rat_info_20160701_20160731.zip'))],
            'schema_filename': 'OperatorImportSchema_v2.csvs'
        }],
        'perform_prevalidation': False
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    monkeypatch.setattr(mocked_config.db_config, 'user', db_user)
    result = runner.invoke(dirbs_catalog_cli,
                           obj={'APP_CONFIG': mocked_config})

    if db_user in ['dirbs_poweruser_login', 'dirbs_catalog_user']:
        assert result.exit_code == 0
    else:
        assert result.exit_code != 0
Example No. 8
def test_import_status(db_conn, mocked_config, tmpdir, monkeypatch, flask_app,
                       api_version, logger, mocked_statsd, metadata_db_conn):
    """Test import status info in catalog api.

    - import_status:
        - ever_imported_successfully: true or false
        - most_recent_import: status
    Generate an MD5 hash of the file during import and store it in job_metadata.
    Then, when cataloging, look at the most recent import job in the job_metadata table where
    the file had the same MD5 hash and lookup the status.
    ever_imported_successfully will be true if there is any successfull import - joining on files md5
    most_recent_import returns the status of the most recent import - joining on files md5
    """
    # Step 1
    # try to import something successfully to get most_recent_import = success
    # and test the md5 created in the abstract importer using dirbs-import cli command
    here = path.abspath(path.dirname(__file__))
    data_dir = path.join(here, 'unittest_data/operator')
    valid_csv_operator_data_file_name = 'operator1_20160701_20160731.csv'
    valid_csv_operator_data_file = path.join(
        data_dir, valid_csv_operator_data_file_name)

    # create a zip file inside a temp dir
    valid_zip_operator_data_file_path = \
        str(tmpdir.join('operator1_20160701_20160731.zip'))
    with zipfile.ZipFile(valid_zip_operator_data_file_path,
                         'w') as valid_csv_operator_data_file_zfile:
        # zipfile write() method supports an extra argument (arcname) which is the
        # archive name to be stored in the zip file.
        valid_csv_operator_data_file_zfile.write(
            valid_csv_operator_data_file, valid_csv_operator_data_file_name)

    runner = CliRunner()
    result = runner.invoke(dirbs_import_cli,
                           ['operator', 'Operator1', '--disable-rat-import',
                            '--disable-region-check', '--disable-home-check',
                            valid_zip_operator_data_file_path],
                           obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    catalog_config_dict = {
        'prospectors': [{
            'file_type': 'operator',
            'paths': [valid_zip_operator_data_file_path],
            'schema_filename': 'OperatorImportSchema_v2.csvs'
        }],
        'perform_prevalidation': False
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)
    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli,
                           obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    # call APIs
    if api_version == 'v1':
        rv = flask_app.get(url_for('{0}.catalog_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))
        assert data[0]['import_status']['most_recent_import'] == 'success'
        assert data[0]['import_status']['ever_imported_successfully'] is True

        with db_conn.cursor() as cursor:
            cursor.execute('SELECT md5 FROM data_catalog')
            md5 = cursor.fetchone().md5

        # Step 2
        with db_conn.cursor() as cursor:
            cursor.execute('TRUNCATE TABLE job_metadata')

        # status error
        job_metadata_importer(db_conn=db_conn,
                              command='dirbs-import',
                              run_id=10,
                              subcommand='operator',
                              status='error',
                              start_time='2017-08-15 01:15:39.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # status in progress, most recent
        job_metadata_importer(db_conn=db_conn,
                              command='dirbs-import',
                              run_id=11,
                              subcommand='operator',
                              status='running',
                              start_time='2017-08-15 01:15:40.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # call API
        rv = flask_app.get(url_for('{0}.catalog_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))
        assert data[0]['import_status']['most_recent_import'] == 'running'
        assert data[0]['import_status']['ever_imported_successfully'] is False
        assert len(data) == 1

        # Step 3 try a different order
        with db_conn.cursor() as cursor:
            cursor.execute('TRUNCATE TABLE job_metadata')

        job_metadata_importer(db_conn=db_conn,
                              command='dirbs-import',
                              run_id=13,
                              subcommand='gsma',
                              status='success',
                              start_time='2017-08-15 01:15:39.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # status error, most recent
        job_metadata_importer(db_conn=db_conn,
                              command='dirbs-import',
                              run_id=14,
                              subcommand='gsma',
                              status='error',
                              start_time='2017-08-15 01:15:40.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # call API
        rv = flask_app.get(url_for('{0}.catalog_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))
        assert data[0]['import_status']['most_recent_import'] == 'error'
        assert data[0]['import_status']['ever_imported_successfully'] is True
        assert len(data) == 1
    else:  # api version 2.0
        rv = flask_app.get(url_for('{0}.catalog_get_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))['files']
        assert data[0]['import_status']['most_recent_import'] == 'success'
        assert data[0]['import_status']['ever_imported_successfully'] is True

        with db_conn.cursor() as cursor:
            cursor.execute('SELECT md5 FROM data_catalog')
            md5 = cursor.fetchone().md5

        # Step 2
        with db_conn.cursor() as cursor:
            cursor.execute('TRUNCATE TABLE job_metadata')

        # status error
        job_metadata_importer(db_conn=db_conn,
                              command='dirbs-import',
                              run_id=10,
                              subcommand='operator',
                              status='error',
                              start_time='2017-08-15 01:15:39.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # status in progress, most recent
        job_metadata_importer(db_conn=db_conn,
                              command='dirbs-import',
                              run_id=11,
                              subcommand='operator',
                              status='running',
                              start_time='2017-08-15 01:15:40.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # call API
        rv = flask_app.get(url_for('{0}.catalog_get_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))['files']
        assert data[0]['import_status']['most_recent_import'] == 'running'
        assert data[0]['import_status']['ever_imported_successfully'] is False
        assert len(data) == 1

        # Step 3 try a different order
        with db_conn.cursor() as cursor:
            cursor.execute('TRUNCATE TABLE job_metadata')

        job_metadata_importer(db_conn=db_conn,
                              command='dirbs-import',
                              run_id=13,
                              subcommand='gsma',
                              status='success',
                              start_time='2017-08-15 01:15:39.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # status error, most recent
        job_metadata_importer(db_conn=db_conn,
                              command='dirbs-import',
                              run_id=14,
                              subcommand='gsma',
                              status='error',
                              start_time='2017-08-15 01:15:40.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # call API
        rv = flask_app.get(url_for('{0}.catalog_get_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))['files']
        assert data[0]['import_status']['most_recent_import'] == 'error'
        assert data[0]['import_status']['ever_imported_successfully'] is True
        assert len(data) == 1
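
The MD5-join logic described in the docstring can be made concrete. A hedged sketch of the kind of lookup the catalog could run against job_metadata (the JSONB access to extra_metadata and the exact column names are assumptions based on what this test inserts):

MOST_RECENT_IMPORT_SQL = """
    SELECT status
      FROM job_metadata
     WHERE command = 'dirbs-import'
       AND extra_metadata->>'input_file_md5' = %s
     ORDER BY start_time DESC
     LIMIT 1
"""

def most_recent_import_status(conn, file_md5):
    """Return the status of the latest import of a file, matched by MD5 (illustrative query)."""
    with conn.cursor() as cursor:
        cursor.execute(MOST_RECENT_IMPORT_SQL, [file_md5])
        row = cursor.fetchone()
        return row[0] if row else None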
Example No. 9
def test_all_files_are_harvested(postgres, db_conn, tmpdir, logger,
                                 monkeypatch, mocked_config):
    """Test all input files are correctly harvested and cataloged."""
    files_to_zip = [
        'unittest_data/operator/operator1_with_rat_info_20160701_20160731.csv',
        'unittest_data/gsma/sample_gsma_import_list_anonymized.txt',
        'unittest_data/stolen_list/sample_stolen_list.csv',
        'unittest_data/registration_list/sample_registration_list.csv',
        'unittest_data/pairing_list/sample_pairinglist.csv',
        'unittest_data/golden_list/sample_golden_list.csv'
    ]
    zip_files_to_tmpdir(files_to_zip, tmpdir)
    catalog_config_dict = {
        'prospectors': [{
            'file_type': 'operator',
            'paths': [str(tmpdir.join('operator1_with_rat_info_20160701_20160731.zip'))],
            'schema_filename': 'OperatorImportSchema_v2.csvs'
        }, {
            'file_type': 'gsma_tac',
            'paths': [str(tmpdir.join('sample_gsma_import_list_anonymized.zip'))],
            'schema_filename': 'GSMASchema.csvs'
        }, {
            'file_type': 'stolen_list',
            'paths': [str(tmpdir.join('sample_stolen_list.zip'))],
            'schema_filename': 'StolenListSchema.csvs'
        }, {
            'file_type': 'pairing_list',
            'paths': [str(tmpdir.join('sample_pairinglist.zip'))],
            'schema_filename': 'PairingListSchema.csvs'
        }, {
            'file_type': 'registration_list',
            'paths': [str(tmpdir.join('sample_registration_list.zip'))],
            'schema_filename': 'RegistrationListSchema.csvs'
        }, {
            'file_type': 'golden_list',
            'paths': [str(tmpdir.join('sample_golden_list.zip'))],
            'schema_filename': 'GoldenListSchemaData.csvs'
        }],
        'perform_prevalidation': True
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli,
                           obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    with db_conn.cursor() as cursor:
        cursor.execute('SELECT * FROM data_catalog')
        res = [(row.filename, row.file_type, row.compressed_size_bytes,
                row.is_valid_zip, row.is_valid_format, row.extra_attributes)
               for row in cursor.fetchall()]
        assert ('operator1_with_rat_info_20160701_20160731.zip', 'operator', 797,
                True, True, {'filename_check': True}) in res
        assert ('sample_gsma_import_list_anonymized.zip', 'gsma_tac', 1083, True, True, {}) in res
        assert ('sample_stolen_list.zip', 'stolen_list', 529, True, True, {}) in res
        assert ('sample_registration_list.zip', 'registration_list', 858, True, True, {}) in res
        assert ('sample_pairinglist.zip', 'pairing_list', 312, True, True, {}) in res
        assert ('sample_golden_list.zip', 'golden_list', 474, True, True, {}) in res

    # Run dirbs-catalog again to verify that no new files are discovered
    result = runner.invoke(dirbs_catalog_cli,
                           obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0
    assert 'Data catalog is already up-to-date!' in logger_stream_contents(logger)