예제 #1
0
def test_timbrazil_ingest_voice_data(dagbag, populate_s3, mock_anonymizer):
    """Run ``timbrazil.anonymize_voice`` and compare its input and output.

    After running the DAG, the raw gzip file from ``timbrazil-internal`` and
    the CSV found in ``timbrazil-public`` are loaded. The test checks their
    shapes, the presence of the ``execution_date`` and ``uuid`` columns, and
    that no cell of a PII column equals the raw cell it is compared against.
    """
    mock_anonymizer({'5548999027010': '1234567890'})
    # mock s3 data required by the dag
    populate_s3('timbrazil-internal', 'timbrazil-public')
    # run the tasks in the dag
    dag = dagbag.get_dag('timbrazil.anonymize_voice')
    run_dag_tasks(dag)
    # get the dag input and output for comparison
    prefix = 'ocs/ocs_moc/2020/04/01/00'
    raw = pd.read_csv(get_object(bucket='timbrazil-internal',
                                 key=f'{prefix}/ocs_moc_01_20200401_0001.GZ'),
                      compression='gzip',
                      header=None,
                      skiprows=1,
                      delimiter='|')

    anonymized = pd.read_csv(
        get_object(bucket='timbrazil-public',
                   key=f'{prefix}/ocs_moc_01_20200401_0001.csv'))
    # check that files have same structure
    columns = set(anonymized.columns)
    assert 'execution_date' in columns
    assert 'uuid' in columns
    assert anonymized.shape == (2, 456)
    assert raw.shape == (3, 454)

    # check pii encrypted
    pii_columns = [22, 32, 372, 373, 374, 375, 376, 377, 391, 398]
    raw_equivalent = [str(i - 1) for i in pii_columns]
    # DataFrame.eq aligns both frames on their column labels. The raw frame
    # is labelled with ints and the anonymized one with strings, so without
    # relabelling no column would pair up and the comparison would be False
    # everywhere regardless of the data. Give the raw slice the anonymized
    # labels so the columns are actually compared pairwise.
    raw_pii = raw[pii_columns].copy()
    raw_pii.columns = raw_equivalent
    # rows are still aligned on the index; the extra raw row has no
    # counterpart in the anonymized frame and therefore compares as unequal
    equal_pii_columns = anonymized[raw_equivalent].eq(raw_pii)
    # check if any column has any value the same
    assert not equal_pii_columns.any().any()
예제 #2
0
    def test_anonymization(self, mock_anonymizer, populate_s3):
        """Check the ``uuid`` column written for ``anonymization.csv``.

        The operator is run with ``msisdn_column="msisdn"`` and the output
        CSV is read back as strings. Rows are expected to carry either the
        mocked uuid or a missing value (presumably the rows whose msisdn is
        not numeric -- confirm against the fixture file).
        """
        # explicit test where some non numeric values are interspersed
        # with numeric in an msisdn column
        mock_anonymizer({"8675309": "1234567890"})
        populate_s3("bucket/", "out/")

        run_operator(
            PIIOperator(
                task_id="ingest",
                input_path="bucket/prefix",
                output_path="out/prefix",
                key_pattern="anonymization.csv",
                msisdn_column="msisdn",
            ))
        path = "out/prefix/2020/04/01/00/anonymization.csv"
        df = pd.read_csv(get_object(path=path), dtype=str)
        # NaN never compares equal to NaN, so a list comparison against
        # np.nan only passes when pandas hands back the very same nan object.
        # Normalise missing values to None to compare by value instead.
        uuids = [None if pd.isna(value) else value for value in df.uuid]
        assert uuids == [
            "1234567890",
            "1234567890",
            None,
            None,
            None,
            "1234567890",
            None,
            "1234567890",
        ]
예제 #3
0
    def test_pii_ingestion(self, path, out_prefix, populate_s3):
        """Run the operator on ``key.csv`` and inspect the resulting CSV.

        ``path`` is used as the operator input path and ``out_prefix``
        selects between the ``out/prefix`` and ``out`` output locations.
        The output must not expose the msisdn ``123`` and must contain the
        column added by the transform callback.
        """
        populate_s3("bucket/prefix", "out/")

        def add_callback_column(frame, *_):
            # transform hook: tag every row so the test can see it was applied
            frame["callback_column"] = True
            return frame

        if out_prefix:
            out_path = "out/prefix"
        else:
            out_path = "out"

        operator = PIIOperator(
            task_id="ingest",
            input_path=path,
            output_path=out_path,
            key_pattern=r"key.csv",
            pii_columns=["msisdn"],
            transform_func=add_callback_column,
        )
        run_operator(operator)

        result_path = f"s3://{out_path}/2020/04/01/00/key.csv"
        result = pd.read_csv(get_object(path=result_path))

        # the value may come back as text or as a number, so reject both
        first_msisdn = result.msisdn[0]
        assert first_msisdn not in {"123", 123}
        assert result.shape == (1, 4)
        expected_columns = [
            "msisdn",
            "pet",
            "callback_column",
            "execution_date",
        ]
        assert result.columns.tolist() == expected_columns
예제 #4
0
    def test_anonymization_cache(self, mock_anonymizer, populate_s3):
        """Exercise writing and then reading the msisdn/uuid lookup file.

        The first run passes ``uuid_write_path`` and the gzip file found at
        that path is checked for the single mocked mapping. The second run
        passes the same path as ``uuid_read_path``; it has no assertion and
        only needs to complete without raising.
        """
        mock_anonymizer({"8675309": "1234567890"})
        populate_s3("bucket/", "out/")
        uuid_lookup = "out/msisdn_lookup.csv.gz"

        def make_operator(**cache_option):
            # both runs share everything except the lookup-path argument
            return PIIOperator(
                task_id="ingest",
                input_path="bucket/prefix",
                output_path="out/prefix",
                key_pattern="anonymization.csv",
                msisdn_column="msisdn",
                **cache_option,
            )

        # test dump
        run_operator(make_operator(uuid_write_path=uuid_lookup))
        with gzip.open(get_object(path=uuid_lookup), "rt") as lookup_file:
            dumped = lookup_file.read()
        assert dumped == "8675309,1234567890\n"

        # test load
        run_operator(make_operator(uuid_read_path=uuid_lookup))
예제 #5
0
def test_timbrazil_ingest_sms(dagbag, populate_s3, mock_anonymizer):
    """Run ``timbrazil.anonymize_sms`` and check that its output object exists.

    Only the truthiness of the object fetched from ``timbrazil-public`` is
    asserted; its contents are not inspected.
    """
    mock_anonymizer({'5548999027010': '1234567890'})
    populate_s3('timbrazil-internal', 'timbrazil-public')

    sms_dag = dagbag.get_dag('timbrazil.anonymize_sms')
    run_dag_tasks(sms_dag)

    output_key = 'ocs/ocs_sms/2020/04/01/00/ocs_sms_01_20200401_0001.csv'
    output = get_object(bucket='timbrazil-public', key=output_key)
    assert output
예제 #6
0
def test_timbrazil_ingest_large_file(dagbag, populate_s3, mock_anonymizer,
                                     monkeypatch):
    """Run ``timbrazil.anonymize_voice`` with the first task's chunksize at 1.

    Patching ``chunksize`` to 1 is meant to emulate a large input file.
    The raw input and the anonymized output are then loaded and only their
    shapes are checked.
    """
    internal_bucket = 'timbrazil-internal'
    public_bucket = 'timbrazil-public'

    mock_anonymizer({'5548999027010': '1234567890'})
    # seed the mocked S3 buckets the DAG works on
    populate_s3(internal_bucket, public_bucket)

    voice_dag = dagbag.get_dag('timbrazil.anonymize_voice')
    # a chunksize of 1 emulates a large file
    first_task = voice_dag.tasks[0]
    monkeypatch.setattr(first_task, 'chunksize', 1)
    run_dag_tasks(voice_dag)

    # load the DAG input and output for comparison
    prefix = 'ocs/ocs_moc/2020/04/01/00'
    raw_object = get_object(bucket=internal_bucket,
                            key=f'{prefix}/ocs_moc_01_20200401_0001.GZ')
    raw = pd.read_csv(raw_object,
                      compression='gzip',
                      header=None,
                      skiprows=1,
                      delimiter='|')

    anonymized_object = get_object(
        bucket=public_bucket,
        key=f'{prefix}/ocs_moc_01_20200401_0001.csv')
    anonymized = pd.read_csv(anonymized_object)

    assert anonymized.shape == (2, 456)
    assert raw.shape == (3, 454)