Exemplo n.º 1
0
def test_sets_only_existing_adapters():
    """Check how sampling names in the configuration are accepted or rejected.

    With the sampling name 'lucky_guess' the configuration parser is expected
    to raise ``AttributeError`` and the relation's sampling must still be a
    ``DefaultSampling``. With 'brute_force' the configuration must parse, and
    the object returned by ``_set_overriding_params_for_node`` for the
    relation must carry a ``BruteForceSampling``.
    """
    graph = SnowShuGraph()

    relation = Relation(database='SNOWSHU_DEVELOPMENT',
                        schema='SOURCE_SYSTEM',
                        name='ORDER_ITEMS',
                        materialization=mz.TABLE,
                        attributes=[])
    relation.include_outliers = False
    relation.unsampled = False
    relation.sampling = DefaultSampling()

    # Work on a private copy so the shared CONFIGURATION is not mutated.
    raw_config = copy.deepcopy(CONFIGURATION)
    raw_config['preserve_case'] = True
    relation_entry = raw_config['source']['specified_relations'][1]

    # Sampling name expected to be rejected while parsing.
    relation_entry['sampling'] = 'lucky_guess'
    with pytest.raises(AttributeError):
        ConfigurationParser().from_file_or_path(
            StringIO(yaml.dump(raw_config)))

    assert isinstance(relation.sampling, DefaultSampling)

    # Sampling name expected to be accepted.
    relation_entry['sampling'] = 'brute_force'
    parsed_config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(raw_config)))

    overridden = graph._set_overriding_params_for_node(relation, parsed_config)
    assert isinstance(overridden.sampling, BruteForceSampling)
Exemplo n.º 2
0
def test_traverse_and_execute_analyze(stub_graph_set):
    """Exercise ``GraphSetRunner._traverse_and_execute`` on a dag and an iso graph.

    Both adapters are mocks; the source adapter reports a population of 1000
    and a sample of 100. Each executable is built with ``True`` as its fourth
    argument (presumably the analyze flag -- confirm against
    ``GraphExecutable``). After execution every relation must be marked as
    extracted but not loaded, must expose the stubbed sizes, and must not
    hold a ``DataFrame`` in ``data``.
    """
    source_adapter = mock.MagicMock()
    target_adapter = mock.MagicMock()

    # Every statement builder on the source adapter yields an empty string.
    for statement_builder in (
            source_adapter.predicate_constraint_statement,
            source_adapter.upstream_constraint_statement,
            source_adapter.union_constraint_statement,
            source_adapter.sample_statement_from_relation):
        statement_builder.return_value = ''
    source_adapter.scalar_query.return_value = 1000
    source_adapter.check_count_and_query.return_value = pd.DataFrame(
        [{'population_size': 1000, 'sample_size': 100}])

    runner = GraphSetRunner()
    runner.barf = False
    graph_set, vals = stub_graph_set

    # --- longer dag: the last graph in the set ---
    dag = copy.deepcopy(graph_set[-1])
    for relation in dag.nodes:
        # stub in the sampling defaults
        relation.unsampled = False
        relation.include_outliers = False
        relation.sampling = DefaultSampling()

    runner._traverse_and_execute(
        GraphExecutable(dag, source_adapter, target_adapter, True))

    for relation in dag.nodes:
        assert not isinstance(getattr(relation, 'data', None), pd.DataFrame)
        assert relation.source_extracted is True
        assert relation.target_loaded is False
        assert relation.sample_size == 100
        assert relation.population_size == 1000

    # --- iso graph: the first graph in the set ---
    iso = copy.deepcopy(graph_set[0])
    first_node = list(iso.nodes)[0]
    first_node.sampling = DefaultSampling()
    first_node.unsampled = False
    first_node.include_outliers = False

    iso_executable = GraphExecutable(iso, source_adapter, target_adapter, True)
    assert not isinstance(getattr(vals.iso_relation, 'data', None),
                          pd.DataFrame)
    runner._traverse_and_execute(iso_executable)

    # Re-read the first node after execution before checking its state.
    iso_relation = list(iso.nodes)[0]
    assert iso_relation.source_extracted is True
    assert iso_relation.target_loaded is False
    assert iso_relation.sample_size == 100
    assert iso_relation.population_size == 1000
Exemplo n.º 3
0
def test_default_sampling_override_min(mock_args):
    """Expect 5000 sample rows from ``DefaultSampling(0.1, 0.50, 5000)``.

    The population size on the first mock argument is set to
    ``ONE_HUNDRED_THOUSAND_ROWS`` before ``prepare`` is called.
    NOTE(review): the third constructor argument is presumably a minimum
    sample size that overrides the computed one -- confirm against
    ``DefaultSampling``.
    """
    mock_args[0].population_size = ONE_HUNDRED_THOUSAND_ROWS
    sampling = DefaultSampling(0.1, 0.50, 5000)
    sampling.prepare(*mock_args)
    expected_rows = 5000
    assert sampling.sample_method.rows == expected_rows
Exemplo n.º 4
0
def test_default_sampling_fine(mock_args):
    """Expect 16588 sample rows from ``DefaultSampling(0.01, 0.99)``.

    The population size on the first mock argument is set to
    ``ONE_BILLION_ROWS`` before ``prepare`` is called.
    NOTE(review): the two constructor arguments are presumably margin of
    error and confidence level -- confirm against ``DefaultSampling``.
    """
    mock_args[0].population_size = ONE_BILLION_ROWS
    sampling = DefaultSampling(0.01, 0.99)
    sampling.prepare(*mock_args)
    expected_rows = 16588
    assert sampling.sample_method.rows == expected_rows
Exemplo n.º 5
0
def test_default_sampling_stock(mock_args):
    """Expect 4147 sample rows from a ``DefaultSampling`` built with no arguments.

    The population size on the first mock argument is set to
    ``ONE_BILLION_ROWS`` before ``prepare`` is called.
    """
    sampling = DefaultSampling()
    mock_args[0].population_size = ONE_BILLION_ROWS
    sampling.prepare(*mock_args)

    expected_rows = 4147
    assert sampling.sample_method.rows == expected_rows
Exemplo n.º 6
0
def stub_out_sampling(rel: Relation) -> Relation:
    """Attach a ``DefaultSampling`` with a fixed Bernoulli sample method to ``rel``.

    The relation is modified in place: its ``sampling`` becomes a new
    ``DefaultSampling`` whose ``sample_method`` is
    ``BernoulliSampleMethod(1500, units='rows')``.

    Args:
        rel: The relation to stub.

    Returns:
        The same ``rel`` object that was passed in.
    """
    stubbed_sampling = DefaultSampling()
    rel.sampling = stubbed_sampling
    stubbed_sampling.sample_method = BernoulliSampleMethod(1500, units='rows')
    return rel