Exemplo n.º 1
0
def test_all_null_inferred_type_is_null():
    """A tracker fed nothing but a single NULL count infers the NULL type."""
    schema = SchemaTracker()
    multiple_track(schema, {Type.NULL: 1})
    inferred = schema.infer_type()
    assert inferred.type == Type.NULL
Exemplo n.º 2
0
def test_majority_int():
    """INTEGRAL is inferred when it has 50 counts against 30 STRING and 20 UNKNOWN."""
    schema = SchemaTracker()
    multiple_track(
        schema,
        {Type.INTEGRAL: 50, Type.STRING: 30, Type.UNKNOWN: 20},
    )
    inferred = schema.infer_type()
    assert inferred.type == Type.INTEGRAL
Exemplo n.º 3
0
def test_all_types_equal_coerced_to_string():
    """A mix of INTEGRAL, FRACTIONAL and STRING counts is inferred as STRING.

    Despite the test name, the counts used are not equal: 20 INTEGRAL,
    29 FRACTIONAL and 50 STRING.
    """
    mixed_counts = {
        Type.INTEGRAL: 20,
        Type.FRACTIONAL: 29,
        Type.STRING: 50,
    }
    schema = SchemaTracker()
    multiple_track(schema, counts=mixed_counts)
    inferred = schema.infer_type()
    assert inferred.type == Type.STRING
Exemplo n.º 4
0
def test_float_and_int():
    """Equal INTEGRAL and FRACTIONAL counts plus a few STRING infer FRACTIONAL."""
    numeric_heavy_counts = {
        Type.INTEGRAL: 50,
        Type.FRACTIONAL: 50,
        Type.STRING: 10,
    }
    schema = SchemaTracker()
    multiple_track(schema, counts=numeric_heavy_counts)
    inferred = schema.infer_type()
    assert inferred.type == Type.FRACTIONAL
Exemplo n.º 5
0
def test_round_trip_with_legacy():
    """A tracker built with ``legacy_null_count=1`` reports one NULL count.

    Note that nothing is serialized here; the tracker is only constructed
    from an initial count mapping that has no NULL entry.
    """
    initial_counts = {
        Type.INTEGRAL: 3,
        Type.STRING: 4,
        Type.FRACTIONAL: 5,
        Type.BOOLEAN: 6,
        Type.UNKNOWN: 1,
    }
    schema = SchemaTracker(initial_counts, legacy_null_count=1)
    null_count = schema.get_count(Type.NULL)
    assert null_count == 1
Exemplo n.º 6
0
def test_serialization_roundtrip():
    """Counts survive a ``to_protobuf`` / ``from_protobuf`` cycle."""
    original = SchemaTracker()
    multiple_track(original, {Type.INTEGRAL: 10, Type.STRING: 100})

    serialized = original.to_protobuf()
    restored = SchemaTracker.from_protobuf(serialized)

    # Both trackers must serialize to equal messages...
    assert original.to_protobuf() == restored.to_protobuf()
    # ...and the restored tracker must report the counts that were tracked.
    assert restored.get_count(Type.INTEGRAL) == 10
    assert restored.get_count(Type.STRING) == 100
Exemplo n.º 7
0
 def __init__(
     self,
     name: str,
     number_tracker: NumberTracker = None,
     string_tracker: StringTracker = None,
     schema_tracker: SchemaTracker = None,
     counters: CountersTracker = None,
     frequent_items: FrequentItemsSketch = None,
     cardinality_tracker: HllSketch = None,
 ):
     """
     Set up the instance's trackers, creating a default for each one left as None.

     Parameters
     ----------
     name : str
         Stored as ``column_name``.
     number_tracker : NumberTracker, optional
         Defaults to a new ``NumberTracker()``.
     string_tracker : StringTracker, optional
         Defaults to a new ``StringTracker()``.
     schema_tracker : SchemaTracker, optional
         Defaults to a new ``SchemaTracker()``.
     counters : CountersTracker, optional
         Defaults to a new ``CountersTracker()``.
     frequent_items : FrequentItemsSketch, optional
         Defaults to a new ``FrequentItemsSketch()``.
     cardinality_tracker : HllSketch, optional
         Defaults to a new ``HllSketch()``.
     """
     self.column_name = name
     # A fresh default object is built per instance whenever an argument is None.
     self.counters = CountersTracker() if counters is None else counters
     self.number_tracker = (
         NumberTracker() if number_tracker is None else number_tracker
     )
     self.string_tracker = (
         StringTracker() if string_tracker is None else string_tracker
     )
     self.schema_tracker = (
         SchemaTracker() if schema_tracker is None else schema_tracker
     )
     self.frequent_items = (
         FrequentItemsSketch() if frequent_items is None else frequent_items
     )
     self.cardinality_tracker = (
         HllSketch() if cardinality_tracker is None else cardinality_tracker
     )
Exemplo n.º 8
0
def test_summary():
    """The summary exposes per-type counts by name and the inferred type."""
    type_counts = {
        Type.INTEGRAL: 3,
        Type.STRING: 4,
        Type.FRACTIONAL: 5,
        Type.BOOLEAN: 6,
        Type.UNKNOWN: 1,
    }
    schema = SchemaTracker()
    multiple_track(schema, type_counts)

    summary = schema.to_summary()
    summarized = summary.type_counts

    # The summary is keyed by type name; compare each entry with what was tracked.
    name_to_type = (
        ("INTEGRAL", Type.INTEGRAL),
        ("STRING", Type.STRING),
        ("FRACTIONAL", Type.FRACTIONAL),
        ("BOOLEAN", Type.BOOLEAN),
        ("UNKNOWN", Type.UNKNOWN),
    )
    for type_name, type_value in name_to_type:
        assert summarized[type_name] == type_counts[type_value]

    assert summary.inferred_type.type == schema.infer_type().type
Exemplo n.º 9
0
def test_merge_total_counts_match():
    """Merging two trackers yields the per-type sum of their counts."""
    first_counts = {
        Type.INTEGRAL: 10,
        Type.FRACTIONAL: 10,
        Type.BOOLEAN: 10,
        Type.UNKNOWN: 10,
    }
    first = SchemaTracker()
    multiple_track(first, first_counts)

    second_counts = {
        Type.INTEGRAL: 20,
        Type.FRACTIONAL: 20,
        Type.BOOLEAN: 20,
        Type.UNKNOWN: 20,
    }
    second = SchemaTracker()
    multiple_track(second, second_counts)

    combined = first.merge(second)
    # 10 from the first tracker plus 20 from the second, for every type.
    for tracked_type in (
        Type.INTEGRAL,
        Type.FRACTIONAL,
        Type.BOOLEAN,
        Type.UNKNOWN,
    ):
        assert combined.get_count(tracked_type) == 30

    # The merged tracker must still survive a serialization round trip.
    SchemaTracker.from_protobuf(combined.to_protobuf())
Exemplo n.º 10
0
def test_mainly_null_inferred_type_not_null():
    """NULL is not inferred when any non-NULL count exists, however few."""
    # Scenario 1: one INTEGRAL among 30 NULLs is inferred as INTEGRAL.
    single_type = SchemaTracker()
    multiple_track(single_type, {Type.INTEGRAL: 1, Type.NULL: 30})
    assert single_type.infer_type().type == Type.INTEGRAL

    # Scenario 2: with two non-NULL types present, the result is still not NULL.
    two_types = SchemaTracker()
    multiple_track(two_types, {Type.INTEGRAL: 1, Type.STRING: 2, Type.NULL: 30})
    assert two_types.infer_type().type != Type.NULL
Exemplo n.º 11
0
    def from_protobuf(message):
        """
        Load from a protobuf message

        Parameters
        ----------
        message
            Message whose ``name``, ``counters``, ``schema``, ``numbers``,
            ``strings``, ``frequent_items`` and ``cardinality_tracker``
            fields are read.

        Returns
        -------
        column_profile : ColumnProfile
        """
        # Each field is decoded by the matching tracker's own from_protobuf.
        name = message.name
        counters = CountersTracker.from_protobuf(message.counters)
        schema = SchemaTracker.from_protobuf(message.schema)
        numbers = NumberTracker.from_protobuf(message.numbers)
        strings = StringTracker.from_protobuf(message.strings)
        frequent = FrequentItemsSketch.from_protobuf(message.frequent_items)
        cardinality = HllSketch.from_protobuf(message.cardinality_tracker)
        return ColumnProfile(
            name,
            counters=counters,
            schema_tracker=schema,
            number_tracker=numbers,
            string_tracker=strings,
            frequent_items=frequent,
            cardinality_tracker=cardinality,
        )
Exemplo n.º 12
0
def test_track_datatype_counts():
    """``get_count`` returns the count that was tracked for each type."""
    type_counts = {
        Type.INTEGRAL: 2,
        Type.STRING: 2,
        Type.FRACTIONAL: 2,
        Type.BOOLEAN: 2,
        Type.UNKNOWN: 2,
    }
    schema = SchemaTracker()
    multiple_track(schema, type_counts)

    for tracked_type, expected in type_counts.items():
        assert expected == schema.get_count(tracked_type)
Exemplo n.º 13
0
def test_70percent_string():
    """STRING is inferred when it has 71 counts against 29 INTEGRAL."""
    schema = SchemaTracker()
    multiple_track(schema, {Type.INTEGRAL: 29, Type.STRING: 71})
    inferred = schema.infer_type()
    assert inferred.type == Type.STRING
Exemplo n.º 14
0
def test_track_nothing_should_return_unknown():
    """A tracker with nothing tracked infers UNKNOWN with a ratio of 0.0."""
    result = SchemaTracker().infer_type()
    assert result.type == Type.UNKNOWN
    assert result.ratio == 0.0