def test_reduce_1():
    """Smoke-test ``TransformProcess.reduce`` for several reduction names.

    For each reduction a fresh three-column schema is built, the data is
    reduced on the ``'name'`` column, and the process is passed through
    ``to_java()``. There are no assertions: the test passes as long as
    none of these calls raise.
    """
    for reduction in ('sum', 'mean', 'std', 'var', 'prod'):
        # A new schema per reduction keeps the iterations independent.
        input_schema = Schema()
        input_schema.add_string_column('name')
        input_schema.add_double_column('amount')
        input_schema.add_integer_column('hours')

        process = TransformProcess(input_schema)
        process.reduce('name', reduction)
        process.to_java()
def test_schema():
    """Smoke-test building a schema with every column type used here.

    Two columns each of string, integer, double, float and categorical
    type are added, in that order, and the schema is then passed through
    ``to_java()``. There are no assertions: the test passes as long as
    none of these calls raise.
    """
    schema = Schema()

    for column in ('str1', 'str2'):
        schema.add_string_column(column)
    for column in ('int1', 'int2'):
        schema.add_integer_column(column)
    for column in ('dbl1', 'dbl2'):
        schema.add_double_column(column)
    for column in ('flt1', 'flt2'):
        schema.add_float_column(column)
    for column in ('cat1', 'cat2'):
        # Build a new list per column so the two columns never share one.
        schema.add_categorical_column(column, ['A', 'B', 'C'])

    schema.to_java()
# Basic example
from pydatavec import LessThan, NotInSet, Schema, TransformProcess

# Describe the schema of the data we want to import.
# Columns must be declared here in the same order as they appear in the
# input data.
input_schema = Schema()
input_schema.add_string_column("DateTimeString")
input_schema.add_string_column("CustomerID")
input_schema.add_string_column("MerchantID")
input_schema.add_integer_column("NumItemsInTransaction")
input_schema.add_categorical_column(
    "MerchantCountryCode",
    ["USA", "CAN", "FR", "MX"],
)

# Some columns restrict which values are considered valid:
input_schema.add_double_column(
    "TransactionAmountUSD",
    0.0,    # $0.0 or more
    None,   # no maximum limit
    False,  # no NaN values
    False,  # no infinite values
)
input_schema.add_categorical_column("FraudLabel", ["Fraud", "Legit"])

# Next, let's define some operations to execute on the data.
# We do this by defining a TransformProcess.
# At each step, columns are identified by the names we gave them in the
# input data schema above.