def test_remove_white_spaces():
    """Smoke test: add a white-space removal step and call ``to_java()``.

    Builds a schema with a single string column, registers
    ``remove_white_spaces`` on it and converts the process via ``to_java()``.
    """
    input_schema = Schema()
    input_schema.add_string_column('str1')

    process = TransformProcess(input_schema)
    process.remove_white_spaces('str1')
    process.to_java()
def test_lower():
    """Smoke test: add a ``lower`` step on a string column and call ``to_java()``."""
    input_schema = Schema()
    input_schema.add_string_column('str1')

    process = TransformProcess(input_schema)
    process.lower('str1')
    process.to_java()
def test_replace_empty():
    """Smoke test: add a ``replace_empty_string`` step and call ``to_java()``.

    The step is registered on column ``'str1'`` with ``'xx'`` as its second
    argument.
    """
    input_schema = Schema()
    input_schema.add_string_column('str1')

    process = TransformProcess(input_schema)
    process.replace_empty_string('str1', 'xx')
    process.to_java()
def test_append_string():
    """Smoke test: add an ``append_string`` step and call ``to_java()``.

    The step is registered on column ``'str1'`` with ``'xxx'`` as its second
    argument.
    """
    input_schema = Schema()
    input_schema.add_string_column('str1')

    process = TransformProcess(input_schema)
    process.append_string('str1', 'xxx')
    process.to_java()
def test_remove():
    """Remove one of two string columns and check that only the other remains.

    Builds a schema with columns ``'str1'`` and ``'str2'``, removes ``'str1'``
    through the transform process, verifies the final schema's column names
    and then calls ``to_java()`` on the process.
    """
    schema = Schema()
    schema.add_string_column('str1')
    schema.add_string_column('str2')
    tp = TransformProcess(schema)
    tp.remove_column('str1')
    # On Python 3 ``keys()`` returns a view object, which never compares equal
    # to a list; materialize it so the assertion actually checks the names.
    assert list(tp.final_schema.columns.keys()) == ['str2']
    tp.to_java()
def test_concat():
    """Concatenate two string columns and check the new column is present.

    After ``concat(['str1', 'str2'], 'str3')`` the final schema's columns
    must contain ``'str3'``; the process is then converted via ``to_java()``.
    """
    input_schema = Schema()
    for column in ('str1', 'str2'):
        input_schema.add_string_column(column)

    process = TransformProcess(input_schema)
    process.concat(['str1', 'str2'], 'str3')

    assert 'str3' in process.final_schema.columns
    process.to_java()
def test_reduce_4():
    """Smoke test: run several reductions keyed on a string column.

    For each reduction name a fresh two-string-column schema and transform
    process are built, ``reduce('col1', <name>)`` is registered and
    ``to_java()`` is called.
    """
    for reduction in ['first', 'last', 'append', 'prepend', 'count', 'count_unique']:
        # A new schema/process per reduction keeps the iterations independent.
        input_schema = Schema()
        for column in ('col1', 'col2'):
            input_schema.add_string_column(column)

        process = TransformProcess(input_schema)
        process.reduce('col1', reduction)
        process.to_java()
def test_rename():
    """Rename a string column and check the final schema reflects it.

    After ``rename_column('str1', 'str2')`` the old name must be absent from
    the final schema's columns and the new name present; the process is then
    converted via ``to_java()``.
    """
    input_schema = Schema()
    input_schema.add_string_column('str1')

    process = TransformProcess(input_schema)
    process.rename_column('str1', 'str2')

    assert 'str1' not in process.final_schema.columns
    assert 'str2' in process.final_schema.columns
    process.to_java()
def test_str_to_time():
    """Convert a string column to time and check its reported column type.

    After ``string_to_time('str1')`` the final schema must report the type of
    ``'str1'`` as ``'DateTime'``; the process is then converted via
    ``to_java()``.
    """
    input_schema = Schema()
    for column in ('str1', 'str2'):
        input_schema.add_string_column(column)

    process = TransformProcess(input_schema)
    process.string_to_time('str1')

    assert process.final_schema.get_column_type('str1') == 'DateTime'
    process.to_java()
def test_reduce_1():
    """Smoke test: run several reductions on a mixed-type schema.

    For each reduction name a fresh schema (string ``'name'``, double
    ``'amount'``, integer ``'hours'``) and transform process are built,
    ``reduce('name', <name>)`` is registered and ``to_java()`` is called.
    """
    for reduction in ['sum', 'mean', 'std', 'var', 'prod']:
        # A new schema/process per reduction keeps the iterations independent.
        input_schema = Schema()
        input_schema.add_string_column('name')
        input_schema.add_double_column('amount')
        input_schema.add_integer_column('hours')

        process = TransformProcess(input_schema)
        process.reduce('name', reduction)
        process.to_java()
def test_schema():
    """Smoke test: build a schema with every column kind and call ``to_java()``.

    Two columns each of string, integer, double, float and categorical type
    are added, in that order, before the schema is converted.
    """
    built = Schema()

    for column in ('str1', 'str2'):
        built.add_string_column(column)
    for column in ('int1', 'int2'):
        built.add_integer_column(column)
    for column in ('dbl1', 'dbl2'):
        built.add_double_column(column)
    for column in ('flt1', 'flt2'):
        built.add_float_column(column)
    for column in ('cat1', 'cat2'):
        # A fresh category list is passed on every call, as in a literal call.
        built.add_categorical_column(column, ['A', 'B', 'C'])

    built.to_java()
def test_derive_col_from_time():
    """Derive a column from a time column and check it is present.

    ``'str1'`` is first converted with ``string_to_time``; then
    ``derive_column_from_time('str1', 'hour', 'hour_of_day')`` is registered
    and ``'hour'`` must appear in the final schema's columns. The process is
    finally converted via ``to_java()``.
    """
    input_schema = Schema()
    for column in ('str1', 'str2'):
        input_schema.add_string_column(column)

    process = TransformProcess(input_schema)
    process.string_to_time('str1')
    process.derive_column_from_time('str1', 'hour', 'hour_of_day')

    assert 'hour' in process.final_schema.columns
    process.to_java()
# under the License. # # SPDX-License-Identifier: Apache-2.0 ################################################################################ # Basic example from pydatavec import Schema, TransformProcess from pydatavec import NotInSet, LessThan # Let's define the schema of the data that we want to import # The order in which columns are defined here should match the order in which they appear in the input data input_schema = Schema() input_schema.add_string_column("DateTimeString") input_schema.add_string_column("CustomerID") input_schema.add_string_column("MerchantID") input_schema.add_integer_column("NumItemsInTransaction") input_schema.add_categorical_column("MerchantCountryCode", ["USA", "CAN", "FR", "MX"]) # Some columns have restrictions on the allowable values, that we consider valid: input_schema.add_double_column( "TransactionAmountUSD", 0.0, None, False, False) # $0.0 or more, no maximum limit, no NaN and no Infinite values input_schema.add_categorical_column("FraudLabel", ["Fraud", "Legit"])
# under the License. # # SPDX-License-Identifier: Apache-2.0 ################################################################################ ''' In this simple example: We'll show how to combine multiple independent records by key. Specifically, assume we have data like "person,country_visited,entry_time" and we want to know how many times each person has entered each country. ''' from pydatavec import Schema, TransformProcess # Define the input schema schema = Schema() schema.add_string_column('person') schema.add_categorical_column('country_visited', ['USA', 'Japan', 'China', 'India']) schema.add_string_column('entry_time') # Define the operations we want to do tp = TransformProcess(schema) # Parse date-time # Format for parsing times is as per http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html tp.string_to_time('entry_time', 'YYYY/MM/dd') # Take the "country_visited" column and expand it to a one-hot representation # So, "USA" becomes [1,0,0,0], "Japan" becomes [0,1,0,0], "China" becomes [0,0,1,0] etc