Example #1
def test_reduce_4():
    reductions = ['first', 'last', 'append', 'prepend', 'count', 'count_unique']
    for red in reductions:
        schema = Schema()
        schema.add_string_column('col1')
        schema.add_string_column('col2')

        tp = TransformProcess(schema)
        tp.reduce('col1', red)

        tp.to_java()
Example #2
def test_reduce_1():
    reductions = ['sum', 'mean', 'std', 'var', 'prod']
    for red in reductions:
        schema = Schema()
        schema.add_string_column('name')
        schema.add_double_column('amount')
        schema.add_integer_column('hours')

        tp = TransformProcess(schema)
        tp.reduce('name', red)

        tp.to_java()
Example #3
def test_derive_col_from_time():
    schema = Schema()
    schema.add_string_column('str1')
    schema.add_string_column('str2')

    tp = TransformProcess(schema)

    tp.string_to_time('str1')
    tp.derive_column_from_time('str1', 'hour', 'hour_of_day')

    assert 'hour' in tp.final_schema.columns

    tp.to_java()
Example #4
def test_replace_empty():
    schema = Schema()
    schema.add_string_column('str1')

    tp = TransformProcess(schema)
    tp.replace_empty_string('str1', 'xx')

    tp.to_java()
Example #5
def test_remove_white_spaces():
    schema = Schema()
    schema.add_string_column('str1')

    tp = TransformProcess(schema)
    tp.remove_white_spaces('str1')

    tp.to_java()
Example #6
def test_lower():
    schema = Schema()
    schema.add_string_column('str1')

    tp = TransformProcess(schema)
    tp.lower('str1')

    tp.to_java()
Example #7
def test_append_string():
    schema = Schema()
    schema.add_string_column('str1')

    tp = TransformProcess(schema)
    tp.append_string('str1', 'xxx')

    tp.to_java()
Example #8
def test_cat_to_int():
    schema = Schema()
    schema.add_categorical_column('cat', ['A', 'B', 'C'])

    tp = TransformProcess(schema)
    tp.categorical_to_integer('cat')

    assert tp.final_schema.get_column_type('cat') == 'integer'

    tp.to_java()
Example #9
def test_rename():
    schema = Schema()
    schema.add_string_column('str1')

    tp = TransformProcess(schema)
    tp.rename_column('str1', 'str2')

    assert 'str1' not in tp.final_schema.columns
    assert 'str2' in tp.final_schema.columns

    tp.to_java()
Example #10
def test_remove():
    schema = Schema()
    schema.add_string_column('str1')
    schema.add_string_column('str2')

    tp = TransformProcess(schema)
    tp.remove_column('str1')

    assert list(tp.final_schema.columns.keys()) == ['str2']

    tp.to_java()
Example #11
def test_concat():
    schema = Schema()
    schema.add_string_column('str1')
    schema.add_string_column('str2')

    tp = TransformProcess(schema)
    tp.concat(['str1', 'str2'], 'str3')

    assert 'str3' in tp.final_schema.columns

    tp.to_java()
Example #12
def test_str_to_time():
    schema = Schema()
    schema.add_string_column('str1')
    schema.add_string_column('str2')

    tp = TransformProcess(schema)

    tp.string_to_time('str1')

    assert tp.final_schema.get_column_type('str1') == 'DateTime'

    tp.to_java()
Example #15
# Imports implied by the snippet below (the original excerpt starts mid-file)
import numpy as np
import pyspark
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from pydatavec import Schema, TransformProcess

# We use pyspark to filter empty lines
sc = pyspark.SparkContext(master='local[*]', appName='iris')
data = sc.textFile('iris.data')
filtered_data = data.filter(lambda d: len(d) > 0)

# Define Input Schema
input_schema = Schema()
input_schema.add_double_column('Sepal length')
input_schema.add_double_column('Sepal width')
input_schema.add_double_column('Petal length')
input_schema.add_double_column('Petal width')
input_schema.add_categorical_column(
    "Species", ["Iris-setosa", "Iris-versicolor", "Iris-virginica"])

# Define Transform Process
tp = TransformProcess(input_schema)
tp.one_hot("Species")

# Do the transformation on spark and convert to numpy
output = tp(filtered_data)
np_array = np.array([[float(i) for i in x.split(',')] for x in output])
x = np_array[:, :-3]
y = np_array[:, -3:]

# Build the Keras model
model = Sequential()
model.add(Dense(10, input_shape=(4,), activation='relu', name='fc1'))
model.add(Dense(10, activation='relu', name='fc2'))
model.add(Dense(3, activation='softmax', name='output'))

optimizer = Adam(lr=0.001)
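
# The excerpt above stops after creating the optimizer. A minimal, hedged
# completion of the Keras workflow follows; the loss, epoch count and batch
# size are illustrative assumptions, not part of the original example.
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.fit(x, y, epochs=100, batch_size=16)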
Example #16
each person has entered each country.
'''

from pydatavec import Schema, TransformProcess

# Define the input schema

schema = Schema()
schema.add_string_column('person')
schema.add_categorical_column('country_visited',
                              ['USA', 'Japan', 'China', 'India'])
schema.add_string_column('entry_time')

# Define the operations we want to do

tp = TransformProcess(schema)

# Parse date-time
# Format for parsing times is as per http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html

tp.string_to_time('entry_time', 'YYYY/MM/dd')

# Take the "country_visited" column and expand it to a one-hot representation
# So, "USA" becomes [1,0,0,0], "Japan" becomes [0,1,0,0], "China" becomes [0,0,1,0] etc

tp.one_hot('country_visited')

# For each person, reduce all columns using the `sum` op, except for entry_time, which is reduced with `max`:
tp.reduce('person', 'sum', {'entry_time': 'max'})

# Rename column
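# The original snippet is cut off after this comment. A hedged sketch of the
# rename step follows; the post-reduce source column name ('max(entry_time)')
# and the new name are illustrative assumptions, not taken from the original.
tp.rename_column('max(entry_time)', 'most_recent_entry')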
Example #17
input_schema.add_categorical_column("MerchantCountryCode",
                                    ["USA", "CAN", "FR", "MX"])

# Some columns have restrictions on the allowable values that we consider valid:

input_schema.add_double_column(
    "TransactionAmountUSD", 0.0, None, False,
    False)  # $0.0 or more, no maximum limit, no NaN and no Infinite values

input_schema.add_categorical_column("FraudLabel", ["Fraud", "Legit"])

# Let's define some operations to execute on the data...
# We do this by defining a TransformProcess
# At each step, we identify columns by the names we gave them in the input data schema, above

tp = TransformProcess(input_schema)

# Let's remove some columns we don't need

tp.remove_column("CustomerID")
tp.remove_column("MerchantID")

# Now, suppose we only want to analyze transactions involving merchants in the USA or Canada. Let's filter out
# everything except for those countries.
# Here, we are applying a conditional filter: we remove all of the examples that match the condition.
# The condition is that "MerchantCountryCode" is not one of {"USA", "CAN"}

tp.filter(NotInSet("MerchantCountryCode", ["USA", "CAN"]))

# Let's suppose our data source isn't perfect, and we have some invalid data: negative dollar amounts that we want to replace with 0.0
# For positive dollar amounts, we don't want to modify those values
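
# The conditional-replace step itself is missing from this excerpt. A hedged
# sketch is given below: the replace() call and the LessThan condition are
# assumptions modelled on the filter()/NotInSet pattern above, not confirmed
# pydatavec API.
tp.replace(['TransactionAmountUSD'], 0.0, LessThan('TransactionAmountUSD', 0.0))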
Example #18
# Imports implied by the snippet below; 'download_file' is assumed to be a
# small helper (defined earlier in the full example) that fetches a URL to a
# local path.
import os

import pyspark
from pydatavec import Schema, TransformProcess

filename = 'iris.data'  # inferred from the sc.textFile('iris.data') call below
temp_filename = filename + '_temp'
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

if not os.path.isfile(filename):
    if os.path.isfile(temp_filename):
        os.remove(temp_filename)
    download_file(url, temp_filename)
    os.rename(temp_filename, filename)

# We use pyspark to filter empty lines
sc = pyspark.SparkContext(master='local[*]', appName='iris')
data = sc.textFile('iris.data')
filtered_data = data.filter(lambda x: len(x) > 0)

# Define Input Schema
input_schema = Schema()
input_schema.add_double_column('Sepal length')
input_schema.add_double_column('Sepal width')
input_schema.add_double_column('Petal length')
input_schema.add_double_column('Petal width')
input_schema.add_categorical_column("Species", ["Iris-setosa", "Iris-versicolor", "Iris-virginica"])

# Define Transform Process
tp = TransformProcess(input_schema)
tp.categorical_to_integer("Species")

# Do the transformation on spark
output = tp(filtered_data)

print(list(output))