Example #1
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

def string_index(spark_df):
    """Create an index for each categorical column of the data set."""
    for i in spark_df.columns:
        inp_col = str(i)
        out_col = str(i) + "_indexed"
        fit_on = spark_df.select(inp_col)
        df_i_indexed = StringIndexer(inputCol=inp_col, outputCol=out_col).fit(fit_on).transform(fit_on)
        indexed_col = df_i_indexed.select(out_col)
        print(i)
        indexed_col.printSchema()
        out_col_ohe = str(i) + "_encoded"
        try:
            # transform() returns a DataFrame; chaining .show() onto the
            # assignment would store None instead of the encoded frame
            df_i_encoded = OneHotEncoder(inputCol=out_col, outputCol=out_col_ohe).transform(df_i_indexed)
            df_i_encoded.select(out_col_ohe).show()
            # inputCols expects a list of column names
            vecAssembler = VectorAssembler(inputCols=[out_col_ohe], outputCol="features")
            # assemble from the frame that actually contains the encoded column
            vecAssembler.transform(df_i_encoded)
        except Exception:
            # skip columns that cannot be indexed or encoded (e.g. numeric ones)
            pass
    return None
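
# A minimal usage sketch for the helper above; assumes an active SparkSession
# named `spark`, and the sample column names here are illustrative only.
df = spark.createDataFrame(
    [("male", "S"), ("female", "C"), ("male", "Q")],
    ["Sex", "Embarked"])
string_index(df)  # prints each column name and the schema of its indexed column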
Example #2
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer

def load_csv(sc, filename='200[0-5].csv'):
    sql_context = SQLContext(sc)
    df = sql_context.read.option('mode', 'PERMISSIVE')\
                         .load(filename,
                               format='com.databricks.spark.csv',
                               header='true',
                               nullValue='NA',
                               inferSchema='true').cache()
    # FEATURE_USED is a list of column names defined elsewhere in the script
    df = df[FEATURE_USED]
    df = df.na.drop()
    # turn string to index
    for col in ['UniqueCarrier', 'Origin', 'Dest']:
        df = StringIndexer(inputCol=col,
                           outputCol=col + '_value').fit(df).transform(df)
        df = df.drop(col)

    # reordering
    df = df.select([
        'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
        'UniqueCarrier_value', 'FlightNum', 'CRSElapsedTime', 'Origin_value',
        'Dest_value', 'Distance', 'Cancelled'
    ])
    return df
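
# A hedged usage sketch; assumes FEATURE_USED is defined elsewhere, and the
# app name and filename are illustrative only.
from pyspark import SparkContext
sc = SparkContext(appName='flights')
df = load_csv(sc, filename='2004.csv')
df.printSchema()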
Example #3
# ---------------------------------------

# SibSp and Fare are included here as well, since they are cast and assembled below
df3 = df2.select('Sex','Pclass','Survived','Embarked','SibSp','Fare')
df3.show()
df3.printSchema()

from pyspark.ml.feature import StringIndexer
df3 = StringIndexer(inputCol='Sex',outputCol='Gender').fit(df3).transform(df3)
df3.groupby('Embarked').agg({'Embarked':'count'}).show()
df3 = StringIndexer(inputCol='Embarked',outputCol='Embarked_Transformed').fit(df3).transform(df3)
#df3.groupby(df3.Embarked,'Embarked').agg({'Embarked':'count'}).show()
df3.show()
df3.printSchema()

# note: this select keeps only the numeric columns; the indexed Gender and
# Embarked_Transformed columns are dropped before assembling features
df3 = df3.select(df3.Pclass.cast('double'),df3.SibSp.cast('double'),df3.Survived.cast('double'),df3.Fare.cast('double'))
df3.show()
df3.printSchema()

# Vector assembler

from pyspark.ml.feature import VectorAssembler
df3 = VectorAssembler(inputCols=['Pclass','SibSp','Fare'],outputCol='Features').transform(df3)

df3.show()
#
# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features',labelCol='Survived',maxDepth=10,impurity='entropy')

# 2 learning process - created a model
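# A plausible completion of this truncated step, mirroring Example #7 below:
model1 = dt1.fit(df3)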
Example #4
df3.show()

# 'Embarked1' is assumed to be the output of a prior StringIndexer on
# 'Embarked', as in Example #5
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

df3 = OneHotEncoder(inputCol='Embarked1',
                    outputCol='Embarked2',
                    dropLast=False).transform(df3)
df3.show()

# --------------------------------------------

df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3 = OneHotEncoder(inputCol='Gender', outputCol='Gender1',
                    dropLast=False).transform(df3)
df3.show()

# cast to double
df3 = df3.select(df3.Pclass.cast('double'), df3.Gender1, df3.Embarked2,
                 df3.Survived.cast('double'))
df3.printSchema()

# Vector assembler

df3 = VectorAssembler(inputCols=['Pclass', 'Gender1', 'Embarked2'],
                      outputCol='Features').transform(df3)
df3.show(truncate=False)

training = df3
training1 = df3

training.show(truncate=False, n=5)

# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
Example #5
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

df5 = spark.read.csv('E:/kaggle/titanic/test.csv',header=True).select('PassengerId','Sex','Pclass','Embarked')

df5 = StringIndexer(inputCol='Embarked',outputCol='Embarked1').fit(df5).transform(df5)
df5.show()

df5 = OneHotEncoder(inputCol='Embarked1',outputCol='Embarked2',dropLast=False).transform(df5)
df5.show()

# --------------------------------------------

df5 = StringIndexer(inputCol='Sex',outputCol='Gender').fit(df5).transform(df5)
df5 = OneHotEncoder(inputCol='Gender',outputCol='Gender1',dropLast=False).transform(df5)
df5.show()


df5 = df5.select(df5.Pclass.cast('double'),df5.Gender1,df5.Embarked2,df5.PassengerId)
df5.printSchema()

# Vector assembler

df5 = VectorAssembler(inputCols=['Pclass','Gender1','Embarked2'],outputCol='Features').transform(df5)
df5.show(truncate=False)


# model2 is the DecisionTreeClassifier model fitted in Example #7
df5_1 = model2.transform(df5)
df5_1.show()

df5_1.select('PassengerId','prediction').coalesce(1).write.csv('c:/test5.csv')

#  df5_1.select('PassengerId','prediction').toPandas().to_csv('c:/test5.csv')
# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy('label', 'prediction').count().show()
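
# A hedged follow-up: the confusion-matrix counts above can be summarized
# into an accuracy score with the stock pyspark.ml evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction', metricName='accuracy')
print('Accuracy: %.3f' % evaluator.evaluate(prediction))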

# One-hot encoding
from pyspark.ml.feature import OneHotEncoderEstimator
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy'])
# Fit the encoder to the data
onehot = onehot.fit(cars)
# How many category levels?
print(onehot.categorySizes)

cars = onehot.transform(cars)
cars.select('type', 'type_idx', 'type_dummy').distinct().sort('type_idx').show()

# Dense versus sparse: two representations of the same 8-element vector
from pyspark.mllib.linalg import DenseVector, SparseVector
DenseVector([1, 0, 0, 0, 0, 7, 0, 0])
SparseVector(8, [0, 5], [1, 7])  # size 8, values 1.0 and 7.0 at indices 0 and 5
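
# Sanity-check sketch: both representations expand to the same array
print(DenseVector([1, 0, 0, 0, 0, 7, 0, 0]).toArray())
print(SparseVector(8, [0, 5], [1, 7]).toArray())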

# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)
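
# A quick check of the encoded output, added for illustration; it mirrors the
# pattern used in Example #9 below.
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()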
Example #7
#df3.show(10)
#df3.schema
#df3.printSchema()
## --------------------------------------------
#
#
##df4.show()
##df4.printSchema()
#
##fit(si1)
#male   = 0
#female = 1
#
##transform

df3 = df3.select(df3.Pclass.cast('double'), df3.Gender,
                 df3.Survived.cast('double'))
df3.printSchema()

# Vector assembler

from pyspark.ml.feature import VectorAssembler
df3 = VectorAssembler(inputCols=['Pclass'],
                      outputCol='Features').transform(df3)
df3.show()
#
# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived')

# 2 learning process - created a model
model2 = dt1.fit(df3)
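
# An illustrative check of the fitted model on its own training frame:
model2.transform(df3).select('Features', 'Survived', 'prediction').show(5)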
Example #8
# #### View the output of the KMeans model
# The prediction field denotes the cluster number

# In[15]:

clusterdData.toPandas().head()

# #### Get the average of each feature in the original data
# This is the equivalent of the cluster center when our dataset is one big cluster
# * We import all sql functions as we need the avg and count functions among others

# In[16]:

from pyspark.sql.functions import *

dataset.select(avg('Survived'), avg('Pclass'), avg('Age'), avg('Fare'),
               avg('Gender'), avg('Boarded')).toPandas()

# #### A more intuitive way to view the cluster centers in our clusterdData
# * We group by clusterID (prediction) and compute the average of all features
# * We do a count of values in each cluster

# In[17]:

clusterdData.groupBy('prediction').agg(
    avg('Survived'), avg('Pclass'), avg('Age'), avg('Fare'), avg('Gender'),
    avg('Boarded'), count('prediction')).orderBy('prediction').toPandas()

# #### Examine all rows in one of the clusters

# In[18]:
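
# A plausible completion of this truncated cell: keep the rows of a single
# cluster by filtering on its prediction value (cluster id 0 is illustrative)
clusterdData.filter(clusterdData.prediction == 0).toPandas()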
Example #9
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator

flights = flights.dropna()
print("\nThe data contains %d records after dropping records with na values." % flights.count())

# Create an indexer for carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flights_indexed = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)
# Check first five records
flights_indexed.show(5)

flites = flights_indexed.select('carrier', 'org', 'org_idx')

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flites)
flights_onehot = onehot.transform(flites)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

spark.stop()
#CUSTOMERS
from pyspark.sql.functions import col, sum  # Spark's aggregate sum, not the builtin
from pyspark.ml.feature import StringIndexer

#Check schema and first rows
customers.printSchema() #Schema is ok
customers.toPandas().head(5)

#Find missings
customers.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in customers.columns)).show() #No missings

#Renaming the CustomerID column for future joins
customers = customers.withColumnRenamed("CustomerID","cIDCustomer")
#DELIVERY
#Check schema and first rows
delivery.printSchema() #Schema is ok
delivery.toPandas().head(5)

#Find missings
delivery.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in delivery.columns)).show() #Found 780 missings in the DeliveryClass column

#Treating missing values
delivery = delivery.where(col("DeliveryClass").isNotNull())

#Encoding string columns in "Delivery"
delivery = StringIndexer(inputCol="DeliveryClass", outputCol="DeliveryClass_index").fit(delivery).transform(delivery)
delivery = StringIndexer(inputCol="DeliveryTypeName", outputCol="DeliveryTypeName_index").fit(delivery).transform(delivery)

#Renaming the SubscriptionID column for future joins
delivery = delivery.withColumnRenamed("SubscriptionID","sID_Delivery")
#FORMULA
#Check schema and first rows
formula.printSchema() #Schema is ok
formula.toPandas().head(5)
Example #11
from pyspark.sql.functions import round
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Convert 'mile' to 'km' and drop 'mile' column
flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)) \
                 .drop('mile')

# Remove records with missing values in any column and get the number of remaining rows
flights = flights.dropna()
print("The data contains %d records after dropping records with na values." %
      flights.count())

# Create an indexer for org categorical feature
flights_indexed = StringIndexer(
    inputCol="org", outputCol='org_idx').fit(flights).transform(flights)
# Check first five records
#flights_indexed.show(5)

flites = flights_indexed.select('km', 'org_idx', 'duration')

# Create 'features' vector from the 'km' and 'org_idx' predictor columns
assembler = VectorAssembler(inputCols=['km', 'org_idx'], outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flites)

# Check the resulting column
flites = flights_assembled.select('duration', 'features')
#flites.distinct().show(8, truncate=False)

print("Sample model input")
print(flites.toPandas().sample(12))

# Split the data into training and testing sets
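# A hedged completion of this truncated step; the 80/20 proportions and the
# seed are illustrative, not from the original.
flites_train, flites_test = flites.randomSplit([0.8, 0.2], seed=17)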
Example #12
# In[95]:


df2.show(5)


# In[93]:


desidxer_df.show(5)


# In[101]:


desidxer_df.select(['air_time','distance','carrier_idx','origin_idx','dest_idx']).describe().show()


# In[103]:


desidxer_df.select(desidxer_df.air_time.isNull())


# In[108]:


desidxer_df.select(desidxer_df.air_time =='NA')


# In[110]:
Example #13
import re
from pyspark.sql.functions import udf, col, countDistinct, datediff
from pyspark.sql.types import LongType, StringType
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler

def extract_features(df):
    """
    Create a feature vector for each user with a VectorAssembler.

    Arguments:
        df: DataFrame containing the relevant data columns.
    Returns:
        DataFrame with the extracted features in the column "features".
    """
    feature_df = df.select("userId").distinct()
    col_names = []

    ts_dt_udf = udf(lambda x: x // 1000, LongType())
    df = df.withColumn("registration_dt",
                       ts_dt_udf(df.registration).cast("timestamp"))
    df = df.withColumn("timestamp_dt", ts_dt_udf(df.ts).cast("timestamp"))

    # Session Counts
    session_counts = df.groupby('userId').agg(
        countDistinct('sessionId').alias('session_count'))

    feature_df = feature_df.join(session_counts, on="userId")
    col_names.append("session_count")

    # Page Counts
    pages = df.select('page').distinct().sort('page')
    pages_list = [r.page for r in pages.collect()]
    page_counts = df.groupby('userId').pivot('page', pages_list).count()

    # Drop the "Cancel" page column
    # Fill NaNs with 0 - This will inherently transform "Cancellation Confirmation" column into "label"
    # with 1 as churned and 0 as non churned
    page_counts = page_counts.drop("Cancel")
    page_counts = page_counts.fillna(value=0)
    page_counts = page_counts.withColumnRenamed("Cancellation Confirmation",
                                                "label")

    # Join these feature columns to our feature dataframe
    feature_df = feature_df.join(page_counts, on="userId")

    # Normalize by session counts
    cut_columns = {'userId', 'session_count', 'label'}
    remaining_cols = sorted(list(set(feature_df.columns) - cut_columns))
    for column in remaining_cols:
        feature_df = feature_df.withColumn(
            column,
            col(column) / feature_df.session_count)
    col_names.extend(remaining_cols)

    # Time since registration
    user_ages = df.select([
        "userId", datediff("timestamp_dt", "registration_dt")
    ]).groupBy("userId").max().select(
        "userId",
        col("max(datediff(timestamp_dt, registration_dt))").alias("age"))
    feature_df = feature_df.join(user_ages, on="userId")
    col_names.append("age")

    # Total number of events
    user_number_events = df.groupBy("userId").count().select(
        "userId",
        col("count").alias("num_events"))
    feature_df = feature_df.join(user_number_events, on="userId")
    col_names.append("num_events")

    # Include device categorical variable
    device_udf = udf(
        lambda x: str(re.findall(r'\((.*?)\)', x)[0].split(";")[0].split()[0])
        if x is not None else None, StringType())
    df = df.withColumn("device", device_udf(df.userAgent))

    df_device = df.select(["userId", "device"]).distinct()
    df_device = StringIndexer(
        inputCol="device",
        outputCol="device_index").fit(df_device).transform(df_device)
    df_device = OneHotEncoderEstimator(
        inputCols=["device_index"],
        outputCols=["device_classVec"]).fit(df_device).transform(df_device)
    feature_df = feature_df.join(df_device.select("userId", "device_classVec"),
                                 on="userId")
    col_names.append("device_classVec")

    print(col_names)
    # Assemble the vector
    assembler = VectorAssembler(inputCols=col_names, outputCol='features')

    return assembler.transform(feature_df)
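
# A hedged usage sketch; `df` is assumed to be the event-log DataFrame whose
# columns (userId, sessionId, page, ts, registration, userAgent) the function
# references.
features_df = extract_features(df)
features_df.select('userId', 'label', 'features').show(5, truncate=False)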
Example #14
#then drop rows with leftover na's
#df = df.na.drop(how='any')
#df.count() #if loss is big, investigate and fill.na as needed

#otherwise, remove df2
#del(df2)

#export to csv (via coalesce)

print('\n\n\nGetting ready to write data to csv\n')

#df = df.select(float_x_vars + cat_x_vars + y_vars)
#df = df.na.drop(how='any')
#df.coalesce(1).write.csv('data/pdDataNN.csv')

#Using pandas
pdData = df.select(float_x_vars + cat_x_vars + y_vars)
pdData = pdData.na.drop(how='any')

#del(df)

#pdData.count()
#If there is a large loss, then investigate why
pdData = pdData.toPandas()
pdData.to_csv('data/pdDataNN.csv', index=False)

del pdData

spark.stop()
Example #15
col_string = col_string.iloc[:, 0].tolist()
col = set(col) - set(col_a)
col_test = set(col) - set(['HasDetections'])
col = list(col)
col_test = list(col_test)
col_test.append('MachineIdentifier')

# keep only the columns that are both used and string-typed
col_si = [i for i in col if i in col_string]

col_num = list(set(col) - set(col_si))

test = test.select(col_test)

for i in col_test:
    if i == 'MachineIdentifier':
        continue
    else:
        test = StringIndexer(inputCol=i,
                             outputCol=i + "_index").fit(test).transform(test)

# encoder_input_col = []
# for i in col:
#     encoder_input_col.append(i + '_index')
# for i in col_num:
#     encoder_input_col.append(i)

encoder_input_col = [