def string_index(spark_df):
    """Index and one-hot encode each categorical column of the data set."""
    # Assumes StringIndexer, OneHotEncoder and VectorAssembler are imported
    # from pyspark.ml.feature at the top of the script.
    for i in spark_df.columns:
        inp_col = str(i)
        out_col = str(i) + "_indexed"
        fit_on = spark_df.select(inp_col)
        df_i_indexed = StringIndexer(inputCol=inp_col, outputCol=out_col).fit(fit_on).transform(fit_on)
        indexed_col = df_i_indexed.select(out_col)
        print(i)
        indexed_col.printSchema()
        out_col_ohe = str(i) + "_encoded"
        try:
            # transform() returns a DataFrame; calling .show() on it would return None,
            # so keep the DataFrame and display the encoded column separately.
            df_i_encoded = OneHotEncoder(inputCol=out_col, outputCol=out_col_ohe).transform(df_i_indexed)
            df_i_encoded.select(out_col_ohe).show()
            # inputCols expects a list of column names, and the encoded column
            # only exists on df_i_encoded, not on the original spark_df.
            vecAssembler = VectorAssembler(inputCols=[out_col_ohe], outputCol="features")
            vecAssembler.transform(df_i_encoded)
        except Exception as e:
            # Columns that cannot be encoded are skipped rather than silently swallowed.
            print("Skipping %s: %s" % (i, e))
    return None
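# A minimal usage sketch (not from the original script): the file name
# 'train.csv', the column subset, and the SparkSession `spark` are
# illustrative assumptions. The function only prints the indexed/encoded
# schemas; it does not return a transformed DataFrame.
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

titanic_df = spark.read.csv('train.csv', header=True, inferSchema=True)
string_index(titanic_df.select('Sex', 'Embarked', 'Pclass'))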
def load_csv(sc, filename='200[0-5].csv'):
    sql_context = SQLContext(sc)
    df = sql_context.read.option('mode', 'PERMISSIVE')\
        .load(filename, format='com.databricks.spark.csv', header='true',
              nullValue='NA', inferSchema='true').cache()
    df = df[FEATURE_USED]
    df = df.na.drop()

    # turn string to index
    for col in ['UniqueCarrier', 'Origin', 'Dest']:
        df = StringIndexer(inputCol=col, outputCol=col + '_value').fit(df).transform(df)
        df = df.drop(col)

    # reordering
    df = df.select([
        'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
        'UniqueCarrier_value', 'FlightNum', 'CRSElapsedTime',
        'Origin_value', 'Dest_value', 'Distance', 'Cancelled'
    ])
    return df
# ---------------------------------------
# Keep SibSp and Fare in the selection, since they are cast and assembled below.
df3 = df2.select('Sex', 'Pclass', 'SibSp', 'Fare', 'Survived', 'Embarked')
df3.show()
df3.printSchema()

from pyspark.ml.feature import StringIndexer
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3.groupBy('Embarked').agg({'Embarked': 'count'}).show()
df3 = StringIndexer(inputCol='Embarked', outputCol='Embarked_Transformed').fit(df3).transform(df3)
df3.show()
df3.printSchema()

df3 = df3.select(df3.Pclass.cast('double'), df3.SibSp.cast('double'),
                 df3.Survived.cast('double'), df3.Fare.cast('double'))
df3.show()
df3.printSchema()

# Vector assembler
from pyspark.ml.feature import VectorAssembler
df3 = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Fare'], outputCol='Features').transform(df3)
df3.show()

# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived', maxDepth=10, impurity='entropy')
# 2 learning process - created a model
df3.show()
df3 = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2', dropLast=False).transform(df3)
df3.show()
# --------------------------------------------
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3 = OneHotEncoder(inputCol='Gender', outputCol='Gender1', dropLast=False).transform(df3)
df3.show()

# cast to double
df3 = df3.select(df3.Pclass.cast('double'), df3.Gender1, df3.Embarked2, df3.Survived.cast('double'))
df3.printSchema()

# Vector assembler
df3 = VectorAssembler(inputCols=['Pclass', 'Gender1', 'Embarked2'], outputCol='Features').transform(df3)
df3.show(truncate=False)

training = df3
training1 = df3
training.show(n=5, truncate=False)

# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
df5 = spark.read.csv('E:/kaggle/titanic/test.csv', header=True).select('PassengerId', 'Sex', 'Pclass', 'Embarked')
df5 = StringIndexer(inputCol='Embarked', outputCol='Embarked1').fit(df5).transform(df5)
df5.show()
df5 = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2', dropLast=False).transform(df5)
df5.show()
# --------------------------------------------
df5 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df5).transform(df5)
df5 = OneHotEncoder(inputCol='Gender', outputCol='Gender1', dropLast=False).transform(df5)
df5.show()

df5 = df5.select(df5.Pclass.cast('double'), df5.Gender1, df5.Embarked2, df5.PassengerId)
df5.printSchema()

# Vector assembler
df5 = VectorAssembler(inputCols=['Pclass', 'Gender1', 'Embarked2'], outputCol='Features').transform(df5)
df5.show(truncate=False)

df5_1 = model2.transform(df5)
df5_1.show()
df5_1.select('PassengerId', 'prediction').coalesce(1).write.csv('c:/test5.csv')
# df5_1.select('PassengerId','prediction').toPandas().to_csv('c:/test5.csv')
# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy('label', 'prediction').count().show()

# One-hot encoding
from pyspark.ml.feature import OneHotEncoderEstimator
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy'])

# Fit the encoder to the data
onehot = onehot.fit(cars)

# How many category levels?
print(onehot.categorySizes)

cars = onehot.transform(cars)
cars.select('type', 'type_idx', 'type_dummy').distinct().sort('type_idx').show()

# Dense versus sparse vectors
from pyspark.mllib.linalg import DenseVector, SparseVector
DenseVector([1, 0, 0, 0, 0, 7, 0, 0])
SparseVector(8, [0, 5], [1, 7])

# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)
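# Quick illustration (not part of the original snippet): the dense and sparse
# literals above describe the same 8-element vector, which toArray() on the
# classes imported above makes explicit.
dense = DenseVector([1, 0, 0, 0, 0, 7, 0, 0])
sparse = SparseVector(8, [0, 5], [1, 7])
print(dense.toArray())   # [1. 0. 0. 0. 0. 7. 0. 0.]
print(sparse.toArray())  # [1. 0. 0. 0. 0. 7. 0. 0.]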
#df3.show(10)
#df3.schema
#df3.printSchema()
## --------------------------------------------
##df4.show()
##df4.printSchema()
##fit(si1)
# StringIndexer mapping: male = 0, female = 1
##transform

df3 = df3.select(df3.Pclass.cast('double'), df3.Gender, df3.Survived.cast('double'))
df3.printSchema()

# Vector assembler
from pyspark.ml.feature import VectorAssembler
df3 = VectorAssembler(inputCols=['Pclass'], outputCol='Features').transform(df3)
df3.show()

# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived')
# 2 learning process - created a model
model2 = dt1.fit(df3)
# #### View the output of the KMeans model
# The prediction field denotes the cluster number

# In[15]:
clusterdData.toPandas().head()

# #### Get the average of each feature in the original data
# This is the equivalent of the cluster center when our dataset is one big cluster
# * We import all sql functions as we need the avg and count functions among others

# In[16]:
from pyspark.sql.functions import *

dataset.select(avg('Survived'), avg('Pclass'), avg('Age'),
               avg('Fare'), avg('Gender'), avg('Boarded')).toPandas()

# #### A more intuitive way to view the cluster centers in our clusterdData
# * We group by cluster ID (prediction) and compute the average of all features
# * We do a count of values in each cluster

# In[17]:
clusterdData.groupBy('prediction').agg(
    avg('Survived'), avg('Pclass'), avg('Age'), avg('Fare'),
    avg('Gender'), avg('Boarded'),
    count('prediction')).orderBy('prediction').toPandas()

# #### Examine all rows in one of the clusters

# In[18]:
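# The original cell body is not shown here; a minimal sketch of what it likely
# contains, filtering the rows of a single cluster (cluster 0 is an
# illustrative choice, not from the original notebook):
clusterdData.filter(clusterdData.prediction == 0).toPandas()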
flights = flights.dropna()
print("\nThe data contains %d records after dropping records with na values." % flights.count())

# Create an indexer for the carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flights_indexed = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)

# Check the first five records
flights_indexed.show(5)

flites = flights_indexed.select('carrier', 'org', 'org_idx')

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flites)
flights_onehot = onehot.transform(flites)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

spark.stop()
# Check schema and first rows
customers.printSchema()  # Schema is ok
customers.toPandas().head(5)

# Find missings
customers.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in customers.columns)).show()
# No missings

# Renaming the CustomerID column for future joins
customers = customers.withColumnRenamed("CustomerID", "cIDCustomer")

# DELIVERY
# Check schema and first rows
delivery.printSchema()  # Schema is ok
delivery.toPandas().head(5)

# Find missings
delivery.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in delivery.columns)).show()
# Found 780 missings in the DeliveryClass column

# Treating missing values
delivery = delivery.where(col("DeliveryClass").isNotNull())

# Encoding string columns in "Delivery"
delivery = StringIndexer(inputCol="DeliveryClass", outputCol="DeliveryClass_index").fit(delivery).transform(delivery)
delivery = StringIndexer(inputCol="DeliveryTypeName", outputCol="DeliveryTypeName_index").fit(delivery).transform(delivery)

# Renaming the SubscriptionID column for future joins
delivery = delivery.withColumnRenamed("SubscriptionID", "sID_Delivery")

# FORMULA
# Check schema and first rows
formula.printSchema()  # Schema is ok
formula.toPandas().head(5)
# Convert 'mile' to 'km' and drop the 'mile' column
flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)) \
                 .drop('mile')

# Remove records with missing values in any column and get the number of remaining rows
flights = flights.dropna()
print("The data contains %d records after dropping records with na values." % flights.count())

# Create an indexer for the org categorical feature
flights_indexed = StringIndexer(
    inputCol="org", outputCol='org_idx').fit(flights).transform(flights)

# Check the first five records
#flights_indexed.show(5)

flites = flights_indexed.select('km', 'org_idx', 'duration')

# Consolidate the 'km' and 'org_idx' predictor columns into a 'features' vector
assembler = VectorAssembler(inputCols=['km', 'org_idx'], outputCol='features')
flights_assembled = assembler.transform(flites)

# Check the resulting column
flites = flights_assembled.select('duration', 'features')
#flites.distinct().show(8, truncate=False)
print("Sample model input")
print(flites.toPandas().sample(12))

# Split the data into training and testing sets
# In[95]:
df2.show(5)

# In[93]:
desidxer_df.show(5)

# In[101]:
desidxer_df.select(['air_time', 'distance', 'carrier_idx', 'origin_idx', 'dest_idx']).describe().show()

# In[103]:
# Rows where air_time is null
desidxer_df.filter(desidxer_df.air_time.isNull()).show(5)

# In[108]:
# Rows where air_time still holds the literal string 'NA'
desidxer_df.filter(desidxer_df.air_time == 'NA').show(5)

# In[110]:
def extract_features(df):
    """
    Assemble the per-user feature vector.

    Arguments:
        df: Dataframe consisting of the relevant data columns.

    Returns:
        Dataframe with the extracted features in the column "features".
    """
    feature_df = df.select("userId").distinct()
    col_names = []

    # Convert millisecond epoch timestamps into timestamp columns
    ts_dt_udf = udf(lambda x: x // 1000, LongType())
    df = df.withColumn("registration_dt", ts_dt_udf(df.registration).cast("timestamp"))
    df = df.withColumn("timestamp_dt", ts_dt_udf(df.ts).cast("timestamp"))

    # Session counts
    session_counts = df.groupby('userId').agg(
        countDistinct('sessionId').alias('session_count'))
    feature_df = feature_df.join(session_counts, on="userId")
    col_names.append("session_count")

    # Page counts
    pages = df.select('page').distinct().sort('page')
    pages_list = [r.page for r in pages.collect()]
    page_counts = df.groupby('userId').pivot('page', pages_list).count()
    # Drop the "Cancel" page column.
    # Fill NaNs with 0 - this inherently transforms the "Cancellation Confirmation"
    # column into "label", with 1 as churned and 0 as non-churned.
    page_counts = page_counts.drop("Cancel")
    page_counts = page_counts.fillna(value=0)
    page_counts = page_counts.withColumnRenamed("Cancellation Confirmation", "label")

    # Join these feature columns to our feature dataframe
    feature_df = feature_df.join(page_counts, on="userId")

    # Normalize by session counts
    cut_columns = {'userId', 'session_count', 'label'}
    remaining_cols = sorted(list(set(feature_df.columns) - cut_columns))
    for column in remaining_cols:
        feature_df = feature_df.withColumn(
            column, col(column) / feature_df.session_count)
    col_names.extend(remaining_cols)

    # Time since registration
    user_ages = df.select([
        "userId", datediff("timestamp_dt", "registration_dt")
    ]).groupBy("userId").max().select(
        "userId",
        col("max(datediff(timestamp_dt, registration_dt))").alias("age"))
    feature_df = feature_df.join(user_ages, on="userId")
    col_names.append("age")

    # Total number of events
    user_number_events = df.groupBy("userId").count().select(
        "userId", col("count").alias("num_events"))
    feature_df = feature_df.join(user_number_events, on="userId")
    col_names.append("num_events")

    # Include the device categorical variable
    device_udf = udf(
        lambda x: str(re.findall(r'\((.*?)\)', x)[0].split(";")[0].split()[0])
        if x is not None else None, StringType())
    df = df.withColumn("device", device_udf(df.userAgent))
    df_device = df.select(["userId", "device"]).distinct()
    df_device = StringIndexer(
        inputCol="device",
        outputCol="device_index").fit(df_device).transform(df_device)
    df_device = OneHotEncoderEstimator(
        inputCols=["device_index"],
        outputCols=["device_classVec"]).fit(df_device).transform(df_device)
    feature_df = feature_df.join(df_device.select("userId", "device_classVec"), on="userId")
    col_names.append("device_classVec")

    print(col_names)

    # Assemble the vector
    assembler = VectorAssembler(inputCols=col_names, outputCol='features')
    return assembler.transform(feature_df)
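# A minimal usage sketch (not from the original project): the event-log
# DataFrame `events`, the 80/20 split, and the seed are illustrative
# assumptions; the returned DataFrame carries the assembled "features"
# column plus the "label" column built inside extract_features.
features = extract_features(events)
train, test = features.randomSplit([0.8, 0.2], seed=42)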
# then drop rows with leftover na's
#df = df.na.drop(how='any')
#df.count()
# if loss is big, investigate and fill.na as needed
# otherwise, remove df2
#del(df2)

# export to csv (via coalesce)
print('\n\n\nGetting ready to write data to csv\n')
#df = df.select(float_x_vars + cat_x_vars + y_vars)
#df = df.na.drop(how='any')
#df.coalesce(1).write.csv('data/pdDataNN.csv')

# Using pandas
pdData = df.select(float_x_vars + cat_x_vars + y_vars)
pdData = pdData.na.drop(how='any')
#del(df)
#pdData.count()
# If there is a large loss, then investigate why

pdData = pdData.toPandas()
pdData.to_csv('data/pdDataNN.csv', index=False)
del pdData

spark.stop()
col_string = col_string.iloc[:, 0].tolist()
col = set(col) - set(col_a)
col_test = set(col) - set(['HasDetections'])
col = list(col)
col_test = list(col_test)
col_test.append('MachineIdentifier')

# String-typed columns (those also listed in col_string) need indexing
col_si = []
for i in col:
    for j in col_string:
        if i == j:
            col_si.append(i)
col_num = list(set(col) - set(col_si))

test = test.select(col_test)
for i in col_test:
    if i == 'MachineIdentifier':
        continue
    else:
        test = StringIndexer(inputCol=i, outputCol=i + "_index").fit(test).transform(test)

# encoder_input_col = []
# for i in col:
#     encoder_input_col.append(i + '_index')
# for i in col_num:
#     encoder_input_col.append(i)
encoder_input_col = [