Example #1
                                ",").load(input_path + "\\adult.csv")
data = data.withColumnRenamed("age", "label").select(
    "label",
    col("education-num").alias("education-num"),
    col(" hours-per-week").alias("hours-per-week"),
    col(" education").alias("education"),
    col(" fnlwgt").alias("fnlwgt"),
    col(" sex").alias("sex"),
    col(" relationship").alias("relationship"))
data = data.select(data.label.cast("double"), "education-num",
                   "hours-per-week", "education", "sex", "fnlwgt",
                   "relationship")

new_data = data.toDF("label", "education-num", "hours-per-week", "education",
                     "sex", "fnlwgt", "relationship")
indexer = StringIndexer(inputCol="education", outputCol="new_education")
indexed = indexer.fit(new_data).transform(new_data)

indexer1 = StringIndexer(inputCol="sex", outputCol="new_sex")
indexed1 = indexer1.fit(indexed).transform(indexed)

indexer2 = StringIndexer(inputCol="relationship", outputCol="new_rel")
indexed2 = indexer2.fit(indexed1).transform(indexed1)

indexed2 = indexed2.drop("sex", "education", "relationship")
indexed2.show()

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=indexed2.columns[1:],
                            outputCol="features")
data = assembler.transform(indexed2)
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("DecisionTreeClassificationExample")\
        .getOrCreate()

    # $example on$
    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(
        "data/mllib/sample_libsvm_data.txt")

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(data)
    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures")

    # Chain indexers and tree in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
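    # A hedged continuation sketch (not part of the original snippet): fitting the pipeline
    # and evaluating test accuracy, as the stock Spark decision-tree example typically does.
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)

    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    print("Test accuracy = %g" % evaluator.evaluate(predictions))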
Example #3
#sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"),(0, "I wish Java could use case classes"),(1, "Logistic regression models are neat")]).toDF("label", "sentence")

sentenceData = spark.createDataFrame(tranform_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Compute TF-IDF
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(
    rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))
rfClassifier = RandomForestClassifier(numTrees=10,
                                      maxDepth=10,
                                      seed=0,
                                      labelCol="indexed")

start_time = time.time()
modelClassifier = rfClassifier.fit(trainingData)
end_time = time.time()

cost_time = end_time - start_time
print("spark rf time  :", cost_time)

predictionsClassifier = modelClassifier.transform(testData)
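# A hedged follow-up sketch (not in the original snippet): scoring the random-forest
# predictions against the indexed label column.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluatorClassifier = MulticlassClassificationEvaluator(labelCol="indexed",
                                                        predictionCol="prediction",
                                                        metricName="accuracy")
print("spark rf accuracy:", evaluatorClassifier.evaluate(predictionsClassifier))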
Example #4
def main(spark, trainFilePath, valFilePath, downPercentage):
    '''
    trainFilePath: path of training set
    
    valFilePath: path of validation file or test file, whichever the evaluation is based on.
    
    downPercentage: percentage of training set used to train the model. Use 1 if using the full set.
    
    output: Prints the progress of the whole process, including MAP and precision at 500 for each configuration. Finally, prints the dictionary containing all configurations and scores, then the configuration with the highest MAP score and that score.
    '''

    # Get Train sample & Validation
    downPercentage = float(downPercentage)
    if 'validation' in valFilePath:
        print('Using ' + str(downPercentage * 100) +
              '% of the training set and the validation set...')
    else:
        print('Using ' + str(downPercentage * 100) +
              '% of the training set and the test set...')
    train = spark.read.parquet(trainFilePath)
    train_sample = train.sample(False, downPercentage, seed=0)
    val = spark.read.parquet(valFilePath)

    # Generate indexers and fit to train
    indexerUser = StringIndexer(inputCol="user_id",
                                outputCol="user_index",
                                handleInvalid='skip')
    indexerTrack = StringIndexer(inputCol="track_id",
                                 outputCol="track_index",
                                 handleInvalid='skip')

    print('Generate the model for transforming user_id and track_id')
    indexers = Pipeline(stages=[indexerUser, indexerTrack])
    model = indexers.fit(train_sample)

    # Transform indexers to train sample and val
    print('Transform user_id and track_id into numerical values')
    train_sample = model.transform(train_sample)
    val_sample = model.transform(val)

    # Get the distinct user indexes from the validation set
    valUsers = val_sample.select('user_index').distinct()

    # Initialize ALS Parameters
    param = [[0.001, 0.01, 0.1], [5, 8, 10], [0.3, 0.7, 1]]
    config = list(itertools.product(*param))

    print('Hyper-parameter tuning...')
    performance = {}
    # Grid Search
    for conf in config:
        print('Configuration: regParam = ' + str(conf[0]) + ', rank = ' +
              str(conf[1]) + ', alpha = ' + str(conf[2]) + '.')
        print('Generating model...')
        als = ALS(alpha=conf[2],
                  rank=conf[1],
                  regParam=conf[0],
                  userCol="user_index",
                  itemCol="track_index",
                  ratingCol="count",
                  coldStartStrategy="drop",
                  implicitPrefs=True)
        model_als = als.fit(train_sample)

        print('Getting the Prediction list...')
        # Get top 500 recommended items for val users: Prediction List
        top500_val = model_als.recommendForUserSubset(valUsers, 500).cache()
        predList = top500_val.select(
            top500_val.user_index,
            top500_val.recommendations.track_index.alias('pred_list'))

        print('Getting the True list...')
        # Build True List
        trueList = val_sample.groupBy('user_index')\
        .agg(expr('collect_list(track_index) as true_list'))
        # Join The lists and generate RDD for ranking metric
        trueList = trueList.alias('trueList')
        predList = predList.alias('predList')
        predTrueList = predList.join(
            trueList, predList.user_index == trueList.user_index).select(
                'predList.pred_list', 'trueList.true_list')
        predictionAndLabels = predTrueList.rdd.map(
            lambda row: (row.pred_list, row.true_list))

        print('Getting the evaluation...')
        # Build Evaluator and get MAP
        rankmetrics = RankingMetrics(predictionAndLabels)
        performance[conf] = [
            rankmetrics.meanAveragePrecision,
            rankmetrics.precisionAt(500)
        ]
        print('The MAP is: ' + str(performance[conf][0]))
        print('The Precision at 500 is: ' + str(performance[conf][1]))

    print(performance)
    best_config = list(performance.keys())[np.argmax(
        [i[0] for i in performance.values()])]
    print(
        'The best MAP performance comes from the configuration: regParam = ' +
        str(best_config[0]) + ', rank = ' + str(best_config[1]) +
        ', alpha = ' + str(best_config[2]) + '.')
    print('The MAP is: ' + str(performance[best_config][0]) + '.')
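# A hypothetical driver sketch (not part of the original): the argument order follows the
# docstring above; the app name is a placeholder.
if __name__ == "__main__":
    import sys
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("als_grid_search").getOrCreate()
    main(spark, sys.argv[1], sys.argv[2], sys.argv[3])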
Example #5
# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
    'max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10,
                         regParam=0.3,
                         elasticNetParam=0.8,
                         family="multinomial")

# Fit the model
mlrModel = mlr.fit(left_join)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label",
                             outputCol="indexedLabel").fit(left_join)
    def transformation(self, df1, sqlContext):

        ## continous to integer buckets

        discretizer1 = QuantileDiscretizer(handleInvalid="keep",
                                           numBuckets=15,
                                           inputCol="price_cents",
                                           outputCol="price_cents_q")
        discretizer2 = QuantileDiscretizer(handleInvalid="keep",
                                           numBuckets=5,
                                           inputCol="images_count",
                                           outputCol="images_count_q")
        discretizer3 = QuantileDiscretizer(handleInvalid="keep",
                                           numBuckets=5,
                                           inputCol="utilities_count",
                                           outputCol="utilities_q")

        #categorical/string to integers
        stringIndexer1 = StringIndexer(handleInvalid='keep',
                                       inputCol='city',
                                       outputCol="city_i")
        stringIndexer2 = StringIndexer(handleInvalid='keep',
                                       inputCol='province',
                                       outputCol="province_i")
        stringIndexer3 = StringIndexer(handleInvalid='keep',
                                       inputCol='parking_types',
                                       outputCol="parking_types_i")
        stringIndexer4 = StringIndexer(handleInvalid='keep',
                                       inputCol='property_sub_type',
                                       outputCol="property_sub_type_i")
        stringIndexer5 = StringIndexer(handleInvalid='keep',
                                       inputCol='postal',
                                       outputCol="postal_i")
        #stringIndexer6 = StringIndexer(handleInvalid='keep', inputCol='address_street',    outputCol="address_street_i")

        encoder = OneHotEncoderEstimator(
            inputCols=[
                "price_cents_q", "images_count_q", "utilities_q", "city_i",
                "province_i", "parking_types_i", "property_sub_type_i",
                "postal_i", "year_built", "beds", "bathrooms", "has_garage",
                "has_fireplace", "has_pool", "has_basement"
            ],
            outputCols=[
                "price_cents_o", "images_count_o", "utilities_o", "city_o",
                "province_o", "parking_types_o", "property_sub_type_o",
                "postal_o", "year_built_o", "beds_o", "bathrooms_o",
                "has_garage_o", "has_fireplace_o", "has_pool_o",
                "has_basement_o"
            ])

        logging.warning("  pipeline called -  ")

        try:

            stages = [
                discretizer1, discretizer2, discretizer3, stringIndexer1,
                stringIndexer2, stringIndexer3, stringIndexer4, stringIndexer5,
                encoder
            ]

            pipeline = Pipeline(stages=stages)

            model = pipeline.fit(df1)
            df2 = model.transform(df1)

        except Exception as e:
            logging.exception("EXCEPTION -  Pipeline Logic")
            raise e

        logging.warning("  pipeline finished -  ")

        features = [
            "price_cents_o", "price_cents_o", "images_count_o", "utilities_o",
            "city_o", "city_o", "city_o", "province_o", "province_o",
            "province_o", "province_o", "province_o", "parking_types_o",
            "property_sub_type_o", "postal_o", "postal_o", "year_built_o",
            "beds_o", "beds_o", "bathrooms_o", "has_garage_o",
            "has_fireplace_o", "has_pool_o", "has_basement_o"
        ]

        logging.warning("  VectorAssembler called -  ")
        try:
            assembler = VectorAssembler(inputCols=features,
                                        outputCol="scaled_features")

            df3 = assembler.transform(df2)

        except Exception as e:
            print("Error in assembler logic  -  " + str(e))
            raise e

        logging.warning("  VectorAssembler finished -  ")

        return df3
bankLp = bankData.rdd.map(transformToLabeledPoint)
bankLp.collect()
bankDF = spark.createDataFrame(bankLp, ["label", "features"])
bankDF.select("label", "features").show(10)

#Perform PCA
from pyspark.ml.feature import PCA
bankPCA = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
pcaModel = bankPCA.fit(bankDF)
pcaResult = pcaModel.transform(bankDF).select("label", "pcaFeatures")
pcaResult.show(truncate=False)

#Indexing needed as pre-req for Decision Trees
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(pcaResult)
td = si_model.transform(pcaResult)
td.collect()

#Split into training and testing data
(trainingData, testData) = td.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()
testData.collect()

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
rmClassifer = RandomForestClassifier(labelCol="indexed",
Example #8
if __name__ == "__main__":

    spark = SparkSession \
        .builder \
        .getOrCreate()

    # Prepare data
    raw = spark.read.csv("hdfs://devenv/user/spark/spark_mllib_101/titanic",
                         inferSchema=True,
                         header=True)

    # Preprocessing and feature engineering
    data = raw.select("Survived","Pclass","Sex","Age","Fare","Embarked") \
              .dropna()

    feature_prep = StringIndexer(
        inputCol="Sex", outputCol="SexIndex").fit(data).transform(data)

    feature_prep = OneHotEncoder(inputCol="SexIndex",
                                 outputCol="SexVec").transform(feature_prep)

    feature_prep = StringIndexer(
        inputCol="Embarked",
        outputCol="EmbarkIndex").fit(feature_prep).transform(feature_prep)

    feature_prep = OneHotEncoder(inputCol="EmbarkIndex",
                                 outputCol="EmbarkVec").transform(feature_prep)

    # NOTE: "Survived" is the label; including it in the feature vector leaks the target
    final_data = VectorAssembler(
        inputCols=["Survived", "Pclass", "SexVec", "Age", "Fare", "EmbarkVec"],
        outputCol="features").transform(feature_prep)
Example #9
tw2 = tw1.filter("polarity != 2").withColumn('words', tokenize(tw1['text']))
tw3 = (tw2.select(
    "user", "hour", "dayofweek", "month", "words",
    F.when(tw2.polarity == 4, "Pos").otherwise("Neg").alias("sentiment"),
    pos_score(tw2["words"]).alias("pscore"),
    neg_score(tw2["words"]).alias("nscore")))
tw3.registerTempTable("fm")

# Build the classification model
# Modeling parameters
numFeatures = 5000
minDocFreq = 50
numTrees = 1000

# Build the machine learning pipeline
inx1 = StringIndexer(inputCol="hour", outputCol="hour-inx")
inx2 = StringIndexer(inputCol="month", outputCol="month-inx")
inx3 = StringIndexer(inputCol="dayofweek", outputCol="dow-inx")
inx4 = StringIndexer(inputCol="sentiment", outputCol="label")
hashingTF = HashingTF(numFeatures=numFeatures,
                      inputCol="words",
                      outputCol="hash-tf")
idf = IDF(minDocFreq=minDocFreq, inputCol="hash-tf", outputCol="hash-tfidf")
va = VectorAssembler(inputCols=[
    "hour-inx", "month-inx", "dow-inx", "hash-tfidf", "pscore", "nscore"
],
                     outputCol="features")
rf = RandomForestClassifier(numTrees=numTrees,
                            maxDepth=4,
                            maxBins=32,
                            labelCol="label",
Example #10
                                outputCol="words",
                                pattern="\\W")

# stop words
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the", "RT", "@"]
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered",
                               outputCol="features",
                               vocabSize=10000,
                               minDF=5)

# convert string labels to indexes
indexer = StringIndexer(inputCol="polarity", outputCol="label")

# feature-selector
selector = ChiSqSelector(numTopFeatures=10,
                         featuresCol="features",
                         outputCol="selectedFeatures",
                         labelCol="label")

# logistic regression model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# build the pipeline
pipeline = Pipeline(stages=[
    regexTokenizer, stopwordsRemover, countVectors, indexer, selector, lr
])
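# A hedged continuation sketch (not in the original snippet): fitting the pipeline on the
# (not shown) "data" DataFrame and inspecting a few scored rows.
pipelineFit = pipeline.fit(data)
predictions = pipelineFit.transform(data)
predictions.select("polarity", "label", "prediction", "probability").show(5, truncate=False)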
Example #11
# Get number of records
print("The data contain %d records." % flights.count())

# Remove the 'flight' column
flights = flights.drop('flight')

# Convert 'mile' to 'km' and drop 'mile' column
flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)) \
                 .drop('mile')

# Remove records with missing values in any column and get the number of remaining rows
flights = flights.dropna()
print("The data contains %d records after dropping records with na values." % flights.count())

# Create an indexer for carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flites = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flites)
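# A hedged continuation sketch (not in the original snippet): applying the fitted encoder
# and checking how the org categories map onto dummy vectors.
flites = onehot.transform(flites)
flites.select("org", "org_idx", "org_dummy").distinct().show()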
Example #12
'JobInvolvement', 'JobLevel', 'JobSatisfaction',
'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
'TrainingTimesLastYear', 'WorkLifeBalance',
'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

# In[99]:

indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categoricalCols
]

encoders = [
    OneHotEncoder(inputCol=indexer.getOutputCol(),
                  outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]

# In[100]:

assembler = VectorAssembler(
    inputCols=[encoder.getOutputCol()
               for encoder in encoders] + continuousCols,
    outputCol="features")
Example #13
def MLClassifierDFPrep(df,
                       input_columns,
                       dependent_var,
                       treat_outliers=True,
                       treat_neg_values=True):

    # change label (class variable) to string type to prep for reindexing
    # Pyspark is expecting a zero indexed integer for the label column.
    # Just in case our data is not in that format, we will treat it using the built-in StringIndexer method
    renamed = df.withColumn("label_str", df[dependent_var].cast(
        StringType()))  #Rename and change to string type
    indexer = StringIndexer(
        inputCol="label_str",
        outputCol="label")  #Pyspark is expecting the this naming convention
    indexed = indexer.fit(renamed).transform(renamed)
    indexed.groupBy("class", "label").count().show(100)

    # Convert all string type data in the input column list to numeric
    # Otherwise the Algorithm will not be able to process it
    numeric_inputs = []
    string_inputs = []
    for column in input_columns:
        if str(indexed.schema[column].dataType) == 'StringType':
            indexer = StringIndexer(inputCol=column, outputCol=column + "_num")
            indexed = indexer.fit(indexed).transform(indexed)
            new_col_name = column + "_num"
            string_inputs.append(new_col_name)
        else:
            numeric_inputs.append(column)

    if treat_outliers == True:
        print("We are correcting for non normality now!")
        # empty dictionary d
        d = {}
        # Create a dictionary of quantiles
        for col in numeric_inputs:
            d[col] = indexed.approxQuantile(
                col, [0.01, 0.99], 0.25
            )  #if you want to make it go faster increase the last number
        #Now fill in the values
        for col in numeric_inputs:
            skew = indexed.agg(skewness(
                indexed[col])).collect()  #check for skewness
            skew = skew[0][0]
            # This function will floor, cap and then log+1 (just in case there are 0 values)
            if skew > 1:
                indexed = indexed.withColumn(
                    col,
                    log(
                        when(df[col] < d[col][0], d[col][0]).when(
                            indexed[col] > d[col][1], d[col][1]).otherwise(
                                indexed[col]) + 1).alias(col))
                print(
                    col +
                    " has been treated for positive (right) skewness. (skew =",
                    skew, ")")
            elif skew < -1:
                indexed = indexed.withColumn(
                    col,
                    exp(
                        when(df[col] < d[col][0], d[col][0]).when(
                            indexed[col] > d[col][1],
                            d[col][1]).otherwise(indexed[col])).alias(col))
                print(
                    col +
                    " has been treated for negative (left) skewness. (skew =",
                    skew, ")")

    # Produce a warning that Naive Bayes cannot be used if the dataframe contains negative values.
    # Note: we only need to check the numeric input values since anything that is indexed won't have negative values
    minimums = df.select([
        min(c).alias(c) for c in df.columns if c in numeric_inputs
    ])  # Calculate the mins for all columns in the df
    min_array = minimums.select(array(numeric_inputs).alias(
        "mins"))  # Create an array for all mins and select only the input cols
    df_minimum = min_array.select(array_min(
        min_array.mins)).collect()  # Collect global min as Python object
    df_minimum = df_minimum[0][0]  # Slice to get the number itself

    features_list = numeric_inputs + string_inputs
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    output = assembler.transform(indexed).select('features', 'label')

    #     final_data = output.select('features','label') #drop everything else

    # Now check for negative values and warn the user so they can decide whether to correct them
    if df_minimum < 0:
        print(" ")
        print(
            "WARNING: The Naive Bayes Classifier will not be able to process your dataframe as it contains negative values"
        )
        print(" ")

    if treat_neg_values == True:
        print(
            "You have opted to correct that by rescaling all your features to a range of 0 to 1"
        )
        print(" ")
        print("We are rescaling you dataframe....")
        scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MinMaxScalerModel
        scalerModel = scaler.fit(output)

        # rescale each feature to range [min, max].
        scaled_data = scalerModel.transform(output)
        final_data = scaled_data.select(
            'label', 'scaledFeatures')  # added class to the selection
        final_data = final_data.withColumnRenamed('scaledFeatures', 'features')
        print("Done!")

    else:
        print(
            "You have opted not to correct that therefore you will not be able to use to Naive Bayes classifier"
        )
        print("We will return the dataframe unscaled.")
        final_data = output

    return final_data
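# A hypothetical usage sketch (not part of the original): a tiny synthetic DataFrame just to
# show the expected call signature; it assumes the pyspark.sql.functions / ml.feature imports
# used inside MLClassifierDFPrep are already in scope.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("MLClassifierDFPrepDemo").getOrCreate()
demo_df = spark.createDataFrame(
    [(1.0, "a", 10.0), (0.0, "b", 250.0), (1.0, "a", 31.0), (0.0, "c", 4.0)],
    ["class", "cat_feature", "num_feature"])
prepped = MLClassifierDFPrep(demo_df, ["cat_feature", "num_feature"], "class")
prepped.show(truncate=False)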
df.printSchema()

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad',
    'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD',
    'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate'
],
                            outputCol='features')

output = assembler.transform(df)

from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')

outputFixed = indexer.fit(output).transform(output)

outputFixed.printSchema()

final_data = outputFixed.select('features', 'PrivateIndex')

train_data, test_data = final_data.randomSplit([0.75, 0.25])

from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml import Pipeline

dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc = RandomForestClassifier(numTrees=25,
                             labelCol='PrivateIndex',
Example #15
def main(path,test_path,write_location,test_write_location):
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    if is_comp:
        df = spark.read.csv(path,header=True).cache()
        npart = 10000
    else:
        # define schema
        structs = [StructField("Label", IntegerType(), True)]

        for dfeat in dense_features:
            structs.append(StructField(dfeat,DoubleType(), True))

        for cfeat in sparse_features:
            structs.append(StructField(cfeat,StringType(), True))

        schema = StructType(structs)
        
        df1 = spark.read.csv(path+"/day_[0-9]",sep="\t",schema=schema)
        df2 = spark.read.csv(path+"/day_1[0-9]",sep="\t",schema=schema)
        
        tcolumns = df1.columns
        df = df1.select(tcolumns).union(df2.select(tcolumns))
        npart = 860000


    print("Num examples: ",df.count())

    # transformation training data

    # change datatype of dense features
    for col_t in dense_features:
        df = df.withColumn(col_t,col(col_t).cast(DoubleType()))

    ## fill nulls
    df = df.fillna('NULL',subset=sparse_features)
    df = df.fillna(0.,subset=dense_features)

    # compute statistics
    ## dense features
    scaled_max = 1
    scaled_min = 0
    dense_meta = {}
    for col_t in dense_features:
        print("dense: ",col_t)
        min_t = df.agg({col_t:"min"}).collect()[0][0]
        max_t = df.agg({col_t:"max"}).collect()[0][0]
        dense_meta[col_t] = [min_t, max_t]
        df = df.withColumn(col_t+"_scaled",(col(col_t)-min_t)/(max_t-min_t)*(scaled_max-scaled_min)+scaled_min)
        df = df.drop(col_t).withColumnRenamed(col_t+"_scaled",col_t)

    ## index categoricals
    indexers = {}
    for col_t in sparse_features:
        print("cat:",col_t)
        indexer = StringIndexer(inputCol=col_t, outputCol=col_t+"_indexed") 
        fitted_indexer = indexer.fit(df)
        df = fitted_indexer.transform(df)
        indexers[col_t] = fitted_indexer # save indexer for test data
        df = df.drop(col_t).withColumnRenamed(col_t+"_indexed",col_t)
        df = df.withColumn(col_t,col(col_t).cast(IntegerType()))

    # convert label dtype
    df = df.withColumn("Label",col("Label").cast(DoubleType()))

    # save statistics/meta data locally
    all_index = {}
    for xk in indexers.keys():
        x = indexers[xk]

        index2name = dict([y for y in zip(range(len(x.labels)),x.labels)])
        name2index = {v: k for k, v in index2name.items()}
        
        all_index[xk] = {'index2name':index2name,
                                         'name2index':name2index}
        
    cat_meta = {}
    for xk in indexers.keys():
        x = indexers[xk]
        cat_meta[xk] = len(x.labels)
        
    json.dump(cat_meta,open("categorical-meta.json","w"))
    json.dump(all_index,open("categorical.json",'w'))
    json.dump(dense_meta,open("dense-meta.json",'w'))


    # (optional) store in s3
    # Insert uploading code here

    # save training data
    df = df.repartition(npart)
    df.write.mode("overwrite").csv(write_location,header=True)

    # read test data
    if is_comp:
        df = spark.read.csv(test_path,header=True)
    else:
        df = spark.read.csv(test_path,sep="\t",schema=schema)
    # transform test data
    # change datatype of dense features
    for col_t in dense_features:
        df = df.withColumn(col_t,col(col_t).cast(DoubleType()))

    ## fill nulls
    df = df.fillna('NULL',subset=sparse_features)
    df = df.fillna(0.,subset=dense_features)

    # use already computed statistics
    ## dense features: reuse the min/max statistics computed on the training data
    for col_t in dense_features:
        min_t = dense_meta[col_t][0]
        max_t = dense_meta[col_t][1]
        df = df.withColumn(col_t+"_scaled",(col(col_t)-min_t)/(max_t-min_t)*(scaled_max-scaled_min)+scaled_min)
        df = df.drop(col_t).withColumnRenamed(col_t+"_scaled",col_t)

    ## index categoricals: reuse the indexers fitted on the training data
    for col_t in sparse_features:
        fitted_indexer = indexers[col_t]
        df = fitted_indexer.transform(df)
        df = df.drop(col_t).withColumnRenamed(col_t+"_indexed",col_t)
        df = df.withColumn(col_t,col(col_t).cast(IntegerType()))

    # convert label dtype
    df = df.withColumn("Label",col("Label").cast(DoubleType()))

    df = df.repartition(npart)
    df.write.mode("overwrite").csv(test_write_location,header=True)
Example #16
print(categorical_data)

# COMMAND ----------

# MAGIC %sql
# MAGIC 
# MAGIC select distinct education_1989_revision from CDCData

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
stages = [] # stages in our Pipeline
for categoricalCol in categorical_data:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# COMMAND ----------

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="method_of_disposition", outputCol="label")
stages += [label_stringIdx]

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
Example #17
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Load training data
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Auto').getOrCreate()

data = spark.read.csv(
    "/Users/sai/Documents/GitHub/CSEE5590_BIGDATA_PROGAMMING_Fall2018/ICP14/adult.csv",
    header=True,
    inferSchema="true")

from pyspark.ml.feature import StringIndexer
# Convert target into numerical categories
labelIndexer = StringIndexer(inputCol="Salary", outputCol="label")

from pyspark.ml.feature import VectorAssembler

featureAssembler = VectorAssembler(
    inputCols=["Age", "sex", "capital-gain", "capital-loss", "hours-per-week"],
    outputCol='features')

#output = featureAssembler.transform(data)

splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
  .na.fill(0)\
  .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))\
  .coalesce(5)

# COMMAND ----------

trainDataFrame = preppedDataFrame\
  .where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame\
  .where("InvoiceDate >= '2011-07-01'")

# COMMAND ----------

from pyspark.ml.feature import StringIndexer
indexer = StringIndexer()\
  .setInputCol("day_of_week")\
  .setOutputCol("day_of_week_index")

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder()\
  .setInputCol("day_of_week_index")\
  .setOutputCol("day_of_week_encoded")

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler()\
  .setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
Example #19
                         inferSchema="true",
                         header="false")
# test = spark.read.load("./val/part-00002",
# 	format="csv", sep="\t", inferSchema="true", header="false")
# total = train.union(test)

train, val, _ = train1.randomSplit([0.01, 0.01, 0.98])

train = train.rdd.map(lambda x: [x[10], x[8], x[6], x[1] % 1000]).toDF(
    ["_c10", "_c8", "_c6", "_c1"])
val = val.rdd.map(lambda x: [x[10], x[8], x[6], x[1] % 1000]).toDF(
    ["_c10", "_c8", "_c6", "_c1"])
total = train.union(val)

# create features
indexer = StringIndexer(inputCol="_c10", outputCol="c21")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
indexer = StringIndexer(inputCol="_c8", outputCol="c23")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
indexer = StringIndexer(inputCol="_c6", outputCol="c24")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
# create label
indexer = StringIndexer(inputCol="_c1", outputCol="label")
indexer = indexer.fit(total)
train = indexer.transform(train)
Example #20
data["DEPARTURE_DELAY"].cast(DoubleType())).where("CANCELLED = 0")

data = data.withColumnRenamed('DEPARTURE_DELAY', 
                              'label')

data = data.dropna()

##########
# split into train and test sets

train_data, test_data = data.randomSplit([0.7, 0.3])

##########
# variable transformations for the pipeline

day_of_week_indexer = StringIndexer(inputCol="DAY_OF_WEEK", outputCol="DAY_OF_WEEK_CATEGORICAL")
airline_indexer = StringIndexer(inputCol="AIRLINE", outputCol="AIRLINE_CATEGORICAL")
hour_departure_indexer = StringIndexer(inputCol="HOUR_DEPARTURE", outputCol="HOUR_DEPARTURE_CATEGORICAL")

day_of_week_encoder = OneHotEncoder(inputCol="DAY_OF_WEEK_CATEGORICAL", outputCol="DAY_OF_WEEK_DUMMY")
airline_encoder = OneHotEncoder(inputCol="AIRLINE_CATEGORICAL", outputCol="AIRLINE_DUMMY")
hour_departure_encoder = OneHotEncoder(inputCol="HOUR_DEPARTURE_CATEGORICAL", outputCol="HOUR_DEPARTURE_DUMMY")

assembler = VectorAssembler(inputCols = ["DAY_OF_WEEK_DUMMY", "AIRLINE_DUMMY", "HOUR_DEPARTURE_DUMMY"],
                            outputCol = "features")

##########
# models

# model 1
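# A hedged sketch of what "model 1" might look like here (the original snippet is cut off):
# a LinearRegression on the assembled features, chained with the indexers/encoders above.
# The choice of LinearRegression is an assumption, not taken from the original.
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression

lr_delay = LinearRegression(featuresCol="features", labelCol="label")
pipeline_1 = Pipeline(stages=[day_of_week_indexer, airline_indexer, hour_departure_indexer,
                              day_of_week_encoder, airline_encoder, hour_departure_encoder,
                              assembler, lr_delay])
model_1 = pipeline_1.fit(train_data)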
Example #21
],
                           how="any",
                           thresh=12)

df_node3 = df_node2.randomSplit(seed=1234, weights=[0.6, 0.2, 0.2])

df_node3[2].write.format("parquet").save(
    path="hdfs://namenode:9000/example4/test.parquet")

mmi_value_0_node4 = ["Sex", "Embarked", "Survived"]
mmi_value_1_node4 = ["indexedSex", "indexedEmbarked", "indexedSurvived"]
stages_node4 = []
for i in range(len(mmi_value_0_node4)):
    stages_node4.append(
        StringIndexer(inputCol=mmi_value_0_node4[i],
                      outputCol=mmi_value_1_node4[i],
                      handleInvalid="error",
                      stringOrderType="frequencyDesc"))

mmi_value_0_node5 = ["indexedSex", "indexedEmbarked"]
mmi_value_1_node5 = ['sexVec', 'embarkedVec']
stages_node5 = []
for i in range(len(mmi_value_0_node5)):
    stages_node5.append(
        OneHotEncoder(inputCol=mmi_value_0_node5[i],
                      outputCol=mmi_value_1_node5[i]))

pipeline_stage_node6 = VectorAssembler(
    outputCol="features",
    inputCols=["Pclass", "sexVec", "Age", "SibSp", "Fare", "embarkedVec"])

# flatten the stage lists so that Pipeline(stages=...) receives individual estimators/transformers
stages_node7 = stages_node4 + stages_node5 + [pipeline_stage_node6]
print("********* TRAINING DATA ***********")
print(train.limit(10).toPandas())

reg = 0.1
# Load Regularization Rate from argument
if len(sys.argv) > 1:
    reg = float(sys.argv[1])
print("Regularization Rate is {}.".format(reg))
run_logger.log("Regularization Rate", reg)

# create a new Logistic Regression model.
lr = LogisticRegression(regParam=reg)

# string-index and one-hot encode the education column
si1 = StringIndexer(inputCol=' education', outputCol='ed')
ohe1 = OneHotEncoder(inputCol='ed', outputCol='ed-encoded')

# string-index and one-hot encode the marital-status column
si2 = StringIndexer(inputCol=' marital-status', outputCol='ms')
ohe2 = OneHotEncoder(inputCol='ms', outputCol='ms-encoded')

# string-index the label column into a column named "label"
si3 = StringIndexer(inputCol=' income', outputCol='label')

# assemble the encoded feature columns in to a column named "features"
assembler = VectorAssembler(
    inputCols=['ed-encoded', 'ms-encoded', ' hours-per-week'],
    outputCol="features")

# put together the pipeline
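# A hedged continuation sketch (not in the original snippet): assembling the stages declared
# above into a Pipeline and fitting it on the training DataFrame printed earlier.
from pyspark.ml import Pipeline

pipe = Pipeline(stages=[si1, ohe1, si2, ohe2, si3, assembler, lr])
model = pipe.fit(train)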
Example #23
    .config("spark.executor.cores", 4) \
    .config("spark.driver.memory", "12g") \
    .getOrCreate()

train = spark.read.load("hdfs://10.190.2.112/data/train_set.txt",
  format="csv", sep="\t", inferSchema="true", header="false")
val = spark.read.load("hdfs://10.190.2.112/data/val_set.txt",
  format="csv", sep="\t", inferSchema="true", header="false")
test = spark.read.load("hdfs://10.190.2.112/data/val_set.txt",
  format="csv", sep="\t", inferSchema="true", header="false")

# only for feature transform
total = train.union(val).union(test)

# create features
indexer = StringIndexer(inputCol="_c12", outputCol="c22")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
test = indexer.transform(test)
# create label
indexer = StringIndexer(inputCol="_c11", outputCol="label")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
test = indexer.transform(test)
# One-hot encoder
encoder = OneHotEncoder(inputCol="c22", outputCol="c2")
train = encoder.transform(train)
val = encoder.transform(val)
test = encoder.transform(test)
Example #24
def transform(df, spark, sql_query = None, numerical_features = [], categorical_features = [],\
              normalize = True, normalize_p=2):

    # Apply SQL query
    if sql_query != None:

        df.createOrReplaceTempView("netlytics")
        # Execute Query
        result_df = spark.sql(sql_query)
        df = result_df

    # Transform Strings in OneHot
    schema = df.schema
    feat_to_type = {}
    for struct in schema:
        feat_to_type[struct.name] = str(struct.dataType)

    for feature in categorical_features:

        # Replaces None
        k = col(feature)
        df = df.withColumn(feature, when(k.isNull(), "__NA__").otherwise(k))

        stringIndexer = StringIndexer(inputCol=feature,
                                      outputCol=feature + "_indexed",
                                      handleInvalid="skip")
        model = stringIndexer.fit(df)
        df = model.transform(df)

        encoder = OneHotEncoder(inputCol=feature + "_indexed",
                                outputCol=feature + "_encoded")
        df = encoder.transform(df)

    # Extract Features
    def extract_features(row, numerical_features, feat_to_type):
        output_features = {}

        fields = list(row.asDict().keys())
        for field in fields:
            if field in numerical_features and feat_to_type[
                    field] != "StringType":
                output_features[field] = float(row[field])
            if field.endswith("_encoded"):
                output_list = list(row[field])
                for i, v in enumerate(output_list):
                    tmp_field = field + "_" + str(i)
                    output_features[tmp_field] = float(v)

        features = [
            v for k, v in sorted(output_features.items(),
                                 key=operator.itemgetter(0))
        ]

        old_dict = row.asDict()
        old_dict["features"] = DenseVector(features)
        new_row = Row(**old_dict)
        return new_row

    #spark = df.rdd.
    rdd = df.rdd.map(
        lambda row: extract_features(row, numerical_features, feat_to_type))
    df = spark.createDataFrame(rdd, samplingRatio=1, verifySchema=False)

    # Normalize
    if normalize:
        normalizer = Normalizer(inputCol="features",
                                outputCol="featuresNorm",
                                p=normalize_p)
        df = normalizer.transform(df)
        df = df.drop("features")
        df = df.withColumnRenamed("featuresNorm", "features")

    # Delete intermediate columns:
    schema = df.schema
    feat_to_type = {}
    for struct in schema:
        feat_to_type[struct.name] = str(struct.dataType)

    for feature in feat_to_type:
        if feat_to_type[feature] != "StringType":
            if feature.endswith("_encoded") or feature.endswith("_indexed"):
                df = df.drop(feature)

    return df
Example #25
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import IndexToString, StringIndexer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("IndexToStringExample").getOrCreate()

    # $example on$
    df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"),
                                (4, "a"), (5, "c")], ["id", "category"])

    stringIndexer = StringIndexer(inputCol="category",
                                  outputCol="categoryIndex")
    model = stringIndexer.fit(df)
    indexed = model.transform(df)

    converter = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory")
    converted = converter.transform(indexed)

    converted.select("id", "originalCategory").show()
    # $example off$

    spark.stop()
Example #26
                        "total_day_charge", "total_eve_calls", "total_eve_charge",
                        "total_night_calls", "total_night_charge", "total_intl_calls", 
                        "total_intl_charge","number_customer_service_calls"]

#Review DataSet Balance 
churn_data.registerTempTable("ChurnData")
sqlResult = spark.sql("SELECT churned, COUNT(churned) as Churned FROM ChurnData group by churned")
sqlResult.show()

#Feature Engineering 
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

#String to Index
label_indexer = StringIndexer(inputCol = 'churned', outputCol = 'label')
plan_indexer = StringIndexer(inputCol = 'intl_plan', outputCol = 'intl_plan_indexed')
input_cols=['intl_plan_indexed'] + reduced_numeric_cols


#Feature Vector Assembler
assembler = VectorAssembler(inputCols = input_cols, outputCol = 'features')

#Standard Scaler
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",withStd=True, withMean=False)

#Configure Random Forest Classifier Model 
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

rfclassifier = RandomForestClassifier(labelCol = 'label', featuresCol = 'scaledFeatures')
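# A hedged continuation sketch (not in the original snippet): chaining the indexers,
# assembler, scaler and classifier into a Pipeline; fitting on the full churn_data here
# is an assumption — the original most likely splits into train/test first.
pipeline = Pipeline(stages=[label_indexer, plan_indexer, assembler, scaler, rfclassifier])
rf_model = pipeline.fit(churn_data)
predictions = rf_model.transform(churn_data)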
Example #27
users=spark.createDataFrame(fields)
# +---+------+---+-------------+--------+
# |age|gender| id|  occupations|postcode|
# +---+------+---+-------------+--------+
# | 24|     M|  1|   technician|   85711|
# | 53|     F|  2|        other|   94043|
# | 23|     M|  3|       writer|   32067|
# | 24|     M|  4|   technician|   43537|
# | 33|     F|  5|        other|   15213|
# | 42|     M|  6|    executive|   98101|
# | 57|     M|  7|administrator|   91344|
# | 36|     M|  8|administrator|   05201|
# | 29|     M|  9|      student|   01002|
# | 53|     M| 10|       lawyer|   90703|
# | 39|     F| 11|        other|   30329|
indexer = StringIndexer(inputCol="occupations", outputCol="occupationsIndex",handleInvalid='error')
indexed=indexer.fit(users).transform(users)
#transfer dataframe to rdd by ".rdd"
all_occupations = set(indexed.select("occupations","occupationsIndex").rdd.map(lambda x:(x[0],x[1])).collect())
encoder = OneHotEncoder(inputCol="occupationsIndex", outputCol="occupationsVec")
encoded = encoder.transform(indexed)
encoded.select("occupations","occupationsVec").show()
# +-------------+---------------+
# |  occupations| occupationsVec|
# +-------------+---------------+
# |   technician|(20,[11],[1.0])|
# |        other| (20,[1],[1.0])|
# |       writer| (20,[7],[1.0])|
# |   technician|(20,[11],[1.0])|
# |        other| (20,[1],[1.0])|
# |    executive| (20,[8],[1.0])|
    "content", "navigation", "View", "view", "mobile", "version", "Subscribe",
    "subscribe", "Now", "now", "Log", "log", "In", "in", "setting", "settings",
    "Site", "site", "Loading", "loading", "article", "next", "previous",
    "Advertisement", "ad", "advertisement", "Supported", "supported", "by",
    "Share", "share", "Page", "page", "Continue", "continue", "main", "story",
    "newsletter", "Sign", "Up", "Manage", "email", "preferences", "Not", "you",
    "opt", "out", "contact", "us", "anytime", "thank", "subscribing", "see",
    "more", "email"
]
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered1").setStopWords(add_stopwords)
stopwordsRemover1 = StopWordsRemover(
    inputCol="filtered1", outputCol="filtered").setStopWords(add_stopwords_1)

#Extracting features
label_stringIdx = StringIndexer(inputCol="category", outputCol="label")
hashingTF = HashingTF(inputCol="filtered",
                      outputCol="rawFeatures",
                      numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features",
          minDocFreq=5)  #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[
    regexTokenizer, stopwordsRemover, stopwordsRemover1, hashingTF, idf,
    label_stringIdx
])

#training the data -- Logistic regression
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed=100)
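# A hedged continuation sketch (not in the original snippet): fitting the logistic
# regression the comment above refers to and scoring the held-out split.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="label",
                                              metricName="accuracy")
print("accuracy:", evaluator.evaluate(predictions))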
Example #29
def main(spark, train_data_file, test_data_file, model_file):

    time_a = time.time()
    start = time_a

    # Use Validation and Test user_id to filter Train data, to get the 110k mandatory users
    # Stored here hdfs:/user/dz584/cf_train_sample.parquet
    """
    training_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_train.parquet')
    validation_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_validation.parquet')
    testing_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_test.parquet')

    validandtest_userid = validation_data.union(testing_data).select('user_id').distinct()
    validandtest_userid.createOrReplaceTempView('validandtest_userid')

    training_data.createOrReplaceTempView('training_data')
    training_data = spark.sql("SELECT * FROM training_data WHERE user_id IN (SELECT user_id FROM validandtest_userid GROUP BY user_id)")
    training_data.write.parquet("cf_train_sample.parquet")
    """

    training_data = spark.read.parquet(train_data_file)
    indexer_id = StringIndexer(inputCol="user_id", outputCol="userindex").setHandleInvalid("skip")
    indexer_id_model = indexer_id.fit(training_data)
    indexer_item = StringIndexer(inputCol="track_id", outputCol="itemindex").setHandleInvalid("skip")
    indexer_item_model = indexer_item.fit(training_data)

    training_data = indexer_id_model.transform(training_data)
    training_data = indexer_item_model.transform(training_data)

    testing_data = spark.read.parquet(test_data_file)
    testing_data = indexer_id_model.transform(testing_data)
    testing_data = indexer_item_model.transform(testing_data)

    training_data = training_data.select('userindex','itemindex','count')
    testing_data = testing_data.select('userindex','itemindex','count')

    # Add Log Compression
    training_data.createOrReplaceTempView('training_data')
    training_data = spark.sql("SELECT *, count+1 as plus_count FROM training_data")
    training_data = training_data.withColumn("log_count",F.log("plus_count"))

    print('Finished Indexing!')
    time_b = time.time()
    print(time_b - time_a)
    time_a = time_b

    result_dict = {}
    rank_list = [600]#[10,20,30,50]
    reg_param_list = [0.7]#[0.1,0.5]
    alpha_list = [1]#[1,1.5]

    for rank in rank_list:
        for reg_param in reg_param_list:
            for alpha in alpha_list:

                current_key = (rank,reg_param,alpha)
                als = ALS(maxIter=5, userCol="userindex", itemCol="itemindex", ratingCol="log_count", rank=rank, regParam=reg_param, alpha=alpha)
                model = als.fit(training_data)

                print('Finished Modeling with Param:', current_key)
                time_b = time.time()
                print(time_b - time_a)
                time_a = time_b

                prediction = model.recommendForAllUsers(500).select('userindex', 'recommendations.itemindex')
                print('Finished Prediction DF!')

                testing_df = testing_data.groupBy('userindex').agg(expr('collect_list(itemindex) as item_list'))
                print('Finished Label DF!')

                predictionAndLabels = prediction.join(testing_df, 'userindex')
                print('Joined Prediction and Labels!')
                time_b = time.time()
                print(time_b - time_a)
                time_a = time_b

                pred_df = predictionAndLabels.select(['itemindex','item_list']).rdd.map(list)

                metrics = RankingMetrics(pred_df)

                print('Ranking Metrics Calculated!')
                time_b = time.time()
                print(time_b - time_a)
                time_a = time_b

                eva = metrics.meanAveragePrecision
                result_dict[current_key] = eva

                print(current_key,"parameter combination has been trained! MAP= ", eva)
                time_b = time.time()
                print(time_b - time_a)
                time_a = time_b

    best_model_param = max(result_dict, key=result_dict.get)
    als = ALS(maxIter=5, userCol="userindex", itemCol="itemindex", ratingCol="count", rank=best_model_param[0], regParam=best_model_param[1], alpha=best_model_param[2])
    als.fit(training_data).write().overwrite().save(model_file)

    print('Process Finished!')
    print(time.time() - start)
Example #30
#pca choose features
categorical = {
    'Cat1': 11,
    'Cat3': 7,
    'Cat6': 7,
    'Cat8': 4,
    'Cat9': 2,
    'Cat10': 4,
    'Cat11': 7,
    'Cat12': 7
}
from pyspark.ml.feature import StringIndexer
for col, num in categorical.items():
    name = col + '_index'
    indexer = StringIndexer(inputCol=col, outputCol=name)
    data = indexer.fit(data).transform(data)
data = data.select('Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7',
                   'Var8', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
                   'Cat1_index', 'Cat3_index', 'Cat6_index', 'Cat8_index',
                   'Cat9_index', 'Cat10_index', 'Cat11_index', 'Cat12_index',
                   'Calendar_Year', 'Model_Year', 'Claim_Amount')
from pyspark.ml.feature import OneHotEncoderEstimator
category = [
    'Cat1_index', 'Cat3_index', 'Cat6_index', 'Cat8_index', 'Cat9_index',
    'Cat10_index', 'Cat11_index', 'Cat12_index'
]
new_cat = []
for col in category:
    name = col.replace('_index', '_vec')
    new_cat.append(name)
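# A hedged continuation sketch (not in the original snippet): one-hot encoding the indexed
# categoricals into the *_vec columns named above, assembling a feature vector, and running
# the PCA hinted at by the "pca choose features" comment. k=10 is an arbitrary placeholder.
from pyspark.ml.feature import VectorAssembler, PCA

encoder = OneHotEncoderEstimator(inputCols=category, outputCols=new_cat)
data = encoder.fit(data).transform(data)

numeric_cols = ['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8',
                'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4']
assembler = VectorAssembler(inputCols=numeric_cols + new_cat, outputCol='features')
data = assembler.transform(data)

pca = PCA(k=10, inputCol='features', outputCol='pca_features')
data = pca.fit(data).transform(data)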