def __init__(self, spark_session, dataset_path):
    """Init the recommendation engine given a Spark session and a dataset path."""
    logger.info("Starting up the Recommendation Engine: ")
    self.spark_session = spark_session

    # Load Kindle data for later use, reading from the dataset path passed to the constructor
    logger.info("Loading Kindle data...")
    data_file_path = os.path.join(dataset_path, 'kindle_reviews.csv')
    df = spark_session.read.csv(data_file_path, header=True, inferSchema=True).na.drop()

    # Index reviewer ids; handleInvalid="keep" maps unseen ids to an extra index
    stringindexer = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_index')
    stringindexer.setHandleInvalid("keep")
    model = stringindexer.fit(df)
    indexed = model.transform(df)
    self.uid_indexer = model

    # Index item (asin) ids the same way
    stringindexer_item = StringIndexer(inputCol='asin', outputCol='asin_index')
    stringindexer_item.setHandleInvalid("keep")
    model = stringindexer_item.fit(indexed)
    indexed = model.transform(indexed)
    self.iid_indexer = model

    self.datas = df
    self.column_trained = indexed.selectExpr(
        ['reviewerID_index as user_id', 'asin_index as item_id', 'overall as rating'])

    # Pre-calculate Kindle ratings counts
    self.__count_and_average_ratings()
    # Train the model
    self.__train_model()
def index_str_columns(df_, cols_to_index: list):
    """Index str columns

    :param df_: pyspark DF
    :param cols_to_index: Columns to index
    :return: pyspark df with indexed columns
    """
    for col_ in cols_to_index:
        indexer_ = StringIndexer(inputCol=col_, outputCol="indexed{}".format(col_))
        indexer_.setHandleInvalid("skip")
        model_ = indexer_.fit(df_)
        df_ = model_.transform(df_)
    return df_
def index_columns(sdf_, cols: list):
    """Index string columns

    :param sdf_: pyspark dataframe
    :param cols: Columns to be indexed
    :return: Pyspark Dataframe with newly added columns
    """
    col_names = sdf_.columns
    df_corr = sdf_.select(*col_names)
    for col_ in cols:
        indexer_ = StringIndexer(inputCol=col_, outputCol="indexed{}".format(col_))
        indexer_.setHandleInvalid("skip")
        model_ = indexer_.fit(df_corr)
        sdf_ = model_.transform(sdf_)
    return sdf_
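A minimal usage sketch for the helpers above, assuming an active SparkSession bound to spark; the toy column names are illustrative only:

# Hypothetical usage of index_str_columns on a small DataFrame.
toy_df = spark.createDataFrame(
    [("red", 1), ("blue", 2), ("red", 3), (None, 4)],
    ["color", "value"],
)
indexed_df = index_str_columns(toy_df, ["color"])
# Adds an "indexedcolor" column with numeric indices; the row whose "color"
# is null is dropped on transform because handleInvalid is set to "skip".
indexed_df.show()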
def preprocess_df(df, selected_columns, label_column):
    """Preprocess the dataframe in order to train the models.

    First we add a features column in which all predictor columns are assembled
    together. Then we add a label column with the index of each class.
    """
    assembler_features = VectorAssembler(inputCols=selected_columns, outputCol="features")
    label_indexes = StringIndexer(inputCol=label_column, outputCol='label')
    # Avoid null issues
    label_indexes = label_indexes.setHandleInvalid("skip")
    stages = []
    stages += [assembler_features]
    stages += [label_indexes]
    # Add both new columns through a Pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df)
    preprocessed_df = pipeline_model.transform(df)
    cols = ['label', 'features'] + df.columns
    preprocessed_df = preprocessed_df.select(cols)
    return preprocessed_df
def test_string_indexer_handle_invalid(self):
    df = self.spark.createDataFrame([
        (0, "a"),
        (1, "d"),
        (2, None)], ["id", "label"])

    si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                        stringOrderType="alphabetAsc")
    model1 = si1.fit(df)
    td1 = model1.transform(df)
    actual1 = td1.select("id", "indexed").collect()
    expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
    self.assertEqual(actual1, expected1)

    si2 = si1.setHandleInvalid("skip")
    model2 = si2.fit(df)
    td2 = model2.transform(df)
    actual2 = td2.select("id", "indexed").collect()
    expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
    self.assertEqual(actual2, expected2)
# print(customer_complaints.take(5))

# Convert the RDD into a DataFrame so we can train a model from the SparkML library, and
# for our own convenience give the DataFrame columns appropriate names.
customer_complaints_DF = customer_complaints.toDF(["string_label", "features"])

class_count = customer_complaints_DF.groupBy("string_label").count()\
    .where(col("count") > 2000)
customer_complaints_DF = customer_complaints_DF.join(class_count, "string_label", "inner").drop("count")

# Use StringIndexer to transform the string labels into integers
stringIndexer = StringIndexer(inputCol="string_label", outputCol="label")
stringIndexer.setHandleInvalid("skip")
stringIndexerModel = stringIndexer.fit(customer_complaints_DF)
customer_complaints_DF = stringIndexerModel.transform(customer_complaints_DF)
# customer_complaints_DF.show(15)
customer_complaints_DF.groupBy("label").count().show()

# Split into train and test sets
# (train, test) = customer_complaints_DF.randomSplit([0.75, 0.25])
# Taking 75% of each label into the training set
train = customer_complaints_DF.sampleBy("label", fractions={
    0: 0.75,
    1: 0.75,
    2: 0.75,
    3: 0.75,
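The hard-coded fractions dictionary above is cut off; a hedged alternative sketch (assuming the remaining entries follow the same 0.75 pattern, and with an illustrative seed) derives the per-label fractions from the distinct labels and takes the complement as the test set:

# Sketch only: derive the sampling fractions rather than hard-coding one entry per class.
labels = [row["label"] for row in customer_complaints_DF.select("label").distinct().collect()]
fractions = {label: 0.75 for label in labels}
train = customer_complaints_DF.sampleBy("label", fractions=fractions, seed=42)
test = customer_complaints_DF.subtract(train)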
dfTrain = dfTrain.groupby('Street Name', 'Violation Code').count()
dfTest = dfTest.groupby('Street Name', 'Violation Code').count()
# dfTrain.show()
# dfTest.show()

# Remove the space in the column names by renaming them to 'StreetName' & 'ViolationCode'
dfTrain = dfTrain.withColumnRenamed('Street Name', 'StreetName').withColumnRenamed(
    'Violation Code', 'ViolationCode')
dfTest = dfTest.withColumnRenamed('Street Name', 'StreetName').withColumnRenamed(
    'Violation Code', 'ViolationCode')

# Index 'StreetName' [string type] as 'StreetCode' [numeric type] for the CF model
indexer = StringIndexer(inputCol='StreetName', outputCol="StreetCode")
indexModel = indexer.setHandleInvalid("skip").fit(dfTrain)

# Scale/normalize the 'count' onto a 1-5 scale as the 'Rating' parameter for train and test data
# Training data
dfTrainAssembler = VectorAssembler(inputCols=['count'], outputCol="countVector")
dfTrainAssembled = dfTrainAssembler.transform(dfTrain)
dfTrainScaler = MinMaxScaler(inputCol="countVector", outputCol="countScaled", min=1, max=5)
dfTrainScalarModel = dfTrainScaler.fit(dfTrainAssembled)
dfTrain = dfTrainScalarModel.transform(dfTrainAssembled)

# Testing data
dfTestAssembler = VectorAssembler(inputCols=['count'], outputCol="countVector")
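The fitted indexModel is never applied in this excerpt; a minimal sketch of the step that would typically follow, assuming both splits need the numeric StreetCode column:

# Sketch: apply the StringIndexerModel (fit on the training split) to both DataFrames.
# Test rows whose street name was unseen during fitting are dropped (handleInvalid="skip").
dfTrain = indexModel.transform(dfTrain)
dfTest = indexModel.transform(dfTest)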
# Release memory
# keep.unpersist()

# Print requested outputs
print('\n\nRequested Outputs\n')
for i in tfidf.take(5):
    print(i)
print('\n')
# keep.unpersist()

# Create DataFrame
df = tfidf.toDF(['category', 'features'])

stringIndexer = StringIndexer(inputCol='category', outputCol='label')
stringIndexer.setHandleInvalid('skip')
stringIndexerModel = stringIndexer.fit(df)
df = stringIndexerModel.transform(df)

# Grab unique labels
uniq = df.select('label').distinct().collect()

# Split ratio for each label
fractions = {i: 0.8 for i in range(len(uniq) + 1)}

# Split into train and test sets
train_set = df.sampleBy('label', fractions=fractions, seed=seed).cache()
test_set = df.subtract(train_set)

# Get the number of documents in each set
print('\n\nSize of train set: ', train_set.count(), '\n\n')
one_hot_cols = []
max_distinct = 0
for k in categorical_cols:
    # Count the distinct values to decide which columns get one-hot encoded
    count = criteoDF.select(k).distinct().count()
    if count <= 20:
        one_hot_cols.append(k)
    if count > max_distinct:
        max_distinct = count

categorical_cols_encoded = ["{}_encoded".format(c) for c in categorical_cols]
for i in range(len(categorical_cols)):
    stringindex_vector = StringIndexer(inputCol=categorical_cols[i],
                                       outputCol=categorical_cols_encoded[i])
    criteoDF = stringindex_vector.setHandleInvalid("skip").fit(
        criteoDF).transform(criteoDF)
criteoDF = criteoDF.drop(*categorical_cols)

one_hot_cols_new = ["{}_encoded".format(c) for c in one_hot_cols]
one_hot_cols_encoded = ["{}_one_hot".format(c) for c in one_hot_cols_new]
for i in range(len(one_hot_cols)):
    onehotencoder_vector = OneHotEncoder(inputCol=one_hot_cols_new[i],
                                         outputCol=one_hot_cols_encoded[i])
    criteoDF = onehotencoder_vector.transform(criteoDF)
criteoDF = criteoDF.drop(*one_hot_cols_new)

feature_cols = [c for c in criteoDF.columns if c != 'label']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
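The bare OneHotEncoder.transform call above only works on Spark versions where OneHotEncoder is still a plain Transformer (before 3.0). On Spark 3.x it is an Estimator, so the loop needs a fit step; a hedged sketch of that variant:

# Assumes Spark 3.x, where OneHotEncoder must be fit before it can transform.
from pyspark.ml.feature import OneHotEncoder

for i in range(len(one_hot_cols)):
    onehotencoder_vector = OneHotEncoder(inputCols=[one_hot_cols_new[i]],
                                         outputCols=[one_hot_cols_encoded[i]])
    criteoDF = onehotencoder_vector.fit(criteoDF).transform(criteoDF)
criteoDF = criteoDF.drop(*one_hot_cols_new)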
def DataPreparation():
    # Create the Spark session
    spark = SparkSession.builder.appName('SistemaDeDeteccion').master(
        "local[*]").getOrCreate()
    # Load the dataset
    data = spark.read.csv("Burnout_Data.csv", header=True, inferSchema=True)
    # Keep only the columns with importance p > 1 according to the component analysis
    data = data.select('Tiempo_PlazaActual', 'EstadoCivil', 'Burnout_Antes', 'Hora_Social',
                       'Horas_Cuidados', 'Calorias', 'Peso', 'Contrato_Adjunto', 'Musica',
                       'Sexo', 'Estudias', 'Sales_Social', 'Edad', 'Estado_Animo',
                       'Tiempo_Vida_Laboral', 'Hijos', 'Lectura', 'Hora_Gratificante',
                       'Horas_Activ_Fisica')
    # Store the column names in a variable
    cols = data.columns

    # Import the libraries needed to convert categorical data into data the algorithms
    # can handle, i.e. transform it into numbers
    from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

    categoricalColumns = [
        'Contrato_Adjunto', 'Musica', 'Sexo', 'Estudias', 'Sales_Social', 'Edad',
        'Estado_Animo', 'Lectura', 'EstadoCivil'
    ]
    # This variable collects every preprocessing step so they can later be applied in the Pipeline
    stages = []
    for categoricalCol in categoricalColumns:
        # Index each of the categorical variables in the list
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + 'Index')
        # Once indexed, OneHotEncoderEstimator assigns a number to each value of the
        # categorical variable
        encoder = OneHotEncoderEstimator(
            inputCols=[stringIndexer.getOutputCol()],
            outputCols=[categoricalCol + "classVec"])
        # Add this step to stages, keeping invalid values instead of dropping them
        stages += [stringIndexer.setHandleInvalid("keep"), encoder]

    # Index the variable we want to predict, Burnout_Antes (whose values are TRUE and FALSE),
    # as the label
    label_stringIdx = StringIndexer(inputCol="Burnout_Antes", outputCol="label")
    # Add this step to stages, keeping invalid values instead of dropping them
    stages += [label_stringIdx.setHandleInvalid("keep")]

    numericCols = [
        'Tiempo_PlazaActual', 'Hora_Social', 'Horas_Cuidados', 'Calorias', 'Peso',
        'Tiempo_Vida_Laboral', 'Hijos', 'Hora_Gratificante', 'Horas_Activ_Fisica'
    ]
    # With the categorical variables transformed into numbers, assemble them together with
    # the numeric variables into a single vector
    assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
    # This step produces the "features" column: a vector holding the numeric and categorical variables
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    # Add this step to stages, keeping invalid values instead of dropping them
    stages += [assembler.setHandleInvalid("keep")]

    from pyspark.ml import Pipeline
    # Initialise the Pipeline and pass it the list of steps to execute, stored in stages
    pipeline = Pipeline(stages=stages)
    # Fit and run the pipeline, which performs the data preprocessing
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    path = 'modelo_Pipeline'
    os.mkdir(path)
    # Save this model, because for prediction we need to apply the same model to new data
    pipelineModel.save(os.path.join(path, 'Pipeline'))

    # Select the label and features columns, plus cols, which holds the column names from
    # before the preprocessing
    selectedCols = ['label', 'features'] + cols
    data = data.select(selectedCols)

    # For training and testing, use randomSplit to divide the dataset into 70% training
    # and 30% testing
    train, test = data.randomSplit([0.7, 0.3])
    # Print the number of rows in each split and return the splits for use by the algorithms
    print("Training Dataset Count: " + str(train.count()))
    print("Test Dataset Count: " + str(test.count()))
    return train, test
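Because the pipeline model is saved precisely so the same transformations can be replayed on new data, a minimal loading sketch (new_data is a placeholder DataFrame with the raw columns):

from pyspark.ml import PipelineModel

# Sketch: reload the saved preprocessing pipeline and apply it before scoring.
loaded_pipeline = PipelineModel.load(os.path.join('modelo_Pipeline', 'Pipeline'))
prepared = loaded_pipeline.transform(new_data)  # new_data is hypothetical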
def _execute(self):
    df = self.df_from_temp_table(self.kwargs["previous_job_temp_table"])
    if self.target_label in df.columns:
        df = df.drop(self.target_label)
    cols_to_index = [
        k for k, v in df.dtypes if (v == "string" and k != self.target_label)
    ]
    cols_not_to_index = [k for k, v in df.dtypes if v != "string"]
    feature_cols = cols_not_to_index + [
        "indexed{}".format(col_) for col_ in cols_to_index
    ]
    df = self.create_feature_vector(df, feature_cols)

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    label_indexer = StringIndexer(
        inputCol=self.target_label,
        outputCol="{}loan_status".format("indexed"))
    label_indexer.setHandleInvalid("skip")
    label_indexer = label_indexer.fit(df)

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 12 distinct values are treated as continuous.
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=12)
    feature_indexer.setHandleInvalid("skip")

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestClassifier(labelCol="{}loan_status".format("indexed"),
                                featuresCol="indexedFeatures",
                                predictionCol="prediction",
                                numTrees=10)

    # Convert indexed labels back to original labels.
    label_converter = IndexToString(inputCol="prediction",
                                    outputCol="predictedLabel",
                                    labels=label_indexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(
        stages=[label_indexer, feature_indexer, rf, label_converter])

    # Train model. This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select (prediction, true label) and compute test accuracy
    evaluator = MulticlassClassificationEvaluator(
        labelCol="{}loan_status".format("indexed"),
        predictionCol="prediction",
        metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    self.metrics["accuracy"] = accuracy
    return str(accuracy)
predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(prediction) print("Test Error = %g " % (1.0 - accuracy)) #get the best model best_model = cvModel.bestModel return best_model shoes_df = spark.read.parquet("hdfs:///data/products/shoes.parquet") shoes_df.show(10) stages = [] label_stringIdx = StringIndexer(inputCol='category_name', outputCol='label') label_stringIdx.setHandleInvalid("skip") stages += [label_stringIdx] tokenizer = Tokenizer(inputCol="descr", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="tf_features") idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idf_features") stages += [tokenizer, hashingTF, idf] assemblerInputs = ['idf_features'] assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features") stages += [assembler] rf = RandomForestClassifier(labelCol="label", featuresCol="features") paramGrid = ParamGridBuilder() \ .addGrid(hashingTF.numFeatures, [10000]) \
print(len(data))
df = sqlContext.createDataFrame(data, schema=["category", "text"])

# Regular expression tokenizer
regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

# Stop words
stop_words = list(set(stopwords.words('english')))
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)

# Bag-of-words counts
count_vectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

label_string_index = StringIndexer(inputCol="category", outputCol="label")
label_string_index.setHandleInvalid("keep")

pipeline = Pipeline(stages=[regex_tokenizer, stop_words_remover, count_vectors, label_string_index])

(training_data, test_data) = df.randomSplit([0.8, 0.2], seed=100)
pipeline_fit = pipeline.fit(training_data)
pipeline_fit.save("rf_pipeline")
training_data_set = pipeline_fit.transform(training_data)
training_data_set.show(5)
# stages = pipeline_fit.stages
# vec = [s for s in stages if isinstance(s, CountVectorizerModel)]
# v1 = vec[0].vocabulary
# print(len(v1))
print("Training: " + str(training_data_set.count()))
    'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status',
    'Product_Category_1', 'Product_Category_2', 'Product_Category_3',
]
target = 'Purchase'

minDf = df.withColumn('row_index', F.monotonically_increasing_id())
for column in categorical_cols:
    print('Transforming column: ', column)
    output_col = "_" + column.lower()
    indexer = StringIndexer(inputCol=column, outputCol=output_col)
    indexed = indexer.setHandleInvalid("keep").fit(df).transform(df)
    maxDf = indexed.withColumn('row_index', F.monotonically_increasing_id()).select(
        'row_index', output_col)
    minDf = minDf.join(maxDf, on=["row_index"]).sort("row_index")

final_columns = []
for column in categorical_cols:
    final_columns.append(F.col("_" + column.lower()).alias(column.lower()))
final_columns.append(target)

minDf = minDf.select(final_columns)
minDf.coalesce(1).write.csv("gs://doitintl_black_friday/data/train_data.csv",
                            header=True, mode="overwrite")
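The row_index join above depends on monotonically_increasing_id assigning the same ids to both DataFrames, which Spark does not guarantee for separately computed results; a hedged alternative sketch indexes the columns in place with one Pipeline of StringIndexers and skips the join (output names follow the original "_<column>" convention):

# Sketch: index every categorical column with a single fitted Pipeline instead of joining on row_index.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

indexers = [
    StringIndexer(inputCol=column, outputCol="_" + column.lower()).setHandleInvalid("keep")
    for column in categorical_cols
]
indexed_df = Pipeline(stages=indexers).fit(df).transform(df)
out_df = indexed_df.select(
    [F.col("_" + column.lower()).alias(column.lower()) for column in categorical_cols] + [target])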