Example No. 1
    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path
        """
 
        logger.info("Starting up the Recommendation Engine: ")
 
        self.spark_session = spark_session
 
        # Load kindle data for later use
        logger.info("Loading Kindle data...")

        # Load the reviews CSV from the provided dataset path
        data_file_path = os.path.join(dataset_path, 'kindle_reviews.csv')
        df = spark_session.read.csv(data_file_path, header=True, inferSchema=True).na.drop()

        # Index reviewer IDs; "keep" maps IDs unseen at fit time to an extra bucket
        stringindexer = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_index')
        stringindexer.setHandleInvalid("keep")
        model = stringindexer.fit(df)
        indexed = model.transform(df)
        self.uid_indexer = model

        # Index item IDs (ASINs) the same way
        stringindexer_item = StringIndexer(inputCol='asin', outputCol='asin_index')
        stringindexer_item.setHandleInvalid("keep")
        model = stringindexer_item.fit(indexed)
        indexed = model.transform(indexed)
        self.iid_indexer = model

        self.datas = df
        self.column_trained = indexed.selectExpr(
            ['reviewerID_index as user_id', 'asin_index as item_id', 'overall as rating'])
        
        # Pre-calculate kindle ratings counts
        self.__count_and_average_ratings()
 
        # Train the model
        self.__train_model()
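
A minimal usage sketch (hypothetical names: an engine instance engine and its SparkSession spark_session); because both indexers were fitted with handleInvalid="keep", reviewer or item IDs unseen during training are mapped to one extra bucket instead of raising an error:

new_reviews = spark_session.createDataFrame(
    [("UNSEEN_USER", "UNSEEN_ASIN", 5.0)],
    ["reviewerID", "asin", "overall"])
indexed_new = engine.uid_indexer.transform(new_reviews)   # unseen reviewerID -> extra index
indexed_new = engine.iid_indexer.transform(indexed_new)   # unseen asin -> extra index
indexed_new.select("reviewerID_index", "asin_index").show()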
Example No. 2
    def index_str_columns(df_, cols_to_index: list):
        """Index str columns

        :param df_: pyspark DF
        :param cols_to_index: Columns to index
        :return: pyspark df with indexed columns
        """
        for col_ in cols_to_index:
            indexer_ = StringIndexer(inputCol=col_,
                                     outputCol="indexed{}".format(col_))
            indexer_.setHandleInvalid("skip")
            model_ = indexer_.fit(df_)
            df_ = model_.transform(df_)
        return df_
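
A minimal usage sketch (assuming a SparkSession named spark and the helper above in scope); with handleInvalid="skip", rows whose indexed column is null, or holds a label unseen at fit time, are dropped at transform time:

df = spark.createDataFrame([("red", "S"), ("blue", None), ("red", "M")],
                           ["color", "size"])
indexed = index_str_columns(df, ["color", "size"])
indexed.show()   # the row with a null "size" is filtered out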
Example No. 3
    def index_columns(sdf_, cols: list):
        """Index string columns

        :param sdf_: pyspark dataframe
        :param cols: Columns to be indexed
        :return: Pyspark Dataframe with newly added columns
        """
        col_names = sdf_.columns
        df_corr = sdf_.select(*col_names)

        for col_ in cols:
            indexer_ = StringIndexer(inputCol=col_,
                                     outputCol="indexed{}".format(col_))
            indexer_.setHandleInvalid("skip")
            model_ = indexer_.fit(df_corr)
            sdf_ = model_.transform(sdf_)

        return sdf_
Example No. 4
def preprocess_df(df, selected_columns, label_column):
    """Preprocesses the dataframe in order to train the models. First we add
        a feature column in which all predicted columns are together. Then we add
        a label column with the indexes of each class."""
    assembler_features = VectorAssembler(inputCols=selected_columns,
                                         outputCol="features")
    label_indexes = StringIndexer(inputCol=label_column, outputCol='label')
    # Avoid null issues: skip rows with a null or unseen label
    label_indexes = label_indexes.setHandleInvalid("skip")
    stages = []
    stages += [assembler_features]
    stages += [label_indexes]
    # Add both columns to a new dataframe via a Pipeline
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df)
    preprocessed_df = pipeline_model.transform(df)
    cols = ['label', 'features'] + df.columns
    preprocessed_df = preprocessed_df.select(cols)
    return preprocessed_df
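
A minimal usage sketch (hypothetical column names, assuming a SparkSession named spark and the imports used above: VectorAssembler, StringIndexer and Pipeline from pyspark.ml):

df = spark.createDataFrame([(1.0, 2.0, "cat"), (3.0, 4.0, "dog"), (5.0, 6.0, None)],
                           ["f1", "f2", "class"])
prepared = preprocess_df(df, selected_columns=["f1", "f2"], label_column="class")
prepared.select("label", "features").show()   # the row with a null class is skipped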
Example No. 5
    def test_string_indexer_handle_invalid(self):
        df = self.spark.createDataFrame([
            (0, "a"),
            (1, "d"),
            (2, None)], ["id", "label"])

        si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                            stringOrderType="alphabetAsc")
        model1 = si1.fit(df)
        td1 = model1.transform(df)
        actual1 = td1.select("id", "indexed").collect()
        expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
        self.assertEqual(actual1, expected1)

        si2 = si1.setHandleInvalid("skip")
        model2 = si2.fit(df)
        td2 = model2.transform(df)
        actual2 = td2.select("id", "indexed").collect()
        expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
        self.assertEqual(actual2, expected2)
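
For completeness, the remaining option is the default, handleInvalid="error", which raises when a null or unseen label is encountered; a brief hedged sketch against the same DataFrame:

si3 = StringIndexer(inputCol="label", outputCol="indexed")  # handleInvalid defaults to "error"
# si3.fit(df).transform(df)  # would fail here, because the "label" column contains a null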
#print(customer_complaints.take(5))

# Convert the RDD to a DataFrame so we can train a model from the SparkML library and,
# for convenience, give the DataFrame columns suitable names
customer_complaints_DF = customer_complaints.toDF(["string_label", "features"])

class_count = customer_complaints_DF.groupBy("string_label").count()\
                        .where(col("count") > 2000)

customer_complaints_DF = customer_complaints_DF.join(class_count,
                                                     "string_label",
                                                     "inner").drop("count")

# Use a StringIndexer to transform the string labels into integer indices
stringIndexer = StringIndexer(inputCol="string_label", outputCol="label")
stringIndexer.setHandleInvalid("skip")
stringIndexerModel = stringIndexer.fit(customer_complaints_DF)
customer_complaints_DF = stringIndexerModel.transform(customer_complaints_DF)

#customer_complaints_DF.show(15)
customer_complaints_DF.groupBy("label").count().show()
# Split into train and test sets
#(train, test) = customer_complaints_DF.randomSplit([0.75, 0.25])

# Take 75% of each label into the training set (matching the 0.75 fractions below)
train = customer_complaints_DF.sampleBy("label",
                                        fractions={
                                            0: 0.75,
                                            1: 0.75,
                                            2: 0.75,
                                            3: 0.75,
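
The snippet is cut off inside the fractions dictionary; a hedged sketch of building the per-label fractions programmatically from the indexed labels instead of listing them by hand (the seed value is illustrative):

fractions = {row["label"]: 0.75
             for row in customer_complaints_DF.select("label").distinct().collect()}
train = customer_complaints_DF.sampleBy("label", fractions=fractions, seed=42)
test = customer_complaints_DF.subtract(train)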
dfTrain = dfTrain.groupby('Street Name', 'Violation Code').count()
dfTest = dfTest.groupby('Street Name', 'Violation Code').count()
#dfTrain.show()
#dfTest.show()

#Removing the space in the column names by renaming them to 'StreetName' & 'ViolationCode'
dfTrain = dfTrain.withColumnRenamed('Street Name',
                                    'StreetName').withColumnRenamed(
                                        'Violation Code', 'ViolationCode')
dfTest = dfTest.withColumnRenamed('Street Name',
                                  'StreetName').withColumnRenamed(
                                      'Violation Code', 'ViolationCode')

#Indexing 'StreetName' [String type] to 'StreetCode' [Numeric type] for the CF model
indexer = StringIndexer(inputCol='StreetName', outputCol="StreetCode")
indexModel = indexer.setHandleInvalid("skip").fit(dfTrain)

#Scaling/Normalizing the 'count' on a 1-5 scale for 'Rating' parameter for train and test data
#Training Data
dfTrainAssembler = VectorAssembler(inputCols=['count'],
                                   outputCol="countVector")
dfTrainAssembled = dfTrainAssembler.transform(dfTrain)
dfTrainScaler = MinMaxScaler(inputCol="countVector",
                             outputCol="countScaled",
                             min=1,
                             max=5)
dfTrainScalerModel = dfTrainScaler.fit(dfTrainAssembled)
dfTrain = dfTrainScalerModel.transform(dfTrainAssembled)

#Testing Data
dfTestAssembler = VectorAssembler(inputCols=['count'], outputCol="countVector")
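
The snippet ends before indexModel is applied; a hedged sketch of how the fitted indexer would typically be used on both splits, where "skip" means test rows whose StreetName was not seen during training are dropped:

dfTrain = indexModel.transform(dfTrain)
dfTest = indexModel.transform(dfTest)   # rows with unseen street names are skipped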
Example No. 9
    # Release Memory
    #keep.unpersist()

    # Print Requested Outputs
    print('\n\nRequested Outputs\n')
    for i in tfidf.take(5):
        print(i)
    print('\n')

    #keep.unpersist()
    # Create DataFrame
    df = tfidf.toDF(['category', 'features'])

    stringIndexer = StringIndexer(inputCol='category', outputCol='label')
    stringIndexer.setHandleInvalid('skip')
    stringIndexerModel = stringIndexer.fit(df)
    df = stringIndexerModel.transform(df)

    # Grab unique labels
    uniq = df.select('label').distinct().collect()

    # Split Ratio for each Label
    fractions = {i: 0.8 for i in range(len(uniq) + 1)}

    # Split to train-test
    train_set = df.sampleBy('label', fractions=fractions, seed=seed).cache()
    test_set = df.subtract(train_set)

    # Get number of documents for each set
    print('\n\nSize of train set: ', train_set.count(), '\n\n')
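
A hedged alternative for the split fractions: building the dictionary from the distinct label values themselves avoids assuming they are the consecutive integers 0..n and sidesteps any int-versus-double key mismatch, since the label column holds doubles:

fractions = {row["label"]: 0.8 for row in uniq}
train_set = df.sampleBy("label", fractions=fractions, seed=seed).cache()
test_set = df.subtract(train_set)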
one_hot_cols = []
max_distinct = 0
for k in categorical_cols:
    # count distinct values to decide which columns get one-hot encoded
    count = criteoDF.select(k).distinct().count()
    if count <= 20:
        one_hot_cols.append(k)
    if count > max_distinct:
        max_distinct = count

categorical_cols_encoded = ["{}_encoded".format(c) for c in categorical_cols]

for i in range(len(categorical_cols)):
    stringindex_vector = StringIndexer(inputCol=categorical_cols[i],
                                       outputCol=categorical_cols_encoded[i])
    criteoDF = stringindex_vector.setHandleInvalid("skip").fit(
        criteoDF).transform(criteoDF)

criteoDF = criteoDF.drop(*categorical_cols)

one_hot_cols_new = ["{}_encoded".format(c) for c in one_hot_cols]
one_hot_cols_encoded = ["{}_one_hot".format(c) for c in one_hot_cols_new]

for i in range(len(one_hot_cols)):
    onehotencoder_vector = OneHotEncoder(inputCol=one_hot_cols_new[i],
                                         outputCol=one_hot_cols_encoded[i])
    criteoDF = onehotencoder_vector.transform(criteoDF)

criteoDF = criteoDF.drop(*one_hot_cols_new)

feature_cols = [c for c in criteoDF.columns if c != 'label']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
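
Note that calling transform directly on OneHotEncoder only works with the legacy Spark 2.x encoder, which is a plain Transformer; on Spark 3.x, OneHotEncoder is an Estimator and must be fitted first. A hedged sketch of what the encoding loop above would look like on Spark 3.x, reusing the column lists already defined:

from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(inputCols=one_hot_cols_new, outputCols=one_hot_cols_encoded)
criteoDF = encoder.fit(criteoDF).transform(criteoDF)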
Example No. 11
def DataPreparation():
    spark = SparkSession.builder.appName('SistemaDeDeteccion').master(
        "local[*]").getOrCreate()  # Create the Spark session
    data = spark.read.csv("Burnout_Data.csv", header=True,
                          inferSchema=True)  # Load the dataset
    data = data.select('Tiempo_PlazaActual', 'EstadoCivil', 'Burnout_Antes',
                       'Hora_Social', 'Horas_Cuidados', 'Calorias', 'Peso',
                       'Contrato_Adjunto', 'Musica', 'Sexo', 'Estudias',
                       'Sales_Social', 'Edad', 'Estado_Animo',
                       'Tiempo_Vida_Laboral', 'Hijos', 'Lectura',
                       'Hora_Gratificante', 'Horas_Activ_Fisica')
    # Keep only the columns with importance p > 1 according to the component analysis
    cols = data.columns  # Save the column names for later

    from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
    # Import what is needed to turn categorical data into numeric values the algorithms can handle
    categoricalColumns = [
        'Contrato_Adjunto', 'Musica', 'Sexo', 'Estudias', 'Sales_Social',
        'Edad', 'Estado_Animo', 'Lectura', 'EstadoCivil'
    ]
    stages = []  # every preprocessing step is collected here and later run through the Pipeline
    for categoricalCol in categoricalColumns:  # index each categorical variable in the list
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + 'Index')
        encoder = OneHotEncoderEstimator(
            inputCols=[stringIndexer.getOutputCol()],
            outputCols=[categoricalCol + "classVec"])
        # After indexing, OneHotEncoderEstimator assigns a vector slot to each value of the categorical variable
        stages += [stringIndexer.setHandleInvalid("keep"), encoder]
        # Add this step to stages, telling the indexer to keep invalid values

    label_stringIdx = StringIndexer(
        inputCol="Burnout_Antes", outputCol="label"
    )  # Index the variable to predict, Burnout_Antes, whose values are TRUE and FALSE, as the label
    stages += [label_stringIdx.setHandleInvalid("keep")]
    # Add this step to stages, telling the indexer to keep invalid values

    numericCols = [
        'Tiempo_PlazaActual', 'Hora_Social', 'Horas_Cuidados', 'Calorias',
        'Peso', 'Tiempo_Vida_Laboral', 'Hijos', 'Hora_Gratificante',
        'Horas_Activ_Fisica'
    ]
    # With the categorical variables turned into numbers, assemble them into one vector together with the numeric variables
    assemblerInputs = [c + "classVec"
                       for c in categoricalColumns] + numericCols
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="features")
    # This step produces the "features" vector holding both the numeric and the categorical variables
    stages += [assembler.setHandleInvalid("keep")]
    # Add this step to stages, telling the assembler to keep invalid values

    from pyspark.ml import Pipeline
    pipeline = Pipeline(stages=stages)
    # Initialize the Pipeline with the list of steps collected in stages
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    # Fit and apply the model, i.e. run the data preprocessing
    path = 'modelo_Pipeline'
    os.mkdir(path)
    pipelineModel.save(os.path.join(path, 'Pipeline'))
    # Save this model, because the same transformations must be applied to new data before predicting
    selectedCols = ['label', 'features'] + cols
    data = data.select(selectedCols)
    # Keep label and features, plus cols, the column names from before preprocessing

    train, test = data.randomSplit([0.7, 0.3])
    # Split the dataset with randomSplit: 70% for training and 30% for testing
    print("Training Dataset Count: " + str(train.count()))
    print("Test Dataset Count: " + str(test.count()))
    # Print the row count of each split and return both for use by the algorithms
    return train, test
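
Since the fitted pipeline is saved precisely so it can be reapplied to new data, a minimal sketch of reloading it later (assuming the same path and a DataFrame new_data with the original input columns):

from pyspark.ml import PipelineModel

loaded = PipelineModel.load(os.path.join('modelo_Pipeline', 'Pipeline'))
prepared = loaded.transform(new_data).select('label', 'features')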
Example No. 12
    def _execute(self):
        df = self.df_from_temp_table(self.kwargs["previous_job_temp_table"])
        if self.target_label in df.columns:
            df = df.drop(self.target_label)
        cols_to_index = [
            k for k, v in df.dtypes
            if (v == "string" and k != self.target_label)
        ]
        cols_not_to_index = [k for k, v in df.dtypes if v != "string"]

        feature_cols = cols_not_to_index + [
            "indexed{}".format(col_) for col_ in cols_to_index
        ]

        df = self.create_feature_vector(df, feature_cols)

        # Index labels, adding metadata to the label column.
        # Fit on whole dataset to include all labels in index.
        label_indexer = StringIndexer(
            inputCol=self.target_label,
            outputCol="{}loan_status".format("indexed"))
        label_indexer.setHandleInvalid("skip")
        label_indexer = label_indexer.fit(df)

        # Automatically identify categorical features, and index them.
        # Set maxCategories so features with > 12 distinct values are
        # treated as continuous.
        feature_indexer = VectorIndexer(inputCol="features",
                                        outputCol="indexedFeatures",
                                        maxCategories=12)
        feature_indexer.setHandleInvalid("skip")

        # Split the data into training and test sets (30% held out for testing)
        (trainingData, testData) = df.randomSplit([0.7, 0.3])

        # Train a RandomForest model.
        rf = RandomForestClassifier(labelCol="{}loan_status".format("indexed"),
                                    featuresCol="indexedFeatures",
                                    predictionCol="prediction",
                                    numTrees=10)

        # # Convert indexed labels back to original labels.
        label_converter = IndexToString(inputCol="prediction",
                                        outputCol="predictedLabel",
                                        labels=label_indexer.labels)

        # Chain indexers and forest in a Pipeline
        pipeline = Pipeline(
            stages=[label_indexer, feature_indexer, rf, label_converter])

        # Train model.  This also runs the indexers.
        model = pipeline.fit(trainingData)

        # # Make predictions.
        predictions = model.transform(testData)

        # Select (prediction, true label) and compute test error
        evaluator = MulticlassClassificationEvaluator(
            labelCol="{}loan_status".format("indexed"),
            predictionCol="prediction",
            metricName="accuracy")

        accuracy = evaluator.evaluate(predictions)

        self.metrics["accuracy"] = accuracy

        return str(accuracy)
Example No. 13
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)
    print("Test Error = %g " % (1.0 - accuracy))
    #get the best model
    best_model = cvModel.bestModel
    return best_model


shoes_df = spark.read.parquet("hdfs:///data/products/shoes.parquet")
shoes_df.show(10)

stages = []

label_stringIdx = StringIndexer(inputCol='category_name', outputCol='label')
label_stringIdx.setHandleInvalid("skip")
stages += [label_stringIdx]

tokenizer = Tokenizer(inputCol="descr", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                      outputCol="tf_features")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idf_features")
stages += [tokenizer, hashingTF, idf]

assemblerInputs = ['idf_features']
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

rf = RandomForestClassifier(labelCol="label", featuresCol="features")
paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10000]) \
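
The snippet is cut off inside the grid builder; a hedged, purely illustrative sketch of how such a grid is typically closed out and handed to a CrossValidator (the fold count and imports are assumptions, not taken from the original):

from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10000]) \
        .build()
pipeline = Pipeline(stages=stages + [rf])
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
cvModel = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                         evaluator=evaluator, numFolds=3).fit(shoes_df)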
Example No. 14
    print(len(data))
    df = sqlContext.createDataFrame(data, schema=["category", "text"])

    # regular expression tokenizer
    regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

    # stop words
    stop_words = list(set(stopwords.words('english')))

    stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)

    # bag of words count
    count_vectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
    label_string_index = StringIndexer(inputCol="category", outputCol="label")
    label_string_index.setHandleInvalid("keep")

    pipeline = Pipeline(stages=[regex_tokenizer, stop_words_remover, count_vectors, label_string_index])
    (training_data, test_data) = df.randomSplit([0.8, 0.2], seed=100)
    pipeline_fit = pipeline.fit(training_data)
    pipeline_fit.save("rf_pipeline")

    training_data_set = pipeline_fit.transform(training_data)
    training_data_set.show(5)

    # stages = pipeline_fit.stages
    # vec = [s for s in stages if isinstance(s, CountVectorizerModel)]
    # v1 = vec[0].vocabulary
    # print(len(v1))

    print("Training: " + str(training_data_set.count()))
    'City_Category',
    'Stay_In_Current_City_Years',
    'Marital_Status',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
]
target = 'Purchase'

minDf = df.withColumn('row_index', F.monotonically_increasing_id())

for column in categorical_cols:
    print('Transforming column: ', column)
    output_col = "_" + column.lower()
    indexer = StringIndexer(inputCol=column, outputCol=output_col)
    indexed = indexer.setHandleInvalid("keep").fit(df).transform(df)
    maxDf = indexed.withColumn('row_index',
                               F.monotonically_increasing_id()).select(
                                   'row_index', output_col)
    minDf = minDf.join(maxDf, on=["row_index"]).sort("row_index")

final_columns = []
for column in categorical_cols:
    final_columns.append(F.col("_" + column.lower()).alias(column.lower()))

final_columns.append(target)
minDf = minDf.select(final_columns)
minDf.coalesce(1).write.csv("gs://doitintl_black_friday/data/train_data.csv",
                            header=True,
                            mode="overwrite")