Example #1
def seg_model_lr(train_data, test_data, regType, num_iter):
    removelist_train= set(['stars', 'business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")

    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train= (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Logistic regression Classifier
    model_train = LogisticRegressionWithLBFGS.train(sc.parallelize(data_train.collect(),5),
                                                    regType =regType, iterations=num_iter, numClasses=5)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,outputCol="features")

    transformed_final= assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
Example #2
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train= set(['stars', 'business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")

    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train= (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(sc.parallelize(data_train.collect(),5), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,outputCol="features")

    transformed_final= assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
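A minimal usage sketch for the two segment-model helpers above. The train/test DataFrames, the SparkContext sc, and the imports used inside the functions (VectorAssembler, LabeledPoint, LogisticRegressionWithLBFGS, GradientBoostedTrees) are assumed to already exist in the caller's session; the variable names here are illustrative only.

# Hypothetical driver code for seg_model_lr / seg_model_gb
lr_preds = seg_model_lr(train_df, test_df, regType="l2", num_iter=100)
gb_preds = seg_model_gb(train_df, test_df, loss_type="leastSquaresError", num_iter=50, maxDepth=5)
print(lr_preds.take(5))   # [(review_id, predicted rating), ...]
print(gb_preds.take(5))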
    def _convertPythonXToJavaObject(self, X):
        """
        Converts the input python object X to a java-side object (either MatrixBlock or Java DataFrame)

        Parameters
        ----------
        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
        """
        if isinstance(X, SUPPORTED_TYPES) and self.transferUsingDF:
            pdfX = convertToPandasDF(X)
            df = assemble(
                self.sparkSession,
                pdfX,
                pdfX.columns,
                self.features_col).select(
                self.features_col)
            return df._jdf
        elif isinstance(X, SUPPORTED_TYPES):
            return convertToMatrixBlock(self.sc, X)
        elif hasattr(X, '_jdf') and self.features_col in X.columns:
            # No need to assemble as input DF is likely coming via MLPipeline
            return X._jdf
        elif hasattr(X, '_jdf'):
            assembler = VectorAssembler(
                inputCols=X.columns, outputCol=self.features_col)
            df = assembler.transform(X)
            return df._jdf
        else:
            raise Exception('Unsupported input type')
def writeLumbarReadings(time, rdd):
	try:
		# Convert RDDs of the words DStream to DataFrame and run SQL query
		connectionProperties = MySQLConnection.getDBConnectionProps('/home/erik/mysql_credentials.txt')
		sqlContext = SQLContext(rdd.context)
		if rdd.isEmpty() == False:
			lumbarReadings = sqlContext.jsonRDD(rdd)
			lumbarReadingsIntermediate = lumbarReadings.selectExpr("readingID","readingTime","deviceID","metricTypeID","uomID","actual.y AS actualYaw","actual.p AS actualPitch","actual.r AS actualRoll","setPoints.y AS setPointYaw","setPoints.p AS setPointPitch","setPoints.r AS setPointRoll")
			assembler = VectorAssembler(
						inputCols=["actualPitch"], # Must be in same order as what was used to train the model.  Testing using only pitch since model has limited dataset.
						outputCol="features")
			lumbarReadingsIntermediate = assembler.transform(lumbarReadingsIntermediate)

			
			predictions = loadedModel.predict(lumbarReadingsIntermediate.map(lambda x: x.features))
			predictionsDF = lumbarReadingsIntermediate.map(lambda x: x.readingID).zip(predictions).toDF(["readingID","positionID"])
			combinedDF = lumbarReadingsIntermediate.join(predictionsDF, lumbarReadingsIntermediate.readingID == predictionsDF.readingID).drop(predictionsDF.readingID)
			
			combinedDF = combinedDF.drop("features")
			
			combinedDF.show()


			combinedDF.write.jdbc("jdbc:mysql://localhost/biosensor", "SensorReadings", properties=connectionProperties)
	except:
		pass
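For context, a handler with this (time, rdd) signature is normally attached to a Spark Streaming DStream via foreachRDD. The wiring below is a sketch only; the socket source, host, port, and batch interval are assumptions.

from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, 10)                            # assumes an existing SparkContext sc
lumbarStream = ssc.socketTextStream("localhost", 9999)    # assumed source of JSON lines
lumbarStream.foreachRDD(writeLumbarReadings)
ssc.start()
ssc.awaitTermination()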
Example #5
def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.  
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())


    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)    
    return p_df
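A small, self-contained sketch of how text_features might be exercised. The toy rows and the whitespace tokenization are assumptions, and the function itself relies on udf, length, array, size and IntegerType already being imported from pyspark.sql.functions / pyspark.sql.types.

from pyspark.sql import SparkSession
from pyspark.sql.functions import split

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("what is spark", "what exactly is pyspark")],
    ["question1", "question2"])
toy = (toy.withColumn("question1_words", split("question1", " "))
          .withColumn("question2_words", split("question2", " ")))
text_features(toy).select("text_features").show(truncate=False)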
Example #6
 def predict(self, X):
     if isinstance(X, SUPPORTED_TYPES):
         if self.transferUsingDF:
             pdfX = convertToPandasDF(X)
             df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features')
             retjDF = self.model.transform(df._jdf)
             retDF = DataFrame(retjDF, self.sqlCtx)
             retPDF = retDF.sort('ID').select('prediction').toPandas()
             if isinstance(X, np.ndarray):
                 return retPDF.as_matrix().flatten()
             else:
                 return retPDF
         else:
             retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
             if isinstance(X, np.ndarray):
                 return retNumPy
             else:
                 return retNumPy # TODO: Convert to Pandas
     elif hasattr(X, '_jdf'):
         if 'features' in X.columns:
             # No need to assemble as input DF is likely coming via MLPipeline
             df = X
         else:
             assembler = VectorAssembler(inputCols=X.columns, outputCol='features')
             df = assembler.transform(X)
         retjDF = self.model.transform(df._jdf)
         retDF = DataFrame(retjDF, self.sqlCtx)
         # Return DF
         return retDF.sort('ID')
     else:
         raise Exception('Unsupported input type')
Example #7
    def scaleVecCol(self, columns, nameOutputCol):
        """
        This function groups the specified columns into a single vector column and then scales it.
        The scaling procedure is Spark's default MinMaxScaler (see the example
        below).

        +---------+----------+
        |Price    |AreaLiving|
        +---------+----------+
        |1261706.9|16        |
        |1263607.9|16        |
        |1109960.0|19        |
        |978277.0 |19        |
        |885000.0 |19        |
        +---------+----------+

                    |
                    |
                    |
                    V
        +----------------------------------------+
        |['Price', 'AreaLiving']                 |
        +----------------------------------------+
        |[0.1673858972637624,0.5]                |
        |[0.08966137157852398,0.3611111111111111]|
        |[0.11587093205757598,0.3888888888888889]|
        |[0.1139820728616421,0.3888888888888889] |
        |[0.12260126542983639,0.4722222222222222]|
        +----------------------------------------+
        only showing top 5 rows

        """

        # Check that the columns argument is a string or a list:
        self.__assertTypeStrOrList(columns, "columns")

        # Check that the columns to be processed exist in the dataframe:
        self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)

        # Check that the nameOutputCol argument is a string:
        self.__assertTypeStr(nameOutputCol, "nameOutputCol")

        # VectorAssembler model to merge the columns into a single vector column:
        vecAssembler = VectorAssembler(inputCols=columns, outputCol="features_assembler")
        # Model for scaling feature column:
        mmScaler = MinMaxScaler(inputCol="features_assembler", outputCol=nameOutputCol)
        # Dataframe with feature_assembler column
        tempDF = vecAssembler.transform(self.__df)
        # Fitting scaler model with transformed dataframe
        model = mmScaler.fit(tempDF)

        exprs = list(filter(lambda x: x not in columns, self.__df.columns))

        exprs.extend([nameOutputCol])

        self.__df = model.transform(tempDF).select(*exprs)
        self.__addTransformation()  # checkpoint in case

        return self
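The same assemble-then-scale logic can also be written as a two-stage pyspark.ml Pipeline. This is a sketch of an equivalent standalone helper, not part of the class above; the function name and the choice to drop the raw input columns are assumptions.

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

def scale_vec_col(df, columns, name_output_col):
    # Assemble the requested columns into one vector, then min-max scale that vector.
    pipeline = Pipeline(stages=[
        VectorAssembler(inputCols=columns, outputCol="features_assembler"),
        MinMaxScaler(inputCol="features_assembler", outputCol=name_output_col),
    ])
    scaled = pipeline.fit(df).transform(df)
    # Keep everything except the intermediate assembler column and the raw inputs.
    return scaled.drop("features_assembler", *columns)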
Example #8
def convert_to_flat_by_sparkpy(df):
    subkeys = df.select("subkey").dropDuplicates().collect()
    subkeys = [s[0] for s in subkeys]
    assembler = VectorAssembler().setInputCols(subkeys).setOutputCol("features")
    spark_df = assembler.transform(df.groupBy("key", "parameter").pivot("subkey").agg(first(col("reference"))))
    spark_df = spark_df.withColumnRenamed("parameter", "label")
    spark_df = spark_df.select("label", "features")
    return spark_df
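A quick illustration of the long-to-wide conversion above; the toy values are assumptions, and the function expects col and first to be imported from pyspark.sql.functions.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
long_df = spark.createDataFrame(
    [("k1", 1.0, "a", 0.1), ("k1", 1.0, "b", 0.2),
     ("k2", 0.0, "a", 0.3), ("k2", 0.0, "b", 0.4)],
    ["key", "parameter", "subkey", "reference"])
convert_to_flat_by_sparkpy(long_df).show(truncate=False)
# One row per (key, parameter) with the per-subkey references assembled into a vector.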
def sparking_your_interest():
	df = SQLContext.read.json('speeches_dataset.json')
	df_fillna=df.fillna("")
	print(df_fillna.count())
	print(df_fillna.printSchema())

	df_utf=call_utf_encoder(df)
	df_cleaned=call_para_cleanup(df_utf)
	print(df_cleaned)
	df_with_bigrams = call_ngrams(df_cleaned, 2)
	df_with_trigrams = call_ngrams(df_with_bigrams, 3)
	df_with_4grams = call_ngrams(df_with_trigrams, 4)
	df_with_5grams = call_ngrams(df_with_4grams, 5)
	df_with_6grams = call_ngrams(df_with_5grams, 6)
	df_with_vocab_score = call_speech_vocab(df_with_6grams)

	df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score,100,'2grams')
	df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors,100,'3grams')
	df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors,100,'4grams')
	assembler = VectorAssembler(
	    inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
	    outputCol="features")
	assembler_output = assembler.transform(df_with_4grams_idf_vectors)
	output = assembler_output.selectExpr('speaker','speech_id','para_cleaned_text','features')
	print(output.show())
	print(output.count())

	output_tordd = output.rdd
	train_rdd,test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
	train_df = train_rdd.toDF()
	test_df = test_rdd.toDF()
	print(train_df)
	print(test_df)

	print('Train DF - Count: ')
	print(train_df.count())
	print('Test DF - Count: ')
	print(test_df.count())

	print("Initializing RF Model")
	labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)       
	rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features",numTrees=1000, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
	pipeline = Pipeline(stages=[labelIndexer,rf])
	model = pipeline.fit(train_df)
	print("Completed RF Model")

	predictions = model.transform(test_df)
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	accuracy = evaluator.evaluate(predictions)
	print("Test Error = %g" % (1.0 - accuracy))
	rfModel = model.stages[1]
	print(rfModel)  # summary only
	print("Predictions: ")
	print(predictions.show())
Example #10
    def _prepare_data_spark(self, data):
        """ Prepare data for spark format, output data will have the feature format and other useful information """

        keys = list(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
                                            self.TODAY_PRICE}))

        df = self._spark.createDataFrame(data)
        ass = VectorAssembler(inputCols=keys, outputCol="features")
        output = ass.transform(df)
        # output.select('features', 'ChangeDirection', 'ChangeAmount').write.save('test.parquet')
        return output
def predictPopularity(features):
    print(features)
    features = tuple(features)
    feature_label = []    
    for i in range(0, len(features)):
        feature_label.append('feature' +str(i))
    data_frame = spark.createDataFrame([features], feature_label)
    assembler = VectorAssembler(inputCols= feature_label, outputCol = 'features')
    data_frame = assembler.transform(data_frame)
    data_frame = data_frame.select('features')
    result = rfc_model.transform(data_frame)
    return result.select('prediction').head(1)[0][0]
Example #12
    def commit(self):
        self.update_domain_role_hints()
        if self.in_df is not None:
            attributes = [att for att in self.used_attrs._list]
            class_var = [var for var in self.class_attrs._list]
            metas = [meta for meta in self.meta_attrs._list]
            VA = VectorAssembler(inputCols = attributes, outputCol = 'features')
            self.out_df = VA.transform(self.in_df)
            if len(class_var):
                self.out_df = self.out_df.withColumn('label', self.out_df[class_var[0]].cast('double'))

            self.send("DataFrame", self.out_df)
        else:
            self.send("DataFrame", None)
Example #13
def test_train_data(overall_segment):
    removelist_train= set(['stars', 'business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(overall_segment.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")

    transformed_train = assembler_train.transform(overall_segment.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train= (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    (trainingData, testData) = sc.parallelize(data_train.collect(),5).randomSplit([0.7, 0.3])
    return (trainingData, testData)
Example #14
def tf_idf_features_quora(p_df):
    """
    Extracts TF-IDF features from quora dataset.
    :param p_df: A DataFrame.
    :return: A DataFrame.    
    """     
    tf_df = extract_tf_features(p_df, "question1_meaningful_words", "tf1")
    tf_df = extract_tf_features(tf_df, "question2_meaningful_words", "tf2")
    tf_idf_df = extract_idf_features(tf_df, "tf1", "tf-idf1")
    tf_idf_df = extract_idf_features(tf_idf_df, "tf2", "tf-idf2")
    assembler = VectorAssembler(
        inputCols=["tf-idf1", "tf-idf2"],
        outputCol="tf_idf_features"
    )
    return assembler.transform(tf_idf_df)
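extract_tf_features and extract_idf_features are not shown in this snippet. One plausible implementation, sketched here purely as an assumption, uses HashingTF and IDF from pyspark.ml.feature on the already-tokenized *_meaningful_words columns.

from pyspark.ml.feature import HashingTF, IDF

def extract_tf_features(p_df, input_col, output_col):
    # Hypothetical helper: hash each token list into a sparse term-frequency vector.
    return HashingTF(inputCol=input_col, outputCol=output_col,
                     numFeatures=1 << 16).transform(p_df)

def extract_idf_features(p_df, input_col, output_col):
    # Hypothetical helper: reweight the TF vectors by inverse document frequency.
    return IDF(inputCol=input_col, outputCol=output_col).fit(p_df).transform(p_df)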
Example #15
def convert_to_flat_by_sparkpy(df):
    subkeys = df.select("subkey").dropDuplicates().collect()
    subkeys = [s[0] for s in subkeys]

    n = len(df.select("reference").first()[0])
    # df = df.groupBy("key").agg(array(*[avg(col("reference")[i]) for i in range(n)]).alias("averages"))
    df = df.groupBy("key").agg(array(*[collect_list(col("reference")[i]) for i in range(n)]).alias("averages"))
    df.show()
    r = df.collect()

    # changedTypedf = joindf.withColumn("label", joindf["show"].cast(DoubleType()))
    assembler = VectorAssembler().setInputCols(subkeys).setOutputCol("features")
    spark_df = assembler.transform(df.groupBy("key", "parameter").pivot("subkey").agg(first(col("reference"))))
    spark_df = spark_df.withColumnRenamed("parameter", "label")
    spark_df = spark_df.select("label", "features")
    return spark_df
Example #16
    def predict(self, X):
        """
        Invokes the transform method on the Estimator object on the JVM if X and y are one of the supported data types

        Parameters
        ----------
        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
        """
        try:
            if self.estimator is not None and self.model is not None:
                self.estimator.copyProperties(self.model)
        except AttributeError:
            pass
        if isinstance(X, SUPPORTED_TYPES):
            if self.transferUsingDF:
                pdfX = convertToPandasDF(X)
                df = assemble(self.sparkSession, pdfX, pdfX.columns, self.features_col).select(self.features_col)
                retjDF = self.model.transform(df._jdf)
                retDF = DataFrame(retjDF, self.sparkSession)
                retPDF = retDF.sort('__INDEX').select('prediction').toPandas()
                if isinstance(X, np.ndarray):
                    return self.decode(retPDF.as_matrix().flatten())
                else:
                    return self.decode(retPDF)
            else:
                try:
                    retNumPy = self.decode(convertToNumPyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X))))
                except Py4JError:
                    traceback.print_exc()
                if isinstance(X, np.ndarray):
                    return retNumPy
                else:
                    return retNumPy # TODO: Convert to Pandas
        elif hasattr(X, '_jdf'):
            if self.features_col in X.columns:
                # No need to assemble as input DF is likely coming via MLPipeline
                df = X
            else:
                assembler = VectorAssembler(inputCols=X.columns, outputCol=self.features_col)
                df = assembler.transform(X)
            retjDF = self.model.transform(df._jdf)
            retDF = DataFrame(retjDF, self.sparkSession)
            # Return DF
            return retDF.sort('__INDEX')
        else:
            raise Exception('Unsupported input type')
Example #17
    def transform(self, df, featureCols, targetCol):
        """Keep the K most important features of the Spark DataFrame

        Parameters
        ----------
        df : Spark DataFrame
        featureCols: array, names of the feature columns
            to consider in the feature selection algorithm
        targetCol: str, name of the target column, i.e., the column
            against which each feature is compared.

        Returns
        -------
        transformed_df : New Spark DataFrame with only the most important
            feature columns.

        """

        # build features assemble
        assembler = VectorAssembler(
            inputCols = featureCols,
            outputCol = 'features')
        assembled_df = assembler.transform(df)

        # rename target column
        assembled_df = assembled_df.withColumnRenamed(targetCol,'target')

        # extract features and target
        feats = assembled_df.select('features').rdd
        feats = feats.map(lambda x: x['features'])
        target = assembled_df.select('target').rdd
        target = target.map(lambda x: x['target'])

        # compute per-column metric
        scores = []
        for i,feat in enumerate(featureCols):
            vector = feats.map(lambda x: x[i])
            scores.append(self.sfunc_(vector,target))
        self.scores_ = scores
        
        # sort scores
        idx = sorted(range(len(self.scores_)),reverse=True,key=self.scores_.__getitem__)
        
        # return dataframe with k-best columns 
        return df.select(*[featureCols[idd] for idd in idx[:self.k_]])
Example #18
def convertToLabeledDF(sparkSession, X, y=None):
    from pyspark.ml.feature import VectorAssembler
    if y is not None:
        pd1 = pd.DataFrame(X)
        pd2 = pd.DataFrame(y, columns=['label'])
        pdf = pd.concat([pd1, pd2], axis=1)
        inputColumns = ['C' + str(i) for i in pd1.columns]
        outputColumns = inputColumns + ['label']
    else:
        pdf = pd.DataFrame(X)
        inputColumns = ['C' + str(i) for i in pdf.columns]
        outputColumns = inputColumns
    assembler = VectorAssembler(inputCols=inputColumns, outputCol='features')
    out = assembler.transform(sparkSession.createDataFrame(pdf, outputColumns))
    if y is not None:
        return out.select('features', 'label')
    else:
        return out.select('features')
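A short usage sketch for convertToLabeledDF; the random data is an assumption, and the function relies on pandas being imported as pd in the surrounding module.

import numpy as np
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
X = np.random.rand(10, 3)
y = np.random.randint(0, 2, size=10)
convertToLabeledDF(spark, X, y).show(5, truncate=False)   # columns: features, label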
Example #19
File: spark.py  Project: sashaostr/datasu
def merge_features(ddfs, join_column, merge_column, output_column='features', drop_merged_columns=True):
    """
    Inner-join several DataFrames on the same id column and merge their merge_column columns into one column using pyspark.ml.feature.VectorAssembler

    Example:
        ddf_merge = merge_features(ddfs=[ddf_pivot1,ddf_pivot2], join_column='customer_id', merge_column='features')
    :param ddfs:
    :param join_column: id column to join by (each ddf must have this column)
    :param merge_column: column to merge (each ddf must have this column)
    :param output_column:
    :param drop_merged_columns:
    :return:
    """
    from pyspark.ml.feature import VectorAssembler

    ddf_res = ddfs.pop(0)
    merge_column_renamed = merge_column + str(0)
    merge_columns = [merge_column_renamed]
    ddf_res = ddf_res.withColumnRenamed(merge_column, merge_column_renamed)

    for i,ddf in enumerate(ddfs):
        merge_column_renamed = merge_column + str(i+1)
        merge_columns.append(merge_column_renamed)
        ddf_r = ddf.withColumnRenamed(merge_column, merge_column_renamed)
        ddf_res = ddf_res.join(ddf_r, on=join_column, how='inner')

    assembler = VectorAssembler(inputCols=merge_columns, outputCol=output_column)
    res = assembler.transform(ddf_res)

    if drop_merged_columns:
        res = drop_columns(res, columns=merge_columns)

    return res


# def pivot_aggregate(ddf, grpby_columns, pivot_column, aggs, pivot_filter_values=None, pivot_filter_support=None):
#     if pivot_filter_support and not pivot_filter_values:
#         frequent = ddf.freqItems([pivot_column], support=pivot_filter_support).first().asDict()[pivot_column+'_freqItems']
#         pivot_filter_values = map(str,frequent)
#
#     ddf_gr = ddf.groupBy(*grpby_columns)
#     ddf_pivot = ddf_gr.pivot(pivot_column, pivot_filter_values)
#     ddf_agg = ddf_pivot.agg(*aggs)
#     return ddf_agg
Example #20
def preprocess(data):
  data = data.select('Year','Month','DayofMonth','DayOfWeek','DepTime','CRSDepTime','ArrTime','CRSArrTime','UniqueCarrier'\
                            ,'FlightNum','TailNum','ActualElapsedTime','CRSElapsedTime','AirTime','ArrDelay','DepDelay', 'Origin'\
                            ,'Dest','Distance','TaxiIn','TaxiOut','Cancelled')
  data = data.na.fill('999999')
  for t in data.dtypes:
   if t[1]=='string' and t[0] not in ['Origin','Dest','TailNum','UniqueCarrier','FlightNum']:
     data = data.withColumn(t[0], data[t[0]].cast('integer'))
  data = data.na.fill(999999)
  data = data.withColumnRenamed('Cancelled','label')
  data = data.withColumn('label',data.label.cast('double'))
  assembler = VectorAssembler(
	    inputCols=['Year','Month','DayofMonth','DayOfWeek'
		,'DepTime','CRSDepTime','ArrTime','CRSArrTime',
		'ActualElapsedTime','CRSElapsedTime','AirTime',
		'ArrDelay','DepDelay','Distance','TaxiIn','TaxiOut'],
	    outputCol='features')
  data = assembler.transform(data)
  data = data.select('features','label')
  return data
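The helper stops at feature/label selection. One plausible continuation, shown only as a sketch with an assumed input DataFrame, is to fit a binary classifier on the result:

from pyspark.ml.classification import LogisticRegression

prepared = preprocess(raw_flights_df)          # raw_flights_df: assumed input DataFrame
train, test = prepared.randomSplit([0.8, 0.2], seed=42)
lr_model = LogisticRegression(featuresCol="features", labelCol="label").fit(train)
print(lr_model.evaluate(test).areaUnderROC)    # label here is the Cancelled flag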
Example #21
File: ml.py  Project: zhang01GA/koalas
def to_numeric_df(kdf: 'ks.DataFrame') -> Tuple[pyspark.sql.DataFrame, List[str]]:
    """
    Takes a dataframe and turns it into a dataframe containing a single numerical
    vector of doubles. This dataframe has a single field called '_1'.

    TODO: index is not preserved currently
    :param kdf: the koalas dataframe.
    :return: a pair of dataframe, list of strings (the name of the columns
             that were converted to numerical types)

    >>> to_numeric_df(ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}))
    (DataFrame[_correlation_output: vector], ['A', 'B'])
    """
    # TODO, it should be more robust.
    accepted_types = {np.dtype(dt) for dt in [np.int8, np.int16, np.int32, np.int64,
                                              np.float32, np.float64, np.bool_]}
    numeric_fields = [fname for fname in kdf._metadata.data_columns
                      if kdf[fname].dtype in accepted_types]
    numeric_df = kdf._sdf.select(*numeric_fields)
    va = VectorAssembler(inputCols=numeric_fields, outputCol=CORRELATION_OUTPUT_COLUMN)
    v = va.transform(numeric_df).select(CORRELATION_OUTPUT_COLUMN)
    return v, numeric_fields
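The returned vector column is intended to feed pyspark.ml.stat. A usage sketch, assuming an existing koalas DataFrame kdf:

from pyspark.ml.stat import Correlation

numeric_sdf, column_names = to_numeric_df(kdf)
corr = Correlation.corr(numeric_sdf, CORRELATION_OUTPUT_COLUMN).head()[0].toArray()
print(dict(zip(column_names, corr[0])))   # correlations of the first column with the rest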
Example #22
def cluster():
    ld = load(open(DATAP+'\\temp\olangdict.json','r',encoding='UTF-8'))

    spark = SparkSession.builder\
                        .master("local")\
                        .appName("Word Count")\
                        .config("spark.some.config.option", "some-value")\
                        .getOrCreate()

    df = spark.createDataFrame([["0"],
                                ["1"],
                                ["2"],
                                ["3"],
                                ["4"]],
                               ["id"])
    df.show()

    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)

    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    print(transformed.show())
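Note that the DataFrame built above only has an id column, so the assembler call would fail as written. A self-contained variant with assumed feat1/feat2 values that runs end to end:

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
toy = spark.createDataFrame(
    [(0, 1.0, 0.5), (1, 1.1, 0.4), (2, 9.0, 8.5), (3, 9.2, 8.7), (4, 9.1, 8.6)],
    ["id", "feat1", "feat2"])
feats = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features").transform(toy)
model = KMeans(k=2, seed=1).fit(feats.select("features"))
model.transform(feats).show()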
irisNormDf = si_model.transform(responses)

print(irisNormDf.select("Species", "SPECIES_Catogery").distinct().collect())

for i in irisNormDf.columns:
    if not (isinstance(irisNormDf.select(i).take(1)[0][0], six.string_types)):
        print("Correlation to  for ", i,
              irisNormDf.stat.corr('SPECIES_Catogery', i))

#[Row(Species='versicolor', SPECIES_Catogery=0.0), Row(Species='setosa', SPECIES_Catogery=2.0), Row(Species='virginica', SPECIES_Catogery=1.0)]

iris_final = irisNormDf.drop('Species')
vectorAssembler = VectorAssembler(
    inputCols=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'],
    outputCol='features')
iris_final = vectorAssembler.transform(iris_final)
iris_final = iris_final.select(['features', 'SPECIES_Catogery'])
#print(vauto_df)
iris_final.show(3)

random.seed(100)
splits = iris_final.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]

print(train_df.count())
print(test_df.count())

##############################----DECISION TREE CLASSIFICATION----########################################

dtreeeClassifer = DecisionTreeClassifier(maxDepth=2,
Example #24
# Load the training data into a dataframe
data = spark.read.format('json').load('train.jsonl')
data = clean_tokenize_remove_stopwords_quora(data)

# Get the tf-idf features
data = tf_idf_features_quora(data)
# Get the text features
data = text_features(data)

# combine all the features
feature_assembler = VectorAssembler(
    inputCols=["tf_idf_features", "text_features"],
    outputCol="combined_features"
)
data = feature_assembler.transform(data)


# Normalizing each feature to have unit standard deviation
scaler = StandardScaler(inputCol="combined_features", outputCol="features",
                        withStd=True, withMean=False)
scalerModel = scaler.fit(data)
# Normalize each feature to have unit standard deviation.
data = scalerModel.transform(data)


# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=2).fit(data)
        .builder\
        .appName("VectorSizeHintExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
         (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    sizeHint = VectorSizeHint(
        inputCol="userFeatures",
        handleInvalid="skip",
        size=3)

    datasetWithSize = sizeHint.transform(dataset)
    print("Rows where 'userFeatures' is not the right size are filtered out")
    datasetWithSize.show(truncate=False)

    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")

    # This dataframe can be used by downstream transformers as before
    output = assembler.transform(datasetWithSize)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
             'Room_Board',
             'Books',
             'Personal',
             'PhD',
             'Terminal',
             'S_F_Ratio',
             'perc_alumni',
             'Expend',
             'Grad_Rate'],
              outputCol="features")


# In[86]:


output = assembler.transform(data)


# Deal with Private column being "yes" or "no"

# In[87]:


from pyspark.ml.feature import StringIndexer


# In[88]:


indexer = StringIndexer(inputCol="Private", outputCol="PrivateIndex")
output_fixed = indexer.fit(output).transform(output)
Example #27
def combine_columns(columns, df, out_col):
    assembler = VectorAssembler(inputCols=columns, outputCol=out_col)
    return assembler.transform(df)
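For example (the toy DataFrame and an existing SparkSession spark are assumptions):

toy = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["x", "y"])
combine_columns(["x", "y"], toy, "xy_vector").show()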
#Loading the Student_Grades_Data.csv file, uploaded in previous step
data = spark.read.csv('Student_Grades_Data.csv', header=True, inferSchema=True)

#Taking a look at the data type of each column to see what the inferSchema=True parameter has set for each column
data.printSchema()

#Display first few rows of data
data.show()

#Create a Feature array by omitting the last column
feature_cols = data.columns[:-1] 
from pyspark.ml.feature import VectorAssembler
vect_assembler = VectorAssembler(inputCols=feature_cols,outputCol="features")

#Utilize Assembler created above in order to add the feature column
data_w_features = vect_assembler.transform(data)

#Display the data with the additional column named features. In a multiple linear regression problem, you would see all the
# independent variable values combined in one list
data_w_features.show()

#Select only the features and label from the previous dataset, as these are the two columns needed to build the machine learning model
finalized_data = data_w_features.select("features","Grades")

finalized_data.show()

#Split the data into training and test sets, with 70% of the observations going to training and 30% to testing
train_dataset, test_dataset = finalized_data.randomSplit([0.7, 0.3])

#Peek into training data
train_dataset.describe().show()
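The walkthrough ends at the train/test split. A plausible next step, not part of the original snippet, is a simple linear regression on the Grades label:

from pyspark.ml.regression import LinearRegression

lin_reg = LinearRegression(featuresCol="features", labelCol="Grades")
lr_model = lin_reg.fit(train_dataset)
print("Coefficients:", lr_model.coefficients, "Intercept:", lr_model.intercept)
print("Test R^2:", lr_model.evaluate(test_dataset).r2)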
Example #29
dfAvgStock = dfStock.groupby('stock_hour', 'company').agg(F.mean('close'), F.mean('volume'))

dfJoin = dfAvgSent.join(dfAvgStock, (dfAvgSent.comp == dfAvgStock.company) & (dfAvgSent.tweet_hour == dfAvgStock.stock_hour+5))
dfJoin = dfJoin.withColumnRenamed("avg(sentiment)","avg-sentiment")
dfJoin = dfJoin.withColumnRenamed("avg(close)","avg-close")
dfJoin = dfJoin.withColumnRenamed("avg(volume)","avg-volume")
dfJoin = dfJoin.withColumnRenamed("avg(followers_count)","avg-followers")
dfJoin.show()

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
dfJoin1 = dfJoin.select("avg-sentiment","avg-followers","avg-volume")
inputFeatures = ["avg-sentiment","avg-followers","avg-volume"]
assembler = VectorAssembler(inputCols=inputFeatures, outputCol="features")
dfJoin2 = assembler.transform(dfJoin1)

# COMMAND ----------

# Scaling features
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(dfJoin2)
scaledData = scalerModel.transform(dfJoin2)
scaledData.select("features", "scaledFeatures").show()

# COMMAND ----------

#Elbow method
import numpy as np
cost = np.zeros(10)
for k in range(2,10):
Example #30
import findspark
findspark.init()
spark = SparkSession.builder.appName("SICP7").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option(
    "inferSchema",
    True).option("delimiter",
                 ",").load("C:/Users/bharani/PycharmProjects/SICP7/adult.data")
data = data.withColumnRenamed("age", "label").select("label", "education-num",
                                                     "hours-per-week")
data = data.select(data.label.cast("double"), "education-num",
                   "hours-per-week")
# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)
data.show()
# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.85, 0.15])
# Create Naive Bayes model and fit the model with the training dataset
nb = NaiveBayes()
model = nb.fit(training)
# Generate prediction from test dataset
predictions = model.transform(test)
# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)
# Show model accuracy
print("Accuracy:", accuracy)
# Report
predictionAndLabels = predictions.select("label", "prediction").rdd
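The snippet stops after building the (label, prediction) RDD. One way to finish the report, sketched here: MulticlassMetrics expects (prediction, label) pairs, so the columns are swapped in the map.

from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics(predictionAndLabels.map(lambda row: (row.prediction, row.label)))
print("Weighted precision:", metrics.weightedPrecision)
print("Weighted recall:", metrics.weightedRecall)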
Example #31
    def main():
        appName = "ukhouseprices"
        spark = s.spark_session(appName)
        spark.sparkContext._conf.setAll(v.settings)
        sc = s.sparkcontext()
        #
        # Get data from Hive table
        regionname = "Kensington and Chelsea"
        tableName = "ukhouseprices"
        fullyQualifiedTableName = v.DSDB + "." + tableName
        summaryTableName = v.DSDB + "." + "summary"
        start_date = "2010"
        end_date = "2020"
        lst = (spark.sql(
            "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
        )).collect()
        print("\nStarted at")
        uf.println(lst)
        # Model predictions
        spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        #summary_df = spark.sql(f"""SELECT cast(date_format(datetaken, "yyyyMM") as int) as datetaken, flatprice, terracedprice, semidetachedprice, detachedprice FROM {summaryTableName}""")
        summary_df = spark.sql(
            f"""SELECT cast(Year as int) as year, AVGFlatPricePerYear, AVGTerracedPricePerYear, AVGSemiDetachedPricePerYear, AVGDetachedPricePerYear FROM {v.DSDB}.yearlyhouseprices"""
        )
        df_10 = summary_df.filter(
            col("year").between(f'{start_date}', f'{end_date}'))
        print(df_10.toPandas().columns.tolist())

        # show pandas column list ['Year', 'AVGPricePerYear', 'AVGFlatPricePerYear', 'AVGTerracedPricePerYear', 'AVGSemiDetachedPricePerYear', 'AVGDetachedPricePerYear']
        p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF
        data = p_dfm.values

        # Non-Linear Least-Squares Minimization and Curve Fitting
        model = LorentzianModel()
        n = len(p_dfm.columns)
        for i in range(n):
            if p_dfm.columns[i] != 'year':  # year is x axis in integer
                # it goes through the loop and plots individual average curves one by one and then prints a report for each y value
                vcolumn = p_dfm.columns[i]
                print(vcolumn)
                params = model.guess(p_dfm[vcolumn], x=p_dfm['year'])
                result = model.fit(p_dfm[vcolumn], params, x=p_dfm['year'])
                result.plot_fit()

                # do linear regression here
                # Prepare data for machine learning: we need only two columns, features and label (p_dfm.columns[i])
                inputCols = ['year']
                vectorAssembler = VectorAssembler(inputCols=inputCols,
                                                  outputCol='features')
                vhouse_df = vectorAssembler.transform(df_10)
                vhouse_df = vhouse_df.select(
                    ['features', 'AVGFlatPricePerYear'])
                vhouse_df.show(20)
                if vcolumn == "AVGFlatPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("Flat house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""Flat price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.xlim(left=2009)
                    plt.xlim(right=2022)
                    plt.show()
                    plt.close()
                elif vcolumn == "AVGTerracedPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("Terraced house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""Terraced house price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.show()
                    plt.close()
                elif vcolumn == "AVGSemiDetachedPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("semi-detached house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""semi-detached house price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.show()
                    plt.close()
                elif vcolumn == "AVGDetachedPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("detached house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""detached house price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.show()
                    plt.close()

        p_df = df_10.select('AVGFlatPricePerYear', 'AVGTerracedPricePerYear',
                            'AVGSemiDetachedPricePerYear',
                            'AVGDetachedPricePerYear').toPandas().describe()
        print(p_df)
        #axs = scatter_matrix(p_df, figsize=(10, 10))
        # Describe returns a DF where count,mean, min, std,max... are values of the index
        y = p_df.loc[['min', 'mean', 'max']]
        #y = p_df.loc[['averageprice', 'flatprice']]
        ax = y.plot(linewidth=2, colormap='jet', marker='.', markersize=20)
        plt.grid(True)
        plt.xlabel("UK House Price Index, January 2020", fontdict=v.font)
        plt.ylabel("Property Prices in millions/GBP", fontdict=v.font)
        plt.title(
            f"""Property price fluctuations in {regionname} for the past 10 years """,
            fontdict=v.font)
        plt.legend(p_df.columns)
        plt.show()
        plt.close()
        lst = (spark.sql(
            "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
        )).collect()
        print("\nFinished at")
        uf.println(lst)
Example #32
# Set up the Spark and SQL contexts (note: this is for Spark < 2.0.0)
sc = SparkContext(appName="DistKeras ATLAS Higgs example")
sqlContext = SQLContext(sc)

# Read the Higgs dataset.
dataset = sqlContext.read.format('com.databricks.spark.csv')\
                    .options(header='true', inferSchema='true').load("data/atlas_higgs.csv")
# Print the schema of the dataset.
dataset.printSchema()
# Vectorize the features into the features column.
features = dataset.columns
features.remove('EventId')
features.remove('Weight')
features.remove('Label')
assembler = VectorAssembler(inputCols=features, outputCol="features")
dataset = assembler.transform(dataset)
# Since the output layer will not be able to read the string label, convert it to a double.
labelIndexer = StringIndexer(inputCol="Label",
                             outputCol="label_index").fit(dataset)
dataset = labelIndexer.transform(dataset)
# Feature normalization.
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_normalized",
                                withStd=True,
                                withMean=True)
standardScalerModel = standardScaler.fit(dataset)
dataset = standardScalerModel.transform(dataset)

# Define the structure of the dataset.
nb_features = len(features)
nb_classes = 2
Example #33
    spark = SparkSession.builder.appName("RateSourceLKF").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    noise_param = 1

    input_df = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
        .withColumn("mod", F.col("value") % num_states)\
        .withColumn("stateKey", F.col("mod").cast("String"))\
        .withColumn("trend", (F.col("value")/num_states).cast("Integer") + F.randn() * noise_param)

    lkf = LinearKalmanFilter()\
        .setStateKeyCol("stateKey")\
        .setMeasurementCol("measurement")\
        .setInitialStateMean(Vectors.dense([0.0, 0.0]))\
        .setInitialStateCovariance(Matrices.dense(2, 2, [10000.0, 0.0, 0.0, 10000.0]))\
        .setProcessModel(Matrices.dense(2, 2, [1.0, 0.0, 1.0, 1.0]))\
        .setProcessNoise(Matrices.dense(2, 2, [0.0001, 0.0, 0.0, 0.0001]))\
        .setMeasurementNoise(Matrices.dense(1, 1, [noise_param]))\
        .setMeasurementModel(Matrices.dense(1, 2, [1.0, 0.0]))

    assembler = VectorAssembler(inputCols=["trend"], outputCol="measurement")

    measurements = assembler.transform(input_df)
    query = lkf.transform(measurements)\
        .writeStream\
        .queryName("RateSourceLKF")\
        .outputMode("append")\
        .format("console")\
        .start()

    query.awaitTermination()
Example #34
def main():
    # Setup Spark
    spark = SparkSession.builder.master("local[*]").getOrCreate()

    # Nice way to write a tmp file onto the system
    temp_csv_file = tempfile.mktemp()
    with open(temp_csv_file, mode="wb") as f:
        data_https = requests.get(
            "https://teaching.mrsharky.com/data/iris.data"
        )
        f.write(data_https.content)

    iris_df = spark.read.csv(temp_csv_file, inferSchema="true", header="true")
    iris_df = iris_df.toDF(
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "class")

    iris_df.createOrReplaceTempView("iris")
    iris_df.persist(StorageLevel.DISK_ONLY)

    # Simple SQL
    results = spark.sql("SELECT * FROM iris")
    results.show()
    # Average for each of the 4
    average_overall = spark.sql(
        """
        SELECT
                AVG(sepal_length) AS avg_sepal_length
                , AVG(sepal_width) AS avg_sepal_width
                , AVG(petal_length) AS avg_petal_length
                , AVG(petal_width) AS avg_petal_width
            FROM iris
        """
    )
    average_overall.show()

    # Average for each of the 4 by class
    average_by_class = spark.sql(
        """
        SELECT
                class
                , AVG(sepal_length) AS avg_sepal_length
                , AVG(sepal_width) AS avg_sepal_width
                , AVG(petal_length) AS avg_petal_length
                , AVG(petal_width) AS avg_petal_width
            FROM iris
            GROUP BY class
        """
    )
    average_by_class.show()

    # Add a new column

    iris_df = iris_df.withColumn("rand", functions.rand(seed=42))
    iris_df.createOrReplaceTempView("iris")
    results = spark.sql("SELECT * FROM iris ORDER BY rand")
    results.show()

    vector_assembler = VectorAssembler(
        inputCols=[
            "sepal_length",
            "sepal_width",
            "petal_length",
            "petal_width"],
        outputCol="vector",
    )

    iris_df = vector_assembler.transform(iris_df)
    iris_df.show()


    # Numberize the class column of iris

    string_indexer = StringIndexer(inputCol="class", outputCol="indexed")
    indexer_fitted = string_indexer.fit(iris_df)
    iris_df = indexer_fitted.transform(iris_df)
    iris_df.createOrReplaceTempView("iris")
    results = spark.sql("SELECT * FROM iris ORDER BY rand")
    results.show()
    return

    # Random Forest
    random_forest_classifier = RandomForestClassifier(
        featuresCol="vector",
        labelCol="indexed"
    )
    random_forest_classifier_fitted = random_forest_classifier.fit(iris_df)
    iris_df = random_forest_classifier_fitted.transform(iris_df)
    iris_df.createOrReplaceTempView("iris")
    results = spark.sql("SELECT * FROM iris ORDER BY rand")
    results.show()

    # Calculate the model's Accuracy
    print_heading("Accuracy")
    iris_df_accuracy = spark.sql(
        """
        SELECT
                SUM(correct)/COUNT(*) AS accuracy
            FROM
                (SELECT
                        CASE WHEN prediction == class_idx THEN 1
                        ELSE 0 END AS correct
                    FROM predicted) AS TMP
              """
    )
    iris_df_accuracy.show()
                                                "mortdue":avg("mortdue"), 
                                                "value":avg("value"),
                                                "derog":avg("derog"),
                                                "delinq":0,
                                                "clage":avg("clage"),
                                                "ninq":avg("ninq"),
                                                "clno":avg("clno"),
                                                "debtinc":avg("debtinc")
                                                })
    """
    #Define Input-output columns, i.e. transform to MLP features vector
    ignore=['bad']
    assembler = VectorAssembler(
    inputCols=[k for k in clean_riskdata.columns if k not in ignore],
    outputCol="predictors")
    Triskdata = assembler.transform(clean_riskdata)
    # Split the data into train and test
    splits = Triskdata.randomSplit([0.4, 0.6], 1234)
    train = splits[0]
    test = splits[1]

    #################################################################
    # Preliminary analysis
    #################################################################
    print(clean_riskdata.describe().show())
    print(riskdata.stat.crosstab("bad","job").show())
    print(riskdata.stat.crosstab("bad","reason").show())
    #################################################################
    # Multilayer Perceptron Classifier
    #################################################################
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import os

df = sqlContext.read.json(os.environ['WORKDIR'] + "user_features.json")

df_restaurants = df.filter("category = \"Restaurants\"")


assembler = VectorAssembler(
    inputCols=["average_stars", "cat_avg_review_len", "cat_avg_stars", "cat_business_count", "cat_review_count", "months_yelping", "review_count", "votes_cool", "votes_funny", "votes_useful" ],
    outputCol="features")
output = assembler.transform(df_restaurants)

(trainingData, testData) = output.randomSplit([0.7, 0.3])

dt = DecisionTreeRegressor(labelCol = "elite", featuresCol="features")
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

predictions.select("prediction", "elite", "features").show(5)


evaluator = RegressionEvaluator(
    labelCol="elite", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print "Root Mean Squared Error (RMSE) on test data = %g" % rmse
Example #37
	df_second_round = df_second_round.join(dosage_mapping, df_second_round.DOSAGE == dosage_mapping.CPA_DOSAGE, how="left").na.fill("")
	df_second_round = df_second_round.withColumn("EFFTIVENESS_DOSAGE_SE", dosage_replace(df_second_round.MASTER_DOSAGE, \
														df_second_round.DOSAGE_STANDARD, df_second_round.EFFTIVENESS_DOSAGE)) 
														
	df_second_round = df_second_round.withColumn("EFFTIVENESS_PRODUCT_NAME_SE", prod_name_replace(df_second_round.MOLE_NAME, df_second_round.MOLE_NAME_STANDARD, \
														df_second_round.MANUFACTURER_NAME, df_second_round.MANUFACTURER_NAME_STANDARD, df_second_round.MANUFACTURER_NAME_EN_STANDARD))
	
	df_second_round = df_second_round.withColumn("EFFTIVENESS_PACK_QTY_SE", pack_replace(df_second_round.EFFTIVENESS_PACK_QTY, df_second_round.SPEC_ORIGINAL, \
														df_second_round.PACK_QTY, df_second_round.PACK_QTY_STANDARD))
	
	
	assembler = VectorAssembler( \
				inputCols=["EFFTIVENESS_MOLE_NAME", "EFFTIVENESS_PRODUCT_NAME_SE", "EFFTIVENESS_DOSAGE_SE", "EFFTIVENESS_SPEC", \
							"EFFTIVENESS_PACK_QTY_SE", "EFFTIVENESS_MANUFACTURER"], \
				outputCol="features")
	df_second_round = assembler.transform(df_second_round)
	# df_second_round.repartition(10).write.mode("overwrite").parquet("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/second_round_dt")
	
	predictions_second_round = model.transform(df_second_round)
	predictions_second_round.write.mode("overwrite").parquet("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/zyyin/second_round_prediction1106_1")
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
	accuracy = evaluator.evaluate(predictions_second_round)
	print("Test Error = %g " % (1.0 - accuracy))
	print("Test set accuracy = " + str(accuracy))
	
	# Second-round accuracy check
	df_true_positive_se = predictions_second_round.where(predictions_second_round.prediction == 1.0)
	
	ph_positive_prodict_se = df_true_positive_se.count()
	print("机器判断第二轮TP条目 = " + str(ph_positive_prodict_se))
	ph_positive_hit_se = df_true_positive_se.where((df_true_positive_se.prediction == df_true_positive_se.label) & (df_true_positive_se.label == 1.0)).count()
Example #38
    'Avg Session Length', 'Time on App', 'Time on Website',
    'Length of Membership'
],
                                 outputCol='features')

# COMMAND ----------

#type(vectorassember)

# COMMAND ----------

print(vectorassember)

# COMMAND ----------

output = vectorassember.transform(data)

# COMMAND ----------

#df3.printSchema()

# COMMAND ----------

#from pyspark.sql.types import IntegerType
#df3= df3.withColumn("air_time", df3['air_time'].cast(IntegerType()))

# COMMAND ----------

#df3.printSchema()

# COMMAND ----------
Example #39
    # Create training and test data sets
    training_data, test_data = data.randomSplit([0.8, 0.2], seed=7)
    print('Test data')
    print(test_data.groupby('label').count().show())

    print('Training data')
    print(training_data.groupby('label').count().show())
    print(training_data.show())

    # New data set with the following columns:
    #   - 'label' - class.
    #   - 'features' - a vector containing the particular attributes.
    assembler = VectorAssembler(inputCols=column_names, outputCol='features')

    training_data = assembler.transform(training_data)
    training_data = training_data.select('label', 'features')
    # training_data = training_data.drop(*column_names)
    test_data = assembler.transform(test_data)
    test_data = test_data.select('label', 'features')
    # test_data = test_data.drop(*column_names)
    print(training_data.take(1))

    # Scale
    training_scale, _ = standardScale(training_data)
    print('\nScaled training data (Standard)')
    print(training_scale.take(1))
    # training_scale.write.csv('db/training_scale', header=True)
    # training_scale.rdd.saveAsPickleFile('db/training_scale')

    training_scale, _ = minMaxScale(training_data)
Example #40
# Initialize SparkSession
spark = (SparkSession
         .builder
         .appName("news")
         .enableHiveSupport()
         .getOrCreate())

# Read raw data
df = spark.read.csv('/home/worker/data/news.csv', header=True, inferSchema=True, mode="DROPMALFORMED", encoding='UTF-8')

print("==== 生データ ====")
df.show(truncate=False)

assembler = VectorAssembler(inputCols=df.columns[1:], outputCol="変量")
feature_vectors = assembler.transform(df)
feature_vectors.show()


print("==== LightGBMの学習 ====")
model = LightGBMRegressor(alpha=0.3,
                          learningRate=0.3,
                          numIterations=100,
                          numLeaves=31,
                          featuresCol='変量',
                          labelCol='スポーツ').fit(feature_vectors)


print("==== 元のデータフレーム行数 ====")
print((df.count(), len(df.columns)))
Example #41
    StructField("\"\"\"\"chlorides\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"free sulfur dioxide\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"total sulfur dioxide\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"density\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"pH\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"sulphates\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"alcohol\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"quality\"\"\"\"", FloatType(), True)
])
testing = spark.read.format("csv").option("header", "true").option(
    "delimiter",
    ";").schema(schema).load("s3n://643-pa2/ValidationDataset.csv")
vectorAssembler = VectorAssembler(inputCols=[
    "\"\"\"\"\"fixed acidity\"\"\"\"", "\"\"\"\"volatile acidity\"\"\"\"",
    "\"\"\"\"citric acid\"\"\"\"", "\"\"\"\"residual sugar\"\"\"\"",
    "\"\"\"\"chlorides\"\"\"\"", "\"\"\"\"free sulfur dioxide\"\"\"\"",
    "\"\"\"\"total sulfur dioxide\"\"\"\"", "\"\"\"\"density\"\"\"\"",
    "\"\"\"\"pH\"\"\"\"", "\"\"\"\"sulphates\"\"\"\"",
    "\"\"\"\"alcohol\"\"\"\""
],
                                  outputCol='features')
test_data = vectorAssembler.transform(testing)
predictions = Model.transform(test_data)
predictionAndLabels = predictions.select(
    ['prediction', "\"\"\"\"quality\"\"\"\""]).rdd

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
print("F1 Score: " + str(metrics.weightedFMeasure()))
예제 #42
0
    df = process(df_base)

from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark import StorageLevel

from pyspark.ml.clustering import GaussianMixture

for i in range(6):
    n = 10**i

    for k in [5, 25, 50, 100, 500, 1000]:
        with Timer('limit', 'Limiting data, n={}, k={}'.format(n, k)):
            df_ik = df.limit(n)

        with Timer('clustering', 'n={}, k={}'.format(n, k)):

            gmm = GaussianMixture(k=k)

            va = VectorAssembler(
                inputCols=["pickup_latitude", "pickup_longitude"],
                outputCol="features")
            df_t = va.transform(df_ik)

            model = gmm.fit(df_t)

            df_p = model.transform(df_t)

            df_pp = df_p.select('pickup_latitude', 'pickup_longitude',
                                'prediction').toPandas()
예제 #43
0
ds = ds_raw.select([ds_raw.columns[3]] + [to_float(col(column)).alias(column) for column in ds_raw.columns[4:]])

from pyspark.ml.feature import StringIndexer

categoryIndexer = StringIndexer(inputCol="alchemy_category", outputCol="alchemy_category_index")
categoryTransformer = categoryIndexer.fit(ds)
df1 = categoryTransformer.transform(ds)
from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(dropLast=False, inputCol="alchemy_category_index", outputCol="alchemy_category_index_vector")
df2 = encoder.transform(df1)
from pyspark.ml.feature import VectorAssembler

assemblerInput = ['alchemy_category_index_vector'] + ds.columns[1:-1]
assembler = VectorAssembler(inputCols=assemblerInput, outputCol="features")
df3 = assembler.transform(df2)
# deal with categorical label
from pyspark.ml.feature import StringIndexer

# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(df3)
df4 = labelIndexer.transform(df3)
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features", impurity="gini", maxDepth=10, maxBins=14)
dt_model = dt.fit(df4)
df5 = dt_model.transform(df4)
# Convert indexed labels back to original labels.
from pyspark.ml.feature import IndexToString
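# IndexToString is imported above but never used in the snippet; a minimal sketch
# of mapping the numeric predictions back to the original string labels:
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
df6 = labelConverter.transform(df5)
df6.select("label", "predictedLabel").show(5)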
예제 #44
0
def main():
    """
     - Downloads outstanding data
     - Sets up Spark environment
     - Loads data
     - Summarises data
     - Merges data
     - Prepares data for modelling
    :return: None
    """
    # --- Download data (if it's not already downloaded)
    if config.MODE == 'prod':
        # In production mode we want to download all the csv files;
        # datasets in develop mode are controlled by the user
        # Download and save taxi journey data
        download_data(config.TAXI_DATA_URLS, config.TAXI_DATA_DIR)
        # Download and save road traffic accident data
        download_data(config.ACCIDENT_DATA_URLS, config.ACCIDENT_DATA_DIR)

    # --- Set up Spark environment
    spark = SparkSession.builder.appName('Basics').getOrCreate()

    # --- Load data
    # Load and parse taxi data, add an id column
    taxi_df = load_data(spark,
                        data_dir=config.TAXI_DATA_DIR,
                        schema=config.TAXI_DATA_SCHEMA)\
        .withColumn(config.TAXI_ID_COL, monotonically_increasing_id())

    logging.info(
        f'Selecting a random {config.SAMPLE_RATE * 100}% of data before merging'
    )
    # Keep a random SAMPLE_RATE fraction of the data, with seed=1
    splits = taxi_df.randomSplit([1 - config.SAMPLE_RATE, config.SAMPLE_RATE],
                                 seed=1)
    taxi_df = splits[1]

    # Load and parse accident data, add an id column and a timestamp column
    accident_df = load_data(spark,
                            data_dir=config.ACCIDENT_DATA_DIR,
                            schema=config.ACCIDENT_DATA_SCHEMA)\
        .withColumn(config.ACCIDENT_ID_COL, monotonically_increasing_id())\
        .withColumn('accident_timestamp', unix_timestamp('date', 'MM/dd/yyyy').cast('timestamp'))

    # --- Summarise data
    # Plot and save data summary (if it's not already saved)

    # plot_summary(taxi_df, 'pickup_datetime', config.TAXI_ID_COL, config.TAXI_VOLUME_PLOT_FILE)
    # plot_summary(accident_df, 'accident_timestamp', config.ACCIDENT_ID_COL, config.ACCIDENT_VOLUME_PLOT_FILE)

    # --- Create ML features
    # Merge nearby accidents with taxi trips (this is a very long running process)
    df = merge_accidents(taxi_df, accident_df)

    # Create day of week, hour of day values from time stamp
    df = timestamps_to_features(df, 'pickup_datetime')
    df = timestamps_to_features(df, 'dropoff_datetime')

    # --- Log the results of the data preparation stages
    logging.info('Data preparation complete')
    # RDD has a countApprox(timeout) method that can be quicker than .count(); it is unclear
    # whether the DataFrame-to-RDD conversion cancels out the saving
    logging.info(f'Number of rows: {df.rdd.countApprox(10)}')
    logging.info(f'Data schema: \n{df._jdf.schema().treeString()}')

    # TODO extract all this to function(s)
    # --- Train a gradient boosted trees model to predict the duration of a trip
    # Create the target variable ('label' is a special column name in MLlib)
    logging.info(
        'Creating label column (based on the delta between dropoff_datetime and pickup_datetime)'
    )
    df = df.withColumn(
        'label',
        unix_timestamp(df['dropoff_datetime']) -
        unix_timestamp(df['pickup_datetime']))

    logging.info('Dropping rows that contain null (for a subset of columns)')
    # Drop any samples with a NULL value
    df = df.na.drop(
        how="any",
        subset=config.TAXI_DATA_SCHEMA.fieldNames() + [
            'pickup_datetime_day_of_week', 'pickup_datetime_hour_of_day',
            'pickup_datetime_month_of_year', 'dropoff_datetime_day_of_week',
            'dropoff_datetime_hour_of_day', 'dropoff_datetime_month_of_year'
        ])

    logging.info('Filling any remaining null values with 99999')
    # Filling na values with code 99999
    df = df.na.fill(value=99999)

    logging.info('Splitting records into training and testing sets')
    # Split the data into training and test sets (30% held out for testing)
    (training_df, testing_df) = df.randomSplit([0.7, 0.3])

    # Define an "VectorAssembler", which joins multiple columns into a single vector
    ignore = [
        'label', 'pickup_datetime', 'dropoff_datetime', config.ACCIDENT_ID_COL,
        config.TAXI_ID_COL
    ]
    assembler = VectorAssembler(
        inputCols=[col for col in df.columns if col not in ignore],
        outputCol='features')
    # Transform the data using the defined assembler
    logging.info('Transforming data using VectorAssembler')
    df = assembler.transform(df)

    # Define a "VectorIndexer" which converts categorical fields (defined by having less than 20 unique values) into
    # one-hot (?) encoded data
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures").fit(
                                        df.select('features'))

    # Train a GBT model
    gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

    # Chain assembler, indexer and GBT in a Pipeline
    pipeline = Pipeline(stages=[assembler, feature_indexer, gbt])

    logging.info('Running model pipeline')
    # Train model.  This also runs the indexer.
    model_pipeline = pipeline.fit(training_df)

    logging.info('Making model predictions')
    # Make predictions
    predictions = model_pipeline.transform(testing_df)

    # Select example rows to display
    predictions.select("prediction", "label", "features").show(5)

    logging.info('Evaluating predictions')
    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    logging.info(f'Root Mean Squared Error (RMSE) on test data: {rmse}')

    # TODO extract this to a function
    # Save the outputs (predictions and model)

    import uuid

    run_id = uuid.uuid4()

    date = datetime.date.today()
    model_pipeline.save(
        f'{config.MODEL_DIR}/{run_id}/model-{config.MODE}-{date}')
    predictions.select("prediction", "label").write.csv(
        f'{config.PREDICTIONS_DIR}/{run_id}/predictions-{config.MODE}-{date}.csv'
    )

    # Print some cool stuff about the model
    gbt_model = model_pipeline.stages[2]
    logging.info(gbt_model)
    attrs = sorted((attr["idx"], attr["name"]) for attr in (chain(
        *df.schema["features"].metadata["ml_attr"]["attrs"].values())))

    for idx, name in attrs:
        if gbt_model.featureImportances[idx]:
            print(name, gbt_model.featureImportances[idx])
예제 #45
0
def main():
    # TODO - Check if valid CSV file path
    input_file = sys.argv[1]


    spark = SparkSession \
     .builder \
     .master("local[*]") \
     .appName("cs643-prediction") \
     .getOrCreate()

    # TODO - This is how docker container file structure should be
    loaded_regression_model = LinearRegressionModel.load(
        "/data/model/trained-model")

    # read dataset to predict
    input_dataset = spark.read.csv(input_file,
                                   header='true',
                                   inferSchema='true',
                                   sep=';')

    assembler = VectorAssembler(inputCols=[
        input_dataset.columns[1], input_dataset.columns[2],
        input_dataset.columns[3], input_dataset.columns[4],
        input_dataset.columns[5], input_dataset.columns[6],
        input_dataset.columns[7], input_dataset.columns[8],
        input_dataset.columns[9], input_dataset.columns[10]
    ],
                                outputCol="Attributes")

    valid_output = assembler.transform(input_dataset)

    valid_finalized_data = valid_output.select("Attributes",
                                               input_dataset.columns[11])

    # predict the quality
    input_predictions = loaded_regression_model.transform(valid_finalized_data)

    data_eval = RegressionEvaluator(labelCol=input_dataset.columns[11],
                                    predictionCol="prediction",
                                    metricName="rmse")
    # r2 - coefficient of determination
    r2 = data_eval.evaluate(input_predictions, {data_eval.metricName: "r2"})
    print("\n\n\n")
    print("r2: %.3f" % r2)

    # Root Mean Square Error
    rmse = data_eval.evaluate(input_predictions)
    print("Root Mean Squared Error (RMSE): %g" % rmse)
    # Mean Square Error
    mse = data_eval.evaluate(input_predictions, {data_eval.metricName: "mse"})
    print("MSE: %g" % mse)
    # Mean Absolute Error
    mae = data_eval.evaluate(input_predictions, {data_eval.metricName: "mae"})
    print("MAE: %g" % mae)

    # Check if user provided how many rows to print
    if args.o is not None:
        input_predictions.show(int(args.o), truncate=False)
    else:
        input_predictions.show(truncate=False)
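# 'args' is referenced inside main() but never defined in this snippet; a hedged
# module-level sketch of how it might be parsed (the '-o' flag name is an assumption):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-o', default=None, help='number of prediction rows to show')
args, _ = parser.parse_known_args()

if __name__ == '__main__':
    main()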
#	    map() applies a function to every item of an iterable and returns the result.
OneHot = map(lambda c: c + "classVec", categoricalColumns)     # Target not included
OneHot = list(OneHot)
OneHot                                                         # List of names

# Compile list of all OneHot and numerical cols for assembling into 'features'
assemblerInputs = OneHot + numericCols
assemblerInputs
len(assemblerInputs)		# 8 (OneHot) + 6 (numeric) = 14


# Create a VectorAssembler object to apply to assemblerInputs
#  The output column is conventionally named 'features';
#   it contains all the predictors
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)


# Examine df
df.take(2)
df.columns
df.dtypes
len(df.columns)     # 18 (new cols created above) + 15 (original cols) = 33


######################### CC.Modeling Data #########################################

# Keep only relevant columns
#  Note: now we need just two columns, label and features
#   (you can experiment by removing the other cols)
selectedcols = ["label", "features"] + cols    # i.e. 15 + 2 = 17; ignore others
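# A one-line sketch applying the selection (the original listing stops here):
df = df.select(selectedcols)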
# MAGIC A decision tree is a simple representation for classifying examples. For this section, assume that all of the input features have finite discrete domains, and there is a single target feature called the "classification". Each element of the domain of the classification is called a class. A decision tree or a classification tree is a tree in which each internal (non-leaf) node is labeled with an input feature. The arcs coming from a node labeled with an input feature are labeled with each of the possible values of that feature, or the arc leads to a subordinate decision node on a different input feature. Each leaf of the tree is labeled with a class or a probability distribution over the classes, signifying that the data set has been classified by the tree into either a specific class or a particular probability distribution over the classes.

# COMMAND ----------

# MAGIC %md
# MAGIC Let's build a decision tree using the training data set

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Vectorize the features (all columns excluding the first one, Survived)
features = trainDF.columns[1:]
assembler = VectorAssembler(inputCols=features, outputCol="features")
assembledTrainDF = assembler.transform(trainDF)

# Train a decision tree, setting the maxDepth parameter to 2
dtc = DecisionTreeClassifier(featuresCol="features", labelCol="Survived", maxDepth=2)
dtcModel = dtc.fit(assembledTrainDF)

# Print the constructed tree
print(dtcModel.toDebugString)

# COMMAND ----------

# Visualize the decision tree

display(dtcModel)

# COMMAND ----------
예제 #48
0
''' Create Spark Data Frame and add features/labels for the MLlib
'''
if DEBUG_SMALL:
    print("Running training on small data-set")
    traindf = sqlContext.createDataFrame(train_df[0:5000])
else:
    print("Running training on 80% data-set")
    traindf = sqlContext.createDataFrame(train_df)

# The transformations below bring the data frame into the format MLlib requires:
# a data frame with two columns, label and features, where the features column is a single vector of all feature values.
#
labelIndexer = StringIndexer(inputCol="status_group",
                             outputCol="indexedLabel").fit(traindf)
assembler = VectorAssembler(inputCols=train_columns, outputCol="features")
traindf = assembler.transform(traindf)
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=3).fit(traindf)

(trainingData, testData) = traindf.randomSplit([0.8, 0.2])

#
#Best params from sklearn {'n_estimators': 120, 'random_state': 1, 'min_samples_split': 5, 'max_features': 50, 'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1}
#
# MLlib RandomForestClassifier input (from documentation)
# class pyspark.ml.classification.RandomForestClassifier(self, featuresCol="features", labelCol="label", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,
#maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0)
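# The snippet ends before the forest is actually trained; a hedged sketch mapping
# the sklearn parameters above onto their closest MLlib equivalents (not every
# sklearn option has a direct counterpart):
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures",
                            numTrees=120,           # n_estimators
                            maxDepth=30,            # sklearn used None; MLlib caps depth at 30
                            minInstancesPerNode=1,  # min_samples_leaf
                            seed=1)                 # random_state

rf_model = rf.fit(featureIndexer.transform(labelIndexer.transform(trainingData)))
predictions = rf_model.transform(featureIndexer.transform(labelIndexer.transform(testData)))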

#########################################################################
# Below code implementation is adopted from Spark MLlib main guide
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler()\
  .setInputCols(["Quantity", "UnitPrice"])\
  .setOutputCol("features")

sales = va.transform(spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/data/retail-data/by-day/*.csv")
  .limit(50)
  .coalesce(1)
  .where("Description IS NOT NULL"))

sales.cache()


# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)


# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points per cluster
print(kmModel.computeCost(sales))  # within-set sum of squared errors
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
indexer = StringIndexer(inputCol="education", outputCol="new_education")
indexed = indexer.fit(new_data).transform(new_data)

indexer1 = StringIndexer(inputCol="sex", outputCol="new_sex")
indexed1 = indexer1.fit(indexed).transform(indexed)

indexer2= StringIndexer(inputCol="relationship",outputCol="new_rel")
indexed2= indexer2.fit(indexed1).transform(indexed1)

indexed2=indexed2.drop("sex","education","relationship")
indexed2.show()


# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=indexed2.columns[1:], outputCol="features")
data = assembler.transform(indexed2)
# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.6, 0.4])

# Create Random Forest model and fit the model with training dataset
rf = RandomForestClassifier()
model = rf.fit(training)

# Generate prediction from test dataset
predictions = model.transform(test)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

# Show model accuracy
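# A sketch of displaying the accuracy computed above:
print("Model accuracy: {}".format(accuracy))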
#encode the dependent variable - category_predict
classifyIndexer = StringIndexer(inputCol="Category", outputCol="Category_Index")
classifymodel = classifyIndexer.fit(encoded)
encoded2 = classifymodel.transform(encoded)



#keep the following columns: x, y, hour, day, month, year, dayofweek, week, x_sim, y_sim
#drop the following
cleaned = encoded2.select([c for c in encoded2.columns if c not in{'DayOfWeek','Category','Address','Dates','Descript','PdDistrict','Resolution','PdDistrict_Index'}])

ignore = ['Category_Index']
assembler = VectorAssembler(inputCols=[x for x in cleaned.columns if x not in ignore],outputCol='features')

transformed = assembler.transform(cleaned)


data_transformed = transformed.select(col("Category_Index").alias("label"), col("features")).map(lambda row: LabeledPoint(row.label, row.features))

#********************************************************************************
# split the training set
train, test = data_transformed.randomSplit([0.7, 0.3], seed = 2)

#naivebayes classifier
#lambda = 1.0
# initialize classifier:
nb_model = mllib_class.NaiveBayes.train(train, 1.0)
#this step will take 50 seconds

# Make prediction and test accuracy.
예제 #52
0
    def fit(
        self,
        sdf: DataFrame,
        label_col: str = "label",
        sdf_validation: Optional[DataFrame] = None,
        estimator_params: Optional[Dict[str, object]] = None,
        explainer_type_params: Optional[Dict[str, object]] = None,
        explainer_params: Optional[Dict[str, object]] = None,
        broadcast: bool = True,
    ) -> "SparkSelector":
        """Fit the Spark selector with the provided estimator.

        Args:
            sdf: The training input samples.
            label_col: The target column name.
            sdf_validation: The validation input samples.
            estimator_params: Additional parameters for the underlying estimator's fit method.
            explainer_type_params: Additional parameters for the explainer's init.
            explainer_params: Additional parameters for the explainer's shap_values method.
            broadcast: Whether to broadcast the target column when joining.

        """

        # Check if pyspark and pyarrow are installed
        if DataFrame is None or importlib.util.find_spec("pyarrow") is None:
            raise ImportError(
                "SparkSelector requires both pyspark and pyarrow.")

        # Validate parameters
        self._validate_params()

        # Set estimator parameters
        self.estimator.setFeaturesCol(SPARK_FEATURES_NAME)
        self.estimator.setLabelCol(label_col)

        # Make sure that check_additivity is disabled (it's not supported for Spark estimators)
        explainer_params = self._set_additivity_false(explainer_params)

        # Assemble the features vector
        features = [col for col in sdf.columns if col != label_col]
        assembler = VectorAssembler(inputCols=features,
                                    outputCol=SPARK_FEATURES_NAME,
                                    handleInvalid="keep")
        sdf = assembler.transform(sdf)

        # With the progress bar
        with tqdm(total=self.n_iter, disable=not self.verbose) as pbar:
            # Get the true shap values (i.e. without shuffling)
            pbar.set_description("Computing true SHAP values")
            true_pos_shap_values, true_neg_shap_values = self._get_shap_values(
                sdf,
                label_col=label_col,
                shuffle=False,
                sdf_validation=sdf_validation,
                estimator_params=estimator_params,
                explainer_type_params=explainer_type_params,
                explainer_params=explainer_params,
            )

            # Get the null shap values (i.e. with shuffling)
            pbar.set_description("Computing null SHAP values")
            null_pos_shap_values = [None] * self._n_outputs
            null_neg_shap_values = [None] * self._n_outputs
            for i in range(self.n_iter):
                self._current_iter = i + 1
                if self.verbose:
                    logger.info(
                        f"Iteration {self._current_iter}/{self.n_iter}")
                pos_shap_values, neg_shap_values = self._get_shap_values(
                    sdf,
                    label_col=label_col,
                    shuffle=True,
                    sdf_validation=sdf_validation,
                    estimator_params=estimator_params,
                    explainer_type_params=explainer_type_params,
                    explainer_params=explainer_params,
                    broadcast=broadcast,
                )
                for j in range(self._n_outputs):
                    if i == 0:
                        null_pos_shap_values[j] = pos_shap_values[j].to_frame()
                        null_neg_shap_values[j] = neg_shap_values[j].to_frame()
                    else:
                        null_pos_shap_values[j] = null_pos_shap_values[j].join(
                            pos_shap_values[j],
                            rsuffix=f"_{self._current_iter}")
                        null_neg_shap_values[j] = null_neg_shap_values[j].join(
                            neg_shap_values[j],
                            rsuffix=f"_{self._current_iter}")
                pbar.update(1)

        # Compute p-values
        self.p_values_ = self._compute_p_values(true_pos_shap_values,
                                                null_pos_shap_values,
                                                true_neg_shap_values,
                                                null_neg_shap_values)

        # Cleanup
        self._n_outputs = None
        self._X_with_index = None
        self._X_for_shap = None

        return self
예제 #53
0
# We train and calibrate the model, adjusting its internal parameters based on its results on the evaluation set

from pyspark.sql.types import *
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

# Define the dataset for predicting the ARR_DEL15 label
# Generate a vector with the label column and the features array column

ignore = ['label']
assembler = VectorAssembler(
    inputCols=[x for x in train.columns if x not in ignore],
    outputCol='features')
train_LP = assembler.transform(train).select(['label', 'features'])
evaluation_LP = assembler.transform(evaluation).select(['label', 'features'])

# Define the model algorithm (decision tree)
dt = DecisionTreeClassifier(labelCol="label",
                            featuresCol="features",
                            maxDepth=10,
                            maxBins=64)

# Fit the model
model = dt.fit(train_LP)

#Save the model
#model.save("dbfs:/dataset/modelo_binario_DT")

# Make predictions.
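# A minimal sketch of the prediction and evaluation step announced above, reusing
# evaluation_LP and the MulticlassClassificationEvaluator imported earlier:
predictions = model.transform(evaluation_LP)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Accuracy on the evaluation set: {}".format(evaluator.evaluate(predictions)))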
#LOAD DATA
dataset = spark.read.format("libsvm").load("/mapreduce-test/pro_1/shot_logs.csv")
df = df.drop(columns=['GAME_ID','SHOT_RESULT','MATCHUP','LOCATION','W','FINAL_MARGIN','SHOT_NUMBER','PERIOD','GAME_CLOCK','DRIBBLES','CLOSEST_DEFENDER','CLOSEST_DEFENDER_PLAYER_ID','FGM','PTS','player_id','PTS_TYPE','TOUCH_TIME'])
df = df[df.SHOT_RESULT != 'missed']
df = df.groupby('player_name')
df = df.mean()

#FILTER DATA TO GROUP BY PLAYER
training_set = df[['SHOT_CLOCK', 'SHOT_DIST', 'CLOSE_DEF_DIST']]

#EXPLORATION OF KVALUES

#Convert dataset into VectorRow data cells
data_of_interest = (dataset
    .withColumn('CLOSE_DEF_DIST', dataset['CLOSE_DEF_DIST'].cast(DoubleType()))
    .withColumn('SHOT_DIST', dataset['SHOT_DIST'].cast(DoubleType()))
    .withColumn('SHOT_CLOCK', dataset['SHOT_CLOCK'].cast(DoubleType())))
feature_vector = VectorAssembler(inputCols=['CLOSE_DEF_DIST', 'SHOT_DIST', 'SHOT_CLOCK'], outputCol="features")
transform_data = feature_vector.transform(data_of_interest)
player_names = transform_data.select("player_name").distinct().collect()
list_items = list()
evaluator = ClusteringEvaluator()

#Getting Silhouette with squared euclidean distance for k value ranging from 2 to 8
TotalSED = []
for player in player_names:
    features = transform_data.where(transform_data["player_name"] == player[0]).select("features")
    for k in range(2,8):
        kmeans = KMeans(featuresCol = 'features', k=k)
        model = kmeans.fit(features)
        predictions = model.transform(features)
        silhouette = evaluator.evaluate(predictions)
        print("With K={}".format(k))
        print("Silhouette with squared euclidean distance = " + str(silhouette))
예제 #55
0
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorAssemblerExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")

    output = assembler.transform(dataset)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
예제 #56
0
#from pyspark.ml.feature import StringIndexer
# this will convert each unique string into a numeric
#indexer = StringIndexer(inputCol="txtlabel", outputCol="label")
#indexed = indexer.fit(mydf).transform(mydf)
#indexed.show(5)
# now we need to create a "label" and "features"
# input for using the Spark ML library

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

## use  features to predict if there will be a fatality
assembler = VectorAssembler(
    inputCols=[ "age_n", "wt_n", "gndr_n", "druglisthash", "medcount"],
    outputCol="features")
output = assembler.transform(fda20k)
# note the column headers - label and features are keywords
output.show(3)
from pyspark.ml.classification import LogisticRegression

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(output)

#### Major shortcut - no train and test data!!!
# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
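# A minimal sketch of inspecting those fitted parameters:
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())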
예제 #57
0
result.show()

# Rename Column count(IP) to IP
result = result.withColumnRenamed("count(IP)", "IP")
# Drop null values
result = result.dropna(how="any", subset=["IP", "Time"])
result.show()

# Converting datetime to unix_timestamp
result = result.withColumn("Time", unix_timestamp(result.Time))
result.show()
result = result.withColumn("IP", result["Time"].cast(IntegerType()))

# Convert features to vectors with VectorAssembler - required by ML models
assembler = VectorAssembler(inputCols=['IP', 'Time'], outputCol='features')
v_result = assembler.transform(result)
v_result = v_result.select(['features', 'Time'])
splits = v_result.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]
lr = LinearRegression(featuresCol='features',
                      labelCol='Time',
                      maxIter=10,
                      regParam=0.3,
                      elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))
lrModelSummary = lr_model.summary
print("Train R2 Score: ", lrModelSummary.r2)
예제 #58
0
    "neighbourhood_cleansed",
    "room_type",
    "bedrooms",
    "bathrooms",
    "number_of_reviews",
    "price",
).show(5)

trainDF, testDF = airbnbDF.randomSplit([0.8, 0.2], seed=42)
print(
    f"""There are {trainDF.count()} rows in the training set,
    and {testDF.count()} in the test set"""
)

vecAssembler = VectorAssembler(inputCols=["bedrooms"], outputCol="features")
vecTrainDF = vecAssembler.transform(trainDF)
vecTrainDF.select("bedrooms", "features", "price").show(10)

lr = LinearRegression(featuresCol="features", labelCol="price")
lrModel = lr.fit(vecTrainDF)

m = round(lrModel.coefficients[0], 2)
b = round(lrModel.intercept, 2)
print(f"""The formula for the linear regression line is price = {m}*bedrooms + {b}""")

pipeline = Pipeline(stages=[vecAssembler, lr])
pipelineModel = pipeline.fit(trainDF)

predDF = pipelineModel.transform(testDF)
predDF.select("bedrooms", "features", "price", "prediction").show(10)
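# A hedged sketch of quantifying the test-set error with RegressionEvaluator
# (labelCol "price", as used above):
from pyspark.ml.evaluation import RegressionEvaluator

regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                          labelCol="price",
                                          metricName="rmse")
rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")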
print(label_indexed.take(1))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we will use the VectorAssembler() to combine all the feature columns into a single vector column. This will include both the numeric columns and the one-hot encoded binary vector columns in our dataset.

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
assembler = VectorAssembler(
    inputCols=["age","workclassclassVec","fnlwgt","educationclassVec","education_num","marital_statusclassVec",
               "occupationclassVec","relationshipclassVec","raceclassVec", "sexclassVec", "capital_gain", "capital_loss", "hours_per_week",
               "native_countryclassVec"],
    outputCol="features")
output = assembler.transform(label_indexed)

# Keep relevant columns
selectedcols = ["label", "features"] + cols
dataset = output.select(selectedcols)
display(dataset)

# COMMAND ----------

### Randomly split data into training and test sets; set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# COMMAND ----------
예제 #60
0
                                              & (col("startLon") >= lonWest)
                                              & (col("startLon") <= lonEast)
                                              & (col("startLat") >= latSouth)
                                              & (col("startLat") <= latNorth)
                                              & (col("endLon") >= lonWest)
                                              & (col("endLon") <= lonEast)
                                              & (col("endLat") >= latSouth)
                                              & (col("endLat") <= latNorth))

taxi = taxi.select('startLon', 'startLat', 'tip')

GA = taxi.rdd.map(lambda row: row.asDict())
GA.saveToMongoDB('mongodb://localhost:27017/POC1.données')

vec_assembler = VectorAssembler(inputCols=taxi.columns, outputCol='features')
final_data = vec_assembler.transform(taxi)

kmeans = KMeans(featuresCol='features', k=12)
model = kmeans.fit(final_data)

centers = model.clusterCenters()
print("Cluster Centers: ")
A = []
for center in centers:
    print(center.tolist())
    A.append(center.tolist())

resultat = sc.parallelize(A).toDF(['startlon', 'startlat', 'tip'])

GA = resultat.rdd.map(lambda row: row.asDict())
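# Presumably the centers are written back to MongoDB the same way as the raw points
# above; the target collection name 'POC1.centres' here is an assumption:
GA.saveToMongoDB('mongodb://localhost:27017/POC1.centres')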