def test_norms(self):
    a = DenseVector([0, 2, 3, -1])
    self.assertAlmostEqual(a.norm(2), 3.742, 3)
    # assertTrue(x, msg) always passes for any truthy x; use assertEqual to
    # actually compare the norms against the expected values.
    self.assertEqual(a.norm(1), 6)
    self.assertEqual(a.norm(inf), 3)
    a = SparseVector(4, [0, 2], [3, -4])
    self.assertAlmostEqual(a.norm(2), 5)
    self.assertEqual(a.norm(1), 7)
    self.assertEqual(a.norm(inf), 4)
    tmp = SparseVector(4, [0, 2], [3, 0])
    self.assertEqual(tmp.numNonzeros(), 1)
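# Worked check of the expected values above: for a = [0, 2, 3, -1],
# ||a||_2 = sqrt(0 + 4 + 9 + 1) = sqrt(14) ≈ 3.742, ||a||_1 = 0 + 2 + 3 + 1 = 6,
# and ||a||_inf = max(|a_i|) = 3. A standalone sketch outside the test class:
from pyspark.ml.linalg import DenseVector

a = DenseVector([0, 2, 3, -1])
print(a.norm(2), a.norm(1), a.norm(float('inf')))  # ~3.742, 6.0, 3.0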
def test_dot(self):
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1., 2., 3., 4.]))
    lst = DenseVector([1, 2, 3, 4])
    mat = array([[1., 2., 3., 4.],
                 [1., 2., 3., 4.],
                 [1., 2., 3., 4.],
                 [1., 2., 3., 4.]])
    arr = pyarray.array('d', [0, 1, 2, 3])
    self.assertEqual(10.0, sv.dot(dv))
    self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
    self.assertEqual(30.0, dv.dot(dv))
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
    self.assertEqual(30.0, lst.dot(dv))
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
    self.assertEqual(7.0, sv.dot(arr))
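# Worked check of two of the expectations above (a standalone sketch):
# sv = (0, 1, 0, 2) and dv = (1, 2, 3, 4), so sv·dv = 1*2 + 2*4 = 10.0;
# every row of mat is (1, 2, 3, 4), so sv·mat = 1*row + 2*row = (3, 6, 9, 12).
from pyspark.ml.linalg import SparseVector, DenseVector

sv = SparseVector(4, {1: 1, 3: 2})
dv = DenseVector([1., 2., 3., 4.])
print(sv.dot(dv))  # 10.0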
def predict(rows):
    import tensorflow as tf
    from pyspark import Row
    from pyspark.ml.linalg import DenseVector, SparseVector

    k = keras_utils.keras()
    k.backend.set_floatx(floatx)

    # Do not use GPUs for prediction; use a single CPU core per task.
    pin_cpu(tf, k)

    def load_model_fn(x):
        with k.utils.custom_object_scope(custom_objects):
            return k.models.load_model(x)

    model = keras_utils.deserialize_model(serialized_model,
                                          load_model_fn=load_model_fn)

    input_shapes = [[dim if dim else -1 for dim in input.shape.as_list()]
                    for input in model.inputs]

    def to_array(item):
        # `[DenseVector or SparseVector]` short-circuits to `[DenseVector]`
        # and silently misses SparseVector; test both types explicitly.
        if isinstance(item, (DenseVector, SparseVector)):
            return item.toArray()
        else:
            return np.array(item)

    def to_numpy(item):
        # Some versions of TensorFlow will return an EagerTensor
        return item.numpy() if hasattr(item, 'numpy') else item

    # Perform predictions.
    for row in rows:
        fields = row.asDict().copy()
        preds = model.predict_on_batch(
            [to_array(row[feature_cols[i]]).reshape(input_shapes[i])
             for i in range(len(feature_cols))])
        preds = [to_numpy(item) for item in preds]

        for label_col, output_col, pred in zip(label_cols, output_cols, preds):
            meta = metadata[label_col]
            col_type = meta['spark_data_type']
            # dtype for DenseVector and SparseVector is always np.float64
            if col_type == DenseVector:
                shape = np.prod(pred.shape)
                flattened_pred = pred.reshape(shape)
                field = DenseVector(flattened_pred)
            elif col_type == SparseVector:
                shape = meta['shape']
                flattened_pred = pred.reshape(shape)
                nonzero_indices = flattened_pred.nonzero()[0]
                field = SparseVector(shape, nonzero_indices,
                                     flattened_pred[nonzero_indices])
            else:
                # The column is a scalar type: int, float, etc.
                value = pred[0]
                python_type = util.spark_scalar_to_python_type(col_type)
                if issubclass(python_type, numbers.Integral):
                    value = round(value)
                field = python_type(value)

            fields[output_col] = field

        yield Row(**fields)
def initSpark(sparkApp=None,
              sparkHome=os.environ.get(_SPARK_HOME_ENV_VAR_NAME,
                                       _SPARK_HOME_ON_ARIMO_LINUX_CLUSTER),
              sparkConf={},
              sparkRepos=(),
              sparkPkgs=(),
              javaHome=None,
              hadoopConfDir=None,
              yarnConfDir=None,
              yarnUpdateJARs=False,
              dataIO={'avro', 'pg', 'redshift', 'sftp'},
              executor_aws_ec2_instance_type='c5n.9xlarge'):
    """
    Launch a new ``SparkSession`` or connect to an existing one, and bind it
    to ``arimo.data_backend.spark``

    Args:
        sparkApp (str): name to give to the ``SparkSession`` to be launched
        sparkHome (str): path to the Spark installation, if not already set
            in the ``SPARK_HOME`` environment variable
        sparkConf (dict): configs overriding the default Spark configs
        sparkRepos (tuple/list of str): additional Maven repositories
        sparkPkgs (tuple/list of str): Maven and/or Spark packages with which
            to launch Spark
        javaHome (str): path to the Java Development Kit (JDK), if not
            already set in the ``JAVA_HOME`` environment variable
        hadoopConfDir (str): path to the Hadoop configuration directory;
            *ignored* if not running on a YARN cluster or if Hadoop is
            installed at ``/opt/hadoop``
        yarnConfDir (str): path to the YARN configuration directory
        yarnUpdateJARs (bool): whether to re-upload the Spark JARs to YARN
        dataIO (set): additional data I/O support options
        executor_aws_ec2_instance_type (str): AWS EC2 instance type of the
            executor worker nodes
    """
    assert pyspark.__version__ >= _MIN_SPARK_VER, \
        f'*** Spark >= {_MIN_SPARK_VER} required, but {pyspark.__version__} installed ***'

    # initialize logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    logger.addHandler(STDOUT_HANDLER)

    # driver Python executable path
    os.environ['PYSPARK_DRIVER_PYTHON'] = 'python3'

    # worker Python executable path
    os.environ['PYSPARK_PYTHON'] = '/opt/miniconda3/bin/python3'

    # set relevant environment variables for Java, Spark, Hadoop & YARN
    if javaHome:
        os.environ[_JAVA_HOME_ENV_VAR_NAME] = javaHome
    elif _JAVA_HOME:
        os.environ[_JAVA_HOME_ENV_VAR_NAME] = _JAVA_HOME

    if sparkHome:
        os.environ[_SPARK_HOME_ENV_VAR_NAME] = sparkHome

    if _ON_LINUX_CLUSTER_WITH_HDFS:
        os.environ[_HADOOP_CONF_DIR_ENV_VAR_NAME] = \
            hadoopConfDir \
            if hadoopConfDir \
            else os.environ.get(
                _HADOOP_CONF_DIR_ENV_VAR_NAME,
                os.path.join(_HADOOP_HOME, 'conf')
                if _HADOOP_HOME
                else None)

    if yarnConfDir:
        os.environ[_YARN_CONF_DIR_ENV_VAR_NAME] = yarnConfDir

    os.environ['PYSPARK_SUBMIT_ARGS'] = \
        '--py-files {} --repositories {} --packages {} pyspark-shell'.format(
            # ','.join(
            #     os.path.join(_SPARK_JARS_DIR_PATH, jar_file_name)
            #     for jar_file_name in os.listdir(_SPARK_JARS_DIR_PATH)
            #     if jar_file_name.endswith('.jar')),
            ','.join(_SPARK_ARIMO_PACKAGE_PY_FILE_PATHS),
            ','.join(_SPARK_REPOS.union(sparkRepos)),
            ','.join(
                _SPARK_PKGS.union(
                    sparkPkgs,
                    *(_DATA_IO_SPARK_PKGS[dataIOOption.lower()]
                      for dataIOOption in dataIO))))

    # set / create SparkSession
    global spark

    if spark:
        assert spark._instantiatedSession is None
        assert spark.sparkContext._active_spark_context is None
        assert spark.sparkContext._jsc is None

    # build Spark configs
    conf = \
        pyspark.SparkConf() \
        .setAppName(sparkApp if sparkApp else os.getcwd())

    _sparkConf = _SPARK_CONF.copy()
    _sparkConf.update(sparkConf)

    # optimally allocate YARN containers
    executor_aws_ec2_instance_type_info = \
        INSTANCE_TYPES_INFO.loc[executor_aws_ec2_instance_type]

    optim_alloc_details = \
        optim_alloc(
            node_mem_gib=executor_aws_ec2_instance_type_info[MEMORY_GiB_KEY])

    n_executors_per_node = optim_alloc_details['n_executors']

    _sparkConf['spark.executor.memory'] = \
        mem_gib_per_executor = \
        f"{optim_alloc_details['executor_mem_gib']}g"

    _sparkConf['spark.executor.cores'] = \
        n_cpus_per_executor = \
        int(1.68   # over-allocate CPUs to maximize CPU usage
            * executor_aws_ec2_instance_type_info[N_CPUS_KEY]
            / n_executors_per_node)

    logger.info(
        msg='Allocating {:,}x {} {:,}-CPU Executors per {} ({}-GiB {:,}-CPU) '
            'YARN Worker Node (Leaving {:.1f} GiB for Driver)...'
            .format(n_executors_per_node,
                    mem_gib_per_executor,
                    n_cpus_per_executor,
                    executor_aws_ec2_instance_type,
                    executor_aws_ec2_instance_type_info[MEMORY_GiB_KEY],
                    executor_aws_ec2_instance_type_info[N_CPUS_KEY],
                    optim_alloc_details['avail_for_driver_mem_gib']))

    if _ON_LINUX_CLUSTER_WITH_HDFS:
        if exist(path=_YARN_JARS_DIR_NAME, hdfs=True, dir=True):
            if not yarnUpdateJARs:
                # *** TODO: FIX ***
                # _sparkConf['spark.yarn.jars'] = _YARN_JARS_DIR_NAME
                pass
    else:
        yarnUpdateJARs = False

    for k, v in _sparkConf.items():
        conf.set(k, v)

    # remove any existing derby.log & metastore_db to avoid Hive start-up errors
    rm(path='derby.log', hdfs=False, is_dir=False)
    rm(path='metastore_db', hdfs=False, is_dir=True)

    # clean up existing Spark checkpoints
    rmSparkCkPts()

    # get / create SparkSession
    spark = pyspark.sql.SparkSession.builder \
        .config(conf=conf) \
        .enableHiveSupport() \
        .getOrCreate()

    logger.info(msg='SparkSession = {}'.format(spark))

    # BELOW DOESN'T WORK FOR dev/preview VERSIONS
    # assert spark.version == pyspark.__version__, \
    #     '*** PySpark v{} does not match underlying Spark v{} ***'.format(
    #         pyspark.__version__, spark.version)

    spark.sparkContext.setLogLevel('WARN')
    # ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE or WARN

    spark.sparkContext.setCheckpointDir(dirName=_SPARK_CKPT_DIR)

    # set Hadoop conf in the Spark context
    if os.environ.get('AWS_ACCESS_KEY_ID'):
        spark.sparkContext._jsc.hadoopConfiguration().set(
            'fs.s3a.awsAccessKeyId', os.environ.get('AWS_ACCESS_KEY_ID'))
        spark.sparkContext._jsc.hadoopConfiguration().set(
            'fs.s3a.awsSecretAccessKey', os.environ.get('AWS_SECRET_ACCESS_KEY'))

    # register User-Defined Functions (UDFs)
    from pyspark.ml.linalg import DenseVector, VectorUDT
    from pyspark.sql.types import ArrayType, DoubleType

    spark.udf.register(name='_ARRAY_TO_VECTOR',
                       f=lambda a: DenseVector(a),
                       returnType=VectorUDT())

    spark.udf.register(name='_VECTOR_TO_ARRAY',
                       f=lambda v: v.array.tolist(),
                       returnType=ArrayType(DoubleType()))

    if yarnUpdateJARs:
        msg = 'Putting JARs from {} to {}...'.format(
            _SPARK_JARS_DIR_PATH_ON_ARIMO_LINUX_CLUSTER, _YARN_JARS_DIR_PATH)
        logger.info(msg)
        updateYARNJARs()
        logger.info(msg + ' done!')
def convert_df(spark, data):
    """Transform a DataFrame into the (id, features) format used by Spark ML."""
    input_data = data.rdd.map(lambda x: (x[0], DenseVector(x[1:])))
    df = spark.createDataFrame(input_data, ["id", "features"])
    return df
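# Minimal usage sketch for convert_df (column names hypothetical): the first
# column becomes the id, and the remaining numeric columns are packed into a
# single DenseVector column.
raw_df = spark.createDataFrame([(1, 0.5, 1.2), (2, 0.1, 0.4)],
                               ["id", "f1", "f2"])
ml_df = convert_df(spark, raw_df)  # schema: id, features (vector of f1, f2)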
def sparse_to_array(v):
    # Densify v, then cast each component to int.
    v = DenseVector(v)
    new_array = [int(x) for x in v]
    return new_array
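# Quick check (hypothetical input): a 4-dim sparse vector with ones at
# indices 1 and 3 densifies to [0, 1, 0, 1].
from pyspark.ml.linalg import SparseVector

print(sparse_to_array(SparseVector(4, {1: 1.0, 3: 1.0})))  # [0, 1, 0, 1]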
# Load the dataset file as an RDD
rdd = sc.textFile("/user/spark/airfoil.txt")
rdd = rdd.map(lambda x: x.split('\t'))
rdd = rdd.map(lambda x: [float(x[0]), float(x[1]), float(x[2]),
                         float(x[3]), float(x[4]), float(x[5])])

# Create a DataFrame for the ML model
df = spark.createDataFrame(
    rdd, ["frequency", "angle", "chord", "velocity", "suction", "pressure"])
data = df.rdd.map(lambda x: (DenseVector(x[:-1]), x[-1]))
df = spark.createDataFrame(data, ["features", "label"])

# Feature scaling
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_scaled")
scaler = standardScaler.fit(df)
scaled_df = scaler.transform(df)

# Split the data into training and test sets
train_data, test_data = scaled_df.randomSplit([.7, .3], seed=1234)
train_data = train_data.select("features_scaled", "label")
test_data = test_data.select("features_scaled", "label")
train_data = train_data.withColumnRenamed("features_scaled", "features")
test_data = test_data.withColumnRenamed("features_scaled", "features")
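# A possible next step, not part of the original snippet: fit a linear model
# on the prepared train/test split (hyperparameters hypothetical).
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol="features", labelCol="label", maxIter=10)
lr_model = lr.fit(train_data)
print(lr_model.summary.rootMeanSquaredError)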
def trans2sparse(line):
    # Despite the name, this densifies: it rebuilds a sparse vector from the
    # stored indices/values, then converts it to a DenseVector.
    indices = line["chi"]["indices"]
    values = line["chi"]["values"]
    vec = DenseVector(Vectors.sparse(2000, indices, values).toArray())
    return Row(chi=vec, window=line["window"])
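# Minimal usage sketch (field names from trans2sparse; sample values
# hypothetical): converting one row's compressed representation back into a
# 2000-dim dense vector.
line = {"chi": {"indices": [0, 7], "values": [1.0, 3.5]}, "window": 42}
dense_row = trans2sparse(line)  # Row(chi=DenseVector(...), window=42)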
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
model = pipelineModel.transform(df)

# In[47]:

model.take(1)

# In[48]:

# build the classifier
# convert the data to a DataFrame
from pyspark.ml.linalg import DenseVector

input_data = model.rdd.map(lambda x: (x["newlabel"],
                                      DenseVector(x["features"])))

# In[49]:

# create the training data as a DataFrame
df_train = sqlContext.createDataFrame(input_data, ["label", "features"])

# In[50]:

# check row 2
df_train.show(2)

# In[51]:

# Split the dataset 80/20 with randomSplit.
train_data, test_data = df_train.randomSplit([.8, .2], seed=1234)
def denseudf(wcol):
    # Replace an all-zero 300-dim sparse vector with an explicit dense zero
    # vector; pyspark.ml vectors compare by value.
    if wcol == SparseVector(300, {}):
        wcol = DenseVector([0.0] * 300)
    return wcol
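# A minimal sketch (column name hypothetical) of applying denseudf as a
# Spark UDF, so downstream stages always see a dense 300-dim vector:
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf

densify = udf(denseudf, VectorUDT())
# df = df.withColumn("w2v", densify(df["w2v"]))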
def sparse_to_array(self, v):
    # print("Converting features to a dense vector ...")
    v = DenseVector(v)
    new_array = [float(x) for x in v]
    return new_array
def kMeans(cluster):
    # combinedDataList = combineData()
    MLlist = []
    for rows in combinedDataList:
        mlData = {}
        mlData['Total Crimes'] = rows['Total Crimes']
        mlData['Depart'] = rows['Depart']
        mlData['Heat'] = rows['Heat']
        mlData['PrecipTotal'] = rows['PrecipTotal']
        mlData['Tavg'] = rows['Tavg']
        mlData['Tmax'] = rows['Tmax']
        mlData['Tmin'] = rows['Tmin']
        MLlist.append(mlData)

    # define the input data
    inputRDD = sc.parallelize(MLlist)
    featureddf = spark.read.json(inputRDD)
    # featureddf.printSchema()
    # featureddf.show(2, False)

    # Replace `df` with the new DataFrame
    input_data = featureddf.rdd.map(
        lambda x: (x['Total Crimes'],
                   DenseVector([x['Depart'], x['Heat'], x['PrecipTotal'],
                                x['Tavg'], x['Tmax'], x['Tmin'],
                                x['Total Crimes']])))
    MLdf = spark.createDataFrame(input_data, ["label", "features"])
    # MLdf.printSchema()
    # MLdf.show(2, False)

    """
    # Initialize the `standardScaler`
    standardScaler = StandardScaler(inputCol="unscaledFeatures", outputCol="features")

    # Fit the DataFrame to the scaler
    scaler = standardScaler.fit(MLdf)

    # Transform the data in `df` with the scaler
    scaled_df = scaler.transform(MLdf)
    scaled_df.printSchema()

    # Inspect the result
    scaled_df.show(2, False)
    """

    # Train a k-means model.
    # for i in range(2, 100):
    kmeans = KMeans(k=cluster)
    model = kmeans.fit(MLdf)
    centers = model.clusterCenters()

    # Evaluate the clustering by computing the Within Set Sum of Squared Errors.
    wssse = model.computeCost(MLdf)
    # print("Within Set Sum of Squared Errors = " + str(wssse))

    # Show the result.
    centers = model.clusterCenters()
    # print("Cluster Centers: ")
    centerlist = []
    i = 0
    for center in centers:
        centerData = {}
        centerData['Center ' + str(i) + ' Depart'] = center[0]
        centerData['Center ' + str(i) + ' Heat'] = center[1]
        centerData['Center ' + str(i) + ' PrecipTotal'] = center[2]
        centerData['Center ' + str(i) + ' Tavg'] = center[3]
        centerData['Center ' + str(i) + ' Tmax'] = center[4]
        centerData['Center ' + str(i) + ' Tmin'] = center[5]
        centerData['Center ' + str(i) + ' Total Crimes'] = center[6]
        centerlist.append(centerData)
        i = i + 1

    transformed = model.transform(MLdf).select("features", "prediction")
    # transformed.printSchema()
    # transformed.show(50, False)

    pandaDF = transformed.toPandas()
    Tavg = []
    precip = []
    crimes = []
    for item in pandaDF['features'].tolist():
        Tavg.append(item[3])
        crimes.append(item[6])
        precip.append(item[2])

    # NOTE: this rebinding shadows the `cluster` (k) argument of kMeans.
    cluster = pandaDF['prediction'].tolist()
    clusters = []
    for x in range(len(centerlist)):
        clusters.append({
            'name': 'Cluster' + str(x),
            'data': [[Tavg[i], crimes[i], precip[i]]
                     for i in range(len(Tavg)) if cluster[i] == x]
        })
    # print(clusters)

    return json.dumps({
        'clusters': clusters,
        'clusterCenters': centerlist,
        'WSSSE': wssse
    })
def decTreeReg():
    # combinedDataList = combineData()
    MLlist = []
    for rows in combinedDataList:
        mlData = {}
        mlData['Total Crimes'] = rows['Total Crimes']
        mlData['Depart'] = rows['Depart']
        mlData['Heat'] = rows['Heat']
        mlData['PrecipTotal'] = rows['PrecipTotal']
        mlData['Tavg'] = rows['Tavg']
        mlData['Tmax'] = rows['Tmax']
        mlData['Tmin'] = rows['Tmin']
        MLlist.append(mlData)

    # define the input data
    inputRDD = sc.parallelize(MLlist)
    featureddf = spark.read.json(inputRDD)

    # label the data
    input_data = featureddf.rdd.map(
        lambda x: (x['Total Crimes'],
                   DenseVector([x['Depart'], x['Heat'], x['PrecipTotal'],
                                x['Tavg'], x['Tmax'], x['Tmin']])))
    MLdf = spark.createDataFrame(input_data, ["label", "features"])
    # MLdf.show(10, False)

    # Automatically identify categorical features and index them.
    # We specify maxCategories so features with > 4 distinct values are
    # treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                      maxCategories=4).fit(MLdf)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = MLdf.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

    # Chain the indexer and the tree in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, dt])

    # Train the model. This also runs the indexer.
    model = pipeline.fit(MLdf)

    # Make predictions.
    predictions = model.transform(MLdf)

    # Select example rows to display.
    # predictions.select("prediction", "label", "features").show(5, False)

    # Select (prediction, true label) and compute the test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    # print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    treeModel = model.stages[1]
    # summary only
    # print(treeModel)

    pandaDF = predictions.toPandas()
    actual = pandaDF['label'].tolist()
    prediction = pandaDF['prediction'].tolist()
    print(len(prediction))

    return json.dumps({
        'actual': actual,
        'prediction': prediction,
        'treeSize': str(treeModel),
        'RMSE': rmse
    })
def binsig(z, c, tau):
    # Element-wise threshold against row c of tau; the resulting booleans
    # are cast to 0.0/1.0 by the DenseVector constructor.
    return DenseVector(z > tau[c, :])
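# Quick check (hypothetical inputs): thresholding a 3-dim vector against
# row 0 of tau.
import numpy as np

z = np.array([0.2, 0.9, 0.5])
tau = np.array([[0.5, 0.5, 0.5]])
print(binsig(z, 0, tau))  # DenseVector([0.0, 1.0, 0.0])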
    pan.to_csv('test_fet.csv', mode='w', index=False, header=True)
else:
    pan.to_csv('test_fet.csv', mode='a', index=False, header=False)
del pan
pan = None
print(time.clock() - start)

# I converted the csv to a parquet file to save space and time
def lis(x):
    return [float(i) for i in x[1:-1].split(',')]

from pyspark.ml.linalg import DenseVector

# The chained write must stay in one statement, so every line in the chain
# needs a continuation backslash.
spark.read.load("test_fet.csv", format="csv",
                inferSchema="true", header="true").rdd \
    .map(lambda x: (x[2], x[1], DenseVector(lis(x[0])))) \
    .toDF(["index", "file", "features"]) \
    .write.parquet("test_fet.parquet")

# Now I get the Bag of Visual Words representation using the K-means model
# built on the training data
from pyspark import StorageLevel

schema = spark.read.parquet("test_fet.parquet") \
    .persist(StorageLevel(True, True, False, False, 1))

import numpy as np
from pyspark.ml.clustering import KMeansModel

model = KMeansModel.load('KmeansModel')
P = np.load('P.npy')

from pyspark.ml.linalg import DenseVector

predictions = model.transform(schema)
def test_prepare_data_compress_sparse(self):
    util.clear_training_cache()

    expected_metadata = \
        {
            'float': {
                'spark_data_type': FloatType,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.NOCHANGE,
                'max_size': 1,
                'shape': 1
            },
            'dense': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
            'sparse': {
                'spark_data_type': SparseVector,
                'is_sparse_vector_only': True,
                'intermediate_format': constants.CUSTOM_SPARSE,
                'max_size': 1,
                'shape': 2
            },
            'mixed': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
        }

    with mock.patch('horovod.spark.common.util._get_metadata',
                    side_effect=util._get_metadata) as mock_get_metadata:
        with spark_session('test_prepare_data') as spark:
            data = [[
                0.0,
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                DenseVector([1.0, 1.0])
            ], [
                1.0,
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                SparseVector(2, {1: 1.0})
            ]]

            schema = StructType([
                StructField('float', FloatType()),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])

            df = create_test_data_from_schema(spark, data, schema)
            with local_store() as store:
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df,
                                       feature_columns=['dense', 'sparse', 'mixed'],
                                       label_columns=['float'],
                                       compress_sparse=True) as dataset_idx:
                    mock_get_metadata.assert_called()
                    assert dataset_idx == 0

                    train_rows, val_rows, metadata, avg_row_size = \
                        util.get_dataset_properties(dataset_idx)
                    self.assertDictEqual(metadata, expected_metadata)
from pyspark.sql.types import *
from optimus import Optimus
from pyspark.ml.linalg import VectorUDT, DenseVector, SparseVector
import numpy as np
nan = np.nan
from optimus.engines.spark.ml import encoding as fe

op = Optimus(master='local')

source_df = op.create.df(
    [('id', LongType(), True),
     ('x', LongType(), True),
     ('y', LongType(), True),
     ('features', VectorUDT(), True)],
    [(0, 1, 2, DenseVector([1.0, 0.5, -1.0])),
     (1, 2, 3, DenseVector([2.0, 1.0, 1.0])),
     (2, 3, 4, DenseVector([4.0, 10.0, 2.0]))])


class Testdf_ml_2(object):
    @staticmethod
    def test_one_hot_encoder():
        actual_df = fe.one_hot_encoder(source_df, input_cols=['id'])

        expected_df = op.create.df(
            [('id', LongType(), True),
             ('x', LongType(), True),
             ('y', LongType(), True),
             ('features', VectorUDT(), True),
             ('id***ONE_HOT_ENCODER', VectorUDT(), True)],
            [(0, 1, 2, DenseVector([1.0, 0.5, -1.0]), SparseVector(2, {0: 1.0})),
             (1, 2, 3, DenseVector([2.0, 1.0, 1.0]), SparseVector(2, {1: 1.0})),
             (2, 3, 4, DenseVector([4.0, 10.0, 2.0]), SparseVector(2, {}))])

        assert (expected_df.collect() == actual_df.collect())

    @staticmethod
num_cols = [item[0] for item in df.dtypes
            if item[1].startswith('in') or item[1].startswith('dou')]

# We will choose these features for our baseline model:
num_features, cat_features = num_cols, cat_cols

# Drop nulls
df = df.dropna()
num_features.remove("is_default")

# Transform the dataset
df_model = make_pipeline(df, num_features, cat_features)
input_data = df_model.rdd.map(lambda x: (x["is_default"],
                                         DenseVector(x["features"])))
df_pipeline = spark.createDataFrame(input_data, ["is_default", "features"])

scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=True)
scalerModel = scaler.fit(df_pipeline)
scaledData = scalerModel.transform(df_pipeline)
scaledData = scaledData.drop("features")

# column names
temp = scaledData.rdd.map(
    lambda x: [float(y) for y in x['scaledFeatures']]).toDF(num_features +
                                                            cat_features)
genre_audio = genre_audio.where(col("genre").isNotNull())  # remove null values
df = genre_audio.drop('MSD_TRACKID')
df.count()
# 413277
df.na.drop().count()  # check for missing values
# 413277

from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

label_stringIdx = StringIndexer(inputCol='new_label', outputCol='label')
pipeline = Pipeline(stages=[label_stringIdx])
pipelineFit = pipeline.fit(binary_df)
data = pipelineFit.transform(binary_df)
data = data.drop('new_label')

# Standardization
from pyspark.ml.linalg import DenseVector  # import DenseVector

# Define the input_data
input_data = df.rdd.map(lambda x: (x[20], DenseVector(x[:19])))
df = spark.createDataFrame(input_data, ["label", "features"])

from pyspark.ml.feature import StandardScaler

# Initialize the standardScaler
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_scaled")
# Fit the DataFrame to the scaler
scaler = standardScaler.fit(df)
# Transform the data in `df` with the scaler
scaled_df = scaler.transform(df)
scaled_df.take(2)

# Dimensionality reduction
from pyspark.ml.feature import PCA

pca = PCA(k=2, inputCol='features_scaled', outputCol='features_pca')
model = pca.fit(scaled_df)
reduced_df = model.transform(scaled_df).select('label', 'features_pca')
def create_datasets(self, userId, pid):
    # fetch the user's features
    user_feature = json.loads(
        self.client_of_features.hget("user_features", userId))

    # fetch the user's recall set
    recall_sets = self.client_of_recall.smembers(userId)

    result = []

    # iterate over the recall set
    for adgroupId in recall_sets:
        adgroupId = int(adgroupId)

        # fetch this ad's feature values
        ad_feature = json.loads(
            self.client_of_features.hget("ad_features", adgroupId))

        features = {}
        features.update(user_feature)
        features.update(ad_feature)

        for k, v in features.items():
            if v is None:
                features[k] = -1

        features_col = [
            # feature values
            "price", "cms_group_id", "final_gender_code", "age_level",
            "shopping_level", "occupation", "pid", "pvalue_level",
            "new_user_class_level"
        ]
        '''
        "cms_group_id"        categorical feature, ~13 categories ==> 13 dims
        "final_gender_code"   categorical feature, 2 categories   ==> 2 dims
        "age_level"           categorical feature, 7 categories   ==> 7 dims
        "shopping_level"      categorical feature, 3 categories   ==> 3 dims
        "occupation"          categorical feature, 2 categories   ==> 2 dims
        '''

        price = float(features["price"])

        # one-hot buffers, one slot per category
        pid_value = [0 for i in range(2)]
        cms_group_id_value = [0 for i in range(13)]
        final_gender_code_value = [0 for i in range(2)]
        age_level_value = [0 for i in range(7)]
        shopping_level_value = [0 for i in range(3)]
        occupation_value = [0 for i in range(2)]
        pvalue_level_value = [0 for i in range(4)]
        new_user_class_level_value = [0 for i in range(5)]

        pid_value[self.pid_rela[pid]] = 1
        cms_group_id_value[self.cms_group_id_rela[int(features["cms_group_id"])]] = 1
        final_gender_code_value[self.final_gender_code_rela[int(features["final_gender_code"])]] = 1
        age_level_value[self.age_level_rela[int(features["age_level"])]] = 1
        shopping_level_value[self.shopping_level_rela[int(features["shopping_level"])]] = 1
        occupation_value[self.occupation_rela[int(features["occupation"])]] = 1
        pvalue_level_value[self.pvalue_level_rela[int(features["pvalue_level"])]] = 1
        new_user_class_level_value[self.new_user_class_level_rela[int(features["new_user_class_level"])]] = 1

        # print(pid_value)
        # print(cms_group_id_value)
        # print(final_gender_code_value)
        # print(age_level_value)
        # print(shopping_level_value)
        # print(occupation_value)
        # print(pvalue_level_value)
        # print(new_user_class_level_value)

        vector = DenseVector([price] + pid_value + cms_group_id_value
                             + final_gender_code_value + age_level_value
                             + shopping_level_value + occupation_value
                             + pvalue_level_value + new_user_class_level_value)
        result.append((userId, adgroupId, vector))

    return result
#########################################################
# from cluswisard_estimator import CluswisardEstimator
# clus = CluswisardEstimator(2, 48, 4, 10, 0.1)
# clus.treinar(t, "input", "label")
# classificacoes = clus._fit(t, "input")
# classificacoes.show()

# Import `DenseVector`
from pyspark.ml.linalg import DenseVector

# Define the `input_data`
input_data = treino.rdd.map(lambda x: (x[0], DenseVector(x[1:])))

# Replace `df` with the new DataFrame
df = spark.createDataFrame(input_data, ["label", "features"])

# Import `StandardScaler`
from pyspark.ml.feature import StandardScaler

# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_scaled")

# Fit the DataFrame to the scaler
scaler = standardScaler.fit(df)

# Transform the data in `df` with the scaler
scaled_df_treino = scaler.transform(df)
     genderIndex=0.0, genderclassVec=SparseVector(1, {0: 1.0}),
     native-countryIndex=0.0, native-countryclassVec=SparseVector(40, {0: 1.0}),
     newlabel=0.0,
     features=SparseVector(99, {0: 1.0, 13: 1.0, 24: 1.0, 35: 1.0, 45: 1.0,
                                49: 1.0, 52: 1.0, 53: 1.0, 93: 25.0,
                                94: 226802.0, 96: 7.0, 98: 40.0}))]
'''

# step 4) build the classifier: logistic regression
# to make the computation faster, convert the model to a DataFrame:
# select newlabel and features from model using map
from pyspark.ml.linalg import DenseVector

input_data = model.rdd.map(lambda x: (x['newlabel'],
                                      DenseVector(x['features'])))
df_train = sqlcontext.createDataFrame(input_data, ['label', 'features'])
df_train.show(2)

train_data, test_data = df_train.randomSplit([.8, .2], seed=1234)
train_data.groupby('label').agg({'label': 'count'}).show()
test_data.groupby('label').agg({'label': 'count'}).show()

# build the logreg
from pyspark.ml.classification import LogisticRegression

# initialize logreg
lr = LogisticRegression(labelCol='label',
                        featuresCol='features',
                        maxIter=10,
def features_vector(arr):
    return DenseVector(np.array(arr))
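# Quick check (hypothetical input): a plain Python list round-trips through
# np.array into a DenseVector.
print(features_vector([1.0, 2.0, 3.0]))  # DenseVector([1.0, 2.0, 3.0])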
def fit(self, output_column, input_columns=None):
    if output_column not in self._dataframe_helper.get_numeric_columns():
        raise BIException('Output column: %s is not a measure column' % (output_column,))

    if input_columns is None:
        input_columns = list(set(self._dataframe_helper.get_numeric_columns()) - {output_column})

    if len(set(input_columns) - set(self._dataframe_helper.get_numeric_columns())) != 0:
        raise BIException('At least one of the input columns %r is not a measure column' % (input_columns,))

    # TODO: ensure no duplicates are present in input_columns
    regression_result = RegressionResult(output_column, input_columns)

    training_df = self._data_frame.rdd.map(
        lambda row: (float(row[output_column]),
                     DenseVector([float(row[col]) for col in input_columns]))).toDF()

    lr = LR(maxIter=LinearRegression.MAX_ITERATIONS,
            regParam=LinearRegression.REGULARIZATION_PARAM,
            elasticNetParam=1.0,
            labelCol=LinearRegression.LABEL_COLUMN_NAME,
            featuresCol=LinearRegression.FEATURES_COLUMN_NAME)
    lr_model = lr.fit(training_df)
    lr_summary = lr_model.evaluate(training_df)

    # regression_result.set_params(intercept=lr_model.intercept, coefficients=lr_model.coefficients,
    #                              rmse=lr_summary.rootMeanSquaredError, r2=lr_summary.r2,
    #                              t_values=lr_summary.tValues, p_values=lr_summary.pValues)
    # TODO: pass t_values and p_values

    coefficients = [float(i) for i in lr_model.coefficients.values]
    if not any([coeff != 0 for coeff in coefficients]):
        return None

    sample_data_dict = {}
    lr_dimension = {}
    for c in input_columns:
        sample_data_dict[c] = None
        lr_dimension[c] = {'dimension': '', 'levels': [], 'coefficients': [],
                           'dimension2': '', 'levels2': [], 'coefficients2': []}

    diff = 0
    diff2 = 0
    for dim in self._string_columns:
        # sample_data_dict[col] = self._dataframe_helper.get_sample_data(col, output_column, self._sample_size)
        # NOTE: `c` below still holds the last entry of `input_columns`
        # from the loop above.
        temp = []
        if len(self._levels[dim]) > 0 and len(self._levels[dim]) < 16:
            for level in self._levels[dim]:
                sub_df = self._data_frame.select(*[c, output_column]).filter(col(dim) == level)
                train = sub_df.rdd.map(
                    lambda row: (float(row[output_column]),
                                 DenseVector([float(row[c])]))).toDF()
                sub_lr_model = lr.fit(train)
                temp = temp + [float(i) for i in sub_lr_model.coefficients.values]

            if max(temp) - min(temp) > diff:
                diff = max(temp) - min(temp)
                diff2 = diff
                lr_dimension[c]['dimension2'] = lr_dimension[c]['dimension']
                lr_dimension[c]['levels2'] = lr_dimension[c]['levels']
                lr_dimension[c]['coefficients2'] = lr_dimension[c]['coefficients']
                lr_dimension[c]['dimension'] = dim
                X = self._levels[dim]
                Y = temp
                Z = [abs(y) for y in Y]
                lr_dimension[c]['levels'] = [x for (z, y, x) in sorted(zip(Z, Y, X))]
                lr_dimension[c]['coefficients'] = [y for (z, y, x) in sorted(zip(Z, Y, X))]
            elif max(temp) - min(temp) > diff2:
                diff2 = max(temp) - min(temp)
                lr_dimension[c]['dimension2'] = dim
                X = self._levels[dim]
                Y = temp
                Z = [abs(y) for y in Y]
                lr_dimension[c]['levels2'] = [x for (z, y, x) in sorted(zip(Z, Y, X))]
                lr_dimension[c]['coefficients2'] = [y for (z, y, x) in sorted(zip(Z, Y, X))]

    regression_result.set_params(intercept=float(lr_model.intercept),
                                 coefficients=coefficients,
                                 rmse=float(lr_summary.rootMeanSquaredError),
                                 r2=float(lr_summary.r2),
                                 sample_data_dict=sample_data_dict,
                                 lr_dimension=lr_dimension)
    return regression_result
# Each line of the input file follows this format:
# "word - 0.6579854,1.161026,0.43898278,[...],2.0629232,0.063231304"
input_file = "sparkvectors2019"

# Maximum PCA dimension
K = 20

# Read the file, parse it as an RDD and transform it into a DataFrame
raw_vectors = sc.textFile(input_file)
vectors_rdd = raw_vectors.map(lambda row: (row.split(" - ")[0],
                                           row.split(" - ")[1]))
vectors_df = spark.createDataFrame(vectors_rdd, ['Label', 'Vector'])

# Count the elements to force Spark to load the file at this time
vectors_df.count()

# Parse the vectors as DenseVectors
vectorize_lines = udf(
    lambda row: DenseVector([float(number) for number in row.split(",")]),
    VectorUDT())
vectorized_df = vectors_df.withColumn("DenseVector",
                                      vectorize_lines(vectors_df['Vector']))

pca = PCA(k=K, inputCol="DenseVector", outputCol='pcaFeatures')
model = pca.fit(vectorized_df)
# model.explainedVariance
result = model.transform(vectorized_df)

for k in range(1, K):
    result.rdd.map(lambda row: row[0] + ",\"" +
                   ",".join(str(a) for a in row[3][:k]) +
                   "\"").saveAsTextFile("pca" + str(k))
def lr_sort_service(reco_set, temp, hbu):
    """
    Rank the recalled articles and return the recommendations
    :param reco_set: merged and filtered recall results
    :param temp: parameters
    :param hbu: HBase utility
    :return:
    """
    # ranking
    # 1. read the user's features from the user feature center
    try:
        user_feature = eval(hbu.get_table_row(
            'ctr_feature_user',
            '{}'.format(temp.user_id).encode(),
            'channel:{}'.format(temp.channel_id).encode()))
        logger.info("{} INFO get user user_id:{} channel:{} profile data".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            temp.user_id, temp.channel_id))
    except Exception as e:
        user_feature = []

    if user_feature:
        # 2. read each article's features from the article feature center
        result = []

        for article_id in reco_set:
            try:
                article_feature = eval(hbu.get_table_row(
                    'ctr_feature_article',
                    '{}'.format(article_id).encode(),
                    'article:{}'.format(article_id).encode()))
            except Exception as e:
                article_feature = [0.0] * 111

            f = []
            # 1) channel_id
            f.extend([article_feature[0]])
            # 2) article_vector
            f.extend(article_feature[11:])
            # 3) user weight features
            f.extend(user_feature)
            # 4) article weight features
            f.extend(article_feature[1:11])
            vector = DenseVector(f)
            result.append([temp.user_id, article_id, vector])

        # 4. predict, then rank and filter
        df = pd.DataFrame(result, columns=["user_id", "article_id", "features"])
        test = SORT_SPARK.createDataFrame(df)

        # load the logistic regression model
        model = LogisticRegressionModel.load(
            "hdfs://hadoop-master:9000/headlines/models/LR.obj")
        predict = model.transform(test)

        def vector_to_double(row):
            return float(row.article_id), float(row.probability[1])

        res = predict.select(['article_id', 'probability']).rdd \
            .map(vector_to_double) \
            .toDF(['article_id', 'probability']) \
            .sort('probability', ascending=False)

        article_list = [i.article_id for i in res.collect()]
        logger.info("{} INFO sorting user_id:{} recommend article".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), temp.user_id))

        # after ranking, return only the top-ranked article IDs
        # (capped at 200 here) to the user
        if len(article_list) > 200:
            article_list = article_list[:200]
        reco_set = list(map(int, article_list))

    return reco_set
X, y = digits.data, digits.target

# todo: note difficulties of spark: cannot set weights for evaluation,
#       so using sklearn slows that down
# todo: make the lists wayyyy bigger to show the computation time is faster on spark
# todo: play with partition size/parameters for each dataset
# todo: sanity checks (same weights produced with no DP noise if non-random train/test split)
# todo: update python code to include intercept

X = X.tolist()
y = y.tolist()
data = zip(y, X)
formatted = [(int(y_i), DenseVector(x_i)) for y_i, x_i in data]

fields = [StructField('label', IntegerType(), True),
          StructField('features', VectorUDT(), True)]
schema = StructType(fields)
data = spark.createDataFrame(formatted, schema)

# train_data, test_data = data.randomSplit([1/2, 1/2])
train_data = data

lr = LogisticRegression(maxIter=10)
lrModel = lr.fit(train_data)
trainingSummary = lrModel.summary
print(trainingSummary.accuracy)

######
result = lrModel.transform(train_data)  # test_data
           ' weekday_is_thursday', ' weekday_is_friday', ' weekday_is_saturday',
           ' weekday_is_sunday', ' is_weekend', ' LDA_00', ' LDA_01', ' LDA_02',
           ' LDA_03', ' LDA_04', ' global_subjectivity',
           ' global_sentiment_polarity', ' global_rate_positive_words',
           ' global_rate_negative_words', ' rate_positive_words',
           ' rate_negative_words', ' avg_positive_polarity',
           ' min_positive_polarity', ' max_positive_polarity',
           ' avg_negative_polarity', ' min_negative_polarity',
           ' max_negative_polarity', ' title_subjectivity',
           ' title_sentiment_polarity', ' abs_title_subjectivity',
           ' abs_title_sentiment_polarity', ' shares']

# Convert the `df` columns to `FloatType()`
data = convertColumn(data, columns, FloatType())

data = data.select(' shares', ' n_non_stop_words')

# Standardization
# Define the `input_data`
input_data = data.rdd.map(lambda x: (x[0], DenseVector(x[1:])))

# Replace `data` with the new DataFrame
data = spark.createDataFrame(input_data, ["label", "features"])

# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_scaled")

# Fit the DataFrame to the scaler
scaler = standardScaler.fit(data)

# Transform the data in `df` with the scaler
scaled_data = scaler.transform(data)

# Building a machine learning model
# split the data into train and test sets, as sketched below
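# A minimal sketch of the split announced above (ratio and seed hypothetical):
train_data, test_data = scaled_data.randomSplit([.8, .2], seed=1234)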
def CosineSimi(v1, v2):
    d1 = DenseVector(v1)
    d2 = DenseVector(v2)
    n1 = d1.norm(2)
    n2 = d2.norm(2)
    return float(d1.dot(d2) / (n1 * n2))
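# Quick check (hypothetical inputs): parallel vectors score 1.0 and
# orthogonal vectors score 0.0.
print(CosineSimi([1.0, 2.0], [2.0, 4.0]))  # 1.0
print(CosineSimi([1.0, 0.0], [0.0, 1.0]))  # 0.0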
df_indexed = pipeline.fit(df).transform(df)

## 3. Convert to label/features format
catVarsIndexed = [i + '_indexed' for i in catVars]
featuresCol = numVars + catVarsIndexed
featuresCol.remove('Survived')
labelCol = ['Mark', 'Survived']

from pyspark.sql import Row
from pyspark.ml.linalg import DenseVector

row = Row('mark', 'label', 'features')
df_indexed = df_indexed[labelCol + featuresCol]

# 0-mark, 1-label, 2-features
lf = df_indexed.rdd.map(lambda r: (row(r[0], r[1], DenseVector(r[2:])))).toDF()

# index the label
lf = StringIndexer(inputCol='label', outputCol='index').fit(lf).transform(lf)

# split back into train/test data
train = lf.where(lf.mark == 'train')
test = lf.where(lf.mark == 'test')

# split further at random to get train/validate sets
train, validate = train.randomSplit([0.7, 0.3], seed=121)

print('Train Data Number of Rows: ' + str(train.count()))
print('Validate Data Number of Rows: ' + str(validate.count()))
print('Test Data Number of Rows: ' + str(test.count()))

# Apply Logistic Regression
    # Standard scaler
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaledFeatures",
                            withStd=True,
                            withMean=True)
    stages += [scaler]

    # Create and run the pipeline
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(spark_df)
    out_df = pipelineModel.transform(spark_df)
    return out_df


df_model = make_pipeline_numeric(df)

input_data = df_model.rdd.map(lambda x: (x["is_default"],
                                         DenseVector(x["scaledFeatures"])))
df_pre_smote = spark.createDataFrame(input_data,
                                     ["is_default", "scaledFeatures"])

args = sys.argv
k = int(args[1])

df_smote = SmoteSampling(vectorizerFunction(df_pre_smote, 'is_default'),
                         k=k, minorityClass=1, majorityClass=0,
                         percentageOver=400, percentageUnder=100)

df_smote_table = df_smote.rdd.map(
    lambda x: [float(y) for y in x['features']]).toDF()

table_name = 'default.LC_Smote_k' + str(k)
df_smote_table \
    .write.format("parquet") \