Example #1
    def test_norms(self):
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        self.assertEqual(a.norm(1), 6.0)
        self.assertEqual(a.norm(inf), 3.0)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7.0)
        self.assertEqual(a.norm(inf), 4.0)

        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)
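
For reference, a quick standalone sketch (not part of the test suite) showing what those norm calls evaluate to for the dense vector above:

from pyspark.ml.linalg import DenseVector

a = DenseVector([0, 2, 3, -1])
print(a.norm(2))             # sqrt(0 + 4 + 9 + 1) ~= 3.742  (L2 norm)
print(a.norm(1))             # |0| + |2| + |3| + |-1| = 6.0   (L1 norm)
print(a.norm(float("inf")))  # max absolute component = 3.0   (L-infinity norm)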
Example #2
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
     arr = pyarray.array('d', [0, 1, 2, 3])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
     self.assertEqual(7.0, sv.dot(arr))
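
As a quick sanity check, the expected values in that test can be reproduced directly (a standalone sketch, independent of the test class):

from pyspark.ml.linalg import SparseVector, DenseVector

sv = SparseVector(4, {1: 1, 3: 2})
dv = DenseVector([1., 2., 3., 4.])
print(sv.dot(dv))                # 1*2 + 2*4 = 10.0
print(dv.dot(dv))                # 1 + 4 + 9 + 16 = 30.0
print(sv.dot([0., 1., 2., 3.]))  # 1*1 + 2*3 = 7.0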
Example #3
        def predict(rows):
            import tensorflow as tf
            from pyspark.sql import Row
            from pyspark.ml.linalg import DenseVector, SparseVector

            k = keras_utils.keras()
            k.backend.set_floatx(floatx)

            # Do not use GPUs for prediction, use single CPU core per task.
            pin_cpu(tf, k)

            def load_model_fn(x):
                with k.utils.custom_object_scope(custom_objects):
                    return k.models.load_model(x)

            model = keras_utils.deserialize_model(serialized_model,
                                                  load_model_fn=load_model_fn)

            input_shapes = [[dim if dim else -1 for dim in input.shape.as_list()]
                            for input in model.inputs]

            def to_array(item):
                if isinstance(item, (DenseVector, SparseVector)):
                    return item.toArray()
                else:
                    return np.array(item)

            def to_numpy(item):
                # Some versions of TensorFlow will return an EagerTensor
                return item.numpy() if hasattr(item, 'numpy') else item

            # Perform predictions.
            for row in rows:
                fields = row.asDict().copy()
                preds = model.predict_on_batch(
                    [to_array(row[feature_cols[i]]).reshape(input_shapes[i])
                     for i in range(len(feature_cols))])
                preds = [to_numpy(item) for item in preds]

                for label_col, output_col, pred in zip(label_cols, output_cols, preds):
                    meta = metadata[label_col]
                    col_type = meta['spark_data_type']
                    # dtype for DenseVector and SparseVector is always np.float64
                    if col_type == DenseVector:
                        shape = np.prod(pred.shape)
                        flattened_pred = pred.reshape(shape, )
                        field = DenseVector(flattened_pred)
                    elif col_type == SparseVector:
                        shape = meta['shape']
                        flattened_pred = pred.reshape(shape, )
                        nonzero_indices = flattened_pred.nonzero()[0]
                        field = SparseVector(shape, nonzero_indices,
                                             flattened_pred[nonzero_indices])
                    else:
                        # If the column is scalar type, int, float, etc.
                        value = pred[0]
                        python_type = util.spark_scalar_to_python_type(col_type)
                        if issubclass(python_type, numbers.Integral):
                            value = round(value)
                        field = python_type(value)

                    fields[output_col] = field

                yield Row(**fields)
Example #4
def initSpark(sparkApp=None,
              sparkHome=os.environ.get(_SPARK_HOME_ENV_VAR_NAME,
                                       _SPARK_HOME_ON_ARIMO_LINUX_CLUSTER),
              sparkConf={},
              sparkRepos=(),
              sparkPkgs=(),
              javaHome=None,
              hadoopConfDir=None,
              yarnConfDir=None,
              yarnUpdateJARs=False,
              dataIO={'avro', 'pg', 'redshift', 'sftp'},
              executor_aws_ec2_instance_type='c5n.9xlarge'):
    """
    Launch new ``SparkSession`` or connect to existing one, and binding it to ``arimo.data_backend.spark``

    Args:
        sparkApp (str): name to give to the ``SparkSession`` to be launched

        sparkHome (str): path to Spark installation, if not already set in ``SPARK_HOME`` environment variable

        sparkConf (tuple/list): tuple/list of configs to over-ride default Spark configs

        sparkPkgs (tuple/list of str): tuple/list of Maven and/or Spark packages with which to launch Spark

        javaHome (str): path to Java Development Kit (JDK), if not already set in ``JAVA_HOME`` environment variable

        hadoopConfDir (str): path to Hadoop configuration directory;
            *ignored* if not running on a YARN cluster or if Hadoop is installed at ``/opt/hadoop``

        ckptDir (str): path to default Spark checkpoint directory

        dataIO (set): additional data IO support options
    """
    assert (pyspark.__version__ >= _MIN_SPARK_VER), \
        f'*** Spark >= {_MIN_SPARK_VER} required, but {pyspark.__version__} installed ***'

    # initialize logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    logger.addHandler(STDOUT_HANDLER)

    # driver Python executable path
    os.environ['PYSPARK_DRIVER_PYTHON'] = 'python3'
    # worker Python executable path
    os.environ['PYSPARK_PYTHON'] = '/opt/miniconda3/bin/python3'

    # set relevant environment variables for Java, Spark, Hadoop & YARN
    if javaHome:
        os.environ[_JAVA_HOME_ENV_VAR_NAME] = javaHome

    elif _JAVA_HOME:
        os.environ[_JAVA_HOME_ENV_VAR_NAME] = _JAVA_HOME

    if sparkHome:
        os.environ[_SPARK_HOME_ENV_VAR_NAME] = sparkHome

    if _ON_LINUX_CLUSTER_WITH_HDFS:
        os.environ[_HADOOP_CONF_DIR_ENV_VAR_NAME] = \
            hadoopConfDir \
            if hadoopConfDir \
            else os.environ.get(
                _HADOOP_CONF_DIR_ENV_VAR_NAME,
                os.path.join(_HADOOP_HOME, 'conf')
                    if _HADOOP_HOME
                    else None)

        if yarnConfDir:
            os.environ[_YARN_CONF_DIR_ENV_VAR_NAME] = yarnConfDir

    os.environ['PYSPARK_SUBMIT_ARGS'] = \
        '--py-files {} --repositories {} --packages {} pyspark-shell'.format(
            # ','.join(
            #     os.path.join(_SPARK_JARS_DIR_PATH, jar_file_name)
            #     for jar_file_name in os.listdir(_SPARK_JARS_DIR_PATH)
            #     if jar_file_name.endswith('.jar')),

            ','.join(_SPARK_ARIMO_PACKAGE_PY_FILE_PATHS),

            ','.join(_SPARK_REPOS.union(sparkRepos)),

            ','.join(
                _SPARK_PKGS.union(
                    sparkPkgs,
                    *(_DATA_IO_SPARK_PKGS[dataIOOption.lower()]
                      for dataIOOption in dataIO))))

    # set / create SparkSession
    global spark

    if spark:
        assert spark._instantiatedSession is None
        assert spark.sparkContext._active_spark_context is None
        assert spark.sparkContext._jsc is None

    # build Spark Configs
    conf = \
        pyspark.SparkConf() \
        .setAppName(
            sparkApp
            if sparkApp
            else os.getcwd())

    _sparkConf = _SPARK_CONF.copy()

    _sparkConf.update(sparkConf)

    # optimally allocating YARN containers
    executor_aws_ec2_instance_type_info = \
        INSTANCE_TYPES_INFO.loc[executor_aws_ec2_instance_type]

    optim_alloc_details = \
        optim_alloc(
            node_mem_gib=executor_aws_ec2_instance_type_info[MEMORY_GiB_KEY])

    n_executors_per_node = optim_alloc_details['n_executors']

    _sparkConf['spark.executor.memory'] = \
        mem_gib_per_executor = \
        f"{optim_alloc_details['executor_mem_gib']}g"

    _sparkConf['spark.executor.cores'] = \
        n_cpus_per_executor = \
        int(1.68 *   # over-allocating CPUs to maximize CPU usage
            executor_aws_ec2_instance_type_info[N_CPUS_KEY] / n_executors_per_node)

    logger.info(
        msg=
        'Allocating {:,}x {} {:,}-CPU Executors per {} ({}-GiB {:,}-CPU) YARN Worker Node (Leaving {:.1f} GiB for Driver)...'
        .format(n_executors_per_node, mem_gib_per_executor,
                n_cpus_per_executor, executor_aws_ec2_instance_type,
                executor_aws_ec2_instance_type_info[MEMORY_GiB_KEY],
                executor_aws_ec2_instance_type_info[N_CPUS_KEY],
                optim_alloc_details['avail_for_driver_mem_gib']))

    if _ON_LINUX_CLUSTER_WITH_HDFS:
        if exist(path=_YARN_JARS_DIR_NAME, hdfs=True, dir=True):
            if not yarnUpdateJARs:
                # *** TODO: FIX ***
                # _sparkConf['spark.yarn.jars'] = _YARN_JARS_DIR_NAME
                pass

    else:
        yarnUpdateJARs = False

    for k, v in _sparkConf.items():
        conf.set(k, v)

    # remove any existing derby.log & metastore_db to avoid Hive start-up errors
    rm(path='derby.log', hdfs=False, is_dir=False)

    rm(path='metastore_db', hdfs=False, is_dir=True)

    # clean up existing Spark checkpoints
    rmSparkCkPts()

    # get / create SparkSession
    spark = pyspark.sql.SparkSession.builder \
        .config(conf=conf) \
        .enableHiveSupport() \
        .getOrCreate()

    logger.info(msg='SparkSession = {}'.format(spark))

    # BELOW DOESN'T WORK FOR dev/preview VERSIONS
    # assert spark.version == pyspark.__version__, \
    #     '*** PySpark v{} does not match underlying Spark v{} ***'.format(pyspark.__version__, spark.version)

    spark.sparkContext.setLogLevel(
        'WARN')  # ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE or WARN

    spark.sparkContext.setCheckpointDir(dirName=_SPARK_CKPT_DIR)

    # set Hadoop Conf in Spark Context
    if os.environ.get('AWS_ACCESS_KEY_ID'):
        spark.sparkContext._jsc.hadoopConfiguration().set(
            "fs.s3a.awsAccessKeyId", os.environ.get('AWS_ACCESS_KEY_ID'))
        spark.sparkContext._jsc.hadoopConfiguration().set(
            "fs.s3a.awsSecretAccessKey",
            os.environ.get('AWS_SECRET_ACCESS_KEY'))

    # register User-Defined Functions (UDFs)
    from pyspark.ml.linalg import DenseVector, VectorUDT
    from pyspark.sql.types import ArrayType, DoubleType

    spark.udf.register(name='_ARRAY_TO_VECTOR',
                       f=lambda a: DenseVector(a),
                       returnType=VectorUDT())

    spark.udf.register(name='_VECTOR_TO_ARRAY',
                       f=lambda v: v.array.tolist(),
                       returnType=ArrayType(DoubleType()))

    if yarnUpdateJARs:
        msg = 'Putting JARs from {} to {}...'.format(
            _SPARK_JARS_DIR_PATH_ON_ARIMO_LINUX_CLUSTER, _YARN_JARS_DIR_PATH)
        logger.info(msg)
        updateYARNJARs()
        logger.info(msg + ' done!')
Example #5
def convert_df(spark, data):
    """Transform dataframe into the format that can be used by Spark ML."""
    input_data = data.rdd.map(lambda x: (x[0], DenseVector(x[1:])))
    df = spark.createDataFrame(input_data, ["id", "features"])
    return df
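
A minimal usage sketch (the toy DataFrame and column names are hypothetical, and an active SparkSession named spark is assumed; the function is repeated so the snippet is self-contained):

from pyspark.ml.linalg import DenseVector
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

def convert_df(spark, data):
    # First column becomes the id; the remaining columns are packed into a DenseVector.
    input_data = data.rdd.map(lambda x: (x[0], DenseVector(x[1:])))
    return spark.createDataFrame(input_data, ["id", "features"])

raw = spark.createDataFrame(
    [(1, 0.5, 1.2, -0.3), (2, 0.1, -0.4, 2.2)],
    ["id", "f1", "f2", "f3"])
convert_df(spark, raw).show()  # two columns: id, features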
Example #6
 def sparse_to_array(v):
     v = DenseVector(v)
     new_array = [int(x) for x in v]
     return new_array
Example #7
#Load dataset file as RDD
rdd = sc.textFile("/user/spark/airfoil.txt")
rdd = rdd.map(lambda x: x.split('\t'))
rdd = rdd.map(lambda x: [
    float(x[0]),
    float(x[1]),
    float(x[2]),
    float(x[3]),
    float(x[4]),
    float(x[5])
])

#Create dataframe for ML model
df = spark.createDataFrame(
    rdd, ["frequency", "angle", "chord", "velocity", "suction", "pressure"])
data = df.rdd.map(lambda x: (DenseVector(x[:-1]), x[-1]))
df = spark.createDataFrame(data, ["features", "label"])

#Feature scaling
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_scaled")
scaler = standardScaler.fit(df)
scaled_df = scaler.transform(df)

#Split data into training and test
train_data, test_data = scaled_df.randomSplit([.7, .3], seed=1234)
train_data = train_data.select("features_scaled", "label")
test_data = test_data.select("features_scaled", "label")
train_data = train_data.withColumnRenamed("features_scaled", "features")
test_data = test_data.withColumnRenamed("features_scaled", "features")
Example #8
def trans2sparse(line):
    indices = line["chi"]["indices"]
    values = line["chi"]["values"]
    vec = DenseVector(Vectors.sparse(2000, indices, values).toArray())
    return Row(chi=vec, window=line["window"])
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
model = pipelineModel.transform(df)

# In[47]:

model.take(1)

# In[48]:

# build the classifier

#convert data to a dataFrame
from pyspark.ml.linalg import DenseVector
input_data = model.rdd.map(lambda x:
                           (x["newlabel"], DenseVector(x["features"])))

# In[49]:

# create the training data as a data frame
df_train = sqlContext.createDataFrame(input_data, ["label", "features"])

# In[50]:

# check row 2
df_train.show(2)

# In[51]:

# You split the dataset 80/20 with randomSplit.
train_data, test_data = df_train.randomSplit([.8, .2], seed=1234)
 def denseudf(wcol):
     if wcol == SparseVector(300, {}):
         wcol = DenseVector([0.0] * 300)
     return wcol
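
A self-contained sketch of applying denseudf as a Spark SQL UDF so that empty 300-dimensional sparse vectors are replaced by explicit dense zero vectors (the DataFrame and the column name "w2v" are hypothetical; an active SparkSession named spark is assumed):

from pyspark.sql import SparkSession, functions as F
from pyspark.ml.linalg import DenseVector, SparseVector, VectorUDT

spark = SparkSession.builder.getOrCreate()

def denseudf(wcol):
    if wcol == SparseVector(300, {}):
        wcol = DenseVector([0.0] * 300)
    return wcol

densify = F.udf(denseudf, VectorUDT())
df = spark.createDataFrame(
    [(SparseVector(300, {}),), (SparseVector(300, {0: 1.0}),)], ["w2v"])
df.withColumn("w2v_dense", densify("w2v")).printSchema()  # w2v_dense holds the densified vectors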
Example #11
 def sparse_to_array(self, v):
     # print("Coverting featues to dense vector ...")
     v = DenseVector(v)
     new_array = list([float(x) for x in v])
     return new_array
Example #12
def kMeans(cluster):

  #combinedDataList = combineData()

  MLlist = []
  for rows in combinedDataList:
    mlData = {}
    mlData['Total Crimes'] = rows['Total Crimes']
    mlData['Depart'] = rows['Depart']
    mlData['Heat'] = rows['Heat']
    mlData['PrecipTotal'] = rows['PrecipTotal']
    mlData['Tavg'] = rows['Tavg']
    mlData['Tmax'] = rows['Tmax']
    mlData['Tmin'] = rows['Tmin']
    MLlist.append(mlData)

  #define input data
  inputRDD = sc.parallelize(MLlist)
  featureddf = spark.read.json(inputRDD)
  #featureddf.printSchema()
  #featureddf.show(2,False)
  # Replace `df` with the new DataFrame
  input_data = featureddf.rdd.map(lambda x: (x['Total Crimes'], DenseVector([x['Depart'],x['Heat'], x['PrecipTotal'], x['Tavg'], x['Tmax'], x['Tmin'], x['Total Crimes']])))
  MLdf = spark.createDataFrame(input_data, ["label", "features"])
  #MLdf.printSchema()
  #MLdf.show(2,False)
  """
  # Initialize the `standardScaler`
  standardScaler = StandardScaler(inputCol="unscaledFeatures", outputCol="features")

  # Fit the DataFrame to the scaler
  scaler = standardScaler.fit(MLdf)

  # Transform the data in `df` with the scaler
  scaled_df = scaler.transform(MLdf)
  scaled_df.printSchema()

  # Inspect the result
  scaled_df.show(2,False)
  """
  # Trains a k-means model.
  #for i in range(2,100):
  kmeans = KMeans(k=cluster)
  model = kmeans.fit(MLdf)
  centers = model.clusterCenters()

  # Evaluate clustering by computing Within Set Sum of Squared Errors.
  wssse = model.computeCost(MLdf)
  #print("Within Set Sum of Squared Errors = " + str(wssse))

  # Shows the result.
  centers = model.clusterCenters()
  #print("Cluster Centers: ")
  centerlist = []
  i = 0
  for center in centers:
    centerData = {}
    centerData['Center ' + str(i) + ' Depart'] = center[0]
    centerData['Center ' + str(i) + '  Heat'] = center[1]
    centerData['Center ' + str(i) + '  PrecipTotal'] = center[2]
    centerData['Center ' + str(i) + '  Tavg'] = center[3]
    centerData['Center ' + str(i) + '  Tmax'] = center[4]
    centerData['Center ' + str(i) + '  Tmin'] = center[5]
    centerData['Center ' + str(i) + '  Total Crimes'] = center[6]
    centerlist.append(centerData)
    i = i + 1

  transformed = model.transform(MLdf).select("features", "prediction")
  #transformed.printSchema()
  #transformed.show(50,False)
  pandaDF = transformed.toPandas()

  Tavg = []
  precip = []
  crimes = []
  for item in pandaDF['features'].tolist():
    Tavg.append(item[3])
    crimes.append(item[6])
    precip.append(item[2])
  cluster = pandaDF['prediction'].tolist()

  clusters = []
  for x in range(len(centerlist)):
    clusters.append({
      'name': 'Cluster' + str(x),
      'data': [[Tavg[i], crimes[i], precip[i]] for i in range(len(Tavg)) if cluster[i] == x]
      })

  #print(clusters)

  return json.dumps({
                      'clusters': clusters,
                      'clusterCenters': centerlist,
                      'WSSSE': wssse
                     })
Example #13
def decTreeReg():
  #combinedDataList = combineData()
  MLlist = []
  for rows in combinedDataList:
    mlData = {}
    mlData['Total Crimes'] = rows['Total Crimes']
    mlData['Depart'] = rows['Depart']
    mlData['Heat'] = rows['Heat']
    mlData['PrecipTotal'] = rows['PrecipTotal']
    mlData['Tavg'] = rows['Tavg']
    mlData['Tmax'] = rows['Tmax']
    mlData['Tmin'] = rows['Tmin']
    MLlist.append(mlData)

  #define input data
  inputRDD = sc.parallelize(MLlist)
  featureddf = spark.read.json(inputRDD)

  # label data
  input_data = featureddf.rdd.map(lambda x: (x['Total Crimes'], DenseVector([x['Depart'],x['Heat'], x['PrecipTotal'], x['Tavg'], x['Tmax'], x['Tmin']])))
  MLdf = spark.createDataFrame(input_data, ["label", "features"])
  #MLdf.show(10,False)
  # Automatically identify categorical features, and index them.
  # We specify maxCategories so features with > 4 distinct values are treated as continuous.
  featureIndexer =\
      VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(MLdf)

  # Split the data into training and test sets (30% held out for testing)
  (trainingData, testData) = MLdf.randomSplit([0.7, 0.3])

  # Train a DecisionTree model.
  dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

  # Chain indexer and tree in a Pipeline
  pipeline = Pipeline(stages=[featureIndexer, dt])

  # Train model.  This also runs the indexer.
  model = pipeline.fit(MLdf)

  # Make predictions.
  predictions = model.transform(MLdf)

  # Select example rows to display.
  #predictions.select("prediction", "label", "features").show(5,False)

  # Select (prediction, true label) and compute test error
  evaluator = RegressionEvaluator(
      labelCol="label", predictionCol="prediction", metricName="rmse")
  rmse = evaluator.evaluate(predictions)
  #print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

  treeModel = model.stages[1]
  # summary only
  #print(treeModel)

  pandaDF = predictions.toPandas()

  actual = pandaDF['label'].tolist()
  prediction = pandaDF['prediction'].tolist()
  print(len(prediction))
  return json.dumps({
          'actual': actual,
          'prediction': prediction,
          'treeSize': str(treeModel),
          'RMSE': rmse
         })
Example #14
def binsig(z, c, tau):
    return DenseVector(z > tau[c, :])
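
A quick self-contained sketch of what binsig produces (the numpy inputs are made up for illustration; the boolean result of z > tau[c, :] is stored by DenseVector as 0.0/1.0 values):

import numpy as np
from pyspark.ml.linalg import DenseVector

def binsig(z, c, tau):
    return DenseVector(z > tau[c, :])

z = np.array([0.2, 0.8, 0.5])
tau = np.array([[0.5, 0.5, 0.5],
                [0.1, 0.9, 0.4]])
print(binsig(z, 0, tau))  # [0.0,1.0,0.0]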
Example #15
        pan.to_csv('test_fet.csv', mode='w', index=False, header=True)
    else:
        pan.to_csv('test_fet.csv', mode='a', index=False, header=False)
    del pan
    pan = None
print(time.clock()-start)
			
# I converted csv to a parquet file to save space and time

def lis(x):
    return [float(i) for i in x[1:-1].split(',')]

from pyspark.ml.linalg import DenseVector

spark.read.load("test_fet.csv", format="csv", inferSchema="true", header="true").rdd \
          .map(lambda x: (x[2], x[1], DenseVector(lis(x[0])))) \
          .toDF(["index", "file", "features"])
          .write.parquet("test_fet.parquet")

# Now I get the Bag of Visual Words representation using K-means model built on training data
from pyspark import StorageLevel
schema = spark.read.parquet("test_fet.parquet").persist(StorageLevel(True, True, False, False, 1))

import numpy as np
from pyspark.ml.clustering import KMeansModel

model = KMeansModel.load('KmeansModel')
P = np.load('P.npy')

from pyspark.ml.linalg import DenseVector
predictions = model.transform(schema)
Example #16
    def test_prepare_data_compress_sparse(self):
        util.clear_training_cache()

        expected_metadata = \
            {
                'float': {
                    'spark_data_type': FloatType,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.NOCHANGE,
                    'max_size': 1,
                    'shape': 1
                },
                'dense': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
                'sparse': {
                    'spark_data_type': SparseVector,
                    'is_sparse_vector_only': True,
                    'intermediate_format': constants.CUSTOM_SPARSE,
                    'max_size': 1,
                    'shape': 2
                },
                'mixed': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
            }

        with mock.patch('horovod.spark.common.util._get_metadata',
                        side_effect=util._get_metadata) as mock_get_metadata:
            with spark_session('test_prepare_data') as spark:
                data = [[
                    0.0,
                    DenseVector([1.0, 1.0]),
                    SparseVector(2, {1: 1.0}),
                    DenseVector([1.0, 1.0])
                ],
                        [
                            1.0,
                            DenseVector([1.0, 1.0]),
                            SparseVector(2, {1: 1.0}),
                            SparseVector(2, {1: 1.0})
                        ]]

                schema = StructType([
                    StructField('float', FloatType()),
                    StructField('dense', VectorUDT()),
                    StructField('sparse', VectorUDT()),
                    StructField('mixed', VectorUDT())
                ])

                df = create_test_data_from_schema(spark, data, schema)

                with local_store() as store:
                    with util.prepare_data(
                            num_processes=2,
                            store=store,
                            df=df,
                            feature_columns=['dense', 'sparse', 'mixed'],
                            label_columns=['float'],
                            compress_sparse=True) as dataset_idx:
                        mock_get_metadata.assert_called()
                        assert dataset_idx == 0

                        train_rows, val_rows, metadata, avg_row_size = util.get_dataset_properties(
                            dataset_idx)
                        self.assertDictEqual(metadata, expected_metadata)
Example #17
from pyspark.sql.types import *
from optimus import Optimus
from pyspark.ml.linalg import VectorUDT, DenseVector, SparseVector
import numpy as np
nan = np.nan
from optimus.engines.spark.ml import encoding as fe

op = Optimus(master='local')
source_df = op.create.df([('id', LongType(), True), ('x', LongType(), True),
                          ('y', LongType(), True),
                          ('features', VectorUDT(), True)],
                         [(0, 1, 2, DenseVector([1.0, 0.5, -1.0])),
                          (1, 2, 3, DenseVector([2.0, 1.0, 1.0])),
                          (2, 3, 4, DenseVector([4.0, 10.0, 2.0]))])


class Testdf_ml_2(object):
    @staticmethod
    def test_one_hot_encoder():
        actual_df = fe.one_hot_encoder(source_df, input_cols=['id'])
        expected_df = op.create.df([
            ('id', LongType(), True), ('x', LongType(), True),
            ('y', LongType(), True), ('features', VectorUDT(), True),
            ('id***ONE_HOT_ENCODER', VectorUDT(), True)
        ], [(0, 1, 2, DenseVector([1.0, 0.5, -1.0]), SparseVector(2,
                                                                  {0: 1.0})),
            (1, 2, 3, DenseVector([2.0, 1.0, 1.0]), SparseVector(2, {1: 1.0})),
            (2, 3, 4, DenseVector([4.0, 10.0, 2.0]), SparseVector(2, {}))])
        assert (expected_df.collect() == actual_df.collect())

    @staticmethod
Example #18
num_cols = [
    item[0] for item in df.dtypes
    if item[1].startswith('in') or item[1].startswith('dou')
]

#We will choose these features for our baseline model:
num_features, cat_features = num_cols, cat_cols

#Dropping nulls
df = df.dropna()
num_features.remove("is_default")

#Transform Dataset
df_model = make_pipeline(df, num_features, cat_features)
input_data = df_model.rdd.map(lambda x:
                              (x["is_default"], DenseVector(x["features"])))
df_pipeline = spark.createDataFrame(input_data, ["is_default", "features"])

scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=True)
scalerModel = scaler.fit(df_pipeline)
scaledData = scalerModel.transform(df_pipeline)
scaledData = scaledData.drop("features")

#column_names
temp = scaledData.rdd.map(
    lambda x: [float(y) for y in x['scaledFeatures']]).toDF(num_features +
                                                            cat_features)
genre_audio = genre_audio.where(col("genre").isNotNull()) # To remove null values
df = genre_audio.drop('MSD_TRACKID')
df.count() # 413277
df.na.drop().count() # To check for the missing values # 413277

from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
label_stringIdx = StringIndexer(inputCol = 'new_label', outputCol = 'label')
pipeline = Pipeline(stages=[label_stringIdx])
pipelineFit = pipeline.fit(binary_df)
data = pipelineFit.transform(binary_df)
data = data.drop('new_label')

# Standardization
from pyspark.ml.linalg import DenseVector # Import DenseVector
input_data = df.rdd.map(lambda x: (x[20], DenseVector(x[:19]))) # Define the input_data 
df = spark.createDataFrame(input_data, ["label","features"])

from pyspark.ml.feature import StandardScaler
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled") # Initialize the standardScaler
scaler = standardScaler.fit(df) # Fit the DataFrame to the scaler
scaled_df = scaler.transform(df) # Transform the data in `df` with the scaler
scaled_df.take(2)


# dimensional reduction
from pyspark.ml.feature import PCA
pca = PCA(k=2, inputCol='features_scaled', outputCol='features_pca')
model = pca.fit(scaled_df)
reduced_df = model.transform(scaled_df).select('label','features_pca')
Example #20
    def create_datasets(self, userId, pid):

        # Get the user's features
        user_feature = json.loads(self.client_of_features.hget("user_features", userId))

        # Get the user's recall set
        recall_sets = self.client_of_recall.smembers(userId)

        result = []

        # Iterate over the recall set
        for adgroupId in recall_sets:
            adgroupId = int(adgroupId)
            # Get this ad's feature values
            ad_feature = json.loads(self.client_of_features.hget("ad_features", adgroupId))

            features = {}
            features.update(user_feature)
            features.update(ad_feature)

            for k, v in features.items():
                if v is None:
                    features[k] = -1

            features_col = [
                # feature values
                "price",
                "cms_group_id",
                "final_gender_code",
                "age_level",
                "shopping_level",
                "occupation",
                "pid",
                "pvalue_level",
                "new_user_class_level"
            ]
            '''
            "cms_group_id": categorical feature, ~13 categories ==> 13 dims
            "final_gender_code": categorical feature, 2 categories ==> 2 dims
            "age_level": categorical feature, 7 categories ==> 7 dims
            "shopping_level": categorical feature, 3 categories ==> 3 dims
            "occupation": categorical feature, 2 categories ==> 2 dims
            '''

            price = float(features["price"])

            pid_value = [0 for i in range(2)]
            cms_group_id_value = [0 for i in range(13)]
            final_gender_code_value = [0 for i in range(2)]
            age_level_value = [0 for i in range(7)]
            shopping_level_value = [0 for i in range(3)]
            occupation_value = [0 for i in range(2)]
            pvalue_level_value = [0 for i in range(4)]
            new_user_class_level_value = [0 for i in range(5)]

            pid_value[self.pid_rela[pid]] = 1
            cms_group_id_value[self.cms_group_id_rela[int(features["cms_group_id"])]] = 1
            final_gender_code_value[self.final_gender_code_rela[int(features["final_gender_code"])]] = 1
            age_level_value[self.age_level_rela[int(features["age_level"])]] = 1
            shopping_level_value[self.shopping_level_rela[int(features["shopping_level"])]] = 1
            occupation_value[self.occupation_rela[int(features["occupation"])]] = 1
            pvalue_level_value[self.pvalue_level_rela[int(features["pvalue_level"])]] = 1
            new_user_class_level_value[self.new_user_class_level_rela[int(features["new_user_class_level"])]] = 1

            #         print(pid_value)
            #         print(cms_group_id_value)
            #         print(final_gender_code_value)
            #         print(age_level_value)
            #         print(shopping_level_value)
            #         print(occupation_value)
            #         print(pvalue_level_value)
            #         print(new_user_class_level_value)

            vector = DenseVector([price] + pid_value + cms_group_id_value + final_gender_code_value \
                                 + age_level_value + shopping_level_value + occupation_value + pvalue_level_value + new_user_class_level_value)

            result.append((userId, adgroupId, vector))

        return result
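
For reference, the assembled feature vector above always has length 1 + 2 + 13 + 2 + 7 + 3 + 2 + 4 + 5 = 39 (price plus the one-hot blocks for pid, cms_group_id, final_gender_code, age_level, shopping_level, occupation, pvalue_level and new_user_class_level). A quick check with placeholder one-hot values:

from pyspark.ml.linalg import DenseVector

vector = DenseVector([19.9] + [1, 0] + [0] * 13 + [1, 0] + [0] * 7 +
                     [0] * 3 + [1, 0] + [0] * 4 + [0] * 5)
print(len(vector))  # 39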
Example #21
#########################################################

#from cluswisard_estimator import CluswisardEstimator

#clus = CluswisardEstimator(2, 48, 4, 10, 0.1)
#clus.treinar(t, "input", "label")

#classificacoes = clus._fit(t, "input")

#classificacoes.show()

# Import `DenseVector`
from pyspark.ml.linalg import DenseVector

# Define the `input_data` 
input_data = treino.rdd.map(lambda x: (x[0], DenseVector(x[1:])))

# Replace `df` with the new DataFrame
df = spark.createDataFrame(input_data, ["label", "features"])

# Import `StandardScaler` 
from pyspark.ml.feature import StandardScaler

# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

# Fit the DataFrame to the scaler
scaler = standardScaler.fit(df)

# Transform the data in `df` with the scaler
scaled_df_treino = scaler.transform(df)
Example #22
genderIndex=0.0, 
genderclassVec=SparseVector(1, {0: 1.0}), 
native-countryIndex=0.0, 
native-countryclassVec=SparseVector(40, {0: 1.0}), 
newlabel=0.0, 
features=SparseVector(99, {0: 1.0, 13: 1.0, 24: 1.0, 35: 1.0, 45: 1.0, 49: 1.0, 52: 1.0, 53: 1.0, 93: 25.0, 94: 226802.0, 96: 7.0, 98: 40.0}))]

'''

# step4) build the classifier : logistic
# to make the computation faster, convert model to a DF
# select newlabel and features from model using map

from pyspark.ml.linalg import DenseVector
input_data = model.rdd.map(lambda x:
                           (x['newlabel'], DenseVector(x['features'])))

df_train = sqlcontext.createDataFrame(input_data, ['label', 'features'])
df_train.show(2)

train_data, test_data = df_train.randomSplit([.8, .2], seed=1234)

train_data.groupby('label').agg({'label': 'count'}).show()
test_data.groupby('label').agg({'label': 'count'}).show()

# build the logreg
from pyspark.ml.classification import LogisticRegression
# initialize logreg
lr = LogisticRegression(labelCol='label',
                        featuresCol='features',
                        maxIter=10,
Example #23
def features_vector(arr):
    return DenseVector(np.array(arr))
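
A self-contained sketch of wrapping features_vector as a Spark SQL UDF so an array column can be converted to a vector column (the DataFrame and the column name "raw" are hypothetical; an active SparkSession named spark is assumed):

import numpy as np
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.linalg import DenseVector, VectorUDT

spark = SparkSession.builder.getOrCreate()

def features_vector(arr):
    return DenseVector(np.array(arr))

to_vector = F.udf(features_vector, VectorUDT())
df = spark.createDataFrame([([1.0, 2.0, 3.0],), ([0.5, -1.0, 2.0],)], ["raw"])
df.withColumn("features", to_vector("raw")).show(truncate=False)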
Example #24
    def fit(self, output_column, input_columns=None):
        if output_column not in self._dataframe_helper.get_numeric_columns():
            raise BIException('Output column: %s is not a measure column' % (output_column,))

        if input_columns is None:
            input_columns = list(set(self._dataframe_helper.get_numeric_columns()) - {output_column})

        if len(set(input_columns) - set(self._dataframe_helper.get_numeric_columns())) != 0:
            raise BIException('At least one of the input columns %r is not a measure column' % (input_columns,))

        # TODO: ensure no duplicates are present in input_columns

        regression_result = RegressionResult(output_column, input_columns)

        training_df = self._data_frame.rdd.map(lambda row: \
                                                   (float(row[output_column]),
                                                    DenseVector([float(row[col]) for col in input_columns]))).toDF()

        lr = LR(maxIter=LinearRegression.MAX_ITERATIONS, regParam=LinearRegression.REGULARIZATION_PARAM,
                elasticNetParam=1.0, labelCol=LinearRegression.LABEL_COLUMN_NAME,
                featuresCol=LinearRegression.FEATURES_COLUMN_NAME)

        lr_model = lr.fit(training_df)
        lr_summary = lr_model.evaluate(training_df)

        #regression_result.set_params(intercept=lr_model.intercept, coefficients=lr_model.coefficients,
        #                              rmse=lr_summary.rootMeanSquaredError, r2=lr_summary.r2,
        #                              t_values=lr_summary.tValues, p_values=lr_summary.pValues)

        # TODO: pass t_values and p_values
        coefficients = [float(i) for i in lr_model.coefficients.values]
        if not any([coeff != 0 for coeff in coefficients]):
            return None
        sample_data_dict = {}
        lr_dimension = {}
        for c in input_columns:
            sample_data_dict[c] = None
            lr_dimension[c] = {'dimension':'', 'levels': [], 'coefficients':[],
                                'dimension2':'', 'levels2': [], 'coefficients2':[]}
            diff = 0
            diff2 = 0
            for dim in self._string_columns:
            # sample_data_dict[col] = self._dataframe_helper.get_sample_data(col, output_column, self._sample_size)
                temp = []
                if len(self._levels[dim])>0 and len(self._levels[dim])<16:

                    for level in self._levels[dim]:
                        sub_df = self._data_frame.select(*[c,output_column]).filter(col(dim)==level)
                        train = sub_df.rdd.map(lambda row: (float(row[output_column]),
                                                                    DenseVector([float(row[c])]))).toDF()
                        sub_lr_model = lr.fit(train)
                        temp = temp + [float(i) for i in sub_lr_model.coefficients.values]
                    if max(temp)-min(temp) > diff:
                        diff = max(temp)-min(temp)
                        diff2 = diff
                        lr_dimension[c]['dimension2']= lr_dimension[c]['dimension']
                        lr_dimension[c]['levels2'] = lr_dimension[c]['levels']
                        lr_dimension[c]['coefficients2'] = lr_dimension[c]['coefficients']
                        lr_dimension[c]['dimension'] = dim
                        X = self._levels[dim]
                        Y = temp
                        Z = [abs(y) for y in Y]
                        lr_dimension[c]['levels'] = [x for (z,y,x) in sorted(zip(Z,Y,X))]
                        lr_dimension[c]['coefficients'] = [y for (z,y,x) in sorted(zip(Z,Y,X))]
                    elif max(temp)-min(temp) > diff2:
                        diff2 = max(temp)-min(temp)
                        lr_dimension[c]['dimension2'] = dim
                        X = self._levels[dim]
                        Y = temp
                        Z = [abs(y) for y in Y]
                        lr_dimension[c]['levels2'] = [x for (z,y,x) in sorted(zip(Z,Y,X))]
                        lr_dimension[c]['coefficients2'] = [y for (z,y,x) in sorted(zip(Z,Y,X))]

        regression_result.set_params(intercept=float(lr_model.intercept), coefficients=coefficients,
                                      rmse=float(lr_summary.rootMeanSquaredError), r2=float(lr_summary.r2),
                                      sample_data_dict=sample_data_dict, lr_dimension=lr_dimension)

        return regression_result
Example #25
# Each line of the Input file follows this format:
# "word - 0.6579854,1.161026,0.43898278,[...],2.0629232,0.063231304"
input_file = "sparkvectors2019"

# Maximum PCA dimension
K = 20

# Read the file, parse it as RDD and transform it into DataFrame
raw_vectors = sc.textFile(input_file)
vectors_rdd = raw_vectors.map(lambda row:
                              (row.split(" - ")[0], row.split(" - ")[1]))
vectors_df = spark.createDataFrame(vectors_rdd, ['Label', 'Vector'])

# Count the elements to force Spark to load the file at this time
vectors_df.count()

# Parse vectors as DenseVectors
vectorize_lines = udf(lambda row:
                      DenseVector([float(number) for number in row.split(",")]), VectorUDT())
vectorized_df = vectors_df.withColumn(
    "DenseVector", vectorize_lines(vectors_df['Vector']))

pca = PCA(k=K, inputCol="DenseVector", outputCol='pcaFeatures')
model = pca.fit(vectorized_df)
# model.explainedVariance
result = model.transform(vectorized_df)
for k in range(1, K):
    result.rdd.map(lambda row:
                   row[0] + ",\"" + ",".join(str(a) for a in row[3][:k]) + "\"").saveAsTextFile("pca" + str(k))
Example #26
def lr_sort_service(reco_set, temp, hbu):
    """
    排序返回推荐文章
    :param reco_set:召回合并过滤后的结果
    :param temp: 参数
    :param hbu: Hbase工具
    :return:
    """
    # Ranking
    # 1. Read the user's features from the user feature center
    try:
        user_feature = eval(hbu.get_table_row('ctr_feature_user',
                                              '{}'.format(temp.user_id).encode(),
                                              'channel:{}'.format(temp.channel_id).encode()))
        logger.info("{} INFO get user user_id:{} channel:{} profile data".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), temp.user_id, temp.channel_id))
    except Exception as e:
        user_feature = []

    if user_feature:
        # 2. Read the article features from the article feature center
        result = []

        for article_id in reco_set:
            try:
                article_feature = eval(hbu.get_table_row('ctr_feature_article',
                                                         '{}'.format(article_id).encode(),
                                                         'article:{}'.format(article_id).encode()))
            except Exception as e:

                article_feature = [0.0] * 111
            f = []
            # First: channel_id
            f.extend([article_feature[0]])
            # Second: article_vector
            f.extend(article_feature[11:])
            # Third: user weight features
            f.extend(user_feature)
            # Fourth: article weight features
            f.extend(article_feature[1:11])
            vector = DenseVector(f)
            result.append([temp.user_id, article_id, vector])

        # 4. Predict, then rank and filter
        df = pd.DataFrame(result, columns=["user_id", "article_id", "features"])
        test = SORT_SPARK.createDataFrame(df)

        # Load the logistic regression model
        model = LogisticRegressionModel.load("hdfs://hadoop-master:9000/headlines/models/LR.obj")
        predict = model.transform(test)

        def vector_to_double(row):
            return float(row.article_id), float(row.probability[1])

        res = predict.select(['article_id', 'probability']).rdd.map(vector_to_double).toDF(
            ['article_id', 'probability']).sort('probability', ascending=False)
        article_list = [i.article_id for i in res.collect()]
        logger.info("{} INFO sorting user_id:{} recommend article".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                                                          temp.user_id))
        # After ranking, return only the top-ranked article IDs (at most 200) for recommendation
        if len(article_list) > 200:
            article_list = article_list[:200]
        reco_set = list(map(int, article_list))

    return reco_set
Example #27
X, y = digits.data, digits.target

#todo: note difficulties of spark: cannot set weights for evaluation so using sklearn slows that down

# todo: make the lists wayyyy bigger to show the computation time is faster on spark
# todo: play with partition size/parameters for each dataset
# todo: sanity checks (same weights produced with no DP noise if non-random train test split)
# todo: update python code to include intercept


X = X.tolist()
y = y.tolist()

data = zip(y,X)

formatted = [(int(y_i), DenseVector(x_i)) for y_i, x_i in data]
fields = [StructField('label', IntegerType(), True), StructField('features', VectorUDT(), True)]
schema = StructType(fields)
data = spark.createDataFrame(formatted, schema)

#train_data, test_data = data.randomSplit([1/2, 1/2])
train_data = data


lr = LogisticRegression(maxIter=10)
lrModel = lr.fit(train_data)
trainingSummary = lrModel.summary
print(trainingSummary.accuracy)
######

result = lrModel.transform(train_data) # test_data
               ' weekday_is_thursday', ' weekday_is_friday', ' weekday_is_saturday', ' weekday_is_sunday', ' is_weekend',
               ' LDA_00', ' LDA_01', ' LDA_02', ' LDA_03', ' LDA_04', ' global_subjectivity', ' global_sentiment_polarity',
               ' global_rate_positive_words', ' global_rate_negative_words', ' rate_positive_words', ' rate_negative_words',
               ' avg_positive_polarity', ' min_positive_polarity', ' max_positive_polarity', ' avg_negative_polarity',
               ' min_negative_polarity', ' max_negative_polarity', ' title_subjectivity', ' title_sentiment_polarity',
               ' abs_title_subjectivity', ' abs_title_sentiment_polarity', ' shares']

    # Convert the `df` columns to `FloatType()`
    data = convertColumn(data, columns, FloatType())

    data = data.select(' shares',' n_non_stop_words')

    # standardization

    # Define the `input_data`
    input_data = data.rdd.map(lambda x: (x[0], DenseVector(x[1:])))

    # Replace `data` with the new DataFrame
    data = spark.createDataFrame(input_data, ["label", "features"])

    # Initialize the `standardScaler`
    standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

    # Fit the DataFrame to the scaler
    scaler = standardScaler.fit(data)

    # Transform the data in `df` with the scaler
    scaled_data = scaler.transform(data)

    # Building a machine learning model
    # split the data into train and test sets
Example #29
				def CosineSimi (v1, v2):
					d1 = DenseVector(v1)
					d2 = DenseVector(v2)
					n1 = d1.norm(2)
					n2 = d2.norm(2)
					return float(d1.dot(d2) / (n1 * n2))
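
A self-contained sketch of using CosineSimi as a UDF over two vector columns (the toy DataFrame is hypothetical; an active SparkSession named spark is assumed):

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import DoubleType
from pyspark.ml.linalg import DenseVector, Vectors

spark = SparkSession.builder.getOrCreate()

def CosineSimi(v1, v2):
    d1 = DenseVector(v1)
    d2 = DenseVector(v2)
    n1 = d1.norm(2)
    n2 = d2.norm(2)
    return float(d1.dot(d2) / (n1 * n2))

cosine_udf = F.udf(CosineSimi, DoubleType())
df = spark.createDataFrame(
    [(Vectors.dense([1.0, 0.0]), Vectors.dense([1.0, 1.0]))], ["v1", "v2"])
df.withColumn("cosine", cosine_udf("v1", "v2")).show()  # cosine ~0.707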
Example #30
df_indexed = pipeline.fit(df).transform(df)

## 3. Convert to label/features format
catVarsIndexed = [i + '_indexed' for i in catVars]
featuresCol = numVars + catVarsIndexed
featuresCol.remove('Survived')
labelCol = ['Mark', 'Survived']

from pyspark.sql import Row
from pyspark.ml.linalg import DenseVector

row = Row('mark', 'label', 'features')

df_indexed = df_indexed[labelCol + featuresCol]
# 0-mark, 1-label, 2-features
lf = df_indexed.rdd.map(lambda r: (row(r[0], r[1], DenseVector(r[2:])))).toDF()
# index label
lf = StringIndexer(inputCol='label', outputCol='index').fit(lf).transform(lf)

# split back train/test data
train = lf.where(lf.mark == 'train')
test = lf.where(lf.mark == 'test')

# random split further to get train/validate
train, validate = train.randomSplit([0.7, 0.3], seed=121)

print('Train Data Number of Rows: ' + str(train.count()))
print('Validate Data Number of Rows: ' + str(validate.count()))
print('Test Data Number of Rows: ' + str(test.count()))

# Apply Logistic Regression
Example #31
    #Standard scaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)
    stages += [scaler]
    
    #Creating and running the pipeline
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(spark_df)
    out_df = pipelineModel.transform(spark_df)
    
    return out_df
    
    
df_model = make_pipeline_numeric(df)

input_data = df_model.rdd.map(lambda x: (x["is_default"], DenseVector(x["scaledFeatures"])))

df_pre_smote = spark.createDataFrame(input_data, ["is_default", "scaledFeatures"])

args = sys.argv
k = int(args[1])

df_smote = SmoteSampling(vectorizerFunction(df_pre_smote, 'is_default'), k = k, minorityClass = 1, majorityClass = 0, percentageOver = 400, percentageUnder = 100)

df_smote_table = df_smote.rdd.map(lambda x:[float(y) for y in x['features']]).toDF()


table_name = 'default.LC_Smote_k' + str(k)

df_smote_table\
  .write.format("parquet")\