Example #1
        def getOrCreate(self):
            """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
            new one based on the options set in this builder.

            This method first checks whether there is a valid global default SparkSession, and if
            yes, return that one. If no valid global default SparkSession exists, the method
            creates a new SparkSession and assigns the newly created SparkSession as the global
            default.

            >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
            >>> s1.conf.get("k1") == "v1"
            True

            In case an existing SparkSession is returned, the config options specified
            in this builder will be applied to the existing SparkSession.

            >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
            >>> s1.conf.get("k1") == s2.conf.get("k1")
            True
            >>> s1.conf.get("k2") == s2.conf.get("k2")
            True
            """
            with self._lock:
                from pyspark.context import SparkContext
                from pyspark.conf import SparkConf
                session = SparkSession._instantiatedContext
                if session is None:
                    sparkConf = SparkConf()
                    for key, value in self._options.items():
                        sparkConf.set(key, value)
                    sc = SparkContext.getOrCreate(sparkConf)
                    session = SparkSession(sc)
                for key, value in self._options.items():
                    session.conf.set(key, value)
                return session
Example #2
def read_to_rdd() -> RDD:
    """[read file into RDD]

    Returns:
        RDD: [pyspark RDD]
    """
    sc = SparkContext.getOrCreate()
    rdd = sc.textFile(LOCAL_FILE)
    rdd = rdd.mapPartitions(lambda x: csv.reader(x))
    return rdd
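A minimal usage sketch for the function above, assuming LOCAL_FILE points at an existing CSV file; each element of the returned RDD is one parsed row (a list of column values) because of the csv.reader mapPartitions step:

rows = read_to_rdd().take(3)
print(rows)  # e.g. [['col_a', 'col_b'], ...] - illustrative output shape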
Example #3
    def get_resource_data(self):
        """ Fetch the required data from resource file """

        path = settings.PATH
        required_columns = settings.COLUMNS
        spark_ctx = SparkContext.getOrCreate()
        spark = SparkSession(spark_ctx)
        shoes_df = spark.read.csv(path, inferSchema=True, header=True)
        required_df = shoes_df.select(required_columns)
        return required_df
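For context, a plausible shape of the settings module referenced above; the actual path and column names are not shown in the snippet, so the values here are purely illustrative:

# Hypothetical settings module backing get_resource_data()
class settings:
    PATH = "data/shoes.csv"               # illustrative resource path
    COLUMNS = ["brand", "size", "price"]  # illustrative column list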
Example #4
def list_product_count(rdd: RDD):
    """Task 2b: save the unique count of products to a text file.

    Args:
        rdd (RDD): spark RDD
    """
    sc = SparkContext.getOrCreate()
    rdd = sc.parallelize([str(rdd.flatMap(explode).count())])
    header = sc.parallelize(["Count:"])
    header.union(rdd).coalesce(1).saveAsTextFile("out/out_1_2b.txt")
Example #5
 def _init_glue_context():
     # Imports are done here so we can isolate the configuration of this job
     from awsglue.context import GlueContext
     from pyspark.context import SparkContext
     spark_context = SparkContext.getOrCreate()
     spark_context._jsc.hadoopConfiguration().set(
         "mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")  # noqa pylint: disable=protected-access
     spark_context._jsc.hadoopConfiguration().set(
         "parquet.enable.summary-metadata", "false")  # noqa pylint: disable=protected-access
     return GlueContext(spark_context)
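A short usage sketch for the helper above, assuming it runs inside an AWS Glue job where the awsglue libraries are available:

glue_context = _init_glue_context()
spark = glue_context.spark_session  # plain SparkSession for DataFrame work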
Example #6
 def setUp(self):
     # Create a local Spark context with 4 cores
     spark_conf = SparkConf().setMaster('local[4]').\
         setAppName("monasca-transform unit tests").\
         set("spark.sql.shuffle.partitions", "10")
     self.spark_context = SparkContext.getOrCreate(conf=spark_conf)
     # quiet logging
     logger = self.spark_context._jvm.org.apache.log4j
     logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)
     logger.LogManager.getLogger("akka").setLevel(logger.Level.WARN)
Example #8
    def predict(self, src, outfile=None):
        """
			Predicting on a new data set.
		"""
        sc = SparkContext.getOrCreate()

        # Make n-grams. Resultant RDD format: (document <any>, ngram <tuple>)
        ngrams = computeNGramsRDD(src, n=self.n, filter=self.filter)

        # Broadcast the set of class labels.
        blabels = sc.broadcast(set(self.conditionals.keys()))

        # Prepare for estimating class probabilities against each document.
        # Resultant RDD format: ((document, cls), ngram)
        # Tuple parameters cannot be unpacked in Python 3 lambdas, so positional indexing is used.
        ngrams = ngrams.flatMap(
            lambda dn: [((dn[0], cls), dn[1]) for cls in blabels.value])

        # Broadcast the conditional probabilities.
        bmodel = sc.broadcast(self.conditionals)

        # Replace each n-gram with its likelihood estimate, falling back to the
        # "unseen" likelihood. Resultant RDD format: ((document, cls), likelihood)
        likelihoods = ngrams.map(
            lambda kv: (kv[0], bmodel.value[kv[0][1]].get(kv[1])
                        or bmodel.value[kv[0][1]].get("unseen")))

        # Sum up likelihoods. Resultant RDD format: ((document, cls), likelihood)
        likelihoods = likelihoods.reduceByKey(add)

        # Restructure the key-value pairs to make document the key for grouping.
        # Resultant RDD format: (document, (class, likelihood))
        likelihoods = likelihoods.map(
            lambda kv: (kv[0][0], (kv[0][1], kv[1])))

        # Broadcast prior probabilities
        bpriors = sc.broadcast(self.priors)

        # Add prior probabilities to the estimates. Resultant RDD format: (document, (class, likelihood))
        likelihoods = likelihoods.map(
            lambda kv: (kv[0], (kv[1][0], kv[1][1] + bpriors.value[kv[1][0]])))

        # Now group predictions by document. (document <any>, (class, likelihood) <iterator>)
        likelihoods = likelihoods.groupByKey()

        # Pick the best. Resultant RDD format: (document, (class, likelihood))
        likelihoods = likelihoods.map(
            lambda kv: (kv[0], max(kv[1], key=lambda cl: cl[1])))

        # Drop the likelihood estimates; only retain the class.
        predictions = likelihoods.map(lambda kv: (kv[0], kv[1][0]))

        # Save predictions to outfile, if provided
        if outfile is not None:
            with open(outfile, 'w') as outfile:
                json.dump(predictions.collectAsMap(), outfile)

        return predictions
Example #9
def main(argv):
   input = argv[0]
   output = argv[1]
   sc = SparkContext.getOrCreate() #"SparkContext is the entry point to any Spark functionality" - tutorialspoint.com
   sqlContext = SQLContext(sc)
   dataframes = sqlContext.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load(input)
   # previous line (adapted from Stack Overflow): reads the file as CSV, treats the first row
   # as a header rather than data, drops malformed rows, and then loads the input file
   sorted_dataframes = dataframes.sort(['cca2', 'timestamp'], ascending=[True, True])
   #previous line sorts dataframe by cca2 (country) and timestamp
   sorted_dataframes.write.csv(output) #outputs to file
Example #10
def main():
    # Create a Glue context
    glueContext = GlueContext(SparkContext.getOrCreate())

    # Create a DynamicFrame using the 'persons_json' table
    persons_DyF = glueContext.create_dynamic_frame.from_catalog(
        database="legislators", table_name="persons_json")

    # Print out information about this data
    print("Count:  ", persons_DyF.count())
    persons_DyF.printSchema()
Example #11
    def _fit(self, dataset):
        """Trains a TensorFlow model and returns a TFModel instance with the same args/params pointing to a checkpoint or saved_model on disk.

    Args:
      :dataset: A Spark DataFrame with columns that will be mapped to TensorFlow tensors.

    Returns:
      A TFModel representing the trained model, backed on disk by a TensorFlow checkpoint or saved_model.
    """
        sc = SparkContext.getOrCreate()

        logger.info("===== 1. train args: {0}".format(self.args))
        logger.info("===== 2. train params: {0}".format(self._paramMap))
        local_args = self.merge_args_params()
        logger.info("===== 3. train args + params: {0}".format(local_args))

        tf_args = self.args.argv if self.args.argv else local_args
        cluster = TFCluster.run(sc,
                                self.train_fn,
                                tf_args,
                                local_args.cluster_size,
                                local_args.num_ps,
                                local_args.tensorboard,
                                TFCluster.InputMode.SPARK,
                                master_node=local_args.master_node,
                                driver_ps_nodes=local_args.driver_ps_nodes)
        # feed data, using a deterministic order for input columns (lexicographic by key)
        input_cols = sorted(self.getInputMapping())
        cluster.train(dataset.select(input_cols).rdd, local_args.epochs)
        cluster.shutdown(grace_secs=self.getGraceSecs())

        if self.export_fn:
            if version.parse(TF_VERSION) < version.parse("2.0.0"):
                # For TF1.x, run export function, if provided
                if not local_args.export_dir:
                    raise ValueError(
                        "Export function requires --export_dir to be set")
                logging.info(
                    "Exporting saved_model (via export_fn) to: {}".format(
                        local_args.export_dir))

                def _export(iterator, fn, args):
                    single_node_env(args)
                    fn(args)

                # Run on a single executor
                sc.parallelize([1], 1).foreachPartition(
                    lambda it: _export(it, self.export_fn, tf_args))
            else:
                # for TF2.x
                raise Exception(
                    "Please use native TF2.x APIs to export a saved_model.")

        return self._copyValues(TFModel(self.args))
Example #12
def main():
    # Create a Glue context
    glue = GlueContext(SparkContext.getOrCreate())

    # Create a DynamicFrame using the 'persons_json' table
    persons_dyf = glue.read.json("s3://bertolb/sampledata/mockaroo/json")
    # persons_dyf = glue.read().catalog("legislators", "persons_json")
    # persons_dyf = glueContext.create_dynamic_frame.from_catalog(database="legislators", table_name="persons_json")

    # Print out information about this data
    print("Count:  ", persons_dyf.count())
    persons_dyf.printSchema()
Example #13
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    ########## dyf_learning_object
    dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                                database="nvn_knowledge",
                                table_name="nvn_knowledge_learning_object"
                            )
    dyf_learning_object = dyf_learning_object.select_fields(
        ['_key', 'learning_object_id', 'learning_object_name', 'phone_tic'])
    # convert data types
    dyf_learning_object = dyf_learning_object.resolveChoice(specs=[('_key', 'cast:long')])


    # Read the most recent max key and only load records with a larger key
    # (avoids reloading data that earlier ETL runs have already processed)
    df_flag = spark.read.parquet("s3://dts-odin/flag/flag_LO.parquet")
    max_key = df_flag.collect()[0]['flag']
    # compare the datasource _key with the flag and keep only records with key > flag
    dyf_learning_object = Filter.apply(frame=dyf_learning_object, f=lambda x: x["_key"] > max_key)
    # Show schema
    dyf_learning_object.printSchema()
    # Show the dynamic frame's data
    dyf_learning_object.show()
    # Continue only if there are new records
    if (dyf_learning_object.count() > 0):
        apply_mapping_learning_object = ApplyMapping.apply(frame=dyf_learning_object,
                                          mappings=[("learning_object_id", "int", "learning_object_id", "int"),
                                                    ("learning_object_name", "string", "learning_object_name", "string"),
                                                    ("phone_tic", "string", "phone_tic", "string")])
        resolve_choice_learning_object = ResolveChoice.apply(frame=apply_mapping_learning_object, choice="make_cols",
                                                transformation_ctx="resolve_choice_learning_object")
        dropnullfields_learning_object = DropNullFields.apply(frame=resolve_choice_learning_object, transformation_ctx="dropnullfields_learning_object")
        #
        # save_learning_object = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields_learning_object,
        #                                                            catalog_connection="glue_redshift",
        #                                                            connection_options={
        #                                                                "dbtable": "learning_object",
        #                                                                "database": "dts_odin"
        #                                                                },
        #                                                            redshift_tmp_dir="s3n://dts-odin/temp/tig_advisor/user_profile_student_contact/",
        #                                                            transformation_ctx="datasink4")

        # get the max key from the data source
        datasource = dyf_learning_object.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]

        # convert the data type
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        # overwrite the _key flag in S3
        df.write.parquet("s3a://dts-odin/flag/flag_LO.parquet", mode="overwrite")
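The job above follows a common incremental-load pattern: read the last processed _key from a small "flag" parquet file, filter the source for newer records, and write the new maximum key back. A minimal sketch of the same pattern, assuming an existing SparkSession `spark`, a DataFrame `df` with a `_key` column, and a hypothetical flag path:

flag_path = "s3://example-bucket/flag/my_flag.parquet"  # illustrative path
try:
    last_key = spark.read.parquet(flag_path).collect()[0]["flag"]
except Exception:
    last_key = 0  # first run: nothing processed yet

new_rows = df.filter(df["_key"] > last_key)
if new_rows.count() > 0:
    # ... transform and persist new_rows here ...
    max_key = new_rows.agg({"_key": "max"}).collect()[0][0]
    spark.createDataFrame([max_key], "long").toDF("flag") \
        .write.parquet(flag_path, mode="overwrite")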
Example #14
def list_top_purchased_products(rdd: RDD):
    """[task 3 - saves top 5 purchased products into txt file]

    Args:
        rdd (RDD): [spark RDD]
    """
    sc = SparkContext.getOrCreate()
    rdd = (sc.parallelize(
        rdd.flatMap(explode).map(lambda w: (w, 1)).reduceByKey(
            lambda a, b: a + b).takeOrdered(5,
                                            key=lambda x: -x[1])).coalesce(1))
    rdd.saveAsTextFile("out/out_1_3.txt")
Example #15
def main():
    console("Starting Job")

    ## @params: [job_name]
    # 1. Start Glue Context
    glueContext = GlueContext(SparkContext.getOrCreate())

    # 2. Initialize Job
    job = Job(glueContext)
    args = get_args()
    job.init(args["JOB_NAME"], args)

    client = boto3.client("glue", region_name="us-east-1")

    Tables = client.get_tables(DatabaseName=args["athena_database"])
    tableList = Tables["TableList"]

    # 3. Clear bucket contents
    console(f"Excluding S3 files for: {args['target_bucket']}")
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(args["target_bucket"])
    bucket.objects.delete()

    for table in tableList:
        tableName = table["Name"]
        if re.search(f"[0-9]+", tableName):
            # Ignoring Athena Tables already processed...
            continue
        else:
            console("Processing Table {}".format(tableName))

        # 4. Load dynamic dataframe
        datasource0 = glueContext.create_dynamic_frame.from_catalog(
            database=args["athena_database"],
            table_name=tableName,
            transformation_ctx="datasource0",
        )
        # 5. Process Dynamo database
        # Drop null fields
        dropnullfields1 = DropNullFields.apply(
            frame=datasource0, transformation_ctx="dropnullfields1")

        # Save Dynamic Frame on S3 using Glue
        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path": "s3://{}/{}/".format(args["target_bucket"], tableName)
            },
            format="parquet",
            transformation_ctx="datasink2",
        )
    job.commit()
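The get_args() helper used above is not shown in this snippet; a plausible definition, assuming AWS Glue's standard getResolvedOptions utility and the argument names the job references, would be:

import sys
from awsglue.utils import getResolvedOptions

def get_args():
    # resolve the job parameters referenced in main()
    return getResolvedOptions(
        sys.argv, ["JOB_NAME", "athena_database", "target_bucket"])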
Example #16
  def _fit(self, dataset):
    """Trains a TensorFlow model and returns a TFModel instance with the same args/params pointing to a checkpoint or saved_model on disk.

    Args:
      :dataset: A Spark DataFrame with columns that will be mapped to TensorFlow tensors.

    Returns:
      A TFModel representing the trained model, backed on disk by a TensorFlow checkpoint or saved_model.
    """
    sc = SparkContext.getOrCreate()

    logging.info("===== 1. train args: {0}".format(self.args))
    logging.info("===== 2. train params: {0}".format(self._paramMap))
    local_args = self.merge_args_params()
    logging.info("===== 3. train args + params: {0}".format(local_args))

    if local_args.input_mode == TFCluster.InputMode.TENSORFLOW:
      if dfutil.isLoadedDF(dataset):
        # if just a DataFrame loaded from tfrecords, just point to original source path
        logging.info("Loaded DataFrame of TFRecord.")
        local_args.tfrecord_dir = dfutil.loadedDF[dataset]
      else:
        # otherwise, save as tfrecords and point to save path
        assert local_args.tfrecord_dir, "Please specify --tfrecord_dir to export DataFrame to TFRecord."
        if self.getInputMapping():
          # if input mapping provided, filter only required columns before exporting
          dataset = dataset.select(list(self.getInputMapping()))
        logging.info("Exporting DataFrame {} as TFRecord to: {}".format(dataset.dtypes, local_args.tfrecord_dir))
        dfutil.saveAsTFRecords(dataset, local_args.tfrecord_dir)
        logging.info("Done saving")

    tf_args = self.args.argv if self.args.argv else local_args
    cluster = TFCluster.run(sc, self.train_fn, tf_args, local_args.cluster_size, local_args.num_ps,
                            local_args.tensorboard, local_args.input_mode, driver_ps_nodes=local_args.driver_ps_nodes)
    if local_args.input_mode == TFCluster.InputMode.SPARK:
      # feed data, using a deterministic order for input columns (lexicographic by key)
      input_cols = sorted(self.getInputMapping())
      cluster.train(dataset.select(input_cols).rdd, local_args.epochs)
    cluster.shutdown()

    # Run export function, if provided
    if self.export_fn:
      assert local_args.export_dir, "Export function requires --export_dir to be set"
      logging.info("Exporting saved_model (via export_fn) to: {}".format(local_args.export_dir))

      def _export(iterator, fn, args):
        single_node_env(args)
        fn(args)

      # Run on a single executor
      sc.parallelize([1], 1).foreachPartition(lambda it: _export(it, self.export_fn, tf_args))

    return self._copyValues(TFModel(self.args))
Example #17
def __main__():
    sc = SparkContext.getOrCreate()

    sc.setLogLevel('OFF')

    text_file = sc.textFile(sys.argv[1])
    sortedCount = text_file.flatMap(lambda x: x.split(" ")).map(lambda x: (int(x), 1)).sortByKey()
    output = sortedCount.collect()

    with open(sys.argv[2], 'w') as f:
        for (num, unitcount) in output:
            f.write(str(num) + "\n")
Example #18
def main():

    # # @params: [TempDir, JOB_NAME]
    # args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])
    # sc = SparkContext()
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)

    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source

    dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_level_study")

    # select the required fields
    dyf_log_student_level_study = dyf_log_student_level_study.select_fields([
        '_key', 'contact_id', 'level_modified', 'package_code', 'time_created'
    ])
    dyf_log_student_level_study = dyf_log_student_level_study.resolveChoice(
        specs=[('_key', 'cast:long')])
    dyf_log_student_level_study.printSchema()
    dyf_log_student_level_study.show(2)

    #  check bucket is not null
    try:
        # read the flag checkpoint from s3
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/toa_L3150/toa_8_log_student_level_study.parquet"
        )
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)

        # compare the datasource _key with the flag and keep only records with key > flag
        dyf_log_student_level_study = Filter.apply(
            frame=dyf_log_student_level_study,
            f=lambda x: x['_key'] > start_read)
    except Exception:
        print('error reading flag file')

    print('the number of new contacts: ', dyf_log_student_level_study.count())

    if (dyf_log_student_level_study.count() > 0):

        dyf_log_student_level_study = Filter.apply(
            frame=dyf_log_student_level_study,
            f=lambda x: x['contact_id'] is not None and x['level_modified'] is
            not None and x['package_code'] is not None and x['time_created'
                                                             ] is not None)

        dyf_log_student_level_study.printSchema()
Example #19
 def getOrCreate(self):
     """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a new
     one based on the options set in this builder.
     """
     with self._lock:
         from pyspark.conf import SparkConf
         from pyspark.context import SparkContext
         from pyspark.sql.context import SQLContext
         sparkConf = SparkConf()
         for key, value in self._options.items():
             sparkConf.set(key, value)
         sparkContext = SparkContext.getOrCreate(sparkConf)
         return SQLContext.getOrCreate(sparkContext).sparkSession
Example #20
 def broadcast(self):
     """Broadcast self to ensure we are shared."""
     if self._broadcast is None:
         from pyspark.context import SparkContext
         sc = SparkContext.getOrCreate()
         try:
             SpacyMagic.__lock.acquire()
             self.__empty_please = True
             self._broadcast = sc.broadcast(self)
             self.__empty_please = False
         finally:
             SpacyMagic.__lock.release()
     return self._broadcast
Example #21
 def getOrCreate(self):
     """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a new
     one based on the options set in this builder.
     """
     with self._lock:
         from pyspark.conf import SparkConf
         from pyspark.context import SparkContext
         from pyspark.sql.context import SQLContext
         sparkConf = SparkConf()
         for key, value in self._options.items():
             sparkConf.set(key, value)
         sparkContext = SparkContext.getOrCreate(sparkConf)
         return SQLContext.getOrCreate(sparkContext).sparkSession
Example #22
def computeLikelihoodsRDD(tfs, alpha=0.0, vsize=1.0, logs=True):
    """
		This method takes in an RDD of multinomial distribution and computes likelihoods from the distributon. The flag @clogs dictates whether or not to take log of likelihoods. The mehod can perform additive smoothing before computing the estimator. @alpha <float> is the smoothing parameter and @vsize <int> (|V|) is the size of the vocabulary. Refer the wikipedia article for more on additive smoothing: https://en.wikipedia.org/wiki/Additive_smoothing. No smoothing is performed by default.
		
		Input RDD: (document <any>, {word: count})
		Output RDD: (document <any>, {word: likelihood, .... , "unseen": likelihood})
		Note: The likelihood of an unseen attribute computed via additive smoothing is added to the word-likelihood map with the key 'unseen' <string>.
	"""
    sc = SparkContext.getOrCreate()

    # Broadcast the variables required for computing likelihoods
    logs = sc.broadcast(float(logs))
    alpha = sc.broadcast(float(alpha))
    vsize = sc.broadcast(float(vsize))

    # Define the map method for computing likelihoods from a frequency table.
    # Note that it is defined inside this function because it uses the variables
    # broadcast above and wouldn't make sense as a generic method of the module.
    def computeLikelihoods(table):
        """
			The map function for likelihhood estimation.
		"""
        # Total weight
        total = sum((table[word] for word in table))

        # Case 1. logs flag is set: Compute log likelihoods
        if logs.value:
            # log (tf + alpha / total + alpha * |V|)
            newtable = {
                word: log((table[word] + alpha.value) /
                          (total + alpha.value * vsize.value))
                for word in table
            }
            newtable["unseen"] = log(alpha.value /
                                     (total + alpha.value * vsize.value))

        # Case 2. Compute vanilla likelihoods
        else:
            newtable = {
                word: (table[word] + alpha.value) /
                (total + alpha.value * vsize.value)
                for word in table
            }
            newtable["unseen"] = alpha.value / (total +
                                                alpha.value * vsize.value)

        return newtable

    # Transform term frequencies RDD into an estimator
    estimator = tfs.mapValues(computeLikelihoods)

    return estimator
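A small usage sketch for the estimator above, applying Laplace smoothing (alpha=1) over a toy vocabulary of size 3; the numbers follow directly from the formula in the code, and everything apart from the function itself is illustrative:

sc = SparkContext.getOrCreate()
tfs = sc.parallelize([("doc1", {"spark": 2, "rdd": 1})])
est = computeLikelihoodsRDD(tfs, alpha=1.0, vsize=3.0, logs=False)
# Expected: [('doc1', {'spark': 0.5, 'rdd': 0.333..., 'unseen': 0.166...})]
print(est.collect())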
Example #23
def main():
    # Invoke pydevd
    # pydevd.settrace('169.254.76.0', port=9001, stdoutToServer=True, stderrToServer=True)

    # Create a Glue context
    glueContext = GlueContext(SparkContext.getOrCreate())

    # Create a DynamicFrame using the 'persons_json' table
    persons_DyF = glueContext.create_dynamic_frame.from_catalog(
        database="legislators", table_name="persons_json")

    # Print out information about this data
    print("Count:  ", persons_DyF.count())
    persons_DyF.printSchema()
Example #24
    def __init__(self, bucket_name, repartition_number, is_local=False):

        if is_local:
            sc = LocalContext("local[*]")
        else:
            glueContext = GlueContext(SparkContext.getOrCreate())
            sc = glueContext.spark_session
            pass

        self.sql_context = SQLContext(sc)
        self.s3_bucket = bucket_name
        self.repartition_number = repartition_number

        s3_resource = boto3.resource('s3')
        self.bucket_resource = s3_resource.Bucket(bucket_name)
Example #25
def _get_or_create_context():
    global _sc
    if not _sc:
        from pyspark.conf import SparkConf
        from pyspark.context import SparkContext
        settings = GlobalPreferences['spark_configuration']
        SparkContext.setSystemProperty("spark.executor.memory",settings['executor-memory'])
        SparkContext.setSystemProperty("spark.driver.memory",settings['driver-memory'])
        conf = SparkConf()
        conf = conf.setMaster(settings['master'])
        conf = conf.set('spark.driver.maxResultSize',settings['driver-memory'])
        _sc = SparkContext.getOrCreate(conf=conf)
        _sc.setLogLevel("WARN")
    return _sc
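For reference, the shape of GlobalPreferences['spark_configuration'] that the helper above assumes; the keys come from the code, while the values here are only illustrative:

GlobalPreferences = {
    'spark_configuration': {
        'master': 'local[*]',       # illustrative
        'executor-memory': '2g',    # illustrative
        'driver-memory': '2g',      # illustrative
    }
}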
        
Example #26
 def _log(msg, level):
     sc = SparkContext.getOrCreate()
     log4jLogger = sc._jvm.org.apache.log4j
     LOGGER = log4jLogger.LogManager.getLogger(__name__)
     if LOGGER:
         if level == 1:
             LOGGER.error(msg)
         elif level == 2:
             LOGGER.warn(msg)
         elif level == 3:
             LOGGER.info(msg)
         else:
             LOGGER.debug(msg)
     else:
         print("Logging level {} : {}".format(level, msg))
Example #27
def main():
    print('hello')
    sc = SparkContext.getOrCreate()
    x = sc.parallelize([("USA", 1), ("USA", 2), ("India", 1), ("UK", 1),
                        ("India", 4), ("India", 9), ("USA", 8), ("USA", 3),
                        ("India", 4), ("UK", 6), ("UK", 9), ("UK", 5)], 3)

    ## groupByKey with default partitions
    y = x.groupByKey()

    ## Check partitions
    print('Output: ', y.getNumPartitions())

    y = x.groupByKey(2)
    print('Output: ', y.getNumPartitions())
Example #28
def swapKeysRDD(rdd, keymap):
    """
		The method swaps the keys of a key-value RDD with a new set of keys provided in @keymap.
		
		returns: a new <pyspark.rdd.RDD>
	"""
    sc = SparkContext.getOrCreate()

    # Broadcast new keys
    keymap = sc.broadcast(keymap)

    # Swap old keys with new ones.
    rdd = rdd.map(lambda kv: (keymap.value[kv[0]], kv[1]))

    return rdd
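A minimal usage sketch for swapKeysRDD with illustrative data:

sc = SparkContext.getOrCreate()
pairs = sc.parallelize([("a", 1), ("b", 2)])
print(swapKeysRDD(pairs, {"a": "alpha", "b": "beta"}).collect())
# [('alpha', 1), ('beta', 2)]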
Example #29
def main():
    potential_clones = '../Datasource/pc2.xml'
    output_csv = 'csvCodes.csv'
    df = convertAndSaveAsCSV(potential_clones, output_csv, False)

    # spark context
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df)

    transformed_spark_df = spark_df.rdd.map(distributedSourceTransform)

    pysparkdf_transformedClones = transformed_spark_df.toDF()

    pysparkdf_transformedClones.toPandas().to_csv('results.csv')
Example #30
def main():
    args = parse_args()
    sc = SparkContext.getOrCreate()

    # prep data: read csv and split comma delimited lines into tuples
    rdd = sc.textFile(args.input_path)
    temp = rdd.map(lambda x: x.split(','))

    # sort by country first; on ties, sort by timestamp
    sorted_rdd = temp.sortBy(lambda x: (x[2], x[14]))
    # concat the fields back into one comma-separated string
    ans = sorted_rdd.map(lambda x: ','.join(str(y) for y in x))

    # save result to specified location
    ans.repartition(1).saveAsTextFile(args.output_path + 'result.csv')
Example #31
def annoy_model(als_model,
                sc,
                groundTruth_test,
                test_users,
                n_trees=10,
                search_k=-1):
    print(f"annoy model with n_trees: {n_trees}, search_k: {search_k}")

    sc = SparkContext.getOrCreate()

    user_factors = als_model.userFactors
    size = user_factors.limit(1).select(
        F.size("features").alias("calc_size")).collect()[0].calc_size
    start_time = time()
    index_size = AnnoyIndex(size)

    for row in user_factors.collect():
        index_size.add_item(row.id, row.features)

    index_size.build(n_trees)
    index_size.save("./annoy_result/annoy_t" + str(n_trees) + "_k_" +
                    str(search_k) + ".ann")

    rec_list = [(user.user_id,
                 index_size.get_nns_by_item(int(user.user_id), 500))
                for user in test_users.collect()]

    temp = sc.parallelize(rec_list)

    print("Annoy-Recommendations (500) created for test users")

    rec = spark.createDataFrame(temp, ["user_id", "recommendations"])

    pred_test = rec.join(groundTruth_test,
                         rec.user_id == groundTruth_test.user_id, 'inner')

    predAndLabels_test_annoy = pred_test.select('recommendations',
                                                'test_truth').rdd.map(tuple)

    metrics_test_annoy = RankingMetrics(predAndLabels_test_annoy)
    precision_test_annoy = metrics_test_annoy.precisionAt(500)
    map_test_annoy = metrics_test_annoy.meanAveragePrecision

    print(f"Time taken: {time() - start_time}s")
    print(f"Precision at 500: {precision_test_annoy}")
    print(f"Mean Average Precision: {map_test_annoy}")

    index_size.unload()
Example #32
def get_spark():
    conf = SparkConf()

    # Load in a jar that provides extended string comparison functions such as Jaro Winkler.
    # Splink

    # No longer needed in spark 3.0?
    #conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar")
    #conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar")

    # spark.jars takes a single comma-separated list; setting it twice would
    # leave only the last jar, so both are passed in one call.
    conf.set(
        "spark.jars",
        "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar,"
        "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar"
    )
    # SET TO YOUR SPARK INSTALLATION

    # WARNING:
    # These config options are appropriate only if you're running Spark locally!!!
    conf.set("spark.driver.memory", "1g")
    conf.set("spark.sql.shuffle.partitions", "4")

    #conf.set("spark.sql.files.maxPartitionBytes","536870912")
    #conf.set("spark.sql.files.maxPartitionBytes","250000000")
    #conf.set("spark.sql.files.maxPartitionBytes","134217728")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types
    '''
    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction(
        "Dmetaphone", "uk.gov.moj.dash.linkage.DoubleMetaphone", types.StringType()
    )
    '''

    return spark
Example #33
        def getOrCreate(self):
            """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
            new one based on the options set in this builder.

            This method first checks whether there is a valid global default SparkSession, and if
            yes, return that one. If no valid global default SparkSession exists, the method
            creates a new SparkSession and assigns the newly created SparkSession as the global
            default.

            >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
            >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1"
            True

            In case an existing SparkSession is returned, the config options specified
            in this builder will be applied to the existing SparkSession.

            >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
            >>> s1.conf.get("k1") == s2.conf.get("k1")
            True
            >>> s1.conf.get("k2") == s2.conf.get("k2")
            True
            """
            with self._lock:
                from pyspark.context import SparkContext
                from pyspark.conf import SparkConf
                session = SparkSession._instantiatedSession
                if session is None or session._sc._jsc is None:
                    if self._sc is not None:
                        sc = self._sc
                    else:
                        sparkConf = SparkConf()
                        for key, value in self._options.items():
                            sparkConf.set(key, value)
                        sc = SparkContext.getOrCreate(sparkConf)
                        # This SparkContext may be an existing one.
                    for key, value in self._options.items():
                        # we need to propagate the confs
                        # before we create the SparkSession. Otherwise, confs like
                        # warehouse path and metastore url will not be set correctly (
                        # these confs cannot be changed once the SparkSession is created).
                        sc._conf.set(key, value)
                    session = SparkSession(sc)
                for key, value in self._options.items():
                    session._jsparkSession.sessionState().conf().setConfString(key, value)
                for key, value in self._options.items():
                    session.sparkContext._conf.set(key, value)
                return session
Example #34
        def getOrCreate(self):
            """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
            new one based on the options set in this builder.

            This method first checks whether there is a valid global default SparkSession, and if
            yes, return that one. If no valid global default SparkSession exists, the method
            creates a new SparkSession and assigns the newly created SparkSession as the global
            default.

            >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
            >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1"
            True

            In case an existing SparkSession is returned, the config options specified
            in this builder will be applied to the existing SparkSession.

            >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
            >>> s1.conf.get("k1") == s2.conf.get("k1")
            True
            >>> s1.conf.get("k2") == s2.conf.get("k2")
            True
            """
            with self._lock:
                from pyspark.context import SparkContext
                from pyspark.conf import SparkConf

                session = SparkSession._instantiatedContext
                if session is None:
                    sparkConf = SparkConf()
                    for key, value in self._options.items():
                        sparkConf.set(key, value)
                    sc = SparkContext.getOrCreate(sparkConf)
                    # This SparkContext may be an existing one.
                    for key, value in self._options.items():
                        # we need to propagate the confs
                        # before we create the SparkSession. Otherwise, confs like
                        # warehouse path and metastore url will not be set correctly (
                        # these confs cannot be changed once the SparkSession is created).
                        sc._conf.set(key, value)
                    session = SparkSession(sc)
                for key, value in self._options.items():
                    session.conf.set(key, value)
                for key, value in self._options.items():
                    session.sparkContext._conf.set(key, value)
                return session
Example #35
    def cast(self, dataType):
        """ Convert the column into type ``dataType``.

        >>> df.select(df.age.cast("string").alias('ages')).collect()
        [Row(ages=u'2'), Row(ages=u'5')]
        >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
        [Row(ages=u'2'), Row(ages=u'5')]
        """
        if isinstance(dataType, basestring):
            jc = self._jc.cast(dataType)
        elif isinstance(dataType, DataType):
            from pyspark.sql import SQLContext
            sc = SparkContext.getOrCreate()
            ctx = SQLContext.getOrCreate(sc)
            jdt = ctx._ssql_ctx.parseDataType(dataType.json())
            jc = self._jc.cast(jdt)
        else:
            raise TypeError("unexpected type: %s" % type(dataType))
        return Column(jc)
Example #36
        def getOrCreate(self):
            """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
            new one based on the options set in this builder.

            This method first checks whether there is a valid thread-local SparkSession,
            and if yes, return that one. It then checks whether there is a valid global
            default SparkSession, and if yes, return that one. If no valid global default
            SparkSession exists, the method creates a new SparkSession and assigns the
            newly created SparkSession as the global default.

            In case an existing SparkSession is returned, the config options specified
            in this builder will be applied to the existing SparkSession.
            """
            with self._lock:
                from pyspark.conf import SparkConf
                from pyspark.context import SparkContext
                from pyspark.sql.context import SQLContext
                sparkConf = SparkConf()
                for key, value in self._options.items():
                    sparkConf.set(key, value)
                sparkContext = SparkContext.getOrCreate(sparkConf)
                return SQLContext.getOrCreate(sparkContext).sparkSession
Example #37
#   - Python 3: `PYSPARK_PYTHON=python3 spark-submit --master local[*] --driver-class-path SystemML.jar test_mlcontext.py`

# Make the `systemml` package importable
import os
import sys
path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../")
sys.path.insert(0, path)

import unittest

import numpy as np
from pyspark.context import SparkContext

from systemml import MLContext, dml, pydml

sc = SparkContext.getOrCreate()
ml = MLContext(sc)

class TestAPI(unittest.TestCase):

    def test_output_string(self):
        script = dml("x1 = 'Hello World'").output("x1")
        self.assertEqual(ml.execute(script).get("x1"), "Hello World")

    def test_output_list(self):
        script = """
        x1 = 0.2
        x2 = x1 + 1
        x3 = x1 + 2
        """
        script = dml(script).output("x1", "x2", "x3")
Example #38
#  on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
#  or implied. See the License for the specific language governing
#  permissions and limitations under the License.

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

# catalog: database and table name
db_name = "medicare"
tbl_name = "medicare"

# s3 output directories
medicare_cast = "s3://glue-sample-target/output-dir/medicare_json_cast"
medicare_project = "s3://glue-sample-target/output-dir/medicare_json_project"
medicare_cols = "s3://glue-sample-target/output-dir/medicare_json_make_cols"
medicare_struct = "s3://glue-sample-target/output-dir/medicare_json_make_struct"
medicare_sql = "s3://glue-sample-target/output-dir/medicare_json_sql"

# Read data into a dynamic frame
medicare_dyf = glueContext.create_dynamic_frame.from_catalog(database = db_name, table_name = tbl_name)