def getOrCreate(self):
    """Return the cached :class:`SparkSession`, building one if none is cached.

    The session stored on ``SparkSession._instantiatedContext`` is reused when
    it is not ``None``.  Otherwise a :class:`SparkConf` is populated from this
    builder's options, a :class:`SparkContext` is obtained with
    ``SparkContext.getOrCreate`` and a new session is wrapped around it.

    >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
    >>> s1.conf.get("k1") == "v1"
    True

    Whether the session was reused or freshly created, every option set on
    this builder is applied to its runtime configuration.

    >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
    >>> s1.conf.get("k1") == s2.conf.get("k1")
    True
    >>> s1.conf.get("k2") == s2.conf.get("k2")
    True
    """
    with self._lock:
        from pyspark.context import SparkContext
        from pyspark.conf import SparkConf

        active = SparkSession._instantiatedContext
        if active is None:
            # Nothing cached: seed a conf from the builder options first.
            conf = SparkConf()
            for name, setting in self._options.items():
                conf.set(name, setting)
            active = SparkSession(SparkContext.getOrCreate(conf))

        # Applied in both cases so a reused session also sees the options.
        for name, setting in self._options.items():
            active.conf.set(name, setting)
        return active
def read_to_rdd() -> RDD:
    """Read ``LOCAL_FILE`` and parse it as CSV.

    Returns:
        RDD: one element per row yielded by ``csv.reader``.
    """
    context = SparkContext.getOrCreate()
    raw_lines = context.textFile(LOCAL_FILE)
    # Each partition's lines are handed to a single csv.reader.
    return raw_lines.mapPartitions(lambda partition: csv.reader(partition))
def get_resource_data(self):
    """Load the resource CSV and return only the configured columns.

    The file location comes from ``settings.PATH`` and the column list from
    ``settings.COLUMNS``.  The CSV is read with a header row and schema
    inference enabled.
    """
    csv_path = settings.PATH
    wanted = settings.COLUMNS

    session = SparkSession(SparkContext.getOrCreate())
    full_df = session.read.csv(csv_path, inferSchema=True, header=True)
    return full_df.select(wanted)
def list_product_count(rdd: RDD):
    """[task 2b - saves the product count into a txt file]

    The input is flattened with ``explode``, its elements are counted, and a
    two-line result ("Count:" followed by the number) is written as a single
    partition to ``out/out_1_2b.txt``.

    Args:
        rdd (RDD): [input RDD whose elements are expanded by ``explode``]
    """
    context = SparkContext.getOrCreate()
    total = rdd.flatMap(explode).count()

    count_line = context.parallelize([str(total)])
    title_line = context.parallelize(["Count:"])

    # coalesce(1) so the header and the number land in one part file.
    report = title_line.union(count_line).coalesce(1)
    report.saveAsTextFile("out/out_1_2b.txt")
def _init_glue_context():
    """Create a GlueContext after adjusting two Hadoop configuration flags.

    ``mapreduce.fileoutputcommitter.marksuccessfuljobs`` and
    ``parquet.enable.summary-metadata`` are both set to ``"false"`` on the
    Spark context's Hadoop configuration before it is wrapped.
    """
    # Imports are done here so we can isolate the configuration of this job
    from awsglue.context import GlueContext
    from pyspark.context import SparkContext

    sc = SparkContext.getOrCreate()
    overrides = (
        ("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false"),
        ("parquet.enable.summary-metadata", "false"),
    )
    for option, setting in overrides:
        sc._jsc.hadoopConfiguration().set(option, setting)  # noqa pylint: disable=protected-access
    return GlueContext(sc)
def setUp(self):
    """Attach a local Spark context to the test case and reduce JVM log noise."""
    # Create a local Spark context with 4 cores
    conf = SparkConf()
    conf.setMaster('local[4]')
    conf.setAppName("monasca-transform unit tests")
    conf.set("spark.sql.shuffle.partitions", "10")
    self.spark_context = SparkContext.getOrCreate(conf=conf)

    # quiet logging
    log4j = self.spark_context._jvm.org.apache.log4j
    for logger_name in ("org", "akka"):
        log4j.LogManager.getLogger(logger_name).setLevel(log4j.Level.WARN)
def setUp(self):
    """Prepare ``self.spark_context`` (local, 4 cores) for the tests.

    The ``org`` and ``akka`` log4j loggers are set to WARN so that test
    output is not flooded.
    """
    # Create a local Spark context with 4 cores
    settings = SparkConf().setMaster('local[4]')
    settings = settings.setAppName("monasca-transform unit tests")
    settings = settings.set("spark.sql.shuffle.partitions", "10")
    self.spark_context = SparkContext.getOrCreate(conf=settings)

    # quiet logging
    jvm_log4j = self.spark_context._jvm.org.apache.log4j
    jvm_log4j.LogManager.getLogger("org").setLevel(jvm_log4j.Level.WARN)
    jvm_log4j.LogManager.getLogger("akka").setLevel(jvm_log4j.Level.WARN)
def predict(self, src, outfile=None):
    """Predict the most likely class for every document in ``src``.

    For each (document, class) pair the per-n-gram likelihoods from
    ``self.conditionals`` are summed, the class prior from ``self.priors``
    is added, and the class with the highest total is kept.

    Args:
        src: input handed to ``computeNGramsRDD`` together with ``self.n``
            and ``self.filter``.
        outfile: optional path.  When given, the predictions are collected
            to the driver and written there as a JSON object.

    Returns:
        RDD of ``(document, cls)`` pairs.
    """
    sc = SparkContext.getOrCreate()

    # Make n-grams. Resultant RDD format: (document <any>, ngram <tuple>)
    ngrams = computeNGramsRDD(src, n=self.n, filter=self.filter)

    # Broadcast the set of class labels.
    blabels = sc.broadcast(set(self.conditionals.keys()))

    # Prepare for estimating class probabilities against each document.
    # Resultant RDD format: ((document, cls), ngram)
    ngrams = ngrams.flatMap(
        lambda pair: [((pair[0], cls), pair[1]) for cls in blabels.value])

    # Broadcast the conditional probabilities.
    bmodel = sc.broadcast(self.conditionals)

    def _to_likelihood(record):
        """Replace the n-gram of ((document, cls), ngram) with its likelihood."""
        (document, cls), ngram = record
        table = bmodel.value[cls]
        estimate = table.get(ngram)
        # Fall back to "unseen" only when the n-gram is missing; a stored
        # likelihood of 0.0 is a real value and must not be discarded.
        if estimate is None:
            estimate = table.get("unseen")
        return ((document, cls), estimate)

    # Resultant RDD format: ((document, cls), likelihood)
    likelihoods = ngrams.map(_to_likelihood)

    # Sum up likelihoods. Resultant RDD format: ((document, cls), likelihood)
    likelihoods = likelihoods.reduceByKey(add)

    # Restructure so the document is the key for grouping.
    # Resultant RDD format: (document, (cls, likelihood))
    likelihoods = likelihoods.map(
        lambda kv: (kv[0][0], (kv[0][1], kv[1])))

    # Broadcast prior probabilities
    bpriors = sc.broadcast(self.priors)

    # Add prior probabilities to estimates.
    # Resultant RDD format: (document, (cls, likelihood))
    likelihoods = likelihoods.map(
        lambda kv: (kv[0], (kv[1][0], kv[1][1] + bpriors.value[kv[1][0]])))

    # Group predictions by document: (document, iterable of (cls, likelihood))
    likelihoods = likelihoods.groupByKey()

    # Pick the best. Resultant RDD format: (document, (cls, likelihood))
    likelihoods = likelihoods.map(
        lambda kv: (kv[0], max(kv[1], key=lambda estimate: estimate[1])))

    # Drop likelihood estimates, only retain the class.
    predictions = likelihoods.map(lambda kv: (kv[0], kv[1][0]))

    # Save predictions to outfile, if provided
    if outfile is not None:
        with open(outfile, 'w') as handle:
            json.dump(predictions.collectAsMap(), handle)

    return predictions
def main(argv):
    """Sort a CSV by ``cca2`` then ``timestamp`` and write the result as CSV.

    Args:
        argv: sequence whose first item is the input path and whose second
            item is the output path.
    """
    source_path = argv[0]
    target_path = argv[1]

    # "SparkContext is the entry point to any Spark functionality" - tutorialspoint.com
    sql_context = SQLContext(SparkContext.getOrCreate())

    # Reader setup found on stackoverflow: CSV format, first row treated as
    # the header rather than data, malformed rows dropped.
    reader = sql_context.read.format("csv")
    reader = reader.option("header", "true")
    reader = reader.option("mode", "DROPMALFORMED")
    frame = reader.load(source_path)

    # Order by cca2 (country) first, then by timestamp, both ascending.
    ordered = frame.sort(['cca2', 'timestamp'], ascending=[True, True])

    # Output to file
    ordered.write.csv(target_path)
def main():
    """Print the record count and schema of the ``persons_json`` catalog table."""
    # Create a Glue context
    glue = GlueContext(SparkContext.getOrCreate())

    # Create a DynamicFrame using the 'persons_json' table
    persons = glue.create_dynamic_frame.from_catalog(
        database="legislators",
        table_name="persons_json")

    # Print out information about this data
    record_count = persons.count()
    print("Count: ", record_count)
    persons.printSchema()
def _fit(self, dataset):
    """Train a TensorFlow model on ``dataset`` and wrap the result in a TFModel.

    Args:
      :dataset: A Spark DataFrame; the columns named by the input mapping are
                selected (in sorted order) and fed to the cluster.

    Returns:
      A TFModel built from ``self.args`` with this estimator's params copied
      onto it.

    Raises:
      ValueError: an export function is set but ``export_dir`` is empty.
      Exception: an export function is set while running under TF 2.x.
    """
    spark_context = SparkContext.getOrCreate()

    logger.info("===== 1. train args: {0}".format(self.args))
    logger.info("===== 2. train params: {0}".format(self._paramMap))
    merged = self.merge_args_params()
    logger.info("===== 3. train args + params: {0}".format(merged))

    # An explicit argv takes precedence over the merged args/params.
    tf_args = self.args.argv or merged

    cluster = TFCluster.run(
        spark_context,
        self.train_fn,
        tf_args,
        merged.cluster_size,
        merged.num_ps,
        merged.tensorboard,
        TFCluster.InputMode.SPARK,
        master_node=merged.master_node,
        driver_ps_nodes=merged.driver_ps_nodes)

    # feed data, using a deterministic order for input columns (lexicographic by key)
    feature_columns = sorted(self.getInputMapping())
    cluster.train(dataset.select(feature_columns).rdd, merged.epochs)
    cluster.shutdown(grace_secs=self.getGraceSecs())

    if self.export_fn:
        running_tf1 = version.parse(TF_VERSION) < version.parse("2.0.0")
        if not running_tf1:
            # for TF2.x
            raise Exception(
                "Please use native TF2.x APIs to export a saved_model.")

        # For TF1.x, run export function, if provided
        if not merged.export_dir:
            raise ValueError(
                "Export function requires --export_dir to be set")
        logging.info(
            "Exporting saved_model (via export_fn) to: {}".format(
                merged.export_dir))

        def _export(iterator, fn, args):
            single_node_env(args)
            fn(args)

        # Run on a single executor
        single_partition = spark_context.parallelize([1], 1)
        single_partition.foreachPartition(
            lambda it: _export(it, self.export_fn, tf_args))

    return self._copyValues(TFModel(self.args))
def main():
    """Read the sample JSON data from S3 and print its count and schema."""
    # Create a Glue context
    context = GlueContext(SparkContext.getOrCreate())

    # Load the persons data directly from S3.  Catalog-based alternatives
    # that were tried earlier are kept below for reference:
    # persons_dyf = glue.read().catalog("legislators", "persons_json")
    # persons_dyf = glueContext.create_dynamic_frame.from_catalog(database="legislators", table_name="persons_json")
    frame = context.read.json("s3://bertolb/sampledata/mockaroo/json")

    # Print out information about this data
    record_count = frame.count()
    print("Count: ", record_count)
    frame.printSchema()
def main():
    """Incrementally process ``nvn_knowledge_learning_object`` using a key flag.

    Reads the catalog table, keeps only rows whose ``_key`` is greater than
    the value stored in the flag parquet file, prints the schema and data,
    and, when rows remain, writes the new maximum ``_key`` back as the flag.

    NOTE(review): the source formatting was lost; the flag update at the end
    is assumed to belong inside the ``if`` block -- confirm against the
    original file.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    ########## dyf_learning_object
    dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="nvn_knowledge_learning_object"
    )
    dyf_learning_object = dyf_learning_object.select_fields(
        ['_key', 'learning_object_id', 'learning_object_name', 'phone_tic'])
    # Convert the data type of _key
    dyf_learning_object = dyf_learning_object.resolveChoice(specs=[('_key', 'cast:long')])

    # Read the latest max key and only load data greater than that key
    # (avoids reloading everything, since earlier records already went
    # through the ETL).
    df_flag = spark.read.parquet("s3://dts-odin/flag/flag_LO.parquet")
    max_key = df_flag.collect()[0]['flag']
    # Compare the datasource _key with the flag; keep rows with key > flag
    dyf_learning_object = Filter.apply(frame=dyf_learning_object, f=lambda x: x["_key"] > max_key)
    # Show schema
    dyf_learning_object.printSchema()
    # Show the data of the dynamic frame
    dyf_learning_object.show()
    # Only continue when there are records
    if (dyf_learning_object.count() > 0):
        apply_mapping_learning_object = ApplyMapping.apply(frame=dyf_learning_object,
                                                           mappings=[("learning_object_id", "int", "learning_object_id", "int"),
                                                                     ("learning_object_name", "string", "learning_object_name", "string"),
                                                                     ("phone_tic", "string", "phone_tic", "string")])
        resolve_choice_learning_object = ResolveChoice.apply(frame=apply_mapping_learning_object, choice="make_cols",
                                                             transformation_ctx="resolve_choice_learning_object")
        dropnullfields_learning_object = DropNullFields.apply(frame=resolve_choice_learning_object,
                                                              transformation_ctx="dropnullfields_learning_object")
        # The Redshift write below is disabled, so the three frames above
        # are currently built but not written anywhere in this function.
        #
        # save_learning_object = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields_learning_object,
        #                                                                       catalog_connection="glue_redshift",
        #                                                                       connection_options={
        #                                                                           "dbtable": "learning_object",
        #                                                                           "database": "dts_odin"
        #                                                                       },
        #                                                                       redshift_tmp_dir="s3n://dts-odin/temp/tig_advisor/user_profile_student_contact/",
        #                                                                       transformation_ctx="datasink4")

        # Get the max key in the data source
        datasource = dyf_learning_object.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        # Convert it into a single-column ('flag') DataFrame
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # Overwrite the stored flag in S3.
        # NOTE(review): the flag is read via "s3://" but written via "s3a://".
        df.write.parquet("s3a://dts-odin/flag/flag_LO.parquet", mode="overwrite")
def list_top_purchased_products(rdd: RDD):
    """[task 3 - saves top 5 purchased products into txt file]

    Elements produced by ``explode`` are counted per value, the five with the
    highest counts are taken on the driver and written as a single partition
    to ``out/out_1_3.txt``.

    Args:
        rdd (RDD): [spark RDD]
    """
    context = SparkContext.getOrCreate()

    purchase_counts = (
        rdd.flatMap(explode)
        .map(lambda product: (product, 1))
        .reduceByKey(lambda left, right: left + right)
    )
    # Negated count as key: takeOrdered is ascending, we want the largest.
    top_five = purchase_counts.takeOrdered(5, key=lambda pair: -pair[1])

    result = context.parallelize(top_five).coalesce(1)
    result.saveAsTextFile("out/out_1_3.txt")
def main():
    """Export every not-yet-processed catalog table to S3 as parquet.

    Steps: start the Glue job, list all tables of ``args["athena_database"]``,
    empty ``args["target_bucket"]``, then for each table whose name contains
    no digit drop null fields and write it under
    ``s3://<target_bucket>/<table name>/``.  The job is committed at the end.

    The table listing walks every page of the Glue ``GetTables`` response;
    a single ``get_tables`` call only returns the first page, so larger
    databases were silently truncated.
    """
    console("Starting Job")
    ## @params: [job_name]
    # 1. Start Glue Context
    glueContext = GlueContext(SparkContext.getOrCreate())

    # 2. Initialize Job
    job = Job(glueContext)
    args = get_args()
    job.init(args["JOB_NAME"], args)

    client = boto3.client("glue", region_name="us-east-1")
    # Collect all names before the bucket is cleared, as before.
    table_names = []
    paginator = client.get_paginator("get_tables")
    for page in paginator.paginate(DatabaseName=args["athena_database"]):
        for table in page["TableList"]:
            table_names.append(table["Name"])

    # 3. Clear bucket contents
    console(f"Excluding S3 files for: {args['target_bucket']}")
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(args["target_bucket"])
    bucket.objects.delete()

    for table_name in table_names:
        if re.search(r"[0-9]+", table_name):
            # Ignoring Athena Tables already processed...
            continue

        console("Processing Table {}".format(table_name))

        # 4. Load dynamic dataframe
        datasource0 = glueContext.create_dynamic_frame.from_catalog(
            database=args["athena_database"],
            table_name=table_name,
            transformation_ctx="datasource0",
        )

        # 5. Process Dynamo database
        # Drop null fields
        dropnullfields1 = DropNullFields.apply(
            frame=datasource0, transformation_ctx="dropnullfields1")

        # Save Dynamic Frame on S3 using Glue
        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path": "s3://{}/{}/".format(args["target_bucket"], table_name)
            },
            format="parquet",
            transformation_ctx="datasink2",
        )

    job.commit()
def _fit(self, dataset):
    """Train a TensorFlow model on ``dataset`` and wrap the result in a TFModel.

    In ``InputMode.TENSORFLOW`` the DataFrame is first made available as
    TFRecords (reusing its source path when it was loaded from TFRecords,
    exporting it to ``tfrecord_dir`` otherwise).  In ``InputMode.SPARK`` the
    mapped input columns are fed to the cluster directly.

    Args:
      :dataset: A Spark DataFrame with columns that will be mapped to TensorFlow tensors.

    Returns:
      A TFModel built from ``self.args`` with this estimator's params copied
      onto it.
    """
    spark_context = SparkContext.getOrCreate()

    logging.info("===== 1. train args: {0}".format(self.args))
    logging.info("===== 2. train params: {0}".format(self._paramMap))
    params = self.merge_args_params()
    logging.info("===== 3. train args + params: {0}".format(params))

    if params.input_mode == TFCluster.InputMode.TENSORFLOW:
        if not dfutil.isLoadedDF(dataset):
            # otherwise, save as tfrecords and point to save path
            assert params.tfrecord_dir, "Please specify --tfrecord_dir to export DataFrame to TFRecord."
            if self.getInputMapping():
                # if input mapping provided, filter only required columns before exporting
                dataset = dataset.select(list(self.getInputMapping()))
            logging.info("Exporting DataFrame {} as TFRecord to: {}".format(dataset.dtypes, params.tfrecord_dir))
            dfutil.saveAsTFRecords(dataset, params.tfrecord_dir)
            logging.info("Done saving")
        else:
            # if just a DataFrame loaded from tfrecords, just point to original source path
            logging.info("Loaded DataFrame of TFRecord.")
            params.tfrecord_dir = dfutil.loadedDF[dataset]

    # An explicit argv takes precedence over the merged args/params.
    tf_args = self.args.argv or params
    cluster = TFCluster.run(
        spark_context,
        self.train_fn,
        tf_args,
        params.cluster_size,
        params.num_ps,
        params.tensorboard,
        params.input_mode,
        driver_ps_nodes=params.driver_ps_nodes)

    if params.input_mode == TFCluster.InputMode.SPARK:
        # feed data, using a deterministic order for input columns (lexicographic by key)
        feature_columns = sorted(self.getInputMapping())
        cluster.train(dataset.select(feature_columns).rdd, params.epochs)
    cluster.shutdown()

    # Run export function, if provided
    if self.export_fn:
        assert params.export_dir, "Export function requires --export_dir to be set"
        logging.info("Exporting saved_model (via export_fn) to: {}".format(params.export_dir))

        def _export(iterator, fn, args):
            single_node_env(args)
            fn(args)

        # Run on a single executor
        single_partition = spark_context.parallelize([1], 1)
        single_partition.foreachPartition(lambda it: _export(it, self.export_fn, tf_args))

    return self._copyValues(TFModel(self.args))
def __main__():
    """Sort the space-separated integers of a text file and write them out.

    ``sys.argv[1]`` is the input file and ``sys.argv[2]`` the output file,
    which receives one integer per line in ascending order.
    """
    context = SparkContext.getOrCreate()
    context.setLogLevel('OFF')

    tokens = context.textFile(sys.argv[1]).flatMap(lambda line: line.split(" "))
    # Pair each number with a dummy 1 so sortByKey can order by the number.
    ordered_pairs = tokens.map(lambda token: (int(token), 1)).sortByKey()
    collected = ordered_pairs.collect()

    with open(sys.argv[2], 'w') as sink:
        sink.writelines(str(number) + "\n" for number, _ in collected)
def main():
    """Load new ``log_student_level_study`` rows and keep the complete ones.

    The catalog table is read, ``_key`` is cast to long, and rows are
    restricted to those whose ``_key`` is greater than the flag stored in S3.
    If the flag cannot be read, the unfiltered frame is used (best effort).
    When rows remain, those with a missing ``contact_id``, ``level_modified``,
    ``package_code`` or ``time_created`` are filtered out and the schema is
    printed.
    """
    # Job bootstrap is currently disabled:
    # # @params: [TempDir, JOB_NAME]
    # args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])
    # sc = SparkContext()
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # get dynamic frame source
    dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_level_study")

    # Select the fields
    dyf_log_student_level_study = dyf_log_student_level_study.select_fields([
        '_key', 'contact_id', 'level_modified', 'package_code', 'time_created'
    ])
    dyf_log_student_level_study = dyf_log_student_level_study.resolveChoice(
        specs=[('_key', 'cast:long')])
    dyf_log_student_level_study.printSchema()
    dyf_log_student_level_study.show(2)

    # check bucket is not null
    try:
        # Read the flag marker from S3
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/toa_L3150/toa_8_log_student_level_study.parquet"
        )
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)

        # Compare the datasource _key with the flag; keep rows with key > flag
        dyf_log_student_level_study = Filter.apply(
            frame=dyf_log_student_level_study,
            f=lambda x: x['_key'] > start_read)
    except Exception:
        # Deliberately best effort: without a readable flag the whole table
        # is processed.  Exception (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        print('read flag file error ')

    # count() is a Spark action; run it once and reuse the result.
    new_contacts = dyf_log_student_level_study.count()
    print('the number of new contacts: ', new_contacts)

    if new_contacts > 0:
        dyf_log_student_level_study = Filter.apply(
            frame=dyf_log_student_level_study,
            f=lambda x: x['contact_id'] is not None
            and x['level_modified'] is not None
            and x['package_code'] is not None
            and x['time_created'] is not None)
        dyf_log_student_level_study.printSchema()
def getOrCreate(self):
    """Return the :class:`SparkSession` of the current SQLContext.

    A :class:`SparkConf` is filled from this builder's options and passed to
    ``SparkContext.getOrCreate``; the session is then taken from
    ``SQLContext.getOrCreate`` on that context.
    """
    with self._lock:
        from pyspark.conf import SparkConf
        from pyspark.context import SparkContext
        from pyspark.sql.context import SQLContext

        conf = SparkConf()
        for option, setting in self._options.items():
            conf.set(option, setting)

        context = SparkContext.getOrCreate(conf)
        sql_context = SQLContext.getOrCreate(context)
        return sql_context.sparkSession
def broadcast(self):
    """Broadcast self to ensure we are shared.

    The broadcast handle is created on first use and cached in
    ``self._broadcast``; later calls return the cached handle.

    While ``sc.broadcast(self)`` runs, ``__empty_please`` is set to True and
    the class-level lock is held.  The flag is always reset and the lock is
    always released, even when the broadcast raises.
    NOTE(review): ``__empty_please`` is presumably consulted while the object
    is serialized -- confirm against the rest of the class.

    Returns:
        The cached broadcast handle for this object.
    """
    if self._broadcast is None:
        from pyspark.context import SparkContext
        sc = SparkContext.getOrCreate()
        # Acquire outside the try block so that `finally` can never release
        # a lock this call did not obtain.
        SpacyMagic.__lock.acquire()
        try:
            # Re-check under the lock: another caller may have finished the
            # broadcast while we were waiting.
            if self._broadcast is None:
                self.__empty_please = True
                try:
                    self._broadcast = sc.broadcast(self)
                finally:
                    # Reset even on failure so the flag does not stay set.
                    self.__empty_please = False
        finally:
            SpacyMagic.__lock.release()
    return self._broadcast
def getOrCreate(self):
    """Obtain a :class:`SparkSession` configured with this builder's options.

    The options are copied into a fresh :class:`SparkConf`, which is used for
    ``SparkContext.getOrCreate``.  The returned session is the one exposed by
    ``SQLContext.getOrCreate`` for that context.
    """
    with self._lock:
        from pyspark.conf import SparkConf
        from pyspark.context import SparkContext
        from pyspark.sql.context import SQLContext

        spark_conf = SparkConf()
        for name, val in self._options.items():
            spark_conf.set(name, val)
        spark_context = SparkContext.getOrCreate(spark_conf)
        return SQLContext.getOrCreate(spark_context).sparkSession
def computeLikelihoodsRDD(tfs, alpha=0.0, vsize=1.0, logs=True):
    """
    Turn an RDD of term-frequency tables into likelihood tables.

    The flag @logs dictates whether or not to take the log of the
    likelihoods. Additive smoothing can be applied before computing the
    estimator: @alpha <float> is the smoothing parameter and @vsize <int>
    (|V|) is the size of the vocabulary. See
    https://en.wikipedia.org/wiki/Additive_smoothing. No smoothing is
    performed by default.

    Input RDD: (document <any>, {word: count})
    Output RDD: (document <any>, {word: likelihood, .... , "unseen": likelihood})

    Note: The likelihood of an unseen attribute computed via additive
    smoothing is added to the word-likelihood map with the key 'unseen'
    <string>. When a likelihood is exactly zero (e.g. "unseen" with
    alpha == 0) and @logs is set, its log likelihood is reported as -inf
    instead of evaluating log(0).

    Raises (inside the Spark job): ZeroDivisionError when a table's total
    weight plus alpha * vsize is zero.
    """
    sc = SparkContext.getOrCreate()

    # Broadcast the variables required for computing likelihoods
    blogs = sc.broadcast(float(logs))
    balpha = sc.broadcast(float(alpha))
    bvsize = sc.broadcast(float(vsize))

    def _log_or_neg_inf(probability):
        """Return log(probability), or -inf for a zero probability."""
        return log(probability) if probability > 0 else float("-inf")

    # Defined here because it relies on the variables broadcast above and
    # would not make sense as a generic function of the module.
    def computeLikelihoods(table):
        """ The map function for likelihood estimation. """
        smoothing = balpha.value
        # Total weight
        total = sum(table[word] for word in table)
        # Shared denominator: total + alpha * |V|
        denominator = total + smoothing * bvsize.value

        # (tf + alpha) / (total + alpha * |V|)
        newtable = {
            word: (table[word] + smoothing) / denominator for word in table
        }
        newtable["unseen"] = smoothing / denominator

        # Case 1. logs flag is set: convert to log likelihoods.
        if blogs.value:
            newtable = {
                word: _log_or_neg_inf(value)
                for word, value in newtable.items()
            }
        # Case 2. vanilla likelihoods are returned unchanged.
        return newtable

    # Transform term frequencies RDD into an estimator
    return tfs.mapValues(computeLikelihoods)
def main():
    """Load the ``persons_json`` catalog table and print its count and schema."""
    # Invoke pydevd (remote debugging hook, currently disabled)
    # pydevd.settrace('169.254.76.0', port=9001, stdoutToServer=True, stderrToServer=True)

    # Create a Glue context
    glue_context = GlueContext(SparkContext.getOrCreate())

    # Create a DynamicFrame using the 'persons_json' table
    persons_frame = glue_context.create_dynamic_frame.from_catalog(
        database="legislators",
        table_name="persons_json",
    )

    # Print out information about this data
    total = persons_frame.count()
    print("Count: ", total)
    persons_frame.printSchema()
def __init__(self, bucket_name, repartition_number, is_local=False):
    """Set up the SQL context and the S3 bucket handle.

    Args:
        bucket_name: name of the S3 bucket; stored and used to build the
            boto3 bucket resource.
        repartition_number: stored as-is on the instance.
        is_local: when true, a ``LocalContext("local[*]")`` backs the SQL
            context; otherwise the Glue context's ``spark_session`` does.
    """
    if is_local:
        engine = LocalContext("local[*]")
    else:
        engine = GlueContext(SparkContext.getOrCreate()).spark_session

    self.sql_context = SQLContext(engine)
    self.s3_bucket = bucket_name
    self.repartition_number = repartition_number
    self.bucket_resource = boto3.resource('s3').Bucket(bucket_name)
def _get_or_create_context():
    """Return the module-wide Spark context, creating it on first use.

    On creation, executor and driver memory are set as system properties,
    the master and ``spark.driver.maxResultSize`` come from
    ``GlobalPreferences['spark_configuration']``, and the log level is set
    to WARN.  The context is cached in the module global ``_sc``.
    """
    global _sc
    if _sc:
        return _sc

    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext

    prefs = GlobalPreferences['spark_configuration']
    SparkContext.setSystemProperty("spark.executor.memory", prefs['executor-memory'])
    SparkContext.setSystemProperty("spark.driver.memory", prefs['driver-memory'])

    spark_conf = SparkConf().setMaster(prefs['master'])
    spark_conf = spark_conf.set('spark.driver.maxResultSize', prefs['driver-memory'])

    _sc = SparkContext.getOrCreate(conf=spark_conf)
    _sc.setLogLevel("WARN")
    return _sc
def _log(msg, level):
    """Send ``msg`` to the JVM log4j logger named after this module.

    ``level`` selects the method: 1 -> error, 2 -> warn, 3 -> info, anything
    else -> debug.  If no logger object is obtained, the message is printed
    instead.
    """
    context = SparkContext.getOrCreate()
    log4j = context._jvm.org.apache.log4j
    jvm_logger = log4j.LogManager.getLogger(__name__)

    if not jvm_logger:
        print("Logging level {} : {}".format(level, msg))
        return

    if level == 1:
        jvm_logger.error(msg)
    elif level == 2:
        jvm_logger.warn(msg)
    elif level == 3:
        jvm_logger.info(msg)
    else:
        jvm_logger.debug(msg)
def main():
    """Show how ``groupByKey`` chooses its number of partitions.

    A 3-partition pair RDD is grouped once with the default partitioning and
    once with an explicit partition count of 2; the resulting partition
    counts are printed.
    """
    # print() call instead of the Python 2 print statement, which is a
    # SyntaxError on Python 3 (the rest of the function already uses calls).
    print('hello')
    sc = SparkContext.getOrCreate()
    x = sc.parallelize([("USA", 1), ("USA", 2), ("India", 1),
                        ("UK", 1), ("India", 4), ("India", 9),
                        ("USA", 8), ("USA", 3), ("India", 4),
                        ("UK", 6), ("UK", 9), ("UK", 5)], 3)

    ## groupByKey with default partitions
    y = x.groupByKey()
    ## Check partitions
    print('Output: ', y.getNumPartitions())

    ## groupByKey with an explicit partition count
    y = x.groupByKey(2)
    print('Output: ', y.getNumPartitions())
def swapKeysRDD(rdd, keymap):
    """
    Swap the keys of a key-value RDD with the new keys given in @keymap.

    @rdd: RDD of (key, value) pairs.
    @keymap: mapping from old key to new key; it is broadcast and looked up
        with ``keymap[key]``, so a key missing from it raises KeyError inside
        the Spark job.

    returns: a new <pyspark.rdd.RDD> of (new key, value) pairs.
    """
    sc = SparkContext.getOrCreate()

    # Broadcast new keys
    bkeymap = sc.broadcast(keymap)

    # Swap old keys with new ones. Index-based access replaces the tuple
    # parameter lambda, which is a SyntaxError on Python 3.
    return rdd.map(lambda pair: (bkeymap.value[pair[0]], pair[1]))
def main():
    """Convert the clone XML to CSV, transform each row in Spark, save results.

    The frame returned by ``convertAndSaveAsCSV`` is turned into a Spark
    DataFrame, every row is mapped through ``distributedSourceTransform``,
    and the outcome is written to ``results.csv`` via pandas.
    """
    xml_source = '../Datasource/pc2.xml'
    csv_target = 'csvCodes.csv'
    clones = convertAndSaveAsCSV(xml_source, csv_target, False)

    # spark context
    sql_context = SQLContext(SparkContext.getOrCreate())
    clones_sdf = sql_context.createDataFrame(clones)

    transformed_rows = clones_sdf.rdd.map(distributedSourceTransform)
    transformed_sdf = transformed_rows.toDF()
    transformed_sdf.toPandas().to_csv('results.csv')
def main():
    """Sort a CSV file by country and then by timestamp.

    Lines of ``args.input_path`` are split on commas, ordered by column 2
    (country) with column 14 (timestamp) as the tie-breaker, re-joined and
    written as a single partition to ``args.output_path + 'result.csv'``.
    """
    args = parse_args()
    sc = SparkContext.getOrCreate()

    # prep data: read csv and split comma delimited lines into lists
    rdd = sc.textFile(args.input_path)
    rows = rdd.map(lambda line: line.split(','))

    # Sort by country first; on a tie, sort by timestamp.  Both columns go
    # into one composite key: the second positional argument of sortBy is
    # `ascending`, so a second lambda there never acted as a tie-breaker.
    sorted_rdd = rows.sortBy(lambda row: (row[2], row[14]))

    # concat each row back into one string
    ans = sorted_rdd.map(lambda row: ','.join(str(field) for field in row))

    # save result to specified location
    ans.repartition(1).saveAsTextFile(args.output_path + 'result.csv')
def annoy_model(als_model, sc, groundTruth_test, test_users, n_trees=10, search_k=-1):
    """Build an Annoy index over ALS user factors and score it on test users.

    For every test user the 500 nearest neighbours of that user id are used
    as recommendations; precision at 500 and mean average precision against
    ``groundTruth_test`` are printed together with the elapsed time.  The
    index is saved under ``./annoy_result/``.

    Note: the ``sc`` argument is replaced by ``SparkContext.getOrCreate()``,
    and ``spark`` is not defined in this function, so it has to be provided
    by the enclosing module.  ``search_k`` only appears in the log message
    and the file name.
    """
    print(f"annoy model with n_trees: {n_trees}, search_k: {search_k}")
    sc = SparkContext.getOrCreate()

    factors = als_model.userFactors
    # Dimensionality of the factor vectors, read from a single row.
    first_row = factors.limit(1).select(
        F.size("features").alias("calc_size")).collect()[0]
    size = first_row.calc_size

    start_time = time()
    index = AnnoyIndex(size)
    for factor_row in factors.collect():
        index.add_item(factor_row.id, factor_row.features)
    index.build(n_trees)

    index_path = "./annoy_result/annoy_t" + str(n_trees) + "_k_" + str(search_k) + ".ann"
    index.save(index_path)

    recommendations = [
        (user.user_id, index.get_nns_by_item(int(user.user_id), 500))
        for user in test_users.collect()
    ]
    recommendations_rdd = sc.parallelize(recommendations)
    print("Annoy-Recommendations (500) created for test users")

    rec = spark.createDataFrame(recommendations_rdd, ["user_id", "recommendations"])
    joined = rec.join(groundTruth_test,
                      rec.user_id == groundTruth_test.user_id,
                      'inner')
    pred_and_labels = joined.select('recommendations', 'test_truth').rdd.map(tuple)

    metrics = RankingMetrics(pred_and_labels)
    precision_at_500 = metrics.precisionAt(500)
    mean_avg_precision = metrics.meanAveragePrecision

    print(f"Time taken: {time() - start_time}s")
    print(f"Precision at 500: {precision_at_500}")
    print(f"Mean Average Precision: {mean_avg_precision}")
    index.unload()
def get_spark():
    """Create a locally tuned SparkSession with the similarity UDF jars loaded.

    Both jars are passed through a single comma-separated ``spark.jars``
    value; setting the key twice would keep only the last jar.

    Returns:
        SparkSession: session on a context with the checkpoint directory set
        to ``temp_graphframes/``.
    """
    conf = SparkConf()

    # Load in the jars that provide extended string comparison functions
    # such as Jaro Winkler (Splink).
    # No longer needed in spark 3.0?
    #conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar")
    #conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar")
    # SET TO YOUR SPARK INSTALLATION
    udf_jars = [
        "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar",
        "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar",
    ]
    conf.set("spark.jars", ",".join(udf_jars))

    # WARNING:
    # These config options are appropriate only if you're running Spark locally!!!
    conf.set("spark.driver.memory", "1g")
    conf.set("spark.sql.shuffle.partitions", "4")
    #conf.set("spark.sql.files.maxPartitionBytes","536870912")
    #conf.set("spark.sql.files.maxPartitionBytes","250000000")
    #conf.set("spark.sql.files.maxPartitionBytes","134217728")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # UDF registration is currently disabled:
    # from pyspark.sql import types
    # spark.udf.registerJavaFunction(
    #     "jaro_winkler_sim",
    #     "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
    #     types.DoubleType(),
    # )
    # spark.udf.registerJavaFunction(
    #     "Dmetaphone", "uk.gov.moj.dash.linkage.DoubleMetaphone", types.StringType()
    # )
    return spark
def getOrCreate(self):
    """Return a usable :class:`SparkSession`, creating one when necessary.

    The session in ``SparkSession._instantiatedSession`` is reused unless it
    is ``None`` or its context has no ``_jsc``.  In that case a new session
    is created on ``self._sc`` when the builder holds one, otherwise on
    ``SparkContext.getOrCreate`` with a conf built from the builder options.

    >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
    >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1"
    True

    Whether the session was reused or created, the builder options are
    written to its session state conf and to its context's conf.

    >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
    >>> s1.conf.get("k1") == s2.conf.get("k1")
    True
    >>> s1.conf.get("k2") == s2.conf.get("k2")
    True
    """
    with self._lock:
        from pyspark.context import SparkContext
        from pyspark.conf import SparkConf

        current = SparkSession._instantiatedSession
        needs_new_session = current is None or current._sc._jsc is None
        if needs_new_session:
            if self._sc is None:
                conf = SparkConf()
                for name, setting in self._options.items():
                    conf.set(name, setting)
                context = SparkContext.getOrCreate(conf)
                # This SparkContext may be an existing one, so the confs are
                # propagated to it before the SparkSession is created.
                # Otherwise, confs like warehouse path and metastore url will
                # not be set correctly (these confs cannot be changed once
                # the SparkSession is created).
                for name, setting in self._options.items():
                    context._conf.set(name, setting)
            else:
                context = self._sc
            current = SparkSession(context)

        for name, setting in self._options.items():
            current._jsparkSession.sessionState().conf().setConfString(name, setting)
        for name, setting in self._options.items():
            current.sparkContext._conf.set(name, setting)
        return current
def getOrCreate(self):
    """Return the cached :class:`SparkSession`, creating one when none exists.

    When ``SparkSession._instantiatedContext`` is ``None``, a conf is built
    from the builder options, a context is obtained through
    ``SparkContext.getOrCreate`` and a new session is created on it.

    >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
    >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1"
    True

    Whether the session was reused or created, the builder options are
    written to its runtime conf and to its context's conf.

    >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
    >>> s1.conf.get("k1") == s2.conf.get("k1")
    True
    >>> s1.conf.get("k2") == s2.conf.get("k2")
    True
    """
    with self._lock:
        from pyspark.context import SparkContext
        from pyspark.conf import SparkConf

        current = SparkSession._instantiatedContext
        if current is None:
            conf = SparkConf()
            for name, setting in self._options.items():
                conf.set(name, setting)
            context = SparkContext.getOrCreate(conf)
            # This SparkContext may be an existing one, so the confs are
            # propagated to it before the SparkSession is created.
            # Otherwise, confs like warehouse path and metastore url will not
            # be set correctly (these confs cannot be changed once the
            # SparkSession is created).
            for name, setting in self._options.items():
                context._conf.set(name, setting)
            current = SparkSession(context)

        for name, setting in self._options.items():
            current.conf.set(name, setting)
        for name, setting in self._options.items():
            current.sparkContext._conf.set(name, setting)
        return current
def cast(self, dataType):
    """ Cast this column to ``dataType``.

    ``dataType`` is either a type name string or a ``DataType`` instance;
    anything else raises ``TypeError``.

    >>> df.select(df.age.cast("string").alias('ages')).collect()
    [Row(ages=u'2'), Row(ages=u'5')]
    >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
    [Row(ages=u'2'), Row(ages=u'5')]
    """
    if isinstance(dataType, basestring):
        return Column(self._jc.cast(dataType))

    if isinstance(dataType, DataType):
        from pyspark.sql import SQLContext
        # Parse the type's JSON form through the SQL context and cast to
        # the parsed result.
        sql_ctx = SQLContext.getOrCreate(SparkContext.getOrCreate())
        java_type = sql_ctx._ssql_ctx.parseDataType(dataType.json())
        return Column(self._jc.cast(java_type))

    raise TypeError("unexpected type: %s" % type(dataType))
def getOrCreate(self):
    """Return a :class:`SparkSession` for this builder's options.

    The options are loaded into a new :class:`SparkConf`; the conf is handed
    to ``SparkContext.getOrCreate`` and the session is read from the
    ``SQLContext`` obtained with ``SQLContext.getOrCreate`` for that context.
    """
    with self._lock:
        from pyspark.conf import SparkConf
        from pyspark.context import SparkContext
        from pyspark.sql.context import SQLContext

        builder_conf = SparkConf()
        for opt_key, opt_value in self._options.items():
            builder_conf.set(opt_key, opt_value)

        active_context = SparkContext.getOrCreate(builder_conf)
        active_sql_context = SQLContext.getOrCreate(active_context)
        return active_sql_context.sparkSession
# - Python 3: `PYSPARK_PYTHON=python3 spark-submit --master local[*] --driver-class-path SystemML.jar test_mlcontext.py` # Make the `systemml` package importable import os import sys path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../") sys.path.insert(0, path) import unittest import numpy as np from pyspark.context import SparkContext from systemml import MLContext, dml, pydml sc = SparkContext.getOrCreate() ml = MLContext(sc) class TestAPI(unittest.TestCase): def test_output_string(self): script = dml("x1 = 'Hello World'").output("x1") self.assertEqual(ml.execute(script).get("x1"), "Hello World") def test_output_list(self): script = """ x1 = 0.2 x2 = x1 + 1 x3 = x1 + 2 """ script = dml(script).output("x1", "x2", "x3")
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express # or implied. See the License for the specific language governing # permissions and limitations under the License. import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.dynamicframe import DynamicFrame from awsglue.job import Job from pyspark.sql import SparkSession from pyspark.sql.functions import udf from pyspark.sql.types import StringType glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # catalog: database and table name db_name = "medicare" tbl_name = "medicare" # s3 output directories medicare_cast = "s3://glue-sample-target/output-dir/medicare_json_cast" medicare_project = "s3://glue-sample-target/output-dir/medicare_json_project" medicare_cols = "s3://glue-sample-target/output-dir/medicare_json_make_cols" medicare_struct = "s3://glue-sample-target/output-dir/medicare_json_make_struct" medicare_sql = "s3://glue-sample-target/output-dir/medicare_json_sql" # Read data into a dynamic frame medicare_dyf = glueContext.create_dynamic_frame.from_catalog(database = db_name, table_name = tbl_name)