class SparkContextFactory:
    def __init__(self):
        # not sure why the Windows environment variable can't be read, I set it
        ##os.environ["SPARK_HOME"] = "C:\Spark"
        ##os.environ["HADOOP_CONF_DIR"] = "C:\hdp\bin"
        ##sys.path.append("C:\Spark\python")
        ##sys.path.append("C:\Spark\bin")

        # specify spark home
        os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
        # specify pyspark path so its libraries can be accessed by this application
        sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")

        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SQLContext

        self.conf = SparkConf().setMaster("yarn-client")
        self.conf.setAppName("MrT")
        self.conf.set("spark.executor.memory", "5g")
        self.conf.set("spark.driver.memory", "10g")

        self.sc = SparkContext(conf=self.conf,
                               pyFiles=["ComputeCovHistory.py", "go.py", "risk_DSconvert.py",
                                        "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

        """
        The toDF method is a monkey patch executed inside the SQLContext
        constructor, so to use it you have to create a SQLContext first.
        """
        self.sqlContextInstance = SQLContext(self.sc)

    def disconnect(self):
        self.sc.stop()
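# Usage sketch (not part of the original source): once the factory has built a
# SQLContext, the monkey-patched RDD.toDF described in the docstring above
# becomes available. The names below are illustrative only.
factory = SparkContextFactory()
rows = factory.sc.parallelize([(1, "a"), (2, "b")])
df = rows.toDF(["id", "value"])  # works only because a SQLContext was created
factory.disconnect()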
def main():
    """ Main entry point of the application """
    # Create spark configuration and spark context
    include_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'preprocessing.py'))
    conf = SparkConf()
    conf.set('spark.executor.memory', '1500m')
    conf.setAppName("Generating predictions")
    sc = SparkContext(conf=conf, pyFiles=[include_path])

    # Set S3 configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_KEY'])

    # Single-pass predictions
    fast_predict(sc,
                 file_input="s3n://twitter-stream-data/twitter-*",
                 file_output="s3n://twitter-stream-predictions/final",
                 sports_model="PyTwitterNews/models/sports.model",
                 politics_model="PyTwitterNews/models/politics.model",
                 technology_model="PyTwitterNews/models/technology.model")

    # Stop application
    sc.stop()
def stackexchange_xml_spark_job():
    server = bluebook_conf.HDFS_FQDN
    conf = SparkConf()
    xml_file_address = "hdfs://" + server + "/" +\
                       bluebook_conf.STACKEXCHANGE_XML_FOLDER_NAME +\
                       bluebook_conf.STACKEXCHANGE_XML_FILE_NAME
    json_ques_folder_address = "hdfs://" + server + "/" +\
                               bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME
    json_ans_folder_address = "hdfs://" + server + "/" +\
                              bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME
    conf.setAppName('stackexchange_xml_spark_job')
    spark_context = SparkContext(conf=conf)

    # renamed from 'file', which shadowed the builtin
    xml_file = spark_context.textFile(xml_file_address)
    # Ques and Ans records are stored separately depending on their 'posttypeid'
    # Ques -> posttypeid == 1
    # Ans  -> posttypeid == 2
    ques = xml_file.map(stackexchange_xml_mapper)\
                   .filter(lambda dic: 'posttypeid' in dic.keys())\
                   .filter(lambda dic: dic['posttypeid'] == '1')\
                   .map(lambda d: jsoner(d))
    ans = xml_file.map(stackexchange_xml_mapper)\
                  .filter(lambda dic: 'posttypeid' in dic.keys())\
                  .filter(lambda dic: dic['posttypeid'] == '2')\
                  .map(lambda d: jsoner(d))
    ques.saveAsTextFile(json_ques_folder_address)
    ans.saveAsTextFile(json_ans_folder_address)
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with the conf object
    sc = SparkContext(conf=conf)
    # Creating an SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"
    json_file_path = os.path.join(curr_path + "/../Spark_Jobs/data/", json_name)
    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from the parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
def __connected_yarn_spark_cluster(self, pilotcompute_description):
    number_cores = 1
    if "number_cores" in pilotcompute_description:
        number_cores = int(pilotcompute_description["number_cores"])

    number_of_processes = 1
    if "number_of_processes" in pilotcompute_description:
        number_of_processes = int(pilotcompute_description["number_of_processes"])

    executor_memory = "1g"
    # the original checked "number_of_processes" here, which looks like a
    # copy/paste slip; the value read is the per-process memory
    if "physical_memory_per_process" in pilotcompute_description:
        executor_memory = pilotcompute_description["physical_memory_per_process"]

    conf = SparkConf()
    conf.set("spark.num.executors", str(number_of_processes))
    conf.set("spark.executor.instances", str(number_of_processes))
    conf.set("spark.executor.memory", executor_memory)
    conf.set("spark.executor.cores", number_cores)
    if pilotcompute_description is not None:
        for i in pilotcompute_description.keys():
            if i.startswith("spark"):
                conf.set(i, pilotcompute_description[i])
    conf.setAppName("Pilot-Spark")
    conf.setMaster("yarn-client")
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
    return pilot
def configureSpark():
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("Apache Spark Alarm Parser")
    conf.set("spark.executor.memory", "1g")
    sc = SparkContext(conf=conf)
    return sc
def sparkconfig():
    # spark configuration options
    # conf = SparkConf()
    # conf.setMaster("spark://3.168.100.58:7077")  # uncomment for standalone cluster
    # conf.setMaster("local")                      # uncomment for local execution
    # conf.setAppName("demo_chain")
    # conf.set("spark.executor.memory", "2g")
    # conf.set("spark.default.parallelism", 56)  # 48)
    # conf.set("spark.sql.inMemoryColumnarStorage.compressed", "true")
    # conf.set("sql.inMemoryColumnarStorage.batchSize", 2000)

    # AMAZON AWS EMR
    conf = SparkConf()
    conf.setMaster("yarn-client")    # client mode gets output to the terminal
    # conf.setMaster("yarn-cluster") # this seems to run faster but can't confirm
    conf.set("spark.default.parallelism", 648)
    conf.setAppName("spark_markov_chain")
    conf.set("spark.executor.memory", "22g")
    conf.set("spark.executor.instances", 9)
    conf.set("spark.executor.cores", 9)
    conf.set("spark.yarn.executor.memoryOverhead", 800)
    conf.set("spark.rdd.compress", "True")
    conf.set("spark.shuffle.consolidateFiles", "True")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    return conf
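# Usage sketch (not from the original source): sparkconfig() returns only a
# SparkConf, so the caller creates the context itself.
from pyspark import SparkContext
sc = SparkContext(conf=sparkconfig())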
def main(args):
    if len(args) < 2:
        sys.exit(1)

    # Setting the cluster configuration parameters
    spark_master = args[0]
    spark_data_file_name = args[1]
    file_path = CURR_DIR + "/" + spark_data_file_name
    conf = SparkConf()
    conf.setMaster(spark_master)
    conf.setAppName("Log Scanner")

    # Creating a Spark Context with the conf object
    sc = SparkContext(conf=conf)
    txt_logs = sc.textFile(file_path).filter(lambda line: check(line))
    access_logs = txt_logs.map(lambda line: AccessLog(line))

    # Getting response_codes from log objects and caching it
    response_codes = access_logs.map(lambda log: log.get_status()).cache()
    log_count = response_codes.count()
    print("Total Response Codes: " + str(log_count))

    cnt = response_codes.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    # tuple-parameter lambdas are Python 2 only, so index into the pair instead
    response200 = cnt.filter(lambda x: x[0] == "200").map(lambda kv: kv[1]).collect()
    print("###########################")
    print("## Success Rate : " + str(int(response200[0]) * 100 / log_count) + " % ##")
    print("###########################")
def spark_config(self):
    if self._spark_config is None:
        os.environ['SPARK_SUBMIT_CLASSPATH'] = ','.join(self.spex_conf.spark_config.jars)

        conf = SparkConf()
        conf.setAppName(self.spex_conf.spark_config.name)
        conf.setMaster(self.spex_conf.spark_config.master)
        conf.set('spark.rdd.compress', 'true')
        conf.set('spark.io.compression.codec', 'lz4')
        conf.set('spark.mesos.coarse', 'true' if self.spex_conf.spark_config.coarse_mode else 'false')

        # TODO - Setup all the other cruft as needed
        # conf.set('spark.executor.memory', '4g')
        # conf.set('spark.cores.max', '16')
        # conf.set('spark.task.cpus', '6')

        # TODO - bind port for spark web ui
        self._spark_config = conf

    config = self._spark_config
    # These are always set; if someone changes them we simply set them back
    config.set('spark.executor.uri', self.artifact_resolver(self.spex_conf.spark_distro))
    config.setExecutorEnv(key='PYSPARK_PYTHON', value='./%s daemon' % self.spex_conf.spex_name)
    return config
def read_conf():
    """ Set up the spark configuration (returns a SparkConf, not a context) """
    conf = SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("Testing")
    return conf
def getSparkConf(self):
    conf = SparkConf()
    conf.setAppName(self.PROJECT_NAME)
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.cleaner.ttl", self.TTL)
    # es
    conf.set("es.index.auto.create", "true")
    conf.set("es.nodes", self.ES_NODES)
    return conf
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("event-contour-server")
    conf.setMaster("local[4]")
    conf.setAppName("reduce")  # note: this overrides the app name set above
    conf.set("spark.executor.memory", "4g")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['app.py', 'contourGenerator.py', 'EventParallelize.py'])
    return sc
def get_sc():
    """
    Defines and returns a SparkContext from some configurations via SparkConf.
    """
    conf = SparkConf()
    conf.setAppName("Jon's PySpark")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # the original misspelled this key as "spark.kryroserializer.buffer.mb"
    conf.set("spark.kryoserializer.buffer.mb", "256")
    conf.set("spark.akka.frameSize", "500")
    conf.set("spark.akka.askTimeout", "30")
    return SparkContext(conf=conf)
def init(self):
    os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
    # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
    # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
    conf = SparkConf()
    conf.setMaster("local[10]")
    conf.setAppName("PySparkShell")
    conf.set("spark.executor.memory", "2g")
    conf.set("spark.driver.memory", "1g")
    self.sc = SparkContext(conf=conf)
    self.sqlContext = SQLContext(self.sc)
def __init__(self, master, name):
    self.name = name
    self.master = master
    print("init spark ...")
    # use a raw string so the backslashes are not treated as escapes
    os.environ["HADOOP_HOME"] = r"D:\code\wqr\hadoop-common-2.2.0-bin"
    conf = SparkConf()
    conf.setMaster(self.master)
    conf.setAppName(self.name)
    self.sc = SparkContext(conf=conf)
def __connected_spark_cluster(self, resource_url, pilot_description=None):
    conf = SparkConf()
    conf.setAppName("Pilot-Spark")
    if pilot_description is not None:
        for i in pilot_description.keys():
            if i.startswith("spark"):
                conf.set(i, pilot_description[i])
    conf.setMaster(resource_url)
    print(conf.toDebugString())
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
    return pilot
def main():
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set("spark.dynamicAllocation.enabled", "true")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("checkpoint")

    # Clear the cassandra table
    init_cassandra().execute('TRUNCATE {}'.format(top_airports_table))

    stream_kafka(ssc)
def create_spark_instance(master="local", conf=None):
    """
    master: default "local"
    conf: defaults to 28 max cores with 2g executor memory
    """
    if not conf:
        conf = SparkConf()
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.cores.max", "28")
        conf.setAppName("spark ipython notebook")
    spark_context = SparkContext(master, conf=conf)
    return spark_context
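# Usage sketch (not from the original source): the defaults above kick in only
# when no conf is supplied, so a caller can either take them or pass its own.
sc = create_spark_instance()                       # local master, default conf
# sc = create_spark_instance("spark://host:7077")  # or point at a cluster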
def _init_spark(self, appname):
    """Internal function to setup spark context

    Note: only include spark modules here so that the
    interface can be queried outside of pyspark.
    """
    # currently using LZ4 compression: should not degrade runtime much
    # but will help with some operations like shuffling, especially when
    # dealing with things like highly compressible label volumes
    # NOTE: objects > INT_MAX will cause problems for LZ4
    worker_env = {}
    if "DVIDSPARK_WORKFLOW_TMPDIR" in os.environ and os.environ["DVIDSPARK_WORKFLOW_TMPDIR"]:
        worker_env["DVIDSPARK_WORKFLOW_TMPDIR"] = os.environ["DVIDSPARK_WORKFLOW_TMPDIR"]

    try:
        spark_config = self.config_data["options"]["spark-config"]
    except KeyError:
        # Old workflows haven't been updated to inherit the base Workflow schema
        spark_config = {}

    for k in list(spark_config.keys()):
        spark_config[k] = str(spark_config[k])
        if spark_config[k] in ('True', 'False'):
            spark_config[k] = spark_config[k].lower()

    # Backwards compatibility:
    # if 'corespertask' option exists, override it in the spark config
    if "corespertask" in self.config_data["options"] and self.config_data["options"]["corespertask"] != 0:
        if "spark.task.cpus" in spark_config and spark_config["spark.task.cpus"] != '1':
            raise RuntimeError("Bad config: You can't set both 'corespertask' and 'spark.task.cpus'. Use 'spark.task.cpus'.")
        spark_config["spark.task.cpus"] = str(self.config_data["options"]["corespertask"])

    # set spark config
    from pyspark import SparkContext, SparkConf
    conf = SparkConf()
    conf.setAppName(appname)
    conf.setAll(list(spark_config.items()))

    # from pyspark_flame import FlameProfiler
    # flamegraph_dir = f'{self.config_dir}/flamegraphs'
    # os.makedirs(flamegraph_dir, exist_ok=True)
    # conf.set("spark.python.profile.dump", flamegraph_dir)
    # conf.set("spark.python.profile", "true")
    # worker_env['pyspark_flame.interval'] = 0.25  # Default is 0.2 seconds
    # return SparkContext(conf=conf, batchSize=1, environment=worker_env, profiler_cls=FlameProfiler)

    # Auto-batching heuristic doesn't work well with our auto-compressed numpy array pickling scheme.
    # Therefore, disable batching with batchSize=1
    return SparkContext(conf=conf, batchSize=1, environment=worker_env)
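# Illustrative config fragment (hypothetical values, not from the original
# source) showing the "spark-config" dict the function above consumes: every
# value is stringified and boolean strings are lowercased before conf.setAll().
config_data = {
    "options": {
        "corespertask": 0,  # 0 means: don't override spark.task.cpus
        "spark-config": {
            "spark.executor.memory": "4g",
            "spark.rdd.compress": True,  # becomes the string "true"
        }
    }
}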
class FireStarter():
    mappings = {
        'http_api': readers.HttpApi,
        'lighter': igniters.Lighter,
        'hdfs': writers.HadoopFileSystem
    }

    def __init__(self, config_file):
        self.config_file = config_file

    def read_config_file(self):
        with open(self.config_file, 'r+') as config_data:
            self.config_data = config_data.read()

    def parse_config_contents(self):
        self.config = json.loads(self.config_data)
        check_requirements = required_config - frozenset(self.config.keys())
        if check_requirements:
            raise ValueError('%s must contain %s' % (self.config_file, ', '.join(check_requirements)))

    def load_modules(self):
        """This loop initializes all of the readers, writers, and igniters
        then stores them in an array"""
        self.modules = OrderedDict()
        self.data = OrderedDict()
        for module in self.config['modules']:
            # Access the module via name, or by order
            new_module = self.modules[module['name']] = self.mappings[module['type']](**module['parameters'])
            self.data[module['name']] = new_module.data

    def create_spark_context(self):
        conf = self.config['spark_conf']
        self.spark_config = SparkConf()
        self.spark_config.setAppName(conf['app_name'])
        for attribute, value in conf['parameters'].items():
            self.spark_config.set(attribute, value)
        self.sc = SparkContext(conf=self.spark_config)

    def run_modules(self):
        for name, module in self.modules.items():
            module.execute()

    def execute(self):
        self.read_config_file()
        self.parse_config_contents()
        self.load_modules()
        self.run_modules()
def main():
    conf = SparkConf()
    conf.setMaster('local[*]')
    conf.setAppName('spark-basic')
    sc = SparkContext(conf=conf)

    churn_df = read_dataset(sc, "churn_no_header.csv")
    pipeline = build_pipeline()

    training_data, test_data = train_test_split(churn_df, 0.2)
    model = pipeline.fit(training_data)
    predictions = model.transform(test_data)
    # show() prints the rows itself and returns None, so don't wrap it in print
    predictions.show(20)

    (roc_score, pr_score) = evaluate(predictions, ['areaUnderROC', 'areaUnderPR'])
    print "\nSpark AUC Score: ", roc_score, ", PR Score: ", pr_score
def set_up_spark():
    ######################
    #
    # initialize spark
    #
    ######################
    conf = SparkConf()
    conf.setAppName("Spark Test")
    conf.set('spark.shuffle.io.preferDirectBufs', 'false')
    sc = SparkContext(conf=conf)
    quiet_logs(sc)
    sqlContext = SQLContext(sc)
    return sqlContext, sc
def createSparkCtx(Config, name):
    logging.info("Getting Spark Context")
    spark_location = Config.get("connectioninfo", "spark_location")
    spark_version = Config.get("connectioninfo", "spark_version")
    spark_executor_num = Config.get("connectioninfo", "spark_executor_num")
    spark_executor_mem = Config.get("connectioninfo", "spark_executor_mem")
    spark_executor_cores = Config.get("connectioninfo", "spark_executor_cores")
    spark_driver_mem = Config.get("connectioninfo", "spark_driver_mem")
    spark_auto_broadcast = Config.get("connectioninfo", "spark_auto_broadcast")

    # Depending on the Spark version, choose a different Spark installation on the system.
    # The location comes from the config file.
    os.environ['SPARK_HOME'] = spark_location
    os.environ['PYTHONPATH'] = spark_location + "/python:" + spark_location + "/python/lib/usr/lib/spark:$PYTHONPATH"
    os.environ['HADOOP_CONF_DIR'] = "/etc/hadoop/conf"
    os.environ['YARN_CONF_DIR'] = "/etc/hadoop/conf"
    sys.path.append(spark_location + "/python")

    from pyspark.sql import HiveContext
    from pyspark import SparkContext, SparkConf

    try:
        if spark_version == "2.0":
            from pyspark.sql import SparkSession
            sqlContext = SparkSession.builder.master("yarn").appName("Spark2 SQL Driver")\
                .config("spark.executor.instances", spark_executor_num)\
                .config("spark.executor.memory", spark_executor_mem)\
                .config("spark.driver.memory", spark_driver_mem)\
                .config("spark.executor.cores", spark_executor_cores)\
                .config("spark.sql.autoBroadcastJoinThreshold", spark_auto_broadcast)\
                .config("spark.yarn.queue", name)\
                .enableHiveSupport().getOrCreate()
            return sqlContext
        else:
            conf = SparkConf()
            conf.setAppName("Spark1 SQL Driver")
            conf.set("spark.executor.instances", spark_executor_num)
            conf.set("spark.executor.memory", spark_executor_mem)
            conf.set("spark.executor.cores", spark_executor_cores)
            conf.set("spark.driver.memory", spark_driver_mem)
            conf.set("spark.yarn.queue", name)
            conf.set("spark.sql.autoBroadcastJoinThreshold", spark_auto_broadcast)
            sc = SparkContext(conf=conf)
            sqlContext = HiveContext(sc)
            return sqlContext
    except Exception, e:  # Python 2 syntax, matching the rest of this script
        logging.error("Exception encountered: " + str(e))
        sys.exit(1)
def main():
    global ssc
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set('spark.streaming.stopGracefullyOnShutdown', True)

    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("/tmp/checkpoint")

    signal.signal(signal.SIGINT, stop_streaming)

    stream_kafka()
def sc(cores=None,
       pyFiles=['back.py', 'cli.py', 'ingest.py', 'spark.py', 'text_nltk.py', 'word2vec.py'],
       memo=None):
    if not cores:
        cores = 4
    try:
        return sc.sc
    except AttributeError:
        from pyspark import SparkConf, SparkContext
        print >>sys.stderr, "CORES: %i" % cores
        conf = SparkConf()
        conf.setAppName("Nuance/Q%s" % (" [%s]" % memo if memo else ""))
        conf.set("spark.executor.memory", "8g")
        conf.set("spark.cores.max", str(cores))
        conf.set("master.ui.port", "8082")
        conf.set("spark.ui.port", "4041")  # kicked in
        sc.sc = SparkContext(conf=conf, pyFiles=pyFiles)
        return sc.sc
def main():
    # Configure Spark
    conf = SparkConf()
    conf.setAppName("Application name")  # Specify the application name
    conf.set("spark.jars", "file:/shared_data/spark_jars/hadoop-openstack-3.0.0-SNAPSHOT.jar")  # Don't modify
    sc = SparkContext(conf=conf)  # Spark Context variable that will be used for all operations running on the cluster

    parser = argparse.ArgumentParser()
    parser.add_argument("backend", type=str)
    parser.add_argument("helperpath", type=str)
    parser.add_argument("shuffle_partitions", type=str)
    parser.add_argument("params", type=str)
    parser.add_argument("inputs", type=str)
    parser.add_argument("features", type=str, nargs='?')
    args = parser.parse_args()

    # Swift Connection
    if(args.backend == 'swift'):
        hadoopConf = sc._jsc.hadoopConfiguration()
        hadoopConf.set("fs.swift.impl", "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem")
        hadoopConf.set("fs.swift.service.SparkTest.auth.url", os.environ['OS_AUTH_URL'] + "/tokens")
        hadoopConf.set("fs.swift.service.SparkTest.http.port", "8443")
        hadoopConf.set("fs.swift.service.SparkTest.auth.endpoint.prefix", "/")
        hadoopConf.set("fs.swift.service.SparkTest.region", os.environ['OS_REGION_NAME'])
        hadoopConf.set("fs.swift.service.SparkTest.public", "false")
        hadoopConf.set("fs.swift.service.SparkTest.tenant", os.environ['OS_TENANT_ID'])
        hadoopConf.set("fs.swift.service.SparkTest.username", os.environ['OS_USERNAME'])
        hadoopConf.set("fs.swift.service.SparkTest.password", os.environ['OS_PASSWORD'])

    helperpath = str(args.helperpath)  # This is passed by default
    sc.addFile(helperpath + "/utils/helper.py")  # To import custom modules
    shuffle_partitions = args.shuffle_partitions

    # Create a dict and pass it in your_module_implementation
    params = json.loads(args.params)
    inputs = json.loads(args.inputs)
    # 'features' is optional (nargs='?'), so guard against it being absent
    features = json.loads(args.features) if args.features else None  # Only used when you want to create a feature set

    sqlContext = SQLContext(sc)  # Create SQLContext var from SparkContext, to work with our default format of datasets i.e. Parquet
    sqlContext.setConf("spark.sql.shuffle.partitions", shuffle_partitions)  # Don't change, required for controlling parallelism

    # Pass the sc (Spark Context) and sqlContext along with the different parameters and inputs.
    module_implementation(sc, sqlContext, params=params, inputs=inputs, features=features)
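# Illustrative parameter shapes (hypothetical, not from the original source):
# the script above receives 'params' and 'inputs' as JSON strings on the
# command line and decodes them with json.loads, e.g.
import json
params = json.loads('{"n_partitions": 8}')
inputs = json.loads('{"dataset": "swift://container.SparkTest/data.parquet"}')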
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print sc_conf.getAll()

    # the original initialized sc to None and then called sc.stop() inside a
    # try/except, which always raised; creating the context directly is equivalent
    sc = SparkContext(conf=sc_conf)
    return sc
def main():
    # Spark Configurations
    conf = SparkConf()
    conf.set("spark.master", "local[*]")
    conf = conf.setAppName('Learning PySpark')
    sc = SparkContext(conf=conf)

    df = sc\
        .textFile('IXQ_20170622080001.csv')\
        .map(lambda line: line.split(','))
    print(df.take(5))
def main():
    conf = SparkConf()
    conf.setMaster('local[*]')
    conf.setAppName('renewer-prediction-spark')
    filename = '/Users/andyyoo/scikit_learn_data/renewer/Orange_Dataset.no.header.csv'

    sc = SparkContext(conf=conf)
    df = read_dataset(sc, filename)
    df = pipe_index_string_cols(df, cols=["label"])
    df = pipe_assemble_features(df, excluded_cols=["label"])
    df = pipe_scale_cols(df, with_mean=True, with_std=True, use_dense_vector=False)
    df.show()

    training_data, test_data = train_test_split(df, 0.2)
    model = rf_classifier().fit(training_data)
    predictions = model.transform(test_data)
    # show() prints the rows itself and returns None, so don't wrap it in print
    predictions.show(20)

    (roc_score, pr_score) = evaluate(predictions, ['areaUnderROC', 'areaUnderPR'])
    print "\nSpark AUC Score: ", roc_score, ", PR Score: ", pr_score
def main():
    conf = SparkConf()
    conf.setMaster("spark://192.168.199.123:8070")
    conf.setAppName("User Profile Spark")
    sc = SparkContext(conf=conf)
    print("connection succeeded with Master", conf)

    data = [1, 2, 3, 4]
    distData = sc.parallelize(data)
    print(distData.collect())

    # read the raw track lines; this line was commented out in the original,
    # but 'raw' is used below, so it is restored here
    raw = open(TRACKS_PATH, 'r').read().split("\n")
    tackfile = sc.parallelize(raw)
    tackfile = tackfile.filter(lambda line: len(line.split(',')) == 6)
    tbycust = tackfile.map(lambda line: make_tracks_kv(line)).reduceByKey(lambda a, b: a + b)
    custdata = tbycust.mapValues(lambda a: compute_stats_byuser(a))
    print(custdata.first())
def geopyspark_conf(master=None, appName=None, additional_jar_dirs=[]):
    """Construct the base SparkConf for use with GeoPySpark.

    This configuration object may be used as is, or may be adjusted according
    to the user's needs.

    Note:
        The GEOPYSPARK_JARS_PATH environment variable may contain a colon-separated
        list of directories to search for JAR files to make available via the SparkConf.

    Args:
        master (string): The master URL to connect to, such as "local" to run
            locally with one thread, "local[4]" to run locally with 4 cores, or
            "spark://master:7077" to run on a Spark standalone cluster.
        appName (string): The name of the application, as seen in the Spark console.
        additional_jar_dirs (list, optional): A list of directory locations that
            might contain JAR files needed by the current script. Already
            includes $(pwd)/jars.

    Returns:
        SparkConf
    """
    conf = SparkConf()

    if not appName:
        raise ValueError("An appName must be provided")
    else:
        conf.setAppName(appName)

    if master:
        conf.setMaster(master)

    if 'GEOPYSPARK_JARS_PATH' in os.environ:
        additional_jar_dirs = additional_jar_dirs + os.environ['GEOPYSPARK_JARS_PATH'].split(':')

    conf.set(key='spark.ui.enabled', value='false')
    conf.set(key='spark.serializer', value='org.apache.spark.serializer.KryoSerializer')
    conf.set(key='spark.kryo.registrator', value='geopyspark.geotools.kryo.ExpandedKryoRegistrator')

    current_location = os.path.dirname(os.path.realpath(__file__))
    cwd = os.getcwd()

    local_prefixes = [
        os.path.abspath(os.path.join(current_location, 'jars')),
        os.path.abspath(os.path.join(cwd, 'jars')),
        os.path.abspath(os.path.join(cwd, '../geopyspark/jars'))
    ]
    possible_jars = [os.path.join(prefix, '*.jar') for prefix in local_prefixes + additional_jar_dirs]
    configuration = os.path.join(current_location, 'command', 'geopyspark.conf')

    if not possible_jars:
        if os.path.isfile(configuration):
            with open(os.path.join(configuration)) as config_file:
                possible_jars.append(os.path.relpath(config_file.read(), cwd))

    module_jars = [os.path.abspath(resource_filename('geopyspark.jars', JAR))]
    jar_dirs = [(jar, os.path.dirname(jar)) for jar in module_jars]
    for jar, jar_dir in jar_dirs:
        if jar_dir not in local_prefixes:
            possible_jars.append(jar)

    returned = [glob.glob(jar_files) for jar_files in possible_jars]
    jars = [jar for sublist in returned for jar in sublist]

    if not jars:
        raise IOError("Failed to find any jars. Looked at these paths {}".format(possible_jars))

    jar_string = ",".join(set(jars))
    conf.set(key='spark.jars', value=jar_string)
    conf.set(key='spark.driver.memory', value='8G')
    conf.set(key='spark.executor.memory', value='8G')

    return conf
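# Usage sketch (not from the original source): build the conf and hand it to
# a SparkContext; the master and appName values here are illustrative only.
from pyspark import SparkContext
conf = geopyspark_conf(master="local[*]", appName="geopyspark-example")
sc = SparkContext(conf=conf)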
#!env/bin/python
# -*- coding: utf-8 -*-
# (the shebang must be the first line of the file; the original had the
# coding declaration above it)
from pyspark import SparkConf, SparkContext

conf = SparkConf()
conf.setAppName("PYSPARK_JOB_NAME")
sc = SparkContext(conf=conf)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from pyspark import SparkContext
from pyspark import SparkConf

conf = SparkConf()
conf.setAppName('Spark Quick Start Sample')
sc = SparkContext(conf=conf)

f = sc.textFile('/project/public/PGYR15/OP_PGYR2015_README_P01172017.txt')
word_counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(
    lambda a, b: a + b).sortBy(lambda x: x[1], ascending=False).collect()

fout = open("quick-start-work-count.txt", 'w')
for w in word_counts:
    fout.write("%s: %d\n" % w)
fout.close()
from pyspark import SparkContext, SparkConf
import re

conf = SparkConf()
# To run this file with 'spark-submit' set master to 'yarn-cluster'
# conf.setMaster("local")
conf.setAppName("DateTime")
# Creates SparkContext for the Main entry point of Spark functionality
sc = SparkContext(conf=conf)

# Read the input file
# For remote cluster set remote host_name:port instead of localhost:9000
lines = sc.textFile("/Data/NASA_Access_Log")


# Method to split the input line and return the hour value
def getKey(line):
    # Replace '- ' with empty value, so input lines are separated only by white space
    # (the original used a local variable named 'str', shadowing the builtin)
    fields = line.replace("- ", " ")
    # Replace multiple spaces by a single space
    fields = ' '.join(fields.split())
    # Split the input line by white space, unless text is enclosed in double
    # quotes or '[]', and store each field in a list
    fields = re.findall(r'\[[^\]]*\]|\"[^\"]*\"|\S+', fields)
    # Get the timestamp field and extract the hour value
    hour = fields[1].replace("[", "").split(":")[1]
    return hour


# Creates the Key-Value pair with the hour value as the key and the integer 1 as the value
pairs = lines.map(lambda s: (getKey(s), 1))
# Creates the Key-Value pair with the hour value as the key and the corresponding count as the value
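# Worked example (not from the original source): a typical NASA access-log
# line and what getKey extracts from it; the sample line is illustrative.
sample = 'host - - [01/Jul/1995:00:00:09 -0400] "GET /history/ HTTP/1.0" 200 7280'
# After the replace/split steps, fields[1] is '[01/Jul/1995:00:00:09 -0400]';
# stripping '[' and splitting on ':' yields the hour.
print(getKey(sample))  # -> '00'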
def main(spark, bucket, input_file_name):
    """Run ETL pipeline"""
    # GCS bucket name to create and output tables to
    input_data = "gs://" + bucket + "raw/"
    output_data = "gs://" + bucket + "transformed/"
    print(spark)
    # process_song_data(spark, input_data, output_data)
    process_log_data(spark, input_data, output_data, input_file_name)


if __name__ == "__main__":
    spark_conf = SparkConf()
    spark_conf.setAppName('Sparkify etl')
    spark_context = SparkContext(conf=spark_conf)
    sqlContext = SQLContext(spark_context)
    spark = SparkSession.builder.getOrCreate()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--bucket',
        dest='bucket',
        required=True,
        help='Specify the full GCS wildcard path to the json files to enhance.'
    )
    parser.add_argument(
        '--raw_file_name',
        dest='raw_file_name',
import numpy as np
from sklearn.cross_validation import train_test_split, Bootstrap
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets, svm, pipeline
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
# needed for the SparkConf/SparkContext used below
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("spark://172.18.109.87:7077")
    # conf.setMaster("local")
    conf.setAppName("spark_svm")
    conf.set("spark.executor.memory", "12g")
    sc = SparkContext(conf=conf)

    X, y = make_classification(n_samples=10000, n_features=30, n_classes=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    samples = sc.parallelize(Bootstrap(y.size))

    feature_map_fourier = RBFSampler(gamma=.2, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("feature_map", feature_map_fourier),
                                            ("svm", SGDClassifier())])
    fourier_approx_svm.set_params(feature_map__n_components=700)

    # tuple-parameter lambdas are Python 2 only; index into the
    # (train_index, test_index) pair instead
    results = samples.map(lambda sample: fourier_approx_svm
                          .fit(X[sample[0]], y[sample[0]])
                          .score(X_test, y_test)) \
                     .reduce(lambda x, y: x + y)
    final_results = results / len(Bootstrap(y.size))
    data = sock.recv(1024)
    sock.close()
    print("Got data from stream.py")
    print(data)
    print(data.decode("utf-8"))
    return data.decode("utf-8").replace("#", "hashtag-").lower()


hashtagIndex = getHashtagData()
initES(hashtagIndex)

# Pyspark
# create spark configuration
conf = SparkConf()
conf.setAppName('TwitterApp')
conf.setMaster('local[2]')

# create spark context with the above configuration
sc = SparkContext(conf=conf)
# set the log level to one of ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
# sc.sparkContext.setLogLevel("OFF")
sc.setLogLevel("ERROR")

# create the Streaming Context from spark context with interval size 4 seconds
ssc = StreamingContext(sc, 4)
ssc.checkpoint("checkpoint_TwitterApp")

# read data from the TCP_IP:TCP_PORT socket configured above
dataStream = ssc.socketTextStream(TCP_IP, TCP_PORT)
#!/usr/bin/python
from pyspark import SparkContext, SparkConf
import sys, os, math
import json

tableName = sys.argv[1]
xcol = int(sys.argv[2])
ycol = int(sys.argv[3])
gcol = sys.argv[4:]
for i in range(0, len(gcol)):
    gcol[i] = int(gcol[i])

conf = SparkConf()
conf.setAppName("MyWordCount")
conf.setMaster("spark://ubuntu:7077")
sc = SparkContext(conf=conf)

rdd = sc.textFile("hdfs://localhost:9000/Tables/" + tableName)


class T1:
    # the original read "def __init__:", which is a syntax error
    def __init__(self):
        self.val = 1


def mysplit(line):
    items = line.split("#")
    glist = []
    for i in range(0, len(gcol)):
        glist.append(items[gcol[i]])
    return (tuple(glist), (items[xcol], items[ycol]))
def initSparkConf(isLocal, appName):
    conf = SparkConf()
    conf.setAppName(appName)
    if isLocal is True:
        conf.setMaster("local")
    return conf
https://colab.research.google.com/drive/1HM0bHJ8wC333y_TUjb8-Ja3V-DD3AFNA
"""

import sys
import re
import math
from collections import Counter

!pip install pyspark
from pyspark import SparkContext, SparkConf
from random import randint
import shutil
# shutil.rmtree('')

# !unzip TF_index.zip

conf = SparkConf()
conf.setMaster('local')
conf.setAppName('TF/IDF')
sc = SparkContext.getOrCreate(conf=conf)

# Read CTF_index
# a. Retrieve cosine normalized vector of the query words from CTF_index
def st_to_dict(line):
    line = line.split("@")
    line1 = line[1].split("+")
    return (line[0], line[1])

# read data
# Edit file name to give your input data
dataFile = 'CTF_index'

# create RDDs, read stopwords file
stopWordFile = 'stopwords-en.txt'
from pyspark import SparkConf, SparkContext

conf = SparkConf()
conf.setAppName("Cancelled OrderOver1000 Pyspark")
sc = SparkContext(conf=conf)


def parseOrders(rec):
    parts = rec.split(",")
    return (int(parts[0]), parts[1], int(parts[2]), parts[3])


def parseOrderItems(rec):
    parts = rec.split(",")
    return (int(parts[0]), int(parts[1]), int(parts[2]), int(parts[3]), float(parts[4]), float(parts[5]))


path = "/user/hive/warehouse/retail_db.db"
orders = sc.textFile(path + "/orders").map(lambda x: parseOrders(x)).map(
    lambda x: (x[0], (x[1], x[2], x[3])))
orderItems = sc.textFile(path + "/order_items").map(
    lambda x: parseOrderItems(x)).map(lambda x: (x[1], x[4]))

# always filter as early as possible, before doing joins, sorts or aggregates
ordersCancelled = orders.filter(lambda x: x[1][2].upper() == "CANCELED")
ordersJoin = ordersCancelled.join(orderItems).map(
# Command: spark-submit spark_wc.py
from pyspark import SparkConf, SparkContext

# Spark set-up
conf = SparkConf()
conf.setAppName("Word count App")
sc = SparkContext(conf=conf)

# uncomment the sc.setLogLevel line when your program works fine.
# Run the program again to take the screenshot.
#sc.setLogLevel("WARN")

# Upload data file in Hadoop and provide its path in textFile function
rdd = sc.textFile("/user/spark/words.txt")
rdd = rdd.flatMap(lambda x: x.split(' '))
rdd = rdd.map(lambda x: (x, 1))

# Add few lines of code below
rdd = rdd.reduceByKey(lambda x, y: x + y)

# Add your code below
# you may store top 10 results in 'out' variable
# and use it to display as mentioned below.
# One possible completion: take the ten words with the highest counts.
out = rdd.takeOrdered(10, key=lambda pair: -pair[1])

for item in out:
    print item[0], '\t:\t', str(item[1])
def date_boro_aggr(x):
    """
    Convert
        [[(year-month, borough), count], [(year-month, borough), count], ...]
    (year-month is the same for every entry in this list, since we group by it
    before this mapping, and the boroughs are distinct) to a list of aggregated
    borough counts. We end up with six versions of this list, one for each of
    Apr/May/Jun of 2014 and 2015.
    """
    temp = [0, 0, 0, 0, 0]
    for ele in x[1]:
        temp[int(ele[0][1])] += int(ele[1])
    return temp


n_of_periods = 3  # 3 months for now

conf = SparkConf()
conf.setAppName("Residential_Analysis")
# create spark context with the above configuration
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

# ------ Taxi Analysis
# read the csv file which has no header
taxiFileWithNoHeader = sc.textFile("/taxi_combined.csv")

# map each entry to ((year-month, borough integer representation), 1)
taxi_date_boro = taxiFileWithNoHeader.map(lambda line: line.split(",")).map(
    date_boro_mapper)
# reduce each mapped tuple by borough and year-month, then group by year-month so that we will have combined
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
import binascii

conf = SparkConf()
conf.setAppName('basestation-analyze')
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 10)
ssc.checkpoint("checkpoint")
initialStateRDD = sc.parallelize([])

stream_data = ssc.textFileStream("hdfs:///data/")


def decode_line(line):
    try:
        l = line.split(" ")
        if l[1] == "DCI":
            return [l[2]]
        else:
            return [""]
    except:
        return [""]


def message_count(m):
    return m, 1
import os
import sys

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars spark-streaming-kafka-assembly_2.11-1.6.3.jar pyspark-shell'

conf = SparkConf()
# conf.setMaster("spark://localhost:7077")
conf.setAppName("Test")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)  # 10 second batch interval (the original comment said 2 seconds)

kvs = KafkaUtils.createStream(ssc, "localhost:2181", "simpleConsumer", {"test": 1})
lines = kvs.map(lambda x: x[1])
counts = lines.flatMap(lambda line: line.split(" ")).map(
    lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()
ssc.awaitTermination()
        hashtags_df.registerTempTable("hashtags")
        # get the top 10 hashtags from the table using SQL and print them
        hashtag_counts_df = sql_context.sql(
            "select hashtag, hashtag_count from hashtags order by hashtag_count desc limit 10"
        )
        hashtag_counts_df.show()
        writeTopElements(hashtag_counts_df)
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)


# create spark configuration
conf = SparkConf()
conf.setAppName("TwitterStreamApp")
# create spark context with the above configuration
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
# create the Streaming Context from the above spark context with interval size 10 seconds
ssc = StreamingContext(sc, 10)
# setting a checkpoint to allow RDD recovery
ssc.checkpoint("checkpoint_TwitterApp")
# read data from port 9008
dataStream = ssc.socketTextStream("localhost", 9008)

# split each tweet into words
words = dataStream.flatMap(lambda line: line.split(" "))
# filter the words to get only hashtags, then map each hashtag to be a pair of (hashtag,1)
hashtags = words.map(lambda x: (x, 1))
# adding the count of each hashtag to its last count
tags_totals = hashtags.countByWindow(10 * 60 * 60, 30)
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.linalg import DenseVector
from pyspark.ml.classification import LogisticRegression

# Spark set-up
conf = SparkConf()
conf.setAppName("Logistic regression")
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
spark = SparkSession(sc)

# Load dataset file as RDD
rdd = sc.textFile("/user/spark/task5.txt")
rdd = rdd.map(lambda x: x.split(','))


def renameLabel(x):
    if x[4] == 'Iris-setosa':
        x[4] = 1
    elif x[4] == 'Iris-versicolor':
        x[4] = 2
    else:
        x[4] = 3
    return x
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession, Row, SQLContext, DataFrame
from pyspark.sql.types import StructType, StructField, FloatType, StringType, IntegerType
# StreamingContext is used below but was not imported in the original
from pyspark.streaming import StreamingContext
import sys

import findspark
findspark.init()

# paths for players.csv and teams.csv
players_csv_path = 'hdfs://localhost:9000/input/players.csv'
teams_csv_path = 'hdfs://localhost:9000/input/teams.csv'

# stores per match metrics for each player that resets at the end of the match
player_metrics = dict()

conf = SparkConf()
conf.setAppName('BigData Project')
spark_context = SparkContext(conf=conf, master="local[*]")
streaming_context = StreamingContext(spark_context, 2)
streaming_context.checkpoint('BigData Project Checkpoint')
input_stream = streaming_context.socketTextStream('localhost', 6100)

sqlContext = SQLContext(spark_context)
# loading players.csv as a dataframe
players_data_df = sqlContext.read.csv(players_csv_path, header=True)
# loading teams.csv as a dataframe
teams_data_df = sqlContext.read.csv(teams_csv_path, header=True)
    name = str(row[0])
    numbers = [int(i) for i in row[1:]]
    tuples = itertools.permutations(numbers, 2)
    pc = max(tuples, key=pewT)
    return [name, pc[0], pc[1]]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='PowerCouples Serial native version')
    parser.add_argument('-i', '--input', dest="input_csv",
                        help="input file in csv format", required=True)
    parser.add_argument('-o', '--output', dest="output_csv",
                        help="output file in csv format", default=sys.stdout,
                        type=argparse.FileType('w'))
    args = parser.parse_args()

    # set the spark context
    conf = SparkConf()
    # conf.setMaster("local[4]")
    conf.setAppName("PowerCouples")
    sc = SparkContext(conf=conf)

    # compute power couples
    infile = sc.textFile(args.input_csv, 40)
    result = infile.map(find_powerCouple).map(
        lambda elem: elem[0] + "," + str(elem[1]) + "," + str(elem[2])).collect()

    # write results
    out = csv.writer(args.output_csv)
    for row in result:
        out.writerow([row])
def generate_model_package(training_data_path, id_cols, target_cols,
                           fields_config_file, param_grid, model_name, target_var):
    """
    training_data_path, id_cols, target_cols, fields_config_file,
    param_grid, model_name, target_var
    """
    pyspark_app_nm = "train_" + model_name + "_" + secrets.token_hex(nbytes=4)
    logging.info("Starting process: " + pyspark_app_nm)

    # create spark session and spark context for parallel learning
    logging.info("Instantiating pyspark.")
    app_pyspark_conf = SparkConf()
    app_pyspark_conf.setAppName(pyspark_app_nm)
    # app_pyspark_conf.set('spark.executor.memory', spark_executor_memory)
    # app_pyspark_conf.set('spark.executor.cores', spark_executor_cores)
    spark = SparkSession.builder.config(conf=app_pyspark_conf).getOrCreate()
    sc = spark.sparkContext

    # load data
    logging.info("Beginning data load.")
    training_df = pd.read_parquet(training_data_path, engine='pyarrow')
    # sampling down
    # training_df_1 = training_df[training_df[target_var]==1].sample(20)
    # training_df_0 = training_df[training_df[target_var]==0].sample(40)
    # training_df = pd.concat([training_df_0, training_df_1])

    # column handling
    logging.info("Creating column lists")
    all_cols = training_df.columns.tolist()
    x_cols = list(set(all_cols) - (set(target_cols + id_cols)))

    # dataframe setup
    X = training_df[x_cols]
    y = training_df[target_cols]

    # create holdout data
    logging.info("Creating holdout data")
    x_train, x_test, y_train, y_test = train_test_split(X, y[target_var],
                                                        test_size=0.1,
                                                        stratify=y[target_var])
    wts = y_test.value_counts()
    wtrat = (wts[0] / wts[1])

    # instantiate model
    gbm = lgb.LGBMClassifier()
    fit_params = {
        "eval_set": [(x_test, y_test)],
        "eval_metric": ear_stop_eval_mtr,
        "early_stopping_rounds": ear_stop_rnds
        # ,"scale_pos_weight": wtrat
    }
    grid_search = SparkGridSearchCV(sc, estimator=gbm, param_grid=param_grid, fit_params=fit_params)
    grid_search.fit(x_train, y_train)
    best_model = grid_search.best_estimator_
    optimized_parameters = best_model.get_params()

    # create confusion dataframe
    y_true = pd.DataFrame(y_test)
    y_true = y_true.reset_index()
    y_true.columns.values[0] = "CUSTOMER_KEY"
    y_true.columns.values[1] = "Y_TRUE"
    # predict() takes only the features; the original also passed y_test.tolist()
    y_pred = pd.DataFrame(best_model.predict(x_test), columns=["Y_PRED"])
    confusion_data = pd.merge(left=y_true, right=y_pred, left_index=True, right_index=True)

    # summary statistics and metrics
    fr_col_nam_map = {0: "feature_nm", 1: "feature_importance"}
    feature_ranking = pd.DataFrame([X.columns, best_model.feature_importances_]).T
    feature_ranking = feature_ranking.rename(columns=fr_col_nam_map)
    feature_ranking = feature_ranking.sort_values("feature_nm", ascending=False)

    metrics = {
        "precision_score": precision_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "roc_auc_score": roc_auc_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "classification_report": classification_report(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "confusion_matrix": confusion_matrix(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "accuracy_score": accuracy_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "precision_recall_curve": precision_recall_curve(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "recall_score": recall_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "roc_curve": roc_curve(confusion_data['Y_TRUE'], confusion_data['Y_PRED'])
    }

    output = {
        "model_name": model_name,                      # string with model name
        "model_class": best_model,                     # grid_search.best_estimator_
        "optimized_parameters": optimized_parameters,  # best_model.get_params()
        "feature_ranking": feature_ranking,            # best_model.feature_importances_
        "metrics": metrics,
        "confusion_data": confusion_data
    }
    return output
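# Call sketch (hypothetical values, not from the original source): the function
# above returns a dict packaging the fitted model, its parameters, feature
# rankings, metrics, and confusion data.
package = generate_model_package(
    training_data_path="/data/train.parquet",
    id_cols=["CUSTOMER_KEY"],
    target_cols=["label"],
    fields_config_file=None,
    param_grid={"num_leaves": [31, 63]},
    model_name="churn_lgbm",
    target_var="label",
)
print(package["metrics"]["roc_auc_score"])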
"-output", "--output", help="Complete output path for results ex. hdfs:/CCF/output") parser.add_argument("-partition", "--partition", type=int, help="Number of partitions for dataset") args = parser.parse_args() partition_number = args.partition input_file_path = args.input output_directory = args.output # Initialize spark-context configuration conf = SparkConf() conf.setMaster('local') conf.setAppName('pyspark-shell-CCF-SS-v2') # Just for local execution conf.set('spark.driver.host', '127.0.0.1') conf.set("spark.ui.proxyBase", "") # Just for having a nice gui locally os.environ[ 'PYSPARK_PYTHON'] = '/Users/ccompain/.pyenv/versions/miniconda3-latest/bin/python' # Needs to be explicitly provided as env. Otherwise workers run Python 2.7 os.environ['PYSPARK_DRIVER_PYTHON'] = 'python' sc = SparkContext(conf=conf) sc.setLogLevel("WARN") # Initialize logger log4jLogger = sc._jvm.org.apache.log4j LOGGER = log4jLogger.LogManager.getLogger(__name__) LOGGER.warn("################################")
    flightNum_1 = item['flightNum_1']
    flightNum_2 = item['flightNum_2']
    origin_2 = item['origin_2']
    total_delay = item['total_delay']
    table.put_item(Item=item)


def saveTopCarriers(carriers):
    # renamed from 'sorted', which shadowed the builtin
    sorted_carriers = carriers.sortBy(lambda item: item['total_delay'])
    for toSave in sorted_carriers.take(1):
        save_results(toSave)


if __name__ == '__main__':
    conf = SparkConf()
    conf.setAppName("Problem_3-2")
    conf.set("spark.streaming.kafka.maxRatePerPartition", 50000)
    conf.set("spark.executor.memory", "2g")
    conf.set("spark.python.worker.memory", "1g")

    # airports
    airports = ['CMI', 'ORD', 'LAX', 'JAX', 'DFW', 'CRP', 'SLC', 'BFL', 'SFO', 'PHX', 'JFK']
    year = "2008"

    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("/tmp/streaming")

    brokers = "b-1.cs598-tast2.n69c9p.c2.kafka.us-east-1.amazonaws.com:9092,b-2.cs598-tast2.n69c9p.c2.kafka.us-east-1.amazonaws.com:9092"
    topic = "cs598-task2"
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SQLContext
import sys
import requests

configuration = SparkConf()
configuration.setAppName("BigData")


def rc(line):
    t = line.split(";")[7]
    if ',' not in t:
        return [t]
    return t.split(",")


def f1(r):
    sr = r.sortBy(lambda x: (-x[1], x[0]))
    srr = sr.collect()
    c = 0
    i = 0
    if (srr != []):
        while (c != 5):
            if (srr[i][0] != ''):
                if (c != 4):
                    print(srr[i][0], end=',')
                else:
                    print(srr[i][0])
        k = 0
        # tags_count = [p.hashtag_count for p in df.select("hashtag_count").collect()]
        # hashtag_counts_df.pprint()
    except:
        e = sys.exc_info()[0]
        # print("Error: %s" % e)


def tmp(x):
    return (x.split(';')[0], 1)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: <file> <windowsize> <batchinterval>", file=sys.stderr)
        sys.exit(-1)

    conf = SparkConf()
    conf.setAppName("BigData")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, int(sys.argv[2]))
    ssc.checkpoint("/checkpoint_BIGDATA")

    dataStream = ssc.socketTextStream("localhost", 9009)
    # dataStream.pprint()
    # tweet = dataStream.map(tmp)
    # OR
    dataStream = dataStream.window(int(sys.argv[1]), 1)
    tweet = dataStream.flatMap(lambda w: (w.split(';')[7].split(',')))
    hashtag = tweet.map(lambda w: (w, 1))
    # hashtag.pprint()
    count = hashtag.reduceByKey(lambda x, y: x + y)
    # count.pprint()
    results_file.write(','.join(output) + '\n\n')
    for freq_set in result_frequent_itemsets[1:]:
        results_file.write(','.join(map(str, (sorted(freq_set)))) + '\n\n')


if __name__ == '__main__':
    start_time = time.time()

    # initialize spark
    conf = SparkConf()
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.executor.memory", "4g")
    conf.setMaster('local[8]')
    conf.setAppName('Assignment_2')
    sc = SparkContext.getOrCreate(conf)

    # get args
    threshold = int(sys.argv[1])
    support = int(sys.argv[2])
    input_file = sys.argv[3]
    result_file = sys.argv[4]

    # create baskets rdd
    data = sc.textFile(input_file).map(lambda x: x.split(',')).map(
        lambda x: (x[0], x[1]))
    header = data.first()
    raw_data = data.filter(lambda x: x != header)
    baskets = raw_data.groupByKey().map(lambda x: (list(set(x[1])))).filter(
    i += 1


def cleanData(x):
    hashtags = x.split(",")
    clean = []
    for hashtag in hashtags:
        # skip empty or whitespace-only entries (the original compared against
        # two identical-looking space literals)
        if hashtag.strip() == "":
            pass
        else:
            clean.append(hashtag)
    return clean


conf = SparkConf()
conf.setAppName("A2")
sc = SparkContext(conf=conf)
batch_size = sys.argv[2]
window_size = sys.argv[1]
ssc = StreamingContext(sc, float(batch_size))
ssc.checkpoint("/checkpoint_BIGDATA")
dataStream = ssc.socketTextStream("localhost", 9009)

tweet = dataStream.map(lambda w: w.split(';')[7])
tweet1 = tweet.flatMap(lambda w: cleanData(w))
tweet1 = tweet1.map(lambda x: (x, 1))
from pyspark import SparkContext, SparkConf

logFile = "loggy.md"  # Should be some file on your system
conf = SparkConf()
conf.setMaster("local[4]")
conf.setAppName("Simple Khan")
conf.set("spark.executor.memory", "1g")
sc = SparkContext(conf=conf)

logData = sc.textFile(logFile).cache()
keywords = ['Scala', 'Python']


def counter(line):
    return any(k in line for k in keywords)


numAs = logData.filter(counter).count()
print "Lines with keywords: %i " % (numAs)
from pyspark import SparkConf
from pyspark import SparkContext

sparkconfig = SparkConf()
sparkconfig.setMaster("local[*]")
sparkconfig.setAppName("SparkCSVJOB")


def print_each_line(eachLine):
    print eachLine
    return


sparkcontext = SparkContext(conf=sparkconfig)
textFileRDD = sparkcontext.textFile("/home/dharshekthvel/Downloads/query_result.csv")
textFileRDD.foreach(print_each_line)
from pyspark import SparkConf, SparkContext
from operator import add
import string
import nltk
from nltk.corpus import stopwords
import re

conf = SparkConf()
conf.setAppName("Similarity Index")
conf.set("spark.executor.memory", "2g")
conf.set("spark.ui.port", "4098")
sc = SparkContext(conf=conf)

path = "/cosc6339_hw2/gutenberg-500/"

# popular words
text = sc.textFile(path)
words = text.flatMap(lambda line: line.lower().split())
word = words.map(lambda x: re.sub(r'\W+', '', x))
stops = set(stopwords.words('english'))
wordt = word.map(lambda x: ''.join([w1 for w1 in x.split() if w1 not in stops]))
wcounts = wordt.map(lambda w: (w, 1))
counts = wcounts.reduceByKey(add, numPartitions=1)
# swap to (count, word), sort descending, then swap back
# (tuple-parameter lambdas are Python 2 only, so index into the pair instead)
count1 = counts.map(lambda kv: (kv[1], kv[0]))
count2 = count1.sortByKey(False)
count = count2.map(lambda kv: (kv[1], kv[0]))
count3 = count.take(1000)
count4 = sc.parallelize(count3, 1)
removePunct = (lambda x: x not in string.punctuation)
finalWords = []
out = count4.collect()
import numpy as np
import pandas as pd
import pickle
import io
import time
from pyspark import SparkContext
from pyspark import SparkConf
import sys

conf = SparkConf()
conf.setMaster("spark://0.0.0.0:7077")
conf.setAppName("NumpyMult")
sc = SparkContext(conf=conf)


def addToServer(image):
    from elasticsearch import Elasticsearch
    from minio import Minio
    from minio.error import ResponseError

    es = Elasticsearch(['http://elasticsearch:9200'])
    minioClient = Minio('minio:9000',
                        access_key='minio',
                        secret_key='minio123',
                        secure=False)
    ret = ""
    try:
        t = time.time()
        buf = pickle.dumps(image[0])