def main():
    global sc
    sc_conf = SparkConf()
    sc_conf.set("spark.redis.host", "ec2-52-73-233-196.compute-1.amazonaws.com")
    sc_conf.set("spark.redis.port", "6379")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    global sql_context
    sql_context = SQLContext(sc)

    start_time = time.time()
    preprocess_files("dataignition-tech-xml-parq", "posts.parquet")
    end_time = time.time()
    print(colored(
        "Preprocessing run time (seconds): {0}".format(end_time - start_time),
        "magenta"))
def main(): spark_conf = SparkConf().setAppName("Spark Custom MinHashLSH").set( "spark.cores.max", "30") global sc sc = SparkContext(conf=spark_conf) sc.setLogLevel("ERROR") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/min_hash.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/locality_sensitive_hash.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/util.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/config/config.py") global sql_context sql_context = SQLContext(sc) start_time = time.time() run_minhash_lsh() end_time = time.time() print( colored( "Spark Custom MinHashLSH run time (seconds): {0} seconds".format( end_time - start_time), "magenta"))
def main():
    global sc
    sc_conf = SparkConf()
    sc_conf.set("spark.redis.host", config.REDIS_SERVER)
    sc_conf.set("spark.redis.port", "6379")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sc.addFile(base_dir + "/lib/util.py")
    sc.addFile(base_dir + "/config/config.py")

    global sql_context
    sql_context = SQLContext(sc)

    start_time = time.time()
    preprocess_files(config.S3_BUCKET, config.S3_FOLDER_EXTRACTED)
    end_time = time.time()
    print(colored(
        "Preprocessing run time (seconds): {0}".format(end_time - start_time),
        "magenta"))
def main(): spark_conf = SparkConf().setAppName("Text Preprocesser").set( "spark.cores.max", "30") global sc sc = SparkContext(conf=spark_conf) sc.setLogLevel("ERROR") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/util.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/config/config.py") global sql_context sql_context = SQLContext(sc) start_time = time.time() preprocess_all() end_time = time.time() print( colored( "Preprocessing run time (seconds): {0}".format(end_time - start_time), "magenta"))
def initialize():
    global sc, spark, items, inputfile, buckets_user, buckets_business, \
        partition, totalSize, t, mainThreshold
    print("Initializing...")
    t = time.time()

    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    # Drop the header row
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)

    buckets_user = items.groupByKey().mapValues(list).filter(
        lambda x: len(x[1]) > mainThreshold).mapPartitionsWithIndex(
            removeDuplicateEntriesAfter)
    print("Without duplicates done.")

    if case == 1:
        # SON phase 1 over the per-user buckets
        callSonPhase1(buckets_user)
        print("Initializing Phase 2...")
        finalFreq = buckets_user.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))
        finalOutput = finalFreq.collect()
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        printingFreq(x)

    if case == 2:
        # NOTE: `withoutDuplicates` must be built upstream for this branch,
        # e.g.:
        # withoutDuplicates = checkM.mapPartitionsWithIndex(
        #     removeDuplicateEntries).groupByKey().mapValues(list)
        buckets_business = withoutDuplicates.mapPartitionsWithIndex(
            createBuckets_case2).groupByKey().mapValues(list)
        callSonPhase1(buckets_business)
        print("Initializing Phase 2...")
        finalFreq = buckets_business.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))
        finalOutput = finalFreq.collect()
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        printingFreq(x)
def main():
    '''Examples of some standard Spark functions.'''
    # Start a Spark session
    sc = SparkContext()
    sc.setLogLevel("OFF")
    spark = SparkSession.builder.master("local").getOrCreate()

    # Create a dataframe from local data
    l = [(1, 'a', 'b', 'c', 'd'), (1, 'a', 'b', 'c', 'd')]
    df0 = spark.createDataFrame(l, ['col1', 'col2', 'col3', 'col4', 'col5'])

    # Create a dataframe from an RDD with an explicit schema
    l = [(2, 'f', 'g'), (2, 'f', 'g')]
    rdd = sc.parallelize(l)
    schema = StructType([
        StructField("col6", IntegerType(), True),
        StructField("col7", StringType(), True),
        StructField("col8", StringType(), True)
    ])
    df1 = spark.createDataFrame(rdd, schema)

    # Join df0 and df1 on a generated row index
    indexedDf0 = add_column_index(df0)
    indexedDf1 = add_column_index(df1)
    df2 = indexedDf0.join(indexedDf1, indexedDf1.idx == indexedDf0.idx,
                          'inner').drop("idx")
    df2.write.csv("/tmp/file.csv", mode='overwrite', header=True,
                  nullValue='NA', quoteAll=False)

    # Read a CSV file into a dataframe (multiLine=True avoids splitting
    # records that contain embedded '\n' characters)
    df = spark.read.csv("/tmp/file.csv", header=True, quote='"', escape='"',
                        multiLine=True)

    # Print the schema
    df.printSchema()
    # Count the number of rows
    print('Number of rows: {}'.format(df.count()))
    # Show columns
    print('Columns: {}'.format(df.columns))
    # Display the data
    df.show()

    # Count the total number of records across CSV files in a given path
    path = '/tmp'
    count_total(spark, path)
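# The join above relies on a helper add_column_index that isn't shown. A
# minimal sketch, assuming the goal is a consecutive positional "idx" column
# that lines up across both dataframes (zipWithIndex yields stable positions,
# unlike monotonically_increasing_id):

from pyspark.sql.types import LongType

def add_column_index(df):
    # Append a 0-based "idx" column via the underlying RDD
    new_schema = StructType(df.schema.fields +
                            [StructField("idx", LongType(), False)])
    return df.rdd.zipWithIndex().map(
        lambda pair: tuple(pair[0]) + (pair[1],)).toDF(new_schema)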
def initialize():
    global items, inputfile, sc, filterThreshold, t, totalEdges, cost_dict, \
        strict_totalNodes, adjacency_listMain
    t = time.time()

    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    # Drop the header row
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)

    # Collect each user's set of businesses
    user_business = items.groupByKey().mapValues(set).collect()

    # Build an undirected edge list: connect two users when they share at
    # least filterThreshold businesses (each edge is added in both directions)
    tuple_edge_list = []
    for i in range(0, len(user_business) - 1):
        for j in range(i + 1, len(user_business)):
            inter = user_business[i][1] & user_business[j][1]
            if len(inter) >= filterThreshold:
                tuple_edge_list.append(
                    (str(user_business[i][0]), str(user_business[j][0])))
                tuple_edge_list.append(
                    (str(user_business[j][0]), str(user_business[i][0])))

    totalEdges = float(len(tuple_edge_list) / 2)
    adjacency_list = sc.parallelize(tuple_edge_list).groupByKey().mapValues(
        list).collectAsMap()
    adjacency_listMain = copy.deepcopy(adjacency_list)
    totalNodes = list(adjacency_list.keys())
    strict_totalNodes = copy.deepcopy(totalNodes)

    # ---------------------- Part 1: betweenness ----------------------
    bfs(totalNodes, adjacency_list)
    print("Writing betweenness to file...")
    # Sort by descending betweenness, then lexicographically
    list_val = list(cost_dict.items())
    list_val.sort(key=lambda x: (-x[1], x[0]))
    writeToFile(list_val)
    totalNodes = copy.deepcopy(strict_totalNodes)

    # ---------------------- Part 2: communities ----------------------
    print("Creating partitions...")
    create_components(list_val, adjacency_listMain, totalNodes, totalEdges)

    print("Duration: " + str(time.time() - t))
def initialize():
    global sc, spark, items, inputfile
    print("Initializing...")
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    jsonread = sc.textFile(inputfile)
    items = jsonread.map(json.loads)
def main(): spark_conf = SparkConf().setAppName("Spark Custom MinHashLSH").set("spark.cores.max", "30") global sc global sql_context sc = SparkContext(conf=spark_conf) sc.setLogLevel("ERROR") sql_context = SQLContext(sc) sc.addFile(os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/util.py") sc.addFile(os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/config/config.py") start_time = time.time() similarity_scores_df = compare_text() config = configparser.ConfigParser() config.read('../config/db_properties.ini') similarity_scores_df.write.jdbc(config['postgres']['url'], config['postgres']['table'], mode='overwrite', properties={'user': config['postgres']['user'], 'password': config['postgres']['password']}) end_time = time.time() print(colored("Spark MinHash run time (seconds): {0} seconds".format(end_time - start_time), "magenta"))
import numpy as np
import datetime
import pickle as pkl
import os
import sys
import json
import time

kafka_topic = 'from-pubsub'
zk = '10.138.0.3:2181'
app_name = 'from-pubsub'  # Can be some other name

sc = SparkContext(appName="KafkaPubsub")
ssc = StreamingContext(sc, 30)
sc.setLogLevel("FATAL")
kafkaStream = KafkaUtils.createStream(ssc, zk, app_name, {kafka_topic: 1})


def getSparkSessionInstance(sparkConf):
    # Lazily construct (and cache) a singleton SparkSession so it can be
    # reused from inside streaming callbacks
    if "sparkSessionSingletonInstance" not in globals():
        globals()["sparkSessionSingletonInstance"] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()["sparkSessionSingletonInstance"]


# Mapping between numbers and labels, taken from the training code
A = {'label': ['NY', 'K', 'Q', 'BX', 'R']}
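# The singleton helper above is typically consumed from inside a foreachRDD
# callback; a minimal sketch, assuming each Kafka message value is a JSON
# string (the `process` name and the df.show() sink are illustrative):

def process(time, rdd):
    # Reuse the cached SparkSession for every micro-batch
    if rdd.isEmpty():
        return
    spark = getSparkSessionInstance(rdd.context.getConf())
    df = spark.read.json(rdd)
    df.show()

# Kafka records arrive as (key, value) pairs; keep only the values
kafkaStream.map(lambda kv: kv[1]).foreachRDD(process)
ssc.start()
ssc.awaitTermination()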
    return ([float(i) for i in row.asDict()[feature_alias].split(",")],
            one_hot_label.tolist())


def extract_label_species(train_df, label_name):
    # Extract the distinct labels and their count
    label_type = [i.asDict()[label_name]
                  for i in train_df.select(label_name).distinct().collect()]
    type_count = len(label_type)
    return label_type, type_count


# Parse command-line arguments
parser = create_arg_parser()
args = parser.parse_args()

# Read the configured number of executors from the Spark application config
sc = SparkContext(conf=SparkConf().setAppName(args.app_name))
sc.setLogLevel("WARN")
hiveContext = HiveContext(sc)
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1

# Number of parameter servers
num_ps = 1

# If the TensorFlow cluster size isn't given on the command line, fall back
# to the number of executors from the Spark configuration
if args.cluster_size is None:
    args.cluster_size = num_executors

print("args:", args)
print("{0} ===== Start".format(datetime.now().isoformat()))
label_name = args.label_name
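# create_arg_parser is not shown; a hypothetical reconstruction based only on
# the attributes used above (args.app_name, args.cluster_size,
# args.label_name):

import argparse

def create_arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--app_name", default="spark_tf_job")
    parser.add_argument("--cluster_size", type=int, default=None)
    parser.add_argument("--label_name", required=True)
    return parser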
# Plain Python logging to stdout
root = logging.getLogger()
root.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
root.addHandler(handler)
root.info("check")

PySpark =>

from awsglue.context import GlueContext
from pyspark.context import SparkContext

sc = SparkContext()
sc.setLogLevel('DEBUG')
glueContext = GlueContext(sc)
logger = glueContext.get_logger()
logger.info('Hello Glue')

SparkScala =>

import com.amazonaws.services.glue.log.GlueLogger

object GlueApp {
  def main(sysArgs: Array[String]) {
    val logger = new GlueLogger
    logger.info("Hello Glue")
  }
}
    files_to_load = []
    for path, subdirs, files in os.walk(data_dir):
        for f_name in files:
            if file_type == 'csv':
                if re.match(r'.*?\.csv$', f_name):
                    files_to_load.append(os.path.join(path, f_name))
            elif file_type in ('orc', 'parquet'):
                regex_string = rf'.*?\.{file_type}$'
                if re.match(regex_string, f_name):
                    files_to_load.append(os.path.join(path, f_name))
    return files_to_load


CONFIG = ConfigContext()
SPARK_CONTEXT = SparkContext('local')
SPARK_CONTEXT.setLogLevel(CONFIG.log_level.upper())
SPARK = SparkSession(SPARK_CONTEXT)  # pylint: disable=undefined-variable


def main():
    '''Main function'''
    files_to_load = read_files(CONFIG.data_dir, CONFIG.file_type)
    desc_string = 'Spark session can be accessed using "SPARK"'
    data_frame = ''
    if files_to_load:
        try:
            if CONFIG.file_type == 'csv':
                data_frame = SPARK.read.csv(files_to_load, header=True,
def main():
    POSTGRES_URL = 'jdbc:postgresql://10.0.0.12:5432/postgres'

    # Configure Spark SQL
    conf = (SparkConf()
            .setAppName("Process")
            .set("spark.executor.instances", "4")
            .set("spark.driver.memory", "50g")
            .set("spark.executor.memory", "6g"))
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder.appName('lead time predictor').getOrCreate()

    # Build the list of cities available in S3
    city_list = get_city_list('hodabnb')
    for city in city_list:
        start = time.time()
        # Build the list of calendar files available for this city
        file_list = get_object_list(city, 'calendar.csv', 'hodabnb')

        # Fetch all of the city's calendar files from S3
        dfs = []
        for file_name in file_list:
            scrape_date = file_name.split('_')[1]
            path = 's3n://hodabnb/' + file_name
            dfs.append(spark.read.format('com.databricks.spark.csv')
                       .options(header='true', inferSchema='true')
                       .load(path)
                       .select('Listing_id', 'date', 'available')
                       .withColumn("scrape_date",
                                   lit(scrape_date).cast(DateType())))

        # Merge all files and process them as one dataframe
        df_all = reduce(DataFrame.unionAll, dfs)
        df_all = df_all.withColumn("date", df_all["date"].cast(DateType()))
        df_all = df_all.withColumn("listing_id",
                                   df_all["listing_id"].cast(IntegerType()))
        # Lead time: days between scrape date and booked date; 999 marks
        # unavailable listings
        df_all = df_all.withColumn(
            'lead_time',
            when(df_all['available'] == 't',
                 datediff(df_all['date'],
                          df_all['scrape_date'])).otherwise(999))
        df_all = df_all.drop('scrape_date', 'available')
        df_all = df_all.dropDuplicates()
        df_all = df_all.groupBy('date', 'listing_id').agg({'lead_time': 'min'})
        df_city = df_all.withColumn("city", lit(city))

        # Write to the database
        df_city.write.format("jdbc") \
            .option("url", POSTGRES_URL) \
            .option("dbtable", "leadtime_history") \
            .option("user", "postgres") \
            .option("password", "postgres") \
            .option("driver", "org.postgresql.Driver") \
            .mode("append") \
            .save()

        end = time.time()
        print("finished job for %s in %s sec" % (city, (end - start)))
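# get_city_list and get_object_list are not shown; a minimal boto3 sketch of
# get_object_list, assuming calendar files live under a per-city prefix in
# the bucket (the key layout is an assumption):

import boto3

def get_object_list(city, suffix, bucket):
    # List keys under the city's prefix and keep those ending with `suffix`
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=city):
        for obj in page.get('Contents', []):
            if obj['Key'].endswith(suffix):
                keys.append(obj['Key'])
    return keys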
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as spark_sum
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
java8_location = '/usr/lib/jvm/java-8-oracle'  # Set your own
os.environ['JAVA_HOME'] = java8_location

sc = SparkContext()
sc.setLogLevel(logLevel="ERROR")
sqlContext = SQLContext(sc)
spark = SparkSession(sc)


class HttpParser:
    def __init__(self, filepath, range_days, test_mode=False):
        self.status_code = 404
        self.amount_to_list = 10
        self.logs_df = []
        self.logs_df_len = 0
        self.status_freq_df = []
        self.status_freq_df_len = 0
        self.status_404 = 0
        self.testMode = test_mode
        self.filepath = filepath
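# The class above evidently parses HTTP access logs (note the 404 counters
# and the regexp_extract import); a minimal sketch of how such logs are
# usually pulled apart, assuming Common Log Format input (the patterns,
# file name, and column names are illustrative, not the class's actual
# implementation):

logs_df = spark.read.text("access.log").select(
    regexp_extract('value', r'^(\S+)', 1).alias('host'),
    regexp_extract('value', r'\s(\d{3})\s', 1).cast('integer').alias('status'))
logs_df.filter(col('status') == 404).count()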
from __future__ import print_function
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

config = SparkConf()
config.setAppName("SPARK_WORD_COUNT_JOB")
config.setMaster("local[*]")
sc = SparkContext(conf=config)
sc.setLogLevel("INFO")

# Classic word count: split lines into words, pair each with 1, sum by key
text_file_rdd = sc.textFile("/home/dharshekthvel/history_1.txt")
flat_mapped_rdd = text_file_rdd.flatMap(lambda each: each.split(' '))
mapped_rdd = flat_mapped_rdd.map(lambda each: (each, 1))
mapped_rdd.reduceByKey(lambda x, y: x + y) \
    .foreach(print)
def initialize():
    global sc, spark, inputfile, t, items, validationfile, dictUid, dictBid, \
        list_unaccounted, dict_code_uid, dict_code_bid, case
    t = time.time()

    sc_conf = SparkConf()
    sc_conf.setAppName("Task2")
    sc_conf.setMaster('local[*]')
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)

    # ------------ Reading evaluation data ------------
    csvread2 = sc.textFile(validationfile)
    columnName2 = csvread2.first().split(',')
    validationData = csvread2.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName2)

    if case == 3:
        implement_case3(items, validationData)
        print("Duration: " + str(time.time() - t))
        return

    if case == 2:
        implement_case2(items, validationData)
        print("Duration: " + str(time.time() - t))
        return

    # ------------ Preprocessing data for training the model ------------
    if case == 1:
        # Map user and business ids to integer codes for ALS
        bid_uid = items.map(lambda u: (u[0], u[1]))
        keys = list(set(bid_uid.keys().collect()))
        values = list(set(bid_uid.values().collect()))
        dictUid = dict(zip(keys, range(0, len(keys))))
        dictBid = dict(zip(values, range(0, len(values))))
        for k, v in dictUid.items():
            dict_code_uid[v] = k
        for k, v in dictBid.items():
            dict_code_bid[v] = k
        ratings = items.map(lambda l: Rating(int(dictUid[l[0]]),
                                             int(dictBid[l[1]]),
                                             float(l[2])))

        # Train the ALS model on the training data
        rank = 2
        lambd = 0.5
        numIterations = 10
        model = ALS.train(ratings, rank, numIterations, lambd)
        print("Total entries in validation data: " +
              str(len(validationData.collect())))

        # Map validation users/businesses to integer codes; pairs with unseen
        # users or businesses are "unaccounted" and get a default rating
        test_on_validation = validationData.map(lambda p: mapData(p))
        validationRating = test_on_validation.filter(
            lambda p: p[0] == 1).map(lambda r: (r[1][0], r[1][1], r[1][2]))
        accountedPairs = test_on_validation.filter(lambda p: p[0] == 1).map(
            lambda r: (r[1][0], r[1][1]))
        UnaccountedPairs = test_on_validation.filter(lambda p: p[0] == 0).map(
            lambda r: ((r[1][0], r[1][1]), 2.75))

        # ------------ Evaluate the model on the validation data ------------
        predictions = model.predictAll(accountedPairs).map(
            lambda r: ((r[0], r[1]), r[2]))
        finalpred = predictions.union(UnaccountedPairs)
        ratesAndPreds = validationRating.map(
            lambda r: ((r[0], r[1]), r[2])).join(finalpred)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
        writeToFile(finalpred)
        rmse = math.sqrt(MSE)
        print("Root Mean Squared Error = " + str(rmse))
        print("Duration: " + str(time.time() - t))
import pyspark.sql.types as T
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import DenseVector
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from helper_1 import *
from multiprocessing import Process

sc = SparkContext()
spark = SparkSession(sc)
sc.setLogLevel('FATAL')

# Create the schema for the data to be loaded
schema = StructType([
    StructField('Age', IntegerType(), nullable=False),
    StructField('workclass', StringType(), nullable=False),
    StructField('fnlwgt', FloatType(), nullable=False),
    StructField('education', StringType(), nullable=False),
    StructField('education-num', FloatType(), nullable=False),
    StructField('marital', StringType(), nullable=False),
    StructField('occupation', StringType(), nullable=False),
    StructField('relationship', StringType(), nullable=False),
    StructField('race', StringType(), nullable=False),
    StructField('sex', StringType(), nullable=False),
    StructField('capital-gain', FloatType(), nullable=False),
    StructField('capital-loss', FloatType(), nullable=False),
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()

# Set the Glue logging level to DEBUG
sc.setLogLevel("DEBUG")

glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
import pyspark
from pyspark.context import SparkContext
from pyspark import SparkConf

conf = SparkConf()
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

# Load the adjacency list file
AdjList1 = sc.textFile("/home/rob/Assignment4/02AdjacencyList.txt")
print(AdjList1.collect())

AdjList2 = AdjList1.map(
    lambda line: line)  # 1. Replace the lambda function with yours
AdjList3 = AdjList2.map(
    lambda x: x)  # 2. Replace the lambda function with yours
AdjList3.persist()
print(AdjList3.collect())

nNumOfNodes = AdjList3.count()
print("Total number of nodes")
print(nNumOfNodes)

# Initialize each page's rank; since we use mapValues, the resulting RDD
# will have the same partitioner as links
print("Initialization")
PageRankValues = AdjList3.mapValues(
    lambda v: v)  # 3. Replace the lambda function with yours
print(PageRankValues.collect())

# Run 30 iterations
print("Run 30 Iterations")
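# The skeleton stops right before the iteration loop; a minimal sketch of
# the classic RDD PageRank iterations, assuming AdjList3 holds
# (node, [neighbors]) pairs and a damping factor of 0.85 (both assumptions):

NUM_ITERATIONS = 30
DAMPING = 0.85
for _ in range(NUM_ITERATIONS):
    # Each node splits its current rank evenly among its out-links...
    contributions = AdjList3.join(PageRankValues).flatMap(
        lambda node_links_rank: [
            (dest, node_links_rank[1][1] / len(node_links_rank[1][0]))
            for dest in node_links_rank[1][0]])
    # ...and each node's new rank is a damped sum of what it receives
    PageRankValues = contributions.reduceByKey(lambda a, b: a + b).mapValues(
        lambda rank_sum: (1 - DAMPING) + DAMPING * rank_sum)
print(PageRankValues.collect())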
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import base64
import sys

sc = SparkContext('local')
sc.setLogLevel('OFF')

# Ignore broken-pipe errors when piping output, e.g. to `head`
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)

spark = SparkSession(sc)
# NOTE: setting configuration on an already-running context has no effect;
# these values would need to go into a SparkConf before the SparkContext is
# created
spark.sparkContext._conf.setAll([("spark.debug.maxToStringFields", "140"),
                                 ("spark.driver.memory", "15g"),
                                 ('spark.executor.cores', '4'),
                                 ('spark.cores.max', '4'),
                                 ('spark.executor.memory', '4g'),
                                 ('spark.sql.codegen.fallback', 'true'),
                                 ('spark.driver.maxResultSize', '10g')])

b64test = base64.b64decode(sys.argv[1])
print(b64test.decode())
sys.exit()
import py4j
import pyspark
from pyspark.context import SparkContext

sc = SparkContext()

# Control our logLevel. This overrides any user-defined log settings.
# Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
sc.setLogLevel("FATAL")

# spark_home is assumed to point at the Spark installation directory
text_file = sc.textFile(spark_home + "/README.md")
word_counts = text_file \
    .flatMap(lambda line: line.split()) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)

print(word_counts.collect())
def main():
    # Instantiate SparkConf and set extraJavaOptions for both executors and
    # the driver
    spark_conf = (SparkConf().set(
        'spark.executor.extraJavaOptions',
        '-Dcom.amazonaws.services.s3.enableV4=true').set(
            'spark.driver.extraJavaOptions',
            '-Dcom.amazonaws.services.s3.enableV4=true'))

    # Instantiate SparkContext based on SparkConf
    sc = SparkContext(conf=spark_conf)

    # Set enableV4 property to access S3 input data
    sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')

    # Create new Hadoop configuration
    hadoopConf = sc._jsc.hadoopConfiguration()

    # Set Hadoop configuration key-value pairs
    if is_not_blank(AWS_ACCESS_KEY_ID):
        hadoopConf.set('fs.s3a.awsAccessKeyId', AWS_ACCESS_KEY_ID)
    if is_not_blank(AWS_SECRET_ACCESS_KEY):
        hadoopConf.set('fs.s3a.awsSecretAccessKey', AWS_SECRET_ACCESS_KEY)
    hadoopConf.set('com.amazonaws.services.s3a.enableV4', 'true')
    hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')

    # Create SparkSession from SparkContext
    spark_session = (
        SparkSession(sc).builder.appName('ComplaintClassificator').config(
            conf=spark_conf).getOrCreate())

    # Timestamp of start
    start_timestamp = dt.now()

    # Instantiate SparkContext and SQLContext
    sc = spark_session.sparkContext
    sql_ctx = SQLContext(sc)

    # Set log level to 'WARN'
    sc.setLogLevel('WARN')

    # Set up log4j logging
    log4j_logger = sc._jvm.org.apache.log4j
    logger = log4j_logger.LogManager.getLogger(__name__)

    # Create schema as a StructType of StructField(s)
    schema = StructType([
        StructField('ReceivedDate', StringType(), True),
        StructField('Product', StringType(), True),
        StructField('Subproduct', StringType(), True),
        StructField('Issue', StringType(), True),
        StructField('Subissue', StringType(), True),
        StructField('ConsumerComplaintNarrative', StringType(), True),
        StructField('CompanyPublicResponse', StringType(), True),
        StructField('CompanyName', StringType(), True),
        StructField('State', StringType(), True),
        StructField('ZipCode', IntegerType(), True),
        StructField('Tags', StringType(), True),
        StructField('IsConsumerConsent', StringType(), True),
        StructField('SubmittedVia', StringType(), True),
        StructField('SentDate', StringType(), True),
        StructField('CompanyResponseToConsument', StringType(), True),
        StructField('IsTimelyResponse', StringType(), True),
        StructField('IsConsumerDisputed', StringType(), True),
        StructField('ComplaintId', IntegerType(), True)
    ])

    logger.warn("Starting preprocessing and data cleansing...")

    # Read Consumer_Complaints.csv and apply the schema
    complaint_df = (spark_session.read.format('csv').option(
        'header', 'true').option('delimiter', ',').option(
            'mode', 'FAILFAST').option('parserLib', 'univocity').option(
                'escape', '"').option('multiLine', 'true').option(
                    'inferSchema', 'false').schema(schema).load(
                        CONSUMER_COMPLAINTS).alias('complaint_df'))

    logger.warn("Explaining complaint_df...")
    complaint_df.explain()
    logger.warn("complaint_df has %d records, %d columns."
                % (complaint_df.count(), len(complaint_df.columns)))
    logger.warn("Printing schema of complaint_df: ")
    complaint_df.printSchema()

    # Register the cleanse_field function as a UDF (UserDefinedFunction)
    udf_cleansed_field = udf(cleanse_field, StringType())

    # Lambda function that parses 'MM/DD/YYYY' date strings into DateType
    change_data_format = udf(lambda x: dt.strptime(x, '%m/%d/%Y'), DateType())

    # Do some clean-up activities
    cleansed_df = (complaint_df.withColumn(
        'Issue',
        udf_cleansed_field(
            complaint_df['ConsumerComplaintNarrative'])).withColumn(
                'ReceivedDate',
                change_data_format(complaint_df['ReceivedDate'])))

    logger.warn("Explaining cleansed_df...")
    cleansed_df.explain()
    logger.warn("cleansed_df has %d records, %d columns."
                % (cleansed_df.count(), len(cleansed_df.columns)))
    logger.warn("Printing schema of cleansed_df: ")
    cleansed_df.printSchema()

    # Reduce the number of fields and filter out null consumer complaint
    # narratives
    final_complaints_df = (cleansed_df.where(
        cleansed_df['ConsumerComplaintNarrative'].isNotNull()).select(
            'ComplaintId', 'ReceivedDate', 'State', 'Product',
            'ConsumerComplaintNarrative',
            'Issue').orderBy(cleansed_df['ReceivedDate']))
    final_complaints_df.registerTempTable("final_complaints_df")

    # Check random ConsumerComplaintNarrative and Issue content
    sql_ctx.sql("""
      SELECT RowNum, ConsumerComplaintNarrative, Issue
      FROM (SELECT ROW_NUMBER() OVER (PARTITION BY State
                                      ORDER BY ReceivedDate DESC) AS RowNum,
                   ConsumerComplaintNarrative, Issue, ReceivedDate, State
            FROM final_complaints_df) fc
      WHERE RowNum = 1
      LIMIT 10
      """).show()

    logger.warn("Explaining final_complaints_df...")
    final_complaints_df.explain()
    logger.warn(
        "final_complaints has %d records, %d columns."
        % (final_complaints_df.count(), len(final_complaints_df.columns)))
    logger.warn("Printing schema of final_complaints_df: ")
    final_complaints_df.printSchema()

    # Read the American states JSON as a states_df DataFrame
    states_df = (spark_session.read.json(AMERICAN_STATES,
                                         multiLine=True).alias('states_df'))

    logger.warn("Explaining states_df...")
    states_df.explain()
    logger.warn("states_df has %d records, %d columns."
                % (states_df.count(), len(states_df.columns)))
    logger.warn("Printing schema of states_df: ")
    states_df.printSchema()

    # Fields to drop (not needed for further processing)
    drop_list = ['state', 'abbreviation']

    # Join complaints with American states, add an id field, and drop the
    # unnecessary fields
    joined_df = (final_complaints_df.join(
        broadcast(states_df),
        col('complaint_df.State') == col('states_df.abbreviation'),
        "left").withColumnRenamed('ConsumerComplaintNarrative',
                                  'ConsumerComplaint').withColumn(
                                      'RowNoIndex',
                                      monotonically_increasing_id()).select(
                                          'Product', 'ConsumerComplaint',
                                          'name').drop(*drop_list))
    joined_df.registerTempTable("joined_df")

    # Check random FullStateName content
    sql_ctx.sql("""
      SELECT RowNum, Product, ConsumerComplaint, FullStateName
      FROM (SELECT ROW_NUMBER() OVER (PARTITION BY Product
                                      ORDER BY ConsumerComplaint DESC) AS RowNum,
                   Product, ConsumerComplaint, name AS FullStateName
            FROM joined_df) jd
      WHERE RowNum = 1
      LIMIT 10
      """).show()

    logger.warn("Explaining joined_df...")
    joined_df.explain()
    logger.warn("joined_df has %d records, %d columns."
                % (joined_df.count(), len(joined_df.columns)))
    logger.warn("Printing schema of joined_df: ")
    joined_df.printSchema()

    # Check the unique labels of the Product attribute before the replace
    joined_df.select('Product').distinct().show()

    # Merge redundant labels of the Product field
    renamed_df = (joined_df.withColumn(
        'Product',
        regexp_replace(
            'Product',
            "Credit reporting, credit repair services, or other personal consumer reports",
            "Credit reporting, repair, or other")
    ).withColumn(
        'Product',
        regexp_replace("Product", "Virtual currency",
                       "Money transfer, virtual currency, or money service")
    ).withColumn(
        'Product',
        regexp_replace("Product", "Money transfer",
                       "Money transfer, virtual currency, or money service")
    ).withColumn(
        'Product',
        regexp_replace("Product", "Payday loan",
                       "Payday loan, title loan, or personal loan")
    ).withColumn(
        'Product',
        regexp_replace("Product", "Credit reporting",
                       "Credit reporting, repair, or other")
    ).withColumn(
        'Product',
        regexp_replace("Product", "Prepaid card",
                       "Credit card or prepaid card")
    ).withColumn(
        'Product',
        regexp_replace("Product", "Credit card",
                       "Credit card or prepaid card")))
    renamed_df.registerTempTable("renamed_df")

    # Check how many unique labels (classes) there are
    sql_ctx.sql("""
      SELECT DISTINCT Product
      FROM renamed_df
      """).show()

    # Check how many times each class occurs in the corpus
    sql_ctx.sql("""
      SELECT Product, count(*)
      FROM renamed_df
      GROUP BY Product
      ORDER BY count(*) DESC""").show(50, False)

    logger.warn("Explaining renamed_df...")
    renamed_df.explain()

    # Check the unique labels of the Product attribute after the replace
    renamed_df.select('Product').distinct().show()
    # Check the number of unique labels after the replace
    logger.warn(str(renamed_df.select('Product').distinct().count()))

    logger.warn("Starting feature extraction...")

    # Tokenize consumer complaint sentences
    tokenizer = Tokenizer(inputCol='ConsumerComplaint', outputCol='Words')
    # Remove stop words
    remover = StopWordsRemover(inputCol='Words', outputCol='FilteredWords')
    # num_features = 700
    hashing_tf = HashingTF(inputCol='FilteredWords', outputCol='RawFeatures')
    # minDocFreq: minimum number of documents a term must appear in to be kept
    idf = IDF(inputCol='RawFeatures', outputCol='features')
    # Instantiate StringIndexer
    product_indexer = StringIndexer(inputCol='Product', outputCol='label')

    # Create a pipeline from the feature extraction stages defined above
    pipeline = Pipeline(
        stages=[tokenizer, remover, hashing_tf, idf, product_indexer])
    # Fit renamed_df to the pipeline
    pipeline_fit = pipeline.fit(renamed_df)
    # Transform pipeline_fit
    data_set = pipeline_fit.transform(renamed_df)

    # Randomly split the data into training and test sets with the requested
    # ratio
    (training_data, test_data) = data_set.randomSplit([0.7, 0.3], seed=100)
    # Cache training_data
    training_data.cache()

    logger.warn("Starting Naive-Bayes...")

    # Naive-Bayes
    nb = NaiveBayes(labelCol='label', featuresCol='features',
                    modelType='multinomial')
    # Create a model without Cross Validation
    nb_model = nb.fit(training_data)
    # Make predictions with the model without Cross Validation
    predictions = nb_model.transform(test_data)

    print("NaiveBayes without CV model type: ", nb.getModelType())
    print("NaiveBayes without CV smoothing factor: ", str(nb.getSmoothing()))

    # NB without CV metrics
    nb_metrics_rdd = MulticlassMetrics(predictions['label', 'prediction'].rdd)

    # NB stats by each class (label)
    labels = predictions.rdd.map(lambda cols: cols.label).distinct().collect()
    logger.warn("Printing NB stats...")
    for label in sorted(labels):
        try:
            print("Class %s precision = %s"
                  % (label, nb_metrics_rdd.precision(label)))
            print("Class %s recall = %s"
                  % (label, nb_metrics_rdd.recall(label)))
            print("Class %s F1 Measure = %s"
                  % (label, nb_metrics_rdd.fMeasure(label, beta=1.0)))
        except Py4JJavaError:
            pass

    # Weighted stats
    print("Weighted recall = %s" % nb_metrics_rdd.weightedRecall)
    print("Weighted precision = %s" % nb_metrics_rdd.weightedPrecision)
    print("Weighted F(1) Score = %s" % nb_metrics_rdd.weightedFMeasure())
    print("Weighted F(0.5) Score = %s"
          % nb_metrics_rdd.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s"
          % nb_metrics_rdd.weightedFalsePositiveRate)

    # Show 10 predictions that weren't predicted successfully
    predictions.filter(predictions['prediction'] != predictions['label']) \
        .select("Product", "ConsumerComplaint", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=20)

    # Show 10 predictions that were predicted successfully
    predictions.filter(predictions['prediction'] == predictions['label']) \
        .select("Product", "ConsumerComplaint", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=20)

    # Instantiate an evaluator for the predictions without Cross Validation
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction")
    # Evaluate the model without the use of Cross Validation
    accuracy_without_cv = evaluator.evaluate(predictions)
    print("Naive-Bayes accuracy without Cross Validation = %s (metric)"
          % str(nb_metrics_rdd.accuracy))

    logger.warn("Starting Cross Validation...")

    # Instantiate ParamGridBuilder for the Cross Validation
    nbp_params_grid = (ParamGridBuilder().addGrid(
        nb.smoothing, [0.8, 0.9, 1.0]).addGrid(
            hashing_tf.numFeatures, [700, 720]).addGrid(
                idf.minDocFreq, [3, 4, 5]).build())
    # Instantiate the evaluator of the model
    nb_evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction')
    # Instantiate a 5-fold CrossValidator
    nb_cv = CrossValidator(estimator=nb,
                           estimatorParamMaps=nbp_params_grid,
                           evaluator=nb_evaluator,
                           numFolds=5)
    # Create a model with Cross Validation
    nb_cv_model = nb_cv.fit(training_data)
    # Make predictions with the Cross Validation model
    cv_predictions = nb_cv_model.transform(training_data)

    # Evaluate the best model found with Cross Validation
    accuracy_with_cv = nb_evaluator.evaluate(cv_predictions)
    print("Naive-Bayes accuracy with Cross Validation:",
          str(accuracy_with_cv))
    print("Improvement for the best fitted model (NB with CV) over plain NB: ",
          str(accuracy_with_cv - nb_metrics_rdd.accuracy))

    # NB with CV metrics
    nb_with_cv_metrics_rdd = MulticlassMetrics(
        cv_predictions['label', 'prediction'].rdd)

    # NB with CV stats by each class (label)
    labels = cv_predictions.rdd.map(
        lambda att: att.label).distinct().collect()
    logger.warn("Printing NB stats...")
    for label in sorted(labels):
        try:
            print("Class %s precision = %s"
                  % (label, nb_with_cv_metrics_rdd.precision(label)))
            print("Class %s recall = %s"
                  % (label, nb_with_cv_metrics_rdd.recall(label)))
            print("Class %s F1 Measure = %s"
                  % (label, nb_with_cv_metrics_rdd.fMeasure(label, beta=1.0)))
        except Py4JJavaError:
            pass

    # Print weighted stats
    print("Weighted recall = %s" % nb_with_cv_metrics_rdd.weightedRecall)
    print("Weighted precision = %s"
          % nb_with_cv_metrics_rdd.weightedPrecision)
    print("Weighted F(1) Score = %s"
          % nb_with_cv_metrics_rdd.weightedFMeasure())
    print("Weighted F(0.5) Score = %s"
          % nb_with_cv_metrics_rdd.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s"
          % nb_with_cv_metrics_rdd.weightedFalsePositiveRate)

    # Show 10 cv_predictions that were predicted successfully
    (cv_predictions.filter(
        cv_predictions['prediction'] == cv_predictions['label']).select(
            'Product', 'ConsumerComplaint', 'probability', 'label',
            'prediction').orderBy('probability',
                                  ascending=False).show(n=10, truncate=20))

    # Show 10 cv_predictions that weren't predicted successfully
    (cv_predictions.filter(
        cv_predictions['prediction'] != cv_predictions['label']).select(
            'Product', 'ConsumerComplaint', 'probability', 'label',
            'prediction').orderBy('probability',
                                  ascending=False).show(n=10, truncate=20))

    # Timestamp of end
    end_timestamp = dt.now()
    # Print elapsed time
    print("Elapsed time: %s" % str(end_timestamp - start_timestamp))

    # Stop SparkSession
    spark_session.stop()
def initialize():
    global sc, spark, items, inputfile, t, m, gidDict, bids, hashedList, n, \
        b, r, candidateTuple, listvala, listvalb
    print("Initializing...")
    t = time.time()

    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    # Columns are: user id, business id, stars
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)

    # Assign an integer id to each user
    userids = list(set(items.keys().collect()))
    k = 0
    for user in userids:
        if user not in gidDict:
            gidDict[user] = k
            k = k + 1
    bids = list(set(items.values().collect()))
    m = len(userids)

    # Random coefficients for the n hash functions
    listvala = random.sample(range(1, m), n)
    listvalb = random.sample(range(1, m), n)

    # Group the user ids that rated each business
    bid_uid = items.map(lambda x: ((x[1], x[0]), 1)).reduceByKey(
        lambda x, y: x + y).map(lambda x: x[0]).groupByKey().mapValues(list)
    bid_uid_hashed = bid_uid.map(lambda x: initialHash(x))

    dict_uniques = {}
    for each in bid_uid.collect():
        dict_uniques[each[0]] = set(each[1])

    # Build the MinHash signature column for each business id
    bid_uid_hashed2 = bid_uid_hashed.map(lambda x: hashing(x))

    # Split each signature into b bands of r rows
    start = 0
    end = r
    hashedListSet = bid_uid_hashed2.collect()
    c = 1
    print("Finding similar pairs...")
    dictionEvery = {}
    while end <= n:
        tempDict = []
        for each in hashedListSet:
            templist = sorted(each[1][start:end])
            tempDict.append((tuple(templist), each[0]))
        dictionEvery[c] = tempDict
        c = c + 1
        start = end
        end = end + r

    # Businesses whose band signatures collide become candidate pairs
    candidateTuple = []
    print("Working on bands 1 to " + str(b))
    for i in range(1, b + 1):
        dictionBand = dictionEvery[i]
        mapper = sc.parallelize(dictionBand).groupByKey().mapValues(
            list).filter(lambda x: len(x[1]) > 1)
        justBid = mapper.map(lambda c: c[1]).collect()
        candidateTuple.append(justBid)

    candidatepairs = []
    for each in candidateTuple:
        for e in each:
            candidatepairs.extend(combinations(sorted(e), 2))

    # Verify the candidates with the exact Jaccard similarity
    print("Finding final Jaccard similarity")
    lines = []
    finalPairs = []
    for each in candidatepairs:
        set1 = dict_uniques[each[0]]
        set2 = dict_uniques[each[1]]
        inter = set1 & set2
        jaccard = float(len(inter)) / float(len(set1.union(set2)))
        if jaccard >= 0.5:
            lines.append([each[0], each[1], jaccard])
            finalPairs.append(each)

    answer = writeToFile(lines)
    print("Total items printed: " + str(answer))
    print("Duration: " + str(time.time() - t))
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split, max

sc = SparkContext('local')
sc.setLogLevel("OFF")
spark = SparkSession(sc)

# Path to our 20 input files
inputPath = "hdfs://localhost:9000/stream/"

# Explicitly set the schema
schema = StructType([
    StructField("ID", StringType(), True),
    StructField("Lang", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Source", StringType(), True),
    StructField("Len", StringType(), True),
    StructField("Likes", StringType(), True),
    StructField("RTs", StringType(), True),
    StructField("Hashtags", StringType(), True),
    StructField("UserMentionNames", StringType(), True),
    StructField("UserMentionID", StringType(), True),
    StructField("name", StringType(), True),
    StructField("Place", StringType(), True),
    StructField("Followers", StringType(), True),
    StructField("Friends", StringType(), True)
])

# Read the semicolon-delimited files as a stream, one file per trigger
inputDF = spark.readStream.schema(schema).option("delimiter", ";").option(
    "maxFilesPerTrigger", 1).csv(inputPath)
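# The explode/split imports suggest the stream is aggregated downstream; a
# minimal sketch of one such query, assuming Hashtags holds a comma-separated
# list and writing to the console sink (both are assumptions):

hashtagCounts = inputDF.select(
    explode(split(inputDF.Hashtags, ",")).alias("hashtag")).groupBy(
        "hashtag").count()

query = hashtagCounts.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()
query.awaitTermination()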
    shifts = running_word_prices.transform(to_shifts)

    # Print the results
    shifts.foreachRDD(print_shifts)


if __name__ == "__main__":
    if len(sys.argv) >= 2 and sys.argv[1] == "test":
        # Run the tests
        del sys.argv[1]
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(appName='unit_test', conf=conf)
        sc.setLogLevel("WARN")
        sc.setCheckpointDir("/tmp")
        unittest.main()
        sc.stop()
    else:
        # Run the main()
        sc = SparkContext(appName="BoostWords")
        sc.setLogLevel("WARN")
        ssc = StreamingContext(sc, 5)
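        # The else branch ends right after creating the StreamingContext; a
        # typical continuation, assuming a main(ssc) function wires up the
        # DStream pipeline (the name is hypothetical):
        main(ssc)
        ssc.start()
        ssc.awaitTermination()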