def getOrCreate(cls, sc):
    """
    Return the shared SQLContext, building one from ``sc`` on first use.

    :param sc: SparkContext
    """
    if cls._instantiatedContext is not None:
        return cls._instantiatedContext

    jvm_sql_context = sc._jvm.SQLContext.getOrCreate(sc._jsc.sc())
    session = SparkSession(sc, jvm_sql_context.sparkSession())
    # The instance is discarded here; presumably the constructor records
    # itself in ``cls._instantiatedContext`` (not visible in this block).
    cls(sc, session, jvm_sql_context)
    return cls._instantiatedContext
def main(argv):
    """Cluster the text files under a path with a TF-IDF + KMeans pipeline.

    Arguments are read by position from ``argv``:
        argv[1]: number of clusters (parsed with ``int``)
        argv[2]: input path handed to ``wholeTextFiles``
        argv[3]: output path for the JSON result
        argv[4]: language of the default stop-word list, e.g. "spanish"

    The result is written as JSON to the output path (overwriting whatever
    is there) with the columns ``prediction`` and ``cluster_docs_list``.
    """
    # Create the Spark context and open a session on it.
    sc = SparkContext(appName="KMeans-Clustering-dhoyoso-dsernae")
    spark = SparkSession(sc)

    language = argv[4]  # e.g. "spanish"
    pathout = argv[3]
    path = argv[2]  # e.g. "hdfs:///user/dhoyoso/datasets/dataset/"
    k = int(argv[1])  # e.g. 4

    # One (path, content) pair per file found under the input path.
    files = sc.wholeTextFiles(path)

    # Two-column DataFrame: the file path and its full text.
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True),
    ])
    df = spark.createDataFrame(files, schema)

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")

    # loadDefaultStopWords only *returns* the word list for the language; it
    # has to be handed to the remover explicitly, otherwise the remover keeps
    # its own default list and the requested language is ignored.
    stopWords = StopWordsRemover(
        inputCol="tokens",
        outputCol="stopWordsRemovedTokens",
        stopWords=StopWordsRemover.loadDefaultStopWords(language))

    # Term frequencies over the remaining tokens, then IDF weighting.
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=2000)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)

    # KMeans is only given k; no feature column is named explicitly here.
    kmeans = KMeans(k=k)

    # Chain the stages, fit them on the documents and apply the fitted model.
    pipeline = Pipeline(stages=[tokenizer, stopWords, hashingTF, idf, kmeans])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()

    # Keep only the document name and its cluster id (prediction).
    # NOTE(review): index 7 assumes a fixed number of '/'-separated components
    # in every file path - confirm against the layout of the input directory.
    split_col = split(results['path'], '/')
    results = results.withColumn('docname', split_col.getItem(7))
    df = results.select("docname", "prediction")

    # Collect the documents of each cluster into one list and write a single
    # JSON part to the output path.
    grouped = df.groupBy(['prediction']).agg(
        collect_list("docname").alias('cluster_docs_list'))
    grouped.coalesce(1).write.json(path=pathout, mode="overwrite")
def __init__(self, sparkContext, jhiveContext=None):
    """Build the context on top of a SparkSession, warning that it is deprecated.

    :param sparkContext: SparkContext to wrap
    :param jhiveContext: optional JVM-side context; when given, its session is
        reused instead of creating a new one
    """
    deprecation_message = (
        "HiveContext is deprecated in Spark 2.0.0. Please use " +
        "SparkSession.builder.enableHiveSupport().getOrCreate() instead.")
    warnings.warn(deprecation_message, DeprecationWarning)

    if jhiveContext is not None:
        session = SparkSession(sparkContext, jhiveContext.sparkSession())
    else:
        # Select the hive catalog on the context's conf before asking the
        # builder for a session.
        sparkContext._conf.set("spark.sql.catalogImplementation", "hive")
        builder = SparkSession.builder._sparkContext(sparkContext)
        session = builder.getOrCreate()

    SQLContext.__init__(self, sparkContext, session, jhiveContext)
def __init__(self):
    """Start a local Spark session and derive the S3 paths used by the job.

    Attributes set:
        app_name, conf, sc, spark: Spark application name, configuration,
            context and session (master ``local[*]``).
        create_datetime_as_str: the current time as ``/YYYY/MM/DD/HH/MM``.
        parquet_base_path, files_base_path: fixed ``s3a://`` locations.
        files_date_path: yesterday's date as ``/YYYY/MM/DD/*/*``.
    """
    self.app_name = 'Spotify Structure Parquet'
    self.conf = SparkConf().setAppName(self.app_name)
    self.conf = self.conf.setMaster("local[*]")
    self.sc = SparkContext(conf=self.conf)
    self.spark = SparkSession(self.sc)

    # Sample the clock once so both derived paths refer to the same instant;
    # two separate now() calls could straddle a minute or day boundary.
    now = datetime.now()
    self.create_datetime_as_str = now.strftime('/%Y/%m/%d/%H/%M')
    self.parquet_base_path = "s3a://spotifybuck/output/parquet/"
    self.files_base_path = "s3a://spotifybuck/"
    self.files_date_path = (now - timedelta(days=1)).strftime('/%Y/%m/%d/*/*')
def __init__(self, sparkContext, jhiveContext=None):
    """Build the context on top of a SparkSession, warning that it is deprecated.

    :param sparkContext: SparkContext to wrap
    :param jhiveContext: optional JVM-side context; when given, its session is
        reused, otherwise a Hive-enabled session is obtained from the builder
    """
    deprecation_message = (
        "HiveContext is deprecated in Spark 2.0.0. Please use " +
        "SparkSession.builder.enableHiveSupport().getOrCreate() instead.")
    warnings.warn(deprecation_message, DeprecationWarning)

    if jhiveContext is not None:
        session = SparkSession(sparkContext, jhiveContext.sparkSession())
    else:
        hive_builder = SparkSession.builder.enableHiveSupport()
        session = hive_builder.getOrCreate()

    SQLContext.__init__(self, sparkContext, session, jhiveContext)
def init_session(self):
    """
    Configure and start the Spark context and Spark session.

    Nothing is returned; the context is stored on ``self.sc`` and the
    session on ``self.ss``. The executor core count is taken from
    ``self.n_spark_workers``.
    """
    spark_conf = pyspark.SparkConf()
    settings = [
        ("spark.dynamicAllocation.enabled", "True"),
        ("spark.executor.cores", str(self.n_spark_workers)),
    ]
    spark_conf = spark_conf.setAll(settings)

    self.sc = SparkContext(conf=spark_conf)
    self.ss = SparkSession(self.sc)
def method1(spark_context: SparkContext, database_URL: str):
    """Load the ``RATING`` table over JDBC and return it as a DataFrame.

    :param spark_context: context the session is created on
    :param database_URL: JDBC URL passed as the ``url`` option
    """
    print('fetching jdbc dataframe...')

    session = SparkSession(spark_context)

    # Apply the reader options one by one, in the original order.
    jdbc_options = (
        ("url", database_URL),
        ("dbtable", "RATING"),
        ("driver", "oracle.jdbc.driver.OracleDriver"),
        ("fetchSize", "5001"),
    )
    reader = session.read.format("jdbc")
    for option_name, option_value in jdbc_options:
        reader = reader.option(option_name, option_value)

    return reader.load()
def init_spark(dynamic_allocation="True", executor_cores="2"):
    """
    Start Spark with the given allocation settings.

    Args:
        dynamic_allocation: value for ``spark.dynamicAllocation.enabled``
        executor_cores: value for ``spark.executor.cores``

    Returns:
        The pair ``(sc, spark)``: the Spark context and the session on it.
    """
    settings = [
        ("spark.dynamicAllocation.enabled", dynamic_allocation),
        ("spark.executor.cores", executor_cores),
    ]
    spark_conf = pyspark.SparkConf().setAll(settings)

    context = SparkContext(conf=spark_conf)
    session = SparkSession(context)
    return context, session
def get_spark_context():
    """Restart Spark with the SmartPlanner settings and return a session.

    Despite the name, the value returned is the SparkSession, not the
    SparkContext. Any context that is already running is stopped first.
    """
    running_context = pyspark.SparkContext.getOrCreate()

    spark_conf = pyspark.SparkConf()
    spark_conf.setAppName('SmartPlanner')
    # Alternative master used in the original as a commented-out line: 'local[*]'
    spark_conf.setMaster('spark://10.2.68.52:7077')
    extra_settings = (
        ('spark.executor.memory', '8g'),
        ('spark.executor.cores', '3'),
        ('spark.cores.max', '9'),
        ('spark.logConf', True),
    )
    for setting_name, setting_value in extra_settings:
        spark_conf.set(setting_name, setting_value)

    # Replace the running context with one built from the new configuration.
    running_context.stop()
    fresh_context = pyspark.SparkContext(conf=spark_conf)

    session = SparkSession(fresh_context)
    session.catalog.clearCache()
    return session
def __init__(self, sparkContext: SparkContext, jhiveContext: Optional[JavaObject] = None):
    """Build the context on top of a SparkSession, warning that it is deprecated.

    When ``jhiveContext`` is given its session is reused; otherwise the hive
    catalog is selected on the context's conf and a session is obtained from
    the builder.
    """
    deprecation_message = (
        "HiveContext is deprecated in Spark 2.0.0. Please use " +
        "SparkSession.builder.enableHiveSupport().getOrCreate() instead.")
    warnings.warn(deprecation_message, FutureWarning)

    if jhiveContext is not None:
        session = SparkSession(sparkContext, jhiveContext.sparkSession())
    else:
        sparkContext._conf.set(  # type: ignore[attr-defined]
            "spark.sql.catalogImplementation", "hive")
        builder = SparkSession.builder._sparkContext(  # type: ignore[attr-defined]
            sparkContext)
        session = builder.getOrCreate()

    SQLContext.__init__(self, sparkContext, session, jhiveContext)
def getOrCreate(cls, sc):
    """
    Return the shared SQLContext, building one from ``sc`` on first use.

    :param sc: SparkContext

    .. note:: Deprecated in 3.0.0. Use :func:`SparkSession.builder.getOrCreate()` instead.
    """
    warnings.warn(
        "Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.",
        DeprecationWarning)

    if cls._instantiatedContext is not None:
        return cls._instantiatedContext

    jvm_builder = sc._jvm.SparkSession.builder().sparkContext(sc._jsc.sc())
    jvm_sql_context = jvm_builder.getOrCreate().sqlContext()
    session = SparkSession(sc, jvm_sql_context.sparkSession())
    # The instance is discarded here; presumably the constructor records
    # itself in ``cls._instantiatedContext`` (not visible in this block).
    cls(sc, session, jvm_sql_context)
    return cls._instantiatedContext
def _test():
    """Run the doctests of ``pyspark.sql.conf`` against a local Spark context.

    Changes the working directory to ``$SPARK_HOME``, exposes ``sc`` and
    ``spark`` to the doctests, and exits with status -1 when any doctest
    fails.
    """
    import doctest
    import os
    import sys
    from pyspark.context import SparkContext
    from pyspark.sql.session import SparkSession
    import pyspark.sql.conf

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.conf.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        globs['sc'] = sc
        globs['spark'] = SparkSession(sc)
        (failure_count, test_count) = doctest.testmod(pyspark.sql.conf, globs=globs)
    finally:
        # Stop the context even when the doctest run itself raises.
        sc.stop()
    if failure_count:
        # sys.exit instead of the interactive ``exit`` helper, which is only
        # installed by the ``site`` module.
        sys.exit(-1)
def fetchspark():
    """Load the CSV at ``settings.CSVFILE`` with Spark and index it in Redis.

    For every distinct, null-free (id, brand, colors, dateAdded) row this
    writes a hash keyed by the id and adds the id to three sorted sets, all
    scored by the ``dateAdded`` timestamp in milliseconds.
    """
    datafolder = settings.CSVFILE

    # Connection settings for Redis; replace with your own configuration.
    redis_host = "localhost"
    redis_port = 6379
    redis_password = ""
    # One client for the whole run instead of a new client per row.
    client = redis.StrictRedis(host=redis_host,
                               port=redis_port,
                               password=redis_password,
                               db=0,
                               charset="utf-8",
                               decode_responses=True)

    def redisConnect(a, b, c, d, e):
        """Store one row: a=id, b=brand, c=colors, d=dateAdded, e=score."""
        try:
            client.hmset(a, {"id": a, "brand": b, "colors": c, "dateAdded": d})
            client.zadd("daterelatedID", {a: e})
            client.zadd("ColorRealatedID", {a + "::" + c.lower(): e})
            client.zadd("BrandRealatedID", {a + "::" + b: e})
        except Exception as err:
            # Best effort: report the failure and carry on with the next row.
            # Named ``err`` so it does not shadow the score parameter ``e``.
            print(err)

    conf = pyspark.SparkConf()
    conf.set('spark.ui.showConsoleProgress', False)
    sc = SparkContext.getOrCreate(conf)
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)

    Employee_rdd = spark.read.csv(datafolder, header="true").select(
        "id", "brand", "colors", "dateAdded").distinct()

    # collect() brings every remaining row to the driver before writing.
    for row in Employee_rdd.dropna().rdd.collect():
        # NOTE(review): the parsed datetime is naive, so timestamp() interprets
        # it in the local timezone although the format ends in a literal 'Z' -
        # confirm whether UTC was intended.
        dateman = datetime.datetime.strptime(row.dateAdded,
                                             '%Y-%m-%dT%H:%M:%SZ')
        millisec = dateman.timestamp() * 1000
        redisConnect(row["id"], row["brand"], row["colors"],
                     row["dateAdded"], int(millisec))
def loadData(self):
    """Read ``self.inputFileName`` and return up to 10000 rows of floats.

    Each line is split on single spaces, converted to floats, passed through
    a five-column ``FloatType`` DataFrame (f1..f5) and converted back to
    plain lists of floats.
    """
    def _as_floats(values):
        return [float(value) for value in values]

    raw_lines = self.sc.textFile(self.inputFileName)
    numeric_rows = raw_lines.map(lambda line: line.split(" ")).map(_as_floats)

    # Only f1..f5 are active; the original carried f6..f10 as commented-out
    # fields.
    column_names = ("f1", "f2", "f3", "f4", "f5")
    schema = StructType(
        [StructField(name, FloatType(), True) for name in column_names])

    frame = SparkSession(self.sc).createDataFrame(numeric_rows, schema)
    return frame.rdd.map(_as_floats).take(10000)
def main():
    """Extract the 2000-2019 data, cast both year columns to double and show it."""
    startyear = 2000
    endyear = 2019

    sc = SparkContext("local", "BigDataProject")
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)

    result = extract_data(spark, startyear=startyear, endyear=endyear)

    # The columns named after the first and last year are cast to double.
    for year in (startyear, endyear):
        year_column = str(year)
        result = result.withColumn(year_column,
                                   result[year_column].cast(DoubleType()))

    result = result.na.fill(0)
    result.show()
    print(result)
def _test():
    """Run the doctests of ``pyspark.sql.catalog`` against a local Spark context.

    Changes the working directory to ``$SPARK_HOME``, exposes ``sc`` and
    ``spark`` to the doctests, and exits with status -1 when any doctest
    fails.
    """
    import doctest
    import os
    import sys
    from pyspark.context import SparkContext
    from pyspark.sql.session import SparkSession
    import pyspark.sql.catalog

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.catalog.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        globs['sc'] = sc
        globs['spark'] = SparkSession(sc)
        (failure_count, test_count) = doctest.testmod(
            pyspark.sql.catalog,
            globs=globs,
            optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    finally:
        # Stop the context even when the doctest run itself raises.
        sc.stop()
    if failure_count:
        # sys.exit instead of the interactive ``exit`` helper, which is only
        # installed by the ``site`` module.
        sys.exit(-1)
def TF_IDF():
    """Split sentences on whitespace with Tokenizer and build TF-IDF features.

    Returns the pair ``(rescaledData, cvModel)``: the DataFrame carrying the
    IDF-weighted ``features`` column and the fitted CountVectorizer model.
    """
    sentence_list = Cul_Freq()

    spark_conf = SparkConf().setMaster("local").setAppName("NewsAnalys")
    spark = SparkSession(SparkContext(conf=spark_conf))

    labelled = spark.createDataFrame(sentence_list).toDF("label", "sentence")
    sentenceData = labelled.distinct()

    wordsData = Tokenizer(inputCol="sentence",
                          outputCol="words").transform(sentenceData)

    # Train the IDF model on the term counts.
    cvModel = CountVectorizer(inputCol="words",
                              outputCol="rawFeatures",
                              minDF=2).fit(wordsData)
    counted = cvModel.transform(wordsData)
    idfModel = IDF(inputCol="rawFeatures", outputCol="features").fit(counted)

    return idfModel.transform(counted), cvModel
def __init__(self, feature_dataset_path):
    """Init the recommendation engine and load the training features.

    ``feature_dataset_path`` is accepted but not read in this method: the
    features are streamed from a fixed S3 object through the AWS CLI and
    handed to ``formatFeaturesDF``.

    Raises:
        subprocess.CalledProcessError: if the AWS CLI exits with a non-zero
            status.
    """
    logger.info("Starting up the Recommendation Engine: ")

    self.sc = SparkContext.getOrCreate()
    self.sparkSession = SparkSession(self.sc)
    self.sqlContext = SQLContext(self.sc)

    # Argument list without a shell: nothing is parsed by /bin/sh.
    # Credentials are left to the AWS CLI's own configuration; no keys are
    # kept in source.
    # NOTE(review): access keys that were previously hard-coded in this
    # method should be treated as exposed and rotated.
    command = ['aws', 's3', 'cp', 's3://darwin-vs/train_no1.csv', '-']
    # stderr is merged into the captured output, so anything the CLI prints
    # on stderr ends up in the data passed on below.
    s3_folder_data = subprocess.check_output(command,
                                             stderr=subprocess.STDOUT)

    logger.info("Loading Features data...")
    self.trainNormalizeFeatures = self.formatFeaturesDF(
        s3_folder_data, "TrainNormFeatures")
def main():
    """Pivot the online-retail dataset: summed Quantity per StockCode and Country.

    Reads the CSV with header and schema inference, shows it, shows the
    pivoted table with missing cells filled with 0, then stops the context.
    """
    sc = SparkContext("local", "dataframe app")
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)

    # Load the retail dataset.
    retail_data = spark.read.option("inferSchema", "true").option(
        "header", "true"
    ).csv(
        "/Users/faizan/Documents/Masters/2nd_Semester/Big_Data/Tutorial/Tutorials/Tutorial04/online-retail-dataset.csv"
    )
    retail_data.show()

    result = retail_data.groupBy("StockCode").pivot("Country").sum(
        "Quantity").orderBy("StockCode")
    # Assign the filled DataFrame back; the original used '-' here, which
    # subtracts two DataFrames instead of storing the result.
    result = result.na.fill(0)
    result.show()

    sc.stop()
def __init__(self, sc, feature_dataset_path):
    """Init the recommendation engine given a Spark context and a dataset path.

    Reads ``feature_dataset_path`` as text, parses each partition with
    ``csv.reader``, stores the result of ``formatFeaturesDF`` on
    ``self.trainNormalizeFeatures`` and shows it.
    """
    self.sc = sc

    logger.info("Starting up the Recommendation Engine: ")

    self.sparkSession = SparkSession(self.sc)
    self.sqlContext = SQLContext(self.sc)

    logger.info("Loading Features data...")
    featuresRawRDD = self.sc.textFile(feature_dataset_path)

    logger.info("Loading Features RDD data...")
    # Parse the CSV lines of each partition into lists of fields.
    featuresRdd = featuresRawRDD.mapPartitions(lambda x: csv.reader(x))

    self.trainNormalizeFeatures = self.formatFeaturesDF(
        featuresRdd, "TrainNormFeatures")
    # Call show() directly. The Python 2 ``print`` statement that wrapped it
    # is a syntax error on Python 3 and only added the call's return value to
    # the output (presumably None - show() does its own printing).
    self.trainNormalizeFeatures.show()
def startStreaming():
    """Read the Kafka topic as a stream and pass every row to ``processRow``.

    Subscribes to ``kafkaTopic`` on ``kafkaHost:kafkaPort`` from the latest
    offsets and waits up to 200 on the query via ``awaitTermination``. An
    ImportError is reported and ends the process with status 1.
    """
    try:
        from pyspark import SparkContext
        from pyspark import SparkConf
        from pyspark.python.pyspark.shell import sc
        from pyspark.sql import SQLContext
        from pyspark.sql.session import SparkSession
        from pyspark.sql.functions import to_json, struct, from_json
        from pyspark.sql.types import StringType, StructType, StructField

        spark = SparkSession(sc)

        kafka_reader = spark.readStream.format("kafka")
        kafka_reader = kafka_reader.option("kafka.bootstrap.servers",
                                           kafkaHost + ":" + kafkaPort)
        kafka_reader = kafka_reader.option("startingOffsets", "latest")
        kafka_reader = kafka_reader.option("subscribe", kafkaTopic)
        df = kafka_reader.load()

        # Work with key and value as strings.
        streamedDf = df.selectExpr("CAST(key AS STRING)",
                                   "CAST(value AS STRING)")

        queryProcess = streamedDf.writeStream.foreach(processRow).start()
        queryProcess.awaitTermination(200)
    except ImportError as e:
        print("Can not import Spark Modules", e)
        sys.exit(1)
def setSpark(self):
    """Create (or reuse) a local Spark context and store a session on it.

    The session is stored on ``self.sparkSession``. ``getOrCreate`` is used,
    so an already-running context is reused.
    """
    # Spark configuration.
    conf = SparkConf()
    # Reduce the number of local cores if you hit Java memory errors.
    conf.setMaster("local[4]")
    # Executor memory is not important when running on local:
    # conf.set("spark.executor.memory", "32g")
    conf.set("spark.driver.memory", "20g")
    # Key spelling fixed: the original "spark.dirver.maxResultSize" does not
    # match the Spark property name, so the 6g limit never took effect.
    conf.set("spark.driver.maxResultSize", "6g")
    # Can be increased to work around Java memory issues.
    conf.set("spark.memory.offHeap.size", "18g")
    # conf.set("spark.executor.extraJavaOptions", "-Xmx1024m")
    conf.set("spark.executor.extraJavaOptions", "-XX:+UseCompressedOops")
    # conf.set("spark.cores.max", "2")
    # conf.set("spark.driver.extraClassPath",
    #     driver_home+'/jdbc/postgresql-9.4-1201-jdbc41.jar:'\
    #     +driver_home+'/jdbc/clickhouse-jdbc-0.1.52.jar:'\
    #     +driver_home+'/mongo/mongo-spark-connector_2.11-2.2.3.jar:'\
    #     +driver_home+'/mongo/mongo-java-driver-3.8.0.jar')

    sc = SparkContext.getOrCreate(conf)
    self.sparkSession = SparkSession(sc)
def run_spark(complaints_file_path, output_path):
    """Aggregate the complaints file and save the result as text.

    :param complaints_file_path: input file read with ``textFile``
    :param output_path: directory passed to ``saveAsTextFile``
    """
    sc = SparkContext()
    # The session is created but not used in this function; the call is kept
    # in case other code relies on a session existing.
    SparkSession(sc)

    print("Reading input file:", complaints_file_path)
    complaint_info = sc.textFile(complaints_file_path, use_unicode=True).cache()
    print("Number of partitions: ", complaint_info.getNumPartitions())

    # The original also built an unused second copy of the first stage
    # (``complaints``); it was never read and has been dropped.
    # The chain is parenthesised so no line-continuation backslashes are needed.
    results = (
        complaint_info.mapPartitionsWithIndex(extract_complaints)
        # Combine the values of identical keys.
        .reduceByKey(lambda x, y: x + y)
        # Re-key on the first two key components; wrap the value in a list.
        .map(lambda x: ((x[0][0], x[0][1]), [x[1]]))
        # Concatenate the per-key lists.
        .reduceByKey(lambda x, y: x + y)
        # (total, number of entries, rounded percentage of the largest entry)
        .map(lambda x: (x[0], (sum(x[1]),
                               len(x[1]),
                               round(max(x[1]) * 100 / sum(x[1])))))
        .sortByKey()
        .map(toCSVLine)
    )

    print("Saving to: ", output_path)
    results.saveAsTextFile(output_path)
def spark_session_setup(memory_limit='4G'):
    """
    Create a Spark context (application name 'word_count') and return it.

    The optional ``memory_limit`` is applied to the executor memory, the
    driver memory and the driver's max result size. A SparkSession is
    created on the context as well, but only the context is returned.
    >>> sc = spark_session_setup('12G')
    """
    spark_conf = pyspark.SparkConf()
    settings = (
        ('spark.logConf', 'true'),  # needed to be able to change the log level
        ('spark.executor.memory', memory_limit),
        ('spark.driver.memory', memory_limit),
        ('spark.driver.maxResultSize', memory_limit),
    )
    for setting_name, setting_value in settings:
        spark_conf.set(setting_name, setting_value)

    context = pyspark.SparkContext(appName='word_count', conf=spark_conf)
    context.setLogLevel("ERROR")

    # Session is created for its side effects only; it is not returned.
    SparkSession(context)
    return context
def startStreaming():
    """Stream the CSV files under ``inputPath`` into the Kafka topic.

    Each record is sent with the ``dateAdded`` value formatted as
    ``yyyy-MM-dd`` for the key and a JSON object of dateAdded, id, brand and
    colors for the value. Blocks until the query terminates. An ImportError
    is reported and ends the process with status 1.
    """
    try:
        from pyspark import SparkContext
        from pyspark import SparkConf
        from pyspark.python.pyspark.shell import sc
        from pyspark.sql import SQLContext
        from pyspark.sql.functions import date_format
        from pyspark.sql.functions import to_json, struct
        from pyspark.sql.session import SparkSession
        import json
        from pyspark.sql.types import TimestampType, StringType, StructType, StructField

        spark = SparkSession(sc)

        columns = ['dateAdded', 'id', 'brand', 'colors']

        # Every CSV column is read as a nullable string.
        csv_field_names = ("id", "dateAdded", "dateUpdated", "asins", "brand",
                           "categories", "primaryCategories", "colors")
        schema = StructType([
            StructField(field_name, StringType(), True)
            for field_name in csv_field_names
        ])

        csv_reader = spark.readStream.schema(schema)
        csv_reader = csv_reader.option("sep", ",")
        csv_reader = csv_reader.option("header", "true")
        csv_reader = csv_reader.option("enforceSchema", "true")
        df = csv_reader.csv(inputPath)

        key_column = date_format(df.dateAdded, "yyyy-MM-dd").alias('key')
        value_column = to_json(struct([df[x] for x in columns])).alias("value")

        kafka_writer = df.select(key_column, value_column).writeStream.format("kafka")
        kafka_writer = kafka_writer.option("kafka.bootstrap.servers",
                                           kafkaHost + ":" + kafkaPort)
        kafka_writer = kafka_writer.option("topic", kafkaTopic)
        kafka_writer = kafka_writer.option("checkpointLocation", "checkpoint")
        read = kafka_writer.start()

        read.awaitTermination()
        print("Successfully Streamed")
    except ImportError as e:
        print("Can not import Spark Modules", e)
        sys.exit(1)
def getOrCreate(cls, sc):
    """
    Return the shared SQLContext, building a new one from ``sc`` when none
    exists or when the recorded one no longer has a JVM context.

    .. versionadded:: 1.6.0

    .. deprecated:: 3.0.0
        Use :func:`SparkSession.builder.getOrCreate()` instead.

    Parameters
    ----------
    sc : :class:`SparkContext`
    """
    warnings.warn(
        "Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.",
        FutureWarning)

    # The second test is only evaluated when a context has been recorded.
    needs_new_context = (
        cls._instantiatedContext is None
        or SQLContext._instantiatedContext._sc._jsc is None)

    if needs_new_context:
        jvm_builder = sc._jvm.SparkSession.builder().sparkContext(sc._jsc.sc())
        jvm_sql_context = jvm_builder.getOrCreate().sqlContext()
        session = SparkSession(sc, jvm_sql_context.sparkSession())
        # The instance is discarded here; presumably the constructor records
        # itself in ``cls._instantiatedContext`` (not visible in this block).
        cls(sc, session, jvm_sql_context)

    return cls._instantiatedContext
def main():
    """Train on the selected dataset and save the predictions.

    Command-line parameters are parsed by ``set_parameters``; the large
    dataset is used when ``dataset == 'l'``, the small one otherwise.
    """
    dataset, features_used, num_trees, max_depth = set_parameters(sys.argv)

    # Restart Spark so the settings below apply to a fresh context.
    SparkContext.getOrCreate().stop()
    spark_settings = [
        ('spark.executor.memory', '7000M'),
        ('spark.rpc.message.maxSize', '2047'),
        ('spark.ui.showConsoleProgress', 'true'),
    ]
    conf = SparkConf().setAll(spark_settings)
    sc = SparkContext.getOrCreate(conf=conf)
    print(sc.getConf().getAll())
    spark = SparkSession(sc)

    # URLs of the train features, test features and train labels.
    if dataset == 'l':
        source_urls = (
            'https://storage.googleapis.com/uga-dsp/project1/files/X_train.txt',
            'https://storage.googleapis.com/uga-dsp/project1/files/X_test.txt',
            'https://storage.googleapis.com/uga-dsp/project1/files/y_train.txt',
        )
    else:
        source_urls = (
            'https://storage.googleapis.com/uga-dsp/project1/files/X_small_train.txt',
            'https://storage.googleapis.com/uga-dsp/project1/files/X_small_test.txt',
            'https://storage.googleapis.com/uga-dsp/project1/files/y_small_train.txt',
        )
    X_train_path, X_test_path, y_path = source_urls

    # Labels first, then the features of both sets.
    y_train = get_labels(y_path)
    X_train, X_test = get_features(X_train_path, X_test_path, features_used)

    # Combine features and labels into the training matrix, then train,
    # predict and save.
    train_data = create_training_matrix(X_train, y_train)
    predict_and_save(sc, train_data, X_test, num_trees, max_depth)
def train_model(self):
    """Fit the tweet-sentiment pipeline and keep it on ``self.pipelineFit``.

    Training data is read from ``data/twitter_sentiments_train.csv`` relative
    to the current working directory. The pipeline is RegexTokenizer ->
    StopWordsRemover -> Word2Vec -> LogisticRegression.
    """
    spark = SparkSession(SparkContext(appName="PySparkShell"))

    train_data_schema = tp.StructType([
        tp.StructField(name="id", dataType=tp.IntegerType(), nullable=True),
        tp.StructField(name="label", dataType=tp.IntegerType(), nullable=True),
        tp.StructField(name="tweet", dataType=tp.StringType(), nullable=True),
    ])

    LOCAL_ROOT = os.path.abspath("data") + os.sep
    train_csv_path = LOCAL_ROOT + "twitter_sentiments_train.csv"
    train_data = spark.read.csv(train_csv_path,
                                schema=train_data_schema,
                                header=True)

    tokenizer = RegexTokenizer(inputCol="tweet",
                               outputCol="tokens",
                               pattern="\\W")
    stop_word_filter = StopWordsRemover(inputCol="tokens",
                                        outputCol="filtered_words")
    embedder = Word2Vec(inputCol="filtered_words",
                        outputCol="vector",
                        vectorSize=100)
    classifier = LogisticRegression(featuresCol="vector", labelCol="label")

    pipeline = Pipeline(
        stages=[tokenizer, stop_word_filter, embedder, classifier])
    self.pipelineFit = pipeline.fit(train_data)
    print("Done!")
from pyspark.sql import Row from pyspark.sql.functions import udf from pyspark.ml.feature import CountVectorizer import re from pyspark.sql.types import ArrayType, StringType import pickle from pyspark.ml.feature import VectorAssembler from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.tree import GradientBoostedTrees from pyspark.mllib.tree import RandomForest import pandas as pd import pandas # creating the spark context sc = SparkContext.getOrCreate() spark = SparkSession(sc) def byte_feature_extraction(lists, file, feature, track, value): for hexa_file_num, f in enumerate(file): inputs = requests.get( "https://storage.googleapis.com/uga-dsp/project1/data/bytes/" + f + ".bytes", stream=True) for hexa_file in inputs.iter_lines(): if len(hexa_file) == 0: continue hexa_file = hexa_file.split() for byte in hexa_file[1:]:
def main():
    """Run the access-log case study and report each step through log4j.

    The log file is read as text, parsed with ``my_search`` and then analysed
    in ten numbered steps; every step writes a "Done!" message (and its
    result, where one is computed) to the JVM logger. Nothing is returned.
    """
    set_pandas_options()
    app_name = "Case Study 2"
    conf = SparkConf().setAppName(app_name)
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    # Log through the JVM's log4j so messages land in the Spark log.
    log4jLogger = sc._jvm.org.apache.log4j
    LOGGER = log4jLogger.LogManager.getLogger(__name__)
    LOGGER.info("pyspark script logger initialized")
    # 1 Load file as a text file in spark
    LOG_ = get_hdfs_filepath('access.log')
    # CLEAN_LOG is assigned but not read again in this function.
    CLEAN_LOG = get_hdfs_filepath('access.clean.log')
    # read text file, dropping empty lines
    text_file = sc.textFile(LOG_).filter(lambda row: row != '')
    # NOTE(review): the layout of the parsed rows comes from my_search, which
    # is not visible here; below, index 0 is used as the URL, index 2 as the
    # date and index 5 as the status code.
    split_rdd = text_file.map(lambda row: my_search(row))
    LOGGER.info("\n\n1.\tLoad file as a text file in spark\tDone!\n")
    # 2 Find out how many 404 HTTP codes are in access logs
    count = split_rdd.filter(lambda row: row[5] == 404).count()
    LOGGER.info(
        "\n\n2.\tFind out how many 404 HTTP codes are in access logs\tDone!\n\n{}\n\n"
        .format(count))
    # 3 Find out which URLs are broken
    # Number of occurrences per row[0] value.
    url_count_rdd = split_rdd.map(lambda row: (row[0], 1)).reduceByKey(
        lambda x, y: x + y)
    start_time = time.time()
    # The counts are iterated on the driver and handed to load_all; its result
    # is sorted by the second field, descending, and converted to pandas.
    zz = load_all(url_count_rdd.toLocalIterator(),
                  sc).sortBy(lambda a: -int(a[1])).toDF(
                      ["url", "count", "result"]).toPandas()
    LOGGER.info(
        "\n\n3.\tFind out which URLs are broken\tDone!\n\n{}\n{}\n".format(
            zz, "--- %s seconds ---" % (time.time() - start_time)))
    # 4 Verify there are no null columns in the original dataset
    # Only the message is logged; no check is performed in this function.
    LOGGER.info(
        "\n\n4.\tVerify there are no null columns in the original dataset\tDone!\n\n"
    )
    # 5 Replace null values with constants such as 0
    # Only the message is logged; no replacement is performed in this function.
    LOGGER.info(
        "\n\n5.\tReplace null values with constants such as 0\tDone!\n\n")
    # 6 Parse timestamp to readable date
    date_codes = split_rdd.map(lambda row: (row[2], row[5]))
    # NOTE(review): the rows are 2-tuples but only one column name is given to
    # toDF - confirm this behaves as intended; only the first pandas column is
    # kept.
    dates = date_codes.toDF(['readable_date']).toPandas().iloc[:, 0]
    LOGGER.info(
        "\n\n6.\tParse timestamp to readable date\tDone!\n\n{}\n".format(
            dates.head(10)))
    # 7 Describe which HTTP status values appear in data and how many
    codes = date_codes.map(lambda line: line[1])
    # Occurrences per status code, most frequent first.
    code_counts = codes.map(lambda code: (code, 1)).reduceByKey(
        lambda x, y: x + y).sortBy(lambda a: -a[1])
    df = code_counts.toDF(['code', 'count']).toPandas()
    LOGGER.info(
        "\n\n7.\tDescribe which HTTP status values appear in data and how many\tDone!\n\n{}\n"
        .format(df))
    # 8 Display as chart the above stat in chart in Zeppelin notebook
    responses = df.iloc[:, 0].values
    counts = df.iloc[:, 1].values
    plt.rcdefaults()
    y_pos = np.arange(len(responses))
    # Bar chart of the counts with a logarithmic y axis.
    plt.bar(responses, counts, align='center', alpha=0.5, log=True)
    # NOTE(review): the bars are positioned at the code values while the ticks
    # are placed at 0..n-1 - confirm the labels line up with the bars.
    plt.xticks(y_pos, responses)
    plt.ylabel('Counts')
    plt.xlabel('Codes')
    plt.title('Log counts per status code')
    plt.show()
    LOGGER.info(
        "\n\n8.\tDisplay as chart the above stat in chart in Zeppelin notebook\tDone!\n\n"
    )
    # 9 How many unique hosts are there in the entire log and their average request
    # The per-URL counts are summed per host (netloc of the URL); no average
    # is computed here.
    hosts = url_count_rdd.map(lambda row: (urlparse(row[0]).netloc, row[1]))
    hosts_counts = hosts.reduceByKey(lambda x, y: x + y).sortBy(
        lambda a: -a[1])
    df = hosts_counts.toDF(['host', 'count']).toPandas()
    LOGGER.info(
        "\n\n9.\tHow many unique hosts are there in the entire log and their average request\tDone!\n\n{}\n"
        .format(df))
    # 10 Only the message is logged for this step.
    LOGGER.info(
        "\n\n10.\tCreate a spark-submit application for the same and print the findings in the log\tDone!\n\n"
    )