Example #1
    def getOrCreate(cls, sc):
        """
        Get the existing SQLContext or create a new one with given SparkContext.

        :param sc: SparkContext
        """
        if cls._instantiatedContext is None:
            jsqlContext = sc._jvm.SQLContext.getOrCreate(sc._jsc.sc())
            sparkSession = SparkSession(sc, jsqlContext.sparkSession())
            cls(sc, sparkSession, jsqlContext)
        return cls._instantiatedContext
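A minimal usage sketch for the classmethod above, assuming a local master is acceptable; repeated calls return the cached instance rather than building a new JVM SQLContext:

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf().setMaster("local[*]").setAppName("sqlcontext-demo")
sc = SparkContext.getOrCreate(conf)          # reuse or start a SparkContext
sqlContext = SQLContext.getOrCreate(sc)      # first call builds and caches the instance
assert SQLContext.getOrCreate(sc) is sqlContext  # later calls return the same object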
Example #2
def main(argv):

    # instantiate the Spark context.
    sc = SparkContext(appName="KMeans-Clustering-dhoyoso-dsernae")
    # start the Spark session.
    spark = SparkSession(sc)
    # store the language whose stop words will be removed.
    language = argv[4]  #"spanish"
    # store the output path for the clusters.
    pathout = argv[3]
    # store the path the files will be read from.
    path = argv[2]  #"hdfs:///user/dhoyoso/datasets/dataset/"
    # store the desired number of clusters.
    k = int(argv[1])  #4
    # list the files to process from the input path.
    files = sc.wholeTextFiles(path)
    # define the dataframe schema: 2 columns, one for the path and one for the text.
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    # build the dataframe from the schema and the files.
    df = spark.createDataFrame(files, schema)
    # tokenize the text using the ML Tokenizer class.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    # remove the stop words from the tokens, telling the remover which language we are dealing with
    # (the list from loadDefaultStopWords must be passed in explicitly, otherwise it is ignored).
    stopWords = StopWordsRemover(inputCol="tokens",
                                 outputCol="stopWordsRemovedTokens",
                                 stopWords=StopWordsRemover.loadDefaultStopWords(language))
    # hash the remaining tokens with HashingTF.
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=2000)
    # apply IDF to the HashingTF output.
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
    # initialize KMeans with the desired k (it reads the IDF "features" column).
    kmeans = KMeans(k=k)
    # build the pipeline of transformations.
    pipeline = Pipeline(stages=[tokenizer, stopWords, hashingTF, idf, kmeans])
    # fit the pipeline, feeding the dataframe in at the start of the transformations.
    model = pipeline.fit(df)
    # run the mapped transformations and keep the result.
    results = model.transform(df)
    results.cache()
    # split the path to keep only the document name alongside its cluster (prediction).
    split_col = split(results['path'], '/')
    results = results.withColumn('docname', split_col.getItem(7))
    df = results.select("docname", "prediction")

    # group the documents of the same cluster into cluster_docs_list and write them to the output path as JSON.
    grouped = df.groupBy(['prediction']).agg(
        collect_list("docname").alias('cluster_docs_list'))
    grouped.coalesce(1).write.json(path=pathout, mode="overwrite")
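The entry point above reads its settings positionally from argv (k, input path, output path, stop-word language); a hedged invocation sketch, with the script name and output path as placeholders:

import sys

if __name__ == "__main__":
    # e.g. spark-submit kmeans_clustering.py 4 hdfs:///user/dhoyoso/datasets/dataset/ hdfs:///tmp/clusters_out spanish
    main(sys.argv)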
Example #3
 def __init__(self, sparkContext, jhiveContext=None):
     warnings.warn(
         "HiveContext is deprecated in Spark 2.0.0. Please use " +
         "SparkSession.builder.enableHiveSupport().getOrCreate() instead.",
         DeprecationWarning)
     if jhiveContext is None:
         sparkContext._conf.set("spark.sql.catalogImplementation", "hive")
         sparkSession = SparkSession.builder._sparkContext(sparkContext).getOrCreate()
     else:
         sparkSession = SparkSession(sparkContext, jhiveContext.sparkSession())
     SQLContext.__init__(self, sparkContext, sparkSession, jhiveContext)
Example #4
 def __init__(self):
     self.app_name = 'Spotify Structure Parquet'
     self.conf = SparkConf().setAppName(self.app_name)
     self.conf = self.conf.setMaster("local[*]")
     self.sc = SparkContext(conf=self.conf)
     self.spark = SparkSession(self.sc)
     self.create_datetime_as_str = datetime.now().strftime(
         '/%Y/%m/%d/%H/%M')
     self.parquet_base_path = "s3a://spotifybuck/output/parquet/"
     self.files_base_path = "s3a://spotifybuck/"
     self.files_date_path = (datetime.now() -
                             timedelta(days=1)).strftime('/%Y/%m/%d/*/*')
Example #5
File: context.py Project: yin-bp/spark
 def __init__(self, sparkContext, jhiveContext=None):
     warnings.warn(
         "HiveContext is deprecated in Spark 2.0.0. Please use " +
         "SparkSession.builder.enableHiveSupport().getOrCreate() instead.",
         DeprecationWarning)
     if jhiveContext is None:
         sparkSession = SparkSession.builder.enableHiveSupport(
         ).getOrCreate()
     else:
         sparkSession = SparkSession(sparkContext,
                                     jhiveContext.sparkSession())
     SQLContext.__init__(self, sparkContext, sparkSession, jhiveContext)
Example #6
    def init_session(self):
        """
        Configure and initialize spark context and spark session.
        Returns: SparkSession, SparkContext objects as class variables
        """

        config = pyspark.SparkConf().setAll([
            ("spark.dynamicAllocation.enabled", "True"),
            ("spark.executor.cores", str(self.n_spark_workers))
        ])
        self.sc = SparkContext(conf=config)
        self.ss = SparkSession(self.sc)
Example #7
def method1(spark_context: SparkContext, database_URL: str):
    print('fetching jdbc dataframe...')
    # Create a SparkSession
    spark = SparkSession(spark_context)
    # Create a DataFrame object
    jdbc_df = spark.read \
        .format("jdbc") \
        .option("url", database_URL) \
        .option("dbtable", "RATING") \
        .option("driver", "oracle.jdbc.driver.OracleDriver") \
        .option("fetchSize", "5001") \
        .load()

    return jdbc_df
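A usage sketch for method1 above, assuming the script runs under spark-submit with the Oracle JDBC driver jar on the classpath; the connection URL is a hypothetical placeholder:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
ratings_df = method1(sc, "jdbc:oracle:thin:scott/tiger@//dbhost:1521/ORCLPDB1")  # hypothetical URL
ratings_df.printSchema()
ratings_df.show(5)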
Example #8
def init_spark(dynamic_allocation="True", executor_cores="2"):
    """
	Initialize Spark context

	Returns: Spark session object
	"""

    config = pyspark.SparkConf().setAll([
        ("spark.dynamicAllocation.enabled", dynamic_allocation),
        ("spark.executor.cores", executor_cores)
    ])
    sc = SparkContext(conf=config)
    spark = SparkSession(sc)

    return sc, spark
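A short usage sketch for init_spark above, assuming it is launched through spark-submit (the SparkConf it builds does not set a master); both the SparkContext and the SparkSession wrapping it are returned:

sc, spark = init_spark(dynamic_allocation="False", executor_cores="4")
print(spark.range(5).count())  # 5
sc.stop()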
Example #9
File: computing.py Project: Y1Liu/PFP
def get_spark_context():
    sc = pyspark.SparkContext.getOrCreate()
    conf = pyspark.SparkConf()
    conf.setAppName('SmartPlanner')
    conf.setMaster('spark://10.2.68.52:7077')
    #conf.setMaster('local[*]')
    conf.set('spark.executor.memory', '8g')
    conf.set('spark.executor.cores', '3')
    conf.set('spark.cores.max', '9')
    conf.set('spark.logConf', True)
    sc.stop()
    sc = pyspark.SparkContext(conf=conf)
    spark = SparkSession(sc)
    spark.catalog.clearCache()
    return spark
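A short sketch of using the session returned by get_spark_context, assuming the standalone master configured above is reachable (the function stops any pre-existing context before rebuilding it with those cluster settings):

spark = get_spark_context()
df = spark.range(1000)
print(df.selectExpr("sum(id)").first()[0])  # 499500
spark.sparkContext.stop()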
Example #10
 def __init__(self,
              sparkContext: SparkContext,
              jhiveContext: Optional[JavaObject] = None):
     warnings.warn(
         "HiveContext is deprecated in Spark 2.0.0. Please use " +
         "SparkSession.builder.enableHiveSupport().getOrCreate() instead.",
         FutureWarning)
     if jhiveContext is None:
         sparkContext._conf.set(  # type: ignore[attr-defined]
             "spark.sql.catalogImplementation", "hive")
         sparkSession = SparkSession.builder._sparkContext(  # type: ignore[attr-defined]
             sparkContext).getOrCreate()
     else:
         sparkSession = SparkSession(sparkContext,
                                     jhiveContext.sparkSession())
     SQLContext.__init__(self, sparkContext, sparkSession, jhiveContext)
Example #11
    def getOrCreate(cls, sc):
        """
        Get the existing SQLContext or create a new one with given SparkContext.

        :param sc: SparkContext

        .. note:: Deprecated in 3.0.0. Use :func:`SparkSession.builder.getOrCreate()` instead.
        """
        warnings.warn(
            "Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.",
            DeprecationWarning)

        if cls._instantiatedContext is None:
            jsqlContext = sc._jvm.SparkSession.builder().sparkContext(
                sc._jsc.sc()).getOrCreate().sqlContext()
            sparkSession = SparkSession(sc, jsqlContext.sparkSession())
            cls(sc, sparkSession, jsqlContext)
        return cls._instantiatedContext
Example #12
File: conf.py Project: zhudebin/spark
def _test():
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql.session import SparkSession
    import pyspark.sql.conf

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.conf.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    (failure_count, test_count) = doctest.testmod(pyspark.sql.conf,
                                                  globs=globs)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #13
def fetchspark():
    datafolder = settings.CSVFILE

    #r = redis.StrictRedis(host=redis_host, port=redis_port, password=redis_password, db=0,charset="utf-8",decode_responses=True)
    def redisConnect(a, b, c, d, e):

        # step 2: define our connection information for Redis
        # Replaces with your configuration information
        #print(a,"|",b,c,d,e)
        redis_host = "localhost"
        redis_port = 6379
        redis_password = ""
        try:
            value1 = ""
            value2 = ""
            r = redis.StrictRedis(host=redis_host,
                                  port=redis_port,
                                  password=redis_password,
                                  db=0,
                                  charset="utf-8",
                                  decode_responses=True)
            #r.flushall()
            r.hmset(a, {"id": a, "brand": b, "colors": c, "dateAdded": d})
            r.zadd("daterelatedID", {a: e})
            value1 = a + "::" + c.lower()
            r.zadd("ColorRealatedID", {value1: e})
            value2 = a + "::" + b
            r.zadd("BrandRealatedID", {value2: e})
        except Exception as e:
            print(e)

    conf = pyspark.SparkConf()
    conf.set('spark.ui.showConsoleProgress', False)
    sc = SparkContext.getOrCreate(conf)
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)
    Employee_rdd = spark.read.csv(datafolder, header="true").select(
        "id", "brand", "colors", "dateAdded").distinct()
    for row in Employee_rdd.dropna().rdd.collect():
        dateman = datetime.datetime.strptime(row.dateAdded,
                                             '%Y-%m-%dT%H:%M:%SZ')
        millisec = dateman.timestamp() * 1000
        redisConnect(row["id"], row["brand"], row["colors"], row["dateAdded"],
                     int(millisec))
Example #14
 def loadData(self):
     data = self.sc.textFile(self.inputFileName)
     data = data.map(lambda line: line.split(" ")).map(
         lambda l: [float(i) for i in l])
     schema = StructType([
         StructField("f1", FloatType(), True),
         StructField("f2", FloatType(), True),
         StructField("f3", FloatType(), True),
         StructField("f4", FloatType(), True),
         StructField("f5", FloatType(), True),
         #                 StructField("f6", FloatType(), True),
         #                 StructField("f7", FloatType(), True),
         #                 StructField("f8", FloatType(), True),
         #                 StructField("f9", FloatType(), True),
         #                 StructField("f10", FloatType(), True)
     ])
     data = SparkSession(self.sc).createDataFrame(data, schema)
     data = data.rdd.map(lambda line: [float(i) for i in line]).take(10000)
     return data
Example #15
def main():

    startyear=2000
    endyear=2019

    sc = SparkContext("local", "BigDataProject")
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)


    result = extract_data(spark,startyear=startyear,endyear=endyear)

    result = result.withColumn(str(startyear), result[str(startyear)].cast(DoubleType()))
    result = result.withColumn(str(endyear), result[str(endyear)].cast(DoubleType()))

    result = result.na.fill(0)

    result.show()
    print(result)
Example #16
def _test():
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql.session import SparkSession
    import pyspark.sql.catalog

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.catalog.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.catalog,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #17
def TF_IDF():
    """ 使用 Tokenizer按空格分割句子,形成dataframe"""
    sentence_list = Cul_Freq()
    conf = SparkConf().setMaster("local").setAppName("NewsAnalys")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    sentenceData = spark.createDataFrame(sentence_list).toDF(
        "label", "sentence").distinct()
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    """ idf模型训练 """
    countVector = CountVectorizer(inputCol="words",
                                  outputCol="rawFeatures",
                                  minDF=2)
    cvModel = countVector.fit(wordsData)
    cv_df = cvModel.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(cv_df)
    rescaledData = idfModel.transform(cv_df)
    return rescaledData, cvModel
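A hedged sketch of consuming what TF_IDF returns, assuming Cul_Freq and the ML imports used above are available: the rescaled DataFrame carries the TF-IDF vectors, and the CountVectorizerModel exposes the learned vocabulary:

rescaledData, cvModel = TF_IDF()
print(cvModel.vocabulary[:10])        # terms kept by the CountVectorizer (minDF=2)
rescaledData.select("label", "features").show(5, truncate=False)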
Example #18
    def __init__(self, feature_dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path
        """
        import os
        #os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.4.1 pyspark-shell'
        logger.info("Starting up the Recommendation Engine: ")
        self.sc = SparkContext.getOrCreate()
        self.sparkSession = SparkSession(self.sc)
        self.sqlContext = SQLContext(self.sc)
        command = 'aws s3 cp s3://darwin-vs/train_no1.csv -'
        s3_folder_data = subprocess.check_output(command,
                                                 stderr=subprocess.STDOUT,
                                                 shell=True)
        AWS_ACCESS_KEY = 'AKIAJ6IIHKANYPNE5X6A'
        AWS_SECRET_KEY = '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'

        logger.info("Loading Features data...")

        self.trainNormalizeFeatures = self.formatFeaturesDF(
            s3_folder_data, "TrainNormFeatures")
Example #19
def main():
    sc = SparkContext("local", "dataframe app")
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)

    #load the retail dataset:
    retail_data = spark.read.option("inferSchema", "true").option(
        "header", "true"
    ).csv(
        "/Users/faizan/Documents/Masters/2nd_Semester/Big_Data/Tutorial/Tutorials/Tutorial04/online-retail-dataset.csv"
    )

    retail_data.show()

    result = retail_data.groupBy("StockCode").pivot("Country").sum(
        "Quantity").orderBy("StockCode")
    result = result.na.fill(0)
    result.show()

    sc.stop()
Example #20
    def __init__(self, sc, feature_dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path
        """
        self.sc = sc

        logger.info("Starting up the Recommendation Engine: ")
        #self.sc = SparkContext.getOrCreate()
        self.sparkSession = SparkSession(self.sc)
        self.sqlContext = SQLContext(self.sc)

        logger.info("Loading Features data...")
        featuresRawRDD = self.sc.textFile(feature_dataset_path)
        logger.info("Loading Features RDD data...")

        #Map Read CSV file and Map to RDD
        featuresRdd = featuresRawRDD.mapPartitions(lambda x: csv.reader(x))
        #featuresss = featuresRdd.map(lambda x : x.encode("ascii", "ignore"))
        # print featuresRdd.collect()
        self.trainNormalizeFeatures = self.formatFeaturesDF(
            featuresRdd, "TrainNormFeatures")
        self.trainNormalizeFeatures.show()
Example #21
def startStreaming():
    try:
        from pyspark import SparkContext
        from pyspark import SparkConf
        from pyspark.python.pyspark.shell import sc
        from pyspark.sql import SQLContext
        from pyspark.sql.session import SparkSession
        from pyspark.sql.functions import to_json, struct, from_json
        from pyspark.sql.types import StringType, StructType, StructField
        spark = SparkSession(sc)
        df = spark.readStream.format("kafka").option(
            "kafka.bootstrap.servers", kafkaHost + ":" + kafkaPort).option(
                "startingOffsets", "latest").option("subscribe",
                                                    kafkaTopic).load()
        streamedDf = df.selectExpr("CAST(key AS STRING)",
                                   "CAST(value AS STRING)")
        queryProcess = streamedDf.writeStream.foreach(processRow).start()
        queryProcess.awaitTermination(200)
    except ImportError as e:
        print("Can not import Spark Modules", e)
        sys.exit(1)
Example #22
 def setSpark(self):
     #spark configurations
     conf = SparkConf()
     conf.setMaster(
         "local[4]"
     )  # if you get a java memory error, you can reduce the number of running cores
     #    conf.set("spark.executor.memory", "32g")          # this isn't important when you run locally
     conf.set("spark.driver.memory", "20g")
     conf.set("spark.driver.maxResultSize", "6g")
     conf.set("spark.memory.offHeap.size",
              "18g")  # you can increase this to overcome java memory issues
     #    conf.set("spark.executor.extraJavaOptions", "-Xmx1024m")
     conf.set("spark.executor.extraJavaOptions", "-XX:+UseCompressedOops")
     #    conf.set("spark.cores.max", "2")
     #    conf.set("spark.driver.extraClassPath",
     #             driver_home+'/jdbc/postgresql-9.4-1201-jdbc41.jar:'\
     #             +driver_home+'/jdbc/clickhouse-jdbc-0.1.52.jar:'\
     #             +driver_home+'/mongo/mongo-spark-connector_2.11-2.2.3.jar:'\
     #             +driver_home+'/mongo/mongo-java-driver-3.8.0.jar')
     sc = SparkContext.getOrCreate(conf)
     self.sparkSession = SparkSession(sc)
Example #23
def run_spark(complaints_file_path, output_path):
    sc = SparkContext()
    spark = SparkSession(sc)

    print("Reading input file:", complaints_file_path)
    complaint_info = sc.textFile(complaints_file_path,
                                 use_unicode=True).cache()
    print("Number of partitions: ", complaint_info.getNumPartitions())

    complaints = complaint_info.mapPartitionsWithIndex(extract_complaints)

    results = complaint_info.mapPartitionsWithIndex(extract_complaints) \
        .reduceByKey(lambda x,y: x+y) \
        .map(lambda x: ( (x[0][0],x[0][1]), [x[1]] ) ) \
        .reduceByKey(lambda x,y: x+y) \
        .map(lambda x: ( x[0], ( sum(x[1]), len(x[1]), round(max(x[1])*100/sum(x[1])) ) ) ) \
        .sortByKey() \
        .map(toCSVLine)

    print("Saving to: ", output_path)
    results.saveAsTextFile(output_path)
Example #24
def spark_session_setup(memory_limit='4G'):
    """
    creates a spark context. 
    optional parameter is memory limit.
    >>> sc = spark_session_setup('12G')
    """

    # in order to be bale to change log level
    conf = pyspark.SparkConf()
    conf.set('spark.logConf', 'true')
    conf.set('spark.executor.memory', memory_limit)
    conf.set('spark.driver.memory', memory_limit)
    conf.set('spark.driver.maxResultSize', memory_limit)

    # create a spark session
    sc = pyspark.SparkContext(appName='word_count', conf=conf)

    # change log level to ERROR
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)
    return sc
Example #25
def startStreaming():
    try:
        from pyspark import SparkContext
        from pyspark import SparkConf
        from pyspark.python.pyspark.shell import sc
        from pyspark.sql import SQLContext
        from pyspark.sql.functions import date_format
        from pyspark.sql.functions import to_json, struct
        from pyspark.sql.session import SparkSession
        import json
        from pyspark.sql.types import TimestampType, StringType, StructType, StructField
        spark = SparkSession(sc)
        columns = ['dateAdded', 'id', 'brand', 'colors']
        schema = StructType([
            StructField("id", StringType(), True),
            StructField("dateAdded", StringType(), True),
            StructField("dateUpdated", StringType(), True),
            StructField("asins", StringType(), True),
            StructField("brand", StringType(), True),
            StructField("categories", StringType(), True),
            StructField("primaryCategories", StringType(), True),
            StructField("colors", StringType(), True)
        ])
        df = spark.readStream.schema(schema).option("sep", ",").option(
            "header", "true").option("enforceSchema", "true").csv(inputPath)
        read = df.select(
            date_format(df.dateAdded, "yyyy-MM-dd").alias('key'),
            to_json(struct([
                df[x] for x in columns
            ])).alias("value")).writeStream.format("kafka").option(
                "kafka.bootstrap.servers", kafkaHost + ":" + kafkaPort).option(
                    "topic", kafkaTopic).option("checkpointLocation",
                                                "checkpoint").start()
        read.awaitTermination()
        print("Successfully Streamed")
    except ImportError as e:
        print("Can not import Spark Modules", e)
        sys.exit(1)
Example #26
    def getOrCreate(cls, sc):
        """
        Get the existing SQLContext or create a new one with given SparkContext.

        .. versionadded:: 1.6.0

        .. deprecated:: 3.0.0
            Use :func:`SparkSession.builder.getOrCreate()` instead.

        Parameters
        ----------
        sc : :class:`SparkContext`
        """
        warnings.warn(
            "Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.",
            FutureWarning)

        if (cls._instantiatedContext is None
                or SQLContext._instantiatedContext._sc._jsc is None):
            jsqlContext = sc._jvm.SparkSession.builder().sparkContext(
                sc._jsc.sc()).getOrCreate().sqlContext()
            sparkSession = SparkSession(sc, jsqlContext.sparkSession())
            cls(sc, sparkSession, jsqlContext)
        return cls._instantiatedContext
Example #27
def main():

    dataset, features_used, num_trees, max_depth = set_parameters(sys.argv)


    #Default to use all features in case of invalid parameters.


    #Setting parameters for Spark
    sc = SparkContext.getOrCreate()
    sc.stop()
    conf = SparkConf().setAll([('spark.executor.memory', '7000M'), ('spark.rpc.message.maxSize', '2047'), ('spark.ui.showConsoleProgress', 'true')])
    sc = SparkContext.getOrCreate(conf=conf)
    print(sc.getConf().getAll())
    spark = SparkSession(sc)
    
    # Setting URLs for the files
    if dataset == 'l':
        X_train_path = 'https://storage.googleapis.com/uga-dsp/project1/files/X_train.txt'
        X_test_path = 'https://storage.googleapis.com/uga-dsp/project1/files/X_test.txt'
        y_path = 'https://storage.googleapis.com/uga-dsp/project1/files/y_train.txt'
    else:
        X_train_path = 'https://storage.googleapis.com/uga-dsp/project1/files/X_small_train.txt'
        X_test_path = 'https://storage.googleapis.com/uga-dsp/project1/files/X_small_test.txt'
        y_path = 'https://storage.googleapis.com/uga-dsp/project1/files/y_small_train.txt'
    #Creating a list of y labels
    y_train = get_labels(y_path)
    
    # Extracting features from the train and test sets
    X_train, X_test = get_features(X_train_path, X_test_path, features_used)
    
    # Creating a training matrix by combining features and labels
    train_data = create_training_matrix(X_train, y_train)
    
    # Passing training data to the model and outputting predictions
    predict_and_save(sc, train_data, X_test, num_trees, max_depth)
Example #28
    def train_model(self):
        sc = SparkContext(appName="PySparkShell")
        spark = SparkSession(sc)

        train_data_schema = tp.StructType([
            tp.StructField(name="id", dataType=tp.IntegerType(),
                           nullable=True),
            tp.StructField(name="label",
                           dataType=tp.IntegerType(),
                           nullable=True),
            tp.StructField(name="tweet",
                           dataType=tp.StringType(),
                           nullable=True),
        ])

        LOCAL_ROOT = os.path.abspath("data") + os.sep

        train_data = spark.read.csv(
            LOCAL_ROOT + "twitter_sentiments_train.csv",
            schema=train_data_schema,
            header=True,
        )

        stage_1 = RegexTokenizer(inputCol="tweet",
                                 outputCol="tokens",
                                 pattern="\\W")
        stage_2 = StopWordsRemover(inputCol="tokens",
                                   outputCol="filtered_words")
        stage_3 = Word2Vec(inputCol="filtered_words",
                           outputCol="vector",
                           vectorSize=100)
        model = LogisticRegression(featuresCol="vector", labelCol="label")
        pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, model])
        self.pipelineFit = pipeline.fit(train_data)

        print("Done!")
Example #29
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.ml.feature import CountVectorizer
import re
import requests
from pyspark.sql.types import ArrayType, StringType
import pickle
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest
import pandas as pd

# creating the spark context
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)


def byte_feature_extraction(lists, file, feature, track, value):

    for hexa_file_num, f in enumerate(file):
        inputs = requests.get(
            "https://storage.googleapis.com/uga-dsp/project1/data/bytes/" + f +
            ".bytes",
            stream=True)
        for hexa_file in inputs.iter_lines():
            if len(hexa_file) == 0:
                continue
            hexa_file = hexa_file.split()

            for byte in hexa_file[1:]:
Example #30
def main():

    set_pandas_options()
    app_name = "Case Study 2"

    conf = SparkConf().setAppName(app_name)
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    log4jLogger = sc._jvm.org.apache.log4j
    LOGGER = log4jLogger.LogManager.getLogger(__name__)
    LOGGER.info("pyspark script logger initialized")

    # 1 Load file as a text file in spark
    LOG_ = get_hdfs_filepath('access.log')
    CLEAN_LOG = get_hdfs_filepath('access.clean.log')
    # read text file
    text_file = sc.textFile(LOG_).filter(lambda row: row != '')
    split_rdd = text_file.map(lambda row: my_search(row))
    LOGGER.info("\n\n1.\tLoad file as a text file in spark\tDone!\n")

    # 2 Find out how many 404 HTTP codes are in access logs
    count = split_rdd.filter(lambda row: row[5] == 404).count()
    LOGGER.info(
        "\n\n2.\tFind out how many 404 HTTP codes are in access logs\tDone!\n\n{}\n\n"
        .format(count))

    # 3 Find out which URLs are broken
    url_count_rdd = split_rdd.map(lambda row: (row[0], 1)).reduceByKey(
        lambda x, y: x + y)
    start_time = time.time()

    zz = load_all(url_count_rdd.toLocalIterator(),
                  sc).sortBy(lambda a: -int(a[1])).toDF(
                      ["url", "count", "result"]).toPandas()
    LOGGER.info(
        "\n\n3.\tFind out which URLs are broken\tDone!\n\n{}\n{}\n".format(
            zz, "--- %s seconds ---" % (time.time() - start_time)))

    # 4 Verify there are no null columns in the original dataset
    LOGGER.info(
        "\n\n4.\tVerify there are no null columns in the original dataset\tDone!\n\n"
    )

    # 5 Replace null values with constants such as 0
    LOGGER.info(
        "\n\n5.\tReplace null values with constants such as 0\tDone!\n\n")

    # 6 Parse timestamp to readable date
    date_codes = split_rdd.map(lambda row: (row[2], row[5]))
    dates = date_codes.toDF(['readable_date']).toPandas().iloc[:, 0]
    LOGGER.info(
        "\n\n6.\tParse timestamp to readable date\tDone!\n\n{}\n".format(
            dates.head(10)))

    # 7 Describe which HTTP status values appear in data and how many
    codes = date_codes.map(lambda line: line[1])
    code_counts = codes.map(lambda code: (code, 1)).reduceByKey(
        lambda x, y: x + y).sortBy(lambda a: -a[1])
    df = code_counts.toDF(['code', 'count']).toPandas()
    LOGGER.info(
        "\n\n7.\tDescribe which HTTP status values appear in data and how many\tDone!\n\n{}\n"
        .format(df))

    # 8 Display as chart the above stat in chart in Zeppelin notebook
    responses = df.iloc[:, 0].values
    counts = df.iloc[:, 1].values
    plt.rcdefaults()
    y_pos = np.arange(len(responses))
    plt.bar(responses, counts, align='center', alpha=0.5, log=True)
    plt.xticks(y_pos, responses)
    plt.ylabel('Counts')
    plt.xlabel('Codes')
    plt.title('Log counts per status code')
    plt.show()
    LOGGER.info(
        "\n\n8.\tDisplay as chart the above stat in chart in Zeppelin notebook\tDone!\n\n"
    )

    # 9 How many unique hosts are there in the entire log and their average request
    hosts = url_count_rdd.map(lambda row: (urlparse(row[0]).netloc, row[1]))
    hosts_counts = hosts.reduceByKey(lambda x, y: x + y).sortBy(
        lambda a: -a[1])
    df = hosts_counts.toDF(['host', 'count']).toPandas()
    LOGGER.info(
        "\n\n9.\tHow many unique hosts are there in the entire log and their average request\tDone!\n\n{}\n"
        .format(df))

    LOGGER.info(
        "\n\n10.\tCreate a spark-submit application for the same and print the findings in the log\tDone!\n\n"
    )