Example #1
def get_data():
    findspark.add_packages('mysql:mysql-connector-java:8.0.11')
    spark = SparkSession.builder.appName('pipeline').getOrCreate()
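    # Load the MySQL credentials from a local JSON file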
    with open('/home/tom/mysql_creds.json', 'r') as f:
        data = json.load(f)

    hostname = 'localhost'
    jdbcPort = 3306
    username = data['username']  # assumed to come from the same credentials file
    password = data['password']
    dbname = 'my_company'
    jdbc_url = "jdbc:mysql://{0}:{1}/{2}?user={3}&password={4}".format(
        hostname, jdbcPort, dbname, username, password)
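    # Wrap the query in a derived-table alias so it can be passed as the JDBC dbtable option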
    query = "(select * from syria_data) t1_alias"
    df = spark.read.format('jdbc').options(driver='com.mysql.jdbc.Driver',
                                           url=jdbc_url,
                                           dbtable=query).load()
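    # Persist the query result locally as a Parquet file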
    df.write.parquet('/home/tom/Documents/csv_files/syria_parquet.parquet')
Example #2
def load_data():
    # adding JDBC driver to connect to MySQL
    findspark.add_packages('mysql:mysql-connector-java:8.0.11')
    spark = SparkSession.builder.appName('pipeline').config(
        "spark.ui.port", "4050").getOrCreate()
    hostname = "localhost"
    dbname = "dag_data"
    jdbcPort = 3306
    username = "******"
    password = SQL_PASSWORD
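    # Assemble the JDBC connection URL for the target MySQL database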
    jdbc_url = "jdbc:mysql://{0}:{1}/{2}?user={3}&password={4}".format(
        hostname, jdbcPort, dbname, username, password)
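    # Load the Parquet file that will be exported to MySQL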
    df = spark.read.parquet(
        '/home/tom/Documents/csv_files/book_parquet.parquet')
    df.write.format('jdbc').options(
        url=jdbc_url,
        driver='com.mysql.jdbc.Driver',
        dbtable='bookstore_data',
        user=username,
        password=password).mode('overwrite').save()  #saving to MySQL
Example #3
import findspark

try:
    from pyspark import context
except ImportError:
    # Add PySpark to the library path based on the value of SPARK_HOME if
    # pyspark is not already in our path
    findspark.init()

findspark.add_packages(['com.databricks:spark-csv_2.10:1.4.0'])
Example #4
import json
import sys, os, re, ast
import findspark
# Add the streaming package and initialize
findspark.add_packages(
    ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.0"])
findspark.init()
import pyspark
import pyspark.streaming
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def main():
    PERIOD = 10
    BROKERS = 'localhost:9092'
    TOPIC = 'twitterstream'
    duration = 100
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(appName='Streamer', conf=conf)
    #create a streaming context with batch interval 10 sec
    ssc = StreamingContext(sc, PERIOD)
    #ssc.checkpoint("checkpoint")
    stream = KafkaUtils.createDirectStream(ssc, [TOPIC], {
        "metadata.broker.list": BROKERS,
    })

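    # Each Kafka message value is double-encoded JSON, so it is decoded twice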
    tweets = stream.map(lambda x: json.loads(x[1])).map(
        lambda x: json.loads(x))
    text = tweets.map(lambda x: x['text'])
Example #5
def main(base_path):

    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #

    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in [
                "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
                "Dest", "Route"
        ]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
            "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
Example #6
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

from analyze.datefrme_mongo import Trade

SPARK_HOME = '/Users/luodongshen/Documents/soft/spark-3.0.0-bin-hadoop3.2'
import findspark
findspark.add_packages('org.mongodb.spark:mongo-spark-connector_2.12:3.0.0')
findspark.init(SPARK_HOME)

from pyspark.sql import SparkSession

import datetime
import logging

import base
import constant
from analyze.GrahamTendency import GrahamPeTTM, CHINA_AAA
import orm.mongobase as om


spark = SparkSession.builder.appName('MyApp')\
        .config('spark.mongodb.input.uri', 'mongodb://127.0.0.1/stock.k_data') \
        .getOrCreate()

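# Schema with a code field and a start/end date range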
schema = StructType([
    StructField("code", StringType()),
    StructField("start_date", StringType()),
    StructField("end_date", StringType())
])

Example #7
def main(base_path):

    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #

    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to MongoDB
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to MongoDB
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
Example #8
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import mysql_connect_class
from generator.server_log_generator import ServerLogGenerator
import findspark

# Use findspark in case the MySQL connector package is not found
findspark.add_packages('mysql:mysql-connector-java:8.0.11')

# Set Kafka config

kafka_broker = "b-2.log.02msna.c8.kafka.us-west-2.amazonaws.com:9092," \
               "b-1.log.02msna.c8.kafka.us-west-2.amazonaws.com:9092"
kafka_topic_input = "server-logs"

# MySQL Connection Parameters
mysql_host = 'stream-database.cp2rkjojtqyn.us-west-2.rds.amazonaws.com'
mysql_port = '3306'


def get_defined_values():
    # Create a ServerLogGenerator instance to get predefined values
    s = ServerLogGenerator()

    countries = s._location_country  # Pre-defined countries
    event_types = s._event_type      # Pre-defined events
    devices = ["ANDROID", "IOS"]    # Pre-defined devices
    return [countries, event_types, devices]

Example #9
def main(base_path):

  APP_NAME = "make_predictions_streaming.py"

  # Process data every 10 seconds
  PERIOD = 10
  BROKERS = 'localhost:9092'
  PREDICTION_TOPIC = 'flight_delay_classification_request'
  
  try:
    sc and ssc
  except NameError as e:
    import findspark

    # Add the streaming package and initialize
    findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
    findspark.init()
    
    import pyspark
    import pyspark.sql
    import pyspark.streaming
  
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf)
    ssc = StreamingContext(sc, PERIOD)
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load all models to be used in making predictions
  #
  
  # Load the arrival delay bucketizer
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string field vectorizer pipelines into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model

  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)

  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
    base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Process Prediction Requests in Streaming
  #
  stream = KafkaUtils.createDirectStream(
    ssc,
    [PREDICTION_TOPIC],
    {
      "metadata.broker.list": BROKERS,
      "group.id": "0",
    }
  )

  object_stream = stream.map(lambda x: json.loads(x[1]))
  object_stream.pprint()
  
  row_stream = object_stream.map(
    lambda x: Row(
      FlightDate=iso8601.parse_date(x['FlightDate']),
      Origin=x['Origin'],
      Distance=x['Distance'],
      DayOfMonth=x['DayOfMonth'],
      DayOfYear=x['DayOfYear'],
      UUID=x['UUID'],
      DepDelay=x['DepDelay'],
      DayOfWeek=x['DayOfWeek'],
      FlightNum=x['FlightNum'],
      Dest=x['Dest'],
      Timestamp=iso8601.parse_date(x['Timestamp']),
      Carrier=x['Carrier']
    )
  )
  row_stream.pprint()

  #
  # Create a dataframe from the RDD-based object stream
  #

  def classify_prediction_requests(rdd):
  
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
  
    prediction_request_schema = StructType([
      StructField("Carrier", StringType(), True),
      StructField("DayOfMonth", IntegerType(), True),
      StructField("DayOfWeek", IntegerType(), True),
      StructField("DayOfYear", IntegerType(), True),
      StructField("DepDelay", DoubleType(), True),
      StructField("Dest", StringType(), True),
      StructField("Distance", DoubleType(), True),
      StructField("FlightDate", DateType(), True),
      StructField("FlightNum", StringType(), True),
      StructField("Origin", StringType(), True),
      StructField("Timestamp", TimestampType(), True),
      StructField("UUID", StringType(), True),
    ])
    
    prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema)
    prediction_requests_df.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests_df.withColumn(
      'Route',
      concat(
        prediction_requests_df.Origin,
        lit('-'),
        prediction_requests_df.Dest
      )
    )
    prediction_requests_with_route.show(6)
  
    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
      string_indexer_model = string_indexer_models[column]
      prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
  
    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
    
    # Inspect the vectors
    final_vectorized_features.show()
  
    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
      final_vectorized_features = final_vectorized_features.drop(column)
  
    # Inspect the finalized features
    final_vectorized_features.show()
  
    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)
  
    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
    # Inspect the output
    final_predictions.show()
  
    # Store to Mongo
    if final_predictions.count() > 0:
      final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
        "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
      )
  
  # Do the classification and store to Mongo
  row_stream.foreachRDD(classify_prediction_requests)
  
  ssc.start()
  ssc.awaitTermination()
Example #10
SPARK_HOME = '/Users/luodongshen/Documents/soft/spark-3.0.0-bin-hadoop3.2'
import findspark
# Spark 3.0.0 ships with Scala 2.12, so the matching connector build is required
findspark.add_packages('org.mongodb.spark:mongo-spark-connector_2.12:3.0.0')
findspark.init(SPARK_HOME)

from pyspark.sql import SparkSession

logFile = "/Users/luodongshen/Documents/stock_logs/stock_info.log"  # Should be some file on your system
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
logData = spark.read.text(logFile).cache()

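# Count the lines containing the letters 'a' and 'b'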
numAs = logData.filter(logData.value.contains('a')).count()
numBs = logData.filter(logData.value.contains('b')).count()

print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

spark.stop()
Example #11
import os

os.environ["PYSPARK_SUBMIT_ARGS"] = ""

import findspark

findspark.init()
# to install:
# $SPARK_HOME/bin/pyspark --packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.0
# to load the mongoconnector package:
findspark.add_packages(["org.mongodb.spark:mongo-spark-connector_2.11:2.2.0"])

from pyspark.sql import SparkSession

FORMAT = "com.mongodb.spark.sql.DefaultSource"
URI = "mongodb://127.0.0.1:27017/{db}.{col}"


def get_session(database, collection):
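    # Build a SparkSession configured to read from and write to the given MongoDB collection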
    uri = URI.format(db=database, col=collection)
    return SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", uri) \
    .config("spark.mongodb.output.uri", uri) \
    .getOrCreate()
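
# A minimal usage sketch; "mydb" and "users" are placeholder names rather than
# values from the original script. The returned session reads the collection
# through the connector's DefaultSource format configured above.
if __name__ == "__main__":
    spark = get_session("mydb", "users")
    df = spark.read.format(FORMAT).load()
    df.show(5)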
Example #12
import json
import ast
import os
from scipy.spatial.distance import euclidean
# from kafka import KafkaConsumer
#
# consumer1 = KafkaConsumer('test_5', bootstrap_servers=['172.17.0.1:9092'])
#
# for message in consumer1:
#     print message

import findspark
findspark.init('/home/oliver/Documents/spark-2.0.0-bin-hadoop2.7')
findspark.add_packages(['org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.0.0-preview'])
from pyspark import SparkContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext

import pandas as pd

offsetRanges = []

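# Reference list of known Wi-Fi access points: SSID, MAC address and coordinates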
wifi_hotspots = [{
    'name': '2WIRE413',
    'mac_add': '28:16:2e:a4:c4:41',
    'lat_lng': (333, 333)
},
{
    'name': 'ATTUuVi3A2',
    'mac_add': '78:96:84:6e:6f:a0',
    'lat_lng': (333, 666)
Example #13
import findspark
import json
import redis
import os
from pyspark.sql import SparkSession
"""Script takes data from Redis and loads it into Spark, where JDBC is used to move the data
   to a MySQL instance hosted on EC2"""

findspark.add_packages(
    "mysql:mysql-connector-java:8.0.11")  #Adding MySQL driver to Spark

#retrieving data from Redis ---

r = redis.Redis(host="redis", port=6379)

data = json.loads(r.get("wine_data").decode("utf8"))

#reading data into Spark ---

spark = SparkSession.builder.appName("pipeline").getOrCreate()

with open("/usr/local/airflow/dags/ETL/schema.txt", "r") as f:
    SCHEMA = f.read()

df = spark.createDataFrame(data, schema=SCHEMA)

#Writing data to MySQL ---

user = os.environ.get("MYSQL_USER")
password = os.environ.get("MYSQL_PASSWORD")
db = os.environ.get("MYSQL_DB")
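
# A hedged sketch of the JDBC write described in the module docstring above; the
# host "mysql-host" and table name "wine_data" are placeholders, not values from
# the original script.
jdbc_url = "jdbc:mysql://mysql-host:3306/{}".format(db)
df.write.format("jdbc").options(
    url=jdbc_url,
    driver="com.mysql.jdbc.Driver",
    dbtable="wine_data",
    user=user,
    password=password).mode("overwrite").save()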
Example #14
    args = arg_parse()

    if args.mode == "local":
        import findspark
        # os.environ["JAVA_HOME"] = r"/usr/lib/jvm/java-1.8.0-openjdk-amd64"
        # os.environ["SPARK_HOME"] = r"/mnt/c/projects/spark2.4.5"
        # os.environ['PYSPARK_SUBMIT_ARGS'] = ""
        # findspark.init(r"/mnt/c/projects/spark2.4.5")
        # findspark.add_packages(["org.apache.spark:spark-streaming-kinesis-asl_2.11:2.4.5"])
        import findspark
        os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk1.8.0_241"
        os.environ["SPARK_HOME"] = r"C:\spark-2.4.5-bin-hadoop2.7"
        os.environ['PYSPARK_SUBMIT_ARGS'] = ""
        findspark.init(r"C:\spark-2.4.5-bin-hadoop2.7")
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kinesis-asl_2.11:2.4.5"])

    from pyspark.streaming import StreamingContext
    from pyspark.sql import SparkSession, DataFrame
    from pyspark import RDD
    from pyspark import SparkConf, SparkContext
    from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream
    from pyspark.sql.context import SQLContext
    from pyspark.sql.types import StructType

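    # Build the Spark schema from the JSON definition stored in the application config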
    schema = StructType.fromJson(app_config["all_data_scheme"])

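    # Open a database connection to the Redshift cluster using the command-line arguments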
    connect = create_db_connection(args.redshift_host, args.redshift_port,
                                   args.redshift_user, args.redshift_password,
                                   args.redshift_db_name)