Example No. 1
# Imports assumed by this snippet; CassandraSparkContext (and the saveToCassandra
# helper used below) are provided by the pyspark_cassandra package.
import datetime as dt
from uuid import uuid4

from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext


def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that each row returned by Cassandra is
    # dict-like: partition keys, clustering keys and regular columns are all
    # accessible by name, and CQL collections (lists, sets and maps) are
    # converted to the corresponding Python types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = ({
        "customer_id": "example.com",
        "url": "http://example.com/article1/",
        "hour": dt.datetime(2014, 1, 2, 1),
        "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
        "pixel_id": str(uuid4()),
        "data": {
            "visitor_id": "xyz"
        }
    }, )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
Example No. 2
def get_spark():
    conf = SparkConf()

    # Load in a jar that provides extended string comparison functions such as Jaro Winkler.
    # Splink

    # No longer needed in spark 3.0?
    # conf.set("spark.driver.extraClassPath", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars.packages",
             "graphframes:graphframes:0.8.0-spark3.0-s_2.12")

    # WARNING:
    # These config options are appropriate only if you're running Spark locally!!!
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.sql.shuffle.partitions", "8")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types

    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction("Dmetaphone",
                                   "uk.gov.moj.dash.linkage.DoubleMetaphone",
                                   types.StringType())
    return spark
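
A brief usage sketch for the helper above: the Java UDFs registered in get_spark() can be called by name from Spark SQL. The DataFrame contents and column names (name_l, name_r) are made up for illustration.

spark = get_spark()
pairs = spark.createDataFrame(
    [("Smith", "Smyth"), ("Jones", "Johns")], ["name_l", "name_r"])
pairs.createOrReplaceTempView("pairs")
spark.sql(
    "select name_l, name_r, jaro_winkler_sim(name_l, name_r) as jw_sim "
    "from pairs").show()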
Example No. 3
def run_driver(keyspace, table):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that each row returned by Cassandra is
    # dict-like: partition keys, clustering keys and regular columns are all
    # accessible by name, and CQL collections (lists, sets and maps) are
    # converted to the corresponding Python types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"}
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
Example No. 4
        def create_spark_session(app_name="SparkApplication_{}".format(
            datetime.utcfromtimestamp(
                time.time()).strftime('%Y-%m-%d %H:%M:%S'))):
            def init_calc_props(spark_ctx_conf):
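                # Derive the parallelism settings only when both tuning keys are
                # present in the conf (a set-subset check over the conf's keys).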
                if {'parallelism-to-tasks-factor', 'total-executor-cores'
                    } <= set(dict(spark_ctx_conf.getAll())):
                    total_executor_cores = spark_utils.calc_max_cores(
                        int(spark_ctx_conf.get('total-executor-cores')))
                    parallelism_to_tasks_factor = int(
                        spark_ctx_conf.get('parallelism-to-tasks-factor'))
                    value = str(total_executor_cores *
                                parallelism_to_tasks_factor)
                    logging.info(
                        "total_executor_cores: {0}, parallelism_to_tasks_factor: {1},value: {2}"
                        .format(total_executor_cores,
                                parallelism_to_tasks_factor, value))
                    spark_ctx_conf.set('spark.default.parallelism', value)
                    spark_ctx_conf.set('spark.sql.shuffle.partitions', value)
                    logging.debug('Starting local spark with conf: {0}'.format(
                        "\n".join(str(v) for v in spark_ctx_conf.getAll())))

            from pyspark.sql import SparkSession
            from pyspark.context import SparkConf
            spark_conf = SparkConf()
            spark_conf.setAll(spark_conf_dict.items())
            init_calc_props(spark_conf)
            sc = spark_utils.generate_spark_context(spark_conf=spark_conf)
            spark = SparkSession(sc)

            spark.sparkContext.setLogLevel("WARN")

            return spark
Example No. 5
def initialize():
    global sc, spark, items, inputfile, buckets_user, buckets_business, partition, totalSize, t, mainThreshold
    print("Initializing...")
    t = time.time()
    candidateList = []
    frequentList = []
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    buckets_user = items.groupByKey().mapValues(list).filter(
        lambda x: len(x[1]) > mainThreshold).mapPartitionsWithIndex(
            removeDuplicateEntriesAfter)
    print("Without Duplicates DOne..")
    # withoutDuplicates = checkM.mapPartitionsWithIndex(
    #     removeDuplicateEntries).groupByKey().mapValues(list)

    if (case == 1):
        # buckets_user = withoutDuplicates.mapPartitionsWithIndex(
        #     createBuckets).groupByKey().mapValues(list).filter(lambda x: len(x[1]) > mainThreshold)

        callSonPhase1(buckets_user)
        print("Initializing Phase 2.....")
        finalFreq = buckets_user.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))

        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

        pass
    if (case == 2):
        buckets_business = withoutDuplicates.mapPartitionsWithIndex(
            createBuckets_case2).groupByKey().mapValues(list)
        callSonPhase1(buckets_business)
        print("Initializing Phase 2.....")
        finalFreq = buckets_business.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))

        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

        pass
Example No. 6
 def __init__(self,
              spark_home,
              spark_master="local",
              exec_memory="8g",
              app_name="SparkClient"):
     """
     Initialize sparkcontext, sqlcontext
     :param spark_master: target spark master
     :param exec_memory: size of memory per executor
     """
     self._spark_master = spark_master
     self._exec_memory = exec_memory
     self._app_name = app_name
     self._spark_home = spark_home
     # Path for spark source folder
     os.environ['SPARK_HOME'] = self._spark_home
     self._spark_url = spark_master
     if spark_master != "local":
         os.environ['SPARK_MASTER_IP'] = spark_master
         self._spark_url = "spark://" + self._spark_master + ":7077"
     # Append pyspark  to Python Path
     sys.path.append(self._spark_home)
     # define the spark configuration
     conf = (SparkConf().setMaster(
         self._spark_url).setAppName(self._app_name).set(
             "spark.executor.memory", self._exec_memory).set(
                 "spark.core.connection.ack.wait.timeout",
                 "600").set("spark.akka.frameSize", "512").set(
                     "spark.cassandra.output.batch.size.bytes", "131072"))
     # create the spark context, reusing the active context if one already exists
     self._spark_ctx = SparkContext._active_spark_context
     if self._spark_ctx is None:
         self._spark_ctx = SparkContext(conf=conf)
     # create spark-on-hive context
     self._sql = SQLContext(self._spark_ctx)
Example No. 7
 def setUp(self):
     class_name = self.__class__.__name__
     conf = SparkConf().set("spark.default.parallelism", 1)
     self.sc = SparkContext(appName=class_name, conf=conf)
     self.sc.setCheckpointDir("/tmp")
     # TODO: decrease duration to speed up tests
     self.ssc = StreamingContext(self.sc, self.duration)
Example No. 8
def get_spark():
    conf = SparkConf()

    # Load in a jar that provides extended string comparison functions such as Jaro Winkler.
    # Splink

    # No longer needed in spark 3.0?
    #conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar")
    #conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar")

    # spark.jars takes a comma-separated list; calling conf.set twice with the
    # same key would overwrite the first value, so both jars go in one call.
    conf.set(
        "spark.jars",
        "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar,"
        "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar"
    )
    # SET THESE PATHS TO MATCH YOUR SPARK INSTALLATION

    # WARNING:
    # These config options are appropriate only if you're running Spark locally!!!
    conf.set("spark.driver.memory", "1g")
    conf.set("spark.sql.shuffle.partitions", "4")

    #conf.set("spark.sql.files.maxPartitionBytes","536870912")
    #conf.set("spark.sql.files.maxPartitionBytes","250000000")
    #conf.set("spark.sql.files.maxPartitionBytes","134217728")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types
    '''
    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction(
        "Dmetaphone", "uk.gov.moj.dash.linkage.DoubleMetaphone", types.StringType()
    )
    '''

    return spark
Example No. 9
def main():
    conf = SparkConf().setAppName('Home run count').setMaster('local')
    sc = SparkContext(conf=conf)
    spark = SparkSession.builder.appName("pyspark sql").getOrCreate()

    batting_path = "hdfs://localhost:8020/user/baseball/Batting.csv"
    batting_path="hdfs://localhost:9000/data/Batting.csv"

    data = spark.read.csv(batting_path, inferSchema=True, header=True)
    df = data.filter(data.yearID == '2018').select('playerID', 'teamID')

    # filter players that play on two or more teams
    players_vertices = df.groupBy("playerID").count().filter("count > 1").select("playerID")

    edges = df.withColumnRenamed("playerID", "src")
    edges = edges.withColumnRenamed("teamID", "dst")
    edges=players_vertices.join(edges, players_vertices.playerID == edges.src, "inner").select("src","dst")

    players_vertices=players_vertices.withColumnRenamed("playerID","id")
    teams_vertices = edges.select("dst").distinct().withColumnRenamed("dst","id")
    vertices = players_vertices.union(teams_vertices)
    # add one column with auto increasing id
    vertices = vertices.withColumn('num', monotonically_increasing_id())

    graph=GraphFrame(vertices,edges)

    # motif 1
    motif = graph.find("(a)-[]->(b); (a)-[]->(c)").filter("c.num > b.num")
    calculate(motif)
    # motif 2
    motif = graph.find("(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c)").filter("c.num > b.num and d.num > a.num")
    calculate(motif)
    # motif 3
    motif = graph.find("(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c);(e)-[]->(b); (e)-[]->(c)").filter(
        "c.num > b.num and d.num > a.num and e.num > d.num").distinct()
    calculate(motif)
    # motif 4
    motif = graph.find(
        "(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c);(e)-[]->(b); (e)-[]->(c);(f)-[]->(b);(f)-[]->(c)").filter(
        "c.num > b.num and d.num > a.num and e.num > d.num and f.num > e.num").distinct()
    calculate(motif)

    output_path = "/user/Wang/graphframe"

    # format the output
    final_result=[]
    for key in result.keys():
        line=""
        key_split=key.split("_")
        for i in range(len(key_split)):
            line += " " + key_split[i]
        for team in result[key]:
            line += " " + team
        final_result.append(line)

    data = sc.parallelize(final_result)
    data.saveAsTextFile(output_path)
Example No. 10
 def setup():
     conf = SparkConf().set("spark.default.parallelism", 1)
     sc = SparkContext(conf=conf)
     ssc = StreamingContext(sc, 0.5)
     dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
     wc = dstream.updateStateByKey(updater)
     wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
     wc.checkpoint(.5)
     return ssc
Example No. 11
def initialize():
    global sc, spark, items, inputfile
    print("Initializing...")
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    jsonread = sc.textFile(inputfile)
    items = jsonread.map(json.loads)
Example No. 12
        def setup():
            conf = SparkConf().set("spark.default.parallelism", 1)
            sc = SparkContext(conf=conf)
            ssc = StreamingContext(sc, 0.5)

            # A function that cannot be serialized
            def process(time, rdd):
                sc.parallelize(range(1, 10))

            ssc.textFileStream(inputd).foreachRDD(process)
            return ssc
Example No. 13
def get_spark_context(workers="*", driver_memory=None, executor_memory=None):
    """
    This function sets up a local Spark context, configured for use with SQL Server and AWS S3.
    """

    # we need some libraries (jars) to connect to SQL Server and S3, so define this config
    jar_dir = r"C:\Jars"

    files = os.listdir(jar_dir)

    jars = [f for f in files if f.lower().endswith(".jar")]

    extra_class_path = ";".join([os.path.join(jar_dir, j) for j in jars])

    # setup spark context
    conf = SparkConf().setMaster(f"local[{workers}]") \
        .set("spark.driver.extraClassPath", extra_class_path) \
        .set("spark.executor.heartbeatInterval", "60s")

    if driver_memory:
        conf.set("spark.driver.memory", driver_memory)

    if executor_memory:
        conf.set("spark.executor.memory", executor_memory)

    spark_context = SparkContext(conf=conf)

    # we need to configure our s3 endpoint because our buckets are in London
    spark_context.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                    "true")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.endpoint",
                                                 "s3.eu-west-2.amazonaws.com")

    return spark_context
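
A hypothetical usage sketch for the helper above; the bucket and object names are placeholders, and AWS credentials are assumed to come from the environment or an instance profile.

sc = get_spark_context(workers=4, driver_memory="4g")
lines = sc.textFile("s3a://my-bucket/raw/input.csv")  # hypothetical S3 path
print(lines.count())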
Example No. 14
def initialize(target_partitions=None):
    """Returns SparkContext and SQLContext."""
    conf = SparkConf()
    extra_settings = {
        'spark.serializer': 'org.apache.spark.serializer.KryoSerializer',
        'spark.executor.extraJavaOptions': '-XX:+UseG1GC'
    }
    if target_partitions:
        extra_settings['spark.default.parallelism'] = target_partitions

    conf.setAll(extra_settings.items())
    environment = {'PYTHON_EGG_CACHE': '/tmp/python-eggs'}
    sc = SparkContext(conf=conf, environment=environment)

    sqlContext = SQLContext(sc)
    if target_partitions:
        sqlContext.setConf('spark.sql.shuffle.partitions', target_partitions)

    jvm_logger = sc._jvm.org.apache.log4j
    jvm_logger.LogManager.getLogger("org").setLevel(jvm_logger.Level.ERROR)
    jvm_logger.LogManager.getLogger("akka").setLevel(jvm_logger.Level.ERROR)
    return sc, sqlContext
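
A hypothetical usage sketch; the Parquet path and column name are placeholders.

sc, sqlContext = initialize(target_partitions="200")
events = sqlContext.read.parquet("hdfs:///data/events")  # hypothetical path
events.groupBy("event_type").count().show()  # hypothetical column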
Example No. 15
def spark_session_setup(request):
    """ fixture for creating a spark context
    Args:
        request: pytest.FixtureRequest object
    """
    conf = (
        SparkConf().setMaster("local[*]").setAppName("aida_insights_testing"))
    spark_context = SparkContext(conf=conf)
    sc = SparkSession(sparkContext=spark_context)
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sc
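
A hypothetical test that consumes the fixture above (assuming it is registered with @pytest.fixture in the full source); the rows and column names are made up.

def test_creates_dataframe(spark_session_setup):
    df = spark_session_setup.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    assert df.count() == 2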
Example No. 16
def initialize():
    global items, inputfile, sc, filterThreshold, t, totalEdges, cost_dict, strict_totalNodes, adjacency_listMain
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    # print(columnName)
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    # Getting user and their business count
    user_business = items.groupByKey().mapValues(set).collect()
    tuple_edge_list = []

    for i in range(0, len(user_business) - 1):
        for j in range(i + 1, len(user_business)):
            inter = user_business[i][1] & user_business[j][1]
            if len(inter) >= filterThreshold:
                tuple_edge_list.append(
                    (str(user_business[i][0]), str(user_business[j][0])))
                tuple_edge_list.append(
                    (str(user_business[j][0]), str(user_business[i][0])))

    totalEdges = float(len(tuple_edge_list) / 2)
    adjacency_list = sc.parallelize(tuple_edge_list).groupByKey().mapValues(
        list).collectAsMap()
    adjacency_listMain = copy.deepcopy(adjacency_list)
    totalNodes = list(adjacency_list.keys())

    # ------------------------Newly added line------------------------
    strict_totalNodes = copy.deepcopy(totalNodes)
    # print(len(totalNodes))

    # ----------------------Part 1---------------------
    bfs(totalNodes, adjacency_list)
    print("Writing Betweenness to File....")

    # Converting into sorted List Initial Betweenness
    list_val = list(cost_dict.items())

    list_val.sort(key=lambda x: (-x[1], x[0]))
    writeToFile(list_val)
    totalNodes = copy.deepcopy(strict_totalNodes)
    # print(len(totalNodes))
    # ----------------------Part 2----------------------
    print("Creating Partitions....")
    create_components(list_val, adjacency_listMain, totalNodes, totalEdges)
    # ---------------------EoC---------------------------

    print("Duration: " + str(time.time() - t))
Example No. 17
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
DATA_BUCKET_FOLDER = "/outbrain/orig/"
SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"

from pyspark.sql.types import *
import pyspark.sql.functions as F

from pyspark.context import SparkContext, SparkConf
from pyspark.sql.session import SparkSession

conf = SparkConf().setMaster('local[*]').set(
    'spark.executor.memory', '256g').set('spark.driver.memory',
                                         '126g').set("spark.local.dir",
                                                     SPARK_TEMP_FOLDER)

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

print('Loading data...')

truncate_day_from_timestamp_udf = F.udf(
    lambda ts: int(ts / 1000 / 60 / 60 / 24), IntegerType())

events_schema = StructType([
    StructField("display_id", IntegerType(), True),
    StructField("uuid_event", StringType(), True),
    StructField("document_id_event", IntegerType(), True),
    StructField("timestamp_event", IntegerType(), True),
Example No. 18
from __future__ import print_function

import sys
from operator import add

from pyspark.sql import SparkSession

from pyspark.context import SparkContext, SparkConf

if __name__ == "__main__":
    config = SparkConf().setAppName("wordCount").setMaster("local")
    sc = SparkContext(conf=config)
    lines = sc.textFile("./src/main/python/wordCount/hello.txt")
    words = lines.flatMap(lambda line: line.split(" "))
    wordCountMap = words.map(lambda word: (word, 1))
    # count = wordCountMap.reduceByKey(lambda preCount, count: preCount + count)
    # output = count.collect()
    # print(output)

    # use countByKey instead
    count = wordCountMap.countByKey()
    print(count)
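
    # For comparison, a sketch using the reduceByKey path commented out above;
    # it keeps the counts as a distributed RDD instead of a driver-side dict.
    count_rdd = wordCountMap.reduceByKey(add)
    print(count_rdd.take(10))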
Example No. 19
#
# This configuration works for Spark on a Hortonworks HDP 2.4 cluster (YARN client mode)
#
import os, sys
# set OS environment variable
os.environ["SPARK_HOME"] = '/usr/hdp/2.4.2.0-258/spark'
# add Spark library to Python
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf

import atexit
def stop_my_spark():
    sc.stop()
    del(sc)

# Register exit    
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once.
if not 'sc' in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark') ## you may want to change this
    conf.setMaster('yarn-client')
    sc = SparkContext(conf=conf)
    print "Launched Spark version %s with ID %s" % (sc.version, sc.applicationId)
    print "http://arc.insight.gsu.edu:8088/cluster/app/%s"% (sc.applicationId)
Example No. 20
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.sql.functions import *

if __name__ == "__main__":
    conf = SparkConf().setAppName("flight tripreports")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    flights_data = sqlContext.sql("select * from refined_airlines.flights")
    non_cancelled_flights = flights_data.filter(flights_data.cancelled == 0)

    non_cancelled_longer_flights = non_cancelled_flights.filter(
        non_cancelled_flights.distance >= 1000)

    grouped_data_count = non_cancelled_longer_flights.groupBy(
        "flight_number").count()

    grouped_distance_sum = non_cancelled_longer_flights.groupBy(
        "flight_number").sum("distance")

    df1 = grouped_data_count.alias('df1')
    df2 = grouped_distance_sum.alias('df2')

    trip_report = df1.join(
        df2, df1.flight_number == df2.flight_number,
        how='inner').select('df1.flight_number',
                            col('df1.count').alias("total_trips"),
                            col('df2.sum(distance)').alias("total_distance"))

    trip_report.createOrReplaceTempView("trip_report_temp")
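
    # A sketch of how the temporary view might be queried afterwards; the
    # column names follow the select above.
    top_trips = sqlContext.sql(
        "select flight_number, total_trips, total_distance "
        "from trip_report_temp order by total_trips desc limit 10")
    top_trips.show()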
Example No. 21
#
# This configuration works for Spark on macOS using homebrew
#
import os, sys
# set OS environment variable
os.environ["SPARK_HOME"] = '/usr/local/Cellar/apache-spark/2.2.0/libexec'
# add Spark library to Python
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf

import atexit
def stop_my_spark():
    sc.stop()
    del(sc)

# Register exit    
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once.
if not 'sc' in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark') ## you may want to change this
    conf.setMaster('local[2]')
    sc = SparkContext(conf=conf)
    print "Launched Spark version %s with ID %s" % (sc.version, sc.applicationId)

Example No. 22
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments

def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
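
# A rough sketch of the recalculation described in the comment block above,
# assuming the rows behave as mappings (as noted in Example No. 1); the field
# names are taken from the comment and are assumptions about the table schema.
def recompute_cnts(row):
    fields = ("ga_videoPlays", "sda_downloads", "fb_socialFacebookLikes",
              "fb_socialFacebookShares", "fb_socialFacebookComments",
              "tw_socialTwitterShares", "ga_socialGooglePlusShares",
              "gigya_socialComments")
    new_row = dict(row)
    new_row["cnts"] = sum(new_row.get(f) or 0 for f in fields)
    return new_row

updated = rdd.filter(lambda r: r["site"] == "giga").map(recompute_cnts)
# The result could then be written back with the package's save helper, as in
# Example No. 1.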
Example No. 23
'''
@Author: Matheus Barros
Date: 23/04/2021

'''

from pyspark.context import SparkContext, SparkConf
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession

#PARALLELIZING WITH 2 CORES
conf = SparkConf().setAppName("rdd basic").setMaster("local[2]")

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

#transformation combines values with the same key

regularRDD = sc.parallelize([("Messi", 23), ("Ronaldo", 34), ("Neymar", 22),
                             ("Messi", 24)])
pairRDD_reducebykey = regularRDD.reduceByKey(lambda x, y: x + y)
reducebykey = pairRDD_reducebykey.collect()

print(reducebykey)

#operation orders pair RDD by key

pairRDD_reducebykey_rev = pairRDD_reducebykey.map(lambda x: (x[1], x[0]))
pairRDD_reducebykey_rev = pairRDD_reducebykey_rev.sortByKey(
    ascending=True).collect()
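
# Printing the key-swapped pairs sorted by key from the step above.
print(pairRDD_reducebykey_rev)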
Example No. 24
#
# This configuration works for Spark on macOS using homebrew
#
import os, sys
# set OS environment variable
os.environ["SPARK_HOME"] = '/usr/local/Cellar/apache-spark/2.2.0/libexec'
# add Spark library to Python
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf

import atexit


def stop_my_spark():
    sc.stop()
    del (sc)


# Register exit
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once.
if not 'sc' in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark')  ## you may want to change this
    conf.setMaster('local[2]')
    sc = SparkContext(conf=conf)
    print "Launched Spark version %s with ID %s" % (sc.version,
                                                    sc.applicationId)
Example No. 25
# Retrieves names from tables


def get_table_names(db):

    temp = sqlContext.sql("show tables from " + db).collect()
    table_names = []

    for i in range(len(temp)):
        table_names.append(temp[i][1])
    return table_names


if __name__ == "__main__":

    conf = SparkConf().setAppName('SCD-Implementation')
    sc = SparkContext.getOrCreate(conf=conf)
    sqlContext = HiveContext(sc)

    temp_table_names = get_table_names("temp_adventureworks")
    target_table_names = get_table_names("adventureworks")

    #	If the temp table does not exist on target, it is created

    for i in temp_table_names:

        if i not in target_table_names:

            table = sqlContext.sql("select * from temp_adventureworks." +
                                   str(i))
            col_names = sqlContext.sql("describe temp_adventureworks." +
Example No. 26
    entry_point = gateway.entry_point
    imports = entry_point.getPy4JImports()
    for i in imports:
        java_import(gateway.jvm, i)

    context_config =\
        ConfigFactory.parse_string(entry_point.contextConfigAsHocon())
    job_id = entry_point.jobId()
    job_env = JobEnvironment(job_id, None, context_config)
    job_config = ConfigFactory.parse_string(entry_point.jobConfigAsHocon())
    job_class = import_class(entry_point.jobClass())
    job = job_class()

    jcontext = entry_point.context()
    jspark_conf = entry_point.sparkConf()
    spark_conf = SparkConf(_jconf=jspark_conf)
    context_class = jcontext.contextType()
    context = None
    if context_class == 'org.apache.spark.api.java.JavaSparkContext':
        context = SparkContext(gateway=gateway, jsc=jcontext, conf=spark_conf)
    elif context_class == 'org.apache.spark.sql.SQLContext':
        jsc = gateway.jvm.org.apache.spark.api.java.JavaSparkContext(
            jcontext.sparkContext())
        sc = SparkContext(gateway=gateway, jsc=jsc, conf=spark_conf)
        ss = SparkSession(sc, jcontext.sparkSession())
        context = SQLContext(sc, ss, jcontext)
    elif context_class == 'org.apache.spark.sql.hive.HiveContext':
        jsc = gateway.jvm.org.apache.spark.api.java.JavaSparkContext(
            jcontext.sparkContext())
        sc = SparkContext(gateway=gateway, jsc=jsc, conf=spark_conf)
        context = HiveContext(sc, jcontext)
Example No. 27
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import functions as f

from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from awsglue.transforms import *

from awsglue.context import GlueContext
from awsglue.job import Job
import sys

args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])

conf = SparkConf()

conf.set("spark.sql.parquet.compression.codec", "snappy")
conf.set("spark.sql.parquet.writeLegacyFormat", "true")

sc = SparkContext(conf=conf)

glueContext = GlueContext(sc)

spark = glueContext.spark_session

job = Job(glueContext)

job.init(args['JOB_NAME'], args)

input_file_path = "s3://xxxxx"
Example No. 28
    return html_pages_array


# java_path = "C:\Program Files\Java\jre1.8.0_191\bin\java.exe"
# os.environ['JAVAHOME'] = java_path

record_attribute = "WARC-Record-ID"
# Here we use a smaller test file due to computation time; use the full sample.warc.gz for real testing.
in_file = "C:/Users/klm85310/Documents/WDPS/sample.warc.gz"
stanford = 'C:/Users/klm85310/Documents/WDPS/stanford-ner-2017-06-09/stanford-ner-2017-06-09'

# Create Spark Context -- Remove this when running on cluster
# sc = SparkContext.getOrCreate()

conf = SparkConf().setAppName("Entity Recognition").setMaster("local[*]")
sc = SparkContext(
    conf=conf,
    serializer=PickleSerializer(),  # Default serializer
    # Unlimited batch size -> BatchedSerializer instead of AutoBatchedSerializer
    batchSize=64)

st = StanfordNERTagger(stanford +
                       '/classifiers/english.all.3class.distsim.crf.ser.gz',
                       stanford + '/stanford-ner.jar',
                       encoding='utf-8')

rdd_whole_warc_file = rdd = sc.newAPIHadoopFile(
    in_file,
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
Example No. 29
    values = [s for s in line.split(',')]
    if values[0] == '-1':  # Convert -1 labels to 0 for MLlib (fields are strings here)
        values[0] = 0

    #print values[0:-1]
    return LabeledPoint(input[values[-1]], values[0:-1])


if __name__ == '__main__':
    BASE_DATA_PATH = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../..', '../../../data',
                     'kaggle'))
    print(BASE_DATA_PATH)

    conf = (SparkConf().setMaster("local[2]").setAppName("Trial DF Set"))
    sc = SparkContext(conf=conf)

    # read data as CSV for Dataframe analysis
    # /Volumes/work/data/kaggle/ssi.csv

    # read data n0rmally
    '''

    sqlContext = SQLContext(sc)
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(BASE_DATA_PATH + '/ssi.csv')
    # summarize(df)
    print df.show()

    #points = df.map(lambda row: LabeledPoint(input[row.C4],[float(row.C0),float(row.C1),float(row.C2),float(row.C3)]))
Example No. 30
# /opt/spark/bin/pyspark --master local[1] --jars /opt/symetry/lib/sym-spark-assembly.jar --driver-java-options -Dsym.lic.loc=/opt/symetry/sym.lic
# execfile('/Users/mike/rtds/master/RTLM/ScalaProjects/sym-shell/src/com/sml/examples/python/amazonExample.py')

import os
import sys
import pyspark
from pyspark.context import SparkContext
from pyspark.context import SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark.storagelevel import StorageLevel

print("amazonExample.py start")

conf = SparkConf()
conf.setAppName('amazonExample')
sc = SparkContext(conf=conf)

gateway         = sc._gateway
sym             = gateway.jvm.com.sml.shell

# Find the access keys for EC2.
awsAccessKeyId = os.environ['AWS_ACCESS_KEY']
awsSecretAccessKey = os.environ['AWS_SECRET_KEY']
# print("awsAccessKeyId=" + awsAccessKeyId)
# print("awsSecretAccessKey=" + awsSecretAccessKey)

sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", awsAccessKeyId)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", awsSecretAccessKey)

myrdd  = sc.textFile('s3a://sml-oregon/datasets/susy/SUSYmini.csv')
Example No. 31
 def setUpClass(cls):
     class_name = cls.__name__
     conf = SparkConf().set("spark.default.parallelism", 1)
     cls.sc = SparkContext(appName=class_name, conf=conf)
     cls.sc.setCheckpointDir(tempfile.mkdtemp())
Example No. 32
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments

def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
Example No. 33
    def test_get_or_create_and_get_active_or_create(self):
        inputd = tempfile.mkdtemp()
        outputd = tempfile.mkdtemp() + "/"

        def updater(vs, s):
            return sum(vs, s or 0)

        def setup():
            conf = SparkConf().set("spark.default.parallelism", 1)
            sc = SparkContext(conf=conf)
            ssc = StreamingContext(sc, 2)
            dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
            wc = dstream.updateStateByKey(updater)
            wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
            wc.checkpoint(2)
            self.setupCalled = True
            return ssc

        # Verify that getOrCreate() calls setup() in absence of checkpoint files
        self.cpd = tempfile.mkdtemp("test_streaming_cps")
        self.setupCalled = False
        self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
        self.assertTrue(self.setupCalled)

        self.ssc.start()

        def check_output(n):
            while not os.listdir(outputd):
                if self.ssc.awaitTerminationOrTimeout(0.5):
                    raise Exception("ssc stopped")
            time.sleep(1)  # make sure mtime is larger than the previous one
            with open(os.path.join(inputd, str(n)), 'w') as f:
                f.writelines(["%d\n" % i for i in range(10)])

            while True:
                if self.ssc.awaitTerminationOrTimeout(0.5):
                    raise Exception("ssc stopped")
                p = os.path.join(outputd, max(os.listdir(outputd)))
                if '_SUCCESS' not in os.listdir(p):
                    # not finished
                    continue
                ordd = self.ssc.sparkContext.textFile(p).map(
                    lambda line: line.split(","))
                d = ordd.values().map(int).collect()
                if not d:
                    continue
                self.assertEqual(10, len(d))
                s = set(d)
                self.assertEqual(1, len(s))
                m = s.pop()
                if n > m:
                    continue
                self.assertEqual(n, m)
                break

        check_output(1)
        check_output(2)

        # Verify the getOrCreate() recovers from checkpoint files
        self.ssc.stop(True, True)
        time.sleep(1)
        self.setupCalled = False
        self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
        self.assertFalse(self.setupCalled)
        self.ssc.start()
        check_output(3)

        # Verify that getOrCreate() uses existing SparkContext
        self.ssc.stop(True, True)
        time.sleep(1)
        self.sc = SparkContext(conf=SparkConf())
        self.setupCalled = False
        self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
        self.assertFalse(self.setupCalled)
        self.assertTrue(self.ssc.sparkContext == self.sc)

        # Verify the getActiveOrCreate() recovers from checkpoint files
        self.ssc.stop(True, True)
        time.sleep(1)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
        self.assertFalse(self.setupCalled)
        self.ssc.start()
        check_output(4)

        # Verify that getActiveOrCreate() returns active context
        self.setupCalled = False
        self.assertEqual(StreamingContext.getActiveOrCreate(self.cpd, setup),
                         self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() uses existing SparkContext
        self.ssc.stop(True, True)
        time.sleep(1)
        self.sc = SparkContext(conf=SparkConf())
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
        self.assertFalse(self.setupCalled)
        self.assertTrue(self.ssc.sparkContext == self.sc)

        # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files
        self.ssc.stop(True, True)
        shutil.rmtree(self.cpd)  # delete checkpoint directory
        time.sleep(1)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
        self.assertTrue(self.setupCalled)

        # Stop everything
        self.ssc.stop(True, True)
Example No. 34
    return


#=======================================

if __name__ == "__main__":

    time1 = time.time()

    PASS = 20
    PASS2 = 20

    item_weights = {}

    conf = SparkConf().setAppName('inf553_hw3_2').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")

    train_file = sys.argv[1]
    val_file = sys.argv[2]
    case_mark = int(sys.argv[3])
    output_file = sys.argv[4]

    if case_mark == 1:
        model_based()
    elif case_mark == 2:
        user_based()
    elif case_mark == 3:
        item_based()
    else: