Example #1
import os

from pyspark.sql import SparkSession, SQLContext


class SparkConnector:
    def __init__(self):
        # findspark.init()
        spark = SparkSession.builder \
            .master("local[*]") \
            .appName('SCA') \
            .config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.11:2.3.0') \
            .config('spark.cassandra.connection.host', os.environ['DB_ADDR']) \
            .config('spark.cassandra.auth.username', os.environ['DB_USER']) \
            .config('spark.cassandra.auth.password', os.environ['DB_PASS']) \
            .config('spark.executor.memory', '15g') \
            .config('spark.driver.memory', '6g') \
            .getOrCreate()
        self.sqlContext = SQLContext(spark.sparkContext)
        self.sqlContext.setConf('spark.sql.shuffle.partitions', '10')

    def submit_sql(self, query):
        return self.sqlContext.sql(query).collect()

    def load_and_get_table_df(self, keys_space_name, table_name):
        table_df = self.sqlContext.read \
         .format("org.apache.spark.sql.cassandra") \
         .options(table=table_name, keyspace=keys_space_name) \
         .load()
        return table_df
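
A minimal usage sketch of the connector above, assuming the DB_ADDR/DB_USER/DB_PASS environment variables are set; the keyspace and table names below are placeholders:

# Hypothetical keyspace/table names; register the Cassandra table and query it with SQL.
connector = SparkConnector()
users_df = connector.load_and_get_table_df('my_keyspace', 'users')
users_df.createOrReplaceTempView('users')
rows = connector.submit_sql('SELECT COUNT(*) AS n FROM users')
print(rows)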
Example #2
from pyspark import SparkContext, SQLContext


def load_parquet(database, table, quiet):
    sc = SparkContext()
    if quiet:
        sc = quiet_log(sc)

    sqlContext = SQLContext(sc)
    sqlContext.setConf('spark.sql.parquet.binaryAsString', 'True')
    print(database, table)
    return sqlContext.sql(
        'SELECT * FROM parquet.`/user/hive/warehouse/{:s}.db/{:s}`'.format(
            database, table)), sc, sqlContext
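
A hedged usage sketch of load_parquet; the database and table names are placeholders, and quiet_log is assumed to be defined elsewhere in the original module:

# Hypothetical database/table names; load the Parquet-backed table and inspect it.
df, sc, sqlContext = load_parquet('analytics', 'events', quiet=False)
df.printSchema()
print(df.count())
sc.stop()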
Example #3
from pyspark import SparkConf, SparkContext, SQLContext


def main(months, output_path):
    conf = SparkConf().setMaster("local[*]").setAppName("AptoideALS")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")
    sqlContext = SQLContext(sc)
    sqlContext.setConf('spark.sql.parquet.compression.codec', 'snappy')

    df = get_files_from_s3(sqlContext, months)
    df_country = get_app_country(df)

    def toCSVLine_2(data):
        app_id = data[0]
        count = data[1]
        quo = data[2]
        return "{},{},{}".format(app_id, count, quo)

    df_country.rdd.map(toCSVLine_2).repartition(1).saveAsTextFile(output_path + "/country_info")
Example #4
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
from datetime import tzinfo, datetime
import pytz
import re
from pyspark import SparkContext, SQLContext

sc = SparkContext()

sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

wk = sc.textFile('hdfs:///user/piccardi/all_sections_merged.txt')

counts_rdd = wk.flatMap(lambda x: x.split("\t")).map(
    lambda x: (x, 1)).reduceByKey(lambda a, b: a + b).map(
        lambda x: Row(title=x[0], count=x[1]))

counts = sqlContext.createDataFrame(counts_rdd)

sorted_counts = counts.sort(desc("count"))
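
The snappy codec set above only takes effect when the result is written as Parquet; a minimal follow-up sketch with a hypothetical output path:

# Hypothetical output path; writes the sorted counts as snappy-compressed Parquet.
sorted_counts.write.mode('overwrite').parquet('hdfs:///user/piccardi/section_title_counts.parquet')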
Example #5
from pyspark import SQLContext
from pyspark.sql.functions import lit


def prepare_data(sc, months, output_path):

    sqlContext = SQLContext(sc)
    sqlContext.setConf('spark.sql.parquet.compression.codec', 'snappy')

    blacklist = []
    blacklist_top50 = ['({})|'.format(x) for x in get_top50()]
    blacklist_filters = ['(.+\.{}.*)|'.format(x) for x in get_blackList()]
    blacklist.extend(blacklist_top50)
    blacklist.extend(blacklist_filters)
    blacklist = list(set(blacklist))
    rx = ''.join(blacklist)
    rx = rx[:-1]

    # gets all user installs from the selected number of previous months excluding the current month
    df = get_files_from_s3(sqlContext, months)

    # select only the hash and explode the list of packages
    df_pkg = df.select(df['hash'].alias('hash'),
                       df['pkg'].alias('package')).drop_duplicates().cache()

    # remove incoherent packages like "android"
    rpkg = '.+\..+'
    df_pkg = df_pkg.filter(df_pkg['package'].rlike(rpkg)).cache()

    # filter blacklist packages and top 50
    df_pkg_nosystemapps = df_pkg.filter(~df_pkg['package'].rlike(rx)).cache()

    # connects to database and filter packages with less than 500 downloads
    df_pkg_nosystemapps = filter_less_500_downloads(
        sqlContext, df_pkg_nosystemapps).cache()

    def toCSVLine(data):
        name = data[0]
        id = data[1]
        return "{},{}".format(name, id)

    # mapping of hashes to the IDs used for recommendations
    rdd_hashs = df_pkg_nosystemapps.select(
        df_pkg_nosystemapps['hash']).distinct().rdd.zipWithUniqueId().map(
            lambda x: (x[0][0], x[1] + 1)).cache()
    df_hashs = sqlContext.createDataFrame(rdd_hashs, ['hash', 'user_id'])
    rdd_hashs = rdd_hashs.map(toCSVLine)
    rdd_hashs.repartition(1).saveAsTextFile(output_path + "/hashs")
    rdd_hashs.unpersist()
    print("user hashs saved")

    # mapping of packages to the app IDs used for recommendations
    rdd_packages = df_pkg_nosystemapps.select(
        df_pkg_nosystemapps['package']).distinct().rdd.zipWithUniqueId().map(
            lambda x: (x[0][0], x[1] + 1)).cache()
    df_packages = sqlContext.createDataFrame(rdd_packages,
                                             ['package', 'app_id'])
    rdd_packages = rdd_packages.map(toCSVLine)
    rdd_packages.repartition(1).saveAsTextFile(output_path + "/apps")
    print("apps ID's saved")

    def toCSVLine_2(data):
        app_id = data[0]
        count = data[1]
        quo = data[2]
        return "{},{},{}".format(app_id, count, quo)

    # final dataframe to be sent to recommend engine
    df_data = df_pkg_nosystemapps.join(df_hashs, 'hash', 'left_outer').select(
        'user_id', 'package').cache()
    df_data = df_data.join(df_packages, 'package',
                           'left_outer').select('user_id', 'app_id').cache()
    df_data = df_data.withColumn("rating", lit(1)).cache()
    df_data.rdd.map(toCSVLine_2).repartition(1).saveAsTextFile(output_path +
                                                               "/dataset")
    print("dataset saved")

    # save apps histogram
    df_hist = get_app_histogram(df_data, df_packages)
    df_hist.rdd.map(toCSVLine_2).repartition(1).saveAsTextFile(output_path +
                                                               "/histogram")
    print("apps histogram saved")

    return df_data.rdd
Example #6
        for j in range(i+1,len(doc_weight)):
            if(len(doc_weight) == 1):
                break
            else:
                weight_i = doc_weight[i][1]
                weight_j = doc_weight[j][1]
                sim = ((doc_weight[i][0],doc_weight[j][0]),weight_i*weight_j)
                sim_matrix.append(sim)
    return sim_matrix

conf = SparkConf().setAppName("part b3")
sc = SparkContext(conf = conf)

sqlcontext = SQLContext(sc)
# Set the Avro compression codec to uncompressed (the default is snappy)
sqlcontext.setConf("spark.sql.avro.compression.codec","uncompressed")
# Load the Avro file as a DataFrame
df = sqlcontext.read.format("com.databricks.spark.avro").load(partfile)
# Convert the DataFrame to an RDD
RDD_InvertedIndex = df.rdd

SimilarData = RDD_InvertedIndex.map(similarity_Matrix).flatMap(lambda x : x)
SimilarData_reduced = SimilarData.reduceByKey(lambda x,y : x + y)
SimilarData_sorted = SimilarData_reduced.sortBy(lambda sim : sim[1],ascending = False)

fileData_DF = SimilarData_sorted.toDF()

#saving the file as an Avro file
fileData_DF.write.format("com.databricks.spark.avro").save(outputfile)
#SimilarData_sorted.saveAsTextFile(outputfile)
Example #7
from pyspark import SparkContext, SparkConf, SQLContext
from operator import add
from nltk.corpus import stopwords
import ntpath
import sys
import re
conf = SparkConf().setAppName("part d2")
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)

sqlcontext.setConf("spark.sql.avro.compression.codec", "snappy")

#Arguments from user
inputfile = sys.argv[1]
outputfile = sys.argv[2]

RDD_popwords = sc.wholeTextFiles(
    "/bigd29/output_hw2/medium/p1/1", use_unicode=False).map(lambda (
        file, popularWords): popularWords.split("\n")).flatMap(lambda x: x)
List_popwords = RDD_popwords.collect()

#Reading the file
fileData = sc.wholeTextFiles(inputfile, use_unicode=False)

stopwordsList = set(stopwords.words('english'))
fileData_base = fileData.map(lambda (filename, content):
                             (ntpath.basename(filename), content))

# Removing special characters
#Pre-process the words and convert to lower case
fileData_preprocess = fileData_base.map(lambda (file, content): (
Example #8
import pyspark.sql.functions as sqlf
from pyspark.sql.functions import isnan, when, count, col, udf
import pandas as pd
import numpy as np
import re
from datetime import datetime
import xml.etree.ElementTree as ET
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import *
from itertools import chain

sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession.builder.getOrCreate()

sqlContext.setConf("spark.sql.shuffle.partitions", "600")
sqlContext.setConf("spark.default.parallelism", "600")

!apt-get install p7zip-full
!p7zip -d Posts.7z

from google.colab import drive
drive.mount('/content/gdrive')

# Commented out IPython magic to ensure Python compatibility.
# %cd '/content/gdrive/My Drive/Colab Notebooks'

custom_schema = StructType([StructField('ID', IntegerType(), True),
                     StructField('PostTypeID', IntegerType(), True),
                     StructField('CreationDate', DateType(), True),
                     StructField('Title', StringType(), True),
Example #9
import os
import sys

from pyspark import SQLContext
from pyspark import SparkContext

#os.environ["SPARK_HOME"] = "/opt/spark-1.6.1-bin-hadoop2.6"
#os.environ["HADOOP_HOME"] = "/opt/hadoop"
#os.environ["HADOOP_PREFIX"] = "/opt/hadoop"

#os.environ["HIVE_HOME"] = "/opt/hive"


sc = SparkContext('local[1]')
sql_context = SQLContext(sc)
sql_context.setConf( "spark.sql.shuffle.partitions", "1")
sql_context.sql(""" use fex_test """)
Example #10
from pyspark.sql import Row
from pyspark import SparkContext, SQLContext

INPUT_DATA = 'hdfs:///user/harshdee/enwiki-latest-pages-articles-multistream.xml.bz2'
OUTPUT_DATA = 'hdfs:///user/harshdee/citations_content.parquet'

sc = SparkContext()
sqlContext = SQLContext(sc)
sqlContext.setConf('spark.sql.parquet.compression.codec', 'snappy')

wiki = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(INPUT_DATA)
pages = wiki.where('ns = 0').where('redirect is null')

# Keep only the ID, title, and revision text fields, which are the ones we are interested in
pages = pages['id', 'title', 'revision.text']
pages = pages.toDF('id', 'page_title', 'content')

## citations_with_words = sqlContext.createDataFrame(pages.map(get_as_row))
pages.write.mode('overwrite').parquet(OUTPUT_DATA)
## citations_with_words.write.format('com.databricks.spark.csv').save('citations_words.csv')
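
A hedged sanity check, reading the Parquet output back after the write:

# Verify the write by reading the snappy-compressed Parquet back.
pages_check = sqlContext.read.parquet(OUTPUT_DATA)
print(pages_check.count())
pages_check.printSchema()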
Example #11
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys
from pyspark import SQLContext
from pyspark import SparkContext
import trade.mat_trade as trade
import ta.mat_close as mat
local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
os.environ["SPARK_HOME"] = "C:\spark-1.6.1-bin-hadoop2.6"
sc = SparkContext('local[1]')
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "1")

def trade_test1():
    dmat1 = [
        {
            "symbol": "AAA",
            "date": "2016-01-01",
            "close": 1.0,
            "close_mat": 1.0
        },
        {
            "symbol": "AAA",
            "date": "2016-01-06",
            "close": 1.2,
            "close_mat": 1.2
        },
Example #12
        for j in range(i + 1, len(doc_weight)):
            if (len(doc_weight) == 1):
                break
            else:
                weight_i = doc_weight[i][1]
                weight_j = doc_weight[j][1]
                sim = ((doc_weight[i][0], doc_weight[j][0]),
                       weight_i * weight_j)
                sim_matrix.append(sim)
    return sim_matrix


conf = SparkConf().setAppName("part c3")
sc = SparkContext(conf=conf)

sqlcontext = SQLContext(sc)
sqlcontext.setConf("spark.sql.parquet.compression.codec", "uncompressed")

df = sqlcontext.read.parquet(partfile)
RDD_InvertedIndex = df.rdd

SimilarData = RDD_InvertedIndex.map(similarity_Matrix).flatMap(lambda x: x)
SimilarData_reduced = SimilarData.reduceByKey(lambda x, y: x + y)
SimilarData_sorted = SimilarData_reduced.sortBy(lambda sim: sim[1],
                                                ascending=False)

fileData_DF = SimilarData_sorted.toDF()

fileData_DF.write.parquet(outputfile)
#SimilarData_sorted.saveAsTextFile(outputfile)
Example #13
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

from pyspark import SQLContext
from pyspark import SparkContext

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")

os.environ["SPARK_HOME"] = "C:\spark-1.6.1-bin-hadoop2.6"
sc = SparkContext('local[1]')
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "1")


def create_test():
    sqlContext.sql(
        "CREATE TABLE sample_07 (code string,description string,total_emp int,salary int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TextFile"
    )
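
A hedged follow-up sketch, assuming a Hive-enabled session and a hypothetical tab-separated local file, that loads data into the table created above and queries it:

def load_test():
    # Hypothetical local path; the file must exist where the driver runs.
    sqlContext.sql(
        "LOAD DATA LOCAL INPATH 'data/sample_07.tsv' OVERWRITE INTO TABLE sample_07")
    sqlContext.sql(
        "SELECT code, salary FROM sample_07 ORDER BY salary DESC LIMIT 10").show()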
Example #14
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from graphframes import *
from itertools import chain
import math
import os
import time

start_time = time.time()

#initialize SparkContext, SparkSession and SQLContext
sc = SparkContext()
spark = SparkSession(sc)
sqlContext = SQLContext(sc)
sqlContext.setConf('spark.sql.shuffle.partitions', '50')

inv = 1
iter_count = 1
delimiter = "\t"
normChoice = 1

#read file
edges = spark.read.format("csv") \
    .option("delimiter", "\t") \
    .load(os.path.dirname(os.path.abspath(__file__)) + "/network2.txt") \
    .withColumnRenamed("_c0", "src") \
    .withColumnRenamed("_c1", "dst")

# edges.createOrReplaceTempView("retail_data")
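
A hedged continuation sketch: derive a vertex DataFrame from the edge endpoints and build the GraphFrame that the graphframes import suggests; the variable names are assumptions:

# Hypothetical continuation: vertices are the distinct edge endpoints.
vertices = edges.select(col("src").alias("id")) \
    .union(edges.select(col("dst").alias("id"))).distinct()
g = GraphFrame(vertices, edges)
print(g.vertices.count(), g.edges.count())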
Example #15
import time
import urllib

from pyspark import SparkConf, SparkContext, SQLContext
import pyspark.sql.functions as F

#Declare Variables
NumOfPartitions = 10
Directory = "/usr/sreecharan/sampleData/ETL/"

#Download files from S3
#urllib.urlretrieve ("https://s3.amazonaws.com/tmp1.sl.com/20170701_20170701165514569.gz",Directory+"/20170701_20170701165514569.gz")
#urllib.urlretrieve ("https://s3.amazonaws.com/tmp1.sl.com/20170701_20170702004210139.gz",Directory+"/20170701_20170702004210139.gz")
#files_path = [os.path.abspath(x) for x in os.listdir(Directory)]

#Create SparkContext, SQLContext and configure
conf = SparkConf().setAppName("ETL Using PySpark").setMaster("local")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", str(NumOfPartitions))

#Read the files
data = sc.textFile("file:///" + Directory, NumOfPartitions)
structuredData = data.map(lambda r: r.split(",")).map(
    lambda r: (r[1], r[2],
               round(float(r[6]), 3), round(float(r[7]), 3),
               time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(float(r[5])))))
#structuredData.take(10)

columnarData = sqlContext.createDataFrame(
    structuredData, ['ad_id', 'id_type', 'lat', 'long', 'timestamp'])
distinctRecords = columnarData.groupBy("ad_id", "id_type", "lat",
                                       "long").agg(F.max("timestamp"))
distinctRecords.cache()
distinctRecords.toJSON().saveAsTextFile(