import os

from pyspark.sql import SparkSession, SQLContext


class SparkConnector:
    def __init__(self):
        # findspark.init()
        # Build a local SparkSession wired to Cassandra via the DataStax connector.
        spark = SparkSession.builder \
            .master("local[*]") \
            .appName('SCA') \
            .config('spark.jars.packages',
                    'com.datastax.spark:spark-cassandra-connector_2.11:2.3.0') \
            .config('spark.cassandra.connection.host', os.environ['DB_ADDR']) \
            .config('spark.cassandra.auth.username', os.environ['DB_USER']) \
            .config('spark.cassandra.auth.password', os.environ['DB_PASS']) \
            .config('spark.executor.memory', '15g') \
            .config('spark.driver.memory', '6g') \
            .getOrCreate()
        self.sqlContext = SQLContext(spark.sparkContext, spark)
        self.sqlContext.setConf('spark.sql.shuffle.partitions', '10')

    def submit_sql(self, query):
        return self.sqlContext.sql(query).collect()

    def load_and_get_table_df(self, keys_space_name, table_name):
        table_df = self.sqlContext.read \
            .format("org.apache.spark.sql.cassandra") \
            .options(table=table_name, keyspace=keys_space_name) \
            .load()
        return table_df
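# A minimal usage sketch for SparkConnector, assuming DB_ADDR, DB_USER and DB_PASS
# are set and that a Cassandra keyspace "my_keyspace" with a table "events" exists
# (both names are placeholders, not part of the original code).
connector = SparkConnector()
events_df = connector.load_and_get_table_df('my_keyspace', 'events')
events_df.createOrReplaceTempView('events')
print(connector.submit_sql('SELECT count(*) AS n FROM events'))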
from pyspark import SparkContext, SQLContext


def load_parquet(database, table, quiet):
    # quiet_log() is an external helper from the original project that lowers
    # the log level of the context.
    sc = SparkContext()
    if quiet:
        sc = quiet_log(sc)
    sqlContext = SQLContext(sc)
    sqlContext.setConf('spark.sql.parquet.binaryAsString', 'True')
    print(database, table)
    return sqlContext.sql(
        'Select * from parquet.`/user/hive/warehouse/{:s}.db/{:s}`'.format(
            database, table)), sc, sqlContext
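# A minimal usage sketch for load_parquet, assuming a Hive warehouse database
# "analytics" with a Parquet-backed table "events" under /user/hive/warehouse
# (both names are placeholders).
df, sc, sqlContext = load_parquet('analytics', 'events', quiet=False)
df.show(10)
sc.stop()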
from pyspark import SparkConf, SparkContext, SQLContext


def main(months, output_path):
    conf = SparkConf().setMaster("local[*]").setAppName("AptoideALS")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")
    sqlContext = SQLContext(sc)
    sqlContext.setConf('spark.sql.parquet.compression.codec', 'snappy')

    # get_files_from_s3 and get_app_country are helpers from the original project
    df = get_files_from_s3(sqlContext, months)
    df_country = get_app_country(df)

    def toCSVLine_2(data):
        app_id = data[0]
        count = data[1]
        quo = data[2]
        return "{},{},{}".format(app_id, count, quo)

    df_country.rdd.map(toCSVLine_2).repartition(1).saveAsTextFile(
        output_path + "/country_info")
import json
import re
from datetime import tzinfo, datetime

import pytz

from pyspark import SparkContext, SQLContext
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

sc = SparkContext()
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

# Count how many times each title occurs across all merged sections
wk = sc.textFile('hdfs:///user/piccardi/all_sections_merged.txt')
counts_rdd = wk.flatMap(lambda x: x.split("\t")).map(
    lambda x: (x, 1)).reduceByKey(lambda a, b: a + b).map(
        lambda x: Row(title=x[0], count=x[1]))
counts = sqlContext.createDataFrame(counts_rdd)
sorted_counts = counts.sort(desc("count"))
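# A minimal follow-up sketch: inspect the 20 most frequent titles (what the
# original script does with sorted_counts next is not shown).
sorted_counts.show(20, truncate=False)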
from pyspark import SQLContext
from pyspark.sql.functions import lit


def prepare_data(sc, months, output_path):
    sqlContext = SQLContext(sc)
    sqlContext.setConf('spark.sql.parquet.compression.codec', 'snappy')

    # build a single regex out of the top-50 packages and the blacklist filters
    blacklist = []
    blacklist_top50 = ['({})|'.format(x) for x in get_top50()]
    blacklist_filters = ['(.+\.{}.*)|'.format(x) for x in get_blackList()]
    blacklist.extend(blacklist_top50)
    blacklist.extend(blacklist_filters)
    blacklist = list(set(blacklist))
    rx = ''.join(blacklist)
    rx = rx[:-1]

    # get all user installs from the selected number of previous months,
    # excluding the current month
    df = get_files_from_s3(sqlContext, months)

    # select only the hash and explode the list of packages
    df_pkg = df.select(df['hash'].alias('hash'),
                       df['pkg'].alias('package')).drop_duplicates().cache()

    # remove incoherent packages like "android"
    rpkg = '.+\..+'
    df_pkg = df_pkg.filter(df_pkg['package'].rlike(rpkg)).cache()

    # filter out blacklisted packages and the top 50
    df_pkg_nosystemapps = df_pkg.filter(~df_pkg['package'].rlike(rx)).cache()

    # connect to the database and filter packages with fewer than 500 downloads
    df_pkg_nosystemapps = filter_less_500_downloads(
        sqlContext, df_pkg_nosystemapps).cache()

    def toCSVLine(data):
        name = data[0]
        id = data[1]
        return "{},{}".format(name, id)

    # mapping of user hashes to the IDs used for recommendations
    rdd_hashs = df_pkg_nosystemapps.select(
        df_pkg_nosystemapps['hash']).distinct().rdd.zipWithUniqueId().map(
            lambda x: (x[0][0], x[1] + 1)).cache()
    df_hashs = sqlContext.createDataFrame(rdd_hashs, ['hash', 'user_id'])
    rdd_hash_lines = rdd_hashs.map(toCSVLine)
    rdd_hash_lines.repartition(1).saveAsTextFile(output_path + "/hashs")
    rdd_hashs.unpersist()
    print("user hashes saved")

    # mapping of packages to the IDs used for recommendations
    rdd_packages = df_pkg_nosystemapps.select(
        df_pkg_nosystemapps['package']).distinct().rdd.zipWithUniqueId().map(
            lambda x: (x[0][0], x[1] + 1)).cache()
    df_packages = sqlContext.createDataFrame(rdd_packages,
                                             ['package', 'app_id'])
    rdd_packages.map(toCSVLine).repartition(1).saveAsTextFile(
        output_path + "/apps")
    print("app IDs saved")

    def toCSVLine_2(data):
        app_id = data[0]
        count = data[1]
        quo = data[2]
        return "{},{},{}".format(app_id, count, quo)

    # final dataframe to be sent to the recommendation engine
    df_data = df_pkg_nosystemapps.join(df_hashs, 'hash', 'left_outer').select(
        'user_id', 'package').cache()
    df_data = df_data.join(df_packages, 'package',
                           'left_outer').select('user_id', 'app_id').cache()
    df_data = df_data.withColumn("rating", lit(1)).cache()
    df_data.rdd.map(toCSVLine_2).repartition(1).saveAsTextFile(
        output_path + "/dataset")
    print("dataset saved")

    # save the apps histogram
    df_hist = get_app_histogram(df_data, df_packages)
    df_hist.rdd.map(toCSVLine_2).repartition(1).saveAsTextFile(
        output_path + "/histogram")
    print("apps histogram saved")

    return df_data.rdd
from pyspark import SparkConf, SparkContext, SQLContext


def similarity_Matrix(record):
    # Assumed wrapper for the partial snippet below: each record holds a list
    # of (document, weight) pairs for one term of the inverted index.
    doc_weight = record[1]
    sim_matrix = []
    for i in range(len(doc_weight)):
        for j in range(i + 1, len(doc_weight)):
            if len(doc_weight) == 1:
                break
            else:
                weight_i = doc_weight[i][1]
                weight_j = doc_weight[j][1]
                sim = ((doc_weight[i][0], doc_weight[j][0]),
                       weight_i * weight_j)
                sim_matrix.append(sim)
    return sim_matrix


conf = SparkConf().setAppName("part b3")
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)

# Setting the compression to uncompressed (the default is snappy)
sqlcontext.setConf("spark.sql.avro.compression.codec", "uncompressed")

# Loading the file as a data-frame; partfile is defined elsewhere in the script
df = sqlcontext.read.format("com.databricks.spark.avro").load(partfile)

# Converting the data-frame to an RDD
RDD_InvertedIndex = df.rdd
SimilarData = RDD_InvertedIndex.map(similarity_Matrix).flatMap(lambda x: x)
SimilarData_reduced = SimilarData.reduceByKey(lambda x, y: x + y)
SimilarData_sorted = SimilarData_reduced.sortBy(lambda sim: sim[1],
                                                ascending=False)
fileData_DF = SimilarData_sorted.toDF()

# Saving the result as an Avro file
fileData_DF.write.format("com.databricks.spark.avro").save(outputfile)
#SimilarData_sorted.saveAsTextFile(outputfile)
from pyspark import SparkContext, SparkConf, SQLContext
from operator import add
from nltk.corpus import stopwords
import ntpath
import sys
import re

conf = SparkConf().setAppName("part d2")
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)
sqlcontext.setConf("spark.sql.avro.compression.codec", "snappy")

# Arguments from the user
inputfile = sys.argv[1]
outputfile = sys.argv[2]

RDD_popwords = sc.wholeTextFiles(
    "/bigd29/output_hw2/medium/p1/1", use_unicode=False).map(lambda (
        file, popularWords): popularWords.split("\n")).flatMap(lambda x: x)
List_popwords = RDD_popwords.collect()

# Reading the file
fileData = sc.wholeTextFiles(inputfile, use_unicode=False)
stopwordsList = set(stopwords.words('english'))
fileData_base = fileData.map(lambda (filename, content):
                             (ntpath.basename(filename), content))

# Removing special characters
# Pre-process the words and convert to lower case
fileData_preprocess = fileData_base.map(lambda (file, content): (
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from itertools import chain

import pandas as pd
import numpy as np

import pyspark.sql.functions as sqlf
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import isnan, when, count, col, udf
from pyspark.sql.types import *

sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession.builder.getOrCreate()
sqlContext.setConf("spark.sql.shuffle.partitions", "600")
sqlContext.setConf("spark.default.parallelism", "600")

# Notebook shell commands: install 7zip and extract the Posts archive
!apt-get install p7zip-full
!p7zip -d Posts.7z

from google.colab import drive
drive.mount('/content/gdrive')

# Commented out IPython magic to ensure Python compatibility.
# %cd '/content/gdrive/My Drive/Colab Notebooks'

custom_schema = StructType([
    StructField('ID', IntegerType(), True),
    StructField('PostTypeID', IntegerType(), True),
    StructField('CreationDate', DateType(), True),
    StructField('Title', StringType(), True),
import os
import sys

from pyspark import SQLContext
from pyspark import SparkContext

#os.environ["SPARK_HOME"] = "/opt/spark-1.6.1-bin-hadoop2.6"
#os.environ["HADOOP_HOME"] = "/opt/hadoop"
#os.environ["HADOOP_PREFIX"] = "/opt/hadoop"
#os.environ["HIVE_HOME"] = "/opt/hive"

sc = SparkContext('local[1]')
sql_context = SQLContext(sc)
sql_context.setConf("spark.sql.shuffle.partitions", "1")
sql_context.sql(""" use fex_test """)
from pyspark import SparkContext, SQLContext
from pyspark.sql import Row

INPUT_DATA = 'hdfs:///user/harshdee/enwiki-latest-pages-articles-multistream.xml.bz2'
OUTPUT_DATA = 'hdfs:///user/harshdee/citations_content.parquet'

sc = SparkContext()
sqlContext = SQLContext(sc)
sqlContext.setConf('spark.sql.parquet.compression.codec', 'snappy')

# Parse the Wikipedia XML dump and keep only main-namespace, non-redirect pages
wiki = sqlContext.read.format('com.databricks.spark.xml').options(
    rowTag='page').load(INPUT_DATA)
pages = wiki.where('ns = 0').where('redirect is null')

# Keep only the ID, the title, and the revision text, which are what we are interested in
pages = pages['id', 'title', 'revision.text']
pages = pages.toDF('id', 'page_title', 'content')

## citations_with_words = sqlContext.createDataFrame(pages.map(get_as_row))
pages.write.mode('overwrite').parquet(OUTPUT_DATA)
## citations_with_words.write.format('com.databricks.spark.csv').save('citations_words.csv')
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

from pyspark import SQLContext
from pyspark import SparkContext

import trade.mat_trade as trade
import ta.mat_close as mat

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
os.environ["SPARK_HOME"] = r"C:\spark-1.6.1-bin-hadoop2.6"

sc = SparkContext('local[1]')
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "1")


def trade_test1():
    dmat1 = [
        {
            "symbol": "AAA",
            "date": "2016-01-01",
            "close": 1.0,
            "close_mat": 1.0
        },
        {
            "symbol": "AAA",
            "date": "2016-01-06",
            "close": 1.2,
            "close_mat": 1.2
        },
from pyspark import SparkConf, SparkContext, SQLContext


def similarity_Matrix(record):
    # Assumed wrapper for the partial snippet below: each record holds a list
    # of (document, weight) pairs for one term of the inverted index.
    doc_weight = record[1]
    sim_matrix = []
    for i in range(len(doc_weight)):
        for j in range(i + 1, len(doc_weight)):
            if len(doc_weight) == 1:
                break
            else:
                weight_i = doc_weight[i][1]
                weight_j = doc_weight[j][1]
                sim = ((doc_weight[i][0], doc_weight[j][0]),
                       weight_i * weight_j)
                sim_matrix.append(sim)
    return sim_matrix


conf = SparkConf().setAppName("part c3")
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)
sqlcontext.setConf("spark.sql.parquet.compression.codec", "uncompressed")

# Load the inverted index from Parquet; partfile is defined elsewhere in the script
df = sqlcontext.read.parquet(partfile)
RDD_InvertedIndex = df.rdd
SimilarData = RDD_InvertedIndex.map(similarity_Matrix).flatMap(lambda x: x)
SimilarData_reduced = SimilarData.reduceByKey(lambda x, y: x + y)
SimilarData_sorted = SimilarData_reduced.sortBy(lambda sim: sim[1],
                                                ascending=False)
fileData_DF = SimilarData_sorted.toDF()
fileData_DF.write.parquet(outputfile)
#SimilarData_sorted.saveAsTextFile(outputfile)
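# A minimal sketch of reading the similarity output back for inspection,
# assuming outputfile points at the Parquet directory written above; the
# column names _1 and _2 follow from toDF() being called without a schema.
result = sqlcontext.read.parquet(outputfile)
result.show(20, truncate=False)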
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

from pyspark import SQLContext
from pyspark import SparkContext

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
os.environ["SPARK_HOME"] = r"C:\spark-1.6.1-bin-hadoop2.6"

sc = SparkContext('local[1]')
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "1")


def create_test():
    sqlContext.sql(
        "CREATE TABLE sample_07 (code string,description string,total_emp int,salary int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TextFile"
    )
import math
import os
import time
from itertools import chain

from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from graphframes import *

start_time = time.time()

# Initialize SparkContext, SparkSession and SQLContext
sc = SparkContext()
spark = SparkSession(sc)
sqlContext = SQLContext(sc)
sqlContext.setConf('spark.sql.shuffle.partitions', '50')

inv = 1
iter_count = 1
delimiter = "\t"
normChoice = 1

# Read the edge list file
edges = spark.read.format("csv").option(
    "delimiter", "\t").load(os.path.dirname(os.path.abspath(__file__)) +
                            "/network2.txt").withColumnRenamed(
                                "_c0", "src").withColumnRenamed("_c1", "dst")
# edges.createOrReplaceTempView("retail_data")
import time
import urllib

from pyspark import SparkConf, SparkContext, SQLContext
import pyspark.sql.functions as F

# Declare variables
NumOfPartitions = 10
Directory = "/usr/sreecharan/sampleData/ETL/"

# Download files from S3
#urllib.urlretrieve("https://s3.amazonaws.com/tmp1.sl.com/20170701_20170701165514569.gz", Directory + "/20170701_20170701165514569.gz")
#urllib.urlretrieve("https://s3.amazonaws.com/tmp1.sl.com/20170701_20170702004210139.gz", Directory + "/20170701_20170702004210139.gz")
#files_path = [os.path.abspath(x) for x in os.listdir(Directory)]

# Create SparkContext, SQLContext and configure
conf = SparkConf().setAppName("ETL Using Pyspark").setMaster("local")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", str(NumOfPartitions))

# Read the files and keep the id, the coordinates and a formatted timestamp
data = sc.textFile("file:///" + Directory, NumOfPartitions)
structuredData = data.map(lambda r: r.split(",")).map(lambda r: (
    r[1], r[2],
    round(float(r[6]), 3), round(float(r[7]), 3),
    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(float(r[5])))))
#structuredData.take(10)
columnarData = sqlContext.createDataFrame(
    structuredData, ['ad_id', 'id_type', 'lat', 'long', 'timestamp'])

# Deduplicate: keep the latest timestamp per (ad_id, id_type, lat, long)
distinctRecords = columnarData.groupBy("ad_id", "id_type", "lat",
                                       "long").agg(F.max("timestamp"))
distinctRecords.cache()
distinctRecords.toJSON().saveAsTextFile(