def main():
    sc = initializeSpark()
    spark = SparkSession(sc)
    directory, post_id = parse_pls()
    rdds = make_rdds_from_dir(directory, sc)
    post_rdd = rdds["posts_rdd"]
    string = make_stripped_string(post_rdd, post_id)
    print("\n Body from post_id: " + str(post_id) +
          ", stripped of whitespaces and special characters:\n")
    print("'" + string + "'\n")
    # Tokenize the string
    tokens = tokenize(string)
    # Remove duplicate entries
    tokens_unique = remove_dupes(tokens)
    # Assign an id to the unique tokens
    token_id_tuple = assign_id_to_list(tokens_unique)
    # Now assign these id's to the original token list
    token_id_all = assign_unique_ids(token_id_tuple, tokens)
    print("\nTokens retrieved from the body with their respective id's: \n")
    for i in token_id_all:
        print(i)
    print("\n\nEdges:\n")
    ids = []
    for i in token_id_all:
        ids.append(i[0])
    # Create edges on a window size of 5, using the ids of the tokens
    edges = create_edges(ids, 5)
    # Remove duplicate edges from the list
    edges = remove_dupe_tuples(edges)
    print(edges)
    print("\n\nPageRank:")
    sqlContext = SQLContext(sc)
    v = sqlContext.createDataFrame(token_id_tuple, ["id", "word"])
    e = sqlContext.createDataFrame(edges, ["src", "dst"])
    g = graphframes.GraphFrame(v, e)
    results = g.pageRank(resetProbability=0.15, tol=0.0001)
    results.vertices.select("word", "pagerank").show(truncate=False)
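# The helpers used in main() (tokenize, create_edges, ...) are defined elsewhere.
# As a rough illustration only, a sliding-window edge builder like create_edges
# could look like the sketch below; the name and behaviour are assumptions, not
# the original implementation.
def create_edges_sketch(ids, window_size):
    edges = []
    for i, src in enumerate(ids):
        # pair each token id with the ids that follow it inside the window
        for dst in ids[i + 1:i + window_size]:
            if src != dst:
                edges.append((src, dst))
    return edges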
def Compute_Average(rinput, output):
    ## Ratings data
    ### Creating the ratings RDD
    my_RDD_strings = sc.textFile(rinput + '/' + 'ratings.csv')
    data = my_RDD_strings.map(lambda line: readline(line))
    ## Extracting the header row
    header_info = data.first()
    ### Select all rows except the header row
    data_mr = data.filter(lambda ratings: ratings != header_info)
    data_mr = data_mr.map(lambda ratings: string_to_float(ratings))
    data_mr_sum_count = data_mr.aggregateByKey(
        (0, 0),
        lambda U, s: (U[0] + s, U[1] + 1),
        lambda U, V: (U[0] + V[0], U[1] + V[1]))  ### format: (movieID, (sum, count))
    ### (movieID, sum / count) -> average rating per movie
    avg_ratings = data_mr_sum_count.map(lambda kv: (kv[0], float(kv[1][0]) / kv[1][1]))
    sorted_avg_ratings = avg_ratings.sortByKey()  ## sort by movieID in ascending order
    ## Choose the output csv file name based on the dataset folder
    if 'ml-20m' in rinput:
        result_csv = 'Prashanth_Manja_result_task1_big.csv'
    else:
        result_csv = 'Prashanth_Manja_result_task1_small.csv'
    sqlContext = SQLContext(sc)
    data_frame = sqlContext.createDataFrame(sorted_avg_ratings)
    panda_data_frame = data_frame.toPandas()
    ## Output as a csv file
    panda_data_frame.to_csv(output + '/' + result_csv, encoding='utf-8',
                            header=['movieID', 'rating_avg'], index=False)
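# Tiny standalone illustration (an assumption, not part of the original script)
# of the aggregateByKey sum/count pattern used above to compute per-key averages.
pairs = sc.parallelize([("m1", 4.0), ("m1", 2.0), ("m2", 5.0)])
sums_counts = pairs.aggregateByKey(
    (0, 0),
    lambda U, v: (U[0] + v, U[1] + 1),
    lambda U, V: (U[0] + V[0], U[1] + V[1]))
averages = sums_counts.mapValues(lambda sc_: sc_[0] / sc_[1])  # m1 -> 3.0, m2 -> 5.0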
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local[10]")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        category_list = self.sc.textFile(
            "/Users/abhinavrungta/Desktop/uf-study/snc/github/SNC-WEB/src/yahoo/ydata-ymovies-user-movie-ratings-train-v1_0.txt"
        ).map(lambda line: (int(line.split(',')[0]), int(line.split(',')[1]),
                            float(line.split(',')[2]), long(line.split(',')[3])))
        category_schema = StructType([
            StructField("userid", IntegerType(), True),
            StructField("movieid", IntegerType(), True),
            StructField("rating", FloatType(), True),
            StructField("time", LongType(), True)
        ])
        category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list.registerTempTable("data")
        movie_list = self.sqlContext.sql(
            "SELECT movieid, COUNT(movieid) AS ct FROM data GROUP BY movieid")
        movie_list.registerTempTable("movie")
        movieid = movie_list.sort(movie_list.ct.desc()).first().movieid
        # movieid = category_list.first().movieid
        category_list = self.sqlContext.sql(
            "SELECT * FROM data WHERE movieid = {0}".format(movieid))
        category_list.registerTempTable("data")
        user_list = self.sqlContext.sql("SELECT DISTINCT userid FROM data LIMIT 50")
        print(user_list.count())
        user_list.show()
        user_list.registerTempTable("users")
        category_list = self.sqlContext.sql(
            "SELECT d.userid AS userid, d.movieid AS movieid, d.rating AS rating, d.time AS time "
            "FROM data d, users u WHERE d.userid = u.userid").repartition(1)
        # category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list = category_list.map(
            lambda line: str(line.userid) + "," + str(line.movieid) + "," +
            str(line.rating) + "," + str(line.time))
        category_list = category_list.repartition(1)
        category_list.saveAsTextFile("data.txt")
sprk = Spark_Session()
conn = sprk.Spark_Context()
sql_conn = sprk.Spark_Connect()
sqlContext = SQLContext(conn)

dept_rdd = conn.textFile("/Users/shuvamoymondal/Downloads/Dept.txt")
emp_rdd = conn.textFile("/Users/shuvamoymondal/Downloads/Emp.txt")

emp = emp_rdd.map(lambda x: x.split(",")).map(
    lambda k: Row(empID=int(k[0]), name=k[1].strip(), Age=k[3].strip(),
                  Sal=float(k[4]), DeptID=int(k[5])))
dept = dept_rdd.map(lambda x: x.split(",")).map(
    lambda k: Row(DeptID=int(k[0]), State=k[2].strip()))

EmpDF = sqlContext.createDataFrame(emp)
DeptDF = sqlContext.createDataFrame(dept)
EmpDF.show()
DeptDF.show()

# emp1.join(dept1, emp1.Deptid == dept1.deptid, "inner").select(emp1.age, dept1.deptid, emp1.Sal).show(2)
EmpDF.withColumn('DEPARTMENTNO', EmpDF.DeptID).withColumn('SALARY', EmpDF.Sal).show()
EmpDF.select(
    EmpDF.Age,
    to_date(from_unixtime(unix_timestamp(EmpDF.Age, 'dd-MM-yyyy'))).alias("ModAge")).show()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging

from pyspark import SparkContext, SparkConf
from pyspark.sql.context import SQLContext
from pyspark.sql.types import Row

__author__ = 'guotengfei'
__time__ = 2019 / 11 / 26
"""
Module comment
"""

LOGGER = logging.getLogger(__name__)

if __name__ == '__main__':
    conf = SparkConf().setAppName('vcom')
    sc = SparkContext(conf=conf)
    sql = SQLContext(sc)

    lines = sc.textFile('../data/users.txt')
    user = lines.map(lambda l: l.split(",")).map(lambda p: (p[0], p[1]))

    sql.createDataFrame(user, ['id', 'name']).registerTempTable('user')
    df = sql.sql("select id,name from user")
    df.write.save("../data/result", format='json')
KnownRow = Row('node', 'source', 'distance')
schema = StructType([
    StructField('node', StringType(), False),
    StructField('source', StringType(), False),
    StructField('distance', IntegerType(), False),
])

graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).filter(
    lambda x: x is not None).flatMap(lambda x: x).coalesce(1)
graphedges = graphedges_rdd.toDF(['source', 'destination']).cache()
graphedges.registerTempTable('SourceDestTable')

initial_node = source_node
initial_row = KnownRow(initial_node, initial_node, 0)
knownpaths = sqlContext.createDataFrame([initial_row], schema=schema)
part_knownpaths = knownpaths

for i in range(6):
    part_knownpaths.registerTempTable('PartKnownPathTable')
    # .show() returns None, so display the result separately from the assignment
    newpaths = sqlContext.sql("""
        SELECT destination AS node, t1.source AS source, (distance+1) AS distance
        FROM SourceDestTable t1
        JOIN PartKnownPathTable t2 ON (t1.source = t2.node)
    """)
    newpaths.show()
    newpaths.registerTempTable('NewPathTable')
    knownpaths.registerTempTable('KnowPathTable')
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import pyspark.sql.functions as fn
import pyspark.sql.types as typ

if __name__ == "__main__":
    # Initialize Spark
    conf = SparkConf().setMaster("local[*]").setAppName("Test4_1")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Create a DataFrame with duplicate rows
    df = sqlContext.createDataFrame(
        [(1, 144.5, 5.9, 33, 'M'),
         (2, 167.2, 5.4, 45, 'M'),
         (3, 124.1, 5.2, 23, 'F'),
         (4, 144.5, 5.9, 33, 'M'),
         (5, 124.1, 5.2, 54, 'F'),
         (3, 124.1, 5.2, 23, 'F'),
         (5, 129.2, 5.3, 42, 'M')],
        ['id', 'weight', 'height', 'age', 'gender'])

    print('Count of rows: {0}'.format(df.count()))
    print('Count of distinct rows: {0}'.format(df.distinct().count()))

    # Drop fully duplicated rows
    df_drop = df.dropDuplicates()
    print('Count of ids: {0}'.format(df_drop.count()))
    print('Count of distinct ids: {0}'.format(
        df.select([c for c in df_drop.columns if c != 'id']).distinct().count()))

    # The original snippet is cut off here; presumably it drops duplicates while
    # ignoring the 'id' column, e.g.:
    df_drop2 = df_drop.dropDuplicates(
        subset=[c for c in df_drop.columns if c != 'id'])
conf = SparkConf().setAppName("load logs") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) assert sc.version >= '1.5.1' text = sc.textFile(inputs1)+sc.textFile(inputs2) def parseline(line): linere = re.compile('^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$') match = re.search(linere, line) if match: m = re.match(linere, line) host = m.group(1) dt = datetime.datetime.strptime(m.group(2), '%d/%b/%Y:%H:%M:%S') path = m.group(3) bys = float(m.group(4)) dct = {"host": host, "datetime": dt, "path": path, "bys": bys} return dct return None rdd = text.map(lambda line: parseline(line)).filter(lambda x: x is not None) df = sqlContext.createDataFrame(rdd).coalesce(1) df.write.format('parquet').save(output, mode='overwrite')
# The opening of this helper is missing from the original snippet; the def line
# and the rel = {} initialisation below are assumed from the matching example.
def f(x):
    rel = {}
    rel['name'] = x[0]
    rel['age'] = x[1]
    return rel


peopleRDD = sc.textFile(
    "file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt"
)

# Define a schema string
schemaString = "name age"

# Build the schema from the schema string
fields = list(
    map(lambda fieldName: StructField(fieldName, StringType(), nullable=True),
        schemaString.split(" ")))
schema = StructType(fields)
# The schema describes the structure: it contains the two fields name and age

rowRDD = peopleRDD.map(lambda line: line.split(',')).map(
    lambda attributes: Row(attributes[0], attributes[1]))
peopleDF = spark.createDataFrame(rowRDD, schema)

# The DataFrame must be registered as a temporary view before it can be queried below
peopleDF.createOrReplaceTempView("people")
results = spark.sql("SELECT * FROM people")
results.rdd.map(lambda attributes: "name: " + attributes[0] + "," + "age:" +
                attributes[1]).foreach(print)
df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other.
# A positive number would mean that there is a tendency that as one variable increases,
# the other increases as well.
# A negative number would mean that as one variable increases,
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn(
    'rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')

# Correlation is a normalized measure of covariance that is easier to understand,
# as it provides quantitative measurements of the statistical dependence
# between two random variables.
df.stat.corr('rand1', 'rand2')
df.stat.corr('id', 'id')

# 4. Cross-tabulation is a powerful tool in statistics that is used to
# observe the statistical significance (or independence) of variables.

# Create a DataFrame with two columns (name, item)
names = ["Alice", "Bob", "Mike"]
items = ["milk", "bread", "butter", "apples", "oranges"]
df = sqlcontext.createDataFrame([(names[i % 3], items[i % 5]) for i in range(100)],
                                ["name", "item"])
df.show()
df.stat.crosstab("name", "item").show()
output_schema = StructType(
    schema.fields +
    [StructField(prediction_field, prediction_type, nullable=True)])

if ascontext:
    if ascontext.isComputeDataModelOnly():
        ascontext.setSparkOutputSchema(output_schema)
        sys.exit(0)

if ascontext:
    s_model = ascontext.getModelContentToString("model")
    s_metadata = ascontext.getModelContentToString("model.metadata")
else:
    s_model = open(modelpath, "r").read()
    s_metadata = open(modelmetadata_path, "r").read()

df = df.toPandas()
model_metadata = json.loads(s_metadata)

import pickle
clf = pickle.loads(s_model)

predictors = model_metadata['predictors']
missing_predictors = [
    predictor for predictor in predictors if predictor not in df.columns
]
if len(missing_predictors):
    raise Exception("Following fields are required by the model for scoring: " +
                    str(missing_predictors))

df[prediction_field] = clf.predict(df[predictors])
df = sqlCtx.createDataFrame(df)

if ascontext:
    ascontext.setSparkOutputData(df)
textinput = sc.textFile(inputs + 'links-simple-sorted.txt')
graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).\
    filter(lambda x: x is not None).flatMap(lambda x: x)
graphedges = graphedges_rdd.toDF(['source', 'destination']).cache()
print('constructed edge graph df from given input')
graphedges.show()

KnownRow = Row('node', 'source', 'distance')
schema = StructType([
    StructField('node', StringType(), False),
    StructField('source', StringType(), False),
    StructField('distance', IntegerType(), False),
])

newRow = KnownRow(source_node, source_node, 0)
finalOut = sqlContext.createDataFrame([newRow], schema=schema).cache()
inter_df = finalOut
# inter_df.show()

for i in range(6):
    print('loop : ', i)
    if (inter_df.filter(inter_df.node == dest_node).count() > 0):
        print('match found')
        break
    # graphedges.join(inter_df.source)
    cond = [inter_df['node'] == graphedges['source']]
    # The original snippet breaks off mid-statement here; the select presumably
    # also carries the incremented distance, e.g.:
    df_result = graphedges.join(inter_df, cond, 'inner').\
        select(graphedges['destination'].alias('node'), graphedges['source'],
               (inter_df['distance'] + 1).alias('distance'))
from pyspark.sql.readwriter import DataFrameReader, DataFrameWriter
from pyspark.sql.window import Window, WindowSpec

conf = SparkConf().setMaster("local").setAppName("MyApp")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)


def stopSpark():
    sc.stop()


def f(x):
    rel = {}
    rel['name'] = x[0]
    rel['age'] = x[1]
    return rel


people = sc.textFile("file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt")\
    .map(lambda line: line.split(','))\
    .map(lambda p: Row(name=p[0], age=int(p[1])))
peopleDF = spark.createDataFrame(people)
# peopleDF = sc.textFile("file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt")\
#     .map(lambda line: line.split(','))\
#     .map(lambda x: Row(**f(x))).toDF()

# The DataFrame must be registered as a temporary view before it can be queried below
peopleDF.createOrReplaceTempView("people")
personsDF = spark.sql("select name,age from people where age>20")
personsDF.rdd.map(lambda t: "Name:" + t[0] + "," + "Age:" + str(t[1])).foreach(print)
swimmersJSON.printSchema()

# Specify the schema programmatically (rather than inferring it by reflection)
stringCSVRDD = sc.parallelize([(123, 'Katie', 19, 'brown'),
                               (234, 'Michael', 22, 'green'),
                               (345, 'Simone', 23, 'blue')])
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])
swimmers = sqlContext.createDataFrame(stringCSVRDD, schema)
swimmers.registerTempTable("swimmers")
swimmers.printSchema()

# DataFrame API
swimmers.count()
swimmers.select("id", "age").filter("age = 22").show()
swimmers.select(swimmers.id, swimmers.age).filter(swimmers.age == 22).show()
swimmers.select("name", "eyeColor").filter("eyeColor like 'b%'").show()

# SQL queries go through the SQLContext rather than the DataFrame itself
sqlContext.sql("select count(1) from swimmers").show()
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

if __name__ == '__main__':
    spark = SparkSession.builder.appName(
        "Python Spark SQL basic example").master("local[*]").config(
            "spark.some.config.option", "some-value").getOrCreate()
    sc = spark
    conn = sc.sparkContext
    sqlContext = SQLContext(conn)

    dept_rdd = conn.textFile("/Users/shuvamoymondal/Downloads/Dept.txt")
    emp_rdd = conn.textFile("/Users/shuvamoymondal/Downloads/Emp.txt")

    emp = emp_rdd.map(lambda x: x.split(",")).map(lambda k: k)
    dept = dept_rdd.map(lambda x: x.split(",")).map(lambda k: k)

    dt1 = emp_rdd.map(lambda j: j.split(",")).map(
        lambda k: Row(empID=k[0], Name=k[1].strip(), Design=k[2].strip(),
                      Age=k[3], Sal=k[4], DeptID=int(k[5])))
    dt2 = dept_rdd.map(lambda j: j.split(",")).map(
        lambda k: Row(DeptID=int(k[0]), Job=k[1].strip(), State=k[2].strip()))

    EmpDF = sqlContext.createDataFrame(dt1)
    DeptDF = sqlContext.createDataFrame(dt2)
dmt = DataModelTools(datamodel)
predictors = model_metadata["predictors"]
DataModelTools.checkPredictors(datamodel, predictors, df)

from pyspark.mllib.clustering import KMeansModel
model = KMeansModel.load(sc, modelpath)

# To score the model we need an RDD of DenseVector (the numeric encoded values
# of the predictors); use DataModelTools to do this.
dv = dmt.extractDenseVector(df, predictors, setToFlag=1.0)


def rowToList(row):
    result = []
    for idx in range(0, len(row)):
        result.append(row[idx])
    return result


# Helper combining a row with its prediction (not used below)
mapFn = lambda xy: rowToList(xy[0]) + [xy[1]]

rdd2 = dv.map(lambda x: rowToList(x[0]) + [model.predict(x[1])])

# Finally convert the RDD containing the list of values for the resulting rows
# back to a dataframe.
outdf = sqlCtx.createDataFrame(rdd2, output_schema)

if ascontext:
    ascontext.setSparkOutputData(outdf)
else:
    print(outdf.take(100))
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ[
            "SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("My application")
        conf.set("spark.executor.memory", "2g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)
        self.df_user = self.sqlContext.read.json("dataset/user.json").cache()
        self.df_review = self.sqlContext.read.json("dataset/review.json").cache()
        self.df_business = self.sqlContext.read.json("dataset/business.json").cache()
        self.df_user.registerTempTable("user")

    def getS3File(self, s3FilePath, destinationPathOnLocal):
        r = requests.get(s3FilePath)
        fileOb = open(destinationPathOnLocal, 'w')
        fileOb.write(r.text)
        fileOb.close()

    def writeToS3File(self, s3FilePath, sourcePathOnLocal):
        fileOb = open(sourcePathOnLocal, 'r')
        payload = fileOb.read()
        fileOb.close()
        headers = {"x-amz-acl": "public-read-write"}
        return requests.put(s3FilePath, headers=headers, data=payload)

    def reads3spark(self, path):
        # path = "s3n://b-datasets/flight_data/*"
        x = self.sc.textFile(path)  # we can just specify all the files
        return x

    def writes3spark(self, x, path):
        x.saveAsTextFile(path)

    def createFeatures(self):
        userData = self.sqlContext.sql(
            "SELECT user_id, name, review_count, votes, fans, yelping_since, elite FROM user"
        )
        userData = userData.map(mapUsers).coalesce(1)
        res = self.sqlContext.createDataFrame(userData)

        review_user = self.df_review.select(self.df_review.business_id,
                                            self.df_review.user_id)
        business_loc = self.df_business.select(self.df_business.business_id,
                                               self.df_business.city,
                                               self.df_business.state)
        df_join_reviewAndBusiness = review_user.join(
            business_loc,
            review_user.business_id == business_loc.business_id).select(
                "user_id", "city", "state")
        df_grouped = df_join_reviewAndBusiness.groupBy(
            ["user_id", "city", "state"]).count()
        df_panda = res.toPandas()
        for name, group in df_grouped:
            if (group['city'] > 10):
                user_id = df_grouped.get_group(name)[0]['user_id']
                df_panda[user_id]['k'] = df_panda[user_id]['k'] + 1

        res = self.sqlContext.createDataFrame(df_panda)
        res.toJSON().saveAsTextFile('user_features.json')
@author: weyu
'''
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
import sys, pandas, json
from docplex.mp.model import Model
from collections import namedtuple
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType

ascontext = None
if len(sys.argv) > 1 and sys.argv[1] == "-test":
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    data = sqlContext.createDataFrame(
        pandas.read_csv(
            "C:\\Users\\weyu\\Documents\\awork_spss\\cplex\\wheat_or_barley_en.csv"
        ))
    limitationInput = "wheat_or_barley_limitation.py"
    input_name = "name"
    input_qmin = "qmin"
    input_qmax = "qmax"
    input_optimise = "Price"
else:
    import spss.pyspark.runtime
    ascontext = spss.pyspark.runtime.getContext()
    sc = ascontext.getSparkContext()
    sqlContext = ascontext.getSparkSQLContext()
    data = ascontext.getSparkInputData()
    limitationInput = "%%input_file_nutrients%%"
    input_name = "%%input_col_name%%"
    input_qmin = "%%input_col_qmin%%"
    # Fragment: the opening of the log-parsing function (and the Row fields
    # before user_id) is not shown in this snippet.
        user_id=match.group(3),
        date=(match.group(4)[:-6]).split(":", 1)[0],
        time=(match.group(4)[:-6]).split(":", 1)[1],
        method=match.group(5),
        endpoint=match.group(6),
        protocol=match.group(7),
        response_code=int(match.group(8)),
        content_size=int(match.group(9))
    )


# .cache() - Persists the RDD in memory, which will be re-used again
access_logs = (sc.textFile(logFile)
               .map(parse_apache_log_line)
               .cache())

schema_access_logs = sqlContext.createDataFrame(access_logs)
# Creates a table on which SQL-like queries can be fired for analysis
schema_access_logs.registerTempTable("logs")

# The endpoint value is a string, so it must be quoted inside the SQL text
endpointsSearch = (sqlContext
                   .sql("SELECT * FROM logs WHERE endpoint='{0}'".format(argv[1]))
                   .rdd.map(lambda row: (row[0], row[1]))
                   .collect())

# def mappingFunc(s):
#     words = s.split(" ")
#     return len(words)

# base_df = spark.read.text(raw_data_files)
# base_df.printSchema()
KnownRow = Row('node', 'source', 'distance')
schema = StructType([
    StructField('node', StringType(), False),
    StructField('source', StringType(), False),
    StructField('distance', IntegerType(), False),
])

graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).filter(
    lambda x: x is not None).flatMap(lambda x: x).coalesce(1)
graphedges = graphedges_rdd.toDF(['source', 'destination']).cache()
graphedges.registerTempTable('SourceDestTable')

initial_node = source_node
initial_row = KnownRow(initial_node, initial_node, 0)
knownpaths = sqlContext.createDataFrame([initial_row], schema=schema)
part_knownpaths = knownpaths

for i in range(6):
    part_knownpaths.registerTempTable('PartKnownPathTable')
    newpaths = sqlContext.sql("""
        SELECT destination AS node, t1.source AS source, (distance+1) AS distance
        FROM SourceDestTable t1
        JOIN PartKnownPathTable t2 ON (t1.source = t2.node)
    """)
    newpaths.registerTempTable('NewPathTable')
    knownpaths.registerTempTable('KnowPathTable')
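# The loop above registers NewPathTable and KnowPathTable, but the snippet ends
# before the newly found paths are merged back. The lines below are a hedged
# sketch (an assumption, not the original author's code) of how the iteration
# might continue: keep only nodes that are not yet known, add them to the known
# paths, and expand the search from them in the next pass.
    part_knownpaths = sqlContext.sql("""
        SELECT n.node, n.source, n.distance
        FROM NewPathTable n
        LEFT JOIN KnowPathTable k ON n.node = k.node
        WHERE k.node IS NULL
    """)
    knownpaths = knownpaths.unionAll(part_knownpaths)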