def test_column_select(self):
    df = self.df
    self.assertEqual(self.testData, df.select("*").collect())
    self.assertEqual(self.testData, df.select(df.key, df.value).collect())
    self.assertEqual([Row(value='1')],
                     df.where(df.key == 1).select(df.value).collect())
# Build the dataset of (docname, category, wordcounts) tuples
for category_dir in listdir(input_dir):
    distinct_labels[curr_cat] = category_dir
    next_docs = sc.wholeTextFiles('/'.join([input_dir, category_dir]))
    docs = docs.union(
        next_docs.map(lambda (doc, lines): (format_text(lines), float(curr_cat))))
    curr_cat += 1

training_rows = docs.sample(False, train_fraction)
testing_rows = docs.subtract(training_rows)

# Prepare training and test documents, which are labeled.
LabeledDocument = Row("text", "label")
train = training_rows.map(lambda x: LabeledDocument(*x)).toDF()
test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF()

# Configure an ML pipeline, which consists of four stages: tokenizer, hashingTF, idf, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=1000, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])
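# A minimal follow-up sketch, not part of the original snippet: fitting the
# pipeline on the training split and scoring the held-out split. The use of
# MulticlassClassificationEvaluator here is an assumption about how the model
# would be evaluated.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

model = pipeline.fit(train)
predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("test accuracy = %g" % evaluator.evaluate(predictions))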
def map_time_mobile(x):
    m_list = []
    mobile_value_time_cut = {
        '15day': [], '30day': [], '60day': [], '90day': [], '180day': [], '365day': [],
        '30_90day': [], '90_180day': [], '180_365day': []
    }
    if x.mobile_value:
        for l in x.mobile_value:
            check_num = 0
            if l['eventtype'] != 'Loan':
                check_num = 1
            else:
                # Deduplicate Loan events that share the same day and partner code.
                key = 'Loan' + str(int(float(l['eventoccurtime'])) / 86400 + l['partnercode'])
                if key not in m_list:
                    check_num = 1
                    m_list.append(key)
            if check_num == 1 and float(l['eventoccurtime']) <= float(x.loan_date_unix):
                # Age of the event relative to the loan date, in seconds.
                delta = float(x.loan_date_unix) - float(l['eventoccurtime'])
                if delta < 31536000:   # 365 days
                    mobile_value_time_cut['365day'].append(l)
                if delta < 15552000:   # 180 days
                    mobile_value_time_cut['180day'].append(l)
                if delta < 7776000:    # 90 days
                    mobile_value_time_cut['90day'].append(l)
                if delta < 5184000:    # 60 days
                    mobile_value_time_cut['60day'].append(l)
                if delta < 2592000:    # 30 days
                    mobile_value_time_cut['30day'].append(l)
                if delta < 1296000:    # 15 days
                    mobile_value_time_cut['15day'].append(l)
                if 2592000 <= delta < 7776000:
                    mobile_value_time_cut['30_90day'].append(l)
                if 7776000 <= delta < 15552000:
                    mobile_value_time_cut['90_180day'].append(l)
                if 15552000 <= delta < 31536000:
                    mobile_value_time_cut['180_365day'].append(l)
    value = x.asDict()
    value['mobile_value_time_cut'] = mobile_value_time_cut
    del value['mobile_value']
    return Row(**value)
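# A hedged usage sketch, not part of the original snippet: applying
# map_time_mobile row by row to a DataFrame that has `mobile_value` and
# `loan_date_unix` columns. The DataFrame name `loan_df` is hypothetical, and
# schema inference over the nested dict may require an explicit schema in practice.
bucketed_df = loan_df.rdd.map(map_time_mobile).toDF()
bucketed_df.select('mobile_value_time_cut').show(5, truncate=False)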
def test_fillna(self):
    schema = StructType([
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("height", DoubleType(), True),
        StructField("spy", BooleanType(), True)
    ])

    # fillna shouldn't change non-null values
    row = self.spark.createDataFrame(
        [(u'Alice', 10, 80.1, True)], schema).fillna(50).first()
    self.assertEqual(row.age, 10)

    # fillna with int
    row = self.spark.createDataFrame(
        [(u'Alice', None, None, None)], schema).fillna(50).first()
    self.assertEqual(row.age, 50)
    self.assertEqual(row.height, 50.0)

    # fillna with double
    row = self.spark.createDataFrame(
        [(u'Alice', None, None, None)], schema).fillna(50.1).first()
    self.assertEqual(row.age, 50)
    self.assertEqual(row.height, 50.1)

    # fillna with bool
    row = self.spark.createDataFrame(
        [(u'Alice', None, None, None)], schema).fillna(True).first()
    self.assertEqual(row.age, None)
    self.assertEqual(row.spy, True)

    # fillna with string
    row = self.spark.createDataFrame(
        [(None, None, None, None)], schema).fillna("hello").first()
    self.assertEqual(row.name, u"hello")
    self.assertEqual(row.age, None)

    # fillna with subset specified for numeric cols
    row = self.spark.createDataFrame(
        [(None, None, None, None)], schema).fillna(50, subset=['name', 'age']).first()
    self.assertEqual(row.name, None)
    self.assertEqual(row.age, 50)
    self.assertEqual(row.height, None)
    self.assertEqual(row.spy, None)

    # fillna with subset specified for string cols
    row = self.spark.createDataFrame(
        [(None, None, None, None)], schema).fillna("haha", subset=['name', 'age']).first()
    self.assertEqual(row.name, "haha")
    self.assertEqual(row.age, None)
    self.assertEqual(row.height, None)
    self.assertEqual(row.spy, None)

    # fillna with subset specified for bool cols
    row = self.spark.createDataFrame(
        [(None, None, None, None)], schema).fillna(True, subset=['name', 'spy']).first()
    self.assertEqual(row.name, None)
    self.assertEqual(row.age, None)
    self.assertEqual(row.height, None)
    self.assertEqual(row.spy, True)

    # fillna with dictionary for boolean types
    row = self.spark.createDataFrame(
        [Row(a=None), Row(a=True)]).fillna({"a": True}).first()
    self.assertEqual(row.a, True)
def __construct_row(values):
    return Row(contigName="chr21",
               start=100,
               referenceAllele="A",
               alternateAlleles=["T", "C"],
               values=values)
def test_rows_both_all_shows_known_diffs_flag_and_known_diffs_count_as_matches(
        spark, comparison_kd1):
    expected_df = spark.createDataFrame([
        Row(acct=10000001234, acct_seq=0,
            stat_cd_base='*2', stat_cd_compare=None,
            stat_cd_match=True, stat_cd_match_type="KNOWN_DIFFERENCE",
            open_dt_base=datetime.date(2017, 5, 1), open_dt_compare=2017121,
            open_dt_match=True, open_dt_match_type="KNOWN_DIFFERENCE",
            cd_base='0001', cd_compare=1.0,
            cd_match=True, cd_match_type="KNOWN_DIFFERENCE"),
        Row(acct=10000001235, acct_seq=0,
            stat_cd_base='V1', stat_cd_compare='V1',
            stat_cd_match=True, stat_cd_match_type="MATCH",
            open_dt_base=datetime.date(2017, 5, 2), open_dt_compare=2017122,
            open_dt_match=True, open_dt_match_type="KNOWN_DIFFERENCE",
            cd_base='0002', cd_compare=2.0,
            cd_match=True, cd_match_type="KNOWN_DIFFERENCE"),
        Row(acct=10000001236, acct_seq=0,
            stat_cd_base='V2', stat_cd_compare='V2',
            stat_cd_match=True, stat_cd_match_type="MATCH",
            open_dt_base=datetime.date(2017, 5, 3), open_dt_compare=2017123,
            open_dt_match=True, open_dt_match_type="KNOWN_DIFFERENCE",
            cd_base='0003', cd_compare=3.0,
            cd_match=True, cd_match_type="KNOWN_DIFFERENCE"),
        Row(acct=10000001237, acct_seq=0,
            stat_cd_base='*2', stat_cd_compare='V3',
            stat_cd_match=False, stat_cd_match_type="MISMATCH",
            open_dt_base=datetime.date(2017, 5, 4), open_dt_compare=2017124,
            open_dt_match=True, open_dt_match_type="KNOWN_DIFFERENCE",
            cd_base='0004', cd_compare=4.0,
            cd_match=True, cd_match_type="KNOWN_DIFFERENCE"),
        Row(acct=10000001238, acct_seq=0,
            stat_cd_base='*2', stat_cd_compare=None,
            stat_cd_match=True, stat_cd_match_type="KNOWN_DIFFERENCE",
            open_dt_base=datetime.date(2017, 5, 5), open_dt_compare=2017125,
            open_dt_match=True, open_dt_match_type="KNOWN_DIFFERENCE",
            cd_base='0005', cd_compare=5.0,
            cd_match=True, cd_match_type="KNOWN_DIFFERENCE")
    ])

    assert comparison_kd1.rows_both_all.count() == 5
    assert expected_df.unionAll(
        comparison_kd1.rows_both_all).distinct().count() == 5
# Compute squared log error
def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1)) ** 2


############### Start Spark ##################################
# Get the context, load the data, and split the columns
sc = SparkContext("local", "hw11p1")
path = "file:///home/cloudera/Documents/hw11/data/Small_Car_Data.csv"
raw_data = sc.textFile(path)
sqlContext = SQLContext(sc)

parts = raw_data.map(lambda l: l.split(","))
pre_df = parts.map(lambda p: Row(displacement=p[3], hspower=p[4]))

# Create a DataFrame for cleaning the data later on
df = sqlContext.createDataFrame(pre_df)

# Count the number of rows before cleaning the data (via filtering);
# `f` is assumed to be an output file opened earlier in the script.
print >> f, "Before filtering count="
print >> f, df.count()

# Clean the data
dff = df.where((df.displacement != 'NaN') & (df.hspower != 'NaN'))

# Count the number of rows after cleaning the data (via filtering)
print >> f, "After filtering count="
print >> f, dff.count()
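# A hedged sketch of how squared_log_error might be used downstream (not in the
# original excerpt): averaging it over (prediction, actual) pairs to report
# RMSLE. `preds_and_actuals` is a hypothetical RDD of (prediction, actual) tuples.
rmsle = np.sqrt(
    preds_and_actuals.map(lambda pa: squared_log_error(pa[0], pa[1])).mean())
print >> f, "RMSLE="
print >> f, rmsle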
def test_corr(self):
    import math
    df = self.sc.parallelize([Row(a=i, b=math.sqrt(i)) for i in range(10)]).toDF()
    corr = df.stat.corr(u"a", "b")
    self.assertTrue(abs(corr - 0.95734012) < 1e-6)
sc = spark.sparkContext

# In[3]:

lines = sc.textFile('people.txt')

# In[4]:

# Inferring the Schema Using Reflection
parts = lines.map(lambda l: l.split(","))

# Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the datatypes.
# Rows are constructed by passing a list of key/value pairs as kwargs to the Row class.
# The keys of this list define the column names of the table, and the types are inferred
# by sampling the whole dataset, similar to the inference that is performed on JSON files.
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

# In[5]:

# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createTempView("people")

# In[6]:

# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

# In[7]:

teenagers
def test_expr(self):
    from pyspark.sql import functions
    row = Row(a="length string", b=75)
    df = self.spark.createDataFrame([row])
    result = df.select(functions.expr("length(a)")).collect()[0].asDict()
    self.assertEqual(13, result["length(a)"])
def test_sort_with_nulls_order(self):
    from pyspark.sql import functions
    df = self.spark.createDataFrame(
        [('Tom', 80), (None, 60), ('Alice', 50)], ["name", "height"])
    self.assertEqual(
        df.select(df.name).orderBy(functions.asc_nulls_first('name')).collect(),
        [Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')])
    self.assertEqual(
        df.select(df.name).orderBy(functions.asc_nulls_last('name')).collect(),
        [Row(name=u'Alice'), Row(name=u'Tom'), Row(name=None)])
    self.assertEqual(
        df.select(df.name).orderBy(functions.desc_nulls_first('name')).collect(),
        [Row(name=None), Row(name=u'Tom'), Row(name=u'Alice')])
    self.assertEqual(
        df.select(df.name).orderBy(functions.desc_nulls_last('name')).collect(),
        [Row(name=u'Tom'), Row(name=u'Alice'), Row(name=None)])
def test_dayofweek(self):
    from pyspark.sql.functions import dayofweek
    dt = datetime.datetime(2017, 11, 6)
    df = self.spark.createDataFrame([Row(date=dt)])
    row = df.select(dayofweek(df.date)).first()
    self.assertEqual(row[0], 2)
def test_bit_length_function(self):
    # SPARK-36751: add bit length api for python
    from pyspark.sql.functions import bit_length
    df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
    actual = df.select(bit_length('cat')).collect()
    self.assertEqual([Row(24), Row(32)], actual)
def test_cov(self):
    df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
    cov = df.stat.cov(u"a", "b")
    self.assertTrue(abs(cov - 55.0 / 3) < 1e-6)
# expr('someCol - 5') is the same transformation as performing col('someCol') - 5, or even expr('someCol') - 5.
# This might be confusing, but remember a couple of key points:
# - Columns are just expressions - each one evaluates to a value per row (e.g. expr('count') evaluates to the value of the count column).
# - Columns and transformations of those columns compile to the same logical plan as the parsed expression (((col('someCol') + 5) * 200) - 6) < col('otherCol').
# You can write your expressions as DataFrame code or as SQL expressions and get the same performance characteristics.
from pyspark.sql.functions import expr, col
from pyspark.sql.types import StructType, StructField, StringType, LongType

df.select(expr('(((count + 5) * 200) - 6)')).show(5)
df.select((((col('count') + 5) * 200) - 6)).show(5)

from pyspark.sql import Row

# Accessing data in rows: you can specify the position that you would like
myRow = Row('Hello', None, 1)
r1 = myRow[0]
print(r1)  # 'Hello'

df.createOrReplaceTempView('dfTable')

# Create a DataFrame on the fly
my_schema = StructType([
    StructField('some', StringType(), True),
    StructField('col', StringType(), True),
    StructField('names', LongType(), False, metadata={'hello': 'world'})
])
myDF = spark.createDataFrame([myRow], my_schema)
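# A small, hedged check of the claim above (not in the original text): both
# formulations should produce the same plan, which can be inspected with explain().
df.select(expr('(((count + 5) * 200) - 6) AS calc')).explain()
df.select(((((col('count') + 5) * 200) - 6)).alias('calc')).explain()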
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import sys

spark = SparkSession.builder.appName("ALS").getOrCreate()

lines = spark.read.text(sys.argv[1]).rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(
    lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2])))
ratings = spark.createDataFrame(ratingsRDD)

splits = [0.9, 0.85, 0.8, 0.75, 0.7]
f = open("ALS_out_2.txt", "w")

for i in splits:
    (training, test) = ratings.randomSplit([i, 1 - i])
    als = ALS(maxIter=20,
              regParam=0.1,
              userCol="userId",
              itemCol="movieId",
              ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
def test_rows_both_all_returns_all_rows_in_both_dataframes_for_differently_named_columns(
        spark, comparison3):
    expected_df = spark.createDataFrame([
        Row(acct=10000001234,
            dollar_amt_base=123, dollar_amt_compare=123.4, dollar_amt_match=False,
            name_base='George Maharis', name_compare='George Michael Bluth', name_match=False,
            float_fld_base=14530.1555, float_fld_compare=14530.155, float_fld_match=False,
            date_fld_base=datetime.date(2017, 1, 1), date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True, accnt_purge=False),
        Row(acct=10000001235,
            dollar_amt_base=0, dollar_amt_compare=0.45, dollar_amt_match=False,
            name_base='Michael Bluth', name_compare='Michael Bluth', name_match=True,
            float_fld_base=1.0, float_fld_compare=1.0, float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1), date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True, accnt_purge=False),
        Row(acct=10000001236,
            dollar_amt_base=1345, dollar_amt_compare=1345.0, dollar_amt_match=True,
            name_base='George Bluth', name_compare='George Bluth', name_match=True,
            float_fld_base=None, float_fld_compare=None, float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1), date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True, accnt_purge=False),
        Row(acct=10000001237,
            dollar_amt_base=123456, dollar_amt_compare=123456.0, dollar_amt_match=True,
            name_base='Bob Loblaw', name_compare='Bob Loblaw', name_match=True,
            float_fld_base=345.12, float_fld_compare=345.12, float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1), date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True, accnt_purge=False),
        Row(acct=10000001239,
            dollar_amt_base=1, dollar_amt_compare=1.05, dollar_amt_match=False,
            name_base='Lucille Bluth', name_compare='Lucille Bluth', name_match=True,
            float_fld_base=None, float_fld_compare=None, float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1), date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True, accnt_purge=True)
    ])

    assert comparison3.rows_both_all.count() == 5
    assert expected_df.unionAll(
        comparison3.rows_both_all).distinct().count() == 5
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11")

sc = SparkContext(master="local[*]", appName="graphFrames-community-detection")
sqlContext = SQLContext(sc)
sc.setLogLevel("ERROR")

inputFile = sys.argv[1]
outputFile = sys.argv[2]

inpData = sc.textFile(inputFile)
rddData = inpData.map(lambda x: x.split(' '))
# Add the reversed edges as well, so the graph is treated as undirected.
rdd = rddData.union(inpData.map(lambda x: x.split(' ')[::-1])) \
    .persist()

edgeSchema = StructType([StructField('src', StringType()),
                         StructField('dst', StringType())])
vertexSchema = StructType([StructField('id', StringType())])

graphVertices = rdd.flatMap(lambda v: [Row(v[0]), Row(v[1])]) \
    .distinct()
graphEdges = rdd.map(lambda x: (x[0], x[1])) \
    .map(lambda x: Row(src=x[0], dst=x[1]))

verticesDataframe = sqlContext.createDataFrame(graphVertices, vertexSchema)
edgesDataframe = sqlContext.createDataFrame(graphEdges, edgeSchema)

graph = GraphFrame(verticesDataframe, edgesDataframe)
graphList = graph.labelPropagation(maxIter=5) \
    .rdd \
    .map(lambda x: (x[1], x[0])) \
    .groupByKey() \
    .map(lambda x: x[1]) \
    .collect()

sortedGraph = list()
def test_rows_both_all_returns_a_dataframe_with_all_rows_in_identical_dataframes(
        spark, comparison2):
    expected_df = spark.createDataFrame([
        Row(acct=10000001234,
            dollar_amt_base=123, dollar_amt_compare=123, dollar_amt_match=True,
            name_base='George Maharis', name_compare='George Maharis', name_match=True,
            float_fld_base=14530.1555, float_fld_compare=14530.1555, float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1), date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True),
        Row(acct=10000001235,
            dollar_amt_base=0, dollar_amt_compare=0, dollar_amt_match=True,
            name_base='Michael Bluth', name_compare='Michael Bluth', name_match=True,
            float_fld_base=1.0, float_fld_compare=1.0, float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1), date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True),
        Row(acct=10000001236,
            dollar_amt_base=1345, dollar_amt_compare=1345, dollar_amt_match=True,
            name_base='George Bluth', name_compare='George Bluth', name_match=True,
            float_fld_base=None, float_fld_compare=None, float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1), date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True),
        Row(acct=10000001237,
            dollar_amt_base=123456, dollar_amt_compare=123456, dollar_amt_match=True,
            name_base='Bob Loblaw', name_compare='Bob Loblaw', name_match=True,
            float_fld_base=345.12, float_fld_compare=345.12, float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1), date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True),
        Row(acct=10000001239,
            dollar_amt_base=1, dollar_amt_compare=1, dollar_amt_match=True,
            name_base='Lucille Bluth', name_compare='Lucille Bluth', name_match=True,
            float_fld_base=None, float_fld_compare=None, float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1), date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True)
    ])

    assert comparison2.rows_both_all.count() == 5
    assert expected_df.unionAll(
        comparison2.rows_both_all).distinct().count() == 5
sc.setLogLevel("WARN") ssc = StreamingContext(sc, 60) sqlContext = SQLContext(sc) kafkaStream = KafkaUtils.createStream(ssc, 'data04:2181', 'trump-consumer-group2', {'trump': 1}) dataJson = kafkaStream.map(lambda x: json.loads(x[1])) messages = dataJson.map(lambda x: (x[ 'text'], datetime.strptime(x['created_at'], '%a %b %d %H:%M:%S %z %Y'). replace(tzinfo=timezone.utc))) messages_downsecs = messages.map(lambda x: (x[0], x[1] - timedelta( seconds=x[1].second, microseconds=x[1].microsecond))) parts = messages_downsecs.map(lambda x: Row( tweet=x[0], sentence=processTweetText(x[0]), created_at=x[1].isoformat())) #parts.count().map(lambda x:'Tweets in this batch: %s' % x).pprint() #partsDF = parts.transform(lambda rdd: rdd.toDF().collect()) #partsDF = parts.transform(lambda rdd: rdd.collect()) #partsDF = parts.transform(lambda rdd: rdd.collect()) schema = StructType([StructField('sentence', StringType(), True)]) partsDF = sqlContext.createDataFrame(sc.emptyRDD(), schema) def RDDsToDF2(rdd, partsDF): print(partsDF) print(rdd.count())
def transformToNumeric(inputStr):
    attList = inputStr.split(",")
    # Fields 0-4 (srcip, srcport, dstip, dstport, proto) are intentionally not used.
    # Map each remaining flow-feature name to its column index in the CSV line.
    fields = [
        ('total_fpackets', 5), ('total_fvolume', 6), ('total_bpackets', 7),
        ('total_bvolume', 8), ('min_fpktl', 9), ('mean_fpktl', 10),
        ('max_fpktl', 11), ('std_fpktl', 12), ('min_bpktl', 13),
        ('mean_bpktl', 14), ('max_bpktl', 15), ('std_bpktl', 16),
        ('min_fiat', 17), ('mean_fiat', 18), ('max_fiat', 19), ('std_fiat', 20),
        ('min_biat', 21), ('mean_biat', 22), ('max_biat', 23), ('std_biat', 24),
        ('duration', 25), ('min_active', 26), ('mean_active', 27),
        ('max_active', 28), ('std_active', 29), ('min_idle', 30),
        ('mean_idle', 31), ('max_idle', 32), ('std_idle', 33),
        ('sflow_fpackets', 34), ('sflow_fbytes', 35), ('sflow_bpackets', 36),
        ('sflow_bbytes', 37), ('fpsh_cnt', 38), ('bpsh_cnt', 39),
        ('furg_cnt', 40),  # to be removed
        ('burg_cnt', 41),  # to be removed
        ('total_fhlen', 42), ('total_bhlen', 43), ('dscp', 44), ('classe', 45),
    ]
    linhas = Row(**{name: float(attList[idx]) for name, idx in fields})
    return linhas
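# A hedged usage sketch, not in the original excerpt: parsing a CSV of flow
# records into a DataFrame with this function. The RDD name `flowLines`, the
# file path, and the availability of `sc`/`spark` here are assumptions.
flowLines = sc.textFile("flows.csv")
flowRows = flowLines.map(transformToNumeric)
flowDF = spark.createDataFrame(flowRows)
flowDF.printSchema()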
sc = spark.sparkContext

########################### Q2A ##############################
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# In[3]:

lines = spark.read.option("header", "true").csv("Data/ratings.csv").rdd

# In[4]:

ratingsRDD = lines.map(lambda p: Row(userId=int(p[0]),
                                     movieId=int(p[1]),
                                     rating=float(p[2]),
                                     timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

# In[5]:

ratings.show(5)

# In[6]:

# Split the ratings into thirds, then use two thirds for training and one third for testing.
(split1, split2, split3) = ratings.randomSplit([0.33, 0.33, 0.34])
training1 = split2.union(split3)
test1 = split1
def parseInput(line):
    fields = line.split()
    return Row(movieID=int(fields[1]), rating=float(fields[2]))
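# A hedged usage sketch (assumed context, not part of the original snippet):
# applying parseInput to the MovieLens u.data file and averaging ratings per
# movie. The file path is hypothetical.
lines = spark.sparkContext.textFile("ml-100k/u.data")
movies = lines.map(parseInput)
movieDF = spark.createDataFrame(movies)
averageRatings = movieDF.groupBy("movieID").avg("rating")
averageRatings.show(10)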
def test_schema(self):
    data = [Row(x=float(x)) for x in range(100)]
    df = self.sql.createDataFrame(data)
    tfs.print_schema(df)
        movieNames[int(fields[0])] = fields[1]
    return movieNames


# Create a SparkSession (the config bit is only for Windows!)
spark = SparkSession.builder.config("spark.sql.warehouse.dir").appName(
    "PopularMovies").getOrCreate()

# Load up our movie ID -> name dictionary
nameDict = loadMovieNames()

# Get the raw data
lines = spark.sparkContext.textFile(
    "/Users/ameyapatankar/Desktop/SparkCourse/ml-100k/u.data")
# Convert it to an RDD of Row objects
movies = lines.map(lambda x: Row(movieID=int(x.split()[1])))
# Convert that to a DataFrame
movieDataset = spark.createDataFrame(movies)

# Some SQL-style magic to sort all movies by popularity in one line!
topMovieIDs = movieDataset.groupBy("movieID").count().orderBy(
    "count", ascending=False).cache()

# Show the results at this point:
#
# |movieID|count|
# +-------+-----+
# |     50|  584|
# |    258|  509|
# |    100|  508|
#
.config("spark.executor.memory", "500mb") \ .appName("Exercise1") \ .getOrCreate() products = spark.read.parquet("datasets/products_parquet/") sales = spark.read.parquet("datasets/sales_parquet/") REPLICATION_FACTOR = 101 skewed_keys = sales.groupby("product_id").count().orderBy("count").limit(100).collect() replicated_products = [product["product_id"] for product in skewed_keys] salted_keys = [(product_id, part_id) for product_id in replicated_products for part_id in range(REPLICATION_FACTOR)] rdd = spark.sparkContext.parallelize(salted_keys) replicated_df = rdd.map(lambda x: Row(product_id=x[0], replication=int(x[1]))) replicated_df = spark.createDataFrame(replicated_df) products = products.join( other=broadcast(replicated_df), on=products["product_id"] == replicated_df["product_id"], how="left") \ .withColumn( colName="salted_join_key", col=when(replicated_df["replication"].isNull(), products["product_id"]) .otherwise(concat(replicated_df["product_id"], lit("-"), replicated_df["replication"])) ) sales = sales.withColumn( colName="salted_join_key", col=when(
# Creating a DataFrame from a list of tuples:
# 1. Create a list of tuples; each tuple contains name, city, and age.
# 2. Create an RDD from the list above.
# 3. Convert each tuple to a Row.
# 4. Create a DataFrame by applying createDataFrame to the RDD of Rows.
list_of_tuples = [('alex', 'Sunnyvale', 25),
                  ('mary', 'Cupertino', 22),
                  ('jane', 'Ames', 20),
                  ('bob', 'Stanford', 26)]
print("list_of_tuples = ", list_of_tuples)

rdd = spark.sparkContext.parallelize(list_of_tuples)
print("rdd = ", rdd)
print("rdd.count() = ", rdd.count())
print("rdd.collect() = ", rdd.collect())

# Convert rdd (an RDD[(String, String, Integer)]) into RDD[Row]
people = rdd.map(lambda x: Row(name=x[0], city=x[1], age=int(x[2])))
print("people = ", people)
print("people.count() = ", people.count())
print("people.collect() = ", people.collect())

# Create a DataFrame as df
df = spark.createDataFrame(people)
print("df = ", df)
print("df.count() = ", df.count())
print("df.collect() = ", df.collect())
df.show()
df.printSchema()

# done!
spark.stop()
def generate_spark_graph(strings, sc, mat=None, min_ld=1, max_ld=1):
    """
    Make a graph using the Spark graphframes library.

    Inputs
    ------
    strings : list
        a list of strings to use for the pairwise distance matrix
    sc : pyspark.SparkContext
        a live SparkContext
    mat : pyspark.RDD, optional
        an RDD representing the distance matrix (returned by `distance_matrix`).
        If not given, it is generated automatically.
    min_ld : int, optional
        minimum Levenshtein distance
    max_ld : int, optional
        maximum Levenshtein distance

    Returns
    -------
    g : graphframes.GraphFrame
        object with strings as node names
    """
    try:
        import findspark
        findspark.init()
        import graphframes
        from pyspark.sql import Row, SQLContext
        from pyspark.sql.types import (StructField, StructType, IntegerType,
                                       ShortType, StringType, LongType)
    except:
        warn('Problem importing pyspark -- are you sure your SPARK_HOME is set?')

    sqc = SQLContext(sc)
    strings_b = sc.broadcast(strings)
    size = len(strings)

    # make the vertex DataFrame
    v_schema = StructType([
        StructField('id', IntegerType()),
        StructField('string', StringType())
    ])
    v_rdd = sc.parallelize(range(size)).map(
        lambda x: Row(id=x, string=strings_b.value[x]))
    v = sqc.createDataFrame(v_rdd, schema=v_schema)

    # make the edge DataFrame
    if mat is None:
        mat = distance_matrix(strings, min_ld=min_ld, max_ld=max_ld, sc=sc)
    e_schema = StructType([
        StructField('src', IntegerType()),
        StructField('dst', IntegerType()),
        StructField('weight', ShortType())
    ])
    e = sqc.createDataFrame(mat, schema=e_schema)

    gf = graphframes.GraphFrame(v, e)
    return gf
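# A hedged usage sketch (the example strings and the reuse of an existing
# SparkContext are assumptions, not from the original code): building a
# GraphFrame over a small set of strings and inspecting its vertices and edges.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
g = generate_spark_graph(['cat', 'hat', 'bat', 'cart'], sc, min_ld=1, max_ld=1)
g.vertices.show()
g.edges.show()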
def test_sampleby(self):
    df = self.sc.parallelize([Row(a=i, b=(i % 3)) for i in range(100)]).toDF()
    sampled = df.stat.sampleBy(u"b", fractions={0: 0.5, 1: 0.5}, seed=0)
    self.assertTrue(sampled.count() == 35)
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import countDistinct
import pandas as pd
import os
import matplotlib.pyplot as plt

spark = SparkSession.builder.master("local").appName("Collaborative Filtering").getOrCreate()

# Load the csv
rawdf = spark.read.csv("ratings_Sports_and_Outdoors.csv").toDF('userId', 'itemId', 'rating', 'timestamp')

# Map userId and itemId to integers
userIdIntMap = rawdf.rdd.map(lambda r: r.userId).distinct().zipWithUniqueId().collectAsMap()
itemIdIntMap = rawdf.rdd.map(lambda r: r.itemId).distinct().zipWithUniqueId().collectAsMap()
rawdf = rawdf.rdd.map(
    lambda d: Row(userId=userIdIntMap.get(d.userId),
                  itemId=itemIdIntMap.get(d.itemId),
                  rating=float(d.rating))).toDF()

# Get the ratings' count grouped by userId
counts = rawdf.groupBy(rawdf.userId).count()
counts_group = counts.rdd.map(lambda row: (row['count'], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .sortByKey(ascending=True) \
    .collect()
print(counts_group)

# musical instruments
#(1, 270914), (2, 39277), (3, 13015), (4, 5968), (5, 3226), (6, 1973), (7, 1230), (8, 786), (9, 569), (10, 430), (11, 356), (12, 247), (13, 192), (14, 143), (15, 121), (16, 96), (17, 95), (18, 58), (19, 64), (20, 51), (21, 46), (22, 45), (23, 34), (24, 31), (25, 25), (26, 22), (27, 9), (28, 21), (29, 15), (30, 12), (31, 14), (32, 12), (33, 6), (34, 3), (35, 8), (36, 8), (37, 5), (38, 4), (39, 5), (40, 8), (41, 1), (42, 4), (43, 2), (44, 1), (46, 3), (47, 4), (48, 5), (49, 3), (50, 3), (51, 4), (52, 3), (53, 1), (55, 5), (56, 1), (58, 1), (59, 2), (60, 1), (61, 2), (62, 3), (63, 1), (64, 3), (67, 3), (69, 1), (71, 2), (72, 2), (76, 1), (77, 1), (82, 2), (84, 1), (86, 2), (89, 2), (94, 1), (97, 1), (99, 1), (101, 1), (106, 1), (108, 1), (110, 2), (113, 1), (114, 1), (118, 1), (126, 1), (135, 1), (154, 1), (454, 1), (463, 1), (483, 1)]
# sports
#[(1, 1463787), (2, 288644), (3, 104732), (4, 48990), (5, 26588), (6, 16106), (7, 10328), (8, 7123), (9, 5053), (10, 3678), (11, 2834), (12, 2167), (13, 1711), (14, 1305), (15, 1081), (16, 851), (17, 727), (18, 592), (19, 484), (20, 407), (21, 363), (22, 324), (23, 268), (24, 227), (25, 184), (26, 193), (27, 182), (28, 147), (29, 115), (30, 111), (31, 118), (32, 72), (33, 75), (34, 82), (35, 65), (36, 68), (37, 63), (38, 54), (39, 41), (40, 33), (41, 32), (42, 27), (43, 28), (44, 30), (45, 25), (46, 29), (47, 19), (48, 22), (49, 19), (50, 17), (51, 12), (52, 11), (53, 18), (54, 10), (55, 11), (56, 10), (57, 16), (58, 16), (59, 11), (60, 7), (61, 7), (62, 6), (63, 9), (64, 4), (65, 7), (66, 4), (67, 4), (68, 10), (69, 8), (70, 3), (71, 5), (72, 3), (73, 5), (74, 4), (75, 6), (76, 2), (77, 6), (78, 3), (79, 4), (80, 6), (81, 2), (82, 3), (83, 3), (84, 3), (85, 1), (87, 1), (88, 3), (89, 1), (90, 2), (91, 2), (92, 1), (94, 6), (95, 1), (97, 1), (98, 1), (99, 3), (101, 2), (103, 1), (104, 2), (105, 1), (106, 1), (109, 4), (111, 1), (112, 1), (113, 1), (115, 2), (117, 1), (119, 3), (120, 2), (122, 1), (123, 1), (124, 1), (129, 1), (131, 1), (143, 1), (147, 1), (153, 1), (155, 1), (156, 1), (159, 1), (168, 1), (194, 1), (224, 1), (254, 1), (403, 1)]

# counts_keys = [count[0] for count in counts_group]
# counts_values = [count[1] for count in counts_group]