def test_convert_row_to_dict(self):
    row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
    self.assertEqual(1, row.asDict()['l'][0].a)
    df = self.sc.parallelize([row]).toDF()
    df.registerTempTable("test")
    row = self.sqlCtx.sql("select l, d from test").head()
    self.assertEqual(1, row.asDict()["l"][0].a)
    self.assertEqual(1.0, row.asDict()['d']['key'].c)
def test_convert_row_to_dict(self):
    row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
    self.assertEqual(1, row.asDict()['l'][0].a)
    df = self.sc.parallelize([row]).toDF()
    with self.tempView("test"):
        df.createOrReplaceTempView("test")
        row = self.spark.sql("select l, d from test").head()
        self.assertEqual(1, row.asDict()["l"][0].a)
        self.assertEqual(1.0, row.asDict()['d']['key'].c)
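# A minimal sketch (separate from the tests above) of the asDict() behavior
# they exercise: by default nested Rows stay Rows, while
# asDict(recursive=True) converts them to plain dicts as well.
from pyspark.sql import Row

nested = Row(l=[Row(a=1, b='s')])
assert nested.asDict()['l'][0].a == 1                   # inner value is still a Row
assert nested.asDict(recursive=True)['l'][0]['a'] == 1  # fully converted to dicts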
def test_column_select(self):
    df = self.df
    self.assertEqual(self.testData, df.select("*").collect())
    self.assertEqual(self.testData, df.select(df.key, df.value).collect())
    self.assertEqual([Row(value='1')],
                     df.where(df.key == 1).select(df.value).collect())
def parse_line_to_SparkSQLRow(self, line):
    from pyspark.sql import Row
    return Row(**self.parse_line_to_dict(line))
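# A minimal sketch of the Row(**dict) pattern used above; parse_line_to_dict
# is assumed to return a flat dict like the literal below. Note that before
# Spark 3.0, keyword-created Rows sort their fields alphabetically.
from pyspark.sql import Row

parsed = {"name": "Alice", "age": 5}
row = Row(**parsed)  # keyword arguments become named fields
assert row.name == "Alice" and row["age"] == 5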
StructField("pix6",DoubleType(),True), StructField("pix7",DoubleType(),True), StructField("pix8",DoubleType(),True), StructField("pix9",DoubleType(),True), StructField("pix10",DoubleType(),True), StructField("pix11",DoubleType(),True), StructField("pix12",DoubleType(),True), StructField("pix13",DoubleType(),True), StructField("pix14",DoubleType(),True), StructField("pix15",DoubleType(),True), StructField("pix16",DoubleType(),True), StructField("label",DoubleType(),True) ]) pen_raw = sc.textFile("first-edition/ch08/penbased.dat", 4).map(lambda x: x.split(", ")).map(lambda row: [float(x) for x in row]) dfpen = sqlContext.createDataFrame(pen_raw.map(Row.fromSeq(_)), penschema) def parseRow(row): d = {("pix"+str(i)): row[i-1] for i in range(1,17)} d.update({"label": row[16]}) return d dfpen = sqlContext.createDataFrame(pen_raw.map(parseRow), penschema) va = VectorAssembler(outputCol="features", inputCols=dfpen.columns[0:-1]) penlpoints = va.transform(dfpen).select("features", "label") pensets = penlpoints.randomSplit([0.8, 0.2]) pentrain = pensets[0].cache() penvalid = pensets[1].cache() penlr = LogisticRegression(regParam=0.01)
    ds = json.loads(v[1])
    return ((ds['shape'], ds['color']), [ds['size']])

db_prop = {'user': '******', 'password': '******'}

def writeToDB(v):
    if not v.isEmpty():
        try:
            v.toDF().write.jdbc(url='jdbc:postgresql://localhost:5432/mydb',
                                table='logs', mode='append',
                                properties=db_prop)
        except Exception as e:
            print(e)
            print("bad, bad")

parsed = kafkaStream.map(parseJson)
parsed = parsed.reduceByKey(lambda x, y: x + y).map(
    lambda x: (x[0][0], (x[0][1], len(x[1]), np.percentile(x[1], 10))))
parsed.reduceByKey(lambda x, y: x if x[1] > y[1] else y).map(
    lambda v: Row(currentTime=datetime.datetime.now(), shape=v[0],
                  mostPopularColor=v[1][0],
                  percentile=float(v[1][2]))).foreachRDD(writeToDB)

ssc.start()
ssc.awaitTermination()
def get_graphedges(line):
    list1 = line.split(':')
    if list1[1] == '':
        return None
    list2 = list1[1].split(' ')
    list2 = filter(None, list2)
    results = []
    s = list1[0]
    for d in list2:
        results.append((s, d))
    return results

KnownRow = Row('node', 'source', 'distance')
schema = StructType([
    StructField('node', StringType(), False),
    StructField('source', StringType(), False),
    StructField('distance', IntegerType(), False),
])

graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).filter(
    lambda x: x is not None).flatMap(lambda x: x).coalesce(1)
graphedges = graphedges_rdd.toDF(['source', 'destination']).cache()
graphedges.registerTempTable('SourceDestTable')

initial_node = source_node
initial_row = KnownRow(initial_node, initial_node, 0)
knownpaths = sqlContext.createDataFrame([initial_row], schema=schema)
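# A minimal sketch of the Row-factory pattern used for KnownRow above:
# calling Row with field names returns a reusable constructor whose
# positional arguments fill those fields in order.
from pyspark.sql import Row

KnownRow = Row('node', 'source', 'distance')
r = KnownRow('B', 'A', 3)
assert r.node == 'B' and r.distance == 3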
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Ingest NLS pages, clean each page and extract its articles, and save
    them to HDFS with metadata associated with each article.

    Metadata collected: "title", "edition", "year", "place",
    "archive_filename", "source_text_filename", "text_unit", "text_unit_id",
    "num_text_unit", "type_archive", "model", "type_page", "header", "term",
    "definition", "num_articles", "num_page_words", "num_article_words"

    Data is saved as DataFrames into HDFS.

    Example: 'Encyclopaedia Britannica: or, A dictionary of arts and sciences':
        - archive_name: /home/tdm/datasets/eb_test/144850366
          articles:
            ACQUEST:
                - or Acquist, in law, signifies goods got by purchase or
                  donation. See CoNtiUEST.
            ACQUI:
                - "a town of Italy, in the Dutchy of Montferrat, with a
                  biihop\u2019s see, and commodious baths. It was taken by
                  the Spaniards in 1745, and retaken by the Piedmontese in
                  1746; but after this, it was taken again and difrcantled
                  by the French, who afterwards forsook it. It is seated on
                  the river Bormio, 25 miles N.W. of Genoa, and 30 S. of
                  Cafal, 8. 30. E. long. 44. 40. lat."
            ACQUIESCENCE:
                - in commerce, is the consent that a person gives to the
                  determination given either by arbitration, orbyaconful

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    text_unit = "page"
    # [(title, edition, year, place, archive filename, num pages,
    #   type of archive, type of distribution, model)]
    documents = archives.flatMap(
        lambda archive: [(document.title, document.edition, document.year,
                          document.place, document.archive.filename,
                          document.num_pages, document.document_type,
                          document.model, document)
                         for document in list(archive)])

    # [(title, edition, year, place, archive filename, page filename,
    #   text_unit, text_unit_id, num_pages, type of archive,
    #   type of distribution, model, page_type, header,
    #   articles_page_dictionary, num_articles_page, num_page_words)]
    pages_clean = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1],
                                year_document[2], year_document[3],
                                year_document[4], page.code, text_unit,
                                page.page_id, year_document[5],
                                year_document[6], year_document[7],
                                filter_terms_page(page, defoe_path, os_type),
                                len(page.words))
                               for page in year_document[8]])

    # [(title, edition, year, place, archive filename, page filename,
    #   text_unit, text_unit_id, num_pages, type of archive,
    #   type of distribution, model, page_type, header, term, definition,
    #   num_articles_per_page, num_page_words, num_article_words)]
    pages_articles = pages_clean.flatMap(
        lambda articles_page: [(articles_page[0], articles_page[1],
                                articles_page[2], articles_page[3],
                                articles_page[4], articles_page[5],
                                articles_page[6], articles_page[7],
                                articles_page[8], articles_page[9],
                                articles_page[10], articles_page[11][0],
                                articles_page[11][1], key,
                                articles_page[11][2][key],
                                articles_page[11][3], articles_page[12],
                                len(articles_page[11][2][key].split(" ")))
                               for key in articles_page[11][2]])

    # Example output row:
    # [Encyclopaedia Britannica; or, A dictionary of arts and sciences,
    #  compiled upon a new plan, First edition, 1771, Volume 1, A-B, 1771,
    #  Edinburgh,
    #  /lustre/home/sc048/rosaf4/datasets/nls-data-encyclopaediaBritannica/144133901,
    #  alto/188083401.34.xml, page, Page53, 832, book, nlsArticles, Articles,
    #  AFFAFR, AFFIANCE, in law, denotes the mutual plighting of troth
    #  between a man and a woman to marry each, 32, 887, 17]
    nlsRow = Row("title", "edition", "year", "place", "archive_filename",
                 "source_text_filename", "text_unit", "text_unit_id",
                 "num_text_unit", "type_archive", "model", "type_page",
                 "header", "term", "definition", "num_articles",
                 "num_page_words", "num_article_words")

    sqlContext = SQLContext(context)
    df = sqlContext.createDataFrame(pages_articles, nlsRow)
    df.write.mode('overwrite').option("header", "true").csv("eb_total_articles.csv")
    return "0"
with open("ml-100k/u.ITEM") as f: for line in f: fields = line.split('|') movieNames[int(fields[0])] = fields[1] return movieNames # Create a SparkSession spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/tmp").appName("BestMovies").getOrCreate() # Load up our movie ID -> name dictionary nameDict = loadMovieNames() #Load the data lines = spark.sparkContext.textFile("file:///SparkCourse/ml-100k/u.data") #Convert to RDD with rows as objects movies = lines.map(lambda x: Row(movieID =int(x.split()[1]),rating=x.split()[2])) #Convert RDD above to a dataframe movieDataset = spark.createDataFrame(movies) #Group by movieID. Average ratings for each movie and count ratings per movie bestMovieIDs = movieDataset.groupBy("movieID").agg(avg("rating").alias("avgRating"), count("movieID").alias("Ratings")).filter(col("Ratings")>=100) orderedMovieIDs=bestMovieIDs.orderBy("avgRating", ascending=True).collect() # Print the results print('{:<40}{:>10}{:>10}'.format("Movie","Average","Ratings")) for result in orderedMovieIDs: # Each row has movieID, count as above. print('{:<40}{:>10}{:>10}'.format(nameDict[result[0]][0:40], round(result[1],3), result[2]))
if __name__ == '__main__':
    print("......... hello main method ......")

    # Use reflection to infer the schema
    conf = SparkConf().setAppName("1st sql in spark")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    hc = HiveContext(sc)

    # Create a small collection to simulate a data source
    datas = ["1 zhangfei 44", "2 guanyu 55", "3 zilong 60"]

    # Parallelize datas into the RDD `source`; its element type is string
    source = sc.parallelize(datas)

    # c. Split each record of `source` and convert it into the RDD `rows`,
    #    whose element type is Row; `rows` now meets the requirement for
    #    conversion into a SchemaRDD.
    splits = source.map(lambda line: line.split(" "))
    rows = splits.map(
        lambda words: Row(id=words[0], name=words[1], age=words[2]))

    # d. Use HiveContext to infer the schema of `rows`, producing the
    #    SchemaRDD `people` (inferSchema is the legacy Spark 1.x API).
    people = hc.inferSchema(rows)

    # people.printSchema() shows the result of the schema inference:
    people.printSchema()
    print("the first print end now ......")

    # e. Register the SchemaRDD `people` as the temporary table "people"
    people.registerTempTable("people")

    # Run `select * from people where age > 50 and age < 60` and keep the
    # result in `res`; res.printSchema() shows the resulting schema:
    res = hc.sql("select * from people where age>50 and age<60")
    res.printSchema()
    print(".......................................................... so cute")

    # `res1` also has element type Row; because of the projection
    # (select name) it contains a single column, named `name`.
    res1 = hc.sql("select name from people")
    res1.printSchema()
    print(".......................................................... so cute")
def rebuild_microbatch(rdd, spark_conf):
    global_config = getConfig()
    try:
        encounters = rdd.collect()
        if len(encounters) > 0:
            start_time = datetime.datetime.utcnow()
            print("\n --- Micro-Batch --- \n")
            print("Building encounter objects " + time.ctime())
            rows = []
            encounter_ids = set()
            location_ids = set()
            visit_ids = set()
            patient_ids = set()
            form_ids = set()
            for encounter in encounters:
                # filters
                encounter_ids.add(encounter['encounter_id'])
                location_ids.add(encounter['location_id'])
                visit_ids.add(encounter['visit_id'])
                patient_ids.add(encounter['patient_id'])
                form_ids.add(encounter['form_id'])
                encounter_object = Row(**encounter)
                rows.append(encounter_object)

            spark = get_spark_instance(spark_conf)
            obs_query = '(select * from obs where encounter_id in ({0})) foo'.format(
                (", ".join(["%d"] * len(encounter_ids))) % tuple(encounter_ids))
            obs = spark.read.format('jdbc')\
                .option('url', 'jdbc:mysql://mysql2:3306/' + 'amrs'
                        + '?zeroDateTimeBehavior=convertToNull')\
                .option('useUnicode', 'true')\
                .option('continueBatchOnError', 'true')\
                .option('useSSL', 'false')\
                .option('user', global_config['mysql']['user'])\
                .option('password', global_config['mysql']['password'])\
                .option('dbtable', obs_query)\
                .load()

            encounter_df = spark.createDataFrame(rows, get_encounter_schema())\
                .withColumnRenamed('encounter_datetime', 'encounter_unixtime')\
                .withColumnRenamed('date_created', 'unixtime_created')\
                .withColumnRenamed('date_voided', 'unixtime_voided')\
                .withColumnRenamed('date_changed', 'unixtime_changed')\
                .withColumnRenamed('voided', 'voided_int')

            encounter_df_fixed_schema = encounter_df\
                .withColumn('encounter_datetime',
                            f.to_timestamp(f.from_unixtime(
                                f.col("encounter_unixtime") / 1000)))\
                .withColumn('date_created',
                            f.to_timestamp(f.from_unixtime(
                                f.col("unixtime_created") / 1000)))\
                .withColumn('date_voided',
                            f.to_timestamp(f.from_unixtime(
                                f.col("unixtime_voided") / 1000)))\
                .withColumn('date_changed',
                            f.to_timestamp(f.from_unixtime(
                                f.col("unixtime_changed") / 1000)))\
                .withColumn('voided',
                            f.when(f.col('voided_int') == 0, False)
                            .otherwise(True))\
                .drop('encounter_unixtime', 'unixtime_created',
                      'unixtime_voided', 'unixtime_changed', 'voided_int')\
                .alias('encounter')

            filters = {
                'encounter_ids': {
                    'column': 'encounter_id',
                    'values': [0 if x is None else x for x in list(encounter_ids)]
                },
                'visit_ids': {
                    'column': 'visit_id',
                    'values': [0 if x is None else x for x in list(visit_ids)]
                },
                'form_ids': {
                    'column': 'form_id',
                    'values': [0 if x is None else x for x in list(form_ids)]
                },
                'location_ids': {
                    'column': 'location_id',
                    'values': [0 if x is None else x for x in list(location_ids)]
                },
                'patient_ids': {
                    'column': 'patient_id',
                    'values': [0 if x is None else x for x in list(patient_ids)]
                }
            }

            transformed_obs = transform_obs(obs)
            transformed_encounter = transform_encounter(
                encounter_df_fixed_schema, transformed_obs, True, filters).cache()
            save_to_cassandra(transformed_encounter, 'encounter')
            trigger_couch_update_jobs(location_ids, transformed_encounter)
            transformed_encounter.unpersist()
    except:
        print("An unexpected error occurred")
        raise
    for line in f:
        fields = line.split('|')
        movieNames[int(fields[0])] = fields[1]
    return movieNames

# Create a SparkSession (the config bit is only for Windows!)
spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

# Load up our movie ID -> name dictionary
nameDict = loadMovieNames()

# Get the raw data
lines = spark.sparkContext.textFile("./ml-100k/u.data")

# Convert it to an RDD of Row objects
movies = lines.map(lambda x: Row(movieID=int(x.split()[1])))

# Convert that to a DataFrame
movieDataset = spark.createDataFrame(movies)

# Some SQL-style magic to sort all movies by popularity in one line!
topMovieIDs = movieDataset.groupBy("movieID").count().orderBy(
    "count", ascending=False).cache()

# The results at this point look like:
# |movieID|count|
# +-------+-----+
# |     50|  584|
# |    258|  509|
# |    100|  508|
def combine_text(x):
    return Row(title=x.title, body=x.body, article=f"{x.title} {x.body}")
def test_top_n(spark):
    data_list = []
    for i in range(500):
        data_list.append({"col_1": "robin", "col_2": ["smith", "jones"]})
    for i in range(200):
        data_list.append({"col_1": "john", "col_2": ["jones"]})
    for i in range(300):
        data_list.append({
            "col_1": uuid4().hex[:10],
            "col_2": [uuid4().hex[:10], uuid4().hex[:10], uuid4().hex[:10]],
        })

    df = spark.createDataFrame(Row(**x) for x in data_list)
    df.createOrReplaceTempView("df")

    df_acvf = _generate_df_all_column_value_frequencies(["col_1", "col_2"], df, spark)
    df_acvf.createOrReplaceTempView("df_acvf")
    df_acvf = df_acvf.persist()

    df_perc = _get_df_percentiles(df_acvf, spark)
    df_top_n = _get_df_top_bottom_n(df_acvf, spark, 20)

    percentiles_collected = _collect_and_group_percentiles_df(df_perc)
    top_n_collected = _collect_and_group_top_values(df_top_n)

    percentiles = percentiles_collected["col_1"]
    top_n = top_n_collected["col_1"]

    assert top_n[0]["value_count"] == 500
    assert top_n[0]["value"] == "robin"
    assert top_n[1]["value_count"] == 200
    assert top_n[1]["value"] == "john"

    assert percentiles[0]["percentile_ex_nulls"] == 1.0
    assert percentiles[0]["value_count"] == 500
    assert percentiles[1]["value_count"] == 500
    assert percentiles[2]["value_count"] == 200
    assert percentiles[-1]["value_count"] == 1

    percentiles = percentiles_collected["col_2"]
    top_n = top_n_collected["col_2"]

    assert top_n[0]["value_count"] == 500
    assert top_n[0]["value"] == "smith, jones"

    df_acvf = _generate_df_all_column_value_frequencies_array(["col_2"], df, spark)
    df_acvf.createOrReplaceTempView("df_acvf")
    df_acvf = df_acvf.persist()
    df_top_n = _get_df_top_bottom_n(df_acvf, spark, 20)
    top_n = _collect_and_group_top_values(df_top_n)["col_2"]

    assert top_n[0]["value_count"] == 700
    assert top_n[0]["value"] == "jones"
def parseCSV(idx, part):
    # Skip the header line in the first partition
    if idx == 0:
        next(part)
    for p in csv.reader(part):
        yield Row(ORIGIN=p[14], DEP_DEL15=p[33])
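# A minimal sketch (path and variable names assumed) of how parseCSV is
# typically wired up: its (idx, part) signature matches
# RDD.mapPartitionsWithIndex, which is what lets partition 0 skip the header.
rows = sc.textFile("flights.csv").mapPartitionsWithIndex(parseCSV)
df = rows.toDF()  # requires an active SparkSession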
from pyspark.sql import Row
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # cite: http://spark.apache.org/docs/latest/sql-programming-guide.html#tab_python_0
    spark = SparkSession \
        .builder \
        .appName("sql_top10_business") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Load a text file and convert each line to a Row.
    sc = spark.sparkContext
    review_rdd = sc.textFile("review.csv").map(lambda r: r.split("::"))
    business_rdd = sc.textFile("business.csv").map(lambda b: b.split("::"))

    reviewMap_rdd = review_rdd.map(
        lambda x: Row(business_id=x[2], user_id=x[1]))

    # Infer the schema, and register the DataFrame as a table.
    schema_review = spark.createDataFrame(reviewMap_rdd) \
        .distinct() \
        .groupBy('business_id') \
        .count()

    # Select the first 10 business_ids; head(10) returns a list of Row
    # objects, which is parallelized back into a DataFrame.
    top10_schema = schema_review.sort("count", ascending=False) \
        .head(10)
    top10_schema = spark.createDataFrame(sc.parallelize(top10_schema))

    businessMap_rdd = business_rdd.map(
        lambda x: Row(business_id=x[0], full_address=x[1], categories=x[2]))
    # Infer the schema, and register the DataFrame as a table.
from pyspark.sql import Row

datas1 = [("foo", 1), ("bar", 2)]
datas2 = [
    Row(name='Alice', age=5, height=80),
    Row(name='Alice', age=5, height=80),
    Row(name='Alice', age=10, height=80)
]

# Via the SparkContext
sc.parallelize(datas1).toDF().show()
sc.parallelize(datas2).toDF().show()

# Via the SparkSession
spark.createDataFrame(datas1).show()
spark.createDataFrame(datas2).show()
    tempList.append(tweetLongitude)
    tempList.append(tweetLatitude)
    tempList.append(tweetText)
    tempList.append(reply)
    tempList.append(replyText)
    tempList.append(tweetResturant)
    tempList.append(tweetUrl)
    tempList.append(tweetLocation)
    return tempList

# Note: `sc` here must be a SparkSession (it exposes .read and .createDataFrame)
lines = sc.read.text(path).rdd.map(lambda x: x[0])\
    .map(lambda x: removeComma(x))\
    .map(lambda x: getText(x))\
    .filter(lambda x: x is not None)\
    .filter(lambda x: checkEmpty(x[0]))\
    .map(lambda x: Row(id=x[0], user=x[1], timeStamp=x[2], geo=x[3],
                       longitude=x[4], latitude=x[5], text=x[6], reply=x[7],
                       replyText=x[8], resturant=x[9], url=x[10],
                       userLocation=x[11]))

df = sc.createDataFrame(lines)

tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Extract the features
hashing_tf = HashingTF(numFeatures=2**16, inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
lines = Pipeline(stages=[tokenizer, hashing_tf, idf])

# Get the data to test
line_fit = lines.fit(df)
test_model = line_fit.transform(df)

# Load the trained model
""" Model fitted by :py:class:`RFormula`. """ if __name__ == "__main__": import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext globs = globals().copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: sc = SparkContext("local[2]", "ml.feature tests") sqlContext = SQLContext(sc) globs['sc'] = sc globs['sqlContext'] = sqlContext testData = sc.parallelize([ Row(id=0, label="a"), Row(id=1, label="b"), Row(id=2, label="c"), Row(id=3, label="a"), Row(id=4, label="a"), Row(id=5, label="c") ], 2) globs['stringIndDf'] = sqlContext.createDataFrame(testData) (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) sc.stop() if failure_count: exit(-1)
#     Row(language='ta', count=2),
#     Row(language='et', count=3),
#     Row(language='zh', count=13),
#     Row(language='', count=16),
#     Row(language='se', count=21),
#     Row(language='fr', count=50),
#     Row(language='ja', count=6),
#     Row(language='id', count=3),
#     Row(language='la', count=6),
#     Row(language='da', count=4),
#     Row(language='fi', count=5),
#     Row(language='he', count=4)
# ]

language_counts = [
    Row(language="en", count=16871),
    Row(language="ru", count=299),
    Row(language="no", count=243),
    Row(language="es", count=80),
    Row(language="pt", count=56),
    Row(language="fr", count=50),
    Row(language="it", count=29),
    Row(language="ro", count=28),
    Row(language="se", count=21),
    Row(language="ms", count=20),
    Row(language="af", count=18),
    Row(language="de", count=17),
    Row(language="", count=16),
    Row(language="zh", count=13),
    Row(language="ku", count=11),
    Row(language="nl", count=11),
def to_row(x):
    # Note: this code targets Python 2 (`unicode` and `long` built-ins).
    data = json.loads(x[1])
    output = {}
    output["meas_flag"] = data.get("meas_flag", "")
    output["meas_method"] = data.get("meas_method", "")
    output["company"] = data.get("company", "")
    output["event"] = data.get("event", "")
    output["meas_datatype"] = data.get("meas_datatype", "")
    output["meas_description"] = data.get("meas_description", "")
    output["meas_name"] = data.get("meas_name", "")
    output["meas_status"] = data.get("meas_status", "")
    output["meas_unit"] = data.get("meas_unit", "")
    output["sensor"] = data.get("sensor", "")
    output["site"] = data.get("site", "")
    output["station"] = data.get("station", "")
    output["ts"] = data.get("ts", 0)

    # Set meas_value and meas_value_datatype
    output["meas_value_datatype"] = "unknown"
    output["meas_value_str"] = ""
    output["meas_value_d"] = np.nan
    output["meas_value_l"] = np.nan
    if "meas_value" in data and isinstance(data['meas_value'], unicode):
        output["meas_value_str"] = data["meas_value"]
        if output["meas_value_str"] != "":
            output["meas_value_datatype"] = 'string'
    if "meas_value" in data and isinstance(data['meas_value'], float):
        if output["meas_datatype"] == "long" and long(
                float(data['meas_value'])) == float(data['meas_value']):
            output["meas_value_l"] = float(data["meas_value"])
            output["meas_value_datatype"] = 'long'
        else:
            output["meas_value_d"] = float(data["meas_value"])
            output["meas_value_datatype"] = 'double'
    if "meas_value" in data and (isinstance(data['meas_value'], int)
                                 or isinstance(data['meas_value'], long)):
        output["meas_value_l"] = float(data["meas_value"])
        output["meas_value_datatype"] = 'long'

    # Set meas_lower_limit
    output["meas_lower_limit_d"] = np.nan
    output["meas_lower_limit_l"] = np.nan
    if "meas_lower_limit" in data and isinstance(data['meas_lower_limit'], float):
        if long(float(data["meas_lower_limit"])) == float(data["meas_lower_limit"]):
            output["meas_lower_limit_l"] = float(data["meas_lower_limit"])
        else:
            output["meas_lower_limit_d"] = float(data["meas_lower_limit"])
    if "meas_lower_limit" in data and (isinstance(data['meas_lower_limit'], int)
                                       or isinstance(data['meas_lower_limit'], long)):
        output["meas_lower_limit_l"] = float(data["meas_lower_limit"])

    # Set meas_upper_limit
    output["meas_upper_limit_d"] = np.nan
    output["meas_upper_limit_l"] = np.nan
    if "meas_upper_limit" in data and isinstance(data['meas_upper_limit'], float):
        if long(float(data["meas_upper_limit"])) == float(data["meas_upper_limit"]):
            output["meas_upper_limit_l"] = float(data["meas_upper_limit"])
        else:
            output["meas_upper_limit_d"] = float(data["meas_upper_limit"])
    if "meas_upper_limit" in data and (isinstance(data['meas_upper_limit'], int)
                                       or isinstance(data['meas_upper_limit'], long)):
        output["meas_upper_limit_l"] = float(data["meas_upper_limit"])

    return Row(**output)
import re
import os
from sklearn import decomposition
import matplotlib.pyplot as plt
from pyspark.ml.linalg import Vectors
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler

# In[63]:

# cold start dataset, contains over 20 features
pwd = '/Users/RUIest/Desktop/big_data_project/lending_data_clean_v3.csv'
lines = spark.read.text(pwd).rdd
parts = lines.map(lambda row: row.value.split(','))
# note: p[7] is skipped
lend_RDD = parts.map(lambda p: Row(
    lable=int(p[22]),
    featuresList=(float(p[0]), float(p[1]), float(p[2]), float(p[3]),
                  float(p[4]), float(p[5]), float(p[6]), float(p[8]),
                  float(p[9]), float(p[10]), float(p[11]), float(p[12]),
                  float(p[13]), float(p[14]), float(p[15]), float(p[16]),
                  float(p[17]), float(p[18]), float(p[19]), float(p[20]),
                  float(p[21]))))

# In[338]:

# Create a DataFrame
lending_df = spark.createDataFrame(lend_RDD)
lending_df.show(10)

# In[339]:

# Convert feature type to vector
lending_df_vectors = lending_df.rdd.map(lambda row: Row(
    label=row["lable"],
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Pyspark example").getOrCreate()

from pyspark.sql import Row
from pyspark.sql.types import StructField, StringType, StructType, LongType

mySchema = StructType([
    StructField("column1", StringType(), True),
    StructField("column2", StringType(), True),
    StructField("column3", LongType(), False)
])

myRow = Row("Bonjour", "French", 1)
myDf = spark.createDataFrame([myRow], mySchema)
myDf.show()
def _create_row(fields, values):
    row = Row(*values)
    row.__fields__ = fields
    return row
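# A minimal sketch of what _create_row (PySpark's internal helper) produces:
# a Row built positionally, with field names attached afterwards, so the
# values keep their given order instead of being reordered by keyword.
r = _create_row(["name", "age"], ["Alice", 5])
assert r.name == "Alice" and r[1] == 5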
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = (SparkSession.builder.appName("Authors").getOrCreate())

schema = StructType([
    StructField("Author", StringType(), False),
    StructField("State", StringType(), False)
])
rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]
authors_df = spark.createDataFrame(rows, schema)
authors_df.show()
import sys

from pyspark import SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

sc = SparkContext()

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("thach")\
        .getOrCreate()

    lines = sc.textFile(sys.argv[1])
    parts = lines.map(lambda l: l.split(",", 1))
    parts = parts.map(lambda l: [l[0], l[1].split(",")])
    plantsRDD = parts.map(lambda p: Row(plant=p[0], items=p[1]))
    plantsRDD_result = spark.createDataFrame(plantsRDD)

    plants_withID = plantsRDD_result.orderBy('plant').withColumn(
        "id", monotonically_increasing_id())
    plants_withID.createOrReplaceTempView("plant_states")
    getFrequentItems = plants_withID.select("id", "items")

    fpGrowth = FPGrowth(itemsCol="items",
                        minSupport=float(sys.argv[3]),
                        minConfidence=float(sys.argv[4]))
    model = fpGrowth.fit(getFrequentItems)

    def get_antecedent_length(antecedent):
        return len(antecedent)
with open("basics/ml-100k/u.ITEM") as f: for line in f: fields = line.split('|') movieNames[int(fields[0])] = fields[1] return movieNames # Create a SparkSession (the config bit is only for Windows!) spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp").appName("PopularMovies").getOrCreate() # Load up our movie ID -> name dictionary nameDict = loadMovieNames() # Get the raw data lines = spark.sparkContext.textFile("file:///Spark-Python/basics/ml-100k/u.data") # Convert it to a RDD of Row objects movies = lines.map(lambda x: Row(movieID =int(x.split()[1]))) # single column of movieID # Convert that to a DataFrame movieDataset = spark.createDataFrame(movies) # Some SQL-style magic to sort all movies by popularity in one line! Cache the resulting DataSet topMovieIDs = movieDataset.groupBy("movieID").count().orderBy("count", ascending=False).cache() # Show the results at this point: #|movieID|count| #+-------+-----+ #| 50| 584| #| 258| 509| #| 100| 508| topMovieIDs.show() # show top 20
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder \
    .appName("dataFrame") \
    .getOrCreate()

lines = spark.read.text(
    "/home/luogan/lg/softinstall/spark-2.3.0-bin-hadoop2.7/data/mllib/als/sample_movielens_ratings.txt"
).rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]),
                                     timestamp=float(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data.
# Note we set cold start strategy to 'drop' to ensure we don't get NaN
# evaluation metrics.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId",
          ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
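# A sketch of the evaluation step the comment above introduces, following
# the standard Spark ALS example (RegressionEvaluator is imported above):
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))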
def transformToNumeric(inputStr):
    attList = inputStr.split(",")
    #srcip = float(attList[0])
    #srcport = float(attList[1])
    #dstip = float(attList[2])
    #dstport = float(attList[3])
    #proto = 1.0 if attList[4] == "tcp" else 0.0
    total_fpackets = float(attList[5])
    total_fvolume = float(attList[6])
    total_bpackets = float(attList[7])
    total_bvolume = float(attList[8])
    min_fpktl = float(attList[9])
    mean_fpktl = float(attList[10])
    max_fpktl = float(attList[11])
    std_fpktl = float(attList[12])
    min_bpktl = float(attList[13])
    mean_bpktl = float(attList[14])
    max_bpktl = float(attList[15])
    std_bpktl = float(attList[16])
    min_fiat = float(attList[17])
    mean_fiat = float(attList[18])
    max_fiat = float(attList[19])
    std_fiat = float(attList[20])
    min_biat = float(attList[21])
    mean_biat = float(attList[22])
    max_biat = float(attList[23])
    std_biat = float(attList[24])
    duration = float(attList[25])
    min_active = float(attList[26])
    mean_active = float(attList[27])
    max_active = float(attList[28])
    std_active = float(attList[29])
    min_idle = float(attList[30])
    mean_idle = float(attList[31])
    max_idle = float(attList[32])
    std_idle = float(attList[33])
    sflow_fpackets = float(attList[34])
    sflow_fbytes = float(attList[35])
    sflow_bpackets = float(attList[36])
    sflow_bbytes = float(attList[37])
    fpsh_cnt = float(attList[38])
    bpsh_cnt = float(attList[39])
    #furg_cnt = float(attList[40])
    #burg_cnt = float(attList[41])
    total_fhlen = float(attList[42])
    total_bhlen = float(attList[43])
    dscp = float(attList[44])
    classe = float(attList[45])

    linhas = Row(classe=classe, total_fpackets=total_fpackets,
                 total_fvolume=total_fvolume, total_bpackets=total_bpackets,
                 total_bvolume=total_bvolume, min_fpktl=min_fpktl,
                 mean_fpktl=mean_fpktl, max_fpktl=max_fpktl,
                 std_fpktl=std_fpktl, min_bpktl=min_bpktl,
                 mean_bpktl=mean_bpktl, max_bpktl=max_bpktl,
                 std_bpktl=std_bpktl, min_fiat=min_fiat, mean_fiat=mean_fiat,
                 max_fiat=max_fiat, std_fiat=std_fiat, min_biat=min_biat,
                 mean_biat=mean_biat, max_biat=max_biat, std_biat=std_biat,
                 duration=duration, min_active=min_active,
                 mean_active=mean_active, max_active=max_active,
                 std_active=std_active, min_idle=min_idle,
                 mean_idle=mean_idle, max_idle=max_idle, std_idle=std_idle,
                 sflow_fpackets=sflow_fpackets, sflow_fbytes=sflow_fbytes,
                 sflow_bpackets=sflow_bpackets, sflow_bbytes=sflow_bbytes,
                 fpsh_cnt=fpsh_cnt, bpsh_cnt=bpsh_cnt,
                 total_fhlen=total_fhlen, total_bhlen=total_bhlen, dscp=dscp)
    return linhas
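# A minimal sketch (path and session names assumed) of applying
# transformToNumeric: map it over an RDD of CSV lines and let
# createDataFrame infer the schema from the Row's keyword fields.
linhas_rdd = sc.textFile("flows.csv").map(transformToNumeric)
flows_df = spark.createDataFrame(linhas_rdd)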
def func1():
    rawUserRdd = sc.textFile(Path + "u.user")
    print("row count rawUserRdd.count():=", rawUserRdd.count())
    print("first 2 rows:", rawUserRdd.take(2))
    user_Rows = rawUserRdd.map(lambda p: Row(
        userid=int(p[0]),
        age=int(p[1]),
        gender=p[2],
        occupation=p[3],
        zipcode=p[4]
    ))
    print("first 3 Rows, user_Rows.take(3):", user_Rows.take(3))
    user_df = sqlContext.createDataFrame(user_Rows)

    # Show the schema (similar to a table definition)
    print("# schema (similar to a table definition):")
    user_df.printSchema()

    # Show the first 3 rows
    print("# first 3 rows:")
    user_df.show(3)

    # Create an alias for the DataFrame
    df = user_df.alias("df")
    print("# DataFrame alias:")
    df.show(3)

    df.registerTempTable("user_table")
    print("Spark SQL row count:")
    sqlContext.sql("select count(*) counts from user_table").show()

    # Multi-line input using triple quotes
    print("Spark SQL row count 2:")
    sqlContext.sql("""select count(*) counts
                      from user_table""").show()

    print("Spark SQL query (first 20 rows by default):")
    sqlContext.sql("select * from user_table").show()
    print("Spark SQL query (3 rows):")
    sqlContext.sql("select * from user_table").show(3)
    print("Spark SQL query (limit 3, which reduces running time):")
    sqlContext.sql("select * from user_table limit 3").show()

    ### Three ways to select specific columns: RDD, DataFrame, SQL
    # RDD
    userRDDnew = rawUserRdd.map(lambda x: (x[0], x[3], x[2], x[1]))  # pick fields
    print("select fields via RDD:", userRDDnew.take(3))
    # DataFrame, passing column names as strings
    print("# select fields via DataFrame, column names as strings:")
    user_df.select("userid", "occupation", "gender", "age").show(3)
    # DataFrame, using dataFrame.field (the alias works too, e.g. df.userid,
    # df.occupation; bracket syntax df['occupation'] is similar)
    print("# select fields via DataFrame attributes:")
    user_df.select(user_df.userid, user_df.occupation, user_df.gender,
                   user_df.age).show(3)
    # Spark SQL
    sqlContext.sql("select userid,occupation,gender,age from user_table limit 3").show()

    ### Computed columns (some fields need to be derived)
    # RDD
    userRDDnew2 = rawUserRdd.map(lambda x: (x[0], x[3], x[2], 2016 - int(x[1])))
    print("computed field via RDD:", userRDDnew2.take(3))
    # DataFrame: compute the value and give it an alias,
    # otherwise the column is named 2016-df.age
    print("computed field via DataFrame:")
    df.select("userid", "occupation", "gender",
              (2016 - df.age).alias("birthyear")).show(3)
    # Spark SQL
    print("computed field via Spark SQL:")
    sqlContext.sql("select userid,occupation,gender,2016-age birthyear from user_table").show(3)

    ### Filtering (like a WHERE clause)
    # RDD
    print("filter via RDD lambda:",
          rawUserRdd.filter(lambda r: r[3] == "technician" and r[2] == "M"
                            and r[1] == 24).take(3))
    # DataFrame
    # 1. multiple filter calls behave like AND
    user_df.filter("occupation='technician'").filter("gender='M'").filter("age=24").show()
    # 2. a single filter with and/or/not inside one expression
    user_df.filter("occupation='technician' and gender='M' and age=24").show()
    # 3. [name].[field] column expressions: use == instead of =, & instead
    #    of and; bracket references work similarly
    df.filter((df.occupation == "technician") & (df.gender == "M") & (df.age == 24)).show()
    # Spark SQL: simply add a where clause
    sqlContext.sql(
        "select userid,occupation,gender,age from user_table "
        "where occupation='technician' and gender='M' and age=24").show(3)

    ### Sorting
    # RDD takeOrdered
    print("RDD sort, ascending by default:",
          rawUserRdd.takeOrdered(3, key=lambda x: int(x[1])))
    print("RDD sort, descending (negate the key):",
          rawUserRdd.takeOrdered(3, key=lambda x: -1 * int(x[1])))
    # DataFrame
    # 1. ascending (the default)
    user_df.select("userid", "occupation", "gender", "age").orderBy("age").show(3)
    user_df.select("userid", "occupation", "gender", "age").orderBy(df.age).show(3)
    # 2. descending
    user_df.select("userid", "occupation", "gender", "age").orderBy("age", ascending=0).show(3)
    user_df.select("userid", "occupation", "gender", "age").orderBy(df.age.desc()).show(3)
    # Spark SQL: order by ... desc/asc
    sqlContext.sql("select userid,occupation,gender,age from user_table order by age asc").show(3)
    sqlContext.sql("select userid,occupation,gender,age from user_table order by age desc").show(3)

    ### Sorting by multiple columns
    # RDD
    print("RDD multi-column sort:",
          rawUserRdd.takeOrdered(3, key=lambda x: (-int(x[1]), x[2])))  # x[1] desc, then x[2] asc
    # DataFrame
    df.orderBy(["age", "gender"], ascending=[0, 1]).show(3)  # 0 = descending, 1 = ascending
    df.orderBy(df.age.desc(), df.gender).show(3)
    # Spark SQL
    sqlContext.sql("select userid,occupation,gender,age from user_table order by age desc,gender asc").show(3)

    ### Deduplication
    # RDD
    print("RDD distinct:", rawUserRdd.map(lambda x: x[2]).distinct().collect())
    # Constrain multiple fields, like a composite primary key
    print("RDD distinct on multiple fields:",
          rawUserRdd.map(lambda x: (x[1], x[2])).distinct().take(5))
    # DataFrame
    user_df.select("gender").distinct().show()
    user_df.select("age", "gender").distinct().show()  # multiple fields
    # Spark SQL
    sqlContext.sql("select distinct gender from user_table ").show()

    ### Grouping and counting
    # RDD: map each record to (gender, 1), then reduceByKey sums per gender
    print("RDD group counts:",
          rawUserRdd.map(lambda x: (x[2], 1)).reduceByKey(lambda x, y: x + y).collect())
    # counts per (gender, occupation)
    print("RDD group counts, multiple fields:",
          rawUserRdd.map(lambda x: ((x[2], x[3]), 1)).reduceByKey(lambda x, y: x + y).collect())
    # DataFrame
    user_df.select("gender").groupBy("gender").count().show()
    user_df.select("gender", "occupation").groupBy("gender", "occupation").count()\
        .orderBy("gender", "occupation").show(10)
    # TODO crosstab
    user_df.stat.crosstab("occupation", "gender").show(10)
    # Spark SQL
    sqlContext.sql("select gender,count(*) counts from user_table group by gender").show()
    sqlContext.sql("select gender,occupation,count(*) counts from user_table group by gender,occupation").show(10)

    ### Build the zipcode data
    ZipCodeRDD = getZipcode()
    zipcode_data = ZipCodeRDD.map(lambda p: Row(
        zipcode=int(p[0]),
        zipCodeType=p[1],
        city=p[2],
        state=p[3]
    ))
    print("first 3 zipcodes:", zipcode_data.take(3))
    zipcode_df = sqlContext.createDataFrame(zipcode_data)
    zipcode_df.printSchema()
    # Register a temporary table
    zipcode_df.registerTempTable("zipcode_table")
    zipcode_df.show(3)

    ### Joins
    # Spark SQL
    sqlContext.sql(
        "select u.*,z.city,z.state from user_table u "
        "left join zipcode_table z on u.zipcode=z.zipcode "
        "where z.state='NY'").show(10)  # users in New York
    sqlContext.sql(
        "select z.state, count(*) from user_table u "
        "left join zipcode_table z on u.zipcode=z.zipcode "
        "group by z.state").show(10)  # user counts per state
    # DataFrame
    joined_df = user_df.join(zipcode_df, user_df.zipcode == zipcode_df.zipcode, "left_outer")
    print("joined DataFrames:")
    joined_df.printSchema()
    # Group by state
    groupByState_df = joined_df.groupBy("state").count()
    groupByState_pandas_df = groupByState_df.toPandas().set_index("state")
    # Bar chart of the per-state counts
    ax = groupByState_pandas_df["count"].plot(kind="bar", title="State",
                                              figsize=(12, 6), legend=True,
                                              fontsize=12)
    plt.show()
    # Count users per occupation and show the result as a pie chart
    Occupation_df = sqlContext.sql(
        "select u.occupation,count(*) counts from user_table u group by occupation")
    Occupation_pandas_df = Occupation_df.toPandas().set_index("occupation")
    ax2 = Occupation_pandas_df["counts"].plot(kind="pie", title="occupation",
                                              figsize=(8, 8), startangle=90,
                                              autopct="%1.1f%%")
    ax2.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()
    centerX.append(center[0])
    centerY.append(center[1])
    print("cluster " + str(i) + ": " + str(center) + "\n")

# In[18]:

def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

get_ipython().magic(u'time WSSSE = rdd_final.map(lambda point: error(point)).reduce(lambda x, y: x + y)')
print("Within Set Sum of Squared Error = " + str(WSSSE))

# In[20]:

from pyspark.sql import Row
import pandas as pd
import matplotlib.pyplot as plt

# Tuple unpacking in a lambda is Python 2 only; index into the pair instead.
pdf = rdd_final.map(lambda pt: Row(path=pt[0], times=pt[1])).toDF()
pdf = pdf.toPandas()

centers = pd.DataFrame({'x': centerX, 'y': centerY})
pdf.plot(kind='scatter', x='path', y='times')
plt.plot(centerX, centerY, 'rs')
plt.show()
def fake_entry():
    # Note: fake.name() may include titles or suffixes; this assumes
    # a plain "first last" form.
    name = fake.name().split()
    return Row(name[1], name[0], fake.ssn(), fake.job(),
               abs(2016 - fake.date_time().year) + 1)
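# A minimal sketch (assumes `fake = Faker()` from the faker package and an
# active `spark` session) of turning repeated fake_entry() calls into a
# DataFrame; the column names are illustrative.
data = [fake_entry() for _ in range(100)]
df = spark.createDataFrame(
    data, ['last_name', 'first_name', 'ssn', 'occupation', 'age'])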
# http://mail-archives.apache.org/mod_mbox/spark-user/201505.mbox/%3CCAA+15pcYAmJn_CdA8Wu4hh+JCh7b0Kmk+jAQ6S=jgVgPKgxXXg@mail.gmail.com%3E
spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
spark._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false")

AWS_REGION = 'us-east-1'
MIN_SENTENCE_LENGTH_IN_CHARS = 10
MAX_SENTENCE_LENGTH_IN_CHARS = 4500
COMPREHEND_BATCH_SIZE = 25  # This batch size results in groups no larger than 25 items
NUMBER_OF_BATCHES = 10
ROW_LIMIT = 10000  # Each task handles 25*4 records; there should be 10 partitions overall to process 1000 records.

# A PySpark SQL Row in a SchemaRDD; its fields can be accessed like
# attributes. Here, Row is used to create a Row-like class that takes
# review_id and sentiment as attributes.
SentimentRow = Row("review_id", "sentiment")

# Method to get batch sentiment from Comprehend
def getBatchSentiment(input_list):
    # You can import the ratelimit module if you want to further rate limit
    # API calls to Comprehend: https://pypi.org/project/ratelimit/
    # from ratelimit import rate_limited
    arr = []
    bodies = [i[1] for i in input_list]
    client = boto3.client('comprehend', region_name=AWS_REGION)

    # @rate_limited(1)
    def callApi(text_list):
        response = client.batch_detect_sentiment(TextList=text_list,
                                                 LanguageCode='en')
        return response