def main():
    """
    :return: Place and magnitude, where magnitude is greater than 1.0.
    """
    start = time.time()
    data = os.path.join(root, path)
    df = sqlContext.read.json(data)
    df.createOrReplaceTempView('earthquakes')
    earthquakes_df = sqlContext.sql("SELECT properties.mag, properties.place "
                                    "FROM earthquakes "
                                    "WHERE properties.mag > 1.0")
    earthquakes_df.show()
    end = time.time()
    print('Time spent', end - start, 'seconds')
def results(query_def):
    """Query execution logic."""
    # Read JSON into a DataFrame and expose it as a temporary view
    # (sqlContext.read is used here; a plain SparkContext has no .read)
    data_frame = sqlContext.read.json("dataset/tweetsdata_v1.json")
    data_frame.createOrReplaceTempView("tweetDatatable")
    query = sqlContext.sql(query_def)
    # Getting contents of df as Pandas
    data_frame = query.toPandas()
    # data_frame_dropna = data_frame.dropna()
    # return data_frame_dropna.to_html()
    # Display results in HTML
    # data_frame = pd.DataFrame(np.random.randn(20, 5))
    # return data_frame.to_html(classes='styles')
    # myhtml = data_frame.style.set_properties(**{'font-size': '11pt', 'font-family': 'Calibri', 'border-collapse': 'collapse', 'border': '1px solid black'}).render()
    # with open('myhtml.html', 'w') as f:
    #     f.write(myhtml)
    return data_frame.to_html()
def sql(sql):
    sqlContext.sql(sql).show(n=20, truncate=False)
def saveCsv(name, sql):
    sqlContext.sql(sql).repartition(1).write.format("csv").option(
        "header", "false").save(name)
def saveResults(name, sql):
    sqlContext.sql(sql).repartition(1).write.mode('overwrite').parquet(name)
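# A minimal usage sketch for the three helpers above, assuming a working
# sqlContext with a registered view named "cc" (as in the query fragment
# below); the query text and output paths are illustrative placeholders, not
# values taken from the original script.
sql("SELECT url, size(subresources) AS n FROM cc LIMIT 20")
saveCsv("n_subresources_csv", "SELECT url, size(subresources) AS n FROM cc")
saveResults("n_subresources_parquet", "SELECT url, size(subresources) AS n FROM cc")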
FROM (
    SELECT url,
           filter(subresources, s -> s.integrity IS NOT NULL) AS subresources
    FROM cc
    WHERE size(filter(subresources, s -> s.integrity IS NOT NULL)) > 0
) LATERAL VIEW explode(subresources) T AS sri
GROUP BY host, target
ORDER BY sri DESC
""")

# ---------------------------
# 07: What is the number of elements per target protocol?
select = sqlContext.sql("""
    SELECT url AS host, sri.target AS target
    FROM cc
    LATERAL VIEW explode(subresources) T AS sri
    WHERE sri.integrity IS NOT NULL
""")


def parse(r):
    # Classify each (page, subresource) pair by page scheme, target scheme,
    # and whether the target is local ('l') or remote ('r') to the page host.
    # urlparse/urljoin (urllib.parse) and add (operator) are expected to be
    # imported at the top of the full script.
    h = urlparse(r.host)
    t = urlparse(urljoin(r.host, r.target))
    return ((h.scheme, t.scheme, 'l' if h.netloc == t.netloc else 'r'), 1)


select.rdd.map(parse).reduceByKey(add).toDF().repartition(1).write.mode(
    'overwrite').parquet("07_elements_per_protocol")

# ---------------------------
def sql(sql):
    return sqlContext.sql(sql).toPandas()
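# A small sketch of how this pandas-returning variant might be used, assuming
# the "earthquakes" view registered earlier is available in the same session;
# the query is only illustrative.
pdf = sql("SELECT properties.place, properties.mag FROM earthquakes")
print(pdf.head())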
subjects={"math": 80, "english": 56}, enrolled=datetime(2014, 8, 1, 14, 1, 5)), Row(id=2, name="George", active=False, clubs=['chess', 'soccer'], subjects={"math": 60, "english": 96}, enrolled=datetime(2015, 3, 21, 8, 2, 5)), ]) record_df = record.toDF() record_df.show() record_df.createOrReplaceTempView("records") all_records_df = sqlContext.sql('SELECT * FROM records') all_records_df.show() sqlContext.sql('SELECT id, clubs[1], subjects["english"] FROM records').show() sqlContext.sql('SELECT id, NOT active from records').show() sqlContext.sql('SELECT * FROM records where active').show() sqlContext.sql('SELECT * FROM records where subjects["english"] > 90').show() record_df.createGlobalTempView("global_records") sqlContext.sql('SELECT * FROM global_temp.global_records').show()
def nafill(line):
    if (line):
        if (line.number_in):
            number = line.number_in
        else:
            number = line.number_out
        return (number, line.coefficiant_of_variance_in,
                line.coefficiant_of_variance_out, line.call_count_in,
                line.call_count_out, line.call_count_competitor_out)


if __name__ == '__main__':
    sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
    df = sqlContext.sql("SELECT * FROM cdrdb.pre_rec_cdr_pqt_vw")
    hotline = sqlContext.read.text('/data/resources/numlist.txt')
    global hotline_list
    # DataFrames have no map(); go through the underlying RDD to collect values
    hotline_list = hotline.rdd.map(lambda x: x.value).collect()
    df1 = df.select('year', 'month').groupBy('year', 'month').count()
    partionList = df1.select('year', 'month').collect()
    temp = True
    for i, x in enumerate(partionList):
        print(x)
        if (x.year and x.month):
            temp_df = sqlContext.sql(
                "SELECT * FROM cdrdb.pre_rec_cdr_pqt_vw WHERE year='{}' AND month='{}'"
                .format(str(x.year), str(x.month)))
            df2 = sqlContext.createDataFrame(temp_df.rdd.map(parse), [
                'number', 'number2', 'type', 'date', 'week', 'callduration',
                'iscompethot'
from pyspark.shell import sqlContext
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("name_app")\
    .config("spark.mongodb.input.uri", "mongodb://192.168.1.96:27017/mydatabase.data")\
    .config("spark.mongodb.output.uri", "mongodb://192.168.1.96:27017/mydatabase.data")\
    .getOrCreate()

data = spark.createDataFrame([("Bilbo Baggins", 50), ("Gandalf", 1000),
                              ("Thorin", 195), ("Balin", 178), ("Kili", 77),
                              ("Dwalin", 169), ("Oin", 167), ("Gloin", 158),
                              ("Fili", 82), ("Bombur", None)],
                             ["name", "age"])
# data.write.format("com.mongodb.spark.sql.DefaultSource").option("database", "mydatabase").option("collection", "data").save()
data.show()

data.registerTempTable("mytable")
res = sqlContext.sql("select * from mytable limit 5")
res.show()