from pyspark import SparkConf
from pyspark.sql import SparkSession


def pyspark_api():
    text_file_path = "/home/holyzing/Desktop/marvin-prod-20201125.db"
    conf = SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("PysparkApi")
    # SparkSession.builder is a property, not a callable; pass the conf through config().
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    text_file = spark.read.text(text_file_path)
    print(text_file.first(), text_file.count())
    line_with_insert = text_file.filter(text_file.value.contains("insert"))
    print(line_with_insert.count())
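# A follow-up sketch (an assumption, not part of the original function): the
# same "insert" filter expressed as Spark SQL, reusing the `spark` session and
# `text_file` DataFrame built in pyspark_api() above.
def pyspark_sql_variant(spark, text_file):
    text_file.createOrReplaceTempView("lines")
    spark.sql("SELECT COUNT(*) AS n FROM lines WHERE value LIKE '%insert%'").show()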
from pyspark.sql import SparkSession


def main(self):
    # builder is a property; calling builder() raises a TypeError.
    spark = SparkSession.builder.appName("testpy").enableHiveSupport().getOrCreate()
    spark.sparkContext.setCheckpointDir("/tmp/checkpoints")
    self.graph(spark)
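# Illustrative sketch (an assumption, not from the original class): the
# checkpoint directory set above is what allows DataFrame.checkpoint() to run;
# without setCheckpointDir() Spark raises an error.
def checkpoint_example(spark):
    df = spark.range(10)
    df = df.checkpoint()  # truncates lineage, materializing under /tmp/checkpoints
    return df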
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf().setAppName("building a warehouse")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)

if __name__ == '__main__':
    logFile = "YOUR_SPARK_HOME/README.md"  # Should be some file on your system
    appName = "building a warehouse"
    master = "local[*]"
    # builder is a property; getOrCreate() reuses the SparkContext created above.
    spark = SparkSession.builder.appName(appName).master(master).getOrCreate()
    logData = spark.read.text(logFile).cache()
    numAs = logData.filter(logData.value.contains('a')).count()
    numBs = logData.filter(logData.value.contains('b')).count()
    print("Lines with a: %i, lines with b: %i" % (numAs, numBs))
    spark.stop()
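# Usage note (a sketch; the script name and master value are assumptions, not
# from the original snippet): a standalone script like the one above is run
# through spark-submit, e.g.
#   $SPARK_HOME/bin/spark-submit --master "local[4]" simple_app.py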
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name
# from pyspark.conf import SparkConf
# from pyspark import SparkContext

# conf = SparkConf().setAppName('hello').setMaster('spark://SC-PC.localdomain:7077')
# sc = SparkContext(conf=conf)
# sc = SparkContext("local", "test")

spark = SparkSession.builder.master("local").enableHiveSupport().getOrCreate()

# spark.read is already a DataFrameReader; the deprecated SQLContext wrapper is unnecessary.
df = spark.read.parquet("hdfs://localhost:9000/test/database/")
names = df.select(input_file_name())  # source file path of each row
names.show()
d = input("WAIT")
# names.repartition(1).write.option("header", "true").csv("filename1.csv")

jdbcDF2 = spark.read.jdbc("jdbc:postgresql:dbserver", "schema.tablename",
                          properties={"user": "******", "password": "******"})
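# An equivalent way to express the JDBC read above, using the generic
# format/option/load API (a sketch; the URL, table, and credentials are the
# same placeholders as in the snippet):
jdbcDF = (spark.read.format("jdbc")
          .option("url", "jdbc:postgresql:dbserver")
          .option("dbtable", "schema.tablename")
          .option("user", "******")
          .option("password", "******")
          .load())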