def read_df(self, path):
    try:
        reader = DataFrameReader(self.sqlctx)
        result = reader.load(path)
    except Exception as e:
        result = None
    return result
def generate_pre_transform_specs_data_frame(self, spark_context=None,
                                            sql_context=None):
    data_frame_reader = DataFrameReader(sql_context)
    pre_transform_specs_data_frame = data_frame_reader.jdbc(
        DbUtil.get_java_db_connection_string(),
        'pre_transform_specs')

    data = []
    for item in pre_transform_specs_data_frame.collect():
        spec = json.loads(item['pre_transform_spec'])
        data.append(json.dumps(spec))

    data_frame = sql_context.read.json(spark_context.parallelize(data))
    self.pre_transform_specs_data_frame = data_frame
def generate_transform_specs_data_frame(self, spark_context=None,
                                        sql_context=None):
    data_frame_reader = DataFrameReader(sql_context)
    transform_specs_data_frame = data_frame_reader.jdbc(
        DbUtil.get_java_db_connection_string(),
        'transform_specs')

    data = []
    for item in transform_specs_data_frame.collect():
        spec = json.loads(item['transform_spec'])
        data.append(json.dumps(spec))

    data_frame = sql_context.read.json(spark_context.parallelize(data))
    self.transform_specs_data_frame = data_frame
def generate_pre_transform_specs_data_frame(self, spark_context=None,
                                            sql_context=None):
    data_frame_reader = DataFrameReader(sql_context)
    pre_transform_specs_data_frame = data_frame_reader.jdbc(
        self.get_connection_string(),
        'pre_transform_specs')

    data = []
    for item in pre_transform_specs_data_frame.collect():
        spec = json.loads(item['pre_transform_spec'])
        data.append(json.dumps(spec))

    # jsonRDD is the older SQLContext API (pre Spark 2.0); the variant above
    # uses the equivalent sql_context.read.json().
    data_frame = sql_context.jsonRDD(spark_context.parallelize(data))
    self.pre_transform_specs_data_frame = data_frame
def db_read(self, table_name, sqlContext):
    '''
    This function takes in a table name and reads from a database
    '''
    dataframe = DataFrameReader(sqlContext).jdbc(
        url=self.__url, table=table_name, properties=self.__properties)
    return dataframe
def method2(sql_context: SQLContext, database_URL: str, database_properties: dict):
    print('fetching jdbc dataframe...')

    # Create a DataFrameReader interface
    jdbc_df = DataFrameReader(sql_context).option("fetchSize", "5001")

    # Create a DataFrame object
    jdbc_df = jdbc_df.jdbc(
        url=database_URL,
        table='RATINGS',
        # column="SERVICE_ID",
        # lowerBound="0",
        # upperBound="4",
        # numPartitions=4,
        properties=database_properties)

    return jdbc_df
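The commented-out arguments in the snippet above correspond to Spark's partitioned JDBC reads. Below is a minimal, hypothetical sketch of that variant; the helper name method2_partitioned and the assumption that SERVICE_ID is a numeric column are illustrative, not from the original:

from pyspark.sql import SQLContext, DataFrameReader

def method2_partitioned(sql_context: SQLContext, database_URL: str,
                        database_properties: dict):
    # Split the read of RATINGS into 4 parallel JDBC queries on SERVICE_ID.
    # lowerBound/upperBound only control how the partition ranges are computed;
    # rows outside the bounds are still returned.
    return DataFrameReader(sql_context).option("fetchSize", "5001").jdbc(
        url=database_URL,
        table='RATINGS',
        column="SERVICE_ID",
        lowerBound=0,
        upperBound=4,
        numPartitions=4,
        properties=database_properties)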
def filename_to_object(filename, context):
    """
    Reads database connection details from the given file and returns a
    DataFrame over the configured table.

    The second line of the file is expected to hold the comma-separated
    fields: host,port,database,user,driver,table.

    :param filename: filename
    :type filename: str or unicode
    :param context: Spark context
    :return: DataFrame over the configured table
    :rtype: pyspark.sql.DataFrame
    """
    lines = open(filename).readlines()
    fields = lines[1].split(",")  # host,port,database,user,driver,table
    host = fields[0]
    port = fields[1]
    database = fields[2]
    user = fields[3]
    driver = fields[4]
    table = fields[5]

    sqlContext = SQLContext(context)
    url = 'postgresql://%s:%s/%s' % (host, port, database)
    properties = {'user': user, 'driver': driver}
    df = DataFrameReader(sqlContext).jdbc(url='jdbc:%s' % url, table=table,
                                          properties=properties)
    return df
def load_datasets(self):
    """
    Loads movielens dataset from a given location
    """
    reader = DataFrameReader(self.sqlctx)
    fields = [
        StructField('userId', IntegerType(), True),
        StructField('movieId', IntegerType(), True),
        StructField('rating', FloatType(), True),
        StructField('timestamp', StringType(), True),
    ]
    schema = StructType(fields)
    self.ratings = reader.csv(os.path.join(self.data_dir, 'ratings.csv'),
                              schema=schema, header=True,
                              mode="DROPMALFORMED")
def load_delay_data(df_reader: DataFrameReader, spark: SparkSession,
                    mes_part_opplan, mes_part_info_dispatch, mes_part_info):
    df_reader.option("dbtable", mes_part_opplan).load() \
        .createOrReplaceTempView("mes_part_opplan")
    df_reader.option("dbtable", mes_part_info_dispatch).load() \
        .createOrReplaceTempView("mes_part_info_dispatch")
    df_reader.option("dbtable", mes_part_info).load() \
        .createOrReplaceTempView("mes_part_info")

    df = spark.sql('''
        select c.MES_PART_INFO_ID, b.MES_PART_INFO_DISPATCH_ID, a.MES_PART_OPPLAN_ID,
               c.PART_NO, c.LOT_NO, c.FOPLAN_ID, c.TASK_QTY, c.MANUFACTURER,
               b.SCHEDULED_OPERATOR_TYPE, b.SCHEDULED_OPERATOR_NO,
               a.SCHEDULED_START_DATE, a.SCHEDULED_COMPLETION_DATE,
               a.START_DATE, a.END_DATE, a.OP_NO, a.OP_NAME, a.OP_DESCRIPTION,
               a.OPER_DEPART, a.ACTUAL_MADE_BY, b.COMPLETION_RECORD_CREATOR
        from mes_part_opplan a
        join mes_part_info_dispatch b
          on a.MES_PART_INFO_ID = b.MES_PART_INFO_ID
         and a.MES_PART_OPPLAN_ID = b.MES_PART_OPPLAN_ID
        join mes_part_info c
          on a.MES_PART_INFO_ID = c.MES_PART_INFO_ID
    ''')
    return df.filter(F.col("START_DATE").isNotNull())
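load_delay_data expects a DataFrameReader whose JDBC url, driver, and credentials are already configured, with only "dbtable" set per table inside the function. A minimal sketch of wiring one up, assuming a PostgreSQL source; the connection values and table names are placeholders, not taken from the original:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("delay-report").getOrCreate()

# Pre-configured reader: everything except "dbtable", which load_delay_data
# sets once per table before calling load().
df_reader = spark.read.format("jdbc") \
    .option("url", "jdbc:postgresql://db-host:5432/mes") \
    .option("driver", "org.postgresql.Driver") \
    .option("user", "******") \
    .option("password", "******")

delays = load_delay_data(df_reader, spark,
                         "mes_part_opplan", "mes_part_info_dispatch",
                         "mes_part_info")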
def detect_anomaly(df):
    # get distinct tickers in the micro-batch dataframe
    df.createOrReplaceTempView("prices")

    # get distribution information for all the stocks
    url = 'postgresql://{}:5432/{}'.format(databaseIP, databaseName)
    properties = {'user': databaseUser, 'password': databasePassword}
    distributions_df = DataFrameReader(sql_context).jdbc(
        url='jdbc:%s' % url, table='stock_distributions', properties=properties)
    distributions_df.createOrReplaceTempView("distributions")

    # calculate upper and lower limits for percent change
    anomalyFactor = 3
    distributions_df = distributions_df.withColumn(
        "upper_limit",
        distributions_df["mean"] + anomalyFactor * distributions_df["stdev"])
    distributions_df = distributions_df.withColumn(
        "lower_limit",
        distributions_df["mean"] - anomalyFactor * distributions_df["stdev"])

    # join the micro-batch data frame that holds the prices and percent
    # changes with the distributions table
    ta = df.alias('ta')
    tb = distributions_df.alias('tb')
    batch_join_anom = ta.join(tb, ta.code == tb.code, how="left")
    batch_join_anom = batch_join_anom.select([
        "ta.time", "ta.code", "percent_change", "upper_limit", "lower_limit",
        "stdev"
    ])
    batch_join_anom.show()

    # keep only the rows whose percent_change is below the lower_limit or
    # above the upper_limit; in other words, keep the anomalies
    batch_join_anom = batch_join_anom.filter(
        (batch_join_anom["percent_change"] < batch_join_anom["lower_limit"]) |
        (batch_join_anom["percent_change"] > batch_join_anom["upper_limit"]))

    # calculate how many standard deviations away from the mean this anomaly was
    batch_join_anom = batch_join_anom.withColumn(
        "num_of_std_away",
        func.abs(batch_join_anom["percent_change"]) / batch_join_anom["stdev"])
    batch_join_anom.show()

    # prepare to push to the database anomalies table
    batch_join_anom = batch_join_anom.select(
        ["ta.time", "ta.code", "percent_change", "num_of_std_away"])
    addToDB(batch_join_anom, "stock_anomalies")
def get_df_from_psql(host=None, port=None, db=None, table=None, user=None,
                     password=None):
    # Fall back to local defaults when connection details are not supplied.
    host = host or 'localhost'
    port = port or '5432'
    db = db or 'testdb2'
    table = table or 'Orders'
    user = user or '******'
    password = password or '******'

    url = 'postgresql://{}:{}/{}'.format(host, port, db)
    properties = {
        'user': user,
        'password': password,
        "driver": "org.postgresql.Driver"
    }
    df = DataFrameReader(sqlContext).jdbc(url='jdbc:%s' % url, table=table,
                                          properties=properties)
    return df
def get_reader(self, spark):
    return DataFrameReader(spark)
def spark_intersect2(mutation_table_name, regions_table_name, DB_CONF,
                     jdbc_jar='postgresql-42.2.12.jar'):

    fs_db_dir = os.getenv('MUTVIZ_FS_DB_FOLDER', "disabled")
    numPartitions = int(os.getenv('MUTVIZ_NUM_PARTITIONS', -1))
    memory = os.getenv('MUTVIZ_DRIVER_MEMORY', "50g")
    sparkDebug = os.getenv('MUTVIZ_SPARK_DEBUG', "false") == "true"

    print("USING " + str(numPartitions) + " PARTITIONS (-1:AUTO).")

    start_time = time.time()

    os.environ["SPARK_HOME"] = os.getenv('MUTVIZ_SPARK_HOME',
                                         "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    driver_class = "org.postgresql.Driver"
    cores = os.getenv('MUTVIZ_CORES', "*")

    print("#### SPARK CONFIGURATION ####")
    print("SPARK HOME: " + os.getenv('SPARK_HOME'))
    print("Using cores: " + cores)
    print("Using memory: " + memory)
    print("Using partitions: " + str(numPartitions))
    print("Debug enabled: " + str(sparkDebug))
    print("#############################")

    spark = SparkSession.builder \
        .master("local[" + cores + "]") \
        .appName("Word Count") \
        .config("spark.jars", jdbc_jar) \
        .config("spark.driver.memory", memory) \
        .config("spark.driver.cores", cores) \
        .getOrCreate()

    sql_ctx = SQLContext(spark.sparkContext)
    sc = spark.sparkContext

    properties = {'user': DB_CONF["postgres_user"],
                  'password': DB_CONF["postgres_pw"],
                  'driver': driver_class}
    url = 'postgresql://' + DB_CONF["postgres_host"] + ':' + \
          DB_CONF["postgres_port"] + '/' + DB_CONF["postgres_db"]

    if fs_db_dir == 'disabled':
        mutations = DataFrameReader(sql_ctx).jdbc(
            url='jdbc:%s' % url, table=mutation_table_name,
            properties=properties)
    else:
        customSchema = StructType([
            StructField("donor_id", IntegerType(), False),
            StructField("tumor_type_id", IntegerType(), False),
            StructField("chrom", IntegerType(), False),
            StructField("position", IntegerType(), False),
            StructField("mutation_code_id", IntegerType(), False),
            StructField("trinucleotide_id_r", IntegerType(), False)
        ])
        mutations = spark.read.format("csv").option("header", "true") \
            .schema(customSchema).load(fs_db_dir + "/" + mutation_table_name)

    regions_df = DataFrameReader(sql_ctx).jdbc(
        url='jdbc:%s' % url, table=regions_table_name, properties=properties)

    regions = regions_df.collect()
    regions = sorted(regions, key=itemgetter('pos_start', 'pos_stop'))
    print("====> REGIONS SORTED AFTER (S) %s" % (time.time() - start_time))

    regions_broadcast = sc.broadcast(regions)
    print("====> REGIONS BROADCAST AFTER (S) %s" % (time.time() - start_time))

    def partitionWork(p):
        localMutations = list(p)
        matched = []

        if sparkDebug:
            print("====> PROCESSING PARTITION AFTER (S) %s" % (time.time() - start_time))

        if localMutations:
            import copy
            localRegions = copy.deepcopy(regions_broadcast.value)

            if localRegions:
                sorted_mutations = sorted(localMutations, key=itemgetter('position'))
                sorted_regions = localRegions

                cur_reg_idx = 0
                cur_mut_idx = 0

                while cur_mut_idx < len(sorted_mutations) and cur_reg_idx < len(sorted_regions):
                    cur_reg = sorted_regions[cur_reg_idx]
                    cur_mut = sorted_mutations[cur_mut_idx]

                    if cur_mut["position"] < cur_reg["pos_start"]:
                        cur_mut_idx += 1
                    elif cur_mut["position"] <= cur_reg["pos_stop"]:
                        if cur_reg["chrom"] == cur_mut["chrom"]:
                            matched.append(cur_mut)
                        else:
                            # look ahead
                            next_region_index = cur_reg_idx + 1
                            while next_region_index < len(sorted_regions) and \
                                    sorted_regions[next_region_index]["pos_start"] <= cur_mut["position"]:
                                if sorted_regions[next_region_index]["chrom"] == cur_mut["chrom"] and \
                                        sorted_regions[next_region_index]["pos_stop"] >= cur_mut["position"]:
                                    matched.append(cur_mut)
                                next_region_index = next_region_index + 1
                        cur_mut_idx += 1
                    else:
                        cur_reg_idx += 1

        return matched

    res = mutations.rdd.mapPartitions(partitionWork).toDF().toPandas()

    print("Spark execution took %s seconds ---" % (time.time() - start_time))
    return res
def spark_intersect(mutation_table_name, regions_table_name, DB_CONF, output_format,
                    regions=None, jdbc_jar='postgresql-42.2.12.jar', groupby=None,
                    useSQL=False, minCount=-1, tumorType=None, filter=None):
    # SQL VS MINE8: 388[1h], 1507 (25min), 1018[bin=20], (1h, no bins), 1101 (5 bins),
    # 994 [100] - 952 [200] 916(ctcf) 941[41]
    # 590 ETS1
    # 3h13 geco 4h37 genomic

    fs_db_dir = os.getenv('MUTVIZ_FS_DB_FOLDER', "disabled")
    numPartitions = int(os.getenv('MUTVIZ_NUM_PARTITIONS', -1))
    memory = os.getenv('MUTVIZ_DRIVER_MEMORY', "50g")
    sparkDebug = os.getenv('MUTVIZ_SPARK_DEBUG', "false") == "true"

    print("USING " + str(numPartitions) + " PARTITIONS (-1:AUTO).")

    start_time = time.time()

    os.environ["SPARK_HOME"] = os.getenv('MUTVIZ_SPARK_HOME',
                                         "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    driver_class = "org.postgresql.Driver"
    cores = os.getenv('MUTVIZ_CORES', "*")

    print("#### SPARK CONFIGURATION ####")
    print("SPARK HOME: " + os.getenv('SPARK_HOME'))
    print("Using cores: " + cores)
    print("Using memory: " + memory)
    print("Using partitions: " + str(numPartitions))
    print("Debug enabled: " + str(sparkDebug))
    if tumorType:
        print("Tumor Type id: " + str(tumorType))
    if filter:
        print("Filter count: " + str(len(filter)))
    print("#############################")

    spark = SparkSession.builder \
        .master("local[" + cores + "]") \
        .appName("Word Count") \
        .config("spark.jars", jdbc_jar) \
        .config("spark.driver.memory", memory) \
        .config("spark.driver.cores", cores) \
        .getOrCreate()

    sql_ctx = SQLContext(spark.sparkContext)
    sc = spark.sparkContext

    properties = {'user': DB_CONF["postgres_user"],
                  'password': DB_CONF["postgres_pw"],
                  'driver': driver_class}
    url = 'postgresql://' + DB_CONF["postgres_host"] + ':' + \
          DB_CONF["postgres_port"] + '/' + DB_CONF["postgres_db"]

    if fs_db_dir == 'disabled':
        mutations = DataFrameReader(sql_ctx).jdbc(
            url='jdbc:%s' % url, table=mutation_table_name, properties=properties)
    else:
        customSchema = StructType([
            StructField("donor_id", IntegerType(), False),
            StructField("tumor_type_id", IntegerType(), False),
            StructField("chrom", IntegerType(), False),
            StructField("position", IntegerType(), False),
            StructField("mutation_code_id", IntegerType(), False),
            StructField("trinucleotide_id_r", IntegerType(), False)
        ])
        mutations = spark.read.format("csv").option("header", "true") \
            .schema(customSchema).load(fs_db_dir + "/" + mutation_table_name)

    def filter1(x):
        return int(x["tumor_type_id"]) == int(tumorType)

    def filter2(x):
        return int(x["donor_id"]) in filter

    if tumorType:
        mutations = mutations.rdd.filter(filter1)

        if filter is not None:
            mutations = mutations.filter(filter2)

        if mutations.isEmpty():
            return []
        else:
            mutations = mutations.toDF()

    regions_df = DataFrameReader(sql_ctx).jdbc(
        url='jdbc:%s' % url, table=regions_table_name, properties=properties)

    if sparkDebug:
        print("############ mutations ==> ", mutations.count())
        print("############ regions ==>", regions_df.count())

    # new
    if useSQL:
        mutations.registerTempTable("mutations")
        regions_df.registerTempTable("regions")
        sql_res = spark.sql(
            "SELECT m.tumor_type_id, m.trinucleotide_id_r, count(*) "
            "from mutations as m, regions as r "
            "WHERE m.chrom=r.chrom AND m.position >= r.pos_start AND m.position <= r.pos_stop "
            "GROUP BY m.tumor_type_id, m.trinucleotide_id_r")
        res = sql_res.rdd.map(
            lambda r: [r["tumor_type_id"], r["trinucleotide_id_r"], r["count(1)"]]).collect()
        print("Spark execution took %s seconds ---" % (time.time() - start_time))
        # print(sql_res.collect())
    else:
        # regions = regions_df.collect()
        # print("====> REGIONS COLLECTED AFTER (S) %s" % (time.time() - start_time))
        # rb = defaultdict(list)
        # for v in regions: rb[v["chrom"]].append(v)
        #
        # for c in rb:
        #     rb[c] = sorted(rb[c], key=itemgetter('pos_start', 'pos_stop'))

        regions = regions_df.collect()
        regions = sorted(regions, key=itemgetter('pos_start', 'pos_stop'))
        print("====> REGIONS SORTED AFTER (S) %s" % (time.time() - start_time))

        regions_broadcast = sc.broadcast(regions)
        print("====> REGIONS BROADCAST AFTER (S) %s" % (time.time() - start_time))

        def partitionWork(p):
            localMutations = list(p)
            matched = []

            if sparkDebug:
                print("====> PROCESSING PARTITION AFTER (S) %s" % (time.time() - start_time))

            if localMutations:
                import copy
                localRegions = copy.deepcopy(regions_broadcast.value)

                if localRegions:
                    sorted_mutations = sorted(localMutations, key=itemgetter('position'))
                    sorted_regions = localRegions

                    cur_reg_idx = 0
                    cur_mut_idx = 0

                    while cur_mut_idx < len(sorted_mutations) and cur_reg_idx < len(sorted_regions):
                        cur_reg = sorted_regions[cur_reg_idx]
                        cur_mut = sorted_mutations[cur_mut_idx]

                        if cur_mut["position"] < cur_reg["pos_start"]:
                            cur_mut_idx += 1
                        elif cur_mut["position"] <= cur_reg["pos_stop"]:
                            if cur_reg["chrom"] == cur_mut["chrom"]:
                                matched.append(cur_mut)
                            else:
                                # look ahead
                                next_region_index = cur_reg_idx + 1
                                while next_region_index < len(sorted_regions) and \
                                        sorted_regions[next_region_index]["pos_start"] <= cur_mut["position"]:
                                    if sorted_regions[next_region_index]["chrom"] == cur_mut["chrom"] and \
                                            sorted_regions[next_region_index]["pos_stop"] >= cur_mut["position"]:
                                        matched.append(cur_mut)
                                    next_region_index = next_region_index + 1
                            cur_mut_idx += 1
                        else:
                            cur_reg_idx += 1

            return matched

        # if numPartitions > 0:
        #     res = mutations.rdd.groupBy(lambda e: e["chrom"], numPartitions=numPartitions).flatMap(partitionWork)
        # else:
        #     res = mutations.rdd.groupBy(lambda e: e["chrom"]).flatMap(partitionWork)

        if sparkDebug:
            print("#### NUM PARTITIONS: ", mutations.rdd.getNumPartitions())

        res = mutations.rdd.mapPartitions(partitionWork)

        if sparkDebug:
            print("############ results ==> ", res.count())

        # Grouping
        # todo: if empty
        if groupby:
            if minCount == -1:
                res = res.toDF().groupBy(groupby).count().rdd.map(output_format)
            else:
                res_df = res.toDF().groupBy(groupby).count()
                res = res_df.filter(res_df["count"] > minCount).rdd.map(output_format)

        if sparkDebug:
            print("############ results after grouping ==> ", res.count())

        res = res.collect()
        sc.stop()

        print("Spark execution took %s seconds ---" % (time.time() - start_time))

    return res
# Thanks to this post, this was solved:
# https://stackoverflow.com/a/46360434/8510370
#
# NOTE: Download the JDBC driver from https://jdbc.postgresql.org/download.html
# and place it in your project, or copy it to all machines with the
# environment variables set.

sparkClassPath = os.getenv('SPARK_CLASSPATH',
                           os.path.join(os.getcwd(), 'postgresql-42.2.2.jar'))

conf = SparkConf()
conf.setAppName('application')
conf.set('spark.jars', 'file:%s' % sparkClassPath)
conf.set('spark.executor.extraClassPath', sparkClassPath)
conf.set('spark.driver.extraClassPath', sparkClassPath)

# Uncomment the line below and modify the IP address if you need to use a
# cluster on a different IP address
# conf.set('spark.master', 'spark://127.0.0.1:7077')

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

url = 'jdbc:postgresql://127.0.0.1:5432/movielens'
properties = {'user': '******', 'password': '******'}

df = DataFrameReader(sqlContext).jdbc(url=url, table='movies', properties=properties)
df.select("movieid", "title").show(n=2)

movies_rdd = df.select("movieid", "title").rdd.map(tuple)
print(movies_rdd.take(5))
    ]
    return sqlContext.createDataFrame(sc.emptyRDD(), StructType(fields))


if __name__ == '__main__':
    """
    host: db host name
    master: master connection string (eg. spark://master_ip:7077)
    """
    host, master = sys.argv[1:]
    conf.setMaster(master)

    params = utils.connection_properties(host, db='owner')
    url = 'postgresql://{host}:{port}/{db}'.format(**params)

    df = DataFrameReader(sqlContext).jdbc(url='jdbc:{}'.format(url),
                                          table='assessor.owndat',
                                          properties=params, numPartitions=8)

    df = df.withColumn(
        'adrconcat',
        concat_ws(' ', df.adrno, df.adrdir, df.adrstr, df.adrsuf,
                  df.cityname, df.statecode, df.zip1))

    func_clean_string = udf(clean_string, StringType())
    df = df.withColumn('own1', func_clean_string('own1'))
    df = df.withColumn('adrconcat', func_clean_string('adrconcat'))

    uniq_own = df.select(df.own1.alias('uniq_own')).distinct()

    df = df.withColumn('key', lit(0))
    uniq_own = uniq_own.withColumn('key', lit(0))
        }, sort_keys=False)
    except BaseException as e:
        return None


try:
    data_file = "airports.dat"
    data_file_url = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"
    download_file(data_file_url, data_file)

    kafka_url = "172.18.0.100:9092"
    kafka_topic = "airports"

    sc = SparkContext("local", "example-of-processing-data")
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)

    airports = DataFrameReader(sqlContext).csv(data_file)

    producer = KafkaProducer(bootstrap_servers=kafka_url)

    # for index in range(1, airports.count()):
    #     print(to_json(airports.take(index)[0]))

    for each in airports.collect():
        # logging.debug(as_json(each))
        producer.send(kafka_topic, as_json(each))
finally:
    producer.close()
    sc.stop()
    shutil.os.remove(data_file)
def _aliased_reader(df_reader: DataFrameReader, format_key: str,
                    path: Optional[str], **options: str) -> DataFrame:
    """ Loads the file of the given type at the given path."""
    return df_reader.format(format_key).load(path, **options)
from pyspark import SparkContext, SparkConf
from pyspark.sql import DataFrameReader, SQLContext
import os

sparkClassPath = os.getenv('SPARK_CLASSPATH', '/opt/spark/jars/postgresql-42.2.14.jar')

# Populate configuration
conf = SparkConf()
conf.setAppName('application')
conf.set('spark.jars', 'file:%s' % sparkClassPath)
conf.set('spark.executor.extraClassPath', sparkClassPath)
conf.set('spark.driver.extraClassPath', sparkClassPath)

# Uncomment the line below and modify the IP address if you need to use a
# cluster on a different IP address
# conf.set('spark.master', 'spark://127.0.0.1:7077')

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

url = 'postgresql://slave:5432/testdb1'
properties = {'user': '******', 'password': '******'}

df = DataFrameReader(sqlContext).jdbc(url='jdbc:%s' % url, table='rental',
                                      properties=properties)  # .schema('incertae')
df.printSchema()
df.show()
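For comparison, in Spark 2.x and later the same read is usually written against a SparkSession, whose read property returns the DataFrameReader. A minimal sketch, reusing the sparkClassPath, url, table, and credential placeholders from the script above:

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('application') \
    .config('spark.jars', 'file:%s' % sparkClassPath) \
    .config('spark.driver.extraClassPath', sparkClassPath) \
    .getOrCreate()

# spark.read is a DataFrameReader, so the call mirrors the SQLContext version.
df = spark.read.jdbc(url='jdbc:%s' % url, table='rental',
                     properties={'user': '******', 'password': '******'})
df.printSchema()
df.show()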
# Global temporary view is tied to a system preserved database `global_temp`
spark.sql("SELECT * FROM global_temp.people").show()

# Global temporary view is cross-session
spark.newSession().sql("SELECT * FROM global_temp.people").show()

# read from postgres
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row, DataFrameReader

os.environ['SPARK_CLASSPATH'] = "/home/david/Downloads/postgresql-42.2.1.jar"
sparkClassPath = os.getenv('SPARK_CLASSPATH')

# Populate configuration
conf = SparkConf()
conf.setAppName('application')
conf.set('spark.jars', 'file:%s' % sparkClassPath)
conf.set('spark.executor.extraClassPath', sparkClassPath)
conf.set('spark.driver.extraClassPath', sparkClassPath)

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

url = f'postgresql://localhost:{PG_PORT}/tse'
properties = {'user': PG_USER, 'password': PG_PWD}

df = DataFrameReader(sqlContext).jdbc(url='jdbc:%s' % url,
                                      table='"每日收盤行情(全部(不含權證、牛熊證))"',
                                      properties=properties)
df.printSchema()
df.show(truncate=False)
from pyspark import SparkContext, SparkConf
from pyspark.sql import DataFrameReader, SQLContext
import os

# sparkClassPath = os.getenv('SPARK_CLASSPATH', r'C:\tools\postgresql-42.2.5.jar')
# print(sparkClassPath)

# Populate configuration
conf = SparkConf()
# conf.setAppName('application')
# conf.set('spark.jars', 'file:%s' % sparkClassPath)
# conf.set('spark.executor.extraClassPath', sparkClassPath)
# conf.set('spark.driver.extraClassPath', sparkClassPath)

# Uncomment the line below and modify the IP address if you need to use a
# cluster on a different IP address
# conf.set('spark.master', 'spark://127.0.0.1:7077')

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

url = 'postgresql://192.168.175.3:5432/entrata44_dev'
properties = {'user': '******', 'password': '******'}

df = DataFrameReader(sqlContext).jdbc(url='jdbc:%s' % url,
                                      table='lease_interval_types',
                                      properties=properties)
df.show()
reload(sys)
sys.setdefaultencoding('utf8')

app = flask.Flask(__name__)
app.config["DEBUG"] = True

db = GraphDatabase("http://*****:*****@app.route('/averagevotemoviesofactor/<string:actor_name>', methods=['GET'])
def average_vote_movies_of_actor(actor_name):
    q = 'MATCH (a:Actor)-[r:ACTS_IN]->(m:Movie) WHERE a.name= "' + actor_name + '" RETURN m.imdbId'
    results = db.query(q, returns=(str))

    rdd = sc.parallelize(results)
    schema = StructType([
        StructField('imdbId', StringType(), True),
    ])
    df = sqlContext.createDataFrame(rdd, schema)

    dfj = df.join(dfpostgres, df.imdbId == dfpostgres.imdb_id)
def _layer_reader(df_reader: DataFrameReader, format_key: str,
                  path: Optional[str], **options: str) -> RasterFrameLayer:
    """ Loads the file of the given type at the given path."""
    df = df_reader.format(format_key).load(path, **options)
    return _convert_df(df)
spark = SparkSession \
    .builder \
    .appName("SparkPostgresqlApp") \
    .master("local[2]") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.6") \
    .config("spark.driver.extraClassPath", locationPath) \
    .getOrCreate()

spark.sparkContext.setLogLevel("OFF")
sc: SparkContext = spark.sparkContext
sqlContext = SQLContext(sc)

# Step 8-5-1: Reading the student data table from the PostgreSQL database.
dbURL: str = "jdbc:postgresql://localhost/pysparkbookdb"
props = {'user': '******', 'password': '******',
         'driver': 'org.postgresql.Driver'}

studentsDataFrame: DataFrame = DataFrameReader(sqlContext).jdbc(
    url=dbURL, table='studenttable', properties=props)
studentsDataFrame.show()

from pyspark.sql.functions import trim

studentsDataFrame = studentsDataFrame.select(
    trim(studentsDataFrame.studentid),
    trim(studentsDataFrame.name),
    studentsDataFrame.gender)
studentsDataFrame.show()

studentsDataFrame = studentsDataFrame \
    .withColumnRenamed('trim(studentid)', 'studentID') \
    .withColumnRenamed('trim(name)', 'Name') \
    .withColumnRenamed('gender', 'Gender')
studentsDataFrame.printSchema()
studentsDataFrame.show()