# Thanks to this post, this was solved: # https://stackoverflow.com/a/46360434/8510370 # # NOTE: Download the jdbc driver from https://jdbc.postgresql.org/download.html # and place it in your project or copy to all machines with environmental variables set sparkClassPath = os.getenv('SPARK_CLASSPATH', os.path.join(os.getcwd(), 'postgresql-42.2.2.jar')) conf = SparkConf() conf.setAppName('application') conf.set('spark.jars', 'file:%s' % sparkClassPath) conf.set('spark.executor.extraClassPath', sparkClassPath) conf.set('spark.driver.extraClassPath', sparkClassPath) # Uncomment line below and modify ip address if you need to use cluster on different IP address # conf.set('spark.master', 'spark://127.0.0.1:7077') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) url = 'jdbc:postgresql://127.0.0.1:5432/movielens' properties = {'user': '******', 'password': '******'} df = DataFrameReader(sqlContext).jdbc(url=url, table='movies', properties=properties) df.select("movieid", "title").show(n=2) movies_rdd = df.select("movieid", "title").rdd.map(tuple) print(movies_rdd.take(5))
host, master = sys.argv[1:] conf.setMaster(master) host = sys.argv[1] params = utils.connection_properties(host, db='owner') url = 'postgresql://{host}:{port}/{db}'.format(**params) df = DataFrameReader(sqlContext).jdbc(url='jdbc:{}'.format(url), table='assessor.owndat', properties=params, numPartitions=8) df = df.withColumn( 'adrconcat', concat_ws(' ', df.adrno, df.adrdir, df.adrstr, df.adrsuf, df.cityname, df.statecode, df.zip1)) func_clean_string = udf(clean_string, StringType()) df = df.withColumn('own1', func_clean_string('own1')) df = df.withColumn('adrconcat', func_clean_string('adrconcat')) uniq_own = df.select(df.own1.alias('uniq_own')).distinct() df = df.withColumn('key', lit(0)) uniq_own = uniq_own.withColumn('key', lit(0)) a = df.select(df.own1, df.adrconcat, df.parid, df.key).sample(False, .0001, 42) b = uniq_own.select(uniq_own.uniq_own, uniq_own.key).sample(False, .0001, 42) c = a.join(b, a.key == b.key, 'outer') own_score = udf(match_score, DoubleType()) c = c.withColumn('own_score', own_score('own1', 'uniq_own'))