Example #1
 def read_df(self, path):
     try:
         reader = DataFrameReader(self.sqlctx)
         result = reader.load(path)
     except Exception:
         # swallow any read error and signal failure by returning None
         result = None
     return result
Example #2
    def generate_pre_transform_specs_data_frame(self,
                                                spark_context=None,
                                                sql_context=None):

        data_frame_reader = DataFrameReader(sql_context)
        pre_transform_specs_data_frame = data_frame_reader.jdbc(
            DbUtil.get_java_db_connection_string(), 'pre_transform_specs')
        data = []
        for item in pre_transform_specs_data_frame.collect():
            spec = json.loads(item['pre_transform_spec'])
            data.append(json.dumps(spec))

        data_frame = sql_context.read.json(spark_context.parallelize(data))
        self.pre_transform_specs_data_frame = data_frame
Example #3
    def generate_transform_specs_data_frame(self, spark_context=None,
                                            sql_context=None):

        data_frame_reader = DataFrameReader(sql_context)
        transform_specs_data_frame = data_frame_reader.jdbc(
            DbUtil.get_java_db_connection_string(),
            'transform_specs'
        )
        data = []
        for item in transform_specs_data_frame.collect():
            spec = json.loads(item['transform_spec'])
            data.append(json.dumps(spec))

        data_frame = sql_context.read.json(spark_context.parallelize(data))
        self.transform_specs_data_frame = data_frame
Example #4
    def generate_pre_transform_specs_data_frame(self, spark_context=None,
                                                sql_context=None):

        data_frame_reader = DataFrameReader(sql_context)
        pre_transform_specs_data_frame = data_frame_reader.jdbc(
            self.get_connection_string(),
            'pre_transform_specs'
        )
        data = []
        for item in pre_transform_specs_data_frame.collect():
            spec = json.loads(item['pre_transform_spec'])
            data.append(json.dumps(spec))

        # jsonRDD is the pre-Spark-2.0 API; newer code uses sql_context.read.json
        data_frame = sql_context.jsonRDD(spark_context.parallelize(data))
        self.pre_transform_specs_data_frame = data_frame
Example #5
 def db_read(self, table_name, sqlContext):
     '''
         This function takes in a table name and reads from a database
     '''
     dataframe = DataFrameReader(sqlContext).jdbc(
         url=self.__url, table=table_name, properties=self.__properties)
     return dataframe
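# db_read above wraps DataFrameReader.jdbc; a minimal standalone sketch of the
# same kind of read is below.  The url, table name and credentials are
# placeholders, not values from the original source, and a PostgreSQL JDBC
# driver is assumed to be on the classpath.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("db-read-sketch").getOrCreate()
orders_df = spark.read.jdbc(
    url="jdbc:postgresql://localhost:5432/exampledb",
    table="orders",
    properties={"user": "example", "password": "example",
                "driver": "org.postgresql.Driver"})
orders_df.printSchema()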
Example #6
def method2(sql_context: SQLContext, database_URL: str,
            database_properties: dict):
    print('fetching jdbc dataframe...')
    # Create a DataFrameReader interface
    jdbc_df = DataFrameReader(sql_context).option("fetchSize", "5001")
    # Create a DataFrame object
    jdbc_df = jdbc_df.jdbc(
        url=database_URL,
        table='RATINGS',
        # column="SERVICE_ID",
        # lowerBound="0",
        # upperBound="4",
        # numPartitions=4,
        properties=database_properties)

    return jdbc_df
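# The commented-out arguments in method2 above enable a partitioned JDBC read.
# A sketch of that variant is below; the table, partition column and bounds are
# taken from the commented lines and are otherwise illustrative only.
def method2_partitioned(sql_context: SQLContext, database_URL: str,
                        database_properties: dict):
    # Spark issues one query per partition, splitting SERVICE_ID into
    # numPartitions ranges between lowerBound and upperBound.
    return DataFrameReader(sql_context).jdbc(
        url=database_URL,
        table='RATINGS',
        column='SERVICE_ID',
        lowerBound=0,
        upperBound=4,
        numPartitions=4,
        properties=database_properties)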
Example #7
def filename_to_object(filename, context):
    """
    Given a filename create a defoe.books.archive.Archive.  If an error
    arises during its creation this is caught and returned as a
    string.

    :param filename: filename
    :type filename: str or unicode
    :return: tuple of form (Archive, None) or (filename, error message),
    if there was an error creating Archive
    :rtype: tuple(defoe.books.archive.Archive | str or unicode, str or unicode)
    """

    with open(filename) as f:
        lines = f.readlines()
    # second line: host,port,database,user,driver,table
    fields = lines[1].strip().split(",")
    host = fields[0]
    port = fields[1]
    database = fields[2]
    user = fields[3]
    driver = fields[4]
    table = fields[5]
    sqlContext = SQLContext(context)
    url = 'postgresql://%s:%s/%s' % (host, port, database)
    properties = {'user': user, 'driver': driver}
    df = DataFrameReader(sqlContext).jdbc(url='jdbc:%s' % url,
                                          table=table,
                                          properties=properties)
    return df
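# For reference, filename_to_object above reads the connection details from the
# second line of the given file; a hypothetical configuration file (all values
# are placeholders, not from the original source) would look like:
#
#   # host,port,database,user,driver,table
#   localhost,5432,exampledb,example_user,org.postgresql.Driver,example_table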
Example #8
    def load_datasets(self):
        """
        Loads movielens dataset from a given location
        """
        reader = DataFrameReader(self.sqlctx)

        fields = [
            StructField('userId', IntegerType(), True),
            StructField('movieId', IntegerType(), True),
            StructField('rating', FloatType(), True),
            StructField('timestamp', StringType(), True),
        ]
        schema = StructType(fields)
        self.ratings = reader.csv(os.path.join(self.data_dir, 'ratings.csv'),
                                  schema=schema,
                                  header=True,
                                  mode="DROPMALFORMED")
Example #9
def load_delay_data(df_reader: DataFrameReader, spark: SparkSession,
                    mes_part_opplan, mes_part_info_dispatch, mes_part_info):
    df_reader.option(
        "dbtable",
        mes_part_opplan).load().createOrReplaceTempView("mes_part_opplan")
    df_reader.option("dbtable",
                     mes_part_info_dispatch).load().createOrReplaceTempView(
                         "mes_part_info_dispatch")
    df_reader.option(
        "dbtable",
        mes_part_info).load().createOrReplaceTempView("mes_part_info")

    df = spark.sql('''
          select c.MES_PART_INFO_ID,b.MES_PART_INFO_DISPATCH_ID,a.MES_PART_OPPLAN_ID,
           c.PART_NO,c.LOT_NO,c.FOPLAN_ID,c.TASK_QTY,c.MANUFACTURER,
           b.SCHEDULED_OPERATOR_TYPE,b.SCHEDULED_OPERATOR_NO,a.SCHEDULED_START_DATE,a.SCHEDULED_COMPLETION_DATE,
           a.START_DATE,a.END_DATE,a.OP_NO,a.OP_NAME,a.OP_DESCRIPTION,
           a.OPER_DEPART,a.ACTUAL_MADE_BY,b.COMPLETION_RECORD_CREATOR
          from mes_part_opplan a
          join mes_part_info_dispatch b 
            on a.MES_PART_INFO_ID=b.MES_PART_INFO_ID
            and a.MES_PART_OPPLAN_ID=b.MES_PART_OPPLAN_ID
          join mes_part_info c 
            on a.MES_PART_INFO_ID=c.MES_PART_INFO_ID
        ''')

    return df.filter(F.col("START_DATE").isNotNull())
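# load_delay_data above expects a DataFrameReader already configured for the
# JDBC source; a minimal sketch of building one is below.  The connection url,
# driver and credentials are placeholders, not values from the original source.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("mes-delay-sketch").getOrCreate()
df_reader = spark.read.format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/mesdb") \
    .option("driver", "org.postgresql.Driver") \
    .option("user", "example") \
    .option("password", "example")
# delay_df = load_delay_data(df_reader, spark, "mes_part_opplan",
#                            "mes_part_info_dispatch", "mes_part_info")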
Example #10
def detect_anomaly(df):
    #get distinct tickers in the micro-batch dataframe
    df.createOrReplaceTempView("prices")

    # get distribution information for all the stocks
    url = 'postgresql://{}:5432/{}'.format(databaseIP, databaseName)
    properties = {'user': databaseUser, 'password': databasePassword}
    distributions_df = DataFrameReader(sql_context).jdbc(
        url='jdbc:%s' % url,
        table='stock_distributions',
        properties=properties)
    distributions_df.createOrReplaceTempView("distributions")

    #calculate upper and lower limit for percent change
    anomalyFactor = 3
    distributions_df = distributions_df.withColumn(
        "upper_limit",
        distributions_df["mean"] + anomalyFactor * distributions_df["stdev"])
    distributions_df = distributions_df.withColumn(
        "lower_limit",
        distributions_df["mean"] - anomalyFactor * distributions_df["stdev"])

    #join the mini batch data frame that holds the prices and percent changes with the distributions table
    ta = df.alias('ta')
    tb = distributions_df.alias('tb')
    batch_join_anom = ta.join(tb, ta.code == tb.code, how="left")
    batch_join_anom = batch_join_anom.select([
        "ta.time", "ta.code", "percent_change", "upper_limit", "lower_limit",
        "stdev"
    ])
    batch_join_anom.show()

    # keep only the rows whose percent_change is below the lower_limit or above the upper_limit,
    # in other words, the anomalies
    batch_join_anom = batch_join_anom.filter(
        (batch_join_anom["percent_change"] < batch_join_anom["lower_limit"])
        | (batch_join_anom["percent_change"] > batch_join_anom["upper_limit"]))
    #calculate how many standard deviations away from the mean this anomaly was
    batch_join_anom = batch_join_anom.withColumn(
        "num_of_std_away",
        func.abs(batch_join_anom["percent_change"]) / batch_join_anom["stdev"])
    batch_join_anom.show()

    #prepare to push to database anomalies table
    batch_join_anom = batch_join_anom.select(
        ["ta.time", "ta.code", "percent_change", "num_of_std_away"])

    addToDB(batch_join_anom, "stock_anomalies")
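# addToDB is not shown in this snippet; a plausible sketch is below, assuming
# it appends the micro-batch to a PostgreSQL table over the same connection.
# The globals mirror those used above and the driver name is an assumption.
def addToDB(df, table_name):
    url = 'jdbc:postgresql://{}:5432/{}'.format(databaseIP, databaseName)
    properties = {'user': databaseUser, 'password': databasePassword,
                  'driver': 'org.postgresql.Driver'}
    # mode='append' keeps rows written by earlier micro-batches
    df.write.jdbc(url=url, table=table_name, mode='append',
                  properties=properties)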
Example #11
def get_df_from_psql(host=None,
                     port=None,
                     db=None,
                     table=None,
                     user=None,
                     password=None):
    host = 'localhost'
    port = '5432'
    db = 'testdb2'
    table = 'Orders'
    user = '******'
    password = '******'

    url = 'postgresql://{}:{}/{}'.format(host, port, db)
    properties = {
        'user': user,
        'password': password,
        "driver": "org.postgresql.Driver"
    }
    df = DataFrameReader(sqlContext).jdbc(url='jdbc:%s' % url,
                                          table=table,
                                          properties=properties)
    return df
Example #12
 def get_reader(self, spark):
     return DataFrameReader(spark)
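# For reference, with an active SparkSession the same reader is normally
# obtained through the spark.read property; get_reader above just constructs
# the DataFrameReader explicitly around the given session.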
Example #13
def spark_intersect2(mutation_table_name, regions_table_name, DB_CONF, jdbc_jar='postgresql-42.2.12.jar'):

    fs_db_dir  =os.getenv('MUTVIZ_FS_DB_FOLDER', "disabled")

    numPartitions = int(os.getenv('MUTVIZ_NUM_PARTITIONS', -1))
    memory = os.getenv('MUTVIZ_DRIVER_MEMORY', "50g")
    sparkDebug = os.getenv('MUTVIZ_SPARK_DEBUG', "false") == "true"
    print("USING "+str(numPartitions)+" PARTITIONS (-1:AUTO).")
    start_time = time.time()

    os.environ["SPARK_HOME"] = os.getenv('MUTVIZ_SPARK_HOME', "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    driver_class = "org.postgresql.Driver"

    cores = os.getenv('MUTVIZ_CORES', "*")

    print("#### SPARK CONFIGURATION ####")
    print("SPARK HOME: " + os.getenv('SPARK_HOME'))
    print("Using cores: "+cores)
    print("Using memory: "+memory)
    print("Using partitions: "+str(numPartitions))
    print("Debug enabled: " + str(sparkDebug))
    print("#############################")

    spark = SparkSession.builder \
        .master("local["+cores+"]") \
        .appName("Word Count") \
        .config("spark.jars", jdbc_jar) \
        .config("spark.driver.memory", memory) \
        .config("spark.driver.cores", cores) \
        .getOrCreate()

    sql_ctx = SQLContext(spark.sparkContext)
    sc = spark.sparkContext

    properties = {'user': DB_CONF["postgres_user"], 'password':DB_CONF["postgres_pw"], 'driver': driver_class}
    url = 'postgresql://'+DB_CONF["postgres_host"]+':'+DB_CONF["postgres_port"]+'/'+DB_CONF["postgres_db"]


    if fs_db_dir == 'disabled':
        mutations = DataFrameReader(sql_ctx).jdbc(
            url='jdbc:%s' % url, table=mutation_table_name, properties=properties
        )
    else:
        customSchema = StructType([
            StructField("donor_id", IntegerType(), False),
            StructField("tumor_type_id", IntegerType(), False),
            StructField("chrom", IntegerType(), False),
            StructField("position", IntegerType(), False),
            StructField("mutation_code_id", IntegerType(), False),
            StructField("trinucleotide_id_r", IntegerType(), False)]
        )

        mutations = spark.read.format("csv").option("header", "true").schema(customSchema).load(fs_db_dir + "/"+mutation_table_name)

    regions_df = DataFrameReader(sql_ctx).jdbc(
        url='jdbc:%s' % url, table=regions_table_name, properties=properties
    )

    regions = regions_df.collect()
    regions = sorted(regions, key=itemgetter('pos_start', 'pos_stop'))

    print("====> REGIONS SORTED AFTER (S) %s" % (time.time() - start_time))
    regions_broadcast = sc.broadcast(regions)
    print("====> REGIONS BROADCAST AFTER (S) %s" % (time.time() - start_time))


    def partitionWork(p):
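        # Merge-style scan over two sorted sequences: the mutations in this
        # partition (sorted by position below) and the broadcast regions
        # (already sorted by pos_start, pos_stop).  A single forward pass
        # matches each mutation to a covering region; the inner "look ahead"
        # loop checks later regions that also cover the position when the
        # current region lies on a different chromosome.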

        localMutations = list(p)
        matched = []

        if sparkDebug:
            print("====> PROCESSING PARTITION AFTER (S)  %s" % (time.time() - start_time))

        if localMutations:

            import copy
            localRegions = copy.deepcopy(regions_broadcast.value)

            if localRegions:
                sorted_mutations = sorted(localMutations, key=itemgetter('position'))
                sorted_regions = localRegions

                cur_reg_idx = 0
                cur_mut_idx = 0

                while( cur_mut_idx < len(sorted_mutations)  and cur_reg_idx < len(sorted_regions) ):

                    cur_reg = sorted_regions[cur_reg_idx]
                    cur_mut = sorted_mutations[cur_mut_idx]

                    if cur_mut["position"] < cur_reg["pos_start"]:
                        cur_mut_idx += 1
                    elif cur_mut["position"] <= cur_reg["pos_stop"]:
                        if cur_reg["chrom"] == cur_mut["chrom"]:
                            matched.append(cur_mut)
                        else:
                            # look ahead
                            next_region_index = cur_reg_idx + 1
                            while next_region_index < len(sorted_regions) and sorted_regions[next_region_index][
                                "pos_start"] <= cur_mut["position"]:
                                if sorted_regions[next_region_index]["chrom"] == cur_mut["chrom"] and \
                                        sorted_regions[next_region_index]["pos_stop"] >= cur_mut["position"]:
                                    matched.append(cur_mut)
                                next_region_index = next_region_index + 1

                        cur_mut_idx += 1
                    else:
                        cur_reg_idx += 1

        return matched

    res = mutations.rdd.mapPartitions(partitionWork).toDF().toPandas()


    print("Spark execution took %s seconds ---" % (time.time() - start_time))

    return res
Example #14
def spark_intersect(mutation_table_name, regions_table_name, DB_CONF, output_format, regions=None, jdbc_jar='postgresql-42.2.12.jar', groupby=None, useSQL=False, minCount=-1, tumorType=None, filter=None):

    # SQL VS MINE8: 388[1h] , 1507 (25min), 1018[bin=20], (1h,  no bins), 1101 (5 bins), 994 [100] - 952 [200] 916(ctcf) 941[41]
    # 590 ETS1
    #3h13 geco 4h37 genomic

    fs_db_dir  =os.getenv('MUTVIZ_FS_DB_FOLDER', "disabled")

    numPartitions = int(os.getenv('MUTVIZ_NUM_PARTITIONS', -1))
    memory = os.getenv('MUTVIZ_DRIVER_MEMORY', "50g")
    sparkDebug = os.getenv('MUTVIZ_SPARK_DEBUG', "false") == "true"
    print("USING "+str(numPartitions)+" PARTITIONS (-1:AUTO).")
    start_time = time.time()

    os.environ["SPARK_HOME"] = os.getenv('MUTVIZ_SPARK_HOME', "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    driver_class = "org.postgresql.Driver"

    cores = os.getenv('MUTVIZ_CORES', "*")

    print("#### SPARK CONFIGURATION ####")
    print("SPARK HOME: " + os.getenv('SPARK_HOME'))
    print("Using cores: "+cores)
    print("Using memory: "+memory)
    print("Using partitions: "+str(numPartitions))
    print("Debug enabled: " + str(sparkDebug))
    if tumorType:
        print("Tumor Type id: "+str(tumorType))
    if filter:
        print("Filter count: "+str(len(filter)))
    print("#############################")

    spark = SparkSession.builder \
        .master("local["+cores+"]") \
        .appName("Word Count") \
        .config("spark.jars", jdbc_jar) \
        .config("spark.driver.memory", memory) \
        .config("spark.driver.cores", cores) \
        .getOrCreate()

    sql_ctx = SQLContext(spark.sparkContext)
    sc = spark.sparkContext

    properties = {'user': DB_CONF["postgres_user"], 'password':DB_CONF["postgres_pw"], 'driver': driver_class}
    url = 'postgresql://'+DB_CONF["postgres_host"]+':'+DB_CONF["postgres_port"]+'/'+DB_CONF["postgres_db"]


    if fs_db_dir == 'disabled':
        mutations = DataFrameReader(sql_ctx).jdbc(
            url='jdbc:%s' % url, table=mutation_table_name, properties=properties
        )
    else:
        customSchema = StructType([
            StructField("donor_id", IntegerType(), False),
            StructField("tumor_type_id", IntegerType(), False),
            StructField("chrom", IntegerType(), False),
            StructField("position", IntegerType(), False),
            StructField("mutation_code_id", IntegerType(), False),
            StructField("trinucleotide_id_r", IntegerType(), False)]
        )

        mutations = spark.read.format("csv").option("header", "true").schema(customSchema).load(fs_db_dir + "/"+mutation_table_name)

    def filter1(x):
        return int(x["tumor_type_id"]) == int(tumorType)
    def filter2(x):
        return  int(x["donor_id"]) in filter

    if tumorType:
        mutations = mutations.rdd.filter(filter1)
        if filter is not None:
            mutations = mutations.filter(filter2)

        if mutations.isEmpty():
            return []
        else:
            mutations = mutations.toDF()



    regions_df = DataFrameReader(sql_ctx).jdbc(
        url='jdbc:%s' % url, table=regions_table_name, properties=properties
    )

    if sparkDebug:
        print("############ mutations ==> ", mutations.count())
        print("############ regions   ==>", regions_df.count())

    # new
    if useSQL :
        mutations.registerTempTable("mutations")
        regions_df.registerTempTable("regions")

        sql_res = spark.sql("SELECT m.tumor_type_id, m.trinucleotide_id_r, count(*) from mutations as m, regions as r WHERE m.chrom=r.chrom AND m.position >= r.pos_start AND m.position <= r.pos_stop GROUP BY m.tumor_type_id, m.trinucleotide_id_r")

        res = sql_res.rdd.map(lambda r: [r["tumor_type_id"], r["trinucleotide_id_r"], r["count(1)"]]).collect()
        print("Spark execution took %s seconds ---" % (time.time() - start_time))
    #print(sql_res.collect())

    else:

        # regions = regions_df.collect()
        # print("====> REGIONS COLLECTED AFTER (S) %s" % (time.time() - start_time))
        # rb = defaultdict(list)
        # for v in regions: rb[v["chrom"]].append(v)
        #
        # for c in rb:
        #     rb[c] = sorted(rb[c], key=itemgetter('pos_start', 'pos_stop'))

        regions = regions_df.collect()
        regions = sorted(regions, key=itemgetter('pos_start', 'pos_stop'))

        print("====> REGIONS SORTED AFTER (S) %s" % (time.time() - start_time))
        regions_broadcast = sc.broadcast(regions)
        print("====> REGIONS BROADCAST AFTER (S) %s" % (time.time() - start_time))


        def partitionWork(p):

            localMutations = list(p)
            matched = []

            if sparkDebug:
                print("====> PROCESSING PARTITION AFTER (S)  %s" % (time.time() - start_time))

            if localMutations:

                import copy
                localRegions = copy.deepcopy(regions_broadcast.value)

                if localRegions:
                    sorted_mutations = sorted(localMutations, key=itemgetter('position'))
                    sorted_regions = localRegions

                    cur_reg_idx = 0
                    cur_mut_idx = 0

                    while( cur_mut_idx < len(sorted_mutations)  and cur_reg_idx < len(sorted_regions) ):

                        cur_reg = sorted_regions[cur_reg_idx]
                        cur_mut = sorted_mutations[cur_mut_idx]

                        if cur_mut["position"] < cur_reg["pos_start"]:
                            cur_mut_idx += 1
                        elif cur_mut["position"] <= cur_reg["pos_stop"]:
                            if cur_reg["chrom"] == cur_mut["chrom"]:
                                matched.append(cur_mut)
                            else:
                                # look ahead
                                next_region_index = cur_reg_idx + 1
                                while next_region_index < len(sorted_regions) and sorted_regions[next_region_index][
                                    "pos_start"] <= cur_mut["position"]:
                                    if sorted_regions[next_region_index]["chrom"] == cur_mut["chrom"] and \
                                            sorted_regions[next_region_index]["pos_stop"] >= cur_mut["position"]:
                                        matched.append(cur_mut)
                                    next_region_index = next_region_index + 1

                            cur_mut_idx += 1
                        else:
                            cur_reg_idx += 1

            return matched


        #if numPartitions > 0:
        #    res = mutations.rdd.groupBy(lambda e: e["chrom"],numPartitions=numPartitions).flatMap(partitionWork)
        #else:
        #    res = mutations.rdd.groupBy(lambda e: e["chrom"]).flatMap(partitionWork)

        if sparkDebug:
            print("#### NUM PARTITIONS: ", mutations.rdd.getNumPartitions)

        res = mutations.rdd.mapPartitions(partitionWork)


        if sparkDebug:
            print("############ results ==> ", res.count())

        # Grouping
        #todo: if empty
        if groupby:
            if minCount==-1:
                res = res.toDF().groupBy(groupby).count().rdd.map(output_format)
            else:
                res_df = res.toDF().groupBy(groupby).count()
                res = res_df.filter(res_df["count"]>minCount).rdd.map(output_format)

            if sparkDebug:
                print("############ results after grouping ==> ", res.count())

    res = res.collect()
    sc.stop()

    print("Spark execution took %s seconds ---" % (time.time() - start_time))

    return res
Example #15
# Thanks to this post, this was solved:
# https://stackoverflow.com/a/46360434/8510370
#
# NOTE: Download the jdbc driver from https://jdbc.postgresql.org/download.html
#       and place it in your project or copy to all machines with environmental variables set

sparkClassPath = os.getenv('SPARK_CLASSPATH',
                           os.path.join(os.getcwd(), 'postgresql-42.2.2.jar'))

conf = SparkConf()
conf.setAppName('application')
conf.set('spark.jars', 'file:%s' % sparkClassPath)
conf.set('spark.executor.extraClassPath', sparkClassPath)
conf.set('spark.driver.extraClassPath', sparkClassPath)
# Uncomment line below and modify ip address if you need to use cluster on different IP address
# conf.set('spark.master', 'spark://127.0.0.1:7077')

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
url = 'jdbc:postgresql://127.0.0.1:5432/movielens'
properties = {'user': '******', 'password': '******'}

df = DataFrameReader(sqlContext).jdbc(url=url,
                                      table='movies',
                                      properties=properties)

df.select("movieid", "title").show(n=2)

movies_rdd = df.select("movieid", "title").rdd.map(tuple)
print(movies_rdd.take(5))
Example #16
    ]
    return sqlContext.createDataFrame(sc.emptyRDD(), StructType(fields))


if __name__ == '__main__':
    """
        host: db host name
        master: master connection string (eg. spark://master_ip:7077)
    """
    host, master = sys.argv[1:]
    conf.setMaster(master)
    host = sys.argv[1]
    params = utils.connection_properties(host, db='owner')
    url = 'postgresql://{host}:{port}/{db}'.format(**params)
    df = DataFrameReader(sqlContext).jdbc(url='jdbc:{}'.format(url),
                                          table='assessor.owndat',
                                          properties=params,
                                          numPartitions=8)

    df = df.withColumn(
        'adrconcat',
        concat_ws(' ', df.adrno, df.adrdir, df.adrstr, df.adrsuf, df.cityname,
                  df.statecode, df.zip1))

    func_clean_string = udf(clean_string, StringType())
    df = df.withColumn('own1', func_clean_string('own1'))
    df = df.withColumn('adrconcat', func_clean_string('adrconcat'))
    uniq_own = df.select(df.own1.alias('uniq_own')).distinct()

    df = df.withColumn('key', lit(0))
    uniq_own = uniq_own.withColumn('key', lit(0))
Example #17
            },
            sort_keys=False)
    except BaseException as e:
        return None


try:
    data_file = "airports.dat"
    data_file_url = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"
    download_file(data_file_url, data_file)
    kafka_url = "172.18.0.100:9092"
    kafka_topic = "airports"

    sc = SparkContext("local", "example-of-processing-data")
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)

    airports = DataFrameReader(sqlContext).csv(data_file)

    producer = KafkaProducer(bootstrap_servers=kafka_url)
    #for index in range(1, airports.count()):
    #    print(to_json(airports.take(index)[0]))
    for each in airports.collect():
        # logging.debug(as_json(each))
        producer.send(kafka_topic, as_json(each))

finally:
    producer.close()
    sc.stop()
    shutil.os.remove(data_file)
Example #18
def _aliased_reader(df_reader: DataFrameReader, format_key: str,
                    path: Optional[str], **options: str) -> DataFrame:
    """ Loads the file of the given type at the given path."""
    return df_reader.format(format_key).load(path, **options)
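# A minimal usage sketch for _aliased_reader; the SparkSession, file path and
# header option below are placeholders, not from the original source.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("aliased-reader-sketch").getOrCreate()
csv_df = _aliased_reader(spark.read, "csv", "/tmp/example.csv", header="true")
csv_df.show()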
Example #19
from pyspark import SparkContext, SparkConf
from pyspark.sql import DataFrameReader, SQLContext
import os

sparkClassPath = os.getenv('SPARK_CLASSPATH', '/opt/spark/jars/postgresql-42.2.14.jar')

# Populate configuration
conf = SparkConf()
conf.setAppName('application')
conf.set('spark.jars', 'file:%s' % sparkClassPath)
conf.set('spark.executor.extraClassPath', sparkClassPath)
conf.set('spark.driver.extraClassPath', sparkClassPath)
# Uncomment line below and modify ip address if you need to use cluster on different IP address
# conf.set('spark.master', 'spark://127.0.0.1:7077')

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

url = 'postgresql://slave:5432/testdb1'
properties = {'user':'******', 'password':'******'}

df = DataFrameReader(sqlContext).jdbc(url='jdbc:%s' % url, table='rental', properties=properties)#.schema('incertae')

df.printSchema()
df.show()
Example #20
# Global temporary view is tied to a system preserved database `global_temp`
spark.sql("SELECT * FROM global_temp.people").show()

# Global temporary view is cross-session
spark.newSession().sql("SELECT * FROM global_temp.people").show()
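# The two queries above assume a global temporary view named "people" was
# registered earlier in the application; a minimal sketch of that step, with a
# throwaway DataFrame standing in for the real data:
people_df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
people_df.createOrReplaceGlobalTempView("people")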

# read from postgres
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row, DataFrameReader
os.environ['SPARK_CLASSPATH'] = "/home/david/Downloads/postgresql-42.2.1.jar"
sparkClassPath = os.getenv('SPARK_CLASSPATH')

# Populate configuration
conf = SparkConf()
conf.setAppName('application')
conf.set('spark.jars', 'file:%s' % sparkClassPath)
conf.set('spark.executor.extraClassPath', sparkClassPath)
conf.set('spark.driver.extraClassPath', sparkClassPath)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
url = f'postgresql://localhost:{PG_PORT}/tse'
properties = {'user': PG_USER, 'password': PG_PWD}

df = DataFrameReader(sqlContext).jdbc(url='jdbc:%s' % url,
                                      table='"每日收盤行情(全部(不含權證、牛熊證))"',
                                      properties=properties)

df.printSchema()
df.show(truncate=False)
Example #21
from pyspark import SparkContext, SparkConf
from pyspark.sql import DataFrameReader, SQLContext
import os

# sparkClassPath = os.getenv('SPARK_CLASSPATH', r'C:\tools\postgresql-42.2.5.jar')
# print( sparkClassPath )
# Populate configuration

conf = SparkConf()
# conf.setAppName('application')
# conf.set('spark.jars', 'file:%s' % sparkClassPath)
# conf.set('spark.executor.extraClassPath', sparkClassPath)
# conf.set('spark.driver.extraClassPath', sparkClassPath)
# Uncomment line below and modify ip address if you need to use cluster on different IP address
# conf.set('spark.master', 'spark://127.0.0.1:7077')

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

url = 'postgresql://192.168.175.3:5432/entrata44_dev'
properties = {'user':'******', 'password':'******'}

df = DataFrameReader(sqlContext).jdbc(url='jdbc:%s' % url, table='lease_interval_types', properties=properties)
df.show()
Example #22
reload(sys)
sys.setdefaultencoding('utf8')

app = flask.Flask(__name__)
app.config["DEBUG"] = True

db = GraphDatabase("http://*****:*****")


@app.route('/averagevotemoviesofactor/<string:actor_name>', methods=['GET'])
def average_vote_movies_of_actor(actor_name):
    q = 'MATCH (a:Actor)-[r:ACTS_IN]->(m:Movie) WHERE a.name= "' + actor_name + '" RETURN m.imdbId'
    results = db.query(q, returns=(str))
    rdd = sc.parallelize(results)

    schema = StructType([
        StructField('imdbId', StringType(), True),
    ])

    df = sqlContext.createDataFrame(rdd, schema)
    dfj = df.join(dfpostgres, df.imdbId == dfpostgres.imdb_id)
Example #23
def _layer_reader(df_reader: DataFrameReader, format_key: str,
                  path: Optional[str], **options: str) -> RasterFrameLayer:
    """ Loads the file of the given type at the given path."""
    df = df_reader.format(format_key).load(path, **options)
    return _convert_df(df)
Example #24
    spark = SparkSession \
            .builder\
            .appName("SparkPostgresqlApp") \
            .master("local[2]") \
            .config("spark.jars.packages",  "org.postgresql:postgresql:42.2.6" ) \
            .config("spark.driver.extraClassPath",  locationPath ) \
            .getOrCreate()        
    spark.sparkContext.setLogLevel("OFF")
    sc : SparkContext = spark.sparkContext
    sqlContext = SQLContext(sc)
    
    # Step 8-5-1: Reading the student data table from the PostgreSQL database.

    dbURL : str="jdbc:postgresql://localhost/pysparkbookdb" 
    props = {'user': '******', 'password': '******', 'driver': 'org.postgresql.Driver'}
  
    studentsDataFrame :DataFrame= DataFrameReader(sqlContext).jdbc( \
                url=dbURL, table='studenttable', properties=props   )
  
    
    studentsDataFrame.show()
    
    from pyspark.sql.functions import trim
    studentsDataFrame :DataFrame= studentsDataFrame.select(trim(studentsDataFrame.studentid),trim(studentsDataFrame.name),studentsDataFrame.gender)
    
    studentsDataFrame.show()
    
    studentsDataFrame :DataFrame= studentsDataFrame.withColumnRenamed('trim(studentid)', 'studentID').withColumnRenamed('trim(name)','Name').withColumnRenamed('gender', 'Gender')
    
    studentsDataFrame.printSchema()
    
    studentsDataFrame.show()