Example No. 1
def ch9_sql():
    # Import Spark SQL
    from pyspark.sql import HiveContext, Row
    # Or if you can't include the hive requirements 
    from pyspark.sql import SQLContext, Row

    hiveCtx = HiveContext(sc)

    input_file = hiveCtx.read.json("testweet.json")
    # Register the input DataFrame as a temporary table
    input_file.registerTempTable("tweets")
    # Select tweets based on the retweetCount
    topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM
      tweets ORDER BY retweetCount LIMIT 10""")

    topTweetText = topTweets.map(lambda row: row.text)  
    topTweetText.collect()

    topTweets.schema
    hiveCtx.cacheTable("tweets")
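
# The function above uses the Spark 1.x HiveContext API. A hedged sketch of the same
# flow on Spark 2.x+ (assuming a SparkSession named `spark` and the same
# "testweet.json" file); this is an illustration, not part of the original example:
def ch9_sql_sparksession(spark):
    tweets = spark.read.json("testweet.json")        # read.json returns a DataFrame
    tweets.createOrReplaceTempView("tweets")         # replaces registerTempTable
    topTweets = spark.sql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    topTweetText = [row.text for row in topTweets.collect()]  # DataFrame.map is gone in 2.x
    spark.catalog.cacheTable("tweets")               # replaces hiveCtx.cacheTable
    return topTweetText
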
def query12_input(query_name, conf=None, output_persist=False):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)

#    web_sales_sql = "select * from web_sales"
#    web_sales = sqlContext.sql(web_sales_sql)
#    web_sales.persist()
#    web_sales.registerAsTable("web_sales")
#    item_sql = "select * from item"
#    item = sqlContext.sql(item_sql)
#    item.persist()
#    item.registerAsTable("item")
#    date_dim_sql = "select * from date_dim"
#    date_dim = sqlContext.sql(date_dim_sql)
#    date_dim.persist()
#    date_dim.registerAsTable("date_dim")
    sqlContext.cacheTable("web_sales")
    sqlContext.cacheTable("item")
    sqlContext.cacheTable("date_dim")

    # discard the first query run (warm-up)
    output = execute_sql(query_name, sqlContext, output_persist)
    # check the re-run statistics
    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe()
    output['describe'].show()

    sc.stop()
    return output
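
# query12_input() above calls an execute_sql() helper (and assumes SparkContext /
# HiveContext imports) that are not part of this snippet. A purely hypothetical sketch
# of what such a helper could look like; the query path, timing field name, and the
# forced count are assumptions, not the original code:
def execute_sql_sketch(query_name, sqlContext, output_persist=False):
    import time
    with open("queries/%s.sql" % query_name) as f:   # assumed location of the query text
        sql_text = f.read()
    start = time.time()
    df = sqlContext.sql(sql_text)
    if output_persist:
        df.persist()
    df.count()                                       # action to force evaluation
    return {'output': df, 'elapsed': time.time() - start}
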
# Loading order_items table
order_items = sqlContext.sql(
    """select * from {0}.order_items""".format(DB_NAME))
order_items.registerTempTable('order_items')

# Loading products table
products = sqlContext.sql("""select * from {0}.products""".format(DB_NAME))
products.registerTempTable('products')

# Joining categories, products, order_items and orders tables
cat_prod_ord_items = sqlContext.sql(
    """select c.*, o.order_customer_id as customer_id, p.product_id, oi.*
       from categories c
       join products p on c.category_id = p.product_category_id
       join order_items oi on p.product_id = oi.order_item_product_id
       join orders o on oi.order_item_order_id = o.order_id"""
)
cat_prod_ord_items.registerTempTable('cat_prod_ord_items')

sqlContext.cacheTable('cat_prod_ord_items')

#### Category wise sales per day ####
category_wise_sales1 = sqlContext.sql(
    """select category_id, category_name,
              category_department_id as department_id,
              round(sum(order_item_subtotal), 2) as total_sales
       from cat_prod_ord_items
       group by category_id, category_name, category_department_id"""
)
category_wise_sales1.registerTempTable('category_wise_sales1')

category_wise_sales = sqlContext.sql(
    """select c.category_id, c.category_name, d.department_name, c.total_sales
       from category_wise_sales1 c
       join departments d on c.department_id = d.department_id"""
).withColumn(
    "txn_date",
    from_unixtime(unix_timestamp() - (86400 * lit(date_difference_in_days)),
                  'yyyy-MM-dd'))
category_wise_sales.registerTempTable('category_wise_sales')
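
# The order_items / category_wise_sales snippet above assumes setup that is not shown:
# a HiveContext named sqlContext, a DB_NAME string, a date_difference_in_days value, and
# the column functions it uses. A hedged sketch of that assumed preamble (names and
# values here are hypothetical):
# from pyspark import SparkContext
# from pyspark.sql import HiveContext
# from pyspark.sql.functions import from_unixtime, unix_timestamp, lit
# sc = SparkContext(appName="category_wise_sales")
# sqlContext = HiveContext(sc)
# DB_NAME = "retail_db"               # hypothetical database name
# date_difference_in_days = 1         # hypothetical offset used to derive txn_date
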
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row

conf = SparkConf().setAppName("spark_sql_cache_table_extend")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

dataRDD = sc.textFile(
    "/user/hdfs/rawlog/app_weibomobile03x4ts1kl_mwb_interface/"
).map(lambda line: line.split(",")).filter(lambda words: len(words) >= 3).map(
    lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))

sourceRDD = hc.inferSchema(dataRDD)

sourceRDD.registerAsTable("source")

hc.cacheTable("source")

hc.sql("select count(*) from source").collect()

hc.sql("select col2, max(col3) from source group by col2").collect()

hc.sql("select col3, min(col2) from source group by col3").collect()

# hc.uncacheTable("source")

sc.stop()
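
# inferSchema() and registerAsTable() above are Spark 1.x APIs that were removed in
# Spark 2.x. A hedged sketch of the equivalent steps with a SparkSession, given the same
# Row-based dataRDD as above; this is an illustration, not the original code:
from pyspark.sql import SparkSession

def cache_source_with_sparksession(dataRDD):
    spark = SparkSession.builder.appName("spark_sql_cache_table_extend_2x").getOrCreate()
    source_df = spark.createDataFrame(dataRDD)          # replaces hc.inferSchema(dataRDD)
    source_df.createOrReplaceTempView("source")         # replaces registerAsTable("source")
    spark.catalog.cacheTable("source")                  # caching is still lazy
    # the first action that scans the table materializes the cache
    return spark.sql("select count(*) from source").collect()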
Example No. 5
mytable.registerTempTable("temp_mytable")
"""


def convert(val):
    return val.upper()

hc.registerFunction("temp_convert", convert)

convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")

convertRDD.registerAsTable("temp_mytable")


hc.cacheTable("temp_mytable")


def printRows(rows):
    for row in rows:
        print row

datas = hc.sql("select * from temp_mytable").collect()

printRows(datas)

datas = hc.sql("select col1 from temp_mytable").collect()

printRows(datas)

# hc.uncacheTable("temp_mytable")
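
# registerFunction() and registerAsTable() in this example are the old HiveContext API.
# A hedged sketch of the same UDF flow on Spark 2.3+, assuming a SparkSession named
# `spark` and an existing temp view "temp_source" (an illustration, not the original):
def register_convert_udf_sparksession(spark):
    from pyspark.sql.types import StringType
    spark.udf.register("temp_convert", convert, StringType())   # replaces hc.registerFunction
    converted = spark.sql(
        "select temp_convert(col1) as col1, col2, col3 from temp_source")
    converted.createOrReplaceTempView("temp_mytable")            # replaces registerAsTable
    spark.catalog.cacheTable("temp_mytable")
    return converted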
Example No. 6
sqlContext.sql('set spark.sql.shuffle.partitions=%i' % numPartitions)


#comments = sqlContext.read.json('data/test/*/')
#comments = sqlContext.read.json('data/micro_fake.json')
comments = sqlContext.read.json('s3n://%s:%s@boazreddit/micro_fake.json' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@boazreddit/test/*/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@boazreddit/comments/2007/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@boazreddit/comments/200*/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@boazreddit/comments/*/*' % (acc, sec))

polcomments = comments.filter(comments.subreddit=='politics')

polcomments2 = polcomments.repartition(numPartitions)
polcomments2.registerTempTable('comments')
sqlContext.cacheTable('comments')
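
# cacheTable() is lazy: nothing is cached until an action scans the table. Running a
# cheap count here (an addition, not part of the original snippet) materializes the
# cache before the heavier GROUP BY query below.
sqlContext.sql('SELECT COUNT(*) FROM comments').collect()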

# Removed when filtering to single subreddit
# COLLECT_LIST(subreddit) AS subreddits,
# COUNT(DISTINCT(subreddit)) AS total_subreddits,

user_pivot = sqlContext.sql('''SELECT
                            author,
                            MIN(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS first_post_datetime,
                            MAX(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS last_post_datetime,
                            COLLECT_LIST(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS post_datetimes,
                            COLLECT_LIST(id) AS post_ids,
                            COUNT(*) AS total_posts
                       FROM comments
                       GROUP BY author''')
#user_pivot2 = user_pivot.repartition(numPartitions)
def sql_hive_context_example(spark):
    
    # create hive context object.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable 
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1,10,2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1,10,2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable

    # newSession

    # registerFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.register` instead
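    # A hedged sketch of the replacement API (assumes Spark >= 2.3; the UDF name,
    # lambda, and IntegerType import below are illustrative only):
    #   from pyspark.sql.types import IntegerType
    #   spark.udf.register("plus_one", lambda x: x + 1, IntegerType())
    #   spark.sql("select plus_one(id) from range(5)").show()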

    # registerJavaFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    # Exception: An error occurred while calling o26.refreshTable:
    # Method refreshTable([class java.lang.String]) does not exist
    
    print("Finish running HiveContext API")
Example No. 10
from pyspark.sql.types import Row
import random

conf = SparkConf().setAppName("spark_sql_cache_table")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

dataRDD = sc.textFile("hdfs://dip.cdh5.dev:8020/user/yurun/datas").map(
    lambda line: line.split("\t")).map(
        lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))

sourceRDD = hc.inferSchema(dataRDD)

sourceRDD.registerAsTable("source")

cacheRDD = hc.sql("select * from source where col1 = 'col1_50'")

cacheRDD.registerAsTable("cacheTable")

hc.cacheTable("cacheTable")

hc.sql("select col2, max(col3) from cacheTable group by col2").collect()

hc.sql("select col3, min(col2) from cacheTable group by col3").collect()

# hc.uncacheTable("cacheTable")

sc.stop()
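
# Design note (added): registering the filtered result as "cacheTable" and then calling
# hc.cacheTable() caches only that filtered subset, not the whole "source" table. A
# hedged DataFrame-level alternative that skips the extra temp table would be:
# cacheRDD.cache()      # mark the filtered DataFrame for caching
# cacheRDD.count()      # run an action so the cache is actually materialized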
Example No. 11
sc = SparkContext(conf=conf)

# Create the SQL context
hiveCtx = HiveContext(sc)

# Use Spark SQL to read a JSON file
rows = hiveCtx.jsonFile('file:///usr/local/test_data/json')
rows.registerTempTable('rows')
result = hiveCtx.sql("select * from rows")
result.first()
result_data = result.map(lambda x: x.data)  # extract the data field
result_data.collect()
result.printSchema()  # print the schema

# Cache the data
hiveCtx.cacheTable('rows')

# Read data from the Hive database
score_data = hiveCtx.sql('select name,score from testdb.score')
score = score_data.map(lambda x: x[1])
score.collect()

# Read a Parquet file
parquet_data = hiveCtx.parquetFile('hdfs://192.168.0.104:9000/users')
parquet_data.first()
gender = parquet_data.map(lambda x: x.gender)
gender.collect()
parquet_data.registerTempTable('users')
male_data = hiveCtx.sql("select * from users where gender='male'")
male_data.collect()
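
# jsonFile() and parquetFile() above are old Spark 1.x shortcuts. A hedged sketch of the
# reader-based equivalents (same paths, same hiveCtx) would be:
# rows = hiveCtx.read.json('file:///usr/local/test_data/json')
# parquet_data = hiveCtx.read.parquet('hdfs://192.168.0.104:9000/users')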
Example No. 12
from pyspark.sql import HiveContext
from pyspark.sql.types import Row
import random

conf = SparkConf().setAppName("spark_sql_cache_table")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

dataRDD = sc.textFile("hdfs://dip.cdh5.dev:8020/user/yurun/datas").map(lambda line: line.split(
    "\t")).map(lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))

sourceRDD = hc.inferSchema(dataRDD)

sourceRDD.registerAsTable("source")

cacheRDD = hc.sql("select * from source where col1 = 'col1_50'")

cacheRDD.registerAsTable("cacheTable")

hc.cacheTable("cacheTable")

hc.sql("select col2, max(col3) from cacheTable group by col2").collect()

hc.sql("select col3, min(col2) from cacheTable group by col3").collect()

# hc.uncacheTable("cacheTable")

sc.stop()
Example No. 13
def main():
    """Run the query and print the statistics."""

    run = parse_cmd_line_arg()

    # Clear cache
    map(clear_cache, hosts)
    map(clear_local_dir, hosts)

    name = 'CS-838-Assignment2-Question2'
    sc = SparkContext(conf=get_conf(name))
    hc = HiveContext(sc)

    hc.sql('use tpcds_text_db_1_50')

    query12 = """
        select  i_item_desc
              ,i_category
              ,i_class
              ,i_current_price
              ,i_item_id
              ,sum(ws_ext_sales_price) as itemrevenue
              ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
                  (partition by i_class) as revenueratio
        from
                web_sales
                ,item
                ,date_dim
        where
                web_sales.ws_item_sk = item.i_item_sk
                and item.i_category in ('Jewelry', 'Sports', 'Books')
                and web_sales.ws_sold_date_sk = date_dim.d_date_sk
                and date_dim.d_date between '2001-01-12' and '2001-02-11'
        group by
                i_item_id
                ,i_item_desc
                ,i_category
                ,i_class
                ,i_current_price
        order by
                i_category
                ,i_class
                ,i_item_id
                ,i_item_desc
                ,revenueratio
        limit 100
        """

    query54 = """
        with my_customers as (
         select  c_customer_sk
                , c_current_addr_sk
         from
                ( select cs_sold_date_sk sold_date_sk,
                         cs_bill_customer_sk customer_sk,
                         cs_item_sk item_sk
                  from   catalog_sales
                  union all
                  select ws_sold_date_sk sold_date_sk,
                         ws_bill_customer_sk customer_sk,
                         ws_item_sk item_sk
                  from   web_sales
                 ) cs_or_ws_sales,
                 item,
                 date_dim,
                 customer
         where   sold_date_sk = d_date_sk
                 and item_sk = i_item_sk
                 and i_category = 'Jewelry'
                 and i_class = 'football'
                 and c_customer_sk = cs_or_ws_sales.customer_sk
                 and d_moy = 3
                 and d_year = 2000
                 group by  c_customer_sk
                , c_current_addr_sk
         )
         , my_revenue as (
         select c_customer_sk,
                sum(ss_ext_sales_price) as revenue
         from   my_customers,
                store_sales,
                customer_address,
                store,
                date_dim
         where  c_current_addr_sk = ca_address_sk
                and ca_county = s_county
                and ca_state = s_state
                and ss_sold_date_sk = d_date_sk
                and c_customer_sk = ss_customer_sk
                and d_month_seq between (1203)
                                   and  (1205)
         group by c_customer_sk
         )
         , segments as
         (select cast((revenue/50) as int) as segment
          from   my_revenue
         )
          select  segment, count(*) as num_customers, segment*50 as segment_base
         from segments
         group by segment
         order by segment, num_customers
         limit 100
         """

    # cache runs
    if run == 2 or run == 3:
        # tables in query12 used for collecting stats
        hc.cacheTable('web_sales')
        hc.cacheTable('item')
        hc.cacheTable('date_dim')

        # to circumvent lazy computation and force cache, we run a query
        # that involves the above cached tables
        if run == 2:
            # we will avoid running query12 now since we want to run
            # it below and collect stats
            # Instead, we run query54 which involves all the above 3
            # cached tables
            df = hc.sql(query54)

        # to force the caching of the outputRDD
        elif run == 3:
            # running the same query used to collect stats: query12
            # since we want to cache the output
            df = hc.sql(query12)
            df.cache()

        df.show()
        time.sleep(120)

    # record stats before starting
    nw_before = map(get_network_bytes, hosts)
    st_before = map(get_storage_bytes, hosts)
    time_before = time.time()

    # actually run the query for collecting statistics
    hc.sql(query12).show()

    # record stats after completion
    time_after = time.time()
    nw_after = map(get_network_bytes, hosts)
    st_after = map(get_storage_bytes, hosts)

    # calculate the difference in stats
    nw_read_hosti = 0
    nw_write_hosti = 0
    st_read_hosti = 0
    st_write_hosti = 0
    for i in range(len(hosts)):
        nw_read_hosti += nw_after[i][0] - nw_before[i][0]
        nw_write_hosti += nw_after[i][1] - nw_before[i][1]
        st_read_hosti += st_after[i][0] - st_before[i][0]
        st_write_hosti += st_after[i][1] - st_before[i][1]

    # output the stats
    print time_after - time_before
    print bytes_to_mb(nw_read_hosti)
    print bytes_to_mb(nw_write_hosti)
    print bytes_to_mb(st_read_hosti)
    print bytes_to_mb(st_write_hosti)

    sc.stop()
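
# main() above depends on helpers that are not shown in this snippet (hosts, get_conf,
# clear_cache, clear_local_dir, get_network_bytes, get_storage_bytes, bytes_to_mb,
# parse_cmd_line_arg). Purely hypothetical sketches of two of them, only to show the
# shapes main() expects:
def get_conf_sketch(name):
    # assumed: a SparkConf carrying the application name plus any cluster settings
    from pyspark import SparkConf
    return SparkConf().setAppName(name)

def bytes_to_mb_sketch(num_bytes):
    # assumed: plain bytes-to-megabytes conversion used when printing the stats
    return num_bytes / (1024.0 * 1024.0)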
Example No. 14
class SparkEngine(object):
    def __init__(self, sc, debug=False):
        self.export_path = os.environ['COOPERHEWITT_ROOT'] + "/export/"
        self.sc = sc
        # hive requires writable permissions: ~/ephemeral-hdfs/bin/hadoop fs -chmod 777 /tmp/hive
        self.hive_cxt = HiveContext(sc)
        self.sql_cxt  = SQLContext(sc)
        if debug:
            print "{0}\n{1}\n{2}\n".format(sc.master, self.hive_cxt, self.sql_cxt)
            print sc._conf.getAll()
        # TBD: add a destructor to unpersist the cached tables

### functionality to query and create tables
    def _create_df_table(self, schema, frame, name):
        if schema: df = self.hive_cxt.createDataFrame(frame, schema=schema)
        else: df = self.hive_cxt.createDataFrame(frame)
        df.printSchema()
        df.registerTempTable(name)
        self.hive_cxt.cacheTable(name)
        return df

    def _query_temporal_data(self):
        # step 1: create the main temporal table
        # n_obs: the first join yields, for each pen entry, one row per matching
        # location entry (the count depends on the time period)
        samples_temporal_tb = self.hive_cxt.sql("""
            SELECT  s.refers_to_object_id, created, visit_raw,
                    room_floor, room_id, room_name,
                    spot_id, spot_name, spot_description,
                    room_count_objects, room_count_spots, spot_count_objects,
                    abs(datediff(
                        from_utc_timestamp(from_unixtime(created,   "yyyy-MM-dd"), 'US/Eastern'),
                        from_utc_timestamp(from_unixtime(visit_raw, "yyyy-MM-dd"), 'US/Eastern')
                    )) as delta
            FROM samples s
            JOIN temporal t
            ON s.refers_to_object_id = t.refers_to_object_id
            ORDER by s.refers_to_object_id, created, delta
        """)
        samples_temporal_tb.registerTempTable('samplestemporal')
        self.hive_cxt.cacheTable('samplestemporal')
        return samples_temporal_tb

    def _minimize_query(self):
        # From the temporal table, reduce the multiple location rows to the one matching
        # the appropriate sample timestamp
        tb_samples = self.hive_cxt.sql("""
            SELECT *
            FROM (
                SELECT *,
                MIN(delta)   OVER ( PARTITION BY refers_to_object_id, created) AS min_delta,
                row_number() OVER ( PARTITION BY refers_to_object_id, created) AS ranks
                FROM samplestemporal st
                ORDER BY refers_to_object_id
            ) query
            where query.ranks = 1
        """)
        tb_samples = tb_samples.withColumn("meta_store", lit(1))
        tb_samples.registerTempTable('minimizedsamples')
        self.hive_cxt.cacheTable('minimizedsamples')
        return tb_samples

    def execute_query(self, (samples_schema, samples_frame, samples_name),
                      (temporal_schema, temporal_frame, temporal_name),
                      cols):
        self.df_samples       = self._create_df_table(samples_schema,  samples_frame,  samples_name)
        self.df_temporal      = self._create_df_table(temporal_schema, temporal_frame, temporal_name)
        self.tb_meta          = self._query_temporal_data()
        self.tb_meta_min      = self._minimize_query()
        # combine to the original pen data (meta_store indicates if we had object data to integrate)
        self.df_samplesmeta   = self.df_samples.join(self.tb_meta_min, ['refers_to_object_id', 'created'], "left_outer")
        self.df_samplesmeta   = self.df_samplesmeta.fillna({'meta_store': 0})
        self.df_samplesmeta.printSchema()
        # pickle file to pandas: alternatively we can store as a json or parquet columnar format
        dropped_cols = ['delta', 'min_delta', 'ranks'] + cols
        samplesmeta_pd  = self.df_samplesmeta.toPandas()
        samplesmeta_pd  = samplesmeta_pd.drop(dropped_cols, axis=1)
        samplesmeta_pd.to_pickle(self.export_path + "penmeta_spark.pkl")
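
# A hedged usage sketch for SparkEngine (the schema/frame/column values are hypothetical
# placeholders; the original caller is not included in this snippet):
# engine = SparkEngine(sc, debug=True)
# engine.execute_query(
#     (samples_schema, samples_frame, 'samples'),
#     (temporal_schema, temporal_frame, 'temporal'),
#     cols=['room_name', 'spot_name'])
# # writes penmeta_spark.pkl under $COOPERHEWITT_ROOT/export/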
Example No. 15
    schema = StructType([
        StructField("domain", StringType(), False),
        StructField("range", StringType(), False)
    ])

    edges = sqlCtx.createDataFrame(tc, schema)
    tcSQL = sqlCtx.createDataFrame(tc, schema)

    tcSQL.cache()

    print "****** schema created ********"
    tcSQL.printSchema()

    orgCount = tcSQL.count()
    edges.registerAsTable("edges")
    sqlCtx.cacheTable("edges")

    print "******* Loop Start ********"

    loop_start = datetime.now()

    iteration = 0
    oldCount = 0L
    nextCount = tcSQL.count()

    while True:
        iteration = iteration + 1

        print "****** Start iteration %i ******" % iteration

        oldCount = nextCount
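
        # The visible snippet ends here. A hedged guess (not the original code) at how a
        # semi-naive transitive-closure loop like this typically continues:
        #   new_edges = tcSQL.join(edges, tcSQL.range == edges.domain) \
        #                    .select(tcSQL.domain, edges.range)
        #   tcSQL = tcSQL.unionAll(new_edges).distinct().cache()
        #   nextCount = tcSQL.count()
        #   if nextCount == oldCount:
        #       print "****** Converged after %i iterations ******" % iteration
        #       break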
Example No. 16
mytable.registerTempTable("temp_mytable")
"""


def convert(val):
    return val.upper()


hc.registerFunction("temp_convert", convert)

convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")

convertRDD.registerAsTable("temp_mytable")

hc.cacheTable("temp_mytable")


def printRows(rows):
    for row in rows:
        print row


datas = hc.sql("select * from temp_mytable").collect()

printRows(datas)

datas = hc.sql("select col1 from temp_mytable").collect()

printRows(datas)