Example #1
def item_based():
    input_file = sc.textFile(train_file)
    train_data = input_file.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id").persist(
            StorageLevel(True, True, False, False))

    input_file2 = sc.textFile(val_file)
    val_data = input_file2.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id").persist(
            StorageLevel(True, True, False, False))

    t_users = train_data.map(lambda a: a[0]).distinct().collect()
    t_businesses = train_data.map(lambda a: a[1]).distinct().collect()
    R = len(t_users)
    C = len(t_businesses)

    users_dict = {}
    for u in range(0, R):
        users_dict[t_users[u]] = u

    businesses_dict = {}
    for u in range(0, C):
        businesses_dict[t_businesses[u]] = u

    t_characteristic_matrix = train_data.map(
        lambda x: (x[0], ([businesses_dict[x[1]]], [x[2]]))).reduceByKey(
            lambda x, y: (x[0] + y[0], x[1] + y[1])).persist(
                StorageLevel(True, True, False, False))
    t2 = t_characteristic_matrix.map(lambda x: trans(x))
    dum = {}
    for u in t2.collect():
        dum[u[0]] = u[1]

    ti_characteristic_matrix = train_data.map(
        lambda x: (businesses_dict[x[1]], ([users_dict[x[0]]], [x[2]]))).reduceByKey(
            lambda x, y: (x[0] + y[0], x[1] + y[1])).persist(
                StorageLevel(True, True, False, False))
    ti2 = ti_characteristic_matrix.map(lambda x: trans(x))
    dim = {}
    for u in ti2.collect():
        dim[u[0]] = u[1]

    pres = val_data.map(lambda x: (
        x[0], x[1],
        i_predict((x[0], x[1], x[2]), dum, dim, businesses_dict))).persist(
            StorageLevel(True, True, False, False))

    ans_file = open(output_file, 'w')
    ans_file.write("user_id, business_id, prediction\n")
    for c in pres.collect():
        ans_file.write(c[0] + "," + c[1] + "," + str(c[2]) + "\n")
    ans_file.close()

    return
Example #2
    def createCheckInDataPerUser(self):
        review_user = self.sqlContext.sql(
            "SELECT business_id, user_id FROM reviews")
        business_loc = self.sqlContext.sql(
            "SELECT business_id, latitude, longitude FROM business")
        review_user.registerTempTable("reviews_user")
        business_loc.registerTempTable("business_loc")

        self.df_join_reviewAndBusiness = self.sqlContext.sql(
            "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id"
        ).rdd.groupBy(lambda x: x.user_id).persist(
            StorageLevel(True, True, False, True, 1))
        # self.df_join_reviewAndBusiness.repartition(1).saveAsTextFile("user.json")
        self.user_centers = self.df_join_reviewAndBusiness.map(
            getCentersOfUser, preservesPartitioning=True)

        schema_2 = StructType([
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True)
        ])

        schema = StructType([
            StructField("cluster_centers", ArrayType(schema_2), True),
            StructField("user_id", StringType(), True)
        ])
        df = self.sqlContext.createDataFrame(self.user_centers.repartition(1),
                                             schema)
        df.save("center.json", "json")
Example #3
def special_show(self,
                 n=2000,
                 truncate=False,
                 vertical=False,
                 auto_sample=True,
                 seed=None):
    """Special version of show, this changes the default to number of rows to 2000
  and samples the result. Caches the input if not already cached.
  """
    if vertical:
        raise Exception("this doesn't work in fancy notebook mode")
    do_cache = auto_sample and self.storageLevel == StorageLevel(
        False, False, False, False, 1)
    try:
        if do_cache:
            self.cache()
        sampled_df = self
        do_sample = False
        if auto_sample:
            total_count = self.count()
            do_sample = n < total_count
            if do_sample:
                fraction = (n * 1.1) / total_count
                sampled_df = self.sample(withReplacement=False,
                                         fraction=fraction,
                                         seed=seed).limit(n)
        pandas_df = sampled_df.toPandas()
        return DataFrameResult(pandas_df, self, do_sample)
    finally:
        if do_cache:
            self.unpersist()
Example #4
File: main.py  Project: godfather239/com
def get_sensor_table(sparkContext, sqlContext):
    start = CONFIG['stat_from'].strftime('%Y%m%d')
    end = CONFIG['stat_to'].strftime('%Y%m%d')
    table = sqlContext.sql('''
        SELECT search_word,
               product_id,
               sum(VIEW) AS exposure_count,
               sum(click) AS click_count
        FROM
          (SELECT if(a.doc_type = 'global_mall' or a.doc_type = 'global_pop_mall', b.product_id, a.p_material_id)
                  AS product_id,
                  CASE
                      WHEN (a.event_id = 4) THEN 1
                      ELSE 0
                  END AS VIEW,
                  CASE
                      WHEN (a.event_id = 3) THEN 1
                      ELSE 0
                  END AS click,
                  a.search_word
           FROM
             (SELECT event_id,
                     search_word,
                     p_material_id,
                     doc_type
              FROM
                (SELECT event_id,
                        regexp_extract(p_params, '^(.*?)&(.*?)$', 1) AS search_word,
                        regexp_extract(p_material_id, '(.*p)?(\\\d+).*',2) AS p_material_id,
                        regexp_extract(p_material_link, '^.*&type=(.*?)&.*', 1) AS doc_type
                 FROM rawdata.event_ros_p1
                 WHERE DAY >= '%s'
                   AND DAY <= '%s'
                   AND p_material_page='product_search_list'
                   AND p_params IS NOT NULL
                   AND (event_id = 4
                        OR event_id = 3)) st
              WHERE search_word IS NOT NULL
                AND search_word != ''
                AND p_material_id IS NOT NULL
                AND p_material_id rlike '^\\\d+$' ) a
           LEFT JOIN mysql.jumei_mall b ON a.p_material_id = b.mall_id
           AND (a.doc_type = 'global_mall'
                OR a.doc_type = 'global_pop_mall')
           WHERE
                if(a.doc_type = 'global_mall' or a.doc_type = 'global_pop_mall', b.product_id, a.p_material_id) is not null
             AND a.search_word IS NOT NULL
             AND a.search_word != '' ) t
        WHERE product_id IS NOT NULL
        GROUP BY search_word,
                 product_id
    ''' % (start, end))
    table.persist(StorageLevel(True, True, False, False, 1))
    if CONFIG['do_save_table']:
        table.write.saveAsTable('recommend.ecpm_sensor' +
                                CONFIG['table_suffix'],
                                mode='overwrite')
    return table
Example #5
def user_based():
    input_file = sc.textFile(train_file)
    train_data = input_file.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id").persist(
            StorageLevel(True, True, False, False))

    input_file2 = sc.textFile(val_file)
    val_data = input_file2.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id").persist(
            StorageLevel(True, True, False, False))

    t_businesses = train_data.map(lambda a: a[1]).distinct().collect()

    ncolumns = len(t_businesses)

    businesses_dict = {}
    for u in range(0, ncolumns):
        businesses_dict[t_businesses[u]] = u

    t_characteristic_matrix = train_data.map(
        lambda x: (x[0], ([businesses_dict[x[1]]], [x[2]]))).reduceByKey(
            lambda x, y: (x[0] + y[0], x[1] + y[1])).map(lambda x: trans(x))

    dum = {}
    for u in t_characteristic_matrix.collect():
        dum[u[0]] = u[1]

    businesses_users = train_data.map(lambda x: (x[1], [x[0]])).reduceByKey(
        lambda x, y: x + y)
    dbu = {}
    for bu in businesses_users.collect():
        dbu[bu[0]] = bu[1]

    pres = val_data.map(lambda x: (
        x[0], x[1], predict((x[0], x[1]), dum, dbu, businesses_dict)))

    ans_file = open(output_file, 'w')
    ans_file.write("user_id, business_id, prediction\n")
    for c in pres.collect():
        ans_file.write(c[0] + "," + c[1] + "," + str(c[2]) + "\n")
    ans_file.close()
    return
Example #6
 def getStorageLevel(self):
     """
     Get the RDD's current storage level.
     >>> rdd1 = sc.parallelize([1,2])
     >>> rdd1.getStorageLevel()
     StorageLevel(False, False, False, 1)
     """
     java_storage_level = self._jrdd.getStorageLevel()
     storage_level = StorageLevel(java_storage_level.useDisk(),
                                  java_storage_level.useMemory(),
                                  java_storage_level.deserialized(),
                                  java_storage_level.replication())
     return storage_level
Example #7
def special_show(self,
                 n=2000,
                 truncate=False,
                 vertical=False,
                 auto_sample=True,
                 seed=None):
    """Special version for a Spark dataframe's `show`

    This changes the default number of rows to show to 2000 and samples
    the result.

    Caches the input if not already cached.

    Parameters
    ----------
    - self: a dataframe
    - n (int): number of rows from dataframe to show (default 2000)
    - truncate (bool): whether to truncate rows (default False)
    - vertical (bool): not supported; raises an exception if True (default False)
    - auto_sample (bool): whether to sample the dataframe (default True)
    - seed (int): seed for sampling (default None)

    """
    if vertical:
        raise Exception("this doesn't work in fancy notebook mode")

    do_cache = auto_sample and self.storageLevel == StorageLevel(
        False, False, False, False, 1)
    try:
        if do_cache:
            self.cache()

        sampled_df = self
        do_sample = False
        if auto_sample:
            total_count = self.count()
            do_sample = n < total_count
            if do_sample:
                fraction = (n * 1.1) / total_count
                sampled_df = self.sample(withReplacement=False,
                                         fraction=fraction,
                                         seed=seed).limit(n)
        pandas_df = sampled_df.toPandas()

        return DataFrameResult(pandas_df, self, do_sample)

    finally:
        if do_cache:
            self.unpersist()
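
# A hypothetical usage sketch. DataFrameResult below is a stand-in (the real class is
# not shown in this excerpt), and the SparkSession setup is an assumption.
from collections import namedtuple

from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel

DataFrameResult = namedtuple("DataFrameResult", ["pandas_df", "source_df", "sampled"])

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.range(10000)

result = special_show(df, n=100)  # samples roughly 100 rows into pandas
print(len(result.pandas_df), result.sampled)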
Example #8
File: main.py  Project: godfather239/com
def get_order_table(sparkContext, sqlContext):
    start = CONFIG['stat_from'].strftime('%Y-%m-%d')
    end = CONFIG['stat_to'].strftime('%Y-%m-%d')
    table = sqlContext.sql('''
          select sell_label,product_id as productId,sum(quantity*deal_price) as sales_amount from
          bi_datawarehouse.int_paid_orders where data_date >= '%s' AND data_date <= '%s'
                                and sell_label is not null and sell_label != ""
                                and sell_type = 'mSearch'
          group by sell_label,product_id
          order by sales_amount desc
    ''' % (start, end))
    table.persist(StorageLevel(True, True, False, False, 1))
    if CONFIG['do_save_table']:
        table.write.saveAsTable('recommend.ecpm_order' +
                                CONFIG['table_suffix'],
                                mode='overwrite')
    return table
Example #9
File: spark.py  Project: goddess5321/utils
# Wide vs. narrow dependencies: "https://github.com/rohgar/scala-spark-4/wiki/Wide-vs-Narrow-Dependencies"

# pandas DataFrame to spark DataFrame
from pyspark.sql import SparkSession
sqlContext = SparkSession \
    .builder \
    .appName("dataFrame") \
    .getOrCreate()
spark_df = sqlContext.createDataFrame(df)

# pandas DataFrame to spark rdd
spark.createDataFrame(df).rdd

# Set the storage (cache) level
from pyspark.storagelevel import StorageLevel
StorageLevel.DISK_ONLY = StorageLevel(True, False, False, False)
StorageLevel.DISK_ONLY_2 = StorageLevel(True, False, False, False, 2)
StorageLevel.MEMORY_ONLY = StorageLevel(False, True, False, False)
StorageLevel.MEMORY_ONLY_2 = StorageLevel(False, True, False, False, 2)
StorageLevel.MEMORY_AND_DISK = StorageLevel(True, True, False, False)
StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2)
StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)
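# For reference (assumes an existing RDD named rdd): pass one of the levels above to persist()
rdd.persist(StorageLevel.MEMORY_AND_DISK)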

# Use a random forest classifier with Spark MLlib
from pyspark.mllib.tree import RandomForest, RandomForestModel
model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={}, numTrees=3,
                                     featureSubsetStrategy="auto", impurity='gini',
                                     maxDepth=4, maxBins=32)

# Save a Spark RDD as a local text file
rdd.saveAsTextFile(file_name)

# Take a given number of elements from an RDD as a Python list
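# (A likely completion of the entry above, assuming an existing RDD named rdd:)
rdd.take(5)  # returns the first 5 elements as a list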
Example #10
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.storagelevel import StorageLevel

conf = SparkConf().setAppName("wordcount").setMaster("local")
sc = SparkContext(conf=conf)
sc.setCheckpointDir("./chk")
lines = sc.textFile("./text")
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
pairs.persist(storageLevel=StorageLevel(True, True, False, False, 3))
result = pairs.reduceByKey(lambda a, b: a + b)
result.checkpoint()
sorted_result = result.sortBy(lambda kv: kv[1], False)
# sorted_result = result.map(lambda kv: (kv[1], kv[0])).sortByKey(False) \
#                     .map(lambda kv: (kv[1], kv[0]))
temp = sorted_result.take(3)
# num = result.count()
# temp = result.collect()
print(temp)
# result.saveAsTextFile("./my_result")
Example #11
    # initiate
    sc = SparkContext('local[*]', 'inf553_hw2_1')
    sc.setLogLevel("OFF")

    case_mark = int(sys.argv[1])
    S = int(sys.argv[2])
    input_file = sc.textFile(sys.argv[3])  # readfile
    output_file = sys.argv[4]

    data = input_file.distinct().map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id")

    # create basket
    if case_mark == 1:
        baskets = data.groupByKey().map(lambda x: (x[0], list(x[1]))).persist(
            StorageLevel(True, True, False, False))
    elif case_mark == 2:
        baskets = data.map(lambda x: (x[1], x[0])).groupByKey().map(
            lambda x: (x[0], list(x[1]))).persist(
                StorageLevel(True, True, False, False))
    else:
        exit(-1)

    # baskets = baskets.coalesce(1)
    # data.unpersist()

    N = baskets.count()

    # SON algorithm
    # Pass 1
    # Pass 1 Map
Example #12
data_review = input_file_review.map(lambda a: json.loads(a)).map(
    lambda a: (a['business_id'], a['stars']))

input_file_business = sc.textFile(sys.argv[2])
data_business = input_file_business.map(lambda a: json.loads(a)).map(
    lambda a: (a['business_id'], a['state']))

data = data_review.join(data_business)

mstatestar = data.map(lambda x: (x[1][1], (x[1][0], 1))).reduceByKey(
    lambda x, y: (x[0] + y[0], x[1] + y[1])).sortByKey()

mstateavgstar = mstatestar.map(
    lambda x: (x[0], format(float(x[1][0]) / x[1][1]))).sortBy(
        lambda x: x[1],
        ascending=False).persist(StorageLevel(True, True, False, False))

# 1-collect
begin_time_1 = time.time()

m1 = mstateavgstar.collect()
for i in range(5):
    print(m1[i])

end_time_1 = time.time()

# 2-take
begin_time_2 = time.time()

m2 = mstateavgstar.take(5)
print(m2)
Example #13
    sim = float(len(inter)/len(un))
    return (pairs[0], pairs[1], sim)


if __name__ == "__main__":

    time1 = time.time()

    conf = SparkConf().setAppName('inf553_hw3_1').setMaster('local[*]')
    sc = SparkContext(conf=conf)  # initiate
    sc.setLogLevel("OFF")

    input_file = sc.textFile(sys.argv[1])  # readfile
    output_file = sys.argv[2]

    data = input_file.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id").persist(
            StorageLevel(True, True, False, False))

    users = data.map(lambda a: a[0]).distinct().collect()
    nrows = len(users)

    users_dict = {}
    for u in range(0, nrows):
        users_dict[users[u]] = u

    characteristic_matrix = data.map(
        lambda x: (x[1], [users_dict[x[0]]])).reduceByKey(
            lambda x, y: x + y).persist(StorageLevel(True, True, False, False))

    d_characteristic_matrix = {}
    cm = characteristic_matrix.map(lambda x: (x[0], set(x[1]))).collect()
    for i in cm:
        d_characteristic_matrix[i[0]] = i[1]
Example #14
import os
import sys
import re
import boto3
from datetime import datetime

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.storagelevel import StorageLevel

MEMORY_AND_DISK = StorageLevel(True, True, False, False)


def splitter(array, n=50):
    """Yield chunks of at most n items from array.

    Note: the same list object is reused and cleared between yields, so each
    chunk must be consumed (or copied) before the generator is advanced.
    """
    assert n > 0
    i = 0
    result = []
    for e in array:
        if i < n:
            result.append(e)
            i += 1
        if i >= n:
            yield result
            i = 0
            del result[:]  # clear in place; the yielded chunk is this same list

    if len(result):
        yield result  # trailing partial chunk
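
# A short illustrative check (not part of the original project): because splitter()
# clears the yielded list in place, copy each chunk before advancing the generator.
chunks = [list(chunk) for chunk in splitter(range(7), n=3)]
print(chunks)  # [[0, 1, 2], [3, 4, 5], [6]]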


def gen_partition_statement(partition_tuples, target_root, run_id=None):
Example #15
from pyspark.context import SparkContext
from pyspark.storagelevel import StorageLevel
import json
import sys

sc = SparkContext('local[*]', 'inf553_hw1_1')  # initiate

input_file = sc.textFile(sys.argv[1])  # readfile
data = input_file.map(lambda x: json.loads(x)).map(lambda x: (
    x['review_id'], (x['user_id'], x['business_id'], x['useful'], x['stars'],
                     len(x['text'])))).persist(
        StorageLevel(True, True, False, False))  # parse each JSON review record

museful = data.filter(lambda x: x[1][2] > 0).count()

mfivestar = data.filter(lambda x: x[1][3] == 5.0).count()

mlongestreview = data.map(lambda x: (x[1][4], 1)).top(1)

muser = data.map(lambda x: (x[1][0], 1)).reduceByKey(
    lambda x, y: x + y).sortByKey().persist(
        StorageLevel(True, True, False, False))

musernum = muser.count()

muserreview = muser.takeOrdered(20, lambda x: -x[1])

mbusiness = data.map(lambda x: ((x[1][1]), 1)).reduceByKey(
    lambda x, y: x + y).sortByKey().persist(
        StorageLevel(True, True, False, False))
Example #16
from typing import Dict, Tuple

from pyspark import SparkContext
from pyspark.rdd import RDD
from pyspark.storagelevel import StorageLevel


def join_dict_of_rdd(rdd_dict: Dict[str, RDD]) -> RDD:
    """Join dictionary of RDD, but not a traditional join

    1. Stack all RDD together and make as paired RDD
    2. reducebyKey, value is list of dict
    3. fill all field accorddingly, and return a RDD of dict

    Args:
        rdd_dict (Dict[str, RDD]): RDD dict

    Returns:
        RDD: Joined RDD
    """

    # create loss multiplier for inputs
    rdd_list = []
    loss_multiplier_list = []

    def _add_loss_multiplier(inp: dict, problem: str) -> dict:
        lm_name = '{}_loss_multiplier'.format(problem)
        inp[lm_name] = 1
        return inp

    for p, rdd in rdd_dict.items():
        loss_multiplier_list.append('{}_loss_multiplier'.format(p))
        rdd_list.append(
            rdd.map(lambda x, p=p: _add_loss_multiplier(x, problem=p)))

    # union rdds
    sc: SparkContext = SparkContext.getOrCreate()
    all_problem_rdd = sc.union(rdd_list)

    # make pair rdd
    def _make_pair_rdd(inp_dict: dict) -> Tuple[str, dict]:
        if 'record_id' not in inp_dict:
            raise KeyError(
                "Chaining problems with & without "
                "providing 'record_id' in inputs. Received keys: {}".format(
                    inp_dict.keys()))
        return (inp_dict['record_id'], inp_dict)

    all_problem_rdd = all_problem_rdd.map(_make_pair_rdd)

    # reduce by key, fill out dict correspondingly
    loss_multiplier_list_b = sc.broadcast(loss_multiplier_list)

    def _merge_dicts(left_dict: dict, right_dict: dict):
        left_dict.update(right_dict)
        return left_dict

    def _add_dummpy_loss_multiplier(inp: dict) -> dict:
        # set loss multiplier to inform which problem
        # is available in this record
        lml = loss_multiplier_list_b.value
        for lm in lml:
            if lm not in inp:
                inp[lm] = 0
        return inp

    # MEMORY_AND_DISK
    all_problem_rdd = all_problem_rdd.persist(
        storageLevel=StorageLevel(True, True, False, False))
    all_problem_rdd = all_problem_rdd.reduceByKey(_merge_dicts).map(
        lambda x: x[1]).map(_add_dummpy_loss_multiplier)

    return all_problem_rdd
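
# A minimal, hypothetical usage sketch (the problem names, records, and local
# SparkContext below are illustrative assumptions, not from the original code):
if __name__ == '__main__':
    sc = SparkContext.getOrCreate()
    rdd_a = sc.parallelize([{'record_id': 'r1', 'text': 'hello'},
                            {'record_id': 'r2', 'text': 'world'}])
    rdd_b = sc.parallelize([{'record_id': 'r1', 'label': 1}])

    joined = join_dict_of_rdd({'problem_a': rdd_a, 'problem_b': rdd_b})
    for record in joined.collect():
        print(record)
    # 'r1' carries fields from both problems with loss multipliers of 1;
    # 'r2' gets problem_b_loss_multiplier == 0 since it only appears in problem_a.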
Example #17
if __name__ == '__main__':

    global table_create
    table_create = False

    # create a local StreamingContext with * worker threads and a batch interval of 20 seconds
    sc = SparkContext('local[*]', 'TwitterStream')

    ssc = StreamingContext(sc, 20)

    # read data from port
    with open('config.yaml', 'r') as stream:
        details = yaml.safe_load(stream)

    lines = ssc.socketTextStream(details['host'],
                                 details['port'],
                                 storageLevel=StorageLevel(
                                     False, True, False, False, 1))

    # split each tweet into words
    words = lines.flatMap(lambda line: line.split(' '))

    # do processing for each RDD generated in each interval
    words.foreachRDD(process_rdd)

    # start the streaming computation
    ssc.start()

    # wait for the streaming to finish
    ssc.awaitTermination()
Example #18
from pyspark.sql.functions import _to_seq, _to_java_column
from pyspark.sql import SparkSession, Column
from pyspark.sql.functions import broadcast

#from marketing_mart.CRM.delivery_diner.ddls import *
#from marketing_mart.helpers import write_and_partition
#from marketing_mart.CRM.diner_last_address.ddls import *

logging.basicConfig(level=logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s:%(name)s: %(message)s")
root_logger = logging.getLogger("[CRM ETL Delivery Diner]")

ACTIVE_LOOK_BACK = 380
# MEMORY_AND_DISK = StorageLevel(True, True, False, False)
MEMORY_ONLY = StorageLevel(False, True, False, False)

geom_table = 'source_mysql_core.geom'
customer_table = 'source_mysql_core.customer'
postal_code_dim_table = 'integrated_core.postal_code_dim'
diner_order_agg_table = 'integrated_diner.diner_order_agg'
diner_last_address_table = 'migrated_marketing_reporting.diner_last_address'
login_user_table = 'source_mysql_core.login_user'

#self.sc.conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")
#spark.conf.set("spark.kryoserializer.buffer.mb", "300")
#spark.conf.set("spark.kryoserializer.buffer.max", "300297910")

cbsa_rest_query = """
		SELECT DISTINCT c.cust_id AS restaurant_id 
		  , g.g AS wkt
Example #19
    time1 = time.time()

    sc = SparkContext('local[*]', 'inf553_hw2_2')  # initiate
    sc.setLogLevel("OFF")

    T = int(sys.argv[1])
    S = int(sys.argv[2])
    input_file = sc.textFile(sys.argv[3])  # readfile
    output_file = sys.argv[4]

    data = input_file.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id")
    # create basket
    baskets = data.groupByKey().map(lambda x: (x[0], list(x[1]))).filter(
        lambda x: len(x[1]) > T).persist(
            StorageLevel(True, True, False, False))

    # baskets = baskets.coalesce(4, True).persist(StorageLevel(True, True, False, False))

    N = baskets.count()

    # Pass 1
    # Pass 1 Map
    can_freq_is = baskets.mapPartitions(apriori)

    # Pass 1 Reduce
    all_can_freq_is = can_freq_is.distinct().map(lambda x: alterStr(x)).sortBy(
        lambda x: x).sortBy(lambda x: len(x))

    if baskets.getNumPartitions() == 1:
        results = all_can_freq_is
Example #20
 def persist(self, storageLevel=StorageLevel(True, True, False, False, 1)):
     raise NotImplementedError()