Example No. 1
def hash_rating(author_subreddit_rating_rdd, sc):
    sql_context = SQLContext(sc)

    author_sub_schema = StructType([
        StructField("author", StringType(), True),
        StructField("subreddit", StringType(), True),
        StructField("rating", LongType(), True)
    ])
    asr_df = sql_context.createDataFrame(author_subreddit_rating_rdd, author_sub_schema)

    author_rdd = author_subreddit_rating_rdd.map(lambda asr: asr[0])  # author column
    aid_rdd = author_rdd.distinct().zipWithUniqueId().cache()
    author_id_schema = StructType([
        StructField("author", StringType(), True),
        StructField("author_id", LongType(), True)
    ])
    aid_df = sql_context.createDataFrame(aid_rdd, author_id_schema)
    aid_s_r_df = aid_df.join(asr_df, on='author').drop('author').cache()

    subreddit_rdd = author_subreddit_rating_rdd.map(lambda asr: asr[1])  # subreddit column
    sid_rdd = subreddit_rdd.distinct().zipWithUniqueId().cache()
    subreddit_id_schema = StructType([
        StructField("subreddit", StringType(), True),
        StructField("subreddit_id", LongType(), True)
    ])
    sid_df = sql_context.createDataFrame(sid_rdd, subreddit_id_schema)
    aid_sid_r_df = sid_df.join(aid_s_r_df, on='subreddit').drop('subreddit').cache()
    row_aid_sid_r_rdd = aid_sid_r_df.rdd
    aid_sid_r_rdd = row_aid_sid_r_rdd.map(lambda row: (row.author_id, row.subreddit_id, row.rating))

    return aid_rdd, sid_rdd, aid_sid_r_rdd
Example No. 2
 def _get_data(self):
     sql_context = SQLContext(self.sc)
     l = [
         (
         "I dont know why people think this is such a bad movie.",
         Vectors.sparse(3, {0: 1.0, 1: 1.0, 2: 1.0})
         ),
     ]
     return sql_context.createDataFrame(l, ['text', 'features'])
Example No. 3
 def _get_train_data(self):
     sql_context = SQLContext(self.sc)
     l = [
         (1, Vectors.dense([1, 2, 3]), 1.0),
         (2, Vectors.dense([1, 2, 3]), 0.0),
         (3, Vectors.dense([1, 2, 3]), 1.0),
         (4, Vectors.dense([1, 2, 3]), 0.0),
     ]
     return sql_context.createDataFrame(l, ['id', 'features', 'label'])
Example No. 4
def main():
	conf = SparkConf().setAppName('artist_career')
	sc = SparkContext(conf=conf)
	assert sc.version >= '1.5.1'
	sqlContext=SQLContext(sc)
	inputs = sys.argv[1]
	output = sys.argv[2]
	customSchema = StructType([
	    StructField('SongNumber', StringType(), False),
	    StructField('SongID', StringType(), False),
	    StructField('AlbumID', StringType(), False),
	    StructField('AlbumName', StringType(), False),
	    StructField('ArtistID', StringType(), False),
	    StructField('ArtistLatitude', StringType(), False),
	    StructField('ArtistLocation', StringType(), False),
	    StructField('ArtistLongitude', StringType(), False),
	    StructField('ArtistName', StringType(), False),
	    StructField('Danceability', StringType(), False),
	    StructField('Duration', StringType(), False),
	    StructField('KeySignature', StringType(), False),
	    StructField('KeySignatureConfidence', StringType(), False),
	    StructField('Tempo', StringType(), False),
	    StructField('TimeSignature', StringType(), False),
	    StructField('TimeSignatureConfidence', StringType(), False),
	    StructField('Title', StringType(), False),
	    StructField('Year', StringType(), False),
	    StructField('Energy', StringType(), False),
	    StructField('ArtistFamiliarity', StringType(), False),
	    StructField('ArtistMbid', StringType(), False),
	    StructField('SongHotttnesss', StringType(), False),
	    StructField('Loudness', StringType(), False),
	    StructField('StartOfFadeOut', StringType(), False),
	    StructField('EndOfFadeIn', StringType(), False),
	    StructField('ModeConfidence', StringType(), False)
	])

	df= sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(inputs,schema = customSchema)

	df.registerTempTable('artist_data')

	million_song=sqlContext.sql("SELECT SongNumber,SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,Title,Year,Energy,ArtistFamiliarity,ArtistMbid,SongHotttnesss,Loudness,StartOfFadeOut,EndOfFadeIn,ModeConfidence from artist_data where Year!=0 AND ArtistFamiliarity!='nan'")
	million_song.write.format('parquet').save(output)
Example No. 5
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("BandCard")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)

        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
Example No. 6
def ALS_fit():
    usern = request.args.get('usern')
    users_df = pd.read_sql_query('''SELECT DISTINCT mt3ratings.user, user_id FROM mt3ratings WHERE appdata = 1''', engine)
    if usern not in users_df['user'].values:
        return_str =  "can't find user"
        return jsonify(result = return_str)
    user_id = users_df.user_id[users_df.user == usern].values[0]
    try: key = request.args.get('key')
    except NameError: key = 'e'
    if key == 'abcd':
        # start spark
        try:
            conf = SparkConf().setAppName("BeerSleuthALS").set("spark.executor.memory", "4g")
            sc = SparkContext(conf=conf)
        except ValueError:
            pass
        sqlContext = SQLContext(sc)
        ratings_sqldf = modeling.get_item_user_rev_from_pg(engine, sqlContext)
        sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
        print('fitting model')
        model = modeling.fit_final_model(ratings_sqldf)
        beer_ids = beer_dict.values()
        to_predict = zip([user_id]*len(beer_ids), beer_ids)
        to_predict_top20 = zip([user_id]*len(beer_id_filt), beer_id_filt)
        user_preds = model.predictAll(sc.parallelize(to_predict)).collect()
        user_preds_top20 = model.predictAll(sc.parallelize(to_predict_top20)).collect()
        print('got preds')
        preds = Counter({x[1]: x[2] for x in user_preds})
        preds_top20 = Counter({x[1]: x[2] for x in user_preds_top20})
        with open('%s%s_preds.pkl'%(pred_path, user_id),'wb') as f:
            pickle.dump(preds, f)
        with open('%s%s_preds_top20.pkl'%(pred_path, user_id),'wb') as f:
            pickle.dump(preds_top20, f)

        print('done')
        sc.stop()
        return jsonify(result="Model training complete, you may now get predictions")
Example No. 7
def batching(spark, i):

    bucket = "mimic3waveforms3"
    p = '1.0/p0'+ str(i) +'/'

    session = boto3.session.Session()
    client = session.client('s3')
    s3 = session.resource('s3')

    patientList = []
    result = client.list_objects(Bucket=bucket, Prefix=p, Delimiter='/')
    for patient in result.get('CommonPrefixes'):
        patientList.append(patient.get('Prefix'))

    my_bucket = s3.Bucket("mimic3waveforms3")
    patientInfoObj = client.get_object(Bucket=bucket, Key='PATIENTS.csv')

    f_schema = StructType([\
    StructField('min', DoubleType(), True),\
    StructField('max', DoubleType(), True),\
    StructField('mean', DoubleType(), True),\
    StructField('median', DoubleType(), True),\
    StructField('mode', DoubleType(), True),\
    StructField('std', DoubleType(), True),\
    StructField('kurtosis', DoubleType(), True),\
    StructField('mortality_flag', IntegerType(), True),\
    StructField('patient_id', IntegerType(), True)])

    sqlContext = SQLContext(spark)

    data = [(0.0, 0.0,0.0, 0.0, 0.0, 0.0,0.0, 0, 0)]
    df_features = sqlContext.createDataFrame(data, f_schema)
    df = pd.read_csv(patientInfoObj['Body'])

    # for the first 6 batches of records, extract train features
    for patient in patientList:
       patientRecords = []
       for object_summary in my_bucket.objects.filter(Prefix=patient):
          patientRecord = {}
          patientRecord['file_name']=object_summary.key
          if len(patientRecord['file_name']) != 32 or 'layout' in patientRecord['file_name']:
              continue
          patientRecord['body']=object_summary.get()['Body'].read()
          patientRecords.append(patientRecord)
       patientId =int(patient[-7:-1])
       mortality = (df[df['SUBJECT_ID']==patientId]['EXPIRE_FLAG'])

       # for the first 6 batches of records, extract train features
       if i < 6:
           df_new_features = featureExtraction.TrainFeatureExtraction(sqlContext, patientRecords, f_schema, mortality, patientId)
           df_features = df_features.union(df_new_features)

       # for the last 2 batches of records, extract test features
       else:
           df_new_features = featureExtraction.TestFeatureExtraction(sqlContext, patientRecords, df_features, f_schema, int(mortality.values), patientId)
           df_features = df_features.union(df_new_features)
           df_features = df_features.filter(col('patient_id') > 0)
           df_features.write.csv("s3a://"+bucket+"/PredictFeatures/"+str(patientId))

    if i < 6:
        df_features = df_features.filter(col('patient_id') > 0)
        df_features.write.csv("s3a://"+bucket+"/TrainFeatures/"+str(i))
Example No. 8
def _load_data(filePaths, dataset_name, spark_context, groupfiles, groupsize):
    sqlContext = SQLContext(spark_context)
    return sqlContext.read.json(filePaths)
Example No. 9
import itertools
import sys
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating


CLOUDSQL_INSTANCE_IP = sys.argv[1]
CLOUDSQL_NAME = sys.argv[2]
CLOUDSQL_USER = sys.argv[3]
CLOUDSQL_PWD  = sys.argv[4]

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl    = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (CLOUDSQL_INSTANCE_IP, CLOUDSQL_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)

#[START how_far]
def howFarAreWe(model, against, sizeAgainst):
  # Ignore the rating column  
  againstNoRatings = against.map(lambda x: (int(x[0]), int(x[1])) )

  # Keep the rating to compare against
  againstWiRatings = against.map(lambda x: ((int(x[0]),int(x[1])), int(x[2])) )

  # Make a prediction and map it for later comparison
  # The map has to be ((user,product), rating) not ((product,user), rating)
  predictions = model.predictAll(againstNoRatings).map(lambda p: ( (p[0],p[1]), p[2]) )
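  # The sample is cut off here. A hedged continuation (an assumption, not part of the
  # original): join predicted and actual ratings on (user, product) and report the RMSE.
  joinedRatings = againstWiRatings.join(predictions)
  squaredErrors = joinedRatings.map(lambda pair: (pair[1][0] - pair[1][1]) ** 2)
  return sqrt(squaredErrors.reduce(add) / float(sizeAgainst))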
Example No. 10
    sqlc.registerDataFrameAsTable(pw_df, "pw_df")
    sqlc.sql('Select sum(h) from pw_df').show()


# MAIN

sc_conf = SparkConf()
sc_conf.setAppName("pwned")
# sc_conf.set('spark.executor.memory', '2g')
# sc_conf.set('spark.driver.memory', '4g')
# sc_conf.set('spark.cores.max', '4')

sc_conf.set('spark.sql.crossJoin.enabled', True)

sc = SparkContext(conf=sc_conf)
sqlc = SQLContext(sc)
print(sys.version_info)

print('Spark version %s running.' % sc.version)

print('Config values of Spark context: ')
print(sc.getConf().getAll())


runner = rdd_approach, df_sql_approach, count_all_occurrences
for f in runner:
    f()

print('Finished.')
# hack to keep spark ui alive
raw_input("Press ctrl+c to exit")
Example No. 11
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.utils import AnalysisException
import os, math, time

# In[2]:

conf = (
    SparkConf()
    #.setMaster('spark://10.100.5.182:7077')
    #.setMaster("local")
    .setAppName("hw1"))

# In[3]:

try:
    sc = SparkContext(conf=conf)
    sql_sc = SQLContext(sc)
except ValueError:
    pass

# In[4]:

try:
    data = sql_sc.read.csv('./household_power_consumption.txt',
                           sep=';',
                           header=True)
except AnalysisException:
    data = sql_sc.read.csv('hdfs:///bdm/hw1/household_power_consumption.txt',
                           sep=';',
                           header=True)

# In[5]:
Example No. 12
if __name__ == '__main__':
    conf = SparkConf()

    sc = SparkContext(conf=conf)

    datadir = "/Users/eyalbenivri/Developer/projects/spark-workshop/data/"

    # sudo dpkg --configure -a
    # sudo apt-get install python-setuptools
    # sudo easy_install dateutils
    # Download pyspark_csv.py from https://github.com/seahboonsiew/pyspark-csv
    sys.path.append('/Users/eyalbenivri/Developer/libs/pyspark_libs')  # replace as necessary
    import pyspark_csv

    sc.addFile('/Users/eyalbenivri/Developer/libs/pyspark_libs/pyspark_csv.py')  # ditto
    sqlContext = SQLContext(sc)

    # Task 1: load the prop-prices.csv file as an RDD, and use the csvToDataFrame function from the pyspark_csv module
    # to create a DataFrame and register it as a temporary table so that you can run SQL queries:
    print("------- ******* Task 1 ******* -------")
    columns = ['id', 'price', 'date', 'zip', 'type', 'new', 'duration', 'PAON',
               'SAON', 'street', 'locality', 'town', 'district', 'county', 'ppd',
               'status']

    rdd = sc.textFile(datadir + "prop-prices.csv")
    df = pyspark_csv.csvToDataFrame(sqlContext, rdd, columns=columns)
    df.registerTempTable("properties")
    df.persist()

    # Task 2: let's do some basic analysis on the data.
    # Find how many records we have per year, and print them out sorted by year.
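    # The sample stops here. A minimal sketch for Task 2 (an assumption, not the original
    # solution), assuming the inferred 'date' column is understood by Spark SQL's year():
    per_year = sqlContext.sql(
        "SELECT year(date) AS yr, COUNT(*) AS records "
        "FROM properties GROUP BY year(date) ORDER BY yr")
    for row in per_year.collect():
        print("%s: %d" % (row.yr, row.records))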
Example No. 13
# Databricks notebook source
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.functions import *
target_query="SELECT * from database_name.byod_dbfs_table"
sparkContext = SparkContext.getOrCreate()
sqlContext = SQLContext(sparkContext)
dataframe = sqlContext.sql(target_query)
dataframe.repartition(1).write.format('com.databricks.spark.csv').options(delimiter=",").save("s3n://amgen-edl-acux-aaaa123-bkt/BYOD/", header="true", mode="overwrite")

# COMMAND ----------

print "test2"

# COMMAND ----------

print "test3"
Example No. 14
from pyspark.sql import Row
from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import udf, lit, col
from pyspark.sql.types import ArrayType, StringType


FEATURES_DATA = 'hdfs:///user/harshdee/base_features_complete.parquet'
SELECTED_NEWSPAPERS = 'hdfs:///user/harshdee/newspapers_citations.parquet'

sc = SparkContext()
sqlContext = SQLContext(sc)
sqlContext.setConf('spark.sql.parquet.compression.codec', 'snappy')
features = sqlContext.read.parquet(FEATURES_DATA)
features = features.withColumnRenamed('page_title', 'page_title_')

features = features.select(
    col('citations_features._1').alias('retrieved_citation'),
    col('citations_features._2').alias('ref_index'),
    col('citations_features._3').alias('total_words'),
    col('citations_features._4._1').alias('neighboring_words'),
    col('citations_features._4._2').alias('neighboring_tags')
)

selected_newspapers = sqlContext.read.parquet(SELECTED_NEWSPAPERS)

## def array_to_string(my_list):
##    return '[' + ','.join([str(elem) for elem in my_list]) + ']'
## array_to_string_udf = udf(array_to_string,StringType())

results = features.join(selected_newspapers, features['retrieved_citation'] == selected_newspapers['citations'])
## results = results.withColumn('neighboring_words', array_to_string_udf(results["neighboring_words"]))
Example No. 15
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import DataFrameStatFunctions, DataFrame
from pyspark.sql.types import *

# initialize the cluster
conf = SparkConf()
#conf.set("spark.driver.memory", "16g")
#conf.set("spark.driver.cores", 4)
#conf.set("spark.driver.memoryOverhead", 0.9)
#conf.set("spark.executor.memory", "32g")
#conf.set("spark.executor.cores", 12)
#conf.set("spark.jars", "/home/jaa6766")
sc = SparkContext(master="local[*]",
                  sparkHome="/usr/local/spark/",
                  appName="tarea-mge-8-parqueteo",
                  conf=conf)
spark = SQLContext(sc)

# read the original source CSV
data = spark.read.csv("s3a://jorge-altamirano/profeco/data.csv",
                      schema = StructType() \
                        .add("producto", StringType(), False) \
                        .add("presentacion", StringType(), True) \
                        .add("marca", StringType(), True) \
                        .add("categoria", StringType(), True) \
                        .add("catalogo", StringType(), True) \
                        .add("precio", DecimalType(precision=16, scale=4), True) \
                        .add("fechaRegistro", TimestampType(), True) \
                        .add("cadenaComercial", StringType(), True) \
                        .add("giro", StringType(), True) \
                        .add("nombreComercial", StringType(), True) \
                        .add("direccion", StringType(), True) \
Example No. 16
def getAccuracyRate(predDataFrame):
    accurRate = 0.0
    numPredictions = predDataFrame.count()
    predDataFrame = predDataFrame.withColumn('isSame', when(predDataFrame['label'] == predDataFrame['prediction'], 1.0).otherwise(0.0))
    correctPredictions = predDataFrame.select(sum('isSame')).collect()[0][0]
    accurRate = (float(correctPredictions) / float(numPredictions)) * 100.0
    return accurRate


### Main Program ###
reload(sys)
sys.setdefaultencoding('utf8')
if __name__ == '__main__':
    conf = SparkConf().setAppName("Project2Part3")
    sparkContxt = SparkContext(conf = conf)
    sqlContext = SQLContext(sparkContxt)
    directPath = sys.argv[1]
    trainFilePath = directPath + 'adult.data.csv'
    testFilePath = directPath + 'adult.test.csv'
    trainingDataFrame = sqlContext.read.load(trainFilePath, format = 'com.databricks.spark.csv', header = 'true', 
        inferSchema = 'true', ignoreLeadingWhiteSpace='true', ignoreTrailingWhiteSpace='true')
    nRows = trainingDataFrame.count()
    nColumns = len(trainingDataFrame.columns)
    trainingDataFrame.show(5, False)
    print('# Initial Training Rows:', nRows, '\t# Initial Training Columns:', nColumns, '\n')
    testDataFrame = sqlContext.read.load(testFilePath, format = 'com.databricks.spark.csv', header = 'true', 
        inferSchema = 'true', ignoreLeadingWhiteSpace='true', ignoreTrailingWhiteSpace='true')
    nRows = testDataFrame.count()
    nColumns = len(testDataFrame.columns)
    testDataFrame.show(5, False)
    print('# Initial Test Rows :', nRows, '\t# Initial Test Columns :', nColumns, '\n')
Example No. 17
from pyspark.sql.types import *
from sqlalchemy import create_engine
# import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql.functions import UserDefinedFunction, monotonically_increasing_id
from datetime import datetime
from pyspark.sql.functions import lit
from pandas.io import sql
import sys
import boto3
import os

conf = SparkConf().setMaster("spark://*****:*****@148.251.19.66/nlp_live')
# connection= engine.connect()

url = "jdbc:mysql://148.251.19.66/bdt_live?user=bigdata_user&password=dbphuv8qeB28JTBW"
print 'data fetch started'
myrdd = sc.textFile(
    "s3a://nlplive.hi.raw.data/logs/nlp_session-www*.nlpcaptcha.in-{201708180[4-5]*}"
)
data_df = sqlContext.read.json(myrdd)
data_df.registerTempTable('hi_raw_data')
print 'data fetched'
# data_df.printSchema()
mydf2 = sqlContext.sql(
    "Select sessionId,publisher_id,device_finger_print,browserTimeStamp,ct,id1,id2,id3,id4,id5,source,browser,device,ip,country,nlpbot,event_type,event_value,referal_url,user_agent,url,token,time_stamp from hi_raw_data where ip='122.160.157.46'"
Example No. 18
class UserSummary(object):
    def __init__(self):
        # local test
        # self.spark = SparkSession.builder.appName("group_by_fans").master("local").config(
        #     conf=SparkConf()).getOrCreate()
        # self.sql_context = SQLContext(sparkContext=self.spark.sparkContext)
        # self.start_time = "2017-08-01 00:00"  # "publish_time": "2017-02-22 19:20"
        # self.time_format = "%Y-%m-%d %H:%M"
        # prod
        self.spark = SparkSession.builder.config(
            conf=SparkConf().setAppName("weibo_user_summary")).getOrCreate()
        self.sql_context = SQLContext(sparkContext=self.spark.sparkContext)
        self.days_list = self.dateRange("2017-10-16", "2017-10-21")
        self.path = 'hdfs:///ssymmetry_db/raw_db/sina_weibo_fans/sina_weibo_fans_item/2017/%s/*'
        self.output = '/home/spark/hxkTest/out/'

    def dateRange(self, begin_date, end_date):
        dates = []
        dt = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
        date = begin_date[:]
        while date <= end_date:
            dates.append(date)
            dt = dt + datetime.timedelta(1)
            date = dt.strftime("%Y-%m-%d")
        return dates

    def read_dataframe(self, path, time_list):
        data_path = []
        for each_day in time_list:
            data_path.append(path % each_day)
        dataframe = self.spark.read.json(data_path)
        # local test
        # dataframe = self.spark.read.json("sina_weibo_fans_data_2017-11-09-10-18.json")
        return dataframe

    def read_blog_data(self, dataframe):
        df = dataframe.filter("blog_id is not NULL").select(
            'uid', 'blog_content',
            'forward_content').drop_duplicates().fillna(" ")
        return df

    def main(self):
        stop_words = []

        def user_tag_to_num(x):
            uid = x["uid"]
            user_tag = x["user_tag"]
            if user_tag == u"电视剧" or user_tag == u"电台" or user_tag == u"电影" or user_tag == u"动漫" \
                    or user_tag == u"广播电台" or user_tag == u"媒体传播" \
                    or user_tag == u"媒体人" or user_tag == u"美女模特" \
                    or user_tag == u"美女帅哥" or user_tag == u"休闲娱乐" \
                    or user_tag == u"娱乐明星" or user_tag == u"综艺":
                user_tag_num = 1
            elif user_tag == u"动物萌宠" or user_tag == u"萌宠":
                user_tag_num = 2
            elif user_tag == u"法律":
                user_tag_num = 3
            elif user_tag == u"房产":
                user_tag_num = 4
            elif user_tag == u"搞笑" or user_tag == u"搞笑幽默":
                user_tag_num = 5
            elif user_tag == u"互联网":
                user_tag_num = 6
            elif user_tag == u"健身" or user_tag == u"运动健身":
                user_tag_num = 7
            elif user_tag == u"教育" or user_tag == u"公益":
                user_tag_num = 8
            elif user_tag == u"科学":
                user_tag_num = 9
            elif user_tag == u"理财" or user_tag == u"投资理财":
                user_tag_num = 10
            elif user_tag == u"历史":
                user_tag_num = 11
            elif user_tag == u"旅游" or user_tag == u"旅游出行":
                user_tag_num = 12
            elif user_tag == u"美食":
                user_tag_num = 13
            elif user_tag == u"美妆":
                user_tag_num = 14
            elif user_tag == u"汽车" or user_tag == u"交通":
                user_tag_num = 15
            elif user_tag == u"社会时政" or user_tag == u"军事" or user_tag == u"政府政务" or user_tag == u"时事":
                user_tag_num = 16
            elif user_tag == u"数码":
                user_tag_num = 17
            elif user_tag == u"体育" or user_tag == u"体育竞技" or user_tag == u"游戏":
                user_tag_num = 18
            elif user_tag == u"养生" or user_tag == u"医疗" or user_tag == u"医疗健康" or user_tag == u"育儿":
                user_tag_num = 19
            elif user_tag == u"作家" or user_tag == u"艺术" or user_tag == u"音乐" or user_tag == u"收藏" or user_tag == u"设计" or user_tag == u"摄影" or user_tag == u"时尚":
                user_tag_num = 20
            elif user_tag == u"职场":
                user_tag_num = 21
            elif user_tag == u"宗教":
                user_tag_num = 22
            elif user_tag == u"星座命理" or user_tag == u"情感" or user_tag == u"婚庆":
                user_tag_num = 23
            elif user_tag == u"商界名人":
                user_tag_num = 24
            else:
                user_tag_num = 0

            return (uid, user_tag_num)

        # prod
        dataframe = self.read_dataframe(self.path, self.days_list).persist()
        blog_df = self.read_blog_data(dataframe)

        # read approved user list
        df = self.spark.read.csv(
            "hdfs:///ssymmetry_db/raw_db/sina_user_tag/sina_user_tag_item/weibo_uid_with_user_tag.csv", header=True) \
            .select("uid", "user_tag")

        user_tag_num_df = self.sql_context.createDataFrame(
            df.rdd.map(user_tag_to_num), ["uid", "user_tag"])

        # local test
        # dataframe = self.spark.read.json("sina_weibo_fans_data_2017-11-09-10-18.json")
        # blog_df = self.read_blog_data(dataframe).fillna(" ")
        # blog_rdd = blog_df.rdd

        # select the blogs for the tagged users
        tagged_user_blog = blog_df.join(
            user_tag_num_df, blog_df.uid == user_tag_num_df.uid).select(
                blog_df.uid, blog_df.blog_content, blog_df.forward_content,
                user_tag_num_df.user_tag)

        def preprocess_data(x):
            uid = x["uid"]
            blog_content = x["blog_content"]
            forward_content = x["forward_content"]
            user_tag = x["user_tag"]
            if forward_content.rfind(u"*****") > 0:
                forward_content = forward_content.split(u"*****")[1]
            return (uid, (blog_content + forward_content, user_tag))

        def extract_keywords(x):
            uid = x[0]
            # prod
            # ja.set_stop_words("/home/spark/hxkTest/movie_data/stopwords_cn.txt")
            ja.set_stop_words(
                "/home/spark/hxkTest/spark_script/weibo_user_summary/stopwords_cn.txt"
            )

            # local test
            # ja.set_stop_words("stopwords_cn.txt")

            keywords = ja.extract_tags(x[1][0])
            user_tag = x[1][1]
            return (uid, (keywords, user_tag))

        # the rdd contains uid, keywords and the user_tag, next step we need to convert the keywords to a matrix
        data = tagged_user_blog.rdd.map(preprocess_data).reduceByKey(
            lambda x, y: (x[0] + y[0], x[1])).map(extract_keywords).collect()

        uid_list = []
        keywords_list = []
        user_tag_list = []
        for elem in data:
            user_keyword = ""
            uid_list.append(elem[0])
            keywords = elem[1][0]
            for word in keywords:
                user_keyword += word
                user_keyword += "_"
            keywords_list.append(user_keyword)
            user_tag_list.append(elem[1][1])

        result_dict = {
            "uid": uid_list,
            "keywords": keywords_list,
            "user_tag": user_tag_list
        }
        pd.DataFrame(result_dict, index=None).to_csv(
            "/home/spark/hxkTest/spark_script/weibo_user_summary/tagged_user_keyword.csv",
            encoding="utf-8")
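        # The comment above notes that the keywords still have to be turned into a matrix.
        # A hedged sketch (an assumption, not in the original) using pyspark.ml's HashingTF
        # on the collected keyword lists:
        from pyspark.ml.feature import HashingTF
        keyword_df = self.sql_context.createDataFrame(
            [(uid, kw.strip("_").split("_"), tag)
             for uid, kw, tag in zip(uid_list, keywords_list, user_tag_list)],
            ["uid", "keywords", "user_tag"])
        hashing_tf = HashingTF(inputCol="keywords", outputCol="features", numFeatures=2048)
        feature_df = hashing_tf.transform(keyword_df)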
Example No. 19
import pyspark
from pyspark import SparkContext, SQLContext

sc = SparkContext.getOrCreate()
sql = SQLContext(sc)

Student = sql.createDataFrame([('009001', 'Anuj', '70%', 'B.tech(cs)'),
                               ('009002', 'Sachin', '80%', 'B.tech(cs)'),
                               ('008005', 'Yogesh', '94%', 'MCA'),
                               ('007014', 'Ananya', '98%', 'MCA')],
                              ['Roll_Num', 'Name', 'Percentage', 'Department'])
Student.show()
Example No. 20
def read_csv(sc):
    sql = SQLContext(sc)
    df = sql.read.csv("./filteredC.small.training",
                      header=True,
                      inferSchema=True)
    return df
Example No. 21
    master = "local[*]"

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home

    #input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s" % day
    input = "/input/loginfowlog/02*"
    #output = "/impala/parquet/back"
    output = "/output"
    conf = (SparkConf()
            .setMaster(master)
            .setAppName("user_visit_day")
            #.set("spark.kryoserializer.buffer.mb", "256")
            .set("spark.sql.parquet.binaryAsString", "true"))
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    df = sqlContext.read.parquet(input)
    rdd = df.select('logintype', 'logtype', 'hosid', 'suppid', 'logtime', 'usermac')

    fields = [
        StructField('day', StringType(), True),
        StructField('mac', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('loginPage', IntegerType(), False),
        StructField('forwardPage', IntegerType(), False),
        StructField('arrivePage', IntegerType(), False)
    ]
    schema = StructType(fields)

    # compute pages
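    # The sample stops here. A hedged sketch of the "compute pages" step (an assumption,
    # not the original code), using hypothetical logtype codes '1'/'2'/'3' for the login,
    # forward and arrive pages and assuming the selected columns are strings:
    def to_page_counts(row):
        day = str(row.logtime)[:8]
        return ((day, row.usermac, row.hosid),
                (1 if row.logtype == '1' else 0,
                 1 if row.logtype == '2' else 0,
                 1 if row.logtype == '3' else 0))

    counts = rdd.rdd.map(to_page_counts).reduceByKey(
        lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
    page_df = sqlContext.createDataFrame(counts.map(lambda kv: kv[0] + kv[1]), schema)
    page_df.write.parquet(output + '/user_visit_day')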
Example No. 22
class Analysiser:
    def __init__(self):

        conf = SparkConf().setAppName('Analysiser').set("spark.sql.crossJoin.enabled", True)
        self.sc = SparkContext(conf=conf)
        self.sqlctx = SQLContext(self.sc)

        self.pdf = pd.read_excel('data_o.xlsx', sheetname=0, header=0,  parse_cols=[9, 10, 23, 32, 45, 60])

        schema = StructType([
            StructField('TI',StringType(),True),
            StructField('SO', StringType(), True),
            StructField('C1', StringType(), True),
            StructField('TC', StringType(), True),
            StructField('PY', StringType(), True),
            StructField('UT', StringType(), True)

        ])

        df = self.sqlctx.createDataFrame(self.pdf,schema)

        def m_clean(x):
            try:
                py = int(x['PY'])
                tc = int(x['TC'])
                authors = x['C1']

                if py>=2006 and py<=2016 and authors != '':

                    first_author = authors[1:].split(']')[0].split('; ')[0]

                    return [(x['TI'],x['SO'],x['C1'],first_author,x['TC'],int(x['PY']),x['UT']),]
                else:
                    return []
            except Exception as e:
                return []

        schema2 = StructType([
            StructField('TI', StringType(), True),
            StructField('SO', StringType(), True),
            StructField('C1', StringType(), True),
            StructField('first_author', StringType(), True),
            StructField('TC', StringType(), True),
            StructField('PY', IntegerType(), True),
            StructField('UT', StringType(), True)

        ])
        self.df = self.sqlctx.createDataFrame(df.rdd.flatMap(m_clean),schema2)


        #self.df.show()


    # def parse(self):
    #     .wb = load_workbook('data_min.xlsx')
    #     sheet = wb.get_sheet_by_name('all')
    #     new_wb = openpyxl.Workbook()
    #     new_sheet = new_wb.create_sheet('simple')
    #     new_sheet.append(['TI', 'SO', 'C1', 'TC', 'PY', 'UT'])
    #
    #
    #     for row in list(sheet.rows)[2:100]:
    #         r = [c.value for c in row]
    #         r_min = [r[9],r[10],r[23],r[32],r[45],r[60]]
    #         print(r_min)
    #         new_sheet.append(r_min)
    #     new_wb.save('export.xlsx')

    def parse2(self):

        self.df.toPandas().to_excel('output.xls')





    def func1(self):
        df = self.df.toPandas()
        #print(df.head())
        plt.figure(figsize=(9, 6))
        plt.scatter(df['PY'], df['TC'], s=25, alpha=0.4, marker='o')
        # c: color of the scatter points
        # s: size of the points
        # alpha: degree of transparency
        plt.show()


    def func2(self):
        df = self.df
        first_author_df = df.select('first_author','PY').groupBy('first_author').max('PY').withColumnRenamed('max(PY)','maxPY')

        self.sqlctx.registerDataFrameAsTable(df.drop('first_author'),'df')
        self.sqlctx.registerDataFrameAsTable(first_author_df,'fa')

        sql = "select first_author,TC from (fa outer join df on C1 like CONCAT('%',first_author,'%'))"

        join = self.sqlctx.sql(sql)
        join_rdd = join.rdd.map(lambda x:(x['first_author'],x['TC'])).reduceByKey(lambda x,y:x+'-'+y)

        # for r in join_rdd.collect():
        #     print(r)

        def m_h(x):
            flag = False
            h = 0
            cts = [int(x) for x in x[1].split('-')]
            cts.sort(reverse=True)
            for i in range(1, len(list(cts))+1):
                if i >= cts[i-1]:
                    flag = True
                    h = i # TODO or cts[i-1]
                    break

            if flag:
                return [(x[0],h),]
            else:
                return []

        author_h_rdd = join_rdd.flatMap(m_h)
        author_h_df = self.sqlctx.createDataFrame(author_h_rdd,['first_author','h'])
        final_df = author_h_df.join(first_author_df,'first_author','left_outer').select('h','maxPY')
        pdf = final_df.toPandas()

        plt.figure(figsize=(9, 6))
        plt.scatter(pdf['maxPY'], pdf['h'], s=25, alpha=0.4, marker='o')
        # c: color of the scatter points
        # s: size of the points
        # alpha: degree of transparency
        plt.show()
Example No. 23
            ids.append(cluster[j])

    bag_words = {}
    for i in ids:
        bag_words[i]=(float(ids.count(i))/len(ids))
     # Create a SparseVector
    sv = Vectors.sparse(2000, bag_words)
    return sv


input = sys.argv[1]
output = sys.argv[2]

conf = SparkConf().setAppName('bag_words')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

with open('clusterFinal.pickle', 'rb') as f:
	cluster=pickle.load(f)


schema = StructType([
    StructField('reviewText', StringType(), False),StructField('overall', FloatType(), False),StructField('reviewTime', StringType(), False)
])

df = sqlContext.read.json(input, schema=schema)
df.registerTempTable('review_table')
sd=sqlContext.sql("""
    SELECT reviewText FROM review_table
""")
fin=sd.rdd.map(lambda x: str(x.reviewText)).map(clean_words)
Example No. 24
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import DataFrameStatFunctions, DataFrame
from pyspark.sql.types import *

# initialize the cluster
conf = SparkConf()
#conf.set("spark.driver.memory", "16g")
#conf.set("spark.driver.cores", 4)
#conf.set("spark.driver.memoryOverhead", 0.9)
#conf.set("spark.executor.memory", "32g")
#conf.set("spark.executor.cores", 12)
#conf.set("spark.jars", "/home/jaa6766")
sc = SparkContext(master="local[*]",
                  sparkHome="/usr/local/spark/",
                  appName="tarea-mge-8-parqueteo",
                  conf=conf)
spark = SQLContext(sc)

# read the parquet file
data = spark.read.parquet("s3a://jorge-altamirano/profeco/data.parquet")


# function that does the same as summary(), but
# ironically faster than Spark (this one is for numeric columns)
def summary_j3a(col):
    min1 = data.select(min(data[col]).alias("min")).toPandas().transpose()
    max1 = data.select(max(data[col]).alias("max")).toPandas().transpose()
    avg1 = data.select(mean(data[col]).alias("avg")).toPandas().transpose()
    std1 = data.select(stddev(
        data[col]).alias("stddev")).toPandas().transpose()
    probs = [0.25, 0.5, 0.75]
    qnt1 = pd.DataFrame(  \
Example No. 25
def main():

    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # 150,000/ 3000 = avg 50 comments/topic
        onePst = bcURL.value[randint(0, 3000)]
        return  cmtTuple + (onePst[0], onePst[1]) # adding title and url


    if (smallBatch): 
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]') 
        year = 2007
        month = 12
        users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                             .map(addTitleURL)
                             #.repartition(REPARTITION_SIZE)
        users_row.foreachPartition(insert_into_cassandra)

        # calculate user relationship graph
        # (URL, user) tuple
        post2user = users_row.map(lambda x: (x[10], x[0]))
        #graph     = post2user.join(post2user)\                       # self join to find user relationship by posts
        #                     .filter(lambda x: x[1][0] != x[1][1])\  # remove all self linked relationship
        #                     .map(makeAscOrder)\                     # make to asc order by user name
        #                     .distinct()\        # remove duplicated user pairs, because the relationship is mutual
        #                     .map(lambda x: (x[1], 1))\              # ready tho count number of common edges
        #                     .reduceByKey(lambda x, y: x+y)\         # count total number for every edge/relationship
        #                     .map(lambda x: (x[0][0], x[1], x[0][1]))# flatten and ready to write table
        graph     = post2user.join(post2user)\
                             .filter(lambda x: x[1][0] != x[1][1])\
                             .map(makeAscOrder)\
                             .distinct()\
                             .map(lambda x: (x[1], 1))\
                             .reduceByKey(lambda x, y: x+y)\
                             .map(lambda x: (x[0][0], x[1], x[0][1]))
        graph.foreachPartition(insert_graph)

    else:

        for key in bucket.list():
            if '-' not in key.name.encode('utf-8'): # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:] 
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year) and int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]') 
                                                   #   0                     1                        2                3            4          5          6          7              8           9 (title)   10(url)
            users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                                 .map(addTitleURL)
                                 #.repartition(REPARTITION_SIZE)
            users_row.foreachPartition(insert_into_cassandra)

            # calculate user relationship graph
            # (URL, user) tuple
            post2user = users_row.map(lambda x: (x[10], x[0]))
            #graph     = post2user.join(post2user)\                       # self join to find user relationship by posts
            #                     .filter(lambda x: x[1][0] != x[1][1])\  # remove all self linked relationship
            #                     .map(makeAscOrder)\                     # make to asc order by user name
            #                     .distinct()\        # remove duplicated user pairs, because the relationship is mutual
            #                     .map(lambda x: (x[1], 1))\              # ready tho count number of common edges
            #                     .reduceByKey(lambda x, y: x+y)\         # count total number for every edge/relationship
            #                     .map(lambda x: (x[0][0], x[1], x[0][1]))# flatten and ready to write table
            graph     = post2user.join(post2user)\
                                 .filter(lambda x: x[1][0] != x[1][1])\
                                 .map(makeAscOrder)\
                                 .distinct()\
                                 .map(lambda x: (x[1], 1))\
                                 .reduceByKey(lambda x, y: x+y)\
                                 .map(lambda x: (x[0][0], x[1], x[0][1]))
                                 #.repartition(REPARTITION_SIZE)
            graph.foreachPartition(insert_graph)

    sc.stop()
Example No. 26
# This is an edited version of https://github.com/minhptx/iswc-2016-semantic-labeling, which was edited to use it as a baseline for Tab2KG (https://github.com/sgottsch/Tab2KG).

import os

from gensim.models import Word2Vec

from pyspark import SparkContext, SQLContext

sc = SparkContext()
sql_context = SQLContext(sc)

root_dir = os.path.abspath(os.path.join(os.path.realpath(__file__), '..'))
data_dir = os.path.join(root_dir, "data/datasets")
train_model_dir = os.path.join(root_dir, "data/train_models")

# word2vec = Word2Vec.load_word2vec_format(os.path.join("/Users/minhpham/tools/", 'GoogleNews-vectors-negative300.bin'), binary=True)

file_write = open('debug.txt', 'w')

logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.FATAL)
logger.LogManager.getLogger("akka").setLevel(logger.Level.FATAL)
Example No. 27
    select_userlogin_repeat = "select get_date(login_time) as day,hos_id,mac,login_time from wxcity_userlogin_info where login_time >= '%s 00:00:00' and login_time <='%s 23:59:59' order by day,hos_id,mac,login_time"
    select_userlogin_repeat_sta = "select day,hos_id,sum(t2),sum(t5),sum(t10),sum(t30),sum(t60) from repeat_login_list group by day,hos_id"


if __name__ == '__main__':

    if len(sys.argv) != 5:
        print("Usage: spark_streaming.py <master> <begin> <end> <input>", file=sys.stderr)
        exit(-1)

    master, time_begin, time_end, input = sys.argv[1:]
    input_path = input + '/' + time_begin + '.csv'
    logger.info("--->" + master + " " + input_path)

    sc = SparkContext(master, 'wxcity_userlogin_repeat_app')
    sql_context = SQLContext(sc)

    lines = sc.hadoopFile(input,
                          'org.apache.hadoop.mapred.TextInputFormat',
                          'org.apache.hadoop.io.LongWritable',
                          'org.apache.hadoop.io.Text'
                          )

    rs_tuples = MysqlDao().findWithQuery(ConfigPortalSql.select_mysql_hos_gw_sup)
    gwid_hosid_dict = {}
    for r in rs_tuples:
        hos_id = str(r[0])
        gw_id = r[1]
        gwid_hosid_dict[gw_id] = hos_id
    logger.debug('-->gwid_hosid:' + str(gwid_hosid_dict.__len__()))
    users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
Example No. 28
from pyspark import SparkContext, SQLContext
from pyspark import SparkFiles
from pyspark.sql import *
from pyspark.sql.functions import *
sc = SparkContext(appName="phoenix-spark-sparkapi")
sqlContext = SQLContext(sc)

# Approach 1 - Using the Spark driver
zkUrl = 'zookeeper.example.com:2181/hbase-secure'
df_phoenixTable = sqlContext.read\
.format('org.apache.phoenix.spark')\
.option('table', 'schema.table')\
.option('zkUrl',zkUrl)\
.load()

## Use Spark API to filter
df_phoenixTable_filtered=df_phoenixTable.filter((df_phoenixTable['col1'] == lit('98765')) & \
(df_phoenixTable['col2'] == 'A') & (df_phoenixTable['col3'] == lit('123456')))
df_phoenixTable_filtered.show()

## Use SQL to filter
df_phoenixTable.registerTempTable("df_phoenixTable")
df_phoenixTable_sql = sqlContext.sql(
    'SELECT col1,col2,col3 FROM df_phoenixTable WHERE col1=98765 AND col2=\'A\' AND col3=\'123456\''
)
df_phoenixTable_sql.show()

# Sample upsert back to Phoenix
df_phoenixTable_sql.write.format('org.apache.phoenix.spark').mode(
    'overwrite').option('table', 'schema.targetTable').option('zkUrl',
                                                              zkUrl).save()
Example No. 29
    ymd = pro_time.strftime("%Y%m%d")

    master = "spark://hadoop:7077"
    appName = "spark_loginflowlog"
    #input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s*" % ym
    input = '/input/loginfowlog/*'

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home
    conf = (SparkConf()
            .setMaster(master)
            .setAppName(appName)
            .set("spark.sql.parquet.binaryAsString","true")
            )
    sc = SparkContext(conf = conf)
    sql_context = SQLContext(sc)
    sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())

    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
    #_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
    _sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
    rs_df = sql_context.sql(_sql)
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))

    lists = []
    for r in rs:
        usermac = r[0]
        days = r[1]
        t = (usermac,days)
Example No. 30
"""
Given an RDD of dictionaries and a column to decile,
add decile of column to each row's dictionary by
pre-computing decile thresholds to avoid out of memory errors
when sorting an RDD with too many columns.
"""

from pyspark import SparkContext, SQLContext#, HiveContext
from datetime import datetime
sc = SparkContext()
sqlContext = SQLContext(sc)
# hiveContext = HiveContext(sc)
sc.setLogLevel("FATAL")

#
# setup
#

import numpy as np

# create random data
n = 52
prices = [float(list(5 + abs(np.random.randn(1)) * 100)[0]) 
	for i in range(n)]
dates = [datetime(year=np.random.randint(2000, 2016), 
	month=np.random.randint(1, 12), 
	day=np.random.randint(1, 28)).date() for i in range(n)]
groups = [np.random.randint(1, 100) for i in range(n)]
data = [{"price": price, "date": _date, "group": group} 
	for price, _date, group in zip(prices, dates, groups)]
df = sqlContext.createDataFrame(data)
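#
# decile computation (a hedged sketch; the original example stops after building df)
#

# Pre-compute the nine decile cut points once with approxQuantile (Spark 2.x DataFrame
# API) instead of sorting the whole RDD, then tag each row's dictionary with its decile.
# This is an assumed continuation of the idea described in the docstring above.
thresholds = df.approxQuantile("price", [i / 10.0 for i in range(1, 10)], 0.01)

def add_decile(row):
    d = row.asDict()
    d["price_decile"] = 1 + sum(1 for t in thresholds if d["price"] > t)
    return d

deciled_rdd = df.rdd.map(add_decile)
print(deciled_rdd.take(3))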
Example No. 31
# In[47]:

#from pyspark.sql import Row
#from pyspark.sql.types import *

# In[98]:

dataRow.take(1)

# In[94]:

dataRow.distinct().count()

# In[38]:

sqlContext = SQLContext(sc)

# In[ ]:

dfData = sqlContext.createDataFrame(dataRow).toDF("basket_hash", "basket_key",
                                                  "payload")

# In[40]:

dfData.show()

# In[41]:

dfData.printSchema()

# In[43]:
Example No. 32
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 21 09:31:37 2016

@author: rsk
"""
from pyspark import SparkContext
from pyspark import SQLContext
sc = SparkContext("local","recommendationEngineApp")
sqlContext = SQLContext(sc)

from pyspark.sql import SQLContext,Row
#from pyspark.sql import Functions as F

dataDir = "/home/rsk/Documents/Spark"

userData = sc.textFile(dataDir+"/ml-100k/u.user").map(lambda x : x.split("|"))
movieData = sc.textFile(dataDir+"/ml-100k/u.item").map(lambda x : x.split("|"))
ratingData = sc.textFile(dataDir+"/ml-100k/u.data").map(lambda x : x.split("\t"))

#%%

ratingDataDF = ratingData.map(lambda x : Row(userID = int(x[0]),
                        movieID = int(x[1]),
                        rating=float(x[2]),
                        timestamp = int(x[3])))
ratingDataDF = sqlContext.createDataFrame(ratingDataDF)

userDataDF = userData.map(lambda x : Row(userID=int(x[0]),
                                        age = int(x[1]),
                                        gender = x[2],
Example No. 33
if __name__ == '__main__':
    # --set datetime
    DAY_OFFSET = 1
    now = datetime.datetime.now()
    pro_time = now - datetime.timedelta(days=DAY_OFFSET)
    day = pro_time.strftime("%Y%m%d")

    master = "spark://hadoop:7077"
    appName = "spark_pageflow_outflow"
    input = "/impala/parquet/site/site-pageflowv1/dat=%s" % day

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home

    sc = SparkContext(master, appName)
    sql_context = SQLContext(sc)
    sql_context.registerFunction("to_day", lambda x: mill_date_str(x), StringType())
    sql_context.registerFunction("to_str", lambda x: bytearray_str(x), StringType())

    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "site_pageflowv1")

    _sql = "select to_str(url),to_day(createtime) day,count(1) pv,count(distinct to_str(guuid)) uv " \
           "from site_pageflowv1 where dat= %s and to_str(name)='outflow' " \
           "group by to_str(url),to_day(createtime)" % day

    rs_df = sql_context.sql(_sql)
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))

    list = []
Example No. 34
import pyspark
from pyspark import SQLContext
from pyspark.sql.functions import udf
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, FloatType, StringType, IntegerType

def toInt(s):
        if isinstance(s, str) == True:
            st = [str(ord(i)) for i in s]
            return(int(''.join(st)))
        else:
            return None


if __name__ == '__main__':

    conf = pyspark.SparkConf() 
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    spark = SQLContext(sc)

    schema = StructType([
        StructField("sales", FloatType(),True),    
        StructField("employee", StringType(),True),
        StructField("ID", IntegerType(),True)
    ])

    data = [[ 10.2, "Fred",123]]

    df = spark.createDataFrame(data,schema=schema)

    colsInt = udf(lambda z: toInt(z), IntegerType())
    spark.udf.register("colsInt", colsInt)

    df2 = df.withColumn( 'semployee',colsInt('employee'))
Example No. 35
from pyspark import SparkContext, SparkConf, SQLContext

conf = SparkConf().setAppName("pyspark-readFromJSONinHDFS-py")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

departmentsJson = sqlContext.jsonFile(
    "/user/joseluisillana1709/department_json/department.json")
departmentsJson.registerTempTable("departmentsTable")
departmentsData = sqlContext.sql("select * from departmentsTable")
for rec in departmentsData.collect():
    print(rec)

#Writing data in json format
departmentsData.toJSON().saveAsTextFile(
    "/user/joseluisillana1709/pruebas_spark/result/departmentsJson")
Example No. 36
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.types import StructType, StructField, StringType

conf = SparkConf().setMaster("local").setAppName("CustomerOrders")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
df = sqlContext.read.json(
    "https://s3-eu-west-1.amazonaws.com/dwh-test-resources/recipes.json")

df.show()
Example No. 37
from pyspark import SQLContext, SparkConf, SparkContext
from pyspark.sql.types import *

#setting the configurations for the SparkConf object here
conf = (SparkConf()
         .setMaster("local[4]")
         .setAppName("convert.py")
        .set("spark.executor.memory", "1g"))

#creating the SparkConf object here
sc = SparkContext(conf = conf)

#creating the sqlContext that will be used
sqlContext = SQLContext(sc)

#reading the parquet file

#Change this line to be the directory where the parquet file exists
parquetFile = sqlContext.read.parquet('data/test2') 


parquetFile.registerTempTable("parquetFile")

#Queries are made from the base + command.

#base SELECTs the elements you are interested in, up to the WHERE clause
base = "SELECT * FROM parquetFile WHERE"

#command is the query you make.
command = ' ip_len >= 1500'
test = sqlContext.sql(base + command)
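#An assumed follow-up (not in the original snippet): run the combined query and inspect it.
test.show(10)
print("rows with%s: %d" % (command, test.count()))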
Example No. 38
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
from datetime import tzinfo, datetime
import pytz
import re
from pyspark import SparkContext, SQLContext

sc = SparkContext()

sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

###############################################

sections_dataframe = sqlContext.createDataFrame(
    sections_titles.flatMap(
        lambda r: [Row(id=r.id, title=t) for t in r.sections]))

counts_rdd = sections_dataframe.map(lambda x: (x.title, 1)).reduceByKey(
    lambda a, b: a + b).map(lambda x: Row(sections_title=x[0], count=x[1]))
counts_dataframe = sqlContext.createDataFrame(counts_rdd).filter("count > 10")

joined = sections_dataframe.join(
    counts_dataframe,
    sections_dataframe.title == counts_dataframe.sections_title)

filtered = joined.map(lambda r: (r.id, [r.title])).reduceByKey(
    lambda a, b: a + b)
Example No. 39
class DataHandler:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("BandCard")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)

        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load()
        return df




    def prepare_life_cycle(self, year, season):
        '''
        Prepare the life-cycle data.
        Fetch the daily AUM data from t_CMMS_ASSLIB_ASSET.
        prepare data

        saum1 (last season sum aum)
        saum2 (current season sum aum)
        aum_now
        account_age (months)
        last_tr_date (days)

        :param year:
        :param season: 1,2,3,4
        :return:
        '''

        # compute the months
        print('---------------------- Life cycle - Start ----------------------')

        print('Preparing life-cycle data...')
        print('Computing months...')

        if season == 1:
            # date1: months of the current season
            date1 = [str(year) + '-01', str(year) + '-02', str(year) + '-03']

            # date2: months of the previous season
            date2 = [str(year - 1) + '-10', str(year - 1) + '-11', str(year - 1) + '-12']

        elif season == 4:
            date1 = [str(year) + '-10', str(year) + '-11', str(year) + '-12']
            date2 = [str(year) + '-07', str(year) + '-08', str(year) + '-9']

        else:
            date1 = [str(year) + '-0' + str(3 * season - 2), str(year) + '-0' + str(3 * season - 1),
                     str(year) + '-0' + str(3 * season)]
            date2 = [str(year) + '-0' + str(3 * season - 5), str(year) + '-0' + str(3 * season - 4),
                     str(year) + '-0' + str(3 * season - 3)]

        print('Current season months (new):', date1)
        print('Previous season months (old):', date2)

        # load the AUM table
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').cache()

        # stitch together the three month-end snapshots of each season
        season_new = aum.filter(aum.STAT_DAT == date1[0]).unionAll(aum.filter(aum.STAT_DAT == date1[1])).unionAll(
            aum.filter(aum.STAT_DAT == date1[2]))
        season_old = aum.filter(aum.STAT_DAT == date2[0]).unionAll(aum.filter(aum.STAT_DAT == date2[1])).unionAll(
            aum.filter(aum.STAT_DAT == date2[2]))

        # compute each season's AUM
        aum_season_old = season_old.select('CUST_NO', season_old.AUM.alias('AUM1')).groupBy('CUST_NO').sum('AUM1')
        aum_season_new = season_new.select('CUST_NO', season_new.AUM.alias('AUM2')).groupBy('CUST_NO').sum('AUM2')

        # outer join the two seasons
        '''
        +-----------+---------+---------+
        |    CUST_NO|sum(AUM2)|sum(AUM1)|
        +-----------+---------+---------+
        |81005329523|     null|294844.59|
        |81011793167|     null|   365.20|
        |81015319088|     null|  9640.96|
        +-----------+---------+---------+
        '''
        union_season = aum_season_old.join(aum_season_new, 'CUST_NO', 'outer')

        # pick out the current AUM
        temp_result = aum.select('CUST_NO', 'AUM', 'STAT_DAT').groupBy('CUST_NO', 'STAT_DAT').sum('AUM').sort(
            'CUST_NO').sort(aum.STAT_DAT.desc())
        temp_result.select('CUST_NO', temp_result['sum(AUM)'].alias('AUM'), 'STAT_DAT').registerTempTable('group_in')

        aum_now_sql = "select CUST_NO,first(AUM) as AUM_NOW from group_in group by CUST_NO"

        aum_now = self.sqlctx.sql(aum_now_sql)
        # drop the temp table
        self.sqlctx.dropTempTable('group_in')

        # join with the current AUM
        union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer')

        # compute how long each account has been open (months)
        # load the account table
        account = self.load_from_mysql('t_CMMS_ACCOUNT_LIST').cache()
        account.select('CUST_NO', 'OPEN_DAT').registerTempTable('account')
        account_age_aql = "select  CUST_NO, first(ACCOUNT_AGE) as ACCOUNT_AGE  from " \
                          "(select CUST_NO, round(datediff(now(), OPEN_DAT) / 30) as ACCOUNT_AGE " \
                          "from account order by CUST_NO, ACCOUNT_AGE desc ) as t group by CUST_NO"

        account_age = self.sqlctx.sql(account_age_aql)

        # calculate last tran date
        account_1 = account.select('CUST_NO', 'ACC_NO15')
        detail = self.load_from_mysql('t_CMMS_ACCOUNT_DETAIL').select('ACC_NO15', 'TRAN_DAT')
        a_d = account_1.join(detail, 'ACC_NO15', 'outer')
        a_d.filter(a_d.CUST_NO != '').registerTempTable('adtable')

        last_tr_date_sql = "select CUST_NO,first(TRAN_DAT) as LAST_TR_DATE from (select CUST_NO,TRAN_DAT from adtable order by TRAN_DAT desc) as t group by CUST_NO"

        last_tr_date = self.sqlctx.sql(last_tr_date_sql)

        # 联合 season   aum_now    account_age     last_tr_date
        unions = union_season_aumnow.join(account_age, 'CUST_NO', 'outer').join(last_tr_date, 'CUST_NO', 'outer')

        # 清除缓存表
        self.sqlctx.dropTempTable('account')
        self.sqlctx.dropTempTable('adtable')
        self.sqlctx.clearCache()

        # Insert the results into the temp table
        print('Inserting results into temp table: t_CMMS_TEMP_LIFECYCLE...')
        insert_lifecycle_sql = "replace into t_CMMS_TEMP_LIFECYCLE(CUST_NO,SAUM1,SAUM2,INCREASE,ACCOUNT_AGE,AUM_NOW,LAST_TR_DATE) values(%s,%s,%s,%s,%s,%s,%s)"

        # Write buffer
        temp = []
        for row in unions.collect():
            row_dic = row.asDict()

            if len(temp) >= 1000:  # flush the batch to the database
                self.mysql_helper.executemany(insert_lifecycle_sql, temp)
                temp.clear()

            # Load this row into the buffer

            try:
                # Compute the growth rate (stored as a ratio, e.g. 0.2 means 20% growth)
                increase = (row_dic['sum(AUM2)'] - row_dic['sum(AUM1)']) / row_dic['sum(AUM1)']
            except Exception:
                increase = 0

            # Account age in months; if unknown, treat it as more than 6 months
            if row_dic['ACCOUNT_AGE'] is None:
                row_dic['ACCOUNT_AGE'] = 7

            # Last transaction date
            ltd = row_dic['LAST_TR_DATE']
            if ltd is not None:
                try:
                    ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')
                except Exception:
                    ltd = ltd[:4] + '-' + ltd[4:6] + '-' + ltd[6:]
                    ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')

                days = (datetime.datetime.now() - ltd).days
            else:
                days = 366

            temp.append((row_dic['CUST_NO'], row_dic['sum(AUM1)'], row_dic['sum(AUM2)'], increase,
                         row_dic['ACCOUNT_AGE'], row_dic['AUM_NOW'], days))

        if len(temp) != 0:
            self.mysql_helper.executemany(insert_lifecycle_sql, temp)
            temp.clear()

    def calculate_life_cycle(self):

        '''
        Determine the life-cycle stage from the change in AUM.
        :return:
        '''

        print('Starting life-cycle calculation...')
        life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').cache()

        def clcmap(line):
            cust_no = line['CUST_NO']
            account_age = line['ACCOUNT_AGE']
            last_tr_date = line['LAST_TR_DATE']
            aum_now = line['AUM_NOW']
            increase = line['INCREASE']

            period = 0
            if aum_now is None:
                period = 9  # unknown
            elif aum_now < 1000 and last_tr_date > 365:
                period = 3  # churn period (no transaction for over a year)
            else:
                # INCREASE is stored as a ratio by prepare_life_cycle(), so 0.2 means 20%
                if increase > 0.2 or account_age < 6:
                    period = 0  # growth period
                elif -0.2 <= increase <= 0.2:
                    period = 1  # mature period
                else:
                    period = 2  # stable period

            return period, cust_no


        map_result = life_cycle.map(clcmap).collect()

        # clear the life_cycle cache
        self.sqlctx.clearCache()

        temp = []
        print('Updating results in temp table: t_CMMS_TEMP_LIFECYCLE...')
        update_life_period_sql = "update t_CMMS_TEMP_LIFECYCLE set PERIOD = %s where CUST_NO = %s"
        for row in map_result:

            if len(temp) >= 1000:
                self.mysql_helper.executemany(update_life_period_sql, temp)
                temp.clear()
            temp.append(row)

        if len(temp) != 0:
            self.mysql_helper.executemany(update_life_period_sql, temp)
            temp.clear()

    def lifecycle_to_real_table(self, year, season):
        '''
        Write the life-cycle data from the temp table into the official table.
        :return:
        '''

        print('Writing life-cycle data into the official table...')
        life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').select('CUST_NO', 'PERIOD')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM')

        union = life_cycle.join(cust_info, 'CUST_NO', 'left_outer').cache()

        temp = []
        sql = "replace into t_CMMS_ANALYSE_LIFE(CUST_NO,CUST_ID,CUST_NM,LIFE_CYC,QUARTER,UPDATE_TIME) values(%s,%s,%s,%s,%s,now())"
        quarter = str(year) + '-' + str(season)
        for row in union.collect():

            if len(temp) >= 1000:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()

            cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else '0'
            temp.append((row['CUST_NO'], cust_id, row['CUST_NAM'], row['PERIOD'], quarter))

        if len(temp) != 0:
            self.mysql_helper.executemany(sql, temp)
            temp.clear()

        self.sqlctx.clearCache()

    def run_life_cycle(self,year,season):
        '''
        Run the full life-cycle pipeline:
        1. Prepare the life-cycle data and compute AUM and its rate of change
        2. Determine the life-cycle stage from the rate of change
        3. Move the data from the temp table to the real table
        :param year:
        :param season:
        :return:
        '''
        self.prepare_life_cycle(year,season)
        self.calculate_life_cycle()
        self.lifecycle_to_real_table(year,season)

#------------------------------------------------------------------------ End of life cycle ------------------------------------------------------------------------#

    def customer_value(self, year, half_year):
        '''
        Calculate customer value.
        :param year: which year to calculate
        :param half_year: 0 for months 1-6, 1 for months 7-12
        :return:
        '''

        print('---------------------------Customer Value-Start--------------------------')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM').cache()
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'STAT_DAT', 'AUM', 'ASS_TYPE').cache()

        base = half_year * 6

        aum_slot_filter = None

        for i in range(1, 7):
            i = base + i
            if i < 10:
                i = '0' + str(i)
            else:
                i = str(i)
            slot = str(year) + '-' + i

            slot_filter = aum.filter(aum.STAT_DAT == slot)
            if aum_slot_filter is None:
                aum_slot_filter = slot_filter
            else:
                aum_slot_filter = aum_slot_filter.unionAll(slot_filter)

        # CUST_NO sum(AUM)
        huoqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', aum_slot_filter['AUM'].alias('AUM_HQ')).filter(
            aum_slot_filter.ASS_TYPE == '1').groupBy('CUST_NO').sum('AUM_HQ')
        dingqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', (aum_slot_filter.AUM * 0.8).alias('AUM_DQ')).filter(
            aum_slot_filter.ASS_TYPE == '2').groupBy('CUST_NO').sum('AUM_DQ')

        # Demand-deposit (HQ) and time-deposit (DQ) AUM are now computed: sum(AUM_HQ), sum(AUM_DQ)
        j = huoqi_aum.join(dingqi_aum, 'CUST_NO', 'outer')
        # j.show()


        # Clear the existing data
        self.mysql_helper.execute('truncate core.t_CMMS_ANALYSE_VALUE')

        # Join with the other tables
        all_col = j.join(cust_info, 'CUST_NO', 'outer')

        print(j.count(), cust_info.count())

        # all_col.show()

        # Determine the customer rank from the customer value
        def calculate_rank(value):
            if value < 1000:
                return 0
            elif value < 10000:
                return 1
            elif value < 100000:
                return 2
            elif value < 500000:
                return 3
            elif value < 2000000:
                return 4
            elif value < 5000000:
                return 5
            else:
                return 6
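
        # Worked example of the rank thresholds above (illustrative values):
        #   calculate_rank(500)    -> 0    calculate_rank(50000)   -> 2
        #   calculate_rank(250000) -> 3    calculate_rank(6000000) -> 6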

        temp = []
        print('Replacing data into the official table...')
        update_value_sql = "replace into t_CMMS_ANALYSE_VALUE(CUST_ID,CUST_NO,CUST_NM,CUST_VALUE,CUST_RANK,SLOT,UPDATE_TIME) values(%s,%s,%s,%s,%s,%s,now())"
        for row in all_col.collect():

            if len(temp) >= 1000:
                self.mysql_helper.executemany(update_value_sql, temp)
                temp.clear()

            val_dq = row['sum(AUM_DQ)'] if row['sum(AUM_DQ)'] is not None else 0
            val_hq = row['sum(AUM_HQ)'] if row['sum(AUM_HQ)'] is not None else 0

            cust_val = float(val_dq) + float(val_hq)

            cust_rank = calculate_rank(cust_val)

            slot = str(year) + '-' + str(half_year)
            cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else 1
            temp.append((cust_id, row['CUST_NO'], row['CUST_NAM'], cust_val, cust_rank, slot))

        if len(temp) != 0:
            self.mysql_helper.executemany(update_value_sql, temp)
            temp.clear()

    def aum_total(self):
        '''
        Compute the total AUM; data for t_CMMS_ASSLIB_ASSTOT
        :return:
        '''
        print('---------------------------Total Assets-Start--------------------------')
        # TODO: switch t_CMMS_ASSLIB_ASSET_c to the official table t_CMMS_ASSLIB_ASSET
        df_asset = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'CUST_ID', 'STAT_DAT', 'AUM', 'CUR',
                                                                      'ACC_NAM').cache()
        # print(df_asset.count(), df_asset.columns)

        other_col = df_asset.select('CUST_NO', 'CUST_ID', 'CUR', 'ACC_NAM').distinct()
        # print(other_col.count(),other_col.columns)

        aum = df_asset.select('CUST_NO', 'STAT_DAT', 'AUM')
        # print(aum.count(), aum.columns)

        aum = aum.select('CUST_NO', 'STAT_DAT', 'AUM').groupBy(['CUST_NO', 'STAT_DAT']).sum('AUM').sort(
            ['CUST_NO', aum.STAT_DAT.desc()]) \
            .groupBy('CUST_NO').agg({'sum(AUM)': 'first', 'STAT_DAT': 'first'})
        # print(aum.count(), aum.columns)


        total = aum.select('CUST_NO', aum['first(sum(AUM))'].alias('AUM'), aum['first(STAT_DAT)'].alias('STAT_DAT')). \
            join(other_col, 'CUST_NO', 'left_outer').distinct()

        # total.filter(total.STAT_DAT == '2016-06-') .show()

        # prepare params
        def list_map(line):
            return line['CUST_ID'], line['CUST_NO'], line['ACC_NAM'], line['STAT_DAT'], line['CUR'], line['AUM']

        df = total.map(list_map)

        # clear old data
        self.mysql_helper.execute('truncate t_CMMS_ASSLIB_ASSTOT')
        sql = "insert into t_CMMS_ASSLIB_ASSTOT(CUST_ID,CUST_NO,ACC_NAM,STAT_DAT,CUR,AUM) values(%s,%s,%s,%s,%s,%s)"

        # execute sql
        self.mysql_helper.batch_operate(sql, df, 100)

    def debt_total(self):
        '''
        prepare data for total debt
        :return:
        '''

        print('---------------------------Total Liabilities-Start--------------------------')
        df_debt = self.load_from_mysql('t_CMMS_ASSLIB_DEBT').select('LOAN_ACC', 'CUST_NO', 'CUST_ID', 'CUST_NAM',
                                                                    'BAL_AMT', 'GRANT_AMT', 'CUR')
        df_debt = df_debt.filter(df_debt.LOAN_ACC != '')

        df_sum = df_debt.groupBy('CUST_NO').sum('GRANT_AMT', 'BAL_AMT')
        df_other = df_debt.groupBy('CUST_NO').agg({'CUST_ID': 'first', 'CUST_NAM': 'first', 'CUR': 'first'})

        df_total = df_sum.join(df_other, 'CUST_NO', 'left_outer').distinct()

        stat_dat = datetime.datetime.now().strftime('%Y%m%d')

        def m(line):
            return line['CUST_NO'], line['first(CUST_ID)'], line['first(CUST_NAM)'], line['first(CUR)'], line[
                'sum(GRANT_AMT)'], line['sum(BAL_AMT)'], stat_dat

        df = df_total.map(m)

        sql = "replace into t_CMMS_ASSLIB_DEBTOT(CUST_NO,CUST_ID,ACC_NAM,CUR,LOAN_AMT,BAL_AMT,STAT_DAT) values(%s,%s,%s,%s,%s,%s,%s)"

        self.mysql_helper.batch_operate(sql, df)




    def run(self):
        # Life cycle: year, quarter (1, 2, 3 or 4)
        self.run_life_cycle(2016, 2)

        # Customer value: 0 for the first half of the year, 1 for the second half
        self.customer_value(2016, 0)

        # Total assets
        self.aum_total()

        # Total liabilities
        self.debt_total()
Exemplo n.º 40
0
from pyspark import SparkContext, SQLContext
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType, IntegerType

sc = SparkContext()
sqlContext = SQLContext(sc)

####
# 1. Setup (10 points): Download the gbook file and write a function to load it in an RDD & DataFrame
####

# RDD API
# Columns:
# 0: place (string), 1: count1 (int), 2: count2 (int), 3: count3 (int)
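
# A minimal loader sketch for the RDD part of the exercise (an assumption, not from the
# original notebook): 'gbooks' is assumed to be a tab-separated file with the four columns above.
def load_gbooks_rdd(path='gbooks'):
    lines = sc.textFile(path)
    return lines.map(lambda l: l.split('\t')) \
                .map(lambda p: (p[0], int(p[1]), int(p[2]), int(p[3])))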


# Spark SQL - DataFrame API


####
# 5. Joining (10 points): The following program construct a new dataframe out of 'df' with a much smaller size.
####

schema = StructType([StructField("word", StringType(), True), 
	StructField("count1", IntegerType(), True),
	StructField("count2", IntegerType(), True),
	StructField("count3", IntegerType(), True)])


df = sqlContext.read.csv('gbooks', schema=schema, sep='\t')
    '''
    day = "20151212"
    master = "local[*]"

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home

    # logFile = 'hdfs://master:8020/impala/parquet/back/back-portal-loginflowlog/dat=' + day
    logFile = "/input/loginfowlog/02*"
    conf = (SparkConf()
            .setMaster(master)
            .setAppName("loginflowlog2mysql")
            # .set("spark.kryoserializer.buffer.mb", "256")
            .set("spark.sql.parquet.binaryAsString", "true"))
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    sqlContext.registerFunction("to_datestr", lambda x: longTime2str(x), StringType())
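    # `longTime2str` is defined in a part of the original script not shown here; a plausible
    # implementation (an assumption) would format a millisecond epoch as a date string, e.g.:
    #   def longTime2str(ms):
    #       return time.strftime('%Y-%m-%d', time.localtime(ms / 1000))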

    df = sqlContext.read.parquet(logFile)

    rdd = df.select('logintype', 'logtype', 'hosid', 'suppid', 'logtime', 'usermac')

    fields = [
        StructField('logintype', StringType(), True),
        StructField('logtype', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('suppid', StringType(), True),
        StructField('logtime', LongType(), True),
        StructField('usermac', StringType(), True)
    ]
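
    # The snippet is truncated here; a typical continuation (an assumption, not part of the
    # original) would wrap the fields above in a schema when rebuilding a DataFrame:
    #   schema = StructType(fields)
    #   out_df = sqlContext.createDataFrame(rdd.rdd, schema)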
Exemplo n.º 42
0
""" countWords.py"""
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.types import *
from pyspark.sql import Row

sc = SparkContext("local", "PhaseCalib App")
sqlContext = SQLContext(sc)
custom_schema = StructType([StructField("phase", FloatType(), False)])
df = sqlContext.read.options(
    header='true').schema(custom_schema).csv('new_phase_data.csv')
df.describe().show()


def calibrate(row):
    return Row(row['phase'] * 2)


calibrated_rdd = df.rdd.map(calibrate)

calibrated_df = sqlContext.createDataFrame(calibrated_rdd, custom_schema)
calibrated_df.describe().show()
calibrated_df.write.option("header", "true").csv("calibrated_phase_data.csv")

sc.stop()
Exemplo n.º 43
0
    logging.basicConfig(format='%(asctime)s %(message)s')
    nn_gridsearch = logging.getLogger('nn_gridsearch')
    nn_gridsearch.setLevel(logging.DEBUG)
    handler = logging.FileHandler('../logs/nn_gridsearch.txt')
    nn_gridsearch.addHandler(handler)
    nn_gridsearch.debug('-'*40)
    nn_gridsearch.debug('-'*40)
    nn_gridsearch.debug('Execution time: %s' % str(datetime.now()))

    # with open('~/.aws/credentials.json') as f:
    #     CREDENTIALS = json.load(f)

    sc = set_spark_context()

    conn = S3Connection()
    sqc = SQLContext(sc)
    sm = SparkModel(sc, conn, rdd_path='rdd.pkl')


    bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow)) \
            .sample(withReplacement=False, fraction=.5, seed=1)
    df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
    train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)
    results = []

    num_features = 5000
    min_doc_freq = 20
    layers = [[5000, 2056, 512, 128, 2], [5000, 1000, 128, 2], [5000, 100, 2], [5000, 5000, 2]]

    for l in layers:
        remover = StopWordsRemover(inputCol="raw", outputCol="words")
def spark_table_exists(sql_ctx: SQLContext, view: str) -> bool:
    # noinspection PyBroadException
    return view in sql_ctx.tableNames()
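
# Usage sketch (an assumption: a SQLContext with a registered temp table named 'my_view'):
#   if spark_table_exists(sqlContext, 'my_view'):
#       sqlContext.dropTempTable('my_view')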
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType, StructField, StringType, FloatType

CLOUDSQL_INSTANCE_IP = '104.155.188.32'   # CHANGE (database server IP)
CLOUDSQL_DB_NAME = 'recommendation_spark'
CLOUDSQL_USER = '******'
CLOUDSQL_PWD  = 'root'  # CHANGE

conf = SparkConf().setAppName("train_model")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl    = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (CLOUDSQL_INSTANCE_IP, CLOUDSQL_DB_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)

# checkpointing helps prevent stack overflow errors
sc.setCheckpointDir('checkpoint/')

# Read the ratings and accommodations data from Cloud SQL
dfRates = sqlContext.read.format('jdbc').options(driver=jdbcDriver, url=jdbcUrl, dbtable='Rating').load()
dfAccos = sqlContext.read.format('jdbc').options(driver=jdbcDriver, url=jdbcUrl, dbtable='Accommodation').load()
print("read ...")

# train the model
model = ALS.train(dfRates.rdd, 20, 20) # you could tune these numbers, but these are reasonable choices
print("trained ...")
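
# A sketch of what typically follows (an assumption, not part of the truncated original):
# use the trained model to produce top-N recommendations for a user, e.g.
#   recommendations = model.recommendProducts(some_user_id, 10)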
Exemplo n.º 46
0
                        return [notebooks_path, nn['notebook']['path']]
        except Exception as e:
            if not explicit_process_name:
                print(
                    'WARN Unable to automatically extract pyspark notebook name'
                )
            return ['', explicit_process_name or 'Unknown pyspark filename']

    notebooks_path, notebook_name = get_notebook_name()
except:
    pass

spark = create_spark_kensu(project, None, "Lab", offline=OFFLINE)
from pyspark import SQLContext

sql = SQLContext(spark)

# DEMO DATASOURCE
if "DB_USER" not in os.environ or "DB_PASSWORD" not in os.environ or "DB_CONNECTION_URL" not in os.environ:
    print("Var env DB_USER or DB_PASSWORD or DB_CONNECTION_URL missing")

notebook_segments = os.path.split(notebook_name)
offline_file_name = notebook_segments[len(notebook_segments) - 1] + ".jsonl"

dam = DamProvider().initDam(api_url="https://api-demo102.usnek.com",
                            auth_token=token,
                            process_name=notebook_name,
                            user_name=os.environ["USER"],
                            code_location=os.environ['DAM_CODE_REPOSITORY'],
                            init_context=True,
                            do_report=True,
Exemplo n.º 47
0
    '''
    DAY_OFFSET=1
    #--set datetime
    now =datetime.datetime.now()
    pro_time=now-datetime.timedelta(days=DAY_OFFSET)
    dest_time_str=pro_time.strftime("%Y%m%d")
    '''
    master = "spark://master:7077"
    sep = "\t"
    app_name = 'user_sign_in_app'
    '''
    spark_home='/opt/cloud/spark'
    os.environ['SPARK_HOME']=spark_home
    '''
    sc = SparkContext(master, app_name)
    sql_context = SQLContext(sc)

    lines = sc.textFile(input)
    parts = lines.map(lambda l: l.split(sep)).filter(lambda x: len(x) == 18)
    '''
    portal id(*) gw_id user_id user_name
    login_time logout_time(*) mac ip user_agent
    download_flow(*) upload_flow(*) os browser ratio
    batch_no user_type supp_id
    '''
    user_login = parts.map(lambda p: (p[1].strip(), p[2].strip(),p[17].strip(),p[3].strip(),p[16].strip(),
                                  p[4].strip(),p[5].strip(),p[6].strip(),p[7].strip(),p[8].strip(),
                                  p[9].strip(),p[10].strip(),p[11].strip(),p[12].strip(),p[13].strip(),
                                  p[14].strip(),p[15].strip()))
    schema_string = "id gw_id supp_id user_id user_type " \
                   "user_name login_time logout_time mac ip " \
Exemplo n.º 48
0
MODULE_NAME = os.path.basename(sys.modules['__main__'].__file__)
TEST_NAME = os.path.splitext(MODULE_NAME)[0]
LOGGER = logger.get_logger(TEST_NAME)

# Specify some constants
URLPATH1 = "s3a://dask-avro-data/application-data/app-*.avro"
URLPATH2 = "s3a://dask-avro-data/fulfillment-data/fulfillment-*.avro"

# Start
LOGGER.info('START: Creating spark conf')
# Sizing notes from the original comments: local[12], 4g driver memory, 5g executor memory on c5.9xlarge
Sconf = SparkConf().setMaster('local[4]'). \
    set('spark.driver.memory', '4g'). \
    set('spark.executor.memory', '6g')

sc = SparkContext(appName="my_test", conf=Sconf)
sqlContext = SQLContext(sparkContext=sc)
LOGGER.info('FINISH: Finished creating spark conf')

LOGGER.info('START: Creating spark dataframe 1')
df1 = sqlContext.read.format("com.databricks.spark.avro").load(URLPATH1)
df1 = df1.filter(df1.payload.originationCountryCode == 'CAN')
df1 = df1.selectExpr(
    "payload.applicationId as applicationId",
    "payload.creationTimestamp as creationTimestamp",
    "payload.approved as approved",
    "payload.creditLimit as creditLimit"
    )
LOGGER.info('FINISH: Spark dataframe 1 created')

LOGGER.info('START: Creating spark dataframe 2')
df2 = sqlContext.read.format("com.databricks.spark.avro").load(URLPATH2)
Exemplo n.º 49
0
import os
import sys

from pyspark import SQLContext
from pyspark import SparkContext

#os.environ["SPARK_HOME"] = "/opt/spark-1.6.1-bin-hadoop2.6"
#os.environ["HADOOP_HOME"] = "/opt/hadoop"
#os.environ["HADOOP_PREFIX"] = "/opt/hadoop"

#os.environ["HIVE_HOME"] = "/opt/hive"


sc = SparkContext('local[1]')
sql_context = SQLContext(sc)
sql_context.setConf( "spark.sql.shuffle.partitions", "1")
sql_context.sql(""" use fex_test """)
Exemplo n.º 50
0
def analyze(ss, cfg):
    """
    Run job 
    :param ss: SparkSession
    :param cfg: app configuration
    :return: None
    """
    logger = logging.getLogger(__name__)
    logger.info('Python version: {}'.format(sys.version))
    logger.info('Exporting data to support answer to dataset_selection_query_1 : What % of papers coming from a ' +
                'university are OA. This program just retrieves dois of all papers published by the input university ' +
                'and saves it to a file. Another program has to be called on top of this dataset to answer the question'
                )

    # MAG dataset to use
    db_name = "mag2020"
    sql_sc = SQLContext(ss)
    q1a = sql_sc.read.parquet("hdfs:///project/core/Q1A_raw")
    unpaywall = sql_sc.read.parquet("hdfs:///project/core/unpaywall/unpaywall.parquet").withColumnRenamed("is_oa",
                                                                                                          "source_is_oa").withColumnRenamed(
        "oa_status", "source_oa_status")

    q1a_source_oa = q1a.join(unpaywall, q1a.source_doi == unpaywall.doi, "left").select("source_paperid", "source_doi",
                                                                                        "source_year", "source_is_oa",
                                                                                        "source_oa_status").distinct()

    paperauthoraffiliations = ss.table(db_name + ".paperauthoraffiliations").select("paperid", "affiliationid")
    affiliations = ss.table(db_name + ".affiliations").select("affiliationid", "latitude", "longitude",
                                                              "normalizedname", "officialpage", "displayname", "rank")

    authoraffiliations = paperauthoraffiliations.join(affiliations, ["affiliationid"])

    q1a_source_oa_latlon = authoraffiliations.join(q1a_source_oa, authoraffiliations.paperid == q1a.source_paperid)

    q1a_source_oa_agg = q1a_source_oa_latlon.distinct() \
        .groupby("affiliationid") \
        .agg(F.first("latitude"),
             F.first("longitude"),
             F.first("normalizedname"),
             F.first("rank"),
             F.count("paperid").alias("count_paper"),
             F.sum(F.when(F.col("source_is_oa") == True, 1).otherwise(0)).alias("count_oa"),
             F.sum(F.when(F.col("source_oa_status") == "green", 1).otherwise(0)).alias("count_green"),
             F.sum(F.when(F.col("source_oa_status") == "gold", 1).otherwise(0)).alias("count_gold"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_score", col("count_oa") / col("count_paper"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_gold", col("count_gold") / col("count_oa"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_green", col("count_green") / col("count_oa"))
    q1a_source_oa_agg.write.csv("hdfs:///project/core/Q1A_sourceoa")


    q1a_source_oa_agg = q1a_source_oa_latlon.filter(F.col("source_year") < 2011).filter(F.col("source_year") >= 2006).distinct() \
        .groupby("affiliationid") \
        .agg(F.first("latitude"),
             F.first("longitude"),
             F.first("normalizedname"),
             F.first("rank"),
             F.count("paperid").alias("count_paper"),
             F.sum(F.when(F.col("source_is_oa") == True, 1).otherwise(0)).alias("count_oa"),
             F.sum(F.when(F.col("source_oa_status") == "green", 1).otherwise(0)).alias("count_green"),
             F.sum(F.when(F.col("source_oa_status") == "gold", 1).otherwise(0)).alias("count_gold"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_score", col("count_oa") / col("count_paper"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_gold", col("count_gold") / col("count_oa"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_green", col("count_green") / col("count_oa"))
    q1a_source_oa_agg.write.csv("hdfs:///project/core/Q1A_sourceoa_2006_2011")

    q1a_source_oa_agg = q1a_source_oa_latlon.filter(F.col("source_year") <= 2015).filter(F.col("source_year") >= 2011).distinct() \
        .groupby("affiliationid") \
        .agg(F.first("latitude"),
             F.first("longitude"),
             F.first("normalizedname"),
             F.first("rank"),
             F.count("paperid").alias("count_paper"),
             F.sum(F.when(F.col("source_is_oa") == True, 1).otherwise(0)).alias("count_oa"),
             F.sum(F.when(F.col("source_oa_status") == "green", 1).otherwise(0)).alias("count_green"),
             F.sum(F.when(F.col("source_oa_status") == "gold", 1).otherwise(0)).alias("count_gold"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_score", col("count_oa") / col("count_paper"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_gold", col("count_gold") / col("count_oa"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_green", col("count_green") / col("count_oa"))
    q1a_source_oa_agg.write.csv("hdfs:///project/core/Q1A_sourceoa_2011_2015")


    q1a_source_oa_agg = q1a_source_oa_latlon.filter(F.col("source_year") > 2015).filter(F.col("source_year") <= 2020).distinct() \
        .groupby("affiliationid") \
        .agg(F.first("latitude"),
             F.first("longitude"),
             F.first("normalizedname"),
             F.first("rank"),
             F.count("paperid").alias("count_paper"),
             F.sum(F.when(F.col("source_is_oa") == True, 1).otherwise(0)).alias("count_oa"),
             F.sum(F.when(F.col("source_oa_status") == "green", 1).otherwise(0)).alias("count_green"),
             F.sum(F.when(F.col("source_oa_status") == "gold", 1).otherwise(0)).alias("count_gold"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_score", col("count_oa") / col("count_paper"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_gold", col("count_gold") / col("count_oa"))
    q1a_source_oa_agg = q1a_source_oa_agg.withColumn("source_oa_green", col("count_green") / col("count_oa"))
    q1a_source_oa_agg.write.csv("hdfs:///project/core/Q1A_sourceoa_2015_2020")
Exemplo n.º 51
0
    logger.info(day)
    '''
    #
    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home

    #master = "spark://hadoop:7077"
    master = "local[1]"
    app_name = "spark_transferdata"
    sep = "\t"
    #input = "/data/140301_150731.csv"
    input = "/input/loginlog/2015"
    output = "/output/loginlog/2015"

    sc = SparkContext(master, app_name)
    sqlContext = SQLContext(sc)
    # load
    lines = sc.textFile(input)
    rdd = lines.map(lambda l: l.split(sep))\
        .filter(lambda l:len(l)==11)\
        .map(lambda l:(l[0],l[1],l[2],to_long(l[3]),l[4],
                       long(l[5]),long(l[6]),l[7],l[8],l[9],
                       to_long(l[10])))
    # uid,adid,guuid,guuidctime,url,referer,hosid,gwid,ua,ip,createtime
    # uid,adid,guuid,createtime
    fields = [
        StructField('uid', StringType(), True),
        StructField('adid', StringType(), True),
        StructField('guuid', StringType(), True),
        StructField('guuidctime', LongType(), True),
        StructField('url', StringType(), True),
Exemplo n.º 52
0
# Path for pyspark and py4j
sys.path.append("/Users/dustinchen/Documents/APP/spark-1.6.1-bin-hadoop2.6/python")
sys.path.append("/Users/dustinchen/Documents/APP/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip")

try:
    from pyspark import SparkConf, SparkContext, SQLContext
    from pyspark.sql.functions import regexp_extract
    from pyspark.sql import Row

except ImportError as e:
    print ("Can not import Spark Modules", e)

if __name__ == "__main__":
    conf = SparkConf().setAppName("GISAPP").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    nyc_shapefile = shapefile.Reader("/Users/dustinchen/Documents/APP/Resources/NY_counties_clip/NY_counties_clip.shp")
    """
       0 ('deletionflag', 'c', 1, 0)
       1 ['objectid', 'n', 9, 0]
       2 ['statefp', 'c', 2, 0]
       3 ['countyfp', 'c', 3, 0]
       4 ['countyns', 'c', 8, 0]
       5 ['geoid', 'c', 5, 0]
       6 ['name', 'c', 100, 0]
        ['namelsad', 'c', 100, 0]
        ['lsad', 'c', 2, 0]
        ['classfp', 'c', 2, 0]
        ['mtfcc', 'c', 5, 0]
        ['csafp', 'c', 3, 0]
        ['cbsafp', 'c', 5, 0]
    for i in range(LINE_LENGTH):
        sys.stdout.write('-')
    print("")


try:
    from pyspark import SparkContext
    from pyspark import SQLContext

    print ("Successfully imported Spark Modules -- `SparkContext, SQLContext`")
    print_horizontal()
except ImportError as e:
    print ("Can not import Spark Modules", e)
    sys.exit(1)

sqlContext = SQLContext(sparkContext=sc)

# Loads parquet file located in AWS S3 into RDD Data Frame
parquetFile = sqlContext.read.parquet("s3://jon-parquet-format/nation.plain.parquet")

# Stores the DataFrame into an "in-memory temporary table"
parquetFile.registerTempTable("parquetFile")

# Run standard SQL queries against temporary table
nations_all_sql = sqlContext.sql("SELECT * FROM parquetFile")

# Print the result set
nations_all = nations_all_sql.map(lambda p: "Country: {0:15} Ipsum Comment: {1}".format(p.name, p.comment_col))

print("All Nations and Comments -- `SELECT * FROM parquetFile`")
print_horizontal()
Exemplo n.º 54
0
from pyspark import SparkContext, SparkConf, SQLContext
import sys

conf = SparkConf().setAppName("DocSimilarity_Avro")
sc = SparkContext(conf=conf)

sqlcontext = SQLContext(sc)
sqlcontext.setConf("spark.sql.avro.compression.codec", "uncompressed")
sim_matrix_df = sqlcontext.read.format("com.databricks.spark.avro").load(
    sys.argv[1])
sim_matrix_rdd = sim_matrix_df.rdd

similar_docs = sim_matrix_rdd.takeOrdered(10, key=lambda x: -x[1])

similar_docs = [doc[0] for doc in similar_docs]

similar_docs_1 = sc.parallelize(similar_docs)
similar_docs_1.saveAsTextFile(sys.argv[2])

# In[2]:

try:

    try:
        timespan=str(sys.argv[1])
    except IndexError:
        print 'please pass timespan in argument'
        sys.exit()

    conf = (SparkConf().setMaster("local").setAppName("hi_report_app").set("spark.executor.memory", "1g"))
    sc = SparkContext(conf = conf)
    sc.setLogLevel("Error")
    sqlContext = SQLContext(sc)
    # In[2]:

    config_url='https://s3-ap-southeast-1.amazonaws.com/nlplive.humanindex.data/config.json'
    try:
        config_response=requests.get(config_url)
        config = json.loads(config_response.content)
    except:
        print "Cannot fetch Config......"

    # In[23]:

    try:
        fetch_response=requests.get(str(config['baseAPIUrl'])+'/'+str(config['version'])+'/preProcessing/GetPredictionFileJob/'+timespan+'/publisher')
        #check if api request is successfull or not
        if(fetch_response.status_code==200):
"""
STEP 4
"""
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
from datetime import tzinfo, datetime
import pytz
import re
from pyspark import SparkContext, SQLContext

sc = SparkContext()

sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

white_list = [
    "itwiki", "enwiki", "dewiki", "fawiki", "nlwiki", "frwiki", "eswiki"
]

links = sqlContext.read.parquet(
    'hdfs:///user/piccardi/parquet/wikidata_links.parquet')


def pivot_row(row):
    result = []
    for l in white_list:
        result.append(Row(id=row.id, lang=l, title=row[l]))
    return result
import sys
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

USER_ID = 0
#CLOUDSQL_INSTANCE_IP = 173.194.251.148
#BEST_RANK = 20
#BEST_ITERATION = 10
#BEST_REGULATION = 0.1

CLOUDSQL_INSTANCE_IP = sys.argv[1]
CLOUDSQL_NAME = sys.argv[2]
CLOUDSQL_USER = sys.argv[3]
CLOUDSQL_PWD  = sys.argv[4]

BEST_RANK = int(sys.argv[5])
BEST_ITERATION = int(sys.argv[6])
BEST_REGULATION = float(sys.argv[7])
Exemplo n.º 58
0
	return df

def deleteColumn(df):
	columns_to_drop = ['_c0', '_c1', '_c2', '_c3', '_c4', '_c5']
	
	df = df.drop(*columns_to_drop)
	
	return df

if __name__ == "__main__":

	# create Spark context with Spark configuration
	conf = SparkConf().setAppName("Practica 4")  

	sc = SparkContext(conf=conf)
	sql = SQLContext(sc)

	# Load training data
	df = sql.read.format("csv").load("./data_training.csv")	
	df = castColumns(df)
	
	ignore = ['_c6']
	assembler = VectorAssembler(
		inputCols=[x for x in df.columns if x not in ignore],
		outputCol='features')

	data = assembler.transform(df)
	data = data.withColumnRenamed("_c6", "label")
	data = deleteColumn(data)
	data.show()
	
Exemplo n.º 59
0
from pyspark import SQLContext
import os
import json

# Classify by evaluation score
def name_place(name, place, price, evaluation):
    if evaluation <= 3:
        return name + "," + "general"
    elif evaluation > 3 and evaluation <=5:
        return name + "," + "good"


if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName("sql_udf")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    json_path = os.path.abspath("../doc/book.json")

    # Read the JSON file and register it as a temp table
    json_df = sqlContext.read.json(json_path)
    json_df.registerTempTable("json_book")

    # Register the user-defined function (UDF)
    sqlContext.registerFunction("name_place", name_place)

    evalRDD = sqlContext.sql("SELECT name_place(name, place, price,evaluation) AS book_eval FROM json_book")

    #bookMap = lengthRDD.map(lambda books: (books.name, books.author, books.price, books.publish, books.place))

    evalRDD.show()
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
import pandas as pd

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')
sqlContext = SQLContext(sc)


def howFarAreWe(model, against, sizeAgainst):

    againstNoRatings = against.map(lambda x: (int(x[0]), int(x[1])))
    againstWiRatings = against.map(lambda x:
                                   ((int(x[0]), int(x[1])), int(x[2])))
    predictions = model.predictAll(againstNoRatings).map(lambda p:
                                                         ((p[0], p[1]), p[2]))
    predictionsAndRatings = predictions.join(againstWiRatings).values()
    return sqrt(
        predictionsAndRatings.map(lambda s: (s[0] - s[1])**2).reduce(add) /
        float(sizeAgainst))
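

# Usage sketch (an assumption: `validation` is an RDD of (userId, itemId, rating) tuples and
# `model` is a trained MatrixFactorizationModel):
#   rmse = howFarAreWe(model, validation, validation.count())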