def hash_rating(author_subreddit_rating_rdd, sc):
    sql_context = SQLContext(sc)
    author_sub_schema = StructType([
        StructField("author", StringType(), True),
        StructField("subreddit", StringType(), True),
        StructField("rating", LongType(), True)
    ])
    asr_df = sql_context.createDataFrame(author_subreddit_rating_rdd, author_sub_schema)

    # Map each author to a unique numeric id (tuple unpacking in lambdas is Python 2 only).
    author_rdd = author_subreddit_rating_rdd.map(lambda asr: asr[0])
    aid_rdd = author_rdd.distinct().zipWithUniqueId().cache()
    author_id_schema = StructType([
        StructField("author", StringType(), True),
        StructField("author_id", LongType(), True)
    ])
    aid_df = sql_context.createDataFrame(aid_rdd, author_id_schema)
    aid_s_r_df = aid_df.join(asr_df, on='author').drop('author').cache()

    # Map each subreddit to a unique numeric id.
    subreddit_rdd = author_subreddit_rating_rdd.map(lambda asr: asr[1])
    sid_rdd = subreddit_rdd.distinct().zipWithUniqueId().cache()
    subreddit_id_schema = StructType([
        StructField("subreddit", StringType(), True),
        StructField("subreddit_id", LongType(), True)
    ])
    sid_df = sql_context.createDataFrame(sid_rdd, subreddit_id_schema)
    aid_sid_r_df = sid_df.join(aid_s_r_df, on='subreddit').drop('subreddit').cache()

    row_aid_sid_r_rdd = aid_sid_r_df.rdd
    aid_sid_r_rdd = row_aid_sid_r_rdd.map(lambda row: (row.author_id, row.subreddit_id, row.rating))
    return aid_rdd, sid_rdd, aid_sid_r_rdd
def mat_fact_test(city, rank=5, iter_=20, lambda_=.01):
    pitt_train = pd.read_csv('/mnt/sda5/Desktop/search/Fin_project/dataset/final/' + city + '_train.csv',
                             encoding='iso8859_15')
    pitt_test = pd.read_csv('/mnt/sda5/Desktop/search/Fin_project/dataset/final/' + city + '_test.csv',
                            encoding='iso8859_15')
    pitt_test = get_test(pitt_train, pitt_test)
    pitt_test = resample(pitt_test, n_samples=int(np.ceil(0.8 * pitt_test.shape[0])))
    pitt_test = pitt_test.reset_index()
    pitt_test = pitt_test.drop('index', axis=1)

    # Encode user and business ids as consecutive integers for ALS.
    le_user_id = preprocessing.LabelEncoder()
    le_user_id = le_user_id.fit(pitt_train.user_id)
    user_id_enc = le_user_id.transform(pitt_train.user_id)
    pitt_train['user_id_enc'] = user_id_enc
    pitt_test['user_id_enc'] = le_user_id.transform(pitt_test.user_id)

    le_business_id = preprocessing.LabelEncoder()
    le_business_id = le_business_id.fit(pitt_train.business_id)
    business_id_enc = le_business_id.transform(pitt_train.business_id)
    pitt_train['business_id_enc'] = business_id_enc
    pitt_test['business_id_enc'] = le_business_id.transform(pitt_test.business_id)

    sqlCtx = SQLContext(sc)
    pitt_train_sp = sqlCtx.createDataFrame(pitt_train[['user_id_enc', 'business_id_enc', 'stars_review']])
    pitt_train_sp = pitt_train_sp.withColumn("stars_review", pitt_train_sp["stars_review"].cast("double"))
    pitt_test_sp = sqlCtx.createDataFrame(pitt_test[['user_id_enc', 'business_id_enc', 'stars_review']])
    pitt_test_sp = pitt_test_sp.withColumn("stars_review", pitt_test_sp["stars_review"].cast("double"))

    model = ALS.train(pitt_train_sp, rank, seed=0, iterations=iter_, lambda_=lambda_)

    # Training error
    prediction = model.predictAll(pitt_train_sp.rdd.map(lambda line: (line[0], line[1]))) \
        .map(lambda d: ((d[0], d[1]), d[2]))
    true_and_pred = pitt_train_sp.rdd.map(lambda d: ((d[0], d[1]), d[2])) \
        .join(prediction).map(lambda r: (r[0], r[1][0], r[1][1]))
    # Cap predictions at the 5-star ceiling; the result must be assigned for the cap to take effect.
    true_and_pred = true_and_pred.map(lambda line: (line[0], line[1], 5 if line[2] >= 5 else line[2]))
    error = math.sqrt(true_and_pred.map(lambda r: (math.fabs(r[1] - r[2])) ** 1).mean())
    print('Training: ', error)

    # Test error
    prediction = model.predictAll(pitt_test_sp.rdd.map(lambda line: (line[0], line[1]))) \
        .map(lambda d: ((d[0], d[1]), d[2]))
    true_and_pred = pitt_test_sp.rdd.map(lambda d: ((d[0], d[1]), d[2])) \
        .join(prediction).map(lambda r: (r[0], r[1][0], r[1][1]))
    true_and_pred = true_and_pred.map(lambda line: (line[0], line[1], 5 if line[2] >= 5 else line[2]))
    error = math.sqrt(true_and_pred.map(lambda r: (math.fabs(r[1] - r[2])) ** 1).mean())
    print('Test: ', error)
def pyspark():
    conf = SparkConf().setAppName("PySparkApp").setMaster("local")
    #conf = SparkConf()
    sc = SparkContext(conf=conf)
    #spark = SparkSession.builder.appName("WordCount").master("local").config(conf = conf).getOrCreate()
    sqlCtx = SQLContext(sc)
    df1 = get_features()
    sdf = sqlCtx.createDataFrame(df1)

    ops1 = "(price_from + price_to)/2"
    data = sdf.withColumn("MedianPrice", expr(ops1))
    tmp = data.withColumn('final_price', coalesce(data['Price123'], data['MedianPrice']))
    finaldata = tmp.drop("price", "disFeature")

    state = {
        "VIC": "Victoria",
        "WA": "Western Australia",
        "ACT": "Australian Capital Territory",
        "NT": "Northern Territory",
        "NSW": "New South Wales",
        "TAS": "Tasmania"
    }
    stateDataP = pd.DataFrame(list(state.items()), columns=["State", "StateName"])
    stateDataD = sqlCtx.createDataFrame(stateDataP)
    data1 = finaldata.join(stateDataD, on=['State'], how='inner')
    finaldataPD = data1.toPandas()
    #dataPD["StateName"].unique()
    sc.stop()

    finaldataPD['price_to'] = finaldataPD['price_to'].astype(str).astype(float)
    finaldataPD['Price123'] = finaldataPD['Price123'].astype(str).astype(float)
    finaldataPD['beds'] = finaldataPD['beds'].astype(str).astype(int)
    finaldataPD['baths'] = finaldataPD['baths'].astype(str).astype(int)
    finaldataPD['parking'] = finaldataPD['parking'].astype(str).astype(int)

    df123 = finaldataPD.copy()
    df123 = df123.replace({pd.np.nan: None})
    #print(df123)
    return df123
def run(self):
    startTime = time.time()
    conf = SparkConf() \
        .setAppName("Community_Detection_Based_on_GraphFrames") \
        .set("spark.executor.memory", "4g") \
        .set("spark.driver.host", "localhost")
    sc = SparkContext(conf=conf)
    inputData = sc.textFile(self.input_path)

    # Drop the header
    header = inputData.first()
    inputData = inputData.filter(lambda line: line != header)

    # read and split data into tuples
    Standard_RDD = inputData.map(self.readAndSplit)
    UserAndItems = Standard_RDD.groupByKey().map(lambda x: (x[0], set(list(x[1]))))
    self.global_User_items = UserAndItems.collectAsMap()

    edge_RDD = UserAndItems.flatMap(self.generate_edges).filter(lambda x: len(x) > 0)
    vertex_RDD = edge_RDD.flatMap(lambda x: [x[0], x[1]]).distinct().map(lambda x: (x, ))

    sqlContext = SQLContext(sc)
    vertices = sqlContext.createDataFrame(vertex_RDD.collect(), ["id", ])
    edges = sqlContext.createDataFrame(edge_RDD.collect(), ["src", "dst", ])

    g = GraphFrame(vertices, edges)
    result = g.labelPropagation(maxIter=5)

    verticeRDD = sc.parallelize(result.collect())
    community_list = verticeRDD.map(lambda x: (str(x.label), x.id)) \
        .groupByKey() \
        .map(lambda x: sorted(list(x[1]))) \
        .sortBy(lambda x: (len(x), x[0])) \
        .collect()

    with open(self.output_path, 'w') as f:
        for line in community_list:
            for each in line[:-1]:
                f.write(each + ', ')
            f.write(line[-1] + '\n')

    print("Finish time:", time.time() - startTime)
def mat_fact_val(city, rank=5, iter_=20, lambda_=.01):
    toronto_train = pd.read_csv('D:/Study/TermProject/yelp-dataset/data/' + city + '_train.csv',
                                encoding='iso8859_15')
    toronto_val = pd.read_csv('D:/Study/TermProject/yelp-dataset/data/' + city + '_val.csv',
                              encoding='iso8859_15')
    toronto_val = get_test(toronto_train, toronto_val)

    # Encode user and business ids as consecutive integers for ALS.
    le_user_id = preprocessing.LabelEncoder()
    le_user_id = le_user_id.fit(toronto_train.user_id)
    user_id_enc = le_user_id.transform(toronto_train.user_id)
    toronto_train['user_id_enc'] = user_id_enc
    toronto_val['user_id_enc'] = le_user_id.transform(toronto_val.user_id)

    le_business_id = preprocessing.LabelEncoder()
    le_business_id = le_business_id.fit(toronto_train.business_id)
    business_id_enc = le_business_id.transform(toronto_train.business_id)
    toronto_train['business_id_enc'] = business_id_enc
    toronto_val['business_id_enc'] = le_business_id.transform(toronto_val.business_id)

    sqlCtx = SQLContext(sc)
    toronto_train_sp = sqlCtx.createDataFrame(
        toronto_train[['user_id_enc', 'business_id_enc', 'stars_review']])
    toronto_train_sp = toronto_train_sp.withColumn(
        "stars_review", toronto_train_sp["stars_review"].cast("double"))
    toronto_val_sp = sqlCtx.createDataFrame(
        toronto_val[['user_id_enc', 'business_id_enc', 'stars_review']])
    toronto_val_sp = toronto_val_sp.withColumn(
        "stars_review", toronto_val_sp["stars_review"].cast("double"))

    model = ALS.train(toronto_train_sp, rank, seed=0, iterations=iter_, lambda_=lambda_)

    # Training error
    prediction = model.predictAll(toronto_train_sp.rdd.map(lambda line: (line[0], line[1]))) \
        .map(lambda d: ((d[0], d[1]), d[2]))
    true_and_pred = toronto_train_sp.rdd.map(lambda d: ((d[0], d[1]), d[2])) \
        .join(prediction).map(lambda r: (r[0], r[1][0], r[1][1]))
    # Cap predictions at the 5-star ceiling; the result must be assigned for the cap to take effect.
    true_and_pred = true_and_pred.map(lambda line: (line[0], line[1], 5 if line[2] >= 5 else line[2]))
    error = math.sqrt(true_and_pred.map(lambda r: (math.fabs(r[1] - r[2])) ** 1).mean())
    print('Training: ', error)

    # Validation error
    prediction = model.predictAll(toronto_val_sp.rdd.map(lambda line: (line[0], line[1]))) \
        .map(lambda d: ((d[0], d[1]), d[2]))
    true_and_pred = toronto_val_sp.rdd.map(lambda d: ((d[0], d[1]), d[2])) \
        .join(prediction).map(lambda r: (r[0], r[1][0], r[1][1]))
    true_and_pred = true_and_pred.map(lambda line: (line[0], line[1], 5 if line[2] >= 5 else line[2]))
    error = math.sqrt(true_and_pred.map(lambda r: (math.fabs(r[1] - r[2])) ** 1).mean())
    print('Validation: ', error)
def query2(sc, file_in_name, file_out_name):
    rdd_file_data = sc.textFile(file_in_name)
    data_header = rdd_file_data \
        .filter(lambda l: "datetime" in l)
    cites = weather.gen_city_keys(sc)
    header_position = run2.get_position(data_header)

    data = rdd_file_data \
        .subtract(data_header) \
        .flatMap(lambda line: generate_tuple(header_position, line, cites))

    sqlc = SQLContext(sc)
    df = sqlc.createDataFrame(data)
    #df.show()
    df.createOrReplaceTempView("dati")
    query1 = "SELECT country, year, month, " \
             "cast(min(value) as decimal (10,2)) as my_min, " \
             "cast(max(value) as decimal (10,2)) my_max, " \
             "cast(avg(value) as decimal (10,2)) as my_avg," \
             "cast(stddev(value) as decimal (10,2)) as my_std " \
             "FROM dati " \
             "GROUP BY country, year, month"
    df2 = sqlc.sql(query1).orderBy('dati.country', 'dati.year', 'dati.month')
    df2.show()

    ''' Save data in HDFS '''
    df2.coalesce(1).write.format("json").save(file_out_name)
def get_spark_runtime_validator(context, df):
    from pyspark import SparkContext, SQLContext
    sc = SparkContext.getOrCreate()
    sqlCtx = SQLContext(sc)
    sdf = sqlCtx.createDataFrame(df)
    batch_request = BatchRequest(
        datasource_name="my_spark_datasource",
        data_connector_name="my_data_connector",
        batch_data=sdf,
        data_asset_name="IN_MEMORY_DATA_ASSET",
        partition_request={
            "batch_identifiers": {
                "an_example_key": "a",
                "another_example_key": "b",
            }
        },
    )
    expectation_suite = context.create_expectation_suite(
        "my_suite", overwrite_existing=True)
    validator = context.get_validator(batch_request=batch_request,
                                      expectation_suite=expectation_suite)
    return validator
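A minimal usage sketch for the validator returned above (hedged: the `context` object, input frame, and column names here are assumptions, not part of the original source; the expectation calls are standard Great Expectations validator methods):

# Hypothetical caller; assumes a Great Expectations DataContext named `context` already exists.
import pandas as pd

df = pd.DataFrame({"user_id": [1, 2, 3], "age": [21, None, 35]})
validator = get_spark_runtime_validator(context, df)
validator.expect_column_values_to_not_be_null("user_id")
validator.expect_column_values_to_be_between("age", min_value=0, max_value=120)
validator.save_expectation_suite(discard_failed_expectations=False)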
def run():
    conf = SparkConf().setAppName("word count") \
        .setMaster("local[2]")
    context = SparkContext(conf=conf)
    sqlContext = SQLContext(context)
    context.setLogLevel('ERROR')

    data = [('Song', 25), ('Trump', 22), ('Yong', 20), ('Obama', 26)]
    rdd = context.parallelize(data, 2).map(lambda x: Row(name=x[0], age=int(x[1])))
    people = sqlContext.createDataFrame(rdd).cache()
    people.printSchema()

    old_guy = people.orderBy('age', ascending=False).take(1)
    print(old_guy)
    same_old_guy = [
        Row(name=x['name'], age=x['age'], other=1) for x in old_guy
    ]
    print(same_old_guy)

    total = people.groupBy().sum('age').collect()[0][0]
    print('Total age is {}'.format(total))

    people.createTempView('people_table')
    new_people = sqlContext.sql(
        'select name, age from people_table order by age desc limit 1')
    new_people.show()
def test_udf(spark_context, spark_session):
    sql_sc = SQLContext(spark_context)
    spark_session.conf.set("spark.sql.execution.arrow.enabled", "true")
    df = sql_sc.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
    # df.printSchema()
    # df.show()

    slen = pandas_udf(lambda s: s.str.len(), IntegerType())
    # below is similar to @
    # upper = pandas_udf(to_upper, StringType())
    # addOne = pandas_udf(add_one, IntegerType(), PandasUDFType.SCALAR)

    # this works
    df.select("name").show()

    # this doesn't work, Caused by: java.io.EOFException
    #   at java.io.DataInputStream.readInt(DataInputStream.java:392)
    # seems related to slen output int
    # df.select(slen("name").alias("slen(name)")).show()

    # TODO this hit same error
    # df.select(to_upper("name")).show()

    print(df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).count())
def main():
    conf = SparkConf().setAppName('ingest logs')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    inputs = sys.argv[1]
    output = sys.argv[2]

    # Reading the input file, and then matching the pattern
    file_data = sc.textFile(inputs)
    linere = re.compile("^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$")

    # Mapping the data after fetching the required values out of the Nasa Web server logs file
    KeyValue = file_data.map(lambda line: linere.split(line)) \
        .filter(lambda x: len(x) == 6) \
        .map(lambda y: (y[1], dt.datetime.strptime(y[2], '%d/%b/%Y:%H:%M:%S'), y[3], y[-2])).cache()

    # Mapping the KeyValue RDD as the required format of 4 columns
    # (int() replaces the Python 2-only long() builtin)
    Nasa = KeyValue.map(lambda p: {"host": p[0], "datetime": p[1], "path": p[2], "bytes": int(p[3])})

    # Converting Nasa to DataFrame and then registering it as Table
    schemaNasa = sqlContext.createDataFrame(Nasa)
    schemaNasa.registerTempTable("NasaLogs")

    # Writing the data into a parquet file
    schemaNasa.write.format('parquet').save(output)

    # Reading the data from Parquet file and then Registering it in Table Format
    parquetdata = sqlContext.read.parquet(output)
    parquetdata.registerTempTable("parquetTable")

    # Firing SQL query to count the total number of bytes transferred using SUM(bytes)
    totalbytes = sqlContext.sql(""" SELECT SUM(bytes) FROM parquetTable """)
    totalbytes.show()
def astype(cls, df, out_type, **kwargs):
    """
    @param::out_type: the type of output dataframe in string
    return the converted dataframe or None if not feasible
    """
    # handle edge cases
    if not isinstance(df, DataFrame):
        raise Exception(
            '> PandasConverter astype(): input dataframe must be an instance of the pandas DataFrame class.'
        )
    if out_type == None:
        raise ValueError(
            '> PandasConverter astype(): dataframe out_type parameter can not be none.'
        )
    if not cls.is_capable('pandas', out_type):
        raise Exception(
            '> PandasConverter astype(): convert to type: %s not supported.'
            % (out_type))

    # get pyspark context
    sc = SparkContext.getOrCreate()
    sqlcontext = SQLContext(sc)

    # convert to target type
    if out_type.lower() == 'pyspark':  # explicitly intended
        try:
            return sqlcontext.createDataFrame(df)
        except Exception as e:
            print(
                '> PandasConverter astype(): convert to pyspark dataframe failed: %s'
                % (e))
    if out_type.lower() == 'pandas':  # explicitly intended
        return df
    return None
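A hedged usage sketch for the converter above (it assumes the method is exposed on a PandasConverter class, as the error messages suggest, and that a local Spark context can be created; the column names are illustrative only):

import pandas as pd

pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
sdf = PandasConverter.astype(pdf, "pyspark")   # pandas -> pyspark DataFrame
same = PandasConverter.astype(pdf, "pandas")   # pandas -> pandas is returned unchanged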
def readFromCsv(self, spark):
    print("Reading from CSV")
    sqlContext = SQLContext(sc)
    schema = StructType([])
    df = sqlContext.createDataFrame(sc.emptyRDD(), schema)
    print("First SparkContext:")
    print("APP Name : {}".format(spark.sparkContext.appName))
    print("Master :" + spark.sparkContext.master)
    messageLogger = ml.MessageLogger(const.getProjectName(__file__), "Reading from file.....")
    try:
        messageLogger.logInfo("Reading from CSV file.")
        df = spark.read.csv(const.csv_file_project_1, inferSchema=True, header=True)
        messageLogger.logInfo("File reading finished successfully.")
    except Exception as e:
        messageLogger.logError(
            "unable to read the file, exception occurred: " + str(e.__class__) + " occurred.")
    if df.count() > 0:
        messageLogger.logInfo("Number of records in file: " + str(df.count()))

    # Display Data Frame Results
    # processedData = pf.ProcessedData()
    # processedData.processOutput("hellodcddd")
    # df.select('*').show()  # 100, False)

    # Data Frame Filter Statements
    # df.filter(df['eq_site_limit'] == 0).select('*').show()
    # Parentheses are required: & binds tighter than the comparison operators.
    df.filter((df['eq_site_limit'] == 0) & (df['hu_site_limit'] > 20000)).select('*').show()
def Transfer_to_DB(spark, df):
    # Create PySpark DataFrame Schema
    r_schema = StructType([StructField('id', IntegerType(), True),
                           StructField('h1', DoubleType(), True),
                           StructField('h2', DoubleType(), True),
                           StructField('h3', DoubleType(), True),
                           StructField('h4', DoubleType(), True),
                           StructField('h5', DoubleType(), True),
                           StructField('h6', DoubleType(), True),
                           StructField('h7', DoubleType(), True),
                           StructField('h8', DoubleType(), True),
                           StructField('h9', DoubleType(), True),
                           StructField('h10', DoubleType(), True),
                           StructField('h11', DoubleType(), True),
                           StructField('h12', DoubleType(), True),
                           StructField('services', StringType(), True)])
    sqlContext = SQLContext(spark)

    # Create Spark DataFrame from Pandas
    df_record = sqlContext.createDataFrame(df, r_schema)

    # Important to order columns in the same order as the target database
    df_record = df_record.select("id", "h1", "h2", "h3", "h4", "h5", "h6",
                                 "h7", "h8", "h9", "h10", "h11", "h12", "services")
    df_record.show()

    properties, url = DB_connection()
    df_record.write.jdbc(url=url, table='patient_records', mode='append', properties=properties)
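DB_connection() is not shown in this snippet; a hedged sketch of the shape it would need to return for the JDBC write above (the host, database, credentials, and driver are assumptions, not the original configuration):

def DB_connection():
    # Hypothetical values; the real helper is not part of this snippet.
    properties = {
        "user": "spark_user",
        "password": "secret",
        "driver": "org.postgresql.Driver",
    }
    url = "jdbc:postgresql://localhost:5432/hospital"
    return properties, url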
def evaluate(sc, models, test):
    sqlc = SQLContext(sc)
    results_schema = StructType([
        StructField("classifier", StringType()),
        StructField("auc", FloatType())
    ])
    results = sqlc.createDataFrame(sc.emptyRDD(), schema=results_schema)
    for classifier, model in models.items():
        bce = BinaryClassificationEvaluator(labelCol="class")
        auc = bce.evaluate(model.transform(test))
        evaluation = sc.parallelize([(classifier, auc)])
        evaluation = sqlc.createDataFrame(evaluation, schema=results_schema)
        results = results.union(evaluation)
    results.coalesce(1).write.csv("test-metrics", header=True)
def group_by(spark_context, spark_session):
    sql_sc = SQLContext(spark_context)
    spark_session.conf.set("spark.sql.execution.arrow.enabled", "true")
    df = sql_sc.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
                                ("id", "v"))
    # TODO count returns Py4JJavaError: An error occurred while calling o71.count.
    df.groupby("id").apply(normalize).count()
def main(self):
    stop_words = []
    # prod
    dataframe = self.read_dataframe(self.path, self.days_list).persist()
    # read approved user list
    # df = self.spark.read.csv(
    #     "hdfs:///ssymmetry_db/raw_db/sina_user_tag/sina_user_tag_item/weibo_uid_with_user_tag.csv")\
    #     .select("uid", "user_tag")
    # local test
    # dataframe = self.spark.read.json("sina_weibo_fans_data_2017-11-09-10-18.json")
    blog_rdd = self.read_blog_data(dataframe).fillna(" ").rdd

    def preprocess_data(x):
        uid = x["uid"]
        blog_content = x["blog_content"]
        forward_content = x["forward_content"]
        if forward_content.rfind(u"*****") > 0:
            forward_content = forward_content.split(u"*****")[1]
        return (uid, blog_content + forward_content)

    data = blog_rdd.map(preprocess_data).reduceByKey(lambda x, y: x + y).map(
        lambda x: [" ".join(jieba.cut(x[1])).split(" ")])
    sql_context = SQLContext(sparkContext=self.spark.sparkContext)
    word_df = sql_context.createDataFrame(data, ["values"])
    w2vec = Word2Vec(vectorSize=128, inputCol="values")
    model = w2vec.fit(word_df)

    def creat_dictionary(model):
        w_df = model.getVectors()
        w_df.show()
        data = w_df.rdd.collect()
        w2index = {}
        w2vec = {}
        i = 1
        for row in data:
            word = row.word
            vector = row.vector
            w2index[word] = i
            w2vec[word] = vector
            i += 1
        return w2index, w2vec

    # Write the word2vec vectors out to a pickle file
    index_dict, word_vectors = creat_dictionary(model)
    # out = open("w2vec.pkl", "wb")
    out = open("/udisk2/hxk/w2vec/w2vec.pkl", "wb")
    pickle.dump(index_dict, out)    # index dictionary
    pickle.dump(word_vectors, out)  # word-vector dictionary
    out.close()

    # test
    model.findSynonyms("你", 3).show()
def _get_train_data(self):
    sql_context = SQLContext(self.sc)
    l = [
        (1, Vectors.dense([1, 2, 3]), 1.0),
        (2, Vectors.dense([1, 2, 3]), 0.0),
        (3, Vectors.dense([1, 2, 3]), 1.0),
        (4, Vectors.dense([1, 2, 3]), 0.0),
    ]
    return sql_context.createDataFrame(l, ['id', 'features', 'label'])
def process_json(filename, sparkcontext):
    sqlContext = SQLContext(sparkcontext)
    df = sqlContext.read.json(filename).select("title")
    output_list = get_counts(df)
    columns = ["token", "count"]
    output_df = sqlContext.createDataFrame(output_list, columns)
    output_df.write.mode('overwrite').parquet(
        filename.replace(".json", ".parquet"))
def _get_data(self):
    sql_context = SQLContext(self.sc)
    l = [
        (
            "I dont know why people think this is such a bad movie.",
            # indices of a sparse vector must be smaller than its size
            Vectors.sparse(3, {0: 1.0, 1: 1.0, 2: 1.0})
        ),
    ]
    return sql_context.createDataFrame(l, ['text', 'features'])
def main():
    sc = SparkContext()
    sqlctx = SQLContext(sc)
    lines = sc.textFile("alerts.csv").map(lambda l: l.split(","))
    alerts_rdd = lines.map(lambda l: Row(ts=l[1], name=l[0]))
    df = sqlctx.createDataFrame(alerts_rdd)
    aa = AssociationAlgorithm(df, 'ts', 'name', sqlctx)
    aa.execute_algorithm()
def main():
    sc = SparkContext(appName='TextSimillarity')
    sqlcont = SQLContext(sc)
    rdd = sc.textFile("test.csv")
    header = rdd.first()
    newrdd = rdd.filter(lambda x: x != header) \
        .map(lambda x: x.split(',')) \
        .map(lambda x: Row(description_x=x[1], description_y=x[2]))
    new_df = sqlcont.createDataFrame(newrdd)
    calculate_simillarity(new_df)
def main(stock_list, seq_len, result_table): os.environ[ 'PYSPARK_PYTHON'] = '/Users/lex/miniconda2/envs/pysparkenv2/bin/python' os.environ[ 'PYSPARK_DRIVER_PYTHON'] = '/Users/lex/miniconda2/envs/pysparkenv2/bin/python' fields = [ StructField('open', FloatType(), True), StructField('high', FloatType(), True), StructField('low', FloatType(), True), StructField('close', FloatType(), True), StructField('volume', FloatType(), True), StructField('date', StringType(), True), StructField('ticker', StringType(), True), ] schema = StructType(fields) stock_list = stock_list.split(',') stock_data = pd.DataFrame() print('Predicting %s stocks' % len(stock_list)) for x in stock_list: df = web.DataReader(x, 'morningstar', pd.datetime(2013, 4, 13), pd.datetime(2018, 4, 13)) stock_data = stock_data.append(df) print('Have the data') len_comb = len(stock_list) seq_len = int(seq_len) lstm = LstmStockTrainer(stock_data, seq_len) keys = stock_list conf = SparkConf().setAppName('portfolio_chooser') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) keys = sc.parallelize(keys) task_rdd = keys.map(lambda stock: lstm.predict_stocks(stock)) \ .repartition(len_comb) result_rdd = task_rdd \ .flatMap(lambda r: r.values) \ .map(lambda r: tuple(r)) result_df = sqlContext.createDataFrame(result_rdd, schema) # replacing all pandas NaNs to null cols = [ func.when(~func.col(x).isin("NaN"), func.col(x)).alias(x) for x in result_df.columns ] result_df = result_df.select(*cols) result_df.show(5) util.write_small_df(result_df, result_table) return result_df
def check_fit_params(sc, models):
    sqlc = SQLContext(sc)
    results_schema = StructType([
        StructField("classifier", StringType()),
        StructField("params", StringType()),
        StructField("auc", FloatType())
    ])
    results = sqlc.createDataFrame(sc.emptyRDD(), schema=results_schema)
    for classifier, model in models.items():
        for i, combination in enumerate(model.getEstimatorParamMaps()):
            params = ["%s: %s" % (p.name, str(v)) for p, v in combination.items()]
            param_results = sc.parallelize(
                [(classifier, "-".join(params), model.avgMetrics[i])])
            param_results = sqlc.createDataFrame(
                param_results, schema=results_schema)
            results = results.union(param_results)
    results.coalesce(1).write.csv("fit-metrics", header=True)
def main(): sc = SparkContext("local", "Query 1") rawWeather, weatherHeader, cities = run.getRDDFromCSV( sc, Constants.WEATHER_DESCRIPTION_FILE) weatherDescription = rawWeather \ .subtract(weatherHeader) \ .filter(lambda l: re.search('^\d{4}-03|^\d{4}-04|^\d{4}-05', l)) # month filter daysOfMonth = weatherDescription \ .flatMap(lambda line: generateTuple(line, cities)) sqlc = SQLContext(sc) df = sqlc.createDataFrame(daysOfMonth) df.show() df.createOrReplaceTempView("dati") query1 = "SELECT city, year, month, day, sum(sunny) as n_sunny_h FROM dati GROUP BY city, year, month, day" df2 = sqlc.sql(query1) df2.show() #applicazione regola sunny day (75%) df2.createOrReplaceTempView("dati") query2 = "SELECT city, year, month, day FROM dati where n_sunny_h >13" df3 = sqlc.sql(query2) df3.show() #almeno 15 gg sereno al mese df3.createOrReplaceTempView("dati") query2 = "SELECT city, year, month, count(*) AS n_day FROM dati GROUP BY year,city, month" df4 = sqlc.sql(query2) df4.show() #filtra n_giorni df4.createOrReplaceTempView("dati") query2 = "SELECT city, year, month, n_day FROM dati where n_day>=15" df5 = sqlc.sql(query2) df5.show() # filtra n mesi = 3 df5.createOrReplaceTempView("dati") query2 = "SELECT city, year, count(*) AS n_month FROM dati GROUP BY city, year " df6 = sqlc.sql(query2) df6.show() df6.createOrReplaceTempView("dati") query2 = "SELECT city, year FROM dati WHERE n_month = 3 " df7 = sqlc.sql(query2) df7.show()
def func2(rdd):
    '''
    Using what we covered earlier, a quick review:
        create a DataFrame from an RDD,
        register the DataFrame as a table,
        query it with SQL,
        and save the resulting DataFrame in whatever format is needed.
    '''
    sqlContext = SQLContext(rdd.context)
    newrdd = rdd.map(lambda line: [line])
    df = sqlContext.createDataFrame(
        newrdd, StructType([StructField(name="content", dataType=StringType())]))
    df.createOrReplaceTempView("data")
    sqlContext.sql("select content from data").show()
def prediction_wrapper(net):
    def prediction_map_func(row):
        cols_map = {}
        for col in column_names:
            cols_map[col] = row[col]
        bmu, bmu_idx = find_bmu(row['features'], net)
        cols_map["bmu"] = Vectors.dense(bmu[0])
        cols_map["bmu_idx"] = Vectors.dense(bmu_idx)
        return Row(**cols_map)

    rdd_prediction = df.rdd.map(lambda row: prediction_map_func(row))
    # getting existing sparkContext
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)
    return sqlContext.createDataFrame(rdd_prediction)
def naive_bayes_classify(comment_preprocessed):
    sc = SparkContext(appName="Classification")
    sql_context = SQLContext(sc)
    data = sql_context.createDataFrame(comment_preprocessed)
    train, test = data.randomSplit([0.7, 0.3], 1234)

    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    model = nb.fit(train)
    predictions = model.transform(test)

    evaluate_classification(predictions)
    time.sleep(1)
    # predict_comment(sql_context, model)
    compare_classification_with_tool(sql_context, model)
def get_rdd_from_df(df):
    """
    takes a pandas df and returns a spark RDD
    """
    from pyspark import SparkContext, SQLContext
    from pyspark.mllib.linalg import Vectors

    sc = SparkContext.getOrCreate()
    from warnings import warn
    warn("get_rdd_from_df creates a spark context, it is recommended"
         " that you use SparkContext.getOrCreate() to prevent multiple context"
         " creation")
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df)
    rdd = spark_df.rdd.map(
        lambda data: Vectors.dense([float(x) for x in data]))
    return rdd
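A hedged usage sketch for the helper above (the column names and values are illustrative only, not from the original source):

import pandas as pd

pdf = pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]})
rdd = get_rdd_from_df(pdf)
print(rdd.take(2))  # two mllib DenseVectors, one per input row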
def main(account_name, account_key): sc = SparkContext() sqlContext = SQLContext(sc) patient_records_container = 'patientrecords' glucose_levels_container = 'glucoselevelsaggs' preds_container = 'predictions' blob_service = BlobService(account_name=account_name, account_key=account_key) blob_service.create_container(preds_container) day_to_predict = get_most_recent_date(blob_service, glucose_levels_container) df = get_df_from_blob(blob_service, glucose_levels_container, patient_records_container, day_to_predict) project_path = 'wasb://model@{}.blob.core.windows.net/{}' si_pipe_model = PipelineModel.read().load(path=project_path.format(account_name, 'si_pipe_model')) oh_pipe_model = PipelineModel.read().load(path=project_path.format(account_name, 'oh_pipe_model')) model = RandomForestClassificationModel.read().load(path=project_path.format(account_name, 'model')) df_spark = sqlContext.createDataFrame(df) df_preds = si_pipe_model.transform(df_spark) df_preds = oh_pipe_model.transform(df_preds) num_var_names = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'glucose_min', 'glucose_max', 'glucose_mean', 'glucose_var'] cat_var_names = ['race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'payer_code', 'medical_specialty', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'diag_1_missing', 'diag_2_missing', 'diag_3_missing', 'race_missing', 'weight_missing', 'payer_code_missing', 'medical_specialty_missing'] va = VectorAssembler(inputCols=(num_var_names + [c + "__encoded__" for c in cat_var_names]), outputCol='features') df_preds = va.transform(df_preds).select('features') df_preds = model.transform(df_preds) df_preds_pandas = df_preds.toPandas() df_preds_pandas = pd.concat([df[['patient_nbr', 'discharge_date']], df_preds_pandas['probability'].map(lambda x: x[1])], axis=1) # Save the predictions blob_service.put_block_blob_from_text(blob_name='-'.join(str(day_to_predict).split('/')) + '.csv', container_name=preds_container, text=df_preds_pandas.to_csv(index=False)) return
def simple_test_dataframe(sc: SparkContext):
    py_data = read_data("data/energy_agg_test.json")
    rdd = sc.parallelize(py_data)
    sqlContext = SQLContext(sc)
    # Note: these seven fields all share the name 'energy', and neither this
    # schema nor the rdd above is actually passed to createDataFrame below.
    schema = StructType([
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False)
    ])
    df = sqlContext.createDataFrame(py_data)
    print_type_value(df)
def get_result(function, param=None):
    pandas_dataframe = get_requireddataframe_fromcsv(
        'Latest_women_shoes.csv', ['id', 'brand', 'colors', 'dateAdded'])
    conf = SparkConf().setAppName('Women Catalog')
    sc = SparkContext(conf=conf)
    # df2 = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('sample.csv')
    # used pandas dataframe as using the above the file could not be located.
    sqlContext = SQLContext(sc)
    spark_dataframe = sqlContext.createDataFrame(pandas_dataframe)
    # data = spark_dataframe.select("*").toPandas()
    result_spark_dataframe = getattr(sys.modules[__name__], function)(spark_dataframe, param)
    result_python_dataframe = result_spark_dataframe.toPandas()
    result_dict = result_python_dataframe.to_dict('records')
    sc.stop()
    return result_dict
def multilayer_perceptron_classify(comment_preprocessed):
    sc = SparkContext(appName="Classification")
    sql_context = SQLContext(sc)
    data = sql_context.createDataFrame(comment_preprocessed)
    train, test = data.randomSplit([0.7, 0.3], 1234)

    # sqrt(2000) = 45, sqrt(4000) = 63, log(2000, 2) = 11
    layers = [len(comment_preprocessed[0].features), 11, 2]
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers,
                                             blockSize=128, seed=1234)
    model = trainer.fit(train)
    predictions = model.transform(test)

    evaluate_classification(predictions)
    time.sleep(1)
    # predict_comment(sql_context, model)
    compare_classification_with_tool(sql_context, model)
def sample_function(sc: SparkContext):
    schema = StructType([StructField("odd_numbers", IntegerType(), True)])
    print(" Odds number sample")
    big_list = range(10)
    rdd = sc.parallelize(big_list, 2)
    odds = rdd.filter(lambda x: x % 2 != 0)
    odds.foreach(my_print)

    sql_context = SQLContext(sc)
    odd_numbers = sql_context.createDataFrame(odds.map(lambda _: Row(_)), schema)
    odd_numbers.printSchema()
    odd_numbers.show(truncate=False)
    print("odd_numbers count:" + str(odd_numbers.count()))

    odd_numbers.createOrReplaceTempView("odd_numbers_table")
    sql_context.sql("select * from odd_numbers_table limit 2;").show()
    return (odd_numbers)
fields = [ StructField('logintype', StringType(), True), StructField('logtype', StringType(), True), StructField('hosid', StringType(), True), StructField('suppid', StringType(), True), StructField('logtime', LongType(), True), StructField('usermac', StringType(), True) ] schema = StructType(fields) rdd1 = rdd.map(convert_logtype).filter(lambda tup: tup != None) # rdd1.foreach(printx) # sc.stop() ret_df = sqlContext.createDataFrame(rdd1, schema) ret_df.registerTempTable("loginflowlog_overall") _sql = "SELECT count(usermac) pv,count(distinct usermac) uv,logtype " \ "from loginflowlog_overall " \ "group by logtype" rs_df = sqlContext.sql(_sql) service = LoginflowlogMysqlService() ret_overall_list = service.getRetOverall(rs_df.collect(), day) _sql_delete = "delete from login_flow_global_count where date ='%s'" % day _sql_insert = "insert into login_flow_global_count(date," \ "prelogin_num,prelogin_pnum,login_num,login_pnum," \ "login_click_num,login_click_pnum,forward_num,forward_pnum," \ "preArrive_num,preArrive_pnum,arrive_num,arrive_pnum) " \ "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" service.write_mysql(ret_overall_list, _sql_delete, _sql_insert)
from pyspark.sql import SQLContext,Row #from pyspark.sql import Functions as F dataDir = "/home/rsk/Documents/Spark" userData = sc.textFile(dataDir+"/ml-100k/u.user").map(lambda x : x.split("|")) movieData = sc.textFile(dataDir+"/ml-100k/u.item").map(lambda x : x.split("|")) ratingData = sc.textFile(dataDir+"/ml-100k/u.data").map(lambda x : x.split("\t")) #%% ratingDataDF = ratingData.map(lambda x : Row(userID = int(x[0]), movieID = int(x[1]), rating=float(x[2]), timestamp = int(x[3]))) ratingDataDF = sqlContext.createDataFrame(ratingDataDF) userDataDF = userData.map(lambda x : Row(userID=int(x[0]), age = int(x[1]), gender = x[2], occupation = x[3], zipcode = x[4])) userDataDF = sqlContext.createDataFrame(userDataDF) movieDataDF = movieData.map(lambda x : Row(movieID = int(x[0]), movieTitle = x[1], releaseDate = x[2], videoReleaseDate = x[3], IMDBurl = x[4], unknown= int(x[5]), action = int(x[6]),
# setup
# import numpy as np

# create random data
n = 52
prices = [float(list(5 + abs(np.random.randn(1)) * 100)[0]) for i in range(n)]
dates = [datetime(year=np.random.randint(2000, 2016),
                  month=np.random.randint(1, 12),
                  day=np.random.randint(1, 28)).date() for i in range(n)]
groups = [np.random.randint(1, 100) for i in range(n)]
data = [{"price": price, "date": _date, "group": group}
        for price, _date, group in zip(prices, dates, groups)]

df = sqlContext.createDataFrame(data)
print('df initial')
df.show()

# convert to rdd of dicts
rdd = df.rdd
rdd = rdd.map(lambda x: x.asDict())

#
# get deciles
#
total_num_rows = rdd.count()
column_to_decile = 'price'
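The snippet above stops right after choosing the column; a minimal sketch of one way the decile assignment could continue, under the assumption that rows stay as plain dicts (this is not the original code):

# sort by the chosen column, index each row, and bucket the indices into ten groups
sorted_rdd = rdd.sortBy(lambda d: d[column_to_decile])
indexed = sorted_rdd.zipWithIndex()
with_decile = indexed.map(
    lambda pair: dict(pair[0], decile=int(pair[1] * 10 / total_num_rows) + 1))
print(with_decile.take(3))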
#(u'2015-48_6C25B958F2CC_175', u'2015120120') #rdd1.foreach(my_print) #(u'2015-50_7014A62FA5B0_0', [u'22',u'23']) rdd1_2 = rdd1_1.groupByKey().mapValues(list).sortByKey().map(times_count_first) #(u'2015-48_903C920CAE97_655', [u'15_1']) #rdd1_2.foreach(my_print) rdd2_1 = df.rdd.map(convert_kv_last) rdd2_2 = rdd2_1.groupByKey().mapValues(list).sortByKey().map(times_count_last) rdd3 = rdd1_2.join(rdd2_2).map(convert_rets).values().flatMap(list) #(u'2015', u'48', u'A09347EC9FBB', u'189', u'13', u'1', u'14', u'1') rdd3.foreach(my_print) logger.info(rdd3.count()) fields = [ StructField('year', StringType(), True), StructField('week', StringType(), True), StructField('mac', StringType(), True), StructField('hosid', StringType(), True), StructField('firstTime', StringType(), True), StructField('firstCount', LongType(), True), StructField('lastTime', StringType(), True), StructField('lastCount', LongType(), True) ] schema = StructType(fields) df1 = sqlContext.createDataFrame(rdd3,schema) df1.coalesce(2).write.parquet(output,'overwrite') sc.stop()
def main(): conf = SparkConf().setAppName("climate") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) climateSchema = StructType( [ StructField("station", StringType(), False), StructField("date", IntegerType(), False), StructField("element", StringType(), False), StructField("value", IntegerType(), True), StructField("mflag", StringType(), True), StructField("qflag", StringType(), True), StructField("sflag", StringType(), True), StructField("obstime", StringType(), True), ] ) info = sqlContext.read.format("com.databricks.spark.csv").options(header="false").schema(climateSchema).load(inputs) info.registerTempTable("info") stationinfo = sqlContext.sql("SELECT station, date, element, value, FLOOR(date/10000) as yy FROM info ") stationinfo.registerTempTable("stationinfo") stationinfo.cache() prcpTable = sqlContext.sql("SELECT station, date, value as prcp, yy FROM stationinfo WHERE element='PRCP' ") prcpTable.registerTempTable("prcpTable") prcpTable.cache() # prcpTable.show() # create 3 tables that hold the monthly average of min, max temperature and prcp yearlyprcp = sqlContext.sql( "SELECT station, yy, ROUND(Avg(prcp),0) as avg_prcp FROM prcpTable GROUP BY station, yy " ) yearlyprcp.registerTempTable("prcpMean") # yearlyprcp.show() # get information about stations from stations.txt def getdata(line): line = line.split(" ") values = [x.strip() for x in line] return values stations = sc.textFile(input2) stations = stations.map(getdata) stations = stations.map(lambda (a, b, c): Row(station=a, latitude=float(b), longitude=float(c))).cache() stationDF = sqlContext.createDataFrame(stations) stationDF.registerTempTable("StationTable") stationDF.cache() # param = sqlContext.sql("SELECT MAX(latitude) as max_lat, Min(latitude) as min_lat, MAX(longitude) as max_long, Min(longitude) as min_long FROM StationTable") # param.show() # Join to station file to add latitude and longitude and stationID result = ( stationDF.join(yearlyprcp) .where(stationDF.station == yearlyprcp.station) .select(yearlyprcp.avg_prcp, yearlyprcp.station, yearlyprcp.yy, stationDF.latitude, stationDF.longitude) ) # save into parquet file result.write.format("parquet").save(output)
if __name__ == "__main__": file_path = os.path.abspath("../doc/book.txt") print file_path conf = SparkConf().setAppName("schema_test").setMaster("local") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) lines = sc.textFile(file_path) # 切分 parts = lines.map(lambda lines: lines.split(",")) # 隐射表间关系(定义表结构) book = parts.map(lambda book: Row(name=book[0], author=book[1], price=float(book[2]), publish=book[3])) # 转换成schema并注册 schemaPeople = sqlContext.createDataFrame(book) schemaPeople.registerTempTable("book") # 定义sqk语句(查询prize在50、60之间的书) book = sqlContext.sql("SELECT * FROM book WHERE price > 50.0 AND price < 60 OR name LIKE '%Spark%'") # 查询结果进行隐射 bookMap = book.map(lambda books: (books.name, books.author, books.price, books.publish)) for book in bookMap.collect(): print "|Name: " + book[0], "|Author: " + book[1], "|Price: " + str(book[2]), "|Publish: " + book[3] + "|" sc.stop()
download_flow(*)  upload_flow(*)  os  browser  ratio  batch_no  user_type  supp_id
'''
user_login = parts.map(lambda p: (p[1].strip(), p[2].strip(), p[17].strip(), p[3].strip(), p[16].strip(),
                                  p[4].strip(), p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(),
                                  p[9].strip(), p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(),
                                  p[14].strip(), p[15].strip()))

schema_string = "id gw_id supp_id user_id user_type " \
                "user_name login_time logout_time mac ip " \
                "user_agent download_flow upload_flow os browser " \
                "ratio batch_no"
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
schema = StructType(fields)
df = sql_context.createDataFrame(user_login, schema)
df.registerTempTable("tb_user_login_info")

#_sql="select distinct mac,gw_id,'%s' as day,'1' as flag from tb_user_login_info" % date
_sql = "select distinct user_name,gw_id,'%s' as day,'1' as flag from tb_user_login_info" % date
rs = sql_context.sql(_sql)
# the query above selects user_name (not mac), and reduceByKey needs a two-argument function
re_rdd = rs.rdd.map(lambda r: (r.user_name + "_" + r.gw_id, r.day + sep + r.flag)) \
    .reduceByKey(lambda v1, v2: v1, 1)

lines = []
for t in re_rdd.collect():
    line = t[0] + sep + t[1]
    lines.append(line)
BaseService()._write_file(lines, output)
# -*- coding: utf-8 -*-
__author__ = 'wxmimperio'

from pyspark import SparkContext, SparkConf
from pyspark import SQLContext, Row
import os

if __name__ == "__main__":
    file_path = os.path.abspath("../doc")
    conf = SparkConf().setMaster("local[2]").setAppName("schema_merging")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # create DataFrames and write them as two partitions of the same parquet table
    df1 = sqlContext.createDataFrame(sc.parallelize(range(1, 6)).map(lambda i: Row(single=i, double=i * 2)))
    df1.write.parquet(file_path + "/result/key=1")

    df2 = sqlContext.createDataFrame(sc.parallelize(range(6, 11)).map(lambda i: Row(single=i, triple=i * 3)))
    df2.write.parquet(file_path + "/result/key=2")

    df3 = sqlContext.read.option("mergeSchema", "true").parquet(file_path + "/result")
    df3.printSchema()

    # print df3.collect()
    for row in df3.collect():
        print("single=" + str(row[0]), "triple=" + str(row[1]), "double=" + str(row[2]), "key=" + str(row[3]))
    sc.stop()
# Load and parse the data
# line format: (station, latitude, longitude,)
def parsePoint(line):
    return LabeledPoint(line[0], line[1:])

# read data from station file
def getdata(line):
    line = line.split(' ')
    values = [x.strip() for x in line]
    return values

stations = sc.textFile(input)
stations = stations.map(getdata)
# tuple unpacking in a lambda is Python 2 only; index into the parsed list instead
stations = stations.map(lambda v: (float(hash(v[0])), int(year), float(v[1]), float(v[2]))).cache()
stationsDF = sqlContext.createDataFrame(stations)

# create dataset to fit into model
parseData = stations.map(parsePoint)

# load the model
sameModel = LinearRegressionModel.load(sc, myModelPath)

# run the model
stationidAndPreds = parseData.map(lambda p: (p.label, float(sameModel.predict(p.features))))
stationidAndPredsDF = sqlContext.createDataFrame(stationidAndPreds)

# the result returns a predicted value for each station (stationId) in the given year
# joining the stations rdd with stationidAndPreds to find the latitude and longitude of each station
result = stationsDF.join(stationidAndPredsDF).where(stationidAndPredsDF[0] == stationsDF[0]) \
    .select(stationidAndPredsDF[1], stationsDF[2], stationsDF[3])
.setAppName("adhoscount") .set("spark.kryoserializer.buffer.mb", "256") .set("spark.sql.parquet.binaryAsString","true") ) sc = SparkContext(conf = conf) sqlContext = SQLContext(sc) _adloadDF=sqlContext.read.parquet(adLoadFiles) _adloadRdd=_adloadDF.rdd.map(lambda x:(x.guuid,x.hosid)).groupByKey().map(fetchOne) fields = [ StructField('guuid', StringType(), True), StructField('hosid', StringType(), True), ] schema = StructType(fields) schemaDest = sqlContext.createDataFrame(_adloadRdd, schema) schemaDest.registerTempTable("ghid") _adloadDF.registerAsTable("adload") sqlContext.read.parquet(adPlayFiles).registerAsTable("adplay") sqlContext.read.parquet(adClickFiles).registerAsTable("adclick") ''' _adLoadDF=sqlContext.createDataFrame([ {'uid': '1', 'adid': 'a','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823568766}, {'uid': '2', 'adid': 'b','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823569766}, {'uid': '3', 'adid': 'c','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823550766}, {'uid': '4', 'adid': 'd','guuid':'bb','guuidctime':1,'url':'','referer':'','hosid':'133','gwid':'','ua':'','ip':'','createtime':1450823268766}, ]).registerAsTable("adload") _adPlayDF=sqlContext.createDataFrame([ {'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823568766},
""" from pyspark import SparkContext, SQLContext # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression from pyspark.ml.feature import HashingTF, Tokenizer # $example off$ if __name__ == "__main__": sc = SparkContext(appName="PipelineExample") sqlContext = SQLContext(sc) # $example on$ # Prepare training documents from a list of (id, text, label) tuples. training = sqlContext.createDataFrame([ (0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"]) # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=10, regParam=0.01) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. model = pipeline.fit(training) # Prepare test documents, which are unlabeled (id, text) tuples. test = sqlContext.createDataFrame([
from pyspark import SparkContext, SQLContext
# $example on$
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="EstimatorTransformerParamExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = sqlContext.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
    # LogisticRegression instance.
sc = SparkContext(conf = conf) sqlContext = SQLContext(sc) df = sqlContext.read.parquet(logFile) destDF=df.select('logintype','logtype','hosid','suppid','logtime','usermac','gwid').map(lambda x:trimf(x)) fields = [ StructField('logintype', StringType(), True), StructField('logtype', StringType(), True), StructField('hosid', StringType(), True), StructField('suppid', StringType(), True), StructField('logtime', LongType(), True), StructField('usermac', StringType(), True), StructField('gwid', StringType(), True) ] schema = StructType(fields) schemaDest = sqlContext.createDataFrame(destDF, schema) schemaDest.registerTempTable("loginflowlog") sqlContext.registerFunction("todatestr", lambda x:longTime2str(x),StringType()) sqlContext.registerFunction("trimx", lambda x:trimx(x),StringType()) midDF = sqlContext.sql("select count(1) userlogintimes,count(distinct(usermac)) userlogincount,hosid,gwid,todatestr(logtime) day from loginflowlog " "where logtype like '5-%-arrive' and gwid!='' group by hosid,gwid,todatestr(logtime)") hosiddayList=midDF.rdd.map(lambda x:(x[2],x[3],x[4],x[2],x[3],x[4])).collect() resultList=midDF.rdd.collect() dao=MysqlDao() dao.insertMany('INSERT INTO `bblink_data`.`bblink_data_hos_subject` (`hosid`,`gwid`,`day`)VALUES(%s,%s,%s) ON DUPLICATE KEY UPDATE hosid=%s,gwid=%s,day=%s',hosiddayList) dao.insertMany("update `bblink_data`.`bblink_data_hos_subject` set userlogintimes=%s,userlogincount=%s where hosid=%s and gwid=%s and day=%s",resultList);
def applyModel(fileName, loadModelName, outlierPercentile = 100): sc = SparkContext( 'local', 'pyspark') sqlContext = SQLContext(sc) ######### # load data ######### data = sc.textFile(fileName) #extract header and remove it header = data.first() data = data.filter(lambda x:x !=header).cache() header = header.split('\t') #parse data data = data.map(lambda x : x.split('\t')) ######### # prepare features ######### df = sqlContext.createDataFrame(data, header) df = (df.withColumn("ADLOADINGTIME",func.regexp_replace('ADLOADINGTIME', 'null', '0').cast('float')) .withColumn("TIMESTAMP",func.regexp_replace('TIMESTAMP', 'null', '0').cast('int')) .withColumn("GEOIP_LAT",func.regexp_replace('GEOIP_LAT', 'null', '0').cast('int')) .withColumn("GEOIP_LNG",func.regexp_replace('GEOIP_LNG', 'null', '0').cast('int')) .withColumn("HOSTWINDOWHEIGHT",func.regexp_replace('HOSTWINDOWHEIGHT', 'null', '0').cast('int')) .withColumn("HOSTWINDOWWIDTH",func.regexp_replace('HOSTWINDOWWIDTH', 'null', '0').cast('int')) .withColumn("TOPMOSTREACHABLEWINDOWHEIGHT",func.regexp_replace('TOPMOSTREACHABLEWINDOWHEIGHT', 'null', '0').cast('int')) .withColumn("TOPMOSTREACHABLEWINDOWWIDTH",func.regexp_replace('TOPMOSTREACHABLEWINDOWWIDTH', 'null', '0').cast('int')) ) thr = np.percentile(df.select("ADLOADINGTIME").rdd.collect(), outlierPercentile) df = df.filter(func.col('ADLOADINGTIME') < thr) df = df.withColumn("TOPMOSTREACHABLEWINDOWAREA", func.col("TOPMOSTREACHABLEWINDOWHEIGHT")*func.col("TOPMOSTREACHABLEWINDOWWIDTH")) df = df.withColumn("INTENDENTISACTUALDEVICETYPE", (func.col("ACTUALDEVICETYPE")==func.col("INTENDEDDEVICETYPE")).cast('int')) df = df.withColumn("COMBINEDID", func.concat( func.col('ACCOUNTID'), func.col('CAMPAIGNID'), func.col('CREATIVEID'), func.col('SDK')) ) #df = df.withColumn("COMBINEDID", func.regexp_replace("COMBINEDID", '^$', 'NA')) df = df.withColumn("COMBINEDEXTERNALID", func.concat( func.regexp_replace('EXTERNALADSERVER', 'null', ''), func.regexp_replace('EXTERNALPLACEMENTID', 'null', ''), func.regexp_replace('EXTERNALSITEID', 'null', ''), func.regexp_replace('EXTERNALSUPPLIERID', 'null', '') )) #df = df.withColumn("COMBINEDEXTERNALID", func.regexp_replace("COMBINEDEXTERNALID", '^$', 'NA')) df = df.withColumn("PLATFORMCOMBINED", func.concat( func.regexp_replace('PLATFORM', 'null', ''), func.regexp_replace('PLATFORMVERSION', 'null', '') )) #df = df.withColumn("PLATFORMCOMBINED", func.regexp_replace("PLATFORMCOMBINED", '^$', 'NA')) df = df.withColumn("UA_OSCOMB", func.concat( func.regexp_replace('UA_OS', 'null', ''), func.regexp_replace('UA_OSVERSION', 'null', '') )) #df = df.withColumn("UA_OSCOMB", func.regexp_replace("UA_OSCOMB", '^$', 'NA')) df = df.withColumn("FILESJSON_SIZE", func.regexp_replace('FILESJSON', '[^,\d]', '') ) df = df.withColumn("FILESJSON_SIZE", func.regexp_replace('FILESJSON_SIZE', '^,', '') ) df = df.withColumn("FILESJSON_SIZE", func.regexp_replace('FILESJSON_SIZE', ',,', ',') ) udf = func.udf(lambda x: int(np.fromstring(x,dtype=int, sep=',').sum()), IntegerType()) df = df.withColumn("FILESJSON_SIZE", udf("FILESJSON_SIZE")) print('Loaded and prapared %d entries' % df.count()) ######### # keep only needed features ######### features = ['ADLOADINGTIME', 'PLACEMENTID', 'TIMESTAMP', 'CREATIVETYPE', 'UA_HARDWARETYPE', 'UA_VENDOR', 'UA_MODEL', 'UA_BROWSER', 'UA_BROWSERVERSION', 'FILESJSON', 'ERRORSJSON', 'TOPMOSTREACHABLEWINDOWAREA', 'FILESJSON_SIZE', 'COMBINEDID', 'COMBINEDEXTERNALID', 'PLATFORMCOMBINED', 'UA_OSCOMB', 'SDK', 'EXTERNALADSERVER' ] df = df.select(features) ######### # 
Convert categorical features to numerical ######### featuresCat = [ 'PLACEMENTID', 'CREATIVETYPE', 'UA_HARDWARETYPE', 'UA_VENDOR', 'UA_MODEL', 'UA_BROWSER', 'UA_BROWSERVERSION', 'FILESJSON', 'ERRORSJSON', 'COMBINEDID', 'COMBINEDEXTERNALID', 'PLATFORMCOMBINED', 'UA_OSCOMB', 'SDK', 'EXTERNALADSERVER' ] for i in range(len(featuresCat)): indexer = StringIndexer(inputCol=featuresCat[i], outputCol='_'+featuresCat[i]).setHandleInvalid("skip").fit(df) df = indexer.transform(df).drop(featuresCat[i]) writer = indexer._call_java("write") writer.overwrite().save("indexer_" + featuresCat[i]) featuresCat = [ '_' + featuresCat[i] for i in range(len(featuresCat))] features = featuresCat[:] features.append('TIMESTAMP') features.append('FILESJSON_SIZE') features.append('TOPMOSTREACHABLEWINDOWAREA') ######### # Assemble features ######### assembler = VectorAssembler( inputCols=features, outputCol="features") df = assembler.transform(df) ######### # Convert to labeled point ######### lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features")) .map(lambda row: LabeledPoint(row.label, row.features))) lp.cache() ######### # Load trained model ######### model = RandomForestModel.load(sc, loadModelName) print('Model loaded!') predictions = model.predict(lp.map(lambda x: x.features)).collect() return predictions
StructField('mac', StringType(), True), StructField('hosid', StringType(), True), StructField('loginPage', IntegerType(), False), StructField('forwardPage', IntegerType(), False), StructField('arrivePage', IntegerType(), False) ] schema = StructType(fields) # compute pages rdd1 = rdd.map(convert_logtime)\ .map(convert_kv)\ .groupByKey().mapValues(list).map(convert_set) #(u'20151201_74:AD:B7:78:03:86_119', set([u'1-prelogin', u'2-mobile-login'])) rdd1_2 = rdd1.map(convert_visitpage) df1 = sqlContext.createDataFrame(rdd1_2,schema) #.registerTempTable("mid_uservisitpage_day") _output = output+"/mid_uservisitpage_day/dat=%s" % day df1.coalesce(2).write.parquet(_output,'overwrite') # compute times rdd2 = rdd.map(convert_kv2).groupByKey().mapValues(list).map(convert_sort) # (u'20151201_38AA3C3DBC12_127', ['2015120119', '2015120121']) rdd2_2 = rdd2.map(convert_days) #rdd2_2.foreach(my_print) fields = [ StructField('day', StringType(), True), StructField('mac', StringType(), True), StructField('hosid', StringType(), True), StructField('firstTime', StringType(), True),
nn_gridsearch.debug('-'*40)
nn_gridsearch.debug('Execution time: %s' % str(datetime.now()))
# with open('~/.aws/credentials.json') as f:
#     CREDENTIALS = json.load(f)

sc = set_spark_context()
conn = S3Connection()
sqc = SQLContext(sc)
sm = SparkModel(sc, conn, rdd_path='rdd.pkl')

# swap (key, (bow, label)) pairs to (label, bow); tuple unpacking in a lambda is Python 2 only
bow_rdd = sm.RDD.join(sm.target).map(lambda kv: (kv[1][1], kv[1][0])) \
    .sample(withReplacement=False, fraction=.5, seed=1)
df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)

results = []
num_features = 5000
min_doc_freq = 20
layers = [[5000, 2056, 512, 128, 2],
          [5000, 1000, 128, 2],
          [5000, 100, 2],
          [5000, 5000, 2]]

for l in layers:
    remover = StopWordsRemover(inputCol="raw", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                          numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",
                minDocFreq=min_doc_freq)
    indexer = StringIndexer(inputCol="string_label", outputCol="label")
with(SparkContext(appName='My Spark Application')) as sc:
    print('It works!')
    sql_context = SQLContext(sc)

    # Allows it to work in parallel
    rdd = sc.parallelize([
        ('john', 1),
        ('tori', 2),
        ('alex', 3),
        ('julia', 4),
        ('chris', 5)
    ])

    # Combines keys together and add them up
    rdd = rdd.reduceByKey(lambda a, b: a + b)

    # Make table schema with types
    schema = StructType([
        StructField('name', StringType()),
        StructField('price', IntegerType())
    ])

    # Pass RDD with data and schema
    df = sql_context.createDataFrame(rdd, schema)
    # print rdd.take(5)
    df.show()
    df.printSchema()
class Credit:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("CREDIT")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath',
                          '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)
        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
        self.base = 'hdfs://master:9000/gmc/'

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table,
                                                     driver="com.mysql.jdbc.Driver").load()
        return df

    def sql_operate(self, sql, rdd, once_size=1000):
        # Write rows back to MySQL in batches of `once_size`
        temp = []
        for row in rdd.collect():
            if len(temp) >= once_size:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()
            temp.append(row)
        if len(temp) != 0:
            self.mysql_helper.executemany(sql, temp)
            temp.clear()

    def prepare_fpgrowth_data(self):
        tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN') \
            .filter("BILL_AMTFLAG = '+'").select('ACCTNBR', 'MER_CAT_CD') \
            .filter("MER_CAT_CD != 0").filter("MER_CAT_CD != 6013")

        result = tran_df.map(lambda x: (str(int(x['ACCTNBR'])), [str(int(x['MER_CAT_CD'])), ])).groupByKey()

        def m(x):
            # Collapse each account's merchant category codes into a set of distinct values
            l = list(x[1])
            v = set()
            for i in l:
                v.add(i[0])
            return v

        result = result.map(m)

        for i in result.take(10):
            print(i)

        model = FPGrowth.train(result, minSupport=0.05, numPartitions=10)
        result = model.freqItemsets().collect()
        for r in result:
            print(r)

    def cycle_credit(self):
        '''
        Pre-process credit card data for clustering (revolving credit)
        :return:
        '''
        print('--------------------------- Credit card - Start --------------------------')

        # Transaction records
        credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN') \
            .select('ACCTNBR', 'MONTH_NBR', 'BILL_AMT', 'BILL_AMTFLAG') \
            .filter("BILL_AMTFLAG ='-'").cache()

        # Card account information
        credit_acct_df = self.load_from_mysql('ACCT_D').select('ACCTNBR', 'MONTH_NBR', 'STM_MINDUE')

        # Repayment per account and billing month
        return_amt = credit_tran_df.groupBy('ACCTNBR', 'MONTH_NBR').sum('BILL_AMT')
        return_amt = return_amt.select('ACCTNBR', 'MONTH_NBR', return_amt['sum(BILL_AMT)'].alias('RETURNED'))

        # Drop rows with a zero minimum due, i.e. billing months with no spending
        join = credit_acct_df.join(return_amt, ['ACCTNBR', 'MONTH_NBR'], 'outer').filter('STM_MINDUE != 0')

        # Clear cached tables
        self.sqlctx.clearCache()

        def which_cycle_type(line):
            mindue = line['STM_MINDUE']
            returned = line['RETURNED']
            '''
            0: normal, fully repaid
            1: revolving credit
            2: overdue, nothing repaid
            '''
            if mindue is not None and returned is None:
                flag = 2
            elif returned >= mindue * 10:
                flag = 0
            elif returned > mindue and returned < mindue * 10:
                flag = 1
            else:
                flag = 9
            return Row(ACCTNBR=int(line['ACCTNBR']), MONTH_NBR=line['MONTH_NBR'],
                       DUE_FLAG=flag, STM_MINDUE=line['STM_MINDUE'])

        # Returns a PipelinedRDD
        join = join.map(which_cycle_type)

        # Convert back to a DataFrame
        join = self.sqlctx.createDataFrame(join)

        '''
        +-------+--------+-----+
        |ACCTNBR|DUE_FLAG|count|
        +-------+--------+-----+
        | 608126|       2|    1|
        | 608126|       0|    6|
        | 608868|       0|    4|
        '''

        # Group by repayment type
        each_type = join.groupBy(['ACCTNBR', 'DUE_FLAG'])
        # Number of billing months per repayment type
        each_type_count = each_type.count()
        # Sum of the minimum due per repayment type
        each_type_mindue_sum = each_type.sum('STM_MINDUE')
        # Total number of billing months per account
        all_type_count = each_type_count.groupBy('ACCTNBR').sum('count')

        # Join the three tables above
        rate = each_type_count.join(each_type_mindue_sum, ['ACCTNBR', 'DUE_FLAG'], 'outer') \
            .join(all_type_count, 'ACCTNBR', 'outer')
        # print(rate.columns)
        # ['ACCTNBR', 'DUE_FLAG', 'count', 'sum(STM_MINDUE)', 'sum(count)']

        # Keep only the revolving-credit rows
        # TODO: for now only revolving credit is used
        rate = rate.filter(rate['DUE_FLAG'] == 1)

        # Ratio of billing months that entered revolving credit
        rate = rate.select('ACCTNBR',
                           (rate['sum(STM_MINDUE)'] * 10).alias('CYCLE_AMT'),
                           rate['count'].alias('CYCLE_TIMES'),
                           (rate['count'] / rate['sum(count)']).alias('CYCLE_RATE'))
        # rate.show()
        # print(rate.count())

        def m(line):
            return line['CYCLE_TIMES'], line['CYCLE_AMT'], line['CYCLE_RATE'], line['ACCTNBR']

        sql = "update t_CMMS_TEMP_KMEANS_CREDIT set CYCLE_TIMES=%s,CYCLE_AMT=%s,CYCLE_RATE=%s where ACCTNBR=%s"
        df = rate.map(m)

        print('Updating the data in the database...')
        self.sql_operate(sql, df)

        # Set accounts that never entered revolving credit to 0
        print('Setting accounts that never entered revolving credit to 0...')
        self.mysql_helper.execute(
            "update t_CMMS_TEMP_KMEANS_CREDIT set CYCLE_TIMES=0,CYCLE_AMT=0,CYCLE_RATE=0 where CYCLE_TIMES is null ")

    def losing_warn(self, year, month):
        # # Compute the months of the two quarters (earlier approach, based on a season argument)
        # if season == 1:
        #     months_now = [1, 2, 3]
        #     months_before = [10, 11, 12]
        #     year_before = year - 1
        # else:
        #     months_now = [season * 3 - 2, season * 3 - 1, season * 3]
        #     months_before = [season * 3 - 5, season * 3 - 4, season * 3 - 3]
        #     year_before = year
        #
        # # Extract each quarter's data
        # for m in months_now:

        # Most recent month: look up its MONTH_NBR from the transaction table
        month = '%02d' % month
        month_nbr = None
        for i in range(2, 29):
            day = '%02d' % i
            date = str(year) + month + day
            print(date)
            sql = "select MONTH_NBR from t_CMMS_CREDIT_TRAN where INP_DATE = %s limit 1"
            try:
                month_nbr = self.mysql_helper.fetchone(sql, (date,))
                if month_nbr is None:
                    continue
                else:
                    month_nbr = int(month_nbr[0])
                    break
            except Exception:
                continue

        if month_nbr is None:
            raise Exception("There is no data in database for month:%s" % month)
        else:
            print('the latest month_nbr is %s' % month_nbr)

        months_now = [month_nbr - 2, month_nbr - 1, month_nbr]
        months_before = [month_nbr - 5, month_nbr - 4, month_nbr - 3]
        print('months_now', months_now)
        print('months_before', months_before)

        # Transaction records
        credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN') \
            .select('ACCTNBR', 'INP_DATE', 'MONTH_NBR', 'BILL_AMT', 'BILL_AMTFLAG') \
            .filter("BILL_AMTFLAG ='+'").cache()

        # Filter the transactions belonging to each of the two quarters
        months_now_filter = None
        months_before_filter = None
        for i in months_now:
            f = credit_tran_df.filter(credit_tran_df['MONTH_NBR'] == i)
            if months_now_filter is None:
                months_now_filter = f
            else:
                months_now_filter = months_now_filter.unionAll(f)

        for i in months_before:
            f = credit_tran_df.filter(credit_tran_df['MONTH_NBR'] == i)
            if months_before_filter is None:
                months_before_filter = f
            else:
                months_before_filter = months_before_filter.unionAll(f)

        months_now_filter.groupBy('MONTH_NBR').count().show()
        months_before_filter.groupBy('MONTH_NBR').count().show()

        months_now_count = months_now_filter.groupBy('ACCTNBR').count()
        months_before_count = months_before_filter.groupBy('ACCTNBR').count()
        months_now_count.show()
        months_before_count.show()

        join = months_now_count.select('ACCTNBR', months_now_count['count'].alias('NOW_COUNT')).join(
            months_before_count.select('ACCTNBR', months_before_count['count'].alias('BEFORE_COUNT')),
            'ACCTNBR', 'outer')
        join.show()

        def m(line):
            ncount = line['NOW_COUNT']
            bcount = line['BEFORE_COUNT']
            '''
            Growth rate encoding (as implemented below):
            9999:  no data in either quarter
            -9999: data only in the earlier quarter (lost customer)
            8888:  data only in the recent quarter (new customer)
            other: growth of the recent quarter over the earlier one, (s2 - s1) / s1 * 100
            '''
            if ncount is None:
                if bcount is None:      # no data in either quarter
                    increment = 9999
                else:                   # only the earlier quarter has data
                    increment = -9999
            else:
                if bcount is None:      # only the recent quarter has data
                    increment = 8888
                else:                   # both quarters have data
                    increment = round((ncount - bcount) / bcount * 100)

            '''
            Credit card life cycle (based on the growth rate):
            > 100      fast growing
            50 to 100  growing
            -50 to 50  stable
            < -50      declining
            lost       no more transactions at all
            '''
            if increment == -9999:
                life = 9   # no more transactions, completely lost
            elif increment > 100:
                life = 1   # fast growing
            elif increment <= 100 and increment > 50:
                life = 2   # growing
            elif increment <= 50 and increment > -50:
                life = 3   # stable
            elif increment <= -50:
                life = 4   # declining
            else:
                life = 9

            return line['ACCTNBR'], month_nbr, increment, life

        sql = "replace into t_CMMS_ANALYSE_CREDIT(ACCTNBR, MONTH_NBR, INCREMENT, LIFE, UPDATE_TIME) values(%s,%s,%s,%s,now())"
        rdd = join.map(m)
        print(type(rdd))
        self.sql_operate(sql, rdd)
# uid, adid, guuid, createtime
fields = [
    StructField('uid', StringType(), True),
    StructField('adid', StringType(), True),
    StructField('guuid', StringType(), True),
    StructField('guuidctime', LongType(), True),
    StructField('url', StringType(), True),
    StructField('referer', StringType(), True),
    StructField('hosid', StringType(), True),
    StructField('gwid', StringType(), True),
    StructField('ua', StringType(), True),
    StructField('ip', StringType(), True),
    StructField('createtime', LongType(), True),
]
schema = StructType(fields)

# [(),()] ['','']
df_dest = sqlContext.createDataFrame(rdd, schema)
df_dest.registerTempTable("back_portal_loginlog")
# df_dest.rdd.foreach(my_print)

# save
df_dest.write.parquet(output)

sc.stop()
parser.add_argument('deaths')
parser.add_argument('output')
args = parser.parse_args()

conf = SparkConf().setAppName("correlate")
sc = SparkContext(conf=conf)
sql = SQLContext(sc)

births_raw = sql.read.load(args.births).rdd
deaths_raw = sql.read.load(args.deaths).rdd

births = births_raw.map(to_joinable_on_id)
deaths = deaths_raw.map(to_joinable_on_id)
both = births.fullOuterJoin(deaths)

unjoined_births = both.filter(get_unjoined_births)
unjoined_deaths = both.filter(get_unjoined_deaths)
correctly_joined = both.filter(remove_unjoined_all).map(to_joined_format)

# Do a join with Jaro-Winkler matching
jaro_input_births = unjoined_births.map(to_jaro_matching_input)
jaro_input_deaths = unjoined_deaths.map(to_jaro_matching_input)
jaro_input_all = jaro_input_births.cartesian(jaro_input_deaths)
jaro_joined = jaro_input_all.filter(jaro_match).map(cart_to_joined_format)

to_save = sql.createDataFrame(correctly_joined)
to_save.write.save(args.output + '/joined', format="parquet")

to_save = sql.createDataFrame(jaro_joined)
to_save.write.save(args.output + '/jaro_joined', format="parquet")
if __name__ == "__main__":
    conf = SparkConf().setAppName("analysis_demo").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Register the user-defined function
    sqlContext.registerFunction("analysis_email", analysis_email)

    file_path = os.path.abspath("../doc/analysis.txt")
    lines = sc.textFile(file_path)
    info = lines.map(lambda lines: lines.split("----")). \
        map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                             idcard=info[3], password=info[4], phone=info[5]))

    schemaInfo = sqlContext.createDataFrame(info)
    schemaInfo.registerTempTable("information")

    # Cache the table
    # sqlContext.cacheTable("information")
    # sqlContext.uncacheTable("information")

    """
    Email analysis and statistics
    """
    email_str = "SELECT analysis_email(email) AS email FROM information"
    emailSQL = sqlContext.sql(email_str)

    # Total number of rows
    count = emailSQL.count()
    # Group-by counts
    emailCollect = emailSQL.groupBy("email").count().collect()
    # Email analysis result
def features_to_vec(length, entropy, alexa_grams, word_grams):
    # Derive binary indicator features from the raw length and entropy
    high_entropy = 0.0
    high_length = 0.0
    if entropy > 3.5:
        high_entropy = 1.0
    if length > 30:
        high_length = 1.0
    return Vectors.dense(length, entropy, high_entropy, high_length, alexa_grams, word_grams)


# dga_domains = sc.textFile("/user/cloudera/dga.txt")
# dga_domains = dga_domains.map(lambda x: (x, "dga", float(len(x)), entropy(x)))
# dga_domains_df = sqlctx.createDataFrame(dga_domains, schema).dropna().distinct().cache()

words = sc.textFile("/user/cloudera/words.txt")
words = words.map(lambda x: (x, "dict", float(len(x)), entropy(x)))
words_df = sqlctx.createDataFrame(words, schema).dropna().distinct().cache()

dga_domains = sc.textFile("/user/cloudera/c_domains_*")
dga_domains = dga_domains.map(lambda x: (x, "dga", float(len(x)), entropy(x)))
dga_domains_df = sqlctx.createDataFrame(dga_domains, schema).dropna().distinct().cache()

alexa_domains = sqlctx.read.format('com.databricks.spark.csv') \
    .options(header='false', inferschema='true').load('alexa_100k.csv') \
    .map(lambda x: (x[1], "legit", float(len(x[1])), entropy(x[1])))
alexa_domains_df = sqlctx.createDataFrame(alexa_domains, schema).dropna().distinct().cache()

alexa_domains_1M = sqlctx.read.format('com.databricks.spark.csv') \
    .options(header='false', inferschema='true').load('alexa_1M.csv') \
    .map(lambda x: (x[1], "legit", float(len(x[1])), entropy(x[1])))
alexa_domains_1M = sqlctx.createDataFrame(alexa_domains_1M, schema).distinct().cache()
mirror_dir = "data/mirror"
data_dir = "data/data-{0}".format(dataset_date)
out_dir = "data/bhl-{0}.parquet".format(dataset_date)

if os.path.isdir(out_dir):
    print("Output dir {0} exists".format(out_dir))
    exit(1)

get_ocr_udf = sql.udf(get_ocr, types.StringType())

fn = os.path.join(data_dir, "item.txt")

# Optional limit for testing, add this to the chain as second step
# .sample(withReplacement=False, fraction=0.001) \
sqlContext.createDataFrame(t_gen(fn, type_data_item), schema_item()) \
    .withColumn("ocrtext", get_ocr_udf(sql.col("barcode"))) \
    .write.parquet(out_dir)

# Example run on Elk (16 thread single machine)
# real    84m21.818s
# user    198m57.612s
# sys     15m19.662s

# Example run on okapi (128 thread single machine)
# real    41m13.984s
# user    482m34.084s
# sys     278m12.404s
    indexes = [1, 2, 7, 11, 5, 8, 20]
    return [record[i].replace('"', '') for i in indexes]


def filterData(record):
    flag = True
    if (int(record[-4]) < 1) or (record[-2] not in (['1', '4'])) or (record[-1] != ''):
        flag = False
    return flag


if __name__ == '__main__':
    sc = SparkContext(appName='CF_prod_in_transaction')
    sqlContext = SQLContext(sc)

    in_file = sc.textFile(sys.argv[1])
    data = in_file.map(oritentData).filter(filterData).map(lambda x: [int(i) for i in x[:-3]])

    Record = Row('customer_id', 'product_id', 'invoice_id', 'units')
    data = data.map(lambda x: Record(*x))
    data = sqlContext.createDataFrame(data)
    sqlContext.registerDataFrameAsTable(data, 'table1')

    df = sqlContext.sql('select customer_id, product_id, sum(units) as prod_in_transactions from table1 group by customer_id, product_id')
    df.map(lambda x: ','.join([str(r) for r in x])).saveAsTextFile(sys.argv[2])
    sc.stop()


# data_path, header, train_sample, number, support, confidence, lift, k,
# testing, testing_split, seed, output_path

import csv

# Copy arqiva.csv to test.csv, parsing and re-writing each row
read = open('arqiva.csv')
write = open('test.csv', 'w')
wrtr = csv.writer(write)
for line in csv.reader(read):
    wrtr.writerow(line)
def main(argv):
    # list of words to look for!
    GODWINS_WORDS = ['hitler', 'nazi']

    # setup inputs and outputs
    input_directory = argv[0]
    output_directory = argv[1]

    # spark specific setup
    conf = SparkConf().setAppName('godwin whaaa')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # read input
    text = sc.textFile(input_directory)
    text = text.repartition(200)

    # convert to magic json formatting
    loadedJson = text.map(lambda line: json.loads(line))

    # make the json skinnier by removing unwanted stuff
    fullRedditJson = loadedJson.map(lambda jObj: (jObj['body'], jObj['name'], jObj['parent_id'])).cache()

    # code from greg for regex to parse lines
    linere = re.compile(regex_from_words(GODWINS_WORDS))

    # now filter out stuff without GODWINS_WORDS: "body", "id", "subreddit", "parent_id"
    godwinJsonList = fullRedditJson.filter(lambda (body, name, parent_id): linere.match(body.lower()))

    # We don't need the comment body anymore...
    # We need to find the paths now...
    godwin_node_rdd = godwinJsonList.map(row_into_node).cache()
    full_node_rdd = fullRedditJson.map(row_into_node)

    # we also need a list of node names so we can later check if we already visited it.
    godwinNodes = godwin_node_rdd.map(lambda (name, parent_id): name)

    # Convert full data RDD into SQL Data Frame
    subredditSchema = StructType([
        StructField("name", StringType(), True),
        StructField("parent_id", StringType(), True)
    ])
    full_node_df = sqlContext.createDataFrame(full_node_rdd, subredditSchema)

    # Convert godwin rows RDD into SQL Data Frame
    godwinSchema = StructType([
        StructField("g_name", StringType(), True),
        StructField("g_parent_id", StringType(), True)
    ])
    godwin_node_df = sqlContext.createDataFrame(godwin_node_rdd, godwinSchema).cache()

    count_down = godwin_node_df.count()
    print 'There are', count_down, 'comments with a godwins word'

    depth = 0
    nodes_per_depth = {}
    visited_node_list_df = godwin_node_df.select(godwin_node_df.g_name)
    print 'visited_node_list_df'
    print str(visited_node_list_df.count())

    while count_down > 0 and depth < 100:
        depth += 1

        # Join to find the next layer of nodes
        joined_df = godwin_node_df.join(full_node_df, [godwin_node_df['g_parent_id'] == full_node_df['name']])

        # Drop the columns of the older node
        next_node_df = joined_df.select(
            joined_df['name'].alias('g_name'),
            joined_df['parent_id'].alias('g_parent_id')).cache()
        print 'next_node_df count: ' + str(next_node_df.count())

        # Select only the ones that have NOT been visited
        # TODO: is there a better way?
        leftt = next_node_df.join(visited_node_list_df,
                                  next_node_df.g_name == visited_node_list_df.g_name, 'left')
        next_node_df = leftt.select(next_node_df.g_name, next_node_df.g_parent_id,
                                    visited_node_list_df.g_name.alias('dup'))
        next_node_df = next_node_df.fillna({'dup': 'xxxxxx'})
        next_node_df = next_node_df.filter(next_node_df.dup == 'xxxxxx')
        next_node_df = next_node_df.drop(next_node_df.dup)

        # add the g_name to the list of visited nodes
        # TODO: make more efficient!
        visited_df = next_node_df.select(next_node_df.g_name)
        visited_node_list_df = visited_node_list_df.unionAll(visited_df)
        visited_node_list_df = visited_node_list_df.dropDuplicates()

        count_up = next_node_df.count()
        n_nodes = count_down - count_up
        print 'number of godwin nodes of height', depth, '=', n_nodes
        nodes_per_depth[depth] = n_nodes
        count_down = count_up
        godwin_node_df = next_node_df

    avg = compute_average_godwin(nodes_per_depth)
    print 'The average distance to the godwin words is', avg

    fp = open(output_directory + 'average.txt', 'w')
    fp.write(str(avg) + '\n')
    fp.close()
# Get all the ratings rows of our user
dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Return only the accommodations that have not been rated by our user
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))

# [START split_sets]
rddTraining, rddValidating, rddTesting = dfRates.rdd.randomSplit([6, 2, 2])
# [END split_sets]

# [START predict]
# Build our model with the best found values
# Rating, Rank, Iteration, Regulation
model = ALS.train(rddTraining, BEST_RANK, BEST_ITERATION, BEST_REGULATION)

# Calculate all predictions
predictions = model.predictAll(pairsPotential).map(lambda p: (str(p[0]), str(p[1]), float(p[2])))

# Take the top 5 ones
topPredictions = predictions.takeOrdered(5, key=lambda x: -x[2])
print(topPredictions)

schema = StructType([StructField("userId", StringType(), True),
                     StructField("accoId", StringType(), True),
                     StructField("prediction", FloatType(), True)])

# [START save_top]
dfToSave = sqlContext.createDataFrame(topPredictions, schema)
dfToSave.write.jdbc(url=jdbcUrl, table=TABLE_RECOMMENDATIONS, mode='overwrite')
# [END save_top]
class LogisticRegression:

    def __init__(self):
        # configuring spark
        self.spark_conf = SparkConf()
        self.sc = SparkContext(conf=self.spark_conf)
        self.sql_context = SQLContext(self.sc)

    def test_train(self, df, target, train_split, test_split, regularization=None, num_of_iterations=100):
        try:
            LOGGER.info("Generating logistic regression")
            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)

            train, test = spark_df.randomSplit([train_split, test_split], seed=1000000)

            X_train = train.select(*feature_columns).map(lambda x: list(x))
            y_train = train.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(train_data,
                                                               numClasses=numOfClasses, regParam=0,
                                                               regType=regularization, intercept=True,
                                                               iterations=num_of_iterations,
                                                               validateData=False)

            X_test = test.select(*feature_columns).map(lambda x: list(x))
            y_test = test.select(target).map(lambda x: x[0])

            prediction = X_test.map(lambda lp: (float(logistic_model.predict(lp))))
            prediction_and_label = prediction.zip(y_test)

            # Fraction of correctly classified test rows
            LOGGER.info(prediction_and_label.map(lambda labelAndPred: labelAndPred[0] == labelAndPred[1]).mean())
        except Exception as e:
            raise e

    def train(self, df, target, regularization=None, num_of_iterations=100):
        try:
            LOGGER.info("Generating logistic regression")
            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)

            X_train = spark_df.select(*feature_columns).map(lambda x: list(x))
            y_train = spark_df.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(train_data,
                                                               numClasses=numOfClasses, regParam=0,
                                                               regType=regularization, intercept=True,
                                                               iterations=num_of_iterations,
                                                               validateData=False)
            self.model = logistic_model
        except Exception as e:
            raise e

    def persist(self, location):
        try:
            LOGGER.info("Writing the model to location %s" % location)
            data = 'data'
            meta_data = 'metadata'

            # model.save() fails if the target directories already exist, so remove them first
            data_location = os.path.join(location, data)
            if os.path.exists(data_location):
                LOGGER.info("Removing directory %s" % data_location)
                shutil.rmtree(data_location)

            data_location = os.path.join(location, meta_data)
            if os.path.exists(data_location):
                LOGGER.info("Removing directory %s" % data_location)
                shutil.rmtree(data_location)

            self.model.save(self.sc, location)
        except Exception as e:
            raise e

    def predict(self, df):
        try:
            LOGGER.info("Predicting using logistic regression")
            spark_df = self.sql_context.createDataFrame(df)
            inp_data = spark_df.map(lambda x: list(x))
            result = self.model.predict(inp_data).collect()
            LOGGER.info("Predicted output is %s" % str(result))
            return result
        except Exception as e:
            raise e

    def load(self, location):
        try:
            self.model = LogisticRegressionModel.load(self.sc, location)
        except Exception as e:
            raise e