def main(input_dir, result_path):
    conf = SparkConf().setMaster("yarn-client").setAppName("avg") \
        .set('spark.executor.memory', '4G') \
        .set('spark.driver.memory', '4G') \
        .set('spark.driver.maxResultSize', '4G')
    sc = SparkContext(conf=conf)
    sqlContext = sql.SQLContext(sc)
    with open(result_path, "a") as f:
        for file in listdir(input_dir):
            with open(input_dir + "/" + file) as in_f:
                lines = in_f.read().splitlines()
            rdd = sc.parallelize(lines)
            row_rdd = rdd.map(lambda line: line.split(",")) \
                .filter(lambda line: len(line) == 2)
            total = row_rdd.map(lambda line: float(line[1])).sum()
            count = row_rdd.count()
            f.write(file + " " + str(total) + " " + str(count) + " "
                    + str(total / count) + "\n")
def consumer():
    conf = SparkConf().set(
        "spark.jars",
        "/home/tammy/Downloads/spark-streaming-kafka-0-8-assembly_2.11-2.4.4.jar")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 5)
    print("PROGRAM STARTING!!!!!!!!!")
    print("PROGRAM STARTING!!!!!!!!!")
    sqlContext = sql.SQLContext(sc)
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, ["sparky"], {"metadata.broker.list": "localhost:9091"})
    lines = directKafkaStream.map(lambda x: x[1])
    line_list = []

    def makeIterable(rdd):
        for x in rdd.collect():
            print(x)
            line_list.append(x)
        strippedlist = [sub.replace('\n', '').replace('\r', '').replace(' ', '')
                        for sub in line_list]
        dic = json.loads(strippedlist[0])
        flattened_list = [flatten(dic)]
        df = pd.DataFrame(flattened_list)
        print(df)

    lines.foreachRDD(makeIterable)
    ssc.start()
    ssc.awaitTermination()
def get_sqlcontext_instance(spark_context):
    """
    :type spark_context: pyspark.SparkContext
    :param spark_context: The currently active Spark Context
    :return: Returns the SQLContext
    :rtype: sql.SQLContext
    """
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = sql.SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']
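# A hedged usage sketch (not part of the original snippet): the singleton getter
# above is the pattern typically used inside DStream.foreachRDD callbacks, where a
# new SQLContext must not be built on every micro-batch. The `words` stream and
# the `word` field below are hypothetical.
from pyspark.sql import Row

def process_batch(time, rdd):
    if rdd.isEmpty():
        return
    # Reuse one SQLContext per driver instead of recreating it each batch.
    sql_context = get_sqlcontext_instance(rdd.context)
    df = sql_context.createDataFrame(rdd.map(lambda w: Row(word=w)))
    df.show()

# words.foreachRDD(process_batch)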
def Spark_read_write_csv_to_hdfs(inputType, fileList, outDirectory):
    sc = SparkContext(appName="DATA-local-to-HDFS")
    # Set output replication factor to 1
    sc._jsc.hadoopConfiguration().set("dfs.replication", "1")
    sqlContext = sql.SQLContext(sc)
    for filename in fileList:
        print('Reading ' + 'file://' + filename)
        rddFrame1 = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true', inferschema='true') \
            .load('file://' + filename)
        # rddFrame1.coalesce(1).write.format('com.databricks.spark.csv').save(outDirectory + filename[len(filename) - 33:])
        rddFrame1.write.format('com.databricks.spark.csv').save(
            outDirectory + filename[len(filename) - 33:])
        print('Writing ' + outDirectory + filename[len(filename) - 33:] + ' done!')
    sc.stop()
def main(data_paths: list):
    print(data_paths)
    print('Creating spark configs.')
    config = SparkConf().setAppName("Spark configuration").setMaster(cluster_address)
    print('Setting spark context and sql context.')
    spark_context = SparkContext(conf=config)
    spark_sql_context = sql.SQLContext(spark_context)
    print('Loading data to spark.')
    dataframe = spark_sql_context.read.option('header', 'true').csv(data_paths)
    print('Saving data in parquet format.')
    dataframe.write.mode("overwrite").parquet("/data/compressed.parquet")
def main():
    input_file, output_file = sys.argv[1], sys.argv[2]
    sc = SparkContext('local[*]', 'task1')
    sc.setLogLevel("ERROR")
    rdd = sc.textFile(input_file).map(lambda x: x.split()).cache()
    vertices_list = list(set(rdd.flatMap(set).collect()))
    edges_list = rdd.map(tuple).distinct().collect()
    edges_list_undirected = set()
    for i in range(len(vertices_list)):
        vertices_list[i] = tuple([vertices_list[i]])
    for edge in edges_list:
        if edge not in edges_list_undirected:
            edges_list_undirected.add(edge)
            edge2 = (edge[1], edge[0])
            if edge2 not in edges_list_undirected:
                edges_list_undirected.add(edge2)
    sqlContext = sql.SQLContext(sc)
    vertices = sqlContext.createDataFrame(vertices_list, ["id"])
    edges = sqlContext.createDataFrame(edges_list_undirected, ["src", "dst"])
    g = GraphFrame(vertices, edges)
    result = g.labelPropagation(maxIter=5)
    community_list = result.select("id", "label").collect()
    communities = {}
    for community in community_list:
        if community.label not in communities:
            communities[community.label] = []
        communities[community.label].append(community.id)
    communities_res = {}
    for c, ids in communities.items():
        if len(ids) not in communities_res:
            communities_res[len(ids)] = []
        ids_str = "', '".join(sorted(ids))
        ids_str = "'{}'".format(ids_str)
        communities_res[len(ids)].append(ids_str)
    with open(output_file, "w") as f:
        for k in sorted(communities_res):
            for id in sorted(communities_res[k]):
                f.write(id + "\n")
def consumer():
    conf = SparkConf().set(
        "spark.jars",
        "/home/fielemployee/spark-streaming-kafka-0-8-assembly_2.11-2.4.4.jar")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 5)
    print("PROGRAM STARTING!!!!!!!!!")
    print("PROGRAM STARTING!!!!!!!!!")
    sqlContext = sql.SQLContext(sc)
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, ["kafka_spark"], {"metadata.broker.list": "localhost:9094"})
    lines = directKafkaStream.map(lambda x: x[1])
    print("LINES START!!!")
    print("LINES START!!!")
    print("LINES START!!!")
    print("LINES START!!!")
def main(tweet_path, dataset_path):
    conf = SparkConf().setMaster("local").setAppName("Test")
    sc = SparkContext(conf=conf)
    sqlContext = sql.SQLContext(sc)
    # join
    # senti_score = sc.textFile("/user/ja3802/geo-data/*").map(lambda line: (line.split(",")[0], float(line.split(",")[1])))
    senti_score = sc.textFile(dataset_path).map(lambda line: line.split(",")) \
        .filter(lambda line: len(line) == 2) \
        .map(lambda line: (line[0], float(line[1])))
    # senti_score.saveAsTextFile("err")
    geodata = sc.textFile(tweet_path).map(lambda line: toKV(line))
    fulldata = geodata.join(senti_score)
    # fulldata.saveAsTextFile("fulldata")
    timeKey = fulldata.map(lambda json: getTime(json[1]))
    time_df = timeKey.toDF(["time", "geo", "like", "rt", "score"])
    time_df.write.partitionBy("time").json("heatmapdata")
def main():
    # parser = argparse.ArgumentParser(description="Read file contents from S3")
    # parser.add_argument("bucket", type=str, help="S3 Bucket name")
    # parser.add_argument("key", type=str, help="S3 Key path and name")
    # args = parser.parse_args()
    # config = ConfigParser.ConfigParser()
    # config.read(os.environ['HOME'] + '/.aws/credentials')
    # access_key = config.get('default', 'aws_access_key_id')
    # secret_key = config.get('default', 'aws_secret_access_key')
    # conn = boto.connect_s3(
    #     aws_access_key_id=access_key,
    #     aws_secret_access_key=secret_key,
    #     # is_secure=False,  # uncomment if you are not using ssl
    # )
    # bucket = conn.get_bucket(args.bucket)
    # # key = public/growth/staging/silver/third_party/facebook/ad_set/2017-04-12.avro
    # key = Key(bucket, args.key)
    # print(key.get_contents_as_string())
    # df = SQLContext.read.format("com.databricks.spark.avro").load("src/test/resources/episodes.avro")
    # # Saves the subset of the Avro records read in
    # subset = df.where("doctor > 5")
    # subset.write.format("com.databricks.spark.avro").save("/tmp/output")
    conf = SparkConf()
    conf.setMaster('local')
    conf.setAppName('SQLApiDemo')
    sc = SparkContext(conf=conf)
    print(sc.version)
    sqlContext = sql.SQLContext(sc)
    sqlContext.sql("CREATE TEMPORARY TABLE table_name "
                   "USING com.databricks.spark.avro "
                   "OPTIONS (path '/Users/ridshakeel/Downloads/2017-04-24.avro')")
    df = sqlContext.sql("SELECT COUNT(*) FROM table_name")
    df.collect()
    df = sqlContext.read.format("com.databricks.spark.avro") \
        .load("/Users/tariq/avro_data/browser.avro/")
def main():
    spark = SparkSession.builder.appName("TRAFFIC") \
        .config("spark.executor.cores", "4") \
        .config("spark.executor.memory", "4g") \
        .getOrCreate()
    sc = spark.sparkContext
    mapping = sc.textFile("s3a://insighttraffic/ML_model/mappings").collect()[0]
    mapping = ast.literal_eval(str(mapping))
    models = []
    for hour in range(0, 24):
        model = LinearRegressionModel.load(
            sc, "s3a://insighttraffic/ML_model/linear_model_log_" + str(hour))
        models.append(model)
    category_len = 154
    sqlContext = sql.SQLContext(sc)
    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hadoop_conf.set("fs.s3n.awsAccessKeyId", 'awsAccessKeyId')
    hadoop_conf.set("fs.s3n.awsSecretAccessKey", 'awsSecretAccessKey')
    # Set the micro-batch interval to 10 seconds; this can be customized per project.
    ssc = StreamingContext(sc, 10)
    # Receive the data directly from the given Kafka topic.
    kafkaStream = KafkaUtils.createDirectStream(
        ssc, ['data'], {"metadata.broker.list": 'Kafka-DNS:9092'})
    connection = psycopg2.connect(host='postgres-ip-address', database='postgres',
                                  user='******', password='******')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS realtimetraffic (sid text, location text, '
                   'latitude double precision, longitude double precision, direction text, '
                   'lanes integer, roadtype text, highway text, current integer, '
                   'historical double precision, level text, PRIMARY KEY (sid));')
    cursor.execute('SELECT AddGeometryColumn (%s, %s, %s, 4326, %s, 2);',
                   ('public', 'realtimetraffic', 'geom', 'POINT'))
    # The inbound stream is a DStream of (key, value) pairs; parse the JSON value.
    dstream = kafkaStream.map(lambda kv: json.loads(kv[1]))
    dstream.foreachRDD(lambda rdd: update(rdd, models, mapping))
def test_spark_transformation(spark_context, mocker):
    """
    Test that a single event is categorized correctly.

    Args:
        spark_context: test fixture SparkContext
        mocker: pytest-mock fixture
    """
    sqlContext = sql.SQLContext(spark_context)
    # Mock the message coming from Kafka
    mocker.patch(
        'processor.spark_processor_refactored.read_from_kafka',
        return_value=spark_context.parallelize([
            Row(value='{"event_id": "141b3ff2a92111ebbfae367ddad5b1fa", '
                      '"account_id": "684", "event_type": "other", '
                      '"device": "ANDROID", "location_country": "FR", '
                      '"event_timestamp": "1619724510"}')
        ]).toDF())
    # Mock the connection with MySQL
    mocker.patch('processor.spark_processor_refactored.read_from_mysql',
                 return_value=spark_context.parallelize(
                     [Row(account_no='684', user_device='ANDROID')]).toDF())
    # Spark transformation result dataframe
    result = s.transform().collect()
    # Expected result
    expected_result = [
        Row(event_id='141b3ff2a92111ebbfae367ddad5b1fa',
            account_id=684,
            event_type='other',
            device='ANDROID',
            location_country='FR',
            event_timestamp=datetime.datetime(2021, 4, 29, 12, 28, 30),
            status='good')
    ]
    assert result == expected_result
from pyspark import sql, SparkConf, SparkContext

conf = SparkConf().setAppName("task_1")
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)

df_albums = sqlContext.read.csv("albums.csv")\
    .withColumnRenamed("_c0", "id")\
    .withColumnRenamed("_c1", "artist_id")\
    .withColumnRenamed("_c2", "album_title")\
    .withColumnRenamed("_c3", "genre")\
    .withColumnRenamed("_c4", "year_of_pub")\
    .withColumnRenamed("_c5", "num_of_tracks")\
    .withColumnRenamed("_c6", "num_of_sales")\
    .withColumnRenamed("_c7", "rolling_stone_critic")\
    .withColumnRenamed("_c8", "mtv_critic")\
    .withColumnRenamed("_c9", "music_maniac_critic")

df_artists = sqlContext.read.csv("artists.csv")\
    .withColumnRenamed("_c0", "id")\
    .withColumnRenamed("_c1", "real_name")\
    .withColumnRenamed("_c2", "art_name")\
    .withColumnRenamed("_c3", "role")\
    .withColumnRenamed("_c4", "year_of_birth")\
    .withColumnRenamed("_c5", "country")\
    .withColumnRenamed("_c6", "city")\
    .withColumnRenamed("_c7", "email")\
    .withColumnRenamed("_c8", "zip_code")

a = df_artists.select("id").distinct().count()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark import sql

# Create the spark session
spark = SparkSession \
    .builder \
    .appName("Vertica Connector Pyspark Example") \
    .getOrCreate()
spark_context = spark.sparkContext
sql_context = sql.SQLContext(spark_context)

# The name of our connector for Spark to look up
format = "com.vertica.spark.datasource.VerticaSource"

# Set connector options based on our Docker setup
host = "vertica"
user = "******"
password = ""
db = "docker"
staging_fs_url = "webhdfs://hdfs:50070/data/"
table = "pysparktest"

# Define data to write to Vertica
columns = ["language", "users_count"]
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]

# Create an RDD from the data
rdd = spark_context.parallelize(data)

# Convert the RDD to a DataFrame
df = rdd.toDF(columns)

# Write the DataFrame to the Vertica table pysparktest
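# The snippet above stops at the comment announcing the write. A hedged sketch of
# the missing step, assuming the Vertica Spark connector accepts the options
# defined above (host, user, password, db, staging_fs_url, table) passed through
# DataFrameWriter.options(); check the connector documentation for the exact
# option names in your version.
df.write.mode('overwrite') \
    .options(host=host, user=user, password=password, db=db,
             staging_fs_url=staging_fs_url, table=table) \
    .format(format) \
    .save()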
from pyspark import SparkConf, SparkContext, sql

if __name__ == '__main__':
    conf = SparkConf().setAppName("app")
    sc = SparkContext(conf=conf)
    spark = sql.SparkSession \
        .builder \
        .appName("TEST") \
        .getOrCreate()
    sql_context = sql.SQLContext(sc, spark)
    filename = 'admitware/test.parquet4'
    s3_uri = 's3a://nu-data-lake-test/{}'.format(filename)
    print(s3_uri)
    df = sql_context.createDataFrame([('1', '4'), ('2', '5'), ('3', '6')], ["A", "B"])
    df.write.parquet(s3_uri)
    # df.write.parquet("s3a://nu-data-lake-test/admitware/test.parquet", mode="overwrite")
    # spark.stop()
def initializeSQLContext(sc):
    """
    Creates and returns a SQL context from the Spark context.
    """
    return sql.SQLContext(sc)
from pyspark import SparkContext, sql
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext, Row
from pyspark.sql import HiveContext
from pyspark.sql import types
import json
import csv
from json import loads
from time import sleep

sc = SparkContext()
hc = HiveContext(sc)
SparkContext.setSystemProperty("hive.metastore.uris", "thrift://nn1:9083")
ssc = StreamingContext(sc, 5)
sqlc = sql.SQLContext(sc)
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ["kafka_spark"], {"metadata.broker.list": "sandbox-hdp.hortonworks.com:6667"})
lines = directKafkaStream.map(lambda x: x[1])

# Create Spark session with Hive supported.
appName = "PySpark Hive Example"
master = "sandbox-hdp.hortonworks.com"
ss = SparkSession.builder \
    .appName(appName) \
    .config("spark.sql.warehouse.dir", "/warehouse/tablespace/managed/hive") \
    .getOrCreate()

print("LINES START!!!")
print("LINES START!!!")
# for value in dfPostComment:
#     print(value)
# print("---------")
# output = dataFile.collect()
# for value in dfPostView:
#     print(value)
# print("-----------")
dataFile2 = sc.textFile("/home/aisenur/Datasets/Tagsnew")
header2 = dataFile2.first()
dataFile2 = dataFile2.filter(lambda x: x != header2)
dataFile2 = dataFile2.map(lambda x: x.split(" "))

sqlContext1 = sql.SQLContext(sc)
dfPostFav = sqlContext1.createDataFrame(dfPostFav, ["postTag1", "FavoriteCount"])
# dfPostFav.show()
tf = dfPostFav.alias('tf')

sqlContext1 = sql.SQLContext(sc)
dfPostComment = sqlContext1.createDataFrame(dfPostComment, ["postTag2", "CommentCount"])
# dfPostComment.show()
tc = dfPostComment.alias('tc')

sqlContext1 = sql.SQLContext(sc)
dfPostView = sqlContext1.createDataFrame(dfPostView, ["postTag3", "ViewCount"])
# dfPostView.show()
tv = dfPostView.alias('tv')

join_post = tf.join(tc, tf.postTag1 == tc.postTag2, how='left') \
    .select([col('tc.' + xx) for xx in tc.columns])
def run(self, inputType, fileList, outDirectory):
    sc = SparkContext(appName="ALU Application")
    sqlContext = sql.SQLContext(sc)
    outputName = (outDirectory + "result_group_by_" + self.groupby +
                  "_ALU_2017_spark_" + inputType + ".csv")
    start = dt.datetime.now()
    dataframe = None
    for filename in fileList:
        date = LTE_MAPPING.x_date(filename[len(filename) - 12:len(filename) - 4])
        if inputType == 'hdfs':
            filename = "hdfs://hdfs1:8020/user/ec2-user/sample-data/" + filename
        print("reading " + filename)
        rddFrame1 = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true', inferschema='true') \
            .load(filename)
        # Columns: ENODEB_CELLNAME ENODEB DATA_DATE MARKET_CLUSTER VERSION REGION MARKET
        # DL_CH_BANDWIDTH EARFCN_DL DRBPDCPSDUKBYTESDL_NONGBR DLPRBUSEDWITHDSPUC_FDUSERS
        # DLPRBUSEDWITHDSPUC_FSUSERS EUCELL_DL_TPUT_NUM_KBITS EUCELL_DL_TPUT_DEN_SECS
        # EUCELL_DL_DRB_TPUT_NUM_KBITS EUCELL_DL_DRB_TPUT_DEN_SECS
        # rddFrame1 = rddFrame1.drop('ENODEB', 'DATA_DATE', 'VERSION').dropna()
        rddFrame1 = rddFrame1.dropna()
        rddFrame1 = rddFrame1.withColumn('DATE', sql.functions.lit(date))
        if dataframe is None:
            dataframe = rddFrame1
        else:
            dataframe = dataframe.unionAll(rddFrame1)
    print("reading finished!")
    self.printDfPartitions(dataframe)

    # Cast throughput and PRB columns to numeric types.
    dataframe = dataframe.withColumn(
        'EUCELL_DL_TPUT_NUM_KBITS',
        dataframe['EUCELL_DL_TPUT_NUM_KBITS'].cast(sql.types.DoubleType()))
    dataframe = dataframe.withColumn(
        'EUCELL_DL_TPUT_DEN_SECS',
        dataframe['EUCELL_DL_TPUT_DEN_SECS'].cast(sql.types.DoubleType()))
    dataframe = dataframe.withColumn(
        'EUCELL_DL_DRB_TPUT_NUM_KBITS',
        dataframe['EUCELL_DL_DRB_TPUT_NUM_KBITS'].cast(sql.types.DoubleType()))
    dataframe = dataframe.withColumn(
        'EUCELL_DL_DRB_TPUT_DEN_SECS',
        dataframe['EUCELL_DL_DRB_TPUT_DEN_SECS'].cast(sql.types.DoubleType()))
    dataframe = dataframe.withColumn(
        'DRBPDCPSDUKBYTESDL_NONGBR',
        dataframe['DRBPDCPSDUKBYTESDL_NONGBR'].cast(sql.types.DoubleType()))
    dataframe = dataframe.withColumn(
        'DLPRBUSEDWITHDSPUC_FDUSERS',
        dataframe['DLPRBUSEDWITHDSPUC_FDUSERS'].cast(sql.types.DoubleType()))
    dataframe = dataframe.withColumn(
        'DLPRBUSEDWITHDSPUC_FSUSERS',
        dataframe['DLPRBUSEDWITHDSPUC_FSUSERS'].cast(sql.types.DoubleType()))

    # Add derived columns.
    dataframe = dataframe.withColumn('Total cell count', sql.functions.lit(1))
    BandMapping = sql.functions.udf(
        lambda x: LTE_MAPPING.EARFCN_DL_mapping(x), sql.types.StringType())
    dataframe = dataframe.withColumn('BAND', BandMapping('EARFCN_DL'))
    BandWidthMapping = sql.functions.udf(
        lambda x: LTE_MAPPING.bandwidth(x), sql.types.IntegerType())
    dataframe = dataframe.withColumn('Total Spectrum in MHz',
                                     BandWidthMapping('DL_CH_BANDWIDTH'))

    dataframeoutput = dataframe.groupBy(['DATE', self.groupby, 'BAND']).sum()
    dataframeoutput = dataframeoutput.withColumn(
        'UE Tput (kbps)',
        dataframeoutput['sum(EUCELL_DL_TPUT_NUM_KBITS)'] /
        dataframeoutput['sum(EUCELL_DL_TPUT_DEN_SECS)'])
    dataframeoutput = dataframeoutput.withColumn(
        'DRB Tput (kbps)',
        dataframeoutput['sum(EUCELL_DL_DRB_TPUT_NUM_KBITS)'] /
        dataframeoutput['sum(EUCELL_DL_DRB_TPUT_DEN_SECS)'])
    dataframeoutput = dataframeoutput.withColumn(
        'Cell Spectral Efficiency (bps/Hz)',
        8 * dataframeoutput['sum(DRBPDCPSDUKBYTESDL_NONGBR)'] /
        (dataframeoutput['sum(DLPRBUSEDWITHDSPUC_FDUSERS)'] +
         dataframeoutput['sum(DLPRBUSEDWITHDSPUC_FSUSERS)']) / 1.024 / 0.18)
    dataframeoutput = dataframeoutput.withColumn('VENDOR', sql.functions.lit('ALU'))
    dataframeoutput = dataframeoutput.withColumn(
        'UE Traffic (kbytes)',
        dataframeoutput['sum(EUCELL_DL_TPUT_NUM_KBITS)'] / 8)
    dataframeoutput = dataframeoutput.withColumn(
        'Cell Used PRB',
        (dataframeoutput['sum(DLPRBUSEDWITHDSPUC_FDUSERS)'] +
         dataframeoutput['sum(DLPRBUSEDWITHDSPUC_FSUSERS)']) * 1.024)

    # Rename aggregated columns.
    dataframeoutput = dataframeoutput.withColumnRenamed(
        "sum(DRBPDCPSDUKBYTESDL_NONGBR)", "Cell Traffic (kbytes)")
    dataframeoutput = dataframeoutput.withColumnRenamed(
        "sum(EUCELL_DL_TPUT_DEN_SECS)", "UE Active Time (s)")
    dataframeoutput = dataframeoutput.withColumnRenamed(
        "sum(Total cell count)", "Total cell count")
    dataframeoutput = dataframeoutput.withColumnRenamed(
        "sum(Total Spectrum in MHz)", "Total Spectrum in MHz")
    # dataframeoutput = dataframeoutput.drop('sum(EUCELL_DL_TPUT_NUM_KBITS)').drop('sum(DLPRBUSEDWITHDSPUC_FDUSERS)').drop('sum(DLPRBUSEDWITHDSPUC_FSUSERS)').drop('sum(EUCELL_DL_DRB_TPUT_NUM_KBITS)').drop('sum(EUCELL_DL_DRB_TPUT_DEN_SECS)')
    dataframeoutput = dataframeoutput.select(
        'DATE', 'MARKET', 'VENDOR', 'BAND', 'Cell Traffic (kbytes)',
        'Cell Used PRB', 'Cell Spectral Efficiency (bps/Hz)',
        'UE Traffic (kbytes)', 'UE Active Time (s)', 'UE Tput (kbps)',
        'Total cell count', 'Total Spectrum in MHz')
    dataframeoutput = dataframeoutput.coalesce(1)
    # Take action here: write the result as CSV.
    dataframeoutput.write.format('com.databricks.spark.csv').save(outputName)
    difference = dt.datetime.now() - start
    dataframeoutput.unpersist()
    sc.stop()
    return difference
def setup_spark():
    config = SparkConf().setAppName(APP_NAME)
    context = SparkContext.getOrCreate(conf=config)
    sql_context = sql.SQLContext(context)
    return {'config': config, 'context': context, 'sql_context': sql_context}
import etl.EtlAnuncio as EtlAnun
import etl.EtlEstadisticaAnuncio as EtlEstAnun
import etl.EtlAccionDeAnuncio as EtlAccion
import etl.EtlAnunciosReporte as Etlrep
import sys
import pyspark as pspk
import pyspark.sql as pysql
import util.LoggerImpl as Log
import findspark

# Note: EtlCamp and Dto are used below but were not imported in the original
# snippet; they are assumed to be project modules imported elsewhere.

reload(sys)
sys.setdefaultencoding('utf-8')

findspark.init("/home/arturo/Software/spark-2.2.3-bin-hadoop2.7")
context = pspk.SparkContext.getOrCreate()
sql_context = pysql.SQLContext(context)
dto_logger = Log.Logger('', '', 'Script_Campanias', '', '')
dto_credenciales = Dto.DtoCredenciales(
    id_cuenta='act_804059193122922',
    token_de_acceso='EAAFqYKPZBGTwBAJSHoktCxD1IHAn0tsl9I3iATCrLWb0aol1cUmq5Bfg1TKqWW'
                    'SIJccxb2kxtN7HubCQ32rLCN50nzddGPbh1rtJmsbdFgGcD6n4jHWb1IqSINZC'
                    'GGFgZBRJYGJAjqUfQpAXkmtd4dZCwZCEGDHicZBpj5dZCMgYgZDZD',
    id_usuario='',
    id_app='',
    id_pagina='',
    app_secreta='')
etl_campania = EtlCamp.EtlCampania(dto_credenciales, sql_context)
etl_campania.extrae()
df.to_csv(DF_CSV_PATH, index=False)
df.head()
#%%
df = df.astype({'class': 'int32'})
# %%
trainDF = df.sample(frac=0.8, random_state=80)
testDF = df.drop(trainDF.index)
#%%
trainDF['class'].isnull().sum()
# %%
trainDF, testDF = sql.SQLContext(spark.sparkContext).createDataFrame(trainDF), \
    sql.SQLContext(spark.sparkContext).createDataFrame(testDF)
# trainDF.show()
# testDF.show()
#%%
print(trainDF.count())
print(trainDF.filter(F.col('class') == 1).count())
print(trainDF.filter(F.col('class') == 0).count())
# %%
stopWords = list(set(nltk.corpus.stopwords.words('english'))) + ['']
tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
stopWordRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                   outputCol='stoppedWords').setStopWords(stopWords)
countVector = CountVectorizer(inputCol=stopWordRemover.getOutputCol(), outputCol='vectors')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')
def sql_context(spark_context):
    sql_context = sql.SQLContext(spark_context)
    return sql_context
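# A hedged usage sketch (not part of the original snippet): assuming the helper
# above is registered with @pytest.fixture (as its spark_context argument
# suggests), a test can request it by name. The test below is illustrative only.
def test_sql_context_creates_dataframe(sql_context):
    # Build a tiny DataFrame from local rows and check the row count.
    df = sql_context.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    assert df.count() == 2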
# Data loading
data = sc.textFile("./ml-100k/u.data")
# Loaded data will be a Spark RDD; run the command below to find out the data type of the data object.
# print(type(data), data.count(), data.first())
print(data.take(5))
# The total length of the loaded data is given by:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

ratings = data.map(lambda l: l.split('\t'))\
    .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
print(ratings.take(5))

# Create the DataFrame
sql_ctx = sql.SQLContext(sc)
df = sql_ctx.createDataFrame(ratings, ['UserID', 'product', "Rating"])
# df.select('user').distinct().show(100)
user_count = df.groupBy("UserID").count()
print(type(user_count))

# Pretty histogram
# plt_show(df)
# df.stat.crosstab("UserID", "Rating").show()

# Split into training and test sets
(training, test) = ratings.randomSplit([0.8, 0.2])
from pyspark import sql, SparkConf, SparkContext
import pyspark
import os

conf = SparkConf().setAppName("Read_CSV")
sc = SparkContext(conf=conf)
spark = sql.SQLContext(sc)

df1 = spark.read.option("delimiter", ";").option("header", "true").option(
    "encoding", "ISO-8859-1"
).csv(
    "/home/data/Documents/dadosCvm/Cias Abertas Documentos Formulário DFP - Balanço Patrimonial Passivo (BPP)/Cias Abertas Documentos Formulário DFP - Balanço Patrimonial Passivo (BPP)/bpp_cia_aberta_con_2013.csv"
)

for i in range(0, 3):
    df2 = spark.read.option("delimiter", ";").option("header", "true").option(
        "encoding", "ISO-8859-1"
    ).csv(
        "/home/data/Documents/dadosCvm/Cias Abertas Documentos Formulário DFP - Balanço Patrimonial Passivo (BPP)/Cias Abertas Documentos Formulário DFP - Balanço Patrimonial Passivo (BPP)/bpp_cia_aberta_con_2013.csv"
    )
    df1 = df1.union(df2)

df1.show()
print(df1.count())