def Spark_MapReduce(level, wordsatthislevel, graphcache):
    freqterms1_local = wordsatthislevel
    md5hash = hashlib.md5(",".join(wordsatthislevel)).hexdigest()
    #md5hash = ",".join(wordsatthislevel)
    cachevalue = graphcache.get(md5hash)
    if cachevalue:
        print "Spark_MapReduce(): hash = ", md5hash, "; returning from cache"
        return cachevalue
    else:
        spcon = SparkContext("local[2]", "Spark_MapReduce")
        print "Spark_MapReduce(): wordsatthislevel:", wordsatthislevel
        paralleldata = spcon.parallelize(wordsatthislevel).cache()
        #k = paralleldata.map(lambda wordsatthislevel: mapFunction(wordsatthislevel)).reduceByKey(reduceFunction)
        k = paralleldata.map(mapFunction2).reduceByKey(reduceFunction)
        #k = paralleldata.map(mapFunction).reduceByKey(reduceFunction)
        #dict_k = k.collect()
        #s = sorted(dict_k.items(), key=operator.itemgetter(1), reverse=True)
        #print "Spark MapReduce results:"
        #print s
        ############################
        sqlContext = SQLContext(spcon)
        recursiveglossoverlap_schema = sqlContext.createDataFrame(k.collect())
        recursiveglossoverlap_schema.registerTempTable("Interview_RecursiveGlossOverlap")
        query_results = sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap")
        dict_query_results = dict(query_results.collect())
        #print "Spark_MapReduce() - SparkSQL DataFrame query results:"
        #print dict_query_results[1]
        graphcache.set(md5hash, dict_query_results[1])
        print "graphcache_mapreduce updated:", graphcache
        spcon.stop()
        return dict_query_results[1]
def test_logistic_regression_summary(self):
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def test_save_load(self):
    temp_path = tempfile.mkdtemp()
    sqlContext = SQLContext(self.sc)
    dataset = sqlContext.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
    self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def Spark_MapReduce_Parents(keyword, tokensofprevlevel, graphcache):
    #tokensofprevlevelkeyword = tokensofprevlevel
    #tokensofprevlevelkeyword.append(keyword)
    md5hashparents = hashlib.md5(keyword).hexdigest()
    #md5hashparents = keyword
    md5hashparents = md5hashparents + "$parents"
    # persist the keyword so that the map function can read it back
    picklef_keyword = open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt", "w")
    asfer_pickle_string_dump(keyword, picklef_keyword)
    picklef_keyword.close()
    cachevalue = graphcache.get(md5hashparents)
    if cachevalue:
        print "Spark_MapReduce_Parents(): hash = ", md5hashparents, "; returning from cache"
        return cachevalue
    else:
        #picklelock.acquire()
        spcon = SparkContext("local[2]", "Spark_MapReduce_Parents")
        paralleldata = spcon.parallelize(tokensofprevlevel).cache()
        #k = paralleldata.map(lambda keyword: mapFunction_Parents(keyword, tokensofprevlevel)).reduceByKey(reduceFunction_Parents)
        k = paralleldata.map(mapFunction_Parents).reduceByKey(reduceFunction_Parents)
        sqlContext = SQLContext(spcon)
        parents_schema = sqlContext.createDataFrame(k.collect())
        parents_schema.registerTempTable("Interview_RecursiveGlossOverlap_Parents")
        query_results = sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap_Parents")
        dict_query_results = dict(query_results.collect())
        #print "Spark_MapReduce_Parents() - SparkSQL DataFrame query results:"
        #picklelock.release()
        graphcache.set(md5hashparents, dict_query_results[1])
        spcon.stop()
        print "graphcache_mapreduce_parents updated:", graphcache
        return dict_query_results[1]
def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def main(argv):
    #STEP1: data ingestion
    sc = SparkContext(appName="PythonWordCount")
    sqlContext = SQLContext(sc)

    #read data into RDD
    input_schema_rdd = sqlContext.read.json("file:///scratch/network/alexeys/KaggleDato/Preprocessed/0_1/part-*")
    #input_schema_rdd.show()
    #input_schema_rdd.printSchema()
    #input_schema_rdd.select("id").show()

    train_label_rdd = sqlContext.read.json("file://" + PATH_TO_TRAIN_LABELS)
    sub_label_rdd = sqlContext.read.json("file://" + PATH_TO_SUB_LABELS)

    input_schema_rdd.registerTempTable("input")
    train_label_rdd.registerTempTable("train_label")
    sub_label_rdd.registerTempTable("sub_label")

    # SQL can be run over DataFrames that have been registered as a table.
    train_wlabels_0 = sqlContext.sql("SELECT title,text,images,links,label FROM input JOIN train_label WHERE input.id = train_label.id AND label = 0")
    train_wlabels_1 = sqlContext.sql("SELECT title,text,images,links,label FROM input JOIN train_label WHERE input.id = train_label.id AND label = 1")

    text_only_0 = train_wlabels_0.map(lambda p: p.text)
    text_only_1 = train_wlabels_1.map(lambda p: p.text)

    counts0 = text_only_0.flatMap(lambda line: tokenize(line)) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(add)
    counts1 = text_only_1.flatMap(lambda line: tokenize(line)) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(add)

    # words present in class 0 but not class 1, sorted by descending count
    relevance = counts0.subtractByKey(counts1).map(lambda (x, y): (y, x)).sortByKey(False, 1)
    relevance.saveAsTextFile("/user/alexeys/KaggleDato/WordCount")
def test_save_load(self):
    temp_path = tempfile.mkdtemp()
    sqlContext = SQLContext(self.sc)
    dataset = sqlContext.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
def run(self): sc = SparkContext("local", "gender") sqlContext = SQLContext(sc) #StringType =(str, unicode) _out = self.output().open('w') #lines = sc.textFile("myUser.csv") #fobj = self.input().open("r") #lines = sc.textFile(fobj.name) print(type(self.required_tasks['insert_source'].output())) print(self.required_tasks['insert_source']) #print(self.input()['insert_source'].input()) lines = sc.textFile("myUser.csv") parts = lines.map(lambda l: l.split(",")) users = parts.map(lambda p: (p[0], p[1],p[2],p[3],p[4],p[5],p[6],p[7], p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15],p[16],p[17],p[18],p[19])) schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId" print(schemaString) _out.write(schemaString ) fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()] schema = StructType(fields) #schemaUser = sqlContext.createDataFrame(users, schema) schemaUser = sqlContext.applySchema(users, schema) schemaUser.registerTempTable("users") results = sqlContext.sql("SELECT gender FROM users") genders = results.map(lambda p : (p,1)) counts = genders.reduceByKey(lambda a, b: a + b) #.map(lambda t : ("Gender " + t(0) + " No " + t(1))).collect() for name in counts.collect(): _out.write(str(name)) _out.close()
def main():
    log = logging.getLogger(prog)
    log.setLevel(logging.INFO)
    # bit hackish and hard to keep aligned with docstring changes, not using this
    # usage = '\r\b\r\b\r' + __doc__ + "usage: %prog -j file.json -p directory.parquet"
    # parser = OptionParser(usage=usage, version='%prog ' + __version__)
    parser = OptionParser(version='%prog ' + __version__)
    parser.add_option('-j', '--json', dest='jsonFile', help='JSON input file/dir', metavar='<file/dir>')
    parser.add_option('-p', '--parquetDir', dest='parquetDir', help='Parquet output dir', metavar='<dir>')
    (options, args) = parser.parse_args()

    jsonFile = options.jsonFile
    parquetDir = options.parquetDir
    if args or not jsonFile or not parquetDir:
        usage(parser)

    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    if isMinVersion(spark_version, 1.4):
        json = sqlContext.read.json(jsonFile)
        json.write.parquet(parquetDir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        json = sqlContext.jsonFile(jsonFile)
        json.saveAsParquetFile(parquetDir)
def index(request):
    string = u'template显示字符串变量'
    list = ['第一', '第二', '第三']
    tuple = ('q', 'w', 'e', 'r', 't')
    dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4}

    conf = SparkConf().setAppName("djangotest").setMaster("spark://HP-Pavilion:7077")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    url = 'jdbc:mysql://127.0.0.1:3306?user=root&password=raymon'
    dbtable = 'networkPublicOpinionAnalysisSystem.test'
    df = sqlContext.read.format('jdbc').options(url=url, dbtable=dbtable).load()

    lines = sc.textFile(settings.BASE_DIR + '/system/data/roll_news_sina_com_cn.csv')
    parts = lines.map(lambda l: l.split(','))
    schemaNews = parts.map(lambda p: Row(category=p[0], title=p[1], url=p[2], time=p[3]))
    news = sqlContext.createDataFrame(schemaNews)
    # news.registerTempTable('test')
    # dbtable = 'networkPublicOpinionAnalysisSystem.test'
    # news.write.format('jdbc').options(url=url).insertInto(tableName=dbtable)
    # string = news.count()

    row = news.first()
    a = Row()
    print(type(news))
    print(type(row))
    # print(type(a))
    # dict = row.asDict()
    # string = dict['title']
    # news.write.jdbc(url, table=dbtable)

    return render(request, 'index.html', {'string': string, 'list': list, 'tuple': tuple, 'dict': dict})
class TestSQL(PySparkTestCase):

    def setUp(self):
        PySparkTestCase.setUp(self)
        self.sqlCtx = SQLContext(self.sc)

    def test_basic_functions(self):
        rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}'])
        srdd = self.sqlCtx.jsonRDD(rdd)
        srdd.count()
        srdd.collect()
        srdd.schemaString()
        srdd.schema()

        # cache and checkpoint
        self.assertFalse(srdd.is_cached)
        srdd.persist(StorageLevel.MEMORY_ONLY_SER)
        srdd.unpersist()
        srdd.cache()
        self.assertTrue(srdd.is_cached)
        self.assertFalse(srdd.isCheckpointed())
        self.assertEqual(None, srdd.getCheckpointFile())

        srdd = srdd.coalesce(2, True)
        srdd = srdd.repartition(3)
        srdd = srdd.distinct()
        srdd.intersection(srdd)
        self.assertEqual(2, srdd.count())

        srdd.registerTempTable("temp")
        srdd = self.sqlCtx.sql("select foo from temp")
        srdd.count()
        srdd.collect()
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
def __init__(self, predictionAndLabels):
    sc = predictionAndLabels.ctx
    sql_ctx = SQLContext(sc)
    df = sql_ctx.createDataFrame(predictionAndLabels,
                                 schema=sql_ctx._inferSchema(predictionAndLabels))
    java_model = callMLlibFunc("newRankingMetrics", df._jdf)
    super(RankingMetrics, self).__init__(java_model)
def run(self):
    jsonFile = self.options.jsonFile
    parquetDir = self.options.parquetDir
    if not jsonFile:
        self.usage('--json not defined')
    if not parquetDir:
        self.usage('--parquetDir not defined')
    if self.args:
        self.usage()

    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    if isMinVersion(spark_version, 1.4):
        json = sqlContext.read.json(jsonFile)
        json.write.parquet(parquetDir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        json = sqlContext.jsonFile(jsonFile)
        json.saveAsParquetFile(parquetDir)
def main(argv):
    Conf = (SparkConf().setAppName("recommendation"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)

    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)
    # argv[1] is the dump of training data in hdfs
    # argv[2] is the user preferences

    # User hash lookup stored into Cassandra
    user_hash = rawDF.map(lambda (a, b, c): (a, hashFunction(a)))
    distinctUser = user_hash.distinct()
    userHashDF = sqlContext.createDataFrame(distinctUser, ["user", "hash"])
    userHashDF.write.format("org.apache.spark.sql.cassandra").options(table="userhash", keyspace=keyspace).save(mode="append")

    # Product hash lookup stored into Cassandra
    product_hash = rawDF.map(lambda (a, b, c): (b, hashFunction(b)))
    distinctProduct = product_hash.distinct()
    productHashDF = sqlContext.createDataFrame(distinctProduct, ["product", "hash"])
    productHashDF.write.format("org.apache.spark.sql.cassandra").options(table="producthash", keyspace=keyspace).save(mode="append")

    # Ratings for training.
    # ALS requires a java hash of string. This function does that and stores it as a Rating object
    # for the algorithm to consume.
    ratings = rawDF.map(lambda (a, b, c): Rating(hashFunction(a), hashFunction(b), float(c)))

    model = ALS.trainImplicit(ratings, 10, 10, alpha=0.01, seed=5)
    model.save(sc, "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model")
    sc.stop()
def writeLumbarReadings(time, rdd):
    try:
        # Convert RDDs of the words DStream to DataFrame and run SQL query
        connectionProperties = MySQLConnection.getDBConnectionProps('/home/erik/mysql_credentials.txt')
        sqlContext = SQLContext(rdd.context)
        if rdd.isEmpty() == False:
            lumbarReadings = sqlContext.jsonRDD(rdd)
            lumbarReadingsIntermediate = lumbarReadings.selectExpr(
                "readingID", "readingTime", "deviceID", "metricTypeID", "uomID",
                "actual.y AS actualYaw", "actual.p AS actualPitch", "actual.r AS actualRoll",
                "setPoints.y AS setPointYaw", "setPoints.p AS setPointPitch", "setPoints.r AS setPointRoll")

            # Must be in same order as what was used to train the model.
            # Testing using only pitch since the model has a limited dataset.
            assembler = VectorAssembler(inputCols=["actualPitch"], outputCol="features")
            lumbarReadingsIntermediate = assembler.transform(lumbarReadingsIntermediate)

            predictions = loadedModel.predict(lumbarReadingsIntermediate.map(lambda x: x.features))
            predictionsDF = lumbarReadingsIntermediate.map(lambda x: x.readingID).zip(predictions).toDF(["readingID", "positionID"])
            combinedDF = lumbarReadingsIntermediate.join(predictionsDF, lumbarReadingsIntermediate.readingID == predictionsDF.readingID).drop(predictionsDF.readingID)
            combinedDF = combinedDF.drop("features")

            combinedDF.show()
            combinedDF.write.jdbc("jdbc:mysql://localhost/biosensor", "SensorReadings", properties=connectionProperties)
    except:
        pass
def main(sc):
    sqlContext = SQLContext(sc)
    tasteProfileRdd = sc.textFile("userTaste/*")
    songRdd = sc.textFile("songsDict/*")

    # Load a text file and convert each line to a Row.
    tasteProfile = tasteProfileRdd.filter(lambda l: len(l) > 0)
    parsedSplits = tasteProfile.map(lambda l: l.split('\t'))
    userTaste = parsedSplits.map(lambda p: Row(userId=p[0], songId=p[1], playCount=p[2]))

    individualSong = songRdd.map(lambda l: l.split('|'))
    songData = individualSong.map(lambda s: Row(songId=s[0], featureSet=s[1]))

    # Infer the schema, and register the DataFrame as a table.
    schemaUserTaste = sqlContext.inferSchema(userTaste)
    schemaUserTaste.registerTempTable("userTaste")
    schemaSongData = sqlContext.inferSchema(songData)
    schemaSongData.registerTempTable("songData")

    test2 = sqlContext.sql("select * from songData limit 5")
    songIds = test2.map(lambda p: "songIds: " + p.songId)
    #test1 = sqlContext.sql("SELECT distinct * FROM userTaste limit 5")
    #songIds = test1.map(lambda p: "songIds: " + p.songId)

    for i in songIds.collect():
        print i
def main(self, sc, *args):
    from pyspark.sql.types import BooleanType, StringType
    from pyspark.sql.types import FloatType, StructField, StructType
    from pyspark.sql import SQLContext

    fields = []
    for field in header_avro["fields"] + self.extra_fields:
        if field["type"] == "float":
            field_type = FloatType()
        elif field["type"] == "bool":
            field_type = BooleanType()
        else:
            field_type = StringType()
        fields.append(StructField(field["name"], field_type))
    schema = StructType(fields)

    sqlContext = SQLContext(sc)
    logger.info("Reading %s from %s" % (self.test_name, self.input().path))
    df = sqlContext.jsonFile(self.input().path, schema)
    df.registerTempTable("reports")

    entries = df.filter("({test_names}) AND"
                        " record_type = 'entry'".format(
                            test_names=' OR '.join([
                                "test_name = '{test_name}'".format(test_name=tn)
                                for tn in self.test_names])))
    interestings = self.find_interesting(entries)

    out_file = self.output().open('w')
    for interesting in interestings.toJSON().collect():
        out_file.write(interesting)
        out_file.write("\n")
    out_file.close()
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating an SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"
    json_file_path = os.path.join(curr_path + "/../Spark_Jobs/data/", json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
def __init__(self, sparkContext):
    """Create a new HbaseContext.

    @param sparkContext: The SparkContext to wrap.
    """
    SQLContext.__init__(self, sparkContext)
    self._scala_HBaseSQLContext = self._get_hbase_ctx()
def main():
    reviews_parquet = sys.argv[1]
    metadata_parquet = sys.argv[2]
    users_ascores_file = sys.argv[3]
    products_ascores_file = sys.argv[4]

    conf = SparkConf().setAppName('Amazon Cassandra Injector').setMaster("local").set("spark.cassandra.connection.host", "localhost")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    sqlContext.read.parquet(reviews_parquet).registerTempTable('amazon_reviews')
    reviews = sqlContext.sql("""SELECT * FROM amazon_reviews""").rdd.cache()
    reviews_by_reviewer = reviews.map(process_review).map(lambda j: (j["reviewerid"], j))
    users_ascores = sc.textFile(users_ascores_file).map(ast.literal_eval).map(lambda (r_id, score, histo): (r_id, (score, histo)))
    # join with meth2_users_ascores on reviewerid -> ascore is the reviewer's alternative score
    reviews_joined = reviews_by_reviewer.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_review(j, score))
    reviews_joined.saveToCassandra("amzdb", "reviews")

    # reviewers need their alternative score
    reviewers = reviews.map(process_reviewer).map(lambda j: (j["reviewerid"], j))
    # join with meth2_user_ascores to get ascore and overall_histogram
    reviewers_joined = reviewers.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_reviewer(j, score, histo))
    reviewers_joined.saveToCassandra("amzdb", "reviewers")

    # products need their overall score/histogram and adjusted score/histogram
    sqlContext.read.parquet(metadata_parquet).registerTempTable('amazon_metadata')
    products = sqlContext.sql("""SELECT * FROM amazon_metadata""").rdd.map(process_product).map(lambda j: (j["asin"], j))
    # join with meth2_product_ascores
    products_ascores = sc.textFile(products_ascores_file).map(ast.literal_eval).map(lambda (asin, o_s, a_s, o_h, a_h, n): (asin, (o_s, o_h, a_s, a_h)))
    products_joined = products.join(products_ascores).map(lambda (asin, (j, (o_s, o_h, a_s, a_h))): fillin_product(j, o_s, o_h, a_s, a_h))
    products_joined.saveToCassandra("amzdb", "products")
class RecommendationEngine:
    """A travel recommendation engine"""

    def get_recommendations(self, user_id):
        """Recommends travel for user"""
        data = (1, 2, 3, 4, 5)
        even_rdd = self.sc.parallelize(data)
        #ratings = even_rdd.collect()
        reco = self.sqlContext.sql("SELECT c.contact_id, o.prod_id FROM contacts c, offres o "
                                   "WHERE o.continent_offre = c.continent and o.envie_offre = c.envie "
                                   "and o.moyen_offre = c.moyen").collect()
        return reco

    def __init__(self, sc):
        """Init the recommendation engine given a Spark context and a dataset path"""
        logger.info("Starting up the Recommendation Engine: ")
        self.sc = sc
        self.sqlContext = SQLContext(sc)

        path_contacts = "data_v3/contacts/attempt_contactV3_perfect_match.json"
        df_contacts = self.sqlContext.jsonFile(path_contacts)
        df_contacts.registerTempTable("contacts")

        path_offres = "data_v3/offres/attempt_productV3_perfect_match.json"
        df_offres = self.sqlContext.jsonFile(path_offres)
        df_offres.registerTempTable("offres")
def main(n_part, hdfs_path):
    print "********************\n*"
    print "* Start main\n*"
    print "********************"

    conf = SparkConf().setAppName("Benchmark Spark SQL")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    rowsRDD = sc.textFile(hdfs_path).repartition(n_part).map(lambda x: recordToRows(x)).cache()
    df = sqlContext.createDataFrame(rowsRDD).cache()
    df.count()
    df.registerTempTable("msd_table")

    print "********************\n*"
    print "* Start queries\n*"
    print "********************"

    [ave_t1, std1, dt1, n1] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext)
    [ave_t2, std2, dt2, n2] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext, method=1)
    [ave_t3, std3, dt3, n3] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext)
    [ave_t4, std4, dt4, n4] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext, method=1)

    if n1 != n2:
        print "\t!!!!Error, counts disagree for the number of T.S. songs!"
    if n3 != n4:
        print "\t!!!!Error, counts disagree for the number of high paced songs!"

    print "********************\n*"
    print "* Results"
    print "\t".join(map(lambda x: str(x), [ave_t1, std1, dt1, ave_t2, std2, dt2, ave_t3, std3, dt3, ave_t4, std4, dt4]))
    print "********************"
def mock_data(self):
    """Mock data to imitate read from database."""
    sqlContext = SQLContext(self.sc)
    mock_data_rdd = self.sc.parallelize([("A", 1, 1), ("B", 1, 0), ("C", 0, 2), ("D", 2, 4), ("E", 3, 5)])
    schema = ["id", "x", "y"]
    mock_data_df = sqlContext.createDataFrame(mock_data_rdd, schema)
    return mock_data_df
def log_mapreducer(logfilename, pattern, filt="None"):
    spcon = SparkContext()
    if filt == "None":
        input = open(logfilename, 'r')
        paralleldata = spcon.parallelize(input.readlines())
        patternlines = paralleldata.filter(lambda patternline: pattern in patternline)
        print "pattern lines", patternlines.collect()
        matches = patternlines.map(mapFunction).reduceByKey(reduceFunction)
    else:
        input = spcon.textFile(logfilename)
        matches = input.flatMap(lambda line: line.split()).filter(lambda line: filt in line).map(mapFunction).reduceByKey(reduceFunction)
    matches_collected = matches.collect()
    print "matches_collected:", matches_collected
    if len(matches_collected) > 0:
        sqlContext = SQLContext(spcon)
        bytes_stream_schema = sqlContext.createDataFrame(matches_collected)
        bytes_stream_schema.registerTempTable("USBWWAN_bytes_stream")
        query_results = sqlContext.sql("SELECT * FROM USBWWAN_bytes_stream")
        dict_query_results = dict(query_results.collect())
        print "----------------------------------------------------------------------------------"
        print "log_mapreducer(): pattern [", pattern, "] in [", logfilename, "] for filter [", filt, "]"
        print "----------------------------------------------------------------------------------"
        dict_matches = dict(matches_collected)
        sorted_dict_matches = sorted(dict_matches.items(), key=operator.itemgetter(1), reverse=True)
        print "pattern matching lines:", sorted_dict_matches
        print "----------------------------------------------------------------------------------"
        print "SparkSQL DataFrame query results:"
        print "----------------------------------------------------------------------------------"
        pprint.pprint(dict_query_results)
        print "----------------------------------------------------------------------------------"
        print "Cardinality of Stream Dataset:"
        print "----------------------------------------------------------------------------------"
        print len(dict_query_results)
    spcon.stop()
    return sorted_dict_matches
def main(argv):
    Conf = (SparkConf().setAppName("SimpleGraph"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)

    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/data/" + argv[1] + ".parquet"
    rawDF = sqlContext.read.parquet(dirPath).registerTempTable("comments")

    # This is where the magic happens
    # SQL self join to join users who have interacted with one another
    df = sqlContext.sql("""
        SELECT t1.subreddit as Subreddit,
               t1.id as OrigId, t2.id as RespId,
               t1.author AS OrigAuth, t2.author AS RespAuth,
               t1.score AS OrigScore, t2.score AS RespScore,
               t1.ups AS OrigUps, t2.ups AS RespUps,
               t1.downs AS OrigDowns, t2.downs AS RespDowns,
               t1.controversiality AS OrigControv, t2.controversiality AS RespControv
        FROM comments t1 INNER JOIN comments t2
        ON CONCAT("t1_", t1.id) = t2.parent_id
        WHERE t1.author != '[deleted]' and t2.author != '[deleted]'
        """)

    # Write it into parquet. Why? Because it compresses the data and is really fast to read from!
    df.write.parquet("hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/data/" + argv[1] + "-selfjoin.parquet")
def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()

    # Input data: Each row is a bag of words from a sentence or document.
    training_data = [(id_gen.next(), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])

    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)

    # Learn a mapping from words to Vectors.
    word2vec = Word2Vec(vectorSize=len(training_data), inputCol="text_filtered", outputCol="result")
    model = word2vec.fit(cleaned_document)
    matrix = column_similarities(model.transform(cleaned_document))

    # We use the size of the target data to keep only products of target data
    # against the rest, and to avoid products of the target data with itself.
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)

    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))
def RunRandomForest(tf, ctx):
    sqlContext = SQLContext(ctx)
    rdd = tf.map(parseForRandomForest)
    # The schema is encoded in a string.
    schema = ['genre', 'track_id', 'features']
    # Apply the schema to the RDD.
    songDF = sqlContext.createDataFrame(rdd, schema)
    # Register the DataFrame as a table.
    songDF.registerTempTable("genclass")

    labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)
    trainingData, testData = songDF.randomSplit([0.8, 0.2])
    labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
    rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
    #rfc = SVMModel([.5, 10, 20], 5)
    #rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

    pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)
    predictions.show()

    evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
    accuracy = evaluator.evaluate(predictions)
    print 'Accuracy of RandomForest = ', accuracy * 100
    print "Test Error = ", (1.0 - accuracy) * 100
def main(dataFile, outputPath):
    conf = SparkConf().setAppName("S3 Example").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    raw_text = sc.textFile(dataFile).persist(StorageLevel.MEMORY_AND_DISK)
    csv_data = raw_text.map(lambda l: l.split(","))
    row_data = csv_data.map(lambda p: dataIO.dataStruc(p))

    interaction_df = sqlContext.createDataFrame(row_data)
    # features.save_hdfs_parquet(interaction_df, outputPath)
    dataIO.save_hdfs_parquet(interaction_df, outputPath)

    interaction_df.registerTempTable("interactions")
    tcp_interactions = sqlContext.sql("""
        SELECT duration, dst_bytes, protocol_type FROM interactions
        WHERE protocol_type = 'tcp' AND duration > 1000 AND dst_bytes = 0
        """)
    tcp_interactions.show()

    features.print_tcp_interactions(tcp_interactions)
    dataIO.print_from_dataio()
    features.print_from_feature()
    sc.stop()
def get_latest_data(self):
    from pyspark.sql import SparkSession
    import config
    import pandas as pd

    # initialise sparkContext
    spark1 = SparkSession.builder \
        .master(config.sp_master) \
        .appName(config.sp_appname) \
        .config('spark.executor.memory', config.sp_memory) \
        .config("spark.cores.max", config.sp_cores) \
        .getOrCreate()
    sc = spark1.sparkContext

    # using SQLContext to read parquet file
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)

    from datetime import datetime
    t1 = datetime.now()

    df = sqlContext.read.parquet(config.proj_path + '/datas/appid_datapoint_parquet1')

    # creating and querying from the temporary table
    df1 = df.registerTempTable('dummy')
    df1 = sqlContext.sql('select count(distinct application) as app_count, time_stamp, source from dummy group by source, time_stamp')

    # data cleaning
    self.p2_df = df1.toPandas()
    dates_outlook = pd.to_datetime(pd.Series(self.p2_df.time_stamp), unit='ms')
    self.p2_df.index = dates_outlook
    self.p2_df['date'] = self.p2_df.index.date
    self.p2_df = self.p2_df.sort_values(by='time_stamp')

    t2 = datetime.now()
    time_to_fetch = str(t2 - t1)
def read_file_spark(file_path, file_type, **kwargs): sc = init_nncontext() node_num, core_num = get_node_and_core_number() if ZooContext.orca_pandas_read_backend == "pandas": file_url_splits = file_path.split("://") prefix = file_url_splits[0] file_paths = [] if isinstance(file_path, list): [ file_paths.extend(extract_one_path(path, os.environ)) for path in file_path ] else: file_paths = extract_one_path(file_path, os.environ) if not file_paths: raise Exception( "The file path is invalid or empty, please check your data") num_files = len(file_paths) total_cores = node_num * core_num num_partitions = num_files if num_files < total_cores else total_cores rdd = sc.parallelize(file_paths, num_partitions) if prefix == "hdfs": pd_rdd = rdd.mapPartitions( lambda iter: read_pd_hdfs_file_list(iter, file_type, **kwargs)) elif prefix == "s3": pd_rdd = rdd.mapPartitions( lambda iter: read_pd_s3_file_list(iter, file_type, **kwargs)) else: def loadFile(iterator): for x in iterator: df = read_pd_file(x, file_type, **kwargs) yield df pd_rdd = rdd.mapPartitions(loadFile) else: # Spark backend; spark.read.csv/json accepts a folder path as input assert file_type == "json" or file_type == "csv", \ "Unsupported file type: %s. Only csv and json files are supported for now" % file_type from pyspark.sql import SQLContext sqlContext = SQLContext.getOrCreate(sc) spark = sqlContext.sparkSession # TODO: add S3 confidentials # The following implementation is adapted from # https://github.com/databricks/koalas/blob/master/databricks/koalas/namespace.py # with some modifications. if "mangle_dupe_cols" in kwargs: assert kwargs[ "mangle_dupe_cols"], "mangle_dupe_cols can only be True" kwargs.pop("mangle_dupe_cols") if "parse_dates" in kwargs: assert not kwargs["parse_dates"], "parse_dates can only be False" kwargs.pop("parse_dates") names = kwargs.get("names", None) if "names" in kwargs: kwargs.pop("names") usecols = kwargs.get("usecols", None) if "usecols" in kwargs: kwargs.pop("usecols") dtype = kwargs.get("dtype", None) if "dtype" in kwargs: kwargs.pop("dtype") squeeze = kwargs.get("squeeze", False) if "squeeze" in kwargs: kwargs.pop("squeeze") index_col = kwargs.get("index_col", None) if "index_col" in kwargs: kwargs.pop("index_col") if file_type == "csv": # Handle pandas-compatible keyword arguments kwargs["inferSchema"] = True header = kwargs.get("header", "infer") if isinstance(names, str): kwargs["schema"] = names if header == "infer": header = 0 if names is None else None if header == 0: kwargs["header"] = True elif header is None: kwargs["header"] = False else: raise ValueError("Unknown header argument {}".format(header)) if "quotechar" in kwargs: quotechar = kwargs["quotechar"] kwargs.pop("quotechar") kwargs["quote"] = quotechar if "escapechar" in kwargs: escapechar = kwargs["escapechar"] kwargs.pop("escapechar") kwargs["escape"] = escapechar # sep and comment are the same as pandas if "comment" in kwargs: comment = kwargs["comment"] if not isinstance(comment, str) or len(comment) != 1: raise ValueError( "Only length-1 comment characters supported") df = spark.read.csv(file_path, **kwargs) if header is None: df = df.selectExpr(*[ "`%s` as `%s`" % (field.name, i) for i, field in enumerate(df.schema) ]) else: df = spark.read.json(file_path, **kwargs) # Handle pandas-compatible postprocessing arguments if usecols is not None and not callable(usecols): usecols = list(usecols) renamed = False if isinstance(names, list): if len(set(names)) != len(names): raise ValueError( "Found duplicate names, please check your 
names input") if usecols is not None: if not callable(usecols): # usecols is list if len(names) != len(usecols) and len(names) != len( df.schema): raise ValueError("Passed names did not match usecols") if len(names) == len(df.schema): df = df.selectExpr(*[ "`%s` as `%s`" % (field.name, name) for field, name in zip(df.schema, names) ]) renamed = True else: if len(names) != len(df.schema): raise ValueError( "The number of names [%s] does not match the number " "of columns [%d]. Try names by a Spark SQL DDL-formatted " "string." % (len(names), len(df.schema))) df = df.selectExpr(*[ "`%s` as `%s`" % (field.name, name) for field, name in zip(df.schema, names) ]) renamed = True index_map = dict([(i, field.name) for i, field in enumerate(df.schema)]) if usecols is not None: if callable(usecols): cols = [ field.name for field in df.schema if usecols(field.name) ] missing = [] elif all(isinstance(col, int) for col in usecols): cols = [ field.name for i, field in enumerate(df.schema) if i in usecols ] missing = [ col for col in usecols if col >= len(df.schema) or df.schema[col].name not in cols ] elif all(isinstance(col, str) for col in usecols): cols = [ field.name for field in df.schema if field.name in usecols ] if isinstance(names, list): missing = [c for c in usecols if c not in names] else: missing = [col for col in usecols if col not in cols] else: raise ValueError( "usecols must only be list-like of all strings, " "all unicode, all integers or a callable.") if len(missing) > 0: raise ValueError( "usecols do not match columns, columns expected but not found: %s" % missing) if len(cols) > 0: df = df.select(cols) if isinstance(names, list): if not renamed: df = df.selectExpr(*[ "`%s` as `%s`" % (col, name) for col, name in zip(cols, names) ]) # update index map after rename for index, col in index_map.items(): if col in cols: index_map[index] = names[cols.index(col)] if df.rdd.getNumPartitions() < node_num: df = df.repartition(node_num) def to_pandas(columns, squeeze=False, index_col=None): def f(iter): import pandas as pd data = list(iter) pd_df = pd.DataFrame(data, columns=columns) if dtype is not None: if isinstance(dtype, dict): for col, type in dtype.items(): if isinstance(col, str): if col not in pd_df.columns: raise ValueError( "column to be set type is not" " in current dataframe") pd_df[col] = pd_df[col].astype(type) elif isinstance(col, int): if index_map[col] not in pd_df.columns: raise ValueError( "column index to be set type is not" " in current dataframe") pd_df[index_map[col]] = pd_df[ index_map[col]].astype(type) else: pd_df = pd_df.astype(dtype) if squeeze and len(pd_df.columns) == 1: pd_df = pd_df.iloc[:, 0] if index_col: pd_df = pd_df.set_index(index_col) return [pd_df] return f pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns, squeeze, index_col)) data_shards = SparkXShards(pd_rdd) return data_shards
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameWriter

# Module Constants
APP_NAME = "reddit-comment-controversiality-regression"
REDDIT_AUG = "swift://reddit3.sjc01/RC_2010-08"
REDDIT_SEPT = "swift://reddit3.sjc01/RC_2010-09"

if __name__ == "__main__":
    # Configure Spark
    sc = SparkContext(appName=APP_NAME)
    sqlContext = SQLContext(sc)

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # prepare Reddit json files as SQL DataFrames for pyspark.ml
    aug_comments = sqlContext.read.json(REDDIT_AUG)
    sep_comments = sqlContext.read.json(REDDIT_SEPT)
    training = aug_comments.select('id', 'body', (aug_comments.controversiality).cast("double").alias('label'))
    test = sep_comments.select('id', 'body')
    test_actual = sep_comments.select('id', (sep_comments.controversiality).alias('actual'))
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql import functions as F

# Create a Spark session
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('My App') \
    .getOrCreate()
print(spark)

# Read a parquet file
sparkContext = spark.sparkContext
sc = SQLContext(sparkContext)
df = sc.read.parquet('../data/userdata1.parquet')
print(df)

# Mean
mean_df = df.agg({'salary': 'mean'})
print(mean_df.collect()[0][0])

# Using describe
described_df = df.select(['salary']).describe()
print(described_df.collect()[1][1])
{"name":"Michael"} {"name":"Andy", "age":30} {"name":"Justin", "age":19} Overview Spark SQL is a Spark module for structured data processing. It provides a programmi- ng abstraction called DataFrames and can also act as distributed SQL query engine. - Spark SQL can also be used to read data from an existing Hive installation. --> DataFrames -->--> Starting Point: SQLContext #python >from pyspark.sql import SQLContext >sqlContext = SQLContext(sc) -->--> Creating DataFrames #python >from pyspark.sql import SQLContext >sqlContext = SQLContext(sc) >df = sqlContext.read.json("examples/src/main/resources/people.json") # Displays the content of the DataFrame to stdout >df.show() -->--> DataFrame Operations ------- python from pyspark.sql import SQLContext sqlContext = SQLContext(sc)
from pyspark.sql import SQLContext
from pyspark.sql import HiveContext
from pyspark.sql.types import *
import steel_thread
from pyspark import SparkContext
import forecast_data_v3
import numpy as np
import pandas as pd

sc = SparkContext()
hive_context = HiveContext(sc)
sqlContext = SQLContext(sc)

outageData = sc.textFile("file:///home/w205/steel_thread/outage_history.csv")
weatherData = sc.textFile("file:///home/w205/steel_thread/weather_history.csv")

riOutages = outageData.filter(lambda x: "Rhode Island" in x)
riOutageRecords = riOutages.map(lambda r: r.split(","))
weatherRecords = weatherData.map(lambda r: r.split(","))

# I could not figure out how to properly parse this...
RI_Outages = riOutageRecords.map(lambda p: (p[2], p[4], p[5], p[8], p[12]))
RI_Weather = weatherRecords.map(lambda p: (p[5], p[6], p[26], p[27], p[28], p[30], p[37], p[38], p[39],
                                           p[40], p[41], p[42], p[43], p[44], p[46]))

# If the above gets updated, this would too (of course)
outageSchemaString = 'DATETIME HR MIN AREA NUMCUSTOMERS'
weatherSchemaString = 'DTS ReportType maxTemp minTemp aveTemp aveHumidity WeatherCodes Precip Snowfall SnowDepth aveStationPressure aveSeaLevelPressure aveWindSpeed maxWindSpeed SustainedWindSpeed'

outageFields = [
    StructField(field_name, StringType(), True) for field_name in outageSchemaString.split()
]
# See the License for the specific language governing permissions and
# limitations under the License.
#******************************************************************************/

import pprint
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("Cloudant Spark SQL External Datasource in Python")

# define Cloudant related configuration
conf.set("cloudant.host", "yanglei.cloudant.com")
conf.set("cloudant.username", "ntledesewstarkalkedirsee")
conf.set("cloudant.password", "b0VbcAS7davOYC0f4umPC2BR")

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

print 'About to test com.cloudant.spark.CloudantRP for airportcodemapping'
sqlContext.sql(
    "CREATE TEMPORARY TABLE airportTable USING com.cloudant.spark.CloudantRP OPTIONS ( database 'airportcodemapping')"
)
airportData = sqlContext.sql(
    "SELECT airportCode, airportName FROM airportTable WHERE airportCode >= 'CAA' ORDER BY airportCode"
)
airportData.printSchema()
for code in airportData.collect():
    print code.airportCode

print 'About to test com.cloudant.spark.CloudantRP for booking'
sqlContext.sql(
from pyspark.sql.types import IntegerType, StringType
import pyspark.sql.functions as F
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.clustering import KMeans
import json
import socket
import pandas as pd
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.linalg import Vectors

conf = SparkConf().setAppName('MyFirstStandaloneApp')
conf.set("spark.network.timeout", "5601s")
conf.set("spark.executor.heartbeatInterval", "5600s")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)


def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]), r[-1]]).toDF(['features', 'label'])


# ---------- CHANGE THE PATH TO THE TRAINING DATA FILE!!!! ----------
lines = sc.textFile('Structured_data2')
data = lines.map(lambda line: line.split(";"))
df_all = data.toDF(['Scrap_date', 'Scrap_time', 'Country_from', 'Country_to', 'Flight_id', 'Days', 'Journey_time', 'Airline1_There',
                    'Airline1_Back', 'Airline2_There', 'Airline2_Back', 'Price1_There', 'Price1_Back', 'Price2_There', 'Price2_Back',
                    'Depart_hour1_There', 'Depart_hour1_Back', 'Depart_hour2_There', 'Depart_hour2_Back', 'Depart_from1_There',
                    'Depart_from1_Back', 'Depart_from2_There', 'Depart_from2_Back', 'Arrival_hour1_There', 'Arrival_hour1_Back',
                    'Arrival_hour2_There', 'Arrival_hour2_Back', 'Arrive_to1_There', 'Arrive_to1_Back', 'Arrive_to2_There',
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from graphframes.examples import Graphs

spark = SparkSession.builder.appName("GraphX").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
sqlContext = SQLContext(spark.sparkContext)

# Build the GraphFrames "friends" example graph and show its vertices and edges
g = Graphs(sqlContext).friends()
g.vertices.show()
g.edges.show()
# Even though columns are named differently, the column indices of the ones
# we're interested in are consistent across years
COLUMN_INDEX_TO_NAME = {
    1: "Pickup_Time",
    2: "Dropoff_Time",
    5: "Start_Lon",
    6: "Start_Lat",
    9: "End_Lon",
    10: "End_Lat",
}

# Setup Spark
conf = (SparkConf().setAppName('taxi-preprocessing'))
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
sql_context = SQLContext(sc)

# Read & parse the file list.
# From lines like "2009/yellow_tripdata_2009-03.csv", it extracts the file name.
with open(input_file_list_file) as filelist_file:
    filelist = [line.strip().split("/")[1] for line in filelist_file.readlines()]

# Read in all CSVs, project the relation to the columns we need & concatenate
df = None
for csv_file in filelist:
    new_df = sql_context.read.format('com.databricks.spark.csv')\
        .options(header='true', inferschema='true')\
        .load(file_location_base + csv_file)
    for column_index, column_name in COLUMN_INDEX_TO_NAME.iteritems():
        new_df = new_df.withColumnRenamed(new_df.columns[column_index], column_name)
def getSqlContextInstance(sparkContext):
    # Lazily instantiate a single global SQLContext for the given SparkContext
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']
import re
# other required imports here

if __name__ == "__main__":
    # create Spark context with necessary configuration
    spark = SparkContext("local", "Stock Returns")

    # read json data from the newdata directory
    # df = SQLContext(spark).read.option("multiLine", True) \
    #     .option("mode", "PERMISSIVE").json("./newsdata")

    schema = ('date STRING, open FLOAT, high FLOAT, low FLOAT, close FLOAT, volume INT, ticker STRING')
    df = SQLContext(spark).read.csv('stock_prices.csv', schema=schema, header=False)
    # df.show(2)
    # lines = df.select("date", "open", "close")

    # daily percentage return for each row
    # sim = df.withColumn("percent", (df("close") - df("open")) * 100 / df("open"))
    sim = df.withColumn("return", (df["close"] - df["open"]) * 100 / df["open"])
    # sim.groupBy('date').avg('return').show()
    # sim.select("date", "return").groupBy("date").avg()

    # average return per date
    x = sim.groupBy("date").avg("return")
    x.collect()
    # sim = sim.select('date', 'return')
    # df.groupBy(df.date).avg(df.close - df.open).show()
    # vals = lines.map(lambda row: row[2] - row[1])  # to take avg on key
else:
    k = 10
    w = 0.5
    alpha = 6
    b_update = True
    debug = True
    loss_type = 0
    dataset = 'slicing/datasets/parallel_data/salaries/rows1000.csv'
    enumerator = "join"

conf = SparkConf().setAppName("salary_test").setMaster('local[4]')
num_partitions = 8
model_type = "regression"
label = 'salary'
sparkContext = SparkContext(conf=conf)
sqlContext = SQLContext(sparkContext)
fileRDD = sparkContext.textFile(dataset, num_partitions)
header = fileRDD.first()
head_split = header.split(",")
fileRDD = fileRDD.filter(lambda line: line != header)
data = fileRDD.map(lambda row: row.split(","))
dataset_df = sqlContext.createDataFrame(data, head_split)

cat_features = ["rank", "discipline", "sincephd_bin", "service_bin", "sex"]
# initializing stages of main transformation pipeline
stages = []
dataset_df = dataset_df.withColumn("id", sf.monotonically_increasing_id())
# binning numeric features by local binner udf function (specified for current dataset if needed)
dataset_df = dataset_df.withColumn('sincephd_bin', binner(dataset_df['sincephd']))
dataset_df = dataset_df.withColumn('service_bin',
# schema defined should exactly match the table created in cassandra
class userrepo2014_2(Model):
    username = columns.Text(primary_key=True)
    repo = columns.List(columns.Text)

    def __repr__(self):
        return '%s %d' % (self.username, self.repo)


# getting master node's IP and public DNS to run Spark job and read from HDFS
master_ip = os.environ['master_ip']
master_public_dns = os.environ['master_public_dns']

# setting SparkContext and SQLContext
sc = SparkContext("spark://" + master_ip + ":7077", "2014_events")
sqlContext = SQLContext(sc)

# reading events data for 2014 from HDFS
df14 = sqlContext.jsonFile("hdfs://" + master_public_dns + ":9000/data2014_2/2014-*.*")

# filtering rows with just the three relevant events
df14_watch = df14.filter("type='WatchEvent'")
df14_commit = df14.filter("type='CommitCommentEvent'")
df14_fork = df14.filter("type='ForkEvent'")

# registering dataframes as tables to be able to select just the three relevant columns
sqlContext.registerDataFrameAsTable(df14_watch, "df14_watch_table")
sqlContext.registerDataFrameAsTable(df14_commit, "df14_commit_table")
sqlContext.registerDataFrameAsTable(df14_fork, "df14_fork_table")
def __init__(self, sparkContext, magellanContext=None):
    SQLContext.__init__(self, sparkContext)
    if magellanContext:
        self._scala_MagellanContext = magellanContext
# -*- coding: utf-8 -*-
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import os
import time
import re

if __name__ == "__main__":
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # PYSPARK_PYTHON = "C:\\Python27\\python.exe"
    # With multiple Python versions installed, set this variable to point at the interpreter to use
    # os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON

    def p(x):
        print type(x), type(x[0]), type(x[1])
        # print type(x)
        # print type(x[0]), type(x[1])
        print x[0], x[1][0], x[1][1]

    # judgment_new: about 5.56 million rows (3030306, 3392975)
    df = sqlContext.read.jdbc(url='jdbc:mysql://cdh-slave1:3306/laws_doc_zhangye_etl',
                              table='(select * from judgment_zhangye_etl01 ) tmp',
                              column='id', lowerBound=1, upperBound=4816521, numPartitions=28,
                              properties={"user": "******", "password": "******"})
    # court: 4778 rows
# Spark Hands On Training
# Databricks CE Cloud Practice
# Raul Arrabales / Conscious-Robots.com

# Getting the Spark SQL context and imports
from pyspark.sql import SQLContext, Row

sqlContext = SQLContext.getOrCreate(sc.getOrCreate())

# Creating a simple DataFrame programmatically
array = [
    Row(key="a", group="vowels", value=1),
    Row(key="b", group="consonants", value=2),
    Row(key="c", group="consonants", value=3),
    Row(key="d", group="consonants", value=4),
    Row(key="e", group="vowels", value=5)
]
dataframe = sqlContext.createDataFrame(sc.parallelize(array))
dataframe.registerTempTable("PythonTestTable")

# Visualize (in Databricks cloud - Display() )
display(dataframe)

# Creating more sample DataFrames:
# Sample age data:
datosEdad = [('Raul', 22), ('Ana', 32), ('Juan', 46)]
df1 = sqlContext.createDataFrame(datosEdad, ['nombre', 'edad'])

# Apply filter to age data:
filtroEdad = df1.filter(df1.edad >= 30).collect()
print filtroEdad
# -*- coding:utf-8 -*-
# author : seed
# date   : 20170522
# Tests based on the official examples in the Spark SQL documentation
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *
#from pyspark.sql.functions import col

conf = SparkConf().setAppName("the apache sparksql")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sc.setLogLevel("WARN")

l = [("zhangfei", 1), ("guanyu", 33)]
row = sqlContext.createDataFrame(l).collect()
row = sqlContext.createDataFrame(l, ['name', 'age']).collect()
print(row)

d = [{"name": "zhangfei", "age": 33}, {"name": "guanyu", "age": 44}]
row = sqlContext.createDataFrame(d).collect()
print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx ")
print(row)

rdd = sc.parallelize(l)
row = sqlContext.createDataFrame(rdd).collect()
print("xxxxxxxxxxxxxxxxxxxx the 2nd xxxxxxxxxxxxxxxxxxxxxxxxxxx")
print(row)
# Project Crime/Living Index - Dhivya Sivaramakrishnan, Mangesh Bhangare
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, DataFrame, Row
import sys

conf = SparkConf().setAppName('K-Means test')
sc = SparkContext(conf=conf)
assert sc.version >= '1.5.1'
sqlContext = SQLContext(sc)

input_cluster = sys.argv[1]
output = sys.argv[2]

# Read the parquet data (output of K-means) and convert to RDD
parquet_cluster = sqlContext.read.parquet(input_cluster)
parquet_cluster.registerTempTable("cluster_data")
cluster_output = sqlContext.sql("SELECT * FROM cluster_data")

# Save the result as a text file containing tuples
cluster_tuple = cluster_output.rdd.map(tuple)
cluster_output = cluster_tuple.saveAsTextFile(output)
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import split
from pyspark.sql.functions import lit
from pyspark.sql.functions import format_string
from pyspark.sql.functions import monotonically_increasing_id
import functools

sc = SparkContext()
sqlc = SQLContext(sc)


def unionAll(dfs):
    # Union a list of DataFrames, aligning each to the first DataFrame's columns
    return functools.reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)


original_df = sqlc.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@150.136.138.197:1521/BIASDB_PDB1.subnet12011439.vcn12011439.oraclevcn.com") \
    .option("dbtable", "POC.social_media_dwh") \
    .option("user", "poc") \
    .option("password", "WElcome##123") \
    .option("driver", "oracle.jdbc.driver.OracleDriver") \
    .load()

selectDF = original_df.select(original_df['social_media_id'], original_df['friends_list'])
def predict(sc, borough, speed, weather):
    sqlContext = SQLContext(sc)
    isSpeeding = 0
    if int(speed) > 70:
        isSpeeding = 1

    model = PipelineModel.load("data/treeModelNew2")

    time = str(datetime.now()).split(' ')
    month = float(time[0].split('-')[1])
    hour = float(time[1].split(':')[0])

    city2id = {
        "Brooklyn": 0,
        "Queens": 1,
        "Staten_Island": 2,
        "Bronx": 3,
        "Manhattan": 4
    }
    if hour == 0.0:
        hour = 24.0

    data = [(float(city2id[borough]), float(speed), month, hour,
             float(weather['wind']), float(weather['rain']), float(weather['snow']),
             float(weather['snwd']), float(weather['temp'] * 9.0 / 5.0 + 32))]
    rdd = sc.parallelize(data)
    test = rdd.map(lambda x: Row(BOROUGH_1=x[0], MEAN_SPEED=x[1], MONTH=x[2], HOUR=x[3],
                                 AWND=x[4], PRCP=x[5], SNOW=x[6], SNWD=x[7], TAVG=x[8]))
    df = sqlContext.createDataFrame(test)
    df.show()

    assembler = VectorAssembler(inputCols=["BOROUGH_1", "MEAN_SPEED", "MONTH", "HOUR",
                                           "AWND", "PRCP", "SNOW", "SNWD", "TAVG"],
                                outputCol="features")
    df2 = assembler.transform(df)
    df3 = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                        maxCategories=25).fit(df2).transform(df2)

    predictions = model.transform(df3)
    predictions.show()
    predictions = predictions.toPandas()
    predictedLevel = predictions["prediction"][0] + isSpeeding

    print("======================")
    print("======================")
    print("predictedLevel:", predictedLevel)
    print("======================")
    print("======================")
    return predictedLevel
# findspark must be initialised before pyspark is imported
import findspark
findspark.init()

import nltk
from nltk.corpus import stopwords

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Set Spark configuration
conf = SparkConf().setAppName("TF-IDF").set("spark.dynamicAllocation.enabled", "true")
try:
    sc = SparkContext(conf=conf)
except:
    sc.stop()
    sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sql = SQLContext(sc)

docs_path = "C:/stories/test"  # Path to the data
textFiles = sc.wholeTextFiles(docs_path)  # (path_doc_name, content)
num_docs = textFiles.count()

# Get the list of stop words; download the NLTK corpora on first use.
try:
    stops = set(stopwords.words('english'))
except:
    nltk.download('popular')
    stops = set(stopwords.words('english'))


def delete_stop_word(word: str):
    global stops
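# --- Hedged addition: the body of delete_stop_word() is cut off above; as an illustration only
# (an assumption, not the author's implementation), a standalone transformation over the
# (path, content) pairs that drops English stop words per document.
def strip_stops(doc):
    path, content = doc
    kept = [w for w in content.lower().split() if w not in stops]
    return (path, kept)

tokenized = textFiles.map(strip_stops)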
from pyspark.sql import SQLContext


def get_sql_context_instance(spark_context):
    # Lazily create and reuse a single SQLContext for the whole driver process
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']
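# --- Hedged addition: a minimal sketch of how this singleton getter is typically used from a
# Spark Streaming foreachRDD callback, assuming a DStream of strings called words; all names
# other than get_sql_context_instance are illustrative.
from pyspark.sql import Row

def process_rdd(time, rdd):
    if rdd.isEmpty():
        return
    sql_context = get_sql_context_instance(rdd.context)
    row_rdd = rdd.map(lambda w: Row(word=w))
    df = sql_context.createDataFrame(row_rdd)
    df.registerTempTable("words")
    sql_context.sql("SELECT word, COUNT(*) AS total FROM words GROUP BY word").show()

# words.foreachRDD(process_rdd)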
from pyspark import SparkContext, SparkConf
import pyspark
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructField, StringType, DoubleType, IntegerType
from pyspark.sql import functions as f
from pyspark.sql.functions import lit, trim, concat, coalesce, udf, struct
from pyspark.sql import SQLContext

# In[4]:

sc = SparkContext('local', 'similarity')
sqlContext = SQLContext(sc)

# In[150]:

# Reading the input
input_1 = sc.textFile('./data1.txt')

# In[151]:

# Append a '~' marker to every line so documents can be delimited later
input_f = input_1.map(lambda x: x + '~ ')
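# --- Hedged addition: a sketch (an assumption about intent, not the original code) of how the
# '~' markers appended above could be used to rebuild per-document strings on the driver.
joined = ' '.join(input_f.collect())
documents = [d.strip() for d in joined.split('~') if d.strip()]
print(len(documents))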
from pyspark import SparkContext, SparkConf from pyspark.sql import SQLContext from pyspark.sql.types import * from pyspark.sql.functions import udf from pyspark.sql.functions import col from pyspark.sql.types import StringType, DoubleType, IntegerType from abbreviations_dict import tofullname, toevent from operator import itemgetter from pyspark import StorageLevel import pyspark_cassandra sc = SparkContext() sqlContext = SQLContext(sc) customSchema = StructType([ StructField('GLOBALEVENTID',StringType(),True), StructField('SQLDATE',StringType(),True), StructField('MonthYear',StringType(),True), StructField('Year',StringType(),True), StructField('FractionDate',StringType(),True), StructField('Actor1Code',StringType(),True), StructField('Actor1Name',StringType(),True), StructField('Actor1CountryCode',StringType(),True), StructField('Actor1KnownGroupCode',StringType(),True), StructField('Actor1EthnicCode',StringType(),True), StructField('Actor1Religion1Code',StringType(),True), StructField('Actor1Religion2Code',StringType(),True), StructField('Actor1Type1Code',StringType(),True), StructField('Actor1Type2Code',StringType(),True), StructField('Actor1Type3Code',StringType(),True), StructField('Actor2Code',StringType(),True),
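# --- Hedged addition: a minimal sketch showing how a schema like customSchema is typically
# applied when loading delimited GDELT exports with spark-csv; the input path and the tab
# delimiter are assumptions of this sketch, and customSchema is truncated in this excerpt.
events = sqlContext.read \
    .format('com.databricks.spark.csv') \
    .options(delimiter='\t', header='false') \
    .schema(customSchema) \
    .load('hdfs:///data/gdelt/events.csv')
events.registerTempTable('events')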
#
# Copyright 2021, SenX S.A.S.
#

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

spark = SparkSession.builder.appName("02").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

##
## Configuration used to fetch data from a Warp 10 instance
##

conf = {}
conf['warp10.fetcher.fallbacks'] = '127.0.0.1'
conf['warp10.fetcher.fallbacksonly'] = 'true'
conf['warp10.fetcher.protocol'] = 'http'
conf['http.header.now'] = 'X-Warp10-Now'
conf['http.header.timespan'] = 'X-Warp10-Timespan'
conf['warp10.fetcher.port'] = '8080'
conf['warp10.fetcher.path'] = '/api/v0/sfetch'
conf['warp10.splits.endpoint'] = 'http://127.0.0.1:8080/api/v0/splits'
# Fetch a single data point per GTS; a positive value would instead be interpreted
# as an actual timespan.
conf['warp10.fetch.timespan'] = '-1'
conf['warp10.http.connect.timeout'] = '60000'
conf['warp10.http.read.timeout'] = '60000'
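# --- Hedged addition: a sketch of one way such a configuration dict can be handed to PySpark's
# generic newAPIHadoopRDD() call; the input format, key and value class names below are
# placeholders (assumptions of this sketch), not confirmed Warp 10 class names.
rdd = sc.newAPIHadoopRDD(
    inputFormatClass='io.warp10.hadoop.Warp10InputFormat',  # placeholder class name
    keyClass='org.apache.hadoop.io.Text',                   # placeholder key class
    valueClass='org.apache.hadoop.io.BytesWritable',        # placeholder value class
    conf=conf)
print(rdd.count())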
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext


def sim_matrix(pair):
    # pair is (term, [(doc_id, weight), ...]); emit pairwise ((doc_i, doc_j), weight_i * weight_j)
    similarity_mat = []
    pair = pair[1]
    for i in range(len(pair)):
        for j in range(i + 1, len(pair)):
            wt1 = pair[i][1]
            wt2 = pair[j][1]
            sim = ((pair[i][0], pair[j][0]), wt1 * wt2)
            similarity_mat.append(sim)
    return similarity_mat


conf = SparkConf().setAppName("HW3_Part3_Avro_Uncompressed")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.avro.compression.codec", "uncompressed")

#Read the inverted index as dataframe and convert it into rdd
inv_index_data_avro = sqlContext.read.format("com.databricks.spark.avro").load(sys.argv[1])
inv_index_rdd = inv_index_data_avro.rdd.map(list)

#Find similarity matrix
inverted_file = inv_index_rdd.filter(lambda pr: len(pr) > 0)
sim_cal = inverted_file.map(sim_matrix).flatMap(lambda pr: pr)
similarity_matrix = sim_cal.reduceByKey(lambda c1, c2: c1 + c2).sortBy(
    lambda x: x[1], ascending=False)

#Create dataframe and write as avro file
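# --- Hedged addition: the write step described by the comment above is not included in this
# excerpt; a minimal sketch of what it could look like with spark-avro, assuming the output
# path is passed as sys.argv[2] (an assumption of this sketch).
sim_df = sqlContext.createDataFrame(
    similarity_matrix.map(lambda x: (x[0][0], x[0][1], float(x[1]))),
    ['doc1', 'doc2', 'similarity'])
sim_df.write.format('com.databricks.spark.avro').save(sys.argv[2])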
!pip install pyspark

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import Row
from pyspark.sql import SQLContext

conf = SparkConf().setAll([('spark.executor.memory', '1g'), ('spark.driver.memory', '1g')])
sc = SparkContext(conf=conf)
# Creating an SQLContext also fixes: AttributeError: 'PipelinedRDD' object has no attribute 'toDF'
sqlContext = SQLContext(sc)

rdd = sc.parallelize([1, 2, 3, 4])
df = rdd.map(lambda l: Row(l)).toDF()

with open('/bigdata/xiaoma/spark/data/people.csv') as f:
    for l in f:
        print(l)

myDF = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").load("file:///bigdata/xiaoma/spark/data/people.csv")

df.registerTempTable("tasks")
results = sqlContext.sql("select * from tasks")
results.show()

lines = sc.textFile('file:///bigdata/xiaoma/spark/data/people.csv')\
    .map(lambda x: x.split(','))\
    .map(lambda l: Row(ID=l[0], name=l[1], age=l[2], sex=l[3], val=l[4]))
for i in lines.collect():
    print(i)

myDF = sc.textFile('file:///bigdata/xiaoma/spark/data/people.csv')\
    .map(lambda x: x.split(','))\
    .map(lambda l: Row(ID=l[0], name=l[1], age=l[2], sex=l[3], val=l[4]))\
    .toDF()
myDF.show(20)
myDF.select('name').show()
myDF.registerTempTable("tmp_df")
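# --- Hedged addition: a small sketch of querying the temp table registered above; the column
# names follow the Row fields used when building myDF, and the age filter (with a cast, since
# the columns are parsed as strings) is purely illustrative.
adults = sqlContext.sql("SELECT name, age FROM tmp_df WHERE CAST(age AS INT) > 18")
adults.show()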
# f is an RDD of log lines loaded earlier (not shown in this excerpt)
f.filter(lambda x: "LMKBRUKER" in x).count()

errors = f.filter(lambda line: line.startswith("139.116.15.37,POSTEN"))
messages = errors.map(
    lambda s: s.split(',')[2])  # Get the third element in the tuplet
messages.cache()
messages.filter(lambda s: "7/28" in s).count()
messages = errors.map(lambda s: s.split(',')[2]).collect()

# --------------------------------------------------------------------------------------------
# Spark SQL:
#
from pyspark.sql import SQLContext, Row
from pyspark.sql import *
sqlContext = SQLContext(sc)

messages = errors.map(
    lambda s: s.split(','))  # Split each line into its fields
for m in messages.first()[0:4]:  # Get fields 0-3 of row 0
    print m

# Ex 1
lines = sc.textFile("file:///" +
                    "C:/coding/Hadoop/pig/MapReduceInputData/iis3.log")
messages = lines.map(lambda l: l.split(","))
messages_subset = messages.map(
    lambda p: Row(ip=p[0], user=p[1], date=p[2], time=p[3]))

# Ex 2
lines = sc.textFile("file:///" +
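# --- Hedged addition: a sketch of the usual next step for Ex 1, turning the Row RDD into a
# DataFrame and querying it; the table name and the aggregation are illustrative.
schemaLog = sqlContext.createDataFrame(messages_subset)
schemaLog.registerTempTable("iislog")
hits = sqlContext.sql("SELECT user, COUNT(*) AS requests FROM iislog GROUP BY user")
for h in hits.collect():
    print h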
# May cause deprecation warnings; they are safe to ignore and are not errors.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc

# In[2]:

# Can only be run once; restart your kernel if this raises an error.
sc = SparkContext()

# In[3]:

# Micro-batches every 10 seconds
ssc = StreamingContext(sc, 10)
sqlContext = SQLContext(sc)

# In[4]:

socket_stream = ssc.socketTextStream("127.0.0.1", 5555)

# In[5]:

# Sliding window over the last 20 seconds of the stream
lines = socket_stream.window(20)

# In[6]:

from collections import namedtuple
fields = ("tag", "count")
Tweet = namedtuple('Tweet', fields)
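# --- Hedged addition: a sketch (not the original notebook cells) of how the windowed stream is
# commonly reduced to hashtag counts and exposed as a temp table via the Tweet namedtuple
# defined above; the table name "tweets" is an assumption, and empty micro-batches may fail.
(lines.flatMap(lambda text: text.split(" "))
      .filter(lambda word: word.lower().startswith("#"))
      .map(lambda word: (word.lower(), 1))
      .reduceByKey(lambda a, b: a + b)
      .map(lambda rec: Tweet(rec[0], rec[1]))
      .foreachRDD(lambda rdd: rdd.toDF()
                  .sort(desc("count"))
                  .limit(10)
                  .registerTempTable("tweets")))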
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.tree import RandomForest
#from pyspark.mllib.classification import LogisticRegressionWithLBFGS, NaiveBayes

# PATH_TO_TRAIN_LABELS, PATH_TO_SUB_LABELS, tokenize() and parsePoint() are assumed to be
# defined elsewhere in the original script; they are not part of this excerpt.


def main(argv):
    #STEP1: data ingestion
    sc = SparkContext(appName="KaggleDato_Step2")
    sqlContext = SQLContext(sc)

    #read data into RDD
    input_schema_rdd = sqlContext.read.json(
        "file:///scratch/network/alexeys/KaggleDato/Preprocessed/0_1/part-00000"
    )
    #input_schema_rdd.show()
    #input_schema_rdd.printSchema()
    #input_schema_rdd.select("id").show()

    train_label_rdd = sqlContext.read.json(PATH_TO_TRAIN_LABELS)
    sub_label_rdd = sqlContext.read.json(PATH_TO_SUB_LABELS)

    input_schema_rdd.registerTempTable("input")
    train_label_rdd.registerTempTable("train_label")
    sub_label_rdd.registerTempTable("sub_label")

    # SQL can be run over DataFrames that have been registered as a table.
    train_wlabels_0 = sqlContext.sql(
        "SELECT title,text,images,links,label FROM input JOIN train_label WHERE input.id = train_label.id AND label = 0"
    )
    train_wlabels_1 = sqlContext.sql(
        "SELECT title,text,images,links,label FROM input JOIN train_label WHERE input.id = train_label.id AND label = 1"
    )
    sub_wlabels = sqlContext.sql(
        "SELECT title,text,images,links,label FROM input JOIN sub_label WHERE input.id = sub_label.id"
    )

    text_only_0 = train_wlabels_0.map(lambda p: p.text)
    text_only_1 = train_wlabels_1.map(lambda p: p.text)
    image_only_0 = train_wlabels_0.map(lambda p: p.images)
    image_only_1 = train_wlabels_1.map(lambda p: p.images)
    links_only_0 = train_wlabels_0.map(lambda p: p.links)
    links_only_1 = train_wlabels_1.map(lambda p: p.links)
    title_only_0 = train_wlabels_0.map(lambda p: p.title)
    title_only_1 = train_wlabels_1.map(lambda p: p.title)

    tf = HashingTF(numFeatures=10)
    #preprocess text features
    text_documents_0 = text_only_0.map(lambda line: tokenize(line)).map(
        lambda word: tf.transform(word))
    text_documents_1 = text_only_1.map(lambda line: tokenize(line)).map(
        lambda word: tf.transform(word))

    #add the ad hoc non-text features
    documents_0 = text_documents_0.zip(image_only_0).zip(links_only_0).zip(
        title_only_0)
    documents_1 = text_documents_1.zip(image_only_1).zip(links_only_1).zip(
        title_only_1)

    #turn into a format expected by MLlib classifiers
    labeled_tfidf_0 = documents_0.map(lambda row: parsePoint(0, row))
    labeled_tfidf_1 = documents_1.map(lambda row: parsePoint(1, row))
    #print labeled_tfidf_0.take(2)
    labeled_tfidf = labeled_tfidf_0.union(labeled_tfidf_1)
    #print labeled_tfidf.count()
    #print labeled_tfidf.collect()
    labeled_tfidf.cache()

    #CV split
    (trainData, cvData) = labeled_tfidf.randomSplit([0.7, 0.3])
    trainData.cache()
    cvData.cache()

    #Try various classifiers; train only on trainData
    #model = LogisticRegressionWithLBFGS.train(trainData)
    #Logistic regression works a lot better than naive Bayes
    #model = NaiveBayes.train(trainData)
    #random forest
    model = RandomForest.trainClassifier(trainData,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=4,
                                         maxBins=32)

    ## Evaluating the model on the CV data
    #labelsAndPreds = cvData.map(lambda p: (p.label, model.predict(p.features)))
    #trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(cvData.count())
    #print("CV Error = " + str(trainErr))

    # Evaluate the model on the CV instances and compute the test error
    predictions = model.predict(cvData.map(lambda x: x.features))
    labelsAndPredictions = cvData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(cvData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
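    # --- Hedged addition: a minimal sketch of computing a few extra CV metrics from the
    # (label, prediction) pairs built above using MLlib's MulticlassMetrics; this is not part
    # of the original script, and the label value 1.0 is used only for illustration.
    from pyspark.mllib.evaluation import MulticlassMetrics

    prediction_and_label = labelsAndPredictions.map(lambda lp: (float(lp[1]), float(lp[0])))
    metrics = MulticlassMetrics(prediction_and_label)
    print('Precision(1.0) = ' + str(metrics.precision(1.0)))
    print('Recall(1.0) = ' + str(metrics.recall(1.0)))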