Python SparkContext.accumulator示例

# An accumulator used to build the word vocabulary
class WordsDictAccumulatorParam(AccumulatorParam):
    """Accumulator parameter that merges ``dict`` values key by key.

    Values stored under the same key are combined with ``+=``; keys that are
    only present in the incoming dict are copied over unchanged.
    """

    def zero(self, v):
        """Return the empty dict used as the starting value.

        The initial value ``v`` is ignored.
        """
        return dict()

    def addInPlace(self, acc1, acc2):
        """Merge ``acc2`` into ``acc1`` in place and return ``acc1``.

        A membership test replaces the former bare ``except:`` so that
        genuine failures (for example a ``TypeError`` raised when two values
        cannot be added) are no longer swallowed and silently turned into an
        overwrite of the existing entry.
        """
        for key, value in acc2.items():
            if key in acc1:
                acc1[key] += value
            else:
                acc1[key] = value
        return acc1

# An accumulator used to build the word vocabulary
# vocabulary = sc.accumulator(set(), WordsSetAccumulatorParam())
vocabulary = sc.accumulator(dict(), WordsDictAccumulatorParam())

# load Education census data
location_data = pd.read_csv(EDU_DATA)

# Columns stored, in this order, as the value list of each lookup below
_EDU_COLUMNS = ['fips', 'without_hsd', 'with_hsd', 'somecollege', 'bachelors']

# city -> [fips, without_hsd, with_hsd, somecollege, bachelors]
area_dict = dict(zip(location_data['city'],
                     location_data[_EDU_COLUMNS].values.tolist()))
# county -> same value layout (a separate list per entry, not shared with
# area_dict)
county_dict = dict(zip(location_data['county'],
                       location_data[_EDU_COLUMNS].values.tolist()))
# (lat, lng) -> county
coord_dict = {
    (lat, lng): county
    for lat, lng, county in location_data[['lat', 'lng', 'county']].values
}

# create a KD tree of known county center locations to be used to map a tweet coordinate to a county
# Built straight from the two columns; iterrows() produced a row object that
# was never used and forced a label lookup for every single value.
latlon = np.array([
    [lat, lng]
    for lat, lng in zip(location_data['lat'], location_data['lng'])
])
latlonKDT = spatial.KDTree(latlon)

示例#2

显示文件

文件： sales_csv_spark_reader.py 项目： raaghavendra09/pythontraining

from pyspark.conf import SparkConf
from pyspark.context import SparkContext

from sparkpackage.sales_dto import SalesDTO


def print_lines(line):
    """Print the ``product_name`` attribute of ``line`` to standard output.

    The call form of ``print`` is used because the print statement is a
    syntax error on Python 3; with a single argument it prints the same text
    on Python 2.
    """
    print(line.product_name)


# Spark configuration: application name "CSVReaderJOB", master "local[*]"
config = SparkConf()
config.setAppName("CSVReaderJOB")
config.setMaster("local[*]")

context = SparkContext(conf=config)

# RDD over the sales CSV file
textFileRDD = context.textFile(
    '/home/dharshekthvel/ac/code/scalatrainingintellij/data/sales.csv')

# Broadcast
# amazon_product = context.broadcast(SalesDTO("AMAZON_PRODUCT"))
# mappedRDD = textFileRDD.map(lambda x : amazon_product.value)
# mappedRDD.foreach(lambda x : print_lines(x))

# Accumulator
# SparkContext.accumulator() requires an initial value; calling it with no
# arguments raises TypeError. Start a numeric accumulator at zero and keep a
# reference to it so it can actually be used.
line_counter = context.accumulator(0)

示例#3

显示文件

from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
sc = SparkContext('local')
spark = SparkSession(sc)
# The Hadoop configuration is reached through the session's private JVM
# handle; fetch it once instead of once per key.
_hadoop_conf = spark._jsc.hadoopConfiguration()
bucket = _hadoop_conf.get("fs.gs.system.bucket")
project = _hadoop_conf.get("fs.gs.project.id")
todays_date = datetime.today().strftime("%Y-%m-%d-%H-%M-%S")

accum = sc.accumulator(0)

# print() with a single argument emits the same text on Python 2 and is
# valid syntax on Python 3, unlike the print statement.
print("begin to map input")

# Common prefix of both input locations
_PICFEED_ROOT = "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed"

# Every line split on all tabs
train_set = sc.textFile(_PICFEED_ROOT + "/train_set_combine").map(lambda row: row.split("\t"))
# Every line split on the first tab only (at most two fields)
combine_uinfo = sc.textFile(_PICFEED_ROOT + "/data_files_combine_toterm_new/part-00000").map(lambda row: row.split("\t", 1))

print("finish to map input")

def process_uinfo(line):
    """Build a ``Row(urlid, urlinfo)`` from one split record.

    Args:
        line: Sequence of fields. Presumably the output of splitting a text
            line on its first tab, as done for ``combine_uinfo`` -- confirm
            against the caller.

    Returns:
        ``Row(urlid=line[0], urlinfo=line[1])`` for a two-field record. For
        any other length ``urlinfo`` is the empty string and ``urlid`` is the
        first field (or ``""`` when there are no fields at all).
    """
    if len(line) != 2:
        # Previously the whole field sequence was stored in urlid here, so
        # the column held a list for malformed records and a string for
        # well-formed ones. Keep only the first field so the type matches.
        return Row(urlid=line[0] if line else "", urlinfo="")
    return Row(urlid=line[0], urlinfo=line[1])

#combine_uinfo_dict = combine_uinfo.map(lambda p: Row(urlid=p[0], urlinfo=p[1])).collect()

示例#4

显示文件

文件： SparkTest.py 项目： jiasy/PY_Service

class SparkTest(BaseService):
    def __init__(self, sm_):
        """Initialise the service and start Spark right away.

        Args:
            sm_: Passed unchanged to the base-class constructor.
        """
        super().__init__(sm_)
        # Spark handles; initSpark() below assigns the real objects.
        self.conf: SparkConf = None
        self.sc: SparkContext = None
        self.spark: SparkSession = None
        self.sqlCtx: SQLContext = None
        # Location of the JSON document read by the JSON-based demos.
        self.testJsonFilePath = fileUtils.getPath(
            self.resPath, "testJsonStr.json")
        # self.writeJsonToResPath()
        self.initSpark()

    def writeJsonToResPath(self):
        """Store the sample JSON text and write it out.

        Keeps the JSON document on ``self.testJsonStr`` and passes it,
        together with ``self.testJsonFilePath``, to
        ``fileUtils.writeFileWithStr``.
        """
        self.testJsonStr = '{"result":1245186,"roomCardCount":1000000,"battleId":0,"roomId":0,"marqueeVersion":{"low":1,"high":0,"unsigned":false},"newMail":null,"newLimitedCostlessActivity":false,"noticeVersion":{"low":0,"high":0,"unsigned":false},"activityInfo":[{"id":300001,"startTime":{"low":-576284416,"high":345,"unsigned":false},"endTime":{"low":-72284416,"high":345,"unsigned":false}},{"id":300005,"startTime":{"low":-224153600,"high":349,"unsigned":false},"endTime":{"low":438245400,"high":350,"unsigned":false}},{"id":300008,"startTime":{"low":-2140022784,"high":353,"unsigned":false},"endTime":{"low":-238493672,"high":133356,"unsigned":false}},{"id":300004,"startTime":{"low":-1994684416,"high":345,"unsigned":false},"endTime":{"low":-1908285416,"high":345,"unsigned":false}},{"id":300002,"startTime":{"low":131409920,"high":355,"unsigned":false},"endTime":{"low":217808920,"high":355,"unsigned":false}},{"id":300007,"startTime":{"low":-2140022784,"high":353,"unsigned":false},"endTime":{"low":-584823784,"high":353,"unsigned":false}},{"id":300000,"startTime":{"low":-576284416,"high":345,"unsigned":false},"endTime":{"low":46435072,"high":368,"unsigned":false}}],"buttonValue":13,"timeStamp":{"low":1316055502,"high":355,"unsigned":false},"clubId":null,"createTime":{"low":1037369829,"high":355,"unsigned":false},"connGroup":"c74d97b01eae257e44aa9d5bade97baf","isIdentityVerify":false,"isAgency":false,"agtWebUrl":"","combatId":0,"area":10002,"displayId":5198814,"mttStartTime":{"low":0,"high":0,"unsigned":false},"ticket":0,"phone":"","notifyRedDot":[],"pushRegisterId":""}'
        fileUtils.writeFileWithStr(self.testJsonFilePath, self.testJsonStr)

    def initSpark(self):
        """Create the Spark entry points and the ``blankLines`` accumulator.

        Assigns ``self.conf``, ``self.sc``, ``self.spark`` and
        ``self.sqlCtx`` and binds the module-level name ``blankLines`` to an
        accumulator that starts at 0.
        """
        global blankLines
        # Cluster URL: the special value "local" lets Spark run on a single
        # machine with a single thread, without connecting to a cluster.
        master_url = "local"
        # Application name: when connected to a cluster, this value helps
        # locate the application in the cluster manager's user interface.
        app_name = self.app.appName

        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster(master_url)
        self.conf = spark_conf.setAppName(app_name)
        self.sc = SparkContext(conf=self.conf)

        session_builder = SparkSession.builder
        session_builder = session_builder.appName(
            "Python Spark SQL basic example")
        session_builder = session_builder.config(
            "spark.some.config.option", "some-value")
        self.spark = session_builder.getOrCreate()

        self.sqlCtx = SQLContext(self.sc)
        blankLines = self.sc.accumulator(0)

    def create(self):
        """Run the base-class ``create`` and then the enabled demo.

        Only ``test_jsonStrToDataFrame`` is active; the remaining demo calls
        are kept commented out so they can be switched on one at a time.
        """
        super().create()
        # self.test_parallelize()
        # _testTextRDD = self.test_textFile()
        # self.test_RDD(_testTextRDD)
        # self.test_map()
        # self.test_flatMap()
        # self.test_createPairRDD()
        # self.test_aggregate()
        self.test_jsonStrToDataFrame()
        # self.test_jsonFileWrite()
        # self.test_sparkSql()
        # self.test_createSchema()
        # self.test_accumulator()
        # self.test_sparkStreaming()
        # self.test_presto()
    def destroy(self):
        """Delegate teardown to the base class."""
        super().destroy()

    def test_parallelize(self):
        """Turn two in-memory lists into RDDs; the RDDs are not kept."""
        sentences = ["pandas", "i like pandas"]
        numbers = [1, 2, 3, 4]
        # Convert to an RDD within a single partition
        self.sc.parallelize(sentences)
        # Split into two partitions
        self.sc.parallelize(numbers, 2)

    def test_textFile(self):
        """Return ``self.sc.textFile`` for ``README.md`` under ``self.app.resPath``."""
        readme_path = fileUtils.getPath(self.app.resPath, "README.md")
        # Spark RDDs offer two kinds of operations
        return self.sc.textFile(readme_path)

    def test_RDD(self, targetRDD_):
        """Walk through basic RDD transformations and actions.

        Args:
            targetRDD_: RDD whose elements support the ``in`` substring test
                (text lines).
        """
        # Passing functions to Spark
        def _mentions_python(text_):
            # Make sure the filter function holds no reference such as self;
            # otherwise the referenced object is serialized as well and
            # shipping the function to Spark becomes more expensive.
            return "Python" in text_

        def _has_bullet(text_):
            return "* " in text_

        # Transformations
        python_lines = targetRDD_.filter(_mentions_python)
        bullet_lines = targetRDD_.filter(_has_bullet)
        # union() returns a new RDD holding all elements of both RDDs,
        # possibly with duplicates; distinct() then removes the duplicates.
        merged = python_lines.union(bullet_lines).distinct()
        # Ask Spark to cache this RDD; cache() is the same as calling
        # persist() with the default storage level.
        merged.cache()

        # Actions: Spark only evaluates these RDDs lazily.
        # Every time a new action is called the whole RDD is computed from
        # scratch; the real computation only happens the first time the RDD
        # is used in an action (count and first are both actions).
        line_total = merged.count()
        self.app.info.log("_count = " + str(line_total))
        merged.first()
        merged.take(5)  # take 5 elements
        merged.collect()  # take everything -- mind the data size and memory

        # RDD made of the elements present in both RDDs
        python_lines.intersection(bullet_lines).collect()

    # RDD 每一个元素处理之后，获得新的 RDD
    def test_map(self):
        """Square the numbers 1 to 4 with ``map`` and print each result."""
        banner = self.className + " - "
        banner = banner + pyUtils.getCurrentRunningFunctionName()
        print(banner + "------------------")

        def _square(value_):
            return value_ * value_

        # map() takes a function, applies it to every element of the RDD
        # and uses the return values as the elements of the result.
        squares = self.sc.parallelize([1, 2, 3, 4]).map(_square).collect()
        # Print every element, i.e. the squares
        for square in squares:
            print("%i " % square)

    # RDD 每一个元素处理之后，变成多个元素，然后再将所有元素构成新的 RDD
    def test_flatMap(self):
        """Demonstrate ``flatMap`` on plain text and on a JSON document.

        Prints the words of two sample sentences, then reads the JSON file at
        ``self.testJsonFilePath``, flattens its ``activityInfo`` list into an
        RDD, de-duplicates the entries, registers them as the temporary view
        ``activityInfo`` and prints the rows returned by a SQL query on it.
        """
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        _lines = self.sc.parallelize(["hello world", "hi"])
        # flatMap yields an RDD made of the elements of each list, not an
        # RDD of lists
        _words = _lines.flatMap(lambda line: line.split(" ")).collect()
        for _word in _words:
            print("%s " % _word)
        # Read the JSON file; each element is a pair (file path, content)
        _jsonRDD = self.sc.wholeTextFiles(self.testJsonFilePath)

        # Parse the second item of the pair as JSON and return its
        # activityInfo list, whose entries become the new RDD's elements.
        def _getActivityInfoFunc(jsonInfoKV_):
            return json.loads(jsonInfoKV_[1])["activityInfo"]

        _json_activityInfo_RDD = _jsonRDD.flatMap(_getActivityInfoFunc)
        # De-duplicate. distinct() returns a new RDD, so its result has to be
        # kept (it used to be discarded). The entries are dicts, which are
        # unhashable, so compare them through their canonical JSON text and
        # parse them back afterwards.
        _json_activityInfo_RDD = _json_activityInfo_RDD \
            .map(lambda _info: json.dumps(_info, sort_keys=True)) \
            .distinct() \
            .map(json.loads)
        # RDD to DataFrame
        _json_activityInfo_DF = self.spark.createDataFrame(
            _json_activityInfo_RDD)
        # Create the temporary view (registerTempTable is deprecated in
        # favour of createOrReplaceTempView)
        _json_activityInfo_DF.createOrReplaceTempView("activityInfo")
        _resultsRDD = self.sqlCtx.sql(
            "SELECT startTime.low,endTime.low FROM activityInfo WHERE id = 300001L"
        )

        for _result in _resultsRDD.collect():
            print("_result = " + str(_result))

    # 创建 Pair RDD
    def test_createPairRDD(self):
        """Build a pair RDD from "key value" strings and print short values."""
        banner = self.className + " - "
        banner = banner + pyUtils.getCurrentRunningFunctionName()
        print(banner + "------------------")

        def _to_pair(item_):
            # Turn "key value" into a (key, value) tuple
            fields = item_.split(" ")
            return (fields[0], fields[1])

        def _has_short_value(pair_):
            # Keep pairs whose value string is at most 6 characters long
            return len(pair_[1]) <= 6

        raw_items = self.sc.parallelize(
            ["key1 value1", "key2 value2", "key2 value22", "key3 value3"])
        kept_pairs = raw_items.map(_to_pair).filter(_has_short_value)
        # Print every key/value that satisfies the condition
        for key, value in kept_pairs.collect():
            print(str(key) + " = " + str(value))

    def test_aggregate(self):
        """Count words with reduceByKey/aggregateByKey and average 1..4."""
        banner = self.className + " - "
        banner = banner + pyUtils.getCurrentRunningFunctionName()
        print(banner + "------------------")

        def _add(left_, right_):
            return left_ + right_

        # Count how often each word occurs
        sentences = self.sc.parallelize(
            ["hello value1", "hello value2", "hi value22", "f**k value3"])
        # Every single word of every sentence
        words = sentences.flatMap(lambda sentence_: sentence_.split(" "))
        # Each word becomes a (word, 1) tuple (pair RDD)
        word_ones = words.map(lambda word_: (word_, 1)).cache()

        # Pair RDD: the first item is the key, the second the value
        reduced = word_ones.reduceByKey(_add)
        # Print each word and its count
        for word, total in reduced.collect():
            print(str(word) + " = " + str(total))

        # Word counts again, this time through aggregateByKey:
        # initial value, fold an element into the accumulator, merge two
        # accumulators
        aggregated = word_ones.aggregateByKey(0, _add, _add)
        # Print each word and its count
        for word, total in aggregated.collect():
            print(str(word) + " = " + str(total))

        def _fold_value(acc_, value_):
            # Add the value to the running sum and bump the counter
            return (acc_[0] + value_, acc_[1] + 1)

        def _merge(acc_a_, acc_b_):
            # Combine two (sum, count) accumulators
            return (acc_a_[0] + acc_b_[0], acc_a_[1] + acc_b_[1])

        # Compute the average; the initial value is (sum, count)
        numbers = self.sc.parallelize([1, 2, 3, 4])
        total_sum, total_count = numbers.aggregate((0, 0), _fold_value, _merge)
        _average = total_sum / float(total_count)
        print("_average = " + str(_average))

    def test_jsonStrToDataFrame(self):
        """Load the JSON file into a DataFrame and show it and ``activityInfo``."""
        banner = self.className + " - "
        banner = banner + pyUtils.getCurrentRunningFunctionName()
        print(banner + "------------------")

        frame = self.spark.read.json(self.testJsonFilePath)
        frame.printSchema()
        frame.show()
        # Only the activityInfo column
        frame.selectExpr("activityInfo").show()

    def test_jsonFileWrite(self):
        """Extract ``activityInfo`` from the JSON file and save it once.

        Prints the raw (path, content) pairs and the extracted
        ``activityInfo`` values, then saves the extracted RDD under
        ``activityInfo.json`` in ``self.app.resPath`` unless that path
        already exists.
        """
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        # Path to write to
        _writeToPath = fileUtils.getPath(self.app.resPath, "activityInfo.json")
        # Yields a list of tuples (file path, file content)
        jsonRDD = self.sc.wholeTextFiles(self.testJsonFilePath)
        for _jsonKeyValue in jsonRDD.collect():
            print("_jsonKeyValue = " + str(_jsonKeyValue))
        # 1. Take the file content of every tuple and parse it into a dict.
        # 2. Keep only the dicts that carry a non-empty activityInfo entry.
        #    .get() is used so that a document without the key is filtered
        #    out instead of raising KeyError inside the filter.
        # 3. Pull the activityInfo field out into the new RDD.
        jsonDataFilter = jsonRDD \
            .map(lambda _jsonKeyValue: json.loads(_jsonKeyValue[1])) \
            .filter(lambda _jsonDict: _jsonDict.get("activityInfo")) \
            .map(lambda _jsonDict: _jsonDict["activityInfo"])
        # RDD made of the activityInfo field contents
        for _jsonDataFilter in jsonDataFilter.collect():
            print("_jsonDataFilter = " + str(_jsonDataFilter))
        # Write a copy only if nothing has been written there before
        if not os.path.exists(_writeToPath):
            jsonDataFilter.saveAsTextFile(_writeToPath)

    #
    def test_sparkSql(self):
        """Query nested ``activityInfo`` fields of the JSON file with SQL.

        Loads the file through ``self.sqlCtx``, prints schema and content,
        registers the DataFrame as the temporary view ``tempTable`` and
        prints every row returned by the query.
        """
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        _tweets = self.sqlCtx.read.json(self.testJsonFilePath)
        _tweets.printSchema()
        _tweets.show()
        # registerTempTable is deprecated in favour of
        # createOrReplaceTempView, which registers the same temporary view.
        _tweets.createOrReplaceTempView("tempTable")
        # Even when a single attribute is selected out, the query structure
        # stays the same: field lookups still have to start from the
        # top-level activityInfo attribute.
        # _activityInfo = _tweets.selectExpr("activityInfo")
        # _activityInfo.show()
        # _activityInfo.registerTempTable("tempTable")
        _resultsRDD = self.sqlCtx.sql(
            "SELECT activityInfo.startTime.low,activityInfo.endTime.low FROM tempTable"
        )
        for _result in _resultsRDD.collect():
            print("_result = " + str(_result))

    def test_createSchema(self):
        """Read JSON strings with an explicit schema and query the result.

        Two sample records are serialised to JSON, read back with a schema of
        nullable string fields ``a`` to ``d``, printed, registered as the
        temporary view ``tempTable`` and queried for columns ``c`` and ``d``.
        """
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        # Data
        _jsonDatas = [{
            'a': 'aaa',
            'b': 'bbb',
            'c': 'ccc'
        }, {
            'a': 'aaaa',
            'b': 'bbbb',
            'c': 'cccc',
            'd': 'dddd',
            'e': 'eeee'
        }]
        _jsonDatas = [json.dumps(_jsonDict) for _jsonDict in _jsonDatas]

        # Known structure. The field names get their own variable so that
        # ``schema`` is not rebound from a list to a StructType.
        field_names = ['a', 'b', 'c', 'd']
        schema = StructType([
            StructField(_fieldName, StringType(), True)
            for _fieldName in field_names
        ])

        rdd = self.sc.parallelize(_jsonDatas)
        # Fields in the known structure are kept, unknown ones are dropped
        df = self.sqlCtx.read.schema(schema).json(rdd)
        for data in df.collect():
            print("data = " + str(data))
        # registerTempTable is deprecated in favour of
        # createOrReplaceTempView, which registers the same temporary view.
        df.createOrReplaceTempView("tempTable")
        _resultsRDD = self.sqlCtx.sql("SELECT c,d FROM tempTable")
        for _result in _resultsRDD.collect():
            print("_result = " + str(_result))

    def test_accumulator(self):
        """Add 1 to the module-level ``blankLines`` ten times and print it."""
        global blankLines
        banner = self.className + " - "
        banner = banner + pyUtils.getCurrentRunningFunctionName()
        print(banner + "------------------")

        def _bump():
            # Rebinds the module-level name, hence the global declaration
            global blankLines
            blankLines += 1

        for _ in range(10):
            _bump()

        print("blankLines = " + str(blankLines))

    def test_sparkStreaming(self):
        """Request the SparkStreaming and Kafka sub-objects, then destroy both."""
        banner = self.className + " - "
        banner = banner + pyUtils.getCurrentRunningFunctionName()
        print(banner + "------------------")

        for sub_name in ("SparkStreaming", "Kafka"):
            self.getSubClassObject(sub_name)
        # self.showCurrentBaseObejctsInfo()
        self.sparkStreaming.destroy()
        self.kafka.destroy()
        # self.showCurrentBaseObejctsInfo()

    def test_presto(self):
        """Request the Presto sub-object, run its ``doTest`` and destroy it."""
        banner = self.className + " - "
        banner = banner + pyUtils.getCurrentRunningFunctionName()
        print(banner + "------------------")

        self.getSubClassObject("Presto")
        presto = self.presto
        presto.doTest()
        presto.destroy()

示例#5

显示文件

文件： glue_job.py 项目： swabb/visualize-cur-using-glue-es

    'index_name_prefix_template', 'index_pattern_prefix', 'es_domain_url'
])
# Spark / Glue entry points; the job is initialised from the parsed arguments
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Parameter init
source_bucket = args['source_bucket']
report_folder_prefix = args['report_folder_prefix']
index_name_prefix_template = args['index_name_prefix_template']
index_pattern_prefix = args['index_pattern_prefix']
es_domain_url = args['es_domain_url']

# Broadcast copy of the Elasticsearch domain URL
es_domain_url_shared = sc.broadcast(es_domain_url)

# Accumulators starting at zero
# NOTE(review): presumably they count successful / failed writes -- confirm
succeed = sc.accumulator(0)
failed = sc.accumulator(0)

# Index name: the prefix template filled with year and month, then
# "-<day of month>" appended
now = datetime.datetime.now()
year_text = str(now.year)
month_text = str(now.month)
index_name_base = index_name_prefix_template.format(year_text, month_text)
index_name = "-".join([index_name_base, str(now.day)])
index_name_shared = sc.broadcast(index_name)


def doc_generator(source):
    for row in source:
        updated_row = row.asDict()
        index_name = index_name_shared.value
        new_row = {
            '_index': index_name,
            '_type': 'cur',