Python get_spark示例

编程语言: Python

命名空间/包名称: twitter.utils

方法/功能: get_spark

hotexamples.com的示例: 10

Python get_spark - 已找到10个示例。这些是从开源项目中提取的最受好评的twitter.utils.get_spark现实Python示例。您可以评价示例，以帮助我们提高示例质量。

示例#1

显示文件

def main(src):
    """Count tweets.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)
    print(tweets.count())

示例#2

显示文件

def main(src):
    """Count cities.
    """
    sc, spark = get_spark()

    df = spark.read.parquet(src)

    df.groupBy('key').count().orderBy('count', ascending=False).show(100)

示例#3

显示文件

文件： benchmark_cluster.py 项目： wwbrannon/twitter-ext

def main(n):
    """Test parallelization.
    """
    sc, spark = get_spark()

    data = sc.parallelize(range(n))
    result = data.map(work).collect()

    print(result)

示例#4

显示文件

def main(src, dest):
    """Make a histogram of tweet character counts.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    counts = (tweets.rdd.map(lambda t: (len(t.text), 1)).reduceByKey(
        add).sortBy(lambda x: x[0]).collect())

    dump_csv(counts, dest, ('text_len', 'count'))

示例#5

显示文件

文件： load_tweets.py 项目： wwbrannon/twitter-ext

def main(src, dest):
    """Ingest tweets.
    """
    sc, spark = get_spark()

    paths = list(fs.scan(src, '\.json.gz'))

    paths = sc.parallelize(paths, len(paths))

    df = paths.flatMap(parse_segment).toDF(Tweet.schema)

    df.write.mode('overwrite').parquet(dest)

示例#6

显示文件

def main(states, src, dest, coalesce):
    """Dump selected states as JSON.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    matches = tweets \
        .filter(tweets.state.isin(set(states))) \
        .coalesce(coalesce)

    matches.write.mode('overwrite').json(dest)

示例#7

显示文件

def main(src, dest, fraction):
    """Dump location field examples.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    examples = tweets \
        .filter(tweets.actor.language=='en') \
        .filter(tweets.actor.location.isNotNull()) \
        .select(tweets.actor.location) \
        .sample(False, fraction)

    examples.write.mode('overwrite').text(dest)

示例#8

显示文件

文件： ext_year_candidates.py 项目： wwbrannon/twitter-ext

def main(src, dest):
    """Extract 4-digit year candidates.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    matches = tweets.rdd \
        .filter(lambda t: t.actor.language == 'en') \
        .flatMap(lambda t: match_years(t.body)) \
        .toDF(('prefix', 'year', 'suffix'))

    matches.write \
        .mode('overwrite') \
        .json(dest)

示例#9

显示文件

def main(src, dest):
    """Extract (key, token, count) tuples.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    counts = tweets.rdd \
        .flatMap(count_tokens) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda r: (*r[0], r[1])) \
        .toDF(('key', 'token', 'count'))

    counts.write \
        .mode('overwrite') \
        .json(dest)

示例#10

显示文件

文件： load_state_tweets.py 项目： wwbrannon/twitter-ext

def main(src, dest):
    """Get tweets for cities, using (stupid) string matching.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    matches = tweets.rdd \
        .filter(lambda t: t.actor.location and t.actor.language == 'en') \
        .map(match_state) \
        .filter(bool) \
        .toDF(GeoTweet.schema)

    matches.write \
        .mode('overwrite') \
        .parquet(dest)