Example #1
def main(src):
    """Count tweets.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)
    print(tweets.count())
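All of these examples assume a project-level get_spark() helper that returns a (SparkContext, SparkSession) pair. A minimal sketch consistent with the calls above; the builder configuration is an assumption:

from pyspark.sql import SparkSession

def get_spark():
    # Assumed helper: build (or reuse) a session and hand back both handles.
    spark = SparkSession.builder.getOrCreate()
    return spark.sparkContext, spark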
Example #2
def main(src):
    """Count cities.
    """
    sc, spark = get_spark()

    df = spark.read.parquet(src)

    df.groupBy('key').count().orderBy('count', ascending=False).show(100)
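The same descending sort can be written with a column expression from pyspark.sql.functions; an equivalent formulation, shown only for comparison:

from pyspark.sql import functions as F

# Identical behavior: sort the aggregated counts from largest to smallest.
df.groupBy('key').count().orderBy(F.col('count').desc()).show(100)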
Example #3
def main(n):
    """Test parallelization.
    """
    sc, spark = get_spark()

    data = sc.parallelize(range(n))
    result = data.map(work).collect()

    print(result)
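The work function is not defined in the snippet; any CPU-bound callable will do for a parallelization test. A hypothetical stand-in:

def work(i):
    # Hypothetical placeholder: burn some CPU so parallelism is observable.
    return sum(x * x for x in range(i))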
Example #4
from operator import add

def main(src, dest):
    """Make a histogram of tweet character counts.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    counts = tweets.rdd \
        .map(lambda t: (len(t.text), 1)) \
        .reduceByKey(add) \
        .sortBy(lambda x: x[0]) \
        .collect()

    dump_csv(counts, dest, ('text_len', 'count'))
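dump_csv is another project helper. A plausible sketch, assuming it writes rows to a local CSV file with a header line:

import csv

def dump_csv(rows, path, header):
    # Assumed behavior: plain local CSV with one header row.
    with open(path, 'w', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(header)
        writer.writerows(rows)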
Example #5
def main(src, dest):
    """Ingest tweets.
    """
    sc, spark = get_spark()

    paths = list(fs.scan(src, r'\.json\.gz'))

    paths = sc.parallelize(paths, len(paths))

    df = paths.flatMap(parse_segment).toDF(Tweet.schema)

    df.write.mode('overwrite').parquet(dest)
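fs.scan and parse_segment are project helpers: the first finds input files, the second parses a raw segment into rows matching Tweet.schema. A hypothetical sketch of the scanner only, since the Tweet fields are unknown here:

import os
import re

def scan(root, pattern):
    # Hypothetical: walk `root` and yield paths whose names match the regex.
    regex = re.compile(pattern)
    for dirpath, _, filenames in os.walk(root):
        for name in filenames:
            if regex.search(name):
                yield os.path.join(dirpath, name)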
Example #6
def main(states, src, dest, coalesce):
    """Dump selected states as JSON.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    matches = tweets \
        .filter(tweets.state.isin(set(states))) \
        .coalesce(coalesce)

    matches.write.mode('overwrite').json(dest)
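coalesce(n) caps the number of JSON part files written to dest. A hypothetical invocation (the state values and paths are made up):

main(['California', 'New York'], '/data/tweets.parquet', '/data/matches.json', 1)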
Example #7
def main(src, dest, fraction):
    """Dump location field examples.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    examples = tweets \
        .filter(tweets.actor.language == 'en') \
        .filter(tweets.actor.location.isNotNull()) \
        .select(tweets.actor.location) \
        .sample(False, fraction)

    examples.write.mode('overwrite').text(dest)
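One note on the writer: DataFrameWriter.text requires exactly one string-typed column, which the select above provides. If a friendlier column name is wanted when the frame is reused, the nested field can be aliased (the alias name is an assumption):

examples = tweets.select(tweets.actor.location.alias('location'))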
Example #8
def main(src, dest):
    """Extract 4-digit year candidates.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    matches = tweets.rdd \
        .filter(lambda t: t.actor.language == 'en') \
        .flatMap(lambda t: match_years(t.body)) \
        .toDF(('prefix', 'year', 'suffix'))

    matches.write \
        .mode('overwrite') \
        .json(dest)
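match_years is undefined in the snippet; its output shape is pinned down by the toDF(('prefix', 'year', 'suffix')) call. A hypothetical regex-based sketch:

import re

# Hypothetical pattern: a 4-digit year with up to 20 characters of context
# on each side; findall returns (prefix, year, suffix) tuples.
YEAR = re.compile(r'(.{0,20})\b(1[89]\d{2}|20\d{2})\b(.{0,20})')

def match_years(text):
    return YEAR.findall(text or '')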
Example #9
def main(src, dest):
    """Extract (key, token, count) tuples.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    counts = tweets.rdd \
        .flatMap(count_tokens) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda r: (*r[0], r[1])) \
        .toDF(('key', 'token', 'count'))

    counts.write \
        .mode('overwrite') \
        .json(dest)
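count_tokens must emit ((key, token), count) pairs, given the reduceByKey and the flattening map above. A hypothetical sketch; the tweet field names are assumptions:

def count_tokens(tweet):
    # Hypothetical fields: `key` and `body` are guesses based on the other
    # examples. Emit one ((key, token), 1) pair per whitespace token.
    for token in (tweet.body or '').lower().split():
        yield ((tweet.key, token), 1)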
Example #10
def main(src, dest):
    """Get tweets for cities, using (stupid) string matching.
    """
    sc, spark = get_spark()

    tweets = spark.read.parquet(src)

    matches = tweets.rdd \
        .filter(lambda t: t.actor.location and t.actor.language == 'en') \
        .map(match_state) \
        .filter(bool) \
        .toDF(GeoTweet.schema)

    matches.write \
        .mode('overwrite') \
        .parquet(dest)
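match_state is the helper doing the "(stupid) string matching": it either returns a row shaped for GeoTweet.schema or a falsy value, which the .filter(bool) step drops. A hypothetical sketch; the state table and the returned fields are made up:

STATE_NAMES = {'alabama': 'AL', 'alaska': 'AK'}  # truncated for the sketch

def match_state(tweet):
    # The upstream filter guarantees actor.location is set.
    location = tweet.actor.location.lower()
    for name, code in STATE_NAMES.items():
        if name in location:
            return (tweet.id, tweet.body, code)  # hypothetical GeoTweet fields
    return None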