def main(src): """Count tweets. """ sc, spark = get_spark() tweets = spark.read.parquet(src) print(tweets.count())
def main(src): """Count cities. """ sc, spark = get_spark() df = spark.read.parquet(src) df.groupBy('key').count().orderBy('count', ascending=False).show(100)
def main(n): """Test parallelization. """ sc, spark = get_spark() data = sc.parallelize(range(n)) result = data.map(work).collect() print(result)
def main(src, dest): """Make a histogram of tweet character counts. """ sc, spark = get_spark() tweets = spark.read.parquet(src) counts = (tweets.rdd.map(lambda t: (len(t.text), 1)).reduceByKey( add).sortBy(lambda x: x[0]).collect()) dump_csv(counts, dest, ('text_len', 'count'))
def main(src, dest): """Ingest tweets. """ sc, spark = get_spark() paths = list(fs.scan(src, '\.json.gz')) paths = sc.parallelize(paths, len(paths)) df = paths.flatMap(parse_segment).toDF(Tweet.schema) df.write.mode('overwrite').parquet(dest)
def main(states, src, dest, coalesce): """Dump selected states as JSON. """ sc, spark = get_spark() tweets = spark.read.parquet(src) matches = tweets \ .filter(tweets.state.isin(set(states))) \ .coalesce(coalesce) matches.write.mode('overwrite').json(dest)
def main(src, dest, fraction): """Dump location field examples. """ sc, spark = get_spark() tweets = spark.read.parquet(src) examples = tweets \ .filter(tweets.actor.language=='en') \ .filter(tweets.actor.location.isNotNull()) \ .select(tweets.actor.location) \ .sample(False, fraction) examples.write.mode('overwrite').text(dest)
def main(src, dest): """Extract 4-digit year candidates. """ sc, spark = get_spark() tweets = spark.read.parquet(src) matches = tweets.rdd \ .filter(lambda t: t.actor.language == 'en') \ .flatMap(lambda t: match_years(t.body)) \ .toDF(('prefix', 'year', 'suffix')) matches.write \ .mode('overwrite') \ .json(dest)
def main(src, dest): """Extract (key, token, count) tuples. """ sc, spark = get_spark() tweets = spark.read.parquet(src) counts = tweets.rdd \ .flatMap(count_tokens) \ .reduceByKey(lambda a, b: a + b) \ .map(lambda r: (*r[0], r[1])) \ .toDF(('key', 'token', 'count')) counts.write \ .mode('overwrite') \ .json(dest)
def main(src, dest): """Get tweets for cities, using (stupid) string matching. """ sc, spark = get_spark() tweets = spark.read.parquet(src) matches = tweets.rdd \ .filter(lambda t: t.actor.location and t.actor.language == 'en') \ .map(match_state) \ .filter(bool) \ .toDF(GeoTweet.schema) matches.write \ .mode('overwrite') \ .parquet(dest)