def export_featureless_counts(interval="day"):
    """
    Write per-interval counts of "featureless" Tweets to
    featureless_counts.csv.

    A Tweet is counted when the mentions check, the urls check and the
    missing-reply check below all pass. Complex queries on
    many-to-many-relationships are very contrived with peewee. For the sake
    of simplicity, this function instead walks over every Tweet of an
    interval in Python and tests each one individually.

    interval -- interval name handed to database.objects_by_interval
                (default "day"); it is also used as the title of the
                first CSV column.
    """
    with open("featureless_counts.csv", "w") as f:
        # Header: interval column, then the single count column
        f.write("{0},".format(interval))
        f.write("featureless,\n")

        # Each item pairs an interval's (start, stop) bounds with a query
        buckets = database.objects_by_interval(
            database.Tweet,
            interval=interval,
            start_date=None,
            stop_date=None)
        for (bucket_start, _bucket_stop), tweets in buckets:
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            stamp = MST.normalize(bucket_start).strftime(
                "%Y-%m-%d %H:%M:%S %z")
            f.write("{0},".format(stamp))

            # NOTE(review): the is_null() calls are kept exactly as they
            # were; confirm that they report "no related rows" for the
            # Tweet rather than building an always-truthy SQL expression.
            plain_total = sum(
                1 for tweet in tweets
                if tweet.mentions.is_null()
                and tweet.urls.is_null()
                and tweet.reply_to_tweet is None)
            f.write("{0},\n".format(plain_total))
def export_user_counts(interval="day", usernames=(
        "JebBush", "RealBenCarson", "ChrisChristie", "tedcruz",
        "CarlyFiorina", "GovMikeHuckabee", "JohnKasich", "RandPaul",
        "marcorubio", "realDonaldTrump")):
    """
    Write per-interval Tweet counts for the given Users to user_counts.csv.

    Each row holds the interval's start timestamp followed by one count per
    username: the number of Tweets in that interval which join to a User
    with exactly that username (presumably the Tweet's author -- confirm
    against the Tweet model).

    interval  -- interval name handed to database.objects_by_interval
                 (default "day"); also used as the title of the first column.
    usernames -- iterable of usernames, matched case-sensitively. The
                 default is an immutable tuple so it cannot be altered
                 between calls.
    """
    # Materialise once so that one-shot iterables survive being walked for
    # the header and again for every interval.
    usernames = tuple(usernames)

    # Create output file
    with open("user_counts.csv", "w") as f:
        # Write header line (trailing comma kept for output compatibility)
        f.write("{0},".format(interval))
        f.write(",".join(usernames))
        f.write(",\n")

        # Prepare iterator over intervals
        intervals = database.objects_by_interval(
            database.Tweet,
            interval=interval,
            start_date=None,
            stop_date=None)
        for (interval_start, _interval_stop), query in intervals:
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            timestamp = MST.normalize(interval_start).strftime(
                "%Y-%m-%d %H:%M:%S %z")
            f.write("{0},".format(timestamp))
            for username in usernames:
                # Match precise username
                ucount = query.join(database.User).where(
                    database.User.username == username).count()
                f.write("{0},".format(ucount))
            f.write("\n")
def export_featureless_counts(interval="day"):
    """
    Export how many Tweets per interval carry no extra features.

    The result is written to featureless_counts.csv, one row per interval.
    Complex queries on many-to-many-relationships are very contrived with
    peewee. For the sake of simplicity, this function instead iterates over
    the Tweets of each interval and checks them one at a time.

    interval -- interval name passed on to database.objects_by_interval
                (default "day"); doubles as the first column's title.
    """

    def is_featureless(tweet):
        # NOTE(review): is_null() is called exactly as before; verify that
        # it really means "this Tweet has no mentions / urls".
        return (tweet.mentions.is_null()
                and tweet.urls.is_null()
                and tweet.reply_to_tweet is None)

    with open("featureless_counts.csv", "w") as f:
        # Header row
        f.write("{0},".format(interval))
        f.write("featureless,\n")

        # objects_by_interval yields ((start, stop), query) pairs
        for (begin, _end), tweets in database.objects_by_interval(
                database.Tweet,
                interval=interval,
                start_date=None,
                stop_date=None):
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            local_begin = MST.normalize(begin)
            f.write("{0},".format(
                local_begin.strftime("%Y-%m-%d %H:%M:%S %z")))

            matches = sum(1 for tweet in tweets if is_featureless(tweet))
            f.write("{0},\n".format(matches))
def export_keyword_counts(interval="day", keywords=(
        "Bush", "Carson", "Christie", "Cruz", "Fiorina", "Huckabee",
        "Kasich", "Paul", "Rubio", "Trump")):
    """
    Write per-interval Tweet counts for the given keywords to
    keyword_counts.csv.

    Each row holds the interval's start timestamp followed by one count per
    keyword: the number of Tweets in that interval whose lower-cased text
    contains the lower-cased keyword.

    interval -- interval name handed to database.objects_by_interval
                (default "day"); also used as the title of the first column.
    keywords -- iterable of keywords, matched ignoring case. The default is
                an immutable tuple so it cannot be altered between calls.
    """
    # Materialise once so that one-shot iterables survive repeated use.
    keywords = tuple(keywords)

    # Create output file
    with open("keyword_counts.csv", "w") as f:
        # Write header line (trailing comma kept for output compatibility)
        f.write("{0},".format(interval))
        f.write(",".join(keywords))
        f.write(",\n")

        # Lower-casing does not depend on the interval, so do it only once
        needles = [word.lower() for word in keywords]

        # Prepare iterator over intervals
        intervals = database.objects_by_interval(
            database.Tweet,
            interval=interval,
            start_date=None,
            stop_date=None)
        for (interval_start, _interval_stop), query in intervals:
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            timestamp = MST.normalize(interval_start).strftime(
                "%Y-%m-%d %H:%M:%S %z")
            f.write("{0},".format(timestamp))
            for needle in needles:
                # Match ignoring case
                kwcount = query.where(
                    peewee.fn.Lower(database.Tweet.text).contains(
                        needle)).count()
                f.write("{0},".format(kwcount))
            f.write("\n")
def export_hashtag_counts(interval="day", hashtags=(
        "Bush", "Carson", "Christie", "Cruz", "Fiorina", "Huckabee",
        "Kasich", "Paul", "Rubio", "Trump")):
    """
    Write per-interval Tweet counts for the given Hashtags to
    hashtag_counts.csv.

    Each row holds the interval's start timestamp followed by one count per
    hashtag: the number of rows obtained by joining the interval's Tweets
    through the tags relation to Hashtags whose lower-cased tag equals the
    lower-cased hashtag.

    A bit slow. An easy speedup is to convert the list of hashtags to
    Hashtag database objects and query for them.

    interval -- interval name handed to database.objects_by_interval
                (default "day"); also used as the title of the first column.
    hashtags -- iterable of hashtag texts, matched ignoring case. The
                default is an immutable tuple so it cannot be altered
                between calls.
    """
    # Materialise once so that one-shot iterables survive repeated use.
    hashtags = tuple(hashtags)

    # Create output file
    with open("hashtag_counts.csv", "w") as f:
        # Write header line (trailing comma kept for output compatibility)
        f.write("{0},".format(interval))
        f.write(",".join(hashtags))
        f.write(",\n")

        # Lower-casing does not depend on the interval, so do it only once
        wanted_tags = [tag.lower() for tag in hashtags]

        # Prepare iterator over intervals
        # htm is an intermediary model for many-to-many-relationships
        # In this case Tweet -> htm -> Hashtag
        htm = database.Tweet.tags.get_through_model()
        intervals = database.objects_by_interval(
            database.Tweet,
            interval=interval,
            start_date=None,
            stop_date=None)
        for (interval_start, _interval_stop), query in intervals:
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            timestamp = MST.normalize(interval_start).strftime(
                "%Y-%m-%d %H:%M:%S %z")
            f.write("{0},".format(timestamp))
            for wanted in wanted_tags:
                # Match ignoring case
                count = query.join(htm).join(database.Hashtag).where(
                    peewee.fn.Lower(database.Hashtag.tag) == wanted).count()
                f.write("{0},".format(count))
            f.write("\n")
def export_user_counts(interval="day", usernames=(
        "JebBush", "RealBenCarson", "ChrisChristie", "tedcruz",
        "CarlyFiorina", "GovMikeHuckabee", "JohnKasich", "RandPaul",
        "marcorubio", "realDonaldTrump")):
    """
    Write per-interval Tweet counts for the given Users to user_counts.csv.

    Each row holds the interval's start timestamp followed by one count per
    username: the number of Tweets in that interval which join to a User
    with exactly that username (presumably the Tweet's author -- confirm
    against the Tweet model).

    interval  -- interval name handed to database.objects_by_interval
                 (default "day"); also used as the title of the first column.
    usernames -- iterable of usernames, matched case-sensitively. The
                 default is an immutable tuple so it cannot be altered
                 between calls.
    """
    # Materialise once so that one-shot iterables survive being walked for
    # the header and again for every interval.
    usernames = tuple(usernames)

    # Create output file
    with open("user_counts.csv", "w") as f:
        # Write header line (trailing comma kept for output compatibility)
        f.write("{0},".format(interval))
        f.write(",".join(usernames))
        f.write(",\n")

        # Prepare iterator over intervals
        intervals = database.objects_by_interval(
            database.Tweet,
            interval=interval,
            start_date=None,
            stop_date=None)
        for (interval_start, _interval_stop), query in intervals:
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            timestamp = MST.normalize(interval_start).strftime(
                "%Y-%m-%d %H:%M:%S %z")
            f.write("{0},".format(timestamp))
            for username in usernames:
                # Match precise username
                ucount = query.join(database.User).where(
                    database.User.username == username).count()
                f.write("{0},".format(ucount))
            f.write("\n")
def export_keyword_counts(interval="day", keywords=(
        "Bush", "Carson", "Christie", "Cruz", "Fiorina", "Huckabee",
        "Kasich", "Paul", "Rubio", "Trump")):
    """
    Write per-interval Tweet counts for the given keywords to
    keyword_counts.csv.

    Each row holds the interval's start timestamp followed by one count per
    keyword: the number of Tweets in that interval whose lower-cased text
    contains the lower-cased keyword.

    interval -- interval name handed to database.objects_by_interval
                (default "day"); also used as the title of the first column.
    keywords -- iterable of keywords, matched ignoring case. The default is
                an immutable tuple so it cannot be altered between calls.
    """
    # Materialise once so that one-shot iterables survive repeated use.
    keywords = tuple(keywords)

    # Create output file
    with open("keyword_counts.csv", "w") as f:
        # Write header line (trailing comma kept for output compatibility)
        f.write("{0},".format(interval))
        f.write(",".join(keywords))
        f.write(",\n")

        # Lower-casing does not depend on the interval, so do it only once
        needles = [word.lower() for word in keywords]

        # Prepare iterator over intervals
        intervals = database.objects_by_interval(
            database.Tweet,
            interval=interval,
            start_date=None,
            stop_date=None)
        for (interval_start, _interval_stop), query in intervals:
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            timestamp = MST.normalize(interval_start).strftime(
                "%Y-%m-%d %H:%M:%S %z")
            f.write("{0},".format(timestamp))
            for needle in needles:
                # Match ignoring case
                kwcount = query.where(
                    peewee.fn.Lower(database.Tweet.text).contains(
                        needle)).count()
                f.write("{0},".format(kwcount))
            f.write("\n")
def export_mention_counts(interval="day", usernames=(
        "jebbush", "realbencarson", "chrischristie", "tedcruz",
        "carlyfiorina", "govmikehuckabee", "johnkasich", "randpaul",
        "marcorubio", "realdonaldtrump")):
    """
    Write per-interval counts for mentions of the given Users to
    mention_counts.csv.

    Each row holds the interval's start timestamp followed by one count per
    username: the number of rows obtained by joining the interval's Tweets
    through the mentions relation to Users whose lower-cased username
    equals the lower-cased name.

    interval  -- interval name handed to database.objects_by_interval
                 (default "day"); also used as the title of the first column.
    usernames -- iterable of usernames, matched ignoring case. The default
                 is an immutable tuple so it cannot be altered between calls.
    """
    # Materialise once so that one-shot iterables survive repeated use.
    usernames = tuple(usernames)

    # Create output file
    with open("mention_counts.csv", "w") as f:
        # Write header line (trailing comma kept for output compatibility)
        f.write("{0},".format(interval))
        f.write(",".join(usernames))
        f.write(",\n")

        # Lower-casing does not depend on the interval, so do it only once
        wanted_names = [user.lower() for user in usernames]

        # Prepare iterator over intervals
        # mtm is the intermediary model of the mentions many-to-many
        # relationship. In this case Tweet -> mtm -> User
        mtm = database.Tweet.mentions.get_through_model()
        intervals = database.objects_by_interval(
            database.Tweet,
            interval=interval,
            start_date=None,
            stop_date=None)
        for (interval_start, _interval_stop), query in intervals:
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            timestamp = MST.normalize(interval_start).strftime(
                "%Y-%m-%d %H:%M:%S %z")
            f.write("{0},".format(timestamp))
            for wanted in wanted_names:
                # Match ignoring case
                count = query.join(mtm).join(database.User).where(
                    peewee.fn.Lower(database.User.username)
                    == wanted).count()
                f.write("{0},".format(count))
            f.write("\n")
def export_hashtag_counts(interval="day", hashtags=(
        "Bush", "Carson", "Christie", "Cruz", "Fiorina", "Huckabee",
        "Kasich", "Paul", "Rubio", "Trump")):
    """
    Write per-interval Tweet counts for the given Hashtags to
    hashtag_counts.csv.

    Each row holds the interval's start timestamp followed by one count per
    hashtag: the number of rows obtained by joining the interval's Tweets
    through the tags relation to Hashtags whose lower-cased tag equals the
    lower-cased hashtag.

    A bit slow. An easy speedup is to convert the list of hashtags to
    Hashtag database objects and query for them.

    interval -- interval name handed to database.objects_by_interval
                (default "day"); also used as the title of the first column.
    hashtags -- iterable of hashtag texts, matched ignoring case. The
                default is an immutable tuple so it cannot be altered
                between calls.
    """
    # Materialise once so that one-shot iterables survive repeated use.
    hashtags = tuple(hashtags)

    # Create output file
    with open("hashtag_counts.csv", "w") as f:
        # Write header line (trailing comma kept for output compatibility)
        f.write("{0},".format(interval))
        f.write(",".join(hashtags))
        f.write(",\n")

        # Lower-casing does not depend on the interval, so do it only once
        wanted_tags = [tag.lower() for tag in hashtags]

        # Prepare iterator over intervals
        # htm is an intermediary model for many-to-many-relationships
        # In this case Tweet -> htm -> Hashtag
        htm = database.Tweet.tags.get_through_model()
        intervals = database.objects_by_interval(
            database.Tweet,
            interval=interval,
            start_date=None,
            stop_date=None)
        for (interval_start, _interval_stop), query in intervals:
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            timestamp = MST.normalize(interval_start).strftime(
                "%Y-%m-%d %H:%M:%S %z")
            f.write("{0},".format(timestamp))
            for wanted in wanted_tags:
                # Match ignoring case
                count = query.join(htm).join(database.Hashtag).where(
                    peewee.fn.Lower(database.Hashtag.tag) == wanted).count()
                f.write("{0},".format(count))
            f.write("\n")
def export_total_counts(interval="day"):
    """
    Write the total number of Tweets per interval to total_counts.csv.

    interval -- interval name handed to database.objects_by_interval
                (default "day"); it is also used as the title of the
                first CSV column.
    """
    with open("total_counts.csv", "w") as f:
        # Header: interval column, then the single count column
        f.write("{0},".format(interval))
        f.write("total,\n")

        # Each item pairs an interval's (start, stop) bounds with a query
        buckets = database.objects_by_interval(
            database.Tweet,
            interval=interval,
            start_date=None,
            stop_date=None)
        for (bucket_start, _bucket_stop), tweets in buckets:
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            stamp = MST.normalize(bucket_start).strftime(
                "%Y-%m-%d %H:%M:%S %z")
            f.write("{0},".format(stamp))
            f.write("{0},\n".format(tweets.count()))
def export_total_counts(interval="day"):
    """
    Export how many Tweets fall into each interval.

    The result is written to total_counts.csv, one row per interval, each
    holding the interval's start timestamp and its Tweet count.

    interval -- interval name passed on to database.objects_by_interval
                (default "day"); doubles as the first column's title.
    """
    with open("total_counts.csv", "w") as f:
        # Header row
        f.write("{0},".format(interval))
        f.write("total,\n")

        # objects_by_interval yields ((start, stop), query) pairs
        for (begin, _end), tweets in database.objects_by_interval(
                database.Tweet,
                interval=interval,
                start_date=None,
                stop_date=None):
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            local_begin = MST.normalize(begin)
            f.write("{0},".format(
                local_begin.strftime("%Y-%m-%d %H:%M:%S %z")))
            total = tweets.count()
            f.write("{0},\n".format(total))
def export_mention_counts(interval="day", usernames=(
        "jebbush", "realbencarson", "chrischristie", "tedcruz",
        "carlyfiorina", "govmikehuckabee", "johnkasich", "randpaul",
        "marcorubio", "realdonaldtrump")):
    """
    Write per-interval counts for mentions of the given Users to
    mention_counts.csv.

    Each row holds the interval's start timestamp followed by one count per
    username: the number of rows obtained by joining the interval's Tweets
    through the mentions relation to Users whose lower-cased username
    equals the lower-cased name.

    interval  -- interval name handed to database.objects_by_interval
                 (default "day"); also used as the title of the first column.
    usernames -- iterable of usernames, matched ignoring case. The default
                 is an immutable tuple so it cannot be altered between calls.
    """
    # Materialise once so that one-shot iterables survive repeated use.
    usernames = tuple(usernames)

    # Create output file
    with open("mention_counts.csv", "w") as f:
        # Write header line (trailing comma kept for output compatibility)
        f.write("{0},".format(interval))
        f.write(",".join(usernames))
        f.write(",\n")

        # Lower-casing does not depend on the interval, so do it only once
        wanted_names = [user.lower() for user in usernames]

        # Prepare iterator over intervals
        # mtm is the intermediary model of the mentions many-to-many
        # relationship. In this case Tweet -> mtm -> User
        mtm = database.Tweet.mentions.get_through_model()
        intervals = database.objects_by_interval(
            database.Tweet,
            interval=interval,
            start_date=None,
            stop_date=None)
        for (interval_start, _interval_stop), query in intervals:
            # Convert the timestamp to Mountain Standard Time which is
            # the local timezone for the example data
            timestamp = MST.normalize(interval_start).strftime(
                "%Y-%m-%d %H:%M:%S %z")
            f.write("{0},".format(timestamp))
            for wanted in wanted_names:
                # Match ignoring case
                count = query.join(mtm).join(database.User).where(
                    peewee.fn.Lower(database.User.username)
                    == wanted).count()
                f.write("{0},".format(count))
            f.write("\n")