def calculate_all_tweets(input_file):
    """Feed every tweet in *input_file* into a degree graph and report it.

    Each line of the file is passed through ``TweetSanitizer.sanitize_tweet``
    and the result is added to a ``TweetGraphDegree``.  The graph's average
    degree is then written with ``write_line``, formatted to two decimals.

    Args:
        input_file: Path of the file to read, one tweet per line.
    """
    tweet_cleaner = TweetSanitizer()
    degree_graph = TweetGraphDegree()

    with open(input_file) as source:
        for raw_tweet in source:
            degree_graph.add_tweet(tweet_cleaner.sanitize_tweet(raw_tweet))
            # NOTE(review): the average is emitted after every tweet (a
            # running value), not once at the end -- confirm this matches
            # the intended output format.
            write_line('{:0.2f}'.format(degree_graph.average_degree()))
def sanitize_all_tweets(input_file):
    """Write the sanitized form of every tweet in *input_file*.

    Each line is passed through ``TweetSanitizer.sanitize_tweet`` and the
    string form of the result is written with ``write_line``.  After the last
    tweet an empty line is written, followed by a summary line reporting the
    value of ``TweetSanitizer.num_tweets_with_unicode()``.

    Args:
        input_file: Path of the file to read, one tweet per line.
    """
    tweet_cleaner = TweetSanitizer()

    with open(input_file) as source:
        for raw_tweet in source:
            write_line(str(tweet_cleaner.sanitize_tweet(raw_tweet)))

    # Blank separator between the tweet listing and the summary line.
    write_line('')
    unicode_count = tweet_cleaner.num_tweets_with_unicode()
    write_line('{0} tweets contained unicode.'.format(unicode_count))
class TestTweetSanitizer(unittest.TestCase):
    """Tests for ``TweetSanitizer.sanitize_tweet`` and its unicode counter.

    The tweet fixtures (``json_tweet``, ``unicode_json``, ``escaped_tweet``,
    ``all_escaped_tweet``) are module-level names defined outside this class.
    """

    def setUp(self):
        """Give every test its own sanitizer so no state is shared."""
        self.sanitizer = TweetSanitizer()

    def test_load_json(self):
        """Loads a basic JSON tweet that doesn't require sanitization."""
        expected_output = 'Spark Summit East this week! #Spark #Apache (timestamp: Thu Oct 29 17:51:01 +0000 2015)'
        sanitized_output = str(self.sanitizer.sanitize_tweet(json_tweet))
        self.assertEqual(expected_output, sanitized_output)

    def test_remove_unicode(self):
        """Sanitizing the unicode fixture yields ASCII text and is counted."""
        tweet = unicode_json
        expected_output = "Im at Terminal de Integrao do Varadouro in Joo Pessoa (timestamp: Thu Oct 29 18:10:49 +0000 2015)"
        sanitized_output = str(self.sanitizer.sanitize_tweet(tweet))
        self.assertEqual(expected_output, sanitized_output)

        # Sanitize the same tweet a second time; only the side effect on the
        # counter matters here, so the return value is discarded.
        self.sanitizer.sanitize_tweet(tweet)
        num_unicode = self.sanitizer.num_tweets_with_unicode()
        self.assertEqual(2, num_unicode)

    def test_unescape_text(self):
        """Sanitizing the escaped fixture does not touch the unicode count."""
        tweet = json.dumps(escaped_tweet)
        expected_output = ', PB https://t.co/HOl34REL1a (timestamp: Thu Oct 30 18:10:49 +0000 2015)'
        sanitized_output = str(self.sanitizer.sanitize_tweet(tweet))
        self.assertEqual(expected_output, sanitized_output)

        # A second pass over the same tweet must still leave the count at 0.
        self.sanitizer.sanitize_tweet(tweet)
        num_unicode = self.sanitizer.num_tweets_with_unicode()
        self.assertEqual(0, num_unicode)

    def test_all_escape_charecters(self):
        """Checks the sanitized text of the fixture with every escape."""
        tweet = json.dumps(all_escaped_tweet)
        # The backslash is written as ``\\``: a bare backslash-space is an
        # invalid escape sequence (SyntaxWarning on Python 3.12+).  The
        # resulting string value is unchanged.
        expected_output = '/Hello \\ \' " n o di ce (timestamp: Fri Oct 30 18:10:49 +0000 2015)'
        sanitized_output = str(self.sanitizer.sanitize_tweet(tweet))
        self.assertEqual(expected_output, sanitized_output)
def setUp(self):
    """Store a newly constructed ``TweetSanitizer`` on ``self.sanitizer``."""
    fresh_sanitizer = TweetSanitizer()
    self.sanitizer = fresh_sanitizer