from json import loads as json_loads
from os import path

ft1 = open(path.abspath(path.join(tweet_output_path, 'ft1.txt')), 'w')
ft2 = open(path.abspath(path.join(tweet_output_path, 'ft2.txt')), 'w')
close_files = lambda l: [f.close() for f in l]

tweet_graph = TweetsGraph(time_window=60)
with open(tweets_incomming_path, 'r') as tweets_incomming:
    # all tweets from the api are utf-8 encoded:
    # https://dev.twitter.com/overview/api/counting-characters
    for cnt, tweet in enumerate(tweets_incomming, start=1):
        try:
            tweet_dict = json_loads(tweet)  # json.loads uses utf-8 decoding by default
            text = tweet_dict["text"]
            created_at = tweet_dict["created_at"]
            hashtags = [hashtag['text'] for hashtag in tweet_dict['entities']['hashtags']]
            tweet_graph.update_graph(Tweet(created_at, hashtags))
            # clean_text is assumed to maintain unicode_tweets_count when count_unicode=True
            cleaned_text = clean_text(text, count_unicode=True)
            # logging.debug('tweet_cnt: {}, num_graph_nodes: {}, avg_deg: {}'.format(
            #     cnt, len(tweet_graph.graph), tweet_graph.get_graph_avg_degree_of_all_nodes()))
            ft1.write('{} (timestamp: {})\n'.format(cleaned_text, created_at))
            ft2.write('{}\n'.format(tweet_graph.get_graph_avg_degree_of_all_nodes()))
        except Exception:
            # Don't normally handle exceptions this broadly in main,
            # but play it safe on unknown data: skip malformed tweets.
            # logging.exception("Tweet on ln {} failed to parse.".format(cnt))
            pass
ft1.write('\n{} tweets contained unicode.'.format(unicode_tweets_count))
close_files([ft1, ft2])
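The `Tweet` and `TweetsGraph` implementations are not shown in this section. As a reference point, here is a minimal sketch that reproduces the behavior the test below asserts; the class and method names come from the source, but the internals (a deque window, an edge-count table) are assumptions, not the actual implementation.

```python
from collections import Counter, deque
from datetime import datetime
from itertools import combinations


class Tweet(object):
    """A tweet reduced to its parsed timestamp and lowercased hashtags."""

    TIME_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'  # e.g. 'Thu Oct 29 17:51:01 +0000 2015'

    def __init__(self, created_at, hashtags):
        self.created_at = datetime.strptime(created_at, self.TIME_FORMAT)
        self.hashtags = set(h.lower() for h in hashtags)


class TweetsGraph(object):
    """Rolling-window hashtag graph: nodes are hashtags, edges connect hashtags
    that co-occurred in a tweet no older than time_window seconds."""

    def __init__(self, time_window=60):
        self.time_window = time_window
        self.graph = {}                 # hashtag -> set of neighboring hashtags
        self._window = deque()          # in-window tweets, oldest first
        self._edge_counts = Counter()   # in-window tweets supporting each edge

    @staticmethod
    def _edges(tweet):
        # every unordered pair of hashtags in the tweet, in canonical order
        return [tuple(sorted(pair)) for pair in combinations(tweet.hashtags, 2)]

    def update_graph(self, tweet):
        self._window.append(tweet)
        for a, b in self._edges(tweet):
            self._edge_counts[(a, b)] += 1
            self.graph.setdefault(a, set()).add(b)
            self.graph.setdefault(b, set()).add(a)
        # Evict tweets that fell out of the window, dropping edges (and then
        # nodes) that no remaining in-window tweet supports.
        while (tweet.created_at - self._window[0].created_at).total_seconds() > self.time_window:
            old = self._window.popleft()
            for a, b in self._edges(old):
                self._edge_counts[(a, b)] -= 1
                if not self._edge_counts[(a, b)]:
                    self.graph[a].discard(b)
                    self.graph[b].discard(a)
                    for node in (a, b):
                        if not self.graph[node]:
                            del self.graph[node]

    def get_graph_avg_degree_of_all_nodes(self):
        if not self.graph:
            return '0.00'
        total_degree = sum(len(neighbors) for neighbors in self.graph.values())
        return '{:.2f}'.format(float(total_degree) / len(self.graph))
```

Whatever the real module looks like, the test pins down the external contract: lowercased hashtag nodes, pairwise edges per tweet, eviction of tweets strictly more than 60 seconds older than the newest one, and an average degree returned as a two-decimal string.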
def test_update_graph_with_example_from_instructions_manually(self):
    '''exact example from the online instructions done manually here:
    https://github.com/InsightDataScience/coding-challenge#building-the-twitter-hashtag-graph'''
    # First tweet added to the graph
    self.tweet_graph.update_graph(
        Tweet('Thu Oct 29 17:51:01 +0000 2015', hashtags=['Spark', 'Apache']))
    # graph will have each hashtag as a node, each the other's neighbor
    assert self.tweet_graph.graph == {'apache': set(['spark']),
                                      'spark': set(['apache'])}
    # get the graph avg degree
    assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '1.00'

    # Second tweet added to the graph
    self.tweet_graph.update_graph(
        Tweet('Thu Oct 29 17:51:30 +0000 2015', hashtags=['Apache', 'Hadoop', 'Storm']))
    # graph gets updated with the new pairwise edges
    assert self.tweet_graph.graph == {'apache': set(['spark', 'hadoop', 'storm']),
                                      'spark': set(['apache']),
                                      'hadoop': set(['apache', 'storm']),
                                      'storm': set(['apache', 'hadoop'])}
    assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '2.00'

    # Third tweet: only one hashtag, so no edges are added and the graph is unchanged
    self.tweet_graph.update_graph(
        Tweet('Thu Oct 29 17:51:55 +0000 2015', hashtags=['Apache']))
    assert self.tweet_graph.graph == {'apache': set(['spark', 'hadoop', 'storm']),
                                      'spark': set(['apache']),
                                      'hadoop': set(['apache', 'storm']),
                                      'storm': set(['apache', 'hadoop'])}
    assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '2.00'

    # Fourth tweet added to the graph
    self.tweet_graph.update_graph(
        Tweet('Thu Oct 29 17:51:56 +0000 2015', hashtags=['Flink', 'Spark']))
    # graph gets updated accordingly
    assert self.tweet_graph.graph == {'apache': set(['spark', 'hadoop', 'storm']),
                                      'spark': set(['apache', 'flink']),
                                      'flink': set(['spark']),
                                      'hadoop': set(['apache', 'storm']),
                                      'storm': set(['apache', 'hadoop'])}
    assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '2.00'

    # Fifth tweet added to the graph
    self.tweet_graph.update_graph(
        Tweet('Thu Oct 29 17:51:59 +0000 2015', hashtags=['HBase', 'Spark']))
    assert self.tweet_graph.graph == {'flink': set(['spark']),
                                      'hadoop': set(['apache', 'storm']),
                                      'storm': set(['apache', 'hadoop']),
                                      'apache': set(['spark', 'hadoop', 'storm']),
                                      'hbase': set(['spark']),
                                      'spark': set(['apache', 'hbase', 'flink'])}
    assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '2.00'

    # Last tweet: the Spark-Apache edge is removed because the tweet that
    # created it is now more than 60s older than the newest tweet
    self.tweet_graph.update_graph(
        Tweet('Thu Oct 29 17:52:05 +0000 2015', hashtags=['Apache']))
    assert self.tweet_graph.graph == {'flink': set(['spark']),
                                      'hadoop': set(['apache', 'storm']),
                                      'storm': set(['apache', 'hadoop']),
                                      'apache': set(['hadoop', 'storm']),
                                      'hbase': set(['spark']),
                                      'spark': set(['hbase', 'flink'])}
    # 5 edges remain -> total degree 10 over 6 nodes -> 10/6 rounds to 1.67
    assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '1.67'

    # now rebuild the graph we just did manually, with the same data loaded from file
    tweets_test_graph2 = TweetsGraph()
    testfile = os.path.join(tests_dir, 'test_data', 'data_for_building_hashtag_graph.txt')
    with open(testfile, 'r') as f:
        for tweet in f:
            tweet_dict = json.loads(tweet)
            hashtags = [hashtag['text'] for hashtag in tweet_dict['entities']['hashtags']]
            tweets_test_graph2.update_graph(Tweet(tweet_dict['created_at'], hashtags))
    # check that the graph built from file matches the one just built manually
    assert tweets_test_graph2.graph == self.tweet_graph.graph
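For context, each line of the test fixture (and of the live input in the main loop above) is one raw tweet JSON object. The fixture's exact contents aren't shown here; this is a hypothetical single line trimmed to the three fields the code actually reads (real API payloads carry many more, e.g. `indices` on each hashtag entity):

```python
import json

# Hypothetical input line with just the fields the parsers above touch.
line = ('{"created_at": "Thu Oct 29 17:51:01 +0000 2015",'
        ' "text": "Spark Summit East this week! #Spark #Apache",'
        ' "entities": {"hashtags": [{"text": "Spark"}, {"text": "Apache"}]}}')

tweet_dict = json.loads(line)
hashtags = [hashtag['text'] for hashtag in tweet_dict['entities']['hashtags']]
assert hashtags == ['Spark', 'Apache']
assert tweet_dict['created_at'] == 'Thu Oct 29 17:51:01 +0000 2015'
```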