import json
from dateutil import parser as dtparser


def main(argv):
    inp_file, out_file = argv
    graph = {}
    out_put = open(out_file, 'w')
    with open(inp_file) as input_file:
        for line in input_file:
            line = line.rstrip()
            line_json = json.loads(line)
            try:
                created_at = dtparser.parse(line_json["created_at"])
                clean, cleaned_line = _clean_string(line_json['text'])
                hash_tags = get_hashtag(cleaned_line)
                # update the graph only when the tweet carries more than one hashtag
                if hash_tags and len(hash_tags) > 1:
                    graph = update_or_build_graph(graph, hash_tags, created_at)
                # evict edges created more than 60 seconds before this tweet
                graph = shuffle_graph(graph, created_at)
                avg_degree = calculate_avg_degree(graph)
                out_put.write(str(avg_degree) + "\n")
            except Exception:
                # skip rate-limit records that lack "created_at"/"text", e.g.
                # {"limit":{"track":19,"timestamp_ms":"1446218985758"}}
                pass
    out_put.close()
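# The helpers called above (get_hashtag, update_or_build_graph, shuffle_graph,
# calculate_avg_degree) are defined elsewhere in the project and are not shown
# in this section. The sketch below is one plausible implementation, assuming
# the graph is a dict mapping each hashtag to a dict of
# {neighbor_hashtag: datetime of the newest edge}; the names match the calls
# above, but the bodies are illustrative, not the project's actual code.
from datetime import timedelta
from itertools import combinations


def get_hashtag(text):
    # Collect words that start with '#' and have at least one character after it.
    return [word for word in text.split()
            if word.startswith('#') and len(word) > 1]


def update_or_build_graph(graph, hash_tags, created_at):
    # Connect every pair of hashtags in the tweet, storing the edge in both directions.
    for a, b in combinations(set(hash_tags), 2):
        graph.setdefault(a, {})[b] = created_at
        graph.setdefault(b, {})[a] = created_at
    return graph


def shuffle_graph(graph, created_at):
    # Drop edges older than 60 seconds, then drop nodes left without edges.
    cutoff = created_at - timedelta(seconds=60)
    for node in list(graph):
        graph[node] = {nbr: ts for nbr, ts in graph[node].items() if ts >= cutoff}
        if not graph[node]:
            del graph[node]
    return graph


def calculate_avg_degree(graph):
    # Average degree = sum of per-node degrees / number of nodes, to two decimals.
    if not graph:
        return 0.0
    return round(sum(len(nbrs) for nbrs in graph.values()) / float(len(graph)), 2)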
def test_tweet_cleaning_and_formatting(self):
    clean, cleaned_tweet = _clean_string(self.test_tweet_text)
    self.assertEqual(
        clean, False,
        "incorrect testing of unicode and escape character presence")
    self.assertEqual(cleaned_tweet, self.correct_unicode_tweet,
                     "incorrect escaping and formatting of tweet")
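# _clean_string is defined elsewhere in the project; the sketch below is one
# plausible implementation, assuming "clean" is True only when nothing had to be
# removed, and that cleaning strips non-ASCII characters and replaces whitespace
# escapes with single spaces. These details are assumptions, not the project's
# actual behaviour.
def _clean_string(text):
    # Keep only ASCII characters, then normalise common escapes and whitespace.
    ascii_only = text.encode('ascii', 'ignore').decode('ascii')
    unescaped = ascii_only.replace('\\/', '/').replace('\n', ' ').replace('\t', ' ')
    cleaned = ' '.join(unescaped.split())
    clean = (cleaned == text)  # False when any character was removed or rewritten
    return clean, cleaned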
import json


def main(argv):
    num_unicode = 0
    inp_file, out_file = argv
    out_put = open(out_file, 'w')
    with open(inp_file) as input_file:
        for line in input_file:
            line = line.rstrip()
            line_json = json.loads(line)
            try:
                clean, cleaned_line = _clean_string(line_json["text"])
                if not clean:
                    num_unicode += 1
                out_put.write(
                    cleaned_line
                    + " (timestamp: {0})".format(line_json["created_at"])
                    + "\n")
            except Exception:
                # skip rate-limit records that lack "text"/"created_at", e.g.
                # {"limit":{"track":19,"timestamp_ms":"1446218985758"}}
                pass
    out_put.write("\n {0} tweets contained unicode.".format(num_unicode))
    out_put.close()
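# Neither main() above is shown with its command-line entry point; each module
# is presumably run as a script that takes an input path and an output path.
# A minimal sketch, assuming that calling convention (file names illustrative):
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
    # e.g.  python tweets_cleaned.py tweets.txt ft1.txt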
def test_rolling_avg_degree(self):
    for tweet in self.tweet_list:
        created_at = dtparser.parse(tweet["created_at"])
        clean, cleaned_line = _clean_string(tweet['text'])
        hash_tags = get_hashtag(cleaned_line)
        if hash_tags and len(hash_tags) > 1:
            self.graph = update_or_build_graph(
                self.graph, hash_tags, created_at)
        self.graph = shuffle_graph(self.graph, created_at)
        self.avg_degree_list.append(calculate_avg_degree(self.graph))
    self.assertEqual(self.avg_degree_list,
                     [1.0, 2.0, 2.0, 2.0, 1.67],
                     'incorrect average degree')
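# For reference, the expected values above are the graph's average degree after
# each tweet: the sum of every node's degree divided by the number of nodes.
# A small worked example with hypothetical hashtags: edges #spark-#apache and
# #apache-#hadoop give degrees apache=2, spark=1, hadoop=1, so the average
# degree is (2 + 1 + 1) / 3 = 1.33. The 1.67 in the expected list suggests the
# result is rounded to two decimal places.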