# Project-local helper modules: line-delimited JSON parsing and text cleaning
import parse
import utility


def clean_tweets(input, output):
    # Creates the output text file
    output_file = open(output, 'w')
    # Creates a JSON array out of the line-delimited JSON file
    json_input = utility.to_json_array(input)
    # Initializes the unicode counter
    unicode_count = 0
    # Iterates through all JSON objects within the array
    for current_obj in json_input:
        if 'text' in current_obj:
            text = parse.parse_json(current_obj, 'text')
            created_at = parse.parse_json(current_obj, 'created_at')
            # Unicode check: counts posts whose text or timestamp
            # contains unicode before cleaning
            if contain_unicode(text) or contain_unicode(created_at):
                unicode_count += 1
            text = utility.clean_text(text)
            created_at = utility.clean_text(created_at)
            # Writes the cleaned post to the output file
            output_file.write(text + ' (timestamp: ' + created_at + ')\n')
    # Writes the aggregate unicode count to the output file
    output_file.write(str(unicode_count) + ' tweet(s) contained unicode.')
    # Closes files
    output_file.close()
    print("\nTweet clean completed\nOutput in: " + output)
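# contain_unicode is called above but not defined in this section. A minimal
# sketch, assuming it simply flags any non-ASCII character; the name matches
# the call site, but the body is an assumption, not the project's confirmed
# implementation:
def contain_unicode(text):
    # Returns True if any character falls outside the 7-bit ASCII range.
    return any(ord(char) > 127 for char in text)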
def average_degree(input, output):
    # Creates the output text file
    output_file = open(output, 'w')
    # Creates a JSON array out of the line-delimited JSON file
    json_input = utility.to_json_array(input)
    adjacency_list = {}
    previous_threshold_time = None
    # Iterates through all objects within the JSON array
    for current_obj in json_input:
        # Checks that text exists within the current object
        if 'text' in current_obj:
            # Extracts the text and timestamp of the post
            text = parse.parse_json(current_obj, 'text')
            created_at = parse.parse_json(current_obj, 'created_at')
            # Cleans the text and timestamp, then parses the timestamp
            text = utility.clean_text(text)
            created_at = utility.clean_text(created_at)
            parsed_time = parse.parse_timestamps(created_at)
            # Calculates the current post's threshold time
            threshold_time = utility.minute_offset(parsed_time)
            # Initializes the previous threshold time so it can be used
            # for comparison, skipping redundant checks
            if previous_threshold_time is None:
                previous_threshold_time = threshold_time
            # If the threshold time changed, removes all edges in the
            # adjacency list below the threshold and resets the previous
            # threshold time to the latest post's
            if threshold_time != previous_threshold_time:
                adjacency_list = remove_outdated(adjacency_list, threshold_time)
                previous_threshold_time = threshold_time
            # Builds the adjacency list from the post's hashtag list:
            # {node: [{'edge': ..., 'timestamp': ...}, ...]}
            adjacency_list = identify_hashtags(adjacency_list, text, parsed_time)
            # Creates the degree list and calculates the average degree
            degree_list = create_degree_list(adjacency_list)
            avg_degree = utility.calculate_average(degree_list)
            # Writes the rolling average degree to the output file
            output_file.write(str(avg_degree) + '\n')
    # Closes files
    output_file.close()
    print("\nAverage degree completed\nOutput in: " + output)
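# remove_outdated and create_degree_list are likewise called above but not
# defined in this section. Minimal sketches under the adjacency-list shape
# documented above, {node: [{'edge': ..., 'timestamp': ...}, ...]}; the field
# names and the pruning rule (keep edges at or after the threshold) are
# assumptions, not the project's confirmed implementations:
def remove_outdated(adjacency_list, threshold_time):
    # Keeps only edges whose timestamp is at or after the threshold, and
    # drops nodes left with no edges so they stop contributing a degree
    # of zero to the average.
    pruned = {}
    for node, edges in adjacency_list.items():
        kept = [edge for edge in edges if edge['timestamp'] >= threshold_time]
        if kept:
            pruned[node] = kept
    return pruned


def create_degree_list(adjacency_list):
    # A node's degree is its number of incident edges, i.e. the length
    # of its edge list.
    return [len(edges) for edges in adjacency_list.values()]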