Пример #1
0
def clean_tweets(input, output):
    """Write every tweet's text and timestamp to ``output``, one per line.

    The tweets are obtained from ``utility.to_json_array(input)``.  Each
    object that has a ``'text'`` key produces one line of the form
    ``<text> (timestamp: <created_at>)``.  If ``contain_unicode`` flags
    either field, both fields are passed through ``utility.clean_text``
    before being written and the tweet is counted.  The final line of the
    file reports how many tweets were counted this way.

    Args:
        input: Value handed to ``utility.to_json_array`` (presumably the
            path of a line-delimited JSON file -- confirm against helper).
        output: Path of the text file to create or overwrite.
    """
    # The context manager guarantees the output file is flushed and closed
    # even when one of the helpers raises part-way through the input.
    with open(output, 'w') as output_file:
        # Creates JSON array out of line delimited JSON file
        json_input = utility.to_json_array(input)

        # Number of tweets that needed cleaning
        unicode_count = 0

        for current_obj in json_input:
            # Objects without a 'text' key produce no output line
            if 'text' not in current_obj:
                continue

            text = parse.parse_json(current_obj, 'text')
            created_at = parse.parse_json(current_obj, 'created_at')

            # Both fields are cleaned as soon as either one is flagged
            if contain_unicode(text) or contain_unicode(created_at):
                unicode_count += 1
                text = utility.clean_text(text)
                created_at = utility.clean_text(created_at)

            output_file.write(text + ' (timestamp: ' + created_at + ')\n')

        # Aggregate line; intentionally written without a trailing newline
        output_file.write(str(unicode_count) + ' tweet(s) contained unicode.')

    print("\nTweet clean completed\nOutput in: " + output)
Пример #2
0
def average_degree(input, output):
    """Write a running average degree to ``output``, one value per tweet.

    The tweets are obtained from ``utility.to_json_array(input)``.  For each
    object that has a ``'text'`` key, the text and timestamp are cleaned with
    ``utility.clean_text``, the adjacency list is updated via
    ``identify_hashtags``, and the value returned by
    ``utility.calculate_average`` for the current degree list is written on
    its own line.

    Whenever the threshold time returned by ``utility.minute_offset`` differs
    from the one computed for the previous tweet, ``remove_outdated`` is
    applied to the adjacency list before the new tweet is added.

    Args:
        input: Value handed to ``utility.to_json_array`` (presumably the
            path of a line-delimited JSON file -- confirm against helper).
        output: Path of the text file to create or overwrite.
    """
    # The context manager guarantees the output file is flushed and closed
    # even when one of the helpers raises part-way through the input.
    with open(output, 'w') as output_file:
        # Creates JSON array out of line delimited JSON file
        json_input = utility.to_json_array(input)

        adjacency_list = {}
        previous_threshold_time = None

        for current_obj in json_input:
            # Objects without a 'text' key produce no output line
            if 'text' not in current_obj:
                continue

            text = utility.clean_text(parse.parse_json(current_obj, 'text'))
            created_at = utility.clean_text(
                parse.parse_json(current_obj, 'created_at'))
            parsed_time = parse.parse_timestamps(created_at)

            # Threshold time derived from the current post's timestamp
            threshold_time = utility.minute_offset(parsed_time)

            # Pruning is skipped for the first post (nothing to prune yet)
            # and whenever the threshold did not move since the last post.
            if (previous_threshold_time is not None
                    and threshold_time != previous_threshold_time):
                adjacency_list = remove_outdated(adjacency_list,
                                                 threshold_time)
            previous_threshold_time = threshold_time

            # Adds the edges derived from this post to the adjacency list
            adjacency_list = identify_hashtags(adjacency_list, text,
                                               parsed_time)

            # Local is deliberately not called ``average_degree`` so it does
            # not shadow this function's own name.
            degree_list = create_degree_list(adjacency_list)
            current_average = utility.calculate_average(degree_list)
            output_file.write(str(current_average) + '\n')

    print("\nAverage degree completed\nOutput in: " + output)