def find_min_count(items, min_count):
    '''
    Find the items that occur at least min_count times

    Inputs:
        items: a list of items  (must be hashable/comparable)
        min_count: integer

    Returns: list of (item, count) tuples, in the order produced by
        sort_count_pairs, cut off at the first pair whose count is
        below min_count
    '''

    # Runs the helper function for you (DO NOT MODIFY)
    item_counts = count_items(items)

    # An empty input has nothing to report
    if not len(items):
        return []

    # Collect pairs until the first one that misses the threshold.
    # NOTE(review): stopping early relies on sort_count_pairs putting
    # the largest counts first -- confirm against the helper.
    qualifying = []
    for pair in sort_count_pairs(item_counts):
        if pair[1] < min_count:
            break
        qualifying.append(pair)

    return qualifying
示例#2
0
def find_top_k(tokens, k):
    '''
    Find the k most frequently occurring tokens

    Inputs:
        tokens: list of tokens (must be immutable)
        k: a non-negative integer

    Returns: list of at most k tokens (without their counts), taken
        from the front of the list produced by sort_count_pairs.

    Raises: ValueError if k is negative
    '''

    # Reject a negative k before doing any work
    if k < 0:
        raise ValueError("In find_top_k, k must be a non-negative integer")

    token_counts = count_tokens(tokens)

    # No tokens: there is nothing to rank
    if len(tokens) == 0:
        return []

    # Rank the (token, count) pairs, then keep only the token of each pair
    ranked_pairs = sort_count_pairs(list(token_counts.items()))
    ranked_tokens = [pair[0] for pair in ranked_pairs]
    return ranked_tokens[:k]
def find_frequent(items, k):
    '''
    Find candidate items for "occurs at least 1/k * len(items) times"
    while never keeping more than k - 1 counters between iterations.

    Input:
        items: a list of items  (must be hashable/comparable)
        k: integer

    Returns: result of sort_count_pairs on the surviving
        (item, counter value) pairs.  The counter values are what is
        left after the decrements below, not raw occurrence counts.

    Raises: ValueError if more than k - 1 counters are stored at the
        start of an iteration
    '''

    limit = k - 1
    counter = {}

    for item in items:

        # Sanity check on the number of live counters
        if len(counter) > limit:
            raise ValueError("The number of elements stored in counter"
                             " should not exceed (k-1)=" + str(limit))

        # Already tracked: just bump its counter
        if item in counter:
            counter[item] += 1
            continue

        # New item: start tracking it; if that overflows the budget,
        # decrement every counter and drop the ones that reach zero
        counter[item] = 1
        if len(counter) > limit:
            counter = {key: count - 1
                       for key, count in counter.items()
                       if count > 1}

    return sort_count_pairs(counter.items())
def find_top_k(items, k):
    '''
    Find the K most frequently occurring items

    Inputs:
        items: list of items (must be hashable/comparable)
        k: a non-negative integer

    Returns: the first K (item, count) tuples of the list produced by
        sort_count_pairs (all of them when there are K or fewer)

    Raises: ValueError if k is negative
    '''

    # Error checking (DO NOT MODIFY)
    if k < 0:
        raise ValueError("In find_top_k, k must be a non-negative integer")

    # Runs the helper function for you (DO NOT MODIFY)
    item_counts = count_items(items)

    ranked = sort_count_pairs(item_counts)

    # Not more than k distinct entries: hand back the whole ranking
    if k >= len(ranked):
        return ranked

    # Otherwise copy out the leading k entries
    return [ranked[position] for position in range(k)]
示例#5
0
def find_min_count(tokens, min_count):
    '''
    Find the tokens that occur *at least* min_count times

    Inputs:
        tokens: a list of tokens  (must be immutable)
        min_count: a non-negative integer

    Returns: set of tokens

    Raises: ValueError if min_count is negative
    '''

    # Error checking
    if min_count < 0:
        raise ValueError("min_count must be a non-negative integer")

    # The result is an unordered set, so the (token, count) pairs do not
    # need to be sorted before filtering.
    return {token
            for token, count in count_tokens(tokens).items()
            if count >= min_count}
示例#6
0
def find_min_count(items, min_count):
    '''
    Find the items that occur at least min_count times

    Inputs:
        items: a list of items  (must be hashable/comparable)
        min_count: integer

    Returns: result of sort_count_pairs on the (item, count) tuples
        whose count is at least min_count
    '''

    # Runs the helper function for you (DO NOT MODIFY)
    # NOTE(review): item_counts is not used below; the counts are
    # recomputed locally.
    item_counts = count_items(items)

    # Tally how often each distinct item appears
    tally = {}
    for entry in items:
        tally[entry] = tally.get(entry, 0) + 1

    # Keep only the pairs that reach the threshold
    frequent_pairs = [(entry, total)
                      for entry, total in tally.items()
                      if total >= min_count]

    return sort_count_pairs(frequent_pairs)
示例#7
0
def find_top_k_entities(tweets, entity_key, value_key, k):
    '''
    Find the K most frequently occurring entities

    Inputs:
        tweets: a list of tweets
        entity_key: a string ("hashtags", "user_mentions", etc)
        value_key: string (appropriate value depends on the entity type)
        k: integer

    Returns: the first k (entity, count) pairs of the list produced by
        sort_count_pairs.
        NOTE(review): the original notes disagree on whether that order
        is non-decreasing or descending by count -- confirm against
        sort_count_pairs.
    '''

    # Count the occurrences of each entity of the requested type
    entity_counts = counter(tweets, entity_key, value_key)

    # Order the (entity, count) pairs and keep the leading k of them
    ranked_entities = sort_count_pairs(entity_counts.items())
    return ranked_entities[:k]
示例#8
0
def find_frequent(items, k):
    '''
    Find candidate items for "occurs at least 1/k * len(items) times"
    using at most k - 1 counters.

    Input:
        items: a list of items  (must be hashable/comparable)
        k: integer

    Returns: result of sort_count_pairs on the surviving
        (item, counter value) pairs

    Raises: ValueError if items is non-empty and more than k - 1
        counters are stored after processing
    '''

    counter = {}

    for item in items:
        if item in counter:
            # Already tracked: bump its counter
            counter[item] += 1
        elif len(counter) < k - 1:
            # Room for one more counter
            counter[item] = 1
        else:
            # Counter budget exhausted: delegate to the helper.
            # NOTE(review): presumably decrements every counter and drops
            # the ones that reach zero -- confirm against decr_and_remove.
            decr_and_remove(counter)

    # The counters no longer change at this point, so a single check
    # replaces the former per-item re-check.  The check is skipped for
    # empty input, exactly as the per-item loop never ran for it.
    if len(items) > 0 and len(counter) > k - 1:
        raise ValueError("The number of elements stored in counter" +
                         " should not exceed (k-1)=" + str(k - 1))

    return sort_count_pairs(counter.items())
示例#9
0
def compare_tuple_lists(actual, params, recreate_msg):
    '''
    Print the actual and expected results, then fail the test (via
    pytest.fail) on the first discrepancy found between them.

    Inputs:
        actual: the result produced by the code under test
        params: dictionary holding the expected result under "expected"
        recreate_msg: text appended to every failure message
    '''

    print("Actual:", actual)
    print()
    expected = params["expected"]
    print("Expected:", expected)

    # check the type
    check_tuple_list(actual, recreate_msg)

    # Identical results: the test passes
    if actual == expected:
        return

    # Different sizes are reported before any element is compared
    if len(actual) != len(expected):
        length_msg = ("Length of actual result ({}) does not match "
                      "the length of the expected result ({}).\n{}")
        pytest.fail(length_msg.format(len(actual), len(expected),
                                      recreate_msg))

    # Same content once sorted: only the ordering is wrong
    if sort_count_pairs(actual) == expected:
        order_msg = "Actual result is not sorted properly.\n{}"
        pytest.fail(order_msg.format(recreate_msg))

    # Otherwise report the first position where the two lists differ
    for i, (actual_val, expected_val) in enumerate(zip(actual, expected)):
        if actual_val != expected_val:
            mismatch_msg = ("At index {}:"
                            "  Actual result ({}) does not match"
                            "  Expected result ({}).\n{}")
            pytest.fail(
                mismatch_msg.format(i, actual_val, expected_val,
                                    recreate_msg))
示例#10
0
def find_top_k(items, k):
    '''
    Find the K most frequently occurring items

    Inputs:
        items: a list of items
        k: integer

    Returns: the first K (item, count) tuples of the list produced by
        sort_count_pairs
    '''

    # Count the number of times each unique value occurs
    tally = {}
    for entry in items:
        tally[entry] = tally.get(entry, 0) + 1

    # Sort the (key, count) pairs using the supplied function and
    # pick off the first K of them
    ranked = sort_count_pairs(tally.items())
    return ranked[:k]
示例#11
0
def find_top_k(items, k):
    '''
    Find the K most frequently occurring items

    Inputs:
        items: a list of items
        k: integer

    Returns: the first K (item, count) tuples of the list produced by
        sort_count_pairs
    '''

    # Count how often each distinct item appears
    k_library = {}
    for i in items:
        k_library[i] = k_library.get(i, 0) + 1

    # Sort the (item, count) pairs and keep the leading k
    highest_to_lowest = sort_count_pairs(list(k_library.items()))
    return highest_to_lowest[:k]
示例#12
0
def find_frequent(items, k):
    '''
    Find candidate items for "occurs at least 1/k * len(items) times"
    using at most k - 1 counters.

    Input:
        items: list of items
        k: integer

    Returns: result of sort_count_pairs on the surviving
        (item, counter value) pairs.  The counter values are what is
        left after the decrements below, not raw occurrence counts.
    '''

    max_counters = k - 1
    tracked = {}

    for entry in items:
        if entry in tracked:
            # Known item: bump its counter
            tracked[entry] += 1
        elif len(tracked) < max_counters:
            # Still room for a new counter
            tracked[entry] = 1
        else:
            # No room: decrement every counter, dropping the ones that
            # would reach zero
            tracked = {key: count - 1
                       for key, count in tracked.items()
                       if count > 1}

    return sort_count_pairs(tracked.items())
示例#13
0
def helper(test_description):
    '''
    Do a test, check the result, report an error, if necessary.

    Inputs:
        test_description: dictionary with the keys "task",
            "tweet_filename", "expected_filename", "arg1" and "arg2"

    Every problem found is reported through pytest.fail.
    '''

    task = test_description["task"]

    # load the tweets from the file; the with-block closes the handle
    # even when parsing fails
    tweet_filename = os.path.join(BASE_DIR,
                                  test_description["tweet_filename"])
    try:
        with open(tweet_filename) as tweet_file:
            tweets = json.load(tweet_file)
    except OSError as e:
        pytest.fail("{}".format(e))

    expected = get_expected(test_description)
    if expected is None:
        pytest.fail("Could not open expected result file:" +
                    test_description["expected_filename"] + ":")

    # Any exception raised by the function under test becomes a failure
    try:
        actual = task_to_fn[task](tweets, test_description["arg1"],
                                  test_description["arg2"])
    except Exception as e:
        pytest.fail("{}".format(e))

    if not check_type(actual):
        s = ("Actual result has the wrong type."
             " The correct type is list of pairs "
             "(that is, tuples of length 2)")
        pytest.fail(s)

    if actual != expected:
        # Size mismatches are reported before any element is compared
        if len(actual) != len(expected):
            s = ("Length of actual result ({}) does not match "
                 "the length of the expected result ({})")
            pytest.fail(s.format(len(actual), len(expected)))

        # Same content once sorted: only the ordering is wrong
        if sort_count_pairs(actual) == expected:
            pytest.fail("Actual result is not sorted properly.")

        # Report the first position where the two results differ
        for i, (actual_val, expected_val) in enumerate(zip(actual,
                                                           expected)):
            if actual_val != expected_val:
                s = ("Actual result at index {} ({}) does not match "
                     "expected result ({}) at index {}.")
                pytest.fail(s.format(i, actual_val, expected_val, i))

    # Test succeeded if you get to here
    return
示例#14
0
def find_top_n(items, n):
    '''
    Find the N most frequently occurring items.

    Inputs:
        items: a list of items
        n: integer

    Returns: the first N (item, count) tuples of the list produced by
        sort_count_pairs
    '''

    # Tally the occurrences of each distinct item
    tally = {}
    for entry in items:
        if entry in tally:
            tally[entry] += 1
        else:
            tally[entry] = 1

    # Order the pairs and keep the leading n
    ranked = sort_count_pairs(tally.items())
    return ranked[:n]
示例#15
0
def helper(test_description):
    '''
    Run the test described by test_description, check the result and
    report any problem through pytest.fail.

    Inputs:
        test_description: dictionary with the keys "task",
            "tweet_filename" and "expected_filename", plus the
            task-specific argument keys read below
    '''
    task = test_description["task"]

    # load the tweets from the file; the with-block closes the handle
    # even when parsing fails
    tweet_filename = os.path.join(BASE_DIR,
                                  test_description["tweet_filename"])
    try:
        with open(tweet_filename) as tweet_file:
            tweets = json.load(tweet_file)
    except OSError as e:
        pytest.fail("{}".format(e))

    expected = get_expected(test_description)
    if expected is None:
        pytest.fail("Could not open expected result file:" +
                    test_description["expected_filename"] + ":")

    # Any exception raised while calling the task becomes a failure
    try:
        if task in ["task1", "task2", "task3"]:
            actual = task_to_fn[task](tweets,
                                      test_description["entity_type"],
                                      test_description["value_key"],
                                      test_description["arg3"])
        elif task in ["task4", "task5", "task6", "task7"]:
            stop_words = STOP_WORDS.get(
                test_description["stop_words_key"], set())
            stop_prefix = STOP_PREFIXES.get(
                test_description["stop_prefix_key"], set())
            actual = task_to_fn[task](tweets, test_description["n"],
                                      stop_words, stop_prefix,
                                      test_description["arg4"])
        else:
            # Raised explicitly (rather than asserted) so the failure
            # report below names the offending task
            raise ValueError("Unknown task: {}".format(task))
    except Exception as e:
        pytest.fail("{}".format(e))

    if not check_type(actual):
        s = ("Actual result has the wrong type.  The correct type is "
             "list of pairs (that is, tuples of length 2)")
        pytest.fail(s)

    if actual != expected:
        # Size mismatches are reported before any element is compared
        if len(actual) != len(expected):
            s = ("Length of actual result ({}) does not match "
                 "the length of the expected result ({})")
            pytest.fail(s.format(len(actual), len(expected)))

        # Same content once sorted: only the ordering is wrong
        if sort_count_pairs(actual) == expected:
            pytest.fail("Actual result is not sorted properly.")

        # Report the first position where the two results differ
        for i, (actual_val, expected_val) in enumerate(zip(actual,
                                                           expected)):
            if actual_val != expected_val:
                s = ("Actual result at index {} ({}) does not match "
                     "expected result ({}) at index {}.")
                pytest.fail(s.format(i, actual_val, expected_val, i))

    # Test succeeded if you get to here
    return
示例#16
0
def find_frequent_entities(tweets, entity_key, value_key, k):
    '''
    Find candidate frequent entities across the tweets using at most
    k - 1 counters.

    Input:
        tweets: a list of tweets
        entity_key: a string ("hashtags", "user_mentions", etc)
        value_key: string (appropriate value depends on the entity type)
        k: integer

    Returns: result of sort_count_pairs on the surviving
        (entity, counter value) pairs.  The counter values are what is
        left after the decrements below, not raw occurrence counts.
    '''

    max_counters = k - 1
    entity_counts = {}

    # tweet_value_finder supplies the entity values to be counted
    for value in tweet_value_finder(tweets, entity_key, value_key):
        if value in entity_counts:
            # value already tracked: bump its counter
            entity_counts[value] += 1
        elif len(entity_counts) < max_counters:
            # room left: start a counter for this value
            entity_counts[value] = 1
        else:
            # no room: decrement every counter and drop those that
            # would fall to zero
            entity_counts = {key: count - 1
                             for key, count in entity_counts.items()
                             if count > 1}

    # hand the remaining pairs to the shared sorting helper
    return sort_count_pairs(entity_counts.items())
示例#17
0
def find_frequent_ngrams(tweets, n, stop_words, stop_prefixes, k):
    '''
    Find candidate frequently occurring n-grams using at most k - 1
    counters.

    Inputs:
        tweets: a list of tweets
        n: integer
        stop_words: a set of strings to ignore
        stop_prefixes: a set of strings.  Words w/ a prefix that
          appears in this list should be ignored.
        k: integer

    Returns: result of sort_count_pairs on the surviving
      (n-gram, counter value) pairs.  The counter values are what is
      left after the decrements below, not raw occurrence counts.
    '''

    max_counters = k - 1
    ngram_counts = {}

    # ngram_generator supplies the n-grams to be counted
    for gram in ngram_generator(tweets, n, stop_words, stop_prefixes):
        if gram in ngram_counts:
            # n-gram already tracked: bump its counter
            ngram_counts[gram] += 1
        elif len(ngram_counts) < max_counters:
            # room left: start a counter for this n-gram
            ngram_counts[gram] = 1
        else:
            # no room: decrement every counter and drop those that
            # would fall to zero
            ngram_counts = {key: count - 1
                            for key, count in ngram_counts.items()
                            if count > 1}

    # hand the remaining pairs to the shared sorting helper
    return sort_count_pairs(ngram_counts.items())
示例#18
0
def find_frequent(items, k):
    '''
    Find candidate items for "occurs at least 1/k * len(items) times"
    using K - 1 counters.

    Input:
        items: list of items
        k: integer

    Returns: result of sort_count_pairs on the surviving
        (item, counter value) pairs
    '''

    # For each item I and the counter structure D:
    #   - I in D: increment I's counter
    #   - I not in D and fewer than K - 1 counters: add I with count one
    #   - I not in D and exactly K - 1 counters: decrement every
    #     counter and drop the ones that reach zero
    #   - otherwise (more than K - 1 counters): leave D untouched

    max_counters = k - 1
    tracked = {}

    for entry in items:
        if entry in tracked:
            tracked[entry] += 1
            continue

        size = len(tracked)
        if size < max_counters:
            tracked[entry] = 1
        elif size == max_counters:
            tracked = {key: count - 1
                       for key, count in tracked.items()
                       if count > 1}

    return sort_count_pairs(tracked.items())
示例#19
0
def make_k_list(items):
    '''
    Count each item in items and return the sorted counts

    Inputs:
        items: list of items to be counted

    Returns:
        result of sort_count_pairs on the (item, count) pairs
    '''

    # tally the occurrences, keyed by the items themselves
    tally = {}
    for entry in items:
        if entry in tally:
            tally[entry] += 1
        else:
            tally[entry] = 1

    # sort the (item, count) pairs with the shared helper
    return sort_count_pairs(tally.items())
示例#20
0
def calc_tf(docs):
    '''
    Calculates TF scores per document for a corpus of documents.

    Each token is scored as 0.5 + 0.5 * (count / reference count),
    where the reference count is taken from the first pair returned by
    sort_count_pairs.
    NOTE(review): presumably that first pair holds the document's
    highest count -- confirm against sort_count_pairs.

    Inputs:
        docs: a list of lists (must be hashable/comparable)
    Returns: a list of dictionaries, one per document, mapping each
                token to its TF score (empty for an empty document)
    '''

    def _score_document(doc):
        # An empty document has no tokens to score
        if len(doc) == 0:
            return {}
        ranked = sort_count_pairs(count_tokens(doc))
        reference = ranked[0][1]
        return {token: 0.5+0.5*(count/reference) for token, count in ranked}

    return [_score_document(doc) for doc in docs]
示例#21
0
def find_min_count(items, min_count):
    '''
    Find the items that occur at least min_count times

    Inputs:
        items: a list of items
        min_count: integer

    Returns: result of sort_count_pairs on the (item, count) tuples
        whose count is at least min_count
    '''

    # Tally the occurrences of every distinct item
    occurrences = {}
    for entry in items:
        if entry in occurrences:
            occurrences[entry] += 1
        else:
            occurrences[entry] = 1

    # Keep the pairs that reach the threshold, then sort them
    survivors = [(entry, total)
                 for entry, total in occurrences.items()
                 if total >= min_count]
    return sort_count_pairs(survivors)
示例#22
0
def ngram_counter(tweets, n, stop_words, stop_prefixes):
    '''
    Find n-grams and their associated counts.

    Inputs:
        tweets: a list of tweets
        n: integer
        stop_words: a set of strings to ignore
        stop_prefixes: a set of strings.  Words w/ a prefix that
          appears in this list should be ignored.

    Returns: result of sort_count_pairs on the (n-gram, count) pairs,
      where each n-gram is a tuple of n consecutive preprocessed words
    '''

    ngram_tally = {}

    for tweet in tweets:
        words = preprocess_tweet(tweet, stop_words, stop_prefixes)
        # last index at which a full n-gram still fits in the tweet
        last_start = len(words) - n
        for start in range(len(words)):
            if start > last_start:
                break
            gram = tuple(words[start:start + n])
            ngram_tally[gram] = ngram_tally.get(gram, 0) + 1

    # sort the (n-gram, count) pairs with the shared helper
    return sort_count_pairs(ngram_tally.items())
示例#23
0
def find_frequent_6(items, k):
    '''
    Counter-based search for frequent items, driven by the
    threshold k - 1.

    NOTE(review): the stated goal is to find items where the number of
    times the item occurs is at least 1/k * len(items); the counts
    returned are the values left after the decrements below, which are
    not necessarily the raw occurrence counts.

    Input: 
        items: list of items
        k: integer

    Returns: result of sort_count_pairs on the (item, count) pairs
        whose final count is not zero
    '''
   
    # N is computed but not referenced again in this function
    N = len(items)
    # items_dict: running count per distinct item, all starting at 0
    items_dict = {z : 0 for z in items}
    # new_dict: the items currently being tracked; only its keys are
    # consulted (membership, size, iteration), its values are never read
    new_dict = {}
    # overall_dict: a separate dict with the same keys, iterated below
    # so that entries can be deleted from the other dicts meanwhile
    overall_dict = {z : 0 for z in items}
    
    for a in items:
        # already tracked: count this occurrence
        if a in new_dict:
            items_dict[a] += 1
        # not tracked yet: start tracking it
        if a not in new_dict:
            new_dict[a] = 0
            # fewer than k - 1 tracked items: count this occurrence
            if len(new_dict) < k - 1:
                items_dict[a] += 1
            # exactly k - 1 tracked items: count this occurrence, then
            # decrement every tracked item and stop tracking the ones
            # whose count is zero
            if len(new_dict) == k - 1:
                items_dict[a] += 1
                for b in new_dict:
                    items_dict[b] -= 1
                for b in overall_dict:
                    if b in new_dict:
                        if items_dict[b] == 0:
                            del new_dict[b]
            # NOTE(review): when len(new_dict) > k - 1 neither branch
            # runs, so the item becomes tracked with its count unchanged
     
    # drop every item whose final count is zero
    for a in overall_dict:
        if items_dict[a] == 0:
            del items_dict[a] 
        
    l = items_dict.items()

    return sort_count_pairs(l)
示例#24
0
def find_frequent(items, k):
    '''
    Find candidate items for "occurs at least 1/k * len(items) times"
    using at most k - 1 counters.

    Input:
        items: list of items
        k: integer

    Returns: result of sort_count_pairs on the surviving
        (item, counter value) pairs.  The counter values are what is
        left after the decrements below, not raw occurrence counts.
    '''

    max_counters = k - 1
    counters = {}

    for entry in items:
        if entry in counters:
            # item already has a counter: add 1
            counters[entry] += 1
        elif len(counters) < max_counters:
            # fewer than k-1 counters in use: start one at 1
            counters[entry] = 1
        else:
            # k-1 or more counters in use: reduce every counter by 1
            # and drop the ones that would be valued at 0
            counters = {key: count - 1
                        for key, count in counters.items()
                        if count > 1}

    # sort the remaining (item, count) pairs with the shared helper
    return sort_count_pairs(counters.items())
示例#25
0
def find_top_k(items, k):
    '''
    Find the K most frequently occurring items

    Inputs:
        items: list of items (must be hashable/comparable)
        k: a non-negative integer

    Returns: the first K (item, count) tuples of the list produced by
        sort_count_pairs

    Raises: ValueError if k is negative
    '''

    # Error checking (DO NOT MODIFY)
    if k < 0:
        raise ValueError("In find_top_k, k must be a non-negative integer")

    # Runs the helper function for you (DO NOT MODIFY)
    # NOTE(review): item_counts is not used below; the counts are
    # recomputed locally.
    item_counts = count_items(items)

    # Tally how often each distinct item appears
    tally = {}
    for entry in items:
        tally[entry] = tally.get(entry, 0) + 1

    # Sort the (item, count) pairs and keep the leading k
    ranked = sort_count_pairs(list(tally.items()))
    return ranked[:k]
示例#26
0
def find_min_count(items, min_count):
    '''
    Find the items that occur at least min_count times

    Inputs:
        items: a list of items
        min_count: integer

    Returns: result of sort_count_pairs on the (item, count) tuples
        whose count is at least min_count
    '''

    # Compute the counts
    occurrences = {}
    for entry in items:
        occurrences[entry] = occurrences.get(entry, 0) + 1

    # Build the list of pairs that meet the threshold and sort it
    # using the supplied function
    meets_threshold = [(entry, total)
                       for entry, total in occurrences.items()
                       if total >= min_count]
    return sort_count_pairs(meets_threshold)
示例#27
0
def find_min_count_entities(tweets, entity_key, value_key, min_count):
    '''
    Find the entities that occur at least min_count times.

    Inputs:
        tweets: a list of tweets
        entity_key: a string ("hashtags", "user_mentions", etc)
        value_key: string (appropriate value depends on the entity type)
        min_count: integer

    Returns: result of sort_count_pairs on the (entity, count) pairs
        whose count is at least min_count.
        NOTE(review): the original notes disagree on whether that order
        is non-decreasing or descending by count -- confirm against
        sort_count_pairs.
    '''

    # count the occurrences of each entity across the tweets
    entity_counts = counter(tweets, entity_key, value_key)

    # keep only the pairs whose count reaches the threshold
    above_threshold = []
    for pair in entity_counts.items():
        if pair[1] >= min_count:
            above_threshold.append(pair)

    # sort the remaining pairs with the shared helper
    return sort_count_pairs(above_threshold)
示例#28
0
    Inputs:
        tokens: list of tokens (must be hashable/comparable)
        k: a non-negative integer

    Returns: sorted list of the top k tuples

    '''

    # Error checking (DO NOT MODIFY)
    err_msg = "In find_top_k, k must be a non-negative integer"
    assert k >= 0, err_msg

<<<<<<< HEAD
    counted = count_tokens(tokens)
    sorted_tokens = sort_count_pairs(counted) 
    return sorted_tokens[:k]    
=======
    # Your code for Task 1.2 goes here
    # Replace return value with an appropriate value
    return []
>>>>>>> 772604324d9e6fbf8e76ecb03659b77d09955de0


def find_min_count(tokens, min_count):
    '''
    Find the tokens that occur at least min_count times

    Inputs:
        tokens: a list of tokens (must be hashable/comparable)
        min_count: integer