예제 #1
0
def get_low_filter(args, c):
    print "\n\n*** FILTER LOW FREQUENCY WORDS ***"
    items = itemfreq(c.corpus)
    counts = items[:, 1]
    low_filter = False

    while not low_filter:
        bin_counts, bins = np.histogram(counts[counts.argsort()[::-1]],
                                        range=(0, len(c.words) / 20.))
        #print "{0:>10s} {1:>10s}".format("# Tokens", "# Words")
        for bin, count in zip(bins[:-1], bin_counts):
            print "{1:10.0f} words occur less than {0:10.0f} times".format(
                bin, count)
        print counts.sum(), "total occurrences"
        print len(c.words), "total words"

        input_filter = 0
        accept = None
        while not input_filter:
            try:
                if low_filter:
                    input_filter = low_filter
                else:
                    input_filter = int(
                        raw_input("Enter the minimum word occurrence rate: "))
                candidates = get_candidate_words(c, -input_filter)

                print "Filter will remove", counts[
                    counts < input_filter].sum(), "tokens", "of these", len(
                        counts[counts < input_filter]), "words:"
                print ' '.join(candidates)

                print "\nFilter will remove", counts[
                    counts < input_filter].sum(), "tokens", "of these", len(
                        counts[counts < input_filter]), "words.",

                if len(candidates) == len(c.words):
                    print "\n\nChoice of", input_filter, "will remove ALL words from the corpus."
                    print "Please choose a different filter."
                    low_filter = 0
                    input_filter = 0
                else:
                    accept = None
                    while accept not in ['y', 'n']:
                        accept = raw_input(
                            "\nAccept filter? [y/n/[different min. number] ")
                        if isint(accept):
                            low_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            low_filter = input_filter
                        elif accept == 'n':
                            low_filter = 0

            except ValueError:
                input_filter = 0

    return (low_filter, candidates)
예제 #2
0
def get_low_filter(args, c):
    print "\n\n*** FILTER LOW FREQUENCY WORDS ***"
    items=itemfreq(c.corpus)
    counts = items[:,1]
    low_filter = False

    while not low_filter:
        bin_counts, bins = np.histogram(counts[counts.argsort()[::-1]], range=(0,len(c.words)/20.))
	#print "{0:>10s} {1:>10s}".format("# Tokens", "# Words")
	for bin, count in zip(bins[:-1], bin_counts):
	    print "{1:10.0f} words occur less than {0:10.0f} times".format(bin, count)
        print counts.sum(), "total occurrences"
	print len(c.words), "total words"
    
        input_filter = 0
        accept = None
        while not input_filter:
            try:
                if low_filter:
                    input_filter = low_filter
                else:
                    input_filter = int(raw_input("Enter the minimum word occurrence rate: "))
                candidates = get_candidate_words(c, -input_filter)
    
                print "Filter will remove", counts[counts < input_filter].sum(), "tokens", "of these", len(counts[counts < input_filter]), "words:"
                print ' '.join(candidates)

                print "\nFilter will remove", counts[counts < input_filter].sum(), "tokens", "of these", len(counts[counts < input_filter]), "words.",
                
                if len(candidates) == len(c.words):
                    print "\n\nChoice of",input_filter, "will remove ALL words from the corpus."
                    print "Please choose a different filter."
                    low_filter = 0
                    input_filter = 0
                else:
                    accept = None
                    while accept not in ['y', 'n']:
                        accept = raw_input("\nAccept filter? [y/n/[different min. number] ")
                        if isint(accept):
                            low_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            low_filter = input_filter
                        elif accept == 'n':
                            low_filter = 0
                        
            except ValueError:
                input_filter = 0 

    return (low_filter, candidates)
예제 #3
0
def get_low_filter(args, c, words=None):
    """Interactively choose the low-frequency cutoff for a corpus.

    Prints a histogram of how much of the corpus each candidate cutoff would
    remove, then prompts on stdin until the user accepts a positive cutoff.

    :param args: Not referenced in this function.
    :param c: Corpus object; ``c.corpus`` and ``c.words`` are read here.
    :param words: Passed through to ``get_mask`` and ``get_candidate_words``.
    :returns: Tuple ``(low_filter, candidates)``: the accepted cutoff and the
        candidate words for it, ordered by the count looked up for each word.
    """
    import numpy as np
    header = "FILTER LOW FREQUENCY WORDS"
    stars = old_div((80 - len(header) - 2), 2)
    print("\n\n{0} {1} {0}".format('*' * stars, header))
    print("    This will remove all words occurring less than N times.")
    print("    The histogram below shows how many words will be removed")
    print("    by selecting each minimum frequency threshold.\n")
    _, counts = get_items_counts(c.corpus)
    mask = get_mask(c, words)
    counts = counts[mask]
    total = float(counts.sum())
    descending = counts[counts.argsort()[::-1]]

    # Cumulative corpus shares at which a histogram edge is placed.
    quantiles = 1. - np.array([
        0., 0.025, 0.05, 0.075, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 1.0
    ])
    thresh = old_div(np.cumsum(descending), total)
    bins = sorted(
        set(descending[np.searchsorted(thresh, q)] for q in quantiles))

    low_filter = False
    while low_filter is False:
        bin_counts, bins = np.histogram(descending, bins=bins)
        print("{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format(
            "Rate", 'Bottom', '% of corpus', "# words", "Rate"))
        for edge, count in zip(bins[1:], np.cumsum(bin_counts)):
            if count:
                percentage = old_div(counts[counts <= edge].sum(), total)
                print("{0:>5.0f}x".format(edge - 1).rjust(8), end=' ')
                print('{0:2.1f}%'.format(percentage * 100).rjust(8), end=' ')
                print((u'\u2588' * int(percentage * 36)).ljust(36), end=' ')
                print("  {0:0.0f} words".format(count).rjust(14), end=' ')
                print("<= {0:>5.0f}x".format(edge - 1).ljust(8))

        print(' ' * 17,
              "{} total occurrences".format(counts.sum()).ljust(36),
              end=' ')
        print('{} words total'.format(mask.sum()).rjust(20))
        print('')

        input_filter = 0
        accept = None
        while not input_filter or input_filter <= 0:
            try:
                if low_filter:
                    # A number typed at the accept prompt is previewed next.
                    input_filter = low_filter
                else:
                    input_filter = int(
                        input("Enter the minimum rate: ").replace('x', ''))

                if input_filter <= 0:
                    # A non-positive cutoff can never be accepted by this
                    # loop, and negating it below would flip the sign handed
                    # to get_candidate_words; ask again instead of previewing.
                    low_filter = False
                    input_filter = 0
                    continue

                candidates = get_candidate_words(c, -input_filter, words=words)
                # NOTE(review): pairs candidates with matching indices
                # positionally; assumes candidates arrive in c.words order.
                places = np.in1d(c.words[mask], candidates)
                places = dict(zip(candidates, np.where(places)[0]))
                candidates = sorted(candidates,
                                    key=lambda x: counts[places[x]])

                # NOTE(review): the banner says "less than N" but the preview
                # counts words occurring <= N times; confirm which is meant.
                removed = counts[counts <= input_filter]

                print("Filter will remove", removed.sum(), "tokens", end=' ')
                print("of these", len(removed), "words:")
                print(u' '.join(candidates))

                print("\nFilter will remove", removed.sum(), "tokens",
                      end=' ')
                print("of these", len(removed), "words.", end=' ')

                if len(candidates) == len(c.words):
                    print("\n\nChoice of", input_filter,
                          "will remove ALL words from the corpus.")
                    print("Please choose a different filter.")
                    low_filter = False
                    input_filter = 0
                else:
                    accept = None
                    while accept not in ['y', 'n']:
                        accept = input(
                            "\nAccept filter? [y/n/[different min. number] ")
                        if isint(accept):
                            low_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            low_filter = input_filter
                        elif accept == 'n':
                            low_filter = False

            except ValueError:
                # int() rejected the input (or a callee raised ValueError).
                input_filter = 0

    return (low_filter, candidates)
예제 #4
0
def get_low_filter(c, words=None, items=None, counts=None):
    """Interactively choose the low-frequency cutoff for a corpus.

    Prints a histogram of candidate cutoffs, then prompts on stdin until the
    user accepts a positive cutoff.

    :param c: Corpus object; ``c.words`` and ``c.original_length`` are read
        here, and ``c`` is passed to the helper functions.
    :param words: Passed through to ``get_mask`` and ``get_candidate_words``.
    :param items: Passed to ``get_candidate_words``; fetched together with
        ``counts`` from ``get_corpus_counts(c)`` when either is ``None``.
    :param counts: Per-word counts array (see ``items``).
    :returns: Tuple ``(low_filter, candidates)``: the accepted cutoff and the
        candidate words for it.
    """
    import numpy as np
    header = "FILTER LOW FREQUENCY WORDS"
    stars = old_div((80 - len(header) - 2), 2)
    print("\n\n{0} {1} {0}".format('*' * stars, header))
    print("    This will remove all words occurring less than N times.")
    print("    The histogram below shows how many words will be removed")
    print("    by selecting each minimum frequency threshold.\n")

    # Get frequency bins
    if items is None or counts is None:
        items, counts = get_corpus_counts(c)
    bins = np.arange(1.0, -0.01, -0.025)
    bins = [
        get_closest_bin(c, thresh, reverse=True, counts=counts)
        for thresh in bins
    ]
    bins = sorted(set(bins))
    # Append the largest count as the final bin edge.
    bins.append(max(counts))

    low_filter = False
    while low_filter is False:
        # np.histogram returns the edges too, so `bins` is rebound from the
        # list built above to the array it returns.
        bin_counts, bins = np.histogram(counts[counts.argsort()[::-1]],
                                        bins=bins)
        # print "{0:>10s} {1:>10s}".format("# Tokens", "# Words")
        print("{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format(
            "Rate", 'Bottom', '% of corpus', "# words", "Rate"))

        last_row = 0
        # `count` is never read in the loop body.
        # NOTE(review): `bin` shadows the builtin of the same name.
        for bin, count in zip(bins, np.cumsum(bin_counts)):
            # Recomputed on every iteration although none of its inputs are
            # reassigned inside this loop.
            filtered_counts = counts[get_mask(c, words)]
            # NOTE(review): last_row holds a count of words >= the previous
            # edge but is compared against a count of words < this edge --
            # confirm this is the intended row-suppression rule.
            if last_row < (filtered_counts <
                           bin).sum() <= len(filtered_counts):
                # The share uses the unmasked counts and c.original_length,
                # whereas the word tally printed beside it uses the masked
                # counts.
                percentage = (old_div(counts[counts <= bin].sum(),
                                      float(c.original_length)))
                print("{0:>5.0f}x".format(bin).rjust(8), end=' ')
                print('{0:2.1f}%'.format(percentage * 100).rjust(8), end=' ')
                print((u'\u2588' * int(percentage * 36)).ljust(36), end=' ')
                print("  {0:0.0f} words".format(
                    (filtered_counts <= bin).sum()).rjust(14),
                      end=' ')
                print("<= {0:>5.0f}x".format(bin).ljust(8))
                # Stop once every masked word falls below the current edge.
                if (filtered_counts < bin).sum() == len(filtered_counts):
                    break
            last_row = (filtered_counts >= bin).sum()

        print(' ' * 17,
              "{} total occurrences".format(counts.sum()).ljust(36),
              end=' ')
        print('{} words total'.format(get_mask(c, words).sum()).rjust(20))
        print('')

        input_filter = 0
        accept = None
        # Repeat until a positive integer cutoff has been previewed.
        while not input_filter or input_filter <= 0:
            try:
                if low_filter:
                    # Set when a number was typed at the accept prompt.
                    input_filter = low_filter
                else:
                    input_filter = int(
                        input("Enter the minimum rate: ").replace('x', ''))

                # The cutoff is passed negated.
                # NOTE(review): presumably a negative value selects
                # low-frequency words in get_candidate_words -- confirm.
                candidates = get_candidate_words(c,
                                                 -input_filter,
                                                 words=words,
                                                 items=items,
                                                 counts=counts)
                # NOTE(review): candidates are paired positionally with the
                # matching c.words indices; this assumes candidates come back
                # in c.words order -- confirm.
                places = np.in1d(c.words, candidates)
                places = dict(zip(candidates, np.where(places)[0]))
                candidates = sorted(candidates,
                                    key=lambda x: counts[places[x]])
                filtered_counts = counts[get_mask(c, words)]

                # NOTE(review): the banner says "less than N times" but the
                # preview below counts words occurring <= N times.
                print("Filter will remove",
                      filtered_counts[filtered_counts <= input_filter].sum(),
                      "tokens",
                      end=' ')
                print("of these",
                      len(filtered_counts[filtered_counts <= input_filter]),
                      "words:")
                print(u' '.join(candidates))

                # The summary is repeated after the word list.
                print("\nFilter will remove",
                      filtered_counts[filtered_counts <= input_filter].sum(),
                      "tokens",
                      end=' ')
                print("of these",
                      len(filtered_counts[filtered_counts <= input_filter]),
                      "words.",
                      end=' ')

                if len(candidates) == len(c.words):
                    print("\n\nChoice of", input_filter,
                          "will remove ALL words from the corpus.")
                    print("Please choose a different filter.")
                    # Resetting both values makes the loop prompt again.
                    low_filter = 0
                    input_filter = 0
                else:
                    accept = None
                    while accept not in ['y', 'n']:
                        accept = input(
                            "\nAccept filter? [y/n/[different min. number] ")
                        if isint(accept):
                            # Preview the newly typed cutoff on the next pass
                            # of the enclosing loop.
                            low_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            low_filter = input_filter
                        elif accept == 'n':
                            # False sends control back to the histogram.
                            low_filter = False

            except ValueError:
                # Raised by int() on non-numeric input; also catches a
                # ValueError from anything else in the try body.
                input_filter = 0

    return (low_filter, candidates)
예제 #5
0
def get_high_filter(args, c, words=None):
    """Interactively choose the high-frequency cutoff for a corpus.

    Prints a histogram of how much of the corpus each candidate cutoff would
    remove, then prompts on stdin until the user accepts a positive cutoff.

    :param args: Not referenced in this function.
    :param c: Corpus object; ``c.corpus`` and ``c.words`` are read here.
    :param words: Passed through to ``get_mask`` and ``get_candidate_words``.
    :returns: Tuple ``(high_filter, candidates)``: the accepted cutoff and the
        candidate words for it, ordered by the count looked up for each word,
        largest first.
    """
    import numpy as np
    header = "FILTER HIGH FREQUENCY WORDS"
    stars = old_div((80 - len(header) - 2), 2)
    print("\n\n{0} {1} {0}".format('*' * stars, header))
    print("    This will remove all words occurring more than N times.")
    print("    The histogram below shows how many words will be removed")
    print("    by selecting each maximum frequency threshold.\n")
    _, counts = get_items_counts(c.corpus)
    mask = get_mask(c, words)
    counts = counts[mask]
    total = float(counts.sum())
    ascending = counts[counts.argsort()]
    high_filter = False

    # Cumulative corpus shares at which a histogram edge is placed.
    quantiles = 1. - np.array(
        [0., 0.025, 0.05, 0.075, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 1.0])
    thresh = np.cumsum(ascending) / total
    bins = sorted(
        set(ascending[np.searchsorted(thresh, q)] for q in quantiles))
    # Append the largest count as the final bin edge.
    bins.append(max(counts))

    while not high_filter:
        bin_counts, bins = np.histogram(counts, bins=bins)
        print("{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format("Rate", 'Top', '% of corpus',
                                                                 "# words", "Rate"))
        # Walk the edges from the highest downwards, accumulating word counts.
        for edge, count in zip(bins[-2::-1], np.cumsum(bin_counts[::-1])):
            if count:
                percentage = 1. - old_div(counts[counts < edge].sum(), total)
                print("{0:>5.0f}x".format(edge - 1).rjust(8), end=' ')
                print('{0:2.1f}%'.format(percentage * 100).rjust(8), end=' ')
                print((u'\u2588' * int(percentage * 36)).ljust(36), end=' ')
                print("  {0:0.0f} words".format(count).rjust(14), end=' ')
                print("> {0:>5.0f}x".format(edge - 1).ljust(8))

        print(' ' * 17, "{} total occurrences".format(counts.sum()).ljust(36), end=' ')
        print('{} words total'.format(mask.sum()).rjust(20))
        print('')

        input_filter = 0
        accept = None
        while not input_filter or input_filter <= 0:
            try:
                if high_filter:
                    # A number typed at the accept prompt is previewed next.
                    input_filter = high_filter
                else:
                    input_filter = int(input("Enter the maximum rate: ").replace('x', ''))

                if input_filter <= 0:
                    # A non-positive cutoff can never be accepted by this
                    # loop; ask again instead of previewing it.
                    high_filter = False
                    input_filter = 0
                    continue

                candidates = get_candidate_words(c, input_filter, words=words)
                # NOTE(review): pairs candidates with matching indices
                # positionally; assumes candidates arrive in c.words order.
                places = np.in1d(c.words[mask], candidates)
                places = dict(zip(candidates, np.where(places)[0]))
                candidates = sorted(candidates, key=lambda x: counts[places[x]], reverse=True)

                removed = counts[counts > input_filter]

                print("Filter will remove", removed.sum(), end=' ')
                print("occurrences", "of these", len(removed), "words:")
                # Round-trip through the console encoding so unencodable
                # characters are replaced, but print text rather than a bytes
                # object (which Python 3 would render as b'...').
                encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
                listing = u' '.join(candidates)
                print(listing.encode(encoding, errors='replace').decode(encoding))

                print("\nFilter will remove", removed.sum(), end=' ')
                print("occurrences", "of these", len(removed), "words.", end=' ')
                if len(candidates) == len(c.words):
                    print("\n\nChoice of", input_filter, "will remove ALL words from the corpus.")
                    print("Please choose a different filter.")
                    high_filter = 0
                    input_filter = 0
                else:
                    accept = None
                    while accept not in ['y', 'n']:
                        accept = input("\nAccept filter? [y/n/[different max number]] ")
                        if isint(accept):
                            high_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            high_filter = input_filter
                        elif accept == 'n':
                            high_filter = 0

            except ValueError:
                # int() rejected the input (or a callee raised ValueError).
                input_filter = 0
    return (high_filter, candidates)
예제 #6
0
def get_low_filter(c, words=None, items=None, counts=None):
    """Prompt the user for a low-frequency cutoff and return it.

    Shows a histogram of candidate cutoffs, previews what each proposed
    cutoff would remove, and loops until the user answers ``y``.

    :param c: Corpus object; ``c.words`` and ``c.original_length`` are read
        here, and ``c`` is passed to the helper functions.
    :param words: Passed through to ``get_mask`` and ``get_candidate_words``.
    :param items: Passed to ``get_candidate_words``; fetched together with
        ``counts`` from ``get_corpus_counts(c)`` when either is ``None``.
    :param counts: Per-word counts array (see ``items``).
    :returns: Tuple ``(low_filter, candidates)``.
    """
    import numpy as np
    banner = "FILTER LOW FREQUENCY WORDS"
    pad = old_div((80 - len(banner) - 2), 2)
    print("\n\n{0} {1} {0}".format('*' * pad, banner))
    for intro_line in (
            "    This will remove all words occurring less than N times.",
            "    The histogram below shows how many words will be removed",
            "    by selecting each minimum frequency threshold.\n"):
        print(intro_line)

    # Build the histogram edges.
    if counts is None or items is None:
        items, counts = get_corpus_counts(c)
    edges = set()
    for quantile in np.arange(1.0, -0.01, -0.025):
        edges.add(get_closest_bin(c, quantile, reverse=True, counts=counts))
    bins = sorted(edges)
    bins.append(max(counts))

    column_titles = ("Rate", 'Bottom', '% of corpus', "# words", "Rate")
    low_filter = False
    while low_filter is False:
        descending = counts[counts.argsort()[::-1]]
        bin_counts, bins = np.histogram(descending, bins=bins)
        print("{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format(*column_titles))

        last_row = 0
        for edge, _running_total in zip(bins, np.cumsum(bin_counts)):
            filtered_counts = counts[get_mask(c, words)]
            n_filtered = len(filtered_counts)
            n_below = (filtered_counts < edge).sum()
            if last_row < n_below <= n_filtered:
                share = old_div(counts[counts <= edge].sum(),
                                float(c.original_length))
                bar = u'\u2588' * int(share * 36)
                print("{0:>5.0f}x".format(edge).rjust(8), end=' ')
                print('{0:2.1f}%'.format(share * 100).rjust(8), end=' ')
                print(bar.ljust(36), end=' ')
                n_at_or_below = (filtered_counts <= edge).sum()
                print("  {0:0.0f} words".format(n_at_or_below).rjust(14), end=' ')
                print("<= {0:>5.0f}x".format(edge).ljust(8))
                if n_below == n_filtered:
                    break
            last_row = (filtered_counts >= edge).sum()

        print(' ' * 17, "{} total occurrences".format(counts.sum()).ljust(36), end=' ')
        print('{} words total'.format(get_mask(c, words).sum()).rjust(20))
        print('')

        input_filter = 0
        accept = None
        while not (input_filter and input_filter > 0):
            try:
                if low_filter:
                    input_filter = low_filter
                else:
                    reply = input("Enter the minimum rate: ")
                    input_filter = int(reply.replace('x', ''))

                candidates = get_candidate_words(
                    c, -input_filter, words=words, items=items, counts=counts)
                hits = np.in1d(c.words, candidates)
                places = dict(zip(candidates, np.where(hits)[0]))
                candidates = sorted(candidates,
                                    key=lambda word: counts[places[word]])
                filtered_counts = counts[get_mask(c, words)]
                doomed = filtered_counts[filtered_counts <= input_filter]

                print("Filter will remove", doomed.sum(), "tokens", end=' ')
                print("of these", len(doomed), "words:")
                print(u' '.join(candidates))

                print("\nFilter will remove", doomed.sum(), "tokens", end=' ')
                print("of these", len(doomed), "words.", end=' ')

                if len(candidates) != len(c.words):
                    accept = None
                    while accept not in ('y', 'n'):
                        accept = input("\nAccept filter? [y/n/[different min. number] ")
                        if isint(accept):
                            # Preview the newly typed cutoff on the next pass.
                            low_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            low_filter = input_filter
                        elif accept == 'n':
                            low_filter = False
                else:
                    print("\n\nChoice of", input_filter, "will remove ALL words from the corpus.")
                    print("Please choose a different filter.")
                    low_filter = 0
                    input_filter = 0

            except ValueError:
                # Non-numeric input (or any ValueError raised above): ask again.
                input_filter = 0

    return (low_filter, candidates)
예제 #7
0
def get_low_filter(args, c, words=None):
    import numpy as np
    header = "FILTER LOW FREQUENCY WORDS" 
    stars = (80 - len(header) - 2) / 2
    print "\n\n{0} {1} {0}".format('*'*stars, header)
    print "    This will remove all words occurring less than N times."
    print "    The histogram below shows how many words will be removed"
    print "    by selecting each minimum frequency threshold.\n"
    items, counts = get_items_counts(c.corpus)
    items = items[get_mask(c, words)] 
    counts = counts[get_mask(c, words)] 
   
    bins = np.linspace(0, 1.0, 11)
    bins = 1. - np.array([0., 0.025, 0.05, 0.075, 0.1, 0.15,0.20,0.25,0.3, 0.35, 0.4, 0.5, 1.0])

    thresh = np.cumsum(counts[counts.argsort()[::-1]]) / float(counts.sum())
    bins = [counts[counts.argsort()[::-1]][np.searchsorted(thresh, bin)] for bin in bins]
    bins = sorted(set(bins))

    low_filter = False
    while low_filter is False:
        bin_counts, bins = np.histogram(counts[counts.argsort()[::-1]], bins=bins)
	#print "{0:>10s} {1:>10s}".format("# Tokens", "# Words")
	print "{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format("Rate", 'Bottom', '% of corpus', "# words", "Rate")
	for bin, count in zip(bins[1:], np.cumsum(bin_counts)):
            if count:
                percentage = (counts[counts <= bin].sum() / float(counts.sum()))
                print "{0:>5.0f}x".format(bin-1).rjust(8),
                print '{0:2.1f}%'.format(percentage *100).rjust(8),
                print (u'\u2588' * (percentage * 36)).ljust(36),
                print "  {0:0.0f} words".format(count).rjust(14),
                print "<= {0:>5.0f}x".format(bin-1).ljust(8)

	print ' ' * 17, "{} total occurrences".format(counts.sum()).ljust(36), '{} words total'.format(get_mask(c, words).sum()).rjust(20)
        print ''
    
        input_filter = 0
        accept = None
        while not input_filter or input_filter <= 0:
            try:
                if low_filter:
                    input_filter = low_filter
                else:
                    input_filter = int(raw_input("Enter the minimum rate: ").replace('x',''))

                candidates = get_candidate_words(c, -input_filter, words=words)
                places = np.in1d(c.words[get_mask(c, words)], candidates)
                places = dict(zip(candidates, np.where(places)[0]))
                candidates = sorted(candidates, key=lambda x: counts[places[x]])
    
                print "Filter will remove", counts[counts <= input_filter].sum(), "tokens",
                print "of these", len(counts[counts <= input_filter]), "words:"
                print u' '.join(candidates).encode(
                    sys.stdout.encoding, errors='replace')

                print "\nFilter will remove", counts[counts <= input_filter].sum(), "tokens", 
                print "of these", len(counts[counts <= input_filter]), "words.",
                
                if len(candidates) == len(c.words):
                    print "\n\nChoice of",input_filter, "will remove ALL words from the corpus."
                    print "Please choose a different filter."
                    low_filter = 0
                    input_filter = 0
                else:
                    accept = None
                    while accept not in ['y', 'n']:
                        accept = raw_input("\nAccept filter? [y/n/[different min. number] ")
                        if isint(accept):
                            low_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            low_filter = input_filter
                        elif accept == 'n':
                            low_filter = False 
                        
            except ValueError:
                input_filter = 0 

    return (low_filter, candidates)
예제 #8
0
def get_high_filter(args, c, words=None):
    """Interactively choose the high-frequency cutoff for a corpus.

    Prompts on stdin until the user accepts a positive cutoff. Most of the
    histogram and summary output in this variant is commented out.

    :param args: Not referenced in this function.
    :param c: Corpus object; ``c.corpus`` and ``c.words`` are read here.
    :param words: Passed through to ``get_mask`` and ``get_candidate_words``.
    :returns: Tuple ``(high_filter, candidates)``: the accepted cutoff and the
        candidate words for it.
    """
    import numpy as np
    header = "FILTER HIGH FREQUENCY WORDS"
    stars = old_div((80 - len(header) - 2), 2)
    print("\n\n{0} {1} {0}".format('*' * stars, header))
    print("    This will remove all words occurring more than N times.")
    print("    The histogram below shows how many words will be removed")
    print("    by selecting each maximum frequency threshold.\n")
    items, counts = get_items_counts(c.corpus)
    # `items` is masked here but not read again in this function.
    items = items[get_mask(c, words)]
    counts = counts[get_mask(c, words)]
    high_filter = False
    bins = np.array([0., 0.025, 0.05, 0.075, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 1.0])
    bins = 1. - bins

    # Cumulative share of occurrences with counts in ascending order; each
    # quantile above is mapped to the count found at that position.
    thresh = np.cumsum(counts[counts.argsort()]) / float(counts.sum())
    bins = [counts[counts.argsort()][np.searchsorted(thresh, bin)] for bin in bins]
    bins = sorted(set(bins))
    # Append the largest count as the final bin edge.
    bins.append(max(counts))

    while not high_filter:
        bin_counts, bins = np.histogram(counts, bins=bins)
        print("{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format("Rate", 'Top', '% of corpus',
                                                                 "# words", "Rate"))
        # NOTE(review): with the prints below commented out this loop only
        # computes `percentage` and discards it, so the table header above is
        # printed without any rows -- confirm whether the rows should be
        # restored or the loop removed.
        for bin, count in zip(bins[-2::-1], np.cumsum(bin_counts[::-1])):
            if count:
                percentage = 1. - (old_div(counts[counts < bin].sum(), float(counts.sum())))
                #print("{0:>5.0f}x".format(bin - 1).rjust(8), end=' ')
                #print('{0:2.1f}%'.format(percentage * 100).rjust(8), end=' ')
                #print((u'\u2588' * int(percentage * 36)).ljust(36), end=' ')
                #print("  {0:0.0f} words".format(count).rjust(14), end=' ')
                #print("> {0:>5.0f}x".format(bin - 1).ljust(8))

        #print(' ' * 17, "{} total occurrences".format(counts.sum()).ljust(36), end=' ')
        print('{} words total'.format(get_mask(c, words).sum()).rjust(20))
        print('')

        input_filter = 0
        accept = None
        # Repeat until a positive integer cutoff has been previewed.
        while not input_filter or input_filter <= 0:
            try:
                if high_filter:
                    # Set when a number was typed at the accept prompt.
                    input_filter = high_filter
                else:
                    input_filter = int(input("Enter the maximum rate: ").replace('x', ''))
                candidates = get_candidate_words(c, input_filter, words=words)
                # NOTE(review): candidates are paired positionally with the
                # matching indices; this assumes candidates come back in
                # c.words order -- confirm.
                places = np.in1d(c.words[get_mask(c, words)], candidates)
                places = dict(zip(candidates, np.where(places)[0]))
                candidates = sorted(candidates, key=lambda x: counts[places[x]], reverse=True)

                # NOTE(review): the first half of this message is commented
                # out, so the line below starts mid-sentence with
                # "occurrences of these ...".
                #print("Filter will remove", counts[counts > input_filter].sum(), end=' ')
                print("occurrences", "of these", len(counts[counts > input_filter]), "words:")
                print(u' '.join(candidates))

                #print("\nFilter will remove", counts[counts > input_filter].sum(), end=' ')
                #print("occurrences", "of these", len(counts[counts > input_filter]), "words.", end=' ')
                if len(candidates) == len(c.words):
                    print("\n\nChoice of", input_filter, "will remove ALL words from the corpus.")
                    print("Please choose a different filter.")
                    # Resetting both values makes the loop prompt again.
                    high_filter = 0
                    input_filter = 0
                else:
                    accept = None
                    while accept not in ['y', 'n']:
                        accept = input("\nAccept filter? [y/n/[different max number]] ")
                        if isint(accept):
                            # Preview the newly typed cutoff on the next pass
                            # of the enclosing loop.
                            high_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            high_filter = input_filter
                        elif accept == 'n':
                            # A falsy value sends control back to the outer
                            # loop.
                            high_filter = 0

            except ValueError:
                # Raised by int() on non-numeric input; also catches a
                # ValueError from anything else in the try body.
                input_filter = 0
    return (high_filter, candidates)