def get_low_filter(args, c): print "\n\n*** FILTER LOW FREQUENCY WORDS ***" items = itemfreq(c.corpus) counts = items[:, 1] low_filter = False while not low_filter: bin_counts, bins = np.histogram(counts[counts.argsort()[::-1]], range=(0, len(c.words) / 20.)) #print "{0:>10s} {1:>10s}".format("# Tokens", "# Words") for bin, count in zip(bins[:-1], bin_counts): print "{1:10.0f} words occur less than {0:10.0f} times".format( bin, count) print counts.sum(), "total occurrences" print len(c.words), "total words" input_filter = 0 accept = None while not input_filter: try: if low_filter: input_filter = low_filter else: input_filter = int( raw_input("Enter the minimum word occurrence rate: ")) candidates = get_candidate_words(c, -input_filter) print "Filter will remove", counts[ counts < input_filter].sum(), "tokens", "of these", len( counts[counts < input_filter]), "words:" print ' '.join(candidates) print "\nFilter will remove", counts[ counts < input_filter].sum(), "tokens", "of these", len( counts[counts < input_filter]), "words.", if len(candidates) == len(c.words): print "\n\nChoice of", input_filter, "will remove ALL words from the corpus." print "Please choose a different filter." low_filter = 0 input_filter = 0 else: accept = None while accept not in ['y', 'n']: accept = raw_input( "\nAccept filter? [y/n/[different min. number] ") if isint(accept): low_filter = int(accept) input_filter = 0 accept = 'n' elif accept == 'y': low_filter = input_filter elif accept == 'n': low_filter = 0 except ValueError: input_filter = 0 return (low_filter, candidates)
def get_low_filter(args, c): print "\n\n*** FILTER LOW FREQUENCY WORDS ***" items=itemfreq(c.corpus) counts = items[:,1] low_filter = False while not low_filter: bin_counts, bins = np.histogram(counts[counts.argsort()[::-1]], range=(0,len(c.words)/20.)) #print "{0:>10s} {1:>10s}".format("# Tokens", "# Words") for bin, count in zip(bins[:-1], bin_counts): print "{1:10.0f} words occur less than {0:10.0f} times".format(bin, count) print counts.sum(), "total occurrences" print len(c.words), "total words" input_filter = 0 accept = None while not input_filter: try: if low_filter: input_filter = low_filter else: input_filter = int(raw_input("Enter the minimum word occurrence rate: ")) candidates = get_candidate_words(c, -input_filter) print "Filter will remove", counts[counts < input_filter].sum(), "tokens", "of these", len(counts[counts < input_filter]), "words:" print ' '.join(candidates) print "\nFilter will remove", counts[counts < input_filter].sum(), "tokens", "of these", len(counts[counts < input_filter]), "words.", if len(candidates) == len(c.words): print "\n\nChoice of",input_filter, "will remove ALL words from the corpus." print "Please choose a different filter." low_filter = 0 input_filter = 0 else: accept = None while accept not in ['y', 'n']: accept = raw_input("\nAccept filter? [y/n/[different min. number] ") if isint(accept): low_filter = int(accept) input_filter = 0 accept = 'n' elif accept == 'y': low_filter = input_filter elif accept == 'n': low_filter = 0 except ValueError: input_filter = 0 return (low_filter, candidates)
def get_low_filter(args, c, words=None):
    """Interactively choose a minimum word-frequency threshold.

    Prints a cumulative histogram of the masked word counts, reads a
    threshold from stdin, previews the words ``get_candidate_words``
    selects for it (least frequent first) and repeats until the user
    accepts with ``y``.

    Args:
        args: Not used by this function.
        c: Corpus object; ``c.corpus`` and ``c.words`` are read.
        words: Passed through to ``get_mask`` and ``get_candidate_words``.

    Returns:
        Tuple ``(low_filter, candidates)``: the accepted threshold and the
        sorted candidate word list for it.
    """
    import numpy as np

    header = "FILTER LOW FREQUENCY WORDS"
    stars = old_div((80 - len(header) - 2), 2)
    print("\n\n{0} {1} {0}".format('*' * stars, header))
    print(" This will remove all words occurring less than N times.")
    print(" The histogram below shows how many words will be removed")
    print(" by selecting each minimum frequency threshold.\n")

    _, counts = get_items_counts(c.corpus)
    # Computed once and reused below.
    # NOTE(review): assumes get_mask(c, words) returns the same mask on
    # every call — confirm.
    mask = get_mask(c, words)
    counts = counts[mask]

    # Loop-invariant values, hoisted out of the redraw loop.
    descending = counts[counts.argsort()[::-1]]
    total = float(counts.sum())

    # Turn cumulative-share cut points into concrete count values.
    shares = 1. - np.array([
        0., 0.025, 0.05, 0.075, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5,
        1.0
    ])
    thresh = old_div(np.cumsum(descending), total)
    bins = sorted(set(descending[np.searchsorted(thresh, share)]
                      for share in shares))

    low_filter = False
    while low_filter is False:
        bin_counts, bins = np.histogram(descending, bins=bins)
        print("{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format(
            "Rate", 'Bottom', '% of corpus', "# words", "Rate"))
        for edge, n_words in zip(bins[1:], np.cumsum(bin_counts)):
            if n_words:
                percentage = old_div(counts[counts <= edge].sum(), total)
                print("{0:>5.0f}x".format(edge - 1).rjust(8), end=' ')
                print('{0:2.1f}%'.format(percentage * 100).rjust(8), end=' ')
                print((u'\u2588' * int(percentage * 36)).ljust(36), end=' ')
                print(" {0:0.0f} words".format(n_words).rjust(14), end=' ')
                print("<= {0:>5.0f}x".format(edge - 1).ljust(8))

        print(' ' * 17,
              "{} total occurrences".format(counts.sum()).ljust(36), end=' ')
        print('{} words total'.format(mask.sum()).rjust(20))
        print('')

        input_filter = 0
        while input_filter <= 0:
            try:
                if low_filter:
                    # A number typed at the accept prompt is carried over.
                    input_filter = low_filter
                else:
                    input_filter = int(
                        input("Enter the minimum rate: ").replace('x', ''))

                if input_filter <= 0:
                    # The threshold is negated below; zero or a negative
                    # number would be previewed with the wrong sign, so
                    # reject it up front.
                    print("Please enter a whole number greater than zero.")
                    low_filter = False
                    input_filter = 0
                    continue

                candidates = get_candidate_words(c, -input_filter,
                                                 words=words)
                # NOTE(review): candidates are paired with matching
                # positions in order; this presumably relies on
                # get_candidate_words returning words in c.words order —
                # confirm.
                hits = np.in1d(c.words[mask], candidates)
                position = dict(zip(candidates, np.where(hits)[0]))
                candidates = sorted(candidates,
                                    key=lambda w: counts[position[w]])

                removed = counts[counts <= input_filter]
                print("Filter will remove", removed.sum(), "tokens", end=' ')
                print("of these", len(removed), "words:")
                print(u' '.join(candidates))
                print("\nFilter will remove", removed.sum(), "tokens",
                      end=' ')
                print("of these", len(removed), "words.", end=' ')

                if len(candidates) == len(c.words):
                    print("\n\nChoice of", input_filter,
                          "will remove ALL words from the corpus.")
                    print("Please choose a different filter.")
                    low_filter = False
                    input_filter = 0
                else:
                    accept = None
                    while accept not in ['y', 'n']:
                        accept = input(
                            "\nAccept filter? [y/n/[different min. number] ")
                        if isint(accept):
                            low_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            low_filter = input_filter
                        elif accept == 'n':
                            low_filter = False
            except ValueError:
                # Drop any carried-over value too, so one that keeps raising
                # is not retried forever without prompting.
                low_filter = False
                input_filter = 0

    return (low_filter, candidates)
def get_low_filter(c, words=None, items=None, counts=None):
    """Interactively choose a minimum word-frequency threshold.

    Prints a histogram of how much of the corpus each threshold would
    remove, reads a threshold from stdin, previews the words
    ``get_candidate_words`` selects for it (least frequent first) and
    repeats until the user accepts with ``y``.

    Args:
        c: Corpus object; ``c.words`` and ``c.original_length`` are read.
        words: Passed through to ``get_mask`` and ``get_candidate_words``.
        items: Optional precomputed items; fetched with
            ``get_corpus_counts(c)`` when either ``items`` or ``counts``
            is ``None``.
        counts: Optional precomputed per-word counts (see ``items``).

    Returns:
        Tuple ``(low_filter, candidates)``: the accepted threshold and the
        sorted candidate word list for it.
    """
    import numpy as np

    header = "FILTER LOW FREQUENCY WORDS"
    stars = old_div((80 - len(header) - 2), 2)
    print("\n\n{0} {1} {0}".format('*' * stars, header))
    print(" This will remove all words occurring less than N times.")
    print(" The histogram below shows how many words will be removed")
    print(" by selecting each minimum frequency threshold.\n")

    # Get frequency bins
    if items is None or counts is None:
        items, counts = get_corpus_counts(c)
    bins = np.arange(1.0, -0.01, -0.025)
    bins = [get_closest_bin(c, thresh, reverse=True, counts=counts)
            for thresh in bins]
    bins = sorted(set(bins))
    bins.append(max(counts))

    # Loop-invariant values, computed once instead of per histogram row.
    # NOTE(review): assumes get_mask(c, words) returns the same mask on
    # every call — confirm.
    mask = get_mask(c, words)
    filtered_counts = counts[mask]
    n_filtered = len(filtered_counts)
    descending = counts[counts.argsort()[::-1]]

    low_filter = False
    while low_filter is False:
        # Only the returned edges are used; the per-bin tallies are not.
        _, bins = np.histogram(descending, bins=bins)
        print("{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format(
            "Rate", 'Bottom', '% of corpus', "# words", "Rate"))

        last_row = 0
        # The last edge is skipped, as when zipping edges with bin tallies.
        for edge in bins[:-1]:
            below = (filtered_counts < edge).sum()
            # NOTE(review): last_row holds the number of words at or above
            # the previous edge but is compared with the number below this
            # one — confirm that is the intended de-duplication rule.
            if last_row < below:
                percentage = old_div(counts[counts <= edge].sum(),
                                     float(c.original_length))
                print("{0:>5.0f}x".format(edge).rjust(8), end=' ')
                print('{0:2.1f}%'.format(percentage * 100).rjust(8), end=' ')
                print((u'\u2588' * int(percentage * 36)).ljust(36), end=' ')
                print(" {0:0.0f} words".format(
                    (filtered_counts <= edge).sum()).rjust(14), end=' ')
                print("<= {0:>5.0f}x".format(edge).ljust(8))
            if below == n_filtered:
                # Every selected word is already below this edge.
                break
            last_row = (filtered_counts >= edge).sum()

        print(' ' * 17,
              "{} total occurrences".format(counts.sum()).ljust(36), end=' ')
        print('{} words total'.format(mask.sum()).rjust(20))
        print('')

        input_filter = 0
        while input_filter <= 0:
            try:
                if low_filter:
                    # A number typed at the accept prompt is carried over.
                    input_filter = low_filter
                else:
                    input_filter = int(
                        input("Enter the minimum rate: ").replace('x', ''))

                if input_filter <= 0:
                    # The threshold is negated below; zero or a negative
                    # number would be previewed with the wrong sign, so
                    # reject it up front.
                    print("Please enter a whole number greater than zero.")
                    low_filter = False
                    input_filter = 0
                    continue

                candidates = get_candidate_words(c, -input_filter,
                                                 words=words, items=items,
                                                 counts=counts)
                # NOTE(review): candidates are paired with matching
                # positions in order; this presumably relies on
                # get_candidate_words returning words in c.words order —
                # confirm.
                hits = np.in1d(c.words, candidates)
                position = dict(zip(candidates, np.where(hits)[0]))
                candidates = sorted(candidates,
                                    key=lambda w: counts[position[w]])

                removed = filtered_counts[filtered_counts <= input_filter]
                print("Filter will remove", removed.sum(), "tokens", end=' ')
                print("of these", len(removed), "words:")
                print(u' '.join(candidates))
                print("\nFilter will remove", removed.sum(), "tokens",
                      end=' ')
                print("of these", len(removed), "words.", end=' ')

                if len(candidates) == len(c.words):
                    print("\n\nChoice of", input_filter,
                          "will remove ALL words from the corpus.")
                    print("Please choose a different filter.")
                    low_filter = False
                    input_filter = 0
                else:
                    accept = None
                    while accept not in ['y', 'n']:
                        accept = input(
                            "\nAccept filter? [y/n/[different min. number] ")
                        if isint(accept):
                            low_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            low_filter = input_filter
                        elif accept == 'n':
                            low_filter = False
            except ValueError:
                # Drop any carried-over value too, so one that keeps raising
                # is not retried forever without prompting.
                low_filter = False
                input_filter = 0

    return (low_filter, candidates)
def get_high_filter(args, c, words=None):
    """Interactively choose a maximum word-frequency threshold.

    Prints a cumulative histogram of the masked word counts, reads a
    threshold from stdin, previews the words ``get_candidate_words``
    selects for it (most frequent first) and repeats until the user
    accepts with ``y``.

    Args:
        args: Not used by this function.
        c: Corpus object; ``c.corpus`` and ``c.words`` are read.
        words: Passed through to ``get_mask`` and ``get_candidate_words``.

    Returns:
        Tuple ``(high_filter, candidates)``: the accepted threshold and the
        sorted candidate word list for it.
    """
    import numpy as np

    header = "FILTER HIGH FREQUENCY WORDS"
    stars = old_div((80 - len(header) - 2), 2)
    print("\n\n{0} {1} {0}".format('*' * stars, header))
    print(" This will remove all words occurring more than N times.")
    print(" The histogram below shows how many words will be removed")
    print(" by selecting each maximum frequency threshold.\n")

    _, counts = get_items_counts(c.corpus)
    # Computed once and reused below.
    # NOTE(review): assumes get_mask(c, words) returns the same mask on
    # every call — confirm.
    mask = get_mask(c, words)
    counts = counts[mask]

    # Loop-invariant values, hoisted out of the redraw loop.
    ascending = counts[counts.argsort()]
    total = float(counts.sum())

    # Turn cumulative-share cut points into concrete count values.
    shares = 1. - np.array([0., 0.025, 0.05, 0.075, 0.1, 0.15, 0.20, 0.25,
                            0.3, 0.35, 0.4, 0.5, 1.0])
    thresh = np.cumsum(ascending) / total
    bins = sorted(set(ascending[np.searchsorted(thresh, share)]
                      for share in shares))
    bins.append(max(counts))

    high_filter = False
    while not high_filter:
        bin_counts, bins = np.histogram(counts, bins=bins)
        print("{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format(
            "Rate", 'Top', '% of corpus', "# words", "Rate"))
        # Walk the edges from the highest down, accumulating word tallies.
        for edge, n_words in zip(bins[-2::-1], np.cumsum(bin_counts[::-1])):
            if n_words:
                percentage = 1. - old_div(counts[counts < edge].sum(), total)
                print("{0:>5.0f}x".format(edge - 1).rjust(8), end=' ')
                print('{0:2.1f}%'.format(percentage * 100).rjust(8), end=' ')
                print((u'\u2588' * int(percentage * 36)).ljust(36), end=' ')
                print(" {0:0.0f} words".format(n_words).rjust(14), end=' ')
                print("> {0:>5.0f}x".format(edge - 1).ljust(8))

        print(' ' * 17,
              "{} total occurrences".format(counts.sum()).ljust(36), end=' ')
        print('{} words total'.format(mask.sum()).rjust(20))
        print('')

        input_filter = 0
        while input_filter <= 0:
            try:
                if high_filter:
                    # A number typed at the accept prompt is carried over.
                    input_filter = high_filter
                else:
                    input_filter = int(
                        input("Enter the maximum rate: ").replace('x', ''))

                if input_filter <= 0:
                    # Zero or a negative number is not a usable maximum and
                    # would reach get_candidate_words with the sign the
                    # low-frequency prompt uses, so reject it up front.
                    print("Please enter a whole number greater than zero.")
                    high_filter = False
                    input_filter = 0
                    continue

                candidates = get_candidate_words(c, input_filter, words=words)
                # NOTE(review): candidates are paired with matching
                # positions in order; this presumably relies on
                # get_candidate_words returning words in c.words order —
                # confirm.
                hits = np.in1d(c.words[mask], candidates)
                position = dict(zip(candidates, np.where(hits)[0]))
                candidates = sorted(candidates,
                                    key=lambda w: counts[position[w]],
                                    reverse=True)

                removed = counts[counts > input_filter]
                print("Filter will remove", removed.sum(), end=' ')
                print("occurrences", "of these", len(removed), "words:")
                # Print text rather than encoded bytes: print() would show
                # a b'...' repr, and sys.stdout.encoding can be None.
                print(u' '.join(candidates))
                print("\nFilter will remove", removed.sum(), end=' ')
                print("occurrences", "of these", len(removed), "words.",
                      end=' ')

                if len(candidates) == len(c.words):
                    print("\n\nChoice of", input_filter,
                          "will remove ALL words from the corpus.")
                    print("Please choose a different filter.")
                    high_filter = False
                    input_filter = 0
                else:
                    accept = None
                    while accept not in ['y', 'n']:
                        accept = input(
                            "\nAccept filter? [y/n/[different max number]] ")
                        if isint(accept):
                            high_filter = int(accept)
                            input_filter = 0
                            accept = 'n'
                        elif accept == 'y':
                            high_filter = input_filter
                        elif accept == 'n':
                            high_filter = False
            except ValueError:
                # Drop any carried-over value too, so one that keeps raising
                # is not retried forever without prompting.
                high_filter = False
                input_filter = 0

    return (high_filter, candidates)
def get_low_filter(c, words=None, items=None, counts=None):
    """Prompt on stdin for the minimum frequency a word must have.

    Draws a histogram showing what share of the corpus each threshold
    would remove, asks for a threshold, lists the words
    ``get_candidate_words`` returns for it (rarest first) and keeps asking
    until the user confirms with ``y``. Typing a number at the confirmation
    prompt previews that number instead.

    Args:
        c: Corpus object; ``c.words`` and ``c.original_length`` are read.
        words: Passed through to ``get_mask`` and ``get_candidate_words``.
        items: Optional precomputed items. When either ``items`` or
            ``counts`` is ``None`` both come from ``get_corpus_counts(c)``.
        counts: Optional precomputed per-word counts (see ``items``).

    Returns:
        Tuple ``(low_filter, candidates)``: the confirmed threshold and the
        sorted candidate word list for it.
    """
    import numpy as np

    header = "FILTER LOW FREQUENCY WORDS"
    stars = old_div((80 - len(header) - 2), 2)
    print("\n\n{0} {1} {0}".format('*' * stars, header))
    print(" This will remove all words occurring less than N times.")
    print(" The histogram below shows how many words will be removed")
    print(" by selecting each minimum frequency threshold.\n")

    # Get frequency bins
    if items is None or counts is None:
        items, counts = get_corpus_counts(c)
    cut_points = np.arange(1.0, -0.01, -0.025)
    bins = sorted({get_closest_bin(c, thresh, reverse=True, counts=counts)
                   for thresh in cut_points})
    bins.append(max(counts))

    # Nothing below changes between redraws, so compute it once.
    # NOTE(review): assumes get_mask(c, words) returns the same mask on
    # every call — confirm.
    selected = get_mask(c, words)
    filtered_counts = counts[selected]
    n_selected = len(filtered_counts)
    by_frequency = counts[counts.argsort()[::-1]]
    corpus_size = float(c.original_length)

    low_filter = False
    while low_filter is False:
        # Only the returned edges are used; the per-bin tallies are not.
        _, bins = np.histogram(by_frequency, bins=bins)
        print("{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format(
            "Rate", 'Bottom', '% of corpus', "# words", "Rate"))

        last_row = 0
        # The last edge is skipped, as when zipping edges with bin tallies.
        for edge in bins[:-1]:
            n_below = (filtered_counts < edge).sum()
            # NOTE(review): last_row holds the number of words at or above
            # the previous edge but is compared with the number below this
            # one — confirm that is the intended de-duplication rule.
            if last_row < n_below:
                percentage = old_div(counts[counts <= edge].sum(),
                                     corpus_size)
                row = [
                    "{0:>5.0f}x".format(edge).rjust(8),
                    '{0:2.1f}%'.format(percentage * 100).rjust(8),
                    (u'\u2588' * int(percentage * 36)).ljust(36),
                    " {0:0.0f} words".format(
                        (filtered_counts <= edge).sum()).rjust(14),
                ]
                for cell in row:
                    print(cell, end=' ')
                print("<= {0:>5.0f}x".format(edge).ljust(8))
            if n_below == n_selected:
                # Every selected word is already below this edge.
                break
            last_row = (filtered_counts >= edge).sum()

        print(' ' * 17,
              "{} total occurrences".format(counts.sum()).ljust(36), end=' ')
        print('{} words total'.format(selected.sum()).rjust(20))
        print('')

        input_filter = 0
        while input_filter <= 0:
            try:
                if low_filter:
                    input_filter = low_filter
                else:
                    input_filter = int(
                        input("Enter the minimum rate: ").replace('x', ''))

                if input_filter <= 0:
                    # get_candidate_words receives -input_filter; a value
                    # that is not positive would flip that sign, so ask
                    # again instead of previewing the wrong word list.
                    print("Please enter a whole number greater than zero.")
                    low_filter = False
                    input_filter = 0
                    continue

                candidates = get_candidate_words(c, -input_filter,
                                                 words=words, items=items,
                                                 counts=counts)
                # NOTE(review): candidates are paired with matching
                # positions in order; this presumably relies on
                # get_candidate_words returning words in c.words order —
                # confirm.
                found = np.in1d(c.words, candidates)
                index_of = dict(zip(candidates, np.where(found)[0]))
                candidates = sorted(candidates,
                                    key=lambda w: counts[index_of[w]])

                dropped = filtered_counts[filtered_counts <= input_filter]
                n_tokens = dropped.sum()
                n_types = len(dropped)
                print("Filter will remove", n_tokens, "tokens", end=' ')
                print("of these", n_types, "words:")
                print(u' '.join(candidates))
                print("\nFilter will remove", n_tokens, "tokens", end=' ')
                print("of these", n_types, "words.", end=' ')

                if len(candidates) == len(c.words):
                    print("\n\nChoice of", input_filter,
                          "will remove ALL words from the corpus.")
                    print("Please choose a different filter.")
                    low_filter = False
                    input_filter = 0
                    continue

                accept = None
                while accept not in ['y', 'n']:
                    accept = input(
                        "\nAccept filter? [y/n/[different min. number] ")
                    if isint(accept):
                        # Carry the typed number into the next preview.
                        low_filter = int(accept)
                        input_filter = 0
                        accept = 'n'
                    elif accept == 'y':
                        low_filter = input_filter
                    elif accept == 'n':
                        low_filter = False
            except ValueError:
                # Clear the carried-over value as well; otherwise a value
                # that raises would be retried endlessly with no prompt.
                low_filter = False
                input_filter = 0

    return (low_filter, candidates)
def get_low_filter(args, c, words=None): import numpy as np header = "FILTER LOW FREQUENCY WORDS" stars = (80 - len(header) - 2) / 2 print "\n\n{0} {1} {0}".format('*'*stars, header) print " This will remove all words occurring less than N times." print " The histogram below shows how many words will be removed" print " by selecting each minimum frequency threshold.\n" items, counts = get_items_counts(c.corpus) items = items[get_mask(c, words)] counts = counts[get_mask(c, words)] bins = np.linspace(0, 1.0, 11) bins = 1. - np.array([0., 0.025, 0.05, 0.075, 0.1, 0.15,0.20,0.25,0.3, 0.35, 0.4, 0.5, 1.0]) thresh = np.cumsum(counts[counts.argsort()[::-1]]) / float(counts.sum()) bins = [counts[counts.argsort()[::-1]][np.searchsorted(thresh, bin)] for bin in bins] bins = sorted(set(bins)) low_filter = False while low_filter is False: bin_counts, bins = np.histogram(counts[counts.argsort()[::-1]], bins=bins) #print "{0:>10s} {1:>10s}".format("# Tokens", "# Words") print "{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format("Rate", 'Bottom', '% of corpus', "# words", "Rate") for bin, count in zip(bins[1:], np.cumsum(bin_counts)): if count: percentage = (counts[counts <= bin].sum() / float(counts.sum())) print "{0:>5.0f}x".format(bin-1).rjust(8), print '{0:2.1f}%'.format(percentage *100).rjust(8), print (u'\u2588' * (percentage * 36)).ljust(36), print " {0:0.0f} words".format(count).rjust(14), print "<= {0:>5.0f}x".format(bin-1).ljust(8) print ' ' * 17, "{} total occurrences".format(counts.sum()).ljust(36), '{} words total'.format(get_mask(c, words).sum()).rjust(20) print '' input_filter = 0 accept = None while not input_filter or input_filter <= 0: try: if low_filter: input_filter = low_filter else: input_filter = int(raw_input("Enter the minimum rate: ").replace('x','')) candidates = get_candidate_words(c, -input_filter, words=words) places = np.in1d(c.words[get_mask(c, words)], candidates) places = dict(zip(candidates, np.where(places)[0])) candidates = sorted(candidates, 
key=lambda x: counts[places[x]]) print "Filter will remove", counts[counts <= input_filter].sum(), "tokens", print "of these", len(counts[counts <= input_filter]), "words:" print u' '.join(candidates).encode( sys.stdout.encoding, errors='replace') print "\nFilter will remove", counts[counts <= input_filter].sum(), "tokens", print "of these", len(counts[counts <= input_filter]), "words.", if len(candidates) == len(c.words): print "\n\nChoice of",input_filter, "will remove ALL words from the corpus." print "Please choose a different filter." low_filter = 0 input_filter = 0 else: accept = None while accept not in ['y', 'n']: accept = raw_input("\nAccept filter? [y/n/[different min. number] ") if isint(accept): low_filter = int(accept) input_filter = 0 accept = 'n' elif accept == 'y': low_filter = input_filter elif accept == 'n': low_filter = False except ValueError: input_filter = 0 return (low_filter, candidates)
def get_high_filter(args, c, words=None):
    """Prompt on stdin for the maximum frequency a word may have.

    Draws a cumulative histogram of the masked word counts, asks for a
    threshold, lists the words ``get_candidate_words`` returns for it
    (most frequent first) and keeps asking until the user confirms with
    ``y``. Typing a number at the confirmation prompt previews that number
    instead.

    Args:
        args: Not used by this function.
        c: Corpus object; ``c.corpus`` and ``c.words`` are read.
        words: Passed through to ``get_mask`` and ``get_candidate_words``.

    Returns:
        Tuple ``(high_filter, candidates)``: the confirmed threshold and the
        sorted candidate word list for it.
    """
    import numpy as np

    header = "FILTER HIGH FREQUENCY WORDS"
    stars = old_div((80 - len(header) - 2), 2)
    print("\n\n{0} {1} {0}".format('*' * stars, header))
    print(" This will remove all words occurring more than N times.")
    print(" The histogram below shows how many words will be removed")
    print(" by selecting each maximum frequency threshold.\n")

    _, counts = get_items_counts(c.corpus)
    # Computed once and reused below.
    # NOTE(review): assumes get_mask(c, words) returns the same mask on
    # every call — confirm.
    selected = get_mask(c, words)
    counts = counts[selected]

    # Nothing below changes between redraws, so compute it once.
    ascending = counts[counts.argsort()]
    total = float(counts.sum())

    # Turn cumulative-share cut points into concrete count values.
    shares = 1. - np.array([0., 0.025, 0.05, 0.075, 0.1, 0.15, 0.20, 0.25,
                            0.3, 0.35, 0.4, 0.5, 1.0])
    thresh = np.cumsum(ascending) / total
    bins = sorted(set(ascending[np.searchsorted(thresh, share)]
                      for share in shares))
    bins.append(max(counts))

    high_filter = False
    while not high_filter:
        bin_counts, bins = np.histogram(counts, bins=bins)
        print("{0:>8s} {1:>8s} {2:<36s} {3:>14s} {4:>8s}".format(
            "Rate", 'Top', '% of corpus', "# words", "Rate"))
        # The histogram rows and totals had been commented out, leaving the
        # header above with nothing under it. Each row is printed as one
        # pre-joined string so no ``end=`` keyword is needed.
        for edge, n_words in zip(bins[-2::-1], np.cumsum(bin_counts[::-1])):
            if n_words:
                percentage = 1. - old_div(counts[counts < edge].sum(), total)
                print(u' '.join([
                    "{0:>5.0f}x".format(edge - 1).rjust(8),
                    '{0:2.1f}%'.format(percentage * 100).rjust(8),
                    (u'\u2588' * int(percentage * 36)).ljust(36),
                    " {0:0.0f} words".format(n_words).rjust(14),
                    "> {0:>5.0f}x".format(edge - 1).ljust(8),
                ]))

        print(' '.join([
            ' ' * 17,
            "{} total occurrences".format(counts.sum()).ljust(36),
            '{} words total'.format(selected.sum()).rjust(20),
        ]))
        print('')

        input_filter = 0
        while input_filter <= 0:
            try:
                if high_filter:
                    input_filter = high_filter
                else:
                    input_filter = int(
                        input("Enter the maximum rate: ").replace('x', ''))

                if input_filter <= 0:
                    # Zero or a negative number is not a usable maximum and
                    # would reach get_candidate_words with the sign the
                    # low-frequency prompt uses, so ask again.
                    print("Please enter a whole number greater than zero.")
                    high_filter = False
                    input_filter = 0
                    continue

                candidates = get_candidate_words(c, input_filter, words=words)
                # NOTE(review): candidates are paired with matching
                # positions in order; this presumably relies on
                # get_candidate_words returning words in c.words order —
                # confirm.
                found = np.in1d(c.words[selected], candidates)
                index_of = dict(zip(candidates, np.where(found)[0]))
                candidates = sorted(candidates,
                                    key=lambda w: counts[index_of[w]],
                                    reverse=True)

                dropped = counts[counts > input_filter]
                # The "Filter will remove" half of this sentence had been
                # commented out, leaving only a dangling fragment.
                print("Filter will remove {} occurrences of these {} words:"
                      .format(dropped.sum(), len(dropped)))
                print(u' '.join(candidates))
                print("\nFilter will remove {} occurrences of these {} words."
                      .format(dropped.sum(), len(dropped)))

                if len(candidates) == len(c.words):
                    print("\n\nChoice of", input_filter,
                          "will remove ALL words from the corpus.")
                    print("Please choose a different filter.")
                    high_filter = False
                    input_filter = 0
                    continue

                accept = None
                while accept not in ['y', 'n']:
                    accept = input(
                        "\nAccept filter? [y/n/[different max number]] ")
                    if isint(accept):
                        # Carry the typed number into the next preview.
                        high_filter = int(accept)
                        input_filter = 0
                        accept = 'n'
                    elif accept == 'y':
                        high_filter = input_filter
                    elif accept == 'n':
                        high_filter = False
            except ValueError:
                # Clear the carried-over value as well; otherwise a value
                # that raises would be retried endlessly with no prompt.
                high_filter = False
                input_filter = 0

    return (high_filter, candidates)