Example #1
File: mass.py  Project: rfong/shittynlp
def main():
    start = time.time()
    lexicon = set(get_lexicon())
    print '%0.2f sec to fetch unix lexicon, %d words' % (time.time() - start,
                                                         len(lexicon))
    print 'avg word length: %d' % counter_avg(Counter(len(w) for w in lexicon))

    start = time.time()
    prefix_trie = PrefixTrie(lexicon)
    print '%0.2f sec to make prefix trie, %d paths' % (
        time.time() - start, len(prefix_trie.get_paths()))

    # We'll be lazy and make a suffix trie by using reverse words and reverse
    # lookup paths
    suffix_trie = SuffixTrie(lexicon)

    start = time.time()
    substring_trie = SubstringTrie(lexicon)
    print '%0.2f sec to make substring trie, %d paths' % (
        time.time() - start, len(substring_trie.get_paths()))

    MIN_SUBSTR_LEN = 3
    substrings = []
    start = time.time()
    for word in lexicon:
        for start_index in range(0, len(word) - 1):
            for end_index in range(start_index + MIN_SUBSTR_LEN, len(word)):
                substrings.append(word[start_index:end_index])
    substring_counts = Counter(substrings)
    print '%0.2f sec to tally substring counts' % (time.time() - start, )
    substring_len_counts = Counter([len(w) for w in substring_counts.keys()])

    most_common_substrs = most_common(substring_counts, n=10)
    least_common_substrs = most_common(substring_counts,
                                       n=10,
                                       least=True,
                                       min_count=10)
    print most_common_substrs
    print least_common_substrs

    print substring_trie.fetch('aar')
    print substring_counts['aar']
    print substring_trie.fetch('ati')[:100]
    print substring_trie.fetch('tillatio')
    print substring_trie.fetch('naest')
    print substring_trie.fetch('nctil')
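
The excerpt above leans on classes and helpers defined elsewhere in mass.py (get_lexicon, counter_avg, most_common, PrefixTrie, SuffixTrie, SubstringTrie). As a rough idea of what the substring trie could look like, here is a minimal sketch: only the names SubstringTrie, fetch, and get_paths are taken from the calls above, and the implementation details are an assumption, not the project's actual code.

class SubstringTrie(object):
    # Hypothetical stand-in: index every suffix of every word so that
    # fetch(substr) returns the words containing substr.
    def __init__(self, lexicon):
        self.root = {}
        for word in lexicon:
            for start in range(len(word)):
                node = self.root
                for ch in word[start:]:
                    node = node.setdefault(ch, {'#': set()})
                    node['#'].add(word)   # remember which words pass through here

    def _node(self, substring):
        # Walk the trie along the characters of `substring`; None if absent.
        node = self.root
        for ch in substring:
            if ch not in node:
                return None
            node = node[ch]
        return node

    def fetch(self, substring):
        # All lexicon words containing `substring`, or [] if none do.
        node = self._node(substring)
        return sorted(node.get('#', ())) if node else []

    def get_paths(self, node=None, prefix=''):
        # Every distinct substring path stored in the trie.
        node = self.root if node is None else node
        paths = []
        for ch, child in node.items():
            if ch == '#':
                continue
            paths.append(prefix + ch)
            paths.extend(self.get_paths(child, prefix + ch))
        return paths

Keeping the set of source words on every node trades memory for lookups that cost only O(len(substring)); the real mass.py may make a different trade.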
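
The "lazy" suffix trie mentioned in the comment inside main() ("reverse words and reverse lookup paths") can be sketched the same way: insert every word reversed so that shared suffixes become shared prefixes, and reverse the key at lookup time. Again, only the class name and that comment come from the source; the body below is an illustrative assumption.

class SuffixTrie(object):
    # Hypothetical sketch of the lazy suffix trie: an ordinary prefix trie
    # built over reversed words, with the lookup key reversed as well.
    def __init__(self, lexicon):
        self.root = {}
        for word in lexicon:
            node = self.root
            for ch in word[::-1]:              # insert the reversed word
                node = node.setdefault(ch, {'#': set()})
                node['#'].add(word)            # original (unreversed) word

    def fetch(self, suffix):
        # Words ending in `suffix`: walk the reversed key through the trie.
        node = self.root
        for ch in suffix[::-1]:
            if ch not in node:
                return []
            node = node[ch]
        return sorted(node.get('#', ()))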