Exemplo n.º 1
0
 def test2():
     """Filter the rows of ``bnc-words.csv`` and print how many roots remain.

     Each row is read as ``(root, size, c5, word, count)``. Rows that pass
     the filters below are grouped by ``root`` into ``WordRoot`` objects, and
     the grouped entries are flattened into a list sorted by descending
     ``(stem.count, stem.root)``. That list is currently not written
     anywhere (the save call is disabled); only the number of distinct
     roots is printed.
     """
     import ascmini
     rows = ascmini.csv_load('bnc-words.csv')
     output = []
     words = {}
     for row in rows:
         root = row[0]
         size = int(row[1])
         c5 = row[2]
         word = row[3]
         count = int(row[4])
         head = root[:1].lower()
         if size <= 1:
             continue
         # Skip forms whose count is less than 1/1000 of the root's size.
         if count * 1000 / size < 1:
             continue
         if '*' in word:
             continue
         if c5 in ('UNC', 'CRD'):
             continue
         if '(' in root or '/' in root:
             continue
         # Roots may start with a letter or an apostrophe; among the other
         # leading characters only digits and '$', '#', '-' are rejected.
         if head != '\'' and (not head.isalpha()):
             if head.isdigit():
                 continue
             if head in ('$', '#', '-'):
                 continue
         if root.count('\'') >= 2:
             continue
         if root not in words:
             stem = WordRoot(root)
             words[root] = stem
         else:
             stem = words[root]
         stem.add(c5, word.lower(), count)
     for key in words:
         stem = words[key]
         for c5, word, count in stem.dump():
             output.append((stem.root, stem.count, c5, word, count))
     output.sort(key=lambda x: (x[1], x[0]), reverse=True)
     # NOTE(review): saving is disabled, so `output` is built but unused.
     # ascmini.csv_save(output, 'bnc-clear.csv')
     # A single pre-formatted argument prints "count N" on Python 2 and 3.
     print('count %d' % len(words))
Exemplo n.º 2
0
 def test3():
     """Build lemma listings from ``bnc-clear.csv``.

     Each row is read as ``(root, size, c5, word, count)``. Words equal to
     their root are skipped; the rest are grouped by ``root`` into
     ``WordRoot`` objects under the tag ``'*'``, with ``stem.count`` set to
     the row's ``size``.

     Two files are written: ``bnc-lemma.txt`` with one
     ``root/count -> word/count,...`` line per root, in descending order,
     and ``bnc-test.csv`` with the flattened rows sorted by descending
     ``(stem.count, stem.root)``. The number of roots is printed.

     Returns:
         Always 0.
     """
     import ascmini
     rows = ascmini.csv_load('bnc-clear.csv')
     output = []
     words = {}
     for row in rows:
         root = row[0]
         size = int(row[1])
         word = row[3].lower()
         count = int(row[4])
         if word == root:
             continue
         if root not in words:
             stem = WordRoot(root)
             words[root] = stem
         else:
             stem = words[root]
         stem.add('*', word, count)
         stem.count = size
     lemmas = []
     for key in words:
         stem = words[key]
         part = []
         for c5, word, count in stem.dump():
             output.append((stem.root, stem.count, c5, word, count))
             part.append('%s/%d' % (word, count))
         if not part:
             continue
         text = '%s/%d -> ' % (stem.root, stem.count)
         lemmas.append((stem.count, stem.root, text + ','.join(part)))
     output.sort(key=lambda x: (x[1], x[0]), reverse=True)
     lemmas.sort(reverse=True)
     # The context manager closes (and flushes) the file even if a write
     # fails; the original handle was never closed.
     with open('bnc-lemma.txt', 'w') as fp:
         for _, _, text in lemmas:
             fp.write(text + '\n')
     ascmini.csv_save(output, 'bnc-test.csv')
     # A single pre-formatted argument prints "count N" on Python 2 and 3.
     print('count %d' % len(words))
     return 0
Exemplo n.º 3
0
 def load(self, filename):
     """Merge the rows of a CSV file into ``self.words``.

     Rows are fetched with ``ascmini.csv_load``. From each usable row,
     column 0 (lower-cased) is the stem key, column 1 the tag and column 3
     the word. Rows with fewer than four columns or an empty word are
     ignored. A ``WordStem`` is created the first time a stem is seen, and
     the lower-cased word is added to it under the tag.

     Returns:
         False when the loader yields nothing, True otherwise.
     """
     records = ascmini.csv_load(filename)
     if not records:
         return False
     table = self.words
     for record in records:
         # Need at least four columns, since index 3 is read below.
         if not record or len(record) < 4:
             continue
         key = record[0].lower()
         tag = record[1]
         text = record[3]
         if not text:
             continue
         if key in table:
             entry = table[key]
         else:
             entry = WordStem(key)
             table[key] = entry
         entry.add(tag, text.lower())
     return True