Пример #1
0
 def save(self, filename):
     """Dump every stem's records to *filename* as CSV rows.

     The values of ``self.words`` are ordered by their ``count`` attribute,
     highest first.  Each record yielded by a stem's ``dump()`` becomes one
     output row, prefixed with that stem's ``stem`` and ``count`` values.
     The rows are handed to ``ascmini.csv_save``.

     Always returns True.
     """
     ranked = sorted((self.words[key] for key in self.words),
                     key=lambda entry: entry.count,
                     reverse=True)
     table = []
     for entry in ranked:
         prefix = [entry.stem, entry.count]
         table.extend(prefix + list(record) for record in entry.dump())
     ascmini.csv_save(table, filename)
     return True
Пример #2
0
 def test3():
     """Group the rows of 'bnc-clear.csv' by root and export lemma listings.

     Each input row is read as: column 0 = root, column 1 = integer size of
     the root, column 3 = word (lower-cased here), column 4 = integer count.
     Rows whose word equals its root are skipped.  The remaining words are
     collected into one ``WordRoot`` per root.

     Outputs:
       * 'bnc-lemma.txt' -- one line per root that has at least one dumped
         record, formatted ``root/size -> word/count,word/count,...``,
         sorted by (size, root, text) in descending order.
       * 'bnc-test.csv' -- rows ``(root, size, c5, word, count)`` sorted by
         (size, root) in descending order, written with ``ascmini.csv_save``.

     Prints the number of distinct roots and returns 0.
     """
     import ascmini
     rows = ascmini.csv_load('bnc-clear.csv')
     output = []
     words = {}
     for row in rows:
         root = row[0]
         size = int(row[1])
         word = row[3].lower()
         count = int(row[4])
         # A word identical to its own root is not recorded as a variant.
         if word == root:
             continue
         stem = words.get(root)
         if stem is None:
             stem = words[root] = WordRoot(root)
         stem.add('*', word, count)
         # The last row seen for a root decides the stored size.
         stem.count = size
     lemmas = []
     for stem in words.values():
         part = []
         for c5, word, count in stem.dump():
             output.append((stem.root, stem.count, c5, word, count))
             part.append('%s/%d' % (word, count))
         if not part:
             continue
         text = '%s/%d -> ' % (stem.root, stem.count)
         lemmas.append((stem.count, stem.root, text + ','.join(part)))
     output.sort(key=lambda x: (x[1], x[0]), reverse=True)
     lemmas.sort(reverse=True)
     # The context manager guarantees the file is flushed and closed even
     # if a write fails; the original handle was never closed.
     with open('bnc-lemma.txt', 'w') as fp:
         for _, _, text in lemmas:
             fp.write(text + '\n')
     ascmini.csv_save(output, 'bnc-test.csv')
     # Single-argument form prints "count N" on both Python 2 and Python 3.
     print('count %d' % len(words))
     return 0