def test2():
    """Filter 'bnc-words.csv' and group the surviving rows by root.

    Each input row is read as (root, size, c5, word, count), where ``size``
    and ``count`` are integers.  A row is dropped when any of the following
    holds:

    * ``size`` is 1 or less;
    * ``count`` is less than one thousandth of ``size``;
    * ``word`` contains '*';
    * ``c5`` is 'UNC' or 'CRD' (presumably CLAWS5 tags -- confirm);
    * ``root`` contains '(' or '/';
    * ``root`` starts with a digit or with one of '$', '#', '-';
    * ``root`` contains two or more apostrophes.

    Surviving rows are accumulated into one ``WordRoot`` per root.  The
    flattened, sorted result is built but only written out if the
    ``csv_save`` line below is re-enabled.  Prints the number of roots.
    """
    import ascmini
    rows = ascmini.csv_load('bnc-words.csv')
    output = []
    words = {}
    for row in rows:
        root = row[0]
        size = int(row[1])
        c5 = row[2]
        word = row[3]
        count = int(row[4])
        if size <= 1:
            continue
        # Same test as ``count * 1000 / size < 1`` (size is > 1 here), but
        # independent of integer vs. true division semantics.
        if count * 1000 < size:
            continue
        if '*' in word:
            continue
        if c5 in ('UNC', 'CRD'):
            continue
        if '(' in root or '/' in root:
            continue
        head = root[:1].lower()
        # A digit or one of these symbols is never alphabetic nor an
        # apostrophe, so no additional outer guard is needed.
        if head.isdigit() or head in ('$', '#', '-'):
            continue
        # NOTE(review): treated as an unconditional filter; confirm it was
        # not meant to apply only to roots with a non-alphabetic first char.
        if root.count('\'') >= 2:
            continue
        if root not in words:
            stem = WordRoot(root)
            words[root] = stem
        else:
            stem = words[root]
        stem.add(c5, word.lower(), count)
    for key in words:
        stem = words[key]
        for c5, word, count in stem.dump():
            output.append((stem.root, stem.count, c5, word, count))
    # Highest root count first; ties broken by root in descending order.
    output.sort(key=lambda x: (x[1], x[0]), reverse=True)
    # ascmini.csv_save(output, 'bnc-clear.csv')
    # Single pre-formatted argument: prints "count N" on Python 2 and 3.
    print('count %d' % len(words))
def test3():
    """Build lemma listings from 'bnc-clear.csv'.

    Each input row is read as (root, size, c5, word, count).  Rows whose
    lower-cased word equals the root are skipped.  The remaining words are
    added to a per-root ``WordRoot`` under the tag '*', and the root's
    ``count`` attribute is set to the row's ``size``.

    Two files are written:

    * 'bnc-lemma.txt' -- one line per root that has at least one dumped
      word, formatted ``root/count -> word/count,word/count,...``, ordered
      by (root count, root) descending;
    * 'bnc-test.csv' -- the flattened (root, root count, c5, word, count)
      tuples, ordered by (root count, root) descending.

    Prints the number of roots and returns 0.
    """
    import ascmini
    rows = ascmini.csv_load('bnc-clear.csv')
    output = []
    words = {}
    for row in rows:
        root = row[0]
        size = int(row[1])
        word = row[3].lower()
        count = int(row[4])
        if word == root:
            continue
        if root not in words:
            stem = WordRoot(root)
            words[root] = stem
        else:
            stem = words[root]
        stem.add('*', word, count)
        stem.count = size
    lemmas = []
    for key in words:
        stem = words[key]
        part = []
        for c5, word, count in stem.dump():
            output.append((stem.root, stem.count, c5, word, count))
            part.append('%s/%d' % (word, count))
        if not part:
            continue
        text = '%s/%d -> ' % (stem.root, stem.count)
        # Count and root lead the tuple so a plain sort orders the lines.
        lemmas.append((stem.count, stem.root, text + ','.join(part)))
    output.sort(key=lambda x: (x[1], x[0]), reverse=True)
    lemmas.sort(reverse=True)
    # The context manager guarantees the file is flushed and closed, even
    # if a write fails part-way through.
    with open('bnc-lemma.txt', 'w') as fp:
        for _, _, text in lemmas:
            fp.write(text + '\n')
    ascmini.csv_save(output, 'bnc-test.csv')
    # Single pre-formatted argument: prints "count N" on Python 2 and 3.
    print('count %d' % len(words))
    return 0
def load(self, filename):
    """Populate ``self.words`` with stems read from a CSV file.

    The file is read through ``ascmini.csv_load``.  For every row with at
    least four fields and a non-empty fourth field, the lower-cased first
    field is used as the stem key; a ``WordStem`` is created for that key
    on first sight, and the lower-cased fourth field is added to it
    together with the second field.

    Returns False when the loader yields nothing, True otherwise.
    """
    records = ascmini.csv_load(filename)
    if not records:
        return False
    table = self.words
    for record in records:
        # Skip blank rows and rows too short to hold a word column.
        if not record or len(record) < 4:
            continue
        key = record[0].lower()
        # NOTE(review): the tag is taken from column 1 and the word from
        # column 3; column 2 is not read -- confirm against the file layout.
        tag = record[1]
        text = record[3]
        if not text:
            continue
        if key in table:
            entry = table[key]
        else:
            entry = WordStem(key)
            table[key] = entry
        entry.add(tag, text.lower())
    return True