def bag_of_text(directory):
    """Join the normalized text of every UTF-8 file directly inside *directory*.

    Only the immediate entries of *directory* are considered (no recursion),
    in whatever order ``os.listdir`` reports them.  Each regular file is read
    as bytes, decoded as UTF-8 and passed through
    ``watershed.normalize_text``; the results are joined with newlines.

    Entries that are not regular files, and files whose content is not valid
    UTF-8, are skipped.  Errors raised while reading a file or by
    ``watershed.normalize_text`` propagate to the caller.

    Args:
        directory: Path of the directory whose files should be collected.

    Returns:
        A single string with one normalized document per line group, or the
        empty string when nothing could be collected.
    """
    norms = []
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        # listdir also reports sub-directories; opening one would raise.
        if not os.path.isfile(path):
            continue
        # Read bytes and decode explicitly: this behaves the same on Python 2
        # and 3, whereas calling .decode() on text read in 'r' mode fails on
        # Python 3 (str has no decode method).
        with open(path, 'rb') as fh:
            raw = fh.read()
        try:
            text = raw.decode('utf-8')
        except UnicodeDecodeError:
            # Not UTF-8 text (binary or other encoding): leave it out.
            continue
        norms.append(watershed.normalize_text(text))
    return '\n'.join(norms)
def classify_dir(directory, cl=None, vect=None):
    """Classify every file under *directory* and print the positive ones.

    The tree rooted at *directory* is walked recursively.  Each file is read
    as bytes and decoded as UTF-8; files that cannot be decoded, or for which
    ``is_english`` returns a false value, are reported as junk and skipped.
    The remaining files are normalized with ``watershed.normalize_text`` and
    handed to ``classify``; the path of every file whose result is
    ``'positive'`` is printed on its own line.

    Args:
        directory: Root of the directory tree to scan.
        cl: Classifier passed to ``classify``.  When ``None``, both *cl* and
            *vect* are replaced by the pair returned from
            ``build_classifier()``.
        vect: Vectorizer passed to ``classify`` alongside *cl*.

    Returns:
        None.  Results and progress messages are written with ``print``.
    """
    if cl is None:
        (cl, vect) = build_classifier()
        print('INFO:classifier built')
    for (dirpath, dirnames, filenames) in os.walk(directory):
        for filename in filenames:
            fn = os.path.join(dirpath, filename)
            # Read bytes and decode explicitly so the code works on both
            # Python 2 and 3 (str read in 'r' mode has no .decode on 3).
            with open(fn, 'rb') as fh:
                raw = fh.read()
            try:
                text = raw.decode('utf-8')
            except UnicodeDecodeError:
                # Undecodable content is junk for our purposes; do not let
                # one bad file abort the whole walk.
                print('INFO: skipping junk file %s' % fn)
                continue
            if not is_english(text):
                # fn is a path string, so it must be formatted with %s;
                # %i raises TypeError.
                print('INFO: skipping junk file %s' % fn)
                continue
            print('INFO: classifying %s' % fn)
            # Normalize only files that survived the junk filter.
            normed = watershed.normalize_text(text)
            result = classify(normed, cl, vect)
            if result == 'positive':
                print(fn)