예제 #1
0
def bag_of_text(directory):
    """Concatenate the normalized text of every UTF-8 file in a directory.

    Each entry of ``directory`` (non-recursive) is read as raw bytes, decoded
    as UTF-8 and passed through ``watershed.normalize_text``. Entries whose
    content is not valid UTF-8, or whose normalization fails, are skipped
    silently.

    Args:
        directory: Path of the directory whose entries are read.

    Returns:
        The normalized texts joined with ``'\\n'``, in ``os.listdir`` order.
        An empty string if no entry could be processed.

    Raises:
        OSError: (``IOError`` on Python 2) if the directory cannot be listed
            or an entry cannot be opened; those errors are not suppressed.
    """
    norms = []
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        # Read bytes and decode explicitly: in text mode ``read()`` already
        # returns ``str`` on Python 3, which has no ``.decode`` method, so
        # every file used to be dropped by the blanket ``except``.
        with open(path, 'rb') as fh:
            raw = fh.read()
        try:
            text = raw.decode('utf-8')
        except UnicodeDecodeError:
            # Not UTF-8 text (e.g. a binary file): leave it out of the bag.
            continue
        try:
            norms.append(watershed.normalize_text(text))
        except Exception:
            # Deliberately best-effort, as in the original: a file that
            # cannot be normalized must not abort the whole collection.
            continue
    return '\n'.join(norms)
예제 #2
0
def classify_dir(directory, cl=None, vect=None):
    """Walk a directory tree and print the path of each 'positive' file.

    Every file found by ``os.walk(directory)`` is read as bytes and decoded
    as UTF-8. Files that are not valid UTF-8, or that ``is_english`` rejects,
    are skipped with an INFO line. The remaining files are normalized with
    ``watershed.normalize_text`` and passed to ``classify``; when the result
    equals ``'positive'`` the file path is printed.

    Args:
        directory: Root of the directory tree to scan.
        cl: Classifier forwarded to ``classify``. When ``None``, both ``cl``
            and ``vect`` are obtained from ``build_classifier()``.
        vect: Vectorizer forwarded to ``classify`` alongside ``cl``.

    Returns:
        None. Results and progress messages are written with ``print``.

    Raises:
        OSError: (``IOError`` on Python 2) if a file cannot be opened.
    """
    if cl is None:
        (cl, vect) = build_classifier()
    print('INFO:classifier built')
    for (dirpath, _dirnames, filenames) in os.walk(directory):
        for filename in filenames:
            fn = os.path.join(dirpath, filename)
            # Read bytes and decode explicitly; text-mode ``read()`` returns
            # ``str`` on Python 3, which has no ``.decode`` method.
            with open(fn, 'rb') as fh:
                raw = fh.read()
            try:
                text = raw.decode('utf-8')
            except UnicodeDecodeError:
                # One binary file must not abort the whole walk.
                print('INFO: skipping non-UTF-8 file %s' % fn)
                continue
            if not is_english(text):
                # ``fn`` is a path string, so it needs %s (``%i`` raised
                # TypeError here).
                print('INFO: skipping junk file %s' % fn)
                continue
            # Normalize only files that will actually be classified.
            normed = watershed.normalize_text(text)
            print('INFO: classifying %s' % fn)
            result = classify(normed, cl, vect)
            if result == 'positive':
                print(fn)