# Evaluation path of an NLTK text-classification script (Python 3 print()):
# selects a per-label instance extractor from label_instance_function, bags a
# leading fraction of each label's texts via bag_of_words(norm_words(...)),
# then prints accuracy plus per-label precision/recall/F-measure ("or 0"
# guards the None these nltk metrics return on empty sets).
# NOTE(review): truncated fragment collapsed onto ONE physical line — the
# trailing "else:" pairs with a conditional outside this view, and because
# everything shares one line, all code after the inline "#" comment
# ("don't get list(texts)...") is commented out as written. Restore the
# original multi-line layout before editing.
lif = label_instance_function[args.instances] feats = [] test_feats = [] for label in labels: texts = lif(categorized_corpus, label) if args.instances == 'files': # don't get list(texts) here since might have tons of files stop = int(len(categorized_corpus.fileids())*args.fraction) else: texts = list(texts) stop = int(len(texts)*args.fraction) for t in itertools.islice(texts, stop): feat = bag_of_words(norm_words(t)) feats.append(feat) test_feats.append((feat, label)) print('accuracy:', accuracy(classifier, test_feats)) refsets, testsets = scoring.ref_test_sets(classifier, test_feats) for label in labels: ref = refsets[label] test = testsets[label] print('%s precision: %f' % (label, precision(ref, test) or 0)) print('%s recall: %f' % (label, recall(ref, test) or 0)) print('%s f-measure: %f' % (label, f_measure(ref, test) or 0)) else: if args.instances == 'sents': texts = categorized_corpus.sents()
# Python 2 variant of the evaluation path (print statements, not print()):
# maps args.instances to a corpus.category_*_words extractor, materializes
# each label's texts, bags the leading args.fraction slice, then prints
# accuracy and per-label precision/recall/F-measure.
# NOTE(review): truncated fragment collapsed onto one physical line — the
# trailing "else:" has no matching "if" in view and the final
# "instance_function = { 'sents': ..." dict literal is cut mid-entry.
# Do not edit without recovering the surrounding lines.
label_instance_function = { 'sents': corpus.category_sent_words, 'paras': corpus.category_para_words, 'files': corpus.category_file_words } lif = label_instance_function[args.instances] feats = [] test_feats = [] for label in labels: texts = list(lif(categorized_corpus, label)) stop = int(len(texts)*args.fraction) for t in texts[:stop]: feat = bag_of_words(norm_words(t)) feats.append(feat) test_feats.append((feat, label)) print 'accuracy:', accuracy(classifier, test_feats) refsets, testsets = scoring.ref_test_sets(classifier, test_feats) for label in labels: ref = refsets[label] test = testsets[label] print '%s precision: %f' % (label, precision(ref, test) or 0) print '%s recall: %f' % (label, recall(ref, test) or 0) print '%s f-measure: %f' % (label, f_measure(ref, test) or 0) else: instance_function = { 'sents': categorized_corpus.sents,
def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Build a bag-of-words feature dict from *words* plus collocations.

    Finds the *n* highest-scoring bigram collocations in *words* (ranked
    by *score_fn*, chi-squared by default) and feeds the words together
    with those bigram tuples through ``bag_of_words``.
    """
    finder = BigramCollocationFinder.from_words(words)
    best = finder.nbest(score_fn, n)
    # Feature keys are the unigrams themselves plus (w1, w2) bigram tuples.
    return bag_of_words(words + best)
# Near-duplicate of the earlier Python 3 evaluation fragment (differs only
# in spacing around "*"): extracts per-label instances, bags a fraction of
# them, and reports accuracy/precision/recall/F-measure per label.
# NOTE(review): truncated fragment on one physical line — the trailing
# "else:" belongs to a conditional outside this view, and the inline "#"
# comment ("don't get list(texts)...") comments out the remainder of the
# line as written. Restore multi-line structure before editing.
lif = label_instance_function[args.instances] feats = [] test_feats = [] for label in labels: texts = lif(categorized_corpus, label) if args.instances == 'files': # don't get list(texts) here since might have tons of files stop = int(len(categorized_corpus.fileids()) * args.fraction) else: texts = list(texts) stop = int(len(texts) * args.fraction) for t in itertools.islice(texts, stop): feat = bag_of_words(norm_words(t)) feats.append(feat) test_feats.append((feat, label)) print('accuracy:', accuracy(classifier, test_feats)) refsets, testsets = scoring.ref_test_sets(classifier, test_feats) for label in labels: ref = refsets[label] test = testsets[label] print('%s precision: %f' % (label, precision(ref, test) or 0)) print('%s recall: %f' % (label, recall(ref, test) or 0)) print('%s f-measure: %f' % (label, f_measure(ref, test) or 0)) else: if args.instances == 'sents': texts = categorized_corpus.sents()
# Classify-to-corpus fragment (Python 2): opens one append-mode output file
# per classifier label; featx optionally routes text through translate()
# (args.source -> args.target) before bagging; classify_write() writes an
# instance to its most probable label's file only when that label's
# probability meets args.threshold.
# NOTE(review): truncated fragment on one physical line — it opens mid-way
# through the label_filename helper ("return path" has no visible def) and
# ends at a dangling "if args.instances == 'paras':". The label_files dict
# handles are never closed in view — presumably closed later; verify.
if args.trace: print 'filename for category %s: %s' % (label, path) return path labels = classifier.labels() label_files = dict([(l, open(label_filename(l), 'a')) for l in labels]) # TODO: create a nltk.corpus.writer framework with some initial CorpusWriter classes if args.target: if args.trace: print 'translating all text from %s to %s' % (args.source, args.target) featx = lambda words: bag_of_words(norm_words(wordpunct_tokenize(translate(join_words(words), args.source, args.target, trace=args.trace, sleep=args.sleep, retries=args.retries)))) else: featx = lambda words: bag_of_words(norm_words(words)) def classify_write(words): feats = featx(words) probs = classifier.prob_classify(feats) label = probs.max() if probs.prob(label) >= args.threshold: label_files[label].write(join_words(words) + u'\n\n') if args.trace: print 'classifying %s' % args.instances if args.instances == 'paras':
# Variant of the classify-to-corpus setup (Python 2): builds the per-label
# append-mode file handles, chooses featx (with or without the translate()
# pipeline depending on args.target), and defines classify_write(), which
# appends an instance to the file of its max-probability label when that
# probability is at least args.threshold.
# NOTE(review): truncated fragment collapsed onto one physical line — it
# ends immediately after the classify_write body; the code that iterates
# instances and calls it is outside this view.
labels = classifier.labels() label_files = dict([(l, open(label_filename(l), 'a')) for l in labels]) # TODO: create a nltk.corpus.writer framework with some initial CorpusWriter classes if args.target: if args.trace: print 'translating all text from %s to %s' % (args.source, args.target) featx = lambda words: bag_of_words( norm_words( wordpunct_tokenize( translate(join_words(words), args.source, args.target, trace=args.trace, sleep=args.sleep, retries=args.retries)))) else: featx = lambda words: bag_of_words(norm_words(words)) def classify_write(words): feats = featx(words) probs = classifier.prob_classify(feats) label = probs.max() if probs.prob(label) >= args.threshold: label_files[label].write(join_words(words) + u'\n\n')
# Double-quoted restyling of the Python 2 evaluation fragment: maps
# args.instances to a corpus.category_*_words extractor, bags the leading
# args.fraction slice of each label's texts, then prints accuracy and
# per-label precision/recall/F-measure ("or 0" guards None results).
# NOTE(review): truncated fragment on one physical line — the trailing
# "else:" has no matching "if" in view and the final
# 'instance_function = {"sents": ...' dict literal is cut mid-entry.
label_instance_function = { "sents": corpus.category_sent_words, "paras": corpus.category_para_words, "files": corpus.category_file_words, } lif = label_instance_function[args.instances] feats = [] test_feats = [] for label in labels: texts = list(lif(categorized_corpus, label)) stop = int(len(texts) * args.fraction) for t in texts[:stop]: feat = bag_of_words(norm_words(t)) feats.append(feat) test_feats.append((feat, label)) print "accuracy:", accuracy(classifier, test_feats) refsets, testsets = scoring.ref_test_sets(classifier, test_feats) for label in labels: ref = refsets[label] test = testsets[label] print "%s precision: %f" % (label, precision(ref, test) or 0) print "%s recall: %f" % (label, recall(ref, test) or 0) print "%s f-measure: %f" % (label, f_measure(ref, test) or 0) else: instance_function = { "sents": categorized_corpus.sents,
# Python 3 variant of the classify-to-corpus fragment (print() calls, no
# translate() branch): ensures args.target_corpus exists, opens append-mode
# files per classifier label, and defines classify_write(), which writes an
# instance to its max-probability label's file when that probability meets
# args.threshold.
# NOTE(review): truncated fragment on one physical line — it opens inside
# the label_filename helper (its "def" and the "path" assignment are out of
# view) and ends at a dangling "if args.instances == 'paras':".
if not os.path.exists(args.target_corpus): os.makedirs(args.target_corpus) if args.trace: print('filename for category %s: %s' % (label, path)) return path labels = classifier.labels() label_files = dict([(l, open(label_filename(l), 'a')) for l in labels]) # TODO: create a nltk.corpus.writer framework with some initial CorpusWriter classes featx = lambda words: bag_of_words(norm_words(words)) def classify_write(words): feats = featx(words) probs = classifier.prob_classify(feats) label = probs.max() if probs.prob(label) >= args.threshold: label_files[label].write(join_words(words) + u'\n\n') if args.trace: print('classifying %s' % args.instances) if args.instances == 'paras':
# Python 2 classify-to-corpus fragment including the tail of the
# label_filename helper: derives "<target_corpus>/<label>.txt", creates the
# target directory on demand, opens per-label append-mode files, defines
# classify_write() (threshold-gated write to the winning label's file), and
# finally classifies each paragraph of source_corpus — chain(*para) flattens
# a paragraph's sentence word-lists into one word list.
# NOTE(review): truncated fragment collapsed onto one physical line — it
# starts mid-helper ("path = ..." / "return path" with no visible def) and
# the label_files handles are never closed in view.
path = os.path.join(args.target_corpus, '%s.txt' % label) if not os.path.exists(args.target_corpus): os.makedirs(args.target_corpus) if args.trace: print 'filename for category %s: %s' % (label, path) return path labels = classifier.labels() label_files = dict([(l, open(label_filename(l), 'a')) for l in labels]) # TODO: create a nltk.corpus.writer framework with some initial CorpusWriter classes featx = lambda words: bag_of_words(norm_words(words)) def classify_write(words): feats = featx(words) probs = classifier.prob_classify(feats) label = probs.max() if probs.prob(label) >= args.threshold: label_files[label].write(join_words(words) + u'\n\n') if args.trace: print 'classifying %s' % args.instances if args.instances == 'paras': for para in source_corpus.paras(): classify_write(list(itertools.chain(*para)))