def printgeg():
    """Print a few statistics about a small corpus.

    Loads ``corpus(5)`` and prints, each preceded by its label, the
    type/token ratio (number of distinct items divided by the total number
    of items) and the total number of items.

    NOTE(review): ``corpus`` is not defined in this block; it presumably
    comes from one of the module's star imports and is assumed to return a
    flat sequence of hashable word tokens -- confirm.
    """
    wrds = corpus(5)
    token_count = len(wrds)
    type_count = len(set(wrds))

    # Single-argument print(...) prints the same text as the print statement
    # under Python 2 and also parses under Python 3.
    print("type/token ratio vijf auteurs:")
    # Type/token ratio is distinct words over all words, not the reverse.
    # An empty corpus reports 0.0 instead of raising ZeroDivisionError.
    if token_count:
        tt_ratio = float(type_count) / float(token_count)
    else:
        tt_ratio = 0.0
    print(tt_ratio)

    print("documentlengte/aantal woorden:")
    print(token_count)
# Import order is kept as-is: the star imports and the two ``time`` imports
# rebind names, so whichever line comes last wins.
from classifier import train, p_feat_cat, p_feature
from features import *
from help_functions import *
import time
from nltk import NaiveBayesClassifier
import nltk
import operator
import pickle
import datetime
from math import *
import winsound
from time import time, sleep
import webbrowser
import os

# Module-level setup: runs at import time.
# ``corpus`` presumably comes from one of the star imports above -- confirm.
corp = corpus(50)
# Lemmatization step, currently disabled:
# corp = lemmatize_corpus(corp0)
# writetofile(corp,"lemmatized_corpus.pkl")

# NOTE: this rebinds ``compactcorpus`` from the callable to its result, so the
# callable is no longer reachable under this name afterwards.
compactcorpus = compactcorpus(corp)
print("corpus build")
# Keys of the compact corpus (presumably one per author -- confirm).
authors = compactcorpus.keys()


def document_features(document):
    """Return presence features of ``document`` for every known feature word.

    For each entry ``w`` of the module-level ``word_features`` (defined
    outside this block), the result maps ``w`` to the boolean
    ``w in document``.

    :param document: any object supporting the ``in`` operator.
    :return: dict mapping each feature word to True/False.
    """
    return dict((token, token in document) for token in word_features)