def _scale_label_to_100(distribution):
    """Turn a label distribution into integer counts that sum to exactly 100.

    Each entry is first truncated to ``int(p * 100)``.  The rounding error is
    then repaired one unit at a time at randomly chosen positions: when the
    total is too low, only positions whose original probability is positive
    are incremented; when it is too high, only positions with a positive
    count are decremented.

    Args:
        distribution: sequence of numbers, indexable by class position.
            Positions are drawn from ``range(len(cats))``, where ``cats`` is
            the module-level class list defined elsewhere in this file.

    Returns:
        A list of ints summing to 100.

    Raises:
        ValueError: if no entry is positive.  The repair loop could never
            raise the total in that case and would spin forever.
    """
    if not any(p > 0.0 for p in distribution):
        raise ValueError(
            'cannot scale a distribution with no positive entries: %r'
            % (distribution,))
    counts = [int(p * 100) for p in distribution]
    total = sum(counts)  # kept in step with counts to avoid re-summing
    while total != 100:
        idx = random.randint(0, len(cats) - 1)
        if total < 100:
            if distribution[idx] > 0.0:
                counts[idx] += 1
                total += 1
        elif counts[idx] > 0:
            counts[idx] -= 1
            total -= 1
    return counts


def ten_fold(data):
    """Train a maxent classifier on one fold split and report mean KL divergence.

    ``data`` is split into a held-out test slice of ``len(data) // 10`` items
    and a training remainder.  Every training item is expanded into 100
    (features, class) pairs in proportion to its label distribution, a
    ``nltk.MaxentClassifier`` is trained on those pairs, and the predicted
    class distribution of each test item is compared with its true
    distribution via ``kldiv``.

    Args:
        data: sequence of items where ``item[0]`` is the feature value handed
            to the classifier and ``item[1]`` is a distribution over ``cats``
            (the module-level class list; ``cats`` and ``kldiv`` are defined
            elsewhere in this file).

    Returns:
        None.  The list of per-fold mean KL divergences is printed to stdout;
        progress messages go to stderr.
    """
    results = []
    # Floor division: identical to the original ``/`` on ints under Python 2,
    # and still an int under Python 3.
    fold_size = len(data) // 10
    # NOTE: only the first fold is evaluated; the full ten-fold run
    # (xrange(10)) was disabled in the original code.
    for iteration in [0]:
        start = iteration * fold_size
        end = start + fold_size
        test_set, test_distributions = [], []
        joint_features = []
        for i, item in enumerate(data):
            if start <= i < end:
                test_set.append(item[0])
                test_distributions.append(item[1])
                continue
            # Training example: replicate it once per percentage point of
            # each class so the classifier sees the whole distribution.
            label = _scale_label_to_100(item[1])
            for j, count in enumerate(label):
                for _ in range(count):
                    joint_features.append((item[0], cats[j]))

        sys.stderr.write('%s th iteration\n' % iteration)
        sys.stderr.write('%s training examples\n' % len(joint_features))

        # training
        # NOTE(review): the trained encoding is discarded and is not passed to
        # the classifier below; the call is kept only to preserve the
        # original behaviour. Confirm whether ``encoding=`` was intended.
        maxent.TypedMaxentFeatureEncoding.train(joint_features)
        # Single-argument print(...) behaves the same as a Python 2 print
        # statement and as the Python 3 print function.
        print('features encoded')
        classifier = nltk.MaxentClassifier.train(
            joint_features, algorithm='IIS', max_iter=2)

        # testing
        kl_stats = []
        for feats, true_dist in zip(test_set, test_distributions):
            # One slot per class (was a hard-coded 8).
            predicted = [0.] * len(cats)
            probdist = classifier.prob_classify(feats)
            for sample in probdist.samples():
                predicted[cats.index(sample)] = probdist.prob(sample)
            kl_stats.append(kldiv(true_dist, predicted))

        if kl_stats:
            results.append(float(sum(kl_stats)) / len(kl_stats))
        else:
            # Fewer than 10 items gives an empty test fold; skip it instead
            # of dividing by zero.
            sys.stderr.write(
                'no test examples in fold %s; skipping\n' % iteration)
    print(results)
def main():
    """Build or load the dataset, print KL-divergence statistics, run ten_fold.

    Execute with parameters <poem-directory> <comment-directory>.

    If the cache file ``data.list`` does not exist, every file in the comment
    directory is read, paired with the poem of the same name from
    ``PoemModel(<poem-directory>).poems``, labelled with ``get_label`` and the
    resulting ``(poem, label)`` list is pickled to ``data.list``.  Otherwise
    the list is loaded from that file and the command-line arguments are not
    read.

    Two statistics are then printed: the mean ``kldiv`` over all ordered
    pairs of labels, and the mean ``kldiv`` of each label against the
    element-wise sum of all labels.  Finally ``ten_fold(data)`` is run.
    """
    data_file_name = "data.list"
    if not isfile(data_file_name):
        if len(sys.argv) < 3:
            sys.exit("usage: %s <poem-directory> <comment-directory>"
                     % sys.argv[0])
        # Single-argument print(...) behaves the same as a Python 2 print
        # statement and as the Python 3 print function.
        print("Processing data...")
        data = []
        poems = PoemModel(sys.argv[1]).poems
        comment_dir = sys.argv[2]
        for name in sorted(os.listdir(comment_dir)):
            with open(os.path.join(comment_dir, name)) as comment_file:
                comment_lines = comment_file.readlines()
            poem = poems[name]
            # This check removes poems from consideration if they have fewer
            # than 10 comment lines - comment it out to consider all poems.
            if len(comment_lines) < 10:
                continue
            label = get_label(comment_lines)
            if label == 'empty':
                continue
            data.append((poem, label))
        with open(data_file_name, "wb") as cache_file:
            pickle.dump(data, cache_file)
    else:
        print("Loading data file...")
        # SECURITY: unpickling executes arbitrary code from the file; only
        # load a data.list that this script produced itself.
        with open(data_file_name, "rb") as cache_file:
            data = pickle.load(cache_file)

    # calculate some stats
    # Every ordered pair is compared, including each label with itself.
    pair_kl = [kldiv(l1, l2) for _, l1 in data for _, l2 in data]
    print("average klvalue of any pair of distributions %s"
          % np.mean(pair_kl))

    # Element-wise sum of all labels; this isn't normalized, but it doesn't
    # matter.
    dist_average = [sum(column)
                    for column in zip(*[label for _, label in data])]
    average_kl = [kldiv(label, dist_average) for _, label in data]
    print("average klvalue relative to average distribution %s"
          % np.mean(average_kl))

    ten_fold(data)