Пример #1
0
def _label_to_counts(distribution, categories):
    """Convert a distribution into integer counts that sum to exactly 100.

    Each entry is first truncated to ``int(p * 100)``.  The rounding error is
    then corrected one unit at a time at randomly drawn category indices:
    when the total is short, only entries whose original value is positive
    are incremented; when the total is over, only positive counts are
    decremented.

    Args:
        distribution: indexable sequence of numbers, one per category.
        categories: the category sequence; only its length is used, to bound
            the random index.

    Returns:
        A list of ints, the same length as ``distribution``, summing to 100.

    Raises:
        ValueError: if the total is below 100 and no drawable entry is
            positive.  The correction loop could never terminate in that
            case.
    """
    counts = [int(p * 100) for p in distribution]
    total = sum(counts)
    n_categories = len(categories)

    if total < 100 and not any(p > 0.0 for p in distribution[:n_categories]):
        raise ValueError(
            'cannot scale a distribution without positive entries to 100: %r'
            % (distribution,))

    # Track the running total instead of re-summing the list on every pass.
    while total != 100:
        idx = random.randint(0, n_categories - 1)
        if total < 100:
            if distribution[idx] > 0.0:
                counts[idx] += 1
                total += 1
        elif counts[idx] > 0:
            counts[idx] -= 1
            total -= 1
    return counts


def ten_fold(data, folds=(0,)):
    """Train and evaluate a maxent classifier on tenths of ``data``.

    ``data`` is split into ten equal contiguous slices of
    ``len(data) // 10`` items.  For every fold index in ``folds`` that slice
    is held out for testing and all remaining items are used for training.
    Because the classifier is trained on single labels, each training item
    is replicated 100 times, with the labels distributed according to the
    item's distribution (see ``_label_to_counts``).

    For each held-out item the predicted class probabilities are compared
    with the item's distribution using the module-level ``kldiv``
    (presumably a KL divergence -- it is defined outside this block).  The
    mean value per fold is appended to a result list, which is printed after
    every fold.  Progress messages go to stderr.

    Args:
        data: sequence of ``(featureset, distribution)`` pairs, where
            ``distribution`` is indexed in parallel with the module-level
            ``cats`` sequence.
        folds: iterable of fold indices to run.  Defaults to ``(0,)``, i.e.
            only the first tenth is evaluated; pass ``range(10)`` for a
            full ten-fold run.

    Raises:
        ValueError: if a training distribution cannot be scaled to 100.
        ZeroDivisionError: if a held-out slice is empty, e.g. when ``data``
            has fewer than ten items.
    """
    results = []
    # Floor division keeps the slice bounds integral.
    fold_size = len(data) // 10

    for iteration in folds:
        start = iteration * fold_size
        end = start + fold_size

        test_set, test_distributions, joint_features = [], [], []
        for i, item in enumerate(data):
            if start <= i < end:
                test_set.append(item[0])
                test_distributions.append(item[1])
                continue

            # Normalize the label to 100 integer counts and emit one
            # training example per count.
            counts = _label_to_counts(item[1], cats)
            for j, count in enumerate(counts):
                if count > 0:
                    joint_features.extend([(item[0], cats[j])] * count)

        sys.stderr.write('%s th iteration\n' % (iteration,))
        sys.stderr.write('%s training examples\n' % (len(joint_features),))

        # training
        # NOTE(review): this encoding is built but never handed to the
        # classifier below -- confirm whether it was meant to be passed as
        # ``encoding=features``.
        features = maxent.TypedMaxentFeatureEncoding.train(joint_features)
        print('features encoded')
        classifier = nltk.MaxentClassifier.train(
            joint_features, algorithm='IIS', max_iter=2)

        # testing
        kl_stats = []
        for featureset, expected in zip(test_set, test_distributions):
            # One slot per category; categories the classifier does not
            # report keep probability 0.
            classout = [0.] * len(cats)
            probdist = classifier.prob_classify(featureset)
            for sample in probdist.samples():
                classout[cats.index(sample)] = probdist.prob(sample)
            kl_stats.append(kldiv(expected, classout))

        results.append(float(sum(kl_stats)) / len(kl_stats))
        print(results)
Пример #2
0
def main():
    """Build or load the poem data set, print KL statistics, run ``ten_fold``.

    Execute with parameters ``<poem-directory> <comment-directory>``.

    If the cache file ``data.list`` does not exist, every file in the comment
    directory is read, paired with the poem of the same name from
    ``PoemModel(<poem-directory>).poems`` and labelled with ``get_label``.
    Poems with fewer than 10 comment lines, or whose label is ``'empty'``,
    are skipped.  The resulting list of ``(poem, label)`` pairs is pickled to
    ``data.list`` once all files have been processed.  Otherwise the list is
    loaded from the cache and the command-line arguments are not needed.

    The function then prints the mean ``kldiv`` value over all ordered pairs
    of labels and the mean ``kldiv`` value of each label against the
    (unnormalized) element-wise sum of all labels, and finally calls
    ``ten_fold`` on the data.
    """
    dataFileName = "data.list"
    if not isfile(dataFileName):
        if len(sys.argv) < 3:
            sys.exit("usage: %s <poem-directory> <comment-directory>"
                     % (sys.argv[0],))
        print("Processing data...")
        data = []
        poems = PoemModel(sys.argv[1]).poems
        comment_dir = sys.argv[2]
        for f in sorted(os.listdir(comment_dir)):
            with open(os.path.join(comment_dir, f)) as comment_file:
                comment_f = comment_file.readlines()
            poem = poems[f]

            # this line removes poems from consideration if they have fewer
            # than 10 comments - comment this out if you want to consider
            # all poems
            if len(comment_f) < 10:
                continue

            label = get_label(comment_f)
            if label == 'empty':
                continue
            data.append((poem, label))

        # Write the cache once, after the whole directory has been processed,
        # so an interrupted run cannot leave a partial cache behind.  As
        # before, nothing is cached when no poem qualified.
        if data:
            with open(dataFileName, "wb") as cache:
                pickle.dump(data, cache)
    else:
        print("Loading data file...")
        # NOTE: unpickling can execute arbitrary code; data.list must be a
        # trusted cache produced by this script.
        with open(dataFileName, "rb") as cache:
            data = pickle.load(cache)

    # calculate some stats
    klvalues = [kldiv(l1, l2) for _, l1 in data for _, l2 in data]
    print("average klvalue of any pair of distributions %s"
          % (np.mean(klvalues),))

    # this isn't normalized, but it doesn't matter
    distAverage = [sum(column) for column in zip(*[label for _, label in data])]
    klvalues = [kldiv(l1, distAverage) for _, l1 in data]
    print("average klvalue relative to average distribution %s"
          % (np.mean(klvalues),))

    ten_fold(data)