def trainMaxentropy(trainFeatures, trainLabels):
    """Cross-validate and fit a maximum-entropy classification pipeline.

    The pipeline chains ``DictVectorizer(sparse=True)`` with
    ``MaxentClassifier(encoding=None, weights=0)``. It is scored with
    5-fold cross-validation and then fitted on the complete training set.

    Args:
        trainFeatures: Training samples handed to the pipeline. Presumably
            a sequence of feature dicts, since ``DictVectorizer`` is the
            first step -- confirm against the caller.
        trainLabels: Labels passed alongside ``trainFeatures``.

    Returns:
        A ``(clf, mean_score, scores)`` tuple: the pipeline after fitting
        on all training data, the mean of the cross-validation scores, and
        the per-fold scores themselves.
    """
    # NOTE(review): make_pipeline / cross_val_score look like the
    # scikit-learn helpers, which expect the final pipeline step to expose
    # the estimator API (fit/predict). Confirm that the MaxentClassifier
    # in scope at module level satisfies that contract.
    clf = make_pipeline(
        DictVectorizer(sparse=True),
        MaxentClassifier(encoding=None, weights=0),
    )

    # Evaluate first, then fit the pipeline that is handed back.
    scores = cross_val_score(clf, trainFeatures, trainLabels, cv=5)
    clf.fit(trainFeatures, trainLabels)
    return clf, scores.mean(), scores
# /usr/bin/env python3 # -*- coding:utf-8 -*- import sys import math from collections import defaultdict from nltk import MaxentClassifier # play outlook temperature humidity windy maxent = MaxentClassifier() class MaxEnt: def __init__(self): self._samples = [] # 样本集, 元素是[y,x1,x2,...,xn]的元组 self._Y = set([]) # 标签集合,相当于去重之后的y self._numXY = defaultdict(int) # Key是(xi,yi)对,Value是count(xi,yi) self._N = 0 # 样本数量 self._n = 0 # 特征对(xi,yi)总数量 self._xyID = {} # 对(x,y)对做的顺序编号(ID), Key是(xi,yi)对,Value是ID self._C = 0 # 样本最大的特征数量,用于求参数时的迭代,见IIS原理说明 self._ep_ = [] # 样本分布的特征期望值 self._ep = [] # 模型分布的特征期望值 self._w = [] # 对应n个特征的权值 self._lastw = [] # 上一轮迭代的权值 self._EPS = 0.01 # 判断是否收敛的阈值 def load_data(self, filename): for line in open(filename, "r"): sample = line.strip().split("\t") if len(sample) < 2: # 至少:标签+一个特征