class Labeler:
    """Assign a role tag to every token of a POS-tagged, chunked sentence.

    Training sentences are lists of ``(word, pos, chunk, role)`` tuples.
    A word whose first character is ``*`` is treated as the predicate of
    the sentence.  Role tags are handled by prefix: ``B``/``I``/``E``/``EB``
    followed by ``-<label>``, or ``O``.

    Feature keys produced by this class changed when the look-ahead chunk
    and predicate features were corrected, so previously trained weights
    have to be re-trained.
    """

    # Suffix tables used by _normalize to collapse open word classes.
    _NUM_SUFFIXES = tuple("一 二 三 四 五 六 七 八 九 十 百 千 万 亿 两".split(" "))
    _CAL_SUFFIXES = ("年", "月", "日", "年代")
    _LOC_SUFFIXES = ("省", "市", "区", "州", "县", "镇", "乡", "街")

    def __init__(self):
        """Create an untrained labeler."""
        # word -> role shortcuts built by _make_stw (train/tag do not consult it)
        self.single_tag_words = {}
        self.chk_set = set()
        # Initialised here so tag2 does not fail on an untrained instance.
        self.role_set = set()
        self.perceptron = Perceptron()

    def train(self, sents, niter):
        """Train the perceptron for ``niter`` passes over ``sents``.

        ``sents`` is an iterable of sentences, each a sequence of
        ``(word, pos, chunk, role)`` tuples.  One progress line is printed
        per pass and the weights are averaged at the end.
        """
        self.perceptron.reset()
        # Private copy: the per-pass shuffle must not reorder the caller's list.
        roled_sents = list(sents)
        self.role_set = set(role for sent in roled_sents for (_w, _t, _c, role) in sent)
        self.perceptron.tag_set = self.role_set
        for iteration in range(niter):
            ncorrect = 0
            ntotal = 0
            for sent in roled_sents:
                sent = [(self._normalize(word), tag, chk, role) for (word, tag, chk, role) in sent]
                for idx, (word, tag, chk, role) in enumerate(sent):
                    features = self._get_features(idx, sent)
                    pred = self.perceptron.predict(features)
                    self.perceptron.update(role, pred, features)
                    ncorrect += pred == role
                    ntotal += 1
            random.shuffle(roled_sents)
            # float() keeps the ratio meaningful under Python 2 integer division.
            precision = float(ncorrect) / ntotal if ntotal else 0.0
            print("iteration #{0}, {1}/{2}=precision: {3}".format(iteration, ncorrect, ntotal, precision))
        self.perceptron.average_weights()

    def _make_stw(self, chked_sents):
        """Fill ``single_tag_words`` with words that (almost) always get one role."""
        counts = defaultdict(lambda: defaultdict(int))
        for sent in chked_sents:
            for word, _tag, _chunk, role in sent:
                counts[self._normalize(word)][role] += 1
        threshold = 0.95
        freqthres = 15
        for word, tag_freqs in counts.items():
            role, freq = max(tag_freqs.items(), key=lambda item: item[1])
            total = sum(tag_freqs.values())
            # float(): with integer division the ratio test only passed at 100%.
            frequent = freq >= freqthres and float(freq) / total >= threshold
            unanimous = freq == total and total >= 3
            if frequent or unanimous:
                self.single_tag_words[word] = role

    def _normalize(self, word):
        """Map number-, date- and place-like words to NUM / CAL / LOCATION."""
        if word.endswith(self._NUM_SUFFIXES):
            return "NUM"
        if word.endswith(self._CAL_SUFFIXES):
            return "CAL"
        if word.endswith(self._LOC_SUFFIXES):
            return "LOCATION"
        return word

    def _make_features(self, current, prev1, prev2, fword1, fword2):
        """Build the feature dict used by ``tag2``.

        ``current``, ``prev1`` and ``prev2`` are ``(word, pos, chunk, role)``;
        ``fword1`` and ``fword2`` are ``(word, pos, chunk)`` for the next two
        tokens.  Returns a ``defaultdict(int)`` whose present keys map to 1.
        """
        def add(name, *args):
            features["_".join((name,) + tuple(args))] = 1

        word, tag, chk, role = current
        pword1, ptag1, pchk1, prole1 = prev1
        pword2, ptag2, pchk2, prole2 = prev2
        fword1, ftag1, fchk1 = fword1
        fword2, ftag2, fchk2 = fword2
        features = defaultdict(int)
        # NOTE(review): the flattened source does not show how far this `if`
        # extends; it is read here as guarding only the chunk feature.
        if chk != "VP":
            add("i chunk", chk)
        add("bias")
        add("i word", word)
        add("i tag", tag)
        add("i-1 word", pword1)
        add("i-1 tag", ptag1)
        add("i-1 role", prole1)
        add("i-1 chunk", pchk1)
        add("i-2 tag", ptag2)
        add("i-2 word", pword2)
        add("i-2 role", prole2)
        add("i-2 chunk", pchk2)
        add("i+1 word", fword1)
        add("i+1 tag", ftag1)
        # Look-ahead chunks: previously these re-used pchk1 / pchk2.
        add("i+1 chunk", fchk1)
        add("i+2 word", fword2)
        add("i+2 tag", ftag2)
        add("i+2 chunk", fchk2)
        if prole1[0] == "E" or prole1[0] == "O":
            add("out role")
        elif prole1[0] == "I" or prole1[0] == "B":
            add("in role", prole1.split("-")[1])
        return features

    def _get_features(self, i, sent):
        """Build the feature dict for token ``i`` of ``sent``.

        ``sent`` is a sequence of ``(word, pos, chunk, role)`` items; only the
        roles of tokens before ``i`` are read.  Returns a ``defaultdict(int)``
        whose present keys map to 1.
        """
        def add(name, *args):
            features["_".join((name,) + tuple(args))] = 1

        def strip_star(token):
            return token[1:] if token.startswith("*") else token

        def pretag(j):
            # POS of token j, or the word itself when its POS is "PP".
            if j < 0:
                return "START1_TAG"
            if j >= len(sent):
                return "END1_TAG"
            w, pos, _chk, _role = sent[j]
            return w if pos == "PP" else pos

        def prechk(j):
            # Chunk of token j with sentinels for up to two positions outside.
            if j == -2:
                return "START2_CHK"
            if j == -1:
                return "START1_CHK"
            if j == len(sent):
                return "END1_CHK"
            if j == len(sent) + 1:
                return "END2_CHK"
            return sent[j][2]

        n = len(sent)
        pword1, ptag1, pchk1, prole1 = (
            ("START1_WORD", "START1_TAG", "START1_CHK", "START1_ROLE") if i <= 0 else sent[i - 1]
        )
        pword2, ptag2, pchk2, prole2 = (
            ("START2_WORD", "START2_TAG", "START2_CHK", "START2_ROLE") if i <= 1 else sent[i - 2]
        )
        raw_word, tag, chk, role = sent[i]
        fword1, ftag1, fchk1, frole1 = (
            ("END1_WORD", "END1_TAG", "END1_CHK", "END1_ROLE") if i >= n - 1 else sent[i + 1]
        )
        fword2, ftag2, fchk2, frole2 = (
            ("END2_WORD", "END2_TAG", "END2_CHK", "END2_ROLE") if i >= n - 2 else sent[i + 2]
        )

        # Decide predicate-ness on the raw word: once the "*" is stripped the
        # marker is gone, which previously made the predicate branch dead.
        is_predicate = raw_word.startswith("*")
        pword1 = strip_star(pword1)
        pword2 = strip_star(pword2)
        word = strip_star(raw_word)
        fword1 = strip_star(fword1)
        fword2 = strip_star(fword2)

        features = defaultdict(int)
        has_a0 = any("A0" in sent[j][3] for j in range(i))
        add("i has-A0" if has_a0 else "i No-A0")
        add("i pos", str(i))

        if is_predicate:
            add("i is-predicate")
            pred_pos = i
        else:
            # Last "*"-marked token wins; defaults to 0 when none is marked.
            pred_pos = 0
            for j in range(n):
                if sent[j][0].startswith("*"):
                    pred_pos = j

        add("i before" if pred_pos < i else "i after")
        if i < pred_pos:
            span, between = range(i, pred_pos + 1), range(i + 1, pred_pos)
        else:
            span, between = range(pred_pos, i + 1), range(pred_pos + 1, i)

        # Path from the token to the predicate: own word, chunks in between,
        # predicate POS at the far end.
        path = []
        for j in span:
            if j == i:
                path.append(word)
            elif j == pred_pos:
                path.append(sent[j][1])
            else:
                path.append(sent[j][2])
        nbp = nvp = nnp = 0
        for j in between:
            chunk_j = sent[j][2]
            if chunk_j != "O":
                nbp += 1
            if chunk_j == "VP":
                nvp += 1
            if chunk_j == "NP":
                nnp += 1
        add("i path", "-".join(path))
        add("i D-BP", str(nbp))
        add("i D-VP", str(nvp))
        add("i D-NP", str(nnp))

        pre_word, pre_pos, _pre_chk, _pre_role = sent[pred_pos]
        add("pred", strip_star(pre_word))
        add("pred-tag", pre_pos)
        add("pred-before-tag", pretag(pred_pos - 1))
        add("pred-after-tag", pretag(pred_pos + 1))
        add("pred pos", str(pred_pos))
        add("pred-1 bp", prechk(pred_pos - 1))
        add("pred-2 bp", prechk(pred_pos - 2))
        add("pred+1 bp", prechk(pred_pos + 1))
        add("pred+2 bp", prechk(pred_pos + 2))
        if i == 0:
            add("i begin")
        if i == n - 1:
            add("i end")
        add("i chunk", chk)
        add("bias")
        add("i word", word)
        add("i tag", tag)
        add("i suffix2", word[-6:])
        add("i suffix1", word[-3:])
        add("i-1 word", pword1)
        add("i-1 tag", ptag1)
        add("i-1 role", prole1)
        add("i-1 chunk", pchk1)
        add("i-2 tag", ptag2)
        add("i-2 word", pword2)
        add("i-2 role", prole2)
        add("i-2 chunk", pchk2)
        add("i+1 word", fword1)
        add("i+1 tag", ftag1)
        # Look-ahead chunks: previously these re-used pchk1 / pchk2.
        add("i+1 chunk", fchk1)
        add("i+2 word", fword2)
        add("i+2 tag", ftag2)
        add("i+2 chunk", fchk2)
        if prole1[0] == "E" or prole1[0] == "O":
            add("out role")
        elif prole1[0] == "I" or prole1[0] == "B":
            add("in role", prole1[2:])
        return features

    @staticmethod
    def _open_label(seq, idx, col):
        """Return the label of the nearest preceding ``B`` item.

        Scans backwards from ``idx - 1`` and stops at index 0 without testing
        it, so index 0 is used when no ``B`` item is found after it.
        """
        j = idx - 1
        while j > 0:
            if seq[j][col][0] == "B":
                break
            j -= 1
        return seq[j][col][2:]

    def tag(self, tagged_sent):
        """Label ``tagged_sent`` (``(word, pos, chunk)`` triples) greedily.

        Returns a list of ``[normalized_word, pos, chunk, role]`` lists.  The
        predicate is forced to ``"E-V"``; afterwards the B/I/E sequence is
        repaired so that every opened bracket is closed.
        """
        roled = [[self._normalize(word), tag, chk, None] for word, tag, chk in tagged_sent]
        for idx in range(len(roled)):
            features = self._get_features(idx, roled)
            if features["i is-predicate"] == 1:
                pred = "E-V"
            else:
                pred = self.perceptron.predict(features)
            roled[idx][3] = pred

        in_bracket = False
        last = len(roled) - 1
        for idx, (word, tag, chunk, role) in enumerate(roled):
            if role[0] == "B" or (role[0] == "E" and role[1] == "B"):
                if in_bracket:
                    if role[0] == "E":
                        # "EB" inside an open bracket: close the bracket here.
                        roled[idx][3] = "E-" + self._open_label(roled, idx, 3)
                        in_bracket = False
                    else:
                        roled[idx][3] = "I-" + roled[idx - 1][3][2:]
                elif role[0] != "E":
                    in_bracket = True
            elif role[0] == "E":
                if in_bracket:
                    roled[idx][3] = "E-" + self._open_label(roled, idx, 3)
                    in_bracket = False
                else:
                    # Closing tag without an opener: single-token span.
                    roled[idx][3] = "EB-" + roled[idx][3][2:]
            else:
                if in_bracket:
                    if idx == last:
                        roled[idx][3] = "E-" + self._open_label(roled, idx, 3)
                    else:
                        roled[idx][3] = "I-" + roled[idx - 1][3][2:]
                else:
                    roled[idx][3] = "O"
            if in_bracket and idx == last:
                # Sentence ended with the bracket still open.
                roled[idx][3] = "E-" + self._open_label(roled, idx, 3)

        for idx, (word, tag, chunk, role) in enumerate(roled):
            if role.startswith("EB"):
                roled[idx][3] = "E" + role[2:]
        return roled

    def tag2(self, sent):
        """Experimental dynamic-programming decoder over ``role_set``.

        NOTE(review): the back-pointer bookkeeping below is kept exactly as
        it was; the final nested loop simply keeps the entry of the last
        (j, k) pair visited.  Confirm the intended recurrence before use.
        """
        tagged = [[self._normalize(word), pos, chk, None] for word, pos, chk in sent]
        nword = len(sent)
        ntag = len(self.role_set)
        pi = [[[[0, None, None] for k in range(ntag)] for j in range(ntag)] for i in range(nword)]
        for i, (word, tag, chk, role) in enumerate(tagged):
            pword1, ptag1, pchk1 = ("START1_WORD", "START1_TAG", "START1_CHK") if i <= 0 else tagged[i - 1][:3]
            pword2, ptag2, pchk2 = ("START2_WORD", "START2_TAG", "START2_CHK") if i <= 1 else tagged[i - 2][:3]
            word, tag, chk = tagged[i][:3]
            fword1, ftag1, fchk1 = ("END1_WORD", "END1_TAG", "END1_CHK") if i >= len(tagged) - 1 else tagged[i + 1][:3]
            fword2, ftag2, fchk2 = ("END2_WORD", "END2_TAG", "END2_CHK") if i >= len(tagged) - 2 else tagged[i + 2][:3]
            for j, u in enumerate(self.role_set):
                prole2 = "START2_ROLE" if i <= 0 else u
                for k, v in enumerate(self.role_set):
                    prole1 = "START1_ROLE" if i <= 1 else v
                    for t, role in enumerate(self.role_set):
                        score = 0 if i <= 0 else pi[i - 1][t][j][0]
                        score += self.perceptron.get_score(
                            self._make_features(
                                (word, tag, chk, role),
                                (pword1, ptag1, pchk1, prole1),
                                (pword2, ptag2, pchk2, prole2),
                                (fword1, ftag1, fchk1),
                                (fword2, ftag2, fchk2),
                            ),
                            role,
                        )
                        if score > pi[i][j][k][0]:
                            pi[i][j][k][0] = score
                            pi[i][j][k][1] = role
                            pi[i][j][k][2] = t
        i = len(tagged) - 1
        t, j = None, None
        for j, u in enumerate(self.role_set):
            for k, v in enumerate(self.role_set):
                tag, t = pi[i][j][k][1:3]
        tagged[i][3] = tag
        i -= 1
        while i >= 0:
            tagged[i][3] = pi[i][t][j][1]
            j = t
            t = pi[i][t][j][2]
            i -= 1
        printc(tagged)
        return tagged

    def evaluate(self, roled_sents, out_path="test.props.txt"):
        """Label ``roled_sents``, write the predictions and report precision.

        Each sentence is written to ``out_path`` as tab-separated
        ``word, pos, chunk, role`` lines followed by a blank line.  Returns
        one record per sentence that contains at least one wrong role; wrong
        tokens carry the predicted role in ``【】`` as a fifth field.
        """
        ntotal = 0
        ncorrect = 0
        faults = []
        faults_count = defaultdict(int)
        # `with` closes the file even if tagging raises part-way through.
        with open(out_path, "w") as out:
            for roled_sent in roled_sents:
                tagged_sent = [(word, tag, chunk) for (word, tag, chunk, role) in roled_sent]
                roled = self.tag(tagged_sent)
                for idx, (word, tag, chunk, role) in enumerate(roled):
                    # Write the original word, without the predicate marker.
                    thword = tagged_sent[idx][0]
                    if thword[0] == "*":
                        thword = thword[1:]
                    out.write("%s\t%s\t%s\t%s\n" % (thword, tag, chunk, role))
                out.write("\n")

                has_false = False
                record = []
                for idx, (word, tag, chunk, role) in enumerate(roled_sent):
                    ntotal += 1
                    predicted = roled[idx][3]
                    if role == predicted:
                        ncorrect += 1
                        record.append((word, tag, chunk, role))
                    else:
                        has_false = True
                        record.append((word, tag, chunk, role, "【" + predicted + "】"))
                        faults_count[role + " is roled as " + predicted] += 1
                if has_false:
                    faults.append(record)
        # 100.0 forces true division; an empty corpus reports 0.0 instead of raising.
        precision = 100.0 * ncorrect / ntotal if ntotal else 0.0
        print("precision: {0} %".format(precision))
        return faults
class Chunker:
    """Assign a chunk tag to every token of a POS-tagged sentence.

    Training sentences are lists of ``(word, pos, chunk)`` tuples.  Chunk
    tags are handled by prefix: ``B``/``I``/``E``/``EB`` followed by
    ``-<type>``, or ``O``.

    Feature keys produced by ``_get_features`` changed when the ``i-1``
    suffix and ``i-2`` word features were corrected, so previously trained
    weights have to be re-trained.
    """

    # Suffix tables used by _normalize to collapse open word classes.
    _NUM_SUFFIXES = tuple('一 二 三 四 五 六 七 八 九 十 百 千 万 亿 两'.split(' '))
    _CAL_SUFFIXES = ('年', '月', '日', '年代')
    _LOC_SUFFIXES = ('省', '市', '区', '州', '县', '镇', '乡', '街')

    def __init__(self):
        """Create an untrained chunker."""
        # word -> chunk shortcuts that bypass the perceptron
        self.single_tag_words = {}
        self.chk_set = set()
        self.perceptron = Perceptron()

    def train(self, sents, niter):
        """Train the perceptron for ``niter`` passes over ``sents``.

        Words listed in ``single_tag_words`` (rebuilt here) are not used for
        perceptron updates.  One progress line is printed per pass and the
        weights are averaged at the end.
        """
        self.perceptron.reset()
        # Private copy: the per-pass shuffle must not reorder the caller's list.
        chked_sents = list(sents)
        self._make_stw(chked_sents)
        self.chk_set = set(chunk for sent in chked_sents for (_w, _t, chunk) in sent)
        self.perceptron.tag_set = self.chk_set
        for iteration in range(niter):
            ncorrect = 0
            ntotal = 0
            for sent in chked_sents:
                sent = [(self._normalize(word), tag, chk) for (word, tag, chk) in sent]
                for idx, (word, tag, chk) in enumerate(sent):
                    pred = self.single_tag_words.get(word)
                    if not pred:
                        features = self._get_features(idx, sent)
                        pred = self.perceptron.predict(features)
                        self.perceptron.update(chk, pred, features)
                    ncorrect += pred == chk
                    ntotal += 1
            random.shuffle(chked_sents)
            # float() keeps the ratio meaningful under Python 2 integer division.
            precision = float(ncorrect) / ntotal if ntotal else 0.0
            print("iteration #{0}, {1}/{2}=precision: {3}".format(iteration, ncorrect, ntotal, precision))
        self.perceptron.average_weights()

    def _make_stw(self, chked_sents):
        """Fill ``single_tag_words`` with words that (almost) always get one chunk tag."""
        counts = defaultdict(lambda: defaultdict(int))
        for sent in chked_sents:
            for word, _tag, chunk in sent:
                counts[self._normalize(word)][chunk] += 1
        threshold = 0.95
        freqthres = 10
        for word, tag_freqs in counts.items():
            chunk, freq = max(tag_freqs.items(), key=lambda item: item[1])
            total = sum(tag_freqs.values())
            # float(): with integer division the ratio test only passed at 100%.
            frequent = freq >= freqthres and float(freq) / total >= threshold
            unanimous = freq == total and total >= 3
            if frequent or unanimous:
                self.single_tag_words[word] = chunk

    def _normalize(self, word):
        """Map number-, date- and place-like words to NUM / CAL / LOCATION."""
        if word.endswith(self._NUM_SUFFIXES):
            return 'NUM'
        if word.endswith(self._CAL_SUFFIXES):
            return 'CAL'
        if word.endswith(self._LOC_SUFFIXES):
            return 'LOCATION'
        return word

    def _get_features(self, i, sent):
        """Build the feature dict for token ``i`` of ``sent``.

        ``sent`` is a sequence of ``(word, pos, chunk)`` items; only the
        chunks of tokens before ``i`` are read.  Returns a
        ``defaultdict(int)`` whose present keys map to 1.
        """
        def add(name, *args):
            features['_'.join((name, ) + tuple(args))] = 1

        n = len(sent)
        pword1, ptag1, pchk1 = ('START1_WORD', 'START1_TAG', 'EB-START1_CHK') if i <= 0 else sent[i-1]
        pword2, ptag2, pchk2 = ('START2_WORD', 'START2_TAG', 'EB-START2_CHK') if i <= 1 else sent[i-2]
        word, tag, chk = sent[i]
        fword1, ftag1, fchk1 = ('END1_WORD', 'END1_TAG', 'EB-END1_CHK') if i >= n-1 else sent[i+1]
        fword2, ftag2, fchk2 = ('END2_WORD', 'END2_TAG', 'EB-END2_CHK') if i >= n-2 else sent[i+2]

        features = defaultdict(int)
        add('bias')
        add('i word', word)
        add('i tag', tag)
        add('i tag prefix', tag[0])
        add('i suffix1', word[-3:])
        # Suffix of the previous word (previously the current word was used).
        add('i-1 suffix1', pword1[-3:])
        add('i-1 word', pword1)
        add('i-1 tag', ptag1)
        add('i-1 tag prefix', ptag1[0])
        add('i-1 i word', pword1, word)
        add('i-i i pos', ptag1, tag)
        add('i i+1 pos', tag, ftag1)
        add('i i+1 i+2 pos', tag, ftag1, ftag2)
        add('i+1 word i pos', fword1, tag)
        add('i-1 word i pos', pword1, tag)
        add('i-1 pos i+1 pos', ptag1, ftag1)
        add('i pos i+2 pos', tag, ftag2)
        # Word two positions back (previously the POS tag was used).
        add('i-2 word', pword2)
        add('i-2 tag', ptag2)
        add('i-2 tag prefix', ptag2[0])
        add('i+1 word', fword1)
        add('i+1 tag', ftag1)
        add('i+1 tag prefix', ftag1[0])
        add('i+2 word', fword2)
        add('i+2 tag', ftag2)
        add('i+2 tag prefix', ftag2[0])
        add('i-1 tag i tag i+1 tag', ptag1, tag, ftag1)
        add('i-1 tag i word i+1 tag', ptag1, word, ftag1)
        if pchk1[0] == 'E' or pchk1[0] == 'O':
            add('out chunk')
        elif pchk1[0] == 'I' or pchk1[0] == 'B':
            add('in chunk', pchk1[2:])
        # Type of the most recently closed chunk to the left.
        # NOTE(review): the flattened source does not show whether this loop
        # belongs to the `elif` above; it is placed at function level here.
        for j in range(i-1, -1, -1):
            if sent[j][2][0] == 'E':
                add('before chunk', sent[j][2].split('-')[1])
                break
        if i == 0:
            add('i begin')
        elif i == n-1:
            add('i end')
        return features

    @staticmethod
    def _open_label(seq, idx, col):
        """Return the type of the nearest preceding ``B`` item.

        Scans backwards from ``idx - 1`` and stops at index 0 without testing
        it, so index 0 is used when no ``B`` item is found after it.
        """
        j = idx - 1
        while j > 0:
            if seq[j][col][0] == 'B':
                break
            j -= 1
        return seq[j][col][2:]

    @staticmethod
    def _report_verb(word, tag, label, code=None):
        """Print a diagnostic when a verb is forced into a non-verb chunk."""
        if tag[0] == 'V' and label[0] != 'V':
            prefix = '' if code is None else '{0} '.format(code)
            print('{0}{1} {2} to {3}'.format(prefix, word, tag, label))

    def tag(self, tagged_sent):
        """Chunk ``tagged_sent`` (``(word, pos)`` pairs) greedily.

        Returns a list of ``[normalized_word, pos, chunk]`` lists.  After
        prediction the B/I/E sequence is repaired so every opened chunk is
        closed, and verbs that ended up inside a non-verb chunk are split out
        as ``EB-VP``.
        """
        chked = [[self._normalize(word), tag, None] for word, tag in tagged_sent]
        for idx, (word, tag, chunk) in enumerate(chked):
            pred = self.single_tag_words.get(word)
            if not pred:
                features = self._get_features(idx, chked)
                pred = self.perceptron.predict(features)
            chked[idx][2] = pred

        in_bracket = False
        last = len(chked) - 1
        for idx, (word, tag, chunk) in enumerate(chked):
            if chunk[0] == 'B' or (chunk[0] == 'E' and chunk[1] == 'B'):
                if in_bracket:
                    if chunk[0] == 'E':
                        # "EB" inside an open chunk: close the chunk here.
                        label = self._open_label(chked, idx, 2)
                        chked[idx][2] = 'E-' + label
                        self._report_verb(word, tag, label, 1)
                        in_bracket = False
                    else:
                        label = chked[idx-1][2][2:]
                        chked[idx][2] = 'I-' + label
                        self._report_verb(word, tag, label, 2)
                elif chunk[0] != 'E':
                    in_bracket = True
            elif chunk[0] == 'E':
                if in_bracket:
                    label = self._open_label(chked, idx, 2)
                    chked[idx][2] = 'E-' + label
                    self._report_verb(word, tag, label, 3)
                    in_bracket = False
                else:
                    # Closing tag without an opener: single-token chunk.
                    chked[idx][2] = 'EB-' + chked[idx][2][2:]
            else:
                if in_bracket:
                    if idx == last:
                        label = self._open_label(chked, idx, 2)
                        chked[idx][2] = 'E-' + label
                        self._report_verb(word, tag, label, 4)
                    else:
                        label = chked[idx-1][2][2:]
                        chked[idx][2] = 'I-' + label
                        self._report_verb(word, tag, label, 5)
                else:
                    chked[idx][2] = 'O'
            if in_bracket and idx == last:
                # Sentence ended with the chunk still open.
                label = self._open_label(chked, idx, 2)
                chked[idx][2] = 'E-' + label
                self._report_verb(word, tag, label)

        # Pull verbs out of non-verb chunks, re-closing / re-opening neighbours.
        for idx, (word, tag, chunk) in enumerate(chked):
            if tag[0] == 'V' and chunk != 'O' and chunk.split('-')[1][0] != 'V':
                if idx != 0:
                    left = chked[idx-1][2]
                    if left[0] == 'I':
                        chked[idx-1][2] = 'E' + left[1:]
                    elif left[0] == 'B':
                        chked[idx-1][2] = 'E' + left
                if idx != last:
                    right = chked[idx+1][2]
                    if right[0] == 'I':
                        chked[idx+1][2] = 'B' + right[1:]
                    elif right[0] == 'E' and right[1] == '-':
                        # Rebuild from the right neighbour's own tag; the
                        # left neighbour was used before, which wrapped to
                        # the last token when idx == 0.
                        chked[idx+1][2] = 'EB' + right[1:]
                chked[idx][2] = 'EB-VP'

        for idx, (word, tag, chunk) in enumerate(chked):
            if tag[0] == 'V' and chunk != 'O' and chunk.split('-')[1][0] != 'V':
                print('{0} {1} in {2}'.format(word, tag, chunk))
        return chked

    def tag2(self, sent):
        """Experimental dynamic-programming decoder over ``chk_set``.

        NOTE(review): this calls ``self._make_features``, which this class
        does not define, and the START1/START2 sentinels look swapped; the
        body is kept exactly as it was.  Confirm before use.
        """
        tagged = [[self._normalize(word), None] for word in sent]
        nword = len(sent)
        ntag = len(self.chk_set)
        pi = [[[[0, None, None] for k in range(ntag)] for j in range(ntag)] for i in range(nword)]
        for i, (word, tag) in enumerate(tagged):
            pword1 = 'START1_WORD' if i <= 0 else tagged[i-1][0]
            pword2 = 'START2_WORD' if i <= 1 else tagged[i-2][0]
            fword1 = 'END1_WORD' if i >= len(sent)-1 else tagged[i+1][0]
            fword2 = 'END2_WORD' if i >= len(sent)-2 else tagged[i+2][0]
            for j, u in enumerate(self.chk_set):
                ptag2 = 'START1_TAG' if i <= 0 else u
                for k, v in enumerate(self.chk_set):
                    ptag1 = 'START2_TAG' if i <= 1 else v
                    for t, tag in enumerate(self.chk_set):
                        score = 0 if i <= 0 else pi[i-1][t][j][0]
                        score += self.perceptron.get_score(
                            self._make_features((word, tag), (pword1, ptag1), (pword2, ptag2), fword1, fword2),
                            tag)
                        if score > pi[i][j][k][0]:
                            pi[i][j][k][0] = score
                            pi[i][j][k][1] = tag
                            pi[i][j][k][2] = t
        i = len(tagged)-1
        t, j = None, None
        for j, u in enumerate(self.chk_set):
            for k, v in enumerate(self.chk_set):
                tag, t = pi[i][j][k][1:3]
        tagged[i][1] = tag
        i -= 1
        while i >= 0:
            tagged[i][1] = pi[i][t][j][1]
            j = t
            t = pi[i][t][j][2]
            i -= 1
        printc(tagged)
        return tagged

    def evaluate(self, chked_sents, log=False, out_path='test.pos-chk.iob'):
        """Chunk ``chked_sents``, write the predictions and report precision.

        Each sentence is written to ``out_path`` as tab-separated
        ``pos, chunk`` lines followed by a blank line.  When ``log`` is true,
        returns one record per sentence that contains a wrong chunk (wrong
        tokens carry the predicted chunk in ``【】``) and prints the confusion
        counts; otherwise the returned list is empty.
        """
        ntotal = 0
        ncorrect = 0
        faults = []
        faults_count = defaultdict(int)
        # `with` closes the file even if tagging raises part-way through.
        with open(out_path, 'w') as out:
            for chked_sent in chked_sents:
                tagged_sent = [(word, tag) for (word, tag, chunk) in chked_sent]
                chked = self.tag(tagged_sent)
                for word, tag, chunk in chked:
                    out.write('%s\t%s\n' % (tag, chunk))
                out.write('\n')

                has_false = False
                record = []
                for idx, (word, tag, chunk) in enumerate(chked_sent):
                    ntotal += 1
                    predicted = chked[idx][2]
                    if chunk == predicted:
                        ncorrect += 1
                        record.append((word, tag, chunk))
                    else:
                        has_false = True
                        if log:
                            record.append((word, tag, chunk, '【' + predicted + '】'))
                            faults_count[chunk + ' is chked as ' + predicted] += 1
                if has_false and log:
                    faults.append(record)
        # 100.0 forces true division; an empty corpus reports 0.0 instead of raising.
        precision = 100.0 * ncorrect / ntotal if ntotal else 0.0
        print('precision: {0} %'.format(precision))
        for key, value in sorted(faults_count.items(), key=lambda item: item[1], reverse=True):
            print('{0} {1}'.format(key, value))
        return faults
class Tagger:
    """Assign a POS tag to every word of a sentence.

    Training sentences are lists of ``(word, tag)`` tuples.  A word whose
    first character is ``*`` gets the ``i is v`` feature.

    Feature keys produced by ``_make_features`` changed when the suffix and
    previous-word bigram features were corrected, so previously trained
    weights have to be re-trained.
    """

    # Suffix tables used by _normalize to collapse open word classes.
    _NUM_SUFFIXES = tuple('一 二 三 四 五 六 七 八 九 十 百 千 万 亿 两'.split(' '))
    _CAL_SUFFIXES = ('年', '月', '日', '年代')
    _LOC_SUFFIXES = ('省', '市', '区', '州', '县', '镇', '乡', '街')

    def __init__(self):
        """Create an untrained tagger."""
        # word -> tag shortcuts that bypass the perceptron
        self.single_tag_words = {}
        self.tag_set = set()
        self.perceptron = Perceptron()

    def train(self, sents, niter):
        """Train the perceptron for ``niter`` passes over ``sents``.

        Words listed in ``single_tag_words`` (rebuilt here) are not used for
        perceptron updates.  One progress line is printed per pass and the
        weights are averaged at the end.
        """
        self.perceptron.reset()
        # Private copy: the per-pass shuffle must not reorder the caller's list.
        tagged_sents = list(sents)
        self._make_stw(tagged_sents)
        self.tag_set = set(tag for sent in tagged_sents for (_w, tag) in sent)
        self.perceptron.tag_set = self.tag_set
        for iteration in range(niter):
            ncorrect = 0
            ntotal = 0
            for sent in tagged_sents:
                sent = [(self._normalize(word), tag) for (word, tag) in sent]
                for idx, (word, tag) in enumerate(sent):
                    pred = self.single_tag_words.get(word)
                    if not pred:
                        features = self._get_features(idx, sent)
                        pred = self.perceptron.predict(features)
                        self.perceptron.update(tag, pred, features)
                    ncorrect += pred == tag
                    ntotal += 1
            random.shuffle(tagged_sents)
            # float() keeps the ratio meaningful under Python 2 integer division.
            precision = float(ncorrect) / ntotal if ntotal else 0.0
            print("iteration #{0}, {1}/{2}=precision: {3}".format(iteration, ncorrect, ntotal, precision))
        self.perceptron.average_weights()

    def _make_stw(self, tagged_sents):
        """Fill ``single_tag_words`` with words whose tag is treated as fixed.

        A word qualifies when its most frequent tag is dominant, is ``NR``,
        or is the only tag seen in at least three occurrences.  ``-`` and
        ``--`` are always mapped to ``PU``.
        """
        counts = defaultdict(lambda: defaultdict(int))
        for sent in tagged_sents:
            for word, tag in sent:
                counts[self._normalize(word)][tag] += 1
        threshold = 0.95
        freqthres = 15
        for word, tag_freqs in counts.items():
            tag, freq = max(tag_freqs.items(), key=lambda item: item[1])
            total = sum(tag_freqs.values())
            # float(): with integer division the ratio test only passed at 100%.
            frequent = freq >= freqthres and float(freq) / total >= threshold
            unanimous = freq == total and total >= 3
            if frequent or tag == 'NR' or unanimous:
                self.single_tag_words[word] = tag
        self.single_tag_words['-'] = 'PU'
        self.single_tag_words['--'] = 'PU'

    def _normalize(self, word):
        """Map number-, date- and place-like words to NUM / CAL / LOCATION."""
        if word.endswith(self._NUM_SUFFIXES):
            return 'NUM'
        if word.endswith(self._CAL_SUFFIXES):
            return 'CAL'
        if word.endswith(self._LOC_SUFFIXES):
            return 'LOCATION'
        return word

    def _make_features(self, current, prev1, prev2, fword1, fword2):
        """Build the feature dict for one token.

        ``current``, ``prev1`` and ``prev2`` are ``(word, tag)`` pairs for the
        token and its two predecessors; ``fword1`` and ``fword2`` are the next
        two words.  The tag in ``current`` is not used.  Returns a
        ``defaultdict(int)`` whose present keys map to 1.
        """
        def add(name, *args):
            features['_'.join((name, ) + tuple(args))] = 1

        word, tag = current
        pword1, ptag1 = prev1
        pword2, ptag2 = prev2
        features = defaultdict(int)
        add('bias')
        add('i is v' if word.startswith('*') else 'i not v')
        add('i suffix', word[-3:])
        add('i-1 suffix', pword1[-3:])
        add('i+1 suffix', fword1[-3:])
        add('i suffix2', word[-6:])
        # Longer suffixes get their own names so they cannot collide with the
        # short ones, and the i+1 one reads fword1 (it used to read fword2).
        add('i-1 suffix2', pword1[-6:])
        add('i+1 suffix2', fword1[-6:])
        add('i-1 tag', ptag1)
        add('i-2 tag', ptag2)
        add('i-1 i-2 tag', ptag1, ptag2)
        add('i-2 word', pword2)
        add('i-1 word', pword1)
        add('i word', word)
        # Previous-word bigram (it used to be built from the following words).
        add('i-2 i-1 word', pword2, pword1)
        add('i+1 word', fword1)
        add('i+2 word', fword2)
        add('i+1 i+2 word', fword1, fword2)
        add('i-1 tag i word', ptag1, word)
        add('i-2 tag i-1 word', ptag2, pword1)
        add('i word-len', str(len(word)))
        return features

    def _get_features(self, i, sent):
        """Collect the context of token ``i`` and delegate to ``_make_features``."""
        n = len(sent)
        pword1, ptag1 = ('START1_WORD', 'START1_TAG') if i <= 0 else sent[i-1]
        pword2, ptag2 = ('START2_WORD', 'START2_TAG') if i <= 1 else sent[i-2]
        word, tag = sent[i]
        fword1, ftag1 = ('END1_WORD', 'END1_TAG') if i >= n-1 else sent[i+1]
        fword2, ftag2 = ('END2_WORD', 'END2_TAG') if i >= n-2 else sent[i+2]
        return self._make_features((word, tag), (pword1, ptag1), (pword2, ptag2), fword1, fword2)

    def tag(self, sent):
        """Tag ``sent`` (a sequence of words) greedily, left to right.

        Returns a list of ``[original_word, tag]`` lists; normalized words are
        only used internally for lookup and features.
        """
        original = [[word, None] for word in sent]
        tagged = [[self._normalize(word), None] for word in sent]
        for idx, (word, tag) in enumerate(tagged):
            pred = self.single_tag_words.get(word)
            if not pred:
                features = self._get_features(idx, tagged)
                pred = self.perceptron.predict(features)
            tagged[idx][1] = pred
            original[idx][1] = pred
        return original

    def tag2(self, sent):
        """Experimental dynamic-programming decoder over ``tag_set``.

        NOTE(review): the START1/START2 sentinels look swapped and the final
        nested loop simply keeps the entry of the last (j, k) pair visited;
        the body is kept exactly as it was.  Confirm before use.
        """
        tagged = [[self._normalize(word), None] for word in sent]
        nword = len(sent)
        ntag = len(self.tag_set)
        pi = [[[[0, None, None] for k in range(ntag)] for j in range(ntag)] for i in range(nword)]
        for i, (word, tag) in enumerate(tagged):
            pword1 = 'START1_WORD' if i <= 0 else tagged[i-1][0]
            pword2 = 'START2_WORD' if i <= 1 else tagged[i-2][0]
            fword1 = 'END1_WORD' if i >= len(sent)-1 else tagged[i+1][0]
            fword2 = 'END2_WORD' if i >= len(sent)-2 else tagged[i+2][0]
            for j, u in enumerate(self.tag_set):
                ptag2 = 'START1_TAG' if i <= 0 else u
                for k, v in enumerate(self.tag_set):
                    ptag1 = 'START2_TAG' if i <= 1 else v
                    for t, tag in enumerate(self.tag_set):
                        score = 0 if i <= 0 else pi[i-1][t][j][0]
                        score += self.perceptron.get_score(
                            self._make_features((word, tag), (pword1, ptag1), (pword2, ptag2), fword1, fword2),
                            tag)
                        if score > pi[i][j][k][0]:
                            pi[i][j][k][0] = score
                            pi[i][j][k][1] = tag
                            pi[i][j][k][2] = t
        i = len(tagged)-1
        t, j = None, None
        for j, u in enumerate(self.tag_set):
            for k, v in enumerate(self.tag_set):
                tag, t = pi[i][j][k][1:3]
        tagged[i][1] = tag
        i -= 1
        while i >= 0:
            tagged[i][1] = pi[i][t][j][1]
            j = t
            t = pi[i][t][j][2]
            i -= 1
        printc(tagged)
        return tagged

    @staticmethod
    def _tags_match(gold, pred):
        """Lenient comparison: N* ~ N*, V* ~ V*, and DEC ~ DEG count as equal."""
        return (
            gold == pred
            or (gold[0] == 'N' and pred[0] == 'N')
            or (gold == 'DEC' and pred == 'DEG')
            or (gold == 'DEG' and pred == 'DEC')
            or (gold[0] == 'V' and pred[0] == 'V')
        )

    def evaluate(self, tagged_sents, log=False, out_path='test.pos'):
        """Tag ``tagged_sents`` and write the predictions to ``out_path``.

        Each sentence is written as tab-separated ``word, tag`` lines (leading
        ``*`` removed) followed by a blank line.  When ``log`` is true,
        precision under the lenient match and the confusion counts are
        printed, and one record per sentence containing a mismatch is
        returned; otherwise the returned list is empty.
        """
        ntotal = 0
        ncorrect = 0
        faults = []
        faults_count = defaultdict(int)
        # One pass only: the corpus used to be tagged twice, the first time
        # just to fill a dictionary that was never read.
        with open(out_path, 'w') as out:
            for tagged_sent in tagged_sents:
                sent = [word for (word, tag) in tagged_sent]
                tagged = self.tag(sent)
                for (word, tag) in tagged:
                    word = word[1:] if word.startswith('*') else word
                    out.write('%s\t%s\n' % (word, tag))
                out.write('\n')
                if not log:
                    continue

                has_false = False
                record = []
                for idx, (word, tag) in enumerate(tagged_sent):
                    ntotal += 1
                    predicted = tagged[idx][1]
                    if self._tags_match(tag, predicted):
                        ncorrect += 1
                        record.append((word, tag, predicted))
                    else:
                        has_false = True
                        record.append((word, tag, '【' + predicted + '】'))
                        faults_count[tag + ' is tagged as ' + predicted] += 1
                if has_false:
                    faults.append(record)
        if log:
            # 100.0 forces true division; an empty corpus reports 0.0 instead of raising.
            precision = 100.0 * ncorrect / ntotal if ntotal else 0.0
            print('precision: {0} %'.format(precision))
            for key, value in sorted(faults_count.items(), key=lambda item: item[1], reverse=True):
                print('{0} {1}'.format(key, value))
        return faults