class TestViterbi(unittest.TestCase):
    """Checks ``POSTagger.viterbi`` on a tagger trained from a tiny tagged corpus."""

    def setUp(self):
        """Train a tagger on ``self.training`` and keep it in ``self.pos_tagger``."""
        self.training = 'I/PRO want/V to/TO race/V ./. I/PRO like/V cats/N ./.'
        self.sentence = 'I want to race.'
        corpus = StringIO(self.training)
        self.pos_tagger = POSTagger([corpus])
        self.pos_tagger.train()

    def test(self):
        """The decoded tag sequence for ``self.sentence`` matches the expected list."""
        predicted_tags = self.pos_tagger.viterbi(self.sentence)
        expected_tags = ['START', 'PRO', 'V', 'TO', 'V', '.']
        self.assertListEqual(expected_tags, predicted_tags)
class TestViterbi(unittest.TestCase):
    """Viterbi decoding check against a tagger trained on a unicode corpus."""

    def setUp(self):
        """Build and train ``self.pos_tagger`` from the text in ``self.training``."""
        self.training = u'I/PRO want/V to/TO race/V ./. I/PRO like/V cats/N ./.'
        self.sentence = 'I want to race.'
        training_stream = StringIO(self.training)
        self.pos_tagger = POSTagger([training_stream])
        self.pos_tagger.train()

    def test(self):
        """``viterbi(self.sentence)`` returns the expected START-prefixed tags."""
        actual = self.pos_tagger.viterbi(self.sentence)
        wanted = ['START', 'PRO', 'V', 'TO', 'V', '.']
        self.assertListEqual(wanted, actual)
def main():
    """Build an Aho-Corasick automaton from a JSON idiom lexicon and pickle it.

    The lexicon path is taken from ``sys.argv[1]`` when an argument is given,
    otherwise ``./lexicon/sample_lexicon.json`` is used.  Each lexicon entry
    must provide a ``'pattern'`` string and a ``'target'`` value.

    ``MachineAC`` is fed a list of dicts of the form
    ``{"pattern": [("lemma", "put"), ("pos", "NN"), ...], "target": ...}``.
    A word written as ``{xxx}`` in the lexicon becomes a ``("pos", "XXX")``
    element; every other word becomes a ``("lemma", word)`` element.

    Side effects: rewrites each entry's ``'pattern'`` in memory with its
    lemmatised form, prints the patterns and the automaton tables, and writes
    the automaton to ``./findindex/findindex.pkl``.
    """
    file_name = './lexicon/sample_lexicon.json'
    # Guard the length first: indexing sys.argv[1] without an argument raised
    # IndexError, so the default path above could never be used.
    if len(sys.argv) > 1 and sys.argv[1]:
        file_name = sys.argv[1]

    # Load the lexicon (WIX file); the context manager closes the handle.
    with open(file_name, 'r') as json_open:
        json_data = json.load(json_open)

    for idiom in json_data:
        # Run the POS tagger over the idiom pattern to obtain lemmas.
        pos = POSTagger(idiom['pattern'])
        lemma_parts = []
        processed_word_counter = 0
        for token in pos.tokens:
            # Whitespace handling: a gap of exactly one character between the
            # consumed text and this token's start is re-emitted as one space.
            if token.start - processed_word_counter == 1:
                lemma_parts.append(" ")
                processed_word_counter += 1
            lemma_parts.append(token.lemma)
            processed_word_counter += len(token.text)
        idiom['pattern'] = "".join(lemma_parts)
        print(idiom['pattern'])

    idiom_patterns = []
    for idiom in json_data:
        pattern = []
        for each_word in idiom['pattern'].split():
            if each_word.startswith("{") and each_word.endswith("}"):
                pattern.append(("pos", each_word[1:-1].upper()))
            else:
                pattern.append(("lemma", each_word))
        idiom_patterns.append({"pattern": pattern, "target": idiom['target']})

    ac = MachineAC(idiom_patterns)  # build the automaton

    print("goto")
    for each_state in ac.state:
        each_state.print_state()
    print()
    print("failure")
    print(ac.failure)
    print()
    print("output")
    print(ac.output)
    print()

    # Persist the automaton object to an external file.
    pandas.to_pickle(ac, "./findindex/findindex.pkl")
class TestPOSTagger(unittest.TestCase):
    """Checks for POSTagger probabilities and Viterbi decoding.

    NOTE(review): apart from ``setUp`` the method names do not start with
    ``test``, so default unittest discovery will not run them — confirm
    whether a custom loader is used or whether they should be renamed.
    """

    def setUp(self):
        """Train ``self.pos_tagger`` on the small tagged stream."""
        self.stream = io.StringIO("A/B C/D C/D A/D A/B ./.")
        # Hand the stream over directly: wrapping a StringIO in another
        # io.StringIO raises TypeError (initial_value must be str or None).
        self.pos_tagger = POSTagger([self.stream])
        self.pos_tagger.train()

    def it_calculates_probability_of_word_and_tag(self):
        """Word-given-tag probabilities for the training stream."""
        self.assertEqual(self.pos_tagger.word_tag_probability('Z', 'Z'), 0)
        # A with B happens 2 times, B happens twice, therefore 100%
        self.assertEqual(self.pos_tagger.word_tag_probability('A', 'B'), 1)
        # A with D happens 1 time, D happens 3 times, so 1/3
        self.assertAlmostEqual(
            self.pos_tagger.word_tag_probability("A", "D"), 1.0 / 3.0)
        # START with START happens 1 time, START happens 1 time, so 1
        self.assertEqual(
            self.pos_tagger.word_tag_probability("START", "START"), 1)
        self.assertEqual(self.pos_tagger.word_tag_probability(".", "."), 1)

    def it_calculates_probability_of_words_and_tags(self):
        """Sequence probability equals the product of its factors."""
        words = ['START', 'A', 'C', 'A', 'A', '.']
        tags = ['START', 'B', 'D', 'D', 'B', '.']
        tagger = self.pos_tagger

        # Plain products replace the reduce() calls; the first lambda
        # referenced an undefined name ``x`` and raised NameError.
        tag_probabilities = (
            tagger.tag_probability('B', 'D')
            * tagger.tag_probability('D', 'D')
            * tagger.tag_probability('D', 'B')
            * tagger.tag_probability('B', '.')
        )
        word_probabilities = (
            tagger.word_tag_probability("A", "B")  # 1
            * tagger.word_tag_probability("C", "D")
            * tagger.word_tag_probability("A", "D")
            * tagger.word_tag_probability("A", "B")  # 1
        )

        expected = word_probabilities * tag_probabilities
        self.assertAlmostEqual(
            tagger.probability_of_word_tag(words, tags), expected)

    def viterbi(self):
        """Viterbi decoding of a sentence using a tagger trained on ``training``."""
        training = "I/PRO want/V to/TO race/V ./. I/PRO like/V cats/N ./."
        sentence = 'I want to race.'
        # Train on the corpus declared here; it was previously unused and the
        # call went through the undefined name ``pos_tagger``.
        tagger = POSTagger([io.StringIO(training)])
        tagger.train()
        expected = ['START', 'PRO', 'V', 'TO', 'V', '.']
        self.assertEqual(tagger.viterbi(sentence), expected)
def test(self):
    """Run k-fold cross validation over ``self.files`` and print error rates.

    For each of ``TestCrossValidation.FOLDS`` folds, a tagger is built from
    the files outside the fold via ``POSTagger.from_filepaths`` and scored on
    the fold's files: every non-blank line is split into ``word/tag`` tokens,
    the words are decoded with ``viterbi`` and the predicted tags are compared
    pairwise with the gold tags (both sequences start with ``'START'``).

    A running error rate is printed after each validation file and the fold's
    final rate after the last one.  When nothing has been scored yet the rate
    is reported as NaN instead of raising ZeroDivisionError (this happens,
    for instance, when there are fewer files than folds).
    """
    fold_size = int(len(self.files) / TestCrossValidation.FOLDS)

    def error_rate(miss_count, hit_count):
        # Guard against folds in which no token was scored.
        scored = miss_count + hit_count
        return miss_count / float(scored) if scored else float('nan')

    for i in range(TestCrossValidation.FOLDS):
        print("test cross validation for fold %d" % i)

        # A set gives O(1) membership checks for both partitions below.
        validation_indexes = set(range(i * fold_size, (i + 1) * fold_size))
        validation_files = [
            fn for idx, fn in enumerate(self.files)
            if idx in validation_indexes
        ]
        training_files = [
            fn for idx, fn in enumerate(self.files)
            if idx not in validation_indexes
        ]

        pos_tagger = POSTagger.from_filepaths(training_files, True)

        misses = 0
        successes = 0
        for vf in validation_files:
            with open(vf, 'r') as f:
                for l in f:
                    # Skip lines made up of whitespace only.
                    if re.match(r'\A\s+\Z', l):
                        continue
                    words = []
                    parts_of_speech = ['START']
                    for ppp in re.split(r'\s+', l.strip()):
                        z = ppp.split('/')
                        words.append(z[0])
                        parts_of_speech.append(z[1])
                    tag_seq = pos_tagger.viterbi(' '.join(words))
                    for tag1, tag2 in zip(tag_seq, parts_of_speech):
                        if tag1 == tag2:
                            successes += 1
                        else:
                            misses += 1
            print(error_rate(misses, successes))
        print('Error rate was %f' % error_rate(misses, successes))
def test(self):
    """Cross-validate the tagger over ``self.files``, printing error rates.

    The files are cut into ``TestCrossValidation.FOLDS`` equally sized folds
    (integer division; any remainder always stays in the training set).  For
    each fold a tagger is created with ``POSTagger.from_filepaths`` from the
    remaining files, then every non-blank line of the fold's files is split
    into ``word/tag`` tokens, decoded with ``viterbi`` and compared tag by
    tag with the gold sequence, which is prefixed with ``'START'``.

    The running error rate is printed after each validation file and once
    more at the end of the fold.  If no tag has been compared the rate is
    printed as NaN rather than failing with ZeroDivisionError.
    """
    file_count = len(self.files)
    splits = int(file_count / TestCrossValidation.FOLDS)

    for i in range(TestCrossValidation.FOLDS):
        print("test cross validation for fold %d" % i)

        # Set membership keeps the two partitions O(1) per file.
        held_out = set(range(i * splits, (i + 1) * splits))
        validation_files = []
        training_files = []
        for idx, fn in enumerate(self.files):
            if idx in held_out:
                validation_files.append(fn)
            else:
                training_files.append(fn)

        pos_tagger = POSTagger.from_filepaths(training_files, True)

        misses = 0
        successes = 0
        for vf in validation_files:
            with open(vf, 'r') as f:
                for l in f:
                    # Whitespace-only lines carry no tokens.
                    if re.match(r'\A\s+\Z', l):
                        continue
                    words = []
                    parts_of_speech = ['START']
                    for ppp in re.split(r'\s+', l.strip()):
                        z = ppp.split('/')
                        words.append(z[0])
                        parts_of_speech.append(z[1])
                    tag_seq = pos_tagger.viterbi(' '.join(words))
                    for tag1, tag2 in zip(tag_seq, parts_of_speech):
                        if tag1 == tag2:
                            successes += 1
                        else:
                            misses += 1
            total = misses + successes
            # An empty tally would otherwise divide by zero.
            print(misses / float(total) if total else float('nan'))

        total = misses + successes
        final_rate = misses / float(total) if total else float('nan')
        print('Error rate was %f' % final_rate)
if __name__ == '__main__':
    # Script entry point: define the data/model locations, read the three
    # e-mail sets and load the POS and NER taggers from disk.
    EMAILS_TRAIN_PATH = './Data/training/'
    EMAILS_TEST_TAGGED_PATH = './Data/seminar_testdata/test_tagged/'
    EMAILS_TEST_UNTAGGED_PATH = './Data/seminar_testdata/test_untagged/'
    GMB_CORPUS_ROOT = './Data/gmb-2.2.0'
    NER_TAGGER_PATH = './Data/models/ner_tagger.pkl'
    POS_TAGGER_PATH = './Data/models/pos_tagger_dt.pkl'

    # Read emails
    emails_train = read_emails(EMAILS_TRAIN_PATH)
    emails_test_tagged = read_emails(EMAILS_TEST_TAGGED_PATH)
    emails_test_untagged = read_emails(EMAILS_TEST_UNTAGGED_PATH)

    # POSTagger usage. Train it or wait for it to load from disk.
    # The training call is left commented out; the tagger is loaded from
    # POS_TAGGER_PATH instead.
    pos = POSTagger()
    # pos.train_pos_tagger(POS_TAGGER_PATH)
    pos.load_pos_tagger(POS_TAGGER_PATH)

    # NERTagger usage. Train it or wait for it to load from disk.
    # You must have the GMB dataset in the Data folder to be able to train.
    # ner_dataset = read_gmb_ner(GMB_CORPUS_ROOT)
    ner = NERTagger(feature_detector=ner_features)
    # ner.train(itertools.islice(ner_dataset, 50000), path='./Data/models/ner_tagger.pkl', batch_size=500, n_iter=5)
    ner.load_ner_tagger(NER_TAGGER_PATH)
    # accuracy = ner.score(itertools.islice(ner_dataset, 5000))  # 0.970287054168
    # print("NER Tagger Accuracy: {}".format(accuracy))

    # Testing of tagging
    # Tag the untagged testing emails
def match(self, query):
    """Find idioms in ``query`` by running its tokens through this automaton.

    ``query`` is handed to ``POSTagger`` and the resulting ``tokens`` are
    processed one by one.  A transition is looked up with ``self.g`` first on
    ``("lemma", token.lemma)`` and then on ``("pos", token.pos)``; while
    neither exists the state follows ``self.failure``.  Every entry of
    ``self.output`` for the state reached is recorded as a hit.

    Returns a list of dicts sorted by ``'start'``, each with the keys
    ``'start'``, ``'end'``, ``'idiom'`` and ``'attachable'``.  For adjacent
    hits that overlap, ``'attachable'`` is set to ``False`` on the shorter
    one (on the later one when both have the same length).

    Side effect: a token whose ``pos`` begins with ``"N"`` has its ``pos``
    overwritten with just ``"N"``.

    NOTE(review): the nesting below was reconstructed from a copy whose
    indentation was lost — confirm in particular where the
    ``skipped_tokens`` flush sits before relying on it.
    """
    # s = 0
    document_with_pos = POSTagger(query).tokens
    # Work queue of token sequences still to be scanned.
    document_to_be_matched = [document_with_pos]
    # skipped_count = 0
    identified_idioms = []
    while (document_to_be_matched != []):
        # Every queued sequence is scanned starting from state 0.
        s = 0
        skipped_count = 0
        skipped_tokens = []
        matchcing_document = document_to_be_matched.pop(0)
        for i, token in enumerate(matchcing_document):
            # English rendering of the note in the string below: the token's
            # start offset, original text, POS tag and lemma are read as
            # print(token.start, token.text, token.pos, token.lemma).
            """
            tokenの開始位置、元の文字列、品詞タグ、lemma形はそれぞれ次のようにして取り出す
            print(token.start, token.text, token.pos, token.lemma)
            """
            # A "." token resets the automaton to state 0.
            if (token.text == "."):
                s = 0
                continue
            # Tags starting with "N" are collapsed to the single letter "N".
            if (token.pos[0] == "N"):
                token.pos = token.pos[0]
            # Follow failure links until a lemma or POS transition exists.
            while (self.g(s, ("lemma", token.lemma)) is None) and (self.g(
                    s, ("pos", token.pos)) is None):
                if (s == self.failure[s]):  # failure leads back to the same state (skip)
                    skipped_tokens.append(token)
                    skipped_count += 1
                    break
                s = self.failure[s]
            # Skipped tokens are queued as a sequence of their own and the
            # skip bookkeeping is cleared.
            if (skipped_tokens != []):
                document_to_be_matched.append(skipped_tokens)
                skipped_tokens = []
                skipped_count = 0
            if (self.g(s, ("lemma", token.lemma)) is not None):  # take the lemma transition if there is one
                s = self.g(s, ("lemma", token.lemma))
            elif (self.g(s, ("pos", token.pos)) is not None):  # otherwise take the POS transition if there is one
                s = self.g(s, ("pos", token.pos))
            for x in self.output[s]:
                # print (matchcing_document[i - (len(x['pattern']) - 1) - skipped_count].start, token.start + len(token.text), x)
                identified_idioms.append({
                    # Start offset of the token len(pattern) - 1 positions
                    # back, shifted further by skipped_count.
                    "start": matchcing_document[i - (len(x['pattern']) - 1) - skipped_count].start,  # may need fixing once skipping is supported
                    "end": token.start + len(token.text),
                    "idiom": x,
                    "attachable": True
                })
            # skipped_count = 0
    # Tell attachable idioms apart from those that are not.
    identified_idioms = sorted(identified_idioms, key=lambda x: x['start'])  # sort by start position
    for i in range(1, len(identified_idioms)):
        # Neighbours overlap when the previous hit ends after this one starts.
        if (identified_idioms[i - 1]['end'] > identified_idioms[i]['start']):
            if ((identified_idioms[i - 1]['end'] - identified_idioms[i - 1]['start']) >= (identified_idioms[i]['end'] - identified_idioms[i]['start'])):
                identified_idioms[i]['attachable'] = False
            else:
                identified_idioms[i - 1]['attachable'] = False
    return identified_idioms
def setUp(self):
    """Store the tagged corpus and sentence, then train ``self.pos_tagger``."""
    self.training = 'I/PRO want/V to/TO race/V ./. I/PRO like/V cats/N ./.'
    self.sentence = 'I want to race.'
    corpus_stream = StringIO(self.training)
    self.pos_tagger = POSTagger([corpus_stream])
    self.pos_tagger.train()
def setUp(self):
    """Train ``self.pos_tagger`` on the tagged text kept in ``self.stream``."""
    self.stream = 'A/B C/D C/D A/D A/B ./.'
    tagged_text = StringIO(self.stream)
    self.pos_tagger = POSTagger([tagged_text])
    self.pos_tagger.train()
def setUp(self):
    """Wrap the unicode text in ``self.stream`` and train ``self.pos_tagger`` on it."""
    self.stream = u'A/B C/D C/D A/D A/B ./.'
    source = StringIO(self.stream)
    self.pos_tagger = POSTagger([source])
    self.pos_tagger.train()
def _unique_nonempty(names):
    """Return ``names`` without empty strings or repeats, keeping first-seen order."""
    seen = set()
    kept = []
    for name in names:
        if name and name not in seen:
            seen.add(name)
            kept.append(name)
    return kept


def tag_speaker(email, pos_tagger_path, ner_tagger_path):
    """Wrap speaker names of ``email`` in ``<speaker>`` tags.

    Known speakers are read from ``./Data/speakers.txt`` (one per line).

    1. If the header has a ``Who`` line and a known speaker occurs in it,
       every occurrence of that speaker in header and body is tagged.
    2. Otherwise, if the ``Who`` line has non-empty text after its first
       colon, that text is taken as the speaker: every ``Who`` line is
       replaced by ``Who: <speaker>…</speaker>`` and the body is tagged.
    3. Otherwise the body is POS-tagged and chunked with the taggers loaded
       from ``pos_tagger_path`` / ``ner_tagger_path``; chunks labelled
       ``per`` or ``org`` whose text is a known speaker are tagged in the
       body and the header is returned unchanged.

    Names are always substituted literally.  They used to be passed to
    ``re.sub`` as patterns, so a name containing regex metacharacters either
    matched the wrong text or raised an error that a bare ``except`` hid.
    Blank and duplicate names are ignored, since they produced tags at every
    position or nested tags.

    Args:
        email: object exposing ``header``, ``body`` and ``fileid``.
        pos_tagger_path: path given to ``POSTagger.load_pos_tagger``.
        ner_tagger_path: path given to ``NERTagger.load_ner_tagger``.

    Returns:
        A new ``Email(header, body, email.fileid)``.
    """
    with open('./Data/speakers.txt', 'r') as speakers_file:
        speakers = _unique_nonempty(raw.strip() for raw in speakers_file)

    header = email.header
    body = email.body
    reg_line = r'Who.*\n'
    reg_loc = r':.*'

    # Tag header
    who_lines = re.findall(reg_line, header)
    if who_lines:
        who_line = who_lines[0]
        found = False
        for s in speakers:
            if s in who_line:
                ns = '<speaker>' + s + '</speaker>'
                header = header.replace(s, ns)
                body = body.replace(s, ns)
                found = True
        if found:
            return Email(header, body, email.fileid)

        # No known speaker on the line: use the text after the first colon.
        after_colon = re.findall(reg_loc, who_line)
        speaker = after_colon[0][1:].strip() if after_colon else ''
        if speaker:
            n_speaker = '<speaker>' + speaker + '</speaker>'
            np_speaker = 'Who: ' + n_speaker + '\n'
            # A callable replacement keeps backslashes in the name literal.
            header = re.sub(reg_line, lambda _match: np_speaker, header)
            body = body.replace(speaker, n_speaker)
            return Email(header, body, email.fileid)

    # Use NER Tagger for tagging
    pos = POSTagger()
    pos.load_pos_tagger(pos_tagger_path)
    chunker = NERTagger(ner_features)
    chunker.load_ner_tagger(ner_tagger_path)

    tagged_sentences = [
        list(pos.predict(word_tokenize(sentence)))
        for sentence in sent_tokenize(email.body)
    ]
    parsed_sentences = [chunker.parse(tagged) for tagged in tagged_sentences]

    candidates = []
    for chunk in parsed_sentences:
        for c in chunk:
            # Only labelled subtrees can be person/organisation chunks.
            if hasattr(c, 'label') and c.label() in ('per', 'org'):
                candidates.append(' '.join(v[0] for v in c).strip())

    known_speakers = set(speakers)
    body = email.body
    for s in _unique_nonempty(candidates):
        if s in known_speakers:
            body = body.replace(s, '<speaker>' + s + '</speaker>')
    return Email(header, body, email.fileid)
def __init__(self, tagged_dataset):
    """Build the grammar tree and the tagged-word tree from ``tagged_dataset``.

    The pair returned by ``__get_grammar_tree`` is unpacked into ``_root``
    and ``_grammar_tree``; ``_word_tree`` comes from
    ``POSTagger.get_tagged_word_tree``.
    """
    root_and_tree = self.__get_grammar_tree(tagged_dataset)
    self._root, self._grammar_tree = root_and_tree
    word_tree = POSTagger.get_tagged_word_tree(tagged_dataset)
    self._word_tree = word_tree
def setUp(self):
    """Train ``self.pos_tagger`` on the tagged text held in ``self.stream``."""
    self.stream = io.StringIO("A/B C/D C/D A/D A/B ./.")
    # Pass the existing stream straight through: io.StringIO only accepts a
    # str (or None) as its initial value, so wrapping a StringIO in another
    # io.StringIO raised TypeError before the tagger was ever built.
    self.pos_tagger = POSTagger([self.stream])
    self.pos_tagger.train()
def setUp(self):
    """Keep the unicode corpus and the sentence, then train ``self.pos_tagger``."""
    self.training = u'I/PRO want/V to/TO race/V ./. I/PRO like/V cats/N ./.'
    self.sentence = 'I want to race.'
    training_io = StringIO(self.training)
    self.pos_tagger = POSTagger([training_io])
    self.pos_tagger.train()
class TestProbabilityCalculation(unittest.TestCase):
    """Probability checks for a POSTagger trained on ``A/B C/D C/D A/D A/B ./.``."""

    def setUp(self):
        """Train ``self.pos_tagger`` on the text stored in ``self.stream``."""
        self.stream = 'A/B C/D C/D A/D A/B ./.'
        tagged_text = StringIO(self.stream)
        self.pos_tagger = POSTagger([tagged_text])
        self.pos_tagger.train()

    def test1(self):
        """Tag transition probabilities."""
        tagger = self.pos_tagger
        self.assertAlmostEqual(0, tagger.tag_probability('Z', 'Z'))
        self.assertAlmostEqual(2.0 / 3, tagger.tag_probability('D', 'D'))
        self.assertAlmostEqual(1, tagger.tag_probability('START', 'B'))
        self.assertAlmostEqual(0.5, tagger.tag_probability('B', 'D'))
        self.assertAlmostEqual(0, tagger.tag_probability('.', 'D'))

    def test2(self):
        """Probability of a whole sequence of words and tags."""
        tagger = self.pos_tagger
        words = ['START', 'A', 'C', 'A', 'A', '.']
        tags = ['START', 'B', 'D', 'D', 'B', '.']

        tag_probabilities = (
            tagger.tag_probability('B', 'D')
            * tagger.tag_probability('D', 'D')
            * tagger.tag_probability('D', 'B')
            * tagger.tag_probability('B', '.')
        )
        word_probabilities = (
            tagger.word_tag_probability('A', 'B')
            * tagger.word_tag_probability('C', 'D')
            * tagger.word_tag_probability('A', 'D')
            * tagger.word_tag_probability('A', 'B')
        )

        expected = word_probabilities * tag_probabilities
        result = tagger.probability_of_word_tag(words, tags)
        self.assertAlmostEqual(expected, result)

    def test3(self):
        """Probability of a word given a tag."""
        tagger = self.pos_tagger
        self.assertAlmostEqual(0, tagger.word_tag_probability('Z', 'Z'))
        self.assertAlmostEqual(1, tagger.word_tag_probability('A', 'B'))
        self.assertAlmostEqual(1.0 / 3, tagger.word_tag_probability('A', 'D'))
        self.assertAlmostEqual(1, tagger.word_tag_probability('.', '.'))
def __init__(self, haikus_file='haikus.json'):
    """Parse ``haikus_file`` and build the structures derived from it.

    Sets ``_dataset``, ``_tagged_dataset``, forward and reversed bigrams
    (``_bigrams`` / ``_reversed_bigrams``) and ``_grammar_tree``.
    """
    dataset = HaikuGenerator.__parse_dataset(haikus_file)
    self._dataset = dataset

    tagged = POSTagger.get_pos_tagged_dataset(dataset)
    self._tagged_dataset = tagged

    self._bigrams = Bigrams(dataset)
    self._reversed_bigrams = Bigrams(dataset, reverse=True)
    self._grammar_tree = GrammarTree(tagged)
class TestProbabilityCalculation(unittest.TestCase):
    """Probability checks for a POSTagger trained on a small unicode stream."""

    def setUp(self):
        """Train ``self.pos_tagger`` on the unicode text stored in ``self.stream``."""
        self.stream = u'A/B C/D C/D A/D A/B ./.'
        stream_io = StringIO(self.stream)
        self.pos_tagger = POSTagger([stream_io])
        self.pos_tagger.train()

    def test1(self):
        """Transition probability between pairs of tags."""
        model = self.pos_tagger
        self.assertAlmostEqual(0, model.tag_probability('Z', 'Z'))
        self.assertAlmostEqual(2.0 / 3, model.tag_probability('D', 'D'))
        self.assertAlmostEqual(1, model.tag_probability('START', 'B'))
        self.assertAlmostEqual(0.5, model.tag_probability('B', 'D'))
        self.assertAlmostEqual(0, model.tag_probability('.', 'D'))

    def test2(self):
        """Joint probability of a word sequence and its tag sequence."""
        model = self.pos_tagger
        words = ['START', 'A', 'C', 'A', 'A', '.']
        tags = ['START', 'B', 'D', 'D', 'B', '.']

        tag_probabilities = (
            model.tag_probability('B', 'D')
            * model.tag_probability('D', 'D')
            * model.tag_probability('D', 'B')
            * model.tag_probability('B', '.')
        )
        word_probabilities = (
            model.word_tag_probability('A', 'B')
            * model.word_tag_probability('C', 'D')
            * model.word_tag_probability('A', 'D')
            * model.word_tag_probability('A', 'B')
        )

        expected = word_probabilities * tag_probabilities
        result = model.probability_of_word_tag(words, tags)
        self.assertAlmostEqual(expected, result)

    def test3(self):
        """Emission probability of a word for a given tag."""
        model = self.pos_tagger
        self.assertAlmostEqual(0, model.word_tag_probability('Z', 'Z'))
        self.assertAlmostEqual(1, model.word_tag_probability('A', 'B'))
        self.assertAlmostEqual(1.0 / 3, model.word_tag_probability('A', 'D'))
        self.assertAlmostEqual(1, model.word_tag_probability('.', '.'))