Example #1
class TestViterbi(unittest.TestCase):
    def setUp(self):
        self.training = 'I/PRO want/V to/TO race/V ./. I/PRO like/V cats/N ./.'
        self.sentence = 'I want to race.'
        self.pos_tagger = POSTagger([StringIO(self.training)])
        self.pos_tagger.train()

    def test(self):
        """will calculate the best viterbi sequence for I want to race"""
        expectation = ['START', 'PRO', 'V', 'TO', 'V', '.']
        result = self.pos_tagger.viterbi(self.sentence)
        self.assertListEqual(expectation, result)
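Note: these test snippets assume that unittest, StringIO and a POSTagger implementation are importable in the surrounding module. A minimal harness might look like the sketch below; the pos_tagger module name is an assumption, not part of the original examples.

import unittest
from io import StringIO

from pos_tagger import POSTagger  # assumed module name

if __name__ == '__main__':
    unittest.main()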
Example #2
class TestViterbi(unittest.TestCase):
  def setUp(self):
    self.training = u'I/PRO want/V to/TO race/V ./. I/PRO like/V cats/N ./.'
    self.sentence = 'I want to race.'
    self.pos_tagger = POSTagger([StringIO(self.training)])
    self.pos_tagger.train()

  def test(self):
    """will calculate the best viterbi sequence for I want to race"""
    expectation = ['START', 'PRO', 'V', 'TO', 'V', '.']
    result = self.pos_tagger.viterbi(self.sentence)
    self.assertListEqual(expectation, result)
Example #3
def main():
    """
    Build an Aho-Corasick (AC) automaton from a JSON-format dictionary and save it.

    Dictionary construction: the idiom data MachineAC needs at build time is
    an array of the following form:
    idiom_patterns = [["put", "on"], ["on", "time"], ["on", "earth"]]
    """
    file_name = './lexicon/sample_lexicon.json'

    if len(sys.argv) > 1:
        file_name = sys.argv[1]

    idiom_patterns = []
    with open(file_name, 'r') as json_open:  # load the dictionary (WIX file)
        json_data = json.load(json_open)

    for idiom in json_data:
        pos = POSTagger(idiom['pattern'])  # POS-tag each idiom pattern
        idiom['pattern'] = ""
        processed_word_counter = 0
        for token in pos.tokens:
            # reinsert the whitespace between tokens
            if token.start - processed_word_counter == 1:
                idiom['pattern'] += " "
                processed_word_counter += 1

            idiom['pattern'] += token.lemma
            processed_word_counter += len(token.text)
        print(idiom['pattern'])

    for idiom in json_data:
        pattern = []
        for each_word in idiom['pattern'].split():
            if each_word[0] == "{" and each_word[-1] == "}":
                pattern.append(("pos", each_word[1:-1].upper()))
            else:
                pattern.append(("lemma", each_word))

        idiom_patterns.append({"pattern": pattern, "target": idiom['target']})

    ac = MachineAC(idiom_patterns)  # build the automaton

    print("goto")
    for each_state in ac.state:
        each_state.print_state()
    print()

    print("failure")
    print(ac.failure)
    print()

    print("output")
    print(ac.output)
    print()

    pandas.to_pickle(ac, "./findindex/findindex.pkl")  # save the AC object to an external file
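Judging from how the loop above consumes each entry, json.load is expected to return a list of objects with a 'pattern' string and a 'target' field, where brace-wrapped words such as {noun} become POS placeholders. A hypothetical shape of the loaded data, with illustrative values only:

json_data = [
    {"pattern": "put on", "target": "idiom-1"},     # plain lemma pattern
    {"pattern": "on {noun}", "target": "idiom-2"},  # {noun} is turned into a ("pos", "NOUN") element
]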
Example #4
class TestPOSTagger(unittest.TestCase):
    def setUp(self):
        self.stream = io.StringIO("A/B C/D C/D A/D A/B ./.")
        self.pos_tagger = POSTagger([self.stream])
        self.pos_tagger.train()

    def it_calculates_probability_of_word_and_tag(self):
        self.assertEqual(self.pos_tagger.word_tag_probability('Z', 'Z'), 0)

        # A/B occurs 2 times and tag B occurs 2 times, therefore the probability is 1
        self.assertEqual(self.pos_tagger.word_tag_probability('A', 'B'), 1)

        # A/D occurs 1 time and tag D occurs 3 times, so 1/3
        self.assertEqual(self.pos_tagger.word_tag_probability("A", "D"),
                         1.0 / 3.0)

        # START/START occurs 1 time and START occurs 1 time, so 1
        self.assertEqual(
            self.pos_tagger.word_tag_probability("START", "START"), 1)

        self.assertEqual(self.pos_tagger.word_tag_probability(".", "."), 1)

    def it_calculates_probability_of_words_and_tags(self):
        words = ['START', 'A', 'C', 'A', 'A', '.']
        tags = ['START', 'B', 'D', 'D', 'B', '.']
        tagger = self.pos_tagger

        # note: on Python 3, reduce must be imported from functools
        tag_probabilities = reduce((lambda x, y: x * y), [
            tagger.tag_probability('B', 'D'),
            tagger.tag_probability('D', 'D'),
            tagger.tag_probability('D', 'B'),
            tagger.tag_probability('B', '.')
        ])

        word_probabilities = reduce(
            (lambda x, y: x * y),
            [
                tagger.word_tag_probability("A", "B"),  # 1
                tagger.word_tag_probability("C", "D"),
                tagger.word_tag_probability("A", "D"),
                tagger.word_tag_probability("A", "B"),  # 1
            ])

        expected = word_probabilities * tag_probabilities

        self.assertEqual(tagger.probability_of_word_tag(words, tags), expected)

    def viterbi(self):
        training = "I/PRO want/V to/TO race/V ./. I/PRO like/V cats/N ./."
        sentence = 'I want to race.'
        tagger = POSTagger([io.StringIO(training)])
        tagger.train()
        expected = ['START', 'PRO', 'V', 'TO', 'V', '.']
        self.assertEqual(tagger.viterbi(sentence), expected)
Example #5
    def test(self):
        for i in range(TestCrossValidation.FOLDS):
            print("test cross validation for fold %d" % i)
            splits = int(len(self.files) / TestCrossValidation.FOLDS)
            validation_indexes = range(i * splits, (i + 1) * splits)

            training_indexes = list(
                set(range(len(self.files))).difference(validation_indexes))
            validation_files = [
                fn for idx, fn in enumerate(self.files)
                if idx in validation_indexes
            ]
            training_files = [
                fn for idx, fn in enumerate(self.files)
                if idx in training_indexes
            ]

            pos_tagger = POSTagger.from_filepaths(training_files, True)

            misses = 0
            successes = 0

            for vf in validation_files:
                with open(vf, 'r') as f:
                    for l in f:
                        if re.match(r'\A\s+\Z', l):
                            continue
                        words = []
                        parts_of_speech = ['START']
                        for ppp in re.split(r'\s+', l.strip()):
                            z = ppp.split('/')
                            words.append(z[0])
                            parts_of_speech.append(z[1])

                        tag_seq = pos_tagger.viterbi(' '.join(words))
                        for tag1, tag2 in zip(tag_seq, parts_of_speech):
                            if tag1 == tag2:
                                successes += 1
                            else:
                                misses += 1
                print(misses / float(misses + successes))
            print('Error rate was %f' % (misses / float(misses + successes)))
Example #6
  def test(self):
    for i in range(TestCrossValidation.FOLDS):
      print("test cross validation for fold %d" % i)
      splits = int(len(self.files) / TestCrossValidation.FOLDS)
      validation_indexes = range(i * splits, (i + 1) * splits)

      training_indexes = list(set(range(len(self.files))).difference(validation_indexes))
      validation_files = [fn for idx, fn in enumerate(self.files)
                          if idx in validation_indexes]
      training_files = [fn for idx, fn in enumerate(self.files)
                        if idx in training_indexes]

      pos_tagger = POSTagger.from_filepaths(training_files, True)

      misses = 0
      successes = 0

      for vf in validation_files:
        with open(vf, 'r') as f:
          for l in f:
            if re.match(r'\A\s+\Z', l):
              continue
            words = []
            parts_of_speech = ['START']
            for ppp in re.split(r'\s+', l.strip()):
              z = ppp.split('/')
              words.append(z[0])
              parts_of_speech.append(z[1])

            tag_seq = pos_tagger.viterbi(' '.join(words))
            for tag1, tag2 in zip(tag_seq, parts_of_speech):
              if tag1 == tag2:
                successes += 1
              else:
                misses += 1
        print(misses / float(misses + successes))
      print('Error rate was %f' % (misses / float(misses + successes)))
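Both cross-validation variants above rely on a setUp that is not shown. Judging from the usage, it has to define TestCrossValidation.FOLDS and a list of tagged corpus files in self.files, where each line holds whitespace-separated word/TAG pairs. A minimal sketch under those assumptions (the fold count and glob pattern are placeholders):

import glob
import unittest

class TestCrossValidation(unittest.TestCase):
    FOLDS = 10  # assumed number of folds

    def setUp(self):
        # assumed location of the tagged corpus files (word/TAG pairs on each line)
        self.files = sorted(glob.glob('./data/corpus/*.txt'))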
Example #7
if __name__ == '__main__':
    EMAILS_TRAIN_PATH = './Data/training/'
    EMAILS_TEST_TAGGED_PATH = './Data/seminar_testdata/test_tagged/'
    EMAILS_TEST_UNTAGGED_PATH = './Data/seminar_testdata/test_untagged/'
    GMB_CORPUS_ROOT = './Data/gmb-2.2.0'
    NER_TAGGER_PATH = './Data/models/ner_tagger.pkl'
    POS_TAGGER_PATH = './Data/models/pos_tagger_dt.pkl'

    # Read emails
    emails_train = read_emails(EMAILS_TRAIN_PATH)
    emails_test_tagged = read_emails(EMAILS_TEST_TAGGED_PATH)
    emails_test_untagged = read_emails(EMAILS_TEST_UNTAGGED_PATH)

    # POSTagger usage. Train it or wait for it to load from disk
    pos = POSTagger()
    # pos.train_pos_tagger(POS_TAGGER_PATH)
    pos.load_pos_tagger(POS_TAGGER_PATH)

    # NERTagger usage. Train it or wait for it to load from disk
    # You must have the GMB dataset in the Data folder to be able to train
    # ner_dataset = read_gmb_ner(GMB_CORPUS_ROOT)
    ner = NERTagger(feature_detector=ner_features)
    # ner.train(itertools.islice(ner_dataset, 50000), path='./Data/models/ner_tagger.pkl', batch_size=500, n_iter=5)
    ner.load_ner_tagger(NER_TAGGER_PATH)
    # accuracy = ner.score(itertools.islice(ner_dataset, 5000))  # 0.970287054168
    # print("NER Tagger Accuracy: {}".format(accuracy))

    # Testing of tagging

    # Tag the untagged testing emails
Example #8
    def match(self, query):
        # s = 0
        document_with_pos = POSTagger(query).tokens
        document_to_be_matched = [document_with_pos]
        # skipped_count = 0
        identified_idioms = []
        while document_to_be_matched != []:
            s = 0
            skipped_count = 0
            skipped_tokens = []
            matching_document = document_to_be_matched.pop(0)

            for i, token in enumerate(matching_document):
                """
                A token's start position, original text, POS tag and lemma
                can be retrieved like this:
                print(token.start, token.text, token.pos, token.lemma)
                """
                if token.text == ".":
                    s = 0
                    continue

                if token.pos[0] == "N":
                    token.pos = token.pos[0]
                while (self.g(s, ("lemma", token.lemma)) is None
                       and self.g(s, ("pos", token.pos)) is None):
                    if s == self.failure[s]:  # looped back to the same state, so skip this token
                        skipped_tokens.append(token)
                        skipped_count += 1
                        break
                    s = self.failure[s]
                    if skipped_tokens != []:
                        document_to_be_matched.append(skipped_tokens)
                    skipped_tokens = []
                    skipped_count = 0

                if self.g(s, ("lemma", token.lemma)) is not None:
                    # transition on the lemma if possible
                    s = self.g(s, ("lemma", token.lemma))
                elif self.g(s, ("pos", token.pos)) is not None:
                    # otherwise transition on the POS tag if possible
                    s = self.g(s, ("pos", token.pos))

                for x in self.output[s]:
                    # print(matching_document[i - (len(x['pattern']) - 1) - skipped_count].start, token.start + len(token.text), x)
                    identified_idioms.append({
                        "start": matching_document[i - (len(x['pattern']) - 1) - skipped_count].start,
                        # this may need fixing once tokens can be skipped
                        "end": token.start + len(token.text),
                        "idiom": x,
                        "attachable": True,
                    })
                    # skipped_count = 0

        # decide which of the identified idioms are attachable and which are not
        identified_idioms = sorted(identified_idioms, key=lambda x: x['start'])  # sort by start position
        for i in range(1, len(identified_idioms)):
            if identified_idioms[i - 1]['end'] > identified_idioms[i]['start']:
                # when two idioms overlap, the longer one stays attachable (ties favor the earlier one)
                if (identified_idioms[i - 1]['end'] - identified_idioms[i - 1]['start']
                        >= identified_idioms[i]['end'] - identified_idioms[i]['start']):
                    identified_idioms[i]['attachable'] = False
                else:
                    identified_idioms[i - 1]['attachable'] = False
        return identified_idioms
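A minimal usage sketch for match, assuming the automaton was built and pickled as in Example #3 (the query string is illustrative):

import pandas

ac = pandas.read_pickle('./findindex/findindex.pkl')  # load the saved MachineAC object
for hit in ac.match('She put on her coat just on time.'):
    print(hit['start'], hit['end'], hit['idiom']['target'], hit['attachable'])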
Example #9
 def setUp(self):
     self.training = 'I/PRO want/V to/TO race/V ./. I/PRO like/V cats/N ./.'
     self.sentence = 'I want to race.'
     self.pos_tagger = POSTagger([StringIO(self.training)])
     self.pos_tagger.train()
Example #10
 def setUp(self):
     self.stream = 'A/B C/D C/D A/D A/B ./.'
     self.pos_tagger = POSTagger([StringIO(self.stream)])
     self.pos_tagger.train()
Example #11
 def setUp(self):
   self.stream = u'A/B C/D C/D A/D A/B ./.'
   self.pos_tagger = POSTagger([StringIO(self.stream)])
   self.pos_tagger.train()
Example #12
def tag_speaker(email, pos_tagger_path, ner_tagger_path):
    with open('./Data/speakers.txt', 'r') as f:
        speakers = [x.strip() for x in f.readlines()]

    header = email.header
    body = email.body

    reg_line = r'Who.*\n'
    reg_loc = r':.*'

    # Tag header
    found = False

    try:

        line = re.findall(reg_line, header)

        if len(line) > 0:
            line = line[0]
            for s in speakers:
                if s in line:
                    ns = '<speaker>' + s + '</speaker>'
                    header = re.sub(s, ns, header)
                    body = re.sub(s, ns, body)
                    found = True

        if not found:
            # Get line which contains time info
            line = re.findall(reg_line, header)[0]

            # drop the leading colon, hence the [1:]
            speaker = re.findall(reg_loc, line)[0][1:].strip()
            n_speaker = '<speaker>' + speaker + '</speaker>'
            np_speaker = 'Who: <speaker>' + speaker + '</speaker>\n'

            header = re.sub(reg_line, np_speaker, header)
            body = re.sub(speaker, n_speaker, body)

        return Email(header, body, email.fileid)
    except Exception:
        # if the regex-based approach fails, fall through to the NER tagger below
        pass

    # Use NER Tagger for tagging
    pos = POSTagger()
    pos.load_pos_tagger(pos_tagger_path)

    chunker = NERTagger(ner_features)
    chunker.load_ner_tagger(ner_tagger_path)

    body = email.body
    body = [list(pos.predict(word_tokenize(s))) for s in sent_tokenize(body)]
    body = [chunker.parse(s) for s in body]

    speakers_n = []

    for chunk in body:
        for c in chunk:
            if hasattr(c, 'label') and c.label() in ('per', 'org'):
                # join the chunk's tokens into a candidate speaker string
                s = ' '.join(v[0] for v in c)
                speakers_n.append(s)

    body = email.body
    for s in speakers_n:
        if s in speakers:
            ns = '<speaker>' + s + '</speaker>'
            body = re.sub(s, ns, body)

    return Email(header, body, email.fileid)
Example #13
 def __init__(self, tagged_dataset):
     self._root, self._grammar_tree = self.__get_grammar_tree(tagged_dataset)
     self._word_tree = POSTagger.get_tagged_word_tree(tagged_dataset)
Example #14
 def setUp(self):
     self.stream = io.StringIO("A/B C/D C/D A/D A/B ./.")
     self.pos_tagger = POSTagger([self.stream])
     self.pos_tagger.train()
Example #15
 def setUp(self):
   self.training = u'I/PRO want/V to/TO race/V ./. I/PRO like/V cats/N ./.'
   self.sentence = 'I want to race.'
   self.pos_tagger = POSTagger([StringIO(self.training)])
   self.pos_tagger.train()
Example #16
class TestProbabilityCalculation(unittest.TestCase):
    def setUp(self):
        self.stream = 'A/B C/D C/D A/D A/B ./.'
        self.pos_tagger = POSTagger([StringIO(self.stream)])
        self.pos_tagger.train()

    def test1(self):
        """calculates tag transition probabilities"""
        self.assertAlmostEqual(0, self.pos_tagger.tag_probability('Z', 'Z'))
        self.assertAlmostEqual(2.0 / 3, self.pos_tagger.tag_probability('D', 'D'))
        self.assertAlmostEqual(1, self.pos_tagger.tag_probability('START', 'B'))
        self.assertAlmostEqual(0.5, self.pos_tagger.tag_probability('B', 'D'))
        self.assertAlmostEqual(0, self.pos_tagger.tag_probability('.', 'D'))

    def test2(self):
        """calculates probability of sequence of words and tags"""
        words = ['START', 'A', 'C', 'A', 'A', '.']
        tags = ['START', 'B', 'D', 'D', 'B', '.']
        tag_probabilities = self.pos_tagger.tag_probability('B', 'D') * \
                            self.pos_tagger.tag_probability('D', 'D') * \
                            self.pos_tagger.tag_probability('D', 'B') * \
                            self.pos_tagger.tag_probability('B', '.')
        word_probabilities = self.pos_tagger.word_tag_probability('A', 'B') * \
                             self.pos_tagger.word_tag_probability('C', 'D') * \
                             self.pos_tagger.word_tag_probability('A', 'D') * \
                             self.pos_tagger.word_tag_probability('A', 'B')
        expected = word_probabilities * tag_probabilities
        result = self.pos_tagger.probability_of_word_tag(words, tags)
        self.assertAlmostEqual(expected, result)

    def test3(self):
        """calculates the probability of a word given a tag"""
        self.assertAlmostEqual(0, self.pos_tagger.word_tag_probability('Z', 'Z'))
        self.assertAlmostEqual(1, self.pos_tagger.word_tag_probability('A', 'B'))
        self.assertAlmostEqual(1.0 / 3, self.pos_tagger.word_tag_probability('A', 'D'))
        self.assertAlmostEqual(1, self.pos_tagger.word_tag_probability('.', '.'))
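For reference, the training stream 'A/B C/D C/D A/D A/B ./.' corresponds to the tag sequence START B D D D B ., so the transition counts behind these assertions are START→B 1 of 1, B→D 1 of 2, D→D 2 of 3, D→B 1 of 3 and B→. 1 of 2, while the word 'A' accounts for both occurrences of tag B (probability 1) and for one of the three occurrences of tag D (1/3).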
Example #17
 def __init__(self, haikus_file='haikus.json'):
     self._dataset = HaikuGenerator.__parse_dataset(haikus_file)
     self._tagged_dataset = POSTagger.get_pos_tagged_dataset(self._dataset)
     self._bigrams = Bigrams(self._dataset)
     self._reversed_bigrams = Bigrams(self._dataset, reverse=True)
     self._grammar_tree = GrammarTree(self._tagged_dataset)
Example #18
class TestProbabilityCalculation(unittest.TestCase):
  def setUp(self):
    self.stream = u'A/B C/D C/D A/D A/B ./.'
    self.pos_tagger = POSTagger([StringIO(self.stream)])
    self.pos_tagger.train()

  def test1(self):
    """calculates tag transition probabilities"""
    self.assertAlmostEqual(0, self.pos_tagger.tag_probability('Z', 'Z'))
    self.assertAlmostEqual(2.0 / 3, self.pos_tagger.tag_probability('D', 'D'))
    self.assertAlmostEqual(1, self.pos_tagger.tag_probability('START', 'B'))
    self.assertAlmostEqual(0.5, self.pos_tagger.tag_probability('B', 'D'))
    self.assertAlmostEqual(0, self.pos_tagger.tag_probability('.', 'D'))

  def test2(self):
    """calculates probability of sequence of words and tags"""
    words = ['START', 'A', 'C', 'A', 'A', '.']
    tags = ['START', 'B', 'D', 'D', 'B', '.']
    tag_probabilities = self.pos_tagger.tag_probability('B', 'D') * \
                        self.pos_tagger.tag_probability('D', 'D') * \
                        self.pos_tagger.tag_probability('D', 'B') * \
                        self.pos_tagger.tag_probability('B', '.')
    word_probabilities = self.pos_tagger.word_tag_probability('A', 'B') * \
                         self.pos_tagger.word_tag_probability('C', 'D') * \
                         self.pos_tagger.word_tag_probability('A', 'D') * \
                         self.pos_tagger.word_tag_probability('A', 'B')
    expected = word_probabilities * tag_probabilities
    result = self.pos_tagger.probability_of_word_tag(words, tags)
    self.assertAlmostEqual(expected, result)

  def test3(self):
    """calculates the probability of a word given a tag"""
    self.assertAlmostEqual(0, self.pos_tagger.word_tag_probability('Z', 'Z'))
    self.assertAlmostEqual(1, self.pos_tagger.word_tag_probability('A', 'B'))
    self.assertAlmostEqual(1.0 / 3, self.pos_tagger.word_tag_probability('A', 'D'))
    self.assertAlmostEqual(1, self.pos_tagger.word_tag_probability('.', '.'))