Example #1
 def lesionMorphologicalRules(self, solution):
     """Sometimes we end up learning to put the morphology into the rewrite
     rules, e.g. 0 > k/#_ or something like that. This will take a
     solution and try removing insertion/deletion rules whenever
     possible, keeping the underlying forms constant but being
     willing to modify the morphology.
     """
     rules = list(solution.rules)
     for r in list(solution.rules):
         if isinstance(r.focus, EmptySpecification) and isinstance(r.structuralChange, ConstantPhoneme) and \
            u'#' in unicode(r):
             print "Candidate for lesion",r
             candidateRules = [ r_ for r_ in rules if r_ != r ]
             print "the new rules would be",candidateRules
             Model.Global()
             prefixes = [ Morph.sample() for _ in xrange(self.numberOfInflections) ]
             suffixes = [ Morph.sample() for _ in xrange(self.numberOfInflections) ]
             self.conditionOnData([ r_.makeConstant(self.bank) for r_ in candidateRules ],
                                  [ solution.underlyingForms[x].makeConstant(self.bank)
                                    for x in self.data ],
                                  prefixes, suffixes,
                                  auxiliaryHarness=True)
             #minimize(sum(wordLength(m) for m in prefixes+suffixes ))
             try:
                 output = self.solveSketch()
                 print "Lesioning morphological rule", r
                 solution = Solution(prefixes = [ Morph.parse(self.bank, output, p) for p in prefixes ],
                                     suffixes = [ Morph.parse(self.bank, output, s) for s in suffixes ],
                                     underlyingForms = solution.underlyingForms,
                                     rules = candidateRules)
                 rules = solution.rules
             except SynthesisFailure:
                 print "Turns out that you cannot lesion",r
     return solution
Example #2
def parseSolution(s):
    def removeComment(y):
        if ';' in y: return y[:y.index(';')].strip()
        y = y.strip()
        if len(y) == 0 or y[0] == '#': return ''
        return y

    lines = [removeComment(x) for x in s.split('\n')]
    prefixes = []
    suffixes = []
    rules = []
    for l in lines:
        if 'stem' in l:
            [prefix, suffix] = l.split('stem')
            prefix = prefix.replace('+', '').strip()
            suffix = suffix.replace('+', '').strip()
            prefixes.append(Morph(tokenize(prefix)))
            suffixes.append(Morph(tokenize(suffix)))
        elif len(l) > 0:
            r = parseRule(l)
            if r is None:
                print "Could not parse '%s'" % l
                assert False
            rules.append(r)
    return Solution(rules, prefixes, suffixes)
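
For reference, a minimal standalone sketch of the text format this parser accepts, assuming morphology lines of the form "prefix + stem + suffix", one rewrite rule per remaining line, and ';'/'#' comments. Only the string handling is reproduced; the project's Morph/parseRule/Solution classes are not needed, and the input is made up:

# Hypothetical input in the format parseSolution expects.
example = """# a comment line
un + stem + ing  ; prefix "un-", suffix "-ing"
 + stem + s
a > b / c _ d"""

for raw in example.split('\n'):
    line = raw[:raw.index(';')].strip() if ';' in raw else raw.strip()
    if len(line) == 0 or line[0] == '#':
        continue
    if 'stem' in line:
        prefix, suffix = [part.replace('+', '').strip() for part in line.split('stem')]
        print('prefix=%r suffix=%r' % (prefix, suffix))
    else:
        print('rule=%r' % line)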
Example #3
    def __init__(self):
        self.dictionary = Dictionary()
        self.morph = Morph()

        self.resp_what = responder.WhatResponder(self.dictionary)
        self.resp_random = responder.RandomResponder(self.dictionary)
        self.resp_pattern = responder.PatternResponder(self.dictionary)
        self.resp_template = responder.TemplateResponder(self.dictionary)
        self.responder = self.resp_pattern
Example #4
 def finalizeResult(self, k, result):
     """do a final Hail Mary transduction of underlying forms and then expand the frontier"""
     if len(result.solutionSequence) == 0:
         emptySolution = Solution(prefixes=[Morph(u"")]*self.numberOfInflections,
                                  suffixes=[Morph(u"")]*self.numberOfInflections,
                                  rules=[], underlyingForms={})
         result.recordSolution(emptySolution)
         return result.lastSolutionIsFinal()
     
     setGlobalTimeout(None)
     s = result.solutionSequence[-1][0]
     s = self.finalTransduction(s)
     f = self.expandFrontier(s, k)
     result.recordFinalFrontier(f)
Example #5
    def parse_sentence(cabocha_sent):
        chunks = []
        morphs = []
        dependencies = {}

        lines = cabocha_sent.split('\n')

        for line in lines:

            if line.startswith('*'):

                if len(morphs) > 0:
                    chunks.append(
                        Chunk(morphs, dst,
                              [k
                               for k, v in dependencies.items() if v == src]))

                morphs = []

                elems = line.split(' ')
                src, dst = int(elems[1]), int(elems[2][:-1])

                if dst > -1:
                    dependencies[src] = dst

            else:
                morphs.append(Morph.parse(line))

        chunks.append(
            Chunk(morphs, dst,
                  [k for k, v in dependencies.items() if v == src]))

        return chunks
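
The '*' lines here are CaboCha lattice chunk headers of the form "* <index> <target>D <head>/<func> <score>"; a standalone sketch of the two fields the snippet extracts (the header value itself is illustrative):

header = '* 0 2D 0/1 -1.911675'  # hypothetical header: chunk 0 depends on chunk 2

elems = header.split(' ')
src, dst = int(elems[1]), int(elems[2][:-1])  # elems[2][:-1] strips the trailing 'D'
print(src, dst)  # -> 0 2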
Example #6
def dump(node):
    debugcnt = 0
    sentence_id = -1
    for c in node:
        if c.tag == "sentence":
            sentence = []
            sentence_id += 1
            chunk_dict = {}  # a sentence contains multiple chunks
            token_dict = {}  # a chunk contains multiple tokens
            chunk_iter = c.iter('chunk')  # getiterator() is deprecated and removed in Python 3.9+
            for chunk in chunk_iter:
                morphs = []
                tok_iter = chunk.iter('tok')
                chunk_dict['id'] = chunk.get("id")
                chunk_dict['link'] = chunk.get("link")
                chunk_dict['rel'] = chunk.get("rel")
                chunk_dict['score'] = chunk.get("score")
                chunk_dict['head'] = chunk.get("head")
                chunk_dict['func'] = chunk.get("func")
                for tok in tok_iter:
                    tok_id = tok.get('id')
                    tok_feature = tok.get('feature')
                    morph = Morph(sentence_id, chunk_dict['id'], tok_id, tok_feature, tok.text)
                    morphs.append(morph)
                    tok_content = tok.text
                a_chunk = Chunk(sentence_id,chunk_dict['id'], chunk_dict['link'], chunk_dict['rel'], \
                chunk_dict['score'], chunk_dict['head'], chunk_dict['func'], morphs)
                # chuncks.append(a_chunk)
                sentence.append(a_chunk)
            sents_list.append(sentence)
        dump(c)
Example #7
    def parse_sentence(cabocha_sent):
        chunks = []
        morphs = []
        dependencies = {}

        lines = cabocha_sent.split('\n')

        for line in lines:

            if line.startswith('*'):

                if len(morphs) > 0:
                    chunks.append(Chunk(morphs, dst, [k for k, v in dependencies.items() if v == src]))

                morphs = []

                elems = line.split(' ')
                src, dst = int(elems[1]), int(elems[2][:-1])

                if dst > -1:
                    dependencies[src] = dst

            else:
                morphs.append(Morph.parse(line))

        chunks.append(Chunk(morphs, dst, [k for k, v in dependencies.items() if v == src]))

        return chunks
Example #8
    def __init__(self, data, count, problemName=None):
        self.problemName = problemName
        self.data = [Morph(tokenize(x)) for x in data]
        self.count = count
        self.bank = FeatureBank([w for w in data])

        self.maximumObservationLength = max([len(w) for w in self.data]) + 1
Example #9
 def __init__(self, morph=None, quality=None):
     """ Initialize a Hyphenator class. You may pass in a Morph() instance,
     or it can create a new one. If you specify quality, that quality will be
     used by default for every call to the hyphenate functions.
     """
     self.morph = morph or Morph()
     self.default_quality = quality or 2
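
One caveat with the "x or default" idiom used above: any falsy argument is silently replaced, so a caller passing quality=0 would still get the default of 2. A standalone illustration:

def pick_quality(quality=None):
    return quality or 2  # same idiom as Hyphenator.__init__

print(pick_quality())   # 2
print(pick_quality(4))  # 4
print(pick_quality(0))  # 2 -- 0 is falsy, so it cannot override the default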
Example #10
    def __init__(self, data, problemName=None, bank = None, useSyllables = False, UG = None,
                 fixedMorphology = None):
        self.problemName = problemName
        self.UG = UG
        self.countingProblem = problemName == "Odden_2.4_Tibetan"

      
        if bank is not None: self.bank = bank
        else:
            self.bank = FeatureBank([ w for l in data for w in l if w is not None ] + ([u'-'] if useSyllables else []))

        self.numberOfInflections = len(data[0])
        for d in data: assert len(d) == self.numberOfInflections
        
        # wrap the data in Morph objects if it isn't already
        self.data = [ tuple( None if i is None else (i if isinstance(i,Morph) else Morph(tokenize(i)))
                             for i in Lex)
                      for Lex in data ]

        self.maximumObservationLength = max([ len(w) for l in self.data for w in l if w is not None ])

        self.wordBoundaries = any([ (u'##' in w.phonemes) for l in self.data for w in l if w ])

        # fixedMorphology : list of morphologies, one for each inflection
        # Each morphology is either None (don't fix it) or a pair of (prefix, suffix)
        if fixedMorphology is None: fixedMorphology = [None]*self.numberOfInflections
        self.fixedMorphology = fixedMorphology
        assert len(self.fixedMorphology) == self.numberOfInflections

        self.inflectionsPerObservation = sum(x is not None
                                             for xs in self.data for x in xs )/len(self.data)

        self.pervasiveTimeout = None

        self.precomputedAlignment = None
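
To make the fixedMorphology convention concrete, here is a shape-only sketch: one entry per inflection, each either None or a (prefix, suffix) pair. In the real constructor the pair holds Morph instances; plain strings stand in for them here:

numberOfInflections = 3
fixedMorphology = [
    None,        # inflection 0: morphology left free for the solver
    ('', 's'),   # inflection 1: fixed empty prefix and suffix -s (Morphs in the real code)
    None,        # inflection 2: morphology left free for the solver
]
assert len(fixedMorphology) == numberOfInflections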
Example #11
class Bot():
    def __init__(self):
        self.dictionary = Dictionary()
        self.morph = Morph()

        self.resp_what = responder.WhatResponder(self.dictionary)
        self.resp_random = responder.RandomResponder(self.dictionary)
        self.resp_pattern = responder.PatternResponder(self.dictionary)
        self.resp_template = responder.TemplateResponder(self.dictionary)
        self.responder = self.resp_pattern

    def dialogue(self, input_text):
        parts = self.morph.analyze(input_text)

        i = random.randint(0, 100)
        if 0 <= i < 40:
            self.responder = self.resp_pattern
        elif 40 <= i < 70:
            self.responder = self.resp_template
        elif 70 <= i < 90:
            self.responder = self.resp_random
        else:
            self.responder = self.resp_what

        self.response = self.responder.response(input_text, parts)

        # learn from this input
        self.dictionary.study(input_text, parts)

        return self.response

    def save(self):
        self.dictionary.save()
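
The random draw above gives the pattern responder roughly a 40% share, template 30%, random 20%, and the 'what' responder the rest; a standalone sketch of just that selection logic, with strings standing in for the responder instances:

import random

def choose_responder():
    i = random.randint(0, 100)  # inclusive on both ends, so 101 outcomes
    if 0 <= i < 40:
        return 'pattern'   # ~40%
    elif 40 <= i < 70:
        return 'template'  # ~30%
    elif 70 <= i < 90:
        return 'random'    # ~20%
    else:
        return 'what'      # ~10%, plus the extra outcome at i == 100

print(choose_responder())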
Example #12
def main():
    # container for the results
    morph_list = []
    i = 0
    with open('neko.txt.cabocha', 'r') as f:
        for line in f.readlines():
            # a '*' line is a chunk header from the dependency parse
            if line.find('*') == 0:
                # if this is the first chunk, a new sentence starts
                if int(line[2]) == 0:
                    # append a fresh list for the new sentence
                    morph_list.append(list())
                    # remember the position of the next list
                    i += 1
            # otherwise, if the line is part of the sentence body
            #elif line.find('EOS') == -1 and line[0] != ' ':
            elif line.find('EOS') == -1:
                # extract the comma-separated part-of-speech features
                speech = line[line.find('\t') + 1:].replace('\n',
                                                            '').split(',')
                # append a Morph for this token to the current sentence
                morph = Morph(line[0:line.find('\t')], speech[6], speech[0],
                              speech[1])
                morph_list[i - 1].append(morph)

    # print the third sentence
    for i in range(len(morph_list[2])):
        morph_list[2][i].show()
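
The non-header lines are MeCab-style morpheme records: a surface form, a tab, then comma-separated features in which index 0 is the part of speech and index 6 the base form. A standalone slice of one such line (the feature values are illustrative):

line = '吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ\n'

surface = line[0:line.find('\t')]
speech = line[line.find('\t') + 1:].replace('\n', '').split(',')
print(surface, speech[6], speech[0], speech[1])  # -> 吾輩 吾輩 名詞 代名詞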
Example #13
    def test_get_sentence(self):
        input_texts = ['こんにちは', 'ジブリが好きです', 'ディズニーが好きです', 'ピクサーが好きです']

        for input_text in input_texts:
            with self.subTest():
                tokens = Morph.analyze(input_text)
                keyword = ''
                for token in tokens:
                    if Morph.is_keyword(token):
                        keyword += token.surface + ' '
                sentence = Search.get_sentence(keyword)
                print('you > ' + input_text)
                print('keyword > ' + keyword)
                print('sentence > ' + sentence)
                print('************')
                self.assertTrue(len(sentence) > 0)
Example #14
def parse():
    cabocha = CaboCha.Parser()
    result = []
    with open('neko.txt') as input_data:
        for line in input_data:
            line = line.strip()
            parsed = cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE)
            chunks = {}
            for sentence_str in parsed.split('* '):
                sentence_analysis = sentence_str.split('\n')
                affiliation_str = sentence_analysis.pop(0)
                if affiliation_str in ['', 'EOS']:
                    continue
                morph_analysis = affiliation_str.split(' ')
                chunk = Chunk()
                chunk.id = int(morph_analysis[0])
                chunk.srcs = int(morph_analysis[1][:-1])
                morphs = []
                for morph_str in sentence_analysis:
                    if morph_str in ['', 'EOS']:
                        continue
                    surface, right = morph_str.split('\t')
                    morph_items = right.split(',')
                    morphs.append(Morph(surface, morph_items[6],
                                        morph_items[0], morph_items[1]))
                chunk.morphs = morphs
                chunks[chunk.id] = chunk
            for i, chunk in chunks.items():
                if chunk.srcs > 0:
                    chunks[chunk.srcs].dst.append(i)
            result.append(chunks)
    return result
Example #15
def read_chunks(sentence) -> list:
    chunks = []  # holds the parse result for one sentence
    chunk = None  # the current chunk (bunsetsu)

    for line in sentence.splitlines():
        # a '*' line is a chunk header from the dependency parse
        if line.find('*') == 0:
            # position where the dependency target is written
            dep_par = re.search('[-]*[0-9]+D', line).start()
            if chunk is not None:
                chunks.append(chunk)
            chunk = Chunk([], int(line[dep_par:line.find('D')]), [])
        # end-of-sentence marker
        elif line.find('EOS') > -1:
            if chunk is not None:
                chunks.append(chunk)
            if len(chunks) > 0:
                # record the source chunks of each dependency
                for i in range(len(chunks)):
                    if chunks[i].dst > -1:
                        chunks[chunks[i].dst].srcs.append(i)
                # return the completed sentence
                return chunks
        else:
            # store the morpheme's information in the current chunk
            speech = line[line.find('\t') + 1:].replace('\n', '').split(',')
            morph = Morph(line[0:line.find('\t')], speech[6], speech[0],
                          speech[1])
            chunk.morphs.append(morph)

    return chunks
Example #16
 def restrict(self, newData):
     restriction = copy.copy(self)
     restriction.data = [
         tuple(None if i is None else (
             i if isinstance(i, Morph) else Morph(tokenize(i)))
               for i in Lex) for Lex in newData
     ]
     return restriction
Example #17
    def solveStem(self, ss, morphology):
        Model.Global()
        stem = Morph.sample()

        for (p, s), x in zip(zip(morphology.prefixes, morphology.suffixes),
                             ss):
            if x is None: continue

            condition(
                matchPattern(
                    x.makeConstant(self.bank),
                    concatenate3(p.makeConstant(self.bank), stem,
                                 s.makeConstant(self.bank))))

        minimize(patternCost(stem))
        output = self.solveSketch()
        return Morph.parse(self.bank, output, stem)
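
The constraint being posted says every observed surface form must decompose as prefix + stem + suffix for one shared stem; a standalone brute-force analogue over plain strings (the words and affixes are made up):

surfaces = ['untied', 'unties']
prefixes = ['un', 'un']
suffixes = ['d', 's']

def consistent(stem):
    return all(x == p + stem + s
               for x, p, s in zip(surfaces, prefixes, suffixes))

# carve a candidate stem out of the first surface form, then check it everywhere
candidate = surfaces[0][len(prefixes[0]):len(surfaces[0]) - len(suffixes[0])]
print(candidate, consistent(candidate))  # -> tie True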
Example #18
 def tibetanCountingConstraints(self, stems, prefixes, suffixes):
     condition(wordLength(prefixes[0]) == 0)
     condition(wordLength(suffixes[0]) == 0)
     condition(wordLength(suffixes[1]) == 0)
     condition(wordLength(prefixes[2]) == 0)
     for n,inflections in enumerate(self.data):
         if inflections == (Morph(u"ǰu"),None,None): # 10
             condition(wordEqual(stems[n],prefixes[1]))
             condition(wordEqual(stems[n],suffixes[2]))
Example #19
 def response(self, input_text='', tokens=[], mood=0):
     keyword = ''
     for token in tokens:
         if Morph.is_keyword(token):
             keyword = token.surface
     generated = self.dictionary.markov.generate(keyword)
     return generated if ((generated is not None) and
                          (len(generated) > 0)) else random.choice(
                              self.dictionary.random)
Example #20
 def stochasticSearch(self, iterations, width):
     population = [Solution([EMPTYRULE],
                            [Morph([])]*self.numberOfInflections,
                            [Morph([])]*self.numberOfInflections)]
     for i in range(iterations):
         # expand the population
         children = [ parent.mutate(self.bank)
                      for parent in population
                      for _ in range(width) ]
         population += children
         populationScores = [ (self.solutionDescriptionLength(s) + s.modelCost(),s)
                              for s in population ]
         populationScores.sort()
         population = [ s
                        for _,s in populationScores[:width] ]
         setVerbosity(4)
         mdl = self.solutionDescriptionLength(population[0])
         setVerbosity(0)
         print "MDL:",mdl+population[0].modelCost()
Example #21
 def study_pattern(self, input_text, tokens):
     for token in tokens:
         if not Morph.is_keyword(token):
             continue
         word = token.surface
         duped = self.find_pattern(word, input_text)
         if duped is not None:
             duped.phrases.append({'need': 0, 'phrase': input_text})
         else:
             self.pattern.append(PatternItem(word, '0##' + input_text))
Example #22
    def test_generate(self):
        self.__add_sentense_bocchan()
        input_texts = [
            '初めまして、坊ちゃん', 'あら、ご病気ですか', 'あらあら、大変ですね', 'いたずらして病気になっちゃったんですか?',
            'そんな威張らなくてもいいでしょう', 'はあ、そんなもんですか', '遅刻しちゃだめですね', 'よく覚えてないんですか?',
            'ターナー?', 'どなたですか?'
        ]

        for input_text in input_texts:
            with self.subTest():
                tokens = Morph.analyze(input_text)
                keyword = 'N/A'
                for token in tokens:
                    if Morph.is_keyword(token):
                        keyword = token.surface
                generated = self.markov.generate(keyword)
                print('you > ' + input_text)
                print('generated > ' + generated)
                print('************')
                self.assertTrue(len(generated) > 0)
Example #23
 def response(self, input_text='', tokens=[], mood=0):
     keywords = []
     for token in tokens:
         if Morph.is_keyword(token):
             keywords.append(token.surface)
     count = len(keywords)
     if count > 0 and count in self.dictionary.template.keys():
         template = random.choice(self.dictionary.template[count])
         for keyword in keywords:
             template = template.replace('%noun%', keyword, 1)
         return template
     return random.choice(self.dictionary.random)
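
How the '%noun%' substitution plays out, as a standalone sketch; the two-slot template and the keywords are made up, while the real ones come from dictionary.template:

template = '%noun%は%noun%より好きです'  # hypothetical two-slot template
keywords = ['ジブリ', 'ディズニー']

for keyword in keywords:
    template = template.replace('%noun%', keyword, 1)  # count=1: fill one slot per keyword
print(template)  # -> ジブリはディズニーより好きです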
Example #24
 def applyRuleUsingSketch(self,r,u,untilSuffix):
     '''u: morph; r: rule; untilSuffix: int'''
     Model.Global()
     result = Morph.sample()
     _r = r.makeDefinition(self.bank)
     condition(wordEqual(result,applyRule(_r,u.makeConstant(self.bank),
                                          Constant(untilSuffix), len(u) + 2)))
     try:
         output = solveSketch(self.bank,
                              max(self.maximumObservationLength, len(u)) + 2,
                              len(u) + 2,
                              showSource=False, minimizeBound=31,
                              timeout=None)
     except SynthesisFailure:
         print "applyRuleUsingSketch: UNSATISFIABLE for %s %s %s"%(u,r,untilSuffix)
         printSketchFailure()
         assert False
     except SynthesisTimeout:
         print "applyRuleUsingSketch: TIMEOUT for %s %s %s"%(u,r,untilSuffix)
         assert False
     return Morph.parse(self.bank, output, result)
Example #25
class Section_5:
    def __init__(self):
        # self.parse_text() # parse ai.ja.txt
        self.morph = Morph()

    # run dependency parsing over ai.ja.txt
    def parse_text(self):
        f = open('ai.ja.txt', 'r')
        data = f.read()
        f.close()
        parse_target = [i for i in data.split('\n') if i != '']  # drop empty strings

        # on CaboCha's parse output, see:
        # https://qiita.com/ayuchiy/items/c3f314889154c4efa71e
        c = CaboCha.Parser()
        f_w = open('ai.ja.txt.parsed', 'w')
        for i in range(len(parse_target)):
            f_w.write(c.parse(parse_target[i]).toString(CaboCha.FORMAT_LATTICE))
        f_w.close()

    def ss0(self):
        self.morph.parse()
Example #26
    def response(self, input_text='', tokens=[], mood=0):
        try:
            keyword = ''
            for token in tokens:
                if Morph.is_keyword(token):
                    keyword += token.surface + ' '

            if len(keyword) > 0:
                sentence = Search.get_sentence(keyword)
                self.dictionary.study_markov(sentence)
                return sentence
        except Exception:
            print('*** error ***')
        return random.choice(self.dictionary.random)
Example #27
    def __init__(self, data, CPUs=1):
        self.CPUs = CPUs
        self.bank = FeatureBank([w for l in data
                                 for w in l if w is not None] + [u'?', u'*'])
        self.numberOfInflections = len(data[0])
        # wrap the data in Morph objects if it isn't already
        self.data = [
            tuple(None if i is None else (
                i if isinstance(i, Morph) else Morph(tokenize(i)))
                  for i in Lex) for Lex in data
        ]

        self.maximumObservationLength = max(
            [len(w) for l in self.data for w in l if w is not None])
Example #28
    def dialogue(self, input_text):
        self.emotion.update(input_text)
        tokens = Morph.analyze(input_text)

        number = randint(0, 9)
        if number == 0:
            self.responder = self.responders['what']
        elif number >= 5:
            self.responder = self.responders['pattern']
        else:
            self.responder = self.responders['random']
        response = self.responder.response(input_text, self.emotion.mood)

        RandomResponder.dictionary.study(input_text, tokens)
        return response
Example #29
 def study_template(self, tokens):
     template = ''
     count = 0
     for token in tokens:
         word = token.surface
         if Morph.is_keyword(token):
             word = '%noun%'
             count += 1
         template += word
     if count == 0:
         return
     if count not in self.template:
         self.template[count] = []
     if template not in self.template[count]:
         self.template[count].append(template)
Example #30
    def __add_sentense_bocchan(self):
        sample_file = '../KOISURU_PROGRAM/sample/markov/bocchan.txt'
        content = ''

        original_content = codecs.open(sample_file, 'r', 'shift_jis')
        for row in original_content:
            content += row.rstrip()
        original_content.close()
        texts = re.split(r'[。??!!  ]+', content)

        for text in texts:
            if text == '':
                continue
            tokens = Morph.analyze(text)
            self.markov.add_sentence(tokens)
            print('.', end='')
        print('')
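
The character class in re.split covers both half- and full-width sentence-ending punctuation (plus spaces), so any run of them becomes a sentence boundary; a standalone check:

import re

content = '吾輩は猫である。名前はまだ無い。どうですか?そうか!'
print(re.split(r'[。??!!  ]+', content))
# -> ['吾輩は猫である', '名前はまだ無い', 'どうですか', 'そうか', '']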
Example #31
    def sketchJointSolution(self, depth, canAddNewRules = False, costUpperBound = None,
                            fixedRules = None, auxiliaryHarness = False, oldSolution=None):
        try:
            Model.Global()
            if fixedRules is None:
                rules = [ Rule.sample() for _ in range(depth) ]
            else:
                rules = [ r.makeDefinition(self.bank) for r in fixedRules ]
            stems = [ Morph.sample() for _ in self.data ]
            prefixes = [ Morph.sample() for _ in range(self.numberOfInflections) ]
            suffixes = [ Morph.sample() for _ in range(self.numberOfInflections) ]

            for j,m in enumerate(self.fixedMorphology):
                if m is not None:
                    (p,s) = m
                    condition(wordEqual(prefixes[j],p.makeConstant(self.bank)))
                    condition(wordEqual(suffixes[j],s.makeConstant(self.bank)))
            if self.wordBoundaries:
                for prefix, suffix in zip(prefixes, suffixes):
                    condition(Or([wordLength(prefix) == 0, wordLength(suffix) == 0]))

            morphologicalCosts = [ None if m is None else len(m[0]) + len(m[1])
                                   for m in self.fixedMorphology ]

            self.minimizeJointCost(rules, stems, prefixes, suffixes, costUpperBound, morphologicalCosts,
                                   oldSolution=oldSolution)

            self.conditionOnData(rules, stems, prefixes, suffixes,
                                 auxiliaryHarness = auxiliaryHarness)
            self.conditionOnPrecomputedMorphology(prefixes, suffixes)

            output = self.solveSketch()
            print "Final hole value:",parseMinimalCostValue(output)

            solution = Solution(prefixes = [ Morph.parse(self.bank, output, p) for p in prefixes ],
                                suffixes = [ Morph.parse(self.bank, output, s) for s in suffixes ],
                                underlyingForms = {x: Morph.parse(self.bank, output, s)
                                                   for x,s in zip(self.data, stems) },
                                rules = [ Rule.parse(self.bank, output, r) for r in rules ] if fixedRules is None else fixedRules)
            solution.showMorphologicalAnalysis()
            solution.showRules()
            return solution
        
        except SynthesisFailure:
            if canAddNewRules:
                depth += 1
                print "Expanding rule depth to %d"%depth
                return self.sketchJointSolution(depth, canAddNewRules = canAddNewRules,
                                                auxiliaryHarness = auxiliaryHarness,
                                                oldSolution=oldSolution)
            else:
                return None
Example #32
        "-c",
        "--wordclass",
        dest="wordclass",
        default="",
        help="limit tests to specific word class ('n' for noun, 's' for verb etc.)",
    )
    parser.add_option(
        "-l",
        "--loose",
        dest="loose",
        action="store_true",
        help="allow a more 'loose' comparison, omitting certain parts of tags",
    )
    options, args = parser.parse_args()

    m = Morph()

    total = 0.0
    total_full = 0.0
    total_tags = 0.0
    total_tags_full = 0
    tags_missed = 0
    missing = 0
    all_missing = 0
    no_candidates = 0
    surplus = 0
    wrong_pick = 0
    tags_ignored = 0
    words_ignored = 0

    start_time = time.time()
Example #33
File: q40.py  Project: tanikawa04/nlp100
# coding: utf-8

from morph import Morph

if __name__ == '__main__':
    with open('neko.txt.cabocha', 'r') as f:
        cabocha_sents = [cabocha_sent.strip() for cabocha_sent in f.read().split('EOS') if cabocha_sent != '\n']

    morphs = []
    cabocha_lines = cabocha_sents[2].split('\n')

    for line in cabocha_lines:
        if not line.startswith('*'):
            morphs.append(Morph.parse(line))

    print(morphs)