def lesionMorphologicalRules(self, solution):
    """Sometimes we end up learning to put the morphology into the rewrite
    rules, e.g. 0 > k/#_ or something like that. This will take a solution
    and try removing insertion/deletion rules whenever possible, keeping the
    underlying forms constant but being willing to modify the morphology."""
    rules = list(solution.rules)
    for r in list(solution.rules):
        if isinstance(r.focus, EmptySpecification) and \
           isinstance(r.structuralChange, ConstantPhoneme) and \
           u'#' in unicode(r):
            print "Candidate for lesion", r
            candidateRules = [r_ for r_ in rules if r_ != r]
            print "the new rules would be", candidateRules
            Model.Global()
            prefixes = [Morph.sample() for _ in xrange(self.numberOfInflections)]
            suffixes = [Morph.sample() for _ in xrange(self.numberOfInflections)]
            self.conditionOnData([r_.makeConstant(self.bank) for r_ in candidateRules],
                                 [solution.underlyingForms[x].makeConstant(self.bank)
                                  for x in self.data],
                                 prefixes, suffixes,
                                 auxiliaryHarness=True)
            #minimize(sum(wordLength(m) for m in prefixes+suffixes ))
            try:
                output = self.solveSketch()
                print "Lesioning morphological rule", r
                solution = Solution(prefixes=[Morph.parse(self.bank, output, p) for p in prefixes],
                                    suffixes=[Morph.parse(self.bank, output, s) for s in suffixes],
                                    underlyingForms=solution.underlyingForms,
                                    rules=candidateRules)
                rules = solution.rules
            except SynthesisFailure:
                print "Turns out that you cannot lesion", r
    return solution
def parseSolution(s):
    def removeComment(y):
        if ';' in y:
            return y[:y.index(';')].strip()
        y = y.strip()
        if len(y) == 0 or y[0] == '#':
            return ''
        return y
    lines = [removeComment(x) for x in s.split('\n')]
    prefixes = []
    suffixes = []
    rules = []
    for l in lines:
        if 'stem' in l:
            [prefix, suffix] = l.split('stem')
            prefix = prefix.replace('+', '').strip()
            suffix = suffix.replace('+', '').strip()
            prefixes.append(Morph(tokenize(prefix)))
            suffixes.append(Morph(tokenize(suffix)))
        elif len(l) > 0:
            r = parseRule(l)
            if r is None:
                print "Could not parse '%s'" % l
                assert False
            rules.append(r)
    return Solution(rules, prefixes, suffixes)
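# A minimal sketch of the input format parseSolution accepts, inferred from
# the parser above: one 'prefix + stem + suffix' line per inflection, ';' or
# '#' starting a comment. The morph strings are illustrative and assume
# tokenize handles them; no rule lines are shown, since parseRule's concrete
# syntax is not part of this excerpt.
example = u"""
; comment lines use ';' or '#'
 + stem + s    ; inflection 0: suffix -s
un + stem +    ; inflection 1: prefix un-
"""
solution = parseSolution(example)
print solution.prefixes, solution.suffixes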
def finalizeResult(self, k, result):
    """do a final Hail Mary transduction of underlying forms and then expand the frontier"""
    if len(result.solutionSequence) == 0:
        emptySolution = Solution(prefixes=[Morph(u"")]*self.numberOfInflections,
                                 suffixes=[Morph(u"")]*self.numberOfInflections,
                                 rules=[],
                                 underlyingForms={})
        result.recordSolution(emptySolution)
        return result.lastSolutionIsFinal()
    setGlobalTimeout(None)
    s = result.solutionSequence[-1][0]
    s = self.finalTransduction(s)
    f = self.expandFrontier(s, k)
    result.recordFinalFrontier(f)
def parse_sentence(cabocha_sent):
    chunks = []
    morphs = []
    dependencies = {}
    lines = cabocha_sent.split('\n')
    for line in lines:
        if line.startswith('*'):
            if len(morphs) > 0:
                chunks.append(Chunk(morphs, dst,
                                    [k for k, v in dependencies.items() if v == src]))
                morphs = []
            elems = line.split(' ')
            src, dst = int(elems[1]), int(elems[2][:-1])
            if dst > -1:
                dependencies[src] = dst
        else:
            morphs.append(Morph.parse(line))
    chunks.append(Chunk(morphs, dst,
                        [k for k, v in dependencies.items() if v == src]))
    return chunks
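# Usage sketch for parse_sentence, following the EOS-splitting convention used
# elsewhere in this collection; assumes neko.txt.cabocha exists and that Chunk
# exposes the (morphs, dst, srcs) fields in the order constructed above.
with open('neko.txt.cabocha', 'r') as f:
    sentences = [s.strip() for s in f.read().split('EOS') if s.strip()]
for chunk in parse_sentence(sentences[0]):
    print(chunk.dst, chunk.srcs)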
def dump(node):
    debugcnt = 0
    sentence_id = -1
    for c in node:
        if c.tag == "sentence":
            sentence = []
            sentence_id += 1
            chunk_dict = {}  # a sentence contains multiple chunks
            token_dict = {}  # a chunk contains multiple tokens
            # getiterator is deprecated; Element.iter is the modern equivalent
            chunk_iter = c.getiterator('chunk')
            for chunk in chunk_iter:
                morphs = []
                tok_iter = chunk.getiterator('tok')
                chunk_dict['id'] = chunk.get("id")
                chunk_dict['link'] = chunk.get("link")
                chunk_dict['rel'] = chunk.get("rel")
                chunk_dict['score'] = chunk.get("score")
                chunk_dict['head'] = chunk.get("head")
                chunk_dict['func'] = chunk.get("func")
                for tok in tok_iter:
                    tok_id = tok.get('id')
                    tok_feature = tok.get('feature')
                    morph = Morph(sentence_id, chunk_dict['id'], tok_id,
                                  tok_feature, tok.text)
                    morphs.append(morph)
                    tok_content = tok.text
                a_chunk = Chunk(sentence_id, chunk_dict['id'], chunk_dict['link'],
                                chunk_dict['rel'], chunk_dict['score'],
                                chunk_dict['head'], chunk_dict['func'], morphs)
                # chuncks.append(a_chunk)
                sentence.append(a_chunk)
            sents_list.append(sentence)  # sents_list is a module-level accumulator
        dump(c)  # recurse into child elements
def __init__(self, data, count, problemName=None):
    self.problemName = problemName
    self.data = [Morph(tokenize(x)) for x in data]
    self.count = count
    self.bank = FeatureBank([w for w in data])
    self.maximumObservationLength = max([len(w) for w in self.data]) + 1
def __init__(self, morph=None, quality=None):
    """Initialize a Hyphenator.

    You may pass in a Morph() instance, or a new one will be created.
    If quality is given, it is used as the default for every call to the
    hyphenate functions.
    """
    self.morph = morph or Morph()
    self.default_quality = quality or 2
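# Minimal construction sketch based only on the constructor above; the
# hyphenate functions themselves are not part of this excerpt, so none are
# called here.
shared = Morph()
h1 = Hyphenator()              # builds its own Morph(), default quality 2
h2 = Hyphenator(morph=shared)  # several hyphenators can share one Morph
h3 = Hyphenator(quality=3)     # override the default quality
# Note: 'quality or 2' also maps an explicit quality=0 back to 2; an
# 'if quality is None' check would be needed to allow 0.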
def __init__(self, data, problemName=None, bank=None, useSyllables=False,
             UG=None, fixedMorphology=None):
    self.problemName = problemName
    self.UG = UG
    self.countingProblem = problemName == "Odden_2.4_Tibetan"
    if bank != None:
        self.bank = bank
    else:
        self.bank = FeatureBank([w for l in data for w in l if w != None] +
                                ([u'-'] if useSyllables else []))
    self.numberOfInflections = len(data[0])
    for d in data:
        assert len(d) == self.numberOfInflections
    # wrap the data in Morph objects if it isn't already
    self.data = [tuple(None if i == None else (i if isinstance(i, Morph) else Morph(tokenize(i)))
                       for i in Lex)
                 for Lex in data]
    self.maximumObservationLength = max([len(w) for l in self.data for w in l if w != None])
    self.wordBoundaries = any([(u'##' in w.phonemes) for l in self.data for w in l if w])
    # fixedMorphology: list of morphologies, one for each inflection.
    # Each morphology is either None (don't fix it) or a pair of (prefix, suffix).
    if fixedMorphology == None:
        fixedMorphology = [None]*self.numberOfInflections
    self.fixedMorphology = fixedMorphology
    assert len(self.fixedMorphology) == self.numberOfInflections
    self.inflectionsPerObservation = sum(x is not None for xs in self.data for x in xs)/len(self.data)
    self.pervasiveTimeout = None
    self.precomputedAlignment = None
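# Hypothetical data layout for the constructor above; the enclosing class is
# not named in this excerpt, so `Problem` is a stand-in, and the surface forms
# are illustrative. One tuple per lexeme, one slot per inflection; None marks
# an unobserved form.
data = [(u"katab", u"kutib"),
        (u"daras", None)]
problem = Problem(data, problemName="example")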
class Bot():
    def __init__(self):
        self.dictionary = Dictionary()
        self.morph = Morph()
        self.resp_what = responder.WhatResponder(self.dictionary)
        self.resp_random = responder.RandomResponder(self.dictionary)
        self.resp_pattern = responder.PatternResponder(self.dictionary)
        self.resp_template = responder.TemplateResponder(self.dictionary)
        self.responder = self.resp_pattern

    def dialogue(self, input_text):
        parts = self.morph.analyze(input_text)
        i = random.randint(0, 100)
        if 0 <= i < 40:
            self.responder = self.resp_pattern
        elif 40 <= i < 70:
            # was 'self.response = ...', which never actually switched the responder
            self.responder = self.resp_template
        elif 70 <= i < 90:
            self.responder = self.resp_random
        else:
            self.responder = self.resp_what
        self.response = self.responder.response(input_text, parts)
        # learning: store this utterance in the dictionary
        self.dictionary.study(input_text, parts)
        return self.response

    def save(self):
        self.dictionary.save()
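# Minimal REPL sketch for the Bot above; assumes Dictionary, Morph, and the
# responder module are importable, and uses an empty line as the quit signal
# (that convention is an assumption, not from the source).
bot = Bot()
while True:
    text = input('you > ')
    if not text:
        break
    print('bot > ' + bot.dialogue(text))
bot.save()  # persist what was learned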
def main():
    morph_list = []  # storage for the results
    i = 0
    with open('neko.txt.cabocha', 'r') as f:
        for line in f.readlines():
            # a '*' line is a chunk delimiter from the dependency parse
            if line.find('*') == 0:
                # chunk number 0 marks the start of a sentence
                if int(line[2]) == 0:
                    # start a new per-sentence list
                    morph_list.append(list())
                    # remember the position of the next list
                    i += 1
            # otherwise the line is part of a sentence
            #elif line.find('EOS') == -1 and line[0] != ' ':
            elif line.find('EOS') == -1:
                # extract the part-of-speech fields
                speech = line[line.find('\t') + 1:].replace('\n', '').split(',')
                # append a Morph for this token to the current sentence's list
                morph = Morph(line[0:line.find('\t')], speech[6], speech[0], speech[1])
                morph_list[i - 1].append(morph)
    # print the third sentence
    for i in range(len(morph_list[2])):
        morph_list[2][i].show()
def test_get_sentence(self):
    input_texts = ['こんにちは', 'ジブリが好きです', 'ディズニーが好きです', 'ピクサーが好きです']
    for input_text in input_texts:
        with self.subTest():
            tokens = Morph.analyze(input_text)
            keyword = ''
            for token in tokens:
                if Morph.is_keyword(token):
                    keyword += token.surface + ' '
            sentence = Search.get_sentence(keyword)
            print('you > ' + input_text)
            print('keyword > ' + keyword)
            print('sentence > ' + sentence)
            print('************')
            self.assertTrue(len(sentence) > 0)
def parse():
    cabocha = CaboCha.Parser()
    result = []
    with open('neko.txt') as input_data:
        for line in input_data:
            line = line.strip()
            parsed = cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE)
            chunks = {}
            for sentence_str in parsed.split('* '):
                sentence_analysis = sentence_str.split('\n')
                affliation_str = sentence_analysis.pop(0)
                if affliation_str in ['', 'EOS']:
                    continue
                morph_analysis = affliation_str.split(' ')
                chunk = Chunk()
                chunk.id = int(morph_analysis[0])
                # note: this stores the dependency *target* in .srcs (and the
                # dependents in .dst below), inverting the usual naming convention
                chunk.srcs = int(morph_analysis[1][:-1])
                morphs = []
                for morph_str in sentence_analysis:
                    if morph_str in ['', 'EOS']:
                        continue
                    surface, right = morph_str.split('\t')
                    morph_items = right.split(',')
                    morphs.append(Morph(surface, morph_items[6], morph_items[0], morph_items[1]))
                chunk.morphs = morphs
                chunks[chunk.id] = chunk
            for i, chunk in chunks.items():
                # -1 means "no target"; 0 is a valid chunk, so test against -1
                # (the original '> 0' silently dropped links to chunk 0)
                if chunk.srcs > -1:
                    chunks[chunk.srcs].dst.append(i)
            result.append(chunks)
    return result
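# A short sketch of walking what parse() returns: a list (one entry per input
# line) of {chunk id: Chunk} dicts. Attribute names follow the code above,
# including its inverted srcs/dst convention.
for chunks in parse()[:3]:
    for i, chunk in chunks.items():
        print(i, 'depends on', chunk.srcs, '| dependents:', chunk.dst)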
def read_chunks(sentence) -> list:
    chunks = []   # the parse of one sentence
    chunk = None  # the current chunk (bunsetsu)
    for line in sentence.splitlines():
        # a '*' line is a chunk delimiter from the dependency parse
        if line.find('*') == 0:
            # position where the dependency target is written
            dep_par = re.search('[-]*[0-9]+D', line).start()
            if chunk is not None:
                chunks.append(chunk)
            chunk = Chunk([], int(line[dep_par:line.find('D')]), [])
        # end-of-sentence marker
        elif line.find('EOS') > -1:
            if chunk is not None:
                chunks.append(chunk)
            if len(chunks) > 0:
                # record the dependency sources
                for i in range(len(chunks)):
                    if chunks[i].dst > -1:
                        chunks[chunks[i].dst].srcs.append(i)
                # return the finished sentence
                return chunks
        else:
            # store the morpheme's fields into the current chunk
            speech = line[line.find('\t') + 1:].replace('\n', '').split(',')
            morph = Morph(line[0:line.find('\t')], speech[6], speech[0], speech[1])
            chunk.morphs.append(morph)
    return chunks  # was 'return chunks_list', an undefined name
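# Usage sketch: split a parsed file into sentences and hand each to
# read_chunks. The file name follows the neko.txt.cabocha convention used
# elsewhere in this collection; 'EOS' is re-appended because read_chunks
# relies on it to wire up the dependency sources.
with open('neko.txt.cabocha', 'r') as f:
    sentences = f.read().split('EOS\n')
for sent in sentences:
    if not sent.strip():
        continue
    for i, chunk in enumerate(read_chunks(sent + 'EOS')):
        print(i, '->', chunk.dst)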
def restrict(self, newData):
    restriction = copy.copy(self)
    restriction.data = [tuple(None if i == None else (i if isinstance(i, Morph) else Morph(tokenize(i)))
                              for i in Lex)
                        for Lex in newData]
    return restriction
def solveStem(self, ss, morphology):
    Model.Global()
    stem = Morph.sample()
    for (p, s), x in zip(zip(morphology.prefixes, morphology.suffixes), ss):
        if x is None:
            continue
        condition(matchPattern(x.makeConstant(self.bank),
                               concatenate3(p.makeConstant(self.bank),
                                            stem,
                                            s.makeConstant(self.bank))))
    minimize(patternCost(stem))
    output = self.solveSketch()
    return Morph.parse(self.bank, output, stem)
def tibetanCountingConstraints(self, stems, prefixes, suffixes):
    condition(wordLength(prefixes[0]) == 0)
    condition(wordLength(suffixes[0]) == 0)
    condition(wordLength(suffixes[1]) == 0)
    condition(wordLength(prefixes[2]) == 0)
    for n, inflections in enumerate(self.data):
        if inflections == (Morph(u"ǰu"), None, None):  # "ǰu" is the numeral 10
            condition(wordEqual(stems[n], prefixes[1]))
            condition(wordEqual(stems[n], suffixes[2]))
def response(self, input_text='', tokens=[], mood=0):
    keyword = ''
    for token in tokens:
        if Morph.is_keyword(token):
            keyword = token.surface
    generated = self.dictionary.markov.generate(keyword)
    return generated if ((generated is not None) and (len(generated) > 0)) \
        else random.choice(self.dictionary.random)
def stochasticSearch(self, iterations, width):
    population = [Solution([EMPTYRULE],
                           [Morph([])]*self.numberOfInflections,
                           [Morph([])]*self.numberOfInflections)]
    for i in range(iterations):
        # expand the population
        children = [parent.mutate(self.bank)
                    for parent in population
                    for _ in range(width)]
        population += children
        populationScores = [(self.solutionDescriptionLength(s) + s.modelCost(), s)
                            for s in population]
        populationScores.sort()
        population = [s for _, s in populationScores[:width]]
        setVerbosity(4)
        mdl = self.solutionDescriptionLength(population[0])
        setVerbosity(0)
        print "MDL:", mdl + population[0].modelCost()
def study_pattern(self, input_text, tokens):
    for token in tokens:
        if not Morph.is_keyword(token):
            continue
        word = token.surface
        duped = self.find_pattern(word, input_text)
        if duped is not None:
            duped.phrases.append({'need': 0, 'phrase': input_text})
        else:
            self.pattern.append(PatternItem(word, '0##' + input_text))
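# The '0##' prefix above packs a need value and the phrase into one string.
# A standalone sketch of that encoding, mirroring the {'need': ..., 'phrase': ...}
# dict appended in the duplicate branch:
def parse_phrase(encoded):
    # 'need##phrase' -> {'need': int, 'phrase': str}
    need, _, phrase = encoded.partition('##')
    return {'need': int(need), 'phrase': phrase}

print(parse_phrase('0##' + 'こんにちは'))  # {'need': 0, 'phrase': 'こんにちは'}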
def test_generate(self):
    self.__add_sentense_bocchan()
    input_texts = [
        '初めまして、坊ちゃん',
        'あら、ご病気ですか',
        'あらあら、大変ですね',
        'いたずらして病気になっちゃったんですか?',
        'そんな威張らなくてもいいでしょう',
        'はあ、そんなもんですか',
        '遅刻しちゃだめですね',
        'よく覚えてないんですか?',
        'ターナー?',
        'どなたですか?'
    ]
    for input_text in input_texts:
        with self.subTest():
            tokens = Morph.analyze(input_text)
            keyword = 'N/A'
            for token in tokens:
                if Morph.is_keyword(token):
                    keyword = token.surface
            generated = self.markov.generate(keyword)
            print('you > ' + input_text)
            print('generated > ' + generated)
            print('************')
            self.assertTrue(len(generated) > 0)
def response(self, input_text='', tokens=[], mood=0):
    keywords = []
    for token in tokens:
        if Morph.is_keyword(token):
            keywords.append(token.surface)
    count = len(keywords)
    if count > 0 and count in self.dictionary.template.keys():
        template = random.choice(self.dictionary.template[count])
        for keyword in keywords:
            template = template.replace('%noun%', keyword, 1)
        return template
    return random.choice(self.dictionary.random)
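# The slot-filling at the core of this responder, as a standalone sketch:
# '%noun%' slots are filled left to right, one keyword per slot
# (example strings are illustrative, not from the source).
template = '%noun%は%noun%が好きですか'
keywords = ['猫', '魚']
for kw in keywords:
    template = template.replace('%noun%', kw, 1)
print(template)  # -> 猫は魚が好きですか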
def applyRuleUsingSketch(self, r, u, untilSuffix):
    '''u: morph; r: rule; untilSuffix: int'''
    Model.Global()
    result = Morph.sample()
    _r = r.makeDefinition(self.bank)
    condition(wordEqual(result,
                        applyRule(_r, u.makeConstant(self.bank),
                                  Constant(untilSuffix), len(u) + 2)))
    try:
        output = solveSketch(self.bank,
                             max(self.maximumObservationLength, len(u)) + 2,
                             len(u) + 2,
                             showSource=False, minimizeBound=31, timeout=None)
    except SynthesisFailure:
        print "applyRuleUsingSketch: UNSATISFIABLE for %s %s %s"%(u, r, untilSuffix)
        printSketchFailure()
        assert False
    except SynthesisTimeout:
        print "applyRuleUsingSketch: TIMEOUT for %s %s %s"%(u, r, untilSuffix)
        assert False
    return Morph.parse(self.bank, output, result)
class Section_5:
    def __init__(self):
        # self.parse_text()  # parse ai.ja.txt first if needed
        self.morph = Morph()

    # run dependency parsing over ai.ja.txt
    def parse_text(self):
        f = open('ai.ja.txt', 'r')
        data = f.read()
        f.close()
        parse_target = [i for i in data.split('\n') if i != '']  # drop empty lines
        # on CaboCha parsing, see:
        # https://qiita.com/ayuchiy/items/c3f314889154c4efa71e
        c = CaboCha.Parser()
        f_w = open('ai.ja.txt.parsed', 'w')
        for i in range(len(parse_target)):
            f_w.write(c.parse(parse_target[i]).toString(CaboCha.FORMAT_LATTICE))
        f_w.close()

    def ss0(self):
        self.morph.parse()
def response(self, input_text='', tokens=[], mood=0):
    try:
        keyword = ''
        for token in tokens:
            if Morph.is_keyword(token):
                keyword += token.surface + ' '
        if len(keyword) > 0:
            sentence = Search.get_sentence(keyword)
            self.dictionary.study_markov(sentence)
            return sentence
    except Exception:  # was a bare 'except:', which also swallows KeyboardInterrupt
        print('*** error ***')
    return random.choice(self.dictionary.random)
def __init__(self, data, CPUs=1):
    self.CPUs = CPUs
    self.bank = FeatureBank([w for l in data for w in l if w != None] + [u'?', u'*'])
    self.numberOfInflections = len(data[0])
    # wrap the data in Morph objects if it isn't already
    self.data = [tuple(None if i == None else (i if isinstance(i, Morph) else Morph(tokenize(i)))
                       for i in Lex)
                 for Lex in data]
    self.maximumObservationLength = max([len(w) for l in self.data for w in l if w != None])
def dialogue(self, input_text):
    self.emotion.update(input_text)
    tokens = Morph.analyze(input_text)
    number = randint(0, 9)  # was randint(9); random.randint requires both bounds
    if number == 0:
        self.responder = self.responders['what']     # 1 in 10
    elif number >= 5:
        self.responder = self.responders['pattern']  # 5 in 10
    else:
        self.responder = self.responders['random']   # 4 in 10
    response = self.responder.response(input_text, self.emotion.mood)
    RandomResponder.dictionary.study(input_text, tokens)
    return response
def study_template(self, tokens):
    template = ''
    count = 0
    for token in tokens:
        word = token.surface
        if Morph.is_keyword(token):
            word = '%noun%'
            count += 1
        template += word
    if count == 0:
        return
    if count not in self.template.keys():
        self.template[count] = []
    if template not in self.template[count]:
        self.template[count].append(template)
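# Standalone sketch of the structure study_template builds: templates keyed by
# their '%noun%' slot count, which is the same count the template responder
# later uses to pick a compatible template (strings are illustrative).
template_dict = {
    1: ['%noun%が好き'],
    2: ['%noun%は%noun%が好き'],
}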
def __add_sentense_bocchan(self):
    sample_file = '../KOISURU_PROGRAM/sample/markov/bocchan.txt'
    content = ''
    original_content = codecs.open(sample_file, 'r', 'shift_jis')
    for row in original_content:
        content += row.rstrip()
    original_content.close()
    texts = re.split(r'[。??!! ]+', content)
    for text in texts:
        if text == '':
            continue
        tokens = Morph.analyze(text)
        self.markov.add_sentence(tokens)
        print('.', end='')
    print('')
def sketchJointSolution(self, depth, canAddNewRules=False, costUpperBound=None,
                        fixedRules=None, auxiliaryHarness=False, oldSolution=None):
    try:
        Model.Global()
        if fixedRules == None:
            rules = [Rule.sample() for _ in range(depth)]
        else:
            rules = [r.makeDefinition(self.bank) for r in fixedRules]
        stems = [Morph.sample() for _ in self.data]
        prefixes = [Morph.sample() for _ in range(self.numberOfInflections)]
        suffixes = [Morph.sample() for _ in range(self.numberOfInflections)]

        for j, m in enumerate(self.fixedMorphology):
            if m != None:
                (p, s) = m
                condition(wordEqual(prefixes[j], p.makeConstant(self.bank)))
                condition(wordEqual(suffixes[j], s.makeConstant(self.bank)))
        if self.wordBoundaries:
            for prefix, suffix in zip(prefixes, suffixes):
                condition(Or([wordLength(prefix) == 0, wordLength(suffix) == 0]))

        morphologicalCosts = [None if m == None else len(m[0]) + len(m[1])
                              for m in self.fixedMorphology]
        self.minimizeJointCost(rules, stems, prefixes, suffixes, costUpperBound,
                               morphologicalCosts, oldSolution=oldSolution)

        self.conditionOnData(rules, stems, prefixes, suffixes,
                             auxiliaryHarness=auxiliaryHarness)
        self.conditionOnPrecomputedMorphology(prefixes, suffixes)

        output = self.solveSketch()
        print "Final hole value:", parseMinimalCostValue(output)

        solution = Solution(prefixes=[Morph.parse(self.bank, output, p) for p in prefixes],
                            suffixes=[Morph.parse(self.bank, output, s) for s in suffixes],
                            underlyingForms={x: Morph.parse(self.bank, output, s)
                                             for x, s in zip(self.data, stems)},
                            rules=[Rule.parse(self.bank, output, r) for r in rules]
                                  if fixedRules == None else fixedRules)
        solution.showMorphologicalAnalysis()
        solution.showRules()
        return solution
    except SynthesisFailure:
        if canAddNewRules:
            depth += 1
            print "Expanding rule depth to %d"%depth
            return self.sketchJointSolution(depth, canAddNewRules=canAddNewRules,
                                            auxiliaryHarness=auxiliaryHarness,
                                            oldSolution=oldSolution)
        else:
            return None
"-c", "--wordclass", dest="wordclass", default="", help="limit tests to specific word class ('n' for noun, 's' for verb etc.)", ) parser.add_option( "-l", "--loose", dest="loose", action="store_true", help="allow a more 'loose' comparison, omitting certain parts of tags", ) options, args = parser.parse_args() m = Morph() total = 0.0 total_full = 0.0 total_tags = 0.0 total_tags_full = 0 tags_missed = 0 missing = 0 all_missing = 0 no_candidates = 0 surplus = 0 wrong_pick = 0 tags_ignored = 0 words_ignored = 0 start_time = time.time()
# coding: utf-8
from morph import Morph

if __name__ == '__main__':
    with open('neko.txt.cabocha', 'r') as f:
        cabocha_sents = [cabocha_sent.strip()
                         for cabocha_sent in f.read().split('EOS')
                         if cabocha_sent != '\n']
    morphs = []
    cabocha_lines = cabocha_sents[2].split('\n')
    for line in cabocha_lines:
        if not line.startswith('*'):
            morphs.append(Morph.parse(line))
    print(morphs)