Пример #1
0
 def lesionMorphologicalRules(self, solution):
     """Sometimes we end up learning to put the morphology into the rewrite
     rules, e.g. 0 > k/#_ or something like that. This will take a
     solution and try removing insertion/deletion rules whenever
     possible, keeping the underlying forms constant but being
     willing to modify the morphology.
     """
     rules = list(solution.rules)
     for r in list(solution.rules):
         if isinstance(r.focus, EmptySpecification) and isinstance(r.structuralChange, ConstantPhoneme) and \
            u'#' in unicode(r):
             print "Candidate for lesion",r
             candidateRules = [ r_ for r_ in rules if r_ != r ]
             print "the new rules would be",candidateRules
             Model.Global()
             prefixes = [ Morph.sample() for _ in xrange(self.numberOfInflections) ]
             suffixes = [ Morph.sample() for _ in xrange(self.numberOfInflections) ]
             self.conditionOnData([ r_.makeConstant(self.bank) for r_ in candidateRules ],
                                  [ solution.underlyingForms[x].makeConstant(self.bank)
                                    for x in self.data ],
                                  prefixes, suffixes,
                                  auxiliaryHarness=True)
             #minimize(sum(wordLength(m) for m in prefixes+suffixes ))
             try:
                 output = self.solveSketch()
                 print "Lesioning morphological rule", r
                 solution = Solution(prefixes = [ Morph.parse(self.bank, output, p) for p in prefixes ],
                                     suffixes = [ Morph.parse(self.bank, output, s) for s in suffixes ],
                                     underlyingForms = solution.underlyingForms,
                                     rules = candidateRules)
                 rules = solution.rules
             except SynthesisFailure:
                 print "Turns out that you cannot lesion",r
     return solution
Пример #2
0
    def sketchJointSolution(self, depth, canAddNewRules = False, costUpperBound = None,
                            fixedRules = None, auxiliaryHarness = False, oldSolution=None):
        try:
            Model.Global()
            if fixedRules == None:
                rules = [ Rule.sample() for _ in range(depth) ]
            else:
                rules = [ r.makeDefinition(self.bank) for r in fixedRules ]
            stems = [ Morph.sample() for _ in self.data ]
            prefixes = [ Morph.sample() for _ in range(self.numberOfInflections) ]
            suffixes = [ Morph.sample() for _ in range(self.numberOfInflections) ]

            for j,m in enumerate(self.fixedMorphology):
                if m != None:
                    (p,s) = m
                    condition(wordEqual(prefixes[j],p.makeConstant(self.bank)))
                    condition(wordEqual(suffixes[j],s.makeConstant(self.bank)))
            if self.wordBoundaries:
                for prefix, suffix in zip(prefixes, suffixes):
                    condition(Or([wordLength(prefix) == 0, wordLength(suffix) == 0]))

            morphologicalCosts = [ None if m == None else len(m[0]) + len(m[1])
                                   for m in self.fixedMorphology ]

            self.minimizeJointCost(rules, stems, prefixes, suffixes, costUpperBound, morphologicalCosts,
                                   oldSolution=oldSolution)

            self.conditionOnData(rules, stems, prefixes, suffixes,
                                 auxiliaryHarness = auxiliaryHarness)
            self.conditionOnPrecomputedMorphology(prefixes, suffixes)

            output = self.solveSketch()
            print "Final hole value:",parseMinimalCostValue(output)

            solution = Solution(prefixes = [ Morph.parse(self.bank, output, p) for p in prefixes ],
                                suffixes = [ Morph.parse(self.bank, output, s) for s in suffixes ],
                                underlyingForms = {x: Morph.parse(self.bank, output, s)
                                                   for x,s in zip(self.data, stems) },
                                rules = [ Rule.parse(self.bank, output, r) for r in rules ] if fixedRules == None else fixedRules)
            solution.showMorphologicalAnalysis()
            solution.showRules()
            return solution
        
        except SynthesisFailure:
            if canAddNewRules:
                depth += 1
                print "Expanding rule depth to %d"%depth
                return self.sketchJointSolution(depth, canAddNewRules = canAddNewRules,
                                                auxiliaryHarness = auxiliaryHarness,
                                                oldSolution=oldSolution)
            else:
                return None
Пример #3
0
    def parse_sentence(cabocha_sent):
        """Split one CaboCha-formatted sentence into a list of Chunk objects.

        A line starting with '*' is a chunk header: its second space-separated
        field is read as the chunk index (src) and its third field, minus the
        last character, as the target index (dst).  Every other line is passed
        to Morph.parse and attached to the chunk opened by the latest header.

        Each Chunk is built as Chunk(morphs, dst, srcs), where srcs lists the
        indices recorded so far whose target is this chunk.  A dst of -1 (or
        lower) is not recorded as a dependency.

        Input without any header line yields an empty list; previously it
        raised UnboundLocalError because src/dst were never assigned.
        """
        chunks = []
        morphs = []
        dependencies = {}
        # Stay None until the first header line has been read.
        src = dst = None

        for line in cabocha_sent.split('\n'):
            if not line.startswith('*'):
                morphs.append(Morph.parse(line))
                continue

            # A new header closes the chunk collected so far.  Morphemes that
            # appear before any header have no chunk to belong to.
            if morphs and src is not None:
                chunks.append(
                    Chunk(morphs, dst,
                          [k for k, v in dependencies.items() if v == src]))
            morphs = []

            elems = line.split(' ')
            src, dst = int(elems[1]), int(elems[2][:-1])
            if dst > -1:
                dependencies[src] = dst

        # Flush the last open chunk, if a header was ever seen.
        if src is not None:
            chunks.append(
                Chunk(morphs, dst,
                      [k for k, v in dependencies.items() if v == src]))

        return chunks
Пример #4
0
    def parse_sentence(cabocha_sent):
        """Turn the CaboCha output for one sentence into a list of Chunk objects.

        Header lines start with '*'.  Field 1 (space-separated) is parsed as
        the chunk's own index and field 2, with its final character dropped,
        as the index it points to.  All other lines go through Morph.parse and
        belong to the most recently opened chunk.

        Chunks are created as Chunk(morphs, dst, srcs); srcs holds the indices
        seen so far whose recorded target equals this chunk's index.  Targets
        below 0 are not recorded.

        When the text contains no header line the result is an empty list
        (the original code raised UnboundLocalError in that case).
        """
        chunks = []
        morphs = []
        dependencies = {}
        header_seen = False
        src = dst = None

        def sources_of(index):
            # Indices whose recorded dependency target is `index`.
            return [k for k, v in dependencies.items() if v == index]

        for line in cabocha_sent.split('\n'):
            if line.startswith('*'):
                # Close the previous chunk before opening the next one.
                if header_seen and len(morphs) > 0:
                    chunks.append(Chunk(morphs, dst, sources_of(src)))
                morphs = []

                elems = line.split(' ')
                src, dst = int(elems[1]), int(elems[2][:-1])
                header_seen = True

                if dst > -1:
                    dependencies[src] = dst
            else:
                morphs.append(Morph.parse(line))

        # The final chunk has no following header to trigger its flush.
        if header_seen:
            chunks.append(Chunk(morphs, dst, sources_of(src)))

        return chunks
Пример #5
0
class Section_5:
    """Driver for the dependency-parsing exercises over ai.ja.txt."""

    def __init__(self):
        # self.parse_text()  # run this to (re)generate ai.ja.txt.parsed
        self.morph = Morph()

    def parse_text(self):
        """Dependency-parse ai.ja.txt with CaboCha.

        Reads ai.ja.txt, drops empty lines, parses every remaining line and
        writes the lattice-format results, in order, to ai.ja.txt.parsed.
        Both files are closed even if parsing raises.
        """
        with open('ai.ja.txt', 'r') as f:
            data = f.read()
        # Strip out empty lines.
        parse_target = [line for line in data.split('\n') if line != '']

        # About CaboCha's analysis output:
        # https://qiita.com/ayuchiy/items/c3f314889154c4efa71e
        c = CaboCha.Parser()
        with open('ai.ja.txt.parsed', 'w') as f_w:
            for sentence in parse_target:
                f_w.write(c.parse(sentence).toString(CaboCha.FORMAT_LATTICE))

    def ss0(self):
        """Delegate to the Morph helper's parse()."""
        self.morph.parse()
Пример #6
0
    def solveStem(self, ss, morphology):
        """Solve for a single stem that explains the surface forms in ss.

        ss is walked in step with morphology.prefixes / morphology.suffixes;
        entries that are None are skipped.  Each remaining surface form must
        match prefix + stem + suffix, and the stem's patternCost is minimized.
        Returns the parsed stem.
        """
        Model.Global()
        stem = Morph.sample()

        affixPairs = zip(morphology.prefixes, morphology.suffixes)
        for affixes, surface in zip(affixPairs, ss):
            if surface is None:
                continue
            prefix, suffix = affixes
            observed = surface.makeConstant(self.bank)
            predicted = concatenate3(prefix.makeConstant(self.bank),
                                     stem,
                                     suffix.makeConstant(self.bank))
            condition(matchPattern(observed, predicted))

        minimize(patternCost(stem))
        output = self.solveSketch()
        return Morph.parse(self.bank, output, stem)
Пример #7
0
 def applyRuleUsingSketch(self,r,u,untilSuffix):
     '''u: morph; r: rule; untilSuffix: int'''
     Model.Global()
     result = Morph.sample()
     _r = r.makeDefinition(self.bank)
     condition(wordEqual(result,applyRule(_r,u.makeConstant(self.bank),
                                          Constant(untilSuffix), len(u) + 2)))
     try:
         output = solveSketch(self.bank,
                              max(self.maximumObservationLength, len(u)) + 2,
                              len(u) + 2,
                              showSource=False, minimizeBound=31,
                              timeout=None)
     except SynthesisFailure:
         print "applyRuleUsingSketch: UNSATISFIABLE for %s %s %s"%(u,r,untilSuffix)
         printSketchFailure()
         assert False
     except SynthesisTimeout:
         print "applyRuleUsingSketch: TIMEOUT for %s %s %s"%(u,r,untilSuffix)
         assert False
     return Morph.parse(self.bank, output, result)
Пример #8
0
 def parseAffix(output, morph):
     # With morphology disabled every affix is the empty morph; otherwise
     # read the affix back out of the solver output.
     if not useMorphology:
         return Morph([])
     return Morph.parse(self.bank, output, morph)
Пример #9
0
    def paretoFront(self, depth, k, temperature, useMorphology = False,
                    offFront=0,
                    oldSolutions=[],
                    morphologicalCoefficient = 3,
                    stemBaseline=0, minimizeBits=7):
        """Enumerate solutions trading off rule cost against stem cost.

        A model with `depth` sampled rules, one sampled stem per data item and
        one prefix/suffix per inflection is built once; the solver is then
        called up to k + offFront times, each time with extra constraints
        derived from the costs of the solutions found so far.

        depth: number of rules to sample.
        k: number of solutions to look for under the Pareto constraint.
        temperature: only used when reporting the "optimal" solution, which
            minimizes stemCost + ruleCost/temperature.
        useMorphology: when false, every affix is the constant empty Morph.
        offFront: number of extra solutions to look for under the weaker
            "different cost pair" constraint.
        oldSolutions: if truthy, a pair (solutions, solutionCosts); both lists
            are extended in place and only the weaker constraint is used.
        morphologicalCoefficient: weight on affix length in the rule cost.
        stemBaseline: subtracted from the summed stem length.
        minimizeBits: bit width of the stem cost variable; also sets the
            minimizeBound passed to solveSketch.

        Returns (solutions, solutionCosts), where solutionCosts holds
        (ruleCost, stemCost) pairs aligned with solutions.
        """
        # no idea why we want this
        #self.maximumObservationLength += 1

        def affix():
            # Unknown affix when morphology is enabled, constant empty morph otherwise.
            if useMorphology: return Morph.sample()
            else: return Morph([]).makeConstant(self.bank)
        def parseAffix(output, morph):
            # Mirror of affix(): only parse the output when the affix was sampled.
            if useMorphology: return Morph.parse(self.bank, output, morph)
            else: return Morph([])
            
        Model.Global()
        rules = [ Rule.sample() for _ in range(depth) ]

        stems = [ Morph.sample() for _ in self.data ]
        prefixes = [ affix() for _ in range(self.numberOfInflections) ]
        suffixes = [ affix() for _ in range(self.numberOfInflections) ]

        for i in range(len(stems)):
            self.conditionOnStem_1a(rules, stems[i], prefixes, suffixes, self.data[i])
        # actually we want this
        #for r in rules: condition(Not(ruleDoesNothing(r)))

        # Stem cost: total stem length relative to stemBaseline, bound to a
        # solver variable so it can be constrained and read back later.
        stemCostExpression = sum([ wordLength(u) for u in stems ]) - stemBaseline
        stemCostVariable = unknownInteger(numberOfBits = minimizeBits)
        condition(stemCostVariable == stemCostExpression)
        minimize(stemCostExpression)
        # Rule cost: rule costs plus weighted affix lengths.
        ruleCostExpression = sum([ ruleCost(r) for r in rules ] + [ wordLength(u)*morphologicalCoefficient for u in suffixes + prefixes ])
        ruleCostVariable = unknownInteger()
        condition(ruleCostVariable == ruleCostExpression)
        if len(rules) > 0 or useMorphology:
            minimize(ruleCostExpression)

        solutions = []
        solutionCosts = []
        if oldSolutions:
            # Continue from a previous run; these lists are appended to in place.
            solutions = oldSolutions[0]
            solutionCosts = oldSolutions[1]
        solutionIndex = 0
        while solutionIndex < k + offFront:
            # Excludes solutions we have already found
            # (the constraints are issued again for every known cost pair on
            # each pass through the loop)
            for rc,uc in solutionCosts:
                if oldSolutions or solutionIndex >= k:
                    # This condition just says that it has to be a
                    # different trade-off. Gets things a little bit off of
                    # the front
                    condition(And([ruleCostVariable == rc,stemCostVariable == (uc - stemBaseline)]) == 0)
                else:
                    # This condition says that it has to actually be on
                    # the pareto - a stronger constraint
                    condition(Or([ruleCostVariable < rc, stemCostVariable < (uc - stemBaseline)]))

            try:
                output = self.solveSketch(minimizeBound = int(2**minimizeBits - 1))
            except SynthesisFailure:
                if offFront > 0 and solutionIndex < k:
                    # Skip the remaining on-front attempts and switch to the
                    # weaker constraint.
                    solutionIndex = k
                    print "Nothing on front, moving to things just off of front..."
                    continue
                else:
                    print "Exiting Pareto procedure early due to unsatisfied"
                    break
            except SynthesisTimeout:
                print "Exiting Pareto procedure early due to timeout"
                break

            s = Solution(suffixes = [ parseAffix(output, m) for m in suffixes ],
                         prefixes = [ parseAffix(output, m) for m in prefixes ],
                         rules = [ Rule.parse(self.bank, output, r) for r in rules ],
                         underlyingForms = {x: Morph.parse(self.bank, output, m)
                                            for x,m in zip(self.data, stems) }).withoutUselessRules()
            solutions.append(s)
            print s

            # Recompute both costs from the parsed solution and check them
            # against the values the solver reported.
            rc = sum([r.cost() for r in s.rules ] + [len(a)*morphologicalCoefficient for a in s.prefixes + s.suffixes ])
            uc = sum([len(u) for u in s.underlyingForms.values() ])
            rc = int(rc + 0.5)
            print "Costs:",(rc,uc)
            actualCosts = (parseInteger(output, ruleCostVariable), parseInteger(output, stemCostVariable) + stemBaseline)
            print "Actual costs:",actualCosts
            if not (actualCosts == (rc,uc)):
                # Dump the raw solver output before the assertion below fires.
                print output
            assert actualCosts == (rc,uc)
            (rc,uc) = actualCosts
            solutionCosts.append((rc,uc))

            solutionIndex += 1

        print " pareto: got %d solutions of depth %d"%(len(solutions),depth)
        
        if len(solutions) > 0:
            # Reporting only: the selected solution is printed, not returned separately.
            optimalCost, optimalSolution = min([(uc + float(rc)/temperature, s)
                                                for ((rc,uc),s) in zip(solutionCosts, solutions) ])
            print "Optimal solution:"
            print optimalSolution
            print "Optimal cost:",optimalCost

        return solutions, solutionCosts
Пример #10
0
    def solveAlignment(self):
        Model.Global()
        prefixes = [Morph.sample() for _ in range(self.numberOfInflections)]
        suffixes = [Morph.sample() for _ in range(self.numberOfInflections)]
        stems = [Morph.sample() for _ in self.data]

        for surfaces, stem in zip(self.data, stems):
            for (p, s), x in zip(zip(prefixes, suffixes), surfaces):
                if x is None: continue
                condition(
                    matchPattern(x.makeConstant(self.bank),
                                 concatenate3(p, stem, s)))

        for i in range(self.numberOfInflections):
            if all(ss[i] == None for ss in self.data):
                condition(wordLength(prefixes[i]) == 0)
                condition(wordLength(suffixes[i]) == 0)

        # OBJECTIVE: (# inflections) * (stem lengths) + (# data points) * (affix len)
        # Because we pay for each stem once per inflection,
        # and pay for each affix once per data point
        observationsPerStem = float(
            sum(s is not None for ss in self.data for s in ss)) / len(stems)
        observationsPerAffix = sum( sum(ss[i] is not None
                                        for ss in self.data )
                                    for i in range(self.numberOfInflections) ) \
                                        / float(self.numberOfInflections)
        print "observations per stem", observationsPerStem
        print "observations per affix", observationsPerAffix

        r = observationsPerStem / observationsPerAffix
        if r < 2 and r > 0.5:
            ca = 1
            cs = 1
        elif r >= 2:
            ca = 1
            cs = 2
        elif r <= 0.5:
            ca = 2
            cs = 1
        else:
            assert False

        print "ca = ", ca
        print "cs = ", cs

        minimize(sum((patternCost(p) + patternCost(s)) * ca
                     for j,(p,s) in enumerate(zip(prefixes, suffixes))) + \
                 sum(patternCost(stem) * cs
                     for stem,ss in zip(stems, self.data) ))
        # for m in prefixes + suffixes:
        #     condition(patternCost(m) < 4)

        output = self.solveSketch()
        solution = Solution(
            rules=[],
            prefixes=[Morph.parse(self.bank, output, p) for p in prefixes],
            suffixes=[Morph.parse(self.bank, output, p) for p in suffixes],
            underlyingForms={
                x: Morph.parse(self.bank, output, s)
                for x, s in zip(self.data, stems)
            })

        for i in range(self.numberOfInflections):
            if all(ss[i] == None for ss in self.data):
                print("\t(inflection not seen)")
            else:
                print solution.prefixes[i], "+ stem +", solution.suffixes[i]
        return solution
Пример #11
0
    def sketchJointSolution(self,
                            depth,
                            canAddNewRules=True,
                            existingSolutions=[]):
        assert depth == 1
        assert canAddNewRules

        Model.Global()

        r = Rule.sample()
        for o in existingSolutions:
            assert len(o.rules) == 1
            condition(Not(ruleEqual(r, o.rules[0].makeConstant(self.bank))))

        morphs = {}
        morphs[1] = Morph.sample()
        morphs[4] = Morph.sample()
        morphs[5] = Morph.sample()
        morphs[9] = Morph.sample()
        morphs[10] = Morph.sample()

        if existingSolutions:
            for (k, ), v in existingSolutions[0].underlyingForms.iteritems():
                condition(wordEqual(v.makeConstant(self.bank), morphs[k]))

        for j in range(len(self.data)):
            o = self.data[j]
            k = self.count[j]
            if k <= 10:
                condition(
                    wordEqual(
                        o.makeConstant(self.bank),
                        applyRule(r, morphs[k], Constant(0),
                                  self.maximumObservationLength)))
            elif k % 10 == 0:
                condition(
                    wordEqual(
                        o.makeConstant(self.bank),
                        applyRule(r, concatenate(morphs[k / 10], morphs[10]),
                                  Constant(0), self.maximumObservationLength)))
            elif k < 20:
                condition(
                    wordEqual(
                        o.makeConstant(self.bank),
                        applyRule(r, concatenate(morphs[10], morphs[k - 10]),
                                  Constant(0), self.maximumObservationLength)))
            else:
                assert False

        minimize(ruleCost(r))

        try:
            output = solveSketch(
                self.bank,
                unroll=self.maximumObservationLength + 2,
                maximumMorphLength=self.maximumObservationLength + 1)
        except SynthesisFailure:
            print "Failed at phonological analysis."
            return None

        r = Rule.parse(self.bank, output, r)
        print r.pretty()
        return Solution(rules=[r],
                        prefixes=[],
                        suffixes=[],
                        underlyingForms={(k, ):
                                         Morph.parse(self.bank, output, m)
                                         for k, m in morphs.iteritems()})
Пример #12
0
# coding: utf-8

from morph import Morph

if __name__ == '__main__':
    # Load the CaboCha dump and cut it into sentences at the 'EOS' marker,
    # discarding pieces that are just a newline.
    with open('neko.txt.cabocha', 'r') as f:
        raw = f.read()
    cabocha_sents = []
    for piece in raw.split('EOS'):
        if piece != '\n':
            cabocha_sents.append(piece.strip())

    # Show the morphemes of the third sentence; lines starting with '*'
    # are skipped rather than parsed.
    cabocha_lines = cabocha_sents[2].split('\n')
    morphs = [
        Morph.parse(line) for line in cabocha_lines
        if not line.startswith('*')
    ]

    print(morphs)
Пример #13
0
# coding: utf-8

from morph import Morph

if __name__ == '__main__':
    # Read the whole CaboCha file, then split it into per-sentence strings
    # on the 'EOS' marker (pieces equal to a lone newline are dropped).
    with open('neko.txt.cabocha', 'r') as f:
        contents = f.read()
    cabocha_sents = [
        piece.strip() for piece in contents.split('EOS') if piece != '\n'
    ]

    # Parse every non-header line of the third sentence into a Morph.
    cabocha_lines = cabocha_sents[2].split('\n')
    morphs = []
    for line in cabocha_lines:
        if line.startswith('*'):
            continue
        morphs.append(Morph.parse(line))

    print(morphs)