Example #1
def parseSolution(s):
    def removeComment(y):
        if ';' in y: return y[:y.index(';')].strip()
        y = y.strip()
        if len(y) == 0 or y[0] == '#': return ''
        return y

    lines = [removeComment(x) for x in s.split('\n')]
    prefixes = []
    suffixes = []
    rules = []
    for l in lines:
        if 'stem' in l:
            [prefix, suffix] = l.split('stem')
            prefix = prefix.replace('+', '').strip()
            suffix = suffix.replace('+', '').strip()
            prefixes.append(Morph(tokenize(prefix)))
            suffixes.append(Morph(tokenize(suffix)))
        elif len(l) > 0:
            r = parseRule(l)
            if r == None:
                print "Could not parse '%s'" % l
                assert False
            rules.append(r)
    return Solution(rules, prefixes, suffixes)
Example #2
 def finalizeResult(self, k, result):
     """do a final Hail Mary transduction of underlying forms and then expand the frontier"""
     if len(result.solutionSequence) == 0:
         emptySolution = Solution(prefixes=[Morph(u"")]*self.numberOfInflections,
                                  suffixes=[Morph(u"")]*self.numberOfInflections,
                                  rules=[], underlyingForms={})
         result.recordSolution(emptySolution)
         return result.lastSolutionIsFinal()
     
     setGlobalTimeout(None)
     s = result.solutionSequence[-1][0]
     s = self.finalTransduction(s)
     f = self.expandFrontier(s, k)
     result.recordFinalFrontier(f)
Example #3
def read_chunks(sentence) -> list:
    chunks = []  # parse results for one sentence
    chunk = None  # current chunk (bunsetsu)

    for line in sentence.splitlines():
        # a '*' line marks a chunk boundary from the dependency parse
        if line.find('*') == 0:
            # position where the dependency target is written
            dep_par = re.search('[-]*[0-9]+D', line).start()
            if chunk is not None:
                chunks.append(chunk)
            chunk = Chunk([], int(line[dep_par:line.find('D')]), [])
        # end-of-sentence marker
        elif line.find('EOS') > -1:
            if chunk is not None:
                chunks.append(chunk)
            if len(chunks) > 0:
                # record the dependency sources
                for i in range(len(chunks)):
                    if chunks[i].dst > -1:
                        chunks[chunks[i].dst].srcs.append(i)
                # return this sentence's chunks
                return chunks
        else:
            # store the morpheme information in the current chunk
            speech = line[line.find('\t') + 1:].replace('\n', '').split(',')
            morph = Morph(line[0:line.find('\t')], speech[6], speech[0],
                          speech[1])
            chunk.morphs.append(morph)

    return chunks
Example #4
def parse():
    cabocha = CaboCha.Parser()
    result = []
    with open('neko.txt') as input_data:
        for line in input_data:
            line = line.strip()
            parsed = cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE)
            chunks = {}
            for sentence_str in parsed.split('* '):
                sentence_analysis = sentence_str.split('\n')
                affiliation_str = sentence_analysis.pop(0)
                if affiliation_str in ['', 'EOS']:
                    continue
                morph_analysis = affiliation_str.split(' ')
                chunk = Chunk()
                chunk.id = int(morph_analysis[0])
                chunk.srcs = int(morph_analysis[1][:-1])
                morphs = []
                for morph_str in sentence_analysis:
                    if morph_str in ['', 'EOS']:
                        continue
                    surface, right = morph_str.split('\t')
                    morph_items = right.split(',')
                    morphs.append(Morph(surface, morph_items[6],
                                        morph_items[0], morph_items[1]))
                chunk.morphs = morphs
                chunks[chunk.id] = chunk
            for i, chunk in chunks.items():
                if chunk.srcs > 0:
                    chunks[chunk.srcs].dst.append(i)
            result.append(chunks)
    return result
Example #5
def dump(node):
    debugcnt = 0
    sentence_id = -1
    for c in node:
        if c.tag == "sentence":
            sentence = []
            sentence_id += 1
            chunk_dict = {}  # a sentence contains multiple chunks
            token_dict = {}  # a chunk contains multiple tokens
            chunk_iter = c.getiterator('chunk')
            for chunk in chunk_iter:
                morphs = []
                tok_iter = chunk.getiterator('tok')
                chunk_dict['id'] = chunk.get("id")
                chunk_dict['link'] = chunk.get("link")
                chunk_dict['rel'] = chunk.get("rel")
                chunk_dict['score'] = chunk.get("score")
                chunk_dict['head'] = chunk.get("head")
                chunk_dict['func'] = chunk.get("func")
                for tok in tok_iter:
                    tok_id = tok.get('id')
                    tok_feature = tok.get('feature')
                    morph = Morph(sentence_id, chunk_dict['id'], tok_id, tok_feature, tok.text)
                    morphs.append(morph)
                    tok_content = tok.text
                a_chunk = Chunk(sentence_id,chunk_dict['id'], chunk_dict['link'], chunk_dict['rel'], \
                chunk_dict['score'], chunk_dict['head'], chunk_dict['func'], morphs)
                # chuncks.append(a_chunk)
                sentence.append(a_chunk)
            sents_list.append(sentence)
        dump(c)
Example #6
    def __init__(self, data, count, problemName=None):
        self.problemName = problemName
        self.data = [Morph(tokenize(x)) for x in data]
        self.count = count
        self.bank = FeatureBank([w for w in data])

        self.maximumObservationLength = max([len(w) for w in self.data]) + 1
Example #7
 def __init__(self, morph=None, quality=None):
     """ Initialize a Hyphenator class. You may pass in a Morph() instance,
     or it can create a new one. If you specify quality, that quality will be
     used by default for every call to the hyphenate functions.
     """
     self.morph = morph or Morph()
     self.default_quality = quality or 2
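
The docstring above spells out the two ways this class can be constructed; a minimal usage sketch under the assumption that Hyphenator and Morph are importable from the surrounding module (the import path below is a placeholder, not confirmed by the example):

# Construction sketch only; the module name is hypothetical.
from hyphenator import Hyphenator, Morph

h_default = Hyphenator()                         # builds its own Morph(), default quality 2
h_custom = Hyphenator(morph=Morph(), quality=3)  # reuses an existing Morph, overrides the quality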
Example #8
def main():
    # list for storing the results
    morph_list = []
    i = 0
    with open('neko.txt.cabocha', 'r') as f:
        for line in f.readlines():
            # a '*' line marks a chunk boundary from the dependency parse
            if line.find('*') == 0:
                # if this is the first chunk, a new sentence starts here
                if int(line[2]) == 0:
                    # append a new inner list for this sentence
                    morph_list.append(list())
                    # remember the index of the current list
                    i += 1
            # otherwise, if the line is part of a sentence
            #elif line.find('EOS') == -1 and line[0] != ' ':
            elif line.find('EOS') == -1:
                # store the part-of-speech information
                speech = line[line.find('\t') + 1:].replace('\n',
                                                            '').split(',')
                # insert the Morph instance into the list for the current sentence
                morph = Morph(line[0:line.find('\t')], speech[6], speech[0],
                              speech[1])
                morph_list[i - 1].append(morph)

    # print the morphemes of the third sentence (index 2)
    for i in range(len(morph_list[2])):
        morph_list[2][i].show()
Example #9
    def __init__(self, data, problemName=None, bank = None, useSyllables = False, UG = None,
                 fixedMorphology = None):
        self.problemName = problemName
        self.UG = UG
        self.countingProblem = problemName == "Odden_2.4_Tibetan"

      
        if bank != None: self.bank = bank
        else:
            self.bank = FeatureBank([ w for l in data for w in l if w != None ] + ([u'-'] if useSyllables else []))

        self.numberOfInflections = len(data[0])
        for d in data: assert len(d) == self.numberOfInflections
        
        # wrap the data in Morph objects if it isn't already
        self.data = [ tuple( None if i == None else (i if isinstance(i,Morph) else Morph(tokenize(i)))
                             for i in Lex)
                      for Lex in data ]

        self.maximumObservationLength = max([ len(w) for l in self.data for w in l if w != None ])

        self.wordBoundaries = any([ (u'##' in w.phonemes) for l in self.data for w in l if w ])

        # fixedMorphology : list of morphologies, one for each inflection
        # Each morphology is either None (don't fix it) or a pair of (prefix, suffix)
        if fixedMorphology == None: fixedMorphology = [None]*self.numberOfInflections
        self.fixedMorphology = fixedMorphology
        assert len(self.fixedMorphology) == self.numberOfInflections

        self.inflectionsPerObservation = sum(x is not None
                                             for xs in self.data for x in xs )/len(self.data)

        self.pervasiveTimeout = None

        self.precomputedAlignment = None
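
The fixedMorphology comment in the constructor above describes a list with one entry per inflection, each entry either None or a (prefix, suffix) pair. An illustrative value for a three-inflection problem, assuming the Morph and tokenize helpers used throughout these examples (the affixes themselves are made up):

# Hypothetical fixedMorphology: leave inflection 0 free,
# pin the other two to a null prefix plus a known suffix.
fixedMorphology = [
    None,
    (Morph(tokenize(u"")), Morph(tokenize(u"s"))),
    (Morph(tokenize(u"")), Morph(tokenize(u"d"))),
]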
Example #10
 def restrict(self, newData):
     restriction = copy.copy(self)
     restriction.data = [
         tuple(None if i == None else (
             i if isinstance(i, Morph) else Morph(tokenize(i)))
               for i in Lex) for Lex in newData
     ]
     return restriction
Example #11
    def __init__(self):
        self.dictionary = Dictionary()
        self.morph = Morph()

        self.resp_what = responder.WhatResponder(self.dictionary)
        self.resp_random = responder.RandomResponder(self.dictionary)
        self.resp_pattern = responder.PatternResponder(self.dictionary)
        self.resp_template = responder.TemplateResponder(self.dictionary)
        self.responder = self.resp_pattern
Example #12
 def tibetanCountingConstraints(self, stems, prefixes, suffixes):
     condition(wordLength(prefixes[0]) == 0)
     condition(wordLength(suffixes[0]) == 0)
     condition(wordLength(suffixes[1]) == 0)
     condition(wordLength(prefixes[2]) == 0)
     for n,inflections in enumerate(self.data):
         if inflections == (Morph(u"ǰu"),None,None): # 10
             condition(wordEqual(stems[n],prefixes[1]))
             condition(wordEqual(stems[n],suffixes[2]))
Example #13
 def stochasticSearch(self, iterations, width):
     population = [Solution([EMPTYRULE],
                            [Morph([])]*self.numberOfInflections,
                            [Morph([])]*self.numberOfInflections)]
     for i in range(iterations):
         # expand the population
         children = [ parent.mutate(self.bank)
                      for parent in population
                      for _ in range(width) ]
         population += children
         populationScores = [ (self.solutionDescriptionLength(s) + s.modelCost(),s)
                              for s in population ]
         populationScores.sort()
         population = [ s
                        for _,s in populationScores[:width] ]
         setVerbosity(4)
         mdl = self.solutionDescriptionLength(population[0])
         setVerbosity(0)
         print "MDL:",mdl+population[0].modelCost()
Example #14
    def __init__(self, data, CPUs=1):
        self.CPUs = CPUs
        self.bank = FeatureBank([w for l in data
                                 for w in l if w != None] + [u'?', u'*'])
        self.numberOfInflections = len(data[0])
        # wrap the data in Morph objects if it isn't already
        self.data = [
            tuple(None if i == None else (
                i if isinstance(i, Morph) else Morph(tokenize(i)))
                  for i in Lex) for Lex in data
        ]

        self.maximumObservationLength = max(
            [len(w) for l in self.data for w in l if w != None])
Example #15
    def solve(self):
        '''
        insert your code
        '''
        with open('neko.txt.cabocha', 'r') as f:
            morphologies = []
            sentence = []
            for line in f.readlines():
                if line.strip() == 'EOS':
                    if len(sentence) > 0:
                        morphologies.append(sentence)
                        sentence = []
                elif not line.startswith('*'):
                    surface, result = line.split()
                    results = result.split(',')
                    sentence.append(
                        Morph(surface, results[6], results[0], results[1]))
            print morphologies[2]

        return None
Example #16
def read_chunks(filename='') -> list:
    chunks_list = []  # collected results for every sentence
    chunks = []  # parse results for one sentence
    chunk = None  # current chunk (bunsetsu)

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            # a '*' line marks a chunk boundary from the dependency parse
            if line.find('*') == 0:
                # position where the dependency target is written
                dep_par = re.search('[-]*[0-9]+D', line).start()
                if chunk is not None:
                    chunks.append(chunk)
                chunk = Chunk([], int(line[dep_par:line.find('D')]), [])
            # end-of-sentence marker
            elif line.find('EOS') > -1:
                if chunk is not None:
                    chunks.append(chunk)
                if len(chunks) > 0:
                    # record the dependency sources
                    for i in range(len(chunks)):
                        if chunks[i].dst > -1:
                            chunks[chunks[i].dst].srcs.append(i)
                    # store this sentence
                    chunks_list.append(chunks)
                # reset for the next sentence
                chunk = None
                chunks = []
            else:
                # store the morpheme information in the current chunk
                speech = line[line.find('\t') + 1:].replace('\n',
                                                            '').split(',')
                morph = Morph(line[0:line.find('\t')], speech[6], speech[0],
                              speech[1])
                chunk.morphs.append(morph)

    return chunks_list
Example #17
    def morph_model(self, ctx, params):
        """
        Morph Function
        :param params: instance of type "CallingParams" (Insert your typespec
           information here.) -> structure: parameter "fbamodel_workspace" of
           String, parameter "fbamodel_id" of String, parameter
           "media_workspace" of String, parameter "media_id" of String,
           parameter "genome_workspace" of String, parameter "genome_id" of
           String, parameter "proteincomparison_workspace" of String,
           parameter "proteincomparison_id" of String, parameter "fill_src"
           of Long, parameter "translate_media" of Long, parameter
           "num_reactions_to_process" of Long, parameter
           "translate_media_workspace" of String, parameter
           "translate_media_id" of String, parameter "output_id" of String,
           parameter "workspace" of String
        :returns: instance of type "CallingResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN morph_model
        self.service = Service(self.callback_url, self.workspaceURL, ctx)
        required_args = [
            'fbamodel_name', 'fbamodel_workspace', 'media_name',
            'media_workspace', 'proteincomparison_name',
            'proteincomparison_workspace', 'genome_name', 'genome_workspace',
            'output_name'
        ]
        for r in required_args:
            if r not in params:
                raise ValueError("insufficient params supplied")

        def _translate_obj_identity(workspace, name):
            info = self.service.get_info(workspace, name=name)
            return info[0], workspace

        objid, ws = _translate_obj_identity(params['fbamodel_workspace'],
                                            params['fbamodel_name'])
        model = FBAModel(objid, ws, service=self.service)
        objid, ws = _translate_obj_identity(params['media_workspace'],
                                            params['media_name'])
        media = Media(objid, ws, service=self.service)
        objid, ws = _translate_obj_identity(
            params['proteincomparison_workspace'],
            params['proteincomparison_name'])
        protcomp = ProteomeComparison(objid, ws, service=self.service)
        objid, ws = _translate_obj_identity(params['genome_workspace'],
                                            params['genome_name'])
        genome = Genome(objid, ws, service=self.service)
        probanno = None
        if 'rxn_probs_name' in params and 'rxn_probs_workspace' in params and \
                params['rxn_probs_name'] is not None and len(params['rxn_probs_name']) > 0:
            objid, ws = _translate_obj_identity(params['rxn_probs_workspace'],
                                                params['rxn_probs_name'])
            probanno = ReactionProbabilities(objid, ws, service=self.service)
        morph = Morph(service=self.service,
                      src_model=model,
                      media=media,
                      probanno=probanno,
                      protcomp=protcomp,
                      genome=genome,
                      ws_id=params['workspace'])
        if 'fill_src' in params and params['fill_src']:
            morph.fill_src_to_media()
        morph.translate_features()
        morph.reconstruct_genome()
        morph.label_reactions()
        morph.build_supermodel()
        if 'translate_media' in params and params['translate_media']:
            if 'target_media_name' in params and 'target_media_workspace' in params:
                objid, ws = _translate_obj_identity(
                    params['target_media_workspace'],
                    params['target_media_name'])
                new_media = Media(objid, ws, service=self.service)
            else:
                new_media = morph.media
            morph.translate_media(new_media)
        output_name = params[
            'output_name'] if 'output_name' in params else 'MorphedModel'
        if 'num_reactions_to_process' in params:
            morph.process_reactions(num_reactions=int(
                params['num_reactions_to_process']),
                                    name=output_name)
        else:
            morph.process_reactions(name=output_name)

        reportObj = {'objects_created': [], 'text_message': "MIGHTY"}
        #save report
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = [
            params['workspace'] + '/' + params['fbamodel_name']
        ]
        try:
            report_info_list = self.service.ws_client.save_objects({
                'workspace':
                params['workspace'],
                'objects': [{
                    'type':
                    'KBaseReport.Report',
                    'data':
                    reportObj,
                    'name':
                    'CallingFBA_report' + str(hex(uuid.getnode())),
                    'meta': {},
                    'hidden':
                    1,  # important!  make sure the report is hidden
                    'provenance':
                    provenance
                }]
            })
        except:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lines = traceback.format_exception(exc_type, exc_value,
                                               exc_traceback)
            orig_error = ''.join('    ' + line for line in lines)
            raise ValueError('Error saving Report object to workspace:\n' +
                             orig_error)
        report_info = report_info_list[0]
        print('Ready to return')
        returnVal = {
            'report_name':
            'FS_report',
            'report_ref':
            str(report_info[6]) + '/' + str(report_info[0]) + '/' +
            str(report_info[4])
        }
        #END morph_model

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method morph_model return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
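
The docstring at the top of morph_model lists the CallingParams fields, while the required_args check in the body uses *_name keys rather than the docstring's *_id keys. A hedged sketch of a params dict that would satisfy that check (every value is a placeholder, not a real workspace object):

# Placeholder values only; keys mirror required_args plus the 'workspace'
# key that the method reads later.
params = {
    'fbamodel_name': 'my_model', 'fbamodel_workspace': 'my_ws',
    'media_name': 'my_media', 'media_workspace': 'my_ws',
    'proteincomparison_name': 'my_protcomp',
    'proteincomparison_workspace': 'my_ws',
    'genome_name': 'my_genome', 'genome_workspace': 'my_ws',
    'output_name': 'MorphedModel', 'workspace': 'my_ws',
}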
Example #18
 def __init__(self):
     # self.parse_text()  # parse ai.ja.txt
     self.morph = Morph()
Example #19
def init_morph():
    global morph
    if not morph: morph = Morph()
Example #20
    (u"man", u"manmandə"),
    #                   (u"kwaj",u"kwajkwajdə"),
    #                   (u"çin",u"çinçində"),
    (u"le", u"leledə")
]

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Learn pig Latin and Chinese")
    parser.add_argument(
        'task',
        choices=["Chinese", "Latin", "Latin1", "Latin2", "Latin3"],
        default="Latin")
    parser.add_argument("-d", "--depth", default=1, type=int)
    arguments = parser.parse_args()

    examples = data[arguments.task]
    depth = arguments.depth

    leaveSketchOutput()
    solution = SupervisedProblem(
        [(Morph(tokenize(x)), Constant(0), Morph(tokenize(y)))
         for x, y in examples],
        syllables=True).solve(depth)

    if solution == None:
        print "No solution."
    else:
        for r in solution:
            print r
Example #21
 def affix():
     if useMorphology: return Morph.sample()
     else: return Morph([]).makeConstant(self.bank)
Example #22
 def toMorph(z):
     if isinstance(z, Morph): return z
     elif isinstance(z, (unicode, str)): return Morph(tokenize(z))
     else: assert False
Example #23
 def __init__(self, morph=None):
     self.morph = morph or Morph()
Example #24
 def parseAffix(output, morph):
     if useMorphology: return Morph.parse(self.bank, output, morph)
     else: return Morph([])
Example #25
 def parse_morph(self, line):
     surface, result = line.split()
     results = result.split(',')
     morph = Morph(surface, results[6], results[0], results[1])
     return morph
Example #26
 def restrict(self, newData):
     """Creates a new version of this object which is identical but has different training data"""
     restriction = copy.copy(self)
     restriction.data = [ tuple( None if i == None else (i if isinstance(i,Morph) else Morph(tokenize(i)))
                            for i in Lex)
                          for Lex in newData ]
     return restriction
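
The restrict docstring above says the method returns an otherwise identical copy of the problem re-targeted at different training data; a one-line usage sketch, with problem and heldOutData as hypothetical names:

# Hypothetical: re-target an existing problem object at held-out data.
smallerProblem = problem.restrict(heldOutData)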
Example #27
    parser.add_option(
        '-c',
        '--wordclass',
        dest='wordclass',
        default='',
        help=
        "limit tests to specific word class ('n' for noun, 's' for verb etc.)")
    parser.add_option(
        '-l',
        '--loose',
        dest='loose',
        action="store_true",
        help="allow a more 'loose' comparison, omitting certain parts of tags")
    options, args = parser.parse_args()

    m = Morph()

    total = 0.0
    total_full = 0.0
    total_tags = 0.0
    total_tags_full = 0
    tags_missed = 0
    missing = 0
    all_missing = 0
    no_candidates = 0
    surplus = 0
    wrong_pick = 0
    tags_ignored = 0
    words_ignored = 0

    start_time = time.time()