示例#1
0
import os.path
assert os.path.isfile('streets.txt')

# Test script: compile the pmatch definitions in 'streets.txt' both from the
# file and from its contents, and check that a French street name gets tagged.
sentence = "Je marche seul dans l'avenue des Ternes."
expected = "Je marche seul dans l'<FrenchStreetName>avenue des Ternes</FrenchStreetName>."

# pmatch transducers are always in ol format, so this has actually no effect...
for fst_type in [
        hfst.ImplementationType.SFST_TYPE,
        hfst.ImplementationType.TROPICAL_OPENFST_TYPE,
        hfst.ImplementationType.FOMA_TYPE,
]:
    # Implementation types that are not available in this build are skipped.
    if hfst.HfstTransducer.is_implementation_type_available(fst_type):
        hfst.set_default_fst_type(fst_type)

        # (1) compile the file directly
        defs = hfst.compile_pmatch_file('streets.txt')
        cont = hfst.PmatchContainer(defs)
        assert cont.match(sentence) == expected

        # (2) compile the contents of file
        # The with-statement closes the file; no explicit close() is needed.
        with open('streets.txt', 'r') as myfile:
            data = myfile.read()
        defs = hfst.compile_pmatch_expression(data)
        cont = hfst.PmatchContainer(defs)
        assert cont.match(sentence) == expected

        # (3) try to compile a nonexistent file
示例#2
0
    def __init__(self, datadir):
        """
        Load the taggers, substitution pairs and tag-handling regexes.

        The compulsory argument *datadir* should be a path to eg. the /tag/
        directory of a finnish-tagtools package. The files
        ``proper_tagger_ph1.pmatch``, ``proper_tagger_ph2.pmatch`` and
        ``lemma-errors.tsv`` are read from that directory.

        Raises ``ValueError`` if a non-blank line of ``lemma-errors.tsv``
        does not consist of exactly three tab-separated fields.
        """
        import os.path

        self.datadir = datadir
        self.postagger = omorfi_postag.TextTagger(self.datadir)
        self.p1_tagger = hfst.PmatchContainer(self.datadir +
                                              "/proper_tagger_ph1.pmatch")
        self.p2_tagger = hfst.PmatchContainer(self.datadir +
                                              "/proper_tagger_ph2.pmatch")

        # Pairs of '#'-terminated strings.
        # NOTE(review): presumably (ending to replace, replacement) pairs used
        # for correcting lemmas; the code consuming them is not in this block.
        self.subs = [
            ("ntelu#", "nnella#"),
            ("ntely#", "nnellä#"),
            ("ltelu#", "llella#"),
            ("ltely#", "llellä#"),
            ("rtelu#", "rrella#"),
            ("rtely#", "rrellä#"),
            ("ppelu#", "pella#"),
            ("ppely#", "pellä#"),
            ("ttelu#", "tella#"),
            ("ttely#", "tellä#"),
            ("kkelu#", "kella#"),
            ("kkely#", "kellä#"),
            ("tely#", "dellä#"),
            ("telu#", "della#"),
            ("kelu#", "ella#"),
            ("kely#", "ellä#"),
            ("elu#", "ella#"),
            ("ely#", "ellä#"),
            ("ilu#", "illa#"),
            ("ily#", "illä#"),
            ("ltaamis#", "llata#"),
            ("ltäämis#", "llätä#"),
            ("bbaamis#", "bata#"),
            ("bbäämis#", "bätä#"),
            ("ggaamis#", "gata#"),
            ("ggäämis#", "gätä#"),
            ("ppaamis#", "pata#"),
            ("ppäämis#", "pätä#"),
            ("ttaamis#", "tata#"),
            ("ttäämis#", "tätä#"),
            # NOTE(review): "ppaamis#"/"ppäämis#" already appear above with
            # "pata#"/"pätä#"; the next two keys were possibly meant to be
            # "kkaamis#"/"kkäämis#" -- confirm before changing.
            ("ppaamis#", "kata#"),
            ("ppäämis#", "kätä#"),
            ("paamis#", "vata#"),
            ("päämis#", "vätä#"),
            ("toamis#", "dota#"),
            ("taamis#", "data#"),
            ("täämis#", "dätä#"),
            ("koamis#", "ota#"),
            ("kaamis#", "ata#"),
            ("käämis#", "ätä#"),
            ("lkenemis#", "ljeta#"),
            ("kenemis#", "eta#"),
            ("enemis#", "eta#"),
            ("enemis#", "etä#"),
            ("itsemis#", "ita#"),
            ("itsemis#", "itä#"),
            ("amis#", "ta#"),
            ("ämis#", "tä#"),
            ("kemis#", "hdä#"),
            ("mis#", "da#"),
            ("mis#", "dä#"),
            ("mis#", "a#"),
            ("mis#", "ä#"),
            ("is#", "inen#"),
            ("s#", "nen#"),
            ("s#", "kset#"),
            ("uden#", "us#"),
            ("yden#", "ys#"),
            # Pluralized lemmas
            ("kulu#", "kulut#"),
            ("olo#", "olot#"),
            ("tila#", "tilat#"),
            ("kilpailu#", "kilpailut#"),
            ("kisa#", "kisat#"),
            ("saksi#", "sakset#"),
            ("hää#", "häät#"),
            ("juhla#", "juhlat#"),
            ("housu#", "housut#"),
            ("hius#", "hiukset#"),
            ("markkina#", "markkinat#"),
            ("päivä#", "päivät#"),
            ("suhde#", "suhteet#"),
            ("resurssi#", "resurssit#"),
            ("voima#", "voimat#"),
            ("kasvo#", "kasvot#"),
            ("lasi#", "lasit#"),
            ("tieto#", "tiedot#"),
        ]

        # os.path.join supplies the separator when datadir has no trailing
        # slash; plain concatenation produced e.g. ".../taglemma-errors.tsv".
        self.regex_filename = os.path.join(self.datadir, 'lemma-errors.tsv')
        # Each entry is (compiled word pattern, compiled lemma pattern,
        # third field of the line as a plain string).
        self.regexes = []
        # NOTE(review): the file is read with the platform default encoding;
        # confirm whether it should be opened explicitly as UTF-8.
        with open(self.regex_filename, 'r') as regex_file:
            for line in regex_file:
                stripped = line.strip()
                if not stripped:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                w_patt, l_patt, l_new = stripped.split('\t')
                self.regexes.append(
                    (re.compile(w_patt + '.*'), re.compile(l_patt + '\\Z'),
                     l_new))

        # <Tag ...>text</Tag ...>  ->  text<Tag .../>
        self.open_and_close_tag_re = re.compile(
            r'<((Enamex|Timex|Numex|Exc)[^>]+)>(.+)</\1>')
        self.open_and_close_tag_re_replacement = r'\3<\1/>'
        # A line starting with an opening tag: move the tag after the rest of
        # the line.
        self.open_tag_re = re.compile(
            r'^(<(Enamex|Timex|Numex|Exc)[^>]+>)([^\t].*)$')
        self.open_tag_re_replacement = r'\3\1'
        # Tags whose content ends in a digit 4/3/2, followed by three/two/one
        # tab-terminated fields respectively.
        self.nested_tag_4 = re.compile(
            r'(</?(Enamex|Timex|Numex)[^>]+4/?>)([^\t]*\t[^\t]*\t[^\t]*\t)')
        self.nested_tag_3 = re.compile(
            r'(</?(Enamex|Timex|Numex)[^>]+3/?>)([^\t]*\t[^\t]*\t)')
        self.nested_tag_2 = re.compile(
            r'(</?(Enamex|Timex|Numex)[^>]+2/?>)([^\t]*\t)')
        # One or more tabs before an opening tag ending in 1: collapse the
        # tabs into a single tab.
        self.nested_tag_1 = re.compile(r'\t+(<(Enamex|Timex|Numex)[^>]+1>)')
        self.nested_tag_1_replacement = r'\t\1'
        # Drop the digit 1-4 that precedes the closing '>' or '/>' of a tag.
        self.nested_tags = re.compile(
            r'(</?(Enamex|Timex|Numex)[^>1234]+)[1234](/?>)')
        self.nested_tags_replacement = r'\1\3'
        # Any opening or closing Exc tag.
        self.exc_tag_re = re.compile(r'</?Exc[^>]+>')
示例#3
0
# Usage: hfst-pmatch.py [--newline] INFILE
#
# Reads the transducers from the input stream, builds a pmatch container from
# them and prints the result of matching the text read from standard input.
# With -n/--newline every input line is matched on its own; otherwise lines
# are accumulated and matched as a block each time a blank line is read.
shortopts = 'n'
longopts = ['newline']
options = hfst_commandline.hfst_getopt(shortopts, longopts, 1)
# Options other than -n/--newline are ignored.
newline = any(opt[0] in ('-n', '--newline') for opt in options[0])
istr = hfst_commandline.get_one_hfst_input_stream(options)[0]

transducers = list(istr)
istr.close()
cont = hfst.PmatchContainer(transducers)

from sys import stdin
if newline:
    for line in stdin:
        print(cont.match(line), end='')
else:
    exp = ''
    for line in stdin:
        exp = exp + line
        # Lines read from stdin keep their terminator, so a blank line is
        # '\n' (or '\r\n'), never ''.
        if line.rstrip('\r\n') == '':
            # Match the accumulated block (including the blank separator
            # line) rather than only the most recent line, then start over.
            print(cont.match(exp), end='')
            exp = ''
    # Flush whatever followed the last blank line; testing exp instead of
    # line also keeps an empty stdin from raising NameError.
    if exp != '':
        print(cont.match(exp), end='')
示例#4
0
import hfst

import os.path
assert os.path.isfile('streets.txt')

# Test script: for every available implementation type, compile the pmatch
# definitions in 'streets.txt', check that a French street name gets tagged,
# and check that compiling a nonexistent file raises IOError.

# pmatch transducers are always in ol format, so this has actually no effect...
for fst_type in [
        hfst.ImplementationType.SFST_TYPE,
        hfst.ImplementationType.TROPICAL_OPENFST_TYPE,
        hfst.ImplementationType.FOMA_TYPE,
]:
    if hfst.HfstTransducer.is_implementation_type_available(fst_type):
        print(hfst.fst_type_to_string(fst_type))
        hfst.set_default_fst_type(fst_type)
        defs = hfst.compile_pmatch_file('streets.txt')
        cont = hfst.PmatchContainer(defs)
        assert cont.match(
            "Je marche seul dans l'avenue des Ternes."
        ) == "Je marche seul dans l'<FrenchStreetName>avenue des Ternes</FrenchStreetName>."

        nonexistent_file = 'foofoofoofoofoofoofoofoofoofoofoofoo'

        assert not os.path.isfile(nonexistent_file)
        try:
            hfst.compile_pmatch_file(nonexistent_file)
        except IOError:
            pass
        else:
            # Raised explicitly (instead of `assert False`) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError(
                'compile_pmatch_file did not raise IOError for a nonexistent file')