import os.path

# BUGFIX: the original only imported os.path but calls hfst.* throughout,
# which would raise NameError on the first hfst reference.
import hfst

# The pmatch grammar file must exist before we start.
assert os.path.isfile('streets.txt')

# pmatch transducers are always in ol format, so this has actually no effect...
# We still iterate over the backends to exercise each one that is available.
for impl_type in [hfst.ImplementationType.SFST_TYPE,
                  hfst.ImplementationType.TROPICAL_OPENFST_TYPE,
                  hfst.ImplementationType.FOMA_TYPE]:
    if hfst.HfstTransducer.is_implementation_type_available(impl_type):
        hfst.set_default_fst_type(impl_type)

        # (1) compile the file directly
        defs = hfst.compile_pmatch_file('streets.txt')
        cont = hfst.PmatchContainer(defs)
        assert cont.match("Je marche seul dans l'avenue des Ternes.") == \
            "Je marche seul dans l'<FrenchStreetName>avenue des Ternes</FrenchStreetName>."

        # (2) compile the contents of file
        # BUGFIX: the original called myfile.close() inside the with-block;
        # the context manager already closes the file, so the explicit close
        # was redundant and is dropped.
        with open('streets.txt', 'r') as myfile:
            data = myfile.read()
        defs = hfst.compile_pmatch_expression(data)
        cont = hfst.PmatchContainer(defs)
        assert cont.match("Je marche seul dans l'avenue des Ternes.") == \
            "Je marche seul dans l'<FrenchStreetName>avenue des Ternes</FrenchStreetName>."

# (3) try to compile a nonexistent file
def __init__(self, datadir):
    """
    The compulsory argument *datadir* should be a path to eg. the
    /tag/ directory of a finnish-tagtools package.

    Loads the POS tagger, the two pmatch proper-name tagging phases and
    the lemma-error regex table from files under *datadir*, and builds
    the lemma suffix-substitution table and the tag-normalisation regexes.
    """
    self.datadir = datadir
    # Part-of-speech tagger plus the two pmatch phases of proper-name tagging.
    self.postagger = omorfi_postag.TextTagger(self.datadir)
    self.p1_tagger = hfst.PmatchContainer(self.datadir + "/proper_tagger_ph1.pmatch")
    self.p2_tagger = hfst.PmatchContainer(self.datadir + "/proper_tagger_ph2.pmatch")
    # Ordered lemma suffix substitutions ('#' marks the lemma boundary).
    # NOTE(review): order matters — longer/more specific suffixes precede
    # shorter ones, and some keys repeat deliberately (first match wins
    # presumably depends on the caller's lookup strategy) — do not sort.
    self.subs = [
        ("ntelu#", "nnella#"), ("ntely#", "nnellä#"),
        ("ltelu#", "llella#"), ("ltely#", "llellä#"),
        ("rtelu#", "rrella#"), ("rtely#", "rrellä#"),
        ("ppelu#", "pella#"), ("ppely#", "pellä#"),
        ("ttelu#", "tella#"), ("ttely#", "tellä#"),
        ("kkelu#", "kella#"), ("kkely#", "kellä#"),
        ("tely#", "dellä#"), ("telu#", "della#"),
        ("kelu#", "ella#"), ("kely#", "ellä#"),
        ("elu#", "ella#"), ("ely#", "ellä#"),
        ("ilu#", "illa#"), ("ily#", "illä#"),
        ("ltaamis#", "llata#"), ("ltäämis#", "llätä#"),
        ("bbaamis#", "bata#"), ("bbäämis#", "bätä#"),
        ("ggaamis#", "gata#"), ("ggäämis#", "gätä#"),
        ("ppaamis#", "pata#"), ("ppäämis#", "pätä#"),
        ("ttaamis#", "tata#"), ("ttäämis#", "tätä#"),
        ("ppaamis#", "kata#"), ("ppäämis#", "kätä#"),
        ("paamis#", "vata#"), ("päämis#", "vätä#"),
        ("toamis#", "dota#"),
        ("taamis#", "data#"), ("täämis#", "dätä#"),
        ("koamis#", "ota#"),
        ("kaamis#", "ata#"), ("käämis#", "ätä#"),
        ("lkenemis#", "ljeta#"),
        ("kenemis#", "eta#"),
        ("enemis#", "eta#"), ("enemis#", "etä#"),
        ("itsemis#", "ita#"), ("itsemis#", "itä#"),
        ("amis#", "ta#"), ("ämis#", "tä#"),
        ("kemis#", "hdä#"),
        ("mis#", "da#"), ("mis#", "dä#"),
        ("mis#", "a#"), ("mis#", "ä#"),
        ("is#", "inen#"),
        ("s#", "nen#"), ("s#", "kset#"),
        ("uden#", "us#"), ("yden#", "ys#"),
        # Pluralized lemmas
        ("kulu#", "kulut#"), ("olo#", "olot#"), ("tila#", "tilat#"),
        ("kilpailu#", "kilpailut#"), ("kisa#", "kisat#"),
        ("saksi#", "sakset#"), ("hää#", "häät#"), ("juhla#", "juhlat#"),
        ("housu#", "housut#"), ("hius#", "hiukset#"),
        ("markkina#", "markkinat#"), ("päivä#", "päivät#"),
        ("suhde#", "suhteet#"), ("resurssi#", "resurssit#"),
        ("voima#", "voimat#"),
        ("kasvo#", "kasvot#"), ("lasi#", "lasit#"), ("tieto#", "tiedot#"),
    ]
    # BUGFIX: the original concatenated self.datadir and the filename with no
    # path separator (yielding e.g. "taglemma-errors.tsv"); every other
    # datadir path in this method inserts '/'.
    self.regex_filename = self.datadir + '/lemma-errors.tsv'
    # Each TSV row: wordform-pattern, lemma-pattern, replacement lemma.
    # The wordform pattern is anchored as a prefix ('.*' appended) and the
    # lemma pattern must match the whole string ('\Z' appended).
    self.regexes = []
    # BUGFIX: the original iterated over open(...) without ever closing the
    # file handle; use a with-block so it is closed deterministically.
    with open(self.regex_filename, 'r') as regex_file:
        for line in regex_file:
            w_patt, l_patt, l_new = line.strip().split('\t')
            self.regexes.append(
                (re.compile(w_patt + '.*'), re.compile(l_patt + '\\Z'), l_new))
    # Regexes for normalising Enamex/Timex/Numex/Exc markup in the tagger
    # output (tab-separated token lines).
    # A tag pair wrapping content on one line is rewritten as content + an
    # empty element: <X>y</X> -> y<X/>.
    self.open_and_close_tag_re = re.compile(
        r'<((Enamex|Timex|Numex|Exc)[^>]+)>(.+)</\1>')
    self.open_and_close_tag_re_replacement = r'\3<\1/>'
    # A leading opening tag is moved behind the token text.
    self.open_tag_re = re.compile(
        r'^(<(Enamex|Timex|Numex|Exc)[^>]+>)([^\t].*)$')
    self.open_tag_re_replacement = r'\3\1'
    # Nested tags numbered 4..2 are moved right across the corresponding
    # number of tab-separated fields (deepest nesting first).
    self.nested_tag_4 = re.compile(
        r'(</?(Enamex|Timex|Numex)[^>]+4/?>)([^\t]*\t[^\t]*\t[^\t]*\t)')
    self.nested_tag_3 = re.compile(
        r'(</?(Enamex|Timex|Numex)[^>]+3/?>)([^\t]*\t[^\t]*\t)')
    self.nested_tag_2 = re.compile(
        r'(</?(Enamex|Timex|Numex)[^>]+2/?>)([^\t]*\t)')
    # Level-1 tags: collapse the run of tabs in front of the tag to one tab.
    self.nested_tag_1 = re.compile(r'\t+(<(Enamex|Timex|Numex)[^>]+1>)')
    self.nested_tag_1_replacement = r'\t\1'
    # Finally strip the nesting-level digit from all tags.
    self.nested_tags = re.compile(
        r'(</?(Enamex|Timex|Numex)[^>1234]+)[1234](/?>)')
    self.nested_tags_replacement = r'\1\3'
    # Matches any Exc tag (opening or closing) for removal.
    self.exc_tag_re = re.compile(r'</?Exc[^>]+>')
# Command-line driver: load pmatch transducers from an HFST input stream
# given on the command line and run them over stdin.
newline = False
shortopts = 'n'
longopts = ['newline']
options = hfst_commandline.hfst_getopt(shortopts, longopts, 1)
for opt in options[0]:
    if opt[0] == '-n' or opt[0] == '--newline':
        newline = True
    else:
        pass  # raise RuntimeError('Usage: hfst-pmatch.py [--newline] INFILE')

# Read every transducer from the (single) input stream.
istr = hfst_commandline.get_one_hfst_input_stream(options)[0]
transducers = []
for tr in istr:
    transducers.append(tr)
istr.close()
cont = hfst.PmatchContainer(transducers)

from sys import stdin
if newline:
    # --newline: match each input line separately.
    for line in stdin:
        print(cont.match(line), end='')
else:
    # Default: accumulate the whole of stdin and match it as one expression.
    # BUGFIX: the original built up `exp` but then called cont.match(line),
    # and its blank-line test (line == '') could never fire because lines
    # from stdin keep their trailing '\n' — so only the last input line was
    # ever matched.  NOTE(review): whole-input matching is the presumed
    # intent given the accumulator; confirm against upstream hfst-pmatch.
    exp = ''
    for line in stdin:
        exp = exp + line
    if exp != '':
        print(cont.match(exp), end='')
import hfst
import os.path

# The pmatch grammar file must exist before we start.
assert os.path.isfile('streets.txt')

# pmatch transducers are always in ol format, so this has actually no effect...
# (renamed loop variable: `type` shadowed the builtin)
for impl_type in [hfst.ImplementationType.SFST_TYPE,
                  hfst.ImplementationType.TROPICAL_OPENFST_TYPE,
                  hfst.ImplementationType.FOMA_TYPE]:
    if hfst.HfstTransducer.is_implementation_type_available(impl_type):
        print(hfst.fst_type_to_string(impl_type))
        hfst.set_default_fst_type(impl_type)
        defs = hfst.compile_pmatch_file('streets.txt')
        cont = hfst.PmatchContainer(defs)
        assert cont.match("Je marche seul dans l'avenue des Ternes.") == \
            "Je marche seul dans l'<FrenchStreetName>avenue des Ternes</FrenchStreetName>."

# Compiling a nonexistent file must raise IOError.
nonexistent_file = 'foofoofoofoofoofoofoofoofoofoofoofoo'
assert not os.path.isfile(nonexistent_file)
try:
    hfst.compile_pmatch_file(nonexistent_file)
    assert False  # unreachable: the compile above must have raised
except IOError:  # `as e` dropped — the bound exception object was unused
    pass