Пример #1
0
    def __init__(self):
        """Construct a transliterator object.

        Loads the transducer from the packaged ``hfst.att`` rules file, reads
        the tab-separated ``lexicon`` resource and builds character/context
        statistics over the lexicon words.

        Attributes set:
            transducer: Result of ``hfst.AttReader(...).read()`` on the rules.
            logger: Logger named ``franco_arabic_transliterator``.
            wordlist: Dict mapping the first lexicon column to the integer
                value of the second column.
            counts: ``Counter`` of ``(char, context)`` pairs, where context
                is the 10 characters preceding ``char`` (left-padded with
                ``"_"``) in each word right-padded with ``"$"`` to 20 chars.
            sigma_counts: Sum of all values in ``counts``.

        Raises:
            IndexError: If a non-blank lexicon line has no second
                tab-separated column.
            ValueError: If the second lexicon column is not an integer.
        """
        rules_file_location = pkg_resources.resource_filename(
            "data", "hfst.att")
        # Explicit encoding so loading does not depend on the platform
        # default. NOTE(review): assumes the packaged data files are UTF-8
        # -- confirm.
        with open(rules_file_location, "r", encoding="utf-8") as f:
            self.transducer = hfst.AttReader(f).read()
        self.logger = logging.getLogger("franco_arabic_transliterator")
        # Side effect: configures root logging at DEBUG level.
        logging.basicConfig(level=logging.DEBUG)

        lexicon_location = pkg_resources.resource_filename("data", "lexicon")
        self.wordlist = {}
        with open(lexicon_location, "r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing with IndexError.
                if not line.strip():
                    continue
                fields = line.split("\t")
                self.wordlist[fields[0]] = int(fields[1])

        def find_pairs(word, grams=10, max_len=20):
            """Return (char, preceding-context) pairs for the padded word."""
            # Right-pad short words with "$"; longer words are left as-is
            # because a negative repeat count yields an empty string.
            padded = word + "$" * (max_len - len(word))
            context = ["_"] * grams
            pairs = []
            for char in padded:
                pairs.append((char, "".join(context)))
                # Slide the context window one character to the right.
                context = context[1:] + [char]
            return pairs

        # Each lexicon word contributes once, regardless of its stored count.
        self.counts = Counter(
            pair for word in self.wordlist for pair in find_pairs(word))
        self.sigma_counts = sum(self.counts.values())
        self.sigma_counts = sum(self.counts.values())
    def __init__(self):
        """Construct a transliterator object.

        Loads the transducer from the packaged ``hfst.att`` rules file, reads
        the tab-separated ``lexicon`` resource and builds character/context
        statistics over the lexicon words.

        Attributes set:
            transducer: Result of ``hfst.AttReader(...).read()`` on the rules.
            logger: Logger named ``franco_arabic_transliterator``.
            wordlist: Dict mapping the first lexicon column to the integer
                value of the second column.
            counts: ``Counter`` of ``(char, context)`` pairs, where context
                is the 3 characters preceding ``char`` in the word,
                left-padded with ``'_'``.
            sigma_counts: Sum of all values in ``counts``.

        Raises:
            IndexError: If a non-blank lexicon line has no second
                tab-separated column.
            ValueError: If the second lexicon column is not an integer.
        """
        rules_file_location = pkg_resources.resource_filename(
            'data', 'hfst.att')
        # Explicit encoding so loading does not depend on the platform
        # default. NOTE(review): assumes the packaged data files are UTF-8
        # -- confirm.
        with open(rules_file_location, 'r', encoding='utf-8') as f:
            self.transducer = hfst.AttReader(f).read()
        self.logger = logging.getLogger('franco_arabic_transliterator')
        # Side effect: configures root logging at DEBUG level.
        logging.basicConfig(level=logging.DEBUG)

        lexicon_location = pkg_resources.resource_filename('data', 'lexicon')
        self.wordlist = {}
        with open(lexicon_location, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing with IndexError.
                if not line.strip():
                    continue
                fields = line.split('\t')
                self.wordlist[fields[0]] = int(fields[1])

        def find_pairs(word, grams=3):
            """Return (char, preceding ``grams`` chars) pairs for ``word``."""
            # Prefixing the padding lets every context be taken as a slice:
            # padded[i:i + grams] is exactly the text before word[i].
            padded = '_' * grams + word
            return [(char, padded[i:i + grams])
                    for i, char in enumerate(word)]

        # Each lexicon word contributes once, regardless of its stored count.
        self.counts = Counter(
            pair for word in self.wordlist for pair in find_pairs(word))
        self.sigma_counts = sum(self.counts.values())
Пример #3
0
def get_att_transducer(f):
    """Return the first transducer stored in the AT&T-format file at path *f*.

    Returns ``None`` when the reader yields no transducer, or when the
    content is rejected with ``NotValidAttFormatException``; in the latter
    case the exception's message is written to standard error.
    """
    with open(f, 'r', encoding='utf-8') as att_file:
        try:
            reader = hfst.AttReader(att_file)
            # Only the first transducer is wanted; None if there is none.
            return next(iter(reader), None)
        except hfst.exceptions.NotValidAttFormatException as err:
            print(err.what(), file=sys.stderr)
    return None
Пример #4
0
import hfst

# Case 1: a well-formed file; every transducer in it must be read.
transducers = []

with open('testfile.att', 'r') as f:
    r = hfst.AttReader(f, "<eps>")
    for tr in r:
        transducers.append(tr)

assert f.closed
assert len(transducers) == 4

# Case 2: a file with a malformed line; the transducers before it are read,
# then the reader must report the offending line and its line number.
transducers = []

with open('testfile_fail.att', 'r') as f:
    try:
        r = hfst.AttReader(f, "<eps>")
        for tr in r:
            transducers.append(tr)
    except hfst.exceptions.NotValidAttFormatException as e:
        assert "1      baz    baz      0.3" in e.what()
        assert "line: 11" in e.what()
    else:
        # Without this branch the test passed silently when the reader
        # accepted the malformed file.
        raise AssertionError(
            'expected NotValidAttFormatException for testfile_fail.att')

assert f.closed
assert len(transducers) == 4
Пример #5
0
            impl = hfst.ImplementationType.TROPICAL_OPENFST_TYPE
        elif val == 'foma':
            impl = hfst.ImplementationType.FOMA_TYPE
        else:
            raise RuntimeError('type not recognized: ' + val)
    elif arg == '-e':
        skip_next = True
        epsilonstr = argv[i + 1]
    elif arg == '-i':
        skip_next = True
        inputfilename = argv[i + 1]
    else:
        raise RuntimeError('argument not recognized: ' + arg)

# Convert AT&T-format text into HFST binary transducers: read from the file
# given with -i if there is one, otherwise from standard input.
# NOTE(review): `impl`, set by the option parsing above, is not used in this
# section -- confirm whether the output stream was meant to receive it.
if inputfilename is not None:
    istr = open(inputfilename, 'r')
else:
    istr = stdin

ostr = None
try:
    ostr = hfst.HfstOutputStream()
    att = hfst.AttReader(istr, epsilonstr)

    for tr in att:
        ostr.write(tr)
        # Flush after every transducer, as before.
        ostr.flush()
finally:
    # Close the streams even if reading or writing fails. Standard input is
    # not ours to close, so only a file opened here is closed.
    if inputfilename is not None:
        istr.close()
    if ostr is not None:
        ostr.close()