Example no. 1
0
    def __init__(
        self,
        models: Union[Path, Dict[str, Path]],
        field: str,
        output_field: str = "perplexity",
        newline: str = "\n",
        normalize: bool = True,
        load_method: int = 2,
    ):
        """Set up per-language KenLM models and scoring options.

        ``models`` is either a glob-style ``Path`` (expanded against its
        parent directory, keyed by the filename stem before the first dot)
        or an explicit ``{name: path}`` mapping.
        """
        super().__init__()
        self.field = field
        self.output_field = output_field
        self.newline = newline
        self.normalize = normalize
        self._prefetch: Sequence[str] = []

        # Default load_method is POPULATE, which mmaps the models and
        # populates the pages; that may be suboptimal when the models live
        # on a network disk.
        # TODO: try copying models file, try READ or PARALLEL_READ
        self.lm_config = kenlm.Config()
        self.lm_config.load_method = load_method

        if isinstance(models, Path):
            # Interpret the path as a glob pattern relative to its parent.
            matches = models.parent.glob(models.name)
            self.models = {p.name.split(".")[0]: p for p in matches}
        else:
            self.models = models
            self._prefetch = list(models.keys())

        self.lm: Dict[str, kenlm.Model] = {}
        self.n_lines = 0
Example no. 2
0
 def __init__(self, lang, segmenter):
     """Load the translation model and KenLM language model for ``lang``.

     Parameters:
         lang: language code used to locate the asset files.
         segmenter: tokenizer/segmenter object, stored for later use.
     """
     self.lang = lang
     # Fix: read the pickle in binary mode inside a context manager so the
     # handle is closed deterministically (text-mode pickle.load fails on
     # Python 3 and corrupts binary pickles on Windows).
     with open('assets/wikitweetweb.' + lang + '.tm', 'rb') as tm_file:
         self.tm = pickle.load(tm_file)
     cnf = kenlm.Config()
     cnf.load_method = 0  # 0 == LAZY (mmap without prefetching pages)
     self.lm = kenlm.LanguageModel('assets/wikitweetweb.' + lang + '.bin',
                                   cnf)
     self.segmenter = segmenter
Example no. 3
0
 def __init__(self, name, path, normalize=False, debpe=False):
     """Wrap a KenLM model loaded lazily from ``path``.

     Parameters:
         name: human-readable identifier for this model.
         path: filesystem path to the KenLM model file.
         normalize: flag stored for later use by callers.
         debpe: flag stored for later use by callers.
     """
     self.path = path
     c = kenlm.Config()
     c.load_method = kenlm.LoadMethod.LAZY  # mmap lazily, no page prefetch
     self.model = kenlm.Model(path, c)
     self.name = name
     self.normalize = normalize
     self.debpe = debpe
     # Fix: "Intialized" typo; use lazy %-style args per logging convention.
     logger.info('Initialized %d-gram language model: %s',
                 self.model.order, path)
Example no. 4
0
    def _load_lms(self, char_lm_dir, word_lm_dir):
        """Load one character-level and one word-level ARPA model per label.

        Models are read from ``<dir>/<label>.arpa`` and cached in
        ``self._char_lms`` / ``self._word_lms``, keyed by label.
        """
        config = kenlm.Config()
        config.show_progress = False
        # Silence complaints about the ARPA files while loading.
        config.arpa_complain = kenlm.ARPALoadComplain.NONE

        for label in self._labels:
            arpa_name = '{}.arpa'.format(label)
            self._char_lms[label] = kenlm.Model(
                str(Path(char_lm_dir, arpa_name)), config)
            self._word_lms[label] = kenlm.Model(
                str(Path(word_lm_dir, arpa_name)), config)
Example no. 5
0

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description=
        'Diacritic restoration tool for Slovene, Croatian and Serbian')
    parser.add_argument('lang',
                        help='language of the text',
                        choices=['sl', 'hr', 'sr'])
    parser.add_argument('-l',
                        '--language-model',
                        help='use the language model',
                        action='store_true')
    parser.add_argument('-i',
                        '--index',
                        help='index of the column to be processed',
                        type=int,
                        default=0)
    args = parser.parse_args()
    # Fix: open the pickled lexicon in binary mode within a context manager
    # so the handle is closed (text-mode pickle.load fails on Python 3).
    with open(os.path.join(reldir, 'wikitweetweb.' + args.lang + '.tm'),
              'rb') as tm_file:
        lexicon = pickle.load(tm_file)
    if args.language_model:
        cnf = kenlm.Config()
        cnf.load_method = 0  # 0 == LAZY (mmap without prefetching pages)
        lm = kenlm.LanguageModel(
            os.path.join(reldir, 'wikitweetweb.' + args.lang + '.bin'), cnf)
    else:
        lm = None
    # `reldir` and `read_and_write` are defined elsewhere in this module.
    read_and_write(sys.stdin, args.index - 1, sys.stdout, lm)
Example no. 6
0
#import logging
#requests_log = logging.getLogger("kenlm")
#requests_log.addHandler(logging.NullHandler())
#requests_log.propagate = False
#log = logging.getLogger('kenlm')
#log.setLevel(logging.ERROR)

#import os
#print os.path.dirname(kenlm.__file__)

import time
import sys
#time.sleep(2)

# NOTE(review): Python 2 print statement — this snippet targets Python 2 only.
print kenlm.Config()._c_config

#model_name = 'lm/Christos_Faloutsos.arpa'
from os import listdir
from os.path import isfile, join

# Usage: <script> <test-document> <model-directory>
fname = sys.argv[1]
model_path = sys.argv[2]
# Every plain file directly inside model_path is treated as a model file.
models_files = [f for f in listdir(model_path) if isfile(join(model_path, f))]

#reading testing doc
with open(fname) as f:
    content = f.read()

# Per-model scores for the test document; the loop body is truncated in
# this excerpt, so its contents are not documented here.
model_score_list = dict()
for model_name in models_files: