def __init__(
    self,
    models: Union[Path, Dict[str, Path]],
    field: str,
    output_field: str = "perplexity",
    newline: str = "\n",
    normalize: bool = True,
    load_method: int = 2,
):
    super().__init__()
    self.field = field
    self.output_field = output_field
    self.newline = newline
    self.normalize = normalize
    self._prefetch: Sequence[str] = []
    self.lm_config = kenlm.Config()
    # This is the default setting: POPULATE will mmap the models and
    # populate the pages. Maybe that's not the best way when the models
    # are on a network disk.
    # TODO: try copying the model files; try READ or PARALLEL_READ
    self.lm_config.load_method = load_method

    if isinstance(models, Path):
        self.models = {
            m.name.split(".")[0]: m for m in models.parent.glob(models.name)
        }
    else:
        self.models = models
        self._prefetch = list(models.keys())
    self.lm: Dict[str, kenlm.Model] = {}
    self.n_lines = 0
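For reference, the integer load_method maps onto the kenlm.LoadMethod enum (0 = LAZY, 1 = POPULATE_OR_LAZY, 2 = POPULATE_OR_READ, 3 = READ, 4 = PARALLEL_READ), so the default of 2 is the mmap-and-populate behaviour the comment above describes. A minimal sketch of the same configuration with the explicit enum value; the model path and test sentence are illustrative:

import kenlm

config = kenlm.Config()
# Equivalent to load_method=2 above: mmap the file and populate its pages.
config.load_method = kenlm.LoadMethod.POPULATE_OR_READ
model = kenlm.Model("en.arpa.bin", config)  # illustrative path
print(model.score("this is a test", bos=True, eos=True))  # log10 probability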
def __init__(self, lang, segmenter):
    self.lang = lang
    # Pickle files must be opened in binary mode.
    with open('assets/wikitweetweb.' + lang + '.tm', 'rb') as f:
        self.tm = pickle.load(f)
    cnf = kenlm.Config()
    cnf.load_method = 0  # 0 == kenlm.LoadMethod.LAZY: mmap pages on demand
    self.lm = kenlm.LanguageModel('assets/wikitweetweb.' + lang + '.bin', cnf)
    self.segmenter = segmenter
def __init__(self, name, path, normalize=False, debpe=False):
    self.path = path
    c = kenlm.Config()
    c.load_method = kenlm.LoadMethod.LAZY
    self.model = kenlm.Model(path, c)
    self.name = name
    self.normalize = normalize
    self.debpe = debpe
    logger.info('Initialized ' + str(self.model.order) +
                '-gram language model: ' + path)
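Given the normalize flag above, a plausible use of such a model is to length-normalize the log10 score; kenlm's built-in perplexity performs the equivalent division. A short sketch, assuming an illustrative "model.bin" path:

import kenlm

c = kenlm.Config()
c.load_method = kenlm.LoadMethod.LAZY  # map pages on demand instead of up front
model = kenlm.Model("model.bin", c)  # illustrative path

sent = "a short example sentence"
log10_prob = model.score(sent)                    # scores <s> sent </s>
per_token = log10_prob / (len(sent.split()) + 1)  # +1 for the </s> token
print(per_token, model.perplexity(sent))          # perplexity == 10 ** -per_token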
def _load_lms(self, char_lm_dir, word_lm_dir):
    config = kenlm.Config()
    config.show_progress = False
    config.arpa_complain = kenlm.ARPALoadComplain.NONE
    for label in self._labels:
        char_lm_path = Path(char_lm_dir, '{}.arpa'.format(label))
        word_lm_path = Path(word_lm_dir, '{}.arpa'.format(label))
        self._char_lms[label] = kenlm.Model(str(char_lm_path), config)
        self._word_lms[label] = kenlm.Model(str(word_lm_path), config)
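Once loaded, per-label models like these are typically used to rank labels by likelihood. A sketch of one way to do that with the word-level models; classify is a hypothetical method, not part of the original:

def classify(self, text):
    # Higher log10 score means the label's LM fits the text better.
    return max(self._labels,
               key=lambda label: self._word_lms[label].score(text))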
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Diacritic restoration tool for Slovene, Croatian and Serbian')
    parser.add_argument('lang', help='language of the text',
                        choices=['sl', 'hr', 'sr'])
    parser.add_argument('-l', '--language-model', help='use the language model',
                        action='store_true')
    parser.add_argument('-i', '--index',
                        help='index of the column to be processed',
                        type=int, default=0)
    args = parser.parse_args()

    # Pickle files must be opened in binary mode.
    with open(os.path.join(reldir, 'wikitweetweb.' + args.lang + '.tm'), 'rb') as f:
        lexicon = pickle.load(f)
    if args.language_model:
        cnf = kenlm.Config()
        cnf.load_method = 0  # 0 == kenlm.LoadMethod.LAZY
        lm = kenlm.LanguageModel(
            os.path.join(reldir, 'wikitweetweb.' + args.lang + '.bin'), cnf)
    else:
        lm = None
    read_and_write(sys.stdin, args.index - 1, sys.stdout, lm)
import sys
from os import listdir
from os.path import isfile, join

import kenlm

fname = sys.argv[1]
model_path = sys.argv[2]
models_files = [f for f in listdir(model_path) if isfile(join(model_path, f))]

# Read the test document.
with open(fname) as f:
    content = f.read()

model_score_list = dict()
for model_name in models_files:
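    # Hypothetical completion: the source snippet is truncated at the loop header.
    # Assumes every file under model_path is a KenLM model readable by kenlm.Model.
    config = kenlm.Config()
    config.load_method = kenlm.LoadMethod.LAZY  # avoid eagerly loading every model
    model = kenlm.Model(join(model_path, model_name), config)
    model_score_list[model_name] = model.score(content)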