def __init__(self, foldtitle=True, ignorecase=False, accelchars="", termlength=3,
             sourcelanguage="en", invert=False, stopfile=None):
    """Construct a terminology extractor.

    :param foldtitle: fold "Title Case" words to lowercase
    :param ignorecase: lowercase all extracted terms
    :param accelchars: accelerator characters to ignore when matching
    :param termlength: maximum number of words per extracted term
    :param sourcelanguage: source language code; used to locate the default
        per-language stopword list ('stoplist-<lang>')
    :param invert: swap the source and target languages for terminology
    :param stopfile: path to the stopword (term exclusion) file; when None,
        the per-language default is looked up via file_discovery
    """
    self.foldtitle = foldtitle
    self.ignorecase = ignorecase
    self.accelchars = accelchars
    self.termlength = termlength
    self.sourcelanguage = sourcelanguage
    self.invert = invert

    self.stopwords = {}
    self.stoprelist = []
    self.stopfoldtitle = True
    self.stopignorecase = False
    if stopfile is None:
        try:
            stopfile = file_discovery.get_abs_data_filename('stoplist-%s' % self.sourcelanguage)
        except Exception:
            # Best-effort lookup of the default stoplist; narrowed from a
            # bare "except:" which also swallowed SystemExit and
            # KeyboardInterrupt.
            pass
    self.stopfile = stopfile
    self.parse_stopword_file()

    # handles c-format and python-format
    self.formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
    # handles XML/HTML elements (<foo>text</foo> => text)
    self.xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
    # handles XML/HTML entities (named and numeric character references)
    # FIX: the class was [\w.-:], where ".-:" is a character RANGE from '.'
    # (0x2E) to ':' (0x3A) and so also matched '/'; the dash is moved to the
    # end to make it a literal '-' as intended.
    self.xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.:-]*);",
                                flags=re.UNICODE | re.IGNORECASE)

    self.units = 0
    self.glossary = {}
def find_langmodel_files():
    """Return data_files entries for the n-gram language model files.

    Produces a single ``(target_dir, [files])`` tuple, or an empty list
    when no 'langmodels' data directory exists.
    """
    from translate.misc.file_discovery import get_abs_data_filename
    model_dir = path.abspath(get_abs_data_filename('langmodels'))
    if path.isdir(model_dir):
        target = path.join(TARGET_DATA_DIR, 'langmodels')
        return [(target, glob(path.join(model_dir, '*.*')))]
    return []
def main():
    """Command-line entry point: build the option parser and run extraction.

    FIX: the ``type="int"`` options previously used *string* defaults
    (e.g. ``default="3"``); optparse does not pass defaults through the
    type converter, so unset options yielded strings where ints were
    expected.  All integer options now default to real ints.
    """
    formats = {"po": ("po", None), "pot": ("pot", None), None: ("po", None)}
    parser = TerminologyOptionParser(formats)
    parser.add_option(
        "-u", "--update", type="string", dest="update", metavar="UPDATEFILE",
        help="update terminology in UPDATEFILE")
    parser.add_option(
        "-S", "--stopword-list", type="string", metavar="STOPFILE", dest="stopfile",
        help="read stopword (term exclusion) list from STOPFILE (default %s)" %
             file_discovery.get_abs_data_filename('stoplist-en'))
    parser.set_defaults(foldtitle=True, ignorecase=False)
    parser.add_option(
        "-F", "--fold-titlecase", callback=fold_case_option, action="callback",
        help="fold \"Title Case\" to lowercase (default)")
    parser.add_option(
        "-C", "--preserve-case", callback=preserve_case_option, action="callback",
        help="preserve all uppercase/lowercase")
    parser.add_option(
        "-I", "--ignore-case", dest="ignorecase", action="store_true",
        help="make all terms lowercase")
    parser.add_option(
        "", "--accelerator", dest="accelchars", default="", metavar="ACCELERATORS",
        help="ignore the given accelerator characters when matching")
    parser.add_option(
        "-t", "--term-words", type="int", dest="termlength", default=3,
        help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    parser.add_option(
        "", "--nonstop-needed", type="int", dest="nonstopmin", default=1,
        help="omit terms with less than MIN nonstop words (default 1)", metavar="MIN")
    parser.add_option(
        "", "--inputs-needed", type="int", dest="inputmin",
        help="omit terms appearing in less than MIN input files (default 2, or 1 if only one input file)",
        metavar="MIN")
    parser.add_option(
        "", "--fullmsg-needed", type="int", dest="fullmsgmin", default=1,
        help="omit full message terms appearing in less than MIN different messages (default 1)",
        metavar="MIN")
    parser.add_option(
        "", "--substr-needed", type="int", dest="substrmin", default=2,
        help="omit substring-only terms appearing in less than MIN different messages (default 2)",
        metavar="MIN")
    parser.add_option(
        "", "--locs-needed", type="int", dest="locmin", default=2,
        help="omit terms appearing in less than MIN different original source files (default 2)",
        metavar="MIN")
    parser.add_option(
        "", "--sort", dest="sortorders", action="append", type="choice",
        choices=TerminologyExtractor.sortorders_default, metavar="ORDER",
        help="output sort order(s): %s (may repeat option, default is all in above order)" %
             ', '.join(TerminologyExtractor.sortorders_default))
    parser.add_option(
        "", "--source-language", dest="sourcelanguage", default="en",
        help="the source language code (default 'en')", metavar="LANG")
    parser.add_option(
        "-v", "--invert", dest="invert", action="store_true", default=False,
        help="invert the source and target languages for terminology")
    parser.set_usage()
    parser.description = __doc__
    parser.run()
def get_abs_data_filename(path_parts, basedirs=None):
    """Get the absolute path to the given file- or directory name in
    Virtaal's data directory.

    @type  path_parts: list
    @param path_parts: The path parts that can be joined by os.path.join().
    """
    if basedirs is None:
        basedirs = []
    # FIX: build a new list rather than "basedirs += [...]" — augmented
    # assignment extends a caller-supplied list in place, mutating the
    # caller's object on every call.
    basedirs = basedirs + [
        os.path.join(os.path.dirname(unicode(__file__, sys.getfilesystemencoding())),
                     os.path.pardir),
    ]
    return file_discovery.get_abs_data_filename(path_parts, basedirs=basedirs)
def get_abs_data_filename(path_parts, basedirs=None):
    """Get the absolute path to the given file- or directory name in
    Virtaal's data directory.

    @type  path_parts: list
    @param path_parts: The path parts that can be joined by os.path.join().
    """
    if basedirs is None:
        basedirs = []
    # FIX: build a new list rather than "basedirs += [...]" — augmented
    # assignment extends a caller-supplied list in place, mutating the
    # caller's object on every call.
    basedirs = basedirs + [
        os.path.join(
            os.path.dirname(unicode(__file__, sys.getfilesystemencoding())),
            os.path.pardir),
    ]
    return file_discovery.get_abs_data_filename(path_parts, basedirs=basedirs)
class LanguageIdentifier(object):
    """Identify the (source/target) language of text using n-gram models."""

    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""

    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to ``MODEL_DIR``).
    """

    def __init__(self, model_dir=None, conf_file=None):
        """Load the language-code mapping and the n-gram models.

        :param model_dir: directory holding the model files (default
            ``MODEL_DIR``)
        :param conf_file: name of the name->code mapping file, relative to
            the model directory (default ``CONF_FILE``)
        :raises ValueError: if the model directory or the config file does
            not exist
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file))

        self._lang_codes = {}
        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in
        the configuration file."""
        # FIX: close the config file deterministically — the original
        # open(conf_file).read() leaked the handle.
        with open(conf_file) as fp:
            lines = fp.read().splitlines()
        for line in lines:
            parts = line.split()
            if not parts or line.startswith('#'):
                continue  # Skip comment- and empty lines
            if len(parts) < 2:
                continue  # Malformed line: need "<model name> <lang code>"
            lname, lcode = parts[0], parts[1]

            # Make sure lname is not prefixed by directory names
            lname = path.split(lname)[-1]
            if extsep in lname:
                lname = lname[:lname.rindex(extsep)]  # Remove extension if present

            # Remove trailing '[_-]-utf8' from code
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith('-') or lcode.endswith('_'):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string."""
        if not text:
            return None
        result = self.ngram.classify(text)
        # Map the model's name back to a language code where known.
        if result in self._lang_codes:
            result = self._lang_codes[result]
        return result

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
        units.

        :type instore: ``TranslationStore`` or list or tuple of
            ``TranslationUnit``s.
        :param instore: The translation store to extract source text from.
        :returns: The identified language's code or ``None`` if the language
            could not be identified."""
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None
        # Sample only the first 50 translatable units for speed.
        text = u' '.join(unit.source for unit in instore[:50]
                         if unit.istranslatable() and unit.source)
        if not text:
            return None
        return self.identify_lang(text)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
        units.

        :type instore: ``TranslationStore`` or list or tuple of
            ``TranslationUnit``s.
        :param instore: The translation store to extract target text from.
        :returns: The identified language's code or ``None`` if the language
            could not be identified."""
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None
        # Sample only the first 200 translatable units for speed.
        text = u' '.join(unit.target for unit in instore[:200]
                         if unit.istranslatable() and unit.target)
        if not text:
            return None
        return self.identify_lang(text)
# NOTE(review): this chunk starts mid-method — the first statements are the
# tail of a model-loading loop (presumably "for fname in glob(...)" inside the
# enclosing class's __init__, cf. the variant that reads "for fname in
# glob.glob(path.normcase(folder))"); indentation reconstructed — confirm
# against the full file.
lang = path.split(fname)[-1][:-size]  # language name = model file name minus extension
n = _NGram()
file = open(fname, 'r')
for line in file.readlines():
    n.addText(line)  # accumulate n-gram counts from each line of the model text
file.close()
n.normalise()
self.ngrams[lang] = n

def save(self, folder, ext='.lm'):
    # Persist every held model as <folder>/<lang><ext>, writing one
    # "<ngram>\t <score>" line per entry (sorted_by_score yields (score, ngram)).
    for lang in self.ngrams.keys():
        fname = path.join(folder, lang + ext)
        file = open(fname, 'w')
        for v, k in self.ngrams[lang].sorted_by_score():
            file.write("%s\t %d\n" % (k, v))
        file.close()

if __name__ == '__main__':
    import sys
    # Should you want to generate your own .lm files
    #conf = Generate('/tmp')
    #conf.save('/tmp')
    # Classify one line read from stdin against the shipped language models.
    # (Python 2 print statement — this module predates Python 3.)
    text = sys.stdin.readline()
    from translate.misc.file_discovery import get_abs_data_filename
    l = NGram(get_abs_data_filename('langmodels'))
    print l.classify(text)
# NOTE(review): this chunk starts mid-method — the first statements are the
# tail of a model-loading loop body; "fname", "lang" and "self" come from the
# enclosing (not visible) scope. Indentation reconstructed — confirm against
# the full file.
n = _NGram()
file = open(fname, 'r')
for line in file.readlines():
    n.addText(line)  # accumulate n-gram counts from each line of the model text
file.close()
n.normalise()
self.ngrams[lang] = n

def save(self, folder, ext='.lm'):
    # Persist every held model as <folder>/<lang><ext>, writing one
    # "<ngram>\t <score>" line per entry (sorted_by_score yields (score, ngram)).
    for lang in self.ngrams.keys():
        fname = path.join(folder, lang + ext)
        file = open(fname, 'w')
        for v, k in self.ngrams[lang].sorted_by_score():
            file.write("%s\t %d\n" % (k, v))
        file.close()

if __name__ == '__main__':
    import sys
    # Should you want to generate your own .lm files
    #conf = Generate('/tmp')
    #conf.save('/tmp')
    # Classify one line read from stdin against the shipped language models.
    # (Python 2 print statement — this module predates Python 3.)
    text = sys.stdin.readline()
    from translate.misc.file_discovery import get_abs_data_filename
    l = NGram(get_abs_data_filename('langmodels'))
    print l.classify(text)
# NOTE(review): this chunk starts mid-method — the loop below belongs to the
# enclosing loader ("folder", "size" and "self" come from the surrounding
# scope, which is not visible here); indentation reconstructed — confirm
# against the full file.
for fname in glob.glob(path.normcase(folder)):
    lang = path.split(fname)[-1][:-size]  # language name = model file name minus extension
    n = _NGram()
    with open(fname, encoding="utf-8") as fp:
        for line in fp:
            n.addText(line)  # accumulate n-gram counts per line of model text
    n.normalise()
    self.ngrams[lang] = n

def save(self, folder, ext=".lm"):
    # Persist every held model as <folder>/<lang><ext>, writing one
    # "<ngram>\t <score>" line per entry (sorted_by_score yields (score, ngram)).
    for lang in self.ngrams.keys():
        fname = path.join(folder, lang + ext)
        with open(fname, mode="w", encoding="utf-8") as fp:
            for v, k in self.ngrams[lang].sorted_by_score():
                fp.write("%s\t %d\n" % (k, v))

if __name__ == "__main__":
    # Should you want to generate your own .lm files
    # conf = Generate('/tmp')
    # conf.save('/tmp')
    # Classify one line read from stdin against the shipped language models.
    text = sys.stdin.readline()
    from translate.misc.file_discovery import get_abs_data_filename
    lm = NGram(get_abs_data_filename("langmodels"))
    print(lm.classify(text))