예제 #1
0
    def __init__(self, INFILE, OUTFILE, labels):
        """Store the I/O handles and load the per-language resources.

        All resource paths are relative to the current working directory.

        Args:
            INFILE: stored unchanged on ``self.INFILE``.
            OUTFILE: stored unchanged on ``self.OUTFILE``.
            labels: ordered sequence of language tags. ``self.tree``,
                ``self.blm_wp`` and ``self.blm_sp`` are filled in this
                order, and ``self.tag_dct`` maps each tag to its index.
        """
        self.labels = labels
        self.INFILE = INFILE
        self.OUTFILE = OUTFILE
        self.wxp = wxilp(order="wx2utf")
        self.tag_dct = {tag: i for i, tag in enumerate(labels)}
        self.tree = []
        self.queue = []
        self.blm_wp = []
        self.blm_sp = []

        # Load decision trees, one slot per label so indices line up with
        # ``self.tag_dct``.
        for tag in self.labels:
            if tag == "eng":
                # No eng-eng tree is loaded; "_" only keeps the slot occupied.
                self.tree.append("_")
                continue
            # "kan" reads the "mal" decision-tree file.
            tree_tag = "mal" if tag == "kan" else tag
            with open('decision_trees/eng-%s.json' % tree_tag) as fp:
                self.tree.append(json.load(fp))

        # Load language models: two per label (``.tk.blm`` and ``.ts.blm``).
        # Unlike the trees, "kan" is not remapped here.
        for tag in self.labels:
            self.blm_wp.append(
                kenlm.LanguageModel('blm_models/{}.tk.blm'.format(tag)))
            self.blm_sp.append(
                kenlm.LanguageModel('blm_models/{}.ts.blm'.format(tag)))

        # Load the emoticon set; the file is split on tab characters.
        with open('extras/emoticons.txt') as fp:
            self.emoticons = set(fp.read().split('\t'))

        # Matches a leading run of non-alphanumerics, any run of characters
        # other than alphanumerics / "-" / "'", or a trailing run of
        # non-alphanumerics.
        self.reg = re.compile(
            r"(^[^a-zA-Z0-9]+|[^-'a-zA-Z0-9]+|[^a-zA-Z0-9]+$)")
예제 #2
0
    def __init__(self, labels=('hin', 'eng'), transliteration=False):
        """Load the per-language resources that ship next to this module.

        Args:
            labels: ordered iterable of language tags (default
                ``hin``, ``eng``). ``self.tree``, ``self.blm_wp`` and
                ``self.blm_sp`` are filled in this order, and
                ``self.tag_dct`` maps each tag to its index.
            transliteration: stored on ``self.flag``; how it is used is
                not visible from this method.
        """
        self.flag = transliteration
        # Take a private copy: the loaded resources are index-aligned with
        # the labels, so the list must not be shared with the caller or
        # with a default shared across instances.
        self.labels = list(labels)
        self.wxp = wxilp(order="wx2utf")
        # Directory containing this module. os.path.dirname also works
        # with Windows path separators, unlike splitting on '/'.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        self.tag_dct = {tag: i for i, tag in enumerate(self.labels)}
        self.tree = []
        self.queue = []
        self.blm_wp = []
        self.blm_sp = []

        # Load decision trees, one slot per label.
        for tag in self.labels:
            if tag == "eng":
                # No eng-eng tree is loaded; "_" only keeps the slot occupied.
                self.tree.append("_")
                continue
            # "kan" reads the "mal" decision-tree file.
            tree_tag = "mal" if tag == "kan" else tag
            tree_path = os.path.join(
                base_dir, 'decision_trees', 'eng-%s.json' % tree_tag)
            with open(tree_path) as fp:
                self.tree.append(json.load(fp))

        # Load language models: two per label (``.tk.blm`` and ``.ts.blm``).
        # Unlike the trees, "kan" is not remapped here.
        model_dir = os.path.join(base_dir, 'blm_models')
        for tag in self.labels:
            self.blm_wp.append(kenlm.LanguageModel(
                os.path.join(model_dir, '{}.tk.blm'.format(tag))))
            self.blm_sp.append(kenlm.LanguageModel(
                os.path.join(model_dir, '{}.ts.blm'.format(tag))))

        # Load the emoticon set; the file is split on tab characters.
        with open(os.path.join(base_dir, 'extras', 'emoticons.txt')) as fp:
            self.emoticons = set(fp.read().split('\t'))

        # Matches a leading run of non-alphanumerics, any run of characters
        # other than alphanumerics / "-" / "'", or a trailing run of
        # non-alphanumerics.
        self.reg = re.compile(
            r"(^[^a-zA-Z0-9]+|[^-'a-zA-Z0-9]+|[^a-zA-Z0-9]+$)")
예제 #3
0
파일: litran.py 프로젝트: kush789/litcm
    def __init__(self, labels=('hin', 'eng'), transliteration=False):
        """Set up the object and load its bundled language resources.

        Resources are read from the directory that contains this module.

        Args:
            labels: ordered iterable of language tags (default
                ``hin``, ``eng``). The decision-tree and language-model
                lists are filled in this order; ``self.tag_dct`` maps each
                tag to its position.
            transliteration: stored on ``self.flag``; its effect is not
                visible from this method.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))

        def resource(*parts):
            # Build a path under the module directory with the platform's
            # separator instead of hard-coding '/'.
            return os.path.join(module_dir, *parts)

        self.flag = transliteration
        # Copy the labels so neither the caller's sequence nor a shared
        # default is aliased by this instance.
        self.labels = list(labels)
        self.wxp = wxilp(order="wx2utf")
        self.tag_dct = {tag: idx for idx, tag in enumerate(self.labels)}
        self.tree, self.queue, self.blm_wp, self.blm_sp = [], [], [], []

        # Load decision trees; every label gets exactly one slot.
        for tag in self.labels:
            if tag == "eng":
                # No eng-eng tree is loaded; "_" only keeps the slot occupied.
                self.tree.append("_")
            else:
                # "kan" reads the "mal" decision-tree file.
                tree_tag = "mal" if tag == "kan" else tag
                tree_file = resource('decision_trees',
                                     'eng-%s.json' % tree_tag)
                with open(tree_file) as fp:
                    self.tree.append(json.load(fp))

        # Load language models: a ``.tk.blm`` and a ``.ts.blm`` per label.
        # "kan" is used as-is here, unlike the decision trees.
        for tag in self.labels:
            tk_model = resource('blm_models', '{}.tk.blm'.format(tag))
            ts_model = resource('blm_models', '{}.ts.blm'.format(tag))
            self.blm_wp.append(kenlm.LanguageModel(tk_model))
            self.blm_sp.append(kenlm.LanguageModel(ts_model))

        # Load the emoticon set; entries are separated by tab characters.
        with open(resource('extras', 'emoticons.txt')) as fp:
            self.emoticons = set(fp.read().split('\t'))

        # Matches a leading run of non-alphanumerics, any run of characters
        # other than alphanumerics / "-" / "'", or a trailing run of
        # non-alphanumerics.
        self.reg = re.compile(
            r"(^[^a-zA-Z0-9]+|[^-'a-zA-Z0-9]+|[^a-zA-Z0-9]+$)")
예제 #4
0
 def __init__(self, order="wx2utf", format_="text", lang="hin"):
     """Record the options and pick the conversion callable.

     Args:
         order: passed to ``wxilp``; ``"wx2utf"`` selects the converter's
             ``wx2utf`` method, any other value selects ``utf2wx``.
         format_: stored on ``self.format_``.
         lang: stored on ``self.lang`` and passed to ``wxilp``.
     """
     self.format_ = format_
     self.lang = lang
     converter = wxilp(self.lang, order)
     if order == "wx2utf":
         self.transform = converter.wx2utf
     else:
         self.transform = converter.utf2wx