def __init__(self, INFILE, OUTFILE, labels):
    """Store the I/O handles and load every per-label resource.

    Resource paths are relative to the current working directory.

    Args:
        INFILE: Stored unchanged as ``self.INFILE``; not read here.
        OUTFILE: Stored unchanged as ``self.OUTFILE``; not written here.
        labels: Iterable of language tags. Its iteration order defines the
            indices in ``self.tag_dct`` and the order of ``self.tree``,
            ``self.blm_wp`` and ``self.blm_sp``.
    """
    self.labels = labels
    self.INFILE = INFILE
    self.OUTFILE = OUTFILE
    self.wxp = wxilp(order="wx2utf")

    # Map each label to its position in ``labels``.
    self.tag_dct = {}
    for position, label in enumerate(labels):
        self.tag_dct[label] = position

    self.tree = []
    self.queue = []
    self.blm_wp = []
    self.blm_sp = []

    # Decision trees: "eng" gets the "_" placeholder instead of a file,
    # and "kan" reads the tree stored under the "mal" name.
    for label in self.labels:
        if label == "eng":
            self.tree.append("_")
        else:
            tree_tag = "mal" if label == "kan" else label
            with open('decision_trees/eng-%s.json' % tree_tag) as tree_file:
                self.tree.append(json.load(tree_file))

    # Language models: one ".tk.blm" and one ".ts.blm" model per label.
    for label in self.labels:
        wp_model = kenlm.LanguageModel('blm_models/{}.tk.blm'.format(label))
        self.blm_wp.append(wp_model)
        sp_model = kenlm.LanguageModel('blm_models/{}.ts.blm'.format(label))
        self.blm_sp.append(sp_model)

    # Emoticons: the file content is split on tab characters.
    with open('extras/emoticons.txt') as emoticon_file:
        emoticon_text = emoticon_file.read()
    self.emoticons = set(emoticon_text.split('\t'))

    # Matches a leading or trailing run of non-alphanumerics, or any run of
    # characters other than alphanumerics, "-" and "'".
    self.reg = re.compile(r"(^[^a-zA-Z0-9]+|[^-'a-zA-Z0-9]+|[^a-zA-Z0-9]+$)")
def __init__(self, labels=None, transliteration=False):
    """Load every per-label resource from the directory of this module.

    Args:
        labels: Iterable of language tags. Defaults to ``['hin', 'eng']``
            when omitted or ``None``. Its iteration order defines the
            indices in ``self.tag_dct`` and the order of ``self.tree``,
            ``self.blm_wp`` and ``self.blm_sp``.
        transliteration: Stored unchanged as ``self.flag``.

    Raises:
        IOError/OSError: If a decision-tree file or the emoticon file
            cannot be opened.
    """
    if labels is None:
        # Build the default per call: the list is stored on the instance,
        # so a shared default would leak mutations between instances.
        labels = ['hin', 'eng']
    self.flag = transliteration
    self.labels = labels
    self.wxp = wxilp(order="wx2utf")

    # Use os.path rather than splitting on '/', so the module directory is
    # found correctly whatever separator abspath() returns.
    path = os.path.dirname(os.path.abspath(__file__))

    self.tag_dct = {tag: i for i, tag in enumerate(labels)}
    self.tree, self.queue, self.blm_wp, self.blm_sp = [], [], [], []

    # load decision trees
    for tag in self.labels:
        if tag == "eng":
            # "eng" has no tree file; keep a placeholder so indices line up.
            self.tree.append("_")
            continue
        if tag == "kan":
            # "kan" reads the tree stored under the "mal" name.
            tag = "mal"
        tree_file = os.path.join(path, 'decision_trees', 'eng-%s.json' % tag)
        with open(tree_file) as fp:
            self.tree.append(json.load(fp))

    # load language-models
    blm_dir = os.path.join(path, 'blm_models')
    for tag in self.labels:
        self.blm_wp.append(
            kenlm.LanguageModel(os.path.join(blm_dir, '{}.tk.blm'.format(tag))))
        self.blm_sp.append(
            kenlm.LanguageModel(os.path.join(blm_dir, '{}.ts.blm'.format(tag))))

    # load emoticon set (file content is split on tab characters)
    with open(os.path.join(path, 'extras', 'emoticons.txt')) as fp:
        self.emoticons = set(fp.read().split('\t'))

    # Matches a leading or trailing run of non-alphanumerics, or any run of
    # characters other than alphanumerics, "-" and "'".
    self.reg = re.compile(
        r"(^[^a-zA-Z0-9]+|[^-'a-zA-Z0-9]+|[^a-zA-Z0-9]+$)")
def __init__(self, labels=None, transliteration=False):
    """Load every per-label resource from the directory of this module.

    Args:
        labels: Iterable of language tags. Defaults to ``['hin', 'eng']``
            when omitted or ``None``. Its iteration order defines the
            indices in ``self.tag_dct`` and the order of ``self.tree``,
            ``self.blm_wp`` and ``self.blm_sp``.
        transliteration: Stored unchanged as ``self.flag``.

    Raises:
        IOError/OSError: If a decision-tree file or the emoticon file
            cannot be opened.
    """
    if labels is None:
        # Build the default per call: the list is stored on the instance,
        # so a shared default would leak mutations between instances.
        labels = ['hin', 'eng']
    self.flag = transliteration
    self.labels = labels
    self.wxp = wxilp(order="wx2utf")

    # Use os.path rather than splitting on '/', so the module directory is
    # found correctly whatever separator abspath() returns.
    path = os.path.dirname(os.path.abspath(__file__))

    self.tag_dct = {tag: i for i, tag in enumerate(labels)}
    self.tree, self.queue, self.blm_wp, self.blm_sp = [], [], [], []

    # load decision trees
    for tag in self.labels:
        if tag == "eng":
            # "eng" has no tree file; keep a placeholder so indices line up.
            self.tree.append("_")
            continue
        if tag == "kan":
            # "kan" reads the tree stored under the "mal" name.
            tag = "mal"
        tree_file = os.path.join(path, 'decision_trees', 'eng-%s.json' % tag)
        with open(tree_file) as fp:
            self.tree.append(json.load(fp))

    # load language-models
    blm_dir = os.path.join(path, 'blm_models')
    for tag in self.labels:
        self.blm_wp.append(
            kenlm.LanguageModel(os.path.join(blm_dir, '{}.tk.blm'.format(tag))))
        self.blm_sp.append(
            kenlm.LanguageModel(os.path.join(blm_dir, '{}.ts.blm'.format(tag))))

    # load emoticon set (file content is split on tab characters)
    with open(os.path.join(path, 'extras', 'emoticons.txt')) as fp:
        self.emoticons = set(fp.read().split('\t'))

    # Matches a leading or trailing run of non-alphanumerics, or any run of
    # characters other than alphanumerics, "-" and "'".
    self.reg = re.compile(
        r"(^[^a-zA-Z0-9]+|[^-'a-zA-Z0-9]+|[^a-zA-Z0-9]+$)")
def __init__(self, order="wx2utf", format_="text", lang="hin"):
    """Record the settings and pick the conversion callable.

    Args:
        order: Passed to ``wxilp``. The value ``"wx2utf"`` selects the
            converter's ``wx2utf`` method; any other value selects
            ``utf2wx``.
        format_: Stored unchanged as ``self.format_``.
        lang: Stored as ``self.lang`` and passed to ``wxilp``.
    """
    self.lang = lang
    self.format_ = format_
    converter = wxilp(self.lang, order)
    # Only the selected method is looked up on the converter.
    if order == "wx2utf":
        self.transform = converter.wx2utf
    else:
        self.transform = converter.utf2wx