def pretrain_model(self, space=' '):
    """Write the training corpus file and return its path.

    If ``self.textfile`` is not set yet, it is first obtained from
    ``self.getTexts(self.fname + '.txt', space=space)``.  Every line of
    the text file is then passed through ``tran2simple``,
    ``seperate_word`` and ``remove_word``; each non-empty result is
    written as one line of ``self.fname + '.traincorpus'``.

    Args:
        space: Separator forwarded to ``getTexts`` when the text file
            still has to be produced.  Defaults to a single space.

    Returns:
        The path of the training corpus file (``self.traincorpusfname``).
    """
    # Only build the text file when none has been assigned yet.
    if self.textfile is None:
        # Forward the caller's separator instead of a hard-coded ' '.
        self.textfile = self.getTexts(self.fname + '.txt', space=space)
    self.traincorpusfname = self.fname + '.traincorpus'

    # Number of articles written to the training corpus so far.
    i = 0
    # NOTE(review): no explicit encoding is passed to open(); confirm the
    # platform default matches the corpus encoding.
    with open(self.textfile, 'r') as icorpus, \
            open(self.traincorpusfname, 'w') as ocorpus:
        # Iterate the file lazily rather than loading every line at once.
        for line in icorpus:
            # Convert traditional Chinese to simplified.
            text = tran2simple(line)
            # Separate words using jieba.
            text = seperate_word(text)
            # Remove non-Chinese words from the corpus.
            text = remove_word(line=text, encoding='utf8')
            if text:
                ocorpus.write(text + '\n')
                i = i + 1
                # Periodic progress report every tCorpus.SEGSIZE articles.
                if i % tCorpus.SEGSIZE == 0:
                    self.logger.info('PreVecModel: ' + str(i) + ' articles')
    # Final total once the whole text file has been processed.
    self.logger.info('PreVecModel:' + str(i) + ' articles')
    return self.traincorpusfname
def pretrain_model(self, space=' '):
    """Generate the training corpus from the text file; return its path.

    When ``self.textfile`` is ``None`` it is created through
    ``self.getTexts(self.fname + '.txt', space=space)``.  Each line of
    that file goes through ``tran2simple``, ``seperate_word`` and
    ``remove_word``, and every non-empty result becomes one line of
    ``self.fname + '.traincorpus'``.

    Args:
        space: Separator passed on to ``getTexts`` if the text file has
            to be generated.  Defaults to a single space.

    Returns:
        ``self.traincorpusfname``, the path of the written corpus file.
    """
    # Build the text file only if it has not been provided already.
    if self.textfile is None:
        # Honour the ``space`` argument rather than always passing ' '.
        self.textfile = self.getTexts(self.fname + '.txt', space=space)
    self.traincorpusfname = self.fname + '.traincorpus'

    # Counter of articles written so far.
    i = 0
    # NOTE(review): open() relies on the default encoding; confirm it
    # matches the corpus files.
    with open(self.textfile, 'r') as icorpus, \
            open(self.traincorpusfname, 'w') as ocorpus:
        # Stream the input line by line instead of using readlines().
        for line in icorpus:
            # Traditional -> simplified Chinese.
            text = tran2simple(line)
            # Word segmentation (jieba).
            text = seperate_word(text)
            # Drop non-Chinese words.
            text = remove_word(line=text, encoding='utf8')
            if text:
                ocorpus.write(text + '\n')
                i += 1
                # Log progress once per tCorpus.SEGSIZE written articles.
                if i % tCorpus.SEGSIZE == 0:
                    self.logger.info('PreVecModel: ' + str(i) + ' articles')
    # Log the final article count.
    self.logger.info('PreVecModel:' + str(i) + ' articles')
    return self.traincorpusfname
def testtrans2simplefile(self):
    """Convert the corpus file line by line and compare it with the target.

    Each line of ``self.corpus`` is passed through ``tran2simple`` and
    written, followed by a newline, to ``self.corpus + '.simple'``.  The
    result must match the reference file
    ``'transchinese.txt.simpletarget'`` (resolved against the current
    working directory).
    """
    import filecmp

    converted_path = self.corpus + '.simple'
    with open(self.corpus, 'r') as icorpus, \
            open(converted_path, 'w') as ocorpus:
        # Stream the input instead of materialising it with readlines().
        for line in icorpus:
            ocorpus.write(tran2simple(line) + '\n')

    # Compare only after the ``with`` block has closed (and flushed) the
    # output.  shallow=False forces a real content comparison; the default
    # may accept the files based on their os.stat() signatures alone.
    assert filecmp.cmp('transchinese.txt.simpletarget', converted_path,
                       shallow=False)
def testtrans2simple(self):
    """Check ``tran2simple`` on a list of two traditional-Chinese strings.

    The expected values are written as byte escapes and compared to the
    converter's return value with ``==``.
    NOTE(review): the escaped literals read as UTF-8 byte sequences, so the
    comparison presumably targets Python 2 byte strings -- confirm.
    """
    traditional = [
        '開放中文轉換,是一個致力於中文簡繁轉換的項目,提供高質量詞庫和函數庫',
        '的項目,提供高質量詞庫和函數',
    ]
    # Expected output, one entry per input string, in the same order.
    expected = [
        '\xe5\xbc\x80\xe6\x94\xbe\xe4\xb8\xad\xe6\x96\x87\xe8\xbd\xac\xe6\x8d\xa2\xef\xbc\x8c\xe6\x98\xaf\xe4\xb8\x80\xe4\xb8\xaa\xe8\x87\xb4\xe5\x8a\x9b\xe4\xba\x8e\xe4\xb8\xad\xe6\x96\x87\xe7\xae\x80\xe7\xb9\x81\xe8\xbd\xac\xe6\x8d\xa2\xe7\x9a\x84\xe9\xa1\xb9\xe7\x9b\xae\xef\xbc\x8c\xe6\x8f\x90\xe4\xbe\x9b\xe9\xab\x98\xe8\xb4\xa8\xe9\x87\x8f\xe8\xaf\x8d\xe5\xba\x93\xe5\x92\x8c\xe5\x87\xbd\xe6\x95\xb0\xe5\xba\x93',
        '\xe7\x9a\x84\xe9\xa1\xb9\xe7\x9b\xae\xef\xbc\x8c\xe6\x8f\x90\xe4\xbe\x9b\xe9\xab\x98\xe8\xb4\xa8\xe9\x87\x8f\xe8\xaf\x8d\xe5\xba\x93\xe5\x92\x8c\xe5\x87\xbd\xe6\x95\xb0',
    ]
    converted = tran2simple(traditional)
    assert converted == expected
def testtrans2simple(self):
    """Verify that ``tran2simple`` converts a list of strings as expected.

    Two traditional-Chinese strings are passed in as one list; the return
    value must equal the list of expected byte-escaped literals.
    NOTE(review): the expected literals look like UTF-8 byte sequences,
    which suggests Python 2 string semantics -- confirm before porting.
    """
    # First expected entry corresponds to the long input sentence.
    expected_full = '\xe5\xbc\x80\xe6\x94\xbe\xe4\xb8\xad\xe6\x96\x87\xe8\xbd\xac\xe6\x8d\xa2\xef\xbc\x8c\xe6\x98\xaf\xe4\xb8\x80\xe4\xb8\xaa\xe8\x87\xb4\xe5\x8a\x9b\xe4\xba\x8e\xe4\xb8\xad\xe6\x96\x87\xe7\xae\x80\xe7\xb9\x81\xe8\xbd\xac\xe6\x8d\xa2\xe7\x9a\x84\xe9\xa1\xb9\xe7\x9b\xae\xef\xbc\x8c\xe6\x8f\x90\xe4\xbe\x9b\xe9\xab\x98\xe8\xb4\xa8\xe9\x87\x8f\xe8\xaf\x8d\xe5\xba\x93\xe5\x92\x8c\xe5\x87\xbd\xe6\x95\xb0\xe5\xba\x93'
    # Second expected entry corresponds to the shorter input fragment.
    expected_fragment = '\xe7\x9a\x84\xe9\xa1\xb9\xe7\x9b\xae\xef\xbc\x8c\xe6\x8f\x90\xe4\xbe\x9b\xe9\xab\x98\xe8\xb4\xa8\xe9\x87\x8f\xe8\xaf\x8d\xe5\xba\x93\xe5\x92\x8c\xe5\x87\xbd\xe6\x95\xb0'

    source_lines = ['開放中文轉換,是一個致力於中文簡繁轉換的項目,提供高質量詞庫和函數庫',
                    '的項目,提供高質量詞庫和函數']
    result = tran2simple(source_lines)
    assert result == [expected_full, expected_fragment]