Example #1
def traditional2simple(file_path, out_path):
    try:
        cc = opencc.OpenCC("t2s")
        with codecs.open(file_path, mode='r', encoding='utf-8') as rf:
            text_content = cc.convert(rf.read())
        with codecs.open(out_path, mode='w', encoding='utf-8') as wf:
            wf.write(text_content)
            return True

    except Exception as e:
        return False
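For orientation, a minimal round-trip sketch of the call pattern the examples on this page share; it assumes the same opencc binding used above, where OpenCC('t2s') and OpenCC('s2t') select the conversion direction:

import opencc

t2s = opencc.OpenCC('t2s')  # Traditional -> Simplified
s2t = opencc.OpenCC('s2t')  # Simplified -> Traditional

print(t2s.convert('漢語'))  # -> 汉语
print(s2t.convert('汉语'))  # -> 漢語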
Example #2
def read_corpus_2(dir_path):
    """读取最近的一个数据集 唐诗和宋诗 """
    sents_src = []
    sents_tgt = []
    tokenizer = Tokenizer(word2idx)
    files = os.listdir(dir_path)  # all file names under the directory

    for file1 in files:  # iterate over the files
       
        if not os.path.isdir(os.path.join(dir_path, file1)):  # only open entries that are files, not sub-directories
            file_path = dir_path + "/" + file1
            print(file_path)
            # data = json.load(file_path)
            with open(file_path) as f :
                poem_list = eval(f.read())
            
            for each_poem in poem_list:
                string_list = each_poem["paragraphs"]
                poem = ""
                for each_s in string_list:
                    poem += each_s

                cc = opencc.OpenCC('t2s')
                poem = cc.convert(poem)

                encode_text = tokenizer.encode(poem)[0]
                if word2idx["[UNK]"] in encode_text:
                    # skip poems that contain [UNK] tokens
                    continue
                title = cc.convert(each_poem["title"])

                if len(title) > 10 or len(title) < 1:
                    # skip poems whose title is longer than 10 characters or empty
                    continue

                if len(poem) == 24 and (poem[5] == "," or poem[5] == "。"):
                    # five-character jueju (quatrain): 4 lines x 5 chars + 4 punctuation marks = 24
                    sents_src.append(title+ "##" + "五言绝句")
                    sents_tgt.append(poem)
                elif len(poem) == 32 and (poem[7] == "," or poem[7] == "。"):
                    # seven-character jueju (quatrain): 4 lines x 7 chars + 4 punctuation marks = 32
                    sents_src.append(title + "##" + "七言绝句")
                    sents_tgt.append(poem)
                elif len(poem) == 48 and (poem[5] == "," or poem[5] == "。"):
                    # five-character lüshi (regulated verse): 8 lines x 5 chars + 8 punctuation marks = 48
                    sents_src.append(title + "##" + "五言律诗")
                    sents_tgt.append(poem)
                elif len(poem) == 64 and (poem[7] == "," or poem[7] == "。"):
                    # seven-character lüshi (regulated verse): 8 lines x 7 chars + 8 punctuation marks = 64
                    sents_src.append(title + "##" + "七言律诗")
                    sents_tgt.append(poem)

    print("第二个诗句数据集共:" + str(len(sents_src)) + "篇")
    return sents_src, sents_tgt
Example #3
def convert(infile: str, outfile: str, cfg: str):
    """read >> convert >> write file
    Args:
        infile (str): input file
        outfile (str): output file
        cfg (str): config
    """
    converter = opencc.OpenCC(cfg)
    with open(infile, "r") as inf, open(outfile, "w+") as outf:
        outf.write("\n".join(converter.convert(line) for line in inf))
    print(f"Convert to {outfile}")
Example #4
 def __init__(self, use_cuda, pre_process=False):
     self.simplified_to_traditional = opencc.OpenCC('s2t')
     self.use_cuda = use_cuda
     self.train_x = None
     self.train_y = None
     self.test_x = None
     self.test_y = None
     self.tag2id = None
     if pre_process:
         self.pre_process()
     else:
         self.load_data()
Example #5
def convert(src_path, dst_path, cfg='s2twp.json'):
    converter = opencc.OpenCC(cfg)
    with open(src_path,
              "r", encoding='utf-8') as src, open(dst_path,
                                                  "w+",
                                                  encoding='utf-8') as dst:
        dst.write("\n".join(
            converter.convert(line.rstrip()).replace(
                '(img/', '(../img/').replace('髮送', '傳送').replace(
                    '髮布', '釋出').replace('髮生', '發生').replace('髮出', '發出')
            for line in src))
    print("convert %s to %s" % (src_path, dst_path))
Example #6
    def setOutputSimplifiedChinese(self, outputSimpChinese):
        self.outputSimpChinese = outputSimpChinese
        # create an OpenCC instance for Traditional/Simplified Chinese conversion
        if outputSimpChinese:
            if not self.opencc:
                self.opencc = opencc.OpenCC(
                    opencc.OPENCC_DEFAULT_CONFIG_TRAD_TO_SIMP)
        else:
            self.opencc = None

        self.updateSwitchLangIcon = True
        self.updateLangButtons()
Example #7
def load_dev(path='/home/dy/flat-chinese-ner/data/test.txt', simplify=True):
    test_data = []
    with open(path, 'r', encoding='utf8') as f:
        file_text = f.read().encode('utf-8').decode('utf-8-sig')
        converter = opencc.OpenCC('t2s.json')
        if simplify:
            file_text = converter.convert(file_text)

        datas = file_text.split('\n\n--------------------\n\n')[:-1]
        for doc in datas:
            _, doc = doc.split('\n')
            test_data.append(doc)
    return test_data
Example #8
def convert(infile: str, outfile: str, cfg: str):
    """read >> convert >> write file
    Args:
        infile (str): input file
        outfile (str): output file
        cfg (str): config
    """
    converter = opencc.OpenCC(cfg)
    with open(infile, "r") as inf, open(outfile, "w+") as outf:
        data = inf.readlines()
        data = list(map(converter.convert, data))
        outf.writelines(data)
    print(f"Convert to {outfile}")
Example #9
    def __init__(self, options):
        self.critical = False

        super().__init__(options)

        self.drop_zh = False
        self.opencc_version = "N/A"

        try:
            import opencc

            self.s2tw = opencc.OpenCC('s2twp.json')
            self.tw2s = opencc.OpenCC('tw2sp.json')

            self.opencc_version = opencc.__version__
        except Exception as e:
            if self.critical:
                raise e
            else:
                print("[warning] zhconv: opencc load failed, zhconv disabled")
                print("[warning] zhconv: disabling zh-cn, zh-tw build")
                self.drop_zh = True
Example #10
def wiki_to_txt(file_name, output_name):
    logging.info("開始 wiki_to_txt")
    wiki_corpus = WikiCorpus(file_name, dictionary={})
    texts_num = 0
    converter = opencc.OpenCC('s2t.json')
    with open(output_name, 'w', encoding='utf-8') as output:
        for texts in wiki_corpus.get_texts():
            r = converter.convert(' '.join(texts))
            output.write(r + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章" % texts_num)

    logging.info("結束 wiki_to_txt")
Example #11
def convert2simple():
    # ported to Python 3: read and write UTF-8 text directly instead of decoding/encoding bytes
    cc = opencc.OpenCC('t2s')
    for i in range(1, 5):
        src_file = dir_path + "wiki_texts" + str(i) + ".txt"
        des_file = dir_path + "wiki_simple" + str(i) + ".txt"
        des_f = open(des_file, 'w', encoding='utf-8')
        with open(src_file, 'r', encoding='utf-8') as f:
            for line in f:
                content = cc.convert(line.rstrip('\n'))
                print(content)
                des_f.write(content + '\n')
        des_f.close()
        print(str(i) + " finished.")
Example #12
def merge_files(root, target, convert):
    
    converter = opencc.OpenCC('s2t.json')
    file_names = get_all_files(root)
    output = open(target, "w+", encoding='utf-8', errors='ignore')
    for file_name in file_names:
        f = open(root + file_name, "r", encoding='utf-8',errors='ignore')
        for line in tqdm(f.readlines(), desc="merge lines in a file"):
            if convert:
                output.write(converter.convert(line))
            else:
                output.write(line)
        f.close()
    output.close()
Example #13
    def load_texts_labels(self, filename_label_dict, pickle_name):
        '''
        Load the text-format dataset, split it into training and test sets, and save it as pickle files.
        :param filename_label_dict: mapping from data file name to label
        :param pickle_name: base name of the pickle files to write
        :return:
        '''
        texts, labels = [], []
        # load the dataset
        for filename, label in filename_label_dict.items():
            #label = filename_label_dict[filename]
            file_path = os.path.join(self.DATA_DIR, filename)
            data = open(file_path, 'r', encoding='utf-8')
            cnt = 0
            for line in data:
                try:
                    tokens = [
                        t for t in jieba.lcut(line.strip())
                        if t not in self.stopwords
                    ]
                    text = ' ' + ' '.join(tokens)
                    # preprocessing: convert Traditional Chinese to Simplified
                    cc = opencc.OpenCC('mix2s')
                    texts.append(cc.convert(text))
                    labels.append(label)
                except Exception as e:
                    print('{}:\n{}'.format(e, data))

                cnt += 1
                if cnt % 1000 == 0:
                    print('Processed {} records.'.format(cnt))
            print('Done processing {} records.'.format(cnt))

        self.dataset['texts'] = texts
        self.dataset['labels'] = labels

        # save the dataset as pickle files
        pickle_texts = '{}_texts.pk'.format(pickle_name)
        pickle_labels = '{}_labels.pk'.format(pickle_name)
        with open(os.path.join(self.DATA_DIR, pickle_texts), 'wb') as f_texts:
            pickle.dump(self.dataset['texts'], f_texts)

        with open(os.path.join(self.DATA_DIR, pickle_labels),
                  'wb') as f_labels:
            pickle.dump(self.dataset['labels'], f_labels)

        # split the dataset into training and validation sets
        self.train_x, self.valid_x, self.train_y, self.valid_y = \
            model_selection.train_test_split(self.dataset['texts'], self.dataset['labels'])
Example #14
def main():
    converter = opencc.OpenCC('t2s.json')
    book = sys.argv[1]
    with open(book, "rb") as f:
        data = f.read()
        text = data.decode('utf-8',errors='ignore')
        # RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎|,|。]', re.UNICODE)
        RE = re.compile('[\u4e00-\u9fff|\u3001-\u303F|\uff01-\uff5d|\u2160-\u217F|\u203B|\u30FB|\u2027|\u25a0|\u2500|\uff5e|\u2026\u25c6|\u2460-\u2487]', re.UNICODE)
        chinese = RE.findall(text)
        tcscript = ''.join(chinese)
        sc = converter.convert(tcscript)
        sc = strQ2B(sc)
        with open(book+".txt", "a+", encoding="utf-8") as f1:
            f1.write(sc)
            f1.write('\n\n')
Example #15
 def t2s(self, traditional_file, clean_file):
     """
     Convert Traditional Chinese to Simplified Chinese.
     :param traditional_file:
     :param clean_file:
     :return:
     """
     converter = opencc.OpenCC('t2s.json')
     with open(traditional_file, 'r', encoding='utf-8') as rf:
         batch_data = []
         for line in rf:
             line = line.strip().split('\t')
             line[1] = converter.convert(line[1])
             batch_data.append(line[0] + '\t' + line[1])
         _write_data2file(batch_data, clean_file)
Example #16
    def _create_examples(lines, set_type=None):
        """Creates examples for the training and dev sets."""
        # re_ENUM = re.compile(r"([-.a-zA-Z0-9]+)")
        re_ENUM = re.compile(r'(([-–+])?\d+(([.·])\d+)?%?|([0-9_.·]*[A-Za-z]+[0-9_.·]*)+)')
        converter = opencc.OpenCC('t2s')

        def _labels_words(p_text_segment):
            inside_tokens = []
            inside_labels = []
            for segment in p_text_segment:
                hyper_tokens = segment.split()
                segment_tokens = []
                for hyper_token in hyper_tokens:
                    hyper_token = hyper_token.strip()
                    if len(hyper_token) > 0:
                        is_chinese = False
                        for c in hyper_token:
                            if process.process_utils.is_cjk_char(ord(c)):
                                is_chinese = True
                                break
                        if is_chinese:
                            segment_tokens.extend(list(hyper_token))
                        else:
                            segment_tokens.append(hyper_token)

                inside_tokens.extend(segment_tokens)
                if len(segment_tokens) == 1:
                    inside_labels.extend(["A"])
                elif len(segment_tokens) > 1:
                    inside_labels.extend(["BS"] + ["A"] * (len(segment_tokens) - 2) + ["ES"])

            return inside_tokens, inside_labels

        for (i, line) in enumerate(lines):
            # Only the test set has a header
            line = convert_to_unicode(line.strip())
            text = str.lower(process.process_utils.strQ2B(line))
            text = converter.convert(text)
            text = re_ENUM.sub(" \\1 ", text)
            text_segment = text.split("☃")
            tokens, labels = _labels_words(text_segment)
            o_text = re.sub(r"\s|☃", "", line)
            offset = 0
            o_tokens = []
            for token in tokens:
                o_tokens.append(o_text[offset: offset + len(token)])
                offset += len(token)
            yield InputExample(guid=o_tokens, text=tokens, labels=labels)
Example #17
def wiki_replace(d):
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    cc = opencc.OpenCC('t2s')
    s = cc.convert(s)
    # print('start.............')
    # print(s)
    return s
Example #18
def getLSTMPredict(ans):
    data = {}
    module_lstm_dir = os.path.join(module_dir, 'lstmB')
    jieba.set_dictionary(os.path.join(module_lstm_dir, 'dict_v2.txt'))

    with open(os.path.join(module_lstm_dir, 'stopwords_only_symbol_v2.txt'),
              'r',
              encoding='utf8') as f:
        stops_symbol = f.read().split('\n')
    input_str = ans  # the input news headline
    # print(f'input_str:{input_str}')
    converter = opencc.OpenCC('s2twp.json')
    s2twp_str = converter.convert(input_str)
    # print(f's2twp_str:{s2twp_str}')
    jieba_str = ' '.join([
        t for t in jieba.cut_for_search(str(s2twp_str))
        if t not in stops_symbol
    ])
    input_data_np = np.array([jieba_str])
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(
        os.path.join(module_lstm_dir,
                     'search_jieba_no_stopwords_train_vocab.pickle'))
    input_data_pd = np.array(list(vocab_processor.transform(input_data_np)))
    tf.reset_default_graph()
    saver = tf.train.import_meta_graph(
        os.path.join(module_lstm_dir,
                     'search_jieba_no_stopwords_train_vocab.ckpt.meta'))
    with tf.Session() as sess:
        saver.restore(
            sess,
            os.path.join(module_lstm_dir,
                         'search_jieba_no_stopwords_train_vocab.ckpt'))
        prob_and_ans = {"Placeholder:0": input_data_pd, "Placeholder_2:0": 1}
        prob = sess.run("probability:0", feed_dict=prob_and_ans)
        ans = sess.run("ans:0", feed_dict=prob_and_ans)
        # print(f'probability: {prob}') # print the higher probability
        # print(f'ans: {ans}') # print real or fake (1 = real, 0 = fake)
        if ans[0].item() == 0:
            data['result'] = False
        else:
            data['result'] = True
        data['confidence'] = prob[0].item()
        # print(f'verdict: {ans}, confidence: {prob}')
        # print(f'ans:{type(ans[0])},prob:{type(prob[0])}')
        data['success'] = True
        return data
Example #19
 def __init__(self,
              vocab_file,
              lowercase=True,
              strip_accents=False,
              clean_text=True,
              cc=None):
     self.vocab_file = vocab_file
     self.cc = None
     if cc is not None:
         # pip install opencc-python-reimplemented
         import opencc
         self.cc = opencc.OpenCC(cc)
     from tokenizers import BertWordPieceTokenizer
     self._tokenizer = BertWordPieceTokenizer(self.vocab_file,
                                              lowercase=lowercase,
                                              strip_accents=strip_accents,
                                              clean_text=clean_text)
Example #20
def preproc():
    rst_items = []
    convertor = opencc.OpenCC('tw2sp.json')
    test_items = proc_test_set('data', convertor)
    for item in read_data(get_abs_path('data')):
        rst_items += proc_item(item, convertor)
    for item in read_confusion_data(get_abs_path('data')):
        rst_items += proc_confusion_item(item)

    # split into training and dev sets
    dev_set_len = len(rst_items) // 10
    print(len(rst_items))
    random.seed(666)
    random.shuffle(rst_items)
    dump_json(rst_items[:dev_set_len], get_abs_path('data', 'dev.json'))
    dump_json(rst_items[dev_set_len:], get_abs_path('data', 'train.json'))
    dump_json(test_items, get_abs_path('data', 'test.json'))
    gc.collect()
Example #21
def gen_data(file_name):
    converter = opencc.OpenCC('t2s.json')
    actions = list()
    with open(file_name) as f:
        for line in tqdm(f):
            doc = json.loads(line.strip())
            actions.append({
                "_index": 'zhwiki',
                '_source': {
                    'title': converter.convert(doc['title']),
                    'text': converter.convert(doc['text'])
                }
            })
            if len(actions) >= 100:
                yield actions
                actions = list()
    if len(actions) > 0:
        yield actions
Example #22
def read2df(mnt_txt):
    cc = oc.OpenCC("t2s")
    with open(mnt_txt,"r", encoding="utf-8") as f:
        data = f.read()
    data_list = data.split("\n")
    eng_list,chn_list = [],[]
    df = pd.DataFrame()
    for dl in data_list[:-1]:
        dls = dl.split("\t")
        #print(dls)
        eng_list.append(split_dot(dls[0]))
        chn_list.append(cc.convert(dls[1]))
    df["eng"] = eng_list
    df["chn"] = chn_list
    print(df.head(5))
    df.to_csv("cmn.csv",index=None)
    print("save csv")
    return(df)
Example #23
def test_conversion():
    import opencc

    for inpath in glob(os.path.join(_test_assets_dir, '*.in')):
        pref = os.path.splitext(inpath)[0]
        config = os.path.basename(pref)
        converter = opencc.OpenCC(config)
        anspath = '{}.{}'.format(pref, 'ans')
        assert os.path.isfile(anspath)

        with open(inpath, 'rb') as f:
            intexts = [l.strip().decode('utf-8') for l in f]
        with open(anspath, 'rb') as f:
            anstexts = [l.strip().decode('utf-8') for l in f]
        assert len(intexts) == len(anstexts)

        for text, ans in zip(intexts, anstexts):
            assert converter.convert(text) == ans, \
                'Failed to convert {} for {} -> {}'.format(pref, text, ans)
Example #24
def export(words):
    result = ""
    converter = opencc.OpenCC('t2s.json')
    HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
    count = 0
    last_word = None
    for line in words:
        line = line.rstrip("\n")
        if not HANZI_RE.match(line):
            continue

        # Skip single character & too long pages
        if not 1 < len(line):
            continue

        # Skip list pages
        if line.endswith(('列表', '对照表')):
            continue

        if last_word and len(last_word) >= 4 and line.startswith(last_word):
            continue

        pinyin = "'".join(lazy_pinyin(line))
        if pinyin == line:
            # print("Failed to convert, ignoring:", pinyin, file=sys.stderr)
            continue

        if manual_fix(line):
            pinyin = manual_fix(line)
            console.debug(f"Fixing {line} to {pinyin}")

        last_word = line

        result += "\t".join((converter.convert(line), pinyin, "0"))
        result += "\n"
        count += 1
        if count % 1000 == 0:
            console.debug(str(count) + " converted")

    if count % 1000 != 0 or count == 0:
        console.debug(str(count) + " converted")
    return result
Example #25
def search_from_qq(song_full_name):
    cc = opencc.OpenCC('t2s')

    try:
        singer, song = song_full_name.split(' - ')
    except Exception as e:
        singer = False
        song = False

    headers = {
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'Referer': 'https://y.qq.com/portal/search.html'
    }
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=5))
    s.mount('https://', HTTPAdapter(max_retries=5))
    r = s.get('https://c.y.qq.com/soso/fcgi-bin/client_search_cp',
              params={
                  'w': song_full_name,
                  'format': 'json'
              },
              timeout=None,
              headers=headers)
    resp = r.json()
    try:
        if singer:
            if resp['data']['song']['list']:
                for songinfo in resp['data']['song']['list']:
                    for singer_r in songinfo['singer']:
                        if (cc.convert(singer_r['name'].lower()) in cc.convert(
                                singer.lower())) or (cc.convert(
                                    singer.lower()) in cc.convert(
                                        singer_r['name'].lower())):
                            return songinfo['songmid']
                return False
            return False
        return resp['data']['song']['list'][0]['songmid']
    except Exception as e:
        return False
Example #26
def cut_word():
    """切词处理文本"""
    wiki = codecs.open('wiki', 'r', encoding="utf8")
    train = codecs.open('corpus', 'a', encoding="utf8")
    i = 0
    line = wiki.readline()
    cc = opencc.OpenCC('t2s')
    while line:
        ste = re.findall("[\u4e00-\u9fa5]+", line)
        if len(ste):
            line_data = "".join(ste)
            seg_list = jieba.cut(line_data, cut_all=False)
            train.write(cc.convert(" ".join(seg_list)))
            train.write('\n')
        if i % 100 == 0:
            print("切词到第" + str(i) + "行")
        i += 1
        line = wiki.readline()
    wiki.close()
    train.close()
Example #27
def conversion():
    # ported to Python 3: open both files as UTF-8 text, no manual decode()/encode()
    afterSimplify = open('afterSim.txt', mode='w+', encoding='utf-8')
    conversion_type = opencc.OpenCC('mix2s')
    with open('wiki.zh.txt', encoding='utf-8') as preText:
        for line in preText:
            print(type(line))
            try:
                words = line.split()
                for word in words:
                    afterWord = conversion_type.convert(word)
                    afterSimplify.write('{} '.format(afterWord))
            except UnicodeDecodeError:
                pass

            afterSimplify.write('\n')
Example #28
def search_from_netease(song_full_name):
    cc = opencc.OpenCC('t2s')
    try:
        singer, song = song_full_name.split(' - ')
    except Exception as e:
        singer = False
        song = False

    headers = {
        'Cache-Control':
        'no-cache',
        'Host':
        'musicapi.leanapp.cn',
        'Pragma':
        'no-cache',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }

    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=5))
    s.mount('https://', HTTPAdapter(max_retries=5))
    req = s.get('http://127.0.0.1:3000/search',
                params={'keywords': song_full_name},
                headers=headers,
                timeout=None)
    resp = req.json()

    try:
        if singer:
            for x in resp['result']['songs']:
                for artist in x['artists']:
                    if (cc.convert(artist['name'].lower()) in cc.convert(
                            singer.lower())) or (cc.convert(singer.lower())
                                                 in cc.convert(
                                                     artist['name'].lower())):
                        return x['id']
            return False
        return resp['result']['songs'][0]['id']
    except Exception as e:
        return False
Example #29
def preprocess(path):
    r=raw_input("type a directory name:")
    fw = open(path,'w')
    ivLIST = file2list('./iv.txt')
    for root,dirs,files in os.walk(r):
        for f in files:
            path = os.path.join(root,f)
            fo = open(path)
            for line in fo:
                line=line.strip()
                cc = opencc.OpenCC('t2s',opencc_path='/usr/bin/opencc')
                line  = cc.convert(line.decode('utf8')).encode('utf8')
                if line:
                    #remove the content in () 
                    match = par.findall(line)
                    if match:
                        for i in match:
                            if i=='(*^__^*)' or i=='(∩_∩)':
                                line = line.replace(i,' 微笑 ')
                            else:
                                line = line.replace(i,' ')

                    line = applyPAT(quote,line,isCH=None,sub=' ')
                    line = applyPAT(par2,line,1)
                    line = applyPAT(quote2,line,1)
                    line = applyPAT(period,line,1,' 无语 ')
                    
                    ## remove intensional verb and something unsure
                    lineCOPY = line    
                    lineCOPY = lineCOPY.replace('。','\n').replace(',','\n').replace(',','\n')
                    clauses = lineCOPY.split('\n') 
                    for i in clauses:
                        for j in ivLIST:
                            if i.find(j) !=-1:
                                line = line.replace(i,' ')
                    if line:
                        if line.startswith('宾馆反馈'):
                            continue
                        fw.write(line+'\n')            
            fw.write("----------\n")
    fw.close()  # close the output file once, after walking every directory
Example #30
def chinese_t2s(writefile, readfile):
    import opencc

    cc = opencc.OpenCC('t2s')  # t2s: Traditional Chinese to Simplified Chinese

    # UnicodeEncodeError: 'gbk' codec can't encode character comes from the GBK default
    # clashing with UTF-8 content, so open the output file with encoding='utf-8'
    file = open(writefile, 'w', encoding='utf-8')
    prompt = 0  # counter used to report progress

    for line in open(readfile, 'rb').readlines():
        # l = cc.convert(l).encode('utf8', 'ignore')
        # file.write(l + '\n')
        l = line.decode('utf8', 'ignore').rstrip(u'\n')
        file.write(cc.convert(l) + u'\n')

        # report progress
        print('traditional to simplified, processing: ' + str(prompt))
        prompt += 1

    file.close()
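For comparison, a minimal sketch of the same Traditional-to-Simplified file conversion written with text-mode I/O throughout; it assumes UTF-8 input, as the example above effectively does, and is illustrative rather than part of the original:

import opencc

def chinese_t2s_minimal(writefile, readfile):
    cc = opencc.OpenCC('t2s')
    with open(readfile, encoding='utf-8', errors='ignore') as src, \
            open(writefile, 'w', encoding='utf-8') as dst:
        for line in src:
            dst.write(cc.convert(line.rstrip('\n')) + '\n')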