def convert1(): pos_hk = path.join(DIR, "pos_hk.txt") neg_hk = path.join(DIR, "neg_hk.txt") pos_tw = path.join(DIR, "pos_tw.txt") neg_tw = path.join(DIR, "neg_tw.txt") with open(POS) as pos, \ open(NEG) as neg, \ open(pos_hk, "w") as pos_hk, \ open(neg_hk, "w") as neg_hk, \ open(pos_tw, "w") as pos_tw, \ open(neg_tw, "w") as neg_tw: pos = pos.read() neg = neg.read() s2hk = OpenCC('s2hk') pos_converted = s2hk.convert(pos) neg_converted = s2hk.convert(neg) pos_hk.write(pos_converted) neg_hk.write(neg_converted) s2tw = OpenCC('s2tw') pos_converted = s2tw.convert(pos) neg_converted = s2tw.convert(neg) pos_tw.write(pos_converted) neg_tw.write(neg_converted)
def xml_to_json():
    """
    <step1>
    1. Convert Simplified Chinese to Traditional Chinese
    2. Convert XML to JSON
    3. Convert full-width punctuation to half-width
    """
    openCC = OpenCC('s2t')  # Simplified -> Traditional
    tree = ET.parse('./corpus/corpus.xml')
    root = tree.getroot()
    output_list = []
    c = 0
    nothing = 0
    for doc in root.findall('doc'):
        c += 1
        if c % 10000 == 0:
            print('---- progress: %d docs ----' % c)
        output_dict = {}
        content = doc.find('content').text
        title = doc.find('contenttitle').text
        if content and title:
            output_dict['abstract'] = openCC.convert(_full_to_half(title))
            output_dict['article'] = openCC.convert(_full_to_half(content))
            output_list.append(output_dict)
        else:
            nothing += 1
            if nothing % 1000 == 0:
                print('empty docs so far: %d' % nothing)
    with open('corpus/corpus_1.json', 'w') as wf:
        json.dump(output_list, wf)
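The helper _full_to_half used above is not shown in this snippet; the following is a minimal sketch of a plausible full-width-to-half-width conversion, not the original implementation.

def _full_to_half(text):
    # Hypothetical helper: map full-width ASCII variants (U+FF01..U+FF5E) and the
    # ideographic space (U+3000) to their half-width counterparts.
    chars = []
    for ch in text:
        code = ord(ch)
        if code == 0x3000:
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:
            code -= 0xFEE0
        chars.append(chr(code))
    return ''.join(chars)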
def getStockInfo(stock): cc = OpenCC('s2t') stockCode = stock.split('.')[0] s = '{:05d}'.format(int(stockCode)) session = HTMLSession() session.browser url = 'http://stock.finance.sina.com.cn/hkstock/quotes/{0}.html'.format(s) stock_dict = {} r = session.get(url) r.html.render() name = r.html.find('#stock_cname', first=True) price = r.html.find('#mts_stock_hk_price', first=True) stockQuan = r.html.find('div.deta03', first=True).find('ul')[1].find('li')[3] news = r.html.find('#js_ggzx', first=True).find('a') # print("{0} ({1})".format(cc.convert(name.text), s)) # print(price.text) # print(cc.convert(stockQuan.text)) for new in news: print(new.text, new.links) stock_dict = { 'stock_name': cc.convert(name.text), 'stock_code': s, 'stock_price': price.text, 'stock_quan': cc.convert(stockQuan.text), 'news': news } return stock_dict
def sortData(self, tagResults):
    cc = OpenCC('s2t')
    # resultForTotalSize = self.getCallResult(1,1,None)
    dataForRakuten = {}
    sortedData = []
    for record in tagResults:
        extend_data = []
        customer_info = []
        record_session_id = record['session_id']
        # fetch ASR result - begin
        asr_result = self.getChatRecords(record_session_id)
        if asr_result['status'] != 0:
            return ("Call api: " + const.GET_ASR_RESULT_API +
                    ", got wrong status: " + str(asr_result['status']) +
                    ", message: " + asr_result['message'])
        # ensure_ascii=False keeps the Chinese characters literal so that OpenCC can
        # actually convert them; with the default \uXXXX escaping the conversion is a no-op.
        record['asr_result'] = json.loads(cc.convert(json.dumps(asr_result['result']['data'], ensure_ascii=False)))
        # fetch ASR result - end
        log.debug(record['extend_data'])
        for key, value in record['extend_data'].items():
            regex = re.search('^\\*(.+)', key)
            if regex:
                extend_data.append({"session_id": record_session_id, "col_name": cc.convert(regex.group(1)), "value": cc.convert(value)})
            else:
                customer_info.append({"session_id": record_session_id, "col_name": cc.convert(key), "value": cc.convert(value)})
        record['extend_data'] = extend_data
        record['customer_info'] = customer_info
        sortedData.append(record)
    dataForRakuten['data'] = sortedData
    return dataForRakuten
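The dumps/convert/loads round trip above is a compact way to push every string in a nested structure through OpenCC; a small self-contained illustration (the sample data is invented):

import json
from opencc import OpenCC

cc = OpenCC('s2t')
data = {"问题": ["请问您的电话号码", "无"]}
# ensure_ascii=False keeps the Chinese characters literal so OpenCC can convert them.
converted = json.loads(cc.convert(json.dumps(data, ensure_ascii=False)))
print(converted)  # {'問題': ['請問您的電話號碼', '無']}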
def convert2(): pos_tc_path = path.join(DIR, "pos_tc.txt") neg_tc_path = path.join(DIR, "neg_tc.txt") with open(POS) as pos, \ open(NEG) as neg, \ open(pos_tc_path, "w") as pos_tc, \ open(neg_tc_path, "w") as neg_tc: pos = pos.read() neg = neg.read() pos_set = pos.split("\n") neg_set = neg.split("\n") s2hk = OpenCC('s2hk') pos_converted_hk = s2hk.convert(pos) neg_converted_hk = s2hk.convert(neg) s2tw = OpenCC('s2tw') pos_converted = s2tw.convert(pos) neg_converted = s2tw.convert(neg) pos = set(pos_converted_hk.split("\n")).union( pos_converted.split("\n")) neg = set(neg_converted_hk.split("\n")).union( neg_converted.split("\n")) pos_tc.write("\n".join(sorted(pos.difference(pos_set)))) neg_tc.write("\n".join(sorted(neg.difference(neg_set))))
class Emotion(object): def __init__(self): APP_ID = '10508840' # '你的 App ID' API_KEY = 'W9BwLsLvlPQvD9LsfWIBGX28' # '你的 Api Key' SECRET_KEY = 'd4wSFFDKm0VjGrPZVxWpZyGfAFYuD3AX' # '你的 Secret Key' self.db = Mysql_DB() self.aip = AipNlp(APP_ID, API_KEY, SECRET_KEY) self.trans = OpenCC('t2s') #模式设置为繁体-简体 def Get_Sentence(self): sql = "select id, Comment_Content from comment where over = 'YYYY' limit " + str(100) try: Sentence_list = self.db.Query_MySQL(sql) # 读取数据库,获取step行列 for i in Sentence_list: # 执行YYY修改命令,看看参照什么来做基准 self.update_db(i[0]) return Sentence_list except Exception as e: print ('query_db函数执行错误' + str(e)) def update_db(self, i): changeY_sql = "update comment set over = 'YY' where id = " + str(i) try: self.db.Insert_MySQL(changeY_sql) except Exception as e: print ('改变YY错误' + str(e)) def Get_Analyse(self): sentence_list = self.Get_Sentence() r = re.compile(ur"[\u0000-\u4dff,\u9fa6-\uffff]") # 删除除了中文以外的一切 for i in sentence_list: try: simple = self.trans.convert(i[1]) #print i[1].strip().encode('utf-8', 'ignore') result = self.aip.sentimentClassify(simple.strip().encode('utf-8', 'ignore')) #print result '''print result['items'][0]['positive_prob'] #属于积极类别的概率 print result['items'][0]['confidence'] #分类的置信度 print result['items'][0]['negative_prob'] #属于消极类别的概率 print result['items'][0]['sentiment'] #情感极性分类结果,0为负面,1为中性,2为正面''' s = str(result['items'][0]['sentiment']) p = str(result['items'][0]['positive_prob']) n = str(result['items'][0]['negative_prob']) c = str(result['items'][0]['confidence']) sql = "update comment set sentiment = %s, positive_prob = %s, negative_prob = %s, confidence = %s"%(s, p, n, c) + " where id = " + str(i[0]) self.db.Insert_MySQL(sql) except Exception as e: print('辣鸡百度转码又TM错误了,看老子的' + str(e)) try: simple = self.trans.convert(i[1]) re_s = r.sub(',', simple) result = self.aip.sentimentClassify(re_s.strip().encode('utf-8', 'ignore')) s = str(result['items'][0]['sentiment']) p = str(result['items'][0]['positive_prob']) n = str(result['items'][0]['negative_prob']) c = str(result['items'][0]['confidence']) sql = "update comment set sentiment = %s, positive_prob = %s, negative_prob = %s, confidence = %s"%(s, p, n, c) + " where id = " + str(i[0]) self.db.Insert_MySQL(sql) except Exception as e: print ('草,老子没辙了' + str(e))
def weibo_five_mil_save_to_db(file): print("Starting...") openCC = OpenCC('s2t') hashtag_regex = re.compile(r"#(\w+)#") clean_regex = re.compile(r"[(\s)|(\u200b)]+") html_regex = re.compile(r'(www|http)\S+', flags=re.MULTILINE) null_regex = re.compile(r'\x00') with open(file) as fp: for i in range(1820000): fp.readline() # skip headers for idx, line in enumerate(fp, 1820000): if idx % 10000 == 0: print(f"{idx} posts...") post = line.split("\t") try: weibo_id = int(post[0]) if WeiboFiveMilPost.objects.filter(weibo_id=weibo_id).exists(): continue else: try: attitudes_count = int(post[1]) comments_count = int(post[3]) created_at = post[4] _id = int(post[7]) content_raw = post[18] content_raw = null_regex.sub("", content_raw) cn_content_clean = clean_regex.sub("", content_raw) cn_content_clean = html_regex.sub("LINK", cn_content_clean) cn_content_clean_seg = list(jieba.cut(cn_content_clean)) tw_content_clean = openCC.convert(cn_content_clean) tw_content_clean_seg = [openCC.convert(c) for c in cn_content_clean_seg] cn_tags = hashtag_regex.findall(cn_content_clean) if cn_tags: tw_tags = [openCC.convert(t) for t in cn_tags] else: tw_tags = [] reposts_count = int(post[16]) source = post[17] WeiboFiveMilPost( weibo_id=weibo_id, attitudes_count=attitudes_count, comments_count=comments_count, created_at=created_at, _id=_id, content_raw=content_raw, cn_content_clean=cn_content_clean, cn_content_clean_seg=cn_content_clean_seg, tw_content_clean=tw_content_clean, tw_content_clean_seg=tw_content_clean_seg, cn_tags=cn_tags, tw_tags=tw_tags, source=source, reposts_count=reposts_count ).save() except ValueError as err: print(err) continue except ValueError as err: print(err)
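A small illustration of the cleaning steps used above on a made-up post (the sample text and expected outputs are for demonstration only):

import re
from opencc import OpenCC

openCC = OpenCC('s2t')
hashtag_regex = re.compile(r"#(\w+)#")
clean_regex = re.compile(r"[(\s)|(\u200b)]+")
html_regex = re.compile(r'(www|http)\S+', flags=re.MULTILINE)

sample = "#天气# 今天 天气 不错 http://t.cn/xxxx\u200b"
clean = clean_regex.sub("", sample)      # drop whitespace and zero-width spaces
clean = html_regex.sub("LINK", clean)    # replace bare links with a placeholder
print(clean)                             # '#天气#今天天气不错LINK'
print(hashtag_regex.findall(clean))      # ['天气']
print(openCC.convert(clean))             # '#天氣#今天天氣不錯LINK'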
def getOneArticle(cls, response): """ 解析並取得單篇文章的內容 """ cc = OpenCC('s2t') article = ArticleItem() article_url = response.url # 從網址切出 小說 & 文章 id tidRegex = re.compile(r'tid=(\d+)&') matchT = tidRegex.search(article_url) novel_id = str(int(matchT.group(1))) sidRegex = re.compile(r'sid=(\d+)') matchS = sidRegex.search(article_url) article_id = str(int(matchS.group(1))) article["novel_id"] = novel_id article["article_id"] = article_id article["site"] = response.meta["site_name"].strip() article["novel"] = response.meta["novel_name"].strip() article["author"] = response.meta["author"].replace("作者:", "").strip() article["link"] = article_url article["title"] = response.css("h3::text").extract_first() whitespacePattern = re.compile(r'\s+') article["title"] = re.sub(whitespacePattern, '', article["title"]) content = response.css("div#bookContent") if content.extract_first(): tags = content.css("div.ad_conetent") if tags: for tag in tags: htmlNode = tag.root if htmlNode is not None and htmlNode.getparent( ) is not None: htmlNode.getparent().remove(htmlNode) article["content"] = content.extract_first() # 時間預設值之處理 tz = pytz.timezone('Asia/Taipei') article["created_at"] = datetime.now(tz) article["updated_at"] = datetime.now(tz) # 轉繁體 try: title2 = cc.convert(article["title"]) if title2: article["title"] = title2 author2 = cc.convert(article["author"]) if author2: article["author"] = author2 content2 = cc.convert(article["content"]) if content2: article["content"] = content2 except: pass yield article
def test_convert2(): cc = OpenCC() text = '乾坤一擲' expect = '乾坤一掷' assert cc.convert(text) == expect text = '開放中文轉換' expect = '开放中文转换' assert cc.convert(text) == expect
def test_class_convert(): cc = OpenCC() text = '乾坤一擲' expect = '乾坤一掷' assert cc.convert(text) == expect text = '開放中文轉換' expect = '开放中文转换' assert cc.convert(text) == expect
def getkeywords(num):
    # load ptt posts
    with open(filename, encoding='utf8') as f:
        posts = json.load(f)
    # print(posts)
    titles = ""
    for post in posts["articles"]:
        # print(post["article_title"])
        title_tmp = post["article_title"]
        # title_tmp = pattern.findall(title_tmp)
        # if( title_tmp[0][1] == " "):
        #     title_tmp[0] = title_tmp[0][1:].lstrip()
        # Strip the leading "[category]" tag from the title; fall back to the raw title.
        try:
            titles += title_tmp.split("] ")[1].replace('"', ' ') + " \n"
        except IndexError:
            try:
                titles += title_tmp.split("]")[1].replace('"', ' ') + " \n"
            except IndexError:
                titles += title_tmp.replace('"', ' ') + " \n"
    from opencc import OpenCC
    cc = OpenCC('tw2sp')  # Traditional Chinese (Taiwan) -> Simplified, with phrase conversion
    # can also set conversion by calling set_conversion
    # cc.set_conversion('s2tw')
    Simplified = cc.convert(titles)
    # print(Simplified)
    jieba.analyse.set_stop_words('stopwords.txt')
    jieba.add_word('柯文哲')
    jieba.add_word('叶克膜')
    jieba.add_word('黄士修')
    jieba.add_word('林佳龙')
    cc = OpenCC('s2twp')
    result = ''.join(i for i in Simplified if not i.isdigit())  # strip digits
    tags = jieba.analyse.extract_tags(result, topK=num, withWeight=True)
    keywords = []
    for tag, weight in tags:
        keywords.append(cc.convert(tag))
        # print(cc.convert(tag) + "," + str(weight))
    return keywords
class Sorter: def __init__(self, c: Comment): #DTO refactor with Data transfer object 是否要繼承super,啟用init 因為Sorter算是連接物件helper #classmethod static method self.c = c #函式內部使用 self.cc = OpenCC('t2s') # 繁轉簡 self.bc = BertClient() # 取得bert服務器資源 self.classes_enc = None def cosine_sim(self, v1, v2): return 1 - spatial.distance.cosine(v1, v2) def predict_label(self, v): cos_sim = [] for i, c in enumerate(self.classes_enc): cos_sim.append(self.cosine_sim(v, c)) return argmax(cos_sim) def predict_labels(self, vs): op = [] for v in vs: op.append(self.predict_label(v)) return op def command_sort(self, Labeles): sent = self.c.input_comment labels = self.c.ground_truth classes = [self.cc.convert(s) for s in Labeles] #把Label轉簡體 因為模型簡體練的 self.classes_enc = self.bc.encode(classes) # 把Label們轉換成數值向量 print("True Label:", labels) print("Predict Label:", self.predict_labels(self.bc.encode(sent))) self.c.prediction = self.predict_labels(self.bc.encode(sent)) return 0 def single_comment_sort(self, Labeles): sent = self.c.input_comment labels = self.c.ground_truth classes = [self.cc.convert(s) for s in Labeles] #把Label轉簡體 因為模型簡體練的 self.classes_enc = self.bc.encode(classes) # 把Label們轉換成數值向量 #print("True Label:", labels) print("Predict Label:", self.predict_labels(self.bc.encode(sent))) self.c.prediction = self.predict_labels(self.bc.encode(sent)) result = self.c.prediction[0] return result def print_validate_result(self): print("True Label:", self.true_label) print("Predict Label:", self.result) return 0
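The label-assignment logic in Sorter reduces to nearest class by cosine similarity. A self-contained sketch with toy vectors, where hard-coded embeddings stand in for bc.encode(), purely for illustration:

import numpy as np
from scipy import spatial

# Toy class embeddings standing in for bc.encode(classes); values are invented.
classes_enc = np.array([[1.0, 0.0], [0.0, 1.0]])

def predict_label(v):
    # Same rule as Sorter.predict_label: pick the class with the highest cosine similarity.
    sims = [1 - spatial.distance.cosine(v, c) for c in classes_enc]
    return int(np.argmax(sims))

print(predict_label(np.array([0.9, 0.1])))  # 0
print(predict_label(np.array([0.2, 0.8])))  # 1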
class OpenCCTest(unittest.TestCase): def setUp(self): self.openCC = OpenCC() def test_hk2s(self): self.openCC.set_conversion('hk2s') words = '香煙(英語:Cigarette),為煙草製品的一種。滑鼠是一種很常見及常用的電腦輸入設備。' self.assertEqual(self.openCC.convert(words), '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入设备。') def test_s2hk(self): self.openCC.set_conversion('s2hk') words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。' self.assertEqual(self.openCC.convert(words), '香煙(英語:Cigarette),為煙草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。') def test_s2t(self): self.openCC.set_conversion('s2t') words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。' self.assertEqual(self.openCC.convert(words), '香菸(英語:Cigarette),爲菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。') def test_s2tw(self): self.openCC.set_conversion('s2tw') words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。' self.assertEqual(self.openCC.convert(words), '香菸(英語:Cigarette),為菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。') def test_s2twp(self): self.openCC.set_conversion('s2twp') words = '香烟(英语:Cigarette),为烟草制品的一种。內存是一种很常见及常用的电脑输入设备。' self.assertEqual(self.openCC.convert(words), '香菸(英語:Cigarette),為菸草製品的一種。記憶體是一種很常見及常用的電腦輸入裝置。') def test_t2hk(self): self.openCC.set_conversion('t2hk') words = '香菸(英語:Cigarette),爲菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。' self.assertEqual(self.openCC.convert(words), '香煙(英語:Cigarette),為煙草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。') def test_t2s(self): self.openCC.set_conversion('t2s') words = '香菸(英語:Cigarette),爲菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。' self.assertEqual(self.openCC.convert(words), '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入装置。') def test_t2tw(self): self.openCC.set_conversion('t2tw') words = '香菸(英語:Cigarette),爲菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。' self.assertEqual(self.openCC.convert(words), '香菸(英語:Cigarette),為菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。') def test_tw2s(self): self.openCC.set_conversion('tw2s') words = '香菸(英語:Cigarette),為菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。' self.assertEqual(self.openCC.convert(words), '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入装置。') def test_tw2sp(self): self.openCC.set_conversion('tw2sp') words = '香菸(英語:Cigarette),為菸草製品的一種。記憶體是一種很常見及常用的電腦輸入裝置。' self.assertEqual(self.openCC.convert(words), '香烟(英语:Cigarette),为烟草制品的一种。内存是一种很常见及常用的电脑输入设备。')
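The suite above has no runner attached in this snippet; a minimal entry point, assuming the tests live in a module that is executed directly, would be:

# Minimal test runner for OpenCCTest above (assumes direct execution of the test module).
if __name__ == '__main__':
    unittest.main()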
def is_match(regex, s, trans=True):
    # Also accept Traditional-Chinese input: normalise both the pattern and the
    # subject to Simplified Chinese before matching.
    if s and regex:
        if trans:
            try:
                cc = OpenCC("t2s")
                s = cc.convert(s)
                regex = cc.convert(regex)
            except:
                print(regex, s)
        return re.search(regex, s)
    return None
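A small usage sketch of is_match (the sample strings are illustrative): because both the pattern and the subject are normalised to simplified characters, a simplified pattern matches traditional text and vice versa.

print(bool(is_match('电脑', '電腦是常用的輸入設備')))  # True: the subject is converted to simplified first
print(bool(is_match('電腦', '电脑是常用的输入设备')))  # True: the pattern is converted to simplified first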
def save(info, url_flag, dir_path): if url_flag == "TW_T" or url_flag == "TW_Y": openCC = OpenCC('tw2sp') elif url_flag == "HK_T" or url_flag == "HK_Y": openCC = OpenCC('hk2s') filename = openCC.convert(info[url_flag]["title"]) + ".txt" outfile = open(dir_path + "/" + filename, "w", encoding='utf-8') for ss in ["title", "url", "detail"]: txt = info[url_flag][ss].strip(" ") txt_converted = openCC.convert(txt) outfile.write(txt_converted + "\r\n\r\n") outfile.close()
def preprocessing():
    res = []
    i = 0
    converter = OpenCC('t2s')  # transform into Simplified Chinese
    # nlp = StanfordCoreNLP(r'/home/yuyi/stanford-corenlp-full-2018-02-27', lang='zh')
    # gensim's WikiCorpus class for processing a Wikipedia dump
    wiki = WikiCorpus('/home/yuyi/zhwiki-latest-pages-articles.xml.bz2', lemmatize=False, dictionary=[])
    # get_texts yields each Wikipedia article as one text with punctuation already stripped
    for text in wiki.get_texts():
        cleaned = ''
        text = ''.join(text)
        for char in text:
            char = converter.convert(char)
            cleaned += char
        if len(cleaned):
            sentence = list(jieba.cut(cleaned))
            res.append(sentence)
        i = i + 1
        if (i % 1000) == 0:
            # if i == 10:
            print "Saved " + str(i) + " articles."
            # break
    with open('wiki_zh.pkl', 'w') as f:
        pickle.dump(res, f)
    print "Finished Saved " + str(i) + " articles."
def getShortWord(): oc = OpenCC(conversion='s2twp') # "出租车" --> "計程車" 带短语 out_file_path = os.path.join(traditionalChinese_out_dir, "shortcut.txt") word_shortcut = [] # 读取单个单词 文件格式:词 频率 1(繁体)/0(中文) 拼音 with open(traditionalChineseSinglewordPath, 'r', encoding='utf-16') as simplifiedChineseSingleWord_file: for line in simplifiedChineseSingleWord_file: items = line.strip().split(" ") jianti = items[0].strip() fanti = oc.convert(jianti) # pinyin = items[3].strip() begin = line.index(items[3]) end = line.index("\n") pinyin = str(line[begin:end]) # for i in range(3,len(items)): # pinyin = seq.join(items[i]) # pinyin.append("".join([items[i]])) res_line = pinyin + "\t" + isNotAWord + "\t" + items[ 1] + "\t" + fanti + "\t" + freq2 + "\n" word_shortcut.append(res_line) print("word size:", str(len(word_shortcut))) return word_shortcut
def get(self, request): name = request.GET.get('q', '') print(name) cc = OpenCC('s2t') name = cc.convert(name) if name is not None: queryset1 = Yao.objects.filter(responses__icontains=name) queryset2 = Yao.objects.filter(properties__icontains=name) # merged=queryset1 + queryset2 queryset = list(set(list(chain(queryset1, queryset2)))) for ele in queryset: ele.properties = ele.properties.replace('【', '\n\n【').replace( '】', '】\n') + "\n\n\n" ele.properties = ele.properties.replace('<li>', '').replace( '</li>', '').replace('<ul>', '').replace('</ul>', '').replace('<p', '') ele.responses = ele.responses.replace('<li>', '').replace( '</li>', '').replace('<ul>', '').replace('</ul>', '').replace('<p', '') ele.responses = ele.responses.replace('【', '\n\n【').replace( '】', '】\n') + "\n\n\n" ele.responses = ele.responses.replace(name, '<mg>' + name + '</mg>') ele.properties = ele.properties.replace( name, '<mg>' + name + '</mg>') return Response({'yaos': queryset})
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-i', '--input', metavar='<file>', help='Read original text from <file>.') parser.add_argument('-o', '--output', metavar='<file>', help='Write converted text to <file>.') parser.add_argument('-c', '--config', metavar='<conversion>', help='Conversion') parser.add_argument('--in-enc', metavar='<encoding>', default='UTF-8', help='Encoding for input') parser.add_argument('--out-enc', metavar='<encoding>', default='UTF-8', help='Encoding for output') args = parser.parse_args() if args.config is None: print("Please specify a conversion.", file=sys.stderr) return 1 cc = OpenCC(args.config) with io.open(args.input if args.input else 0, encoding=args.in_enc) as f: input_str = f.read() output_str = cc.convert(input_str) with io.open(args.output if args.output else 1, 'w', encoding=args.out_enc) as f: f.write(output_str) return 0
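A typical command-line invocation of the script above might look like this; the script file name is an assumption, not taken from the source.

# python convert_cli.py -c s2t -i input.txt -o output.txt
# Reads input.txt as UTF-8, applies the 's2t' conversion, and writes output.txt.
# Omitting -i/-o falls back to stdin (fd 0) and stdout (fd 1), as coded above.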
def convertFile(srcPath, destPath, progressCallback=None): """ Convert the given file into traditional chinese and save to destPath""" openCC = OpenCC('s2twp') # guess the encoding rawdata = open(srcPath, 'rb').read(500) result = chardet.detect(rawdata) charenc = result['encoding'] # count the total lines totalLines = 0 with open(srcPath, 'r', encoding=charenc, errors='ignore') as srcf: for i, l in enumerate(srcf): pass totalLines = i+1 # convert the file content prevProgress = 0 with open(srcPath, 'r', encoding=charenc, errors='ignore') as srcf: with open(destPath, 'w') as destf: for j, line in enumerate(srcf): line = openCC.convert(line) destf.write(line) # tell outside the converting progress if progressCallback: currProgress = j/totalLines if currProgress-prevProgress>=0.01: progressCallback(currProgress) prevProgress = currProgress
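A usage sketch for convertFile above; the file names and the callback are placeholders.

# Illustrative call: print progress while converting a Simplified-Chinese text file
# to Traditional Chinese (Taiwan, with phrase conversion). Paths are placeholders.
def report(progress):
    print("converted {:.0%}".format(progress))

convertFile("novel_cn.txt", "novel_tw.txt", progressCallback=report)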
def main():
    '''
    Convert a Simplified Chinese file to a Traditional Chinese file line by line.
    :return:
    '''
    if len(sys.argv) != 3:
        print("Usage: python " + sys.argv[0] + " input.txt output.txt")
        exit()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    cc = OpenCC('s2tw')
    output = open(sys.argv[2], "w+", encoding='utf-8')  # w+: write to file and create if it does not exist
    logging.info("Start converting!")
    with open(sys.argv[1], "r", encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            line_num += 1
            output.write(cc.convert(line))
            if line_num % 10000 == 0:
                logging.info("Processed %d lines" % line_num)
    output.close()
def read_input(filename): current_train_x = "" current_train_y = "" train_x = [] train_y = [] with open(filename, "r", encoding="utf-8") as raw_data: open_cc = OpenCC("s2t") for line in raw_data: line = line.strip() line = open_cc.convert(line) if line == "==###=title=###==": state = State.SEE_TITLE if current_train_x.strip() and current_train_y.strip(): train_x.append(current_train_x) train_y.append(current_train_y) current_train_x = "" current_train_y = "" elif line == "==###=description=###==": state = State.SEE_DESCRIPTION elif line == "==###=category=###==": state = State.SEE_CATEGORY else: if state == State.SEE_TITLE: current_train_x = line elif state == State.SEE_DESCRIPTION: current_train_x += " " + line elif state == State.SEE_CATEGORY: current_train_y = line if current_train_x.strip() and current_train_y.strip(): train_x.append(current_train_x) train_y.append(current_train_y) current_train_x = "" current_train_y = "" return train_x, train_y
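The State enum referenced above is not defined in this snippet; a minimal sketch of what the parser assumes:

# Hypothetical definition of the section-marker states used by read_input.
from enum import Enum

class State(Enum):
    SEE_TITLE = 1        # lines following ==###=title=###==
    SEE_DESCRIPTION = 2  # lines following ==###=description=###==
    SEE_CATEGORY = 3     # lines following ==###=category=###==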
def zh_t2s(infile, outfile): '''convert the traditional Chinese of infile into the simplified Chinese of outfile''' # read the traditional Chinese file t_corpus = [] with open(infile, 'r', encoding='utf-8') as f: for line in f: line = line.replace('\n', '').replace('\t', '') t_corpus.append(line) logger.info('read traditional file finished!') # convert the t_Chinese to s_Chinese cc = OpenCC('t2s') s_corpus = [] for i, line in zip(range(len(t_corpus)), t_corpus): if i % 1000 == 0: logger.info('convert t2s with the {}/{} line'.format( i, len(t_corpus))) # s_corpus.append(OpenCC.convert(line)) s_corpus.append(cc.convert(line)) logger.info('convert t2s finished!') # write the simplified Chinese into the outfile with open(outfile, 'w', encoding='utf-8') as f: for line in s_corpus: f.writelines(line + '\n') logger.info('write the simplified file finished!')
def strip_wiki_source(wiki_source):
    # Simplified/Traditional converter
    convertor = OpenCC('t2s')
    # matches <...> tags
    label_pattern = '<.+>'
    # matches common Chinese and English punctuation
    punc_pattern = '[“”,。()\(\)·《》::\-\"「」‘’??!!,、;]'
    for count, path in enumerate(wiki_source):
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if line == '\n':
                    continue
                # regex substitutions
                line = re.sub(label_pattern, '', line)
                line = re.sub(punc_pattern, '', line)
                # convert Traditional to Simplified
                simplified_line = convertor.convert(line)
                # opened in append mode, so the output file must start out empty
                output_file = open('wiki_stripped.txt', 'a', encoding='utf-8')
                output_file.write(simplified_line)
                output_file.close()
        print("Finished {} files".format(count))
def preprocess_wiki(input_file, output_file): # Import input file if not os.path.exists(input_file): url = 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2' logging.info('Download Wiki dump from {}'.format(url)) wget.download(url) wiki = WikiCorpus(input_file, lemmatize=False, dictionary=[]) # Convert tradtional Chinese to simplified Chinese using OpenCC cc = OpenCC('t2s') # Segment the sentences into words using Jieba paddle mode jieba.enable_paddle() # Process Wiki text logging.info('Start processing Wiki text') output = open(output_file, 'w') i = 0 for article in tqdm(wiki.get_texts()): raw = ' '.join(article) processed = [] # Remove non-Chinese words for token in list(jieba.cut(cc.convert(raw))): matched = re.findall(r'[\u4e00-\u9fff]+', token) if matched: processed.append(matched[0]) output.write(' '.join(processed) + '\n') i += 1 if (i % 10000 == 0): logging.info('Finished processing {} articles'.format(i)) output.close() logging.info('Done')
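An illustration of the Chinese-only token filter used above; the token list is invented for demonstration.

import re

tokens = ['臺灣', 'Wikipedia', '2024', '自由的', 'encyclopedia']
kept = []
for token in tokens:
    # Keep only tokens that contain CJK ideographs, as in the loop above.
    matched = re.findall(r'[\u4e00-\u9fff]+', token)
    if matched:
        kept.append(matched[0])
print(kept)  # ['臺灣', '自由的']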
def article_HK(self, url_flag=''): if url_flag == "HK_T": dir_path = self.config.get("outputPath") + time.strftime( "%Y-%m-%d") + '/' + time.strftime("%Y-%m-%d") + "-香港民报" elif url_flag == "HK_Y": dir_path = self.config.get("outputPath") + str( getYesterday()) + '/' + str(getYesterday()) + "-香港明报" if not os.path.exists(dir_path): os.makedirs(dir_path) for i, url in enumerate(self.articleList_HK[url_flag]): try: print("HK-" + str(i + 1) + ": " + url) tree_HK = lxml.html.fromstring(get_Html(url, js=True, time=3)) title_HK = tree_HK.cssselect("#blockcontent > hgroup > h1") openCC = OpenCC('hk2s') filename = openCC.convert(title_HK[0].text_content()) + ".txt" if filename in os.listdir(dir_path): print("... 第" + str(i + 1) + "篇文章已存在 ...") continue detail_HK_upper = tree_HK.cssselect("#upper > p") detail_HK_lower = tree_HK.cssselect("#lower > p") self.articleDetail[url_flag]["title"] = title_HK[ 0].text_content() self.articleDetail[url_flag]["url"] = url if (title_HK and detail_HK_upper and detail_HK_upper): detail_HK = "" detail_HK += detail_HK_upper[0].text_content() + "\r\n" for j in range(len(detail_HK_lower)): detail_HK += detail_HK_lower[j].text_content() + "\r\n" self.articleDetail[url_flag]["detail"] = detail_HK save(self.articleDetail, url_flag, dir_path) except Exception as err: print(err) print("... 第" + str(i + 1) + "篇文章解析失败 ...") continue
def convert(self):
    from PyQt5.QtWidgets import QApplication
    count = lineCount(self.fni)
    openCC = OpenCC(self.direction)  # direction of conversion
    fi = open(self.fni, "r", encoding="UTF-8")
    fo = open(self.fno, "w", encoding="UTF-8", newline="\n")
    n = 0
    for line in fi:
        n += 1
        txt = openCC.convert(line)
        fo.write(txt)  # write converted text to output
        # completed = 100 * n / count
        if n % 100 == 0:
            self.window.ui.progressBar.setValue(round(100 * n / count, 0))
            self.window.ui.progressBar.repaint()
            QApplication.processEvents()
            # self.window.update()
    fi.close()
    fo.close()
    self.window.ui.progressBar.setValue(100)
    self.window.ui.progressBar.repaint()
    self.numLineProcessed = n
    return self.numLineProcessed
def get_data(page): datasets = [] try: text = page.text.encode('iso-8859-1').decode('GBK') soup = BeautifulSoup(text, 'lxml') posts = soup.find_all(class_='forumbox postbox') for post in posts: data = {} # 作者id,发帖次数,最后发帖时间 data['uid'] = post.find(class_='author')['href'].split('=')[-1] id = post.find(class_='author')['id'][10:] data['posttime'] = datetime.strptime( post.find(id="postdate" + id).text, "%Y-%m-%d %H:%M") #发帖次数暂定为1,查询后再进行修改 data['postcount'] = 1 datasets.append(data) # 得到帖子内容 content = post.find(id='postcontent' + id).text.strip() content = re.sub(u'\\[quote\\].*?\\[/quote\\]', '', content) content = re.sub(u'\\[b\\].*?\\[/b\\]', '', content) content = re.sub(u'\\[img\\].*?\\[/img\\]', '', content) content = re.sub(u'\\[url\\].*?\\[/url\\]', '', content) content = re.sub(u'\\[size.*?/size\\]', '', content) content = re.sub(u'\\[s:.*?\\]', '', content) content = re.sub(u'\\[.*?del\\]', '', content) content = re.sub(u'\\[.*?list\\]', '', content) content = re.sub(u'\\[.*?collapse\\]', '', content) if len(content) > 0: cc = OpenCC('t2s') content = cc.convert(content) save_content(content) except Exception as e: print("出现异常,错误为:%s" % e) return datasets
def start(): fp = open(filename, "r", encoding="utf-8") soup = BeautifulSoup(fp,"xml") ans = soup.find_all("tuv") print(ans) amountOfData = len(ans)/2 print(amountOfData) temp = {} count = 0 df = pd.DataFrame(columns=["中文", "英文"]) numOfData = 0 englishWord = "" chineseWord = "" for a in ans: print(numOfData) if numOfData == amountOfData: break if a.get("xml:lang") == "en": temp["英文"] = a.get_text() englishWord = a.get_text() count = count + 1 if a.get("xml:lang") == "zh" or a.get("xml:lang") == "zh-tw": cc = OpenCC('s2tw') text = cc.convert(a.get_text()) finalword = "" inBracket = False for letter in text: if letter == '(': inBracket = True continue elif letter == ")": inBracket = False continue if inBracket == False and letter != " ": finalword += letter elif inBracket == True: continue temp["中文"] = finalword chineseWord = finalword count = count + 1 if count == 2: count = 0 if len(chineseWord) == 0 or len(englishWord) == 0: temp.clear() amountOfData = amountOfData -1 continue if chineseWord[0] == englishWord[0]: chineseWord = "" englishWord = "" amountOfData = amountOfData - 1 temp.clear() continue else: print(chineseWord) print(englishWord) df = df.append(temp,ignore_index=True) chineseWord = "" numOfData = numOfData + 1 print(df.to_string()) return df
async def on_message(msg): if msg.author == self.bot.user: # this is to prevent crashing via infinite loops return msg_content = msg.content msg_channel = msg.channel cc = OpenCC('s2tw') tw_ch = cc.convert(msg_content) sticker_res = self.sticker_db_operation.get_sticker_random( msg_content) if sticker_res is not None: img_url = sticker_res[0] local_save = sticker_res[1] is_gif = sticker_res[2] if self.save_image_local and local_save != '': img_url = self.sticker_url + 'sticker-image/' + local_save """ #old method if is_gif: await msg_channel.send(img_url) else: self.com_image_em.set_image(url=img_url) await msg_channel.send(embed=self.com_image_em) """ await msg_channel.send(img_url) await self.bot.process_commands(msg)
def convert_to_sim(source, target):
    opencc = OpenCC('hk2s')  # Traditional Chinese (Hong Kong) -> Simplified
    with open(target, 'w') as t:
        with open(source, 'r') as f:
            for line in tqdm(f):
                simple = opencc.convert(line)
                t.write(simple)
def dataCleaning(id): openCC = OpenCC('tw2s') params = {'s': id} try: r = requests.get(GET_AC_GAMER_URL, timeout=100, params=params) except TimeoutError: return None r.encoding = 'utf-8' soup = BeautifulSoup(r.text, features='lxml') platform = soup.find('a', {'class': 'link2'}) if not platform: return None if platform.text == 'Nintendo Switch ( NS )': tw_name = soup.find('h1').text cn_name = openCC.convert(str(tw_name)) jp_name = soup.find_all('h2')[0].text eu_name = soup.find_all('h2')[1].text names = { 'tw_name': tw_name, 'cn_name': cn_name, 'jp_name': jp_name, 'eu_name': eu_name } name_collection.update({'tw_name': tw_name}, names, upsert=True) return None
def test_unicode_zht2zhs(self): c = OpenCC('zht2zhs.ini') self.assertEqual(c.convert(u'开放中文转换'), u'開放中文轉換') c.close()
#!/usr/bin/env python3 # -*- coding: utf-8 -*- import sys from opencc import OpenCC if __name__ == '__main__': if sys.version_info[0] < 3: print('Require Python3 to run') sys.exit(0) openCC = OpenCC() openCC.set_conversion('s2twp') # openCC = OpenCC('s2twp') words = '鼠标是一种很常見及常用的電腦输入设备,它可以对当前屏幕上的游标进行定位,并通过按键和滚轮装置对游标所经过位置的' \ '屏幕元素进行操作。鼠标的鼻祖於1968年出现。美国科学家道格拉斯·恩格尔巴特(Douglas Englebart)在加利福尼亚制作了' \ '第一只鼠标。' result = openCC.convert(words) print("{} \n\n==> \n\n{}".format(words, result))
def test_convert_text(self): c = OpenCC('zhs2zht.ini') try: c.convert(3) except TypeError, e: self.assertEqual(e.message, 'TypeError: must be string or buffer.')
def test_base_zhs2zht(self): c = OpenCC('zhs2zht.ini') self.assertEqual(c.convert('开放中文转换'), '開放中文轉換') c.close()
class RimeStyle:
    font_face = "MingLiu"
    candidate_format = "{0} {1}"
    inline_preedit = "false"
    menu_opencc = None
    font_point = 20
    candidate_per_row = 1
    inline_code = False
    display_tray_icon = False
    candidate_use_cursor = False
    soft_cursor = False
    menu = []
    options = []
    options_states = []
    schemas = []
    uris = []
    session_id = None

    def __init__(self, appname, session_id):
        self.session_id = session_id
        config = RimeConfig()
        if not rime.config_open(appname.encode("UTF-8"), config):
            return
        self.font_face = rimeGetString(config, 'style/font_face')
        self.candidate_format = rimeGetString(config, 'style/candidate_format')
        self.inline_preedit = rimeGetString(config, 'style/inline_preedit')
        menu_opencc_config = rimeGetString(config, 'style/menu_opencc')
        self.menu_opencc = OpenCC(menu_opencc_config) if menu_opencc_config else None
        value = c_int()
        if rime.config_get_int(config, b'style/font_point', value):
            self.font_point = value.value
        if rime.config_get_bool(config, b'style/horizontal', value):
            self.candidate_per_row = 10 if bool(value) else 1
        if rime.config_get_int(config, b'style/candidate_per_row', value):
            self.candidate_per_row = value.value
        if rime.config_get_bool(config, b'style/display_tray_icon', value):
            self.display_tray_icon = bool(value)
        if rime.config_get_bool(config, b'style/candidate_use_cursor', value):
            self.candidate_use_cursor = bool(value)
        if rime.config_get_bool(config, b'style/soft_cursor', value):
            self.soft_cursor = bool(value)
        self.options.clear()
        self.options_states.clear()
        self.uris.clear()
        self.menu = self.config_get_menu(config, b'menu')
        #print("menu", self.menu)
        rime.config_close(config)

    def get_schema(self, commandId):
        if commandId >= ID_SCHEMA:
            return self.schemas[commandId - ID_SCHEMA]

    def get_option(self, commandId):
        if commandId >= ID_OPTION:
            return self.options[commandId - ID_OPTION]

    def get_uri(self, commandId):
        if commandId >= ID_URI:
            return self.uris[commandId - ID_URI]

    def get_schema_list(self):
        schema_list = RimeSchemaList()
        self.schemas = []
        submenu = []
        current_schema = bytes(CHAR_SIZE)
        rime.get_current_schema(self.session_id, current_schema, CHAR_SIZE)
        current_schema_id = current_schema.rstrip(b'\0')
        if rime.get_schema_list(schema_list):
            n = schema_list.size
            for i in range(n):
                schema_id = schema_list.list[i].schema_id
                name = schema_list.list[i].name.decode("UTF-8")
                if self.menu_opencc:
                    name = self.menu_opencc.convert(name)
                self.schemas.append(schema_id)
                d = {'text': name, 'id': ID_SCHEMA + i}
                if schema_id == current_schema_id:
                    d["checked"] = True
                submenu.append(d)
        rime.free_schema_list(schema_list)
        return submenu

    def config_get_menu(self, config, path):
        menu = []
        iterator = RimeConfigIterator()
        if not rime.config_begin_list(iterator, config, path):
            return
        while rime.config_next(iterator):
            d = {}
            name = rime.config_get_cstring(config, iterator.path + b'/name')
            command = rime.config_get_cstring(config, iterator.path + b'/command')
            uri = rime.config_get_cstring(config, iterator.path + b'/uri')
            text = rime.config_get_cstring(config, iterator.path + b'/text')
            if command:
                d["id"] = commands.get(command.decode("UTF-8"), 0)
                if ID_SCHEMA_LIST == d["id"]:
                    d["submenu"] = self.get_schema_list()
                elif ID_SYNC_DIR == d["id"]:
                    d["enabled"] = os.path.isdir(rime.get_sync_dir().decode(ENC))
            elif uri:
                d["id"] = ID_URI + len(self.uris)
                self.uris.append(uri.decode("UTF-8"))
            elif name:
                states = [rime.config_get_cstring(config, iterator.path + b'/states/@0').decode("UTF-8"),
                          rime.config_get_cstring(config, iterator.path + b'/states/@1').decode("UTF-8")]
                d["id"] = ID_OPTION + len(self.options)
                state_id = rime.get_option(self.session_id, name)
                d["text"] = "%s → %s" % (states[state_id], states[1 - state_id])
                self.options_states.append(states)
                self.options.append(name)
            if text:
                d["text"] = text.decode("UTF-8")
                if self.menu_opencc:
                    d["text"] = self.menu_opencc.convert(d["text"])
            submenu = self.config_get_menu(config, iterator.path + b'/submenu')
            if submenu:
                d["submenu"] = submenu
            menu.append(d)
        rime.config_end(iterator)
        return menu