def crawler(self):
    cc = OpenCC('s2tw')  # simplified -> traditional (Taiwan)
    num = 0
    page = 0  # start from offset 0; each page advances by 100
    while num != self.num_of_news:
        url = 'https://cn.nytimes.com/search/data/'
        params = {
            'query': self.keyword,
            'lang': '',
            'dt': 'json',
            'from': str(page),
            'size': '100'
        }
        response = requests.get(url, params=params)
        news = json.loads(response.text)
        # stop once every available result has been fetched
        if num >= news['total']:
            print('===== no results on this page =====')
            break
        else:
            # collect the headline, publication date and article URL
            for i in news['items']:
                self.title.append(cc.convert(i['headline']))
                self.time.append(i['publication_date'].replace(" ", ""))
                self.url.append(i['web_url_with_host'] + 'zh-hant/')  # traditional-Chinese edition
                self.content.append(
                    self.get_content(i['web_url_with_host'] + 'zh-hant/'))
                num += 1
                print(f'Articles downloaded: {num}')
                if num == self.num_of_news:
                    break
            else:
                page += 100  # next page (for-else: runs only when the page was exhausted)
def article_HK(self, url_flag=''):
    if url_flag == "HK_T":
        dir_path = self.config.get("outputPath") + time.strftime(
            "%Y-%m-%d") + '/' + time.strftime("%Y-%m-%d") + "-香港明报"
    elif url_flag == "HK_Y":
        dir_path = self.config.get("outputPath") + str(
            getYesterday()) + '/' + str(getYesterday()) + "-香港明报"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    for i, url in enumerate(self.articleList_HK[url_flag]):
        try:
            print("HK-" + str(i + 1) + ": " + url)
            tree_HK = lxml.html.fromstring(get_Html(url, js=True, time=3))
            title_HK = tree_HK.cssselect("#blockcontent > hgroup > h1")
            openCC = OpenCC('hk2s')
            filename = openCC.convert(title_HK[0].text_content()) + ".txt"
            if filename in os.listdir(dir_path):
                print("... article " + str(i + 1) + " already exists ...")
                continue
            detail_HK_upper = tree_HK.cssselect("#upper > p")
            detail_HK_lower = tree_HK.cssselect("#lower > p")
            self.articleDetail[url_flag]["title"] = title_HK[0].text_content()
            self.articleDetail[url_flag]["url"] = url
            if title_HK and detail_HK_upper and detail_HK_lower:
                detail_HK = ""
                detail_HK += detail_HK_upper[0].text_content() + "\r\n"
                for j in range(len(detail_HK_lower)):
                    detail_HK += detail_HK_lower[j].text_content() + "\r\n"
                self.articleDetail[url_flag]["detail"] = detail_HK
                save(self.articleDetail, url_flag, dir_path)
        except Exception as err:
            print(err)
            print("... failed to parse article " + str(i + 1) + " ...")
            continue
def get(self, request):
    name = request.GET.get('q', '')
    print(name)
    cc = OpenCC('s2t')
    name = cc.convert(name)
    queryset = []  # GET.get() defaults to '', so guard against an empty query
    if name:
        queryset1 = Yao.objects.filter(responses__icontains=name)
        queryset2 = Yao.objects.filter(properties__icontains=name)
        # merged = queryset1 + queryset2
        queryset = list(set(list(chain(queryset1, queryset2))))
        for ele in queryset:
            ele.properties = ele.properties.replace('【', '\n\n【').replace(
                '】', '】\n') + "\n\n\n"
            ele.properties = ele.properties.replace('<li>', '').replace(
                '</li>', '').replace('<ul>', '').replace('</ul>',
                                                         '').replace('<p', '')
            ele.responses = ele.responses.replace('<li>', '').replace(
                '</li>', '').replace('<ul>', '').replace('</ul>',
                                                         '').replace('<p', '')
            ele.responses = ele.responses.replace('【', '\n\n【').replace(
                '】', '】\n') + "\n\n\n"
            ele.responses = ele.responses.replace(name,
                                                  '<mg>' + name + '</mg>')
            ele.properties = ele.properties.replace(
                name, '<mg>' + name + '</mg>')
    return Response({'yaos': queryset})
def xml_to_json():
    """
    <step1>
    1. Convert simplified to traditional Chinese
    2. Convert XML to JSON
    3. Convert full-width punctuation to half-width
    """
    openCC = OpenCC('s2t')  # simplified -> traditional
    tree = ET.parse('./corpus/corpus.xml')
    root = tree.getroot()
    output_list = []
    c = 0
    nothing = 0
    for doc in root.findall('doc'):
        c += 1
        if c % 10000 == 0:
            print('---- progress: %d ----' % c)
        output_dict = {}
        content = doc.find('content').text
        title = doc.find('contenttitle').text
        if content and title:
            output_dict['abstract'] = openCC.convert(_full_to_half(title))
            output_dict['article'] = openCC.convert(_full_to_half(content))
            output_list.append(output_dict)
        else:
            nothing += 1
            if nothing % 1000 == 0:
                print('empty records: %d' % nothing)
    with open('corpus/corpus_1.json', 'w', encoding='utf-8') as wf:
        # ensure_ascii=False keeps the Chinese text readable in the output
        json.dump(output_list, wf, ensure_ascii=False)
def convertFile(srcPath, destPath, progressCallback=None):
    """Convert the given file into traditional Chinese and save to destPath."""
    openCC = OpenCC('s2twp')

    # guess the encoding from the first 500 bytes
    with open(srcPath, 'rb') as rawf:
        rawdata = rawf.read(500)
    result = chardet.detect(rawdata)
    charenc = result['encoding']

    # count the total lines
    with open(srcPath, 'r', encoding=charenc, errors='ignore') as srcf:
        totalLines = sum(1 for _ in srcf)

    # convert the file content
    prevProgress = 0
    with open(srcPath, 'r', encoding=charenc, errors='ignore') as srcf:
        with open(destPath, 'w', encoding='utf-8') as destf:
            for j, line in enumerate(srcf):
                destf.write(openCC.convert(line))
                # report the conversion progress to the caller
                if progressCallback:
                    currProgress = j / totalLines
                    if currProgress - prevProgress >= 0.01:
                        progressCallback(currProgress)
                        prevProgress = currProgress
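# Minimal usage sketch for convertFile (the paths and the callback are
# illustrative, not part of the original code):
def report_progress(progress):
    print('converted {:.0%}'.format(progress))

convertFile('book_cn.txt', 'book_tw.txt', progressCallback=report_progress)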
def convert(self):
    from PyQt5.QtWidgets import QApplication
    count = lineCount(self.fni)
    openCC = OpenCC(self.direction)  # direction of conversion
    fi = open(self.fni, "r", encoding="UTF-8")
    fo = open(self.fno, "w", encoding="UTF-8", newline="\n")
    n = 0
    for line in fi:
        n += 1
        txt = openCC.convert(line)
        fo.write(txt)  # write converted text to output
        # completed = 100 * n / count
        if n % 100 == 0:
            # setValue() expects an int, so round without an ndigits argument
            self.window.ui.progressBar.setValue(round(100 * n / count))
            self.window.ui.progressBar.repaint()
            QApplication.processEvents()
            # self.window.update()
    fi.close()
    fo.close()
    self.window.ui.progressBar.setValue(100)
    self.window.ui.progressBar.repaint()
    self.numLineProcessed = n
    return self.numLineProcessed
def getShortWord():
    oc = OpenCC(conversion='s2twp')  # e.g. "出租车" -> "計程車" (phrase-level conversion)
    out_file_path = os.path.join(traditionalChinese_out_dir, "shortcut.txt")
    word_shortcut = []
    # read single words; file format: word  frequency  1 (traditional)/0 (simplified)  pinyin
    with open(traditionalChineseSinglewordPath, 'r',
              encoding='utf-16') as simplifiedChineseSingleWord_file:
        for line in simplifiedChineseSingleWord_file:
            items = line.strip().split(" ")
            jianti = items[0].strip()
            fanti = oc.convert(jianti)
            # pinyin = items[3].strip()
            begin = line.index(items[3])
            end = line.index("\n")
            pinyin = str(line[begin:end])
            # for i in range(3, len(items)):
            #     pinyin = seq.join(items[i])
            #     pinyin.append("".join([items[i]]))
            res_line = (pinyin + "\t" + isNotAWord + "\t" + items[1] + "\t" +
                        fanti + "\t" + freq2 + "\n")
            word_shortcut.append(res_line)
    print("word size:", str(len(word_shortcut)))
    return word_shortcut
def strip_wiki_source(wiki_source):
    # traditional -> simplified converter
    convertor = OpenCC('t2s')
    # match <...> tags (non-greedy, so separate tags on one line are not merged)
    label_pattern = '<.+?>'
    # match common Chinese and English punctuation
    punc_pattern = '[“”,。()\(\)·《》::\-\"「」‘’??!!,、;]'
    for count, path in enumerate(wiki_source, 1):
        with open(path, 'r', encoding='utf-8') as f, \
                open('wiki_stripped.txt', 'a', encoding='utf-8') as output_file:
            # appending, so make sure wiki_stripped.txt starts out empty
            for line in f:
                if line == '\n':
                    continue
                # strip tags and punctuation via regex
                line = re.sub(label_pattern, '', line)
                line = re.sub(punc_pattern, '', line)
                # convert traditional to simplified
                output_file.write(convertor.convert(line))
        print("Finished {} files".format(count))
async def on_message(msg):
    if msg.author == self.bot.user:
        # this is to prevent crashing via infinite loops
        return
    msg_content = msg.content
    msg_channel = msg.channel
    cc = OpenCC('s2tw')
    tw_ch = cc.convert(msg_content)
    sticker_res = self.sticker_db_operation.get_sticker_random(msg_content)
    if sticker_res is not None:
        img_url = sticker_res[0]
        local_save = sticker_res[1]
        is_gif = sticker_res[2]
        if self.save_image_local and local_save != '':
            img_url = self.sticker_url + 'sticker-image/' + local_save
        """
        # old method
        if is_gif:
            await msg_channel.send(img_url)
        else:
            self.com_image_em.set_image(url=img_url)
            await msg_channel.send(embed=self.com_image_em)
        """
        await msg_channel.send(img_url)
    await self.bot.process_commands(msg)
def start():
    fp = open(filename, "r", encoding="utf-8")
    soup = BeautifulSoup(fp, "xml")
    ans = soup.find_all("tuv")
    print(ans)
    amountOfData = len(ans) / 2
    print(amountOfData)
    temp = {}
    count = 0
    df = pd.DataFrame(columns=["中文", "英文"])
    numOfData = 0
    englishWord = ""
    chineseWord = ""
    cc = OpenCC('s2tw')  # build the converter once instead of per segment
    for a in ans:
        print(numOfData)
        if numOfData == amountOfData:
            break
        if a.get("xml:lang") == "en":
            temp["英文"] = a.get_text()
            englishWord = a.get_text()
            count = count + 1
        if a.get("xml:lang") == "zh" or a.get("xml:lang") == "zh-tw":
            text = cc.convert(a.get_text())
            # drop bracketed annotations and spaces
            finalword = ""
            inBracket = False
            for letter in text:
                if letter == '(':
                    inBracket = True
                    continue
                elif letter == ")":
                    inBracket = False
                    continue
                if not inBracket and letter != " ":
                    finalword += letter
            temp["中文"] = finalword
            chineseWord = finalword
            count = count + 1
        if count == 2:
            count = 0
            if len(chineseWord) == 0 or len(englishWord) == 0:
                temp.clear()
                amountOfData = amountOfData - 1
                continue
            if chineseWord[0] == englishWord[0]:
                chineseWord = ""
                englishWord = ""
                amountOfData = amountOfData - 1
                temp.clear()
                continue
            else:
                print(chineseWord)
                print(englishWord)
                # DataFrame.append() was removed in pandas 2.0; concatenate a one-row frame instead
                df = pd.concat([df, pd.DataFrame([temp])], ignore_index=True)
                chineseWord = ""
                numOfData = numOfData + 1
    print(df.to_string())
    return df
def _search(self, args, offset=0, limit=None, order=None, count=False,
            access_rights_uid=None):
    """
    :return: a list of record ids or an integer (if count is True)
    """
    available_modules = ['s2t.json', 't2s.json']
    multi_record_ids = []
    for m in available_modules:
        new_args = []
        for arg in args:
            if len(arg) > 1 and type(arg[2]) is str:
                list_arg = list(arg)
                list_arg[2] = OpenCC(m).convert(arg[2])
                new_args.append(tuple(list_arg))
            else:
                new_args.append(arg)
        record_ids = super(BaseModelExtend, self)._search(
            new_args, offset=offset, limit=limit, order=order, count=count,
            access_rights_uid=access_rights_uid)
        if type(record_ids) is list:
            multi_record_ids += record_ids
        else:
            return record_ids
    # Don't use list(set(multi_record_ids)), because we'd like to preserve
    # the ordering of the list for dependency.
    return list(dict.fromkeys(multi_record_ids))
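# Illustration (the model and search term are hypothetical): with the
# override above, an Odoo domain such as [('name', 'ilike', '软件')] is
# searched once per conversion table, so it also matches records that store
# the traditional form '軟體'; the union of ids is deduplicated while
# preserving order via dict.fromkeys().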
def preprocess_wiki_corpus(path_data_in=None, path_data_out=None):
    if path_data_in is None:
        corpus_path = path_wiki + u'zhwiki-latest-pages-articles.xml.bz2'
        # corpus_path = path_wiki + u'enwiki-latest-pages-articles.xml.bz2'
    else:
        corpus_path = path_data_in
    if path_data_out is None:
        corpus_processed_path = path_wiki + 'corpus_wiki.txt'
    else:
        corpus_processed_path = path_data_out
    cc = OpenCC('t2s')
    count = 0
    with open(corpus_processed_path, 'w', encoding='utf-8') as corpus_processed:
        corpus = WikiCorpus(corpus_path, lemmatize=False, dictionary={})
        for doc in corpus.get_texts():
            doc_new = [
                ' '.join(jieba.cut(cc.convert(sent), cut_all=False))
                for sent in doc
            ]
            # doc_new = series(doc).apply(lambda x: ' '.join(jieba.cut(cc.convert(x), cut_all=False)))
            corpus_processed.write(' '.join(doc_new) + "\n")
            count += 1
            if count % 100 == 0:
                logging.warning('Saved ' + str(count) + ' articles')
            if flag_test and count == 200:
                return
    return
def onclick(self):
    year = int("0{}".format(self.yearInput.text()))
    month = int("0{}".format(self.monthInput.text()))
    day = int("0{}".format(self.dayInput.text()))
    hour = int("0{}".format(self.hourInput.text()))
    minutes = int("0{}".format(self.minutesInput.text()))
    second = int("0{}".format(self.secondInput.text()))
    shengNian = int("0{}".format(self.shengNianInput.text()))
    月将 = DiZHiList[self.yueJiang.currentIndex()]
    占时 = DiZHiList[self.zhanShi.currentIndex()]
    昼占 = True
    __占测的事 = self.占测的事Input.text()
    if self.zhouZhan.currentIndex() == 1:
        昼占 = False
    命局 = False
    if self.mingJu.currentIndex() == 1:
        命局 = True
    性别 = self.sex.checkedId()
    cc = OpenCC('s2t')
    if 命局:
        sq = MinGPan(year, month, day, hour, minutes, second, 月将, 占时, 昼占,
                     __占测的事, 性别, shengNian)
        # pdb.set_trace()
    else:
        sq = ShiPan(year, month, day, hour, minutes, second, 月将, 占时, 昼占,
                    __占测的事, 性别, shengNian)
        # pdb.set_trace()
    sqhtml = cc.convert(sq.toHml)
    # sqhtml = sq.toHml
    # undo a few s2t over-conversions of characters used as proper names here
    sqhtml = sqhtml.replace('後', '后')
    sqhtml = sqhtml.replace('佔', '占')
    sqhtml = sqhtml.replace('醜', '丑')
    self.textBrowser.setHtml(sqhtml)
    self.shiPan = sq
def convert_s2t(str_input):
    """Convert simplified to traditional Chinese, mapping mainland terms to Taiwan usage."""
    cc = OpenCC('s2twp')  # s2twp: simplified -> traditional (Taiwan) with phrase conversion
    return cc.convert(str_input)
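# Quick check of the phrase-level conversion (the sample strings are
# illustrative): 's2twp' maps mainland terms to their Taiwan equivalents.
print(convert_s2t('鼠标和软件'))  # -> 滑鼠和軟體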
def getStockInfo(stock):
    cc = OpenCC('s2t')
    stockCode = stock.split('.')[0]
    s = '{:05d}'.format(int(stockCode))
    session = HTMLSession()
    session.browser  # force lazy browser initialisation
    url = 'http://stock.finance.sina.com.cn/hkstock/quotes/{0}.html'.format(s)
    r = session.get(url)
    r.html.render()
    name = r.html.find('#stock_cname', first=True)
    price = r.html.find('#mts_stock_hk_price', first=True)
    stockQuan = r.html.find('div.deta03', first=True).find('ul')[1].find('li')[3]
    news = r.html.find('#js_ggzx', first=True).find('a')
    # print("{0} ({1})".format(cc.convert(name.text), s))
    # print(price.text)
    # print(cc.convert(stockQuan.text))
    for new in news:
        print(new.text, new.links)
    stock_dict = {
        'stock_name': cc.convert(name.text),
        'stock_code': s,
        'stock_price': price.text,
        'stock_quan': cc.convert(stockQuan.text),
        'news': news
    }
    return stock_dict
def preprocessing():
    res = []
    i = 0
    converter = OpenCC('t2s')  # convert to simplified Chinese
    # nlp = StanfordCoreNLP(r'/home/yuyi/stanford-corenlp-full-2018-02-27', lang='zh')
    # gensim's WikiCorpus class handles the Wikipedia dump
    wiki = WikiCorpus('/home/yuyi/zhwiki-latest-pages-articles.xml.bz2',
                      lemmatize=False, dictionary=[])
    # get_texts() yields each article as one text, punctuation already removed
    for text in wiki.get_texts():
        # convert the whole article at once; per-character conversion would
        # miss OpenCC's multi-character mappings
        cleaned = converter.convert(''.join(text))
        if len(cleaned):
            sentence = list(jieba.cut(cleaned))
            res.append(sentence)
            i = i + 1
            if (i % 1000) == 0:
                # if i == 10:
                print("Saved " + str(i) + " articles.")
                # break
    # pickle needs a binary file handle in Python 3
    with open('wiki_zh.pkl', 'wb') as f:
        pickle.dump(res, f)
    print("Finished saving " + str(i) + " articles.")
def preprocess_wiki(input_file, output_file):
    # Download the Wiki dump if the input file is missing
    if not os.path.exists(input_file):
        url = 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        logging.info('Download Wiki dump from {}'.format(url))
        wget.download(url)
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary=[])

    # Convert traditional Chinese to simplified Chinese using OpenCC
    cc = OpenCC('t2s')
    # Segment the sentences into words using Jieba paddle mode
    jieba.enable_paddle()

    # Process Wiki text
    logging.info('Start processing Wiki text')
    output = open(output_file, 'w', encoding='utf-8')
    i = 0
    for article in tqdm(wiki.get_texts()):
        raw = ' '.join(article)
        processed = []
        # Remove non-Chinese words
        for token in list(jieba.cut(cc.convert(raw))):
            matched = re.findall(r'[\u4e00-\u9fff]+', token)
            if matched:
                processed.append(matched[0])
        output.write(' '.join(processed) + '\n')
        i += 1
        if i % 10000 == 0:
            logging.info('Finished processing {} articles'.format(i))
    output.close()
    logging.info('Done')
def zh_t2s(infile, outfile):
    '''Convert the traditional Chinese of infile into the simplified Chinese of outfile.'''
    # read the traditional Chinese file
    t_corpus = []
    with open(infile, 'r', encoding='utf-8') as f:
        for line in f:
            t_corpus.append(line.replace('\n', '').replace('\t', ''))
    logger.info('read traditional file finished!')

    # convert the traditional Chinese to simplified Chinese
    cc = OpenCC('t2s')
    s_corpus = []
    for i, line in enumerate(t_corpus):
        if i % 1000 == 0:
            logger.info('convert t2s with the {}/{} line'.format(
                i, len(t_corpus)))
        s_corpus.append(cc.convert(line))
    logger.info('convert t2s finished!')

    # write the simplified Chinese into the outfile
    with open(outfile, 'w', encoding='utf-8') as f:
        for line in s_corpus:
            f.write(line + '\n')
    logger.info('write the simplified file finished!')
def __init__(self):
    APP_ID = '10508840'  # your App ID
    API_KEY = 'W9BwLsLvlPQvD9LsfWIBGX28'  # your API Key
    SECRET_KEY = 'd4wSFFDKm0VjGrPZVxWpZyGfAFYuD3AX'  # your Secret Key
    self.db = Mysql_DB()
    self.aip = AipNlp(APP_ID, API_KEY, SECRET_KEY)
    self.trans = OpenCC('t2s')  # traditional -> simplified
def __init__(self, language, save_path):
    assert language.lower() in ["en", "zh"], \
        'WikicorpusTextFormatting is not implemented for language %s yet.' % language
    self.language = language.lower()
    self.download_urls = {
        'en': 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
        'zh': 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
    }
    self.downloaded_files = {
        'en': 'wikicorpus_en.xml.bz2',
        'zh': 'wikicorpus_zh.xml.bz2'
    }
    self.chinese_coneverter = OpenCC('t2s')
    self.save_path = save_path + '/wikicorpus_' + language
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
    self.formatted_file = os.path.join(self.save_path, "wiki_formatted.txt")
    self.download()
    self.merge()
def __init__(self, appname, session_id):
    self.session_id = session_id
    config = RimeConfig()
    if not rime.config_open(appname.encode("UTF-8"), config):
        return
    self.font_face = rimeGetString(config, 'style/font_face')
    self.candidate_format = rimeGetString(config, 'style/candidate_format')
    self.inline_preedit = rimeGetString(config, 'style/inline_preedit')
    menu_opencc_config = rimeGetString(config, 'style/menu_opencc')
    self.menu_opencc = OpenCC(menu_opencc_config) if menu_opencc_config else None
    value = c_int()
    if rime.config_get_int(config, b'style/font_point', value):
        self.font_point = value.value
    if rime.config_get_bool(config, b'style/horizontal', value):
        self.candidate_per_row = 10 if bool(value) else 1
    if rime.config_get_int(config, b'style/candidate_per_row', value):
        self.candidate_per_row = value.value
    if rime.config_get_bool(config, b'style/display_tray_icon', value):
        self.display_tray_icon = bool(value)
    if rime.config_get_bool(config, b'style/candidate_use_cursor', value):
        self.candidate_use_cursor = bool(value)
    if rime.config_get_bool(config, b'style/soft_cursor', value):
        self.soft_cursor = bool(value)
    self.options.clear()
    self.options_states.clear()
    self.uris.clear()
    self.menu = self.config_get_menu(config, b'menu')
    # print("menu", self.menu)
    rime.config_close(config)
def cal_sim(str1, str2, stop_words, w2v, dictionary, new_dictionary, tfidf):
    cc = OpenCC("t2s")
    s1 = jieba.cut(str1, cut_all=False)
    s2 = jieba.cut(str2, cut_all=False)
    # normalise and filter both token streams
    data1 = pre_step(s1, cc, stop_words)
    data2 = pre_step(s2, cc, stop_words)
    # drop tokens missing from the dictionary or the word2vec vocabulary
    a1 = []
    a2 = []
    for item in data1:
        if (item not in dictionary.token2id) or (item not in w2v.vocab):
            a1.append(item)
    for item in data2:
        if (item not in dictionary.token2id) or (item not in w2v.vocab):
            a2.append(item)
    data1 = [i for i in data1 if i not in a1]
    data2 = [i for i in data2 if i not in a2]
    vec1 = dictionary.doc2bow(data1)  # vec1: [(token id, count), ...]
    vec2 = dictionary.doc2bow(data2)
    tf1 = tfidf[vec1]  # tf1: [(token id, weight), ...]
    tf2 = tfidf[vec2]
    return word2vec(tf1, tf2, data1, data2, new_dictionary, w2v)
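# pre_step() is referenced above but not defined in this snippet; a minimal
# sketch of what it presumably does (hypothetical implementation, assuming
# stop_words is a set of simplified-Chinese words):
def pre_step(tokens, cc, stop_words):
    """Convert each token to simplified Chinese and drop stop words."""
    converted = (cc.convert(t.strip()) for t in tokens)
    return [t for t in converted if t and t not in stop_words]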
def sortData(self, tagResults):
    cc = OpenCC('s2t')
    # resultForTotalSize = self.getCallResult(1, 1, None)
    dataForRakuten = {}
    sortedData = []
    for record in tagResults:
        extend_data = []
        customer_info = []
        record_session_id = record['session_id']
        # fetch the ASR result -- start
        asr_result = self.getChatRecords(record_session_id)
        if asr_result['status'] != 0:
            return ("Call api: " + const.GET_ASR_RESULT_API +
                    ", got wrong status: " + str(asr_result['status']) +
                    ", message: " + asr_result['message'])
        record['asr_result'] = json.loads(
            cc.convert(json.dumps(asr_result['result']['data'])))
        # fetch the ASR result -- end
        log.debug(record['extend_data'])
        for key, value in record['extend_data'].items():
            regex = re.search('^\\*(.+)', key)
            if regex:
                extend_data.append({
                    "session_id": record_session_id,
                    "col_name": cc.convert(regex.group(1)),
                    "value": cc.convert(value)
                })
            else:
                customer_info.append({
                    "session_id": record_session_id,
                    "col_name": cc.convert(key),
                    "value": cc.convert(value)
                })
        record['extend_data'] = extend_data
        record['customer_info'] = customer_info
        sortedData.append(record)
    dataForRakuten['data'] = sortedData
    return dataForRakuten
def __init__(self):
    APP_ID = '10362966'  # your App ID
    API_KEY = 'nQWiWR6DzjXsfYjW1yyVy8TB'  # your API Key
    SECRET_KEY = 'WpjMdNWYv6TSg2psofaGt4LNW366tvnj'  # your Secret Key
    self.db = Mysql_DB()
    self.aip = AipNlp(APP_ID, API_KEY, SECRET_KEY)
    self.trans = OpenCC('t2s')  # traditional -> simplified
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--input', metavar='<file>',
                        help='Read original text from <file>.')
    parser.add_argument('-o', '--output', metavar='<file>',
                        help='Write converted text to <file>.')
    parser.add_argument('-c', '--config', metavar='<conversion>',
                        help='Conversion')
    parser.add_argument('--in-enc', metavar='<encoding>', default='UTF-8',
                        help='Encoding for input')
    parser.add_argument('--out-enc', metavar='<encoding>', default='UTF-8',
                        help='Encoding for output')
    args = parser.parse_args()

    if args.config is None:
        print("Please specify a conversion.", file=sys.stderr)
        return 1

    cc = OpenCC(args.config)
    # fall back to stdin (fd 0) / stdout (fd 1) when no file is given
    with io.open(args.input if args.input else 0, encoding=args.in_enc) as f:
        input_str = f.read()
    output_str = cc.convert(input_str)
    with io.open(args.output if args.output else 1, 'w',
                 encoding=args.out_enc) as f:
        f.write(output_str)
    return 0
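# Hedged usage sketch (the script name and file names are illustrative):
# adding an entry point such as
#     if __name__ == '__main__':
#         sys.exit(main())
# lets the converter run from the shell, reading stdin when -i is omitted:
#     python zhconv.py -c s2tw -i simplified.txt -o traditional.txt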
def read_input(filename):
    current_train_x = ""
    current_train_y = ""
    train_x = []
    train_y = []
    with open(filename, "r", encoding="utf-8") as raw_data:
        open_cc = OpenCC("s2t")
        state = None  # guard against files that do not start with a marker
        for line in raw_data:
            line = line.strip()
            line = open_cc.convert(line)
            if line == "==###=title=###==":
                state = State.SEE_TITLE
                if current_train_x.strip() and current_train_y.strip():
                    train_x.append(current_train_x)
                    train_y.append(current_train_y)
                current_train_x = ""
                current_train_y = ""
            elif line == "==###=description=###==":
                state = State.SEE_DESCRIPTION
            elif line == "==###=category=###==":
                state = State.SEE_CATEGORY
            else:
                if state == State.SEE_TITLE:
                    current_train_x = line
                elif state == State.SEE_DESCRIPTION:
                    current_train_x += " " + line
                elif state == State.SEE_CATEGORY:
                    current_train_y = line
    # flush the final record
    if current_train_x.strip() and current_train_y.strip():
        train_x.append(current_train_x)
        train_y.append(current_train_y)
        current_train_x = ""
        current_train_y = ""
    return train_x, train_y
def get_data(page):
    datasets = []
    try:
        text = page.text.encode('iso-8859-1').decode('GBK')
        soup = BeautifulSoup(text, 'lxml')
        posts = soup.find_all(class_='forumbox postbox')
        for post in posts:
            data = {}
            # author id, post count, time of the last post
            data['uid'] = post.find(class_='author')['href'].split('=')[-1]
            id = post.find(class_='author')['id'][10:]
            data['posttime'] = datetime.strptime(
                post.find(id="postdate" + id).text, "%Y-%m-%d %H:%M")
            # post count defaults to 1; adjusted after a later lookup
            data['postcount'] = 1
            datasets.append(data)
            # extract the post body and strip BBCode markup
            content = post.find(id='postcontent' + id).text.strip()
            content = re.sub(u'\\[quote\\].*?\\[/quote\\]', '', content)
            content = re.sub(u'\\[b\\].*?\\[/b\\]', '', content)
            content = re.sub(u'\\[img\\].*?\\[/img\\]', '', content)
            content = re.sub(u'\\[url\\].*?\\[/url\\]', '', content)
            content = re.sub(u'\\[size.*?/size\\]', '', content)
            content = re.sub(u'\\[s:.*?\\]', '', content)
            content = re.sub(u'\\[.*?del\\]', '', content)
            content = re.sub(u'\\[.*?list\\]', '', content)
            content = re.sub(u'\\[.*?collapse\\]', '', content)
            if len(content) > 0:
                cc = OpenCC('t2s')
                content = cc.convert(content)
                save_content(content)
    except Exception as e:
        print("Exception occurred: %s" % e)
    return datasets
def main():
    '''
    Convert a simplified Chinese file to traditional Chinese, line by line.
    '''
    if len(sys.argv) != 3:
        print("Usage: python " + sys.argv[0] + " input.txt output.txt")
        exit()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    cc = OpenCC('s2tw')
    # w+: write to file, creating it if it does not exist
    output = open(sys.argv[2], "w+", encoding='utf-8')
    logging.info("Start converting!")
    with open(sys.argv[1], "r", encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            output.write(cc.convert(line))
            if line_num % 10000 == 0:
                logging.info("Processed %d lines" % line_num)
    output.close()
def dataCleaning(id):
    openCC = OpenCC('tw2s')
    params = {'s': id}
    try:
        r = requests.get(GET_AC_GAMER_URL, timeout=100, params=params)
    except requests.exceptions.Timeout:
        # requests raises its own Timeout, not the built-in TimeoutError
        return None
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, features='lxml')
    platform = soup.find('a', {'class': 'link2'})
    if not platform:
        return None
    if platform.text == 'Nintendo Switch ( NS )':
        tw_name = soup.find('h1').text
        cn_name = openCC.convert(str(tw_name))
        jp_name = soup.find_all('h2')[0].text
        eu_name = soup.find_all('h2')[1].text
        names = {
            'tw_name': tw_name,
            'cn_name': cn_name,
            'jp_name': jp_name,
            'eu_name': eu_name
        }
        # Collection.update() was removed in PyMongo 4; update_one with $set
        # keeps the original upsert behaviour
        name_collection.update_one({'tw_name': tw_name}, {'$set': names},
                                   upsert=True)
    return None
def generate_sentence_vec_avg(self, sentence, zh_type):
    exclude = set(string.punctuation + ',' + '。' + '、' + '「' + '」' + '?' + '!')
    vector = np.zeros(WV_DIIM)
    oov_num = 0
    token_sentence = []
    if zh_type == 's':
        openCC = OpenCC('tw2s')
        sentence = openCC.convert(sentence)
        token_sentence = jieba.lcut(sentence)
        token_sentence = [t for t in token_sentence if t not in exclude]
    elif zh_type == 'tw':
        segmenter = ckip.CkipSegmenter()
        token_sentence = segmenter.seg(sentence)
        token_sentence = [t for t in token_sentence.tok if t not in exclude]
    for token in token_sentence:
        if token in self.w2v_model.wv.vocab:
            vector += self.w2v_model[token]
        else:
            oov_num += 1
    # guard against an empty token list to avoid dividing by zero
    if token_sentence:
        vector /= len(token_sentence)
    self.paragraph_vec.append(vector)
    return vector