def regular(sen):
    """
    Sentence normalization: mainly unifies punctuation in sentences from the raw corpus.
    :param sen:
    :return:
    """
    # Traditional to Simplified Chinese
    sen = zhconv.convert(sen, 'zh-cn')
    sen = sen.replace('<GO>', '')
    sen = sen.replace('<PAD>', '')
    sen = sen.replace('<EOS>', '')
    sen = sen.replace('<UNK>', '')
    sen = sen.replace('/', '')
    sen = re.sub(r'…{1,100}', '···', sen)
    sen = re.sub(r'\.{3,100}', '···', sen)
    sen = re.sub(r'···{2,100}', '···', sen)
    sen = re.sub(r',{1,100}', ',', sen)
    sen = re.sub(r',{1,100}', ',', sen)
    sen = re.sub(r'\.{1,100}', '。', sen)
    sen = re.sub(r'。{1,100}', '。', sen)
    sen = re.sub(r'\?{1,100}', '?', sen)
    sen = re.sub(r'?{1,100}', '?', sen)
    sen = re.sub(r'!{1,100}', '!', sen)
    sen = re.sub(r'!{1,100}', '!', sen)
    sen = re.sub(r'~{1,100}', '~', sen)
    sen = re.sub(r'~{1,100}', '~', sen)
    sen = re.sub(r'0', '0', sen)
    sen = re.sub(r'3', '3', sen)
    sen = re.sub(r'\s{1,100}', ',', sen)
    sen = re.sub(r'[“”]{1,100}', '"', sen)  # Chinese quotation marks are hard to handle
    sen = re.sub('[^\w\u4e00-\u9fff"。,?!~·]+', '', sen)
    sen = re.sub(r'[ˇˊˋˍεπのゞェーω]', '', sen)
    return sen
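# A minimal usage sketch for regular() above (assumes `re` and `zhconv` are imported at
# module level, as the function body requires). Runs of repeated punctuation such as "!!!!"
# should collapse to a single mark and Traditional characters should become Simplified.
print(regular('這樣嗎????太好了!!!!'))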
def html_to_tw(data_txt_html):
    html_data = data_txt_html
    title_content = html_data[0]
    Articles_contents = html_data[1]
    category_list = html_data[2]
    post_tag = html_data[3]
    # Convert the title
    title_content = convert(title_content, "zh-tw")
    # Convert the article body to Traditional Chinese
    Articles_contents = convert(Articles_contents, "zh-tw")
    # Categories
    category_lists = []
    for category in category_list:
        category = convert(category, "zh-tw")
        category_lists.append(category)
    # Tags
    post_tags = []
    for tag in post_tag:
        tag = convert(tag, "zh-tw")
        post_tags.append(tag)
    tw_data_txt_html = [
        title_content, Articles_contents, category_lists, post_tags
    ]
    return tw_data_txt_html
def test_zhconv():
    from zhconv import convert
    # Traditional to Simplified
    print(convert('我幹什麼不干你事。', 'zh-cn'))
    # Simplified to Traditional
    print(convert('人体内存在很多微生物', 'zh-tw'))
def table_to_data(table):
    data = [[
        convert(cell.text.strip(), 'zh-cn') for cell in row.find_all("td")
    ] for row in table.find_all("tr")]
    data[0] = [
        convert(cell.text.strip(), 'zh-cn')
        for cell in table.find("tr").find_all("th")
    ]
    return data
def convert_file_to_hans(target_file):
    with codecs.open(target_file, 'r', encoding='utf-8') as f:
        content = f.read()
    dir_name, base_name = os.path.split(target_file)
    # base_name is already a str under Python 3, so no decode/encode round-trip is needed
    hans_base_name = "hans_" + zhconv.convert(base_name, 'zh-hans')
    hans = zhconv.convert(content, 'zh-hans')
    with codecs.open(os.path.join(dir_name, hans_base_name), 'w', encoding='utf-8') as res:
        res.write(hans)
def get_cv(cv):
    cv = convert(cv, 'zh-tw')
    query = Info.select().where(Info.cv == cv)
    if len(query) == 0:
        msg = f"没有找到{cv}扮演的角色"
    else:
        msg = f"{cv} 的扮演角色有:"
        for i in query:
            msg += "\n"
            msg += i.name
    return convert(msg, "zh-hans")
def convert_to_zhtw(self, sentences):
    print("Convert sentences to Traditional Chinese...")
    sentences_tw = {}
    for title, sentence_list in sentences.items():
        title_tw = convert(title, 'zh-tw')
        sentences_tw[title_tw] = []
        for speaker, content in sentence_list:
            speaker = convert(speaker, 'zh-tw')
            content = convert(content, 'zh-tw')
            sentences_tw[title_tw].append([speaker, content])
    return sentences_tw
async def resou(ctx, *args):
    res_json = json.loads(
        requests.get('https://api.oioweb.cn/api/summary.php').text)
    title_list = [i['title'] for i in res_json]
    count = 5
    page_index = 0
    msg = '\n'.join(title_list[page_index * count:(page_index + 1) * count])
    if len(args) > 0 and args[0] == 'f':
        msg = convert(msg, 'zh-hant')
    message = await ctx.send(msg)
    prev_ic = "⬅️"
    next_ic = "➡️"
    await message.add_reaction(prev_ic)
    await message.add_reaction(next_ic)
    valid_reactions = [prev_ic, next_ic]

    def check(reaction, user):
        return user == ctx.author and str(reaction.emoji) in valid_reactions

    async def reset_reaction():
        await message.clear_reactions()
        await message.add_reaction(prev_ic)
        await message.add_reaction(next_ic)

    reaction, user = await bot.wait_for('reaction_add', timeout=30.0, check=check)
    while reaction is not None:
        if str(reaction.emoji) == next_ic:
            if page_index >= 10:
                page_index = 0
            else:
                page_index += 1
        else:
            if page_index <= 0:
                page_index = 0
            else:
                page_index -= 1
        if (page_index + 1) * count < len(title_list):
            msg = '\n'.join(title_list[page_index * count:(page_index + 1) * count])
        else:
            msg = '\n'.join(title_list[page_index * count:])
        if len(args) > 0 and args[0] == 'f':
            msg = convert(msg, 'zh-hant')
        await message.edit(content=msg)
        await reset_reaction()
        reaction, user = await bot.wait_for('reaction_add', timeout=30.0, check=check)
def parse_item(self, response):
    try:
        item = BookItem()
        item['book_url'] = response.url
        item['book_name'] = convert(
            response.xpath('//div/b/text()').extract()[0], 'zh-cn')
        item['book_desc'] = convert(
            response.xpath('//*[@id="desc_text"]/text()').extract()[0].strip(),
            'zh-cn')
        yield item
    except Exception as e:
        print(e)
        return
def search(self):
    self.system_logger.info('loading data...')
    data = json.load(open('%s/sections.json' % self.data_dir))
    self.system_logger.info('keywords: %s' % self.keywords)
    self.system_logger.info('search...')
    with open(self.output_path, 'w') as fw:
        for kbid in data:
            sections = data[kbid]
            for n, i in enumerate(sections):
                sec = zhconv.convert(i[0], 'zh-cn')
                for kw in self.keywords:
                    if kw in sec:
                        start = i[1][0]
                        if len(sections) - 1 == n:
                            end = sys.maxsize
                        else:
                            end = sections[n + 1][1][0]
                        # In case of:
                        # == 评价 ==       <- 2 '=' marks, target section
                        # === 正面 ===     <- 3 '=' marks, subsection
                        # ...
                        # ==== 争议 ====   <- 4 '=' marks, subsubsection
                        # ...
                        # === 负面 ===     <- 3 '=' marks, subsection
                        # ...
                        # == Foo ==        <- 2 '=' marks, new section
                        # ...
                        nums = sec.count('=')
                        for k in range(n + 1, len(sections)):
                            if sections[k][0].count('=') == nums:
                                end = sections[k][1][0]
                                break
                        res = self.collection.find({
                            'source_title': kbid,
                            'start': {'$gt': start - 1},
                            'end': {'$lt': end}
                        })
                        if res.count() > 0:
                            fw.write('%s\n' % kbid)
                            for r in res:
                                sent = ''.join([t[0] for t in r['tokens']])
                                sent = zhconv.convert(sent, 'zh-cn')
                                fw.write('%s\n' % sent)
                            fw.write('\n')
                        break
def findAndInput(path, matchedMusicPath, targetPathFile):  # relative paths
    """
    :param path: playlist folder
    :param matchedMusicPath: file mapping song names to their paths
    :param targetPathFile: path of the file where successful matches are stored
    """
    matchedMusic = open(matchedMusicPath, encoding="UTF-8-sig")
    matchedMusicLine = matchedMusic.readline()
    matchedMusic_list = {}  # the set of songs to be matched against
    success_musicPath_list = []  # successfully matched entries
    matchedKey = 1  # matching key
    while matchedMusicLine:
        # matchedMusicLine = matchedMusicLine.replace(u'\xa0', ' ').replace("\n", "")  # remove nbsp
        matchedMusicLine = matchedMusicLine.replace("\n", "")  # strip the trailing newline
        targetAtr = re.split(
            r'[=]', matchedMusicLine
        )  # split the target text, e.g.: Jam - 七月上=F:/缓存音乐/Music1/Jam - 七月上.mp3
        # Music = re.split(r'[\s.、&:_\\/ ()()-]', targetAtr[0].lower())  # split into song title / artist.mp3
        while "" in targetAtr:  # drop empty strings from the list
            targetAtr.remove("")
        # targetAtr.pop(0)
        targetAtr[0] = targetAtr[0].replace(u'\xa0', ' ')  # strip nbsp from the song name only, not the path
        targetAtr[0] = zhconv.convert(targetAtr[0], 'zh-hans')  # convert to Simplified Chinese, just in case
        matchedMusic_list[matchedKey] = targetAtr
        matchedKey += 1
        matchedMusicLine = matchedMusic.readline()
    files = os.listdir(path)  # list all playlist files
    # files = ['acivii.txt']
    musicCount = 0  # number of matched songs
    for f in files:
        print("文件名:", f)
        # data from the playlist file
        file = open(path + '/' + f, 'r', encoding='UTF-8-sig')  # open the given file
        line = file.readline()  # read line by line
        while line:
            line = line.replace(u'\xa0', ' ').replace('\n', "")  # remove nbsp
            targetMusicLine = zhconv.convert(line, 'zh-hans')  # also convert the playlist entry to Simplified Chinese, just in case
            success_path = adaptation(targetMusicLine, matchedMusic_list)  # match the text; returns the successfully matched path
            if success_path:
                success_musicPath_list.append(success_path)  # append to the list
                musicCount += 1
            line = file.readline()
        outPutM3u(success_musicPath_list, targetPathFile, f)  # write out
        success_musicPath_list = []  # clear the data
    print("匹配次数为:", musicCount)
def chinese():
    json = request.get_json()
    text = json['text']
    to = json['to']
    result = text
    # simpleText = convert(text, 'zh-cn')
    if to == 'zh-CN':
        result = convert(text, 'zh-cn')
    else:
        result = convert(text, 'zh-tw')
    print(result)
    return jsonify({'result': result})
def chineseconvert():
    json = request.get_json()
    text = json['text']
    # to = json['to']
    result = text
    simpleText = convert(text, 'zh-cn')
    if simpleText == text:
        result = convert(text, 'zh-tw')
    else:
        result = simpleText
    print(result)
    return jsonify({'result': result})
def get_uniquei(id):
    query = Uniquei.get_or_none(Uniquei.id == id)
    if query:
        skill = Skill.get(Skill.id == id, Skill.type == '技能1')
        skill1 = Skill.get(Skill.id == id, Skill.type == '專武強化技能1')
        e_icon = resize_icon(query.num, types='equipment')
        prop = Props.select().where(Props.id == id)
        msg = ''
        msg += f'\n{query.name}\n'
        msg += f'{e_icon}\n'
        msg += f'{query.description}'
        msg += '\n======================\n'
        for i in prop:
            msg += f'{i.property}:{i.base_value}-{i.max_value}\n'
        msg += '======================\n'
        msg += f'{skill.type}:{skill.name}\n'
        msg += f'{resize_icon(skill.num)}\n'
        msg += f'描述:\n{skill.description}\n'
        effect = skill.effect.strip("[']")
        msg += '效果:\n'
        for e in effect.split("', '"):
            msg += f'{e}'
        msg += '\n======================\n'
        msg += f'{skill1.type}:{skill1.name}\n'
        msg += f'{resize_icon(skill1.num)}\n'
        msg += f'描述:\n{skill1.description}\n'
        effect1 = skill1.effect.strip("[']")
        msg += '效果:\n'
        for e in effect1.split("', '"):
            msg += f'{e}'
        return convert(msg, 'zh-hans')
    else:
        return '\n该角色暂时没有专武。'
async def preprocess(self, sent: str):
    if self.type == "cn2en":
        sent = convert(sent, "zh-cn")
        if self.stops.sub("", sent) in self.cn2en_trans_dict or \
                not self.chinese_char_pattern.search(sent):
            return sent
        async with self.client.post(self.tokenize_url,
                                    json={'q': sent, "mode": self.tokenize_mode}) as rsp:
            rsp = await rsp.json()
        sent = " ".join(rsp['words'])
        sent = remove_ngram(sent, min_n_gram=2, max_n_gram=4)
        sent = self.tokenizer.segment(sent)
    elif self.type == "en2cn":
        sent = self.en_normalize_punctuation.normalize(sent)
        sent = self.en_tokenizer.tokenize(sent, return_str=True)
        tok = E2V(sent)
        tok = tok.lower()
        tok = remove_ngram(tok, min_n_gram=2, max_n_gram=4)
        sent = self.tokenizer.segment(tok)
    else:
        raise Exception("This type({}) is not supported.".format(self.type))
    return sent
def ts_trans2(self, param):
    op = param.get("option")
    source = param.get("first")
    try:
        if op == '0':
            result = convert(source, 'zh-tw')
            if source == result:
                result = convert(source, 'zh-cn')
        elif op == '1':
            result = convert(source, 'zh-tw')
        else:
            result = convert(source, 'zh-cn')
        return result
    except Exception as e:
        logger.error(e)
        raise e
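# The op == '0' branch above auto-detects the conversion direction: if converting to
# Traditional leaves the text unchanged, the input was presumably Traditional already,
# so it is converted to Simplified instead. A minimal standalone sketch of the same
# idea (assumes only `from zhconv import convert`):
def auto_convert(source):
    result = convert(source, 'zh-tw')
    if source == result:  # no change, so the input was already Traditional
        result = convert(source, 'zh-cn')
    return result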
def convertToZhtw(self, fileFromPath, fileToPath):
    with open(fileFromPath, 'r', encoding='utf-8') as f:
        content = f.read()
    with open(fileToPath, 'w', encoding='utf-8') as f1:
        content = zhconv.convert(content, 'zh-tw')
        f1.write(content)
def t2d(table):
    # try to solve rowspan / colspan
    rows = table.find_all("tr")
    col_num, row_num = get_col_row_num(rows)
    # print(col_num, row_num)
    res = [[-1 for i in range(col_num)] for j in range(row_num)]
    i = 0  # i-th row, j-th column
    for row in rows:
        j = 0
        cells = row.find_all(["th", "td"])
        for cell in cells:
            value = cell.text.strip()
            while j < col_num and res[i][j] != -1:
                j += 1
            if col_num <= j:
                break
            # clamp declared spans so they do not overflow the grid
            col_span = min(int(cell.attrs.get('colspan', 1)), col_num - j)
            row_span = min(int(cell.attrs.get('rowspan', 1)), row_num - i)
            value = int(value) if value.isdigit() else convert(value, 'zh-cn')
            res[i][j] = value  # current cell
            for k in range(1, row_span):
                res[i + k][j] = value  # fill downwards
            for k in range(1, col_span):
                j += 1
                res[i][j] = value  # fill to the right
            j += 1
        i += 1
    return res
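# t2d() above relies on a project helper get_col_row_num(rows) that is not shown here.
# A plausible sketch of it, purely as an assumption about its behavior: count columns
# from the declared colspans of the first row and rows from the number of <tr> tags.
def get_col_row_num(rows):
    first_cells = rows[0].find_all(["th", "td"])
    col_num = sum(int(c.attrs.get('colspan', 1)) for c in first_cells)
    return col_num, len(rows)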
def get_skill(id):
    loop = Info.get(Info.id == id)
    query = Skill.select().where(Skill.id == id)
    arr = []
    for i in query:
        if i.type in ("專武強化技能1",):
            continue
        skill = {}
        skill['name'] = i.name
        skill['type'] = i.type
        skill['description'] = i.description
        skill['num'] = i.num
        skill['effect'] = i.effect
        arr.append(skill)
    newlist = sorted(arr, key=functools.cmp_to_key(custom_sorted))
    msg = ''
    start = get_icons(loop.start.split(','))
    msg += f'\n起手:\n{start}\n'
    loop = get_icons(loop.loop.split(','))
    msg += f'循环:\n{loop}\n'
    msg += '技能:'
    for s in newlist:
        msg += '\n======================\n'
        msg += f'{s["type"]}:{s["name"]}\n'
        msg += f'{resize_icon(s["num"])}\n'
        msg += f'描述:\n{s["description"]}\n'
        effect = s["effect"].strip("[']")
        msg += '效果:\n'
        for e in effect.split("', '"):
            msg += f'{e}'
    return convert(msg, 'zh-hans')
def char_word_tokenize(text):
    """Tokenizer: each Chinese character is its own token; an English word or a
    run of digits forms a single token."""
    # Lowercase and convert Traditional to Simplified Chinese
    text = zhconv.convert(text.lower(), 'zh-cn')
    # Full-width to half-width characters
    text = full_to_half(text)
    tokenized_chs = []
    text_len = len(text)
    i = 0
    while i < text_len:
        ch = text[i]
        # Chinese character ('all' is assumed to be a module-level set of Chinese characters)
        if ch in all:
            tokenized_chs.append(ch)
            i += 1
        # digit or lowercase English letter
        elif ch.isdigit() or ch.islower():
            word = ch
            j = i + 1
            while j < text_len:
                tch = text[j]
                if tch.isdigit() or tch.islower():
                    word += tch
                    j += 1
                else:
                    break
            i = j
            # the stemming API was buggy, so it is disabled for now
            # tokenized_chs.append(stemmer.stemWord(word))
            tokenized_chs.append(word)
        else:
            i += 1
    return tokenized_chs
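# A quick illustration of char_word_tokenize() above (assumes the module-level helpers it
# uses, namely the 'all' character set and full_to_half(), are available): Chinese
# characters become single-character tokens while Latin/digit runs stay whole.
tokens = char_word_tokenize('深度learning2023')
print(tokens)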
def getPropertyJson(props, percount, output):
    for index in range(0, len(props), percount):
        props_detail = []
        ids = "|".join(props[index:index + percount])
        r = requests.get(url + ids)
        data = json.loads(zhconv.convert(r.text, "zh-cn"))
        for k, v in data['entities'].items():
            if "missing" in v or v.get("labels", -1) == -1:
                continue
            newdata = {}
            newdata['id'] = k
            attrs = ['labels', 'descriptions']
            langs = ['zh', 'en']
            for attr in attrs:
                if v.get(attr, -1) != -1:
                    for lang in langs:
                        if v[attr].get(lang, -1) != -1:
                            newdata[lang + '-' + attr[:-1]] = v[attr][lang]["value"]
            if v.get("aliases", -1) != -1:
                for lang in langs:
                    if v["aliases"].get(lang, -1) != -1:
                        newdata[lang + '-' + "aliase"] = []
                        for vv in v["aliases"][lang]:
                            newdata[lang + '-' + "aliase"].append(vv["value"])
            props_detail.append(newdata)
        print("Crawled %d properties." % (len(props_detail)))
        with open(output, "a+", encoding="utf8") as f:
            for pp in props_detail:
                json.dump(pp, f, ensure_ascii=False)
                f.write("\n")
    # 'url' and 'total' are assumed to be defined at module level
    print("total crawled properties: %d" % (total))
def save_cut_word_rst(file_path):
    data = pd.read_csv(file_path + '.csv', usecols=['content'])
    with open(file_path + '_cut_word_rst.txt', 'w') as f_w:
        for content in data['content'].values:
            content = zhconv.convert(content.strip(), 'zh-cn')
            content = list(filter(lambda x: len(x.strip()) > 0, list(jieba.cut(content))))
            f_w.write(' '.join(content) + '\n')
def one_process(file_list, mode, output_file):
    """
    :param file_list: files to process
    :param output_file: path for the output results
    :param mode: processing mode
    :return:
    """
    # only Chinese and English modes are supported
    assert mode in ("zh", "en")
    for file_path in file_list:
        # store the result under the same sub-path inside the output directory
        output_f = output_file + file_path[file_path.index('/', 2):]
        with open(output_f, 'w+', encoding='utf-8') as fw:
            with open(file_path, 'r', encoding='utf-8') as f:
                count = 0
                for line_data in f:
                    if count % 3000 == 0:
                        print(file_path, count)
                    count += 1
                    if mode == "zh":
                        # parse line_data into a dict
                        line_data = json.loads(
                            zhconv.convert(line_data, "zh-cn").strip())
                        # print(line_data)
                    else:
                        line_data = json.loads(line_data.strip())
                    new_data = add_mention(line_data, mode)
                    fw.write(json.dumps(new_data, ensure_ascii=False) + '\n')
def print_poem(num=10):
    flist = open('./similar_output.txt', 'rU', encoding='UTF-8').readlines()
    rtn = ""
    scoredict = {}
    for i in range(int(len(flist) / 3)):
        plist = flist[3 * i].strip('\n') + flist[3 * i + 1]
        scoredict[i] = pz_score(plist, i)
    sort_scoredict = sorted(scoredict.items(), key=lambda item: item[1], reverse=True)
    for m in range(num):
        # index of the poem
        num = sort_scoredict[m][0]
        # score of the poem
        score = sort_scoredict[m][1]
        if score >= 0.8:
            # include the poem's score in the output
            t = 'score: ' + str(score) + '\n'
            poem = flist[3 * num].strip('\n') + flist[3 * num + 1]
            poem = zhconv.convert(poem, 'zh-cn')
            # append the poem itself
            t += poem + '\n'
            rtn += t
        else:
            break
    return rtn
async def shici(ctx, *args):
    res_json = json.loads(requests.get('https://v1.jinrishici.com/all').text)
    msg = "{}\n——{} {}".format(res_json['content'], res_json['origin'],
                               res_json['author'])
    if len(args) > 0 and args[0] == 'f':
        msg = convert(msg, 'zh-hant')
    await ctx.send(msg)
def preprocess(sen):
    """
    Clean a review string: lowercase it, remove spaces, newlines, periods,
    question marks, exclamation marks and tag markup, convert Traditional to
    Simplified Chinese, and finally tokenize it with jieba.
    :param sen: string to process
    :return: list of tokens after cleaning and segmentation
    """
    import zhconv
    import jieba
    # import hanlp
    # tokenizer = hanlp.load('LARGE_ALBERT_BASE')
    sen = sen.lower()
    sen = sen.replace(' ', '')
    sen = sen.replace('\n', '')
    pattern = re.compile(
        r'(?<=<).+?(?=>)'
    )  # https://blog.csdn.net/z1102252970/article/details/70739804
    str1 = pattern.sub('', sen)
    str1 = str1.replace('<>', '')
    cop = re.compile("[^\u4e00-\u9fa5^a-z^A-Z^0-9]")  # match Chinese characters, English letters and digits
    str1 = zhconv.convert(str1, 'zh-cn')
    str1 = cop.sub('', str1)
    # hanlp issue: str1 must not exceed 126 characters or the rest is truncated,
    # so long strings would need to be split first
    # tokens = []
    # if len(str1) > 100:
    #     str1s = cut_str_by_len(str1, 100)
    #     for split_text in str1s:
    #         tokens.extend(tokenizer(split_text))
    # else:
    #     tokens.extend(tokenizer(str1))
    tokens = jieba.cut(str1)
    return list(tokens)
def convertepub(filename, output, locale):
    with zipfile.ZipFile(filename, 'r') as zf, \
         zipfile.ZipFile(output, 'w') as zw:
        zfiles = collections.OrderedDict()
        for zi in zf.infolist():
            zfiles[zi.filename] = zi
        with zf.open(zfiles['META-INF/container.xml'], 'r') as f:
            dom = xml.dom.minidom.parse(f)
        rootfiles = [t.getAttribute('full-path')
                     for t in dom.getElementsByTagName('rootfile')
                     if t.getAttribute('media-type') == 'application/oebps-package+xml']
        htmls = set(rootfiles)
        for rootfile in rootfiles:
            with zf.open(zfiles[rootfile], 'r') as f:
                dom = xml.dom.minidom.parse(f)
            manifest = dom.getElementsByTagName('manifest')[0]
            htmls.update(t.getAttribute('href')
                         for t in manifest.getElementsByTagName('item')
                         if t.getAttribute('media-type') in
                         ('application/xhtml+xml', 'application/x-dtbncx+xml'))
        for name, zi in zfiles.items():
            if name in htmls:
                s = zhconv.convert(zf.read(zi).decode('utf-8'), locale)
                zw.writestr(zi, s.encode('utf-8'))
            else:
                zw.writestr(zi, zf.read(zi))
def render_text_with_token_id(token_id, font, use_traditional, idx2word):
    word = idx2word[token_id]
    word = convert(word, 'zh-hant') if use_traditional else word
    if len(word) > 1:
        return np.zeros((font.size + 1, font.size + 1))
    else:
        return pad_mask(render_text(word, font), font.size)
def search(self, searchItem):
    proxies = {
        # note: with a duplicate "http" key, only the last entry takes effect
        "http": "220.168.237.187:8888",
        "https": "https://127.0.0.1:1080",
        "http": "http://127.0.0.1:1080"
    }
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    searchItem = '战争列表_(' + searchItem
    google_url = str(self.url) + urllib.parse.quote(searchItem)
    try:
        proxy_handler = urllib.request.ProxyHandler(proxies)  # configure the proxy servers
        opener = urllib.request.build_opener(
            proxy_handler, urllib.request.HTTPHandler)  # create a custom opener object
        urllib.request.install_opener(opener)  # install it as the global default opener
        req = urllib.request.Request(google_url, headers=headers)
        response = urllib.request.urlopen(req)
        content = response.read().decode("utf-8")
        content = convert(str(content), 'zh-hans')
        soupIter = BeautifulSoup(content, 'lxml')
        yield soupIter
    except:
        return '404'
def find_loc(self, text):
    text = zhconv.convert(text, "zh-cn")
    text = text.lower()
    text = text.replace(" ", "")
    city = self.find_city(text)
    province = self.find_province(text)
    nation = self.find_nation(text)
    if len(city) > 1:
        if "吉安" in city:
            if self.city_province_dict["吉安"] in province:
                pass
            else:
                city.pop(city.index("吉安"))
        elif "吉林" in city and "吉林" in province:
            city.pop(city.index("吉林"))
        else:
            for c in city:
                if self.city_province_dict[c] not in province:
                    city.pop(city.index(c))
            if len(city) > 1:
                city = [city[0]]
    if city.__contains__("阿里") and self.city_province_dict["阿里"] not in province:
        city = []
    if len(city) > 0:
        province = [self.city_province_dict[city[0]]]
    elif len(province) > 1:
        province = [province[-1]]
    if (len(province) + len(city) + len(nation)) > 0:
        nation = "中国"
        return (nation, province, city)
    else:
        return 0
def html_to_txt(title):
    """Convert the downloaded HTML file to txt."""
    html_file = os.path.join(PATH, "{}.html".format(title))
    save_file = os.path.join(PATH, "txt/{}.txt".format(title))
    with open(html_file) as f:
        content = f.read()
    soup = BeautifulSoup(content, "lxml")
    with open(save_file, 'w', encoding='utf-8') as f:
        for text in soup.find_all('div', id='content'):
            for t in text.strings:
                t = convert(t.strip(), 'zh-cn')  # Traditional to Simplified
                print(t, file=f)
def convertfunc(s, locale, locale_only):
    if locale:
        simp = zhconv.issimp(s, True)
        if (simp is None
                or simp and locale in Locales['zh-hans']
                or not simp and locale in Locales['zh-hant']):
            return identity
        elif locale_only:
            return empty
        else:
            return lambda x: zhconv.convert(s, locale)
    else:
        return identity
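# Sketch of how convertfunc() above might be driven. This assumes, as the function itself
# implies, that `identity` and `empty` are module-level no-op/empty callables and that
# `Locales` maps 'zh-hans'/'zh-hant' to tuples of their member locales; those names are
# not defined in this listing.
func = convertfunc('我干什么不干你事', 'zh-tw', locale_only=False)
print(func('我干什么不干你事'))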
def fetch_post(url, output=os.path.join(OUTPUT, 'posts')):
    if not os.path.exists(output):
        os.makedirs(output)
    post_name = zhconv.convert(url_filename(url), 'zh-cn')
    post_file = os.path.join(output, '%s.txt' % post_name)
    if os.path.exists(post_file):
        print('Skip %s' % url)
    else:
        print('Fetch %s' % url)
        soup = commons.soup(url)
        for s in soup.find_all('a'):
            s.decompose()
        content_tag = soup.find(filter_post_content)
        content = content_tag.get_text()
        # print('Post %s (%s)' % (post_name, len(content)))
        if content and len(content) > 500:
            content = re.sub(r'[><&%]', '', content)
            content = zhconv.convert(content, 'zh-cn')
            with codecs.open(post_file, 'w', 'utf-8') as f:
                f.write(post_name)
                f.write('\n\n')
                f.write(content)