def convert_to_strings(wikipage):
    # Given a wikipage object, return a structured dictionary
    # that holds all information from the wikipage.
    from hanziconv import HanziConv
    import wikitextparser as wtp
    import pprint
    try:
        summary = HanziConv.toTraditional(
            wtp.parse(wikipage.content).sections[0].pprint())
    except:
        summary = None
    try:
        sections = [HanziConv.toTraditional(sec.pprint())
                    for sec in wtp.parse(wikipage.content).sections[1:]]
        try:
            sub_titles = [HanziConv.toTraditional(sec.title[1:-1])
                          for sec in wtp.parse(wikipage.content).sections[1:]]
        except:
            sub_titles = None
        try:
            section_content = [s[s.find('\n') + 1:] for s in sections]
        except:
            section_content = None
    except:
        sections = None
    try:
        sections = list(zip(sub_titles, section_content))
    except:
        sections = None
    try:
        links = wikipage.links
    except:
        links = None
    return {'title': wikipage.title,
            'summary': summary,
            'sections': sections,
            'links': links}

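# A minimal usage sketch (an assumption, not part of the original code): the
# `wikipage` argument is taken to be a page object from the `wikipedia` package,
# which exposes the .content, .title and .links attributes used above.
import wikipedia
wikipedia.set_lang('zh')
info = convert_to_strings(wikipedia.page('歐幾里得'))
print(info['title'], info['summary'] is not None)
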
def extract_langlinks(sql, fo):
    total = 0
    category = 0
    instance = 0
    template = 0
    o = open(fo, 'w')
    with open(sql) as f:
        for line in f:
            if line.startswith('INSERT'):
                line = line[line.index('('):]
                line = line.strip('\n').strip(';').strip(')').strip('(')  # strip the leading '(' and trailing ')'
                for tri in line.split('),('):  # split on '),(' to get each item
                    tri = tri.replace("'", '').replace("'", '')
                    _id, lan, link = tri.split(',', 2)  # the link may contain commas, so split at most twice
                    if lan == 'zh':
                        total += 1
                        if link.startswith('Category:'):
                            category += 1
                        if link.startswith('Template:'):
                            template += 1
                        print _id, HanziConv.toSimplified(link).encode('utf-8')
                        link = link.replace('_', ' ')
                        o.write('%s\t%s\n' % (_id, HanziConv.toSimplified(link).encode('utf-8')))
    instance = total - category - template
    print "Total:%d, Category:%d, Instance:%d, Template:%d" % (total, category, instance, template)

def get_names(file_path):
    # get Chinese names, English names and id from json files
    names = []
    keys_en = {'外文名称', '外文名', '英文名称', '英文别名'}
    keys_zh = {'中文名', '又称', '别名', '别称', '中医病名', '中文别名', '中文学名'}
    with open(file_path, 'r') as f:
        data = json.load(f)
    for key, value in data.items():
        title = key
        names.append(title)
        basic = value['基本信息']
        if type(basic) is not dict:
            continue
        for key_basic, value_basic in basic.items():
            if key_basic.replace(' ', '') in keys_en:
                names.extend([
                    name_en.lower()
                    for name_en in basic[key_basic].replace(';', ',').split(',')
                    if len(name_en) > 0
                ])
            if key_basic.replace(' ', '') in keys_zh:
                names.extend([
                    HanziConv.toSimplified(name_zh)
                    for name_zh in basic[key_basic].replace(';', ',').split(',')
                    if len(name_zh) > 0 and not name_zh == title
                ])
    return title, names

def process_page(self):
    url = self.found.pop()
    try:
        page = requests.get(url)
        tree = html.fromstring(page.content)
        paragraphs = tree.xpath('//div[@class="mw-parser-output"]/p/text()')
        hrefs = tree.xpath("//div[@class='mw-parser-output']/p/a/@href")
        for p in paragraphs:
            w = get_chinese(p)
            x = hc.toSimplified(w)
            self.characters += Counter(x)
            self.words += Counter(jieba.cut(x, cut_all=False))
        for href in hrefs:
            zhref = 'https://zh.wikipedia.org' + href
            if zhref not in self.visited:
                self.found.add(zhref)
                self.found_own.add(zhref)
    except requests.exceptions.ConnectionError as e:
        print(e)
        print('Continuing...')
    except requests.exceptions.ChunkedEncodingError as e:
        print(e)
        print('Continuing...')
    except requests.exceptions.InvalidURL as e:
        print(e)
        print('Continuing...')
    self.visited.add(url)
    self.visited_own.add(url)

def get_stock_info(self, stock_name, use_proxy=True):
    from hanziconv import HanziConv
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://xueqiu.com/p/ZH010389',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
        'Host': 'xueqiu.com',
        # 'Connection': 'keep-alive',
        # 'Accept': '*/*',
        'cookie': 's=iabht2os.1dgjn9z; xq_a_token=02a16c8dd2d87980d1b3ddced673bd6a74288bde; xq_r_token=024b1e233fea42dd2e0a74832bde2c914ed30e79; __utma=1.2130135756.1433017807.1433017807.1433017807.1;'
                  '__utmc=1; __utmz=1.1433017807.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lvt_1db88642e346389874251b5a1eded6e3=1433017809; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1433017809'
    }
    counter = 0
    while counter < self.RETRY:
        counter += 1
        try:
            proxies = {}
            if use_proxy:
                proxies = self.get_proxy()
                print("PROXY => {:}".format(proxies))
            res = requests.get("https://xueqiu.com/S/" + stock_name,
                               headers=headers,
                               proxies=proxies,
                               timeout=self.REQUEST_TIMEOUT)
            reGetStockInfo = re.compile(r"profile-detail.*?\">(.*?)<", re.S | re.UNICODE)
            for stockInfo in reGetStockInfo.findall(res.text):
                return HanziConv.toTraditional(stockInfo)
        except:
            traceback.print_exc()
            time.sleep(1)
    return ''

def data_prepare():
    """Prepare pre-downloaded zhwiki data for gensim word-vector training."""
    article_num = 0  # counter, pre-set for a quick training check
    wiki = WikiCorpus('zhwiki-latest-pages-articles.xml.bz2', lemmatize=False, dictionary={})
    stopwords = [line.strip() for line in open('stopwords.txt', 'r', encoding='utf-8').readlines()]
    # with open('stopwords.txt', 'r', encoding='utf8').readlines() as f:
    #     stopwords = [w.strip() for w in f]
    # stopwords = codecs.open('stopwords.txt', 'r', encoding='utf8').readlines()
    # stopwords = [w.strip() for w in stopwords]
    start = time.time()
    for text in wiki.get_texts():
        text = ' '.join(text)
        text = HanziConv.toSimplified(text)
        # re.sub('[:·•’!\"#$%&\'()*+,,-./::;;<=>?@,。?★、…【】《》?“”〞‘’![\\]^_`{}()~]+', "", text)
        text = text.strip()
        seg_list = list(jieba.cut(text))
        # e.g. ['歐幾里', '得', ' ', '西元前', '三世', '紀的', '古希臘', '數學家', ' ', '現在', '被', '認',
        #       '是', '幾何', '之父', ' ', '此畫', '為拉斐爾', '的', '作品', ' ', '雅典', '學院']
        new_text = [x for x in seg_list
                    if (re.compile(u'[\u4e00-\u9fa5]+').search(x) or
                        re.compile("[\"”“,??\,\.。,0-9]+").search(x))
                    and (x not in stopwords)]
        # new_text = [x for x in seg_list if re.compile('[^a-zA-Z]+').search(x) and x != ' ']
        # the original version filtered with len(x) > 1; that does not work here
        article_num = article_num + 1
        if article_num == 10:
            break
        yield new_text

def process_poetry(self, data_dir='/media/pony/DLdigest/data/languageModel/chinese-poetry/json'):
    save_dir = os.path.join(self.save_dir, 'poem')
    check_path_exists(save_dir)
    count = 0
    for entry in os.scandir(data_dir):
        if entry.name.startswith('poet'):
            with open(entry.path, 'r') as json_file:
                poems = json.load(json_file)
                for p in poems:
                    paras = HanziConv.toSimplified(''.join(p['paragraphs']).replace('\n', ''))
                    paras = filter_punctuation(paras)
                    for para in paras.split(' '):
                        if len(para.strip()) != 0:
                            pys = ' '.join(np.array(pinyin(para)).flatten())
                            with open(os.path.join(save_dir, str(count // 400000 + 1) + '.txt'), 'a') as f:
                                f.write(para + ',' + pys + '\n')
                            count += 1

def preprocess():
    """
    Use gensim's WikiCorpus to extract the Chinese wiki corpus and convert traditional
    characters to simplified Chinese. Then segment the converted corpus with jieba and
    write it to a txt file: each wiki document's segmentation goes on one line of the
    new txt, with words separated by spaces.
    !!! This takes over two hours to run on Windows !!!
    :return: extracts from the zhwiki...bz2 dump, converts traditional characters to
             simplified ones, and saves the result as reduced_zhwiki.txt
    ========================
    from gensim.corpora import WikiCorpus
    import jieba
    from langconv import *   # download this file and place it in the same directory
    ========================
    """
    count = 0
    zhwiki_path = './data/zhwiki-20190720-pages-articles-multistream.xml.bz2'
    # Rename the output after each successful run so a re-run does not overwrite it.
    f = open('./data/reduced_zhwiki.txt', 'w', encoding='utf8')
    wiki = WikiCorpus(zhwiki_path, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        word_list = []
        for sentence in text:
            sentence = HanziConv.toSimplified(sentence)  # traditional to simplified
            seg_list = jieba.cut(sentence)               # segment with jieba
            for seg in seg_list:
                word_list.append(seg)
        f.write(' '.join(word_list) + '\n')
        count += 1
        if count % 200 == 0:
            print("Saved " + str(count) + ' articles')
    f.close()

def clean(text):
    text = text.strip()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub("\s*", "", text)
    return text

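# `full2half` is called above (and in the second clean() further down) but is not
# defined in these snippets. A minimal sketch of what such a helper usually does
# (an assumption about its intent: convert full-width ASCII characters to half-width):
def full2half(text):
    result = []
    for ch in text:
        code = ord(ch)
        if code == 0x3000:               # full-width space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:   # full-width ASCII punctuation/letters/digits
            code -= 0xFEE0
        result.append(chr(code))
    return ''.join(result)
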
def translate(translate_file_path):
    with open(file=translate_file_path, mode="r", encoding="utf-8") as file:
        content = file.read()
    with open(file=translate_file_path, mode="w", encoding="utf-8") as file:
        if content:
            content = HanziConv.toTraditional(content)
            file.write(content)

def get_Xueqiu_categories(self):
    from hanziconv import HanziConv
    from selenium import webdriver
    from webdriver_manager.chrome import ChromeDriverManager
    url = 'https://xueqiu.com/hq#exchange=US&industry=3_2&firstName=3&page=1'
    while 1:
        try:
            driver = webdriver.Chrome(ChromeDriverManager().install())
            driver.get(url)
            driver.implicitly_wait(10)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            categories = {}
            for ele in soup.find_all('i', {'class': 'list-style'}):
                if re.search("明星股", ele.parent.text):
                    for li in ele.parent.find_all('li'):
                        key = HanziConv.toTraditional(li.text).strip()
                        link = "https://xueqiu.com/hq{}".format(li.select('a')[0]['href'].strip())
                        categories[key] = link
            driver.quit()
            break
        except:
            traceback.print_exc()
            driver.quit()
    self.GICS_csvs(categories)

def _LoadCorpus(self):
    print(f"Loading corpus from: {self.CorpusPath}")
    if self.SeparatedBySpace:
        lines = [line for line in open(self.CorpusPath, 'r').readlines()
                 if line.find("doc id=") < 0]
        Corpus = ' '.join(lines)
        Tokens = set(Corpus.split())
        self.Vocabulary = [token.replace('\n', '') for token in Tokens]
        if self.is_Arabic:
            self.Vocabulary = [word for word in self.Vocabulary
                               if len(set(word + arabic_alphabet)) == size_arabic]
        self.CharSet = set(''.join(self.Vocabulary))
        print(f"voc size={len(self.Vocabulary)}")
    else:
        lines = [line for line in open(self.CorpusPath, 'r').readlines()
                 if line.find("doc id=") < 0]
        random.shuffle(lines)
        line_num = len(lines)
        lines = lines[:line_num // 3]
        self.Corpus = ''.join(lines).replace('\n', '').replace(' ', '')
        if self.CorpusPath.find('corpus_zh') >= 0:
            from hanziconv import HanziConv
            self.Corpus = HanziConv.toSimplified(self.Corpus)
            print("Transformed to simplified Chinese")
        self.Vocabulary = None
        self.CharSet = set(self.Corpus)
        print(f"char count={len(self.CharSet)}")

def open_dataset(path, simplified, limit_length):
    ###########################################################################
    # This function opens a txt file (for labels) or a utf8 file (for the dataset),
    # removing all sentences shorter than a specific length (just for the training
    # set), splitting words also where there is punctuation, and converting the
    # dataset from traditional Chinese to simplified Chinese when needed.
    #
    # Input:
    #   path: path of the file
    #   simplified: Boolean; if True the dataset is already simplified and no
    #               conversion is performed
    #   limit_length: keep only sentences longer than this value
    #
    # Output:
    #   chinese_sentences: list of sentences
    ###########################################################################
    # condition to open the file (whether the dataset is the training or the dev set)
    with open(path, 'r', encoding='utf8') as file:
        if simplified:
            chinese_sentences = [
                split_punctuation(line.strip().split())
                for line in file
                if len(line.strip().split()) > limit_length
            ]
        else:
            chinese_sentences = [
                HanziConv.toSimplified(split_punctuation(line.strip().split()))
                for line in file
                if len(line.strip().split()) > limit_length
            ]
    return chinese_sentences

def token_normalize(self, token):
    # normalize url_xxx tokens to a plain 'url'
    # TODO: more normalization
    # if token == ' ':
    #     return self.pad_token
    if 'url_' in token:
        token = 'url'
    # if token.lower() in {'win10', 'win7', 'windows10', 'windows7', 'windows8', 'window7',
    #                      'windows10windows', 'windows2000', 'windows7windows', 'windowxp',
    #                      'windownt', 'window10', 'windowswindows', 'windows98', 'windows9x'}:
    #     return 'Windows'
    #
    # # iphone
    # token = re.sub('iphone(.+)', 'iphone', token.lower())
    # convert traditional characters to simplified
    token = HanziConv.toSimplified(token)
    # # normalize tokens that mix digits and punctuation
    # token = re.sub(r'(\d+)\.', '\g<1>', token)
    # token = re.sub(r'\.(\d+)', '\g<1>', token)
    # 唿 -> 呼
    # token = re.sub('唿', '呼', token)
    # token = re.sub(
    #     r'(第*)(有|多|几|半|一|两|二|三|四|五|六|七|八|九|十)(种|个|次|款|篇|天|步|年|大|条|方|位|键|份|项|周|层|只|套|名|句|件|台|部|页|段|把|片|小时|遍|颗|根|批|张|分|性|点点|场|分钟|组|堆|本|圈|季|笔|群|斤|日|支|排|章|所|股|门|首|代|号|生|点|辆|轮|瓶|声|杯|列|座|集)',
    #     '\g<2>', token)
    return token

def sentence_cmn(rspecifier, word_level=True):
    if os.path.isdir(rspecifier):
        for f in os.listdir(rspecifier):
            full_name = os.path.join(rspecifier, f)
            for s in sentence_cmn(full_name, word_level):
                yield s
    else:
        with codecs.open(rspecifier, 'rb', 'utf8') as fp:
            for line in fp:
                line = HanziConv.toSimplified(line.strip())
                if word_level:
                    sent = line.split()
                else:
                    sent = []
                    for w in line.split():
                        has_chinese = any(u'\u4e00' <= c <= u'\u9fff' for c in w)
                        if has_chinese:
                            sent.extend(list(w))
                        else:
                            sent.append(w)
                yield [u'<numeric>' if re.match(__has_digit_but_no_letter, w) else w
                       for w in sent]

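# `__has_digit_but_no_letter` is referenced above but not defined in this snippet.
# A plausible definition (an assumption): a pattern matching tokens that contain a
# digit and no Latin letters, so they get replaced by the <numeric> placeholder.
import re
__has_digit_but_no_letter = re.compile(u'^[^a-zA-Z]*[0-9][^a-zA-Z]*$')
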
def clean(text):
    text = text.strip()
    text = tokenization.convert_to_unicode(text)
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub(u"\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    text = re.sub(u"\\s*", "", text)
    return text

def to_S(k):
    txt = X.content[k].strip()
    txt = re.sub('\t|\r', '\n', txt)
    txt = txt.replace('\n\n', '\n')
    txt = re.sub(' |\u3000', ' ', txt)
    txt = HanziConv.toSimplified(txt)
    txt = txt.strip()
    return (X.shop_url[k], X.post_time[k], txt, int(X.score[k]), len(txt))

def replace(x):
    x = x.replace('"', "").replace("\r\n", " ").replace("\n", " ").replace(",", ",")
    x = HanziConv.toSimplified(x)
    x = [a for a in cut(x) if a not in stop_words]
    x = " ".join(x)
    return x

def down(self, song_name, song_artist):
    for ext in SONG_EXT:
        for src, results in search_results.items():
            for result in results:
                # print(result)
                print(src + ' - ' + result['songname'] + ' - ' +
                      result['singers'] + ' - ' + result['ext'])
                if song_name.lower() in HanziConv.toSimplified(result['songname'].lower()):
                    if song_artist.lower() in HanziConv.toSimplified(result['singers'].lower()):
                        if ext in result['ext'].lower():
                            self.client.download([result])
                            # print('matches')
                            return True
    # print('no match')
    return False

def tochar(x):
    x = re.sub('\n|\t|\r| |"|。。|!!|…', ' ', x)
    x = re.sub('\n|\t|\r| |"|。。|!!', ' ', x)
    x = re.sub('\x05|\x06|\x07|\.\.|\.\.\.', ' ', x)
    x = HanziConv.toSimplified(x)
    x = list(x.strip())
    x = [a for a in x if len(a.strip()) > 0]
    return ' '.join(x)

def pre_process(text):
    text = HanziConv.toTraditional(text)
    # load cantonese corpus
    # jb.load_userdict('util/dict/canto_dict.txt')
    vocabs = list(jb.cut(text))
    pp_text = " ".join(vocabs)
    return pp_text

def get_word_list(query):
    # convert traditional to simplified Chinese
    query = HanziConv.toSimplified(query.strip())
    # upper case to lower case
    query = query.lower()
    # segment with jieba
    words = ' '.join(jieba.cut(query)).split(" ")
    return words

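# A brief usage sketch (illustrative only; the exact split depends on jieba's
# default dictionary):
words = get_word_list('這是一個測試')
print(words)  # e.g. ['这是', '一个', '测试']
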
def simplified_to_traditional(self):
    logging.info("等待中..(簡 to 繁)")
    traditional = open("traditional.txt", "w", encoding="utf-8")
    with open("wiki_text.txt", "r", encoding="utf-8") as simplified:
        for s in simplified:
            traditional.write(HanziConv.toTraditional(s))
    print("成功簡體轉繁體!")
    traditional.close()

def toTraditional(filename, content):
    content_trans = HanziConv.toTraditional(content)
    if content_trans != content:
        # Write with utf8 encoding
        with open(filename, 'w', encoding='UTF-8-SIG') as file:
            file.write(content_trans)

def simplified_to_traditional():
    # Note: despite the name, this converts traditional text in w2v.txt to
    # simplified Chinese (see the toSimplified call and the log message).
    logging.info("等待中..(繁 to 簡)")
    simple = open("w2vSimplified.txt", "w", encoding="utf-8")
    with open("w2v.txt", "r", encoding="utf-8") as traditional:
        for t in traditional:
            simple.write(HanziConv.toSimplified(t))
    print("成功繁體轉簡體!")
    simple.close()

def generate_qimai_addition_dataset(model_type="bert"):
    test_df = pickle_load(path_cache / "test_df.pkl")
    qimai_test_id = pickle_load(path_cache / f"{model_type}_qimai_test_id.pkl")
    appname2appdesc = pickle_load(path_cache / "appname2appdesc.pkl")
    apkname2appdesc = pickle_load(path_cache / "apkname2appdesc.pkl")
    test_df["appname"] = test_df["new_appname"]
    qimai_test_df = test_df.merge(apkname2appdesc)
    qimai_test_df = qimai_test_df[["appname", "app_desc"]]
    chusai_test_df = pickle_load(path_cache / "chusai_test_df.pkl")
    chusai_test_df = chusai_test_df.loc[~chusai_test_df["appname"].isna(),
                                        ["appname", "app_desc"]]
    appname2appdesc = pd.concat(
        [appname2appdesc, chusai_test_df, qimai_test_df], axis=0, sort=False)
    appname2appdesc["desc_len"] = appname2appdesc["app_desc"].str.replace(
        "[\x00-\xff”“•]", "").str.len()
    appname2appdesc["appname"] = appname2appdesc["appname"].str.lower().str.replace(" ", "")
    appname2appdesc["appname"] = [
        HanziConv.toSimplified(x) for x in appname2appdesc["appname"]
    ]
    appname2appdesc = appname2appdesc.sort_values("desc_len").drop_duplicates(
        "appname", keep="last")
    appname2appdesc = appname2appdesc.loc[appname2appdesc["desc_len"] >= 8]
    test_df_new = test_df.copy()
    test_df_new = test_df_new.loc[~test_df["id"].isin(qimai_test_id)]
    test_df_new["appname"] = test_df_new["appname"].str.lower().str.replace(" ", "")
    test_df_new["appname"] = [
        HanziConv.toSimplified(x) for x in test_df_new["appname"]
    ]
    test_df_new = test_df_new.merge(appname2appdesc)
    qimai_addition_test_id = test_df_new["id"].tolist()
    qimai_addition_test_dataset = generate_tensor_data(test_df_new["app_desc"], model_type)
    qimai_addition_test_dataset = TensorDataset(*qimai_addition_test_dataset)
    pickle_save(qimai_addition_test_id, path_cache / "qimai_addition_test_id.pkl")
    pickle_save(
        qimai_addition_test_dataset,
        path_tensor_dataset / f"{model_type}_qimai_addition_test_dataset.pkl")

def clear_text(x):
    x = BeautifulSoup(x, 'html.parser').text
    x = html.unescape(x)
    x = HanziConv.toSimplified(x)
    x = re.sub(r'\s+', '', x)  # remove whitespace (tabs, newlines, etc., i.e. [ \f\n\r\t\v])
    x = re.sub(r'[\((【](.*?)[\))】]', '', x)  # drop bracketed spans; (.*?) is non-greedy so later brackets are not swallowed
    x = re.sub(r'([–-—=…]*)', '', x)
    x = x.strip()
    return x

def Transform_ZhTw_Save(self, File_Name, Next_FileName):
    FileRead = []
    with open(File_Name, 'rb') as RawFile:
        for line in RawFile:
            FileRead.append(HanziConv.toTraditional(line))
    with open(Next_FileName, 'wb') as Next_File:
        for i in range(len(FileRead)):
            for j in range(len(FileRead[i])):
                Next_File.write(FileRead[i][j].encode('utf-8'))

def terms2VecIDs(terms):
    ans = []
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term))
        # Problem: some terms are not pretrained, like '食记', '咖哩', '捷运'
        if ID == None:
            ans.append(0)
        else:
            ans.append(ID)
    return ans

def process_text(self):
    logging.info("等待中..(簡 to 繁)")
    with open('./word2vec_data/traditional.txt', 'w', encoding='utf-8') as fw:
        with open('./word2vec_data/wiki_text.txt', 'r', encoding='utf-8') as f:
            for line in f:
                line = HanziConv.toTraditional(line)
                fw.write(line)

def __iter__(self):
    for content, (page_id, title) in self.wiki.get_texts():
        yield doc2vec.LabeledSentence(
            # 1. for every element c in content,
            # 2. convert it to simplified Chinese and segment it with jieba,
            # 3. then collect the resulting tokens into the words list
            words=[w for c in content
                   for w in jieba.cut(HanziConv.toSimplified(c))],
            tags=[title])

def inputTest():
    x = input("請說話:")  # x: the user's input token
    y = jerry.get_response(x)
    y = HanziConv.toTraditional(y.text)
    print(type(x))
    print(type(y))
    print(y)

def create_post():
    form = PostForm()
    if form.validate_on_submit():
        chinese = HanziConv.toTraditional(form.chinese_content.data)
        title = HanziConv.toTraditional(form.title.data)
        post = Post(author=current_user,
                    title=title,
                    chinese_content=chinese,
                    content=form.content.data,
                    tags=form.tags.data)
        db.session.add(post)
        db.session.commit()
        flash('Your post has been created!', 'success')
        return redirect(url_for('home'))
    return render_template('create_post.html', title='New Post', form=form, legend='New Post')

def terms2Vec(terms):
    vec = np.zeros(len(embeddings[0]))
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term))
        if ID == None:
            vec += embeddings[0]
        else:
            vec += embeddings[ID]
    vec /= len(terms)
    return vec

def terms2Vec(terms):
    vec = np.zeros(len(embeddings[0]))
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term))
        # Problem: some terms are not pretrained, like '食记', '咖哩', '捷运'
        if ID == None:
            vec += embeddings[0]
        else:
            vec += embeddings[ID]
    vec /= len(terms)
    return vec

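# `word2id` and `embeddings` are module-level globals that terms2VecIDs and
# terms2Vec rely on but that are not defined in these snippets. A minimal loading
# sketch (the file name and format are assumptions: one "word v1 v2 ..." entry per
# line, with row 0 reserved as the fallback vector for unknown terms):
import numpy as np

def load_pretrained_embeddings(path='embeddings.txt', dim=300):
    word2id = {}
    vectors = [np.zeros(dim)]  # index 0: fallback vector for out-of-vocabulary terms
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) != dim + 1:
                continue  # skip malformed lines or a header row
            word2id[parts[0]] = len(vectors)
            vectors.append(np.array(parts[1:], dtype=float))
    return word2id, np.vstack(vectors)
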
def get_download_url(name, ep, keyword, translation_team, **dict):
    """
    Search download url in dmhy.org
    """
    root_url = 'https://share.dmhy.org'
    payload = {'keyword': keyword + ' ' + '{:0>2}'.format(ep)}
    user_agent = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML,'
                      'like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    }
    print('DMHY scraper is searching for {} of {}'.format(ep, name))
    content = net.request_get_content(root_url + '/topics/list', retry=5, params=payload)
    soup = bs4.BeautifulSoup(content, 'lxml')
    trs = soup.find_all('tr')
    if len(trs) == 0:
        raise FileNotFoundError
    found_flag = False
    download_url = ''
    unified_name = name.lower()
    print('Unified name:{}'.format(unified_name))
    # Skip the table header
    for tr in trs[1:]:
        a = tr.select('td.title > a')[0]
        # Check the correctness of entry
        entry_desc = ''
        for string in a.strings:
            entry_desc += string
        # Eliminating spaces
        entry_desc = HanziConv.toSimplified(entry_desc.strip())
        try:
            print('Searching: {0}'.format(entry_desc))
        except:
            print('Experiencing encoding problem, but search is still going on.')
            print('Searching:', entry_desc.encode('utf-8'))
        unified_entry_desc = entry_desc.lower()
        if unified_name in unified_entry_desc:
            # Translation team check
            if (translation_team != [] and
                    not any(trans_t.lower() in unified_entry_desc
                            for trans_t in translation_team)):
                continue
            download_page_url = a['href']
            print('download_page link:{0}'.format(download_page_url))
            download_page_content = net.request_get_content(
                root_url + download_page_url, retry=5)
            soup1 = bs4.BeautifulSoup(download_page_content, 'lxml')
            url_list = soup1.find(id='tabs-1')
            p = url_list.find('p')
            download_url = p.find('a')['href']
            break
    if download_url == '':
        raise FileNotFoundError
    return "https:" + download_url

def hello():
    name = request.form['checking']
    temp_name = HanziConv.toTraditional(name)
    # name = HanziConv.toSimplified(name)
    name = name.encode('utf-8')
    name = urllib2.quote(name)
    url_tem = "http://csclab11.cs.nthu.edu.tw:5000/?q=%s" % name
    result = urllib2.urlopen(url_tem).read()
    # result = json.load(result)
    # print type(result)
    d = json.loads(result)
    kangxi = HanziConv.toTraditional(d["result"])
    # print d["result"]
    # namelist.append(temp_name)
    # resultlist.append(d["result"])
    # result = get_result(name)
    kangxi = kangxi.encode('utf-8')
    kangxi = urllib2.quote(kangxi)
    url_kang = "http://kxgen.mqstudiotw.com/?%s" % kangxi
    kangxi_result = urllib2.urlopen(url_kang)
    # print kangxi_result
    return render_template('index.html', name=temp_name, result=d["result"])

def segment2(self, sent):
    ssent = HanziConv.toSimplified(sent)
    res = self.segmenter.segment(ssent)
    arr = []
    start = 0
    for i in range(res.size()):
        length = len(res.get(i))
        arr.append(sent[start:start + length])
        start += length
    return arr

def writeDBF(filePattern, fullFilePath, dicInput):
    global dbfFileHandle
    global dbfFileIndex
    global writeMax
    # dbfFileHandle = None
    # dbfFileIndex = None
    insertCount = 0
    updateCount = 0
    bFileExists = os.path.exists(fullFilePath)
    dtWriteDBFStart = datetime.datetime.now()
    # logger.debug("write DBF start")
    today = dtWriteDBFStart.strftime("%Y%m%d")
    fileName = today
    strToken = ""
    if filePattern == "0":
        strToken = "SH"
        fileName += ".SH.txt"
    elif filePattern == "1":
        strToken = "SZ"
        fileName += ".SZ.txt"
    with open(fileName, "w") as text_file:
        for key, value in dicInput.iteritems():
            insertCount += 1
            value = HanziConv.toTraditional(value)
            try:
                value = value.decode("utf8")
            except:
                pass
            strWrite = (u"%s.%s,%s\n" % (key, strToken, value))
            text_file.write(strWrite.encode('utf8'))
    dtWriteDBFEnd = datetime.datetime.now()
    logger.debug("write count : " + str(insertCount) + "/" + str(updateCount))
    logger.debug("write DBF end (" + str(dtWriteDBFEnd - dtWriteDBFStart) + ")")

def traditional_to_simplified(ustring):
    return HanziConv.toSimplified(ustring)

new_lines = []
n = 0
for line in lines:
    if line[0] in "#%":
        new_lines.append(line)
        continue
    try:
        cmd, value = line.strip(' ').decode('utf-8').split(u' ', 1)
    except ValueError as e:
        # '\t' keyboard-mapping section
        new_lines.append(line)
        continue
    newv = HanziConv.toTraditional(value)
    if newv != value:
        # print value,
        # print ' -> ',
        # print newv
        n += 1
    elif len(value.strip()) > 1:
        print value.strip()
        pass
    else:
        newl = line.strip().split(' ')[0].decode('utf-8') + ' ' + newv
        new_lines.append(newl.encode('utf-8'))
print len(lines)
print n

def sort_func(s):
    # Sort key: move a leading "[...]" tag after the name, or strip it.
    m = re.search(ur"^(\[.+?\])(.+?):", s)
    if m:
        s = m.group(2) + m.group(1)
    else:
        m = re.search(ur"^\[.+?\](.*)", s)
        if m:
            s = m.group(1)
    return s


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("input", action="store", nargs=1)
    parser.add_argument("output", action="store", nargs=1)
    parser.add_argument("--encoding", action="store", default="utf_8_sig", nargs=1)
    parser.add_argument("--traditional", action="store_true", default=False)
    args = parser.parse_args()
    buf = codecs.open(args.input[0], "rb", args.encoding).read()
    if args.traditional:
        buf = HanziConv.toTraditional(buf)
    else:
        buf = HanziConv.toSimplified(buf)
    lines = buf.split("\n")
    lines.sort(key=sort_func)
    codecs.open(args.output[0], "wb", args.encoding).writelines(lines)

dic_postive = {}
dic_negative = {}
dic_term_orientation = {}
pos = 0.0
neg = 0.0
oth = 0.0
reader = csv.reader(open("./generated_data/training_file.csv", "rb"))
for row in reader:
    if row[1] == "1":
        pos += 1
    elif row[1] == "0":
        neg += 1
    elif row[1] == "2":
        oth += 1
    flag = row[1]
    temp = HanziConv.toSimplified(row[3])
    words = jieba.cut(temp, cut_all=False)
    word_is_counted = []
    for w in words:
        if w not in word_is_counted:
            word = w.encode('utf8')
            if (word not in punctuation and word not in stop_word_list) and only_nonascii(word) != "":
                if flag == '1':
                    if word not in dic_postive:
                        dic_postive[word] = 2
                    else:
                        dic_postive[word] += 1
                    if word not in dic_negative:
                        dic_negative[word] = 1
                elif flag == '0':
                    if word not in dic_negative:

def simplified_eq(a, b):
    return len(a) == len(b) and \
        HanziConv.toSimplified(a[0]) == \
        HanziConv.toSimplified(b[0])

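# Usage sketch (illustrative values; assumes a and b are sequences such as strings
# or token lists): equal length plus first elements that simplify to the same form.
print(simplified_eq('歷史', '历史'))    # True: same length and '歷' simplifies to '历'
print(simplified_eq('歷史', '历史学'))  # False: lengths differ
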
    # 草 ("grass"); "肏" is the actual character. "艹" is not a real character
    # but it's used this way
    "操你", "草你", "日你",  # f**k you
    "操他", "草他", "日他",  # f**k his
    "操她", "草她", "日她",  # f**k her
    # Discrimination (racial slurs)
    "小日本",    # little Japanese
    "台湾狗",    # Taiwanese dogs
    "共产中国",  # communist Chinese
    "流氓国家",  # rogue country
    "人渣",      # human slag
    "我去",      # this is verbal and bad
    "鬼子"       # devil, usually a suffix
]

BAD = [HanziConv.toSimplified(word) for word in bad_init] + \
      [HanziConv.toTraditional(word) for word in bad_init]

INFORMAL = [
    # Hello
    "你好",      # nǐ hǎo; The standard "hello" greeting.
    "您好",      # nín hǎo; The same "hello" greeting as above
    "你怎么样",  # nǐ zěnmeyàng?; "What's up?", "How are you doing?"
    # Good afternoon
    "午安",      # wǔ'an; note: seldom used in the Mainland.
    "下午好",    # xìawǔ hǎo! Seldom used in the Republic of China
    # Good evening / Good night
    "晚安",      # wǎn'an; Literally "Peace at night", Good night.
    "晚上好",    # wǎnshang hǎo; Good evening!

def gen_response(keyword_list):
    dic = {"笑話": "你想要聽我說個笑話嗎",
           "無聊": "那聽個笑話好嗎"}
    ans = dic[HanziConv.toTraditional(keyword_list[0])]
    print(ans)

def get_json_from_page(page):
    from hanziconv import HanziConv
    stopwords = load_stop_words()
    cat_constrain_set = set(tokenize(HanziConv.toTraditional("。".join(page.categories)), stopwords))
    summary_constrain_set = set(tokenize(HanziConv.toTraditional("。".join(page.summary)), stopwords))
    return get_places(page.title, cat_constrain_set | summary_constrain_set)

def simplify_or_none(text):
    if text is None:
        return None
    else:
        return HanziConv.toSimplified(text)

# [dic_TW, dic_HK, dic_CN] = mdic()
# str_TW = conv(a, dic_TW)
# str_HK = conv(c, dic_HK)
# str_CN = conv(b, dic_CN)
# print a, ' <-> ', str_TW, '\n', c, ' < -> ', str_HK, '\n', b, ' < -> ', str_CN


def check_contain_chinese(check_str):
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False


if __name__ == '__main__':
    fin = codecs.open("zhwiki-20151226-all-titles-in-ns0", "r", "utf-8")
    fout = codecs.open("zhwiki-titles-converted", "w", "utf-8")
    # [dic_TW, dic_HK, dic_CN] = mdic()
    # print(HanziConv.toSimplified("!_"))
    cnt = 0
    while True:
        cnt += 1
        if cnt % 10000 == 0:
            print(cnt)
        line = fin.readline()
        if line == "":
            break
        if check_contain_chinese(line):
            fout.write(HanziConv.toSimplified(line))

def convert_to_simplified(text):
    if u'歷' in text:
        text = text.replace(u'歷', u'历')
    return HanziConv.toSimplified(text)

def chinese_tokenizer(s, lower=True):
    s = unicode(s)
    if lower:
        s = hanzi.toSimplified(s)
    return [t[0] for t in jieba_tokenize(s)]

# coding:utf-8
from hanziconv import HanziConv

stop_file = open("./other_data/stop_word.txt", 'r')
stop_word_array = []
for line in stop_file:
    temp = line.replace("\n", "")
    temp = HanziConv.toSimplified(temp)
    if temp not in stop_word_array:
        stop_word_array.append(temp)

stop_file1 = open("./generated_data/stop_word_final.txt", "w")
for i in stop_word_array:
    stop_file1.write(i.encode('utf8') + "\n")

def get_sentences(page):
    from hanziconv import HanziConv
    sentences = []
    for line in HanziConv.toTraditional(page.content).splitlines():
        sentences.extend(line.split('。'))
    return sentences