Example #1
def convert_to_strings(wikipage):
    # Given a wikipage object, return a structured dictionary
    # that holds all the information from the wikipage.
    from hanziconv import HanziConv
    import wikitextparser as wtp
    import pprint
    try:
        summary = HanziConv.toTraditional(
            wtp.parse(wikipage.content).sections[0].pprint())
    except:
        summary = None
    try:
        sections = [HanziConv.toTraditional(
            sec.pprint()) for sec in wtp.parse(wikipage.content).sections[1:]]
        try:
            sub_titles = [HanziConv.toTraditional(
                sec.title[1:-1]) for sec in wtp.parse(wikipage.content).sections[1:]]
        except:
            sub_titles = None
        try:
            section_content = [s[s.find('\n') + 1:] for s in sections]
        except:
            section_content = None
    except:
        sections = None

    try:
        sections = list(zip(sub_titles, section_content))
    except:
        sections = None
    try:
        links = wikipage.links
    except:
        links = None
    return {'title': wikipage.title, 'summary': summary, 'sections': sections, 'links': links}
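The helper above expects a page object exposing .title, .content, and .links, which matches the wikipedia package's WikipediaPage. A minimal usage sketch, assuming that package; the article title is illustrative:

import wikipedia

wikipedia.set_lang('zh')                       # query the Chinese Wikipedia
page = wikipedia.page('欧几里得')               # illustrative article title
info = convert_to_strings(page)                # structured dict: title, summary, sections, links
print(info['title'], len(info['links'] or []))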
Example #2
def extract_langlinks(sql, fo):
    total = 0
    category = 0
    instance = 0
    template = 0

    o = open(fo, 'w')

    with open(sql) as f:
        for line in f:
            if line.startswith('INSERT'):
                line = line[line.index('('):]
                line = line.strip('\n').strip(';').strip(')').strip('(')  # strip the leading/trailing ( and )
                for tri in line.split('),('):  # split on "),(" to get each item
                    tri = tri.replace("'", '')
                    _id, lan, link = tri.split(',', 2)  # the link may contain commas, so split at most twice
                    if lan == 'zh':
                        total += 1
                        if link.startswith('Category:'):
                            category += 1
                        if link.startswith('Template:'):
                            template += 1
                            print _id, HanziConv.toSimplified(link).encode('utf-8')
                        link = link.replace('_', ' ')
                        o.write('%s\t%s\n'%(_id,HanziConv.toSimplified(link).encode('utf-8')))
    
    instance = total - category - template
    print "Total:%d, Category:%d, Instance:%d, Template:%d"%(total, category, instance, template)
Example #3
def get_names(file_path):
    # get Chinese names, English names and id from json files
    names = []
    keys_en = {'外文名称', '外文名', '英文名称', '英文别名'}
    keys_zh = {'中文名', '又称', '别名', '别称', '中医病名', '中文别名', '中文学名'}
    with open(file_path, 'r') as f:
        data = json.load(f)
    for key, value in data.items():
        title = key
        names.append(title)
        basic = value['基本信息']
        if type(basic) is not dict:
            continue
        for key_basic, value_basic in basic.items():
            if key_basic.replace(' ', '') in keys_en:
                names.extend([
                    name_en.lower() for name_en in basic[key_basic].replace(
                        ';', ',').split(',') if len(name_en) > 0
                ])
            if key_basic.replace(' ', '') in keys_zh:
                names.extend([
                    HanziConv.toSimplified(name_zh) for name_zh in
                    basic[key_basic].replace(';', ',').split(',')
                    if len(name_zh) > 0 and not name_zh == title
                ])
    return title, names
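A minimal call sketch (the path is illustrative; the snippet assumes module-level import json and from hanziconv import HanziConv):

title, names = get_names('data/diseases.json')   # hypothetical JSON file in the expected format
print(title, len(names))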
Example #4
    def process_page(self):
        url = self.found.pop()
        try:
            page = requests.get(url)
            tree = html.fromstring(page.content)
            paragraphs = tree.xpath(
                '//div[@class="mw-parser-output"]/p/text()')
            hrefs = tree.xpath("//div[@class='mw-parser-output']/p/a/@href")

            for p in paragraphs:
                w = get_chinese(p)
                x = hc.toSimplified(w)
                self.characters += Counter(x)
                self.words += Counter(jieba.cut(x, cut_all=False))

            for href in hrefs:
                zhref = 'https://zh.wikipedia.org' + href
                if zhref not in self.visited:
                    self.found.add(zhref)
                    self.found_own.add(zhref)
        except requests.exceptions.ConnectionError as e:
            print(e)
            print('Continuing...')
        except requests.exceptions.ChunkedEncodingError as e:
            print(e)
            print('Continuing...')
        except requests.exceptions.InvalidURL as e:
            print(e)
            print('Continuing...')
        self.visited.add(url)
        self.visited_own.add(url)
Example #5
File: util.py Project: CheHaoKang/US_Stock
    def get_stock_info(self, stock_name, use_proxy=True):
        from hanziconv import HanziConv

        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'http://xueqiu.com/p/ZH010389',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
            'Host': 'xueqiu.com',
            #'Connection':'keep-alive',
            #'Accept':'*/*',
            'cookie':'s=iabht2os.1dgjn9z; xq_a_token=02a16c8dd2d87980d1b3ddced673bd6a74288bde; xq_r_token=024b1e233fea42dd2e0a74832bde2c914ed30e79; __utma=1.2130135756.1433017807.1433017807.1433017807.1;'
            '__utmc=1; __utmz=1.1433017807.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lvt_1db88642e346389874251b5a1eded6e3=1433017809; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1433017809'
        }

        counter = 0
        while counter < self.RETRY:
            counter += 1

            try:
                proxies = {}
                if use_proxy:
                    proxies = self.get_proxy()
                    print("PROXY => {:}".format(proxies))
                res = requests.get("https://xueqiu.com/S/" + stock_name, headers=headers, proxies=proxies, timeout=self.REQUEST_TIMEOUT)
                reGetStockInfo = re.compile(r"profile-detail.*?\">(.*?)<", re.S | re.UNICODE)
                for stockInfo in reGetStockInfo.findall(res.text):
                    return HanziConv.toTraditional(stockInfo)
            except:
                traceback.print_exc()
                time.sleep(1)

        return ''
Example #6
def data_prepare():
    """"""
    # 提前下好, gensim的wiki 词向量训练
    
    article_num = 0   ## 预先设置训练检测

    wiki = WikiCorpus('zhwiki-latest-pages-articles.xml.bz2',lemmatize=False, dictionary={})
    
    stopwords = [line.strip() for line in open('stopwords.txt', 'r', encoding='utf-8').readlines()]
    #with open('stopwords.txt','r',encoding='utf8').readlines() as f:
        #stopwords = [ w.strip() for w in f] 
    #stopwords = codecs.open('stopwords.txt','r',encoding='utf8').readlines()
    #stopwords = [ w.strip() for w in stopwords ] 
    start = time.time()
    for text in wiki.get_texts():
        text = ' '.join(text)
        text = HanziConv.toSimplified(text)
        #re.sub('[:·•’!\"#$%&\'()*+,,-./::;;<=>?@,。?★、…【】《》?“”〞‘’![\\]^_`{}()~]+', "", text)
        text = text.strip()
        seg_list = list(jieba.cut(text))
        # ['歐幾里', '得', ' ', '西元前', '三世', '紀的', '古希臘', '數學家', ' ', '現在', '被', '認
        #  '是', '幾何', '之父', ' ', '此畫', '為拉斐爾', '的', '作品', ' ', '雅典', '學院']
        new_text = [x for x in seg_list  if  (re.compile(u'[\u4e00-\u9fa5]+').search(x) or \
                        re.compile("[\"”“,??\,\.。,0-9]+").search(x)) and (x not in stopwords)]
        #new_text = [x for x in seg_list if re.compile('[^a-zA-Z]+').search(x) and x != ' ']  ## the original version used len(x) > 1, which does not work here
        
        article_num = article_num + 1
        if article_num == 10:
            break        
        yield new_text
Example #7
 def process_poetry(
     self,
     data_dir='/media/pony/DLdigest/data/languageModel/chinese-poetry/json'
 ):
     save_dir = os.path.join(self.save_dir, 'poem')
     check_path_exists(save_dir)
     count = 0
     for entry in os.scandir(data_dir):
         if entry.name.startswith('poet'):
             with open(entry.path, 'r') as json_file:
                 poems = json.load(json_file)
                 for p in poems:
                     paras = HanziConv.toSimplified(''.join(
                         p['paragraphs']).replace('\n', ''))
                     paras = filter_punctuation(paras)
                     for para in paras.split(' '):
                         if len(para.strip()) != 0:
                             pys = ' '.join(
                                 np.array(pinyin(para)).flatten())
                             with open(
                                     os.path.join(
                                         save_dir,
                                         str(count // 400000 + 1) + '.txt'),
                                     'a') as f:
                                 f.write(para + ',' + pys + '\n')
                             count += 1
Example #8
def preprocess():
    """
    使用gensim中的WikiCorpus库提取wiki的中文语料,并将繁体转成简体中文。
    然后利用jieba的分词工具将转换后的语料分词并写入一个txt
    每个wiki文档的分词结果写在新txt中的一行,词与词之间用空格隔开

    !!!     这个要windows上要跑2个多小时的      !!!

    :return: 对zhwiki...bz2进行提取,并将繁体字转为简体字,存的reduced_zhwiki.txt

    ========================
    from gensim.corpora import WikiCorpus
    import jieba
    from langconv import *       ——这个直接从网上下载后放在同一目录即可
    ========================
    """
    count = 0
    zhwiki_path = './data/zhwiki-20190720-pages-articles-multistream.xml.bz2'
    f = open('./data/reduced_zhwiki.txt', 'w', encoding='utf8')  # rename the output after each successful run so a rerun does not overwrite it
    wiki = WikiCorpus(zhwiki_path, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        word_list = []
        for sentence in text:
            sentence = HanziConv.toSimplified(sentence)  # convert Traditional to Simplified
            seg_list = jieba.cut(sentence)  # segment with jieba
            for seg in seg_list:
                word_list.append(seg)
        f.write(' '.join(word_list) + '\n')
        count += 1
        if count % 200 == 0:
            print("Saved " + str(count) + ' articles')

    f.close()
Example #9
def clean(text):
    text = text.strip()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub("\s*", "", text)
    return text
Example #10
def translate(translate_file_path):
    with open(file=translate_file_path, mode="r", encoding="utf-8") as file:
        content = file.read()
    with open(file=translate_file_path, mode="w", encoding="utf-8") as file:
        if content:
            content = HanziConv.toTraditional(content)
            file.write(content)
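A minimal usage sketch (the path is illustrative); the file is read and rewritten in place with its content converted to Traditional Chinese:

from hanziconv import HanziConv   # assumed module-level import in the original file

translate('subtitles/episode_01.txt')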
Example #11
File: util.py Project: CheHaoKang/US_Stock
    def get_Xueqiu_categories(self):
        from hanziconv import HanziConv
        from selenium import webdriver
        from webdriver_manager.chrome import ChromeDriverManager

        url = 'https://xueqiu.com/hq#exchange=US&industry=3_2&firstName=3&page=1'
        while 1:
            try:
                driver = webdriver.Chrome(ChromeDriverManager().install())
                driver.get(url)
                driver.implicitly_wait(10)

                soup = BeautifulSoup(driver.page_source, 'html.parser')
                categories = {}
                for ele in soup.find_all('i', {'class' : 'list-style'}):
                    if re.search("明星股", ele.parent.text):
                        for li in ele.parent.find_all('li'):
                            key = HanziConv.toTraditional(li.text).strip()
                            link = "https://xueqiu.com/hq{}".format(li.select('a')[0]['href'].strip())
                            categories[key] = link

                driver.quit()
                break
            except:
                traceback.print_exc()
                driver.quit()

        self.GICS_csvs(categories)
Example #12
    def _LoadCorpus(self):
        print(f"Loading corpus from: {self.CorpusPath}")
        if self.SeparatedBySpace:
            lines = [
                line for line in open(self.CorpusPath, 'r').readlines()
                if line.find("doc id=") < 0
            ]
            Corpus = ' '.join(lines)
            Tokens = set(Corpus.split())
            self.Vocabulary = [token.replace('\n', '') for token in Tokens]
            if self.is_Arabic:
                self.Vocabulary = [
                    word for word in self.Vocabulary
                    if len(set(word + arabic_alphabet)) == size_arabic
                ]
            self.CharSet = set(''.join(self.Vocabulary))
            print(f"voc size={len(self.Vocabulary)}")
        else:
            lines = [
                line for line in open(self.CorpusPath, 'r').readlines()
                if line.find("doc id=") < 0
            ]
            random.shuffle(lines)
            line_num = len(lines)
            lines = lines[:line_num // 3]

            self.Corpus = ''.join(lines).replace('\n', '').replace(' ', '')
            if self.CorpusPath.find('corpus_zh') >= 0:
                from hanziconv import HanziConv
                self.Corpus = HanziConv.toSimplified(self.Corpus)
                print("Transformed to simplified Chinese")
            self.Vocabulary = None
            self.CharSet = set(self.Corpus)
            print(f"char count={len(self.CharSet)}")
Example #13
def open_dataset(path, simplified, limit_length):
    ###############################################################################
    # This function opens a txt file (for labels) or a utf8 file (for the dataset),
    # removing all sentences shorter than a specific number of characters
    # (just for the training set), splitting words also at punctuation,
    # and converting the dataset from Traditional Chinese to Simplified Chinese if needed.
    #
    # Input:
    #   path: path of the file
    #   simplified: Boolean flag controlling whether the dataset is converted to Simplified Chinese
    #   limit_length: minimum length; only sentences longer than this value are kept
    #
    # Output:
    #   chinese_sentences: list of sentences
    #
    ###############################################################################

    # open the file (whether the dataset is the training set or the dev set)
    with open(path, 'r', encoding='utf8') as file:

        if simplified:
            chinese_sentences = [
                split_punctuation(line.strip().split()) for line in file
                if len(line.strip().split()) > limit_length
            ]

        else:
            chinese_sentences = [
                HanziConv.toSimplified(split_punctuation(line.strip().split()))
                for line in file if len(line.strip().split()) > limit_length
            ]

    return chinese_sentences
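A hedged call sketch (the path and threshold are illustrative; the snippet also relies on the project's split_punctuation helper); it returns a list of tokenized sentences longer than limit_length:

train_sentences = open_dataset('data/training.utf8', simplified=True, limit_length=4)
print(len(train_sentences))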
Example #14
    def token_normalize(self, token):
        # map url_xxx tokens to a plain 'url' token  # TODO: more normalization
        # if token == ' ':
        #     return self.pad_token

        if 'url_' in token:
            token = 'url'

        # if token.lower() in {'win10', 'win7', 'windows10', 'windows7', 'windows8', 'window7',
        #                      'windows10windows', 'windows2000', 'windows7windows', 'windowxp',
        #                      'windownt', 'window10', 'windowswindows', 'windows98', 'windows9x'}:
        #     return 'Windows'
        #
        # # iphone
        # token = re.sub('iphone(.+)', 'iphone', token.lower())

        # convert Traditional characters to Simplified
        token = HanziConv.toSimplified(token)
        # # normalize tokens that mix digits and punctuation
        # token = re.sub(r'(\d+)\.', '\g<1>', token)
        # token = re.sub(r'\.(\d+)', '\g<1>', token)
        # 唿 -> 呼
        # token = re.sub('唿', '呼', token)
        # token = re.sub(
        #     r'(第*)(有|多|几|半|一|两|二|三|四|五|六|七|八|九|十)(种|个|次|款|篇|天|步|年|大|条|方|位|键|份|项|周|层|只|套|名|句|件|台|部|页|段|把|片|小时|遍|颗|根|批|张|分|性|点点|场|分钟|组|堆|本|圈|季|笔|群|斤|日|支|排|章|所|股|门|首|代|号|生|点|辆|轮|瓶|声|杯|列|座|集)',
        #     '\g<2>', token)

        return token
Example #15
def sentence_cmn(rspecifier, word_level=True):
    if os.path.isdir(rspecifier):
        for f in os.listdir(rspecifier):
            full_name = os.path.join(rspecifier, f)
            for s in sentence_cmn(full_name, word_level):
                yield s

    else:
        with codecs.open(rspecifier, 'rb', 'utf8') as fp:
            for line in fp:
                line = HanziConv.toSimplified(line.strip())
                if word_level:
                    sent = line.split()
                else:
                    sent = []
                    for w in line.split():
                        has_chinese = any(u'\u4e00' <= c <= u'\u9fff'
                                          for c in w)
                        if has_chinese:
                            sent.extend(list(w))
                        else:
                            sent.append(w)

                yield [
                    u'<numeric>'
                    if re.match(__has_digit_but_no_letter, w) else w
                    for w in sent
                ]
Example #16
def clean(text):
    text = text.strip()
    text = tokenization.convert_to_unicode(text)
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub(u"\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    text = re.sub(u"\\s*", "", text)
    return text
Example #17
def to_S(k):
    txt=X.content[k].strip()
    txt = re.sub('\t|\r', '\n',txt)
    txt = txt.replace('\n\n', '\n')
    txt = re.sub('  |\u3000', ' ', txt)
    txt=HanziConv.toSimplified(txt)
    txt=txt.strip()
    return (X.shop_url[k], X.post_time[k],txt,int(X.score[k]),len(txt))
Example #18
 def replace(x):
     x = x.replace('"', "").replace("\r\n",
                                    " ").replace("\n",
                                                 " ").replace(",", ",")
     x = HanziConv.toSimplified(x)
     x = [a for a in cut(x) if a not in stop_words]
     x = " ".join(x)
     return x
Example #19
 def down(self, song_name, song_artist):
     for ext in SONG_EXT:
         for src, results in search_results.items():
             for result in results:
                 #print(result)
                 print(src + ' - ' + result['songname'] + ' - ' +
                       result['singers'] + ' - ' + result['ext'])
                 if song_name.lower() in HanziConv.toSimplified(
                         result['songname'].lower()):
                     if song_artist.lower() in HanziConv.toSimplified(
                             result['singers'].lower()):
                         if ext in result['ext'].lower():
                             self.client.download([result])
                             #print('matches')
                             return True
     #print('no match')
     return False
Example #20
File: pre.py Project: JohanyCheung/fsauor
def tochar(x):
    x = re.sub('\n|\t|\r| |"|。。|!!|…', ' ', x)
    x = re.sub('\n|\t|\r| |"|。。|!!', ' ', x)
    x = re.sub('\x05|\x06|\x07|\.\.|\.\.\.', ' ', x)
    x = HanziConv.toSimplified(x)
    x = list(x.strip())
    x = [a for a in x if len(a.strip()) > 0]
    return ' '.join(x)
Example #21
def pre_process(text):
    text = HanziConv.toTraditional(text)

    # load cantonese corpus
    # jb.load_userdict('util/dict/canto_dict.txt')
    vocabs = list(jb.cut(text))
    pp_text = " ".join(vocabs)
    return pp_text
Example #22
def get_word_list(query):
    # convert Traditional to Simplified
    query = HanziConv.toSimplified(query.strip())
    # convert uppercase to lowercase
    query = query.lower()
    # segment with jieba
    words = ' '.join(jieba.cut(query)).split(" ")
    return words
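A minimal usage sketch (the snippet assumes module-level imports of jieba and HanziConv); the query is converted to Simplified Chinese, lower-cased, and segmented:

words = get_word_list('繁體中文測試')
print(words)   # segmented Simplified tokens, e.g. ['繁体', '中文', '测试']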
Example #23
 def simplified_to_traditional(self):
     logging.info("等待中..(簡 to 繁)")
     traditional = open("traditional.txt", "w", encoding="utf-8")
     with open("wiki_text.txt", "r", encoding="utf-8") as simplified:
         for s in simplified:
             traditional.write(HanziConv.toTraditional(s))
     print("成功簡體轉繁體!")
     traditional.close()
Example #24
def toTraditional(filename, content):

    content_trans = HanziConv.toTraditional(content)
    
    if content_trans != content:
        # Write with utf8 encoding
        with open(filename, 'w', encoding='UTF-8-SIG') as file:
            file.write(content_trans)
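A minimal usage sketch (the file name is illustrative); the file is rewritten only when the converted text actually differs from the original:

content = open('notes.txt', encoding='utf-8').read()   # read the current text
toTraditional('notes.txt', content)                    # rewrites the file only if conversion changed it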
Example #25
def simplified_to_traditional():
    logging.info("等待中..(繁 to 簡)")
    simple = open("w2vSimplified.txt", "w", encoding="utf-8")
    with open("w2v.txt", "r", encoding="utf-8") as traditional:
        for t in traditional:
            simple.write(HanziConv.toSimplified(t))
    print("成功繁體轉簡體!")
    simple.close()
Example #26
def generate_qimai_addition_dataset(model_type="bert"):
    test_df = pickle_load(path_cache / "test_df.pkl")
    qimai_test_id = pickle_load(path_cache / f"{model_type}_qimai_test_id.pkl")
    appname2appdesc = pickle_load(path_cache / "appname2appdesc.pkl")
    apkname2appdesc = pickle_load(path_cache / "apkname2appdesc.pkl")

    test_df["appname"] = test_df["new_appname"]
    qimai_test_df = test_df.merge(apkname2appdesc)
    qimai_test_df = qimai_test_df[["appname", "app_desc"]]
    chusai_test_df = pickle_load(path_cache / "chusai_test_df.pkl")
    chusai_test_df = chusai_test_df.loc[~chusai_test_df["appname"].isna(),
                                        ["appname", "app_desc"]]

    appname2appdesc = pd.concat(
        [appname2appdesc, chusai_test_df, qimai_test_df], axis=0, sort=False)
    appname2appdesc["desc_len"] = appname2appdesc["app_desc"].str.replace(
        "[\x00-\xff”“•]", "").str.len()
    appname2appdesc["appname"] = appname2appdesc["appname"].str.lower(
    ).str.replace(" ", "")
    appname2appdesc["appname"] = [
        HanziConv.toSimplified(x) for x in appname2appdesc["appname"]
    ]
    appname2appdesc = appname2appdesc.sort_values("desc_len").drop_duplicates(
        "appname", keep="last")
    appname2appdesc = appname2appdesc.loc[appname2appdesc["desc_len"] >= 8]

    test_df_new = test_df.copy()
    test_df_new = test_df_new.loc[~test_df["id"].isin(qimai_test_id)]
    test_df_new["appname"] = test_df_new["appname"].str.lower().str.replace(
        " ", "")
    test_df_new["appname"] = [
        HanziConv.toSimplified(x) for x in test_df_new["appname"]
    ]
    test_df_new = test_df_new.merge(appname2appdesc)

    qimai_addition_test_id = test_df_new["id"].tolist()
    qimai_addition_test_dataset = generate_tensor_data(test_df_new["app_desc"],
                                                       model_type)
    qimai_addition_test_dataset = TensorDataset(*qimai_addition_test_dataset)

    pickle_save(qimai_addition_test_id,
                path_cache / "qimai_addition_test_id.pkl")
    pickle_save(
        qimai_addition_test_dataset,
        path_tensor_dataset / f"{model_type}_qimai_addition_test_dataset.pkl")
Example #27
def clear_text(x):
    x = BeautifulSoup(x, 'html.parser').text
    x = html.unescape(x)
    x = HanziConv.toSimplified(x)
    x = re.sub(r'\s+', '', x)  # remove whitespace (tabs, newlines); \s matches any whitespace character
    x = re.sub(r'[\((【](.*?)[\))】]', '', x)  # drop bracketed spans; the ? makes .*? non-greedy so it does not also consume the following bracket
    x = re.sub(r'([–-—=…]*)', '', x)
    x = x.strip()
    return x
Example #28
 def Transform_ZhTw_Save(self, File_Name, Next_FileName):
     FileRead = []
     with open(File_Name, 'rb') as RawFile:
         for line in RawFile:
             FileRead.append(HanziConv.toTraditional(line))
     with open(Next_FileName, 'wb') as Next_File:
         for i in range(len(FileRead)):
             for j in range(len(FileRead[i])):
                 Next_File.write(FileRead[i][j].encode('utf-8'))
Example #29
def terms2VecIDs(terms):
    ans = []
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term)) #Problem: Some terms are not pretrained, like '食记','咖哩','捷运'
        if ID == None:
            ans.append(0)
        else:
            ans.append(ID)
    return ans
Example #30
 def process_text(self):
     logging.info("等待中..(簡 to 繁)")
     with open('./word2vec_data/traditional.txt', 'w',
               encoding='utf-8') as fw:
         with open('./word2vec_data/wiki_text.txt', 'r',
                   encoding='utf-8') as f:
             for line in f:
                 line = HanziConv.toTraditional(line)
                 fw.write(line)
Example #31
 def __iter__(self):
     for content, (page_id, title) in self.wiki.get_texts():
         yield doc2vec.LabeledSentence(
             # 1. for each element c in content,
             # 2. convert it to Simplified Chinese and segment it with jieba,
             # 3. collect the resulting words into the words list
             words=[w for c in content
                    for w in jieba.cut(HanziConv.toSimplified(c))],
             tags=[title])
Example #32
def inputTest():
    x = input("請說話:")
    # x:token
    y = jerry.get_response(x)
    y = HanziConv.toTraditional(y.text)

    print(type(x))
    print(type(y))
    print(y)
Example #33
def create_post():
    form = PostForm()
    if form.validate_on_submit():
        chinese = HanziConv.toTraditional(form.chinese_content.data)
        title = HanziConv.toTraditional(form.title.data)
        post = Post(author=current_user,
                    title=title,
                    chinese_content=chinese,
                    content=form.content.data,
                    tags=form.tags.data)
        db.session.add(post)
        db.session.commit()
        flash('Your post has been created!', 'success')
        return redirect(url_for('home'))
    return render_template('create_post.html',
                           title='New Post',
                           form=form,
                           legend='New Post')
Example #34
def terms2Vec(terms):
    vec = np.zeros(len(embeddings[0]))
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term)) 
        if ID == None:
            vec += embeddings[0]
        else:
            vec += embeddings[ID]
    vec /= len(terms)
    return vec
Example #35
def terms2Vec(terms):
    vec = np.zeros(len(embeddings[0]))
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term)) #Problem: Some terms are not pretrained, like '食记','咖哩','捷运'
        if ID == None:
            vec += embeddings[0]
        else:
            vec += embeddings[ID]
    vec /= len(terms)
    return vec
Example #36
def get_download_url(name, ep, keyword, translation_team, **dict):
    """
    Search download url in dmhy.org
    """
    root_url = 'https://share.dmhy.org'
    payload = {'keyword': keyword + ' ' + '{:0>2}'.format(ep)}
    user_agent = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML,'
                      'like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    }
    print('DMHY scraper is searching for {} of {}'.format(ep, name))
    content = net.request_get_content(root_url + '/topics/list',
                                      retry=5,
                                      params=payload)
    soup = bs4.BeautifulSoup(content, 'lxml')
    trs = soup.find_all('tr')
    if len(trs) == 0:
        raise FileNotFoundError
    found_flag = False
    download_url = ''
    unified_name = name.lower()
    print('Unified name:{}'.format(unified_name))
    # Skip the table header
    for tr in trs[1:]:
        a = tr.select('td.title > a')[0]
        # Check the correctness of entry
        entry_desc = ''
        for string in a.strings:
            entry_desc += string
        # Strip surrounding whitespace and convert to Simplified Chinese
        entry_desc = HanziConv.toSimplified(entry_desc.strip())
        try:
            print('Searching: {0}'.format(entry_desc))
        except:
            print('Experiencing encoding problem, but search is still going on.')
            print('Searching:', entry_desc.encode('utf-8'))
        unified_entry_desc = entry_desc.lower()
        if unified_name in unified_entry_desc:
            # Translation team check
            if (translation_team != []
                    and not any(trans_t.lower() in unified_entry_desc for trans_t in translation_team)):
                continue
            download_page_url = a['href']
            print('download_page link:{0}'.format(download_page_url))
            download_page_content = net.request_get_content(
                root_url + download_page_url,
                retry=5)
            soup1 = bs4.BeautifulSoup(download_page_content, 'lxml')
            url_list = soup1.find(id='tabs-1')
            p = url_list.find('p')
            download_url = p.find('a')['href']
            break
    if download_url == '':
        raise FileNotFoundError
    return "https:" + download_url
Example #37
def hello():
    name = request.form['checking']
    temp_name = HanziConv.toTraditional(name)
    # name = HanziConv.toSimplified(name)
    name = name.encode('utf-8')
    name = urllib2.quote(name)
    url_tem= "http://csclab11.cs.nthu.edu.tw:5000/?q=%s"%name
    result = urllib2.urlopen(url_tem).read()
    #result = json.load(result)
    # print type(result)
    d = json.loads(result)
    kangxi=HanziConv.toTraditional(d["result"])
    # print d["result"]
    # namelist.append(temp_name)
    # resultlist.append(d["result"])
    # result = get_result(name)
    kangxi=kangxi.encode('utf-8')
    kangxi=urllib2.quote(kangxi)
    url_kang="http://kxgen.mqstudiotw.com/?%s"%kangxi
    kangxi_result = urllib2.urlopen(url_kang)
    #print kangxi_result
    return render_template('index.html', name=temp_name,result=d["result"])
Example #38
    def segment2( self, sent ):
        ssent = HanziConv.toSimplified( sent )
        
        res = self.segmenter.segment( ssent )

        arr = []
        
        start = 0
        for i in range( res.size() ):
            length = len(res.get(i))
            arr.append( sent[start:start+length] )
            start += length

        return arr
Example #39
def writeDBF(filePattern, fullFilePath, dicInput):
	global dbfFileHandle
	global dbfFileIndex
	global writeMax
	# dbfFileHandle = None
	# dbfFileIndex = None

	insertCount = 0; updateCount = 0;
	bFileExists = os.path.exists(fullFilePath) 

	dtWriteDBFStart = datetime.datetime.now()
	# logger.debug("write DBF start")
	today = dtWriteDBFStart.strftime("%Y%m%d")
	fileName = today
	strToken = ""
	if filePattern == "0":
		strToken = "SH"
		fileName += ".SH.txt"
	elif filePattern == "1":
		strToken = "SZ"
		fileName += ".SZ.txt"

	with open(fileName, "w") as text_file:
		for key, value in dicInput.iteritems():
			insertCount += 1

			value = HanziConv.toTraditional(value)
			try:
				value = value.decode("utf8")
			except:
				pass

			strWrite = (u"%s.%s,%s\n" % (key, strToken, value))
			text_file.write(strWrite.encode('utf8'))

	dtWriteDBFEnd = datetime.datetime.now()

	logger.debug("write count : " + str(insertCount) + "/" + str(updateCount))
	logger.debug("write DBF end (" + str(dtWriteDBFEnd - dtWriteDBFStart) + ")")
Example #40
def traditional_to_simplified(ustring):
    return HanziConv.toSimplified(ustring)
Example #41
new_lines = []

n = 0
for line in lines:
    if line[0] in "#%":
        new_lines.append(line)
        continue
    try:
        cmd, value = line.strip(' ').decode('utf-8').split(u' ', 1)
    except ValueError as e:
        # '\t' keyboard-mapping section
        new_lines.append(line)
        continue

    newv = HanziConv.toTraditional(value)
    if newv != value:
        # print value ,
        # print ' -> ',
        # print newv
        n += 1
    elif len(value.strip()) > 1:
        print value.strip()
        pass
    else:
        newl = line.strip().split(' ')[0].decode('utf-8') + ' ' + newv
        new_lines.append(newl.encode('utf-8'))

print len(lines)
print n    
Example #42
        m = re.search(ur"^(\[.+?\])(.+?):", s)
        if m:
            s = m.group(2) + m.group(1)
        else:
            m = re.search(ur"^\[.+?\](.*)", s)
            if m:
                s = m.group(1)
    return s

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument("input", action="store", nargs = 1)
    parser.add_argument("output", action="store", nargs = 1)
    parser.add_argument("--encoding", action="store", default="utf_8_sig", nargs=1)
    parser.add_argument("--traditional", action="store_true", default=False)
    args = parser.parse_args()

    buf = codecs.open(args.input[0], "rb", args.encoding).read()

    if args.traditional:
        buf = HanziConv.toTraditional(buf)
    else:
        buf = HanziConv.toSimplified(buf)

    lines = buf.split("\n")
    lines.sort(key = sort_func)
    codecs.open(args.output[0], "wb", args.encoding).writelines(lines)
Example #43
dic_postive ={}
dic_negative = {}
dic_term_orientation = {}
pos = 0.0
neg = 0.0
oth = 0.0
reader = csv.reader(open("./generated_data/training_file.csv", "rb"))
for row in reader:
    if row[1] == "1":
        pos += 1
    elif row[1] == "0":
        neg += 1
    elif row[1] == "2":
        oth += 1
    flag = row[1]
    temp = HanziConv.toSimplified(row[3])
    words = jieba.cut(temp, cut_all=False)
    word_is_counted = []
    for w in words:
        if w not in word_is_counted:
            word = w.encode('utf8')
            if (word not in punctuation and word not in stop_word_list) and only_nonascii(word) != "":
                if flag == '1':
                    if word not in dic_postive:
                        dic_postive[word] = 2
                    else:
                        dic_postive[word] += 1
                    if word not in dic_negative:
                        dic_negative[word] = 1
                elif flag == '0':
                    if word not in dic_negative:
Example #44
def simplified_eq(a, b):
    return len(a) == len(b) and \
           HanziConv.toSimplified(a[0]) == \
           HanziConv.toSimplified(b[0])
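A small usage sketch; note the comparison only looks at the overall lengths and the first character of each argument after conversion:

simplified_eq('漢字', '汉字')   # True: equal length, first characters match once simplified
simplified_eq('漢字', '漢')     # False: lengths differ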
Example #45
    # 草 ("grass"), "肏" is the actual character.  "艹" is not a real character
    # but it's used this way
    "操你", "草你", "日你",  # f**k you
    "操他", "草他", "日他",  # f**k his
    "操她", "草她", "日她",  # f**k her

    # Discrimination (racial slurs)
    "小日本",  # little Japanese
    "台湾狗",  # Taiwanese dogs
    "共产中国",  # communist Chinese
    "流氓国家",  # rogue country
    "人渣",  # human slag
    "我去",  # this is verbal and bad
    "鬼子"  # devil, usually a suffix
]
BAD = [HanziConv.toSimplified(word) for word in bad_init] + \
      [HanziConv.toTraditional(word) for word in bad_init]

INFORMAL = [
    # Hello
    "你好",  # nǐ hǎo; The standard "hello" greeting.
    "您好",  # nín hǎo; The same "hello" greeting as above
    "你怎么样",  # nǐ zěnmeyàng?; "What's up?", "How are you doing?"

    # Good afternoon
    "午安",  # wǔ'an; note: seldom used in the Mainland.
    "下午好",  # xìawǔ hǎo! Seldom used in the Republic of China

    # Good evening / Good night
    "晚安",  # wǎn'an; Literally "Peace at night", Good night.
    "晚上好",  # wǎnshang hǎo; Good evening!
Example #46
def gen_response(keyword_list):
    dic = {"笑話":"你想要聽我說個笑話嗎", "無聊":"那聽個笑話好嗎"}

    ans = dic[HanziConv.toTraditional(keyword_list[0])]
    print(ans) 
Example #47
def get_json_from_page(page):
    from hanziconv import HanziConv
    stopwords = load_stop_words()
    cat_constrain_set = set(tokenize(HanziConv.toTraditional("。".join(page.categories)),stopwords))
    summary_constrain_set = set(tokenize(HanziConv.toTraditional("。".join(page.summary)),stopwords))
    return get_places(page.title,cat_constrain_set|summary_constrain_set)
Example #48
def simplify_or_none(text):
    if text is None:
        return None
    else:
        return HanziConv.toSimplified(text)
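A tiny usage sketch of the None-safe wrapper:

simplify_or_none('繁體中文')   # -> '繁体中文'
simplify_or_none(None)         # -> None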
Example #49
#     [dic_TW, dic_HK, dic_CN] = mdic()
#     str_TW = conv(a, dic_TW)
#     str_HK = conv(c, dic_HK)
#     str_CN = conv(b, dic_CN)
# print a, ' <-> ', str_TW, '\n', c, ' < -> ', str_HK, '\n', b, ' < -> ',
# str_CN


def check_contain_chinese(check_str):
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False

if __name__ == '__main__':
    fin = codecs.open("zhwiki-20151226-all-titles-in-ns0", "r", "utf-8")
    fout = codecs.open("zhwiki-titles-converted", "w", "utf-8")
    #[dic_TW, dic_HK, dic_CN] = mdic()
    # print(HanziConv.toSimplified("!_"))
    cnt = 0
    while(True):
        cnt += 1
        if(cnt % 10000 == 0):
            print(cnt)
        line = fin.readline()
        if(line == ""):
            break
        if(check_contain_chinese(line)):
            fout.write(HanziConv.toSimplified(line))
Example #50
def convert_to_simplified(text):
    if u'歷' in text:
        text = text.replace(u'歷', u'历')
    return HanziConv.toSimplified(text)
Example #51
def chinese_tokenizer(s, lower=True):
    s = unicode(s)
    if lower:
        s = hanzi.toSimplified(s)
    return [t[0] for t in jieba_tokenize(s)]
Example #52
#coding:utf-8
from hanziconv import HanziConv

stop_file = open("./other_data/stop_word.txt", 'r')
stop_word_array = []
for line in stop_file:
    temp = line.replace("\n", "")
    temp = HanziConv.toSimplified(temp)
    if temp not in stop_word_array:
        stop_word_array.append(temp)

stop_file1 = open("./generated_data/stop_word_final.txt", "w")
for i in stop_word_array:
    stop_file1.write(i.encode('utf8')+"\n")
Example #53
def get_sentences(page):
    from hanziconv import HanziConv
    sentences = []
    for line in HanziConv.toTraditional(page.content).splitlines():
        sentences.extend(line.split('。'))
    return sentences