Example #1
def convert1():
    pos_hk = path.join(DIR, "pos_hk.txt")
    neg_hk = path.join(DIR, "neg_hk.txt")

    pos_tw = path.join(DIR, "pos_tw.txt")
    neg_tw = path.join(DIR, "neg_tw.txt")

    with open(POS) as pos, \
            open(NEG) as neg, \
            open(pos_hk, "w") as pos_hk, \
            open(neg_hk, "w") as neg_hk, \
            open(pos_tw, "w") as pos_tw, \
            open(neg_tw, "w") as neg_tw:
        pos = pos.read()
        neg = neg.read()
        s2hk = OpenCC('s2hk')

        pos_converted = s2hk.convert(pos)
        neg_converted = s2hk.convert(neg)

        pos_hk.write(pos_converted)
        neg_hk.write(neg_converted)

        s2tw = OpenCC('s2tw')
        pos_converted = s2tw.convert(pos)
        neg_converted = s2tw.convert(neg)

        pos_tw.write(pos_converted)
        neg_tw.write(neg_converted)
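
Note that convert1() depends on module-level DIR, POS, and NEG constants defined elsewhere in its project, and it rebinds the path variables (pos_hk, neg_hk, ...) as file handles inside the with block. A minimal self-contained sketch of the same pattern, with placeholder file names instead of the project's constants:

from os import path
from opencc import OpenCC

def convert_corpus(src, out_dir):
    # Write Hong Kong and Taiwan variants of a Simplified Chinese text file.
    with open(src, encoding="utf-8") as f:
        text = f.read()
    for config in ("s2hk", "s2tw"):
        converted = OpenCC(config).convert(text)
        out_name = "%s.%s.txt" % (path.basename(src), config)
        with open(path.join(out_dir, out_name), "w", encoding="utf-8") as f:
            f.write(converted)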
Example #2
def xml_to_json():
    """
    <step1>
    1. Convert Simplified to Traditional
    2. Convert XML to JSON
    3. Convert full-width symbols to half-width
    """
    openCC = OpenCC('s2t')  # Simplified to Traditional

    tree = ET.parse('./corpus/corpus.xml')
    root = tree.getroot()

    output_list = []
    c = 0
    nothing = 0
    for doc in root.findall('doc'):
        c += 1
        if c % 10000 == 0:
            print('----處理進度 %d----' % c)

        output_dict = {}
        content = doc.find('content').text
        title = doc.find('contenttitle').text
        if content and title:
            output_dict['abstract'] = openCC.convert(_full_to_half(title))
            output_dict['article'] = openCC.convert(_full_to_half(content))
            output_list.append(output_dict)
        else:
            nothing += 1
            if nothing % 1000 == 0:
                print('沒東西筆數 %d' % nothing)
    with open('corpus/corpus_1.json', 'w') as wf:
        json.dump(output_list, wf)
Example #3
def getStockInfo(stock):
    cc = OpenCC('s2t')
    stockCode = stock.split('.')[0]
    s = '{:05d}'.format(int(stockCode))
    session = HTMLSession()
    session.browser  # touch the lazy browser property so render() has a browser ready
    url = 'http://stock.finance.sina.com.cn/hkstock/quotes/{0}.html'.format(s)
    stock_dict = {}
    r = session.get(url)
    r.html.render()
    name = r.html.find('#stock_cname', first=True)
    price = r.html.find('#mts_stock_hk_price', first=True)
    stockQuan = r.html.find('div.deta03',
                            first=True).find('ul')[1].find('li')[3]
    news = r.html.find('#js_ggzx', first=True).find('a')
    # print("{0} ({1})".format(cc.convert(name.text), s))
    # print(price.text)
    # print(cc.convert(stockQuan.text))
    for new in news:
        print(new.text, new.links)
    stock_dict = {
        'stock_name': cc.convert(name.text),
        'stock_code': s,
        'stock_price': price.text,
        'stock_quan': cc.convert(stockQuan.text),
        'news': news
    }
    return stock_dict
Example #4
    def sortData(self, tagResults):
        cc=OpenCC('s2t')
        # resultForTotalSize = self.getCallResult(1,1,None)
        dataForRakuten = {}
        sortedData = []

        for record in tagResults:
            extend_data = []
            customer_info = []
            record_session_id = record['session_id']

            # fetch ASR result - begin
            asr_result = self.getChatRecords(record_session_id)
            if asr_result['status']!=0:
                return ("Call api: " + const.GET_ASR_RESULT_API + ", got wrong status: " + str(asr_result['status']) + ", message: " + asr_result['message'])
            record['asr_result']=json.loads(cc.convert(json.dumps(asr_result['result']['data'])))
            # fetch ASR result - end

            log.debug(record['extend_data'])
            for key, value in record['extend_data'].items():
                regex = re.search('^\\*(.+)', key)
                if regex:
                    extend_data.append({"session_id":record_session_id,"col_name":cc.convert(regex.group(1)),"value":cc.convert(value)})
                else:
                    customer_info.append({"session_id":record_session_id,"col_name":cc.convert(key),"value":cc.convert(value)})
            record['extend_data']=extend_data
            record['customer_info']=customer_info
            sortedData.append(record)

        dataForRakuten['data'] = sortedData

        return dataForRakuten
Example #5
def convert2():
    pos_tc_path = path.join(DIR, "pos_tc.txt")
    neg_tc_path = path.join(DIR, "neg_tc.txt")

    with open(POS) as pos, \
            open(NEG) as neg, \
            open(pos_tc_path, "w") as pos_tc, \
            open(neg_tc_path, "w") as neg_tc:
        pos = pos.read()
        neg = neg.read()

        pos_set = pos.split("\n")
        neg_set = neg.split("\n")

        s2hk = OpenCC('s2hk')

        pos_converted_hk = s2hk.convert(pos)
        neg_converted_hk = s2hk.convert(neg)

        s2tw = OpenCC('s2tw')

        pos_converted = s2tw.convert(pos)
        neg_converted = s2tw.convert(neg)

        pos = set(pos_converted_hk.split("\n")).union(
            pos_converted.split("\n"))
        neg = set(neg_converted_hk.split("\n")).union(
            neg_converted.split("\n"))

        pos_tc.write("\n".join(sorted(pos.difference(pos_set))))
        neg_tc.write("\n".join(sorted(neg.difference(neg_set))))
Example #6
class Emotion(object):
    def __init__(self):
        APP_ID = '10508840'  # your App ID
        API_KEY = 'W9BwLsLvlPQvD9LsfWIBGX28'  # your API Key
        SECRET_KEY = 'd4wSFFDKm0VjGrPZVxWpZyGfAFYuD3AX'  # your Secret Key
        self.db = Mysql_DB()
        self.aip = AipNlp(APP_ID, API_KEY, SECRET_KEY)
        self.trans = OpenCC('t2s')  # mode: Traditional to Simplified

    def Get_Sentence(self):
        sql = "select id, Comment_Content from comment where over = 'YYYY' limit " + str(100)
        try:
            Sentence_list = self.db.Query_MySQL(sql)  # read the database, fetch this batch of rows
            for i in Sentence_list:  # run the YY-marking update; decide later what to use as the baseline
                self.update_db(i[0])
            return Sentence_list
        except Exception as e:
            print ('query_db函数执行错误' + str(e))

    def update_db(self, i):
        changeY_sql = "update comment set over = 'YY' where id = " + str(i)
        try:
            self.db.Insert_MySQL(changeY_sql)
        except Exception as e:
            print ('改变YY错误' + str(e))

    def Get_Analyse(self):
        sentence_list = self.Get_Sentence()
        r = re.compile(ur"[\u0000-\u4dff,\u9fa6-\uffff]")  # strip everything except Chinese characters
        for i in sentence_list:
            try:
                simple = self.trans.convert(i[1])
                #print i[1].strip().encode('utf-8', 'ignore')
                result = self.aip.sentimentClassify(simple.strip().encode('utf-8', 'ignore'))
                #print result
                '''print result['items'][0]['positive_prob'] # probability of the positive class
                print result['items'][0]['confidence'] # classification confidence
                print result['items'][0]['negative_prob'] # probability of the negative class
                print result['items'][0]['sentiment'] # sentiment polarity: 0 negative, 1 neutral, 2 positive'''
                s = str(result['items'][0]['sentiment'])
                p = str(result['items'][0]['positive_prob'])
                n = str(result['items'][0]['negative_prob'])
                c = str(result['items'][0]['confidence'])
                sql = "update comment set sentiment = %s, positive_prob = %s, negative_prob = %s, confidence = %s"%(s, p, n, c) + " where id = " + str(i[0])
                self.db.Insert_MySQL(sql)
            except Exception as e:
                print('辣鸡百度转码又TM错误了,看老子的' + str(e))
                try:
                    simple = self.trans.convert(i[1])
                    re_s = r.sub(',', simple)
                    result = self.aip.sentimentClassify(re_s.strip().encode('utf-8', 'ignore'))
                    s = str(result['items'][0]['sentiment'])
                    p = str(result['items'][0]['positive_prob'])
                    n = str(result['items'][0]['negative_prob'])
                    c = str(result['items'][0]['confidence'])
                    sql = "update comment set sentiment = %s, positive_prob = %s, negative_prob = %s, confidence = %s"%(s, p, n, c) + " where id = " + str(i[0])
                    self.db.Insert_MySQL(sql)
                except Exception as e:
                    print ('草,老子没辙了' + str(e))
Example #7
def weibo_five_mil_save_to_db(file):
    print("Starting...")
    openCC = OpenCC('s2t')
    hashtag_regex = re.compile(r"#(\w+)#")
    clean_regex = re.compile(r"[(\s)|(\u200b)]+")
    html_regex = re.compile(r'(www|http)\S+', flags=re.MULTILINE)
    null_regex = re.compile(r'\x00')
    with open(file) as fp:
        for i in range(1820000):
            fp.readline()  # skip headers
        for idx, line in enumerate(fp, 1820000):
            if idx % 10000 == 0:
                print(f"{idx} posts...")
            post = line.split("\t")
            try:
                weibo_id = int(post[0])
                if WeiboFiveMilPost.objects.filter(weibo_id=weibo_id).exists():
                    continue
                else:
                    try:
                        attitudes_count = int(post[1])
                        comments_count = int(post[3])
                        created_at = post[4]
                        _id = int(post[7])
                        content_raw = post[18]
                        content_raw = null_regex.sub("", content_raw)
                        cn_content_clean = clean_regex.sub("", content_raw)
                        cn_content_clean = html_regex.sub("LINK", cn_content_clean)
                        cn_content_clean_seg = list(jieba.cut(cn_content_clean))
                        tw_content_clean = openCC.convert(cn_content_clean)
                        tw_content_clean_seg = [openCC.convert(c) for c in cn_content_clean_seg]
                        cn_tags = hashtag_regex.findall(cn_content_clean)
                        if cn_tags:
                            tw_tags = [openCC.convert(t) for t in cn_tags]
                        else:
                            tw_tags = []
                        reposts_count = int(post[16])
                        source = post[17]
                        WeiboFiveMilPost(
                            weibo_id=weibo_id,
                            attitudes_count=attitudes_count,
                            comments_count=comments_count,
                            created_at=created_at,
                            _id=_id,
                            content_raw=content_raw,
                            cn_content_clean=cn_content_clean,
                            cn_content_clean_seg=cn_content_clean_seg,
                            tw_content_clean=tw_content_clean,
                            tw_content_clean_seg=tw_content_clean_seg,
                            cn_tags=cn_tags,
                            tw_tags=tw_tags,
                            source=source,
                            reposts_count=reposts_count
                        ).save()
                    except ValueError as err:
                        print(err)
                        continue
            except ValueError as err:
                print(err)
Example #8
    def getOneArticle(cls, response):
        """
        Parse and extract the content of a single article
        """
        cc = OpenCC('s2t')
        article = ArticleItem()

        article_url = response.url
        # cut the novel & article ids out of the URL
        tidRegex = re.compile(r'tid=(\d+)&')
        matchT = tidRegex.search(article_url)
        novel_id = str(int(matchT.group(1)))
        sidRegex = re.compile(r'sid=(\d+)')
        matchS = sidRegex.search(article_url)
        article_id = str(int(matchS.group(1)))
        article["novel_id"] = novel_id
        article["article_id"] = article_id
        article["site"] = response.meta["site_name"].strip()
        article["novel"] = response.meta["novel_name"].strip()
        article["author"] = response.meta["author"].replace("作者:", "").strip()

        article["link"] = article_url

        article["title"] = response.css("h3::text").extract_first()
        whitespacePattern = re.compile(r'\s+')
        article["title"] = re.sub(whitespacePattern, '', article["title"])

        content = response.css("div#bookContent")
        if content.extract_first():
            tags = content.css("div.ad_conetent")
            if tags:
                for tag in tags:
                    htmlNode = tag.root
                    if htmlNode is not None and htmlNode.getparent(
                    ) is not None:
                        htmlNode.getparent().remove(htmlNode)
        article["content"] = content.extract_first()

        # set default timestamps
        tz = pytz.timezone('Asia/Taipei')
        article["created_at"] = datetime.now(tz)
        article["updated_at"] = datetime.now(tz)

        # convert to Traditional
        try:
            title2 = cc.convert(article["title"])
            if title2:
                article["title"] = title2
            author2 = cc.convert(article["author"])
            if author2:
                article["author"] = author2
            content2 = cc.convert(article["content"])
            if content2:
                article["content"] = content2
        except:
            # if conversion fails, keep the original text
            pass

        yield article
Example #9
def test_convert2():
    cc = OpenCC()
    text = '乾坤一擲'
    expect = '乾坤一掷'
    assert cc.convert(text) == expect

    text = '開放中文轉換'
    expect = '开放中文转换'
    assert cc.convert(text) == expect
Example #10
def test_class_convert():
    cc = OpenCC()
    text = '乾坤一擲'
    expect = '乾坤一掷'
    assert cc.convert(text) == expect

    text = '開放中文轉換'
    expect = '开放中文转换'
    assert cc.convert(text) == expect
Example #11
def getkeywords(num):
    # load ptt posts

    with open(filename,encoding = 'utf8') as f:
        posts = json.load(f)
    #     print(posts)
    titles=""
    for post in posts["articles"]:
    #     print(post["article_title"])
        title_tmp = post["article_title"]

    #     title_tmp = pattern.findall(title_tmp)
    #     if( title_tmp[0][1] == " "):
    #         title_tmp[0] = title_tmp[0][1:].lstrip();
    #     print(title_tmp[0])


        try:
            titles += title_tmp.split("] ")[1].replace('"', ' ') + " \n"
        except:
            try:
                titles += title_tmp.split("]")[1].replace('"', ' ') + " \n"
            except:
                titles += title_tmp.replace('"', ' ') + " \n"
    from opencc import OpenCC
    cc = OpenCC('tw2sp')  # convert from Traditional Chinese (Taiwan) to Simplified Chinese, with phrase mapping
    # can also set conversion by calling set_conversion
    # cc.set_conversion('s2tw')
    Simplified = cc.convert(titles)
    # print(Simplified)
    jieba.analyse.set_stop_words('stopwords.txt')
    jieba.add_word('柯文哲')
    jieba.add_word('叶克膜')
    jieba.add_word('黄士修')
    jieba.add_word('林佳龙')
    cc = OpenCC('s2twp')
    result = ''.join(i for i in Simplified if not i.isdigit())  # remove digits
    tags = jieba.analyse.extract_tags(result,
                                      topK=num,
                                      withWeight=True
                                     )
    keywords = []
    for tag, weight in tags:
        keywords.append(cc.convert(tag))
#         print(cc.convert(tag) + "," + str(weight))
    return keywords
Example #12
class Sorter:
    def __init__(self, c: Comment):
        # DTO refactor with a data transfer object; should this inherit super and enable init? Sorter acts as a connector/helper object
        # classmethod / staticmethod
        self.c = c

        # used internally by the methods
        self.cc = OpenCC('t2s')  # Traditional to Simplified
        self.bc = BertClient()  # acquire the BERT server connection
        self.classes_enc = None

    def cosine_sim(self, v1, v2):
        return 1 - spatial.distance.cosine(v1, v2)

    def predict_label(self, v):
        cos_sim = []
        for i, c in enumerate(self.classes_enc):
            cos_sim.append(self.cosine_sim(v, c))
        return argmax(cos_sim)

    def predict_labels(self, vs):
        op = []
        for v in vs:
            op.append(self.predict_label(v))
        return op

    def command_sort(self, Labeles):

        sent = self.c.input_comment
        labels = self.c.ground_truth
        classes = [self.cc.convert(s) for s in Labeles]  # convert labels to Simplified, since the model was trained on Simplified text

        self.classes_enc = self.bc.encode(classes)  # encode the labels into vectors
        print("True Label:", labels)
        print("Predict Label:", self.predict_labels(self.bc.encode(sent)))
        self.c.prediction = self.predict_labels(self.bc.encode(sent))
        return 0

    def single_comment_sort(self, Labeles):

        sent = self.c.input_comment
        labels = self.c.ground_truth
        classes = [self.cc.convert(s) for s in Labeles]  # convert labels to Simplified, since the model was trained on Simplified text

        self.classes_enc = self.bc.encode(classes)  # encode the labels into vectors
        #print("True Label:", labels)
        print("Predict Label:", self.predict_labels(self.bc.encode(sent)))
        self.c.prediction = self.predict_labels(self.bc.encode(sent))
        result = self.c.prediction[0]
        return result

    def print_validate_result(self):

        print("True Label:", self.true_label)
        print("Predict Label:", self.result)
        return 0
Example #13
class OpenCCTest(unittest.TestCase):

    def setUp(self):
        self.openCC = OpenCC()

    def test_hk2s(self):
        self.openCC.set_conversion('hk2s')
        words = '香煙(英語:Cigarette),為煙草製品的一種。滑鼠是一種很常見及常用的電腦輸入設備。'
        self.assertEqual(self.openCC.convert(words), '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入设备。')

    def test_s2hk(self):
        self.openCC.set_conversion('s2hk')
        words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words), '香煙(英語:Cigarette),為煙草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_s2t(self):
        self.openCC.set_conversion('s2t')
        words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words), '香菸(英語:Cigarette),爲菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_s2tw(self):
        self.openCC.set_conversion('s2tw')
        words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words), '香菸(英語:Cigarette),為菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_s2twp(self):
        self.openCC.set_conversion('s2twp')
        words = '香烟(英语:Cigarette),为烟草制品的一种。內存是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words), '香菸(英語:Cigarette),為菸草製品的一種。記憶體是一種很常見及常用的電腦輸入裝置。')

    def test_t2hk(self):
        self.openCC.set_conversion('t2hk')
        words = '香菸(英語:Cigarette),爲菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words), '香煙(英語:Cigarette),為煙草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。')

    def test_t2s(self):
        self.openCC.set_conversion('t2s')
        words = '香菸(英語:Cigarette),爲菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words), '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入装置。')

    def test_t2tw(self):
        self.openCC.set_conversion('t2tw')
        words = '香菸(英語:Cigarette),爲菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。'
        self.assertEqual(self.openCC.convert(words), '香菸(英語:Cigarette),為菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_tw2s(self):
        self.openCC.set_conversion('tw2s')
        words = '香菸(英語:Cigarette),為菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words), '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入装置。')

    def test_tw2sp(self):
        self.openCC.set_conversion('tw2sp')
        words = '香菸(英語:Cigarette),為菸草製品的一種。記憶體是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words), '香烟(英语:Cigarette),为烟草制品的一种。内存是一种很常见及常用的电脑输入设备。')
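
Taken together, the suite above exercises most of the stock OpenCC configurations: s2t/t2s (Simplified <-> Traditional), s2hk/hk2s and t2hk (Hong Kong variants), s2tw/tw2s and t2tw (Taiwan variants), and s2twp/tw2sp, which additionally apply regional phrase mappings (e.g. 内存 <-> 記憶體, as in test_s2twp). A quick side-by-side comparison, as a sketch assuming the constructor-style API used by most examples on this page rather than set_conversion:

from opencc import OpenCC

sample = '鼠标是一种很常见及常用的电脑输入设备。'
for config in ('s2t', 's2tw', 's2twp', 's2hk'):
    print(config, OpenCC(config).convert(sample))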
Example #14
def is_match(regex, s, trans=True):
    # also supports Traditional input
    if s and regex:
        if trans:
            try:
                cc = OpenCC("t2s")
                s = cc.convert(s)
                regex = cc.convert(regex)
            except:
                print(regex, s)
        return re.search(regex, s)
    return None
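
Because is_match() normalizes both the pattern and the subject to Simplified before calling re.search, a Simplified pattern can match Traditional input. A small usage sketch with illustrative strings (not from the original project):

# '我的電腦壞了' is normalized to '我的电脑坏了', so the Simplified
# pattern '电脑' matches; with trans=False it does not.
print(bool(is_match('电脑', '我的電腦壞了')))               # True
print(bool(is_match('电脑', '我的電腦壞了', trans=False)))  # False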
Example #15
def save(info, url_flag, dir_path):
    if url_flag == "TW_T" or url_flag == "TW_Y":
        openCC = OpenCC('tw2sp')
    elif url_flag == "HK_T" or url_flag == "HK_Y":
        openCC = OpenCC('hk2s')
    filename = openCC.convert(info[url_flag]["title"]) + ".txt"
    outfile = open(dir_path + "/" + filename, "w", encoding='utf-8')
    for ss in ["title", "url", "detail"]:
        txt = info[url_flag][ss].strip(" ")
        txt_converted = openCC.convert(txt)
        outfile.write(txt_converted + "\r\n\r\n")
    outfile.close()
Example #16
def preprocessing():
    res = []
    i = 0
    converter = OpenCC('t2s')  # transform into Simplified Chinese
    #nlp = StanfordCoreNLP(r'/home/yuyi/stanford-corenlp-full-2018-02-27',lang='zh')

    wiki = WikiCorpus('/home/yuyi/zhwiki-latest-pages-articles.xml.bz2',
                      lemmatize=False,
                      dictionary=[])  # gensim's WikiCorpus class for processing Wikipedia dumps
    # get_texts yields each article as one line of text, with punctuation stripped
    for text in wiki.get_texts():
        cleaned = ''
        text = ''.join(text)
        for char in text:
            char = converter.convert(char)
            cleaned += char

        if len(cleaned):
            sentence = list(jieba.cut(cleaned))
            res.append(sentence)

        i = i + 1
        if (i % 1000) == 0:
            #if i == 10:
            print "Saved " + str(i) + " articles."
        # break

    with open('wiki_zh.pkl', 'w') as f:
        pickle.dump(res, f)

    print "Finished Saved " + str(i) + " articles."
Example #17
def getShortWord():
    oc = OpenCC(conversion='s2twp')  # "出租车" --> "計程車", with phrase mapping
    out_file_path = os.path.join(traditionalChinese_out_dir, "shortcut.txt")

    word_shortcut = []
    # read single words; file format: word frequency 1 (Traditional)/0 (Chinese) pinyin
    with open(traditionalChineseSinglewordPath, 'r',
              encoding='utf-16') as simplifiedChineseSingleWord_file:
        for line in simplifiedChineseSingleWord_file:
            items = line.strip().split(" ")
            jianti = items[0].strip()
            fanti = oc.convert(jianti)
            # pinyin = items[3].strip()
            begin = line.index(items[3])
            end = line.index("\n")
            pinyin = str(line[begin:end])
            # for i in range(3,len(items)):
            #     pinyin = seq.join(items[i])
            # pinyin.append("".join([items[i]]))
            res_line = pinyin + "\t" + isNotAWord + "\t" + items[
                1] + "\t" + fanti + "\t" + freq2 + "\n"
            word_shortcut.append(res_line)

    print("word size:", str(len(word_shortcut)))
    return word_shortcut
Example #18
 def get(self, request):
     name = request.GET.get('q', '')
     print(name)
     cc = OpenCC('s2t')
     name = cc.convert(name)
     if name is not None:
         queryset1 = Yao.objects.filter(responses__icontains=name)
         queryset2 = Yao.objects.filter(properties__icontains=name)
         # merged=queryset1 + queryset2
         queryset = list(set(list(chain(queryset1, queryset2))))
         for ele in queryset:
             ele.properties = ele.properties.replace('【', '\n\n【').replace(
                 '】', '】\n') + "\n\n\n"
             ele.properties = ele.properties.replace('<li>', '').replace(
                 '</li>', '').replace('<ul>',
                                      '').replace('</ul>',
                                                  '').replace('<p', '')
             ele.responses = ele.responses.replace('<li>', '').replace(
                 '</li>', '').replace('<ul>',
                                      '').replace('</ul>',
                                                  '').replace('<p', '')
             ele.responses = ele.responses.replace('【', '\n\n【').replace(
                 '】', '】\n') + "\n\n\n"
             ele.responses = ele.responses.replace(name,
                                                   '<mg>' + name + '</mg>')
             ele.properties = ele.properties.replace(
                 name, '<mg>' + name + '</mg>')
         return Response({'yaos': queryset})
Example #19
File: __main__.py Project: zxsama/epubcst
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--input', metavar='<file>',
                        help='Read original text from <file>.')
    parser.add_argument('-o', '--output', metavar='<file>',
                        help='Write converted text to <file>.')
    parser.add_argument('-c', '--config', metavar='<conversion>',
                        help='Conversion')
    parser.add_argument('--in-enc', metavar='<encoding>', default='UTF-8',
                        help='Encoding for input')
    parser.add_argument('--out-enc', metavar='<encoding>', default='UTF-8',
                        help='Encoding for output')
    args = parser.parse_args()

    if args.config is None:
        print("Please specify a conversion.", file=sys.stderr)
        return 1

    cc = OpenCC(args.config)

    with io.open(args.input if args.input else 0, encoding=args.in_enc) as f:
        input_str = f.read()
    output_str = cc.convert(input_str)
    with io.open(args.output if args.output else 1, 'w',
              encoding=args.out_enc) as f:
        f.write(output_str)

    return 0
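
Assuming the module is installed as the package entry point (as the file name __main__.py suggests), a typical invocation might look like:

python -m epubcst -i input.txt -o output.txt -c t2s

When -i or -o is omitted, the io.open(..., 0) and io.open(..., 1) fallbacks read from stdin and write to stdout, so the script also works as a pipe filter.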
Example #20
File: route.py Project: wallat/txtconv
def convertFile(srcPath, destPath, progressCallback=None):
	""" Convert the given file into traditional chinese and save to destPath"""
	openCC = OpenCC('s2twp')

	# guess the encoding
	rawdata = open(srcPath, 'rb').read(500)
	result = chardet.detect(rawdata)
	charenc = result['encoding']

	# count the total lines
	totalLines = 0
	with open(srcPath, 'r', encoding=charenc, errors='ignore') as srcf:
		for i, l in enumerate(srcf):
			pass
		totalLines = i+1

	# convert the file content
	prevProgress = 0
	with open(srcPath, 'r', encoding=charenc, errors='ignore') as srcf:
		with open(destPath, 'w') as destf:
			for j, line in enumerate(srcf):
				line = openCC.convert(line)
				destf.write(line)

				# tell outside the converting progress
				if progressCallback:
					currProgress = j/totalLines
					if currProgress-prevProgress>=0.01:
						progressCallback(currProgress)
						prevProgress = currProgress
Example #21
File: s2t.py Project: ARJhe/nlp_tutorial
def main():
    '''
    Convert a Simplified Chinese file to a Traditional Chinese file, line by line.
    :return:
    '''
    if len(sys.argv) != 3:
        print("Usage: python " + sys.argv[0] + " input.txt output.txt")
        exit()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    cc = OpenCC('s2tw')
    output = open(
        sys.argv[2], "w+",
        encoding='utf-8')  # w+: write to file and create if not exist
    logging.info("Start converting!")
    with open(sys.argv[1], "r", encoding='utf-8') as f:
        for line_num, line in enumerate(f, start=1):
            output.write(cc.convert(line))
            if line_num % 10000 == 0:
                logging.info("已處理 %d 行" % line_num)
    output.close()
Example #22
def read_input(filename):
    current_train_x = ""
    current_train_y = ""
    train_x = []
    train_y = []

    with open(filename, "r", encoding="utf-8") as raw_data:
        open_cc = OpenCC("s2t")
        state = None  # parser state; set when the first section-marker line is seen
        for line in raw_data:
            line = line.strip()
            line = open_cc.convert(line)
            if line == "==###=title=###==":
                state = State.SEE_TITLE
                if current_train_x.strip() and current_train_y.strip():
                    train_x.append(current_train_x)
                    train_y.append(current_train_y)
                    current_train_x = ""
                    current_train_y = ""
            elif line == "==###=description=###==":
                state = State.SEE_DESCRIPTION
            elif line == "==###=category=###==":
                state = State.SEE_CATEGORY
            else:
                if state == State.SEE_TITLE:
                    current_train_x = line
                elif state == State.SEE_DESCRIPTION:
                    current_train_x += " " + line
                elif state == State.SEE_CATEGORY:
                    current_train_y = line
        if current_train_x.strip() and current_train_y.strip():
            train_x.append(current_train_x)
            train_y.append(current_train_y)
            current_train_x = ""
            current_train_y = ""
    return train_x, train_y
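
For reference, here is a sketch of the marker-delimited record format that read_input() expects; the record content is illustrative, not from the original dataset:

==###=title=###==
人工智慧入門
==###=description=###==
本書介紹機器學習的基礎概念
==###=category=###==
科技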
Example #23
def zh_t2s(infile, outfile):
    '''Convert the Traditional Chinese text in infile to Simplified Chinese and write it to outfile.'''
    # read the traditional Chinese file
    t_corpus = []
    with open(infile, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.replace('\n', '').replace('\t', '')
            t_corpus.append(line)
    logger.info('read traditional file finished!')

    # convert the t_Chinese to s_Chinese
    cc = OpenCC('t2s')
    s_corpus = []
    for i, line in enumerate(t_corpus):
        if i % 1000 == 0:
            logger.info('convert t2s with the {}/{} line'.format(
                i, len(t_corpus)))
        # s_corpus.append(OpenCC.convert(line))
        s_corpus.append(cc.convert(line))
    logger.info('convert t2s finished!')

    # write the simplified Chinese into the outfile
    with open(outfile, 'w', encoding='utf-8') as f:
        for line in s_corpus:
            f.writelines(line + '\n')
    logger.info('write the simplified file finished!')
Example #24
def strip_wiki_source(wiki_source):
    # Traditional-to-Simplified converter
    convertor = OpenCC('t2s')

    # matches <...> tags
    label_pattern = '<.+>'
    # matches various Chinese and English punctuation
    punc_pattern = '[“”,。()\(\)·《》::\-\"「」‘’??!!,、;]'

    for count, path in enumerate(wiki_source):

        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if line == '\n': continue
                # regex substitutions
                line = re.sub(label_pattern, '', line)
                line = re.sub(punc_pattern, '', line)
                # convert Traditional to Simplified
                simplified_line = convertor.convert(line)

                # append mode, so make sure the file starts out empty
                output_file = open('wiki_stripped.txt', 'a', encoding='utf-8')
                output_file.write(simplified_line)
                output_file.close()

        print("完成{}个文件".format(count))
Example #25
def preprocess_wiki(input_file, output_file):
    # Import input file
    if not os.path.exists(input_file):
        url = 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        logging.info('Download Wiki dump from {}'.format(url))
        wget.download(url)
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary=[])

    # Convert traditional Chinese to simplified Chinese using OpenCC
    cc = OpenCC('t2s')
    # Segment the sentences into words using Jieba paddle mode
    jieba.enable_paddle()

    # Process Wiki text
    logging.info('Start processing Wiki text')
    output = open(output_file, 'w')
    i = 0
    for article in tqdm(wiki.get_texts()):
        raw = ' '.join(article)
        processed = []
        # Remove non-Chinese words
        for token in list(jieba.cut(cc.convert(raw))):
            matched = re.findall(r'[\u4e00-\u9fff]+', token)
            if matched:
                processed.append(matched[0])
        output.write(' '.join(processed) + '\n')
        i += 1
        if (i % 10000 == 0):
            logging.info('Finished processing {} articles'.format(i))
    output.close()
    logging.info('Done')
Example #26
 def article_HK(self, url_flag=''):
     if url_flag == "HK_T":
         dir_path = self.config.get("outputPath") + time.strftime(
             "%Y-%m-%d") + '/' + time.strftime("%Y-%m-%d") + "-香港民报"
     elif url_flag == "HK_Y":
         dir_path = self.config.get("outputPath") + str(
             getYesterday()) + '/' + str(getYesterday()) + "-香港明报"
     if not os.path.exists(dir_path):
         os.makedirs(dir_path)
     for i, url in enumerate(self.articleList_HK[url_flag]):
         try:
             print("HK-" + str(i + 1) + ": " + url)
             tree_HK = lxml.html.fromstring(get_Html(url, js=True, time=3))
             title_HK = tree_HK.cssselect("#blockcontent > hgroup > h1")
             openCC = OpenCC('hk2s')
             filename = openCC.convert(title_HK[0].text_content()) + ".txt"
             if filename in os.listdir(dir_path):
                 print("... 第" + str(i + 1) + "篇文章已存在 ...")
                 continue
             detail_HK_upper = tree_HK.cssselect("#upper > p")
             detail_HK_lower = tree_HK.cssselect("#lower > p")
             self.articleDetail[url_flag]["title"] = title_HK[
                 0].text_content()
             self.articleDetail[url_flag]["url"] = url
             if (title_HK and detail_HK_upper and detail_HK_lower):
                 detail_HK = ""
                 detail_HK += detail_HK_upper[0].text_content() + "\r\n"
                 for j in range(len(detail_HK_lower)):
                     detail_HK += detail_HK_lower[j].text_content() + "\r\n"
                 self.articleDetail[url_flag]["detail"] = detail_HK
             save(self.articleDetail, url_flag, dir_path)
         except Exception as err:
             print(err)
             print("... 第" + str(i + 1) + "篇文章解析失败 ...")
             continue
Example #27
    def convert(self):

        from PyQt5.QtWidgets import QApplication
        count = lineCount(self.fni)

        openCC = OpenCC(self.direction)  # direction of conversion
        fi = open(self.fni, "r", encoding="UTF-8")
        fo = open(self.fno, "w", encoding="UTF-8", newline="\n")

        n = 0
        for line in fi:
            n += 1
            txt = openCC.convert(line)
            fo.write(txt)  # write converted text to output
            #completed = 100 * n / count
            if n % 100 == 0:
                self.window.ui.progressBar.setValue(round(100 * n / count, 0))
                self.window.ui.progressBar.repaint()
                QApplication.processEvents()
            #self.window.update()
        fi.close()
        fo.close()
        self.window.ui.progressBar.setValue(100)
        self.window.ui.progressBar.repaint()
        self.numLineProcessed = n
        return self.numLineProcessed
Example #28
def get_data(page):
    datasets = []
    try:
        text = page.text.encode('iso-8859-1').decode('GBK')
        soup = BeautifulSoup(text, 'lxml')
        posts = soup.find_all(class_='forumbox postbox')
        for post in posts:
            data = {}
            # author id, post count, last post time
            data['uid'] = post.find(class_='author')['href'].split('=')[-1]
            id = post.find(class_='author')['id'][10:]
            data['posttime'] = datetime.strptime(
                post.find(id="postdate" + id).text, "%Y-%m-%d %H:%M")
            # post count provisionally 1; corrected after a later lookup
            data['postcount'] = 1
            datasets.append(data)

            # get the post content
            content = post.find(id='postcontent' + id).text.strip()
            content = re.sub(u'\\[quote\\].*?\\[/quote\\]', '', content)
            content = re.sub(u'\\[b\\].*?\\[/b\\]', '', content)
            content = re.sub(u'\\[img\\].*?\\[/img\\]', '', content)
            content = re.sub(u'\\[url\\].*?\\[/url\\]', '', content)
            content = re.sub(u'\\[size.*?/size\\]', '', content)
            content = re.sub(u'\\[s:.*?\\]', '', content)
            content = re.sub(u'\\[.*?del\\]', '', content)
            content = re.sub(u'\\[.*?list\\]', '', content)
            content = re.sub(u'\\[.*?collapse\\]', '', content)
            if len(content) > 0:
                cc = OpenCC('t2s')
                content = cc.convert(content)
                save_content(content)
    except Exception as e:
        print("出现异常,错误为:%s" % e)
    return datasets
Example #29
def start():
    fp = open(filename, "r", encoding="utf-8")
    soup = BeautifulSoup(fp,"xml")
    ans = soup.find_all("tuv")
    print(ans)
    amountOfData = len(ans)/2
    print(amountOfData)
    temp = {}
    count = 0
    df = pd.DataFrame(columns=["中文", "英文"])
    numOfData = 0
    englishWord = ""
    chineseWord = ""
    for a in ans:
        print(numOfData)
        if numOfData == amountOfData:
            break
        if a.get("xml:lang") == "en":
            temp["英文"] = a.get_text()
            englishWord = a.get_text()
            count = count + 1
        if a.get("xml:lang") == "zh" or a.get("xml:lang") == "zh-tw":
            cc = OpenCC('s2tw')
            text = cc.convert(a.get_text())
            finalword = ""
            inBracket = False
            for letter in text:
                if letter == '(':
                    inBracket = True
                    continue
                elif letter == ")":
                    inBracket = False
                    continue
                if inBracket == False and letter != " ":
                    finalword += letter
                elif inBracket == True:
                    continue
            temp["中文"] = finalword
            chineseWord = finalword
            count = count + 1
        if count == 2:
            count = 0
            if len(chineseWord) == 0 or len(englishWord) == 0:
                temp.clear()
                amountOfData = amountOfData -1
                continue
            if chineseWord[0] == englishWord[0]:
                chineseWord = ""
                englishWord = ""
                amountOfData = amountOfData - 1
                temp.clear()
                continue
            else:
                print(chineseWord)
                print(englishWord)
                df = df.append(temp,ignore_index=True)
                chineseWord = ""
                numOfData = numOfData + 1
    print(df.to_string())
    return df
Example #30
        async def on_message(msg):
            if msg.author == self.bot.user:  # this is to prevent crashing via infinite loops
                return
            msg_content = msg.content
            msg_channel = msg.channel

            cc = OpenCC('s2tw')
            tw_ch = cc.convert(msg_content)  # converted copy; note the sticker lookup below still uses msg_content

            sticker_res = self.sticker_db_operation.get_sticker_random(
                msg_content)
            if sticker_res is not None:
                img_url = sticker_res[0]
                local_save = sticker_res[1]
                is_gif = sticker_res[2]

                if self.save_image_local and local_save != '':
                    img_url = self.sticker_url + 'sticker-image/' + local_save
                """
                #old method
                if is_gif:
                    await msg_channel.send(img_url)
                else:
                    self.com_image_em.set_image(url=img_url)
                    await msg_channel.send(embed=self.com_image_em)
                """
                await msg_channel.send(img_url)

            await self.bot.process_commands(msg)
Example #31
def convert_to_sim(source, target):
    opencc = OpenCC('hk2s')  # Traditional (Hong Kong) to Simplified
    with open(target, 'w') as t:
        with open(source, 'r') as f:
            for line in tqdm(f):
                simple = opencc.convert(line)
                t.write(simple)
Example #32
def dataCleaning(id):
    openCC = OpenCC('tw2s')
    params = {'s': id}
    try:
        r = requests.get(GET_AC_GAMER_URL, timeout=100, params=params)
    except TimeoutError:
        return None
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, features='lxml')
    platform = soup.find('a', {'class': 'link2'})
    if not platform:
        return None
    if platform.text == 'Nintendo Switch ( NS )':
        tw_name = soup.find('h1').text
        cn_name = openCC.convert(str(tw_name))
        jp_name = soup.find_all('h2')[0].text
        eu_name = soup.find_all('h2')[1].text

        names = {
            'tw_name': tw_name,
            'cn_name': cn_name,
            'jp_name': jp_name,
            'eu_name': eu_name
        }
        name_collection.update({'tw_name': tw_name}, names, upsert=True)

    return None
Example #33
File: test.py Project: cute/pyopencc
 def test_unicode_zht2zhs(self):
     c = OpenCC('zht2zhs.ini')
     self.assertEqual(c.convert(u'開放中文轉換'), u'开放中文转换')
     c.close()
Example #34
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
from opencc import OpenCC


if __name__ == '__main__':
    if sys.version_info[0] < 3:
        print('Require Python3 to run')
        sys.exit(0)

    openCC = OpenCC()
    openCC.set_conversion('s2twp')
    # openCC = OpenCC('s2twp')

    words = '鼠标是一种很常見及常用的電腦输入设备,它可以对当前屏幕上的游标进行定位,并通过按键和滚轮装置对游标所经过位置的' \
            '屏幕元素进行操作。鼠标的鼻祖於1968年出现。美国科学家道格拉斯·恩格尔巴特(Douglas Englebart)在加利福尼亚制作了' \
            '第一只鼠标。'

    result = openCC.convert(words)
    print("{} \n\n==> \n\n{}".format(words, result))
Example #35
File: test.py Project: cute/pyopencc
 def test_convert_text(self):
     c = OpenCC('zhs2zht.ini')
     try:
         c.convert(3)
     except TypeError, e:
         self.assertEqual(e.message, 'TypeError: must be string or buffer.')
Example #36
File: test.py Project: cute/pyopencc
 def test_base_zhs2zht(self):
     c = OpenCC('zhs2zht.ini')
     self.assertEqual(c.convert('开放中文转换'), '開放中文轉換')
     c.close()
Example #37
File: librime.py Project: EasyIME/PIME
class RimeStyle:
    font_face = "MingLiu"
    candidate_format = "{0} {1}"
    inline_preedit = "false"
    menu_opencc = None
    font_point = 20
    candidate_per_row = 1
    inline_code = False
    display_tray_icon = False
    candidate_use_cursor = False
    soft_cursor = False
    menu = []
    options = []
    options_states = []
    schemas = []
    uris = []
    session_id = None

    def __init__(self, appname, session_id):
        self.session_id = session_id
        config = RimeConfig()
        if not rime.config_open(appname.encode("UTF-8"), config):
            return
        self.font_face = rimeGetString(config, 'style/font_face')
        self.candidate_format = rimeGetString(config, 'style/candidate_format')
        self.inline_preedit = rimeGetString(config, 'style/inline_preedit')
        menu_opencc_config = rimeGetString(config, 'style/menu_opencc')
        self.menu_opencc = OpenCC(menu_opencc_config) if menu_opencc_config else None
        value = c_int()
        if rime.config_get_int(config, b'style/font_point', value):
            self.font_point = value.value
        if rime.config_get_bool(config, b'style/horizontal', value):
            self.candidate_per_row = 10 if bool(value) else 1
        if rime.config_get_int(config, b'style/candidate_per_row', value):
            self.candidate_per_row = value.value
        if rime.config_get_bool(config, b'style/display_tray_icon', value):
            self.display_tray_icon = bool(value)
        if rime.config_get_bool(config, b'style/candidate_use_cursor', value):
            self.candidate_use_cursor = bool(value)
        if rime.config_get_bool(config, b'style/soft_cursor', value):
            self.soft_cursor = bool(value)
        self.options.clear()
        self.options_states.clear()
        self.uris.clear()
        self.menu = self.config_get_menu(config, b'menu')
        #print("menu", self.menu)
        rime.config_close(config)

    def get_schema(self, commandId):
        if commandId >= ID_SCHEMA:
            return self.schemas[commandId - ID_SCHEMA]

    def get_option(self, commandId):
        if commandId >= ID_OPTION:
            return self.options[commandId - ID_OPTION]

    def get_uri(self, commandId):
        if commandId >= ID_URI:
            return self.uris[commandId - ID_URI]

    def get_schema_list(self):
        schema_list = RimeSchemaList()
        self.schemas = []
        submenu = []
        current_schema = bytes(CHAR_SIZE)
        rime.get_current_schema(self.session_id, current_schema, CHAR_SIZE)
        current_schema_id = current_schema.rstrip(b'\0')
        if rime.get_schema_list(schema_list):
            n = schema_list.size
            for i in range(n):
                schema_id = schema_list.list[i].schema_id
                name = schema_list.list[i].name.decode("UTF-8")
                if self.menu_opencc:
                    name = self.menu_opencc.convert(name)
                self.schemas.append(schema_id)
                d = {'text': name, 'id': ID_SCHEMA + i}
                if schema_id == current_schema_id:
                    d["checked"] = True
                submenu.append(d)
        rime.free_schema_list(schema_list)
        return submenu          

    def config_get_menu(self, config, path):
        menu = []
        iterator = RimeConfigIterator()
        if not rime.config_begin_list(iterator, config, path):
            return
        while rime.config_next(iterator):
            d = {}
            name = rime.config_get_cstring(config, iterator.path + b'/name')
            command = rime.config_get_cstring(config, iterator.path + b'/command')
            uri = rime.config_get_cstring(config, iterator.path + b'/uri')
            text = rime.config_get_cstring(config, iterator.path + b'/text')
            if command:
                d["id"] = commands.get(command.decode("UTF-8"), 0)
                if ID_SCHEMA_LIST == d["id"]:
                    d["submenu"] = self.get_schema_list()
                elif ID_SYNC_DIR == d["id"]:
                    d["enabled"] = os.path.isdir(rime.get_sync_dir().decode(ENC))
            elif uri:
                d["id"] = ID_URI + len(self.uris)
                self.uris.append(uri.decode("UTF-8"))
            elif name:
                states = [rime.config_get_cstring(config, iterator.path + b'/states/@0').decode("UTF-8"),
                          rime.config_get_cstring(config, iterator.path + b'/states/@1').decode("UTF-8")]
                d["id"] = ID_OPTION + len(self.options)
                state_id = rime.get_option(self.session_id, name)
                d["text"] = "%s → %s" % (states[state_id], states[1 - state_id])
                self.options_states.append(states)
                self.options.append(name)
            if text:
                d["text"] = text.decode("UTF-8")
                if self.menu_opencc:
                    d["text"] = self.menu_opencc.convert(d["text"])
                submenu = self.config_get_menu(config, iterator.path + b'/submenu')
                if submenu:
                    d["submenu"] = submenu
            menu.append(d)
        rime.config_end(iterator)
        return menu