Example #1
    # Requires: import json, requests; from opencc import OpenCC
    def crawler(self):
        cc = OpenCC('s2tw')  # Simplified Chinese to Traditional Chinese (Taiwan)
        num = 0
        page = 0  # start from page 0; each next page advances by 100

        while num != self.num_of_news:
            url = 'https://cn.nytimes.com/search/data/'
            params = {
                'query': self.keyword,
                'lang': '',
                'dt': 'json',
                'from': str(page),
                'size': '100'
            }
            response = requests.get(url, params=params)
            news = json.loads(response.text)

            # stop once every matching article has been fetched
            if num >= news['total']:
                print('===== search page has no content =====')
                break
            else:
                # collect each article's headline, publication time, and URL
                for i in news['items']:
                    self.title.append(cc.convert(i['headline']))
                    self.time.append(i['publication_date'].replace(" ", ""))
                    self.url.append(i['web_url_with_host'] +
                                    'zh-hant/')  # display in Traditional Chinese
                    self.content.append(
                        self.get_content(i['web_url_with_host'] + 'zh-hant/'))
                    num += 1
                    print(f'articles downloaded: {num}')
                    if num == self.num_of_news:
                        break
                else:
                    page += 100  # next page
Example #2
 def article_HK(self, url_flag=''):
     if url_flag == "HK_T":
         dir_path = self.config.get("outputPath") + time.strftime(
             "%Y-%m-%d") + '/' + time.strftime("%Y-%m-%d") + "-香港民报"
     elif url_flag == "HK_Y":
         dir_path = self.config.get("outputPath") + str(
             getYesterday()) + '/' + str(getYesterday()) + "-香港明报"
     else:
         raise ValueError("unknown url_flag: %r" % url_flag)
     if not os.path.exists(dir_path):
         os.makedirs(dir_path)
     openCC = OpenCC('hk2s')  # Hong Kong Traditional to Simplified
     for i, url in enumerate(self.articleList_HK[url_flag]):
         try:
             print("HK-" + str(i + 1) + ": " + url)
             tree_HK = lxml.html.fromstring(get_Html(url, js=True, time=3))
             title_HK = tree_HK.cssselect("#blockcontent > hgroup > h1")
             filename = openCC.convert(title_HK[0].text_content()) + ".txt"
             if filename in os.listdir(dir_path):
                 print("... article %d already exists ..." % (i + 1))
                 continue
             detail_HK_upper = tree_HK.cssselect("#upper > p")
             detail_HK_lower = tree_HK.cssselect("#lower > p")
             self.articleDetail[url_flag]["title"] = title_HK[0].text_content()
             self.articleDetail[url_flag]["url"] = url
             if title_HK and detail_HK_upper and detail_HK_lower:
                 detail_HK = detail_HK_upper[0].text_content() + "\r\n"
                 for paragraph in detail_HK_lower:
                     detail_HK += paragraph.text_content() + "\r\n"
                 self.articleDetail[url_flag]["detail"] = detail_HK
             save(self.articleDetail, url_flag, dir_path)
         except Exception as err:
             print(err)
             print("... failed to parse article %d ..." % (i + 1))
             continue
Example #3
 def get(self, request):
     name = request.GET.get('q', '')
     cc = OpenCC('s2t')
     name = cc.convert(name)
     if name:
         queryset1 = Yao.objects.filter(responses__icontains=name)
         queryset2 = Yao.objects.filter(properties__icontains=name)
         # merge the two querysets and drop duplicates
         queryset = list(set(chain(queryset1, queryset2)))
         for ele in queryset:
             for field in ('properties', 'responses'):
                 text = getattr(ele, field)
                 # strip leftover HTML list/paragraph tags
                 for tag in ('<li>', '</li>', '<ul>', '</ul>', '<p'):
                     text = text.replace(tag, '')
                 # put each 【...】 heading on its own line
                 text = text.replace('【', '\n\n【').replace('】', '】\n') + "\n\n\n"
                 # highlight the search keyword
                 text = text.replace(name, '<mg>' + name + '</mg>')
                 setattr(ele, field, text)
         return Response({'yaos': queryset})
     return Response({'yaos': []})
Example #4
def xml_to_json():
    """
    <step1>
    1. Convert Simplified Chinese to Traditional Chinese
    2. Convert XML to JSON
    3. Convert full-width punctuation to half-width
    """
    openCC = OpenCC('s2t')  # Simplified to Traditional

    tree = ET.parse('./corpus/corpus.xml')
    root = tree.getroot()

    output_list = []
    c = 0
    nothing = 0
    for doc in root.findall('doc'):
        c += 1
        if c % 10000 == 0:
            print('---- progress: %d ----' % c)

        output_dict = {}
        content = doc.find('content').text
        title = doc.find('contenttitle').text
        if content and title:
            output_dict['abstract'] = openCC.convert(_full_to_half(title))
            output_dict['article'] = openCC.convert(_full_to_half(content))
            output_list.append(output_dict)
        else:
            nothing += 1
            if nothing % 1000 == 0:
                print('empty records: %d' % nothing)
    with open('corpus/corpus_1.json', 'w', encoding='utf-8') as wf:
        json.dump(output_list, wf, ensure_ascii=False)  # keep Chinese readable
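The helper _full_to_half is not shown in this example. A minimal sketch of a full-width-to-half-width converter that would fit the call site above (a hypothetical reconstruction, not the original helper; full-width ASCII variants live at U+FF01..U+FF5E and the ideographic space at U+3000):

def _full_to_half(text):
    converted = []
    for ch in text:
        code = ord(ch)
        if code == 0x3000:                # ideographic (full-width) space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:    # full-width ASCII variants
            code -= 0xFEE0
        converted.append(chr(code))
    return ''.join(converted)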
Example #5
File: route.py Project: wallat/txtconv
def convertFile(srcPath, destPath, progressCallback=None):
	""" Convert the given file into Traditional Chinese and save to destPath """
	openCC = OpenCC('s2twp')

	# guess the encoding from the first 500 bytes
	with open(srcPath, 'rb') as rawf:
		result = chardet.detect(rawf.read(500))
	charenc = result['encoding']

	# count the total lines
	with open(srcPath, 'r', encoding=charenc, errors='ignore') as srcf:
		totalLines = sum(1 for _ in srcf)

	# convert the file content
	prevProgress = 0
	with open(srcPath, 'r', encoding=charenc, errors='ignore') as srcf:
		with open(destPath, 'w', encoding='utf-8') as destf:
			for j, line in enumerate(srcf):
				destf.write(openCC.convert(line))

				# report conversion progress to the caller
				if progressCallback:
					currProgress = j / totalLines
					if currProgress - prevProgress >= 0.01:
						progressCallback(currProgress)
						prevProgress = currProgress
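A small usage sketch for the function above (the file paths and the callback are hypothetical):

def print_progress(progress):
	print('converted {:.0%}'.format(progress))

convertFile('novel_cn.txt', 'novel_tw.txt', progressCallback=print_progress)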
Example #6
    def convert(self):

        from PyQt5.QtWidgets import QApplication
        count = lineCount(self.fni)  # lineCount is a helper defined elsewhere

        openCC = OpenCC(self.direction)  # direction of conversion, e.g. 's2t'
        n = 0
        with open(self.fni, "r", encoding="UTF-8") as fi, \
             open(self.fno, "w", encoding="UTF-8", newline="\n") as fo:
            for line in fi:
                n += 1
                fo.write(openCC.convert(line))  # write converted text to output
                if n % 100 == 0:
                    # update the progress bar every 100 lines
                    self.window.ui.progressBar.setValue(int(100 * n / count))
                    self.window.ui.progressBar.repaint()
                    QApplication.processEvents()
        self.window.ui.progressBar.setValue(100)
        self.window.ui.progressBar.repaint()
        self.numLineProcessed = n
        return self.numLineProcessed
Example #7
def getShortWord():
    oc = OpenCC(conversion='s2twp')  # e.g. "出租车" --> "計程車", with phrase conversion
    out_file_path = os.path.join(traditionalChinese_out_dir, "shortcut.txt")

    word_shortcut = []
    # read single words; file format: word frequency 1(traditional)/0(simplified) pinyin
    with open(traditionalChineseSinglewordPath, 'r',
              encoding='utf-16') as simplifiedChineseSingleWord_file:
        for line in simplifiedChineseSingleWord_file:
            items = line.strip().split(" ")
            jianti = items[0].strip()   # the simplified form
            fanti = oc.convert(jianti)  # the traditional form
            # everything from the fourth field onward is the pinyin
            begin = line.index(items[3])
            pinyin = line[begin:].rstrip("\n")
            # isNotAWord and freq2 are module-level constants
            res_line = "\t".join([pinyin, isNotAWord, items[1], fanti, freq2]) + "\n"
            word_shortcut.append(res_line)

    print("word size:", str(len(word_shortcut)))
    return word_shortcut
Example #8
def strip_wiki_source(wiki_source):
    # Traditional-to-Simplified converter
    convertor = OpenCC('t2s')

    # match <...> markup tags (non-greedy, so one tag at a time)
    label_pattern = '<.+?>'
    # match common Chinese and English punctuation
    punc_pattern = '[“”,。()\(\)·《》::\-\"「」‘’??!!,、;]'

    # append mode, so the output file must start out empty
    output_file = open('wiki_stripped.txt', 'a', encoding='utf-8')
    for count, path in enumerate(wiki_source):
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if line == '\n':
                    continue
                # strip markup and punctuation with regexes
                line = re.sub(label_pattern, '', line)
                line = re.sub(punc_pattern, '', line)
                # convert Traditional to Simplified
                simplified_line = convertor.convert(line)
                output_file.write(simplified_line)
        print("finished {} file(s)".format(count + 1))
    output_file.close()
Example #9
        async def on_message(msg):
            if msg.author == self.bot.user:  # prevent crashing via infinite loops
                return
            msg_content = msg.content
            msg_channel = msg.channel

            cc = OpenCC('s2tw')
            tw_ch = cc.convert(msg_content)  # normalize to Traditional Chinese

            # look up stickers with the Traditional-Chinese form of the message
            sticker_res = self.sticker_db_operation.get_sticker_random(tw_ch)
            if sticker_res is not None:
                img_url = sticker_res[0]
                local_save = sticker_res[1]

                if self.save_image_local and local_save != '':
                    img_url = self.sticker_url + 'sticker-image/' + local_save
                await msg_channel.send(img_url)

            await self.bot.process_commands(msg)
Example #10
def start():
    with open(filename, "r", encoding="utf-8") as fp:
        soup = BeautifulSoup(fp, "xml")
    ans = soup.find_all("tuv")
    amountOfData = len(ans) // 2  # each zh/en pair is one segment
    print(amountOfData)
    cc = OpenCC('s2tw')
    temp = {}
    count = 0
    rows = []
    numOfData = 0
    englishWord = ""
    chineseWord = ""
    for a in ans:
        if numOfData == amountOfData:
            break
        if a.get("xml:lang") == "en":
            temp["英文"] = a.get_text()
            englishWord = a.get_text()
            count = count + 1
        if a.get("xml:lang") == "zh" or a.get("xml:lang") == "zh-tw":
            text = cc.convert(a.get_text())
            # drop any text inside (...) brackets and all spaces
            finalword = ""
            inBracket = False
            for letter in text:
                if letter == '(':
                    inBracket = True
                elif letter == ")":
                    inBracket = False
                elif not inBracket and letter != " ":
                    finalword += letter
            temp["中文"] = finalword
            chineseWord = finalword
            count = count + 1
        if count == 2:
            count = 0
            if len(chineseWord) == 0 or len(englishWord) == 0:
                temp.clear()
                amountOfData = amountOfData - 1
                continue
            if chineseWord[0] == englishWord[0]:
                # first characters match: the "Chinese" side is likely untranslated
                chineseWord = ""
                englishWord = ""
                amountOfData = amountOfData - 1
                temp.clear()
                continue
            else:
                rows.append(dict(temp))
                chineseWord = ""
                numOfData = numOfData + 1
    df = pd.DataFrame(rows, columns=["中文", "英文"])
    print(df.to_string())
    return df
Example #11
File: models.py Project: Cayprol/ovis
    def _search(self,
                args,
                offset=0,
                limit=None,
                order=None,
                count=False,
                access_rights_uid=None):
        """
        :return: a list of record ids or an integer (if count is True)
        """
        available_modules = ['s2t.json', 't2s.json']
        multi_record_ids = []
        for m in available_modules:
            converter = OpenCC(m)
            new_args = []
            for arg in args:
                # only rewrite domain triples whose value is a string
                if len(arg) > 2 and isinstance(arg[2], str):
                    list_arg = list(arg)
                    list_arg[2] = converter.convert(arg[2])
                    new_args.append(tuple(list_arg))
                else:
                    new_args.append(arg)

            record_ids = super(BaseModelExtend, self)._search(
                new_args,
                offset=offset,
                limit=limit,
                order=order,
                count=count,
                access_rights_uid=access_rights_uid)
            if isinstance(record_ids, list):
                multi_record_ids += record_ids
            else:
                return record_ids
        # Don't use list(set(multi_record_ids)); dict.fromkeys preserves the
        # ordering of the list, which matters for dependencies.
        return list(dict.fromkeys(multi_record_ids))
Example #12
File: word2vec.py Project: archfool/nlp
def preprocess_wiki_corpus(path_data_in=None, path_data_out=None):
    if path_data_in is None:
        corpus_path = path_wiki + u'zhwiki-latest-pages-articles.xml.bz2'
    else:
        corpus_path = path_data_in
    if path_data_out is None:
        corpus_processed_path = path_wiki + 'corpus_wiki.txt'
    else:
        corpus_processed_path = path_data_out
    cc = OpenCC('t2s')  # Traditional to Simplified
    count = 0
    with open(corpus_processed_path, 'w',
              encoding='utf-8') as corpus_processed:
        corpus = WikiCorpus(corpus_path, lemmatize=False, dictionary={})
        for doc in corpus.get_texts():
            # convert each sentence to Simplified Chinese, then segment with jieba
            doc_new = [
                ' '.join(jieba.cut(cc.convert(sent), cut_all=False))
                for sent in doc
            ]
            corpus_processed.write(' '.join(doc_new) + "\n")
            count += 1
            if count % 100 == 0:
                logging.warning('Saved ' + str(count) + ' articles')
            # flag_test is a module-level switch that shortens test runs
            if flag_test and count == 200:
                return
Example #13
 def onclick(self):
     # prefix "0" so that an empty input field parses as 0
     year = int("0{}".format(self.yearInput.text()))
     month = int("0{}".format(self.monthInput.text()))
     day = int("0{}".format(self.dayInput.text()))
     hour = int("0{}".format(self.hourInput.text()))
     minutes = int("0{}".format(self.minutesInput.text()))
     second = int("0{}".format(self.secondInput.text()))
     shengNian = int("0{}".format(self.shengNianInput.text()))
     月将 = DiZHiList[self.yueJiang.currentIndex()]
     占时 = DiZHiList[self.zhanShi.currentIndex()]
     昼占 = True
     __占测的事 = self.占测的事Input.text()
     if self.zhouZhan.currentIndex() == 1:
         昼占 = False
     命局 = False
     if self.mingJu.currentIndex() == 1:
         命局 = True
     性别 = self.sex.checkedId()
     cc = OpenCC('s2t')
     if 命局:
         sq = MinGPan(year, month, day, hour, minutes, second, 月将, 占时, 昼占,
                      __占测的事, 性别, shengNian)
     else:
         sq = ShiPan(year, month, day, hour, minutes, second, 月将, 占时, 昼占,
                     __占测的事, 性别, shengNian)
     sqhtml = cc.convert(sq.toHml)
     # undo characters that the s2t conversion over-converts in this context
     sqhtml = sqhtml.replace('後', '后')
     sqhtml = sqhtml.replace('佔', '占')
     sqhtml = sqhtml.replace('醜', '丑')
     self.textBrowser.setHtml(sqhtml)
     self.shiPan = sq
Example #14
def convert_s2t(str_input):
    """
    Convert Simplified to Traditional Chinese, mapping Mainland terms to Taiwan usage.
    """
    cc = OpenCC('s2twp')  # 's2twp' = Simplified to Traditional (Taiwan) with phrases
    str_output = cc.convert(str_input)
    return str_output
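A quick usage sketch; the phrase-aware 's2twp' config also localizes Mainland vocabulary, as in the "出租车" --> "計程車" example quoted in Example #7 (expected output, assuming the opencc package is installed):

print(convert_s2t('出租车'))  # expected: 計程車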
Example #15
def getStockInfo(stock):
    cc = OpenCC('s2t')
    stockCode = stock.split('.')[0]
    s = '{:05d}'.format(int(stockCode))
    session = HTMLSession()
    session.browser  # touch the property so the headless browser starts up
    url = 'http://stock.finance.sina.com.cn/hkstock/quotes/{0}.html'.format(s)
    r = session.get(url)
    r.html.render()  # execute the page's JavaScript
    name = r.html.find('#stock_cname', first=True)
    price = r.html.find('#mts_stock_hk_price', first=True)
    stockQuan = r.html.find('div.deta03',
                            first=True).find('ul')[1].find('li')[3]
    news = r.html.find('#js_ggzx', first=True).find('a')
    for item in news:
        print(item.text, item.links)
    stock_dict = {
        'stock_name': cc.convert(name.text),
        'stock_code': s,
        'stock_price': price.text,
        'stock_quan': cc.convert(stockQuan.text),
        'news': news
    }
    return stock_dict
Example #16
def preprocessing():
    res = []
    i = 0
    converter = OpenCC('t2s')  # transform into Simplified Chinese

    # gensim's WikiCorpus class handles the Wikipedia dump format
    wiki = WikiCorpus('/home/yuyi/zhwiki-latest-pages-articles.xml.bz2',
                      lemmatize=False,
                      dictionary=[])
    # get_texts yields each article as one tokenized text, punctuation removed
    for text in wiki.get_texts():
        # convert the whole article at once rather than character by character
        cleaned = converter.convert(''.join(text))

        if len(cleaned):
            sentence = list(jieba.cut(cleaned))
            res.append(sentence)

        i = i + 1
        if (i % 1000) == 0:
            print("Saved " + str(i) + " articles.")

    with open('wiki_zh.pkl', 'wb') as f:  # pickle requires a binary-mode file
        pickle.dump(res, f)

    print("Finished saving " + str(i) + " articles.")
Example #17
def preprocess_wiki(input_file, output_file):
    # Import input file
    if not os.path.exists(input_file):
        url = 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        logging.info('Download Wiki dump from {}'.format(url))
        wget.download(url)
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary=[])

    # Convert Traditional Chinese to Simplified Chinese using OpenCC
    cc = OpenCC('t2s')
    # Segment the sentences into words using Jieba paddle mode
    jieba.enable_paddle()

    # Process Wiki text
    logging.info('Start processing Wiki text')
    output = open(output_file, 'w', encoding='utf-8')
    i = 0
    for article in tqdm(wiki.get_texts()):
        raw = ' '.join(article)
        processed = []
        # Remove non-Chinese words
        for token in list(jieba.cut(cc.convert(raw))):
            matched = re.findall(r'[\u4e00-\u9fff]+', token)
            if matched:
                processed.append(matched[0])
        output.write(' '.join(processed) + '\n')
        i += 1
        if (i % 10000 == 0):
            logging.info('Finished processing {} articles'.format(i))
    output.close()
    logging.info('Done')
Example #18
def zh_t2s(infile, outfile):
    '''convert the Traditional Chinese of infile into the Simplified Chinese of outfile'''
    # read the Traditional Chinese file
    t_corpus = []
    with open(infile, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.replace('\n', '').replace('\t', '')
            t_corpus.append(line)
    logger.info('read traditional file finished!')

    # convert Traditional Chinese to Simplified Chinese
    cc = OpenCC('t2s')
    s_corpus = []
    for i, line in enumerate(t_corpus):
        if i % 1000 == 0:
            logger.info('convert t2s with the {}/{} line'.format(
                i, len(t_corpus)))
        s_corpus.append(cc.convert(line))
    logger.info('convert t2s finished!')

    # write the Simplified Chinese into the outfile
    with open(outfile, 'w', encoding='utf-8') as f:
        for line in s_corpus:
            f.write(line + '\n')
    logger.info('write the simplified file finished!')
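A minimal usage sketch (file names are hypothetical; the function assumes a module-level logger is configured):

import logging
from opencc import OpenCC

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

zh_t2s('wiki_traditional.txt', 'wiki_simplified.txt')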
Example #19
 def __init__(self):
     APP_ID = '10508840'  # your App ID
     API_KEY = 'W9BwLsLvlPQvD9LsfWIBGX28'  # your API Key
     SECRET_KEY = 'd4wSFFDKm0VjGrPZVxWpZyGfAFYuD3AX'  # your Secret Key
     self.db = Mysql_DB()
     self.aip = AipNlp(APP_ID, API_KEY, SECRET_KEY)
     self.trans = OpenCC('t2s')  # conversion mode: Traditional to Simplified
Example #20
    def __init__(self, language, save_path):
        assert language.lower() in ["en", "zh"], \
            'WikicorpusTextFormatting is not implemented for language %s yet.' % language

        self.language = language.lower()
        self.download_urls = {
            'en':
            'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
            'zh':
            'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        }
        self.downloaded_files = {
            'en': 'wikicorpus_en.xml.bz2',
            'zh': 'wikicorpus_zh.xml.bz2'
        }
        self.chinese_converter = OpenCC('t2s')  # Traditional-to-Simplified converter
        self.save_path = save_path + '/wikicorpus_' + language
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.formatted_file = os.path.join(self.save_path,
                                           "wiki_formatted.txt")

        self.download()
        self.merge()
Example #21
 def __init__(self, appname, session_id):
     self.session_id = session_id
     config = RimeConfig()
     if not rime.config_open(appname.encode("UTF-8"), config):
         return
     self.font_face = rimeGetString(config, 'style/font_face')
     self.candidate_format = rimeGetString(config, 'style/candidate_format')
     self.inline_preedit = rimeGetString(config, 'style/inline_preedit')
     menu_opencc_config = rimeGetString(config, 'style/menu_opencc')
     self.menu_opencc = OpenCC(menu_opencc_config) if menu_opencc_config else None
     value = c_int()
     if rime.config_get_int(config, b'style/font_point', value):
         self.font_point = value.value
     if rime.config_get_bool(config, b'style/horizontal', value):
         self.candidate_per_row = 10 if bool(value) else 1
     if rime.config_get_int(config, b'style/candidate_per_row', value):
         self.candidate_per_row = value.value
     if rime.config_get_bool(config, b'style/display_tray_icon', value):
         self.display_tray_icon = bool(value)
     if rime.config_get_bool(config, b'style/candidate_use_cursor', value):
         self.candidate_use_cursor = bool(value)
     if rime.config_get_bool(config, b'style/soft_cursor', value):
         self.soft_cursor = bool(value)
     self.options.clear()
     self.options_states.clear()
     self.uris.clear()
     self.menu = self.config_get_menu(config, b'menu')
     rime.config_close(config)
Example #22
def cal_sim(str1, str2, stop_words, w2v, dictionary, new_dictionary, tfidf):
    cc = OpenCC("t2s")
    s1 = jieba.cut(str1, cut_all=False)
    s2 = jieba.cut(str2, cut_all=False)
    # normalize and deduplicate the tokens of s1 and s2
    data1 = pre_step(s1, cc, stop_words)
    data2 = pre_step(s2, cc, stop_words)
    # drop tokens missing from either the dictionary or the word2vec vocabulary
    data1 = [w for w in data1 if w in dictionary.token2id and w in w2v.vocab]
    data2 = [w for w in data2 if w in dictionary.token2id and w in w2v.vocab]
    vec1 = dictionary.doc2bow(data1)  # vec1: (token id, count) pairs
    vec2 = dictionary.doc2bow(data2)

    tf1 = tfidf[vec1]  # tf1: (token id, tf-idf weight) pairs
    tf2 = tfidf[vec2]
    return word2vec(tf1, tf2, data1, data2, new_dictionary, w2v)
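pre_step is not shown in this example. A plausible minimal sketch, based on how it is called above (a hypothetical reconstruction, not the original helper):

def pre_step(tokens, cc, stop_words):
    """Convert each token with OpenCC, drop stop words, deduplicate in order."""
    result = []
    for tok in tokens:
        tok = cc.convert(tok).strip()
        if tok and tok not in stop_words and tok not in result:
            result.append(tok)
    return result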
Example #23
    def sortData(self, tagResults):
        cc = OpenCC('s2t')
        dataForRakuten = {}
        sortedData = []

        for record in tagResults:
            extend_data = []
            customer_info = []
            record_session_id = record['session_id']

            # fetch the asr result
            asr_result = self.getChatRecords(record_session_id)
            if asr_result['status'] != 0:
                return ("Call api: " + const.GET_ASR_RESULT_API +
                        ", got wrong status: " + str(asr_result['status']) +
                        ", message: " + asr_result['message'])
            # round-trip through JSON so every string in the payload is converted;
            # ensure_ascii=False keeps the Chinese characters convertible
            record['asr_result'] = json.loads(
                cc.convert(json.dumps(asr_result['result']['data'],
                                      ensure_ascii=False)))

            log.debug(record['extend_data'])
            for key, value in record['extend_data'].items():
                regex = re.search(r'^\*(.+)', key)
                if regex:
                    extend_data.append({"session_id": record_session_id,
                                        "col_name": cc.convert(regex.group(1)),
                                        "value": cc.convert(value)})
                else:
                    customer_info.append({"session_id": record_session_id,
                                          "col_name": cc.convert(key),
                                          "value": cc.convert(value)})
            record['extend_data'] = extend_data
            record['customer_info'] = customer_info
            sortedData.append(record)

        dataForRakuten['data'] = sortedData

        return dataForRakuten
Example #24
 def __init__(self):
     APP_ID = '10362966'  # your App ID
     API_KEY = 'nQWiWR6DzjXsfYjW1yyVy8TB'  # your API Key
     SECRET_KEY = 'WpjMdNWYv6TSg2psofaGt4LNW366tvnj'  # your Secret Key
     self.db = Mysql_DB()
     self.aip = AipNlp(APP_ID, API_KEY, SECRET_KEY)
     self.trans = OpenCC('t2s')  # conversion mode: Traditional to Simplified
Example #25
File: __main__.py Project: zxsama/epubcst
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--input', metavar='<file>',
                        help='Read original text from <file>.')
    parser.add_argument('-o', '--output', metavar='<file>',
                        help='Write converted text to <file>.')
    parser.add_argument('-c', '--config', metavar='<conversion>',
                        help='Conversion')
    parser.add_argument('--in-enc', metavar='<encoding>', default='UTF-8',
                        help='Encoding for input')
    parser.add_argument('--out-enc', metavar='<encoding>', default='UTF-8',
                        help='Encoding for output')
    args = parser.parse_args()

    if args.config is None:
        print("Please specify a conversion.", file=sys.stderr)
        return 1

    cc = OpenCC(args.config)

    with io.open(args.input if args.input else 0, encoding=args.in_enc) as f:
        input_str = f.read()
    output_str = cc.convert(input_str)
    with io.open(args.output if args.output else 1, 'w',
                 encoding=args.out_enc) as f:
        f.write(output_str)

    return 0
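For comparison, a minimal programmatic equivalent of this command-line flow (the conversion name and file paths are hypothetical):

import io
from opencc import OpenCC

cc = OpenCC('s2t')  # any OpenCC conversion name works here
with io.open('input.txt', encoding='UTF-8') as f:
    output_str = cc.convert(f.read())
with io.open('output.txt', 'w', encoding='UTF-8') as f:
    f.write(output_str)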
Example #26
def read_input(filename):
    current_train_x = ""
    current_train_y = ""
    train_x = []
    train_y = []

    with open(filename, "r", encoding="utf-8") as raw_data:
        open_cc = OpenCC("s2t")
        state = None  # guards against content lines before the first marker
        for line in raw_data:
            line = line.strip()
            line = open_cc.convert(line)
            if line == "==###=title=###==":
                state = State.SEE_TITLE
                if current_train_x.strip() and current_train_y.strip():
                    train_x.append(current_train_x)
                    train_y.append(current_train_y)
                    current_train_x = ""
                    current_train_y = ""
            elif line == "==###=description=###==":
                state = State.SEE_DESCRIPTION
            elif line == "==###=category=###==":
                state = State.SEE_CATEGORY
            else:
                if state == State.SEE_TITLE:
                    current_train_x = line
                elif state == State.SEE_DESCRIPTION:
                    current_train_x += " " + line
                elif state == State.SEE_CATEGORY:
                    current_train_y = line
        if current_train_x.strip() and current_train_y.strip():
            train_x.append(current_train_x)
            train_y.append(current_train_y)
            current_train_x = ""
            current_train_y = ""
    return train_x, train_y
Example #27
def get_data(page):
    datasets = []
    cc = OpenCC('t2s')
    try:
        # the page is mislabelled ISO-8859-1; re-decode it as GBK
        text = page.text.encode('iso-8859-1').decode('GBK')
        soup = BeautifulSoup(text, 'lxml')
        posts = soup.find_all(class_='forumbox postbox')
        for post in posts:
            data = {}
            # author id, post count, and last post time
            data['uid'] = post.find(class_='author')['href'].split('=')[-1]
            post_id = post.find(class_='author')['id'][10:]
            data['posttime'] = datetime.strptime(
                post.find(id="postdate" + post_id).text, "%Y-%m-%d %H:%M")
            # post count defaults to 1 and is corrected after a later lookup
            data['postcount'] = 1
            datasets.append(data)

            # extract the post content and strip BBCode markup
            content = post.find(id='postcontent' + post_id).text.strip()
            content = re.sub(r'\[quote\].*?\[/quote\]', '', content)
            content = re.sub(r'\[b\].*?\[/b\]', '', content)
            content = re.sub(r'\[img\].*?\[/img\]', '', content)
            content = re.sub(r'\[url\].*?\[/url\]', '', content)
            content = re.sub(r'\[size.*?/size\]', '', content)
            content = re.sub(r'\[s:.*?\]', '', content)
            content = re.sub(r'\[.*?del\]', '', content)
            content = re.sub(r'\[.*?list\]', '', content)
            content = re.sub(r'\[.*?collapse\]', '', content)
            if len(content) > 0:
                content = cc.convert(content)
                save_content(content)
    except Exception as e:
        print("exception occurred: %s" % e)
    return datasets
Example #28
File: s2t.py Project: ARJhe/nlp_tutorial
def main():
    '''
    convert a Simplified Chinese file to a Traditional Chinese file line by line
    :return:
    '''
    if len(sys.argv) != 3:
        print("Usage: python " + sys.argv[0] + " input.txt output.txt")
        sys.exit(1)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    cc = OpenCC('s2tw')
    logging.info("Start converting!")
    # w+: write to the file, creating it if it does not exist
    with open(sys.argv[1], "r", encoding='utf-8') as f, \
         open(sys.argv[2], "w+", encoding='utf-8') as output:
        for line_num, line in enumerate(f, start=1):
            output.write(cc.convert(line))
            if line_num % 10000 == 0:
                logging.info("processed %d lines" % line_num)
Example #29
def dataCleaning(game_id):
    openCC = OpenCC('tw2s')
    params = {'s': game_id}
    try:
        r = requests.get(GET_AC_GAMER_URL, timeout=100, params=params)
    except requests.exceptions.Timeout:  # requests raises its own timeout error
        return None
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, features='lxml')
    platform = soup.find('a', {'class': 'link2'})
    if not platform:
        return None
    if platform.text == 'Nintendo Switch ( NS )':
        tw_name = soup.find('h1').text
        cn_name = openCC.convert(tw_name)
        jp_name = soup.find_all('h2')[0].text
        eu_name = soup.find_all('h2')[1].text

        names = {
            'tw_name': tw_name,
            'cn_name': cn_name,
            'jp_name': jp_name,
            'eu_name': eu_name
        }
        # upsert keyed by the Taiwanese name
        name_collection.update_one({'tw_name': tw_name}, {'$set': names},
                                   upsert=True)

    return None
Example #30
    def generate_sentence_vec_avg(self, sentence, zh_type):
        exclude = set(string.punctuation + ',' + '。' + '、' + '「' + '」' + '?' +
                      '!')
        vector = np.zeros(WV_DIIM)
        oov_num = 0

        if zh_type == 's':
            # Simplified input: convert to Simplified, then segment with jieba
            openCC = OpenCC('tw2s')
            sentence = openCC.convert(sentence)
            token_sentence = jieba.lcut(sentence)
            token_sentence = [t for t in token_sentence if t not in exclude]

        elif zh_type == 'tw':
            # Traditional input: segment with the CKIP segmenter
            segmenter = ckip.CkipSegmenter()
            token_sentence = segmenter.seg(sentence)
            token_sentence = [
                t for t in token_sentence.tok if t not in exclude
            ]
        else:
            raise ValueError("zh_type must be 's' or 'tw'")

        for token in token_sentence:
            if token in self.w2v_model.wv.vocab:
                vector += self.w2v_model.wv[token]
            else:
                oov_num += 1
        if token_sentence:  # avoid dividing by zero for all-punctuation input
            vector /= len(token_sentence)
        self.paragraph_vec.append(vector)
        return vector