def main():
    args = get_args()
    vocab = Vocab(args.vocab_path, args.vocab_size)  # create a vocabulary
    hps = get_hps()
    if args.data_path != "":
        batcher = Batcher(args.data_path, vocab, hps, args.single_pass)
        import pdb; pdb.set_trace()  # debugging breakpoint
        x = batcher.next_batch()
        import pdb; pdb.set_trace()  # debugging breakpoint
    else:
        with open(args.json_path) as f:
            art = json.load(f)
        article = neologdn.normalize(art['body'])
        abstract = neologdn.normalize(art['title'])
        m = MeCab('-Owakati')
        parsed_article = m.parse(article)
        abs_words = m.parse(abstract).split()
        ex = B.Example(parsed_article, abs_words, vocab, hps)
        b = B.Batch([ex], hps, vocab)
        import pdb; pdb.set_trace()  # debugging breakpoint
def json_batch(fname, hps, vocab):
    with open(fname) as f:
        art = json.load(f)
    article = neologdn.normalize(art['body'])
    abstract = neologdn.normalize(art['title'])
    m = MeCab('-Owakati')
    parsed_article = m.parse(article)
    abs_words = m.parse(abstract).split()
    ex = B.Example(parsed_article, abs_words, vocab, hps)
    b = B.Batch([ex], hps, vocab)
    return b
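# A minimal usage sketch for json_batch() above. It reuses the helpers that main()
# already calls (get_args, get_hps, Vocab); the only assumption about the JSON file
# is that it has 'body' and 'title' keys, as read above. This mirrors the else-branch
# of main() and is an illustrative sketch, not part of the original source.
args = get_args()
vocab = Vocab(args.vocab_path, args.vocab_size)
hps = get_hps()
single_batch = json_batch(args.json_path, hps, vocab)  # a one-example Batch ready for decoding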
def read_from_csv(path, parse_func):
    lists = []
    with open(path, 'r', encoding='utf-8') as f:
        sources = csv.reader(f, delimiter=',')
        for src, dst in sources:
            src = parse_func(
                neologdn.normalize(
                    src.replace('"', '').replace("~~~", '~')))
            dst = parse_func(
                neologdn.normalize(
                    dst.replace('"', '').replace('~~~', '~')))
            lists += [[' '.join(src), ' '.join(dst)]]
    return lists
def main(self):
    tweets_num = 0
    stopwords = self.Stop_Words()
    df_tweet, tweets = self.Load_tweets()
    # Tokenize the tweets and write them to CSV (mode 'a' appends data, mode 'w' creates a new file)
    with open('./output/' + self.out_file, self.mode) as f:
        for i in tweets:
            tweets_num += 1
            i = neologdn.normalize(i)
            i = re.sub('\n', "", i)
            i = re.sub(r'[!-~]', "", i)   # remove half-width symbols, digits, and letters
            i = re.sub(r'[︰-@]', "", i)  # remove full-width symbols
            i = self.format_text(i)       # remove symbols
            i = re.sub(
                r'[【】●ㅅ●Ф☆✩︎♡→←▼①②③④⑤『』ω《》∠∇∩♪∀◞ཀCщ≧≦ ́◤◢■◆★※↑↓〇◯○◎⇒▽◉Θ♫♬〃“”◇ᄉ⊂⊃д°]',
                "", i)
            # i = re.sub(r'[‥…?!〜「」「」::♪♩『』→↓↑〈〉・゜・´Д´°ω°•ω•★*☆♡()✔Θ∀´∀`˘ω˘‼бωб ̄▽ ̄]', "", i)
            i = self.remove_emoji(i)
            i = self.Tokenizer(i, stopwords)
            i = ' '.join(i)  # convert the list to a string
            i = str(i)
            f.write(i)
    with open('./output/' + self.out_file) as f:
        wakati = f.read()
    print('csv出力完了:' + self.out_file)
    print("学習用ツイート数(判定用ツイート含む/短すぎるツイートは削除):", tweets_num)
    print("[分かち書きサンプル]\n", wakati[:100])
    print()
    return df_tweet, self.similar
def normalize_string(text):
    normalized_text = neologdn.normalize(text).lower()
    replaced_text = re.sub("[!?@「」()、。・()…/_:;\s]", "", normalized_text)
    # replaced_text = re.sub("[!?@「」()、。()…/_:;\d\s]", "", normalized_text)
    # replaced_text = re.sub("[!?@「」()、。()…/_:;\d\sa-zA-Z]", "", normalized_text)
    return replaced_text
def most_words(self):
    nsdtweet = self.api.user_timeline(screen_name="@nsd244", count=200)
    # nsdtext = nsdtweet[0].text
    words = []
    print(len(nsdtweet))
    for status in nsdtweet:
        tex = neologdn.normalize(status.text)                          # normalize
        tex = ''.join(c for c in tex if c not in emoji.UNICODE_EMOJI)  # remove emoji
        tex = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', tex)  # remove URLs
        tex = re.sub(r'(\d)([,.])(\d+)', r'\1\3', tex)                 # remove digit group separators
        tex = re.sub(r'\d+', '0', tex)                                 # replace numbers with 0
        tex = re.sub(r'[!-/:-@[-`{-~]', r' ', tex)                     # replace half-width symbols
        tex = re.sub(u'[■-♯]', ' ', tex)                               # replace full-width symbols (only the block 0x25A0-0x266F here)
        m = MeCab.Tagger(
            "/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")
        for line in m.parse(tex).splitlines()[:-1]:
            surface, feature = line.split('\t')
            if feature.startswith(
                    "名詞"
            ) and ',非自立,' not in feature and surface != "0" and surface != "RT":
                words.append(surface)
    # print(words)
    counter = Counter(words)
    out = []
    for word, cnt in counter.most_common(10):
        out.append("単語:" + word + ", 出現回数:" + str(cnt) + "\n")
    self.api.update_status(status="@nsd244" + "\n".join(map(str, out)),
                           in_reply_to_status_id=nsdtweet[0].id)
def normalize_text(input_text,
                   dictionary_mode='ipadic',
                   new_line_replaced='。',
                   is_replace_eos=True,
                   is_kana=True,
                   is_ascii=True,
                   is_digit=True):
    """* What you can do
    - It converts input-text into normalized-text which is good for tokenizer input.
    * Params
    - new_line_replaced: a string which replaces from \n string.
    """
    # type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type
    if is_replace_eos:
        without_new_line = input_text.replace('\n', new_line_replaced)
    else:
        without_new_line = new_line_replaced
    if dictionary_mode == 'neologd':
        return neologdn.normalize(
            normalize_text_normal_ipadic(without_new_line))
    else:
        return normalize_text_normal_ipadic(without_new_line,
                                            kana=is_kana,
                                            ascii=is_ascii,
                                            digit=is_digit)
def _normalize_hours(self, hours):
    hoursn = hours
    hoursn = hoursn.replace('~', '-')
    hoursn = hoursn.replace('・', '・')
    hoursn = neologdn.normalize(hoursn)
    hoursn = hoursn[:100]
    return hoursn
def normalize(self, word):
    """ Normalize the text. """
    # Strip leading/trailing whitespace
    word = word.strip()
    # Simplify the Japanese delimiters
    word = word.translate(self.tt_seps)
    # Lowercase
    word = word.lower()
    # Convert kanji numerals to Arabic numerals
    word = self.kansuji2arabic(word)
    # NFKC (Normalization Form Compatibility Composition) normalizes
    # half-width katakana, full-width alphanumerics, Roman/circled numerals,
    # variant characters, and so on.
    word = unicodedata.normalize("NFKC", word)
    # Unify alphabets, Arabic numerals, and symbols such as parentheses and
    # exclamation marks to half-width; unify katakana to full-width.
    # Collapse repeated characters, e.g. "うまーーーい!!" -> "うまーい!"
    # (it cannot collapse "やばっっっ!!", though).
    # Passing repeat=1 collapses runs of two or more identical characters to one,
    # but then "Good" would become "God".
    # Symbols such as "〜" are also removed.
    word = neologdn.normalize(word)
    # Pattern-matching conversions after the normalization above
    # Remove URLs
    word = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', word)
    # Remove digit group separators and replace numbers with 0
    word = re.sub(r'(\d)([,.])(\d+)', r'\1\3', word)
    word = re.sub(r'\d+', '0', word)
    # Remove symbols
    word = re.sub(r'[\(\)\<\>\[\]\【\】\《\》\≪\≫\/\#\?\・]', '', word)
    return word
def predict():
    response = {"success": False, "Content-Type": "application/json"}
    if flask.request.method == "POST":
        if flask.request.get_json().get("xs"):
            user_input = flask.request.get_json().get("xs")
            normalized = neologdn.normalize(user_input)
            s = m.parse(normalized).replace('\n', '').strip().split()
            print('xs is ', s)
            xs = []
            for x in s:
                try:
                    xs.append(vocab[x])
                except KeyError:
                    xs.append(random.uniform(0, len(vocab) - 1))
            xs.append(vocab['<eos>'])
            xs = xp.array(xs).astype(xp.int32)
            dummy = [(xs, xp.zeros(1).astype(xp.int32))]
            with chainer.using_config("train", False), chainer.using_config(
                    "enable_backprop", False):
                ys_list = model(dummy)[0]
            ys = []
            for y in ys_list:
                if int(y) == vocab["<eos>"]:
                    break
                ys.append(rvocab[int(y)])
            # classify the input feature
            response["ys"] = ''.join(ys)
            print('ys is ', response["ys"])
            # indicate that the request was a success
            response["success"] = True
    # return the data dictionary as a JSON response
    return flask.jsonify(response)
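# A hedged client-side sketch for the predict() handler above. The route path
# '/predict' and the host/port are assumptions, since the Flask route decorator is
# not shown in this snippet; the handler itself only requires a JSON body with an
# "xs" field.
import requests

resp = requests.post("http://localhost:5000/predict", json={"xs": "こんにちは"})
print(resp.json())  # e.g. {"success": True, "ys": "...", "Content-Type": "application/json"}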
def _preprocess(self, sentence: str) -> str:
    sentence = sentence.replace('●', '')
    s = sentence
    # Reference: https://qiita.com/gacky01/items/26cd642731e3eddde60d
    while s.find("(") != -1:
        start_1 = s.find("(")
        if s.find(")") != -1:
            end_1 = s.find(")")
            if start_1 >= end_1:
                s = s.replace(s[end_1], "")
            else:
                s = s.replace(s[start_1:end_1 + 1], "")
                if len(s) == 0:
                    continue
        else:
            s = s[0:start_1]
    while s.find("【") != -1:
        start_4 = s.find("【")
        if s.find("】") != -1:
            end_4 = s.find("】")
            s = s.replace(s[start_4:end_4 + 1], "")
        else:
            s = s[0:start_4]
    sentence = s
    return neologdn.normalize(re.sub(r'\d+', '0', sentence)).replace("\n", "")
def normalize_string(text):
    """ Strip extraneous symbols and the like from a string. """
    normalized_text = neologdn.normalize(text).lower()
    replaced_text = re.sub("[!?@「」()、。・()…/_:;\s]", "", normalized_text)
    return replaced_text
def normalize_text(input_text,
                   dictionary_mode='ipadic',
                   new_line_replaced='。',
                   is_replace_eos=True,
                   is_kana=True,
                   is_ascii=True,
                   is_digit=True):
    """* What you can do
    - It converts input-text into normalized-text which is good for tokenizer input.
    * Params
    - new_line_replaced: a string which replaces from \n string.
    """
    # type: (str,str,str,bool,bool,bool,bool)->str
    if is_replace_eos:
        without_new_line = input_text.replace('\n', new_line_replaced)
    else:
        without_new_line = new_line_replaced
    if dictionary_mode == 'neologd':
        # this code comes from https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja
        return neologdn.normalize(without_new_line)
    else:
        return normalize_text_normal_ipadic(without_new_line,
                                            kana=is_kana,
                                            ascii=is_ascii,
                                            digit=is_digit)
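# A small usage sketch for normalize_text() above (this variant calls neologdn
# directly in 'neologd' mode, so it only needs `import neologdn` as the snippet
# assumes). The sample sentence is an arbitrary assumption: the newline is first
# replaced with '。', then neologdn removes the spaces between Japanese tokens and
# collapses the long-vowel repetition (cf. the neologdn tests further below).
print(normalize_text('検索 エンジン 自作 入門\nスーパーーーー', dictionary_mode='neologd'))
# expected: 検索エンジン自作入門。スーパー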
def text_clean(text):
    text = text.replace('\u3000', '')
    text = neologdn.normalize(text, repeat=3)
    text = ''.join(['' if c in emoji.UNICODE_EMOJI else c for c in text])
    text = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text)
    text = re.sub(r'\d+', '0', text)
    text = re.sub(r'[!-/:-@[-`{-~]', r'', text)
    text = re.sub(u'[■-♯]', '', text)
    text = regex.sub(r'^(\p{Nd}+\p{Zs})(.*)$', r'\2', text)
    text = text.strip()
    text = text.replace('“', '')
    text = text.replace('…', '')
    text = text.replace('『', '「')
    text = text.replace('』', '」')
    text = text.replace('《', '「')
    text = text.replace('》', '」')
    text = text.replace('〕', ')')
    text = text.replace('〔', '(')
    text = text.replace('〈', '(')
    text = text.replace('〉', ')')
    text = text.replace('→', '')
    text = text.replace(',', '、')
    text = text.replace(',', '、')
    text = text.replace('.', '。')
    text = text.replace('.', '。')
    text = text.replace(' ', '')
    return text
def start_preprocess(path):
    with open(path, mode='r', encoding='utf-8') as f:
        text = f.readlines()
    # preprocess text
    text = [neo.normalize(sentence, repeat=2).lower() for sentence in text]
    text = replacer(text, r'<.+>', '')
    text = replacer(text, r'\d+', '')
    text = [sentence.rstrip() for sentence in text]
    pattern = r'[、。「」〈〉『』【】&*・()$#@。、?!`+¥%:〔〕“”!"#$%&()*+,-./:;<=>?@^_`{|}~]'
    text = replacer(text, pattern, '')
    # tokenize text
    text_str = ' '.join(text)
    tokens = tokenizer(text_str)
    token_list = tokens.split(' ')
    # remove stopwords
    processed_text = remove_stopwords(token_list)
    processed_text = ' '.join(processed_text)
    # save to txt
    save_path = path.replace('text', 'processed_text')
    with open(save_path, mode='w') as f:
        f.write(processed_text)
def normalize_text(input_text,
                   dictionary_mode='ipadic',
                   new_line_replaced='。',
                   is_replace_eos=True,
                   is_kana=True,
                   is_ascii=True,
                   is_digit=True):
    """* What you can do
    - It converts input-text into normalized-text which is good for tokenizer input.
    * Params
    - new_line_replaced: a string which replaces from \n string.
    """
    # type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type
    if is_replace_eos:
        without_new_line = input_text.replace('\n', new_line_replaced)
    else:
        without_new_line = new_line_replaced
    if dictionary_mode == 'neologd' and is_neologdn_valid:
        return neologdn.normalize(
            normalize_text_normal_ipadic(without_new_line))
    elif dictionary_mode == 'neologd' and not is_neologdn_valid:
        raise Exception(
            "You cannot use the neologd dictionary because the neologdn package is not installed.")
    else:
        return normalize_text_normal_ipadic(without_new_line,
                                            kana=is_kana,
                                            ascii=is_ascii,
                                            digit=is_digit)
def split_into_words(text, tokenizer):
    # tokens = tokenizer.tokenize(text)
    normalized_text = neologdn.normalize(text)
    normalized_text = re.sub(r'[!-/:-@[-`{-~]', r' ', normalized_text)
    tokens = [token for token in tokenizer.analyze(normalized_text)]
    ret = []
    for idx in range(len(tokens)):
        token = tokens[idx]
        parts = token.part_of_speech.split(',')  # POS features of the current token
        if idx + 1 == len(tokens):
            if parts[0] == '名詞' and parts[1] != '接尾' and parts[1] != '副詞可能':
                ret.append(token.base_form)
            elif parts[0] == '名詞':
                continue
            else:
                ret.append(token.base_form)
            break
        post_token = tokens[idx + 1]
        post_parts = post_token.part_of_speech.split(',')
        if parts[0] == '名詞':
            if parts[1] == '一般' and post_parts[0] == '名詞' and post_parts[1] == '接尾':
                ret.append(token.base_form + post_token.base_form)
            elif parts[1] == '一般':
                ret.append(token.base_form)
            elif parts[1] == '接尾':
                continue
            elif parts[1] == '副詞可能':
                continue
            else:
                ret.append(token.base_form)
        else:
            ret.append(token.base_form)
    return ret
def searchTeacher(text, bool):
    text = neologdn.normalize(text)  # normalize
    text = text.split(",")[0] if "," in text else text
    text = text.replace(" ", "")
    text = text.replace("C1", "")
    sql_lecture = f"select * from lecture_assessments where subject LIKE '%{text}%'"
    lecture_info = get_dict_resultset(sql_lecture)  # holds the search results
    teacher_info_list = []
    if bool:
        # Build a list of dicts such as:
        # [{'subject': '実践機械学習', 'teacher': '篠原歩', 'difficulty': '仏', 'worth': '', 'comment': 'Pythonに関する授業',
        #   'test': '', 'report': '', 'attendance': ''},
        #  {'subject': '実践機械学習', 'teacher': '篠原歩', 'difficulty': '仏', 'worth': '', 'comment': '機械学習に興味があるけどよく知らないという人にはよさそう',
        #   'test': '', 'report': 'あり', 'attendance': 'あり'}]
        teacher_info_list = [{
            key: value
            for key, value in zip(keys, _lecture_info)
        } for _lecture_info in lecture_info]
    else:
        if lecture_info:
            # A list of teacher names only; converting to a set and back to a list removes duplicates.
            teacher_info_list = list(
                set([_lecture_info[1] for _lecture_info in lecture_info]))
            if teacher_info_list[0] == "":
                # The first element is often empty, so drop it.
                teacher_info_list = teacher_info_list[1:]
    return teacher_info_list
def text_to_ward(self, text):
    m = MeCab.Tagger(r"-d C:\mecab-ipadic-neologd")
    m.parse(" ")
    buff = neologdn.normalize(text)
    m_text = m.parse(buff)
    basic_word = []
    m_text = m_text.split("\n")
    for row in m_text:
        word = row.split("\t")[0]
        if word == "EOS":
            break
        else:
            pos = row.split("\t")[1].split(",")
            parts = pos[0]
            if "記号" in parts:
                if word != "。":
                    continue
                basic_word.append(word)
            elif "助" in parts:
                pass
            elif "形容詞" in parts or "動詞" in parts:
                basic_word.append(pos[6])
            elif "名詞" in parts or "副詞" in parts:
                basic_word.append(word)
    result_word = " ".join(basic_word)
    return result_word
def web_rand(url="", fields={}):
    https = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                ca_certs=certifi.where(),
                                headers={"User-Agent": "Janome_doe"})
    try:
        html = https.request(
            'POST',
            str(url).split("?")[0] + "?" + parse.quote(str(url).split("?")[1], safe="=&-"))
    except:
        print("err")
        return "ERROR:invalid endpoint"
    # Not_secure_filename!
    html = html.data.decode('utf-8').translate(
        str.maketrans("", "", "\"\'\\/<>%`?;"))
    return neologdn.normalize(html).translate(str.maketrans("", "", "_:| ~-#"))
def normalize_text(text: str) -> str:
    """Normalize text.

    :param text: the text to normalize
    :return: the normalized text
    """
    normalized_text = neologdn.normalize(text)
    return str(_normalize_circle_char(string=normalized_text))
def cleansing_text(self, text):
    text = self.cleansing_space(text)
    text = self.cleansing_url(text)
    text = self.cleansing_emoji(text)
    text = self.cleansing_unity(text)
    text = self.cleansing_num(text)
    text = neologdn.normalize(text)
    return text
def normalize(text: str) -> str:
    """ Normalize the text. """
    text = text.replace("\n", " ").strip()            # drop newlines and treat the text as one long line
    text = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text)  # remove digit group separators
    text = re.sub(r'\d+', '0', text)                  # map every number to 0
    text = neologdn.normalize(text)                   # unify full-/half-width characters and collapse repeated characters
    return text
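# A brief usage sketch for normalize() above; the sample string is an assumption.
# The digit group separator in '1,234' is removed first, every number then collapses
# to '0', and neologdn finally drops the space between Japanese characters and
# shortens the repeated long vowel (cf. the neologdn tests further below).
print(normalize("価格は1,234円\nお得ですーーーー"))
# expected: 価格は0円お得ですー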
def preprocessing(self, text):
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\r', '', text)
    text = mojimoji.han_to_zen(text, digit=False, ascii=False)
    text = mojimoji.zen_to_han(text, kana=True)
    text = ''.join(c for c in text if c not in emoji.UNICODE_EMOJI)
    text = neologdn.normalize(text)
    return text
def normalize(text):
    text_without_account = re.sub(r'@[a-zA-Z0-9_]+', '', text)  # remove twitter_account
    text_without_url = re.sub(r'https?://[\w/;:%#\$&\?\(\)~\.=\+\-]+', '',
                              text_without_account)  # remove URL
    # unescape HTML entities after normalization
    text_normalized = neologdn.normalize(text_without_url).replace(
        '&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
    text_without_emoji = ''.join(
        ['' if c in emoji.UNICODE_EMOJI else c for c in text_normalized])
    # tmp = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text_without_emoji)
    # text_replaced_number = re.sub(r'\d+', '0', tmp)
    text_replaced_indention = ' '.join(text_without_emoji.splitlines())
    return text_replaced_indention.lower()
def query(self, word):
    normalized_word = neologdn.normalize(word)
    if word in self.vocab:
        return self.w2v[word]
    elif normalized_word in self.vocab:
        return self.w2v[normalized_word]
    else:
        return self.out_of_vocab_vector(normalized_word)
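# A self-contained sketch of the lookup-with-fallback idea used by query() above.
# The ToyEmbeddings class and its toy vector are assumptions made for illustration;
# only the query() logic is taken from the snippet. The point: a raw surface form
# that misses the vocabulary can still hit it after neologdn normalization.
import neologdn

class ToyEmbeddings:
    def __init__(self, w2v):
        self.w2v = w2v
        self.vocab = set(w2v)

    def out_of_vocab_vector(self, word):
        return [0.0]  # placeholder vector for unseen words

    def query(self, word):
        normalized_word = neologdn.normalize(word)
        if word in self.vocab:
            return self.w2v[word]
        elif normalized_word in self.vocab:
            return self.w2v[normalized_word]
        else:
            return self.out_of_vocab_vector(normalized_word)

emb = ToyEmbeddings({"スーパー": [1.0]})
print(emb.query("スーパーーーー"))  # normalizes to "スーパー" and returns [1.0]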
def preprocessing(text: str) -> str:
    # Remove mentions
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)
    # Remove links
    text = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', text)
    # Remove emoji
    text = ''.join(['' if c in emoji.UNICODE_EMOJI["en"] else c for c in text])
    # Normalize nicely
    text = neologdn.normalize(text)
    return text
def extractWords(text):
    text = removeEmoji(text)
    text = neologdn.normalize(text)
    words = []
    analyzedResults = tagger.parse(text).split("\n")
    for result in analyzedResults:
        splittedWord = result.split(",")[0].split("\t")[0]
        if splittedWord not in stopWords:
            words.append(splittedWord)
    return words
def extract_noun(text: str):
    norm_text = neologdn.normalize(text)
    parsed = parse_text(norm_text)
    noun_df = parsed[
        parsed.type.str.startswith('名詞-一般')
        | parsed['type'].str.startswith('名詞-固有名詞')
        | parsed.type.str.startswith('名詞-サ変接続')
        | parsed.type.str.startswith('名詞-形容動詞語幹')
    ]
    return ' '.join(noun_df.orig.tolist())
def tokenize(self, sentence,
             normalized=True,
             is_feature=False,
             is_surface=False,
             return_list=False,
             func_normalizer=normalize_text):
    """* What you can do
    - Call mecab tokenizer, and return tokenized objects
    """
    # type: (str, bool, bool, bool, bool, Callable[[str], str])->Union[List[str], TokenizedSenetence]
    ### decide normalization function depending on dictType
    if func_normalizer is None and self._dictType == 'neologd':
        normalized_sentence = neologdn.normalize(sentence)
    elif func_normalizer == normalize_text:
        normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
    elif func_normalizer is None:
        normalized_sentence = sentence
    else:
        normalized_sentence = func_normalizer(sentence)

    assert isinstance(sentence, string_types)
    tokenized_objects = []
    # don't delete this variable. encoded_text protects sentence from deleting
    encoded_text = normalized_sentence.encode('utf-8')
    node = self.mecabObj.parseToNode(encoded_text)
    node = node.next
    while node.next is not None:
        word_surface = node.surface.decode('utf-8')
        tuple_pos, word_stem = self.__feature_parser(node.feature.decode('utf-8'),
                                                     word_surface)
        tokenized_obj = TokenizedResult(
            node_obj=node,
            tuple_pos=tuple_pos,
            word_stem=word_stem,
            word_surface=word_surface,
            is_feature=is_feature,
            is_surface=is_surface
        )
        tokenized_objects.append(tokenized_obj)
        node = node.next

    tokenized_sentence = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=tokenized_objects
    )
    if return_list:
        return tokenized_sentence.convert_list_object()
    else:
        return tokenized_sentence
def test_normalize_lengthened(self):
    self.assertEqual(normalize("うまああああああああああああい", repeat=7), "うまあああああああい")
    self.assertEqual(normalize("かわいいいいいるい", repeat=6), "かわいいいいいるい")
def test_normalize(self):
    self.assertEqual(normalize("0"), "0")
    self.assertEqual(normalize("ハンカク"), "ハンカク")
    self.assertEqual(normalize("o₋o"), "o-o")
    self.assertEqual(normalize("majika━"), "majikaー")
    self.assertEqual(normalize("わ〰い"), "わい")
    self.assertEqual(normalize("スーパーーーー"), "スーパー")
    self.assertEqual(normalize("!#"), "!#")
    self.assertEqual(normalize("ゼンカク スペース"), "ゼンカクスペース")
    self.assertEqual(normalize("お お"), "おお")
    self.assertEqual(normalize(" おお"), "おお")
    self.assertEqual(normalize("おお "), "おお")
    self.assertEqual(normalize("検索 エンジン 自作 入門 を 買い ました!!!"),
                     "検索エンジン自作入門を買いました!!!")
    self.assertEqual(normalize("アルゴリズム C"), "アルゴリズムC")
    self.assertEqual(normalize(" PRML 副 読 本 "), "PRML副読本")
    self.assertEqual(normalize("Coding the Matrix"), "Coding the Matrix")
    self.assertEqual(normalize("南アルプスの 天然水 Sparking Lemon レモン一絞り"),
                     "南アルプスの天然水Sparking Lemonレモン一絞り")
    self.assertEqual(normalize("南アルプスの 天然水- Sparking* Lemon+ レモン一絞り"),
                     "南アルプスの天然水- Sparking*Lemon+レモン一絞り")
    self.assertEqual(normalize(u'パパ'), u"パパ")
    self.assertEqual(normalize(u'a˗֊‐‑‒–⁃⁻₋−'), "a-")
    self.assertEqual(normalize(u'あ﹣-ー—―─━ー'), u"あー")
    self.assertEqual(normalize(u'チルダ~∼∾〜〰~'), u"チルダ")
def test_suppress_removal_of_spaces_between_Japanese(self):
    self.assertEqual(normalize('巴 マミ', remove_space=False), '巴 マミ')
def test_normalize(self):
    self.assertEqual(normalize('0'), '0')
    self.assertEqual(normalize('ハンカク'), 'ハンカク')
    self.assertEqual(normalize('o₋o'), 'o-o')
    self.assertEqual(normalize('majika━'), 'majikaー')
    self.assertEqual(normalize('わ〰い'), 'わい')
    self.assertEqual(normalize('スーパーーーー'), 'スーパー')
    self.assertEqual(normalize('!#'), '!#')
    self.assertEqual(normalize('ゼンカク スペース'), 'ゼンカクスペース')
    self.assertEqual(normalize('お お'), 'おお')
    self.assertEqual(normalize(' おお'), 'おお')
    self.assertEqual(normalize('おお '), 'おお')
    self.assertEqual(normalize('検索 エンジン 自作 入門 を 買い ました!!!'),
                     '検索エンジン自作入門を買いました!!!')
    self.assertEqual(normalize('アルゴリズム C'), 'アルゴリズムC')
    self.assertEqual(normalize(' PRML 副 読 本 '), 'PRML副読本')
    self.assertEqual(normalize('Coding the Matrix'), 'Coding the Matrix')
    self.assertEqual(normalize('南アルプスの 天然水 Sparking Lemon レモン一絞り'),
                     '南アルプスの天然水Sparking Lemonレモン一絞り')
    self.assertEqual(normalize('南アルプスの 天然水- Sparking* Lemon+ レモン一絞り'),
                     '南アルプスの天然水- Sparking*Lemon+レモン一絞り')
    self.assertEqual(normalize('パパ'), 'パパ')
    self.assertEqual(normalize('a˗֊‐‑‒–⁃⁻₋−'), 'a-')
    self.assertEqual(normalize('あ﹣-ー—―─━ー'), 'あー')
    self.assertEqual(normalize('チルダ~∼∾〜〰~'), 'チルダ')
    self.assertEqual(normalize('う゛ぽ'), 'ゔぽ')
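# A combined usage sketch of the options exercised by the tests above; the expected
# outputs follow directly from the assertions (repetitions are capped by `repeat`,
# and `remove_space=False` keeps the space between Japanese tokens).
import neologdn

print(neologdn.normalize("スーパーーーー"))                          # スーパー
print(neologdn.normalize("うまああああああああああああい", repeat=7))  # うまあああああああい
print(neologdn.normalize("巴 マミ", remove_space=False))              # 巴 マミ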