def clean_text(self, origin):
    """Clean a collection of tweets, preserving its (possibly nested) structure.

    Each tweet is first demojized so emoji become ``[name]`` placeholder tokens,
    then cleaned with HarvestText (traditional->simplified conversion enabled,
    ``weibo_at=False`` passed through to HarvestText's @-handling).

    Args:
        origin: iterable whose items are either strings (single tweets) or
            lists of strings (grouped tweets).

    Returns:
        A list mirroring the structure of ``origin``, with every string cleaned.
    """
    ht = HarvestText()

    def _clean_one(text):
        # Demojize first so HarvestText sees bracketed emoji tokens it can strip.
        return ht.clean_text(
            emojiswitch.demojize(text, delimiters=("[", "]")),
            t2s=True,
            weibo_at=False,
        )

    cltweets = []
    for twcl in origin:
        # isinstance instead of type(...) == list: also accepts list subclasses.
        if isinstance(twcl, list):
            cltweets.append([_clean_one(etwcl) for etwcl in twcl])
        else:
            cltweets.append(_clean_one(twcl))
    return cltweets
def test_hard_text_cleaning(): ht = HarvestText() # 不可见字符 text1 = "捧杀!干得漂亮![doge] \\u200b\\u200b\\u200b" text2 = ht.clean_text(text1) print("清洗前:", [text1]) print("清洗后:", [text2]) assert text2 == "捧杀!干得漂亮!" text1 = "捧杀!干得漂亮![doge] \u200b\u200b\u200b" text2 = ht.clean_text(text1) assert text2 == "捧杀!干得漂亮!" print("清洗前:", [text1]) print("清洗后:", [text2]) # 两个表情符号中间有内容 text1 = "#缺钱找新浪# 瞎找不良网贷不如用新浪官方借款,不查负债不填联系人。 http://t.cn/A643boyi \n新浪[浪]用户专享福利,[浪]新浪产品用的越久额度越高,借万元日利率最低至0.03%,最长可分12期慢慢还! http://t.cn/A643bojv http://t.cn/A643bKHS \u200b\u200b\u200b" text2 = ht.clean_text(text1) print("清洗前:", [text1]) print("清洗后:", [text2]) assert text2 == "#缺钱找新浪# 瞎找不良网贷不如用新浪官方借款,不查负债不填联系人。\n新浪用户专享福利,新浪产品用的越久额度越高,借万元日利率最低至0.03%,最长可分12期慢慢还!" # 包含emoji text1 = "各位大神们🙏求教一下这是什么动物呀![疑问]\n\n为什么它同时长得有点吓人又有点可爱[允悲]\n\n#thosetiktoks# http://t.cn/A6bXIC44 \u200b\u200b\u200b" text2 = ht.clean_text(text1) print("清洗前:", [text1]) print("清洗后:", [text2]) assert text2 == "各位大神们求教一下这是什么动物呀!\n为什么它同时长得有点吓人又有点可爱\n#thosetiktoks#" text1 = "JJ棋牌数据4.3万。数据链接http://www.jj.cn/,数据第一个账号,第二个密码,95%可登录,可以登录官网查看数据是否准确" text2 = ht.clean_text(text1) assert text2 == "JJ棋牌数据4.3万。数据链接,数据第一个账号,第二个密码,95%可登录,可以登录官网查看数据是否准确"
def clean_text(file, save_dir):
    """Clean the 'content' field of every record in a JSON dataset and save it.

    Each record's content is normalized via HanLP's CharTable, cleaned with
    HarvestText (emoji left untouched: ``emoji=False``), and stripped of URLs.
    Training files drop records that are empty before or after cleaning;
    eval/test files keep every record (id + content only, no label).

    Args:
        file: path to the input JSON file; 'train'/'eval'/'test' in the path
            selects the handling described above.
        save_dir: directory the cleaned file (same basename) is written into.
    """
    ht = HarvestText()
    CharTable = pyhanlp.JClass('com.hankcs.hanlp.dictionary.other.CharTable')
    data = read_json(file)
    num_null = 0
    cleaned_data = []
    for i in trange(len(data)):
        content = CharTable.convert(data[i]['content'])
        # Original note: cleaning also trims up to 6 characters after '@'
        # — presumably HarvestText's @-mention handling; TODO confirm.
        cleaned_content = remove_url(ht.clean_text(content, emoji=False))
        if not cleaned_content:
            num_null += 1
        if 'train' in file and (not content or not cleaned_content):
            # Drop train records that are empty originally or become empty.
            continue
        if 'eval' in file or 'test' in file:
            cleaned_data.append({
                'id': data[i]['id'],
                'content': cleaned_content
            })
        else:
            cleaned_data.append({
                'id': data[i]['id'],
                'content': cleaned_content,
                'label': data[i]['label']
            })
    # os.path.basename handles both '/' and OS-specific separators,
    # unlike the previous file.split('/')[-1].
    filename = os.path.basename(file)
    save_json(cleaned_data, os.path.join(save_dir, filename))
    # Fixed label: num_null counts records whose cleaned content is empty;
    # the old message ('num data: ') mislabeled it as the dataset size.
    print('num null: ', num_null)
def clean_cn_text_by_third_party(self, sentence):
    """Clean Chinese text with the third-party HarvestText library.

    Removes weibo @-mentions and emoticons, URLs, emails and HTML-escape
    artifacts (HarvestText defaults), after first stripping line separators
    and HTML tags locally.

    Args:
        sentence: raw input string.

    Returns:
        The cleaned string.
    """
    from harvesttext import HarvestText
    ht_obj = HarvestText()
    # Drop U+2028 line separators, newlines and tabs before tag removal.
    _text = sentence.replace('\u2028', '').replace('\n', '').replace('\t', '')
    # Raw string: '\w' / '\?' in a plain string are invalid escape sequences
    # (DeprecationWarning, SyntaxWarning on newer Python).
    re_h = re.compile(r'<(/?\w+|!--|!DOCTYPE|\?xml)[^>]*>')
    _text = re_h.sub('', _text)  # strip HTML tags
    # Renamed from 'clean_text' to avoid shadowing same-named functions.
    cleaned = ht_obj.clean_text(_text)
    return cleaned
def clean_text_whole(original_text):
    """Strip bracketed markers, dates and times, then apply HarvestText cleaning.

    Args:
        original_text: raw input string.

    Returns:
        The text with 【…】 tags, YYYY-MM-DD dates, HH:MM:SS and HH:MM times
        removed, followed by HarvestText's general cleaning.
    """
    ht = HarvestText()
    removal_patterns = (
        r'【.*?】',               # full-width bracketed tags
        r'(\d{4}-\d{2}-\d{2})',   # dates
        r'(\d{2}:\d{2}:\d{2})',   # HH:MM:SS times (before the shorter form)
        r'(\d{2}:\d{2})',         # HH:MM times
    )
    stripped = original_text
    for pattern in removal_patterns:
        stripped = re.sub(pattern, '', stripped)
    return ht.clean_text(stripped)
def clean_text():
    """Demonstrate HarvestText's text-cleaning modes on sample weibo strings."""
    print("各种清洗文本")
    ht0 = HarvestText()

    def _show(header, raw, cleaned):
        # Common print pattern: section header, original text, cleaned text.
        print(header)
        print("原:", raw)
        print("清洗后:", cleaned)

    # Default settings already handle weibo @-mentions and emoticon tags.
    sample = "回复@钱旭明QXM:[嘻嘻][嘻嘻] //@钱旭明QXM:杨大哥[good][good]"
    _show("清洗微博【@和表情符等】", sample, ht0.clean_text(sample))

    # URL removal.
    sample = "【#赵薇#:正筹备下一部电影 但不是青春片....http://t.cn/8FLopdQ"
    _show("清洗网址URL", sample, ht0.clean_text(sample, remove_url=True))

    # Email removal.
    sample = "我的邮箱是[email protected],欢迎联系"
    _show("清洗邮箱", sample, ht0.clean_text(sample, email=True))

    # Percent-encoded URL back to readable characters.
    sample = "www.%E4%B8%AD%E6%96%87%20and%20space.com"
    _show("URL转正常字符", sample,
          ht0.clean_text(sample, norm_url=True, remove_url=False))

    # Readable characters to a percent-encoded URL.
    sample = "www.中文 and space.com"
    _show("正常字符转URL[含有中文和空格的request需要注意]", sample,
          ht0.clean_text(sample, to_url=True, remove_url=False))

    # HTML entities back to normal characters.
    sample = "<a c> ''"
    _show("HTML转正常字符", sample, ht0.clean_text(sample, norm_html=True))