import re

from soynlp.normalizer import emoticon_normalize, repeat_normalize


def preprocess(title: str, comment: str):
    # Erase a redundant " at the start and end of the title
    if title.startswith("\""):
        title = title[1:]
    if title.endswith("\""):
        title = title[:-1]
    # Replace curly quotes with straight quotes
    title = title.replace("“", "\"").replace("”", "\"").replace("‘", "'").replace("’", "'")
    # Erase bracketed spans in the title
    braces = r"\[(.*?)\]"
    braces2 = r"\{(.*?)\}"
    braces3 = r"【(.*?)】"
    braces4 = r"<(.*?)>"
    title = re.sub(braces, '', title)
    title = re.sub(braces2, '', title)
    title = re.sub(braces3, '', title)
    title = re.sub(braces4, '', title)
    # Normalize repeated emoticons and characters in the comment
    comment = emoticon_normalize(comment, num_repeats=3)
    comment = repeat_normalize(comment, num_repeats=3)
    return title, comment
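A minimal usage sketch for the function above; the sample strings are illustrative only, and soynlp is assumed to be installed:

title, comment = preprocess('"[속보] 오늘의 뉴스"', '너무 웃겨요 ㅋㅋㅋㅋㅋㅋ')
print(title)    # ' 오늘의 뉴스' — quotes and the bracketed tag are gone, but whitespace is not trimmed
print(comment)  # '너무 웃겨요 ㅋㅋㅋ' — the repeat run is collapsed to three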
import re

from soynlp.normalizer import repeat_normalize


def preprocessing(reviews):
    corpus = []
    for review in reviews:
        # remove e-mail addresses
        review = re.sub(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', '', str(review))
        # remove URLs
        review = re.sub(r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', str(review))
        # remove HTML tags
        review = re.sub(r'<[^>]+>', '', review)
        # remove [...], {...}, and (...) spans
        review = re.sub(r'\[[^\]]+\]', '', review)
        review = re.sub(r'\{[^}]+\}', '', review)
        review = re.sub(r'\([^)]+\)', '', review)
        # keep only Hangul, digits, Latin letters, and basic punctuation
        review = re.sub(r'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-ㅎㅏ-ㅣ가-힣0-9a-zA-Z]+', '', str(review))
        # review = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣0-9a-zA-Z]', ' ', str(review))  # stricter variant: Hangul, digits, and letters only
        review = re.sub(r'\s+', ' ', str(review))   # collapse consecutive whitespace
        review = re.sub(r'^\s+', '', str(review))   # trim leading whitespace
        review = re.sub(r'\s+$', '', str(review))   # trim trailing whitespace
        review = repeat_normalize(str(review))
        corpus.append(review)
    return corpus
def _strpreprocess(self, string):
    """Text preprocessing for modeling (needs to be adapted for Chinese keyword extraction)."""
    # compile basic patterns; these are used directly below rather than the
    # same-named attributes on self, so the function is self-consistent
    url = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )
    email = re.compile(r'([0-9a-zA-Z_]|[^\s^\w])+(@)[a-zA-Z]+\.[a-zA-Z]+')
    # emoji.UNICODE_EMOJI exists in emoji < 2.0; newer releases expose emoji.EMOJI_DATA instead
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())
    zh_pattern = re.compile(
        f'[^ .,?!/@$%~%·∼()。、,《 》“”:0-9a-zA-Z\u4e00-\u9fff{emojis}]+'
    )  # keep symbols, Latin letters, Chinese characters, and emoji
    # process
    if self.html.search(string) is not None:  # strip HTML/JS
        soup = BeautifulSoup(string, "lxml")
        for script in soup(["script", "style"]):
            script.decompose()
        string = soup.get_text()
        string = string.strip()
    string = zh_pattern.sub(' ', string)    # drop all code points except digits, letters, whitespace, emoji, and common punctuation
    string = re.sub('&nbsp;', ' ', string)  # remove non-breaking space entities
    string = repeat_normalize(string, num_repeats=3)  # collapse character repeats
    string = url.sub(' [URL] ', string)      # mask URLs
    string = email.sub(' [EMAIL] ', string)  # mask e-mail addresses
    # TODO: add a preservation list here
    string = re.sub(r'\s+', ' ', string)  # collapse consecutive whitespace into one space
    return string.strip()
from soynlp.normalizer import emoticon_normalize, repeat_normalize


def clean(x):
    # `pattern`, `url_pattern`, and `emoji_pattern` are regexes compiled at module level
    x = pattern.sub(' ', x)       # drop unwanted characters
    x = url_pattern.sub('', x)    # strip URLs
    x = emoji_pattern.sub('', x)  # strip emoji
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    x = emoticon_normalize(x)
    return x
def __tokenize(self, text):
    text = self.regex.sub('', text)
    if not self.__tokenizer_type:
        raise ValueError('Load a tokenizer first')
    if self.__tokenizer_type == 'spm':
        # SentencePiece: strip the subword marker before normalizing
        return [
            repeat_normalize(token.replace("▁", ""))
            for token in self.tokenizer.EncodeAsPieces(text)
        ]
    elif self.__tokenizer_type == 'okt':
        # Okt: keep only content-word POS tags
        tag_list = ['Noun', 'Verb', 'Adjective', 'Adverb']
        return [
            repeat_normalize(token)
            for token, pos in self.tokenizer.pos(text)
            if pos in tag_list
        ]
    else:
        # fall back to noun extraction
        return [
            repeat_normalize(token)
            for token in self.tokenizer.nouns(text)
        ]
import re

import emoji
from soynlp.normalizer import repeat_normalize


def clean(x):
    # emoji.UNICODE_EMOJI exists in emoji < 2.0; newer releases expose emoji.EMOJI_DATA instead
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())
    pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    x = pattern.sub(' ', x)
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x
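A brief check of clean on an illustrative string (again assuming an emoji release that still exposes UNICODE_EMOJI):

print(clean('대박 ㅋㅋㅋㅋㅋ https://example.com 짱!!'))
# the URL is removed and the ㅋ run collapses to 'ㅋㅋ'; interior double spaces
# left behind by the URL removal are kept, since there is no whitespace-collapsing step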
def preprocess_text(self, text):
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())
    pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )
    text = pattern.sub(' ', str(text))
    text = url_pattern.sub('', text)
    text = text.strip()
    text = repeat_normalize(text, num_repeats=2)
    return self.tokenizer(text,
                          max_length=self.args.max_length,
                          truncation=True,
                          return_tensors="pt")
def _postprocess(self, doc: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Post-processing logic."""
    processed_doc = []
    for l_part, r_part in doc:
        # left part
        l_part = repeat_normalize(l_part, num_repeats=3)
        sub_l_part = re.findall(r"[\w]+|[\W]+", l_part)
        if len(sub_l_part) == 2:
            processed_doc += [(sub, 'L') for sub in sub_l_part]
        else:
            processed_doc.append((l_part, 'L'))
        # right part
        if r_part != '':
            r_part = repeat_normalize(r_part, num_repeats=3)
            sub_r_part = re.findall(r"[\w]+|[\W]+", r_part)
            if len(sub_r_part) == 2:
                processed_doc += [(sub, 'R') for sub in sub_r_part]
            else:
                processed_doc.append((r_part, 'R'))
    return processed_doc
import time

# spacing, split_sentences, and spell_checker match the APIs of the
# pykospacing, kss, and py-hanspell packages, respectively
from soynlp.normalizer import repeat_normalize


def process_reviews(reviews):
    processed_reviews = []
    for review in reviews:
        review = repeat_normalize(review, num_repeats=2)  # collapse repeats to two
        review = spacing(review)                          # re-space by words
        review = '.'.join(split_sentences(review))        # split into sentences
        try:
            review = spell_checker.check(review).as_dict()['checked']
        except Exception:
            pass  # keep the unchecked text if the spell checker fails
        processed_reviews.append(review)
        time.sleep(0.5)  # throttle spell-checker requests
    return processed_reviews
def regex_spacing_normalization(data):
    del_filter1 = re.compile(r'[!?,.ㅋㅎㅜㅠ가-힣0-9]+')
    data[SENTENCE_IDX_COLUME_NAME] = 0
    df = pd.DataFrame(columns=[TEXT_REVIEW_COLUMN_NAME, SENTENCE_IDX_COLUME_NAME])
    for idx, item in tqdm(enumerate(data[TEXT_REVIEW_COLUMN_NAME])):
        _tmp = str(item)
        if _tmp == 'nan':
            continue
        # keep only basic punctuation, common jamo, Hangul, and digits
        item = ' '.join(del_filter1.findall(item))
        # item = spacing(item)
        # tmp = spell_checker.check(tmp)
        # tmp = tmp.checked
        # tmp = tmp.replace('구성비', '가성비')
        item = repeat_normalize(item, num_repeats=2)
        # note: DataFrame.append was removed in pandas 2.0; use pd.concat on newer versions
        df = df.append({TEXT_REVIEW_COLUMN_NAME: item,
                        SENTENCE_IDX_COLUME_NAME: idx},
                       ignore_index=True)
    return df
import re
import unicodedata

from bs4 import BeautifulSoup
from soynlp.normalizer import repeat_normalize


def normalize(text: str) -> str:
    # Unicode normalization
    text = unicodedata.normalize('NFKC', text)
    # eradicate HTML and scripts
    html = re.compile("<(\"[^\"]*\"|'[^']*'|[^'\">])*>")
    if html.search(text) is not None:
        soup = BeautifulSoup(text, "lxml")
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text()
    # Chinese-specific preprocessing (preprocessor is defined elsewhere in this module)
    text = preprocessor.normalize_chinese_pattern(text)
    # normalize repeated patterns
    text = repeat_normalize(text, num_repeats=3)
    return text.strip()
def regex_spacing_normalization(data):
    del_filter1 = re.compile(r'[!?,.ㅋㅎㅜㅠ가-힣0-9]+')
    data[SENTENCE_IDX_COLUME_NAME] = 0
    df = pd.DataFrame(
        columns=[TEXT_REVIEW_COLUMN_NAME, SENTENCE_IDX_COLUME_NAME])
    for idx, item in enumerate(data[TEXT_REVIEW_COLUMN_NAME]):
        tmp = str(item)
        if tmp == 'nan':
            continue
        tmp = ' '.join(del_filter1.findall(item))  # keep only punctuation, common jamo, Hangul, digits
        tmp = spacing(tmp)                         # re-space by words
        tmp = repeat_normalize(tmp, num_repeats=2)
        df = df.append(
            {
                TEXT_REVIEW_COLUMN_NAME: tmp,
                SENTENCE_IDX_COLUME_NAME: idx
            },
            ignore_index=True)
    return df
import re

import emoji
from bs4 import BeautifulSoup
from emoji import demojize
from soynlp.normalizer import repeat_normalize


def strpreprocess(string):
    """Text preprocessing for modeling."""
    # compile basic patterns
    html = re.compile(r'<\s*a[^>]*>(.*?)<\s*/\s*a>')
    url = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )
    email = re.compile(r'([0-9a-zA-Z_]|[^\s^\w])+(@)[a-zA-Z]+\.[a-zA-Z]+')
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())
    pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    # process
    if html.search(string) is not None:  # strip HTML/JS
        soup = BeautifulSoup(string, "lxml")
        for script in soup(["script", "style"]):
            script.decompose()
        string = soup.get_text()
        string = string.strip()
    string = pattern.sub(' ', string)
    string = re.sub('&nbsp;', ' ', string)  # remove non-breaking space entities
    string = re.sub('&lt;', '<', string)    # decode remaining HTML entities
    string = re.sub('&gt;', '>', string)
    string = re.sub('&amp;', '&', string)
    string = re.sub('&quot;', '"', string)
    string = repeat_normalize(string, num_repeats=3)    # collapse character repeats
    string = url.sub(' [URL] ', string)                 # mask URLs
    string = email.sub(' [EMAIL] ', string)             # mask e-mail addresses
    string = demojize(string, delimiters=(' :', ': '))  # convert emoji to :name: codes
    string = re.sub(r'@(\w+)', r' ', string)            # drop @mentions
    for ht in re.findall(r'#(\w+)', string):            # add spacing around hashtags
        p = re.compile(f'#{ht}')
        string = p.sub(f' #{ht} ', string)
    # TODO: add a preservation list here
    string = re.sub(r'\s+', ' ', string)  # collapse consecutive whitespace into one space
    return string.strip()
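A sketch of how strpreprocess behaves on a hypothetical HTML-laden review, assuming an emoji release older than 2.0 (where emoji.UNICODE_EMOJI is still available) plus bs4 and soynlp; the input string and expected output are illustrative only:

raw = '<a href="https://example.com">좋아요</a> 진짜 최고!!!!! 😀 문의: contact@example.com'
print(strpreprocess(raw))
# roughly: '좋아요 진짜 최고!!! :grinning_face: 문의: [EMAIL]'
# the anchor tag is stripped, the exclamation run collapses to three,
# the emoji becomes a :name: code, and the address is masked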
# (tail of a clean() function; the script's imports — sys, csv, pandas,
#  and soynlp's normalizers — sit at the top of the original file,
#  alongside the definitions of clean() and only_hangle())
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    x = emoticon_normalize(x)
    return x


file_name = sys.argv[1]
naver_data = pd.read_csv(file_name, sep="\t")
naver_doc = naver_data['document'].values
naver_rating = naver_data['label'].values

sentences = []
ratings = []
for naver_review, naver_rate in zip(naver_doc, naver_rating):
    try:
        naver_review = clean(naver_review)
        naver_review = repeat_normalize(naver_review, num_repeats=2)
        naver_review = emoticon_normalize(naver_review)
        naver_review = only_hangle(naver_review)
        if len(naver_review) >= 4:  # drop reviews shorter than four characters
            sentences.append(naver_review)
            ratings.append(naver_rate)
    except Exception:
        pass

with open(file_name + "preprocessing.tsv", 'w') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for sent, rate in zip(sentences, ratings):
        tsv_writer.writerow([sent, rate])
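A hedged usage note for the script above, with an illustrative NSMC-style file name:

# Hypothetical invocation (the input file name is an assumption):
#   python preprocess_reviews.py ratings_train.txt
# After the run, the output TSV mirrors the kept rows one-to-one:
assert len(sentences) == len(ratings)
print(f"kept {len(sentences)} of {len(naver_doc)} reviews after cleaning")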
def clean(self, text):
    # self.pattern is an iterable of (replacement, compiled_regex) pairs
    for ch, sub_pattern in self.pattern:
        text = sub_pattern.sub(ch, text)
    text = text.strip()
    text = repeat_normalize(text, num_repeats=2)
    return text
def clean_text(x):
    # `pattern` and `urlpattern` are regexes compiled at module level
    x = pattern.sub(' ', x)
    x = urlpattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x
def clean(x):
    x = rex_useless.sub(' ', x)
    x = rex_website.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x