import re

from soynlp.normalizer import emoticon_normalize, repeat_normalize


def preprocess(title: str, comment: str):
    # Strip a stray double quote from the start and end of the title
    if title.startswith("\""):
        title = title[1:]
    if title.endswith("\""):
        title = title[:-1]

    # Replace curly quotes with straight quotes
    title = (title.replace("“", "\"").replace("”", "\"")
             .replace("‘", "\'").replace("’", "\'"))

    # Remove bracketed spans ([...], {...}, 【...】, <...>) from the title
    braces = r"\[(.*?)\]"
    braces2 = r"\{(.*?)\}"
    braces3 = r"【(.*?)】"
    braces4 = r"<(.*?)>"

    title = re.sub(braces, '', title)
    title = re.sub(braces2, '', title)
    title = re.sub(braces3, '', title)
    title = re.sub(braces4, '', title)

    # Normalize the comment
    comment = emoticon_normalize(comment, num_repeats=3)
    comment = repeat_normalize(comment, num_repeats=3)

    return title, comment
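A minimal usage sketch for the function above, assuming soynlp is installed; the sample title and comment are purely illustrative:

title, comment = preprocess('"[속보] “오늘” 맑음"', '좋아요ㅋㅋㅋㅋㅋㅋ')
# the surrounding quotes and the [속보] tag are stripped from the title, curly quotes
# become straight quotes, and the run of ㅋ in the comment is capped at three repeats
print(title, comment)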
Example #2
def preprocessing(reviews):

    corpus = []

    for review in reviews:
        # remove e-mail
        review = re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', '',
                        str(review))
        review = re.sub(r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '',
                        review)  # remove URLs
        review = re.sub(r'<[^>]+>', '', review)  # remove HTML tags
        review = re.sub(r'\[[^\]]+\]', '', review)  # remove [...] spans
        review = re.sub(r'\{[^}]+\}', '', review)  # remove {...} spans
        review = re.sub(r'\([^)]+\)', '', review)  # remove (...) spans

        # remove everything except Hangul, digits, alphabet, and basic punctuation
        review = re.sub(r'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-ㅎㅏ-ㅣ가-힣0-9a-zA-Z]+', '',
                        str(review))
        # review = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣0-9a-zA-Z]', ' ', str(review))  # keep Hangul, digits, alphabet only

        review = re.sub(r'\s+', ' ', str(review))  # collapse whitespace runs
        review = re.sub(r"^\s+", '', str(review))  # remove space from start
        review = re.sub(r'\s+$', '', str(review))  # remove space from the end

        review = repeat_normalize(str(review))

        corpus.append(review)

    return corpus
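A short usage sketch for the function above, assuming soynlp is installed; the sample reviews are illustrative:

import re

from soynlp.normalizer import repeat_normalize

reviews = ["배송 빠릅니다!! 문의는 help@example.com (평일만)",
           "<b>최고</b> ㅋㅋㅋㅋㅋ http://example.com"]
corpus = preprocessing(reviews)
# e-mail addresses, URLs, HTML tags, and bracketed or parenthesised spans are removed,
# repeated characters such as ㅋㅋㅋㅋㅋ are shortened, and whitespace is collapsed
print(corpus)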
Example #3
    def _strpreprocess(self, string):
        """모델링용 텍스트 전처리 (중국어 키워드 추출에 적합하게 변경 필요함.)"""

        # compile basics

        url = re.compile(
            r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
        )
        email = re.compile(r'([0-9a-zA-Z_]|[^\s^\w])+(@)[a-zA-Z]+.[a-zA-Z)]+')
        emojis = ''.join(emoji.UNICODE_EMOJI.keys())
        zh_pattern = re.compile(
            f'[^ .,?!/@$%~%·∼()。、,《 》“”:0-9a-zA-Z\u4e00-\u9fff{emojis}]+'
        )  # keep punctuation, English, Chinese, and emoji

        # process
        if self.html.search(string) is not None:  # strip HTML/JS (self.html is assumed to be compiled in __init__)
            soup = BeautifulSoup(string, "lxml")
            for script in soup(["script", "style"]):
                script.decompose()
            string = soup.get_text()
        string = string.strip()
        string = zh_pattern.sub(
            ' ', string)  # drop everything except digits, letters, whitespace, emoji, and common punctuation
        string = re.sub('&nbsp;', ' ', string)  # remove &nbsp;
        string = repeat_normalize(string, num_repeats=3)  # collapse character repeats
        string = url.sub(' [URL] ', string)  # replace URLs
        string = email.sub(' [EMAIL] ', string)  # replace e-mail addresses
        # TODO: add a preserve-list here.
        string = re.sub(r'\s+', ' ', string)  # collapse consecutive whitespace into one
        return string.strip()
Example #4
def clean(x):
    x = pattern.sub(' ', x)
    x = url_pattern.sub('', x)
    x = emoji_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    x = emoticon_normalize(x)
    return x
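clean above relies on module-level pattern, url_pattern, and emoji_pattern objects that are not shown in this excerpt; a hedged sketch of plausible definitions, modeled on Example #6 below (emoji_pattern in particular is a guess):

import re

import emoji
from soynlp.normalizer import emoticon_normalize, repeat_normalize

# assumed definitions -- modeled on Example #6; emoji_pattern is hypothetical
emojis = ''.join(emoji.UNICODE_EMOJI.keys())  # emoji < 1.0 API, as used elsewhere in these examples
pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+')
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b'
    r'([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
emoji_pattern = re.compile(f'[{emojis}]+')

Newer releases of the emoji package removed UNICODE_EMOJI in favour of EMOJI_DATA, so the emoji list may need adjusting there.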
Example #5
    def __tokenize(self, text):
        text = self.regex.sub('', text)
        if not self.__tokenizer_type:
            raise ValueError('Load a tokenizer first')
        if self.__tokenizer_type == 'spm':
            return [
                repeat_normalize(token.replace("▁", ""))
                for token in self.tokenizer.EncodeAsPieces(text)
            ]
        elif self.__tokenizer_type == 'okt':
            tag_list = ['Noun', 'Verb', 'Adjective', 'Adverb']
            return [
                repeat_normalize(token)
                for token, pos in self.tokenizer.pos(text) if pos in tag_list
            ]
        else:
            return [
                repeat_normalize(token) for token in self.tokenizer.nouns(text)
            ]
Example #6
def clean(x):
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())
    pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    
    x = pattern.sub(' ', x)
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x
Example #7
    def preprocess_text(self, text):
        emojis = ''.join(emoji.UNICODE_EMOJI.keys())
        pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+')
        url_pattern = re.compile(
            r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
        )

        text = pattern.sub(' ', str(text))
        text = url_pattern.sub('', text)
        text = text.strip()
        text = repeat_normalize(text, num_repeats=2)
        return self.tokenizer(text,
                              max_length=self.args.max_length,
                              truncation=True,
                              return_tensors="pt")
Example #8
    def _postprocess(self, doc: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Post-processing logic."""
        processed_doc = []

        for l_part, r_part in doc:

            ## l_part
            l_part = repeat_normalize(l_part, num_repeats=3)
            sub_l_part = re.findall(r"[\w]+|[\W]+", l_part)
            if len(sub_l_part) == 2:
                processed_doc += [(sub, 'L') for sub in sub_l_part]
            else:
                processed_doc.append((l_part, 'L'))

            ## r_part
            if r_part != '':
                r_part = repeat_normalize(r_part, num_repeats=3)
                sub_r_part = re.findall(r"[\w]+|[\W]+", r_part)
                if len(sub_r_part) == 2:
                    processed_doc += [(sub, 'R') for sub in sub_r_part]
                else:
                    processed_doc.append((r_part, 'R'))

        return processed_doc
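In the helper above, each side is split with re.findall(r"[\w]+|[\W]+", ...) into alternating word and non-word runs, and a part is only broken apart when exactly two runs come back; a small illustration of that splitting step (the sample strings are illustrative):

import re

print(re.findall(r"[\w]+|[\W]+", "좋아요!!"))   # ['좋아요', '!!'] -> becomes ('좋아요', 'L'), ('!!', 'L')
print(re.findall(r"[\w]+|[\W]+", "좋.아.요"))   # five runs, so the part is kept whole as ('좋.아.요', 'L')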
Example #9
def process_reviews(reviews):
    processed_reviews = []
    for review in reviews:
        review = repeat_normalize(review,
                                  num_repeats=2)  # normalize repeats by two
        review = spacing(review)  # space by words
        review = '.'.join(split_sentences(review))  # split into sentences, re-join with periods
        try:
            review = spell_checker.check(review).as_dict()['checked']
        except Exception:
            print('pass')
        print(review)
        processed_reviews.append(review)
        time.sleep(0.5)
    return processed_reviews
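Besides soynlp, the loop above relies on external Korean NLP helpers (spacing, split_sentences, spell_checker); a hedged import sketch, assuming pykospacing, kss, and py-hanspell are the intended packages (import paths differ between versions):

import time

from soynlp.normalizer import repeat_normalize
from kss import split_sentences          # sentence splitting
from hanspell import spell_checker       # py-hanspell spell checker (calls an external web service)
from pykospacing import Spacing          # recent pykospacing exposes a class rather than a function
spacing = Spacing()

Since py-hanspell calls an external web service, the time.sleep(0.5) in the loop presumably throttles those requests.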
Example #10
def regex_spacing_normalization(data):
    del_filter1 = re.compile(r'[!?,.ㅋㅎㅜㅠ가-힣0-9]+')
    data[SENTENCE_IDX_COLUME_NAME] = 0
    df = pd.DataFrame(columns = [TEXT_REVIEW_COLUMN_NAME, SENTENCE_IDX_COLUME_NAME])
    for idx, item in tqdm(enumerate(data[TEXT_REVIEW_COLUMN_NAME])):
        _tmp = str(item)
        if _tmp == 'nan':
            continue
        item = ' '.join(del_filter1.findall(item))
        #item = spacing(item)
        #tmp = spell_checker.check(tmp)
        #tmp = tmp.checked
        #tmp = tmp.replace('구성비', '가성비')
        item = repeat_normalize(item, num_repeats=2)

        # note: DataFrame.append was removed in pandas 2.0; use pd.concat on newer versions
        df = df.append({TEXT_REVIEW_COLUMN_NAME:item, SENTENCE_IDX_COLUME_NAME:idx}, ignore_index=True)
    return df
Example #11
def normalize(text: str) -> str:
    # unicode normalization
    text = unicodedata.normalize('NFKC', text)

    # strip HTML markup and scripts
    html = re.compile("<(\"[^\"]*\"|'[^']*'|[^'\">])*>")
    if html.search(text) is not None:
        soup = BeautifulSoup(text, "lxml")
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text()

    # chinese preprocessing format
    text = preprocessor.normalize_chinese_pattern(text)

    # normalize repeated pattern
    text = repeat_normalize(text, num_repeats=3)

    return text.strip()
Example #12
def regex_spacing_normalization(data):
    del_filter1 = re.compile(r'[!?,.ㅋㅎㅜㅠ가-힣0-9]+')
    data[SENTENCE_IDX_COLUME_NAME] = 0
    df = pd.DataFrame(
        columns=[TEXT_REVIEW_COLUMN_NAME, SENTENCE_IDX_COLUME_NAME])
    for idx, item in enumerate(data[TEXT_REVIEW_COLUMN_NAME]):
        tmp = str(item)
        if tmp == 'nan':
            continue
        tmp = ' '.join(del_filter1.findall(item))
        tmp = spacing(tmp)
        tmp = repeat_normalize(tmp, num_repeats=2)

        # note: DataFrame.append was removed in pandas 2.0; use pd.concat on newer versions
        df = df.append(
            {
                TEXT_REVIEW_COLUMN_NAME: tmp,
                SENTENCE_IDX_COLUME_NAME: idx
            },
            ignore_index=True)
    return df
Example #13
def strpreprocess(string):
    """모델링용 텍스트 전처리"""
    # compile basics
    html = re.compile(r'<\s*a[^>]*>(.*?)<\s*/\s*a>')
    url = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )
    email = re.compile(r'([0-9a-zA-Z_]|[^\s^\w])+(@)[a-zA-Z]+.[a-zA-Z)]+')
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())
    pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+')

    # process
    if html.search(string) is not None:  # strip HTML/JS
        soup = BeautifulSoup(string, "lxml")
        for script in soup(["script", "style"]):
            script.decompose()
        string = soup.get_text()
    string = string.strip()
    string = pattern.sub(' ', string)  # keep ASCII, Hangul, emoji, and a few extra marks only
    string = re.sub('&nbsp;', ' ', string)  # remove &nbsp;
    string = re.sub('&lt;', '<', string)  # unescape common HTML entities
    string = re.sub('&gt;', '>', string)
    string = re.sub('&amp;', '&', string)
    string = re.sub('&quot;', '"', string)
    string = repeat_normalize(string, num_repeats=3)  # repeats
    string = url.sub(' [URL] ', string)  # url
    string = email.sub(' [EMAIL] ', string)  # email
    string = demojize(string, delimiters=(' :', ': '))  # convert emoji to ':name:' text
    string = re.sub(r'@(\w+)', r' ', string)  # remove @mentions
    for ht in re.findall(r'#(\w+)', string):  # add spaces around hashtags
        p = re.compile(f'#{ht}')
        string = p.sub(f' #{ht} ', string)
    # TODO: add a preserve-list here.
    string = re.sub(r'\s+', ' ', string)  # collapse consecutive whitespace into one

    return string.strip()
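A quick usage sketch for strpreprocess, assuming emoji, beautifulsoup4 (with lxml), and soynlp are installed; the input string is illustrative:

import re

import emoji
from bs4 import BeautifulSoup
from emoji import demojize
from soynlp.normalizer import repeat_normalize

text = "문의는 foo@bar.com 또는 https://example.com 으로 ㅋㅋㅋㅋㅋ #후기"
print(strpreprocess(text))
# the e-mail and URL become ' [EMAIL] ' / ' [URL] ' tokens, the ㅋ run is capped
# at three repeats, and the hashtag gets surrounding spaces before whitespace is collapsed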
Example #14
def clean(x):
    # (the preceding regex substitutions are omitted in this excerpt)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    x = emoticon_normalize(x)
    return x


import csv
import sys

import pandas as pd
from soynlp.normalizer import emoticon_normalize, only_hangle, repeat_normalize

file_name = sys.argv[1]
naver_data = pd.read_csv(file_name, sep="\t")
naver_doc = naver_data['document'].values
naver_rating = naver_data['label'].values

sentences = []
ratings = []
for naver_review, naver_rate in zip(naver_doc, naver_rating):
    try:
        naver_review = clean(naver_review)
        naver_review = repeat_normalize(naver_review, num_repeats=2)
        naver_review = emoticon_normalize(naver_review)
        naver_review = only_hangle(naver_review)

        if len(naver_review) >= 4:
            sentences.append(naver_review)
            ratings.append(naver_rate)
    except Exception:
        pass

with open(file_name + "preprocessing.tsv", 'w') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for sent, rate in zip(sentences, ratings):
        tsv_writer.writerow([sent, rate])
Example #15
    def clean(self, text):
        for ch, sub_pattern in self.pattern:
            text = sub_pattern.sub(ch, text)
        text = text.strip()
        text = repeat_normalize(text, num_repeats=2)
        return text
Example #16
def clean_text(x):
    x = pattern.sub(' ', x)
    x = urlpattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x
Example #17
def clean(x):
    x = rex_useless.sub(' ', x)
    x = rex_website.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x