Exemplo n.º 1
0
def test_clean_text():
    """Check that clean_text() yields the expected sentence list for a sample."""
    expected_sentences = ["hi COMMA mr lee went to the park", " let's go"]

    cleaner = TextCleaner("corpse_bride.txt")
    # Overwrite the cleaner's text with a small fixed sample so the
    # expected result is known in advance.
    cleaner.text = "Hi, Mr.Lee -went to the park. Let's go"
    cleaner.clean_text()

    assert cleaner.sentence == expected_sentences
Exemplo n.º 2
0
def cleaned_comments(subreddit):
    """Return the cleaned ``.text`` of each line yielded by ``comments(subreddit)``.

    A single ``TextCleaner`` instance is created and reused for every line.
    """
    cleaner = TextCleaner()
    cleaned = []
    for raw_line in comments(subreddit):
        cleaned.append(cleaner.clean_text(raw_line).text)
    return cleaned
Exemplo n.º 3
0
class LanguageProcessor:
    """
    Contains methods for cleaning or enhancing sentences in
    some languages.

    Language-specific post-processing is currently implemented only for
    ``lang == 'kpv'``: numerals are spelled out, a couple of abbreviations
    are expanded and Latin-only tokens are dropped.
    """
    rxNum = re.compile('^[0-9]+$')
    rxLat = re.compile('^[a-z.-]+$')

    # Number words used by replace_numerals_kpv(). All of them use the
    # Cyrillic letter U+04E7 (never the look-alike Latin U+00F6) so that the
    # produced text stays in a single script.
    _KPV_UNITS = {
        '1': 'ӧтик',
        '2': 'кык',
        '3': 'куим',
        '4': 'нёль',
        '5': 'вит',
        '6': 'квайт',
        '7': 'сизим',
        '8': 'кӧкъямыс',
        '9': 'ӧкмыс',
    }
    _KPV_TENS = {
        '1': 'дас',
        '2': 'кызь',
        '3': 'комын',
        '4': 'нелямын',
        '5': 'ветымын',
        '6': 'квайтымын',
        '7': 'сизимдас',
        '8': 'кӧкъямысдас',
        '9': 'ӧкмысдас',
    }
    _KPV_HUNDRED = 'сё'
    _KPV_THOUSAND = 'сюрс'
    _KPV_NINETEEN_HUNDRED = 'сюрс ӧкмыс сё'

    _KPV_ABBREVIATIONS = {
        'кр': 'коми республика',
        'кг': 'килограмм',
    }

    def __init__(self, lang):
        """
        Store the language code and create a TextCleaner for it.
        """
        self.lang = lang
        self.tc = TextCleaner(lang)

    def clean(self, s):
        """
        Perform basic text cleaning operations.

        Delegates to ``TextCleaner.clean_text`` and returns its result.
        """
        return self.tc.clean_text(s)

    def replace_numerals_kpv(self, s):
        """
        Spell out the number written in ``s`` with kpv number words.

        Supported inputs are 1-9, 10, 20-99, 100, and four-digit numbers
        of the form X0YY or 19YY. Anything else (non-numeric strings, 0,
        11-19, other three- and four-digit numbers, longer numbers) is
        returned unchanged.
        """
        # fullmatch() rather than search(): with search(), '$' also accepts
        # a trailing newline, which would then leak into the result.
        if LanguageProcessor.rxNum.fullmatch(s) is None:
            return s

        units = LanguageProcessor._KPV_UNITS
        tens = LanguageProcessor._KPV_TENS

        if len(s) == 1:
            # '0' has no entry and is left as a digit.
            return units.get(s, s)

        if len(s) == 2:
            if s[1] == '0':
                # Round tens; '00' has no entry and stays as is.
                return tens.get(s[0], s)
            if s[0] in '01':
                # Leading zero or 11-19: not supported, keep the digits.
                return s
            return tens[s[0]] + ' ' + units[s[1]]

        if s == '100':
            return LanguageProcessor._KPV_HUNDRED

        if len(s) == 4:
            return self._replace_four_digits_kpv(s)

        return s

    def _replace_four_digits_kpv(self, s):
        """
        Spell out a four-digit string of the form X0YY or 19YY.

        Other four-digit strings are returned unchanged.
        """
        thousands, hundreds = s[0], s[1]
        if thousands == '0':
            # Zero-padded value: there is no sensible "0 thousand" reading.
            return s

        if hundreds == '0':
            if thousands == '1':
                # One thousand is the bare word, as in the 19YY prefix.
                head = LanguageProcessor._KPV_THOUSAND
            else:
                head = (LanguageProcessor._KPV_UNITS[thousands]
                        + ' ' + LanguageProcessor._KPV_THOUSAND)
        elif s[:2] == '19':
            head = LanguageProcessor._KPV_NINETEEN_HUNDRED
        else:
            return s

        # Drop leading zeros so that e.g. '2005' does not end in '05' and
        # '1900' does not end in '00'.
        rest = s[2:].lstrip('0')
        if not rest:
            return head
        return head + ' ' + self.replace_numerals_kpv(rest)

    def replace_latin(self, s):
        """
        Return an empty string if ``s`` consists only of lowercase Latin
        letters, dots and hyphens; otherwise return ``s`` unchanged.
        """
        if LanguageProcessor.rxLat.search(s) is None:
            return s
        return ''

    def replace_abbr(self, s):
        """
        Expand ``s`` if it is one of the known abbreviations; otherwise
        return it unchanged.
        """
        return LanguageProcessor._KPV_ABBREVIATIONS.get(s, s)

    def process_word(self, s):
        """
        Lowercase and clean ``s``, then apply the kpv-specific replacements
        (numerals, abbreviations, Latin tokens) when ``self.lang`` is 'kpv'.
        """
        # NOTE(review): the replacements below assume clean() returns a
        # str -- confirm against TextCleaner.clean_text.
        s = self.clean(s.lower())
        if self.lang == 'kpv':
            s = self.replace_numerals_kpv(s)
            s = self.replace_abbr(s)
            s = self.replace_latin(s)
        return s