Exemplo n.º 1
0
    def _process_text(self, text, **kw):
        """
        Preprocess text.

        The text is always transliterated with ``unidecode`` and
        lower-cased. The remaining steps are enabled by default and can be
        switched off one by one through keyword flags.

        :param text: UTF-8 encoded bytes, or already decoded text.
        :param kw: optional boolean flags ``remove_html``,
            ``remove_punct``, ``remove_digits`` and ``remove_whitespace``
            (each defaults to ``True``).
        :returns: the cleaned text.
        :raises UnicodeDecodeError: if byte input is not valid UTF-8.
        """
        # always unidecode + lower case. Decode first and lower-case last:
        # lower-casing the raw UTF-8 bytes only touches ASCII letters, so an
        # accented capital would be transliterated to an upper-case letter.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        text = unidecode(text).lower()

        # unidecode may hand back a byte string (Python 2); keep returning
        # text in that case, dropping anything that is not ASCII.
        if isinstance(text, bytes):
            text = text.decode('ascii', 'ignore')

        # optionally remove html. This runs before punctuation is blanked
        # out, otherwise the markup delimiters may already be gone
        # (assumes `punct` contains "<" and ">" -- TODO confirm).
        if kw.get('remove_html', True):
            text = html.strip_tags(text)

        # optionally replace punctuation with spaces
        if kw.get('remove_punct', True):
            text = "".join(" " if ch in punct else ch for ch in text)

        # optionally replace digits with spaces
        if kw.get('remove_digits', True):
            text = "".join(" " if ch in digits else ch for ch in text)

        # optionally replace every `re_whitespace` match with one space
        # and trim both ends
        if kw.get('remove_whitespace', True):
            text = re_whitespace.sub(" ", text).strip()

        return text
Exemplo n.º 2
0
    def _process_text(self, text, **kw):
        """
        Preprocess text.

        Transliteration with ``unidecode`` and lower-casing always happen.
        HTML, punctuation, digit and whitespace clean-up are on by default
        and can each be disabled with a keyword flag.

        :param text: UTF-8 encoded bytes, or already decoded text.
        :param kw: optional boolean flags ``remove_html``,
            ``remove_punct``, ``remove_digits`` and ``remove_whitespace``
            (each defaults to ``True``).
        :returns: the cleaned text.
        :raises UnicodeDecodeError: if byte input is not valid UTF-8.
        """
        # always unidecode + lower case. The order matters: calling lower()
        # on undecoded UTF-8 bytes leaves non-ASCII capitals alone, and
        # unidecode would then turn them into upper-case ASCII letters.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        text = unidecode(text).lower()

        # unidecode may return a byte string (Python 2); convert it back to
        # text, ignoring anything that is not ASCII.
        if isinstance(text, bytes):
            text = text.decode('ascii', 'ignore')

        # optionally remove html. Done ahead of the punctuation step so the
        # tag delimiters are still present when tags are stripped
        # (assumes `punct` contains "<" and ">" -- TODO confirm).
        if kw.get('remove_html', True):
            text = html.strip_tags(text)

        # optionally replace punctuation with spaces
        if kw.get('remove_punct', True):
            text = "".join(" " if ch in punct else ch for ch in text)

        # optionally replace digits with spaces
        if kw.get('remove_digits', True):
            text = "".join(" " if ch in digits else ch for ch in text)

        # optionally replace every `re_whitespace` match with one space
        # and trim both ends
        if kw.get('remove_whitespace', True):
            text = re_whitespace.sub(" ", text).strip()

        return text
Exemplo n.º 3
0
def prepare(s):
    """
    Prepare text.

    Passes ``s`` through ``unicode_symbols``, replaces every
    ``re_whitespace`` match with a single space, trims both ends and
    returns the result of ``unidecode`` on that text.
    """
    normalized = re_whitespace.sub(' ', unicode_symbols(s))
    return unidecode(normalized.strip())
Exemplo n.º 4
0
def prepare(s):
    """
    Prepare text.

    Passes ``s`` through ``unicode_symbols``, replaces every
    ``re_whitespace`` match with a single space and trims both ends, then
    returns the result of ``unidecode`` on that text.

    If ``unidecode`` raises a ``Warning`` the text is returned as it was
    before the ``unidecode`` call (best effort).
    """
    cleaned = re_whitespace.sub(' ', unicode_symbols(s)).strip()
    try:
        return unidecode(cleaned)
    except Warning:
        # best effort: fall back to the text without the unidecode step
        return cleaned
Exemplo n.º 5
0
    def _process_text(self, text, **kw):
        """
        Preprocess text.
        """

        # optionally remove punctuation
        if kw.get('rm_punct', True):
            text = "".join(map(lambda x: x if x not in punct else " ", text))

        # optionally remove digits
        if kw.get('rm_digits', True):
            text = "".join(map(lambda x: x if x not in digits else " ", text))

        # optionally remove whitespace
        if kw.get('rm_html', True):
            text = html.strip_tags(text)

        # optionally remove whitespace
        if kw.get('rm_whitespace', True):
            text = re_whitespace.sub(" ", text).strip()

        return text
Exemplo n.º 6
0
    def _process_text(self, text, **kw):
        """
        Preprocess text.
        """

        # optionally remove punctuation
        if kw.get("rm_punct", True):
            text = "".join(map(lambda x: x if x not in punct else " ", text))

        # optionally remove digits
        if kw.get("rm_digits", True):
            text = "".join(map(lambda x: x if x not in digits else " ", text))

        # optionally remove whitespace
        if kw.get("rm_html", True):
            text = html.strip_tags(text)

        # optionally remove whitespace
        if kw.get("rm_whitespace", True):
            text = re_whitespace.sub(" ", text).strip()

        return text