示例#1
0
    def _process_text(self, text, **kw):
        """
        Normalize raw text before further processing.

        The text is always transliterated with ``unidecode`` and
        lower-cased. The remaining clean-up steps are enabled by default
        and can be switched off individually through keyword arguments.

        :param text: the text to clean; byte strings are decoded as UTF-8.
        :param remove_html: run ``html.strip_tags`` over the text
            (default ``True``).
        :param remove_punct: replace every character found in ``punct``
            with a space (default ``True``).
        :param remove_digits: replace every character found in ``digits``
            with a space (default ``True``).
        :param remove_whitespace: replace every ``re_whitespace`` match
            with a single space and strip both ends (default ``True``).
        :return: the cleaned text.
        """
        # decode byte strings only: calling .decode() on text that is
        # already unicode forces an implicit ASCII encode under Python 2
        # and fails on the first non-ASCII character.
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        # always unidecode + lower case. Lower-casing happens last because
        # lower() on undecoded bytes skips non-ASCII capitals and the
        # transliteration can itself emit upper-case ASCII.
        text = unicode(unidecode(text), errors='ignore').lower()

        # optionally remove html. This has to run before the punctuation
        # pass: once the angle brackets are blanked out (assuming `punct`
        # contains them) the tags can no longer be recognised.
        if kw.get('remove_html', True):
            text = html.strip_tags(text)

        # optionally remove punctuation
        if kw.get('remove_punct', True):
            text = "".join(" " if char in punct else char for char in text)

        # optionally remove digits
        if kw.get('remove_digits', True):
            text = "".join(" " if char in digits else char for char in text)

        # optionally collapse whitespace
        if kw.get('remove_whitespace', True):
            text = re_whitespace.sub(" ", text).strip()

        return text
示例#2
0
    def _process_text(self, text, **kw):
        """
        Clean a piece of text: transliterate, lower-case and strip noise.

        Transliteration through ``unidecode`` and lower-casing are always
        applied. Every other step is on by default and is disabled by
        passing ``False`` for the matching keyword argument.

        :param text: the text to clean; byte strings are decoded as UTF-8.
        :param remove_html: run ``html.strip_tags`` over the text
            (default ``True``).
        :param remove_punct: replace every character found in ``punct``
            with a space (default ``True``).
        :param remove_digits: replace every character found in ``digits``
            with a space (default ``True``).
        :param remove_whitespace: replace every ``re_whitespace`` match
            with a single space and strip both ends (default ``True``).
        :return: the cleaned text.
        """
        # only byte strings need decoding; under Python 2 a unicode object
        # would be implicitly ASCII-encoded by .decode() and raise on any
        # non-ASCII character.
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        # always unidecode + lower case, in that order: lower() on raw
        # bytes leaves non-ASCII capitals alone, and unidecode may produce
        # upper-case ASCII of its own.
        text = unicode(unidecode(text),
                       errors='ignore').lower()

        # optionally remove html -- done first, because the punctuation
        # pass below would blank the angle brackets (assuming `punct`
        # contains them) and leave nothing for strip_tags to match.
        if kw.get('remove_html', True):
            text = html.strip_tags(text)

        # optionally remove punctuation
        if kw.get('remove_punct', True):
            text = "".join(" " if char in punct else char for char in text)

        # optionally remove digits
        if kw.get('remove_digits', True):
            text = "".join(" " if char in digits else char for char in text)

        # optionally collapse whitespace
        if kw.get('remove_whitespace', True):
            text = re_whitespace.sub(" ", text).strip()

        return text
示例#3
0
def prepare(s):
    """
    Clean up text and transliterate it.

    The input is passed through ``unicode_symbols``, every
    ``re_whitespace`` match is replaced by a single space, both ends are
    stripped, and the result of ``unidecode`` on that text is returned.
    """
    with_symbols = unicode_symbols(s)
    single_spaced = re_whitespace.sub(' ', with_symbols)
    trimmed = single_spaced.strip()
    return unidecode(trimmed)
示例#4
0
def prepare(s):
    """
    Clean up text and transliterate it on a best-effort basis.

    The input is passed through ``unicode_symbols``, every
    ``re_whitespace`` match is replaced by a single space and both ends
    are stripped. The result is then handed to ``unidecode``; if that call
    raises a ``Warning``, the cleaned but untransliterated text is
    returned instead.
    """
    cleaned = re_whitespace.sub(' ', unicode_symbols(s)).strip()
    try:
        return unidecode(cleaned)
    except Warning:
        # best effort: fall back to the text as it was before unidecode
        return cleaned
示例#5
0
    def _process_text(self, text, **kw):
        """
        Strip noise from a piece of text.

        Every step is enabled by default and is disabled by passing
        ``False`` for the matching keyword argument.

        :param text: the text to clean.
        :param rm_html: run ``html.strip_tags`` over the text
            (default ``True``).
        :param rm_punct: replace every character found in ``punct`` with a
            space (default ``True``).
        :param rm_digits: replace every character found in ``digits`` with
            a space (default ``True``).
        :param rm_whitespace: replace every ``re_whitespace`` match with a
            single space and strip both ends (default ``True``).
        :return: the cleaned text.
        """

        # optionally remove html. This has to run before the punctuation
        # pass: once the angle brackets are blanked out (assuming `punct`
        # contains them) the tags can no longer be recognised.
        if kw.get('rm_html', True):
            text = html.strip_tags(text)

        # optionally remove punctuation
        if kw.get('rm_punct', True):
            text = "".join(" " if char in punct else char for char in text)

        # optionally remove digits
        if kw.get('rm_digits', True):
            text = "".join(" " if char in digits else char for char in text)

        # optionally collapse whitespace
        if kw.get('rm_whitespace', True):
            text = re_whitespace.sub(" ", text).strip()

        return text
示例#6
0
    def _process_text(self, text, **kw):
        """
        Remove markup, punctuation, digits and extra whitespace from text.

        Each step runs unless its keyword argument is passed as ``False``.

        :param text: the text to clean.
        :param rm_html: run ``html.strip_tags`` over the text
            (default ``True``).
        :param rm_punct: replace every character found in ``punct`` with a
            space (default ``True``).
        :param rm_digits: replace every character found in ``digits`` with
            a space (default ``True``).
        :param rm_whitespace: replace every ``re_whitespace`` match with a
            single space and strip both ends (default ``True``).
        :return: the cleaned text.
        """

        # optionally remove html -- done first, because the punctuation
        # pass below would blank the angle brackets (assuming `punct`
        # contains them) and leave nothing for strip_tags to match.
        if kw.get("rm_html", True):
            text = html.strip_tags(text)

        # optionally remove punctuation
        if kw.get("rm_punct", True):
            text = "".join(" " if char in punct else char for char in text)

        # optionally remove digits
        if kw.get("rm_digits", True):
            text = "".join(" " if char in digits else char for char in text)

        # optionally collapse whitespace
        if kw.get("rm_whitespace", True):
            text = re_whitespace.sub(" ", text).strip()

        return text