Example #1
import re

import translitcodec  # registers the transliteration codecs and helpers

def slugify(*args, **kwargs):
    """Join a series of strings into a URL slug.

    - normalizes strings to proper ASCII representations
    - removes non-alphanumeric characters
    - replaces whitespace with dashes

    :param lower: Whether the slug should be all-lowercase
    :param maxlen: Maximum slug length
    :param fallback: Fallback in case of an empty slug
    """

    lower = kwargs.get('lower', True)
    maxlen = kwargs.get('maxlen')
    fallback = kwargs.get('fallback', '')

    value = '-'.join(str(val) for val in args)
    value = translitcodec.long_encode(value)[0]
    value = re.sub(r'[^\w\s-]', '', value, flags=re.ASCII).strip()

    if lower:
        value = value.lower()
    value = re.sub(r'[-\s]+', '-', value)
    if maxlen:
        value = value[0:maxlen].rstrip('-')

    return value or fallback
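
A quick usage sketch (the expected slugs assume translitcodec's long table, which expands e.g. £ to "GBP" and ö to "oe"):

print(slugify('Hello', 'Wörld'))                # 'hello-woerld'
print(slugify('£ ☹ wøóf méåw'))                 # 'gbp-woof-meaw'
print(slugify('', fallback='untitled'))         # 'untitled'
print(slugify('A very long title', maxlen=10))  # 'a-very-lon'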
Example #2
import re

import translitcodec  # registers the transliteration codecs and helpers

# _punct_re is defined elsewhere in the original module; a typical
# punctuation-splitting pattern (an assumption) would be:
_punct_re = re.compile(r'[\t !"#$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+')

def slugify(text, delim='-'):
    """Generates an ASCII-only slug."""
    result = []
    for word in _punct_re.split(text.lower()):
        word = translitcodec.long_encode(word)[0]
        if word:
            result.append(word)
    return delim.join(result)
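
Usage differs from Example #1 only in the splitting strategy; with the assumed pattern above:

print(slugify('My Neighbör Totoro!'))  # 'my-neighboer-totoro'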
Example #3

    def stem_document(self, doc, re_fit):
        """
        Stem the document, and prepare the stemmer for the inverse_transform by keeping
        track of a mapping back to the original expressions and their counts.

        :param doc: document string
        :param re_fit: boolean; if True, prepare the stemmer for the inverse_transform by saving state.
        :return: stemmed document string
        """
        # Treat punctuation as word boundaries: replace each punctuation
        # character with a space, then collapse runs of whitespace.
        for punctuation_character in punctuation:
            doc = doc.replace(punctuation_character, " ")
        doc = " ".join(doc.split())

        words = doc.split(" ")
        # Stemmed words won't have accents nor capital letters anymore.
        transformed_words = [
            translitcodec.long_encode(w)[0].lower() for w in words
        ]
        stemmer = st.Stemmer(self.language)  # assumes: import Stemmer as st (PyStemmer)
        stemmed_words = stemmer.stemWords(transformed_words)

        if re_fit:
            # Keep track of things for inverse stemming: each stemmed word maps to
            # the original words it came from, with their counts. The inverse
            # relationship is not deterministic: we need to count occurrences
            # because we need the TOP (most frequent) equivalent word back.
            for (_word, _stemmed_word) in zip(words, stemmed_words):
                equiv_counts = self.stemmed_word_to_equiv_word_count.setdefault(
                    _stemmed_word, {})
                equiv_counts[_word] = equiv_counts.get(_word, 0) + 1

        return " ".join(stemmed_words)
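
A minimal sketch of how the saved mapping could invert a stem back to its most frequent original word (the helper name is hypothetical; only the stemmed_word_to_equiv_word_count attribute comes from the snippet above):

    def inverse_stem_word(self, stemmed_word):
        # Hypothetical helper: return the most frequent original word for a stem.
        equiv_counts = self.stemmed_word_to_equiv_word_count.get(stemmed_word)
        if not equiv_counts:
            return stemmed_word  # unseen stem: pass it through unchanged
        return max(equiv_counts, key=equiv_counts.get)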
Example #4
    def remove_from_string(self, text):
        """
        Remove stopwords from a string in the safest possible way to keep the text intact.
        """

        # In the following variables, characters flow from `text` upward, like:
        # text --> last_word | last_punct --> past_text
        past_text = ""
        last_punct = ""
        last_word = ""

        text += "."  # add a last punctuation to loop 1 last time closing the sentence.
        for char in text:
            decoded_char = translitcodec.short_encode(char)[0].lower()

            # Is this character a letter? (checked on its ASCII transliteration)
            char_is_letter = decoded_char in string.ascii_lowercase

            # We loop if it's part of a word.
            if char_is_letter:
                # We're building a word.
                # Loop
                last_word += char

            # Otherwise it's punctuation: we're either before or directly after a word.
            else:

                # We ignore N punctuations in a row before a word.
                if last_word == "":
                    # Move on.
                    last_punct += char
                # Otherwise we're closing a word. Let's process it now.
                else:
                    full_word = last_word
                    safe_full_word = translitcodec.long_encode(
                        full_word)[0].lower()
                    if safe_full_word in self.safe_stopwords:
                        # We remove the word (and the following apostrophe or space if there is one)!
                        full_word = ""
                        if char in "’'‘ ":
                            char = ""

                    # Loop
                    past_text += last_punct + full_word
                    last_punct = char
                    last_word = ""

        past_text += last_punct
        return past_text[:-1]
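
A usage sketch, assuming the methods of Examples #3-#5 live on one transformer class (the class name and constructor are hypothetical; fit() from Example #5 below populates self.safe_stopwords):

remover = StopwordRemover(stopwords=['the', 'a'])  # hypothetical class name
remover.fit()  # builds self.safe_stopwords (see Example #5)
print(remover.remove_from_string('The cat sat on a mat.'))
# expected: 'cat sat on mat.'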
Example #5
    def fit(self, X=None, y=None):
        """
        This method is implemented so the class can be used in a scikit-learn Pipeline().
        X and y are ignored here, but are required by the convention.

        It reads the stopwords from disk if there are none provided.
        """

        if self.stopwords is None:
            current_dir = os.path.dirname(os.path.realpath(__file__))
            stop_words_file = os.path.join(current_dir, "..", "data",
                                           STOPWORDS_FILENAME)
            with open(stop_words_file) as f:
                self.stopwords = f.read().split("\n")
        self.safe_stopwords = [
            translitcodec.long_encode(w)[0].lower() for w in self.stopwords
        ]

        return self
Example #6
import translitcodec  # registers the transliteration codecs and helpers

def str_to_ascii(text):
    return translitcodec.long_encode(text)[0].encode(
        'ascii', 'ignore').decode().strip()
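
For instance, with translitcodec's documented sample string:

print(str_to_ascii('£ ☹ wøóf méåw'))  # 'GBP :-( woof meaw'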
Example #7
#!/usr/local/bin/python3
import sys, os, unicodedata, timeit
sys.path.append(os.getcwd() + '/lib/python3.8/site-packages')
import ctranslitcodec, translitcodec, _ctranslitcodec

x = '£ ☹ wøóf méåw ﷲ etsi vereor judices ne turpe sit pro fortissimo viro incipientem timere minimeque deceat'

a = open('long.txt').read()
print(a)
assert '\u2639' in a
print(translitcodec.long_encode(a))
print(ctranslitcodec.long_encode(unicodedata.normalize('NFKC', a)))
assert ctranslitcodec.long_encode(unicodedata.normalize('NFKC', a))[0] \
  == translitcodec.long_encode(a)[0]
# Validate the codec over every encodable code point not already covered
# by the sample text: unmapped characters should fall back to NFKC.
for i in range(1, 0x10FFFF):
    try:
        c = chr(i)
        c.encode('utf-8')
    except UnicodeEncodeError:
        # Surrogate code points cannot be encoded as UTF-8; skip them.
        continue
    if len(unicodedata.normalize('NFKC', c)) != 1: continue
    if c in a: continue
    if unicodedata.normalize('NFKC', c) in a: continue
    try:
        assert ctranslitcodec.long_encode(c) == \
          (unicodedata.normalize('NFKC', c), 1)
        assert _ctranslitcodec.long_encode(c) == c
    except AssertionError:
        print('FAILED AT', i, c)
        print(ctranslitcodec.long_encode(c), '!=',
              (unicodedata.normalize('NFKC', c), 1))