Exemplo n.º 1
0
 def search_results(self, originator):
     """Show the entries of ``self.passwords`` that match the search keyword.

     The keyword is whatever follows the ``Keyword: `` prompt in the search
     input. An entry matches when the case-folded keyword is a substring of
     the case-folded entry. ``originator`` is accepted but not used here.
     """
     prompt = "Keyword: "
     keyword = self.search_input.text[len(prompt):]
     needle = casefold(keyword)
     self.mode = 'search_results'
     self.set_header('SEARCH RESULTS FOR "{0}"'.format(keyword))
     matches = []
     for entry in self.passwords:
         if needle in casefold(entry):
             matches.append(entry)
     self._clear_box()
     # The first item in the box leads back to the current view.
     self.box.body.append(
         BackButton('BACK', self.load_dispatch, self.current, self))
     self._make_password_buttons(matches)
Exemplo n.º 2
0
 def test_unsupported(self):
     """Check that full case folding (C+F) is done, not simple or special."""
     # Turkish dotted capital I, U+0130, folds to:
     #    U+0069 U+0307 with full case folding (F)
     #    U+0069 ("i") alone with the Turkish special case (T)
     # Full case folding is what we expect.
     dotted_capital_i = casefold("\u0130")
     self.assertEqual(dotted_capital_i, "i\u0307")

     # Full versus simple folding; U+1F9B folds to:
     #    U+1F23 U+03B9 with full case folding (F)
     #    U+1F93 alone with simple case folding (S)
     greek_capital = casefold("\u1F9B")
     self.assertEqual(greek_capital, "\u1F23\u03B9")
Exemplo n.º 3
0
 def test_unsupported(self):
     """Check that full case folding (C+F) is done, not simple or special."""
     full_folding_cases = (
         # U+0130 (Turkish dotted capital I) folds to:
         #    U+0069 U+0307 with full case folding (F)
         #    U+0069 ("i") alone with the Turkish special case (T)
         # Full case folding is what we expect.
         (u"\u0130", u"i\u0307"),
         # U+1F9B folds to:
         #    U+1F23 U+03B9 with full case folding (F)
         #    U+1F93 alone with simple case folding (S)
         (u"\u1F9B", u"\u1F23\u03B9"),
     )
     for source, expected in full_folding_cases:
         self.assertEqual(casefold(source), expected)
Exemplo n.º 4
0
def standardize_string(my_string):
    """Return a copy of a string in a standardized format for comparison.

    Surrounding whitespace is stripped, the text is case-folded and every
    underscore is replaced by a space.

    Args:
        my_string: UTF-8 encoded bytes, or already-decoded text.

    Returns:
        The standardized text.
    """
    text = my_string.strip()
    # Decode byte strings only.  Calling ``.decode`` on text implicitly
    # ASCII-encodes it first on Python 2 (failing for non-ASCII characters)
    # and does not exist at all on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    folded = casefold(text).strip()
    return folded.replace('_', ' ').strip()
Exemplo n.º 5
0
def transform_text(parseText, punctuations, casefolding):
    """Optionally case-fold a text and strip its punctuation.

    Args:
        parseText: The text to transform. UTF-8 encoded bytes are decoded
            before case folding.
        punctuations: When true, keep only alphanumeric characters, hyphens
            and spaces.
        casefolding: When true, case-fold the text first.

    Returns:
        The transformed text. When ``punctuations`` is false the (possibly
        case-folded) text is returned unchanged instead of being discarded.
    """
    if casefolding:
        # Decode byte strings only; text has no usable ``decode``.
        if isinstance(parseText, bytes):
            parseText = parseText.decode('utf-8')
        parseText = casefold(parseText)
    if not punctuations:
        return parseText
    return ''.join(
        letter for letter in parseText
        if letter.isalnum() or letter in ('-', ' ')
    )
Exemplo n.º 6
0
 def parse(self, data):
     """Normalize text for matching and return it as UTF-8 bytes.

     The text is case-folded, every run of non-word characters is replaced
     by a single space, and the ends are trimmed.

     The substitution runs on the text *before* encoding. Applied to UTF-8
     bytes, ``\\W`` treats each byte of a non-ASCII letter as punctuation
     (and on Python 3 a text pattern cannot be applied to bytes at all).

     :param data: the text to normalize.
     :return: the normalized text, encoded as UTF-8.
     """
     # case-fold handled (this also covers lowercasing)
     folded = casefold(data)
     # punctuation removed, with Unicode-aware word characters
     cleaned = re.sub(r'\W+', ' ', folded, flags=re.UNICODE)
     # encode to utf-8 only once the text processing is finished
     return cleaned.strip().encode('utf-8')
Exemplo n.º 7
0
    def _normalize(cls, value):
        """
        Normalize a search value.

        The value is put into NFC form, case-folded, and then stripped of
        everything C{cls._searchNoise} matches.

        @type value: L{unicode}
        @param value: The value to normalize.

        @rtype: L{unicode}
        @return: The normalized value.
        """
        composed = normalize('NFC', value)
        folded = casefold(composed)
        return cls._searchNoise.sub(u'', folded)
Exemplo n.º 8
0
    def _normalize(cls, value):
        """
        Normalize a search value.

        Steps, in order: NFC normalization, case folding, removal of every
        match of C{cls._searchNoise}.

        @type value: L{unicode}
        @param value: The value to normalize.

        @rtype: L{unicode}
        @return: The normalized value.
        """
        caseless = casefold(normalize('NFC', value))
        noise = cls._searchNoise
        return noise.sub(u'', caseless)
Exemplo n.º 9
0
def text_preprocess(review):
    """
    Clean a UTF-8 encoded review and return its stemmed tokens.

    1. Decode the review as UTF-8 and casefold it
    2. Tokenize the result with word_tokenize
    3. Drop every token found in ``stop``
    4. Stem the remaining tokens with ``stemmer``
    """
    text = casefold(unicode(review, 'utf-8'))
    stems = []
    for token in word_tokenize(text):
        if token not in stop:
            stems.append(stemmer.stem(token))
    return stems
Exemplo n.º 10
0
def transform_for_lm(string):
    """Turn a string into space-separated, case-folded symbols.

    Each input character becomes one symbol: a space becomes ``}``, a period
    that ends the string or is followed by a space becomes ``</s>``, and any
    other character is lower-cased. Symbols are joined by single spaces and
    the result is case-folded.
    """
    final_index = len(string) - 1
    symbols = []
    for index, char in enumerate(string):
        if char == ' ':
            symbols.append(u'}')
        elif char == '.' and (index == final_index
                              or string[index + 1] == ' '):
            # treat period as end of sentence
            symbols.append(u'</s>')
        else:
            symbols.append(char.lower())
    return casefold(u' '.join(symbols))
Exemplo n.º 11
0
    def _apply(self, value):
        """Return the case-folded value, or None when filtering reported errors."""
        value = self._filter(value, Type(text_type))  # type: Text

        if self._has_errors:
            return None

        if not PY3:
            # Python 2 has no native case folding; py2casefold is the best
            # available substitute.
            # noinspection PyUnresolvedReferences
            from py2casefold import casefold
            return casefold(value)

        # Python 3 supports case folding natively.
        # https://docs.python.org/3/library/stdtypes.html#str.casefold
        # noinspection PyUnresolvedReferences
        return value.casefold()
Exemplo n.º 12
0
def _load_error_contexts(pickle_path):
    """Load a pickled table and return its 'Error Context' column with spaces as '}'."""
    # NOTE: pickle.load can execute arbitrary code; only feed it trusted files.
    # The file is opened in a ``with`` block so the handle is always closed.
    with open(pickle_path, "rb") as pickle_handle:
        frame = pickle.load(pickle_handle)
    # presumably a pandas DataFrame (it is indexed by column and uses .str)
    return frame['Error Context'].str.replace(" ", "}")  # convert spaces to }


def create_vocab_for_lm(pickle_file_1, pickle_file_2=None):
    """Write every distinct character of the error contexts to ``all_chars.txt``.

    Args:
        pickle_file_1: Path of a pickled table with an 'Error Context' column.
            Its characters are collected as they are.
        pickle_file_2: Optional path of a second such pickle. Its characters
            are case-folded before being added.

    The output file is UTF-8 encoded and holds one character per line, in
    set-iteration (unspecified) order. It is written to the current working
    directory.
    """
    outputfile_name = "all_chars.txt"
    context_1 = _load_error_contexts(pickle_file_1)
    # Joining with a space means ' ' is part of the vocabulary whenever there
    # is more than one context.
    all_chars = set(' '.join(context_1.values))

    if pickle_file_2:
        context_2 = _load_error_contexts(pickle_file_2)
        chars_2 = set(casefold(' '.join(context_2.values)))  # casefold
        all_chars = chars_2.union(all_chars)

    with codecs.open(outputfile_name, 'w', 'utf-8') as outfile:
        for char in all_chars:
            outfile.write(char + '\n')
Exemplo n.º 13
0
def casefold_file(file_name):
    """Write a case-folded copy of a UTF-8 text file.

    The copy is stored under the original name with ``_casefolded`` appended
    and is produced one line at a time.
    """
    target_name = file_name + '_casefolded'
    with codecs.open(file_name, 'r', 'utf-8') as source, \
            codecs.open(target_name, 'w', 'utf-8') as target:
        for text_line in source:
            folded_line = casefold(text_line)
            target.write(folded_line)
Exemplo n.º 14
0
 def test_basic(self):
     """Case folding handles non-ASCII text and is caseless across variants."""
     german = casefold(u"tschüß")
     self.assertEqual(german, u"tschüss")

     # Two spellings differing only in case must fold to the same text.
     mixed_case = casefold(u"ΣίσυφοςfiÆ")
     other_case = casefold(u"ΣΊΣΥΦΟσFIæ")
     self.assertEqual(mixed_case, other_case)

     ligature = casefold(u"ffi")
     self.assertEqual(ligature, u"ffi")
Exemplo n.º 15
0
 def test_ascii_only(self):
     """ASCII-only input folds to its lower-case form."""
     expectations = ((u"aA", u"aa"), (u"fOo Bar", u"foo bar"))
     for raw, folded in expectations:
         self.assertEqual(casefold(raw), folded)
Exemplo n.º 16
0
 def test_empty_string(self):
     """Folding the empty string yields the empty string."""
     folded = casefold(u"")
     self.assertEqual(folded, u"")
Exemplo n.º 17
0
 def test_empty_string(self):
     """Folding the empty string yields the empty string."""
     folded = casefold("")
     self.assertEqual(folded, "")
Exemplo n.º 18
0
def normalize_caseless(text):
    """Case-fold ``text`` and return the result in NFKD normal form."""
    folded = casefold(text)
    return unicodedata.normalize("NFKD", folded)
Exemplo n.º 19
0
 def test_ascii_only(self):
     """ASCII-only input folds to its lower-case form."""
     expectations = (("aA", "aa"), ("fOo Bar", "foo bar"))
     for raw, folded in expectations:
         self.assertEqual(casefold(raw), folded)
Exemplo n.º 20
0
def normalize_caseless(text):
    """Case-fold ``text`` and return the result in NFC normal form."""
    folded = casefold(text)
    return unicodedata.normalize("NFC", folded)
Exemplo n.º 21
0
 def test_basic(self):
     """Case folding handles non-ASCII text and is caseless across variants."""
     german = casefold("tschüß")
     self.assertEqual(german, "tschüss")

     # Two spellings differing only in case must fold to the same text.
     mixed_case = casefold("ΣίσυφοςfiÆ")
     other_case = casefold("ΣΊΣΥΦΟσFIæ")
     self.assertEqual(mixed_case, other_case)

     ligature = casefold("ffi")
     self.assertEqual(ligature, "ffi")
Exemplo n.º 22
0
def caseFolding(data):
    """Case-fold every item of ``data`` in place and return the same object."""
    for position, item in enumerate(data):
        data[position] = casefold(item)
    return data
Exemplo n.º 23
0
 def username_eq(self, u1, u2):
     """
     Report whether two usernames are equal once both are case-folded. Can
     be overridden if the service has case sensitive usernames.
     """
     folded_first = casefold(u1)
     folded_second = casefold(u2)
     return folded_first == folded_second