Пример #1
0
 def test_dont_condense_whitespace(self):
     """Check that a dash-separated and a space-separated name stay distinct.

     Both strings are run through
     ``StringProcessor.replace_non_letters_non_numbers_with_whitespace`` and
     the two processed results are asserted to be unequal.
     """
     with_dash = "new york mets - atlanta braves"
     without_dash = "new york mets atlanta braves"
     normalize = StringProcessor.replace_non_letters_non_numbers_with_whitespace
     self.assertNotEqual(normalize(with_dash), normalize(without_dash))
Пример #2
0
def query_processor(s, force_ascii=False):
    """Normalize a query string: substitute, lowercase, then strip.

    Every match of the module-level ``processor_regex`` in ``s`` is replaced
    with a single space; the result is then passed through
    ``StringProcessor.to_lower_case`` and ``StringProcessor.strip``.

    Args:
        s: The string to normalize.
        force_ascii: Accepted for signature compatibility; this function
            does not read it.

    Returns:
        The value returned by ``StringProcessor.strip``.
    """
    # ``processor_regex`` is only read here, so no ``global`` statement is
    # needed to resolve it at module scope.
    spaced = processor_regex.sub(" ", s)
    return StringProcessor.strip(StringProcessor.to_lower_case(spaced))
Пример #3
0
 def test_replace_non_letters_non_numbers_with_whitespace(self):
     """Check that processing leaves spaces as the only non-word characters.

     Each sample is passed through
     ``StringProcessor.replace_non_letters_non_numbers_with_whitespace``;
     every ``\\W`` match found in the processed string must be a single
     space.
     """
     strings = ["new york mets - atlanta braves", "Cães danados", "New York //// Mets $$$", "Ça va?"]
     # Compiled once: the pattern does not depend on the string under test.
     regex = re.compile(r"(?ui)[\W]")
     for string in strings:
         proc_string = StringProcessor.replace_non_letters_non_numbers_with_whitespace(string)
         for expr in regex.finditer(proc_string):
             self.assertEqual(expr.group(), " ")
Пример #4
0
def full_process(s, force_ascii=False):
    """Normalize a string for comparison.

    Steps, in order:
        1. If ``force_ascii`` is truthy, convert ``s`` with ``asciidammit``.
        2. Replace every character that is neither a letter nor a number
           with whitespace.
        3. Convert to lower case.
        4. Strip leading and trailing whitespace.

    Args:
        s: The string to process.
        force_ascii: When truthy, apply ``asciidammit`` before anything else.

    Returns:
        The processed string as returned by ``StringProcessor.strip``.
    """
    text = asciidammit(s) if force_ascii else s
    # Only letters and numbers survive; everything else becomes whitespace.
    text = StringProcessor.replace_non_letters_non_numbers_with_whitespace(text)
    text = StringProcessor.to_lower_case(text)
    return StringProcessor.strip(text)
Пример #5
0
 def test_replace_non_lettters_non_numbers_with_whitespace(self):
     """Check that processing leaves spaces as the only non-word characters.

     Each sample is passed through
     ``StringProcessor.replace_non_lettters_non_numbers_with_whitespace``;
     every ``\\W`` match found in the processed string must be a single
     space.

     NOTE(review): the "lettters" spelling matches the StringProcessor
     method called below; confirm against that class before renaming.
     """
     strings = [u"new york mets - atlanta braves", u"Cães danados", u"New York //// Mets $$$", u"Ça va?"]
     # Compiled once: the pattern does not depend on the string under test.
     regex = re.compile(r"(?ui)[\W]")
     for string in strings:
         proc_string = StringProcessor.replace_non_lettters_non_numbers_with_whitespace(string)
         for expr in regex.finditer(proc_string):
             # assertEqual replaces the deprecated assertEquals alias, which
             # no longer exists in Python 3.12+.
             self.assertEqual(expr.group(), " ")
Пример #6
0
def semi_process(s, force_ascii=False):
    """
    Lighter variant of Fuzzywuzzy's full_process.

    Compared with full_process, this function deliberately skips two steps:
    -- non-letter/non-number characters are NOT removed, so consecutive
       spans stay intact
    -- the text is NOT lower-cased, since annotators marked verbatim spans
       and case is a good signal

    What it does:
    -- returns "" when s is None
    -- converts to ascii with asciidammit when force_ascii is truthy
    -- strips leading and trailing whitespace
    """
    if s is None:
        return ""

    text = asciidammit(s) if force_ascii else s
    # Trimming surrounding whitespace is the only normalization applied.
    return StringProcessor.strip(text)
Пример #7
0
 def test_dont_condense_whitespace(self):
     """Check that a dash-separated and a space-separated name stay distinct.

     Both inputs go through
     ``StringProcessor.replace_non_letters_non_numbers_with_whitespace``;
     the processed results must not compare equal.
     """
     dashed, plain = [
         StringProcessor.replace_non_letters_non_numbers_with_whitespace(text)
         for text in (
             "new york mets - atlanta braves",
             "new york mets atlanta braves",
         )
     ]
     self.assertNotEqual(dashed, plain)
Пример #8
0
def string_processor(string):
    """Lower-case ``string`` and then strip it.

    Applies ``StringProcessor.to_lower_case`` followed by
    ``StringProcessor.strip`` and returns the result.
    """
    return StringProcessor.strip(StringProcessor.to_lower_case(string))