Python RegexUtils示例，core_data_modules.cleaners.RegexUtils Python示例

示例#1

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

    def clean_number_tens(text):
        """
        Extracts a tens-column number expressed in words from the given text, and converts it to an integer.

        >>> DemographicCleaner.clean_number_tens("arobaini")
        40

        :param text: Text to clean.
        :type text: str
        :return: Result of RegexUtils.clean_with_patterns for the tens patterns;
                 for a matching text this is one of 10, 20, 30, ..., 80, or 90.
        :rtype: int
        """
        # Patterns listed in ascending order so they line up with 10, 20, ..., 90 below.
        tens_patterns = (
            Patterns.ten,
            Patterns.twenty,
            Patterns.thirty,
            Patterns.forty,
            Patterns.fifty,
            Patterns.sixty,
            Patterns.seventy,
            Patterns.eighty,
            Patterns.ninety,
        )
        value_to_pattern = dict(zip(range(10, 100, 10), tens_patterns))

        return RegexUtils.clean_with_patterns(text, value_to_pattern)

示例#2

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

    def clean_number_units(text):
        """
        Extracts a units-column number expressed in words from the given text, and converts it to an integer.

        >>> DemographicCleaner.clean_number_units("tano")
        5

        :param text: Text to clean.
        :type text: str
        :return: Result of RegexUtils.clean_with_patterns for the units patterns;
                 for a matching text this is a number from 1-9 inclusive.
        :rtype: int
        """
        # Patterns listed in ascending order so they line up with 1, 2, ..., 9 below.
        unit_patterns = (
            Patterns.one,
            Patterns.two,
            Patterns.three,
            Patterns.four,
            Patterns.five,
            Patterns.six,
            Patterns.seven,
            Patterns.eight,
            Patterns.nine,
        )
        value_to_pattern = dict(zip(range(1, 10), unit_patterns))

        return RegexUtils.clean_with_patterns(text, value_to_pattern)

示例#3

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

    def clean_number_teens(text):
        """
        Extracts a "teens" number expressed in words from the given text, and converts it to an integer.

        >>> DemographicCleaner.clean_number_teens("eleven")
        11

        :param text: Text to clean.
        :type text: str
        :return: Result of RegexUtils.clean_with_patterns for the teens patterns;
                 for a matching text this is a number from 11-19 inclusive.
        :rtype: int
        """
        # Patterns listed in ascending order so they line up with 11, 12, ..., 19 below.
        teen_patterns = (
            Patterns.eleven,
            Patterns.twelve,
            Patterns.thirteen,
            Patterns.fourteen,
            Patterns.fifteen,
            Patterns.sixteen,
            Patterns.seventeen,
            Patterns.eighteen,
            Patterns.nineteen,
        )
        value_to_pattern = dict(zip(range(11, 20), teen_patterns))

        return RegexUtils.clean_with_patterns(text, value_to_pattern)

示例#4

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

    def clean_number_words(text):
        """
        Extracts the numbers in the given text that are expressed in words, and converts them to an integer.

        Every (number, pattern) entry of DemographicPatterns.numbers is tested against the text.
        The result is the sum of the numbers whose pattern matched; each entry contributes
        at most once, no matter how often its pattern occurs in the text.

        >>> DemographicCleaner.clean_number_words("lix iyo lawatan")
        26
        >>> DemographicCleaner.clean_number_words("afar shan lix")
        15

        :param text: Text to clean
        :type text: str
        :return: Sum of the matched numbers, or Codes.NOT_CODED if no pattern matched.
        :rtype: int
        """
        matched_numbers = [
            value
            for value, regex in DemographicPatterns.numbers.items()
            if RegexUtils.has_matches(text, regex)
        ]

        # An empty list means nothing matched; this is distinct from matches that sum to zero.
        if not matched_numbers:
            return Codes.NOT_CODED
        return sum(matched_numbers)

示例#5

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

    def clean_gender(text):
        """
        Identifies the gender in the given string.

        >>> DemographicCleaner.clean_gender("KiUMe")
        'man'

        :param text: Text to clean.
        :type text: str
        :return: Result of RegexUtils.clean_with_patterns for the gender patterns;
                 Codes.MAN or Codes.WOMAN when the corresponding pattern is identified.
        :rtype: str
        """
        gender_patterns = {}
        gender_patterns[Codes.MAN] = Patterns.man
        gender_patterns[Codes.WOMAN] = Patterns.woman

        return RegexUtils.clean_with_patterns(text, gender_patterns)

示例#6

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

    def is_noise(cls, text, min_length=-1):
        """
        Decides whether the given text should be treated as noise.

        A text is noise when it matches DemographicPatterns.noise, or is shorter than
        min_length (only when min_length is positive), unless one of the demographic
        cleaners recognises it.

        :param text: Text to test.
        :param min_length: Texts shorter than this count as noise. Values <= 0 disable the length check.
        :return: Whether the text is considered noise.
        """
        # Note: both is_only_yes_no and clean_yes_no are consulted, because
        # is_only_yes_no == True does not imply clean_yes_no does not return Codes.NOT_CODED
        recognised_as_demographic = bool(
            cls.is_only_yes_no(text)
            or cls.clean_gender(text) is not Codes.NOT_CODED
            or cls.clean_yes_no(text) is not Codes.NOT_CODED
            or cls.clean_urban_rural(text) is not Codes.NOT_CODED
            or cls.clean_somalia_district(text) is not Codes.NOT_CODED
            or cls.clean_age(text) is not Codes.NOT_CODED
        )

        matches_noise_pattern = RegexUtils.has_matches(
            text, DemographicPatterns.noise)
        too_short = min_length > 0 and len(text) < min_length

        return (matches_noise_pattern or too_short) and not recognised_as_demographic

示例#7

0

显示文件

文件： regex_tester.py 项目： AfricasVoices/Project-Wellcome-DFID-Somalia

    # Unpack the command-line arguments; each one is read as the first element of a list.
    input_path = args.input[0]
    key_of_raw = args.key_of_raw[0]
    pattern = args.regex_pattern[0]
    json_output_path = args.json_output[0]
    matches_csv_output_path = args.matches_csv_output[0]

    # Key under which the match result for each item is stored.
    key_of_matches = "{}_matches".format(key_of_raw)

    # Load data from JSON file
    with open(input_path, "r") as f:
        data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Apply the regex pattern to the data
    for td in data:
        td.append_data(
            {key_of_matches: RegexUtils.has_matches(td[key_of_raw], pattern)},
            Metadata(user, Metadata.get_call_location(), time.time()))

    # Write json output, creating the output directory first if it does not exist.
    # The directory name is compared with != rather than "is not": identity
    # comparison against a string literal depends on interpreter string caching
    # and raises a SyntaxWarning on Python 3.8+.
    json_output_dir = os.path.dirname(json_output_path)
    if json_output_dir != "" and not os.path.exists(json_output_dir):
        os.makedirs(json_output_dir)
    with open(json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_json(data,
                                                             f,
                                                             pretty_print=True)

    # Output to a CSV listing de-duplicated responses and whether or not they matched the regex pattern
    if os.path.dirname(
            matches_csv_output_path) is not "" and not os.path.exists(
                os.path.dirname(matches_csv_output_path)):

示例#8

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

 def clean_somalia_district(text):
     """
     Cleans the given text against DemographicPatterns.somalia_districts.

     :param text: Text to clean.
     :return: Whatever RegexUtils.clean_with_patterns returns for these patterns.
     """
     district_patterns = DemographicPatterns.somalia_districts
     return RegexUtils.clean_with_patterns(text, district_patterns)

示例#9

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

 def clean_mogadishu_sub_district(text):
     """
     Cleans the given text against DemographicPatterns.mogadishu_sub_districts.

     :param text: Text to clean.
     :return: Whatever RegexUtils.clean_with_patterns returns for these patterns.
     """
     sub_district_patterns = DemographicPatterns.mogadishu_sub_districts
     return RegexUtils.clean_with_patterns(text, sub_district_patterns)

示例#10

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

 def clean_urban_rural(text):
     """
     Cleans the given text against DemographicPatterns.urban_rural.

     :param text: Text to clean.
     :return: Whatever RegexUtils.clean_with_patterns returns for these patterns.
     """
     urban_rural_patterns = DemographicPatterns.urban_rural
     return RegexUtils.clean_with_patterns(text, urban_rural_patterns)

示例#11

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

 def clean_yes_no(text):
     """
     Cleans the given text against DemographicPatterns.yes_no.

     :param text: Text to clean.
     :return: Whatever RegexUtils.clean_with_patterns returns for these patterns.
     """
     yes_no_patterns = DemographicPatterns.yes_no
     return RegexUtils.clean_with_patterns(text, yes_no_patterns)

示例#12

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

 def clean_gender(text):
     """
     Cleans the given text against DemographicPatterns.genders.

     :param text: Text to clean.
     :return: Whatever RegexUtils.clean_with_patterns returns for these patterns.
     """
     gender_patterns = DemographicPatterns.genders
     return RegexUtils.clean_with_patterns(text, gender_patterns)

示例#13

0

显示文件

文件： demographic_cleaner.py 项目： AfricasVoices/CoreDataModules

 def is_only_yes_no(text):
     """
     Tests the given text against DemographicPatterns.only_yes_no.

     :param text: Text to test.
     :return: Whatever RegexUtils.has_matches returns for this pattern.
     """
     only_yes_no_pattern = DemographicPatterns.only_yes_no
     return RegexUtils.has_matches(text, only_yes_no_pattern)