def clean_number_tens(text):
    """
    Extract a tens-column number expressed in words from the given text and convert it to an integer.

    The lookup table maps each multiple of ten (10..90) to its pattern in Patterns and is handed to
    RegexUtils.clean_with_patterns, whose result is returned unchanged.

    >>> DemographicCleaner.clean_number_tens("arobaini")
    40

    :param text: Text to clean.
    :type text: str
    :return: 10, 20, 30, ..., 80, or 90 (the no-match value is decided by
             RegexUtils.clean_with_patterns, which is not visible here).
    :rtype: int
    """
    # Keys are generated in ascending order so the table has the same insertion order as a literal.
    tens_patterns = dict(zip(
        range(10, 100, 10),
        (
            Patterns.ten,
            Patterns.twenty,
            Patterns.thirty,
            Patterns.forty,
            Patterns.fifty,
            Patterns.sixty,
            Patterns.seventy,
            Patterns.eighty,
            Patterns.ninety,
        ),
    ))
    return RegexUtils.clean_with_patterns(text, tens_patterns)
def clean_number_units(text):
    """
    Extract a units-column number expressed in words from the given text and convert it to an integer.

    The lookup table maps 1..9 to the matching pattern in Patterns and is handed to
    RegexUtils.clean_with_patterns, whose result is returned unchanged.

    >>> DemographicCleaner.clean_number_units("tano")
    5

    :param text: Text to clean.
    :type text: str
    :return: A number from 1-9 inclusive (the no-match value is decided by
             RegexUtils.clean_with_patterns, which is not visible here).
    :rtype: int
    """
    ordered_unit_patterns = (
        Patterns.one,
        Patterns.two,
        Patterns.three,
        Patterns.four,
        Patterns.five,
        Patterns.six,
        Patterns.seven,
        Patterns.eight,
        Patterns.nine,
    )
    # Number the patterns from 1 so each key is the value the pattern stands for.
    unit_patterns = {
        value: pattern
        for value, pattern in enumerate(ordered_unit_patterns, start=1)
    }
    return RegexUtils.clean_with_patterns(text, unit_patterns)
def clean_number_teens(text):
    """
    Extract a "teens" number expressed in words from the given text and convert it to an integer.

    The lookup table maps 11..19 to the matching pattern in Patterns and is handed to
    RegexUtils.clean_with_patterns, whose result is returned unchanged.

    >>> DemographicCleaner.clean_number_teens("eleven")
    11

    :param text: Text to clean.
    :type text: str
    :return: A number from 11-19 inclusive (the no-match value is decided by
             RegexUtils.clean_with_patterns, which is not visible here).
    :rtype: int
    """
    teen_values = range(11, 20)
    teen_regexes = (
        Patterns.eleven,
        Patterns.twelve,
        Patterns.thirteen,
        Patterns.fourteen,
        Patterns.fifteen,
        Patterns.sixteen,
        Patterns.seventeen,
        Patterns.eighteen,
        Patterns.nineteen,
    )
    return RegexUtils.clean_with_patterns(text, dict(zip(teen_values, teen_regexes)))
def clean_number_words(text):
    """
    Extract the numbers in the given text that are expressed in words and convert them to an integer.

    Every (number, pattern) entry of DemographicPatterns.numbers is tested against the text with
    RegexUtils.has_matches. Each number whose pattern matches contributes once to the result, no
    matter how often it occurs in the text, and the result is the sum of those numbers.

    >>> DemographicCleaner.clean_number_words("lix iyo lawatan")
    26
    >>> DemographicCleaner.clean_number_words("afar shan lix")
    15

    :param text: Text to clean
    :type text: str
    :return: Sum of the matched numbers, or Codes.NOT_CODED if no pattern matched.
    :rtype: int, or the type of Codes.NOT_CODED
    """
    matched_numbers = [
        value
        for value, regex in DemographicPatterns.numbers.items()
        if RegexUtils.has_matches(text, regex)
    ]

    # An empty list means nothing matched at all; a matched zero would still yield a non-empty list.
    if not matched_numbers:
        return Codes.NOT_CODED
    return sum(matched_numbers)
def clean_gender(text):
    """
    Identify the gender mentioned in the given string.

    Builds a table from Codes.MAN / Codes.WOMAN to the corresponding pattern in Patterns and
    passes it to RegexUtils.clean_with_patterns, whose result is returned unchanged.

    >>> DemographicCleaner.clean_gender("KiUMe")
    'man'

    :param text: Text to clean.
    :type text: str
    :return: Codes.MAN or Codes.WOMAN; when no gender is identified, whatever
             RegexUtils.clean_with_patterns returns for "no match" (not visible here).
    :rtype: str
    """
    gender_patterns = {}
    gender_patterns[Codes.MAN] = Patterns.man
    gender_patterns[Codes.WOMAN] = Patterns.woman
    return RegexUtils.clean_with_patterns(text, gender_patterns)
def is_noise(cls, text, min_length=-1):
    """
    Decide whether the given text should be treated as noise.

    Text counts as noise when it matches DemographicPatterns.noise or is shorter than min_length,
    unless one of the demographic checks (yes/no, gender, urban/rural, Somalia district, age)
    recognises it.

    :param text: Text to test.
    :param min_length: Texts with fewer characters than this are considered noise. The length
                       check is only applied when this is greater than 0.
    :return: Truthy if the text is noise and was not recognised as demographic data.
    """
    # Note: both is_only_yes_no AND clean_yes_no are tested, because is_only_yes_no being True
    # does not imply that clean_yes_no returns something other than Codes.NOT_CODED.
    recognised_as_demographic = bool(
        cls.is_only_yes_no(text)
        or cls.clean_gender(text) is not Codes.NOT_CODED
        or cls.clean_yes_no(text) is not Codes.NOT_CODED
        or cls.clean_urban_rural(text) is not Codes.NOT_CODED
        or cls.clean_somalia_district(text) is not Codes.NOT_CODED
        or cls.clean_age(text) is not Codes.NOT_CODED
    )

    matches_noise_pattern = RegexUtils.has_matches(text, DemographicPatterns.noise)
    # The length is only inspected when a positive minimum has been requested.
    too_short = min_length > 0 and len(text) < min_length

    return (matches_noise_pattern or too_short) and not recognised_as_demographic
input_path = args.input[0] key_of_raw = args.key_of_raw[0] pattern = args.regex_pattern[0] json_output_path = args.json_output[0] matches_csv_output_path = args.matches_csv_output[0] key_of_matches = "{}_matches".format(key_of_raw) # Load data from JSON file with open(input_path, "r") as f: data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Apply the regex pattern to the data for td in data: td.append_data( {key_of_matches: RegexUtils.has_matches(td[key_of_raw], pattern)}, Metadata(user, Metadata.get_call_location(), time.time())) # Write json output if os.path.dirname(json_output_path) is not "" and not os.path.exists( os.path.dirname(json_output_path)): os.makedirs(os.path.dirname(json_output_path)) with open(json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True) # Output to a CSV listing de-duplicated responses and whether or not they matched the regex pattern if os.path.dirname( matches_csv_output_path) is not "" and not os.path.exists( os.path.dirname(matches_csv_output_path)):
def clean_somalia_district(text):
    """
    Clean the given text against DemographicPatterns.somalia_districts.

    :param text: Text to clean.
    :return: The result of RegexUtils.clean_with_patterns for the district patterns.
    """
    district_patterns = DemographicPatterns.somalia_districts
    return RegexUtils.clean_with_patterns(text, district_patterns)
def clean_mogadishu_sub_district(text):
    """
    Clean the given text against DemographicPatterns.mogadishu_sub_districts.

    :param text: Text to clean.
    :return: The result of RegexUtils.clean_with_patterns for the sub-district patterns.
    """
    sub_district_patterns = DemographicPatterns.mogadishu_sub_districts
    return RegexUtils.clean_with_patterns(text, sub_district_patterns)
def clean_urban_rural(text):
    """
    Clean the given text against DemographicPatterns.urban_rural.

    :param text: Text to clean.
    :return: The result of RegexUtils.clean_with_patterns for the urban/rural patterns.
    """
    urban_rural_patterns = DemographicPatterns.urban_rural
    return RegexUtils.clean_with_patterns(text, urban_rural_patterns)
def clean_yes_no(text):
    """
    Clean the given text against DemographicPatterns.yes_no.

    :param text: Text to clean.
    :return: The result of RegexUtils.clean_with_patterns for the yes/no patterns.
    """
    yes_no_patterns = DemographicPatterns.yes_no
    return RegexUtils.clean_with_patterns(text, yes_no_patterns)
def clean_gender(text):
    """
    Clean the given text against DemographicPatterns.genders.

    :param text: Text to clean.
    :return: The result of RegexUtils.clean_with_patterns for the gender patterns.
    """
    gender_patterns = DemographicPatterns.genders
    return RegexUtils.clean_with_patterns(text, gender_patterns)
def is_only_yes_no(text):
    """
    Test the given text against DemographicPatterns.only_yes_no.

    :param text: Text to test.
    :return: The result of RegexUtils.has_matches for the only-yes/no pattern.
    """
    only_yes_no_pattern = DemographicPatterns.only_yes_no
    return RegexUtils.has_matches(text, only_yes_no_pattern)