def name_clean_wrapper(name_list, clean_regex=name_address_regex, legal_regex=legal_regex):
    """Run the psCleanup name pipeline and extract legal identifiers.

    The input is passed, in order, through ``psCleanup.decoder``,
    ``psCleanup.remove_diacritics``, ``psCleanup.stdize_case`` and
    ``psCleanup.master_clean_regex`` (using ``clean_regex``).  The cleaned
    value is then handed to ``psCleanup.get_legal_ids`` with ``legal_regex``.

    NOTE(review): the parameter is called ``name_list`` but every helper is
    applied to it as a whole; confirm the expected input type against the
    psCleanup helpers.

    Returns:
        Whatever ``psCleanup.get_legal_ids`` returns for the cleaned input.
    """
    decoded = psCleanup.decoder(name_list)
    without_diacritics = psCleanup.remove_diacritics(decoded)
    cased = psCleanup.stdize_case(without_diacritics)
    cleaned = psCleanup.master_clean_regex(cased, clean_regex)
    return psCleanup.get_legal_ids(cleaned, legal_regex)
def clean_wrapper(name_string, dict_list=regex_dicts):
    """Clean a single name string with the psCleanup helpers.

    Steps, in order: echo the raw input to stdout, apply
    ``psCleanup.rem_diacritics`` and ``psCleanup.stdize_case``, run
    ``psCleanup.master_clean_regex`` on a one-element list with
    ``dict_list``, then strip surrounding whitespace from the first result.

    Returns:
        The first element returned by ``master_clean_regex``, stripped.
    """
    # Single-argument parenthesised form: prints exactly what the bare
    # Python 2 ``print`` statement prints.
    print(name_string)
    without_diacritics = psCleanup.rem_diacritics(name_string)
    cased = psCleanup.stdize_case(without_diacritics)
    # master_clean_regex is given a list, so wrap the single name and
    # unwrap the first result afterwards.
    cleaned = psCleanup.master_clean_regex([cased], dict_list)
    return cleaned[0].strip()
def clean_wrapper(name_string, dict_list=regex_dicts):
    """Return ``name_string`` after the psCleanup cleaning steps.

    The raw input is printed first.  It is then passed through
    ``psCleanup.rem_diacritics`` and ``psCleanup.stdize_case``, wrapped in
    a one-element list for ``psCleanup.master_clean_regex`` (called with
    ``dict_list``), and the first result is returned with surrounding
    whitespace stripped.
    """
    # Parenthesised single argument: same output as the Python 2 statement.
    print(name_string)
    normalized = psCleanup.rem_diacritics(name_string)
    normalized = psCleanup.stdize_case(normalized)
    results = psCleanup.master_clean_regex([normalized], dict_list)
    first_result = results[0]
    return first_result.strip()
def address_clean_wrapper(address_list, clean_regex=name_address_regex):
    """Run the psCleanup cleaning pipeline over an address value.

    The input is passed, in order, through ``psCleanup.decoder``,
    ``psCleanup.remove_diacritics``, ``psCleanup.stdize_case`` and
    ``psCleanup.master_clean_regex`` (using ``clean_regex``).

    NOTE(review): the parameter is called ``address_list`` but each helper
    receives it as a whole; confirm the expected input type.

    Returns:
        The value returned by ``psCleanup.master_clean_regex``.
    """
    decoded = psCleanup.decoder(address_list)
    without_diacritics = psCleanup.remove_diacritics(decoded)
    cased = psCleanup.stdize_case(without_diacritics)
    return psCleanup.master_clean_regex(cased, clean_regex)
def clean_wrapper(name_dict, dict_list=regex_dicts):
    """Clean the ``"person_name"`` entry of ``name_dict`` in place.

    The stored name is decoded with ``psCleanup.decoder``, passed through
    ``psCleanup.rem_diacritics`` and ``psCleanup.stdize_case``, cleaned by
    ``psCleanup.master_clean_regex`` (as a one-element list, with
    ``dict_list``), stripped, re-encoded with ``psCleanup.encoder`` and
    written back under the same key.

    Returns:
        The same ``name_dict`` object that was passed in, after mutation.
    """
    decoded = psCleanup.decoder(name_dict["person_name"])
    without_diacritics = psCleanup.rem_diacritics(decoded)
    cased = psCleanup.stdize_case(without_diacritics)
    # master_clean_regex is given a list; only the first result is used.
    cleaned = psCleanup.master_clean_regex([cased], dict_list)
    stripped = cleaned[0].strip()
    name_dict["person_name"] = psCleanup.encoder(stripped)
    return name_dict
def clean_wrapper(name_dict, dict_list=regex_dicts):
    """Replace ``name_dict['person_name']`` with its cleaned form.

    Pipeline: ``psCleanup.decoder`` -> ``psCleanup.rem_diacritics`` ->
    ``psCleanup.stdize_case`` -> ``psCleanup.master_clean_regex`` (on a
    one-element list, with ``dict_list``) -> ``strip()`` ->
    ``psCleanup.encoder``.  The result overwrites the original entry.

    Returns:
        ``name_dict`` itself (mutated in place), not a copy.
    """
    person_name = psCleanup.decoder(name_dict['person_name'])
    person_name = psCleanup.rem_diacritics(person_name)
    person_name = psCleanup.stdize_case(person_name)
    results = psCleanup.master_clean_regex([person_name], dict_list)
    first_result = results[0].strip()
    name_dict['person_name'] = psCleanup.encoder(first_result)
    return name_dict
def translate_non_alphanumerics(to_translate, translate_to=u' '):
    """Map every ``string.punctuation`` character to ``translate_to``.

    Builds a table keyed by the ordinal of each punctuation character and
    passes it to ``to_translate.translate``.

    NOTE(review): a dict-based table is the ``unicode.translate`` calling
    convention, so ``to_translate`` is presumably a unicode string; confirm
    against callers.
    """
    punctuation_chars = unicode(string.punctuation)
    ordinal_map = {}
    for punct in punctuation_chars:
        ordinal_map[ord(punct)] = translate_to
    return to_translate.translate(ordinal_map)


def strip_punc(s):
    """Delete every ``string.punctuation`` character from ``s``.

    Uses the two-argument ``str.translate(table, deletechars)`` form with
    an identity table, so characters are only removed, never remapped.
    """
    identity_table = string.maketrans("", "")
    return s.translate(identity_table, string.punctuation)


#Function names below are not exact
N = len(names)

# Time the whole cleaning pipeline over `names`.
t0 = time.time()
clean_names = [psCleanup.rem_diacritics(n) for n in names]
clean_names = [psCleanup.rem_trail_spaces(n) for n in clean_names]
clean_names = [psCleanup.stdize_case(n) for n in clean_names]
clean_names = [translate_non_alphanumerics(n) for n in clean_names]
clean_names = psCleanup.master_clean_dicts(clean_names, all_dicts)
clean_names = [n.strip() for n in clean_names]
t1 = time.time()

### Works out to ~ 0.05s / entry
clean_time = t1 - t0
# Average cleaning time per entry.
print(clean_time / N)

## Then pre-cluster by the leading 3 characters of the name
t0 = time.time()
leading_ngram_dict = psDisambig.build_leading_ngram_dict(clean_names, leading_n=3)
t1 = time.time()
leading_ngram_time = t1 - t0
translate_table = dict( (ord(char), translate_to) for char in not_letters_or_digits) return to_translate.translate(translate_table) def strip_punc(s): s_out = s.translate(string.maketrans("", ""), string.punctuation) return s_out #Function names below are not exact N = len(names) t0 = time.time() clean_names = [psCleanup.rem_diacritics(n) for n in names] clean_names = [psCleanup.rem_trail_spaces(n) for n in clean_names] clean_names = [psCleanup.stdize_case(n) for n in clean_names] clean_names = [translate_non_alphanumerics(n) for n in clean_names] clean_names = psCleanup.master_clean_dicts(clean_names, all_dicts) clean_names = [re.sub(' ', '', n) for n in clean_names] t1 = time.time() ## Define some blocking functions def block_by_2_ngrams(name_string, ngram_length=2): block_dict = {} for name in name_string: these_ngrams = set([ ''.join(name[j] for j in range(i, i + ngram_length)) for i in range((len(name) - ngram_length)) ])
def clean_list_wrapper(name_list, dict_list=all_dicts):
    """Clean every name in ``name_list`` with the psCleanup helpers.

    Each name is passed through ``psCleanup.rem_diacritics`` and
    ``psCleanup.stdize_case``; the whole list is then handed to
    ``psCleanup.master_clean_dicts`` with ``dict_list``, and each result
    is stripped of surrounding whitespace.

    Args:
        name_list: Iterable of names to clean.
        dict_list: Cleaning dictionaries forwarded to
            ``psCleanup.master_clean_dicts``.  Defaults to the
            module-level ``all_dicts``.

    Returns:
        A new list of cleaned, stripped names.
    """
    out = [psCleanup.rem_diacritics(n) for n in name_list]
    out = [psCleanup.stdize_case(n) for n in out]
    # Forward the caller's dictionaries.  Passing the global ``all_dicts``
    # here would silently ignore a ``dict_list`` argument.
    out = psCleanup.master_clean_dicts(out, dict_list)
    return [n.strip() for n in out]
def clean_list_wrapper(name_list, dict_list=all_dicts):
    """Return a cleaned copy of ``name_list``.

    Per-name steps ``psCleanup.rem_diacritics`` and
    ``psCleanup.stdize_case`` run first.  The list is then cleaned as a
    whole by ``psCleanup.master_clean_dicts`` using ``dict_list``, and
    finally each entry has surrounding whitespace stripped.

    Args:
        name_list: Iterable of names to clean.
        dict_list: Dictionaries passed to
            ``psCleanup.master_clean_dicts``; defaults to the module-level
            ``all_dicts``.

    Returns:
        A new list containing the cleaned names.
    """
    out = [psCleanup.rem_diacritics(n) for n in name_list]
    out = [psCleanup.stdize_case(n) for n in out]
    # Honour the ``dict_list`` parameter rather than hard-coding the global
    # ``all_dicts``, so callers can supply their own dictionaries.
    out = psCleanup.master_clean_dicts(out, dict_list)
    out = [n.strip() for n in out]
    return out