Пример #1
0
def normalize_us_phone_number(phone):
    """Tidy up phone numbers so they're nice.

    Parentheses and whitespace are removed from ``phone`` and the result is
    searched with ``phone_digits_re``.

    :param phone: The raw phone number string.
    :return: ``'(g1) g2-g3'`` built from the first three groups captured by
        ``phone_digits_re``, or an empty string when the pattern does not
        match.
    """
    # Raw string: '\(' and '\s' are regex escapes, not string escapes, and a
    # non-raw literal triggers invalid-escape warnings on modern Python.
    phone = re.sub(r'(\(|\)|\s+)', '', phone)
    m = phone_digits_re.search(phone)
    if m:
        return '(%s) %s-%s' % (m.group(1), m.group(2), m.group(3))
    return ''
Пример #2
0
def normalize_us_phone_number(phone):
    """Return ``phone`` reformatted from the groups of ``phone_digits_re``.

    Parentheses and runs of whitespace are dropped first. When the cleaned
    text matches ``phone_digits_re``, its first three groups are rendered as
    ``(g1) g2-g3``; otherwise an empty string is returned.
    """
    stripped = re.sub(r"(\(|\)|\s+)", "", phone)
    match = phone_digits_re.search(stripped)
    if match is None:
        return ""
    first, second, third = match.group(1, 2, 3)
    return f"({first}) {second}-{third}"
Пример #3
0
def normalize_us_phone_number(phone):
    """Tidy up phone numbers so they're nice.

    Strips parentheses and whitespace from ``phone``, then searches what is
    left with ``phone_digits_re``.

    :param phone: The raw phone number string.
    :return: The first three groups captured by ``phone_digits_re`` formatted
        as ``'(g1) g2-g3'``, or ``''`` when there is no match.
    """
    # The pattern must be a raw string; otherwise '\(' , '\)' and '\s' are
    # invalid string escape sequences that Python warns about.
    phone = re.sub(r'(\(|\)|\s+)', '', phone)
    m = phone_digits_re.search(phone)
    if m:
        return '(%s) %s-%s' % (m.group(1), m.group(2), m.group(3))
    return ''
Пример #4
0
 def to_python(self, value):
     """Convert ``value`` to text, hyphen-joining matched phone groups.

     A value found in ``EMPTY_VALUES`` yields an empty string. Anything else
     is passed through ``smart_unicode``; if ``phone_digits_re`` matches the
     converted text, its captured groups are joined with ``-``, otherwise
     the converted text is returned as is.
     """
     if value in EMPTY_VALUES:
         return u''
     text = smart_unicode(value)
     match = phone_digits_re.search(text)
     return u'-'.join(match.groups()) if match else text
Пример #5
0
def normalize_attorney_contact(c, fallback_name=''):
    """Normalize the contact string for an attorney.

    Attorney contact strings are newline separated addresses like:

        Landye Bennett Blumstein LLP
        701 West Eighth Avenue, Suite 1200
        Anchorage, AK 99501
        907-276-5152
        Email: jdoe@example.com

    We need to pull off email and phone numbers, and then our address parser
    should work nicely.

    :param c: The raw, newline-separated contact string. May be empty or None.
    :param fallback_name: Name used when the address parser finds no
        recipient. It is replaced by the first line of ``c`` when that line
        contains an ampersand.
    :return: A two-tuple ``(address_info, atty_info)``. ``atty_info`` always
        has the keys ``email``, ``fax`` and ``phone`` (empty strings when not
        found). ``address_info`` is an empty dict when ``c`` is empty, when
        ``usaddress`` raises ``RepeatedLabelError``, or when the parsed
        address is ambiguous or contains a country name.
    """
    atty_info = {
        'email': '',
        'fax': '',
        'phone': '',
    }
    if not c:
        return {}, atty_info

    address_lines = []
    for i, line in enumerate(c.split('\n')):
        # Raw string so that '\s' is a regex escape, not a string escape.
        line = re.sub(r'Email:\s*', '', line).strip()
        line = re.sub('pro se', '', line, flags=re.I)
        if not line:
            continue
        try:
            validate_email(line)
        except ValidationError:
            # Not an email address, press on.
            pass
        else:
            # An email address.
            atty_info['email'] = line
            continue

        # Perhaps a phone/fax number? Drop parens, slashes, backslashes and
        # whitespace before matching.
        clean_line = re.sub(r'(\(|\)|\\|/|\s+)', '', line)
        if clean_line.startswith('Fax:'):
            clean_line = re.sub('Fax:', '', clean_line)
            if phone_digits_re.search(clean_line):
                atty_info['fax'] = clean_line
            # A "Fax:" line never belongs to the address, matched or not.
            continue
        if phone_digits_re.search(clean_line):
            atty_info['phone'] = clean_line
            continue

        # First line containing an ampersand? These are usually law firm names.
        if u'&' in line and i == 0:
            fallback_name = line
            continue

        if re.search('[a-zA-Z]', line):
            # Not email, phone, fax, and has at least one char.
            address_lines.append(line)

    # Collapse usaddress's fine-grained labels onto our address fields.
    mapping = {
        'Recipient': 'name',
        'AddressNumber': 'address1',
        'AddressNumberPrefix': 'address1',
        'AddressNumberSuffix': 'address1',
        'StreetName': 'address1',
        'StreetNamePreDirectional': 'address1',
        'StreetNamePreModifier': 'address1',
        'StreetNamePreType': 'address1',
        'StreetNamePostDirectional': 'address1',
        'StreetNamePostModifier': 'address1',
        'StreetNamePostType': 'address1',
        # When corner addresses are given, you have two streets in an address
        'SecondStreetName': 'address1',
        'SecondStreetNamePreDirectional': 'address1',
        'SecondStreetNamePreModifier': 'address1',
        'SecondStreetNamePreType': 'address1',
        'SecondStreetNamePostDirectional': 'address1',
        'SecondStreetNamePostModifier': 'address1',
        'SecondStreetNamePostType': 'address1',
        'CornerOf': 'address1',
        'IntersectionSeparator': 'address1',
        'LandmarkName': 'address1',
        'USPSBoxGroupID': 'address1',
        'USPSBoxGroupType': 'address1',
        'USPSBoxID': 'address1',
        'USPSBoxType': 'address1',
        'BuildingName': 'address2',
        'OccupancyType': 'address2',
        'OccupancyIdentifier': 'address2',
        'SubaddressIdentifier': 'address2',
        'SubaddressType': 'address2',
        'PlaceName': 'city',
        'StateName': 'state',
        'ZipCode': 'zip_code',
        'ZipPlus4': 'zip_code',
    }
    try:
        address_info, address_type = usaddress.tag(
            ', '.join(address_lines),
            tag_mapping=mapping,
        )
    except usaddress.RepeatedLabelError:
        # logger.warn is a deprecated alias; use warning with lazy %-args.
        logger.warning("Unable to parse address (RepeatedLabelError): %s",
                       ', '.join(c.split('\n')))
        return {}, atty_info

    # We don't want this getting through to the database layer. Pop it.
    address_info.pop('NotAddress', None)

    if address_type == 'Ambiguous' or 'CountryName' in address_info:
        logger.warning("Unable to parse address (Ambiguous address type): %s",
                       ', '.join(c.split('\n')))
        return {}, atty_info

    if address_info.get('name') is None and fallback_name:
        address_info['name'] = fallback_name
    if address_info.get('state'):
        address_info['state'] = normalize_us_state(address_info['state'])

    address_info = normalize_address_info(dict(address_info))
    address_info['lookup_key'] = make_address_lookup_key(address_info)
    return address_info, atty_info
Пример #6
0
def normalize_attorney_contact(c, fallback_name=""):
    """Normalize the contact string for an attorney.

    Attorney contact strings are newline separated addresses like:

        Landye Bennett Blumstein LLP
        701 West Eighth Avenue, Suite 1200
        Anchorage, AK 99501
        907-276-5152
        Email: jdoe@example.com

    We need to pull off email and phone numbers, and then our address parser
    should work nicely.

    :param c: The raw, newline-separated contact string. May be empty or None.
    :param fallback_name: Name used when the address parser finds no
        recipient. It is replaced by the first line of ``c`` when that line
        contains an ampersand.
    :return: A two-tuple ``(address_info, atty_info)``. ``atty_info`` always
        has the keys ``email``, ``fax`` and ``phone``; phone and fax values
        are passed through ``normalize_us_phone_number``. ``address_info`` is
        an empty dict when ``c`` is empty, when ``usaddress.tag`` raises
        ``RepeatedLabelError`` or ``UnicodeEncodeError``, or when the parsed
        address is ambiguous or contains a country name.
    """
    atty_info = {
        "email": "",
        "fax": "",
        "phone": "",
    }
    if not c:
        return {}, atty_info

    address_lines = []
    for i, line in enumerate(c.split("\n")):
        # Raw string so that "\s" is a regex escape, not a string escape.
        line = re.sub(r"Email:\s*", "", line).strip()
        line = re.sub("pro se", "", line, flags=re.I)
        if not line:
            continue
        try:
            validate_email(line)
        except ValidationError:
            # Not an email address, press on.
            pass
        else:
            # An email address.
            atty_info["email"] = line
            continue

        # Perhaps a phone/fax number? Drop parens, slashes, backslashes and
        # whitespace before matching.
        clean_line = re.sub(r"(\(|\)|\\|/|\s+)", "", line)
        if clean_line.startswith("Fax:"):
            clean_line = re.sub("Fax:", "", clean_line)
            if phone_digits_re.search(clean_line):
                atty_info["fax"] = normalize_us_phone_number(clean_line)
            # A "Fax:" line never belongs to the address, matched or not.
            continue
        if phone_digits_re.search(clean_line):
            atty_info["phone"] = normalize_us_phone_number(clean_line)
            continue

        # First line containing an ampersand? These are usually law firm names.
        if u"&" in line and i == 0:
            fallback_name = line
            continue

        if re.search("[a-zA-Z]", line):
            # Not email, phone, fax, and has at least one char.
            address_lines.append(line)

    # Collapse usaddress's fine-grained labels onto our address fields.
    mapping = {
        "Recipient": "name",
        "AddressNumber": "address1",
        "AddressNumberPrefix": "address1",
        "AddressNumberSuffix": "address1",
        "StreetName": "address1",
        "StreetNamePreDirectional": "address1",
        "StreetNamePreModifier": "address1",
        "StreetNamePreType": "address1",
        "StreetNamePostDirectional": "address1",
        "StreetNamePostModifier": "address1",
        "StreetNamePostType": "address1",
        # When corner addresses are given, you have two streets in an address
        "SecondStreetName": "address1",
        "SecondStreetNamePreDirectional": "address1",
        "SecondStreetNamePreModifier": "address1",
        "SecondStreetNamePreType": "address1",
        "SecondStreetNamePostDirectional": "address1",
        "SecondStreetNamePostModifier": "address1",
        "SecondStreetNamePostType": "address1",
        "CornerOf": "address1",
        "IntersectionSeparator": "address1",
        "LandmarkName": "address1",
        "USPSBoxGroupID": "address1",
        "USPSBoxGroupType": "address1",
        "USPSBoxID": "address1",
        "USPSBoxType": "address1",
        "BuildingName": "address2",
        "OccupancyType": "address2",
        "OccupancyIdentifier": "address2",
        "SubaddressIdentifier": "address2",
        "SubaddressType": "address2",
        "PlaceName": "city",
        "StateName": "state",
        "ZipCode": "zip_code",
        "ZipPlus4": "zip_code",
    }
    try:
        address_info, address_type = usaddress.tag(
            u", ".join(address_lines),
            tag_mapping=mapping,
        )
    except (usaddress.RepeatedLabelError, UnicodeEncodeError) as exc:
        # See https://github.com/datamade/probableparsing/issues/2 for why we
        # catch the UnicodeEncodeError. Oy.
        # Report the exception that was actually raised rather than always
        # claiming RepeatedLabelError.
        logger.warning(
            "Unable to parse address (%s): %s",
            type(exc).__name__,
            ", ".join(c.split("\n")),
        )
        return {}, atty_info

    # We don't want this getting through to the database layer. Pop it.
    address_info.pop("NotAddress", None)

    if address_type == "Ambiguous" or "CountryName" in address_info:
        logger.warning(
            "Unable to parse address (Ambiguous address type): %s",
            ", ".join(c.split("\n")),
        )
        return {}, atty_info

    if address_info.get("name") is None and fallback_name:
        address_info["name"] = fallback_name
    if address_info.get("state"):
        address_info["state"] = normalize_us_state(address_info["state"])

    address_info = normalize_address_info(dict(address_info))
    address_info["lookup_key"] = make_address_lookup_key(address_info)
    return address_info, atty_info
Пример #7
0
def normalize_attorney_contact(c, fallback_name=''):
    """Normalize the contact string for an attorney.

    Attorney contact strings are newline separated addresses like:

        Landye Bennett Blumstein LLP
        701 West Eighth Avenue, Suite 1200
        Anchorage, AK 99501
        907-276-5152
        Email: jdoe@example.com

    We need to pull off email and phone numbers, and then our address parser
    should work nicely.

    :param c: The raw, newline-separated contact string. May be empty or None.
    :param fallback_name: Name used when the address parser finds no
        recipient. It is replaced by the first line of ``c`` when that line
        contains an ampersand.
    :return: A two-tuple ``(address_info, atty_info)``. ``atty_info`` always
        has the keys ``email``, ``fax`` and ``phone``; phone and fax values
        are passed through ``normalize_us_phone_number``. ``address_info`` is
        an empty dict when ``c`` is empty, when ``usaddress.tag`` raises
        ``RepeatedLabelError`` or ``UnicodeEncodeError``, or when the parsed
        address is ambiguous or contains a country name.
    """
    atty_info = {
        'email': '',
        'fax': '',
        'phone': '',
    }
    if not c:
        return {}, atty_info

    address_lines = []
    for i, line in enumerate(c.split('\n')):
        # Raw string so that '\s' is a regex escape, not a string escape.
        line = re.sub(r'Email:\s*', '', line).strip()
        line = re.sub('pro se', '', line, flags=re.I)
        if not line:
            continue
        try:
            validate_email(line)
        except ValidationError:
            # Not an email address, press on.
            pass
        else:
            # An email address.
            atty_info['email'] = line
            continue

        # Perhaps a phone/fax number? Drop parens, slashes, backslashes and
        # whitespace before matching.
        clean_line = re.sub(r'(\(|\)|\\|/|\s+)', '', line)
        if clean_line.startswith('Fax:'):
            clean_line = re.sub('Fax:', '', clean_line)
            if phone_digits_re.search(clean_line):
                atty_info['fax'] = normalize_us_phone_number(clean_line)
            # A "Fax:" line never belongs to the address, matched or not.
            continue
        if phone_digits_re.search(clean_line):
            atty_info['phone'] = normalize_us_phone_number(clean_line)
            continue

        # First line containing an ampersand? These are usually law firm names.
        if u'&' in line and i == 0:
            fallback_name = line
            continue

        if re.search('[a-zA-Z]', line):
            # Not email, phone, fax, and has at least one char.
            address_lines.append(line)

    # Collapse usaddress's fine-grained labels onto our address fields.
    mapping = {
        'Recipient': 'name',
        'AddressNumber': 'address1',
        'AddressNumberPrefix': 'address1',
        'AddressNumberSuffix': 'address1',
        'StreetName': 'address1',
        'StreetNamePreDirectional': 'address1',
        'StreetNamePreModifier': 'address1',
        'StreetNamePreType': 'address1',
        'StreetNamePostDirectional': 'address1',
        'StreetNamePostModifier': 'address1',
        'StreetNamePostType': 'address1',
        # When corner addresses are given, you have two streets in an address
        'SecondStreetName': 'address1',
        'SecondStreetNamePreDirectional': 'address1',
        'SecondStreetNamePreModifier': 'address1',
        'SecondStreetNamePreType': 'address1',
        'SecondStreetNamePostDirectional': 'address1',
        'SecondStreetNamePostModifier': 'address1',
        'SecondStreetNamePostType': 'address1',
        'CornerOf': 'address1',
        'IntersectionSeparator': 'address1',
        'LandmarkName': 'address1',
        'USPSBoxGroupID': 'address1',
        'USPSBoxGroupType': 'address1',
        'USPSBoxID': 'address1',
        'USPSBoxType': 'address1',
        'BuildingName': 'address2',
        'OccupancyType': 'address2',
        'OccupancyIdentifier': 'address2',
        'SubaddressIdentifier': 'address2',
        'SubaddressType': 'address2',
        'PlaceName': 'city',
        'StateName': 'state',
        'ZipCode': 'zip_code',
        'ZipPlus4': 'zip_code',
    }
    try:
        address_info, address_type = usaddress.tag(
            u', '.join(address_lines),
            tag_mapping=mapping,
        )
    except (usaddress.RepeatedLabelError, UnicodeEncodeError) as exc:
        # See https://github.com/datamade/probableparsing/issues/2 for why we
        # catch the UnicodeEncodeError. Oy.
        # logger.warn is a deprecated alias, so use warning; also report the
        # exception actually raised instead of always saying
        # RepeatedLabelError.
        logger.warning("Unable to parse address (%s): %s",
                       type(exc).__name__, ', '.join(c.split('\n')))
        return {}, atty_info

    # We don't want this getting through to the database layer. Pop it.
    address_info.pop('NotAddress', None)

    if address_type == 'Ambiguous' or 'CountryName' in address_info:
        logger.warning("Unable to parse address (Ambiguous address type): %s",
                       ', '.join(c.split('\n')))
        return {}, atty_info

    if address_info.get('name') is None and fallback_name:
        address_info['name'] = fallback_name
    if address_info.get('state'):
        address_info['state'] = normalize_us_state(address_info['state'])

    address_info = normalize_address_info(dict(address_info))
    address_info['lookup_key'] = make_address_lookup_key(address_info)
    return address_info, atty_info