Exemplo n.º 1
0
    def parse_name(self, name_raw):
        """
        Parse a raw name, e.g. "Corbato, F.J.", into last, first and middle name.

        >>> p=Person(name_raw='Corbato, F.J.')
        >>> p.last, p.first, p.middle
        ('Corbató', 'F', 'J')

        :param name_raw: the raw name string to parse
        :return: tuple ``(last, first, middle)`` with periods stripped from
                 first/middle and each part passed through ``str.capitalize``
        """

        # fix names like "Verzuh M., F.", where the middle name comes after the last name
        # -> it should be Verzuh, F. M.
        # (raw string: "\." in a plain literal is an invalid escape sequence)
        match = re.match(r'([A-Z][a-z]+) ([A-Z])\., ([A-Z][a-z]*)\.*', name_raw)
        if match:
            last, middle, first = match.groups()
            name_raw = f'{last}, {first}. {middle}.'

        name = HumanName(name_raw)
        # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
        if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
            name.middle = name.first[2]
            name.first = name.first[0]

        name.last = name.last.capitalize()
        name.first = name.first.strip('.').capitalize()
        name.middle = name.middle.strip('.').capitalize()

        # Known misspellings of last names and their canonical form.
        last_name_replacements = [('Corbato', 'Corbató'),
                                  ('Corbatò', 'Corbató'), ('Verguh', 'Verzuh')]
        for wrong, right in last_name_replacements:
            name.last = name.last.replace(wrong, right)

        return name.last, name.first, name.middle
Exemplo n.º 2
0
 def sort_contributor(self, c: ParsedSegment, default_type=None):
     '''
     File a contributor into ``self.contributors`` under its agent type.

     The type is ``'person'`` when the parsed last name is already a known
     person; otherwise any non-person type that already holds a record with
     the same initials and the same name; otherwise ``default_type``. If
     still undetermined, the user is asked via ``multi_choice``.

     People are stored as ``{last name: {first initial: [records]}}`` and
     ``c.name`` is replaced by the parsed ``HumanName``; every other type is
     stored as ``{initials: [records]}``.
     '''
     parsed = HumanName(c.name)
     initials = ''.join(rgx.abbr.findall(c.name))

     agent_type = default_type
     if parsed.last in self.contributors['person']:
         agent_type = 'person'
     else:
         for kind, records in self.contributors.items():
             if kind == 'person' or initials not in records:
                 continue
             if any([entry.name == c.name for entry in records[initials]]):
                 agent_type = kind

     if agent_type is None:
         # Nothing matched and no default was given: ask interactively.
         choice = multi_choice(
             'What type of contributor is "{0}"?'.format(c.name),
             self.contributors.keys())
         agent_type = list(self.contributors.keys())[choice]

     if agent_type != 'person':
         bucket = self.contributors[agent_type]
         bucket[initials] = bucket.get(initials, []) + [c]
         return

     # A person needs both a family name and a first name to be indexed.
     if parsed.last == '':
         parsed.last = parsed.first
         parsed.first = '?'
     if parsed.first == '':
         parsed.first = '?'
     c.name = parsed

     people = self.contributors[agent_type]
     by_family = people.get(parsed.last, {})
     first_initial = parsed.first[0]
     by_family[first_initial] = by_family.get(first_initial, []) + [c]
     people[parsed.last] = by_family
Exemplo n.º 3
0
def normalize_name(first_name, last_name):
    """Return ``(first, last)`` after ``HumanName.capitalize()`` has been applied."""
    parsed = HumanName()
    parsed.first, parsed.last = first_name, last_name
    parsed.capitalize()
    normalized = (parsed.first, parsed.last)
    return normalized
Exemplo n.º 4
0
 def test_assignment_to_attribute(self):
     """Assigning a string to each name attribute must read back unchanged."""
     hn = HumanName("John A. Kenneth Doe, Jr.")
     assignments = (
         ("last", "de la Vega"),
         ("title", "test"),
         ("first", "test"),
         ("middle", "test"),
         ("suffix", "test"),
     )
     for attribute, value in assignments:
         setattr(hn, attribute, value)
         self.m(getattr(hn, attribute), value, hn)
Exemplo n.º 5
0
 def test_assign_list_to_attribute(self):
     """Assigning a list to a name attribute must read back space-joined."""
     hn = HumanName("John A. Kenneth Doe, Jr.")
     cases = (
         ("title", ["test1", "test2"], "test1 test2"),
         ("first", ["test3", "test4"], "test3 test4"),
         ("middle", ["test5", "test6", "test7"], "test5 test6 test7"),
         ("last", ["test8", "test9", "test10"], "test8 test9 test10"),
         ("suffix", ["test"], "test"),
     )
     for attribute, pieces, expected in cases:
         setattr(hn, attribute, pieces)
         assert getattr(hn, attribute) == expected
Exemplo n.º 6
0
 def _massage_measure_donor_name(self, name_string):
     """
     Return a cleaned-up display form of a donor name.

     The name is parsed with ``HumanName``; first and last names are
     title-cased and a middle name is title-cased with exactly one trailing
     period. A few known mis-parsed donors are then corrected by hand.

     :param name_string: raw donor name.
     :return: "First [Middle] Last [Suffix]", where the bracketed parts
              appear only when present.
     """
     name = HumanName(name_string)
     name.first = name.first.title()
     name.last = name.last.title()
     if name.middle:
         # Drop any periods, then add exactly one back at the end.
         name.middle = name.middle.replace(".", "")
         name.middle = "%s." % (name.middle.title())

     # Hand corrections for specific names the parser gets wrong.
     if name == "JR. Munger CHARLES T.":
         name.first = "Charles"
         name.middle = "T."
         name.last = "Munger"
         name.suffix = "Jr."
     if name == "M. Quinn. Delaney":
         name.first = "M."
         name.middle = "Quinn"
         name.last = "Delaney"
         name.suffix = None
     if name == "Robert Alan. Eustace":
         name.first = "Robert"
         name.middle = "Alan"
         name.last = "Eustace"
         name.suffix = None
     if name == "Susie Tompkins. Buell":
         name.first = "Susie"
         name.middle = "Tompkins"
         name.last = "Buell"
         name.suffix = None

     # One chain: previously the middle+suffix case was a separate ``if``
     # whose result was overwritten by the following branch, losing the suffix.
     if name.middle and name.suffix:
         output = "%s %s %s %s" % (name.first, name.middle, name.last,
                                   name.suffix)
     elif name.middle:
         output = "%s %s %s" % (name.first, name.middle, name.last)
     elif name.suffix:
         output = "%s %s %s" % (name.first, name.last, name.suffix)
     else:
         output = "%s %s" % (name.first, name.last)
     return output
Exemplo n.º 7
0
 def from_parts(cls,
                first=None,
                last=None,
                middle=None,
                suffix=None,
                title=None):
     """Build a ``ParsedName`` from individually supplied name components."""
     name = HumanName()
     components = (
         ("first", first),
         ("middle", middle),
         ("last", last),
         ("suffix", suffix),
         ("title", title),
     )
     for attribute, value in components:
         setattr(name, attribute, value)
     return ParsedName(name)
Exemplo n.º 8
0
def clean_names(dirty_names):
    """Return "first last" strings for each raw name, punctuation removed.

    When the parser finds no first name, the parsed title is used in its
    place.
    """
    from nameparser import HumanName
    import string

    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned = []
    for raw in dirty_names:
        parsed = HumanName(raw.translate(strip_punctuation))
        if not parsed.first:
            parsed.first = parsed.title
        cleaned.append(parsed.first + ' ' + parsed.last)
    return cleaned
 def _massage_measure_donor_name(self, name_string):
     """
     Return a cleaned-up display form of a donor name.

     The name is parsed with ``HumanName``; first and last names are
     title-cased and a middle name is title-cased with exactly one trailing
     period. A few known mis-parsed donors are then corrected by hand.

     :param name_string: raw donor name.
     :return: "First [Middle] Last [Suffix]", where the bracketed parts
              appear only when present.
     """
     name = HumanName(name_string)
     name.first = name.first.title()
     name.last = name.last.title()
     if name.middle:
         # Drop any periods, then add exactly one back at the end.
         name.middle = name.middle.replace(".", "")
         name.middle = "%s." % (name.middle.title())

     # Hand corrections for specific names the parser gets wrong.
     if name == "JR. Munger CHARLES T.":
         name.first = "Charles"
         name.middle = "T."
         name.last = "Munger"
         name.suffix = "Jr."
     if name == "M. Quinn. Delaney":
         name.first = "M."
         name.middle = "Quinn"
         name.last = "Delaney"
         name.suffix = None
     if name == "Robert Alan. Eustace":
         name.first = "Robert"
         name.middle = "Alan"
         name.last = "Eustace"
         name.suffix = None
     if name == "Susie Tompkins. Buell":
         name.first = "Susie"
         name.middle = "Tompkins"
         name.last = "Buell"
         name.suffix = None

     # One chain: previously the middle+suffix case was a separate ``if``
     # whose result was overwritten by the following branch, losing the suffix.
     if name.middle and name.suffix:
         output = "%s %s %s %s" % (name.first, name.middle, name.last, name.suffix)
     elif name.middle:
         output = "%s %s %s" % (name.first, name.middle, name.last)
     elif name.suffix:
         output = "%s %s %s" % (name.first, name.last, name.suffix)
     else:
         output = "%s %s" % (name.first, name.last)
     return output
Exemplo n.º 10
0
    def _massage_payload(self, payload):
        for k, v in payload.items():
            if pd.isnull(v) or not v:
                # Replace nan or None with empty string.
                payload[k] = ""

        # Ensure names aren't all caps or all lowercase.
        if payload.get("firstname") and payload.get("lastname"):
            name = HumanName()
            name.first = payload["firstname"]
            name.last = payload["lastname"]
            name.capitalize()
            payload["firstname"] = name.first
            payload["lastname"] = name.last
Exemplo n.º 11
0
 def test_assignment_to_attribute(self):
     """Strings assign cleanly; nested lists and dicts raise ``TypeError``."""
     hn = HumanName("John A. Kenneth Doe, Jr.")
     valid_assignments = (
         ("last", "de la Vega"),
         ("title", "test"),
         ("first", "test"),
         ("middle", "test"),
         ("suffix", "test"),
     )
     for attribute, value in valid_assignments:
         setattr(hn, attribute, value)
         assert getattr(hn, attribute) == value

     for bad_value in ([["test"]], {"test": "test"}):
         with pytest.raises(TypeError):
             hn.suffix = bad_value
Exemplo n.º 12
0
    def HumanNameFmXML(self, ell):
        """Build a ``HumanName`` from the child elements of ``ell``.

        Each child whose tag is First, Middle, Last, Title, Suffix or
        NickName sets the corresponding attribute to the child's text;
        children with any other tag are ignored.
        """
        attribute_for_tag = {
            'First': 'first',
            'Middle': 'middle',
            'Last': 'last',
            'Title': 'title',
            'Suffix': 'suffix',
            'NickName': 'nickname',
        }

        hn = HumanName()
        for el in ell:
            attribute = attribute_for_tag.get(el.tag)
            if attribute is not None:
                setattr(hn, attribute, el.text)

        return hn
Exemplo n.º 13
0
 def initContact(contactId: str):
     """Register a contact as active and cache its e-mail and display name.

     The display name is built from the contact's title-before, first name,
     last name and title-after fields (missing ones skipped), lower-cased,
     and re-capitalized by ``HumanName.capitalize()``.

     :param contactId: id of a contact that must not already be active.
     """
     assert contactId not in activeContacts
     activeContacts.add(contactId)
     contact = dir.getContact(contactId)
     contactsToEmails[contactId] = contact['email']

     name_parts = (
         contact.get('title_before_name'),
         contact.get('first_name'),
         contact.get('last_name'),
         contact.get('title_after_name'),
     )
     name = " ".join(filter(None, name_parts))
     # Only '@' is removed; other punctuation is kept because the initials
     # check below relies on the periods.
     name = name.replace('@', '')
     name = HumanName(name.lower().strip())
     name.capitalize()
     # A first name made only of dotted initials ("j.r.") is upper-cased whole.
     # (raw string: "\w" / "\." are invalid escapes in a plain literal)
     if re.search(r'^(\w\.)+$', name.first):
         name.first = name.first.upper()
     contactsToNames[contactId] = str(name)
Exemplo n.º 14
0
    def person_name_from_xml(self, ell):
        '''Create a person name (``HumanName``) from an XML element.

        Children tagged First, Middle, Last, Title, Suffix or NickName set
        the matching attribute from their text; other tags are skipped.
        '''
        tag_to_attribute = (
            ('First', 'first'),
            ('Middle', 'middle'),
            ('Last', 'last'),
            ('Title', 'title'),
            ('Suffix', 'suffix'),
            ('NickName', 'nickname'),
        )

        hname = HumanName()
        for elm in ell:
            for tag, attribute in tag_to_attribute:
                if elm.tag == tag:
                    setattr(hname, attribute, elm.text)
                    break

        return hname
Exemplo n.º 15
0
    page = url.read()
# Parse the downloaded page and collect one {display name: year} entry per
# table row that has a data cell.
soup = BeautifulSoup(page, 'html.parser')
row_box = soup.find_all('tr', attrs={'role': 'row'})
names = {}
for i in row_box:
    if i.td is not None:
        name = i.td.find('a').text
        year = i.find('td', attrs={'role': 'rowheader'})
        if year is not None:
            year = year.text
        else:
            # No row-header cell: record a placeholder year.
            year = "0"
        # Process the name, adding dots to middle names if needed.
        name = HumanName(name)
        if len(name.first) == 1:
            name.first = name.first + "."
        # Compare by value: ``is not ""`` tests object identity and only
        # worked by accident of string interning.
        if name.middle != "":
            if len(name.middle) == 1:
                name.middle += '.'
            names[name.first + ' ' + name.middle + ' ' + name.last] = year
        else:
            names[name.first + ' ' + name.last] = year

# Write out a csv.
with open('acm-fellows.csv', 'w', newline='') as csvfile:
    fieldnames = ['name', 'year']
    wr = csv.DictWriter(csvfile, fieldnames=fieldnames)
    wr.writeheader()
    for n in names:
        wr.writerow({'name': n, 'year': names[n]})
Exemplo n.º 16
0
# Parse the raw name automatically and show the result.
parsed = HumanName(name)
print("Parsed HumanName for " + name + ":")
print(Person.HumanName_to_str(parsed))

# Create a second, empty HumanName and show what it looks like.
manual = HumanName()
print("Empty HumanName?:")
print(Person.HumanName_to_str(manual))

# Fill in the correct name parts by hand and show the result.
manual.first = "Van"
manual.last = "Conway"
print("after manual configuration:")
print(Person.HumanName_to_str(manual))

# Lookup 1: let the lookup parse the name string itself.
test1 = Person.look_up_person_from_name(name)
print("test1 = %s" % (test1,))

# Lookup 2: pass in the manually configured HumanName as well.
test2 = Person.look_up_person_from_name(name, manual)
print("test2 = %s" % (test2,))
Exemplo n.º 17
0
for comment in subreddit.stream.comments(
        skip_existing=True
):  # Watch the comment stream on our subreddit of choice
    if KEYPHRASE in comment.body:
        tableBase = "GP|PTS|REB|AST|STL|BLK|TOV|3PM|FG%|FT%\n:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--|:--\n"  # We want to format our response neatly, this is just the formatting convention Reddit uses to create tables.
        N = 0  # N represents the number of games to include in our averages. N = 0 will default to pulling averages for the entire season so far.

        player = comment.body.replace(
            KEYPHRASE, ''
        )  # Get rid of the keyphrase leaving us with the players name and optionally an N value
        # Strip punctuation before parsing so it cannot end up in a name part.
        player = HumanName(player.translate(str.maketrans('', '', "!?.,'-")))

        if player.first.isdigit():  # If the user entered a number
            # The leading number was parsed as the first name: use it as N
            # and take the parsed middle name as the player's first name.
            N = player.first
            player.first = player.middle

        if player.suffix == '':  # Player did not enter a suffix (Jr, III etc)
            playerID = findPlayer(obj, player.first, player.last)
        else:
            playerID = findPlayer(obj, player.first,
                                  player.last + " " + player.suffix)

        if playerID is not None:
            URL = 'https://stats.nba.com/stats/playerdashboardbylastngames/?measureType=Base&perMode=PerGame&plusMinus=N&paceAdjust=N&rank=N&leagueId=00&season=2019-20&seasonType=Regular+Season&poRound=0&playerId=' + str(
                playerID
            ) + '&outcome=&location=&month=0&seasonSegment=&dateFrom=&dateTo=&opponentTeamId=0&vsConference=&vsDivision=&gameSegment=&period=0&shotClockRange=&lastNGames=' + str(
                N)

            r = requests.get(url=URL, headers=request_headers)
            data = r.json()
Exemplo n.º 18
0
    def to_HumanName( self ):
        
        '''
        Builds a nameparser HumanName() instance from the name property values
            stored in this instance.

        Sources are tried in this order:
        - individual name parts, when self.got_name_parts() says they exist;
        - the full name string property;
        - the lookup name from self.get_lookup_name().

        Returns the HumanName instance, or None if no name information is
            present at all.

        preconditions: None.
        postconditions: None.
        '''
        
        # read every name-related value up front, in a fixed order.
        name_prefix = self.get( self.PROP_NAME_NAME_PREFIX, None )
        first_name = self.get( self.PROP_NAME_FIRST_NAME, None )
        middle_name = self.get( self.PROP_NAME_MIDDLE_NAME, None )
        last_name = self.get( self.PROP_NAME_LAST_NAME, None )
        name_suffix = self.get( self.PROP_NAME_NAME_SUFFIX, None )
        nickname = self.get( self.PROP_NAME_NICKNAME, None )
        full_name_string = self.get( self.PROP_NAME_FULL_NAME_STRING, None )
        lookup_name = self.get_lookup_name()
        
        # got name parts?
        if ( self.got_name_parts() == True ):
        
            # yes - start from an empty HumanName and copy over each
            #     non-empty part.
            human_name = HumanName()
            part_values = (
                ( "title", name_prefix ),
                ( "first", first_name ),
                ( "middle", middle_name ),
                ( "last", last_name ),
                ( "suffix", name_suffix ),
                ( "nickname", nickname ),
            )
            for attribute_name, part_value in part_values:
            
                if ( part_value ):
                
                    setattr( human_name, attribute_name, part_value )
                    
                #-- END check to see if part has a value. --#
                
            #-- END loop over name parts. --#
            
            return human_name
            
        #-- END check to see if name parts. --#
        
        # got full name string?  If so, let HumanName parse it.
        if ( ( full_name_string is not None ) and ( full_name_string != "" ) ):
        
            return HumanName( full_name_string )
            
        #-- END check to see if full name string. --#
        
        # how about lookup name?  If so, let HumanName parse it.
        if ( ( lookup_name is not None ) and ( lookup_name != "" ) ):
        
            return HumanName( lookup_name )
            
        #-- END check to see if lookup name. --#
        
        # no names present at all.  Return None.
        return None
        
    #-- END method to_HumanName() --#


#-- END class PersonDetails --#
def parse_persname(persname, auth="", source=""):
    """Parse a personal-name heading into a dictionary of name components.

    Dates are split off first, the remainder is parsed with ``HumanName``,
    and a series of hand-written corrections is applied for headings the
    parser mis-assigns.

    NOTE: this function uses the Python 2 ``unicode`` builtin.

    :param persname: the raw personal-name string, possibly including dates.
    :param auth: authority id, stored under ``authority_id``.
    :param source: source identifier, stored under ``source``.
    :return: dict of name components; entries with falsy values are omitted.
    """
    name, birth_date, death_date = extract_birth_death_dates(persname)
    birth_date, death_date = validate_dates(birth_date, death_date)
    dates_string = make_date_string(birth_date, death_date)
    name = HumanName(name)

    titles = ["sir", "mr", "mrs", "baron", "dame", "madame", "viscount", "conte"]
    numbers = ["II", "III"]
    title = name.title
    suffix = name.suffix
    number = u""

    # check if the suffix should actually be a title
    if not title and suffix.lower().strip(". ") in titles:
        title = suffix.capitalize()
        if "mr" in title.lower() and not title.endswith("."):
            title += "."
        suffix = u""

    # extract numbers from the suffix
    if suffix in numbers:
        number = suffix
        suffix = u""

    # special cases cleanup
    if name.title == u"Royal":
        name.title = ""
        title = ""
        name.middle = name.first if not name.middle else "{} {}".format(u"Royal", name.middle)
        name.first = u"Royal"

    if name.title == u"Queen of Great":
        title = name.title + u" Britain"
        name.first = u""

    if name.title == u"Lama":
        title = u"Dalai Lama XIV"
        name.first = u""
        name.middle = u""

    if name.title == u"Marquis":
        title = u""
        name.first = u"Marquis"
        name.middle = u"W."

    if suffix == u"1941":
        birth_date = suffix
        suffix = u""

    if suffix in [u"18", u"b."]:
        suffix = u""

    if suffix == u"Jr":
        suffix += u"."

    if ", fl. 17th cent" in suffix:
        suffix = u"sieur de"
        dates_string = u"fl. 17th cent"

    rest_of_name = u"{0} {1}".format(name.first, name.middle).rstrip()
    if rest_of_name == u"Christella D. Personal journey through South Africa. 1991":
        rest_of_name = u"Christella D."

    # People with single-part names (like Keewaydinoquay) are mis-assigned. Have to fix those
    primary_name = name.last
    if rest_of_name and not primary_name:
        primary_name = rest_of_name
        rest_of_name = ""

    # create the parsed name dictionary
    name_parsed = {u"title": unicode(title),
                   u"primary_name": unicode(primary_name),
                   u"rest_of_name": rest_of_name,
                   u"suffix": unicode(suffix),
                   u"fuller_form": unicode(name.nickname),
                   u"numbers": unicode(number),
                   u"birth_date": unicode(birth_date),
                   u"death_date": unicode(death_date),
                   u"date_string": unicode(dates_string),
                   u"authority_id": unicode(auth),
                   u"source": unicode(source),
                   u"name_order": u"inverted",
                   u"sort_name_auto_generate": True}

    # remove empty fields
    # Build a filtered dict rather than deleting while iterating: where
    # ``items()`` is a live view, deleting during iteration raises RuntimeError.
    return {key: value for key, value in name_parsed.items() if value}
Exemplo n.º 20
0
    def namer(field):
        """Reduce a raw name field to a ``(first, last)`` pair.

        ``field`` is a single string or a tuple of strings (joined with
        ", "). The text is ASCII-encoded (non-ASCII characters dropped),
        stripped of tabs/newlines and upper-cased, parsed with
        ``HumanName`` and then cleaned up by a sequence of order-dependent
        heuristics. In the returned pair any surviving middle name is
        appended to ``first`` and any suffix to ``last``.

        NOTE(review): the encoded value is passed straight to ``re.sub`` with
        str patterns, which presumably means this targets Python 2 -- confirm.
        The trailing tags such as ``#6A`` look like references to test cases
        kept elsewhere -- confirm.
        """
        #pre
        # Normalize the input to one upper-case ASCII string without tabs/newlines.
        if type(field) == tuple:
            w_name = re.sub(
                '[\t\r\n]', '',
                ", ".join([x.encode('ascii', 'ignore')
                           for x in field])).upper()
        else:
            w_name = re.sub('[\t\r\n]', '', field.encode('ascii',
                                                         'ignore')).upper()
        if 'ANONYMOUS' not in w_name:
            # Keep one ';'-separated segment: the first, or the second when
            # the text contains ' FORMER '.
            if ' FORMER ' not in w_name:
                w_name = re.split(";", w_name)[0]
            else:
                w_name = re.split(";", w_name)[1]

            # Remove a space directly after or before any of ` ' / +
            w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name)  #6A, 4A-C

            out = HumanName(w_name)
            # Strip a leading single-letter initial (optionally dotted) that
            # is followed by a space from the middle name ...
            out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
            # ... and from a multi-word last name.
            if " " in out.last:
                out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
            # If the first name is empty or just one initial and a middle
            # name exists, promote the middle name to first name.
            if re.sub("^[A-Z]\.|^[A-Z]", '',
                      out.first) == '' and len(out.middle) != 0:
                out.first, out.middle = out.middle, ""
            else:
                out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)

            #post

            # Discard a middle name that starts with "FOR " or "- ".
            if out.middle.startswith("FOR ") or out.middle.startswith(
                    "- "):  #7A, 1B, 3E
                out.middle = ""

            # Cut the last name at the first " FOR ".
            if " FOR " in out.last:
                out.last = re.sub(" FOR .*", '', out.last)

            # Title but no last name: re-parse a multi-word first name, or
            # treat a single-word first name as the last name.
            if len(out.last) == 0 and len(out.title) != 0:  #9A
                if " " in out.first:
                    out = HumanName(out.first)
                else:
                    out.first, out.last = "", out.first

            # Joint names ("X AND Y", "X & Y"): in the middle name, the part
            # before the conjunction becomes the last name.
            if " AND " in out.middle or " & " in out.middle:
                out.last = re.split("( AND )|( & )", out.middle)[0]
                out.middle = ""
            if "AND" in out.last or "&" in out.last:

                # Last name starting with the conjunction: re-parse it and
                # keep only the last name of the result.
                if out.last.startswith("AND ") or out.last.startswith(
                        "& "):  #3F
                    out.last = HumanName(out.last).last
                # Otherwise drop the conjunction and everything after it.
                elif " AND " in out.last or " & " in out.last:
                    out.last = re.sub("( AND ).*|( & ).*", '', out.last)
            # Keep only the part of the first name before " AND ", &, / or +,
            # and the part of the last name before "/".
            out.first = re.split("( AND )|&|/|\+", out.first)[0]
            out.last = re.split("/", out.last)[0].strip()
            # First name with exactly one capital letter and a multi-word last
            # name: take the first two words of the last name as first/last.
            if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
                out.first = out.last.split(" ")[0]
                out.last = out.last.split(" ")[1]
            out.capitalize()
            first, last = out.first, out.last
            if len(out.middle) > 0:
                # A middle name that is only an initial is dropped; otherwise
                # it is appended to the first name (joined directly when a
                # hyphen sits at the boundary, else with a space).
                if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
                    out.middle = ""
                elif first.endswith("-") or out.middle.startswith("-"):
                    first += out.middle
                else:
                    first += " %s" % out.middle  #8A-B
            # A suffix is appended to the last name.
            if len(out.suffix) > 0:
                last += " %s" % out.suffix  #2A
            return (first, last)
        else:
            # Names containing 'ANONYMOUS' are parsed as-is, with no clean-up.
            name = HumanName(w_name)
            return (name.first, name.last)
Exemplo n.º 21
0
    def parse_raw_name(name_raw: str, count: int, extract_orgs=True) -> (str, str, str, Counter):
        """
        Parses a (usually messy) raw name and returns
        first, middle, last names and a Counter of extracted positions

        extract_orgs tries to extract organizations from name. defaults to True. only set to False
        to be able to check if a name is valid (it prevents an infinite loop because by default,
        extracting organizations is part of the initialization of a person)

        A name that is empty or a single character after clean-up is passed to HumanName as-is
        instead of raising an IndexError.

        :param name_raw: str
        :param count: int, weight added to every extracted position in the Counter
        :param extract_orgs: bool
        :return: str, str, str, Counter (first name, middle name, last name, positions Counter)
        """
        name_raw = Person.remove_privlog_info(name_raw)
        # remove JR, SR, or III if it follows this pattern: 'Chumney-RD-Jr'
        name_raw = Person.remove_jr_sr_iii(name_raw)

        # position is often attached with a dash,
        # e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
        if name_raw.find(" - ") > -1 and len(name_raw.split(' - ')) == 2:
            name_raw, extracted_position = name_raw.split(" - ")
            extracted_positions = [extracted_position.strip()]
        else:
            extracted_positions = []

        # extract positions in parens e.g. Henson, A (Chadbourne & Park)
        paren_positions = re.findall(r'\([^(]+\)', name_raw)
        for position in paren_positions:
            extracted_positions.append(position.strip(',#() '))
            name_raw = name_raw.replace(position, '')

        # Search for known raw_org strings in name_raw, extract them as positions if necessary
        if extract_orgs:
            name_raw, new_positions = Person.extract_raw_org_names_from_name(name_raw)
            extracted_positions += new_positions

        # delete any leftover hashtags
        name_raw = name_raw.strip(' #')

        # Delete dashes between last name and initials
        # DUNN-W -> Dunn W
        # (length guard: after stripping, name_raw may be empty or one character,
        # in which case name_raw[-2] would raise IndexError)
        if len(name_raw) > 1 and name_raw[-2] == '-':
            name_raw = name_raw[:-2] + " " + name_raw[-1:]
        # DUNN-WL -> DUNN WL
        if len(name_raw) > 2 and name_raw[-3] == '-':
            name_raw = name_raw[:-3] + " " + name_raw[-2:]

        # Parse current string using HumanName
        name = HumanName(name_raw)

        # e.g. Dunn W -> parsed as last name W. -> switch first/last
        if len(name.last) <= 2 < len(name.first):
            name.first, name.last = name.last, name.first

        # remove periods from initials
        if len(name.first) == 2 and name.first[1] == '.':
            name.first = name.first[0]
        if len(name.middle) == 2 and name.middle[1] == '.':
            name.middle = name.middle[0]

        # If first name is length 2 (Teague, CE), the two letters are most likely initials.
        if len(name.middle) == 0 and len(name.first) == 2:
            name.middle = name.first[1].upper()
            name.first = name.first[0].upper()

        # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
        if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
            name.middle = name.first[2]
            name.first = name.first[0]

        name.last = name.last.capitalize()
        name.first = name.first.capitalize()
        name.middle = name.middle.capitalize()

        # if multiple names are passed, they often end up in the middle name
        # e.g. 'Holtzman, A.,  Murray, J. ,  Henson, A.  -> only allow one comma or set to empty
        if name.middle.count(',') > 1:
            name.middle = ''

        # a long suffix with several periods is discarded rather than kept as a position
        if len(name.suffix) > 20 and name.suffix.count('.') > 2:
            name.suffix = ''

        if name.suffix:
            extracted_positions.append(name.suffix)

        # map organization names to clean official names (if they are in the dict) using
        # RAW_ORG_TO_CLEAN_ORG_DICT; entries mapped to '@skip@' are dropped
        clean_orgs = []
        for raw_org in extracted_positions:
            if raw_org in RAW_ORG_TO_CLEAN_ORG_DICT:
                clean_org = RAW_ORG_TO_CLEAN_ORG_DICT[raw_org]
                if clean_org != '@skip@':
                    clean_orgs.append(clean_org)
            else:
                clean_orgs.append(raw_org)
        extracted_positions = clean_orgs

        # convert mapped positions into a counter (periods removed, upper-cased keys)
        result_positions = Counter()
        for position in extracted_positions:
            cleaned = re.sub(r'\.', '', position)
            result_positions[cleaned.upper()] += count

        # print(name.first, name.middle, name.last, result_positions)
        return name.first, name.middle, name.last, result_positions
Exemplo n.º 22
0
def human_to_csl(name):
    """Convert HumanName to CSL-formatted JSON.

    Note: when a HumanName instance is passed in, it is modified in place
    (the middle name is appended to the first name).

    Args:
        name : HumanName or str / unicode
    Returns:
        CSL-formatted JSON

    Examples:
    >>> csl = human_to_csl('Rafael Nadal')
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('Rafael Nadal'))
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('George HW de Bush'))
    >>> csl == {'given' : 'George H. W.', 'family' : 'de Bush'}
    True
    >>> csl = human_to_csl('Eisenhower, I')
    >>> csl == {'given' : 'I.', 'family' : 'Eisenhower'}
    True
    >>> csl = human_to_csl('Eisenhower, V')
    >>> csl == {'given' : 'V.', 'family' : 'Eisenhower'}
    True
    """
    # Optionally convert to nameparser.HumanName
    if not isinstance(name, HumanName):
        name = HumanName(name)

    # Fix: nameparser treats HumanName('Eisenhower, I') as
    # {first : 'Eisenhower', suffix : 'I'}
    # (raw string: "\." is an invalid escape sequence in a plain literal)
    if re.search(r'^[IV]\.*$', name.suffix):
        name.last = name.first
        name.first = name.suffix
        name.suffix = ''

    # Append middle name to first
    if name.middle:
        name.first += ' ' + name.middle

    # Build the CSL data: for each CSL key, read the mapped HumanName field,
    # skip it when empty, otherwise store it after applying the optional
    # conversion function (default: I).
    csl_data = {}
    for lookup, spec in human_to_csl_map.items():
        field = spec['field']
        fun = spec.get('fun', I)

        value = getattr(name, field)
        if not value:
            continue

        csl_data[lookup] = fun(value)

    # Return CSL data
    return csl_data
def parse_persname(persname, auth="", source=""):
    """Parse a personal-name string into a dict of name components.

    Dates are first split off with ``extract_birth_death_dates`` and
    normalized with ``validate_dates`` / ``make_date_string``; the remaining
    text is parsed with ``HumanName`` and then corrected by a series of
    hard-coded special cases.

    :param persname: raw personal name, possibly including life dates
    :param auth: authority id stored under ``authority_id``
    :param source: source label stored under ``source``
    :return: dict with the keys ``title``, ``primary_name``, ``rest_of_name``,
        ``suffix``, ``fuller_form``, ``numbers``, ``birth_date``,
        ``death_date``, ``date_string``, ``authority_id``, ``source``,
        ``name_order`` and ``sort_name_auto_generate``; keys whose value is
        empty/falsy are omitted
    """
    name, birth_date, death_date = extract_birth_death_dates(persname)
    birth_date, death_date = validate_dates(birth_date, death_date)
    dates_string = make_date_string(birth_date, death_date)
    name = HumanName(name)

    titles = [
        "sir", "mr", "mrs", "baron", "dame", "madame", "viscount", "conte"
    ]
    numbers = ["II", "III"]
    title = name.title
    suffix = name.suffix
    number = u""

    # check if the suffix should actually be a title
    if not title and suffix.lower().strip(". ") in titles:
        title = suffix.capitalize()
        if "mr" in title.lower() and not title.endswith("."):
            title += "."
        suffix = u""

    # extract numbers from the suffix
    if suffix in numbers:
        number = suffix
        suffix = u""

    # special cases cleanup
    if name.title == u"Royal":
        name.title = ""
        title = ""
        name.middle = name.first if not name.middle else "{} {}".format(
            u"Royal", name.middle)
        name.first = u"Royal"

    if name.title == u"Queen of Great":
        title = name.title + u" Britain"
        name.first = u""

    if name.title == u"Lama":
        title = u"Dalai Lama XIV"
        name.first = u""
        name.middle = u""

    if name.title == u"Marquis":
        title = u""
        name.first = u"Marquis"
        name.middle = u"W."

    # a bare year that ended up in the suffix; dates_string was already built
    # above and is not recomputed here
    if suffix == u"1941":
        birth_date = suffix
        suffix = u""

    if suffix in [u"18", u"b."]:
        suffix = u""

    if suffix == u"Jr":
        suffix += u"."

    if ", fl. 17th cent" in suffix:
        suffix = u"sieur de"
        dates_string = u"fl. 17th cent"

    rest_of_name = u"{0} {1}".format(name.first, name.middle).rstrip()
    if rest_of_name == u"Christella D. Personal journey through South Africa. 1991":
        rest_of_name = u"Christella D."

    # People with single-part names (like Keewaydinoquay) are mis-assigned. Have to fix those
    primary_name = name.last
    if rest_of_name and not primary_name:
        primary_name = rest_of_name
        rest_of_name = ""

    # create the parsed name dictionary
    name_parsed = {
        u"title": unicode(title),
        u"primary_name": unicode(primary_name),
        u"rest_of_name": rest_of_name,
        u"suffix": unicode(suffix),
        u"fuller_form": unicode(name.nickname),
        u"numbers": unicode(number),
        u"birth_date": unicode(birth_date),
        u"death_date": unicode(death_date),
        u"date_string": unicode(dates_string),
        u"authority_id": unicode(auth),
        u"source": unicode(source),
        u"name_order": u"inverted",
        u"sort_name_auto_generate": True
    }

    # remove empty fields; build a new dict rather than deleting keys while
    # iterating, which fails when items() is a live view of the dict
    return {key: value for key, value in name_parsed.items() if value}
Exemplo n.º 24
0
	def namer(field):
		"""Reduce a raw name field to a ``(first, last)`` tuple.

		``field`` is either a single string or a tuple of strings; a tuple is
		joined with ", ". Non-ASCII characters are dropped, tab/CR/LF characters
		are removed and the text is upper-cased before parsing with
		``HumanName``. Unless the text contains 'ANONYMOUS', a series of
		regex-based corrections is applied to the parsed parts (stray initials,
		"FOR ..." tails, multiple people joined by AND/&// /+), the name is
		re-capitalized, and the middle name and suffix are folded into the
		returned first and last name respectively.

		NOTE(review): the result of ``.encode('ascii', 'ignore')`` is used as
		text (str.join, re.sub), which presumably means this targets Python 2 --
		confirm before porting.
		"""
		#pre
		# Flatten the input to one upper-cased ASCII string without tab/CR/LF.
		if type(field) == tuple:
			w_name = re.sub('[\t\r\n]', '', ", ".join([x.encode('ascii', 'ignore') for x in field])).upper()
		else:
			w_name = re.sub('[\t\r\n]', '', field.encode('ascii', 'ignore')).upper()
		if 'ANONYMOUS' not in w_name:
			# Keep only one ';'-separated segment: the first one normally, the
			# second one when the text contains ' FORMER ' (this indexes [1], so
			# it raises IndexError if no ';' is present in that case).
			if ' FORMER ' not in w_name:
				w_name = re.split(";", w_name)[0]
			else:
				w_name = re.split(";", w_name)[1]

			# Drop a space directly after or directly before any of ` ' / +
			# (the trailing labels such as 6A, 4A-C are the original author's
			# case references; what they point to is not visible here).
			w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name) #6A, 4A-C
			
			out = HumanName(w_name)
			# Strip a leading single-letter initial ("A " or "A. ") from the middle name.
			out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
			# Same for a multi-word last name.
			if " " in out.last:
				out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
			# If the first name is nothing but one initial and a middle name
			# exists, promote the middle name to first; otherwise just strip a
			# leading initial from the first name.
			if re.sub("^[A-Z]\.|^[A-Z]", '', out.first) == '' and len(out.middle) != 0:
				out.first, out.middle = out.middle, ""
			else:
				out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)
			
			#post
			
			# A middle name that starts with "FOR " or "- " is discarded.
			if out.middle.startswith("FOR ") or out.middle.startswith("- "): #7A, 1B, 3E
				out.middle = "" 

			# Cut everything from " FOR " onwards out of the last name.
			if " FOR " in out.last:
				out.last = re.sub(" FOR .*", '', out.last)

			# No last name but a title was detected: re-parse a multi-word first
			# name on its own, or treat a single-word first name as the last name.
			if len(out.last) == 0 and len(out.title) != 0: #9A
				if " " in out.first:
					out = HumanName(out.first)
				else:
					out.first, out.last = "", out.first

			# Two people joined in the middle name: the text before the first
			# " AND " / " & " becomes the last name.
			if " AND " in out.middle or " & " in out.middle:
				out.last = re.split("( AND )|( & )", out.middle)[0]
				out.middle = ""
			# NOTE: the next line is indented with a space followed by tabs in
			# the original; it is left untouched here.
 			if "AND" in out.last or "&" in out.last:

				# Leading conjunction: re-parse and keep only the parsed last
				# name; embedded conjunction: keep the text before it.
				if out.last.startswith("AND ") or out.last.startswith("& "): #3F
					out.last = HumanName(out.last).last
				elif " AND " in out.last or " & " in out.last:
					out.last = re.sub("( AND ).*|( & ).*", '', out.last)
			# Keep only the first person of a joined first name, and the part
			# of the last name before any "/".
			out.first = re.split("( AND )|&|/|\+", out.first)[0]
			out.last = re.split("/", out.last)[0].strip()
			# First name has exactly one capital letter and the last name has
			# several words: take the first two words of the last name as
			# first and last name.
			if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
				out.first = out.last.split(" ")[0]
				out.last = out.last.split(" ")[1]
			# Undo the upper-casing applied above (HumanName.capitalize).
			out.capitalize()
			first, last = out.first, out.last
			if len(out.middle) > 0:
				# A middle name that is only a single initial is dropped;
				# hyphenated parts are glued on directly, anything else is
				# appended after a space.
				if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
					out.middle = ""
				elif first.endswith("-") or out.middle.startswith("-"):
					first += out.middle
				else:
					first += " %s" % out.middle #8A-B
			# A suffix is appended to the returned last name.
			if len(out.suffix) > 0:
				last += " %s" % out.suffix #2A
			return (first, last)
		else:
			# 'ANONYMOUS' entries skip all cleanup and are parsed as-is.
			name = HumanName(w_name)
			return (name.first, name.last)
Exemplo n.º 25
0
def normalize_author_name(author):
    """Return an author name in "Last, First Middle, Suffix" form.

    The name is parsed by ``HumanName`` with a customised ``Constants``
    object: the default titles are replaced by a fixed list and a set of
    lower-case roman numerals is registered as non-acronym suffixes.
    Single-letter first/middle names get a trailing period, two initials
    are written without a space between them, roman-numeral suffixes are
    upper-cased and any other suffix is given a period if it has none.

    :param author: author name
    :type author: string

    :return name: the normalized author name
    """
    accepted_titles = (
        u'Dr', u'Prof', u'Professor', u'Sir', u'Editor', u'Ed', u'Mr', u'Mrs',
        u'Ms', u'Chair', u'Co-Chair', u'Chairs', u'co-Chairs',
    )
    numeral_suffixes = (
        u'v', u'vi', u'vii', u'viii', u'ix', u'x', u'xii', u'xiii', u'xiv',
        u'xv',
    )
    # Characters a suffix may consist of to be treated as a roman numeral.
    numeral_alphabet = frozenset(u'MDCLXVI()')

    parser_constants = Constants()
    parser_constants.titles.remove(*parser_constants.titles).add(
        *accepted_titles)
    parser_constants.suffix_not_acronyms.add(*numeral_suffixes)

    def _looks_like_initial(part):
        # Either a lone letter or anything that already carries a period.
        return u'.' in part or len(part) == 1

    def _dot_bare_initial(part):
        # Only a single character without a period needs one appended.
        if len(part) == 1 and u'.' not in part:
            return part + u'.'
        return part

    parsed = HumanName(author, constants=parser_constants)
    parsed.first = _dot_bare_initial(parsed.first)
    parsed.middle = _dot_bare_initial(parsed.middle)

    # Two initials are written back to back ("J.R."), otherwise with a space.
    both_initials = (_looks_like_initial(parsed.first)
                     and _looks_like_initial(parsed.middle))
    glue = u'' if both_initials else u' '
    given_names = glue.join((parsed.first, parsed.middle))

    raw_suffix = parsed.suffix
    if frozenset(raw_suffix.upper()) <= numeral_alphabet:
        suffix = raw_suffix.upper()
    elif u'.' in raw_suffix:
        suffix = raw_suffix
    else:
        suffix = raw_suffix + u'.'

    pieces = (parsed.last, given_names.strip(), suffix)
    return u', '.join(piece for piece in pieces if piece)
Exemplo n.º 26
0
def clean_name(dirty_name):
    """Return ``"<first> <last>"`` for a name after removing punctuation.

    Every character in ``string.punctuation`` is deleted before the text is
    parsed with ``HumanName``. When the parser finds no first name, the
    parsed title is used in its place.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    parsed = HumanName(dirty_name.translate(punctuation_table))
    if not parsed.first:
        parsed.first = parsed.title
    return ' '.join((parsed.first, parsed.last))
    def parse_raw_name(name_raw: str) -> (str, str, str, set):
        """
        Parses a (usually messy) raw name and returns
        first, middle, last names and a set of extracted positions

        Processing order: strip '[Privlog:]' info, split off a position
        attached with ' - ', pull out parenthesised positions, cut off known
        institution names, then hand the remainder to HumanName and clean up
        initials and capitalization. A non-empty parsed suffix is added to the
        positions as well.

        :param name_raw: str
        :return: str, str, str, set


        Parses name and returns as human name
        >>> n = Person('TEAGUE CE JR')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Teague', 'C', 'E', 'JR')

        >>> n = Person('teague, ce jr')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Teague', 'C', 'E', 'JR')


        >>> n = Person('Teague, Claude Edward, Jr., Ph.D. ')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Teague', 'Claude', 'Edward', 'JR., PH.D.')

        >>> n = Person('Teague, J - BAT')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Teague', 'J', '', 'BAT')

        >>> n = Person('BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Baker', 'T', 'E', 'NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')

        >>> n = Person('BAKER-cj')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Baker', 'C', 'J', '')

        JR and SR are by default recognized as titles -> turn off through CONSTANTS.
        >>> n = Person('Baker, JR')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Baker', 'J', 'R', '')

        >>> n = Person('DUNN WL #')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Dunn', 'W', 'L', '')

        >>> n = Person('Dunn, W. L.')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Dunn', 'W', 'L', '')

        >>> n = Person('TEMKO SL, COVINGTON AND BURLING')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Temko', 'S', 'L', 'COVINGTON AND BURLING')

        >>> n = Person('Temko, Stanley L [Privlog:] TEMKO,SL')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Temko', 'Stanley', 'L', '')

        >>> n = Person('Temko-SL, Covington & Burling')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Temko', 'S', 'L', 'COVINGTON & BURLING')

        >>> n = Person('HENSON, A. (AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL)')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Henson', 'A', '', 'AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL')

        >>> n = Person('HENSON, A. (CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL) (HANDWRITTEN NOTES)')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Henson', 'A', '', 'CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL HANDWRITTEN NOTES')

        >>> n = Person('Holtzman, A.,  Murray, J. ,  Henson, A. ,  Pepples, E. ,  Stevens, A. ,  Witt, S.')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Holtzman', 'A', '', '')

        >>> n = Person('Holtz, Jacob, Jacob & Medinger')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Holtz', 'Jacob', '', 'JACOB & MEDINGER')

        # This one breaks. But I don't think it can be avoided.
        >>> n = Person('Holtz, Jacob Alexander, Jacob & Medinger')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Holtz', '', '', 'JACOB ALEXANDER, JACOB & MEDINGER')

        >>> n = Person('PROCTOR DF, JOHNS HOPKINS SCHOOL OF HYGIENE')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Proctor', 'D', 'F', 'JOHNS HOPKINS SCHOOL OF HYGIENE')

        """

        # remove privlog info, e.g. 'Temko, Stanley L [Privlog:] TEMKO,SL'. It confuses
        # the name parser
        privlog_marker = '[Privlog:]'
        privlog_id = name_raw.find(privlog_marker)
        if privlog_id == 0:
            # the marker leads the string: the name can only be what follows it
            name_raw = name_raw[len(privlog_marker):]
        elif privlog_id > 0:
            # otherwise the name is what precedes the marker
            name_raw = name_raw[:privlog_id]

        # position is often attached with a dash, e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
        # (only handled when there is exactly one ' - ' separator)
        if name_raw.count(' - ') == 1:
            name_raw, extracted_position = name_raw.split(' - ')
            extracted_positions = {extracted_position.strip()}
        else:
            extracted_positions = set()

        # extract positions in parens e.g. Henson, A (Chadbourne & Park)
        paren_positions = re.findall(r'\([^(]+\)', name_raw)
        for position in paren_positions:
            extracted_positions.add(position.strip(',#() '))
            name_raw = name_raw.replace(position, '')

        institution_regexes = [

            # TI/CTR
            r'[,#] Tobacco Inst.+$',
            r'[\(\,\#] ?SAB Exec.*$',

            # American Tobacco
            r'[(,#] ?American .+$',
            r'[\(\,\#] ?Amer Brands.*$',
            r'[,#] American Tob',
            r'[,#] Atco.*$',

            # PM
            r'[\(\,\#] ?Philip Morris.*$',

            # RJR
            r'[\(\,\#] ?RJR.*$',

            # LAW FIRMS
            r'[\(\,\#] ?Arnold &.*$',
            r'[\(\,\#] ?Chadbourne.*$',
            r'[,#] COVINGTON [AB&]*.+$',
            r'[,#] Foster [&A]*.+$',
            r'[,#] JACOB [A&]*.+$',

            # Universities
            # match a ( or , or # at the beginning, then some characters that
            # aren't (,# until the end of the string.
            # "Univ" must end at a word boundary so that it does not match in the
            # middle of a word, e.g. "Universal"
            r'[\(\,\#][^\(\,\#]+ Univ\b.*$',
            r'[\(\,\#][^\(\,\#]+ School\b.*$',

            # Organizations
            r'[\(\,\#][^\(\,\#]+ Federal Trade Commission.*$',

        ]
        for institution in institution_regexes:
            extracted_institution = re.search(institution, name_raw, re.IGNORECASE)
            if extracted_institution:
                extracted_positions.add(extracted_institution.group().strip(',#() '))
                # keep only the text in front of the institution
                name_raw = name_raw[:extracted_institution.start()]

        # remove #
        name_raw = name_raw.strip("#").strip()

        # initials attached with a dash, e.g. 'BAKER-cj' -> 'BAKER cj'.
        # Length checks first: the cleaned name may be empty or a single character.
        if len(name_raw) >= 2 and name_raw[-2] == '-':
            name_raw = name_raw[:-2] + " " + name_raw[-1:]
        if len(name_raw) > 2 and name_raw[-3] == '-':
            name_raw = name_raw[:-3] + " " + name_raw[-2:]

        name = HumanName(name_raw)

        # e.g. Dunn W -> parsed as last name W. -> switch first/last
        if len(name.last) <= 2 and len(name.first) > 2:
            name.first, name.last = name.last, name.first

        # remove periods from initials
        if len(name.first) == 2 and name.first[1] == '.':
            name.first = name.first[0]
        if len(name.middle) == 2 and name.middle[1] == '.':
            name.middle = name.middle[0]

        # If first name is length 2 (Teague, CE), the two letters are most likely initials.
        if len(name.first) == 2:
            name.middle = name.first[1].upper()
            name.first = name.first[0].upper()

        # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
        if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
            name.middle = name.first[2]
            name.first = name.first[0]

        name.last = name.last.capitalize()
        name.first = name.first.capitalize()
        name.middle = name.middle.capitalize()

        # if multiple names are passed, they often end up in the middle name
        # e.g. 'Holtzman, A.,  Murray, J. ,  Henson, A.  -> only allow one comma or set to empty
        if name.middle.count(',') > 1:
            name.middle = ''

        # a long suffix with several periods is discarded rather than kept as a position
        if len(name.suffix) > 20 and name.suffix.count('.') > 2:
            name.suffix = ''

        if name.suffix:
            extracted_positions.add(name.suffix)

        return name.first, name.middle, name.last, extracted_positions
Exemplo n.º 28
0
    return potential_names


def match(name1, name2):
    """Return True if any potential form of ``name1`` equals one of ``name2``.

    Both names are parsed with ``HumanName``; the candidate forms come from
    ``get_potential_names`` and are compared after being passed through ``u``.
    """
    parsed_first = HumanName(name1)
    parsed_second = HumanName(name2)
    for left in get_potential_names(parsed_first):
        for right in get_potential_names(parsed_second):
            if u(left) == u(right):
                return True
    return False


# Load every AP candidate row into memory.
with open('ap_candidates.csv') as f:
    reader = csv.DictReader(f)
    ap_candidates = list(reader)

# Give each row a 'name' entry: the string form of a HumanName assembled
# from the row's separate name columns.
for row in ap_candidates:
    n = HumanName()
    n.first = row['first_name']
    n.middle = row['middle_name']
    n.last = row['last_name']
    n.suffix = row['suffix']
    row['name'] = str(n)

# Load the historical AP id rows unchanged.
with open('ap_historical_ids.csv') as f:
    reader = csv.DictReader(f)
    ap_candidates2 = list(reader)


def find(name):
    for row in ap_candidates:
        if match(name, row['name']):
            # print(f'found match for {name} with', row['name'])
            return int(row['pol_id'])