def parse_full_name(self):
    """
    Parse ``self._full_name`` into the title/first/middle/last/suffix lists.

    The main parse method for the parser. This method is run upon
    assignment to the :py:attr:`full_name` attribute or instantiation.

    Basic flow is to hand off to :py:func:`pre_process` to handle
    nicknames. It then splits on commas and chooses a code path depending
    on the number of commas:

    * no comma: ``title first middle middle middle last suffix``
    * suffix comma:
      ``title first middle last [suffix], suffix [suffix] [, suffix]``
    * lastname comma:
      ``last [suffix], title first middles[,] suffix [,suffix]``

    :py:func:`parse_pieces` then splits those parts on spaces and
    :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

    Side effects: every ``*_list`` attribute is reset before parsing,
    ``self._full_name`` is replaced by its whitespace-collapsed form,
    ``self.unparsable`` is left ``True`` only when ``len(self)`` is zero
    after parsing, and :py:func:`post_process` is always called last.
    """
    self.title_list = []
    self.first_list = []
    self.middle_list = []
    self.last_list = []
    self.suffix_list = []
    self.nickname_list = []
    # Assume failure until at least one member has been populated.
    self.unparsable = True

    self.pre_process()
    self._full_name = self.collapse_whitespace(self._full_name)

    # break up full_name by commas
    parts = [x.strip() for x in self._full_name.split(",")]

    # Lazy %-style arguments: the message is only built if DEBUG is enabled.
    log.debug("full_name: %s", self._full_name)
    log.debug("parts: %s", parts)

    if len(parts) == 1:
        # no commas, title first middle middle middle last suffix
        #            part[0]
        pieces = self.parse_pieces(parts)
        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            # Look-ahead piece, or None when this is the final piece.
            nxt = pieces[i + 1] if i + 1 < p_len else None

            # title must have a next piece, unless it's just a title
            if self.is_title(piece) \
                    and (nxt or p_len == 1) \
                    and not self.first:
                self.title_list.append(piece)
                continue
            if not self.first:
                self.first_list.append(piece)
                continue
            if self.are_suffixes(pieces[i + 1:]) or (
                # if the next piece is the last piece and a roman
                # numeral but this piece is not an initial
                self.is_roman_numeral(nxt)
                and i == p_len - 2
                and not self.is_an_initial(piece)
            ):
                # Everything after this piece is a suffix, so this piece
                # closes the last name and parsing of pieces is done.
                self.last_list.append(piece)
                self.suffix_list += pieces[i + 1:]
                break
            if not nxt:
                self.last_list.append(piece)
                continue

            self.middle_list.append(piece)
    elif self.are_suffixes(parts[1].split(' ')) \
            and len(parts[0].split(' ')) > 1:
        # All of the second part is suffixes and there is more than one
        # piece in the first part. (Suffixes will never appear after last
        # names only, and this allows potential first names to be in
        # suffixes, e.g. "Johnson, Bart".)

        # suffix comma:
        # title first middle last [suffix], suffix [suffix] [, suffix]
        #               parts[0],          parts[1:...]
        self.suffix_list += parts[1:]
        pieces = self.parse_pieces(parts[0].split(' '))
        log.debug("pieces: %s", u(pieces))
        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            nxt = pieces[i + 1] if i + 1 < p_len else None

            if self.is_title(piece) \
                    and (nxt or p_len == 1) \
                    and not self.first:
                self.title_list.append(piece)
                continue
            if not self.first:
                self.first_list.append(piece)
                continue
            if self.are_suffixes(pieces[i + 1:]):
                self.last_list.append(piece)
                # Suffixes found before the comma go in front of the
                # ones that were collected from after the comma.
                self.suffix_list = pieces[i + 1:] + self.suffix_list
                break
            if not nxt:
                self.last_list.append(piece)
                continue
            self.middle_list.append(piece)
    else:
        # lastname comma:
        # last [suffix], title first middles[,] suffix [,suffix]
        #      parts[0],      parts[1],              parts[2:...]
        pieces = self.parse_pieces(parts[1].split(' '), 1)
        log.debug("pieces: %s", u(pieces))

        # lastname part may have suffixes in it
        lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
        for piece in lastname_pieces:
            # the first one is always a last name, even if it looks like
            # a suffix
            if self.is_suffix(piece) and self.last_list:
                self.suffix_list.append(piece)
            else:
                self.last_list.append(piece)

        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            nxt = pieces[i + 1] if i + 1 < p_len else None

            if self.is_title(piece) \
                    and (nxt or p_len == 1) \
                    and not self.first:
                self.title_list.append(piece)
                continue
            if not self.first:
                self.first_list.append(piece)
                continue
            if self.is_suffix(piece):
                self.suffix_list.append(piece)
                continue
            self.middle_list.append(piece)

        # Any comma-separated parts after the second one are suffixes,
        # provided the third part is not empty.
        if len(parts) > 2 and parts[2]:
            self.suffix_list += parts[2:]

    # A length can never be negative, so the former ``len(self) < 0`` test
    # made this branch unreachable and the name was always reported as
    # parsable. NOTE(review): presumably ``__len__`` counts the populated
    # name members -- confirm against its definition.
    if len(self) == 0:
        log.info("Unparsable: \"%s\" ", self.original)
    else:
        self.unparsable = False
    self.post_process()
def parse_full_name(self):
    """
    Parse ``self._full_name`` into the title/first/middle/last/suffix lists.

    The main parse method for the parser. This method is run upon
    assignment to the :py:attr:`full_name` attribute or instantiation.

    Basic flow is to hand off to :py:func:`pre_process` to handle
    nicknames. It then splits on commas and chooses a code path depending
    on the number of commas:

    * no comma: ``title first middle middle middle last suffix``
      (a lone non-title piece is stored as the last name when a nickname
      is present, otherwise as the first name)
    * suffix comma:
      ``title first middle last [suffix], suffix [suffix] [, suffix]``
    * lastname comma:
      ``last [suffix], title first middles[,] suffix [,suffix]``

    :py:func:`parse_pieces` then splits those parts on spaces and
    :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

    Side effects: every ``*_list`` attribute is reset before parsing,
    ``self._full_name`` is replaced by its whitespace-collapsed form,
    ``self.unparsable`` is left ``True`` only when ``len(self)`` is zero
    after parsing, and :py:func:`post_process` is always called last.
    """
    self.title_list = []
    self.first_list = []
    self.middle_list = []
    self.last_list = []
    self.suffix_list = []
    self.nickname_list = []
    # Assume failure until at least one member has been populated.
    self.unparsable = True

    self.pre_process()
    self._full_name = self.collapse_whitespace(self._full_name)

    # break up full_name by commas
    parts = [x.strip() for x in self._full_name.split(",")]

    log.debug("full_name: %s", self._full_name)
    log.debug("parts: %s", parts)

    if len(parts) == 1:
        # no commas, title first middle middle middle last suffix
        #            part[0]
        pieces = self.parse_pieces(parts)
        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            # Look-ahead piece, or None when this is the final piece.
            nxt = pieces[i + 1] if i + 1 < p_len else None

            # title must have a next piece, unless it's just a title
            if self.is_title(piece) \
                    and (nxt or p_len == 1) \
                    and not self.first:
                self.title_list.append(piece)
                continue
            if not self.first:
                # A single piece accompanied by a nickname is stored as
                # the last name rather than the first name.
                if p_len == 1 and self.nickname:
                    self.last_list.append(piece)
                    continue
                self.first_list.append(piece)
                continue
            if self.are_suffixes(pieces[i + 1:]) or (
                # if the next piece is the last piece and a roman
                # numeral but this piece is not an initial
                self.is_roman_numeral(nxt)
                and i == p_len - 2
                and not self.is_an_initial(piece)
            ):
                # Everything after this piece is a suffix, so this piece
                # closes the last name and parsing of pieces is done.
                self.last_list.append(piece)
                self.suffix_list += pieces[i + 1:]
                break
            if not nxt:
                self.last_list.append(piece)
                continue

            self.middle_list.append(piece)
    elif self.are_suffixes(parts[1].split(' ')) \
            and len(parts[0].split(' ')) > 1:
        # All of the second part is suffixes and there is more than one
        # piece in the first part. (Suffixes will never appear after last
        # names only, and this allows potential first names to be in
        # suffixes, e.g. "Johnson, Bart".)

        # suffix comma:
        # title first middle last [suffix], suffix [suffix] [, suffix]
        #               parts[0],          parts[1:...]
        self.suffix_list += parts[1:]
        pieces = self.parse_pieces(parts[0].split(' '))
        log.debug("pieces: %s", u(pieces))
        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            nxt = pieces[i + 1] if i + 1 < p_len else None

            if self.is_title(piece) \
                    and (nxt or p_len == 1) \
                    and not self.first:
                self.title_list.append(piece)
                continue
            if not self.first:
                self.first_list.append(piece)
                continue
            if self.are_suffixes(pieces[i + 1:]):
                self.last_list.append(piece)
                # Suffixes found before the comma go in front of the
                # ones that were collected from after the comma.
                self.suffix_list = pieces[i + 1:] + self.suffix_list
                break
            if not nxt:
                self.last_list.append(piece)
                continue
            self.middle_list.append(piece)
    else:
        # lastname comma:
        # last [suffix], title first middles[,] suffix [,suffix]
        #      parts[0],      parts[1],              parts[2:...]
        pieces = self.parse_pieces(parts[1].split(' '), 1)
        log.debug("pieces: %s", u(pieces))

        # lastname part may have suffixes in it
        lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
        for piece in lastname_pieces:
            # the first one is always a last name, even if it looks like
            # a suffix
            if self.is_suffix(piece) and self.last_list:
                self.suffix_list.append(piece)
            else:
                self.last_list.append(piece)

        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            nxt = pieces[i + 1] if i + 1 < p_len else None

            if self.is_title(piece) \
                    and (nxt or p_len == 1) \
                    and not self.first:
                self.title_list.append(piece)
                continue
            if not self.first:
                self.first_list.append(piece)
                continue
            if self.is_suffix(piece):
                self.suffix_list.append(piece)
                continue
            self.middle_list.append(piece)

        # Any comma-separated parts after the second one are suffixes,
        # provided the third part is not empty.
        if len(parts) > 2 and parts[2]:
            self.suffix_list += parts[2:]

    # A length can never be negative, so the former ``len(self) < 0`` test
    # made this branch unreachable and the name was always reported as
    # parsable. NOTE(review): presumably ``__len__`` counts the populated
    # name members -- confirm against its definition.
    if len(self) == 0:
        log.info("Unparsable: \"%s\" ", self.original)
    else:
        self.unparsable = False
    self.post_process()