def get_noun_phrases(text, strict=False, return_source=False, window=3, valid_punctuation=None) -> Generator:
    """
    Find proper-noun (NNP/NNPS) phrases in text.

    :param text: text to scan
    :param strict: if True, never merge nearby proper nouns into one phrase
    :param return_source: if True, yield (phrase, sentence) pairs
    :param window: maximum token distance across which adjacent proper nouns
        are merged into the same phrase
    :param valid_punctuation: punctuation tokens allowed inside a phrase;
        defaults to VALID_PUNCTUATION
    :return: generator of phrases (or (phrase, sentence) tuples)
    """
    valid_punctuation = valid_punctuation or VALID_PUNCTUATION

    for sentence in get_sentence_list(text):
        # POS-tag the tokens of this sentence
        tagged = nltk.pos_tag(get_token_list(sentence))

        phrases = []
        last_pos = None
        for idx, (token, tag) in enumerate(tagged):
            # Merge with the previous phrase when close enough and not strict
            joinable = (not strict and last_pos is not None
                        and (idx - last_pos) < window)

            if tag in ('NNP', 'NNPS'):
                if joinable:
                    # No separator right after an opening paren when "(" is
                    # an allowed punctuation token
                    if "(" in valid_punctuation and phrases[-1][-1] == "(":
                        phrases[-1] += token
                    else:
                        phrases[-1] += " " + token
                else:
                    phrases.append(token)
                last_pos = idx
            elif joinable:
                if tag == 'CC' or token in valid_punctuation:
                    # Never join phrases across "or"
                    if token.lower() == "or":
                        continue
                    prefix = ' ' if token.lower() in ('&', 'and', '(') else ''
                    phrases[-1] += prefix + token
                    last_pos = idx
                else:
                    # Any other token breaks the current phrase
                    last_pos = None

        # Clean up phrases and yield
        for phrase in phrases:
            phrase = phrase.strip()
            if len(phrase) <= 2:
                continue
            # Drop trailing conjunctions left over from the join logic
            if phrase.lower().endswith(' and'):
                phrase = phrase[0:-4].strip()
            elif phrase.endswith(' &'):
                phrase = phrase[0:-2].strip()
            phrase = strip_unicode_punctuation(phrase).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield phrase, sentence
            else:
                yield phrase
def get_geopolitical(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Find geopolitical entities (GPE) in text via NLTK NE chunking.

    :param text: text to scan
    :param strict: if True, never merge nearby entities into one name
    :param return_source: if True, yield (gpe, sentence) pairs
    :param window: maximum chunk distance across which entities are merged
    :return: generator of GPE names (or (gpe, sentence) tuples)
    """
    for sentence in get_sentence_list(text):
        # POS-tag the tokens of this sentence
        tagged = nltk.pos_tag(get_token_list(sentence))

        found = []
        last_pos = None
        for idx, node in enumerate(nltk.ne_chunk(tagged)):
            joinable = (not strict and last_pos is not None
                        and (idx - last_pos) < window)

            if isinstance(node, nltk.tree.Tree):
                # Only GPE-labeled subtrees contribute; other trees are ignored
                if node.label() == 'GPE':
                    words = " ".join([leaf[0] for leaf in node])
                    if joinable:
                        found[-1] += " " + words
                    else:
                        found.append(words)
                    last_pos = idx
            elif joinable:
                token, tag = node
                if tag in ["NNP", "NNPS"]:
                    found[-1] += " " + token
                    last_pos = idx
                elif tag in ["CC"] or token in VALID_PUNCTUATION:
                    # Never join entities across "or"
                    if token.lower() == "or":
                        continue
                    found[-1] += (" " if token.lower() in ["&", "and"]
                                  else "") + token
                    last_pos = idx
                else:
                    # Any other token breaks the current entity
                    last_pos = None

        # Clean up names and yield
        for gpe in found:
            gpe = gpe.strip()
            if len(gpe) <= 2:
                continue
            # Drop trailing conjunctions left over from the join logic
            if gpe.lower().endswith(" and"):
                gpe = gpe[0:-4]
            elif gpe.endswith(" &"):
                gpe = gpe[0:-2]
            gpe = strip_unicode_punctuation(gpe).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield gpe, sentence
            else:
                yield gpe
def get_locations(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get locations from text using the Stanford NER tagger.

    :param text: text to scan
    :param strict: if True, never merge nearby LOCATION tokens into one name
    :param return_source: if True, yield (location, sentence) pairs
    :param window: maximum token distance across which locations are merged
    :return: generator of location names (or (location, sentence) tuples)
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag the tokens of THIS sentence. The original code tagged the
        # whole text (via get_tokens_list(text)) on every sentence
        # iteration, which both re-did the expensive NER call per sentence
        # and yielded every location once per sentence in the text.
        sentence_pos = STANFORD_NER_TAGGER.tag(get_token_list(sentence))

        # Iterate through tagged tokens
        locations = []
        last_loc_pos = None
        for i, token in enumerate(sentence_pos):
            # Check label
            if token[1] == 'LOCATION':
                if not strict and last_loc_pos is not None and (
                        i - last_loc_pos) < window:
                    # No space before clitics like "'s"
                    locations[-1] += (" " if not token[0].startswith("'")
                                      else "") + token[0]
                else:
                    locations.append(token[0])
                last_loc_pos = i
            else:
                # Allow "." and "," to continue a name within the window
                if token[0] in [".", ","]:
                    if not strict and last_loc_pos is not None and (
                            i - last_loc_pos) < window:
                        locations[-1] += (
                            " " if token[0] not in string.punctuation
                            and not token[0].startswith("'")
                            else "") + token[0]
                        last_loc_pos = i

        # Cleanup and yield
        for location in locations:
            location = strip_unicode_punctuation(location).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield location, sentence
            else:
                yield location
def get_persons(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get persons from text using the Stanford NER tagger.

    :param text: text to scan
    :param strict: if True, never merge nearby PERSON tokens into one name
    :param return_source: if True, yield (name, sentence) pairs
    :param window: maximum token distance across which names are merged
    :return: generator of person names (or (name, sentence) tuples)
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag the tokens of THIS sentence. The original code tagged the
        # whole text (via get_tokens_list(text)) on every sentence
        # iteration, which both re-did the expensive NER call per sentence
        # and yielded every name once per sentence in the text.
        sentence_pos = STANFORD_NER_TAGGER.tag(get_token_list(sentence))

        # Iterate through tagged tokens
        names = []
        last_person_pos = None
        for i, token in enumerate(sentence_pos):
            # Check label
            if token[1] == 'PERSON':
                if not strict and last_person_pos is not None and (
                        i - last_person_pos) < window:
                    names[-1] += " " + token[0]
                else:
                    names.append(token[0])
                last_person_pos = i
            else:
                # Allow "." and "," to continue a name within the window
                if token[0] in [".", ","]:
                    if not strict and last_person_pos is not None and (
                            i - last_person_pos) < window:
                        names[-1] += (
                            " " if token[0] not in string.punctuation
                            else "") + token[0]
                        last_person_pos = i

        # Cleanup and yield
        for name in names:
            name = strip_unicode_punctuation(name).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield name, sentence
            else:
                yield name
def get_persons(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get person names from text via NLTK NE chunking.

    NOTE(review): this redefines the Stanford-based ``get_persons`` earlier
    in the file — the later definition wins at import time; confirm which
    implementation callers expect.

    :param text: text to scan
    :param strict: if True, never merge nearby entities into one name
    :param return_source: if True, yield (person, sentence) pairs
    :param window: maximum chunk distance across which entities are merged
    :return: generator of person names (or (person, sentence) tuples)
    """
    # Company annotations depend only on the full text; compute them once
    # instead of once per sentence (the original recomputed this inside
    # the sentence loop).
    companies = list(get_company_annotations(text))

    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        persons = []
        last_person_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            # isinstance is the idiomatic type check (was: type(...) ==)
            if isinstance(chunk, nltk.tree.Tree):
                # Check label
                if chunk.label() == 'PERSON':
                    if not strict and last_person_pos is not None and (
                            i - last_person_pos) < window:
                        persons[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        persons.append(" ".join([c[0] for c in chunk]))
                    last_person_pos = i
            elif not strict and last_person_pos is not None and (
                    i - last_person_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    persons[-1] += " " + chunk[0]
                    last_person_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    # Never join names across "or"
                    if chunk[0].lower() in ["or"]:
                        continue
                    persons[-1] += (" " if chunk[0].lower() in ["&", "and"]
                                    else "") + chunk[0]
                    last_person_pos = i
                else:
                    # Any other token breaks the current name
                    last_person_pos = None

        # Cleanup and yield
        for person in persons:
            person = person.strip()
            if len(person) <= 2:
                continue
            if PERSONS_STOP_WORDS.search(person):
                continue
            person = strip_unicode_punctuation(person).strip(
                string.punctuation).strip(string.whitespace)
            # Skip names that are actually company mentions
            if contains_companies(person, companies):
                continue
            # Drop trailing conjunctions left over from the join logic
            if person.lower().endswith(" and"):
                person = person[0:-4]
            elif person.endswith(" &"):
                person = person[0:-2]
            if return_source:
                yield person, sentence
            else:
                yield person
def get_organizations(text, strict=False, return_source=False, window=2) -> Generator:
    """
    Get organizations from text via NLTK NE chunking.

    :param text: text to scan
    :param strict: if True, never merge nearby entities into one name
    :param return_source: if True, yield (org, sentence) pairs
    :param window: maximum chunk distance across which entities are merged
    :return: generator of organization names (or (org, sentence) tuples)
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        organizations = []
        last_org_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            # isinstance is the idiomatic type check (was: type(...) ==)
            if isinstance(chunk, nltk.tree.Tree):
                # Check label
                if chunk.label() in ['ORGANIZATION']:
                    if not strict and last_org_pos is not None and (
                            i - last_org_pos) < window:
                        organizations[-1] += " " + " ".join(
                            [c[0] for c in chunk])
                    else:
                        organizations.append(" ".join([c[0] for c in chunk]))
                    last_org_pos = i
            elif not strict and last_org_pos is not None and (
                    i - last_org_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    organizations[-1] += " " + chunk[0]
                    last_org_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    # Never join names across "or"
                    if chunk[0].lower() in ["or"]:
                        continue
                    organizations[-1] += (" " if chunk[0].lower()
                                          in ["&", "and"] else "") + chunk[0]
                    last_org_pos = i
                else:
                    # Any other token breaks the current name
                    last_org_pos = None

        # Cleanup and yield
        for org in organizations:
            org = org.strip()
            if len(org) <= 2:
                continue
            # Drop trailing conjunctions left over from the join logic
            if org.lower().endswith(" and"):
                org = org[0:-4]
            elif org.endswith(" &"):
                org = org[0:-2]
            org = strip_unicode_punctuation(org).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield org, sentence
            else:
                yield org