예제 #1
0
def get_noun_phrases(text,
                     strict=False,
                     return_source=False,
                     window=3,
                     valid_punctuation=None) -> Generator:
    """
    Yield proper-noun (NNP/NNPS) phrases found in text.

    :param text: input text to scan
    :param strict: when False, NNP tokens within `window` positions of each
        other are merged into a single phrase
    :param return_source: when True, yield (phrase, sentence) tuples
    :param window: maximum token-index gap allowed when merging
    :param valid_punctuation: punctuation tokens allowed inside a phrase;
        falls back to VALID_PUNCTUATION when falsy
    """
    valid_punctuation = valid_punctuation or VALID_PUNCTUATION

    for sentence in get_sentence_list(text):
        # POS-tag the tokens of this sentence
        tagged = nltk.pos_tag(get_token_list(sentence))

        phrases = []
        prev_idx = None
        for idx, (token, tag) in enumerate(tagged):
            joinable = (not strict and prev_idx is not None
                        and (idx - prev_idx) < window)
            if tag in ('NNP', 'NNPS'):
                if joinable:
                    # No space directly after an opening parenthesis
                    glue = ("" if "(" in valid_punctuation
                            and phrases[-1][-1] == "(" else " ")
                    phrases[-1] += glue + token
                else:
                    phrases.append(token)
                prev_idx = idx
            elif joinable:
                if tag in ('CC',) or token in valid_punctuation:
                    if token.lower() in ('or',):
                        continue
                    # Conjunctions and "(" get a leading space; other
                    # punctuation attaches directly
                    glue = ' ' if token.lower() in ('&', 'and', '(') else ''
                    phrases[-1] += glue + token
                    prev_idx = idx
                else:
                    prev_idx = None

        # Normalize each collected phrase and yield it
        for phrase in phrases:
            phrase = phrase.strip()
            if len(phrase) <= 2:
                continue

            if phrase.lower().endswith(' and'):
                phrase = phrase[0:-4].strip()
            elif phrase.endswith(' &'):
                phrase = phrase[0:-2].strip()

            phrase = strip_unicode_punctuation(phrase).strip(
                string.punctuation).strip(string.whitespace)
            yield (phrase, sentence) if return_source else phrase
예제 #2
0
def get_geopolitical(text,
                     strict=False,
                     return_source=False,
                     window=2) -> Generator:
    """
    Yield geopolitical entities (GPE) found in text.

    :param text: input text to scan
    :param strict: when False, nearby GPE chunks are merged into one entity
    :param return_source: when True, yield (gpe, sentence) tuples
    :param window: maximum chunk-index gap allowed when merging
    """
    for sentence in get_sentence_list(text):
        # POS-tag, then NE-chunk the sentence
        tagged = nltk.pos_tag(get_token_list(sentence))

        entities = []
        prev_idx = None
        for idx, node in enumerate(nltk.ne_chunk(tagged)):
            mergeable = (not strict and prev_idx is not None
                         and (idx - prev_idx) < window)
            if isinstance(node, nltk.tree.Tree):
                # Only GPE-labeled subtrees are of interest
                if node.label() != 'GPE':
                    continue
                words = " ".join(leaf[0] for leaf in node)
                if mergeable:
                    entities[-1] += " " + words
                else:
                    entities.append(words)
                prev_idx = idx
            elif mergeable:
                token, tag = node
                if tag in ("NNP", "NNPS"):
                    entities[-1] += " " + token
                    prev_idx = idx
                elif tag in ("CC",) or token in VALID_PUNCTUATION:
                    if token.lower() in ("or",):
                        continue
                    # Conjunctions get a leading space; punctuation attaches
                    glue = " " if token.lower() in ("&", "and") else ""
                    entities[-1] += glue + token
                    prev_idx = idx
                else:
                    prev_idx = None

        # Normalize each collected entity and yield it
        for gpe in entities:
            gpe = gpe.strip()
            if len(gpe) <= 2:
                continue

            if gpe.lower().endswith(" and"):
                gpe = gpe[0:-4]
            elif gpe.endswith(" &"):
                gpe = gpe[0:-2]

            gpe = strip_unicode_punctuation(gpe).strip(
                string.punctuation).strip(string.whitespace)
            yield (gpe, sentence) if return_source else gpe
예제 #3
0
def get_locations(text,
                  strict=False,
                  return_source=False,
                  window=2) -> Generator:
    """
    Get locations from text using Stanford libraries.

    :param text: input text to scan
    :param strict: when False, nearby LOCATION tokens are merged
    :param return_source: when True, yield (location, sentence) tuples
    :param window: maximum token-index gap allowed when merging
    :return: generator of locations (or (location, sentence) tuples)
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag the current sentence only.  The original tagged the entire
        # text here (get_tokens_list(text)), which re-ran the NER tagger on
        # the whole document once per sentence and mispaired results with
        # their source sentence; every sibling extractor tags the sentence.
        sentence_pos = STANFORD_NER_TAGGER.tag(get_token_list(sentence))

        # Iterate through tagged tokens, merging adjacent LOCATION tokens
        locations = []
        last_loc_pos = None
        for i, token in enumerate(sentence_pos):
            # Check label
            if token[1] == 'LOCATION':
                if not strict and last_loc_pos is not None and (
                        i - last_loc_pos) < window:
                    # No space before a clitic like "'s"
                    locations[-1] += (" " if not token[0].startswith("'") else
                                      "") + token[0]
                else:
                    locations.append(token[0])
                last_loc_pos = i
            elif token[0] in [".", ","]:
                if not strict and last_loc_pos is not None and (
                        i - last_loc_pos) < window:
                    # "." and "," are always punctuation, so they attach
                    # directly with no separator
                    locations[-1] += token[0]
                    last_loc_pos = i

        # Cleanup and yield
        for location in locations:
            location = strip_unicode_punctuation(location).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield location, sentence
            else:
                yield location
예제 #4
0
def get_persons(text,
                strict=False,
                return_source=False,
                window=2) -> Generator:
    """
    Get persons from text using Stanford libraries.

    :param text: input text to scan
    :param strict: when False, nearby PERSON tokens are merged
    :param return_source: when True, yield (name, sentence) tuples
    :param window: maximum token-index gap allowed when merging
    :return: generator of names (or (name, sentence) tuples)
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag the current sentence only.  The original tagged the entire
        # text here (get_tokens_list(text)), which re-ran the NER tagger on
        # the whole document once per sentence and mispaired results with
        # their source sentence; every sibling extractor tags the sentence.
        sentence_pos = STANFORD_NER_TAGGER.tag(get_token_list(sentence))

        # Iterate through tagged tokens, merging adjacent PERSON tokens
        names = []
        last_person_pos = None
        for i, token in enumerate(sentence_pos):
            # Check label
            if token[1] == 'PERSON':
                if not strict and last_person_pos is not None and (
                        i - last_person_pos) < window:
                    names[-1] += " " + token[0]
                else:
                    names.append(token[0])
                last_person_pos = i
            elif token[0] in [".", ","]:
                if not strict and last_person_pos is not None and (
                        i - last_person_pos) < window:
                    # "." and "," are always punctuation, so they attach
                    # directly with no separator
                    names[-1] += token[0]
                    last_person_pos = i

        # Cleanup and yield
        for name in names:
            name = strip_unicode_punctuation(name).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield name, sentence
            else:
                yield name
예제 #5
0
def get_persons(text,
                strict=False,
                return_source=False,
                window=2) -> Generator:
    """
    Get names from text.

    :param text: input text to scan
    :param strict: when False, nearby PERSON chunks are merged into one name
    :param return_source: when True, yield (person, sentence) tuples
    :param window: maximum chunk-index gap allowed when merging
    :return: generator of person names (or (person, sentence) tuples)
    """
    # Company annotations depend only on the full text; compute them once
    # instead of once per sentence as the original did.
    companies = list(get_company_annotations(text))

    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        persons = []
        last_person_pos = None

        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            # isinstance (not type ==) to match the sibling extractors and
            # accept Tree subclasses
            if isinstance(chunk, nltk.tree.Tree):
                # Check label
                if chunk.label() == 'PERSON':
                    if not strict and last_person_pos is not None and (
                            i - last_person_pos) < window:
                        persons[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        persons.append(" ".join([c[0] for c in chunk]))
                    last_person_pos = i
            elif not strict and last_person_pos is not None and (
                    i - last_person_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    persons[-1] += " " + chunk[0]
                    last_person_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    # Conjunctions get a leading space; punctuation attaches
                    persons[-1] += (" " if chunk[0].lower() in ["&", "and"]
                                    else "") + chunk[0]
                    last_person_pos = i
                else:
                    last_person_pos = None

        # Cleanup and yield
        for person in persons:
            # Cleanup
            person = person.strip()
            if len(person) <= 2:
                continue

            # Drop names matching the stop-word pattern
            if PERSONS_STOP_WORDS.search(person):
                continue

            person = strip_unicode_punctuation(person).strip(
                string.punctuation).strip(string.whitespace)

            # Skip phrases that are actually company names
            if contains_companies(person, companies):
                continue

            if person.lower().endswith(" and"):
                person = person[0:-4]
            elif person.endswith(" &"):
                person = person[0:-2]

            if return_source:
                yield person, sentence
            else:
                yield person
예제 #6
0
def get_organizations(text,
                      strict=False,
                      return_source=False,
                      window=2) -> Generator:
    """
    Get organizations from text.

    :param text: input text to scan
    :param strict: when False, nearby ORGANIZATION chunks are merged
    :param return_source: when True, yield (org, sentence) tuples
    :param window: maximum chunk-index gap allowed when merging
    :return: generator of organization names (or (org, sentence) tuples)
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        organizations = []
        last_org_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            # isinstance (not type ==) to match the sibling extractors and
            # accept Tree subclasses
            if isinstance(chunk, nltk.tree.Tree):
                # Check label
                if chunk.label() in ['ORGANIZATION']:
                    if not strict and last_org_pos is not None and (
                            i - last_org_pos) < window:
                        organizations[-1] += " " + " ".join(
                            [c[0] for c in chunk])
                    else:
                        organizations.append(" ".join([c[0] for c in chunk]))
                    last_org_pos = i
            elif not strict and last_org_pos is not None and (
                    i - last_org_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    organizations[-1] += " " + chunk[0]
                    last_org_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    # Conjunctions get a leading space; punctuation attaches
                    organizations[-1] += (" " if chunk[0].lower()
                                          in ["&", "and"] else "") + chunk[0]
                    last_org_pos = i
                else:
                    last_org_pos = None

        # Cleanup and yield
        for org in organizations:
            # Cleanup
            org = org.strip()
            if len(org) <= 2:
                continue

            if org.lower().endswith(" and"):
                org = org[0:-4]
            elif org.endswith(" &"):
                org = org[0:-2]

            org = strip_unicode_punctuation(org).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield org, sentence
            else:
                yield org