示例#1
0
def contains_companies(person: str, companies) -> bool:
    """
    Tell whether a person candidate looks like a company or is part of a known company name.

    :param person: candidate string to check.
    :param companies: iterable of company annotations; only their ``name`` attribute is read.
    :return: True if ``nltk_re.get_companies`` finds a company in ``person`` whose name is
        neither its own company type nor its description, or if ``person`` is a substring
        of the name of any entry in ``companies``; False otherwise.
    """
    if COMPANY_TYPES_RE.search(person):
        # A hit whose name is just its company type or its description does not count.
        # noinspection PyTypeChecker
        detected = nltk_re.get_companies(person)
        if any(hit.name != hit.company_type and hit.name != hit.description
               for hit in detected):
            return True

    # Substring check against already-known companies. It targets the common wording
    #   This Amendment to Employment Agreement ("Amendment") is entered into between
    #   Marsh Supermarkets, Inc. (the "Company"), and Don E. Marsh (the "Executive").
    # The known trade-off is that it misfires on
    #   "This is an agreement between John Smith and John Smith, LLC"
    return any(person in known.name for known in companies)
示例#2
0
def contains_companies(person:str, companies) -> bool:
    """
    Tell whether a person candidate looks like a company or is part of a known company name.

    :param person: candidate string to check.
    :param companies: iterable of ``(company name, company type)`` pairs.
    :return: True if ``nltk_re.get_companies`` finds a company in ``person`` whose name is
        neither its own type nor its description, or if ``person`` is a substring of the
        name of any pair in ``companies``; False otherwise.
    """
    if COMPANY_TYPES_RE.search(person):
        detected = nltk_re.get_companies(person,
                                         detail_type=True,
                                         parse_name_abbr=True)
        for hit in detected:
            # Each hit is a 6-tuple; only name, type and description matter here.
            name, kind, _kind_abbr, _kind_label, description, _name_abbr = hit

            # A hit whose name is just its type or its description does not count.
            if name != kind and name != description:
                return True

    # Substring check against already-known companies. It targets the common wording
    #   This Amendment to Employment Agreement ("Amendment") is entered into between
    #   Marsh Supermarkets, Inc. (the "Company"), and Don E. Marsh (the "Executive").
    # The known trade-off is that it misfires on
    #   "This is an agreement between John Smith and John Smith, LLC"
    return any(person in known_name for known_name, _known_type in companies)
示例#3
0
def get_company_annotations(
    text: str,
    strict: bool = False,
    use_gnp: bool = False,
    count_unique: bool = False,
    name_upper: bool = False,
) -> Generator[CompanyAnnotation, None, None]:
    """
    Find company names in text and yield them as annotations.

    The text is split into sentences, noun phrases are taken from each sentence, and
    every phrase matching COMPANY_TYPES_RE is passed to ``nltk_re.get_companies``.
    Coordinates of each annotation are shifted by the sentence start plus the phrase
    start before the annotation is yielded.

    :param text: source text; nothing is yielded if it equals its own upper-cased form.
    :param strict: forwarded to get_noun_phrases (only used when use_gnp is True).
    :param use_gnp: use get_noun_phrases if True, otherwise np_extractor.get_np.
    :param count_unique: yield only the first annotation for each
        (lower-cased name, company type abbreviation) pair, after the whole text has been
        scanned; every repeat increments that annotation's ``counter``.
    :param name_upper: convert the annotation name to upper case.
    :return: generator of CompanyAnnotation objects.
    """
    # Text without any lower-case characters is not processed at all.
    if text.upper() == text:
        return
    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]

    first_seen = {}  # type: Dict[Tuple[str, str], CompanyAnnotation]

    if not COMPANY_TYPES_RE.search(text):
        return

    for sent_start, _sent_end, sent_text in get_sentence_span_list(text):
        # Fully upper-cased sentences are skipped as well.
        if sent_text.upper() == sent_text:
            continue

        if use_gnp:
            noun_phrases = get_noun_phrases(sent_text,
                                            strict=strict,
                                            valid_punctuation=valid_punctuation)
        else:
            noun_phrases = np_extractor.get_np(sent_text)
        located = PhrasePositionFinder.find_phrase_in_source_text(
            sent_text, list(noun_phrases))

        for phrase, phrase_start, _phrase_end in located:
            if not COMPANY_TYPES_RE.search(phrase):
                continue

            # noinspection PyTypeChecker
            for ant in nltk_re.get_companies(
                    phrase, use_sentence_splitter=False
            ):  # type: CompanyAnnotation
                # A hit whose name is just its type or its description is dropped.
                if ant.name == ant.company_type or ant.name == ant.description:
                    continue

                # Shift the phrase-relative coordinates by sentence and phrase starts.
                begin, end = ant.coords[0], ant.coords[1]
                ant.coords = (begin + sent_start + phrase_start,
                              end + sent_start + phrase_start)

                if name_upper:
                    ant.name = ant.name.upper()

                if not count_unique:
                    yield ant
                    continue

                key = (ant.name.lower() if ant.name else None,
                       ant.company_type_abbr)
                earlier = first_seen.get(key)
                if earlier:
                    earlier.counter += 1
                else:
                    first_seen[key] = ant

    if count_unique:
        yield from first_seen.values()
示例#4
0
def get_companies(text: str,
                  strict: bool = False,
                  use_gnp: bool = False,
                  detail_type: bool = False,
                  count_unique: bool = False,
                  name_upper: bool = False,
                  parse_name_abbr: bool = False,
                  return_source: bool = False):
    """
    Find company names in text and yield them as tuples.

    Each yielded tuple starts with ``(name, type)`` and is extended, in this order, by:
    ``(type abbreviation, type label, description)`` if detail_type is set, the
    abbreviated company name if parse_name_abbr is set, the source sentence if
    return_source is set and count_unique is not, and an occurrence count if
    count_unique is set.

    :param text: source text; nothing is yielded if it equals its own upper-cased form.
    :param strict: forwarded to get_noun_phrases (only used when use_gnp is True).
    :param use_gnp: use get_noun_phrases if True, otherwise np_extractor.get_np.
    :param detail_type: return detailed type (type, unified type, label) vs type only.
    :param count_unique: return only unique companies, keyed case-insensitively by name
        together with the type abbreviation; results are yielded after the whole text
        has been scanned.
    :param name_upper: return company name in upper case.
    :param parse_name_abbr: return company abbreviated name if exists.
    :param return_source: append the sentence the company was found in (ignored when
        count_unique is set).
    :return: generator of tuples as described above.
    """
    # Text without any lower-case characters is not processed at all.
    if text.upper() == text:
        return
    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]

    tallies = {}

    if not COMPANY_TYPES_RE.search(text):
        return

    for sentence in get_sentence_list(text):
        # Fully upper-cased sentences are skipped as well.
        if sentence.upper() == sentence:
            continue

        if use_gnp:
            noun_phrases = get_noun_phrases(sentence, strict=strict,
                                            valid_punctuation=valid_punctuation)
        else:
            noun_phrases = np_extractor.get_np(sentence)

        for phrase in noun_phrases:
            if not COMPANY_TYPES_RE.search(phrase):
                continue

            for found in nltk_re.get_companies(phrase,
                                               detail_type=True,
                                               parse_name_abbr=True):
                co_name, co_type, co_type_abbr, co_type_label, co_desc, co_abbr = found

                # A hit whose name is just its type or its description is dropped.
                if co_name == co_type or co_name == co_desc:
                    continue
                if name_upper:
                    co_name = co_name.upper()

                fields = [co_name, co_type]
                if detail_type:
                    fields.extend((co_type_abbr, co_type_label, co_desc))
                if parse_name_abbr:
                    fields.append(co_abbr)

                if not count_unique:
                    if return_source:
                        fields.append(sentence)
                    yield tuple(fields)
                    continue

                key = (co_name.lower() if co_name else None, co_type_abbr)
                earlier = tallies.get(key)
                if earlier:
                    # Keep the first occurrence's fields, bump the trailing counter.
                    tallies[key] = earlier[:-1] + (earlier[-1] + 1,)
                else:
                    tallies[key] = tuple(fields) + (1,)

    if count_unique:
        yield from tallies.values()