def test_get_phrase_matcher():
    text = ("Aujourd'hui, Michaël et Jessica écrivent des unit tests dans la joie et la bonne humeur, "
            "mais où sont donc les enfants ?")
    matcher = FirstName(ignore_case=False)
    # both first names must be found at their exact character positions
    expected = [Offset(13, 19, "PERS"), Offset(24, 30, "PERS")]
    assert matcher.get_matches(text=text) == expected
    assert matcher.contain_first_names(text=text) is True
def get_juridictions(text: str) -> List[Offset]:
    """
    Extract court names from text.
    :param text: original paragraph text
    :return: offsets as a list
    """
    offsets = []
    for pattern in (juridiction_pattern_1, juridiction_pattern_2):
        offsets.extend(Offset(m.start(), m.end(), "COURT_1")
                       for m in pattern.finditer(text))
    return offsets
def get_clerk_name(text: str) -> List[Offset]:
    """
    Extract clerk names from text.
    :param text: original paragraph text
    :return: offsets as a list
    """
    offsets = []
    for pattern in (extract_clerk_pattern_1, extract_clerk_pattern_2):
        offsets.extend(Offset(start=m.start(), end=m.end(), type="JUDGE_CLERK_1")
                       for m in pattern.finditer(text))
    return offsets
def test_licence_plate():
    # plates that must be detected, with their expected span
    detected = [
        ("AA111AA", Offset(0, 7, 'LICENCE_PLATE')),
        ("AA-111-AA", Offset(0, 9, 'LICENCE_PLATE')),
        ("AA 111 AA", Offset(0, 9, 'LICENCE_PLATE')),
        ("1AA11", Offset(0, 5, 'LICENCE_PLATE')),
        ("9999 ZZZ 99", Offset(0, 11, 'LICENCE_PLATE')),
    ]
    for text, expected in detected:
        assert get_licence_plate(text) == [expected]
    # strings with extra surrounding digits must not match at all
    for text in ("1 AA111AA", "AA 111 AA 1", "9999 ZZZ 999"):
        assert get_licence_plate(text) == []
def test_rg_regex():
    cases = [
        (" RG n° 11/17073", [Offset(7, 15, 'RG')]),
        (" RG : 13/03625 D", [Offset(6, 15, 'RG')]),
        ("RG numéro : 12/01503", [Offset(12, 20, 'RG')]),
        ("N°RG: 13/03409", [Offset(6, 14, 'RG')]),
        # a bare number without any RG marker must not match
        ("N° 13/03409", []),
        ("Numéro d'inscription au répertoire général : 14/01913",
         [Offset(45, 53, 'RG')]),
    ]
    for text, expected in cases:
        assert get_rg_from_regex(text) == expected
def test_rg_from_case_id():
    case_id = "CA-aix-en-provence-20130208-1022871-jurica"
    matcher = MatchRg(case_id=case_id)
    assert matcher.get_rg_from_case_id() == "1022871"
    # the RG number appears verbatim inside the case id
    assert matcher.get_rg_offset_from_text(text=case_id) == [
        Offset(28, 35, 'RG')
    ]
    # the RG may also be found in free text, even with spaces in the middle
    free_text = "Le numéro RG est celui-ci 102 /2871."
    assert matcher.get_rg_offset_from_text(text=free_text) == [
        Offset(26, 35, 'RG')
    ]
def test_bilou_conv():
    doc: Doc = pytest.nlp.make_doc("Ceci est un test.")
    # an UNKNOWN entity turns its token tag into '-'
    unknown_middle = ['O', '-', 'O', 'O', 'O']
    offset1 = [Offset(5, 8, "UNKNOWN")]
    assert convert_unknown_bilou(doc, offsets=offset1).ner == unknown_middle
    assert convert_unknown_bilou_bulk([doc], [offset1])[0].ner == unknown_middle
    # a known type produces a regular BILOU unit tag
    offset2 = [Offset(5, 8, "PERS")]
    assert convert_unknown_bilou(doc, offsets=offset2).ner == ['O', 'U-PERS', 'O', 'O', 'O']
    offset3 = [Offset(0, 4, "UNKNOWN")]
    assert convert_unknown_bilou(doc, offsets=offset3).ner == ['-', 'O', 'O', 'O', 'O']
    def get_matches(self, text: str) -> List[Offset]:
        """
        Find occurrences of frequent entities in the provided text.
        :param text: original text
        :return: a list of offsets
        """
        found = list()
        for matched_text, start in self.matcher.findall(text):
            end = start + len(matched_text)

            # end is exclusive, so it may equal len(text) when the match
            # finishes on the very last character of the text
            boundary_after_ok = (end == len(text)) or (not text[end].isalnum())
            boundary_before_ok = (start == 0) or (not text[start - 1].isalnum())
            # only keep spans that start like a proper noun or a number
            starts_like_entity = text[start].isupper() or text[start].isdecimal()

            if boundary_before_ok and starts_like_entity and boundary_after_ok:
                span = text[start:end]
                type_name = self.frequent_entities_dict[span.lower()]
                found.append(Offset(start, end, type_name))

        return found
def remove_spaces_included_in_offsets(text: str, offsets: List[Offset]) -> List[Offset]:
    """
    If offset doesn't match a word boundary its type is removed by Spacy (unknown action)
    This function shrinks each offset so it no longer starts or ends on whitespace.
    More info -> https://spacy.io/usage/linguistic-features
    Test code:
    ----
    import spacy
    from spacy.gold import GoldParse
    from spacy.tokens import Doc
    nlp = spacy.blank('fr')
    doc2 = nlp('Who is Chaka Khan popo?')
    gold2 = GoldParse(doc2, entities=[(7, 18, 'PERSON')])
    print(gold2.ner)
    ----
    :param text: original text
    :param offsets: list of original offsets for this text
    :return: list of new offsets fixed
    """
    result = list()
    for offset in offsets:
        # skip offsets that fall outside the text or are empty
        if (offset.start < 0) or (offset.end - 1 >= len(text)) or (offset.start == offset.end):
            continue
        # strip EVERY leading whitespace character
        # (the previous implementation only removed one per side)
        new_start = offset.start
        while new_start < offset.end and text[new_start].isspace():
            new_start += 1
        # offset.end is exclusive, hence the - 1 to inspect the last selected char
        new_end = offset.end
        while new_end > new_start and text[new_end - 1].isspace():
            new_end -= 1
        # a span made only of whitespace would become empty: drop it instead of
        # emitting an inverted offset (old behavior on a single-space span)
        if new_start < new_end:
            result.append(Offset(new_start, new_end, offset.type))
    return result
def get_bar(text: str) -> List[Offset]:
    """
    Extract offsets related to a bar and its city localization.
    French bar list: http://www.conferencedesbatonniers.com/barreaux/userslist/7-liste-des-barreaux
    :param text: original text
    :return: offset as a list
    """
    matches = barreau_pattern.finditer(text)
    return [Offset(m.start(), m.end(), "BAR_1") for m in matches]
def test_random_case_change():
    text = "La Banque est fermée"
    offsets = [Offset(3, 9, "PERS")]
    seed(123)
    results = [random_case_change(text, offsets, 100) for _ in range(1, 500)]
    # with a 100% rate every casing variant of the entity should show up
    for variant in ("La Banque est fermée",
                    "La banque est fermée",
                    "La BANQUE est fermée"):
        assert variant in results
def test_social_security():
    valid_id = "2 40 09 93 618 017 05"
    # same number but with a wrong control key
    invalid_id = "2 40 09 93 618 017 06"
    assert get_social_security_number(valid_id) == [
        Offset(0, 21, 'SOCIAL_SECURITY_NUMBER')
    ]
    # an extra leading digit must prevent any match
    assert get_social_security_number("1" + valid_id) == []
    assert get_social_security_number(invalid_id) == []
# Exemplo n.º 13
# 0
def parse_offsets(text: str) -> Offset:
    """
    Convert a whitespace-separated line "start end type" to the right offset format.
    :param text: original line
    :return: the parsed offset
    """
    # split() with no argument tolerates repeated spaces, tabs and a trailing
    # newline, which split(' ') did not (it left '\n' glued to the type token)
    items = text.split()
    return Offset(int(items[0]), int(items[1]), items[2])
def test_remove_spaces():
    text = "Je suis ici et je programme."
    offsets = [Offset(3, 8, "TEST")]
    original_span = text[offsets[0].start:offsets[0].end]
    # the raw span ends with a space on purpose
    assert original_span == "suis "
    fixed = remove_spaces_included_in_offsets(text, offsets)
    fixed_span = text[fixed[0].start:fixed[0].end]
    assert fixed_span == original_span.strip()
# Exemplo n.º 15
# 0
def get_lawyer_name(text: str) -> List[Offset]:
    """
    Extract lawyer names from text.
    :param text: original paragraph text
    :return: offsets as a list
    """
    matches = extract_lawyer.finditer(text)
    return [Offset(m.start(), m.end(), "LAWYER") for m in matches]
def get_company_names(text: str) -> List[Offset]:
    """
    Extract company names from string text.
    :param text: original text
    :return: a list of offsets
    """
    matches = find_corp.finditer(text)
    return [Offset(start=m.start(), end=m.end(), type="ORGANIZATION_1")
            for m in matches]
# Exemplo n.º 17
# 0
 def get_rg_offset_from_text(self, text: str) -> List[Offset]:
     """
     Extract every RG number offset found in a text, if any.
     :param text: original text
     :return: offsets as a list
     """
     matches = self.pattern.finditer(text)
     return [Offset(m.start(), m.end(), "RG") for m in matches]
def get_partie_pers(text: str) -> List[Offset]:
    """
    Extract people names from text.
    :param text: original paragraph text
    :return: offsets as a list
    """
    patterns = (extract_partie_pp_pattern_1,
                extract_partie_pp_pattern_2,
                extract_partie_pp_pattern_3)
    offsets = []
    for pattern in patterns:
        offsets.extend(Offset(start=m.start(), end=m.end(), type="PERS")
                       for m in pattern.finditer(text))
    return offsets
def get_date(text: str) -> List[Offset]:
    """
    Parse text to retrieve offsets mentioning a date.
    :param text: original text
    :return: offsets as a list
    """
    patterns = (date_pattern_in_letters_regex,
                date_pattern_in_numbers_regex_1,
                date_pattern_in_numbers_regex_2)
    offsets = []
    for pattern in patterns:
        offsets.extend(Offset(m.start(), m.end(), "DATE_1")
                       for m in pattern.finditer(text))
    return offsets
# Exemplo n.º 20
# 0
def remove_key_words(text: str, offsets: List[Offset],
                     rate: int) -> Tuple[str, List[Offset]]:
    """
    Modify text to remove some key words, making the learning harder and the model more robust.
    :param text: original paragraph as a string
    :param offsets: list of extracted offsets
    :param rate: chance as an integer between 1 and 100 that a key word is removed
    :return: a tuple (new_text, offsets)
    """
    words_to_delete_offsets: List[Offset] = key_words_matcher.get_matches(
        text=text, tag="TO_DELETE")

    if (len(words_to_delete_offsets) == 0) or (len(offsets) == 0):
        return text, offsets

    # map each detected span text to its entity type, so types can be
    # restored after the text has been rewritten
    detected_spans = dict()
    for offset in offsets:
        span_text = text[offset.start:offset.end]
        if len(span_text) > 0:
            detected_spans[span_text] = offset.type

    if len(detected_spans) == 0:
        return text, offsets

    original_content_offsets_matcher = AcoraMatcher(content=list(
        detected_spans.keys()),
                                                    ignore_case=False)

    cleaned_text = list()
    start_selection_offset = 0
    for offset in words_to_delete_offsets:
        if randint(1, 99) < rate:
            # - 1 to also drop the space just before the keyword;
            # clamp to 0 so a keyword at the very start of the text does not
            # build a negative index (text[0:-1] would keep almost everything)
            cleaned_text.append(text[start_selection_offset:max(offset.start - 1, 0)])
        else:
            cleaned_text.append(text[start_selection_offset:offset.end])
        start_selection_offset = offset.end

    cleaned_text.append(text[start_selection_offset:len(text)])

    cleaned_text = ''.join(cleaned_text)

    # locate where the known spans now live in the rewritten text
    updated_offsets = original_content_offsets_matcher.get_matches(
        text=cleaned_text, tag="UNKNOWN")

    offsets_to_return = list()

    # restore original offset type name
    for offset in updated_offsets:
        span_text = cleaned_text[offset.start:offset.end]
        type_name = detected_spans[span_text]
        offsets_to_return.append(Offset(offset.start, offset.end, type_name))

    return cleaned_text, offsets_to_return
def get_stripped_offsets(text: str, tag: str) -> Offset:
    """
    Get the offset of the actual text, i.e. the text without its surrounding whitespaces.
    :param text: a line of text
    :param tag: tag to apply to the text
    :return: an offset covering the stripped text
    """
    content = text.strip()
    begin = text.find(content)
    return Offset(begin, begin + len(content), tag)
# Exemplo n.º 22
# 0
def get_rg_from_regex(text: str) -> List[Offset]:
    """
    Extract RG number from text when some pattern is found.
    :param text: original text
    :return: offset as a list
    """
    match = extract_rg_from_text_regex.search(text)
    if match is None:
        return list()
    return [Offset(match.start(), match.end(), "RG")]
def test_credit_card():
    valid_number = "4474 2054 6736 1295"
    invalid_number = "1265 157309414560"

    # a valid number surrounded by words is matched with its exact span
    assert get_credit_card_number("pop " + valid_number +
                                  " apres") == [Offset(4, 24, 'CREDIT_CARD')]
    assert get_credit_card_number("pop " + invalid_number + " apres") == []
    # checksum failures and bad surrounding digits must not match
    assert get_credit_card_number("1234 1234 1234 1234") == []
    assert get_credit_card_number("1- 1234 1234 1234 1234") == []
    assert get_credit_card_number("1- " + valid_number) == []
def get_addresses(text: str) -> List[Offset]:
    """
    Extract addresses from text.
    :param text: original paragraph text
    :return: offsets as a list
    """
    patterns = (extract_address_pattern_1,
                extract_address_pattern_2,
                extract_address_pattern_3)
    found = []
    for pattern in patterns:
        found.extend(Offset(m.start(), m.end(), "ADDRESS_1")
                     for m in pattern.finditer(text))

    return sorted(remove_duplicates(found),
                  key=lambda o: (o.start, o.end))
# Exemplo n.º 25
# 0
def test_match_headers_content():
    config_training = get_config_default()
    header_content_all_cases = parse_xml_header(path=config_training["xml_unittest_file"])
    # take the first case of the fixture file
    first_case_id = list(header_content_all_cases.keys())[0]
    headers_matcher = MatchValuesFromHeaders(
        current_header=header_content_all_cases[first_case_id], threshold_size=3)
    matcher_partie_pp = headers_matcher.get_matcher_of_partie_pp_from_headers()

    text1 = "C'est Catherine ***REMOVED*** qui est responsable de ces faits avec M. LEON ***REMOVED***"

    assert matcher_partie_pp.get_matches(text1, "PERS") == [Offset(6, 29, "PERS")]
# Exemplo n.º 26
# 0
    def get_extended_names(self, text: str) -> List[Offset]:
        """
        Apply the generated regex patterns to the current paragraph text.
        No computation happens when there is nothing to find.
        :param text: current original text
        :return: offset list
        """
        if self.dont_detect:
            return list()

        found = [
            Offset(m.start(), m.end(), self.type_name)
            for m in self.pattern_title.finditer(text)
        ]
        found += [
            Offset(m.start(), m.end(), self.type_name)
            for m in self.pattern_extend_right.finditer(text)
        ]
        return sorted(remove_duplicates(found), key=lambda o: o.start)
# Exemplo n.º 27
# 0
 def get_matches(self, text: str) -> List[Offset]:
     """
     Find matches of first names in a text.
     :param text: original text
     :return: list of offsets
     """
     raw_offsets = self.matcher.get_matches(text=text, tag="PERS")
     # matched names include a trailing space, so shift the end offset back by 1
     return [
         Offset(o.start, o.end - 1, o.type)
         for o in raw_offsets
     ]
def get_phone_number(text: str) -> List[Offset]:
    """
    Find phone numbers.
    Pattern catches numbers in one block or separated per block of 2 numbers.
    :param text: original text
    :return: list of offsets
    """
    # "compte" suggests the digits belong to an account number, not a phone
    # number: skip the whole paragraph. This check is loop invariant, so it is
    # done once up front (it used to be re-evaluated for every match, together
    # with an always-true `pattern is not None` test inside the finditer loop).
    if "compte" in text:
        return list()
    return [
        Offset(match.start(), match.end(), "PHONE_NUMBER")
        for match in phone_regex.finditer(text)
    ]
# Exemplo n.º 29
# 0
def get_credit_card_number(text: str) -> List[Offset]:
    """
    Retrieve list of offsets related to credit cards.
    The checksum code is verified to avoid most false positives.
    :param text: original text as a string
    :return: list of offsets
    """
    match = credit_card_regex.search(text)
    if match is None:
        return []
    candidate = text[match.start():match.end()]
    if not validate(candidate):
        return []
    return [Offset(match.start(), match.end(), "CREDIT_CARD")]
def test_remove_unwanted_words():
    # a title ("Monsieur") prefixing the name must be trimmed from the offset
    text1 = "Monsieur toto"
    offsets1 = [Offset(0, len("Monsieur toto"), "PERS")]
    assert clean_offsets_from_unwanted_words(text=text1, offsets=offsets1) == [Offset(9, 13, "PERS")]
    text2 = "Succombant même partiellement, Madame GUERIN supportera la charge "
    offsets2 = [Offset(31, 37, "PERS"), Offset(31, 44, "PERS")]
    expected2 = [Offset(37, 37, "PERS"), Offset(38, 44, "PERS")]
    assert clean_offsets_from_unwanted_words(text=text2, offsets=offsets2) == expected2