from spacy.lang.en import English
from spacy.lang.nb import Norwegian

# Assumption: get_data_from_text_service_item_url() is an external helper,
# defined elsewhere in this codebase, that parses regulation metadata
# (year, month, day, id, chapter/section/part numbers) out of a text
# service item URL.

def create_api_response_for_post_identify_alternative_reference_in_text_service_norwegian_chapter_input(forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = English()
    forward_result = []

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents'] # discovered entities in the line
        last_index_number_of_ents = len(ents)-1
        doc = nlp(text)

        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(text_service_url)
        metadata_from_url = {}
        for key in ("regulation_year", "regulation_month", "regulation_day",
                    "regulation_id", "chapter_number", "section_number",
                    "part_number", "sub_part_number"):
            if key in result_text_service_url:
                metadata_from_url[key] = result_text_service_url[key]

        # For each ent in line
        for ent_index_number, ent in enumerate(ents):

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']

            ent_text = text[ent_start:ent_end] # same as: doc[ent_token_span.start:ent_token_span.end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            ent_token_span = doc.char_span(ent_start,ent_end)
            ent_token_span_start = ent_token_span.start
            ent_token_span_end = ent_token_span.end
            
            #
            # Statement concluder
            # No need to filter. All ents are welcome!
            #

            detection_with_url_metadata = dict(metadata_from_url)
            detection_with_url_metadata["alternative_reference"] = ent_text
            forward_result.append(detection_with_url_metadata)

    return forward_result
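
# A minimal usage sketch for the function above (hedged). The input shape is
# assumed from how the function reads it: each line is a dict with 'title'
# (a text service item URL), 'text', and 'ents' holding character-offset
# entity spans, as in spaCy's JSON-style NER output. The URL is hypothetical.
example_alternative_reference_lines = [
    {
        'title': 'https://example.org/regulation/2014/7/1/1072/chapter/1/section/2',
        'text': 'jf. forskrift om bygging av skip',
        'ents': [{'label': 'ALTERNATIVE_REFERENCE', 'start': 4, 'end': 32}],
    }
]
# Each appended result merges the URL metadata with the detected text, e.g.:
# {..., 'alternative_reference': 'forskrift om bygging av skip'}
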
def create_api_response_for_post_identify_PROTECTED_in_text_service_norwegian_chapter(forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = English()
    forward_result = []

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents'] # discovered entities in the line
        last_index_number_of_ents = len(ents)-1
        doc = nlp(text)

        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(text_service_url)
        metadata_from_url = {}
        for key in ("regulation_year", "regulation_month", "regulation_day",
                    "regulation_id", "chapter_number", "section_number",
                    "part_number", "sub_part_number"):
            if key in result_text_service_url:
                metadata_from_url[key] = result_text_service_url[key]

        detection_with_url_metadata = dict(metadata_from_url)
        detection_with_url_metadata["protected"] = True
        forward_result.append(detection_with_url_metadata)

    return forward_result
def create_api_response_for_post_identify_PASSENGER_in_text_service_norwegian_chapter(
        title_dictionary,
        forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = English()
    forward_result = []

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        last_index_number_of_ents = len(ents) - 1
        doc = nlp(text)

        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {}
        for key in ("regulation_year", "regulation_month", "regulation_day",
                    "regulation_id", "chapter_number", "section_number",
                    "part_number", "sub_part_number"):
            if key in result_text_service_url:
                metadata_from_url[key] = result_text_service_url[key]

        # add chapter_title and section_title
        if "chapter_title" in title_dictionary:
            metadata_from_url['chapter_title'] = title_dictionary[
                'chapter_title']
        if "section_title_in_dictionary" in title_dictionary:
            section_title_dictionary = title_dictionary[
                'section_title_in_dictionary']
            if text_service_url in section_title_dictionary:
                metadata_from_url['section_title'] = section_title_dictionary[
                    text_service_url]

        # For each ent in line
        for ent_index_number, ent in enumerate(ents):

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']

            ent_text = text[ent_start:ent_end]  # same as: doc[ent_token_span.start:ent_token_span.end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            ent_token_span = doc.char_span(ent_start, ent_end)
            ent_token_span_start = ent_token_span.start
            ent_token_span_end = ent_token_span.end

            #print(ent_text + " - " + ent_label + " (" + str(ent_token_span_start) + ":" + str(ent_token_span_end) + ")")

            #
            # Statement concluder
            #

            if ent_label == "PASSENGER":

                if "eller færre" in ent_text:
                    detection_with_url_metadata = dict(metadata_from_url)
                    detection_with_url_metadata[
                        "passenger_context"] = "eller færre"
                    detection_with_url_metadata["passenger_value_1"] = ent_doc[
                        0].text
                    detection_with_url_metadata["measurement_text"] = ent_doc[
                        words_in_doc_count - 1].text
                    forward_result.append(detection_with_url_metadata)
                elif ent_text.startswith("mer enn"):
                    detection_with_url_metadata = dict(metadata_from_url)
                    detection_with_url_metadata[
                        "passenger_context"] = "mer enn"
                    detection_with_url_metadata["passenger_value_1"] = ent_doc[
                        2].text
                    detection_with_url_metadata["measurement_text"] = ent_doc[
                        words_in_doc_count - 1].text
                    forward_result.append(detection_with_url_metadata)

    return forward_result
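
# A hedged tokenization check for the PASSENGER branches above; the entity
# texts are hypothetical. Under the blank English() tokenizer,
# "mer enn 12 passasjerer" ("more than 12 passengers") splits into
# ["mer", "enn", "12", "passasjerer"], so that branch reads the value from
# token 2 and the measurement from the last token; for
# "12 eller færre passasjerer" ("12 or fewer passengers") the value is token 0.
_passenger_sketch = English()("mer enn 12 passasjerer")
assert _passenger_sketch[2].text == "12"  # passenger_value_1
assert _passenger_sketch[len(_passenger_sketch) - 1].text == "passasjerer"  # measurement_text
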
def create_api_response_for_post_identify_FLASHPOINT_in_text_service_norwegian_chapter(
        forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = English()
    forward_result = []

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        last_index_number_of_ents = len(ents) - 1
        doc = nlp(text)

        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {}
        for key in ("regulation_year", "regulation_month", "regulation_day",
                    "regulation_id", "chapter_number", "section_number",
                    "part_number", "sub_part_number"):
            if key in result_text_service_url:
                metadata_from_url[key] = result_text_service_url[key]

        # For each ent in line
        ents_count = len(ents)
        for ent_index_number, ent in enumerate(ents):

            ent_count_number = ent_index_number + 1

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']

            ent_text = text[ent_start:ent_end]  # same as: doc[ent_token_span.start:ent_token_span.end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            ent_token_span = doc.char_span(ent_start, ent_end)
            ent_token_span_start = ent_token_span.start
            ent_token_span_end = ent_token_span.end

            #print(ent_text + " - " + ent_label + " (" + str(ent_token_span_start) + ":" + str(ent_token_span_end) + ")")

            #
            # Statement concluder
            #

            if (ent_label == "TEMPERATURE" and ent_index_number >
                    0):  # TEMPERATURE is never the first ent

                prev_1_ent_index_number = ent_index_number - 1
                prev_1_ent = ents[prev_1_ent_index_number]

                if prev_1_ent['label'] == "FLASHPOINT":

                    detection_with_url_metadata = dict(metadata_from_url)
                    detection_with_url_metadata[
                        "flashpoint_value_1"] = ent_doc[0].text
                    detection_with_url_metadata[
                        "flashpoint_value_1_measurement"] = "celsius"

                    # do we have a suffix to add?
                    ent_count_number_of_next_ent = ent_count_number + 1
                    if ents_count >= ent_count_number_of_next_ent:
                        next_1_ent_index_number = ent_index_number + 1
                        next_1_ent = ents[next_1_ent_index_number]
                        if next_1_ent['label'] == "TEMPERATURE_SUFFIX":
                            next_1_ent_text = text[
                                next_1_ent['start']:next_1_ent['end']]
                            detection_with_url_metadata[
                                "flashpoint_value_1_suffix"] = next_1_ent_text

                    forward_result.append(detection_with_url_metadata)

                elif (prev_1_ent['label'] == "TEMPERATURE_PREFIX"
                      and ent_index_number > 1):

                    prev_2_ent_index_number = ent_index_number - 2
                    prev_2_ent = ents[prev_2_ent_index_number]

                    if prev_2_ent['label'] == "FLASHPOINT":

                        detection_with_url_metadata = dict(metadata_from_url)
                        detection_with_url_metadata[
                            "flashpoint_value_1"] = ent_doc[0].text
                        detection_with_url_metadata[
                            "flashpoint_value_1_measurement"] = "celsius"
                        prev_1_ent_text = text[
                            prev_1_ent['start']:prev_1_ent['end']]
                        detection_with_url_metadata[
                            "flashpoint_value_1_prefix"] = prev_1_ent_text

                        # do we have a suffix to add?
                        ent_count_number_of_next_ent = ent_count_number + 1
                        if ents_count >= ent_count_number_of_next_ent:
                            next_1_ent_index_number = ent_index_number + 1
                            next_1_ent = ents[next_1_ent_index_number]
                            if next_1_ent['label'] == "TEMPERATURE_SUFFIX":
                                next_1_ent_text = text[
                                    next_1_ent['start']:next_1_ent['end']]
                                detection_with_url_metadata[
                                    "flashpoint_value_1_suffix"] = next_1_ent_text

                        forward_result.append(detection_with_url_metadata)

    return forward_result
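
# A hedged sketch of the entity pattern the FLASHPOINT concluder above
# expects, with hypothetical offsets into the line "flammepunkt over 60 °C":
#   [{'label': 'FLASHPOINT',         'start': 0,  'end': 11},  # "flammepunkt"
#    {'label': 'TEMPERATURE_PREFIX', 'start': 12, 'end': 16},  # "over"
#    {'label': 'TEMPERATURE',        'start': 17, 'end': 19}]  # "60"
# The TEMPERATURE branch looks one entity back (FLASHPOINT directly, or a
# TEMPERATURE_PREFIX with FLASHPOINT before it) and one entity ahead for an
# optional TEMPERATURE_SUFFIX; the measurement is fixed to "celsius".
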
def create_api_response_for_post_identify_build_date_in_text_service_norwegian_chapter_input(
        forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = English()

    forward_result = []

    ### TEMP AREA
    temp_detection_dictionary = {}
    temp_check_before_reset = {}
    ###

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # new line and temp reset
        temp_detection_dictionary.clear()
        temp_check_before_reset.clear()

        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        last_index_number_of_ents = len(ents) - 1
        doc = nlp(text)

        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {}
        for key in ("regulation_year", "regulation_month", "regulation_day",
                    "regulation_id", "chapter_number", "section_number",
                    "part_number", "sub_part_number"):
            if key in result_text_service_url:
                metadata_from_url[key] = result_text_service_url[key]

        # For each ent in line
        for ent_index_number, ent in enumerate(ents):

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']

            ent_text = text[ent_start:ent_end]  # same as: doc[ent_token_span.start:ent_token_span.end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            # Note: doc.char_span() returns None when the offsets do not align
            # with token boundaries; entity offsets are assumed token-aligned here.
            ent_token_span = doc.char_span(ent_start, ent_end)
            ent_token_span_start = ent_token_span.start
            ent_token_span_end = ent_token_span.end

            #print(ent_text + " - " + ent_label + " (" + str(ent_token_span_start) + ":" + str(ent_token_span_end) + ")")

            #
            # Statement builder
            #

            if ent_label == "WATER_VESSEL":
                if "START_detected" not in temp_detection_dictionary:
                    temp_detection_dictionary["START_detected"] = True
                else:  # restart with new term
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()
                    temp_detection_dictionary["START_detected"] = True

            elif ent_label == "CONSTRUCT":
                if ("START_detected" in temp_detection_dictionary and
                        "CONSTRUCT_detected" not in temp_detection_dictionary):
                    temp_detection_dictionary["CONSTRUCT_detected"] = True
                else:  # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()

            elif ent_label == "DATE_PREFIX":
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_PREFIX_value"
                        not in temp_detection_dictionary
                        and "DATE_value_1" not in temp_detection_dictionary and
                        "DATE_SEPARATOR_value" not in temp_detection_dictionary
                        and "DATE_value_2" not in temp_detection_dictionary):
                    temp_detection_dictionary["DATE_PREFIX_value"] = ent_text
                else:  # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()

            elif ent_label == "DATE":
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_value_1" not in temp_detection_dictionary
                        and "DATE_value_1_token_end"
                        not in temp_detection_dictionary):
                    temp_detection_dictionary["DATE_value_1"] = ent_text
                    temp_detection_dictionary[
                        "DATE_value_1_token_end"] = ent_token_span_end

                elif ("START_detected" in temp_detection_dictionary
                      and "CONSTRUCT_detected" in temp_detection_dictionary
                      and "DATE_value_1" in temp_detection_dictionary
                      and "DATE_SEPARATOR_value" in temp_detection_dictionary
                      and "DATE_value_2" not in temp_detection_dictionary):
                    temp_detection_dictionary['DATE_value_2'] = ent_text
                    # because this is the last value in a statement:
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()
                else:  # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()

            elif ent_label == "DATE_SEPARATOR":
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_value_1" in temp_detection_dictionary and
                        "DATE_value_1_token_end" in temp_detection_dictionary
                        and "DATE_SEPARATOR_value"
                        not in temp_detection_dictionary):
                    # Q: Is the separator the next term after value 1?
                    if temp_detection_dictionary[
                            "DATE_value_1_token_end"] == ent_token_span_start:
                        # A: Yes, this separator is the first word after value 1
                        temp_detection_dictionary[
                            "DATE_SEPARATOR_value"] = ent_text
                    else:  # reset
                        # A: No. Reject value and reset.
                        temp_check_before_reset = dict(
                            temp_detection_dictionary)
                        temp_detection_dictionary.clear()
                else:  # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()

            #
            # Statement concluder
            # Q: Do we have what we need to build a statement?
            #

            # The statement builder has restarted.
            # Check what we have for a statement before continuing.
            if len(temp_check_before_reset) > 0:
                # If we have a double value statement
                if ("START_detected" in temp_check_before_reset
                        and "CONSTRUCT_detected" in temp_check_before_reset
                        and "DATE_value_1" in temp_check_before_reset
                        and "DATE_SEPARATOR_value" in temp_check_before_reset
                        and "DATE_value_2" in temp_check_before_reset):
                    detection_with_url_metadata = dict(metadata_from_url)
                    if "DATE_PREFIX_value" in temp_check_before_reset:
                        detection_with_url_metadata[
                            "date_context"] = temp_check_before_reset[
                                "DATE_PREFIX_value"]
                    detection_with_url_metadata[
                        "date_value_1"] = temp_check_before_reset[
                            "DATE_value_1"]
                    detection_with_url_metadata[
                        "date_separator"] = temp_check_before_reset[
                            "DATE_SEPARATOR_value"]
                    detection_with_url_metadata[
                        "date_value_2"] = temp_check_before_reset[
                            "DATE_value_2"]
                    forward_result.append(detection_with_url_metadata)
                # If we have a single value statement
                elif ("START_detected" in temp_check_before_reset
                      and "CONSTRUCT_detected" in temp_check_before_reset
                      and "DATE_value_1" in temp_check_before_reset):
                    detection_with_url_metadata = dict(metadata_from_url)
                    if "DATE_PREFIX_value" in temp_check_before_reset:
                        detection_with_url_metadata[
                            "date_context"] = temp_check_before_reset[
                                "DATE_PREFIX_value"]
                    detection_with_url_metadata[
                        "date_value_1"] = temp_check_before_reset[
                            "DATE_value_1"]
                    forward_result.append(detection_with_url_metadata)
                temp_check_before_reset.clear()

            # Conclude on current detections
            if ("START_detected" in temp_detection_dictionary
                    and "CONSTRUCT_detected" in temp_detection_dictionary
                    and "DATE_value_1" in temp_detection_dictionary
                    and "DATE_SEPARATOR_value" in temp_detection_dictionary
                    and "DATE_value_2" in temp_detection_dictionary):
                # we have a full statement.
                # add and reset.
                detection_with_url_metadata = dict(metadata_from_url)
                if "DATE_PREFIX_value" in temp_detection_dictionary:
                    detection_with_url_metadata[
                        "date_context"] = temp_detection_dictionary[
                            "DATE_PREFIX_value"]
                detection_with_url_metadata[
                    "date_value_1"] = temp_detection_dictionary["DATE_value_1"]
                detection_with_url_metadata[
                    "date_separator"] = temp_detection_dictionary[
                        "DATE_SEPARATOR_value"]
                detection_with_url_metadata[
                    "date_value_2"] = temp_detection_dictionary["DATE_value_2"]
                forward_result.append(detection_with_url_metadata)
                temp_detection_dictionary.clear()

            else:
                # get next ent
                next_ent_index_number = ent_index_number + 1
                next_ent_label = ""
                if next_ent_index_number <= last_index_number_of_ents:
                    next_ent = ents[next_ent_index_number]
                    next_ent_label = next_ent["label"]
                # Q: Do we have enough for a new statement?
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_value_1" in temp_detection_dictionary):
                    # A: Yes, we have enough for a new statement.
                    # Is the next ent relevant?
                    if ("DATE_SEPARATOR_value" not in temp_detection_dictionary
                            and next_ent_label == "DATE_SEPARATOR"):
                        continue  # we want the next ent
                    elif ("DATE_SEPARATOR_value" in temp_detection_dictionary
                          and "DATE_value_2" not in temp_detection_dictionary):
                        continue  # we know that the next value is a date
                    else:  # add the statement and move on
                        detection_with_url_metadata = dict(metadata_from_url)
                        if "DATE_PREFIX_value" in temp_detection_dictionary:
                            detection_with_url_metadata[
                                "date_context"] = temp_detection_dictionary[
                                    "DATE_PREFIX_value"]
                        detection_with_url_metadata[
                            "date_value_1"] = temp_detection_dictionary[
                                "DATE_value_1"]
                        forward_result.append(detection_with_url_metadata)
                        temp_detection_dictionary.clear()

    return forward_result
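
# The builder above is a small state machine over the entity sequence of a
# line. A hedged sketch of the two accepted label sequences (offsets elided,
# [] marks an optional entity):
#   single value : WATER_VESSEL, CONSTRUCT, [DATE_PREFIX], DATE
#   double value : WATER_VESSEL, CONSTRUCT, [DATE_PREFIX], DATE, DATE_SEPARATOR, DATE
# The separator must be the very next token after the first DATE (checked via
# DATE_value_1_token_end == ent_token_span_start). Any entity that breaks the
# expected order snapshots the partial state into temp_check_before_reset,
# which the concluder then inspects before the dictionaries are cleared.
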
def create_api_response_for_post_identify_GROSS_TONNAGE_in_text_service_norwegian_chapter(
        forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = English()
    forward_result = []

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        last_index_number_of_ents = len(ents) - 1
        doc = nlp(text)

        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {}
        for key in ("regulation_year", "regulation_month", "regulation_day",
                    "regulation_id", "chapter_number", "section_number",
                    "part_number", "sub_part_number"):
            if key in result_text_service_url:
                metadata_from_url[key] = result_text_service_url[key]

        # For each ent in line
        for ent_index_number, ent in enumerate(ents):

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']

            ent_text = text[ent_start:ent_end]  # same as: doc[ent_token_span.start:ent_token_span.end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            ent_token_span = doc.char_span(ent_start, ent_end)
            ent_token_span_start = ent_token_span.start
            ent_token_span_end = ent_token_span.end

            #print(ent_text + " - " + ent_label + " (" + str(ent_token_span_start) + ":" + str(ent_token_span_end) + ")")

            #
            # Statement concluder
            #

            if ent_label == "GROSS_TONNAGE":

                if "til" in ent_text:
                    # Example: bruttotonnasje opp til 3000
                    detection_with_url_metadata = dict(metadata_from_url)
                    detection_with_url_metadata[
                        "gross_tonnage_context"] = ent_doc[1].text + " til"
                    detection_with_url_metadata[
                        "gross_tonnage_value_1"] = ent_doc[words_in_doc_count -
                                                           1].text
                    detection_with_url_metadata["measurement_text"] = ent_doc[
                        0].text
                    forward_result.append(detection_with_url_metadata)

                elif any(ext in ent_text for ext in ["under", "over"]):
                    # Example: bruttotonnasje over 500
                    detection_with_url_metadata = dict(metadata_from_url)
                    detection_with_url_metadata[
                        "gross_tonnage_context"] = ent_doc[1].text
                    detection_with_url_metadata[
                        "gross_tonnage_value_1"] = ent_doc[words_in_doc_count -
                                                           1].text
                    detection_with_url_metadata["measurement_text"] = ent_doc[
                        0].text
                    forward_result.append(detection_with_url_metadata)

                elif "eller" in ent_text:
                    # Example: bruttotonnasje 50 eller mer
                    detection_with_url_metadata = dict(metadata_from_url)
                    detection_with_url_metadata[
                        "gross_tonnage_context"] = "eller " + ent_doc[
                            words_in_doc_count - 1].text
                    detection_with_url_metadata[
                        "gross_tonnage_value_1"] = ent_doc[1].text
                    detection_with_url_metadata["measurement_text"] = ent_doc[
                        0].text
                    forward_result.append(detection_with_url_metadata)

    return forward_result
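
# A hedged sketch of the three GROSS_TONNAGE shapes the concluder above
# handles; the entity texts are hypothetical (blank English() tokenizer):
#   "bruttotonnasje opp til 3000"  -> context "opp til",   value "3000"
#   "bruttotonnasje over 500"      -> context "over",      value "500"
#   "bruttotonnasje 50 eller mer"  -> context "eller mer", value "50"
# In every shape measurement_text is token 0, i.e. "bruttotonnasje".
_gross_tonnage_sketch = English()("bruttotonnasje opp til 3000")
assert _gross_tonnage_sketch[1].text + " til" == "opp til"  # gross_tonnage_context
assert _gross_tonnage_sketch[len(_gross_tonnage_sketch) - 1].text == "3000"  # gross_tonnage_value_1
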
def create_api_response_for_post_identify_vessel_length_overall_in_text_service_norwegian_chapter_input(forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = Norwegian()

    forward_result = []

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        text_service_url = line['title']
        text = line['text']
        ents = line['ents']

        # result
        result_length_dictionary = {}
        result_length_prefix_dictionary = {}
        result_text_service_url = get_data_from_text_service_item_url(text_service_url)

        for ent in ents:

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']
            ent_text = text[ent_start:ent_end]

            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            data = {}

            if ent_label == "LENGTH" and words_in_doc_count == 4:
                # Example: 10,67 og 15 meter
                data['length_value_1'] = ent_doc[0].text
                data['length_separator'] = ent_doc[1].text
                data['length_value_2'] = ent_doc[2].text
                data['measurement'] = ent_doc[3].text

                if not result_length_dictionary:
                    result_length_dictionary = data

            elif ent_label == "LENGTH" and words_in_doc_count == 2:
                # Example: 10,67 meter
                data['length_value'] = ent_doc[0].text
                data['measurement'] = ent_doc[1].text

                if not result_length_dictionary:
                    result_length_dictionary = data

            elif ent_label == "LENGTH_PREFIX":
                # Example: mindre enn
                data['length_prefix'] = ent_doc.text

                if not result_length_prefix_dictionary:
                    result_length_prefix_dictionary = data

        merged_line_result = result_length_dictionary | result_length_prefix_dictionary
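        # Note (runtime assumption): the dict union operator "|" requires
        # Python 3.9+. On older interpreters the equivalent spelling is
        # {**result_length_dictionary, **result_length_prefix_dictionary};
        # in both spellings, right-hand keys win on collision.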

        if "regulation_year" in result_text_service_url:
            merged_line_result['regulation_year'] = result_text_service_url['regulation_year']
        if "regulation_month" in result_text_service_url:
            merged_line_result['regulation_month'] = result_text_service_url['regulation_month']
        if "regulation_day" in result_text_service_url:
            merged_line_result['regulation_day'] = result_text_service_url['regulation_day']
        if "regulation_id" in result_text_service_url:
            merged_line_result['regulation_id'] = result_text_service_url['regulation_id']
        if "chapter_number" in result_text_service_url:
            merged_line_result['chapter_number'] = result_text_service_url['chapter_number']
        if "section_number" in result_text_service_url:
            merged_line_result['section_number'] = result_text_service_url['section_number']
        if "part_number" in result_text_service_url:
            merged_line_result['part_number'] = result_text_service_url['part_number']
        if "sub_part_number" in result_text_service_url:
            merged_line_result['sub_part_number'] = result_text_service_url['sub_part_number']

        forward_result.append(merged_line_result)

    return forward_result
def create_api_response_for_post_identify_KEEL_LAID_in_text_service_norwegian_chapter(
        forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = English()
    forward_result = []

    ### TEMP DATA
    date_counter = 0
    ###

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        last_index_number_of_ents = len(ents) - 1
        doc = nlp(text)

        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {}
        for key in ("regulation_year", "regulation_month", "regulation_day",
                    "regulation_id", "chapter_number", "section_number",
                    "part_number", "sub_part_number"):
            if key in result_text_service_url:
                metadata_from_url[key] = result_text_service_url[key]

        detection_with_url_metadata = dict(metadata_from_url)
        detection_with_url_metadata["measurement_text"] = "date"

        # For each ent in line
        for ent_index_number, ent in enumerate(ents):

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']

            ent_text = text[ent_start:ent_end]  # same as: doc[ent_token_span.start:ent_token_span.end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            ent_token_span = doc.char_span(ent_start, ent_end)
            ent_token_span_start = ent_token_span.start
            ent_token_span_end = ent_token_span.end

            #
            # Statement concluder
            #

            if ent_label == "DATE":

                date_counter = date_counter + 1
                date_value_text = "date_value_" + str(date_counter)
                detection_with_url_metadata[date_value_text] = ent_text

                #
                # Does the DATE have a prefix or suffix?
                #

                # PREFIX
                prev_ent_index_number = ent_index_number - 1
                if prev_ent_index_number >= 0:  # a DATE_PREFIX can be the first ent in the line
                    prev_ent = ents[prev_ent_index_number]
                    if prev_ent['label'] == "DATE_PREFIX":
                        prev_ent_start = prev_ent['start']
                        prev_ent_end = prev_ent['end']
                        date_value_text = "date_value_" + str(
                            date_counter) + "_prefix"
                        detection_with_url_metadata[date_value_text] = text[
                            prev_ent_start:prev_ent_end]

                # SUFFIX
                next_ent_index_number = ent_index_number + 1
                if next_ent_index_number <= last_index_number_of_ents:
                    next_ent = ents[next_ent_index_number]
                    if next_ent['label'] == "DATE_SUFFIX":
                        next_ent_start = next_ent['start']
                        next_ent_end = next_ent['end']
                        date_value_text = "date_value_" + str(
                            date_counter) + "_suffix"
                        detection_with_url_metadata[date_value_text] = text[
                            next_ent_start:next_ent_end]

        date_counter = 0
        forward_result.append(detection_with_url_metadata)

    return forward_result
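
# A hedged sketch of the output shape for a line with two DATE entities where
# the second is followed by a DATE_SUFFIX (all field values hypothetical):
#   {'measurement_text': 'date',
#    'date_value_1': '1. januar 2005',
#    'date_value_2': '1. juli 2006',
#    'date_value_2_suffix': 'eller senere',
#    ...URL metadata fields...}
# Note that exactly one dict is appended per line, even when the line
# contains no DATE entity at all.
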
def create_api_response_for_post_identify_electrical_installation_in_text_service_norwegian_chapter_input(
        title_dictionary,
        forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = Norwegian()

    forward_result = []

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # A line is interpreted as one sentence. We look for statements that
        # start with WATER_VESSEL and end with VOLTAGE. Detection of multiple
        # statements in one line is supported.

        # Data
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']

        # Metadata from URL
        metadata_from_url = {}
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        if "regulation_year" in result_text_service_url:
            metadata_from_url['regulation_year'] = result_text_service_url[
                'regulation_year']
        if "regulation_month" in result_text_service_url:
            metadata_from_url['regulation_month'] = result_text_service_url[
                'regulation_month']
        if "regulation_day" in result_text_service_url:
            metadata_from_url['regulation_day'] = result_text_service_url[
                'regulation_day']
        if "regulation_id" in result_text_service_url:
            metadata_from_url['regulation_id'] = result_text_service_url[
                'regulation_id']
        if "chapter_number" in result_text_service_url:
            metadata_from_url['chapter_number'] = result_text_service_url[
                'chapter_number']
        if "section_number" in result_text_service_url:
            metadata_from_url['section_number'] = result_text_service_url[
                'section_number']
        if "part_number" in result_text_service_url:
            metadata_from_url['part_number'] = result_text_service_url[
                'part_number']
        if "sub_part_number" in result_text_service_url:
            metadata_from_url['sub_part_number'] = result_text_service_url[
                'sub_part_number']

        # add chapter_title and section_title
        if "chapter_title" in title_dictionary:
            metadata_from_url['chapter_title'] = title_dictionary[
                'chapter_title']
        if "section_title_in_dictionary" in title_dictionary:
            section_title_dictionary = title_dictionary[
                'section_title_in_dictionary']
            if text_service_url in section_title_dictionary:
                metadata_from_url['section_title'] = section_title_dictionary[
                    text_service_url]

        # These are reset on detection of a START ent and an END ent.
        temp_detection_dictionary = {}
        temp_term_between_start_and_end_detected = False

        for ent_id, ent in enumerate(ents):

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']
            ent_text = text[ent_start:ent_end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            if ent_label == "WATER_VESSEL":
                # This is the START of a statement.
                # Therefore reset the detection dictionary:
                temp_detection_dictionary = {}
                temp_term_between_start_and_end_detected = False

            elif ent_label == "ELECTRICAL_INSTALLATION":
                temp_term_between_start_and_end_detected = True

            elif ent_label == "VOLTAGE_PREFIX":
                # Example: mindre enn
                temp_detection_dictionary['voltage_prefix'] = ent_doc.text

            elif ent_label == "VOLTAGE" and words_in_doc_count == 2:
                # This is the END of a statment.

                if temp_term_between_start_and_end_detected == True:
                    # Statment is complete.
                    # Adding result to output list.

                    # Example: 10,67 meter
                    temp_detection_dictionary['voltage_value'] = ent_doc[
                        0].text
                    temp_detection_dictionary['measurement_text'] = ent_doc[
                        1].text

                    detection_with_url_metadata = temp_detection_dictionary | metadata_from_url
                    forward_result.append(detection_with_url_metadata)

                # Reset the detection dictionary.
                temp_detection_dictionary = {}
                temp_term_between_start_and_end_detected = False

    return forward_result
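
# A hedged sketch of the statement shape the loop above accepts, as an entity
# label sequence (multiple statements per line are supported):
#   WATER_VESSEL ... ELECTRICAL_INSTALLATION ... [VOLTAGE_PREFIX] VOLTAGE
# VOLTAGE must tokenize into exactly two words, e.g. the hypothetical
# "50 volt" -> voltage_value "50", measurement_text "volt". A new
# WATER_VESSEL restarts the statement; a VOLTAGE without a preceding
# ELECTRICAL_INSTALLATION is discarded.
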
def create_api_response_for_post_identify_RADIO_AREA_in_text_service_norwegian_chapter(title_dictionary,forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = English()
    forward_result = []

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents'] # discovered entities in the line
        last_index_number_of_ents = len(ents)-1
        doc = nlp(text)

        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(text_service_url)
        metadata_from_url = {}
        for key in ("regulation_year", "regulation_month", "regulation_day",
                    "regulation_id", "chapter_number", "section_number",
                    "part_number", "sub_part_number"):
            if key in result_text_service_url:
                metadata_from_url[key] = result_text_service_url[key]

        # add chapter_title and section_title
        if "chapter_title" in title_dictionary:
            metadata_from_url['chapter_title'] = title_dictionary['chapter_title']
        if "section_title_in_dictionary" in title_dictionary:
            section_title_dictionary = title_dictionary['section_title_in_dictionary']
            if text_service_url in section_title_dictionary:
                metadata_from_url['section_title'] = section_title_dictionary[text_service_url]

        # For each ent in line
        for ent_index_number, ent in enumerate(ents):

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']

            ent_text = text[ent_start:ent_end] # same as: doc[ent_token_span.start:ent_token_span.end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            ent_token_span = doc.char_span(ent_start,ent_end)
            ent_token_span_start = ent_token_span.start
            ent_token_span_end = ent_token_span.end
            
            #
            # Statement concluder
            #

            if ent_label == "RADIO_AREA_TYPE":

                detection_with_url_metadata = dict(metadata_from_url)
                detection_with_url_metadata["radio_area_type_text"] = ent_text
                forward_result.append(detection_with_url_metadata)

    return forward_result
def create_api_response_for_post_identify_PROPULSION_POWER_in_text_service_norwegian_chapter(forward_filtered_result_with_only_the_things_we_are_looking_for):

    nlp = English()
    forward_result = []

    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:

        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents'] # discovered entities in the line
        last_index_number_of_ents = len(ents)-1
        doc = nlp(text)

        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(text_service_url)
        metadata_from_url = {}
        for key in ("regulation_year", "regulation_month", "regulation_day",
                    "regulation_id", "chapter_number", "section_number",
                    "part_number", "sub_part_number"):
            if key in result_text_service_url:
                metadata_from_url[key] = result_text_service_url[key]

        # For each ent in line
        for ent_index_number, ent in enumerate(ents):

            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']

            ent_text = text[ent_start:ent_end] # same as: doc[ent_token_span.start:ent_token_span.end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)

            ent_token_span = doc.char_span(ent_start,ent_end)
            ent_token_span_start = ent_token_span.start
            ent_token_span_end = ent_token_span.end
            
            #
            # Statement concluder
            #

            if ent_label == "PROPULSION_POWER_FACT":

                detection_with_url_metadata = dict(metadata_from_url)
                detection_with_url_metadata["propulsion_power_value_1"] = ent_doc[0].text
                detection_with_url_metadata["measurement_text"] = ent_doc[1].text

                if ent_text.endswith("eller mer"):
                    detection_with_url_metadata["propulsion_power_context"] = "eller mer"

                forward_result.append(detection_with_url_metadata)

    return forward_result
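
# A hedged sketch of the PROPULSION_POWER_FACT shape handled above, for the
# hypothetical entity text "750 kW eller mer" ("750 kW or more"):
#   propulsion_power_value_1 = "750"       (token 0)
#   measurement_text         = "kW"        (token 1)
#   propulsion_power_context = "eller mer" (because the text ends with it)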