def create_api_response_for_post_identify_alternative_reference_in_text_service_norwegian_chapter_input(
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Build API response rows for "alternative reference" detections.

    Every detected entity in every line is accepted as-is ("no need to
    filter — all ents are welcome"): each entity becomes one result row
    carrying the URL metadata of its line plus the raw entity text under
    the key ``alternative_reference``.

    :param forward_filtered_result_with_only_the_things_we_are_looking_for:
        iterable of line dicts with keys ``title`` (text-service URL),
        ``text`` (line text) and ``ents`` (entity dicts carrying ``label``
        plus ``start``/``end`` character offsets into ``text``).
    :return: list of flat result dicts.
    """
    forward_result = []
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        # Metadata parsed from the text-service URL. Keys are copied only
        # when present so partial URLs still yield partial metadata.
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {
            key: result_text_service_url[key]
            for key in ('regulation_year', 'regulation_month',
                        'regulation_day', 'regulation_id', 'chapter_number',
                        'section_number', 'part_number', 'sub_part_number')
            if key in result_text_service_url
        }
        # One result row per entity; the entity text is sliced straight out
        # of the line by character offsets (no tokenization needed here).
        for ent in ents:
            detection_with_url_metadata = dict(metadata_from_url)
            detection_with_url_metadata["alternative_reference"] = text[
                ent['start']:ent['end']]
            forward_result.append(detection_with_url_metadata)
    return forward_result
def create_api_response_for_post_identify_PROTECTED_in_text_service_norwegian_chapter(
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Build API response rows for PROTECTED detections.

    Each pre-filtered line yields exactly one row: the metadata parsed from
    the line's text-service URL plus ``protected: True``. The entities
    themselves are not inspected — the upstream filter has already decided
    that these lines match.

    :param forward_filtered_result_with_only_the_things_we_are_looking_for:
        iterable of line dicts with at least a ``title`` key holding the
        text-service URL.
    :return: list of flat result dicts, one per input line.
    """
    forward_result = []
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        text_service_url = line['title']
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        # Copy only the URL-metadata keys that are actually present.
        detection_with_url_metadata = {
            key: result_text_service_url[key]
            for key in ('regulation_year', 'regulation_month',
                        'regulation_day', 'regulation_id', 'chapter_number',
                        'section_number', 'part_number', 'sub_part_number')
            if key in result_text_service_url
        }
        detection_with_url_metadata["protected"] = True
        forward_result.append(detection_with_url_metadata)
    return forward_result
def create_api_response_for_post_identify_PASSENGER_in_text_service_norwegian_chapter(
        title_dictionary,
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Build API response rows for PASSENGER-count detections.

    Two phrase shapes are recognised in a PASSENGER entity:

    * ``"<N> ... eller færre"``  -> context ``"eller færre"``, value = token 0
    * ``"mer enn <N> ..."``      -> context ``"mer enn"``,     value = token 2
      (tokens 0 and 1 are "mer" and "enn", so token 2 is the number)

    In both cases the last token of the entity is recorded as
    ``measurement_text`` (presumably the unit word, e.g. "passasjerer" —
    confirm against the NER patterns).

    :param title_dictionary: optional ``chapter_title`` plus a
        ``section_title_in_dictionary`` mapping of text-service URL ->
        section title; matching entries are merged into each row.
    :param forward_filtered_result_with_only_the_things_we_are_looking_for:
        iterable of line dicts (``title``/``text``/``ents``).
    :return: list of flat result dicts.
    """
    nlp = English()
    forward_result = []
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {
            key: result_text_service_url[key]
            for key in ('regulation_year', 'regulation_month',
                        'regulation_day', 'regulation_id', 'chapter_number',
                        'section_number', 'part_number', 'sub_part_number')
            if key in result_text_service_url
        }
        # Add chapter_title and section_title when available.
        if "chapter_title" in title_dictionary:
            metadata_from_url['chapter_title'] = title_dictionary[
                'chapter_title']
        if "section_title_in_dictionary" in title_dictionary:
            section_title_dictionary = title_dictionary[
                'section_title_in_dictionary']
            if text_service_url in section_title_dictionary:
                metadata_from_url['section_title'] = section_title_dictionary[
                    text_service_url]
        for ent in ents:
            if ent['label'] != "PASSENGER":
                continue
            ent_text = text[ent['start']:ent['end']]
            ent_doc = nlp(ent_text)  # tokenize the entity phrase
            words_in_doc_count = len(ent_doc)
            if "eller færre" in ent_text:
                detection_with_url_metadata = dict(metadata_from_url)
                detection_with_url_metadata[
                    "passenger_context"] = "eller færre"
                detection_with_url_metadata["passenger_value_1"] = ent_doc[
                    0].text
                detection_with_url_metadata["measurement_text"] = ent_doc[
                    words_in_doc_count - 1].text
                forward_result.append(detection_with_url_metadata)
            elif ent_text.startswith("mer enn"):
                detection_with_url_metadata = dict(metadata_from_url)
                detection_with_url_metadata["passenger_context"] = "mer enn"
                detection_with_url_metadata["passenger_value_1"] = ent_doc[
                    2].text
                detection_with_url_metadata["measurement_text"] = ent_doc[
                    words_in_doc_count - 1].text
                forward_result.append(detection_with_url_metadata)
    return forward_result
def create_api_response_for_post_identify_FLASHPOINT_in_text_service_norwegian_chapter(
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Build API response rows for FLASHPOINT temperature detections.

    A row is produced for each TEMPERATURE entity whose preceding entities
    form one of two shapes:

    * ``FLASHPOINT, TEMPERATURE``                       (direct)
    * ``FLASHPOINT, TEMPERATURE_PREFIX, TEMPERATURE``   (with prefix,
      e.g. "under"; the prefix text is stored as
      ``flashpoint_value_1_prefix``)

    In both shapes, if the entity directly after the TEMPERATURE is a
    TEMPERATURE_SUFFIX its text is added as ``flashpoint_value_1_suffix``.
    The temperature value is the first token of the TEMPERATURE entity and
    the measurement is hard-coded to "celsius".

    :param forward_filtered_result_with_only_the_things_we_are_looking_for:
        iterable of line dicts (``title``/``text``/``ents``).
    :return: list of flat result dicts.
    """
    nlp = English()
    forward_result = []
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        last_index_number_of_ents = len(ents) - 1
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {
            key: result_text_service_url[key]
            for key in ('regulation_year', 'regulation_month',
                        'regulation_day', 'regulation_id', 'chapter_number',
                        'section_number', 'part_number', 'sub_part_number')
            if key in result_text_service_url
        }
        for ent_index_number, ent in enumerate(ents):
            # TEMPERATURE is never the first ent, so index 0 is skipped.
            if ent['label'] != "TEMPERATURE" or ent_index_number == 0:
                continue
            ent_text = text[ent['start']:ent['end']]
            ent_doc = nlp(ent_text)  # tokenize to pull out the number
            prev_1_ent = ents[ent_index_number - 1]
            if prev_1_ent['label'] == "FLASHPOINT":
                detection_with_url_metadata = dict(metadata_from_url)
                detection_with_url_metadata["flashpoint_value_1"] = ent_doc[
                    0].text
                detection_with_url_metadata[
                    "flashpoint_value_1_measurement"] = "celsius"
                # Do we have a suffix to add?
                next_1_ent_index_number = ent_index_number + 1
                if next_1_ent_index_number <= last_index_number_of_ents:
                    next_1_ent = ents[next_1_ent_index_number]
                    if next_1_ent['label'] == "TEMPERATURE_SUFFIX":
                        detection_with_url_metadata[
                            "flashpoint_value_1_suffix"] = text[
                                next_1_ent['start']:next_1_ent['end']]
                forward_result.append(detection_with_url_metadata)
            elif (prev_1_ent['label'] == "TEMPERATURE_PREFIX"
                  and ent_index_number > 1):
                prev_2_ent = ents[ent_index_number - 2]
                if prev_2_ent['label'] == "FLASHPOINT":
                    detection_with_url_metadata = dict(metadata_from_url)
                    detection_with_url_metadata[
                        "flashpoint_value_1"] = ent_doc[0].text
                    detection_with_url_metadata[
                        "flashpoint_value_1_measurement"] = "celsius"
                    detection_with_url_metadata[
                        "flashpoint_value_1_prefix"] = text[
                            prev_1_ent['start']:prev_1_ent['end']]
                    # Do we have a suffix to add?
                    next_1_ent_index_number = ent_index_number + 1
                    if next_1_ent_index_number <= last_index_number_of_ents:
                        next_1_ent = ents[next_1_ent_index_number]
                        if next_1_ent['label'] == "TEMPERATURE_SUFFIX":
                            detection_with_url_metadata[
                                "flashpoint_value_1_suffix"] = text[
                                    next_1_ent['start']:next_1_ent['end']]
                    forward_result.append(detection_with_url_metadata)
    return forward_result
def create_api_response_for_post_identify_build_date_in_text_service_norwegian_chapter_input(
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Build API response rows for vessel build-date statements.

    Runs a small state machine over each line's entities, looking for the
    sequence WATER_VESSEL (start) -> CONSTRUCT -> [DATE_PREFIX] -> DATE
    [-> DATE_SEPARATOR -> DATE] and emitting either a single-date row
    (``date_value_1``) or a date-range row (``date_value_1`` /
    ``date_separator`` / ``date_value_2``), each merged with the line's
    URL metadata. ``DATE_PREFIX`` text, when present, becomes
    ``date_context``.

    State lives in two dicts:
    * ``temp_detection_dictionary`` — the statement currently being built.
    * ``temp_check_before_reset``  — a snapshot taken whenever the builder
      resets, so a completed statement isn't lost by the reset.

    :param forward_filtered_result_with_only_the_things_we_are_looking_for:
        iterable of line dicts (``title``/``text``/``ents``).
    :return: list of flat result dicts.
    """
    nlp = English()
    forward_result = []
    ### TEMP AREA
    temp_detection_dictionary = {}
    temp_check_before_reset = {}
    ###
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        # New line: statements never span lines, so reset all temp state.
        temp_detection_dictionary.clear()
        temp_check_before_reset.clear()
        # Get NLP data from line
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        last_index_number_of_ents = len(ents) - 1
        doc = nlp(text)  # tokenized line; used for char_span token offsets
        # Get metadata from URL
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {}
        if "regulation_year" in result_text_service_url:
            metadata_from_url['regulation_year'] = result_text_service_url[
                'regulation_year']
        if "regulation_month" in result_text_service_url:
            metadata_from_url['regulation_month'] = result_text_service_url[
                'regulation_month']
        if "regulation_day" in result_text_service_url:
            metadata_from_url['regulation_day'] = result_text_service_url[
                'regulation_day']
        if "regulation_id" in result_text_service_url:
            metadata_from_url['regulation_id'] = result_text_service_url[
                'regulation_id']
        if "chapter_number" in result_text_service_url:
            metadata_from_url['chapter_number'] = result_text_service_url[
                'chapter_number']
        if "section_number" in result_text_service_url:
            metadata_from_url['section_number'] = result_text_service_url[
                'section_number']
        if "part_number" in result_text_service_url:
            metadata_from_url['part_number'] = result_text_service_url[
                'part_number']
        if "sub_part_number" in result_text_service_url:
            metadata_from_url['sub_part_number'] = result_text_service_url[
                'sub_part_number']
        # For each ent in line
        for ent_index_number, ent in enumerate(ents):
            ent_label = ent['label']
            ent_start = ent['start']
            ent_end = ent['end']
            ent_text = text[
                ent_start:
                ent_end]  # same as: doc[ent_token_span.start:ent_token_span.end]
            ent_doc = nlp(ent_text)
            words_in_doc_count = len(ent_doc)
            # Token-level span of the entity within the line doc.
            # NOTE(review): char_span returns None when the character span
            # does not align with token boundaries — .start would then
            # raise AttributeError; presumably the NER offsets always align.
            ent_token_span = doc.char_span(ent_start, ent_end)
            ent_token_span_start = ent_token_span.start
            ent_token_span_end = ent_token_span.end
            #
            # Statement builder: advance or reset the state machine.
            # Every reset snapshots the old state into
            # temp_check_before_reset so the concluder below can still
            # emit a statement that was complete at reset time.
            #
            if ent_label == "WATER_VESSEL":
                if "START_detected" not in temp_detection_dictionary:
                    temp_detection_dictionary["START_detected"] = True
                else:
                    # restart with new term
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()
                    temp_detection_dictionary["START_detected"] = True
            elif ent_label == "CONSTRUCT":
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" not in temp_detection_dictionary):
                    temp_detection_dictionary["CONSTRUCT_detected"] = True
                else:
                    # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()
            elif ent_label == "DATE_PREFIX":
                # A prefix is only valid before any date parts were seen.
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_PREFIX_value" not in temp_detection_dictionary
                        and "DATE_value_1" not in temp_detection_dictionary
                        and "DATE_SEPARATOR_value" not in temp_detection_dictionary
                        and "DATE_value_2" not in temp_detection_dictionary):
                    temp_detection_dictionary["DATE_PREFIX_value"] = ent_text
                else:
                    # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()
            elif ent_label == "DATE":
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_value_1" not in temp_detection_dictionary
                        and "DATE_value_1_token_end" not in temp_detection_dictionary):
                    # First date of the statement; remember where it ends
                    # (token index) so a separator can be adjacency-checked.
                    temp_detection_dictionary["DATE_value_1"] = ent_text
                    temp_detection_dictionary[
                        "DATE_value_1_token_end"] = ent_token_span_end
                elif ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_value_1" in temp_detection_dictionary
                        and "DATE_SEPARATOR_value" in temp_detection_dictionary
                        and "DATE_value_2" not in temp_detection_dictionary):
                    temp_detection_dictionary['DATE_value_2'] = ent_text
                    # because this is the last value in a statement:
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()
                else:
                    # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()
            elif ent_label == "DATE_SEPARATOR":
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_value_1" in temp_detection_dictionary
                        and "DATE_value_1_token_end" in temp_detection_dictionary
                        and "DATE_SEPARATOR_value" not in temp_detection_dictionary):
                    # Q: Is the separator the next term after value 1?
                    if temp_detection_dictionary[
                            "DATE_value_1_token_end"] == ent_token_span_start:
                        # A: Yes, this separator is the first word after value 1
                        temp_detection_dictionary[
                            "DATE_SEPARATOR_value"] = ent_text
                    else:
                        # A: No. Reject value and reset.
                        temp_check_before_reset = dict(
                            temp_detection_dictionary)
                        temp_detection_dictionary.clear()
                else:
                    # reset
                    temp_check_before_reset = dict(temp_detection_dictionary)
                    temp_detection_dictionary.clear()
            #
            # Statement concluder
            # Q: Do we have what we need to build a statement?
            #
            # The statement builder has restarted.
            # Check what we had for a statement before continuing.
            if len(temp_check_before_reset) > 0:
                # If we have a double value statement
                if ("START_detected" in temp_check_before_reset
                        and "CONSTRUCT_detected" in temp_check_before_reset
                        and "DATE_value_1" in temp_check_before_reset
                        and "DATE_SEPARATOR_value" in temp_check_before_reset
                        and "DATE_value_2" in temp_check_before_reset):
                    detection_with_url_metadata = dict(metadata_from_url)
                    if "DATE_PREFIX_value" in temp_check_before_reset:
                        detection_with_url_metadata[
                            "date_context"] = temp_check_before_reset[
                                "DATE_PREFIX_value"]
                    detection_with_url_metadata[
                        "date_value_1"] = temp_check_before_reset[
                            "DATE_value_1"]
                    detection_with_url_metadata[
                        "date_separator"] = temp_check_before_reset[
                            "DATE_SEPARATOR_value"]
                    detection_with_url_metadata[
                        "date_value_2"] = temp_check_before_reset[
                            "DATE_value_2"]
                    forward_result.append(detection_with_url_metadata)
                # If we have a single value statement
                elif ("START_detected" in temp_check_before_reset
                        and "CONSTRUCT_detected" in temp_check_before_reset
                        and "DATE_value_1" in temp_check_before_reset):
                    detection_with_url_metadata = dict(metadata_from_url)
                    if "DATE_PREFIX_value" in temp_check_before_reset:
                        detection_with_url_metadata[
                            "date_context"] = temp_check_before_reset[
                                "DATE_PREFIX_value"]
                    detection_with_url_metadata[
                        "date_value_1"] = temp_check_before_reset[
                            "DATE_value_1"]
                    forward_result.append(detection_with_url_metadata)
                temp_check_before_reset.clear()
            # Conclude on current detections
            if ("START_detected" in temp_detection_dictionary
                    and "CONSTRUCT_detected" in temp_detection_dictionary
                    and "DATE_value_1" in temp_detection_dictionary
                    and "DATE_SEPARATOR_value" in temp_detection_dictionary
                    and "DATE_value_2" in temp_detection_dictionary):
                # We have a full statement: add and reset.
                detection_with_url_metadata = dict(metadata_from_url)
                if "DATE_PREFIX_value" in temp_detection_dictionary:
                    detection_with_url_metadata[
                        "date_context"] = temp_detection_dictionary[
                            "DATE_PREFIX_value"]
                detection_with_url_metadata[
                    "date_value_1"] = temp_detection_dictionary["DATE_value_1"]
                detection_with_url_metadata[
                    "date_separator"] = temp_detection_dictionary[
                        "DATE_SEPARATOR_value"]
                detection_with_url_metadata[
                    "date_value_2"] = temp_detection_dictionary["DATE_value_2"]
                forward_result.append(detection_with_url_metadata)
                temp_detection_dictionary.clear()
            else:
                # Peek at the next ent (empty label when at end of line).
                next_ent_index_number = ent_index_number + 1
                next_ent_label = ""
                if next_ent_index_number <= last_index_number_of_ents:
                    next_ent = ents[next_ent_index_number]
                    next_ent_label = next_ent["label"]
                # Q: Do we have enough for a new statement?
                if ("START_detected" in temp_detection_dictionary
                        and "CONSTRUCT_detected" in temp_detection_dictionary
                        and "DATE_value_1" in temp_detection_dictionary):
                    # A: Yes, we have enough for a new statement.
                    # Is the next ent relevant?
                    if ("DATE_SEPARATOR_value" not in temp_detection_dictionary
                            and next_ent_label == "DATE_SEPARATOR"):
                        continue  # we want the next ent
                    elif ("DATE_SEPARATOR_value" in temp_detection_dictionary
                            and "DATE_value_2" not in temp_detection_dictionary):
                        continue  # we know that the next value is a date
                    else:
                        # Add the single-date statement and move on.
                        detection_with_url_metadata = dict(metadata_from_url)
                        if "DATE_PREFIX_value" in temp_detection_dictionary:
                            detection_with_url_metadata[
                                "date_context"] = temp_detection_dictionary[
                                    "DATE_PREFIX_value"]
                        detection_with_url_metadata[
                            "date_value_1"] = temp_detection_dictionary[
                                "DATE_value_1"]
                        forward_result.append(detection_with_url_metadata)
                        temp_detection_dictionary.clear()
    return forward_result
def create_api_response_for_post_identify_GROSS_TONNAGE_in_text_service_norwegian_chapter(
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Build API response rows for GROSS_TONNAGE detections.

    Three phrase shapes are recognised inside a GROSS_TONNAGE entity,
    checked in this order (first match wins):

    * contains "til"            — e.g. "bruttotonnasje opp til 3000":
      context = "<token 1> til", value = last token
    * contains "under"/"over"   — e.g. "bruttotonnasje over 500":
      context = token 1, value = last token
    * contains "eller"          — e.g. "bruttotonnasje 50 eller mer":
      context = "eller <last token>", value = token 1

    Token 0 is always recorded as ``measurement_text`` (the leading
    measurement word, e.g. "bruttotonnasje").

    :param forward_filtered_result_with_only_the_things_we_are_looking_for:
        iterable of line dicts (``title``/``text``/``ents``).
    :return: list of flat result dicts.
    """
    nlp = English()
    forward_result = []
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {
            key: result_text_service_url[key]
            for key in ('regulation_year', 'regulation_month',
                        'regulation_day', 'regulation_id', 'chapter_number',
                        'section_number', 'part_number', 'sub_part_number')
            if key in result_text_service_url
        }
        for ent in ents:
            if ent['label'] != "GROSS_TONNAGE":
                continue
            ent_text = text[ent['start']:ent['end']]
            ent_doc = nlp(ent_text)  # tokenize the entity phrase
            last_token_index = len(ent_doc) - 1
            if "til" in ent_text:
                # Example: bruttotonnasje opp til 3000
                detection_with_url_metadata = dict(metadata_from_url)
                detection_with_url_metadata[
                    "gross_tonnage_context"] = ent_doc[1].text + " til"
                detection_with_url_metadata["gross_tonnage_value_1"] = ent_doc[
                    last_token_index].text
                detection_with_url_metadata["measurement_text"] = ent_doc[
                    0].text
                forward_result.append(detection_with_url_metadata)
            elif any(word in ent_text for word in ("under", "over")):
                # Example: bruttotonnasje over 500
                detection_with_url_metadata = dict(metadata_from_url)
                detection_with_url_metadata["gross_tonnage_context"] = ent_doc[
                    1].text
                detection_with_url_metadata["gross_tonnage_value_1"] = ent_doc[
                    last_token_index].text
                detection_with_url_metadata["measurement_text"] = ent_doc[
                    0].text
                forward_result.append(detection_with_url_metadata)
            elif "eller" in ent_text:
                # Example: bruttotonnasje 50 eller mer
                detection_with_url_metadata = dict(metadata_from_url)
                detection_with_url_metadata[
                    "gross_tonnage_context"] = "eller " + ent_doc[
                        last_token_index].text
                detection_with_url_metadata["gross_tonnage_value_1"] = ent_doc[
                    1].text
                detection_with_url_metadata["measurement_text"] = ent_doc[
                    0].text
                forward_result.append(detection_with_url_metadata)
    return forward_result
def create_api_response_for_post_identify_vessel_length_overall_in_text_service_norwegian_chapter_input(
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Build API response rows for vessel length-overall detections.

    Each line yields exactly one row merging (a) the first LENGTH entity —
    either a range ("10,67 og 15 meter", 4 tokens) or a single value
    ("10,67 meter", 2 tokens) — (b) the first LENGTH_PREFIX entity
    (e.g. "mindre enn"), and (c) the URL metadata. Later LENGTH /
    LENGTH_PREFIX entities on the same line are ignored (first wins).

    :param forward_filtered_result_with_only_the_things_we_are_looking_for:
        iterable of line dicts (``title``/``text``/``ents``).
    :return: list of flat result dicts (possibly metadata-only when a line
        carried no usable LENGTH entity).
    """
    nlp = Norwegian()
    forward_result = []
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']
        result_length_dictionary = {}
        result_length_prefix_dictionary = {}
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        for ent in ents:
            ent_label = ent['label']
            ent_text = text[ent['start']:ent['end']]
            ent_doc = nlp(ent_text)  # tokenize the entity phrase
            words_in_doc_count = len(ent_doc)
            if ent_label == "LENGTH" and words_in_doc_count == 4:
                # Example: 10,67 og 15 meter  (range: value, sep, value, unit)
                if not result_length_dictionary:  # first detection wins
                    result_length_dictionary = {
                        'length_value_1': ent_doc[0].text,
                        'length_separator': ent_doc[1].text,
                        'length_value_2': ent_doc[2].text,
                        'measurement': ent_doc[3].text,
                    }
            elif ent_label == "LENGTH" and words_in_doc_count == 2:
                # Example: 10,67 meter  (single value, unit)
                if not result_length_dictionary:  # first detection wins
                    result_length_dictionary = {
                        'length_value': ent_doc[0].text,
                        'measurement': ent_doc[1].text,
                    }
            elif ent_label == "LENGTH_PREFIX":
                # Example: mindre enn
                if not result_length_prefix_dictionary:  # first wins
                    result_length_prefix_dictionary = {
                        'length_prefix': ent_doc.text,
                    }
        # Merge detections, then overlay the URL metadata (only the keys
        # actually present in the parsed URL).
        merged_line_result = (result_length_dictionary
                              | result_length_prefix_dictionary)
        for key in ('regulation_year', 'regulation_month', 'regulation_day',
                    'regulation_id', 'chapter_number', 'section_number',
                    'part_number', 'sub_part_number'):
            if key in result_text_service_url:
                merged_line_result[key] = result_text_service_url[key]
        forward_result.append(merged_line_result)
    return forward_result
def create_api_response_for_post_identify_KEEL_LAID_in_text_service_norwegian_chapter(
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Build API response rows for keel-laid date detections.

    Each line yields exactly one row containing the URL metadata,
    ``measurement_text: "date"`` and one ``date_value_<n>`` entry per DATE
    entity found in the line (numbered from 1 in order of appearance).
    When the entity directly before/after a DATE is a DATE_PREFIX /
    DATE_SUFFIX, its text is added as ``date_value_<n>_prefix`` /
    ``date_value_<n>_suffix``.

    BUG FIX: the original guarded the prefix lookup with
    ``prev_ent_index_number > 0``, which silently skipped a DATE_PREFIX
    sitting at index 0 (i.e. a DATE at index 1 never got its prefix).
    The guard is now ``>= 0``, mirroring the suffix guard that correctly
    allows the last index.

    :param forward_filtered_result_with_only_the_things_we_are_looking_for:
        iterable of line dicts (``title``/``text``/``ents``).
    :return: list of flat result dicts, one per input line.
    """
    forward_result = []
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line
        last_index_number_of_ents = len(ents) - 1
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        detection_with_url_metadata = {
            key: result_text_service_url[key]
            for key in ('regulation_year', 'regulation_month',
                        'regulation_day', 'regulation_id', 'chapter_number',
                        'section_number', 'part_number', 'sub_part_number')
            if key in result_text_service_url
        }
        detection_with_url_metadata["measurement_text"] = "date"
        date_counter = 0  # numbers the DATE hits within this line
        for ent_index_number, ent in enumerate(ents):
            if ent['label'] != "DATE":
                continue
            date_counter += 1
            detection_with_url_metadata[
                "date_value_" + str(date_counter)] = text[
                    ent['start']:ent['end']]
            # PREFIX: is the entity directly before this DATE a DATE_PREFIX?
            prev_ent_index_number = ent_index_number - 1
            if prev_ent_index_number >= 0:  # was `> 0`: missed index 0
                prev_ent = ents[prev_ent_index_number]
                if prev_ent['label'] == "DATE_PREFIX":
                    detection_with_url_metadata[
                        "date_value_" + str(date_counter) + "_prefix"] = text[
                            prev_ent['start']:prev_ent['end']]
            # SUFFIX: is the entity directly after this DATE a DATE_SUFFIX?
            next_ent_index_number = ent_index_number + 1
            if next_ent_index_number <= last_index_number_of_ents:
                next_ent = ents[next_ent_index_number]
                if next_ent['label'] == "DATE_SUFFIX":
                    detection_with_url_metadata[
                        "date_value_" + str(date_counter) + "_suffix"] = text[
                            next_ent['start']:next_ent['end']]
        forward_result.append(detection_with_url_metadata)
    return forward_result
def create_api_response_for_post_identify_electrical_installation_in_text_service_norwegian_chapter_input(
        title_dictionary,
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Build API response rows for electrical-installation voltage statements.

    A line is interpreted as one sentence. A statement starts at a
    WATER_VESSEL entity, must contain an ELECTRICAL_INSTALLATION entity,
    and ends at a two-token VOLTAGE entity (value + unit). An optional
    VOLTAGE_PREFIX (e.g. "mindre enn") seen along the way is recorded as
    ``voltage_prefix``. Multiple statements per line are supported; state
    resets on every WATER_VESSEL and after each completed statement.

    :param title_dictionary: optional ``chapter_title`` plus a
        ``section_title_in_dictionary`` mapping of text-service URL ->
        section title; matching entries are merged into each row.
    :param forward_filtered_result_with_only_the_things_we_are_looking_for:
        iterable of line dicts (``title``/``text``/``ents``).
    :return: list of flat result dicts, one per completed statement.
    """
    nlp = Norwegian()
    forward_result = []
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {
            key: result_text_service_url[key]
            for key in ('regulation_year', 'regulation_month',
                        'regulation_day', 'regulation_id', 'chapter_number',
                        'section_number', 'part_number', 'sub_part_number')
            if key in result_text_service_url
        }
        # Add chapter_title and section_title when available.
        if "chapter_title" in title_dictionary:
            metadata_from_url['chapter_title'] = title_dictionary[
                'chapter_title']
        if "section_title_in_dictionary" in title_dictionary:
            section_title_dictionary = title_dictionary[
                'section_title_in_dictionary']
            if text_service_url in section_title_dictionary:
                metadata_from_url['section_title'] = section_title_dictionary[
                    text_service_url]
        # Statement state; reset on each WATER_VESSEL (START) and after
        # each completed VOLTAGE (END).
        temp_detection_dictionary = {}
        temp_term_between_start_and_end_detected = False
        for ent in ents:
            ent_label = ent['label']
            ent_text = text[ent['start']:ent['end']]
            ent_doc = nlp(ent_text)  # tokenize the entity phrase
            words_in_doc_count = len(ent_doc)
            if ent_label == "WATER_VESSEL":
                # START of a statement: reset detection state.
                temp_detection_dictionary = {}
                temp_term_between_start_and_end_detected = False
            elif ent_label == "ELECTRICAL_INSTALLATION":
                temp_term_between_start_and_end_detected = True
            elif ent_label == "VOLTAGE_PREFIX":
                # Example: mindre enn
                temp_detection_dictionary['voltage_prefix'] = ent_doc.text
            elif ent_label == "VOLTAGE" and words_in_doc_count == 2:
                # END of a statement (two tokens: value + unit).
                if temp_term_between_start_and_end_detected:
                    # Statement is complete; add result row. URL metadata
                    # wins over detection keys on conflict (right operand).
                    temp_detection_dictionary['voltage_value'] = ent_doc[
                        0].text
                    temp_detection_dictionary['measurement_text'] = ent_doc[
                        1].text
                    detection_with_url_metadata = (temp_detection_dictionary
                                                   | metadata_from_url)
                    forward_result.append(detection_with_url_metadata)
                # Reset detection state.
                temp_detection_dictionary = {}
                temp_term_between_start_and_end_detected = False
    return forward_result
def create_api_response_for_post_identify_RADIO_AREA_in_text_service_norwegian_chapter(
        title_dictionary,
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Emit one response row per RADIO_AREA_TYPE ent.

    Each row combines the metadata parsed from the line's text-service URL
    (plus optional chapter/section titles from ``title_dictionary``) with
    the ent's raw text under the key 'radio_area_type_text'.

    Args:
        title_dictionary: may carry 'chapter_title' and
            'section_title_in_dictionary' (a URL -> section-title map).
        forward_filtered_result_with_only_the_things_we_are_looking_for:
            iterable of dicts with 'title' (text-service URL), 'text'
            (sentence text) and 'ents' (list of {'label','start','end'}).

    Returns:
        List of result dicts, one per RADIO_AREA_TYPE ent.
    """
    # NOTE(review): the original built an English() pipeline and tokenized
    # every line/ent, but never used any of it (doc, ent_doc, char spans were
    # all dead locals) — removed. Verify no tokenization side effect was
    # intended; sibling functions use Norwegian() for Norwegian text.
    forward_result = []
    metadata_keys = (
        'regulation_year', 'regulation_month', 'regulation_day',
        'regulation_id', 'chapter_number', 'section_number',
        'part_number', 'sub_part_number',
    )
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line

        # Metadata from URL.
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {
            key: result_text_service_url[key]
            for key in metadata_keys
            if key in result_text_service_url
        }

        # Add chapter_title and section_title when available.
        if "chapter_title" in title_dictionary:
            metadata_from_url['chapter_title'] = title_dictionary[
                'chapter_title']
        if "section_title_in_dictionary" in title_dictionary:
            section_title_dictionary = title_dictionary[
                'section_title_in_dictionary']
            if text_service_url in section_title_dictionary:
                metadata_from_url['section_title'] = section_title_dictionary[
                    text_service_url]

        # Statement concluder: every RADIO_AREA_TYPE ent yields one row.
        for ent in ents:
            if ent['label'] == "RADIO_AREA_TYPE":
                detection_with_url_metadata = dict(metadata_from_url)
                detection_with_url_metadata["radio_area_type_text"] = text[
                    ent['start']:ent['end']]
                forward_result.append(detection_with_url_metadata)
    return forward_result
def create_api_response_for_post_identify_PROPULSION_POWER_in_text_service_norwegian_chapter(
        forward_filtered_result_with_only_the_things_we_are_looking_for):
    """Emit one response row per PROPULSION_POWER_FACT ent.

    Each row combines metadata parsed from the line's text-service URL with
    the ent's first token as 'propulsion_power_value_1' and second token as
    'measurement_text'. If the ent text ends with "eller mer" ("or more"),
    that phrase is recorded under 'propulsion_power_context'.

    Args:
        forward_filtered_result_with_only_the_things_we_are_looking_for:
            iterable of dicts with 'title' (text-service URL), 'text'
            (sentence text) and 'ents' (list of {'label','start','end'}).

    Returns:
        List of result dicts, one per PROPULSION_POWER_FACT ent with at
        least two tokens.
    """
    nlp = English()
    forward_result = []
    metadata_keys = (
        'regulation_year', 'regulation_month', 'regulation_day',
        'regulation_id', 'chapter_number', 'section_number',
        'part_number', 'sub_part_number',
    )
    for line in forward_filtered_result_with_only_the_things_we_are_looking_for:
        text_service_url = line['title']
        text = line['text']
        ents = line['ents']  # discovered entities in the line

        # Metadata from URL.
        result_text_service_url = get_data_from_text_service_item_url(
            text_service_url)
        metadata_from_url = {
            key: result_text_service_url[key]
            for key in metadata_keys
            if key in result_text_service_url
        }

        # Statement concluder: each PROPULSION_POWER_FACT ent yields one row.
        for ent in ents:
            if ent['label'] != "PROPULSION_POWER_FACT":
                continue
            ent_text = text[ent['start']:ent['end']]
            ent_doc = nlp(ent_text)
            # Guard against a malformed ent with fewer than two tokens,
            # which previously raised IndexError on ent_doc[1]; the sibling
            # VOLTAGE handler checks the token count the same way.
            if len(ent_doc) < 2:
                continue
            detection_with_url_metadata = dict(metadata_from_url)
            # First token is the numeric value, second the unit,
            # e.g. "750 kW".
            detection_with_url_metadata["propulsion_power_value_1"] = ent_doc[0].text
            detection_with_url_metadata["measurement_text"] = ent_doc[1].text
            if ent_text.endswith("eller mer"):
                # "eller mer" = "or more" (Norwegian).
                detection_with_url_metadata["propulsion_power_context"] = "eller mer"
            forward_result.append(detection_with_url_metadata)
    return forward_result