def enforce_AT_schema_on_embedding_processors(pipe):
    """Normalize embedding column names in the pipe to AT notation.

    For every embedding provider and consumer, enforce that the embedding
    column is named <output_level>@<storage_ref> (output_levels: word, chunk,
    sentence aka document), e.g. ``word_embed@elmo`` or ``sentence_embed@elmo``.
    Components already using '@' notation are left untouched.

    :param pipe: NLU pipeline whose components are updated in place
    :return: the same pipe, with provider outputs and consumer inputs renamed
    """
    for component in pipe.components:
        # Providers: rename their single output column to AT notation.
        if ComponentUtils.is_embedding_provider(component):
            if '@' not in component.info.outputs[0]:
                at_ref = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
                    component, 'output')
                component.info.outputs = [at_ref]
                component.info.spark_output_column_names = [at_ref]
                component.model.setOutputCol(at_ref)
        # Consumers: swap their plain embed input column for the AT-notated one.
        if ComponentUtils.is_embedding_consumer(component):
            embed_col = ComponentUtils.extract_embed_col(component)
            if '@' not in embed_col:
                at_ref = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
                    component, 'input')
                component.info.inputs.remove(embed_col)
                component.info.inputs.append(at_ref)
                component.info.spark_input_column_names.remove(embed_col)
                component.info.spark_input_column_names.append(at_ref)
                component.model.setInputCols(component.info.inputs)
    return pipe
def check_and_fix_component_order(pipe: NLUPipeline):
    '''
    This method takes care that the order of components is the correct in such a way,that the pipeline can be iteratively processed by spark NLP.
    Column Names will not be touched. DAG Task Sort basically.
    '''
    logger.info("Starting to optimize component order ")
    correct_order_component_pipeline = []  # components in resolved (topological) order
    all_components_orderd = False
    all_components = pipe.components  # NOTE: alias, components are removed from this list as they are sorted
    provided_features = []  # output columns made available by already-sorted components
    update_last_type = False
    last_type_sorted = None  # type of the last sorted component; used to keep same-type components adjacent
    trainable_updated = False
    while all_components_orderd == False:
        # Reset the type restriction only after a full pass found no candidate of the last type.
        if update_last_type:
            last_type_sorted = None
        else:
            update_last_type = True
        for component in all_components:
            logger.info(
                f"Optimizing order for component {component.info.name}")
            input_columns = ComponentUtils.clean_irrelevant_features(
                component.info.spark_input_column_names, False)
            if last_type_sorted is None or component.info.type == last_type_sorted:
                # A component is ready once all its inputs are already provided.
                if set(input_columns).issubset(provided_features):
                    correct_order_component_pipeline.append(component)
                    # NOTE: mutating the list we are iterating over; the break below
                    # restarts iteration so this stays safe.
                    if component in all_components:
                        all_components.remove(component)
                    # for feature in component.info.spark_output_column_names: provided_features.append(feature)
                    provided_features += ComponentUtils.clean_irrelevant_features(
                        component.info.spark_output_column_names, False)
                    last_type_sorted = component.info.type
                    update_last_type = False
                    break
        if len(all_components) == 0:
            all_components_orderd = True
        if len(
                all_components
        ) == 1 and pipe.has_trainable_components and not trainable_updated and 'approach' in str(
                all_components[0].model
        ).lower(
        ) and 'sentence_embeddings@' in all_components[0].info.inputs:
            # special case, if trainable then we feed embed consumers on the first sentence embed provider
            # 1. Find first sent embed provider
            # 2. substitute any 'sent_embed@' consumer inputs for the provider col
            for f in provided_features:
                if 'sentence_embeddings' in f and not trainable_updated:
                    # Rewire the remaining trainable component to consume the
                    # concrete sentence-embedding column already provided.
                    all_components[0].info.spark_input_column_names.remove(
                        'sentence_embeddings@')
                    if 'sentence_embeddings@' in all_components[
                            0].info.inputs:
                        all_components[0].info.inputs.remove(
                            'sentence_embeddings@')
                    all_components[0].info.spark_input_column_names.append(
                        f)
                    if f not in all_components[0].info.inputs:
                        all_components[0].info.inputs.append(f)
                    trainable_updated = True
    pipe.components = correct_order_component_pipeline
    return pipe
def enforece_AT_embedding_provider_output_col_name_schema_for_list_of_components(
        pipe_list):
    """For every embedding provider, enforce that their output col is named
    <output_level>@storage_ref for output_levels word, chunk, sentence aka
    document, i.e. word_embed@elmo or sentence_embed@elmo etc.

    :param pipe_list: list of NLU components, updated in place
    :return: the same list with provider output columns renamed
    """
    for c in pipe_list:
        if ComponentUtils.is_embedding_provider(c):
            level_AT_ref = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
                c, 'output')
            c.info.outputs = [level_AT_ref]
            c.info.spark_output_column_names = [level_AT_ref]
            # Bug fix: level_AT_ref is a string, so the previous
            # setOutputCol(level_AT_ref[0]) set the Spark output column to the
            # string's FIRST CHARACTER. Pass the full column name, matching
            # enforce_AT_schema_on_embedding_processors.
            c.model.setOutputCol(level_AT_ref)
    return pipe_list
def extract_required_features_refless_from_pipe(pipe: NLUPipeline):
    """Collect every required input feature of the pipe that carries no
    storage ref, i.e. every non-embedding input column, cleaned of
    irrelevant features."""
    refless = [
        feature
        for component in pipe.components
        for feature in component.info.inputs
        if 'embed' not in feature
    ]
    return ComponentUtils.clean_irrelevant_features(refless)
def get_missing_required_features(pipe: NLUPipeline):
    """Determine which features the pipe still lacks.

    Compares the features provided by the pipe's components with the features
    they require, both with and without storage refs, and also collects
    sentence-embedding conversion candidates. Sets
    ``pipe.has_trainable_components`` as a side effect.

    :return: (missing_features_no_ref, missing_features_ref, conversion_candidates)
    """
    clean = ComponentUtils.clean_irrelevant_features
    provided_no_ref = clean(
        PipelineQueryVerifier.extract_provided_features_refless_from_pipe(pipe))
    required_no_ref = clean(
        PipelineQueryVerifier.extract_required_features_refless_from_pipe(pipe))
    provided_ref = clean(
        PipelineQueryVerifier.extract_provided_features_ref_from_pipe(pipe))
    required_ref = clean(
        PipelineQueryVerifier.extract_required_features_ref_from_pipe(pipe))
    is_trainable = PipeUtils.is_trainable_pipe(pipe)
    conversion_candidates = PipelineQueryVerifier.extract_sentence_embedding_conversion_candidates(
        pipe)
    pipe.has_trainable_components = is_trainable
    if is_trainable and not provided_ref:
        # special case: while training, storage-ref requirements are dropped and
        # a generic sentence-embedding requirement is added instead
        required_ref = []
        required_no_ref.append('sentence_embeddings')
    components_for_ner_conversion = []  # todo? currently unused
    missing_no_ref = set(required_no_ref) - set(provided_no_ref)
    missing_ref = set(required_ref) - set(provided_ref)
    PipelineQueryVerifier.log_resolution_status(
        provided_no_ref,
        required_no_ref,
        provided_ref,
        required_ref,
        is_trainable,
        conversion_candidates,
        missing_no_ref,
        missing_ref,
    )
    return missing_no_ref, missing_ref, conversion_candidates
def subsitute_leaf_output_names(pipe):
    """Change all output column names of leaves to something nicer, if they not already use AT notation"""
    # NOTE(review): this is a stub — the branch body is a no-op placeholder (`1`)
    # and no renaming is actually performed yet.
    for c in pipe.components:
        # NOTE(review): has_AT_notation() is called with no argument; it likely
        # should receive the leaf's output column (e.g. c.info.outputs[0]) —
        # TODO confirm the expected signature before enabling this logic.
        if PipeUtils.is_leaf_node(
                c, pipe) and not ComponentUtils.has_AT_notation():
            # update name
            1
    return pipe
def check_if_there_component_with_col_in_components(
        component_list, features, except_component):
    """Return True if some component in component_list, other than
    except_component, provides one of the given features via its
    (cleaned) output columns; otherwise False."""
    for candidate in component_list:
        if candidate.info.outputs[0] == except_component.info.outputs[0]:
            continue  # skip the component we are checking on behalf of
        cleaned_outputs = ComponentUtils.clean_irrelevant_features(
            candidate.info.spark_output_column_names, True)
        if any(col in features for col in cleaned_outputs):
            return True
    return False
def extract_sentence_embedding_conversion_candidates(pipe):
    """Collect conversion data for every component whose embedding
    requirement is not satisfied and needs an embedding conversion.
    Trainable pipes are skipped entirely."""
    candidates = []
    for component in pipe.components:
        if not ComponentUtils.component_has_embeddings_requirement(component):
            continue
        if PipeUtils.is_trainable_pipe(pipe):
            continue
        storage_ref = StorageRefUtils.extract_storage_ref(component)
        applicable, data = PipelineQueryVerifier.check_if_storage_ref_is_satisfied_or_get_conversion_candidate(
            component, pipe, storage_ref)
        if applicable:
            candidates.append(data)
    return candidates
def extract_provided_features_ref_from_pipe(pipe: NLUPipeline):
    """Collect every embedding output feature the pipe provides, normalized
    to <col>@<storage_ref> notation and cleaned of irrelevant features."""
    provided = []
    for component in pipe.components:
        for out_col in component.info.outputs:
            if 'embed' not in out_col:
                continue  # only storage-ref'd (embedding) features belong here
            if '@' in out_col:
                provided.append(out_col)
            else:
                provided.append(
                    out_col + "@" + StorageRefUtils.extract_storage_ref(component))
    return ComponentUtils.clean_irrelevant_features(provided)
def check_and_fix_component_output_column_name_overlap(pipe: NLUPipeline):
    '''
    This method enforces that every component has a unique output column name.
    Especially for classifiers or bert_embeddings this issue might occur,

    1. For each component we veryify that all input column names are satisfied by checking all other components output names
    2. When a input column is missing we do the following :
    2.1 Figure out the type of the missing input column. The name of the missing column should be equal to the type
    2.2 Check if there is already a component in the pipe, which provides this input (It should)
    2.3. When the providing component is found, update its output name, or update the original coponents input name
    :return: NLU pipeline where the output and input column names of the models have been adjusted to each other
    '''
    # Fix: removed the dead locals `all_names_provided` and
    # `all_names_provided_for_component`, which were assigned but never read.
    for component_to_check in pipe.components:
        input_columns = set(
            component_to_check.info.spark_input_column_names)
        logger.info(
            f'Checking for component {component_to_check.info.name} wether input {input_columns} is satisfied by another component in the pipe'
        )
        # Subtract every other component's outputs from this component's inputs;
        # whatever remains is unsatisfied.
        for other_component in pipe.components:
            if component_to_check.info.name == other_component.info.name:
                continue
            output_columns = set(
                other_component.info.spark_output_column_names)
            input_columns -= output_columns  # set substraction
        input_columns = ComponentUtils.clean_irrelevant_features(
            input_columns)
        if len(input_columns) != 0:  # fix missing column name
            for missing_column in input_columns:
                for other_component in pipe.components:
                    if component_to_check.info.name == other_component.info.name:
                        continue
                    # The missing column's name is expected to equal the
                    # providing component's type (see docstring step 2.1).
                    if other_component.info.type == missing_column:
                        # We update the output name for the component which provides our feature
                        other_component.info.spark_output_column_names = [
                            missing_column
                        ]
                        logger.info(
                            f'Setting output columns for component {other_component.info.name} to {missing_column} '
                        )
                        other_component.model.setOutputCol(missing_column)
    return pipe
def is_storage_ref_match(embedding_consumer, embedding_provider, pipe):
    """Check for 2 components, if one provides the embeddings for the other. Makes sure that output_level matches up (chunk/sent/tok/embeds)"""
    # NOTE(review): consumer_AT_ref and provider_AT_rev are computed but never
    # used below — kept as-is in case the extract calls are relied on for
    # validation side effects; TODO confirm and remove if not.
    consumer_AT_ref = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
        embedding_consumer, 'input')
    provider_AT_rev = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
        embedding_provider, 'output')
    consum_level = ComponentUtils.extract_embed_level_identity(
        embedding_consumer, 'input')
    provide_level = ComponentUtils.extract_embed_level_identity(
        embedding_provider, 'output')
    consumer_ref = StorageRefUtils.extract_storage_ref(embedding_consumer)
    provider_rev = StorageRefUtils.extract_storage_ref(embedding_provider)
    # input/output levels must match
    if consum_level != provide_level:
        return False
    # If storage ref dont match up, we must consult the storage_ref_2_embed mapping if it still maybe is a match, otherwise it is not.
    if consumer_ref == provider_rev:
        return True
    # Embed Components have have been resolved via@ have a nlu_resolution_ref_source will match up with the consumer ref if correct embedding.
    if hasattr(embedding_provider.info, 'nlu_ref'):
        if consumer_ref == StorageRefUtils.extract_storage_ref(
                embedding_provider.info.nlu_ref):
            return True
    # If it is either sentence_embedding_converter or chunk_embedding_converter then we gotta check what the storage ref of the inpot of those is.
    # If storage ref matches up, the providers output will match the consumer
    # if embedding_provider
    if embedding_provider.info.name in [
            "chunk_embedding_converter", 'sentence_embedding_converter'
    ]:  # TODO FOR RESOLUTION
        # NOTE(review): the fetched converter info is unused and this branch
        # still falls through to `return False` — the converter matching logic
        # appears unfinished (see the TODO above).
        nlu_ref, conv_prov_storage_ref = PipelineQueryVerifier.get_converters_provider_info(
            embedding_provider, pipe)
    return False
def extract_required_features_ref_from_pipe(pipe: NLUPipeline):
    """Collect every embedding input feature the pipe requires, normalized
    to <col>@<storage_ref> notation and cleaned of irrelevant features."""
    required = []
    for component in pipe.components:
        for in_col in component.info.inputs:
            if 'embed' not in in_col:
                continue
            # Some components might not have a storage ref set (edge case);
            # the extracted ref may then be empty.
            if '@' in in_col:
                required.append(in_col)
            else:
                required.append(
                    in_col + "@" + StorageRefUtils.extract_storage_ref(component))
    return ComponentUtils.clean_irrelevant_features(required)
def enforce_AT_schema_on_NER_processors_and_add_missing_NER_converters(
        pipe):
    """For every NER provider and consumer, enforce that their output col is named <output_level>@storage_ref for output_levels word,chunk,sentence aka document , i.e. word_embed@elmo or sentence_embed@elmo etc..
    We also add NER converters for every NER model that no Converter converting it's inputs
    In addition, returns the pipeline with missing NER converters added, for every NER model.
    The converters transform the IOB schema in a merged and more usable form for downstream tasks
    1. Find a NER model in pipe
    2. Find a NER converter feeding from it, if there is None, create one.
    3. Generate name with Identifier  <ner-iob>@<nlu_ref_identifier>  and <entities>@<nlu_ref_identifier>
    3.1 Update NER Models    output to <ner-iob>@<nlu_ref_identifier>
    3.2 Update NER Converter input  to <ner-iob>@<nlu_ref_identifier>
    3.3 Update NER Converter output to <entities>@<nlu_ref_identifier>
    4. Update every Component that feeds from the NER converter (i.e. Resolver etc..)
    """
    from nlu import Util
    new_converters = []  # converters created here; appended to the pipe at the end
    for c in pipe.components:
        if ComponentUtils.is_NER_provider(c):
            output_NER_col = ComponentUtils.extract_NER_col(c, 'output')
            converter_to_update = None
            # if '@' not in output_NER_col:
            # 2. Look for an existing converter already consuming this NER output.
            for other_c in pipe.components:
                if output_NER_col in other_c.info.inputs and ComponentUtils.is_NER_converter(
                        other_c):
                    converter_to_update = other_c
            ner_identifier = ComponentUtils.get_nlu_ref_identifier(c)
            if converter_to_update is None:
                # No converter feeds from this NER model — create one,
                # licensed variant for healthcare models.
                if c.info.license == 'healthcare':
                    converter_to_update = Util(
                        "ner_to_chunk_converter_licensed", is_licensed=True)
                else:
                    converter_to_update = Util("ner_to_chunk_converter")
                new_converters.append(converter_to_update)
            converter_to_update.info.nlu_ref = f'ner_converter@{ner_identifier}'
            # 3. generate new col names
            new_NER_AT_ref = output_NER_col
            if '@' not in output_NER_col:
                new_NER_AT_ref = output_NER_col + '@' + ner_identifier
            new_NER_converter_AT_ref = 'entities' + '@' + ner_identifier
            # 3.1 upate NER model outputs
            c.info.outputs = [new_NER_AT_ref]
            c.info.spark_output_column_names = [new_NER_AT_ref]
            c.model.setOutputCol(new_NER_AT_ref)
            # 3.2 update converter inputs
            old_ner_input_col = ComponentUtils.extract_NER_converter_col(
                converter_to_update, 'input')
            converter_to_update.info.inputs.remove(old_ner_input_col)
            converter_to_update.info.spark_input_column_names.remove(
                old_ner_input_col)
            converter_to_update.info.inputs.append(new_NER_AT_ref)
            converter_to_update.info.spark_input_column_names.append(
                new_NER_AT_ref)
            converter_to_update.model.setInputCols(
                converter_to_update.info.inputs)
            # 3.3 update converter outputs
            converter_to_update.info.outputs = [new_NER_converter_AT_ref]
            converter_to_update.info.spark_output_column_names = [
                new_NER_converter_AT_ref
            ]
            converter_to_update.model.setOutputCol(
                new_NER_converter_AT_ref)
            ## todo improve, this causes the first ner producer to feed to all ner-cosnuners. All other ner-producers will be ignored by ner-consumers,w ithouth special syntax or manual configs
            ##4. Update all NER consumers input columns
            for conversion_consumer in pipe.components:
                if 'entities' in conversion_consumer.info.inputs:
                    conversion_consumer.info.inputs.remove('entities')
                    conversion_consumer.info.spark_input_column_names.remove(
                        'entities')
                    conversion_consumer.info.inputs.append(
                        new_NER_converter_AT_ref)
                    conversion_consumer.info.spark_input_column_names.append(
                        new_NER_converter_AT_ref)
    # Add new converters to pipe
    for conv in new_converters:
        if conv.info.license == 'healthcare':
            pipe.add(
                conv,
                name_to_add=
                f'chunk_converter_licensed@{conv.info.outputs[0].split("@")[0]}'
            )
        else:
            pipe.add(
                conv,
                name_to_add=
                f'chunk_converter@{conv.info.outputs[0].split("@")[0]}')
    return pipe
def is_trainable_pipe(pipe):
    """Return True if the pipe holds at least one untrained (trainable) component."""
    return any(
        ComponentUtils.is_untrained_model(component)
        for component in pipe.components)
def check_and_fix_component_output_column_name_satisfaction(
        pipe: NLUPipeline):
    '''
    This function verifies that every input and output column name of a component is satisfied.
    If some output names are missing, it will be added by this methods.
    Usually classifiers need to change their input column name, so that it matches one of the previous embeddings because they have dynamic output names
    This function peforms the following steps :
    1. For each component we veryify that all input column names are satisfied  by checking all other components output names
    2. When a input column is missing we do the following :
    2.1 Figure out the type of the missing input column. The name of the missing column should be equal to the type
    2.2 Check if there is already a component in the pipe, which provides this input (It should)
    2.3. When A providing component is found, check if storage ref matches up.
    2.4 If True for all, update provider component output name, or update the original coponents input name
    :return: NLU pipeline where the output and input column names of the models have been adjusted to each other
    '''
    logger.info("Fixing input and output column names")
    # pipe = PipeUtils.enforce_AT_schema_on_pipeline(pipe)
    for component_to_check in pipe.components:
        input_columns = set(
            component_to_check.info.spark_input_column_names)
        # a component either has '' storage ref or at most 1
        logger.info(
            f'Checking for component {component_to_check.info.name} wether inputs {input_columns} is satisfied by another component in the pipe ',
        )
        # Subtract every other component's outputs; what remains is unsatisfied.
        for other_component in pipe.components:
            if component_to_check.info.name == other_component.info.name:
                continue
            output_columns = set(
                other_component.info.spark_output_column_names)
            input_columns -= output_columns  # we substract alrfready provided columns
        input_columns = ComponentUtils.clean_irrelevant_features(
            input_columns)
        # Resolve basic mismatches, usually storage refs
        # NOTE(review): Python parses this as
        #   (len(...) != 0 and not pipe.has_trainable_components) or is_embedding_consumer(...)
        # i.e. the embedding-consumer branch is taken even when input_columns is
        # empty. Possibly intended: len(...) != 0 and (not ... or is_embedding_consumer(...))
        # — TODO confirm before changing.
        if len(
                input_columns
        ) != 0 and not pipe.has_trainable_components or ComponentUtils.is_embedding_consumer(
                component_to_check):  # fix missing column name
            # We must not only check if input satisfied, but if storage refs match!
            # and Match Storage_refs accordingly
            logger.info(
                f"Fixing bad input col for C={component_to_check} untrainable pipe"
            )
            resolved_storage_ref_cols = []  # (provider output col, missing col) pairs
            for missing_column in input_columns:
                for other_component in pipe.components:
                    if component_to_check.info.name == other_component.info.name:
                        continue
                    if other_component.info.type == missing_column:
                        # We update the output name for the component which consumes our feature
                        if StorageRefUtils.has_storage_ref(
                                other_component
                        ) and ComponentUtils.is_embedding_provider(
                                component_to_check):
                            if ComponentUtils.are_producer_consumer_matches(
                                    component_to_check, other_component):
                                resolved_storage_ref_cols.append(
                                    (other_component.info.
                                     spark_output_column_names[0],
                                     missing_column))
                        component_to_check.info.spark_output_column_names = [
                            missing_column
                        ]
                        component_to_check.info.outputs = [missing_column]
                        logger.info(
                            f'Resolved requirement for missing_column={missing_column} with inputs from provider={other_component.info.name} by col={missing_column} '
                        )
                        other_component.model.setOutputCol(missing_column)
            # Swap each unsatisfied input col for the storage-ref-matched provider col.
            for resolution, unsatisfied in resolved_storage_ref_cols:
                component_to_check.info.spark_input_column_names.remove(
                    unsatisfied)
                component_to_check.info.spark_input_column_names.append(
                    resolution)
                component_to_check.info.inputs = component_to_check.info.spark_input_column_names
            # TODO USE is_storage_ref_match ?
        # Resolve training missatches
        elif len(
                input_columns
        ) != 0 and pipe.has_trainable_components:  # fix missing column name
            logger.info(
                f"Fixing bad input col for C={component_to_check} trainable pipe"
            )
            # for trainable components, we change their input columns and leave other components outputs unchanged
            for missing_column in input_columns:
                for other_component in pipe.components:
                    if component_to_check.info.name == other_component.info.name:
                        continue
                    if other_component.info.type == missing_column:
                        # We update the input col name for the componenet that has missing cols
                        component_to_check.info.spark_input_column_names.remove(
                            missing_column)
                        # component_to_check.component_info.inputs.remove(missing_column)
                        # component_to_check.component_info.inputs.append(other_component.component_info.spark_output_column_names[0])
                        component_to_check.info.spark_input_column_names.append(
                            other_component.info.
                            spark_output_column_names[0])
                        component_to_check.model.setInputCols(
                            component_to_check.info.
                            spark_input_column_names)
                        logger.info(
                            f'Setting input col columns for component {component_to_check.info.name} to {other_component.info.spark_output_column_names[0]} '
                        )
    return pipe
def satisfy_dependencies(pipe: NLUPipeline) -> NLUPipeline:
    """Dependency Resolution Algorithm.
    For a given pipeline with N components, builds a DAG in reverse and satisfiy each of their dependencies and child dependencies
    with a BFS approach and returns the resulting pipeline"""
    all_features_provided = False
    is_licensed = PipelineQueryVerifier.has_licensed_components(pipe)
    pipe.has_licensed_components = is_licensed
    # Loop until a pass finds no missing features (exits via the break below).
    while all_features_provided == False:
        # After new components have been added, we must loop again and check for the new components if requriements are met
        components_to_add = []
        missing_components, missing_storage_refs, components_for_embedding_conversion = PipelineQueryVerifier.get_missing_required_features(
            pipe)
        logger.info(
            f"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
        )
        logger.info(
            f"Trying to resolve missing features for \n missing_components={missing_components} \n missing storage_refs={missing_storage_refs}\n conversion_candidates={components_for_embedding_conversion}"
        )
        if PipelineQueryVerifier.check_if_all_dependencies_satisfied(
                missing_components, missing_storage_refs,
                components_for_embedding_conversion):
            break  # Now all features are provided
        # Create missing base storage ref producers, i.e embeddings
        for missing_component in missing_storage_refs:
            component = get_default_component_of_type(
                missing_component,
                language=pipe.lang,
                is_licensed=is_licensed)
            if component is None:
                continue
            if 'chunk_emb' in missing_component:
                # chunk embeddings need their converter configured as well
                components_to_add.append(
                    ComponentUtils.config_chunk_embed_converter(component))
            else:
                components_to_add.append(component)
        # Create missing base components, storage refs are fetched in rpevious loop
        for missing_component in missing_components:
            components_to_add.append(
                get_default_component_of_type(missing_component,
                                              language=pipe.lang,
                                              is_licensed=is_licensed))
        # Create embedding converters
        for resolution_info in components_for_embedding_conversion:
            converter = None
            if 'word2chunk' == resolution_info.type:
                converter = PipelineQueryVerifier.add_chunk_embedding_converter(
                    resolution_info)
            elif 'word2sentence' == resolution_info.type:
                converter = PipelineQueryVerifier.add_sentence_embedding_converter(
                    resolution_info)
            if converter is not None:
                components_to_add.append(converter)
        logger.info(
            f'Resolved for missing components the following NLU components : {components_to_add}'
        )
        # Add missing components
        for new_component in components_to_add:
            logger.info(f'adding {new_component.info.name}')
            pipe.add(new_component)
    logger.info(
        f"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
    )
    logger.info(f"ALLL DEPENDENCIES SATISFIED")
    return pipe