Example No. 1
 def are_producer_consumer_matches(e_consumer: SparkNLUComponent, e_provider: SparkNLUComponent) -> bool:
     """Check for embedding_consumer and embedding_producer if they match storage_ref and output level wise wise """
     if StorageRefUtils.extract_storage_ref(e_consumer) == StorageRefUtils.extract_storage_ref(e_provider):
         if ComponentUtils.extract_embed_level_identity(e_consumer,
                                                        'input') == ComponentUtils.extract_embed_level_identity(
             e_provider, 'output'):
             return True
         ## TODO FALL BACK FOR BAD MATCHES WHICH ACTUALLY MATCH-> consult name space
     return False
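
A minimal standalone sketch of the matching rule above, using a hypothetical stand-in class instead of the real SparkNLUComponent (the storage refs and level names are made up for illustration):

    from dataclasses import dataclass

    @dataclass
    class FakeComponent:
        storage_ref: str
        embed_level: str  # e.g. 'word_embeddings' or 'sentence_embeddings'

    def matches(consumer: FakeComponent, provider: FakeComponent) -> bool:
        # Same storage_ref AND same embedding level -> the provider can feed the consumer
        return (consumer.storage_ref == provider.storage_ref
                and consumer.embed_level == provider.embed_level)

    provider = FakeComponent('glove_100d', 'word_embeddings')
    consumer = FakeComponent('glove_100d', 'word_embeddings')
    print(matches(consumer, provider))  # True
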
Example No. 2
    def check_if_storage_ref_is_satisfied_or_get_conversion_candidate(
            component_to_check: NluComponent, pipe, storage_ref_to_find: str):
        """Check if any other component in the pipeline has same storage ref as the input component.
        Returns 1. If there is a candidate, but it has different level, it will be returned as candidate
        If first condition is not satisfied, consults the namespace.storage_ref_2_nlp_ref
        """
        # If there is just 1 component, there is nothing to check
        if len(pipe.components) == 1:
            return False, None
        conversion_candidate = None
        conversion_type = "no_conversion"
        logger.info(
            f'Checking whether storage_ref={storage_ref_to_find} is provided by another component in the pipeline..'
        )
        for c in pipe.components:
            if component_to_check.name != c.name:
                if StorageRefUtils.has_storage_ref(c):
                    if StorageRefUtils.extract_storage_ref(
                            c) == storage_ref_to_find:
                        # Both components have Different Names AND their Storage Ref Matches up AND they both take in tokens -> Match
                        if NLP_FEATURES.TOKEN in component_to_check.in_types and c.type == AnnoTypes.TOKEN_EMBEDDING:
                            logger.info(
                                f'Word Embedding Match found = {c.name}')
                            return False, None

                        # Since document can be substituted for sentence and vice versa,
                        # if either of them matches up we have a match
                        if NLP_FEATURES.SENTENCE_EMBEDDINGS in component_to_check.in_types and \
                                c.type == AnnoTypes.DOCUMENT_EMBEDDING:
                            logger.info(
                                f'Sentence Embedding Match found = {c.name}')
                            return False, None

                        # component_to_check requires Sentence_embedding
                        # but the Matching Storage_ref component takes in Token
                        #   -> Convert the Output of the Match to SentenceLevel
                        #   and feed the component_to_check to the new component
                        if NLP_FEATURES.SENTENCE_EMBEDDINGS in component_to_check.in_types \
                                and c.type == AnnoTypes.TOKEN_EMBEDDING:
                            logger.info(
                                f'Sentence Embedding Conversion Candidate found={c.name}'
                            )
                            conversion_type = 'word2sentence'
                            conversion_candidate = c

                        # analogous case as above for chunk
                        if NLP_FEATURES.CHUNK_EMBEDDINGS in component_to_check.in_types and c.type == AnnoTypes.TOKEN_EMBEDDING:
                            logger.info(
                                f'Chunk Embedding Conversion Candidate found={c.name}'
                            )
                            conversion_type = 'word2chunk'
                            conversion_candidate = c

        logger.info(f'No matching storage ref found')
        return True, StorageRefConversionResolutionData(
            storage_ref_to_find, conversion_candidate, conversion_type)
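
The possible outcomes of the check above, sketched with plain values instead of real NluComponent objects (the level names are illustrative assumptions, not NLU constants):

    from typing import Optional, Tuple

    def resolve(consumer_needs: str, provider_level: str, storage_refs_match: bool) -> Tuple[bool, Optional[str]]:
        """Return (conversion_needed, conversion_type) following the same rules as above."""
        if not storage_refs_match:
            return True, None                    # nothing satisfies the storage ref
        if consumer_needs == provider_level:
            return False, None                   # satisfied as-is, no conversion needed
        if consumer_needs == 'sentence_embeddings' and provider_level == 'word_embeddings':
            return True, 'word2sentence'         # convert word embeddings to sentence level
        if consumer_needs == 'chunk_embeddings' and provider_level == 'word_embeddings':
            return True, 'word2chunk'            # convert word embeddings to chunk level
        return True, None

    print(resolve('sentence_embeddings', 'word_embeddings', True))  # (True, 'word2sentence')
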
Example No. 3
 def add_sentence_embedding_converter(
         resolution_data: StorageRefConversionResolutionData
 ) -> NluComponent:
     """ Return a Word to Sentence Embedding converter for a given Component. The input cols with match the Sentence Embedder ones
         The converter is a NLU Component Embelishement of the Spark NLP Sentence Embeddings Annotator
     """
     logger.info(
         f'Adding Sentence embedding conversion for Embedding Provider={resolution_data}'
     )
     word_embedding_provider = resolution_data.component_candidate
     c = ComponentMap.os_components[
         NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER]
     storage_ref = StorageRefUtils.extract_storage_ref(
         word_embedding_provider)
     c.set_metadata(c.get_default_model(), 'sentence_embedding_converter',
                    NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER, 'xx', False,
                    Licenses.open_source, storage_ref)
     c.model.setStorageRef(storage_ref)
     # set output cols
     embed_AT_out = NLP_FEATURES.SENTENCE_EMBEDDINGS + '@' + storage_ref
     c.model.setOutputCol(embed_AT_out)
     c.spark_output_column_names = [embed_AT_out]
     c.spark_input_column_names = [
         NLP_FEATURES.DOCUMENT,
         NLP_FEATURES.WORD_EMBEDDINGS + '@' + storage_ref
     ]
     c.model.setInputCols(c.spark_input_column_names)
     return c
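
A rough Spark NLP-only sketch of the wiring this converter ends up with, assuming sparknlp is installed and a Spark session has been started; the storage ref 'glove_100d' and the column names are illustrative, not the exact ones NLU generates at runtime:

    import sparknlp
    from sparknlp.annotator import SentenceEmbeddings

    spark = sparknlp.start()  # a Spark session is required before constructing annotators
    storage_ref = 'glove_100d'
    converter = (SentenceEmbeddings()
                 .setInputCols(['document', f'word_embeddings@{storage_ref}'])  # document + word embeddings in
                 .setOutputCol(f'sentence_embeddings@{storage_ref}')            # pooled sentence embeddings out
                 .setStorageRef(storage_ref))
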
Example No. 4
 def set_storage_ref_attribute_of_embedding_converters(pipe_list: List[NluComponent]):
     """For every embedding converter, we set storage ref attr on it, based on what the storage ref from it's provider is """
     for converter in pipe_list:
         if ComponentUtils.is_embedding_provider(converter) and ComponentUtils.is_embedding_converter(converter):
             # First find the embed col of the converter
             embed_col = ComponentUtils.extract_embed_col(converter)
             for provider in pipe_list:
                 # Now find the Embedding generator that is feeding the converter
                 if embed_col in provider.spark_input_column_names:
                     converter.storage_ref = StorageRefUtils.nlp_extract_storage_ref_nlp_model(provider.model)
                     # converter.storage_ref = StorageRefUtils.extract_storage_ref(provider)
     return pipe_list
Example No. 5
 def extract_provided_features_ref_from_pipe(pipe):
     """Extract provided features from component_list, which have  storage ref"""
     provided_features_ref = []
     for c in pipe.components:
         for feat in c.out_types:
             if 'embed' in feat:
                 if '@' not in feat:
                     provided_features_ref.append(
                         feat + "@" +
                         StorageRefUtils.extract_storage_ref(c))
                 else:
                     provided_features_ref.append(feat)
     return ComponentUtils.clean_irrelevant_features(provided_features_ref)
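
The feature@storage_ref naming used here can be illustrated standalone (the feature names and storage ref below are made-up examples, not taken from a real pipeline):

    def tag_with_storage_ref(out_types, storage_ref):
        tagged = []
        for feat in out_types:
            if 'embed' in feat and '@' not in feat:
                tagged.append(f'{feat}@{storage_ref}')  # e.g. 'word_embeddings@glove_100d'
            elif 'embed' in feat:
                tagged.append(feat)                     # already carries a storage ref
        return tagged

    print(tag_with_storage_ref(['token', 'word_embeddings'], 'glove_100d'))
    # ['word_embeddings@glove_100d']
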
Example No. 6
 def extract_storage_ref_AT_notation_for_embeds(component: NluComponent, col='input'):
     '''
      Extract the <col>_embed_col@storage_ref notation from a component if it has a storage ref, otherwise ''.
      :param component: Component to extract the notation from
      :param col: Whether to extract it for the input or the output column
      :return: '' if no storage_ref, <col>_embed_col@storage_ref otherwise
     '''
     if col == 'input':
         e_col = next(filter(lambda s: 'embed' in s, component.spark_input_column_names))
     elif col == 'output':
         e_col = next(filter(lambda s: 'embed' in s, component.spark_output_column_names))
     else:
         raise ValueError(f"col must be 'input' or 'output', got '{col}'")
     stor_ref = StorageRefUtils.extract_storage_ref(component)
     return e_col + '@' + stor_ref
Example No. 7
    def extract_sentence_embedding_conversion_candidates(pipe):
        """Extract information about embedding conversion candidates"""
        conversion_candidates_data = []
        for c in pipe.components:
            if ComponentUtils.component_has_embeddings_requirement(
                    c) and not PipeUtils.is_trainable_pipe(pipe):
                storage_ref = StorageRefUtils.extract_storage_ref(c)
                conversion_applicable, conversion_data = PipelineQueryVerifier.check_if_storage_ref_is_satisfied_or_get_conversion_candidate(
                    c, pipe, storage_ref)
                if conversion_applicable:
                    conversion_candidates_data.append(conversion_data)

        return conversion_candidates_data
Example No. 8
    def config_chunk_embed_converter(converter: SparkNLUComponent) -> SparkNLUComponent:
        '''For a chunk embedding converter to be added to a pipeline, configure its input/output columns and
        enforce the storage ref (@) notation. This is later used to infer backwards which component should feed this consumer.'''
        storage_ref = StorageRefUtils.extract_storage_ref(converter)
        input_embed_col = ComponentUtils.extract_embed_col(converter)
        new_embed_col_with_AT_notation = input_embed_col + "@" + storage_ref
        converter.info.inputs.remove(input_embed_col)
        converter.info.inputs.append(new_embed_col_with_AT_notation)
        converter.info.spark_input_column_names.remove(input_embed_col)
        converter.info.spark_input_column_names.append(new_embed_col_with_AT_notation)
        converter.model.setInputCols(converter.info.inputs)

        return converter
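
The in-place column rename this performs, shown on a plain list of column names (hypothetical values, not taken from a real pipeline):

    input_cols = ['entities', 'word_embeddings']
    storage_ref = 'glove_100d'
    embed_col = next(c for c in input_cols if 'embed' in c)

    input_cols.remove(embed_col)
    input_cols.append(f'{embed_col}@{storage_ref}')
    print(input_cols)  # ['entities', 'word_embeddings@glove_100d']
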
Example No. 9
    def extract_required_features_ref_from_pipe(pipe):
        """Extract provided features from component_list, which have  storage ref"""
        provided_features_ref = []
        for c in pipe.components:
            for feat in c.in_types:
                if 'embed' in feat:
                    # if StorageRefUtils.extract_storage_ref(os_components) !='':  # special edge case, some components might not have a storage ref set
                    if '@' not in feat:
                        provided_features_ref.append(
                            feat + "@" +
                            StorageRefUtils.extract_storage_ref(c))
                    else:
                        provided_features_ref.append(feat)

        return ComponentUtils.clean_irrelevant_features(provided_features_ref)
Example No. 10
    def is_storage_ref_match(embedding_consumer, embedding_provider, pipe):
        """Check for 2 components, if one provides the embeddings for the other. Makes sure that output_level matches up (chunk/sent/tok/embeds)"""
        consumer_AT_ref = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
            embedding_consumer, 'input')
        provider_AT_rev = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
            embedding_provider, 'output')
        consum_level = ComponentUtils.extract_embed_level_identity(
            embedding_consumer, 'input')
        provide_level = ComponentUtils.extract_embed_level_identity(
            embedding_provider, 'output')

        consumer_ref = StorageRefUtils.extract_storage_ref(embedding_consumer)
        provider_ref = StorageRefUtils.extract_storage_ref(embedding_provider)

        # input/output levels must match
        if consum_level != provide_level: return False

        # If the storage refs match up directly, it is a match. Otherwise the storage_ref_2_embed
        # mapping must be consulted to check whether it could still be a match.
        if consumer_ref == provider_ref: return True

        # Embedding components that were resolved via the @ notation carry an nlu_ref,
        # which will match up with the consumer ref if it is the correct embedding.
        if hasattr(embedding_provider.info, 'nlu_ref'):
            if consumer_ref == StorageRefUtils.extract_storage_ref(
                    embedding_provider.info.nlu_ref):
                return True

        # If it is either a sentence_embedding_converter or a chunk_embedding_converter, we have to check
        # what the storage ref of its input is. If that storage ref matches up, the provider's output will match the consumer.
        if embedding_provider.info.name in [
                "chunk_embedding_converter", 'sentence_embedding_converter'
        ]:  # TODO FOR RESOLUTION
            nlu_ref, conv_prov_storage_ref = PipelineQueryVerifier.get_converters_provider_info(
                embedding_provider, pipe)

        return False
Example No. 11
    def update_converter_storage_refs_and_cols(pipe, provided_features_ref,
                                               required_features_ref):
        """Storage ref of converters is initially empty string, i.e. '' .
        This method checks if  any convertable embeddings are provided, if yes it will update storage ref of converter
        , update the input/output columns with colname@storage_ref notation and mark it as resolved
        by removing it from the corrosponding lists"""

        for c in pipe.components:
            if c.name in [
                    NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER,
                    NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER
            ]:
                # Check if there are candidates that feed the converter; any word embedding will work
                if c.storage_ref != '':
                    # If storage_ref is not '', then this converter is already fixed, nothing to do
                    continue
                for other_c in pipe.components:
                    if other_c.has_storage_ref and other_c.type == AnnoTypes.TOKEN_EMBEDDING:
                        # Get original embed cols
                        in_embed = ComponentUtils.extract_embed_col(c, 'input')
                        out_embed = ComponentUtils.extract_embed_col(
                            c, 'output')

                        if len(in_embed.split('@')) == 2:
                            # Storage ref is already on the annotator, we don't need to fix this
                            continue

                        c.spark_output_column_names.remove(out_embed)
                        c.spark_input_column_names.remove(in_embed)
                        provided_features_ref.remove(out_embed + '@')
                        required_features_ref.remove(in_embed + '@')
                        storage_ref = StorageRefUtils.extract_storage_ref(
                            other_c)
                        in_embed = in_embed + '@' + storage_ref
                        out_embed = out_embed + '@' + storage_ref
                        c.spark_output_column_names.append(out_embed)
                        c.spark_input_column_names.append(in_embed)
                        provided_features_ref.append(out_embed)
                        required_features_ref.append(in_embed)
                        c.storage_ref = storage_ref

        return provided_features_ref, required_features_ref
Example No. 12
    def add_chunk_embedding_converter(
            resolution_data: StorageRefConversionResolutionData
    ) -> NluComponent:
        """ Return a Word to CHUNK Embedding converter for a given Component. The input cols with match the Sentence Embedder ones
            The converter is a NLU Component Embelishement of the Spark NLP Sentence Embeddings Annotator
            The CHUNK embedder requires entities and also embeddings to generate data from. Since there could be multiple entities generators, we neeed to pass the correct one
        """
        # TODO REFACTOR
        logger.info(
            f'Adding Chunk embedding conversion for Embedding Provider={resolution_data}'
        )
        word_embedding_provider = resolution_data.component_candidate
        entities_col = 'entities'
        embed_provider_col = word_embedding_provider.info.spark_output_column_names[
            0]

        c = ComponentMap.os_components[NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER]
        c.set_metadata(c.get_default_model(),
                       NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER,
                       NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER, 'xx', False,
                       Licenses.open_source)

        # c = nlu.embeddings_chunker.EmbeddingsChunker(annotator_class='chunk_embedder')
        storage_ref = StorageRefUtils.extract_storage_ref(
            word_embedding_provider)
        c.model.setStorageRef(storage_ref)
        c.info.storage_ref = storage_ref

        c.model.setInputCols(entities_col, embed_provider_col)
        c.model.setOutputCol('chunk_embeddings@' + storage_ref)
        c.info.spark_input_column_names = [entities_col, embed_provider_col]
        c.info.input_column_names = [entities_col, embed_provider_col]

        c.info.spark_output_column_names = ['chunk_embeddings@' + storage_ref]
        c.info.output_column_names = ['chunk_embeddings@' + storage_ref]
        return c
Example No. 13
    def viz_streamlit_word_embed_manifold(
            pipe, # nlu component_list
            default_texts: List[str] = ("Donald Trump likes to party!", "Angela Merkel likes to party!", 'Peter HATES TO PARTTY!!!! :('),
            title: Optional[str] = "Lower dimensional Manifold visualization for word embeddings",
            sub_title: Optional[str] = "Apply any of the 11 `Manifold` or `Matrix Decomposition` algorithms to reduce the dimensionality of `Word Embeddings` to `1-D`, `2-D` and `3-D` ",
            write_raw_pandas : bool = False ,
            default_algos_to_apply : List[str] = ("TSNE", "PCA"),#,'LLE','Spectral Embedding','MDS','ISOMAP','SVD aka LSA','DictionaryLearning','FactorAnalysis','FastICA','KernelPCA',),  # LatentDirichletAllocation 'NMF',
            target_dimensions : List[int] = (1,2,3),
            show_algo_select : bool = True,
            show_embed_select : bool = True,
            show_color_select: bool = True,
            MAX_DISPLAY_NUM:int=200000,
            display_embed_information:bool=True,
            set_wide_layout_CSS:bool=True,
            num_cols: int = 3,
            model_select_position:str = 'side', # side or main
            key:str = "NLU_streamlit",
            additional_classifiers_for_coloring:List[str]=['pos', 'sentiment.imdb'],
            generate_code_sample:bool = False,
            show_infos:bool = True,
            show_logo:bool = True,
            n_jobs: Optional[int] = 3, # False
    ):
        from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS
        StreamlitVizTracker.footer_displayed=False
        try:
            import plotly.express as px
            from sklearn.metrics.pairwise import distance_metrics
        except ImportError:
            st.error("You need the scikit-learn and plotly packages installed in your Python environment for similarity visualizations. Run `pip install scikit-learn plotly`")
        if len(default_texts) > MAX_DISPLAY_NUM : default_texts = default_texts[:MAX_DISPLAY_NUM]
        if show_logo :StreamlitVizTracker.show_logo()
        if set_wide_layout_CSS : _set_block_container_style()
        if title:st.header(title)
        if sub_title:st.subheader(sub_title)
        # if show_logo :VizUtilsStreamlitOS.show_logo()

        # VizUtilsStreamlitOS.loaded_word_embeding_pipes = []


        data = st.text_area('Enter N texts, separated by new lines, to visualize Word Embeddings for ','\n'.join(default_texts))
        if len(data) > MAX_DISPLAY_NUM : data = data[:MAX_DISPLAY_NUM]
        original_text = nlu.load('tokenize').predict(data.split("\n"),output_level='document')['document'].values

        if show_color_select:
            if model_select_position == 'side' : feature_to_color_by =  st.sidebar.selectbox('Pick a feature to color points in manifold by ',['pos','sentiment',],0)
            else:feature_to_color_by =  st.selectbox('Feature to color plots by ',['pos','sentiment',],0)
        text_col = 'token'
        embed_algos_to_load = []
        new_embed_pipes = []
        e_coms = StreamlitUtilsOS.find_all_embed_components(pipe)

        if show_algo_select :
            exp = st.expander("Select additional manifold and dimension reduction techniques to apply")

            algos = exp.multiselect(
                "Reduce embedding dimensionality to something visualizable",
                options=("TSNE", "ISOMAP",'LLE','Spectral Embedding','MDS','PCA','SVD aka LSA','DictionaryLearning','FactorAnalysis','FastICA','KernelPCA',),default=default_algos_to_apply,)

            emb_components_usable = [e for e in Discoverer.get_components('embed',True, include_aliases=True) if 'chunk' not in e and 'sentence' not in e]
            loaded_embed_nlu_refs = []
            loaded_classifier_nlu_refs = []
            loaded_storage_refs = []
            for c in e_coms :
                r = c.nlu_ref
                if 'en.' not in r and 'embed.' not in r and 'ner' not in r : loaded_embed_nlu_refs.append('en.embed.' + r)
                elif 'en.' in r and 'embed.' not in r and 'ner' not in r:
                    r = r.split('en.')[-1]  # keep the part after the 'en.' prefix
                    loaded_embed_nlu_refs.append('en.embed.' + r)
                else :
                    loaded_embed_nlu_refs.append(StorageRefUtils.extract_storage_ref(c))
                loaded_storage_refs.append(StorageRefUtils.extract_storage_ref(c))

            for p in StreamlitVizTracker.loaded_word_embeding_pipes :
                if p != pipe : loaded_embed_nlu_refs.append(p.nlu_ref)
            loaded_embed_nlu_refs = list(set(loaded_embed_nlu_refs))

            for l in loaded_embed_nlu_refs:
                if l not in emb_components_usable : emb_components_usable.append(l)
            emb_components_usable.sort()
            loaded_embed_nlu_refs.sort()


            if model_select_position =='side':
                embed_algo_selection   = st.sidebar.multiselect("Pick additional Word Embeddings for the Dimension Reduction",options=emb_components_usable,default=loaded_embed_nlu_refs,key = key)
                embed_algo_selection=[embed_algo_selection[-1]]
            else :
                exp = st.expander("Pick additional Word Embeddings")
                embed_algo_selection   = exp.multiselect("Pick additional Word Embeddings for the Dimension Reduction",options=emb_components_usable,default=loaded_embed_nlu_refs,key = key)
                embed_algo_selection=[embed_algo_selection[-1]]
            embed_algos_to_load = list(set(embed_algo_selection) - set(loaded_embed_nlu_refs))
        for embedder in embed_algos_to_load:new_embed_pipes.append(nlu.load(embedder))# + f' {" ".join(additional_classifiers_for_coloring)}'))
        StreamlitVizTracker.loaded_word_embeding_pipes+=new_embed_pipes
        if pipe not in StreamlitVizTracker.loaded_word_embeding_pipes: StreamlitVizTracker.loaded_word_embeding_pipes.append(pipe)
        for nlu_ref in additional_classifiers_for_coloring :
            # pos is handled separately via the token feature pipe below
            if 'pos' in nlu_ref : continue
            already_loaded = False
            for p in StreamlitVizTracker.loaded_document_classifier_pipes:
                if p.nlu_ref == nlu_ref : already_loaded = True
            if not already_loaded :
                StreamlitVizTracker.loaded_document_classifier_pipes.append(nlu.load(nlu_ref))

        col_index = 0
        cols = st.columns(num_cols)
        def are_cols_full(): return col_index == num_cols
        token_feature_pipe = StreamlitUtilsOS.get_pipe('pos')
        # Not all pipes have sentiment/pos etc. models loaded for hueing...
        # First predict with the classifiers/token-level feature generators, THEN apply the embedding component_list


        data = original_text.copy()
        classifier_cols = []
        for class_p in StreamlitVizTracker.loaded_document_classifier_pipes:

            data = class_p.predict(data, output_level='document',multithread=False)#.dropna()
            classifier_cols.append(StreamlitUtilsOS.get_classifier_cols(class_p))
            data['text'] = original_text
            # drop embeds of classifiers because bad conversion
            for c in data.columns :
                if 'embedding' in c : data.drop(c, inplace=True,axis=1)
        # data['text']
        # =data['document']
        data['text'] = original_text
        for c in data.columns :
            if 'sentence_embedding' in c : data.drop(c,inplace=True,axis=1)
        if 'document' in data.columns : data.drop('document',inplace=True,axis=1)
        if'pos' in data.columns : data.drop('pos',inplace=True,axis=1)


        for p in StreamlitVizTracker.loaded_word_embeding_pipes :
            p = StreamlitUtilsOS.merge_token_classifiers_with_embed_pipe(p, token_feature_pipe)
            predictions =   p.predict(data,output_level='token',multithread=False).dropna()
            e_col = StreamlitUtilsOS.find_embed_col(predictions)
            e_com = StreamlitUtilsOS.find_embed_component(p)
            e_com_storage_ref = StorageRefUtils.extract_storage_ref(e_com)
            emb = predictions[e_col]
            mat = np.array([x for x in emb])
            for algo in algos :
                # Only positive values for LatentDirichletAllocation
                if algo == 'LatentDirichletAllocation':mat = np.square(mat)
                if len(mat.shape)>2 : mat = mat.reshape(len(emb),mat.shape[-1])
                hover_data = ['token','text','sentiment', 'pos']  # TODO DEDUCT
                # calc reduced dimensionality with every algo
                if 1 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo,1,n_jobs).fit_transform(mat)
                    x = low_dim_data[:,0]
                    y = np.zeros(low_dim_data[:,0].shape)
                    tsne_df =  pd.DataFrame({'x':x,'y':y, 'text':predictions[text_col], 'pos':predictions.pos, 'sentiment' : predictions.sentiment,'token':predictions.token})
                    fig = px.scatter(tsne_df, x="x", y="y",color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Word-Embeddings =`{e_com_storage_ref}`, Manifold-Algo =`{algo}` for `D=1`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig,key=key)
                    col_index+=1
                    if are_cols_full() :
                        cols = st.columns(num_cols)
                        col_index = 0
                if 2 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo,2,n_jobs).fit_transform(mat)
                    x = low_dim_data[:,0]
                    y = low_dim_data[:,1]
                    tsne_df =  pd.DataFrame({'x':x,'y':y, 'text':predictions[text_col], 'pos':predictions.pos, 'sentiment' : predictions.sentiment,'token':predictions.token})

                    fig = px.scatter(tsne_df, x="x", y="y",color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Word-Embeddings =`{e_com_storage_ref}`, Manifold-Algo =`{algo}` for `D=2`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig,key=key)
                    # st.write(fig)
                    col_index+=1
                    if are_cols_full() :
                        cols = st.columns(num_cols)
                        col_index = 0
                if 3 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo,3,n_jobs).fit_transform(mat)
                    x = low_dim_data[:,0]
                    y = low_dim_data[:,1]
                    z = low_dim_data[:,2]
                    tsne_df =  pd.DataFrame({'x':x,'y':y,'z':z, 'text':predictions[text_col], 'pos':predictions.pos, 'sentiment':predictions.sentiment,'token':predictions.token })

                    fig = px.scatter_3d(tsne_df, x="x", y="y", z='z',color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Word-Embeddings =`{e_com_storage_ref}`, Manifold-Algo =`{algo}` for `D=3`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig,key=key)

                    # st.write(fig)
                    col_index+=1
                    if are_cols_full() :
                        cols = st.columns(num_cols)
                        col_index = 0
            # Todo fancy embed infos etc
            # if display_embed_information: display_embed_vetor_information(e_com,mat)

        # if display_embed_information:
        #     exp = st.expander("Embedding vector information")
        #     exp.write(embed_vector_info)
        if show_infos :
            # VizUtilsStreamlitOS.display_infos()
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes = [pipe])
            StreamlitVizTracker.display_footer()
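
The core reduce-and-plot step of this visualization can be reproduced standalone with a random embedding matrix instead of real NLU predictions (assumes numpy, pandas, scikit-learn and plotly are installed):

    import numpy as np
    import pandas as pd
    import plotly.express as px
    from sklearn.decomposition import PCA

    mat = np.random.rand(50, 100)                     # 50 "tokens" with 100-dim embeddings
    low_dim = PCA(n_components=2).fit_transform(mat)  # reduce to 2-D
    df = pd.DataFrame({'x': low_dim[:, 0], 'y': low_dim[:, 1],
                       'token': [f'tok_{i}' for i in range(len(mat))]})
    fig = px.scatter(df, x='x', y='y', hover_data=['token'])
    fig.show()
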
Example No. 14
    def check_and_fix_component_output_column_name_satisfaction(pipe):
        '''
        This function verifies that every input and output column name of a component is satisfied.
        If some output names are missing, they will be added by this method.
        Usually classifiers need to change their input column name so that it matches one of the previous embeddings, because those have dynamic output names.
        This function performs the following steps:
        1. For each component, verify that all input column names are satisfied by checking all other components' output names
        2. When an input column is missing, do the following:
        2.1 Figure out the type of the missing input column. The name of the missing column should be equal to the type
        2.2 Check if there is already a component in the component_list which provides this input (there should be)
        2.3 When a providing component is found, check if the storage ref matches up
        2.4 If all checks pass, update the provider component's output name, or update the original component's input name
        :return: NLU pipeline where the output and input column names of the models have been adjusted to each other
        '''
        logger.info("Fixing input and output column names")
        for component_to_check in pipe.components:
            if component_to_check.loaded_from_pretrained_pipe: continue
            input_columns = set(component_to_check.spark_input_column_names)
            # a component either has '' storage ref or at most 1
            logger.info(
                f'Checking for component {component_to_check.name} whether inputs {input_columns} are satisfied by another component in the component_list ',
            )
            for other_component in pipe.components:
                if component_to_check.name == other_component.name: continue
                output_columns = set(other_component.spark_output_column_names)
                input_columns -= output_columns  # remove provided columns

            input_columns = ComponentUtils.clean_irrelevant_features(
                input_columns)

            # Resolve basic mismatches, usually storage refs
            if len(
                    input_columns
            ) != 0 and not pipe.has_trainable_components or ComponentUtils.is_embedding_consumer(
                    component_to_check):  # fix missing column name
                # We must not only check whether the input is satisfied, but also whether the storage refs match, and match storage refs accordingly
                logger.info(
                    f"Fixing bad input col for C={component_to_check} untrainable component_list"
                )
                resolved_storage_ref_cols = []
                for missing_column in input_columns:
                    for other_component in pipe.components:
                        if component_to_check.name == other_component.name:
                            continue
                        if other_component.type == missing_column:
                            # We update the output name for the component which consumes our feature
                            if StorageRefUtils.has_storage_ref(
                                    other_component
                            ) and ComponentUtils.is_embedding_provider(
                                    component_to_check):
                                if ComponentUtils.are_producer_consumer_matches(
                                        component_to_check, other_component):
                                    resolved_storage_ref_cols.append(
                                        (other_component.
                                         spark_output_column_names[0],
                                         missing_column))

                            component_to_check.spark_output_column_names = [
                                missing_column
                            ]
                            logger.info(
                                f'Resolved requirement for missing_column={missing_column} with inputs from provider={other_component.name} by col={missing_column} '
                            )
                            other_component.model.setOutputCol(missing_column)

                for resolution, unsatisfied in resolved_storage_ref_cols:
                    component_to_check.spark_input_column_names.remove(
                        unsatisfied)
                    component_to_check.spark_input_column_names.append(
                        resolution)

            # Resolve training mismatches
            elif len(
                    input_columns
            ) != 0 and pipe.has_trainable_components:  # fix missing column name
                logger.info(
                    f"Fixing bad input col for C={component_to_check} trainable component_list"
                )
                # for trainable components, we change their input columns and leave other components outputs unchanged
                for missing_column in input_columns:
                    for other_component in pipe.components:
                        if component_to_check.name == other_component.name:
                            continue
                        if other_component.type == missing_column:
                            # We update the input col name for the component that has missing cols
                            component_to_check.spark_input_column_names.remove(
                                missing_column)
                            component_to_check.spark_input_column_names.append(
                                other_component.spark_output_column_names[0])
                            component_to_check.model.setInputCols(
                                component_to_check.spark_input_column_names)

                            logger.info(
                                f'Setting input col columns for component {component_to_check.name} to {other_component.spark_output_column_names[0]} '
                            )

        return pipe
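
The input/output column matching loop at the heart of this check, sketched with plain dictionaries standing in for pipeline components (component and column names are illustrative):

    components = [
        {'name': 'tokenizer', 'inputs': {'document'}, 'outputs': {'token'}},
        {'name': 'embedder',  'inputs': {'document', 'token'}, 'outputs': {'word_embeddings'}},
        {'name': 'ner',       'inputs': {'document', 'token', 'word_embeddings'}, 'outputs': {'ner'}},
    ]

    for comp in components:
        missing = set(comp['inputs'])
        for other in components:
            if other['name'] != comp['name']:
                missing -= other['outputs']   # columns provided by another component are satisfied
        missing.discard('document')           # assume 'document' comes from a DocumentAssembler
        print(comp['name'], 'missing inputs:', missing or 'none')
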
Example No. 15
    def viz_streamlit_entity_embed_manifold(
            pipe,  # nlu component_list
            default_texts: List[str] = ("Donald Trump likes to visit New York", "Angela Merkel likes to visit Berlin!", 'Peter hates visiting Paris'),
            title: Optional[str] = "Lower dimensional Manifold visualization for Entity embeddings",
            sub_title: Optional[str] = "Apply any of the 10+ `Manifold` or `Matrix Decomposition` algorithms to reduce the dimensionality of `Entity Embeddings` to `1-D`, `2-D` and `3-D` ",
            default_algos_to_apply: List[str] = ("TSNE", "PCA"),
            target_dimensions: List[int] = (1, 2, 3),
            show_algo_select: bool = True,
            set_wide_layout_CSS: bool = True,
            num_cols: int = 3,
            model_select_position: str = 'side',  # side or main
            key: str = "NLU_streamlit",
            show_infos: bool = True,
            show_logo: bool = True,
            n_jobs: Optional[int] = 3,  # False
    ):

        from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS
        StreamlitVizTracker.footer_displayed = False

        try:
            import plotly.express as px
            from sklearn.metrics.pairwise import distance_metrics
        except ImportError:
            st.error(
                "You need the scikit-learn and plotly packages installed in your Python environment for similarity visualizations. Run `pip install scikit-learn plotly`")

        if show_logo: StreamlitVizTracker.show_logo()
        if set_wide_layout_CSS: _set_block_container_style()
        if title: st.header(title)
        if sub_title: st.subheader(sub_title)
        # if show_logo :VizUtilsStreamlitOS.show_logo()
        # VizUtilsStreamlitOS.loaded_word_embeding_pipes = []
        if isinstance(default_texts, list) : default_texts = '\n'.join(default_texts)
        data = st.text_area('Enter N texts, separated by new lines, to visualize Entity Embeddings for ',
                            default_texts).split('\n')
        output_level = 'chunk'
        ner_emebed_pipe_algo_selection = []
        loaded_ner_embed_nlu_refs = []
        algos = ['TSNE']
        # A component_list should have a NER and a Word Embedding
        if pipe not in StreamlitVizTracker.loaded_ner_word_embeding_pipes: StreamlitVizTracker.loaded_ner_word_embeding_pipes.append(
            pipe)
        if pipe not in StreamlitVizTracker.loaded_word_embeding_pipes: StreamlitVizTracker.loaded_word_embeding_pipes.append(
            pipe)

        if show_algo_select:
            # Manifold Selection
            exp = st.expander("Select additional manifold and dimension reduction techniques to apply")
            algos = exp.multiselect(
                "Reduce embedding dimensionality to something visualizable",
                options=(
                    "TSNE", "ISOMAP", 'LLE', 'Spectral Embedding', 'MDS', 'PCA', 'SVD aka LSA', 'DictionaryLearning',
                    'FactorAnalysis', 'FastICA', 'KernelPCA', 'LatentDirichletAllocation'),
                default=default_algos_to_apply, )
            ner_emb_components_usable = [e for e in Discoverer.get_components('ner', True, include_aliases=True) if
                                         'embed' not in e and 'sentence' not in e]

            # Find nlu_ref of currently loaded component_list
            for p in StreamlitVizTracker.loaded_ner_word_embeding_pipes:
                loaded_ner_embed_nlu_refs.append(p.nlu_ref)

            # NER Selection
            if model_select_position == 'side':
                ner_emebed_pipe_algo_selection = st.sidebar.multiselect(
                    "Pick additional NER Models for the Dimension Reduction", options=ner_emb_components_usable,
                    default=loaded_ner_embed_nlu_refs, key=key)
            else:
                ner_emebed_pipe_algo_selection = exp.multiselect(
                    "Pick additional NER Models for the Dimension Reduction", options=ner_emb_components_usable,
                    default=loaded_ner_embed_nlu_refs, key=key)

        for ner_nlu_ref in ner_emebed_pipe_algo_selection:
            load = True
            for ner_p in StreamlitVizTracker.loaded_ner_word_embeding_pipes:
                if ner_p.nlu_ref == ner_nlu_ref:
                    load = False
                    break
            if not load: continue
            p = nlu.load(ner_nlu_ref)
            if p not in StreamlitVizTracker.loaded_ner_word_embeding_pipes: StreamlitVizTracker.loaded_ner_word_embeding_pipes.append(
                p)
            if p not in StreamlitVizTracker.loaded_word_embeding_pipes: StreamlitVizTracker.loaded_word_embeding_pipes.append(
                p)

        col_index = 0
        cols = st.columns(num_cols)

        def are_cols_full():
            return col_index == num_cols

        for p in StreamlitVizTracker.loaded_ner_word_embeding_pipes:
            p = EntityManifoldUtils.insert_chunk_embedder_to_pipe_if_missing(p)
            predictions = p.predict(data, metadata=True, output_level=output_level, multithread=False).dropna()
            entity_cols = EntityManifoldUtils.get_ner_cols(predictions)
            chunk_embed_col = EntityManifoldUtils.find_chunk_embed_col(predictions)

            # TODO get cols for non default NER? or multi ner setups?
            # features = predictions[EntityManifoldUtils.get_ner_cols(predictions)]
            # e_col = StreamlitUtilsOS.find_embed_col(predictions)
            e_com = StreamlitUtilsOS.find_embed_component(p)
            e_com_storage_ref = StorageRefUtils.extract_storage_ref(e_com)
            emb = predictions[chunk_embed_col]
            mat = np.array([x for x in emb])
            # for ner_emb_p in ps:
            for algo in algos:
                # Only positive values for LatentDirichletAllocation
                if algo == 'LatentDirichletAllocation': mat = np.square(mat)
                if len(mat.shape) > 2: mat = mat.reshape(len(emb), mat.shape[-1])
                hover_data = entity_cols + ['text']
                # calc reduced dimensionality with every algo
                feature_to_color_by = entity_cols[0]
                if 1 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo, 1, n_jobs).fit_transform(mat)
                    x = low_dim_data[:, 0]
                    y = np.zeros(low_dim_data[:, 0].shape)

                    # predictions['text'] = original_text
                    tsne_df = pd.DataFrame({**{'x': x, 'y': y},
                                            **{k: predictions[k] for k in entity_cols},
                                            **{'text': predictions[entity_cols[-1]]}
                                            })
                    fig = px.scatter(tsne_df, x="x", y="y", color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Word-Embeddings =`{e_com_storage_ref}`, NER-Model =`{p.nlu_ref}`, Manifold-Algo =`{algo}` for `D=1`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig, key=key)
                    col_index += 1
                    if are_cols_full():
                        cols = st.columns(num_cols)
                        col_index = 0
                if 2 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo, 2, n_jobs).fit_transform(mat)
                    x = low_dim_data[:, 0]
                    y = low_dim_data[:, 1]
                    tsne_df = pd.DataFrame({**{'x': x, 'y': y},
                                            **{k: predictions[k] for k in entity_cols},
                                            **{'text': predictions[entity_cols[-1]]}
                                            })
                    fig = px.scatter(tsne_df, x="x", y="y", color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Word-Embeddings =`{e_com_storage_ref}`, NER-Model =`{p.nlu_ref}`, Manifold-Algo =`{algo}` for `D=2`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig, key=key)
                    col_index += 1
                    if are_cols_full():
                        cols = st.columns(num_cols)
                        col_index = 0
                if 3 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo, 3, n_jobs).fit_transform(mat)
                    x = low_dim_data[:, 0]
                    y = low_dim_data[:, 1]
                    z = low_dim_data[:, 2]
                    tsne_df = pd.DataFrame({**{'x': x, 'y': y, 'z': z},
                                            **{k: predictions[k] for k in entity_cols},
                                            **{'text': predictions[entity_cols[-1]]}
                                            })
                    fig = px.scatter_3d(tsne_df, x="x", y="y", z='z', color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Word-Embeddings =`{e_com_storage_ref}`, NER-Model =`{p.nlu_ref}`, Manifold-Algo =`{algo}` for `D=3`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig, key=key)
                    col_index += 1
                    if are_cols_full():
                        cols = st.columns(num_cols)
                        col_index = 0

                # Todo fancy embed infos etc
                # if display_embed_information: display_embed_vetor_information(e_com,mat)

            # if display_embed_information:
            #     exp = st.expander("Embedding vector information")
            #     exp.write(embed_vector_info)

        if show_infos:
            # VizUtilsStreamlitOS.display_infos()
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes=[pipe])
            StreamlitVizTracker.display_footer()
Example No. 16
    def viz_streamlit_sentence_embed_manifold(
            pipe,  # nlu component_list
            default_texts: List[str] = (
            "Donald Trump likes to party!", "Angela Merkel likes to party!", 'Peter HATES TO PARTTY!!!! :('),
            title: Optional[str] = "Lower dimensional Manifold visualization for Sentence embeddings",
            sub_title: Optional[
                str] = "Apply any of the 11 `Manifold` or `Matrix Decomposition` algorithms to reduce the dimensionality of `Sentence Embeddings` to `1-D`, `2-D` and `3-D` ",
            write_raw_pandas: bool = False,
            default_algos_to_apply: List[str] = ("TSNE", "PCA"),
            # ,'LLE','Spectral Embedding','MDS','ISOMAP','SVD aka LSA','DictionaryLearning','FactorAnalysis','FastICA','KernelPCA',),  # LatentDirichletAllocation 'NMF',
            target_dimensions: List[int] = (1, 2, 3),
            show_algo_select: bool = True,
            show_embed_select: bool = True,
            show_color_select: bool = True,
            MAX_DISPLAY_NUM: int = 200000,
            display_embed_information: bool = True,
            set_wide_layout_CSS: bool = True,
            num_cols: int = 3,
            model_select_position: str = 'side',  # side or main
            key: str = "NLU_streamlit",
            additional_classifiers_for_coloring: List[str] = ['sentiment.imdb'],
            generate_code_sample: bool = False,
            show_infos: bool = True,
            show_logo: bool = True,
            n_jobs: Optional[int] = 3,  # False
    ):
        from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS
        StreamlitVizTracker.footer_displayed = False

        try:
            import plotly.express as px
            from sklearn.metrics.pairwise import distance_metrics
        except ImportError:
            st.error(
                "You need the scikit-learn and plotly packages installed in your Python environment for similarity visualizations. Run `pip install scikit-learn plotly`")
        # if len(default_texts) > MAX_DISPLAY_NUM : default_texts = default_texts[:MAX_DISPLAY_NUM]
        if show_logo: StreamlitVizTracker.show_logo()
        if set_wide_layout_CSS: _set_block_container_style()
        if title: st.header(title)
        if sub_title: st.subheader(sub_title)
        # if show_logo :VizUtilsStreamlitOS.show_logo()

        # VizUtilsStreamlitOS.loaded_word_embeding_pipes = []

        data = st.text_area('Enter N texts, separated by new lines, to visualize Sentence Embeddings for ',
                            default_texts)
        # detect_sentence = False # TODO ITNEGRATE PARAM
        output_level = 'document'  # if not detect_sentence else 'sentence'
        classifier_cols = []
        original_text = nlu.load('tokenize').predict(data.split("\n"), output_level=output_level)[output_level].values
        original_text = original_text[original_text != '']
        original_text = original_text[~pd.isna(original_text)]

        text_col = output_level
        embed_algos_to_load = []
        class_algos_to_load = []
        new_embed_pipes = []
        new_class_pipes = []
        e_coms = StreamlitUtilsOS.find_all_embed_components(pipe)

        if show_algo_select:
            exp = st.expander("Select additional manifold and dimension reduction techniques to apply")

            algos = exp.multiselect(
                "Reduce embedding dimensionality to something visualizable",
                options=(
                "TSNE", "ISOMAP", 'LLE', 'Spectral Embedding', 'MDS', 'PCA', 'SVD aka LSA', 'DictionaryLearning',
                'FactorAnalysis', 'FastICA', 'KernelPCA', 'LatentDirichletAllocation'),
                default=default_algos_to_apply, )

            emb_components_usable = [e for e in Discoverer.get_components('embed', True, include_aliases=True) if
                                     'chunk' not in e and 'sentence' in e]
            # Todo, multi-classifiers excluded
            classifier_components_usable = [e for e in Discoverer.get_components('classify', True, include_aliases=True)
                                            if 'xx' not in e and 'toxic' not in e and 'e2e' not in e]
            # Storage Ref extraction
            loaded_embed_nlu_refs, loaded_storage_refs = StreamlitUtilsOS.extract_all_sentence_storage_refs_or_nlu_refs(
                e_coms)
            loaded_classifier_nlu_refs = additional_classifiers_for_coloring  # + all classifier NLU_refs?

            # Get loaded Embed NLU Refs
            for embed_pipe in StreamlitVizTracker.loaded_sentence_embeding_pipes:
                if embed_pipe != pipe: loaded_embed_nlu_refs.append(embed_pipe.nlu_ref)
            loaded_embed_nlu_refs = list(set(loaded_embed_nlu_refs))

            # Get loaded Classifier NLU Refs
            for embed_pipe in StreamlitVizTracker.loaded_document_classifier_pipes:
                if embed_pipe != pipe: loaded_classifier_nlu_refs.append(embed_pipe.nlu_ref)
            loaded_classifier_nlu_refs = list(set(loaded_classifier_nlu_refs))

            # fix default selector
            for l in loaded_embed_nlu_refs:
                if l not in emb_components_usable: emb_components_usable.append(l)

            # fix default selector
            for l in loaded_classifier_nlu_refs:
                if l not in classifier_components_usable: classifier_components_usable.append(l)

            emb_components_usable.sort()
            loaded_embed_nlu_refs.sort()
            classifier_components_usable.sort()
            loaded_classifier_nlu_refs.sort()
            if model_select_position == 'side':
                embed_algo_selection = st.sidebar.multiselect(
                    "Pick additional Sentence Embeddings for the Dimension Reduction", options=emb_components_usable,
                    default=loaded_embed_nlu_refs, key=key)
                embed_algo_selection = [embed_algo_selection[-1]]

                exp = st.expander("Pick additional Classifiers")
                class_algo_selection = exp.multiselect("Pick additional Classifiers to load for coloring points",
                                                       options=classifier_components_usable,
                                                       default=loaded_classifier_nlu_refs, key=key)
                class_algo_selection = [class_algo_selection[-1]]

            else:
                exp = st.expander("Pick additional Sentence Embeddings")
                embed_algo_selection = exp.multiselect(
                    "Pick additional Sentence Embeddings for the Dimension Reduction", options=emb_components_usable,
                    default=loaded_embed_nlu_refs, key=key)
                embed_algo_selection = [embed_algo_selection[-1]]

                exp = st.expander("Pick additional Classifiers")
                class_algo_selection = exp.multiselect("Pick additional Classifiers to load for coloring points",
                                                       options=classifier_components_usable,
                                                       default=loaded_classifier_nlu_refs, key=key)
                class_algo_selection = [class_algo_selection[-1]]

            embed_algos_to_load = list(set(embed_algo_selection) - set(loaded_embed_nlu_refs))
            class_algos_to_load = list(set(class_algo_selection) - set(loaded_classifier_nlu_refs))

        for embedder in embed_algos_to_load: new_embed_pipes.append(nlu.load(embedder))
        for classifier in class_algos_to_load: new_class_pipes.append(nlu.load(classifier))

        StreamlitVizTracker.loaded_sentence_embeding_pipes += new_embed_pipes
        StreamlitVizTracker.loaded_document_classifier_pipes += new_class_pipes
        if pipe not in StreamlitVizTracker.loaded_sentence_embeding_pipes: StreamlitVizTracker.loaded_sentence_embeding_pipes.append(
            pipe)

        for nlu_ref in additional_classifiers_for_coloring:  # TODO: remove and integrate into the auto-load logic above, this is redundant
            already_loaded = False
            for embed_pipe in StreamlitVizTracker.loaded_document_classifier_pipes:
                if embed_pipe.nlu_ref == nlu_ref: already_loaded = True
            if not already_loaded:
                already_loaded = True
                StreamlitVizTracker.loaded_document_classifier_pipes.append(nlu.load(nlu_ref))

        col_index = 0
        cols = st.columns(num_cols)

        data = original_text.copy()
        # Get classifier predictions
        classifier_cols = []
        for class_pipe in StreamlitVizTracker.loaded_document_classifier_pipes:
            data = class_pipe.predict(data, output_level=output_level, multithread=False)
            classifier_cols += StreamlitUtilsOS.get_classifier_cols(class_pipe)
            data['text'] = original_text
            # drop embeds of classifiers because bad conversion
            for c in data.columns:
                if 'embedding' in c: data.drop(c, inplace=True, axis=1)

        data['text'] = original_text
        if show_color_select:
            if model_select_position == 'side':
                feature_to_color_by = st.sidebar.selectbox('Pick a feature to color points in manifold by ',
                                                           classifier_cols, 0)
            else:
                feature_to_color_by = st.selectbox('Feature to color plots by ', classifier_cols, 0)

        def are_cols_full():
            return col_index == num_cols

        for embed_pipe in StreamlitVizTracker.loaded_sentence_embeding_pipes:
            predictions = embed_pipe.predict(data, output_level=output_level, multithread=False).dropna()
            e_col = StreamlitUtilsOS.find_embed_col(predictions)
            e_com = StreamlitUtilsOS.find_embed_component(embed_pipe)
            e_com_storage_ref = StorageRefUtils.extract_storage_ref(e_com)
            emb = predictions[e_col]
            mat = np.array([x for x in emb])
            for algo in algos:
                # Only positive values for LatentDirichletAllocation
                if algo == 'LatentDirichletAllocation': mat = np.square(mat)
                if len(mat.shape) > 2: mat = mat.reshape(len(emb), mat.shape[-1])
                hover_data = classifier_cols + ['text']
                # calc reduced dimensionality with every algo
                if 1 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo, 1, n_jobs).fit_transform(mat)
                    x = low_dim_data[:, 0]
                    y = np.zeros(low_dim_data[:, 0].shape)
                    predictions['text'] = original_text
                    tsne_df = pd.DataFrame({**{'x': x, 'y': y},
                                            **{k: predictions[k] for k in classifier_cols},
                                            **{'text': original_text}
                                            })
                    fig = px.scatter(tsne_df, x="x", y="y", color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Sentence-Embeddings =`{e_com_storage_ref}`, Manifold-Algo =`{algo}` for `D=1`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig, key=key)
                    col_index += 1
                    if are_cols_full():
                        cols = st.columns(num_cols)
                        col_index = 0
                if 2 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo, 2, n_jobs).fit_transform(mat)
                    x = low_dim_data[:, 0]
                    y = low_dim_data[:, 1]
                    tsne_df = pd.DataFrame({**{'x': x, 'y': y},
                                            **{k: predictions[k] for k in classifier_cols},
                                            **{'text': original_text}
                                            })
                    fig = px.scatter(tsne_df, x="x", y="y", color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Sentence-Embeddings =`{e_com_storage_ref}`, Manifold-Algo =`{algo}` for `D=2`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig, key=key)
                    col_index += 1
                    if are_cols_full():
                        cols = st.columns(num_cols)
                        col_index = 0
                if 3 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo, 3, n_jobs).fit_transform(mat)
                    x = low_dim_data[:, 0]
                    y = low_dim_data[:, 1]
                    z = low_dim_data[:, 2]
                    tsne_df = pd.DataFrame({**{'x': x, 'y': y, 'z': z},
                                            **{k: predictions[k] for k in classifier_cols},
                                            **{'text': original_text}
                                            })
                    fig = px.scatter_3d(tsne_df, x="x", y="y", z='z', color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Sentence-Embeddings =`{e_com_storage_ref}`, Manifold-Algo =`{algo}` for `D=3`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig, key=key)
                    col_index += 1
                    if are_cols_full():
                        cols = st.columns(num_cols)
                        col_index = 0

            # Todo fancy embed infos etc
            # if display_embed_information: display_embed_vetor_information(e_com,mat)

        # if display_embed_information:
        #     exp = st.expander("Embedding vector information")
        #     exp.write(embed_vector_info)

        if show_infos:
            # VizUtilsStreamlitOS.display_infos()
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes=[pipe])
            StreamlitVizTracker.display_footer()
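For context, here is a minimal usage sketch of how a sentence-embedding manifold visualization like the one above is typically triggered from a user script run with "streamlit run". The method name viz_streamlit_sentence_embed_manifold and the model reference 'embed_sentence.bert' are assumptions based on the NLU Streamlit documentation, not taken from the code above; adjust them to the API version you use.

# Hedged usage sketch (assumed public entry point, see note above)
import nlu

pipe = nlu.load('embed_sentence.bert')
pipe.viz_streamlit_sentence_embed_manifold(
    ['I love NLU!', 'Peter loves pancakes', 'Data science is great'],
)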
Example no. 17
0
    def display_word_similarity(
        pipe,  # nlu component_list
        default_texts: Tuple[str, str] = ("Donald Trump likes to party!",
                                          "Angela Merkel likes to party!"),
        threshold: float = 0.5,
        title: Optional[
            str] = "Embeddings Similarity Matrix &  Visualizations  ",
        sub_tile: Optional[
            str] = "Visualize `word-wise similarity matrix` and calculate `similarity scores` for `2 texts` and every `word embedding` loaded",
        write_raw_pandas: bool = False,
        display_embed_information: bool = True,
        similarity_matrix=True,
        show_algo_select: bool = True,
        dist_metrics: List[str] = ('cosine',),
        set_wide_layout_CSS: bool = True,
        generate_code_sample: bool = False,
        key: str = "NLU_streamlit",
        num_cols: int = 2,
        display_scalar_similarities: bool = False,
        display_similarity_summary: bool = False,
        model_select_position: str = 'side',  # main or side
        show_infos: bool = True,
        show_logo: bool = True,
    ):
        """We visualize the following cases :
        1. Simmilarity between 2 words - > sim (word_emb1, word_emb2)
        2. Simmilarity between 2 sentences -> let weTW stand word word_emb of token T and sentence S
            2.1. Raw token level with merged embeddings -> sim([we11,we21,weT1], [we12,we22,weT2])
            2.2  Autogenerate sentemb, basically does 2.1 in the Spark NLP backend
            2.3 Already using sentence_embedder model -> sim(se1,se2)
        3. Simmilarity between token and sentence -> sim([we11,w21,wT1], se2)
        4. Mirrored 3
         """
        # https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
        StreamlitVizTracker.footer_displayed = False
        try:
            import plotly.express as px
            from sklearn.metrics.pairwise import distance_metrics
        except ImportError:
            st.error(
                "You need the scikit-learn and plotly packages installed in your Python environment for similarity visualizations. Run `pip install scikit-learn plotly`."
            )
        if set_wide_layout_CSS: _set_block_container_style()
        if title: st.header(title)
        if show_logo: StreamlitVizTracker.show_logo()
        if sub_tile: st.subheader(sub_tile)

        StreamlitVizTracker.loaded_word_embeding_pipes = []
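        # sklearn's distance_metrics() maps metric names (e.g. 'cosine', 'euclidean') to their pairwise-distance callables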
        dist_metric_algos = distance_metrics()
        dist_algos = list(dist_metric_algos.keys())
        if 'haversine' in dist_algos:
            dist_algos.remove('haversine')  # only defined for 2D lat/lon coordinates, not high-dimensional embeddings
        if 'precomputed' in dist_algos:
            dist_algos.remove('precomputed')  # not a metric; it expects an already computed distance matrix
        cols = st.columns(2)
        text1 = cols[0].text_input("Text or word1",
                                   default_texts[0],
                                   key=key + 'field_1')
        text2 = cols[1].text_input(
            "Text or word2", default_texts[1], key=key +
            'field_2') if len(default_texts) > 1 else cols[1].text_input(
                "Text or word2", 'Please enter second string', key=key)
        # exp = st.sidebar.beta_expander("Select additional Embedding Models and distance metric to compare ")
        e_coms = StreamlitUtilsOS.find_all_embed_components(pipe)
        embed_algos_to_load = []
        embed_pipes = [pipe]
        dist_algo_selection = dist_metrics
        if show_algo_select:
            # emb_components_usable = Discoverer.get_components('embed')
            emb_components_usable = [
                e for e in Discoverer.get_components(
                    'embed', True, include_aliases=True)
                if 'chunk' not in e and 'sentence' not in e
            ]
            loaded_embed_nlu_refs = []
            loaded_storage_refs = []

            for c in e_coms:
                r = c.nlu_ref
                if 'en.' not in r and 'embed.' not in r and 'ner' not in r:
                    loaded_embed_nlu_refs.append('en.embed.' + r)
                elif 'en.' in r and 'embed.' not in r and 'ner' not in r:
                    r = r.split('en.')[-1]
                    loaded_embed_nlu_refs.append('en.embed.' + r)
                else:
                    loaded_embed_nlu_refs.append(
                        StorageRefUtils.extract_storage_ref(c))
                loaded_storage_refs.append(
                    StorageRefUtils.extract_storage_ref(c))
            for p in StreamlitVizTracker.loaded_word_embeding_pipes:
                if p != pipe: loaded_embed_nlu_refs.append(p.nlu_ref)
            for l in loaded_embed_nlu_refs:
                if l not in emb_components_usable:
                    emb_components_usable.append(l)
            # embed_algo_selection = exp.multiselect("Click to pick additional Embedding Algorithm",options=emb_components_usable,default=loaded_embed_nlu_refs,key = key)
            # dist_algo_selection = exp.multiselect("Click to pick additional Distance Metric", options=dist_algos, default=dist_metrics, key = key)
            emb_components_usable.sort()
            loaded_embed_nlu_refs = sorted(set(loaded_embed_nlu_refs))
            dist_algos.sort()
            if model_select_position == 'side':
                embed_algo_selection = st.sidebar.multiselect(
                    "Pick additional Word Embeddings for the Similarity Matrix",
                    options=emb_components_usable,
                    default=loaded_embed_nlu_refs,
                    key=key)
                dist_algo_selection = st.sidebar.multiselect(
                    "Pick additional Similarity Metrics ",
                    options=dist_algos,
                    default=dist_metrics,
                    key=key)
            else:
                exp = st.expander(
                    "Pick additional Word Embeddings and Similarity Metrics")
                embed_algo_selection = exp.multiselect(
                    "Pick additional Word Embeddings for the Similarity Matrix",
                    options=emb_components_usable,
                    default=loaded_embed_nlu_refs,
                    key=key)
                dist_algo_selection = exp.multiselect(
                    "Pick additional Similarity Metrics ",
                    options=dist_algos,
                    default=dist_metrics,
                    key=key)
            embed_algos_to_load = list(
                set(embed_algo_selection) - set(loaded_embed_nlu_refs))

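        # Lazily load any additionally selected embedding models that are not part of an already loaded pipeline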
        for embedder in embed_algos_to_load:
            embed_pipes.append(nlu.load(embedder))

        if generate_code_sample:
            st.code(
                get_code_for_viz(
                    'SIMILARITY',
                    [StreamlitUtilsOS.extract_name(p)
                     for p in embed_pipes], default_texts))

        StreamlitVizTracker.loaded_word_embeding_pipes += embed_pipes
        similarity_metrics = {}
        embed_vector_info = {}
        cols_full = True
        col_index = 0
        # for p in embed_pipes :
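        # Compute token-level predictions for both texts with every loaded word-embedding pipeline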
        for p in StreamlitVizTracker.loaded_word_embeding_pipes:
            data1 = p.predict(text1, output_level='token',
                              get_embeddings=True).dropna()
            data2 = p.predict(text2, output_level='token',
                              get_embeddings=True).dropna()
            e_coms = StreamlitUtilsOS.find_all_embed_components(p)
            modelhub_links = [
                ModelHubUtils.get_url_by_nlu_refrence(c.nlu_ref)
                for c in e_coms
            ]
            e_cols = StreamlitUtilsOS.get_embed_cols(p)
            for num_emb, e_col in enumerate(e_cols):
                if col_index == num_cols - 1: cols_full = True
                if cols_full:
                    cols = st.columns(num_cols)
                    col_index = 0
                    cols_full = False
                else:
                    col_index += 1
                tok1 = data1['token']
                tok2 = data2['token']
                emb1 = data1[e_col]
                emb2 = data2[e_col]

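                # L2-normalize each embedding row so similarity scores are comparable across models; nan_to_num guards zero-length vectors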
                def normalize_matrix(m):
                    return np.nan_to_num(
                        m / np.linalg.norm(m, axis=1, keepdims=True))

                embed_mat1 = normalize_matrix(np.array([x for x in emb1]))
                embed_mat2 = normalize_matrix(np.array([x for x in emb2]))
                # e_name = e_col.split('word_embedding_')[-1]
                e_name = e_coms[num_emb].nlu_ref
                e_name = e_name.split(
                    'embed.')[-1] if 'en.' in e_name else e_name
                if 'ner' in e_name: e_name = loaded_storage_refs[num_emb]

                embed_vector_info[e_name] = {
                    "Vector Dimension ":
                    embed_mat1.shape[1],
                    "Num Vectors":
                    embed_mat1.shape[0] + embed_mat1.shape[0],
                    "NLU_reference":
                    e_coms[num_emb].nlu_ref,
                    "Spark_NLP_reference":
                    ModelHubUtils.NLU_ref_to_NLP_ref(e_coms[num_emb].nlu_ref),
                    "Storage Reference":
                    loaded_storage_refs[num_emb],
                    'Modelhub info':
                    modelhub_links[num_emb]
                }
                for dist_algo in dist_algo_selection:
                    # scalar_similarities[e_col][dist_algo]={}
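                    # Turn pairwise distances into similarities: sim = 1 - dist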
                    sim_score = ((dist_metric_algos[dist_algo]
                                  (embed_mat1, embed_mat2) - 1) * -1)

                    sim_score = pd.DataFrame(sim_score)
                    sim_score.index = tok1.values
                    sim_score.columns = tok2.values
                    sim_score.columns = StreamlitVizTracker.pad_duplicate_tokens(
                        list(sim_score.columns))
                    sim_score.index = StreamlitVizTracker.pad_duplicate_tokens(
                        list(sim_score.index))
                    if write_raw_pandas: st.write(sim_score, key=key)
                    if sim_score.shape == (1, 1):
                        sim_score = round(sim_score.iloc[0][0], 2)
                        if sim_score > threshold:
                            st.success(
                                f'Scalar Similarity={sim_score} for distance metric={dist_algo}'
                            )
                        else:
                            st.error(
                                f'Scalar Similarity={sim_score} for distance metric={dist_algo}'
                            )
                        st.error(
                            'No similarity matrix can be drawn for just 2 tokens. Try entering at least one full sentence in each field.'
                        )
                    else:
                        plotly_available = True
                        # Average all pairwise similarities (mean over rows, then columns) to reduce the matrix to a single scalar
                        scalar_sim_score = np.sum(
                            (np.sum(sim_score, axis=0) /
                             sim_score.shape[0])) / sim_score.shape[1]
                        scalar_sim_score = round(scalar_sim_score, 2)

                        if display_scalar_similarities:
                            if scalar_sim_score > threshold:
                                st.success(
                                    f'Scalar Similarity :{scalar_sim_score} for distance metric={dist_algo}'
                                )
                            else:
                                st.error(
                                    f'Scalar Similarity :{scalar_sim_score} for embedder={e_col} distance metric={dist_algo}'
                                )
                        if similarity_matrix:
                            if plotly_available:
                                fig = px.imshow(
                                    sim_score, labels=dict(color="similarity")
                                )  # , title=f'Simmilarity Matrix for embedding_model={e_name} distance metric={dist_algo}')
                                # st.write(fig,key =key)
                                similarity_metrics[
                                    f'{e_name}_{dist_algo}_similarity'] = {
                                        'scalar_similarity': scalar_sim_score,
                                        'dist_metric': dist_algo,
                                        'embedding_model': e_name,
                                        'modelhub_info':
                                        modelhub_links[num_emb],
                                    }
                                subh = f"""Embedding-Model=`{e_name}`, Similarity-Score=`{scalar_sim_score}`,  distance metric=`{dist_algo}`"""
                                cols[col_index].markdown(subh)
                                cols[col_index].write(fig, key=key)
                            else:
                                pass  # todo fallback plots

        if display_similarity_summary:
            exp = st.expander("Similarity summary")
            exp.write(similarity_metrics)
        if display_embed_information:
            exp = st.expander("Embedding vector information")
            exp.write(embed_vector_info)
        if show_infos:
            # VizUtilsStreamlitOS.display_infos()
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes=[pipe])
            StreamlitVizTracker.display_footer()
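For completeness, a minimal usage sketch of how the word-similarity view above is usually invoked from a Streamlit script. The method name viz_streamlit_word_similarity is taken from the NLU Streamlit documentation and should be treated as an assumption here; the model reference 'bert' is illustrative only.

# Hedged usage sketch (assumed public wrapper around display_word_similarity above)
import nlu

pipe = nlu.load('bert')
pipe.viz_streamlit_word_similarity(
    ('Donald Trump likes to party!', 'Angela Merkel likes to party!'),
    threshold=0.5,
)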