예제 #1
0
파일: ner.py 프로젝트: JohnSnowLabs/nlu
    def visualize_ner(
            pipe, # Nlu component_list
            text:str,
            ner_tags: Optional[List[str]] = None,
            show_label_select: bool = True,
            show_table: bool = False,
            title: Optional[str] = "Named Entities",
            sub_title: Optional[str] = "Recognize various `Named Entities (NER)` in text entered and filter them. You can select from over `100 languages` in the dropdown.",
            colors: Optional[Dict[str, str]] = None,
            show_color_selector: bool = False,
            set_wide_layout_CSS:bool=True,
            generate_code_sample:bool = False,
            key = "NLU_streamlit",
            model_select_position:str = 'side',
            show_model_select : bool = True,
            show_text_input:bool = True,
            show_infos:bool = True,
            show_logo:bool = True,

    ):
        """Render the NER visualization building block in Streamlit.

        :param pipe: NLU pipeline to visualize; replaced by a freshly fetched
            pipeline when the user picks a different model in the select box.
        :param text: Text to run NER on; overridden by the text area when
            ``show_text_input`` is set.
        :param ner_tags: Entity labels offered for highlighting. When None they
            are read from the pipeline.
        :param show_label_select: Show a multiselect to filter displayed labels.
        :param show_table: Additionally write the chunk-level predictions.
        :param title: Header text, skipped when falsy.
        :param sub_title: Sub-header text, skipped when falsy.
        :param colors: Mapping of entity label to color handed to ``pipe.viz``.
            ``None`` means "no custom colors"; a dict passed by the caller is
            updated in place by the (WIP) color selector.
        :param show_color_selector: Show the WIP color selector instead of the
            NER visualization.
        :param set_wide_layout_CSS: Apply the wide block-container style.
        :param generate_code_sample: Show a code snippet for this visualization.
        :param key: Streamlit widget key (prefix).
        :param model_select_position: ``'side'`` puts the model select box in
            the sidebar, any other value puts it in the main area.
        :param show_model_select: Show the NER model select box.
        :param show_text_input: Show the text area for entering text.
        :param show_infos: Display model info and footer.
        :param show_logo: Display the logo.
        """
        # A `{}` default would be shared between calls and is written to by the
        # color selector below, so the empty dict is created per call instead.
        if colors is None: colors = {}
        StreamlitVizTracker.footer_displayed=False
        if set_wide_layout_CSS : _set_block_container_style()
        if show_logo :StreamlitVizTracker.show_logo()
        if show_model_select :
            model_selection = Discoverer.get_components('ner',include_pipes=True)
            model_selection.sort()
            # Pre-select the currently loaded model; fall back to the first entry
            # instead of raising ValueError when it is not a discovered NER model.
            current_ref = pipe.nlu_ref.split(' ')[0]
            default_index = model_selection.index(current_ref) if current_ref in model_selection else 0
            select_box = st.sidebar.selectbox if model_select_position == 'side' else st.selectbox
            ner_model_2_viz = select_box("Select a NER model",model_selection,index=default_index)
            pipe = pipe if pipe.nlu_ref == ner_model_2_viz else StreamlitUtilsOS.get_pipe(ner_model_2_viz)
        if title: st.header(title)
        if show_text_input : text = st.text_area("Enter text you want to visualize NER classes for below", text, key=key)
        if sub_title : st.subheader(sub_title)
        if generate_code_sample: st.code(get_code_for_viz('NER',StreamlitUtilsOS.extract_name(pipe),text))
        if ner_tags is None: ner_tags = StreamlitUtilsOS.get_NER_tags_in_pipe(pipe)

        if not show_color_selector :
            if show_label_select:
                exp = st.expander("Select entity labels to highlight")
                label_select = exp.multiselect(
                    "These labels are predicted by the NER model. Select which ones you want to display",
                    options=ner_tags,default=list(ner_tags))
            else : label_select = ner_tags
            pipe.viz(text,write_to_streamlit=True, viz_type='ner',labels_to_viz=label_select,viz_colors=colors, streamlit_key=key)
        else : # TODO WIP color select
            cols = st.columns(3)
            # `expander` matches the non-beta API used by the rest of this file.
            exp = cols[0].expander("Select entity labels to display")
            # Each widget gets its own key suffix so they do not collide with
            # each other or with the text area above, which uses the bare `key`.
            color = cols[2].color_picker('Pick A Color for a specific entity label', '#00f900',key = f'{key}_color')
            tag2color = cols[1].selectbox('Pick a ner tag to color', ner_tags,key = f'{key}_tag')
            colors[tag2color]=color
        if show_table : st.write(pipe.predict(text, output_level='chunk'),key = key)

        if show_infos :
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes = [pipe])
            StreamlitVizTracker.display_footer()
예제 #2
0
 def visualize_dep_tree(
     pipe,  # nlu component_list
     text: str = 'Billy likes to swim',
     title: Optional[str] = "Dependency Parse & Part-of-speech tags",
     sub_title: Optional[
         str] = 'POS tags define a `grammatical label` for `each token` and the `Dependency Tree` classifies `Relations between the tokens` ',
     set_wide_layout_CSS: bool = True,
     generate_code_sample: bool = False,
     key="NLU_streamlit_dep_tree",
     show_infos: bool = True,
     show_logo: bool = True,
     show_text_input: bool = True,
 ):
     """Render the dependency-tree / POS visualization block in Streamlit.

     :param pipe: NLU pipeline whose ``viz`` method draws the tree.
     :param text: Text to visualize; overridden by the text area when
         ``show_text_input`` is set. Text containing newlines is split into
         a list of lines before being handed to ``pipe.viz``.
     :param title: Header text, skipped when falsy.
     :param sub_title: Sub-header text, skipped when falsy.
     :param set_wide_layout_CSS: Apply the wide block-container style.
     :param generate_code_sample: Show a code snippet for this visualization.
     :param key: Streamlit key used for the text area and passed to ``pipe.viz``.
     :param show_infos: Display model info and footer.
     :param show_logo: Display the logo.
     :param show_text_input: Show the text area for entering text.
     """
     StreamlitVizTracker.footer_displayed = False

     if show_logo:
         StreamlitVizTracker.show_logo()
     if set_wide_layout_CSS:
         _set_block_container_style()
     if title:
         st.header(title)

     if show_text_input:
         text = st.text_area("Enter text you want to visualize dependency tree for ", text, key=key)

     if sub_title:
         st.subheader(sub_title)

     if generate_code_sample:
         pipe_name = StreamlitUtilsOS.extract_name(pipe)
         code_sample = get_code_for_viz('TREE', pipe_name, text)
         st.code(code_sample)

     # Multi-line input is visualized line by line.
     has_line_breaks = isinstance(text, str) and '\n' in text
     if has_line_breaks:
         text = text.split('\n')

     pipe.viz(text, write_to_streamlit=True, viz_type='dep', streamlit_key=key)

     if show_infos:
         StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes=[pipe])
         StreamlitVizTracker.display_footer()
예제 #3
0
    def visualize_tokens_information(
        pipe,  # nlu component_list
        text: str,
        title: Optional[str] = "Token Features",
        sub_title: Optional[
            str] = 'Pick from `over 1000+ models` on the left and `view the generated features`',
        show_feature_select: bool = True,
        features: Optional[List[str]] = None,
        full_metadata: bool = True,
        output_level: str = 'token',
        positions: bool = False,
        set_wide_layout_CSS: bool = True,
        generate_code_sample: bool = False,
        key="NLU_streamlit",
        show_model_select=True,
        model_select_position: str = 'side',  # main or side
        show_infos: bool = True,
        show_logo: bool = True,
        show_text_input: bool = True,
    ) -> None:
        """Render a table of token-level features in Streamlit.

        Predicts with ``pipe`` and every additionally selected model, joins the
        resulting frames column-wise, drops duplicated columns and displays the
        selected feature columns.

        :param pipe: NLU pipeline that is always part of the prediction.
        :param text: Text to predict on; overridden by the text area when
            ``show_text_input`` is set.
        :param title: Header text, skipped when falsy.
        :param sub_title: Sub-header text, skipped when falsy.
        :param show_feature_select: Show a multiselect over all result columns;
            its selection replaces ``features``.
        :param features: Columns to display when ``show_feature_select`` is off.
            When None, all columns are used. The passed list is not modified.
        :param full_metadata: Forwarded as ``metadata`` to ``predict``.
        :param output_level: Forwarded as ``output_level`` to ``predict``.
        :param positions: Forwarded as ``positions`` to ``predict``.
        :param set_wide_layout_CSS: Apply the wide block-container style.
        :param generate_code_sample: Show a code snippet for this visualization.
        :param key: Streamlit widget key.
        :param show_model_select: Show a multiselect for loading further models.
        :param model_select_position: ``'side'`` puts the model multiselect in
            the sidebar, any other value puts it in the main area.
        :param show_infos: Display model info and footer.
        :param show_logo: Display the logo.
        :param show_text_input: Show the text area for entering text.
        """
        StreamlitVizTracker.footer_displayed = False
        if show_logo: StreamlitVizTracker.show_logo()
        if set_wide_layout_CSS: _set_block_container_style()
        if title: st.header(title)
        if sub_title: st.subheader(sub_title)
        token_pipes = [pipe]
        if show_text_input:
            text = st.text_area(
                "Enter text you want to view token features for",
                text,
                key=key)
        if show_model_select:
            # Build a filtered list instead of calling `remove` while iterating,
            # which skipped the element following every removed converter.
            non_converter_refs = [
                c.nlu_ref for c in pipe.components
                if 'converter' not in c.nlu_ref
            ]
            # Every loaded model must be a selectable option, otherwise it could
            # not be passed as a default to the multiselect.
            token_pipes_components_usable = sorted(
                set(Discoverer.get_components(get_all=True))
                | set(non_converter_refs))
            # Blank references are never pre-selected.
            loaded_nlu_refs = sorted(set(non_converter_refs) - {'', ' '})
            multiselect = (st.sidebar.multiselect
                           if model_select_position == 'side' else
                           st.multiselect)
            model_selection = multiselect(
                "Pick any additional models for token features",
                options=token_pipes_components_usable,
                default=loaded_nlu_refs,
                key=key)
            # Only load what is selected but not already part of `pipe`.
            models_to_load = list(set(model_selection) - set(loaded_nlu_refs))
            for model in models_to_load:
                token_pipes.append(nlu.load(model))
            StreamlitVizTracker.loaded_token_pipes += token_pipes
        if generate_code_sample:
            st.code(
                get_code_for_viz(
                    'TOKEN',
                    [StreamlitUtilsOS.extract_name(p)
                     for p in token_pipes], text))
        dfs = [
            p.predict(text,
                      output_level=output_level,
                      metadata=full_metadata,
                      positions=positions) for p in token_pipes
        ]

        df = pd.concat(dfs, axis=1)
        df = df.loc[:, ~df.columns.duplicated()]
        if show_feature_select:
            exp = st.expander("Select token features to display")
            features = exp.multiselect("Token features",
                                       options=list(df.columns),
                                       default=list(df.columns))
        elif features is None:
            # No selector and no explicit list: show everything.
            features = list(df.columns)
        # Embedding columns are not displayed. The previous condition
        # `'entities' and 'embedding' in f` evaluated to `'embedding' in f`, so
        # that effective behavior is kept. NOTE(review): confirm whether
        # 'entities' columns were meant to be filtered as well.
        # A new list is built so consecutive matches are no longer skipped and
        # the caller's list is left untouched.
        features = [f for f in features if 'embedding' not in f]
        st.dataframe(df[features])
        if show_infos:
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes=[pipe])
            StreamlitVizTracker.display_footer()
예제 #4
0
    def viz_streamlit_dashboard(
            pipe,
            # Base Params
            text: Union[str, List[str], pd.DataFrame, pd.Series],
            model_selection: List[str] = [],
            # NER PARAMS
            # default_ner_model2viz:Union[str, List[str]] = 'en.ner.onto.electra.base',
            # SIMILARITY PARAMS
            similarity_texts: Tuple[str, str] = ('I love NLU <3',
                                                 'I love Streamlit <3'),
            title: str = 'NLU ❤️ Streamlit - Prototype your NLP startup in 0 lines of code',
            sub_title: str = 'Play with over 1000+ scalable enterprise NLP models',
            side_info: str = None,
            # UI PARAMS
            visualizers: List[str] = ("dependency_tree", "ner", "similarity",
                                      "token_features", 'classification',
                                      'manifold'),
            show_models_info: bool = True,
            show_model_select: bool = False,
            show_viz_selection: bool = False,
            show_logo: bool = True,
            set_wide_layout_CSS: bool = True,
            show_code_snippets: bool = False,
            model_select_position: str = 'side',  # main or side
            display_infos: bool = True,
            key: str = "NLU_streamlit",
            display_footer: bool = True,
            num_similarity_cols: int = 2,

            # NEW PARAMS
            # MANIfold
            num_manifold_cols: int = 3,
            manifold_algos: List[str] = ('TSNE'),

            # SIMY
            similarity_algos: List[str] = ('COSINE'),
    ) -> None:
        """Visualize either individual building blocks for streamlit or a full UI to experiment and explore models with.

        :param pipe: NLU pipeline used whenever its ``nlu_ref`` matches the
            selected NER model; otherwise a pipeline is fetched for the selection.
        :param text: Text shown in the text area; a list is joined with newlines.
        :param model_selection: Models offered in the NER select box. When empty,
            the discovered NER components are used. The passed list is not modified.
        :param similarity_texts: Texts handed to the similarity and manifold blocks.
        :param title: Page title, skipped when falsy.
        :param sub_title: Sub-header, skipped when falsy.
        :param side_info: Markdown rendered in the sidebar, skipped when falsy.
        :param visualizers: Names of the building blocks to render, in order.
        :param show_models_info: Display model info after the visualizers.
        :param show_model_select: Show the NER model select box and the
            "Generate code snippets" checkbox.
        :param show_viz_selection: Show a sidebar multiselect over ``visualizers``.
        :param show_logo: Display the logo.
        :param set_wide_layout_CSS: Apply the wide block-container style.
        :param show_code_snippets: Initial value for code snippet generation.
        :param model_select_position: ``'side'`` for sidebar widgets, any other
            value for the main area.
        :param display_infos: Display the footer at the end.
        :param key: Prefix for all Streamlit widget keys.
        :param display_footer: When False the footer is marked as already displayed.
        :param num_similarity_cols: Column count for the similarity block.
        :param num_manifold_cols: Column count for the manifold block.
        :param manifold_algos: Not read by this function. Note that the default
            ``('TSNE')`` is a plain string, not a tuple.
        :param similarity_algos: Not read by this function. Note that the default
            ``('COSINE')`` is a plain string, not a tuple.
        """
        StreamlitVizTracker.footer_displayed = not display_footer
        if set_wide_layout_CSS: _set_block_container_style()
        if title: st.title(title)
        if sub_title: st.subheader(sub_title)
        if show_logo: StreamlitVizTracker.show_logo()
        if side_info: st.sidebar.markdown(side_info)
        if isinstance(text, list): text = '\n'.join(text)
        text = st.text_area("Enter text you want to visualize below",
                            text,
                            key=key)
        ner_model_2_viz = pipe.nlu_ref
        if show_model_select:
            show_code_snippets = st.sidebar.checkbox('Generate code snippets',
                                                     value=show_code_snippets)
            if model_selection == []:
                model_selection = Discoverer.get_components('ner',
                                                            include_pipes=True)
            # `sorted` copies, so neither the caller's list nor the shared
            # default argument is reordered in place.
            model_selection = sorted(model_selection)
            # Pre-select the loaded model, else 'en.ner', else the first entry
            # (instead of raising ValueError when neither is available).
            current_ref = pipe.nlu_ref.split(' ')[0]
            if current_ref in model_selection:
                default_index = model_selection.index(current_ref)
            elif 'en.ner' in model_selection:
                default_index = model_selection.index('en.ner')
            else:
                default_index = 0
            if model_select_position == 'side':
                ner_model_2_viz = st.sidebar.selectbox("Select a NER model.",
                                                       model_selection,
                                                       index=default_index)
            else:
                # The fallback call used to omit `model_selection`, which made
                # `st.selectbox` fail for a missing `options` argument.
                ner_model_2_viz = st.selectbox("Select a NER model",
                                               model_selection,
                                               index=default_index)

        active_visualizers = visualizers
        if show_viz_selection:
            active_visualizers = st.sidebar.multiselect("Visualizers",
                                                        options=visualizers,
                                                        default=visualizers,
                                                        key=key)

        all_models = ner_model_2_viz + ' en.dep.typed ' if 'dependency_tree' in active_visualizers else ner_model_2_viz
        ner_pipe, tree_pipe = None, None

        def resolve_selected_pipe():
            # Reuse the passed pipe when it already is the selected model.
            if pipe.nlu_ref == ner_model_2_viz:
                return pipe
            return StreamlitUtilsOS.get_pipe(ner_model_2_viz)

        for viz in active_visualizers:
            if viz == 'ner':
                ner_pipe = resolve_selected_pipe()
                NERStreamlitBlock.visualize_ner(
                    ner_pipe,
                    text,
                    generate_code_sample=show_code_snippets,
                    key=key + '_ner',
                    show_model_select=False,
                    show_text_input=True,
                    show_logo=False,
                    show_infos=False)
            elif viz == 'dependency_tree':
                tree_pipe = StreamlitUtilsOS.get_pipe('en.dep.typed')
                DepTreeStreamlitBlock.visualize_dep_tree(
                    tree_pipe,
                    text,
                    generate_code_sample=show_code_snippets,
                    key=key + '_dep',
                    show_infos=False,
                    show_logo=False)
            elif viz == 'token_features':
                ner_pipe = resolve_selected_pipe()
                TokenFeaturesStreamlitBlock.visualize_tokens_information(
                    ner_pipe,
                    text,
                    generate_code_sample=show_code_snippets,
                    key=key + '_tok',
                    model_select_position=model_select_position,
                    show_infos=False,
                    show_logo=False,
                )
            elif viz == 'classification':
                ner_pipe = resolve_selected_pipe()
                ClassifierStreamlitBlock.visualize_classes(
                    ner_pipe,
                    text,
                    generate_code_sample=show_code_snippets,
                    key=key + '_class',
                    model_select_position=model_select_position,
                    show_infos=False,
                    show_logo=False)
            elif viz == 'similarity':
                ner_pipe = resolve_selected_pipe()
                WordSimilarityStreamlitBlock.display_word_similarity(
                    ner_pipe,
                    similarity_texts,
                    generate_code_sample=show_code_snippets,
                    model_select_position=model_select_position,
                    show_infos=False,
                    show_logo=False,
                    num_cols=num_similarity_cols,
                    key=key + '_sim')
            elif viz == 'manifold':
                # Unlike the other blocks, the manifold block reuses `pipe` when
                # the selection is any of the space-separated refs in `nlu_ref`.
                ner_pipe = pipe if ner_model_2_viz in pipe.nlu_ref.split(
                    ' ') else StreamlitUtilsOS.get_pipe(ner_model_2_viz)
                WordEmbeddingManifoldStreamlitBlock.viz_streamlit_word_embed_manifold(
                    ner_pipe,
                    similarity_texts,
                    generate_code_sample=show_code_snippets,
                    model_select_position=model_select_position,
                    show_infos=False,
                    show_logo=False,
                    num_cols=num_manifold_cols,
                    key=key + '_mani')

        models_to_display_info_for = [
            p for p in (ner_pipe, tree_pipe) if p is not None
        ]
        if show_models_info:
            StreamlitVizTracker.display_model_info(all_models,
                                                   models_to_display_info_for)
        if display_infos: StreamlitVizTracker.display_footer()
예제 #5
0
    def viz_streamlit_word_embed_manifold(
            pipe, # nlu component_list
            default_texts: List[str] = ("Donald Trump likes to party!", "Angela Merkel likes to party!", 'Peter HATES TO PARTTY!!!! :('),
            title: Optional[str] = "Lower dimensional Manifold visualization for word embeddings",
            sub_title: Optional[str] = "Apply any of the 11 `Manifold` or `Matrix Decomposition` algorithms to reduce the dimensionality of `Word Embeddings` to `1-D`, `2-D` and `3-D` ",
            write_raw_pandas : bool = False ,
            default_algos_to_apply : List[str] = ("TSNE", "PCA"),#,'LLE','Spectral Embedding','MDS','ISOMAP','SVD aka LSA','DictionaryLearning','FactorAnalysis','FastICA','KernelPCA',),  # LatentDirichletAllocation 'NMF',
            target_dimensions : List[int] = (1,2,3),
            show_algo_select : bool = True,
            show_embed_select : bool = True,
            show_color_select: bool = True,
            MAX_DISPLAY_NUM:int=200000,
            display_embed_information:bool=True,
            set_wide_layout_CSS:bool=True,
            num_cols: int = 3,
            model_select_position:str = 'side', # side or main
            key:str = "NLU_streamlit",
            additional_classifiers_for_coloring:List[str]=['pos', 'sentiment.imdb'],
            generate_code_sample:bool = False,
            show_infos:bool = True,
            show_logo:bool = True,
            n_jobs: Optional[int] = 3, # False
    ):
        """Plot word embeddings reduced to 1, 2 and/or 3 dimensions in Streamlit.

        For every loaded word-embedding pipeline and every selected reduction
        algorithm, one scatter plot per requested target dimension is written
        into a grid of ``num_cols`` columns.

        :param pipe: NLU pipeline whose embeddings are visualized.
        :param default_texts: Initial content of the text area, one text per line.
        :param title: Header text, skipped when falsy.
        :param sub_title: Sub-header text, skipped when falsy.
        :param write_raw_pandas: Not read by this function.
        :param default_algos_to_apply: Pre-selected reduction algorithms; used
            directly when ``show_algo_select`` is False.
        :param target_dimensions: Which of the dimensions 1, 2, 3 to plot.
        :param show_algo_select: Show the algorithm and embedding multiselects.
        :param show_embed_select: Not read by this function.
        :param show_color_select: Show the select box for the coloring feature;
            when False, points are colored by ``'pos'``.
        :param MAX_DISPLAY_NUM: Upper bound applied to ``len(default_texts)`` and
            to the length of the entered text.
        :param display_embed_information: Not read by this function.
        :param set_wide_layout_CSS: Apply the wide block-container style.
        :param num_cols: Number of plot columns per row.
        :param model_select_position: ``'side'`` for sidebar widgets, any other
            value for the main area.
        :param key: Streamlit widget key.
        :param additional_classifiers_for_coloring: NLU references of classifiers
            to load for coloring; references containing ``'pos'`` are skipped.
        :param generate_code_sample: Not read by this function.
        :param show_infos: Display model info and footer.
        :param show_logo: Display the logo.
        :param n_jobs: Forwarded to ``StreamlitUtilsOS.get_manifold_algo``.
        """
        from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS
        StreamlitVizTracker.footer_displayed=False
        try :
            import plotly.express as px
            # Imported only to verify that sklearn is available.
            from sklearn.metrics.pairwise import distance_metrics
        except ImportError:
            st.error("You need the sklearn and plotly package in your Python environment installed for similarity visualizations. Run <pip install sklearn plotly>")
            # Nothing below can work without plotly, so stop after reporting
            # instead of failing later on an undefined `px`.
            return
        if len(default_texts) > MAX_DISPLAY_NUM : default_texts = default_texts[:MAX_DISPLAY_NUM]
        if show_logo :StreamlitVizTracker.show_logo()
        if set_wide_layout_CSS : _set_block_container_style()
        if title:st.header(title)
        if sub_title:st.subheader(sub_title)

        data = st.text_area('Enter N texts, seperated by new lines to visualize Word Embeddings for ','\n'.join(default_texts))
        if len(data) > MAX_DISPLAY_NUM : data = data[:MAX_DISPLAY_NUM]
        original_text = nlu.load('tokenize').predict(data.split("\n"),output_level='document')['document'].values

        # Defaults for the case that the corresponding selectors are hidden;
        # without them the plotting loop below hit undefined names.
        feature_to_color_by = 'pos'
        algos = default_algos_to_apply
        if show_color_select:
            if model_select_position == 'side' : feature_to_color_by =  st.sidebar.selectbox('Pick a feature to color points in manifold by ',['pos','sentiment',],0)
            else:feature_to_color_by =  st.selectbox('Feature to color plots by ',['pos','sentiment',],0)
        text_col = 'token'
        embed_algos_to_load = []
        e_coms = StreamlitUtilsOS.find_all_embed_components(pipe)

        if show_algo_select :
            exp = st.expander("Select additional manifold and dimension reduction techniques to apply")

            algos = exp.multiselect(
                "Reduce embedding dimensionality to something visualizable",
                options=("TSNE", "ISOMAP",'LLE','Spectral Embedding','MDS','PCA','SVD aka LSA','DictionaryLearning','FactorAnalysis','FastICA','KernelPCA',),default=default_algos_to_apply,)

            emb_components_usable = [e for e in Discoverer.get_components('embed',True, include_aliases=True) if 'chunk' not in e and 'sentence' not in e]
            loaded_embed_nlu_refs = []
            for c in e_coms :
                r = c.nlu_ref
                if 'en.' not in r and 'embed.' not  in r and 'ner' not in r : loaded_embed_nlu_refs.append('en.embed.' + r)
                elif 'en.'  in r and 'embed.' not  in r  and 'ner' not in r:
                    # NOTE(review): index 0 is the part *before* 'en.', which is
                    # empty for refs starting with 'en.' — confirm whether the
                    # part after the prefix was intended.
                    r = r.split('en.')[0]
                    loaded_embed_nlu_refs.append('en.embed.' + r)
                else :
                    loaded_embed_nlu_refs.append(StorageRefUtils.extract_storage_ref(c))

            for p in StreamlitVizTracker.loaded_word_embeding_pipes :
                if p != pipe : loaded_embed_nlu_refs.append(p.nlu_ref)
            loaded_embed_nlu_refs = list(set(loaded_embed_nlu_refs))

            # Every default of the multiselect has to be one of its options.
            for l in loaded_embed_nlu_refs:
                if l not in emb_components_usable : emb_components_usable.append(l)
            emb_components_usable.sort()
            loaded_embed_nlu_refs.sort()

            if model_select_position =='side':
                embed_algo_selection   = st.sidebar.multiselect("Pick additional Word Embeddings for the Dimension Reduction",options=emb_components_usable,default=loaded_embed_nlu_refs,key = key)
            else :
                exp = st.expander("Pick additional Word Embeddings")
                embed_algo_selection   = exp.multiselect("Pick additional Word Embeddings for the Dimension Reduction",options=emb_components_usable,default=loaded_embed_nlu_refs,key = key)
            # Only the most recently picked embedding is considered. Slicing
            # keeps an empty selection empty instead of raising IndexError.
            embed_algo_selection = embed_algo_selection[-1:]
            embed_algos_to_load = list(set(embed_algo_selection) - set(loaded_embed_nlu_refs))
        new_embed_pipes = [nlu.load(embedder) for embedder in embed_algos_to_load]
        StreamlitVizTracker.loaded_word_embeding_pipes+=new_embed_pipes
        if pipe not in StreamlitVizTracker.loaded_word_embeding_pipes: StreamlitVizTracker.loaded_word_embeding_pipes.append(pipe)
        for nlu_ref in additional_classifiers_for_coloring :
            # 'pos' is provided by the dedicated token feature pipe below.
            if 'pos' in nlu_ref : continue
            already_loaded = any(p.nlu_ref == nlu_ref for p in StreamlitVizTracker.loaded_document_classifier_pipes)
            if not already_loaded :
                StreamlitVizTracker.loaded_document_classifier_pipes.append(nlu.load(nlu_ref))

        col_index = 0
        cols = st.columns(num_cols)
        token_feature_pipe = StreamlitUtilsOS.get_pipe('pos')
        # Not all pipes have sentiment/pos etc. models for hueing loaded, so
        # FIRST predict with the classifiers / token level feature generators
        # and THEN apply the embedding pipes.
        data = original_text.copy()
        for class_p in StreamlitVizTracker.loaded_document_classifier_pipes:
            data = class_p.predict(data, output_level='document',multithread=False)
            data['text'] = original_text
            # drop embeds of classifiers because bad conversion
            for c in data.columns :
                if 'embedding' in c : data.drop(c, inplace=True,axis=1)
        data['text'] = original_text
        for c in data.columns :
            if 'sentence_embedding' in c : data.drop(c,inplace=True,axis=1)
        if 'document' in data.columns : data.drop('document',inplace=True,axis=1)
        if'pos' in data.columns : data.drop('pos',inplace=True,axis=1)

        hover_data = ['token','text','sentiment', 'pos']  # TODO DEDUCT
        for p in StreamlitVizTracker.loaded_word_embeding_pipes :
            p = StreamlitUtilsOS.merge_token_classifiers_with_embed_pipe(p, token_feature_pipe)
            predictions =   p.predict(data,output_level='token',multithread=False).dropna()
            e_col = StreamlitUtilsOS.find_embed_col(predictions)
            e_com = StreamlitUtilsOS.find_embed_component(p)
            e_com_storage_ref = StorageRefUtils.extract_storage_ref(e_com)
            emb = predictions[e_col]
            mat = np.array([x for x in emb])
            for algo in algos :
                #Only pos values for latent Dirchlet
                if algo == 'LatentDirichletAllocation':mat = np.square(mat)
                if len(mat.shape)>2 : mat = mat.reshape(len(emb),mat.shape[-1])
                # One plot per requested target dimension, always in the
                # order 1-D, 2-D, 3-D.
                for dim in (1, 2, 3):
                    if dim not in target_dimensions : continue
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo,dim,n_jobs).fit_transform(mat)
                    plot_data = {'x':low_dim_data[:,0]}
                    # 1-D results are drawn on a flat line at y = 0.
                    plot_data['y'] = low_dim_data[:,1] if dim > 1 else np.zeros(low_dim_data[:,0].shape)
                    if dim == 3 : plot_data['z'] = low_dim_data[:,2]
                    plot_data.update({'text':predictions[text_col], 'pos':predictions.pos, 'sentiment':predictions.sentiment,'token':predictions.token})
                    tsne_df =  pd.DataFrame(plot_data)
                    if dim == 3 :
                        fig = px.scatter_3d(tsne_df, x="x", y="y", z='z',color=feature_to_color_by, hover_data=hover_data)
                    else :
                        fig = px.scatter(tsne_df, x="x", y="y",color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Word-Embeddings =`{e_com_storage_ref}`, Manifold-Algo =`{algo}` for `D={dim}`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig,key=key)
                    col_index+=1
                    # Start a new row of columns once the current one is full.
                    if col_index == num_cols :
                        cols = st.columns(num_cols)
                        col_index = 0
            # Todo fancy embed infos etc

        if show_infos :
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes = [pipe])
            StreamlitVizTracker.display_footer()
예제 #6
0
    def display_word_similarity(
        pipe,  # nlu component_list
        default_texts: Tuple[str, str] = ("Donald Trump likes to party!",
                                          "Angela Merkel likes to party!"),
        threshold: float = 0.5,
        title: Optional[
            str] = "Embeddings Similarity Matrix &  Visualizations  ",
        sub_tile: Optional[
            str] = "Visualize `word-wise similarity matrix` and calculate `similarity scores` for `2 texts` and every `word embedding` loaded",
        write_raw_pandas: bool = False,
        display_embed_information: bool = True,
        similarity_matrix=True,
        show_algo_select: bool = True,
        dist_metrics: List[str] = ('cosine'),
        set_wide_layout_CSS: bool = True,
        generate_code_sample: bool = False,
        key: str = "NLU_streamlit",
        num_cols: int = 2,
        display_scalar_similarities: bool = False,
        display_similarity_summary: bool = False,
        model_select_position: str = 'side',  # main or side
        show_infos: bool = True,
        show_logo: bool = True,
    ):
        """We visualize the following cases :
        1. Simmilarity between 2 words - > sim (word_emb1, word_emb2)
        2. Simmilarity between 2 sentences -> let weTW stand word word_emb of token T and sentence S
            2.1. Raw token level with merged embeddings -> sim([we11,we21,weT1], [we12,we22,weT2])
            2.2  Autogenerate sentemb, basically does 2.1 in the Spark NLP backend
            2.3 Already using sentence_embedder model -> sim(se1,se2)
        3. Simmilarity between token and sentence -> sim([we11,w21,wT1], se2)
        4. Mirrored 3
         """
        # https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
        StreamlitVizTracker.footer_displayed = False
        try:
            import plotly.express as px
            from sklearn.metrics.pairwise import distance_metrics
        except:
            st.error(
                "You need the sklearn and plotly package in your Python environment installed for similarity visualizations. Run <pip install sklearn plotly>"
            )
        if set_wide_layout_CSS: _set_block_container_style()
        if title: st.header(title)
        if show_logo: StreamlitVizTracker.show_logo()
        if sub_tile: st.subheader(sub_tile)

        StreamlitVizTracker.loaded_word_embeding_pipes = []
        dist_metric_algos = distance_metrics()
        dist_algos = list(dist_metric_algos.keys())
        if 'haversine' in dist_algos:
            dist_algos.remove('haversine')  # not applicable in >2D
        if 'precomputed' in dist_algos:
            dist_algos.remove('precomputed')  # Not a dist
        cols = st.columns(2)
        text1 = cols[0].text_input("Text or word1",
                                   default_texts[0],
                                   key=key + 'field_1')
        text2 = cols[1].text_input(
            "Text or word2", default_texts[1], key=key +
            'field_2') if len(default_texts) > 1 else cols[1].text_input(
                "Text or word2", 'Please enter second string', key=key)
        # exp = st.sidebar.beta_expander("Select additional Embedding Models and distance metric to compare ")
        e_coms = StreamlitUtilsOS.find_all_embed_components(pipe)
        embed_algos_to_load = []
        embed_pipes = [pipe]
        dist_algo_selection = dist_metrics
        if show_algo_select:
            # emb_components_usable = Discoverer.get_components('embed')
            emb_components_usable = [
                e for e in Discoverer.get_components(
                    'embed', True, include_aliases=True)
                if 'chunk' not in e and 'sentence' not in e
            ]
            loaded_embed_nlu_refs = []
            loaded_storage_refs = []
            loaded_embed_nlu_refs = list(set(loaded_embed_nlu_refs))

            for c in e_coms:
                r = c.nlu_ref
                if 'en.' not in r and 'embed.' not in r and 'ner' not in r:
                    loaded_embed_nlu_refs.append('en.embed.' + r)
                elif 'en.' in r and 'embed.' not in r and 'ner' not in r:
                    r = r.split('en.')[0]
                    loaded_embed_nlu_refs.append('en.embed.' + r)
                else:
                    loaded_embed_nlu_refs.append(
                        StorageRefUtils.extract_storage_ref(c))
                loaded_storage_refs.append(
                    StorageRefUtils.extract_storage_ref(c))
            for p in StreamlitVizTracker.loaded_word_embeding_pipes:
                if p != pipe: loaded_embed_nlu_refs.append(p.nlu_ref)
            for l in loaded_embed_nlu_refs:
                if l not in emb_components_usable:
                    emb_components_usable.append(l)
            # embed_algo_selection = exp.multiselect("Click to pick additional Embedding Algorithm",options=emb_components_usable,default=loaded_embed_nlu_refs,key = key)
            # dist_algo_selection = exp.multiselect("Click to pick additional Distance Metric", options=dist_algos, default=dist_metrics, key = key)
            emb_components_usable.sort()
            loaded_embed_nlu_refs.sort()
            dist_algos.sort()
            if model_select_position == 'side':
                embed_algo_selection = st.sidebar.multiselect(
                    "Pick additional Word Embeddings for the Similarity Matrix",
                    options=emb_components_usable,
                    default=loaded_embed_nlu_refs,
                    key=key)
                dist_algo_selection = st.sidebar.multiselect(
                    "Pick additional Similarity Metrics ",
                    options=dist_algos,
                    default=dist_metrics,
                    key=key)
            else:
                exp = st.expander(
                    "Pick additional Word Embeddings and Similarity Metrics")
                embed_algo_selection = exp.multiselect(
                    "Pick additional Word Embeddings for the Similarity Matrix",
                    options=emb_components_usable,
                    default=loaded_embed_nlu_refs,
                    key=key)
                dist_algo_selection = exp.multiselect(
                    "Pick additional Similarity Metrics ",
                    options=dist_algos,
                    default=dist_metrics,
                    key=key)
            embed_algos_to_load = list(
                set(embed_algo_selection) - set(loaded_embed_nlu_refs))

        for embedder in embed_algos_to_load:
            embed_pipes.append(nlu.load(embedder))

        if generate_code_sample:
            st.code(
                get_code_for_viz(
                    'SIMILARITY',
                    [StreamlitUtilsOS.extract_name(p)
                     for p in embed_pipes], default_texts))

        StreamlitVizTracker.loaded_word_embeding_pipes += embed_pipes
        similarity_metrics = {}
        embed_vector_info = {}
        cols_full = True
        col_index = 0
        # for p in embed_pipes :
        for p in StreamlitVizTracker.loaded_word_embeding_pipes:
            data1 = p.predict(text1, output_level='token',
                              get_embeddings=True).dropna()
            data2 = p.predict(text2, output_level='token',
                              get_embeddings=True).dropna()
            e_coms = StreamlitUtilsOS.find_all_embed_components(p)
            modelhub_links = [
                ModelHubUtils.get_url_by_nlu_refrence(c.nlu_ref)
                for c in e_coms
            ]
            e_cols = StreamlitUtilsOS.get_embed_cols(p)
            for num_emb, e_col in enumerate(e_cols):
                if col_index == num_cols - 1: cols_full = True
                if cols_full:
                    cols = st.columns(num_cols)
                    col_index = 0
                    cols_full = False
                else:
                    col_index += 1
                tok1 = data1['token']
                tok2 = data2['token']
                emb1 = data1[e_col]
                emb2 = data2[e_col]

                def normalize_matrix(m):
                    return np.nan_to_num(
                        m / np.linalg.norm(m, axis=1, keepdims=True))

                embed_mat1 = normalize_matrix(np.array([x for x in emb1]))
                embed_mat2 = normalize_matrix(np.array([x for x in emb2]))
                # e_name = e_col.split('word_embedding_')[-1]
                e_name = e_coms[num_emb].nlu_ref
                e_name = e_name.split(
                    'embed.')[-1] if 'en.' in e_name else e_name
                if 'ner' in e_name: e_name = loaded_storage_refs[num_emb]

                embed_vector_info[e_name] = {
                    "Vector Dimension ":
                    embed_mat1.shape[1],
                    "Num Vectors":
                    embed_mat1.shape[0] + embed_mat1.shape[0],
                    "NLU_reference":
                    e_coms[num_emb].nlu_ref,
                    "Spark_NLP_reference":
                    ModelHubUtils.NLU_ref_to_NLP_ref(e_coms[num_emb].nlu_ref),
                    "Storage Reference":
                    loaded_storage_refs[num_emb],
                    'Modelhub info':
                    modelhub_links[num_emb]
                }
                for dist_algo in dist_algo_selection:
                    # scalar_similarities[e_col][dist_algo]={}
                    sim_score = ((dist_metric_algos[dist_algo]
                                  (embed_mat1, embed_mat2) - 1) * -1)

                    sim_score = pd.DataFrame(sim_score)
                    sim_score.index = tok1.values
                    sim_score.columns = tok2.values
                    sim_score.columns = StreamlitVizTracker.pad_duplicate_tokens(
                        list(sim_score.columns))
                    sim_score.index = StreamlitVizTracker.pad_duplicate_tokens(
                        list(sim_score.index))
                    if write_raw_pandas: st.write(sim_score, key=key)
                    if sim_score.shape == (1, 1):
                        sim_score = sim_score.iloc[0][0]
                        sim_score = round(sim_score, 2)
                        if sim_score > threshold:
                            st.success(sim_score)
                            st.success(
                                f'Scalar Similarity={sim_score} for distance metric={dist_algo}'
                            )
                            st.error(
                                'No similarity matrix for only 2 tokens. Try entering at least 1 sentences in a field'
                            )
                        else:
                            st.error(
                                f'Scalar Similarity={sim_score} for distance metric={dist_algo}'
                            )
                    else:
                        ploty_avaiable = True
                        # for tok emb, sum rows and norm by rows, then sum cols and norm by cols to generate a scalar from matrix
                        scalar_sim_score = np.sum(
                            (np.sum(sim_score, axis=0) /
                             sim_score.shape[0])) / sim_score.shape[1]
                        scalar_sim_score = round(scalar_sim_score, 2)

                        if display_scalar_similarities:
                            if scalar_sim_score > threshold:
                                st.success(
                                    f'Scalar Similarity :{scalar_sim_score} for distance metric={dist_algo}'
                                )
                            else:
                                st.error(
                                    f'Scalar Similarity :{scalar_sim_score} for embedder={e_col} distance metric={dist_algo}'
                                )
                        if similarity_matrix:
                            if ploty_avaiable:
                                fig = px.imshow(
                                    sim_score, labels=dict(color="similarity")
                                )  # , title=f'Simmilarity Matrix for embedding_model={e_name} distance metric={dist_algo}')
                                # st.write(fig,key =key)
                                similarity_metrics[
                                    f'{e_name}_{dist_algo}_similarity'] = {
                                        'scalar_similarity': scalar_sim_score,
                                        'dist_metric': dist_algo,
                                        'embedding_model': e_name,
                                        'modelhub_info':
                                        modelhub_links[num_emb],
                                    }
                                subh = f"""Embedding-Model=`{e_name}`, Similarity-Score=`{scalar_sim_score}`,  distance metric=`{dist_algo}`"""
                                cols[col_index].markdown(subh)
                                cols[col_index].write(fig, key=key)
                            else:
                                pass  # todo fallback plots

        if display_similarity_summary:
            exp = st.expander("Similarity summary")
            exp.write(similarity_metrics)
        if display_embed_information:
            exp = st.expander("Embedding vector information")
            exp.write(embed_vector_info)
        if show_infos:
            # VizUtilsStreamlitOS.display_infos()
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes=[pipe])
            StreamlitVizTracker.display_footer()
예제 #7
0
    def viz_streamlit_sentence_embed_manifold(
            pipe,  # nlu component_list
            default_texts: List[str] = (
            "Donald Trump likes to party!", "Angela Merkel likes to party!", 'Peter HATES TO PARTTY!!!! :('),
            title: Optional[str] = "Lower dimensional Manifold visualization for Sentence embeddings",
            sub_title: Optional[
                str] = "Apply any of the 11 `Manifold` or `Matrix Decomposition` algorithms to reduce the dimensionality of `Sentence Embeddings` to `1-D`, `2-D` and `3-D` ",
            write_raw_pandas: bool = False,
            default_algos_to_apply: List[str] = ("TSNE", "PCA"),
            # ,'LLE','Spectral Embedding','MDS','ISOMAP','SVD aka LSA','DictionaryLearning','FactorAnalysis','FastICA','KernelPCA',),  # LatentDirichletAllocation 'NMF',
            target_dimensions: List[int] = (1, 2, 3),
            show_algo_select: bool = True,
            show_embed_select: bool = True,
            show_color_select: bool = True,
            MAX_DISPLAY_NUM: int = 200000,
            display_embed_information: bool = True,
            set_wide_layout_CSS: bool = True,
            num_cols: int = 3,
            model_select_position: str = 'side',  # side or main
            key: str = "NLU_streamlit",
            additional_classifiers_for_coloring: List[str] = ('sentiment.imdb',),
            generate_code_sample: bool = False,
            show_infos: bool = True,
            show_logo: bool = True,
            n_jobs: Optional[int] = 3,  # False
    ):
        """Plot sentence embeddings reduced to 1-D, 2-D and/or 3-D inside a Streamlit app.

        The texts entered in the text area (one per line) are run through every pipe in
        ``StreamlitVizTracker.loaded_document_classifier_pipes`` and then through every
        pipe in ``StreamlitVizTracker.loaded_sentence_embeding_pipes``. For each embedding
        pipe, each selected reduction algorithm and each requested target dimension, one
        plotly scatter plot is written into a grid of ``num_cols`` Streamlit columns.

        :param pipe: NLU pipe; it is added to the tracked sentence-embedding pipes if
            missing and its ``nlu_ref`` is used for the model info section.
        :param default_texts: Initial content of the text area. A sequence of strings is
            joined with newlines so every entry becomes its own text.
        :param title: Header text; skipped when falsy.
        :param sub_title: Sub-header text; skipped when falsy.
        :param default_algos_to_apply: Reduction algorithms pre-selected in the selector,
            and the algorithms applied when ``show_algo_select`` is False.
        :param target_dimensions: Any of 1, 2, 3; one plot is rendered per listed value.
        :param show_algo_select: Show the algorithm, embedding and classifier selectors.
        :param show_color_select: Show the select box for the column used to color points.
            When False, the first classifier column is used (None if there is none).
        :param set_wide_layout_CSS: Call ``_set_block_container_style()`` before rendering.
        :param num_cols: Number of Streamlit columns per row of plots.
        :param model_select_position: ``'side'`` puts the selectors in the sidebar, any
            other value puts them in the main area.
        :param key: Key passed to the multiselect widgets and to ``write``.
        :param additional_classifiers_for_coloring: NLU references of classifiers that are
            loaded (unless already tracked) so their outputs can color the points.
            The passed sequence is never modified.
        :param show_infos: Display model info and footer at the end.
        :param show_logo: Display the logo at the top.
        :param n_jobs: Forwarded to ``StreamlitUtilsOS.get_manifold_algo``.
        :param write_raw_pandas: Not read by this function.
        :param show_embed_select: Not read by this function.
        :param MAX_DISPLAY_NUM: Not read by this function.
        :param display_embed_information: Not read by this function.
        :param generate_code_sample: Not read by this function.
        :return: None. If plotly or sklearn cannot be imported, an error is shown and the
            function returns without rendering anything else.
        """
        from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS
        StreamlitVizTracker.footer_displayed = False

        try:
            import plotly.express as px
            # Not used below; importing it verifies that sklearn is installed.
            from sklearn.metrics.pairwise import distance_metrics  # noqa: F401
        except ImportError:
            st.error(
                "You need the sklearn and plotly package in your Python environment installed for similarity visualizations. Run <pip install sklearn plotly>")
            # Nothing below can work without plotly, so stop instead of hitting a NameError.
            return

        if show_logo: StreamlitVizTracker.show_logo()
        if set_wide_layout_CSS: _set_block_container_style()
        if title: st.header(title)
        if sub_title: st.subheader(sub_title)

        # The text area shows a single string; join sequences so each text is on its own line.
        if not isinstance(default_texts, str):
            default_texts = '\n'.join(default_texts)
        data = st.text_area('Enter N texts, seperated by new lines to visualize Sentence Embeddings for ',
                            default_texts)
        # TODO: integrate a `detect_sentence` parameter to switch to output_level='sentence'
        output_level = 'document'
        original_text = nlu.load('tokenize').predict(data.split("\n"), output_level=output_level)[output_level].values
        # Drop empty and missing rows produced by blank input lines.
        original_text = original_text[original_text != '']
        original_text = original_text[~pd.isna(original_text)]

        embed_algos_to_load = []
        class_algos_to_load = []
        # Used as-is when the selector is hidden; overwritten by the user's choice otherwise.
        algos = list(default_algos_to_apply)
        e_coms = StreamlitUtilsOS.find_all_embed_components(pipe)

        if show_algo_select:
            exp = st.expander("Select additional manifold and dimension reduction techniques to apply")

            algos = exp.multiselect(
                "Reduce embedding dimensionality to something visualizable",
                options=(
                "TSNE", "ISOMAP", 'LLE', 'Spectral Embedding', 'MDS', 'PCA', 'SVD aka LSA', 'DictionaryLearning',
                'FactorAnalysis', 'FastICA', 'KernelPCA', 'LatentDirichletAllocation'),
                default=default_algos_to_apply, )

            emb_components_usable = [e for e in Discoverer.get_components('embed', True, include_aliases=True) if
                                     'chunk' not in e and 'sentence' in e]
            # TODO: multi-classifiers are excluded
            classifier_components_usable = [e for e in Discoverer.get_components('classify', True, include_aliases=True)
                                            if 'xx' not in e and 'toxic' not in e and 'e2e' not in e]

            # References of everything already loaded, de-duplicated and sorted.
            loaded_embed_nlu_refs, _ = StreamlitUtilsOS.extract_all_sentence_storage_refs_or_nlu_refs(e_coms)
            loaded_embed_nlu_refs = sorted(
                set(loaded_embed_nlu_refs)
                | {p.nlu_ref for p in StreamlitVizTracker.loaded_sentence_embeding_pipes if p != pipe})
            # Built from a fresh set so the caller's (or the default) sequence is never mutated.
            loaded_classifier_nlu_refs = sorted(
                set(additional_classifiers_for_coloring)
                | {p.nlu_ref for p in StreamlitVizTracker.loaded_document_classifier_pipes if p != pipe})

            # Every default of a multiselect must also be one of its options.
            emb_components_usable += [r for r in loaded_embed_nlu_refs if r not in emb_components_usable]
            classifier_components_usable += [r for r in loaded_classifier_nlu_refs
                                             if r not in classifier_components_usable]
            emb_components_usable.sort()
            classifier_components_usable.sort()

            # Only the container of the embedding selector depends on the position.
            if model_select_position == 'side':
                embed_container = st.sidebar
            else:
                embed_container = st.expander("Pick additional Sentence Embeddings")
            # NOTE(review): both multiselects reuse the same `key`; verify that the targeted
            # Streamlit version accepts duplicate widget keys.
            embed_algo_selection = embed_container.multiselect(
                "Pick additional Sentence Embeddings for the Dimension Reduction", options=emb_components_usable,
                default=loaded_embed_nlu_refs, key=key)
            class_container = st.expander("Pick additional Classifiers")
            class_algo_selection = class_container.multiselect("Pick additional Classifiers to load for coloring points",
                                                               options=classifier_components_usable,
                                                               default=loaded_classifier_nlu_refs, key=key)

            # Only the most recently picked entry is considered; slicing keeps an empty
            # selection empty instead of raising IndexError.
            embed_algos_to_load = list(set(embed_algo_selection[-1:]) - set(loaded_embed_nlu_refs))
            class_algos_to_load = list(set(class_algo_selection[-1:]) - set(loaded_classifier_nlu_refs))

        StreamlitVizTracker.loaded_sentence_embeding_pipes += [nlu.load(ref) for ref in embed_algos_to_load]
        StreamlitVizTracker.loaded_document_classifier_pipes += [nlu.load(ref) for ref in class_algos_to_load]
        if pipe not in StreamlitVizTracker.loaded_sentence_embeding_pipes:
            StreamlitVizTracker.loaded_sentence_embeding_pipes.append(pipe)

        # TODO: redundant with the selector-driven loading above; integrate both paths.
        for nlu_ref in additional_classifiers_for_coloring:
            already_loaded = any(
                p.nlu_ref == nlu_ref for p in StreamlitVizTracker.loaded_document_classifier_pipes)
            if not already_loaded:
                StreamlitVizTracker.loaded_document_classifier_pipes.append(nlu.load(nlu_ref))

        col_index = 0
        cols = st.columns(num_cols)

        # Get classifier predictions; each pipe consumes the output of the previous one.
        data = original_text.copy()
        classifier_cols = []
        for class_pipe in StreamlitVizTracker.loaded_document_classifier_pipes:
            data = class_pipe.predict(data, output_level=output_level, multithread=False)
            classifier_cols += StreamlitUtilsOS.get_classifier_cols(class_pipe)
            data['text'] = original_text
            # drop embeds of classifiers because bad conversion
            data = data.drop(columns=[c for c in data.columns if 'embedding' in c])

        data['text'] = original_text

        # Same default the select boxes start with (index 0); keeps the name bound when
        # the color selector is hidden.
        feature_to_color_by = classifier_cols[0] if classifier_cols else None
        if show_color_select:
            if model_select_position == 'side':
                feature_to_color_by = st.sidebar.selectbox('Pick a feature to color points in manifold by ',
                                                           classifier_cols, 0)
            else:
                feature_to_color_by = st.selectbox('Feature to color plots by ', classifier_cols, 0)

        hover_data = classifier_cols + ['text']

        def render_projection(n_dims, algo, algo_mat, predictions, storage_ref):
            """Reduce `algo_mat` to `n_dims` dimensions and write the plot into the next grid cell."""
            nonlocal cols, col_index
            low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo, n_dims, n_jobs).fit_transform(algo_mat)
            coords = {'x': low_dim_data[:, 0]}
            # 1-D projections are drawn on a flat line at y=0.
            coords['y'] = low_dim_data[:, 1] if n_dims > 1 else np.zeros(low_dim_data[:, 0].shape)
            if n_dims == 3:
                coords['z'] = low_dim_data[:, 2]
            tsne_df = pd.DataFrame({**coords,
                                    **{k: predictions[k] for k in classifier_cols},
                                    'text': original_text})
            if n_dims == 3:
                fig = px.scatter_3d(tsne_df, x="x", y="y", z='z', color=feature_to_color_by, hover_data=hover_data)
            else:
                fig = px.scatter(tsne_df, x="x", y="y", color=feature_to_color_by, hover_data=hover_data)
            subh = f"""Sentence-Embeddings =`{storage_ref}`, Manifold-Algo =`{algo}` for `D={n_dims}`"""
            cols[col_index].markdown(subh)
            cols[col_index].write(fig, key=key)
            col_index += 1
            if col_index == num_cols:
                # Row is full: start a new row of columns.
                cols = st.columns(num_cols)
                col_index = 0

        for embed_pipe in StreamlitVizTracker.loaded_sentence_embeding_pipes:
            predictions = embed_pipe.predict(data, output_level=output_level, multithread=False).dropna()
            e_col = StreamlitUtilsOS.find_embed_col(predictions)
            e_com = StreamlitUtilsOS.find_embed_component(embed_pipe)
            e_com_storage_ref = StorageRefUtils.extract_storage_ref(e_com)
            emb = predictions[e_col]
            mat = np.array(list(emb))
            if len(mat.shape) > 2: mat = mat.reshape(len(emb), mat.shape[-1])
            for algo in algos:
                # Only pos values for latent Dirichlet. The squared copy is local to this
                # algorithm so the following algorithms still see the raw embeddings.
                algo_mat = np.square(mat) if algo == 'LatentDirichletAllocation' else mat
                # calc reduced dimensionality with every algo
                for n_dims in (1, 2, 3):
                    if n_dims in target_dimensions:
                        render_projection(n_dims, algo, algo_mat, predictions, e_com_storage_ref)

            # TODO: fancy embed infos etc. (display_embed_information is not wired up yet)

        if show_infos:
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes=[pipe])
            StreamlitVizTracker.display_footer()
예제 #8
0
    def viz_streamlit_entity_embed_manifold(
            pipe,  # nlu component_list
            default_texts: List[str] = ("Donald Trump likes to visit New York", "Angela Merkel likes to visit Berlin!", 'Peter hates visiting Paris'),
            title: Optional[str] = "Lower dimensional Manifold visualization for Entity embeddings",
            sub_title: Optional[str] = "Apply any of the 10+ `Manifold` or `Matrix Decomposition` algorithms to reduce the dimensionality of `Entity Embeddings` to `1-D`, `2-D` and `3-D` ",
            default_algos_to_apply: List[str] = ("TSNE", "PCA"),
            target_dimensions: List[int] = (1, 2, 3),
            show_algo_select: bool = True,
            set_wide_layout_CSS: bool = True,
            num_cols: int = 3,
            model_select_position: str = 'side',  # side or main
            key: str = "NLU_streamlit",
            show_infos: bool = True,
            show_logo: bool = True,
            n_jobs: Optional[int] = 3,  # False
    ):

        from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS
        StreamlitVizTracker.footer_displayed = False

        try:
            import plotly.express as px
            from sklearn.metrics.pairwise import distance_metrics
        except:
            st.error(
                "You need the sklearn and plotly package in your Python environment installed for similarity visualizations. Run <pip install sklearn plotly>")

        if show_logo: StreamlitVizTracker.show_logo()
        if set_wide_layout_CSS: _set_block_container_style()
        if title: st.header(title)
        if sub_title: st.subheader(sub_title)
        # if show_logo :VizUtilsStreamlitOS.show_logo()
        # VizUtilsStreamlitOS.loaded_word_embeding_pipes = []
        if isinstance(default_texts, list) : default_texts = '\n'.join(default_texts)
        data = st.text_area('Enter N texts, seperated by new lines to visualize Sentence Embeddings for ',
                            default_texts).split('\n')
        output_level = 'chunk'
        ner_emebed_pipe_algo_selection = []
        loaded_ner_embed_nlu_refs = []
        algos = ['TSNE']
        # A component_list should have a NER and a Word Embedding
        if pipe not in StreamlitVizTracker.loaded_ner_word_embeding_pipes: StreamlitVizTracker.loaded_ner_word_embeding_pipes.append(
            pipe)
        if pipe not in StreamlitVizTracker.loaded_word_embeding_pipes: StreamlitVizTracker.loaded_word_embeding_pipes.append(
            pipe)

        if show_algo_select:
            # Manifold Selection
            exp = st.expander("Select additional manifold and dimension reduction techniques to apply")
            algos = exp.multiselect(
                "Reduce embedding dimensionality to something visualizable",
                options=(
                    "TSNE", "ISOMAP", 'LLE', 'Spectral Embedding', 'MDS', 'PCA', 'SVD aka LSA', 'DictionaryLearning',
                    'FactorAnalysis', 'FastICA', 'KernelPCA', 'LatentDirichletAllocation'),
                default=default_algos_to_apply, )
            ner_emb_components_usable = [e for e in Discoverer.get_components('ner', True, include_aliases=True) if
                                         'embed' not in e and 'sentence' not in e]

            # Find nlu_ref of currenlty loaded component_list
            for p in StreamlitVizTracker.loaded_ner_word_embeding_pipes:
                loaded_ner_embed_nlu_refs.append(p.nlu_ref)

            # NER Selection
            if model_select_position == 'side':
                ner_emebed_pipe_algo_selection = st.sidebar.multiselect(
                    "Pick additional NER Models for the Dimension Reduction", options=ner_emb_components_usable,
                    default=loaded_ner_embed_nlu_refs, key=key)
            else:
                ner_emebed_pipe_algo_selection = exp.multiselect(
                    "Pick additional NER Models for the Dimension Reduction", options=ner_emb_components_usable,
                    default=loaded_ner_embed_nlu_refs, key=key)

        for ner_nlu_ref in ner_emebed_pipe_algo_selection:
            load = True
            for ner_p in StreamlitVizTracker.loaded_ner_word_embeding_pipes:
                if ner_p.nlu_ref == ner_nlu_ref:
                    load = False
                    break
            if not load: continue
            p = nlu.load(ner_nlu_ref)
            if p not in StreamlitVizTracker.loaded_ner_word_embeding_pipes: StreamlitVizTracker.loaded_ner_word_embeding_pipes.append(
                p)
            if p not in StreamlitVizTracker.loaded_word_embeding_pipes: StreamlitVizTracker.loaded_word_embeding_pipes.append(
                p)

        col_index = 0
        cols = st.columns(num_cols)

        def are_cols_full():
            return col_index == num_cols

        for p in StreamlitVizTracker.loaded_ner_word_embeding_pipes:
            p = EntityManifoldUtils.insert_chunk_embedder_to_pipe_if_missing(p)
            predictions = p.predict(data, metadata=True, output_level=output_level, multithread=False).dropna()
            entity_cols = EntityManifoldUtils.get_ner_cols(predictions)
            chunk_embed_col = EntityManifoldUtils.find_chunk_embed_col(predictions)

            # TODO get cols for non default NER? or multi ner setups?
            # features = predictions[EntityManifoldUtils.get_ner_cols(predictions)]
            # e_col = StreamlitUtilsOS.find_embed_col(predictions)
            e_com = StreamlitUtilsOS.find_embed_component(p)
            e_com_storage_ref = StorageRefUtils.extract_storage_ref(e_com)
            emb = predictions[chunk_embed_col]
            mat = np.array([x for x in emb])
            # for ner_emb_p in ps:
            for algo in algos:
                # Only pos values for latent Dirchlet
                if algo == 'LatentDirichletAllocation': mat = np.square(mat)
                if len(mat.shape) > 2: mat = mat.reshape(len(emb), mat.shape[-1])
                hover_data = entity_cols + ['text']
                # calc reduced dimensionality with every algo
                feature_to_color_by = entity_cols[0]
                if 1 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo, 1, n_jobs).fit_transform(mat)
                    x = low_dim_data[:, 0]
                    y = np.zeros(low_dim_data[:, 0].shape)

                    # predictions['text'] = original_text
                    tsne_df = pd.DataFrame({**{'x': x, 'y': y},
                                            **{k: predictions[k] for k in entity_cols},
                                            **{'text': predictions[entity_cols[-1]]}
                                            })
                    fig = px.scatter(tsne_df, x="x", y="y", color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Word-Embeddings =`{e_com_storage_ref}`, NER-Model =`{p.nlu_ref}`, Manifold-Algo =`{algo}` for `D=1`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig, key=key)
                    col_index += 1
                    if are_cols_full():
                        cols = st.columns(num_cols)
                        col_index = 0
                if 2 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo, 2, n_jobs).fit_transform(mat)
                    x = low_dim_data[:, 0]
                    y = low_dim_data[:, 1]
                    tsne_df = pd.DataFrame({**{'x': x, 'y': y},
                                            **{k: predictions[k] for k in entity_cols},
                                            **{'text': predictions[entity_cols[-1]]}
                                            })
                    fig = px.scatter(tsne_df, x="x", y="y", color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Word-Embeddings =`{e_com_storage_ref}`, NER-Model =`{p.nlu_ref}`, Manifold-Algo =`{algo}` for `D=2`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig, key=key)
                    col_index += 1
                    if are_cols_full():
                        cols = st.columns(num_cols)
                        col_index = 0
                if 3 in target_dimensions:
                    low_dim_data = StreamlitUtilsOS.get_manifold_algo(algo, 3, n_jobs).fit_transform(mat)
                    x = low_dim_data[:, 0]
                    y = low_dim_data[:, 1]
                    z = low_dim_data[:, 2]
                    tsne_df = pd.DataFrame({**{'x': x, 'y': y, 'z': z},
                                            **{k: predictions[k] for k in entity_cols},
                                            **{'text': predictions[entity_cols[-1]]}
                                            })
                    fig = px.scatter_3d(tsne_df, x="x", y="y", z='z', color=feature_to_color_by, hover_data=hover_data)
                    subh = f"""Word-Embeddings =`{e_com_storage_ref}`, NER-Model =`{p.nlu_ref}`, Manifold-Algo =`{algo}` for `D=3`"""
                    cols[col_index].markdown(subh)
                    cols[col_index].write(fig, key=key)
                    col_index += 1
                    if are_cols_full():
                        cols = st.columns(num_cols)
                        col_index = 0

                # Todo fancy embed infos etc
                # if display_embed_information: display_embed_vetor_information(e_com,mat)

            # if display_embed_information:
            #     exp = st.expander("Embedding vector information")
            #     exp.write(embed_vector_info)

        if show_infos:
            # VizUtilsStreamlitOS.display_infos()
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes=[pipe])
            StreamlitVizTracker.display_footer()