Exemplo n.º 1
0
def exploration():
    """Render the data-exploration page of the Streamlit app.

    Reads the preprocessed emails CSV from a user-editable path, derives
    per-email features (mean words per sentence and parts tags), then draws
    five Plotly charts while reporting progress in the sidebar.
    """
    sidebar_progress = st.sidebar.progress(0)
    sidebar_status = st.sidebar.empty()

    # ---- Load the dataset -------------------------------------------------
    sidebar_status.text("Load Data")
    st.write("## Load Data 📥 ")
    st.write("Loading Data from :")
    csv_path = st.text_input(
        "Data path", "../../tutorial/data/emails_preprocessed.csv")
    emails = pd.read_csv(csv_path, encoding="utf-8", sep=";")
    sidebar_progress.progress(10)
    n_emails = emails.shape[0]
    st.write("Dataset contains %i emails" % n_emails)
    st.dataframe(emails.head())

    # The column is read from the CSV and evaluated back into Python literals.
    parsed_bodies = emails["structured_body"].apply(ast.literal_eval)
    emails["structured_body"] = parsed_bodies
    sidebar_progress.progress(20)

    # ---- Derive per-email features ----------------------------------------
    mean_words = []
    for _, email in emails.iterrows():
        mean_words.append(mean_words_by_sentence(email, tokenizer))
    emails["mean_words_per_sentence"] = mean_words

    parts_tags = []
    for _, email in emails.iterrows():
        parts_tags.append(structured_score(email))
    emails["parts_tags"] = parts_tags

    # Split each parts_tags entry into its two components, one column each.
    emails[["parts_tags_set", "nb_parts_tags"]] = pd.DataFrame(
        emails["parts_tags"].tolist(), index=emails.index)
    sidebar_progress.progress(30)

    # ---- Build graphs -----------------------------------------------------
    st.write("## Build graphs 👩‍🎨")

    def build_parts_bar():
        # Accumulate tag occurrences over all emails, then plot one bar per tag.
        tag_totals = Counter()
        for tag_set in emails["parts_tags_set"]:
            tag_totals.update(tag_set)
        return go.Figure([
            go.Bar(x=list(tag_totals.keys()), y=list(tag_totals.values()))
        ])

    # (heading, figure factory) pairs, rendered in this exact order. Factories
    # are called lazily so each figure is built right after its heading.
    charts = [
        ("### Distribution of variable Sex",
         lambda: px.pie(emails, names="sexe")),
        ("### Distribution of variable Age",
         lambda: px.histogram(emails, x="age")),
        ("### Structuration of emails",
         build_parts_bar),
        ("### Number of parts tag by email",
         lambda: px.histogram(emails, x="nb_parts_tags")),
        ("### Complexity of body sentences",
         lambda: px.histogram(emails, x="mean_words_per_sentence")),
    ]
    for graph_number, (heading, make_figure) in enumerate(charts, start=1):
        sidebar_status.text("Build graphs %i" % graph_number)
        st.write(heading)
        st.plotly_chart(make_figure())

    sidebar_progress.progress(100)
    sidebar_status.text("Complete")
    st.button("Re-run")
Exemplo n.º 2
0
def _count_errors_by_category(categories, errors):
    """Tally how often each category occurs and how often it is mispredicted.

    Args:
        categories: Iterable of hashable category values, one per email.
        errors: Iterable of truthy/falsy flags aligned with ``categories``;
            a truthy flag marks a prediction error.

    Returns:
        A ``(totals, failures)`` pair of ``Counter`` objects keyed by
        category value. Keys appear in order of first occurrence.
    """
    totals = Counter()
    failures = Counter()
    for category, is_error in zip(categories, errors):
        totals[category] += 1
        if is_error:
            failures[category] += 1
    return totals, failures


def _error_overlay_figure(totals, failures, totals_name):
    """Build an overlaid bar chart of per-category counts and error counts.

    Args:
        totals: ``Counter`` of occurrences per category.
        failures: ``Counter`` of prediction errors per category.
        totals_name: Legend name of the trace showing ``totals``.

    Returns:
        A Plotly ``go.Figure`` with both traces overlaid at 75% opacity.
    """
    figure = go.Figure(data=[
        go.Bar(x=list(totals.keys()), y=list(totals.values()),
               name=totals_name),
        go.Bar(x=list(failures.keys()), y=list(failures.values()),
               name="prediction error"),
    ])
    figure.update_layout(barmode="overlay")
    figure.update_traces(opacity=0.75)
    return figure


def discrimination():
    """Render the model-discrimination page of the Streamlit app.

    Loads the preprocessed emails and their metadata from user-editable
    paths, loads a pickled neural model and its weights, predicts a label for
    every email, and plots how prediction errors are distributed across
    label, sex, age, number of parts tags and mean words per sentence.
    Progress is reported in the sidebar.
    """
    progress_bar = st.sidebar.progress(0)
    status_text = st.sidebar.empty()

    # Load Data
    status_text.text("Load Data")
    st.write("## Load Data 📥 ")
    st.write("Loading Data from :")
    data_path = st.text_input("Data path",
                              "../../tutorial/data/emails_preprocessed.csv")
    df_emails_preprocessed = pd.read_csv(data_path, encoding="utf-8", sep=";")
    progress_bar.progress(10)
    n_emails = df_emails_preprocessed.shape[0]
    st.write("Dataset contains %i emails" % n_emails)
    st.dataframe(df_emails_preprocessed.head())

    # The column is read from the CSV and evaluated back into Python literals.
    df_emails_preprocessed["structured_body"] = df_emails_preprocessed[
        "structured_body"].apply(ast.literal_eval)
    progress_bar.progress(20)
    df_emails_preprocessed["mean_words_per_sentence"] = [
        mean_words_by_sentence(row, tokenizer)
        for _, row in df_emails_preprocessed.iterrows()
    ]
    df_emails_preprocessed["parts_tags"] = [
        structured_score(row)
        for _, row in df_emails_preprocessed.iterrows()
    ]
    # Split each parts_tags entry into its two components, one column each.
    df_emails_preprocessed[["parts_tags_set", "nb_parts_tags"]] = pd.DataFrame(
        df_emails_preprocessed["parts_tags"].tolist(),
        index=df_emails_preprocessed.index,
    )
    progress_bar.progress(30)

    # The new clean_text column is the concatenation of the clean_header
    # column and the clean_body column
    df_emails_preprocessed["clean_text"] = (
        df_emails_preprocessed["clean_header"] + " " +
        df_emails_preprocessed["clean_body"])

    # Metadata input
    # By default the metadata used are :
    # - the extension : gmail, outlook, wanadoo..
    # - the day of the week at which the email has been sent
    # - the hour at which the email has been sent
    # - the minute at which the email has been sent
    status_text.text("Load Metadata")
    st.write("## Load Metadata 📥 ")
    st.write("Loading Metadata from :")
    metadata_path = st.text_input("Metadata path",
                                  "../../tutorial/data/metadata.csv")
    df_meta = pd.read_csv(metadata_path, encoding="utf-8", sep=";")

    st.write(
        "X is a Pandas dataframe with a clean_text column that will be used for the text input and columns \
    containing the dummified metadata")
    # Column-wise concat aligns on the index of both frames.
    X = pd.concat([df_emails_preprocessed["clean_text"], df_meta], axis=1)
    st.dataframe(X.head())

    # NOTE(review): y is a pandas Series of the raw "label" column here, not
    # a numpy array as the displayed message states.
    st.write("y is a numpy array containing the encoded labels")
    y = df_emails_preprocessed["label"]
    st.dataframe(y)

    # Loading the neural network
    status_text.text("Load Model")
    st.write("## Load Neural Network 🔮 ")
    st.write("Loading model pickel from :")
    nn_model_path = st.text_input("Model path",
                                  "../../tutorial/data/nn_model.pickle")
    # The NeuralModel saved as a pickle file has to be loaded first.
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code, and the path comes from a user-editable input. Only point this at
    # trusted model files.
    nn_model = joblib.load(nn_model_path)
    # Then the Keras model and its weights can be loaded
    st.write("Loading weights  from :")
    weights_path = st.text_input("Weights path",
                                 "../../tutorial/data/nn_model")
    nn_model.load_nn_model(weights_path)

    # Generate Label Encoder. Only the fitted mapping is needed (to decode
    # the model output below), so fit() is used rather than fit_transform().
    le = LabelEncoder()
    le.fit(y)

    y_res = nn_model.predict(X)
    y_res = le.inverse_transform(y_res)
    df_emails_preprocessed["prediction"] = y_res
    df_emails_preprocessed["prediction_error"] = y_res != y
    st.write("Model prediction over the choosen dataset")
    st.dataframe(df_emails_preprocessed[[
        "clean_text", "label", "prediction", "prediction_error"
    ]])

    # Build graphs
    st.write("## Build graphs 👩‍🎨")
    # Graphs counter
    graph_number = 0

    # Analyse Prediction Error
    graph_number += 1
    status_text.text("Build graphs %i" % graph_number)
    st.write("### Prediction Error rate ")
    label_counter, label_error_counter = _count_errors_by_category(
        df_emails_preprocessed["label"],
        df_emails_preprocessed["prediction_error"],
    )
    fig_label_error = _error_overlay_figure(
        label_counter, label_error_counter, "label count")
    st.plotly_chart(fig_label_error)

    # Sexe distribution
    graph_number += 1
    status_text.text("Build graphs %i" % graph_number)
    st.write("### Error rate in regards of variable Sex")
    sex_counter, sex_error_counter = _count_errors_by_category(
        df_emails_preprocessed["sexe"],
        df_emails_preprocessed["prediction_error"],
    )
    fig_sex = _error_overlay_figure(
        sex_counter, sex_error_counter, "sex count")
    st.plotly_chart(fig_sex)

    # Age distribution
    graph_number += 1
    status_text.text("Build graphs %i" % graph_number)
    st.write("### Error rate in regards of variable Age")
    fig_age = px.histogram(df_emails_preprocessed,
                           x="age",
                           color="prediction_error")
    fig_age.update_layout(barmode="group")
    st.plotly_chart(fig_age)

    # Analyse structuration of emails bis
    graph_number += 1
    status_text.text("Build graphs %i" % graph_number)
    st.write("### Error rate in regards of number of parts tag by email")
    fig_nb_pt = px.histogram(df_emails_preprocessed,
                             x="nb_parts_tags",
                             color="prediction_error")
    fig_nb_pt.update_layout(barmode="group")
    st.plotly_chart(fig_nb_pt)

    # Analyse complexity of sentences
    graph_number += 1
    status_text.text("Build graphs %i" % graph_number)
    st.write("### Error rate in regards of the complexity of body sentences")
    fig_mw = px.histogram(
        df_emails_preprocessed,
        x="mean_words_per_sentence",
        color="prediction_error",
    )
    fig_mw.update_layout(barmode="group")
    st.plotly_chart(fig_mw)

    progress_bar.progress(100)
    status_text.text("Complete")
    st.button("Re-run")