Пример #1
0
def test_get_partial_table_y_labels():
    """Check that `y_labels` switches the "y" columns between labels and raw values."""
    prepared = ML_prepare(0)

    # With y_labels=True the "y" part must expose the label columns.
    labeled = prepared.get_partial_table(x_section="all", y_labels=True)
    labeled_cols = labeled["y"].columns.to_list()
    assert labeled_cols == ["SV_label", "SVI_label"]

    # With y_labels=False the "y" part must expose the measured columns.
    raw = prepared.get_partial_table(x_section="all", y_labels=False)
    raw_cols = raw["y"].columns.to_list()
    assert raw_cols == ["Settling_velocity", "SVI"]
Пример #2
0
def confusion_matrix_SVC(label: str, section: str, delay: int):
    """
    Fit a LinearSVC pipeline and save a plot of its confusion matrix.

    The data is split 75/25 into train and test parts, a
    StandardScaler + LinearSVC pipeline is fitted on the train part and the
    confusion matrix of its predictions on the test part is plotted and
    written to "figures/SVC/".

    Parameters
    ----------
    label : str
        name of the column under "y" to predict
    section : str
        microorganisms section passed to `get_partial_table` as `x_section`
    delay : int
        delay passed to `ML_prepare`
    """
    data = ML_prepare(delay)
    table_xy = data.get_partial_table(x_section=section, y_labels=True)
    X = table_xy.loc[:, 'x']
    y = table_xy['y', label]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)
    clf = make_pipeline(StandardScaler(),
                        LinearSVC(random_state=0, tol=1e-5, max_iter=100000))
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    y_true = np.asarray(y_test)
    # Fit the encoder on the union of true and predicted labels: a class that
    # appears in the test split but was never predicted would otherwise make
    # `transform` raise ValueError for an unseen label.
    le = preprocessing.LabelEncoder()
    le.fit(np.concatenate([y_true, np.asarray(y_pred)]))
    y_pred = le.transform(y_pred)
    y_test = le.transform(y_true)
    cnf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure()
    np.set_printoptions(precision=2)
    plot_confusion_matrix(cnf_matrix,
                          classes=le.classes_,
                          normalize=False,
                          title='Confusion matrix, without normalization')
    plt.savefig(
        "figures/SVC/LinearSVC_Confusion matrix, without normalization.png",
        bbox_inches="tight")
Пример #3
0
def test_get_partial_table_wrong_input():
    """An unknown `x_section` value must make `get_partial_table` raise AssertionError."""
    prepared = ML_prepare(10)
    bad_section = "blah"

    with pytest.raises(AssertionError):
        prepared.get_partial_table(x_section=bad_section, y_labels=True)
Пример #4
0
def test_get_partial_table_x_sections_lengths():
    """Check the number of "x" columns returned for every section name."""
    prepared = ML_prepare(6)

    # Expected count of "x" columns per section.
    expected_widths = {
        "all": 27,
        "total_counts": 9,
        "filaments": 9,
        "various": 18,
    }
    for name, n_columns in expected_widths.items():
        table = prepared.get_partial_table(x_section=name)
        assert table["x"].shape[1] == n_columns
Пример #5
0
def test_get_partial_table_no_nans():
    """No section's partial table may contain missing values."""
    prepared = ML_prepare(10)

    for name in ("all", "total_counts", "filaments", "various"):
        table = prepared.get_partial_table(x_section=name)
        has_missing = table.isnull().values.any()
        assert not has_missing
Пример #6
0
def test_regr_model_func():
    """`regr_model_func` must return a float score and an sklearn-defined model."""
    prepared = ML_prepare(1)
    table = prepared.get_partial_table(x_section="various", y_labels=False)
    features = table.loc[:, "x"]
    targets = table.loc[:, "y"]

    # Use the first key of the models dictionary as the model under test.
    first_model = list(create_models_dict())[0]
    result = regr_model_func(features, targets, first_model)
    score, fitted_model = result
    assert isinstance(score, float)

    # The fitted object must come from a module whose name mentions sklearn.
    origin = getattr(fitted_model, "__module__", None)
    assert "sklearn" in origin
Пример #7
0
def choose_k_value(section: str, label: str, delay: int):
    """
    Plot the scores returned by `check_K_values` against k for the
    KNeighbors model and ask the user to choose a k value from the plot.

    The function is interactive: it waits for Enter on stdin, shows the
    figure, and then reads the chosen k from stdin.

    Parameters
    ----------
    section : str
        microorganisms section - should be: all or total_counts or filaments or various
    label : str
        label - should be: SV_label or SVI_label
    delay : int
        delay between the microscopic test which took place and the SVI test

    return
    ----------
    k : int
        k chosen by the user (always > 0)

    raises
    ----------
    ValueError
        if `section` or `label` is not one of the supported values, or if the
        value typed by the user is not a positive integer
    """
    if section not in {'all', 'total_counts', 'filaments', 'various'}:
        raise ValueError(
            "Please supply a valid section value: 'all','total_counts', 'filaments', 'various' "
        )
    if label not in {'SV_label', 'SVI_label'}:
        raise ValueError(
            "Please supply a valid label value: 'SV_label', 'SVI_label' ")
    data = ML_prepare(delay)
    table_xy = data.get_partial_table(x_section=section, y_labels=True)
    X = table_xy.loc[:, 'x']
    y = table_xy['y', label]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)
    # NOTE(review): 20 is presumably the upper bound of the k values tried by
    # check_K_values - confirm against that helper.
    scores_data = check_K_values(20, X_train, y_train, X_test, y_test)
    sns.set()
    sns.stripplot(data=scores_data, x='k', y='score', size=10)
    plt.ylim(0, 1)
    print(
        "please look at the graph and choose k value. press enter to continue")
    input()
    plt.show()
    print("please insert k value")
    raw_k = input()
    try:
        k = int(raw_k)
    except ValueError as err:
        # Same exception type as a bare int() would raise, with a clearer message.
        raise ValueError(
            f"k value must be an integer, got {raw_k!r}") from err
    # A non-positive k is not a usable number of neighbors; reject it here so
    # the error surfaces at the prompt rather than later in model fitting.
    if k <= 0:
        raise ValueError("Please supply k value > 0 ")
    return k
Пример #8
0
def get_day3_filaments_svi_data():
    '''
    Build the filaments / SVI data set for a delay of 3 days.

    The data is prepared with `ML_prepare(delay=3)` and restricted to the
    "filaments" section with unlabeled "y" values.

    return
    --------
    filaments_x: pd.DataFrame
        the "x" part of the filaments table
    filaments_svi: pd.Series
        the ("y", "SVI") column of the same table
    '''
    prepared = ML_prepare(delay=3)
    table = prepared.get_partial_table(x_section="filaments", y_labels=False)
    features = table.loc[:, "x"]
    svi = table.loc[:, ("y", "SVI")]
    return features, svi
Пример #9
0
def create_score_list_Knn(labels: list, sections: list, delay_lst: list,
                          k: int) -> list:
    """
    create KNeighbors score list of all the prediction by label type, section of microorganism and delay

    Parameters
    ----------
    labels : list
        list of labels: SV and SVI
    sections : list
        list of microorganisms sections: all, total_counts, filaments and various
    delay_lst : list
        list of delays between the microscopic test which took place and the SVI test
    k : int
        k value for KNeighbors model

    return
    ----------
    score_delay : list
        one entry per (label, section, delay) combination, in that nesting
        order; each entry is the list returned by `score_by_label` with the
        overall `knn.score` value appended as its last element

    raises
    ----------
    ValueError
        if k <= 0
    """

    if k <= 0:
        raise ValueError("Please supply k value > 0 ")
    # The prepared data depends only on the delay, so build it once per delay
    # instead of once per (label, section, delay) combination.
    # Assumes get_partial_table does not modify the prepared object.
    prepared_by_delay = {}
    score_delay = []
    for label in labels:
        for section in sections:
            for delay in delay_lst:
                if delay not in prepared_by_delay:
                    prepared_by_delay[delay] = ML_prepare(delay)
                data = prepared_by_delay[delay]
                table_xy = data.get_partial_table(x_section=section,
                                                  y_labels=True)
                X = table_xy.loc[:, 'x']
                y = table_xy['y', label]
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.25, random_state=42)
                knn = KNeighborsClassifier(n_neighbors=k)
                knn.fit(X_train, y_train)
                y_predict = knn.predict(X_test)
                score_label = score_by_label(y_test, y_predict)
                # Overall score goes last, after the per-label scores.
                score_label.append(knn.score(X_test, y_test))
                score_delay.append(score_label)
    return score_delay
Пример #10
0
def create_score_list_SVC(labels: list, sections: list,
                          delay_lst: list) -> list:
    """
    create LinearSVC score list of all the prediction by label type, section of microorganism and delay

    Parameters
    ----------
    labels : list
        list of labels: SV and SVI
    sections : list
        list of microorganisms section: all, total_counts, filaments and various
    delay_lst : list
        list of delays between the microscopic test which took place and the SVI test

    return
    ----------
    score_delay : list
        one entry per (label, section, delay) combination, in that nesting
        order; each entry is the list returned by `score_by_label` with the
        overall pipeline score appended as its last element
    """

    # The prepared data depends only on the delay, so build it once per delay
    # instead of once per (label, section, delay) combination.
    # Assumes get_partial_table does not modify the prepared object.
    prepared_by_delay = {}
    score_delay = []
    for label in labels:
        for section in sections:
            for delay in delay_lst:
                if delay not in prepared_by_delay:
                    prepared_by_delay[delay] = ML_prepare(delay)
                data = prepared_by_delay[delay]
                table_xy = data.get_partial_table(x_section=section,
                                                  y_labels=True)
                X = table_xy.loc[:, 'x']
                y = table_xy['y', label]
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.25, random_state=42)
                clf = make_pipeline(
                    StandardScaler(),
                    LinearSVC(random_state=0, tol=1e-5, max_iter=100000))
                clf.fit(X_train, y_train)
                y_predict = clf.predict(X_test)
                score_label = score_by_label(y_test, y_predict)
                # Overall score goes last, after the per-label scores.
                score_label.append(clf.score(X_test, y_test))
                score_delay.append(score_label)
    return score_delay
Пример #11
0
def create_section_and_PCA(data: ML_prepare, labled: bool = False):
    """
    Draw a 4x2 grid of PCA plots, one row per section (organism group)
    and one column per "y" output, save it under "figures/" and show it.

    Sections are "all", "filaments", "total_counts" and "various"; each
    subplot is drawn by the helper function "pca_plot" and colored by the
    corresponding "y" column.

    Parameters
    ----------
    data: ML_prepare
        prepared data; its `delay` attribute is used in the figure title
    labled: bool
        passed to `get_partial_table` as `y_labels`; when true the saved
        file name gets the "_labled" suffix
    """
    section_names = ["all", "filaments", "total_counts", "various"]
    fig, axes = plt.subplots(4, 2)
    for row, section_name in enumerate(section_names):
        table = data.get_partial_table(x_section=section_name,
                                       y_labels=labled)
        output_cols = table.loc[:, "y"].columns.tolist()
        # One subplot per output column: the first two "y" columns.
        for col in (0, 1):
            pca_plot(table,
                     color_col=output_cols[col],
                     section=section_name,
                     ax_i=axes[row, col])

    fig.set_figheight(15)
    fig.set_figwidth(15)
    title = f"PCA of groups, colored by output, delay = {data.delay} days"
    fig.suptitle(title, fontsize=20, y=1.02)
    plt.tight_layout()

    suffix = "_labled" if labled else ""
    fig_name = "PCA_by_groups" + suffix

    # Second layout pass kept as in the original so the rendered figure is unchanged.
    plt.tight_layout()
    fig.savefig("figures/" + fig_name + ".png", dpi=150, bbox_inches="tight")
    plt.show()
Пример #12
0
def loop_over_sections_and_y(data: ML_prepare, regr_model):
    '''
    Score one regression model on every section / output combination of
    the given (already delayed) data.

    The sections are visited in the order "all", "filaments",
    "total_counts", "various", and for each section the first two "y"
    columns are used as targets, giving 8 scores in total:
            "all_sv",
            "all_svi",
            "filaments_sv",
            "filaments_svi",
            "total_counts_sv",
            "total_counts_svi",
            "various_sv",
            "various_svi"
    The scores are packed by `insert_scores_to_namedtuple`.

    Parameters
    ---------
    data: ML_prepare
        already generated with the chosen delay
    regr_model: sklearn model object

    return
    ---------
    tup_scores: namedtuple
        the value returned by `insert_scores_to_namedtuple` for the scores
    '''
    collected = []
    for section_name in ("all", "filaments", "total_counts", "various"):
        table = data.get_partial_table(x_section=section_name,
                                       y_labels=False)
        target_names = table.loc[:, "y"].columns.tolist()
        for idx in (0, 1):
            features = table.loc[:, "x"]
            target = table.loc[:, ("y", target_names[idx])]
            # Only the score is needed; the fitted model is discarded.
            result = regr_model_func(features, target, regr_model)
            collected.append(result[0])

    return insert_scores_to_namedtuple(collected)