def test_get_partial_table_y_labels():
    """Check the "y" column names for both values of ``y_labels``.

    With ``y_labels=True`` the "y" columns are expected to be the label
    columns; with ``y_labels=False`` they are expected to be the raw
    measurement columns.
    """
    prepared = ML_prepare(0)
    expected_by_flag = (
        (True, ["SV_label", "SVI_label"]),
        (False, ["Settling_velocity", "SVI"]),
    )
    for use_labels, expected_columns in expected_by_flag:
        table = prepared.get_partial_table(x_section="all", y_labels=use_labels)
        assert table["y"].columns.to_list() == expected_columns
def confusion_matrix_SVC(label: str, section: str, delay: int):
    """
    Fit a LinearSVC pipeline and plot/save its confusion matrix.

    The data for the given delay and section is split 75/25 into train and
    test sets (fixed random_state), a StandardScaler + LinearSVC pipeline is
    fitted on the train part, and the confusion matrix of the test
    predictions is drawn with ``plot_confusion_matrix`` and saved to disk.

    Parameters
    ----------
    label : str
        name of the column under "y" to predict
        (other functions in this file use SV_label or SVI_label)
    section : str
        value passed as ``x_section`` to ``get_partial_table``
    delay : int
        value passed to ``ML_prepare``

    Notes
    -----
    The figure is always written to the same file name, so every call
    overwrites the previous output regardless of label/section/delay.
    """
    data = ML_prepare(delay)
    table_xy = data.get_partial_table(x_section=section, y_labels=True)
    X = table_xy.loc[:, 'x']
    y = table_xy['y', label]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)

    clf = make_pipeline(
        StandardScaler(),
        LinearSVC(random_state=0, tol=1e-5, max_iter=100000))
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    # Fit the encoder on the union of true and predicted labels. Fitting on
    # the predictions alone makes transform() raise ValueError whenever the
    # test split holds a class the classifier never predicted.
    y_true = np.asarray(y_test)
    le = preprocessing.LabelEncoder()
    le.fit(np.concatenate([y_true, np.asarray(y_pred)]))
    y_pred = le.transform(y_pred)
    y_test = le.transform(y_true)

    # Every encoded class occurs in y_test or y_pred, so the matrix has one
    # row/column per entry of le.classes_.
    cnf_matrix = confusion_matrix(y_test, y_pred)

    plt.figure()
    np.set_printoptions(precision=2)
    plot_confusion_matrix(cnf_matrix,
                          classes=le.classes_,
                          normalize=False,
                          title='Confusion matrix, without normalization')
    plt.savefig(
        "figures/SVC/LinearSVC_Confusion matrix, without normalization.png",
        bbox_inches="tight")
def test_get_partial_table_wrong_input():
    """An unknown ``x_section`` value must trigger an AssertionError."""
    prepared = ML_prepare(10)
    with pytest.raises(AssertionError):
        prepared.get_partial_table(x_section="blah", y_labels=True)
def test_get_partial_table_x_sections_lengths():
    """Each section must yield the expected number of "x" columns."""
    prepared = ML_prepare(6)
    expected_widths = {
        "all": 27,
        "total_counts": 9,
        "filaments": 9,
        "various": 18,
    }
    for section_name, width in expected_widths.items():
        table = prepared.get_partial_table(x_section=section_name)
        assert table["x"].shape[1] == width
def test_get_partial_table_no_nans():
    """No section's partial table may contain null values."""
    prepared = ML_prepare(10)
    for section_name in ("all", "total_counts", "filaments", "various"):
        table = prepared.get_partial_table(x_section=section_name)
        has_nulls = table.isnull().values.any()
        assert not has_nulls
def test_regr_model_func():
    """``regr_model_func`` must return a float score and an sklearn object.

    Uses the first key of ``create_models_dict()`` as the model argument and
    the "various" section with delay 1 as data.
    """
    table = ML_prepare(1).get_partial_table(x_section="various", y_labels=False)
    features = table.loc[:, "x"]
    targets = table.loc[:, "y"]

    first_model = list(create_models_dict().keys())[0]
    score, fitted_model = regr_model_func(features, targets, first_model)

    assert isinstance(score, float)
    # The fitted object is expected to come from an sklearn module.
    origin_module = getattr(fitted_model, "__module__", None)
    assert "sklearn" in origin_module
def choose_k_value(section: str, label: str, delay: int):
    """
    Plot KNeighbors scores per k and ask the user to pick a k value.

    The scores come from ``check_K_values`` (called with 20 as its first
    argument) on a 75/25 train/test split of the selected data. They are
    drawn as a strip plot of 'score' against 'k'. The user is prompted to
    press enter, the plot is shown, and the chosen k is then read from
    standard input.

    Parameters
    ----------
    section : str
        microorganisms section - one of: all, total_counts, filaments, various
    label : str
        label - one of: SV_label, SVI_label
    delay : int
        delay between the microscopic test and the SVI test
        (passed to ML_prepare)

    Returns
    -------
    k : int
        k value typed by the user

    Raises
    ------
    ValueError
        if section or label is not one of the accepted values, or if the
        typed k cannot be converted to int
    """
    allowed_sections = {'all', 'total_counts', 'filaments', 'various'}
    allowed_labels = {'SV_label', 'SVI_label'}

    # Section is validated before label, so it wins when both are invalid.
    if section not in allowed_sections:
        raise ValueError(
            "Please supply a valid section value: 'all','total_counts', 'filaments', 'various' "
        )
    if label not in allowed_labels:
        raise ValueError(
            "Please supply a valid label value: 'SV_label', 'SVI_label' ")

    table = ML_prepare(delay).get_partial_table(x_section=section,
                                                y_labels=True)
    X_train, X_test, y_train, y_test = train_test_split(
        table.loc[:, 'x'],
        table['y', label],
        test_size=0.25,
        random_state=42)

    k_scores = check_K_values(20, X_train, y_train, X_test, y_test)

    sns.set()
    sns.stripplot(data=k_scores, x='k', y='score', size=10)
    plt.ylim(0, 1)

    # The prompt is acknowledged first; only then is the plot displayed.
    print(
        "please look at the graph and choose k value. press enter to continue")
    input()
    plt.show()

    print("please insert k value")
    return int(input())
def get_day3_filaments_svi_data():
    """
    Return the filaments "x" data and the matching "SVI" column for delay=3.

    The original author describes this combination (filament microscopic
    data against SVI results three days later) as the most promising one
    for regression.

    Returns
    -------
    filaments_x :
        the "x" part of the filaments partial table
        (documented by the author as a pd.DataFrame)
    filaments_svi :
        the ("y", "SVI") column of the same table
        (documented by the author as a pd.Series)
    """
    prepared = ML_prepare(delay=3)
    table = prepared.get_partial_table(x_section="filaments", y_labels=False)
    return table.loc[:, "x"], table.loc[:, ("y", "SVI")]
def create_score_list_Knn(labels: list, sections: list, delay_lst: list,
                          k: int) -> list:
    """
    Collect KNeighbors scores for every label/section/delay combination.

    Combinations are visited label-major, then section, then delay. For each
    one a KNeighborsClassifier is fitted on a 75/25 train/test split and the
    result of ``score_by_label`` on the test predictions, with the
    classifier's overall test score appended, is stored.

    Parameters
    ----------
    labels : list
        names of the "y" label columns to predict
    sections : list
        microorganisms sections (passed as x_section to get_partial_table)
    delay_lst : list
        delays between the microscopic test and the SVI test
    k : int
        n_neighbors for the KNeighbors model; must be > 0

    Returns
    -------
    score_delay : list
        one score list per combination, in iteration order

    Raises
    ------
    ValueError
        if k <= 0
    """
    if k <= 0:
        raise ValueError("Please supply k value > 0 ")

    def _score_combination(label, section, delay):
        # Fit and evaluate one classifier for a single combination.
        table = ML_prepare(delay).get_partial_table(x_section=section,
                                                    y_labels=True)
        X_train, X_test, y_train, y_test = train_test_split(
            table.loc[:, 'x'],
            table['y', label],
            test_size=0.25,
            random_state=42)
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train, y_train)
        per_label = score_by_label(y_test, model.predict(X_test))
        per_label.append(model.score(X_test, y_test))
        return per_label

    return [
        _score_combination(label, section, delay)
        for label in labels
        for section in sections
        for delay in delay_lst
    ]
def create_score_list_SVC(labels: list, sections: list,
                          delay_lst: list) -> list:
    """
    Collect LinearSVC scores for every label/section/delay combination.

    Combinations are visited label-major, then section, then delay. For each
    one a StandardScaler + LinearSVC pipeline is fitted on a 75/25
    train/test split and the result of ``score_by_label`` on the test
    predictions, with the pipeline's overall test score appended, is stored.

    Parameters
    ----------
    labels : list
        names of the "y" label columns to predict
    sections : list
        microorganisms sections (passed as x_section to get_partial_table)
    delay_lst : list
        delays between the microscopic test and the SVI test

    Returns
    -------
    score_delay : list
        one score list per combination, in iteration order
    """
    combinations = ((label, section, delay)
                    for label in labels
                    for section in sections
                    for delay in delay_lst)

    collected = []
    for label, section, delay in combinations:
        table = ML_prepare(delay).get_partial_table(x_section=section,
                                                    y_labels=True)
        features = table.loc[:, 'x']
        target = table['y', label]
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.25, random_state=42)

        pipeline = make_pipeline(
            StandardScaler(),
            LinearSVC(random_state=0, tol=1e-5, max_iter=100000))
        pipeline.fit(X_train, y_train)

        per_label = score_by_label(y_test, pipeline.predict(X_test))
        per_label.append(pipeline.score(X_test, y_test))
        collected.append(per_label)
    return collected
def create_section_and_PCA(data: ML_prepare, labled: bool = False):
    """
    Draw a 4x2 grid of PCA plots: one row per section, one column per output.

    For every section ("all", "filaments", "total_counts", "various") the
    partial table is fetched and the helper ``pca_plot`` is called once for
    each of the first two "y" columns, colouring by that column. The figure
    is saved under "figures/" and then shown.

    Parameters
    ----------
    data : ML_prepare
        prepared data; its ``delay`` attribute is used in the figure title
    labled : bool
        passed as ``y_labels`` to get_partial_table; when True the output
        file name gets a "_labled" suffix
    """
    groups = ["all", "filaments", "total_counts", "various"]
    fig, ax = plt.subplots(4, 2)

    for row, group in enumerate(groups):
        table = data.get_partial_table(x_section=group, y_labels=labled)
        outputs = table.loc[:, "y"].columns.tolist()
        for col in range(2):
            pca_plot(table,
                     color_col=outputs[col],
                     section=group,
                     ax_i=ax[row, col])

    fig.set_figheight(15)
    fig.set_figwidth(15)
    fig.suptitle(
        f"PCA of groups, colored by output, delay = {data.delay} days",
        fontsize=20,
        y=1.02,
    )
    plt.tight_layout()

    if labled:
        fig_name = "PCA_by_groups" + "_labled"
        # Second layout pass happens only for the labelled variant.
        plt.tight_layout()
    else:
        fig_name = "PCA_by_groups"

    fig.savefig(f"figures/{fig_name}.png", dpi=150, bbox_inches="tight")
    plt.show()
def loop_over_sections_and_y(data: ML_prepare, regr_model):
    """
    Score one model on all eight section/output combinations.

    Sections are visited in the order "all", "filaments", "total_counts",
    "various"; for each, the first two "y" columns are used as targets in
    turn. The eight scores returned by ``regr_model_func`` are packed with
    ``insert_scores_to_namedtuple``.

    Parameters
    ----------
    data : ML_prepare
        already generated with the chosen delay
    regr_model :
        model argument forwarded unchanged to regr_model_func

    Returns
    -------
    tup_scores :
        result of insert_scores_to_namedtuple for the collected scores
        (a namedtuple according to the original documentation)
    """
    collected = []
    for section in ("all", "filaments", "total_counts", "various"):
        table = data.get_partial_table(x_section=section, y_labels=False)
        target_names = table.loc[:, "y"].columns.tolist()
        for idx in range(2):
            features = table.loc[:, "x"]
            target = table.loc[:, ("y", target_names[idx])]
            score, _ = regr_model_func(features, target, regr_model)
            collected.append(score)
    return insert_scores_to_namedtuple(collected)