def actual_ovo_classifier(classifier, train_data, test_data, output_dir, bacteria_num, class_num):
    """Fit one classifier on one feature subset and score it on the test rows.

    The ``"Classification"`` column is popped from both frames (so the frames
    passed in are modified), the remaining columns are narrowed to
    ``general.num_to_bacteria(bacteria_num)``, and two CSV files are written
    under ``output_dir``:

    * ``Probability_<bacteria_num>_<class_num>.csv``: ``predict_proba``
      output, one column per class.
    * ``Prediction_<bacteria_num>_<class_num>.csv``: ``real`` / ``prediction``
      label pairs for every test row.

    Args:
        classifier: scikit-learn style classifier providing ``fit``,
            ``predict``, ``predict_proba`` and, once fitted, ``classes_``.
        train_data: Training frame holding ``"Classification"`` and the
            feature columns.
        test_data: Test frame with the same columns as ``train_data``.
        output_dir: Directory that receives the two CSV files.
        bacteria_num: Number identifying the feature subset; it is turned into
            column names by ``general.num_to_bacteria``.
        class_num: Number identifying the class combination; only used in the
            output file names.

    Returns:
        tuple: ``(bacteria_num,)`` followed by the values returned by
        ``general.aggregate_confusion_matrix`` for the per-label confusion
        matrices summed over all labels.
    """
    train_answer = train_data.pop("Classification")
    test_answer = test_data.pop("Classification")

    # Decode the feature subset once and apply it to both splits.
    features = general.num_to_bacteria(bacteria_num)
    train_data = train_data[features]
    test_data = test_data[features]

    classifier.fit(train_data, train_answer)

    file_suffix = "_" + str(bacteria_num) + "_" + str(class_num) + ".csv"

    # predict_proba columns are ordered like classifier.classes_ (the labels
    # seen during fit). Labelling them from the test labels instead breaks as
    # soon as the test split misses one of the training classes.
    probability = pandas.DataFrame(
        classifier.predict_proba(test_data),
        columns=list(classifier.classes_))
    probability.to_csv(
        general.check_exist(os.path.join(output_dir, "Probability" + file_suffix)),
        index=False)

    prediction = classifier.predict(test_data)
    # Materialise the pairs: a bare zip iterator is not accepted as DataFrame
    # input by every pandas version.
    pairs = pandas.DataFrame(
        list(zip(test_answer, prediction)), columns=["real", "prediction"])
    pairs.to_csv(
        general.check_exist(os.path.join(output_dir, "Prediction" + file_suffix)),
        index=False)

    # Sum the one-vs-rest confusion matrices of all labels into one 2x2 matrix.
    summed_matrix = numpy.sum(
        sklearn.metrics.multilabel_confusion_matrix(test_answer, prediction),
        axis=0, dtype=int)
    return (bacteria_num, ) + general.aggregate_confusion_matrix(summed_matrix)
def headquarter_three_class_classifier(jobs=30, input_file=None, output_dir=None):
    """Run every classifier on every feature subset with some classes merged.

    For each entry of ``general.three_class_combinations`` the listed classes
    are collapsed into a single label (their names joined by ``"+"``), the
    data is split 90/10 with stratification, and
    ``actual_three_class_classifier`` is run in a process pool for each
    ``(classifier, feature number)`` pair. One CSV per classifier and class
    combination is written to ``<output_dir>/<classifier name>/``, and all
    results are concatenated into ``<output_dir>/statistics.csv``.

    Args:
        jobs: Number of worker processes for the pool.
        input_file: CSV file containing ``"Classification"`` and the columns
            listed in ``general.whole_values``.
        output_dir: Root directory for all result files.

    Raises:
        ValueError: If ``input_file`` or ``output_dir`` is ``None``, or if
            ``input_file`` is not an existing file.
    """
    if input_file is None or output_dir is None:
        raise ValueError
    if not os.path.isfile(input_file):
        raise ValueError(input_file)

    data = pandas.read_csv(input_file)
    data = data[["Classification"] + general.whole_values]

    # Untouched labels; every class combination is derived from these.
    pristine_labels = list(data["Classification"])
    collected = []

    for selected_class in general.three_class_combinations:
        merged_label = "+".join(selected_class)
        data["Classification"] = [
            merged_label if label in selected_class else label
            for label in pristine_labels
        ]

        train_data, test_data = sklearn.model_selection.train_test_split(
            data, test_size=0.1, random_state=0,
            stratify=data["Classification"])

        class_num = general.class_to_num(selected_class)
        combined_class = "-vs-".join(sorted(set(data["Classification"])))

        # Two batches of feature numbers: 1 .. 2^a - 1, then the multiples
        # i * 2^a for i in 1 .. 2^r - 1, where a and r are the lengths of
        # general.absolute_values and general.relative_values.
        absolute_span = 2**len(general.absolute_values)
        number_batches = (
            range(1, absolute_span),
            [i * absolute_span
             for i in range(1, 2**len(general.relative_values))],
        )

        with multiprocessing.Pool(processes=jobs) as pool:
            for name, classifier in classifiers:
                classifier_dir = os.path.join(output_dir, name)
                header = ("Number", "balanced_accuracy_score"
                          ) + general.aggregate_confusion_matrix(None)

                rows = []
                for numbers in number_batches:
                    rows += pool.starmap(
                        actual_three_class_classifier,
                        [(classifier, train_data.copy(), test_data.copy(),
                          classifier_dir, number, class_num)
                         for number in numbers])

                results = pandas.DataFrame(rows, columns=header)
                results["classifier"] = name
                results["combined_class"] = combined_class
                results.to_csv(
                    general.check_exist(
                        os.path.join(output_dir, name,
                                     "-".join(selected_class) + ".csv")),
                    index=False)
                collected.append(results.copy())

    pandas.concat(collected, ignore_index=True).to_csv(
        general.check_exist(os.path.join(output_dir, "statistics.csv")),
        index=False)
def headquarter_regressor(input_file, output_dir, watch, jobs=30):
    """Run every regressor on every feature subset and plot the R2 scores.

    The ``watch`` column is used as the regression target (renamed to
    ``"answer"``), the data is split 90/10, and ``actual_regressor`` is run in
    a process pool for each ``(regressor, feature number)`` pair.

    Files written:

    * ``<output_dir>/<regressor name>/statistics.csv``: scores of one
      regressor.
    * ``<output_dir>/statistics.csv``: scores of all regressors.
    * ``<output_dir>/Regressor_<watch>.png``: line plot of ``R2_score``
      against the number of features, one line per regressor.

    Args:
        input_file: CSV file containing ``watch`` and the columns listed in
            ``general.whole_values``.
        output_dir: Root directory for all result files.
        watch: Name of the column to predict.
        jobs: Number of worker processes for the pool.
    """
    data = pandas.read_csv(input_file)
    # rename() returns a new frame, so nothing is modified in place on a
    # column selection of the frame that was read.
    data = data[[watch] + general.whole_values].rename(
        columns={watch: "answer"})

    train_data, test_data = sklearn.model_selection.train_test_split(
        data, test_size=0.1, random_state=0)

    # Two batches of feature numbers: 1 .. 2^a - 1, then the multiples
    # i * 2^a for i in 1 .. 2^r - 1, where a and r are the lengths of
    # general.absolute_values and general.relative_values.
    absolute_span = 2**len(general.absolute_values)
    number_batches = (
        range(1, absolute_span),
        [i * absolute_span for i in range(1, 2**len(general.relative_values))],
    )

    per_regressor = []
    with multiprocessing.Pool(processes=jobs) as pool:
        for name, regressor in regressors:
            regressor_dir = os.path.join(output_dir, name)

            rows = []
            for numbers in number_batches:
                rows += pool.starmap(
                    actual_regressor,
                    [(regressor, train_data.copy(), test_data.copy(),
                      regressor_dir, number) for number in numbers])

            results = pandas.DataFrame(rows, columns=["Number", "R2_score"])
            results["regressor"] = name
            results["feature_num"] = [
                len(general.num_to_bacteria(number))
                for number in results["Number"]
            ]
            results.to_csv(
                general.check_exist(
                    os.path.join(output_dir, name, "statistics.csv")),
                index=False)
            per_regressor.append(results)

    # Combine the frames already in memory instead of re-reading the CSV files
    # that were written a moment ago.
    drawing_data = pandas.concat(per_regressor, ignore_index=True)
    drawing_data.to_csv(
        general.check_exist(os.path.join(output_dir, "statistics.csv")),
        index=False)

    seaborn.set(context="poster", style="whitegrid")
    fig, ax = matplotlib.pyplot.subplots(figsize=(24, 24))
    try:
        seaborn.lineplot(
            data=drawing_data, x="feature_num", y="R2_score", hue="regressor",
            ax=ax, legend="full",
            hue_order=sorted(set(drawing_data["regressor"])))
        fig.savefig(
            general.check_exist(
                os.path.join(output_dir, "Regressor_" + watch + ".png")))
    finally:
        # pyplot keeps every figure alive until it is closed explicitly, so
        # release it even when plotting or saving fails.
        matplotlib.pyplot.close(fig)
def headquarter_ovo_classifier(input_file, output_dir, jobs):
    """Run every classifier on every feature subset for each pair of classes.

    For each entry of ``general.two_class_combinations`` only the rows of
    those classes are kept, the rows are split 90/10 with stratification, and
    ``actual_ovo_classifier`` is run in a process pool for each
    ``(classifier, feature number)`` pair. One CSV per classifier and class
    pair is written to ``<output_dir>/<classifier name>/``, and all results
    are concatenated into ``<output_dir>/statistics.csv``.

    Args:
        input_file: CSV file containing ``"Classification"`` and the columns
            listed in ``general.whole_values``.
        output_dir: Root directory for all result files.
        jobs: Number of worker processes for the pool; must be at least 1.

    Raises:
        ValueError: If ``input_file`` is not an existing file (the path is the
            exception argument) or ``jobs`` is below 1 (``jobs`` is the
            exception argument).
    """
    if not os.path.isfile(input_file):
        raise ValueError(input_file)
    if jobs < 1:
        raise ValueError(jobs)

    data = pandas.read_csv(input_file)
    data = data[["Classification"] + general.whole_values]
    collected = []

    for selected_class in general.two_class_combinations:
        pair_rows = data["Classification"].isin(selected_class)
        tmp_data = data.loc[pair_rows]

        train_data, test_data = sklearn.model_selection.train_test_split(
            tmp_data, test_size=0.1, random_state=0,
            stratify=tmp_data["Classification"])

        class_num = general.class_to_num(selected_class)
        combined_class = "-vs-".join(sorted(set(tmp_data["Classification"])))

        # Two batches of feature numbers: 1 .. 2^a - 1, then the multiples
        # i * 2^a for i in 1 .. 2^r - 1, where a and r are the lengths of
        # general.absolute_values and general.relative_values.
        absolute_span = 2**len(general.absolute_values)
        number_batches = (
            range(1, absolute_span),
            [i * absolute_span
             for i in range(1, 2**len(general.relative_values))],
        )

        with multiprocessing.Pool(processes=jobs) as pool:
            for name, classifier in classifiers:
                classifier_dir = os.path.join(output_dir, name)
                header = ("Number", ) + general.aggregate_confusion_matrix(None)

                rows = []
                for numbers in number_batches:
                    rows += pool.starmap(
                        actual_ovo_classifier,
                        [(classifier, train_data.copy(), test_data.copy(),
                          classifier_dir, number, class_num)
                         for number in numbers])

                results = pandas.DataFrame(rows, columns=header)
                results["classifier"] = name
                results["combined_class"] = combined_class
                results.to_csv(
                    general.check_exist(
                        os.path.join(output_dir, name,
                                     "-".join(selected_class) + ".csv")),
                    index=False)
                collected.append(results)

    pandas.concat(collected, ignore_index=True).to_csv(
        general.check_exist(os.path.join(output_dir, "statistics.csv")),
        index=False)
def get_tsne(csv_file=None, tsne_file=None, random_state=0):
    """Compute a 2-D t-SNE embedding of a CSV file and save it as CSV.

    Only the columns of ``csv_file`` that appear in ``tsne_columns`` are
    embedded. Both embedding axes are z-score normalised, and the ``"ID"`` and
    ``"Classification"`` columns of the input are copied next to them.

    Args:
        csv_file: Path of the input CSV file. Required.
        tsne_file: Path of the output CSV file. Defaults to
            ``tsne_<random_state>.csv`` inside ``default_tsne_directory``.
        random_state: Seed passed to ``sklearn.manifold.TSNE``.

    Returns:
        str: The path of the written file (``tsne_file``).

    Raises:
        ValueError: If ``csv_file`` is ``None`` or is not an existing file.
            For a missing file the path is the exception argument.
    """
    # Validate before doing anything else, and say what was wrong: the bare
    # ``ValueError`` raised previously gave the caller nothing to go on.
    if csv_file is None:
        raise ValueError("csv_file is required")
    if not os.path.isfile(csv_file):
        raise ValueError(csv_file)

    if tsne_file is None:
        tsne_file = os.path.join(default_tsne_directory,
                                 "tsne_" + str(random_state) + ".csv")

    data = pandas.read_csv(csv_file)

    # Keep the file's own column order, restricted to the embeddable columns.
    feature_columns = [
        column for column in data.columns if column in tsne_columns
    ]
    embedding = sklearn.manifold.TSNE(
        n_components=2, random_state=random_state,
        init="pca").fit_transform(data[feature_columns])

    tsne_data = pandas.DataFrame(embedding, columns=["TSNE1", "TSNE2"])
    for column in list(tsne_data.columns):
        tsne_data[column] = scipy.stats.zscore(tsne_data[column])

    # Carry the identifying columns over from the input rows.
    for column in ["ID", "Classification"]:
        tsne_data[column] = data[column]

    tsne_data.to_csv(general.check_exist(tsne_file), index=False)
    return tsne_file
def draw_violin(input_file, output_file, watch):
    """Draw a violin plot of one column per class, with significance stars.

    Classes are drawn in the order of ``general.classes``. Each pair of
    neighbouring classes in that order is compared with statannot's
    ``"t-test_ind"`` test and annotated in ``"star"`` format.

    Args:
        input_file: CSV file containing ``"Classification"`` and ``watch``.
        output_file: Path of the image to write.
        watch: Name of the column plotted on the y axis.
    """
    data = pandas.read_csv(input_file)

    # Neighbouring classes, e.g. (c0, c1), (c1, c2), ...
    neighbour_pairs = list(zip(general.classes, general.classes[1:]))

    seaborn.set(context="poster", style="whitegrid")
    fig, ax = matplotlib.pyplot.subplots(figsize=(24, 24))
    try:
        # Draw on ``ax`` explicitly so the violins and the annotations below
        # are guaranteed to share the same axes, instead of relying on the
        # implicit "current axes".
        seaborn.violinplot(data=data, x="Classification", y=watch,
                           order=general.classes, ax=ax)
        statannot.add_stat_annotation(
            ax, data=data, x="Classification", y=watch,
            box_pairs=neighbour_pairs, test="t-test_ind", text_format="star",
            verbose=0, order=general.classes)
        fig.savefig(general.check_exist(output_file))
    finally:
        # pyplot keeps every figure alive until it is closed explicitly, so
        # release it even when plotting or saving fails.
        matplotlib.pyplot.close(fig)
def draw_extreme(csv_file, output_dir):
    """Plot confusion heatmaps for the best and worst feature subsets.

    For every ``(combined_class, classifier, statistic)`` triple found in
    ``csv_file``, the rows with the minimum and the maximum value of the
    statistic are located, the matching ``Prediction_*.csv`` file is read from
    ``<directory of csv_file>/<classifier>/``, and a heatmap of
    prediction-versus-real counts is saved to ``output_dir``. A summary of all
    selected feature subsets is written to ``<output_dir>/Min_Max.csv``.

    Args:
        csv_file: Statistics CSV containing ``"combined_class"``,
            ``"classifier"``, ``"Number"`` and one column per name returned by
            ``general.aggregate_confusion_matrix(None)``.
        output_dir: Directory that receives the heatmaps and ``Min_Max.csv``.

    Raises:
        ValueError: If ``csv_file`` is not an existing file; the path is the
            exception argument.
    """
    if not os.path.isfile(csv_file):
        raise ValueError(csv_file)

    statistics_data = pandas.read_csv(csv_file)
    base_directory = os.path.dirname(csv_file)
    classifier_names = sorted(set(statistics_data["classifier"]))
    rows = []

    for combined_class in sorted(set(statistics_data["combined_class"])):
        # A "+" inside one side of "A-vs-B" marks a merged class; its number
        # is part of the prediction file name. 0 means "no merged class".
        merged_parts = [
            part for part in combined_class.split("-vs-") if "+" in part
        ]
        if merged_parts:
            combined_class_num = general.class_to_num(
                merged_parts[0].split("+"))
        else:
            combined_class_num = 0

        for classifier in classifier_names:
            prediction_directory = os.path.join(base_directory, classifier)
            subset = statistics_data.loc[
                (statistics_data["combined_class"] == combined_class)
                & (statistics_data["classifier"] == classifier)]

            for statistics_value in general.aggregate_confusion_matrix(None):
                scores = subset[statistics_value]
                extremes = (
                    ("minimum", subset.loc[scores.idxmin(), "Number"]),
                    ("maximum", subset.loc[scores.idxmax(), "Number"]),
                )

                for name, value in extremes:
                    if combined_class_num:
                        prediction_file = "Prediction_%s_%d.csv" % (
                            value, combined_class_num)
                    else:
                        prediction_file = "Prediction_%s.csv" % (value)
                    prediction_data = pandas.read_csv(
                        os.path.join(prediction_directory, prediction_file))

                    # Count (prediction, real) pairs. crosstab fills missing
                    # pairs with 0 and does not depend on the pandas-version
                    # specific output of groupby(...).size() or on positional
                    # pivot() arguments.
                    counts = pandas.crosstab(prediction_data["prediction"],
                                             prediction_data["real"])

                    seaborn.set(context="poster", style="whitegrid")
                    fig, ax = matplotlib.pyplot.subplots(figsize=(24, 24))
                    try:
                        seaborn.heatmap(counts, annot=True, ax=ax, robust=True)
                        ax.set_title(
                            combined_class.replace("-", " ") + " with "
                            + statistics_value)
                        fig.savefig(
                            general.check_exist(
                                os.path.join(
                                    output_dir,
                                    name + "_" + combined_class + "_"
                                    + classifier + "_" + statistics_value
                                    + ".png")))
                    finally:
                        # Many figures are created in this loop; always
                        # release them, even when drawing or saving fails.
                        matplotlib.pyplot.close(fig)

                    rows.append(
                        (combined_class, classifier,
                         "+".join(general.num_to_bacteria(value)),
                         statistics_value, name, value))

    pandas.DataFrame(
        rows,
        columns=["combined_class", "classifier", "bacteria", "statistics",
                 "type", "value"]).to_csv(
        general.check_exist(os.path.join(output_dir, "Min_Max.csv")),
        index=False)
def draw_statistics(csv_file, output_dir):
    """Plot every statistic against the number of features, per class set.

    For each ``combined_class`` in ``csv_file`` and each statistic name
    returned by ``general.aggregate_confusion_matrix(None)`` (in sorted
    order), four line plots with one line per classifier are saved to
    ``output_dir``, in this order:

    * ``Median_<combined_class>_<statistic>.png``: median estimator with
      ``ci="sd"``.
    * ``Mean_<combined_class>_<statistic>.png``: seaborn's default estimator
      and interval.
    * ``Min_<combined_class>_<statistic>.png``: ``min`` estimator, no
      interval.
    * ``Max_<combined_class>_<statistic>.png``: ``max`` estimator, no
      interval.

    Args:
        csv_file: Statistics CSV containing ``"combined_class"``,
            ``"classifier"``, ``"Number"`` and the statistic columns.
        output_dir: Directory that receives the images.

    Raises:
        ValueError: If ``csv_file`` is not an existing file; the path is the
            exception argument.
    """
    if not os.path.isfile(csv_file):
        raise ValueError(csv_file)

    statistics_data = pandas.read_csv(csv_file)
    statistics_data["feature_num"] = [
        len(general.num_to_bacteria(number))
        for number in statistics_data["Number"]
    ]

    # Same classifier order in every plot, taken from the whole file.
    classifier_order = sorted(set(statistics_data["classifier"]))

    # File-name prefix and the extra lineplot arguments of each variant.
    plot_variants = (
        ("Median_", {"estimator": "median", "ci": "sd"}),
        ("Mean_", {}),
        ("Min_", {"estimator": min, "ci": None}),
        ("Max_", {"estimator": max, "ci": None}),
    )

    for combined_class in sorted(set(statistics_data["combined_class"])):
        selected_data = statistics_data.loc[
            statistics_data["combined_class"] == combined_class]
        title = combined_class.replace("-", " ")

        for statistics_value in sorted(general.aggregate_confusion_matrix(None)):
            for prefix, extra_arguments in plot_variants:
                seaborn.set(context="poster", style="whitegrid")
                fig, ax = matplotlib.pyplot.subplots(figsize=(24, 24))
                seaborn.lineplot(
                    x="feature_num", y=statistics_value, hue="classifier",
                    ax=ax, legend="full", data=selected_data,
                    hue_order=classifier_order, **extra_arguments)
                ax.set_title(title)
                fig.savefig(
                    general.check_exist(
                        os.path.join(
                            output_dir,
                            prefix + combined_class + "_" + statistics_value
                            + ".png")))
                matplotlib.pyplot.close(fig)
def draw_scatter(input_file, output_file):
    """Save a scatter plot of the ``"AL"`` column against ``"PD"``.

    Points are coloured and shaped by ``"Classification"``, with the colours
    assigned in the order of ``general.classes``.

    Args:
        input_file: CSV file containing ``"AL"``, ``"PD"`` and
            ``"Classification"``.
        output_file: Path of the image to write.
    """
    seaborn.set(context="poster", style="whitegrid")
    figure, axes = matplotlib.pyplot.subplots(figsize=(24, 24))

    points = pandas.read_csv(input_file)
    seaborn.scatterplot(
        data=points,
        x="AL",
        y="PD",
        hue="Classification",
        style="Classification",
        legend="full",
        ax=axes,
        hue_order=general.classes,
    )

    destination = general.check_exist(output_file)
    figure.savefig(destination)
    matplotlib.pyplot.close(figure)
def draw_tsne(tsne_file=None, png_file=None):
    """Save a scatter plot of a t-SNE embedding stored as CSV.

    ``"TSNE1"`` is plotted against ``"TSNE2"``; points are coloured and shaped
    by ``"Classification"``.

    Args:
        tsne_file: CSV file containing ``"TSNE1"``, ``"TSNE2"`` and
            ``"Classification"``. Required.
        png_file: Path of the image to write. Defaults to ``tsne.png`` inside
            ``default_tsne_directory``.

    Raises:
        ValueError: If ``tsne_file`` is ``None`` or is not an existing file;
            ``tsne_file`` is the exception argument.
    """
    # Both failure modes report the offending value the same way.
    if tsne_file is None or not os.path.isfile(tsne_file):
        raise ValueError(tsne_file)

    if png_file is None:
        png_file = os.path.join(default_tsne_directory, "tsne.png")

    seaborn.set(context="poster", style="whitegrid")
    figure, axes = matplotlib.pyplot.subplots(figsize=(24, 24))

    embedding = pandas.read_csv(tsne_file)
    seaborn.scatterplot(
        data=embedding,
        x="TSNE1",
        y="TSNE2",
        hue="Classification",
        style="Classification",
        legend="full",
        ax=axes,
    )

    destination = general.check_exist(png_file)
    figure.savefig(destination)
    matplotlib.pyplot.close(figure)