def read_data_detail_to_dataframe(data_file_name):
    # TODO change to DB; the hardcoded file name below overrides the argument for now
    data_file_name = "health_and_medical_history_501_600.csv"
    file_full_path = fs.get_full_path(file_name=data_file_name)
    df_data_detail = DataFrameUtil.convert_file_to_dataframe(file_full_path, header=0)
    return df_data_detail
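# DataFrameUtil.convert_file_to_dataframe is used throughout this module; it is
# presumably a thin wrapper over pandas.read_csv, along the lines of this
# sketch (an assumption, not the project's actual implementation):
import pandas as pd

def convert_file_to_dataframe_sketch(file_full_path, header=None):
    # header=0 treats the first row as column names; header=None autogenerates them
    return pd.read_csv(file_full_path, header=header)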
def elbow_plot_handler_old(request):
    resp_data = dict()
    file_name = request.GET.get("file_name")
    column_header = request.GET.get("column_header")
    exclude_columns = request.GET.get("exclude_columns")
    logger.debug(column_header)
    if file_name:
        fs = FileStorage()
        file_full_path = fs.get_base_location() + file_name
        # If the file exists, read the data with pandas and drop columns (if any)
        if fs.is_file(file_full_path):
            # Get data from the file
            column_header_idx = None
            if column_header == "on":
                column_header_idx = 0
            df = DataFrameUtil.convert_file_to_dataframe(file_full_path, header=column_header_idx)
            # Drop the columns specified by the user (1-based indexes from the UI)
            if exclude_columns:
                str_column_indexes = exclude_columns.split(",")
                column_indexes = [int(i) - 1 for i in str_column_indexes]
                df = DataFrameUtil.drop_column_by_index(df, column_indexes)
            # Sanity checks for NaN and infinite values (results currently unused)
            is_nan = np.any(np.isnan(df))
            is_finite = np.all(np.isfinite(df))
            # Standardize data
            X_scaled = PreProcessingUtil.standardize(df)
            # Get the explained variance ratio
            pca_helper = PcaUtil()
            pca = pca_helper.get_fit_transfrom_pca(X_scaled)
            arr_variance_ratio = pca.explained_variance_ratio_
            # Prepare the Bokeh tabs: add the ratios to a line graph
            elbow_plot = draw_elbow_plot(arr_variance_ratio)
            # Describe data (disabled)
            # df_describe_table = draw_df_describe_table(df)
            # tab2 = Panel(child=df_describe_table, title="Data Description")
            # Add the plot to a panel, then the panel to a tab set
            tab1 = Panel(child=elbow_plot, title="Elbow Curve Plot")
            tabs = Tabs(tabs=[tab1])
            script, div = components(tabs)
            plots = {'script': script, 'div': div}
            resp_data["bokeh_plot"] = plots
        else:
            resp_data["msg"] = "[ERROR] File is not found."
    else:
        resp_data['msg'] = "[ERROR] File name is invalid."
    return JsonResponse(resp_data)
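# draw_elbow_plot is defined elsewhere in the project. A minimal Bokeh sketch
# of what it could look like, plotting cumulative explained variance against
# the number of components (the styling here is an assumption):
import numpy as np
from bokeh.plotting import figure

def draw_elbow_plot_sketch(arr_variance_ratio):
    # Cumulative explained variance for 1..n principal components
    cumulative = np.cumsum(arr_variance_ratio)
    p = figure(title="Elbow Curve",
               x_axis_label="Number of principal components",
               y_axis_label="Cumulative explained variance ratio")
    p.line(list(range(1, len(cumulative) + 1)), list(cumulative), line_width=2)
    return p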
def read_based_space_to_dataframe():
    """Read data from file and convert it to a dataframe for the input X that
    will be predicted and rendered as points in the scatter plot."""
    # TODO need to change this setting to DB
    df_based_space = DataFrameUtil.convert_file_to_dataframe(
        fs.get_full_path("radiomic_result_501_600.csv"), header=0)
    return df_based_space
def process_model_data(model_file_name, data_file_name, data_detail_file_name):
    # Convert the files to dataframes
    fs = FileStorage()
    # TODO the header index should come from configuration
    column_header_idx = None
    # Dataframe of the data to process; this is new data, separate from training
    df_data = DataFrameUtil.convert_file_to_dataframe(
        fs.get_full_path(data_file_name), header=column_header_idx)
    # Dataframe for matching indexes with the processed data and showing detail
    column_header_idx = 0
    df_data_detail = DataFrameUtil.convert_file_to_dataframe(
        fs.get_full_path(data_detail_file_name), header=column_header_idx)
    # Load the model
    model = ModelUtils.load_model(model_file_name)  # TODO change to DB and make dynamic
    # Do PCA
    logger.debug("Dimensionality Reduction by PCA...")
    pca_helper = PcaUtil()
    # Standardize the data, reduce dimensions, and return as X
    X_scaled = PreProcessingUtil.fit_transform(df_data)
    # TODO change n_components=100 to a dynamic value
    X_reduced = pca_helper.get_pc(X_scaled, n_components=100)
    pred_y = model.predict(X_reduced)
    # Keep the predicted result as the label column
    # https://www.geeksforgeeks.org/different-ways-to-create-pandas-dataframe/
    df_label = pd.DataFrame(pred_y, columns=["Label"])
    # Reduce to two components for plotting and match data details by row index
    X_graph = pca_helper.get_pc(X_scaled, n_components=2)
    df_data = pd.DataFrame(X_graph, columns=['PC1', 'PC2'])
    df_graph = df_label.join(df_data)
    script, div = draw_2d(df_graph, df_data_detail)
    plot = dict()
    plot['script'] = script
    plot['div'] = div
    return plot
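# PcaUtil.get_pc is the project's own helper; it presumably wraps
# scikit-learn's PCA roughly as below (a hedged sketch, not the actual code):
from sklearn.decomposition import PCA

def get_pc_sketch(X_scaled, n_components):
    # Fit PCA on the standardized data and project it onto the first
    # n_components principal components
    pca = PCA(n_components=n_components)
    return pca.fit_transform(X_scaled)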
def analyze_data(file_full_path, header_row=None):
    # Read data from the file into a pandas dataframe
    # TODO the header should be specified by the user
    df = DataFrameUtil.convert_file_to_dataframe(file_full_path, header=header_row)
    # Analyze the dataframe (e.g., check for NaN values)
    results = DataFrameUtil.analyze_dataframe(df, header_row)
    return results
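# DataFrameUtil.analyze_dataframe is defined elsewhere; the NaN check the
# comment above refers to can be expressed in plain pandas, as in this sketch:
def analyze_dataframe_sketch(df):
    # Basic shape and missing-value summary for a dataframe
    return {
        "n_rows": df.shape[0],
        "n_cols": df.shape[1],
        "has_nan": bool(df.isna().any().any()),
        "nan_per_column": df.isna().sum().to_dict(),
    }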
def load_model(model_name):
    # model_name is currently unused; see the TODO below
    # TODO change to load the setting from DB
    # model_file_name = "radiomic482_svm_ovo_model.joblib"
    # model = ModelUtils.load_model(model_file_name)
    # TODO the data below must be the trained data
    df_train = DataFrameUtil.convert_file_to_dataframe(
        fs.get_full_path("radiomic482_no_key.csv"), header=0)
    X_scaled = PreProcessingUtil.standardize(df_train)
    X_reduced = PcaUtil.reduce_dimension(X_scaled, n_components=50)
    model = KMeanUtil.get_kmean_model(X_reduced, n_clusters=5, random_state=42)
    return model
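# KMeanUtil.get_kmean_model is assumed to wrap scikit-learn's KMeans along
# these lines (a sketch under that assumption, not the project's actual code):
from sklearn.cluster import KMeans

def get_kmean_model_sketch(X, n_clusters, random_state):
    # Fit K-Means on the dimension-reduced features and return the fitted model
    return KMeans(n_clusters=n_clusters, random_state=random_state).fit(X)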
def get_scaled_dataframe(form):
    data_file_name = form.cleaned_data['data_file_name']
    column_header = form.cleaned_data['column_header']
    df = None
    # Get the file from storage
    data_file_full_path = fs.get_full_path(data_file_name)
    # Default to no header row; without this, column_header_idx would be
    # undefined when the header checkbox is off
    column_header_idx = None
    if column_header == "on":
        column_header_idx = 0
    df = DataFrameUtil.convert_file_to_dataframe(data_file_full_path, header=column_header_idx)
    df_scaled = PreProcessingUtil.standardize(df)
    return df_scaled
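# PreProcessingUtil.standardize presumably scales each feature to zero mean and
# unit variance, i.e. something like this StandardScaler sketch (an assumption):
from sklearn.preprocessing import StandardScaler

def standardize_sketch(df):
    # Returns a numpy array with each column centered and scaled
    return StandardScaler().fit_transform(df)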
def read_file_to_dataframe(file_name, column_header_idx):
    file_full_path = fs.get_base_location() + file_name
    # Read the file data
    return DataFrameUtil.convert_file_to_dataframe(file_full_path, header=column_header_idx)
def unsupervised_learning_train_test_handler(request):
    resp_data = dict()
    process_log = []
    msg = []
    resp_data['process_log'] = process_log
    resp_data['msg'] = msg
    form = SupervisedLearningTrainTestForm(request.GET)
    # When the form is valid, data from the screen is converted to Python types
    # and stored in cleaned_data
    if form.is_valid():
        sel_algorithm = form.cleaned_data['sel_algorithm']
        sel_dim_reduction = form.cleaned_data['sel_dim_reduction']
        n_components = form.cleaned_data['n_components']
        dataset_file_name = form.cleaned_data['dataset_file_name']
        column_header = form.cleaned_data['column_header']
        label_file_name = form.cleaned_data['label_file_name']
        label_column_header = form.cleaned_data['label_column_header']
        test_size = form.cleaned_data['test_size']
        sel_test_method = form.cleaned_data['sel_test_method']
        n_folds = form.cleaned_data['n_folds']
        is_saved = form.cleaned_data['is_saved']
        model_file_name = form.cleaned_data['model_file_name']
        # Dataframe for storing the dataset from file
        df = None
        if fs.is_file_in_base_location(dataset_file_name) \
                and fs.is_file_in_base_location(label_file_name):
            # Get the data file and store it in a dataframe
            data_file_path = fs.get_base_location() + dataset_file_name
            # Dataset column header checking
            column_header_idx = None
            if column_header == "on":
                column_header_idx = 0
            df = DataFrameUtil.convert_file_to_dataframe(
                data_file_path, header=column_header_idx)
            # PCA process: features data
            X = None
            if sel_dim_reduction == "pca":
                logger.debug("Dimensionality Reduction by PCA...")
                pca_helper = PcaHelper()
                # Standardize the data, reduce dimensions, and return as X
                X_scaled = PreProcessingUtil.fit_transform(df)
                X = pca_helper.get_pc(X_scaled, n_components)
                logger.debug("PCA Done")
            # Label data
            y = None
            label_file_path = fs.get_base_location() + label_file_name
            label_column_header_idx = None
            if label_column_header == "on":
                label_column_header_idx = 0
            # Use pandas to read the labels, then flatten them to a 1D array
            y = pd.read_csv(label_file_path,
                            header=label_column_header_idx).values.ravel()
            # Model
            clf = None
            if sel_algorithm:
                logger.debug("Creating model by SVM...")
                # Select whether to create the SVM as one-vs-one or one-vs-all
                clf = init_model_object(sel_algorithm)
                if sel_test_method:
                    logger.debug("Starting Cross Validation...")
                    if sel_test_method == "cv" and n_folds:
                        scores = cross_val_score(clf, X, y, cv=n_folds)
                        txt_accuracy = "%0.2f (+/- %0.2f)" % (scores.mean(),
                                                              scores.std() * 2)
                        logger.debug(txt_accuracy)
                        resp_data["scores"] = scores.tolist()
                        resp_data["accuracy_mean"] = scores.mean()
                        resp_data["params"] = clf.get_params(deep=True)
                    else:
                        # Split train/test data based on the specified ratio.
                        # Set random_state to get the same split across runs.
                        X_train, X_test, y_train, y_test = train_test_split(
                            X, y, test_size=test_size, random_state=42)
                if is_saved == 1 and model_file_name:
                    clf.fit(X, y)
                    logger.debug("Save model as %s", model_file_name)
                    saved_model_file_name = ModelUtils.save_model(
                        clf, model_file_name)
                    resp_data["msg"] = ("Model has been saved successfully as "
                                        + saved_model_file_name)
        else:
            # The dataset file is not found.
            msg.append("File name is not found in storage.")
    else:
        resp_data['msg'] = form._errors
    return JsonResponse(resp_data)
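# init_model_object is referenced above but defined elsewhere. A plausible
# sketch, assuming hypothetical form values "svm_ovo" / "svm_ovr" for
# sel_algorithm (both names are illustrative, not confirmed by the project):
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.svm import LinearSVC

def init_model_object_sketch(sel_algorithm):
    # Wrap a linear SVM as one-vs-one or one-vs-rest for multiclass problems
    if sel_algorithm == "svm_ovo":
        return OneVsOneClassifier(LinearSVC(random_state=42))
    return OneVsRestClassifier(LinearSVC(random_state=42))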
def pipeline_run_handler(request):
    resp_data = dict()
    form = PipelineForm(request.GET)
    # When the form is valid, data from the screen is converted to Python types
    # and stored in cleaned_data
    if form.is_valid():
        str_pipeline = form.cleaned_data['pipeline']
        dataset_file_name = form.cleaned_data['dataset_file_name']
        column_header = form.cleaned_data['column_header']
        label_file_name = form.cleaned_data['label_file_name']
        label_column_header = form.cleaned_data['label_column_header']
        # Dimensionality Reduction
        pca_n_components = form.cleaned_data['pca_n_components']
        kernel_pca_n_components = form.cleaned_data['kernel_pca_n_components']
        lda_n_components = form.cleaned_data['lda_n_components']
        tsne_n_components = form.cleaned_data['tsne_n_components']
        # Test
        test_size = form.cleaned_data['test_size']
        n_folds = form.cleaned_data['n_folds']
        # Save model
        save_as_name = form.cleaned_data['save_as_name']
        # Feature Selection
        sfs_k_features = form.cleaned_data['sfs_k_features']
        sfs_k_neighbors = form.cleaned_data['sfs_k_neighbors']
        sfs_forward = form.cleaned_data['sfs_forward']
        sfs_floating = form.cleaned_data['sfs_floating']
        sfs_scoring = form.cleaned_data['sfs_scoring']
        sfs_cv = form.cleaned_data['sfs_cv']
        sfs_n_jobs = form.cleaned_data['sfs_n_jobs']
        select_k_best_n_k = form.cleaned_data['select_k_best_n_k']
        stratified_kfold_n_split = form.cleaned_data['stratified_kfold_n_split']
        stratified_kfold_shuffle = form.cleaned_data['stratified_kfold_shuffle']
        # Dataframe for storing the dataset from file
        df = pd.DataFrame()
        if fs.is_file_in_base_location(dataset_file_name):
            # Get the data file and store it in a dataframe
            data_file_path = fs.get_base_location() + dataset_file_name
            # Dataset column header checking
            column_header_idx = None
            if column_header == "on":
                column_header_idx = 0
            df = DataFrameUtil.convert_file_to_dataframe(
                data_file_path, header=column_header_idx)
            # Features data
            X = df
            # Label data; the label file is optional for unsupervised pipelines
            y = None
            if fs.is_file_in_base_location(label_file_name):
                label_column_header_idx = None
                if label_column_header == "on":
                    label_column_header_idx = 0
                label_file_path = fs.get_base_location() + label_file_name
                # Use pandas to read the labels, then flatten them to a 1D array
                y = pd.read_csv(label_file_path,
                                header=label_column_header_idx).values.ravel()
            # Process the pipeline
            arr_pipeline = str_pipeline.split(",")
            parameters = dict()
            parameters['n_folds'] = n_folds
            parameters['pca_n_components'] = pca_n_components
            parameters['kernel_pca_n_components'] = kernel_pca_n_components
            parameters['lda_n_components'] = lda_n_components
            parameters['tsne_n_components'] = tsne_n_components
            parameters['test_size'] = test_size
            parameters['select_k_best_n_k'] = select_k_best_n_k
            parameters['stratified_kfold_n_split'] = stratified_kfold_n_split
            parameters['stratified_kfold_shuffle'] = stratified_kfold_shuffle
            if sfs_k_features != "":
                # In case of feature selection, plot the result as a table
                parameters['sfs_k_neighbors'] = sfs_k_neighbors
                parameters['sfs_k_features'] = sfs_k_features
                parameters['sfs_forward'] = sfs_forward
                parameters['sfs_floating'] = sfs_floating
                parameters['sfs_scoring'] = sfs_scoring
                parameters['sfs_cv'] = sfs_cv
                parameters['sfs_n_jobs'] = sfs_n_jobs
                parameters['feature_names'] = df.columns
            result, X, y, model = process_pipeline(arr_pipeline, X, y, parameters)
            logger.debug(X)
            logger.debug(y)
            resp_data = result
            if save_as_name != "":
                # If the model is not fitted yet, fit it before saving
                if not ModelUtils.is_fitted(model):
                    model.fit(X, y)
                save_as_name = ModelUtils.save_model(model, save_as_name)
                resp_data[msg.SUCCESS] = ("Model has been saved successfully as "
                                          + save_as_name)
            # Prepare scatter plot data, reducing X to at most three dimensions
            if (isinstance(X, np.ndarray) and X.any()) \
                    or (isinstance(X, pd.DataFrame) and not X.empty):
                # Check the dimension of X
                nD = X.shape[1]
                if nD == 2:
                    df_plot = pd.DataFrame(data=X, columns=['x', 'y'])
                    df_plot['label'] = y
                    resp_data['plot_data'] = df_plot.to_json()
                    resp_data['dimension'] = 2
                elif nD == 3:
                    df_plot = pd.DataFrame(data=X, columns=['x', 'y', 'z'])
                    df_plot['label'] = y
                    resp_data['plot_data'] = df_plot.to_json()
                    resp_data['dimension'] = 3
                elif nD > 3:
                    # Default to 3D: reduce to three components for plotting
                    pca_helper = PcaUtil()
                    X = pca_helper.reduce_dimension(X, n_components=3)
                    df_plot = pd.DataFrame(data=X, columns=['x', 'y', 'z'])
                    df_label = pd.DataFrame(data=y, columns=['label'])
                    df_plot = df_plot.join(df_label)
                    resp_data['plot_data'] = df_plot.to_json()
                    resp_data['dimension'] = 3
        else:
            # The dataset file is not found.
            resp_data[msg.ERROR] = "File name is not found in storage."
    else:
        resp_data[msg.ERROR] = escape(form._errors)
    return JsonResponse(resp_data, safe=False)
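# ModelUtils.is_fitted is assumed to build on scikit-learn's check_is_fitted,
# roughly as in this sketch (the project's actual utility may differ):
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

def is_fitted_sketch(model):
    # check_is_fitted raises NotFittedError for an unfitted estimator
    try:
        check_is_fitted(model)
        return True
    except NotFittedError:
        return False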