def read_data_detail_to_dataframe(data_file_name):
    """Read a data-detail CSV from file storage into a DataFrame.

    Args:
        data_file_name: Name of the file in storage. Falls back to the
            legacy default file when empty/None.

    Returns:
        pandas.DataFrame read with row 0 as the header.
    """
    # TODO change to DB
    # Bug fix: the caller-supplied name was previously always overwritten
    # by the hard-coded default; the default is now only a fallback.
    if not data_file_name:
        data_file_name = "health_and_medical_history_501_600.csv"
    file_full_path = fs.get_full_path(file_name=data_file_name)
    df_data_detail = DataFrameUtil.convert_file_to_dataframe(file_full_path,
                                                             header=0)
    return df_data_detail
# Example #2
def elbow_plot_handler_old(request):
    """Render an elbow-curve (PCA explained-variance) plot for a stored file.

    GET params:
        file_name:       Name of the data file in storage.
        column_header:   "on" when the file's first row is a header.
        exclude_columns: Comma-separated 1-based column indexes to drop.

    Returns:
        JsonResponse with a Bokeh payload under "bokeh_plot", or an
        error message under "msg".
    """
    resp_data = dict()
    file_name = request.GET.get("file_name")
    column_header = request.GET.get("column_header")
    exclude_columns = request.GET.get("exclude_columns")

    # Guard clauses replace the original deeply-nested if/else.
    if not file_name:
        resp_data['msg'] = "[ERROR] File name is invalid."
        return JsonResponse(resp_data)

    fs = FileStorage()
    file_full_path = fs.get_base_location() + file_name
    if not fs.is_file(file_full_path):
        resp_data["msg"] = "[ERROR] File is not found."
        return JsonResponse(resp_data)

    # Use row 0 as the header only when the user checked the option.
    column_header_idx = 0 if column_header == "on" else None
    df = DataFrameUtil.convert_file_to_dataframe(file_full_path,
                                                 header=column_header_idx)

    # Drop columns specified by the user (UI sends 1-based indexes).
    if exclude_columns:
        column_indexs = [int(i) - 1 for i in exclude_columns.split(",")]
        df = DataFrameUtil.drop_column_by_index(df, column_indexs)
        # Bug fix: removed unused is_nan/is_finite computations — their
        # results were never read, and np.isnan raises on non-numeric
        # frames. Also removed a stray debug print of column_header.

    # Standardize data, then compute PCA explained-variance ratios.
    X_scaled = PreProcessingUtil.standardize(df)
    pca_helper = PcaUtil()
    pca = pca_helper.get_fit_transfrom_pca(X_scaled)
    arr_variance_ratio = pca.explained_variance_ratio_

    # Build the Bokeh tab layout and serialize it for the template.
    elbow_plot = draw_elbow_plot(arr_variance_ratio)
    tab1 = Panel(child=elbow_plot, title="Elbow Curve Plot")
    tabs = Tabs(tabs=[tab1])
    script, div = components(tabs)
    resp_data["bokeh_plot"] = {'script': script, 'div': div}

    return JsonResponse(resp_data)
def read_based_space_to_dataframe():
    """
    Read data from file and convert to dataframe for input X that will be predicted and generated as data in scatter plot
    """
    # TODO need to change this setting to DB
    source_path = fs.get_full_path("radiomic_result_501_600.csv")
    return DataFrameUtil.convert_file_to_dataframe(source_path, header=0)
# Example #4
def process_model_data(model_file_name, data_file_name, data_detail_file_name,
                       n_components=100):
    """Predict labels for new data with a saved model and build a 2D plot.

    Args:
        model_file_name:       Saved model file to load.
        data_file_name:        Feature data file (read without a header row).
        data_detail_file_name: Detail file (header row) matched to the
                               processed data by row index.
        n_components:          Number of PCA components fed to the model
                               (default 100, matching the original
                               hard-coded value — now parameterized).

    Returns:
        dict with Bokeh 'script' and 'div' entries for the scatter plot.
    """
    fs = FileStorage()
    # TODO change: header handling should come from configuration/DB.
    # Dataframe of data to process; new data apart from training.
    df_data = DataFrameUtil.convert_file_to_dataframe(
        fs.get_full_path(data_file_name), header=None)

    # Dataframe for matching index with processed data and showing detail.
    df_data_detail = DataFrameUtil.convert_file_to_dataframe(
        fs.get_full_path(data_detail_file_name), header=0)

    # Load model
    model = ModelUtils.load_model(model_file_name)

    # TODO change to DB and dynamic
    # Standardize data, reduce dimensions, then predict.
    logger.debug("Dimensionality Reduction by PCA...")
    pca_helper = PcaUtil()
    X_scaled = PreProcessingUtil.fit_transform(df_data)
    X_reduced = pca_helper.get_pc(X_scaled, n_components=n_components)
    pred_y = model.predict(X_reduced)
    df_label = pd.DataFrame(pred_y, columns=["Label"])

    # Project to 2 components purely for the scatter-plot axes.
    X_graph = pca_helper.get_pc(X_scaled, n_components=2)
    df_points = pd.DataFrame(X_graph, columns=['PC1', 'PC2'])

    # Join predicted labels with plot coordinates; detail rows are
    # matched by row index inside draw_2d.
    df_graph = df_label.join(df_points)
    script, div = draw_2d(df_graph, df_data_detail)

    return {'script': script, 'div': div}
def analyze_data(file_full_path, header_row=None):
    """Load a file into a DataFrame and run the standard analysis on it.

    Args:
        file_full_path: Absolute path of the file to read.
        header_row:     Row index to use as header, or None.

    Returns:
        The analysis results produced by DataFrameUtil.analyze_dataframe.
    """
    # TODO header should be specified by user
    frame = DataFrameUtil.convert_file_to_dataframe(file_full_path,
                                                    header=header_row)
    return DataFrameUtil.analyze_dataframe(frame, header_row)
def load_model(model_name):
    """Build and return a KMeans model fitted on the default training set.

    NOTE(review): ``model_name`` is currently ignored — the model is
    rebuilt from a hard-coded file on every call; loading a persisted
    model by name is still a TODO.
    """
    # TODO change to load setting from DB
    # TODO below data must be trained data
    training_path = fs.get_full_path("radiomic482_no_key.csv")
    df_train = DataFrameUtil.convert_file_to_dataframe(training_path, header=0)
    scaled = PreProcessingUtil.standardize(df_train)
    reduced = PcaUtil.reduce_dimension(scaled, n_components=50)
    return KMeanUtil.get_kmean_model(reduced, n_clusters=5, random_state=42)
# Example #7
def get_scaled_dataframe(form):
    """Read the dataset named in *form* from storage and standardize it.

    Args:
        form: Validated form providing 'data_file_name' and
              'column_header' ("on" when the first row is a header).

    Returns:
        The standardized data produced by PreProcessingUtil.standardize.
    """
    data_file_name = form.cleaned_data['data_file_name']
    column_header = form.cleaned_data['column_header']
    data_file_full_path = fs.get_full_path(data_file_name)

    # Bug fix: column_header_idx was referenced while unbound (NameError)
    # whenever the header checkbox was off; default to "no header row".
    column_header_idx = 0 if column_header == "on" else None

    df = DataFrameUtil.convert_file_to_dataframe(data_file_full_path,
                                                 header=column_header_idx)
    return PreProcessingUtil.standardize(df)
def read_file_to_dataframe(file_name, column_header_idx):
    """Read *file_name* from the storage base location into a DataFrame."""
    full_path = fs.get_base_location() + file_name
    frame = DataFrameUtil.convert_file_to_dataframe(full_path,
                                                    header=column_header_idx)
    return frame
# Example #9
def unsupervised_learning_train_test_handler(request):
    """Train and evaluate a model from form input, returning JSON results.

    Reads the dataset/label files named in the submitted form, optionally
    reduces dimensionality with PCA, evaluates via cross-validation or a
    train/test split, and optionally fits and saves the model.

    Args:
        request: Django request; parameters are read from request.GET.

    Returns:
        JsonResponse carrying scores/params, a save confirmation, or
        error messages under "msg".
    """
    resp_data = dict()
    process_log = []
    msg = []
    resp_data['process_log'] = process_log
    resp_data['msg'] = msg

    form = SupervisedLearningTrainTestForm(request.GET)
    # When it's valid, data from the screen is converted to Python types
    # and stored in cleaned_data.
    if form.is_valid():
        sel_algorithm = form.cleaned_data['sel_algorithm']
        sel_dim_reduction = form.cleaned_data['sel_dim_reduction']
        n_components = form.cleaned_data['n_components']
        dataset_file_name = form.cleaned_data['dataset_file_name']
        column_header = form.cleaned_data['column_header']
        label_file_name = form.cleaned_data['label_file_name']
        label_column_header = form.cleaned_data['label_column_header']
        test_size = form.cleaned_data['test_size']
        sel_test_method = form.cleaned_data['sel_test_method']
        n_folds = form.cleaned_data['n_folds']
        is_saved = form.cleaned_data['is_saved']
        model_file_name = form.cleaned_data['model_file_name']

        if fs.is_file_in_base_location(dataset_file_name) \
            and fs.is_file_in_base_location(label_file_name):

            # Get the data file and store it in a data frame.
            data_file_path = fs.get_base_location() + dataset_file_name
            # Use row 0 as the header only when the checkbox is on.
            column_header_idx = 0 if column_header == "on" else None

            df = DataFrameUtil.convert_file_to_dataframe(
                data_file_path, header=column_header_idx)

            # Features data.  Bug fix: X previously stayed None when no
            # dimensionality reduction was selected, which crashed
            # cross_val_score/fit below; default to the raw dataframe.
            X = df
            if sel_dim_reduction == "pca":
                logger.debug("Dimensionality Reduction by PCA...")
                pca_helper = PcaHelper()
                # Standardize data, reduce dimensions and return as X.
                X_scaled = PreProcessingUtil.fit_transform(df)
                X = pca_helper.get_pc(X_scaled, n_components)
                logger.debug("PCA Done")

            # Label data: read with pandas, flatten to a 1-D array.
            label_file_path = fs.get_base_location() + label_file_name
            label_column_header_idx = 0 if label_column_header == "on" else None
            y = pd.read_csv(label_file_path,
                            header=label_column_header_idx).values.ravel()

            clf = None  # Model
            if sel_algorithm:
                logger.debug("Creating model by SVM...")
                # Select the model implementation (e.g. SVM one-vs-one
                # or one-vs-all) based on the requested algorithm.
                clf = init_model_object(sel_algorithm)

            if sel_test_method:
                logger.debug("Starting Cross Validation...")
                if sel_test_method == "cv" and n_folds:
                    scores = cross_val_score(clf, X, y, cv=n_folds)
                    txt_accuracy = "%0.2f (+/- %0.2f)" % (scores.mean(),
                                                          scores.std() * 2)
                    logger.debug(txt_accuracy)
                    resp_data["scores"] = scores.tolist()
                    resp_data["accuracy_mean"] = scores.mean()
                    resp_data["params"] = clf.get_params(deep=True)
                else:
                    # Set random_state here to get the same split for
                    # different runs.
                    # NOTE(review): the split result is unused beyond
                    # this point — evaluating on it is still a TODO.
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=test_size, random_state=42)

            if is_saved == 1 and model_file_name:
                clf.fit(X, y)
                logger.debug("Save model as %s", model_file_name)
                saved_model_file_name = ModelUtils.save_model(
                    clf, model_file_name)
                # Typo fix in the user-facing message ("succuessfully").
                resp_data[
                    "msg"] = "Model has been saved successfully as " + saved_model_file_name
        else:
            # Dataset or label file is not found.
            msg.append("File name is not found in storage.")

    else:
        resp_data['msg'] = form._errors

    return JsonResponse(resp_data)
def pipeline_run_handler(request):
    """Run a user-defined ML pipeline over a stored dataset and return JSON.

    The pipeline steps arrive as a comma-separated string; all step
    parameters (PCA/KernelPCA/LDA/t-SNE components, feature selection,
    cross-validation settings) are collected from the form and forwarded
    to ``process_pipeline``. If the transformed X is 2-D or 3-D it is
    serialized as plot data; higher dimensions are PCA-reduced to 3-D.

    Returns:
        JsonResponse with the pipeline result, optional 'plot_data' /
        'dimension' entries, and optional save-confirmation or error
        messages.
    """
    resp_data = dict()

    form = PipelineForm(request.GET)
    # When it's valid, data from screen is converted to Python type
    # and stored in clean_data

    if form.is_valid():
        # Comma-separated list of pipeline step names, parsed below.
        str_pipeline = form.cleaned_data['pipeline']
        dataset_file_name = form.cleaned_data['dataset_file_name']
        column_header = form.cleaned_data['column_header']

        label_file_name = form.cleaned_data['label_file_name']
        label_column_header = form.cleaned_data['label_column_header']

        # Dimensionality Reduction
        pca_n_components = form.cleaned_data['pca_n_components']
        kernel_pca_n_components = form.cleaned_data['kernel_pca_n_components']
        lda_n_components = form.cleaned_data['lda_n_components']
        tsne_n_components = form.cleaned_data['tsne_n_components']

        # Test
        test_size = form.cleaned_data['test_size']
        n_folds = form.cleaned_data['n_folds']

        # Save model
        save_as_name = form.cleaned_data['save_as_name']

        # Feature Selection (sequential feature selector settings)
        sfs_k_features = form.cleaned_data['sfs_k_features']
        sfs_k_neighbors = form.cleaned_data['sfs_k_neighbors']
        sfs_forward = form.cleaned_data['sfs_forward']
        sfs_floating = form.cleaned_data['sfs_floating']
        sfs_scoring = form.cleaned_data['sfs_scoring']
        sfs_cv = form.cleaned_data['sfs_cv']
        sfs_n_jobs = form.cleaned_data['sfs_n_jobs']

        select_k_best_n_k = form.cleaned_data['select_k_best_n_k']

        stratified_kfold_n_split = form.cleaned_data[
            'stratified_kfold_n_split']
        stratified_kfold_shuffle = form.cleaned_data[
            'stratified_kfold_shuffle']

        # Dataframe for storing dataset from file.
        df = pd.DataFrame()

        if fs.is_file_in_base_location(dataset_file_name):
            # and fs.is_file_in_base_location(label_file_name):

            # Get data file and store in data frame.
            data_file_path = fs.get_base_location() + dataset_file_name
            # dataset column header checking: use row 0 as header only
            # when the checkbox is on.
            column_header_idx = None
            if column_header == "on":
                column_header_idx = 0

            df = DataFrameUtil.convert_file_to_dataframe(
                data_file_path, header=column_header_idx)

            # PCA process
            # Features data
            X = df

            # Label data
            y = None

            # Use pandas to read data then change to 1D array.
            # The label file is optional — y stays None when missing.
            if fs.is_file_in_base_location(label_file_name):
                label_column_header_idx = None
                if label_column_header == "on":
                    label_column_header_idx = 0
                label_file_path = fs.get_base_location() + label_file_name
                y = pd.read_csv(label_file_path,
                                header=label_column_header_idx).values.ravel()

            # process pipeline: bundle every step parameter into one dict.
            arr_pipeline = str_pipeline.split(",")
            parameters = dict()
            parameters['n_folds'] = n_folds
            parameters['pca_n_components'] = pca_n_components
            parameters['kernel_pca_n_components'] = kernel_pca_n_components
            parameters['lda_n_components'] = lda_n_components
            parameters['tsne_n_components'] = tsne_n_components
            parameters['test_size'] = test_size
            parameters['select_k_best_n_k'] = select_k_best_n_k

            parameters['stratified_kfold_n_split'] = stratified_kfold_n_split
            parameters['stratified_kfold_shuffle'] = stratified_kfold_shuffle

            if sfs_k_features != "":
                # In case of feature selection, plot result as table
                # Feature Selection
                parameters['sfs_k_neighbors'] = sfs_k_neighbors
                parameters['sfs_k_features'] = sfs_k_features
                parameters['sfs_forward'] = sfs_forward
                parameters['sfs_floating'] = sfs_floating
                parameters['sfs_scoring'] = sfs_scoring
                parameters['sfs_cv'] = sfs_cv
                parameters['sfs_n_jobs'] = sfs_n_jobs
                parameters['feature_names'] = df.columns

            # process_pipeline returns the transformed X/y and the model
            # alongside the serializable result payload.
            result, X, y, model = process_pipeline(arr_pipeline, X, y,
                                                   parameters)
            print(X)
            print(y)
            resp_data = result

            if save_as_name != "":
                # If model is not fitted yet, fit the model and save
                if not ModelUtils.is_fitted(model):
                    model.fit(X, y)

                save_as_name = ModelUtils.save_model(model, save_as_name)
                # NOTE(review): `msg` is not defined in this function —
                # msg.SUCCESS (and msg.ERROR below) will raise NameError
                # unless a module-level `msg` exists; this looks like it
                # was meant to be django.contrib.messages constants.
                # Confirm against the module's imports.
                resp_data[
                    msg.
                    SUCCESS] = "Model has been save successfully as " + save_as_name

                # Display table that list feature in order.

            # Serialize X for plotting when it is non-empty (ndarray or
            # DataFrame); note `and` binds tighter than `or` here.
            if isinstance(X, np.ndarray) and X.any() \
                or isinstance(X, pd.DataFrame) and not X.empty:
                # Check X dimension (number of feature columns).
                nD = X.shape[1]
                if nD == 2:
                    # For 2D
                    #                     pca_helper = PcaUtil()
                    #                     X2d = pca_helper.reduce_dimension(X, n_components=2)
                    df_plot = pd.DataFrame(data=X, columns=['x', 'y'])
                    # df_label = pd.DataFrame(data=y, columns=['label'])
                    df_plot['label'] = y
                    resp_data['plot_data'] = df_plot.to_json()
                    resp_data['dimension'] = 2

                elif nD == 3:
                    # For 3D
                    #                 X3d = pca_helper.reduce_dimension(X, n_components=3)
                    df_plot = pd.DataFrame(data=X, columns=['x', 'y', 'z'])
                    # df_label = pd.DataFrame(data=y, columns=['label'])
                    # df_plot = df_plot.join(df_label)
                    df_plot['label'] = y
                    resp_data['plot_data'] = df_plot.to_json()
                    resp_data['dimension'] = 3

                elif nD > 3:
                    # Default to 3D: reduce high-dimensional X for display.
                    pca_helper = PcaUtil()
                    X = pca_helper.reduce_dimension(X, n_components=3)
                    df_plot = pd.DataFrame(data=X, columns=['x', 'y', 'z'])
                    df_label = pd.DataFrame(data=y, columns=['label'])
                    df_plot = df_plot.join(df_label)
                    resp_data['plot_data'] = df_plot.to_json()
                    resp_data['dimension'] = 3

        else:
            # File dataset file is not found.
            resp_data[msg.ERROR] = "File name is not found in storage."

    else:
        resp_data[msg.ERROR] = escape(form._errors)

    return JsonResponse(resp_data, safe=False)