Example No. 1
    def evaluate(self, array_data):
        """
        Calculate the standard deviation of the array and store it in the history
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        array = array_data.data
        if numpy.issubdtype(array.dtype, numpy.number):
            idx = numpy.logical_not(numpy.isnan(array))
            if StatContainer.conditional_array is not None and StatContainer.conditional_array.data.size == array.size:
                idx = numpy.logical_and(idx, StatContainer.conditional_array.data)
            std_val = numpy.std(array[idx])
            result_object = ResultObject(
                std_val, [], DataType.array, CommandStatus.Success)
            result_object.createName(
                    array_data.keyword_list,
                    command_name=self.commandTags()[0],
                    set_keyword_list=True)
            df_new = pd.DataFrame()
            df_new['Feature'] = [array_data.name]
            df_new['Standard Deviation'] = [std_val]
            TablePrinter.printDataFrame(df_new)

            # Printer.Print("Standard deviation of", array_data.name,
            #              "is", std_val)
        else:
            Printer.Print("The array is not of numeric type so cannot",
                          "find stdev")

        return result_object
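For reference, the same NaN-masking and conditional-filter logic can be sketched standalone with plain NumPy (the array and condition below are illustrative, not part of the original class):

import numpy as np

values = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
condition = np.array([True, True, True, False, True])  # optional row filter

# Drop NaN entries, then apply the conditional filter, as the evaluate() above does
mask = np.logical_and(~np.isnan(values), condition)
std_val = np.std(values[mask])
print("Standard deviation:", std_val)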
Example No. 2
    def evaluate(self, data_frame, classifier_algo):
        """
        Train a classifier on multiple arrays

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        # Get the data frame
        df = data_frame.data
        #command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(array_datas)

        if StatContainer.ground_truth is None:
            Printer.Print("Please set a feature vector to ground truth by",
                          "typing set ground truth before using this command")
            result_object = ResultObject(None, None, None, CommandStatus.Error)
            return result_object
        else:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()

        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)

        # Get the classifier model
        model = classifier_algo.data[0]

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        # Train the classifier
        Printer.Print("Training the classifier")
        df_show = pd.DataFrame()
        df_show['Features'] = df.columns

        TablePrinter.printDataFrame(df_show)
        model.fit(X, Y)

        # Print an update
        Printer.Print("The classifier", classifier_algo.name,
                      "has been trained")

        predictions = model.predict(X)
        accuracy = metrics.accuracy_score(predictions, Y)
        Printer.Print("Accuracy on training set : %s" % "{0:.3%}".format(accuracy))

        trained_model = {'Scaler': scaler, 'Model': model}

        result_object = ResultObject(trained_model, [], DataType.trained_model,
                              CommandStatus.Success)

        classifier_algo_name = classifier_algo.name.replace('.', ' ')
        result_object.createName(data_frame.keyword_list, command_name=classifier_algo_name,
                          set_keyword_list=True)

        return result_object
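The scale-fit-score pattern above can be reproduced with scikit-learn alone; a minimal sketch on toy data (the arrays and the choice of LogisticRegression are assumptions for illustration):

import numpy as np
from sklearn import metrics, preprocessing
from sklearn.linear_model import LogisticRegression

# Toy feature matrix and ground truth labels
X = np.array([[1.0, 2.0], [2.0, 1.0], [8.0, 9.0], [9.0, 8.0]])
Y = np.array([0, 0, 1, 1])

# Standardize, train, and report training accuracy, mirroring evaluate() above
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
model = LogisticRegression().fit(X_scaled, Y)
accuracy = metrics.accuracy_score(Y, model.predict(X_scaled))
print("Accuracy on training set: {0:.3%}".format(accuracy))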
Example No. 3
 def initializeTable(self):
     headers = ["Column_name", "Column_type", "Size", "Column_range"]
     alignments = [Align.Right, Align.Center, Align.Center, Align.Left]
     col_widths = [30, 15, 6, 40]
     TablePrinter.initialize(4,
                             col_widths,
                             headers,
                             alignments,
                             tabbed=False)
Example No. 4
    def evaluate(self, data_frame, target):
        """
        Use one of the models to identify the top predictors
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        # Get the data frame
        df = data_frame.data
        #command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(array_datas)

        if StatContainer.ground_truth is None:
            Printer.Print("Please set a feature vector to ground truth by",
                          "typing set ground truth before using this command")
            result_object = ResultObject(None, None, None, CommandStatus.Error)
            return result_object
        else:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()

        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)

        numbers = findNumbers(target.data, 1)
        if numbers != [] and numbers[0].data > 0:
            num = int(numbers[0].data)
        else:
            num = 10  # If not specified select top 10 features

        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        model = RandomForestClassifier(n_estimators=100)
        model.fit(X, Y)
        featImpVals = model.feature_importances_

        featimp = pd.Series(featImpVals,
                            index=df.columns).sort_values(ascending=False)

        df_show = pd.DataFrame()
        df_show['top features'] = featimp.index[0:num]
        df_show['feature importance'] = featimp.values[0:num]
        TablePrinter.printDataFrame(df_show)
        df_new = df[featimp.index[0:num]]

        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)

        command_name = 'top.predictors'
        result_object.createName(data_frame.name,
                                 command_name=command_name,
                                 set_keyword_list=True)

        return result_object
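The ranking itself is a RandomForest fit followed by sorting feature_importances_; a self-contained sketch on synthetic data (column names and parameters are illustrative):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Synthetic data frame with named feature columns
X, y = make_classification(n_samples=200, n_features=6, random_state=0)
df = pd.DataFrame(X, columns=["f{}".format(i) for i in range(6)])

model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(df.values, y)

# Rank features by importance and keep the top `num`, as evaluate() above does
featimp = pd.Series(model.feature_importances_,
                    index=df.columns).sort_values(ascending=False)
num = 3
print(featimp.index[:num].tolist())
df_top = df[featimp.index[:num]]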
Example No. 5
    def evaluate(self):
        """
        Clear the row labels and store the result in the history
        """
        if StatContainer.row_labels is not None:
            TablePrinter.clearBackGround(StatContainer.row_labels.name)
        StatContainer.row_labels = None
        Printer.Print("clearing row labels")
        return ResultObject(None, None, None, CommandStatus.Success)
Example No. 6
    def evaluate(self, array_data):
        """
        Set the ground truth reference and store it in the history
        """
        if StatContainer.ground_truth is not None:
            TablePrinter.clearBackGround(StatContainer.ground_truth.name)
        StatContainer.ground_truth = array_data
        Printer.Print("Setting reference to ",
                      " ".join(array_data.keyword_list))
        return ResultObject(None, None, None, CommandStatus.Success)
Example No. 7
    def evaluate(self, array_data):
        """
        Set the row labels
        """
        if StatContainer.row_labels is not None:
            TablePrinter.clearBackGround(StatContainer.row_labels.name)
        StatContainer.row_labels = array_data
        Printer.Print("Setting row label to ",
                      " ".join(array_data.keyword_list))
        return ResultObject(None, None, None, CommandStatus.Success)
Example No. 8
    def evaluate(self, array_data):
        """
        Calculate the maximum value of the array and store it in the history
        """
        result_objects = []
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        array = array_data.data

        if numpy.issubdtype(array.dtype, numpy.number):
            idx = numpy.logical_not(numpy.isnan(array))
        elif numpy.issubdtype(array.dtype, numpy.datetime64):
            idx = numpy.logical_not(numpy.isnat(array))
        else:
            Printer.Print("The array is not supported type so cannot find max")
            return result_object
        if StatContainer.conditional_array is not None and StatContainer.conditional_array.data.size == array.size:
            idx = numpy.logical_and(idx, StatContainer.conditional_array.data)
        max_val = numpy.max(array[idx])
        idx = numpy.argmax(array[idx])
        if StatContainer.row_labels is not None:
            rl = StatContainer.row_labels.data
            max_rl = rl[idx]
            # Result for max index
            result_object = ResultObject(max_rl, [], DataType.array,
                                         CommandStatus.Success)

            result_object.createName(StatContainer.row_labels.name,
                                     command_name=self.commandTags()[0],
                                     set_keyword_list=True)
            result_objects.append(result_object)
        # Result for max value
        result_object = ResultObject(max_val, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(array_data.keyword_list,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)
        result_objects.append(result_object)

        # Create a dataframe to store the results
        df_new = pd.DataFrame()
        df_new['Feature'] = [array_data.name]
        df_new['Maximum'] = [max_val]
        if StatContainer.row_labels is not None:
            df_new[StatContainer.row_labels.name] = [max_rl]
            #Printer.Print("Maximum of", array_data.name, "is", max_val, "corresponding to", max_rl)
        # else:
        #Printer.Print("Maximum of", array_data.name, "is", max_val)
        TablePrinter.printDataFrame(df_new)
        return result_objects
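A standalone sketch of the NaN-aware maximum plus row-label lookup (the values and labels are invented for illustration):

import numpy as np

values = np.array([3.0, np.nan, 7.5, 1.2])
row_labels = np.array(["a", "b", "c", "d"])

# Drop NaNs before taking max/argmax, then map the index back to its row label
valid = np.logical_not(np.isnan(values))
max_val = np.max(values[valid])
max_label = row_labels[valid][np.argmax(values[valid])]
print(max_val, max_label)  # 7.5 c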
Example No. 9
    def evaluate(self, array_datas, data_frame):
        """
        Calculate a summary of the data frame or arrays and store it in the history
        """
        result_object = ResultObject(None, None, None, CommandStatus.Success)

        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
            if len(cname) == 0:
                cname = ".".join(kl1)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)

        df_new = self.performOperation(df)
        TablePrinter.printDataFrame(df_new)

        result_objects = []
        # Adding the newly created CSV
        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)
        command_name = "smry"
        result_object.createName(cname,
                                 command_name=command_name,
                                 set_keyword_list=True)

        result_objects.append(result_object)
        # create an updated list of column names by removing the common names
        kl1 = df_new.columns
        truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
        for col in range(0, len(kl1)):
            arr = df_new[kl1[col]]
            result_object = ResultObject(arr, [], DataType.array,
                                         CommandStatus.Success)

            result_object.createName(truncated_kl1[col],
                                     command_name=command_name,
                                     set_keyword_list=True)

            result_objects.append(result_object)

        return result_objects
Example No. 10
    def evaluate(self, array_data):
        """
        Calculate the range of the array and store it in the history
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        array = array_data.data

        if numpy.issubdtype(array.dtype, numpy.number):
            idx = numpy.logical_not(numpy.isnan(array))
        elif numpy.issubdtype(array.dtype, numpy.datetime64):
            idx = numpy.logical_not(numpy.isnat(array))
        else:
            Printer.Print("The array is not supported type so cannot find max")
            return result_object
        if StatContainer.conditional_array is not None and StatContainer.conditional_array.data.size == array.size:
            idx = numpy.logical_and(idx, StatContainer.conditional_array.data)
        max_val = numpy.max(array[idx])
        min_val = numpy.min(array[idx])
        range_val = max_val - min_val
        result_object = ResultObject(range_val, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(array_data.keyword_list,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)

        df_new = pd.DataFrame()
        df_new['Feature'] = [array_data.name]
        df_new['Range'] = [range_val]
        df_new['Minimum'] = [min_val]
        df_new['Maximum'] = [max_val]

        TablePrinter.printDataFrame(df_new)
        # Printer.Print("Range of", array_data.name, "is", range_val,
        #       "from", min_val, "to", max_val)

        return result_object
Example No. 11
    def evaluate(self, array_datas):
        """
        Create a new dataframe using the supplied arrays

        """
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            Printer.Print(
                "Please check whether the arrays are of the same size")
            return ResultObject(None, None, None, CommandStatus.Error)

        result_object = ResultObject(df, [], DataType.csv,
                                     CommandStatus.Success)

        command_name = 'concatenate.array'
        result_object.createName(cname,
                                 command_name=command_name,
                                 set_keyword_list=True)

        TablePrinter.printDataFrame(df)

        return result_object
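The concatenation itself amounts to assembling same-length arrays into a pandas DataFrame; a minimal sketch (column names are illustrative):

import numpy as np
import pandas as pd

age = np.array([34, 51, 29])
score = np.array([0.7, 0.4, 0.9])

# Arrays must have the same length, otherwise the DataFrame constructor raises ValueError
df = pd.DataFrame({"age": age, "score": score})
print(df)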
Example No. 12
    def evaluate(self, history, user_conv):
        """
        Takes in the session history and prints all the elements stored in each
        category
        """
        result_object = ResultObject(None, None, None, CommandStatus.Success)
        if not hasattr(history.data, 'command_database'):
            Printer.Print("History does not contain command database")
            return result_object
        command_database = history.data.command_database
        TablePrinter.initialize(
            4, [20, 20, 30, 55],
            ['Command Name', 'Command type', 'Keywords', 'Description'],
            [Align.Right, Align.Center, Align.Center, Align.Left])
        user_input = user_conv.data
        user_command_types = [
            data_object.data
            for data_object in self.commandtype_database.search(user_input)
        ]
        # try:
        for command_data_object in command_database.data_objects:
            command_object = command_data_object.data
            command_type = command_object.commandType()
            if user_command_types != [] and command_type not in user_command_types:
                continue
            if command_data_object.name is None:
                if len(command_data_object.keyword_list) == 0:
                    object_name = "None"
                else:
                    object_name = command_data_object.keyword_list[0]
            else:
                object_name = command_data_object.name
            command_type_name = command_type.name.lower()
            command_tags = ' '.join(command_object.commandTags()[:2])
            command_brief = command_object.briefDescription()
            TablePrinter.addRow(
                (object_name, command_type_name, command_tags, command_brief))
        # except:
        #    result_object = ResultObject(None, None, None, CommandStatus.Error)
        TablePrinter.sort(1)  # Sort data by command type
        TablePrinter.show()

        return result_object
Example No. 13
    def evaluate(self, array_datas, data_frame):
        """
        Calculate t-tests between ground-truth groups for each array and store the results in the history
        """

        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            print("Could not find the reference variable.")
            print("Please set the reference variable")
            return ResultObject(None, None, None, CommandStatus.Error)
        else:
            gtVals = StatContainer.filterGroundTruth()
            ground_truth = StatContainer.ground_truth.name
            if len(gtVals) != df.shape[0]:
                Printer.Print(
                    "The size of the ground truth does not match with arrays being analyzed"
                )
                Printer.Print(len(gtVals), df.shape[0])
                return ResultObject(None, None, None, CommandStatus.Error)

        uniqVals = StatContainer.isCategorical(gtVals)
        df[ground_truth] = gtVals
        df_new = pd.DataFrame()
        if ground_truth in df.columns:
            df_new['features'] = df.columns.drop(ground_truth).values
        else:
            df_new['features'] = df.columns

        allCols = df_new['features']
        for iter in range(len(uniqVals)):
            for iter1 in range(iter + 1, len(uniqVals)):
                df_new['pValue: ' + str(iter) + ' vs ' +
                       str(iter1)] = np.zeros(df_new.shape[0])

        for iter_feature in range(len(df_new['features'])):
            arr = df[allCols[iter_feature]]
            for iter in range(len(uniqVals)):
                uniV = uniqVals[iter]
                a = arr[gtVals == uniV]
                for iter1 in range(iter + 1, len(uniqVals)):
                    b = arr[gtVals == uniqVals[iter1]]
                    if uniV != uniqVals[iter1]:
                        ttest_val = scipy.stats.ttest_ind(a,
                                                          b,
                                                          axis=0,
                                                          equal_var=False)
                        df_new['pValue: ' + str(iter) + ' vs ' +
                               str(iter1)][iter_feature] = (ttest_val.pvalue)
                    else:
                        df_new['pValue: ' + str(iter) + ' vs ' +
                               str(iter1)][iter_feature] = 0

        TablePrinter.printDataFrame(df_new)

        result_objects = []
        # Adding the newly created csv
        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)
        result_object.createName(cname,
                                 command_name='sigtest',
                                 set_keyword_list=True)

        result_objects.append(result_object)
        # create an updated list of column names by removing the common names
        kl1 = df_new.columns
        truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
        for col in range(0, len(kl1)):
            arr = df_new[kl1[col]]
            result_object = ResultObject(arr, [], DataType.array,
                                         CommandStatus.Success)
            command_name = 'sigtest'
            result_object.createName(truncated_kl1[col],
                                     command_name=command_name,
                                     set_keyword_list=True)

            result_objects.append(result_object)
        return result_objects
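The pairwise significance test per feature reduces to Welch's t-test between ground-truth groups; a compact SciPy sketch on invented data:

import numpy as np
import pandas as pd
import scipy.stats

rng = np.random.default_rng(0)
df = pd.DataFrame({"feat1": rng.normal(0, 1, 60),
                   "feat2": rng.normal(1, 1, 60)})
groups = np.repeat(["A", "B", "C"], 20)
labels = np.unique(groups)

rows = []
for feature in df.columns:
    arr = df[feature].values
    row = {"features": feature}
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):
            a = arr[groups == labels[i]]
            b = arr[groups == labels[j]]
            # Welch's t-test (unequal variances), as in the evaluate() above
            ttest = scipy.stats.ttest_ind(a, b, axis=0, equal_var=False)
            row["pValue: {} vs {}".format(labels[i], labels[j])] = ttest.pvalue
    rows.append(row)
print(pd.DataFrame(rows))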
Example No. 14
    def evaluate(self, array_datas, data_frame):
        """
        Calculate the ROC AUC of each array and store it in the history
        """
        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            Printer.Print("Could not find the reference variable.")
            Printer.Print("Please set the reference variable")
            return ResultObject(None, None, None, CommandStatus.Error)
        else:
            gtVals = StatContainer.filterGroundTruth()
            ground_truth = StatContainer.ground_truth.name
            if len(gtVals) != df.shape[0]:
                Printer.Print(
                    "The size of the ground truth does not match with arrays being analyzed"
                )
                Printer.Print(len(gtVals), df.shape[0])
                return ResultObject(None, None, None, CommandStatus.Error)

        uniqVals = StatContainer.isCategorical(gtVals)
        df[ground_truth] = gtVals
        df_new = pd.DataFrame()
        if ground_truth in df.columns:
            df_new['features'] = df.columns.drop(ground_truth).values
        else:
            df_new['features'] = df.columns

        allCols = df_new['features']
        df_new['AUC'] = 0

        avgAUC = []
        for iter_feature in range(len(df_new['features'])):
            arr = df[allCols[iter_feature]]
            model = LogisticRegression()
            X = arr.values
            X1 = X.reshape(-1, 1)
            model.fit(X1, gtVals)
            # evaluate the model
            allAUC = []
            Y_Pr = model.predict_proba(X1)
            for iter in range(len(uniqVals)):
                fpr, tpr, thresholds = metrics.roc_curve(
                    gtVals, Y_Pr[:, iter], pos_label=uniqVals[iter])
                auc_val = metrics.auc(fpr, tpr)
                allAUC.append(auc_val)
            avgAUC.append(np.mean(allAUC))
        df_new['AUC'] = avgAUC

        TablePrinter.printDataFrame(df_new)

        # New data frame
        result_objects = []
        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)
        result_object.createName(cname,
                                 command_name='rcurve',
                                 set_keyword_list=True)

        result_objects.append(result_object)

        # create an updated list of column names by removing the common names
        kl1 = df_new.columns
        truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
        for col in range(0, len(kl1)):
            arr = df_new[kl1[col]]
            result_object = ResultObject(arr, [], DataType.array,
                                         CommandStatus.Success)
            command_name = 'rcurve'
            result_object.createName(truncated_kl1[col],
                                     command_name=command_name,
                                     set_keyword_list=True)

            result_objects.append(result_object)

        return result_objects
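The per-feature AUC computation is a single-feature logistic regression followed by roc_curve/auc averaged over the classes; a self-contained sketch on synthetic binary data:

import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
y = np.array([0] * 50 + [1] * 50)
x = y + rng.normal(0, 0.5, 100)  # one feature, loosely separating the classes

# Fit on the single feature and average AUC over the classes, as evaluate() above does
model = LogisticRegression().fit(x.reshape(-1, 1), y)
probs = model.predict_proba(x.reshape(-1, 1))
aucs = []
for i, label in enumerate(model.classes_):
    fpr, tpr, _ = metrics.roc_curve(y, probs[:, i], pos_label=label)
    aucs.append(metrics.auc(fpr, tpr))
print("Mean AUC:", np.mean(aucs))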
Example No. 15
 def __init__(self, parent=None):
     super(QtGUI, self).__init__(parent)
     # Create subcomponents of the GUI
     self.tab_container = QTabWidget()
     self.qt_printer = QtPrinter()
     self.qt_table_printer = QtTablePrinter()
     self.user_input = QCustomLineEdit()
     self.completion_model = QStringListModel()
     self.labels = [QLineEdit("None"), QLineEdit("None"), QLineEdit("None")]
     self.ground_truth = QLabel()
     self.row_label = QLabel()
     self.filter_label = QLabel()
     completer = QCompleter()
     completer.setModel(self.completion_model)
     self.user_input.setCompleter(completer)
     self.variable_history = QtTablePrinter()
     # Select global configs
     QTabManager.setParentWidget(self.tab_container)
     Window.selectWindowType(QtWindow)
     PropertyEditor.property_editor_class = QtPropertyEditor
     Printer.selectPrinter(self.qt_printer)
     TablePrinter.selectPrinter(self.qt_table_printer)
     # Get screen resolution:
     app = QApplication.instance()
     screen_resolution = app.desktop().screenGeometry()
     # Add ref labels
     self.ref_labels = []
     for ref_label in ['Reference', 'Row Label', 'Filter']:
         self.ref_labels.append(
             QLabel('<span style=" font-size: ' + str(self.label_font) +
                    'pt; font-weight:600;">' + ref_label + ': </span>'))
         self.ref_labels[-1].setMinimumHeight(0.02 *
                                              screen_resolution.height())
     # Font for user input:
     f = self.user_input.font()
     f.setPointSize(self.user_input_font)  # sets the size to 27
     self.user_input.setFont(f)
     f.setPointSize(self.label_font)
     for label in self.labels:
         label.setFont(f)
         label.setMinimumHeight(0.02 * screen_resolution.height())
         label.setReadOnly(True)
     # Size
     self.user_input.setMinimumHeight(0.02 * screen_resolution.height())
     self.qt_printer.text_box.setSizePolicy(QSizePolicy.Minimum,
                                            QSizePolicy.Expanding)
     self.tab_container.setSizePolicy(QSizePolicy.Expanding,
                                      QSizePolicy.Expanding)
     self.qt_table_printer.table_widget.setSizePolicy(
         QSizePolicy.MinimumExpanding, QSizePolicy.Expanding)
     self.variable_history.table_widget.setSizePolicy(
         QSizePolicy.Minimum, QSizePolicy.Minimum)
     self.ground_truth.setSizePolicy(QSizePolicy.Expanding,
                                     QSizePolicy.Minimum)
     # Layout
     layout = QVBoxLayout()
     # Add gt, rowlabels, filter
     hlayout = QHBoxLayout()
     for i in range(3):
         hlayout.addWidget(self.ref_labels[i])
         hlayout.addWidget(self.labels[i])
     # Add tabs for table and past history
     self.right_tab_widget = QTabWidget()
     self.right_tab_widget.addTab(self.qt_table_printer.table_widget,
                                  "Data Summary")
     self.right_tab_widget.addTab(self.variable_history.table_widget,
                                  "Past variables")
     # Add separate splitter for table and property editor
     h_splitter = QSplitter(Qt.Vertical)
     h_splitter.addWidget(self.right_tab_widget)
     h_splitter.setStretchFactor(0, 2)
     PropertyEditor.parent_widget = h_splitter
     # Add chat,window, tab
     splitter = QSplitter(Qt.Horizontal)
     splitter.addWidget(self.qt_printer.text_box)
     splitter.addWidget(self.tab_container)
     splitter.addWidget(h_splitter)
     splitter.setStretchFactor(0, 0)
     splitter.setStretchFactor(1, 2)
     splitter.setStretchFactor(2, 0)
     splitter.setSizes([1, 1000, 500])
     # Final
     layout.addLayout(hlayout)
     layout.addWidget(splitter)
     layout.addWidget(self.user_input)
     self.setLayout(layout)
     # Connections
     self.qt_table_printer.table_widget.itemDoubleClicked.connect(
         self.double_click_table_cell)
Example No. 16
    def evaluate(self, array_data, user_conv):
        """
        List all columns in a csv matrix
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        data = array_data.data
        req_categories = [
            res.data for res in self.column_type_db.search(user_conv.data)
        ]
        column_strings = {
            'Categorical': [],
            'Numeric': [],
            'Logic': [],
            'String': [],
            'Unknown': []
        }
        self.initializeTable()
        if hasattr(data, 'columns'):
            for column in data.columns:
                data_column = data[column].dropna()
                unique_vals = StatContainer.isCategorical(data_column)
                if unique_vals is not None:
                    column_type = "Categorical"
                elif np.issubdtype(data_column.dtype, np.number):
                    column_type = "Numeric"
                elif np.issubdtype(data_column.dtype, np.bool_):
                    column_type = "Logic"
                elif (len(data_column) > 0
                      and isinstance(data_column.iloc[0], str)):
                    column_type = "String"
                else:
                    column_type = "Unknown"
                # If we did not request specific column just ignore
                if req_categories != [] and column_type not in req_categories:
                    continue
                n_unique_vals = str(len(data_column))
                if unique_vals is not None:
                    n_unique_vals = str(len(unique_vals))
                    if len(unique_vals) < 5:
                        column_range = str(unique_vals)
                    else:
                        column_range = ('[' + str(unique_vals[0]) + '...' +
                                        str(unique_vals[-1]) + ']')
                elif np.issubdtype(data_column.dtype, np.number):
                    column_range = "[{:.2f}, {:.2f}]".format(
                        np.min(data_column), np.max(data_column))
                else:
                    column_range = ""
                column_strings[column_type].append(
                    (column, column_type, n_unique_vals, column_range))

            Printer.Print("Showing Statistics for",
                          " ".join(array_data.keyword_list))
            for column_type in column_strings:
                column_strings[column_type].sort()  # Sort the elements
                for row_data in column_strings[column_type]:
                    TablePrinter.addRow(row_data)
            TablePrinter.show()
            result_object = ResultObject(None, None, None,
                                         CommandStatus.Success)

        return result_object
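A minimal pandas/NumPy sketch of the per-column type classification used above (the sample frame is invented):

import numpy as np
import pandas as pd

df = pd.DataFrame({"city": ["NY", "SF", "NY", "LA"],
                   "price": [1.2, 3.4, np.nan, 2.2],
                   "active": [True, False, True, True]})

for column in df.columns:
    col = df[column].dropna()
    if np.issubdtype(col.dtype, np.number):
        column_type = "Numeric"
        column_range = "[{:.2f}, {:.2f}]".format(np.min(col), np.max(col))
    elif np.issubdtype(col.dtype, np.bool_):
        column_type = "Logic"
        column_range = ""
    elif len(col) > 0 and isinstance(col.iloc[0], str):
        column_type = "String"
        column_range = str(sorted(col.unique()))
    else:
        column_type = "Unknown"
        column_range = ""
    print(column, column_type, len(col), column_range)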
Example No. 17
 def evaluate(self, array_data, target):
     result = ResultObject(None, None, None, CommandStatus.Error)
     in_array = array_data.data
     N = in_array.shape[0]
     if StatContainer.conditional_array is not None and len(
             StatContainer.conditional_array.data) == N:
         in_array = in_array[StatContainer.conditional_array.data]
     if in_array.size == 0:
         Printer.Print("No data")
         return result
     nan_idx = StatContainer.getNanIdx(in_array)
     non_nan_idx = np.logical_not(nan_idx)
     non_nan_array = in_array[non_nan_idx]
     numbers = findNumbers(target.data, 1)
     try:
         unique_arr, inv, counts = np.unique(non_nan_array,
                                             return_inverse=True,
                                             return_counts=True)
     except Exception:
         return result
     if numbers != [] and numbers[0].data > 0:
         num = int(numbers[0].data)
         idx = None
         if not np.issubdtype(non_nan_array.dtype, np.number):
             num = min(unique_arr.size, num)
         if self._condition[0] == "top":
             Printer.Print("Finding top", num)
             if np.issubdtype(non_nan_array.dtype, np.number):
                 best_idx = np.argpartition(non_nan_array, -num)[-num:]
                 idx = np.full(non_nan_array.size, False)
                 idx[best_idx] = True
                 if num <= 30:
                     if StatContainer.row_labels is not None:
                         df_new = pd.DataFrame(
                             {array_data.name: non_nan_array[best_idx]})
                         df_new[
                             StatContainer.row_labels.
                             name] = StatContainer.row_labels.data[best_idx]
                         TablePrinter.printDataFrame(df_new)
                         TablePrinter.sort(0, ascending=False)
                     else:
                         Printer.Print("Top values:")
                         Printer.Print(non_nan_array[best_idx])
             else:
                 best_idx = np.argpartition(counts, -num)[-num:]
                 idx = np.isin(inv, best_idx)
                 if num <= 30:
                     Printer.Print("Top values:")
                     Printer.Print(unique_arr[best_idx])
         elif self._condition[0] == "bottom":
             Printer.Print("Finding bottom", num)
             if np.issubdtype(non_nan_array.dtype, np.number):
                 worst_idx = np.argpartition(non_nan_array, num)[:num]
                 idx = np.full(non_nan_array.size, False)
                 idx[worst_idx] = True
                 if num <= 30:
                     if StatContainer.row_labels is not None:
                         df_new = pd.DataFrame(
                             {array_data.name: non_nan_array[worst_idx]})
                         df_new[StatContainer.row_labels.
                                name] = StatContainer.row_labels.data[
                                    worst_idx]
                         TablePrinter.printDataFrame(df_new)
                         TablePrinter.sort(0, ascending=True)
                     else:
                         Printer.Print("Worst values:")
                         Printer.Print(non_nan_array[worst_idx])
             else:
                 worst_idx = np.argpartition(counts, num)[:num]
                 idx = np.isin(inv, worst_idx)
                 if num <= 30:
                     Printer.Print("Worst values:")
                     Printer.Print(unique_arr[worst_idx])
         elif self._condition[0] == "first":
             Printer.Print(array_data.data[:num])
             result = ResultObject(None, None, None, CommandStatus.Success)
         else:
             Printer.Print("Did not find the right condition")
         if idx is not None:
             out1 = np.full(in_array.size, False)
             out1[non_nan_idx] = idx
             if StatContainer.conditional_array is not None and len(
                     StatContainer.conditional_array.data) == N:
                 out = np.full(N, False)
                 out[StatContainer.conditional_array.data] = out1
             else:
                 out = out1
             result = ResultObject(out, [], DataType.logical_array,
                                   CommandStatus.Success, True)
             result.createName(array_data.keyword_list,
                               command_name=self._condition[0],
                               set_keyword_list=True)
     elif self._condition[0] == "first":
         if unique_arr.size < 50:
             Printer.Print(unique_arr)
         else:
             Printer.Print(non_nan_array[:10])
         result = ResultObject(None, None, None, CommandStatus.Success)
     return result
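The top/bottom selection above hinges on np.argpartition; a small standalone sketch of both directions (values are illustrative):

import numpy as np

values = np.array([5.0, 1.0, 9.0, 3.0, 7.0, 2.0])
num = 2

# Indices of the `num` largest values (order within the group is arbitrary)
top_idx = np.argpartition(values, -num)[-num:]
# Indices of the `num` smallest values
bottom_idx = np.argpartition(values, num)[:num]

print(values[top_idx])     # e.g. [7. 9.]
print(values[bottom_idx])  # e.g. [1. 2.]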
Example No. 18
    def evaluate(self, array_datas):
        """
        Calculate a label-wise (group-wise) array and store it in the history
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        if isinstance(array_datas, list) and len(array_datas) == 0:
            return result_object
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            gtVals = np.ones(df.shape[0])
            gtName = 'ground_truth'
        else:
            gtVals = StatContainer.filterGroundTruth()
            gtName = StatContainer.ground_truth.name

        # Remove nans:
        df[gtName] = gtVals
        df.dropna(inplace=True)

        gtVals = df[gtName]
        uniqVals = StatContainer.isCategorical(gtVals, uniqueCutoff=1000)
        binned_ground_truth = True

        if uniqVals is None and np.issubdtype(gtVals.dtype, np.number):
            # Convert to categorical
            df[gtName] = pd.cut(gtVals, 10)
            binned_ground_truth = True

        # Create groupwise arrays
        result_objects = []

        if uniqVals is not None:
            df_new = self.performOperation(df, gtName)

            df_new = df_new.reset_index()
            for col in df_new.columns:
                arr = df_new[col]
                kName = []
                if col == '':
                    kName = array_datas[0].keyword_list
                else:
                    # kName.append(cname)
                    kName.append(col)

                result_object = ResultObject(arr, [], DataType.array,
                                             CommandStatus.Success)
                command_name = 'labelwise.' + self._condition[0]
                result_object.createName(kName,
                                         command_name=command_name,
                                         set_keyword_list=True)

                result_objects.append(result_object)
            TablePrinter.printDataFrame(df_new)
        else:
            Printer.Print("The array is not of numeric type so cannot",
                          "calculate groupwise " + self._condition[0])
            result_objects.append(result_object)

        return result_objects
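The label-wise computation itself is a pandas groupby; a minimal sketch assuming a mean operation (column names are illustrative):

import pandas as pd

df = pd.DataFrame({"height": [1.6, 1.8, 1.7, 1.9],
                   "weight": [60.0, 80.0, 65.0, 85.0],
                   "label": ["a", "a", "b", "b"]})

# If the label column were continuous it could be binned first, e.g.
# df["label"] = pd.cut(df["label"], 10)
df_new = df.groupby("label").mean().reset_index()
print(df_new)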