Example #1
    def __init__(self, parent, controller, config, dbvar=None):
        self.widget_name = "analysispage"
        super().__init__(parent, controller, config, dbvar)
        self.engine = AnalysisPageEngine()

        self.graph_table_notebook = ttk.Notebook(self)
        self.graph_table_notebook.grid(row=0,
                                       rowspan=1,
                                       column=0,
                                       sticky="news")

        self.graph_frame = graphframe.GraphFrame(self.graph_table_notebook,
                                                 self, config)
        self.datatable = datatable.DataTable(self.graph_table_notebook, self,
                                             config)

        self.graph_table_notebook.add(self.graph_frame, text="Graph")
        self.graph_table_notebook.add(self.datatable, text="Table")

        self.query_panel = querypanel.QueryPanel(self, self, config)
        self.query_panel.grid(row=0, column=1, sticky="nw", rowspan=2)

        self.menu_pane = ttk.Labelframe(self, text="Controls")
        self.menu_pane.grid(row=1, column=0, sticky="NW")
        self.b_menu_pane()

        self.columnconfigure(0, weight=0)
        self.columnconfigure(1, weight=1)

        self.config_chain.append(self.query_panel)
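
A minimal standalone sketch of the Notebook-with-tabs layout that Example #1 builds; the widget names here are illustrative, not from the source:

import tkinter as tk
from tkinter import ttk

root = tk.Tk()
notebook = ttk.Notebook(root)
notebook.grid(row=0, column=0, sticky="news")

# Two tabs, mirroring the Graph/Table pair above.
graph = ttk.Frame(notebook)
table = ttk.Frame(notebook)
notebook.add(graph, text="Graph")
notebook.add(table, text="Table")

# Let the notebook cell absorb extra space when the window is resized.
root.columnconfigure(0, weight=1)
root.rowconfigure(0, weight=1)
root.mainloop()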
Example #2

    def fit_transform(self, X: dt.Frame, y: np.ndarray = None):
        target = '__target__'
        X[:, target] = dt.Frame(y)
        target_is_numeric = X[:, target][:, [bool, int, float]].shape[1] > 0
        if not target_is_numeric:
            X[:, target] = dt.Frame(LabelEncoder().fit_transform(
                X[:, target].to_pandas().iloc[:, 0].values).ravel())

        self._group_means = X[:,
                              dt.mean(dt.f[target]),
                              dt.by(*self.input_feature_names)]
        self._group_means.key = self.input_feature_names
        self.dataset_mean = X[target].mean().to_numpy().ravel()[0]

        # Expanding mean transform
        X_ = X.to_pandas()[self.input_feature_names + [target]]
        X_["index"] = X_.index
        X_shuffled = X_.sample(n=len(X_), replace=False)
        X_shuffled["cnt"] = 1
        X_shuffled["cumsum"] = (X_shuffled.groupby(
            self.input_feature_names,
            sort=False)['__target__'].apply(lambda x: x.shift().cumsum()))
        X_shuffled["cumcnt"] = (X_shuffled.groupby(
            self.input_feature_names,
            sort=False)['cnt'].apply(lambda x: x.shift().cumsum()))
        X_shuffled["encoded"] = X_shuffled["cumsum"] / X_shuffled["cumcnt"]
        X_shuffled["encoded"] = X_shuffled["encoded"].fillna(self.dataset_mean)
        X_transformed = X_shuffled.sort_values("index")["encoded"].values
        return dt.DataTable(X_transformed)
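
fit_transform above implements an expanding-mean target encoding: each row is encoded with the mean target of the previous rows in its group (the shift() keeps a row's own target out of its encoding), with the dataset mean as a fallback. A minimal pandas-only sketch of the same idea, with illustrative column names:

import pandas as pd

df = pd.DataFrame({"cat": ["a", "a", "b", "a", "b"],
                   "target": [1, 0, 1, 1, 0]})
prior = df["target"].mean()  # fallback for the first row of each group

# Mean of the previous targets within each group; shift() excludes the
# current row's own target from its encoding.
df["encoded"] = (df.groupby("cat")["target"]
                   .transform(lambda s: s.shift().expanding().mean())
                   .fillna(prior))
print(df)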
Example #3
    def parse_table(self, table):
        ''' Takes HTML for a single table and returns a Table. '''

        # Formatting issues sometimes prevent table extraction, so just return
        if table is None:
            return False

        # Count columns. Check either just one row, or all of them.
        def n_cols_in_row(row):
            return sum(int(td['colspan']) if td.has_attr('colspan') else 1
                       for td in row.find_all('td'))

        if config.CAREFUL_PARSING:
            n_cols = max([
                n_cols_in_row(row)
                for row in table.find('tbody').find_all('tr')
            ])
        else:
            n_cols = n_cols_in_row(table.find('tbody').find('tr'))

        # Initialize grid and populate
        data = datatable.DataTable(0, n_cols)
        rows = table.find_all('tr')
        for r in rows:
            try:
                cols = r.find_all(['td', 'th'])
                cols_found_in_row = 0
                n_cells = len(cols)
                # Assign number of rows and columns this cell fills. We use these rules:
                # * If a rowspan/colspan is explicitly provided, use it
                # * If not, initially assume span == 1 for both rows and columns.
                # * Check to make sure that we don't have unaccounted-for columns in the
                #   row after including the current cell. If we do, adjust the colspan
                #   to take up all of the remaining columns. This is necessary because
                #   some tables have malformed HTML, and BeautifulSoup can also
                #   cause problems in its efforts to fix bad tables. The most common
                #   problem is deletion or omission of enough <td> tags to fill all
                #   columns, hence our adjustment.
                for (i, c) in enumerate(cols):
                    r_num = int(c['rowspan']) if c.has_attr('rowspan') else 1
                    c_num = int(c['colspan']) if c.has_attr('colspan') else 1
                    cols_found_in_row += c_num
                    if i + 1 == n_cells and cols_found_in_row < n_cols:
                        c_num += n_cols - cols_found_in_row
                    data.add_val(c.get_text(), r_num, c_num)
            except Exception as e:
                if not config.SILENT_ERRORS: logger.error(str(e))
                if not config.IGNORE_BAD_ROWS: raise
        return tableparser.parse_table(data)
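
The colspan arithmetic above is easiest to see on a small malformed table. A quick self-contained check of the column-counting helper; the HTML snippet is illustrative, and bs4's Tag.has_attr replaces the long-removed has_key:

from bs4 import BeautifulSoup

html = """<table><tbody>
<tr><td colspan="2">a</td><td>b</td></tr>
<tr><td>c</td></tr>
</tbody></table>"""
soup = BeautifulSoup(html, "html.parser")

rows = soup.find("tbody").find_all("tr")
widths = [sum(int(td["colspan"]) if td.has_attr("colspan") else 1
              for td in row.find_all("td"))
          for row in rows]
print(widths)       # [3, 1]: the second row is short, so CAREFUL_PARSING
print(max(widths))  # takes the max (3) rather than trusting the first row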
Example #4
def test_groups_internal2():
    d0 = dt.DataTable([[1, 5, 3, 2, 1, 3, 1, 1, None],
                       ["a", "b", "c", "a", None, "f", "b", "h", "d"]],
                      names=["A", "B"])
    d1 = d0(groupby="A")
    d1.internal.check()
    gb = d1.internal.groupby
    assert gb.ngroups == 5
    assert gb.group_sizes == [1, 4, 1, 2, 1]
    assert d1.to_list() == [[None, 1, 1, 1, 1, 2, 3, 3, 5],
                            ["d", "a", None, "b", "h", "a", "c", "f", "b"]]
    d2 = d0(groupby="B")
    d2.internal.check()
    gb = d2.internal.groupby
    assert gb.ngroups == 7
    assert gb.group_sizes == [1, 2, 2, 1, 1, 1, 1]
    assert d2.to_list() == [[1, 1, 2, 5, 1, 3, None, 3, 1],
                            [None, "a", "a", "b", "b", "c", "d", "f", "h"]]
Example #5
def test_groups_internal2():
    d0 = dt.DataTable([[1, 5, 3, 2, 1, 3, 1, 1, None],
                       ["a", "b", "c", "a", None, "f", "b", "h", "d"]],
                      names=["A", "B"])
    d1 = d0(groupby="A")
    assert d1.internal.check()
    ri = d1.internal.rowindex
    assert ri.ngroups == 5
    assert ri.group_sizes == [1, 4, 1, 2, 1]
    assert d1.topython() == [[None, 1, 1, 1, 1, 2, 3, 3, 5],
                             ["d", "a", None, "b", "h", "a", "c", "f", "b"]]
    d2 = d0(groupby="B")
    assert d2.internal.check()
    ri = d2.internal.rowindex
    assert ri.ngroups == 7
    assert ri.group_sizes == [1, 2, 2, 1, 1, 1, 1]
    assert d2.topython() == [[1, 1, 2, 5, 1, 3, None, 3, 1],
                             [None, "a", "a", "b", "b", "c", "d", "f", "h"]]
Example #6
def test_groups_internal2():
    d0 = dt.DataTable([[1, 5, 3, 2, 1, 3, 1, 1, None],
                       ["a", "b", "c", "a", None, "f", "b", "h", "d"]],
                      names=["A", "B"])
    d1 = d0[:, :, by("A")]
    # gb = d1.internal.groupby
    # assert gb.ngroups == 5
    # assert gb.group_sizes == [1, 4, 1, 2, 1]
    assert_equals(
        d1,
        dt.Frame(A=[None, 1, 1, 1, 1, 2, 3, 3, 5],
                 B=["d", "a", None, "b", "h", "a", "c", "f", "b"]))
    d2 = d0[:, :, by("B")]
    # gb = d2.internal.groupby
    # assert gb.ngroups == 7
    # assert gb.group_sizes == [1, 2, 2, 1, 1, 1, 1]
    assert_equals(
        d2,
        dt.Frame(B=[None, "a", "a", "b", "b", "c", "d", "f", "h"],
                 A=[1, 1, 2, 5, 1, 3, None, 3, 1]))
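
Examples #4 through #6 are the same test written against successive datatable APIs: d0(groupby=...) with internal.groupby, then internal.rowindex with topython(), and finally the current d0[:, :, by(...)] form. A minimal sketch of the current grouping syntax:

import datatable as dt
from datatable import by

DT = dt.Frame(A=[1, 5, 3, 2, 1], B=["a", "b", "c", "a", None])
print(DT[:, dt.count(), by("A")])  # one row per group, with group sizes
print(DT[:, :, by("A")])           # all rows, reordered by the grouping column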
Example #7
def test_create_datatable():
    """DataTable is old symbol for Frame."""
    d = dt.DataTable([1, 2, 3])
    d.internal.check()
    assert d.__class__.__name__ == "Frame"
    assert d.topython() == [[1, 2, 3]]
Example #8

    def transform(self, X: dt.Frame):
        transformed_X = X[:, :, dt.join(self._group_means)][:, -1]
        return dt.DataTable(transformed_X.to_pandas().fillna(
            self.dataset_mean))
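
transform() relies on a keyed join: fit_transform in Example #2 stored the per-group means in a frame keyed on the grouping columns, so dt.join() acts as a left join that leaves NAs for unseen groups, which are then filled with the dataset mean. A minimal sketch with illustrative frame contents:

import datatable as dt

means = dt.Frame(cat=["a", "b"], mean=[0.25, 0.75])
means.key = "cat"  # the right frame of dt.join() must be keyed

X = dt.Frame(cat=["a", "b", "a", "c"])
joined = X[:, :, dt.join(means)]
print(joined)      # the unseen group "c" gets NA in the "mean" column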
Example #9
def run_benchmark(algorithm='gpu_hist', rows=1000000, columns=50, iterations=5, test_size=0.25):
    
    print("Generating dataset: {} rows * {} columns".format(rows, columns))
    print("{}/{} test/train split".format(test_size, 1.0 - test_size))
    tmp = time.time()
    X, y = make_classification(rows, n_features=columns, random_state=7)
    aa = np.random.rand(X.shape[0], X.shape[1])
    fraction_missing = 0.1
    X[aa < fraction_missing] = np.nan
    print("Number of NaNs: %d" % (np.isnan(X).sum()))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=7)
    print ("Generate Time: %s seconds" % (str(time.time() - tmp)))

    param = {'objective': 'binary:logistic',
             'max_depth': 6,
             'silent': 0,
             'n_gpus': 1,
             'gpu_id': 0,
             'eval_metric': 'error',
             'debug_verbose': 0,
             }

    param['tree_method'] = algorithm

    do_dt = True
    do_dt_likeDAI = True
    do_ccont = False
    do_nondt = True

    do_check_accuracy = True

    tmp = time.time()
    if do_ccont:
        X_train_cc = X_train
        X_test_cc = X_test
        y_train_cc = y_train
        y_test_cc = y_test
    else:
        # convert to dt as test
        X_train_cc = np.asfortranarray(X_train)
        X_test_cc = np.asfortranarray(X_test)
        y_train_cc = np.asfortranarray(y_train)
        y_test_cc = np.asfortranarray(y_test)

        if not (X_train_cc.flags['F_CONTIGUOUS'] and X_test_cc.flags['F_CONTIGUOUS']
                and y_train_cc.flags['F_CONTIGUOUS'] and y_test_cc.flags['F_CONTIGUOUS']):
            raise ValueError("Need data to be Fortran (i.e. column-major) contiguous")
    print("dt prepare1 Time: %s seconds" % (str(time.time() - tmp)))

    res = {}
    if do_nondt:
        print("np->DMatrix Start")
        # omp way
        tmp = time.time()
        # below takes about 2.826s if do_ccont=False
        # below takes about 0.248s if do_ccont=True
        dtrain = xgb.DMatrix(X_train_cc, y_train_cc, nthread=-1)
        print ("np->DMatrix1 Time: %s seconds" % (str(time.time() - tmp)))
        tmp = time.time()
        dtest = xgb.DMatrix(X_test_cc, y_test_cc, nthread=-1)
        print ("np->DMatrix2 Time: %s seconds" % (str(time.time() - tmp)))

        print("Training with '%s'" % param['tree_method'])
        tmp = time.time()
        res_tmp = {}
        xgb.train(param, dtrain, iterations, evals=[(dtrain, "train"), (dtest, "test")], evals_result=res_tmp)
        res['1'] = res_tmp['train']['error']
        print("Train Time: %s seconds" % (str(time.time() - tmp)))
    if HAVE_DT and do_dt:

        # convert to column-major contiguous in memory to mimic persistent column-major state
        # do_ccont = True leads to prepare2 time of about 1.4s for 1000000 rows * 50 columns
        # do_ccont = False leads to prepare2 time of about 0.000548s for 1000000 rows * 50 columns
        tmp = time.time()
        dtdata_X_train = dt.DataTable(X_train_cc)
        dtdata_X_test = dt.DataTable(X_test_cc)
        dtdata_y_train = dt.DataTable(y_train_cc)
        dtdata_y_test = dt.DataTable(y_test_cc)
        print ("dt prepare2 Time: %s seconds" % (str(time.time() - tmp)))

        #test = dtdata_X_train.tonumpy()
        #print(test)

        print ("dt->DMatrix Start")
        # omp way
        tmp = time.time()
        # below takes about 0.47s - 0.53s independent of do_ccont
        dtrain = xgb.DMatrix(dtdata_X_train, dtdata_y_train, nthread=-1)
        print ("dt->DMatrix1 Time: %s seconds" % (str(time.time() - tmp)))
        tmp = time.time()
        dtest = xgb.DMatrix(dtdata_X_test, dtdata_y_test, nthread=-1)
        print ("dt->DMatrix2 Time: %s seconds" % (str(time.time() - tmp)))

        print("Training with '%s'" % param['tree_method'])
        tmp = time.time()
        res_tmp = {}
        xgb.train(param, dtrain, iterations, evals=[(dtrain, "train"), (dtest, "test")], evals_result=res_tmp)
        res['2'] = res_tmp['train']['error']
        print ("Train Time: %s seconds" % (str(time.time() - tmp)))
    if HAVE_DT and do_dt_likeDAI:

        # convert to column-major contiguous in memory to mimic persistent column-major state
        # do_ccont = True leads to prepare2 time of about 1.4s for 1000000 rows * 50 columns
        # do_ccont = False leads to prepare2 time of about 0.000548s for 1000000 rows * 50 columns
        tmp = time.time()
        dtdata_X_train = dt.DataTable(X_train_cc)
        dtdata_X_test = dt.DataTable(X_test_cc)
        dtdata_y_train = dt.DataTable(y_train_cc)
        dtdata_y_test = dt.DataTable(y_test_cc)
        print ("dt prepare2 Time: %s seconds" % (str(time.time() - tmp)))

        #test = dtdata_X_train.tonumpy()
        #print(test)

        print ("dt->DMatrix Start")
        # omp way
        tmp = time.time()
        dtrain = xgb.DMatrix(dtdata_X_train.tonumpy(), dtdata_y_train.tonumpy(), nthread=-1)
        print ("dt->DMatrix1 Time: %s seconds" % (str(time.time() - tmp)))
        tmp = time.time()
        dtest = xgb.DMatrix(dtdata_X_test.tonumpy(), dtdata_y_test.tonumpy(), nthread=-1)
        print ("dt->DMatrix2 Time: %s seconds" % (str(time.time() - tmp)))

        print("Training with '%s'" % param['tree_method'])
        tmp = time.time()
        res_tmp = {}
        xgb.train(param, dtrain, iterations, evals=[(dtrain, "train"), (dtest, "test")], evals_result=res_tmp)
        res['3'] = res_tmp['train']['error']
        print ("Train Time: %s seconds" % (str(time.time() - tmp)))
    if HAVE_DT and do_check_accuracy:
        assert_accuracy(res['1'], res['2'])
        assert_accuracy(res['1'], res['3'])
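
An assumed invocation of the benchmark above; 'hist' is the CPU tree method, so it runs without a GPU build of XGBoost, and the smaller sizes keep the run short:

if __name__ == "__main__":
    run_benchmark(algorithm='hist', rows=100000, columns=20, iterations=3)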
Example #10
def test_create_datatable():
    """DataTable is old symbol for Frame."""
    d = dt.DataTable([1, 2, 3])
    frame_integrity_check(d)
    assert d.__class__.__name__ == "Frame"
    assert d.to_list() == [[1, 2, 3]]
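
Examples #7 and #10 are the same alias test before and after the test helpers changed (d.internal.check() and topython() versus frame_integrity_check() and to_list()). The equivalent check written against the current Frame name directly, as a minimal sketch:

import datatable as dt

d = dt.Frame([1, 2, 3])  # Frame is the current name for the old DataTable
assert d.to_list() == [[1, 2, 3]]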
Example #11
    def new_table(self, name, cols=None):
        # Avoid the mutable-default-argument pitfall: build a fresh list per call.
        cols = cols if cols is not None else []
        self.table_id += 1
        name_with_id = '%s_%06d' % (name, self.table_id)
        return datatable.DataTable(self, self.con, name_with_id, cols)
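
The '%s_%06d' counter gives each table a unique, zero-padded name. A self-contained sketch of just the naming scheme; the Registry class is illustrative, not from the source:

class Registry:
    def __init__(self):
        self.table_id = 0

    def next_name(self, name):
        self.table_id += 1
        return '%s_%06d' % (name, self.table_id)

r = Registry()
print(r.next_name("events"))  # events_000001
print(r.next_name("events"))  # events_000002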