예제 #1
0
def pyunit_deep_copy():
    """Check that frames produced by ``h2o.deep_copy`` are independent.

    The prostate dataset is imported twice and each import is deep-copied.
    Missing values are then inserted into the first original and into the
    second copy. For both pairs the NA counts of the original and of its
    copy are required to differ afterwards.
    """
    first_original = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    second_original = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    first_copy = h2o.deep_copy(first_original, "copy")
    second_copy = h2o.deep_copy(second_original, "copy2")

    # Modify exactly one member of each (original, copy) pair: with a deep
    # copy, a change on one side must not affect the other side.
    for modified in (first_original, second_copy):
        modified.insert_missing_values()

    report = (
        ("Original Frame with inserted missing values:", first_original),
        ("Duplicate Frame with no inserted missing values", first_copy),
        ("Original Frame with no inserted missing values:", second_original),
        ("Duplicate Frame with inserted missing values", second_copy),
    )
    for caption, frame in report:
        print(caption)
        print(frame)
    print("Number of frames in session after deep_copy")
    print(h2o.ls())

    assert first_original.nacnt() != first_copy.nacnt(), \
        "Inserted NA's into the original frame but the original seems to match the duplicates NA count!"
    assert second_original.nacnt() != second_copy.nacnt(), \
        "Inserted NA's into the duplicate frame but the original seems to match the originals NA count!"
예제 #2
0
    def run_two_step_model(self, model=None, rass=False):
        """Run the first step once, then repeat the second step until stable.

        :param model: estimator stored on ``self.model``; when ``None`` an
            ``H2ORandomForestEstimator`` (100 trees,
            ``col_sample_rate_per_tree=0.9``, id ``'two_step_model'``) is
            created.
        :param rass: when false, ``self.first_step()`` is called; when true
            the ``'ys'`` column of ``self.df`` is initialised from its
            ``'target'`` column instead.
        :return: None. ``self.model`` and ``self.df`` are updated and
            ``self.change_target()`` is called at the end.
        """
        if model is None:
            model = h2o.estimators.random_forest.H2ORandomForestEstimator(
                col_sample_rate_per_tree=0.9,
                ntrees=100,
                model_id='two_step_model')
        self.model = model

        if not rass:
            self.first_step()
        else:
            self.df['ys'] = self.df['target']

        for step in range(10):
            log(f'Step {step}. {time.ctime()}')
            df2 = self.second_step()
            # `==` between H2O columns is elementwise and yields a frame, not
            # a bool, so it has to be reduced with .all() before it can be
            # used to decide whether the labels stopped changing.
            # NOTE(review): both 'ys' columns are categorical here; confirm
            # that their level sets match so the comparison is meaningful.
            if (df2['ys'] == self.df['ys']).all():
                log('Finished')
                break
            self.df = h2o.deep_copy(df2, 'df')

        # Keep the result of the last executed second step.
        self.df = df2
        self.change_target()
예제 #3
0
    def analyze_improvements(self, df_eval, target_class, base_line_scoring):
        """Score single-variable modifications of ``df_eval``.

        For every entry of ``self.actionable_q`` that has an ``"actionable"``
        key, a deep copy of ``df_eval`` is made, the entry's variable is
        replaced by the value proposed by ``self._get_next_val`` and the
        modified frame is scored with ``self._get_evaluation``.

        :param df_eval: H2O frame that is copied and modified per variable.
        :param target_class: class whose probability column
            ``"p" + str(target_class)`` is read from the prediction.
        :param base_line_scoring: value subtracted from the new probability
            to obtain ``delta``.
        :return: pandas DataFrame with the rows whose ``delta`` is > 0,
            sorted by ``delta`` in descending order. When nothing was
            scored, an empty DataFrame with the usual columns is returned.
        """
        prob_col = "p" + str(target_class)
        improvement_results = []

        for q in self.actionable_q:
            try:
                # Entries without "actionable" are never re-scored, so skip
                # them before paying for a server-side copy of the frame.
                if "actionable" not in q:
                    continue

                # NOTE(review): the copied frame is never removed from the
                # H2O cluster; confirm whether it can be dropped after
                # scoring.
                df_mod = h2o.deep_copy(df_eval, "df_" + str(uuid.uuid1()))

                # modify single variable by increment / decrement
                flag_re_run, new_val, new_val_itm, init_val_itm, action_meta = self._get_next_val(
                    q, df_mod[q["varname"]])
                if not flag_re_run:
                    continue
                df_mod[q["varname"]] = new_val

                # predict with slightly modified feature vector
                df_prediction = self._get_evaluation(df_mod)
                probability = float(df_prediction[prob_col])

                improvement_results.append({
                    "varname": q["varname"],
                    prob_col: probability,
                    "target_class": str(target_class),
                    "meta": action_meta,
                    "new_val_itm": new_val_itm,
                    "init_val_itm": init_val_itm,
                    "delta": probability - base_line_scoring,
                })

            except Exception as e:
                # Best effort: a failing variable must not abort the others.
                print(e)

        if not improvement_results:
            # Without rows there is no "delta" column and sorting by it
            # would raise KeyError; return an empty, correctly shaped frame.
            return pd.DataFrame(columns=[
                "varname", prob_col, "target_class", "meta", "new_val_itm",
                "init_val_itm", "delta"
            ])

        df_improvements = pd.DataFrame(improvement_results)
        df_improvements = df_improvements.sort_values(by="delta",
                                                      ascending=False)
        return df_improvements[df_improvements["delta"] > 0]
예제 #4
0
def h2odeep_copy():
    """
    Python API test: h2o.deep_copy(data, xid)

    Imports the benign dataset, deep-copies it, checks that both objects are
    H2OFrames with equal NA counts, then inserts missing values into the copy
    only and checks that the NA counts no longer agree.
    """
    source_frame = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    copied_frame = h2o.deep_copy(source_frame, "new_frame")

    for frame in (source_frame, copied_frame):
        assert_is_type(frame, H2OFrame)

    assert source_frame.nacnt() == copied_frame.nacnt(), "h2o.deep_copy() command is not working."

    # randomly add missing values with high probability, to the copy only
    copied_frame.insert_missing_values(fraction=0.9)
    assert source_frame.nacnt() != copied_frame.nacnt(), "h2o.deep_copy() command is not working."
예제 #5
0
def h2odeep_copy():
    """
    Python API test: h2o.deep_copy(data, xid)

    Imports the benign dataset, deep-copies it, checks that both objects are
    H2OFrames with equal NA counts, then inserts missing values into the copy
    only and checks that the NA counts differ.

    Any failure is reported as an AssertionError whose message starts with
    "h2o.deep_copy() command is not working." and names the underlying error.
    """
    try:
        new_name = "new_frame"
        training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
        training_copy = h2o.deep_copy(training_data, new_name)
        assert_is_type(training_data, H2OFrame)
        assert_is_type(training_copy, H2OFrame)
        assert training_data.nacnt() == training_copy.nacnt(), "h2o.deep_copy() command is not working."
        training_copy.insert_missing_values(fraction=0.9)   # randomly added missing values with high probability
        assert training_data.nacnt() != training_copy.nacnt(), "h2o.deep_copy() command is not working."
    except Exception as e:
        # Raise explicitly instead of `assert False`: asserts are removed
        # under `python -O`, which would silently swallow every failure.
        # The cause is appended so the real error is not lost.
        raise AssertionError(
            "h2o.deep_copy() command is not working. Cause: {0!r}".format(e))
예제 #6
0
    def second_step(self):
        """Retrain on the current ``'ys'`` labels and relabel unlabeled rows.

        Works on a deep copy of ``self.df`` stored under the key ``'df2'``.
        ``self.model`` is trained on ``self.features`` with ``'ys'`` as a
        factor response and its ``'p1'`` prediction is stored as a column.
        Rows with ``ys < 0`` become 1 when their ``'p1'`` exceeds the largest
        ``'p1'`` among rows with ``ys > 0``, and 0 when it is below the
        smallest one.

        :return: the relabeled copy, with ``'ys'`` converted back to a factor.
        """
        frame = h2o.deep_copy(self.df, 'df2')
        frame['ys'] = frame['ys'].asfactor()
        self.model.train(x=list(self.features), y='ys', training_frame=frame)

        frame['p1'] = self.model.predict(frame)['p1']
        frame['ys'] = frame['ys'].asnumeric()

        # Range of predicted probabilities among the rows labeled positive.
        positive_p1 = frame[frame['ys'] > 0, 'p1']
        upper_bound = positive_p1.max()
        lower_bound = positive_p1.min()

        promote = (frame['ys'] < 0) & (frame['p1'] > upper_bound)
        log('New positives: {0}.'.format(frame[promote].shape))
        frame[promote, 'ys'] = 1

        # Built after the promotion above so that it sees the updated labels.
        demote = (frame['ys'] < 0) & (frame['p1'] < lower_bound)
        log('New negatives: {0}.'.format(frame[demote].shape))
        frame[demote, 'ys'] = 0

        frame['ys'] = frame['ys'].asfactor()
        return frame
def probabilistic_labels2weights(df,prob_label_col='prob_bogus',label_var='is_bogus'):
    """
    turns pandas/h2o DataFrame @df with probabilistic labels in @prob_label_col (0-1) to H2OFrame
    with double the rows, where each observation is duplicated into two:
    (label_var=1, weight=prob) and (label_var=0, weight=1-prob)
    """
    if isinstance(df,h2o.H2OFrame):
        df2 = h2o.deep_copy(df,'some_internal_id') # h2o
    elif isinstance(df,pd.DataFrame):
        df2 = df.copy()
    else:
        raise ValueError('not a data frame')
    df[label_var] = 1
    df['weight'] = df[prob_label_col]
    df2[label_var] = 0
    df2['weight'] = 1 - df2[prob_label_col]
    if isinstance(df,h2o.H2OFrame):
        df_weighted = df.concat(df2,axis=0) # h2o
    elif isinstance(df,pd.DataFrame):
        df_weighted = pd.concat([df,df2],axis=0)
    return df_weighted
예제 #8
0
    def first_step(self):
        """Train on ``'target'`` and create the initial ``'ys'`` labels.

        Works on a deep copy of ``self.df`` stored under the key ``'df_'``.
        ``self.model`` is trained on ``self.features`` with ``'target'`` as
        response. ``'ys'`` is set to ``target * 2 - 1`` and the ``'p1'``
        prediction column is appended. Rows with ``ys < 0`` become 1 when
        their ``'p1'`` exceeds the largest ``'p1'`` among rows with
        ``ys > 0``, and 0 when it is below the smallest one.

        :return: None; the labeled frame, with ``'ys'`` as a factor, is
            stored in ``self.df``.
        """
        frame = h2o.deep_copy(self.df, 'df_')
        self.model.train(x=list(self.features), y='target', training_frame=frame)

        prediction = self.model.predict(frame)
        frame['target'] = frame['target'].asnumeric()
        frame['ys'] = frame['target'] * 2 - 1
        frame = frame.cbind(prediction['p1'])

        # Range of predicted probabilities among the rows labeled positive.
        positive_p1 = frame[frame['ys'] > 0, 'p1']
        upper_bound = positive_p1.max()
        lower_bound = positive_p1.min()

        promote = (frame['ys'] < 0) & (frame['p1'] > upper_bound)
        log('New positives: {0}.'.format(frame[promote].shape))
        frame[promote, 'ys'] = 1

        # Built after the promotion above so that it sees the updated labels.
        demote = (frame['ys'] < 0) & (frame['p1'] < lower_bound)
        log('New negatives: {0}.'.format(frame[demote].shape))
        frame[demote, 'ys'] = 0

        frame['ys'] = frame['ys'].asfactor()
        self.df = frame
예제 #9
0
    def prepare_data(self,
                     target: str = None,
                     to_target: str = None,
                     n_sample: int = 10,
                     hidden_size: int = 5,
                     process: bool = True,
                     rass=False):
        """
        Prepare data for model.

        Builds ``self.df`` from ``self.data`` as: the first ``n_sample`` rows
        of the unlabeled class, then (``rass`` only) all rows with target
        ``'0'``, then all rows with target ``'1'``. The unlabeled class is
        ``'0'`` normally and ``'-1'`` when ``rass`` is true.

        :param target: name of an existing target column. Only its presence
            is checked; the code always reads the column named ``'target'``.
        :param to_target: used when ``target`` is None: ``'target'`` is set
            to 1 where this column differs from ``'\\N'`` and to 0 elsewhere.
        :param n_sample: number of leading rows taken from the unlabeled
            class.
        :param hidden_size: number of randomly chosen positive rows that are
            relabeled as unlabeled; 0 disables hiding. The original labels
            are kept in ``self.orig_target`` in the row order of ``self.df``.
        :param process: if true, ``process_df`` is applied to the frame.
        :param rass: if true, ``'target'`` must already exist; its levels
            are set to ``['0', '1', '-1']``.
        :return: None; sets ``self.hidden_size``, ``self.features``,
            ``self.df``, ``self.orig_df`` and possibly ``self.orig_target``.
        :raises ValueError: if ``target`` is not a column of ``self.data`` or
            if neither ``target`` nor ``to_target`` is given (non-``rass``).
        """
        self.hidden_size = hidden_size

        if not rass:
            if target is not None:
                # NOTE(review): `target` is only validated; a column literally
                # named 'target' is still required below - confirm intent.
                if target not in self.data.columns:
                    raise ValueError(f'{target} column not found!')
            elif to_target is not None:
                self.data['target'] = 0
                self.data[self.data[to_target] != '\\N', 'target'] = 1
            else:
                raise ValueError('Target column not defined!')

            self.data['target'] = self.data['target'].asfactor()
            unlabeled_level = '0'
        else:
            self.data['target'] = self.data['target'].set_levels(
                ['0', '1', '-1'])
            unlabeled_level = '-1'

        # Leading rows of the unlabeled class, followed in rass mode by the
        # rows with target '0'.
        base = self.data[self.data['target'] == unlabeled_level][:n_sample, :]
        if rass:
            base = base.rbind(self.data[self.data['target'] == '0'])

        if hidden_size == 0:
            print('Model will be trained without validation.')
            df = base.rbind(self.data[self.data['target'] == '1'])
        else:
            # Original labels, kept in the same row order as `df` below.
            self.orig_target = base.rbind(
                self.data[self.data['target'] == '1'])['target']
            print('Doing random sampling')

            # Separate frame expression for the positives, because it is
            # modified in place when the sampled rows are relabeled.
            data_1 = self.data[self.data['target'] == '1']
            rand_ind = np.random.choice(range(data_1.shape[0]),
                                        hidden_size,
                                        replace=False)
            data_1[list(rand_ind), 'target'] = unlabeled_level

            # The positives go last in both modes. The rass branch used to
            # place them before the '0' rows, which left `self.orig_target`
            # misaligned with the rows of `df`.
            df = base.rbind(data_1)

        # NOTE(review): assumes feature columns start at index 3 and that the
        # last column is not a feature - confirm against the data layout.
        self.features = df.columns[3:-1]

        if process:
            df = process_df(df)

        self.df = df
        self.orig_df = h2o.deep_copy(self.df, 'orig_df')
예제 #10
0
h2o.connection.H2OConnection.post("GarbageCollect")
h2o.connection.H2OConnection.post("GarbageCollect")
h2o.connection.H2OConnection.post("GarbageCollect")

#loop through post_cols
start = False
for c_i, post_col in enumerate(post_cols):
    if post_col == settings.start_icd or settings.start_icd == "":
        start = True
    if start:
        print utils.time() + 'Work on ' + post_col
        # calculate node statistics
        node = post_col.replace(settings.post_prefix, "")
        idx_col = settings.index_prefix + node
        if post_col == "POST_DEATH":
            possible_incidents = h2o.deep_copy(matrix, "poss_inc")
        else:
            pre_col = settings.pre_prefix + node
            pre_col_vector = matrix[pre_col]
            print "...Anteil possible incidents: " + str(
                pre_col_vector.mean()
            )  #needed for tricking h2o to not lazy calc this vector -> MEM issues
            possible_incidents = matrix[pre_col_vector == 0]
            h2o.remove(pre_col_vector)
            pre_col_vector = None
        print "...Possible incidents: " + str(
            possible_incidents.nrow
        )  #needed for tricking h2o to not lazy calc this vector -> MEM issues
        post_col_vector = possible_incidents[post_col]
        print "...Anteil real incidents: " + str(post_col_vector.mean())
        real_incidents = possible_incidents[post_col_vector > 0]