Exemplo n.º 1
0
def NB(X, y, X_ind, y_ind):
    """Run 5-fold stratified cross-validation and an independent test with Gaussian Naive Bayes.

    A fresh ``GaussianNB`` is fitted on the training part of every fold. Its
    positive-class probability (column 1 of ``predict_proba``) is stored for
    the held-out samples of that fold, and its prediction on the independent
    set is accumulated so the five fold models can be averaged at the end.

    Arguments:
        X (ndarray): Feature data used for cross-validation, indexed by sample
                     along the first axis.
        y (ndarray): Label data matching ``X``.
        X_ind (ndarray): Feature data of the independent test set.
        y_ind (ndarray): Label data of the independent test set; only its shape
                         is used here, to size the result array.

    Returns:
        cvs (ndarray): out-of-fold positive-class probabilities, same shape as ``y``.
        inds (ndarray): positive-class probabilities on the independent set,
                        averaged over the five fold models; same shape as ``y_ind``.
    """
    n_splits = 5
    splitter = StratifiedKFold(n_splits)
    cvs = np.zeros(y.shape)
    inds = np.zeros(y_ind.shape)
    for train_idx, valid_idx in splitter.split(X, y):
        clf = GaussianNB()
        clf.fit(X[train_idx], y[train_idx])
        # Column 1 holds the probability of the second (positive) class.
        cvs[valid_idx] = clf.predict_proba(X[valid_idx])[:, 1]
        inds += clf.predict_proba(X_ind)[:, 1]
    return cvs, inds / n_splits
Exemplo n.º 2
0
def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader`` and return summary metrics.

    The model is switched to eval mode and run without gradient tracking.
    Per-batch probabilities and labels are converted with ``to_np`` and
    stacked before the metrics are computed once over the whole dataset.

    Args:
        model: object exposing ``eval()`` and ``predict_proba(inputs)``.
        dataloader: iterable yielding ``(batch_inputs, batch_labels)`` pairs.
        device: target passed to ``.to(...)`` for both inputs and labels.

    Returns:
        dict with keys ``"loss"`` (log loss), ``"accuracy"`` and ``"AUC"``;
        loss and AUC use column 1 of the probabilities, accuracy uses the
        arg-max over columns.
    """
    model.eval()

    all_probas, all_labels = [], []
    with torch.no_grad():
        for batch_inputs, batch_labels in tqdm(dataloader):
            # Put both tensors on the requested device before predicting.
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            batch_probas = model.predict_proba(batch_inputs)

            all_probas.append(to_np(batch_probas))
            all_labels.append(to_np(batch_labels))

    probas = np.vstack(all_probas)
    labels = np.concatenate(all_labels)
    positive = probas[:, 1]

    # Metrics are computed once, after the full pass over the data.
    loss = log_loss(labels, positive)
    accuracy = accuracy_score(labels, probas.argmax(1))
    auc = roc_auc_score(labels, probas[:, 1])
    return {"loss": loss, "accuracy": accuracy, "AUC": auc}
Exemplo n.º 3
0
def SVM(X, y, X_ind, y_ind, is_reg=False):
    """Cross-validation and independent set test for Support Vector Machine (SVM).

    Hyper-parameters ``C`` and ``gamma`` are first chosen with ``GridSearchCV``
    on the full ``(X, y)``. A new model with the selected parameters is then
    fitted on the training part of each of 5 folds; its predictions are stored
    for the held-out samples and accumulated on the independent set.

    Arguments:
        X (ndarray): Feature data used for cross-validation, indexed by sample
                     along the first axis.
        y (ndarray): Label data (or regression targets) matching ``X``.
        X_ind (ndarray): Feature data of the independent test set.
        y_ind (ndarray): Label data of the independent test set; only its shape
                         is used here, to size the result array.
        is_reg (bool, optional): build a regression model (``SVR`` with
                         ``KFold``) when True, otherwise a classifier (``SVC``
                         with ``StratifiedKFold``). (Default: False)

    Returns:
        cvs (ndarray): out-of-fold predictions, same shape as ``y``. For
                       classification these are column 1 of ``predict_proba``.
        inds (ndarray): predictions on the independent set averaged over the
                        five fold models; same shape as ``y_ind``.
    """
    if is_reg:
        folds = KFold(5).split(X)
        alg, alg_kwargs = SVR, {}
    else:
        folds = StratifiedKFold(5).split(X, y)
        alg, alg_kwargs = SVC, {'probability': True}
    cvs = np.zeros(y.shape)
    inds = np.zeros(y_ind.shape)
    # NOTE(review): each grid holds only two values (2**-5 / 2**15 and
    # 2**-15 / 2**5); confirm a denser range (e.g. np.arange) was not intended.
    gs = GridSearchCV(alg(**alg_kwargs), {
        'C': 2.0**np.array([-5, 15]),
        'gamma': 2.0**np.array([-15, 5])
    },
                      n_jobs=5)
    gs.fit(X, y)
    params = gs.best_params_
    print(params)
    for trained, valided in folds:
        # Build the same kind of estimator that was tuned above; previously an
        # SVC was always created here, even for regression.
        model = alg(C=params['C'], gamma=params['gamma'], **alg_kwargs)
        model.fit(X[trained], y[trained])
        if is_reg:
            cvs[valided] = model.predict(X[valided])
            inds += model.predict(X_ind)
        else:
            cvs[valided] = model.predict_proba(X[valided])[:, 1]
            inds += model.predict_proba(X_ind)[:, 1]
    return cvs, inds / 5
Exemplo n.º 4
0
def task():
    """Run the incident-prediction pipeline for a fixed 2019 Q4 configuration.

    Steps, as implemented below:
      1. Load the monthly feature pickles and stack them into one frame
         (missing values filled with 0).
      2. Load the cached ``predict_proba`` features if the cache file exists,
         otherwise compute them with ``model.predict_proba`` /
         ``model.extract_features`` and write the cache.
      3. Load the cached labels if present, otherwise build them with
         ``label_mapper`` and write the cache.
      4. Cross-validate each listed classifier name via
         ``model.cross_validate`` and print the configuration followed by the
         average of every returned metric.

    Returns:
        None. Results are printed and cache files are written under ``output/``.
    """
    year, months = 2019, [10, 11, 12]
    select = None
    radius, aperture_size, incident_interval, time_step = '100', '6', '25', '1'
    config = {
        'year': year,
        'months': months,
        'radius': radius,
        'aperture_size': aperture_size,
        'incident_interval': incident_interval,
        'time_step': time_step,
        'select': select
    }

    # DataFrame.append was removed in pandas 2.0; concatenate all months once.
    monthly_frames = [
        pd.read_pickle(
            f'output/waze/{year}_{month}_{radius}_{aperture_size}_features.pkl')
        for month in months
    ]
    df = pd.concat(monthly_frames, sort=False)
    df.fillna(0, inplace=True)
    # `select` is recorded in the config, but row sub-selection is not applied.

    # Cache files are keyed by the last month in the list, which is the value
    # the month loop variable held at this point in the earlier implementation.
    month = months[-1]
    proba_path = f'output/{year}_{month}_{radius}_{aperture_size}_predict_proba.pkl'
    labels_path = f'output/{year}_{month}_{radius}_{aperture_size}_y.pkl'

    if os.path.exists(proba_path):
        x = pd.read_pickle(proba_path)
    else:
        x = model.predict_proba(df, int(incident_interval), time_step)
        x = model.extract_features(x, df)
        x.to_pickle(proba_path)

    incident_df = load_incidents(aperture_size)
    if os.path.exists(labels_path):
        y = pd.read_pickle(labels_path)
    else:
        y = label_mapper(x, incident_df)
        pd.Series(y).to_pickle(labels_path)

    models = [
        'LogisticRegression', 'DecisionTreeClassifier',
        'RandomForestClassifier'
    ]
    for m in models:
        print('model', m)
        # The second return value (predictions) is not used here.
        cv_results, _y_pred = model.cross_validate(x, y, m)
        for k, v in config.items():
            print(k, ',', v)
        for k, v in cv_results.items():
            print(k, ',', np.average(v))
        print()
Exemplo n.º 5
0
def score_and_predict(model, X, Y):
    '''
    Score a binary classifier on features `X` against 0/1 labels `Y`.

    The positive-class probability is taken from column 1 of
    `model.predict_proba(X)`; a sample is classified as positive when that
    probability is strictly greater than 0.5.

    Returns (accuracy, numpy array of positive-class probabilities).
    '''
    positive_probs = model.predict_proba(X)[:, 1]
    predicted = np.squeeze(positive_probs > .5)
    expected = np.squeeze(Y)
    # Squeezing both sides lets (n,) and (n, 1) label layouts compare elementwise.
    hits = expected == predicted
    return hits.mean(), positive_probs
Exemplo n.º 6
0
def find_text_in_frame(current_img, baseimgs, modelfile='webapp/model.pickle',proba_threshold = 0.5, debug=False):
    """Collect blobs from frame differences that the model scores highly enough.

    For each base image, blobs are extracted from ``current_img - baseimg``
    with the Otsu pipeline and scored by ``model.predict_proba``. A blob is
    kept when its score is at least ``proba_threshold``, or unconditionally
    when ``debug`` is set.

    Unless ``debug`` is set, the search stops at the first base image after
    which at least one blob has been collected.

    Returns:
        list of dicts with keys ``'blob'``, ``'left_corner'`` (``[xmin, ymin]``)
        and ``'proba'``; empty when nothing was kept.
    """
    found = []
    for reference in baseimgs:
        difference = current_img - reference
        candidates = img_proc_utils.extract_blobs(
            difference, img_proc_pipeline=img_proc_utils.pipeline_otsu)
        for (xmin, ymin), blob in candidates:
            proba = model.predict_proba(blob, model=modelfile)
            keep = proba >= proba_threshold or debug
            if not keep:
                continue
            found.append({'blob': blob, 'left_corner': [xmin, ymin], 'proba': proba})
        # In normal mode the first base image that produced hits is enough.
        if found and not debug:
            return found
    return found
Exemplo n.º 7
0
def RF(X, y, X_ind, y_ind, is_reg=False):
    """Run 5-fold cross-validation and an independent test with a Random Forest.

    A new forest (500 trees, ``n_jobs=1``) is fitted on the training part of
    every fold. Its output is stored for the held-out samples of that fold and
    accumulated on the independent set so the five fold models can be averaged.

    Arguments:
        X (ndarray): Feature data used for cross-validation, indexed by sample
                     along the first axis.
        y (ndarray): Label data (or regression targets) matching ``X``.
        X_ind (ndarray): Feature data of the independent test set.
        y_ind (ndarray): Label data of the independent test set; only its shape
                         is used here, to size the result array.
        is_reg (bool, optional): use ``RandomForestRegressor`` with ``KFold``
                         when True, otherwise ``RandomForestClassifier`` with
                         ``StratifiedKFold``. (Default: False)

    Returns:
        cvs (ndarray): out-of-fold outputs, same shape as ``y``. For
                       classification these are column 1 of ``predict_proba``.
        inds (ndarray): outputs on the independent set averaged over the five
                        fold models; same shape as ``y_ind``.
    """
    n_splits = 5
    if is_reg:
        splits = KFold(n_splits).split(X)
        estimator_cls = RandomForestRegressor
    else:
        splits = StratifiedKFold(n_splits).split(X, y)
        estimator_cls = RandomForestClassifier

    def _output(fitted, data):
        # Regression: raw prediction. Classification: probability in column 1.
        if is_reg:
            return fitted.predict(data)
        return fitted.predict_proba(data)[:, 1]

    cvs = np.zeros(y.shape)
    inds = np.zeros(y_ind.shape)
    for train_idx, valid_idx in splits:
        forest = estimator_cls(n_estimators=500, n_jobs=1)
        forest.fit(X[train_idx], y[train_idx])
        cvs[valid_idx] = _output(forest, X[valid_idx])
        inds += _output(forest, X_ind)
    return cvs, inds / n_splits
Exemplo n.º 8
0
def temporal(model, batch, y_preds, num_classes, device):
    """Score each leading position by how much it changes the predicted-class probability.

    For every index ``i`` along the first axis of ``inputs``, the model is run
    on the prefix ``inputs[:i + 1]`` (lengths clipped to at most ``i + 1``) and
    the probability gathered at ``y_preds`` is recorded. The returned value at
    row ``i`` is that probability minus the one at row ``i - 1``; row 0 is
    compared against the uniform baseline ``1 / num_classes``.

    Args:
        model: object exposing ``predict_proba((inputs, lengths))``.
        batch: ``(inputs, lengths)`` pair.
        y_preds: index tensor used with ``gather(1, ...)`` on the model output.
        num_classes: number of classes, used only for the baseline of row 0.
        device: device the clipping scalar is moved to.

    Returns:
        Tensor with the same shape as ``inputs``.
    """
    inputs, lengths = batch
    target_probs = torch.zeros(inputs.shape)
    for step in range(inputs.shape[0]):
        prefix_len = step + 1
        prefix = inputs[:prefix_len, :]
        with torch.no_grad():
            clipped = torch.min(lengths, torch.tensor(prefix_len).to(device))
            probs = model.predict_proba((prefix, clipped))
            target_probs[step, :] = probs.gather(1, y_preds).squeeze()

    losses = torch.zeros(inputs.shape)
    losses[0, :] = target_probs[0, :] - 1.0 / num_classes
    # Every later row is the change relative to the previous prefix.
    losses[1:, :] = target_probs[1:, :] - target_probs[:-1, :]
    return losses
Exemplo n.º 9
0
def temporal_tail(model, batch, y_preds, num_classes, device):
    """Score each trailing position by how much it changes the predicted-class probability.

    For every index ``i`` along the first axis of ``inputs``, the model is run
    on the suffix ``inputs[i:]`` (lengths reduced by ``i`` and floored at 1)
    and the probability gathered at ``y_preds`` is recorded. The returned value
    at row ``i`` is that probability minus the one at row ``i + 1``; the last
    row is compared against the uniform baseline ``1 / num_classes``.

    Args:
        model: object exposing ``predict_proba((inputs, lengths))``.
        batch: ``(inputs, lengths)`` pair.
        y_preds: index tensor used with ``gather(1, ...)`` on the model output.
        num_classes: number of classes, used only for the baseline of the last row.
        device: device the flooring scalar is moved to.

    Returns:
        Tensor with the same shape as ``inputs``.
    """
    inputs, lengths = batch
    target_probs = torch.zeros(inputs.shape)
    for start in range(inputs.shape[0]):
        suffix = inputs[start:, :]
        with torch.no_grad():
            remaining = torch.max(lengths - start, torch.tensor(1).to(device))
            probs = model.predict_proba((suffix, remaining))
            target_probs[start, :] = probs.gather(1, y_preds).squeeze()

    losses = torch.zeros(inputs.shape)
    losses[-1, :] = target_probs[-1, :] - 1.0 / num_classes
    # Every earlier row is the change relative to the next, shorter suffix.
    losses[:-1, :] = target_probs[:-1, :] - target_probs[1:, :]
    return losses
Exemplo n.º 10
0
    def get(self):
        """Classify the ``query`` request argument and report the model's confidence.

        Returns:
            dict with ``'intent'`` (first element of the prediction) and
            ``'probability'`` (first ``predict_proba`` entry rounded to three
            decimals, as a string).
        """
        # Pull the raw query text out of the parsed request arguments.
        query_text = parser.parse_args()['query']

        # The vectorizer is fed a one-element array wrapping the query.
        features = model.vectorizer_transform(np.array([query_text]))
        intent = model.predict(features)
        proba = model.predict_proba(features)

        rounded = round(proba[0], 3)

        return {'intent': intent.item(0), 'probability': str(rounded)}
Exemplo n.º 11
0
    def get(self):
        """Run the model on the ``query`` request argument and report its confidence.

        Returns:
            dict with ``'prediction'`` (the prediction converted with ``str``)
            and ``'probability'`` (first ``predict_proba`` entry rounded to
            three decimals).
        """
        # Pull the raw query out of the parsed request arguments.
        raw_query = parser.parse_args()['query']

        # The imputer step is fed a one-element array wrapping the query.
        prepared = model.numericalImputer_transform(np.array([raw_query]))
        predicted = model.predict(prepared)
        proba = model.predict_proba(prepared)

        rounded = round(proba[0], 3)

        return {'prediction': str(predicted), 'probability': rounded}
Exemplo n.º 12
0
def word_drop(model, batch, y_preds, num_classes, device):
    """Measure how much zeroing each row of the inputs lowers the predicted-class probability.

    Each row of ``batch[0]`` along the first axis is zeroed in turn (in place),
    the model is run on the whole batch, and the probability gathered at
    ``y_preds`` is recorded. The row is restored before the next one is
    zeroed, so the inputs are back to their starting values on normal return.

    Args:
        model: object exposing ``predict_proba(batch)``.
        batch: sequence whose first element is the inputs tensor.
        y_preds: index tensor used with ``gather(1, ...)`` on the model output.
        num_classes: unused here.
        device: unused here.

    Returns:
        Tensor shaped like the inputs holding ``1 - probability`` per zeroed row.
    """
    inputs = batch[0]
    kept_probs = torch.zeros(inputs.shape)
    for row in range(inputs.shape[0]):
        saved_row = torch.clone(inputs[row, :])
        inputs[row, :] = 0
        with torch.no_grad():
            probs = model.predict_proba(batch)
            kept_probs[row, :] = probs.gather(1, y_preds).squeeze()
        # Put the original values back before moving on to the next row.
        inputs[row, :] = saved_row
    return 1. - kept_probs
Exemplo n.º 13
0
def stream_frames2(stream, pafy_video = None):
    """Generator that scans a video stream for blobs and yields event strings.

    Every yielded value is a string made of ``event:`` / ``data:`` lines
    terminated by a blank line, with a JSON payload:

    * ``onstart``: video metadata taken from ``pafy_video`` when given,
      otherwise a fixed ``video_length`` of 5000.
    * ``onprogress``: emitted whenever ``int(sec % 20) == 0``.
    * unnamed ``data:`` messages: one per blob whose score exceeds 0.5,
      carrying the blob image, frame image, position, size and score.
    * ``onend``: emitted from the ``StopIteration`` handler.

    This block uses Python 2 syntax (``print`` statements).

    Args:
        stream: passed to ``utils.get_frames_from_stream``.
        pafy_video: optional object providing ``length``, ``title`` and
            ``author``; when it is None the function runs in "test" mode and
            sleeps 3 seconds after each frame that produced a blob.
    """
    # Timestamp and image of the current reference frame; -1 / None mean "not set yet".
    base_frame_sec = -1
    base_frame = None
    test = (pafy_video == None)
    # stream = '/windows/mit/rubakov.mp4' # testing
    # NOTE(review): base_frame is None here, so this compares None with 0.
    # Python 2 evaluates that as True; the check was probably meant to use
    # base_frame_sec. Confirm before porting to Python 3, where it raises.
    if base_frame < 0:
        if pafy_video:
            yield 'event: onstart\ndata: %s\n\n' % json.dumps({'video_length': pafy_video.length,
                                                               'video_title': pafy_video.title,
                                                               # 'video_desc': pafy_video.description,
                                                               'video_author': pafy_video.author})
        else: 
            yield 'event: onstart\ndata: %s\n\n' % json.dumps({'video_length': 5000})

    try:
        for sec, frame in utils.get_frames_from_stream(stream,5):
            if int(sec % 20) == 0:
                yield 'event: onprogress\ndata: %s\n\n' % json.dumps({'sec': int(sec)})
            # The first frame only initialises the reference; nothing is compared yet.
            if base_frame_sec < 0:
                base_frame = frame
                base_frame_sec = sec
                continue
            # has_blob is reset per frame only in test mode; it is read only in test mode too.
            if test: has_blob = False
            # Blobs are extracted from the difference between this frame and the reference.
            for (xmin,ymin), blob in img_proc_utils.extract_blobs(frame-base_frame, img_proc_pipeline = img_proc_utils.pipeline2):
                proba = model.predict_proba(blob, model='webapp/model.pickle')
                if proba > 0.5:
                    has_blob = True
                    print sec, xmin, ymin,proba
                    yield 'data: %s\n\n' % json.dumps({'img': utils.img_to_base64_bytes(blob), #utils.img_to_base64_bytes(255-np.nan_to_num(abs(blob))),
                                                 'sec': int(sec),
                                                 'proba': proba,
                                                 'left_corner': [xmin,ymin],
                                                 'size': blob.shape,
                                                 'frame': utils.img_to_base64_bytes(frame)
                                             })
                    # A detection makes the current frame the new reference.
                    base_frame = frame
                    base_frame_sec = sec
            if test and has_blob: time.sleep(3)

    except StopIteration:
        # NOTE(review): presumably raised by the frame source when the stream
        # ends; a plain for-loop would absorb it - confirm this path is reachable.
        print 'onend!'
        yield 'event: onend\ndata: end\n\n'
        raise StopIteration
Exemplo n.º 14
0
    def get(self):
        """Score the request's features and return the prediction with feature contributions.

        The parsed arguments are turned into a DataFrame, restricted to
        ``columns``, transformed, and passed to ``predict_proba``. The returned
        model and its column list are then used to compute per-feature
        contributions.

        Returns:
            tuple of (dict with ``"Prediction"`` rounded to four decimals and
            ``"Impact"`` holding the contributions, status code 200).
        """
        parsed = self.parser.parse_args()

        # Build the frame, keep only the expected columns, then clean/transform.
        frame = args_to_pandas(parsed)[columns]
        frame = api_feature_transformation(frame)

        model, prediction, model_cols = predict_proba(frame)
        contributions = get_feature_contributions(model, frame[model_cols])

        payload = {
            "Prediction": round(prediction, 4),
            "Impact": contributions,
        }
        return payload, 200
Exemplo n.º 15
0
def predict(year, month, radius, aperture_size, incident_interval, time_step,
            select):
    """Run prediction and cross-validation for one month of features and print the results.

    Loads the month's feature pickle, optionally keeps only the last
    ``select`` rows, computes ``model.predict_proba`` features, maps them to
    labels with ``label_mapper``, cross-validates, and prints the label
    distribution, the configuration and the average of each returned metric.

    Args:
        year, month, radius, aperture_size: components of the feature file name.
        incident_interval: converted with ``int`` before being passed on.
        time_step: forwarded to ``model.predict_proba`` unchanged.
        select: when not None, number of trailing rows to keep.

    Returns:
        None; everything is printed.
    """
    config = dict(
        year=year,
        month=month,
        radius=radius,
        aperture_size=aperture_size,
        incident_interval=incident_interval,
        time_step=time_step,
        select=select,
    )
    features_path = f'output/waze/{year}_{month}_{radius}_{aperture_size}_features.pkl'
    df = pd.read_pickle(features_path)
    if select is not None:
        # Keep only the trailing `select` rows.
        df = df[-int(select):]
    x = model.predict_proba(df, int(incident_interval), time_step)
    y = label_mapper(x, load_incidents(aperture_size))
    print(np.unique(y, return_counts=True))
    cv_results = model.cross_validate(x, y)
    for name, value in config.items():
        print(name, value)
    for metric, values in cv_results.items():
        print(metric, np.average(values))
Exemplo n.º 16
0
    def step(self, X_test, iteration):
        """Advance the training schedule by one iteration.

        Each call performs at most one expensive action and records results in
        ``self.predictions`` as ``(score, name, y_pred)`` tuples and extracted
        features in ``self.features`` as ``(train, test)`` pairs. The phases,
        in order:

        1. Hard-coded actions for specific iteration numbers (1, 2, 3, 10, 15,
           20). Iteration 1 returns right after a quick logistic-regression
           fit; the others fall through to the later phases.
        2. Collection of ``*.pkl.lzma`` result files written under
           ``self.out_path_top1_2019`` / ``self.out_path_top3_2019``.
        3. Polling of the ``self.X_music`` future; once ready its features are
           stored under ``'mg'`` and ``self.pm`` is started.
        4. With probability 0.8, training one randomly chosen
           (preprocessing, feature set) candidate, then returning.
        5. Optionally adding a ``self.pm`` prediction when its best score is
           within 0.03 of the best recorded prediction.
        6. Training one not-yet-completed fusion of ``'mg'`` with another
           feature set, then returning.

        Args:
            X_test: test inputs; passed to ``make_inference`` and ``self.pm.predict``.
            iteration: iteration number selecting the hard-coded actions.

        Returns:
            None.
        """

        ####################################### Hardcoded sequences of actions ####################################################
        if iteration == 1:  # as fast as possible
            X_train, X_test = make_inference(self.se_extractor,
                                             self.X,
                                             X_test,
                                             self.device,
                                             max_len=3,
                                             num_eval=1)

            model = LogisticRegression(solver='lbfgs',
                                       multi_class='multinomial',
                                       max_iter=20)
            model.fit(X_train, self.y.argmax(axis=1))
            y_pred = model.predict_proba(X_test)
            # No validation score is computed here; 1 is stored as a placeholder score.
            self.predictions.append((1, 'first', y_pred))
            self.features['se_3_1'] = (X_train, X_test)
            return

        if iteration == 2:
            # schedule a non-blocking op
            self.data_id = ray.put((self.X, self.y, X_test, self.cv))
            self.X_music = ray.remote(
                num_gpus=0.25, num_cpus=1,
                max_calls=1)(extract_musicnn_features).remote(self.data_id)

            # Replace the iteration-1 prediction with a scored fit on the same features.
            self.predictions.clear()
            X_tr, X_t = self.features['se_3_1']
            model = LogisticRegression(solver='lbfgs',
                                       multi_class='multinomial',
                                       max_iter=100)
            score, y_pred = fit_single(None, model, self.cv, X_tr, self.y, X_t,
                                       self.metric)
            self.predictions.append((score, 'first', y_pred))

            # Launch the two external solutions as remote tasks.
            module_path = f'{self.root_path}/3rdparty/autospeech19/'
            self.solution3_2019 = ray.remote(
                num_gpus=0.25, num_cpus=1,
                max_calls=1)(top3_2019_kon).remote(module_path,
                                                   self.out_path_top3_2019,
                                                   self.n_classes,
                                                   self.data_id, self.metric)

            module_path = f'{self.root_path}/3rdparty/AutoSpeech/code_submission'
            self.solution1_2019 = ray.remote(
                num_gpus=0.25, num_cpus=1,
                max_calls=1)(top1_2019_hazza_cheng).remote(
                    module_path, self.out_path_top1_2019, self.n_classes,
                    self.data_id, self.metric)

        if iteration == 3:
            X_train, X_test = make_inference(self.se_extractor,
                                             self.X,
                                             X_test,
                                             self.device,
                                             max_len=5,
                                             num_eval=5)

            # Two variants are stored: features averaged over axis 1, and the
            # train features reshaped to 2-D with the test side still averaged.
            self.features['se_5_5'] = (X_train.mean(axis=1),
                                       X_test.mean(axis=1))
            X_train = X_train.transpose(1, 0, 2).reshape(
                (-1, X_train.shape[-1]))
            self.features['se_5_5_expand'] = (X_train, X_test.mean(axis=1))

        if iteration == 10:
            X_train, X_test = make_inference(self.se_extractor,
                                             self.X,
                                             X_test,
                                             self.device,
                                             max_len=10,
                                             num_eval=5)
            self.features['se_10_5'] = (X_train.mean(axis=1),
                                        X_test.mean(axis=1))

        if iteration == 15:
            X_train, X_test = make_inference(self.se_extractor,
                                             self.X,
                                             X_test,
                                             self.device,
                                             max_len=15,
                                             num_eval=5)
            self.features['se_15_5'] = (X_train.mean(axis=1),
                                        X_test.mean(axis=1))

        if iteration == 20:
            X_train, X_test = make_inference(self.se_extractor,
                                             self.X,
                                             X_test,
                                             self.device,
                                             max_len=10,
                                             num_eval=10)
            self.features['se_10_10'] = (X_train.mean(axis=1),
                                         X_test.mean(axis=1))

##################################### Check for external results ############################################################

        # NOTE(review): the bare except below silently discards every error,
        # including failures unrelated to concurrent file access.
        try:  # TODO add interprocess filelock (concurrent read/write)
            for root_path in [
                    self.out_path_top1_2019, self.out_path_top3_2019
            ]:

                ext_paths = glob.glob(root_path + '/*.pkl.lzma')
                for path in ext_paths:
                    score, y_pred = joblib.load(path)
                    # Name is "<directory>_<file stem>"; already-recorded names are skipped.
                    name = os.path.basename(path).split('.')[0]
                    name = f'{os.path.dirname(path)}_{name}'
                    names = set([x[1] for x in self.predictions])
                    if name in names:
                        continue
                    print('extresult', score, name)
                    self.predictions.append((score, name, y_pred))
        except:
            pass

########################################### Train a model #########################################################

# Check whether the futures are ready
        if 'mg' not in self.features:
            # Poll with a short timeout so this step is not blocked on the remote task.
            ready, nready = ray.wait([self.X_music], timeout=0.1)
            if ready:
                X_tr, X_te = ray.get(self.X_music)
                X_tr = time_pooling(X_tr, 'mean')
                X_te = time_pooling(X_te, 'mean')
                assert len(X_tr.shape) == 2 and len(X_te.shape) == 2
                print('>' * 400, 'MG ready')
                self.features['mg'] = (X_tr, X_te)
                self.pm.start(self.data_id,
                              time_budget=TIME_BUDGET,
                              seconds_per_step=10,
                              max_t=5,
                              reduction_factor=4)

        # Select candidates to train
        candidates = []
        for model_suf in ['standartize', 'normalize', 'noop', 'sign_sqrt']:
            for fname in self.features:
                out_name = f'{fname}_{model_suf}'
                if out_name in self.features_completed:
                    continue
                candidates.append((model_suf, fname, out_name))

        # Train random candidate
        if candidates and np.random.rand() < 0.8:  # sometimes skip this step
            r_idx = np.random.randint(0, len(candidates))
            model_suf, fname, out_name = candidates[r_idx]

            prep = ClassDesc(Preprocessor, model_suf)
            X_train, X_test = self.features[fname]
            # Fewer optimizer iterations are allowed for larger training sets.
            max_iter = 100 if len(X_train) < 1500 else 30
            model = LogisticRegression(solver='lbfgs',
                                       multi_class='multinomial',
                                       max_iter=max_iter)
            score, y_pred = fit_single(prep, model, self.cv, X_train, self.y,
                                       X_test)
            self.features_completed.add(out_name)
            self.predictions.append((score, out_name, y_pred))
            return  # one model per iteration

        if self.pm.executor is not None:
            pm_results = self.pm.executor.get_results()[0]
            if pm_results:
                # Sort both result lists by descending score.
                pm_results = list(sorted(pm_results,
                                         key=lambda x: -x['score']))
                pm_top_score = pm_results[0]['score']

                top_predictions = list(
                    sorted(self.predictions, key=lambda x: -x[0]))
                if pm_top_score > top_predictions[0][0] - 0.03:
                    # NOTE(review): this reads self.iteration rather than the
                    # `iteration` argument - confirm the two are kept in sync.
                    refit_seconds = 10 if self.iteration < 30 else 20
                    y_pred = self.pm.predict(X_test,
                                             refit_seconds=refit_seconds)
                    name = f'pm_{pm_top_score}'
                    if len(pm_results) > 1:
                        pm_top2_score = pm_results[1]['score']
                        name = f'pm_{pm_top_score}_{pm_top2_score}'

                    self.predictions.append((pm_top_score, name, y_pred))

        if 'mg' in self.features:
            X1_train, X1_test = self.features['mg']
            for model_suf in ['standartize', 'normalize', 'noop', 'sign_sqrt']:
                for fname in self.features:
                    # Feature sets whose name contains 'expand' are not fused.
                    if 'expand' in fname:
                        continue
                    out_name = f'mg_{fname}_fus_{model_suf}'
                    if out_name in self.features_completed:
                        continue

                    X2_train, X2_test = self.features[fname]
                    pre = ClassDesc(Preprocessor, model_suf)
                    post = ClassDesc(Preprocessor, model_suf)
                    fus = StaticFusion(pre, post)
                    X_train = fus.fit_transform(X1_train, X2_train)
                    X_test = fus.transform(X1_test, X2_test)
                    model = LogisticRegression(solver='lbfgs',
                                               multi_class='multinomial')

                    score, y_pred = fit_single(None, model, self.cv, X_train,
                                               self.y, X_test)
                    self.features_completed.add(out_name)
                    self.predictions.append((score, out_name, y_pred))
                    # Only the first pending fusion is trained per call.
                    return
Exemplo n.º 17
0
        return 1

def one_hot(df):
    """Expand three categorical columns into fixed 0/1 indicator columns.

    For each of ``delivery_method``, ``has_header`` and ``user_type`` one
    indicator column is added per listed category value (named
    ``<column>_<value>.0``) plus a ``<column>_nan`` column marking missing
    values. The indicator columns are added to ``df`` in place; the returned
    frame is a copy with the three source columns dropped.

    Missing values are detected with ``isna()``. A comparison such as
    ``column == np.nan`` is always False, which would leave every ``*_nan``
    indicator at zero.

    Args:
        df (pd.DataFrame): frame containing the three source columns.

    Returns:
        pd.DataFrame: ``df`` with indicator columns and without the source columns.
    """
    # Category values that get their own indicator, in output column order.
    categories = {
        'delivery_method': [1, 3],
        'has_header': [1],
        'user_type': [2, 3, 4, 5, 103],
    }
    for column, values in categories.items():
        for value in values:
            df[f'{column}_{float(value)}'] = (df[column] == value).astype(int)
        df[f'{column}_nan'] = df[column].isna().astype(int)

    return df.drop(columns=list(categories))



if __name__ == "__main__":
    # Smoke test: load the pickled model and scaler, score the example file,
    # and print the probabilities of one randomly chosen row.
    # NOTE: pickle.load executes arbitrary code; only load trusted model files.
    with open('models/LRmodel.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    with open('models/LRmodelScaler.pkl', 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    X, y = get_example_X_y('data/test_script_examples.csv', scaler)
    probabilities = model.predict_proba(X)
    # Row index is drawn from [0, 25).
    row = np.random.randint(low=0, high=25)
    print(probabilities[row])
Exemplo n.º 18
0
def train_logreg(trX,
                 trY,
                 vaX=None,
                 vaY=None,
                 teX=None,
                 teY=None,
                 penalty='l1',
                 max_iter=100,
                 C=2**np.arange(-8, 1).astype(float),
                 seed=42,
                 model=None,
                 eval_test=True,
                 neurons=None):
    """Fit (or just score) a logistic regression, selecting ``C`` by validation accuracy.

    Slightly modified version of the OpenAI implementation:
    https://github.com/openai/generating-reviews-discovering-sentiment/blob/master/utils.py

    When ``model`` is None, one ``LogisticRegression`` is fitted per value in
    ``C`` and scored on the validation set (or on the training set when no
    validation set is given). The best ``C`` is then used to fit the final
    model. When ``model`` is given, no training happens and the model is only
    scored.

    Args:
        trX, trY: training features and labels.
        vaX, vaY: optional validation features and labels.
        teX, teY: optional test features and labels.
        penalty: penalty passed to ``LogisticRegression``.
        max_iter: iteration limit passed to ``LogisticRegression``.
        C: a single value or an iterable of candidate values.
        seed: base random state; candidate ``i`` uses ``seed + i`` and the
            final model uses ``seed + len(C)``.
        model: optional already-fitted model to score instead of training.
        eval_test: when True and both ``teX`` and ``teY`` are given, the test
            set is scored; when False only test probabilities are computed and
            the validation score is reported in the test slot.
        neurons: optional column indices selecting a feature subset.

    Returns:
        tuple ``(model, scores, probs, c, nnotzero)`` where ``scores`` and
        ``probs`` are three-element lists ordered train / validation / test,
        scores are accuracies in percent, ``c`` is the ``C`` used, and
        ``nnotzero`` is the number of non-zero coefficients.
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in collections.abc.
    from collections.abc import Iterable

    # A single C value is wrapped so the search loop can iterate over it.
    if not isinstance(C, Iterable):
        C = [C]

    # Restrict every provided feature matrix to the requested columns.
    if neurons is not None:
        trX = trX[:, neurons]
        if vaX is not None:
            vaX = vaX[:, neurons]
        if teX is not None:
            teX = teX[:, neurons]

    if model is None:
        # Validation search over C.
        search_scores = []
        for i, c in enumerate(C):
            candidate = LogisticRegression(C=c,
                                           penalty=penalty,
                                           max_iter=max_iter,
                                           random_state=seed + i)
            candidate.fit(trX, trY)
            if vaX is not None:
                search_scores.append(candidate.score(vaX, vaY))
            else:
                search_scores.append(candidate.score(trX, trY))
        c = C[np.argmax(search_scores)]
        model = LogisticRegression(C=c,
                                   penalty=penalty,
                                   max_iter=max_iter,
                                   random_state=seed + len(C))
        model.fit(trX, trY)
    else:
        c = model.C

    # Number of features the model actually uses (non-zero weights).
    nnotzero = np.sum(model.coef_ != 0)

    scores = []
    probs = []
    train_score, train_probs = score_and_predict(model, trX, trY)
    scores.append(train_score * 100)
    probs.append(train_probs)

    # Without a validation set the training results stand in for it.
    if vaX is None:
        val_score, val_probs = train_score, train_probs
    else:
        val_score, val_probs = score_and_predict(model, vaX, vaY)
    scores.append(val_score * 100)
    probs.append(val_probs)

    # The test slot defaults to the validation results.
    eval_score, eval_probs = val_score, val_probs
    if teX is not None and teY is not None:
        if eval_test:
            eval_score, eval_probs = score_and_predict(model, teX, teY)
        else:
            eval_probs = model.predict_proba(teX)[:, 1]
    scores.append(eval_score * 100)
    probs.append(eval_probs)
    return model, scores, probs, c, nnotzero
Exemplo n.º 19
0
def model(features, test_features, encoding='ohe', n_folds=5):
    """Train and test a light gradient boosting model using cross validation.

    One LightGBM classifier is fitted per fold. Test-set probabilities and
    feature importances are averaged over the folds, and the out-of-fold
    predictions are used for the overall validation ROC AUC.

    Parameters
    --------
        features (pd.DataFrame):
            dataframe of training features to use
            for training a model. Must include the `SK_ID_CURR` and
            `TARGET` columns.
        test_features (pd.DataFrame):
            dataframe of testing features to use
            for making predictions with the model. Must include the
            `SK_ID_CURR` column.
        encoding (str, default = 'ohe'):
            method for encoding categorical variables. Either 'ohe' for
            one-hot encoding or 'le' for integer label encoding.
        n_folds (int, default = 5):
            number of folds to use for cross validation.

    Return
    --------
        submission (pd.DataFrame):
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model (mean over the folds).
        feature_importances (pd.DataFrame):
            dataframe with the feature importances from the model
            (mean over the folds).
        metrics (pd.DataFrame):
            dataframe with training and validation metrics (ROC AUC)
            for each fold and overall.

    Raises
    --------
        ValueError: if `encoding` is neither 'ohe' nor 'le'.

    """

    # Extract the test ids (needed for the submission dataframe)
    test_ids = test_features['SK_ID_CURR']

    # Extract the labels for training as a plain array: the fold indices
    # produced by KFold are positional, while indexing a Series with them
    # would be label-based and break on a non-default dataframe index.
    labels = np.asarray(features['TARGET'])

    # Remove the ids and target
    features = features.drop(columns=['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns=['SK_ID_CURR'])

    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)

        # Align the dataframes by the columns
        features, test_features = features.align(test_features,
                                                 join='inner',
                                                 axis=1)

        # No categorical indices to record
        cat_indices = 'auto'

    # Integer label encoding
    elif encoding == 'le':

        # Create a label encoder
        label_encoder = LabelEncoder()

        # List for storing categorical indices
        cat_indices = []

        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers; the encoder is
                # fitted on the training column and reused for the test one.
                features[col] = label_encoder.fit_transform(
                    np.array(features[col].astype(str)).reshape((-1, )))
                test_features[col] = label_encoder.transform(
                    np.array(test_features[col].astype(str)).reshape((-1, )))

                # Record the categorical indices
                cat_indices.append(i)

    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object. random_state is deliberately not passed:
    # it has no effect without shuffling and recent scikit-learn releases
    # reject the combination with a ValueError.
    k_fold = KFold(n_splits=n_folds, shuffle=False)

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features = features[train_indices]
        train_labels = labels[train_indices]
        # Validation data for the fold
        valid_features = features[valid_indices]
        valid_labels = labels[valid_indices]

        # Create the classifier (named clf so it does not shadow this
        # function's own name)
        clf = lgb.LGBMClassifier(n_estimators=10000,
                                 objective='binary',
                                 class_weight='balanced',
                                 learning_rate=0.05,
                                 reg_alpha=0.1,
                                 reg_lambda=0.1,
                                 subsample=0.8,
                                 n_jobs=-1,
                                 random_state=50)

        # Train the model
        # NOTE(review): early_stopping_rounds / verbose as fit() keyword
        # arguments are presumably only accepted by older LightGBM releases
        # (newer ones expect callbacks) - confirm the pinned version.
        clf.fit(train_features,
                train_labels,
                eval_metric='auc',
                eval_set=[(valid_features, valid_labels),
                          (train_features, train_labels)],
                eval_names=['valid', 'train'],
                categorical_feature=cat_indices,
                early_stopping_rounds=100,
                verbose=200)

        # Record the best iteration
        best_iteration = clf.best_iteration_

        # Record the feature importances (running mean over the folds)
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits

        # Make predictions (running mean over the folds)
        fold_test_probs = clf.predict_proba(test_features,
                                            num_iteration=best_iteration)[:, 1]
        test_predictions += fold_test_probs / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(
            valid_features, num_iteration=best_iteration)[:, 1]

        # Record the best score
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Clean up memory
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({
        'SK_ID_CURR': test_ids,
        'TARGET': test_predictions
    })

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance_values
    })

    # Overall validation score, computed on the out of fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics; the overall training score is
    # the mean of the per-fold training scores.
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({
        'fold': fold_names,
        'train': train_scores,
        'valid': valid_scores
    })

    return submission, feature_importances, metrics