Пример #1
0
def main():
    """Train and evaluate fatality classifiers on the balanced accident data.

    Reads ``accident_data_clean_balanced.csv``, ordinal-encodes the feature
    columns, holds out 20% of the rows as a test set, standardizes the
    features, and then fits a random forest and a cross-validated logistic
    regression. For each model the metrics, ROC curve, feature importances
    and permutation importances (test and train) are produced via ``utils``.
    """
    path = '../data/accidents'
    data = pd.read_csv(f'{path}/accident_data_clean_balanced.csv', header=0)

    # Feature columns
    cat_cols = ['roadway_type', 'intersection', 'light_condition', 'atmospheric_conditions',
                'manner_of_collision', 'body_type', 'vehicle_conditions', 'part_of_day']
    binary_cols = ['land_use_urban', 'national_highway_system', 'previous_dwi_convictions',
                   'previous_speeding_convictions', 'speeding_related', 'driver_vision_obscured', 'is_weekend',
                   'multiple_vehicles', 'nonmotorist_involved', 'multiple_motorists', 'drunk_driver_involved']
    numeric_cols = ['vehicle_year', 'speed_limit']

    data[cat_cols] = data[cat_cols].apply(lambda x: x.astype('category'))

    labels = data['multiple_fatalities']
    features = data[cat_cols + binary_cols + numeric_cols]
    feature_names = features.columns

    # Map every column's values to integer codes. This is fit on the full
    # frame so that every value present in the file has a code.
    features = OrdinalEncoder().fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.2, random_state=2020)

    # Fit the scaler on the training rows only; fitting it before the split
    # would let test-set means/variances leak into the training features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print('Class Balance')
    print(y_test.value_counts())
    print()

    # Display name -> (estimator, suffix handed to the utils helpers).
    models = {
        'Random Forest': (RandomForestClassifier(n_estimators=100,
                                                 min_samples_leaf=5,
                                                 random_state=2020),
                          'rf'),
        'Logistic Regression': (LogisticRegressionCV(cv=5, scoring='f1',
                                                     max_iter=1000,
                                                     random_state=2020),
                                'lr')
    }

    for name, (model, suffix) in models.items():
        print(name)
        print('-' * 20)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # Column 1 of predict_proba is passed on as the score for the ROC curve.
        y_probs = model.predict_proba(X_test)[:, 1]

        utils.print_metrics(y_test, y_pred)
        utils.roc_curve(y_test, y_probs, name, suffix)
        utils.feature_importance(model, feature_names, name, suffix)
        utils.permutation_importances(model, X_test, y_test, feature_names, name, suffix)
        utils.permutation_importances(model, X_train, y_train, feature_names, name, suffix, dataset='train')
        print('#' * 50)
Пример #2
0
    def test_(self,
              data_generator,
              model,
              repurposing_mode=False,
              test=False,
              verbose=True):
        y_pred = []
        y_label = []
        model.eval()
        for i, (v_d, label) in enumerate(data_generator):
            if self.drug_encoding == "MPNN" or self.drug_encoding == 'Transformer':
                v_d = v_d
            else:
                v_d = v_d.float().to(self.device)

            score = self.model(v_d)

            if self.binary:
                m = torch.nn.Sigmoid()
                logits = torch.squeeze(m(score)).detach().cpu().numpy()
            else:
                logits = torch.squeeze(score).detach().cpu().numpy()

            label_ids = label.to('cpu').numpy()
            y_label = y_label + label_ids.flatten().tolist()
            y_pred = y_pred + logits.flatten().tolist()
            outputs = np.asarray(
                [1 if i else 0 for i in (np.asarray(y_pred) >= 0.5)])

        model.train()
        if self.binary:
            if repurposing_mode:
                return y_pred
            ## ROC-AUC curve
            if test:
                if verbose:
                    roc_auc_file = os.path.join(self.result_folder,
                                                "roc-auc.jpg")
                    plt.figure(0)
                    roc_curve(y_pred, y_label, roc_auc_file,
                              self.drug_encoding)
                    plt.figure(1)
                    pr_auc_file = os.path.join(self.result_folder,
                                               "pr-auc.jpg")
                    prauc_curve(y_pred, y_label, pr_auc_file,
                                self.drug_encoding)

            return roc_auc_score(y_label, y_pred), average_precision_score(
                y_label, y_pred), f1_score(y_label, outputs), y_pred
        else:
            if repurposing_mode:
                return y_pred
            return mean_squared_error(y_label, y_pred), \
                pearsonr(y_label, y_pred)[0], \
                pearsonr(y_label, y_pred)[1], \
                concordance_index(y_label, y_pred), y_pred
def roc_experiment(motif, trials=10**5):
    """Compare the linear and pairwise models of ``motif`` by ROC AUC.

    Both models are built from ``motif``. The motif's own sites are scored as
    positives and ``trials`` sites drawn with ``random_site`` (same length as
    the first motif site) are scored as negatives.

    Returns:
        ``(li_auc, pw_auc)``: the fourth value returned by ``roc_curve`` for
        the linear model and for the pairwise model, in that order.
    """
    pairwise = pairwise_model_from_motif(motif)
    linear = linear_model_from_motif(motif)
    site_len = len(motif[0])

    background = []
    for _ in trange(trials):
        background.append(random_site(site_len))

    # Pairwise scores first, then linear, each as positives then negatives.
    pw_scores_pos = [pw_prob_site(s, pairwise) for s in motif]
    pw_scores_neg = [pw_prob_site(s, pairwise) for s in tqdm(background)]
    li_scores_pos = [linear_prob_site(s, linear) for s in motif]
    li_scores_neg = [linear_prob_site(s, linear) for s in tqdm(background)]

    _, _, _, pw_auc = roc_curve(pw_scores_pos, pw_scores_neg)
    _, _, _, li_auc = roc_curve(li_scores_pos, li_scores_neg, color='g')
    return li_auc, pw_auc
def main():
    """Grid-search a KNN classifier on the balanced accident data.

    Reads ``accident_data_clean_balanced.csv``, one-hot encodes the
    categorical columns, holds out 20% of the rows as a test set, selects
    ``n_neighbors`` by 5-fold cross-validated F1, and then prints the metrics
    and draws the ROC curve via ``utils``.
    """
    path = '../data/accidents'
    data = pd.read_csv(f'{path}/accident_data_clean_balanced.csv', header=0)

    cat_cols = [
        'month', 'roadway_type', 'intersection', 'light_condition',
        'atmospheric_conditions', 'manner_of_collision', 'body_type',
        'vehicle_conditions', 'part_of_day'
    ]
    binary_cols = [
        'land_use_urban', 'national_highway_system',
        'previous_dwi_convictions', 'previous_speeding_convictions',
        'speeding_related', 'driver_vision_obscured', 'is_weekend',
        'multiple_vehicles', 'nonmotorist_involved', 'multiple_motorists',
        'drunk_driver_involved'
    ]
    numeric_cols = ['vehicle_year', 'speed_limit']

    data[cat_cols] = data[cat_cols].apply(lambda x: x.astype('category'))

    labels = data['multiple_fatalities']
    features = data[cat_cols + binary_cols + numeric_cols]

    # One indicator column per category level; binary and numeric columns
    # pass through unchanged (no scaling is applied in this script).
    features = pd.get_dummies(features, columns=cat_cols)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=2020)
    print('Class Balance')
    print(y_test.value_counts())
    print()

    # Odd neighbour counts 1, 3, ..., 19.
    model = GridSearchCV(estimator=KNeighborsClassifier(),
                         param_grid={'n_neighbors': range(1, 20, 2)},
                         cv=5,
                         scoring='f1')
    model.fit(X_train, y_train)
    print(model.best_params_)
    print()

    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is passed on as the score for the ROC curve.
    y_probs = model.predict_proba(X_test)[:, 1]

    utils.print_metrics(y_test, y_pred)
    utils.roc_curve(y_test, y_probs, 'KNN', 'knn')
Пример #5
0
    def test(self, test_loader, epoch=0):
        """Evaluate on the first batch of ``test_loader``; plot and log results.

        Computes the MSE, GAN and facenet losses on one batch, shows a sample
        image grid, draws a ROC curve from the facenet embeddings of the
        generated and target images, and writes the scalars and the ROC
        figure to ``self.writer`` at step ``epoch``.
        """
        X, y = next(iter(test_loader))

        # Unpacking also requires X to be 5-dimensional; only B is used below.
        B, _, _, _, _ = X.shape

        for net in (self.unet, self.facenet, self.discrim):
            net.eval()

        with torch.no_grad():
            X_dev = X.to(device)
            y_dev = y.to(device)

            y_ = self.unet(X_dev)

            mse = self.mse_loss_function(y_, y_dev)
            loss_G = self.loss_GAN_generator(btch_X=X_dev)
            loss_D = self.loss_GAN_discrimator(btch_X=X_dev, btch_y=y_dev)

            loss_facenet, _, n_bad = self.loss_facenet(X_dev, y_dev)

        mse_value = mse.item()
        facenet_value = loss_facenet.item()
        bad_ratio = n_bad / B ** 2

        plt.title(f"epoch {epoch} mse={mse_value:.4} facenet={facenet_value:.4} bad={bad_ratio}")

        def channels_last(img):
            # Move the leading axis to the end via two adjacent swaps.
            return img.transpose(0, 1).transpose(1, 2)

        # Random sample: target | prediction on top, first | last input below.
        i = np.random.randint(0, B)
        top_row = np.hstack((channels_last(y[i]), channels_last(y_[i]).to(cpu)))
        bottom_row = np.hstack((channels_last(X[i][0]), channels_last(X[i][-1])))
        plt.imshow(np.vstack((top_row, bottom_row)))
        plt.axis('off')
        plt.show()

        self.writer.add_scalar("test bad_percent", bad_ratio, global_step=epoch)
        self.writer.add_scalar("test loss", mse_value, global_step=epoch)
        # self.writer.add_scalars("test GAN", {"discrim": loss_D.item(),
        #                                      "gen": loss_G.item()}, global_step=epoch)

        with torch.no_grad():
            n_for_show = 10
            y_show_ = y_.to(device)
            y_show = y_dev
            embeddings_anc, _ = self.facenet(y_show_)
            embeddings_pos, _ = self.facenet(y_show)

            # Inputs for the embedding projector call that is disabled below.
            embeds = torch.cat((embeddings_anc[:n_for_show], embeddings_pos[:n_for_show]))
            imgs = torch.cat((y_show_[:n_for_show], y_show[:n_for_show]))
            names = list(range(n_for_show)) * 2
            # self.writer.add_embedding(mat=embeds, metadata=names, label_img=imgs, tag="embeddings", global_step=epoch)

        anc_cpu = embeddings_anc.detach().to(cpu)
        pos_cpu = embeddings_pos.detach().to(cpu)
        _, fprs, tprs = roc_curve(anc_cpu, pos_cpu)
        rnk1 = rank1(anc_cpu, pos_cpu)

        plt.step(fprs, tprs)
        # plt.xlim((1e-4, 1))
        plt.yticks(np.arange(0, 1, 0.05))
        plt.xticks(np.arange(min(fprs), max(fprs), 10))
        plt.xscale('log')
        roc_auc = auc(fprs, tprs)
        plt.title(f"ROC auc={roc_auc} rnk1={rnk1}")
        self.writer.add_figure("ROC test", plt.gcf(), global_step=epoch)
        self.writer.add_scalar("auc", roc_auc, global_step=epoch)
        self.writer.add_scalar("rank1", rnk1, global_step=epoch)
        print(f"\n###### {epoch} TEST mse={mse_value:.4} GAN(G/D)={loss_G.item():.4}/{loss_D.item():.4} "
              f"facenet={facenet_value:.4} bad={bad_ratio:.4} auc={roc_auc} rank1={rnk1} #######")
Пример #6
0
def main():
    """Grid-search a KNN classifier on the person-level fatality data.

    Reads ``person_data_clean.csv``, ordinal-encodes the feature columns,
    holds out 20% of the rows as a test set, standardizes the features,
    selects ``n_neighbors`` by 5-fold cross-validated F1, and then prints the
    metrics and draws the ROC curve via ``utils``.
    """
    path = '../data/persons'
    data = pd.read_csv(f'{path}/person_data_clean.csv', header=0)

    cat_cols = ['person_type', 'trafficway_type', 'manner_of_collision', 'body_type', 'seating_position',
                'ejection', 'safety_equipment_use']
    binary_cols = ['sex', 'land_use_urban', 'rollover', 'air_bag_deployed']
    numeric_cols = ['age']

    data[cat_cols] = data[cat_cols].apply(lambda x: x.astype('category'))

    labels = data['fatality']
    features = data[cat_cols + binary_cols + numeric_cols]

    # Map every column's values to integer codes. This is fit on the full
    # frame so that every value present in the file has a code.
    features = OrdinalEncoder().fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.2, random_state=2020)

    # Fit the scaler on the training rows only; fitting it before the split
    # would let test-set means/variances leak into the training features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print('Class Balance')
    print(y_test.value_counts())
    print()

    # Odd neighbour counts 1, 3, ..., 19.
    model = GridSearchCV(estimator=KNeighborsClassifier(),
                         param_grid={'n_neighbors': range(1, 20, 2)},
                         cv=5, scoring='f1')
    model.fit(X_train, y_train)
    print(model.best_params_)
    print()

    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is passed on as the score for the ROC curve.
    y_probs = model.predict_proba(X_test)[:, 1]

    utils.print_metrics(y_test, y_pred)
    utils.roc_curve(y_test, y_probs, 'KNN', 'knn')
Пример #7
0
    def test(self, test_loader, epoch):
        """Evaluate on the first batch of ``test_loader`` and log the results.

        Computes the MSE and classifier losses on one batch, draws a ROC
        curve and rank-1 score from the encoder embeddings of the inputs and
        targets, and writes the scalars, the ROC figure and one sample image
        strip to ``self.writer`` at step ``epoch``.
        """
        self.enc.eval()
        self.dec.eval()
        self.clf.eval()
        X, y = next(iter(test_loader))
        B, D, C, W, H = X.shape
        # Fold the second axis into the batch axis: (B, D, ...) -> (B * D, ...).
        frames = X.view(B * D, C, W, H)

        with torch.no_grad():
            loss_mse = self.get_mse_loss(X.to(device), y.to(device))
            loss_clf = self.get_clf_loss(X.to(device), y.to(device))

        self.writer.add_scalar("test loss_mse", loss_mse.item(), global_step=epoch)
        self.writer.add_scalar("test loss_clf", loss_clf.item(), global_step=epoch)

        # Evaluation only: do not build an autograd graph for the embeddings.
        with torch.no_grad():
            embeddings_anc = self.enc(frames.to(device))
            embeddings_pos = self.enc(y.to(device))

        trshs, fprs, tprs = roc_curve(embeddings_anc.detach(), embeddings_pos.detach(), self.clf)
        rnk1 = rank1(embeddings_anc.detach(), embeddings_pos.detach(), self.clf)
        roc_auc = auc(fprs, tprs)

        plt.step(fprs, tprs)
        plt.yticks(np.arange(0, 1, 0.05))
        plt.xticks(np.arange(min(fprs), max(fprs), 10))
        plt.xscale('log')
        plt.title(f"ROC auc={roc_auc} rnk1={rnk1}")

        self.writer.add_figure("ROC test", plt.gcf(), global_step=epoch)
        self.writer.add_scalar("auc", roc_auc, global_step=epoch)
        self.writer.add_scalar("rank1", rnk1, global_step=epoch)

        print(f"\n###### {epoch} TEST loss_mse {loss_mse.item():.5} loss_clf {loss_clf.item():.5} "
              f"auc={roc_auc} rank1 = {rnk1}  #######")

        # Sample strip: first input frame | first target | decoder output.
        x = frames[0:1]
        with torch.no_grad():
            emb = self.enc(x.to(device))
            front = self.dec(emb).detach().cpu()
        self.writer.add_image("cfr", np.hstack((x[0], y[0], front[0])), global_step=epoch)
        torch.cuda.empty_cache()
def main():
    """Train and evaluate fatality classifiers on the person-level data.

    Reads ``person_data_clean.csv``, ordinal-encodes the feature columns,
    holds out 20% of the rows as a test set, standardizes the features, and
    then fits a class-weighted random forest and a class-weighted
    cross-validated logistic regression. For each model the metrics, ROC
    curve, feature importances and test-set permutation importances are
    produced via ``utils``.
    """
    path = '../data/persons'
    data = pd.read_csv(f'{path}/person_data_clean.csv', header=0)

    cat_cols = [
        'person_type', 'trafficway_type', 'manner_of_collision', 'body_type',
        'seating_position', 'ejection', 'safety_equipment_use'
    ]
    binary_cols = ['sex', 'land_use_urban', 'rollover', 'air_bag_deployed']
    numeric_cols = ['age']

    data[cat_cols] = data[cat_cols].apply(lambda x: x.astype('category'))

    labels = data['fatality']
    features = data[cat_cols + binary_cols + numeric_cols]
    feature_names = features.columns

    # Map every column's values to integer codes. This is fit on the full
    # frame so that every value present in the file has a code.
    features = OrdinalEncoder().fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=2020)

    # Fit the scaler on the training rows only; fitting it before the split
    # would let test-set means/variances leak into the training features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print('Class Balance')
    print(y_test.value_counts())
    print()

    # Display name -> (estimator, suffix handed to the utils helpers).
    models = {
        'Random Forest': (RandomForestClassifier(n_estimators=100,
                                                 min_samples_leaf=5,
                                                 class_weight='balanced',
                                                 random_state=2020), 'rf'),
        'Logistic Regression': (LogisticRegressionCV(cv=5,
                                                     scoring='f1',
                                                     class_weight='balanced',
                                                     max_iter=500,
                                                     random_state=2020), 'lr')
    }

    for name, (model, suffix) in models.items():
        print(name)
        print('-' * 20)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # Column 1 of predict_proba is passed on as the score for the ROC curve.
        y_probs = model.predict_proba(X_test)[:, 1]

        utils.print_metrics(y_test, y_pred)
        utils.roc_curve(y_test, y_probs, name, suffix)
        utils.feature_importance(model, feature_names, name, suffix)
        utils.permutation_importances(model, X_test, y_test, feature_names,
                                      name, suffix)
        print('#' * 50)