def test_group_shuffle_split():
    """Check the invariants of every split yielded by GroupShuffleSplit.

    For each grouping in ``test_groups`` the splitter must report the
    requested number of splits, never put one group on both sides, cover
    every sample exactly once, and keep the number of groups per side within
    one of the requested proportion.
    """
    n_splits = 6
    test_size = 1. / 3

    for groups_i in test_groups:
        X = y = np.ones(len(groups_i))
        splitter = GroupShuffleSplit(n_splits, test_size=test_size,
                                     random_state=0)

        # Building the repr must not raise.
        repr(splitter)

        # The advertised number of splits matches the request.
        assert_equal(splitter.get_n_splits(X, y, groups=groups_i), n_splits)

        group_arr = np.asarray(groups_i)
        n_groups = len(np.unique(groups_i))

        for train, test in splitter.split(X, y, groups=groups_i):
            train_side = group_arr[train]
            test_side = group_arr[test]
            train_group_ids = np.unique(train_side)
            test_group_ids = np.unique(test_side)

            # 1) No group is shared between the train and test sides.
            assert_false(np.any(np.in1d(train_side, test_group_ids)))
            assert_false(np.any(np.in1d(test_side, train_group_ids)))

            # 2) Both sides together account for every sample.
            assert_equal(train_side.size + test_side.size, group_arr.size)

            # 3) The index sets themselves do not overlap.
            assert_array_equal(np.intersect1d(train, test), [])

            # 4) Group counts per side match the requested proportions,
            #    allowing +-1 for rounding.
            n_test_expected = round(test_size * n_groups)
            n_train_expected = round((1.0 - test_size) * n_groups)
            assert_true(abs(len(test_group_ids) - n_test_expected) <= 1)
            assert_true(abs(len(train_group_ids) - n_train_expected) <= 1)
示例#2
0
文件: utils.py 项目: marl/medleydb
def artist_conditional_split(trackid_list=None, test_size=0.15, num_splits=5,
                             random_state=None, artist_index=None):
    """Create artist-conditional train-test splits.

    The same artist (as defined by the artist_index) cannot appear
    in both the training and testing set.

    Parameters
    ----------
    trackid_list : list or None, default=None
        List of trackids to use in train-test split. If None, uses
        TRACK_LIST_V1.
    test_size : float, default=0.15
        Forwarded to GroupShuffleSplit as ``test_size``. The test set will be
        as close as possible in size to this value, but it may not be exact
        due to the artist-conditional constraint.
    num_splits : int, default=5
        Number of random splits to create
    random_state : int or None, default=None
        A random state to optionally reproduce the same random split.
    artist_index : dict or None, default=None
        Dictionary mapping each track id in trackid_list to a string that
        uniquely identifies each artist.
        If None, uses the predefined index ARTIST_INDEX.

    Returns
    -------
    splits : list of dicts
        List of length num_splits of train/test split dictionaries. Each
        dictionary has the keys 'train' and 'test', each which map to lists of
        trackids.

    Raises
    ------
    KeyError
        If a track id in trackid_list is missing from the artist index.

    """
    if trackid_list is None:
        trackid_list = TRACK_LIST_V1

    if artist_index is None:
        artist_index = ARTIST_INDEX

    # Resolve artists through the index selected above; reading the global
    # ARTIST_INDEX here would silently ignore a caller-supplied mapping.
    artists = np.asarray([artist_index[trackid] for trackid in trackid_list])

    splitter = GroupShuffleSplit(n_splits=num_splits,
                                 random_state=random_state,
                                 test_size=test_size)

    trackid_array = np.array(trackid_list)
    return [
        {
            'train': list(trackid_array[train]),
            'test': list(trackid_array[test]),
        }
        for train, test in splitter.split(trackid_array, groups=artists)
    ]
示例#3
0
def train_test_split(*arrays,
                     test_size=None,
                     train_size=None,
                     random_state=None,
                     shuffle=True,
                     labels=None):
    """Extend sklearn.model_selection.train_test_split to have group split.

    Every mode except ``shuffle='group'`` is delegated to
    ``sk_train_test_split``; the group mode is implemented here on top of
    ``GroupShuffleSplit``.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.
        When ``shuffle='group'`` the sizes are validated against the number
        of unique group labels, i.e. they count groups rather than samples.
    train_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.
    random_state : int, RandomState instance or None, default=None
        Controls the shuffling applied to the data before applying the split.
        Pass an int for reproducible output across multiple function calls.
    shuffle : bool, str or None, default=True
        One of [None, False, True, 'simple', 'stratified', 'group'].
        None/False: no shuffle; True, 'simple' and 'stratified': shuffle via
        ``sk_train_test_split``; 'group': shuffle with group labels.
    labels : array-like or None, default=None
        If shuffle='group', this array is used as group labels and must not
        be None. In every other mode it is forwarded unchanged as the
        ``stratify`` argument of ``sk_train_test_split``, so pass None when
        no stratification is wanted.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    Raises
    ------
    ValueError
        If ``shuffle`` is not a supported value, if ``shuffle='group'`` and
        ``labels`` is None, or if no array is given in group mode.

    """
    # Falsy values (None / False) mean "no shuffle" and skip this check.
    if shuffle and shuffle not in \
            ['simple', 'stratified', 'group', True, False]:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!" %
                         shuffle)

    if shuffle != 'group':
        # scikit-learn only understands a boolean shuffle flag.
        return sk_train_test_split(*arrays,
                                   test_size=test_size,
                                   train_size=train_size,
                                   random_state=random_state,
                                   shuffle=bool(shuffle),
                                   stratify=labels)

    if labels is None:
        raise ValueError("When shuffle='group', labels should not be None!")

    if len(arrays) == 0:
        raise ValueError("At least one array required as input")

    arrays = indexable(*arrays)

    labels = check_array(labels, ensure_2d=False, dtype=None)
    # Sizes are resolved in units of groups, because GroupShuffleSplit
    # interprets integer train/test sizes as group counts.
    n_groups = np.unique(labels).size

    n_train, n_test = _validate_shuffle_split(n_groups,
                                              test_size,
                                              train_size,
                                              default_test_size=0.25)

    cv = GroupShuffleSplit(n_splits=1,
                           test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    return list(
        chain.from_iterable((_safe_indexing(a, train), _safe_indexing(a, test))
                            for a in arrays))
示例#4
0
# %% [markdown]
# ### Learn-to-rank with LGBMRanker

# %% [markdown]
# If we decide to split our data into train/val, we can do it this way.

# %%
from sklearn.model_selection import GroupShuffleSplit

print("\n TRAINING THE MODEL: \n")

# feature_encoded_X = encoded_X
# Group-aware train/validation split: test_size=0.05 holds out roughly 5% of
# the groups for validation, and rows sharing a `srch_id_col` value always
# stay on the same side. The splitter is configured with n_splits=2, but
# next() consumes only the first split.
print("\tGoing to split data now!")
train_inds, val_inds = next(
    GroupShuffleSplit(test_size=0.05, n_splits=2,
                      random_state=7).split(encoded_X, groups=srch_id_col))

print(
    f"\tWill train with {len(train_inds)} and validate with {len(val_inds)} amount of data."
)

# Split train / validation by their indices
X_train = encoded_X[train_inds]
y_train = np.array(y[train_inds])
X_val = encoded_X[val_inds]
y_val = np.array(y[val_inds])

# Get the groups related to `srch_id`; the same positional indices are applied
# to `train_data`.
# NOTE(review): assumes `train_data` is row-aligned with `encoded_X` — confirm.
query_train = get_user_groups_from_df(train_data.iloc[train_inds])
query_val = get_user_groups_from_df(train_data.iloc[val_inds])
示例#5
0
    def sweep(
        self,
        params: Dict,
        X,
        y,
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
        early_stopping: bool = False,
        results_csv_path: str = "outputs/results.csv",
        splitting_criteria: str = "CV",
        test_indices: Union[None, List[int]] = None,
        num_splits: int = 5,
    ) -> pd.DataFrame:
        """Run a hyperparameter search and keep the best estimator.

        The fitted search's ``best_estimator_`` replaces ``self.model`` and
        the search's ``cv_results_`` are written to a timestamped CSV file.

        Parameters
        ----------
        params : dict
            Search space, handed to the selected search class.
        X, y
            Training data; passed through ``self.scalar`` first when
            ``self.scale_data`` is truthy.
        search_algorithm : str, default="bayesian"
            One of "bohb", "bayesian", "hyperopt", "optuna" (TuneSearchCV),
            "grid" (GridSearchCV) or "random" (RandomizedSearchCV).
            Matched case-insensitively.
        num_trials : int, default=3
            Number of trials; only used by the TuneSearchCV algorithms.
        scoring_func : str, default="r2"
            Scoring passed to the search class.
        early_stopping : bool, default=False
            Passed to TuneSearchCV; forced to True for "bohb".
        results_csv_path : str, default="outputs/results.csv"
            Base path of the results file; a ``_YYYYmmdd-HHMMSS`` timestamp
            is inserted before the ``.csv`` extension.
        splitting_criteria : str, default="CV"
            One of "cv", "timeseries", "grouped" or "fixed"
            (case-insensitive).
        test_indices : list of int or None, default=None
            ``test_fold`` for PredefinedSplit; required for "fixed".
        num_splits : int, default=5
            Number of splits for the "timeseries" and "grouped" criteria.

        Returns
        -------
        pd.DataFrame
            The search's ``cv_results_`` as a data frame.

        Raises
        ------
        ValueError
            For an unknown ``splitting_criteria`` or when "fixed" is chosen
            without a list of ``test_indices``.
        NotImplementedError
            For an unknown ``search_algorithm``.
        """
        if self.scale_data:
            X, y = self.scalar(X, y)

        criteria = splitting_criteria.lower()
        if criteria == "cv":
            cv = None
        elif criteria == "timeseries":
            cv = TimeSeriesSplit(n_splits=num_splits)
        elif criteria == "grouped":
            # NOTE(review): no `groups` are passed to `search.fit` below;
            # confirm this splitter can actually be used that way.
            cv = GroupShuffleSplit(n_splits=num_splits)
        elif criteria == "fixed":
            if not isinstance(test_indices, list):
                raise ValueError(
                    "fixed split used but no test-indices provided...")
            cv = PredefinedSplit(test_fold=test_indices)
        else:
            raise ValueError(
                f"Unknowing splitting criteria provided: {splitting_criteria}, should be one of [cv, timeseries, grouped, fixed]"
            )

        # early stopping only supported for learners that have a
        # `partial_fit` method
        from tune_sklearn import TuneSearchCV
        import mlflow
        import time

        # NOTE(review): os.path.join drops "file:/" when os.getcwd() is an
        # absolute path, so this URI is just "<cwd>/outputs" — confirm intent.
        mlflow.set_tracking_uri(os.path.join("file:/", os.getcwd(), "outputs"))

        # start mlflow auto-logging
        # mlflow.sklearn.autolog()

        algorithm = search_algorithm.lower()
        if algorithm == "bohb":
            early_stopping = True

        if algorithm in ("bohb", "bayesian", "hyperopt", "optuna"):
            search = TuneSearchCV(
                self.model,
                params,
                search_optimization=search_algorithm,
                cv=cv,
                n_trials=num_trials,
                early_stopping=early_stopping,
                scoring=scoring_func,
                loggers=["csv", "tensorboard"],
                verbose=1,
            )
        elif algorithm == "grid":
            search = GridSearchCV(
                self.model,
                param_grid=params,
                refit=True,
                cv=cv,
                scoring=scoring_func,
                verbose=1,
            )
        elif algorithm == "random":
            search = RandomizedSearchCV(
                self.model,
                param_distributions=params,
                refit=True,
                cv=cv,
                scoring=scoring_func,
                verbose=1,
            )
        else:
            raise NotImplementedError(
                "Search algorithm should be one of grid, hyperopt, bohb, optuna, bayesian, or random"
            )

        # with mlflow.start_run() as run:
        search.fit(X, y)
        self.model = search.best_estimator_
        results_df = pd.DataFrame(search.cv_results_)

        pathlib.Path(results_csv_path).parent.mkdir(exist_ok=True,
                                                    parents=True)
        # Strip the extension only when it really is ".csv"; blindly cutting
        # four characters would mangle any other path.
        if results_csv_path.endswith(".csv"):
            base_path = results_csv_path[:-4]
        else:
            base_path = results_csv_path
        final_path = base_path + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"

        logger.info(f"Saving sweeping results to {final_path}")
        results_df.to_csv(final_path)
        logger.info(f"Best hyperparams: {search.best_params_}")
        logger.info(f"Best score: {search.best_score_}")

        return results_df
示例#6
0
def split_test_train_v2(df, test_ratio=0.2, random_state=None):
    """Split ``df`` into train and test parts grouped by 'Patient ID'.

    Rows sharing a 'Patient ID' value are kept on the same side. Only the
    first split produced by the splitter is used.

    Returns
    -------
    tuple
        ``(train_inds, test_inds, train_df, test_df)`` where the index arrays
        are positional and the frames are ``df.iloc`` selections of them.
    """
    splitter = GroupShuffleSplit(test_size=test_ratio,
                                 random_state=random_state)
    first_split = next(splitter.split(df, groups=df['Patient ID']))
    train_inds, test_inds = first_split

    train_df = df.iloc[train_inds]
    test_df = df.iloc[test_inds]
    return train_inds, test_inds, train_df, test_df
示例#7
0
def split_validation(train_set, val_method='fo', fold_num=1, val_size=.1):
    """Split a training frame into train/validation pairs.

    Parameters
    ----------
    train_set : pd.DataFrame
        Train set waiting to be split. The user-level and time-aware methods
        read the columns 'user' and 'timestamp'.
    val_method : str, default='fo'
        Way to split validation:
            'cv': fold_num-fold cross validation
            'fo': fold_num random splits by ratio ``val_size``
            'tfo': one split by ratio, ordered by timestamp
            'tloo': one leave-one-out split, holding out each user's latest
                    row
            'loo': fold_num leave-one-out splits, one random row per user
            'ufo': fold_num random splits by ratio within each user
            'utfo': one time-aware split by ratio within each user
    fold_num : int, default=1
        Number of train/validation pairs; only used by 'cv', 'loo', 'fo'
        and 'ufo'.
    val_size : float, default=.1
        Fraction of the data used for validation.

    Returns
    -------
    train_set_list : list
        Generated training datasets.
    val_set_list : list
        Generated validation datasets.
    cnt : int
        Number of train/validation pairs.

    Raises
    ------
    ValueError
        If ``val_method`` is not one of the supported methods.
    """
    if val_method in ['tloo', 'tfo', 'utfo']:
        cnt = 1
    elif val_method in ['cv', 'loo', 'fo', 'ufo']:
        cnt = fold_num
    else:
        raise ValueError(
            'Invalid val_method value, expect: '
            'cv, loo, tloo, tfo, fo, ufo, utfo')

    train_set_list, val_set_list = [], []

    # The former 'oldufo' branch was dropped: the validation above rejects
    # that value, so it could never run.
    if val_method == 'ufo':
        for _ in range(fold_num):
            # Draw floor(len * val_size) row labels per user. The size must
            # be an int for np.random.choice, and users contributing zero
            # rows simply add an empty array.
            picked = [
                np.random.choice(np.asarray(idx),
                                 int(np.floor(len(idx) * val_size)),
                                 replace=False)
                for idx in train_set.groupby('user').groups.values()
            ]
            if picked:
                val_index = np.concatenate(picked)
            else:
                val_index = np.array([], dtype=int)

            val_set = train_set.loc[val_index, :].reset_index(drop=True).copy()
            sub_train_set = train_set[~train_set.index.isin(
                val_index)].reset_index(drop=True).copy()
            # Append the reduced frame, not the full one, so validation rows
            # do not leak into training.
            train_set_list.append(sub_train_set)
            val_set_list.append(val_set)
    elif val_method == 'utfo':
        train_set = train_set.sort_values(['user',
                                           'timestamp']).reset_index(drop=True)

        # After sorting, each user's rows are in time order; the tail beyond
        # ceil(len * (1 - val_size)) rows goes to validation. Users whose tail
        # is empty contribute nothing.
        val_index = []
        for idx in train_set.groupby('user').groups.values():
            split_len = int(np.ceil(len(idx) * (1 - val_size)))
            val_index.extend(idx[split_len:])

        val_set = train_set.loc[val_index, :]
        train_set = train_set[~train_set.index.isin(val_index)]
        train_set_list.append(train_set)
        val_set_list.append(val_set)
    elif val_method == 'cv':
        # random_state is omitted: it has no effect without shuffling and
        # current scikit-learn rejects the combination.
        kf = KFold(n_splits=fold_num, shuffle=False)
        for train_index, val_index in kf.split(train_set):
            # KFold yields positional indices, hence iloc.
            train_set_list.append(train_set.iloc[train_index, :])
            val_set_list.append(train_set.iloc[val_index, :])
    elif val_method == 'fo':
        for _ in range(fold_num):
            train, validation = train_test_split(train_set, test_size=val_size)
            train_set_list.append(train)
            val_set_list.append(validation)
    elif val_method == 'tfo':
        train_set = train_set.sort_values(['timestamp']).reset_index(drop=True)
        split_idx = int(np.ceil(len(train_set) * (1 - val_size)))
        train_set_list.append(train_set.iloc[:split_idx, :])
        val_set_list.append(train_set.iloc[split_idx:, :])
    elif val_method == 'loo':
        for _ in range(fold_num):
            # One random row label per user.
            val_index = [
                np.random.choice(np.asarray(idx))
                for idx in train_set.groupby('user').groups.values()
            ]
            val_set = train_set.loc[val_index, :].reset_index(drop=True).copy()
            sub_train_set = train_set[~train_set.index.isin(
                val_index)].reset_index(drop=True).copy()

            train_set_list.append(sub_train_set)
            val_set_list.append(val_set)
    elif val_method == 'tloo':
        train_set = train_set.sort_values(['timestamp']).reset_index(drop=True)

        # Rank 1 marks each user's most recent row, which is held out.
        train_set['rank_latest'] = train_set.groupby(
            ['user'])['timestamp'].rank(method='first', ascending=False)
        new_train_set = train_set[train_set['rank_latest'] > 1].copy()
        val_set = train_set[train_set['rank_latest'] == 1].copy()
        del new_train_set['rank_latest'], val_set['rank_latest']

        train_set_list.append(new_train_set)
        val_set_list.append(val_set)

    return train_set_list, val_set_list, cnt
示例#8
0
# 3.1.5.3
# LeavePGroupsOut demo: print the train/test indices of every split obtained
# with n_groups=2 over the three groups defined below.
from sklearn.model_selection import LeavePGroupsOut
X = np.arange(6)
y = [1, 1, 1, 2, 2, 2]
groups = [1, 1, 2, 2, 3, 3]
lpgo = LeavePGroupsOut(n_groups=2)
for train, test in lpgo.split(X, y, groups=groups):
    print("%s %s" % (train, test))

# 3.1.5.4
# GroupShuffleSplit demo: four seeded splits with test_size=0.5 over four
# groups; prints the train/test indices of each split.
from sklearn.model_selection import GroupShuffleSplit
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train, test in gss.split(X, y, groups=groups):
    print("%s %s" % (train, test))

# 3.1.6
# nothing

# 3.1.7
# nothing

# 3.1.7.1
# TimeSeriesSplit demo setup: six samples and a splitter with n_splits=3.
from sklearn.model_selection import TimeSeriesSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
tscv = TimeSeriesSplit(n_splits=3)
for train, test in tscv.split(X):
    clean_df.loc[~clean_df[pr].isin(PR_in), pr] = 'missing'
# Keep only rows with at least one non-'missing' value among the DXs columns,
# then apply the same filter to the PRs columns.
clean_df = clean_df.loc[(clean_df[DXs] != 'missing').sum(axis=1)>0]
clean_df = clean_df.loc[(clean_df[PRs] != 'missing').sum(axis=1)>0]
        
all_df = clean_df.reset_index(drop=True)
n_sample = len(all_df)
n_code = len(code_cat)-1

# Map every DX / PR column through its lookup dict
# (presumably code -> integer id; TODO confirm against DX_dict / PR_dict).
int_df = all_df.copy()
for dx in DXs:
    int_df[dx] = int_df[dx].map(DX_dict)    
for pr in PRs:
    int_df[pr] = int_df[pr].map(PR_dict)
all_df = int_df.reset_index(drop=True)

# Single group-aware split: test_size=1/n_fold is held out as the test set,
# grouped by SUBJECT_ID so a subject never appears on both sides.
gss = GroupShuffleSplit(n_splits=1, test_size=1/n_fold, random_state=tst_seed)
train_idx, tst_idx = next(gss.split(all_df, groups=all_df.SUBJECT_ID))
# all_df has a fresh 0..n-1 index (reset above), so label-based .loc matches
# the positional indices returned by the splitter.
train_df0 = all_df.loc[train_idx].reset_index(drop=True)
tst_df = all_df.loc[tst_idx].reset_index(drop=True)

# GloVe pretraining
# Co-occurrence counts are taken from the training rows only.
g = Glove(input_dim=len(code_cat), embedding_dim=code_embed_dim, count_cap=count_cap)
g.update_cooccur(train_df0[DXs+PRs])
cooccur_df = g.get_cooccur_df()
g.train_glove(cooccur_df=cooccur_df, cache_path=model_path+'temp/{}/'.format(job_index), epochs=80, earlystop_patience=10, 
reducelr_patience=3, batch_size=1024, verbose=2)
embed_mat = g.get_embed_mat()
# 'random' selects a uniform initializer; any other value uses the trained
# GloVe matrix as a constant initializer.
if embed_file=='random':
    embed_initializer = 'uniform'
else:
    embed_initializer = Constant(embed_mat)
示例#10
0
def main(args):
    """
    Train a classifier on normalized OD values and plot ROC results.

    Loads 'master_report.csv' from the examples directory, slices and
    normalizes the OD table, pivots it to one row per well, splits it into
    train/test sets grouped by 'serum ID', fits the requested classifier and
    writes ROC and threshold plots to the 'classifier_outputs' directory.

    :param object args: namespace object containing the arguments passed;
        only ``args.classifier`` is read here ('xgboost' or
        'logistic_regression')
    :raises ValueError: if ``args.classifier`` is any other value
    """
    clf_type = args.classifier
    # %% set file paths and load OD table
    data_dir = os.path.dirname(examples.__file__)
    output_dir = os.path.join(data_dir, 'classifier_outputs')
    os.makedirs(output_dir, exist_ok=True)
    logger = io_utils.make_logger(
        log_dir=output_dir,
        logger_name=LOG_NAME,
    )
    stitched_multisero_df = pd.read_csv(os.path.join(data_dir, 'master_report.csv'), index_col=0, low_memory=False)
    # Unicode-normalize (NFKC) and strip the serum IDs so they compare reliably
    stitched_multisero_df['serum ID'] = stitched_multisero_df['serum ID'].apply(
        lambda x: unicodedata.normalize('NFKC', x)).str.strip()
    # serum ID to exclude from computing ROC
    sera_roc_list = ['Pool', 'mab', 'Blank', 'CR3022']
    df_norm = stitched_multisero_df.copy()
    norm_antigen = 'xIgG Fc'
    offset_antigen = None
    norm_group = 'plate'
    offset_group = 'well'
    # NOTE: `suffix` is reassigned several times below; only the value set
    # just before the plotting calls is actually used.
    suffix = '_'.join([norm_antigen, 'norm_per_plate'])
    pipeline = 'nautilus'
    suffix = pipeline
    if norm_antigen is not None:
        suffix = '_'.join([pipeline, norm_antigen, 'norm_per', norm_group])
    # %% slice OD table
    # keep the selected pipeline; drop the excluded sera and 'xkappa-biotin'
    slice_cols = ['pipeline', 'serum ID', 'antigen']
    slice_keys = [[pipeline], sera_roc_list, ['xkappa-biotin']]
    slice_actions = ['keep', 'drop', 'drop']
    for col, action, key in zip(slice_cols, slice_actions, slice_keys):
        df_norm = slice_df(df_norm, action, col, key)
    # %% Normalize OD and transform the table to wide format for model training
    df_norm = normalize_od(df_norm, norm_antigen, norm_group)
    df_norm = offset_od(df_norm, offset_antigen, offset_group)
    # cast to str so the pivoted column tuples can be joined with "_" below
    df_norm['antigen_row'] = df_norm['antigen_row'].map(str)
    df_norm['antigen_col'] = df_norm['antigen_col'].map(str)
    multisero_df_pivot = df_norm.copy()
    multisero_df_pivot = pd.pivot_table(multisero_df_pivot,
                                     values='OD',
                                     index=['plate ID', 'well_id', 'serum ID',
                                            'serum type', 'serum dilution', 'secondary ID',
                                            'secondary dilution', 'pipeline'],
                                     columns=['antigen', 'antigen_row', 'antigen_col'])
    multisero_df_pivot.columns = ["_".join(cols) for cols in multisero_df_pivot.columns]
    # every pivoted OD column is a model feature
    features = multisero_df_pivot.columns.tolist()
    multisero_df_pivot.dropna(inplace=True)
    multisero_df_pivot.reset_index(inplace=True)
    # "positive" = 1, "negative" = 0
    multisero_df_pivot['target'] = (multisero_df_pivot['serum type'] == 'positive')
    # %% Split the dataset into train and test sets
    rand_seed = 0
    # Group by 'serum ID' so a serum never appears in both sets. The splitter
    # yields n_splits=2 splits; the first is used and the second is discarded.
    gss = GroupShuffleSplit(test_size=.4, n_splits=2, random_state=rand_seed)
    (train_ids, test_ids), _ = gss.split(multisero_df_pivot, groups=multisero_df_pivot['serum ID'])
    train = multisero_df_pivot.iloc[train_ids]
    test = multisero_df_pivot.iloc[test_ids]

    # %% set up CV folds by serum ID
    gkf = GroupKFold(n_splits=4)
    folds = []
    for fold in gkf.split(train, groups=train['serum ID']):
        folds.append(fold)
    #%% Initiate classifier instance
    if clf_type == 'xgboost':
        clf = xgb.sklearn.XGBClassifier(
                        learning_rate=0.01,
                        n_estimators=1000,
                        max_depth=5,
                        min_child_weight=1,
                        gamma=0,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        objective='binary:logistic',
                        nthread=8,
                        scale_pos_weight=1,
                        seed=0)
        #%% Tune xgboost parameter
        # the parameter grids are tuned one after another; each step starts
        # from the classifier returned by the previous one
        param_tests = [{
            'max_depth': range(1, 8, 1),
            'min_child_weight': range(1, 8, 1)
        },
            {
                'gamma': [i / 10.0 for i in range(0, 5)]
            },
            {
                'subsample': [i / 10.0 for i in range(6, 10)],
                'colsample_bytree': [i / 10.0 for i in range(6, 10)]
            },
            {
                'reg_alpha': [0, 1e-5, 1e-2, 0.1, 1, 100]
            }
        ]
        for param_test in param_tests:
            clf = tune_cls_para(clf, train, features, target='target', param_test=param_test, cross_valid=folds, n_jobs=-1)

        #%% retrain the classifier with optimal parameters from tun_cls_para with lower learning rate and more steps
        param = {'learning_rate': 0.001,
                 'n_estimators': 10000}
        clf.set_params(**param)
        clf, score = xgb_fit(clf, train, features, target='target', folds=folds, early_stopping_rounds=10000)
        plot_xgb_fscore(clf, output_dir=output_dir, output_fname='xgb_feature_importance')

    elif clf_type == 'logistic_regression':
        clf = LogisticRegressionCV(
            Cs=10,
            intercept_scaling=1,
            max_iter=5000,
            random_state=rand_seed,
            solver='saga',
            dual=False,
            fit_intercept=True,
            penalty='l2',
            tol=0.0001,
            cv=folds,
            verbose=0)
        clf, score = model_fit(clf, train, features, target='target')
    else:
        raise ValueError('Classifier type {} is not supported.'.format(clf_type))
    #%% Model prediction
    # Store the second predict_proba column as a pseudo 'OD' under the
    # synthetic antigen name 'combined'.
    # NOTE(review): train/test are .iloc selections of the pivot table, so
    # these assignments may trigger pandas' SettingWithCopyWarning — confirm.
    for df in [train, test]:
        df.loc[:, 'OD'] = clf.predict_proba(df[features])[:, 1]
        df.loc[:, 'antigen'] = 'combined'
        df.loc[:, 'antigen type'] = 'Diagnostic'
    # %% transform the dataframe back to long format for plotting
    test_keys = test.drop(features + ['target'], axis=1)
    antigen_list = ['SARS CoV2 N 50', 'SARS CoV2 RBD 250', 'SARS CoV2 spike 62.5']
    suffix = '_'.join([pipeline, norm_antigen, 'norm_per_plate', 'mean_rands', str(rand_seed), 'low_c', 'ci'])
    # wells that belong to the test set, used to restrict the per-antigen rows
    test_keys = test_keys[['plate ID', 'well_id']].drop_duplicates()
    roc_df = df_norm.copy()
    slice_cols = ['serum ID', 'antigen type', 'antigen']
    slice_keys = [sera_roc_list, ['Diagnostic'], antigen_list]
    slice_actions = ['drop', 'keep', 'keep']
    fpr = 0.05
    ci = 95
    hue = 'pipeline'
    for col, action, key in zip(slice_cols, slice_actions, slice_keys):
        roc_df = slice_df(roc_df, action, col, key)
    roc_df = pd.merge(test_keys, roc_df, how='left', on=['plate ID', 'well_id'])
    # %% compute ROC curves and AUC
    # average the OD over rows sharing the same antigen, serum and well keys
    roc_df = roc_df.groupby(['antigen', 'serum ID', 'well_id', 'plate ID',
                             'serum type', 'serum dilution', 'pipeline', 'secondary ID',
                             'secondary dilution'])['OD'].mean()
    roc_df = roc_df.reset_index()
    # add the classifier's 'combined' rows next to the individual antigens
    roc_df = pd.concat([test, roc_df])
    _ = roc_plot_grid(roc_df, output_dir, '_'.join(['ROC', clf_type, suffix]), 'pdf', col_wrap=4, ci=ci, fpr=fpr, hue=hue)
    # %% plot FPR & TPR v.s. threshold
    # threshold plot currently doesn't support CI
    ci = None
    roc_df = get_roc_df(roc_df, ci=ci)
    roc_df = roc_df.melt(id_vars=['antigen',
                                  'secondary ID',
                                  'secondary dilution',
                                  'pipeline',
                                  'threshold',
                                  'AUC'],
                         var_name='category',
                         value_name='rate'
                         )
    thr_plot_grid(roc_df, output_dir, '_'.join(['ROC_thr', clf_type, suffix]), 'pdf', col_wrap=4)
示例#11
0
# local files
import data_get

plt.style.use('seaborn')

# Load the un-averaged 'as7262 mango' data set.
x_data, _y, full_data = data_get.get_data('as7262 mango', average=False)

print(full_data.columns)
# Distinct LED currents and integration times present in the data; the loops
# below visit every (current, time) combination.
currents = full_data['LED current'].unique()
times = full_data['integration time'].unique()
print(currents, times)
print(full_data['saturation check'].unique())
pls = PLSRegression(n_components=6)
# pls = linear_model.LinearRegression()
# Two cross-validation schemes: 5-fold repeated 20 times, and 200 group-aware
# shuffle splits.
cv = RepeatedKFold(n_splits=5, n_repeats=20)
cv_group = GroupShuffleSplit(n_splits=200)
# Accumulators, one entry appended per (current, time) combination.
scores = []
labels = []
errors = []
training_scores = []
training_errors = []
for current in currents:
    for time in times:
        print(current, time)
        labels.append("{0}, {1}".format(current, time))

        # Fetch the X/Y data for this LED current and integration time.
        X, Y = data_get.get_data("as7262 mango",
                                 integration_time=time,
                                 led_current=current,
                                 return_type="XY",
                                 read_number=1)
示例#12
0
    print("test_indices:", test_indices)
    x_test = x_train_names_all[test_indices, ...]
    print("x_test:\n", x_test)
    y_test = y_train_labels_all[test_indices, ...]
    print("y_test:\n", y_test)
    n_fold += 1

# %%
# 2.2 GroupShuffleSplit
# sklearn.model_selection.GroupShuffleSplit works like ShuffleSplit, except that it first groups the samples by `groups` and then splits them into train/test sets by whole groups.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([3, 3, 3, 2, 3, 2])
groups = np.array([1, 3, 1, 2, 3, 2])  # splitting follows groups, not y
# 3 splits; the samples are first partitioned by `groups` into 3 groups (this takes priority), then split by train_size/test_size (which may not be met exactly): test_size=0.25 implies train_size=0.75
group_shuff = GroupShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
print(group_shuff.get_n_splits(X, y, groups))
print(group_shuff)
for train_index, test_index in group_shuff.split(X, y, groups):
    print("Train Index:", train_index, ",Test Index:", test_index)
    # Materialize the train/test arrays of this split and print them.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train)
    print(y_train)
    print(X_test)
    print(y_test)

# %%
sample = pd.DataFrame({
    'subject': [
        'p012', 'p012', 'p014', 'p014', 'p014', 'p024', 'p024', 'p024', 'p024',
    def _train_test_split(self):
        """Restrict ``self.metadata`` to the rows of the ``self.split`` subset.

        Songs (``songId``) are the grouping unit, so all tracks of a song
        land in the same subset.

        * If the metadata has a 'split' column, its 'train' / 'val' / 'test'
          assignment is used. When it defines no 'val' songs, a validation
          subset of size ``self.val_pct`` is carved out of the train songs.
        * Otherwise a test subset (test_size=.2) is split off first, and the
          remainder is split again (test_size=.1 / .8) into train and val.

        Both random paths seed NumPy's global RNG with 10, which is a
        process-wide side effect. ``self.metadata`` is left unfiltered when
        ``self.split`` is none of 'train', 'val' or 'test'.
        """
        self.metadata = self.metadata.copy()
        # create train and val and test splits for artist splitting
        if 'split' in self.metadata.columns:
            train_songs = self.metadata[self.metadata['split'] ==
                                        'train']['songId'].unique()
            val_songs = self.metadata[self.metadata['split'] ==
                                      'val']['songId'].unique()
            test_songs = self.metadata[self.metadata['split'] ==
                                       'test']['songId'].unique()

            if len(val_songs) == 0:
                # No predefined validation songs: take them from train.
                # `.values` replaces Series.get_values(), which was removed
                # in pandas 1.0.
                in_train = self.metadata['songId'].isin(train_songs)
                tracks = self.metadata[in_train]['trackId'].values
                songs = self.metadata[in_train]['songId'].values

                # test split
                np.random.seed(10)
                tracks, songs = shuffle(tracks, songs)

                gss = GroupShuffleSplit(n_splits=1,
                                        test_size=self.val_pct,
                                        random_state=10)
                train_mask, val_mask = next(
                    gss.split(X=tracks, y=None, groups=songs))
                val_songs = songs[val_mask]
                train_songs = songs[train_mask]

        else:
            tracks = self.metadata['trackId'].values
            songs = self.metadata['songId'].values

            # test split
            np.random.seed(10)
            tracks, songs = shuffle(tracks, songs)
            gss = GroupShuffleSplit(n_splits=1, test_size=.2, random_state=10)
            train_val_mask, test_mask = next(
                gss.split(X=tracks, y=None, groups=songs))
            test_songs = songs[test_mask]
            train_val_tracks = tracks[train_val_mask]
            train_val_songs = songs[train_val_mask]

            # train and val splits
            gss = GroupShuffleSplit(n_splits=1,
                                    test_size=.1 / .8,
                                    random_state=10)
            train_mask, val_mask = next(
                gss.split(X=train_val_tracks, y=None, groups=train_val_songs))
            val_songs = train_val_songs[val_mask]
            train_songs = train_val_songs[train_mask]

        if self.split == 'train':
            self.metadata = self.metadata[self.metadata['songId'].isin(
                train_songs)]
        elif self.split == 'val':
            self.metadata = self.metadata[self.metadata['songId'].isin(
                val_songs)]
        elif self.split == 'test':
            self.metadata = self.metadata[self.metadata['songId'].isin(
                test_songs)]
示例#14
0
    'objective': 'regression',
    'max_depth': 6,
    'learning_rate': LEARNING_RATE,
    "boosting_type": "gbdt",
    "subsample_freq": 1,
    "subsample": 0.9,
    "bagging_seed": 11,
    "metric": 'mae',
    "verbosity": -1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.4,
    'colsample_bytree': 1.0,
    'random_state': RANDOM_STATE
}

# Group-aware shuffle splitter with N_FOLDS seeded splits.
folds = GroupShuffleSplit(n_splits=N_FOLDS, random_state=RANDOM_STATE)

# Setup arrays for storing results
# Out-of-fold predictions are stored next to the training targets.
oof_df = train_df[['id', 'type', 'scalar_coupling_constant']].copy()
oof_df['oof_preds'] = 0
prediction = np.zeros(len(X_test))
feature_importance = pd.DataFrame()
test_pred_df = test_df.copy()
test_pred_df['prediction'] = 0
bond_count = 1
number_of_bonds = len(X['type'].unique())
# Each distinct value of the 'type' column is processed separately.
for bond_type in X['type'].unique():
    bond_start = timer()
    fold_count = 1
    # Train the model
    X_type = X.loc[X['type'] == bond_type]
示例#15
0
def evaluate_classifier(features,
                        class_names,
                        classifier_name,
                        params,
                        parameter_mode,
                        list_of_ids=None,
                        n_exp=-1,
                        train_percentage=0.90,
                        smote=False):
    """
    Evaluate a classifier for every candidate parameter value over repeated
    train/validation splits, print a per-class summary table and return the
    best parameter value.

    ARGUMENTS:
        features:     a list ([numOfClasses x 1]) whose elements contain
                      np matrices of features. Each matrix features[i] of
                      class i is [n_samples x numOfDimensions]
        class_names:    list of class names (strings)
        classifier_name: one of "svm", "svm_rbf", "knn", "randomforest",
                         "gradientboosting", "extratrees"
        params:        list of classifier parameters (for parameter
                       tuning during cross-validation)
        parameter_mode:    0: choose parameters that lead to maximum overall
                             classification ACCURACY
                          1: choose parameters that lead to maximum overall
                          f1 MEASURE
        list_of_ids:  optional group id per sample; when given (and
                      non-empty) the splits are produced by
                      GroupShuffleSplit so that a group never appears in
                      both the train and the validation part
        n_exp:        number of cross-validation experiments
                      (use -1 for auto calculation based on the num of samples)
        train_percentage: percentage of training (vs validation) data
                          default 0.90. Only used when list_of_ids is not
                          given; the grouped split uses a fixed 0.8.
        smote:        if True, the training part of every split is
                      resampled with SMOTE before scaling

    RETURNS:
         bestParam:    the value of the input parameter that optimizes the
         selected performance measure (None if parameter_mode is neither
         0 nor 1)

    RAISES:
         ValueError:   if classifier_name is not one of the supported names
    """

    # transcode list of feature matrices to X, y (sklearn)
    X, y = features_to_matrix(features)

    n_classes = len(features)
    ac_all = []
    f1_all = []
    f1_std_all = []
    pre_class_all = []
    rec_classes_all = []
    f1_classes_all = []
    cms_all = []

    # dynamically compute total number of samples:
    # (so that if number of samples is >10K only one train-val repetition
    # is performed)
    n_samples_total = X.shape[0]

    if n_exp == -1:
        n_exp = int(50000 / n_samples_total) + 1

    # Explicit None/length test: a plain truth test raises for numpy arrays
    # holding more than one element.
    use_groups = list_of_ids is not None and len(list_of_ids) > 0

    if use_groups:
        # Pre-compute one grouped split per experiment.
        # NOTE: train_size is fixed at 0.8 here; train_percentage is not
        # applied to the grouped split.
        train_indeces, test_indeces = [], []
        gss = GroupShuffleSplit(n_splits=n_exp, train_size=.8)
        for train_index, test_index in gss.split(X, y, list_of_ids):
            train_indeces.append(train_index)
            test_indeces.append(test_index)

    for C in params:
        # for each param value
        cm = np.zeros((n_classes, n_classes))
        f1_per_exp = []
        for e in range(n_exp):
            y_pred = []
            # for each cross-validation iteration:
            print("Param = {0:.5f} - classifier Evaluation "
                  "Experiment {1:d} of {2:d}".format(C, e + 1, n_exp))
            # split features:

            if use_groups:
                X_train, X_test, y_train, y_test = group_split(
                    X, y, train_indeces, test_indeces, e)
            else:
                X_train, X_test, y_train, y_test = \
                    train_test_split(X, y, test_size=1-train_percentage)

            # mean/std scale the features (the scaler is fitted on the
            # training part only, after optional SMOTE resampling):
            scaler = StandardScaler()
            if smote:
                sm = SMOTE(random_state=2)
                X_train, y_train = sm.fit_resample(X_train, y_train)
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)

            # train the requested classifier:
            if classifier_name == "svm":
                classifier = train_svm(X_train, y_train, C)
            elif classifier_name == "svm_rbf":
                classifier = train_svm(X_train, y_train, C, kernel='rbf')
            elif classifier_name == "knn":
                classifier = train_knn(X_train, y_train, C)
            elif classifier_name == "randomforest":
                classifier = train_random_forest(X_train, y_train, C)
            elif classifier_name == "gradientboosting":
                classifier = train_gradient_boosting(X_train, y_train, C)
            elif classifier_name == "extratrees":
                classifier = train_extra_trees(X_train, y_train, C)
            else:
                raise ValueError(
                    "Unknown classifier_name: {0!r}".format(classifier_name))

            # get predictions and compute current confusion matrix
            X_test = scaler.transform(X_test)
            for i_test_sample in range(X_test.shape[0]):
                y_pred.append(
                    classifier_wrapper(classifier, classifier_name,
                                       X_test[i_test_sample, :])[0])
            # current confusion matrices and F1:
            cmt = sklearn.metrics.confusion_matrix(y_test, y_pred)
            f1t = sklearn.metrics.f1_score(y_test, y_pred, average='macro')

            f1_per_exp.append(f1t)
            if cmt.size != cm.size:
                # Some classes occur neither in y_test nor in y_pred of this
                # split, so the confusion matrix is too small: pad it with
                # zero rows/columns at the missing class positions.
                all_classes = set(y)
                split_classes = set(y_test.tolist() + y_pred)
                # Ascending order is required: every insertion shifts the
                # following rows/columns, so inserting a larger index first
                # would land at the wrong position (or out of bounds).
                missing_classes = sorted(
                    int(x) for x in all_classes.difference(split_classes))
                for mm in missing_classes:
                    cmt = np.insert(cmt, mm, 0, axis=0)
                for mm in missing_classes:
                    cmt = np.insert(cmt, mm, 0, axis=1)
            cm = cm + cmt
        # small constant avoids division by zero for empty rows/columns
        cm = cm + 0.0000000010

        rec = np.array(
            [cm[ci, ci] / np.sum(cm[ci, :]) for ci in range(cm.shape[0])])
        pre = np.array(
            [cm[ci, ci] / np.sum(cm[:, ci]) for ci in range(cm.shape[0])])

        pre_class_all.append(pre)
        rec_classes_all.append(rec)

        f1 = 2 * rec * pre / (rec + pre)

        # Note: np.mean(f1_per_exp) will not be exactly equal to the
        # overall f1 (which is computed from the aggregated confusion
        # matrix, i.e. on a per-sample basis)
        f1_std = np.std(f1_per_exp)

        f1_classes_all.append(f1)
        ac_all.append(np.sum(np.diagonal(cm)) / np.sum(cm))

        cms_all.append(cm)
        f1_all.append(np.mean(f1))
        f1_std_all.append(f1_std)

    # ---- summary table: header ----
    print("\t\t", end="")
    for i, c in enumerate(class_names):
        if i == len(class_names) - 1:
            print("{0:s}\t\t".format(c), end="")
        else:
            print("{0:s}\t\t\t".format(c), end="")
    print("OVERALL")
    print("\tC", end="")
    for _ in class_names:
        print("\tPRE\tREC\tf1", end="")
    print("\t{0:s}\t{1:s}".format("ACC", "f1"))
    best_ac_ind = np.argmax(ac_all)
    best_f1_ind = np.argmax(f1_all)
    # ---- summary table: one row per parameter value ----
    for i in range(len(pre_class_all)):
        print("\t{0:.3f}".format(params[i]), end="")
        for c in range(len(pre_class_all[i])):
            print("\t{0:.1f}\t{1:.1f}\t{2:.1f}".format(
                100.0 * pre_class_all[i][c], 100.0 * rec_classes_all[i][c],
                100.0 * f1_classes_all[i][c]),
                  end="")
        print("\t{0:.1f}\t{1:.1f}".format(100.0 * ac_all[i],
                                          100.0 * f1_all[i]),
              end="")
        if i == best_f1_ind:
            print("\t best f1", end="")
        if i == best_ac_ind:
            print("\t best Acc", end="")
        print("")

    if parameter_mode == 0:
        # keep parameters that maximize overall classification accuracy:
        print("Confusion Matrix:")
        print_confusion_matrix(cms_all[best_ac_ind], class_names)
        return params[best_ac_ind]
    elif parameter_mode == 1:
        # keep parameters that maximize overall f1 measure:
        print("Confusion Matrix:")
        print_confusion_matrix(cms_all[best_f1_ind], class_names)
        print(f"Best macro f1 {100 * f1_all[best_f1_ind]:.1f}")
        print(f"Best macro f1 std {100 * f1_std_all[best_f1_ind]:.1f}")
        return params[best_f1_ind]
示例#16
0
    sys.path.append(module_path)
from keras_addon import ImageFrameGenerator, AUCCheckPoint

# Locations of the data set and of the directory that receives the models.
path = "/nfs/turbo/intmed-bnallamo-turbo/wsliu/Data/colonoscopy2/"
model_path = path + 'models/CV_adjudication/'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is not racy
# and it also creates a missing 'models/' parent directory.
os.makedirs(model_path, exist_ok=True)
data_path = path + 'subset_adjudication/'

batch_size = 32

# Training labels for the current test seed; the index is reset so that the
# positional indices returned by the splitter are also valid index labels.
labels = pd.read_csv(path +
                     'CV_adjudication/train_labels{}.csv'.format(tst_seed))
labels = labels.reset_index(drop=True)
# Single grouped train/validation split: rows sharing a 'SourceReportName'
# always end up on the same side.
split = GroupShuffleSplit(n_splits=1,
                          test_size=0.11,
                          random_state=24 + val_seed)
ind = split.split(labels, groups=labels['SourceReportName'])
trn_ind, val_ind = next(ind)
trn_df = labels.loc[trn_ind]
val_df = labels.loc[val_ind]

# Augmenting generator for training, plain generator for evaluation.
train_gen = ImageFrameGenerator(rotation_range=180,
                                width_shift_range=0.2,
                                height_shift_range=0.2,
                                shear_range=0.1,
                                zoom_range=0.2,
                                horizontal_flip=True,
                                vertical_flip=True,
                                fill_mode='nearest')
test_gen = ImageFrameGenerator()
示例#17
0
def get_chexpert(seed=2020,
                 policy=1,
                 No_finding=True,
                 parenchymal=True,
                 extraparenchymal=True,
                 limit_out_labels=True,
                 with_rank=False,
                 with_path=False,
                 morph_gen=False,
                 morph_load=False,
                 dumb_morph=False,
                 out_trans='train'):
    """Build CheXpert datasets split by patient, plus out-of-distribution sets.

    Reads the CheXpert-v1.0-small train and valid CSV files, keeps the label
    groups selected by the flags as in-distribution labels and treats the
    de-selected groups as out-of-distribution (OOD) labels.

    Parameters
    ----------
    seed : int
        random_state of the patient-level GroupShuffleSplit (70/30 split).
    policy : value
        Replacement for uncertain (-1) label entries.
    No_finding, parenchymal, extraparenchymal : bool
        Select which label groups are in-distribution. Parenchymal and
        extraparenchymal groups that are switched off become OOD labels.
    limit_out_labels : bool
        If True, every row with a positive OOD label is removed from the
        in-distribution data. If False, only rows that have a positive OOD
        label and no positive in-distribution label are removed.
    with_rank, with_path : bool
        Forwarded to the CheXpert dataset constructor.
    morph_gen, morph_load, dumb_morph : bool
        Select the alternative single-dataset return modes (see Returns).
        morph_load additionally inserts '_morph' into every image path.
    out_trans : str
        Key into ImageNet_trans used for the test and OOD datasets.

    Returns
    -------
    CheXpert or tuple
        * morph_gen or morph_load: a single CheXpert over the training frame
          with the 'morph' transform.
        * dumb_morph: a single CheXpert over the training frame with the
          'dumb_morph' transform.
        * otherwise: a 4-tuple (train, test, out_in, out_out) where out_in is
          None when limit_out_labels is False.
    """
    Parenchymal = [  # i.e. findings in the lungs themselves
        'Lung Lesion',
        'Lung Opacity',
        'Edema',
        'Consolidation',
        'Pneumonia',
        'Atelectasis',
    ]

    Extraparenchymal = [  # i.e. findings outside the lungs
        'Support Devices',
        'Pleural Effusion',
        'Pleural Other',
        'Pneumothorax',
        'Cardiomegaly',
        'Enlarged Cardiomediastinum',
        'Fracture',
    ]

    # in-distribution label columns, in the order No Finding / parenchymal /
    # extraparenchymal (the rank computation below relies on this order)
    labels = ['No Finding'] if No_finding else []
    labels = labels + Parenchymal if parenchymal else labels
    labels = labels + Extraparenchymal if extraparenchymal else labels

    out_labels = [
    ]  # 'No Finding' can never be OOD, as it is always in-dist phen
    out_labels = out_labels + Parenchymal if not parenchymal else out_labels
    out_labels = out_labels + Extraparenchymal if not extraparenchymal else out_labels

    # merge the official train and valid files into one frame; the split
    # into train/test is redone below at patient level
    cheXpert_train = pd.read_csv('data/CheXpert-v1.0-small/train.csv')
    cheXpert_test = pd.read_csv('data/CheXpert-v1.0-small/valid.csv')
    cheXpert = pd.concat([cheXpert_train,
                          cheXpert_test]).reset_index(drop=True)
    if morph_load:
        # insert '_morph' right after the leading 'CheXpert-v1.0-small'
        # characters of every path
        cheXpert['Path'] = cheXpert['Path'].apply(
            lambda x: x[:len('CheXpert-v1.0-small')] + '_morph' + x[len(
                'CheXpert-v1.0-small'):])
    cheXpert['Path'] = cheXpert['Path'].apply(lambda x: 'data/' + x)
    # the patient id is the number following 'patient' in the path
    cheXpert['Patient ID'] = cheXpert['Path'].str.extract(
        r'patient([0-9]+)\/').astype(int).values
    column_set = ['Path', 'Patient ID'] + labels
    ood_column_set = ['Path', 'Patient ID'
                      ] + ['No Finding'] + Parenchymal + Extraparenchymal

    # drop lateral views (eq to Frontal/Lateral == Frontal)
    cheXpert = cheXpert[~cheXpert['AP/PA'].isna()]

    # label policy - uncertain labels should be counted as `policy`;
    # missing entries are filled with 0 first
    mapping = dict({1: 1, 0: 0, -1: policy})
    cheXpert[labels + out_labels] = cheXpert[labels + out_labels].fillna(
        0).applymap(lambda x: mapping[x]).values

    # limit dataset to places with no positive from the out labels
    if limit_out_labels:
        # every row with a positive OOD label leaves the in-dist data; those
        # rows are split by whether they also carry a positive in-dist label
        ood_exists = cheXpert[out_labels].sum(axis=1) > 0
        cheXpert_out = cheXpert[ood_exists]
        in_exists = cheXpert_out[labels].sum(axis=1) > 0
        cheXpert_out_in = cheXpert_out[in_exists][ood_column_set]
        cheXpert_out_out = cheXpert_out[~in_exists][ood_column_set]
        cheXpert = cheXpert[~ood_exists]
    else:
        # only rows that are purely OOD (positive OOD label, no positive
        # in-dist label) leave the in-dist data; the "out_in" set stays empty
        ood_exists = (cheXpert[out_labels].sum(axis=1) >
                      0) & (cheXpert[labels].sum(axis=1) < 1)
        cheXpert_out = cheXpert[ood_exists]
        cheXpert = cheXpert[~ood_exists]
        cheXpert_out_in = pd.DataFrame([], columns=ood_column_set)
        cheXpert_out_out = cheXpert_out[ood_column_set]

    # split test train by identities
    # (the splitter yields positional indices, hence iloc below)
    splitter = GroupShuffleSplit(1, test_size=.3, random_state=seed)
    id_locs = list(
        splitter.split(range(len(cheXpert)), groups=cheXpert['Patient ID']))[0]
    train_df, test_df = cheXpert.iloc[id_locs[0]][column_set], cheXpert.iloc[
        id_locs[1]][column_set]

    # make sure every sample has some label
    # TODO move back for new model batch
    train_df = train_df[train_df[labels].sum(1) > 0]
    test_df = test_df[test_df[labels].sum(1) > 0]
    cheXpert_out_in = cheXpert_out_in[cheXpert_out_in[
        ['No Finding'] + Parenchymal + Extraparenchymal].sum(1) > 0]
    cheXpert_out_out = cheXpert_out_out[cheXpert_out_out[
        ['No Finding'] + Parenchymal + Extraparenchymal].sum(1) > 0]

    # calculate rank by rarest condition
    # (columns 0 and 1 of train_df are 'Path' and 'Patient ID')
    labels_ = train_df[train_df.columns[2:]].fillna(0).values
    if extraparenchymal and No_finding and not parenchymal:
        # merge related extraparenchymal columns into coarser groups before
        # ranking (5 columns in total, matching rank_label_names below)
        labels_ = np.concatenate(
            [
                labels_[:, :2],  # 'No Finding', 'Support Devices''
                labels_[:, 2:5].sum(1)
                [:,
                 None],  # 'Pleural Effusion', 'Pleural Other', 'Pneumothorax'
                labels_[:, 5:7].sum(1)
                [:, None],  # 'Cardiomegaly', 'Enlarged Cardiomediastinum'
                labels_[:, 7:]
            ],
            axis=1)  # 'Fracture'
        ranking = np.argsort(labels_.sum(0) / len(labels_))  # [::-1]
        rank_label_names = [
            'No Finding', 'Support Devices', 'Pleural/Pneumothorax', 'Cardio',
            'Fracture'
        ]
    else:
        ranking = np.argsort(labels_.sum(0) / len(labels_))  # [::-1]
        rank_label_names = train_df.columns[2:].values

    # `ranking` lists columns from least to most frequent; give the least
    # frequent column the largest weight so that argmax picks, per row, the
    # rarest column among those with the largest weighted value
    mul_value = np.arange(len(ranking))
    mul_value.put(ranking, np.arange(len(ranking))[::-1] + 1)
    rank_labels = np.argmax(labels_ * mul_value, 1)

    # NOTE(review): rank_labels is derived from train_df only, yet it is also
    # passed to the test dataset below - confirm this is intended.
    if morph_gen or morph_load:
        return CheXpert(train_df,
                        transform=ImageNet_trans['morph'],
                        policy=policy,
                        with_path=with_path,
                        rank_labels=rank_labels,
                        rank_label_names=rank_label_names)
    elif dumb_morph:
        return CheXpert(train_df, transform=ImageNet_trans['dumb_morph'])
    else:
        # (train, test, OOD rows that also have in-dist labels, pure OOD rows)
        return (
            CheXpert(train_df,
                     transform=ImageNet_trans['train'],
                     policy=policy,
                     with_path=with_path,
                     with_rank=with_rank,
                     rank_labels=rank_labels,
                     rank_label_names=rank_label_names),
            CheXpert(test_df,
                     transform=ImageNet_trans[out_trans],
                     policy=policy,
                     with_path=with_path,
                     with_rank=with_rank,
                     rank_labels=rank_labels,
                     rank_label_names=rank_label_names),
            None if not limit_out_labels else CheXpert(
                cheXpert_out_in,
                transform=ImageNet_trans[out_trans],
                policy=policy,
                with_path=with_path,
                with_rank=False),
            CheXpert(cheXpert_out_out,
                     transform=ImageNet_trans[out_trans],
                     policy=policy,
                     with_path=with_path,
                     with_rank=False),
        )
示例#18
0
# Usual procedure: load the training/test features and the training targets
x_tr, x_te, y_tr = load_data(features_folder=isi_folder,
                             data_folder=data_folder)

# No preprocessing or resampling steps are requested here; preprocess_data
# is still called because it also returns the group ids (groups_tr).
preprocessing_steps = []
resampling_steps = []
x_tr, x_te, groups_tr, y_tr = preprocess_data(
    x_tr,
    x_te,
    y_tr=y_tr,
    preprocessing_steps=preprocessing_steps,
    resampling_steps=resampling_steps)

# Time series specific preprocessing
# The splitter is configured for 5 splits but only the first one is consumed
# (via next()), giving a single grouped train/test partition.
splitter = GroupShuffleSplit(n_splits=5, test_size=0.33, random_state=42)
train_idx, test_idx = next(splitter.split(x_tr, y_tr, groups_tr))

# Wrap both partitions; the two trailing positional flags differ between the
# train and the test dataset (their meaning is defined by TimeSeriesDataset).
train_data = TimeSeriesDataset(x_tr.values[train_idx],
                               y_tr.values[train_idx].ravel(), False, True)
test_data = TimeSeriesDataset(x_tr.values[test_idx],
                              y_tr.values[test_idx].ravel(), False, False)

###############################################################################
#                                                                             #
#                                  Model run                                  #
#                                                                             #
###############################################################################


def get_models():  # tuples of (batch_size, model)
示例#19
0
文件: train.py 项目: zanachka/soft404
def main(args=None):
    """Train and evaluate the classifier, or train on all data and save it.

    Parses the command line (``args`` or ``sys.argv`` when None), loads the
    ``<in_prefix>.meta.jl.gz`` metadata, filters it, builds text features and
    then either

    * trains on all data and saves the classifier (``--save``), or
    * runs 10 grouped shuffle splits (grouped by the item's ``domain``) and
      prints mean and two standard deviations of every metric.

    The evaluation runs in a multiprocessing pool unless ``--no-mp`` is
    given, in which case no pool is created at all.
    """
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('in_prefix',
        help='Prefix of input filenames, ending with '
        '(.items.jl.gz and .meta.jl.gz)')
    arg('--lang', default='en', help='Train only for this language')
    arg('--show-features', action='store_true')
    arg('--limit', type=int, help='Use only a part of all data')
    arg('--no-mp', action='store_true', help='Do not use multiprocessing')
    arg('--max-features', type=int, default=50000)
    arg('--ngram-max', type=int, default=2)
    arg('--n-best-features',
        type=int,
        default=3000,
        help='Re-train using specified number of best features')
    arg('--save', help='Train on all data and save classifier')
    args = parser.parse_args(args)

    with json_lines.open(args.in_prefix + '.meta.jl.gz') as f:
        meta = list(f)
    if args.limit:
        meta = meta[:args.limit]

    # Do not include real soft404 candidates: keep only plain 200 responses
    # for unmangled urls and 404 responses for mangled urls
    flt_indices = {
        idx
        for idx, item in enumerate(meta)
        if (item['status'] == 200 and not item['mangled_url']
            or item['status'] == 404 and item['mangled_url'])
    }
    if args.lang:
        flt_indices &= get_lang_indices(meta, args.lang)
        print('Using only data for "{}" language'.format(args.lang))
    meta = [item for idx, item in enumerate(meta) if idx in flt_indices]
    print_data_summary(meta)

    data = partial(reader,
                   filename=args.in_prefix + '.items.jl.gz',
                   flt_indices=flt_indices)
    text_features = get_text_features(args.in_prefix,
                                      data,
                                      len(meta),
                                      ngram_max=args.ngram_max,
                                      max_features=args.max_features)
    assert text_features.shape[0] == len(meta)

    # positive class: the page answered with status 404
    ys = np.array([item['status'] == 404 for item in meta])
    _eval_clf = partial(
        eval_clf,
        text_features=text_features,
        ys=ys,
        show_features=args.show_features,
        vec_filename=get_vec_filename(args.in_prefix),
        n_best_features=args.n_best_features,
    )

    if args.save:
        # a single "fold" that trains on everything and tests on nothing
        _eval_clf((0, (np.array(range(len(meta))), [])), save=args.save)
    else:
        # group by domain so that one domain never lands on both sides
        folds = GroupShuffleSplit(n_splits=10).split(
            meta, groups=[item['domain'] for item in meta])
        all_metrics = defaultdict(list)

        def _collect(results):
            # Gather the metric values of every evaluated fold.
            for eval_metrics in results:
                for k, v in eval_metrics.items():
                    all_metrics[k].append(v)

        print('Training and evaluating...')
        if args.no_mp:
            # no worker processes are spawned in this mode
            _collect(map(_eval_clf, enumerate(folds)))
        else:
            with multiprocessing.Pool() as pool:
                # results must be consumed while the pool is still alive
                _collect(pool.imap_unordered(_eval_clf, enumerate(folds)))
        print()
        for k, v in sorted(all_metrics.items()):
            print('{:<5} {:.3f} ± {:.3f}'.format(k, np.mean(v),
                                                 np.std(v) * 2))
示例#20
0
 def _groups_shuffle_iterator(settings):
     """Create a GroupShuffleSplit from a settings mapping.

     ``settings['k']`` (required) fixes the test fraction to ``1 / k``;
     ``settings['n']`` (optional, defaults to ``k``) is the number of splits.
     """
     fold_count = settings['k']
     return GroupShuffleSplit(n_splits=settings.get('n', fold_count),
                              test_size=1.0 / fold_count)
示例#21
0
plt.plot(mean_predicted_value, fraction_of_positives, "o", markersize = 8, label=name)

plt.ylabel("Observed Proportion of Positives")
plt.ylim([-0.05, 1.06])
plt.xlabel("Mean Predicted Risk")
plt.legend(loc="lower right")
plt.title('Suicide attempt/death within %s days following a visit' % outcome)

plt.plot(prob_pos, np.full(prob_pos.shape, 1.03), '|', color = 'lightpink', ms=8, alpha = 0.5)
plt.tight_layout()
fig = plt.savefig('./Output/Calibration_curve_%s.pdf' % outcome)
plt.close(fig)

# LEARNING CURVE

# Pass the splitter object itself as `cv` together with the group labels.
# Calling .get_n_splits() here would only yield the integer 10, which makes
# learning_curve fall back to its default non-grouped K-fold strategy and
# lets the same group appear in both the train and the test part.
cv = GroupShuffleSplit(n_splits=10, test_size=0.20, random_state=33)
train_sizes, train_scores, test_scores = learning_curve(
    estimator=best_clf,
    X=X_train,
    y=y_train,
    groups=groups_train.values,
    train_sizes=[.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0],
    scoring='roc_auc',
    cv=cv,
    n_jobs=10)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)


fig = plt.figure(figsize = (6,6))
plt.plot(train_sizes, train_mean,
示例#22
0
def split_test(df, test_method='fo', test_size=.2):
    """
    Split raw interaction data into a training set and a test set.

    - parameters
    df : pd.DataFrame raw data waiting for test set splitting; needs a
         'user' column for the user-level methods and a 'timestamp' column
         for the time-aware methods
    test_method : str, way to split off the test set
                    'fo': split by ratio
                    'tfo': split by ratio with timestamp
                    'tloo': leave one out with timestamp
                    'loo': leave one out
                    'ufo': split by ratio in user level
                    'utfo': time-aware split by ratio in user level
                    'oldufo': hold out whole users (group split by user)
    test_size : float, size of test set

    - returns
    train_set : pd.DataFrame training dataset
    test_set : pd.DataFrame test dataset

    Both frames are returned with a fresh 0..n-1 index. An unrecognised
    test_method yields two empty DataFrames.
    """
    train_set, test_set = pd.DataFrame(), pd.DataFrame()

    # Original approach: e.g. with 100 users the data of 20 whole users
    # becomes the test set. Note that embeddings of those test users are
    # then never trained.
    if test_method == 'oldufo':
        driver_ids = df['user']
        # re-encode the user ids as consecutive integers to use as groups
        # (np.unique returns the sorted unique values and, with
        # return_inverse, the re-encoded id of every row)
        _, driver_indices = np.unique(np.array(driver_ids),
                                      return_inverse=True)
        gss = GroupShuffleSplit(n_splits=1,
                                test_size=test_size,
                                random_state=2020)
        # split by user groups; the splitter yields POSITIONAL indices, so
        # they must be applied with iloc (loc is only equivalent when df
        # happens to carry a default 0..n-1 index)
        train_idx, test_idx = next(gss.split(df, groups=driver_indices))
        train_set = df.iloc[train_idx, :].copy()
        test_set = df.iloc[test_idx, :].copy()

    # a random fraction of every user's interaction history becomes test
    elif test_method == 'ufo':
        # for every user: draw, without replacement, test_size of the
        # user's index labels; at least one label is always drawn so that
        # no user contributes an empty selection
        test_index = df.groupby(['user']).apply(
            lambda grp: np.random.choice(
                grp.index,
                max(int(np.floor(len(grp) * test_size)), 1),
                replace=False)).explode().values

        test_set = df.loc[test_index, :]
        train_set = df[~df.index.isin(test_index)]
        train_set = train_set.sort_values(['user']).reset_index(drop=True)
        test_set = test_set.sort_values(['user']).reset_index(drop=True)

    # the most recent fraction of every user's interactions becomes test
    elif test_method == 'utfo':
        # ascending by time within each user; after the reset every user
        # occupies a contiguous range of index labels, which time_split
        # relies on
        df = df.sort_values(['user',
                             'timestamp']).reset_index(drop=True)

        def time_split(grp):
            # grp.index holds the index labels of the current user's rows
            start_idx = grp.index[0]  # label of the first interaction
            split_len = int(np.ceil(len(grp) *
                                    (1 - test_size)))  # train count, rounded up
            split_idx = start_idx + split_len  # first test label
            end_idx = grp.index[-1]  # last test label

            # list of test labels for this user
            if (end_idx > split_idx):
                return list(range(split_idx, end_idx + 1))
            else:
                # the computed test part is empty (or a single row):
                # fall back to the user's last interaction
                return [end_idx]

        # apply() gives one list of test labels per user; explode() turns
        # the lists into one label per row; .values yields the flat array
        test_index = df.groupby('user').apply(time_split).explode().values
        test_set = df.loc[test_index, :]
        train_set = df[~df.index.isin(test_index)]

    # the most recent fraction of all interactions becomes test
    elif test_method == 'tfo':
        df = df.sort_values(['timestamp']).reset_index(drop=True)
        split_idx = int(np.ceil(len(df) *
                                (1 - test_size)))  # number of train rows
        # iloc slicing excludes split_idx itself on the train side
        train_set, test_set = df.iloc[:split_idx, :].copy(), df.iloc[
            split_idx:, :].copy()

    # plain ratio split over all interactions
    elif test_method == 'fo':
        train_set, test_set = train_test_split(df,
                                               test_size=test_size,
                                               random_state=2019)

    # the single most recent interaction of every user becomes test
    elif test_method == 'tloo':
        df = df.sort_values(['timestamp']).reset_index(drop=True)
        # descending rank: 1 marks the latest interaction of a user;
        # method='first' breaks timestamp ties by order of appearance
        df['rank_latest'] = df.groupby(['user'])['timestamp'].rank(
            method='first', ascending=False)
        train_set, test_set = df[df['rank_latest'] > 1].copy(), df[
            df['rank_latest'] == 1].copy()
        del train_set['rank_latest'], test_set['rank_latest']

    # one randomly chosen interaction of every user becomes test
    elif test_method == 'loo':
        test_index = df.groupby(
            ['user']).apply(lambda grp: np.random.choice(grp.index)).values
        test_set = df.loc[test_index, :].copy()
        train_set = df[~df.index.isin(test_index)].copy()

    train_set, test_set = train_set.reset_index(
        drop=True), test_set.reset_index(drop=True)
    print('train data num:{};test data num:{}'.format(train_set.shape[0],
                                                      test_set.shape[0]))

    return train_set, test_set
示例#23
0
#        housing
#        .pipe(lambda df: df['longitude'] * 10.0)
#        .pipe(lambda df: df['longitude'].int)
#        )
#
    #.pipe(np.int) #  + housing["latitude"]



# %%  # TODO(hali) KEY # make split on wellbore and cat sw depth and cat long and cat lat 
# [0,500,1500,1000,1500,2000,2500,3000,3500,4000] and 
from sklearn.model_selection import StratifiedShuffleSplit, GroupShuffleSplit
# %% see https://github.com/scikit-learn/scikit-learn/issues/9193
housing['stratify_group'] = housing[["income_cat", 'id']].apply(lambda x: '_'.join(x.astype(int).abs().astype(str)), axis=1)
# %%
# Group-aware 50/50 split: rows sharing a 'stratify_group' value stay together.
split_gp = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for train_inds, test_inds in split_gp.split(X=housing, y=None, groups=housing['stratify_group']):
    # split() yields positional indices, so select rows with iloc; loc would
    # only be equivalent if housing carried a default 0..n-1 index.
    strat_train_set_gp = housing.iloc[train_inds]
    strat_test_set_gp = housing.iloc[test_inds]
#X_train, X_test, y_train, y_test = X[train_inds], X[test_inds], y[train_inds], y[test_inds]
# %%
#strat_train_set_gp2, strat_test_set_gp2 = GroupShuffleSplit(
#        n_splits=1,test_size=0.5, random_state=42).split(
#                X=housing, y=None, groups=housing['stratify_group']).next()
    

# %%
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"], housing['id']):
    print(str(len(train_index)) + '_' + str(len(test_index)) + '\n')
    strat_train_set = housing.loc[train_index]
示例#24
0
dropout = 0.3
batchsize = 256
penalty = 0.

embed_glove = np.load(
    path +
    'all/sepdx1/test_embed/cosine/embed_mat_{0}_{1:.3f}_{2}_{3}{4}.npy'.format(
        code_embed_dim, 0, 20, cohort, 0))
embed_initializer = Constant(embed_glove)

auc_lst = []
y_pred_lst = []
recycle_pred = np.zeros((len(hosp_cat), n_val))
for val_ind in range(n_val):
    split = GroupShuffleSplit(n_splits=1,
                              test_size=0.2,
                              random_state=24 + val_ind)
    idx = split.split(train_df0, groups=train_df0.KEY_NRD)
    trn_idx, val_idx = next(idx)
    trn_df = train_df0.loc[trn_idx, ]
    val_df = train_df0.loc[val_idx, ]
    N_trn = len(trn_df)
    train_df = pd.concat([trn_df, val_df])

    DX1_series = train_df['DX1'].map(DX1_dict)
    DX1_array = DX1_series.values
    DX1_array_trn = DX1_array[:N_trn]
    DX1_array_val = DX1_array[N_trn:]

    DX_df = train_df[DXs]
    DX_df = DX_df.fillna('missing')
示例#25
0
文件: v02.py 项目: sgn410072018/ML
groups = groupsdata[:, 1]
y = groupsdata[:, 2]

X_means = X.mean(2)
x_vars = X.var(2)

X = np.concatenate((X_means, x_vars), axis=1)
X = np.absolute(X[:, [4, 5, 6, 7, 8, 9, 14, 15, 16, 17]])

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
#X = normalize(X, axis=1)

gss = GroupShuffleSplit(random_state=1)

for train, test in gss.split(X, y, groups=groups):
    #print("%s %s" % (train, test))
    for classifier in [
        (KNeighborsClassifier, KNeighborsClassifier()),
        (LinearDiscriminantAnalysis, LinearDiscriminantAnalysis()),
        (SVC, SVC(gamma="auto")),
        (LogisticRegression,
         LogisticRegression(max_iter=1000,
                            class_weight="balanced",
                            penalty="l2",
                            multi_class="auto",
                            solver="lbfgs"))
    ]:
        model = classifier[1]
示例#26
0
         processed               = glob(os.path.join(saving_dir,'*.csv'))
         if csv_filename in processed: # don't repeat what have done
             print(csv_filename)
             pass
         else:
             if n_splits >= 100:
                 idxs_test       = utils.customized_partition(df_data,['id','words'],n_splits)
                 while utils.check_train_test_splits(idxs_test): # just in case
                     idxs_test   = utils.customized_partition(df_data,['id','words'],n_splits = n_splits)
                 idxs_train      = [shuffle(np.array([idx for idx in np.arange(df_data.shape[0]) if (idx not in idx_test)])) for idx_test in idxs_test]
 #                idxs_train      = [utils.check_train_balance(df_data,idx_train,list(label_map.keys())) for idx_train in tqdm(idxs_train)]
                 cv              = zip(idxs_train,idxs_test)
             else:
                 from sklearn.model_selection import GroupShuffleSplit
                 cv = GroupShuffleSplit(n_splits     = n_splits,
                                        test_size    = 0.2,
                                        random_state = 12345)
                 idxs_train,idxs_test = [],[]
                 for idx_train,idx_test in cv.split(BOLD,targets,groups=groups):
                     idxs_train.append(idx_train)
                     idxs_test.append(idx_test)
         
             embedding_features  = np.array([word2vec_vec[word.lower()] for word in df_data['words']])
             
             # define the encoding model
             encoding_model      = linear_model.Ridge(
                                     alpha                       = alpha,        # L2 penalty, higher means more weights are constrained to zero
                                     normalize                   = True,         # normalize the batch features
                                     random_state                = 12345,        # random seeding
                                     )
             # black box cross validation
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

# Pull the feature matrix, labels and group ids out of the .npz archive.
sound_data = np.load('urban_sound.npz')
X_data, y_data, groups = (sound_data[key] for key in ('X', 'y', 'groups'))

# Quick look at the strictly positive group ids.
print(groups[groups > 0])

# A single group-aware 80/20 split; no random_state is given, so the
# partition differs from run to run.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2)
train_idx, test_idx = next(gss.split(X_data, y_data, groups=groups))

X_train, y_train, groups_train = (
    X_data[train_idx], y_data[train_idx], groups[train_idx])
X_test, y_test, groups_test = (
    X_data[test_idx], y_data[test_idx], groups[test_idx])

print(X_train.shape, X_test.shape)

# Persist both halves under the same keys as the input archive.
np.savez('urban_sound_train', X=X_train, y=y_train, groups=groups_train)
np.savez('urban_sound_test', X=X_test, y=y_test, groups=groups_test)
def get_cv(X, y):
    """Return a generator of group-aware train/test index splits for X and y.

    The rows are grouped by the ``'FIPS'`` entry of ``X``, and four shuffled
    splits with a 0.33 test fraction are produced from a fixed seed (42).
    """
    fips_groups = X['FIPS']
    splitter = GroupShuffleSplit(n_splits=4, test_size=0.33, random_state=42)
    return splitter.split(X, y, groups=fips_groups)
示例#29
0
import os
import csv
import warnings
import random
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

# Silence every warning category for the rest of the script.
warnings.filterwarnings("ignore")

# The relative paths below are resolved against the project directory.
os.chdir("/nfs/home/qiche/Project/CoCA/WP7/SB_visit/")

# Read the input table.
df = pd.read_csv("./Input/df.csv")

# Group-aware 80/20 split keyed on 'lopnr'. Two splits are configured, but
# only the first one produced by the seeded splitter is consumed.
splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state=33)
train_inds, test_inds = next(splitter.split(df, groups=df['lopnr']))
train, test = df.iloc[train_inds], df.iloc[test_inds]

# Write both partitions without the pandas index column.
train.to_csv('./Input/train.csv', index=False)
test.to_csv('./Input/test.csv', index=False)
    def fit_predict(self,
                    n_splits=3,
                    test_size=0.3,
                    with_groups=True,
                    model=None,
                    plots=True,
                    plot_splits=[-1],
                    x=None,
                    y=None):
        '''
          fit_predict runs cross-validation with the following pipeline:
                    if with_groups = True the sample is split with GroupShuffleSplit,
                    if False - with StratifiedShuffleSplit.
                    From a biological point of view it makes more sense to split with groups taken into account,
                    because the groups are chains => it is good to have the data of a whole chain in train.
          n_splits, test_size - parameters passed to GroupShuffleSplit or StratifiedShuffleSplit
          model - any model of your own may be supplied; by default the one set when the class was created is used.
          plots - whether to draw the plots (ROC-AUC, precision-recall, confusion matrix, probability-densities) or not
          plot_splits - which splits to draw plots for (splits are numbered from 0). By default the last split is drawn.
          x , y  - optionally pass your own data for cross-validation. By default the loaded sample is used.

          Returns a dictionary with the keys:
          'test score', 'train score' - f1_score on all splits
          'roc_auc': output of sklearn.metrics.roc_curve(y_test, y_prob)
          'prec_rec':[precision, recall, average_precision_score, prec_recall_plot]
          'confusion': [cnf_matrix, cnf_normed, cnf_plot, cnf_normed_plot] (normed means the normalised matrix)
          'plots':{'roc_auc', 'prec_recall', 'cnf_normed', 'cnf', 'prob_density'}
        '''
        # Per-split results collected during the loop below.
        fpr_tprs = []
        prec_recalls = []
        cnfs = []
        prob_dens_info = []

        # Work on a deep copy so that neither self.model nor the caller's
        # model object is the one being fitted.
        if model is None:
            self.trained_model = deepcopy(self.model)
        else:
            self.trained_model = deepcopy(model)

        # Fall back to the stored data only when x is omitted; the else
        # branch is a no-op.
        # NOTE(review): if x is given but y is left as None, y stays None.
        if x is None:
            x = self.x
            y = self.y
        else:
            x = x
            y = y
        # Both splitters are built with the same settings and a fixed seed;
        # only one of them is used below.
        gss = GroupShuffleSplit(n_splits=n_splits,
                                test_size=test_size,
                                random_state=0)
        sss = StratifiedShuffleSplit(n_splits=n_splits,
                                     test_size=test_size,
                                     random_state=0)

        # NOTE(review): self.groups is used even when custom x/y are passed -
        # confirm the caller keeps them aligned.
        if with_groups:
            splitted = gss.split(x, y, groups=self.groups)
        else:
            splitted = sss.split(x, y)

        i = 0
        # Wrap the split generator in a progress bar when the name `tqdm`
        # is truthy.
        iterator = tqdm_notebook(splitted, desc="Splits",
                                 leave=True) if tqdm else splitted
        for train_index, test_index in iterator:
            x_train = x[train_index]
            y_train = y[train_index]
            x_test = x[test_index]
            y_test = y[test_index]
            # The same model object is refitted on every split.
            self.trained_model.fit(x_train, y_train)
            # Second column of predict_proba (presumably the positive class -
            # TODO confirm).
            y_prob = self.trained_model.predict_proba(x_test)[:, 1]
            y_pred = self.trained_model.predict(x_test)
            # The score/prediction lists on self are appended to, not reset,
            # so they accumulate across calls of this method.
            self.train_score.append(
                f1_score(y_train, self.trained_model.predict(x_train)))
            self.test_score.append(f1_score(y_test, y_pred))

            self.y_prob.append(y_prob)
            self.y_true.append(y_test)
            self.indexes.append(test_index)
            #            treshold, _ = self.plot_probability_density(plots=False)
            #            y_pred = [1 if i>=treshold else 0 for i in y_prob]
            self.y_pred.append(y_pred)

            # Split counter; it is not read before `i` is rebound further down.
            i = i + 1
            fpr_tprs.append(roc_curve(y_test, y_prob))
            fpr, tpr, _ = fpr_tprs[-1]

            prec_recalls.append(self.prec_recall(y_test, y_prob, True))
            cnfs.append(self.plot_confusion_matrix(y_test, y_pred, True))
            prob_dens_info.append(self.plot_probability_density())

            # One ROC curve entry per split.
            self.roc_auc_plot.append(
                self.form_plot_string('plt.plot',
                                      fpr,
                                      tpr,
                                      color=self.colours[1],
                                      alpha=0.5))
        # Holds the probabilities and test indices of the last split only.
        self.y_data = [y_prob, test_index]
        #        print('Portion of sites in test: ', np.sum(y_test == 1)/y_test.shape[0])
        #        print('Portion of sites in train: ', np.sum(y_train == 1)/y_train.shape[0])

        # Two more entries (legend and title) go after the per-split curves.
        self.roc_auc_plot.append(
            self.form_plot_string('plt.legend', loc=4, fontsize=12))
        self.roc_auc_plot.append(
            self.form_plot_string('plt.title',
                                  self.model_name + ". ROC curves."))

        # Accept a single int as well as a list of split indices.
        plot_splits = [plot_splits] if (type(plot_splits)
                                        == int) else plot_splits
        for i in plot_splits:
            fpr, tpr, _ = fpr_tprs[i]
            # A negative index is shifted by 2 to skip the legend and title
            # entries appended above.
            # NOTE(review): the label below uses the shifted value, so the
            # default -1 is labelled 'Split -3' - confirm this is intended.
            i = i - 2 if i < 0 else i
            # Replace the selected curve with a randomly coloured, labelled one.
            self.roc_auc_plot[i] = self.form_plot_string(
                'plt.plot',
                fpr,
                tpr,
                color=random.choice(list(mcolors.CSS4_COLORS.keys())),
                alpha=0.5,
                label='Split %d' % i)

        # Result dictionary returned to the caller.
        data = {
            'test score': self.test_score,
            'train score': self.train_score,  #'treshold':treshold, 
            'roc_auc': fpr_tprs,
            'prec_rec': prec_recalls,
            'confusion': cnfs,
            'plots': {
                'roc_auc': self.roc_auc_plot,
                'prec_recall': [i[3] for i in prec_recalls],
                'cnf_normed': [i[3] for i in cnfs],
                'cnf': [i[2] for i in cnfs],
                'prob_density': [i[1] for i in prob_dens_info]
            }
        }

        if plots:
            # The combined ROC figure is shown once, then the remaining plots
            # are shown per requested split.
            self.show_plots({'roc_auc': self.roc_auc_plot})
            for i in plot_splits:
                data_to_plot = data['plots']
                # Pick the i-th entry of every per-split plot list; 'roc_auc'
                # is kept whole here and dropped on the next statement.
                data_to_plot = {
                    key: value[i] if key != 'roc_auc' else value
                    for key, value in data_to_plot.items()
                }
                del data_to_plot['roc_auc']
                self.show_plots(data_to_plot, suptitle='Split %d' % i)


#########   Saving the trained model in binary file ######################
#        if (not os.path.isdir('trained_models')):
#            os.mkdir('trained_models')

#        model_name = '%s_depth=%d_leaves=%d_%s_validation'%(re.split("\.|\'", str(self.trained_model.__class__))[-2],
#                                              self.trained_model.__dict__['max_depth'],
#                                              self.trained_model.__dict__['min_samples_leaf'],
#                                              self.filename)
#        with open("trained_models/"+model_name+".sav", 'wb') as file_to_save:
#            pickle.dump(self.trained_model, file_to_save)

        return data
示例#31
0
    # used to split train&val/test data
    groups = np.array(label)[:, 0]

    # subject-wise k-fold train&val/test split
    gkf = GroupKFold(n_splits)
    nn = 0
    for train_val_idx, test_idx in gkf.split(sequential_x, sequential_y,
                                             groups):
        # split whole dataset into train&val and test dataset
        x_train_val = sequential_x[train_val_idx]
        x_test = sequential_x[test_idx]
        y_train_val = sequential_y[train_val_idx]
        y_test = sequential_y[test_idx]

        #subject-wise
        gss = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=10)

        for train_idx, val_idx in gss.split(x_train_val, y_train_val,
                                            groups[train_val_idx]):
            # split train&val dataset into train and validation dataset
            x_train = x_train_val[train_idx]
            x_val = x_train_val[val_idx]
            y_train = y_train_val[train_idx]
            y_val = y_train_val[val_idx]
            # earlystopping to restore weights to the epoch with lowest val_loss
            es = EarlyStopping(monitor='val_loss',
                               mode='min',
                               verbose=1,
                               patience=10,
                               restore_best_weights=True)
            loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
示例#32
0
    def perform(self, clfs, scoring, verbose=0):
        """Evaluate every classifier under four validation schemes and print the scores.

        After calling ``self.segmentate()``, each estimator is run through:

        1. ``cross_validate`` with ``cv=4``;
        2. ``cross_validate`` with ``GroupKFold(n_splits=4)`` and
           ``self.signal_gr`` as groups;
        3. a single ``train_test_split`` hold-out (30 % test, seed 42);
        4. a single ``GroupShuffleSplit`` hold-out (30 % test, seed 42)
           grouped by ``self.signal_gr``.

        Parameters
        ----------
        clfs : iterable of (str, estimator) pairs
            Display name and estimator object for each classifier.
        scoring :
            Forwarded unchanged to ``cross_validate`` as ``scoring``.
        verbose : int, default=0
            Forwarded unchanged to ``cross_validate`` as ``verbose``.

        Returns
        -------
        None
            All results are printed.
        """

        def report_cv(scores):
            # One line per cross_validate entry: raw values, mean and std.
            for metric, s in scores.items():
                print(metric, ' \t', s, ' Mean: ', format(s.mean(), '.2f'),
                      ' Std: ', format(s.std(), '.2f'))

        def report_holdout(y_true, y_hat):
            # Accuracy and macro-averaged F1 of a single hold-out prediction.
            print("test_accuracy: ", format(accuracy_score(y_true, y_hat),
                                            '.2f'))
            print("test_f1_macro: ",
                  format(f1_score(y_true, y_hat, average='macro'), '.2f'))

        self.segmentate()

        # Array view of the labels, built once and reused by every scheme
        # that needs array indexing.
        labels = np.array(self.signal_or)

        # The same estimator object is used for all four schemes; the two
        # hold-out sections call fit() on it directly.
        for clf_name, estimator in clfs:
            print("*" * (len(clf_name) + 8), '\n***', clf_name,
                  '***\n' + "*" * (len(clf_name) + 8))

            print("---Kfold---")
            score = cross_validate(estimator,
                                   self.signal_dt,
                                   labels,
                                   scoring=scoring,
                                   cv=4,
                                   verbose=verbose)
            report_cv(score)

            print("---GroupKfold---")
            # groups and scoring are passed by keyword: they are keyword-only
            # in current scikit-learn, so positional use raises TypeError.
            score = cross_validate(estimator,
                                   self.signal_dt,
                                   labels,
                                   groups=self.signal_gr,
                                   scoring=scoring,
                                   cv=GroupKFold(n_splits=4),
                                   verbose=verbose)
            report_cv(score)

            print("---Train Test Split---")
            X_train, X_test, y_train, y_test = train_test_split(
                self.signal_dt, self.signal_or, test_size=0.3, random_state=42)

            estimator.fit(X_train, y_train)
            report_holdout(y_test, estimator.predict(X_test))

            print("---Train Test Split - Group---")
            # Only the first of the configured splits is consumed.
            train_inds, test_inds = next(
                GroupShuffleSplit(test_size=.30, n_splits=2,
                                  random_state=42).split(
                                      self.signal_dt, groups=self.signal_gr))

            X_train = self.signal_dt[train_inds]
            X_test = self.signal_dt[test_inds]
            y_train = labels[train_inds]
            y_test = labels[test_inds]

            estimator.fit(X_train, y_train)
            report_holdout(y_test, estimator.predict(X_test))
def split_csv(source_path, save_path, logger):
    """Split a headerless CSV into train/test/dev files and log their statistics.

    The CSV at ``source_path`` is read with the columns
    ``['ss', 'ac', 'at', 'text']``. A seeded ``GroupShuffleSplit`` keyed on
    ``'ss'`` first divides the rows into two partitions (the original code
    calls them "positive" and "negative"). Each partition is then split
    80/10/10 into train/test/val with ``train_test_split``, the matching
    parts are concatenated and shuffled, and the results are written to
    ``train.txt``, ``test.txt`` and ``dev.txt`` inside ``save_path`` via
    ``write_csv``.

    Parameters
    ----------
    source_path : path of the input CSV file.
    save_path : directory that receives the three output files.
    logger : object with an ``info`` method used for progress messages.
    """
    banner = "******************************************************"

    def merge_and_shuffle(pos_part, neg_part):
        # Stack both parts, shuffle all rows, and renumber the index.
        stacked = pd.concat([pos_part, neg_part], ignore_index=True)
        return stacked.sample(frac=1).reset_index(drop=True)

    logger.info("Saving path: {}".format(save_path))

    train_fname, test_fname, val_fname = (
        os.path.join(save_path, fname)
        for fname in ('train.txt', 'test.txt', 'dev.txt'))

    df_txt = pd.read_csv(source_path,
                         delimiter=',',
                         encoding='utf-8',
                         skip_blank_lines=True,
                         header=None,
                         names=['ss', 'ac', 'at', 'text'])

    # Exactly one split is configured, so take it directly. The training
    # side of that split is the "positive" partition, the test side the
    # "negative" one.
    splitter = GroupShuffleSplit(test_size=.20, n_splits=1, random_state=163)
    positive_idx, negative_idx = next(
        splitter.split(df_txt, groups=df_txt['ss']))

    negative = df_txt.iloc[negative_idx]
    positive = df_txt.iloc[positive_idx]

    # 80/10/10 -> train, test, val within each partition. The call order
    # (negative before positive) is kept because these splits are unseeded.
    train_neg, rest_neg = train_test_split(negative, test_size=0.2)
    train_pos, rest_pos = train_test_split(positive, test_size=0.2)
    test_neg, val_neg = train_test_split(rest_neg, test_size=0.5)
    test_pos, val_pos = train_test_split(rest_pos, test_size=0.5)

    # Recombine the partitions per subset and shuffle the rows.
    train_df = merge_and_shuffle(train_pos, train_neg)
    test_df = merge_and_shuffle(test_pos, test_neg)
    val_df = merge_and_shuffle(val_pos, val_neg)

    # Write the three subsets to disk.
    write_csv(train_df, train_fname)
    write_csv(test_df, test_fname)
    write_csv(val_df, val_fname)

    # Report subset sizes, then the per-'ac' row counts.
    subsets = (('train', 'Train', train_df),
               ('test', 'Test', test_df),
               ('val', 'Val', val_df))

    logger.info(banner)
    for lower_name, _, frame in subsets:
        logger.info("Length of {} dataset: {}".format(lower_name, len(frame)))

    for _, title_name, frame in subsets:
        logger.info("{} dataset groupby aspect category: \n{}".format(
            title_name, frame.groupby('ac').count()))
    logger.info(banner)