Python discretize示例，preprocess.discretize Python示例

示例#1

0

显示文件

                    record.append([w1,dist[w1][w2],w2,dist[w2][w3],w3,dist[w3][w1],w1])
    return record


if __name__ == '__main__':
    # Location of the embedding file; alternative locations/files are kept
    # below for reference.
    dirname = ''  # '../../..'
    # dirname = '../MODELS/'
    fname = 'wiki-news-300d-1M.vec'
    # fname = 'wiki-news-300d-1M.bin'
    NDIMS = 300
    VOCAB = 1000
    nsim = 10
    thresh = 0.65

    # Load the embeddings, run them through ps.normalize and ps.discretize,
    # then build the graph from the resulting vectors.
    word_vecs = ps.load_embedding(path.join(dirname, fname), VOCAB)
    word_vecs = ps.normalize(word_vecs, 0, NDIMS)
    word_vecs = ps.discretize(word_vecs, 0, NDIMS)
    g = Graph(word_vecs, nsim, thresh)
    g2 = g.makeGraph()
    print("graph made")
    # g2 = g.removeWts()

    mode = 2

    # Null mode: ask the user which mode to run.
    if mode == 0:
        # input() returns a string on Python 3, so it must be converted before
        # the integer comparisons below can ever match. Unparsable input keeps
        # the null mode instead of crashing.
        try:
            mode = int(input("Enter mode: 1.Prt Grph  2.Prt SCC  3.Chck Tri Ineq"))
        except ValueError:
            mode = 0

    # Print graph
    if mode == 1:
        print(g2)

示例#2

0

显示文件

文件： models.py 项目： szhou12/MachineLearning4PublicPolicy

def experiment_clfs(df,
                    target_col,
                    unused_cols,
                    numeric_cols,
                    clfs,
                    model_lst,
                    grid,
                    test_length=6,
                    is_temporal=True,
                    draw=True,
                    table=True):
    '''
    Experiment with different parameters for classifiers.
    Loop through each model, fit it for every parameter combination in its
    grid, and record the evaluation scores.
    Inputs:
        df: dataframe (joint table)
        target_col: (numpy array) target variable
        unused_cols: (numpy array) unused variables in df
        numeric_cols: (numpy array) numerical variables in df
        clfs: (dictionary) classifiers from create_clfs_params() function
        model_lst: (list of strings) model names to use
        grid: (dictionary) grid from create_clfs_params() function
        test_length: (positive int) testing window (unit=month)
        is_temporal: (bool) True if use temporal validation to split data;
            False if use random split
        draw: (bool) True if plot a PR curve for each fitted parameter
            combination
        table: (bool) True if write the evaluation results to csv
    Outputs:
        PR-curves (when draw is True)
        eval_results/classifiers_eval.csv: csv file that stores evaluation
            results (when table is True)
    Returns:
        (dataframe) one row per successfully evaluated model/parameter
        combination
    '''
    # Local import: only needed to make sure the results directory exists.
    import os

    output_cols = ('model', 'parameters', 'train_time', 'test_time',
                   'accuracy', 'F1_score', 'auc', 'p@1', 'p@2', 'p@5', 'p@10',
                   'p@20', 'p@30', 'p@50', 'r@1', 'r@2', 'r@5', 'r@10', 'r@20',
                   'r@30', 'r@50')
    # Everything after the four bookkeeping columns is a key of the score
    # dictionary returned by evaluate(), in output order.
    metric_cols = output_cols[4:]

    output_df = pd.DataFrame(columns=output_cols)

    if is_temporal:
        x_train, x_test, y_train, y_test = utils.split_data(
            df, target_col, unused_cols, test_length)
        # The fallback is triggered only when all four pieces come back None.
        if all(part is None for part in (x_train, x_test, y_train, y_test)):
            print(
                "Temporal split failed. Switch to random split at test size=30%."
            )
            x_train, x_test, y_train, y_test = utils.random_split(
                df, target_col, unused_cols)
    else:
        x_train, x_test, y_train, y_test = utils.random_split(
            df, target_col, unused_cols)

    #discretize numeric cols:
    x_train, x_test = preprocess.discretize(x_train, x_test, numeric_cols)

    # Resolve every classifier up front so an unknown model name fails before
    # any training time is spent.
    clf_lst = [clfs[x] for x in model_lst]
    for name, clf in zip(model_lst, clf_lst):
        print(name)
        params = grid[name]
        for p in ParameterGrid(params):
            # Best effort: a failing parameter combination is reported and
            # skipped so the rest of the grid still runs.
            try:
                model = clf.set_params(**p)
                start_train = time.time()
                model.fit(x_train, y_train)
                end_train = time.time()
                train_time = end_train - start_train

                start_test = time.time()
                y_pred = model.predict(x_test)
                end_test = time.time()
                test_time = end_test - start_test

                y_pred_probs = model.predict_proba(x_test)[:, 1]

                scores = evaluate(y_pred, y_pred_probs, y_test)

                # Row label doubles as a unique suffix for the plot name.
                index = len(output_df)
                output_df.loc[index] = (
                    [name, p, train_time, test_time] +
                    [scores[col] for col in metric_cols])

                if draw:
                    curve_name = name + str(index)
                    plot_pr_curve(y_test, y_pred_probs, curve_name, p)

            except Exception as e:
                # Say which model/parameters failed, not just the raw error.
                print("{} failed with parameters {}: {}".format(name, p, e))
        print("1 classifier completed.")
    if table:
        # to_csv does not create missing directories; make sure the target
        # exists so a finished grid search is not lost at the very end.
        os.makedirs('eval_results', exist_ok=True)
        output_df.to_csv('eval_results/classifiers_eval.csv')

    return output_df

示例#3

0

显示文件

文件： feat_engineering.py 项目： zhangbaibaia/DeepRec-torch

def preprocess_features(feat_meta: FeatureMeta, data: pd.DataFrame, split_continuous_category=False):
    r"""Convert raw data into per-row feature indices and feature values.

    Each continuous feature is either discretized or passed through the
    transformation object stored on its meta entry, depending on the entry's
    ``discretize`` attribute. Each categorical feature is encoded with a fresh
    label encoder. While processing, the meta entries are updated in place
    (``start_idx`` for every feature, ``bins`` for discretized features,
    ``processor`` and ``dim`` for categorical features).

    :param feat_meta: The FeatureMeta instance that describes raw_data.
    :param data: The raw_data to be transformed.
    :param split_continuous_category: When true, additionally return the index of
        the category features and the value of the continuous features.
    :return: ``feat_index, feat_value``; or
        ``feat_index, feat_value, category_index, continuous_value`` when
        ``split_continuous_category`` is true.
    """
    def _row_values(row):
        # Collapse one DataFrame row into its underlying array of values.
        return row.values

    logger = create_console_logger(name='feat_meta')
    write_info_log(logger, 'preprocess started')

    continuous_feats = feat_meta.continuous_feats
    categorical_feats = feat_meta.categorical_feats

    # Keep only the described columns: continuous first, then categorical.
    selected = list(continuous_feats.keys()) + list(categorical_feats.keys())
    data = data[selected]

    # Running offset into the global feature-index space.
    offset = 0
    feat_idx = pd.DataFrame()

    # --- continuous features ---
    write_info_log(logger, 'transforming continuous features')
    feat_value_continuous = pd.DataFrame()
    for name in continuous_feats:
        meta = continuous_feats[name]
        meta.start_idx = offset
        if meta.discretize:
            # Discretized feature: the index is the bin id shifted by the
            # offset, the value is a constant 1, and the feature takes
            # ``dim`` index slots.
            binned, intervals = discretize(data[name], meta.discretize, meta.dim)
            meta.bins = intervals
            feat_idx[name] = binned + offset
            feat_value_continuous[name] = pd.Series(np.ones(len(data[name])))
            offset += meta.dim
        else:
            # Transformed feature: a single index slot shared by all rows,
            # with the transformed column as the value.
            scaler = meta.transformation
            column = np.reshape(data[name].values, (-1, 1))
            column = scaler.fit_transform(column)
            column = np.reshape(column, -1)
            feat_value_continuous[name] = column
            feat_idx[name] = np.repeat(offset, repeats=len(data))
            offset += 1

    write_info_log(logger, 'transforming categorical features')
    # --- categorical features ---
    category_index = pd.DataFrame()
    for name in categorical_feats:
        meta = categorical_feats[name]
        encoder = LabelEncoder()
        feat_idx[name] = encoder.fit_transform(data[name]) + offset
        category_index[name] = feat_idx[name]
        meta.processor = encoder
        n_classes = len(encoder.classes_)
        meta.dim = n_classes
        meta.start_idx = offset
        offset += n_classes

    # TODO add multi category features
    feat_idx = feat_idx.apply(_row_values, axis=1)
    category_index = category_index.apply(_row_values, axis=1)

    # Every categorical feature contributes a constant value of 1.
    ones_shape = (len(data), len(categorical_feats.keys()))
    feat_value_category = pd.DataFrame(np.ones(ones_shape))
    feat_value = pd.concat([feat_value_continuous, feat_value_category], axis=1)
    feat_value = feat_value.apply(_row_values, axis=1)
    continuous_value = feat_value_continuous.apply(_row_values, axis=1)

    write_info_log(logger, 'preprocess finished')
    if not split_continuous_category:
        return feat_idx, feat_value
    return feat_idx, feat_value, category_index, continuous_value