Example #1
import argparse
import random
from pathlib import Path

# download_data, parse_data, split and preprocess are assumed to be
# project-local helpers defined or imported elsewhere in this module.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i',
                        '--input',
                        type=Path,
                        required=True,
                        help='Root directory for data')
    parser.add_argument('--seed', type=int, default=18, help='Random seed')

    args = parser.parse_args()
    root = args.input
    root.mkdir(parents=True, exist_ok=True)

    # Download raw data at root
    download_data(root)

    # Parse data into groups
    domains = [['Solo Cello', ''], ['Solo Violin', ''],
               ['Solo Piano', 'Beethoven']]
    parsed_dir = root / 'parsed'
    parse_data(root, parsed_dir, domains)

    # Split data into train-val-test
    random.seed(args.seed)
    split_dir = root / 'split'
    for input_path in parsed_dir.glob("*/"):
        output_path = split_dir / input_path.name
        split(input_path,
              output_path,
              train_ratio=0.8,
              val_ratio=0.1,
              filetype='wav',
              copy=True)

    # Preprocess data
    preproc_dir = root / 'preprocessed'
    preprocess(split_dir, preproc_dir)
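

# A minimal entry-point sketch showing how main() above would typically be run;
# the script name in the example command is a placeholder, not from the source.
if __name__ == '__main__':
    # Example invocation:
    #   python prepare_data.py --input /path/to/data --seed 18
    main()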
Example #2
def horizon_search():
    '''
    Experiment to demonstrate the effect of the predictive horizon on model performance on the test set.
    Trains models at different values of predictive horizon (N) in weeks.
    Set experiment options in config.yml.
    '''

    # Load relevant values from config
    input_stream = open(os.getcwd() + "/config.yml", 'r')
    cfg = yaml.full_load(input_stream)
    N_MIN = cfg['HORIZON_SEARCH']['N_MIN']
    N_MAX = cfg['HORIZON_SEARCH']['N_MAX']
    N_INTERVAL = cfg['HORIZON_SEARCH']['N_INTERVAL']

    test_metrics_df = pd.DataFrame()
    for n in range(N_MIN, N_MAX + N_INTERVAL, N_INTERVAL):
        print('** n = ', n, ' of ', N_MAX)

        # Preprocess data. Avoid recomputing ground truths and classifying features after first iteration.
        if n == N_MIN:
            preprocess(n_weeks=n, calculate_gt=True, classify_cat_feats=True, load_ct=False)
        else:
            if cfg['TRAIN']['MODEL_DEF'] == 'hifis_rnn_mlp':
                preprocess(n_weeks=n, calculate_gt=True, classify_cat_feats=False, load_ct=True)
            else:
                preprocess(n_weeks=n, calculate_gt=False, classify_cat_feats=False, load_ct=True)

        # Conduct cross validation at this prediction horizon
        callbacks = define_callbacks(cfg)
        if cfg['TRAIN']['MODEL_DEF'] == 'hifis_rnn_mlp':
            results_df = nested_cross_validation(cfg, callbacks, None)  # If time series data, do nested CV
        else:
            results_df = kfold_cross_validation(cfg, callbacks, None)  # If not time series data, do k-fold CV
        results_df = results_df[0:-2]   # Remove rows for mean and std dev
        results_df.drop('Fold', axis=1, inplace=True)   # Remove fold column
        results_df.insert(0, 'n', n)  # Add prediction horizon to test results
        test_metrics_df = pd.concat([test_metrics_df, results_df])  # Append results from this value of n

    # Save results
    test_metrics_df.to_csv(cfg['PATHS']['HORIZON_SEARCH'] + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                           sep=',', header=True, index=False)

    # Plot results
    plot_horizon_search(test_metrics_df, cfg['PATHS']['IMAGES'])
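

# A sketch of the config.yml values that horizon_search() reads, written here as
# the equivalent Python dict for illustration; the keys come from the code above,
# but every value and path shown is a placeholder, not taken from the source.
example_cfg = {
    'HORIZON_SEARCH': {'N_MIN': 1, 'N_MAX': 16, 'N_INTERVAL': 3},
    'TRAIN': {'MODEL_DEF': 'hifis_rnn_mlp'},
    'PATHS': {'HORIZON_SEARCH': 'results/experiments/horizon_search_',
              'IMAGES': 'img/visualizations/'},
}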
Example #3
import os
import yaml
import argparse
from src.data.preprocess import preprocess

parser = argparse.ArgumentParser()
parser.add_argument('--miladatadir', type=str, help="Mila dataset directory")
parser.add_argument('--fig1datadir', type=str, help="Figure 1 dataset directory")
parser.add_argument('--rsnadatadir', type=str, help="RSNA dataset directory")
parser.add_argument('--preprocesseddir', type=str, help="preprocessed output")
args = parser.parse_args()

cfg = yaml.full_load(open(os.getcwd() + "/config.yml", 'r'))  # Load config data
cfg['PATHS']['MILA_DATA'] = args.miladatadir
cfg['PATHS']['FIGURE1_DATA'] = args.fig1datadir
cfg['PATHS']['RSNA_DATA'] = args.rsnadatadir
cfg['PATHS']['PROCESSED_DATA'] = args.preprocesseddir
cfg['PATHS']['TRAIN_SET'] = cfg['PATHS']['PROCESSED_DATA'] + '/' + cfg['PATHS']['TRAIN_SET'].split('/')[-1]
cfg['PATHS']['VAL_SET'] = cfg['PATHS']['PROCESSED_DATA'] + '/' + cfg['PATHS']['VAL_SET'].split('/')[-1]
cfg['PATHS']['TEST_SET'] = cfg['PATHS']['PROCESSED_DATA'] + '/' + cfg['PATHS']['TEST_SET'].split('/')[-1]

preprocess(cfg)
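
# Worked illustration of the path rewriting above (all values are placeholders,
# not from the source config): the original file name is kept and re-rooted
# under the directory passed on the command line.
example_outdir = '/mnt/preprocessed'
example_train_set = 'data/preprocessed/train_set.csv'
assert example_outdir + '/' + example_train_set.split('/')[-1] == '/mnt/preprocessed/train_set.csv'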
Example #4
# Run preprocessing.
if os.getenv("AML_PARAMETER_PIPELINE") == 'train':
    cfg['PATHS']['DATA_INFO'] = args.preprocessedoutputdir + '/' + cfg['PATHS']['DATA_INFO'].split('/')[-1]
    cfg['PATHS']['ORDINAL_COL_TRANSFORMER'] = args.preprocessedoutputdir + '/' + cfg['PATHS']['ORDINAL_COL_TRANSFORMER'].split('/')[-1]
    cfg['PATHS']['OHE_COL_TRANSFORMER_MV'] = args.preprocessedoutputdir + '/' + cfg['PATHS']['OHE_COL_TRANSFORMER_MV'].split('/')[-1]
    cfg['PATHS']['OHE_COL_TRANSFORMER_SV'] = args.preprocessedoutputdir + '/' + cfg['PATHS']['OHE_COL_TRANSFORMER_SV'].split('/')[-1]
    preprocessed_data = preprocess(cfg=cfg,
                                   n_weeks=None,
                                   include_gt=True,
                                   calculate_gt=True,
                                   classify_cat_feats=True,
                                   load_ct=False)  # Preprocessing for training
else:
    cfg['PATHS']['DATA_INFO'] = args.inferencedir + cfg['PATHS']['DATA_INFO'].split('/')[-1]
    cfg['PATHS']['OHE_COL_TRANSFORMER_SV'] = args.inferencedir + cfg['PATHS']['OHE_COL_TRANSFORMER_SV'].split('/')[-1]
    cfg['PATHS']['ORDINAL_COL_TRANSFORMER'] = args.inferencedir + cfg['PATHS']['ORDINAL_COL_TRANSFORMER'].split('/')[-1]
    cfg['PATHS']['OHE_COL_TRANSFORMER_MV'] = args.inferencedir + cfg['PATHS']['OHE_COL_TRANSFORMER_MV'].split('/')[-1]
    preprocessed_data = preprocess(cfg=cfg,
                                   n_weeks=0,
                                   include_gt=False,
                                   calculate_gt=False,
                                   classify_cat_feats=False,
                                   load_ct=True)  # Preprocessing for inference
Example #5
import os
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# The earlier portion of this snippet plotted grid-search results
# (grid_search_res[metric_name]) as score vs. num_topics for each metric;
# preprocess() and main() are project-local helpers.
PATH_TOTAL_RAW = Path('data/raw/total.csv')
PATH_TOTAL_PROCESSED = Path('data/processed/total.csv')

PATH = Path(os.getcwd())
filename = 'contents.csv'
if filename in os.listdir(PATH / 'data/processed'):
    df = pd.read_csv(PATH / 'data/processed' / filename)
else:
    df = pd.read_csv(PATH / 'data/raw' / filename)
    df['preproc'] = preprocess(df['content'].values, stem=True)
    df.to_csv(PATH / 'data/processed' / filename)

df.dropna(inplace=True)

df = df[~df['title'].str.contains('Калып:')]  # remove templates
documents = df['preproc'].values

count_vect = CountVectorizer(input='content')  # , stop_words=STOPWORDS)
tf_idf_vect = TfidfVectorizer(input='content')  # , stop_words=STOPWORDS)
vect_model = count_vect

main(documents, vect_model, true_labels=df['topic'])
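
# Illustrative check of the chosen vectorizer (a sketch; main() above is assumed
# to fit it internally, and df['preproc'] is assumed to hold preprocessed text
# strings): CountVectorizer/TfidfVectorizer with input='content' consume an
# iterable of strings and return a sparse document-term matrix.
doc_term_matrix = vect_model.fit_transform(documents)
print(doc_term_matrix.shape)  # (n_documents, vocabulary_size)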
Example #6
def predict_and_explain_set(cfg=None,
                            data_path=None,
                            save_results=True,
                            give_explanations=True,
                            include_feat_values=True,
                            processed_df=None):
    '''
    Preprocess a raw dataset. Then get model predictions and corresponding LIME explanations.
    :param cfg: Custom config object
    :param data_path: Path to look for raw data
    :param save_results: Flag specifying whether to save the prediction results to disk
    :param give_explanations: Flag specifying whether to include LIME explanations in the predictions spreadsheet
    :param include_feat_values: Flag specifying whether to include client feature values in the predictions spreadsheet
    :param processed_df: Dataframe of preprocessed data; if passed, data_path is ignored.
    :return: Dataframe of prediction results, including explanations.
    '''

    # Load project config data
    if cfg is None:
        cfg = yaml.full_load(open(os.getcwd() + "/config.yml", 'r'))

    # Load data transformers
    scaler_ct = load(cfg['PATHS']['SCALER_COL_TRANSFORMER'])
    ohe_ct_sv = load(cfg['PATHS']['OHE_COL_TRANSFORMER_SV'])

    # Get preprocessed data
    if processed_df is not None:
        df = processed_df
        if cfg['TRAIN']['DATASET_TYPE'] == 'static_and_dynamic':
            indexes = np.array(pd.concat([df.pop('ClientID'), df.pop('Date')], axis=1))
        else:
            indexes = np.array(df.pop('ClientID'))
    else:
        if data_path is None:
            data_path = cfg['PATHS']['RAW_DATA']

        # Preprocess the data, using pre-existing sklearn transformers and classification of categorical features.
        df = preprocess(n_weeks=0,
                        include_gt=False,
                        calculate_gt=False,
                        classify_cat_feats=False,
                        load_ct=True,
                        data_path=data_path)
        indexes = np.array(df.index)

    # Ensure DataFrame does not contain ground truth (could happen if custom preprocessed data is passed)
    if 'GroundTruth' in df.columns:
        df.drop('GroundTruth', axis=1, inplace=True)

    # Load feature mapping information (from preprocessing)
    data_info = yaml.full_load(open(cfg['PATHS']['DATA_INFO'], 'r'))

    # Convert dataset to numpy array
    X = np.array(df)

    # Restore the model and LIME explainer from their respective serializations
    explainer = dill.load(open(cfg['PATHS']['LIME_EXPLAINER'], 'rb'))
    model = load_model(cfg['PATHS']['MODEL_TO_LOAD'], compile=False)

    # Get the predictive horizon that this model was trained on. It's embedded within the model name.
    n_weeks = int(model._name.split('_')[1].split('-')[0])

    # Load LIME and prediction constants from config
    model_def = cfg['TRAIN']['MODEL_DEF'].upper()
    NUM_SAMPLES = cfg['LIME'][model_def]['NUM_SAMPLES']
    NUM_FEATURES = cfg['LIME'][model_def]['NUM_FEATURES']
    THRESHOLD = cfg['PREDICTION']['THRESHOLD']
    CLASS_NAMES = cfg['PREDICTION']['CLASS_NAMES']

    # Define column names of the DataFrame representing the prediction results
    col_names = [
        'ClientID', 'Predictive Horizon [weeks]',
        'At risk of chronic homelessness',
        'Probability of chronic homelessness [%]'
    ]

    # Add columns for client explanation
    if give_explanations:
        for i in range(NUM_FEATURES):
            col_names.extend(
                ['Explanation ' + str(i + 1), 'Weight ' + str(i + 1)])

    # Add columns for client feature values
    if include_feat_values:
        col_names.extend(list(df.columns))
    rows = []

    # Predict and explain all items in dataset
    print('Predicting and explaining examples.')
    for i in tqdm(range(X.shape[0])):

        # Predict this example
        x = np.expand_dims(X[i], axis=0)
        y = np.squeeze(predict_instance(x, model, ohe_ct_sv, scaler_ct).T,
                       axis=1)  # Predict example
        prediction = 1 if y[1] >= THRESHOLD else 0  # Model's classification
        predicted_class = CLASS_NAMES[prediction]
        client_id = indexes[i][0] if cfg['TRAIN']['DATASET_TYPE'] == 'static_and_dynamic' else indexes[i]
        row = [int(client_id), n_weeks, predicted_class, y[1] * 100]

        # Explain this prediction
        if give_explanations:
            x = sp.sparse.csr_matrix(X[i])
            explanation = predict_and_explain(x, model, explainer, ohe_ct_sv,
                                              scaler_ct, NUM_FEATURES,
                                              NUM_SAMPLES)
            exp_tuples = explanation.as_list()
            for exp_tuple in exp_tuples:
                row.extend(list(exp_tuple))
            if len(exp_tuples) < NUM_FEATURES:
                row.extend([''] * (2 * (NUM_FEATURES - len(exp_tuples))))  # Fill with empty space if explanation too small

        # Add client's feature values
        if include_feat_values:
            client_vals = list(df.loc[indexes[i], :])
            for idx in data_info['SV_CAT_FEATURE_IDXS']:
                ordinal_encoded_val = int(client_vals[idx])
                client_vals[idx] = data_info['SV_CAT_VALUES'][idx][
                    ordinal_encoded_val]
            row.extend(client_vals)

        rows.append(row)

    # Convert results to a Pandas dataframe and save
    results_df = pd.DataFrame(rows, columns=col_names)
    if save_results:
        results_path = cfg['PATHS']['BATCH_PREDICTIONS'] + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv'
        results_df.to_csv(results_path,
                          columns=col_names,
                          index_label=False,
                          index=False)
    return results_df
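

# A minimal usage sketch (not from the source): run batch prediction with LIME
# explanations on a raw data export; the CSV path below is a placeholder.
if __name__ == '__main__':
    results = predict_and_explain_set(data_path='data/raw/client_export.csv',
                                      save_results=True,
                                      give_explanations=True,
                                      include_feat_values=True)
    print(results.head())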