Example #1
def get_test_data():
    '''
    Input:  None
    Output: (X, ids) - test-set feature DataFrame and the corresponding ids
    '''
    df = load_data(train=False)
    df = transform_data(df)
    X, ids = prep_for_modeling(df, column='id', columns_to_drop=columns_to_drop[:-1])
    return X, ids
Example #2
def main():

    training_data, test_data, output_rsts = load_data()

    #######################################
    # Training
    #######################################
    input_layer_size = len(training_data[0][0])
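    # network shape: input layer sized to the data, one 30-unit hidden layer, 2 outputs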
    net = NN((input_layer_size, 30, 2), output_rsts)

    net.SGD(training_data, mini_batch_size=10, epochs=30, eta=3.0, test_data=test_data)
Example #3
def get_model(ticker):

    n_sizes = [14, 11, 8, 5, 3, 2]
    n_filters = 64
    es = EarlyStopping(monitor="val_loss", min_delta=10e-3, patience=10)
    red = ReduceLROnPlateau()

    inputs = Input((20, 11))

    convs = []
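    # one Conv1D branch per kernel size; each branch is pooled to a fixed-length vector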
    for size in n_sizes:

        conv = Conv1D(n_filters, size, strides=1, activation='relu')(inputs)
        #conv_max = GlobalMaxPooling1D()(conv)
        conv_avg = GlobalAveragePooling1D()(conv)
        #concat = concatenate([conv_max, conv_avg])
        convs.append(conv_avg)

    conv = concatenate(convs)
    dense = Dense(128)(conv)
    dropout = Dropout(0.2)(dense)
    dense = Dense(64)(dropout)
    dropout = Dropout(0.2)(dense)
    dense = Dense(32)(dropout)
    output = Dense(11)(dense)

    model = Model(inputs=[inputs], outputs=[output])
    model.compile('rmsprop', loss='mse')

    model.summary()

    X_train, y_train, X_val, y_val = dp.load_data(ticker)
    model.fit(x=X_train,
              y=y_train,
              epochs=100,
              batch_size=3200,
              validation_data=(X_val, y_val),
              callbacks=[es, red])
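    # re-wrap the trained network without its final Dense(11) layer so it outputs the 32-dim embedding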
    new_model = Model(inputs=[model.layers[0].output],
                      outputs=[model.layers[-2].output])
    new_model.save('{}/timeseries_embeddor'.format(dir_))

    return new_model
Example #4
def main():
    ## Load the config file and make sure the object types are correct.
    config: dict = {}
    if len(sys.argv) < 2:
        print("No argv provided, using DEFAULT config profile.")
        with open('./config/default.json') as jsonConfig:
            config = json.load(jsonConfig)
    else:
        print("Using ", sys.argv[1] + " config profile.")
        with open('./config/' + sys.argv[1] + '.json') as jsonConfig:
            config = json.load(jsonConfig)

    # get cuda device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ## Create checkpoint path
    if not os.path.exists("./checkpoints"):
        os.makedirs("./checkpoints")

    # default hyperparameters
    num_epoch = config['num_epoch']
    batch_size = config['batch_size']

    if config['cnn_type'] == "resnet50":
        model = models.resnet50(pretrained=True).to(device)
        # model = models.resnet50(pretrained=False).to(device)
        # feature extraction, disable to finetune whole model
        # for name, param in model.named_parameters():
        # if ("layer4" not in name):
        # if ("layer3" not in name):
        # if ("layer2" not in name):
        # param.requires_grad = False
        # print(name)

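        # replace the 1000-class ImageNet head with a 2-class linear classifier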
        num_ftrs = model.fc.in_features

        model.fc = nn.Sequential(
            # nn.Dropout(0.5),
            nn.Linear(num_ftrs, 2)).to(device)

        # params_to_update = []

        # for name, param in model.named_parameters():
        #     if param.requires_grad == True:
        #         params_to_update.append(param)
        #         print("\t",name)

        new_train_dataset, new_test_dataset, _ = data_prep.load_data()

        criterion = nn.CrossEntropyLoss()

        optimizer = optim.Adam(model.parameters(), lr=config["lr"])

        trainloader = torch.utils.data.DataLoader(
            new_train_dataset, batch_size=config['batch_size'], shuffle=True)

        unknownloader = torch.utils.data.DataLoader(new_test_dataset,
                                                    batch_size=22,
                                                    shuffle=False)

        model.train()
        train(trainloader, unknownloader, model, criterion, optimizer, device,
              config['num_epoch'])
        exit()

    elif config['cnn_type'] == "demo":
        print("Demo Time!")
        model = models.resnet50(pretrained=False).to(device)
        # model = models.vgg11_bn(pretrained=True).to(device)

        num_ftrs = model.fc.in_features
        model.fc = nn.Sequential(
            # nn.Dropout(0.5),
            nn.Linear(num_ftrs, 2)).to(device)
        # num_ftrs = model.classifier[6].in_features
        # model.classifier[6] = nn.Linear(num_ftrs, 2).to(device)

        # load the checkpoint that reached 99% accuracy
        if torch.cuda.is_available():
            model.load_state_dict(torch.load('checkpoints/100'))
        else:
            model.load_state_dict(torch.load('checkpoints/100', map_location='cpu'))

        new_train_dataset, new_test_dataset, raw_unknown_data = data_prep.load_data(
        )

        unknownloader = torch.utils.data.DataLoader(new_test_dataset,
                                                    batch_size=22,
                                                    shuffle=False)

        # my_raw_unknown_dataset = MyDataset(raw_unknown_data, unknown_labels, 224)
        # raw_unknownloader = torch.utils.data.DataLoader(my_unknown_dataset, batch_size=5,
        #                             shuffle=False)

        demo_test(unknownloader, raw_unknown_data, model, device)
        exit()

    else:
        print("No Model Provided!")
        exit()
Example #5
def main(enc='le'):
    X_train, X_test, y_train, y_test = data_prep.load_data()
    data_prep.describe_df(pd.DataFrame(X_train))
    if enc == 'ohe':
        X_train = data_prep.ohe_encode(
            X_train, np.delete(list(range(0, X_train.shape[1])), [1, 4, 12]))
        ohe = pickle.load(open('../res/ohe.pkl', 'rb'))
        X_test = ohe.transform(X_test)

    # =============================================================================
    # No feature engineering
    # =============================================================================
    print("Random Forest.....")
    clf = random_forest(X_train, y_train)
    print(test_clf(clf, X_test, y_test))
    print("Decision Tree.....")
    clf = dt(X_train, y_train)
    print(test_clf(clf, X_test, y_test))
    print("SVC.....")
    clf = svc(X_train, y_train)
    print(test_clf(clf, X_test, y_test))
    print("Adaboost.....")
    clf = ada(X_train, y_train)
    print(test_clf(clf, X_test, y_test))

    # =============================================================================
    # PCA
    # =============================================================================
    X_train_pca = data_prep.feature_selection_pca(X_train, 17)
    pca = pickle.load(open('../res/pca.pkl', 'rb'))
    X_test_pca = pca.transform(X_test)
    print("With PCA")
    print("Random Forest.....")
    clf = random_forest(X_train_pca, y_train)
    print(test_clf(clf, X_test_pca, y_test))
    print("Decision Tree.....")
    clf = dt(X_train_pca, y_train)
    print(test_clf(clf, X_test_pca, y_test))
    print("SVC.....")
    clf = svc(X_train_pca, y_train)
    print(test_clf(clf, X_test_pca, y_test))
    print("Adaboost.....")
    clf = ada(X_train_pca, y_train)
    print(test_clf(clf, X_test_pca, y_test))
    # =============================================================================
    # Select K features
    # =============================================================================
    if enc == "le":
        np.random.seed(9)
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)
        feat = data_prep.feature_selection(X_train, y_train, 15)
        X_train_feat = X_train[feat]
        X_test_feat = X_test[feat]
        print("With Features selector")
        print("Random Forest.....")
        clf = random_forest(X_train_feat, y_train)
        print(test_clf(clf, X_test_feat, y_test))
        print("Decision Tree.....")
        clf = dt(X_train_feat, y_train)
        print(test_clf(clf, X_test_feat, y_test))
        print("SVC.....")
        clf = svc(X_train_feat, y_train)
        print(test_clf(clf, X_test_feat, y_test))
        print("Adaboost.....")
        clf = ada(X_train_feat, y_train)
        print(test_clf(clf, X_test_feat, y_test))
Example #6
   
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import CustomObjectScope
from tqdm import tqdm

def read_image(path):  # function name assumed; the original snippet begins mid-function
    x = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    x = cv2.resize(x, (256, 256))
    x = x/255.0
    x = np.expand_dims(x, axis=-1)
    return x

def mask_parse(mask):
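    # stack the single-channel mask into an (H, W, 3) array for visualization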
    mask = np.squeeze(mask)
    mask = [mask,mask,mask]
    mask = np.transpose(mask, (1,2,0))
    return mask

if __name__ == "__main__":
    
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_data("dataset/")
    print(len(train_x), len(test_x))
    batch = 8
    
    test_dataset = tf_dataset(test_x, test_y, batch = batch)
    test_steps = len(test_x)//batch

    if len(test_x) % batch != 0:
        test_steps +=1

    with CustomObjectScope({'iou':iou}):
        model = tf.keras.models.load_model("files/model.h5")

    model.evaluate(test_dataset, steps = test_steps)

    for i , (x,y) in tqdm(enumerate(zip(test_x, test_y)), total = len(test_x)):
Example #7
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import recall_score, confusion_matrix

import data_prep

classifier_idx = ['SVC', 'DT', 'RF', 'MLP', 'ADA', 'NB']
classifiers = [
    SVC(gamma=1, C=0.001),
    DecisionTreeClassifier(max_depth=7,random_state=0),
    RandomForestClassifier(n_estimators= 1000,criterion='entropy',random_state=0),
    MLPClassifier(activation='relu',learning_rate='adaptive'),
    AdaBoostClassifier(DecisionTreeClassifier(random_state=0,max_depth=10),
                         algorithm="SAMME",
                         n_estimators=1000,random_state=0 ),
    GaussianNB()]

X_train,X_test,y_train,y_test = data_prep.load_data()
#data_prep.describe_df(pd.DataFrame(X_train))

def process_clf(X_train,y_train,X_test,y_test,filename):
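    # fit each classifier and record its accuracy, per-class recall, and confusion matrix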
    record = pd.DataFrame()
    for count, clf in enumerate(classifiers):
        print('Working on ' + classifier_idx[count])
        clf = clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        acc = sum(pred == y_test) / len(y_test)
        recall = recall_score(y_test, pred, average=None)
        rec = pd.DataFrame({'accuracy': acc,
                            'recall': recall,
                            'confusion_mat': str(confusion_matrix(y_test, pred)),
                            'classifier': classifier_idx[count]})
        record = pd.concat([record, rec])
    
    record.to_csv(filename)
Example #8
import pandas as pd
from data_prep import load_data


def category_column_correlation(column, target):
    '''
    Input:  Series - column to be dummied; Series - target, un-dummied
    Output: DataFrame - correlation coefficients between the dummies of column
                        and the dummies of target (the non-redundant block)
    '''
    column_dummies = pd.get_dummies(column)
    target_dummies = pd.get_dummies(target)
    correlation_df = pd.concat([column_dummies, target_dummies], axis=1).corr()
    return correlation_df.loc[column_dummies.columns, target_dummies.columns]


if __name__ == '__main__':
    df = load_data()
Example #9
import tensorflow as tf
import sys
from datetime import datetime
now = datetime.now()

# Supplementary files
import data_prep as data

print("--- Dependancies Loaded ---")

data_type = ""
if (len(sys.argv) == 2):
    data_type = sys.argv[1].lower()
    if (data_type == "raw"):
        print ("Loading raw data")
        m_data, idx_q, idx_a = data.load_data(True)
    elif (data_type == "clean"):
        print ("Loading clean data")
        m_data, idx_q, idx_a = data.load_data(False)
    else:
        sys.exit("Error, incorrect command line arguments. Please supply either 'raw' or 'clean' for which dataset should be used.")
else:
    sys.exit("Error, incorrect command line arguments. Please supply either 'raw' or 'clean' for which dataset should be used.")

(trX, trY), (teX, teY), (vaX, vaY) = data.split_dataset(idx_q, idx_a)

print("--- Data Loaded ---")

#Hyperparameters
epochs = 500
batch_size = 64
Example #10
import numpy as np
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall


def iou(y_true, y_pred):
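    # compute IoU in NumPy with 1e-15 smoothing and expose it as a TensorFlow op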
    def f(y_true, y_pred):
        intersection = (y_true * y_pred).sum()
        union = y_true.sum() + y_pred.sum() - intersection
        x = (intersection + 1e-15) / (union + 1e-15)
        x = x.astype(np.float32)
        return x

    return tf.numpy_function(f, [y_true, y_pred], tf.float32)


if __name__ == "__main__":

    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_data("dataset/")

    print(len(train_x), len(test_x))

    batch = 8
    lr = 1e-4
    epochs = 50

    train_dataset = tf_dataset(train_x, train_y, batch=batch)
    valid_dataset = tf_dataset(valid_x, valid_y, batch=batch)

    model = build_model()

    optimizer = tf.keras.optimizers.Adam(lr)
    metrics = ['acc', Recall(), Precision(), iou]
Example #11
import os

import torchvision
from PIL import ImageFile

import data_prep
import util

# %%

# Work around some truncated images: https://stackoverflow.com/a/23575424
ImageFile.LOAD_TRUNCATED_IMAGES = True

# %%

# prepare data

data_dir = "data"
all_photos_dir = os.path.join(data_dir, "photos")
split_photos_dir = os.path.join(data_dir, "from")

df = data_prep.load_data(os.path.join(data_dir, "_chat.txt"))
data_prep.show_plots(df)

dataloaders, dataset_sizes, class_names = \
    data_prep.prepare_loaders(split_photos_dir)

# %%

# View some data

# Get a batch of training data
inputs, classes = next(iter(dataloaders["train"]))

# Make a grid from batch
out = torchvision.utils.make_grid(inputs)
Example #12
def run_ap_model(outcome,
                 ier_prior=False,
                 measure='log_ratio',
                 include_smoking=False,
                 include_shs=True,
                 mono=False,
                 cvcv=False,
                 oap_gold_standard=False,
                 n_splines=100,
                 n_ns_knots=4,
                 n_s_knots=4,
                 n_bins=200):
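    # Fit an MR_BeRT spline-ensemble meta-regression for the given outcome and
    # return draws of the predicted exposure-risk curve (with and without random effects).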
    # load data
    if outcome.startswith('cvd'):
        age_adjust = True
    else:
        age_adjust = False
    df, model_cols, obs_mean, obs_std, study_sizes, N = load_data(
        outcome,
        measure=measure,
        age_adjust=age_adjust,
        include_smoking=include_smoking,
        include_shs=include_shs)
    # if outcome == 'lri':
    #     model_cols = ['child']
    #     add_age = False
    # elif outcome.startswith('cvd'):
    #     model_cols = ['incidence']
    #     add_age = False
    # else:
    #     model_cols = []
    #     add_age = False
    model_cols = []
    add_age = False

    if oap_gold_standard:
        model_cols = model_cols + ['other_ap']

    # check for NAs
    for model_col in model_cols:
        # df.loc[df[model_col].isnull(), model_col] = 0
        if len(df.loc[df[model_col].isnull()]) > 0:
            problem_nid_list = df.loc[df[model_col].isnull(), 'nid'].tolist()
            problem_nid = ', '.join(str(nid) for nid in problem_nid_list)
            raise ValueError(
                f'Missing value for {model_col} in NID(s) {problem_nid}')

    if add_age:
        assert df['median_age_fup'].max() > 0, \
            'Age included model with no age data.'

    # create spline
    ns_spline_mat = df.loc[df.ier_source != 'AS',
                           ['conc_den', 'conc']].values.flatten()
    spline_list = create_spline_list(ns_spline_mat,
                                     degree=3,
                                     n_knots=n_ns_knots,
                                     l_linear=False,
                                     r_linear=True,
                                     n_splines=n_splines,
                                     width_pct=0.2,
                                     l_zero=True)
    if include_smoking:
        s_spline_mat = df.loc[df.ier_source == 'AS',
                              'conc'].values  # just use tail end for smoking
        s_spline_list = create_spline_list(s_spline_mat,
                                           degree=3,
                                           n_knots=n_s_knots,
                                           n_splines=n_splines,
                                           width_pct=0.2,
                                           l_zero=False)
        for i in range(n_splines):
            spline_list[i].knots = np.hstack(
                [spline_list[i].knots, s_spline_list[i].knots])

    # covs and priors
    x_cov_list, z_cov_list = get_cov_lists(df,
                                           model_cols,
                                           measure=measure,
                                           add_age=add_age)
    prior_list = get_priors(outcome=outcome,
                            measure=measure,
                            n_ns_knots=n_ns_knots,
                            exp_spline=spline_list[0],
                            age_decreasing=False,
                            cvcv=cvcv,
                            mono=mono)

    if ier_prior:
        prior_list = add_ier_prior(prior_list, outcome, spline_list,
                                   n_ns_knots)

    # run meta-regression
    mr = MR_BeRT(obs_mean=obs_mean,
                 obs_std=obs_std,
                 study_sizes=study_sizes,
                 x_cov_list=x_cov_list,
                 z_cov_list=z_cov_list,
                 spline_list=spline_list,
                 inlier_percentage=0.9)
    mr.addPriors(prior_list)
    if measure == 'log_ratio':
        x0 = ratioInit(mr, 0)
    else:
        x0 = None
    mr.fitModel(x0=x0)
    mr.scoreModel(np.array([0.4, 0.6]))

    given_samples = get_parameter_samples(mr, len(mr.spline_list) * 10)

    # if include_smoking:
    #     exp_pred_array = np.linspace(spline_list[0].knots[0], spline_list[0].knots[n_ns_knots-1], int(n_bins / 2) + 1)
    #     s_exp_pred_array = np.linspace(spline_list[0].knots[n_ns_knots-1], spline_list[0].knots[-1], int(n_bins / 2))
    #     exp_pred_array = np.hstack([exp_pred_array[:-1], s_exp_pred_array])
    #     exp_pred_array = np.unique(exp_pred_array)
    # else:
    #     exp_pred_array = np.linspace(spline_list[0].knots[0], spline_list[0].knots[-1], n_bins)
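    # exposure prediction grid with progressively coarser spacing:
    # steps of 0.01 up to 10, 0.1 up to 100, 1 up to 1,000, and 10 up to 10,000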
    exp_pred_array = np.hstack([
        np.arange(0, 10, 0.01),
        np.arange(10, 100, 0.1),
        np.arange(100, 1000),
        np.arange(1000, 10010, 10)
    ])

    if add_age:
        age_pred_array = np.percentile(df['median_age_fup'], 50)
        age_pred_array = np.repeat(age_pred_array, n_bins)
    else:
        age_pred_array = None
    pred_x_cov_list, pred_z_cov_list = get_cov_lists(
        model_cols=model_cols,
        measure=measure,
        add_age=add_age,
        linear=False,
        pred=True,
        pred_ref=True,
        exp_pred_array=exp_pred_array,
        age_pred_array=age_pred_array)
    if len(x_cov_list) > 1:
        pred_x_cov_list_alt, pred_z_cov_list_alt = get_cov_lists(
            model_cols=model_cols,
            measure=measure,
            add_age=add_age,
            linear=False,
            pred=True,
            pred_ref=False,
            exp_pred_array=exp_pred_array,
            age_pred_array=age_pred_array)
        for i in range(len(pred_x_cov_list)):
            pred_x_cov_list[i]['mat'] = np.hstack(
                [pred_x_cov_list[i]['mat'], pred_x_cov_list_alt[i]['mat']])
        for i in range(len(pred_z_cov_list)):
            pred_z_cov_list[i]['mat'] = np.hstack(
                [pred_z_cov_list[i]['mat'], pred_z_cov_list_alt[i]['mat']])
    pred_x_cov_list_data_l, pred_z_cov_list_data_l = get_cov_lists(
        model_cols=model_cols,
        measure=measure,
        add_age=add_age,
        linear=False,
        pred=True,
        pred_ref=True,
        exp_pred_array=df['conc_den'].values)

    if measure == 'log_ratio':
        ref_point = spline_list[0].knots[0]
    elif measure == 'diff':
        ref_point = None
    y_samples = mr.predictData(
        pred_x_cov_list,
        pred_z_cov_list,
        sample_size=n_splines * 10,
        pred_study_sizes=[len(pred_x_cov_list[0]['mat'])],
        include_random_effect=True,
        ref_point=ref_point,
        **given_samples)[0]
    y_samples = np.vstack(y_samples)

    y_samples_fe = mr.predictData(
        pred_x_cov_list,
        pred_z_cov_list,
        sample_size=n_splines * 10,
        pred_study_sizes=[len(pred_x_cov_list[0]['mat'])],
        include_random_effect=False,
        ref_point=ref_point,
        **given_samples)[0]
    y_samples_fe = np.vstack(y_samples_fe)

    y_samples_fe_data_l = mr.predictData(pred_x_cov_list_data_l,
                                         pred_z_cov_list_data_l,
                                         sample_size=n_splines * 10,
                                         pred_study_sizes=mr.study_sizes,
                                         include_random_effect=False,
                                         ref_point=ref_point,
                                         **given_samples)[0]
    y_samples_fe_data_l = np.vstack(y_samples_fe_data_l)

    return df, mr, given_samples, pred_x_cov_list, y_samples, y_samples_fe, y_samples_fe_data_l