def main():
    # Load the data set
    df = data()

    X = df[[col for col in df if col not in ['label', 'class']]]
    y = df['class'].values

    # Binarize the categorical data using a DictVectorizer
    # This requires the data be fed in the form of Python dicts
    vectorizer = DV(sparse=False)
    X_binarized = vectorizer.fit_transform(X.to_dict(orient='records'))

    X_binarized = np.array(X_binarized)

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, train_size=.8, random_state=42)
    
    # My implementation
    classifier = LogisticRegression()
    classifier.fit(y_train, X_train, regularization='l1')
    l1_error = 1 - classifier.accuracy(y_test, X_test)
    
    classifier = LogisticRegression()
    classifier.fit(y_train, X_train, regularization='l2')
    l2_error = 1 - classifier.accuracy(y_test, X_test)
    
    
    print('LogisticRegression with L1 regularization \nError: {}'.format(l1_error))
    print('LogisticRegression with L2 regularization \nError: {}'.format(l2_error))
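
For reference, a minimal sketch (with hypothetical column names) of what the DictVectorizer binarization produces: string values become one-hot indicator columns, while numeric values pass through unchanged.

from sklearn.feature_extraction import DictVectorizer

# each dict is one row of the frame
rows = [{'workclass': 'Private', 'age': 39},
        {'workclass': 'State-gov', 'age': 50}]
vec = DictVectorizer(sparse=False)
print(vec.fit_transform(rows))
# [[39.  1.  0.]
#  [50.  0.  1.]]
print(vec.feature_names_)
# ['age', 'workclass=Private', 'workclass=State-gov']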
Example #2
def train_model(self):
    k = model.modeler()
    ux = tf.placeholder(shape=(None, 4), dtype=tf.float32)  # ux holds the ground-truth labels
    # clip the predictions away from zero so tf.log never sees log(0)
    loss = -tf.reduce_mean(ux * tf.log(tf.clip_by_value(k.model, 1e-10, 1.0)))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        data = gt.data()
        for i in range(200000):  # the iteration count could come from the config file
            sess.run(train_step,
                     feed_dict={
                         k.x: data.train_future,
                         ux: data.train_label
                     })
            print(
                sess.run(loss,
                         feed_dict={
                             k.x: data.train_future,
                             ux: data.train_label
                         }))
        # save the model once training is done
        saver = tf.train.Saver()
        PATH = dnn.CONFIG['train']['save_path']
        saver.save(sess, PATH)
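
A minimal numeric illustration (separate from the script above) of why the lower clip bound must be strictly positive:

import numpy as np

p = np.array([0.0, 0.5, 1.0])           # raw model outputs
print(np.log(p))                         # log(0) = -inf, which poisons the mean loss
print(np.log(np.clip(p, 1e-10, 1.0)))   # clipped: the worst case is log(1e-10), about -23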
Example #3
def main():
    df = data()
    
    X = df[[col for col in df if col not in ['class']]]
    y = df['class'].values

    # Binarize the categorical data using a DictVectorizer
    # This requires the data be fed in the form of Python dicts
    vectorizer = DV(sparse=False)
    X_binarized = vectorizer.fit_transform(X.to_dict(orient='records'))


    # Split into test and train sets
    X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, train_size=.8, random_state=42)
    
    C = np.linspace(.1, 10, 100).tolist() + np.linspace(20,100, 5).tolist() + \
        np.linspace(200, 1000, 9).tolist()
        
    param_grid = list(ParameterGrid({'C': C, 'penalty': ['l1']}))
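    # ParameterGrid expands the grid into one dict per combination, e.g.
    # [{'C': 0.1, 'penalty': 'l1'}, {'C': 0.2, 'penalty': 'l1'}, ...]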
    
    for params in param_grid:
    
        classifier = LogisticRegression(**params)

        # Fit the model to the training data
        t0 = time()
        classifier.fit(X_train, y_train)
        t1 = time()

        accuracy = classifier.score(X_test, y_test)
        error = (1 - accuracy)

        print('\nTest error: {} Time to train: {}'.format(error, t1 - t0))
        print('Params: {}'.format(params))
Example #4
def main():
    df = data()

    # Test run
    classifiers = [{'classifier': LogisticRegression,
                    'classifier_name': 'Logistic Regression L2',
                    'param_grid': {'C': [.1], 'max_iter': [10], 'penalty': ['l2']}}]

    # To complete the full analysis, please uncomment the following block of code
    # and run this script. 
    # NOTE This will take several hours to run
    
    ##classifiers = [{'classifier': LogisticRegression, 
    ##               'classifier_name': 'Logistic Regression L2',
    ##                'param_grid': {'C': np.linspace(.1, 5, 50), 'max_iter': [10,50,100,200], 'penalty': ['l2']} }]

    feature_selection = FeatureSelection(df, classifiers)

    feature_selection.select_features()
Example #5
def main():
    df = data()
    
    X = df[[col for col in df if col not in ['class']]]
    y = df['class'].values

    # Binarize the categorical data using a DictVectorizer
    # This requires the data be fed in the form of Python dicts
    vectorizer = DV(sparse=False)
    X_binarized = vectorizer.fit_transform(X.to_dict(orient='records'))


    # Split into test and train sets
    X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, train_size=.8, random_state=42)
    
    # Due to the amount of time it takes to train an SVM on the data, I've commented out the true
    # parameters I used and only included a subset. If you'd like to run the full analysis, please
    # comment out the next line and uncomment the line after that
    # Note: This will still take several minutes to run
    param_grid = list(ParameterGrid({'C': [1], 'kernel': ['rbf']}))
    #param_grid = list(ParameterGrid({'C': [1, 10, 100, 1000], 'kernel': ['linear', 'rbf']}))
    
    for params in param_grid:
    
        classifier = SVC(**params)

        # Fit the model to the training data
        t0 = time()
        classifier.fit(X_train, y_train)
        t1 = time()

        accuracy = classifier.score(X_test, y_test)
        error = (1 - accuracy)

        print('\nTest error: {} Time to train: {}'.format(error, t1 - t0))
        print('Params: {}'.format(params))
Example #6
def main():

    df = data()

    for train_size in np.linspace(0.5, 0.9, 5):

        train, test = train_test_split(df, train_size=train_size, random_state=42)

        # Since there is only 1 sample with native-country == Holand-Netherlands,
        # ensure that this sample is in the training set
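        # (otherwise the classifier would meet a categorical value at test time
        # for which no probability was learned)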
        if "Holand-Netherlands" in test["native-country"].unique():
            train = train.append(test[test["native-country"] == "Holand-Netherlands"])
            test = test[test["native-country"] != "Holand-Netherlands"]

        for ignore_missing in [True, False]:
            nb = NaiveBayes(ignore_missing=ignore_missing)
            nb.learn_parameters(train)
            acc = nb.score(test)  # test no longer contains the Holand-Netherlands row

            print(
                "\nTrain size: {} Test error: {} Ignore features with missing values: {}".format(
                    train_size, (1 - acc), ignore_missing
                )
            )
Example #7
File: main.py  Project: 213584adghj/ml
# -*- coding: utf-8 -*-
import get_data as gt
import train

if __name__ == "__main__":
    data = gt.data()  # load the data
    model = train.trainer(data.all_data)  # train the model
    result_model = model.model  # clustering result
    print(result_model.predict(data.all_data))
Example #8
# translate each digit to an array of length 8 (segments + decimal point) for the YSD-160AR4B-8 display
to7seg = {
    0:[1,0,0,0,1,0,0,0],
    1:[1,1,1,0,1,0,1,1],
    2:[0,1,0,0,1,1,0,0],
    3:[0,1,0,0,1,0,0,1],
    4:[0,0,1,0,1,0,1,1],
    5:[0,0,0,1,1,0,0,1],
    6:[0,0,0,1,1,0,0,0],
    7:[1,1,0,0,1,0,1,1],
    8:[0,0,0,0,1,0,0,0],
    9:[0,0,0,0,1,0,1,1]}
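# note: 0 means "segment lit" and 1 means "off", consistent with a common-anode
# display; judging by the digit-8 row, index 4 is the (unlit) decimal point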

while True:
    # get the latest readings from get_data.py
    # (bound to a new name so the data() function is not shadowed between iterations)
    readings = data()

    # choose what goes in each digit: 0 - ones place, 1 - tens place
    swell_h0 = int(round(readings['swell']['h'] % 10))
    swell_h1 = int(readings['swell']['h'] / 10)
    swell_p0 = int(round(readings['swell']['p'] % 10))
    swell_p1 = int(readings['swell']['p'] / 10)

    wind_k0 = int(round(readings['wind']['kts'] % 10))
    wind_k1 = int(readings['wind']['kts'] / 10)
    wind_g0 = int(round(readings['wind']['gust'] % 10))
    wind_g1 = int(readings['wind']['gust'] / 10)

    # pair digits back up for the shift register
    swell_h = to7seg[swell_h0] + to7seg[swell_h1]
    swell_p = to7seg[swell_p0] + to7seg[swell_p1]
Example #9
def main():
    choice = input('Which model do you want to use? (conv_seq2seq / transformer) ')
    train_iterator, valid_iterator, test_iterator, input_dim, output_dim, pad_idx = data()
    if choice == 'conv_seq2seq':

        EMB_DIM = 256
        HID_DIM = 512
        ENC_LAYERS = 10
        DEC_LAYERS = 10
        ENC_KERNEL_SIZE = 3
        DEC_KERNEL_SIZE = 3
        ENC_DROPOUT = 0.25
        DEC_DROPOUT = 0.25
        N_EPOCHS = 6
        CLIP = 1
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        enc = Encoder2(input_dim, EMB_DIM, HID_DIM, ENC_LAYERS,
                       ENC_KERNEL_SIZE, ENC_DROPOUT, device)
        dec = Decoder2(output_dim, EMB_DIM, HID_DIM, DEC_LAYERS,
                       DEC_KERNEL_SIZE, DEC_DROPOUT, pad_idx, device)

        model = Seq2Seq2(enc, dec, device).to(device)
        optimizer = optim.Adam(model.parameters())
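        # padding positions are masked out of the loss via ignore_index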
        criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

        best_valid_loss = float('inf')

        for epoch in range(N_EPOCHS):

            start_time = time.time()

            train_loss = train_conv(model, train_iterator, optimizer,
                                    criterion, CLIP)
            valid_loss = evaluate_conv(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), 'model_conv.pt')

            print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
            print(
                f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}'
            )
            print(
                f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}'
            )
        choice = input('Do you want to evaluate the model?')
        if choice == 'yes':
            # load the checkpoint this branch actually saved
            model.load_state_dict(torch.load('model_conv.pt'))

            test_loss = evaluate_conv(model, test_iterator, criterion)

            print(
                f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |'
            )

    elif choice == 'transformer':
        hid_dim = 512
        n_layers = 6
        n_heads = 8
        pf_dim = 2048
        dropout = 0.1
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        enc = Encoder(input_dim, hid_dim, n_layers, n_heads, pf_dim,
                      EncoderLayer, SelfAttention, PositionwiseFeedforward,
                      dropout, device)
        dec = Decoder(output_dim, hid_dim, n_layers, n_heads, pf_dim,
                      DecoderLayer, SelfAttention, PositionwiseFeedforward,
                      dropout, device)
        model = Seq2Seq(enc, dec, pad_idx, device).to(device)
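        # Xavier/Glorot-initialize every weight matrix (1-D parameters such as biases are skipped)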
        for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
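        # NoamOpt wraps Adam with the warmup-then-inverse-sqrt learning-rate schedule
        # from "Attention Is All You Need" (assumed to be the usual annotated-Transformer helper)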
        optimizer = NoamOpt(
            hid_dim, 1, 2000,
            torch.optim.Adam(model.parameters(),
                             lr=0,
                             betas=(0.9, 0.98),
                             eps=1e-9))
        criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

        N_EPOCHS = 10
        CLIP = 1

        best_valid_loss = float('inf')

        for epoch in range(N_EPOCHS):

            start_time = time.time()

            train_loss = train(model, train_iterator, optimizer, criterion,
                               CLIP)
            valid_loss = evaluate(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), 'model_transformer.pt')

            print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
            print(
                f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}'
            )
            print(
                f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}'
            )
        choice = input('Do you want to evaluate the model?')
        if choice == 'yes':
            model.load_state_dict(torch.load('model_transformer.pt'))

            test_loss = evaluate(model, test_iterator, criterion)

            print(
                f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |'
            )
Example #10
# -*- coding: utf-8 -*-
import model
import get_data as gt
import train
import sys
import presict as pre

sys.path.append('...')
from conf import dnn

if __name__ == "__main__":
    q = train.trainer()  # train and save the model (trainer is an assumed entry point; the train module itself is not callable)
    u = gt.data()
    test_data = u.birth_data(dnn.CONFIG['predict']['amount'])
    result = pre.predict()
    pre.predict_()  # load the saved model and run prediction
Example #11
import tensorflow as tf
import numpy as np
from get_data import data
'''
seq2seq model
'''

#load data
data = data("data/small_vocab_en", "data/small_vocab_fr")


#encoder
def encoder(xs, source_lens, hidden_size, num_layers, embedding_size):
    #eng_embed = tf.Variable([len(data.eng_word2id.keys()), embedding_size], dtype=tf.float32)
    #seq_embedding = tf.nn.embedding_lookup(eng_embed, xs)
    #print(seq_embedding.shape)

    def get_lstmCell(hidden_size):  # since TF 1.5, [lstm_cell] * num_layers can no longer be used to stack layers
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(hidden_size)
        return lstm_cell

    #LSTM model
    mlstm_cell = tf.contrib.rnn.MultiRNNCell(
        [get_lstmCell(hidden_size) for _ in range(num_layers)])
    #init_s = mlstm_cell.zero_state(batch_size, dtype=tf.float32)
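    # embed_sequence builds a [vocab_size, embedding_size] embedding matrix and
    # maps each token id in xs to its embedding vector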
    seq_embedding = tf.contrib.layers.embed_sequence(
        xs, len(data.eng_word2id.keys()), embedding_size)
    en_outputs, en_final_states = tf.nn.dynamic_rnn(mlstm_cell,
                                                    seq_embedding,
                                                    source_lens,
                                                    dtype=tf.float32)
Example #12
File: train.py  Project: 213584adghj/ml
def __init__(self):
    self.data = gt.data()
    self.parameter = kn.CONFIG['train']['parameter']
    self.model = self.get_model()
Example #13
'''
Sequence to Sequence with Attention Model
'''

import tensorflow as tf
import numpy as np
import sys
sys.path.append('../seq2seq_machineTranslation')
from get_data import data
'''
seq2seq model
'''

#load data
data = data("../seq2seq_machineTranslation/data/small_vocab_en",
            "../seq2seq_machineTranslation/data/small_vocab_fr")


#encoder
def encoder(xs, source_lens, hidden_size, num_layers, embedding_size):
    #eng_embed = tf.Variable([len(data.eng_word2id.keys()), embedding_size], dtype=tf.float32)
    #seq_embedding = tf.nn.embedding_lookup(eng_embed, xs)
    #print(seq_embedding.shape)

    def get_lstmCell(hidden_size):  # since TF 1.5, [lstm_cell] * num_layers can no longer be used to stack layers
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(hidden_size)
        return lstm_cell

    #LSTM model
    mlstm_cell = tf.contrib.rnn.MultiRNNCell(
        [get_lstmCell(hidden_size) for _ in range(num_layers)])