Example #1
    def test(self, test):
        # Run the prediction model on the testing dataset
        prediction = dict()
        for (i, sms) in enumerate(test):
            message = process(sms)
            prediction[i] = self.classify(message)
        return prediction
Example #2
def load_dataset(batch_size=128, load_caption=False):

    batch_train = process_data.process(batch_size=batch_size, extract_center=True, load_caption=load_caption)
    batch_val = process_data.inital_process(nb_sub=2000, batch_size=batch_size, img_path='val2014', extract_center=True, load_caption=load_caption)

    try:
        batch_val.vocab = batch_train.vocab
        batch_val.mapping = batch_train.mapping
        batch_val.process_captions()
    except Exception as e:
        print("Captions not processed")
        print(e)

    return batch_train, batch_val
Example #3
    def TF_and_IDF(self):
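        # Count spam (label 1) and ham (label 0) messages in the data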
        self.spam_num = self.label.value_counts()[1]
        self.ham_num = self.label.value_counts()[0]

        for i in range(self.sms.shape[0]):
            message = process(self.sms[i])
            word_list = list()

            # Calculate TF
            for word in message:
                if word not in word_list:
                    word_list += [word]
                if self.label[i]:
                    self.tf_spam[word] = self.tf_spam.get(word, 0) + 1
                else:
                    self.tf_ham[word] = self.tf_ham.get(word, 0) + 1

            # Calculate IDF
            for word in word_list:
                if self.label[i]:
                    self.idf_spam[word] = self.idf_spam.get(word, 0) + 1
                else:
                    self.idf_ham[word] = self.idf_ham.get(word, 0) + 1
Example #4
        min_dist, min_offset = None, None
        for j in range(polygon_points.shape[0]):
            node_value, polygon_point = node_values[i], polygon_points[j]
            # compute the distance once and reuse it for the comparison
            dist = np.linalg.norm(polygon_point - node_value)
            if min_dist is None or dist < min_dist:
                min_dist = dist
                min_offset = [
                    polygon_point[0] - node_value[0],
                    polygon_point[1] - node_value[1]
                ]
        offsets.append(min_offset)
    return np.array(offsets)


# Get training data
bboxes, polygon_labels = process()

# Define GCN models
epochs_per_image = 10
gcn_models = [None] * epochs_per_image

# Run stochastic training
for image in range(len(bboxes)):
    print("Training image {0} of {1}".format(image + 1, len(bboxes)))
    # Process bounding box
    bbox, polygon_points = bboxes[image], polygon_labels[image]
    resized_bb = cv2.resize(bbox, (224, 224), interpolation=cv2.INTER_AREA)
    resized_bb_exp = preprocess_input(np.expand_dims(resized_bb, axis=0))

    # Compute feature map
    resnet_model = ResNet50V2(weights='imagenet')
Example #5
import torch
import os
from torch.optim import Adam
from H_parse import H_parse
from model import build_model
from optimizer import *
from process_data import process

parse = H_parse()

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if parse.device == "cpu":
    DEVICE = torch.device("cpu")
data = process(parse)
model = build_model()
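# Noam learning-rate schedule: linear warm-up for 4000 steps, then inverse square-root decay, wrapped around Adam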
optim = NoamOpt(
    model.src_embed[0].d_model, 1, 4000,
    Adam(model.parameters(), lr=parse.lr_rate, betas=[0.9, 0.98], eps=1e-9))
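# Label-smoothed loss over the target vocabulary; the <pad> token index is excluded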
label_sm = LabelSmoothing(parse.tgt_vocab_len,
                          data.vocab["tgt"]["<pad>"],
                          smoothing=0.1)
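# Bundles the model's generator, the smoothed criterion, and the optimizer into a single loss/step helper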
losscompute = LossCompute(model.generator, label_sm, optim)
Example #6
trainData.reset_index(inplace=True)
# value_counts() orders by frequency, so with ham as the majority class index 0 is ham and index 1 is spam
training_counts = trainData['v1'].value_counts().tolist()
print("\nTraining data set: 80% of data set\nNumber of spam: ", training_counts[1], "\nNumber of ham: ", training_counts[0])
# Reset index in testing data set
testData.reset_index(inplace=True)
testing_counts = testData['v1'].value_counts().tolist()
print("\nTesting data set: 20% of data set\nNumber of spam: ", testing_counts[1], "\nNumber of ham: ", testing_counts[0])

# Training the TF-IDF model
tfidf = TFIDF_model(trainData)
tfidf.TF_and_IDF()
tfidf.TFIDF()

# Evaluate predictions on the held-out test messages against the true labels
metric(testData['v1'], tfidf.test(testData['v2']))

# Running examples
message1 = 'OMW. I will call you later.'
process1 = process(message1)
print("\nMessage 1: ", message1, "\nSpam = 1, Ham = 0: ", tfidf.classify(process1))

message2 = 'I will text you when I finish work'
process2 = process(message2)
print("\nMessage 2: ", message2, "\nSpam = 1, Ham = 0: ", tfidf.classify(process2))

message3 = 'You win a trip to Europe! Call now to redeem'
process3 = process(message3)
print("\nMessage 3: ", message3, "\nSpam = 1, Ham = 0: ", tfidf.classify(process3))

message4 = 'Text or call now for a week of FREE membership.'
process4 = process(message4)
print("\nMessage 4: ", message4, "\nSpam = 1, Ham = 0: ", tfidf.classify(process4))
Example #7
import tensorflow as tf
import model as m
import process_data
import matplotlib.pyplot as plt

num_samples = 7260  # Number of samples to train on.

encoder_input_data, decoder_input_data, decoder_target_data = process_data.process()
model = m.seq2seq()

callbacks = [
    # If 'val_loss' does not improve over 2 epochs, the training stops.
    tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),
    # Record logs for displaying on tensor board
    tf.keras.callbacks.TensorBoard(log_dir='./tensor_board')
]

history = model.fit([encoder_input_data, decoder_input_data],
                    decoder_target_data,
                    callbacks=callbacks,
                    batch_size=m.batch_size,
                    epochs=m.epochs,
                    validation_split=0.2)

# Save model
model.save_weights('./pretrained_weights/t1_savedModel', save_format='tf')

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
Example #8
File: svm.py Project: arvindr9/ml
# for evaluating against train size
train_performance = []
val_performance = []
test_performance = []



features = None
encodings = None
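# Sweep over training-set sizes; for each size, keep the SVM kernel with the best validation accuracy and record its train/val/test accuracy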
for train_frac in train_fracs:
    acc = []
    clfs = []
    for kernel in kernel_funcs:
        clf = SVC(kernel = kernel)
        (x_train, y_train), (x_val, y_val), (x_test, y_test) = process(all_data, train_frac, val_frac, test_frac)
        print(x_train.shape, y_train.shape)
        clf.fit(x_train, y_train)
        clfs.append(clf)
        acc.append(accuracy_score(clf.predict(x_val), y_val))
    optimal_index = acc.index(max(acc))
    optimal_kernel = kernel_funcs[optimal_index]
    optimal_clf = clfs[optimal_index]
    train_performance.append(accuracy_score(optimal_clf.predict(x_train), y_train))
    val_performance.append(accuracy_score(optimal_clf.predict(x_val), y_val))
    test_performance.append(accuracy_score(optimal_clf.predict(x_test), y_test))

print(train_performance)
print(val_performance)
print(test_performance)
Example #9
import csv
from process_data import process
from sklearn.decomposition import PCA


file = open('../data/mushroom-classification/mushrooms.csv')

all_data = list(csv.reader(file))
data_size = len(all_data) - 1

train_frac = 1.0

features = None
encodings = None

features, encodings, ((x_train, y_train), _, _) = process(all_data, train_frac, 0, 0, modify=True)

# Fit a 2-component PCA on the processed features for a 2-D visualization
pca = PCA(n_components=2)
pca.fit(x_train)
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import numpy as np
reduced = np.array(pca.transform(x_train))
x = reduced[:, 0]
y = reduced[:, 1]
colors = []
eps = 1e-4
for elt in y_train:
    if abs(1 - elt) < eps:
        colors.append('r')
Example #10
def run():
    x, y = process('nba_data2016-2018.csv')

    # learning rate for the algorithm
    learning_rate = 0.001

    # split into 75% train and 25% test
    train_size = 0.75
    X_train = x[:int(x.shape[0] * train_size), :]
    # print(X_train[len(X_train)-1])
    X_test = x[int(x.shape[0] * train_size):, :]
    Y_train = y[:int(y.shape[0] * train_size)]
    Y_test = y[int(y.shape[0] * train_size):]

    # network dimensions and randomly initialised weights and biases
    D = x.shape[1]
    M = 20
    B = np.random.rand(M)
    W1 = np.random.rand(D, M)
    B2 = np.random.rand(1)
    W2 = np.random.rand(M, 1)

    # each batch is now 6 "lines" because 3498/583=6
    batches = 583

    X_t = np.split(X_train, batches, axis=0)
    Y_t = np.split(Y_train, batches, axis=0)

    # IMPORTANT: statistic = (Yes/No - No/Yes)^2 / (Yes/No + No/Yes) is McNemar's test
    # (a type of chi-square) for comparing two binary classification algorithms;
    # with an alpha level of .05 the critical value is 3.84
    losses = []
    rates = 0

    for i in range(len(X_t)):
        X = X_t[i]
        Y = Y_t[i]

        parameters = feedforward(X, B, W1, B2, W2)

        Z2 = parameters["Z2"]
        Z1 = parameters["Z1"]
        l = cost(Z2, Y)
        losses.append(-l)

        # gradient of the cost w.r.t. the output, reused for every weight update
        grad = gradientDesc(Z2, Y)

        W2 += learning_rate * back_propW2(grad, parameters)
        B2 += learning_rate * back_propB2(grad, Z2)
        W1 += learning_rate * back_propW1(grad, W2, parameters)
        B += learning_rate * back_propB1(grad, W2, parameters)

        # accumulate accuracy (as a percentage) over the last batches
        if i > 481:
            rates += accuracy(Z2, Y) * 100

    print(rates / 100)
    plt.title('Classifier 1 (3 point percentage)')
    plt.plot(losses)
    plt.show()

    return rates / 100
Example #11
train_fracs = [0.0002, 0.0003, 0.0004, 0.0005, 0.006, 0.008, 0.001, 0.002, 0.005, 0.008, 0.01, 0.02, 0.05, 0.1, 0.2, 0.4, 0.6]

# for evaluating against train size
train_performance = []
val_performance = []
test_performance = []

features = None
encodings = None
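# Sweep over training-set sizes; for each size, keep the decision-tree depth with the best validation accuracy and record its train/val/test accuracy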
for train_frac in train_fracs:
    acc = []
    clfs = []
    for depth in depths:
        clf = DT(max_depth = depth)
        if depth == depths[0] and train_frac == train_fracs[0]:
            features, encodings, ((x_train, y_train), (x_val, y_val), (x_test, y_test)) = process(all_data, train_frac, val_frac, test_frac, modify = True)
        else:
            _, _, ((x_train, y_train), (x_val, y_val), (x_test, y_test)) = process(all_data, train_frac, val_frac, test_frac, features = features, encodings = encodings)
        print(x_train.shape, y_train.shape)
        clf.fit(x_train, y_train)
        clfs.append(clf)
        acc.append(accuracy_score(clf.predict(x_val), y_val))
    optimal_index = acc.index(max(acc))
    optimal_depth = depths[optimal_index]
    optimal_clf = clfs[optimal_index]
    train_performance.append(accuracy_score(optimal_clf.predict(x_train), y_train))
    val_performance.append(accuracy_score(optimal_clf.predict(x_val), y_val))
    test_performance.append(accuracy_score(optimal_clf.predict(x_test), y_test))

print(train_performance)
print(val_performance)
Example #12
def main():
    process("c444b776")
Example #13
def main():
    process("6d75e8bb")
Example #14
def clean_database():
    """
    Clean ec_students_[semester] and ec_classes_[semester] table

    :return: none
    """
    conn = mysql.connector.connect(**settings.MYSQL_CONFIG)
    cursor = conn.cursor()

    query = "TRUNCATE ec_students_%s" % get_semester_code_for_db(
        settings.SEMESTER)
    cursor.execute(query)
    query = "TRUNCATE ec_classes_%s" % get_semester_code_for_db(
        settings.SEMESTER)
    cursor.execute(query)
    cursor.close()
    conn.close()


if __name__ == "__main__":
    with open("stu_data_version.json") as f:
        old_json_file = json.load(f)["stu_data_json_name"]

    fix_json(old_json_file)
    clean_directory()
    retrieve()
    clean_database()
    process()
    verify()
Example #15
features = None
encodings = None

for train_frac in train_fracs:
    acc = []
    clfs = []
    best_acc = 0
    optimal_depth = None
    optimal_est = None
    optimal_clf = None
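    # Grid-search over max_depth and n_estimators; keep the AdaBoost model with the best validation accuracy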
    for depth in max_depths:
        for est in n_estimators:
            clf = AdaBoostClassifier(base_estimator=DT(max_depth=depth),
                                     n_estimators=est)
            ((x_train, y_train), (x_val, y_val),
             (x_test, y_test)) = process(all_data, train_frac, val_frac,
                                         test_frac)

            clf.fit(x_train, y_train)
            accuracy = accuracy_score(clf.predict(x_val), y_val)
            if accuracy > best_acc:
                best_acc = accuracy
                optimal_depth = depth
                optimal_est = est
                optimal_clf = clf
    train_performance.append(
        accuracy_score(optimal_clf.predict(x_train), y_train))
    val_performance.append(accuracy_score(optimal_clf.predict(x_val), y_val))
    test_performance.append(accuracy_score(optimal_clf.predict(x_test),
                                           y_test))

f1, ax1 = plt.subplots()
Example #16
                    val_loss = criterion(
                        output,
                        targets.view(batch_size * seq_length).long())

                    val_losses.append(val_loss.item())

                net.train()  # reset to train mode after iterating through validation data

                print("Epoch: {}/{}...".format(e + 1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))


# process() returns the character vocabulary and the full text encoded as integer indices
chars, encoded = process('../data/quotes_data.txt')

print(len(chars), len(encoded))

# Define and print the net
n_hidden = 512
n_layers = 2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

# Declaring the hyperparameters
batch_size = 128
seq_length = 100
n_epochs = 20  # start smaller if you are just testing initial behavior
Example #17
File: pca.py Project: arvindr9/ml
from sklearn.decomposition import PCA
from process_data import process
import csv

file = open('../data/pima-indians-diabetes-database/diabetes.csv')

all_data = list(csv.reader(file))
data_size = len(all_data) - 1

train_frac = 1
val_frac = 0.2
test_frac = 0.2

(x_train, y_train), _, _ = process(all_data, train_frac, 0, 0)

pca = PCA(n_components=2)
pca.fit(x_train)
import pandas as pd
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import numpy as np
reduced = np.array(pca.transform(x_train))
x = reduced[:, 0]
y = reduced[:, 1]
colors = []
eps = 1e-4
for elt in y_train:
    if abs(1 - elt) < eps:
        colors.append('r')
    else:
Example #18
import process_data
import tensorflow as tf

encoder_input_data, decoder_input_data, decoder_target_data = process_data.process(
    is_train=False)
base_model = tf.keras.models.load_model('./pretrained_weights/t1_baseline.h5')

print(str(encoder_input_data.shape))
print(str(decoder_input_data.shape))
print(str(decoder_target_data.shape))

for seq_index in range(10):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index:seq_index + 1]
    decoder_input = decoder_input_data[seq_index:seq_index + 1]
    target_data = decoder_target_data[seq_index:seq_index + 1]

    decoded_sentence = base_model.predict([input_seq, decoder_input])

    print('input shape : ' + str(input_seq.shape))
    print(str(input_seq))
    print('output shape : ' + str(decoded_sentence.shape))
    print(str(decoded_sentence))
    print('result : ' + str(target_data == decoded_sentence))

    # print('This is decoded result : ' + str(decoded_sentence))
Example #19
File: nn.py Project: arvindr9/ml
val_performance = []
test_performance = []

features = None
encodings = None

for train_frac in train_fracs:
    acc = []
    clfs = []
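    # Try each hidden-layer configuration and keep the network with the best validation accuracy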
    for hid_layer_specific in hid_layers:
        clf = NN(hid_layer_specific, activation='relu')
        if hid_layer_specific == hid_layers[0] and train_frac == train_fracs[0]:
            features, encodings, ((x_train, y_train), (x_val, y_val),
                                  (x_test, y_test)) = process(all_data,
                                                              train_frac,
                                                              val_frac,
                                                              test_frac,
                                                              modify=True)
        else:
            _, _, ((x_train, y_train), (x_val, y_val),
                   (x_test, y_test)) = process(all_data,
                                               train_frac,
                                               val_frac,
                                               test_frac,
                                               features=features,
                                               encodings=encodings)
        print(x_train.shape, y_train.shape)
        clf.fit(x_train, y_train)
        clfs.append(clf)
        acc.append(accuracy_score(clf.predict(x_val), y_val))
    optimal_index = acc.index(max(acc))
Example #20
    def get(self):
        print(path)
        return process_data.process(path)
Example #21
def main():
    process('7468f01a')
Example #22
File: nn.py Project: arvindr9/ml
train_frac = 0.7
#[0.0002, 0.0003, 0.0004, 0.0005,

# for evaluating against train size
train_performance = []
val_performance = []
test_performance = []

features = None
encodings = None

clf = NN(hid_layers, activation='relu')
features, encodings, ((x_train, y_train), (x_val, y_val),
                      (x_test, y_test)) = process(all_data,
                                                  train_frac,
                                                  val_frac,
                                                  test_frac,
                                                  modify=True)
print(x_train.shape, y_train.shape)
clf.fit(x_train, y_train)

print("Simulated annealing:")
acc_anneal = []
test_anneal = []

clf.coefs_ = []
clf.intercepts_ = []
#anneal(clf, hid_layers, x_train, y_train) #uses simulated annealing to find the optimal weights
anneal = NNAnneal(clf, hid_layers, x_train, y_train)
# anneal() returns the best state found (the network's weights and biases) and its energy
[clf.coefs_, clf.intercepts_], e = anneal.anneal()