Example #1
import os

import pandas as pd
from sklearn.model_selection import train_test_split

def main():
    """intakes, concats, splits, saves"""
    ## load and concat
    files = os.listdir(INPUT_PATH)
    # os.listdir returns bare file names, so join them back onto the input directory
    df_multi = [pd.read_csv(os.path.join(INPUT_PATH, file)) for file in files]
    df_full = pd.concat(df_multi)
    ## split
    df_inter, df_test = train_test_split(df_full,
                                         test_size=PERCENT,
                                         random_state=SEED)
    df_train, df_val = train_test_split(df_inter,
                                        test_size=PERCENT,
                                        random_state=SEED)
    ## save
    df_train.to_csv(OUTPUT_PATH_TRAIN)
    df_val.to_csv(OUTPUT_PATH_VAL)
    df_test.to_csv(OUTPUT_PATH_TEST)
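The module-level constants that main() references are not part of the snippet; a minimal sketch of plausible definitions (every name and value below is an assumption, not the original configuration):

# Hypothetical configuration; the original values are not shown in the snippet.
INPUT_PATH = 'data/raw/'              # directory holding the input CSVs
OUTPUT_PATH_TRAIN = 'data/train.csv'
OUTPUT_PATH_VAL = 'data/val.csv'
OUTPUT_PATH_TEST = 'data/test.csv'
PERCENT = 0.2                         # fraction held out at each split
SEED = 42                             # fixed seed for reproducible splits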
Example #2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

def get_divided(df):
    """Split df into train/test matrices for the 'price actual' target."""
    # .as_matrix() was removed in pandas 1.0; use .to_numpy() instead
    X = df.drop('price actual', axis=1).to_numpy()
    y = df['price actual'].to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2)
    # plot the first feature against the target for a quick sanity check
    plt.plot(X_train[:, 0], y_train, 'ro', label='Original data')
    plt.title('Linear Regression Result')
    plt.legend()
    plt.show()
    return X_train, X_test, y_train, y_test
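A minimal usage sketch, assuming any CSV with a 'price actual' column (the file name is a placeholder):

import pandas as pd

df = pd.read_csv('energy_prices.csv')  # hypothetical input file
X_train, X_test, y_train, y_test = get_divided(df)
print(X_train.shape, X_test.shape)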
Example #3
import pickle
from pathlib import Path

import numpy as np
from sklearn.model_selection import train_test_split

def split_data(self, data):
    """Split data into training and validation sets."""
    validation_file = Path("../data/valid.p")
    if validation_file.is_file():  # pathlib.Path has is_file(), not isfile()
        with open(validation_file, mode='rb') as f:
            validate = pickle.load(f)
        # astype returns a copy, so the result has to be assigned back
        self.X_validate = validate['features'].astype(np.float32)
        self.y_validate = validate['labels'].astype(np.float32)
    else:
        # train_test_split returns (X_train, X_test, y_train, y_test), in that order
        self.X_train, self.X_validate, self.y_train, self.y_validate = train_test_split(
            data['features'], data['labels'], test_size=0.33, random_state=0)
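The method loads ../data/valid.p when it exists, but the snippet never shows that pickle being written; a minimal sketch of the writing side, assuming the same {'features': ..., 'labels': ...} layout and arrays already in hand:

import pickle

# Hypothetical writer for the validation pickle that split_data() expects.
with open("../data/valid.p", mode='wb') as f:
    pickle.dump({'features': X_validate, 'labels': y_validate}, f)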
Example #4
import pickle

import numpy as np
from sklearn.model_selection import train_test_split

x, y = [], []
for trainFile in train_files:  # train_files is assumed to be defined upstream
    # context managers make sure the pickle files are closed after loading
    with open(trainFile + ".feature", "rb") as f:
        temp_x = pickle.load(f)
    with open(trainFile + ".label", "rb") as f:
        temp_y = pickle.load(f)

    x.append(temp_x)
    y.append(temp_y)

x = np.array(x)
y = np.array(y)

print(x.shape)

# max_x = np.amax(x)
# x = x/max_x

# train_test_split lives in sklearn.model_selection, not on the sklearn package itself
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.15,
                                                    random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train,
                                                  test_size=0.25,
                                                  random_state=0)

x_dummy = x_train[:5]
y_dummy = y_train[:5]


def get_siamese_model(input_shape):
    """
        Model architecture
    """
Example #5
# y_training dim 30 * 2 (the first column is x and the second is y)
y_list = [df_train_y.iloc[(i*30):(i*30+30), -2:] for i in range(2308)]

# preprocessing to remove bad samples; delete from the highest index down
# so the earlier positions don't shift under the later deletions
for i in (2166, 858, 593):
    del X_list[i]
    del y_list[i]

# hold out 20% of the series (the keyword is test_size, not size)
X_list_tr, X_list_te, y_list_tr, y_list_te = train_test_split(X_list, y_list, test_size=0.2)

# 2308 total samples, 2305 good samples
# a list comprehension builds 2308 distinct models; multiplying one instance
# by 2308 would alias the same model object at every index
model_list = [ARIMA.ARIMA(learning_rate=1, iterations=1000, l1_penality=1, lag=6)
              for _ in range(2308)]
t = time.time()
num_test = 2305

# Fit the models
for i in range(num_test):
    model_list[i].fit(X_list[i], y_list[i])
    print(str(i) + " is okay.")
print("Fitting took %d seconds" % (time.time()-t))

df_test_X = pd.read_csv('../data/test_transformed.csv')
X_test_list = [pd.DataFrame()] * 20
for i in range(20):
    ...  # the loop body is cut off in the source
Example #6
# Logistic Regression #

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# X and y are assumed to be defined upstream; the original call passed no arrays at all
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

from sklearn.preprocessing import StandardScaler
SS = StandardScaler()
X_train = SS.fit_transform(X_train)
X_test = SS.transform(X_test)  # transform only: the scaler must be fit on training data alone

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

'''
Confusion matrix layout (sklearn convention: rows = true class, columns = predicted class):
TN FP
FN TP

CorrectPrediction WrongPrediction
WrongPrediction   CorrectPrediction

Check accuracy by using the accuracy score, sensitivity, specificity and precision.
'''
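The note above says to check accuracy, sensitivity, specificity and precision; a minimal sketch deriving all four from the binary confusion matrix cm computed above:

# Unpack the 2x2 confusion matrix (sklearn order: [[TN, FP], [FN, TP]]).
tn, fp, fn, tp = cm.ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
sensitivity = tp / (tp + fn)   # recall / true positive rate
specificity = tn / (tn + fp)   # true negative rate
precision = tp / (tp + fp)

print(accuracy, sensitivity, specificity, precision)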
Example #7
# import data
import numpy as np
import pandas as pd

# the file ships with a header row, so let read_csv use it (header=0 by default)
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data')
print(df.tail())

y = df.iloc[:, 4]                               # target: donated in March 2007
df.drop(df.columns[[2]], axis=1, inplace=True)  # drop Monetary, proportional to Frequency
X = df.iloc[:, 0:3]
#-----------------------------------------------------------------------------
# data pre-processing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))
#-----------------------------------------------------------------------------

from sklearn.linear_model import LogisticRegression
import plot_decision_regions as pp
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
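Example #7 stops right after the classifier imports; a minimal continuation that fits both imported classifiers on the standardized split (the hyperparameters are assumptions):

# Hypothetical continuation: fit and score both classifiers.
lr = LogisticRegression(random_state=0)
lr.fit(X_train_std, y_train)
print('logistic regression accuracy:', lr.score(X_test_std, y_test))

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_std, y_train)
print('k-NN accuracy:', knn.score(X_test_std, y_test))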
Example #8
import pandas
from sklearn.model_selection import train_test_split

# from the Medium article https://towardsdatascience.com/how-to-build-a-simple-song-recommender-296fcbc8c85

# these lines did not work in the IDE
#triplets_file = 'https://static.turi.com/datasets/millionsong/10000.txt'
#songs_metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'
#song_df_1 = pandas.read_table(triplets_file,header=None)
#song_df_1.columns = ['user_id', 'song_id', 'listen_count']
#song_df_2 =  pandas.read_csv(songs_metadata_file)
#song_df = pandas.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left")
#print('Data imported')

song_df = pandas.read_csv('song_df.csv')
print(song_df.head())
users = song_df['user_id'].unique()
print(len(users))
songs = song_df['song_id'].unique()
print(len(songs))
train_data, test_data = train_test_split(song_df,
                                         test_size=0.20,
                                         random_state=0)
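The split is where the snippet stops; the cited article goes on to build a popularity-based recommender from train_data, which in plain pandas might look like this sketch (the aggregation choice is an assumption):

# Hypothetical popularity baseline: rank songs by how many users listened to them
# in the training data, then recommend the same top-N list to everyone.
popularity = (train_data.groupby('song_id')['user_id']
              .count()
              .sort_values(ascending=False))
top_10 = popularity.head(10).index.tolist()
print(top_10)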
Example #9
"""
Created on Tue Jul 17 16:12:01 2018

@author: xzc
"""

# read data from a csv file and train_test_split
import pandas as pd
from sklearn.model_selection import train_test_split

housing = pd.read_csv('cal_housing_clean.csv')
print(housing.head())
y_val = housing['medianHouseValue']
x_data = housing.drop('medianHouseValue', axis=1)

X_train, X_test, y_train, y_test = train_test_split(x_data,
                                                    y_val,
                                                    test_size=0.3,
                                                    random_state=101)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(data=scaler.transform(X_train),
                       columns=X_train.columns,
                       index=X_train.index)
X_test = pd.DataFrame(data=scaler.transform(X_test),
                      columns=X_test.columns,
                      index=X_test.index)

# create feature columns
print(housing.columns)
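The snippet ends at listing the columns under the "create feature columns" comment; a minimal sketch of that step, assuming TensorFlow 1.x's tf.feature_column API:

import tensorflow as tf

# Hypothetical: one numeric feature column per input column left in x_data.
feat_cols = [tf.feature_column.numeric_column(col) for col in x_data.columns]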
Example #10
import os

import numpy as np
from imageio import imread  # the original import is not shown; imageio is one option
from sklearn import svm
from sklearn.model_selection import train_test_split

generated_filepath = '../Generated/'
n_signs = 29
y_base = np.zeros((n_signs,), dtype=int)  # np.int has been removed from NumPy
x_rows = []
y_rows = []
for sign_count in range(n_signs):
    inputdir = generated_filepath + str(sign_count) + '/'
    y_sign = y_base.copy()   # copy, so the one-hot edit doesn't mutate y_base
    y_sign[sign_count] = 1
    for image in os.listdir(inputdir):
        img_matrix = imread(os.path.join(inputdir, image))
        img_array = img_matrix.ravel()
        y_rows.append(y_sign)
        x_rows.append(img_array)
x_matrix = np.array(x_rows)
y_matrix = np.array(y_rows)

#full_matrix = np.concatenate((x_matrix, y_matrix), axis=1)
#np.random.shuffle(full_matrix)  # shuffles in place and returns None

#rows, columns = np.shape(full_matrix)
#x_matrix = full_matrix[:, 0:columns-n_signs]
#y_matrix = full_matrix[:, columns-n_signs:]

X_train, X_test, Y_train, Y_test = train_test_split(x_matrix, y_matrix, test_size=0.33)

clf = svm.SVC()
# SVC expects 1-D class labels, so collapse the one-hot rows to class indices
clf.fit(X_train, Y_train.argmax(axis=1))

accuracy = clf.score(X_test, Y_test.argmax(axis=1))