Example No. 1
def load_data(path):
    train = pandas.read_csv(path)
    #train = train.head(100)
    target, train = io_yandex.get_value_column(train, 'Activity')
    train = train.values
    X_train, X_test, y_train, y_test = \
      train_test_split(train, target, test_size=0.8, random_state=241)
    return X_train, X_test, y_train, y_test
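All of these snippets rely on the helper io_yandex.get_value_column. Judging from how it is used, it splits off the named target column and returns it together with the remaining frame; a minimal sketch of such a helper (an assumption about mylib, not its actual code):

def get_value_column(df, column):
    # Assumed behaviour: return (target column, frame without that column).
    target = df[column]
    rest = df.drop(column, axis=1)
    return target, rest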
Example No. 2
def load_data(path):
    train = pandas.read_csv(path)
    #train = train.head(100)
    target, train = io_yandex.get_value_column(train, 'SalaryNormalized')
    train['FullDescription'] = train['FullDescription'].str.lower()
    train['FullDescription'] = train['FullDescription'].replace('[^a-z0-9]', ' ', regex=True)
    train['LocationNormalized'].fillna('nan', inplace=True)
    train['ContractTime'].fillna('nan', inplace=True)
    return target, train
Example No. 3
def load_data(path):
    train = pandas.read_csv(path)
    #train = train.head(100)
    target, train = io_yandex.get_value_column(train, 'SalaryNormalized')
    train['FullDescription'] = train['FullDescription'].str.lower()
    train['FullDescription'] = train['FullDescription'].replace('[^a-z0-9]',
                                                                ' ',
                                                                regex=True)
    train['LocationNormalized'].fillna('nan', inplace=True)
    train['ContractTime'].fillna('nan', inplace=True)
    return target, train
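Examples No. 2 and 3 only prepare the salary data; a hypothetical continuation (the file name, min_df value and column handling are assumptions, not part of these snippets) would vectorise the text and categorical columns before fitting a model:

from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

target, train = load_data('salary-train.csv')  # hypothetical file name
vectorizer = TfidfVectorizer(min_df=5)
X_text = vectorizer.fit_transform(train['FullDescription'])
encoder = DictVectorizer()
X_categ = encoder.fit_transform(
    train[['LocationNormalized', 'ContractTime']].to_dict('records'))
X = hstack([X_text, X_categ])  # combined sparse feature matrix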
Example No. 4
import os, sys
import pandas
from sklearn.decomposition import PCA
from numpy import corrcoef, argmax

PACKAGE_PARENT = "../.."
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
from mylib import io_yandex

train = pandas.read_csv('close_prices.csv')
target, train = io_yandex.get_value_column(train, 'date')
#print(train.head(5))

pca = PCA(n_components=10)
pca.fit(train)
ratio = 0.
number = 0
while ratio < 0.9 and number < len(pca.explained_variance_ratio_):
    ratio += pca.explained_variance_ratio_[number]
    number += 1
print(number, ratio)
io_yandex.print_result(str(number), '1_1.txt')

reduced = pca.transform(train)[:, 0]

real = pandas.read_csv('djia_index.csv')
real = real['^DJI']
correlation = corrcoef(reduced, real)[0, 1]
io_yandex.print_result(str(correlation), '1_2.txt')
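Note that argmax is imported above but never used; the component count can also be computed without the while loop. A vectorised sketch, assuming the fitted pca object from this script:

from numpy import cumsum

cumulative = cumsum(pca.explained_variance_ratio_)
# index of the first component at which 90% of the variance is reached, plus one
number = int(argmax(cumulative >= 0.9)) + 1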
Example No. 5
import os, sys
import pandas
from sklearn.decomposition import PCA
from numpy import corrcoef, argmax
PACKAGE_PARENT = "../.."
SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
from mylib import io_yandex


train = pandas.read_csv('close_prices.csv')
target, train = io_yandex.get_value_column(train, 'date')
#print(train.head(5))

pca = PCA(n_components=10)
pca.fit(train)
ratio = 0.
number = 0
while ratio < 0.9 and number < len(pca.explained_variance_ratio_):
	ratio += pca.explained_variance_ratio_[number]
	number += 1
print(number, ratio)
io_yandex.print_result(str(number), '1_1.txt')

reduced = pca.transform(train)[:,0]

real = pandas.read_csv('djia_index.csv')
real = real['^DJI']
correlation = corrcoef(reduced, real)[0, 1]
io_yandex.print_result(str(correlation), '1_2.txt')
Example No. 6
def prepare_data(df):
    # Body truncated in this excerpt; prepare_data is expected to return the
    # cleaned feature DataFrame used by the decision tree below.
    return df


def create_decision_tree(dataframe, value_column):
    clf = tree.DecisionTreeClassifier(random_state=241)
    return clf.fit(dataframe, value_column) 


def calculate_most_important_value(df, importances):
    first = [0, 0]     # [index, value] of the largest importance
    second = [0, 0]    # [index, value] of the second largest importance
    index = 0
    for value in importances:
        if value > first[1]:
            second = first
            first = [index, value]
        elif value > second[1]:
            second = [index, value]
        index += 1
    result = df.columns[first[0]] + ' ' + df.columns[second[0]]
    io_yandex.print_result(result, "1b.txt")


df = io_yandex.load_titanic_to_dataframe()  
df = prepare_data(df)
is_survived, df = io_yandex.get_value_column(df, 'Survived')
clf = create_decision_tree(df, is_survived)
importances = clf.feature_importances_
print(importances)
calculate_most_important_value(df, importances) #Fare Sex
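The stub at the top of this example only marks the truncation; a hypothetical full prepare_data for the Titanic data, consistent with the Fare/Sex importance result (the column list and Sex encoding are assumptions, not the original code):

def prepare_data(df):
    # Keep a few numeric features plus the 'Survived' target, drop incomplete rows.
    df = df[['Pclass', 'Fare', 'Age', 'Sex', 'Survived']].dropna()
    df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
    return df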
Example No. 7
def load_data():
	data_train = pandas.read_csv('./data/svm-data.csv', header=None)
	classes_train, data_train = io_yandex.get_value_column(data_train, 0)
	return data_train, classes_train
Example No. 8
def load_data(path):
    train = pandas.read_csv(path)
    #train = train.head(100)
    target, train = io_yandex.get_value_column(train, 'Rings')
    train['Sex'] = list(map(replace_sex, train['Sex']))
    return train, target
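replace_sex is defined elsewhere in the source file; for the abalone data (target column 'Rings') it presumably maps the Sex codes M/F/I to numbers. A hypothetical version:

def replace_sex(value):
    # Assumed encoding; the original helper may use different values.
    if value == 'M':
        return 1
    if value == 'F':
        return -1
    return 0  # 'I' (infant)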
Example No. 9
def load_data():
	data_train = pandas.read_csv('../data/perceptron-train.csv', header=None)
	classes_train, data_train = io_yandex.get_value_column(data_train, 0)
	data_test = pandas.read_csv('../data/perceptron-test.csv', header=None)
	classes_test, data_test = io_yandex.get_value_column(data_test, 0)
	return data_train, classes_train, data_test, classes_test
Example No. 10
def load_data():
    data_train = pandas.read_csv('../data/perceptron-train.csv', header=None)
    classes_train, data_train = io_yandex.get_value_column(data_train, 0)
    data_test = pandas.read_csv('../data/perceptron-test.csv', header=None)
    classes_test, data_test = io_yandex.get_value_column(data_test, 0)
    return data_train, classes_train, data_test, classes_test
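A hypothetical downstream use of this loader, in the spirit of the usual perceptron exercise (the classifier choice and metric are assumptions, not part of the snippet):

from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

data_train, classes_train, data_test, classes_test = load_data()
clf = Perceptron(random_state=241)
clf.fit(data_train, classes_train)
print(accuracy_score(classes_test, clf.predict(data_test)))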
Example No. 11
def cross_validate(df, classes, kf):
	accuracies = []
	for i in range(1, 51):
		classifier = KNeighborsClassifier(n_neighbors=i)
		score = cross_val_score(classifier, X=df, y=classes, cv=kf)
		accuracies.append(mean(score))
	return accuracies


def calculate_max_accuracies(df, classes, kf):
	accuracies = cross_validate(df, classes, kf)
	max_accuracy = max(accuracies)
	n_neighbors = accuracies.index(max_accuracy) + 1  # list index 0 corresponds to n_neighbors=1
	print(accuracies[2])
	return n_neighbors, max_accuracy


def print_n_neighbors_and_accuracies(df, classes, kf, path1, path2):
	n_neighbors, max_accuracy = calculate_max_accuracies(df, classes, kf)
	max_accuracy = io_yandex.two_digit_round(max_accuracy)
	io_yandex.print_result(str(n_neighbors), path1)
	io_yandex.print_result(max_accuracy, path2)


df = io_yandex.load_wine_to_dataframe()
classes, df = io_yandex.get_value_column(df, 0)
kf = KFold(len(df.index), n_folds=5, shuffle=True, random_state=42)
print_n_neighbors_and_accuracies(df, classes, kf, "1.txt", "2.txt")
df = scale(df)
print_n_neighbors_and_accuracies(df, classes, kf, "3.txt", "4.txt")