Example #1
def psnr_and_save(floats, clusters, means, x, y, means_type):
    # `directory`, `n` and `floats2D` are globals defined earlier in the file.
    _floats = colorize(floats, clusters, means, x, y)
    imsave(arr=_floats, fname=directory + str(n) + '_' + means_type + '.jpg')
    psnr = PSNR(_floats, floats2D)
    print(str(n) + ' ' + means_type + ': ' + str(psnr))
    if psnr > 20.:
        io_yandex.print_result(str(n), '1.txt')
        sys.exit(0)
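
# Added note: `PSNR` above is a project helper defined elsewhere. A minimal
# sketch of the standard peak signal-to-noise ratio for images scaled to
# [0, 1] (a hypothetical reconstruction, not necessarily the repo's version):
import numpy

def PSNR(approx, original, max_value=1.0):
    # Mean squared error between the two images, then the usual log formula.
    mse = numpy.mean((numpy.asarray(approx) - numpy.asarray(original)) ** 2)
    if mse == 0:
        return float('inf')  # identical images
    return 10. * numpy.log10(max_value ** 2 / mse)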
Example #2
def calculate_most_important_value(df, importances):
    # Track the two largest importances as [index, value] pairs.
    first = [0, 0]
    second = [0, 0]
    for index, value in enumerate(importances):
        if value > first[1]:
            second = first
            first = [index, value]
        elif value > second[1]:
            second = [index, value]
    result = df.columns[first[0]] + ' ' + df.columns[second[0]]
    io_yandex.print_result(result, "1b.txt")
Example #3
    return accuracy, precision, recall, f1


def calculate_roc_auc(true, *arg):
    for a in arg:
        score = roc_auc_score(true, a)
        yield score


def largest_index(a):
    # Index of the largest element, returned as a length-1 array.
    return numpy.argsort(a)[::-1][:1]


data = pandas.read_csv("./data/classification.csv")
tp, fp, fn, tn = calculate_prediction_type(data["true"], data["pred"])
io_yandex.print_result(
    " ".join(map(io_yandex.two_digit_round, [tp, fp, fn, tn])), "4_1.txt")
accuracy, precision, recall, f1 = calculate_scores(data["true"], data["pred"])
io_yandex.print_result(
    " ".join(map(io_yandex.two_digit_round,
                 [accuracy, precision, recall, f1])), "4_2.txt")

data = pandas.read_csv("./data/scores.csv")
print(list(data.columns.values))
logreg, svm, knn, tree = calculate_roc_auc(data["true"], data["score_logreg"],
                                           data["score_svm"],
                                           data["score_knn"],
                                           data["score_tree"])
index = largest_index([logreg, svm, knn, tree]) + 1  # offset by 1: column 0 is "true"
io_yandex.print_result("".join(data.columns.values[index]),
                       "4_3.txt")  # "".join unwraps the length-1 array into a plain string
Example #4
import pandas
from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

from mylib import io_yandex


def load_data(path):
    train = pandas.read_csv(path)
    #train = train.head(100)
    target, train = io_yandex.get_value_column(train, 'SalaryNormalized')
    train['FullDescription'] = train['FullDescription'].str.lower()
    train['FullDescription'] = train['FullDescription'].replace(
        '[^a-z0-9]', ' ', regex=True)
    train['LocationNormalized'].fillna('nan', inplace=True)
    train['ContractTime'].fillna('nan', inplace=True)
    return target, train


target, train = load_data('salary-train.csv')
tfid_vectoriser = TfidfVectorizer(min_df=5)
train_text = tfid_vectoriser.fit_transform(train['FullDescription'])
dict_vectorizer = DictVectorizer()
train_categ = dict_vectorizer.fit_transform(train[['LocationNormalized', 'ContractTime']].to_dict('records'))
train = hstack(blocks=[train_text, train_categ])

clf = Ridge(alpha=1, random_state=241)
clf.fit(train, target)

target, train = load_data('salary-test-mini.csv')
train_text = tfid_vectoriser.transform(train['FullDescription'])
train_categ = dict_vectorizer.transform(train[['LocationNormalized', 'ContractTime']].to_dict('records'))
train = hstack(blocks=[train_text, train_categ])
target = clf.predict(train)
io_yandex.print_result(' '.join(map(io_yandex.two_digit_round, target)), '1.txt')
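
# Added note: `io_yandex.get_value_column` is a project-local helper used
# throughout these examples. Judging from its call sites it splits one column
# off as the target; a hypothetical sketch:
def get_value_column(df, column):
    # Split `column` off as the target; return (target, remaining dataframe).
    target = df[column]
    rest = df.drop(columns=[column])
    return target, rest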
Example #5
import os
import sys
import pandas
from sklearn.svm import SVC
PACKAGE_PARENT = '..'
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
from mylib import io_yandex


def load_data():
    data_train = pandas.read_csv('./data/svm-data.csv', header=None)
    classes_train, data_train = io_yandex.get_value_column(data_train, 0)
    return data_train, classes_train


def construct_SVC():
    svc = SVC(kernel='linear', C=100000, random_state=241)
    return svc


data_train, classes_train = load_data()
svc = construct_SVC()
#svc.fit(data_train[:20], classes_train[:20])
svc.fit(data_train, classes_train)
print(svc.support_)
# support_ holds 0-based indices; the task numbers objects from 1.
vectors = [x + 1 for x in svc.support_]
vectors.sort()
io_yandex.print_result(' '.join(map(str, vectors)), "1.txt")
Example #6

import time

import pandas
from numpy import mean
from sklearn.cross_validation import KFold, cross_val_score  # pre-0.18 sklearn API
from sklearn.ensemble import RandomForestRegressor

from mylib import io_yandex

def replace_sex(x):
    # Encode the abalone 'Sex' feature numerically: M -> 1, I -> 0, F -> -1.
    if x == 'M':
        return 1
    elif x == 'I':
        return 0
    elif x == 'F':
        return -1


def load_data(path):
    train = pandas.read_csv(path)
    #train = train.head(100)
    target, train = io_yandex.get_value_column(train, 'Rings')
    train['Sex'] = list(map(replace_sex, train['Sex']))
    return train, target


X, y = load_data('abalone.csv')
start = time.time()
for i in range(1, 51):
    clf = RandomForestRegressor(n_estimators=i, random_state=1)
    kf = KFold(len(y), n_folds=5, random_state=1, shuffle=True)
    score = mean(cross_val_score(clf, X, y, cv=kf, scoring='r2', n_jobs=-1))
    #print(i, score)
    if score > 0.52:
        io_yandex.print_result(str(i), "1.txt")
        break
end = time.time()
print(end - start)
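
# Added note: the loop above uses the pre-0.18 sklearn API
# (KFold(len(y), n_folds=...)). A sketch of the same search against the
# current sklearn.model_selection API:
from numpy import mean
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

def first_sufficient_forest(X, y, threshold=0.52):
    # Smallest number of trees whose 5-fold R^2 exceeds `threshold`.
    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    for i in range(1, 51):
        clf = RandomForestRegressor(n_estimators=i, random_state=1)
        score = mean(cross_val_score(clf, X, y, cv=kf, scoring='r2', n_jobs=-1))
        if score > threshold:
            return i
    return None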
Example #7
def print_n_neighbors_and_accuracies(df, classes, kf, path1, path2):
	n_neighbors, max_accuracy = calculate_max_accuracies(df, classes, kf)
	max_accuracy = io_yandex.two_digit_round(max_accuracy)
	io_yandex.print_result(str(n_neighbors), path1)
	io_yandex.print_result(max_accuracy, path2)
Example #8
PACKAGE_PARENT = "../.."
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
from mylib import io_yandex

train = pandas.read_csv('close_prices.csv')
target, train = io_yandex.get_value_column(train, 'date')
#print(train.head(5))

pca = PCA(n_components=10)
pca.fit(train)
# Count principal components until 90% of the variance is explained.
ratio = 0.
number = 0
while ratio < 0.9 and number < len(pca.explained_variance_ratio_):
    ratio += pca.explained_variance_ratio_[number]
    number += 1
print(number, ratio)
io_yandex.print_result(str(number), '1_1.txt')
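
# Added note: an equivalent one-liner with numpy (an alternative sketch, not
# the author's code; it assumes the 90% threshold is actually reachable):
from numpy import cumsum, searchsorted
n_components_90 = int(searchsorted(cumsum(pca.explained_variance_ratio_), 0.9)) + 1  # == number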

reduced = pca.transform(train)[:, 0]

real = pandas.read_csv('djia_index.csv')
real = real['^DJI']
correlation = corrcoef(reduced, real)[0, 1]
io_yandex.print_result(str(correlation), '1_2.txt')

company = train.columns[argmax(pca.components_[0])]
io_yandex.print_result(company, '1_3.txt')
Example #9

import numpy
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

from mylib import io_yandex

newsgroups = datasets.fetch_20newsgroups(
    subset='all', categories=['alt.atheism', 'sci.space'])

vectoriser = TfidfVectorizer()
train = vectoriser.fit_transform(newsgroups.data).toarray()
#grid = {'C': numpy.power(10.0, numpy.arange(-5, 6))}
#cv = KFold(len(newsgroups.data), n_folds=5, shuffle=True, random_state=241)
#clf = SVC(kernel='linear', random_state=241)
#gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
#gs.fit(train, newsgroups.target)
#print(gs.best_estimator_.coef_)
#coef = gs.best_estimator_.coef_
#coef_data = numpy.abs(coef.data)
#coef_class = numpy.abs(coef.class)
#key_words = get_key_words(vectoriser, coef_data, 10)
#key_words.sort()
#print(gs.best_params_['C'])
#for a in gs.grid_scores_:
#    print(a.mean_validation_score)  # cross-validation quality estimate
#    print(a.parameters)  # parameter values
clf = SVC(kernel='linear', C=1., random_state=241)  # C = 1 and above scored best in the grid search
clf.fit(train, newsgroups.target)
coef = clf.coef_  # dense (1, n_features) array, since `train` was densified above
coef_data = numpy.abs(coef[0])
print(coef_data)
key_words = get_key_words(vectoriser, coef_data, 10)
key_words.sort()
io_yandex.print_result(','.join(map(str, key_words)), "1_2.txt")
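
# Added note: `get_key_words` is a project helper not shown in this fragment.
# A hypothetical sketch consistent with how it is called: the n feature names
# with the largest absolute SVM weights.
def get_key_words(vectoriser, abs_coefs, n):
    names = vectoriser.get_feature_names_out()  # .get_feature_names() on older sklearn
    top = numpy.argsort(abs_coefs)[::-1][:n]
    return [names[i] for i in top]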
Example #10
	return accuracy, precision, recall, f1


def calculate_roc_auc(true, *arg):
	for a in arg:
		score = roc_auc_score(true, a)
		yield score


def largest_index(a):
    # Index of the largest element, returned as a length-1 array.
    return numpy.argsort(a)[::-1][:1]


data = pandas.read_csv("./data/classification.csv")
tp, fp, fn, tn = calculate_prediction_type(data["true"], data["pred"])
io_yandex.print_result(" ".join(map(io_yandex.two_digit_round,
	[tp, fp, fn, tn])), "4_1.txt")
accuracy, precision, recall, f1 = calculate_scores(data["true"], data["pred"])
io_yandex.print_result(" ".join(map(io_yandex.two_digit_round,
	[accuracy, precision, recall, f1])), "4_2.txt")


data = pandas.read_csv("./data/scores.csv")
print(list(data.columns.values))
logreg, svm, knn, tree = calculate_roc_auc(data["true"], data["score_logreg"],
	data["score_svm"], data["score_knn"], data["score_tree"])
index = largest_index([logreg, svm, knn, tree]) + 1  # offset by 1: column 0 is "true"
io_yandex.print_result("".join(data.columns.values[index]), "4_3.txt")  # "".join unwraps the length-1 array

max_val = 0
max_name = data.columns.values[2]
for index in range(2, len(data.columns.values)):
Example #11
    train = pandas.read_csv(path)
    #train = train.head(100)
    target, train = io_yandex.get_value_column(train, 'SalaryNormalized')
    train['FullDescription'] = train['FullDescription'].str.lower()
    train['FullDescription'] = train['FullDescription'].replace('[^a-z0-9]',
                                                                ' ',
                                                                regex=True)
    train['LocationNormalized'].fillna('nan', inplace=True)
    train['ContractTime'].fillna('nan', inplace=True)
    return target, train


target, train = load_data('salary-train.csv')
tfid_vectoriser = TfidfVectorizer(min_df=5)
train_text = tfid_vectoriser.fit_transform(train['FullDescription'])
dict_vectorizer = DictVectorizer()
train_categ = dict_vectorizer.fit_transform(
    train[['LocationNormalized', 'ContractTime']].to_dict('records'))
train = hstack(blocks=[train_text, train_categ])

clf = Ridge(alpha=1, random_state=241)
clf.fit(train, target)

target, train = load_data('salary-test-mini.csv')
train_text = tfid_vectoriser.transform(train['FullDescription'])
train_categ = dict_vectorizer.transform(
    train[['LocationNormalized', 'ContractTime']].to_dict('records'))
train = hstack(blocks=[train_text, train_categ])
target = clf.predict(train)
io_yandex.print_result(' '.join(map(io_yandex.two_digit_round, target)),
                       '1.txt')
Example #12
def cross_validate(df):
    accuracies = []
    params = []
    kf = KFold(len(df.target), n_folds=5, shuffle=True, random_state=42)
    for p in linspace(1., 10., 100):
        regressor = KNeighborsRegressor(n_neighbors=5,
                                        weights='distance',
                                        metric='minkowski',
                                        p=p)
        score = cross_val_score(regressor,
                                X=df.data,
                                y=df.target,
                                cv=kf,
                                scoring='mean_squared_error')  # negated MSE on old sklearn; 'neg_mean_squared_error' today
        accuracies.append(mean(score))
        params.append(p)
    return accuracies, params


def calculate_max_accuracies(df):
    accuracies, params = cross_validate(df)
    max_accuracy = max(accuracies)
    index = accuracies.index(max_accuracy)
    return params[index]


df = sklearn.datasets.load_boston()  # load_boston was removed in sklearn 1.2; old-API snippet
df.data = scale(df.data)
best_p = calculate_max_accuracies(df)  # the Minkowski p with the best CV score, not an index
io_yandex.print_result(io_yandex.one_digit_round(best_p), "1b.txt")
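
# Added note: a hedged sketch of the same p-search on current sklearn
# (load_boston is gone in sklearn >= 1.2, so this assumes a generic (X, y)):
from numpy import linspace, mean
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor

def best_minkowski_p(X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    best_p, best_score = None, float('-inf')
    for p in linspace(1., 10., 100):
        regressor = KNeighborsRegressor(n_neighbors=5, weights='distance',
                                        metric='minkowski', p=p)
        score = mean(cross_val_score(regressor, X, y, cv=kf,
                                     scoring='neg_mean_squared_error'))
        if score > best_score:
            best_p, best_score = p, score
    return best_p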
Example #13
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
from mylib import io_yandex

def get_percent(dataframe, column_name, value):
    # Percentage of rows where `column_name` equals `value`.
    s = dataframe.loc[dataframe[column_name] == value]
    s = float(len(s.index)) / float(len(dataframe.index)) * 100
    s = io_yandex.two_digit_round(s)
    return s


data = io_yandex.load_titanic_to_dataframe()
print(list(data.columns.values))
s = data['Sex'].value_counts()
s = str(s[0]) + ' ' + str(s[1])
io_yandex.print_result(s, '1.txt')

s = get_percent(data, 'Survived', 1)
io_yandex.print_result(s, '2.txt')

s = get_percent(data, 'Pclass', 1)
io_yandex.print_result(s, '3.txt')

s = data.loc[data.Age.notnull()]
s = s['Age']
s = str(io_yandex.two_digit_round(float(numpy.mean(s, axis=0)))) + ' ' \
    + str(io_yandex.two_digit_round(float(numpy.median(s, axis=0))))
io_yandex.print_result(s, '4.txt')

s = scipy.stats.pearsonr(data['SibSp'], data['Parch'])
s = io_yandex.two_digit_round(s[0])
Example #14
def draw_plot(learning_rate, train_loss, test_loss, index):
    plt.figure()
    plt.title('Learning rate = ' + str(learning_rate))
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.savefig(str(index) + '.png')


X_train, X_test, y_train, y_test = load_data('gbm-data.csv')
min_res = 1
for index, learning_rate in enumerate([1, 0.5, 0.3, 0.2, 0.1], start=1):
    train_loss, test_loss = fit_and_log_loss(X_train, y_train, learning_rate)
    draw_plot(learning_rate, train_loss, test_loss, index)
    if index == 4:  # learning_rate = 0.2
        min_res = numpy.argmin(test_loss)
        io_yandex.print_result(
            io_yandex.two_digit_round(test_loss[min_res]) + ' ' + str(min_res),
            '2.txt')

io_yandex.print_result('overfitting', '1.txt')

min_res = 37  # iteration with the minimum test log-loss at learning_rate=0.2, found above
clf = RandomForestClassifier(n_estimators=min_res, random_state=241)
clf.fit(X_train, y_train)
test_score = clf.predict_proba(X_test)
test_loss = log_loss(y_test, test_score)
io_yandex.print_result(io_yandex.two_digit_round(test_loss), '3.txt')
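
# Added note: `load_data` and `fit_and_log_loss` are defined elsewhere in the
# original file. A hypothetical reconstruction of `fit_and_log_loss` under the
# usual reading of this task (X_test / y_test reach it via the enclosing
# scope; parameters such as n_estimators=250 are assumptions):
import numpy
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss

def fit_and_log_loss(X_train, y_train, learning_rate):
    clf = GradientBoostingClassifier(n_estimators=250, random_state=241,
                                     learning_rate=learning_rate)
    clf.fit(X_train, y_train)
    train_loss, test_loss = [], []
    # staged_decision_function yields raw scores after each boosting stage;
    # the sigmoid turns them into probabilities for log_loss.
    for pred in clf.staged_decision_function(X_train):
        train_loss.append(log_loss(y_train, 1. / (1. + numpy.exp(-pred.ravel()))))
    for pred in clf.staged_decision_function(X_test):
        test_loss.append(log_loss(y_test, 1. / (1. + numpy.exp(-pred.ravel()))))
    return train_loss, test_loss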
Example #15
    return data_train, classes_train, data_test, classes_test


def scale_data(*arg):
    # Fit the scaler on the first array; apply the same transform to the rest.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(arg[0])
    yield X_train_scaled
    for a in arg[1:]:
        yield scaler.transform(a)


def teach(data_train, classes_train, data_test):
    clf = Perceptron(random_state=241)
    clf.fit(data_train, classes_train)
    classes_predictions = clf.predict(data_test)
    return classes_predictions


data_train, classes_train, data_test, classes_test = load_data()
predictions = teach(data_train, classes_train, data_test)
non_scaled_accuracies = accuracy_score(classes_test, predictions)
print(non_scaled_accuracies)

data_train, data_test = scale_data(data_train, data_test)
predictions = teach(data_train, classes_train, data_test)
scaled_accuracies = accuracy_score(classes_test, predictions)
print(scaled_accuracies)

io_yandex.print_result(
    io_yandex.three_digit_round((scaled_accuracies - non_scaled_accuracies)),
    "2_2.txt")
Example #16
import os
import sys

import pandas
from numpy import argmax, corrcoef
from sklearn.decomposition import PCA
PACKAGE_PARENT = "../.."
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
from mylib import io_yandex


train = pandas.read_csv('close_prices.csv')
target, train = io_yandex.get_value_column(train, 'date')
#print(train.head(5))

pca = PCA(n_components=10)
pca.fit(train)
# Count principal components until 90% of the variance is explained.
ratio = 0.
number = 0
while ratio < 0.9 and number < len(pca.explained_variance_ratio_):
    ratio += pca.explained_variance_ratio_[number]
    number += 1
print(number, ratio)
io_yandex.print_result(str(number), '1_1.txt')

reduced = pca.transform(train)[:, 0]

real = pandas.read_csv('djia_index.csv')
real = real['^DJI']
correlation = corrcoef(reduced, real)[0, 1]
io_yandex.print_result(str(correlation), '1_2.txt')

company = train.columns[argmax(pca.components_[0])]
io_yandex.print_result(company, '1_3.txt')
Example #17
	data_test = pandas.read_csv('../data/perceptron-test.csv', header=None)
	classes_test, data_test = io_yandex.get_value_column(data_test, 0)
	return data_train, classes_train, data_test, classes_test


def scale_data(*arg):
	scaler = StandardScaler()
	X_train_scaled = scaler.fit_transform(arg[0])
	yield X_train_scaled
	for a in arg[1:]:
		yield scaler.transform(a)


def teach(data_train, classes_train, data_test):
	clf = Perceptron(random_state=241)
	clf.fit(data_train, classes_train)
	classes_predictions = clf.predict(data_test)
	return classes_predictions


data_train, classes_train, data_test, classes_test = load_data()
predictions = teach(data_train, classes_train, data_test)
non_scaled_accuracies = accuracy_score(classes_test, predictions)
print(non_scaled_accuracies)

data_train, data_test = scale_data(data_train, data_test)
predictions = teach(data_train, classes_train, data_test)
scaled_accuracies = accuracy_score(classes_test, predictions)
print(scaled_accuracies)

io_yandex.print_result(
    io_yandex.three_digit_round(scaled_accuracies - non_scaled_accuracies),
    "2_2.txt")
Example #18
from numpy import linspace, mean
PACKAGE_PARENT = '..'
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
from mylib import io_yandex


def cross_validate(df):
    accuracies = []
    params = []
    kf = KFold(len(df.target), n_folds=5, shuffle=True, random_state=42)
    for p in linspace(1., 10., 100):
        regressor = KNeighborsRegressor(n_neighbors=5, weights='distance',
                                        metric='minkowski', p=p)
        # Old sklearn negates MSE, so larger scores are better.
        score = cross_val_score(regressor, X=df.data, y=df.target, cv=kf,
                                scoring='mean_squared_error')
        accuracies.append(mean(score))
        params.append(p)
    return accuracies, params


def calculate_max_accuracies(df):
    accuracies, params = cross_validate(df)
    max_accuracy = max(accuracies)
    index = accuracies.index(max_accuracy)
    return params[index]


df = sklearn.datasets.load_boston()  # load_boston was removed in sklearn 1.2; old-API snippet
df.data = scale(df.data)
best_p = calculate_max_accuracies(df)  # the Minkowski p with the best CV score
io_yandex.print_result(io_yandex.one_digit_round(best_p), "1b.txt")
Example #19

def replace_sex(x):
    # Encode the abalone 'Sex' feature numerically: M -> 1, I -> 0, F -> -1.
    if x == 'M':
        return 1
    elif x == 'I':
        return 0
    elif x == 'F':
        return -1


def load_data(path):
    train = pandas.read_csv(path)
    #train = train.head(100)
    target, train = io_yandex.get_value_column(train, 'Rings')
    train['Sex'] = list(map(replace_sex, train['Sex']))
    return train, target


X, y = load_data('abalone.csv')
start = time.time()
for i in range(1, 51):
    clf = RandomForestRegressor(n_estimators=i, random_state=1)
    kf = KFold(len(y), n_folds=5, random_state=1, shuffle=True)
    score = mean(cross_val_score(clf, X, y, cv=kf, scoring='r2', n_jobs=-1))
    #print(i, score)
    if score > 0.52:
        io_yandex.print_result(str(i), "1.txt")
        break
end = time.time()
print(end - start)