def get_x_matrix(var_to_calc, restricted=False, file='data/4problem.csv'):
    """
    Build the regressor (design) matrix X for one variant of problem 4.

    The first column of the result is a column of ones (intercept term);
    the following columns are the regressors read from the CSV file.

    :param var_to_calc: 1-based variant number; every variant occupies four
        consecutive CSV columns in the order x_2, x_3, x_4, y
    :param restricted: if True only the intercept and x_2 are included,
        otherwise x_2, x_3 and x_4 are all included
    :param file: path of the CSV file to read the columns from
    :return: 2-D float numpy array with one row per observation
    """
    # NOTE(review): the restricted model is taken to drop both x_3 and x_4
    # (design matrix [1, x_2]) -- confirm against the original layout.
    first_column = (var_to_calc - 1) * 4
    regressor_count = 1 if restricted else 3

    # Only the regressors are needed here; the y column is not part of X.
    regressors = [
        read_column_from_csv(column_number=first_column + offset, file=file)
        for offset in range(regressor_count)
    ]

    intercept = numpy.ones(len(regressors[0]), dtype=float)
    # column_stack turns each 1-D sequence into a column of the result.
    return numpy.column_stack([intercept] + regressors)
def ls(var_to_calc, ridge=0, restricted=False):
    """
    Estimate least-squares (optionally ridge) regression coefficients.

    Solves the normal equations (X^T X + ridge * I) b = X^T y, where X comes
    from ``get_x_matrix`` and y is the fourth CSV column of the variant.

    :param var_to_calc: 1-based variant number
    :param ridge: ridge coefficient added to the diagonal of X^T X;
        0 gives ordinary least squares. It is applied to whichever model is
        selected, so it is no longer ignored when ``restricted`` is True.
    :param restricted: model with restriction or not (passed on to
        ``get_x_matrix``)
    :return: 1-D numpy array of coefficients, one per column of X
    :raises numpy.linalg.LinAlgError: if X^T X + ridge * I is singular
    """
    x = get_x_matrix(var_to_calc, restricted)
    y_1 = read_column_from_csv(column_number=3 + (var_to_calc - 1) * 4,
                               file='data/4problem.csv')
    y = numpy.asarray(y_1)

    # X^T X plus the ridge penalty; the identity is sized from X so the
    # same expression works for both the full and the restricted model.
    gram = numpy.dot(x.T, x) + float(ridge) * numpy.eye(x.shape[1])
    # X^T y
    moment = numpy.dot(x.T, y)

    # Solving the linear system is numerically more stable than forming
    # the explicit inverse and multiplying by it.
    return numpy.linalg.solve(gram, moment)
for i in range(len(prev_centers)): prev_centers[i] = list(centers[i]) # we must copy this way. for i in range(len(centers)): # calculating new centers sx = 0 sy = 0 for j in range(len(clusters[i])): sx += clusters[i][j][0] sy += clusters[i][j][1] if len(clusters[i] ) > 0: # situation when cluster is empty is possible centers[i][0] = round(sx / len(clusters[i]), 5) centers[i][1] = round(sy / len(clusters[i]), 5) is_continue = False for i in range(len(centers)): # decide must we continue or not if centers[i] not in prev_centers: is_continue = True break return clusters X = read_column_from_csv(0 + (v_number - 1) * 2, 'data/7problem.csv') Y = read_column_from_csv(1 + (v_number - 1) * 2, 'data/7problem.csv') draw_plot(X, Y) for k in range(2, 5): clusters = clustering(k, X, Y) draw_clusters(clusters)
import pandas as pd import statsmodels.formula.api as smf import math from lib import rss, ess from scipy.stats import f, norm, chi import numpy as np from lib import mk_data_var, read_column_from_csv # TODO alter this to your variant v_number = 1 mk_data_var(v_number) class_1 = read_column_from_csv(0, 'data/6problem_{}.csv'.format(v_number), type='f') class_2 = read_column_from_csv(1, 'data/6problem_{}.csv'.format(v_number), type='f') sex = read_column_from_csv(3, 'data/6problem_{}.csv'.format(v_number), type='f') survived = read_column_from_csv(4, 'data/6problem_{}.csv'.format(v_number), type='f') df = pd.DataFrame({ "class_1": class_1, "class_2": class_2, "sex": sex,
d3=1, если квартира трёхкомнатная, 0 иначе; d4=1, если квартира четырёхкомнатная, 0 иначе; dist расстояние от центра Москвы (в км); walk=1, если до метро можно быстро дойти пешком, 0 иначе; brick=1, если дом кирпичный, 0 иначе; bal=1, если есть балкон, 0 иначе; floor=0, если этаж первый или последний, 1 иначе. """ variation = 1 # TODO Вы должны сделать свой .csv файл из того что прислал Фурманов. Нужно удалить из него все строки # в которых есть пустые элементы. Пустые строки в каждом варианте разные, поэтому удалите # только те, что пустые именно в вашем варианте. После этих операций сохраните его как data/5problem.csv bal = read_column_from_csv(column_number=0 + (variation - 1) * 11, file='data/5problem.csv') brick = read_column_from_csv(column_number=1 + (variation - 1) * 11, file='data/5problem.csv') d2 = read_column_from_csv(column_number=2 + (variation - 1) * 11, file='data/5problem.csv') d3 = read_column_from_csv(column_number=3 + (variation - 1) * 11, file='data/5problem.csv') d4 = read_column_from_csv(column_number=4 + (variation - 1) * 11, file='data/5problem.csv') dist = read_column_from_csv(column_number=5 + (variation - 1) * 11, file='data/5problem.csv') floor = read_column_from_csv(column_number=6 + (variation - 1) * 11, file='data/5problem.csv') price = read_column_from_csv(column_number=7 + (variation - 1) * 11, file='data/5problem.csv') totsp = read_column_from_csv(column_number=8 + (variation - 1) * 11,
методом наименьших квадратов""") print() # LS(least squares) # b1 b2 b3 b4 coefficient_vector = ls(variation) print("Коэфициенты b1 b2 b3 b4:") print(coefficient_vector) print() ############################################################## ######## Проверьте значимость регрессии в целом ############## ############################################################## print("2. Проверьте значимость регрессии в целом") print() x_2 = read_column_from_csv(column_number=0 + (variation - 1) * 4, file='data/4problem.csv') x_3 = read_column_from_csv(column_number=1 + (variation - 1) * 4, file='data/4problem.csv') x_4 = read_column_from_csv(column_number=2 + (variation - 1) * 4, file='data/4problem.csv') y_1 = read_column_from_csv(column_number=3 + (variation - 1) * 4, file='data/4problem.csv') y_estimation = [ coefficient_vector[0] + coefficient_vector[1] * x_2[i] + coefficient_vector[2] * x_3[i] + coefficient_vector[3] * x_4[i] for i in range(40) ] ess_ur = ess(y_1, y_estimation) rss_ur = rss(y_1, y_estimation)
методом наименьших квадратов""") print() # LS(least squares) # b1 b2 b3 b4 coefficient_vector = ls(variation) print("Коэфициенты b1 b2 b3 b4:") print(coefficient_vector) print() ############################################################## ######## Проверьте значимость регрессии в целом ############## ############################################################## print("2. Проверьте значимость регрессии в целом") print() x_2 = read_column_from_csv(column_number=0 + (variation - 1) * 4, file=file_path) x_3 = read_column_from_csv(column_number=1 + (variation - 1) * 4, file=file_path) x_4 = read_column_from_csv(column_number=2 + (variation - 1) * 4, file=file_path) y_1 = read_column_from_csv(column_number=3 + (variation - 1) * 4, file=file_path) y_estimation = [ coefficient_vector[0] + coefficient_vector[1] * x_2[i] + coefficient_vector[2] * x_3[i] + coefficient_vector[3] * x_4[i] for i in range(40) ] ess_ur = ess(y_1, y_estimation) rss_ur = rss(y_1, y_estimation)