Example #1
    def test_linear_regression(self):
        X_test = DataFrame([1.5, 2.5, 3.5])
        y_test = DataFrame([1.5, 2.5, 3.5])

        fit = linear_regression(X_test, y_test)
        assert_frame_equal(y_test, DataFrame(fit.predict(X_test)))
        self.assertEqual(round(fit.coef_.item(), 2), 1.0)
        self.assertEqual(round(fit.intercept_.item(), 2), 0.0)
        r, _ = pearsonr(X_test.values.ravel(), y_test.values.ravel())
        self.assertEqual(r, 1.0)
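
The helper under test is not shown here. A minimal sketch consistent with the assertions above (a thin wrapper returning a fitted scikit-learn estimator; the body is an assumption, not the tested implementation):

from sklearn.linear_model import LinearRegression

def linear_regression(X, y):
    # Fit and return the estimator so callers can use predict/coef_/intercept_
    model = LinearRegression()
    model.fit(X, y)
    return model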
Example #2
def hypothesisTesting():
	for i in range(0,CSVcount):
		for j in range(0,CSVcount):
			csv1_name = "CSV" + str(i+1)
			csv2_name = "CSV" + str(j+1)
			csv1 = ResList[csv1_name]
			csv2 = ResList[csv2_name]
			relation = CsvRelations[i][j]
			
			if(relation == 0):
				pass
			elif(relation == -1):
				# Hypothesis 1
				pass
			elif(relation == 2):
				pass
			elif(relation == 1):
				# Hypothesis 2
				# Correlation Results
				anomalies_from_correlation = anomaliesFromWindowCorrelationWithConstantlag(csv1, csv2, window_size=15, maxlag=15, positive_correlation=True, pos=1, neg=1)
				# Slope Based Detection Technique
				# Extracting only the data column
				data1 = [x[1] for x in csv1]
				data2 = [x[1] for x in csv2]
				slope_based = slopeBasedDetection(data1, False, data2, False)
				anomalies_from_slope_based = anomalyDatesSlopeBaseddetetion(slope_based, csv1)
				(lr_based, lr_object) = linear_regression(data1, data2, 1)
				anomalies_from_lr = anomalies_from_linear_regression(lr_based, csv1)
				
				# Convert the results to an HTML string
				resultString = "Anomalies from Correlation test <br>"
				resultString += "Start Date &nbsp;&nbsp;&nbsp;&nbsp; End Date &nbsp;&nbsp;&nbsp;&nbsp; Correlation Value<br>"
				for dataPoint in anomalies_from_correlation:
					resultString += str(dataPoint[0]) + "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;" + str(dataPoint[1]) + "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;" + str(dataPoint[2]) + "<br>"
				resultString += "Anomalies from Slope Based test <br>"
				resultString += "Start Date &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; End Date &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Slope Value <br>"
				for dataPoint in anomalies_from_slope_based:
					resultString += str(dataPoint[0]) + "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;" + str(dataPoint[1]) + "&nbsp;&nbsp;&nbsp;&nbsp;" + str(dataPoint[2]) + "&nbsp;&nbsp;&nbsp;&nbsp; <br>" 
				resultString += "Anomalies from Linear Regression test<br>"
				resultString += "Date &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; X Val &nbsp;&nbsp;&nbsp;&nbsp; Y Val &nbsp;&nbsp;&nbsp;&nbsp; Expected Y Val &nbsp;&nbsp;&nbsp;&nbsp; Difference <br>"
				for dataPoint in anomalies_from_lr:
					resultString += str(dataPoint[0]) + "&nbsp;&nbsp;&nbsp;&nbsp;" + str(dataPoint[1]) + "&nbsp;&nbsp;&nbsp;&nbsp;" + str(dataPoint[2]) + "&nbsp;&nbsp;&nbsp;&nbsp;" + str(dataPoint[3]) + "&nbsp;&nbsp;&nbsp;&nbsp;" + str(dataPoint[4]) + "<br>" 
				plotGraph(csv1,csv2,anomalies_from_correlation)
				return resultString
			elif(relation == -2):				
				pass
				
	# Hypothesis 1 Methods
	# Correlation
	pass 
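
For context, a self-contained sketch of the rolling-window correlation screening that the anomaliesFromWindowCorrelationWithConstantlag call above appears to perform. All names and the threshold here are illustrative assumptions, not the repository's implementation:

import numpy as np

def window_correlation_anomalies(a, b, window_size=15, threshold=0.2):
    """Flag windows where two equally long (date, value) series stop co-moving.

    Returns (start_date, end_date, r) triples for windows whose Pearson r
    falls below `threshold`.
    """
    anomalies = []
    for start in range(0, len(a) - window_size + 1):
        wa = np.array([v for _, v in a[start:start + window_size]], dtype=float)
        wb = np.array([v for _, v in b[start:start + window_size]], dtype=float)
        if wa.std() == 0 or wb.std() == 0:
            continue  # correlation is undefined for constant windows
        r = np.corrcoef(wa, wb)[0, 1]
        if r < threshold:
            anomalies.append((a[start][0], a[start + window_size - 1][0], r))
    return anomalies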
Example #3
def hypothesis4Testing(numOfFiles, *timeSeriesFileNames):
    if len(timeSeriesFileNames) != numOfFiles:
        print("Number of files specified does not match the number of file names provided")
        return

    csvDataList = []  # 2D list storing the data of each file
    for fileName in timeSeriesFileNames:
        with open(fileName, "r", newline="") as f:
            reader = csv.reader(f)
            csvData = [tuple(row) for row in reader]
        csvDataList.append(csvData)

    centresList = []
    testData = []
    temp1 = []
    for i in csvDataList:
        td = getColumnFromListOfTuples(i, 2)  # wholesale price, indexing starts from 1
        testData.append(convertListToFloat(td))
        temp1 = getColumnFromListOfTuples(i, 0)
        temp2 = getColumnFromListOfTuples(i, 2)
        centresList.append(list(zip(temp1, temp2)))
    # print "testData" + str(testData)

    avgTimeSeries = findAverageTimeSeries(testData)
    # temp1 still holds the date column of the last file read
    avgTimeSeries = list(zip(temp1, avgTimeSeries))
    # print "Average Time Series :::::: "+ str(avgTimeSeries)

    for i, c_list in enumerate(centresList):
        # CALL SLOPE BASED
        slopeBasedResult = slopeBased(c_list, False, avgTimeSeries, False)
        slopeBasedResult = mergeDates(slopeBasedResult)
        # Correlation
        correlationResult = anomaliesFromWindowCorrelationWithConstantlag(c_list, avgTimeSeries)
        correlationResult = mergeDates(correlationResult)
        # Linear Regression
        lrResult = linear_regression(avgTimeSeries, c_list, 1)
        lrResult = mergeDates(lrResult)
        result = intersection(
            3, slopeBasedResult, "slope_based", correlationResult, "correlation", lrResult, "linear_regression"
        )
        print("Anomalies for time-series " + str(i) + " are:")
        for (a, b, c) in result:
            print(str(a) + "," + str(b) + "," + str(c))
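
findAverageTimeSeries is not shown; judging from its usage, it is presumably the element-wise mean of several equal-length series. A plausible sketch (an assumption inferred from usage, not the original code):

def find_average_time_series(series_list):
    # series_list: list of equal-length lists of floats
    return [sum(vals) / len(vals) for vals in zip(*series_list)]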
Example #4
import numpy as np
from matplotlib import pyplot as plt
from linear_regression import linear_regression

from sklearn import linear_model, datasets


n_samples = 1000
n_outliers = 50


X = np.random.uniform(-10, 10, n_samples)
y = X
X = X + X * np.random.normal(0, 0.2, n_samples)
X = X.reshape((n_samples, 1))

print(X.shape, y.shape)
model = linear_regression()
model.fit(X, y)
pred = model.predict(X)

print(model.coef)
plt.scatter(X, y, color='gold', marker='.')
plt.plot(X.reshape(n_samples), pred)
plt.grid(True)
plt.show()
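
The imported linear_regression class is not shown. A minimal sketch matching its usage above (fit, predict, and a coef attribute, per the print statement) could solve the least-squares system directly; this is an assumption, not the actual import:

import numpy as np

class linear_regression:
    def fit(self, X, y):
        # Prepend a bias column and solve the least-squares system
        Xb = np.hstack([np.ones((X.shape[0], 1)), X])
        self.coef = np.linalg.lstsq(Xb, y, rcond=None)[0]
        return self

    def predict(self, X):
        Xb = np.hstack([np.ones((X.shape[0], 1)), X])
        return Xb @ self.coef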

Example #5
                                                least_fold,
                                                param['alpha'],
                                                test="False"))
    avg_train_rmse = (sum(train_cost) / len(train_cost))
    lr.plt.title("RMSE vs Iterations for " + str(model) + " regularization")
    lr.plt.xlabel('Iterations', fontsize=18)
    lr.plt.ylabel('RMSE', fontsize=18)
    lr.plt.plot(lr.np.linspace(0, iterations, len(train_cost)), train_cost,
                'r')
    lr.plt.show()
    lr_model.cost_func_val(least_fold + 1)
    print('Test RMSE Error for ' + str(model) + " " + str(lr_model.Val_rmse))


if __name__ == "__main__":
    lr_model = lr.linear_regression()
    # filename=lr_model.convert_data_to_csv('./abalone.data')
    dataset = lr.pd.read_csv('./q1.csv')
    lr_model.find_vectors_k_fold(dataset, 5)
    least_val = 10**5
    least_fold = 0
    print("K-folds created")
    for i in range(5):
        # lr_model.find_vector(dataset,fold_count,5)
        lr_model.optimise_weight_normal(i)
        lr_model.cost_func_train(i + 1)
        lr_model.cost_func_val(i + 1)
        if lr_model.Val_rmse < least_val:
            least_val = lr_model.Val_rmse
            least_fold = i
    print("Choose Fold " + str(least_fold + 1))
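
For reference, the RMSE criterion this fold selection minimizes, written out with numpy (the generic definition; the lr module's cost_func_val internals may differ):

import numpy as np

def rmse(y_true, y_pred):
    # Root mean squared error between predictions and targets
    return float(np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2)))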
Example #6
#    def calc_potential_energy (self, xx):
#        potential_energy=torch.dot(xx,torch.matmul(self.weight_matrix,xx))
#        return potential_energy
#Regular run
#print("Potential")

#
#Amat = torch.FloatTensor([[-2, 0, 0, 0],
#                          [0, -2, 0, 0],
#                          [0, 0, -2, 0],
#                          [0, 0, 0, -2]])

dim = 4
bias = True

c = linear_regression(dim, lamb=0.1, bias=bias)
c.generate_data(200, scale=1, noise_db=-np.inf)
ps = lambda x: -20 * c.get_regularized_loss(x)

w, b = c.get_ground_truth(lr=2)
if bias:
    init_position = np.append(w.numpy(), b)
else:
    init_position = w.numpy()

hmc = sampler(position_dim=dim + bias,
              step_size=0.02,
              potential_struct=ps,
              T=0.1,
              init_position=init_position)
#sample,rej_cnt = hmc.main_hmc_loop(1000)
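
For context, the potential the sampler explores is built from the class's regularized loss. A generic numpy version of an L2 (ridge) regularized loss, assuming lamb is the ridge weight (an assumption for illustration, not the class's actual code):

import numpy as np

def regularized_loss(w, X, y, lamb=0.1):
    # 0.5 * squared residual norm plus 0.5 * lamb * squared weight norm
    residual = X @ w - y
    return 0.5 * residual @ residual + 0.5 * lamb * w @ w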
Example #7
def stats_calc(t, res, err, flog):
    '''Statistical result of residual.
    '''
    # beginning epoch for calculating the trend
    t0 = 2000.0

    resn, errn, mean, wrms, std, cond = elim_wrms(res, err)

    # slope, intercept, r_value, p_value, std_err = stats.linregress(
    #     t[cond], resn)

    tn = t[cond]

    par, parerr, outlier, cor = linear_regression(tn - t0, resn, errn)
    slope, intercept = par
    slperr, itperr = parerr

    # print("# weighted\n",
    #       "# Mean      : %.3f\n" % mean,
    #       "# Std       : %.3f\n" % std,
    #       "# WRMS      : %.3f\n" % wrms,
    #       "# Slope     : %.3f +/- %.3f\n" % (slope, slperr),
    #       "# Intercept : %.3f  " % intercept,
    #       file=flog)

    print("# weighted\n",
          "# Mean      : %.2f\n" % mean,
          "# Std       : %.2f\n" % std,
          "# WRMS      : %.2f\n" % wrms,
          "# Slope     : %.2f +/- %.2f\n" % (slope, slperr),
          "# Intercept : %.2f  " % intercept,
          file=flog)

    print("STAS_ALL ", mean, std, wrms, slope, slperr, file=flog)

    # Statistics after removing the linear trend
    res1 = res - slope * (t - t0)
    # resn1, errn1, mean1, wrms1, std1, cond1 = elim_wrms(res1, err)
    # tn1 = t[cond1]
    # par1, parerr1, outlier1, cor1 = linear_regression(
    #     tn1 - t0, resn1, errn1)
    # slope1, intercept1 = par1
    # slperr1, itperr1 = parerr1

    resn1 = resn - slope * (tn - t0)
    errn1 = errn
    _, _, mean1, wrms1, std1, cond1 = elim_wrms(res1, err)
    # tn1 = t[cond1]
    par1, parerr1, outlier1, cor1 = linear_regression(tn - t0, resn1, errn1)
    slope1, intercept1 = par1
    slperr1, itperr1 = parerr1

    print("# After removing linear trend:\n",
          "# Mean      : %.2f\n" % mean1,
          "# Std       : %.2f\n" % std1,
          "# WRMS      : %.2f\n" % wrms1,
          "# Slope     : %.2f +/- %.2f\n" % (slope1, slperr1),
          "# Intercept : %.2f\n" % intercept1,
          file=flog)

    print("STAS_AFTER ", mean1, std1, wrms1, slope1, slperr1, file=flog)

    return slope, intercept
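
elim_wrms is not shown; the WRMS it reports is presumably the conventional inverse-variance weighted RMS. The standard formula, for reference (not extracted from elim_wrms itself):

import numpy as np

def wrms(res, err):
    # Weighted RMS with weights 1/err^2
    w = err ** -2.0
    return float(np.sqrt(np.sum(w * res ** 2) / np.sum(w)))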
Example #8
import sys

sys.path.append('unsupervised/')

import preprocessing, linear_regression, logistic_regression, decision_tree, svm, k_means

best_score = []
counter = 0.0

# Call Preprocess class to format the data
preprocess = preprocessing.Preprocess()
# Uncomment the line below if you need to preprocess the dataframe at run time
#preprocess.preprocess()

# Supervised learning methods
# Linear Regression
linear_regression = linear_regression.LinearReg()
best_score.append(linear_regression.linear_regression())

# Logistic Regression
logistic_regression = logistic_regression.LogisticReg()
best_score.append(logistic_regression.logistic_regression())

# Decision Tree
decision_tree = decision_tree.DeciTree()
best_score.append(decision_tree.decision_tree())

# Support Vector Machines (a supervised method, despite the directory name)
svm = svm.Svm()
best_score.append(svm.svmachines())

# Unsupervised learning methods
# K-means
Example #9
    daily_df = daily_gb.sort_values(by=['date'], ignore_index=True)
    state_dfs[state_code] = daily_df

#%% Add US and states into a single entity collection
# to simplify plotting and calculations
entity_df_dict.update(state_dfs)
entity_codes.extend(state_codes)
entity_names.extend(list(state_names_dict.values()))
entity_names_dict = dict(zip(entity_codes, entity_names))

#%% plots!
for entity_code, entity_df in entity_df_dict.items():
    plot_daily_data_cum(entity_df, title_str=entity_names_dict[entity_code], plot_trend=True)
    plot_daily_data_diff(entity_df, title_str=entity_names_dict[entity_code], plot_trend=False)
    
#%% some regression tests
for entity_code, entity_df in entity_df_dict.items():
    df = entity_df[['datetime', 'positive']].copy().dropna()
    df = df[df['positive'] > 0].reset_index(drop=True)
    slope, intercept, r_value, std_err, y_hat = lr.linear_regression(df.index, np.log10(df['positive']))
    days2double = doubling.days_to_double(slope)
    print(f"Days for positive cases in {entity_names_dict[entity_code]} to double => {days2double:0.2f}")

    df = entity_df[['datetime', 'death']].copy().dropna().reset_index(drop=True)
    slope, intercept, r_value, std_err, y_hat = lr.linear_regression(df.index, np.log10(df['death']))
    days2double = doubling.days_to_double(slope)
    print(f"Days for deaths in {entity_names_dict[entity_code]} to double => {days2double:0.2f}")
    print()
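
The doubling-time helper follows from regressing in log10 space: if log10(n) grows linearly at slope m per day, n doubles once m * d = log10(2), so d = log10(2) / m. A one-line sketch of what doubling.days_to_double presumably computes (assumed, not the module's code):

import numpy as np

def days_to_double(slope):
    # slope is d(log10 count)/d(day); doubling takes log10(2) of growth
    return np.log10(2) / slope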


Example #10
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from linear_regression import linear_regression  # import assumed; used below
from stand_functions import create_file, append_text, append_math, end_file

# Define data file and columns to use
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data/')
data_file = 'offenes_experiment.csv'
x_col = 1
y_col = 3

data = pd.read_csv(data_dir + data_file, sep=',', header=0)
x_col_name = data.columns[x_col]
y_col_name = data.columns[y_col]

data, calculations = linear_regression(data, x_col_name, y_col_name)

# Make the graph
plt.plot(data[x_col_name].drop(['Summe', 'Mittelwert']),
         data[y_col_name].drop(['Summe', 'Mittelwert']), 'rx')
## Make the linear regression line
axes = plt.gca()
x_vals = np.array(axes.get_xlim())
y_vals = calculations['a'] + calculations['b'] * x_vals
plt.plot(x_vals, y_vals, 'r--')
## Make the line from the calculations
plt.plot(data['Zeit (in s)'].drop(['Summe', 'Mittelwert']),
         data['berechnete Strecke (in m)'].drop(['Summe',
                                                 'Mittelwert']), 'bx-')
## Graphical enhancement
plt.xlabel('Zeit (in s)')
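
The 'a' (intercept) and 'b' (slope) entries read from calculations follow the textbook least-squares formulas. A standalone numpy sketch of that computation (the imported linear_regression helper may differ in its details):

import numpy as np

def least_squares(x, y):
    x, y = np.asarray(x, float), np.asarray(y, float)
    # slope from centered cross- and auto-covariance, intercept from the means
    b = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)
    a = y.mean() - b * x.mean()
    return a, b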
Example #11
    sum_3 = 0

    for i in range(total_flavors):
        sum_1 += math.pow((predict[i] - actual[i]), 2)
        sum_2 += math.pow((predict[i]), 2)
        sum_3 += math.pow(actual[i], 2)
    score_1 = (
        1 - math.sqrt(sum_1 / total_flavors) /
        (math.sqrt(sum_2 / total_flavors) + math.sqrt(sum_3 / total_flavors)))
    return score_1


if __name__ == '__main__':
    history_data, future_data, sample_ps, sample_vm, dim_to_be_optimized, history_begin, predict_begin, predict_end, flavor_num = read_data(
    )
    lse_model = linear_regression()
    predict = []
    actual = []
    for i in range(total_flavors):
        predict_list = []
        # history_data[i] = avg_filter(history_data[i])
        history_data[i] = get_pow(history_data[i], exponent)
        history_data[i] = batch_add(history_data[i], addition)

        x_train, y_train, x_last = create_dataset(history_data[i], 7, 1)
        x_train = gaussian_weighted(x_train)
        x_last = gaussian_weighted(x_last)
        lse_model.lse_fit(x_train, y_train)
        x_train.show()

        for j in range(predict_span):
Example #12
    ]

    normal = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'y': [1, 3, 2, 5, 3, 7, 5]
    })
    normal.name = ''

    outlier = pd.DataFrame({
        'x': [1, 1, 5, 5, 25, 10],
        'y': [1, 5, 1, 10, 10, 5]
    })
    outlier.name = 'with outlier'

    dataframes = [normal, outlier]

    for df in dataframes:
        linear_regression(df)
        dfs = [df] * len(plots)
        pc = PlotContainer(plots, dfs)
        # successively create plots, overlaying each one on the previous ones
        for i in range(len(pc)):
            fname = 'Linear Regression {0} example part {1}'.format(
                pc.dfs[i].name, i)
            pc.graph(fname,
                     directory='images',
                     setup=shared_setup,
                     start=0,
                     stop=1 + i)
Example #13
    def test_linear_regression_can_learn_doubling(self):
        model = linr.linear_regression(
            np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0], [4.0, 8.0]]), )

        prediction = model.predict(np.array([[6.0]]))
        self.assertAlmostEqual(prediction[0][0], 12.0, places=3)
Example #14
run_linear_regression(100)

# In[161]:

run_linear_regression(100, 'noisy')

# In[ ]:

train_data = scipy.io.loadmat('data/poly_train.mat')
test_data = scipy.io.loadmat('data/poly_test.mat')

x_train = train_data['X']
y_train = train_data['y']
x_test = test_data['X_test']
y_test = test_data['y_test']
w = linear_regression(x_train, y_train)

x_train = add_bias(x_train)
x_test = add_bias(x_test)
e_train = np.where(y_train * (w.T @ x_train) < 0)[0].shape[0] / len(y_train[0])
e_test = np.where(y_test * (w.T @ x_test) < 0)[0].shape[0] / len(y_test[0])

print('E_train is %f, E_test is %f.' % (e_train, e_test))

# In[171]:

train_data = scipy.io.loadmat('data/poly_train.mat')
test_data = scipy.io.loadmat('data/poly_test.mat')

x_train = train_data['X']
y_train = train_data['y']
Example #15
print("\n=== Naive Bayes Classifier with Laplace Smoothing ===")
c = NaiveBayesClassifier(SPAM, HAM, 1)
result("SPAM", c.spam.p, 0.4)
result("HAM", c.ham.p, 0.6)
result("today|SPAM", c.spam.p_word("today"), 0.0476)
result("today|HAM",  c.ham.p_word("today"), 0.1111)
result("SPAM|today is secret)", c.p_spam_given_phrase("today is secret"), 0.4858)


from linear_regression import linear_regression, gaussian
from scipy import matrix
print("\n=== Linear Regression ===")
x = [3,  4,  5,  6]
y = [0, -1, -2, -3]
(w0, w1), err = linear_regression(x, y)
print("(w0=%.1f, w1=%.1f) err=%.2f" % (w0, w1, err))

x = [2, 4, 6, 8]
y = [2, 5, 5, 8]
(w0, w1), err = linear_regression(x, y)
print("(w0=%.1f, w1=%.1f) err=%.2f" % (w0, w1, err))

x = matrix([[3],
            [4],
            [5],
            [6],
            [7]])
m, s = gaussian(x)
print("m  = %s" % str(m))
print("s^2= %s" % str(s))
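
gaussian evidently returns the maximum-likelihood mean and variance of the samples; the standard estimates, for reference (assumed to match the import):

def gaussian_ml(xs):
    # ML estimates: sample mean and (biased) sample variance
    m = sum(xs) / len(xs)
    s2 = sum((v - m) ** 2 for v in xs) / len(xs)
    return m, s2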
Example #16
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File name: verify_codes.py
"""
Created on Tue Jan  9 15:26:08 2018

@author: Neo([email protected])
"""

import numpy as np
from scipy import stats
from linear_regression import linear_regression

# -----------------------------  FUNCTIONS -----------------------------
x = np.random.normal(0, 1, 100)
y = 1.5 * x + 0.4 + np.random.normal(0, 0.5, 100)
# err = np.ones_like(x)
err = np.random.normal(0, 0.5, 100)
pi = err**-2

# NB: stats.linregress has no weighting; scaling y by the weights is not a
# weighted fit, so this line serves only as a rough cross-check
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y * pi)
print(slope, intercept)

par, err, outlier, cor = linear_regression(x, y, err)
print(par)
# --------------------------------- END --------------------------------
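
For comparison, a minimal inverse-variance weighted straight-line fit (the standard formulas; the imported linear_regression additionally reports outliers and a correlation, which this sketch does not):

import numpy as np

def weighted_line_fit(x, y, err):
    # Weights are 1/err^2; slope and intercept from weighted means
    w = err ** -2.0
    xm = np.sum(w * x) / np.sum(w)
    ym = np.sum(w * y) / np.sum(w)
    slope = np.sum(w * (x - xm) * (y - ym)) / np.sum(w * (x - xm) ** 2)
    intercept = ym - slope * xm
    return slope, intercept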
Example #17
    def test_linear_regression(self):
        points = ((0, -1), (1, 0.2), (2, 0.9), (3, 2.1))
        k, n = linear_regression(points)
        self.assertAlmostEqual(k, 1.0)
        self.assertAlmostEqual(n, -0.95)
Example #18
from linear_regression import linear_regression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

df = pd.read_csv("winequality-white.csv", sep=";")

covariates = df.drop("quality", axis=1).values
targets = df["quality"].values

beta, se_beta, lower_bounds, upper_bounds = linear_regression(
    covariates, targets)

result_table = pd.DataFrame.from_dict({
    "lower_bound_for_estimates": lower_bounds,
    "estimates": beta,
    "upper_bound_for_estimates": upper_bounds,
    "standard_errors": se_beta
})

print("Result table:")
print(result_table)

plt.plot(lower_bounds)
plt.plot(beta)
plt.plot(upper_bounds)
plt.title("Result plot")
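
For reference, a sketch of how per-coefficient standard errors and bounds like those in the result table are conventionally obtained from ordinary least squares (assuming normal-approximation 95% intervals; the imported linear_regression may differ):

import numpy as np

def ols_with_intervals(X, y, z=1.96):
    # Add an intercept column, solve OLS, then form coefficient intervals
    X = np.column_stack([np.ones(len(X)), X])
    beta, *_ = np.linalg.lstsq(X, y, rcond=None)
    dof = X.shape[0] - X.shape[1]
    sigma2 = np.sum((y - X @ beta) ** 2) / dof  # residual variance
    se = np.sqrt(sigma2 * np.diag(np.linalg.inv(X.T @ X)))
    return beta, se, beta - z * se, beta + z * se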