def decompose(self, verbose=False):
    '''Perform the Singular Value Decomposition and identify the rank of the embedding subspace.

    Characteristic of projection: the proportion of variance captured in the subspace.

    Reads self.X_com and stores on the instance: S, U, s, V, d, Vs, Xs,
    s_contributions, r, r_characteristic and orthonormal_base.

    Arguments:
        verbose: when true, a summary is passed to self._printer.
    '''
    X = self.X_com
    # The SVD is taken of X * X.T; the square roots of its singular values
    # are stored as self.s.
    self.S = X * X.T
    self.U, self.s, self.V = linalg.svd(self.S)
    self.U, self.s, self.V = m(self.U), np.sqrt(self.s), m(self.V)
    self.d = np.linalg.matrix_rank(X)
    # One elementary matrix Xs[i] per component up to the rank of X.
    # (The previously computed-but-unused Zs dictionary has been removed.)
    Vs, Xs = {}, {}
    for i in range(self.d):
        Vs[i] = X.T * (self.U[:, i] / self.s[i])
        Xs[i] = (self.s[i] * self.U[:, i]) * (m(Vs[i]).T)
    self.Vs, self.Xs = Vs, Xs
    self.s_contributions = self.get_contributions(X, self.s, False)
    # r counts the components whose reported contribution is positive.
    self.r = len(self.s_contributions[self.s_contributions > 0])
    self.r_characteristic = round(
        (self.s[:self.r]**2).sum() / (self.s**2).sum(), 4)
    self.orthonormal_base = {i: self.U[:, i] for i in range(self.r)}
    if verbose:
        msg1 = 'Rank of trajectory\t\t: {}\nDimension of projection space\t: {}'
        msg1 = msg1.format(self.d, self.r)
        msg2 = 'Characteristic of projection\t: {}'.format(
            self.r_characteristic)
        self._printer('DECOMPOSITION SUMMARY', msg1, msg2)
def get_cos_sim(vector, matrix):
    """
    Cosine similarity between one vector and every row of a matrix.

    Arguments:
        - (ndarray (n,)) vector: the vector (unoriented)
        - (np matrix (m, n)) matrix: a matrix, each row should be a vector

    Returns:
        - (np matrix (m, 1)): cosine similarities between the input
          vector and each vector in the matrix
    """
    from numpy import matrix as m
    from numpy.linalg import norm

    # Numerator: dot product of every row with the vector, i.e. X.v^t,
    # which comes out as an m by 1 column.
    dot_products = matrix.dot(m(vector).T)

    # Denominator: |v| times the norm of each row. norm(..., axis=1)
    # yields a flat result, so it is wrapped and transposed into a column
    # to line up with the numerator.
    norm_products = (norm(vector) * m(norm(matrix, axis=1))).T

    return dot_products / norm_products
def embed(self, embedding_dimension=None, suspected_frequency=None, verbose=False, return_df=False):
    '''Build the trajectory matrix of the time series.

    The window size is embedding_dimension, or half the series length
    (self.ts_N//2) when none is given. If suspected_frequency is supplied,
    the window size is rounded down to a multiple of it.

    Columns of the trajectory matrix containing NaN are separated from the
    complete ones (X_missing / X_miss versus X_complete / X_com).

    With verbose=True a summary is passed to self._printer; with
    return_df=True the full trajectory DataFrame is returned.'''
    window = embedding_dimension if embedding_dimension else self.ts_N//2
    self.embedding_dimension = window
    if suspected_frequency:
        self.suspected_frequency = suspected_frequency
        whole_periods = self.embedding_dimension//self.suspected_frequency
        self.embedding_dimension = whole_periods*self.suspected_frequency
    self.K = self.ts_N-self.embedding_dimension+1

    # Hankel matrix of the series, transposed and trimmed to K columns.
    hankel = linalg.hankel(self.ts, np.zeros(self.embedding_dimension))
    self.X = m(hankel).T[:, :self.K]
    self.X_df = df(self.X)

    # Split into complete columns and columns with missing values.
    self.X_complete = self.X_df.dropna(axis=1)
    self.X_com = m(self.X_complete.values)
    self.X_missing = self.X_df.drop(self.X_complete.columns, axis=1)
    self.X_miss = m(self.X_missing.values)

    self.trajectory_dimentions = self.X_df.shape
    self.complete_dimensions = self.X_complete.shape
    self.missing_dimensions = self.X_missing.shape
    self.no_missing = self.missing_dimensions[1]==0

    if verbose:
        msg1 = 'Embedding dimension\t: {}\nTrajectory dimensions\t: {}'
        msg2 = 'Complete dimension\t: {}\nMissing dimension \t: {}'
        msg1 = msg1.format(self.embedding_dimension, self.trajectory_dimentions)
        msg2 = msg2.format(self.complete_dimensions, self.missing_dimensions)
        self._printer('EMBEDDING SUMMARY', msg1, msg2)

    if return_df:
        return self.X_df
def forecast_recurrent(self, steps_ahead=12, singular_values=None, plot=False, return_df=False, **plotargs):
    '''Recurrent forecast from the last point of the original series.

    Walks over the original series and steps_ahead further positions.
    Observed values are copied, NaN values inside the series are replaced
    by the recurrence applied to the preceding values, and positions past
    the end of the series are forecast the same way.

    self._forecast_prep(singular_values) is run first if self.X_com_hat
    does not exist yet. The result is stored in self.ts_forecast; with
    plot=True it is plotted (plotargs are forwarded), with return_df=True
    a DataFrame with 'Forecast' and 'Original' columns is returned.'''
    if not hasattr(self, 'X_com_hat'):
        self._forecast_prep(singular_values)

    self.ts_forecast = np.array(self.ts_v[0])
    for i in range(1, self.ts_N+steps_ahead):
        try:
            observed = self.ts_v[i]
        except(IndexError):
            # Past the end of the original series: pure forecast.
            window = self.ts_forecast[i-self.R.shape[0]: i]
            next_value = (self.R.T*m(window).T)[0]
        else:
            if np.isnan(observed):
                # Gap inside the series: fill it with the recurrence.
                window = self.ts_forecast[max(0,i-self.R.shape[0]): i]
                next_value = (self.R.T*m(window).T)[0]
            else:
                next_value = observed
        self.ts_forecast = np.append(self.ts_forecast, next_value)
    self.forecast_N = i+1

    new_index = pd.date_range(start=self.ts.index.min(),periods=self.forecast_N, freq=self.freq)
    forecast_df = df(self.ts_forecast, columns=['Forecast'], index=new_index)
    padding = [np.nan]*steps_ahead
    forecast_df['Original'] = np.append(self.ts_v, padding)

    if plot:
        forecast_df.plot(title='Forecasted vs. original time series', **plotargs)
    if return_df:
        return forecast_df
def embed(
    self,
    embedding_dimension=None,
    suspected_frequency=None,
    verbose=False,
    return_df=False,
):
    """Build the trajectory matrix of the time series.

    The window size is embedding_dimension, or half the series length
    (self.ts_N // 2) when none is given. If suspected_frequency is
    supplied, the window size is rounded down to a multiple of it.

    Columns of the trajectory matrix containing NaN are separated from the
    complete ones (X_missing / X_miss versus X_complete / X_com).

    The verbose argument is accepted but not used by this implementation.
    With return_df=True the full trajectory DataFrame is returned."""
    window = embedding_dimension if embedding_dimension else self.ts_N // 2
    self.embedding_dimension = window
    if suspected_frequency:
        self.suspected_frequency = suspected_frequency
        whole_periods = self.embedding_dimension // self.suspected_frequency
        self.embedding_dimension = whole_periods * self.suspected_frequency
    self.K = self.ts_N - self.embedding_dimension + 1

    # Hankel matrix of the series, transposed and trimmed to K columns.
    hankel = linalg.hankel(self.ts, np.zeros(self.embedding_dimension))
    self.X = m(hankel).T[:, : self.K]
    self.X_df = pd.DataFrame(self.X)

    # Split into complete columns and columns with missing values.
    self.X_complete = self.X_df.dropna(axis=1)
    self.X_com = m(self.X_complete.values)
    self.X_missing = self.X_df.drop(self.X_complete.columns, axis=1)
    self.X_miss = m(self.X_missing.values)

    self.trajectory_dimentions = self.X_df.shape
    self.complete_dimensions = self.X_complete.shape
    self.missing_dimensions = self.X_missing.shape
    self.no_missing = self.missing_dimensions[1] == 0

    if return_df:
        return self.X_df
def get_docs_in_topic_space(model, extra_doc=None):
    """
    Computes and returns the document vectors expressed as a function of
    the topics.

    Each document is projected onto every topic vector and the projections
    are turned into a distribution over topics with a softmax, i.e.
    exp(d.t) / sum over topics t' of exp(d.t'). This is the formula the
    extra-document branch already used. The previous "vectorized" form
    divided by exp(sum over documents) instead, so its rows were not
    normalized over topics.

    Arguments:
        - (gensim.models.doc2vec.Doc2Vec) model: A doc2vec model
        - (str) extra_doc: optional. If not None, will place the extra
          document in the topic space and return it

    Returns:
        - (np.matrix) docs: Matrix with all the document vectors expressed
          as a function of the topics (one row per document, one column
          per topic, each row summing to 1).
        - (np.ndarray) extra_vec: the vector for the `extra_doc` in the
          topic space. If no `extra_doc` is given, will be None.
    """
    import numpy as np

    topics = get_topic_vecs(model)

    # projecting documents onto topics; float64 keeps the precision the
    # previous math.exp based code had
    doc_topic_proj = np.asarray(
        model.docvecs.vectors_docs.dot(topics.T), dtype=np.float64)

    # vectorized version of equation 3 in Hashimoto et al.'s
    # "Topic detection using paragraph vectors to support
    # active learning in systematic reviews", June 2016:
    # row-wise softmax over the topic axis
    exp_proj = np.exp(doc_topic_proj)
    docs_as_topics = np.matrix(exp_proj / exp_proj.sum(axis=1, keepdims=True))

    # This chunk of code is only for the case that we want to place an
    # extra document in the topic space
    new_vec_proj = None
    if extra_doc is not None:
        new_vector = model.infer_vector(extra_doc)
        exp_new = np.exp(
            np.asarray(new_vector.dot(topics.T), dtype=np.float64).ravel())
        new_vec_proj = exp_new / exp_new.sum()

    return docs_as_topics, new_vec_proj
def _feature_stats(feature, axis=0):
    """Return mean, std, skew, max and min of `feature` along `axis`, concatenated."""
    return np.hstack((
        np.mean(feature, axis=axis),
        np.std(feature, axis=axis),
        skew(feature, axis=axis),
        np.max(feature, axis=axis),
        np.min(feature, axis=axis),
    ))


def get_mfcc(name, path):
    """Load an audio file and return summary statistics of spectral features.

    The file at `path + name` is loaded at SAMPLE_RATE. Five statistics
    (mean, std, skew, max, min) are computed for each of the 20 MFCC
    coefficients and for the first row of the zero-crossing rate, spectral
    rolloff, centroid, contrast and bandwidth features: 20*5 + 5*5 = 125
    values.

    Arguments:
        name: file name, appended to `path` as-is.
        path: directory prefix (must already end with a separator).

    Returns:
        pd.Series of 125 values; a Series of 125 zeros (after printing
        'bad file') if feature extraction raises.
    """
    b, sr = librosa.core.load(path + name, sr=SAMPLE_RATE)
    assert sr == SAMPLE_RATE
    try:
        # The audio is passed as keyword `y`, which current librosa
        # releases require.
        ft1 = librosa.feature.mfcc(y=b, sr=SAMPLE_RATE, n_mfcc=20)
        ft2 = librosa.feature.zero_crossing_rate(y=b)[0]
        ft3 = librosa.feature.spectral_rolloff(y=b)[0]
        ft4 = librosa.feature.spectral_centroid(y=b)[0]
        ft5 = librosa.feature.spectral_contrast(y=b)[0]
        ft6 = librosa.feature.spectral_bandwidth(y=b)[0]
        # MFCC statistics are taken per coefficient (axis=1); the other
        # features are one-dimensional. The last statistic of ft6 used to
        # be `np.m(ft6)`, which raised AttributeError and made every file
        # fall through to the 'bad file' branch.
        pieces = [_feature_stats(ft1, axis=1)]
        pieces.extend(_feature_stats(ft) for ft in (ft2, ft3, ft4, ft5, ft6))
        return pd.Series(np.hstack(pieces))
    except Exception:
        print('bad file')
        return pd.Series([0] * 125)
def _forecast_prep(self, singular_values=None): self.X_com_hat = np.zeros(self.complete_dimensions) self.verticality_coefficient = 0 self.forecast_orthonormal_base = {} if singular_values: try: for i in singular_values: self.forecast_orthonormal_base[i] = self.orthonormal_base[i] except: if singular_values == 0: self.forecast_orthonormal_base[0] = self.orthonormal_base[0] else: raise ( "Please pass in a list/array of singular value indices to use for forecast" ) else: self.forecast_orthonormal_base = self.orthonormal_base self.R = np.zeros(self.forecast_orthonormal_base[0].shape)[:-1] for Pi in self.forecast_orthonormal_base.values(): self.X_com_hat += Pi * Pi.T * self.X_com pi = np.ravel(Pi)[-1] self.verticality_coefficient += pi ** 2 self.R += pi * Pi[:-1] self.R = m(self.R / (1 - self.verticality_coefficient)) self.X_com_tilde = self.diagonal_averaging(self.X_com_hat)
def decompose(self, verbose=False):
    """Perform the Singular Value Decomposition and identify the rank of the embedding subspace.

    Characteristic of projection: the proportion of variance captured in the subspace.

    Reads self.X_com and stores on the instance: S, U, s, V, d, Vs, Xs,
    s_contributions, r, r_characteristic and orthonormal_base.

    Arguments:
        verbose: accepted but not used by this implementation.
    """
    X = self.X_com
    # The SVD is taken of X * X.T; the square roots of its singular values
    # are stored as self.s.
    self.S = X * X.T
    self.U, self.s, self.V = linalg.svd(self.S)
    self.U, self.s, self.V = m(self.U), np.sqrt(self.s), m(self.V)
    self.d = np.linalg.matrix_rank(X)
    # One elementary matrix Xs[i] per component up to the rank of X.
    # (The previously computed-but-unused Zs dictionary has been removed.)
    Vs, Xs = {}, {}
    for i in range(self.d):
        Vs[i] = X.T * (self.U[:, i] / self.s[i])
        Xs[i] = (self.s[i] * self.U[:, i]) * (m(Vs[i]).T)
    self.Vs, self.Xs = Vs, Xs
    self.s_contributions = self.get_contributions(X, self.s, False)
    # r counts the components whose reported contribution is positive.
    self.r = len(self.s_contributions[self.s_contributions > 0])
    self.r_characteristic = round(
        (self.s[:self.r]**2).sum() / (self.s**2).sum(), 4)
    self.orthonormal_base = {i: self.U[:, i] for i in range(self.r)}
def diagonal_averaging(hankel_matrix):
    """Average each anti-diagonal of the given hankel matrix.

    The matrix is oriented so that it has at least as many columns as
    rows, then every anti-diagonal is averaged in turn.

    Returns: Pandas DataFrame object containing the reconstructed series
    in a column named "Reconstruction"."""
    mat = m(hankel_matrix)
    n_rows, n_cols = mat.shape
    short_side, long_side = sorted((n_rows, n_cols))
    if n_rows > n_cols:
        mat = mat.T
    values = mat.A

    averages = []
    for offset in range(1 - long_side, short_side):
        # A vertically flipped, shifted identity selects one anti-diagonal.
        on_diagonal = np.eye(long_side, k=offset, dtype="bool")[::-1][:short_side, :]
        selected = np.ma.masked_array(values, mask=~on_diagonal)
        averages.append(selected.sum() / on_diagonal.sum())
    return pd.DataFrame(averages).rename(columns={0: "Reconstruction"})
usecols = [0, 1], dtype = {0: 'S30', 1: 'int'}, names = ['word', document], header = None) # merge with previous ones y = pd.merge(y, y_i, on = 'word', how = 'outer') # kill NaNs y = y.fillna(0) # choose prior print '' priorChoice = int(input('Uninformative (1) or informative (2) prior? ')) if priorChoice == 1: alpha_i = m.transpose(m([0.01] * len(y))) elif priorChoice == 2: priors = pd.read_csv(rpath + 'corpus.csv', # load global frequencies usecols = [0, 1], names = ['word', 'gfreq'], header = None) y = pd.merge(y, priors, on = 'word', how = 'left') # merge w/ y y = y.fillna(y['gfreq'].min()) # replace missing by argmin(alphas) alpha_i = m.transpose(m(y.gfreq)) # extract alphas del y['gfreq'] # clean up y else: sys.exit('Invalid choice') # estimate p_i yword = m.transpose(m(np.hstack((['word'], np.array(y.word))))) # word list y_i = m(y.iloc[:, 1:])
# load(m)
# pos = queue.pop()
# while pos:
#     solution.appendleft(pos)
#     pos = trail[pos.canonical()]
# return list(solution)
#####
from numpy import matrix as m

# 9x9 sudoku grid; 0 marks an empty cell.
grid = m([[5, 3, 0, 0, 7, 0, 0, 0, 0],
          [6, 0, 0, 1, 9, 5, 0, 0, 0],
          [0, 9, 8, 0, 0, 0, 0, 6, 0],
          [8, 0, 0, 0, 6, 0, 0, 0, 3],
          [4, 0, 0, 8, 0, 3, 0, 0, 1],
          [7, 0, 0, 0, 2, 0, 0, 0, 6],
          [0, 6, 0, 0, 0, 0, 2, 8, 0],
          [0, 0, 0, 4, 1, 9, 0, 0, 5],
          [0, 0, 0, 0, 8, 0, 0, 7, 9]])


def possible(row, col, n):
    """Return True when `n` appears in neither the row, the column nor the
    3x3 box of the cell (row, col) of the module-level `grid`."""
    # Row and column check.
    if any(grid[row, i] == n or grid[i, col] == n for i in range(9)):
        return False
    # Top-left corner of the 3x3 box containing the cell.
    box_row = row // 3 * 3
    box_col = col // 3 * 3
    in_box = any(
        grid[box_row + i, box_col + j] == n
        for i in range(3)
        for j in range(3)
    )
    return not in_box
set_printoptions(precision=4, threshold=None, edgeitems=None, linewidth=100, suppress=1, nanstr=None, infstr=None, formatter=None)


# With a modulo size of, for example, 1.5, values between -1.5 and 1.5 are
# left unchanged and 1.6 becomes -1.4.
def mod(num, modulo_size):
    """Wrap `num` into the interval [-modulo_size, modulo_size) and return a matrix."""
    # return multiply(m(sign(num)),m(num)%modulo_size)
    period = 2 * modulo_size
    shifted = m(num) + modulo_size
    return m(shifted % period - modulo_size)


def quantizer(left, right, options):
    """Return the midpoints of `options` equal-width cells spanning [left, right]."""
    step = 1.0 * (right - left) / options
    first = left + step / 2
    return r_[first:right:step]


def quantizise(numbers, quants):
    """Replace every entry of the matrix `numbers` by the closest value in `quants`."""
    def nearest(value):
        return min(quants, key=lambda level: abs(level - value))

    snapped = [nearest(value) for value in numbers.A1]
    return m(snapped).reshape(numbers.shape)


mod_size = 1.5
y = random.uniform(-1.5, 1.5, 9).tolist()
# x is y shifted by 0.1 plus a little gaussian noise.
x = m([i + 0.1 + random.normal(0, 0.1) for i in y]).tolist()
nx = mod(x, mod_size)
ny = mod(y, mod_size)
if 0:
    # Disabled: quantize both signals onto 70 levels.
    q = quantizer(-mod_size, mod_size, 70)
    nx = quantizise(nx, q)
    ny = quantizise(ny, q)
#A=m([[1,-1],[-2,1]])
#c=concatenate((nx,ny))
#d=c.T*A
#print A
#print d*A.I
def mod(num, modulo_size):
    """Wrap `num` into the interval [-modulo_size, modulo_size).

    `num` is converted to a matrix, shifted up by modulo_size, reduced
    modulo twice the modulo size and shifted back down. The result is
    returned as a matrix.
    """
    # return multiply(m(sign(num)),m(num)%modulo_size)
    period = 2 * modulo_size
    shifted = m(num) + modulo_size
    return m(shifted % period - modulo_size)
def quantizise(numbers, quants):
    """Replace every entry of the matrix `numbers` by the closest value in `quants`.

    Entries are visited in flattened order (numbers.A1); ties go to the
    value that comes first in `quants`. The result has the shape of
    `numbers`.
    """
    def nearest(value):
        return min(quants, key=lambda level: abs(level - value))

    snapped = [nearest(value) for value in numbers.A1]
    return m(snapped).reshape(numbers.shape)
def matrix_addition(a, b):
    """Add `a` and `b` element-wise and return the sum as nested lists."""
    left = m(a)
    right = m(b)
    total = left + right
    return total.tolist()
def __init__(self, y, X, k = 0, nocons = False, vce = "ROBUST", cluster = None):
    """Fit a ridge-penalized regression of y on X and store the results on self.

    Arguments:
        y: dependent variable; its .name and .values are read.
        X: regressors; .index, .columns and .values are read.
        k: ridge parameter. A value with a len() is treated as one
            parameter per eigenvalue of X'X, anything else as a constant.
        nocons: when False a column of ones named "Cons" is appended to X.
        vce: "ROBUST", "HC2", or anything else (None is treated as "")
            for the default covariance estimate.
        cluster: optional cluster identifiers (one or two columns).

    NOTE(review): clearNaN, t, inv, eigh, m, ss, np and pd come from
    outside this block; t is used like a transpose and m(A, b) like a
    matrix product -- confirm against the file's imports.
    """
    y, X = clearNaN(y, X)
    self.depname = y.name
    self.nocons = nocons
    self.X0 = X
    self.k = k
    # Decide between a vector and a scalar ridge parameter by probing len().
    try:
        self.klength = len(self.k)
        print("Using separated Ridge paramters for each eigenvalue")
    except:
        self.klength = 1
        print("Using constant Ridge paramter for each eigenvalue")
    self.n = len(y)
    if nocons == False:
        # Append the constant term as the last column.
        cons = pd.Series(np.ones(self.n), index = X.index, name = "Cons")
        X = pd.concat([X, cons], axis = 1)
    self.l = X.shape[1]
    self.dep = np.array(y.values, dtype = float)
    self.X = X.values
    self.Xt = t(self.X)
    self.varlist = X.columns
    if self.klength == 1:
        # Constant penalty: (X'X + k I)^-1.
        self.VarX = inv(self.Xt @ self.X + self.k * np.identity(self.l))
    elif self.klength > 1:
        # Per-eigenvalue penalty: eigen-decompose X'X, sort the eigenvalues
        # in descending order, add k to them and rebuild the inverse.
        self.lam, self.vec = eigh(self.Xt @ self.X)
        idx = self.lam.argsort()[::-1]
        self.vec = self.vec[:,idx]
        self.lam = self.lam[idx]
        self.lam1 = self.lam + self.k
        self.D = self.lam1 * np.identity(self.l)
        self.VarX = self.vec @ inv(self.D) @ self.vec.transpose()
    # Unpenalized inverse, kept for the OLS coefficients bOLS.
    self.VarXOLS = inv(self.Xt @ self.X)
    self.Px = self.X @ self.VarX @ self.Xt
    self.Mx = np.identity(self.n) - self.Px
    self.CovXy = self.Xt @ self.dep
    self.b = self.VarX @ self.CovXy
    self.bOLS = self.VarXOLS @ self.CovXy
    # Degrees of freedom are taken as trace(Mx).
    self.df = np.trace(self.Mx)
    self.u_hat = self.dep - m(self.X, self.b)
    self.u1 = self.u_hat.reshape(self.n, 1)
    self.u2 = self.u_hat**2
    self.SSR = t(self.u_hat) @ self.u_hat
    self.SE = self.SSR/float(self.df)
    self.Varb = self.SE * (self.VarX @ self.Xt @ self.X @ self.VarX) #default
    if vce == None:
        vce = ""
    if vce.upper() == "ROBUST":
        # Diagonal matrix of squared residuals scaled by n/df, then the
        # sandwich VarX X' ohm X VarX.
        self.u2 = self.u2*self.n/self.df
        self.ohm = np.zeros([self.n,self.n])
        for i in range(self.n):
            self.ohm[i][i] = self.u2[i]
        self.XOX = self.Xt @ self.ohm @ self.X
        self.Varb = self.VarX @ self.XOX @ self.VarX
    if vce.upper() == "HC2":
        # Squared residuals divided by the diagonal of Mx.
        self.u2 = self.u2/np.diag(self.Mx)
        self.ohm = np.zeros([self.n,self.n])
        for i in range(self.n):
            self.ohm[i][i] = self.u2[i]
        self.XOX = self.Xt @ self.ohm @ self.X
        # NOTE(review): this loop indexes cluster rows with regressor
        # indices (range(self.l)) and zeroes entries of the l-by-l XOX
        # matrix -- confirm this is intended.
        if np.all(cluster) != None:
            for i in range(self.l):
                for j in range(self.l):
                    if cluster.iloc[i] != cluster.iloc[j]:
                        self.XOX[i][j] = 0
        self.Varb = self.VarX @ self.XOX @ self.VarX
    if np.all(np.any(cluster) != None):
        if np.all(np.all(cluster) != None):
            print("Cluster ID Retrieved!")
            # A cluster object without a second dimension counts as one-way.
            try:
                clsize = cluster.shape[1]
            except:
                clsize = 1
            if clsize == 1:
                ncl = len(np.unique(cluster))
                # Scaled outer product of the residuals.
                self.o1 = self.u1 @ t(self.u1)*ncl/(ncl-1)*(self.n-1)/self.df
                self.ohm = np.zeros(self.o1.shape)
                # Keep only the entries of observation pairs in the same cluster.
                for i in range(self.n):
                    for j in range(self.n):
                        if np.all(cluster.iloc[i] == cluster.iloc[j]):
                            self.ohm[i][j] = self.o1[i][j]
            elif clsize == 2:
                print("Twoway clustering: ", cluster.columns)
                ncl1 = len(np.unique(cluster.iloc[:,0]))
                ncl2 = len(np.unique(cluster.iloc[:,1]))
                ncl12 = len(np.unique(cluster, axis = 0))
                self.o1 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl1/(ncl1-1)
                self.o2 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl2/(ncl2-1)
                self.o12 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl12/(ncl12-1)
                #reghdfe in STATA did not adjust for cluster df properly
                #This version is a better adjustment to cluster df
                self.ohm1 = np.zeros(self.o1.shape)
                self.ohm2 = np.zeros(self.o2.shape)
                self.ohm12 = np.zeros(self.o12.shape)
                print("Retrieving First VCE")
                for i in range(self.n):
                    for j in range(self.n):
                        if cluster.iloc[i,0] == cluster.iloc[j,0]:
                            self.ohm1[i][j] = self.o1[i][j]
                print("Retrieving Second VCE")
                for i in range(self.n):
                    for j in range(self.n):
                        if cluster.iloc[i,1] == cluster.iloc[j,1]:
                            self.ohm2[i][j] = self.o2[i][j]
                print("Retrieving Third VCE")
                if ncl12 == self.n:
                    # Every row is its own (id1, id2) combination: only the
                    # diagonal is kept.
                    d1 = np.diag(self.o12)
                    self.ohm12 = d1 * np.identity(self.n)
                else:
                    for i in range(self.n):
                        for j in range(self.n):
                            if np.any(cluster.iloc[i] == cluster.iloc[j]):
                                self.ohm12[i][j] = self.o12[i][j]
                # First + second - third term.
                self.ohm = self.ohm1 + self.ohm2 - self.ohm12
            else:
                print("Supports up to two way clustering only!")
        else:
            # Some cluster ids are missing: use the n/df-scaled diagonal
            # estimate instead.
            print("Incomplete Cluster ID!")
            print("HC1 Assumed")
            self.u2 = self.u2*self.n/self.df
            self.ohm = np.zeros([self.n,self.n])
            for i in range(self.n):
                self.ohm[i][i] = self.u2[i]
        self.XOX = self.Xt @ self.ohm @ self.X
        self.Varb = self.VarX @ self.XOX @ self.VarX
    self.Varb1 = self.Varb
    if np.any(np.diag(self.Varb) < 0):
        print("Non Positive Semi-Definite VCE Matrix! Cameron, Gelbach & Miller (2011) Transformation Used")
        # Clip negative eigenvalues at zero and rebuild the matrix.
        lb, vb = eigh(self.Varb)
        idx = lb.argsort()[::-1]
        vb = vb[:,idx]
        lb = lb[idx]
        for i in range(len(lb)):
            lb[i] = max(0, lb[i])
        diag = lb * np.identity(len(lb))
        self.Varb = vb @ diag @ t(vb)
    self.SEb = np.sqrt(np.diag(self.Varb))
    if nocons == False:
        # With a constant, ESS uses fitted values minus the mean of y.
        self.ypred = m(self.X, self.b) - np.mean(self.dep)
        self.ESS = t(self.ypred) @ self.ypred
    if nocons == True:
        self.ESS = t(m(self.X, self.b)) @ m(self.X, self.b)
    self.TSS = self.SSR + self.ESS
    self.R2 = self.ESS/self.TSS
    self.AR2 = 1 - (1-self.R2)*float(self.n-1)/float(self.df)
    self.ts = np.zeros(len(self.b))
    self.pvalue = np.zeros(len(self.b))
    # t statistic and two-sided p-value per coefficient.
    for j in range(len(self.b)):
        self.ts[j] = self.b[j]/self.SEb[j]
        self.pvalue[j] = 2*ss.t.cdf(-abs(self.ts[j]), self.df)
    self.Mx = (np.identity(self.n) - self.Px)
    self.SSR1 = t(self.dep) @ self.Mx @ self.dep
import numpy as np
from numpy import array as a
from numpy import matrix as m
# `numpy.mat` was removed in NumPy 2.0; `asmatrix` is the same constructor
# and exists in both old and new releases.
from numpy import asmatrix as mat

'''
Problem set 1
Do problems: 23 and 28 from section 1.2
'''

# Matrix times the all-ones column vector.
A = mat('[1 2 3;2 5 2;6 -3 1]')
A * m([1, 1, 1]).T

# Matrix times a column vector.
A = mat('1 1 1;1 1 1;1 1 0')
v = mat('4 5 6').T
A * v

# The identity leaves the vector unchanged.
I = np.identity(3)
I
I * v

# First column of a matrix.
A = mat('1 2 3;4 5 6;7 8 9')
A[:, 0]

# A * x computed at once and row by row.
A = mat('1 2 3; 2 5 2;6 -3 1')
x = mat('0;0;2')
b = A * x
# Each row-times-column product is a 1x1 matrix; .item() extracts the
# scalar so the three results form a valid 2-D matrix (a list of 1x1
# matrices is 3-D and np.matrix rejects it with ValueError).
m([(A[0, :] * x).item(), (A[1, :] * x).item(), (A[2, :] * x).item()])
def __init__(self, y, X, nocons = False, vce = "ROBUST", cluster = None, gls = False):
    """Fit an OLS regression of y on X and store the results on self.

    Arguments:
        y: dependent variable; its .name and .values are read.
        X: regressors; .index, .columns and .values are read.
        nocons: when False a column of ones named "Cons" is appended to X.
        vce: "ROBUST", "HC2", or anything else (None is treated as "")
            for the default covariance estimate.
        cluster: optional cluster identifiers (one or two columns).
        gls: when True the coefficients are re-estimated in a
            "One step GLS" step using self.XOX and self.ohm.

    NOTE(review): clearNaN, t, inv, eigh, m, ss, np and pd come from
    outside this block; t is used like a transpose and m(A, b) like a
    matrix product -- confirm against the file's imports.
    NOTE(review): self.XOX and self.ohm are only assigned in the
    ROBUST/HC2/cluster branches, so gls=True appears to require one of
    them -- confirm.
    """
    y, X = clearNaN(y, X) # get rid of null obs
    self.gls = gls
    self.depname = y.name
    self.nocons = nocons
    self.n = len(y)
    if nocons == False:
        # Append the constant term as the last column.
        cons = pd.Series(np.ones(self.n), index = X.index, name = "Cons")
        X = pd.concat([X, cons], axis = 1)
    self.l = X.shape[1]
    self.dep = np.array(y.values, dtype = float)
    self.X = X.values
    self.Xt = t(self.X)
    self.varlist = X.columns
    self.VarX = inv(self.Xt @ self.X) # the main inverse
    self.Px = self.X @ self.VarX @ self.Xt # Px matrix from Davidson Mackinnon
    self.Mx = np.identity(self.n) - self.Px # Mx matrix ""
    self.CovXy = self.Xt @ self.dep # Xty
    self.b = self.VarX @ self.CovXy # combining the inverse and the Xty
    self.df = np.trace(self.Mx) # degrees of freedom = trace(Mx)
    self.u_hat = self.dep - m(self.X, self.b) # get residuals
    self.u1 = self.u_hat.reshape(len(self.u_hat), 1) # data organisation
    self.u2 = self.u_hat**2 # get squared residuals
    self.SSR = t(self.u_hat) @ self.u_hat # get SSR
    self.SE = self.SSR/float(self.df) # get sigma_u estimate
    self.Varb = self.SE * self.VarX #default
    #Heteroskedasticity/Clustered/Serial Correlation works below
    if vce == None:
        vce = ""
    if vce.upper() == "ROBUST":
        # Diagonal matrix of squared residuals scaled by n/df, then the
        # sandwich VarX X' ohm X VarX.
        self.u2 = self.u2*self.n/self.df
        self.ohm = np.zeros([self.n,self.n])
        for i in range(self.n):
            self.ohm[i][i] = self.u2[i]
        self.XOX = self.Xt @ self.ohm @ self.X
        self.Varb = self.VarX @ self.XOX @ self.VarX
    if vce.upper() == "HC2":
        # Squared residuals divided by the diagonal of Mx.
        self.u2 = self.u2/np.diag(self.Mx)
        self.ohm = np.zeros([self.n,self.n])
        for i in range(self.n):
            self.ohm[i][i] = self.u2[i]
        self.XOX = self.Xt @ self.ohm @ self.X
        self.Varb = self.VarX @ self.XOX @ self.VarX
    if np.all(np.any(cluster) != None):
        if np.all(np.all(cluster) != None):
            print("Cluster ID Retrieved!")
            # A cluster object without a second dimension counts as one-way.
            try:
                clsize = cluster.shape[1]
            except:
                clsize = 1
            if clsize == 1:
                ncl = len(np.unique(cluster))
                # Scaled outer product of the residuals.
                self.o1 = self.u1 @ t(self.u1)*ncl/(ncl-1)*(self.n-1)/self.df
                self.ohm = np.zeros(self.o1.shape)
                # Keep only the entries of observation pairs in the same cluster.
                for i in range(self.n):
                    for j in range(self.n):
                        if np.all(cluster.iloc[i] == cluster.iloc[j]):
                            self.ohm[i][j] = self.o1[i][j]
            elif clsize == 2:
                print("Twoway clustering: ", cluster.columns)
                ncl1 = len(np.unique(cluster.iloc[:,0]))
                ncl2 = len(np.unique(cluster.iloc[:,1]))
                ncl12 = len(np.unique(cluster, axis = 0))
                self.o1 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl1/(ncl1-1)
                self.o2 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl2/(ncl2-1)
                self.o12 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl12/(ncl12-1)
                self.ohm1 = np.zeros(self.o1.shape)
                self.ohm2 = np.zeros(self.o2.shape)
                self.ohm12 = np.zeros(self.o12.shape)
                print("Retrieving First VCE")
                for i in range(self.n):
                    for j in range(self.n):
                        if cluster.iloc[i,0] == cluster.iloc[j,0]:
                            self.ohm1[i][j] = self.o1[i][j]
                print("Retrieving Second VCE")
                for i in range(self.n):
                    for j in range(self.n):
                        if cluster.iloc[i,1] == cluster.iloc[j,1]:
                            self.ohm2[i][j] = self.o2[i][j]
                print("Retrieving Third VCE")
                if ncl12 == self.n:
                    # Every row is its own (id1, id2) combination: only the
                    # diagonal is kept.
                    d1 = np.diag(self.o12)
                    self.ohm12 = d1 * np.identity(self.n)
                else:
                    for i in range(self.n):
                        for j in range(self.n):
                            if np.any(cluster.iloc[i] == cluster.iloc[j]):
                                self.ohm12[i][j] = self.o12[i][j]
                # First + second - third term.
                self.ohm = self.ohm1 + self.ohm2 - self.ohm12
            else:
                print("Supports up to two way clustering only!")
        else:
            # Some cluster ids are missing: use the n/df-scaled diagonal
            # estimate instead.
            print("Incomplete Cluster ID!")
            print("HC1 Assumed")
            self.u2 = self.u2*self.n/self.df
            self.ohm = np.zeros([self.n,self.n])
            for i in range(self.n):
                self.ohm[i][i] = self.u2[i]
        self.XOX = self.Xt @ self.ohm @ self.X
        self.Varb = self.VarX @ self.XOX @ self.VarX
    self.Varb1 = self.Varb
    if np.any(np.diag(self.Varb) < 0):
        print("Non Positive Semi-Definite VCE Matrix! Cameron, Gelbach & Miller (2011) Transformation Used")
        # Clip negative eigenvalues at zero and rebuild the matrix.
        lb, vb = eigh(self.Varb)
        idx = lb.argsort()[::-1]
        vb = vb[:,idx]
        lb = lb[idx]
        for i in range(len(lb)):
            lb[i] = max(0, lb[i])
        diag = lb * np.identity(len(lb))
        self.Varb = vb @ diag @ t(vb)
    if self.gls == True:
        # Re-estimate the coefficients with ohm as the weighting matrix.
        print("One step GLS")
        self.VarX = inv(self.XOX)
        self.CovXy = self.Xt @ self.ohm @ self.dep
        self.b = self.VarX @ self.CovXy
    self.SEb = np.sqrt(np.diag(self.Varb))
    if nocons == False:
        # With a constant, ESS uses fitted values minus the mean of y.
        self.ypred = m(self.X, self.b) - np.mean(self.dep)
        self.ESS = t(self.ypred) @ self.ypred
    if nocons == True:
        self.ESS = t(m(self.X, self.b)) @ m(self.X, self.b)
    self.TSS = self.SSR + self.ESS
    self.R2 = self.ESS/self.TSS
    self.AR2 = 1 - (1-self.R2)*float(self.n-1)/float(self.df)
    self.ts = np.zeros(len(self.b))
    self.pvalue = np.zeros(len(self.b))
    # t statistic and two-sided p-value per coefficient.
    for j in range(len(self.b)):
        self.ts[j] = self.b[j]/self.SEb[j]
        self.pvalue[j] = 2*ss.t.cdf(-abs(self.ts[j]), self.df)
    self.Mx = (np.identity(self.n) - self.Px)
    self.SSR1 = t(self.dep) @ self.Mx @ self.dep
    self.settest()
import numpy as np
from numpy.linalg import inv
from numpy import matmul as m

# Random 20x20 design matrix and a random integer target in 0..9.
X = np.random.standard_normal((20, 20))
y = np.int32(10 * (np.random.random(20)))

# Normal equation: theta = (X^T X)^-1 X^T y
Xt = X.transpose()
theta = m(m(inv(m(Xt, X)), Xt), y)

print("Matrix y: ", y)
print("Matrix x: ", X)
print("Matrix Theta: ", theta)