def age_design(indices):
    """Build the design matrix for the rows of ``hrdat`` picked by ``indices``.

    Columns are stacked in this order: dummy columns for ``sex``, ``educ``
    and ``PTFT``, then ``age`` and ``age ** 2``; ``sm.add_constant`` is then
    applied with ``prepend=True``.

    Reads the module-level ``hrdat`` and ``n`` (``n`` is the length used to
    reshape the age data into a single column before row selection).
    """
    # Only columns from index 2 onward of each categorical expansion are kept
    # (presumably dropping the original column and the reference level --
    # confirm against the statsmodels version in use).
    factor_blocks = [
        sm.categorical(hrdat[key][indices])[:, 2:]
        for key in ('sex', 'educ', 'PTFT')
    ]
    age_col = hrdat['age'].reshape(n, 1)[indices, :]
    age_sq_col = (hrdat['age']**2).reshape(n, 1)[indices, :]
    tmp = np.hstack(tuple(factor_blocks) + (age_col, age_sq_col))
    return sm.add_constant(tmp, prepend=True)
def age_design(indices):
    """Return the age-model design matrix restricted to ``indices``.

    The result is ``sm.add_constant(..., prepend=True)`` applied to the
    horizontal stack of: ``sex`` dummies, ``educ`` dummies, ``PTFT`` dummies,
    ``age`` and squared ``age``, each limited to the requested rows.

    Depends on the module-level ``hrdat`` data and on ``n``, which is used
    to reshape the age values into an ``(n, 1)`` column.
    """
    # Categorical expansions: keep columns from index 2 onward only.
    sex_dummies = sm.categorical(hrdat["sex"][indices])[:, 2:]
    educ_dummies = sm.categorical(hrdat["educ"][indices])[:, 2:]
    ptft_dummies = sm.categorical(hrdat["PTFT"][indices])[:, 2:]

    # Continuous terms: reshape to a column first, then select the rows.
    age = hrdat["age"].reshape(n, 1)[indices, :]
    age_squared = (hrdat["age"] ** 2).reshape(n, 1)[indices, :]

    design = np.hstack(
        (sex_dummies, educ_dummies, ptft_dummies, age, age_squared)
    )
    return sm.add_constant(design, prepend=True)
# Simplest model: weighted least squares of log(hrwage) on a constant and a
# sex == 2 indicator, using A_ERNLWT as the weights.
X1 = hrdat["sex"] == 2
X1 = sm.add_constant(X1, prepend=True)
model1 = sm.WLS(np.log(hrdat["hrwage"]), X1, weights=hrdat["A_ERNLWT"])
results1 = model1.fit()
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(results1.summary())

# Pre-defining model matrix components for more complicated models
# dat_mat is DATa model MATrices: variable name -> block of design columns
# dat_names: variable name -> list of column labels for that block
n = len(hrdat)
dat_mat = {}
dat_names = {}
factor_vars = ["sex", "educ", "PTFT", "ind", "occ", "marstat", "GEDIV", "race", "hispanic", "disabled"]
for name in factor_vars:
    # With dictnames=True, categorical returns the expanded matrix together
    # with a dict whose values are the level names.
    dat_mat[name], dat_names[name] = sm.categorical(hrdat[name], dictnames=True)
    # Keep only the columns from index 2 onward.
    dat_mat[name] = dat_mat[name][:, 2:]
dat_mat["age"] = hrdat["age"].reshape(n, 1)
dat_mat["age^2"] = (hrdat["age"] ** 2).reshape(n, 1)
dat_mat["const"] = np.ones((n, 1))
dat_names["age"] = ["age"]
dat_names["age^2"] = ["age^2"]
dat_names["const"] = ["const"]
for name in factor_vars:
    # Drop the first (sorted) level so the labels line up with the retained
    # dummy columns, then prefix each remaining level with the variable name.
    fact_names = sorted(dat_names[name].values())[1:]
    dat_names[name] = [name + str(val) for val in fact_names]

# helper function to spit out design matrix and names
X1 = hrdat['sex']==2 X1 = sm.add_constant(X1, prepend=True) model1 = sm.WLS(np.log(hrdat['hrwage']), X1, weights = hrdat['A_ERNLWT']) results1 = model1.fit() print results1.summary() #More complicated model, log(hrwage)~sex+educ+age+PTFT n = len(hrdat) logwage = np.log(hrdat['hrwage']) w = hrdat['A_ERNLWT'] X2 = np.hstack((sm.categorical(hrdat['sex'])[:,2:], sm.categorical(hrdat['educ'])[:,2:], hrdat['age'].reshape(n,1), sm.categorical(hrdat['PTFT'])[:,2:])) X2 = sm.add_constant(X2, prepend=True) model2 = sm.WLS(logwage, X2, weights = w) results2 = model2.fit() print results2.summary() #Now include ind and occ (industry and occupation codings) X2_5 = np.hstack((sm.categorical(hrdat['sex'])[:,2:], sm.categorical(hrdat['educ'])[:,2:],
def getInstructionIdentifier(array_str, array_nr, str): for x in range(0, array_str.size - 1): if array_str[x] == str: return array_nr[x] print("Error could not find instruction") return -100 def column(matrix, i): return [row[i] for row in matrix] #We make all strings to an identification integer #X_Dictionary_str = numpy.genfromtxt("input/input_train.csv",dtype=str) X_Dictionary_str = numpy.genfromtxt("input/input_Dict.csv", dtype=str) X_Dictionary = categorical(X_Dictionary_str, drop=True) X_Dictionary = X_Dictionary.argmax(1) #X_train_str = numpy.genfromtxt("input/input_train.csv",dtype=str) #X_train = categorical(X_train_str, drop=True) #X_train = X_train.argmax(1) X_train_str = X_Dictionary_str X_train = X_Dictionary #Target power and performance values Y_train = numpy.loadtxt("input/input_target.csv", delimiter=",") #Test X_test_str = numpy.genfromtxt("input/input_test.csv", dtype=str) X_test = numpy.zeros(X_test_str.size) for x in range(0, X_test_str.size - 1):
def getInstructionIdentifier(array_str, array_nr, str):
    """Return the ``array_nr`` entry paired with ``str`` in ``array_str``.

    Every position of ``array_str`` is scanned, including the last one, and
    ``array_nr[x]`` is returned for the first position ``x`` where
    ``array_str[x] == str``.  When nothing matches, two diagnostic lines are
    printed and the function falls through, returning ``None``.

    The parameter name ``str`` shadows the builtin; it is kept unchanged so
    existing callers keep working.
    """
    x = None  # keeps the diagnostic below printable when array_str is empty
    # range(array_str.size), not size - 1: the last entry must be searchable.
    for x in range(array_str.size):
        if array_str[x] == str:
            return array_nr[x]
    # NOTE(review): the source indentation was lost; this diagnostic is assumed
    # to sit after the loop, next to the error message -- confirm.
    # %-formatting prints the same text on Python 2 and 3.
    print("instruction number is  %s" % (x,))
    print("Error could not find instruction")


def column(matrix, i):
    """Return element ``i`` of every row of ``matrix`` as a list."""
    return [row[i] for row in matrix]


#We make all strings to an identification integer
X_Dictionary_str = numpy.genfromtxt("input/input_train.csv", dtype=str)
X_Dictionary = categorical(X_Dictionary_str, drop=True)
# argmax over axis 1 collapses each row to the index of its largest column.
X_Dictionary = X_Dictionary.argmax(1)
X_train_str = numpy.genfromtxt("input/input_train.csv", dtype=str)
X_train = categorical(X_train_str, drop=True)
X_train = X_train.argmax(1)

#Target power and performance values
Y_train = numpy.loadtxt("input/input_target.csv", delimiter=",")

#Test
X_test_str = numpy.genfromtxt("input/input_test.csv", dtype=str)
X_test = numpy.zeros(X_test_str.size)
# Fill every slot, including the last; the previous bound of size - 1 left the
# final entry at its initial 0.0.
for x in range(X_test_str.size):
    X_test[x] = getInstructionIdentifier(X_train_str, X_train, X_test_str[x])
#Float