def __init__(self, filename, year=None, encoding='GBK'):
    """Open a Stata file and build the label lookup tables.

    :param str filename: name of the Stata file to read
    :param year: survey year; converted with ``int`` when given, kept as
        ``None`` when omitted
    :param str encoding: encoding handed to ``Statadata``, ``'GBK'`` by default
    """
    self.cgss_db = CgssDatabase()
    self.stata_object = Statadata(filename, encoding=encoding)
    self.stata_file = self.stata_object.stata_file
    # int(None) raises TypeError, so the declared default must be passed
    # through untouched instead of being converted.
    self.year = None if year is None else int(year)
    # variable name -> variable label
    self.variables_labels_dict = dict(
        zip(self.stata_object.variables,
            self.stata_object.variables_labels))
    # variable name -> entry of variables_values_labels for that variable
    self.variables_values_labels_dict = dict(
        zip(self.stata_object.variables,
            self.stata_object.variables_values_labels))
    # value-label dictionary, taken from the values_labels attribute
    self.values_labels_dict = self.values_labels
# Returns the R value 'slm_obj$adj.r.squared' fetched through
# self.robj.get_var (presumably the adjusted R-squared of a fitted model;
# confirm against the R side).
# NOTE(review): any decorator for this method lies outside the visible source.
def adj_Rsquared(self):
    return self.robj.get_var('slm_obj$adj.r.squared')

# Returns the R value 'slm_obj$cov.unscaled' fetched through self.robj.get_var.
@property
def cov(self):
    return self.robj.get_var('slm_obj$cov.unscaled')

# Returns the R value 'lm_obj$qr$qr' fetched through self.robj.get_var.
# Unlike the accessors above, this one reads from 'lm_obj', not 'slm_obj'.
@property
def qr(self):
    return self.robj.get_var('lm_obj$qr$qr')

# Manual check when run as a script: reads a Stata file, keeps four columns,
# hands them to an RThread and builds an Rreg with lwage as yvar and
# educ/exper/tenure as xvar.
if __name__ == '__main__':
    rthread = RThread()
    stata_file = r'D:\data\test\wage1.dta'
    stdata = Statadata(stata_file)
    mdata = stdata.read()
    rdata = mdata[['lwage','educ','exper','tenure']]
    rthread.create_vars(rdata)
    lm = Rreg(robj=rthread,xvar=['educ','exper','tenure'],yvar='lwage')
    # NOTE(review): the triple-quoted string below wraps the print calls, so
    # they do not run; its closing quotes are not part of the visible source.
    '''
    print(lm.coefficients)
    print(lm.fstatistic)
    print(lm.sigma)
    print(lm.df)
    print(lm.qr)
    print(lm.Rsquared)
    print(lm.adj_Rsquared)
    print(lm.cov)
    print(lm.rank)
class CgssStataSheet:
    """Read a CGSS Stata (.dta) data file and insert its rows into the database.

    :param str filename: name of the file to read
    :param int year: survey year stored on every record; kept as ``None``
        when omitted
    :param str encoding: file encoding, ``'GBK'`` by default

    :var dict variables_labels_dict: variable name -> variable label
    :var dict variables_values_labels_dict: variable name -> name of the
        value-label set attached to that variable
    :var dict values_labels_dict: value-label set name -> {code: label}
    """

    def __init__(self, filename, year=None, encoding='GBK'):
        self.cgss_db = CgssDatabase()
        self.stata_object = Statadata(filename, encoding=encoding)
        self.stata_file = self.stata_object.stata_file
        # int(None) raises TypeError, so the declared default must be passed
        # through untouched instead of being converted.
        self.year = None if year is None else int(year)
        # variable name -> variable label
        self.variables_labels_dict = dict(
            zip(self.stata_object.variables,
                self.stata_object.variables_labels))
        # variable name -> name of its value-label set
        self.variables_values_labels_dict = dict(
            zip(self.stata_object.variables,
                self.stata_object.variables_values_labels))
        # value-label set name -> {code: label}
        self.values_labels_dict = self.values_labels

    def insert(self):
        """Insert every row of the Stata file as one record in the collection.

        Each record maps ``'year'`` to ``self.year`` and every variable name
        to ``{'label', 'serial_number', 'value': {'value', 'label'}}``, where
        ``serial_number`` is the 1-based position of the variable in the row.

        :raises TypeError: when a cell holds a value of an unsupported type
        """
        data = self.stata_object.read()
        data_rows = data.shape[0]
        print(data_rows)

        collection = self.cgss_db.collection
        # Collection.insert no longer exists in PyMongo 4; prefer insert_one
        # and fall back to insert for collections that only offer the old API.
        insert_record = (getattr(collection, 'insert_one', None)
                         or collection.insert)

        for i in range(data_rows):
            record = {'year': self.year}
            row_data = data.iloc[i]
            # ind is the row index label, i.e. the variable name
            for j, ind in enumerate(row_data.index, start=1):
                label_set_name = self.variables_values_labels_dict[ind]
                value_labels = self.values_labels_dict.get(
                    label_set_name, None)
                value, value_label = self._normalize_value(
                    ind, row_data[ind], value_labels)
                record[ind] = {'label': self.variables_labels_dict[ind],
                               'serial_number': j,
                               'value': {
                                   'value': value,
                                   'label': value_label
                               }}
            print(i)
            print('record', record)
            insert_record(record)

    @staticmethod
    def _normalize_value(name, value, value_labels):
        """Convert one cell into a ``(value, value_label)`` pair of plain types."""
        if isinstance(value, (np.float32, np.float64, np.integer)):
            if np.isnan(value):
                return None, None
            if value_labels is None:
                # NOTE(review): numeric values of variables without a
                # value-label set are stored as None, as before; confirm
                # that dropping them is intended.
                return None, None
            # np.int was only an alias of the builtin and is gone from
            # current NumPy releases.
            whole = int(value)
            if value == whole:
                return whole, value_labels.get(whole, None)
            return float(value), None
        if isinstance(value, str):
            if re.match(r'^\s*$', value) is not None:
                return None, None
            return value, None
        # pd.tslib is not available in current pandas; Timestamp and NaT are
        # reachable from the top-level namespace in every version.
        if isinstance(value, (pd.Timestamp, datetime)):
            if value is pd.NaT:
                return None, None
            # keep only the date part of 'YYYY-MM-DD HH:MM:SS'
            return re.split(' ', str(value))[0], None
        print(name)
        print(value, type(value))
        raise TypeError(
            'unsupported value type %s for variable %r' % (type(value), name))

    @property
    def values_labels(self):
        """Return the value-label sets with every code converted to ``int``."""
        labels = self.stata_object.values_labels
        new_labels_dict = dict()
        for var in labels:
            # int() keeps negative codes intact; routing them through an
            # unsigned NumPy integer overflows on current NumPy.
            new_labels_dict[var] = {
                int(code): label for code, label in labels[var].items()}
        return new_labels_dict