def convert_to_df():
    """
    Extract the important features from each Stata file and merge them
    into one DataFrame per year.
    :return:
    """
    # Output CSV names for each year's merged DataFrame
    data_merge_file_name = [
        'read_json_output_file/2010.csv',
        'read_json_output_file/2012.csv',
        'read_json_output_file/2014.csv',
        'read_json_output_file/2016.csv'
    ]
    data_path = read_json()  # read the stored paths of the Stata files
    for i in range(len(data_path)):
        temp = []
        for j in range(len(data_path[i])):
            for key in data_path[i][j].keys():
                stata_data_path = key  # path of one table for this year
                columns_name = data_path[i][j][key]  # important feature columns of that table
                print(columns_name)
                stata_data = StataReader(
                    stata_data_path, convert_categoricals=False)  # read the Stata file
                pd_important_feature = pd.DataFrame(
                    stata_data.read())[columns_name]  # convert to DataFrame, keep only the important features
                temp.append(pd_important_feature)
        data_merge(temp, data_merge_file_name[i])  # merge and write the CSV file
        print('-------------------------')
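# The loops in convert_to_df imply the shape of read_json()'s return value:
# a list with one entry per year, each entry a list of single-key dicts that
# map a Stata file path to the feature columns to keep. A hypothetical
# illustration (the paths and column names below are made up, not project data):
data_path_example = [
    [   # year 2010
        {'stata_files/2010/adult.dta': ['pid', 'income']},
        {'stata_files/2010/family.dta': ['fid', 'familysize']},
    ],
    [   # year 2012
        {'stata_files/2012/adult.dta': ['pid', 'income']},
    ],
]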
def test_read_dta1(self):
    reader_114 = StataReader(self.dta1_114)
    parsed_114 = reader_114.read()
    reader_117 = StataReader(self.dta1_117)
    parsed_117 = reader_117.read()
    # Pandas uses np.nan as missing value.
    # Thus, all columns will be of type float, regardless of their name.
    expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
                         columns=['float_miss', 'double_miss', 'byte_miss',
                                  'int_miss', 'long_miss'])

    # this is an oddity as really the nan should be float64, but
    # the casting doesn't fail so need to match stata here
    expected['float_miss'] = expected['float_miss'].astype(np.float32)

    tm.assert_frame_equal(parsed_114, expected)
    tm.assert_frame_equal(parsed_117, expected)
def test_data_method(self):
    # Minimal testing of legacy data method
    reader_114 = StataReader(self.dta1_114)
    with warnings.catch_warnings(record=True) as w:
        parsed_114_data = reader_114.data()

    reader_114 = StataReader(self.dta1_114)
    parsed_114_read = reader_114.read()
    tm.assert_frame_equal(parsed_114_data, parsed_114_read)
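# The two tests above exercise the legacy StataReader.read()/.data() interface.
# In current pandas the public entry point is pd.read_stata, which wraps the
# same reader. A minimal equivalence sketch; 'example.dta' is a hypothetical
# file name, not one of the fixtures used in the tests.
import pandas as pd
from pandas.io.stata import StataReader

df_public = pd.read_stata('example.dta', convert_categoricals=False)
with StataReader('example.dta', convert_categoricals=False) as reader:
    df_legacy = reader.read()
pd.testing.assert_frame_equal(df_public, df_legacy)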
def read_stata_file(dir, file_name):
    """
    :param dir: directory holding the Stata files
    :param file_name: name of the .dta file to read
    :return: the data as a DataFrame, plus the list of labeled column names
    """
    stata_data = StataReader(dir + file_name, convert_categoricals=False)
    columns_list = list(stata_data.value_labels().keys())  # column (value-label) names
    print(file_name)
    print(len(columns_list))
    print(columns_list[0:10])
    print('---------------')
    return pd.DataFrame(stata_data.read()), columns_list
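# A minimal usage sketch for read_stata_file; the directory and file name are
# hypothetical placeholders, not paths from this project.
df, labeled_cols = read_stata_file('data/', 'survey_2010.dta')
print(df.shape)          # rows x columns of the full table
print(labeled_cols[:5])  # first few value-label names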
# -*- coding:utf-8 -*-
import pandas as pd
from pandas.io.stata import StataReader

infilename = r"merge_2.dta"
outfile = 'out.csv'

if input('Are you sure you want to clear the output file >>' + outfile + '<< (y/n)? ') == 'y':
    open(outfile, 'w').close()

stata_data = StataReader(infilename, convert_categoricals=False)
data = stata_data.read()
col_n = ['stkcd', 'time', 'rt_year', 'lnme', 'lev', 'size']
data = pd.DataFrame(data, columns=col_n)
data = data.dropna(axis=0)


def output(string):
    with open(outfile, 'a') as f:
        f.write(string)


def slice(df_year):
    # df_year is already sorted in ascending order
    l_stk = df_year.iloc[:int(len(df_year) * 0.3)]['stkcd'].tolist()
    m_stk = df_year.iloc[int(len(df_year) * 0.3):int(len(df_year) * 0.7)]['stkcd'].tolist()
    h_stk = df_year.iloc[int(len(df_year) * 0.7):]['stkcd'].tolist()
    return h_stk, m_stk, l_stk
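# A minimal sketch of how slice() partitions one year of data into bottom-30%,
# middle-40%, and top-30% groups by the sort column. The toy frame below is a
# hypothetical example, not data from merge_2.dta.
demo = pd.DataFrame({'stkcd': list('abcdefghij'),
                     'size': range(10)}).sort_values('size')
h_stk, m_stk, l_stk = slice(demo)
print(l_stk)  # ['a', 'b', 'c']       bottom 30%
print(m_stk)  # ['d', 'e', 'f', 'g']  middle 40%
print(h_stk)  # ['h', 'i', 'j']       top 30%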
import os

import pandas as pd
from pandas.io.stata import StataReader

from paths import paths  # project-local module holding data paths

if not os.path.exists(paths.data):
    os.mkdir(paths.data)

'''Load and Cache Datasets
-----------------------

Notes:
- Ensures no overlap in id
- Trims observations with any labor income over $300,000 (U.S., 2014)
'''
# --------------------------------------------------------------------
print("Loading PSID")
reader = StataReader(paths.psid)
psid = reader.read(convert_dates=False, convert_categoricals=False)
psid = psid.dropna(subset=['id']).set_index('id')

# Trimming
inc = psid.filter(regex='^inc_labor[0-9][0-9]')
psid = psid.loc[psid.male == 0]
psid = psid.loc[psid.black == 1]
psid = psid.loc[((inc < inc.quantile(0.90)) | (inc.isnull())).all(axis=1)]

# Interpolating
plong = pd.wide_to_long(psid[inc.columns].reset_index(), ['inc_labor'],
                        i='id', j='age').sort_index()
plong = plong.interpolate(limit=5)
pwide = plong.unstack()
pwide.columns = pwide.columns.droplevel(0)
pwide.columns = ['{}{}'.format('inc_labor', a) for a in pwide.columns]
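# A small round-trip sketch of the wide -> long -> wide reshape used above,
# on a hypothetical two-person frame; the column names follow the inc_laborNN
# pattern the PSID code filters on, but the values are made up.
toy = pd.DataFrame({'id': [1, 2],
                    'inc_labor30': [10.0, None],
                    'inc_labor31': [12.0, 20.0]})
long = pd.wide_to_long(toy, ['inc_labor'], i='id', j='age').sort_index()
long = long.interpolate(limit=5)  # fill short gaps along the long axis
wide = long.unstack()
wide.columns = ['{}{}'.format('inc_labor', a)
                for a in wide.columns.droplevel(0)]
print(wide)  # back to one inc_laborNN column per age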
"""
Desc: This code selects the IPW variables for specific ABC outcomes.
We only do this for the pooled sample to increase power.
We use a linear probability model for this.
We select the 3 variables that minimize the BIC.
"""

import pandas as pd
from pandas.io.stata import StataReader
import numpy as np
import statsmodels.api as sm
from patsy import dmatrices
import itertools
from paths import paths

# import data
reader = StataReader(paths.abccare)
data = reader.read(convert_dates=False, convert_categoricals=False)
data = data.set_index('id')
data = data.sort_index()
data.drop(data.loc[(data.RV == 1) & (data.R == 0)].index, inplace=True)

# bring in outcomes files, and find the ABC-only/CARE-only ones
outcomes = pd.read_csv(paths.outcomes, index_col='variable')
only_abc = outcomes.loc[outcomes.only_abc == 1].index
only_care = outcomes.loc[outcomes.only_care == 1].index

bank = pd.read_csv(paths.controls)
ipwvars = np.unique(outcomes.loc[~outcomes.ipw_var.isnull(), 'ipw_var'].values)

# generate the list of all possible models
models = itertools.chain.from_iterable(
    [itertools.combinations(bank.loc[:, 'variable'], 3)])
models = list(models)
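# A hedged sketch of the selection step the docstring describes: fit a linear
# probability model for each 3-variable combination and keep the combination
# with the lowest BIC. The outcome name 'attrited' is a hypothetical
# placeholder; the excerpt above does not show which indicator is modeled.
def select_ipw_model(data, models, outcome='attrited'):
    best_bic, best_vars = np.inf, None
    for combo in models:
        sub = data[[outcome] + list(combo)].dropna()  # complete cases only
        if sub.empty:
            continue
        X = sm.add_constant(sub[list(combo)])
        fit = sm.OLS(sub[outcome], X).fit()  # linear probability model
        if fit.bic < best_bic:
            best_bic, best_vars = fit.bic, combo
    return best_vars, best_bic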