Example #1
import pandas as pd
from pandas.io.stata import StataReader


def convert_to_df():
    """
    将stata文件中的重要特征抽取出来,并合成一张dataframe表格
    :return:
    """
    # 装载每个年份合并之后的DataFrame的文件名
    data_merge_file_name = [
        'read_json_output_file/2010.csv', 'read_json_output_file/2012.csv',
        'read_json_output_file/2014.csv', 'read_json_output_file/2016.csv'
    ]

    data_path = read_json()  # paths of the Stata files (helper defined elsewhere)
    for i in range(len(data_path)):
        temp = []
        for j in range(len(data_path[i])):
            for key in data_path[i][j].keys():
                stata_data_path = key  # path of this year's table
                columns_name = data_path[i][j][key]  # the table's important features
                print(columns_name)
                stata_data = StataReader(
                    stata_data_path, convert_categoricals=False)  # read the Stata file
                pd_important_feature = pd.DataFrame(
                    stata_data.read())[columns_name]  # to DataFrame, keep important features
                temp.append(pd_important_feature)
        data_merge(temp, data_merge_file_name[i])  # merge and write the CSV file
        print('-------------------------')
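The helpers read_json() and data_merge() are defined elsewhere in the project. A minimal sketch of what data_merge might look like, assuming it simply concatenates the per-table feature frames side by side and writes them to the given CSV path (both the signature and the concat axis are assumptions):

import pandas as pd

def data_merge(frames, out_path):
    # Hypothetical reconstruction: join the per-table feature frames
    # column-wise and persist them as one CSV per survey year.
    merged = pd.concat(frames, axis=1)
    merged.to_csv(out_path, index=False)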
Example #2
    def test_read_dta1(self):
        reader_114 = StataReader(self.dta1_114)
        parsed_114 = reader_114.read()
        reader_117 = StataReader(self.dta1_117)
        parsed_117 = reader_117.read()
        # Pandas uses np.nan as the missing value.
        # Thus, all columns will be of type float, regardless of their name.
        expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
                             columns=['float_miss', 'double_miss', 'byte_miss',
                                      'int_miss', 'long_miss'])

        # This is an oddity: the NaN really ought to be float64, but
        # the cast does not fail, so we match Stata's float32 here.
        expected['float_miss'] = expected['float_miss'].astype(np.float32)

        tm.assert_frame_equal(parsed_114, expected)
        tm.assert_frame_equal(parsed_117, expected)
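The float upcasting that the comment describes is easy to reproduce outside the Stata reader. A standalone check (not part of the original test suite) showing that a column holding only np.nan cannot keep an integer dtype:

import numpy as np
import pandas as pd

df = pd.DataFrame([(np.nan, np.nan)], columns=['byte_miss', 'int_miss'])
print(df.dtypes)  # both float64: np.nan has no integer representation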
Example #3
    def test_data_method(self):
        # Minimal testing of legacy data method
        reader_114 = StataReader(self.dta1_114)
        with warnings.catch_warnings(record=True) as w:
            parsed_114_data = reader_114.data()

        reader_114 = StataReader(self.dta1_114)
        parsed_114_read = reader_114.read()
        tm.assert_frame_equal(parsed_114_data, parsed_114_read)
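The record=True context collects whatever deprecation warning the legacy .data() call emits, although the test above never inspects it. A stricter variant (an assumption, not part of the original suite) could assert that at least one warning was raised:

        reader_114 = StataReader(self.dta1_114)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('always')
            reader_114.data()
        assert len(w) >= 1  # the legacy method is expected to warn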
Example #4
import pandas as pd
from pandas.io.stata import StataReader


def read_stata_file(dir, file_name):
    """
    :param dir: directory that holds the Stata file
    :param file_name: name of the .dta file
    :return: the data as a DataFrame, plus the list of labeled columns
    """
    stata_data = StataReader(dir + file_name, convert_categoricals=False)
    columns_list = list(stata_data.value_labels().keys())  # columns that carry value labels
    print(file_name)
    print(len(columns_list))
    print(columns_list[0:10])
    print('---------------')
    return pd.DataFrame(stata_data.read()), columns_list
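A usage sketch for the function above; the directory and file name are placeholders, not paths from the original project:

df, labeled_columns = read_stata_file('data/', 'example.dta')
print(df.shape)
print(labeled_columns[:5])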
Example #5
import os

import pandas as pd
from pandas.io.stata import StataReader
from paths import paths

if not os.path.exists(paths.data):
    os.mkdir(paths.data)
'''Load and Cache Datasets
   -----------------------

Notes:
- Ensures no overlap in id
- Trims observations with any labor income over $300,000 (U.S., 2014)
'''

#--------------------------------------------------------------------

print "Loading PSID"
reader = StataReader(paths.psid)
psid = reader.read(convert_dates=False, convert_categoricals=False)
psid = psid.dropna(subset=['id']).set_index('id')

# Trimming
inc = psid.filter(regex='^inc_labor[0-9][0-9]')
psid = psid.loc[psid.male == 0]
psid = psid.loc[psid.black == 1]
psid = psid.loc[((inc < inc.quantile(0.90)) | (inc.isnull())).all(axis=1)]

# Interpolating
plong = pd.wide_to_long(psid[inc.columns].reset_index(), ['inc_labor'],
                        i='id',
                        j='age').sort_index()
plong = plong.interpolate(limit=5)
pwide = plong.unstack()
pwide.columns = pwide.columns.droplevel(0)
pwide.columns = ['{}{}'.format('inc_labor', a) for a in pwide.columns]
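pd.wide_to_long expects stub-prefixed columns such as inc_labor25, inc_labor26, and so on. A toy round trip (synthetic data, not the PSID) illustrating the reshape used above:

import pandas as pd

toy = pd.DataFrame({'id': [1, 2],
                    'inc_labor25': [10.0, None],
                    'inc_labor26': [12.0, 14.0]})
long = pd.wide_to_long(toy, ['inc_labor'], i='id', j='age').sort_index()
print(long)            # one row per (id, age) pair
print(long.unstack())  # back to wide; columns become a MultiIndex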
Example #6
# -*- coding:utf-8 -*-

import pandas as pd
from pandas.io.stata import StataReader
infilename = r"merge_2.dta"

outfile = 'out.csv'
if input('are you sure to clear outputfile>>' + outfile + '<<(y/n)?') == 'y':
    open(outfile, 'w').close()
stata_data = StataReader(infilename, convert_categoricals=False)
data = stata_data.read()

col_n = ['stkcd', 'time', 'rt_year', 'lnme', 'lev', 'size']
data = pd.DataFrame(data, columns=col_n)
data = data.dropna(axis=0)


def output(string):
    with open(outfile, 'a') as f:
        f.write(string)


def slice(df_year):
    # df_year is already sorted in ascending order
    n = len(df_year)
    l_stk = df_year.iloc[:int(n * 0.3)]['stkcd'].tolist()
    m_stk = df_year.iloc[int(n * 0.3):int(n * 0.7)]['stkcd'].tolist()
    h_stk = df_year.iloc[int(n * 0.7):]['stkcd'].tolist()
    return h_stk, m_stk, l_stk
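A quick check of slice on synthetic data (the frame must already be sorted ascending, as the comment notes; the values are made up):

toy = pd.DataFrame({'stkcd': list('abcdefghij'),
                    'size': range(10)})  # already sorted low to high
h, m, l = slice(toy)
print(l)  # bottom 30%: ['a', 'b', 'c']
print(m)  # middle 40%: ['d', 'e', 'f', 'g']
print(h)  # top 30%:    ['h', 'i', 'j']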

Example #7
"""
Desc:   This code selects the IPW variables for specific ABC outcomes.
        We only do this for the pooled sample to increase power. We use
        a linear probability model for this. We select the 3 variables
        that minimize the BIC.
"""
import pandas as pd
from pandas.io.stata import StataReader
import numpy as np
import statsmodels.api as sm
from patsy import dmatrices
import itertools
from paths import paths

# import data
reader = StataReader(paths.abccare)
data = reader.read(convert_dates=False, convert_categoricals=False)
data = data.set_index('id')
data = data.sort_index()
data.drop(data.loc[(data.RV==1) & (data.R==0)].index, inplace=True)

# bring in outcomes files, and find the ABC-only/CARE-only ones
outcomes = pd.read_csv(paths.outcomes, index_col='variable')
only_abc = outcomes.loc[outcomes.only_abc == 1].index
only_care = outcomes.loc[outcomes.only_care == 1].index

bank = pd.read_csv(paths.controls)
ipwvars = np.unique(outcomes.loc[~outcomes.ipw_var.isnull(), 'ipw_var'].values)

# generate the list of all possible models
models = itertools.chain.from_iterable([itertools.combinations(bank.loc[:, 'variable'], 3)])
models = list(models)
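The selection step that the docstring describes (fit a linear probability model for every 3-variable combination and keep the one with the lowest BIC) is not shown in the snippet. A sketch of how it could proceed with statsmodels; the binary outcome column name is a placeholder assumption:

best_bic, best_combo = np.inf, None
for combo in models:
    # Linear probability model: OLS of the binary outcome on the candidates.
    X = sm.add_constant(data[list(combo)])
    y = data['outcome']  # hypothetical binary outcome column
    res = sm.OLS(y, X, missing='drop').fit()
    if res.bic < best_bic:
        best_bic, best_combo = res.bic, combo
print(best_combo, best_bic)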