def test_read_chunks_115(self):
    files_115 = [self.dta2_115, self.dta3_115, self.dta4_115,
                 self.dta14_115, self.dta15_115, self.dta16_115,
                 self.dta17_115, self.dta18_115, self.dta19_115,
                 self.dta20_115]

    for fname in files_115:
        for chunksize in 1, 2:
            for convert_categoricals in False, True:
                for convert_dates in False, True:

                    # Read the whole file
                    with warnings.catch_warnings(record=True) as w:
                        warnings.simplefilter("always")
                        parsed = read_stata(
                            fname,
                            convert_categoricals=convert_categoricals,
                            convert_dates=convert_dates)

                    # Compare to what we get when reading by chunk; the
                    # iterator must use the same conversion options as the
                    # full read, or the frames cannot match
                    itr = read_stata(
                        fname, iterator=True,
                        convert_dates=convert_dates,
                        convert_categoricals=convert_categoricals)
                    pos = 0
                    for j in range(5):
                        with warnings.catch_warnings(record=True) as w:
                            warnings.simplefilter("always")
                            try:
                                chunk = itr.read(chunksize)
                            except StopIteration:
                                break
                            from_frame = parsed.iloc[pos:pos + chunksize, :]
                            try:
                                tm.assert_frame_equal(from_frame, chunk,
                                                      check_dtype=False)
                            except AssertionError:
                                # datetime.datetime and pandas.tslib.Timestamp
                                # may hold equivalent values but fail
                                # assert_frame_equal; fall back to comparing
                                # values element-wise, treating any pair of
                                # missing values as equal
                                assert all(
                                    x == y or (pd.isnull(x) and pd.isnull(y))
                                    for x, y in zip(from_frame.values.ravel(),
                                                    chunk.values.ravel()))
                        pos += chunksize

def test_categorical_order(self):
    # Directly construct using expected codes
    # Format is is_cat, col_name, labels (in order), underlying data
    expected = [(True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
                (True, 'reverse', ['a', 'b', 'c', 'd', 'e'],
                 np.arange(5)[::-1]),
                (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
                 np.array([2, 1, 4, 0, 3])),
                (True, 'floating', ['a', 'b', 'c', 'd', 'e'],
                 np.arange(0, 5)),
                (True, 'float_missing', ['a', 'd', 'e'],
                 np.array([0, 1, 2, -1, -1])),
                (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
                (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))]
    cols = []
    for is_cat, col, labels, codes in expected:
        if is_cat:
            cols.append((col, pd.Categorical.from_codes(codes, labels)))
        else:
            cols.append((col, pd.Series(labels, dtype=np.float32)))
    expected = DataFrame.from_items(cols)

    # Read with and without categoricals, ensure order is identical
    parsed_115 = read_stata(self.dta19_115)
    parsed_117 = read_stata(self.dta19_117)
    tm.assert_frame_equal(expected, parsed_115)
    tm.assert_frame_equal(expected, parsed_117)

    # Check identity of codes
    for col in expected:
        if is_categorical_dtype(expected[col]):
            tm.assert_series_equal(expected[col].cat.codes,
                                   parsed_115[col].cat.codes)
            tm.assert_index_equal(expected[col].cat.categories,
                                  parsed_115[col].cat.categories)

def test_read_chunks_115(self):
    files_115 = [self.dta2_115, self.dta3_115, self.dta4_115,
                 self.dta14_115, self.dta15_115, self.dta16_115,
                 self.dta17_115, self.dta18_115, self.dta19_115,
                 self.dta20_115]

    for fname in files_115:
        for chunksize in 1, 2:
            for convert_categoricals in False, True:
                for convert_dates in False, True:

                    # Read the whole file
                    with warnings.catch_warnings(record=True) as w:
                        warnings.simplefilter("always")
                        parsed = read_stata(
                            fname,
                            convert_categoricals=convert_categoricals,
                            convert_dates=convert_dates)

                    # Compare to what we get when reading by chunk
                    itr = read_stata(
                        fname, iterator=True,
                        convert_dates=convert_dates,
                        convert_categoricals=convert_categoricals)
                    pos = 0
                    for j in range(5):
                        with warnings.catch_warnings(record=True) as w:
                            warnings.simplefilter("always")
                            try:
                                chunk = itr.read(chunksize)
                            except StopIteration:
                                break
                            from_frame = parsed.iloc[pos:pos + chunksize, :]
                            tm.assert_frame_equal(
                                from_frame, chunk, check_dtype=False,
                                check_datetimelike_compat=True)
                        pos += chunksize

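# A minimal usage sketch of the chunked-read API the tests above exercise;
# "example.dta" is a placeholder path, not one of the test fixtures, and the
# StopIteration-on-exhaustion behaviour is the same one the tests rely on.
def _chunked_read_sketch():
    reader = read_stata("example.dta", iterator=True)
    chunks = []
    while True:
        try:
            chunks.append(reader.read(1000))  # next 1000 rows as a DataFrame
        except StopIteration:
            break
    reader.close()
    return pd.concat(chunks)
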
def test_categorical_order(self):
    # Directly construct using expected codes
    # Format is is_cat, col_name, labels (in order), underlying data
    expected = [
        (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
        (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
        (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
         np.array([2, 1, 4, 0, 3])),
        (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
        (True, 'float_missing', ['a', 'd', 'e'],
         np.array([0, 1, 2, -1, -1])),
        (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
        (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))
    ]
    cols = []
    for is_cat, col, labels, codes in expected:
        if is_cat:
            cols.append((col, pd.Categorical.from_codes(codes, labels)))
        else:
            cols.append((col, pd.Series(labels, dtype=np.float32)))
    expected = DataFrame.from_items(cols)

    # Read with and without categoricals, ensure order is identical
    parsed_115 = read_stata(self.dta19_115)
    parsed_117 = read_stata(self.dta19_117)
    tm.assert_frame_equal(expected, parsed_115, check_categorical=False)
    tm.assert_frame_equal(expected, parsed_117, check_categorical=False)

    # Check identity of codes
    for col in expected:
        if is_categorical_dtype(expected[col]):
            tm.assert_series_equal(expected[col].cat.codes,
                                   parsed_115[col].cat.codes)
            tm.assert_index_equal(expected[col].cat.categories,
                                  parsed_115[col].cat.categories)

def test_big_dates(self):
    yr = [1960, 2000, 9999, 100, 2262, 1677]
    mo = [1, 1, 12, 1, 4, 9]
    dd = [1, 1, 31, 1, 22, 23]
    hr = [0, 0, 23, 0, 0, 0]
    mm = [0, 0, 59, 0, 0, 0]
    ss = [0, 0, 59, 0, 0, 0]
    expected = []
    for i in range(len(yr)):
        row = []
        for j in range(7):
            if j == 0:
                row.append(
                    datetime(yr[i], mo[i], dd[i], hr[i], mm[i], ss[i]))
            elif j == 6:
                row.append(datetime(yr[i], 1, 1))
            else:
                row.append(datetime(yr[i], mo[i], dd[i]))
        expected.append(row)
    expected.append([NaT] * 7)
    columns = ['date_tc', 'date_td', 'date_tw', 'date_tm',
               'date_tq', 'date_th', 'date_ty']

    # Fixes for weekly, monthly, quarterly, half-yearly and yearly
    expected[2][2] = datetime(9999, 12, 24)
    expected[2][3] = datetime(9999, 12, 1)
    expected[2][4] = datetime(9999, 10, 1)
    expected[2][5] = datetime(9999, 7, 1)
    expected[4][2] = datetime(2262, 4, 16)
    expected[4][3] = expected[4][4] = datetime(2262, 4, 1)
    expected[4][5] = expected[4][6] = datetime(2262, 1, 1)
    expected[5][2] = expected[5][3] = expected[5][4] = datetime(1677, 10, 1)
    expected[5][5] = expected[5][6] = datetime(1678, 1, 1)

    expected = DataFrame(expected, columns=columns, dtype=np.object)

    parsed_115 = read_stata(self.dta18_115)
    parsed_117 = read_stata(self.dta18_117)
    tm.assert_frame_equal(expected, parsed_115,
                          check_datetimelike_compat=True)
    tm.assert_frame_equal(expected, parsed_117,
                          check_datetimelike_compat=True)

    date_conversion = dict((c, c[-2:]) for c in columns)
    # {c: c[-2:] for c in columns}
    with tm.ensure_clean() as path:
        expected.index.name = 'index'
        expected.to_stata(path, date_conversion)
        written_and_read_again = self.read_dta(path)
        tm.assert_frame_equal(written_and_read_again.set_index('index'),
                              expected,
                              check_datetimelike_compat=True)

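# For reference: the date_conversion mapping built above sends each column
# name to its two-character Stata format code, i.e. {'date_tc': 'tc',
# 'date_td': 'td', ...}.  In Stata, tc is millisecond-resolution datetime,
# td daily, tw weekly, tm monthly, tq quarterly, th half-yearly and ty
# yearly, which is what the per-column "fixes" above account for.
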
def dta_to_csv(self):
    tr = read_stata("Treatment.dta")
    mh = read_stata("migration_history_4_22.dta")
    merged = mh.merge(tr)
    merged = merged.drop(['country_fill', 'admbound_fill', 'village_fill',
                          'programbase_', 'origin_location', 'origin_gps'],
                         axis=1)
    merged = merged.dropna()
    merged.columns = ['id', 'dt', 'lat', 'long', 'treatment']
    merged = merged[merged['dt'] == '2013-08-01']
    merged.to_csv("migration.csv", index=False)

def test_categorical_ordering(self):
    parsed_115 = read_stata(self.dta19_115)
    parsed_117 = read_stata(self.dta19_117)

    parsed_115_unordered = read_stata(self.dta19_115,
                                      order_categoricals=False)
    parsed_117_unordered = read_stata(self.dta19_117,
                                      order_categoricals=False)
    for col in parsed_115:
        if not is_categorical_dtype(parsed_115[col]):
            continue
        tm.assert_equal(True, parsed_115[col].cat.ordered)
        tm.assert_equal(True, parsed_117[col].cat.ordered)
        tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
        tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)

def test_categorical_sorting(self):
    parsed_115 = read_stata(self.dta20_115)
    parsed_117 = read_stata(self.dta20_117)

    # Sort based on codes, not strings
    parsed_115 = parsed_115.sort("srh")
    parsed_117 = parsed_117.sort("srh")

    # Don't sort index
    parsed_115.index = np.arange(parsed_115.shape[0])
    parsed_117.index = np.arange(parsed_117.shape[0])
    codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4]
    categories = ["Poor", "Fair", "Good", "Very good", "Excellent"]
    expected = pd.Series(pd.Categorical.from_codes(codes=codes,
                                                   categories=categories))
    tm.assert_series_equal(expected, parsed_115["srh"])
    tm.assert_series_equal(expected, parsed_117["srh"])

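# A small reference sketch, assuming the same pandas API as above: in
# Categorical.from_codes a code of -1 marks a missing value (NaN), which is
# what the two leading -1 entries in `codes` above represent.
cat_sketch = pd.Categorical.from_codes([-1, 0, 2],
                                       categories=["low", "mid", "high"])
# list(cat_sketch) -> [nan, 'low', 'high']
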
def test_encoding(self):
    # GH 4626, proper encoding handling
    raw = read_stata(self.dta_encoding)
    encoded = read_stata(self.dta_encoding, encoding="latin-1")
    result = encoded.kreis1849[0]

    if compat.PY3:
        expected = raw.kreis1849[0]
        self.assert_(result == expected)
        self.assert_(isinstance(result, compat.string_types))
    else:
        expected = raw.kreis1849.str.decode("latin-1")[0]
        self.assert_(result == expected)
        self.assert_(isinstance(result, unicode))

def test_read_chunks_columns(self):
    fname = self.dta3_117
    columns = ['quarter', 'cpi', 'm1']
    chunksize = 2

    parsed = read_stata(fname, columns=columns)
    itr = read_stata(fname, iterator=True)
    pos = 0
    for j in range(5):
        chunk = itr.read(chunksize, columns=columns)
        if chunk is None:
            break
        from_frame = parsed.iloc[pos:pos + chunksize, :]
        tm.assert_frame_equal(from_frame, chunk, check_dtype=False)
        pos += chunksize

def test_read_chunks_columns(self):
    fname = self.dta3_117
    columns = ['quarter', 'cpi', 'm1']
    chunksize = 2

    parsed = read_stata(fname, columns=columns)
    with read_stata(fname, iterator=True) as itr:
        pos = 0
        for j in range(5):
            chunk = itr.read(chunksize, columns=columns)
            if chunk is None:
                break
            from_frame = parsed.iloc[pos:pos + chunksize, :]
            tm.assert_frame_equal(from_frame, chunk, check_dtype=False)
            pos += chunksize

def test_read_empty_dta(self):
    empty_ds = DataFrame(columns=['unit'])
    # GH 7369, make sure can read a 0-obs dta file
    with tm.ensure_clean() as path:
        empty_ds.to_stata(path, write_index=False)
        empty_ds2 = read_stata(path)
        tm.assert_frame_equal(empty_ds, empty_ds2)

def test_missing_value_conversion(self):
    columns = ['int8_', 'int16_', 'int32_', 'float32_', 'float64_']
    smv = StataMissingValue(101)
    keys = [key for key in iterkeys(smv.MISSING_VALUES)]
    keys.sort()
    data = []
    for i in range(27):
        row = [StataMissingValue(keys[i + (j * 27)]) for j in range(5)]
        data.append(row)
    expected = DataFrame(data, columns=columns)

    parsed_113 = read_stata(self.dta17_113, convert_missing=True)
    parsed_115 = read_stata(self.dta17_115, convert_missing=True)
    parsed_117 = read_stata(self.dta17_117, convert_missing=True)
    tm.assert_frame_equal(expected, parsed_113)
    tm.assert_frame_equal(expected, parsed_115)
    tm.assert_frame_equal(expected, parsed_117)

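# A hedged note on what convert_missing toggles, based on the behaviour the
# test above exercises: by default Stata missing values come back as NaN,
# while convert_missing=True preserves Stata's 27 distinct missing codes
# (., .a through .z) as StataMissingValue objects.  A sketch, with
# "example.dta" as a placeholder path:
#
#     df_nan = read_stata("example.dta")                        # missing -> NaN
#     df_smv = read_stata("example.dta", convert_missing=True)  # StataMissingValue
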
def test_dtype_conversion(self):
    expected = self.read_csv(self.csv15)
    expected["byte_"] = expected["byte_"].astype(np.int8)
    expected["int_"] = expected["int_"].astype(np.int16)
    expected["long_"] = expected["long_"].astype(np.int32)
    expected["float_"] = expected["float_"].astype(np.float32)
    expected["double_"] = expected["double_"].astype(np.float64)
    expected["date_td"] = expected["date_td"].apply(datetime.strptime,
                                                    args=("%Y-%m-%d",))

    no_conversion = read_stata(self.dta15_117, convert_dates=True)
    tm.assert_frame_equal(expected, no_conversion)

    conversion = read_stata(self.dta15_117, convert_dates=True,
                            preserve_dtypes=False)

    # read_csv types are the same
    expected = self.read_csv(self.csv15)
    expected["date_td"] = expected["date_td"].apply(datetime.strptime,
                                                    args=("%Y-%m-%d",))

    tm.assert_frame_equal(expected, conversion)

def test_encoding(self):
    # GH 4626, proper encoding handling
    raw = read_stata(self.dta_encoding)
    encoded = read_stata(self.dta_encoding, encoding="latin-1")
    result = encoded.kreis1849[0]

    if compat.PY3:
        expected = raw.kreis1849[0]
        self.assertEqual(result, expected)
        self.assertIsInstance(result, compat.string_types)
    else:
        expected = raw.kreis1849.str.decode("latin-1")[0]
        self.assertEqual(result, expected)
        self.assertIsInstance(result, unicode)

    with tm.ensure_clean() as path:
        encoded.to_stata(path, encoding='latin-1', write_index=False)
        reread_encoded = read_stata(path, encoding='latin-1')
        tm.assert_frame_equal(encoded, reread_encoded)

def test_iterator(self):
    fname = self.dta3_117

    parsed = read_stata(fname)

    itr = read_stata(fname, iterator=True)
    chunk = itr.read(5)
    tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

    itr = read_stata(fname, chunksize=5)
    chunk = list(itr)
    tm.assert_frame_equal(parsed.iloc[0:5, :], chunk[0])

    itr = read_stata(fname, iterator=True)
    chunk = itr.get_chunk(5)
    tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

    itr = read_stata(fname, chunksize=5)
    chunk = itr.get_chunk()
    tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

def test_drop_column(self):
    expected = self.read_csv(self.csv15)
    expected["byte_"] = expected["byte_"].astype(np.int8)
    expected["int_"] = expected["int_"].astype(np.int16)
    expected["long_"] = expected["long_"].astype(np.int32)
    expected["float_"] = expected["float_"].astype(np.float32)
    expected["double_"] = expected["double_"].astype(np.float64)
    expected["date_td"] = expected["date_td"].apply(datetime.strptime,
                                                    args=("%Y-%m-%d",))
    columns = ["byte_", "int_", "long_"]
    expected = expected[columns]
    dropped = read_stata(self.dta15_117, convert_dates=True,
                         columns=columns)
    tm.assert_frame_equal(expected, dropped)

    with tm.assertRaises(ValueError):
        columns = ["byte_", "byte_"]
        read_stata(self.dta15_117, convert_dates=True, columns=columns)

    with tm.assertRaises(ValueError):
        columns = ["byte_", "int_", "long_", "not_found"]
        read_stata(self.dta15_117, convert_dates=True, columns=columns)

def test_drop_column(self):
    expected = self.read_csv(self.csv15)
    expected['byte_'] = expected['byte_'].astype(np.int8)
    expected['int_'] = expected['int_'].astype(np.int16)
    expected['long_'] = expected['long_'].astype(np.int32)
    expected['float_'] = expected['float_'].astype(np.float32)
    expected['double_'] = expected['double_'].astype(np.float64)
    expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                    args=('%Y-%m-%d',))

    columns = ['byte_', 'int_', 'long_']
    expected = expected[columns]
    dropped = read_stata(self.dta15_117, convert_dates=True,
                         columns=columns)
    tm.assert_frame_equal(expected, dropped)

    # See PR 10757
    columns = ['int_', 'long_', 'byte_']
    expected = expected[columns]
    reordered = read_stata(self.dta15_117, convert_dates=True,
                           columns=columns)
    tm.assert_frame_equal(expected, reordered)

    with tm.assertRaises(ValueError):
        columns = ['byte_', 'byte_']
        read_stata(self.dta15_117, convert_dates=True, columns=columns)

    with tm.assertRaises(ValueError):
        columns = ['byte_', 'int_', 'long_', 'not_found']
        read_stata(self.dta15_117, convert_dates=True, columns=columns)

def test_iterator(self):
    fname = self.dta3_117

    parsed = read_stata(fname)

    with read_stata(fname, iterator=True) as itr:
        chunk = itr.read(5)
        tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

    with read_stata(fname, chunksize=5) as itr:
        chunk = list(itr)
        tm.assert_frame_equal(parsed.iloc[0:5, :], chunk[0])

    with read_stata(fname, iterator=True) as itr:
        chunk = itr.get_chunk(5)
        tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

    with read_stata(fname, chunksize=5) as itr:
        chunk = itr.get_chunk()
        tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

    # GH12153
    from_chunks = pd.concat(read_stata(fname, chunksize=4))
    tm.assert_frame_equal(parsed, from_chunks)

def test_read_chunks_117(self):
    files_117 = [self.dta1_117, self.dta2_117, self.dta3_117,
                 self.dta4_117, self.dta14_117, self.dta15_117,
                 self.dta16_117, self.dta17_117, self.dta18_117,
                 self.dta19_117, self.dta20_117]

    for fname in files_117:
        for chunksize in 1, 2:
            for convert_categoricals in False, True:
                for convert_dates in False, True:

                    with warnings.catch_warnings(record=True) as w:
                        warnings.simplefilter("always")
                        parsed = read_stata(
                            fname,
                            convert_categoricals=convert_categoricals,
                            convert_dates=convert_dates)
                    itr = read_stata(
                        fname, iterator=True,
                        convert_categoricals=convert_categoricals,
                        convert_dates=convert_dates)

                    pos = 0
                    for j in range(5):
                        with warnings.catch_warnings(record=True) as w:  # noqa
                            warnings.simplefilter("always")
                            try:
                                chunk = itr.read(chunksize)
                            except StopIteration:
                                break
                            from_frame = parsed.iloc[pos:pos + chunksize, :]
                            tm.assert_frame_equal(
                                from_frame, chunk, check_dtype=False,
                                check_datetimelike_compat=True,
                                check_categorical=False)
                        pos += chunksize
                    itr.close()

# --- tail of a results-summary helper (its def lies outside this snippet) ---
sedf[r'$\log\lambda$'] = e.std().as_matrix() / np.sqrt(resultdf['$N$'])

tstats = pd.DataFrame({'TUP': resultdf['TUP'] / sedf['TUP'],
                       'CTL': resultdf['CTL'] / sedf['CTL'],
                       'Diff.': resultdf['Diff.'] / sedf['Diff.']})

if loglambdas is not None:
    llb = b.filter(like='loglambda_')
    resultdf[r'$\log\lambda$'] = llb.iloc[0, :].as_matrix()
    tstats[r'$\log\lambda$'] = (resultdf[r'$\log\lambda$'].as_matrix() /
                                sedf[r'$\log\lambda$'])

return resultdf, sedf, tstats, chi2test


import pandas as pd
from pandas.io import stata

# Get data
D = stata.read_stata('../data/Midline/TUP_merged2014.dta').set_index(
    'idno', drop=False)

# Make balanced
D['balanced'] = D['merge_midline'].apply(lambda x: '3' in x)
D = D[D['balanced']]

midline = stata.read_stata(
    '../data/Midline/ssbusiness_survey.dta').set_index('rid')
midline.rename(columns={'s6a': 'land_owncult_m',
                        's6d': 'land_rentcult_m',
                        's6e': 'land_communitycult_m'},
               inplace=True)
D = D.join(midline.filter(regex='^land_'), how='left')

D.drop(602, inplace=True)

# Eliminate duplicates
D = D.groupby(D.index).first()

# in_business(_m) indicates whether they said "Yes" to "have you been
# involved in any non-farm self-employment in the past year?"
D['in_business_m'] = D['s3_m']

def read_dta(self, file):
    # Legacy default reader configuration
    return read_stata(file, convert_dates=True)
