Example 1
    def test_read_chunks_115(self):
        files_115 = [self.dta2_115, self.dta3_115, self.dta4_115,
                     self.dta14_115, self.dta15_115, self.dta16_115,
                     self.dta17_115, self.dta18_115, self.dta19_115,
                     self.dta20_115]

        for fname in files_115:
            for chunksize in 1, 2:
                for convert_categoricals in False, True:
                    for convert_dates in False, True:

                        with warnings.catch_warnings(record=True) as w:
                            warnings.simplefilter("always")
                            parsed = read_stata(fname, convert_categoricals=convert_categoricals,
                                                convert_dates=convert_dates)
                        # Pass the same options to the iterator so both
                        # reads convert dates identically
                        itr = read_stata(fname, iterator=True,
                                         convert_dates=convert_dates,
                                         convert_categoricals=convert_categoricals)

                        pos = 0
                        for j in range(5):
                            with warnings.catch_warnings(record=True) as w:
                                warnings.simplefilter("always")
                                try:
                                    chunk = itr.read(chunksize)
                                except StopIteration:
                                    break
                            from_frame = parsed.iloc[pos:pos+chunksize, :]
                            try:
                                tm.assert_frame_equal(from_frame, chunk, check_dtype=False)
                            except AssertionError:
                                # datetime.datetime and pandas.tslib.Timestamp may hold
                                # equivalent values but fail assert_frame_equal, so fall
                                # back to comparing the underlying values elementwise
                                # (zipping the frames themselves would only compare
                                # column labels)
                                assert all(x == y or (pd.isnull(x) and pd.isnull(y))
                                           for x, y in zip(from_frame.values.ravel(),
                                                           chunk.values.ravel()))

                            pos += chunksize
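For comparison, a minimal, self-contained sketch of the same whole-file-versus-chunks check against the current pandas API; the file name example.dta is a placeholder, and a pandas version whose reader supports the context-manager protocol (as in Example 20) is assumed:

import pandas as pd

# Read the full file once, then re-read it in fixed-size chunks and
# check that the concatenated chunks reproduce the full frame.
full = pd.read_stata("example.dta")
with pd.read_stata("example.dta", chunksize=100) as reader:
    from_chunks = pd.concat(reader)
pd.testing.assert_frame_equal(full, from_chunks)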
Example 2
    def test_categorical_order(self):
        # Directly construct using expected codes
        # Format is is_cat, col_name, labels (in order), underlying data
        expected = [(True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
                    (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
                    (True, 'noorder', ['a', 'b', 'c', 'd', 'e'], np.array([2, 1, 4, 0, 3])),
                    (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
                    (True, 'float_missing', ['a', 'd', 'e'], np.array([0, 1, 2, -1, -1])),
                    (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
                    (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))]
        cols = []
        for is_cat, col, labels, codes in expected:
            if is_cat:
                cols.append((col, pd.Categorical.from_codes(codes, labels)))
            else:
                cols.append((col, pd.Series(labels, dtype=np.float32)))
        expected = DataFrame.from_items(cols)

        # Read with and without categoricals, ensure order is identical
        parsed_115 = read_stata(self.dta19_115)
        parsed_117 = read_stata(self.dta19_117)
        tm.assert_frame_equal(expected, parsed_115)
        tm.assert_frame_equal(expected, parsed_117)

        # Check identity of codes
        for col in expected:
            if is_categorical_dtype(expected[col]):
                tm.assert_series_equal(expected[col].cat.codes,
                                       parsed_115[col].cat.codes)
                tm.assert_index_equal(expected[col].cat.categories,
                                      parsed_115[col].cat.categories)
Example 3
    def test_read_chunks_115(self):
        files_115 = [self.dta2_115, self.dta3_115, self.dta4_115,
                     self.dta14_115, self.dta15_115, self.dta16_115,
                     self.dta17_115, self.dta18_115, self.dta19_115,
                     self.dta20_115]

        for fname in files_115:
            for chunksize in 1, 2:
                for convert_categoricals in False, True:
                    for convert_dates in False, True:

                        # Read the whole file
                        with warnings.catch_warnings(record=True) as w:
                            warnings.simplefilter("always")
                            parsed = read_stata(fname, convert_categoricals=convert_categoricals,
                                                convert_dates=convert_dates)

                        # Compare to what we get when reading by chunk
                        itr = read_stata(fname, iterator=True, convert_dates=convert_dates,
                                         convert_categoricals=convert_categoricals)
                        pos = 0
                        for j in range(5):
                            with warnings.catch_warnings(record=True) as w:
                                warnings.simplefilter("always")
                                try:
                                    chunk = itr.read(chunksize)
                                except StopIteration:
                                    break
                            from_frame = parsed.iloc[pos:pos+chunksize, :]
                            tm.assert_frame_equal(from_frame,
                                                  chunk,
                                                  check_dtype=False,
                                                  check_datetimelike_compat=True)

                            pos += chunksize
Example 4
    def test_categorical_order(self):
        # Directly construct using expected codes
        # Format is is_cat, col_name, labels (in order), underlying data
        expected = [
            (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
            (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
            (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
             np.array([2, 1, 4, 0, 3])),
            (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
            (True, 'float_missing', ['a', 'd', 'e'],
             np.array([0, 1, 2, -1, -1])),
            (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
            (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))
        ]
        cols = []
        for is_cat, col, labels, codes in expected:
            if is_cat:
                cols.append((col, pd.Categorical.from_codes(codes, labels)))
            else:
                cols.append((col, pd.Series(labels, dtype=np.float32)))
        expected = DataFrame.from_items(cols)

        # Read with and without categoricals, ensure order is identical
        parsed_115 = read_stata(self.dta19_115)
        parsed_117 = read_stata(self.dta19_117)
        tm.assert_frame_equal(expected, parsed_115, check_categorical=False)
        tm.assert_frame_equal(expected, parsed_117, check_categorical=False)

        # Check identity of codes
        for col in expected:
            if is_categorical_dtype(expected[col]):
                tm.assert_series_equal(expected[col].cat.codes,
                                       parsed_115[col].cat.codes)
                tm.assert_index_equal(expected[col].cat.categories,
                                      parsed_115[col].cat.categories)
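For readers unfamiliar with the construction above, a small standalone illustration of how pd.Categorical.from_codes maps integer codes onto labels; the data here is made up:

import numpy as np
import pandas as pd

# Each code indexes into the categories list; -1 marks a missing value.
cat = pd.Categorical.from_codes(codes=np.array([2, 0, -1, 1]),
                                categories=['a', 'b', 'c'])
print(list(cat))             # ['c', 'a', nan, 'b']
print(cat.codes)             # [ 2  0 -1  1]
print(list(cat.categories))  # ['a', 'b', 'c']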
Example 5
    def test_big_dates(self):
        yr = [1960, 2000, 9999, 100, 2262, 1677]
        mo = [1, 1, 12, 1, 4, 9]
        dd = [1, 1, 31, 1, 22, 23]
        hr = [0, 0, 23, 0, 0, 0]
        mm = [0, 0, 59, 0, 0, 0]
        ss = [0, 0, 59, 0, 0, 0]
        expected = []
        for i in range(len(yr)):
            row = []
            for j in range(7):
                if j == 0:
                    row.append(
                        datetime(yr[i], mo[i], dd[i], hr[i], mm[i], ss[i]))
                elif j == 6:
                    row.append(datetime(yr[i], 1, 1))
                else:
                    row.append(datetime(yr[i], mo[i], dd[i]))
            expected.append(row)
        expected.append([NaT] * 7)
        columns = [
            'date_tc', 'date_td', 'date_tw', 'date_tm', 'date_tq', 'date_th',
            'date_ty'
        ]

        # Fixes for weekly, quarterly, half-yearly and yearly dates
        expected[2][2] = datetime(9999, 12, 24)
        expected[2][3] = datetime(9999, 12, 1)
        expected[2][4] = datetime(9999, 10, 1)
        expected[2][5] = datetime(9999, 7, 1)
        expected[4][2] = datetime(2262, 4, 16)
        expected[4][3] = expected[4][4] = datetime(2262, 4, 1)
        expected[4][5] = expected[4][6] = datetime(2262, 1, 1)
        expected[5][2] = expected[5][3] = expected[5][4] = datetime(
            1677, 10, 1)
        expected[5][5] = expected[5][6] = datetime(1678, 1, 1)

        expected = DataFrame(expected, columns=columns, dtype=np.object)
        parsed_115 = read_stata(self.dta18_115)
        parsed_117 = read_stata(self.dta18_117)
        tm.assert_frame_equal(expected,
                              parsed_115,
                              check_datetimelike_compat=True)
        tm.assert_frame_equal(expected,
                              parsed_117,
                              check_datetimelike_compat=True)

        date_conversion = dict((c, c[-2:]) for c in columns)
        # {c : c[-2:] for c in columns}
        with tm.ensure_clean() as path:
            expected.index.name = 'index'
            expected.to_stata(path, date_conversion)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  expected,
                                  check_datetimelike_compat=True)
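The date_conversion mapping above assigns each column the Stata date format code spelled out in the last two characters of its name ('tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty'). A rough sketch of the same round trip for a single daily-date column, with a hypothetical file name:

import pandas as pd
from datetime import datetime

df = pd.DataFrame({'when': [datetime(2000, 1, 1), datetime(2020, 6, 15)]})
# Write 'when' using Stata's daily date format ('td'), then read it back.
df.to_stata('dates.dta', convert_dates={'when': 'td'}, write_index=False)
back = pd.read_stata('dates.dta')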
Example 6
 def dta_to_csv(self):
     # Merge treatment assignments onto the migration history
     tr = read_stata("Treatment.dta")
     mh = read_stata("migration_history_4_22.dta")
     merged = mh.merge(tr)
     # Drop unused columns, then rows with missing values
     merged = merged.drop(['country_fill', 'admbound_fill', 'village_fill',
                           'programbase_', 'origin_location', 'origin_gps'],
                          axis=1)
     merged = merged.dropna()
     merged.columns = ['id', 'dt', 'lat', 'long', 'treatment']
     # Keep a single month of observations
     merged = merged[merged['dt'] == '2013-08-01']
     merged.to_csv("migration.csv", index=False)
Example 7
    def test_categorical_ordering(self):
        parsed_115 = read_stata(self.dta19_115)
        parsed_117 = read_stata(self.dta19_117)

        parsed_115_unordered = read_stata(self.dta19_115, order_categoricals=False)
        parsed_117_unordered = read_stata(self.dta19_117, order_categoricals=False)
        for col in parsed_115:
            if not is_categorical_dtype(parsed_115[col]):
                continue
            tm.assert_equal(True, parsed_115[col].cat.ordered)
            tm.assert_equal(True, parsed_117[col].cat.ordered)
            tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
            tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)
Example 8
 def test_categorical_sorting(self):
     parsed_115 = read_stata(self.dta20_115)
     parsed_117 = read_stata(self.dta20_117)
     # Sort based on codes, not strings
     parsed_115 = parsed_115.sort("srh")
     parsed_117 = parsed_117.sort("srh")
     # Don't sort index
     parsed_115.index = np.arange(parsed_115.shape[0])
     parsed_117.index = np.arange(parsed_117.shape[0])
     codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4]
     categories = ["Poor", "Fair", "Good", "Very good", "Excellent"]
     expected = pd.Series(pd.Categorical.from_codes(codes=codes, categories=categories))
     tm.assert_series_equal(expected, parsed_115["srh"])
     tm.assert_series_equal(expected, parsed_117["srh"])
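DataFrame.sort here is the pre-0.17 pandas spelling of sort_values; the point of the test is that sorting an ordered categorical column orders rows by the underlying codes, not alphabetically. A small sketch of the same behaviour with made-up data:

import pandas as pd

srh = pd.Categorical(['Good', 'Poor', 'Excellent'],
                     categories=['Poor', 'Fair', 'Good',
                                 'Very good', 'Excellent'],
                     ordered=True)
df = pd.DataFrame({'srh': srh})
# Sorted by category order rather than by string order:
print(df.sort_values('srh')['srh'].tolist())
# ['Poor', 'Good', 'Excellent']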
Example 9
    def test_encoding(self):

        # GH 4626, proper encoding handling
        raw = read_stata(self.dta_encoding)
        encoded = read_stata(self.dta_encoding, encoding="latin-1")
        result = encoded.kreis1849[0]

        if compat.PY3:
            expected = raw.kreis1849[0]
            self.assertEqual(result, expected)
            self.assertIsInstance(result, compat.string_types)
        else:
            expected = raw.kreis1849.str.decode("latin-1")[0]
            self.assertEqual(result, expected)
            self.assertIsInstance(result, unicode)
Example 10
    def test_read_chunks_columns(self):
        fname = self.dta3_117
        columns = ['quarter', 'cpi', 'm1']
        chunksize = 2

        parsed = read_stata(fname, columns=columns)
        itr = read_stata(fname, iterator=True)
        pos = 0
        for j in range(5):
            chunk = itr.read(chunksize, columns=columns)
            if chunk is None:
                break
            from_frame = parsed.iloc[pos:pos+chunksize, :]
            tm.assert_frame_equal(from_frame, chunk, check_dtype=False)
            pos += chunksize
Example 11
    def test_read_chunks_columns(self):
        fname = self.dta3_117
        columns = ['quarter', 'cpi', 'm1']
        chunksize = 2

        parsed = read_stata(fname, columns=columns)
        with read_stata(fname, iterator=True) as itr:
            pos = 0
            for j in range(5):
                chunk = itr.read(chunksize, columns=columns)
                if chunk is None:
                    break
                from_frame = parsed.iloc[pos:pos + chunksize, :]
                tm.assert_frame_equal(from_frame, chunk, check_dtype=False)
                pos += chunksize
Example 12
 def test_read_empty_dta(self):
     empty_ds = DataFrame(columns=['unit'])
     # GH 7369, make sure can read a 0-obs dta file
     with tm.ensure_clean() as path:
         empty_ds.to_stata(path, write_index=False)
         empty_ds2 = read_stata(path)
         tm.assert_frame_equal(empty_ds, empty_ds2)
Example 13
    def test_missing_value_conversion(self):
        columns = ['int8_', 'int16_', 'int32_', 'float32_', 'float64_']
        smv = StataMissingValue(101)
        keys = [key for key in iterkeys(smv.MISSING_VALUES)]
        keys.sort()
        data = []
        for i in range(27):
            row = [StataMissingValue(keys[i + (j * 27)]) for j in range(5)]
            data.append(row)
        expected = DataFrame(data, columns=columns)

        parsed_113 = read_stata(self.dta17_113, convert_missing=True)
        parsed_115 = read_stata(self.dta17_115, convert_missing=True)
        parsed_117 = read_stata(self.dta17_117, convert_missing=True)

        tm.assert_frame_equal(expected, parsed_113)
        tm.assert_frame_equal(expected, parsed_115)
        tm.assert_frame_equal(expected, parsed_117)
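A brief sketch of what convert_missing controls, with a hypothetical file name: by default each of Stata's 27 missing codes (., .a through .z) is replaced by NaN, while convert_missing=True returns StataMissingValue objects that remember which code was stored:

import pandas as pd

as_nan = pd.read_stata('missing.dta')  # missing codes collapse to NaN
as_objects = pd.read_stata('missing.dta', convert_missing=True)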
Example 14
    def test_dtype_conversion(self):
        expected = self.read_csv(self.csv15)
        expected["byte_"] = expected["byte_"].astype(np.int8)
        expected["int_"] = expected["int_"].astype(np.int16)
        expected["long_"] = expected["long_"].astype(np.int32)
        expected["float_"] = expected["float_"].astype(np.float32)
        expected["double_"] = expected["double_"].astype(np.float64)
        expected["date_td"] = expected["date_td"].apply(datetime.strptime, args=("%Y-%m-%d",))

        no_conversion = read_stata(self.dta15_117, convert_dates=True)
        tm.assert_frame_equal(expected, no_conversion)

        conversion = read_stata(self.dta15_117, convert_dates=True, preserve_dtypes=False)

        # read_csv types are the same
        expected = self.read_csv(self.csv15)
        expected["date_td"] = expected["date_td"].apply(datetime.strptime, args=("%Y-%m-%d",))

        tm.assert_frame_equal(expected, conversion)
Example 15
    def test_read_chunks_117(self):
        files_117 = [
            self.dta1_117, self.dta2_117, self.dta3_117, self.dta4_117,
            self.dta14_117, self.dta15_117, self.dta16_117, self.dta17_117,
            self.dta18_117, self.dta19_117, self.dta20_117
        ]

        for fname in files_117:
            for chunksize in 1, 2:
                for convert_categoricals in False, True:
                    for convert_dates in False, True:

                        with warnings.catch_warnings(record=True) as w:
                            warnings.simplefilter("always")
                            parsed = read_stata(
                                fname,
                                convert_categoricals=convert_categoricals,
                                convert_dates=convert_dates)
                        itr = read_stata(
                            fname,
                            iterator=True,
                            convert_categoricals=convert_categoricals,
                            convert_dates=convert_dates)

                        pos = 0
                        for j in range(5):
                            with warnings.catch_warnings(
                                    record=True) as w:  # noqa
                                warnings.simplefilter("always")
                                try:
                                    chunk = itr.read(chunksize)
                                except StopIteration:
                                    break
                            from_frame = parsed.iloc[pos:pos + chunksize, :]
                            tm.assert_frame_equal(
                                from_frame,
                                chunk,
                                check_dtype=False,
                                check_datetimelike_compat=True,
                                check_categorical=False)

                            pos += chunksize
                        itr.close()
Example 16
    def test_encoding(self):

        # GH 4626, proper encoding handling
        raw = read_stata(self.dta_encoding)
        encoded = read_stata(self.dta_encoding, encoding="latin-1")
        result = encoded.kreis1849[0]

        if compat.PY3:
            expected = raw.kreis1849[0]
            self.assertEqual(result, expected)
            self.assertIsInstance(result, compat.string_types)
        else:
            expected = raw.kreis1849.str.decode("latin-1")[0]
            self.assertEqual(result, expected)
            self.assertIsInstance(result, unicode)

        with tm.ensure_clean() as path:
            encoded.to_stata(path, encoding='latin-1', write_index=False)
            reread_encoded = read_stata(path, encoding='latin-1')
            tm.assert_frame_equal(encoded, reread_encoded)
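A caveat for anyone adapting this test: later pandas releases removed the encoding argument from read_stata and to_stata and determine the text encoding from the file itself, so on a current install the read reduces to a call like this sketch:

import pandas as pd

df = pd.read_stata('encoded.dta')  # encoding is handled automatically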
Example 17
    def test_iterator(self):

        fname = self.dta3_117

        parsed = read_stata(fname)

        itr = read_stata(fname, iterator=True)
        chunk = itr.read(5)
        tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

        itr = read_stata(fname, chunksize=5)
        chunk = list(itr)
        tm.assert_frame_equal(parsed.iloc[0:5, :], chunk[0])

        itr = read_stata(fname, iterator=True)
        chunk = itr.get_chunk(5)
        tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

        itr = read_stata(fname, chunksize=5)
        chunk = itr.get_chunk()
        tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
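Outside of tests, the chunksize variant is the tool for files too large to load at once; the reader is iterable, so the usual pattern is a plain for loop. A sketch, where the file name and process() are placeholders:

import pandas as pd

with pd.read_stata('large.dta', chunksize=10000) as reader:
    for chunk in reader:
        # Each chunk is an ordinary DataFrame of up to 10000 rows.
        process(chunk)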
Example 18
    def test_drop_column(self):
        expected = self.read_csv(self.csv15)
        expected["byte_"] = expected["byte_"].astype(np.int8)
        expected["int_"] = expected["int_"].astype(np.int16)
        expected["long_"] = expected["long_"].astype(np.int32)
        expected["float_"] = expected["float_"].astype(np.float32)
        expected["double_"] = expected["double_"].astype(np.float64)
        expected["date_td"] = expected["date_td"].apply(datetime.strptime, args=("%Y-%m-%d",))

        columns = ["byte_", "int_", "long_"]
        expected = expected[columns]
        dropped = read_stata(self.dta15_117, convert_dates=True, columns=columns)

        tm.assert_frame_equal(expected, dropped)
        with tm.assertRaises(ValueError):
            columns = ["byte_", "byte_"]
            read_stata(self.dta15_117, convert_dates=True, columns=columns)

        with tm.assertRaises(ValueError):
            columns = ["byte_", "int_", "long_", "not_found"]
            read_stata(self.dta15_117, convert_dates=True, columns=columns)
Example 19
    def test_drop_column(self):
        expected = self.read_csv(self.csv15)
        expected['byte_'] = expected['byte_'].astype(np.int8)
        expected['int_'] = expected['int_'].astype(np.int16)
        expected['long_'] = expected['long_'].astype(np.int32)
        expected['float_'] = expected['float_'].astype(np.float32)
        expected['double_'] = expected['double_'].astype(np.float64)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=('%Y-%m-%d', ))

        columns = ['byte_', 'int_', 'long_']
        expected = expected[columns]
        dropped = read_stata(self.dta15_117,
                             convert_dates=True,
                             columns=columns)

        tm.assert_frame_equal(expected, dropped)

        # See PR 10757
        columns = ['int_', 'long_', 'byte_']
        expected = expected[columns]
        reordered = read_stata(self.dta15_117,
                               convert_dates=True,
                               columns=columns)
        tm.assert_frame_equal(expected, reordered)

        with tm.assertRaises(ValueError):
            columns = ['byte_', 'byte_']
            read_stata(self.dta15_117, convert_dates=True, columns=columns)

        with tm.assertRaises(ValueError):
            columns = ['byte_', 'int_', 'long_', 'not_found']
            read_stata(self.dta15_117, convert_dates=True, columns=columns)
Example 20
    def test_iterator(self):

        fname = self.dta3_117

        parsed = read_stata(fname)

        with read_stata(fname, iterator=True) as itr:
            chunk = itr.read(5)
            tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

        with read_stata(fname, chunksize=5) as itr:
            chunk = list(itr)
            tm.assert_frame_equal(parsed.iloc[0:5, :], chunk[0])

        with read_stata(fname, iterator=True) as itr:
            chunk = itr.get_chunk(5)
            tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

        with read_stata(fname, chunksize=5) as itr:
            chunk = itr.get_chunk()
            tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

        # GH12153
        from_chunks = pd.concat(read_stata(fname, chunksize=4))
        tm.assert_frame_equal(parsed, from_chunks)
Example 21
    sedf[r'$\log\lambda$'] = e.std().as_matrix() / np.sqrt(resultdf['$N$'])

    tstats = pd.DataFrame({'TUP': resultdf['TUP'] / sedf['TUP'],
                           'CTL': resultdf['CTL'] / sedf['CTL'],
                           'Diff.': resultdf['Diff.'] / sedf['Diff.']})

    if loglambdas is not None:
        llb = b.filter(like='loglambda_')
        resultdf[r'$\log\lambda$'] = llb.iloc[0, :].as_matrix()
        tstats[r'$\log\lambda$'] = (resultdf[r'$\log\lambda$'].as_matrix() /
                                    sedf[r'$\log\lambda$'])

    return resultdf, sedf, tstats, chi2test
import pandas as pd
from pandas.io import stata

#~ Get data
D = stata.read_stata('../data/Midline/TUP_merged2014.dta').set_index('idno', drop=False)
#~ Make balanced
D['balanced'] = D['merge_midline'].apply(lambda x: '3' in x)
D = D[D['balanced']]

midline = stata.read_stata('../data/Midline/ssbusiness_survey.dta').set_index('rid')
midline.rename(columns={'s6a': 'land_owncult_m',
                        's6d': 'land_rentcult_m',
                        's6e': 'land_communitycult_m'}, inplace=True)
D = D.join(midline.filter(regex='^land_'), how='left')
D.drop(602, inplace=True)
#~ Eliminate duplicates
D = D.groupby(D.index).first()

#~ in_business(_m) indicates whether they said "Yes" to "have you been
#~ involved in any non-farm self-employment in the past year?"
D['in_business_m'] = D['s3_m']
Example 22
 def read_dta(self, file):
     # Legacy default reader configuration
     return read_stata(file, convert_dates=True)