Example #1
def test_override_set_noconvert_columns():
    # see gh-17351
    #
    # Usecols needs to be sorted in _set_noconvert_columns based
    # on the test_usecols_with_parse_dates test from test_usecols.py
    class MyTextFileReader(TextFileReader):
        def __init__(self):
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered
                # but in practice, a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before GH 17351 would cause the wrong columns to be
                # converted via the parse_dates parameter
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    data = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""

    parse_dates = [[1, 2]]
    cols = {
        "a": [0, 0],
        "c_d":
        [Timestamp("2014-01-01 09:00:00"),
         Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    parser = MyTextFileReader()
    parser.options = {
        "usecols": [0, 2, 3],
        "parse_dates": parse_dates,
        "delimiter": ",",
    }
    parser.engine = "c"
    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)

    result = parser.read()
    tm.assert_frame_equal(result, expected)
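
A side note on the comment inside MyCParserWrapper: CPython happens to iterate small sets of non-negative integers in sorted order, which is exactly the accident the test guards against. A minimal sketch (plain CPython, no pandas; the variable names are illustrative):

# CPython detail: small non-negative ints hash to themselves, so a set of
# them usually iterates in sorted order. The language spec does not
# guarantee this, hence the reversed list in the test above.
usecols = {3, 0, 2}
print(list(usecols))      # typically [0, 2, 3] on CPython
simulated = list(usecols)
simulated.reverse()       # simulate an implementation with another order
print(simulated)          # e.g. [3, 2, 0]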
Example #2
    def test_override__set_noconvert_columns(self):
        # GH 17351 - usecols needs to be sorted in _set_noconvert_columns
        # based on the test_usecols_with_parse_dates test from test_usecols.py
        from pandas.io.parsers import CParserWrapper, TextFileReader

        s = """a,b,c,d,e
        0,1,20140101,0900,4
        0,1,20140102,1000,4"""

        parse_dates = [[1, 2]]
        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        class MyTextFileReader(TextFileReader):
            def __init__(self):
                self._currow = 0
                self.squeeze = False

        class MyCParserWrapper(CParserWrapper):
            def _set_noconvert_columns(self):
                if self.usecols_dtype == 'integer':
                    # self.usecols is a set, which is documented as unordered
                    # but in practice, a CPython set of integers is sorted.
                    # In other implementations this assumption does not hold.
                    # The following code simulates a different order, which
                    # before GH 17351 would cause the wrong columns to be
                    # converted via the parse_dates parameter
                    self.usecols = list(self.usecols)
                    self.usecols.reverse()
                return CParserWrapper._set_noconvert_columns(self)

        parser = MyTextFileReader()
        parser.options = {
            'usecols': [0, 2, 3],
            'parse_dates': parse_dates,
            'delimiter': ','
        }
        parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
        df = parser.read()

        tm.assert_frame_equal(df, expected)
Example #3
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
    # see gh-9755
    s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
    parse_dates = [[1, 2]]
    parser = all_parsers

    cols = {
        "a": [0, 0],
        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    result = parser.read_csv(
        StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
    )
    tm.assert_frame_equal(result, expected)
Example #4
    def test_multiple_date_col_timestamp_parse(self):
        data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""
        result = self.read_csv(StringIO(data), sep=',', header=None,
                               parse_dates=[[0, 1]], date_parser=Timestamp)

        ex_val = Timestamp('05/31/2012 15:30:00.029')
        assert result['0_1'][0] == ex_val
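
Examples #3 and #4 both rely on the list-of-lists form of parse_dates, which joins the named columns with a space before parsing. A self-contained sketch of that behaviour (hedged: this form was deprecated in pandas 2.0, so it assumes an older pandas):

import pandas as pd
from io import StringIO

data = "a,b,c\n0,20140101,0900\n0,20140102,1000"
# parse_dates=[[1, 2]] concatenates columns 1 and 2 ("b" and "c") with a
# space, parses the result, and names the merged column "b_c".
df = pd.read_csv(StringIO(data), parse_dates=[[1, 2]])
print(df.columns.tolist())   # ['b_c', 'a'] -- merged column comes first
print(df["b_c"].iloc[0])     # 2014-01-01 09:00:00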
Example #5
    def create_event(self, port, time, type, value):
        e = Event(
            Mention(airport_id=port,
                    type='snow',
                    datetime_reported=Timestamp.today(),
                    datetime_happened=time,
                    raw_description=value))
        self.add_event(e, time_area='01:00:00')
        return e
Example #6
def _to_m8(key, tz=None):
    """
    Timestamp-like => dt64
    """
    if not isinstance(key, Timestamp):
        # this also converts strings
        key = Timestamp(key, tz=tz)

    return np.int64(conversion.pydt_to_i8(key)).view(_NS_DTYPE)
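
_to_m8 is private pandas code (conversion.pydt_to_i8 and _NS_DTYPE are internal names). The observable conversion can be approximated with public API; a sketch:

import numpy as np
import pandas as pd

key = "2013-01-01"                       # Timestamp-like; strings convert too
ts = pd.Timestamp(key)                   # normalize to Timestamp
m8 = np.int64(ts.value).view("M8[ns]")   # epoch nanoseconds -> datetime64[ns]
print(m8)                                # 2013-01-01T00:00:00.000000000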
Example #7
    def _has_same_tz(self, other):
        zzone = self._timezone

        # vzone shouldn't be None if value is non-datetime like
        if isinstance(other, np.datetime64):
            # convert to Timestamp as np.datetime64 doesn't have tz attr
            other = Timestamp(other)
        vzone = timezones.get_timezone(getattr(other, 'tzinfo', '__no_tz__'))
        return zzone == vzone
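
The Timestamp conversion step exists because np.datetime64 scalars carry no timezone information at all. A quick check:

import numpy as np
import pandas as pd

dt64 = np.datetime64("2013-01-01")
print(hasattr(dt64, "tzinfo"))                       # False: no tz attribute
print(pd.Timestamp(dt64).tzinfo)                     # None: tz-naive Timestamp
print(pd.Timestamp("2013-01-01", tz="UTC").tzinfo)   # UTC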
Example #8
def _generate_regular_range(cls, start, end, periods, freq):
    if isinstance(freq, Tick):
        stride = freq.nanos
        if periods is None:
            b = Timestamp(start).value
            # cannot just use e = Timestamp(end) + 1 because arange breaks when
            # stride is too large, see GH10887
            e = (b + (Timestamp(end).value - b) // stride * stride +
                 stride // 2 + 1)
            # end.tz == start.tz by this point due to _generate implementation
            tz = start.tz
        elif start is not None:
            b = Timestamp(start).value
            e = b + np.int64(periods) * stride
            tz = start.tz
        elif end is not None:
            e = Timestamp(end).value + stride
            b = e - np.int64(periods) * stride
            tz = end.tz
        else:
            raise ValueError("at least 'start' or 'end' should be specified "
                             "if a 'period' is given.")

        data = np.arange(b, e, stride, dtype=np.int64)
        data = cls._simple_new(data.view(_NS_DTYPE), None, tz=tz)
    else:
        tz = None
        if isinstance(start, Timestamp):
            tz = start.tz
            start = start.to_pydatetime()

        if isinstance(end, Timestamp):
            tz = end.tz
            end = end.to_pydatetime()

        xdr = generate_range(start=start,
                             end=end,
                             periods=periods,
                             offset=freq)

        values = np.array([x.value for x in xdr])
        data = cls._simple_new(values, freq=freq, tz=tz)

    return data
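
The endpoint arithmetic in the Tick branch deserves a worked example: rather than passing end + 1 to np.arange (which misbehaves for very large stop values, per GH10887), the exclusive endpoint is snapped onto the stride grid. A sketch with plain integers (the values are illustrative):

import numpy as np

b = 0                  # start, in nanoseconds since the epoch
end = 10**10           # end, in nanoseconds
stride = 3 * 10**9     # freq.nanos for a 3-second Tick
# Snap the exclusive stop just past the last on-grid point <= end,
# instead of using end + 1 (see GH10887).
e = b + (end - b) // stride * stride + stride // 2 + 1
print(np.arange(b, e, stride, dtype=np.int64))
# [         0 3000000000 6000000000 9000000000]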
Example #9
    def test_csvtext(self):
        csvtext = """2017-09-12,932.1
2017-09-13,935.0
2017-09-14,925.1
2017-09-15,920.2
"""
        series = from_csvtext(csvtext)

        self.assertEqual(series.index[0], Timestamp('2017-09-12'))
        self.assertEqual(series.index[1], Timestamp('2017-09-13'))
        self.assertEqual(series.index[2], Timestamp('2017-09-14'))
        self.assertEqual(series.index[3], Timestamp('2017-09-15'))

        self.assertEqual(series[0], 932.1)
        self.assertEqual(series[1], 935.0)
        self.assertEqual(series[2], 925.1)
        self.assertEqual(series[3], 920.2)

        self.assertEqual(to_csvtext(series), csvtext)
Example #10
def test_multiple_date_col_timestamp_parse(all_parsers):
    parser = all_parsers
    data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""

    result = parser.read_csv(StringIO(data),
                             parse_dates=[[0, 1]],
                             header=None,
                             date_parser=Timestamp)
    expected = DataFrame(
        [[Timestamp("05/31/2012, 15:30:00.029"), 1306.25, 1, "E", 0,
          np.nan, 1306.25],
         [Timestamp("05/31/2012, 15:30:00.029"), 1306.25, 8, "E", 0,
          np.nan, 1306.25]],
        columns=["0_1", 2, 3, 4, 5, 6, 7])
    tm.assert_frame_equal(result, expected)
Example #11
def test_parse_tz_aware(all_parsers):
    # See gh-1693
    parser = all_parsers
    data = "Date,x\n2012-06-13T01:39:00Z,0.5"

    result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True)
    expected = DataFrame({"x": [0.5]},
                         index=Index([Timestamp("2012-06-13 01:39:00+00:00")],
                                     name="Date"))
    tm.assert_frame_equal(result, expected)
    assert result.index.tz is pytz.utc
Example #12
def test_usecols_with_parse_dates2(all_parsers):
    # see gh-13604
    parser = all_parsers
    data = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65"""

    names = ["date", "values"]
    usecols = names[:]
    parse_dates = [0]

    index = Index([Timestamp("2008-02-07 09:40"),
                   Timestamp("2008-02-07 09:50"),
                   Timestamp("2008-02-07 10:00")],
                  name="date")
    cols = {"values": [1032.43, 1042.54, 1051.65]}
    expected = DataFrame(cols, index=index)

    result = parser.read_csv(StringIO(data), parse_dates=parse_dates,
                             index_col=0, usecols=usecols,
                             header=None, names=names)
    tm.assert_frame_equal(result, expected)
Example #13
def test_nat_parse(all_parsers):
    # see gh-3062
    parser = all_parsers
    df = DataFrame(
        dict({"A": np.arange(10, dtype="float64"), "B": Timestamp("20010101")})
    )
    df.iloc[3:6, :] = np.nan

    with tm.ensure_clean("__nat_parse_.csv") as path:
        df.to_csv(path)

        result = parser.read_csv(path, index_col=0, parse_dates=["B"])
        tm.assert_frame_equal(result, df)
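
The roundtrip in test_nat_parse works because to_csv writes NaT (and NaN) as empty fields, and read_csv with parse_dates maps empty date fields back to NaT. A self-contained sketch:

import numpy as np
import pandas as pd
from io import StringIO

df = pd.DataFrame({"A": [1.0, np.nan],
                   "B": [pd.Timestamp("2001-01-01"), pd.NaT]})
csv = df.to_csv(index=False)         # NaN/NaT become empty fields
back = pd.read_csv(StringIO(csv), parse_dates=["B"])
print(back["B"].isna().tolist())     # [False, True] -- NaT survived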
Example #14
    def test_index_groupby(self):
        int_idx = Index(range(6))
        float_idx = Index(np.arange(0, 0.6, 0.1))
        obj_idx = Index('A B C D E F'.split())
        dt_idx = pd.date_range('2013-01-01', freq='M', periods=6)

        for idx in [int_idx, float_idx, obj_idx, dt_idx]:
            to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1])
            tm.assert_dict_equal(idx.groupby(to_groupby),
                                 {1.0: idx[[0, 5]], 2.0: idx[[1, 4]]})

            to_groupby = Index([datetime(2011, 11, 1),
                                datetime(2011, 12, 1),
                                pd.NaT,
                                pd.NaT,
                                datetime(2011, 12, 1),
                                datetime(2011, 11, 1)],
                               tz='UTC').values

            ex_keys = [Timestamp('2011-11-01'), Timestamp('2011-12-01')]
            expected = {ex_keys[0]: idx[[0, 5]],
                        ex_keys[1]: idx[[1, 4]]}
            tm.assert_dict_equal(idx.groupby(to_groupby), expected)
Example #15
def test_date_parser_int_bug(all_parsers):
    # see gh-3071
    parser = all_parsers
    data = (
        "posix_timestamp,elapsed,sys,user,queries,query_time,rows,"
        "accountid,userid,contactid,level,silo,method\n"
        "1343103150,0.062353,0,4,6,0.01690,3,"
        "12345,1,-1,3,invoice_InvoiceResource,search\n"
    )

    result = parser.read_csv(
        StringIO(data),
        index_col=0,
        parse_dates=[0],
        date_parser=lambda x: datetime.utcfromtimestamp(int(x)),
    )
    expected = DataFrame(
        [
            [
                0.062353,
                0,
                4,
                6,
                0.01690,
                3,
                12345,
                1,
                -1,
                3,
                "invoice_InvoiceResource",
                "search",
            ]
        ],
        columns=[
            "elapsed",
            "sys",
            "user",
            "queries",
            "query_time",
            "rows",
            "accountid",
            "userid",
            "contactid",
            "level",
            "silo",
            "method",
        ],
        index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"),
    )
    tm.assert_frame_equal(result, expected)
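
An aside on the date_parser lambda: outside a parser test, an integer POSIX column is more idiomatically converted with pd.to_datetime and unit='s', which gives the same result as the lambda above:

import pandas as pd

print(pd.to_datetime(1343103150, unit="s"))   # 2012-07-24 04:12:30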
Example #16
def test_date_parser_usecols_thousands(all_parsers):
    # GH#39365
    data = """A,B,C
    1,3,20-09-01-01
    2,4,20-09-01-01
    """

    parser = all_parsers
    result = parser.read_csv(
        StringIO(data),
        parse_dates=[1],
        usecols=[1, 2],
        thousands="-",
    )
    expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2})
    tm.assert_frame_equal(result, expected)
Example #17
    def test_datetime_units(self):
        val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504)
        stamp = Timestamp(val)

        roundtrip = ujson.decode(ujson.encode(val, date_unit='s'))
        assert roundtrip == stamp.value // 10**9

        roundtrip = ujson.decode(ujson.encode(val, date_unit='ms'))
        assert roundtrip == stamp.value // 10**6

        roundtrip = ujson.decode(ujson.encode(val, date_unit='us'))
        assert roundtrip == stamp.value // 10**3

        roundtrip = ujson.decode(ujson.encode(val, date_unit='ns'))
        assert roundtrip == stamp.value

        pytest.raises(ValueError, ujson.encode, val, date_unit='foo')
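
The divisors in this test (and in its near-duplicate, Example #19 below) line up with Timestamp.value being nanoseconds since the epoch; a sketch of the scaling:

import pandas as pd

stamp = pd.Timestamp("2013-08-17 21:17:12.215504")
print(stamp.value)             # nanoseconds since the epoch
print(stamp.value // 10**9)    # seconds       (date_unit='s')
print(stamp.value // 10**6)    # milliseconds  (date_unit='ms')
print(stamp.value // 10**3)    # microseconds  (date_unit='us')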
Example #18
def test_usecols_with_parse_dates3(all_parsers):
    # see gh-14792
    parser = all_parsers
    data = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""

    usecols = list("abcdefghij")
    parse_dates = [0]

    cols = {"a": Timestamp("2016-09-21"),
            "b": [1], "c": [1], "d": [2],
            "e": [3], "f": [4], "g": [5],
            "h": [6], "i": [7], "j": [8]}
    expected = DataFrame(cols, columns=usecols)

    result = parser.read_csv(StringIO(data), usecols=usecols,
                             parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)
Example #19
    def test_datetime_units(self):
        val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504)
        stamp = Timestamp(val)

        roundtrip = ujson.decode(ujson.encode(val, date_unit="s"))
        assert roundtrip == stamp.value // 10**9

        roundtrip = ujson.decode(ujson.encode(val, date_unit="ms"))
        assert roundtrip == stamp.value // 10**6

        roundtrip = ujson.decode(ujson.encode(val, date_unit="us"))
        assert roundtrip == stamp.value // 10**3

        roundtrip = ujson.decode(ujson.encode(val, date_unit="ns"))
        assert roundtrip == stamp.value

        msg = "Invalid value 'foo' for option 'date_unit'"
        with pytest.raises(ValueError, match=msg):
            ujson.encode(val, date_unit="foo")
Example #20
def test_datetimeindex():
    idx1 = pd.DatetimeIndex(
        ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo"
    )
    idx2 = date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern")
    idx = MultiIndex.from_arrays([idx1, idx2])

    expected1 = pd.DatetimeIndex(
        ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"], tz="Asia/Tokyo"
    )

    tm.assert_index_equal(idx.levels[0], expected1)
    tm.assert_index_equal(idx.levels[1], idx2)

    # from datetime combos
    # GH 7888
    date1 = date.today()
    date2 = datetime.today()
    date3 = Timestamp.today()

    for d1, d2 in itertools.product([date1, date2, date3], [date1, date2, date3]):
        index = MultiIndex.from_product([[d1], [d2]])
        assert isinstance(index.levels[0], pd.DatetimeIndex)
        assert isinstance(index.levels[1], pd.DatetimeIndex)
Example #21
import pandas as pd
import numpy as np
from pandas._libs.tslib import Timestamp

if __name__ == '__main__':
    df1 = pd.read_csv('pdroughts.csv')
    ddata = df1.copy()
    ddata['valid_start'] = pd.to_datetime(ddata['valid_start'])
    ddata['valid_end'] = pd.to_datetime(ddata['valid_end'])

    # select 2010 drought data
    ddata = ddata[((ddata['valid_start'] >= Timestamp('2010-01-01 00:00:00')) \
     & (ddata['valid_start'] < Timestamp('2011-01-01 00:00:00'))) \
    & ((ddata['valid_end'] >= Timestamp('2010-01-01 00:00:00')) \
     & (ddata['valid_end'] < Timestamp('2011-01-01 00:00:00')))]
    # only select droughts (d2-d4) that affect at least 50% of the population
    ddata = ddata[ddata['d2'] + ddata['d3'] + ddata['d4'] >= 50]
    ddata = ddata.sort_values(['fips', 'valid_start'])
    fips = ddata.fips.unique()
    print(len(fips))
    ddata = ddata.reset_index(drop=True)
    # clean water_usage and merge (2010 only)
    df2 = pd.read_csv('water_usage.csv')
    wdata = df2.copy()[[
        'fips', 'state_fips', 'county_fips', 'population', 'd_totaluse'
    ]]
    mapping = []
    for i in range(len(wdata)):
        if wdata.iloc[i]['fips'] in fips:
            mapping.append((wdata.iloc[i]['fips'], wdata.iloc[i]['d_totaluse'],
                            wdata.iloc[i]['population']))
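
Two hedged notes on the script above: Timestamp is also importable as pd.Timestamp, which is stabler than the private pandas._libs.tslib path, and the four chained comparisons can be expressed with Series.between (assuming pandas >= 1.3 for the string inclusive argument). A sketch on toy data:

import pandas as pd

df = pd.DataFrame({
    "valid_start": pd.to_datetime(["2009-12-31", "2010-06-01"]),
    "valid_end":   pd.to_datetime(["2010-01-15", "2010-06-30"]),
})
lo, hi = pd.Timestamp("2010-01-01"), pd.Timestamp("2011-01-01")
mask = (df["valid_start"].between(lo, hi, inclusive="left")
        & df["valid_end"].between(lo, hi, inclusive="left"))
print(df[mask])   # keeps only the row that falls entirely inside 2010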
Example #22
    def _box_func(self):
        return lambda x: Timestamp(x, freq=self.freq, tz=self.tz)
Example #23
    def test_constructor_invalid(self):

        # invalid
        pytest.raises(TypeError, Float64Index, 0.)
        pytest.raises(TypeError, Float64Index, ['a', 'b', 0.])
        pytest.raises(TypeError, Float64Index, [Timestamp('20130101')])
Example #24
    def _generate_range(cls, start, end, periods, freq, tz=None,
                        normalize=False, ambiguous='raise', closed=None):
        if com.count_not_none(start, end, periods, freq) != 3:
            raise ValueError('Of the four parameters: start, end, periods, '
                             'and freq, exactly three must be specified')
        freq = to_offset(freq)

        if start is not None:
            start = Timestamp(start)

        if end is not None:
            end = Timestamp(end)

        if start is None and end is None:
            if closed is not None:
                raise ValueError("Closed has to be None if not both of start"
                                 "and end are defined")

        left_closed, right_closed = dtl.validate_endpoints(closed)

        start, end, _normalized = _maybe_normalize_endpoints(start, end,
                                                             normalize)

        tz, inferred_tz = _infer_tz_from_endpoints(start, end, tz)

        if hasattr(freq, 'delta') and freq != Day():
            # sub-Day Tick
            if inferred_tz is None and tz is not None:
                # naive dates
                if start is not None and start.tz is None:
                    start = start.tz_localize(tz, ambiguous=False)

                if end is not None and end.tz is None:
                    end = end.tz_localize(tz, ambiguous=False)

            if start and end:
                if start.tz is None and end.tz is not None:
                    start = start.tz_localize(end.tz, ambiguous=False)

                if end.tz is None and start.tz is not None:
                    end = end.tz_localize(start.tz, ambiguous=False)

            if cls._use_cached_range(freq, _normalized, start, end):
                index = cls._cached_range(start, end, periods=periods,
                                          freq=freq)
            else:
                index = _generate_regular_range(cls, start, end, periods, freq)

        else:

            if tz is not None:
                # naive dates
                if start is not None and start.tz is not None:
                    start = start.replace(tzinfo=None)

                if end is not None and end.tz is not None:
                    end = end.replace(tzinfo=None)

            if start and end:
                if start.tz is None and end.tz is not None:
                    end = end.replace(tzinfo=None)

                if end.tz is None and start.tz is not None:
                    start = start.replace(tzinfo=None)

            if freq is not None:
                if cls._use_cached_range(freq, _normalized, start, end):
                    index = cls._cached_range(start, end, periods=periods,
                                              freq=freq)
                else:
                    index = _generate_regular_range(cls, start, end,
                                                    periods, freq)

                if tz is not None and getattr(index, 'tz', None) is None:
                    arr = conversion.tz_localize_to_utc(
                        ensure_int64(index.values),
                        tz, ambiguous=ambiguous)

                    index = cls(arr)

                    # index is localized datetime64 array -> have to convert
                    # start/end as well to compare
                    if start is not None:
                        start = start.tz_localize(tz).asm8
                    if end is not None:
                        end = end.tz_localize(tz).asm8
            else:
                # Create a linearly spaced date_range in local time
                start = start.tz_localize(tz)
                end = end.tz_localize(tz)
                arr = np.linspace(start.value, end.value, periods)
                index = cls._simple_new(arr.astype('M8[ns]'), freq=None, tz=tz)

        if not left_closed and len(index) and index[0] == start:
            index = index[1:]
        if not right_closed and len(index) and index[-1] == end:
            index = index[:-1]

        return cls._simple_new(index.values, freq=freq, tz=tz)
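
The left_closed/right_closed trimming at the end of _generate_range is what backs the endpoint-inclusion argument of the public pd.date_range. A sketch (pandas >= 1.4 spells the argument inclusive=; the closed= name in this excerpt is the older spelling):

import pandas as pd

full = pd.date_range("2020-01-01", "2020-01-04", freq="D")
print(len(full))    # 4 -- both endpoints included by default

left = pd.date_range("2020-01-01", "2020-01-04", freq="D", inclusive="left")
print(left[-1])     # 2020-01-03 -- right endpoint trimmed off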
Example #25
    def actualizer(self, sch, after_date=None):
        if after_date is None:
            after_date = Timestamp.today()

        return sch[sch['Arrival datatime'] > after_date]
Example #26
    def _generate_range(cls,
                        start,
                        end,
                        periods,
                        freq,
                        tz=None,
                        normalize=False,
                        ambiguous='raise',
                        closed=None):
        if com.count_not_none(start, end, periods, freq) != 3:
            raise ValueError('Of the four parameters: start, end, periods, '
                             'and freq, exactly three must be specified')
        freq = to_offset(freq)

        if start is not None:
            start = Timestamp(start)

        if end is not None:
            end = Timestamp(end)

        if start is None and end is None:
            if closed is not None:
                raise ValueError("Closed has to be None if not both of start"
                                 "and end are defined")

        left_closed, right_closed = dtl.validate_endpoints(closed)

        start, end, _normalized = _maybe_normalize_endpoints(
            start, end, normalize)

        tz, inferred_tz = _infer_tz_from_endpoints(start, end, tz)

        if hasattr(freq, 'delta') and freq != Day():
            # sub-Day Tick
            if inferred_tz is None and tz is not None:
                # naive dates
                if start is not None and start.tz is None:
                    start = start.tz_localize(tz, ambiguous=False)

                if end is not None and end.tz is None:
                    end = end.tz_localize(tz, ambiguous=False)

            if start and end:
                if start.tz is None and end.tz is not None:
                    start = start.tz_localize(end.tz, ambiguous=False)

                if end.tz is None and start.tz is not None:
                    end = end.tz_localize(start.tz, ambiguous=False)

            if cls._use_cached_range(freq, _normalized, start, end):
                index = cls._cached_range(start,
                                          end,
                                          periods=periods,
                                          freq=freq)
            else:
                index = _generate_regular_range(cls, start, end, periods, freq)

        else:

            if tz is not None:
                # naive dates
                if start is not None and start.tz is not None:
                    start = start.replace(tzinfo=None)

                if end is not None and end.tz is not None:
                    end = end.replace(tzinfo=None)

            if start and end:
                if start.tz is None and end.tz is not None:
                    end = end.replace(tzinfo=None)

                if end.tz is None and start.tz is not None:
                    start = start.replace(tzinfo=None)

            if freq is not None:
                if cls._use_cached_range(freq, _normalized, start, end):
                    index = cls._cached_range(start,
                                              end,
                                              periods=periods,
                                              freq=freq)
                else:
                    index = _generate_regular_range(cls, start, end, periods,
                                                    freq)

                if tz is not None and getattr(index, 'tz', None) is None:
                    arr = conversion.tz_localize_to_utc(ensure_int64(
                        index.values),
                                                        tz,
                                                        ambiguous=ambiguous)

                    index = cls(arr)

                    # index is localized datetime64 array -> have to convert
                    # start/end as well to compare
                    if start is not None:
                        start = start.tz_localize(tz).asm8
                    if end is not None:
                        end = end.tz_localize(tz).asm8
            else:
                # Create a linearly spaced date_range in local time
                start = start.tz_localize(tz)
                end = end.tz_localize(tz)
                arr = np.linspace(start.value, end.value, periods)
                index = cls._simple_new(arr.astype('M8[ns]'), freq=None, tz=tz)

        if not left_closed and len(index) and index[0] == start:
            index = index[1:]
        if not right_closed and len(index) and index[-1] == end:
            index = index[:-1]

        return cls._simple_new(index.values, freq=freq, tz=tz)
Example #27
    def test_usecols_with_parse_dates(self):
        # See gh-9755
        s = """a,b,c,d,e
        0,1,20140101,0900,4
        0,1,20140102,1000,4"""
        parse_dates = [[1, 2]]

        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        # See gh-13604
        s = """2008-02-07 09:40,1032.43
        2008-02-07 09:50,1042.54
        2008-02-07 10:00,1051.65
        """
        parse_dates = [0]
        names = ['date', 'values']
        usecols = names[:]

        index = Index([Timestamp('2008-02-07 09:40'),
                       Timestamp('2008-02-07 09:50'),
                       Timestamp('2008-02-07 10:00')],
                      name='date')
        cols = {'values': [1032.43, 1042.54, 1051.65]}
        expected = DataFrame(cols, index=index)

        df = self.read_csv(StringIO(s), parse_dates=parse_dates, index_col=0,
                           usecols=usecols, header=None, names=names)
        tm.assert_frame_equal(df, expected)

        # See gh-14792
        s = """a,b,c,d,e,f,g,h,i,j
        2016/09/21,1,1,2,3,4,5,6,7,8"""
        parse_dates = [0]
        usecols = list('abcdefghij')
        cols = {'a': Timestamp('2016-09-21'),
                'b': [1], 'c': [1], 'd': [2],
                'e': [3], 'f': [4], 'g': [5],
                'h': [6], 'i': [7], 'j': [8]}
        expected = DataFrame(cols, columns=usecols)
        df = self.read_csv(StringIO(s), usecols=usecols,
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
        parse_dates = [[0, 1]]
        usecols = list('abcdefghij')
        cols = {'a_b': '2016/09/21 1',
                'c': [1], 'd': [2], 'e': [3], 'f': [4],
                'g': [5], 'h': [6], 'i': [7], 'j': [8]}
        expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
        df = self.read_csv(StringIO(s), usecols=usecols,
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)
Example #28
    def _generate_range(cls, start, end, periods, freq, tz=None,
                        normalize=False, ambiguous='raise', closed=None):
        if com.count_not_none(start, end, periods, freq) != 3:
            raise ValueError('Of the four parameters: start, end, periods, '
                             'and freq, exactly three must be specified')
        freq = to_offset(freq)

        if start is not None:
            start = Timestamp(start)

        if end is not None:
            end = Timestamp(end)

        if start is None and end is None:
            if closed is not None:
                raise ValueError("Closed has to be None if not both of start"
                                 "and end are defined")

        left_closed, right_closed = dtl.validate_endpoints(closed)

        start, end, _normalized = _maybe_normalize_endpoints(start, end,
                                                             normalize)

        tz, _ = _infer_tz_from_endpoints(start, end, tz)

        if tz is not None:
            # Localize the start and end arguments
            start = _maybe_localize_point(
                start, getattr(start, 'tz', None), start, freq, tz
            )
            end = _maybe_localize_point(
                end, getattr(end, 'tz', None), end, freq, tz
            )
        if start and end:
            # Make sure start and end have the same tz
            start = _maybe_localize_point(
                start, start.tz, end.tz, freq, tz
            )
            end = _maybe_localize_point(
                end, end.tz, start.tz, freq, tz
            )
        if freq is not None:
            if cls._use_cached_range(freq, _normalized, start, end):
                # Currently always False; never hit
                # Should be reimplemented as a part of GH 17914
                index = cls._cached_range(start, end, periods=periods,
                                          freq=freq)
            else:
                index = _generate_regular_range(cls, start, end, periods, freq)

                if tz is not None and getattr(index, 'tz', None) is None:
                    arr = conversion.tz_localize_to_utc(
                        ensure_int64(index.values),
                        tz, ambiguous=ambiguous)

                    index = cls(arr)

                    # index is localized datetime64 array -> have to convert
                    # start/end as well to compare
                    if start is not None:
                        start = start.tz_localize(tz).asm8
                    if end is not None:
                        end = end.tz_localize(tz).asm8
        else:
            # Create a linearly spaced date_range in local time
            arr = np.linspace(start.value, end.value, periods)
            index = cls._simple_new(
                arr.astype('M8[ns]', copy=False), freq=None, tz=tz
            )

        if not left_closed and len(index) and index[0] == start:
            index = index[1:]
        if not right_closed and len(index) and index[-1] == end:
            index = index[:-1]

        return cls._simple_new(index.values, freq=freq, tz=tz)
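
The else branch (freq is None) is what backs date_range(start, end, periods=n): the endpoints are fixed and np.linspace fills in evenly spaced nanosecond values. A sketch of the same arithmetic with public types:

import numpy as np
import pandas as pd

start, end = pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")
arr = np.linspace(start.value, end.value, 3)    # evenly spaced nanoseconds
idx = pd.DatetimeIndex(arr.astype("M8[ns]"))    # float ns -> datetime64[ns]
print(idx[1])                                   # 2020-01-01 12:00:00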
Example #29
    def _generate_range(cls, start, end, periods, freq, tz=None,
                        normalize=False, ambiguous='raise', closed=None):
        if com.count_not_none(start, end, periods, freq) != 3:
            raise ValueError('Of the four parameters: start, end, periods, '
                             'and freq, exactly three must be specified')
        freq = to_offset(freq)

        if start is not None:
            start = Timestamp(start)

        if end is not None:
            end = Timestamp(end)

        if start is None and end is None:
            if closed is not None:
                raise ValueError("Closed has to be None if not both of start"
                                 "and end are defined")

        left_closed, right_closed = dtl.validate_endpoints(closed)

        start, end, _normalized = _maybe_normalize_endpoints(start, end,
                                                             normalize)

        tz, _ = _infer_tz_from_endpoints(start, end, tz)

        if tz is not None:
            # Localize the start and end arguments
            start = _maybe_localize_point(
                start, getattr(start, 'tz', None), start, freq, tz
            )
            end = _maybe_localize_point(
                end, getattr(end, 'tz', None), end, freq, tz
            )
        if start and end:
            # Make sure start and end have the same tz
            start = _maybe_localize_point(
                start, start.tz, end.tz, freq, tz
            )
            end = _maybe_localize_point(
                end, end.tz, start.tz, freq, tz
            )
        if freq is not None:
            if cls._use_cached_range(freq, _normalized, start, end):
                # Currently always False; never hit
                # Should be reimplemented as a part of GH 17914
                index = cls._cached_range(start, end, periods=periods,
                                          freq=freq)
            else:
                index = _generate_regular_range(cls, start, end, periods, freq)

                if tz is not None and getattr(index, 'tz', None) is None:
                    arr = conversion.tz_localize_to_utc(
                        ensure_int64(index.values),
                        tz, ambiguous=ambiguous)

                    index = cls(arr)

                    # index is localized datetime64 array -> have to convert
                    # start/end as well to compare
                    if start is not None:
                        start = start.tz_localize(tz).asm8
                    if end is not None:
                        end = end.tz_localize(tz).asm8
        else:
            # Create a linearly spaced date_range in local time
            arr = np.linspace(start.value, end.value, periods)
            index = cls._simple_new(
                arr.astype('M8[ns]', copy=False), freq=None, tz=tz
            )

        if not left_closed and len(index) and index[0] == start:
            index = index[1:]
        if not right_closed and len(index) and index[-1] == end:
            index = index[:-1]

        return cls._simple_new(index.values, freq=freq, tz=tz)