Example #1
    def test_with_duplicates(self):

        q = pd.concat([self.quotes, self.quotes]).sort_values(
            ['time', 'ticker']).reset_index(drop=True)
        result = merge_asof(self.trades, q,
                            on='time',
                            by='ticker')
        expected = self.read_data('asof.csv')
        assert_frame_equal(result, expected)

        result = merge_asof(self.trades, q,
                            on='time',
                            by='ticker',
                            check_duplicates=False)
        expected = self.read_data('asof.csv')
        expected = pd.concat([expected, expected]).sort_values(
            ['time', 'ticker']).reset_index(drop=True)

        # the results are not ordered in a meaningful way,
        # nor are the exact matches duplicated, so comparisons
        # are pretty tricky here; however, the unique rows are the same

        def aligner(x, ticker):
            return (x[x.ticker == ticker]
                    .sort_values(['time', 'ticker', 'quantity', 'price',
                                  'marketCenter', 'bid', 'ask'])
                    .drop_duplicates(keep='last')
                    .reset_index(drop=True)
                    )

        for ticker in expected.ticker.unique():
            r = aligner(result, ticker)
            e = aligner(expected, ticker)
            assert_frame_equal(r, e)
Example #2
    def test_on_specialized_type(self):
        # GH13936
        for dtype in [np.uint8, np.uint16, np.uint32, np.uint64,
                      np.int8, np.int16, np.int32, np.int64,
                      np.float16, np.float32, np.float64]:
            df1 = pd.DataFrame({
                'value': [5, 2, 25, 100, 78, 120, 79],
                'symbol': list("ABCDEFG")},
                columns=['symbol', 'value'])
            df1.value = dtype(df1.value)

            df2 = pd.DataFrame({
                'value': [0, 80, 120, 125],
                'result': list('xyzw')},
                columns=['value', 'result'])
            df2.value = dtype(df2.value)

            df1 = df1.sort_values('value').reset_index(drop=True)

            if dtype == np.float16:
                with self.assertRaises(MergeError):
                    pd.merge_asof(df1, df2, on='value')
                continue

            result = pd.merge_asof(df1, df2, on='value')

            expected = pd.DataFrame(
                {'symbol': list("BACEGDF"),
                 'value': [2, 5, 25, 78, 79, 100, 120],
                 'result': list('xxxxxyz')
                 }, columns=['symbol', 'value', 'result'])
            expected.value = dtype(expected.value)

            assert_frame_equal(result, expected)
Example #3
    def test_allow_exact_matches_and_tolerance2(self):
        # GH 13695
        df1 = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob']})
        df2 = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.000',
                                    '2016-07-15 13:30:00.030']),
            'version': [1, 2]})

        result = pd.merge_asof(df1, df2, on='time')
        expected = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob'],
            'version': [2]})
        assert_frame_equal(result, expected)

        result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False)
        expected = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob'],
            'version': [1]})
        assert_frame_equal(result, expected)

        result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False,
                               tolerance=pd.Timedelta('10ms'))
        expected = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob'],
            'version': [np.nan]})
        assert_frame_equal(result, expected)
Example #4
    def test_multiby_indexed(self):
        # GH15676
        left = pd.DataFrame([
            [pd.to_datetime('20160602'), 1, 'a'],
            [pd.to_datetime('20160602'), 2, 'a'],
            [pd.to_datetime('20160603'), 1, 'b'],
            [pd.to_datetime('20160603'), 2, 'b']],
            columns=['time', 'k1', 'k2']).set_index('time')

        right = pd.DataFrame([
            [pd.to_datetime('20160502'), 1, 'a', 1.0],
            [pd.to_datetime('20160502'), 2, 'a', 2.0],
            [pd.to_datetime('20160503'), 1, 'b', 3.0],
            [pd.to_datetime('20160503'), 2, 'b', 4.0]],
            columns=['time', 'k1', 'k2', 'value']).set_index('time')

        expected = pd.DataFrame([
            [pd.to_datetime('20160602'), 1, 'a', 1.0],
            [pd.to_datetime('20160602'), 2, 'a', 2.0],
            [pd.to_datetime('20160603'), 1, 'b', 3.0],
            [pd.to_datetime('20160603'), 2, 'b', 4.0]],
            columns=['time', 'k1', 'k2', 'value']).set_index('time')

        result = pd.merge_asof(left,
                               right,
                               left_index=True,
                               right_index=True,
                               by=['k1', 'k2'])

        assert_frame_equal(expected, result)

        with self.assertRaises(MergeError):
            pd.merge_asof(left, right, left_index=True, right_index=True,
                          left_by=['k1', 'k2'], right_by=['k1'])
Example #5
    def test_examples1(self):
        """ doc-string examples """

        left = pd.DataFrame({'a': [1, 5, 10],
                             'left_val': ['a', 'b', 'c']})
        right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
                              'right_val': [1, 2, 3, 6, 7]})

        pd.merge_asof(left, right, on='a')
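The call above is not asserted; as a point of reference, here is a minimal sketch of what it returns under merge_asof's defaults (direction='backward', allow_exact_matches=True). The frames are copied from the test; the printed result is simply what those defaults imply.

import pandas as pd

left = pd.DataFrame({'a': [1, 5, 10],
                     'left_val': ['a', 'b', 'c']})
right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
                      'right_val': [1, 2, 3, 6, 7]})

# each left 'a' is matched to the last right 'a' that is <= it
result = pd.merge_asof(left, right, on='a')
#     a left_val  right_val
# 0   1        a          1
# 1   5        b          3
# 2  10        c          7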
Example #6
    def test_valid_allow_exact_matches(self):

        trades = self.trades
        quotes = self.quotes

        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       on='time',
                       by='ticker',
                       allow_exact_matches='foo')
Example #7
    def test_merge_datatype_error(self):
        """ Tests merge datatype mismatch error """
        msg = r'merge keys \[0\] object and int64, must be the same type'

        left = pd.DataFrame({'left_val': [1, 5, 10],
                             'a': ['a', 'b', 'c']})
        right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7],
                              'a': [1, 2, 3, 6, 7]})

        with pytest.raises(MergeError, match=msg):
            merge_asof(left, right, on='a')
Example #8
    def test_merge_on_nans(self, func, side):
        # GH 23189
        msg = "Merge keys contain null values on {} side".format(side)
        nulls = func([1.0, 5.0, np.nan])
        non_nulls = func([1.0, 5.0, 10.])
        df_null = pd.DataFrame({'a': nulls, 'left_val': ['a', 'b', 'c']})
        df = pd.DataFrame({'a': non_nulls, 'right_val': [1, 6, 11]})

        with pytest.raises(ValueError, match=msg):
            if side == 'left':
                merge_asof(df_null, df, on='a')
            else:
                merge_asof(df, df_null, on='a')
Example #9
def test_asof_join_predicate(time_left, time_right, time_df1, time_df2):
    expr = time_left.asof_join(time_right, time_left.time == time_right.time)[
        time_left, time_right.other_value
    ]
    result = expr.execute()
    expected = pd.merge_asof(time_df1, time_df2, on='time')
    tm.assert_frame_equal(result[expected.columns], expected)
Example #10
    def test_on_specialized_type_by_int(self, any_real_dtype):
        # see gh-13936
        dtype = np.dtype(any_real_dtype).type

        df1 = pd.DataFrame({
            "value": [5, 2, 25, 100, 78, 120, 79],
            "key": [1, 2, 3, 2, 3, 1, 2],
            "symbol": list("ABCDEFG")},
            columns=["symbol", "key", "value"])
        df1.value = dtype(df1.value)

        df2 = pd.DataFrame({
            "value": [0, 80, 120, 125],
            "key": [1, 2, 2, 3],
            "result": list("xyzw")},
            columns=["value", "key", "result"])
        df2.value = dtype(df2.value)

        df1 = df1.sort_values("value").reset_index(drop=True)
        result = pd.merge_asof(df1, df2, on="value", by="key")

        expected = pd.DataFrame({
            "symbol": list("BACEGDF"),
            "key": [2, 1, 3, 3, 2, 2, 1],
            "value": [2, 5, 25, 78, 79, 100, 120],
            "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"]},
            columns=["symbol", "key", "value", "result"])
        expected.value = dtype(expected.value)

        assert_frame_equal(result, expected)
Example #11
    def test_on_float_by_int(self):
        # type specialize both "by" and "on" parameters
        df1 = pd.DataFrame({
            'symbol': list("AAABBBCCC"),
            'exch': [1, 2, 3, 1, 2, 3, 1, 2, 3],
            'price': [3.26, 3.2599, 3.2598, 12.58, 12.59,
                      12.5, 378.15, 378.2, 378.25]},
            columns=['symbol', 'exch', 'price'])

        df2 = pd.DataFrame({
            'exch': [1, 1, 1, 2, 2, 2, 3, 3, 3],
            'price': [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0],
            'mpv': [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0]},
            columns=['exch', 'price', 'mpv'])

        df1 = df1.sort_values('price').reset_index(drop=True)
        df2 = df2.sort_values('price').reset_index(drop=True)

        result = pd.merge_asof(df1, df2, on='price', by='exch')

        expected = pd.DataFrame({
            'symbol': list("AAABBBCCC"),
            'exch': [3, 2, 1, 3, 1, 2, 1, 2, 3],
            'price': [3.2598, 3.2599, 3.26, 12.5, 12.58,
                      12.59, 378.15, 378.2, 378.25],
            'mpv': [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25]},
            columns=['symbol', 'exch', 'price', 'mpv'])

        assert_frame_equal(result, expected)
Example #12
    def test_multi_index(self):

        # MultiIndex is prohibited
        trades = self.trades.set_index(['time', 'price'])
        quotes = self.quotes.set_index('time')
        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       left_index=True,
                       right_index=True)

        trades = self.trades.set_index('time')
        quotes = self.quotes.set_index(['time', 'bid'])
        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       left_index=True,
                       right_index=True)
Example #13
    def test_allow_exact_matches(self):

        result = merge_asof(self.trades, self.quotes,
                            on='time',
                            by='ticker',
                            allow_exact_matches=False)
        expected = self.allow_exact_matches
        assert_frame_equal(result, expected)
Example #14
    def test_with_duplicates(self):

        q = pd.concat([self.quotes, self.quotes]).sort_values(
            ['time', 'ticker']).reset_index(drop=True)
        result = merge_asof(self.trades, q,
                            on='time',
                            by='ticker')
        expected = self.read_data('asof.csv')
        assert_frame_equal(result, expected)
Example #15
    def test_allow_exact_matches_and_tolerance(self):

        result = merge_asof(self.trades, self.quotes,
                            on='time',
                            by='ticker',
                            tolerance=Timedelta('100ms'),
                            allow_exact_matches=False)
        expected = self.allow_exact_matches_and_tolerance
        assert_frame_equal(result, expected)
Example #16
    def test_on_and_index(self):

        # 'on' parameter and index together is prohibited
        trades = self.trades.set_index('time')
        quotes = self.quotes.set_index('time')
        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       left_on='price',
                       left_index=True,
                       right_index=True)

        trades = self.trades.set_index('time')
        quotes = self.quotes.set_index('time')
        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       right_on='bid',
                       left_index=True,
                       right_index=True)
Example #17
    def test_multiby(self):
        # GH13936
        trades = pd.DataFrame({
            'time': pd.to_datetime(['20160525 13:30:00.023',
                                    '20160525 13:30:00.023',
                                    '20160525 13:30:00.046',
                                    '20160525 13:30:00.048',
                                    '20160525 13:30:00.050']),
            'ticker': ['MSFT', 'MSFT',
                       'GOOG', 'GOOG', 'AAPL'],
            'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
            'price': [51.95, 51.95,
                      720.77, 720.92, 98.00],
            'quantity': [75, 155,
                         100, 100, 100]},
            columns=['time', 'ticker', 'exch',
                     'price', 'quantity'])

        quotes = pd.DataFrame({
            'time': pd.to_datetime(['20160525 13:30:00.023',
                                    '20160525 13:30:00.023',
                                    '20160525 13:30:00.030',
                                    '20160525 13:30:00.041',
                                    '20160525 13:30:00.045',
                                    '20160525 13:30:00.049']),
            'ticker': ['GOOG', 'MSFT', 'MSFT',
                       'MSFT', 'GOOG', 'AAPL'],
            'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA',
                     'NSDQ', 'ARCA'],
            'bid': [720.51, 51.95, 51.97, 51.99,
                    720.50, 97.99],
            'ask': [720.92, 51.96, 51.98, 52.00,
                    720.93, 98.01]},
            columns=['time', 'ticker', 'exch', 'bid', 'ask'])

        expected = pd.DataFrame({
            'time': pd.to_datetime(['20160525 13:30:00.023',
                                    '20160525 13:30:00.023',
                                    '20160525 13:30:00.046',
                                    '20160525 13:30:00.048',
                                    '20160525 13:30:00.050']),
            'ticker': ['MSFT', 'MSFT',
                       'GOOG', 'GOOG', 'AAPL'],
            'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
            'price': [51.95, 51.95,
                      720.77, 720.92, 98.00],
            'quantity': [75, 155,
                         100, 100, 100],
            'bid': [np.nan, 51.95, 720.50, 720.51, np.nan],
            'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]},
            columns=['time', 'ticker', 'exch',
                     'price', 'quantity', 'bid', 'ask'])

        result = pd.merge_asof(trades, quotes, on='time',
                               by=['ticker', 'exch'])
        assert_frame_equal(result, expected)
Example #18
    def test_basic(self):

        expected = self.asof
        trades = self.trades
        quotes = self.quotes

        result = merge_asof(trades, quotes,
                            on='time',
                            by='ticker')
        assert_frame_equal(result, expected)
Example #19
    def test_basic2(self):

        expected = self.read_data('asof2.csv')
        trades = self.read_data('trades2.csv')
        quotes = self.read_data('quotes2.csv', dedupe=True)

        result = merge_asof(trades, quotes,
                            on='time',
                            by='ticker')
        assert_frame_equal(result, expected)
Example #20
def test_keyed_asof_join(
    time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2
):
    expr = time_keyed_left.asof_join(time_keyed_right, 'time', by='key')[
        time_keyed_left, time_keyed_right.other_value
    ]
    result = expr.execute()
    expected = pd.merge_asof(
        time_keyed_df1, time_keyed_df2, on='time', by='key'
    )
    tm.assert_frame_equal(result[expected.columns], expected)
Example #21
    def test_tolerance(self):

        trades = self.trades
        quotes = self.quotes

        result = merge_asof(trades, quotes,
                            on='time',
                            by='ticker',
                            tolerance=Timedelta('1day'))
        expected = self.tolerance
        assert_frame_equal(result, expected)
Example #22
    def test_basic_left_index_right_index(self):

        expected = self.asof.set_index('time')
        trades = self.trades.set_index('time')
        quotes = self.quotes.set_index('time')

        result = merge_asof(trades, quotes,
                            left_index=True,
                            right_index=True,
                            by='ticker')
        assert_frame_equal(result, expected)
Example #23
    def test_with_duplicates_no_on(self):

        df1 = pd.DataFrame({'key': [1, 1, 3],
                            'left_val': [1, 2, 3]})
        df2 = pd.DataFrame({'key': [1, 2, 2],
                            'right_val': [1, 2, 3]})
        result = merge_asof(df1, df2, on='key')
        expected = pd.DataFrame({'key': [1, 1, 3],
                                 'left_val': [1, 2, 3],
                                 'right_val': [1, 1, 3]})
        assert_frame_equal(result, expected)
Example #24
    def test_basic_categorical(self):

        expected = self.asof
        trades = self.trades.copy()
        trades.ticker = trades.ticker.astype('category')
        quotes = self.quotes.copy()
        quotes.ticker = quotes.ticker.astype('category')

        result = merge_asof(trades, quotes,
                            on='time',
                            by='ticker')
        assert_frame_equal(result, expected)
Example #25
    def test_index_tolerance(self):
        # GH 15135
        expected = self.tolerance.set_index('time')
        trades = self.trades.set_index('time')
        quotes = self.quotes.set_index('time')

        result = pd.merge_asof(trades, quotes,
                               left_index=True,
                               right_index=True,
                               by='ticker',
                               tolerance=pd.Timedelta('1day'))
        assert_frame_equal(result, expected)
Example #26
    def test_missing_right_by(self):

        expected = self.asof
        trades = self.trades
        quotes = self.quotes

        q = quotes[quotes.ticker != 'MSFT']
        result = merge_asof(trades, q,
                            on='time',
                            by='ticker')
        expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan
        assert_frame_equal(result, expected)
Example #27
    def test_basic_no_by(self):
        f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \
            .reset_index(drop=True)

        # just use a single ticker
        expected = f(self.asof)
        trades = f(self.trades)
        quotes = f(self.quotes)

        result = merge_asof(trades, quotes,
                            on='time')
        assert_frame_equal(result, expected)
Example #28
    def test_basic_left_by_right_by(self):

        # GH14253
        expected = self.asof
        trades = self.trades
        quotes = self.quotes

        result = merge_asof(trades, quotes,
                            on='time',
                            left_by='ticker',
                            right_by='ticker')
        assert_frame_equal(result, expected)
Example #29
    def test_non_sorted(self):

        trades = self.trades.sort_values('time', ascending=False)
        quotes = self.quotes.sort_values('time', ascending=False)

        # we require that both trades & quotes are already sorted on time
        assert not trades.time.is_monotonic
        assert not quotes.time.is_monotonic
        with pytest.raises(ValueError):
            merge_asof(trades, quotes,
                       on='time',
                       by='ticker')

        trades = self.trades.sort_values('time')
        assert trades.time.is_monotonic
        assert not quotes.time.is_monotonic
        with pytest.raises(ValueError):
            merge_asof(trades, quotes,
                       on='time',
                       by='ticker')

        quotes = self.quotes.sort_values('time')
        assert trades.time.is_monotonic
        assert quotes.time.is_monotonic

        # ok, though has dupes
        merge_asof(trades, self.quotes,
                   on='time',
                   by='ticker')
Example #30
    def test_non_sorted(self):

        trades = self.trades.sort_values('time', ascending=False)
        quotes = self.quotes.sort_values('time', ascending=False)

        # we require that both trades & quotes are already sorted on time
        self.assertFalse(trades.time.is_monotonic)
        self.assertFalse(quotes.time.is_monotonic)
        with self.assertRaises(ValueError):
            merge_asof(trades, quotes,
                       on='time',
                       by='ticker')

        trades = self.trades.sort_values('time')
        self.assertTrue(trades.time.is_monotonic)
        self.assertFalse(quotes.time.is_monotonic)
        with self.assertRaises(ValueError):
            merge_asof(trades, quotes,
                       on='time',
                       by='ticker')

        quotes = self.quotes.sort_values('time')
        self.assertTrue(trades.time.is_monotonic)
        self.assertTrue(quotes.time.is_monotonic)

        # ok, though has dupes
        merge_asof(trades, self.quotes,
                   on='time',
                   by='ticker')
Example #31
    def time_on_uint64(self, direction, tolerance):
        merge_asof(self.df1f,
                   self.df2f,
                   on="timeu64",
                   direction=direction,
                   tolerance=tolerance)
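The benchmark above only shows the timed call; self.df1f and self.df2f are defined elsewhere in the benchmark class. Below is a small, self-contained sketch of the parameter combination it exercises (an unsigned-integer 'on' key plus a direction and an integer tolerance), with made-up frames and values, assuming a reasonably recent pandas.

import numpy as np
import pandas as pd

left = pd.DataFrame({'timeu64': np.array([1, 4, 20], dtype='uint64'),
                     'left_val': ['a', 'b', 'c']})
right = pd.DataFrame({'timeu64': np.array([2, 3, 7, 11], dtype='uint64'),
                      'right_val': [20, 30, 70, 110]})

# 'nearest' picks whichever right key is closest; integer keys take an
# integer tolerance, so anything farther than 2 away stays NaN
result = pd.merge_asof(left, right, on='timeu64',
                       direction='nearest', tolerance=2)
#    timeu64 left_val  right_val
# 0        1        a       20.0
# 1        4        b       30.0
# 2       20        c        NaN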
Example #32
def parse_gpsapilog_to_df_v2(filename):
    """Parse GPS API log to Pandas dataframes, by using merge_asof.

    Args:
      filename: full log file name.
        Type, String.

    Returns:
      timestamp_df: Timestamp Data Frame.
        Type, Pandas DataFrame.
      sv_info_df: GNSS SV info Data Frame.
        Type, Pandas DataFrame.
      sv_stat_df: GNSS SV statistic Data Frame.
        Type, Pandas DataFrame
      loc_info_df: Location Information Data Frame.
        Type, Pandas DataFrame.
        Includes Provider, Latitude, Longitude, Altitude, GNSSTime, Speed, and Bearing.
    """
    # Get parsed dataframe list
    parsed_data = parse_log_to_df(
        filename=filename,
        configs=CONFIG_GPSAPILOG,
    )

    # get DUT Timestamp
    timestamp_df = parsed_data['phone_time']
    timestamp_df['phone_time'] = timestamp_df.apply(
        lambda row: datetime.datetime.strptime(row.date + '-' + row.time,
                                               '%Y/%m/%d-%H:%M:%S'),
        axis=1)
    # drop logsize, date, time
    parsed_data['phone_time'] = timestamp_df.drop(['logsize', 'date', 'time'],
                                                  axis=1)

    # Add phone_time from timestamp dataframe by row number
    for key in parsed_data:
        if (key != 'phone_time') and (not parsed_data[key].empty):
            parsed_data[key] = pds.merge_asof(parsed_data[key],
                                              parsed_data['phone_time'],
                                              left_index=True,
                                              right_index=True)

    # Get space vehicle info dataframe
    # Handle the pre GPSTool 2.12.24 case
    if not parsed_data['SpaceVehicle'].empty:
        sv_info_df = parsed_data['SpaceVehicle']

    # Handle the post GPSTool 2.12.24 case with baseband CNo
    elif not parsed_data['SpaceVehicle_wBB'].empty:
        sv_info_df = parsed_data['SpaceVehicle_wBB']

    # Get space vehicle statistics dataframe
    # Handle the pre GPSTool 2.12.24 case
    if not parsed_data['HistoryAvgTop4CNo'].empty:
        # First merge all dataframe from LIST_SVSTAT[1:],
        sv_stat_df = fts.reduce(
            lambda item1, item2: pds.merge(item1, item2, on='phone_time'),
            [parsed_data[key] for key in LIST_SVSTAT[1:]])
        # Then merge with LIST_SVSTAT[0]
        sv_stat_df = pds.merge(sv_stat_df,
                               parsed_data[LIST_SVSTAT[0]],
                               on='phone_time')

    # Handle the post GPSTool 2.12.24 case with baseband CNo
    elif not parsed_data['AntennaHistoryAvgTop4CNo'].empty:
        # First merge all dataframe from LIST_SVSTAT[1:],
        sv_stat_df = fts.reduce(
            lambda item1, item2: pds.merge(item1, item2, on='phone_time'),
            [parsed_data[key] for key in LIST_SVSTAT_WBB[1:]])
        # Then merge with LIST_SVSTAT[0]
        sv_stat_df = pds.merge(sv_stat_df,
                               parsed_data[LIST_SVSTAT_WBB[0]],
                               on='phone_time')

    # Get location fix information dataframe
    # First merge all dataframe from LIST_LOCINFO[1:],
    loc_info_df = fts.reduce(
        lambda item1, item2: pds.merge(item1, item2, on='phone_time'),
        [parsed_data[key] for key in LIST_LOCINFO[1:]])
    # Then merge with LIST_LOCINFO[8]
    loc_info_df = pds.merge(loc_info_df,
                            parsed_data[LIST_LOCINFO[0]],
                            on='phone_time')
    # Convert GNSS Time
    loc_info_df['gnsstime'] = loc_info_df.apply(
        lambda row: datetime.datetime.strptime(row.Date + '-' + row.Time,
                                               '%Y/%m/%d-%H:%M:%S'),
        axis=1)

    # Data Conversion
    timestamp_df['logsize'] = timestamp_df['logsize'].astype(int)

    sv_info_df['SV'] = sv_info_df['SV'].astype(int)
    sv_info_df['Elevation'] = sv_info_df['Elevation'].astype(float)
    sv_info_df['Azimuth'] = sv_info_df['Azimuth'].astype(float)
    sv_info_df['Frequency'] = sv_info_df['Frequency'].astype(float)

    if 'CNo' in list(sv_info_df.columns):
        sv_info_df['CNo'] = sv_info_df['CNo'].astype(float)
        sv_info_df['AntCNo'] = sv_info_df['CNo']
    elif 'AntCNo' in list(sv_info_df.columns):
        sv_info_df['AntCNo'] = sv_info_df['AntCNo'].astype(float)
        sv_info_df['BbCNo'] = sv_info_df['BbCNo'].astype(float)

    if 'CurrentAvgTop4CNo' in list(sv_stat_df.columns):
        sv_stat_df['CurrentAvgTop4CNo'] = sv_stat_df[
            'CurrentAvgTop4CNo'].astype(float)
        sv_stat_df['CurrentAvgCNo'] = sv_stat_df['CurrentAvgCNo'].astype(float)
        sv_stat_df['HistoryAvgTop4CNo'] = sv_stat_df[
            'HistoryAvgTop4CNo'].astype(float)
        sv_stat_df['HistoryAvgCNo'] = sv_stat_df['HistoryAvgCNo'].astype(float)
        sv_stat_df['AntennaCurrentAvgTop4CNo'] = sv_stat_df[
            'CurrentAvgTop4CNo']
        sv_stat_df['AntennaCurrentAvgCNo'] = sv_stat_df['CurrentAvgCNo']
        sv_stat_df['AntennaHistoryAvgTop4CNo'] = sv_stat_df[
            'HistoryAvgTop4CNo']
        sv_stat_df['AntennaHistoryAvgCNo'] = sv_stat_df['HistoryAvgCNo']
        sv_stat_df['BasebandCurrentAvgTop4CNo'] = npy.nan
        sv_stat_df['BasebandCurrentAvgCNo'] = npy.nan
        sv_stat_df['BasebandHistoryAvgTop4CNo'] = npy.nan
        sv_stat_df['BasebandHistoryAvgCNo'] = npy.nan

    elif 'AntennaCurrentAvgTop4CNo' in list(sv_stat_df.columns):
        sv_stat_df['AntennaCurrentAvgTop4CNo'] = sv_stat_df[
            'AntennaCurrentAvgTop4CNo'].astype(float)
        sv_stat_df['AntennaCurrentAvgCNo'] = sv_stat_df[
            'AntennaCurrentAvgCNo'].astype(float)
        sv_stat_df['AntennaHistoryAvgTop4CNo'] = sv_stat_df[
            'AntennaHistoryAvgTop4CNo'].astype(float)
        sv_stat_df['AntennaHistoryAvgCNo'] = sv_stat_df[
            'AntennaHistoryAvgCNo'].astype(float)
        sv_stat_df['BasebandCurrentAvgTop4CNo'] = sv_stat_df[
            'BasebandCurrentAvgTop4CNo'].astype(float)
        sv_stat_df['BasebandCurrentAvgCNo'] = sv_stat_df[
            'BasebandCurrentAvgCNo'].astype(float)
        sv_stat_df['BasebandHistoryAvgTop4CNo'] = sv_stat_df[
            'BasebandHistoryAvgTop4CNo'].astype(float)
        sv_stat_df['BasebandHistoryAvgCNo'] = sv_stat_df[
            'BasebandHistoryAvgCNo'].astype(float)

    sv_stat_df['L5EngagingRate'] = sv_stat_df['L5EngagingRate'].astype(float)

    loc_info_df['Latitude'] = loc_info_df['Latitude'].astype(float)
    loc_info_df['Longitude'] = loc_info_df['Longitude'].astype(float)
    loc_info_df['Altitude'] = loc_info_df['Altitude'].astype(float)
    loc_info_df['Speed'] = loc_info_df['Speed'].astype(float)
    loc_info_df['Bearing'] = loc_info_df['Bearing'].astype(float)

    return timestamp_df, sv_info_df, sv_stat_df, loc_info_df
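As a small illustration of the index-based merge_asof used in parse_gpsapilog_to_df_v2 above: each parsed record row picks up the most recent phone_time row at or before its position in the log. The frames below are hypothetical stand-ins, with the index playing the role of the original log line number.

import pandas as pd

phone_time = pd.DataFrame(
    {'phone_time': pd.to_datetime(['2021-01-01 00:00:00',
                                   '2021-01-01 00:00:05'])},
    index=[0, 10])
records = pd.DataFrame({'field': ['a', 'b', 'c']}, index=[3, 9, 12])

# backward asof-match on the integer index: each record gets the last
# timestamp whose line number is <= its own
tagged = pd.merge_asof(records, phone_time,
                       left_index=True, right_index=True)
#    field          phone_time
# 3      a 2021-01-01 00:00:00
# 9      b 2021-01-01 00:00:00
# 12     c 2021-01-01 00:00:05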
Example #33
def calc_estimates():
    #calculate signals from earnings estimate data (IBES), including earnings surprises, fwd P/E, change in analyst coverage, and fwd EPS growth
    ibes = get_ibes_summary()
    #keep estimates for all fwd financial periods which will be used later to calculate fwd P/E ratios
    all_fpi = ibes.loc[ibes.fpi.isin(
        ['0', '1', '2', '3', '4', '6', '7', '8', '9'])]
    #lag by a month to align with t+1 return
    all_fpi['date'] = pd.to_datetime(
        all_fpi['statpers']) + DateOffset(months=1)
    #reshape to create columns for each fwd financial period, indexed by company/date
    all_fpi = all_fpi[['ticker', 'date', 'fpi',
                       'meanest']].pivot_table(index=['ticker', 'date'],
                                               columns=['fpi'],
                                               values='meanest',
                                               aggfunc=np.sum)
    all_fpi = all_fpi.sort_values(by=['date', 'ticker'])
    #rename columns
    replace_dict = dict(
        zip([str(i) for i in range(5)] + [str(i) for i in range(6, 10)],
            ['ltg'] + ['ann' + str(i) for i in range(1, 5)] +
            ['qtr' + str(i) for i in range(1, 5)]))
    all_fpi.columns = [replace_dict[x] for x in all_fpi.columns]
    #move ticker/date out of the index so merge_asof can use them as keys later
    all_fpi = all_fpi.reset_index()
    #subset ibes data to 1 qtr fwd EPS estimate and select the most recent estimate before the earnings date
    ibes = ibes.loc[(ibes.measure == 'EPS') & (ibes.fpi == '6')]
    ibes['fpedats'] = pd.to_datetime(ibes.fpedats)
    ibes = ibes.sort_values(by=['statpers', 'ticker'])
    ibes = ibes.groupby(['fpedats', 'ticker'])[[
        'statpers', 'meanest', 'medest', 'stdev', 'numest'
    ]].last().reset_index()
    #merge on the actual earnings release figures
    actuals = get_ibes_actual()
    actuals = actuals.rename(columns={'value': 'actual'})
    actuals = actuals.loc[(actuals.measure == 'EPS')
                          & (actuals.pdicity == 'QTR')]
    actuals['fpedats'] = pd.to_datetime(actuals.pends)
    ibes = ibes.merge(actuals[['fpedats', 'ticker', 'actual', 'anndats']],
                      how='left',
                      on=['fpedats', 'ticker'])
    #set the date index to the earnings release date and lag by 2 business days + 1 month to align with t+1 return
    ibes['anndats'] = pd.to_datetime(ibes['anndats'])
    ibes['date'] = ibes['anndats'] + BDay(2) + MonthBegin(1)
    ibes = ibes.loc[pd.notnull(ibes.anndats)]
    ibes = ibes.sort_values(by=['date', 'ticker']).drop_duplicates(
        subset=['ticker', 'date'])
    #calculate standardized unexpected earnings (SUE) and earnings surprise z-score
    ibes['sue'] = (ibes['actual'] - ibes['meanest']) / ibes['stdev']
    ibes['sue'] = ibes['sue'].where(ibes.stdev != 0, np.nan)
    ibes['surprise'] = ibes['actual'] - ibes['meanest']
    ibes['surprise_z'] = ibes.groupby('ticker')['surprise'].rolling(
        8, min_periods=6).apply(lambda x: (x[-1] - x.mean()) / x.std(),
                                raw=True).reset_index(0, drop=True)
    #calculate change in analyst coverage
    ibes['numest_lag'] = ibes.groupby('ticker')['numest'].shift(1)
    ibes['chanalyst'] = ibes['numest'] - ibes['numest_lag']
    ibes['pchanalyst'] = ibes['numest'] / ibes['numest_lag'] - 1
    #characteristics to output
    cols = ['sue', 'surprise_z', 'chanalyst', 'pchanalyst']
    #get crsp file, merge with IBES linking table, and merge characteristics
    crsp = get_crsp_m()
    crsp = crsp.sort_values(by=['date', 'permno'])
    link = get_crspibes_link()
    crsp = crsp.merge(link[['permno', 'ticker', 'sdate', 'edate']],
                      how='left',
                      on='permno')
    crsp['date'] = pd.to_datetime(crsp.date)
    crsp['ticker'] = crsp.ticker.where(
        ~((crsp.date < crsp.sdate) | (crsp.date > crsp.edate)), np.nan)
    crsp['date'] = pd.to_datetime(crsp.date) - MonthBegin(1)
    crsp = crsp.dropna(subset=['ticker']).drop_duplicates(
        subset=['permno', 'date'])
    crsp = crsp.merge(ibes[['ticker', 'date'] + cols],
                      how='left',
                      on=['ticker', 'date'])
    #merge all fwd earning period estimates onto crsp data
    crsp = pd.merge_asof(crsp, all_fpi, on='date', by=['ticker'])
    #fwd earnings yield at various horizons
    crsp['prc_lag'] = crsp.groupby('permno')['prc'].shift(1)
    crsp['prc_lag'] = crsp.prc_lag.abs()
    crsp['pe0'] = crsp['ann1'] / crsp['prc_lag']
    crsp['pe1'] = crsp['ann2'] / crsp['prc_lag']
    crsp['pe2'] = crsp['ann3'] / crsp['prc_lag']
    crsp['pe3'] = crsp['ann4'] / crsp['prc_lag']
    crsp['pe4'] = crsp['qtr1'] / crsp['prc_lag']
    crsp['pe5'] = crsp[['qtr1', 'qtr2', 'qtr3', 'qtr4'
                        ]].sum(axis=1) / crsp['prc_lag']
    #add to list of characteristics to output
    cols += ['pe' + str(x) for x in range(6)] + ['ltg']
    output_characteristics(crsp, cols)
    with open(data_output_path + 'characteristics_output_log.txt', 'a') as f:
        f.write('estimates\n')
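A compact sketch of the crsp/all_fpi asof-merge performed above, with invented tickers, dates, and estimate values: for every crsp row, the most recent estimate row on or before that date is pulled in, per ticker.

import pandas as pd

estimates = pd.DataFrame({
    'date': pd.to_datetime(['2020-01-01', '2020-02-01', '2020-04-01']),
    'ticker': ['IBM', 'MSFT', 'IBM'],
    'ann1': [6.10, 5.40, 6.25]}).sort_values('date').reset_index(drop=True)

crsp = pd.DataFrame({
    'date': pd.to_datetime(['2020-02-01', '2020-03-01', '2020-05-01']),
    'ticker': ['IBM', 'MSFT', 'IBM'],
    'prc': [130.0, 160.0, 135.0]}).sort_values('date').reset_index(drop=True)

# both frames are sorted on 'date'; matching is backward within each ticker
panel = pd.merge_asof(crsp, estimates, on='date', by='ticker')
#         date ticker    prc  ann1
# 0 2020-02-01    IBM  130.0  6.10   (January IBM estimate carried forward)
# 1 2020-03-01   MSFT  160.0  5.40
# 2 2020-05-01    IBM  135.0  6.25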