def _split_symbol_mappings(df):
    """Separate the symbol: sid mappings from the per-asset information.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe with multiple rows for each symbol: sid pair.

    Returns
    -------
    asset_info : pd.DataFrame
        The asset info with one row per asset.
    symbol_mappings : pd.DataFrame
        The dataframe of just symbol: sid mappings. The index will be
        the sid, then there will be three columns: symbol, start_date, and
        end_date.

    Raises
    ------
    ValueError
        Raised for the first symbol (in order of appearance) whose
        ``(start_date, end_date)`` pairs produce at least one result from
        ``intersecting_ranges``.
    """
    mappings = df[list(mapping_columns)]

    for ticker in mappings.symbol.unique():
        held = mappings[mappings.symbol == ticker]
        ownership = map(from_tuple, zip(held.start_date, held.end_date))
        overlaps = list(intersecting_ranges(ownership))
        if not overlaps:
            # nothing overlapping for this symbol; move on to the next one
            continue

        formatted = [_format_range(overlap) for overlap in overlaps]
        raise ValueError(
            'Ambiguous ownership of %r, multiple companies held this'
            ' ticker over the following ranges:\n%s' % (ticker, formatted),
        )

    # one row per group of the first index level, validated by
    # ``_check_asset_group``
    asset_info = df.groupby(level=0).apply(_check_asset_group)
    symbol_mappings = df[list(mapping_columns)]
    return asset_info, symbol_mappings
def _split_symbol_mappings(df):
    """Split out the symbol: sid mappings from the raw data.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe with multiple rows for each symbol: sid pair.

    Returns
    -------
    asset_info : pd.DataFrame
        The asset info with one row per asset.
    symbol_mappings : pd.DataFrame
        The dataframe of just symbol: sid mappings. The index will be
        the sid, then there will be three columns: symbol, start_date, and
        end_date.

    Raises
    ------
    ValueError
        If, for any symbol, ``intersecting_ranges`` yields at least one
        result for that symbol's ``(start_date, end_date)`` pairs. Every
        offending symbol is collected first and they are all reported in a
        single message, ordered by symbol.
    """
    mappings = df[list(mapping_columns)]

    # symbol -> (intersections, pre-rendered start/end date table)
    ambiguous = {}
    for symbol in mappings.symbol.unique():
        persymbol = mappings[mappings.symbol == symbol]
        intersections = list(intersecting_ranges(map(
            from_tuple,
            zip(persymbol.start_date, persymbol.end_date),
        )))
        if not intersections:
            continue

        dates = persymbol[
            ['start_date', 'end_date']
        ].astype('datetime64[ns]')
        # Render (and indent) the table right away instead of storing the
        # frame and calling ``str`` on it at raise time: ``persymbol`` is a
        # selection out of ``mappings`` and, per a note elsewhere in this
        # file, ``astype`` does not copy the index correctly in
        # pandas 0.22, so the text is captured while it is known good.
        ambiguous[symbol] = (
            intersections,
            '\n '.join(str(dates).splitlines()),
        )

    if ambiguous:
        # Distinct loop names are used below so that nothing rebinds the
        # ``df`` parameter, which is still needed for the return value.
        details = '\n'.join(
            '%s:\n intersections: %s\n %s' % (
                sym,
                tuple(map(_format_range, ranges)),
                table_text,
            )
            for sym, (ranges, table_text) in sorted(
                ambiguous.items(),
                key=first,
            )
        )
        raise ValueError(
            'Ambiguous ownership for %d symbol%s, multiple assets held the'
            ' following symbols:\n%s' % (
                len(ambiguous),
                '' if len(ambiguous) == 1 else 's',
                details,
            )
        )

    return (
        df.groupby(level=0).apply(_check_asset_group),
        df[list(mapping_columns)],
    )
def check_intersections(persymbol):
    """Record ``persymbol`` in ``ambigious`` if its date ranges intersect.

    Parameters
    ----------
    persymbol : pd.DataFrame
        Rows exposing ``start_date`` and ``end_date`` columns and a
        ``name`` attribute (presumably the group key assigned by a pandas
        groupby -- confirm against the caller).

    Notes
    -----
    Nothing is returned. When ``intersecting_ranges`` yields any result,
    ``ambigious[persymbol.name]`` is set to a tuple of the intersections
    and the indented text of the start/end date table. ``ambigious`` is
    not defined here; it is expected to be a mapping supplied by the
    surrounding scope.
    """
    ownership = map(from_tuple, zip(persymbol.start_date, persymbol.end_date))
    overlaps = list(intersecting_ranges(ownership))
    if not overlaps:
        return

    dates = persymbol[['start_date', 'end_date']].astype('datetime64[ns]')

    # indent the dataframe string, also compute this early because
    # ``persymbol`` is a view and ``astype`` doesn't copy the index
    # correctly in pandas 0.22
    rendered = '\n '.join(str(dates).splitlines())
    ambigious[persymbol.name] = overlaps, rendered