def semi_join(left, right=None, on=None):
    """Return the rows of ``left`` that have at least one match in ``right``.

    Only columns of ``left`` are returned, and each row of ``left`` appears at
    most once regardless of how many rows of ``right`` it matches. Rows are
    selected with a boolean mask, so the order and index labels of ``left``
    are kept.

    Args:
        left: DataFrame to filter.
        right: DataFrame to look for matches in. Required in practice; the
            ``None`` default is only a placeholder in the signature.
        on: How to match rows. One of:
            * ``None`` - use every column name shared by both tables.
            * ``str`` - a single column name present in both tables.
            * sequence of names - several columns present in both tables.
            * ``Mapping`` - ``{left_column: right_column}`` pairs, for keys
              that are named differently in the two tables.

    Returns:
        The subset of ``left`` whose key values occur in ``right``.

    Raises:
        ValueError: if ``on`` is ``None`` and the tables share no column names.
    """
    if isinstance(on, Mapping):
        # coerce colnames to lists, to avoid indexing with tuples
        on_cols, right_on = map(list, zip(*on.items()))
        # Relabel the right-hand key columns *positionally* with the left-hand
        # names. (A positional dict passed to ``rename`` would relabel the
        # index rather than the columns.)
        right = right[right_on].set_axis(on_cols, axis=1)
    elif on is None:
        # Keep the order of left's columns so the result is deterministic and
        # the keys can be indexed (a set can do neither).
        on_cols = [col for col in left.columns if col in right.columns]
        if not on_cols:
            raise ValueError(
                "No joining column specified, and no shared column names"
            )
    elif isinstance(on, str):
        on_cols = [on]
    else:
        # a tuple would be interpreted as a single (multi-level) column label
        on_cols = list(on)

    # get our semi join on ----
    if len(on_cols) == 1:
        # Single key: plain membership test, no merge needed.
        col_name = on_cols[0]
        return left.loc[left[col_name].isin(right[col_name])]

    # Several keys: left-merge the key columns against the de-duplicated keys
    # of right. With unique right-hand keys the merge yields exactly one
    # output row per row of left, in left's order, so the indicator column can
    # be used directly as a positional mask. Uses public pandas API only.
    right_keys = right[on_cols].drop_duplicates()
    flagged = left[on_cols].merge(
        right_keys, how="left", on=on_cols, indicator=True
    )
    matched = (flagged["_merge"] == "both").to_numpy()
    return left.loc[matched]
def anti_join(left, right=None, on=None):
    """Return the left table with every row that would *not* be kept in an inner join.

    Only columns of ``left`` are returned. Rows are selected positionally
    with a boolean mask, so the order and index labels of ``left`` are kept.

    Args:
        left: DataFrame to filter.
        right: DataFrame to look for matches in. Required in practice; the
            ``None`` default is only a placeholder in the signature.
        on: How to match rows. One of:
            * ``None`` - use every column name shared by both tables.
            * ``str`` - a single column name present in both tables.
            * sequence of names - several columns present in both tables.
            * ``Mapping`` - ``{left_column: right_column}`` pairs, for keys
              that are named differently in the two tables.

    Returns:
        The subset of ``left`` whose key values do not occur in ``right``.

    Raises:
        ValueError: if ``on`` is ``None`` and the tables share no column names.
    """
    # Resolve ``on`` into parallel lists of left / right key column names.
    if isinstance(on, Mapping):
        # coerce colnames to lists, to avoid indexing with tuples
        left_on, right_on = map(list, zip(*on.items()))
    elif on is None:
        shared = [col for col in left.columns if col in right.columns]
        if not shared:
            raise ValueError(
                "No joining column specified, and no shared column names"
            )
        left_on = right_on = shared
    elif isinstance(on, str):
        left_on = right_on = [on]
    else:
        left_on = right_on = list(on)

    # De-duplicate the right-hand keys and relabel them positionally with the
    # left-hand names, so the merge below produces exactly one output row per
    # row of left, in left's order.
    right_keys = right[right_on].drop_duplicates().set_axis(left_on, axis=1)

    # Left-merge only the key columns; the indicator column says whether each
    # left row found a partner. Uses public pandas API only.
    flagged = left[left_on].merge(
        right_keys, how="left", on=left_on, indicator=True
    )

    # keep the rows of left that found no partner in right
    unmatched = (flagged["_merge"] == "left_only").to_numpy()
    return left.iloc[unmatched, :]