Exemplo n.º 1
0
def inner_join(dataframe1, dataframe2, on=None, left_on=None, right_on=None):
    """
    Inner join two DataFrames after validating the join columns.

    :param dataframe1: left DataFrame
    :param dataframe2: right DataFrame
    :param on: column name(s) present in both frames; when given it is used
        for both sides and overrides left_on/right_on
    :param left_on: join column name(s) in dataframe1
    :param right_on: join column name(s) in dataframe2
    :return: result of ``dataframe1.merge(dataframe2, how='inner')`` on the
        resolved columns
    :raises ValueError: if no columns are specified and the frames share no
        column, or if a join column is missing from its frame
    """
    # Identity checks rather than the chained `on == left_on == right_on is None`:
    # the chained form evaluates `on == left_on`, which raises for array-like
    # keys (e.g. a pandas Index) because the comparison is element-wise.
    if on is None and left_on is None and right_on is None:
        print('No columns specified')
        # Follow dataframe1's column order so the inferred keys are
        # deterministic (a set intersection has arbitrary order).
        common_cols = [col for col in dataframe1.columns.unique()
                       if col in dataframe2.columns]
        if not common_cols:
            raise ValueError('No common columns exist')
        print('inner joining on: ', common_cols)
        on = common_cols

    # `on` (given or inferred) applies to both sides
    if on is not None:
        left_on = right_on = on

    left_on = make_list_if_not_list(left_on)
    right_on = make_list_if_not_list(right_on)

    if not all(col in dataframe1.columns for col in left_on):
        raise ValueError('All columns not present in dataframe1')
    if not all(col in dataframe2.columns for col in right_on):
        raise ValueError('All columns not present in dataframe2')

    return dataframe1.merge(dataframe2,
                            left_on=left_on,
                            right_on=right_on,
                            how='inner')
Exemplo n.º 2
0
def unmelt(df, values, columns, index=None, reset_index=True, remove_multiindex=True):

    """
    Pivot a long ("melted") DataFrame back into wide form.

    Uses ``DataFrame.pivot_table`` with ``aggfunc='first'``, so when several
    rows fall into the same index/columns group the first value is kept.

    :param df: DataFrame to pivot
    :param values: column name(s) whose entries fill the pivoted cells
    :param columns: column name(s) whose distinct values become new columns
    :param index: column name(s) identifying the rows of the result; when
        None, every column of df not named in ``values`` or ``columns`` is
        used, in the order it appears in df
    :param reset_index: when True, move the pivot index back into regular
        columns
    :param remove_multiindex: when True, pass the result through
        ``flatten_hier_column_names`` (defined elsewhere; presumably it
        flattens hierarchical column labels -- confirm against that helper)
    :return: the pivoted DataFrame, with the name of its column index set
        to ''
    """

    # if no index is provided use all other columns in DataFrame
    if index is None:
        excluded = set(make_list_if_not_list(columns) +
                       make_list_if_not_list(values))
        # An ordered list instead of a set: pivot_table turns a set into a
        # list in arbitrary order, which made the order of the index
        # columns vary between runs.
        index = [col for col in df.columns if col not in excluded]
    # pivot table to unmelt the dataframe using first item found for each group
    dfpiv = df.pivot_table(values=values, columns=columns, index=index, aggfunc='first')
    if reset_index:
        dfpiv = dfpiv.reset_index()
    if remove_multiindex:
        dfpiv = flatten_hier_column_names(dfpiv)
    # remove column index name in case one was created
    dfpiv.columns.name = ''
    return dfpiv
Exemplo n.º 3
0
def anti_join(dataframe1,
              dataframe2,
              on=None,
              left_on=None,
              right_on=None,
              reset_index=True):
    """
    Return the rows of dataframe1 that have no match in dataframe2.

    Neither input frame is modified.

    :param dataframe1: left DataFrame; the result is a subset of its rows
    :param dataframe2: right DataFrame, used only to look up key matches
    :param on: column name(s) present in both frames; when given it is used
        for both sides and overrides left_on/right_on
    :param left_on: join column name(s) in dataframe1
    :param right_on: join column name(s) in dataframe2
    :param reset_index: when True the result gets a fresh 0..n-1 index; when
        False the result is labelled with each row's position in dataframe1
    :return: DataFrame with the columns of dataframe1 and only the unmatched
        rows, in their original order
    :raises ValueError: if no columns are specified and the frames share no
        column, or if a join column is missing from its frame
    """
    # Identity checks rather than the chained `on == left_on == right_on is None`:
    # the chained form evaluates `on == left_on`, which raises for array-like
    # keys (e.g. a pandas Index) because the comparison is element-wise.
    if on is None and left_on is None and right_on is None:
        print('No columns specified')
        # Follow dataframe1's column order so the inferred keys are
        # deterministic (a set intersection has arbitrary order).
        common_cols = [col for col in dataframe1.columns.unique()
                       if col in dataframe2.columns]
        if not common_cols:
            raise ValueError('No common columns exist')
        print('anti joining on: ', common_cols)
        on = common_cols

    # `on` (given or inferred) applies to both sides
    if on is not None:
        left_on = right_on = on

    left_on = make_list_if_not_list(left_on)
    right_on = make_list_if_not_list(right_on)

    if not all(col in dataframe1.columns for col in left_on):
        raise ValueError('All columns not present in dataframe1')
    if not all(col in dataframe2.columns for col in right_on):
        raise ValueError('All columns not present in dataframe2')

    # Unique right-hand key combinations plus a marker column. The marker is
    # needed because, when joining on every column, the merge adds no other
    # column whose NaN could signal "no match". Building it on a derived
    # frame keeps dataframe2 untouched (the previous version added and then
    # dropped the column on the caller's frame).
    df2_nodups = dataframe2[right_on].drop_duplicates().assign(_jointag_='tag')

    # Work on a re-indexed copy so the caller's dataframe1 keeps its index.
    left = dataframe1.reset_index(drop=True)
    merged = left.merge(df2_nodups,
                        left_on=left_on,
                        right_on=right_on,
                        how='left')
    # The right keys are unique, so the left merge yields exactly one row per
    # row of `left`, in the same order; a missing marker means "no match".
    unmatched = merged['_jointag_'].isnull().values
    result = left[unmatched]

    if len(result) == 0:
        print('Warning: anti join is returning and empty dataframe')
    if reset_index:
        return result.reset_index(drop=True)
    return result
Exemplo n.º 4
0
def left_interlace(dataframe1,
                   dataframe2,
                   interlace_cols,
                   on=None,
                   left_on=None,
                   right_on=None,
                   drop_right_on=False,
                   **kwargs):
    """
    Left join dataframe2 onto dataframe1 and coalesce the shared columns.

    Each column in ``interlace_cols`` is expected to exist in both frames.
    After the merge its value is taken from dataframe1 where that is not
    null and from dataframe2 otherwise, and the two suffixed merge columns
    are removed.

    :param dataframe1: left DataFrame
    :param dataframe2: right DataFrame
    :param interlace_cols: column name(s) to coalesce; must not be join keys
    :param on: column name(s) present in both frames; when given it is used
        for both sides and overrides left_on/right_on
    :param left_on: join column name(s) in dataframe1
    :param right_on: join column name(s) in dataframe2
    :param drop_right_on: accepted for signature parity with left_join; this
        function does not use it
    :param kwargs: forwarded to ``DataFrame.merge``. The coalescing step
        looks up the default ``_x``/``_y`` suffixes, so overriding
        ``suffixes`` here breaks it.
    :return: the merged DataFrame with the interlaced columns appended
    :raises ValueError: if no columns are specified and the frames share no
        non-interlace column, if an interlace column is also a join column,
        or if a join column is missing from its frame
    """

    interlace_cols = make_list_if_not_list(interlace_cols)

    # Identity checks rather than the chained `on == left_on == right_on is None`:
    # the chained form evaluates `on == left_on`, which raises for array-like
    # keys (e.g. a pandas Index) because the comparison is element-wise.
    if on is None and left_on is None and right_on is None:
        print('No columns specified')
        # Shared columns other than the interlace columns, in dataframe1's
        # column order so the inferred keys are deterministic.
        common_cols = [col for col in dataframe1.columns.unique()
                       if col in dataframe2.columns
                       and col not in interlace_cols]
        if not common_cols:
            raise ValueError('No common columns exist')
        print('left joining on: ', common_cols)
        on = common_cols

    # `on` (given or inferred) applies to both sides
    if on is not None:
        left_on = right_on = on

    # Normalise to lists BEFORE the overlap checks: iterating a bare string
    # key would compare its individual characters against interlace_cols.
    left_on = make_list_if_not_list(left_on)
    right_on = make_list_if_not_list(right_on)

    if any(lo in interlace_cols for lo in left_on):
        raise ValueError('interlace columns found in left joining columns')
    if any(ro in interlace_cols for ro in right_on):
        raise ValueError('interlace columns found in right joining columns')

    if not all(col in dataframe1.columns for col in left_on):
        raise ValueError('All columns not present in dataframe1')
    if not all(col in dataframe2.columns for col in right_on):
        raise ValueError('All columns not present in dataframe2')

    # Rebinding the name: the merge returns a new frame, so the caller's
    # dataframe1 is not modified by the column edits below.
    dataframe1 = dataframe1.merge(dataframe2,
                                  left_on=left_on,
                                  right_on=right_on,
                                  how='left',
                                  **kwargs)

    for col in interlace_cols:
        left_col = col + '_x'
        right_col = col + '_y'
        # prefer the left value, fall back to the right one where it is null
        dataframe1[col] = np.where(dataframe1[left_col].notnull(),
                                   dataframe1[left_col],
                                   dataframe1[right_col])
        dataframe1.drop([left_col, right_col], axis=1, inplace=True)

    return dataframe1
Exemplo n.º 5
0
def left_join(dataframe1,
              dataframe2,
              on=None,
              left_on=None,
              right_on=None,
              drop_right_on=False,
              verbosity=1,
              **kwargs):
    """
    Left join dataframe2 onto dataframe1 after validating the join columns.

    :param dataframe1: left DataFrame
    :param dataframe2: right DataFrame
    :param on: column name(s) present in both frames; when given it is used
        for both sides and overrides left_on/right_on
    :param left_on: join column name(s) in dataframe1
    :param right_on: join column name(s) in dataframe2
    :param drop_right_on: when True and the join used distinct
        left_on/right_on, drop from the result the right_on columns that are
        not also in left_on. When the join used ``on`` (given or inferred)
        nothing is dropped and a warning is issued instead.
    :param verbosity: progress messages are printed only when this equals 1
    :param kwargs: forwarded to ``DataFrame.merge``
    :return: the merged DataFrame
    :raises ValueError: if no columns are specified and the frames share no
        column, or if a join column is missing from its frame
    """
    # Identity checks rather than the chained `on == left_on == right_on is None`:
    # the chained form evaluates `on == left_on`, which raises for array-like
    # keys (e.g. a pandas Index) because the comparison is element-wise.
    if on is None and left_on is None and right_on is None:
        if verbosity == 1:
            print('No columns specified')
        # Follow dataframe1's column order so the inferred keys are
        # deterministic (a set intersection has arbitrary order).
        common_cols = [col for col in dataframe1.columns.unique()
                       if col in dataframe2.columns]
        if not common_cols:
            raise ValueError('No common columns exist')
        if verbosity == 1:
            print('left joining on: ', common_cols)
        on = common_cols

    # `on` (given or inferred) applies to both sides
    if on is not None:
        left_on = right_on = on

    left_on = make_list_if_not_list(left_on)
    right_on = make_list_if_not_list(right_on)

    if not all(col in dataframe1.columns for col in left_on):
        raise ValueError('All columns not present in dataframe1')
    if not all(col in dataframe2.columns for col in right_on):
        raise ValueError('All columns not present in dataframe2')

    # Rebinding the name: the merge returns a new frame, so the in-place drop
    # below never touches the caller's dataframe1.
    dataframe1 = dataframe1.merge(dataframe2,
                                  left_on=left_on,
                                  right_on=right_on,
                                  how='left',
                                  **kwargs)

    if drop_right_on:
        if on is not None:
            # the key columns are shared, so there is no separate right-hand
            # copy to remove
            warnings.warn(
                'Can not drop right joining columns if they are all in both DataFrames'
            )
        else:
            # ordered, de-duplicated list instead of a set
            drops = list(dict.fromkeys(
                col for col in right_on if col not in left_on))
            dataframe1.drop(drops, axis=1, inplace=True)

    return dataframe1
Exemplo n.º 6
0
def encode_datetime(df,
                    datetime_col_name,
                    features=['hour', 'day'],
                    suffix='',
                    make_categorical=False,
                    drop_datetime=True,
                    inplace=False):
    """
    Add columns holding components of a datetime column.

    For every name in ``features`` the attribute of that name is read from
    the column's ``.dt`` accessor and stored in a column called
    ``feature + suffix``.

    :param df: DataFrame containing the datetime column
    :param datetime_col_name: name of the datetime column; it is converted
        with ``pd.to_datetime`` when it does not already have a datetime
        dtype
    :param features: name(s) of ``.dt`` accessor attributes to extract. The
        default list is never mutated by this function.
    :param suffix: string appended to each feature name to build the new
        column name
    :param make_categorical: when True, pass the frame and the new column
        names to ``to_categorical`` (defined elsewhere)
    :param drop_datetime: when True, drop the original datetime column
    :param inplace: when True, modify ``df`` itself and return None;
        otherwise work on a deep copy and return it
    :return: the modified copy when ``inplace`` is False, otherwise None
    """

    features = make_list_if_not_list(features)

    # Copy before anything is written, so that the dtype conversion below
    # cannot leak into the caller's frame when inplace=False.
    if not inplace:
        df = df.copy(deep=True)

    # A dtype check works on empty frames (no .iloc[0]) and avoids the
    # private pandas._libs module.
    if not pd.api.types.is_datetime64_any_dtype(df[datetime_col_name]):
        print('Converting to pandas._libs.tslib.Timestamp')
        df[datetime_col_name] = pd.to_datetime(df[datetime_col_name])

    # pandas dt.dayofweek monday = 0 sunday = 6 for some reason
    column_names = [f + suffix for f in features]
    datecol = df[datetime_col_name]

    for feature, column_name in zip(features, column_names):
        df[column_name] = getattr(datecol.dt, feature)

    if drop_datetime:
        df.drop(datetime_col_name, axis=1, inplace=True)
    if make_categorical:
        # Pass the names of the columns that were actually created; with a
        # non-empty suffix the bare feature names are not columns of df.
        # NOTE(review): if to_categorical returns a new frame rather than
        # editing df, this step is lost to the caller when inplace=True --
        # confirm against that helper.
        df = to_categorical(df, column_names)

    if not inplace:
        return df
Exemplo n.º 7
0
def semi_join(dataframe1, dataframe2, on=None, left_on=None, right_on=None):
    """
    Return the rows of dataframe1 that have a match in dataframe2.

    Only the columns of dataframe1 are returned and a row of dataframe1 is
    never duplicated, however many rows of dataframe2 it matches. Neither
    input frame is modified.

    :param dataframe1: left DataFrame; the result is a subset of its rows
    :param dataframe2: right DataFrame, used only to look up key matches
    :param on: column name(s) present in both frames; when given it is used
        for both sides and overrides left_on/right_on
    :param left_on: join column name(s) in dataframe1
    :param right_on: join column name(s) in dataframe2
    :return: DataFrame with the columns of dataframe1 and only the matched
        rows; the merge gives it a fresh index
    :raises ValueError: if no columns are specified and the frames share no
        column, or if a join column is missing from its frame
    """

    # TODO: seems like a simple .isin() is faster? Test this out and change if necessary.
    # Identity checks rather than the chained `on == left_on == right_on is None`:
    # the chained form evaluates `on == left_on`, which raises for array-like
    # keys (e.g. a pandas Index) because the comparison is element-wise.
    if on is None and left_on is None and right_on is None:
        print('No columns specified')
        # Follow dataframe1's column order so the inferred keys are
        # deterministic (a set intersection has arbitrary order).
        common_cols = [col for col in dataframe1.columns.unique()
                       if col in dataframe2.columns]
        if not common_cols:
            raise ValueError('No common columns exist')
        print('semi joining on: ', common_cols)
        on = common_cols

    # `on` (given or inferred) applies to both sides
    if on is not None:
        left_on = right_on = on

    left_on = make_list_if_not_list(left_on)
    right_on = make_list_if_not_list(right_on)

    if not all(col in dataframe1.columns for col in left_on):
        raise ValueError('All columns not present in dataframe1')
    if not all(col in dataframe2.columns for col in right_on):
        raise ValueError('All columns not present in dataframe2')

    # Unique right-hand key combinations only: no duplicates, so the merge
    # cannot add rows, and no other columns in case dataframe2 is very wide.
    # An inner join needs no marker column, so nothing is written to
    # dataframe2 (the previous version left a '_jointag_' column on it).
    right_keys = dataframe2[right_on].drop_duplicates()
    # merge resets index
    merged = dataframe1.merge(right_keys,
                              left_on=left_on,
                              right_on=right_on,
                              how='inner')
    if len(merged) == 0:
        print('Warning: semi join is returning and empty dataframe')
    return merged[dataframe1.columns]
Exemplo n.º 8
0
def append_to_column_name(df, cols, string_to_append, before=True):
    """
    Rename columns of df in place by adding a string to their names.

    :param df: DataFrame whose columns are renamed (modified in place)
    :param cols: column name(s) or integer column position(s) to rename
    :param string_to_append: text to add to each selected column name
    :param before: when True the text is put in front of the name,
        otherwise after it
    :return: None
    :raises ValueError: if cols is neither all strings nor all integers
    """

    targets = make_list_if_not_list(cols)

    # resolve the selection to the current column labels
    if all(isinstance(item, str) for item in targets):
        selected = df.columns[df.columns.isin(targets)]
    elif all(isinstance(item, int) for item in targets):
        selected = df.columns[targets]
    else:
        raise ValueError('cols should either be a list of indices or a list of column names')

    # map every selected label to its new name
    mapping = {}
    for label in selected:
        if before:
            mapping[label] = string_to_append + label
        else:
            mapping[label] = label + string_to_append

    df.rename(columns=mapping, inplace=True)