Exemplo n.º 1
0
Arquivo: manip.py Projeto: yz-/ut
def hetero_concat(df_list):
    """
    Concatenates (row-wise) a list of dataframes whose column sets may differ.

    Starting from the first dataframe, each subsequent dataframe is aligned with the running result: add_nan_cols is
    called on each side with the columns it lacks (as computed by colloc.setdiff), the incoming dataframe is
    reordered to the running result's column order, and the two are concatenated with pd.concat.
    The concatenated frame is finally passed through replace_nans_with_spaces_in_object_columns.

    NOTE(review): the return value of add_nan_cols is discarded, so it presumably modifies its argument in place
    (which would mean the dataframes in df_list are mutated) -- confirm against its definition.

    :param df_list: non-empty list of dataframes (df_list[0] is accessed unconditionally)
    :return: whatever replace_nans_with_spaces_in_object_columns returns for the concatenated dataframe
    """
    combined = df_list[0]
    for other in df_list[1:]:
        # columns the running result lacks
        only_in_other = colloc.setdiff(other.columns, combined.columns)
        add_nan_cols(combined, only_in_other)
        # columns the incoming frame lacks (computed after the running result was extended)
        only_in_combined = colloc.setdiff(combined.columns, other.columns)
        add_nan_cols(other, only_in_combined)
        # align the column order before stacking
        aligned = other[combined.columns]
        combined = pd.concat([combined, aligned])
    return replace_nans_with_spaces_in_object_columns(combined)
Exemplo n.º 2
0
def hetero_concat(df_list):
    """
    Stacks a list of dataframes with heterogeneous columns into a single dataframe.

    The first dataframe seeds the result. For every following dataframe, add_nan_cols is called on the result with
    the columns only the newcomer has, then on the newcomer with the columns only the result has (both differences
    come from colloc.setdiff). The newcomer is then put in the result's column order and appended with pd.concat.
    The final frame is returned through replace_nans_with_spaces_in_object_columns.

    NOTE(review): add_nan_cols' return value is ignored, so it is presumably an in-place operation on the
    dataframe it receives -- verify, since that would mutate the caller's dataframes.

    :param df_list: list of dataframes; must contain at least one element
    :return: the output of replace_nans_with_spaces_in_object_columns applied to the stacked dataframe
    """
    result = df_list[0]
    remaining = df_list[1:]
    for frame in remaining:
        missing_in_result = colloc.setdiff(frame.columns, result.columns)
        add_nan_cols(result, missing_in_result)
        missing_in_frame = colloc.setdiff(result.columns, frame.columns)
        add_nan_cols(frame, missing_in_frame)
        frame = frame[result.columns]
        result = pd.concat([result, frame])
    return replace_nans_with_spaces_in_object_columns(result)
Exemplo n.º 3
0
Arquivo: pot.py Projeto: yz-/ut
 def from_count_df_to_count(cls, count_df, count_col='pval'):
     """
     Creates a potential from a dataframe specifying point counts (the name of the count column is given by
     count_col).

     All columns other than count_col are treated as the potential's variables; rows sharing the same variable
     values have their counts summed. The summed table is passed through ch_col_names(tb, 'pval', count_col)
     (presumably renaming the count column to 'pval' -- confirm against ch_col_names) and wrapped in a Pot.

     :param count_df: dataframe holding the variable columns and the count column
     :param count_col: name of the column holding the counts
     :return: Pot built from the aggregated table
     """
     pot_vars = list(colloc.setdiff(count_df.columns, [count_col]))
     # keep only the variables and the count column, then total the counts per distinct variable assignment
     selected = count_df[pot_vars + [count_col]]
     summed = selected.groupby(pot_vars).sum()
     tb = ch_col_names(summed.reset_index(), 'pval', count_col)
     return Pot(tb)
Exemplo n.º 4
0
def semantics_term_stats_maker_mk_terms_df(df,
                                           text_cols,
                                           id_cols=None,
                                           tokenizer_re=tokenizer_re):
    """
    Builds a "terms" dataframe: one row per token found in the text columns, alongside the id columns.

    For every column in text_cols, each cell is tokenized with re.findall(tokenizer_re, cell), the resulting token
    lists are put in a 'term' column next to the id columns, and that column is rolled out (one token per row) with
    daf_manip.rollout_cols. The per-text-column results are stacked on top of each other.

    :param df: input dataframe
    :param text_cols: name (or list of names) of the columns holding the text to tokenize
    :param id_cols: name (or list of names) of the columns identifying a row. If None, all columns that are not in
        text_cols are used. Requested id columns that are not columns of df are pulled out of the index with
        df.reset_index
    :param tokenizer_re: regular expression handed to re.findall to extract the terms
    :return: dataframe with the id columns and a 'term' column (an empty dataframe if text_cols is empty)
    """
    text_cols = util_ulist.ascertain_list(text_cols)
    if id_cols is None:
        id_cols = colloc.setdiff(df.columns, text_cols)
    else:
        id_cols = util_ulist.ascertain_list(id_cols)
        id_cols_missing = colloc.setdiff(id_cols, df.columns)
        # if any columns are missing, try to get them from named index
        # (len() instead of truthiness: truth-testing an array-like with several elements raises ValueError)
        if len(id_cols_missing) > 0:
            df = df.reset_index(id_cols_missing)
    frames = []
    for c in text_cols:
        # explicit copy so that adding the 'term' column never writes into (or warns about) a slice of df
        d = df[id_cols].copy()
        # a list, not map(): under Python 3 map() is a lazy iterator, which cannot be assigned as column values
        d['term'] = [re.findall(tokenizer_re, x) for x in df[c]]
        frames.append(daf_manip.rollout_cols(d, cols_to_rollout='term'))
    if not frames:
        # pd.concat refuses an empty list; the original returned an empty dataframe in this case
        return pd.DataFrame()
    # concatenate once instead of re-copying the accumulated frame on every iteration
    return pd.concat(frames)
Exemplo n.º 5
0
 def from_count_df_to_count(cls, count_df, count_col='pval'):
     """
     Builds a Pot from a dataframe of point counts, where count_col names the column holding the counts.

     Every column except count_col is taken as a variable of the potential. Counts are summed over rows with
     identical variable values, the result goes through ch_col_names(tb, 'pval', count_col) (presumably so the
     count column ends up named 'pval' -- verify against ch_col_names), and the table is wrapped in a Pot.

     :param count_df: dataframe with the variable columns plus the count column
     :param count_col: name of the count column
     :return: Pot constructed from the aggregated counts
     """
     pot_vars = list(colloc.setdiff(count_df.columns, [count_col]))
     wanted_cols = pot_vars + [count_col]
     # sum the counts of rows that agree on all the variables
     grouped = count_df[wanted_cols].groupby(pot_vars)
     tb = grouped.sum().reset_index()
     return Pot(ch_col_names(tb, 'pval', count_col))
Exemplo n.º 6
0
Arquivo: manip.py Projeto: yz-/ut
def gather_col_values(df,
                      cols_to_gather=None,
                      gathered_col_name='gathered_cols',
                      keep_cols_that_were_gathered=False,
                      remove_empty_values=True):
    """
    Gathers, row by row, the values of several columns into a single list-valued column.

    The input dataframe is not modified: the work is done on a copy.

    :param df: input dataframe
    :param cols_to_gather: list-like of the column names whose values should be gathered. If None or empty, all
        columns of df are gathered
    :param gathered_col_name: name of the new column that will hold the lists of gathered values
    :param keep_cols_that_were_gathered: if False, the gathered columns are removed from the result (the columns to
        keep are computed with colloc.setdiff)
    :param remove_empty_values: unless this is False, falsy values (0, '', None, ...) are dropped from each
        gathered list
    :return: a copy of df with the gathered_col_name column added
    """
    # explicit None/empty test: "cols_to_gather or df.columns" raises ValueError when given an Index or array
    if cols_to_gather is None or len(cols_to_gather) == 0:
        cols_to_gather = df.columns
    df = df.copy()
    # itertuples() yields (index, val1, val2, ...): drop the leading index
    gathered = [list(row[1:]) for row in df[cols_to_gather].itertuples()]
    # "== False" is kept on purpose so that non-bool arguments (e.g. None) behave as they always did
    if not (remove_empty_values == False):
        # built as a real list: a lazy Python 3 map() object cannot be assigned as column values
        gathered = [[value for value in row if value] for row in gathered]
    df[gathered_col_name] = gathered
    if keep_cols_that_were_gathered == False:
        df = df[colloc.setdiff(df.columns, cols_to_gather)]
    return df
Exemplo n.º 7
0
def gather_col_values(df,
                      cols_to_gather=None,
                      gathered_col_name='gathered_cols',
                      keep_cols_that_were_gathered=False,
                      remove_empty_values=True):
    """
    Gathers, for each row, the values of the given columns into one list stored in a new column.

    Works on a copy of df, so the caller's dataframe is left untouched.

    :param df: input dataframe
    :param cols_to_gather: list-like of column names to gather (all columns of df if None)
    :param gathered_col_name: name of the column receiving the lists of gathered values
    :param keep_cols_that_were_gathered: if False, only the columns returned by
        colloc.setdiff(df.columns, cols_to_gather) are kept in the result
    :param remove_empty_values: unless this is False, falsy values (0, '', None, ...) are filtered out of each list
    :return: a copy of df with the gathered_col_name column added
    """
    if cols_to_gather is None:
        cols_to_gather = df.columns
    df = df.copy()
    # each tuple from itertuples() starts with the row's index, which is not a gathered value
    rows = [list(x[1:]) for x in df[cols_to_gather].itertuples()]
    # "== False" is kept deliberately: it preserves the historical behavior for non-bool arguments such as None
    if remove_empty_values == False:
        df[gathered_col_name] = rows
    else:
        # list comprehension instead of map(): in Python 3 map() returns a lazy iterator, which pandas cannot use
        # as column values
        df[gathered_col_name] = [[xx for xx in x if xx] for x in rows]
    if keep_cols_that_were_gathered == False:
        df = df[colloc.setdiff(df.columns, cols_to_gather)]
    return df
Exemplo n.º 8
0
def rollout_cols(df, cols_to_rollout=None):
    """
    Rolls out the values of cols_to_rollout so that each individual list (or other iterable) element is on its own
    row, with the other (non-cols_to_rollout) values repeated alongside them as in the original dataframe.

    Example:
    df =
        A   B
        1   [11,111]
        2   [22]
        3   [3,33,333]
    rollout_cols(df, cols_to_rollout='B') =
        A   B
        1   11
        1   111
        2   22
        3   3
        3   33
        3   333

    :param df: input dataframe. It is only read here; the result is a newly constructed dataframe
    :param cols_to_rollout: column name or list of column names to roll out. If falsy (None, empty list, ...), the
        columns are chosen by daf_diagnosis.cols_that_are_of_the_type(df, util_var.is_an_iter)
    :return: a new dataframe, with a fresh 0..N-1 index, holding the columns of df in their original order

    Note: the number of rows produced for each input row is taken from the FIRST column of cols_to_rollout only;
    nothing checks that the other rolled-out columns hold lists of the same lengths.
    """
    # if no cols_to_rollout is given, (try to) rollout all columns that are iterable (lists, etc.)
    # NOTE(review): "or" truth-tests the argument, so an array-like (e.g. a pandas Index) would raise here
    cols_to_rollout = cols_to_rollout or daf_diagnosis.cols_that_are_of_the_type(df, util_var.is_an_iter)
    # make sure cols_to_rollout is a list
    cols_to_rollout = util_ulist.ascertain_list(cols_to_rollout)
    # get non_rollout_columns
    non_rollout_columns = colloc.setdiff(df.columns, cols_to_rollout)
    # mk an array with the lengths of the lists to rollout (taken from the first of cols_to_rollout, on the
    # unchecked assumption that all cols_to_rollout have the same list lengths)
    rollout_lengths = np.array(df[cols_to_rollout[0]].apply(len))
    # create the rollout_df dataframe (this will be the output), pre-sized to the total number of rolled out rows.
    # TODO: building it from a range is a workaround to get a frame of the right length that can then be filled
    #   column by column; the placeholder column this creates is left out by the column selection at the end.
    rollout_df = pd.DataFrame(range(np.sum(rollout_lengths)))
    # rollout cols_to_rollout: flatten each column's lists into one long array
    for c in cols_to_rollout:
        rollout_df[c] = np.concatenate(list(df[c]))
    # repeat the values of the non-rollout columns so that they line up with the rolled out rows
    for c in non_rollout_columns:
        # tile each value y times, y being the rollout length of its row
        t = [np.tile(x, (y, 1)) for (x, y) in zip(df[c], rollout_lengths)]
        try:
            rollout_df[c] = np.concatenate(t)
        except ValueError:
            # fall back to flattening the tiles in pure Python when the concatenation/assignment is rejected
            rollout_df[c] = [x for x in chain(*t)]
    # put the columns in their original order
    return rollout_df[df.columns]
Exemplo n.º 9
0
Arquivo: manip.py Projeto: yz-/ut
def rollout_cols(df, cols_to_rollout=None):
    """
    Rolls out the values of cols_to_rollout so that each individual list (or other iterable) element is on its own
    row, with the other (non-cols_to_rollout) values repeated alongside them as in the original dataframe.

    Example:
    df =
        A   B
        1   [11,111]
        2   [22]
        3   [3,33,333]
    rollout_cols(df, cols_to_rollout='B') =
        A   B
        1   11
        1   111
        2   22
        3   3
        3   33
        3   333

    :param df: input dataframe. It is only read here; the result is a newly constructed dataframe
    :param cols_to_rollout: column name or list of column names to roll out. If falsy (None, empty list, ...), the
        columns are chosen by daf_diagnosis.cols_that_are_of_the_type(df, util_var.is_an_iter)
    :return: a new dataframe, with a fresh 0..N-1 index, holding the columns of df in their original order

    Note: the number of rows produced for each input row is taken from the FIRST column of cols_to_rollout only;
    nothing checks that the other rolled-out columns hold lists of the same lengths.
    """
    # if no cols_to_rollout is given, (try to) rollout all columns that are iterable (lists, etc.)
    # NOTE(review): "or" truth-tests the argument, so an array-like (e.g. a pandas Index) would raise here
    cols_to_rollout = cols_to_rollout or daf_diagnosis.cols_that_are_of_the_type(df, util_var.is_an_iter)
    # make sure cols_to_rollout is a list
    cols_to_rollout = util_ulist.ascertain_list(cols_to_rollout)
    # get non_rollout_columns
    non_rollout_columns = colloc.setdiff(df.columns, cols_to_rollout)
    # mk an array with the lengths of the lists to rollout (taken from the first of cols_to_rollout, on the
    # unchecked assumption that all cols_to_rollout have the same list lengths)
    rollout_lengths = np.array(df[cols_to_rollout[0]].apply(len))
    # create the rollout_df dataframe (this will be the output), pre-sized to the total number of rolled out rows.
    # TODO: building it from a range is a workaround to get a frame of the right length that can then be filled
    #   column by column; the placeholder column this creates is left out by the column selection at the end.
    rollout_df = pd.DataFrame(range(np.sum(rollout_lengths)))
    # rollout cols_to_rollout: flatten each column's lists into one long array
    for c in cols_to_rollout:
        rollout_df[c] = np.concatenate(list(df[c]))
    # repeat the values of the non-rollout columns so that they line up with the rolled out rows
    for c in non_rollout_columns:
        # tile each value y times, y being the rollout length of its row
        t = [np.tile(x, (y, 1)) for (x, y) in zip(df[c], rollout_lengths)]
        try:
            rollout_df[c] = np.concatenate(t)
        except ValueError:
            # fall back to flattening the tiles in pure Python when the concatenation/assignment is rejected
            rollout_df[c] = [x for x in chain(*t)]
    # put the columns in their original order
    return rollout_df[df.columns]
Exemplo n.º 10
0
def rm_cols_if_present(df, cols):
    """
    Returns df restricted to the columns that are not listed in cols.

    The columns to keep are computed with colloc.setdiff(df.columns, cols), so names in cols that df does not have
    do not need to be looked up in df.

    :param df: input dataframe
    :param cols: column name or list of column names to remove (normalized with util_ulist.ascertain_list)
    :return: df indexed by the remaining columns
    """
    cols_to_drop = util_ulist.ascertain_list(cols)
    cols_to_keep = colloc.setdiff(df.columns, cols_to_drop)
    return df[cols_to_keep]
Exemplo n.º 11
0
Arquivo: manip.py Projeto: yz-/ut
def rm_cols_if_present(df, cols):
    """
    Selects from df every column except those named in cols.

    cols is first passed through util_ulist.ascertain_list; the columns that remain are those returned by
    colloc.setdiff(df.columns, cols).

    :param df: input dataframe
    :param cols: column name or list of column names to leave out
    :return: df indexed by the columns that were not removed
    """
    unwanted = util_ulist.ascertain_list(cols)
    remaining = colloc.setdiff(df.columns, unwanted)
    return df[remaining]
Exemplo n.º 12
0
 def vars(self):
     """
     Returns the columns of self.tb other than 'pval', as computed by colloc.setdiff.
     """
     all_columns = list(self.tb.columns)
     return colloc.setdiff(all_columns, ['pval'])
Exemplo n.º 13
0
Arquivo: pot.py Projeto: yz-/ut
 def vars(self):
     """
     Gives the column names of self.tb with 'pval' excluded (the difference is taken by colloc.setdiff).
     """
     excluded = ['pval']
     table_columns = list(self.tb.columns)
     return colloc.setdiff(table_columns, excluded)