def mk_terms_df(df, text_cols, id_cols=None, tokenizer_re=tokenizer_re):
    text_cols = util_ulist.ascertain_list(text_cols)
    if id_cols is None:
        id_cols = colloc.setdiff(df.columns, text_cols)
    else:
        id_cols = util_ulist.ascertain_list(id_cols)
        id_cols_missing = colloc.setdiff(id_cols, df.columns)
        if id_cols_missing:  # if any id_cols are missing, try to get them from the named index
            df = df.reset_index(id_cols_missing)
    dd = pd.DataFrame()
    for c in text_cols:
        d = df[id_cols].copy()  # copy to avoid writing into a view of df
        d['term'] = map(lambda x: re.findall(tokenizer_re, x), df[c])
        d = daf_manip.rollout_cols(d, cols_to_rollout='term')
        dd = pd.concat([dd, d])
    return dd
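# A minimal usage sketch for mk_terms_df (the data below is made up; it assumes
# util_ulist.ascertain_list wraps a lone string into a list and that the default
# tokenizer_re matches word tokens):
#   >>> df = pd.DataFrame({'doc_id': [1, 2], 'text': ['foo bar', 'baz']})
#   >>> mk_terms_df(df, text_cols='text', id_cols='doc_id')
#      doc_id term
#   0       1  foo
#   1       1  bar
#   2       2  baz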
def __init__(self, obj):
    # copy all of obj's (instance) attributes over to self
    for k, v in obj.__dict__.iteritems():
        setattr(self, k, v)
    if not hasattr(self, 'data_dependencies'):
        self.data_dependencies = dict()
    if not hasattr(self, 'data_makers'):
        self.data_makers = dict()
    if not hasattr(self, 'data_storers'):
        self.data_storers = dict()
    # make sure values of data_dependencies are lists
    self.data_dependencies = {k: util_ulist.ascertain_list(v)
                              for k, v in self.data_dependencies.iteritems()}
    # default the data_maker of every dependency according to what the same-named attribute is
    for k in self.data_dependencies.keys():
        k_attr = getattr(self, k)
        if isinstance(k_attr, pd.HDFStore):  # if k_attr is a store
            self.data_makers[k] = StoreDataGetter(store=k_attr, key=k)
        elif hasattr(k_attr, '__call__'):  # if k_attr is callable (method, function, ...)
            self.data_makers[k] = CallDataGetter(fun=k_attr)
        else:  # if not, assume there is (or will be) an attribute of that name,
               # and have the data_maker return it
            self.data_makers[k] = AttrDataGetter(obj=self, attr=k)
    if not hasattr(self, 'verbose_level'):
        self.verbose_level = 1
def project_to(self, var_list=[]):
    """ project to a subset of variables (marginalize out the other variables) """
    var_list = colloc.intersect(ascertain_list(var_list), self.vars())
    if var_list:  # if non-empty, marginalize out the other variables
        return Pot(self.tb[var_list + ['pval']].groupby(var_list).sum().reset_index())
    else:  # if var_list is empty, return a singleton potential containing the sum of the pvals of self.tb
        return Pot(pd.DataFrame({'pval': self.tb['pval'].sum()}, index=['']))
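# A small worked sketch of project_to (the potential below is invented; it assumes a Pot
# wraps a dataframe self.tb with one 'pval' column, as the code above reads):
#   >>> p.tb
#      A  B  pval
#   0  0  0   0.1
#   1  0  1   0.2
#   2  1  0   0.3
#   3  1  1   0.4
#   >>> p.project_to(['A']).tb   # marginalize out B: pvals sum within each value of A
#      A  pval
#   0  0   0.3
#   1  1   0.7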
def add_bpx_col(df, groupby_keys=[]):
    groupby_keys = ulist.ascertain_list(groupby_keys) + ['kw_stripped_and_lowered']
    df['kw_stripped_and_lowered'] = pstr_trans.lower(aw_manip.strip_kw(df['keyword']))
    dg = df.groupby(groupby_keys, group_keys=False).apply(_bpx_tag)
    del dg['kw_stripped_and_lowered']
    return dg
def mk_fanout_score_df(df, fromVars, toVars, statVars=None, keep_statVars=False):
    fromVars = ascertain_list(fromVars)
    toVars = ascertain_list(toVars)
    if statVars is None:  # default statVars to all columns that are neither fromVars nor toVars
        statVars = list(set(df.columns) - set(fromVars + toVars))
    statVars = ascertain_list(statVars)
    # make a dataframe where all rows with the same fromVars+toVars are aggregated (summing the statVars)
    agg_df = df[fromVars + toVars + statVars].groupby(fromVars + toVars, as_index=False).sum()
    # group agg_df by fromVars, keeping only fromVars+statVars
    agg_df_gr = agg_df[fromVars + statVars].groupby(fromVars)
    # compute the sum-normalized values of every group
    agg_df_freq = agg_df_gr.transform(group_normalized_freq).add_suffix('_freq_fanout_ratio')
    # compute the inverse of the group sizes
    agg_df_count = agg_df_gr.agg(group_normalized_count).add_suffix('_count_fanout_ratio')
    d = agg_df.join(agg_df_freq)
    if not keep_statVars:
        d = d.drop(statVars, axis=1)
    d = d.join(agg_df_count, on=fromVars)
    return d
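# A hedged usage sketch for mk_fanout_score_df (the frame is invented; it assumes, as the
# comments above suggest, that group_normalized_freq sum-normalizes each statVar within a
# fromVars group and group_normalized_count returns the inverse of the group size):
#   >>> df = pd.DataFrame({'src': ['a', 'a', 'b'], 'dst': ['x', 'y', 'x'],
#   ...                    'clicks': [3, 1, 5]})
#   >>> mk_fanout_score_df(df, fromVars='src', toVars='dst')
#   # 'a' fans out to two dst values, so its clicks_freq_fanout_ratio values are
#   # 0.75 and 0.25 and its clicks_count_fanout_ratio is 0.5; 'b' gets 1.0 and 1.0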
def rep_tags(df, rep_cols, with_cols, name_to_tag_fun=None):
    """
    Replaces tags (specified by with_cols and the name_to_tag_fun) in the strings of the
    rep_cols columns with the values of the with_cols columns of df.
    """
    # process inputs
    df = df.copy()
    rep_cols = util_ulist.ascertain_list(rep_cols)
    with_cols = util_ulist.ascertain_list(with_cols)
    if name_to_tag_fun is None:
        name_to_tag_fun = lambda x: name_to_tag(x, tag_str_format='#{%s}')
    tag_exp_with_col = [{'with': name, 'tag_exp': re.compile(name_to_tag_fun(name))}
                        for name in with_cols]
    # go through all rep_cols and replace tags with the value in the with_cols
    for r in rep_cols:
        for t in tag_exp_with_col:
            tag_exp = t['tag_exp']
            df[r] = map(lambda x, y: tag_exp.sub(x, y), df[t['with']], df[r])
    return df
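# A usage sketch for rep_tags (invented data; it assumes the default name_to_tag_fun maps
# a column name such as 'city' to the literal tag '#{city}', per tag_str_format above):
#   >>> df = pd.DataFrame({'ad': ['visit #{city} today'], 'city': ['Paris']})
#   >>> rep_tags(df, rep_cols='ad', with_cols='city')['ad'][0]
#   'visit Paris today'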
def group_count(df, gr_cols=None, count_col=None, keep_order=True):
    """ adds a column containing, for each row, the size of its group (groups defined by the gr_cols columns) """
    gr_cols = gr_cols or df.columns
    count_col = count_col or ut.daf.get.free_col_name(df, ['count', 'gr_count'])
    if keep_order:
        df = df.copy()
        df['column_to_keep_original_order'] = range(len(df))
    gr_cols = ulist.ascertain_list(gr_cols)
    gr_df = ut.daf.gr.group_and_count(df[gr_cols], count_col=count_col)
    df = df.merge(gr_df, left_on=gr_cols, right_on=gr_cols)
    if keep_order:
        df.sort(columns=['column_to_keep_original_order'], inplace=True)
        del df['column_to_keep_original_order']
    return df
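# A usage sketch for group_count (invented data; it assumes free_col_name returns 'count'
# here, since no column of that name exists yet):
#   >>> df = pd.DataFrame({'kw': ['a', 'b', 'a']})
#   >>> group_count(df, gr_cols='kw')
#     kw  count
#   0  a      2
#   1  b      1
#   2  a      2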
def get_info_df(store, keys=None, info=None, cols=None):
    # process inputs
    if not keys:
        keys = store.keys()
    else:
        keys = util_ulist.ascertain_list(keys)
        keys = colloc.intersect(keys, store.keys())
    # get info_dict
    info_dict = get_info_dict(store)
    # make the df
    df = pd.DataFrame([dict(v, **{'key': k}) for k, v in info_dict.iteritems()])
    df = df[df['key'].isin(keys)]
    if 'shape' in df.columns:
        del df['shape']
    if 'ncols' not in df.columns:
        df['ncols'] = np.nan
    if 'nrows' not in df.columns:
        df['nrows'] = np.nan
    # fill in missing ncols and nrows
    idx = df['ncols'].isnull().nonzero()[0]  # ncols and nrows should both be missing when one is
    for i in idx:
        d = store[df['key'].iloc[i]]
        df['nrows'].iloc[i] = len(d)
        df['ncols'].iloc[i] = len(d.columns)
    # clean up and return
    df = df.set_index('key')
    df = df.sort_index()
    df = daf_manip.reorder_columns_as(df, ['nrows', 'ncols', 'isa', 'typ', 'indexers', 'dc'])
    df = df.replace(to_replace=np.nan, value='')
    if info:
        if isinstance(info, dict):
            # add one column per key of the dict, using the dict values as functions that are
            # applied to the whole stored dataframe to compute the column value
            df = pd.concat([df, pd.DataFrame(columns=info.keys(), index=df.index)], axis=1)
            for key in df.index.values:
                key_data = store[key]
                for k, v in info.iteritems():
                    df[k].loc[key] = v(key_data)
        elif np.all(map(lambda x: isinstance(x, basestring), info)):
            df = daf_manip.filter_columns(df, info)
        else:
            raise ValueError('Unrecognized info format')
    # filter cols
    if cols:
        df = daf_manip.filter_columns(df, cols)
    return df
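# A usage sketch for get_info_df's info-dict mechanism (store contents invented): each
# dict value is a function applied to the full stored dataframe to populate a column.
#   >>> get_info_df(store, info={'n_null': lambda d: d.isnull().sum().sum()})
#   # -> the usual nrows/ncols info df, plus an 'n_null' column computed per key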
def mk_data_flow(self):
    # make sure values of data_dependencies are lists
    self.data_dependencies = {k: util_ulist.ascertain_list(v)
                              for k, v in self.data_dependencies.iteritems()}
    # default missing data_makers to attributes (typically methods) of the same name;
    # dependencies with no same-named attribute are collected as "bundles"
    missing_data_makers = set(self.data_dependencies.keys()).difference(self.data_makers.keys())
    bundles = list()
    for k in missing_data_makers:
        if hasattr(self, k):
            self.data_makers[k] = getattr(self, k)
        else:
            bundles.append(k)
    if not hasattr(self, 'verbose_level'):
        self.verbose_level = 1
    if bundles:
        print("Bundles:")
        for k in bundles:
            print("  {}:\n    {}".format(k, ', '.join(self.data_dependencies[k])))
def rollout_cols(df, cols_to_rollout=None):
    """
    Rolls out the values of cols_to_rollout so that each individual element of a list (or other
    iterable) gets its own row, with the non-cols_to_rollout values aligned with them as in the
    original dataframe.
    Example:
        df =
            A   B
            1   [11, 111]
            2   [22]
            3   [3, 33, 333]
        rollout_cols(df, cols_to_rollout='B') =
            A   B
            1   11
            1   111
            2   22
            3   3
            3   33
            3   333
    """
    # if no cols_to_rollout is given, (try to) roll out all columns that are iterable (lists, etc.)
    cols_to_rollout = cols_to_rollout or daf_diagnosis.cols_that_are_of_the_type(df, util_var.is_an_iter)
    # make sure cols_to_rollout is a list
    cols_to_rollout = util_ulist.ascertain_list(cols_to_rollout)
    # get the non-rollout columns
    non_rollout_columns = colloc.setdiff(df.columns, cols_to_rollout)
    # make an array with the lengths of the lists to roll out (taken from the first of the
    # cols_to_rollout, assuming all cols_to_rollout have the same list lengths)
    rollout_lengths = np.array(df[cols_to_rollout[0]].apply(len))
    # create the rollout_df dataframe (this will be the output), seeded with a dummy column of
    # the right length so that the rolled-out columns can be assigned into it
    rollout_df = pd.DataFrame(range(np.sum(rollout_lengths)))
    # roll out the cols_to_rollout
    for c in cols_to_rollout:
        rollout_df[c] = np.concatenate(list(df[c]))
    # repeat each non-rollout value as many times as the length of the list it is aligned with
    for c in non_rollout_columns:
        t = [np.tile(x, (y, 1)) for (x, y) in zip(df[c], rollout_lengths)]
        try:
            rollout_df[c] = np.concatenate(t)
        except ValueError:
            rollout_df[c] = [x for x in chain(*t)]
    # put the columns back in their original order (this also drops the dummy seed column)
    return rollout_df[df.columns]
def copy_data(from_store, to_store, from_keys, overwrite=False):
    '''
    Copies key contents from one store to another, overwriting or not (default), and respecting
    the original store format.
    :param from_store: store (or path of a store) to copy from
    :param to_store: store (or path of a store) to copy to
    :param from_keys: list of keys to copy from from_store
    :param overwrite: if True, existing keys in to_store will be overwritten;
        if False, keys that already exist will silently not be copied
    :return: None
    '''
    # handle input formats
    if isinstance(from_store, basestring):
        from_store = MyStore(from_store)
        close_from_store = True
    else:
        close_from_store = False
    if isinstance(to_store, basestring):
        to_store = MyStore(to_store)
        close_to_store = True
    else:
        close_to_store = False
    from_keys = util_ulist.ascertain_list(from_keys)
    from_keys = map(ascertain_prefix_slash, from_keys)
    # if overwrite is False, keep only those keys that don't already exist in to_store
    if not overwrite:
        from_keys = list(set(from_keys).difference(to_store.keys()))
    # get some info on the from_store
    store_info = get_info_dict(from_store)
    # do the copying
    for k in from_keys:
        store_df_respecting_given_format(to_store, k, from_store[k], key_info=store_info[k])
    to_store.flush()
    # close the stores (if they were specified by paths)
    if close_from_store:
        from_store.close()
    if close_to_store:
        to_store.close()
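# A usage sketch for copy_data (the paths and keys are invented): copy two keys between
# stores on disk, silently skipping any that already exist in the destination.
#   >>> copy_data('source.h5', 'backup.h5', from_keys=['/kw_stats', '/ad_stats'])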
def get_col_names(store, keys=None, singular_info='index_and_columns', print_results=False, style='dict'):
    '''
    :param store: an HDFStore
    :param keys: list of keys to get info from (if present)
    :return: a cols_info dict whose keys are the keys of the store and whose values are dicts
        with 'index', 'columns', and 'index_and_columns' entries containing the corresponding
        data col names
    '''
    # process inputs
    if not keys:
        keys = store.keys()
    else:
        keys = util_ulist.ascertain_list(keys)
        keys = colloc.intersect(keys, store.keys())
    # make a dict with col (and index) info
    cols_info = dict()
    for key in keys:
        cols_info[key] = dict()
        df = store[key]
        cols_info[key]['index'] = list(df.index.names)
        cols_info[key]['columns'] = list(df.columns)
        cols_info[key]['index_and_columns'] = cols_info[key]['index'] + cols_info[key]['columns']
    # if singular_info is given, keep only that entry for each key
    if singular_info:
        cols_info = {key: cols_info[key][singular_info] for key in keys}
    if print_results:
        PrettyPrinter(indent=2).pprint(cols_info)
    if style == 'dataframe':
        d = pd.DataFrame()
        for k, v in cols_info.iteritems():
            v = [x for x in v if x]
            d = pd.concat([d, pd.DataFrame(data=v, columns=[k])], axis=1)
        d = d.fillna(value='')
        cols_info = d.transpose()
    return cols_info
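# A usage sketch for get_col_names (the key and column names are invented):
#   >>> get_col_names(store, keys='/kw_stats', singular_info='columns')
#   {'/kw_stats': ['keyword', 'impressions', 'clicks']}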
def join_col(self, df, add_cols, join_cols=None, join_key=None, join_store=None,
             join_filter=None, drop_joining_duplicates=True):
    """
    Returns the input df with add_cols added. These columns are fetched from
    join_store[join_key] and are aligned to df using join_cols.
    Note: At the time of this writing, only a restricted case is handled, namely:
    join_cols has only one element, which must be in the index of the store.
    """
    join_store = join_store or self.join_store
    join_key = join_key or self.join_key
    if isinstance(add_cols, basestring):
        if add_cols in self.add_from.keys():
            if 'join_store' in self.add_from[add_cols].keys():
                join_store = join_store or self.add_from[add_cols]['join_store']
            if 'join_key' in self.add_from[add_cols].keys():
                join_key = join_key or self.add_from[add_cols]['join_key']
            if 'join_cols' in self.add_from[add_cols].keys():
                join_cols = join_cols or self.add_from[add_cols]['join_cols']
    join_cols = util_ulist.ascertain_list(join_cols)
    add_cols = util_ulist.ascertain_list(add_cols)
    # get the df values to join (and see whether they're in the columns or the index)
    if coll_op.contains(list(df.columns), join_cols):
        df_join_cols_in_columns = True
        df_join_col_values = np.unique(df[join_cols])
    else:
        df_join_cols_in_columns = False
        df_join_col_values = np.unique(list(df.index))
    # get the necessary information from the store
    store_key_info = self.store_info[join_store]
    join_key = ascertain_prefix_slash(join_key)
    store_key_info = store_key_info[join_key]
    if len(join_cols) == 1 and join_cols[0] == 'index':
        print "uploading only specific indices for join_df"
        join_df = self.store[join_store].select(
            key=join_key, where=[pd.Term('index', df_join_col_values)], columns=add_cols)
    elif join_cols[0] in store_key_info['column_names']:
        print "uploading only specific columns for join_df"
        join_df = self.store[join_store].select(
            key=join_key, where=[pd.Term(join_cols[0], df_join_col_values)],
            columns=join_cols + add_cols)
        join_df = join_df.set_index(join_cols[0])  # set_index returns a copy, so reassign
    else:
        print "uploading the whole potential join_df"
        join_df = self.store[join_store].select(key=join_key, columns=join_cols + add_cols)
    # drop duplicates
    if drop_joining_duplicates:
        join_df = join_df.drop_duplicates()
    join_df_cols_in_cols = coll_op.contains(list(join_df.columns), join_cols)
    # join, matching each side on its columns or its index according to the flags above
    if df_join_cols_in_columns:
        if join_df_cols_in_cols:
            return pd.merge(df, join_df, on=join_cols)
        else:
            return pd.merge(df, join_df, left_on=join_cols, right_index=True)
    else:
        if join_df_cols_in_cols:
            return pd.merge(df, join_df, left_index=True, right_on=join_cols)
        else:
            return pd.merge(df, join_df, left_index=True, right_index=True)
def has_attributes(obj, attr_list):
    attr_list = util_ulist.ascertain_list(attr_list)
    return all(x in obj.__dict__ for x in attr_list)
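# A usage sketch for has_attributes (note that it checks obj.__dict__, so only instance
# attributes count, not class attributes or methods):
#   >>> class C(object):
#   ...     def __init__(self):
#   ...         self.x = 1
#   >>> has_attributes(C(), 'x')
#   True
#   >>> has_attributes(C(), ['x', 'y'])
#   False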
def order_vars(self, var_list, sort_pts=True):
    self.tb = reorder_columns_as(self.tb, ascertain_list(var_list))
    if sort_pts:
        self.sort_pts()
    return self
def add_grp_info(grp):
    print [ulist.ascertain_list(my_counter.next()) for i in range(len(grp))]
def all_but(d, exclude_keys):
    return get_subdict(d, set(d.keys()).difference(ulist.ascertain_list(exclude_keys)))
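# A usage sketch for all_but (it assumes get_subdict(d, keys) returns the restriction of
# d to the given keys):
#   >>> all_but({'a': 1, 'b': 2, 'c': 3}, 'b')
#   {'a': 1, 'c': 3}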
def rm_cols_if_present(df, cols):
    cols = util_ulist.ascertain_list(cols)
    return df[colloc.setdiff(df.columns, cols)]
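# A usage sketch for rm_cols_if_present (invented frame; columns absent from df are
# silently ignored, assuming colloc.setdiff preserves column order):
#   >>> df = pd.DataFrame({'a': [1], 'b': [2]})
#   >>> rm_cols_if_present(df, ['b', 'nonexistent'])
#      a
#   0  1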