def map_col_vals_to_ints(df, column_to_change, return_map=False):
    cols = df.columns
    unik_vals_map = ch_col_names(
        pd.DataFrame(df[column_to_change].unique()).reset_index(),
        ['tmp_new_col', column_to_change])
    df = pd.merge(df, unik_vals_map)
    df = rm_cols_if_present(df, column_to_change)
    df = ch_col_names(df, column_to_change, 'tmp_new_col')
    if return_map:
        return df[cols], unik_vals_map
    else:
        return df[cols]
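# Illustrative usage sketch (not part of the original source): a hypothetical toy frame
# showing how map_col_vals_to_ints swaps a column's values for integer codes, and how
# return_map=True also hands back the value-to-integer mapping frame.
def _example_map_col_vals_to_ints():
    example_df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'clicks': [1, 2, 3]})
    coded_df, color_map = map_col_vals_to_ints(example_df, 'color', return_map=True)
    return coded_df, color_map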
def add_query_fanout_scores(query_report_df):
    if 'destination' not in query_report_df.columns:
        query_report_df = add_destination(query_report_df)
    ad_group_fanout = mk_query_fanout_scores(query_report_df, target='ad_group',
                                             statVars='impressions', keep_statVars=False)
    ad_group_fanout = daf_ch.ch_col_names(ad_group_fanout,
                                          ['ad_group_imps_freq_fanout_ratio', 'ad_group_count_fanout_ratio'],
                                          ['impressions_freq_fanout_ratio', 'impressions_count_fanout_ratio'])
    destination_fanout = mk_query_fanout_scores(query_report_df, target='destination',
                                                statVars='impressions', keep_statVars=False)
    destination_fanout = daf_ch.ch_col_names(destination_fanout,
                                             ['destination_imps_freq_fanout_ratio', 'destination_count_fanout_ratio'],
                                             ['impressions_freq_fanout_ratio', 'impressions_count_fanout_ratio'])
    query_report_df = query_report_df.merge(ad_group_fanout, on=['search_term', 'ad_group'])
    query_report_df = query_report_df.merge(destination_fanout, on=['search_term', 'destination'])
    return query_report_df
def get_kw_duplicates_01(df, dup_def='kw_lower', gr_keys=['match_type', 'ad_group', 'campaign']):
    """
    old function to get kw_duplicates
    probably better to use kw_dup_diagnosis
    """
    if dup_def == 'all':
        # get kw_lower dup_count df
        d = get_kw_duplicates_01(df, dup_def='kw_lower', gr_keys=gr_keys)
        d = daf_ch.ch_col_names(d, 'dup_count', 'lower_dups')
        del d['kw_lower']
        # get kw_lower_ascii dup_count df
        dd = get_kw_duplicates_01(df, dup_def='kw_lower_ascii', gr_keys=gr_keys)
        dd = daf_ch.ch_col_names(dd, 'dup_count', 'ascii_dups')
        del dd['kw_lower_ascii']
        # merge d and dd
        d = d.merge(dd, how='outer')
        # get kw_lower_ascii_ordered dup_count df
        dd = get_kw_duplicates_01(df, dup_def='kw_lower_ascii_ordered', gr_keys=gr_keys)
        dd = daf_ch.ch_col_names(dd, 'dup_count', 'order_dups')
        del dd['kw_lower_ascii_ordered']
        # merge d and dd
        d = d.merge(dd, how='outer')
        # replace nans with 0s
        d = d.fillna(0)
        return d
    else:
        df = df.copy()  # make a copy
        if dup_def == 'kw_lower':
            d = add_col(df, 'kw_lower', overwrite=False)
            gr_keys = oc.union(['kw_lower'], gr_keys)
        elif dup_def == 'kw_lower_ascii':
            d = add_col(df, 'kw_lower_ascii', overwrite=False)
            gr_keys = oc.union(['kw_lower_ascii'], gr_keys)
        elif dup_def == 'kw_lower_ascii_ordered':
            d = df[df.match_type == 'Broad']
            d = add_col(d, 'kw_lower_ascii_ordered', overwrite=False)
            gr_keys = oc.union(['kw_lower_ascii_ordered'], gr_keys)
        else:
            raise ValueError("don't know how to handle that dup_def")
        assert_dependencies(d, gr_keys, "to get duplicates")
        return get_duplicates(d, gr_keys, keep_count=True)
def termdoc_to_doc_counts(term_doc_df, doc_var=None, term_var='term', count_var='count'):
    term_doc_df, doc_var, term_var = \
        __process_term_doc_var_names__(term_doc_df, doc_var=doc_var, term_var=term_var)
    # keep only doc and terms, and one copy of any (doc, term) pair
    term_doc_df = term_doc_df[[doc_var, term_var]]\
        .drop_duplicates(cols=[doc_var, term_var]).reset_index(drop=True)
    # group by terms
    term_doc_df = term_doc_df[[term_var]].groupby(term_var).count()
    term_doc_df = daf_ch.ch_col_names(term_doc_df, count_var, term_var)
    return term_doc_df
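# Illustrative sketch (not part of the original source, and relying on the older pandas
# API this module uses): given a long (doc, term) table, termdoc_to_doc_counts returns,
# for each term, the number of distinct docs containing it, in a column named 'count'.
def _example_termdoc_to_doc_counts():
    term_doc = pd.DataFrame({'doc': ['d1', 'd1', 'd2', 'd2', 'd3'],
                             'term': ['hotel', 'paris', 'hotel', 'rome', 'hotel']})
    return termdoc_to_doc_counts(term_doc, doc_var='doc')  # hotel -> 3, paris -> 1, rome -> 1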
def add_target_scores(query_report_df):
    vars_to_keep = ['search_term', 'impressions', 'destination', 'ad_group',
                    'destination_imps_freq_fanout_ratio', 'ad_group_imps_freq_fanout_ratio']
    query_report_df = add_query_fanout_scores(query_report_df)
    query_report_df = query_report_df[vars_to_keep]
    query_report_df = daf_ch.ch_col_names(query_report_df,
                                          ['ad_group_score', 'destination_score'],
                                          ['ad_group_imps_freq_fanout_ratio', 'destination_imps_freq_fanout_ratio'])
    query_report_df = query_report_df.sort(columns=['search_term', 'destination_score', 'ad_group_score'])
    return query_report_df
def from_count_df_to_count(cls, count_df, count_col='pval'):
    """
    Creates a potential from a dataframe specifying point counts
    (where the count column name is specified by count_col).
    """
    pot_vars = list(colloc.setdiff(count_df.columns, [count_col]))
    tb = count_df[pot_vars + [count_col]].groupby(pot_vars).sum().reset_index()
    tb = ch_col_names(tb, 'pval', count_col)
    return Pot(tb)
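# Illustrative sketch (not part of the original source): assuming this is exposed as a
# classmethod on Pot (e.g. Pot.from_count_df_to_count), a hypothetical count dataframe
# like the one below is summed over its non-count columns and the count column is
# renamed to 'pval'.
def _example_from_count_df_to_count():
    count_df = pd.DataFrame({'color': ['red', 'red', 'blue'], 'n': [2, 3, 5]})
    return Pot.from_count_df_to_count(count_df, count_col='n')  # red -> 5, blue -> 5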
def from_points_to_bins(cls, pts, **kwargs):
    """
    Creates a potential from a dataframe of points, by grouping identical points
    and counting them.
    """
    if isinstance(pts, pd.DataFrame):
        tb = group_and_count(pts)
        tb = ch_col_names(tb, 'pval', 'count')
        return Pot(tb)
def __process_term_doc_var_names__(term_doc_df, doc_var=None, term_var='term'):
    if term_var != 'term':
        # normalize the term column name to 'term'
        term_doc_df = daf_ch.ch_col_names(term_doc_df, ['term'], [term_var])
        term_var = 'term'
    cols = term_doc_df.columns
    if doc_var is None:  # try to guess it
        if len(cols) != 2:
            raise ValueError("In order to guess the doc_var, there needs to be only two columns")
        else:
            doc_var = list(set(cols) - set([term_var]))[0]
    return (term_doc_df, doc_var, term_var)
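# Illustrative sketch (not from the original source): with exactly two columns, the doc
# column is inferred as "the column that isn't the term column"; with more columns,
# doc_var must be passed explicitly. The toy frame below is hypothetical.
def _example_process_term_doc_var_names():
    df = pd.DataFrame({'query': ['q1', 'q2'], 'term': ['hotel', 'paris']})
    return __process_term_doc_var_names__(df)  # -> (df, 'query', 'term')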
def mk_df_of_travel_domains():
    # set up resources
    html_folder = '/D/Dropbox/dev/py/data/html/google_results_tests/'
    file_list = ['hotel - 100 Google Search Results.html',
                 'find hotel deals - 100 Google Search Results.html',
                 'hotel travel sites - 100 Google Search Results.html',
                 'find hotels - 100 Google Search Results.html',
                 'hotel paris - 100 Google Search Results.html',
                 'hotel rome - 100 Google Search Results.html',
                 'hotel london - 100 Google Search Results.html',
                 'hotel nyc - 100 Google Search Results.html',
                 'hotels in france - 100 Google Search Results.html',
                 'hotels in italy - 100 Google Search Results.html']
    filepath_list = [os.path.join(html_folder, f) for f in file_list]
    # parse all this
    r = [google.mk_gresult_tag_dict(f) for f in filepath_list]
    r = [google.parse_tag_dict(f) for f in r]
    # make domain lists
    org_domain_list = []
    ads_domain_list = []
    for rr in r:
        rrr = rr['organic_results_list']
        org_domain_list = org_domain_list + [x['domain'] for x in rrr if 'domain' in x]
        rrr = rr['rhs_ads_list']
        ads_domain_list = ads_domain_list + [x['disp_url_domain'] for x in rrr if 'disp_url_domain' in x]
        rrr = rr['top_ads_list']
        ads_domain_list = ads_domain_list + [x['disp_url_domain'] for x in rrr if 'disp_url_domain' in x]
    domain_list = org_domain_list + ads_domain_list
    print("number of org_domain_list entries = %d" % len(org_domain_list))
    print("number of ads_domain_list entries = %d" % len(ads_domain_list))
    print("number of (all) domain_list entries = %d" % len(domain_list))
    # make a dataframe counting the number of times we encounter each domain
    df = pd.DataFrame(domain_list, columns=['domain'])
    dg = df.groupby('domain').count()
    dg = daf_ch.ch_col_names(dg, 'count', 'domain')
    thresh = 4
    print("length before removing count<%d entries = %d" % (thresh, len(dg)))
    dg = dg[dg['count'] >= thresh]
    print("length after removing count<%d entries = %d" % (thresh, len(dg)))
    dg['frequency'] = dg['count'] / float(max(dg['count']))
    dg = dg.sort(columns=['count'], ascending=False)
    # return this!
    return dg
def termdoc_to_term_idf(term_doc_df, doc_var=None, term_var='term'):
    # processing input
    term_doc_df, doc_var, term_var = \
        __process_term_doc_var_names__(term_doc_df, doc_var=doc_var, term_var=term_var)
    # get the number of docs
    num_of_docs = len(np.unique(term_doc_df[doc_var]))
    # get doc_counts (one row per term; the doc-count column is named 'count',
    # the default count_var of termdoc_to_doc_counts)
    term_doc_df = termdoc_to_doc_counts(term_doc_df, doc_var, term_var)
    # # keep only doc and terms, and one copy of any (doc,term) pair
    # term_doc_df = term_doc_df[[doc_var, 'term']].drop_duplicates(cols=[doc_var, 'term']).reset_index(drop=True)
    # # group by terms
    # term_doc_df = term_doc_df[['term']].groupby('term').count()
    term_doc_df['count'] = \
        semantics_math.idf_log10(num_of_docs_containing_term=np.array(term_doc_df['count']),
                                 num_of_docs=float(num_of_docs))
    return daf_ch.ch_col_names(term_doc_df, ['stat'], ['count'])
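# Note (assumption, not stated in the original source): semantics_math.idf_log10 is
# presumably the standard base-10 inverse document frequency,
#     idf(t) = log10(num_of_docs / num_of_docs_containing_term),
# so a term appearing in every doc scores 0 and rarer terms score higher. A minimal
# numpy sketch of that assumed formula:
def _example_idf_log10(num_of_docs_containing_term, num_of_docs):
    return np.log10(float(num_of_docs) / np.array(num_of_docs_containing_term))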
def termdoc_to_termdoc_count(term_doc_df, doc_var=None, term_var='term', count_var='count'):
    # processing input
    term_doc_df, doc_var, term_var = \
        __process_term_doc_var_names__(term_doc_df, doc_var=doc_var, term_var=term_var)
    term_doc_df = term_doc_df.groupby([doc_var, term_var]).count()
    term_doc_df = daf_ch.ch_col_names(term_doc_df, count_var, term_var)
    return term_doc_df
def ch_col_names(self, df):
    return daf_ch.ch_col_names(df, self.variable_map.values(), self.variable_map.keys())
def id_year_of_birth(raw_life_course_data, **kwargs):
    d = raw_life_course_data[['id', 'year']].groupby('id').min()
    return daf_ch.ch_col_names(d, 'yob', 'year').reset_index()
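# Illustrative sketch (not part of the original source): each id's earliest 'year' is
# taken as its year of birth and the column is renamed 'yob'. Hypothetical toy data:
def _example_id_year_of_birth():
    raw = pd.DataFrame({'id': [1, 1, 2], 'year': [1980, 1995, 1972]})
    return id_year_of_birth(raw)  # -> columns ['id', 'yob']: (1, 1980), (2, 1972)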
def add_is_unik():
    # counts how many times each tok_str occurs (tok_lists is assumed to be in scope)
    tok_str_count = daf_ch.ch_col_names(
        tok_lists[['tok_str']].groupby('tok_str').count(), 'tok_str')
def mk_awe_cols(df):
    old_cols = df.columns
    new_cols = [awe_col_replacer(col) for col in old_cols]
    return daf_ch.ch_col_names(df, new_cols, old_cols)
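# Illustrative sketch (assumption): awe_col_replacer is presumably a string-to-string
# column-name mapper defined elsewhere; mk_awe_cols just applies it to every column.
# The same pattern with an inline, hypothetical replacer:
def _example_rename_all_cols(df, replacer=lambda c: c.replace(' ', '_').lower()):
    old_cols = df.columns
    new_cols = [replacer(col) for col in old_cols]
    return daf_ch.ch_col_names(df, new_cols, old_cols)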