Example #1
File: manip.py Project: yz-/ut
def map_col_vals_to_ints(df, column_to_change, return_map=False):
    cols = df.columns
    unik_vals_map = ch_col_names(
        pd.DataFrame(df[column_to_change].unique()).reset_index(), ['tmp_new_col', column_to_change])
    df = pd.merge(df, unik_vals_map)
    df = rm_cols_if_present(df, column_to_change)
    df = ch_col_names(df, column_to_change, 'tmp_new_col')
    if return_map:
        return df[cols], unik_vals_map  # return the data along with the value -> int map
    else:
        return df[cols]
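The ch_col_names helper that all of these examples call is not itself shown on this page. Judging from the call sites, it takes the new names first and the old names second, and a single-list form replaces all column names at once. A minimal sketch under those assumptions:

import pandas as pd

def ch_col_names(df, new_names, old_names=None):
    # Assumed semantics, inferred from the call sites on this page:
    # rename old_names -> new_names; if old_names is omitted, replace
    # all column names. Single strings act as one-element lists.
    new_names = [new_names] if isinstance(new_names, str) else list(new_names)
    if old_names is None:
        df = df.copy()
        df.columns = new_names
        return df
    old_names = [old_names] if isinstance(old_names, str) else list(old_names)
    return df.rename(columns=dict(zip(old_names, new_names)))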
Example #2
File: khan01_spike.py Project: SRHerzog/ut
def add_query_fanout_scores(query_report_df):
    if 'destination' not in query_report_df.columns:
        query_report_df = add_destination(query_report_df)
    ad_group_fanout = mk_query_fanout_scores(query_report_df, target='ad_group',
                                             statVars='impressions', keep_statVars=False)
    ad_group_fanout = daf_ch.ch_col_names(ad_group_fanout,
                                          ['ad_group_imps_freq_fanout_ratio', 'ad_group_count_fanout_ratio'],
                                          ['impressions_freq_fanout_ratio', 'impressions_count_fanout_ratio'])
    destination_fanout = mk_query_fanout_scores(query_report_df, target='destination',
                                                statVars='impressions', keep_statVars=False)
    destination_fanout = daf_ch.ch_col_names(destination_fanout,
                                             ['destination_imps_freq_fanout_ratio', 'destination_count_fanout_ratio'],
                                             ['impressions_freq_fanout_ratio', 'impressions_count_fanout_ratio'])
    query_report_df = query_report_df.merge(ad_group_fanout, on=['search_term', 'ad_group'])
    query_report_df = query_report_df.merge(destination_fanout, on=['search_term', 'destination'])
    return query_report_df
Example #3
File: dup_diag.py Project: SRHerzog/ut
def get_kw_duplicates_01(df,
                         dup_def='kw_lower',
                         gr_keys=['match_type', 'ad_group', 'campaign']):
    """
    old function to get kw_duplicates
    probably better to use kw_dup_diagnosis
    """
    if dup_def == 'all':
        # get kw_lower dup_count df
        d = get_kw_duplicates_01(df, dup_def='kw_lower', gr_keys=gr_keys)
        d = daf_ch.ch_col_names(d, 'lower_dups', 'dup_count')  # rename dup_count -> lower_dups
        del d['kw_lower']
        # get kw_lower_ascii dup_count df
        dd = get_kw_duplicates_01(df,
                                  dup_def='kw_lower_ascii',
                                  gr_keys=gr_keys)
        dd = daf_ch.ch_col_names(dd, 'ascii_dups', 'dup_count')  # rename dup_count -> ascii_dups
        del dd['kw_lower_ascii']
        # merge d and dd
        d = d.merge(dd, how='outer')
        # get kw_lower_ascii_ordered dup_count df
        dd = get_kw_duplicates_01(df,
                                  dup_def='kw_lower_ascii_ordered',
                                  gr_keys=gr_keys)
        dd = daf_ch.ch_col_names(dd, 'order_dups', 'dup_count')  # rename dup_count -> order_dups
        del dd['kw_lower_ascii_ordered']
        # merge d and dd
        d = d.merge(dd, how='outer')
        # replace nans with 0s
        d = d.fillna(0)
        return d
    else:
        df = df.copy()  # make a copy
        if dup_def == 'kw_lower':
            d = add_col(df, 'kw_lower', overwrite=False)
            gr_keys = oc.union(['kw_lower'], gr_keys)
        elif dup_def == 'kw_lower_ascii':
            d = add_col(df, 'kw_lower_ascii', overwrite=False)
            gr_keys = oc.union(['kw_lower_ascii'], gr_keys)
        elif dup_def == 'kw_lower_ascii_ordered':
            d = df[df.match_type == 'Broad']
            d = add_col(d, 'kw_lower_ascii_ordered', overwrite=False)
            gr_keys = oc.union(['kw_lower_ascii_ordered'], gr_keys)
        else:
            raise ValueError("don't know how to handle that dup_def")
        assert_dependencies(d, gr_keys, "to get duplicates")
        return get_duplicates(d, gr_keys, keep_count=True)
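The get_duplicates helper that this function returns through is not shown on this page either. A hypothetical stand-in, consistent with the dup_count column the caller renames above:

def get_duplicates(df, gr_keys, keep_count=True):
    # Group by the key columns and keep only key combinations that occur
    # more than once; keep_count retains the group size as 'dup_count'.
    counts = df.groupby(gr_keys).size().rename('dup_count').reset_index()
    dups = counts[counts['dup_count'] > 1]
    return dups if keep_count else dups.drop(columns='dup_count')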
Example #4
File: term_stats.py Project: yz-/ut
def termdoc_to_doc_counts(term_doc_df, doc_var=None, term_var='term', count_var='count'):
    term_doc_df, doc_var, term_var = __process_term_doc_var_names__(term_doc_df, doc_var=doc_var, term_var=term_var)
    # keep only doc and terms, and one copy of any (doc,term) pair
    term_doc_df = term_doc_df[[doc_var, term_var]].drop_duplicates(subset=[doc_var, term_var]).reset_index(drop=True)
    # group by terms and count the number of docs containing each term
    term_doc_df = term_doc_df.groupby(term_var).size().to_frame(term_var)
    term_doc_df = daf_ch.ch_col_names(term_doc_df, count_var, term_var)
    return term_doc_df
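A quick hypothetical run, to make the deduplication concrete:

# Hypothetical input: which terms occur in which docs (repeats allowed).
term_doc = pd.DataFrame({'doc': ['d1', 'd1', 'd2', 'd2'],
                         'term': ['apple', 'pear', 'apple', 'apple']})
doc_counts = termdoc_to_doc_counts(term_doc, doc_var='doc')
# expected: apple -> 2 (it occurs in d1 and d2; the repeat in d2 is deduped),
#           pear  -> 1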
Example #5
File: term_stats.py Project: SRHerzog/ut
def termdoc_to_doc_counts(term_doc_df, doc_var=None, term_var='term', count_var='count'):
    term_doc_df, doc_var, term_var = __process_term_doc_var_names__(term_doc_df, doc_var=doc_var, term_var=term_var)
    # keep only doc and terms, and one copy of any (doc,term) pair
    term_doc_df = term_doc_df[[doc_var, term_var]].drop_duplicates(subset=[doc_var, term_var]).reset_index(drop=True)
    # group by terms and count the number of docs containing each term
    term_doc_df = term_doc_df.groupby(term_var).size().to_frame(term_var)
    term_doc_df = daf_ch.ch_col_names(term_doc_df, count_var, term_var)
    return term_doc_df
Example #6
File: khan01_spike.py Project: SRHerzog/ut
def add_target_scores(query_report_df):
    vars_to_keep = ['search_term', 'impressions', 'destination', 'ad_group',
                    'destination_imps_freq_fanout_ratio', 'ad_group_imps_freq_fanout_ratio']
    query_report_df = add_query_fanout_scores(query_report_df)
    query_report_df = query_report_df[vars_to_keep]
    query_report_df = daf_ch.ch_col_names(query_report_df,
                                          ['ad_group_score', 'destination_score'],
                                          ['ad_group_imps_freq_fanout_ratio', 'destination_imps_freq_fanout_ratio'])
    query_report_df = query_report_df.sort_values(by=['search_term', 'destination_score', 'ad_group_score'])
    return query_report_df
Example #7
File: pot.py Project: yz-/ut
def from_count_df_to_count(cls, count_df, count_col='pval'):
    """
    Creates a potential from a dataframe specifying point counts (where the
    count column name is specified by count_col).
    """
    pot_vars = list(colloc.setdiff(count_df.columns, [count_col]))
    tb = count_df[pot_vars + [count_col]].groupby(pot_vars).sum().reset_index()
    tb = ch_col_names(tb, 'pval', count_col)
    return Pot(tb)
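A hypothetical usage, assuming this is a classmethod of the Pot class in pot.py; note that the groupby().sum() aggregates duplicate points:

counts = pd.DataFrame({'x': [0, 0, 1, 1], 'y': [0, 1, 0, 0], 'n': [1, 2, 3, 4]})
pot = Pot.from_count_df_to_count(counts, count_col='n')
# the two (x=1, y=0) rows collapse into a single point with pval 3 + 4 = 7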
Example #8
File: pot.py Project: yz-/ut
    def from_points_to_bins(cls, pts, **kwargs):
        """
        Creates a potential from a dataframe of points, counting the
        occurrences of each distinct point.
        """
        if isinstance(pts, pd.DataFrame):
            tb = group_and_count(pts)
            tb = ch_col_names(tb, 'pval', 'count')
            return Pot(tb)
Example #9
File: term_stats.py Project: yz-/ut
def __process_term_doc_var_names__(term_doc_df, doc_var=None, term_var='term'):
    if term_var != 'term':
        # normalize the user's term column name to the canonical 'term'
        term_doc_df = daf_ch.ch_col_names(term_doc_df, ['term'], [term_var])
        term_var = 'term'
    cols = term_doc_df.columns
    if doc_var is None:  # try to guess it
        if len(cols) != 2:
            raise ValueError("In order to guess the doc_var, there needs to be only two columns")
        else:
            doc_var = list(set(cols)-set([term_var]))[0]
    return (term_doc_df, doc_var, term_var)
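For example, the doc_var guess only succeeds on a two-column frame:

df = pd.DataFrame({'term': ['a', 'b'], 'doc_id': [1, 2]})
_, doc_var, term_var = __process_term_doc_var_names__(df)
# doc_var is inferred as 'doc_id' (the one column that isn't term_var)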
Example #10
File: term_stats.py Project: SRHerzog/ut
def __process_term_doc_var_names__(term_doc_df, doc_var=None, term_var='term'):
    if term_var != 'term':
        # normalize the user's term column name to the canonical 'term'
        term_doc_df = daf_ch.ch_col_names(term_doc_df, ['term'], [term_var])
        term_var = 'term'
    cols = term_doc_df.columns
    if doc_var is None:  # try to guess it
        if len(cols) != 2:
            raise ValueError("In order to guess the doc_var, there needs to be only two columns")
        else:
            doc_var = list(set(cols)-set([term_var]))[0]
    return (term_doc_df, doc_var, term_var)
Example #11
File: dup_diag.py Project: yz-/ut
def get_kw_duplicates_01(df,
                         dup_def='kw_lower',
                         gr_keys=['match_type', 'ad_group', 'campaign']):
    """
    old function to get kw_duplicates
    probably better to use kw_dup_diagnosis
    """
    if dup_def == 'all':
        # get kw_lower dup_count df
        d = get_kw_duplicates_01(df, dup_def='kw_lower', gr_keys=gr_keys)
        d = daf_ch.ch_col_names(d, 'lower_dups', 'dup_count')  # rename dup_count -> lower_dups
        del d['kw_lower']
        # get kw_lower_ascii dup_count df
        dd = get_kw_duplicates_01(df, dup_def='kw_lower_ascii', gr_keys=gr_keys)
        dd = daf_ch.ch_col_names(dd, 'ascii_dups', 'dup_count')  # rename dup_count -> ascii_dups
        del dd['kw_lower_ascii']
        # merge d and dd
        d = d.merge(dd, how='outer')
        # get kw_lower_ascii_ordered dup_count df
        dd = get_kw_duplicates_01(df, dup_def='kw_lower_ascii_ordered', gr_keys=gr_keys)
        dd = daf_ch.ch_col_names(dd, 'order_dups', 'dup_count')  # rename dup_count -> order_dups
        del dd['kw_lower_ascii_ordered']
        # merge d and dd
        d = d.merge(dd, how='outer')
        # replace nans with 0s
        d = d.fillna(0)
        return d
    else:
        df = df.copy()  # make a copy
        if dup_def == 'kw_lower':
            d = add_col(df, 'kw_lower', overwrite=False)
            gr_keys = oc.union(['kw_lower'], gr_keys)
        elif dup_def == 'kw_lower_ascii':
            d = add_col(df, 'kw_lower_ascii', overwrite=False)
            gr_keys = oc.union(['kw_lower_ascii'], gr_keys)
        elif dup_def == 'kw_lower_ascii_ordered':
            d = df[df.match_type == 'Broad']
            d = add_col(d, 'kw_lower_ascii_ordered', overwrite=False)
            gr_keys = oc.union(['kw_lower_ascii_ordered'], gr_keys)
        else:
            raise ValueError("don't know how to handle that dup_def")
        assert_dependencies(d, gr_keys, "to get duplicates")
        return get_duplicates(d, gr_keys, keep_count=True)
Example #12
def mk_df_of_travel_domains():
    # set up resources
    html_folder = '/D/Dropbox/dev/py/data/html/google_results_tests/'
    file_list = [
        'hotel - 100 Google Search Results.html',
        'find hotel deals - 100 Google Search Results.html',
        'hotel travel sites - 100 Google Search Results.html',
        'find hotels - 100 Google Search Results.html',
        'hotel paris - 100 Google Search Results.html',
        'hotel rome - 100 Google Search Results.html',
        'hotel london - 100 Google Search Results.html',
        'hotel nyc - 100 Google Search Results.html',
        'hotels in france - 100 Google Search Results.html',
        'hotels in italy - 100 Google Search Results.html'
    ]
    filepath_list = [os.path.join(html_folder, f) for f in file_list]
    # parse all this
    r = [google.mk_gresult_tag_dict(f) for f in filepath_list]
    r = [google.parse_tag_dict(f) for f in r]
    # make domain lists
    org_domain_list = []
    ads_domain_list = []
    tads_domain_list = []
    for rr in r:
        rrr = rr['organic_results_list']
        org_domain_list = org_domain_list + [
            x['domain'] for x in rrr if 'domain' in x
        ]
        rrr = rr['rhs_ads_list']
        ads_domain_list = ads_domain_list + [
            x['disp_url_domain'] for x in rrr if 'disp_url_domain' in x
        ]
        rrr = rr['top_ads_list']
        ads_domain_list = ads_domain_list + [
            x['disp_url_domain'] for x in rrr if 'disp_url_domain' in x
        ]
    domain_list = org_domain_list + ads_domain_list
    print("number of org_domain_list entries = %d" % len(org_domain_list))
    print("number of ads_domain_list entries = %d" % len(ads_domain_list))
    print("number of (all) domain_list entries = %d" % len(domain_list))
    # make a dataframe counting the number of times we encounter each domain
    df = pd.DataFrame(domain_list, columns=['domain'])
    dg = df.groupby('domain').count()  #agg([('domain_count','len')])
    dg = daf_ch.ch_col_names(dg, 'count', 'domain')
    thresh = 4
    print("length before removing count<%d entries = %d" % (thresh, len(dg)))
    dg = dg[dg['count'] >= thresh]
    print("length before removing count<%d entries = %d" % (thresh, len(dg)))
    dg['frequency'] = dg['count'] / float(max(dg['count']))
    dg = dg.sort_values(by=['count'], ascending=False)
    dg.head(30)
    # return this!
    return dg
Example #13
File: term_stats.py Project: SRHerzog/ut
def termdoc_to_term_idf(term_doc_df, doc_var=None, term_var='term'):
    # processing input
    term_doc_df, doc_var, term_var = __process_term_doc_var_names__(term_doc_df, doc_var=doc_var, term_var=term_var)
    # get the number of docs
    num_of_docs = len(np.unique(term_doc_df[doc_var]))
    # get doc_counts
    term_doc_df = termdoc_to_doc_counts(term_doc_df, doc_var, term_var)
    # # keep only doc and terms, and one copy of any (doc,term) pair
    # term_doc_df = term_doc_df[[doc_var,'term']].drop_duplicates(cols=[doc_var, 'term']).reset_index(drop=True)
    # # group by terms
    # term_doc_df = term_doc_df[['term']].groupby('term').count()
    # the doc-count column produced by termdoc_to_doc_counts is named 'count'
    term_doc_df['count'] = \
        semantics_math.idf_log10(num_of_docs_containing_term=np.array(term_doc_df['count']),
                                 num_of_docs=float(num_of_docs))
    return daf_ch.ch_col_names(term_doc_df, ['stat'], ['count'])
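The semantics_math.idf_log10 helper is not shown on this page; presumably it is the standard log10-scaled inverse document frequency:

import numpy as np

def idf_log10(num_of_docs_containing_term, num_of_docs):
    # presumed definition: idf(t) = log10(N / df(t)), vectorized over terms
    return np.log10(num_of_docs / num_of_docs_containing_term)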
Example #14
File: term_stats.py Project: yz-/ut
def termdoc_to_term_idf(term_doc_df, doc_var=None, term_var='term'):
    # processing input
    term_doc_df, doc_var, term_var = __process_term_doc_var_names__(term_doc_df, doc_var=doc_var, term_var=term_var)
    # get the number of docs
    num_of_docs = len(np.unique(term_doc_df[doc_var]))
    # get doc_counts
    term_doc_df = termdoc_to_doc_counts(term_doc_df, doc_var, term_var)
    # # keep only doc and terms, and one copy of any (doc,term) pair
    # term_doc_df = term_doc_df[[doc_var,'term']].drop_duplicates(cols=[doc_var, 'term']).reset_index(drop=True)
    # # group by terms
    # term_doc_df = term_doc_df[['term']].groupby('term').count()
    # the doc-count column produced by termdoc_to_doc_counts is named 'count'
    term_doc_df['count'] = \
        semantics_math.idf_log10(num_of_docs_containing_term=np.array(term_doc_df['count']),
                                 num_of_docs=float(num_of_docs))
    return daf_ch.ch_col_names(term_doc_df, ['stat'], ['count'])
Example #15
File: khan01_spike.py Project: yz-/ut
def mk_df_of_travel_domains():
    # set up resources
    html_folder = '/D/Dropbox/dev/py/data/html/google_results_tests/'
    file_list = ['hotel - 100 Google Search Results.html',
                 'find hotel deals - 100 Google Search Results.html',
                 'hotel travel sites - 100 Google Search Results.html',
                 'find hotels - 100 Google Search Results.html',
                 'hotel paris - 100 Google Search Results.html',
                 'hotel rome - 100 Google Search Results.html',
                 'hotel london - 100 Google Search Results.html',
                 'hotel nyc - 100 Google Search Results.html',
                 'hotels in france - 100 Google Search Results.html',
                 'hotels in italy - 100 Google Search Results.html']
    filepath_list = [os.path.join(html_folder,f) for f in file_list]
    # parse all this
    r = [google.mk_gresult_tag_dict(f) for f in filepath_list]
    r = [google.parse_tag_dict(f) for f in r]
    # make domain lists
    org_domain_list = []
    ads_domain_list = []
    tads_domain_list = []
    for rr in r:
        rrr = rr['organic_results_list']
        org_domain_list = org_domain_list + [x['domain'] for x in rrr if x.has_key('domain')]
        rrr = rr['rhs_ads_list']
        ads_domain_list = ads_domain_list + [x['disp_url_domain'] for x in rrr if x.has_key('disp_url_domain')]
        rrr = rr['top_ads_list']
        ads_domain_list = ads_domain_list + [x['disp_url_domain'] for x in rrr if x.has_key('disp_url_domain')]
    domain_list = org_domain_list + ads_domain_list
    print "number of org_domain_list entries = %d" % len(org_domain_list)
    print "number of ads_domain_list entries = %d" % len(ads_domain_list)
    print "number of (all) domain_list entries = %d" % len(domain_list)
    # make a dataframe counting the number of times we encounter each domain
    df = pd.DataFrame(domain_list,columns=['domain'])
    dg = df.groupby('domain').count() #agg([('domain_count','len')])
    dg = daf_ch.ch_col_names(dg,'count','domain')
    thresh = 4
    print "length before removing count<%d entries = %d" % (thresh,len(dg))
    dg = dg[dg['count']>=thresh]
    print "length before removing count<%d entries = %d" % (thresh,len(dg))
    dg['frequency'] = dg['count']/float(max(dg['count']))
    dg = dg.sort(columns=['count'],ascending=False)
    dg.head(30)
    # return this!
    return dg
Example #16
File: term_stats.py Project: yz-/ut
def termdoc_to_termdoc_count(term_doc_df, doc_var=None, term_var='term', count_var='count'):
    # processing input
    term_doc_df, doc_var, term_var = __process_term_doc_var_names__(term_doc_df, doc_var=doc_var, term_var=term_var)
    term_doc_df = term_doc_df.groupby([doc_var, term_var]).size().to_frame(term_var)
    term_doc_df = daf_ch.ch_col_names(term_doc_df, count_var, term_var)
    return term_doc_df
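A hypothetical run, counting repeated (doc, term) pairs:

td = pd.DataFrame({'doc': ['d1', 'd1', 'd1'], 'term': ['a', 'a', 'b']})
termdoc_to_termdoc_count(td, doc_var='doc')
# expected counts: (d1, a) -> 2, (d1, b) -> 1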
Example #17
File: term_stats.py Project: SRHerzog/ut
def termdoc_to_termdoc_count(term_doc_df, doc_var=None, term_var='term', count_var='count'):
    # processing input
    term_doc_df, doc_var, term_var = __process_term_doc_var_names__(term_doc_df, doc_var=doc_var, term_var=term_var)
    term_doc_df = term_doc_df.groupby([doc_var, term_var]).size().to_frame(term_var)
    term_doc_df = daf_ch.ch_col_names(term_doc_df, count_var, term_var)
    return term_doc_df
Example #18
def ch_col_names(self, df):
    # variable_map maps old column names (keys) to new ones (values)
    return daf_ch.ch_col_names(df, list(self.variable_map.values()),
                               list(self.variable_map.keys()))
Example #19
def id_year_of_birth(raw_life_course_data, **kwargs):
    # the earliest year observed for an id is taken as its year of birth
    d = raw_life_course_data[['id', 'year']].groupby('id').min()
    return daf_ch.ch_col_names(d, 'yob', 'year').reset_index()
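A hypothetical run:

raw = pd.DataFrame({'id': [1, 1, 2], 'year': [1984, 1990, 1975]})
id_year_of_birth(raw)
#    id   yob
# 0   1  1984
# 1   2  1975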
Example #20
File: sutils.py Project: SRHerzog/ut
def add_is_unik():
    # tok_lists comes from the enclosing scope; count occurrences of each tok_str
    tok_str_count = daf_ch.ch_col_names(
        tok_lists[['tok_str']].groupby('tok_str').count(), 'tok_str')
Example #21
File: sutils.py Project: yz-/ut
def add_is_unik():
    tok_str_count = daf_ch.ch_col_names(tok_lists[['tok_str']].groupby('tok_str').count(), 'tok_str')
Example #22
File: aw_editor.py Project: yz-/ut
def mk_awe_cols(df):
    old_cols = df.columns
    new_cols = list(map(awe_col_replacer, old_cols))
    return daf_ch.ch_col_names(df, new_cols, old_cols)
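awe_col_replacer is not shown on this page. A purely hypothetical stand-in, just to illustrate the shape of the transformation:

def awe_col_replacer(col):
    # hypothetical: turn snake_case report headers into AdWords-Editor-style
    # title-cased names, e.g. 'ad_group' -> 'Ad Group'
    return col.replace('_', ' ').title()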