def unicode_save(self, obj, key_name, bucket_name):
    """Persist obj to S3 under key_name in bucket_name.

    Strings are coerced to unicode and stored as text (dumps); any
    other object is stored pickled (dumpo).
    """
    if not isinstance(obj, basestring):
        self.s3.dumpo(obj=obj, key_name=key_name, bucket_name=bucket_name)
    else:
        self.s3.dumps(
            the_str=pstr_trans.to_unicode_or_bust(obj),
            key_name=key_name,
            bucket_name=bucket_name,
        )
def process_text_for_word_count(text):
    """Normalize raw text before it is handed to the tokenizer.

    Coerces the input to unicode, folds "accented" letters down to
    plain ascii, and lower-cases the result. Any further
    pre-tokenization normalization (e.g. replacing common multi-word
    strings such as "bed and breakfast" or "New York" with single-token
    representatives like "b&b" or "new_york") belongs here too.
    """
    as_unicode = to_unicode_or_bust(text)
    return toascii(as_unicode).lower()
def unicode_load(self, key_name, bucket_name):
    """Load a value from S3: try unpickling first, fall back to text.

    Attempts self.s3.loado (pickle load); if that raises, re-reads the
    key as a raw string and coerces it to unicode.
    """
    try:
        return self.s3.loado(key_name=key_name, bucket_name=bucket_name)
    except Exception:
        # A bare except would also swallow KeyboardInterrupt/SystemExit;
        # Exception keeps the best-effort fallback without hiding those.
        return pstr_trans.to_unicode_or_bust(
            self.s3.loads(key_name=key_name, bucket_name=bucket_name))
def kw_str(keyword):
    """Return the canonical "kw_str" form of a keyword (or list of keywords).

    Normalization pipeline: coerce to unicode, fold accents to ascii,
    lower-case, apply strip_kw, then cast back to str. A single string
    in yields a single str; a list in yields a list of strs.
    """
    def _canonical(kw):
        # One keyword through the full normalization pipeline.
        return str(
            strip_kw(
                pstr_trans.lower(
                    pstr_trans.toascii(
                        pstr_trans.to_unicode_or_bust(kw)))))

    if isinstance(keyword, basestring):
        return _canonical(keyword)
    # Py2 map() returns a list, so a list comprehension is equivalent
    # and clearer than map(lambda ...).
    return [_canonical(kw) for kw in keyword]
def unicode_load(self, key_name, bucket_name):
    """Load a value from S3: try unpickling first, fall back to text.

    Attempts self.s3.loado (pickle load); if that raises, re-reads the
    key as a raw string and coerces it to unicode.
    """
    try:
        return self.s3.loado(key_name=key_name, bucket_name=bucket_name)
    except Exception:
        # A bare except would also swallow KeyboardInterrupt/SystemExit;
        # Exception keeps the best-effort fallback without hiding those.
        return pstr_trans.to_unicode_or_bust(
            self.s3.loads(key_name=key_name, bucket_name=bucket_name))
def kw_dup_diagnosis(df,
                     grp_keys=['match_type'],  # grp_keys=['match_type','ad_group','campaign']?
                     grp_fun_dict={'dups': lambda x: len(x)},
                     grp_id_name='grp_id',
                     grp_id_type='int',
                     output_nondup_df=False):
    """Diagnose duplicate keywords in df under progressively looser equality.

    Builds a 'kw_representative' column and successively applies strip,
    lower-casing, ascii folding, and (for Broad match only) word
    reordering, recording after each step which groups collapse into
    duplicates (dups > 1).

    Returns a dataframe of per-group duplicate counts (one suffixed
    dups column per normalization step) joined back onto df. If
    output_nondup_df is True, returns instead a namedtuple
    (dup_diag_df, non_dup_df) where non_dup_df holds the rows of df
    that never collided.
    """
    dup_df_dict = dict()
    # only group on keys the frame actually has, plus the representative
    grp_keys = oc.intersect(df.columns, grp_keys) + ['kw_representative']
    # work on a copy so the caller's frame is untouched
    df = df.copy()
    # coerce every keyword string to unicode up front
    df.keyword = df.keyword.apply(lambda x: to_unicode_or_bust(x))

    def _get_grp_id_and_dups(df):
        """Attach grp_id/dups columns, return only the rows with dups > 1.

        NOTE: internal helper, meant only for kw_dup_diagnosis().
        """
        df = daf_dup_diag.ad_group_info_cols(df,
                                             grp_keys=grp_keys,
                                             grp_fun_dict=grp_fun_dict,
                                             grp_id_name=grp_id_name,
                                             grp_id_type=grp_id_type)
        if len(df) > 0:
            return df[['grp_id', 'dups']][df.dups > 1]
        else:
            # empty frame, but with the columns the later joins expect
            return pd.DataFrame(columns=['grp_id', 'dups'])

    # the representative starts out as the raw keyword
    df['kw_representative'] = df['keyword']
    # duplicates after stripping
    df['kw_representative'] = aw_manip.strip_kw(df['kw_representative'])
    dup_df_dict['strip'] = _get_grp_id_and_dups(df)
    # ... after lower-casing
    df['kw_representative'] = pstr_trans.lower(df['kw_representative'])
    dup_df_dict['lower'] = _get_grp_id_and_dups(df)
    # ... after ascii folding
    df['kw_representative'] = pstr_trans.toascii(df['kw_representative'])
    dup_df_dict['ascii'] = _get_grp_id_and_dups(df)
    # ... after word reordering (meaningful for Broad match only);
    # .copy() so we write into a real frame, not a view of df
    # (avoids SettingWithCopy: the assignment might not stick otherwise)
    d = df[df.match_type == 'Broad'].copy()
    d['kw_representative'] = aw_manip.order_words(d['kw_representative'])
    dup_df_dict['order'] = _get_grp_id_and_dups(d)
    # join the per-step dup counts together
    d = dup_df_dict['strip'].join(dup_df_dict['lower'], how='outer',
                                  lsuffix='_strip').fillna(0)
    d = d.join(dup_df_dict['ascii'], how='outer', lsuffix='_lower').fillna(0)
    d = d.join(dup_df_dict['order'], how='outer', lsuffix='_ascii',
               rsuffix='_order').fillna(0)
    del df['kw_representative']
    d = d.join(df)
    if not output_nondup_df:
        return d
    else:
        named_tuple = collections.namedtuple('dup_stats',
                                             ['dup_diag_df', 'non_dup_df'])
        # .loc replaces the removed DataFrame.ix for label-based selection
        return named_tuple(dup_diag_df=d,
                           non_dup_df=df.loc[list(set(df.index) - set(d.index))])
def kw_dup_diagnosis(df,
                     grp_keys=['match_type'],  # grp_keys=['match_type','ad_group','campaign']?
                     grp_fun_dict={'dups': lambda x: len(x)},
                     grp_id_name='grp_id',
                     grp_id_type='int',
                     output_nondup_df=False):
    """Diagnose duplicate keywords in df under progressively looser equality.

    Builds a 'kw_representative' column and successively applies strip,
    lower-casing, ascii folding, and (for Broad match only) word
    reordering, recording after each step which groups collapse into
    duplicates (dups > 1).

    Returns a dataframe of per-group duplicate counts (one suffixed
    dups column per normalization step) joined back onto df. If
    output_nondup_df is True, returns instead a namedtuple
    (dup_diag_df, non_dup_df) where non_dup_df holds the rows of df
    that never collided.
    """
    dup_df_dict = dict()
    # only group on keys the frame actually has, plus the representative
    grp_keys = oc.intersect(df.columns, grp_keys) + ['kw_representative']
    # work on a copy so the caller's frame is untouched
    df = df.copy()
    # coerce every keyword string to unicode up front
    df.keyword = df.keyword.apply(lambda x: to_unicode_or_bust(x))

    def _get_grp_id_and_dups(df):
        """Attach grp_id/dups columns, return only the rows with dups > 1.

        NOTE: internal helper, meant only for kw_dup_diagnosis().
        """
        df = daf_dup_diag.ad_group_info_cols(df,
                                             grp_keys=grp_keys,
                                             grp_fun_dict=grp_fun_dict,
                                             grp_id_name=grp_id_name,
                                             grp_id_type=grp_id_type)
        if len(df) > 0:
            return df[['grp_id', 'dups']][df.dups > 1]
        else:
            # empty frame, but with the columns the later joins expect
            return pd.DataFrame(columns=['grp_id', 'dups'])

    # the representative starts out as the raw keyword
    df['kw_representative'] = df['keyword']
    # duplicates after stripping
    df['kw_representative'] = aw_manip.strip_kw(df['kw_representative'])
    dup_df_dict['strip'] = _get_grp_id_and_dups(df)
    # ... after lower-casing
    df['kw_representative'] = pstr_trans.lower(df['kw_representative'])
    dup_df_dict['lower'] = _get_grp_id_and_dups(df)
    # ... after ascii folding
    df['kw_representative'] = pstr_trans.toascii(df['kw_representative'])
    dup_df_dict['ascii'] = _get_grp_id_and_dups(df)
    # ... after word reordering (meaningful for Broad match only);
    # .copy() so we write into a real frame, not a view of df
    # (avoids SettingWithCopy: the assignment might not stick otherwise)
    d = df[df.match_type == 'Broad'].copy()
    d['kw_representative'] = aw_manip.order_words(d['kw_representative'])
    dup_df_dict['order'] = _get_grp_id_and_dups(d)
    # join the per-step dup counts together
    d = dup_df_dict['strip'].join(dup_df_dict['lower'], how='outer',
                                  lsuffix='_strip').fillna(0)
    d = d.join(dup_df_dict['ascii'], how='outer', lsuffix='_lower').fillna(0)
    d = d.join(dup_df_dict['order'], how='outer', lsuffix='_ascii',
               rsuffix='_order').fillna(0)
    del df['kw_representative']
    d = d.join(df)
    if not output_nondup_df:
        return d
    else:
        named_tuple = collections.namedtuple('dup_stats',
                                             ['dup_diag_df', 'non_dup_df'])
        # .loc replaces the removed DataFrame.ix for label-based selection
        return named_tuple(dup_diag_df=d,
                           non_dup_df=df.loc[list(set(df.index) - set(d.index))])
def lower_series(sr):
    """Return a copy of the series with each value unicode-coerced and lower-cased."""
    def _lowered(value):
        return to_unicode_or_bust(value).lower()
    return sr.apply(_lowered)
def unicode_load(self, filepath=None, **kwargs):
    """Load via simple_load and coerce the result to unicode.

    (simple_load tries pd.from_pickle, then pickle.loading, and if
    those don't work, falls back to file_to.string.)
    """
    loaded = self.simple_load(filepath=filepath, **kwargs)
    return pstr_trans.to_unicode_or_bust(loaded)
def unicode_load(self, filepath=None, **kwargs):
    """Load via simple_load, returning the result coerced to unicode.

    (simple_load tries pd.from_pickle, then pickle.loading, and if
    those don't work, falls back to file_to.string.)
    """
    raw = self.simple_load(filepath=filepath, **kwargs)
    return pstr_trans.to_unicode_or_bust(raw)