Example #1
File: accessor.py Project: SRHerzog/ut
def unicode_save(self, obj, key_name, bucket_name):
    # Strings are coerced to unicode before being dumped to S3 as text;
    # any other object is dumped (pickled) as-is.
    if isinstance(obj, basestring):
        self.s3.dumps(the_str=pstr_trans.to_unicode_or_bust(obj),
                      key_name=key_name,
                      bucket_name=bucket_name)
    else:
        self.s3.dumpo(obj=obj, key_name=key_name, bucket_name=bucket_name)
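None of these snippets define `to_unicode_or_bust` itself; they import it from `pstr_trans` (or directly). The name comes from a well-known Python 2 idiom (Kumar McMillan's "Unicode in Python, Completely Demystified"), so the `ut` version presumably looks like this minimal sketch:

def to_unicode_or_bust(obj, encoding='utf-8'):
    # Decode byte strings to unicode, or "bust" with a UnicodeDecodeError
    # if the bytes are not valid in the given encoding (UTF-8 assumed here).
    if isinstance(obj, basestring) and not isinstance(obj, unicode):
        obj = unicode(obj, encoding)
    return obj  # unicode strings and non-string objects pass through unchanged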
Example #2
File: khan01_spike.py Project: SRHerzog/ut
def process_text_for_word_count(text):
    """
    Preprocesses the text before it will be fed to the tokenizer.
    Here we should do things like lower-casing the text, casting letters to "simple" ("ascii", non-accented)
    letters, replacing some common strings (such as "bed and breakfast" or "New York") with single-token
    representatives (such as "b&b" or "new_york"), and whatever else needs to be done before tokens are retrieved from the text.
    """
    return toascii(to_unicode_or_bust(text)).lower()
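A hypothetical call, assuming `toascii` folds accented characters to their closest ASCII equivalents:

# 'Café Noël' as a UTF-8 byte string -> u'cafe noel' (assuming toascii strips accents)
print process_text_for_word_count('Caf\xc3\xa9 No\xc3\xabl')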
Example #3
File: accessor.py Project: yz-/ut
def unicode_load(self, key_name, bucket_name):
    """
    Try loading as a pickled object; if that fails, fall back to loading
    the raw string and decoding it to unicode.
    """
    try:
        return self.s3.loado(key_name=key_name, bucket_name=bucket_name)
    except Exception:  # not a bare except, which would also swallow KeyboardInterrupt
        return pstr_trans.to_unicode_or_bust(
            self.s3.loads(key_name=key_name, bucket_name=bucket_name))
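Catching everything here masks real S3 errors (missing keys, permission failures) behind the string fallback. A tighter variant, a sketch assuming the failure of interest is an un-unpicklable payload:

import cPickle as pickle

def unicode_load(self, key_name, bucket_name):
    try:
        return self.s3.loado(key_name=key_name, bucket_name=bucket_name)
    except (pickle.UnpicklingError, EOFError, ValueError):
        # fall back to the raw-string path only when the payload isn't a pickle
        return pstr_trans.to_unicode_or_bust(
            self.s3.loads(key_name=key_name, bucket_name=bucket_name))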
Example #4
def kw_str(keyword):
    """
    Produce a kw_str version of the input keyword (or list of keywords):
    lower, toascii, and strip_kw are applied.
    """
    # return strip_kw(pstr_trans.lower(pstr_trans.toascii(pstr_trans.to_unicode_or_bust(keyword))))
    def _kw_str_one(kw):
        return str(strip_kw(pstr_trans.lower(
            pstr_trans.toascii(pstr_trans.to_unicode_or_bust(kw)))))
    if isinstance(keyword, basestring):
        return _kw_str_one(keyword)
    else:
        return map(_kw_str_one, keyword)
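Hypothetical calls (the behavior of `strip_kw` is not shown in these snippets; here it is assumed to trim surrounding whitespace and keyword punctuation, so the outputs are illustrative):

print kw_str(u'  Caf\xe9 Latte ')            # -> 'cafe latte' (assumed)
print kw_str([u'New York', u'B\xe9b\xe9'])   # -> ['new york', 'bebe'] (assumed)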
Example #5
File: accessor.py Project: SRHerzog/ut
def unicode_load(self, key_name, bucket_name):
    """
    Try loading as a pickled object; if that fails, fall back to loading
    the raw string and decoding it to unicode.
    """
    try:
        return self.s3.loado(key_name=key_name, bucket_name=bucket_name)
    except Exception:  # not a bare except, which would also swallow KeyboardInterrupt
        return pstr_trans.to_unicode_or_bust(
            self.s3.loads(key_name=key_name, bucket_name=bucket_name))
Example #6
File: dup_diag.py Project: yz-/ut
def kw_dup_diagnosis(df,
                     grp_keys=['match_type'],  # grp_keys=['match_type', 'ad_group', 'campaign']?
                     grp_fun_dict={'dups': lambda x: len(x)},
                     grp_id_name='grp_id',
                     grp_id_type='int',
                     output_nondup_df=False):
    dup_df_dict = dict()
    grp_keys = oc.intersect(df.columns, grp_keys) + ['kw_representative']
    df = df.copy()  # so we don't mutate the caller's df (drop the copy if memory is tight)
    df.keyword = df.keyword.apply(to_unicode_or_bust)  # coerce all keyword strings to unicode

    # util function: returns a DataFrame containing the grp_id and dups columns of a df
    def _get_grp_id_and_dups(df):
        """
        this function makes grp_id and dups duplication info columns and returns only those rows with dups>1
        NOTE: It is not meant to be used externally, but by the kw_dup_diagnosis() only
        """
        df = daf_dup_diag.ad_group_info_cols(df,
                                             grp_keys=grp_keys,
                                             grp_fun_dict=grp_fun_dict,
                                             grp_id_name=grp_id_name,
                                             grp_id_type=grp_id_type)
        if len(df) > 0:
            return df[['grp_id', 'dups']][df.dups > 1]
        else:  # return an empty DataFrame, but with the usual columns (necessary for the joins below)
            return pd.DataFrame(columns=['grp_id', 'dups'])

    # make a kw_representative column where different "group representatives" will be placed
    df['kw_representative'] = df['keyword']
    # get the kw_stripped duplicates
    df['kw_representative'] = aw_manip.strip_kw(df['kw_representative'])
    dup_df_dict['strip'] = _get_grp_id_and_dups(df)
    # get the kw_lower duplicates
    df['kw_representative'] = pstr_trans.lower(df['kw_representative'])
    dup_df_dict['lower'] = _get_grp_id_and_dups(df)
    # get the ascii duplicates
    df['kw_representative'] = pstr_trans.toascii(df['kw_representative'])
    dup_df_dict['ascii'] = _get_grp_id_and_dups(df)
    # get the order duplicates (only for Broads)
    d = df[df.match_type == 'Broad'].copy()  # .copy() avoids writing into a view of df
    d['kw_representative'] = aw_manip.order_words(d['kw_representative'])
    dup_df_dict['order'] = _get_grp_id_and_dups(d)
    # join all this together
    d = dup_df_dict['strip'].join(dup_df_dict['lower'], how='outer', lsuffix='_strip').fillna(0)
    d = d.join(dup_df_dict['ascii'], how='outer', lsuffix='_lower').fillna(0)
    d = d.join(dup_df_dict['order'], how='outer', lsuffix='_ascii', rsuffix='_order').fillna(0)
    del df['kw_representative']
    d = d.join(df)
    if not output_nondup_df:
        return d
    else:
        named_tuple = collections.namedtuple('dup_stats', ['dup_diag_df', 'non_dup_df'])
        return named_tuple(dup_diag_df=d, non_dup_df=df.loc[list(set(df.index) - set(d.index))])
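A hypothetical run on a minimal keyword DataFrame (column names as used above; the `ut` helper modules are assumed to be imported as in the source file, and the exact output depends on them):

import pandas as pd

df = pd.DataFrame({
    'keyword': [u'Caf\xe9 latte', u'cafe latte ', u'latte cafe'],
    'match_type': ['Broad', 'Broad', 'Broad'],
})
diag = kw_dup_diagnosis(df)
# diag should hold one row per keyword that collides with another after
# stripping, lowering, ASCII-folding, or (for Broad terms) word reordering,
# with dups_strip / dups_lower / dups_ascii / dups_order counts per group.
print diag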
Example #7
File: dup_diag.py Project: SRHerzog/ut
def kw_dup_diagnosis(df,
                     grp_keys=['match_type'],  # grp_keys=['match_type', 'ad_group', 'campaign']?
                     grp_fun_dict={'dups': lambda x: len(x)},
                     grp_id_name='grp_id',
                     grp_id_type='int',
                     output_nondup_df=False):
    dup_df_dict = dict()
    grp_keys = oc.intersect(df.columns, grp_keys) + ['kw_representative']
    df = df.copy()  # so we don't mutate the caller's df (drop the copy if memory is tight)
    df.keyword = df.keyword.apply(to_unicode_or_bust)  # coerce all keyword strings to unicode

    # util function: returns a DataFrame containing the grp_id and dups columns of a df
    def _get_grp_id_and_dups(df):
        """
        this function makes grp_id and dups duplication info columns and returns only those rows with dups>1
        NOTE: It is not meant to be used externally, but by the kw_dup_diagnosis() only
        """
        df = daf_dup_diag.ad_group_info_cols(df,
                                             grp_keys=grp_keys,
                                             grp_fun_dict=grp_fun_dict,
                                             grp_id_name=grp_id_name,
                                             grp_id_type=grp_id_type)
        if len(df) > 0:
            return df[['grp_id', 'dups']][df.dups > 1]
        else:  # return an empty DataFrame, but with the usual columns (necessary for the joins below)
            return pd.DataFrame(columns=['grp_id', 'dups'])

    # make a kw_representative column where different "group representatives" will be placed
    df['kw_representative'] = df['keyword']
    # get the kw_stripped duplicates
    df['kw_representative'] = aw_manip.strip_kw(df['kw_representative'])
    dup_df_dict['strip'] = _get_grp_id_and_dups(df)
    # get the kw_lower duplicates
    df['kw_representative'] = pstr_trans.lower(df['kw_representative'])
    dup_df_dict['lower'] = _get_grp_id_and_dups(df)
    # get the ascii duplicates
    df['kw_representative'] = pstr_trans.toascii(df['kw_representative'])
    dup_df_dict['ascii'] = _get_grp_id_and_dups(df)
    # get the order duplicates (only for Broads)
    d = df[df.match_type == 'Broad'].copy()  # .copy() avoids writing into a view of df
    d['kw_representative'] = aw_manip.order_words(d['kw_representative'])
    dup_df_dict['order'] = _get_grp_id_and_dups(d)
    # join all this together
    d = dup_df_dict['strip'].join(dup_df_dict['lower'],
                                  how='outer',
                                  lsuffix='_strip').fillna(0)
    d = d.join(dup_df_dict['ascii'], how='outer', lsuffix='_lower').fillna(0)
    d = d.join(dup_df_dict['order'],
               how='outer',
               lsuffix='_ascii',
               rsuffix='_order').fillna(0)
    del df['kw_representative']
    d = d.join(df)
    if not output_nondup_df:
        return d
    else:
        named_tuple = collections.namedtuple('dup_stats', ['dup_diag_df', 'non_dup_df'])
        return named_tuple(dup_diag_df=d,
                           non_dup_df=df.loc[list(set(df.index) - set(d.index))])
Example #8
def lower_series(sr):
    return sr.apply(lambda x: to_unicode_or_bust(x).lower())
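A hypothetical use on a pandas Series holding a mix of byte and unicode strings (assuming `to_unicode_or_bust` as sketched under Example #1):

import pandas as pd

sr = pd.Series(['HELLO', 'Caf\xc3\xa9'])  # the second entry is UTF-8 bytes
print lower_series(sr).tolist()           # -> [u'hello', u'caf\xe9']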
Example #9
File: accessor.py Project: yz-/ut
def unicode_save(self, obj, key_name, bucket_name):
    if isinstance(obj, basestring):
        self.s3.dumps(the_str=pstr_trans.to_unicode_or_bust(obj), key_name=key_name, bucket_name=bucket_name)
    else:
        self.s3.dumpo(obj=obj, key_name=key_name, bucket_name=bucket_name)
Example #10
File: accessor.py Project: yz-/ut
def unicode_load(self, filepath=None, **kwargs):
    """
    Try pd.from_pickle, then pickle loading; if neither works, fall back to
    reading the file as a string (the result is then decoded to unicode).
    """
    return pstr_trans.to_unicode_or_bust(self.simple_load(filepath=filepath, **kwargs))
Example #11
File: manip.py Project: yz-/ut
def lower_series(sr):
    return sr.apply(lambda x: to_unicode_or_bust(x).lower())
Example #12
File: accessor.py Project: SRHerzog/ut
def unicode_load(self, filepath=None, **kwargs):
    """
    Try pd.from_pickle, then pickle loading; if neither works, fall back to
    reading the file as a string (the result is then decoded to unicode).
    """
    return pstr_trans.to_unicode_or_bust(
        self.simple_load(filepath=filepath, **kwargs))