import pandas as pd

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

from dirty_cat import datasets
from dirty_cat import SimilarityEncoder, TargetEncoder

# Encoding methods to compare.
encoder_dict = {
    'one-hot': OneHotEncoder(handle_unknown='ignore'),
    'similarity': SimilarityEncoder(similarity='ngram',
                                    handle_unknown='ignore'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'num': FunctionTransformer(None),
}

data_file = datasets.fetch_employee_salaries()

for method in ['one-hot', 'target', 'similarity']:
    # Load the data.
    df = pd.read_csv(data_file).astype(str)
    # Salaries are stored as strings with a leading '$'.
    df['Current Annual Salary'] = [
        float(s[1:]) for s in df['Current Annual Salary']]
    # Keep only the year from dates formatted as mm/dd/yyyy.
    df['Year First Hired'] = [
        int(s.split('/')[-1]) for s in df['Date First Hired']]

    target_column = 'Current Annual Salary'
    y = df[target_column].values.ravel()
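    # --- Minimal sketch of how the rest of the loop body could continue
    # (the evaluation step is not shown in this excerpt): encode one dirty
    # column and score a cross-validated ridge regression. The column name
    # 'Employee Position Title' and the pipeline wiring are assumptions,
    # and the 'target' method may additionally need clf_type='regression'
    # for this continuous target.
    from sklearn.pipeline import make_pipeline
    from sklearn.model_selection import cross_val_score

    encoder = encoder_dict[method]
    X = df[['Employee Position Title']].values  # assumed dirty column
    pipeline = make_pipeline(encoder, RidgeCV())
    scores = cross_val_score(pipeline, X, y,
                             cv=KFold(n_splits=5, shuffle=True,
                                      random_state=0))
    print('%s: R^2 = %.3f +/- %.3f' % (method, scores.mean(), scores.std()))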
def __init__(
    self,
    encoder_name,
    reduction_method=None,
    ngram_range=(2, 4),
    categories="auto",
    dtype=np.float64,
    handle_unknown="ignore",
    clf_type=None,
    n_components=None,
):
    self.ngram_range = ngram_range
    self.encoder_name = encoder_name
    self.categories = categories
    self.dtype = dtype
    self.clf_type = clf_type
    self.handle_unknown = handle_unknown
    self.reduction_method = reduction_method
    self.n_components = n_components
    # Lookup table mapping an encoder name to a ready-to-fit transformer
    # (or pipeline) instance.
    self.encoders_dict = {
        "OneHotEncoder": OneHotEncoder(handle_unknown="ignore"),
        "OneHotEncoder-1": OneHotEncoderRemoveOne(handle_unknown="ignore"),
        "Categorical": None,
        "OneHotEncoderDense": OneHotEncoder(
            handle_unknown="ignore", sparse=False),
        "OneHotEncoderDense-1": OneHotEncoderRemoveOne(
            handle_unknown="ignore", sparse=False),
        "SimilarityEncoder": SimilarityEncoder(
            ngram_range=self.ngram_range, random_state=10),
        "NgramNaiveFisherKernel": NgramNaiveFisherKernel(
            ngram_range=self.ngram_range, random_state=10),
        "ngrams_hot_vectorizer": [],
        "NgramsCountVectorizer": CountVectorizer(
            analyzer="char", ngram_range=self.ngram_range),
        "NgramsTfIdfVectorizer": TfidfVectorizer(
            analyzer="char", ngram_range=self.ngram_range,
            smooth_idf=False),
        "WordNgramsTfIdfVectorizer": TfidfVectorizer(
            analyzer="word", ngram_range=(1, 1), smooth_idf=False),
        "TargetEncoder": TargetEncoder(
            clf_type=self.clf_type, handle_unknown="ignore"),
        "MDVEncoder": MDVEncoder(self.clf_type),
        "BackwardDifferenceEncoder": cat_enc.BackwardDifferenceEncoder(),
        "BinaryEncoder": cat_enc.BinaryEncoder(),
        "HashingEncoder": cat_enc.HashingEncoder(),
        "HelmertEncoder": cat_enc.HelmertEncoder(),
        "SumEncoder": cat_enc.SumEncoder(),
        "PolynomialEncoder": cat_enc.PolynomialEncoder(),
        "BaseNEncoder": cat_enc.BaseNEncoder(),
        "LeaveOneOutEncoder": cat_enc.LeaveOneOutEncoder(),
        "NgramsLDA": Pipeline([
            ("ngrams_count", CountVectorizer(
                analyzer="char", ngram_range=self.ngram_range)),
            ("LDA", LatentDirichletAllocation(
                n_components=self.n_components, learning_method="batch")),
        ]),
        "NMF": Pipeline([
            ("ngrams_count", CountVectorizer(
                analyzer="char", ngram_range=self.ngram_range)),
            ("NMF", NMF(n_components=self.n_components)),
        ]),
        "WordNMF": Pipeline([
            ("ngrams_count", CountVectorizer(
                analyzer="word", ngram_range=(1, 1))),
            ("NMF", NMF(n_components=self.n_components)),
        ]),
        "NgramsMultinomialMixture": NgramsMultinomialMixture(
            n_topics=self.n_components, max_iters=10),
        "AdHocNgramsMultinomialMixture": AdHocNgramsMultinomialMixture(
            n_iters=0),
        "AdHocIndependentPDF": AdHocIndependentPDF(),
        "OnlineGammaPoissonFactorization":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, rho=0.99, r=None, tol=1e-4,
                random_state=18, init="k-means++",
                ngram_range=self.ngram_range, rescale_W=True,
                max_iter_e_step=10),
        "OnlineGammaPoissonFactorization2":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=0.3, rho=None,
                batch_size=256, tol=1e-4, random_state=18,
                init="k-means++", ngram_range=self.ngram_range,
                rescale_W=True, max_iter_e_step=20),
        "OnlineGammaPoissonFactorization3":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=0.3, rho=None,
                batch_size=256, tol=1e-4, random_state=18,
                init="k-means", ngram_range=self.ngram_range,
                rescale_W=True, max_iter_e_step=20),
        "OnlineGammaPoissonFactorization4":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=None, rho=0.95,
                batch_size=256, tol=1e-4, random_state=18,
                init="k-means", ngram_range=self.ngram_range,
                rescale_W=True, max_iter_e_step=20),
        "WordOnlineGammaPoissonFactorization":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=0.3, tol=1e-4,
                random_state=18, init="k-means++", ngram_range=(1, 1),
                analyzer="word", rescale_W=True, max_iter_e_step=10),
        "OnlineGammaPoissonFactorization_fast":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=0.3, ngram_range=(3, 3),
                max_iter=1, min_iter=1, tol=1e-4, random_state=18,
                init="k-means++", rescale_W=False),
        "MinHashEncoder": MinHashEncoder(n_components=self.n_components),
        "PretrainedFastText": PretrainedFastText(
            n_components=self.n_components),
        "PretrainedFastText_fr": PretrainedFastText(
            n_components=self.n_components, language="french"),
        "PretrainedFastText_hu": PretrainedFastText(
            n_components=self.n_components, language="hungarian"),
        None: FunctionTransformer(None, validate=True),
        "Passthrough": PasstroughEncoder(),
    }
    # Methods that expect a 1-D array of strings rather than a 2-D column.
    self.list_1D_array_methods = [
        "NgramsCountVectorizer",
        "NgramsTfIdfVectorizer",
        "WordNgramsTfIdfVectorizer",
        "ngrams_hot_vectorizer",
        "NgramsLDA",
        "NMF",
        "WordNMF",
        "NgramsMultinomialMixture",
        "NgramsMultinomialMixtureKMeans2",
        "AdHocNgramsMultinomialMixture",
        "AdHocIndependentPDF",
        "GammaPoissonFactorization",
        "OnlineGammaPoissonFactorization",
        "WordOnlineGammaPoissonFactorization",
        "OnlineGammaPoissonFactorization2",
        "OnlineGammaPoissonFactorization3",
        "OnlineGammaPoissonFactorization4",
        "OnlineGammaPoissonFactorization_fast",
        "MinHashEncoder",
        "MinMeanMinHashEncoder",
    ]
def categorical_encoding(A, B, y_train, encoder, clf_type, n_jobs):
    '''Build the encoding matrix.

    Given two arrays of strings to compare and an encoder, return the
    corresponding encoded matrix of size len(A) x len(B).'''
    if encoder == 'levenshtein-ratio_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder(similarity='levenshtein-ratio')
        encoder.fit(B)
        return encoder.transform(A.reshape(-1, 1))
    if encoder == 'one-hot_encoding':
        return one_hot_encoding(A, B)
    if encoder == 'one-hot_encoding_sparse':
        return sparse.csr_matrix(one_hot_encoding_sparse(A, B))
    if encoder == 'jaccard_similarity':
        print('Warning: %s is not a well-defined similarity metric '
              'because two different values can have a similarity of 1'
              % encoder)
        B = np.unique(B)
        unqA = np.unique(A)
        vdist = np.vectorize(dist.jaccard)
        # dvec = Parallel(n_jobs=n_jobs)(
        #     delayed(vdist)(a, B.reshape(1, -1)) for a in unqA)
        dvec = [vdist(a, B.reshape(1, -1)) for a in unqA]
        ddict = {unqA[i]: dvec[i] for i in range(len(dvec))}
        dm = np.vstack([ddict[a] for a in A])
        return 1 - dm
    if encoder == 'sorensen_similarity':
        B = np.unique(B)
        unqA = np.unique(A)
        vdist = np.vectorize(dist.sorensen)
        # dvec = Parallel(n_jobs=n_jobs)(
        #     delayed(vdist)(a, B.reshape(1, -1)) for a in unqA)
        dvec = [vdist(a, B.reshape(1, -1)) for a in unqA]
        ddict = {unqA[i]: dvec[i] for i in range(len(dvec))}
        dm = np.vstack([ddict[a] for a in A])
        return 1 - dm
    if encoder == 'jaro-winkler_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder(similarity='jaro-winkler')
        encoder.fit(B)
        return encoder.transform(A.reshape(-1, 1))
    # For the remaining n-gram methods the leading digit of the encoder
    # name gives the n-gram size, e.g. '3gram_SimilarityEncoder'.
    if encoder[1:] == 'gram_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder()
        encoder.fit(B)
        return encoder.transform(A.reshape(-1, 1))
    # The ngram_similarity variants share the same call pattern and differ
    # only in their sim_type flag.
    ngram_sim_types = {
        'gram_similarity2': 'sim2',
        'gram_similarity2_1': 'sim2_1',
        'gram_similarity2_2': 'sim2_2',
        'gram_similarity3': 'sim3',
        'gram_similarity3_2': 'sim3_2',
        'gram_similarity4': 'sim4',
        'gram_similarity5': 'sim5',
        'gram_similarity6': 'sim6',
        'gram_similarity7': 'sim7',
    }
    if encoder[1:] in ngram_sim_types:
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n,
                                sim_type=ngram_sim_types[encoder[1:]])
    if encoder[1:] == 'gram_presence_fisher_kernel':
        n = int(encoder[0])
        return ngram_similarity(A, B, n, sim_type='fisher_kernel')
    if encoder[1:] == 'grams_count_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_tfidf_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(n, n),
                                     smooth_idf=False)
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_tf_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(n, n),
                                     smooth_idf=False, use_idf=False)
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_hot_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        vectorizer.fit(B)
        count_matrix = vectorizer.transform(A)
        # Binarize the counts into presence/absence features.
        return (count_matrix > 0).astype('float64')
    if encoder[1:] == 'grams_hot_vectorizer_tfidf':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        presenceB = (vectorizer.fit_transform(B) > 0).astype('float64')
        presenceA = (vectorizer.transform(A) > 0).astype('float64')
        transformer = TfidfTransformer(smooth_idf=True)
        transformer.fit(presenceB)
        return transformer.transform(presenceA)
    if encoder[1:] == 'grams_hashing':
        n = int(encoder[0])
        return ngrams_hashing_vectorizer(A, n, 10000)
    if encoder == 'TargetEncoder':
        encoder = TargetEncoder(clf_type=clf_type, handle_unknown='ignore')
        encoder.fit(B.reshape(-1, 1), y_train)
        return encoder.transform(A.reshape(-1, 1))
    if encoder == 'MDVEncoder':
        return mdv_encoding(A, B, y_train, clf_type)
    # The category_encoders transformers below all follow the same
    # fit-on-B, transform-A pattern.
    ce_encoders = ('BackwardDifferenceEncoder', 'BinaryEncoder',
                   'HashingEncoder', 'HelmertEncoder', 'OneHotEncoder',
                   'OrdinalEncoder', 'SumEncoder', 'PolynomialEncoder',
                   'BaseNEncoder')
    if encoder in ce_encoders:
        ce_encoder = getattr(ce, encoder)()
        ce_encoder.fit(B)
        return ce_encoder.transform(A)
    if encoder == 'LeaveOneOutEncoder':
        ce_encoder = ce.LeaveOneOutEncoder()
        ce_encoder.fit(B, y_train)
        return ce_encoder.transform(A)
    raise ValueError('Encoder %s has not been implemented yet.' % encoder)
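# A quick illustration of calling the dispatcher above; the toy values are
# made up, and clf_type='regression' / n_jobs=1 are arbitrary choices.
import numpy as np

B_train = np.array(['Police Officer', 'Office Assistant', 'Firefighter'])
A_test = np.array(['Police Officer III', 'Office Services Coordinator'])
y_train = np.array([60000., 45000., 75000.])

X_enc = categorical_encoding(A_test, B_train, y_train,
                             encoder='3gram_SimilarityEncoder',
                             clf_type='regression', n_jobs=1)
print(X_enc.shape)  # (len(A_test), number of unique values in B_train)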