def test_make_union_kwargs():
    pca = PCA(svd_solver='full')
    mock = Transf()
    fu = make_union(pca, mock, n_jobs=3)
    assert_equal(fu.transformer_list, make_union(pca, mock).transformer_list)
    assert_equal(3, fu.n_jobs)

    # invalid keyword parameters should raise an error message
    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "transformer_weights"',
        make_union, pca, mock,
        transformer_weights={'pca': 10, 'Transf': 1}
    )
def get_results(dataset):
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full,
                                  scoring='neg_mean_squared_error')

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=np.bool),
                                 np.ones(n_missing_samples,
                                         dtype=np.bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # Estimate the score after replacing missing values by 0
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (mean strategy) of the missing values
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = make_pipeline(
        make_union(SimpleImputer(missing_values=0, strategy="mean"),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after chained imputation of the missing values
    estimator = make_pipeline(
        make_union(ChainedImputer(missing_values=0, random_state=0),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                            scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (chained_impute_scores.mean(), chained_impute_scores.std()))
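# Hedged usage sketch (the dataset choice is an assumption, not part of this
# snippet): get_results expects a Bunch-like dataset exposing .data and
# .target, and relies on the module-level rng used above.
# from sklearn.datasets import load_diabetes
# results_diabetes = get_results(load_diabetes())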
def __init__(self, training_values=None, training_targets=None):
    self.vectorizer = make_union(TfidfVectorizer(), PostTransformer())

    # Set using parameter_search. TODO: review after updating corpus.
    self.classifier = svm.LinearSVC(C=1, loss='squared_hinge',
                                    multi_class='ovr',
                                    class_weight='balanced', tol=1e-6)

    if training_values is not None and training_targets is not None:
        self.fit(training_values, training_targets)
def PipelineTelstra(Classifier):
    pipeline = make_pipeline(
        make_union(
            make_pipeline(
                DataSpliterTrans(cols='location', transp=True),
                preprocessing.OneHotEncoder(handle_unknown='ignore')
            ),
            make_pipeline(
                DataSpliterTrans(cols='event_type', matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='severity_type', matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='resource_type', matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='volume', matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='log_feature', matrix=True),
                DictVectorizer()
            )
        ),
        Classifier()
    )
    print('pipeline done.')
    return pipeline
def __init__(self, transforms):
    self.transforms = transforms
    union = make_union(*[t() for t in transforms])
    pipeline = [union]
    self.pipeline = make_pipeline(*pipeline)
    self.classifier = LogisticRegression(penalty="l1", class_weight="auto")
def test_make_union():
    pca = PCA()
    mock = TransfT()
    fu = make_union(pca, mock)
    names, transformers = zip(*fu.transformer_list)
    assert_equal(names, ("pca", "transft"))
    assert_equal(transformers, (pca, mock))
def PipelineTelstra(Classifier):
    pipeline = make_pipeline(
        make_union(
            make_pipeline(
                DataSpliterTrans(cols='event_type', matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='severity_type', matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='resource_type', matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='volume', matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='log_feature', matrix=True),
                DictVectorizer()
            )
        ),
        Classifier()
    )
    print('pipeline done.')
    return pipeline
def __init__(self, classifier="sgd", classifier_args=None, lowercase=True,
             text_replacements=None, map_to_synsets=False, binary=False,
             min_df=0, ngram=1, stopwords=None, limit_train=None,
             map_to_lex=False, duplicates=False):
    self.limit_train = limit_train
    self.duplicates = duplicates

    pipeline = [ExtractText(lowercase)]
    if text_replacements:
        pipeline.append(ReplaceText(text_replacements))

    ext = [build_text_extraction(binary=binary, min_df=min_df,
                                 ngram=ngram, stopwords=stopwords)]
    if map_to_synsets:
        ext.append(build_synset_extraction(binary=binary, min_df=min_df,
                                           ngram=ngram))
    if map_to_lex:
        ext.append(build_lex_extraction(binary=binary, min_df=min_df,
                                        ngram=ngram))
    ext = make_union(*ext)
    pipeline.append(ext)

    # Building classifier
    if classifier_args is None:
        classifier_args = {}
    classifier = _valid_classifiers[classifier](**classifier_args)

    self.pipeline = make_pipeline(*pipeline)
    self.classifier = classifier
def get_extra_features(args):
    forest = ExtraTreesClassifier(n_estimators=2000,
                                  criterion='entropy',
                                  max_features='sqrt',
                                  max_depth=6,
                                  min_samples_split=8,
                                  n_jobs=-1,
                                  bootstrap=True,
                                  oob_score=True,
                                  verbose=1,
                                  class_weight='balanced')
    pca = PCA(n_components=200)
    ica = FastICA(n_components=200, max_iter=1000)
    kmeans = KMeans(n_clusters=200, n_init=20, max_iter=1000)

    pipeline = make_pipeline(selectKFromModel(forest, k=1000),
                             StandardScaler(),
                             make_union(pca, ica, kmeans))

    X_train = np.load('feature/1_100/X_train.npy')
    y_train = np.load('feature/1_100/y_train.npy')
    X_test = np.load('feature/1_100/X_test.npy')

    pipeline.fit(X_train, y_train[:, args.yix])
    sel_ixs = pipeline.steps[0][1].indices[:500]

    X_train_ext = np.hstack((pipeline.transform(X_train), X_train[:, sel_ixs]))
    X_test_ext = np.hstack((pipeline.transform(X_test), X_test[:, sel_ixs]))

    with open(path.join(save_dir, 'pipe.pkl'), 'wb') as f_pipe:
        pickle.dump(pipeline, f_pipe)
    np.save(path.join(save_dir, 'selix.npy'), sel_ixs)

    return X_train_ext, X_test_ext
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
    trans = make_union(
        SimpleImputer(missing_values=missing_values, strategy='most_frequent'),
        MissingIndicator(missing_values=missing_values)
    )
    X_trans = trans.fit_transform(X)
    assert_array_equal(X_trans, X_trans_exp)
def get_pipeline(fsmethods, clfmethod):
    """Return an instance of a sklearn Pipeline built from the parameters.

    The fsmethods are joined in a FeatureUnion, which is then joined with
    clfmethod in a Pipeline.

    Parameters
    ----------
    fsmethods: list of estimators
        All estimators in a pipeline must be transformers
        (i.e. must have a transform method).

    clfmethod: classifier
        The last estimator may be of any type (transformer, classifier, etc.).

    Returns
    -------
    pipe
    """
    feat_union = None
    if not isinstance(fsmethods, list):
        if hasattr(fsmethods, 'transform'):
            feat_union = fsmethods
        else:
            raise ValueError('fsmethods is expected to be either a list or a '
                             'transformer')
    else:
        feat_union = make_union(*fsmethods)

    if feat_union is None:
        pipe = make_pipeline(clfmethod)
    else:
        pipe = make_pipeline(feat_union, clfmethod)

    return pipe
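# Hedged usage sketch (not from the original source): get_pipeline is called
# with a list of transformers and a classifier; the particular selectors and
# the SVC below are illustrative choices only.
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

fsmethods = [SelectKBest(f_classif, k=10), PCA(n_components=5)]
pipe = get_pipeline(fsmethods, SVC(kernel='linear'))
# pipe.fit(X_train, y_train); pipe.predict(X_test)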
def preprocess(self, any_set, is_train):
    if is_train:
        dico_pattern = {'match_lowercase_only': '\\b[a-z]+\\b',
                        'match_word': '\\w{2,}',
                        'match_word1': '(?u)\\b\\w+\\b',
                        'match_word_punct': '\w+|[,.?!;]',
                        'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
                        'match_punct': "[,.?!;'-]"}

        tfv_title = TfidfVectorizer(lowercase=True, stop_words='english',
                                    token_pattern=dico_pattern["match_word1"],
                                    ngram_range=(1, 2), max_df=1.0, min_df=2,
                                    max_features=None, vocabulary=None,
                                    binary=True, norm=u'l2', use_idf=True,
                                    smooth_idf=True, sublinear_tf=True)

        tfv_desc = TfidfVectorizer(lowercase=True, stop_words='english',
                                   token_pattern=dico_pattern["match_word1"],
                                   ngram_range=(1, 2), max_df=1.0, min_df=2,
                                   max_features=None, vocabulary=None,
                                   binary=True, norm=u'l2', use_idf=True,
                                   smooth_idf=True, sublinear_tf=True)

        title_pipe = make_pipeline(ColumnSelector(key='title'), tfv_title)
        desc_pipe = make_pipeline(ColumnSelector(key='description'), tfv_desc)

        self.pipeline = make_union(title_pipe, desc_pipe)
        return self.pipeline.fit_transform(any_set)
    else:
        return self.pipeline.transform(any_set)
def pca_kpca(train_data, labels):
    # make_union already returns a FeatureUnion, so it can be used directly;
    # wrapping its result in another FeatureUnion() call would fail.
    combined = make_union(PCA(), TruncatedSVD(), KernelPCA())
    # equivalent: FeatureUnion([('linear_pca', PCA()),
    #                           ('truncated_svd', TruncatedSVD()),
    #                           ('kernel_pca', KernelPCA())])
    combined.fit(train_data, labels)
    # combined.fit_transform(train_data, labels)
    return combined
def __init__(self, **config):
    # Validate options are present
    for option in _configuration_options:
        if option not in config:
            raise ValueError("Missing configuration "
                             "option {!r}".format(option))

    # Feature extraction
    sparse_features = parse_features(config["sparse_features"])
    densifier = make_pipeline(Vectorizer(sparse_features, sparse=True),
                              ClassifierAsFeature())
    dense_features = parse_features(config["dense_features"])
    vectorization = make_union(densifier,
                               Vectorizer(dense_features, sparse=False))

    # Classifier
    try:
        classifier = _valid_classifiers[config["classifier"]]
    except KeyError:
        raise ValueError("Unknown classification algorithm "
                         "{!r}".format(config["classifier"]))
    classifier = classifier(**config["classifier_args"])

    self.pipeline = make_pipeline(vectorization, StandardScaler())
    self.classifier = classifier
def test_make_union():
    pca = PCA(svd_solver='full')
    mock = Transf()
    fu = make_union(pca, mock)
    names, transformers = zip(*fu.transformer_list)
    assert_equal(names, ("pca", "transf"))
    assert_equal(transformers, (pca, mock))
def get_scores_for_imputer(imputer, X_missing, y_missing):
    estimator = make_pipeline(
        make_union(imputer, MissingIndicator(missing_values=0)),
        REGRESSOR)
    impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                    scoring='neg_mean_squared_error',
                                    cv=N_SPLITS)
    return impute_scores
def fit(self, X, y):
    # Filthy hack: the subject id rides along as the last column of X
    sids = X[:, -1]
    all_pipelines = [make_pipeline(LogisticRegressionCV()).fit(X_s, y_s)
                     for X_s, y_s in subject_splitter(X[:, :-1], y, sids)]
    f_union = make_union(*[FeatureUnionWrapper(p) for p in all_pipelines])
    self.clf_ = make_pipeline(f_union, LogisticRegressionCV()).fit(X[:, :-1], y)
    return self
def _create_feature_union(features):
    """
    Create a FeatureUnion.

    Each "feature" is a 3-tuple: (name, feature_extractor, vectorizer).
    """
    return make_union(*[make_pipeline(fe, vec) for name, fe, vec in features])
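# Hedged usage sketch (assumption, not from the original project): each
# "feature" 3-tuple pairs a transformer that extracts raw values with a
# vectorizer that turns them into a matrix; the identity FunctionTransformer
# below is only a stand-in for the project's own extractors.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer

features = [
    ('word_tfidf', FunctionTransformer(lambda docs: docs, validate=False),
     TfidfVectorizer()),
    ('char_bigrams', FunctionTransformer(lambda docs: docs, validate=False),
     CountVectorizer(analyzer='char', ngram_range=(2, 3))),
]
union = _create_feature_union(features)
# union.fit_transform(list_of_texts) stacks both feature blocks horizontally.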
def make_pipe(classifier):
    language_featurizer = make_union(
        CountVectorizer(),
        FunctionFeaturizer(longest_run_of_capital_letters_feature,
                           percent_character_feature,
                           percent_character_combinations,
                           longest_run_of_character_feature,
                           character_combinations_binary))
    return make_pipeline(language_featurizer, classifier)
def __init__(self, transforms, n_estimators=2000, criterion='gini',
             min_samples_leaf=2, n_jobs=-1):
    self.transforms = transforms
    self.n_estimators = n_estimators
    self.criterion = criterion
    self.min_samples_leaf = min_samples_leaf
    self.n_jobs = n_jobs
    union = make_union(*[t() for t in transforms])
    pipeline = [union]
    self.pipeline = make_pipeline(*pipeline)
    self.classifier = RandomForestClassifier(n_estimators, criterion,
                                             min_samples_leaf=min_samples_leaf,
                                             n_jobs=-1)
def create_input_transformer(fields, vec_name):
    """Create a pipeline of input transformations, allowing scaling of input fields."""
    pipeline = []
    for field in fields:
        field_name = field['name']
        field_scale = field['scale']
        field_type = processed_db.get_field_type(field_name)
        pipeline.append(
            make_pipeline(ItemSelector(field_name),          # select the correct column
                          Vectorizer(vec_name, field_type),  # vectorize (depending on str/numeric input)
                          Scaler(field_scale))               # scale column based on user input
        )
    return make_union(*pipeline)
def build_prediction():
    p_age = make_pipeline(
        make_union(
            OneHotTransformer(lambda x: x[1]['phone_brand'].lower()),
            OneHotTransformer(lambda x: x[1]['device_model'].lower()),
            TfidfVectorizer(preprocessor=lambda x: ' '.join(x[1]['app_id']))
        ),
        LogisticRegression()
    )

    x_train = [(x, y) for x, y in PERSONS.items()]
    x_test = [(x, y) for x, y in PERSONS_TESTS.items()]
    y_train_age = [y.get('group') for y in PERSONS.values()]

    print "fit age predictor"
    p_age.fit(x_train, y_train_age)

    print "predicting age"
    classes = p_age.classes_
    age_prediction = p_age.predict_proba(x_test)

    return classes, age_prediction
def preprocess(self, any_set, is_train):
    if is_train:
        tfv_text = TfidfVectorizer(lowercase=True, max_features=2500)
        tfv_topics = TfidfVectorizer(lowercase=True, max_features=20)
        clf = MultinomialNB(alpha=0.05, fit_prior=True, class_prior=None)

        title_pipe = make_pipeline(ColumnSelector(key=u'title'), tfv_text)
        topics_pipe = make_pipeline(ColumnSelector(key=u'topicIds'), tfv_topics)
        rel_topic_pipe = make_pipeline(ColumnSelector(key=u'relevantTopicIds'),
                                       tfv_topics)
        text_pipe = make_pipeline(ColumnSelector(key=u'description'), tfv_text)

        self.pipeline = make_union(title_pipe, topics_pipe,
                                   rel_topic_pipe, text_pipe)
        return self.pipeline.fit_transform(any_set)
    else:
        return self.pipeline.transform(any_set)
def PipelineBNP(Classifier):
    pipeline = make_pipeline(
        NulltoNanTrans(),
        make_union(
            make_pipeline(
                DataSpliterTrans(dtype=np.float64),
                Imputer(strategy='median')
            ),
            make_pipeline(
                DataSpliterTrans(dtype=np.int),
                Imputer(strategy='most_frequent'),
                preprocessing.OneHotEncoder(handle_unknown='ignore')
            ),
            make_pipeline(
                DataSpliterTrans(dtype=np.object),
                ObjtoCatStrtoIntTrans(),
                Imputer(strategy='most_frequent'),
                preprocessing.OneHotEncoder(handle_unknown='ignore')
            )
        ),
        Classifier()
    )
    print('pipeline done.')
    return pipeline
def run_lr(train, test, y_train, num_models):
    feature_extractor = make_union(create_basic_feature_extractor(),
                                   create_BoW_feature_extractor())
    X_train = feature_extractor.fit_transform(train)
    X_test = feature_extractor.transform(test)
    return [lr_predict(X_train, y_train, X_test) for _ in range(num_models)]
def __init__(self, classifier="sgd", classifier_args=None, lowercase=True, text_replacements=None, map_to_synsets=False, binary=False, min_df=0, ngram=1, stopwords=None, limit_train=None, map_to_lex=False, duplicates=False): """ Parameter description: - `classifier`: The type of classifier used as main classifier, valid values are "sgd", "knn", "svc", "randomforest". - `classifier_args`: A dict to be passed as arguments to the main classifier. - `lowercase`: wheter or not all words are lowercased at the start of the pipeline. - `text_replacements`: A list of tuples `(from, to)` specifying string replacements to be made at the start of the pipeline (after lowercasing). - `map_to_synsets`: Whether or not to use the Wordnet synsets feature set. - `binary`: Whether or not to count words in the bag-of-words representation as 0 or 1. - `min_df`: Minumim frequency a word needs to have to be included in the bag-of-word representation. - `ngram`: The maximum size of ngrams to be considered in the bag-of-words representation. - `stopwords`: A list of words to filter out of the bag-of-words representation. Can also be the string "english", in which case a default list of english stopwords will be used. - `limit_train`: The maximum amount of training samples to give to the main classifier. This can be useful for some slow main classifiers (ex: svc) that converge with less samples to an optimum. - `max_to_lex`: Whether or not to use the Harvard Inquirer lexicon features. - `duplicates`: Whether or not to check for identical phrases between train and prediction. """ self.limit_train = limit_train self.duplicates = duplicates # Build pre-processing common to every extraction pipeline = [ExtractText(lowercase)] if text_replacements: pipeline.append(ReplaceText(text_replacements)) # Build feature extraction schemes ext = [ build_text_extraction(binary=binary, min_df=min_df, ngram=ngram, stopwords=stopwords) ] if map_to_synsets: ext.append( build_synset_extraction(binary=binary, min_df=min_df, ngram=ngram)) if map_to_lex: ext.append( build_lex_extraction(binary=binary, min_df=min_df, ngram=ngram)) ext = make_union(*ext) pipeline.append(ext) # Build classifier and put everything togheter if classifier_args is None: classifier_args = {} classifier = _valid_classifiers[classifier](**classifier_args) self.pipeline = make_pipeline(*pipeline) self.classifier = classifier
# In[13]:

# full pipeline for data engineering
full_pipeline = Pipeline(steps=[
    ("features", make_union(
        make_pipeline(DataFrameSelector(["Embarked"]), MostFrequentImputer(),
                      CategoricalEncoder(encoding='onehot-dense')),
        make_pipeline(DataFrameSelector(["Pclass", "Sex"]),
                      CategoricalEncoder(encoding='onehot-dense')),
        make_pipeline(DataFrameSelector(["Age", "Fare"]),
                      Imputer(strategy="median"), StandardScaler()),
        make_pipeline(DataFrameSelector(["Name"]), ExtractTitle(),
                      CategoricalEncoder(encoding='onehot-dense')),
        # make_pipeline(DataFrameSelector(["Cabin"]), FillMissingCabin(), ExtractCabin(),
        #               CategoricalEncoder(encoding='onehot-dense')),
        make_pipeline(DataFrameSelector(["Cabin"]), HasCabin()),
        make_pipeline(DataFrameSelector(["SibSp", "Parch"]), CreateFamilySize(),
                      CategoricalEncoder(encoding='onehot-dense')),
    )),
    ("poly", PolynomialFeatures()),
    # ("PCA", PCA(n_components=0.95)),
    # ("best", SelectKBest(k=20)),
    ("clf", RandomForestClassifier(random_state=42))
])

# **STEP 6 - Splitting the training dataset**
#
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(lambda X: X), FunctionTransformer(lambda X: X)),
    RandomForestClassifier(n_estimators=500))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    RandomizedPCA(iterated_power=10),
    make_union(VotingClassifier([("est", LinearSVC(C=0.59, dual=False, penalty="l1"))]),
               FunctionTransformer(lambda X: X)),
    RandomForestClassifier(n_estimators=500)
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
TOKENIZER = re.compile('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(string.punctuation))


def tokenize(s):
    return TOKENIZER.sub(r' \1 ', s).split()


vectorizer = make_union(
    on_field('question_text',
             TfidfVectorizer(max_features=13000, token_pattern='\w+',
                             strip_accents='unicode', tokenizer=tokenize,
                             sublinear_tf=True)),
    on_field('question_text',
             TfidfVectorizer(ngram_range=(3, 3), analyzer='char', min_df=25)),
    make_pipeline(
        PandasSelector(columns=['num_words', 'num_singletons', 'caps_vs_length'],
                       return_vector=False),
        MaxAbsScaler()),
)

with timer('process train'):
    # df_train = pd.read_csv(os.path.join(INPUT_PATH, "train.csv"))
    df_train = joblib.load('train.pkl')
    df_test = pd.read_csv(os.path.join(INPUT_PATH, "test.csv"))
    # df_test = joblib.load('valid_for_emsemble.pkl')
    train_count = len(df_train)
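# Illustrative note (not part of the original script): the TOKENIZER regex
# above pads punctuation with spaces, so tokenize() yields words and
# punctuation marks as separate tokens, e.g.
#   tokenize("Hello, world! What's up?")
#   -> ['Hello', ',', 'world', '!', 'What', "'", 's', 'up', '?']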
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, Normalizer
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: 0.839915792130982
exported_pipeline = make_pipeline(
    make_union(
        MinMaxScaler(),
        FunctionTransformer(copy)
    ),
    Normalizer(norm="max"),
    XGBClassifier(learning_rate=0.01, max_depth=3, min_child_weight=7,
                  n_estimators=600, nthread=1, subsample=0.8)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -5.853055578955521
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=RidgeCV()),
        make_pipeline(
            FeatureAgglomeration(affinity="manhattan", linkage="complete"),
            Nystroem(gamma=0.30000000000000004, kernel="sigmoid", n_components=5))),
    FeatureAgglomeration(affinity="cosine", linkage="average"),
    SGDRegressor(alpha=0.0, eta0=0.01, fit_intercept=False, l1_ratio=1.0,
                 learning_rate="invscaling", loss="epsilon_insensitive",
                 penalty="elasticnet", power_t=100.0))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)
# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8326392221287445
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            make_union(
                make_union(
                    FunctionTransformer(copy),
                    StackingEstimator(estimator=RandomForestClassifier(
                        bootstrap=True, criterion="entropy",
                        max_features=0.35000000000000003, min_samples_leaf=1,
                        min_samples_split=7, n_estimators=100))
                ),
                make_union(
                    FunctionTransformer(copy),
                    FunctionTransformer(copy)
                )
            ),
            SelectPercentile(score_func=f_classif, percentile=58)
        ),
        FunctionTransformer(copy)
    ),
    MultinomialNB(alpha=0.1, fit_prior=True)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
    })], input_df=True, df_out=True, default=False)

engineered_feature_pipeline4 = skp.DataFrameMapper(
    [(['c1', 'c2', 'c3', 'c4', 'c5'], uf.Straight(), {'alias': 'has_straight'})],
    input_df=True, df_out=True, default=False)

# here we lose feature names
features_pipeline = ppl.make_union(engineered_feature_pipeline1,
                                   engineered_feature_pipeline2,
                                   engineered_feature_pipeline3,
                                   engineered_feature_pipeline4)

temp = d_in[d_in['hand'] == '8']
# features_pipeline.fit_transform(temp).head()
a = features_pipeline.fit_transform(temp)
a[0:10, ]

# modelling complete pipeline
pipe = ppl.Pipeline([
    ('prep', features_pipeline),
    ('encoding', ppr.OneHotEncoder()),
    ('clf', LogisticRegression(multi_class='multinomial', penalty='l2',
                               random_state=9546, solver="lbfgs"))
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.504247990815155
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), ZeroCount()),
    XGBClassifier(learning_rate=0.001, max_depth=3, min_child_weight=3,
                  n_estimators=100, nthread=1, subsample=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def __init__(self, classifier="sgd", classifier_args=None, lowercase=True, text_replacements=None, map_to_synsets=True, binary=False, min_df=0, ngram=1, stopwords=None, limit_train=None, map_to_lex=True, duplicates=True, svm_features=False , preprocessor=False, useLemmatization = True, stemming = False, useStopWords = True, word2vecFeatures = False, splitModel= False, useTfIdf= False): """ Parameter description: - `classifier`: The type of classifier used as main classifier, valid values are "sgd", "knn", "svc", "randomforest". - `classifier_args`: A dict to be passed as arguments to the main classifier. - `lowercase`: wheter or not all words are lowercased at the start of the pipeline. - `text_replacements`: A list of tuples `(from, to)` specifying string replacements to be made at the start of the pipeline (after lowercasing). - `map_to_synsets`: Whether or not to use the Wordnet synsets feature set. - `binary`: Whether or not to count words in the bag-of-words representation as 0 or 1. - `min_df`: Minumim frequency a word needs to have to be included in the bag-of-word representation. - `ngram`: The maximum size of ngrams to be considered in the bag-of-words representation. - `stopwords`: A list of words to filter out of the bag-of-words representation. Can also be the string "english", in which case a default list of english stopwords will be used. - `limit_train`: The maximum amount of training samples to give to the main classifier. This can be useful for some slow main classifiers (ex: svc) that converge with less samples to an optimum. - `max_to_lex`: Whether or not to use the Harvard Inquirer lexicon features. - `duplicates`: Whether or not to check for identical phrases between train and prediction. - `svm_features`: Whether or not to include features from an SVM classifier """ self.limit_train = limit_train self.duplicates = duplicates print("Using tfidf: ", useTfIdf) # Build pre-processing common to every extraction pipeline = [Preprocessor(removeStopWords=useStopWords, lemmatize=useLemmatization, stem=stemming)] if preprocessor else [ExtractText(lowercase)] if text_replacements: pipeline.append(ReplaceText(text_replacements)) # Build feature extraction schemes ext = [build_text_extraction(binary=binary, min_df=min_df, ngram=ngram, stopwords=stopwords, useTfIdf=useTfIdf)] if map_to_synsets: ext.append(build_synset_extraction(binary=binary, min_df=min_df, ngram=ngram, useTfIdf=useTfIdf)) if map_to_lex: ext.append(build_lex_extraction(binary=binary, min_df=min_df, ngram=ngram)) if svm_features: ext.append(build_svm_features()) if word2vecFeatures: ext.append(build_word2vec_features()) ext = make_union(*ext) pipeline.append(ext) # Build classifier and put everything togheter if classifier_args is None: classifier_args = {} if classifier == "ensemble": classifier_args = {"classifiers": [SGDClassifier(), RandomForestClassifier(), SVC(),KNeighborsClassifier(), RandomForestClassifier(n_estimators= 100, min_samples_leaf=10, n_jobs=-1)]} #Classifier constructor E.g. SGDClassifier(args) classifier = _valid_classifiers[classifier](**classifier_args) self.pipeline = make_pipeline(*pipeline) self.classifier = classifier self.splitModel = splitModel self.splitSize = 1
def get_feature_pipeline(tr_artifact, hist_artifacts, all_data, cachedir='data/'): """Define feature transformation pipeline.""" return make_pipeline( make_union( identity(input_cols=[ FieldNames.customer_id, FieldNames.coupon_id, FieldNames.rented, FieldNames.age_range, FieldNames.marital_status, FieldNames.no_of_children, FieldNames.family_size, FieldNames.income_bracket, ]), make_pipeline(SelectCols(cols=[FieldNames.campaign_type]), OrdinalEncoder()), # make_pipeline( # SelectCols(cols=[FieldNames.cust_cohort]), # OneHotEncoder(handle_unknown='ignore') # ), make_pipeline( GroupCatCatNUnique(FieldNames.campaign_id, FieldNames.customer_id)), make_pipeline( GroupCatCatNUnique(FieldNames.campaign_id, FieldNames.coupon_id)), # make_pipeline( # SelectCols(cols=[FieldNames.campaign_id]), # GroupCatCountEncoder() # ), make_pipeline( ExpandingMean( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.target, hist_artifact=tr_artifact, ), ), make_pipeline( ExpandingCount( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.target, hist_artifact=tr_artifact, ), ), # make_pipeline( # ExpandingMedian( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.transaction_day, # hist_artifact=hist_artifacts[0] # ) # ), # make_pipeline( # ExpandingSum( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.target, # hist_artifact=tr_artifact, # ) # ), # make_pipeline( # ExpandingCount( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.coupon_discount, # hist_artifact=hist_artifacts[0], # ) # ), # make_pipeline( # ExpandingMean( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.selling_price, # hist_artifact=hist_artifacts[0], # ) # ), # make_pipeline( # ExpandingMean( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.coupon_discount, # hist_artifact=hist_artifacts[1], # ) # ), # make_pipeline( # ExpandingSum( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.selling_price, # hist_artifact=hist_artifacts[1], # ) # ), # make_pipeline( # ExpandingMax( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.pct_discount, # hist_artifact=hist_artifacts[0], # ) # ), make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.item_set, hist_artifact=hist_artifacts[0], ), SelectCols(cols=[FieldNames.item_set]), ), CountCommon()), make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.item_set, hist_artifact=hist_artifacts[0], ), SelectCols(cols=[FieldNames.item_set]), ), Jaccard()), make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.item_set, hist_artifact=hist_artifacts[1], ), SelectCols(cols=[FieldNames.item_set]), ), CountCommon(), ), make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.item_set, hist_artifact=hist_artifacts[1], ), SelectCols(cols=[FieldNames.item_set]), ), Jaccard(), QuantileTransformer(output_distribution='normal')), # make_pipeline( # make_union( # SetAggregation( # 
date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.item_set, # hist_artifact=hist_artifacts[2], # ), # SelectCols(cols=[FieldNames.item_set]), # ), # Jaccard(), # ), # make_pipeline( # make_union( # SetAggregation( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.item_brand, # hist_artifact=hist_artifacts[0], # ), # SelectCols(cols=[FieldNames.item_brand]), # ), # CountCommon(), # ), make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.item_brand, hist_artifact=hist_artifacts[0], ), SelectCols(cols=[FieldNames.item_brand]), ), Jaccard(), ), make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.item_brand, hist_artifact=hist_artifacts[1], ), SelectCols(cols=[FieldNames.item_brand]), ), Jaccard(), ), # make_pipeline( # CouponItemMean(coupon_col=FieldNames.coupon_id, # target_col=FieldNames.target) # ) # make_pipeline( # make_union( # SetAggregation( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.item_category, # hist_artifact=hist_artifacts[0], # ), # SelectCols(cols=[FieldNames.item_category]), # ), # Jaccard(), # ), make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.item_category, hist_artifact=hist_artifacts[1], ), SelectCols(cols=[FieldNames.item_category]), ), Jaccard(), ), make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.item_category, hist_artifact=hist_artifacts[2], ), SelectCols(cols=[FieldNames.item_category]), ), Jaccard(), ), make_pipeline( SetLen( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.item_brand, hist_artifact=hist_artifacts[0], ), ), make_pipeline( SelectCols(cols=[ FieldNames.campaign_start_date, FieldNames.campaign_end_date ]), FunctionTransfomer(lambda x: (x.iloc[:, 1] - x.iloc[:, 0]).dt.days)), # make_pipeline( # FunctionTransfomer(lambda x: x[FieldNames.item_set].apply(len).values.reshape(-1, 1)) # ), make_pipeline( FunctionTransfomer(lambda x: x[FieldNames.item_brand].apply( len).values.reshape(-1, 1))), make_pipeline( FunctionTransfomer(lambda x: x[FieldNames.item_category].apply( len).values.reshape(-1, 1))), make_pipeline( ZeroPct( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.coupon_discount, hist_artifact=hist_artifacts[0], )), make_pipeline( AllCountEncoder( cols=[FieldNames.customer_id, FieldNames.coupon_id], data=all_data)), # make_pipeline( # SetMean( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.selling_price, # hist_artifact=hist_artifacts[0], # ) # ), # make_pipeline( # ZeroPct( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.other_discount, # hist_artifact=hist_artifacts[0], # ) # ) # make_pipeline( # VectorMapper(FieldNames.coupon_id, 'data/coupon_vectors_lda.npy') # ), # make_pipeline( # VectorMapper(FieldNames.coupon_id, 'data/coupon_vectors_svd.npy') # ), # make_pipeline( # SetLen( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.item_set, # hist_artifact=hist_artifacts[0], # ), # ), # make_pipeline( # SetLen( # 
date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.item_set, # hist_artifact=hist_artifacts[1], # ), # ), # make_pipeline( # make_union( # SetAggregationLast3( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.item_set, # hist_artifact=hist_artifacts[1], # ), # SelectCols(cols=[FieldNames.item_set]), # ), # Jaccard(), # ), # make_pipeline( # make_union( # SetAggregation( # date_col=FieldNames.campaign_start_date, # user_col=FieldNames.customer_id, # key_col=FieldNames.item_set, # hist_artifact=hist_artifacts[2], # ), # SelectCols(cols=[FieldNames.item_set]), # ), # Jaccard(), # ), ), make_union( FunctionTransfomer(lambda x: x), FunctionTransfomer(lambda x: x[:, 13] / (1e-4 + x[:, 15])), FunctionTransfomer(lambda x: x[:, 14] / (1e-4 + x[:, 16])), FunctionTransfomer(lambda x: x[:, 17] / (1e-4 + x[:, 18])), FunctionTransfomer(lambda x: x[:, 19] / (1e-4 + x[:, 20])), # FunctionTransfomer(lambda x: x[:, 17]/(1e-4 + x[:, 14])), ), )
        return len(sample_text)

    def transform(self, X, y=None):
        """The workhorse of this feature extractor"""
        result_series = X['review'].apply(self.text_length)
        return result_series.to_frame(name=self.get_feature_names()[0])

    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self


if __name__ == '__main__':
    from sklearn.pipeline import make_pipeline, make_union

    base_path = '../data/stanford_imdb'
    df = pd.read_csv(f'{base_path}/imdb_df.csv.gzip', compression='gzip')
    X = df.drop(['sentiment'], axis=1)

    t1 = AverageWordLengthExtractor()
    t2 = TextLengthExtractor()

    # I expect the make_union to produce 2 additional columns
    pipe = make_union(t1, t2)
    n = pipe.transform(X)
    print(n.shape)
    print(type(n))  # numpy.ndarray
import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8453186610518303
exported_pipeline = make_pipeline(
    make_union(make_pipeline(ZeroCount(), FastICA(tol=0.2)),
               FunctionTransformer(copy)),
    ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=0.2,
                         min_samples_leaf=1, min_samples_split=4, n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def main():
    vectorizer = make_union(
        on_field('title', Tfidf(max_features=100000, token_pattern='\w+')),
        on_field('text', Tfidf(max_features=100000, token_pattern='\w+',
                               ngram_range=(1, 2))),
        on_field(['price', 'user_type', 'image_top_1'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=8)
    y_scaler = StandardScaler()

    with timer('process train'):
        print('read train data ...')
        train = pd.read_csv('../input/train.csv', parse_dates=["activation_date"])
        # cv = KFold(n_splits=10, shuffle=True, random_state=42)
        # train_ids, valid_ids = next(cv.split(train))
        # train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        train, valid = train_test_split(train, test_size=0.10, random_state=23)
        y_train = y_scaler.fit_transform(train['deal_probability'].values.reshape(-1, 1))
        X_train = vectorizer.fit_transform(preprocess(train))
        print('X_train: {} of {}'.format(X_train.shape, X_train.dtype))
        del train; gc.collect()

    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid))

    with timer('process test'):
        # TODO
        print('read test data ...')
        test = pd.read_csv('../input/test.csv', parse_dates=["activation_date"])
        X_test = vectorizer.transform(preprocess(test))
        del test; gc.collect()

    with ThreadPool(processes=8) as pool:
        # Xb_train, Xb_valid = [x.astype(np.bool) for x in [X_train, X_valid]]
        Xb_train, Xb_valid, Xb_test = [x.astype(np.bool) for x in [X_train, X_valid, X_test]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        del X_valid; gc.collect()
        # TODO
        xs_test = [[Xb_train, Xb_test], [X_train, Xb_test]] * 2
        del X_train, X_test; gc.collect()
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
        # TODO
        y_pred_test = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs_test), axis=0)

    y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0]
    # TODO
    y_pred_test = y_scaler.inverse_transform(y_pred_test.reshape(-1, 1))[:, 0]
    print('Valid RMSLE: {:.4f}'.format(
        np.sqrt(mean_squared_log_error(valid['deal_probability'], y_pred))))
    del valid; gc.collect()

    sub = pd.read_csv('../input/sample_submission.csv')
    sub['deal_probability'] = y_pred_test
    sub.to_csv('sub.csv', index=False)
    print('all done!')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9100211739663182
exported_pipeline = make_pipeline(
    make_union(StackingEstimator(estimator=GaussianNB()), FunctionTransformer(copy)),
    XGBClassifier(learning_rate=0.1, max_depth=6, min_child_weight=2,
                  n_estimators=100, nthread=1, subsample=0.6000000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: 0.84550605863897
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            OneHotEncoder(minimum_fraction=0.25, sparse=False, threshold=10),
            RFE(estimator=ExtraTreesClassifier(criterion="gini", max_features=0.5,
                                               n_estimators=100), step=0.2),
            ZeroCount(),
            MinMaxScaler()),
        FunctionTransformer(copy)),
    Normalizer(norm="max"),
    XGBClassifier(learning_rate=0.01, max_depth=6, min_child_weight=7,
                  n_estimators=600, nthread=1, subsample=0.9500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def get_feature_pipeline(tr_artifact, hist_artifacts, all_data): """Feature generation pipeline.""" hist_n = 3 # len(hist_artifacts) tr_artifact_kws = { "date_col": FieldNames.campaign_start_date, "user_col": FieldNames.customer_id, "key_col": FieldNames.target, "hist_artifact": tr_artifact, } hist_cols = [ FieldNames.item_set, FieldNames.item_brand, FieldNames.item_category ] hist_cols2 = [ FieldNames.coupon_discount, FieldNames.other_discount, FieldNames.pct_discount, FieldNames.quantity, FieldNames.selling_price, ] return make_pipeline( make_union( # Numerical features directly available make_pipeline( SelectCols(cols=[ FieldNames.customer_id, FieldNames.coupon_id, FieldNames.age_range, FieldNames.marital_status, FieldNames.family_size, FieldNames.no_of_children, FieldNames.income_bracket, FieldNames.campaign_type, ]), FunctionTransfomer(lambda x: x), ), # coupon-no. of unique item attributes make_union(*[ make_pipeline( SelectCols(cols=[col]), FunctionTransfomer( lambda X: [len(set(x)) for x in X.values.flatten().tolist()]), ) for col in [ FieldNames.item_set, FieldNames.item_brand, FieldNames.item_brand_type, FieldNames.item_category, ] ], verbose=True), # Campaign id features make_union(*[ GroupCatCatNUnique(FieldNames.campaign_id, col2) for col2 in [FieldNames.customer_id, FieldNames.coupon_id] ], verbose=True), # Customer id expanding mean, count, sum make_pipeline(ExpandingMean(**tr_artifact_kws)), make_pipeline(ExpandingCount(**tr_artifact_kws)), make_pipeline(ExpandingSum(**tr_artifact_kws)), # Count items common between current coupon and historical customer transactions make_union(*[ make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=col, hist_artifact=hist_artifacts[i], ), SelectCols(cols=[col]), ), CountCommon(), ) for col, i in itertools.product(hist_cols, range(hist_n)) ]), make_union(*[ make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=col, hist_artifact=hist_artifacts[i], ), SelectCols(cols=[col]), ), Jaccard(), ) for col, i in itertools.product(hist_cols, range(hist_n)) ], verbose=True), make_union(*[ make_pipeline( make_union( SetAggregation( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=col, hist_artifact=hist_artifacts[i], ), SelectCols(cols=[col]), ), CountCommonRepeats(), ) for col, i in itertools.product(hist_cols, range(hist_n)) ]), # campaign length make_pipeline( SelectCols(cols=[ FieldNames.campaign_start_date, FieldNames.campaign_end_date ]), FunctionTransfomer(lambda x: (x.iloc[:, 1] - x.iloc[:, 0]).dt.days), ), # coupon discount, other dicount, selling price and quantity aggregations make_union(*[ ExpandingMean( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=col, hist_artifact=hist_artifacts[i], ) for col, i in itertools.product(hist_cols2, range(hist_n)) ]), make_pipeline( GroupCatCountEncoder( cols=[FieldNames.customer_id, FieldNames.campaign_id])), make_pipeline( AllCountEncoder( cols=[FieldNames.customer_id, FieldNames.coupon_id], data=all_data)), make_pipeline( make_union(*[ SetLen( date_col=FieldNames.campaign_start_date, user_col=FieldNames.customer_id, key_col=FieldNames.item_set, hist_artifact=hist_artifacts[0], ) for i in range(hist_n) ])), make_pipeline( make_union( VectorMapper(col=FieldNames.coupon_id, vector_file=FileNames.coupon_vectors), HistVectorMean( vector_file=FileNames.item_vectors, user_col=FieldNames.customer_id, 
key_col=FieldNames.item_set, date_col=FieldNames.campaign_start_date, hist_artifact=hist_artifacts[0], ), ), CosineSimilarity(), ), ), make_union( FunctionTransfomer(lambda x: x), # Ratios make_pipeline( make_union(*[ FunctionTransfomer(lambda x: x[:, i] / x[:, j]) for (i, j) in itertools.product(range(16, 34), range(16, 34)) ], verbose=True)), ), )
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8172868435911914
exported_pipeline = make_pipeline(
    make_union(
        StandardScaler(),
        RFE(estimator=ExtraTreesClassifier(criterion="gini", max_features=0.2,
                                           n_estimators=100), step=0.1)),
    VarianceThreshold(threshold=0.25),
    StandardScaler(),
    StandardScaler(),
    MinMaxScaler(),
    StandardScaler(),
    LogisticRegression(C=0.01, dual=False, penalty="l2"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
s = str(i.month) + str(i.day) + str(i.hour) + str(i.minute)
model_name = "GM_export/main_new/" + "GM" + s + ".py"
tpo.export(model_name)

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from copy import copy

exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.1, max_depth=5,
                                              min_child_weight=6, n_estimators=21,
                                              scale_pos_weight=4.16, subsample=0.85)),
    RandomForestClassifier(class_weight={1: 4.16}, criterion="gini", max_depth=8,
                           max_features=6, n_estimators=23))

exported_pipeline.fit(x_train, y_train)
evalution_model(exported_pipeline, x_train, y_train)
evalution_model(exported_pipeline, x_test, y_test)
data, label = data[idx_row, :], label[idx_row]
features = data
tpot_data = pd.DataFrame({'class': label}, columns=['class'])

# train the machine learning model
kf = KFold(n_splits=10, random_state=556, shuffle=True)
results, auc = [], []
cnt = 0
print('machine learning model best ML model and cross validation by 10 folds')
fp, tp = [], []
for train_index, test_index in kf.split(features):
    training_features, testing_features = features[train_index], features[test_index]
    training_classes, testing_classes = tpot_data['class'].values[train_index], tpot_data['class'].values[test_index]
    exported_pipeline = make_pipeline(
        make_union(VotingClassifier([("est", DecisionTreeClassifier())]),
                   FunctionTransformer(lambda X: X)),
        GradientBoostingClassifier(learning_rate=0.24, max_features=0.24, n_estimators=500)
    )
    exported_pipeline.fit(training_features, training_classes)
    results.append(exported_pipeline.predict_proba(testing_features)[:, 1])
    fpr, tpr, thresholds = metrics.roc_curve(testing_classes,
                                             exported_pipeline.predict_proba(testing_features)[:, 1])
    auc.append(metrics.roc_auc_score(testing_classes,
                                     exported_pipeline.predict_proba(testing_features)[:, 1]))
    # ax.plot(fpr, tpr, label='%s,Area under the curve: %.3f' % (type_, auc[cnt]))
    fp.append(fpr)
    tp.append(tpr)
    print('get one done')
    cnt += 1
print('done')

# from sklearn.externals import joblib
# pickle.dump(exported_pipeline, open('%smy_model.pkl' % folder, 'wb'))
# exported_pipeline = joblib.load('%smy_model.pkl' % folder)
pickle.dump([results, auc, fp, tp], open("%slong process.p" % folder, "wb"))
X_test_post_hoc = df_test
df = df.drop(columns=['eid', '21022-0.0'], axis=1)
df_test = df_test.drop(columns=['eid', '21022-0.0'], axis=1)

# Learning curves: train sizes
train_sizes = [100, 500, 1000, 1500, 2000, 2500, 3000, 3500, 3700]

# Model
estimator = RandomForestRegressor(n_estimators=250, criterion='mse',
                                  n_jobs=10, verbose=1, random_state=0)
pipeline = Pipeline([('imputation',
                      make_union(SimpleImputer(strategy="median"),
                                 MissingIndicator(error_on_new=False))),
                     ('estimator', estimator)])
cv = ShuffleSplit(n_splits=100, test_size=0.1, random_state=0)

param_grid = {
    'estimator__max_depth': [5, 10, 20, 40, None],
    'estimator__max_features': [1, 5, 'log2', 'sqrt', 'auto', None]
}
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, verbose=2)

metrics = []
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=grid_search, X=df, y=y_train,
def _make_preprocessor(self): def lexicon_pipeline(lexicon): return make_pipeline(LexiconFeatures(lexicon), DictVectorizer()) unigram_lexicons_features = make_union( lexicon_pipeline(NRCEmotionLexicon()), lexicon_pipeline(NRCHashtagEmotionLexicon()), lexicon_pipeline(MaxDiffTwitterLexicon()), lexicon_pipeline(NRCHashtagSentimentWithContextUnigrams()), lexicon_pipeline(NRCHashtagSentimentLexiconUnigrams()), lexicon_pipeline(Sentiment140WithContextUnigrams()), lexicon_pipeline(Sentiment140LexiconUnigrams()), lexicon_pipeline(YelpReviewsLexiconUnigrams()), lexicon_pipeline(AmazonLaptopsReviewsLexiconUnigrams()), lexicon_pipeline(MPQAEffectLexicon()), lexicon_pipeline(MPQASubjectivityLexicon()), lexicon_pipeline(HarvardInquirerLexicon()), lexicon_pipeline(BingLiuLexicon()), lexicon_pipeline(AFINN111Lexicon()), lexicon_pipeline(SentiWordNetLexicon()), lexicon_pipeline(LoughranMcDonaldLexicon()), ) bigram_lexicons_features = make_union( lexicon_pipeline(NRCHashtagSentimentWithContextBigrams()), lexicon_pipeline(NRCHashtagSentimentLexiconBigrams()), lexicon_pipeline(Sentiment140WithContextBigrams()), lexicon_pipeline(Sentiment140LexiconBigrams()), lexicon_pipeline(YelpReviewsLexiconBigrams()), lexicon_pipeline(AmazonLaptopsReviewsLexiconBigrams()), lexicon_pipeline(MPQAEffectLexicon()), ) preprocessor = make_pipeline( BasicTokenizer(), make_union( make_pipeline( CMUArkTweetPOSTagger(), ListCountVectorizer(lowercase=False, binary=True)), # POS features # make_pipeline(W2Vembedding()), make_pipeline( CharNGramTransformer([1, 2, 3]), ListCountVectorizer(lowercase=True, max_features=10000, binary=True)), # Character n-grams make_pipeline( LowercaseTransformer(), CMUArkTweetBrownClusters(), ListCountVectorizer(lowercase=False, binary=True)), # brown clusters make_pipeline( Negater(), make_union( make_pipeline( NGramTransformer([3, 4]), ListCountVectorizer(lowercase=True, max_features=10000, binary=True)), # ngram word make_pipeline( CountingFeatures(), DictVectorizer() ), # allcaps, punctuations, lengthening, emoticons, etc. counting feature make_pipeline(LowercaseTransformer(), unigram_lexicons_features ), # unigram lexicon features ListCountVectorizer(lowercase=True, max_features=10000, binary=True), # ngram word make_pipeline( LowercaseTransformer(), NGramTransformer(2), make_union( bigram_lexicons_features, # bigram lexicon features ListCountVectorizer(lowercase=True, max_features=10000, binary=True), # ngram word ), ), ), ), make_pipeline(LowercaseTransformer(), SSWEFeatures()), make_pipeline( NormalizedTokens(), CollapsedTokens(), PorterStemmer(), Negater(), make_union( ListCountVectorizer(lowercase=False, max_features=10000, binary=True), # processed word make_pipeline( ClusterFeaturesWithNegation(), ListCountVectorizer( lowercase=False, binary=True)), # processed cluster features ), )), ) return preprocessor
DATA_DIR_PORTABLE = "C:\\Users\\T149900\\ml_mercari\\"
DATA_DIR_BASEMENT = "D:\\mercari\\"
DATA_DIR = DATA_DIR_PORTABLE

df = pd.read_table(DATA_DIR + "train.tsv")

q = df[:10]
q_test = df[10:13]

q.price.isnull().sum()
q_test.price.isnull().sum()

vectorizer = make_union(
    on_field(['shipping', 'item_condition_id'], PlussOneStage()),
    n_jobs=1)

p = on_field('item_condition_id', PlussOneStage())
p.fit(q)
X = p.transform(q)
X_test = p.transform(q_test)

X_train = vectorizer.fit_transform(preprocess(q)).astype(np.float32)
X_test = vectorizer.transform(preprocess(q_test)).astype(np.float32)
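The snippet above relies on an on_field helper, a PlussOneStage transformer, and a preprocess function that are defined elsewhere in the original project. Below is a minimal sketch of plausible definitions for the first two, given as assumptions rather than the original code: on_field is written in the common "select a column, then transform it" style, and PlussOneStage as a toy transformer that shifts numeric values by one.

import numpy as np
from operator import itemgetter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

def on_field(field, *transformers):
    # Select one column (or a list of columns) from the DataFrame,
    # then apply the given transformers to that slice.
    return make_pipeline(FunctionTransformer(itemgetter(field), validate=False),
                         *transformers)

class PlussOneStage(BaseEstimator, TransformerMixin):
    # Illustrative stateless transformer: add one to every numeric value.
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.asarray(X) + 1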
def get_feature_union():
    return make_union(WaveletApprx(), SpatialFt(), DepthFt())
import numpy as np
import pandas as pd
from copy import copy
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, Imputer, MaxAbsScaler
from tpot.builtins import OneHotEncoder, StackingEstimator

# Loading header assumed, following the other TPOT exports in this file.
# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was: 0.9576386406262041
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=make_pipeline(
            MaxAbsScaler(),
            RandomForestClassifier(bootstrap=False, criterion="entropy",
                                   max_features=0.3, min_samples_leaf=5,
                                   min_samples_split=11, n_estimators=100))),
        FunctionTransformer(copy)),
    OneHotEncoder(minimum_fraction=0.25, sparse=False),
    ExtraTreesClassifier(bootstrap=True, criterion="entropy",
                         max_features=0.6500000000000001, min_samples_leaf=1,
                         min_samples_split=5, n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
from copy import copy
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    StandardScaler(),
    GradientBoostingClassifier(learning_rate=0.1, max_depth=10, min_samples_leaf=15,
                               min_samples_split=15, n_estimators=100, subsample=0.4)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
    trans = make_union(
        SimpleImputer(missing_values=missing_values, strategy='most_frequent'),
        MissingIndicator(missing_values=missing_values))
    X_trans = trans.fit_transform(X)
    assert_array_equal(X_trans, X_trans_exp)
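For reference, a small self-contained example of the kind of (X, missing_values, X_trans_exp) triple this parametrized test expects. The concrete values are illustrative and not taken from the original test suite: the union stacks the most-frequent-imputed columns first, followed by the missing-value indicator columns.

import numpy as np
from numpy.testing import assert_array_equal
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.pipeline import make_union

X = np.array([[np.nan, 1.0],
              [2.0, np.nan],
              [2.0, 1.0]])

# First two columns: most-frequent imputation (2.0 and 1.0);
# last two columns: indicator of where values were missing.
X_trans_exp = np.array([[2.0, 1.0, 1.0, 0.0],
                        [2.0, 1.0, 0.0, 1.0],
                        [2.0, 1.0, 0.0, 0.0]])

test_missing_indicator_with_imputer(X, np.nan, X_trans_exp)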
def char_tokenizer(text):  # def line inferred; only the body survives in the source
    return [tok.strip().lower()
            for tok in re.findall(r'\w', re.sub(r'\d', ' ', text))]

train_data = []
train_lbls = []
for line in open("./data/data_set.json", "r"):
    data = json.loads(line)
    train_data.append(data["data"])
    train_lbls.append(int(data["label"] == "EN"))  # class 1 - English, class 0 - Tagalog

# Create three feature extractors (by words, by letter pairs, by letters)
word_vectoriser = TfidfVectorizer(tokenizer=word_tokenizer)
ends_vectoriser = TfidfVectorizer(tokenizer=ending_tokenizer)
char_vectorizer = TfidfVectorizer(tokenizer=char_tokenizer)

# Group the feature extractors and build the pipeline
feature_extractor = make_union(word_vectoriser, ends_vectoriser, char_vectorizer)
pipeline = make_pipeline(feature_extractor, LinearSVC(C=2))

# pipeline.fit(train_data[::2], train_lbls[::2])
# print(f1_score(train_lbls[1::2], pipeline.predict(train_data[1::2])))

scores = cross_validation.cross_val_score(pipeline, train_data, train_lbls,
                                          cv=5, scoring='f1_macro')
print(mean(scores))
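A possible usage sketch once the pipeline is fitted on the full training set; the sample sentences and the expected labels are illustrative assumptions, not output from the original model.

pipeline.fit(train_data, train_lbls)
samples = ["Magandang umaga sa inyong lahat", "Good morning everyone"]
# With the labeling above, the first sample should tend toward class 0 (Tagalog)
# and the second toward class 1 (English).
print(pipeline.predict(samples))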
def predict_with_best_parameters(train, test, class_names):
    submission = pd.DataFrame.from_dict({"id": test["id"]})
    scores = []

    # Assuming the best parameters were already found by grid search
    best_parameters = {
        "toxic": {
            'clf': {'C': 10.0},
            'vect': {'ngram_range': (1, 1), 'stop_words': None,
                     'tokenizer': tokenizer_porter, 'use_idf': False},
        },
        "severe_toxic": {
            'clf': {'C': 1.0},
            'vect': {'max_features': 30000, 'ngram_range': (1, 1),
                     'stop_words': None, 'tokenizer': tokenizer_porter},
        },
        "obscene": {
            'clf': {'C': 10.0},
            'vect': {'ngram_range': (1, 1), 'stop_words': None,
                     'tokenizer': tokenizer_porter, 'use_idf': False},
        },
        "threat": {
            'clf': {'C': 10.0},
            'vect': {'max_features': 30000, 'ngram_range': (1, 1),
                     'stop_words': None, 'tokenizer': tokenizer_porter},
        },
        "insult": {
            'clf': {'C': 1.0},
            'vect': {'max_features': 30000, 'ngram_range': (1, 1),
                     'stop_words': None, 'tokenizer': tokenizer_porter},
        },
        "identity_hate": {
            'clf': {'C': 1.0},
            'vect': {'max_features': 30000, 'ngram_range': (1, 1),
                     'stop_words': None, 'tokenizer': tokenizer_porter},
        },
    }

    for target_class in class_names:
        print("\nWorking with target_class: ", target_class)
        train_target = train[target_class]

        word_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents="unicode",
            analyzer="word",
            **best_parameters[target_class]["vect"])
        char_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents="unicode",
            analyzer="char",
            ngram_range=(1, 4),
            max_features=30000)
        vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=2)

        classifier = LogisticRegression(solver="sag", n_jobs=4,
                                        **best_parameters[target_class]["clf"])
        lr_vectorizer = Pipeline([('vect', vectorizer), ('clf', classifier)])

        start_time = time.time()
        cv_score = np.mean(
            cross_val_score(lr_vectorizer, train_text, train_target,
                            cv=3, scoring="roc_auc", n_jobs=3))
        end_time = time.time()
        print("CV time:", end_time - start_time)
        scores.append(cv_score)
        print("cv_score for class {} : {}".format(target_class, cv_score))

        start_time = time.time()
        lr_vectorizer.fit(train_text, train_target)
        end_time = time.time()
        print("fitting time: ", end_time - start_time)

        start_time = time.time()
        # predict_proba returns two columns: the first is the probability of
        # class 0, the second the probability of class 1, so slice the second.
        submission[target_class] = lr_vectorizer.predict_proba(test_text)[:, 1]
        end_time = time.time()
        print("Prediction time: ", end_time - start_time)

    print("total CV score is: {}".format(np.mean(scores)))
    submission.to_csv("submission_gs_best_est_union_and_piped.csv", index=False)
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        VotingClassifier([("est", LogisticRegression(C=0.0001, dual=False, penalty="l1"))]),
        FunctionTransformer(lambda X: X)),
    GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, n_estimators=500))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)  # 56
import numpy as np
import pandas as pd
from copy import copy
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8194243156199679
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(
            make_pipeline(
                SelectPercentile(score_func=f_classif, percentile=83),
                StandardScaler()
            ),
            FunctionTransformer(copy)
        )
    ),
    PCA(iterated_power=5, svd_solver="randomized"),
    LinearSVC(C=0.001, dual=True, loss="hinge", penalty="l2", tol=0.001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from copy import copy
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: -0.00023017383030155843
exported_pipeline = make_pipeline(
    make_union(MaxAbsScaler(), FunctionTransformer(copy)),
    XGBRegressor(learning_rate=0.1, max_depth=10, min_child_weight=1,
                 n_estimators=100, nthread=1, subsample=0.8))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.6814117647058824
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=KNeighborsClassifier(n_neighbors=23, p=1, weights="distance")),
        make_pipeline(
            StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False, criterion="gini",
                                                             max_features=0.9500000000000001,
                                                             min_samples_leaf=12,
                                                             min_samples_split=16,
                                                             n_estimators=100)),
            StackingEstimator(estimator=MultinomialNB(alpha=10.0, fit_prior=True)),
            VarianceThreshold(threshold=0.01)
        )
    ),
    StackingEstimator(estimator=MultinomialNB(alpha=0.1, fit_prior=False)),
    RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.1,
                           min_samples_leaf=4, min_samples_split=14, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)