Example #1
def test_make_union_kwargs():
    pca = PCA(svd_solver='full')
    mock = Transf()
    fu = make_union(pca, mock, n_jobs=3)
    assert_equal(fu.transformer_list, make_union(pca, mock).transformer_list)
    assert_equal(3, fu.n_jobs)
    # invalid keyword parameters should raise an error message
    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "transformer_weights"',
        make_union, pca, mock, transformer_weights={'pca': 10, 'Transf': 1}
    )
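make_union builds transformer names automatically and accepts only n_jobs (and, in newer releases, verbose) as keyword arguments; per-transformer weights require constructing FeatureUnion directly. A minimal sketch under that assumption:

from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler

# Weighted union built explicitly, since make_union rejects transformer_weights
weighted_union = FeatureUnion(
    [('pca', PCA()), ('scaler', StandardScaler())],
    transformer_weights={'pca': 10, 'scaler': 1},
)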
Example #2
def get_results(dataset):
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full,
                                  scoring='neg_mean_squared_error')

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=bool),
                                 np.ones(n_missing_samples,
                                         dtype=bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # Estimate the score after replacing missing values by 0
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (mean strategy) of the missing values
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = make_pipeline(
        make_union(SimpleImputer(missing_values=0, strategy="mean"),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after chained imputation of the missing values
    estimator = make_pipeline(
        make_union(ChainedImputer(missing_values=0, random_state=0),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                            scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (chained_impute_scores.mean(), chained_impute_scores.std()))
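get_results relies on a module-level rng (a NumPy RandomState in the original script) and on ChainedImputer, an early development name for what released scikit-learn versions ship as IterativeImputer. A hedged sketch of the equivalent chained-imputation step with the released API:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, MissingIndicator
from sklearn.pipeline import make_pipeline, make_union

rng = np.random.RandomState(0)  # the module-level RNG assumed by get_results

# Union of imputed values and a missing-value indicator mask, fed to the regressor
chained_estimator = make_pipeline(
    make_union(IterativeImputer(missing_values=0, random_state=0),
               MissingIndicator(missing_values=0)),
    RandomForestRegressor(random_state=0, n_estimators=100))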
Example #3
    def __init__(self, training_values=None, training_targets=None):
        self.vectorizer = make_union(TfidfVectorizer(), PostTransformer())
        # Set using parameter_search. TODO: review after updating
        # corpus.
        self.classifier = svm.LinearSVC(C=1, loss='squared_hinge', multi_class='ovr',
                                        class_weight='balanced', tol=1e-6)
        if training_values is not None and training_targets is not None:
            self.fit(training_values, training_targets)
Example #4
def PipelineTelstra(Classifier):
    pipeline = make_pipeline(
        make_union(
            make_pipeline(
                DataSpliterTrans(cols='location',transp=True),
                preprocessing.OneHotEncoder(handle_unknown='ignore')
            ),
            make_pipeline(
                DataSpliterTrans(cols='event_type',matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='severity_type',matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='resource_type',matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='volume',matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='log_feature',matrix=True),
                DictVectorizer()
            )
        ),
        Classifier()
        )
    print('pipeline done.')
    return pipeline
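A hedged usage sketch: PipelineTelstra takes a classifier class (not an instance) and instantiates it as the final pipeline step, so a call looks like this (data loading not shown):

from sklearn.linear_model import LogisticRegression

telstra_pipeline = PipelineTelstra(LogisticRegression)
# telstra_pipeline.fit(X_train, y_train)
# predictions = telstra_pipeline.predict_proba(X_test)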
Example #5
    def __init__(self, transforms):
        self.transforms = transforms

        union = make_union(*[t() for t in transforms])
        pipeline = [union]
        self.pipeline = make_pipeline(*pipeline)
        self.classifier = LogisticRegression(penalty="l1", class_weight="auto")
Example #6
def test_make_union():
    pca = PCA()
    mock = TransfT()
    fu = make_union(pca, mock)
    names, transformers = zip(*fu.transformer_list)
    assert_equal(names, ("pca", "transft"))
    assert_equal(transformers, (pca, mock))
Example #7
def PipelineTelstra(Classifier):
    pipeline = make_pipeline(
        make_union(
            make_pipeline(
                DataSpliterTrans(cols='event_type',matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='severity_type',matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='resource_type',matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='volume',matrix=True),
                DictVectorizer()
            ),
            make_pipeline(
                DataSpliterTrans(cols='log_feature',matrix=True),
                DictVectorizer()
            )
        ),
        Classifier()
        )
    print('pipeline done.')
    return pipeline
Example #8
    def __init__(self, classifier="sgd", classifier_args=None, lowercase=True,
                 text_replacements=None, map_to_synsets=False, binary=False,
                 min_df=0, ngram=1, stopwords=None, limit_train=None,
                 map_to_lex=False, duplicates=False):
        self.limit_train = limit_train
        self.duplicates = duplicates

        pipeline = [ExtractText(lowercase)]
        if text_replacements:
            pipeline.append(ReplaceText(text_replacements))

        ext = [build_text_extraction(binary=binary, min_df=min_df,
                                    ngram=ngram, stopwords=stopwords)]

        if map_to_synsets:
            ext.append(build_synset_extraction(binary=binary, min_df=min_df,
                                               ngram=ngram))
        if map_to_lex:
            ext.append(build_lex_extraction(binary=binary, min_df=min_df,
                                            ngram=ngram))
        ext = make_union(*ext)
        pipeline.append(ext)

        #Building classifier
        if classifier_args is None:
            classifier_args={}
        classifier = _valid_classifiers[classifier](**classifier_args)
        self.pipeline = make_pipeline(*pipeline)
        self.classifier = classifier
Example #9
def get_extra_features(args):
    forest = ExtraTreesClassifier(n_estimators=2000,
                                  criterion='entropy',
                                  max_features='sqrt',
                                  max_depth=6,
                                  min_samples_split=8,
                                  n_jobs=-1,
                                  bootstrap=True,
                                  oob_score=True,
                                  verbose=1,
                                  class_weight='balanced')
    pca = PCA(n_components=200)
    ica = FastICA(n_components=200, max_iter=1000)
    kmeans = KMeans(n_clusters=200, n_init=20, max_iter=1000)

    pipeline = make_pipeline(selectKFromModel(forest, k=1000),
                             StandardScaler(),
                             make_union(pca, ica, kmeans))

    X_train = np.load('feature/1_100/X_train.npy')
    y_train = np.load('feature/1_100/y_train.npy')
    X_test = np.load('feature/1_100/X_test.npy')

    pipeline.fit(X_train, y_train[:, args.yix])
    sel_ixs = pipeline.steps[0][1].indices[:500]
    X_train_ext = np.hstack((pipeline.transform(X_train), X_train[:, sel_ixs]))
    X_test_ext = np.hstack((pipeline.transform(X_test), X_test[:, sel_ixs]))

    with open(path.join(save_dir, 'pipe.pkl'), 'wb') as f_pipe:
        pickle.dump(pipeline, f_pipe)

    np.save(path.join(save_dir, 'selix.npy'), sel_ixs)
    return X_train_ext, X_test_ext
Example #10
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
    trans = make_union(
        SimpleImputer(missing_values=missing_values, strategy='most_frequent'),
        MissingIndicator(missing_values=missing_values)
    )
    X_trans = trans.fit_transform(X)
    assert_array_equal(X_trans, X_trans_exp)
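This test is driven by a pytest.mark.parametrize decorator that is not shown here; a hedged illustration of the kind of fixture it receives (values invented for clarity, not taken from the scikit-learn test suite):

import numpy as np

X = np.array([[np.nan, 1.],
              [2., np.nan]])
missing_values = np.nan
# most_frequent imputation output stacked with the missing-value indicator mask
X_trans_exp = np.array([[2., 1., True, False],
                        [2., 1., False, True]])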
Example #11
def get_pipeline(fsmethods, clfmethod):
    """Returns an instance of a sklearn Pipeline given the parameters
    fsmethod1 and fsmethod2 will be joined in a FeatureUnion, then it will joined
    in a Pipeline with clfmethod

    Parameters
    ----------
    fsmethods: list of estimators
        All estimators in a pipeline, must be transformers (i.e. must have a transform method).

    clfmethod: classifier
        The last estimator may be any type (transformer, classifier, etc.).

    Returns
    -------
    pipe
    """
    feat_union = None
    if not isinstance(fsmethods, list):
        if hasattr(fsmethods, 'transform'):
            feat_union = fsmethods
        else:
            raise ValueError('fsmethods expected to be either a list or a transformer method')
    else:
        feat_union = make_union(*fsmethods)

    if feat_union is None:
        pipe = make_pipeline(clfmethod)
    else:
        pipe = make_pipeline(feat_union, clfmethod)

    return pipe
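A hedged usage sketch of get_pipeline with stock scikit-learn transformers and a classifier:

from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.svm import SVC

pipe = get_pipeline([VarianceThreshold(), SelectKBest(k=10)], SVC())
# pipe.fit(X_train, y_train); y_pred = pipe.predict(X_test)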
Example #12
    def preprocess(self,any_set,is_train):

        if is_train:
            dico_pattern={'match_lowercase_only':'\\b[a-z]+\\b',
              'match_word':'\\w{2,}',
              'match_word1': '(?u)\\b\\w+\\b',
              'match_word_punct': '\w+|[,.?!;]',
              'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
              'match_punct': "[,.?!;'-]"
             }

            tfv_title = TfidfVectorizer(lowercase=True, stop_words='english', token_pattern=dico_pattern["match_word1"],
                              ngram_range=(1, 2), max_df=1.0, min_df=2, max_features=None,
                              vocabulary=None, binary=True, norm=u'l2',
                              use_idf=True, smooth_idf=True, sublinear_tf=True)

            tfv_desc = TfidfVectorizer(lowercase=True, stop_words='english', token_pattern=dico_pattern["match_word1"],
                              ngram_range=(1, 2), max_df=1.0, min_df=2, max_features=None,
                              vocabulary=None, binary=True, norm=u'l2',
                              use_idf=True, smooth_idf=True, sublinear_tf=True)

            title_pipe = make_pipeline(ColumnSelector(key='title'), tfv_title)
            desc_pipe = make_pipeline(ColumnSelector(key='description'), tfv_desc)
            self.pipeline = make_union(title_pipe, desc_pipe)

            return self.pipeline.fit_transform(any_set)
        else:
            return self.pipeline.transform(any_set)
Example #13
def pca_kpca(train_data, labels):
    # make_union already returns a FeatureUnion, so it must not be wrapped in
    # FeatureUnion a second time; use it (or an explicit (name, transformer)
    # list) directly.
    combined = make_union(PCA(), TruncatedSVD(), KernelPCA())
#    combined = FeatureUnion([('linear_pca', PCA()), ('kernel_pca', KernelPCA())])
    combined.fit(train_data, labels)  # or combined.fit_transform(train_data, labels)

    return combined
Example #14
    def __init__(self, **config):
        # Validate options are present
        for option in _configuration_options:
            if option not in config:
                raise ValueError("Missing configuration "
                                 "option {!r}".format(option))

        # Feature extraction
        sparse_features = parse_features(config["sparse_features"])
        densifier = make_pipeline(Vectorizer(sparse_features, sparse=True),
                                  ClassifierAsFeature())
        dense_features = parse_features(config["dense_features"])
        vectorization = make_union(densifier,
                                   Vectorizer(dense_features, sparse=False))

        # Classifier
        try:
            classifier = _valid_classifiers[config["classifier"]]
        except KeyError:
            raise ValueError("Unknown classification algorithm "
                             "{!r}".format(config["classifier"]))
        classifier = classifier(**config["classifier_args"])

        self.pipeline = make_pipeline(vectorization, StandardScaler())
        self.classifier = classifier
Example #15
def test_make_union():
    pca = PCA(svd_solver='full')
    mock = Transf()
    fu = make_union(pca, mock)
    names, transformers = zip(*fu.transformer_list)
    assert_equal(names, ("pca", "transf"))
    assert_equal(transformers, (pca, mock))
Example #16
def get_scores_for_imputer(imputer, X_missing, y_missing):
    estimator = make_pipeline(
        make_union(imputer, MissingIndicator(missing_values=0)),
        REGRESSOR)
    impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                    scoring='neg_mean_squared_error',
                                    cv=N_SPLITS)
    return impute_scores
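REGRESSOR and N_SPLITS are module-level constants in the original script. A hedged usage sketch, with SimpleImputer standing in for any imputer:

from sklearn.impute import SimpleImputer

mean_impute_scores = get_scores_for_imputer(
    SimpleImputer(missing_values=0, strategy='mean'), X_missing, y_missing)
print(mean_impute_scores.mean(), mean_impute_scores.std())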
Example #17
    def fit(self, X, y):
        # Filthy hack
        sids = X[:, -1]
        all_pipelines = [make_pipeline(LogisticRegressionCV()).fit(X_s, y_s) for
                         X_s, y_s in subject_splitter(X[:, :-1], y, sids)]
        f_union = make_union(*[FeatureUnionWrapper(p) for p in all_pipelines])
        self.clf_ = make_pipeline(f_union, LogisticRegressionCV()).fit(X[:, :-1], y)
        return self
def _create_feature_union(features):
    """
    Create a FeatureUnion.
    Each "feature" is a 3-tuple: (name, feature_extractor, vectorizer).
    """
    return make_union(*[
        make_pipeline(fe, vec)
        for name, fe, vec in features
    ])
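A hedged usage sketch: each 3-tuple pairs an extractor with a vectorizer, and the name element is ignored by make_union itself (the identity extractors below are illustrative only):

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer

features = [
    ('words', FunctionTransformer(lambda docs: docs, validate=False), CountVectorizer()),
    ('chars', FunctionTransformer(lambda docs: docs, validate=False),
     TfidfVectorizer(analyzer='char', ngram_range=(2, 3))),
]
union = _create_feature_union(features)
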
def make_pipe(classifier):
    language_featurizer = make_union(CountVectorizer(),
                                     FunctionFeaturizer(longest_run_of_capital_letters_feature,
                                                    percent_character_feature,
                                                    percent_character_combinations,
                                                    longest_run_of_character_feature,
                                                    character_combinations_binary
                                                    ))
    return make_pipeline(language_featurizer, classifier)
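A hedged usage sketch (FunctionFeaturizer and the feature functions are defined elsewhere in the original module):

from sklearn.naive_bayes import MultinomialNB

language_clf = make_pipe(MultinomialNB())
# language_clf.fit(texts_train, labels_train)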
Example #20
    def __init__(self, transforms, n_estimators=2000, criterion='gini', min_samples_leaf=2, n_jobs=-1):
        self.transforms = transforms
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.min_samples_leaf = min_samples_leaf
        self.n_jobs = n_jobs

        union = make_union(*[t() for t in transforms])
        pipeline = [union]

        self.pipeline = make_pipeline(*pipeline)
        self.classifier = RandomForestClassifier(n_estimators, criterion, min_samples_leaf=min_samples_leaf, n_jobs=-1)
Example #21
def create_input_transformer(fields, vec_name):
    """Create a pipeline of input transformations, allowing to use scaling of input fields."""
    pipeline = []
    for field in fields:
        field_name = field['name']
        field_scale = field['scale']
        field_type = processed_db.get_field_type(field_name)

        pipeline.append(
            make_pipeline(ItemSelector(field_name),             # select the correct column
                          Vectorizer(vec_name, field_type),     # vectorize (depending on str/numeric input)
                          Scaler(field_scale))                  # scale column based on user input
        )

    return make_union(*pipeline)
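A hedged usage sketch: each entry in fields carries a column name and a scaling flag, and vec_name selects the vectorizer (the field names and the 'count' value below are hypothetical):

fields = [{'name': 'age', 'scale': True},
          {'name': 'occupation', 'scale': False}]
input_transformer = create_input_transformer(fields, vec_name='count')
# X = input_transformer.fit_transform(raw_records)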
Example #22
def build_prediction():
    p_age = make_pipeline(
        make_union(
            OneHotTransformer(lambda x: x[1]['phone_brand'].lower()),
            OneHotTransformer(lambda x: x[1]['device_model'].lower()),
            TfidfVectorizer(preprocessor=lambda x: ' '.join(x[1]['app_id']))
        ),
        LogisticRegression()
    )

    x_train = [(x, y) for x, y in PERSONS.items()]
    x_test = [(x, y) for x, y in PERSONS_TESTS.items()]
    y_train_age = [y.get('group') for y in PERSONS.values()]

    print "fit age predictor"
    p_age.fit(x_train, y_train_age)
    print "predicting age"
    classes = p_age.classes_
    age_prediction = p_age.predict_proba(x_test)
    return classes, age_prediction
Example #23
    def preprocess(self,any_set,is_train):

        if is_train:

            tfv_text = TfidfVectorizer(lowercase=True, max_features=2500)
            tfv_topics = TfidfVectorizer(lowercase=True, max_features=20)

            clf = MultinomialNB(alpha=0.05, fit_prior=True, class_prior=None)
            title_pipe = make_pipeline(ColumnSelector(key=u'title'), tfv_text)
            topics_pipe = make_pipeline(ColumnSelector(key=u'topicIds'), tfv_topics)
            rel_topic_pipe = make_pipeline(ColumnSelector(key=u'relevantTopicIds'), tfv_topics)
            text_pipe = make_pipeline(ColumnSelector(key=u'description'), tfv_text)



            self.pipeline = make_union(title_pipe, topics_pipe,rel_topic_pipe,text_pipe)



            return self.pipeline.fit_transform(any_set)
        else:
            return  self.pipeline.transform(any_set)
Example #24
def PipelineBNP(Classifier):
    pipeline = make_pipeline(
        NulltoNanTrans(),
        make_union(
            make_pipeline(
                DataSpliterTrans(dtype=np.float64),
                Imputer(strategy='median')
            ),
            make_pipeline(
                DataSpliterTrans(dtype=int),
                Imputer(strategy='most_frequent'),
                preprocessing.OneHotEncoder(handle_unknown='ignore')
            ),
            make_pipeline(
                DataSpliterTrans(dtype=object),
                ObjtoCatStrtoIntTrans(),
                Imputer(strategy='most_frequent'),
                preprocessing.OneHotEncoder(handle_unknown='ignore')
            )
        ),
        Classifier()
        )
    print('pipeline done.')
    return pipeline
def run_lr(train, test, y_train, num_models):
    feature_extractor = make_union(create_basic_feature_extractor(),
                                   create_BoW_feature_extractor())
    X_train = feature_extractor.fit_transform(train)
    X_test = feature_extractor.transform(test)
    return [lr_predict(X_train, y_train, X_test) for _ in range(num_models)]
Example #26
    def __init__(self,
                 classifier="sgd",
                 classifier_args=None,
                 lowercase=True,
                 text_replacements=None,
                 map_to_synsets=False,
                 binary=False,
                 min_df=0,
                 ngram=1,
                 stopwords=None,
                 limit_train=None,
                 map_to_lex=False,
                 duplicates=False):
        """
        Parameter description:
            - `classifier`: The type of classifier used as main classifier,
              valid values are "sgd", "knn", "svc", "randomforest".
            - `classifier_args`: A dict to be passed as arguments to the main
              classifier.
            - `lowercase`: whether or not all words are lowercased at the start of
              the pipeline.
            - `text_replacements`: A list of tuples `(from, to)` specifying
              string replacements to be made at the start of the pipeline (after
              lowercasing).
            - `map_to_synsets`: Whether or not to use the Wordnet synsets
              feature set.
            - `binary`: Whether or not to count words in the bag-of-words
              representation as 0 or 1.
            - `min_df`: Minimum frequency a word needs to have to be included
              in the bag-of-words representation.
            - `ngram`: The maximum size of ngrams to be considered in the
              bag-of-words representation.
            - `stopwords`: A list of words to filter out of the bag-of-words
              representation. Can also be the string "english", in which case
              a default list of english stopwords will be used.
            - `limit_train`: The maximum amount of training samples to give to
              the main classifier. This can be useful for some slow main
              classifiers (ex: svc) that converge with less samples to an
              optimum.
            - `map_to_lex`: Whether or not to use the Harvard Inquirer lexicon
              features.
            - `duplicates`: Whether or not to check for identical phrases between
              train and prediction.
        """
        self.limit_train = limit_train
        self.duplicates = duplicates

        # Build pre-processing common to every extraction
        pipeline = [ExtractText(lowercase)]
        if text_replacements:
            pipeline.append(ReplaceText(text_replacements))

        # Build feature extraction schemes
        ext = [
            build_text_extraction(binary=binary,
                                  min_df=min_df,
                                  ngram=ngram,
                                  stopwords=stopwords)
        ]
        if map_to_synsets:
            ext.append(
                build_synset_extraction(binary=binary,
                                        min_df=min_df,
                                        ngram=ngram))
        if map_to_lex:
            ext.append(
                build_lex_extraction(binary=binary, min_df=min_df,
                                     ngram=ngram))
        ext = make_union(*ext)
        pipeline.append(ext)
        # Build classifier and put everything together
        if classifier_args is None:
            classifier_args = {}
        classifier = _valid_classifiers[classifier](**classifier_args)
        self.pipeline = make_pipeline(*pipeline)
        self.classifier = classifier
Example #27
    def __init__(self, classifier="sgd", classifier_args=None, lowercase=True,
                 text_replacements=None, map_to_synsets=False, binary=False,
                 min_df=0, ngram=1, stopwords=None, limit_train=None,
                 map_to_lex=False, duplicates=False):
        """
        Parameter description:
            - `classifier`: The type of classifier used as main classifier,
              valid values are "sgd", "knn", "svc", "randomforest".
            - `classifier_args`: A dict to be passed as arguments to the main
              classifier.
            - `lowercase`: whether or not all words are lowercased at the start of
              the pipeline.
            - `text_replacements`: A list of tuples `(from, to)` specifying
              string replacements to be made at the start of the pipeline (after
              lowercasing).
            - `map_to_synsets`: Whether or not to use the Wordnet synsets
              feature set.
            - `binary`: Whether or not to count words in the bag-of-words
              representation as 0 or 1.
            - `min_df`: Minimum frequency a word needs to have to be included
              in the bag-of-words representation.
            - `ngram`: The maximum size of ngrams to be considered in the
              bag-of-words representation.
            - `stopwords`: A list of words to filter out of the bag-of-words
              representation. Can also be the string "english", in which case
              a default list of english stopwords will be used.
            - `limit_train`: The maximum amount of training samples to give to
              the main classifier. This can be useful for some slow main
              classifiers (ex: svc) that converge with less samples to an
              optimum.
            - `map_to_lex`: Whether or not to use the Harvard Inquirer lexicon
              features.
            - `duplicates`: Whether or not to check for identical phrases between
              train and prediction.
        """
        self.limit_train = limit_train
        self.duplicates = duplicates

        # Build pre-processing common to every extraction
        pipeline = [ExtractText(lowercase)]
        if text_replacements:
            pipeline.append(ReplaceText(text_replacements))

        # Build feature extraction schemes
        ext = [build_text_extraction(binary=binary, min_df=min_df,
                                     ngram=ngram, stopwords=stopwords)]
        if map_to_synsets:
            ext.append(build_synset_extraction(binary=binary, min_df=min_df,
                                               ngram=ngram))
        if map_to_lex:
            ext.append(build_lex_extraction(binary=binary, min_df=min_df,
                                            ngram=ngram))
        ext = make_union(*ext)
        pipeline.append(ext)

        # Build classifier and put everything together
        if classifier_args is None:
            classifier_args = {}
        classifier = _valid_classifiers[classifier](**classifier_args)
        self.pipeline = make_pipeline(*pipeline)
        self.classifier = classifier
# In[13]:

# full pipeline for data engineering

full_pipeline = Pipeline(steps=[
    (
        "features",
        make_union(
            make_pipeline(DataFrameSelector(["Embarked"]), MostFrequentImputer(
            ), CategoricalEncoder(encoding='onehot-dense')),
            make_pipeline(DataFrameSelector(["Pclass", "Sex"]),
                          CategoricalEncoder(encoding='onehot-dense')),
            make_pipeline(DataFrameSelector(["Age", "Fare"]),
                          Imputer(strategy="median"), StandardScaler()),
            make_pipeline(DataFrameSelector(["Name"]), ExtractTitle(),
                          CategoricalEncoder(encoding='onehot-dense')),
            #make_pipeline(DataFrameSelector(["Cabin"]), FillMissingCabin(), ExtractCabin(), CategoricalEncoder(encoding='onehot-dense')),
            make_pipeline(DataFrameSelector(["Cabin"]), HasCabin()),
            make_pipeline(DataFrameSelector(["SibSp", "Parch"]),
                          CreateFamilySize(),
                          CategoricalEncoder(encoding='onehot-dense')),
        )),
    ("poly", PolynomialFeatures()),
    #("PCA", PCA(n_components=0.95)),
    #("best", SelectKBest(k=20)),
    ("clf", RandomForestClassifier(random_state=42))
])
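# A hedged usage sketch (not part of the original notebook): `train_df` and its
# "Survived" target column are assumed to exist from earlier cells.
# full_pipeline.fit(train_df.drop("Survived", axis=1), train_df["Survived"])
# survival_pred = full_pipeline.predict(test_df)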

# **STEP 6 - Splitting the training dataset**
#
Example #29
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(lambda X: X),
               FunctionTransformer(lambda X: X)),
    RandomForestClassifier(n_estimators=500))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Example #30
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    RandomizedPCA(iterated_power=10),
    make_union(VotingClassifier([("est", LinearSVC(C=0.59, dual=False, penalty="l1"))]), FunctionTransformer(lambda X: X)),
    RandomForestClassifier(n_estimators=500)
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Example #31
TOKENIZER = re.compile('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(string.punctuation))


def tokenize(s):
    return TOKENIZER.sub(r' \1 ', s).split()


vectorizer = make_union(
    on_field(
        'question_text',
        TfidfVectorizer(max_features=13000,
                        token_pattern='\w+',
                        strip_accents='unicode',
                        tokenizer=tokenize,
                        sublinear_tf=True)),
    on_field('question_text',
             TfidfVectorizer(ngram_range=(3, 3), analyzer='char', min_df=25)),
    make_pipeline(
        PandasSelector(columns=[
            'num_words',
            'num_singletons',
            'caps_vs_length',
        ],
                       return_vector=False), MaxAbsScaler()),
)

with timer('process train'):
    # df_train = pd.read_csv(os.path.join(INPUT_PATH, "train.csv"))
    df_train = joblib.load('train.pkl')
    df_test = pd.read_csv(os.path.join(INPUT_PATH, "test.csv"))
    # df_test = joblib.load('valid_for_emsemble.pkl')
    train_count = len(df_train)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, Normalizer
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.839915792130982
exported_pipeline = make_pipeline(
    make_union(
        MinMaxScaler(),
        FunctionTransformer(copy)
    ),
    Normalizer(norm="max"),
    XGBClassifier(learning_rate=0.01, max_depth=3, min_child_weight=7, n_estimators=600, nthread=1, subsample=0.8)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #33
import numpy as np
import pandas as pd
from copy import copy
from sklearn.cluster import FeatureAgglomeration
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import RidgeCV, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -5.853055578955521
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=RidgeCV()),
        make_pipeline(
            FeatureAgglomeration(affinity="manhattan", linkage="complete"),
            Nystroem(gamma=0.30000000000000004,
                     kernel="sigmoid",
                     n_components=5))),
    FeatureAgglomeration(affinity="cosine", linkage="average"),
    SGDRegressor(alpha=0.0,
                 eta0=0.01,
                 fit_intercept=False,
                 l1_ratio=1.0,
                 learning_rate="invscaling",
                 loss="epsilon_insensitive",
                 penalty="elasticnet",
                 power_t=100.0))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)
# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8326392221287445
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            make_union(
                make_union(
                    FunctionTransformer(copy),
                    StackingEstimator(estimator=RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.35000000000000003, min_samples_leaf=1, min_samples_split=7, n_estimators=100))
                ),
                make_union(
                    FunctionTransformer(copy),
                    FunctionTransformer(copy)
                )
            ),
            SelectPercentile(score_func=f_classif, percentile=58)
        ),
        FunctionTransformer(copy)
    ),
    MultinomialNB(alpha=0.1, fit_prior=True)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #35
    })],
    input_df=True,
    df_out=True,
    default=False)

engineered_feature_pipeline4 = skp.DataFrameMapper(
    [(['c1', 'c2', 'c3', 'c4', 'c5'], uf.Straight(), {
        'alias': 'has_straight'
    })],
    input_df=True,
    df_out=True,
    default=False)

# here we lose feature names
features_pipeline = ppl.make_union(engineered_feature_pipeline1,
                                   engineered_feature_pipeline2,
                                   engineered_feature_pipeline3,
                                   engineered_feature_pipeline4)

temp = d_in[d_in['hand'] == '8']
#features_pipeline.fit_transform(temp).head()
a = features_pipeline.fit_transform(temp)
a[0:10, ]

# modelling complete pipeline
pipe = ppl.Pipeline([
    ('prep', features_pipeline), ('encoding', ppr.OneHotEncoder()),
    ('clf',
     LogisticRegression(multi_class='multinomial',
                        penalty='l2',
                        random_state=9546,
                        solver="lbfgs"))
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.504247990815155
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), ZeroCount()),
    XGBClassifier(learning_rate=0.001,
                  max_depth=3,
                  min_child_weight=3,
                  n_estimators=100,
                  nthread=1,
                  subsample=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
    def __init__(self, classifier="sgd", classifier_args=None, lowercase=True,
                 text_replacements=None, map_to_synsets=True, binary=False,
                 min_df=0, ngram=1, stopwords=None, limit_train=None,
                 map_to_lex=True, duplicates=True, svm_features=False,
                 preprocessor=False, useLemmatization=True, stemming=False,
                 useStopWords=True, word2vecFeatures=False, splitModel=False,
                 useTfIdf=False):
        """
        Parameter description:
            - `classifier`: The type of classifier used as main classifier,
              valid values are "sgd", "knn", "svc", "randomforest".
            - `classifier_args`: A dict to be passed as arguments to the main
              classifier.
            - `lowercase`: whether or not all words are lowercased at the start of
              the pipeline.
            - `text_replacements`: A list of tuples `(from, to)` specifying
              string replacements to be made at the start of the pipeline (after
              lowercasing).
            - `map_to_synsets`: Whether or not to use the Wordnet synsets
              feature set.
            - `binary`: Whether or not to count words in the bag-of-words
              representation as 0 or 1.
            - `min_df`: Minimum frequency a word needs to have to be included
              in the bag-of-words representation.
            - `ngram`: The maximum size of ngrams to be considered in the
              bag-of-words representation.
            - `stopwords`: A list of words to filter out of the bag-of-words
              representation. Can also be the string "english", in which case
              a default list of english stopwords will be used.
            - `limit_train`: The maximum amount of training samples to give to
              the main classifier. This can be useful for some slow main
              classifiers (ex: svc) that converge with less samples to an
              optimum.
            - `map_to_lex`: Whether or not to use the Harvard Inquirer lexicon
              features.
            - `duplicates`: Whether or not to check for identical phrases between
              train and prediction.
            - `svm_features`: Whether or not to include features from an SVM classifier
        """
        self.limit_train = limit_train
        self.duplicates = duplicates
        print("Using tfidf: ", useTfIdf)
        # Build pre-processing common to every extraction
        pipeline = [Preprocessor(removeStopWords=useStopWords, lemmatize=useLemmatization, stem=stemming)] if preprocessor else [ExtractText(lowercase)]
        if text_replacements:
            pipeline.append(ReplaceText(text_replacements))

        # Build feature extraction schemes
        ext = [build_text_extraction(binary=binary, min_df=min_df,
                                     ngram=ngram, stopwords=stopwords, useTfIdf=useTfIdf)]
        if map_to_synsets:
            ext.append(build_synset_extraction(binary=binary, min_df=min_df,
                                               ngram=ngram, useTfIdf=useTfIdf))
        if map_to_lex:
            ext.append(build_lex_extraction(binary=binary, min_df=min_df,
                                            ngram=ngram))
        if svm_features:
            ext.append(build_svm_features())
        if word2vecFeatures:
            ext.append(build_word2vec_features())
        ext = make_union(*ext)
        pipeline.append(ext)

        # Build classifier and put everything together
        if classifier_args is None:
            classifier_args = {}
        if classifier == "ensemble":
            classifier_args = {"classifiers": [SGDClassifier(), RandomForestClassifier(), SVC(),KNeighborsClassifier(), RandomForestClassifier(n_estimators= 100, min_samples_leaf=10, n_jobs=-1)]}
        #Classifier constructor E.g. SGDClassifier(args)
        classifier = _valid_classifiers[classifier](**classifier_args)
        self.pipeline = make_pipeline(*pipeline)
        self.classifier = classifier
        self.splitModel = splitModel
        self.splitSize = 1
def get_feature_pipeline(tr_artifact,
                         hist_artifacts,
                         all_data,
                         cachedir='data/'):
    """Define feature transformation pipeline."""
    return make_pipeline(
        make_union(
            identity(input_cols=[
                FieldNames.customer_id,
                FieldNames.coupon_id,
                FieldNames.rented,
                FieldNames.age_range,
                FieldNames.marital_status,
                FieldNames.no_of_children,
                FieldNames.family_size,
                FieldNames.income_bracket,
            ]),
            make_pipeline(SelectCols(cols=[FieldNames.campaign_type]),
                          OrdinalEncoder()),
            # make_pipeline(
            #     SelectCols(cols=[FieldNames.cust_cohort]),
            #     OneHotEncoder(handle_unknown='ignore')
            # ),
            make_pipeline(
                GroupCatCatNUnique(FieldNames.campaign_id,
                                   FieldNames.customer_id)),
            make_pipeline(
                GroupCatCatNUnique(FieldNames.campaign_id,
                                   FieldNames.coupon_id)),
            # make_pipeline(
            #     SelectCols(cols=[FieldNames.campaign_id]),
            #     GroupCatCountEncoder()
            # ),
            make_pipeline(
                ExpandingMean(
                    date_col=FieldNames.campaign_start_date,
                    user_col=FieldNames.customer_id,
                    key_col=FieldNames.target,
                    hist_artifact=tr_artifact,
                ), ),
            make_pipeline(
                ExpandingCount(
                    date_col=FieldNames.campaign_start_date,
                    user_col=FieldNames.customer_id,
                    key_col=FieldNames.target,
                    hist_artifact=tr_artifact,
                ), ),
            # make_pipeline(
            #     ExpandingMedian(
            #         date_col=FieldNames.campaign_start_date,
            #         user_col=FieldNames.customer_id,
            #         key_col=FieldNames.transaction_day,
            #         hist_artifact=hist_artifacts[0]
            #         )
            # ),
            # make_pipeline(
            #     ExpandingSum(
            #         date_col=FieldNames.campaign_start_date,
            #         user_col=FieldNames.customer_id,
            #         key_col=FieldNames.target,
            #         hist_artifact=tr_artifact,
            #     )
            # ),
            # make_pipeline(
            #     ExpandingCount(
            #         date_col=FieldNames.campaign_start_date,
            #         user_col=FieldNames.customer_id,
            #         key_col=FieldNames.coupon_discount,
            #         hist_artifact=hist_artifacts[0],
            #     )
            # ),
            # make_pipeline(
            #     ExpandingMean(
            #         date_col=FieldNames.campaign_start_date,
            #         user_col=FieldNames.customer_id,
            #         key_col=FieldNames.selling_price,
            #         hist_artifact=hist_artifacts[0],
            #     )
            # ),
            # make_pipeline(
            #     ExpandingMean(
            #         date_col=FieldNames.campaign_start_date,
            #         user_col=FieldNames.customer_id,
            #         key_col=FieldNames.coupon_discount,
            #         hist_artifact=hist_artifacts[1],
            #     )
            # ),
            # make_pipeline(
            #     ExpandingSum(
            #         date_col=FieldNames.campaign_start_date,
            #         user_col=FieldNames.customer_id,
            #         key_col=FieldNames.selling_price,
            #         hist_artifact=hist_artifacts[1],
            #     )
            # ),
            # make_pipeline(
            #     ExpandingMax(
            #         date_col=FieldNames.campaign_start_date,
            #         user_col=FieldNames.customer_id,
            #         key_col=FieldNames.pct_discount,
            #         hist_artifact=hist_artifacts[0],
            #     )
            # ),
            make_pipeline(
                make_union(
                    SetAggregation(
                        date_col=FieldNames.campaign_start_date,
                        user_col=FieldNames.customer_id,
                        key_col=FieldNames.item_set,
                        hist_artifact=hist_artifacts[0],
                    ),
                    SelectCols(cols=[FieldNames.item_set]),
                ), CountCommon()),
            make_pipeline(
                make_union(
                    SetAggregation(
                        date_col=FieldNames.campaign_start_date,
                        user_col=FieldNames.customer_id,
                        key_col=FieldNames.item_set,
                        hist_artifact=hist_artifacts[0],
                    ),
                    SelectCols(cols=[FieldNames.item_set]),
                ), Jaccard()),
            make_pipeline(
                make_union(
                    SetAggregation(
                        date_col=FieldNames.campaign_start_date,
                        user_col=FieldNames.customer_id,
                        key_col=FieldNames.item_set,
                        hist_artifact=hist_artifacts[1],
                    ),
                    SelectCols(cols=[FieldNames.item_set]),
                ),
                CountCommon(),
            ),
            make_pipeline(
                make_union(
                    SetAggregation(
                        date_col=FieldNames.campaign_start_date,
                        user_col=FieldNames.customer_id,
                        key_col=FieldNames.item_set,
                        hist_artifact=hist_artifacts[1],
                    ),
                    SelectCols(cols=[FieldNames.item_set]),
                ), Jaccard(),
                QuantileTransformer(output_distribution='normal')),
            # make_pipeline(
            #     make_union(
            #         SetAggregation(
            #             date_col=FieldNames.campaign_start_date,
            #             user_col=FieldNames.customer_id,
            #             key_col=FieldNames.item_set,
            #             hist_artifact=hist_artifacts[2],
            #         ),
            #         SelectCols(cols=[FieldNames.item_set]),
            #     ),
            #     Jaccard(),
            # ),
            # make_pipeline(
            #     make_union(
            #         SetAggregation(
            #             date_col=FieldNames.campaign_start_date,
            #             user_col=FieldNames.customer_id,
            #             key_col=FieldNames.item_brand,
            #             hist_artifact=hist_artifacts[0],
            #         ),
            #         SelectCols(cols=[FieldNames.item_brand]),
            #     ),
            #     CountCommon(),
            # ),
            make_pipeline(
                make_union(
                    SetAggregation(
                        date_col=FieldNames.campaign_start_date,
                        user_col=FieldNames.customer_id,
                        key_col=FieldNames.item_brand,
                        hist_artifact=hist_artifacts[0],
                    ),
                    SelectCols(cols=[FieldNames.item_brand]),
                ),
                Jaccard(),
            ),
            make_pipeline(
                make_union(
                    SetAggregation(
                        date_col=FieldNames.campaign_start_date,
                        user_col=FieldNames.customer_id,
                        key_col=FieldNames.item_brand,
                        hist_artifact=hist_artifacts[1],
                    ),
                    SelectCols(cols=[FieldNames.item_brand]),
                ),
                Jaccard(),
            ),
            # make_pipeline(
            #     CouponItemMean(coupon_col=FieldNames.coupon_id,
            #                    target_col=FieldNames.target)
            # )
            # make_pipeline(
            #     make_union(
            #         SetAggregation(
            #             date_col=FieldNames.campaign_start_date,
            #             user_col=FieldNames.customer_id,
            #             key_col=FieldNames.item_category,
            #             hist_artifact=hist_artifacts[0],
            #         ),
            #         SelectCols(cols=[FieldNames.item_category]),
            #     ),
            #     Jaccard(),
            # ),
            make_pipeline(
                make_union(
                    SetAggregation(
                        date_col=FieldNames.campaign_start_date,
                        user_col=FieldNames.customer_id,
                        key_col=FieldNames.item_category,
                        hist_artifact=hist_artifacts[1],
                    ),
                    SelectCols(cols=[FieldNames.item_category]),
                ),
                Jaccard(),
            ),
            make_pipeline(
                make_union(
                    SetAggregation(
                        date_col=FieldNames.campaign_start_date,
                        user_col=FieldNames.customer_id,
                        key_col=FieldNames.item_category,
                        hist_artifact=hist_artifacts[2],
                    ),
                    SelectCols(cols=[FieldNames.item_category]),
                ),
                Jaccard(),
            ),
            make_pipeline(
                SetLen(
                    date_col=FieldNames.campaign_start_date,
                    user_col=FieldNames.customer_id,
                    key_col=FieldNames.item_brand,
                    hist_artifact=hist_artifacts[0],
                ), ),
            make_pipeline(
                SelectCols(cols=[
                    FieldNames.campaign_start_date,
                    FieldNames.campaign_end_date
                ]),
                FunctionTransfomer(lambda x:
                                   (x.iloc[:, 1] - x.iloc[:, 0]).dt.days)),
            # make_pipeline(
            #     FunctionTransfomer(lambda x: x[FieldNames.item_set].apply(len).values.reshape(-1, 1))
            # ),
            make_pipeline(
                FunctionTransfomer(lambda x: x[FieldNames.item_brand].apply(
                    len).values.reshape(-1, 1))),
            make_pipeline(
                FunctionTransfomer(lambda x: x[FieldNames.item_category].apply(
                    len).values.reshape(-1, 1))),
            make_pipeline(
                ZeroPct(
                    date_col=FieldNames.campaign_start_date,
                    user_col=FieldNames.customer_id,
                    key_col=FieldNames.coupon_discount,
                    hist_artifact=hist_artifacts[0],
                )),
            make_pipeline(
                AllCountEncoder(
                    cols=[FieldNames.customer_id, FieldNames.coupon_id],
                    data=all_data)),
            # make_pipeline(
            #     SetMean(
            #         date_col=FieldNames.campaign_start_date,
            #         user_col=FieldNames.customer_id,
            #         key_col=FieldNames.selling_price,
            #         hist_artifact=hist_artifacts[0],
            #     )
            # ),
            # make_pipeline(
            #     ZeroPct(
            #         date_col=FieldNames.campaign_start_date,
            #         user_col=FieldNames.customer_id,
            #         key_col=FieldNames.other_discount,
            #         hist_artifact=hist_artifacts[0],
            #     )
            # )
            # make_pipeline(
            #     VectorMapper(FieldNames.coupon_id, 'data/coupon_vectors_lda.npy')
            # ),
            # make_pipeline(
            #     VectorMapper(FieldNames.coupon_id, 'data/coupon_vectors_svd.npy')
            # ),
            # make_pipeline(
            #     SetLen(
            #             date_col=FieldNames.campaign_start_date,
            #             user_col=FieldNames.customer_id,
            #             key_col=FieldNames.item_set,
            #             hist_artifact=hist_artifacts[0],
            #         ),
            # ),
            # make_pipeline(
            #     SetLen(
            #             date_col=FieldNames.campaign_start_date,
            #             user_col=FieldNames.customer_id,
            #             key_col=FieldNames.item_set,
            #             hist_artifact=hist_artifacts[1],
            #         ),
            # ),
            # make_pipeline(
            #     make_union(
            #         SetAggregationLast3(
            #             date_col=FieldNames.campaign_start_date,
            #             user_col=FieldNames.customer_id,
            #             key_col=FieldNames.item_set,
            #             hist_artifact=hist_artifacts[1],
            #         ),
            #         SelectCols(cols=[FieldNames.item_set]),
            #     ),
            #     Jaccard(),
            # ),
            # make_pipeline(
            #     make_union(
            #         SetAggregation(
            #             date_col=FieldNames.campaign_start_date,
            #             user_col=FieldNames.customer_id,
            #             key_col=FieldNames.item_set,
            #             hist_artifact=hist_artifacts[2],
            #         ),
            #         SelectCols(cols=[FieldNames.item_set]),
            #     ),
            #     Jaccard(),
            # ),
        ),
        make_union(
            FunctionTransfomer(lambda x: x),
            FunctionTransfomer(lambda x: x[:, 13] / (1e-4 + x[:, 15])),
            FunctionTransfomer(lambda x: x[:, 14] / (1e-4 + x[:, 16])),
            FunctionTransfomer(lambda x: x[:, 17] / (1e-4 + x[:, 18])),
            FunctionTransfomer(lambda x: x[:, 19] / (1e-4 + x[:, 20])),
            # FunctionTransfomer(lambda x: x[:, 17]/(1e-4 + x[:, 14])),
        ),
    )
Example #39
        return len(sample_text)

    def transform(self, X, y=None):
        """The workhorse of this feature extractor"""
        result_series = X['review'].apply(self.text_length)
        return result_series.to_frame(name=self.get_feature_names()[0])

    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self


if __name__ == '__main__':
    from sklearn.pipeline import make_pipeline, make_union

    base_path = '../data/stanford_imdb'

    df = pd.read_csv(f'{base_path}/imdb_df.csv.gzip', compression='gzip')
    X = df.drop(['sentiment'], axis=1)

    t1 = AverageWordLengthExtractor()
    t2 = TextLengthExtractor()

    # I expect the make_union to produce 2 additional columns
    pipe = make_union(t1, t2)

    n = pipe.transform(X)

    print(n.shape)
    print(type(n))  #numpy.ndarray
import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8453186610518303
exported_pipeline = make_pipeline(
    make_union(make_pipeline(ZeroCount(), FastICA(tol=0.2)),
               FunctionTransformer(copy)),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="entropy",
                         max_features=0.2,
                         min_samples_leaf=1,
                         min_samples_split=4,
                         n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #41
def main():
    vectorizer = make_union(
        on_field('title', Tfidf(max_features=100000, token_pattern=r'\w+')),
        on_field('text', Tfidf(max_features=100000, token_pattern=r'\w+', ngram_range=(1, 2))),
        on_field(['price', 'user_type', 'image_top_1'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=8)
    y_scaler = StandardScaler()
    
    with timer('process train'):
        print('read train data ...')
        train = pd.read_csv('../input/train.csv', parse_dates = ["activation_date"])


#        cv = KFold(n_splits=10, shuffle=True, random_state=42)
#        train_ids, valid_ids = next(cv.split(train))
#        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        
        train, valid = train_test_split(train, test_size=0.10, random_state=23)
        y_train = y_scaler.fit_transform(train['deal_probability'].values.reshape(-1, 1))
        X_train = vectorizer.fit_transform(preprocess(train))       
        print('X_train: {} of {}'.format(X_train.shape,X_train.dtype))
        del train; gc.collect()
        
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid))
        
        
    with timer('process test'):
        # TODO
        print('read test data ...')
        test = pd.read_csv('../input/test.csv', parse_dates = ["activation_date"])
        X_test = vectorizer.transform(preprocess(test))
        del test; gc.collect()

     
    with ThreadPool(processes=8) as pool:
#        Xb_train, Xb_valid = [x.astype(np.bool) for x in [X_train, X_valid]]
        Xb_train, Xb_valid, Xb_test = [x.astype(np.bool) for x in [X_train, X_valid, X_test]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        del X_valid; gc.collect()
        
        # TODO
        xs_test = [[Xb_train, Xb_test], [X_train, X_test]] * 2  # mirror the train/valid pairs
        del X_train, X_test; gc.collect()
    
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
        
        # TODO
        y_pred_test = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs_test), axis=0)
        
    y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0]
    
    # TODO
    y_pred_test = y_scaler.inverse_transform(y_pred_test.reshape(-1, 1))[:, 0]
    
    print('Valid RMSLE: {:.4f}'.format(np.sqrt(mean_squared_log_error(valid['deal_probability'], y_pred))))
    del valid; gc.collect()
    
    sub = pd.read_csv('../input/sample_submission.csv')
    sub['deal_probability'] = y_pred_test
    sub.to_csv('sub.csv', index=False)
    print('all done!')
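# Hedged sketch (not shown in this snippet): the on_field helper used above is
# commonly written as a tiny pipeline that selects a column (or list of columns)
# from the input DataFrame before applying the given vectorizers.
from operator import itemgetter

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


def on_field(field, *vec):
    # itemgetter(field) returns a Series for a single name or a sub-frame for a list
    return make_pipeline(FunctionTransformer(itemgetter(field), validate=False), *vec)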
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9100211739663182
exported_pipeline = make_pipeline(
    make_union(StackingEstimator(estimator=GaussianNB()),
               FunctionTransformer(copy)),
    XGBClassifier(learning_rate=0.1,
                  max_depth=6,
                  min_child_weight=2,
                  n_estimators=100,
                  nthread=1,
                  subsample=0.6000000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Exemplo n.º 43
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, Normalizer
from tpot.builtins import OneHotEncoder, ZeroCount
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.84550605863897
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            OneHotEncoder(minimum_fraction=0.25, sparse=False, threshold=10),
            RFE(estimator=ExtraTreesClassifier(criterion="gini",
                                               max_features=0.5,
                                               n_estimators=100),
                step=0.2), ZeroCount(), MinMaxScaler()),
        FunctionTransformer(copy)), Normalizer(norm="max"),
    XGBClassifier(learning_rate=0.01,
                  max_depth=6,
                  min_child_weight=7,
                  n_estimators=600,
                  nthread=1,
                  subsample=0.9500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def get_feature_pipeline(tr_artifact, hist_artifacts, all_data):
    """Feature generation pipeline."""
    hist_n = 3  # len(hist_artifacts)
    tr_artifact_kws = {
        "date_col": FieldNames.campaign_start_date,
        "user_col": FieldNames.customer_id,
        "key_col": FieldNames.target,
        "hist_artifact": tr_artifact,
    }
    hist_cols = [
        FieldNames.item_set, FieldNames.item_brand, FieldNames.item_category
    ]
    hist_cols2 = [
        FieldNames.coupon_discount,
        FieldNames.other_discount,
        FieldNames.pct_discount,
        FieldNames.quantity,
        FieldNames.selling_price,
    ]
    return make_pipeline(
        make_union(
            # Numerical features directly available
            make_pipeline(
                SelectCols(cols=[
                    FieldNames.customer_id,
                    FieldNames.coupon_id,
                    FieldNames.age_range,
                    FieldNames.marital_status,
                    FieldNames.family_size,
                    FieldNames.no_of_children,
                    FieldNames.income_bracket,
                    FieldNames.campaign_type,
                ]),
                FunctionTransfomer(lambda x: x),
            ),
            # number of unique item attributes per coupon
            make_union(*[
                make_pipeline(
                    SelectCols(cols=[col]),
                    FunctionTransfomer(
                        lambda X:
                        [len(set(x)) for x in X.values.flatten().tolist()]),
                ) for col in [
                    FieldNames.item_set,
                    FieldNames.item_brand,
                    FieldNames.item_brand_type,
                    FieldNames.item_category,
                ]
            ],
                       verbose=True),
            # Campaign id features
            make_union(*[
                GroupCatCatNUnique(FieldNames.campaign_id, col2)
                for col2 in [FieldNames.customer_id, FieldNames.coupon_id]
            ],
                       verbose=True),
            # Customer id expanding mean, count, sum
            make_pipeline(ExpandingMean(**tr_artifact_kws)),
            make_pipeline(ExpandingCount(**tr_artifact_kws)),
            make_pipeline(ExpandingSum(**tr_artifact_kws)),
            # Count items common between current coupon and historical customer transactions
            make_union(*[
                make_pipeline(
                    make_union(
                        SetAggregation(
                            date_col=FieldNames.campaign_start_date,
                            user_col=FieldNames.customer_id,
                            key_col=col,
                            hist_artifact=hist_artifacts[i],
                        ),
                        SelectCols(cols=[col]),
                    ),
                    CountCommon(),
                ) for col, i in itertools.product(hist_cols, range(hist_n))
            ]),
            make_union(*[
                make_pipeline(
                    make_union(
                        SetAggregation(
                            date_col=FieldNames.campaign_start_date,
                            user_col=FieldNames.customer_id,
                            key_col=col,
                            hist_artifact=hist_artifacts[i],
                        ),
                        SelectCols(cols=[col]),
                    ),
                    Jaccard(),
                ) for col, i in itertools.product(hist_cols, range(hist_n))
            ],
                       verbose=True),
            make_union(*[
                make_pipeline(
                    make_union(
                        SetAggregation(
                            date_col=FieldNames.campaign_start_date,
                            user_col=FieldNames.customer_id,
                            key_col=col,
                            hist_artifact=hist_artifacts[i],
                        ),
                        SelectCols(cols=[col]),
                    ),
                    CountCommonRepeats(),
                ) for col, i in itertools.product(hist_cols, range(hist_n))
            ]),
            # campaign length
            make_pipeline(
                SelectCols(cols=[
                    FieldNames.campaign_start_date,
                    FieldNames.campaign_end_date
                ]),
                FunctionTransfomer(lambda x:
                                   (x.iloc[:, 1] - x.iloc[:, 0]).dt.days),
            ),
            # coupon discount, other discount, selling price and quantity aggregations
            make_union(*[
                ExpandingMean(
                    date_col=FieldNames.campaign_start_date,
                    user_col=FieldNames.customer_id,
                    key_col=col,
                    hist_artifact=hist_artifacts[i],
                ) for col, i in itertools.product(hist_cols2, range(hist_n))
            ]),
            make_pipeline(
                GroupCatCountEncoder(
                    cols=[FieldNames.customer_id, FieldNames.campaign_id])),
            make_pipeline(
                AllCountEncoder(
                    cols=[FieldNames.customer_id, FieldNames.coupon_id],
                    data=all_data)),
            make_pipeline(
                make_union(*[
                    # one SetLen per history artifact (i indexes hist_artifacts)
                    SetLen(
                        date_col=FieldNames.campaign_start_date,
                        user_col=FieldNames.customer_id,
                        key_col=FieldNames.item_set,
                        hist_artifact=hist_artifacts[i],
                    ) for i in range(hist_n)
                ])),
            make_pipeline(
                make_union(
                    VectorMapper(col=FieldNames.coupon_id,
                                 vector_file=FileNames.coupon_vectors),
                    HistVectorMean(
                        vector_file=FileNames.item_vectors,
                        user_col=FieldNames.customer_id,
                        key_col=FieldNames.item_set,
                        date_col=FieldNames.campaign_start_date,
                        hist_artifact=hist_artifacts[0],
                    ),
                ),
                CosineSimilarity(),
            ),
        ),
        make_union(
            FunctionTransfomer(lambda x: x),
            # Ratios
            make_pipeline(
                make_union(*[
                    # bind i, j as defaults so each lambda keeps its own pair
                    # (a bare closure would reuse the final i, j for every ratio)
                    FunctionTransfomer(lambda x, i=i, j=j: x[:, i] / x[:, j])
                    for (i, j) in itertools.product(range(16, 34), range(16, 34))
                ],
                           verbose=True)),
        ),
    )
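# Hedged side note (not part of the pipeline above): lambdas created in a list
# comprehension capture loop variables by reference, which is why the ratio
# transformers bind i and j as default arguments; without that, every closure
# would see only the final (i, j) pair.
fns_buggy = [lambda x: x * i for i in range(3)]
fns_fixed = [lambda x, i=i: x * i for i in range(3)]
print([f(1) for f in fns_buggy])  # [2, 2, 2] - all closures share the last i
print([f(1) for f in fns_fixed])  # [0, 1, 2]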
Exemplo n.º 45
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8172868435911914
exported_pipeline = make_pipeline(
    make_union(
        StandardScaler(),
        RFE(estimator=ExtraTreesClassifier(criterion="gini",
                                           max_features=0.2,
                                           n_estimators=100),
            step=0.1)), VarianceThreshold(threshold=0.25), StandardScaler(),
    StandardScaler(), MinMaxScaler(), StandardScaler(),
    LogisticRegression(C=0.01, dual=False, penalty="l2"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
    s = str(i.month) + str(i.day) + str(i.hour) + str(i.minute)
    model_name = "GM_export/main_new/" + "GM" + s + ".py"
    tpo.export(model_name)

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import make_pipeline, make_union
    from tpot.builtins import StackingEstimator
    from xgboost import XGBClassifier
    from sklearn.preprocessing import FunctionTransformer
    from copy import copy

    exported_pipeline = make_pipeline(
        make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
        StackingEstimator(estimator=XGBClassifier(learning_rate=0.1,
                                                  max_depth=5,
                                                  min_child_weight=6,
                                                  n_estimators=21,
                                                  scale_pos_weight=4.16,
                                                  subsample=0.85)),
        RandomForestClassifier(class_weight={1: 4.16},
                               criterion="gini",
                               max_depth=8,
                               max_features=6,
                               n_estimators=23))

    exported_pipeline.fit(x_train, y_train)
    evalution_model(exported_pipeline, x_train, y_train)
    evalution_model(exported_pipeline, x_test, y_test)
data, label = data[idx_row, :], label[idx_row]
features = data
tpot_data = pd.DataFrame({'class': label}, columns=['class'])

# train the machine learning model

kf = KFold(n_splits=10, random_state=556, shuffle=True)
results, auc = [], []
cnt = 0
print('cross-validating the best ML model with 10 folds')
fp, tp = [], []
for train_index, test_index in kf.split(features):
    training_features, testing_features = features[train_index],features[test_index]
    training_classes, testing_classes = tpot_data['class'].values[train_index],tpot_data['class'].values[test_index]
    exported_pipeline = make_pipeline(
    make_union(VotingClassifier([("est", DecisionTreeClassifier())]), FunctionTransformer(lambda X: X)),
    GradientBoostingClassifier(learning_rate=0.24, max_features=0.24, n_estimators=500)
        ) 
    exported_pipeline.fit(training_features, training_classes)
    proba = exported_pipeline.predict_proba(testing_features)[:, 1]  # score this fold once
    results.append(proba)
    fpr, tpr, thresholds = metrics.roc_curve(testing_classes, proba)
    auc.append(metrics.roc_auc_score(testing_classes, proba))
    #ax.plot(fpr,tpr,label='%s,Area under the curve: %.3f'%(type_,auc[cnt]))
    fp.append(fpr)
    tp.append(tpr)
    print('get one done')
    cnt += 1
print('done')
#from sklearn.externals import joblib
#pickle.dump(exported_pipeline, open('%smy_model.pkl'%folder,'wb'))
#exported_pipeline = joblib.load('%smy_model.pkl'%folder)
with open("%slong process.p" % folder, "wb") as f:
    pickle.dump([results, auc, fp, tp], f)
X_test_post_hoc = df_test

df = df.drop(columns=['eid', '21022-0.0'], axis=1)
df_test = df_test.drop(columns=['eid', '21022-0.0'], axis=1)

# Learning curves: train sizes
train_sizes = [100, 500, 1000, 1500, 2000, 2500, 3000, 3500, 3700]
# Model
estimator = RandomForestRegressor(n_estimators=250,
                                  criterion='mse',
                                  n_jobs=10,
                                  verbose=1,
                                  random_state=0)

pipeline = Pipeline([('imputation',
                      make_union(SimpleImputer(strategy="median"),
                                 MissingIndicator(error_on_new=False))),
                     ('estimator', estimator)])

cv = ShuffleSplit(n_splits=100, test_size=0.1, random_state=0)

param_grid = {
    'estimator__max_depth': [5, 10, 20, 40, None],
    'estimator__max_features': [1, 5, 'log2', 'sqrt', 'auto', None]
}

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, verbose=2)
metrics = []
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=grid_search,
    X=df,
    y=y_train,
Exemplo n.º 49
0
    def _make_preprocessor(self):
        def lexicon_pipeline(lexicon):
            return make_pipeline(LexiconFeatures(lexicon), DictVectorizer())

        unigram_lexicons_features = make_union(
            lexicon_pipeline(NRCEmotionLexicon()),
            lexicon_pipeline(NRCHashtagEmotionLexicon()),
            lexicon_pipeline(MaxDiffTwitterLexicon()),
            lexicon_pipeline(NRCHashtagSentimentWithContextUnigrams()),
            lexicon_pipeline(NRCHashtagSentimentLexiconUnigrams()),
            lexicon_pipeline(Sentiment140WithContextUnigrams()),
            lexicon_pipeline(Sentiment140LexiconUnigrams()),
            lexicon_pipeline(YelpReviewsLexiconUnigrams()),
            lexicon_pipeline(AmazonLaptopsReviewsLexiconUnigrams()),
            lexicon_pipeline(MPQAEffectLexicon()),
            lexicon_pipeline(MPQASubjectivityLexicon()),
            lexicon_pipeline(HarvardInquirerLexicon()),
            lexicon_pipeline(BingLiuLexicon()),
            lexicon_pipeline(AFINN111Lexicon()),
            lexicon_pipeline(SentiWordNetLexicon()),
            lexicon_pipeline(LoughranMcDonaldLexicon()),
        )

        bigram_lexicons_features = make_union(
            lexicon_pipeline(NRCHashtagSentimentWithContextBigrams()),
            lexicon_pipeline(NRCHashtagSentimentLexiconBigrams()),
            lexicon_pipeline(Sentiment140WithContextBigrams()),
            lexicon_pipeline(Sentiment140LexiconBigrams()),
            lexicon_pipeline(YelpReviewsLexiconBigrams()),
            lexicon_pipeline(AmazonLaptopsReviewsLexiconBigrams()),
            lexicon_pipeline(MPQAEffectLexicon()),
        )

        preprocessor = make_pipeline(
            BasicTokenizer(),
            make_union(
                make_pipeline(
                    CMUArkTweetPOSTagger(),
                    ListCountVectorizer(lowercase=False,
                                        binary=True)),  # POS features
                # make_pipeline(W2Vembedding()),
                make_pipeline(
                    CharNGramTransformer([1, 2, 3]),
                    ListCountVectorizer(lowercase=True,
                                        max_features=10000,
                                        binary=True)),  # Character n-grams
                make_pipeline(
                    LowercaseTransformer(), CMUArkTweetBrownClusters(),
                    ListCountVectorizer(lowercase=False,
                                        binary=True)),  # brown clusters
                make_pipeline(
                    Negater(),
                    make_union(
                        make_pipeline(
                            NGramTransformer([3, 4]),
                            ListCountVectorizer(lowercase=True,
                                                max_features=10000,
                                                binary=True)),  # ngram word
                        make_pipeline(
                            CountingFeatures(), DictVectorizer()
                        ),  # allcaps, punctuations, lengthening, emoticons, etc. counting feature
                        make_pipeline(LowercaseTransformer(),
                                      unigram_lexicons_features
                                      ),  # unigram lexicon features
                        ListCountVectorizer(lowercase=True,
                                            max_features=10000,
                                            binary=True),  # ngram word
                        make_pipeline(
                            LowercaseTransformer(),
                            NGramTransformer(2),
                            make_union(
                                bigram_lexicons_features,  # bigram lexicon features
                                ListCountVectorizer(lowercase=True,
                                                    max_features=10000,
                                                    binary=True),  # ngram word
                            ),
                        ),
                    ),
                ),
                make_pipeline(LowercaseTransformer(), SSWEFeatures()),
                make_pipeline(
                    NormalizedTokens(),
                    CollapsedTokens(),
                    PorterStemmer(),
                    Negater(),
                    make_union(
                        ListCountVectorizer(lowercase=False,
                                            max_features=10000,
                                            binary=True),  # processed word
                        make_pipeline(
                            ClusterFeaturesWithNegation(),
                            ListCountVectorizer(
                                lowercase=False,
                                binary=True)),  # processed cluster features
                    ),
                )),
        )

        return preprocessor
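# Hedged sketch (not part of the original module): lexicon_pipeline pairs a
# transformer that emits one dict of lexicon scores per document with a
# DictVectorizer. A toy stand-in, assuming the input is a list of token lists:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline


class ToyLexiconFeatures(BaseEstimator, TransformerMixin):
    POSITIVE = {'good', 'great', 'nice'}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # one feature dict per document
        return [{'pos_count': sum(tok in self.POSITIVE for tok in toks)}
                for toks in X]


toy_pipe = make_pipeline(ToyLexiconFeatures(), DictVectorizer())
print(toy_pipe.fit_transform([['good', 'movie'], ['bad', 'movie']]).toarray())
# [[1.]
#  [0.]]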
Exemplo n.º 50
0
DATA_DIR_PORTABLE = "C:\\Users\\T149900\\ml_mercari\\"
DATA_DIR_BASEMENT = "D:\\mercari\\"
DATA_DIR = DATA_DIR_PORTABLE


df = pd.read_table(DATA_DIR + "train.tsv");


q = df[:10]
q_test = df[10:13]

q.price.isnull().sum()
q_test.price.isnull().sum()

vectorizer = make_union(
        on_field(['shipping', 'item_condition_id'], PlussOneStage() ),
        n_jobs=1)

p = on_field('item_condition_id', PlussOneStage())

p.fit(q)

X = p.transform(q)

X_test = p.transform(q_test)


X_train = vectorizer.fit_transform(preprocess(q)).astype(np.float32)


X_test = vectorizer.transform(preprocess(q_test)).astype(np.float32)
Exemplo n.º 51
0
def get_feature_union():
    return make_union(WaveletApprx(), SpatialFt(), DepthFt(),)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was:0.9576386406262041
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=make_pipeline(
            MaxAbsScaler(),
            RandomForestClassifier(bootstrap=False,
                                   criterion="entropy",
                                   max_features=0.3,
                                   min_samples_leaf=5,
                                   min_samples_split=11,
                                   n_estimators=100))),
        FunctionTransformer(copy)),
    OneHotEncoder(minimum_fraction=0.25, sparse=False),
    ExtraTreesClassifier(bootstrap=True,
                         criterion="entropy",
                         max_features=0.6500000000000001,
                         min_samples_leaf=1,
                         min_samples_split=5,
                         n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Exemplo n.º 53
0
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    StandardScaler(),
    GradientBoostingClassifier(learning_rate=0.1, max_depth=10, min_samples_leaf=15, min_samples_split=15, n_estimators=100, subsample=0.4)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Exemplo n.º 54
0
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
    trans = make_union(
        SimpleImputer(missing_values=missing_values, strategy='most_frequent'),
        MissingIndicator(missing_values=missing_values))
    X_trans = trans.fit_transform(X)
    assert_array_equal(X_trans, X_trans_exp)
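# Hedged usage sketch (not part of the test above): the same union on a tiny
# array where -1 marks missing values, showing the imputed columns followed by
# the missing-value indicator columns.
import numpy as np
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.pipeline import make_union

X_demo = np.array([[-1, 2],
                   [3, -1],
                   [3, 2]])
trans_demo = make_union(
    SimpleImputer(missing_values=-1, strategy='most_frequent'),
    MissingIndicator(missing_values=-1))
print(trans_demo.fit_transform(X_demo))
# [[3. 2. 1. 0.]
#  [3. 2. 0. 1.]
#  [3. 2. 0. 0.]]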
Exemplo n.º 55
0
    return [tok.strip().lower() for tok in re.findall(r'\w', re.sub(r'\d', ' ', text))]


train_data = []
train_lbls = []

for line in open("./data/data_set.json", "r"):
    data = json.loads(line)
    train_data.append(data["data"])
    train_lbls.append(int(data["label"] == "EN"))  # Class 1 - English, class 0 - Tagalog

# Create 3 feature extractors (by words, by letter pairs, by letters)
word_vectoriser = TfidfVectorizer(tokenizer=word_tokenizer)
ends_vectoriser = TfidfVectorizer(tokenizer=ending_tokenizer)
char_vectorizer = TfidfVectorizer(tokenizer=char_tokenizer)

# Combine our feature extractors and build the pipeline
feature_extractor = make_union(word_vectoriser, ends_vectoriser, char_vectorizer)
pipeline = make_pipeline(feature_extractor, LinearSVC(C=2))


#pipeline.fit(train_data[::2], train_lbls[::2])
#print(f1_score(train_lbls[1::2], pipeline.predict(train_data[1::2])))

scores = cross_validation.cross_val_score(pipeline, train_data, train_lbls, cv=5, scoring='f1_macro')
print(mean(scores))
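# Hedged sketch (hypothetical): word_tokenizer and ending_tokenizer are used to
# build the TfidfVectorizers above but are not shown in this snippet; plausible
# definitions in the spirit of the character tokenizer fragment could be:
import re


def word_tokenizer(text):
    # lowercase word tokens with digits stripped out
    return [tok.lower() for tok in re.findall(r'\w+', re.sub(r'\d', ' ', text))]


def ending_tokenizer(text):
    # crude suffix feature: the last two characters of each word
    return [tok[-2:] for tok in word_tokenizer(text)]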




def predict_with_best_parameters(train, test, class_names):

    submission = pd.DataFrame.from_dict({"id": test["id"]})

    scores = []

    # assuming the best parameters were already found by grid_search
    best_parameters = {
        "toxic": {
            'clf': {
                'C': 10.0
            },
            'vect': {
                'ngram_range': (1, 1),
                'stop_words': None,
                'tokenizer': tokenizer_porter,
                'use_idf': False
            },
        },
        "severe_toxic": {
            'clf': {
                'C': 1.0
            },
            'vect': {
                'max_features': 30000,
                'ngram_range': (1, 1),
                'stop_words': None,
                'tokenizer': tokenizer_porter
            },
        },
        "obscene": {
            'clf': {
                'C': 10.0
            },
            'vect': {
                'ngram_range': (1, 1),
                'stop_words': None,
                'tokenizer': tokenizer_porter,
                'use_idf': False
            },
        },
        "threat": {
            'clf': {
                'C': 10.0
            },
            'vect': {
                'max_features': 30000,
                'ngram_range': (1, 1),
                'stop_words': None,
                'tokenizer': tokenizer_porter
            },
        },
        "insult": {
            'clf': {
                'C': 1.0
            },
            'vect': {
                'max_features': 30000,
                'ngram_range': (1, 1),
                'stop_words': None,
                'tokenizer': tokenizer_porter
            },
        },
        "identity_hate": {
            'clf': {
                'C': 1.0
            },
            'vect': {
                'max_features': 30000,
                'ngram_range': (1, 1),
                'stop_words': None,
                'tokenizer': tokenizer_porter
            }
        }
    }

    for target_class in class_names:

        print("\nWorking with target_class: ", target_class)

        train_target = train[target_class]

        word_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents="unicode",
            analyzer="word",
            **best_parameters[target_class]["vect"])
        char_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                          strip_accents="unicode",
                                          analyzer="char",
                                          ngram_range=(1, 4),
                                          max_features=30000)
        vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=2)

        classifier = LogisticRegression(solver="sag",
                                        n_jobs=4,
                                        **best_parameters[target_class]["clf"])

        lr_vectorizer = Pipeline([('vect', vectorizer), ('clf', classifier)])

        start_time = time.time()

        cv_score = np.mean(
            cross_val_score(lr_vectorizer,
                            train_text,
                            train_target,
                            cv=3,
                            scoring="roc_auc",
                            n_jobs=3))

        end_time = time.time()
        print("CV time:", end_time - start_time)

        scores.append(cv_score)
        print("cv_score for class {} : {}".format(target_class, cv_score))

        start_time = time.time()
        lr_vectorizer.fit(train_text, train_target)
        end_time = time.time()
        print("fitting time: ", end_time - start_time)

        start_time = time.time()
        submission[target_class] = lr_vectorizer.predict_proba(test_text)[:, 1]
        #predict_proba returns two columns. The first is the probability that
        # the sample is of class 0, the second is the probability that the
        # sample is of class 1. So we only need to slice the second column.
        end_time = time.time()
        print("Prediction time: ", end_time - start_time)

    print("total CV score is: {}".format(np.mean(scores)))

    submission.to_csv("submission_gs_best_est_union_and_piped.csv",
                      index=False)

    pass
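# Hedged sketch (hypothetical): tokenizer_porter is referenced in best_parameters
# but not defined in this snippet; a common definition, assuming NLTK's
# PorterStemmer is available, would be:
from nltk.stem.porter import PorterStemmer

_porter = PorterStemmer()


def tokenizer_porter(text):
    # whitespace-split tokens, stemmed with the Porter stemmer
    return [_porter.stem(word) for word in text.split()]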
Exemplo n.º 57
0
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        VotingClassifier([("est",
                           LogisticRegression(C=0.0001,
                                              dual=False,
                                              penalty="l1"))]),
        FunctionTransformer(lambda X: X)),
    GradientBoostingClassifier(learning_rate=1.0,
                               max_features=1.0,
                               n_estimators=500))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)

Exemplo n.º 58
0
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8194243156199679
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(
            make_pipeline(
                SelectPercentile(score_func=f_classif, percentile=83),
                StandardScaler()
            ),
            FunctionTransformer(copy)
        )
    ),
    PCA(iterated_power=5, svd_solver="randomized"),
    LinearSVC(C=0.001, dual=True, loss="hinge", penalty="l2", tol=0.001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-0.00023017383030155843
exported_pipeline = make_pipeline(
    make_union(MaxAbsScaler(), FunctionTransformer(copy)),
    XGBRegressor(learning_rate=0.1,
                 max_depth=10,
                 min_child_weight=1,
                 n_estimators=100,
                 nthread=1,
                 subsample=0.8))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Exemplo n.º 60
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.6814117647058824
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=KNeighborsClassifier(n_neighbors=23, p=1, weights="distance")),
        make_pipeline(
            StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.9500000000000001, min_samples_leaf=12, min_samples_split=16, n_estimators=100)),
            StackingEstimator(estimator=MultinomialNB(alpha=10.0, fit_prior=True)),
            VarianceThreshold(threshold=0.01)
        )
    ),
    StackingEstimator(estimator=MultinomialNB(alpha=0.1, fit_prior=False)),
    RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.1, min_samples_leaf=4, min_samples_split=14, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)