Example #1
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold


def execute(training_pipeline_id,
            config,
            sampler,
            algorithm,
            n_splits=5,
            param_grid=None,
            rfe__run=False,
            rfe__step_size=20):
    """Run cross-validated training with optional recursive feature elimination."""
    # Avoid the mutable-default-argument pitfall for the grid.
    param_grid = param_grid if param_grid is not None else {}
    training_configuration = TrainingConfiguration.load_by_config(
        training_pipeline_id=training_pipeline_id, config=config)
    data_loader = DataLoader.load(
        training_configuration.training_pipeline.data_loader_path)
    X, y = load_features_and_transform(
        training_configuration=training_configuration, data_loader=data_loader)

    # Skip feature elimination for the DummyClassifier baseline.
    if rfe__run and not isinstance(algorithm().clf, DummyClassifier):
        result = recursive_feature_elimination(
            X=X,
            y=y,
            step_size=rfe__step_size,
            n_splits=n_splits,
            algorithm=algorithm,
        )
        X_supported = result['X_supported']
        X_supported = result['X_supported']
    else:
        X_supported = X.copy()

    metrics = []
    outer_cv = StratifiedKFold(n_splits=n_splits)
    for train_idx, test_idx in outer_cv.split(X_supported, y):
        # cv.split yields positional indices, so index with iloc rather than loc.
        X_train, y_train = X_supported.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X_supported.iloc[test_idx], y.iloc[test_idx]

        # Resample only the training fold so test data never leaks into fitting.
        X_train_sampled, y_train_sampled = sampler().fit_resample(X=X_train,
                                                                  y=y_train)

        estimator = algorithm()
        # Named `search` so it does not shadow the outer StratifiedKFold splitter.
        search = GridSearchCV(estimator=estimator,
                              cv=5,
                              n_jobs=-1,
                              scoring=estimator.score_auroc,
                              param_grid=param_grid,
                              verbose=1)
        search.fit(X_train_sampled, y_train_sampled)

        evaluation = {
            'y_true': y_test,
            'y_pred': search.predict(X_test),
            'y_probs': search.predict_proba(X_test)[:, 1],
        }
        metrics.append([calculate_metrics(evaluation), search])
    return metrics
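
The helper `calculate_metrics` is not shown above; a minimal sketch, assuming it reduces the `evaluation` dict to a few standard scores (the exact metric set in the original pipeline is unknown):

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


def calculate_metrics(evaluation):
    """Summarize one CV fold; this particular metric set is an assumption."""
    return {
        'accuracy': accuracy_score(evaluation['y_true'], evaluation['y_pred']),
        'f1': f1_score(evaluation['y_true'], evaluation['y_pred']),
        'auroc': roc_auc_score(evaluation['y_true'], evaluation['y_probs']),
    }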
Example #2
# Load the pretrained Google News vectors with gensim.
from gensim.models import KeyedVectors

word2vec = KeyedVectors.load_word2vec_format('/input/Kaggle/Word2Vec/GoogleNews-vectors-negative300.bin.gz', binary=True)
'apple' in word2vec.vocab  # True (gensim < 4; in gensim >= 4 use `'apple' in word2vec.key_to_index`)
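
# Once loaded, each word maps to a 300-dimensional vector and the model
# supports similarity queries (standard KeyedVectors API):
vec = word2vec['apple']                  # numpy array of shape (300,)
word2vec.most_similar('apple', topn=3)   # nearest neighbors in embedding space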

#### Tokenizer: scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import EnglishStemmer

# Custom analyzer: add stemming on top of the default tokenization.
count_analyzer = CountVectorizer().build_analyzer()
stemmer = EnglishStemmer()

def stem_count_analyzer(doc):
    return (stemmer.stem(w) for w in count_analyzer(doc))

cv = CountVectorizer(analyzer=stem_count_analyzer, preprocessor=None, stop_words='english', max_features=128)
cv.fit(unique_questions)
q1_cv = cv.transform(train.question1)
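
# The paired-questions data presumably has a second question column; transforming
# it with the same fitted vectorizer (assuming a `train.question2` column) gives
# aligned bag-of-words features for both sides of each pair:
q2_cv = cv.transform(train.question2)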

# Use the default word analyzer, but add a preprocessor that strips HTML.
from bs4 import BeautifulSoup

def preprocessor(review):
    return BeautifulSoup(review, 'html5lib').get_text()
count_v = CountVectorizer(analyzer='word', preprocessor=preprocessor, stop_words='english', max_features=5000)
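# Hypothetical usage (assumes a `train.review` column holding raw HTML reviews):
reviews_bow = count_v.fit_transform(train.review)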

#### Keras Tokenizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

NUM_WORDS = 200000  # vocabulary cap; the value used originally is not shown
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(all_unique_questions)
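
`pad_sequences` is imported above but not yet used; a minimal continuation, assuming a hypothetical length cap `MAX_LEN`, turns each question into a fixed-length integer sequence ready for an embedding layer:

MAX_LEN = 30  # hypothetical cap on tokens per question

sequences = tokenizer.texts_to_sequences(all_unique_questions)
padded = pad_sequences(sequences, maxlen=MAX_LEN)  # shape: (n_questions, MAX_LEN)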