def execute(training_pipeline_id, config, sampler, algorithm, n_splits=5,
            param_grid=None, rfe__run=False, rfe__step_size=20):
    """Run a cross-validated training/evaluation loop for one pipeline config.

    Loads the training configuration and its data, optionally applies
    recursive feature elimination, then runs stratified K-fold CV where each
    training fold is resampled (via ``sampler``) and tuned with a grid search
    before being evaluated on the untouched test fold.

    Parameters
    ----------
    training_pipeline_id : identifier used to look up the TrainingConfiguration.
    config : configuration object/dict passed to ``load_by_config``.
    sampler : zero-arg factory returning an object with ``fit_resample``
        (e.g. an imblearn sampler) — applied to the training fold only.
    algorithm : zero-arg factory returning the estimator wrapper; must expose
        ``.clf`` and ``.score_auroc``.
    n_splits : number of outer CV folds (default 5).
    param_grid : grid-search parameter grid; defaults to an empty grid.
    rfe__run : whether to run recursive feature elimination first.
    rfe__step_size : features removed per RFE step.

    Returns
    -------
    list of ``[metrics_dict, fitted GridSearchCV]`` pairs, one per outer fold.
    """
    # FIX: the original used `param_grid={}` — a mutable default argument
    # shared across calls. Use None as the sentinel instead.
    if param_grid is None:
        param_grid = {}

    training_configuration = TrainingConfiguration.load_by_config(
        training_pipeline_id=training_pipeline_id, config=config)
    data_loader = DataLoader.load(
        training_configuration.training_pipeline.data_loader_path)
    X, y = load_features_and_transform(
        training_configuration=training_configuration, data_loader=data_loader)

    # Optional feature selection. Skipped for the dummy baseline, whose
    # predictions do not depend on the features.
    if rfe__run and algorithm().clf.__class__ != DummyClassifier:
        result = recursive_feature_elimination(
            X=X,
            y=y,
            step_size=rfe__step_size,
            # NOTE(review): hardcoded 5 kept from the original — should this
            # be the `n_splits` parameter? Confirm intended behavior.
            n_splits=5,
            algorithm=algorithm,
        )
        X_supported = result['X_supported']
    else:
        X_supported = X.copy()

    metrics = []
    # FIX: named `outer_cv` — the original reused the name `cv` for both this
    # splitter and the GridSearchCV inside the loop, shadowing the splitter
    # while its split generator was still being consumed.
    outer_cv = StratifiedKFold(n_splits=n_splits)
    for train_idx, test_idx in outer_cv.split(X_supported, y):
        X_train, y_train = X_supported.loc[train_idx], y.loc[train_idx]
        X_test, y_test = X_supported.loc[test_idx], y.loc[test_idx]

        # Resample only the training fold (e.g. class rebalancing); the test
        # fold keeps its natural class distribution.
        X_train_sampled, y_train_sampled = sampler().fit_resample(
            X=X_train, y=y_train)

        estimator = algorithm()
        grid_search = GridSearchCV(
            estimator=estimator,
            cv=5,
            n_jobs=-1,
            scoring=estimator.score_auroc,
            param_grid=param_grid,
            verbose=1,
        )
        grid_search.fit(X_train_sampled, y_train_sampled)

        evaluation = {
            'y_true': y_test,
            'y_pred': grid_search.predict(X_test),
            # Probability of the positive class for AUROC-style metrics.
            'y_probs': grid_search.predict_proba(X_test)[:, 1],
        }
        metrics.append([calculate_metrics(evaluation), grid_search])
    return metrics
# Load pretrained Google News word2vec embeddings (300-dim vectors, gensim
# binary format). NOTE(review): `.vocab` below is the gensim < 4.0 API —
# confirm the installed gensim version.
word2vec = KeyedVectors.load_word2vec_format('/input/Kaggle/Word2Vec/GoogleNews-vectors-negative300.bin.gz', binary=True)
# Membership check — a bare expression (notebook residue); it has no effect
# when run as a script.
'apple' in word2vec.vocab # True

#### Tokenizer-Scikit-Learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import EnglishStemmer

# Custom analyzer that adds stemming: run the default CountVectorizer word
# analyzer, then stem each token.
count_analyzer = CountVectorizer().build_analyzer()
stemmer = EnglishStemmer()

def stem_count_analyzer(doc):
    # Returns a generator of stemmed tokens for one document.
    return (stemmer.stem(w) for w in count_analyzer(doc))

cv = CountVectorizer(analyzer=stem_count_analyzer, preprocessor=None, stop_words='english', max_features=128)
# NOTE(review): `unique_questions` and `train` are defined elsewhere —
# presumably the Quora question-pairs dataset; verify against the caller.
cv.fit(unique_questions)
q1_cv = cv.transform(train.question1)

# Use the default 'word' analyzer, but add a preprocessor that strips HTML
# markup before tokenization. `BeautifulSoup` must be imported elsewhere.
def preprocessor(review):
    return BeautifulSoup(review, 'html5lib').get_text()

count_v = CountVectorizer(analyzer='word', preprocessor=preprocessor, stop_words='english', max_features=5000)

#### Keras Tokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Keep only the NUM_WORDS most frequent words (NUM_WORDS and
# all_unique_questions are defined elsewhere in the file/notebook).
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(all_unique_questions)