def build_model():
    """Create a grid-searched text-classification model.

    Pipeline: count vectorizer -> TF-IDF -> multi-output random forest.

    Returns:
        GridSearchCV: unfitted grid-search object over the pipeline.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ]
    # Forest hyper-parameters explored by the grid search.
    param_grid = {
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__min_samples_split': [2, 3, 4],
        'clf__estimator__criterion': ['entropy', 'gini'],
    }
    return GridSearchCV(Pipeline(steps), param_grid=param_grid)
def build_model():
    """Build a model to predict the class of a message.

    The model is an NLP pipeline made of count vectorizer, TF-IDF and a
    multi-output random forest, wrapped in a grid search.

    Returns:
        GridSearchCV: unfitted grid-search object over the pipeline.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            RandomForestClassifier(max_depth=100, min_samples_split=10))),
    ])
    # Fixed duplicated assignment (`parameters = parameters = {...}`).
    parameters = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000),
        'tfidf__use_idf': (True, False),
        'clf__estimator__n_estimators': [1, 5, 10, 15],
        'clf__estimator__min_samples_split': [10, 20, 30, 40],
        'clf__estimator__max_depth': [50, 100, 200],
    }
    return GridSearchCV(pipeline, param_grid=parameters)
def build_model():
    """Return a grid-search model over a text-classification pipeline.

    Returns:
        GridSearchCV: 2-fold grid search over TF-IDF norm and forest criterion.
    """
    # Text features feed a parallelised multi-output random forest.
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            RandomForestClassifier(random_state=42), n_jobs=-1)),
    ])
    # Tuning grid kept small to limit training time.
    param_grid = {
        'tfidf__norm': ['l1', 'l2'],
        'clf__estimator__criterion': ['gini', 'entropy'],
    }
    # Optimise the pipeline with a parallel grid search.
    return GridSearchCV(pipeline, param_grid=param_grid,
                        cv=2, n_jobs=-1, verbose=1)
def build_model():
    """Build the classification pipeline (no grid search).

    Note:
        A GridSearchCV variant (tuning ``vect__norm`` and ``vect__min_df``)
        was planned but crashed on the author's machine; see the Jupyter
        notebook for the complexity study. This function therefore returns
        the plain pipeline with fixed hyper-parameters instead.

    Returns:
        Pipeline: TF-IDF vectorizer followed by a multi-output random forest.
    """
    # Merged the two adjacent string literals into one real docstring;
    # the second one was a dead statement, not documentation.
    cv = Pipeline([
        ('vect', TfidfVectorizer(tokenizer=tokenize, norm='l2')),
        ('clf', MultiOutputClassifier(
            RandomForestClassifier(n_estimators=100, random_state=20))),
    ])
    return cv
def build_model():
    """Build a grid-searched linear-SVC model for multi-label messages.

    Returns:
        GridSearchCV: 5-fold search scored with sample-averaged precision.
    """
    text_clf = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            OneVsRestClassifier(LinearSVC(random_state=0)))),
    ])
    # Note the triple-nested estimator path: MultiOutput -> OneVsRest -> SVC.
    grid = {
        'tfidf__smooth_idf': [True, False],
        'clf__estimator__estimator__C': [1, 2, 5],
    }
    return GridSearchCV(text_clf, param_grid=grid,
                        scoring='precision_samples', cv=5)
def build_model(X_train, y_train):
    """Fit a grid-searched text-classification model on the training data.

    Args:
        X_train: training features (raw messages).
        y_train: training labels.

    Returns:
        GridSearchCV: the fitted grid-search object.
    """
    text_clf = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])
    grid = {'clf__estimator__min_samples_split': [2, 4]}
    search = GridSearchCV(estimator=text_clf, param_grid=grid)
    search.fit(X_train, y_train)
    return search
def build_model():
    """Model pipeline with GridSearch optimisation of parameters.

    Returns:
        GridSearchCV: unfitted classification model.
    """
    text_clf = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('rfc', MultiOutputClassifier(RandomForestClassifier())),
    ])
    # Only the idf switch is tuned; n_estimators left commented to keep
    # the search cheap (original author's choice).
    grid = {
        'tfidf__use_idf': (True, False),
        # 'clf__estimator__n_estimators': [50, 60, 70],
    }
    return GridSearchCV(text_clf, param_grid=grid)
def build_model():
    """Create a pipeline and tune it with GridSearchCV.

    Returns:
        GridSearchCV: 3-fold grid search over a small forest grid.
    """
    # Text transformation steps followed by the estimator instance.
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ]
    # Grid deliberately small to keep the pickled model under 1 GB.
    grid = {
        'clf__estimator__n_estimators': [4, 6, 9],
        'clf__estimator__min_samples_split': [2, 3, 5],
    }
    return GridSearchCV(Pipeline(steps), param_grid=grid,
                        cv=3, verbose=2, n_jobs=4)
def build_model():
    """Build a grid-searched SGD text-classification model.

    Returns:
        GridSearchCV: search over vectorizer settings for an NLP pipeline
        ending in a multi-output SGD classifier.
    """
    base = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(SGDClassifier(random_state=42))),
    ])
    # `tokenized_stop_words` is a module-level list prepared elsewhere.
    grid = {
        'vect__stop_words': (tokenized_stop_words, None),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.5, 0.75, 1.0),
    }
    return GridSearchCV(base, param_grid=grid, verbose=2)
def build_model(gridsearch=True):
    """Build an NLP pipeline that does the following:

    1. Tokenize
    2. Vectorize (count then tfidf)
    3. Other custom extractors (currently disabled)
    4. Finally, a classifier

    The pipeline supports methods such as ``.fit`` and ``.predict``.

    Args:
        gridsearch: when truthy, wrap the pipeline in a GridSearchCV over
            the vectorizer parameters; otherwise return the bare pipeline.

    Returns:
        GridSearchCV or Pipeline depending on ``gridsearch``.
    """
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
            ])),
            # ('starting_verb', StartingVerbExtractor())
        ])),
        ('clf', MultiOutputClassifier(MultinomialNB())),
    ])
    # Idiomatic truthiness test instead of `== True`.
    if gridsearch:
        parameters = {
            'features__text_pipeline__vect__ngram_range':
                ((1, 1), (1, 2), (1, 3)),
            'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
            'features__text_pipeline__vect__max_features': (None, 5000, 10000),
            'features__text_pipeline__tfidf__use_idf': (True, False),
        }
        return GridSearchCV(pipeline, param_grid=parameters)
    return pipeline
def build_model():
    """Build a machine-learning model (KNeighborsClassifier).

    Returns:
        GridSearchCV: search over neighbour count and weighting scheme.
    """
    knn_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(KNeighborsClassifier())),
    ])
    grid = {
        'clf__estimator__n_neighbors': [5, 10],
        'clf__estimator__weights': ['uniform', 'distance'],
    }
    return GridSearchCV(knn_pipe, param_grid=grid)
def build_model():
    """Build the machine-learning pipeline with grid-search tuning.

    Returns:
        GridSearchCV: 3-fold search over a balanced linear-SVC pipeline.
    """
    svc_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('multioutput', MultiOutputClassifier(
            LinearSVC(class_weight='balanced'), n_jobs=-1)),
    ])
    grid = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_features': (None, 5000, 10000),
        'tfidf__use_idf': (True, False),
        'multioutput__estimator__C': [100, 1000, 10000],
    }
    # Keep the (workspace) session alive while the search object is built.
    with active_session():
        cv = GridSearchCV(svc_pipe, param_grid=grid, cv=3)
    return cv
def build_model():
    """Return the GridSearchCV object to be used as the model.

    Returns:
        GridSearchCV: grid-search model object over a forest pipeline.
    """
    forest_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])
    # Single-point grid keeps training fast while exercising the CV path.
    search_space = {
        'clf__estimator__n_estimators': [20],
        'clf__estimator__min_samples_split': [2],
    }
    return GridSearchCV(forest_pipe, param_grid=search_space)
def build_model():
    """Build the prediction model.

    Returns:
        GridSearchCV: parallel grid search over vectorizer cut-off and
        forest size.
    """
    base = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize, max_df=0.75)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            estimator=RandomForestClassifier(), n_jobs=-1)),
    ])
    # max_df is tuned again here; the pipeline value is just the default.
    grid = {
        'vect__max_df': (0.5, 0.75),
        'clf__estimator__n_estimators': [25, 50],
    }
    return GridSearchCV(base, param_grid=grid, n_jobs=-1, verbose=2)
def build_model():
    """Return a grid-search estimator over a forest pipeline.

    Returns:
        GridSearchCV: the estimator to fit.
    """
    classifier = MultiOutputClassifier(RandomForestClassifier())
    text_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ])
    search_space = {
        'clf__estimator__max_depth': [10, 50, None],
        'clf__estimator__min_samples_leaf': [2, 5, 10],
    }
    return GridSearchCV(text_pipe, search_space)
def build_model():
    """Set up the NLP training pipeline and its parameter search.

    Returns:
        GridSearchCV: scikit-learn model that can be trained on data and
        evaluated on a test set.
    """
    # Tokenize, transform to TF-IDF vectors, then boost with AdaBoost.
    ada_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(AdaBoostClassifier())),
    ])
    # Cross-validated search over boosting strength.
    grid = {
        'clf__estimator__n_estimators': [50, 100, 200],
        'clf__estimator__learning_rate': [0.1, 0.5, 1.0],
    }
    return GridSearchCV(ada_pipe, param_grid=grid, n_jobs=-1)
def build_model():
    """Create the model pipeline, specify parameters and wrap in grid search.

    Returns:
        GridSearchCV: 3-fold grid search over the forest pipeline.
    """
    model = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])
    parameters = {
        'clf__estimator__n_estimators': [1, 200],
        'clf__estimator__min_samples_split': [2, 100],
        'clf__estimator__min_samples_leaf': [5, 100],
    }
    cv = GridSearchCV(estimator=model, param_grid=parameters, cv=3)
    # Bug fix: the original returned the bare pipeline (`model`), silently
    # discarding the grid search it had just built. Return the search.
    return cv
def build_model():
    """Build the machine-learning pipeline tuned via grid search.

    Returns:
        GridSearchCV: parallel, verbose grid search over the pipeline.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=RandomForestClassifier())),
    ]
    search_space = {
        'vect__max_features': (None, 5000),
        'tfidf__use_idf': (True, False),
        'clf__estimator__n_estimators': [10, 20, 50],
        'clf__estimator__min_samples_split': [2, 3, 4],
    }
    return GridSearchCV(estimator=Pipeline(steps),
                        param_grid=search_space,
                        n_jobs=-1, verbose=2)
def build_model():
    """Build a model: pipeline plus hyper-parameter tuning via GridSearchCV.

    Returns:
        GridSearchCV: 4-fold search scored on micro-averaged recall.
    """
    # Machine-learning pipeline: bag-of-words -> TF-IDF -> forest.
    ml_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])
    # Single candidate keeps the search cheap.
    grid = {'clf__estimator__n_estimators': [50]}
    return GridSearchCV(ml_pipe, param_grid=grid,
                        scoring='recall_micro', cv=4)
def build_model():
    """Build the machine-learning model.

    Returns:
        GridSearchCV: parallel grid search over forest depth.
    """
    steps = [
        ('vectorize', CountVectorizer(tokenizer=tokenize)),
        ('tf-idf', TfidfTransformer()),
        ('classifier', MultiOutputClassifier(RandomForestClassifier())),
    ]
    # Only max_depth is tuned; the commented parameters were dropped to
    # keep the search tractable.
    grid = {
        #'classifier__estimator__n_estimators': [100, 200],
        #'classifier__estimator__criterion': ['gini', 'entropy'],
        'classifier__estimator__max_depth': [6, 9],
    }
    return GridSearchCV(Pipeline(steps), grid, n_jobs=-1)
def build_model():
    """Build a pipeline to classify messages into the 36 output categories.

    Returns:
        GridSearchCV: grid search used as the final model pipeline.
    """
    forest = RandomForestClassifier()
    text_clf = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(forest)),
    ])
    # Parameters explored by GridSearchCV.
    grid = {
        'clf__estimator__min_samples_split': [2, 3, 4],
        'vect__ngram_range': ((1, 2), (2, 2)),
    }
    return GridSearchCV(text_clf, param_grid=grid)
def build_model():
    """Build a grid-searched pipeline for model training.

    Returns:
        GridSearchCV: search over n-gram range and forest size
        (default CV splitting, via cv=None).
    """
    # Machine-learning pipeline: counts -> TF-IDF -> multi-output forest.
    ml_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])
    tuning = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__estimator__n_estimators': [10, 20, 50],
    }
    return GridSearchCV(ml_pipe, param_grid=tuning, cv=None)
def build_model():
    """Build a grid-searched random-forest text classifier.

    The commented parameter groups document grids tried with other
    estimators (SVC, DecisionTree) during experimentation.

    Returns:
        GridSearchCV: search over the active RandomForest grid.
    """
    rf_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])
    search_space = {
        # RandomForestClassifier
        'clf__estimator__n_estimators': [50, 100],
        #'clf__estimator__min_samples_split': [2,5],
        #'clf__estimator__criterion': ['entropy', 'gini']
        # SVC
        #"clf__estimator__C": [0.001, 0.01, 0.1, 1, 10],
        #"clf__estimator__gamma":[0.001, 0.01, 0.1, 1]
        # DecisionTreeClassifier
        #"clf__estimator__criterion": ['entropy', 'gini'],
        #"clf__estimator__min_samples_split":[2,4]
    }
    return GridSearchCV(rf_pipe, param_grid=search_space)
def model_pipeline():
    """Set up the model pipeline with custom transformers and grid search.

    Returns:
        GridSearchCV: search fixed to the best parameters found in a
        previous, wider CV run (grid kept in comments for reference).
    """
    full_pipe = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
            ])),
            ('starting_verb', StartingVerbExtractor()),
            ('text_length', TextLengthExtractor()),
            ('word_count', WordCountExtractor()),
            ('sentiment', SentimentExtractor()),
        ])),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])
    # Original wider grid:
    # parameters = {'features__text_pipeline__tfidf__norm': ['l1', 'l2'],
    #               'clf__estimator__criterion': ["gini", "entropy"],
    #               'clf__estimator__max_features': ['auto', 'sqrt', 'log2'],
    #               'clf__estimator__class_weight': ['balanced']}
    # Best CV params; class_weight='balanced' accounts for class imbalance.
    best_params = {
        'features__text_pipeline__tfidf__norm': ['l2'],
        'clf__estimator__criterion': ["gini"],
        'clf__estimator__max_features': ['sqrt'],
        'clf__estimator__class_weight': ['balanced'],
    }
    # Focus on the f1 score due to the unbalanced classes.
    search = GridSearchCV(full_pipe, param_grid=best_params,
                          verbose=3, n_jobs=-1)
    logging.debug('function:model_pipeline: model pipeline instantiated')
    return search
def build_model():
    """Build an ML pipeline using TF-IDF, random forest, and grid search.

    Returns:
        GridSearchCV: 5-fold, parallel, verbose grid search over the
        forest pipeline.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])
    parameters = {
        # Bug fix: key was 'vec__max_df', which does not match any pipeline
        # step and makes GridSearchCV raise ValueError at fit time.
        'vect__max_df': [0.8],
        'clf__estimator__max_depth': (25, 50, None),
        'clf__estimator__min_samples_split': (2, 10, 25, 50, 100),
        'clf__estimator__n_estimators': [500],
    }
    cv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=10)
    return cv
def build_model():
    """Build the ML pipeline wrapped in a grid search.

    LinearSVC was selected because it outperformed DecisionTree,
    RandomForest and AdaBoost in the ML Pipeline Preparation notebook.

    Returns:
        GridSearchCV: search over SVC loss and regularisation strength.
    """
    svc_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(LinearSVC())),
    ])
    # Inspect tunable names with svc_pipe.get_params() if extending.
    grid = {
        'clf__estimator__loss': ('hinge', 'squared_hinge'),
        'clf__estimator__C': (0.5, 1.0),
    }
    return GridSearchCV(estimator=svc_pipe, n_jobs=-1, param_grid=grid)
def build_model():
    """Build a TF-IDF pipeline for multi-output classification over the
    36 categories in the dataset.

    Returns:
        GridSearchCV: GridSearchCV model object.
    """
    svc_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(OneVsRestClassifier(LinearSVC()))),
    ])
    # Double 'estimator__' hops through MultiOutput then OneVsRest.
    grid = {
        'tfidf__smooth_idf': [True, False],
        'clf__estimator__estimator__C': [1, 2, 5],
    }
    return GridSearchCV(svc_pipe, grid)
def build_model():
    """Build the classification model.

    Returns:
        GridSearchCV: verbose, 4-worker grid search over n-gram range and
        forest split size.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ]
    grid = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'clf__estimator__min_samples_split': [2, 4],
    }
    return GridSearchCV(Pipeline(steps), param_grid=grid,
                        verbose=2, n_jobs=4)
def build_model():
    """Build the model pipeline: feature extraction plus estimator.

    Returns:
        GridSearchCV: parallel search over the forest split criterion.
    """
    forest_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier(), n_jobs=-1)),
    ])
    # n_jobs is included in the grid so the fitted forest parallelises too.
    grid = {
        'clf__estimator__criterion': ["gini", "entropy"],
        'clf__estimator__n_jobs': [-1],
    }
    return GridSearchCV(forest_pipe, grid, n_jobs=-1)
def build_model():
    """Build the model pipeline.

    Output is a tuned model that processes text messages and applies a
    classifier for scoring.

    Returns:
        GridSearchCV: 2-fold verbose search over AdaBoost size.
    """
    ada_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(AdaBoostClassifier())),
    ])
    # Hyper-parameter grid.
    grid = {'clf__estimator__n_estimators': (50, 100)}
    return GridSearchCV(estimator=ada_pipe, param_grid=grid,
                        verbose=3, cv=2)