def get_ptsd_mh_model(valid_users=None, forced_retrain=False):
    """Return the PTSD mental-health classifier pipeline.

    Loads the pickled model from ``config.ACTIVE_MENTIONS_PTSD_MH_MODEL_PATH``
    when one exists and ``forced_retrain`` is False; otherwise trains a fresh
    classifier on the labelled PTSD data and caches it at the same path.

    :param valid_users: optional user filter forwarded to
        ``get_labelled_data_for_ptsd`` (None means no filtering here).
    :param forced_retrain: when True, ignore any cached model and retrain.
    :return: a fitted classifier pipeline.
    """
    model_path = config.ACTIVE_MENTIONS_PTSD_MH_MODEL_PATH
    use_cache = not forced_retrain and os.path.isfile(model_path)
    if use_cache:
        return util.load_picke_file(model_path)

    train_tweets, train_labels = get_labelled_data_for_ptsd(valid_users)
    model = get_classifier_pipeline()
    model.fit(train_tweets, train_labels)
    # Persist so the next call can skip the (slow) training step.
    util.dump_picke_file(model, model_path)
    return model
        # (continuation) trailing keyword args of a cross-validation call whose
        # opening line is outside this chunk — presumably sklearn
        # cross_validate(...); TODO confirm against the full file.
        scoring=scoring, cv=cv_folds, return_train_score=False, verbose=3)
    # Print "mean(std), " for every requested scoring metric on one line.
    for t in scoring:
        s = scores['test_' + t]
        print(str(round(s.mean(), 3)) + '(' + str(round(s.std(), 3)) + '), ', end=' ')
    print()
    return scores


if __name__ == "__main__":
    # Evaluate on the filtered control-vs-depression corpus (main + held-out).
    df = pd.concat([
        util.load_picke_file(config.CTRL_DEPR_FILTERED_DF),
        util.load_picke_file(config.CTRL_DEPR_HELD_OUT_FILTERED_DF)
    ])
    # Alternative corpora, kept for switching between experiments by hand:
    # df = pd.concat([util.load_picke_file(config.CTRL_DEPR_DF), util.load_picke_file(config.CTRL_DEPR_HELD_OUT_DF)])
    # df = pd.concat([util.load_picke_file(config.CTRL_PTSD_DF), util.load_picke_file(config.CTRL_PTSD_HELD_OUT_DF)])
    # df = pd.concat([util.load_picke_file(config.CTRL_PTSD_FILTERED_DF), util.load_picke_file(config.CTRL_PTSD_HELD_OUT_FILTERED_DF)])
    # df = util.load_picke_file(config.CTRL_PTSD_HELD_OUT_DF)
    labels = df['labels'].astype(int).values
    pipelines = models.get_pipelines(
        mlutils.selector_fn_noop, 'split_filtered_tweets',
        # Calibrated linear SVM so downstream code can use predict_proba.
        CalibratedClassifierCV(svm.LinearSVC(), cv=3),
        slda_priors_model, vocab_path)
    # cross_val_results = {}
    # Resume from previously saved cross-validation results instead of
    # starting from an empty dict (commented line above).
    cross_val_results = util.load_picke_file('crossval_dvc_filtered_results.p')
def get_transformers(col_selector_fn, slda_dir, slda_priors_model, vocab_path):
    """Build the named feature-extraction pipelines used by the classifiers.

    Each dict value is a one-element list ``[('union', FeatureUnion(...))]``
    ready to be spliced into an sklearn Pipeline. The original body repeated
    the same TfidfVectorizer / SLDA / Pipeline constructions up to eight
    times; they are factored into local factory helpers below. Every step
    name, dict key, and hyperparameter is unchanged, and each dict entry
    still gets fresh transformer instances (factories are *called* per
    entry), so no fitted state is shared between feature sets.

    :param col_selector_fn: callable selecting the tweet-text column(s);
        passed to TweetChunker / ColumnSelector.
    :param slda_dir: working directory for the SLDA transformer.
    :param slda_priors_model: priors model forwarded to SLDA.
    :param vocab_path: path to the pickled vocabulary used by the
        bag-of-words vectorizers and by SLDA.
    :return: dict mapping feature-set name -> pipeline step list.
    """
    vocab = util.load_picke_file(vocab_path)

    def _bow_tfidf():
        # Bag-of-words over the fixed vocabulary.
        return TfidfVectorizer(
            vocabulary=vocab,
            tokenizer=mlutils.tokenize_only_alphanumeric_tokens)

    def _clusters_tfidf():
        # Brown-cluster features; min_df prunes very rare cluster ids.
        return TfidfVectorizer(tokenizer=bc.tokenize_and_tag, min_df=0.01)

    def _slda():
        return mlutils.SLDA(vocab_path, dir=slda_dir, iters=100,
                            e_step_iters=100,
                            priors_model=slda_priors_model)

    def _col_pipeline(tfidf):
        # Select the tweet column, then vectorize it.
        return Pipeline([
            ('selector', mlutils.ColumnSelector(col_selector_fn)),
            ('tfidf', tfidf)])

    def _pos_pipeline(tag_column):
        # POS-tag n-grams (1..3) from a precomputed tag column.
        return Pipeline([
            ('selector', mlutils.POSTagColumnSelector(tag_column)),
            ('tfidf', TfidfVectorizer(tokenizer=mlutils.tknzr.tokenize,
                                      min_df=0.01, ngram_range=(1, 3)))])

    def _chunker(features):
        # Aggregate per-tweet features over chunks of a user's timeline.
        return mlutils.TweetChunker(FeatureUnion(features), col_selector_fn)

    def _union(transformers):
        return [('union', FeatureUnion(transformer_list=transformers))]

    return {
        'aggr_bow': _union([
            ('aggr', _chunker([('bow', _bow_tfidf())]))]),
        'aggr_clusters': _union([
            ('aggr', _chunker([('clusters', _clusters_tfidf())]))]),
        'bow': _union([
            ('bagofwords', _col_pipeline(_bow_tfidf()))]),
        'clusters': _union([
            ('brown_clusters', _col_pipeline(_clusters_tfidf()))]),
        # NOTE: the chunked step is named 'agg' here but 'aggr' above —
        # preserved as-is because step names are part of the fitted
        # pipelines' parameter namespace.
        'slda': _union([
            ('agg', _chunker([('slda', _slda())]))]),
        'aggslda_bow': _union([
            ('agg', _chunker([('slda', _slda()), ('bow', _bow_tfidf())]))]),
        'aggslda_aggbow_clusters': _union([
            ('agg', _chunker([('slda', _slda()), ('bow', _bow_tfidf())])),
            ('brown_clusters', _col_pipeline(_clusters_tfidf())),
        ]),
        'aggslda_aggbow_clusters_nltk': _union([
            ('agg', _chunker([('slda', _slda()), ('bow', _bow_tfidf())])),
            ('brown_clusters', _col_pipeline(_clusters_tfidf())),
            ('pos_tags', _pos_pipeline('nltk_pos_tags')),
        ]),
        'aggslda_aggbow_clusters_cmu': _union([
            ('agg', _chunker([('slda', _slda()), ('bow', _bow_tfidf())])),
            ('brown_clusters', _col_pipeline(_clusters_tfidf())),
            ('pos_tags', _pos_pipeline('cmu_pos_tags')),
        ]),
        'bow_clusters_cmu': _union([
            ('bagofwords', _col_pipeline(_bow_tfidf())),
            ('brown_clusters', _col_pipeline(_clusters_tfidf())),
            ('pos_tags', _pos_pipeline('cmu_pos_tags')),
        ]),
        'bow_clusters_nltk': _union([
            ('bagofwords', _col_pipeline(_bow_tfidf())),
            ('brown_clusters', _col_pipeline(_clusters_tfidf())),
            ('pos_tags', _pos_pipeline('nltk_pos_tags')),
        ]),
    }
        # NOTE(review): these first statements are the tail of a row loop whose
        # header is outside this chunk — presumably `for index, row in
        # df.iterrows():` inside add_*_tweet_probs; TODO confirm.
        print(index)
        # Positive-class probability for each of this user's tweets.
        probs.append(model.predict_proba(row['tweets'])[:, 1])
    df['mental_health_probs'] = probs
    return df


if __name__ == '__main__':
    # Depression runs, disabled for now — kept for manual re-runs:
    # ctrl_depr = util.load_picke_file(config.CTRL_DEPR_DF)
    # valid_users = ctrl_depr.index.values
    # ctrl_depr = add_depr_and_mental_health_tweet_probs(ctrl_depr, valid_users=valid_users)
    # util.dump_picke_file(ctrl_depr, config.CTRL_DEPR_DF)
    # ctrl_depr = None
    # ctrl_depr_held_out = util.load_picke_file(config.CTRL_DEPR_HELD_OUT_DF)
    # ctrl_depr_held_out = add_depr_and_mental_health_tweet_probs(ctrl_depr_held_out, valid_users=valid_users)
    # util.dump_picke_file(ctrl_depr_held_out, config.CTRL_DEPR_HELD_OUT_DF)
    # ctrl_depr_held_out=None

    # Annotate the PTSD frames with tweet-level probabilities in place,
    # writing each frame back to its original pickle path.
    ctrl_ptsd = util.load_picke_file(config.CTRL_PTSD_DF)
    valid_users = None  # ctrl_ptsd.index.values
    ctrl_ptsd = add_ptsd_and_mental_health_tweet_probs(ctrl_ptsd,
                                                      valid_users=valid_users)
    util.dump_picke_file(ctrl_ptsd, config.CTRL_PTSD_DF)
    ctrl_ptsd_held_out = util.load_picke_file(config.CTRL_PTSD_HELD_OUT_DF)
    ctrl_ptsd_held_out = add_ptsd_and_mental_health_tweet_probs(
        ctrl_ptsd_held_out, valid_users=valid_users)
    util.dump_picke_file(ctrl_ptsd_held_out, config.CTRL_PTSD_HELD_OUT_DF)