def get_ptsd_mh_model(valid_users=None, forced_retrain=False):
    """Return the PTSD mental-health-mentions classifier pipeline.

    Loads the pickled pipeline from the configured cache path when one
    exists and ``forced_retrain`` is False; otherwise trains a fresh
    pipeline on the labelled PTSD data, caches it, and returns it.

    :param valid_users: forwarded to ``get_labelled_data_for_ptsd`` to
        restrict the training data (``None`` = no restriction).
    :param forced_retrain: when True, ignore any cached model and retrain.
    """
    model_path = config.ACTIVE_MENTIONS_PTSD_MH_MODEL_PATH

    cache_usable = not forced_retrain and os.path.isfile(model_path)
    if cache_usable:
        return util.load_picke_file(model_path)

    texts, targets = get_labelled_data_for_ptsd(valid_users)
    model = get_classifier_pipeline()
    model.fit(texts, targets)

    # Persist so subsequent calls hit the cache above.
    util.dump_picke_file(model, model_path)
    return model
                            scoring=scoring,
                            cv=cv_folds,
                            return_train_score=False,
                            verbose=3)

    for t in scoring:
        s = scores['test_' + t]
        print(str(round(s.mean(), 3)) + '(' + str(round(s.std(), 3)) + '), ',
              end=' ')
    print()
    return scores


if __name__ == "__main__":
    # Build the evaluation dataset: concatenate the main and held-out
    # control-vs-depression frames (filtered-tweet variants).
    df = pd.concat([
        util.load_picke_file(config.CTRL_DEPR_FILTERED_DF),
        util.load_picke_file(config.CTRL_DEPR_HELD_OUT_FILTERED_DF)
    ])
    # Alternate dataset selections kept for switching between experiments
    # (unfiltered depression data, and the PTSD variants):
    # df = pd.concat([util.load_picke_file(config.CTRL_DEPR_DF), util.load_picke_file(config.CTRL_DEPR_HELD_OUT_DF)])
    # df = pd.concat([util.load_picke_file(config.CTRL_PTSD_DF), util.load_picke_file(config.CTRL_PTSD_HELD_OUT_DF)])
    # df = pd.concat([util.load_picke_file(config.CTRL_PTSD_FILTERED_DF), util.load_picke_file(config.CTRL_PTSD_HELD_OUT_FILTERED_DF)])
    # df = util.load_picke_file(config.CTRL_PTSD_HELD_OUT_DF)
    # Integer class labels, one per row of the concatenated frame.
    labels = df['labels'].astype(int).values

    # Candidate pipelines keyed by configuration name; the classifier is a
    # linear SVM wrapped in 3-fold probability calibration.
    # NOTE(review): slda_priors_model and vocab_path are not defined in this
    # chunk — presumably set earlier in the file; verify before running.
    pipelines = models.get_pipelines(
        mlutils.selector_fn_noop, 'split_filtered_tweets',
        CalibratedClassifierCV(svm.LinearSVC(), cv=3), slda_priors_model,
        vocab_path)

    # Resume from previously saved cross-validation results; start from the
    # empty dict (commented line) for a fresh run.
    # cross_val_results = {}
    cross_val_results = util.load_picke_file('crossval_dvc_filtered_results.p')
    # NOTE(review): the script appears truncated here in this chunk.
# --- Example #3 (scraped sample separator; original marker "示例#3", vote count 0) ---
def _bow_tfidf(vocab):
    """Fresh TF-IDF vectorizer over alphanumeric tokens, fixed vocabulary."""
    return TfidfVectorizer(vocabulary=vocab,
                           tokenizer=mlutils.tokenize_only_alphanumeric_tokens)


def _clusters_tfidf():
    """Fresh TF-IDF vectorizer over Brown-cluster tags (min_df=0.01)."""
    return TfidfVectorizer(tokenizer=bc.tokenize_and_tag, min_df=0.01)


def _slda(vocab_path, slda_dir, slda_priors_model):
    """Fresh supervised-LDA transformer with the shared hyperparameters."""
    return mlutils.SLDA(vocab_path,
                        dir=slda_dir,
                        iters=100,
                        e_step_iters=100,
                        priors_model=slda_priors_model)


def _column_pipeline(name, col_selector_fn, vectorizer):
    """(name, Pipeline) step: select a column, then apply `vectorizer`."""
    return (name,
            Pipeline([('selector', mlutils.ColumnSelector(col_selector_fn)),
                      ('tfidf', vectorizer)]))


def _pos_pipeline(tag_column):
    """('pos_tags', Pipeline) step: TF-IDF of 1-3-grams over POS tags
    read from `tag_column` (e.g. 'nltk_pos_tags' or 'cmu_pos_tags')."""
    return ('pos_tags',
            Pipeline([('selector', mlutils.POSTagColumnSelector(tag_column)),
                      ('tfidf',
                       TfidfVectorizer(tokenizer=mlutils.tknzr.tokenize,
                                       min_df=0.01,
                                       ngram_range=(1, 3)))]))


def _chunked(name, parts, col_selector_fn):
    """(name, TweetChunker) step aggregating the `parts` FeatureUnion over
    tweet chunks selected by `col_selector_fn`."""
    return (name, mlutils.TweetChunker(FeatureUnion(parts), col_selector_fn))


def _union(*steps):
    """Wrap steps in the [('union', FeatureUnion(...))] single-step shape
    every configuration below uses."""
    return [('union', FeatureUnion(transformer_list=list(steps)))]


def get_transformers(col_selector_fn, slda_dir, slda_priors_model, vocab_path):
    """Build the named feature-extraction configurations.

    Returns a dict mapping a configuration name to a one-step pipeline
    prefix ``[('union', FeatureUnion(...))]`` combining some subset of:
    vocabulary-restricted bag-of-words TF-IDF, Brown-cluster TF-IDF,
    supervised-LDA topics, and POS-tag n-gram TF-IDF (NLTK or CMU tags).

    NOTE: the inner step names ('aggr' vs 'agg', 'bow' vs 'bagofwords')
    differ between configurations in the original and are preserved
    exactly, since downstream parameter paths depend on them.

    :param col_selector_fn: column-selector callable for ColumnSelector /
        TweetChunker.
    :param slda_dir: working directory passed to mlutils.SLDA.
    :param slda_priors_model: priors model passed to mlutils.SLDA.
    :param vocab_path: pickle path of the TF-IDF vocabulary (also passed
        to mlutils.SLDA).
    """
    vocab = util.load_picke_file(vocab_path)

    def slda():
        # Each configuration needs its own (unfitted) SLDA instance.
        return _slda(vocab_path, slda_dir, slda_priors_model)

    def agg_slda_bow():
        # Shared chunked [SLDA + BOW] step used by the aggslda_* configs.
        return _chunked('agg', [('slda', slda()), ('bow', _bow_tfidf(vocab))],
                        col_selector_fn)

    return {
        'aggr_bow':
        _union(_chunked('aggr', [('bow', _bow_tfidf(vocab))],
                        col_selector_fn)),
        'aggr_clusters':
        _union(_chunked('aggr', [('clusters', _clusters_tfidf())],
                        col_selector_fn)),
        'bow':
        _union(_column_pipeline('bagofwords', col_selector_fn,
                                _bow_tfidf(vocab))),
        'clusters':
        _union(_column_pipeline('brown_clusters', col_selector_fn,
                                _clusters_tfidf())),
        'slda':
        _union(_chunked('agg', [('slda', slda())], col_selector_fn)),
        'aggslda_bow':
        _union(agg_slda_bow()),
        'aggslda_aggbow_clusters':
        _union(
            agg_slda_bow(),
            _column_pipeline('brown_clusters', col_selector_fn,
                             _clusters_tfidf())),
        'aggslda_aggbow_clusters_nltk':
        _union(
            agg_slda_bow(),
            _column_pipeline('brown_clusters', col_selector_fn,
                             _clusters_tfidf()),
            _pos_pipeline('nltk_pos_tags')),
        'aggslda_aggbow_clusters_cmu':
        _union(
            agg_slda_bow(),
            _column_pipeline('brown_clusters', col_selector_fn,
                             _clusters_tfidf()),
            _pos_pipeline('cmu_pos_tags')),
        'bow_clusters_cmu':
        _union(
            _column_pipeline('bagofwords', col_selector_fn,
                             _bow_tfidf(vocab)),
            _column_pipeline('brown_clusters', col_selector_fn,
                             _clusters_tfidf()),
            _pos_pipeline('cmu_pos_tags')),
        'bow_clusters_nltk':
        _union(
            _column_pipeline('bagofwords', col_selector_fn,
                             _bow_tfidf(vocab)),
            _column_pipeline('brown_clusters', col_selector_fn,
                             _clusters_tfidf()),
            _pos_pipeline('nltk_pos_tags')),
    }
# --- Example #4 (scraped sample separator; original marker "示例#4", vote count 0) ---
        print(index)
        probs.append(model.predict_proba(row['tweets'])[:, 1])
    df['mental_health_probs'] = probs

    return df


if __name__ == '__main__':
    # An earlier (now commented-out) run processed the depression frames the
    # same way: load CTRL_DEPR_DF / CTRL_DEPR_HELD_OUT_DF, call
    # add_depr_and_mental_health_tweet_probs with valid_users taken from the
    # main frame's index, and dump each frame back to its source pickle.

    # No user restriction for PTSD; set to ctrl_ptsd.index.values to limit
    # the model training to users present in the main frame.
    valid_users = None

    # Annotate both PTSD frames (main, then held-out) and write each one
    # back to the pickle it was loaded from.
    for frame_path in (config.CTRL_PTSD_DF, config.CTRL_PTSD_HELD_OUT_DF):
        frame = util.load_picke_file(frame_path)
        frame = add_ptsd_and_mental_health_tweet_probs(
            frame, valid_users=valid_users)
        util.dump_picke_file(frame, frame_path)