Exemplo n.º 1
0
def tfidf_logreg(config):
    """Wire char- and word-level TF-IDF features into a logistic regression.

    Preprocessing is requested with ``is_train=False`` and the two TF-IDF
    steps are obtained from ``_tfidf``.  The classifier's ``X`` is the
    ``'features'`` output of both vectorizers combined through
    ``sparse_hstack_inputs``; its ``y`` is mapped to
    ``('cleaning_output', 'y')``.

    NOTE(review): adapter entries look like ``(step name, output key)``
    pairs resolved by ``Step`` -- confirm against the ``Step``
    implementation.

    Args:
        config: configuration object.  This function reads
            ``config.logistic_regression_multilabel`` and
            ``config.env.cache_dirpath`` and forwards ``config`` to the
            helper builders.

    Returns:
        The ``'tfidf_logreg_output'`` Step, a ``Dummy`` wrapper that maps the
        classifier's ``'prediction_probability'`` to ``'y_pred'``.
    """
    cleaned = _preprocessing(config, is_train=False)
    char_tfidf, word_tfidf = _tfidf(cleaned, config)
    cache_dir = config.env.cache_dirpath

    classifier = LogisticRegressionMultilabel(
        **config.logistic_regression_multilabel)
    feature_sources = [
        ('tfidf_char_vectorizer', 'features'),
        ('tfidf_word_vectorizer', 'features'),
    ]
    classifier_step = Step(
        name='tfidf_logreg',
        transformer=classifier,
        input_steps=[cleaned, char_tfidf, word_tfidf],
        adapter={
            'X': (feature_sources, sparse_hstack_inputs),
            'y': [('cleaning_output', 'y')],
        },
        cache_dirpath=cache_dir,
    )

    # Thin output step: only renames the prediction key.
    return Step(
        name='tfidf_logreg_output',
        transformer=Dummy(),
        input_steps=[classifier_step],
        adapter={'y_pred': [('tfidf_logreg', 'prediction_probability')]},
        cache_dirpath=cache_dir,
    )
Exemplo n.º 2
0
def bad_word_count_features_logreg(config):
    """Combine normalized count features with bad-word TF-IDF in a logreg.

    The normalizer step comes from ``_count_features`` and its
    ``'xy_split'`` step is looked up with ``get_step``.  The bad-word TF-IDF
    step is built by ``_bad_word_tfidf`` on top of the preprocessing output
    (requested with ``is_train=False``).  Both feature sources are combined
    with ``sparse_hstack_inputs`` and passed as ``X``; ``y`` is mapped to
    ``('xy_split', 'y')``.

    Args:
        config: configuration object.  This function reads
            ``config.logistic_regression_multilabel`` and
            ``config.env.cache_dirpath`` and forwards ``config`` to the
            helper builders.

    Returns:
        The ``'bad_word_count_features_logreg_output'`` Step, a ``Dummy``
        wrapper mapping ``'prediction_probability'`` to ``'y_pred'``.
    """
    cleaned = _preprocessing(config, is_train=False)
    count_normalizer = _count_features(config)
    splitter = count_normalizer.get_step('xy_split')
    bad_word_tfidf = _bad_word_tfidf(cleaned, config)
    cache_dir = config.env.cache_dirpath

    feature_sources = [
        ('normalizer', 'X'),
        ('bad_word_tfidf_word_vectorizer', 'features'),
    ]
    classifier_step = Step(
        name='bad_word_count_logreg',
        transformer=LogisticRegressionMultilabel(
            **config.logistic_regression_multilabel),
        input_steps=[splitter, count_normalizer, bad_word_tfidf],
        adapter={
            'X': (feature_sources, sparse_hstack_inputs),
            'y': [('xy_split', 'y')],
        },
        cache_dirpath=cache_dir,
    )

    # Thin output step: only renames the prediction key.
    return Step(
        name='bad_word_count_features_logreg_output',
        transformer=Dummy(),
        input_steps=[classifier_step],
        adapter={
            'y_pred': [('bad_word_count_logreg', 'prediction_probability')],
        },
        cache_dirpath=cache_dir,
    )
Exemplo n.º 3
0
def hand_crafted_all_logreg(config):
    """Train-graph for a logreg over every hand-crafted feature source.

    ``hand_crafted_all(config)`` supplies five steps (split, normalizer and
    three vectorizers).  The normalizer's ``'X'`` output and the
    ``'features'`` output of each vectorizer are combined with
    ``sparse_hstack_inputs`` and passed as ``X``; ``y`` is mapped to
    ``('xy_split', 'y')``.

    Args:
        config: configuration object.  This function reads
            ``config.logistic_regression_multilabel`` and
            ``config.env.cache_dirpath`` and forwards ``config`` to
            ``hand_crafted_all``.

    Returns:
        The ``'logreg_output'`` Step, a ``Dummy`` wrapper mapping the
        classifier's ``'prediction_probability'`` to ``'y_pred'``.
    """
    (splitter, count_normalizer, char_tfidf,
     word_tfidf, bad_word_tfidf) = hand_crafted_all(config)
    cache_dir = config.env.cache_dirpath

    feature_sources = [
        ('normalizer', 'X'),
        ('tfidf_char_vectorizer', 'features'),
        ('tfidf_word_vectorizer', 'features'),
        ('bad_word_tfidf_word_vectorizer', 'features'),
    ]
    classifier_step = Step(
        name='logreg_multi',
        transformer=LogisticRegressionMultilabel(
            **config.logistic_regression_multilabel),
        input_steps=[
            splitter, count_normalizer, char_tfidf, word_tfidf,
            bad_word_tfidf,
        ],
        adapter={
            'X': (feature_sources, sparse_hstack_inputs),
            'y': [('xy_split', 'y')],
        },
        cache_dirpath=cache_dir,
    )

    # Thin output step: only renames the prediction key.
    return Step(
        name='logreg_output',
        transformer=Dummy(),
        input_steps=[classifier_step],
        adapter={'y_pred': [('logreg_multi', 'prediction_probability')]},
        cache_dirpath=cache_dir,
    )
Exemplo n.º 4
0
def logistic_regression_ensemble_train(config):
    """Stack the outputs of ``ensemble_extraction`` under a logreg.

    Each step returned by ``ensemble_extraction(config)`` contributes its
    ``'prediction_probability'`` output; these are combined with
    ``hstack_inputs`` and passed as ``X``.  The ``'xy_train'`` step is
    looked up through the first returned step and provides ``y``.

    Args:
        config: configuration object.  This function reads
            ``config.logistic_regression_ensemble`` and
            ``config.env.cache_dirpath`` and forwards ``config`` to
            ``ensemble_extraction``.

    Returns:
        The ``'logreg_ensemble_output'`` Step, a ``Dummy`` wrapper mapping
        the ensemble's ``'prediction_probability'`` to ``'y_pred'``.
    """
    base_models = ensemble_extraction(config)
    prediction_sources = [
        (model_step.name, 'prediction_probability')
        for model_step in base_models
    ]
    labels_step = base_models[0].get_step('xy_train')
    cache_dir = config.env.cache_dirpath

    ensemble_step = Step(
        name='logreg_ensemble',
        transformer=LogisticRegressionMultilabel(
            **config.logistic_regression_ensemble),
        overwrite_transformer=True,
        # The label step is appended after the model outputs.
        input_steps=base_models + [labels_step],
        adapter={
            'X': (prediction_sources, hstack_inputs),
            'y': [('xy_train', 'y')],
        },
        cache_dirpath=cache_dir,
    )

    # Thin output step: only renames the prediction key.
    return Step(
        name='logreg_ensemble_output',
        transformer=Dummy(),
        input_steps=[ensemble_step],
        adapter={'y_pred': [('logreg_ensemble', 'prediction_probability')]},
        cache_dirpath=cache_dir,
    )
def bad_word_logreg(config):
    """Build a logreg graph that uses only the bad-word TF-IDF features.

    The preprocessing step comes from ``inference_preprocessing`` and the
    vectorizer step from ``bad_word_tfidf``.  ``X`` is mapped to the
    vectorizer's ``'features'`` output and ``y`` to
    ``('cleaning_output', 'y')``.

    Args:
        config: configuration object.  This function reads
            ``config.logistic_regression_multilabel`` and
            ``config.env.cache_dirpath`` and forwards ``config`` to the
            helper builders.

    Returns:
        The ``'logreg_output'`` Step, a ``Dummy`` wrapper mapping the
        classifier's ``'prediction_probability'`` to ``'y_pred'``.
    """
    cleaned = inference_preprocessing(config)
    bad_word_vectorizer = bad_word_tfidf(cleaned, config)
    cache_dir = config.env.cache_dirpath

    classifier_step = Step(
        name='logreg_bad_word',
        transformer=LogisticRegressionMultilabel(
            **config.logistic_regression_multilabel),
        input_steps=[cleaned, bad_word_vectorizer],
        adapter={
            'X': [('bad_word_tfidf_word_vectorizer', 'features')],
            'y': [('cleaning_output', 'y')],
        },
        cache_dirpath=cache_dir,
    )

    # Thin output step: only renames the prediction key.
    return Step(
        name='logreg_output',
        transformer=Dummy(),
        input_steps=[classifier_step],
        adapter={'y_pred': [('logreg_bad_word', 'prediction_probability')]},
        cache_dirpath=cache_dir,
    )
def count_features_logreg(config):
    """Build a logreg graph that uses only the normalized count features.

    The normalizer step comes from ``count_features(config)``; its
    ``'xy_split'`` step is looked up with ``get_step``.  ``X`` is mapped to
    ``('normalizer', 'X')`` and ``y`` to ``('xy_split', 'y')``.

    Args:
        config: configuration object.  This function reads
            ``config.logistic_regression_multilabel`` and
            ``config.env.cache_dirpath`` and forwards ``config`` to
            ``count_features``.

    Returns:
        The ``'logreg_output'`` Step, a ``Dummy`` wrapper mapping the
        classifier's ``'prediction_probability'`` to ``'y_pred'``.
    """
    count_normalizer = count_features(config)
    splitter = count_normalizer.get_step('xy_split')
    cache_dir = config.env.cache_dirpath

    classifier_step = Step(
        name='logreg_count',
        transformer=LogisticRegressionMultilabel(
            **config.logistic_regression_multilabel),
        input_steps=[splitter, count_normalizer],
        adapter={
            'X': [('normalizer', 'X')],
            'y': [('xy_split', 'y')],
        },
        cache_dirpath=cache_dir,
    )

    # Thin output step: only renames the prediction key.
    return Step(
        name='logreg_output',
        transformer=Dummy(),
        input_steps=[classifier_step],
        adapter={'y_pred': [('logreg_count', 'prediction_probability')]},
        cache_dirpath=cache_dir,
    )
Exemplo n.º 7
0
def tfidf_log_reg(preprocessed_input, config):
    """Build TF-IDF vectorizers and a logreg on top of a preprocessing step.

    Two ``TfidfVectorizer`` steps (char and word configuration) take
    ``('xy_split', 'X')`` passed through ``fetch_x_train`` as their
    ``text``.  Their ``'features'`` outputs are combined with
    ``sparse_hstack_inputs`` and passed as the classifier's ``X``; ``y`` is
    mapped to ``('xy_split', 'y')``.

    NOTE(review): the adapters refer to a step named ``'xy_split'`` --
    presumably that is (or is reachable from) ``preprocessed_input``;
    confirm with the caller.

    Args:
        preprocessed_input: upstream Step used as input of both vectorizers
            and of the classifier.
        config: configuration object.  This function reads
            ``config.tfidf_char_vectorizer``,
            ``config.tfidf_word_vectorizer``,
            ``config.logistic_regression_multilabel`` and
            ``config.env.cache_dirpath``.

    Returns:
        The ``'log_reg_output'`` Step, a ``Dummy`` wrapper mapping the
        classifier's ``'prediction_probability'`` to ``'y_pred'``.
    """
    cache_dir = config.env.cache_dirpath

    char_tfidf = Step(
        name='tfidf_char_vectorizer',
        transformer=TfidfVectorizer(**config.tfidf_char_vectorizer),
        input_steps=[preprocessed_input],
        adapter={'text': ([('xy_split', 'X')], fetch_x_train)},
        cache_dirpath=cache_dir,
    )
    word_tfidf = Step(
        name='tfidf_word_vectorizer',
        transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
        input_steps=[preprocessed_input],
        adapter={'text': ([('xy_split', 'X')], fetch_x_train)},
        cache_dirpath=cache_dir,
    )

    feature_sources = [
        ('tfidf_char_vectorizer', 'features'),
        ('tfidf_word_vectorizer', 'features'),
    ]
    classifier_step = Step(
        name='log_reg_multi',
        transformer=LogisticRegressionMultilabel(
            **config.logistic_regression_multilabel),
        input_steps=[preprocessed_input, char_tfidf, word_tfidf],
        adapter={
            'X': (feature_sources, sparse_hstack_inputs),
            'y': [('xy_split', 'y')],
        },
        cache_dirpath=cache_dir,
    )

    # Thin output step: only renames the prediction key.
    return Step(
        name='log_reg_output',
        transformer=Dummy(),
        input_steps=[classifier_step],
        adapter={'y_pred': [('log_reg_multi', 'prediction_probability')]},
        cache_dirpath=cache_dir,
    )
def ensemble_extraction(config):
    """Assemble the nine model steps whose outputs feed an ensemble.

    The graph is rooted at an ``'xy_train'`` split reading
    ``'input_ensemble'``.  From it the function builds a text cleaner,
    char/word tokenizers, char/word TF-IDF vectorizers, a bad-word filter
    with its own TF-IDF vectorizer, a text counter with a normalizer, and a
    GloVe embeddings matrix.  On top of those it creates four logistic
    regressions and five neural-network steps, all with
    ``cache_output=True``.

    NOTE(review): adapter entries look like ``(source name, output key)``
    pairs resolved by ``Step`` -- confirm against the ``Step``
    implementation.

    Args:
        config: configuration object.  Transformer keyword arguments are
            read from its attributes (``xy_splitter``, ``text_cleaner``,
            ``char_tokenizer``, ``word_tokenizer``,
            ``tfidf_char_vectorizer``, ``tfidf_word_vectorizer``,
            ``bad_word_filter``, ``glove_embeddings``,
            ``logistic_regression_multilabel`` and the ``*_network``
            entries); ``config.env.cache_dirpath`` is given to every step.

    Returns:
        list: ``[logreg_count, logreg_bad_word, logreg_bad_word_count,
        logreg_tfidf, char_vdcnn, word_lstm, glove_lstm, glove_scnn,
        glove_dpcnn]`` as Step objects, in that order.
    """
    cache_dir = config.env.cache_dirpath

    def make_step(name, transformer, **wiring):
        # Every step of this graph shares the same cache directory.
        return Step(name=name, transformer=transformer,
                    cache_dirpath=cache_dir, **wiring)

    def make_model(name, transformer, input_steps, adapter):
        # Model steps are additionally created with cache_output=True.
        return make_step(name, transformer, input_steps=input_steps,
                         adapter=adapter, cache_output=True)

    def make_logreg(name, input_steps, adapter):
        # Each logistic regression gets its own transformer instance.
        return make_model(
            name,
            LogisticRegressionMultilabel(
                **config.logistic_regression_multilabel),
            input_steps, adapter)

    # ---- data split and cleaning ----------------------------------------
    xy_train = make_step(
        'xy_train', XYSplit(**config.xy_splitter),
        input_data=['input_ensemble'],
        adapter={
            'meta': [('input_ensemble', 'meta')],
            'train_mode': [('input_ensemble', 'train_mode')],
        })
    cleaner = make_step(
        'text_cleaner_train', TextCleaner(**config.text_cleaner),
        input_steps=[xy_train],
        adapter={'X': [('xy_train', 'X')]})

    # ---- tokenizers (also read train_mode from the raw input) -----------
    char_tokens = make_step(
        'char_tokenizer', Tokenizer(**config.char_tokenizer),
        input_steps=[cleaner],
        input_data=['input_ensemble'],
        adapter={
            'X': [('text_cleaner_train', 'X')],
            'train_mode': [('input_ensemble', 'train_mode')],
        })
    word_tokens = make_step(
        'word_tokenizer', Tokenizer(**config.word_tokenizer),
        input_steps=[cleaner],
        input_data=['input_ensemble'],
        adapter={
            'X': [('text_cleaner_train', 'X')],
            'train_mode': [('input_ensemble', 'train_mode')],
        })

    # ---- TF-IDF on the cleaned text -------------------------------------
    char_tfidf = make_step(
        'tfidf_char_vectorizer',
        TfidfVectorizer(**config.tfidf_char_vectorizer),
        input_steps=[cleaner],
        adapter={'text': [('text_cleaner_train', 'X')]})
    word_tfidf = make_step(
        'tfidf_word_vectorizer',
        TfidfVectorizer(**config.tfidf_word_vectorizer),
        input_steps=[cleaner],
        adapter={'text': [('text_cleaner_train', 'X')]})

    # ---- bad-word branch (reuses the word-level TF-IDF configuration) ---
    bad_words = make_step(
        'bad_word_filter', WordListFilter(**config.bad_word_filter),
        input_steps=[cleaner],
        adapter={'X': [('text_cleaner_train', 'X')]})
    bad_word_tfidf = make_step(
        'bad_word_tfidf_word_vectorizer',
        TfidfVectorizer(**config.tfidf_word_vectorizer),
        input_steps=[bad_words],
        adapter={'text': [('bad_word_filter', 'X')]})

    # ---- count features (taken from the split, not the cleaned text) ----
    counter = make_step(
        'text_counter', TextCounter(),
        input_steps=[xy_train],
        adapter={'X': [('xy_train', 'X')]})
    count_normalizer = make_step(
        'normalizer', Normalizer(),
        input_steps=[counter],
        adapter={'X': [('text_counter', 'X')]})

    # ---- GloVe embeddings matrix from the word tokenizer ----------------
    glove_matrix = make_step(
        'glove_embeddings',
        GloveEmbeddingsMatrix(**config.glove_embeddings),
        input_steps=[word_tokens],
        adapter={'tokenizer': [('word_tokenizer', 'tokenizer')]})

    # ---- logistic regressions -------------------------------------------
    logreg_count = make_logreg(
        'logreg_count',
        input_steps=[xy_train, count_normalizer],
        adapter={
            'X': [('normalizer', 'X')],
            'y': [('xy_train', 'y')],
        })
    logreg_bad_word = make_logreg(
        'logreg_bad_word',
        input_steps=[xy_train, bad_word_tfidf],
        adapter={
            'X': [('bad_word_tfidf_word_vectorizer', 'features')],
            'y': [('xy_train', 'y')],
        })
    logreg_bad_word_count = make_logreg(
        'logreg_bad_word_count',
        input_steps=[xy_train, count_normalizer, bad_word_tfidf],
        adapter={
            'X': ([('normalizer', 'X'),
                   ('bad_word_tfidf_word_vectorizer', 'features')],
                  sparse_hstack_inputs),
            'y': [('xy_train', 'y')],
        })
    logreg_tfidf = make_logreg(
        'logreg_tfidf',
        input_steps=[xy_train, char_tfidf, word_tfidf],
        adapter={
            'X': ([('tfidf_char_vectorizer', 'features'),
                   ('tfidf_word_vectorizer', 'features')],
                  sparse_hstack_inputs),
            'y': [('xy_train', 'y')],
        })

    # ---- neural networks on tokenized text ------------------------------
    char_vdcnn = make_model(
        'char_vdcnn', CharVDCNN(**config.char_vdcnn_network),
        input_steps=[char_tokens, xy_train],
        adapter={
            'X': [('char_tokenizer', 'X')],
            'y': [('xy_train', 'y')],
        })
    word_lstm = make_model(
        'word_lstm', WordLSTM(**config.word_lstm_network),
        input_steps=[word_tokens, xy_train],
        adapter={
            'X': [('word_tokenizer', 'X')],
            'y': [('xy_train', 'y')],
        })

    def make_glove_model(name, transformer):
        # The three GloVe networks are wired identically; fresh list/dict
        # literals are built on every call so no step shares a container.
        return make_model(
            name, transformer,
            input_steps=[word_tokens, xy_train, glove_matrix],
            adapter={
                'X': [('word_tokenizer', 'X')],
                'y': [('xy_train', 'y')],
                'embedding_matrix': [
                    ('glove_embeddings', 'embeddings_matrix'),
                ],
            })

    glove_lstm = make_glove_model(
        'glove_lstm', GloveLSTM(**config.glove_lstm_network))
    glove_scnn = make_glove_model(
        'glove_scnn', GloveSCNN(**config.glove_scnn_network))
    glove_dpcnn = make_glove_model(
        'glove_dpcnn', GloveDPCNN(**config.glove_dpcnn_network))

    return [
        logreg_count, logreg_bad_word, logreg_bad_word_count,
        logreg_tfidf, char_vdcnn, word_lstm, glove_lstm,
        glove_scnn, glove_dpcnn,
    ]
Exemplo n.º 9
0
def ensemble_extraction(config):
    """Assemble the six model steps whose outputs feed an ensemble.

    The ``'meta'`` entry of ``'input_ensemble'`` goes through a ``FillNA``
    step and then an ``'xy_split'`` step.  Char/word tokenizers and
    char/word TF-IDF vectorizers read ``('xy_split', 'X')`` through
    ``fetch_x_train``; a GloVe embeddings matrix is derived from the word
    tokenizer.  On top of those the function creates one logistic
    regression and five neural-network steps, all with
    ``cache_output=True``.

    NOTE(review): adapter entries look like ``(source name, output key)``
    pairs resolved by ``Step`` -- confirm against the ``Step``
    implementation.

    Args:
        config: configuration object.  Transformer keyword arguments are
            read from its attributes (``fill_na``, ``xy_split``,
            ``char_tokenizer``, ``word_tokenizer``,
            ``tfidf_char_vectorizer``, ``tfidf_word_vectorizer``,
            ``glove_embeddings``, ``logistic_regression_multilabel`` and
            the ``*_network`` entries); ``config.env.cache_dirpath`` is
            given to every step.

    Returns:
        list: ``[log_reg_multi, char_vdcnn, word_lstm, glove_lstm,
        glove_scnn, glove_dpcnn]`` as Step objects, in that order.
    """
    cache_dir = config.env.cache_dirpath

    def make_step(name, transformer, **wiring):
        # Every step of this graph shares the same cache directory.
        return Step(name=name, transformer=transformer,
                    cache_dirpath=cache_dir, **wiring)

    def make_model(name, transformer, input_steps, adapter):
        # Model steps are additionally created with cache_output=True.
        return make_step(name, transformer, input_steps=input_steps,
                         adapter=adapter, cache_output=True)

    # ---- missing-value filling and data split ---------------------------
    fill_na = make_step(
        'fill_na_x', FillNA(**config.fill_na),
        input_data=['input_ensemble'],
        adapter={'X': [('input_ensemble', 'meta')]})
    xy_split = make_step(
        'xy_split', XYSplit(**config.xy_split),
        input_data=['input_ensemble'],
        input_steps=[fill_na],
        adapter={
            'meta': [('fill_na_x', 'X')],
            'train_mode': [('input_ensemble', 'train_mode')],
        })

    # ---- tokenizers ------------------------------------------------------
    char_tokens = make_step(
        'char_tokenizer', Tokenizer(**config.char_tokenizer),
        input_steps=[xy_split],
        adapter={
            'X': ([('xy_split', 'X')], fetch_x_train),
            'train_mode': [('xy_split', 'train_mode')],
        })
    word_tokens = make_step(
        'word_tokenizer', Tokenizer(**config.word_tokenizer),
        input_steps=[xy_split],
        adapter={
            'X': ([('xy_split', 'X')], fetch_x_train),
            'train_mode': [('xy_split', 'train_mode')],
        })

    # ---- TF-IDF vectorizers ----------------------------------------------
    char_tfidf = make_step(
        'tfidf_char_vectorizer',
        TfidfVectorizer(**config.tfidf_char_vectorizer),
        input_steps=[xy_split],
        adapter={'text': ([('xy_split', 'X')], fetch_x_train)})
    word_tfidf = make_step(
        'tfidf_word_vectorizer',
        TfidfVectorizer(**config.tfidf_word_vectorizer),
        input_steps=[xy_split],
        adapter={'text': ([('xy_split', 'X')], fetch_x_train)})

    # ---- GloVe embeddings matrix from the word tokenizer -----------------
    glove_matrix = make_step(
        'glove_embeddings',
        GloveEmbeddingsMatrix(**config.glove_embeddings),
        input_steps=[word_tokens],
        adapter={'tokenizer': [('word_tokenizer', 'tokenizer')]})

    # ---- logistic regression on stacked TF-IDF features ------------------
    log_reg_multi = make_model(
        'log_reg_multi',
        LogisticRegressionMultilabel(
            **config.logistic_regression_multilabel),
        input_steps=[xy_split, char_tfidf, word_tfidf],
        adapter={
            'X': ([('tfidf_char_vectorizer', 'features'),
                   ('tfidf_word_vectorizer', 'features')],
                  sparse_hstack_inputs),
            'y': [('xy_split', 'y')],
        })

    # ---- neural networks on tokenized text -------------------------------
    char_vdcnn = make_model(
        'char_vdcnn', CharVDCNN(**config.char_vdcnn_network),
        input_steps=[char_tokens, xy_split],
        adapter={
            'X': [('char_tokenizer', 'X')],
            'y': [('xy_split', 'y')],
        })
    word_lstm = make_model(
        'word_lstm', WordLSTM(**config.word_lstm_network),
        input_steps=[word_tokens, xy_split],
        adapter={
            'X': [('word_tokenizer', 'X')],
            'y': [('xy_split', 'y')],
        })

    def make_glove_model(name, transformer):
        # The three GloVe networks are wired identically; fresh list/dict
        # literals are built on every call so no step shares a container.
        return make_model(
            name, transformer,
            input_steps=[word_tokens, xy_split, glove_matrix],
            adapter={
                'X': [('word_tokenizer', 'X')],
                'y': [('xy_split', 'y')],
                'embedding_matrix': [
                    ('glove_embeddings', 'embeddings_matrix'),
                ],
            })

    glove_lstm = make_glove_model(
        'glove_lstm', GloveLSTM(**config.glove_lstm_network))
    glove_scnn = make_glove_model(
        'glove_scnn', GloveSCNN(**config.glove_scnn_network))
    glove_dpcnn = make_glove_model(
        'glove_dpcnn', GloveDPCNN(**config.glove_dpcnn_network))

    return [
        log_reg_multi, char_vdcnn, word_lstm, glove_lstm, glove_scnn,
        glove_dpcnn,
    ]