def am_glove_fasttext(data):
    """Run the GloVe + fastText-style architecture as a grid-search Job.

    Publishes the pretrained embedding matrix at module scope (the Keras
    build_fn reads it from there), assembles the text-to-sequence pipeline,
    and hands everything to Job.run. Returns None.
    """
    job = Job('am_glove_fasttext', cv=cv_n_fold_dl, n_threads=1,
              model_package='keras')

    # Hyper-parameters for this experiment.
    max_features = 40000
    max_seq_len = 700
    embedding_dims = 300
    batch_size = 256
    nb_epoch = 200

    # build_fn cannot take the matrix as an argument, so expose it globally.
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(
        data.df[data.fs_ind],
        max_features=max_features,
        embedding_dims=embedding_dims,
    )

    classifier = KerasClassifier(
        build_fn=glove_fasttext,
        batch_size=batch_size,
        validation_split=0.1,
        nb_epoch=nb_epoch,
        verbose=1,
    )

    pipeline = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
        ('m', classifier),
    ])

    # Single-point grid: every knob is fixed to the values above.
    parameters = {
        'txt_to_seq__n_max_features': [max_features],
        'padd_seq__max_seq_len': [max_seq_len],
        'm__max_features': [max_features],
        'm__max_seq_len': [max_seq_len],
        'm__embedding_dims': [embedding_dims],
    }

    job.run(pipeline, parameters, data)
    return None
def pre_process_data_for_deep_learning(data, fs_text='text', verbose=0,
                                       max_features=20000, max_seq_len=300):
    """Fit the text-to-padded-sequence pipeline and transform the text column.

    Parameters
    ----------
    data : object exposing a pandas DataFrame at ``data.df``.
    fs_text : str
        Name of the text column in ``data.df`` to transform.
    verbose : int
        Unused; kept for interface compatibility.
    max_features : int
        Vocabulary cap handed to ``TextToNumericSequence``.
    max_seq_len : int
        Padding length handed to ``PadNumericSequence``.

    Returns
    -------
    tuple
        ``(text_num, m_pre_proc)`` — the transformed sequences and the
        fitted preprocessing pipeline (for reuse on new text).

    NOTE(review): the original body referenced ``max_features`` and
    ``max_seq_len`` without defining them anywhere in scope, which raises
    NameError unless matching globals happen to exist. They are now explicit
    keyword parameters defaulting to the values the sibling jobs in this file
    use, which is backward-compatible for any caller that relied on globals
    of the same value. Also note: a second function with this exact name is
    defined later in this module and will shadow this one at import time.
    """
    pipeline = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence(max_features)),
        ('padd_seq', PadNumericSequence(max_seq_len)),
    ])
    m_pre_proc = pipeline.fit(data.df[fs_text].values)
    text_num = m_pre_proc.transform(data.df[fs_text].values)
    return text_num, m_pre_proc
def ak_embedding_cnn_lstm(data):
    """Grid-search Job for the CNN+LSTM model on learned embeddings.

    Wraps the ``cnn_lstm`` build_fn in a scikit-learn pipeline behind the
    text-to-sequence preprocessing steps and runs it. Returns None.
    """
    job = Job('ak_embedding_cnn_lstm', cv=cv_n_fold_dl, n_threads=1)

    estimator = KerasClassifier(
        build_fn=cnn_lstm,
        batch_size=32,
        nb_epoch=10,
        validation_split=0.1,
        verbose=1,
    )

    steps = [
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
        ('cnn_lstm', estimator),
    ]

    # Single-point grid: fixed vocabulary, sequence length and embedding size.
    parameters = {
        'txt_to_seq__n_max_features': [20000],
        'padd_seq__max_seq_len': [300],
        'cnn_lstm__embedding_dims': [50],
    }

    job.run(Pipeline(steps=steps), parameters, data)
    return None
def pre_process_data_for_deep_learning(data, fs_text='text', verbose=0):
    """Convert the text column to numeric sequences and attach them to data.

    Fits the text-to-sequence pipeline on ``data.df[fs_text]``, joins the
    transformed output onto ``data.df`` as a ``'text_numeric'`` column, and
    points ``data.fs_ind`` at that column list.

    Parameters
    ----------
    data : object exposing ``df`` (pandas DataFrame) and ``fs_ind`` attributes.
    fs_text : str
        Name of the text column to transform.
    verbose : int
        When truthy, print the transformed sequences.

    Returns
    -------
    The mutated ``data`` object.

    NOTE(review): this redefines (and therefore shadows) the earlier function
    of the same name in this module — consider renaming one of them.
    """
    pipeline = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
    ])
    fitted = pipeline.fit(data.df[fs_text].values)
    text_num = fitted.transform(data.df[fs_text].values)

    # Fix: the original printed unconditionally (debug leftover) even though
    # a `verbose` parameter existed; the print is now gated on it.
    if verbose:
        print(text_num)

    fe_columns = ['text_numeric']
    text_num = pd.DataFrame(text_num)
    text_num.columns = fe_columns
    data.df = data.df.join(text_num)
    data.fs_ind = fe_columns
    return data
def aj_embedding_fasttext(data):
    """Grid-search Job for the fastText model on learned embeddings.

    Builds the preprocessing + classifier pipeline around the ``fasttext``
    build_fn and runs it through Job.run. Returns None.
    """
    job = Job('aj_embedding_fasttext', cv=cv_n_fold_dl, n_threads=1)

    estimator = KerasClassifier(
        build_fn=fasttext,
        batch_size=32,
        nb_epoch=5,
        validation_split=0.1,
        verbose=1,
    )

    # TODO: add ngram features based on the paper
    pipeline = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
        ('ft', estimator),
    ])

    # Single-point grid: fixed vocabulary, sequence length and embedding size.
    parameters = {
        'txt_to_seq__n_max_features': [20000],
        'padd_seq__max_seq_len': [300],
        'ft__max_seq_len': [300],
        'ft__embedding_dims': [100],
    }

    job.run(pipeline, parameters, data)
    return None
def al_glove_cnn_lstm(data):
    """Grid-search Job for the CNN+LSTM model over GloVe embeddings.

    Publishes the pretrained embedding matrix at module scope (the
    ``glove_cnn_lstm`` build_fn reads it from there), then runs the
    preprocessing + classifier pipeline through Job.run. Returns None.
    """
    job = Job('al_glove_cnn_lstm', cv=cv_n_fold_dl, n_threads=1)

    # build_fn cannot take the matrix as an argument, so expose it globally.
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(
        data.df[data.fs_ind],
        max_features=20000,
        embedding_dims=300,
    )

    estimator = KerasClassifier(
        build_fn=glove_cnn_lstm,
        batch_size=64,
        nb_epoch=10,
        validation_split=0.1,
        verbose=1,
    )

    pipeline = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
        ('g_c_l', estimator),
    ])

    # Single-point grid matching the embedding-matrix dimensions above.
    parameters = {
        'txt_to_seq__n_max_features': [20000],
        'padd_seq__max_seq_len': [300],
        'g_c_l__max_seq_len': [300],
        'g_c_l__embedding_dims': [300],
    }

    job.run(pipeline, parameters, data)
    return None
def ao_multi_fltr_glove_cnn(data):
    """Grid-search Job for the multi-filter CNN over GloVe embeddings.

    Publishes the pretrained embedding matrix at module scope (the
    ``multi_fltr_glove_cnn`` build_fn reads it from there), wires the
    preprocessing + classifier pipeline, and runs it. Returns None.
    """
    job = Job('ao_multi_fltr_glove_cnn', cv=cv_n_fold_dl, n_threads=1)

    # Hyper-parameters for this experiment.
    max_features = 20000
    max_seq_len = 300
    embedding_dims = 300
    batch_size = 64
    nb_epoch = 10

    # build_fn cannot take the matrix as an argument, so expose it globally.
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(
        data.df[data.fs_ind],
        max_features=max_features,
        embedding_dims=embedding_dims,
    )

    classifier = KerasClassifier(
        build_fn=multi_fltr_glove_cnn,
        batch_size=batch_size,
        nb_epoch=nb_epoch,
        validation_split=0.1,
        verbose=1,
    )

    pipeline = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
        ('m', classifier),
    ])

    # Single-point grid: every knob is fixed to the values above.
    parameters = {
        'txt_to_seq__n_max_features': [max_features],
        'padd_seq__max_seq_len': [max_seq_len],
        'm__max_features': [max_features],
        'm__max_seq_len': [max_seq_len],
        'm__embedding_dims': [embedding_dims],
    }

    job.run(pipeline, parameters, data)
    return None