def textcnn_predict(ip, up_url, down_url, access_url, access_key,
                    _init_companyId, data_id, textcnn_mlb_id,
                    textcnn_tokenizer_id, textcnn_model_id,
                    max_sequence_length, batch_size):
    # Fetch the prediction data and segment the text.
    dl = Data_Load(ip, up_url, down_url, access_url, access_key,
                   _init_companyId)
    data = dl.get_train_data(data_id)
    data = predict_data_cut(data)
    # Download the persisted artifacts: label binarizer, tokenizer, and model.
    dl.get_api_ticket()
    textcnn_mlb = pickle.loads(dl.download_model_by_id(textcnn_mlb_id))
    print('Number of label classes:', len(textcnn_mlb.classes_))
    dl.get_api_ticket()
    textcnn_tokenizer = pickle.loads(
        dl.download_model_by_id(textcnn_tokenizer_id))
    dl.get_api_ticket()
    textcnn_model = pickle.loads(dl.download_model_by_id(textcnn_model_id))
    # Convert the text to padded index sequences, matching the training setup.
    word_seq = textcnn_tokenizer.texts_to_sequences(data['context'])
    x_pred = pad_sequences(word_seq, maxlen=int(max_sequence_length))
    print("Shape of word data tensor:", x_pred.shape)
    # Threshold the sigmoid outputs at 0.5 to get multi-label assignments.
    y_pred = textcnn_model.predict(x_pred, batch_size=int(batch_size))
    y_pred = (y_pred > 0.5)
    label = [','.join(i) for i in textcnn_mlb.inverse_transform(y_pred)]
    # Serialize each row as "id#label" and join the records with "##".
    ans = pd.DataFrame({'id': data['id'], 'label': label})
    ans['result'] = ans['id'].map(str) + "#" + ans['label'].map(str)
    # Equivalent to the original prepend loop: records come out in reverse order.
    return "##".join(reversed(ans['result'].tolist()))
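# A minimal sketch of parsing the flat string returned above back into a
# DataFrame. It assumes the "id#label##id#label" layout produced by
# textcnn_predict (and by lp_predict below); parse_predict_result is a
# hypothetical helper, not part of this module's API.
def parse_predict_result(result):
    # Each "##"-separated record is "id#label"; a record's labels are
    # comma-joined, so only the first '#' separates id from label.
    records = [r.split('#', 1) for r in result.split('##') if r]
    return pd.DataFrame(records, columns=['id', 'label'])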
def textcnn_train(ip, up_url, down_url, access_url, access_key,
                  _init_companyId, train_data_id, w2v_size, w2v_window,
                  w2v_min_count, w2v_negative, batch_size, epochs,
                  max_sequence_length, num_filter, drop_rate):
    EMBEDDING_DIM = int(w2v_size)
    # Fetch and segment the training data.
    dl = Data_Load(ip, up_url, down_url, access_url, access_key,
                   _init_companyId)
    train_data = dl.get_train_data(train_data_id)
    train_data = train_data_cut(train_data)
    # Fit the multi-label binarizer and persist it.
    textcnn_mlb = get_label_model(train_data['label'])
    print('Number of data rows:', len(train_data['label']))
    mlb_content = pickle.dumps(textcnn_mlb)
    dl.get_api_ticket()
    textcnn_mlb_id = dl.upload_file_by_data(mlb_content)
    print('textcnn_mlb_id:', textcnn_mlb_id)
    y_train = textcnn_mlb.transform(train_data['label'])
    # Train word2vec embeddings and build the tokenizer over its vocabulary.
    w2v_model = get_word2vec_model(train_data['context'], int(w2v_size),
                                   int(w2v_window), int(w2v_min_count),
                                   int(w2v_negative))
    MAX_NB_WORDS = len(list(w2v_model.wv.vocab))
    tokenizer = get_word_index(MAX_NB_WORDS, train_data['context'])
    tokenizer_content = pickle.dumps(tokenizer)
    dl.get_api_ticket()
    textcnn_tokenizer_id = dl.upload_file_by_data(tokenizer_content)
    print('textcnn_tokenizer_id:', textcnn_tokenizer_id)
    # Build the embedding matrix that seeds the TextCNN embedding layer.
    train_word_seq = tokenizer.texts_to_sequences(train_data['context'])
    word_index = tokenizer.word_index
    embeddings_index = get_word_vector(w2v_model)
    nb_words = min(MAX_NB_WORDS, len(word_index))
    print('nb_words:', nb_words)
    word_embedding_matrix = get_word_embedding_matrix(
        nb_words, EMBEDDING_DIM, word_index, MAX_NB_WORDS, embeddings_index)
    x_train = pad_sequences(train_word_seq, maxlen=int(max_sequence_length))
    print("Shape of word train data tensor:", x_train.shape)
    # Train the TextCNN; the Jaccard callback checkpoints the best model.
    model = get_textcnn_model(int(max_sequence_length), nb_words,
                              EMBEDDING_DIM, word_embedding_matrix,
                              float(drop_rate), int(num_filter), textcnn_mlb)
    Jaccard = JaccardEvaluation(validation_data=(x_train, y_train), interval=1)
    model.fit(x_train, y_train, batch_size=int(batch_size), epochs=int(epochs),
              validation_data=(x_train, y_train), callbacks=[Jaccard],
              verbose=2)
    # Make Keras models picklable, reload the best checkpoint, and persist it.
    make_keras_picklabel()
    textcnn_model = load_model('./model/best_textcnn_model.h5')
    textcnn_model_content = pickle.dumps(textcnn_model)
    dl.get_api_ticket()
    textcnn_model_id = dl.upload_file_by_data(textcnn_model_content)
    print('best_textcnn_model_id:', textcnn_model_id)
    # Return the artifact ids in a "header##values" flat-string layout.
    result = ("mlb_id#tokenizer_id#model_id##" + str(textcnn_mlb_id) + "#" +
              str(textcnn_tokenizer_id) + "#" + str(textcnn_model_id))
    return result
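# A minimal sketch of wiring the trainer above to textcnn_predict. It assumes
# the "mlb_id#tokenizer_id#model_id##..." layout returned by textcnn_train and
# that the uploaded ids themselves contain no '#'. conn_args is a hypothetical
# 6-tuple (ip, up_url, down_url, access_url, access_key, _init_companyId), and
# the hyper-parameter values are illustrative defaults only.
def textcnn_train_then_predict(conn_args, train_data_id, predict_data_id):
    train_result = textcnn_train(*conn_args, train_data_id=train_data_id,
                                 w2v_size=100, w2v_window=5, w2v_min_count=1,
                                 w2v_negative=5, batch_size=64, epochs=10,
                                 max_sequence_length=100, num_filter=128,
                                 drop_rate=0.5)
    # Drop the "mlb_id#tokenizer_id#model_id" header, then split out the ids.
    mlb_id, tokenizer_id, model_id = train_result.split('##', 1)[1].split('#')
    return textcnn_predict(*conn_args, data_id=predict_data_id,
                           textcnn_mlb_id=mlb_id,
                           textcnn_tokenizer_id=tokenizer_id,
                           textcnn_model_id=model_id,
                           max_sequence_length=100, batch_size=64)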
def lp_train(ip, up_url, down_url, access_url, access_key, _init_companyId,
             train_data_id, ngram_num, feature_num, samples_leaf,
             samples_split):
    # Fetch and segment the training data.
    dl = Data_Load(ip, up_url, down_url, access_url, access_key,
                   _init_companyId)
    train_data = dl.get_train_data(train_data_id)
    train_data = train_data_cut(train_data)
    # Fit the TF-IDF vectorizer and persist it.
    lp_tfidf = get_tfidf_model(train_data['context'], ngram_num, feature_num)
    lp_tfidf_content = pickle.dumps(lp_tfidf)
    dl.get_api_ticket()
    lp_tfidf_id = dl.upload_file_by_data(lp_tfidf_content)
    print('lp_tfidf_id:', lp_tfidf_id)
    # Fit the multi-label binarizer and persist it.
    lp_mlb = get_label_model(train_data['label'])
    print('Number of data rows:', len(train_data['label']))
    lp_mlb_content = pickle.dumps(lp_mlb)
    dl.get_api_ticket()
    lp_mlb_id = dl.upload_file_by_data(lp_mlb_content)
    print('lp_mlb_id:', lp_mlb_id)
    # Train the Label Powerset classifier on the TF-IDF features and persist it.
    feat = lp_tfidf.transform(train_data['context'])
    label = lp_mlb.transform(train_data['label'])
    lp_classifier = LabelPowerset_method(feat, label, int(samples_leaf),
                                         int(samples_split))
    lp_classifier_content = pickle.dumps(lp_classifier)
    dl.get_api_ticket()
    lp_model_id = dl.upload_file_by_data(lp_classifier_content)
    print('lp_model_id:', lp_model_id)
    # Return the artifact ids in the same "header##values" flat-string layout.
    result = ("tfidf_id#mlb_id#model_id##" + str(lp_tfidf_id) + "#" +
              str(lp_mlb_id) + "#" + str(lp_model_id))
    return result
def lp_predict(ip, up_url, down_url, access_url, access_key, _init_companyId,
               data_id, lp_tfidf_id, lp_mlb_id, lp_model_id):
    # Fetch the prediction data and segment the text.
    dl = Data_Load(ip, up_url, down_url, access_url, access_key,
                   _init_companyId)
    data = dl.get_train_data(data_id)
    data = predict_data_cut(data)
    # Download the persisted artifacts: vectorizer, label binarizer, and model.
    dl.get_api_ticket()
    lp_tfidf = pickle.loads(dl.download_model_by_id(lp_tfidf_id))
    dl.get_api_ticket()
    lp_mlb = pickle.loads(dl.download_model_by_id(lp_mlb_id))
    dl.get_api_ticket()
    lp_model = pickle.loads(dl.download_model_by_id(lp_model_id))
    # Vectorize, predict, and threshold at 0.5 for multi-label assignments.
    data_vec = lp_tfidf.transform(data['context'])
    y_pred = lp_model.predict(data_vec)
    y_pred = (y_pred > 0.5)
    label = [','.join(i) for i in lp_mlb.inverse_transform(y_pred)]
    # Serialize each row as "id#label" and join the records with "##".
    ans = pd.DataFrame({'id': data['id'], 'label': label})
    ans['result'] = ans['id'].map(str) + "#" + ans['label'].map(str)
    # Equivalent to the original prepend loop: records come out in reverse order.
    return "##".join(reversed(ans['result'].tolist()))
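# The same wiring sketched for the Label Powerset pipeline, under the same
# assumptions: lp_train's "tfidf_id#mlb_id#model_id##..." layout, ids free of
# '#', a hypothetical conn_args 6-tuple, and illustrative parameter values.
def lp_train_then_predict(conn_args, train_data_id, predict_data_id):
    train_result = lp_train(*conn_args, train_data_id=train_data_id,
                            ngram_num=2, feature_num=5000, samples_leaf=1,
                            samples_split=2)
    tfidf_id, mlb_id, model_id = train_result.split('##', 1)[1].split('#')
    return lp_predict(*conn_args, data_id=predict_data_id,
                      lp_tfidf_id=tfidf_id, lp_mlb_id=mlb_id,
                      lp_model_id=model_id)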
def knn_train(ip, up_url, down_url, access_url, access_key, _init_companyId,
              train_data_id, ngram_num, feature_num, ml_k, ml_s):
    # Fetch and segment the training data.
    dl = Data_Load(ip, up_url, down_url, access_url, access_key,
                   _init_companyId)
    train_data = dl.get_train_data(train_data_id)
    train_data = train_data_cut(train_data)
    # Fit the TF-IDF vectorizer and persist it.
    knn_tfidf = get_tfidf_model(train_data['context'], ngram_num, feature_num)
    knn_tfidf_content = pickle.dumps(knn_tfidf)
    dl.get_api_ticket()
    knn_tfidf_id = dl.upload_file_by_data(knn_tfidf_content)
    print('knn_tfidf_id:', knn_tfidf_id)
    # Fit the multi-label binarizer and persist it.
    knn_mlb = get_label_model(train_data['label'])
    print('Number of data rows:', len(train_data['label']))
    knn_mlb_content = pickle.dumps(knn_mlb)
    dl.get_api_ticket()
    knn_mlb_id = dl.upload_file_by_data(knn_mlb_content)
    print('knn_mlb_id:', knn_mlb_id)
    # Train the ML-kNN classifier on the TF-IDF features and persist it.
    feat = knn_tfidf.transform(train_data['context'])
    label = knn_mlb.transform(train_data['label'])
    # Cast ml_k/ml_s here for consistency with the other trainers, which
    # receive their numeric parameters as strings.
    knn_classifier = MLKNN_method(feat, label, int(ml_k), float(ml_s))
    knn_classifier_content = pickle.dumps(knn_classifier)
    dl.get_api_ticket()
    knn_model_id = dl.upload_file_by_data(knn_classifier_content)
    print('knn_model_id:', knn_model_id)
    # Return the artifact ids in the same "header##values" flat-string layout.
    result = ("tfidf_id#mlb_id#classifier_id##" + str(knn_tfidf_id) + "#" +
              str(knn_mlb_id) + "#" + str(knn_model_id))
    return result
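# knn_train returns the same flat layout (a "tfidf_id#mlb_id#classifier_id"
# header, "##", then the three ids), so its output unpacks the same way. A
# minimal sketch with illustrative parameter values; a matching knn_predict is
# assumed to live elsewhere in the codebase and is not called here.
def knn_train_ids(conn_args, train_data_id):
    train_result = knn_train(*conn_args, train_data_id=train_data_id,
                             ngram_num=2, feature_num=5000, ml_k=10, ml_s=1.0)
    tfidf_id, mlb_id, classifier_id = train_result.split('##', 1)[1].split('#')
    return tfidf_id, mlb_id, classifier_id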