# imports used throughout this section; Data_Load, the *_cut helpers and
# the get_* model builders are assumed to be defined elsewhere in the project
import pickle

import pandas as pd
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences


def textcnn_predict(ip, up_url, down_url, access_url, access_key,
                    _init_companyId, data_id, textcnn_mlb_id,
                    textcnn_tokenizer_id, textcnn_model_id,
                    max_sequence_length, batch_size):
    dl = Data_Load(ip, up_url, down_url, access_url, access_key,
                   _init_companyId)
    data = dl.get_train_data(data_id)
    data = predict_data_cut(data)

    dl.get_api_ticket()
    textcnn_mlb = pickle.loads(dl.download_model_by_id(textcnn_mlb_id))
    print('number of label classes:', len(textcnn_mlb.classes_))

    dl.get_api_ticket()
    textcnn_tokenizer = pickle.loads(
        dl.download_model_by_id(textcnn_tokenizer_id))

    dl.get_api_ticket()
    textcnn_model = pickle.loads(dl.download_model_by_id(textcnn_model_id))

    # vectorise the prediction texts with the saved tokenizer
    word_seq = textcnn_tokenizer.texts_to_sequences(data['context'])

    x_pred = pad_sequences(word_seq, maxlen=int(max_sequence_length))
    print("Shape of word data tensor:", x_pred.shape)

    y_pred = textcnn_model.predict(x_pred, batch_size=int(batch_size))
    # binarise the sigmoid outputs with a 0.5 threshold
    y_pred = (y_pred > 0.5)
    # print(y_pred)

    label = [','.join(i) for i in textcnn_mlb.inverse_transform(y_pred)]
    # print(label)

    ans = {'id': data['id'], 'label': label}
    ans = pd.DataFrame(ans)
    # print(ans)

    ans['result'] = ans['id'].map(str) + "#" + ans['label'].map(str)
    result_list = ans['result'].tolist()

    # join per-document results with "##", last row first
    result = "##".join(reversed(result_list))

    return result
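
# The *_cut helpers above are defined elsewhere in the project. Below is a
# minimal sketch of what predict_data_cut is assumed to do (hypothetical,
# named with a _sketch suffix to avoid shadowing the real helper): segment
# each Chinese document with jieba into space-separated tokens so the Keras
# Tokenizer / TfidfVectorizer can split on whitespace.
import jieba


def predict_data_cut_sketch(data):
    # data is assumed to be a DataFrame with 'id' and 'context' columns
    data = data.copy()
    data['context'] = data['context'].map(
        lambda text: ' '.join(jieba.cut(str(text))))
    return data
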
def textcnn_train(ip, up_url, down_url, access_url, access_key,
                  _init_companyId, train_data_id, w2v_size, w2v_window,
                  w2v_min_count, w2v_negative, batch_size, epochs,
                  max_sequence_length, num_filter, drop_rate):
    EMBEDDING_DIM = int(w2v_size)

    dl = Data_Load(ip, up_url, down_url, access_url, access_key,
                   _init_companyId)
    train_data = dl.get_train_data(train_data_id)
    train_data = train_data_cut(train_data)

    textcnn_mlb = get_label_model(train_data['label'])
    print('number of data rows:', len(train_data['label']))

    mlb_content = pickle.dumps(textcnn_mlb)
    dl.get_api_ticket()
    textcnn_mlb_id = dl.upload_file_by_data(mlb_content)
    print('textcnn_mlb_id:', textcnn_mlb_id)

    y_train = textcnn_mlb.transform(train_data['label'])

    w2v_model = get_word2vec_model(train_data['context'], int(w2v_size),
                                   int(w2v_window), int(w2v_min_count),
                                   int(w2v_negative))
    MAX_NB_WORDS = len(w2v_model.wv.vocab)  # vocabulary size (gensim < 4.0 API)

    tokenizer = get_word_index(MAX_NB_WORDS, train_data['context'])
    tokenizer_content = pickle.dumps(tokenizer)
    dl.get_api_ticket()
    textcnn_tokenizer_id = dl.upload_file_by_data(tokenizer_content)
    print('textcnn_tokenizer_id', textcnn_tokenizer_id)

    train_word_seq = tokenizer.texts_to_sequences(train_data['context'])
    word_index = tokenizer.word_index

    embeddings_index = get_word_vector(w2v_model)
    nb_words = min(MAX_NB_WORDS, len(word_index))
    print('nb_words:', nb_words)

    word_embedding_matrix = get_word_embedding_matrix(nb_words, EMBEDDING_DIM,
                                                      word_index, MAX_NB_WORDS,
                                                      embeddings_index)
    x_train = pad_sequences(train_word_seq, maxlen=int(max_sequence_length))
    print("Shape of word train data tensor:", x_train.shape)

    model = get_textcnn_model(int(max_sequence_length), nb_words,
                              EMBEDDING_DIM, word_embedding_matrix,
                              float(drop_rate), int(num_filter), textcnn_mlb)

    # note: the Jaccard callback and validation_data both use the training
    # set itself; there is no held-out validation split here
    Jaccard = JaccardEvaluation(validation_data=(x_train, y_train), interval=1)

    model.fit(x_train,
              y_train,
              batch_size=int(batch_size),
              epochs=int(epochs),
              validation_data=(x_train, y_train),
              callbacks=[Jaccard],
              verbose=2)

    # make_keras_picklabel is assumed to monkey-patch Keras models so they
    # can be pickled; the Jaccard callback is assumed to have written the
    # best checkpoint to ./model/best_textcnn_model.h5
    make_keras_picklabel()
    textcnn_model = load_model('./model/best_textcnn_model.h5')

    textcnn_model_content = pickle.dumps(textcnn_model)
    dl.get_api_ticket()
    textcnn_model_id = dl.upload_file_by_data(textcnn_model_content)
    print('best_textcnn_model_id', textcnn_model_id)

    result = "mlb_id#tokenizer_id#model_id##" + str(
        textcnn_mlb_id) + "#" + str(textcnn_tokenizer_id) + "#" + str(
            textcnn_model_id)

    return result
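
# get_textcnn_model is defined elsewhere; below is a minimal sketch of a
# classic TextCNN consistent with the arguments passed above (hypothetical,
# _sketch suffix): parallel Conv1D branches with different kernel sizes over
# a frozen word2vec-initialised embedding, and a sigmoid head with one unit
# per label in the MultiLabelBinarizer.
from keras.layers import (Concatenate, Conv1D, Dense, Dropout, Embedding,
                          GlobalMaxPooling1D, Input)
from keras.models import Model


def get_textcnn_model_sketch(max_sequence_length, nb_words, embedding_dim,
                             embedding_matrix, drop_rate, num_filter, mlb):
    inputs = Input(shape=(max_sequence_length,), dtype='int32')
    # index 0 is reserved for padding, hence nb_words + 1 rows; this assumes
    # get_word_embedding_matrix builds a (nb_words + 1, embedding_dim) matrix
    x = Embedding(nb_words + 1,
                  embedding_dim,
                  weights=[embedding_matrix],
                  input_length=max_sequence_length,
                  trainable=False)(inputs)
    pooled = []
    for kernel_size in (2, 3, 4):
        conv = Conv1D(num_filter, kernel_size, activation='relu')(x)
        pooled.append(GlobalMaxPooling1D()(conv))
    merged = Concatenate()(pooled)
    merged = Dropout(drop_rate)(merged)
    outputs = Dense(len(mlb.classes_), activation='sigmoid')(merged)
    model = Model(inputs, outputs)
    # per-label binary cross-entropy, the usual loss for multi-label sigmoids
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
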
def lp_train(ip, up_url, down_url, access_url, access_key, _init_companyId,
             train_data_id, ngram_num, feature_num, samples_leaf,
             samples_split):
    dl = Data_Load(ip, up_url, down_url, access_url, access_key,
                   _init_companyId)
    train_data = dl.get_train_data(train_data_id)
    train_data = train_data_cut(train_data)

    lp_tfidf = get_tfidf_model(train_data['context'], int(ngram_num),
                               int(feature_num))
    lp_tfidf_content = pickle.dumps(lp_tfidf)
    dl.get_api_ticket()
    lp_tfidf_id = dl.upload_file_by_data(lp_tfidf_content)
    print('lp_tfidf_id:', lp_tfidf_id)

    lp_mlb = get_label_model(train_data['label'])
    print('number of data rows:', len(train_data['label']))

    lp_mlb_content = pickle.dumps(lp_mlb)
    dl.get_api_ticket()
    lp_mlb_id = dl.upload_file_by_data(lp_mlb_content)
    print('lp_mlb_id:', lp_mlb_id)

    feat = lp_tfidf.transform(train_data['context'])
    label = lp_mlb.transform(train_data['label'])

    lp_classifier = LabelPowerset_method(feat, label, int(samples_leaf),
                                         int(samples_split))
    lp_classifier_content = pickle.dumps(lp_classifier)
    dl.get_api_ticket()
    lp_model_id = dl.upload_file_by_data(lp_classifier_content)
    print('lp_model_id:', lp_model_id)

    result = "tfidf_id#mlb_id#model_id##" + str(lp_tfidf_id) + "#" + str(
        lp_mlb_id) + "#" + str(lp_model_id)

    return result
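
# LabelPowerset_method is defined elsewhere; below is a minimal sketch
# consistent with the call above (hypothetical _sketch name), assuming
# scikit-multilearn with a decision-tree base estimator, since samples_leaf
# and samples_split map naturally onto min_samples_leaf / min_samples_split.
from skmultilearn.problem_transform import LabelPowerset
from sklearn.tree import DecisionTreeClassifier


def LabelPowerset_method_sketch(feat, label, samples_leaf, samples_split):
    # label powerset turns every distinct label combination seen in the
    # training data into one class of a single multi-class problem
    classifier = LabelPowerset(
        DecisionTreeClassifier(min_samples_leaf=samples_leaf,
                               min_samples_split=samples_split))
    classifier.fit(feat, label)
    return classifier
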
def lp_predict(ip, up_url, down_url, access_url, access_key, _init_companyId,
               data_id, lp_tfidf_id, lp_mlb_id, lp_model_id):
    dl = Data_Load(ip, up_url, down_url, access_url, access_key,
                   _init_companyId)
    data = dl.get_train_data(data_id)
    data = predict_data_cut(data)

    dl.get_api_ticket()
    lp_tfidf = pickle.loads(dl.download_model_by_id(lp_tfidf_id))

    dl.get_api_ticket()
    lp_mlb = pickle.loads(dl.download_model_by_id(lp_mlb_id))

    dl.get_api_ticket()
    lp_model = pickle.loads(dl.download_model_by_id(lp_model_id))

    data_vec = lp_tfidf.transform(data['context'])

    y_pred = lp_model.predict(data_vec)
    y_pred = (y_pred > 0.5)
    # print(y_pred)

    label = [','.join(i) for i in lp_mlb.inverse_transform(y_pred)]
    # print(label)

    ans = {'id': data['id'], 'label': label}
    ans = pd.DataFrame(ans)
    # print(ans)

    ans['result'] = ans['id'].map(str) + "#" + ans['label'].map(str)
    result_list = ans['result'].tolist()

    # join per-document results with "##", last row first
    result = "##".join(reversed(result_list))

    return result
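
# Both predict functions return one flat string of the form
# "id#label##id#label##...". A small usage sketch (hypothetical helper)
# for unpacking that string back into (id, labels) pairs:
def parse_predict_result(result):
    pairs = []
    for chunk in result.split("##"):
        doc_id, labels = chunk.split("#", 1)
        # labels were joined with ","; an empty string means no label
        pairs.append((doc_id, labels.split(",") if labels else []))
    return pairs
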
def knn_train(ip, up_url, down_url, access_url, access_key, _init_companyId,
              train_data_id, ngram_num, feature_num, ml_k, ml_s):
    dl = Data_Load(ip, up_url, down_url, access_url, access_key,
                   _init_companyId)

    train_data = dl.get_train_data(train_data_id)
    train_data = train_data_cut(train_data)

    knn_tfidf = get_tfidf_model(train_data['context'], int(ngram_num),
                                int(feature_num))
    knn_tfidf_content = pickle.dumps(knn_tfidf)
    dl.get_api_ticket()
    knn_tfidf_id = dl.upload_file_by_data(knn_tfidf_content)
    print('knn_tfidf_id:', knn_tfidf_id)

    knn_mlb = get_label_model(train_data['label'])
    print('number of data rows:', len(train_data['label']))

    knn_mlb_content = pickle.dumps(knn_mlb)
    dl.get_api_ticket()
    knn_mlb_id = dl.upload_file_by_data(knn_mlb_content)
    print('knn_mlb_id:', knn_mlb_id)

    feat = knn_tfidf.transform(train_data['context'])
    label = knn_mlb.transform(train_data['label'])

    knn_classifier = MLKNN_method(feat, label, int(ml_k), float(ml_s))
    knn_classifier_content = pickle.dumps(knn_classifier)
    dl.get_api_ticket()
    knn_model_id = dl.upload_file_by_data(knn_classifier_content)
    print('knn_model_id:', knn_model_id)

    result = "tfidf_id#mlb_id#classifier_id##" + str(knn_tfidf_id) + "#" + str(
        knn_mlb_id) + "#" + str(knn_model_id)

    return result
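
# MLKNN_method is defined elsewhere; below is a minimal sketch consistent
# with the call above (hypothetical _sketch name), assuming
# scikit-multilearn's ML-kNN adaptation, where k is the neighbourhood size
# and s the Bayesian smoothing parameter.
from skmultilearn.adapt import MLkNN


def MLKNN_method_sketch(feat, label, ml_k, ml_s):
    classifier = MLkNN(k=int(ml_k), s=float(ml_s))
    classifier.fit(feat, label)
    return classifier
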