def classify(text):
    """Return the topic label predicted for a single piece of text.

    The text is cleaned with ``PreProcessor.clean_text``, turned into an
    integer sequence by ``tokenizer``, padded to ``MAX_DOCUMENT_LENGTH`` and
    passed to ``classifier.predict``. The position of the highest score is
    shifted by one and looked up, as a string key, in
    ``news_classes.class_map``.

    ``PreProcessor``, ``tokenizer``, ``classifier``, ``news_classes`` and
    ``MAX_DOCUMENT_LENGTH`` are not defined here; they are resolved from the
    enclosing module.
    """
    cleaned = PreProcessor.clean_text(text)

    # Wrap the single document in a one-element batch before tokenizing.
    batch = np.array([cleaned])
    sequences = tokenizer.texts_to_sequences(batch)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=MAX_DOCUMENT_LENGTH
    )

    scores = classifier.predict(padded)
    best_index = np.argmax(scores, axis=1)[0]

    # class_map is keyed by the string form of (argmax index + 1).
    return news_classes.class_map[str(best_index + 1)]
import os
import sys

import pandas as pd

import config
from preprocessor import PreProcessor

# Load the main data table; the first CSV column becomes the index.
csvname = 'df_data.csv'
df_data = pd.read_csv(os.path.join('tmp2', csvname), index_col=0)
# NOTE(review): this prints the literal label 'df_data', not the frame
# itself -- confirm that is intended.
print('df_data')

# Load the attribute table and run it through the project preprocessor.
csvname = 'attributes.csv'
df_attr = pd.read_csv(os.path.join('../data', csvname))
preproc = PreProcessor(config)
# presumably clean_text returns a frame that still has a 'value' column,
# since that column is read below -- verify against PreProcessor.
df_attr = preproc.clean_text(df_attr)
print(df_attr)

# Dump every text field into one space-separated file.
with open('data.text', 'wt') as f:
    for col in ('q', 't', 'd'):
        # Each column's entries are joined by spaces and followed by one
        # separating space.
        f.write(' '.join(df_data[col].tolist()) + ' ')
    # NOTE(review): SOURCE indentation was lost; the attribute values are
    # assumed to be written once, after the loop -- confirm.
    f.write(' '.join(df_attr['value'].tolist()))
FILTER_SIZE = 64 POOL_SIZE = 4 if REMOVE_PREVIOUS_MODEL: # Remove old model shutil.rmtree(MODEL_OUTPUT_DIR) mkdir(MODEL_OUTPUT_DIR) # read data df = pd.read_csv(DATA_SET_FILE, header=None) X, y = df[1], df[0] # class range from 0 ~ N_CLASSES-1 y = y.apply(lambda x: x - 1) # preprocess X = X.apply(lambda x: PreProcessor.clean_text(x)) # split train and test data x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) # tokenizer tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE) tokenizer.fit_on_texts(x_train) x_train = tokenizer.texts_to_sequences(x_train) x_test = tokenizer.texts_to_sequences(x_test) x_train = tf.keras.preprocessing.sequence.pad_sequences(