Example #1
import numpy as np
import tensorflow as tf


def classify(text):
    # PreProcessor, tokenizer, classifier, MAX_DOCUMENT_LENGTH and news_classes
    # are defined elsewhere in the surrounding module/project.
    text = PreProcessor.clean_text(text)
    data = np.array([text])
    data = tokenizer.texts_to_sequences(data)
    data = tf.keras.preprocessing.sequence.pad_sequences(data, maxlen=MAX_DOCUMENT_LENGTH)

    # Pick the class with the highest predicted probability.
    y_predicted = np.argmax(classifier.predict(data), axis=1)

    # Predictions are 0-based; class_map is keyed by 1-based string ids.
    topic = news_classes.class_map[str(y_predicted[0] + 1)]
    return topic
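For context, a call to this function might look like the following; the headline string, and the already-fitted tokenizer and trained classifier it relies on, are assumptions rather than part of the listing:

# Hypothetical usage -- assumes the module has already built tokenizer,
# classifier, MAX_DOCUMENT_LENGTH and news_classes.
headline = "Stocks rally as the central bank holds interest rates steady"
print(classify(headline))  # prints a topic label taken from news_classes.class_map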
Example #2
File: gen_text.py Project: amsqr/hd
import os
import sys

import pandas as pd

import config
from preprocessor import PreProcessor


# Load the data frame saved by an earlier step of the pipeline.
csvname = 'df_data.csv'
df_data = pd.read_csv(os.path.join('tmp2', csvname), index_col=0)

print('df_data')

# Load the raw product attributes.
csvname = 'attributes.csv'
df_attr = pd.read_csv(os.path.join('../data', csvname))

# Clean the attribute text with the project's PreProcessor.
preproc = PreProcessor(config)
df_attr = preproc.clean_text(df_attr)

print(df_attr)

# Concatenate the 'q', 't' and 'd' text columns and the attribute values into a
# single whitespace-separated text file.
with open('data.text', 'wt') as f:
    for col in ['q', 't', 'd']:
        f.write(' '.join(df_data[col].tolist()))
        f.write(' ')
    f.write(' '.join(df_attr['value'].tolist()))
Example #3
FILTER_SIZE = 64
POOL_SIZE = 4

if REMOVE_PREVIOUS_MODEL:
    # Remove the old model directory and recreate it empty.
    shutil.rmtree(MODEL_OUTPUT_DIR)
    mkdir(MODEL_OUTPUT_DIR)

# Read the data set: column 0 holds the class label, column 1 the text.
df = pd.read_csv(DATA_SET_FILE, header=None)
X, y = df[1], df[0]

# Shift the labels so the classes range from 0 to N_CLASSES - 1.
y = y.apply(lambda x: x - 1)
# Clean the raw text.
X = X.apply(lambda x: PreProcessor.clean_text(x))

# Split into train and test sets.
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

# Fit the tokenizer on the training text only, then convert both splits to
# sequences of token ids.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# Pad every sequence to a fixed length (maxlen = MAX_DOCUMENT_LENGTH, as in Example #1).
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=MAX_DOCUMENT_LENGTH)
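FILTER_SIZE and POOL_SIZE are declared above but never used in the part of the listing that is shown. For orientation only, here is a minimal sketch of the kind of Conv1D text classifier such constants typically configure; the test-set padding, embedding width, kernel size, N_CLASSES and the training call are assumptions, not the project's actual model:

# Hypothetical sketch only -- not the original project's model definition.
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=MAX_DOCUMENT_LENGTH)

EMBEDDING_DIM = 50       # assumed embedding width
KERNEL_SIZE = 5          # assumed convolution window
N_CLASSES = y.nunique()  # number of distinct labels in the data set

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    tf.keras.layers.Conv1D(FILTER_SIZE, KERNEL_SIZE, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=POOL_SIZE),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(N_CLASSES, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # labels are integer ids 0..N_CLASSES-1
              metrics=['accuracy'])
model.fit(x_train, y_train.values,
          validation_data=(x_test, y_test.values),
          epochs=5, batch_size=32)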