Example #1
import pickle

import pandas as pd
from keras.models import load_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

import DataClean
from DataPrep.Clean_Texts import clean_text


def pred_result():
    vocabulary_size = 400000  # cap on the tokenizer vocabulary
    time_step = 300           # sequence length the CNN expects
    embedding_size = 100      # embedding dimension used at training time

    # Load the texts to classify; text_ml keeps the raw strings for the
    # TF-IDF models, texts is cleaned for the neural model
    predset = pd.read_csv('check.csv')
    texts = predset['text'].astype(str)
    text_ml = texts
    texts = texts.map(clean_text)

    # NOTE: this refits a fresh tokenizer on the prediction texts; for the
    # word indices to match the trained embedding, the tokenizer fitted at
    # training time should ideally be reloaded instead
    tokenizer_pred = Tokenizer(num_words=vocabulary_size)
    tokenizer_pred.fit_on_texts(texts)
    encoded_pred = tokenizer_pred.texts_to_sequences(texts)

    X_pred = sequence.pad_sequences(encoded_pred,
                                    maxlen=time_step,
                                    padding='post')

    # Load the four trained models
    model1 = load_model('Model_CNN.h5')               # Keras CNN
    model2 = pickle.load(open("NB.sav", 'rb'))        # Naive Bayes
    model3 = pickle.load(open("Logistic.sav", 'rb'))  # logistic regression
    model4 = pickle.load(open("SVM.sav", 'rb'))       # SVM
    y_pred_1 = model1.predict(X_pred)

    # Refit the TF-IDF vectorizer on the cleaned corpus, then transform the
    # raw input texts for the three sklearn models
    cv = TfidfVectorizer(max_features=5000)
    cleaned = DataClean.cleaned()
    cv.fit(cleaned)
    arr_texts = cv.transform(text_ml)
    y_pred_2 = model2.predict(arr_texts)
    y_pred_3 = model3.predict(arr_texts)
    y_pred_4 = model4.predict(arr_texts)
    count = []

    # Collect one vote per model predicting class 1; the length of count
    # is the number of agreeing models, i.e. the reliability index
    # (the original used math.ceil here, which flags any nonzero sigmoid
    # output as 1; a 0.5 threshold is the conventional fix)
    if y_pred_1[0][0] >= 0.5:
        count.append(1)
    if y_pred_2[0] == 1:
        count.append(1)
    if y_pred_3[0] == 1:
        count.append(1)
    if y_pred_4[0] == 1:
        count.append(1)
    return count
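
A minimal usage sketch for Example #1: the returned list holds one vote per model that flagged the text as class 1, so dividing its length by the number of models (four here) yields the reliability index mentioned in the comments. The name reliability_index is illustrative, not from the original code.

votes = pred_result()
reliability_index = len(votes) / 4  # fraction of the four models voting 1
print('reliability index:', reliability_index)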
Example #2
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from DataPrep.Clean_Texts import clean_text

dataVal_Fake_Real = pd.read_csv('fake_or_real_news.csv')

texts = dataVal_Fake_Real['text']
label = dataVal_Fake_Real['label']
# Clean the texts and encode the string class labels as integers
X = texts.map(clean_text)
labelEncoder = LabelEncoder()
encoded_label = labelEncoder.fit_transform(label)
y = np.reshape(encoded_label, (-1, 1))
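
# LabelEncoder assigns integer codes in sorted label order; classes_ holds
# the mapping and inverse_transform recovers the original strings. A short
# illustrative check, not part of the original code:
print(labelEncoder.classes_)                   # the distinct label strings
print(labelEncoder.inverse_transform([0, 1]))  # integer codes back to labels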

# Hold out the last 20% of rows as the test split (no shuffling)
training_size = int(0.8 * X.shape[0])
X_train = X[:training_size]
y_train = y[:training_size]
X_test = X[training_size:]
y_test = y[training_size:]
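
# The manual slicing above reproduces what train_test_split (imported but
# unused above) does with shuffle=False; an equivalent sketch:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False)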

# Clean the toxic-comment training data and pickle the arrays for reuse
dataset = pd.read_csv('balanced_train_data.csv')

texts = dataset['comment_text'].map(clean_text)

label = dataset.iloc[:, 2:8].values  # the six label columns

with open('Pickles/pickle_toxic_clean_balanced_train_Xy.pickle', 'wb') as f:
    pickle.dump((texts, label), f)  # pickle._dump is private; use pickle.dump

# To read the arrays back later:
# with open('Pickles/pickle_toxic_clean_balanced_train_Xy.pickle', 'rb') as f:
#     X, y = pickle.load(f)

# Repeat the cleaning for the test data
dataset_test = pd.read_csv('balanced_test_data.csv')

texts_test = dataset_test['comment_text'].map(clean_text)
label_test = dataset_test.iloc[:, 2:8].values
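
Example #2 cleans the test split but never writes it out; a symmetric dump would look like the following. The pickle path is an assumed name mirroring the train file above, not one taken from the original code.

with open('Pickles/pickle_toxic_clean_balanced_test_Xy.pickle', 'wb') as f:
    pickle.dump((texts_test, label_test), f)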