import math
import pickle

import pandas as pd
from keras.models import load_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

from DataPrep.Clean_Texts import clean_text
from DataPrep import DataClean  # assumed location of the project's DataClean module


def pred_result():
    vocabulary_size = 400000
    time_step = 300

    # Prepare the input text for the trained models.
    predset = pd.read_csv('check.csv')
    texts = predset['text'].astype(str)
    text_ml = texts  # keep the raw strings for the TF-IDF based models
    texts = texts.map(lambda x: clean_text(x))

    # Tokenize and pad the cleaned text for the CNN.
    tokenizer_pred = Tokenizer(num_words=vocabulary_size)
    tokenizer_pred.fit_on_texts(texts)
    encoded_pred = tokenizer_pred.texts_to_sequences(texts)
    X_pred = sequence.pad_sequences(encoded_pred, maxlen=time_step, padding='post')

    # Load the trained models.
    model1 = load_model('Model_CNN.h5')
    model2 = pickle.load(open('NB.sav', 'rb'))
    model3 = pickle.load(open('Logistic.sav', 'rb'))
    model4 = pickle.load(open('SVM.sav', 'rb'))

    y_pred_1 = model1.predict(X_pred)

    # Rebuild the TF-IDF features the scikit-learn models were trained on.
    cv = TfidfVectorizer(max_features=5000)
    cleaned = DataClean.cleaned()
    cv.fit_transform(cleaned)
    arr_texts = cv.transform(text_ml)

    y_pred_2 = model2.predict(arr_texts)
    y_pred_3 = model3.predict(arr_texts)
    y_pred_4 = model4.predict(arr_texts)

    # Tally the predictions: each model that flags the article adds one
    # vote, and the length of `count` is the reliability index.
    count = []
    if math.ceil(y_pred_1[0][0]) == 1:  # round the CNN's sigmoid output up
        count.append(1)
    if y_pred_2[0] == 1:
        count.append(1)
    if y_pred_3[0] == 1:
        count.append(1)
    if y_pred_4[0] == 1:
        count.append(1)
    return count
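# pred_result() refits its Tokenizer on the prediction texts, so the word
# indices are not guaranteed to match the ones the CNN saw during training.
# A minimal sketch of the usual fix, assuming the training script is amended
# to persist its fitted tokenizer; 'tokenizer.pickle' and both helper names
# are assumptions for illustration, not part of the original project.
import pickle

from keras.preprocessing import sequence


def save_tokenizer(tokenizer, path='tokenizer.pickle'):
    # Run once at training time, next to saving Model_CNN.h5.
    with open(path, 'wb') as f:
        pickle.dump(tokenizer, f)


def load_padded_input(texts, path='tokenizer.pickle', time_step=300):
    # Run at prediction time: reuse the training tokenizer instead of
    # refitting, so indices line up with the CNN's embedding layer.
    with open(path, 'rb') as f:
        tokenizer = pickle.load(f)
    encoded = tokenizer.texts_to_sequences(texts)
    return sequence.pad_sequences(encoded, maxlen=time_step, padding='post')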
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from DataPrep.Clean_Texts import clean_text

dataVal_Fake_Real = pd.read_csv('fake_or_real_news.csv')
texts = dataVal_Fake_Real['text']
label = dataVal_Fake_Real['label']

# Clean the raw article text.
X = texts.map(lambda x: clean_text(x))

# Encode the string labels as integers and shape them as a column vector.
labelEncoder = LabelEncoder()
encoded_label = labelEncoder.fit_transform(label)
y = np.reshape(encoded_label, (-1, 1))

# Hold out the last 20% of rows as the test set.
training_size = int(0.8 * X.shape[0])
X_train = X[:training_size]
y_train = y[:training_size]
X_test = X[training_size:]
y_test = y[training_size:]
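# The slicing above keeps the CSV's row order, so if the file is sorted by
# label the train and test class balances will differ. A sketch of a
# shuffled, stratified alternative using scikit-learn's train_test_split,
# which the original file imported but never used (test_size=0.2 mirrors the
# manual 80/20 split; random_state=42 is an arbitrary choice):
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=encoded_label)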
import pandas as pd
import pickle

from DataPrep.Clean_Texts import clean_text

dataset = pd.read_csv('balanced_train_data.csv')
texts = dataset['comment_text']
# Clean every comment; map() avoids mutating the Series element by element.
texts = texts.map(lambda x: clean_text(x))
# Columns 2-7 hold the six toxicity label columns.
label = dataset.iloc[:, 2:8].values

# Cache the cleaned training texts and labels for later training runs.
with open('Pickles/pickle_toxic_clean_balanced_train_Xy.pickle', 'wb') as f:
    pickle.dump((texts, label), f)

dataset_test = pd.read_csv('balanced_test_data.csv')
texts_test = dataset_test['comment_text']
texts_test = texts_test.map(lambda x: clean_text(x))
label_test = dataset_test.iloc[:, 2:8].values
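# To reuse the cached pair in a later training script, load it back; this is
# the runnable form of the load snippet commented out in the original file.
import pickle

with open('Pickles/pickle_toxic_clean_balanced_train_Xy.pickle', 'rb') as f:
    X, y = pickle.load(f)
print(X[0], y.shape)  # sanity check: first cleaned comment and label matrix shape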