rating.append(rating_all[i])  # NOTE(review): tail of a filtering loop that starts above this chunk
### To get 2 classes only — keep this toggle; uncomment to drop neutral (rating == 0) reviews:
# for i in range(0, len(body_all)):
#     if rating_all[i] != 0:
#         body.append(body_all[i])
#         rating.append(rating_all[i])

# Assemble the parallel body/rating lists into a two-column DataFrame.
columns = {'body': body, 'rating': rating}
data = pd.DataFrame(columns, columns=['body', 'rating'])
# NOTE(review): this builds a 1-row frame whose two cells are the whole lists —
# looks unintended, but it is kept in case later (unseen) code reads `reviews`.
reviews = pd.DataFrame([[body, rating]])

############### Preprocessing ########
# Clean every review text in place (column 0): strip double quotes, map
# emoticons to tokens, then normalize the Arabic text (character
# normalization, elongation removal, diacritic de-noising).
for i in range(0, len(data)):
    data.iloc[i, 0] = re.sub(r'"', '', data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Emoticon_detection(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.clean_raw_review(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.normalizeArabic(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Elong_remove(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.deNoise(data.iloc[i, 0])
    # Optional steps, disabled in this configuration:
    # data.iloc[i, 0] = LoadDataset_General.Remove_Stopwords(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Named_Entity_Recognition(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Stem_word(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Light_Stem_word(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Get_root_word(data.iloc[i, 0])

# NOTE(review): random.shuffle() does not shuffle DataFrame rows; if shuffling
# is re-enabled, use data.sample(frac=1).reset_index(drop=True) instead.
# random.shuffle(data)

# Split the cleaned frame back into parallel text/label lists for training.
train_texts = data.iloc[:, 0].tolist()
train_labels = data.iloc[:, 1].tolist()
# --- Imports and preprocessing pipeline setup --------------------------------
import codecs
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import pyarabic.arabrepr
from pyarabic.named import *
from qalsadi import analex
from tashaphyne.stemming import ArabicLightStemmer

from Classifiers import *
from Feature_Generation import *

# The preprocessing helpers live on a LoadDataset_General instance (the class
# comes in via one of the star imports above). The instance deliberately
# shadows the class name, matching the rest of this codebase.
LoadDataset_General = LoadDataset_General()

############### Preprocessing ########
# NOTE(review): `data` must be a DataFrame with the review text in column 0,
# constructed before this loop — its construction is not visible in this chunk.
# Each review is cleaned in place: emoticon mapping, raw cleanup, Arabic
# normalization, elongation removal, de-noising, stop-word removal and NER.
for i in range(0, len(data)):
    data.iloc[i, 0] = LoadDataset_General.Emoticon_detection(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.clean_raw_review(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.normalizeArabic(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Elong_remove(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.deNoise(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Remove_Stopwords(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Named_Entity_Recognition(data.iloc[i, 0])
    # Optional stemming steps, disabled in this configuration:
    # data.iloc[i, 0] = LoadDataset_General.Stem_word(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Light_Stem_word(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Get_root_word(data.iloc[i, 0])

# BUG FIX: the original `data[0][2] = ...` used chained indexing, which may
# assign into a temporary copy and silently not write back; .loc is the
# label-equivalent, safe form (column label 0, row label 2).
data.loc[2, 0] = LoadDataset_General.Emoticon_detection(data.loc[2, 0])

# BUG FIX: the original called random.shuffle(data) — `random` was never
# imported here, and random.shuffle cannot shuffle DataFrame rows anyway.
# Shuffle the rows with pandas instead.
data = data.sample(frac=1).reset_index(drop=True)

# NOTE(review): `val_split` must be defined earlier in the file; despite its
# name it is used here to size the *training* set — TODO confirm intent.
train_size = int(len(data) * val_split)