all_datasets = []

#%%
# *************HARD************
df_HARD = pd.read_csv("Datasets\\HARD\\balanced-reviews-utf8.tsv", sep="\t", header=0)
df_HARD = df_HARD[["rating", "review"]]  # we are interested in rating and review only
# code rating as +ve if > 3, -ve if less; no 3s in dataset
df_HARD["rating"] = df_HARD["rating"].apply(lambda x: 0 if x < 3 else 1)
# rename columns to fit default constructor in fastai
df_HARD.columns = ["label", "text"]
df_HARD["text"] = df_HARD["text"].progress_apply(
    lambda x: preprocess(
        x, do_farasa_tokenization=True, farasa=farasa_segmenter, use_farasapy=True
    )
)
train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
label_list_HARD = [0, 1]
data_Hard = Dataset("HARD", train_HARD, test_HARD, label_list_HARD)
all_datasets.append(data_Hard)

#%%
# *************ASTD-Unbalanced************
df_ASTD_UN = pd.read_csv(
    "Datasets\\ASTD-master\\data\\Tweets.txt", sep="\t", header=None
)
DATA_COLUMN = "text"
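#%%
# Note: the cells above rely on objects defined earlier in this script:
# `preprocess` and `farasa_segmenter` (the AraBERT preprocessing helper and a
# Farasa segmenter instance), `tqdm.pandas()` to enable `progress_apply`, and a
# `Dataset` container. The commented class below is only a minimal sketch of
# what that container is assumed to hold; the attribute names are illustrative,
# not the script's exact definition:
#
# class Dataset:
#     def __init__(self, name, train, test, label_list):
#         self.name = name                # dataset identifier, e.g. "HARD"
#         self.train = train              # training split ('label'/'text' DataFrame)
#         self.test = test                # held-out split
#         self.label_list = label_list    # e.g. [0, 1] for binary sentiment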