def __init__(self, X_train, X_test, y_train, y_test, tags):
    self.y_train = y_train
    self.y_test = y_test
    self.tags = tags
    # The limit loads vectors for only the 500,000 most frequent words,
    # which speeds up loading a little.
    glove_model = KeyedVectors.load_word2vec_format(
        "pretrained_vectors/gensim_glove_vectors.txt", binary=False, limit=500000)
    # Precompute the normalized vectors (initializes model.syn0norm)
    glove_model.init_sims(replace=True)
    print("Done Processing Pretrained Vectors")

    pre = PreProcessing()
    test_tokenized = X_test.apply(lambda item: self.tokenize_text(item))
    train_tokenized = X_train.apply(lambda item: self.tokenize_text(item))
    self.X_train_word_average = pre.word_averaging_list(glove_model, train_tokenized)
    self.X_test_word_average = pre.word_averaging_list(glove_model, test_tokenized)
    print("Done Applying Pretrained Vectors")
def main(self):
    preprocess = PreProcessing()
    self.wordsList, self.wordVectors = preprocess.load_glove()
    data, labels, types = preprocess.load_mutations()
    self.numClasses = len(types)

    # Create a dictionary mapping each type to its integer value
    for count, i in enumerate(types):
        self.types.update({i: count})

    train_seqs, test_seqs, train_labels, test_labels = self.normalize_data(data, labels)

    # Print details about the data
    classes = np.sort(np.unique(train_labels))
    print("\n=================================\nData details:")
    print("- Training-set:\t{}".format(len(train_seqs)))
    print("- Test-set:\t\t{}".format(len(test_seqs)))
    print("- Classes:\t\t{}".format(classes))
    print("=================================\n\n")

    self.prepare_lstm(train_seqs, train_labels, test_seqs, test_labels)
def run_experiment(self):
    '''Run the specified experiments.

    return: dict with the metrics of the last model evaluated
    '''
    pre = PreProcessing()
    ds = DataSource()
    met = Metrics()

    print('Reading Data')
    train_df = ds.read_data(train=True)
    test_df = ds.read_data(train=False)
    y_test = test_df['y']

    print('Preprocessing train data')
    X_train, y_train = pre.preprocess(train_df, train=True)
    print('Preprocessing test data')
    X_test = pre.preprocess(test_df[pre.train_features], train=False)

    print('Training model')
    models = Experiments().train_model(X_train, y_train)

    print('Running metrics')
    for model in models.keys():
        print(model)
        y_pred = models[model].predict(X_test)
        # Compute the metrics once, then print and persist them
        metrics = met.calculate_classification(model, y_test, pd.Series(y_pred))
        print(metrics)
        pd.DataFrame.from_dict(metrics, orient='index').to_csv('../output/' + model + '.csv')
    return metrics
def music_test():
    # Load and normalize the music dataset
    dataset = PreProcessing("default_features_1059_tracks.txt")
    dataset.normalize(ignore_first_column=False)

    # Parameters to vary across the tests
    n_layers = [1, 2]
    hidden_layer = [20, [10, 10]]
    momentums = [0.3, 0.5, 0.7]
    max_iterations = [100, 250, 500]
    etas = [0.3, 0.5, 0.7]
    ps = [0.5, 0.7, 0.9]

    # Test loop
    for layer in n_layers:
        for momentum in momentums:
            for eta in etas:
                for max_iteration in max_iterations:
                    for p in ps:
                        train, test = training.holdout(p, dataset.normalized_dataframe)
                        example = test.values.tolist()
                        print("INPUT NEURONS = 68 HIDDEN NEURONS = " +
                              str(hidden_layer[layer - 1]) +
                              " OUTPUT NEURONS = 2 HIDDEN LAYER = " + str(layer) +
                              " ETA = " + str(eta) +
                              " MAX ITERATIONS = " + str(max_iteration) +
                              " MOMENTUM = " + str(momentum) +
                              " P = " + str(p))
                        print()
                        nn = Mlp(68, hidden_layer[layer - 1], 2, n_hidden_layers=layer)
                        nn.backpropagation(train.values.tolist(), eta=eta,
                                           max_iterations=max_iteration)
                        print("SQUARED ERROR =",
                              training.squared_error(nn, test, n_classes=2))
                        print()
                        print("Input 1")
                        nn.feed_forward(example[0][:(-1 * 2)])
                        print(example[0])
                        print("Result 1")
                        nn.show_class()
                        print()
                        print("Input 2")
                        print(example[15])
                        nn.feed_forward(example[15][:(-1 * 2)])
                        print("Result 2")
                        nn.show_class()
                        print()
                        print("******************************************************//******************************************************")
                        print()
def process(self, frame, name="TrainingSamples/Image_"):
    # Preprocessing for contour detection
    preprocessed = PreProcessing().background_contour_removal(frame)

    # Find contours using the algorithm by Suzuki et al. (1985)
    contours, hierarchy = cv.findContours(preprocessed, cv.RETR_TREE,
                                          cv.CHAIN_APPROX_NONE)

    # Limit the number of observed contours
    if len(contours) > 500:
        contours = contours[:500]

    # Ignore the first contour, as it is the outer border of the frame
    contours = contours[1:]
    hierarchy = hierarchy[0][1:] - 1
    hierarchy = np.where(hierarchy < 0, -1, hierarchy)

    if len(contours) == 0:
        return preprocessed

    # Initialize a Contour object from each contour in the contour list
    binarized = PreProcessing().custom_binarize(frame)
    contourList = [
        Contour(contour=cnt, imgShape=frame.shape, frameBinary=binarized)
        for cnt in contours
    ]

    # Filter, classify, and group the segmented contours
    sg = Segmentation(contourList, hierarchy, frame.shape)
    sg.group_and_classify()
    filtered = sg.get_contours()
    if len(filtered) == 0:
        return preprocessed

    # Convert to colour to make debugging easier
    preprocessed = cv.cvtColor(preprocessed, cv.COLOR_GRAY2BGR)

    lines = LineOrdering(filtered).get_lines(frame)
    # Label contours with additional positional information
    lines = sg.label_contours(lines)
    for l, line in enumerate(lines):
        for i, cnt in enumerate(line):
            cv.putText(frame, str(l) + str(i), (cnt.center[0], cnt.center[1]),
                       cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)

    solutions = [
        self.solver.solve([cnt.unwrap() for cnt in line], frame)
        for line in lines if len(line) > 2
    ]
    return preprocessed  # orderedImage
def get(self, preprocessing, c):
    global X_train, y_train, X_test, y_test
    if preprocessing == 'StandardScaler':
        X_train, X_test = PreProcessing.standard_scaler(X_train, X_test)
    elif preprocessing == 'MinMaxScaler':
        X_train, X_test = PreProcessing.min_max_scaler(X_train, X_test)
    return Model.train_and_test(X_train=X_train, X_test=X_test,
                                y_train=y_train, y_test=y_test, c=c)
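# A minimal sketch of how a handler like the one above could be exposed over
# HTTP. The Resource base class, route, and converter types are assumptions
# for illustration (flask_restful), not taken from the snippet itself.
from flask import Flask
from flask_restful import Api, Resource

app = Flask(__name__)
api = Api(app)


class TrainEndpoint(Resource):
    def get(self, preprocessing, c):
        # ... body as in the snippet above ...
        return {"preprocessing": preprocessing, "c": c}


# 'c' arrives as a float thanks to the route converter
api.add_resource(TrainEndpoint, '/train/<string:preprocessing>/<float:c>')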
def main():
    dataset = PreProcessing("wine_dataset.txt")
    dataset.normalize(ignore_first_column=True)
    dataset.switch_first_last_column()
    dataset.normalize_class()

    train, test = training.holdout(0.7, dataset.normalized_dataframe)
    nn = Mlp(13, 10, 3, n_hidden_layers=1)
    nn.backpropagation(train.values.tolist(), eta=0.5)

    example = test.values.tolist()
    print(len(example))
    input()
    # print(example)
    # print(example[17])

    # Feed examples through the network and show the predicted class
    nn.feed_forward(example[0][:(-1 * 3)])
    print(example[0])
    nn.show_class()

    nn.feed_forward(example[40][:(-1 * 3)])
    print(example[40])
    print(test.iloc[[40]].values.tolist())
    input()
    nn.show_class()

    nn.feed_forward(example[31][:(-1 * 3)])
    print(example[31])
    nn.show_class()

    print(training.accuracy(nn, test, n_classes=3))
def wine_test():
    # Load and normalize the wine dataset
    dataset = PreProcessing("wine_dataset.txt")
    dataset.normalize(ignore_first_column=True)
    dataset.switch_first_last_column()
    dataset.normalize_class()

    # Parameters to vary across the tests
    n_layers = [1, 2]
    hidden_layer = [10, [5, 5]]
    momentums = [0.3, 0.5, 0.7]
    max_iterations = [100, 250, 500]
    etas = [0.3, 0.5, 0.7]
    ps = [0.5, 0.7, 0.9]

    # Test loop
    for layer in n_layers:
        for momentum in momentums:
            for eta in etas:
                for max_iteration in max_iterations:
                    for p in ps:
                        train, test = training.holdout(p, dataset.normalized_dataframe)
                        example = test.values.tolist()
                        print("INPUT NEURONS = 13 HIDDEN NEURONS = " +
                              str(int(10 / layer)) +
                              " OUTPUT NEURONS = 3 HIDDEN LAYER = " + str(layer) +
                              " ETA = " + str(eta) +
                              " MAX ITERATIONS = " + str(max_iteration) +
                              " MOMENTUM = " + str(momentum) +
                              " P = " + str(p))
                        print()
                        nn = Mlp(13, hidden_layer[layer - 1], 3, n_hidden_layers=layer)
                        nn.backpropagation(train.values.tolist(), eta=eta,
                                           max_iterations=max_iteration)
                        print("ACCURACY =", training.accuracy(nn, test, n_classes=3))
                        print()
                        print("Input 1")
                        nn.feed_forward(example[0][:(-1 * 3)])
                        print(example[0])
                        print("Result 1")
                        nn.show_class()
                        print()
                        print("Input 2")
                        print(example[15])
                        nn.feed_forward(example[15][:(-1 * 3)])
                        print("Result 2")
                        nn.show_class()
                        print()
                        print("******************************************************//******************************************************")
                        print()
def get_data():
    """Use the EntsoeDownloader and PreProcessing classes to run the scraping
    and preprocessing pipeline.

    Returns
    -------
    df
        A dataframe serialized as JSON, ready to dispatch to the next services
    price_list
        A list of electricity prices for the past 24 hours
    timestamp_list
        A list of timestamps for the past 24 hours and the next 24 hours
        (the following day)
    """
    date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print("Today's date & type: ", date, " & ", type(date))

    # Start web scraping: feed the ENTSO-E account details into the
    # EntsoeDownloader class
    downloader = EntsoeDownloader(date, "username", "password").setup(headless=True)
    downloader.login_and_download()

    # Start preprocessing
    pre_processing = PreProcessing()
    pre_processing.start_preprocess()
    df = pd.read_csv(os.getcwd() + "/download/final_dataset_kafka.csv")

    # Get the day-ahead price and generate new dates
    temp = df[[df.columns[0], "Day-ahead Price [EUR/MWh]"]].dropna()
    temp.rename(columns={df.columns[0]: "cet_timestamp"}, inplace=True)
    temp["cet_timestamp"] = pd.to_datetime(temp["cet_timestamp"],
                                           format="%Y-%m-%d %H:%M")
    temp.set_index("cet_timestamp", inplace=True)

    time = df[df.columns[0]][-24:].values
    last_date = temp.index[-1:][0]
    timestamp_list = list(time)
    # Extend the timestamps by 24 one-hour steps for the following day
    for i in range(1, 25):
        last_date += timedelta(hours=1)
        timestamp_list.append(last_date.strftime("%Y-%m-%d %H:%M:%S"))

    df = df.to_json(orient="split")
    price_list = list(temp["Day-ahead Price [EUR/MWh]"][-24:].values)
    return df, price_list, timestamp_list
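# A self-contained illustration of the 24-hour timestamp extension loop above,
# with a made-up starting timestamp:
from datetime import datetime, timedelta

last = datetime(2024, 1, 1, 23, 0)
stamps = []
for _ in range(24):
    last += timedelta(hours=1)
    stamps.append(last.strftime("%Y-%m-%d %H:%M:%S"))
print(stamps[0], "...", stamps[-1])  # 2024-01-02 00:00:00 ... 2024-01-02 23:00:00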
def preprocess_dataset(pca_processing: Union[KPCAPreprocessing, PCAPreprocessing],
                       preprocessing: PreProcessing,
                       dataset: np.ndarray) -> Any:
    # Standardize each sample, then project it with the chosen (K)PCA method
    ret_list = []
    for data_i in dataset:
        stnd_img = preprocessing.regular_preprocess(data_i)
        ret_list.append(pca_processing.apply_method(stnd_img))
    return ret_list
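# Hypothetical usage of preprocess_dataset: project a held-out set into the
# eigenspace before classification. The loader, file name, and classifier
# variable are stand-ins for illustration, not part of the original code.
test_images = np.load("test_faces.npy")  # e.g. shape (n, height, width, channels)
projected = np.array(preprocess_dataset(pca_processing, preprocessing, test_images))
predictions = classifier.predict(projected)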
def model_training(self):
    pre = PreProcessing()
    print('Reading data')
    df = self.data.read_data(train=True)

    print('Starting preprocessing')
    X_train, y_train = pre.preprocess(df, train=True)

    print('Starting model training')
    # Oversample with SMOTE, then fit a CatBoost classifier
    steps = [('over', SMOTE()), ('model', CatBoostClassifier())]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(X_train, y_train)

    modelo = pipeline['model']
    model = {
        'model': modelo,
        'preprocessing': pre,
        'columns': pre.feature_names
    }
    print(model)
    dump(model, '../output/modelo.pkl')
    return model
def train_with_svm(
    dataset: np.ndarray, labels: np.ndarray, names: np.ndarray,
    classifier: Classifier, is_pca: bool
) -> Tuple[PreProcessing, Union[KPCAPreprocessing, PCAPreprocessing]]:
    preprocessing = PreProcessing(dataset, dataset.shape[1], dataset.shape[2],
                                  dataset.shape[3])

    c_matrix: np.ndarray
    if is_pca:
        c_matrix = np.matmul(preprocessing.training_set, preprocessing.training_set.T)
    else:
        c_matrix = KPCAPreprocessing.get_kernel_pol_method(preprocessing.training_set)

    # Use the QR method to get eigenvalues and eigenvectors
    eigenvalues, eigenvectors = qr_eig_algorithm(c_matrix)

    # Keep enough eigenvectors to reach the PRECISION variance-ratio threshold
    total = np.sum(np.abs(eigenvalues))
    acum = 0
    i = 0
    while acum < PRECISION:
        acum += eigenvalues[i] / total
        i = i + 1
    print(f"To capture a {round(acum, 4)} explained-variance ratio "
          f"we will use {i} eigenvectors")

    # Keep the first i eigenvectors
    eigenvectors = eigenvectors[:i]

    if is_pca:
        processing = PCAPreprocessing(preprocessing.training_set,
                                      preprocessing.avg_face, eigenvectors,
                                      dataset.shape[1], dataset.shape[2],
                                      dataset.shape[3], names, labels)
    else:
        processing = KPCAPreprocessing(preprocessing.training_set,
                                       preprocessing.avg_face, eigenvectors,
                                       dataset.shape[1], dataset.shape[2],
                                       dataset.shape[3], names, labels, c_matrix)

    # Feature scaling
    sc = StandardScaler()
    scaled_training_set = sc.fit_transform(processing.training_set)

    # Train the classifier with default C and gamma values
    classifier.train_classifier(scaled_training_set, labels)
    print("Training done!")

    classifier.save(preprocessing, processing)
    return preprocessing, processing
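# The while-loop above picks the smallest number of leading eigenvalues whose
# cumulative share of the (absolute) spectrum reaches the threshold. A
# self-contained NumPy illustration of the same rule, with made-up values:
import numpy as np

def pick_components(eigenvalues, threshold):
    ratios = np.abs(eigenvalues) / np.sum(np.abs(eigenvalues))
    return int(np.searchsorted(np.cumsum(ratios), threshold)) + 1

vals = np.array([4.0, 2.0, 1.0, 0.5, 0.5])
print(pick_components(vals, 0.8))  # 3, since (4+2)/8 = 0.75 < 0.8 <= (4+2+1)/8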
def __init__(self):
    data_src = './data_repository/geological_similarity/'
    self.model_path = './trained_model/model_triplet/'
    self.dataset = PreProcessing(data_src)
    self.model = SiameseNetwork()

    # Define the input tensor and build the network
    self.img_placeholder = tf.placeholder(tf.float32, [None, 28, 28, 3], name='img')
    self.net = self.model.conv_net(self.img_placeholder, reuse=False)

    self.normalized_training_vectors = self.generate_db_normed_vectors()
    print('Prediction object loaded successfully.')
def main():
    preProcessing = PreProcessing("mnist_train.csv")
    # preProcessing.preProcessData()

    # The argument is the number of hidden units
    processing = Processing(10)
    processing.load_data("mnist_train_scaled.csv", "mnist_train_targetClass.csv")
    processing.processing()

    for arg in sys.argv[1:]:
        print(arg)
def testKMEANS():
    data = pd.read_table("datasets/mouse.txt", engine='python', sep=',', header=None)
    # pp.scale returns a NumPy array, so wrap the result back into a DataFrame
    data = pd.DataFrame(pp.scale(data))

    # Set column names
    # TODO: set standard names for N columns or use the default pandas names
    data.columns = ['x', 'y']

    # Instantiate k-means
    km = K_means(data, k=3, alpha=0.001)
    # Run the algorithm
    km.update()
    # Plot the final result
    km.plot()
def train_with_svm(dataset_train, labels_train, classifier, is_pca, names):
    preprocessing = PreProcessing(dataset_train, dataset_train.shape[1],
                                  dataset_train.shape[2], dataset_train.shape[3])

    # Over this matrix we need to calculate the eigenvectors
    if is_pca:
        C_matrix = np.matmul(preprocessing.training_set, preprocessing.training_set.T)
    else:
        C_matrix = KPCAPreprocessing.rbf_kernel_pca(preprocessing.training_set)

    # Get eigenvalues and eigenvectors (NumPy here, instead of the custom routine)
    eigenvalues, eigenvec = np.linalg.eig(C_matrix)  # calculate_eigenvectors(C_matrix)

    total = np.sum(np.abs(eigenvalues))
    accumulated = 0
    i = 0
    while accumulated < 0.50:
        accumulated = accumulated + eigenvalues[i] / total
        i = i + 1
    print(f"To capture a {round(accumulated, 4)} explained-variance ratio "
          f"we will use {i} eigenvectors")
    print("Training...")

    # Keep the first i eigenvectors
    eigenvectors = eigenvec[:i]

    if is_pca:
        # Apply the PCA transformation to the training data
        pca_processing = PCAPreprocessing(preprocessing.training_set,
                                          preprocessing.avg_face, eigenvectors,
                                          dataset_train.shape[1], dataset_train.shape[2],
                                          dataset_train.shape[3], names, labels_train)
    else:
        # Apply the KPCA transformation to the training data
        pca_processing = KPCAPreprocessing(preprocessing.training_set,
                                           preprocessing.avg_face, eigenvectors,
                                           dataset_train.shape[1], dataset_train.shape[2],
                                           dataset_train.shape[3], names, labels_train,
                                           C_matrix)

    # Train the classifier with default C and gamma values
    classifier.train_classifier(pca_processing.training_set, labels_train)
    classifier.save(preprocessing, pca_processing)
    return preprocessing, pca_processing
def main():
    # Read the dataset and preprocess it
    dataset = PreProcessing("seeds_dataset.txt", separator=r'\s+')
    dataset.normalize()
    dataset.normalize_class()

    # Divide the dataset into training and test sets
    train, test = training.holdout(0.7, dataset.normalized_dataframe)

    nn = Rbf(7, 3)
    nn.train(train, eta=0.5, max_iterations=500)
    print("RBF:", training.accuracy(nn, test, 3))

    mm = Mlp(7, 3, 3)
    mm.backpropagation(train.values.tolist(), max_iterations=500)
    print("MLP:", training.accuracy(mm, test, 3))
FLAGS = flags.FLAGS
flags.DEFINE_integer('batch_size', 150, 'Batch size.')
flags.DEFINE_integer('train_iter', 100, 'Total training iterations')
flags.DEFINE_integer('step', 50, 'Save after ... iterations')
flags.DEFINE_float('learning_rate', 0.01, 'Learning rate')
flags.DEFINE_float('momentum', 0.99, 'Momentum')
flags.DEFINE_string('model', 'conv_net', 'Model to run')
flags.DEFINE_string('data_src', r'C:\OData', 'Source of the training dataset')  # Low_data, OData, all

tf.compat.v1.disable_eager_execution()

if __name__ == "__main__":
    # Set up the dataset
    dataset = PreProcessing(FLAGS.data_src)
    model = TripletLoss()
    placeholder_shape = [None] + list(dataset.images_train.shape[1:])
    print("placeholder_shape", placeholder_shape)

    # Set up the network: one placeholder per element of the triplet
    next_batch = dataset.get_triplets_batch
    anchor_input = tf.placeholder(tf.float32, placeholder_shape, name='anchor_input')
    positive_input = tf.placeholder(tf.float32, placeholder_shape, name='positive_input')
    negative_input = tf.placeholder(tf.float32, placeholder_shape, name='negative_input')
from get_data import GetData
from preprocessing import PreProcessing
from autoencoder import AutoEncoder
from data_processing import DataProcessing
from model import NeuralNetwork

# Download the stock data
data = GetData("AAPL", "2000-01-01", "2018-10-01")
data.get_stock_data()

# Wavelet-based preprocessing and test-data generation
preprocess = PreProcessing(0.8, 0.25)
preprocess.make_wavelet_train()
preprocess.make_test_data()

# Compress the features with an autoencoder
autoencoder = AutoEncoder(20)
autoencoder.build_train_model(55, 40, 30, 30, 40)

# Build the train/test splits and targets
process = DataProcessing(0.8, 0.25)
process.make_test_data()
process.make_train_data()
process.make_train_y()
process.make_test_y()

# Train the model
model = NeuralNetwork(20, True)
model.make_train_model()
def seed_test():
    # Load and normalize the seeds dataset
    dataset = PreProcessing("seeds_dataset.txt", separator=r'\s+')
    dataset.normalize()
    dataset.normalize_class()

    # Parameters to vary across the tests
    n_layers = [1, 2]
    hidden_layer = [3, [6, 6]]
    momentums = [0.3, 0.5]
    max_iterations = [100, 250, 500]
    etas = [0.3, 0.5]
    ps = [0.7, 0.9]

    rbf_accuracy = 0
    mlp_accuracy = 0
    tests = 0

    # Test loop
    for layer in n_layers:
        for momentum in momentums:
            for eta in etas:
                for max_iteration in max_iterations:
                    for p in ps:
                        tests += 1
                        print("Test number", tests)
                        train, test = training.holdout(p, dataset.normalized_dataframe)
                        print("INPUT NEURONS = 7 HIDDEN NEURONS = " +
                              str(hidden_layer[layer - 1]) +
                              " OUTPUT NEURONS = 3 HIDDEN LAYER = " + str(layer) +
                              " ETA = " + str(eta) +
                              " MAX ITERATIONS = " + str(max_iteration) +
                              " MOMENTUM = " + str(momentum) +
                              " P = " + str(p))
                        print()
                        print("RBF")
                        nn = Rbf(7, 3)
                        nn.train(train, eta=0.5, max_iterations=max_iteration)
                        ac = training.accuracy(nn, test, 3)
                        rbf_accuracy += ac
                        print("ACCURACY =", ac)
                        print()
                        print("MLP")
                        example = test.values.tolist()
                        mm = Mlp(7, hidden_layer[layer - 1], 3, n_hidden_layers=layer)
                        mm.backpropagation(train.values.tolist(), eta=eta,
                                           max_iterations=max_iteration)
                        ac = training.accuracy(mm, test, n_classes=3)
                        mlp_accuracy += ac
                        print("ACCURACY =", ac)
                        print()
                        print("Rbf:")
                        nn.feed_forward(example[15][:(-1 * 3)])
                        print(example[15])
                        print("Result 1")
                        nn.show_class()
                        print()
                        print("Mlp")
                        print(example[15])
                        mm.feed_forward(example[15][:(-1 * 3)])
                        print("Result 2")
                        mm.show_class()
                        print()
                        print("******************************************************//******************************************************")
                        print()
    print(tests, "tests executed. Rbf accuracy:", rbf_accuracy / tests,
          " Mlp accuracy:", mlp_accuracy / tests)
from utils import LoadData
from preprocessing import PreProcessing
from visuals import ClassifierVisual

# Import the model library
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Import data
dataset = LoadData("Social_Network_Ads.csv").data

# Split the dataset into features and target
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Let's do some preprocessing...
processor = PreProcessing()

# Split the data
X_train, X_test, y_train, y_test = processor.split(X, y, test_size=0.25)

# Scale the data
X_train = processor.fit_scaler(X_train)
X_test = processor.scale(X_test)

# Let's fit the model now
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

# Predict!
y_pred = classifier.predict(X_test)

# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)
#############################################################################################################################
# Preprocessing: make the dataset, or load a saved one
#############################################################################################################################
if LOAD_DATA:
    dataset = load_dataset(DATASET_PATH)
    if SPLIT_DATASET:
        valid_dataset = load_dataset(VALID_DATASET_PATH)
else:
    # Preprocessing
    source_path = r'./data/mnist-in-csv/mnist_test.csv'
    target_path = None
    sources, targets = PreProcessing(source_path, target_path, mode='csv')

    # Make the dataset
    ##############################################################################
    transform = transforms.Compose([self_transform()])
    if SPLIT_DATASET:
        pivot = int(len(sources) * SPLIT_RATIO)
        train_sources = sources[:pivot]
        train_targets = targets[:pivot]
        valid_sources = sources[pivot:]
        valid_targets = targets[pivot:]
        valid_dataset = Mydataset(valid_sources, valid_targets, transform)
from flask import Flask, jsonify, request
from flask_cors import CORS
from preprocessing import PreProcessing
from seq2seq import TextSummarization
from config import *
import nltk
import pickle
import numpy as np

app = Flask(__name__)
CORS(app)

preprocessing = PreProcessing()
model = TextSummarization(forward_only=True)

with open(WORD_DICT_PATH, "rb") as file:
    word_dict = pickle.load(file, encoding="utf-8")
reversed_word_dict = dict(zip(word_dict.values(), word_dict.keys()))


@app.route('/get_summary', methods=['POST'])
def get_summary():
    data = request.get_json()
    text = data["text"]
    clean_text = preprocessing.clean_string(text)
    # word_tokenize expects a string, so tokenize before batching
    x = nltk.word_tokenize(clean_text)
    x = [word_dict.get(d, word_dict["<unk>"]) for d in x]
    x = x[:(MAX_ARTICLE_LEN - 1)]
    x = [x + (MAX_ARTICLE_LEN - len(x)) * [word_dict["<padding>"]]]
    x = np.array(x)
    summary_text = model.get_summary(x, reversed_word_dict)
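# A quick, self-contained illustration of the truncate-and-pad step above,
# with toy values standing in for the real MAX_ARTICLE_LEN and word_dict:
MAX_ARTICLE_LEN = 6
word_dict = {"<unk>": 0, "<padding>": 1, "hello": 2, "world": 3}
ids = [2, 3, 2]
ids = ids[:(MAX_ARTICLE_LEN - 1)]  # keep at most 5 tokens
batch = [ids + (MAX_ARTICLE_LEN - len(ids)) * [word_dict["<padding>"]]]  # pad to length 6
print(batch)  # [[2, 3, 2, 1, 1, 1]]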
from preprocessing import PreProcessing
from sklearn.model_selection import train_test_split
from Models import LR, SVM, NaiveBayes, Word2VecDeep, BOWDeep, RNN
import matplotlib.pyplot as plt

pre = PreProcessing()
data = pre.clean_text()
X = data['post']
y = data['tags']

# Split into 20% test data and 80% training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

nb = NaiveBayes(X_train, X_test, y_train, y_test, pre.tags)
score = nb.train()

# lr = LR(X_train, X_test, y_train, y_test, pre.tags)
# lr.train()
# svm = SVM(X_train, X_test, y_train, y_test, pre.tags)
# svm.train()
# wv = Word2VecDeep(X_train, X_test, y_train, y_test, pre.tags)
# wv.train()
# bow = BOWDeep(X_train, X_test, y_train, y_test, pre.tags)
# print(bow.train())

# Use a different function for pre-processing
"""pre = PreProcessing()
X, y = pre.filter_rnn()

# Split into 20% test data and 80% training data
"""this module generate inverted index for each query box, generate the article shelve, offer conjunctive query """ import shelve import json from preprocessing import PreProcessing import time from nltk import word_tokenize # instantiate pp to access normalize and flatten, and to create the test corpus prep = PreProcessing() def timing(func): def wrapper(*args, **kwargs): t1 = time.time() func(*args, **kwargs) t2 = time.time() print("Time it took to build this index: " + str((t2 - t1)) + "\n") return wrapper @timing def main_query_inverted_index(shelvename1, shelvename2, corpus_name='2018_movies.json'): """ create a title+free text inverted index, and put it into 2 shelve files (because one cannot hold) :param shelvename1: String :param shelvename2: String :param corpus_name: String a json file
# Import lib files
envs = load_dotenv(find_dotenv())
file = os.getenv("lib")
sys.path.insert(0, file)
from utils import LoadData
from preprocessing import PreProcessing

# Load data
dataset = LoadData("50_Startups.csv").data

# Split the dataset
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# Using the PreProcessing class from preprocessing
processor = PreProcessing()

# Encoding dummy variables
X = processor.dummy_encoding(data=X, feature_position=3)

# Avoiding the dummy variable trap
X = X[:, 1:]

# Building the optimal model using Backward Elimination
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
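# The two manual elimination steps above generalize to a loop. A hedged sketch
# of automated backward elimination; the significance level and helper name
# are assumptions, not part of the original script:
def backward_elimination(X, y, significance_level=0.05):
    cols = list(range(X.shape[1]))
    while cols:
        ols = sm.OLS(endog=y, exog=X[:, cols]).fit()
        worst = int(np.argmax(ols.pvalues))
        if ols.pvalues[worst] <= significance_level:
            return cols, ols  # every remaining predictor is significant
        del cols[worst]


kept_columns, final_model = backward_elimination(X, y)
print(final_model.summary())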
        ps.stem(word) for word in review
        if word not in stopwords.words('english')
    ]
    review = " ".join(review)
    corpus.append(review)

corpus
dataset['cleaned'] = corpus

# Create the Bag of Words (BoW) model
cv = CountVectorizer(max_features=500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1]

### Build a model using the BoW features

# Let's do some preprocessing...
processor = PreProcessing()

# Split the data
X_train, X_test, y_train, y_test = processor.split(X, y, test_size=0.20)

# Let's fit the model now
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict!
y_pred = classifier.predict(X_test)

# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm
# Import lib files
envs = load_dotenv(find_dotenv())
file = os.getenv("lib")
sys.path.insert(0, file)
from utils import LoadData
from preprocessing import PreProcessing

# Load the data
dataset = LoadData("Salary_Data.csv").data

# Split the dataset
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values

# Using the PreProcessing class from preprocessing
processor = PreProcessing()

# Split the data
X_train, X_test, y_train, y_test = processor.split(X, y, test_size=0.2,
                                                   random_state=0)

# Fit Simple Linear Regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict the test set
y_pred = regressor.predict(X_test)

# Visualizing the data
plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.title('Salary vs. Exp.')
    accuracy.append(acc)
    average = np.mean(accuracy)
    std = np.std(accuracy)

    # Percentage accuracy of the predicted returns, skipping zero targets
    ret_acc = []
    for i in range(len(test_y) - 1):
        if test_y[i] != 0:
            acc = 100 - (np.abs(predicted_data[i] - test_y[i])) / test_y[i] * 100
            ret_acc.append(acc)
    ret_avg = np.mean(ret_acc)
    ret_std = np.std(ret_acc)
    pd.DataFrame(np.reshape(ret_acc, (len(ret_acc),))).to_csv(return_acc)

    # Undo the log transform and scale by the reference price
    prediction = np.exp(model.predict(np.reshape(test_data[-2], (1, 20)))) * price[-2]
    print(prediction)
    return dataset, average, std


# if __name__ == "__main__":
preprocess = PreProcessing(0.8, 0.25, "stock_data.csv",
                           "preprocessing/rbm_train.csv",
                           "preprocessing/rbm_test.csv",
                           "preprocessing/log_train.csv")
preprocess.make_wavelet_train()
preprocess.make_test_data()

# if __name__ == "__main__":
autoencoder = AutoEncoder(20, True,
                          "preprocessing/rbm_train.csv",
                          "preprocessing/rbm_test.csv",
                          "features/autoencoded_data.csv",
                          "preprocessing/log_train.csv")
autoencoder.build_train_model(55, 40, 30, 30, 40)

# if __name__ == "__main__":
dataset, average, std = nnmodel(500, 0.05, 0.01,
                                "features/autoencoded_data.csv",
                                "60_return_forex/encoded_return_test_data.csv",
                                "preprocessing/log_train.csv",
                                "forex_y/log_test_y.csv",
                                "forex_y/test_price.csv",
                                "60_return_forex/predicted_price.csv",
                                "60_return_forex/price.csv",
                                "60_return_forex/ret_acc.csv")
print(f"Price Accuracy Average = {average}\nPrice Accuracy Standard Deviation = {std}")
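# The percentage-accuracy formula used above, in isolation: 100 minus the
# absolute percentage error. Toy numbers for illustration only:
predicted, actual = 105.0, 100.0
acc = 100 - abs(predicted - actual) / actual * 100
print(acc)  # 95.0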
# Import lib files
envs = load_dotenv(find_dotenv())
file = os.getenv("lib")
sys.path.insert(0, file)
from utils import LoadData
from preprocessing import PreProcessing
from visuals import ClassifierVisual

# Import data
dataset = LoadData("Churn_Modelling.csv").data
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Let's do some preprocessing...
processor = PreProcessing()

# Encode the categorical features (Country and Gender)
X[:, 1] = processor.encode(X[:, 1])
X[:, 2] = processor.encode(X[:, 2])
X = processor.hot_encoding(data=X, features=[1])
# Avoid the dummy variable trap
X = X[:, 1:]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = processor.split(X, y, test_size=0.2)

# Fitting XGBoost
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predicting the test results
y_pred = classifier.predict(X_test)