def upload_file():
    """Handle a POSTed CSV upload: save it, clean it, ingest it, then delete it.

    Returns a redirect to the microfinance page on success, or the string
    'file upload failed' for a GET request or an invalid/missing file.
    """
    if request.method == 'POST':
        # .get() instead of ['file'] so a form without the field falls through
        # to the failure message rather than raising KeyError.
        file = request.files.get('file')
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(path)
            clean_data(path, request.form['report_date'])
            os.remove(path)
            # clean_data writes its output as "<stem><report_date>.csv".
            # os.path.splitext (not path.index('.')) so a dot anywhere in the
            # upload folder or filename (e.g. "report.v2.csv") does not
            # truncate the path at the wrong place.
            stem, _ = os.path.splitext(path)
            ## function with database connection and mysql script
            ## takes the csv file path as an argument
            read_data(stem + request.form['report_date'] + '.csv')
            return flask.redirect('microfinance/')
    return 'file upload failed'
def nn_algorithms(file_data_name):
    """Train and evaluate an MLP neural-network classifier.

    Parameters
    ----------
    file_data_name : data-file identifier handed to clean_data().

    Returns
    -------
    tuple of (performing_time, precision): wall-clock seconds spent on the
    whole train/predict cycle, and mean accuracy on the held-out test set.
    """
    # calculate performing time
    start = time.time()

    data_frame = clean_data(file_data_name)
    X_train, Y_train, X_test, Y_test = X_train_Y_train_X_test_Y_test(
        data_frame)

    # print data's size
    print('X train: ', X_train.shape)
    print('y train: ', Y_train.shape)
    print('X test: ', X_test.shape)
    print('y test: ', Y_test.shape)

    # call model and fit model and training data
    clf_ann = MLPClassifier().fit(X_train, Y_train)

    # predict test data
    y_pred_ann = clf_ann.predict(X_test)
    cm_ann = confusion_matrix(Y_test, y_pred_ann)
    # Bug fix: label previously said "SVM" (copy/paste error) although this
    # function evaluates a neural network.
    print('Confusion matrix Neural Network:\n', cm_ann)

    precision = clf_ann.score(X_test, Y_test)
    print('Accuracy of Neural Network on test set: ', precision)

    end = time.time()
    performing_time = end - start
    return performing_time, precision
def lr_algorithms(file_data_name):
    """Train and evaluate a logistic-regression classifier.

    Parameters
    ----------
    file_data_name : data-file identifier handed to clean_data().

    Returns
    -------
    tuple of (performing_time, precision): wall-clock seconds spent on the
    whole train/predict cycle, and mean accuracy on the held-out test set.
    """
    # calculate performing time
    start = time.time()

    data_frame = clean_data(file_data_name)
    X_train, Y_train, X_test, Y_test = X_train_Y_train_X_test_Y_test(
        data_frame)

    # print data's size
    print('X train: ', X_train.shape)
    print('y train: ', Y_train.shape)
    print('X test: ', X_test.shape)
    print('y test: ', Y_test.shape)

    # call model and fit model and training data
    clf_lr = LogisticRegression().fit(X_train, Y_train)

    # predict test data
    y_pred_lr = clf_lr.predict(X_test)
    cm_lr = confusion_matrix(Y_test, y_pred_lr)
    # Bug fix: label previously said "SVM" (copy/paste error) although this
    # function evaluates logistic regression.
    print('Confusion matrix Logistic Regression:\n', cm_lr)

    precision = clf_lr.score(X_test, Y_test)
    print('Accuracy of Logistic Regression on test set: ', precision)

    end = time.time()
    performing_time = end - start
    return performing_time, precision
def test_clean_data():
    """clean_data() must trim the sample dataset from 999x142 to 985x119.

    Loads the bundled data.csv.gz from the package's data directory, checks
    the raw shape/leading columns, then checks the cleaned shape.
    """
    # Bug fix: "maxime-prevost" is not a valid Python identifier (hyphens are
    # illegal), so the original line was a syntax error.  A hyphenated
    # distribution name imports with an underscore instead.
    datapath = os.path.dirname(os.path.abspath(maxime_prevost.__file__)) + '/data'
    df = pd.read_csv('{}/data.csv.gz'.format(datapath))
    first_cols = ['id', 'civility', 'birthdate', 'city', 'postal_code', 'vote_1']
    assert list(df.columns)[:6] == first_cols
    assert df.shape == (999, 142)
    out = clean_data(df)
    assert out.shape == (985, 119)
def test_clean_data(self):
    """clean_data() must trim the sample dataset from 999x142 to 985x119.

    Loads the bundled data.csv.gz from the package's data directory, checks
    the raw shape/leading columns, then checks the cleaned shape.
    """
    # Bug fix: "friday-feeling" is not a valid Python identifier (hyphens are
    # illegal), so the original line was a syntax error.  A hyphenated
    # distribution name imports with an underscore instead.
    datapath = os.path.dirname(os.path.abspath(friday_feeling.__file__)) + '/data'
    df = pd.read_csv('{}/data.csv.gz'.format(datapath))
    self.assertListEqual(
        list(df.columns)[:6],
        ['id', 'civility', 'birthdate', 'city', 'postal_code', 'vote_1'])
    self.assertEqual(df.shape, (999, 142))
    out = clean_data(df)
    self.assertEqual(out.shape, (985, 119))
# exercise 8.2.6 import matplotlib.pyplot as plt import numpy as np from scipy.io import loadmat import torch from sklearn import model_selection from __init__ import train_neural_net, draw_neural_net from scipy import stats from clean_data import * from pandas import DataFrame import pandas as pd # Load Matlab data file and extract variables of interest data = clean_data('Datasets/**videos.csv') data = transform_data( data, ['likes', 'dislikes', 'views', 'comment_count', 'trending_time']) np.random.seed(180820) data = data.head(10000) #X = np.array(data[['likes','dislikes','comment_count','trending_time']]) #y = np.array(data['views']).squeeze() X = np.array(data) y = X[:, [4]] X = X[:, 0:4] attributeNames = [ 'likes', 'dislikes', 'views', 'comment_count', 'trending_time' ] N, M = X.shape C = 2 # Parameters for neural network classifier n_hidden_units = 5 # number of hidden units
import numpy as np
import pylab as plt
from clean_data import *
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# this script requires an old scikit-learn (<0.20) to run as written.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import KFold

# Kaggle Titanic train/test CSVs, relative to this script's location.
df_train = pd.read_csv('../../data/train.csv')
df_test = pd.read_csv('../../data/test.csv')

### Training
# clean_data presumably engineers the derived columns named in `predictors`
# below (Embarked_Val_*, AgeFill, Fare_per_person, ...); confirm in clean_data.
train_data_frame = clean_data(df_train,drop_passenger_id=True)
train_data = train_data_frame.values

# Training data features, skip the first column 'Survived'
train_features = train_data[:, 1:]

# # which features are the best?
predictors = ["Pclass", "Fare", "Sex", "Ticket","Embarked_Val_C","Embarked_Val_Q",
              "Embarked_Val_S", "FamilySize", "AgeFill","AgeCat","Fare_per_person",
              "Title", "HighLow","FamilyId", \
              "Age_class", "Fare_class","Family", \
              #"Sex_class","AgeFill_squared","Age_class_squared",\
              ]

### Ensemble Model
# NOTE(review): this assignment is truncated at the chunk boundary — the list
# of (model, predictor-set) entries continues past the visible source.
algorithms = [
    [GradientBoostingClassifier(learning_rate=0.005, n_estimators=250,
import numpy as np
import pylab as plt
from clean_data import *
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# this script requires an old scikit-learn (<0.20) to run as written.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import KFold

# Kaggle Titanic train/test CSVs, relative to this script's location.
df_train = pd.read_csv('../../data/train.csv')
df_test = pd.read_csv('../../data/test.csv')

### Training
# clean_data presumably engineers the derived columns named in `predictors`
# below (Embarked_Val_*, AgeFill, Fare_per_person, ...); confirm in clean_data.
train_data_frame = clean_data(df_train, drop_passenger_id=True)
train_data = train_data_frame.values

# Training data features, skip the first column 'Survived'
train_features = train_data[:, 1:]

# # which features are the best?
predictors = ["Pclass", "Fare", "Sex", "Ticket","Embarked_Val_C","Embarked_Val_Q",
              "Embarked_Val_S", "FamilySize", "AgeFill","AgeCat","Fare_per_person",
              "Title", "HighLow","FamilyId", \
              "Age_class", "Fare_class","Family", \
              #"Sex_class","AgeFill_squared","Age_class_squared",\
              ]

### Ensemble Model
# NOTE(review): this assignment is truncated at the chunk boundary — the list
# of (model, predictor-set) entries continues past the visible source.
algorithms = [[
    GradientBoostingClassifier(learning_rate=0.005, n_estimators=250,
### Laura Buchanan
### lcb402

#from restaurant_grades import *
from clean_data import *
from grade_funcs import *
import os.path
import sys

if __name__ == "__main__":
    try:
        # Load the cleaned data, generating the CSV first if it does not
        # exist yet.  (Previously the read_csv call was duplicated in both
        # branches and the existence check compared against `== True`.)
        if not os.path.exists('../clean_data.csv'):
            clean_data()
        data = pd.read_csv('../clean_data.csv', sep=',', header=0,
                           low_memory=False)

        # Look at trends in grade changes and plot region by grade and year
        regions = ['NYC', 'STATEN ISLAND', 'QUEENS', 'MANHATTAN', 'BRONX',
                   'BROOKLYN']
        for region in regions:
            grade = grades(data, region)
            grade.improve_over_region()
            grade.count_for_bargraph()
            grade.make_bargraph()
    except KeyboardInterrupt:
        # Bug fix: sys.ext does not exist — the intended call is sys.exit.
        # Parenthesized print prints identically under Python 2 and 3.
        print("\nProgram ended by user.")
        sys.exit(1)
from sklearn.model_selection import train_test_split
from clean_data import *
from add_column import *

# Load the daily reporting data and derive calendar feature columns.
# (The add_column_* helpers presumably each append one derived column named
# after the function suffix — confirm in add_column.py.)
data_jour = read_data('reporting_jour.csv')
data_jour = clean_data(data_jour)
data_jour = add_column_semaine(data_jour)
data_jour = add_column_jour(data_jour)
data_jour = add_column_year(data_jour)
data_jour = add_column_month(data_jour)
data_jour = add_column_date(data_jour)
# Cast the calendar features to strings so downstream encoding treats them
# as categorical rather than numeric.
data_jour[['jour', 'year', 'month', 'date']] = data_jour[['jour', 'year',
                                                          'month',
                                                          'date']].astype(str)
# Drop the raw source columns that the derived features replace.
# NOTE(review): presumed redundant with the derived columns — verify.
data_jour = data_jour.drop(
    ['date_commande', 'midi_soir', 'jour_semaine', 'type_vente'], axis=1)

# Target is 'frequentation'; every remaining column is a feature.
y = data_jour.frequentation
X = data_jour.drop('frequentation', axis=1)
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                    random_state=2)
# -*- coding: utf-8 -*- from sklearn.externals import joblib from clean_data import * clf = joblib.load('model.pkl') vectorizer = joblib.load('vectorembedding.pkl') comment = 'nó đẹp' comment = remove_Stopword(tokenize(convert_Abbreviation(normalize_Text(clean_data(comment))))) listcomment = [] listcomment.append(comment) vectorcomment = vectorizer.transform(listcomment).toarray() print clf.predict(vectorcomment)
print(target) # run wls, centering weights at target observation resp = wls( X, Y, target, h, wt_fxn ) # run wls regression. return coefficient estimates and estimators coeff_sub = pd.DataFrame([resp[0].flatten().tolist()[0]]) coeffs = coeffs.append(coeff_sub) #resids.append(resp[1].flatten().tolist()[0]) return coeffs #return [coeffs,resids,Ws] ############################################################################### data = clean_data('19800101', '20161230') # test CAPM specification, assuming lag-one AR in returns asset = 'NoDur' X = gen_X(data, asset, 'CAPM') Y = np.matrix(data.loc[1:, asset].values).T reg_out = rolling_regression(X, Y, 0.2, 'uniform') # This works, BUT IT RUNS SUPER SLOWLY. # It can't even get through all 9000+ data points. This will certainly NOT work for bootstrapping. # Need to figure out some way to run this faster (in parallel?)