def __rolling_window_test(self, data, window_size, test_size, step=1):
    """Refit and score the model on successive rolling windows of ``data``.

    Every window is a ``data.loc`` slice spanning
    ``round(window_size * len(data))`` labels; consecutive windows start
    ``step`` labels apart. All windows are built first, then all of them are
    split with ``pp.split``, and only then is the model refitted per window.
    For each window a new ``StandardScaler`` is fitted on the training
    features and used to transform the test features before predicting.
    The mean accuracy, precision, specificity and sensitivity over all
    windows are printed.

    Args:
        data: Object supporting ``len()`` and ``.loc`` slicing (presumably a
            pandas DataFrame with an integer index -- TODO confirm).
        window_size: Fraction of ``len(data)`` covered by one window.
        test_size: Forwarded unchanged to ``pp.split``.
        step: Distance between the start labels of consecutive windows.

    Raises:
        ZeroDivisionError: If ``step`` is 0, or if no window is produced
            (the averages divide by the number of windows).

    Side effects:
        Overwrites ``self.x_train``, ``self.x_test``, ``self.y_train``,
        ``self.y_test``, ``self.scaler`` and ``self.y_pred`` on every window.
    """
    print("\t\tRolling Window Validation Results:")

    # TODO: Hide the STDOUT of pp.split() and __fit_model(), and prevent
    # __fit_model() from saving a .pkl on each run
    span = round(window_size * len(data))
    window_count = int((len(data) - span) / step)
    windows = []
    for idx in range(window_count):
        start = idx * step
        windows.append(data.loc[start:start + span])

    # TODO: Do a nonrandom split to respect the temporal order of observations
    decoupled_windows = []
    for window in windows:
        decoupled_windows.append(
            pp.split(window, test_size=test_size, balanced=False))

    results = {
        "accuracy": [],
        "precision": [],
        "specificity": [],
        "sensitivity": [],
    }

    for x_train, x_test, y_train, y_test in decoupled_windows:
        self.x_train, self.x_test, self.y_train, self.y_test = (
            x_train, x_test, y_train, y_test)

        # The scaler only ever sees this window's training features.
        self.scaler = StandardScaler()
        self.scaler.fit(self.x_train)
        self.__fit_model()

        scaled_test = self.scaler.transform(self.x_test)
        self.y_pred = self.model.predict(scaled_test)

        results["accuracy"].append(
            analysis.accuracy(self.y_pred, self.y_test))
        results["precision"].append(
            analysis.precision(self.y_pred, self.y_test))
        results["specificity"].append(
            analysis.specificity(self.y_pred, self.y_test))
        results["sensitivity"].append(
            analysis.sensitivity(self.y_pred, self.y_test))

    # (label, metric key) pairs in the order they are reported.
    report = (
        ("\t\t\tAccuracy: ", "accuracy"),
        ("\t\t\tPrecision: ", "precision"),
        ("\t\t\tSpecificity: ", "specificity"),
        ("\t\t\tSensitivity: ", "sensitivity"),
    )
    for label, metric in report:
        scores = results[metric]
        print(label, str(sum(scores) / float(len(scores))))
def main(args):
    """Run the fetch / split / train / test pipeline configured by ``args``.

    Steps, in order: apply the settings, fetch data and ground truth, split
    them with ``pp.split``, print the sizes of the two partitions, optionally
    evaluate on the training partition itself, store the partition names on
    ``args``, train a model and hand it to ``test_model``.

    Args:
        args: Configuration object. This function reads ``args.settings``
            (indexed by ``"train_share"``, ``"test_share"`` and, optionally,
            ``"self_test"``), ``args.split_method``, ``args.names`` and
            ``args.model``; it writes ``args.train_names`` and
            ``args.test_names``.
    """
    set_settings(args)
    data, gt = fetch(args)

    partitions = pp.split(
        data,
        gt,
        args.split_method,
        train_share=args.settings["train_share"],
        test_share=args.settings["test_share"],
        names=args.names,
        return_names=True,
    )
    (train_data, train_gt,
     test_data, test_gt,
     train_names, test_names) = partitions

    print(len(train_data))
    print(len(test_data))

    # "self_test" is optional: a missing key means the feature is off.
    try:
        self_test = args.settings["self_test"]
    except KeyError:
        self_test = False
    if self_test:
        # Evaluate on the very data the model was trained on.
        test_data, test_gt, test_names = train_data, train_gt, train_names

    args.train_names = train_names
    args.test_names = test_names

    print("Training")
    model = train(train_data, train_gt, args.model, args)
    test_model(model, test_data, test_gt, args)
def preprocess(self, document, info=None):
    """Convert a raw document into a lazy stream of filtered words.

    The document is passed through ``to_unicode`` and ``tokenize``. Depending
    on the instance flags the words are then run through ``split``
    (``self.split``), lower-cased (``self.lower``) and filtered with
    ``remove_stops`` against ``STOPS`` (``self.remove_stops``). Finally only
    words whose length lies in ``[self.min_len, self.max_len]`` are kept.

    Args:
        document: Raw document handed to ``to_unicode``.
        info: Optional context forwarded to ``to_unicode``. When omitted, a
            new empty list is created for this call, so no list object is
            shared between calls (the previous ``info=[]`` default was a
            single list reused by every call).

    Returns:
        A generator over the remaining words. The lower-casing and length
        filtering steps are lazy and only run when the result is iterated.
    """
    if info is None:
        info = []

    document = to_unicode(document, info)
    words = tokenize(document)

    if self.split:
        words = split(words)

    if self.lower:
        words = (word.lower() for word in words)

    if self.remove_stops:
        words = remove_stops(words, STOPS)

    # Length bounds are read from self when the generator is consumed.
    return (
        word for word in words
        if self.min_len <= len(word) <= self.max_len
    )
def preprocess(self, document, info=None):
    """Convert a raw document into a lazy stream of filtered words.

    The document is passed through ``preprocessing.to_unicode`` and
    ``preprocessing.tokenize``. Depending on the instance flags the words are
    then run through ``preprocessing.split`` (``self.split``), lower-cased
    (``self.lower``) and filtered with ``preprocessing.remove_stops``
    (``self.remove_stops``), first against ``FOX_STOPS`` and then against
    ``JAVA_RESERVED``. Finally only words whose length lies in
    ``[self.min_len, self.max_len]`` are kept.

    Args:
        document: Raw document handed to ``preprocessing.to_unicode``.
        info: Optional context forwarded to ``preprocessing.to_unicode``.
            When omitted, a new empty list is created for this call, so no
            list object is shared between calls (the previous ``info=[]``
            default was a single list reused by every call).

    Returns:
        A generator over the remaining words. The lower-casing and length
        filtering steps are lazy and only run when the result is iterated.
    """
    if info is None:
        info = []

    document = preprocessing.to_unicode(document, info)
    words = preprocessing.tokenize(document)

    if self.split:
        words = preprocessing.split(words)

    if self.lower:
        words = (word.lower() for word in words)

    if self.remove_stops:
        # Two passes, in this order: general stop words, then Java keywords.
        words = preprocessing.remove_stops(words, preprocessing.FOX_STOPS)
        words = preprocessing.remove_stops(words, preprocessing.JAVA_RESERVED)

    # Length bounds are read from self when the generator is consumed.
    return (
        word for word in words
        if self.min_len <= len(word) <= self.max_len
    )
def __rolling_window_test(self, data, window_size, test_size, step=1):
    """Refit and score the model on successive rolling windows of ``data``.

    Every window is a ``data.loc`` slice spanning
    ``round(window_size * len(data))`` labels; consecutive windows start
    ``step`` labels apart. All windows are built first, then all of them are
    split with ``pp.split``, and only then is the model refitted and scored
    per window. The mean accuracy, precision, recall and F1 over all windows
    are printed (precision and recall are requested with
    ``weighted_avg=True``).

    Args:
        data: Object supporting ``len()`` and ``.loc`` slicing (presumably a
            pandas DataFrame with an integer index -- TODO confirm).
        window_size: Fraction of ``len(data)`` covered by one window.
        test_size: Forwarded unchanged to ``pp.split``.
        step: Distance between the start labels of consecutive windows.

    Raises:
        ZeroDivisionError: If ``step`` is 0, or if no window is produced
            (the averages divide by the number of windows).

    Side effects:
        Overwrites ``self.x_train``, ``self.x_test``, ``self.y_train``,
        ``self.y_test`` and ``self.y_pred`` on every window.
    """
    print("\t\tRolling Window Validation Results:")

    # TODO: Hide the STDOUT of pp.split() and __fit_model(), and prevent
    # __fit_model() from saving a .pkl on each run
    span = round(window_size * len(data))
    window_count = int((len(data) - span) / step)
    windows = []
    for idx in range(window_count):
        start = idx * step
        windows.append(data.loc[start:start + span])

    decoupled_windows = []
    for window in windows:
        decoupled_windows.append(
            pp.split(window, test_size=test_size, balanced=False))

    results = {"accuracy": [], "precision": [], "f1": [], "recall": []}

    for x_train, x_test, y_train, y_test in decoupled_windows:
        self.x_train, self.x_test, self.y_train, self.y_test = (
            x_train, x_test, y_train, y_test)

        self.__fit_model()
        self.y_pred = self.model.predict(self.x_test)

        results["accuracy"].append(
            analysis.accuracy(self.y_test, self.y_pred))
        results["precision"].append(
            analysis.precision(self.y_test, self.y_pred, weighted_avg=True))
        results["recall"].append(
            analysis.recall(self.y_test, self.y_pred, weighted_avg=True))
        results["f1"].append(analysis.f1(self.y_test, self.y_pred))

    # (label, metric key) pairs in the order they are reported.
    report = (
        ("\t\t\tAccuracy: ", "accuracy"),
        ("\t\t\tPrecision: ", "precision"),
        ("\t\t\tRecall: ", "recall"),
        ("\t\t\tF1: ", "f1"),
    )
    for label, metric in report:
        scores = results[metric]
        print(label, str(sum(scores) / float(len(scores))))
def main():
    """Run the headline-sentiment and price-model experiments end to end.

    The function works in two stages:

    1. Sentiment stage: load the scored headlines, preprocess them into a
       bag-of-words representation, split them, fit three ``SentimentModel``
       estimators and evaluate each one (confusion matrix + holdout).
    2. Price stage: build the feature table from price and blockchain data
       through a chain of ``pp`` transforms, split it, plot the correlation
       matrix, fit three ``Model`` estimators and evaluate each one
       (confusion matrix, holdout and rolling-window validation).

    Input CSV files are resolved relative to the parent of the current
    working directory.
    """
    print("Fetching data")
    project_root = os.path.dirname(os.getcwd())
    price_data = scraper.fetch_data(project_root + "/data/price_data.csv")
    blockchain_data = scraper.fetch_data(
        project_root + "/data/blockchain_data.csv")
    coindesk_headlines = pd.read_csv(
        project_root + "/data/scored_headlines_sentiment.csv",
        usecols=["Headline", "Sentiment"],
        sep=",")

    # ---- Sentiment analysis ------------------------------------------------
    print("Sentiment Analysis")
    coindesk_headlines, stemmed = pp.sentiment_preprocessing(
        coindesk_headlines)

    # Create bag of words model.
    coindesk_headlines = pp.make_bag_of_words(coindesk_headlines, stemmed)

    x_train, x_test, y_train, y_test = pp.headlines_balanced_split(
        coindesk_headlines, test_size=.2)

    print("\nFitting sentiment models...\n")
    sentiment_sets = {
        "train_set": (x_train, y_train),
        "test_set": (x_test, y_test),
    }
    sentiment_forest = SentimentModel(estimator="RandomForest",
                                      **sentiment_sets)
    sentiment_boost = SentimentModel(estimator="GradientBoosting",
                                     **sentiment_sets)
    sentiment_svc = SentimentModel(estimator="SupportVectorClassifier",
                                   **sentiment_sets)

    print("\nEvaluating sentiment models...\n")
    sentiment_models = (
        ("\tRandom Forest Classifier", sentiment_forest),
        ("\tGradient Boosting Classifier", sentiment_boost),
        ("\tSupport Vector Classifier", sentiment_svc),
    )
    for heading, model in sentiment_models:
        print(heading)
        analysis.plot_cnf_matrix(model.y_pred, model.y_test)
        model.cross_validate(method="Holdout")

    # ---- Price-model preprocessing -----------------------------------------
    print("Preprocessing")
    data = (
        price_data
        .pipe(pp.calculate_indicators)
        # Headlines are currently left out of the merge; the alternative
        # noted by the author is [blockchain_data, coindesk_headlines].
        .pipe(pp.merge_datasets, other_sets=[blockchain_data])
        .pipe(pp.binarize_labels)
        .pipe(pp.fix_null_vals)
        .pipe(pp.add_lag_variables, lag=3)
        .pipe(pp.power_transform)
    )

    x_train, x_test, y_train, y_test = pp.split(data,
                                                test_size=.2,
                                                balanced=True)

    # ---- Exploratory analysis ----------------------------------------------
    print("Analyzing features")
    analysis.plot_corr_matrix(data)

    # ---- Fitting models ----------------------------------------------------
    print("Fitting models")
    model_options = {
        "train_set": (x_train, y_train),
        "test_set": (x_test, y_test),
        "select_features": "RecursiveFE",
        "optimize": OPTIMIZE,
    }
    log_reg = Model(estimator="LogisticRegression", **model_options)
    rand_forest = Model(estimator="RandomForest", **model_options)
    grad_boost = Model(estimator="GradientBoosting", **model_options)

    # ---- Evaluation --------------------------------------------------------
    print("Evaluating")
    # NOTE(review): the holdout method name is spelled "Holdout" for the
    # logistic regression but "holdout" for the other two, exactly as in the
    # original calls -- confirm that cross_validate() is case-insensitive.
    evaluations = (
        ("\tLogistic Regression Estimator", log_reg, "Holdout"),
        ("\tRandom Forest Classifier", rand_forest, "holdout"),
        ("\tGradient Boosting Classifier", grad_boost, "holdout"),
    )
    for heading, model, holdout_method in evaluations:
        print(heading)
        model.plot_cnf_matrix()
        model.cross_validate(method=holdout_method)
        model.cross_validate(method="RollingWindow",
                             data=data,
                             window_size=.9,
                             test_size=.1)
from preprocessing import split, build_monthly_trends
from supply_planning import plan_supply
import argparse

# Seed NumPy's global random generator and show every DataFrame column
# when printing.
np.random.seed(1)
pd.set_option('display.max_columns', None)

# Both input sheets live in the same workbook.
_DEMAND_WORKBOOK_PATH = "../res/Demandv1.1.xlsx"

headcount = pd.read_excel(_DEMAND_WORKBOOK_PATH, 'Headcount')
demand_trend = pd.read_excel(_DEMAND_WORKBOOK_PATH, 'Demand Trend Last year')

monthly_demand_trends, utils = build_monthly_trends(demand_trend)
billable, bench = split(headcount)
employees = (billable, bench)

# Planning parameters (units are not visible in this part of the file --
# TODO confirm against the planning code).
revenue_per_billable_r = 900
cost_per_r = 685
total_bench_budget = 5760000
attrition = 0.2
max_r = 12000
notice_period = 2

# Command-line options.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-br',
    '--billable_revenue',
    help='Revenue that a billable employee brings to the company',
    required=False)
def main():
    """Explore SVC scores on reduced "keynote" samples and log every run.

    The samples are loaded, passed through ``preprocessing.scale_select``,
    reduced with ``tSNE_IO.load_or_reduce``, class-balanced, split into
    train and test parts, and an ``SVC`` is fitted for a range of ``C``
    values. Each fit's score and settings are stored with
    ``logger.store_log_entry``.

    Two hard-coded switches select what runs: the first branch (disabled)
    sweeps over dimensionality / perplexity / learning-rate grids, the second
    branch (enabled) uses the module-level ``PERPLEXITY``, ``LEARNING_RATE``
    and ``EXAGGERATION`` settings.
    """
    names = data_handling.read_names(["keynote"])
    samples, targets, zero_length_indices = data_handling.dataset(names)
    names = np.delete(names, zero_length_indices)

    original_shapes = (np.shape(samples), np.shape(targets), np.shape(names))
    print("Original shapes, samples: {} \t targets : {} \t names: {}".format(
        *original_shapes))

    labels, label_counts = np.unique(targets, return_counts=True)
    print("Class breakdown: {}".format(dict(zip(labels, label_counts))))

    # Samples preprocessing
    samples = preprocessing.scale_select(samples)
    print("Feature selection shapes, samples: {} \t targets : {}\n".format(
        np.shape(samples), np.shape(targets)))

    # Dimensionality reduction and classification exploration (disabled).
    # NOTE(review): in this branch load_or_reduce() is called without the
    # grid values of the surrounding loops, and `targets` / `names` are
    # overwritten on every iteration -- review before re-enabling.
    if False:
        dimensionalities = [100, 150, 170, 200]
        perplexities = [35, 40, 45, 50]
        learning_rates = [500, 800]
        for dimensionality in dimensionalities:
            for perplexity in perplexities:
                for l_r in learning_rates:
                    print(
                        "Dimensionality: {} \t Perplexity: {} \t Learning rate : {}"
                        .format(dimensionality, perplexity, l_r))

                    # Reduction
                    print("Reduction")
                    reduced_samples = tSNE_IO.load_or_reduce(samples)
                    print("Reduced samples shape: {}".format(
                        np.shape(reduced_samples)))

                    # Class balancing: append the targets as a last column
                    # so samples and labels are balanced together.
                    target_column = np.reshape(targets, (len(targets), 1))
                    data = np.hstack((reduced_samples, target_column))
                    data, names = preprocessing.balance_classes(
                        data, names, np.shape(reduced_samples)[1])
                    print("Balanced data shape: {}".format(np.shape(data)))
                    reduced_samples = data[:, :-1]
                    targets = data[:, -1]
                    labels, label_counts = np.unique(targets,
                                                     return_counts=True)
                    print("Class breakdown: {}\n".format(
                        dict(zip(labels, label_counts))))

                    # Split in train and test
                    train, test = preprocessing.split(reduced_samples,
                                                      targets, 0.15)
                    train_x, train_y = train[:, :-1], train[:, -1]
                    test_x, test_y = test[:, :-1], test[:, -1]

                    for c in np.arange(.1, .9, .1):
                        # SVM fitting
                        classifier = SVC(C=c,
                                         class_weight='balanced',
                                         verbose=0,
                                         probability=True)
                        classifier.fit(train_x, train_y)
                        score = classifier.score(test_x, test_y)
                        svc_params = classifier.get_params()
                        print(
                            "SVC score of fit on case study data: {}, with C: {}"
                            .format(score, svc_params["C"]))

                        parameters = {
                            "algo": "SVC",
                            "kernel": svc_params["kernel"],
                            "dataset": "keynote",
                            "tsne_perplexity": perplexity,
                            "tsne_learning_rate": l_r,
                            "tnse_dimensionality": dimensionality,
                            "svc_c": c,
                            "score": score,
                        }
                        logger.store_log_entry(
                            parameters,
                            "keynote_supervised_exploration_log.json")
                    print()

    # Assume dimensionality reduction, classification exploration (enabled).
    if True:
        # Reduction
        print("Reduction")
        reduced_samples = tSNE_IO.load_or_reduce(samples, PERPLEXITY,
                                                 LEARNING_RATE, EXAGGERATION)
        print("Reduced samples shape: {}".format(np.shape(reduced_samples)))

        # Class balancing: append the targets as a last column so samples
        # and labels are balanced together.
        target_column = np.reshape(targets, (len(targets), 1))
        data = np.hstack((reduced_samples, target_column))
        data, names = preprocessing.balance_classes(
            data, names, np.shape(reduced_samples)[1])
        print("Balanced data shape: {}".format(np.shape(data)))
        reduced_samples = data[:, :-1]
        targets = data[:, -1]
        labels, label_counts = np.unique(targets, return_counts=True)
        print("Class breakdown: {}\n".format(dict(zip(labels, label_counts))))

        # Split in train and test
        train, test = preprocessing.split(reduced_samples, targets, 0.15)
        train_x, train_y = train[:, :-1], train[:, -1]
        test_x, test_y = test[:, :-1], test[:, -1]

        for c in np.arange(.1, 1, .1):
            # SVM fitting
            classifier = SVC(C=c,
                             class_weight='balanced',
                             verbose=0,
                             probability=True)
            classifier.fit(train_x, train_y)
            score = classifier.score(test_x, test_y)
            svc_params = classifier.get_params()
            print("SVC score of fit on case study data: {}, with C: {}".format(
                score, svc_params["C"]))

            parameters = {
                "algo": "SVC",
                "kernel": svc_params["kernel"],
                "dataset": "keynote",
                "tsne_perplexity": PERPLEXITY,
                "tsne_learning_rate": LEARNING_RATE,
                "tnse_dimensionality": 2,
                "svc_c": c,
                "score": score,
            }
            logger.store_log_entry(parameters,
                                   "keynote_supervised_exploration_log.json")
print('test data SVD finishes !\n')
# NOTE(review): time.clock() was removed in Python 3.8. It is left untouched
# here because `two` is taken outside this section and both readings must
# come from the same clock -- migrate the whole file to time.perf_counter().
three = time.clock()
print('time of SVD = %s' % (three - two))

# Persist V under the working directory; an earlier version loaded it from
# a fixed location instead:
#   V_address = 'C:/Users/Chaomin/Desktop/data_mining/data/all_V_1000.npy'
#   V_1000 = np.load((V_address))
V_address = cwd + '/data/temp/all_V_1000'
np.save(V_address, V_1000)

# Second reading: this one also includes the time spent saving V.
three = time.clock()
print('time of SVD = %s \n' % (three - two))

four = time.clock()
print('Spliting data to test_data and training data:\n')
X_train_ay, X_test_ay, X_train_index, X_test_index = preprocessing.split(
    X_data_matrix, 0.2, 1)
# np.shape(X_train_ay) == (80,13628)
# np.shape(X_test_ay) == (20, 13628)
# The first column of X_train_ay and X_test_ay is the app_name, and the last
# column is the app_label
m_test, n_test = np.shape(X_test_ay)
m_train, n_train = np.shape(X_train_ay)

# Last column: labels.
X_test_label = X_test_ay[:, -1]
X_train_label = X_train_ay[:, -1]

# First column: app names.
X_test_appname = X_test_ay[:, 0]
X_train_appname = X_train_ay[:, 0]

# Everything in between: the raw feature columns.
X_raw_test = X_test_ay[:, 1:-1]
X_raw_train = X_train_ay[:, 1:-1]

X_raw_test_address = cwd + '/data/temp/X_raw_test'
np.save(X_raw_test_address, X_raw_test)
X_test_label_address = cwd + '/data/temp/X_test_label'
os.path.dirname(os.getcwd()) + "/data/price_data.csv") blockchain_data = scraper.fetch_data( os.path.dirname(os.getcwd()) + "/data/blockchain_data.csv") #coindesk_headlines = pd.read_csv(os.path.dirname(os.getcwd()) + "/data/test_scores.csv", sep=",") # Preprocessing # print("Preprocessing") data = ( price_data.pipe(pp.calculate_indicators).pipe( pp.merge_datasets, other_sets=[blockchain_data]) # [blockchain_data, coindesk_headlines] .pipe(pp.binarize_labels).pipe(pp.fix_null_vals).pipe( pp.add_lag_variables, lag=3).pipe(pp.power_transform)) x_train, x_test, y_train, y_test = pp.split(data, test_size=.2, balanced=True) # Exploratory Analysis # print("Analyzing features") #print(data.describe()) analysis.plot_corr_matrix(data) # Fitting Models # print("Fitting models") log_reg = Model(estimator="LogisticRegression", train_set=(x_train, y_train), test_set=(x_test, y_test),