def main(path):
    # read raw data
    churn_raw_data = pd.read_csv(path)
    # preprocessing data
    X_train, X_test, y_train, y_test = data_preprocess.preprocess(churn_raw_data)
    print(X_train.shape)
def run(max_epoch=1, nfolds=10, batch_size=128):
    if os.path.isfile('../Data/data.csv') and os.path.isfile('../Data/max.json'):
        df = pd.read_csv('../Data/data.csv', engine='python', header=None)
        # drop rows with missing values (assign the result back; dropna is not in-place)
        df = df.dropna(axis=0, how='any')
        X, y = df.iloc[:, 0:-1].values, df.iloc[:, -1].values
        with open('../Data/max.json', 'r') as file:
            js = file.read()
        max_dict = json.loads(js)
        max_features = max_dict['feature']
        max_len = max_dict['len']
    else:
        X, y, max_features, max_len = data_preprocess.preprocess()

    labels = ['bad' if x == 0 else 'good' for x in y]
    final_data = []

    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            X, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features, max_len)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)
            t_probs = model.predict_proba(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)
            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep
                probs = model.predict_proba(X_test)
                out_data = {'y': y_test,
                            'labels': label_test,
                            'probs': probs,
                            'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}
                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                # stop if no improvement for more than 2 epochs
                if (ep - best_iter) > 2:
                    break

        model.save('../Model/model.h5')
        model.save_weights('../Model/model_weights.h5')
        final_data.append(out_data)

    pickle.dump(final_data, open('../Model/results.pkl', 'wb'))
    return final_data
def __init__(self):
    print("Initializing Data Loader...")
    # Load dataframe and process it
    self.dataframe = dv.preprocess(
        os.path.join(os.getcwd(), "shuffled-full-set-hashed.csv"))
    print("Load Complete")
    # Create train and test sets
    # self.X_train, self.X_test, self.y_train, self.y_test = dv.create_train_test(self.dataframe)
    print("Created Train(90%) and Test(10%) Stratified splits")
def start(self):
    '''
    Reads and merges the datasets, then populates the instance
    attributes used for training and testing.
    '''
    cdf = prepare_crime_data()
    self.hdf = prepare_housing_data()
    self.cdf = preprocess(cdf)
    mdf = merge_datasets(self.cdf, self.hdf)
    self.train = mdf.drop(['Mean_Price'], axis=1)
    self.target = mdf.Mean_Price
    return
def train():
    # read the data set
    scallop = pd.read_csv(file_dict["data_file"],
                          usecols=["longitude", "latitude", "tot.catch"])
    data = scallop.copy()
    data["tot.catch"] = np.log(scallop["tot.catch"] + 1)

    # split data set
    random.seed(0)
    scdata = preprocess(long_block_num=10, lat_block_num=10)
    scdata.split_block_data(data, plot_block=True, save_file=file_dict["plot1"])
    scdata.generate_posterior_plot_data()
    gx, gy, X_new = scdata.generate_suface_data()

    scmodel = GP_model(dim=2, x=scdata.X_2D_train, y=scdata.Y_2D_train)
    scmodel.train(niter=1000)
    scmodel.plot_trace(save_file=file_dict["plot3"])
    scmodel.print_summary(save_file=file_dict["plot2"])

    mu, sd, _ = scmodel.predict_GP(X_new, pred_noise=True, samples=50,
                                   pred_name="surface1")
    plot_gp_2D(gx, gy, mu, sd,
               scdata.X_2D_train, scdata.Y_2D_train,
               scdata.X_2D_test, scdata.X_2D_test,
               save_file=file_dict["plot4"])
    scmodel.plot_range(scdata.long_new, scdata.lat_new, save_file=file_dict["plot5"])

    mu, sd, _ = scmodel.predict_GP(scdata.X_2D_test, pred_noise=True, samples=500,
                                   pred_name="test")
    # RMSE
    print(RMSE(mu, scdata.Y_2D_test))
def load_data(sz):
    """
    Load the dataset, either from the source files or from a pre-prepared
    compressed numpy array. If the pre-prepared file does not exist, create it.
    """
    img_filez = config.datafile("train_data_{}_{}_{}.npz".format(config.category, sz, config.num_images))
    if not os.path.isfile(img_filez):
        print("Creating {} file for faster processing...".format(img_filez))
        print("Genre: {}, # of images = {}".format(config.category, config.num_images))
        final_images_stacked = preprocess(genre_or_style=config.category,
                                          min_vals=[sz, sz], n=config.num_images)
        np.savez_compressed(file=img_filez, a=final_images_stacked)
    else:
        print("Load preprocessed image data from {}".format(img_filez))
        final_images_stacked = np.load(file=img_filez)["a"]
    return final_images_stacked
def main():
    test_mini_batch_gd = True
    test_numerical_grad = False

    # Data parameters
    train_size = 60000
    test_size = 10000
    val_frac = 0.1
    train_data, val_data, test_data, idx_to_class, class_to_idx = data.preprocess(
        train_size, test_size, val_frac)

    # Model parameters
    layers = [784, 32, 32, 10]
    act_funs = ['ELU', 'tanh', 'softmax']
    loss_fun = 'cross_entropy'
    alpha = 0.5
    mu = 0.4
    T = 80
    xavier = True
    p = 0
    gamma = 0
    network = MLP(layers, act_funs, loss_fun, alpha, mu, T, idx_to_class, xavier, p, gamma)

    if test_mini_batch_gd:
        # Training parameters
        epochs = 50
        batch_size = 128
        shuffle = True
        augment = False
        shift = 2
        stop = 3
        report = True
        log = True
        network.mini_batch_GD(epochs, batch_size, train_data, val_data, test_data,
                              shuffle, augment, shift, stop, report, log)
        network.plot_accuracy()
        network.plot_loss()

    if test_numerical_grad:
        # Numerical grad parameters
        input, output = train_data[0][0], train_data[1][0]
        epsilon = 10**-2
        # pass the sampled target as the label argument
        max_difference_1 = numerical_grad(network, 0, input, output, epsilon)
        max_difference_2 = numerical_grad(network, 1, input, output, epsilon)
def main(path):
    # read raw data
    churn_raw_data = pd.read_csv(path)
    # preprocessing data
    X_train, X_test, y_train, y_test = data_preprocess.preprocess(churn_raw_data)
    # xgboost model training
    xgb_fpr, xgb_tpr, xgb_roc_auc = xgboost_churn.xgboost_churn(X_train, y_train, X_test, y_test)
    # lightGBM model training
    gbm_fpr, gbm_tpr, gbm_roc_auc = lightGBM_churn.lightGBM_churn(X_train, y_train, X_test, y_test)
    # randomForest model training
    rf_fpr, rf_tpr, rf_roc_auc = randomForest_churn.random_forest_churn(X_train, y_train, X_test, y_test)
    # DNN model training
    dnn_fpr, dnn_tpr, dnn_roc_auc = dnn_churn.dnn_churn(X_train, y_train, X_test, y_test)
    # plot ROC
    drawRoc.drawRoc([xgb_fpr, xgb_tpr, xgb_roc_auc],
                    [gbm_fpr, gbm_tpr, gbm_roc_auc],
                    [rf_fpr, rf_tpr, rf_roc_auc],
                    [dnn_fpr, dnn_tpr, dnn_roc_auc])
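# A minimal entry-point sketch for the churn script above. The command-line handling
# is an assumption for illustration only, not part of the original module.
if __name__ == '__main__':
    import sys
    main(sys.argv[1])  # path to the raw churn CSV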
# using ML-KNN
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.model_selection import GridSearchCV

from data_preprocess import preprocess

features_train, features_test, labels_train, labels_test = preprocess()

features_train = lil_matrix(features_train).toarray()
labels_train = lil_matrix(labels_train).toarray()
features_test = lil_matrix(features_test).toarray()

parameters = {'k': [3, 5, 7]}
grid_clf = GridSearchCV(MLkNN(), parameters, scoring='accuracy', cv=3)
grid_clf.fit(features_train, labels_train)

print("Best parameters:\n", grid_clf.best_params_)
# accuracy
print("Accuracy = ", grid_clf.best_score_)
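# Hedged follow-up sketch: evaluate the tuned MLkNN on the held-out split loaded above.
# Assumes labels_test is a binary indicator matrix like labels_train; the metric choice
# here is illustrative, not the project's own reporting.
from sklearn.metrics import accuracy_score, hamming_loss

labels_test = lil_matrix(labels_test).toarray()
pred_test = grid_clf.best_estimator_.predict(features_test)
print("Subset accuracy =", accuracy_score(labels_test, pred_test.toarray()))
print("Hamming loss =", hamming_loss(labels_test, pred_test.toarray()))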
def main():
    ### Loading TRAIN Data
    df_train = load_df("train_v2.csv", 10000)
    ### Loading TEST Data
    df_test = load_df('test_v2.csv', 10000)

    ### Fix outliers that do not fall within +/-3 std dev from mean of log transaction value
    # create a new dummy column logTransaction which is the log of all totalTransactionRevenue
    df_train['logTransaction'] = df_train['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_train.logTransaction.std()
    mean_val = df_train.logTransaction.mean()
    df_train['logTransaction'] = np.where(np.abs(df_train.logTransaction - mean_val) > 3 * std_dev,
                                          3 * std_dev, df_train.logTransaction)

    # create a new dummy column logTransaction which is the log of all totalTransactionRevenue
    df_test['logTransaction'] = df_test['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_test.logTransaction.std()
    mean_val = df_test.logTransaction.mean()
    df_test['logTransaction'] = np.where(np.abs(df_test.logTransaction - mean_val) > 3 * std_dev,
                                         3 * std_dev, df_test.logTransaction)

    ### Extract Labels
    y = df_train['logTransaction']
    # Get true values from test set
    y_true = df_test['logTransaction']

    ### Removing unnecessary columns
    # columns that contain no data
    ones = unique_valued_cols(df_train)
    cols_to_remove = [x for x in ones
                      if set(df_train[x].unique()) == set(['not available in demo dataset'])]
    cols_to_remove.extend(['hits', 'customDimensions'])
    # Drop them from both the sets
    df_train = drop_cols(df_train, list(cols_to_remove))
    df_test = drop_cols(df_test, list(cols_to_remove))

    # Remove transaction related columns
    transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue',
                        'totals.transactions', 'fullVisitorId', 'logTransaction']
    df_train = drop_cols(df_train, transaction_cols)
    df_test = drop_cols(df_test, transaction_cols)

    # Remove extra column in training
    # df_train = df_train.drop('trafficSource.campaignCode', axis=1)

    ### Preprocess the data before we start training
    df_train = preprocess(df_train)
    df_test = preprocess(df_test)

    ### Create categorical and numeric features dataframe
    df_categorical = df_train.select_dtypes(include=['object'])
    df_categorical_test = df_test.select_dtypes(include=['object'])
    # Numeric
    df_numeric = df_train.select_dtypes(include=['float64', 'int64'])
    df_numeric_test = df_test.select_dtypes(include=['float64', 'int64'])

    # Label encoding on categorical
    df_categorical = label_encoding(df_categorical)
    df_categorical_test = label_encoding(df_categorical_test)

    # Predict on categorical
    lm = LinearRegression()
    categorize_model, RMSE_test, RMSE_train = train_and_predict(lm, df_categorical,
                                                                df_categorical_test, y, y_true)
    print("In-sample RMSE categorical:", RMSE_train)
    print("Out-sample RMSE categorical:", RMSE_test)
    print("Model parameters: ", categorize_model.get_params())
    joblib.dump(categorize_model, "modl_LR_cat.joblib")
    print('-' * 10)

    # Predict on numerical
    df_numeric = df_numeric.fillna(0)
    df_numeric_test = df_numeric_test.fillna(0)
    num_model, RMSE_test, RMSE_train = train_and_predict(lm, df_numeric, df_numeric_test, y, y_true)
    print("In-sample RMSE numeric:", RMSE_train)
    print("Out-sample RMSE numeric:", RMSE_test)
    print("Model parameters: ", num_model.get_params())
    # save under a distinct filename so the categorical model is not overwritten
    joblib.dump(num_model, "modl_LR_num.joblib")
    print('-' * 10)

    # Predict on all features
    df_train = pd.concat([df_numeric, df_categorical], axis=1)
    df_test = pd.concat([df_numeric_test, df_categorical_test], axis=1)
    full_model, RMSE_test, RMSE_train = train_and_predict(lm, df_train, df_test, y, y_true)
    print("In-sample RMSE:", RMSE_train)
    print("Out-sample RMSE:", RMSE_test)
    print("Model parameters: ", full_model.get_params())
    joblib.dump(full_model, "modl_LR_full.joblib")
    print('-' * 10)
def run(self, df, time_col, predictor_sets, label, start, end, test_window_months,
        outcome_lag_days, output_dir, output_filename, grid_size='test',
        thresholds=[], ks=list(np.arange(0, 1, 0.1)), save_output=True, debug=False):
    '''
    Run the pipeline using temporal cross validation.

    Inputs:
        predictor_sets: list of lists of predictors
    Output:
    '''
    if debug:
        print('START')
        print('GRID SIZE = {}'.format(grid_size))

    # load data
    self.load_clean_data(df)

    # set outcome
    self.label = label

    # set predictors
    self.add_predictor_sets(predictor_sets, reset=True)

    # set parameter grid for classifiers
    self.grid_size = grid_size
    self.set_paramgrid(grid_size)

    # get cutoff times for train test split
    self.get_train_test_times(start, end, test_window_months, outcome_lag_days)

    # initialize run number and results
    N = 0
    model_results = {}

    # initialize output file
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    output_path = os.path.join(output_dir, output_filename)
    headers = ['model_id', 'N_split', 'i', 'label', 'model_type', 'roc_auc', 'k',
               'precision', 'recall', 'accuracy', 'params', 'predictors']
    pd.DataFrame(columns=headers).to_csv(output_path, index=False)

    if debug:
        print("set up done. output: {}".format(output_path))

    # 1) loop over temporal sets
    for train_start, train_end, test_start, test_end in self.train_test_times:
        model_results[N] = {}
        i = 0
        if debug:
            print('## TRAIN: {} - {}, TEST: {} - {} ##'.format(
                str(train_start), str(train_end), str(test_start), str(test_end)))

        # 2) loop over predictor combos
        for predictor_cols in self.predictor_combos:
            if debug:
                print('### Predictors: {}'.format(predictor_cols))

            # train test split
            X_train, y_train, X_test, y_test = self.temporal_split(
                time_col, train_start, train_end, test_start, test_end, predictor_cols)
            if debug:
                print('...train test split done')

            # pre-process training and test sets
            preprocess.preprocess(X_train, y_train)
            preprocess.preprocess(X_test, y_test)
            if debug:
                print('...pre-processing done')

            # generate features
            train_to_concat = [X_train]
            test_to_concat = [X_test]
            for p in predictor_cols:
                dummies_train, dummies_test = self.generate_feature(p, X_train, X_test)
                train_to_concat.append(dummies_train)
                test_to_concat.append(dummies_test)
            X_train = pd.concat(train_to_concat, axis=1)
            X_test = pd.concat(test_to_concat, axis=1)
            X_train.drop(columns=predictor_cols, inplace=True)
            X_test.drop(columns=predictor_cols, inplace=True)
            if debug:
                print('...feature generation done')

            # 3) loop over classifier types
            for model_type, clf in self.clfs.items():
                if debug:
                    print('#### {}-{}: {}'.format(N, i, model_type))

                # 4) loop over parameter combinations
                for params in ParameterGrid(self.paramgrid[model_type]):
                    if debug:
                        print('{}'.format(params))
                    m = self.build_model(clf, X_train, y_train, X_test, y_test, params,
                                         N, i, model_type, predictor_cols, label,
                                         output_dir, output_filename, thresholds, ks,
                                         save_output)
                    model_results[N][i] = m
                    if debug:
                        print('---model results saved---')
                    i += 1
        N += 1

    # store model results
    self.models = model_results
    if debug:
        print('FINISH')
ones = unique_valued_cols(df_test)
cols_to_remove = [x for x in ones
                  if set(df_test[x].unique()) == set(['not available in demo dataset'])]
cols_to_remove.extend(['hits', 'customDimensions', 'device.isMobile'])
# Drop them
df_test = drop_cols(df_test, list(cols_to_remove))

# Remove transaction related columns
transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue',
                    'totals.transactions', 'fullVisitorId', 'logTransaction']
df_test = drop_cols(df_test, transaction_cols)

### Preprocess the data before we start training
# print(df_test.iloc[0])
df_test = df_test.fillna(0)
df_test = label_encoding(df_test)
df_test = preprocess(df_test)

### Get Predictions
pred = mdl.predict(df_test)

vals = []
acc = []
for idx, each in enumerate(list(pred)):
    vals.append((idx, (each - y_true[idx])))
for idx, each in enumerate(list(pred)):
    if float(y_true[idx]) != 0.0:
        acc.append((idx, abs(each - y_true[idx])))
    # Output layer
    logits = layers.fully_connected(avg_cell_out, OUT_DIM, activation_fn=None, scope='output')
    return logits


# Preprocessing
raw_files = os.listdir(DIR_RAW)
label_file = os.listdir(DIR_LABELS)[0]
dp.preprocess(raw_data_files=raw_files, raw_label_file=label_file,
              processed_file_name=PROCESSED_FILE, dir_raw=DIR_RAW,
              dir_label=DIR_LABELS, stride=BATCH_SIZE)

# Training phase setup
global_step = tf.Variable(0, trainable=False)
csvFileList = [DIR_PROCESSED + 'processed_data.csv']
batch_feature, batch_label = input_pipeline(csvFileList, BATCH_SIZE)
logits = deepSense(batch_feature, True, name='deepSense')
predict = logits[1]  # Regression task, 1-dim tensor

batchLoss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=batch_label)
loss = tf.reduce_mean(batchLoss)

# Evaluation phase setup
def cross_val(self):
    # so that we get different metrics used in this custom version
    # preprocess the data
    prep = preprocess(self.fasta_file, self.readout_file)
    # if want mono-nucleotide sequences
    dict = prep.one_hot_encode()
    # if want dinucleotide sequences
    # dict = prep.dinucleotide_encode()

    np.set_printoptions(threshold=sys.maxsize)

    # seed to reproduce results
    seed = random.randint(1, 1000)

    fw_fasta = dict["forward"]
    rc_fasta = dict["reverse"]
    readout = dict["readout"]

    # if self.activation_type == 'linear':
    #     readout = np.log2(readout)
    #     readout = np.ndarray.tolist(readout)

    forward_shuffle, readout_shuffle = shuffle(fw_fasta, readout, random_state=seed)
    reverse_shuffle, readout_shuffle = shuffle(rc_fasta, readout, random_state=seed)
    readout_shuffle = np.array(readout_shuffle)

    # initialize metrics to save values
    metrics = []

    # save the information of 10 folds auc scores
    train_auc_scores = []
    test_auc_scores = []

    # Provides train/test indices to split data in train/test sets.
    kFold = StratifiedKFold(n_splits=10)
    ln = np.zeros(len(readout_shuffle))
    for train, test in kFold.split(ln, ln):
        # build a fresh model for this fold
        # model, model2 = self.create_model()  # variant that returns two models
        model = self.create_model()

        fwd_train = forward_shuffle[train]
        fwd_test = forward_shuffle[test]
        rc_train = reverse_shuffle[train]
        rc_test = reverse_shuffle[test]
        y_train = readout_shuffle[train]
        y_test = readout_shuffle[test]

        # Early stopping
        callback = EarlyStopping(monitor='loss', min_delta=0.001, patience=3, verbose=0,
                                 mode='auto', baseline=None, restore_best_weights=False)
        history = model.fit({'forward': fwd_train, 'reverse': rc_train},
                            y_train,
                            epochs=self.epochs,
                            batch_size=self.batch_size,
                            validation_split=0.0,
                            callbacks=[callback])
        # Without early stopping (alternative):
        # model.fit({'forward': fwd_train, 'reverse': rc_train}, y_train,
        #           epochs=self.epochs, batch_size=self.batch_size, validation_split=0.0)

        # train-set predictions and metrics
        pred_train = model.predict({'forward': fwd_train, 'reverse': rc_train})
        vals = []
        for i in range(len(pred_train)):
            if pred_train[i] < 0.5:
                vals.append(0)
            if pred_train[i] >= 0.5:
                vals.append(1)
        print(y_train[0:10])
        print(vals[0:10])

        true_pred = 0
        false_pred = 0
        for ind in range(len(pred_train)):
            if y_train[ind] == vals[ind]:
                true_pred += 1
            else:
                false_pred += 1
        print('Total number of train-set predictions is: ' + str(len(y_train)))
        print('Number of correct train-set predictions is: ' + str(true_pred))
        print('Number of incorrect train-set predictions is: ' + str(false_pred))

        auc_score = sklearn.metrics.roc_auc_score(y_train, pred_train)
        print('train-set auc score is: ' + str(auc_score))
        print('train-set seed number is: ' + str(seed))
        train_auc_scores.append(auc_score)

        ##########################################################

        # test-set predictions and metrics
        pred = model.predict({'forward': fwd_test, 'reverse': rc_test})
        vals = []
        for i in range(len(pred)):
            if pred[i] < 0.5:
                vals.append(0)
            if pred[i] >= 0.5:
                vals.append(1)

        true_pred = 0
        false_pred = 0
        for ind in range(len(y_test)):
            if y_test[ind] == vals[ind]:
                true_pred += 1
            else:
                false_pred += 1
        print('Total number of test-set predictions is: ' + str(len(y_test)))
        print('Number of correct test-set predictions is: ' + str(true_pred))
        print('Number of incorrect test-set predictions is: ' + str(false_pred))

        auc_score = sklearn.metrics.roc_auc_score(y_test, pred)
        print('test-set auc score is: ' + str(auc_score))
        print('test-set seed number is: ' + str(seed))
        test_auc_scores.append(auc_score)

    print('seed number = %d' % seed)
    print(train_auc_scores)
    print('Mean train auc_scores of 10-fold cv is ' + str(np.mean(train_auc_scores)))
    print(test_auc_scores)
    print('Mean test auc_scores of 10-fold cv is ' + str(np.mean(test_auc_scores)))
def eval(self):
    prep = preprocess(self.fasta_file, self.readout_file)
    # if want mono-nucleotide sequences
    dict = prep.one_hot_encode()
    # if want dinucleotide sequences
    # dict = prep.dinucleotide_encode()

    # print maximum length without truncation
    np.set_printoptions(threshold=sys.maxsize)

    fw_fasta = dict["forward"]
    rc_fasta = dict["reverse"]
    readout = dict["readout"]

    seed = self.seed  # random.randint(1, 1000)

    x1_train, x1_test, y1_train, y1_test = train_test_split(
        fw_fasta, readout, test_size=0.1, random_state=seed)
    # split for reverse complement sequences
    x2_train, x2_test, y2_train, y2_test = train_test_split(
        rc_fasta, readout, test_size=0.1, random_state=seed)
    # assert x1_test == x2_test
    # assert y1_test == y2_test

    model = self.create_model()

    # change from list to numpy array
    y1_train = np.asarray(y1_train)
    y1_test = np.asarray(y1_test)
    y2_train = np.asarray(y2_train)
    y2_test = np.asarray(y2_test)

    # Copy the original target values for later uses
    y1_train_orig = y1_train.copy()
    y1_test_orig = y1_test.copy()

    # if we want to merge two training dataset
    # comb = np.concatenate((y1_train, y2_train))

    ## Change it to categorical values
    y1_train = keras.utils.to_categorical(y1_train, 2)
    y1_test = keras.utils.to_categorical(y1_test, 2)

    checkpoint = ModelCheckpoint('model-{epoch:03d}-{accuracy:03f}-{val_accuracy:03f}.h5',
                                 verbose=1, monitor='val_loss', save_best_only=True,
                                 mode='auto')

    # train the data
    model.fit({'forward': x1_train, 'reverse': x2_train},
              y1_train,
              epochs=self.epochs,
              batch_size=self.batch_size,
              validation_split=0.1,
              callbacks=[checkpoint])

    ## Save the entire model as a SavedModel.
    ## model.save('my_model')

    # Save weights only: later used in self.filter_importance()
    # model.save_weights('./my_checkpoint')

    # save each convolution learned filters as txt file
    """
    motif_weight = model.get_weights()
    motif_weight = np.asarray(motif_weight[0])
    for i in range(int(self.filters)):
        x = motif_weight[:, :, i]
        berd = np.divide(np.exp(100 * x),
                         np.transpose(np.expand_dims(np.sum(np.exp(100 * x), axis=1), axis=0), [1, 0]))
        np.savetxt(os.path.join('./motif_files', 'filter_num_%d' % i + '.txt'), berd)
    """

    pred_train = model.predict({'forward': x1_train, 'reverse': x2_train})
    # See which label has the highest confidence value
    predictions_train = np.argmax(pred_train, axis=1)

    print(y1_train_orig[0:10])
    print(predictions_train[0:10])

    true_pred = 0
    false_pred = 0
    for count, value in enumerate(predictions_train):
        if y1_train_orig[count] == predictions_train[count]:
            true_pred += 1
        else:
            false_pred += 1
    print('Total number of train-set predictions is: ' + str(len(y1_train_orig)))
    print('Number of correct train-set predictions is: ' + str(true_pred))
    print('Number of incorrect train-set predictions is: ' + str(false_pred))

    # Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    # from prediction scores. Returns AUC.
    auc_score = sklearn.metrics.roc_auc_score(y1_train_orig, predictions_train)
    print('train-set auc score is: ' + str(auc_score))
    print('train-set seed number is: ' + str(seed))

    ##########################################################

    # Apply on test data
    pred_test = model.predict({'forward': x1_test, 'reverse': x2_test})
    # See which label has the highest confidence value
    predictions_test = np.argmax(pred_test, axis=1)

    true_pred = 0
    false_pred = 0
    for count, value in enumerate(predictions_test):
        if y1_test_orig[count] == predictions_test[count]:
            true_pred += 1
        else:
            false_pred += 1
    print('Total number of test-set predictions is: ' + str(len(y1_test_orig)))
    print('Number of correct test-set predictions is: ' + str(true_pred))
    print('Number of incorrect test-set predictions is: ' + str(false_pred))

    auc_score = sklearn.metrics.roc_auc_score(y1_test_orig, predictions_test)
    print('test-set auc score is: ' + str(auc_score))
    print('test-set seed number is: ' + str(seed))
tr_dat = sys.argv[1]

conditions = (os.path.isfile("./data/preprocessed_train_pasges(" + tr_dat + ").npy")
              and os.path.isfile("./data/preprocessed_train_queries(" + tr_dat + ").npy")
              and os.path.isfile("./data/preprocessed_test_queries(" + tr_dat + ").npy")
              and os.path.isfile("./data/preprocessed_test_pasges(" + tr_dat + ").npy")
              and os.path.isfile("./data/embed_word_matrix(" + tr_dat + ").npy")
              and os.path.isfile("./data/labels(" + tr_dat + ").npy")
              and os.path.isfile("./data/embed_char_matrix(" + tr_dat + ").npy"))
if not conditions:
    preprocess(tr_dat)

queries = np.loadtxt("./data/preprocessed_train_queries(" + tr_dat + ").npy")
pasges = np.loadtxt("./data/preprocessed_train_pasges(" + tr_dat + ").npy")
test_queries = np.loadtxt("./data/preprocessed_test_queries(" + tr_dat + ").npy")
test_pasges = np.loadtxt("./data/preprocessed_test_pasges(" + tr_dat + ").npy")
labels = np.loadtxt("./data/labels(" + tr_dat + ").npy")
embed_word_matrix = np.loadtxt("./data/embed_word_matrix(" + tr_dat + ").npy")
embed_char_matrix = np.loadtxt("./data/embed_char_matrix(" + tr_dat + ").npy")

df = pd.read_csv("./data/eval1_unlabelled.tsv", sep="\t", header=None)  # read dummy .tsv file into memory
test_data = df.values
# ====== End of Loading Data =======
def main():
    ### Loading TRAIN Data
    df_train = load_df("train_v2.csv")
    ### Loading TEST Data
    df_test = load_df('test_v2.csv')

    ### Fix outliers that do not fall within +/-3 std dev from mean of log transaction value
    # create a new dummy column logTransaction which is the log of all totalTransactionRevenue
    df_train['logTransaction'] = df_train['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_train.logTransaction.std()
    mean_val = df_train.logTransaction.mean()
    df_train['logTransaction'] = np.where(np.abs(df_train.logTransaction - mean_val) > 3 * std_dev,
                                          3 * std_dev, df_train.logTransaction)

    # create a new dummy column logTransaction which is the log of all totalTransactionRevenue
    df_test['logTransaction'] = df_test['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_test.logTransaction.std()
    mean_val = df_test.logTransaction.mean()
    df_test['logTransaction'] = np.where(np.abs(df_test.logTransaction - mean_val) > 3 * std_dev,
                                         3 * std_dev, df_test.logTransaction)

    ### Extract Labels
    y = df_train['logTransaction']
    # Get true values from test set
    y_true = df_test['logTransaction']

    ### Removing unnecessary columns
    # columns that contain no data
    ones = unique_valued_cols(df_train)
    cols_to_remove = [x for x in ones
                      if set(df_train[x].unique()) == set(['not available in demo dataset'])]
    # extend (not append) so the individual column names are added
    cols_to_remove.extend(['hits', 'customDimensions'])
    # Drop them
    df_train = drop_cols(df_train, list(cols_to_remove))
    df_test = drop_cols(df_test, list(cols_to_remove))

    # Remove transaction related columns
    transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue',
                        'totals.transactions', 'fullVisitorId', 'logTransaction']
    df_train = drop_cols(df_train, transaction_cols)
    df_test = drop_cols(df_test, transaction_cols)

    # Remove extra column in training
    df_train = df_train.drop('trafficSource.campaignCode', axis=1)

    ### Preprocess the data before we start training
    df_train = preprocess(df_train)
    df_test = preprocess(df_test)

    ### Create categorical and numeric features dataframe
    df_categorical = df_train.select_dtypes(include=['object'])
    df_categorical_test = df_test.select_dtypes(include=['object'])
    # Numeric
    df_numeric = df_train.select_dtypes(include=['float64', 'int64'])
    df_numeric_test = df_test.select_dtypes(include=['float64', 'int64'])

    # Label encoding
    df_categorical = label_encoding(df_categorical)
    df_categorical_test = label_encoding(df_categorical_test)

    ### Training and Predictions

    ################### Categorical ###############
    reg_tree = tree.DecisionTreeRegressor()
    model_cat, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_categorical,
                                                         df_categorical_test, y, y_true)
    for idx, each in enumerate(df_categorical.columns):
        print(idx, each)
    for idx, each in enumerate(model_cat.feature_importances_):
        print(idx, each * 1e5)
    print('-' * 10)
    print('\n')
    print("-- Scores for Categorical --")
    print("RMSE on test: ", RMSE_test)
    print("RMSE on train: ", RMSE_train)
    print('\n\n')
    print('-- Getting list of columns for categorical model --\n')
    for each in df_categorical.columns:
        print(each)
    for imp in model_cat.feature_importances_:
        print(imp)
    # Save Categorical model
    joblib.dump(model_cat, "modl_DT_cat.joblib")

    ###################### Numerical #####################
    df_numeric = df_numeric.fillna(0)
    df_numeric_test = df_numeric_test.fillna(0)
    model_num, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_numeric,
                                                         df_numeric_test, y, y_true)
    for idx, each in enumerate(df_numeric.columns):
        print(idx, each)
    for idx, each in enumerate(model_num.feature_importances_):
        print(idx, each * 1e5)
    print('-' * 10)
    print('\n')
    print("-- Scores for Numerical --")
    print("RMSE on test: ", RMSE_test)
    print("RMSE on train: ", RMSE_train)
    print('\n\n')
    print('-- Getting list of columns for Numerical Model --\n')
    for each in df_numeric.columns:
        print(each)
    for imp in model_num.feature_importances_:
        print(imp)
    # Save Numerical model
    joblib.dump(model_num, "modl_DT_num.joblib")

    ###################### Full #####################
    df_train = pd.concat([df_numeric, df_categorical], axis=1)
    df_test = pd.concat([df_numeric_test, df_categorical_test], axis=1)
    model_full, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_train, df_test, y, y_true)
    for idx, each in enumerate(df_train.columns):
        print(idx, each)
    for idx, each in enumerate(model_full.feature_importances_):
        print(idx, each * 1e5)
    print('-' * 10)
    print('\n')
    print("-- Scores for Full --")
    print("RMSE on test: ", RMSE_test)
    print("RMSE on train: ", RMSE_train)
    print('\n\n')
    print('-- Getting list of columns for Full model --\n')
    for each in df_train.columns:
        print(each)
    for imp in model_full.feature_importances_:
        print(imp)
    # Save full model
    joblib.dump(model_full, "modl_DT_full.joblib")
train_summary_writer = tf.summary.FileWriter(log_dir + '/train', sess.graph)
valid_summary_writer = tf.summary.FileWriter(log_dir + '/valid', sess.graph)

# -------------------------------------------------------------
for epoch in range(1, 1 + 30):
    pi = np.random.permutation(len(train_images))
    train_data, train_labels = train_images[pi], train_cords[pi]
    t0 = tm.time()
    for i in range(batch_count):
        start = i * cg.batch_size
        end = (i + 1) * cg.batch_size
        t1 = tm.time()
        input, label = preprocess(train_data[start:end], train_labels[start:end],
                                  cg.crop_size, 8)  # 8*n
        train_res = sess.run([train_step, loss, merged_summary_op],
                             feed_dict={x: input, y: label, is_training: True})
        if i % 200 == 0 or i < 3:
            train_summary_writer.add_summary(train_res[2], epoch * all_count + i)
            if epoch <= 5:
                print('Epoch: %d--Iter: %d--Train_loss: %.3f' % (epoch, i, train_res[1]))
    # -------------------------------------------------------------
    if epoch > 5:
def create_model(self):
    # different metric functions
    def coeff_determination(y_true, y_pred):
        SS_res = K.sum(K.square(y_true - y_pred))
        SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
        return (1 - SS_res / (SS_tot + K.epsilon()))

    def auroc(y_true, y_pred):
        return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

    # building model
    prep = preprocess(self.fasta_file, self.readout_file)
    # if want mono-nucleotide sequences
    dict = prep.one_hot_encode()
    # if want dinucleotide sequences
    # dict = prep.dinucleotide_encode()

    readout = dict["readout"]
    fw_fasta = dict["forward"]
    rc_fasta = dict["reverse"]

    dim_num = fw_fasta.shape

    # To build this model with the functional API,
    # you would start by creating an input node:
    forward = keras.Input(shape=(dim_num[1], dim_num[2]), name='forward')
    reverse = keras.Input(shape=(dim_num[1], dim_num[2]), name='reverse')

    # first_layer = Conv1D(filters=self.filters, kernel_size=self.kernel_size,
    #                      data_format='channels_last',
    #                      input_shape=(dim_num[1], dim_num[2]), use_bias=False)
    ## with trainable = False
    # first_layer = Conv1D(filters=self.filters, kernel_size=self.kernel_size,
    #                      kernel_initializer=my_init, data_format='channels_last',
    #                      input_shape=(dim_num[1], dim_num[2]), use_bias=False,
    #                      trainable=False)
    first_layer = ConvolutionLayer(filters=self.filters, kernel_size=self.kernel_size,
                                   data_format='channels_last', use_bias=True,
                                   alpha=self.alpha)

    fw = first_layer(forward)
    bw = first_layer(reverse)

    concat = concatenate([fw, bw], axis=1)
    pool_size_input = concat.shape[1]
    # concat = ReLU()(concat)
    # concat = Dense(1, activation='sigmoid')(concat)

    if self.pool_type == 'Max':
        pool_layer = MaxPooling1D(pool_size=pool_size_input)(concat)
    elif self.pool_type == 'Ave':
        pool_layer = AveragePooling1D(pool_size=pool_size_input)(concat)
    elif self.pool_type == 'custom':
        def out_shape(input_shape):
            shape = list(input_shape)
            print(input_shape)
            shape[0] = 10
            return tuple(shape)
        # model.add(Lambda(top_k, arguments={'k': 10}))

        def top_k(inputs, k):
            # tf.nn.top_k finds values and indices of the k largest entries
            # for the last dimension
            print(inputs.shape)
            inputs2 = tf.transpose(inputs, [0, 2, 1])
            new_vals = tf.nn.top_k(inputs2, k=k, sorted=True).values
            # transform back to (None, 10, 512)
            return tf.transpose(new_vals, [0, 2, 1])

        pool_layer = Lambda(top_k, arguments={'k': 2})(concat_relu)
        pool_layer = AveragePooling1D(pool_size=2)(pool_layer)
    elif self.pool_type == 'custom_sum':
        ## apply relu function before custom_sum functions
        def summed_up(inputs):
            # nonzero_vals = tf.keras.backend.relu(inputs)
            new_vals = tf.math.reduce_sum(inputs, axis=1, keepdims=True)
            return new_vals
        pool_layer = Lambda(summed_up)(concat_relu)
    else:
        raise NameError('Set the pooling layer name correctly')

    flat = Flatten()(pool_layer)
    after_flat = Dense(32)(flat)

    # Binary classification with 2 output neurons
    if self.regularizer == 'L_1':
        # outputs = Dense(1, kernel_initializer='normal',
        #                 kernel_regularizer=regularizers.l1(0.001),
        #                 activation=self.activation_type)(flat)
        ## trainable = False with learned bias
        # outputs = Dense(1, kernel_initializer='normal',
        #                 kernel_regularizer=regularizers.l1(0.001),
        #                 activation=self.activation_type)(after_flat)
        outputs = Dense(2, kernel_initializer='normal',
                        kernel_regularizer=regularizers.l1(0.001),
                        activation='sigmoid')(after_flat)
    elif self.regularizer == 'L_2':
        # outputs = Dense(1, kernel_initializer='normal',
        #                 kernel_regularizer=regularizers.l1(0.001),
        #                 activation=self.activation_type)(flat)
        ## trainable = False with learned bias
        outputs = Dense(2, kernel_initializer='normal',
                        kernel_regularizer=regularizers.l2(0.001),
                        activation=self.activation_type)(after_flat)
    else:
        raise NameError('Set the regularizer name correctly')

    # weight_forwardin_0 = model.layers[0].get_weights()[0]
    # print(weight_forwardin_0)
    model = keras.Model(inputs=[forward, reverse], outputs=outputs)

    # print model summary
    model.summary()

    # model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', auroc])

    return model
parser.add_argument('--train_data', type=str, default='data/corpus.txt', help='train data source')
parser.add_argument('--alg', type=str, default='sklearn', help='sklearn/self')
parser.add_argument('--topic', type=int, default=10, help='topic num')
parser.add_argument('--iter', type=int, default=100, help='training iter')
parser.add_argument('--n_top_words', type=int, default=10, help='topic word num')
args = parser.parse_args()

if not os.path.exists(args.train_data):
    preprocess()
corpus = load_data(args.train_data)

tf_vectorizer = CountVectorizer(max_df=0.50, min_df=5, max_features=2000)
tf = tf_vectorizer.fit_transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()

if args.alg == 'sklearn':
    model = LatentDirichletAllocation(n_components=args.topic, max_iter=args.iter,
                                      learning_method='batch', n_jobs=-1)
    print("Begin training.")
    model.fit(tf)
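# Hedged sketch: the --n_top_words argument parsed above is one natural way to report the
# fitted topics. This inspection loop is illustrative, assumes the 'sklearn' branch was taken,
# and uses model.components_ (topic-word weights) with the vectorizer vocabulary built above.
    for topic_idx, topic in enumerate(model.components_):
        top_idx = topic.argsort()[:-args.n_top_words - 1:-1]
        print("Topic %d: %s" % (topic_idx, " ".join(tf_feature_names[i] for i in top_idx)))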
y = df_train['logTransaction']

# Remove columns that contain no data
ones = unique_valued_cols(df_train)
cols_to_remove = [x for x in ones
                  if set(df_train[x].unique()) == set(['not available in demo dataset'])]
df_train = df_train.drop(cols_to_remove, axis=1)

# Remove transaction related columns
transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue',
                    'totals.transactions', 'fullVisitorId', 'logTransaction']
df_train = drop_cols(df_train, transaction_cols)

# Remove extra column in training
df_train = df_train.drop('trafficSource.campaignCode', axis=1)

### Preprocess the data before we start training
df_train = preprocess(df_train)

# Get the categorical variables
df_categorical = df_train.select_dtypes(include=['object'])
# add logTransaction (dependent variable) column
df_categorical['logTransaction'] = y
# delete train set as we don't need it anymore
del df_train

# Initialize ChiSquare Class
cT = ChiSquare(df_categorical)

# Test independence of categorical variables on logTransaction - that we are predicting.
# print which variables are important and which ones are not important
def filter_importance(self):
    prep = preprocess(self.fasta_file, self.readout_file)
    # if want mono-nucleotide sequences
    dict = prep.one_hot_encode()
    # if want dinucleotide sequences
    # dict = prep.dinucleotide_encode()

    # print maximum length without truncation
    np.set_printoptions(threshold=sys.maxsize)

    fw_fasta = dict["forward"]
    rc_fasta = dict["reverse"]
    readout = dict["readout"]

    seed = self.seed  # random.randint(1, 1000)

    x1_train, x1_test, y1_train, y1_test = train_test_split(
        fw_fasta, readout, test_size=0.1, random_state=seed)
    # split for reverse complement sequences
    x2_train, x2_test, y2_train, y2_test = train_test_split(
        rc_fasta, readout, test_size=0.1, random_state=seed)
    # assert x1_test == x2_test
    # assert y1_test == y2_test

    model = self.create_model()

    # change from list to numpy array
    y1_train = np.asarray(y1_train)
    y1_test = np.asarray(y1_test)
    y2_train = np.asarray(y2_train)
    y2_test = np.asarray(y2_test)

    # Copy the original target values for later uses
    y1_train_orig = y1_train.copy()
    y1_test_orig = y1_test.copy()

    # if we want to merge two training dataset
    # comb = np.concatenate((y1_train, y2_train))

    ## Change it to categorical values
    y1_train = keras.utils.to_categorical(y1_train, 2)
    y1_test = keras.utils.to_categorical(y1_test, 2)

    # Restore the weights
    # weight_dir = './data/E13RACtrlF1_E13RAMutF1_DMR_toppos2000/checkpoint/my_checkpoint'
    weight_dir = '/Users/minjunpark/Documents/MuSeAM/classification/saved_weights/my_checkpoint'
    model.load_weights(weight_dir)

    ####### *******************************
    pred_train = model.predict({'forward': x1_train, 'reverse': x2_train})
    # See which label has the highest confidence value
    predictions_train = np.argmax(pred_train, axis=1)

    print(y1_train_orig[0:10])
    print(predictions_train[0:10])

    true_pred = 0
    false_pred = 0
    for count, value in enumerate(predictions_train):
        if y1_train_orig[count] == predictions_train[count]:
            true_pred += 1
        else:
            false_pred += 1
    print('Total number of train-set predictions is: ' + str(len(y1_train_orig)))
    print('Number of correct train-set predictions is: ' + str(true_pred))
    print('Number of incorrect train-set predictions is: ' + str(false_pred))

    # Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    # from prediction scores. Returns AUC.
    auc_score = sklearn.metrics.roc_auc_score(y1_train_orig, predictions_train)
    print('train-set auc score is: ' + str(auc_score))
    print('train-set seed number is: ' + str(seed))

    ##########################################################

    # Apply on test data
    pred_test = model.predict({'forward': x1_test, 'reverse': x2_test})
    # See which label has the highest confidence value
    predictions_test = np.argmax(pred_test, axis=1)

    true_pred = 0
    false_pred = 0
    for count, value in enumerate(predictions_test):
        if y1_test_orig[count] == predictions_test[count]:
            true_pred += 1
        else:
            false_pred += 1
    print('Total number of test-set predictions is: ' + str(len(y1_test_orig)))
    print('Number of correct test-set predictions is: ' + str(true_pred))
    print('Number of incorrect test-set predictions is: ' + str(false_pred))

    auc_score = sklearn.metrics.roc_auc_score(y1_test_orig, predictions_test)
    print('test-set auc score is: ' + str(auc_score))
    print('test-set seed number is: ' + str(seed))

    sys.exit()
    ####### *******************************

    """
    model.load_weights(weight_dir)
    weights = model.get_weights()
    # Apply on test data
    pred_test = model.predict({'forward': x1_test, 'reverse': x2_test})
    # Sum the absolute difference between y1_test and pred_test
    vals = np.sum(np.absolute(np.subtract(y1_test, pred_test)), axis=1)
    baseline = np.average(vals)
    """

    distances = []
    for i in range(self.filters):
        model.load_weights(weight_dir)
        weights = model.get_weights()
        zeros = np.zeros((12, 4))
        weights[0][:, :, i] = zeros
        model.set_weights(weights)

        ##########################################################
        # Apply on test data
        pred_test = model.predict({'forward': x1_test, 'reverse': x2_test})
        # See which label has the highest confidence value
        vals = np.sum(np.absolute(np.subtract(y1_test, pred_test)), axis=1)
        ave_distance = np.average(vals)
        distances.append(ave_distance)
        print(i)
        print(distances)

    np.savetxt('distances.txt', distances)
def objective(params):
    prep = preprocess(self.fasta_file, self.readout_file)
    # if want mono-nucleotide sequences
    dict = prep.one_hot_encode()
    # if want dinucleotide sequences
    # dict = prep.dinucleotide_encode()

    # print maximum length without truncation
    np.set_printoptions(threshold=sys.maxsize)

    fw_fasta = dict["forward"]
    rc_fasta = dict["reverse"]
    readout = dict["readout"]

    # seed = random.randint(1, 1000)
    seed = self.seed

    x1_train, x1_test, y1_train, y1_test = train_test_split(
        fw_fasta, readout, test_size=0.2, random_state=seed)
    # split for reverse complement sequences
    x2_train, x2_test, y2_train, y2_test = train_test_split(
        rc_fasta, readout, test_size=0.2, random_state=seed)
    # assert x1_test == x2_test
    # assert y1_test == y2_test

    self.filters = params["filters"]
    self.kernel_size = params["kernel_size"]
    self.epochs = params["epochs"]
    self.batch_size = params["batch_size"]
    self.alpha = params["alpha"]
    model = self.create_model()

    # change from list to numpy array
    y1_train = np.asarray(y1_train)
    y1_test = np.asarray(y1_test)
    y2_train = np.asarray(y2_train)
    y2_test = np.asarray(y2_test)

    y1_train_orig = y1_train.copy()
    y1_test_orig = y1_test.copy()

    # if we want to merge two training dataset
    # comb = np.concatenate((y1_train, y2_train))

    ## Change it to categorical values
    y1_train = keras.utils.to_categorical(y1_train, 2)
    y1_test = keras.utils.to_categorical(y1_test, 2)

    # train the data
    model.fit({'forward': x1_train, 'reverse': x2_train},
              y1_train,
              epochs=self.epochs,
              batch_size=self.batch_size,
              validation_split=0.1)

    test_pred = model.predict({'forward': x1_test, 'reverse': x2_test})
    test_pred = np.argmax(test_pred, axis=1)
    auc_score = sklearn.metrics.roc_auc_score(y1_test_orig, test_pred)
    return -auc_score
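# Hedged sketch: objective() returns a negative AUC, which fits a minimizer such as
# hyperopt's fmin. The search space below is illustrative only and not the project's
# actual grid; the value ranges are placeholders.
from hyperopt import fmin, tpe, hp

space = {
    'filters': hp.choice('filters', [64, 128, 256]),
    'kernel_size': hp.choice('kernel_size', [8, 12, 16]),
    'epochs': hp.choice('epochs', [10, 20, 30]),
    'batch_size': hp.choice('batch_size', [32, 64]),
    'alpha': hp.uniform('alpha', 10.0, 150.0),
}
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50)
print(best)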
def main(model, init_train, start_epoch, cycle, epochs, batch_size, save_intervals):
    model = model.upper()
    if model == 'DCGAN_1':
        my_model = DCGAN(name='DCGAN_1')
    elif model == 'DCGAN_2':
        my_model = DCGAN(name='DCGAN_2')
    elif model == 'DCGAN_3':
        my_model = DCGAN(name='DCGAN_3')
    elif model == 'VAE_1':
        my_model = VAE(name='VAE_1')
    elif model == 'VAE_2':
        my_model = VAE(name='VAE_2')
    elif model == 'VAE_3':
        my_model = VAE(name='VAE_3')
    elif model == 'VAE_4':
        my_model = VAE(name='VAE_4')
    else:
        print('The selected model {} is not in the list '
              '[DCGAN_1, DCGAN_2, DCGAN_3, VAE_1, VAE_2, VAE_3, VAE_4]'.format(model))

    print("Python main program for generating images using {}".format(model))

    ## preprocess data images if init_train and save the images as pickle file.
    ## if not init_train, load the saved file
    if init_train:
        print("Start initial process of building the {} model.".format(model))
        print("Do Preprocessing by loading scraped images...")
        ### manually merged into merged_japanese, so take that subdirectory as datapath source:
        if False:
            ## select genre = "yakusha-e"
            image_resized_1 = preprocess(genre_or_style="yakusha-e", min_vals=[128, 128])
            ## select style = "Japanese Art"
            image_resized_2 = preprocess(genre_or_style="Japanese Art", min_vals=[128, 128])
            final_images_stacked = np.vstack((image_resized_1, image_resized_2))
            del image_resized_1, image_resized_2
            gc.collect()
        else:
            final_images_stacked = preprocess(genre_or_style="merged_japanese", min_vals=[128, 128])

        ## save the train data such that in the next intermediate steps the preprocess()
        ## fnc is not needed, rather load the file
        try:
            print("Save preprocessed image data on ../data/train_data.npz in order to "
                  "retrieve in upcoming training cycles.")
            np.savez_compressed(file="../data/train_data.npz", a=final_images_stacked)
        except:
            print("Could not save train data on machine for upcoming training cycles.")
    else:
        try:
            print("Load preprocessed image data from earlier training cycles.")
            final_images_stacked = np.load(file="../data/train_data.npz")["a"]
        except:
            ### manually merged into merged_japanese, so take that subdirectory as datapath source:
            if False:
                ## select genre = "yakusha-e"
                image_resized_1 = preprocess(genre_or_style="yakusha-e", min_vals=[128, 128])
                ## select style = "Japanese Art"
                image_resized_2 = preprocess(genre_or_style="Japanese Art", min_vals=[128, 128])
                final_images_stacked = np.vstack((image_resized_1, image_resized_2))
                del image_resized_1, image_resized_2
                gc.collect()
            else:
                final_images_stacked = preprocess(genre_or_style="merged_japanese", min_vals=[128, 128])

    if init_train:
        print("Start initial training of the {} model:".format(model))
        print("There are {} images provided for training".format(len(final_images_stacked)))
        my_model.train(data=final_images_stacked, epochs=epochs, batch_size=batch_size,
                       save_intervals=save_intervals, init_train=init_train,
                       start_epoch=start_epoch, cycle=cycle)
    else:
        if model in ['DCGAN_1', 'DCGAN_2', 'DCGAN_3']:
            print("Using last epoch {} of generator and discriminator for the stacked {} model:"
                  .format(start_epoch, model))
            generator_weights = "../model/{}/epoch_{}_generator.h5".format(model, start_epoch)
            discrimininator_weights = "../model/{}/epoch_{}_discriminator.h5".format(model, start_epoch)
            # load generator weights
            my_model.generator.load_weights(filepath=generator_weights)
            # load discriminator weights
            my_model.discriminator.load_weights(filepath=discrimininator_weights)
            # train the dcgan with last epoch weights
            print("Training the {} model based on last epoch weights {}.".format(model, start_epoch))
        elif model in ['VAE_1', 'VAE_2', 'VAE_3', 'VAE_4']:
            print("Using last epoch {} of encoder and decoder for the stacked {} model:"
                  .format(start_epoch, model))
            encoder_weights = "../model/{}/epoch_{}_encoder.h5".format(model, start_epoch)
            decoder_weights = "../model/{}/epoch_{}_decoder.h5".format(model, start_epoch)
            vae_weights = "../model/{}/epoch_{}_vae.h5".format(model, start_epoch)
            # load encoder weights
            my_model.encoder.load_weights(filepath=encoder_weights)
            # load decoder weights
            my_model.decoder.load_weights(filepath=decoder_weights)
            # load VAE weights
            my_model.vae.load_weights(filepath=vae_weights)
            # train the VAE with last epoch weights
            print("Training the {} model based on last epoch weights {}.".format(model, start_epoch))
        else:
            print('Selected model {} is not available'.format(model))
        my_model.train(data=final_images_stacked, epochs=epochs, batch_size=batch_size,
                       save_intervals=save_intervals, init_train=init_train,
                       start_epoch=start_epoch, cycle=cycle)
    testing_labels.append(testing_data[files[i]]['label'])
    testing_content.append(testing_data[files[i]]['content'])

print("Finished loading and splitting data for training and testing.\n")

print("Preprocessing data for training.\n")
# removing stop words and lemmatizing
preprocessed_content_training = []
# Bigrams and trigrams
bigrams_content_training = []
trigrams_content_training = []
for i, t in enumerate(training_content):
    words = []
    for word in training_content[i].split(' '):
        words.append(word)
    preprocessed_content_training.append(dp.preprocess(training_content[i]))
print("Finished preprocessing data for training.\n")

# preprocess for bigrams and trigrams
print("Getting bigram for training.\n")
bigrams_content_training, bigram_mod = dp.get_bigrams(preprocessed_content_training)
print("Finished getting bigram for training.\n")

print("Getting trigram for training.\n")
trigrams_content_training, trigram_mod = dp.get_trigrams(bigrams_content_training)
print("Finished getting trigram for training.\n")

print("Getting bow corpus and dictionary for training.\n")
bow_corpus_training, dictionary = dp.get_dictionary_corpus(preprocessed_content_training)
print("Finished getting bow corpus and dictionary for training.\n")

# vectorizer = CountVectorizer()
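# Hedged sketch: with the bag-of-words corpus and dictionary built above, a topic model
# could be fit next. gensim's LdaModel is shown as an illustrative choice only; num_topics,
# passes, and random_state are placeholder values, not the project's settings.
from gensim.models import LdaModel

lda_model = LdaModel(corpus=bow_corpus_training, id2word=dictionary,
                     num_topics=10, passes=5, random_state=0)
for idx, topic in lda_model.print_topics(num_words=10):
    print("Topic {}: {}".format(idx, topic))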