def is_continuous_data(code, code_basic, last_exchange_day):
    print 'entering is_continuous_data', code, code_basic['name']
    isTrue = False
    try:
        data = pd.read_csv(data_const.My_Database_Dir + code + data_const.Suffix)
        date_index_list = [str(u_date) for u_date in data['date']]
        last_index = len(date_index_list) - 1
        last_date = date_index_list[last_index]
        if len(data) != len(data.dropna()):
            raise RuntimeError('data has NaN values or rows that should be dropped')
        if getYesterdayDate() in date_index_list:
            isTrue = True
            data = prep_d.preprocess_data(data, code_basic)
        else:
            last_date_tomorrow = getTomorrowDate(last_date)
            if len(ts.get_k_data(code, last_date_tomorrow, getYesterdayDate())) == 0:
                isTrue = True  # no data between the two dates, so the existing data is continuous
                data = prep_d.preprocess_data(data, code_basic)
    except Exception as e:
        print e, code, 'something wrong in is_continuous_data'
        print 'leaving is_continuous_data', isTrue, 0, None
        return isTrue, 0, None
    if data.loc[last_index]['date'] != last_exchange_day:
        last_index += 1
    print 'leaving is_continuous_data', isTrue, last_index
    return isTrue, last_index, data
def update_all():
    # last_exchange_day = getDatetimeToday().strftime("%Y-%m-%d")
    last_exchange_day = get_last_exchange_day()
    basic_env = pd.read_csv(data_const.My_Store_Dir + 'basic.csv', index_col='code')
    realtime_data = pd.read_csv(data_const.My_Store_Dir + 'realtime_data.csv')
    realtime_data = realtime_data.drop_duplicates(['code'])  # dropping duplicates here is essential
    realtime_data = realtime_data.set_index('code')
    # use 601988.csv (Bank of China) to assess the continuity of the whole CSV store
    # new_code_exist forces a full refresh of all data
    new_code_exist_force = True  # force update
    is_sh_continuous, _, _ = is_continuous_data('601988', basic_env.loc[601988], last_exchange_day)
    if (not new_code_exist_force
            and last_exchange_day != getDatetimeToday().strftime("%Y-%m-%d")
            and last_exchange_day != getDatetimeYesterday().strftime("%Y-%m-%d")
            and is_sh_continuous):
        print 'neither today nor yesterday is a trading day and no forced full update is needed; nothing to update'
        return realtime_data
    if not realtime_data.index.is_unique:
        print 'duplicate index exception'
        raise RuntimeError('realtime_data index is not unique')
    i = 1
    codes = data_const.Whole_codes
    for code in codes:
        print i, '/', len(codes), 'update_data', code
        i += 1
        try:
            code_basic = basic_env.loc[int(code)]
            code_rt_dt = realtime_data.loc[int(code)]
        except:
            print 'code_basic = basic_env.loc[int(code)] error'
            continue
        if code_rt_dt['high'].item() == code_rt_dt['low'].item() and code_rt_dt['open'].item() == 0:
            # handle suspended stocks: re-preprocess and rewrite the cached file
            try:
                data = pd.read_csv(data_const.My_Database_Dir + code + data_const.Suffix)
                data = prep_d.preprocess_data(data, code_basic)
                wanted = pd.DataFrame(data[data_const.Feature])
                wanted.to_csv(data_const.My_Database_Dir + code + data_const.Suffix, index=False)
            except Exception as e:
                print e, code
            continue
        isTrue, last_index, data = is_continuous_data(code, code_basic, last_exchange_day)
        Force_all_renew = True
        if isTrue and not Force_all_renew:
            try:
                new_row = idx.new_data_line(last_exchange_day, data.loc[last_index - 1], code_rt_dt)
                data.loc[last_index - 1, 'review_pc'] = code_rt_dt['changepercent']
            except:
                print 'except in new_row'
                new_row = idx.new_data_line(last_exchange_day, data.loc[last_index], code_rt_dt)
                data.loc[last_index, 'review_pc'] = code_rt_dt['changepercent']
            data = data[data_const.Feature]
            data.loc[last_index] = new_row
            # print data.tail()[['review_pc', 'date', 'p_change']]
        else:
            # data = ts.get_hist_data(code).sort_index().reset_index()  # some stocks occasionally have missing data
            print 'is_continuous_data error, falling back to data = ts.get_k_data(code):', code
            data = ts.get_k_data(code)
        data = prep_d.preprocess_data(data, code_basic)
        wanted = pd.DataFrame(data[data_const.Feature])
        wanted.to_csv(data_const.My_Database_Dir + code + data_const.Suffix, index=False)
    return realtime_data
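# The two functions above rely on several date helpers that are not shown
# (getDatetimeToday, getDatetimeYesterday, getYesterdayDate, getTomorrowDate,
# get_last_exchange_day). A minimal sketch of what the first four might look
# like, assuming the "%Y-%m-%d" format used in update_all(); the real
# implementations (and get_last_exchange_day in particular, which needs a
# trading calendar) may differ.
import datetime


def getDatetimeToday():
    return datetime.datetime.today()


def getDatetimeYesterday():
    return datetime.datetime.today() - datetime.timedelta(days=1)


def getYesterdayDate():
    return getDatetimeYesterday().strftime("%Y-%m-%d")


def getTomorrowDate(date_str):
    # return the day after date_str, keeping the same string format
    day = datetime.datetime.strptime(date_str, "%Y-%m-%d") + datetime.timedelta(days=1)
    return day.strftime("%Y-%m-%d")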
def main():
    test_df, train_df = load_data()
    train_x, train_y, indices_for_masking, pca_instance, scaler_instance = preprocess_data(train_df)
    test_x = preprocess_data(test_df, indices_for_masking, pca_instance, scaler_instance)
    model = create_model(train_x.shape[1], MODEL_TYPE)
    train_model(model, MODEL_TYPE, train_x, train_y)
    predictions = predict_data(model, MODEL_TYPE, test_x)
    submission_file_directory = write_output(predictions)
def get_data(path):
    reviews = preprocess_data.preprocess_data(path)
    cnn_data_train, embedding_weights_train, cnn_data_test, embedding_weights_test = \
        preprocess_data.split_and_tokenize(reviews)
    pos_data_train = combine_data_and_weights(cnn_data_train, embedding_weights_train)
    pos_data_test = combine_data_and_weights(cnn_data_test, embedding_weights_test)
    return pos_data_train, pos_data_test
def train_model():
    """ Trains the model. """
    # Ensure that the data has been processed already.
    preprocess_data()

    # Saves the checkpoint after every epoch.
    checkpoint_path = "checkpoints\\epoch={epoch:02d} acc={acc:.2f} loss={loss:.2f}" \
                      " val_acc={val_acc:.2f} val_loss={val_loss:.2f}.hdf5"
    checkpoints = ModelCheckpoint(checkpoint_path, verbose=True)

    # Saves the checkpoint with the smallest validation loss.
    val_checkpoint = ModelCheckpoint(Path.model, monitor='val_loss', save_best_only=True)

    # Creates the folder: checkpoints.
    if os.path.isdir(os.getcwd() + "\\checkpoints") is False:
        os.mkdir(os.getcwd() + "\\checkpoints")

    model = sentiment_analysis_model()
    steps_per_epoch = (1600000 * 0.8) / Config.batch_size

    # Train the model.
    history = model.fit_generator(generator=training_data_generator(),
                                  steps_per_epoch=steps_per_epoch,
                                  validation_data=testing_data_generator(),
                                  validation_steps=len(dataset),
                                  epochs=10,
                                  verbose=1,
                                  callbacks=[checkpoints, val_checkpoint])
    plot_graph(history)
    model.summary()
def predict():
    model_filename = 'model_titanic_survival.pkl'
    interesting_columns = ['Pclass', 'Sex', 'Age', 'Cabin', 'Embarked']
    median_age = 28
    json_ = request.json
    df = pd.DataFrame(json_)
    processed_df = preprocess_data(df, interesting_columns, median_age)
    features = processed_df.select_dtypes(include='number')
    loaded_model = pickle.load(open(model_filename, 'rb'))
    prediction = loaded_model.predict(features)
    df['prediction'] = prediction
    return str(df.to_json(orient='records', force_ascii=False))
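# A minimal client-side sketch for the Flask handler above. The route name
# ('/predict') and host/port are assumptions; only the payload shape (a list of
# passenger records turned into a DataFrame) follows from the handler itself.
import requests

passengers = [
    {"Pclass": 3, "Sex": "male", "Age": 22, "Cabin": None, "Embarked": "S"},
    {"Pclass": 1, "Sex": "female", "Age": None, "Cabin": "C85", "Embarked": "C"},
]
response = requests.post("http://localhost:5000/predict", json=passengers)
print(response.json())  # the same records echoed back with an added 'prediction' field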
def train(self, epochs=200, lr=0.01, show_fig=False):
    X, Y, Y_one_hot = preprocess_data()
    self.lr = lr
    X_train, X_test = self.train_test_split(X)
    Y_train, Y_test = self.train_test_split(Y)
    Y_train_ohe, Y_test_ohe = self.train_test_split(Y_one_hot)

    self.cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.phY, logits=self.Y_))
    self.train = tf.train.AdamOptimizer(self.lr).minimize(self.cost)
    init = tf.global_variables_initializer()
    self.epochs = epochs
    self.cost_array = []

    with tf.Session() as sess:
        sess.run(init)
        for i in range(self.epochs):
            sess.run(self.train, feed_dict={
                self.phX: X_train,
                self.phY: Y_train_ohe
            })
            self.c = np.mean(
                sess.run(self.cost, feed_dict={
                    self.phX: X_train,
                    self.phY: Y_train_ohe
                }))
            if i % 10 == 0:
                self.predictions = sess.run(self.pred, feed_dict={self.phX: X_test})
                self.acc = np.mean(self.predictions == Y_test)
                print(f"Iteration {i}. Cost: {self.c}. Accuracy: {self.acc}")
            self.cost_array.append(self.c)

        if show_fig:
            plt.plot(self.cost_array)
            plt.show()

        self.save_path = self.saver.save(sess, "/tmp/model.ckpt")
        print("Model saved in path: %s" % self.save_path)
def train(attribute_names, input_csv, model_output_path, exp):
    x, y = preprocess_data(input_csv)
    num_classes = y.shape[-1]
    y = split_array(y, num_classes)
    x = split_array(x, x.shape[-1])
    monitor_values = [
        "val_output_class_0_precision", "val_output_class_0_recall",
        "val_output_class_1_precision", "val_output_class_1_recall",
        "val_output_class_2_precision", "val_output_class_2_recall",
        "val_output_class_3_precision", "val_output_class_3_recall",
        "val_output_class_4_precision", "val_output_class_4_recall"
    ]
    model = get_model(attribute_names=attribute_names, lr=0.0001,
                      num_output_classes=num_classes)
    # early_stopper_callback = tf.keras.callbacks.EarlyStopping(
    #     monitor='val_loss', min_delta=0, patience=10, verbose=0,
    #     mode='min', baseline=None, restore_best_weights=True
    # )
    # EarlyStoppingModified is adapted from TensorFlow's EarlyStopping: it can monitor
    # a list of values and also implements a model-saver callback.
    early_stopper_callback = EarlyStoppingModified(
        model_output_dir=model_output_path, exp=exp, monitor='val_loss',
        min_delta=0, patience=15, verbose=0, mode='min', baseline=None,
        restore_best_weights=True
    )
    model.fit(x, y, epochs=2000, shuffle=True, validation_split=0.5,
              batch_size=64, callbacks=[early_stopper_callback])
def main(argv):
    sys.path.append(app_dir)
    import rw_bat_data as rwd
    import preprocess_data as ppd
    import func as fc
    import scale_data as sd

    para_dict = init_data_para()
    para_dict = fc.deal_argv(argv, para_dict)
    mode = para_dict['run_mode']

    # Read the required data, process it, and save it to the specified location.
    print('starting processing the data...')
    bat_list = rwd.get_bat_list(para_dict, mode)
    regx, mask_filename = fc.get_filename_regx(para_dict['log_pro'], **para_dict)
    if bat_list is not None:
        for bat_name in bat_list:
            raw_data = rwd.read_bat_data(para_dict, mode, bat_name,
                                         limit=para_dict['data_limit'][mode])
            data = ppd.preprocess_data(bat_name, raw_data)
            rwd.save_bat_data(data, para_dict['log_pro'] + '_' + bat_name,
                              para_dict, mode)  # save the processed data
    else:
        print('there is no bat!')

    # Split the processed data by working state and save it to the specified location.
    print('save the processed data...')
    result = fc.save_workstate_data(regx, mask_filename,
                                    para_dict['processed_data_dir'][mode],
                                    para_dict['processed_data_dir'][mode])
    if not result:
        print('there are no files containing data to be scaled.')
        return
    else:
        # augment (scale) the data
        print('to be scaled...')
        for state in para_dict['states']:
            file_name = r'%s_' % state + mask_filename
            processed_data = sd.get_processed_data(
                os.path.join(para_dict['processed_data_dir'][mode], file_name))
            scale_data = sd.generate_data(processed_data, **para_dict)
            sd.save_scale_data(scale_data, para_dict['log_scale'] + '_' + file_name,
                               para_dict['scale_data_dir'][mode])
            print('finished scaling the %s data' % state)

    # Train the model.
    print('starting training the model...')
    for state in para_dict['states']:
        file_name = r'%s_%s_%s' % (para_dict['log_scale'], state, mask_filename)
        para_dict['pkl_dir'] = {
            'run': os.path.normpath('/raid/data/processed_data/pkl/' + save_dir + '/%s_pkl' % state),
            'debug': os.path.normpath(app_dir + '/%s_pkl' % state)
        }
        import build_model as bm
        bm.train_model(file_name, state, **para_dict)
current_time = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.gmtime(time_start))
logger.info("Execution started at " + current_time)

# Get paths
dir_data, dir_output, dir_tmp, config_path = get_folder_structure(root_path=ROOT_PATH, \
                                                                  config_fname=CONFIG_NAME)

logger.info("Validating config file..")
# Load config
schema = get_schema()
config = read_config(config_path=config_path)
config = validate_config(config=config, schema=schema)

# Clean data
df_cleaned = clean_data(dir_tmp=dir_tmp, path_data=dir_data)
df_base = preprocess_data(df_cleaned, dir_tmp=dir_tmp, path_data=dir_data)

# Create trained model folder
dir_model = os.path.join(dir_output, "model_trained")
print(dir_model)
print(dir_output)
print(config['n_top_words'])

# If it's a training run
if config['train']:
    if not os.path.isdir(dir_model):
        os.makedirs(dir_model)
    logger.debug("Calculating best LDA model..")
    search_params = {
# This is where we will be training and evaluating the model
import tensorflow as tf
import preprocess_data
from model import LeNet
from sklearn.utils import shuffle

X_train, y_train, X_validation, y_validation, X_test, y_test = (
    preprocess_data.preprocess_data())

# HYPERSSS!
EPOCHS = 10
BATCH_SIZE = 128
lr = 0.001

x = tf.placeholder(tf.float32, shape=(None, 32, 32, 1))
y = tf.placeholder(tf.int32, shape=(None))
one_hot_y = tf.one_hot(y, 10)

logits = LeNet(x)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_y, logits=logits)
loss = tf.math.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
train_op = optimizer.minimize(loss)

correct_pred = tf.math.equal(tf.math.argmax(logits, 1), tf.argmax(one_hot_y, 1))  # bool
accuracy_op = tf.reduce_mean(
    tf.cast(correct_pred, tf.float32))  # put all trues to 1, falses to 0, and find mean
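# A minimal evaluation sketch (not part of the original snippet): a batched
# helper is the usual companion to accuracy_op in this TF1 setup. BATCH_SIZE
# and the placeholders x/y come from the code above; everything else is an
# assumption about how evaluation would be wired up.
def evaluate(X_data, y_data, sess):
    num_examples = len(X_data)
    total_accuracy = 0.0
    for offset in range(0, num_examples, BATCH_SIZE):
        batch_x = X_data[offset:offset + BATCH_SIZE]
        batch_y = y_data[offset:offset + BATCH_SIZE]
        accuracy = sess.run(accuracy_op, feed_dict={x: batch_x, y: batch_y})
        total_accuracy += accuracy * len(batch_x)
    return total_accuracy / num_examples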
# coding:utf-8
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from preprocess_data import preprocess_data
from conf import conf

if __name__ == '__main__':
    train, train_target = preprocess_data(pd.read_csv(conf.train))
    X_train, X_val, Y_train, Y_val = train_test_split(train, train_target, \
                                                      test_size=conf.test_size)
    best_C = 0
    best_val_auc = 0
    best_val_score = 0
    for C in [1e-2, 3e-2, 1e-1, 3e-1, 1, 3, 10, 1e2, 3e2, 1e3, 3e3]:
        # for C in [1e-2, 3e-3, 1e-3, 3e-4, 1e-4, 3e-5, 1e-5, 3e-6, 1e-6]:
        print 'C:', C
        lr = LogisticRegression(penalty='l1', n_jobs=-1, C=C,
                                random_state=conf.random_state)
        lr.fit(X_train, Y_train)
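# The C-sweep above is truncated right after lr.fit. A plausible continuation,
# consistent with the roc_auc_score import and the best_C / best_val_auc /
# best_val_score trackers it initialises, is sketched here as a standalone
# helper. This is an assumption about the missing code, not the original.
from sklearn.metrics import roc_auc_score


def update_best(lr, X_val, Y_val, C, best):
    """Score a fitted model on the validation split and remember the best C.

    `best` is a dict like {'C': 0, 'val_auc': 0, 'val_score': 0}.
    """
    val_auc = roc_auc_score(Y_val, lr.predict_proba(X_val)[:, 1])
    if val_auc > best['val_auc']:
        best['C'] = C
        best['val_auc'] = val_auc
        best['val_score'] = lr.score(X_val, Y_val)
    return best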
def main(data_config, output_dir, num_epochs=10, batch_size=5, lr=0.001,
         target_fs=44100, audio_window_size=2048, patience=5,
         model_type='spectrogram', k_smoothing=1):
    """
    Train a deep beat tracker model
    """
    # Set up logger
    init_console_logger(LOGGER, verbose=True)

    with open(data_config, 'r') as f:
        data_config = json.load(f)

    sorted_train_datasets = sorted(data_config['train'].keys())
    train_dataset_desc = "train_" + "_".join(sorted_train_datasets)
    test_dataset_desc = "test_" + "_".join(sorted(data_config['test'].keys()))
    dataset_desc = train_dataset_desc + "-" + test_dataset_desc

    output_dir = os.path.join(output_dir, model_type, dataset_desc)
    LOGGER.info('Output will be saved to {}'.format(output_dir))

    feature_data_dir = os.path.join(output_dir, 'data')
    model_dir = os.path.join(output_dir, 'model',
                             datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    if not os.path.exists(feature_data_dir):
        os.makedirs(feature_data_dir)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    LOGGER.info('Saving configuration.')
    config = {
        'data_config': data_config,
        'output_dir': output_dir,
        'num_epochs': num_epochs,
        'batch_size': batch_size,
        'lr': lr,
        'patience': patience,
        'k_smoothing': k_smoothing,
        'target_fs': target_fs,
        'audio_window_size': audio_window_size,
        'model_type': model_type
    }
    config_path = os.path.join(model_dir, 'config.json')
    with open(config_path, 'w') as f:
        json.dump(config, f)

    LOGGER.info('Loading {} data.'.format(dataset_desc))
    train_data_path = os.path.join(feature_data_dir,
                                   '{}_train_data.npz').format(dataset_desc)
    valid_data_path = os.path.join(feature_data_dir,
                                   '{}_valid_data.npz').format(dataset_desc)
    test_data_path = os.path.join(feature_data_dir,
                                  '{}_test_data.npz').format(dataset_desc)
    data_exists = os.path.exists(train_data_path) \
        and os.path.exists(valid_data_path) \
        and os.path.exists(test_data_path)

    if model_type == 'spectrogram':
        assert target_fs == 44100
    hop_length = int(target_fs * HOP_SIZE)

    sorted_train_datasets = sorted(data_config['train'].keys())
    a_train = []
    r_train = []
    # Load audio and annotations
    for dataset in sorted_train_datasets:
        data_dir = data_config['train'][dataset]['data_dir']
        label_dir = data_config['train'][dataset]['label_dir']
        if dataset == 'hainsworth':
            a, r = prep_hainsworth_data(data_dir, label_dir, target_fs,
                                        load_audio=not data_exists)
        elif dataset == 'ballroom':
            a, r = prep_ballroom_data(data_dir, label_dir, target_fs,
                                      load_audio=not data_exists)
        a_train += a
        r_train += r

    a_test = []
    r_test = []
    for dataset, dataset_dirs in data_config['test'].items():
        data_dir = dataset_dirs['data_dir']
        label_dir = dataset_dirs['label_dir']
        if dataset == 'hainsworth':
            a, r = prep_hainsworth_data(data_dir, label_dir, target_fs,
                                        load_audio=not data_exists)
        elif dataset == 'ballroom':
            a, r = prep_ballroom_data(data_dir, label_dir, target_fs,
                                      load_audio=not data_exists)
        a_test += a
        r_test += r

    if not data_exists:
        # Create preprocessed data if it doesn't exist
        LOGGER.info('Preprocessing data for model type "{}".'.format(model_type))
        # Get features and targets from data
        X_train, y_train = preprocess_data(a_train, r_train, mode=model_type,
                                           hop_size=hop_length,
                                           audio_window_size=audio_window_size,
                                           sr=target_fs)
        X_test, y_test = preprocess_data(a_test, r_test, mode=model_type,
                                         hop_size=hop_length,
                                         audio_window_size=audio_window_size,
                                         sr=target_fs)
        test_data = {
            'X': X_test,
            'y': y_test,
            'indices': np.arange(len(y_test))  # Hack
        }
        LOGGER.info('Creating data subsets.')
        train_data, valid_data = create_data_subsets(X_train, y_train)

        LOGGER.info('Saving data subsets to disk.')
        np.savez(train_data_path, **train_data)
        np.savez(valid_data_path, **valid_data)
        np.savez(test_data_path, **test_data)
    else:
        # Otherwise, just load existing data
        train_data = load_data(train_data_path, model_type)
        valid_data = load_data(valid_data_path, model_type)
        test_data = load_data(test_data_path, model_type)

    model_path = os.path.join(model_dir, 'model.hdf5')
    if not os.path.exists(model_path):
        # Only train model if we haven't done so already
        LOGGER.info('Training model.')
        # Create, train, and save model
        model_path = train_model(train_data, valid_data, model_type, model_path,
                                 lr=lr, batch_size=batch_size,
                                 num_epochs=num_epochs,
                                 audio_window_size=audio_window_size,
                                 patience=patience)

    # Evaluate model
    LOGGER.info('Evaluating model.')
    perform_evaluation(train_data, valid_data, test_data, model_dir, r_train,
                       r_test, target_fs, batch_size, k_smoothing=k_smoothing)

    LOGGER.info('Done!')
            model_checkpoint_callback,
            early_stopping_callback
        ]
    )
    return history


def make_prediction(model, img):
    img = np.array([img])
    res = model.predict(x=img)[0]
    # print(res)
    y_hat = np.argmax(res)  # convert from softmax
    return LABELS[y_hat], res[y_hat]


# ARCHITECTURE_PLOT_PATH = os.path.join(os.path.dirname(__file__), "model_architecture.png")
# print(ARCHITECTURE_PLOT_PATH)

if __name__ == "__main__":
    data = preprocess_data()
    model = get_model()
    model.summary()
    # plot_model(model, to_file=ARCHITECTURE_PLOT_PATH)
    history = train_model(model, data)
    print(history)
import numpy as np
import pandas as pd
from preprocess_data import preprocess_data

df = pd.read_csv("data.csv")
data = preprocess_data("logGDP", "EmanzV", "year", df)


def dysymod(paramnr, var1, var2, chVar1, chVar2, mvar1, mvar2):
    nterms = 17
    nmodelterms = paramnr
    nmodels = 3

    # definition of polynomial terms
    term = [
        "", "/x", "/y", "x", "y", "/(x*y)", "x/y", "y/x", "x*y", "x^2",
        "/x^2", "y^2", "/y^2", "x^3", "y^3", "/x^3", "/y^3"
    ]
    print(term)

    # scaling terms with means
    scaling = []
    scaling.append(1)
    scaling.append(mvar1)
    scaling.append(mvar2)
    scaling.append(1 / mvar1)
    scaling.append(1 / mvar2)
    scaling.append(mvar1 * mvar2)
    scaling.append(mvar2 / mvar1)
    scaling.append(mvar1 / mvar2)
import tensorflow as tf
import preprocess_data as pd
import utils as utl
import numpy as np

train_x, val_x, test_x, train_y, val_y, test_y, vocab_to_int = pd.preprocess_data()


# Model input pipes for data feed
def model_inputs():
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    keep_prob_ = tf.placeholder(tf.float32, name="keep_prob")
    return inputs_, labels_, keep_prob_


# Build a multi-dimensional vector of the current word.
def build_embedding_layer(inputs_, vocab_size, embed_size):
    embedding = tf.Variable(tf.random_uniform((vocab_size, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)
    return embed


# Create LSTM layers
def build_lstm_layers(lstm_sizes, embed, keep_prob_, batch_size):
    lstms = [tf.contrib.rnn.BasicLSTMCell(size) for size in lstm_sizes]
    # Add dropout to the cell
    drops = [
        tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob_)
        for lstm in lstms
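# The build_lstm_layers definition above is cut off mid-list. For reference, a
# common TF1 completion of this pattern stacks the dropout-wrapped cells with
# MultiRNNCell and runs them through dynamic_rnn. This standalone sketch (using
# the `tf` imported above) is an assumption about the missing code, not the
# original author's implementation.
def build_lstm_layers_sketch(lstm_sizes, embed, keep_prob_, batch_size):
    lstms = [tf.contrib.rnn.BasicLSTMCell(size) for size in lstm_sizes]
    drops = [tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob_)
             for lstm in lstms]
    cell = tf.contrib.rnn.MultiRNNCell(drops)           # stack the layers
    initial_state = cell.zero_state(batch_size, tf.float32)
    lstm_outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
                                                  initial_state=initial_state)
    return initial_state, lstm_outputs, cell, final_state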
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import models
from preprocess_data import preprocess_data
from get_input_args import get_input_args
import my_model
from my_model import model, model_classifier, model_criterion, model_optimizer, classifier_hyperparam

train_data, valid_data, test_data, trainloader, validloader, testloader = preprocess_data()

input_size, hidden_layers, output_size, dropout_prob = classifier_hyperparam()
input_args = get_input_args()
model.classifier = model_classifier()
optimizer = model_optimizer()
criterion = model_criterion()

# assign a device
if input_args.gpu_cpu:
    device = input_args.gpu
    if torch.cuda.is_available() and device == 'cuda':
        model = model.to(device)
    else:
        model = model.to('cpu')
else:
    device = 'cpu'
    model = model.to(device)

epochs = input_args.epochs
from sklearn import neighbors


def build_tree(data):
    tree = neighbors.KDTree(data, leaf_size=2)
    return tree


if __name__ == '__main__':
    from preprocess_data import preprocess_data
    from normalize import normalize
    from numpy import array

    matrix, labels, categories = preprocess_data('datingTestSet.txt')
    normalized_matrix, ranges, min_vals, max_vals = normalize(matrix)
    labelsList = ['not at all', 'in small doses', 'in large doses']
    tree = build_tree(normalized_matrix)
    test = array([34.0, 4400.0, 0.3])
    dist, ind = tree.query([test], k=3)
    ind = ind[0]
    for i in ind:
        print(labelsList[labels[i]])
import pandas as pd
import pickle

preprocessing_override = pd.read_csv('preprocessing_override.csv')
dataset_X = pd.read_csv('train.csv')
dataset_y = dataset_X['Survived']
dataset_X_verify = pd.read_csv('test.csv')
del dataset_X['Survived']
del preprocessing_override['Survived']

import preprocess_data as prd

preprocessed_data = prd.preprocess_data(dataset_X, dataset_y,
                                        preprocessing_override, dataset_X_verify)
X = preprocessed_data["X"]
y = preprocessed_data["y"]

# import get_best_model as bfm
# import get_best_classification_model as bfm
import get_best_model as bfm

best_fit_model = pickle.loads(bfm.get_best_model(X, y))
print(best_fit_model.predict(X[0:25]))
print(y[0:25])
def main(args):
    # Parse arguments
    file_name = args.input
    category = args.attribute
    hidden_layers = args.hiddennodes
    iterations = args.iterations
    repeat = args.repeat
    output_path = args.output
    running = True
    paused = False
    drawing = args.visualise

    # Set up PyGame
    if drawing:
        (width, height) = (1200, 500)
        screen = pygame.display.set_mode((width, height))
        pygame.font.init()

    a = 0  # Counter
    tests = []
    while running:
        # Test network and reset
        if a % iterations == 0:
            if a != 0:
                # Run test and save results
                test = test_network(nn, testing_data)
                tests.append(test)
            if a < iterations * repeat:
                # Preprocess and divide data into two sets, ratio 2:1
                training_data, testing_data, headings, categories = preprocess_data(file_name, category)

                # Initialise neural network
                keys, values = list(training_data.keys()), list(training_data.values())
                input_layers, output_layers = len(keys[0]), len(values[0])
                nn = NeuralNetwork(input_layers, hidden_layers, output_layers, args.learningrate)

                # Get current weights
                weights = nn.get_weights()
                weights_ih = weights['input-hidden'].data
                weights_oh = weights['hidden-output'].data
            else:
                # Save results
                path = output_path + 'results.csv'
                file = open(path, 'w')
                writer = csv.writer(file)
                writer.writerow(['successes', 'failures', 'success%'])
                t_successes = t_failures = 0
                for test in tests:
                    test = list(test)
                    test.append("{:.2f}".format((test[0] / (test[1] + test[0])) * 100))
                    writer.writerow(test)
                    t_successes += test[0]
                    t_failures += test[1]
                print("Total successes: " + str(t_successes) +
                      "\t\tTotal fails: " + str(t_failures) +
                      "\t\tTotal success rate: " +
                      "{:.2f}".format((t_successes / (t_failures + t_successes)) * 100) + "%")
                running = False

        # Create visualisation
        if drawing:
            for event in pygame.event.get():
                # Stop on close
                if event.type == pygame.QUIT:
                    running = False
                if event.type == pygame.KEYDOWN:
                    # Pause/unpause when space is pressed
                    if event.key == pygame.K_SPACE:
                        paused = not paused

            # Black background
            screen.fill((0, 0, 0))

            # Calculate node positions
            c = input_layers * 40 + (input_layers - 1) * 10
            offset_i = 20 + (height - c) // 2
            c = hidden_layers * 40 + (hidden_layers - 1) * 10
            offset_h = 20 + (height - c) // 2
            c = output_layers * 40 + (output_layers - 1) * 10
            offset_o = 20 + (height - c) // 2

            # Calculate min and max values for input-hidden weights
            min_weight = min([abs(item) for sublist in weights_ih for item in sublist])
            max_weight = max([abs(item) for sublist in weights_ih for item in sublist])

            # Draw input-hidden weights
            for i in range(input_layers):
                for j in range(hidden_layers):
                    weight = weights_ih[j][i]
                    if weight < 0:
                        weight = (abs(weight) - min_weight) / (max_weight - min_weight)
                        shade = int(255 * weight)
                        pygame.draw.aaline(screen, (0, shade, 0), (300, offset_i + 50 * i),
                                           (width / 2, offset_h + 50 * j))
                    else:
                        weight = (abs(weight) - min_weight) / (max_weight - min_weight)
                        shade = int(255 * weight)
                        pygame.draw.aaline(screen, (shade, 0, 0), (300, offset_i + 50 * i),
                                           (width / 2, offset_h + 50 * j))

            # Calculate min and max values for hidden-output weights
            min_weight = min([abs(item) for sublist in weights_oh for item in sublist])
            max_weight = max([abs(item) for sublist in weights_oh for item in sublist])

            # Draw hidden-output weights
            for i in range(hidden_layers):
                for j in range(output_layers):
                    weight = weights_oh[j][i]
                    if weight < 0:
                        weight = (abs(weight) - min_weight) / (max_weight - min_weight)
                        shade = int(255 * weight)
                        pygame.draw.aaline(screen, (0, shade, 0), (width / 2, offset_h + 50 * i),
                                           (width - 300, offset_o + 50 * j))
                    else:
                        weight = (abs(weight) - min_weight) / (max_weight - min_weight)
                        shade = int(255 * weight)
                        pygame.draw.aaline(screen, (shade, 0, 0), (width / 2, offset_h + 50 * i),
                                           (width - 300, offset_o + 50 * j))

            if a % iterations != 0:
                activations = nn.get_activations()

                # Draw input nodes
                activation = [item for sublist in activations['input'].data for item in sublist]
                for i in range(input_layers):
                    shade = int(activation[i] * 255)
                    pygame.draw.circle(screen, (70, 70, 70), (300, offset_i + 50 * i), 20)
                    pygame.draw.circle(screen, (shade, shade, shade), (300, offset_i + 50 * i), 15)

                    # Label nodes
                    myfont = pygame.font.SysFont('Consolas', 20)
                    textsurface = myfont.render(headings[i], False, (255, 255, 255))
                    rect = textsurface.get_rect()
                    rect.right = 270
                    rect.top = offset_i - 10 + 50 * i
                    screen.blit(textsurface, rect)

                # Draw hidden nodes
                activation = [item for sublist in activations['hidden'].data for item in sublist]
                for i in range(hidden_layers):
                    shade = int(activation[i] * 255)
                    pygame.draw.circle(screen, (100, 100, 100), (width // 2, offset_h + 50 * i), 20)
                    pygame.draw.circle(screen, (shade, shade, shade), (width // 2, offset_h + 50 * i), 15)

                # Draw output nodes
                activation = [item for sublist in activations['output'].data for item in sublist]
                for i in range(output_layers):
                    shade = int(activation[i] * 255)
                    pygame.draw.circle(screen, (100, 100, 100), (width - 300, offset_o + 50 * i), 20)
                    pygame.draw.circle(screen, (shade, shade, shade), (width - 300, offset_o + 50 * i), 15)

                    # Label nodes
                    myfont = pygame.font.SysFont('Consolas', 20)
                    textsurface = myfont.render(categories[i], False, (255, 255, 255))
                    screen.blit(textsurface, (width - 270, offset_o - 10 + 50 * i))

            # Show iterations
            myfont = pygame.font.SysFont('Consolas', 20)
            textsurface = myfont.render('Iterations: ' + str(a % iterations) +
                                        '\t Repeat: ' + str(a // iterations),
                                        False, (255, 255, 255))
            screen.blit(textsurface, (width - 500, 30))

            pygame.display.flip()

        # Train network
        inputs, target = random.choice(list(training_data.items()))
        if not paused:
            nn.train(list(inputs), target)
            a += 1
parser.add_argument('--gpu', action='store_true', default=True, dest='gpu',
                    help='Use GPU for training, set a switch to true')

parse_results = parser.parse_args()

# parse
data_dir = parse_results.data_directory
save_dir = parse_results.save_dir
arch = parse_results.arch
learning_rate = float(parse_results.learning_rate)
hidden_units = int(parse_results.hidden_units)
epochs = int(parse_results.epochs)
device = parse_results.gpu

# load and preprocess the data
image_datasets, train_loader, valid_loader, test_loader = preprocess_data(data_dir)

# build the pre-trained model structure
model_init, optimizer = load_pre_trained_model(arch, hidden_units)

# train the model
model, validation_accuracies = nn_classifer_train_valid(epochs, model_init, optimizer,
                                                        train_loader, valid_loader, device)

# save the checkpoint
check_point = save_check_point(model, image_datasets['train'], save_dir)
def main():
    ##################################################################################################################
    # Prepare data
    ##################################################################################################################
    LOG.info('=' * 50)
    LOG.info('# Prepare data..')
    prepare_data(LOG)

    ##################################################################################################################
    # Preprocessing
    ##################################################################################################################
    LOG.info('=' * 50)
    LOG.info('# Preprocessing data..')
    preprocess_data(LOG)

    ##################################################################################################################
    # Feature Engineering
    ##################################################################################################################
    LOG.info('=' * 50)
    LOG.info('# Feature Engineering..')
    trn_path = './input/trn.csv'
    tst_path = './input/tst.csv'
    trg_path = './input/target.csv'

    # load data
    trn = pd.read_csv(trn_path)
    tst = pd.read_csv(tst_path)
    trg = pd.read_csv(trg_path)

    target_cols = [
        'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
        'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
        'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
        'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
        'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
        'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
        'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
        'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'
    ]
    lags = ['_lag_one', '_lag_two', '_lag_thr', '_lag_fou', '_lag_fiv']
    diffs = [['fiv', 'fou'], ['fou', 'thr'], ['thr', 'two'], ['two', 'one']]

    LOG.info('# na_count')
    # null count per row
    trn['na_count'] = trn.isnull().sum(axis=1)
    tst['na_count'] = tst.isnull().sum(axis=1)

    LOG.info('# target_sum_lag')
    # total count of purchases per month
    for lag in lags:
        trn['target_sum' + lag] = trn[[col + lag for col in target_cols]].sum(axis=1)
        tst['target_sum' + lag] = tst[[col + lag for col in target_cols]].sum(axis=1)

    LOG.info('# avg of cols')
    # average of cols over past 5 months
    # note: `lag` below is the last value left over from the previous loop, so it ends up in the column name
    cols = ['ind_actividad_cliente', 'ult_fec_cli_1t']
    for col in cols:
        trn[col + lag + '_avg'] = trn[[col + lag for lag in lags]].mean(axis=1)
        tst[col + lag + '_avg'] = tst[[col + lag for lag in lags]].mean(axis=1)

    LOG.info('# target_sum over lag-5')
    # cumulative sum of target cols over past 5 months
    for col in target_cols:
        trn[col + '_sum'] = trn[[col + lag for lag in lags]].sum(axis=1)
        tst[col + '_sum'] = tst[[col + lag for lag in lags]].sum(axis=1)

    LOG.info('# target_sum_diff for each month')
    # change in count of purchases per month compared to the previous month
    for diff in diffs:
        pre = diff[0]
        post = diff[1]
        trn['target_diff_' + post + '-' + pre] = trn['target_sum_lag_' + post] - trn['target_sum_lag_' + pre]
        tst['target_diff_' + post + '-' + pre] = tst['target_sum_lag_' + post] - tst['target_sum_lag_' + pre]

    LOG.info('# target_diff for each month')
    # change in individual purchases for each month compared to the previous month
    for col in target_cols:
        for diff in diffs:
            pre = diff[0]
            post = diff[1]
            trn[col + '_label_lag_' + post] = trn[col + '_lag_' + post] - trn[col + '_lag_' + pre]
            tst[col + '_label_lag_' + post] = tst[col + '_lag_' + post] - tst[col + '_lag_' + pre]

    LOG.info('# unique target count')
    # unique count of purchased targets over 5 months
    trn['unique_target_count'] = (trn[[col + '_sum' for col in target_cols]] > 0).astype(int).sum(axis=1)
    tst['unique_target_count'] = (tst[[col + '_sum' for col in target_cols]] > 0).astype(int).sum(axis=1)

    LOG.info('# Drop infrequent targets..')
    rem_targets = [2, 23, 22, 21, 18, 17, 4, 12, 11, 9, 6, 13, 7, 19, 8]
    trn = trn[trg['0'].isin(rem_targets)]
    trg = trg[trg['0'].isin(rem_targets)]
    trg = LabelEncoder().fit_transform(trg)
    LOG.info('# trn : {} | trg : {} | tst : {}'.format(trn.shape, trg.shape, tst.shape))

    # cache
    LOG.info('# Caching data as trn.csv / tst.csv ..')
    trn.to_csv('./input/trn_cache.csv', index=False)
    tst.to_csv('./input/tst_cache.csv', index=False)
    pd.DataFrame(trg).to_csv('./input/trg_cache.csv', index=False)

    ##################################################################################################################
    # CV Evaluation
    ##################################################################################################################
    # from cache
    trn = pd.read_csv('./input/trn_cache.csv')
    tst = pd.read_csv('./input/tst_cache.csv')
    trg = pd.read_csv('./input/trg_cache.csv')

    LOG.info('=' * 50)
    LOG.info('# Cross validation..')

    # XGB Model Param
    num_round = 500
    early_stop = 50
    xgb_params = {
        'booster': 'gbtree',
        'gamma': 1,
        'learning_rate': 0.1,
        'max_depth': 4,
        'min_child_weight': 3,
        'nthread': 4,
        'num_class': 15,
        'objective': 'multi:softprob',
        'silent': 1,
        'eval_metric': 'mlogloss',
        'seed': 777,
    }

    trn_scores = []
    vld_scores = []
    best_iters = []

    n_splits = 2
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.05, random_state=777)
    for i, (t_ind, v_ind) in enumerate(sss.split(trn, trg)):
        LOG.info('# Iter {} / {}'.format(i + 1, n_splits))
        x_trn = np.asarray(trn)[t_ind]
        x_vld = np.asarray(trn)[v_ind]
        y_trn = np.asarray(trg)[t_ind]
        y_vld = np.asarray(trg)[v_ind]

        dtrn = xgb.DMatrix(x_trn, label=y_trn)
        dvld = xgb.DMatrix(x_vld, label=y_vld)
        watch_list = [(dtrn, 'train'), (dvld, 'eval')]

        # fit xgb
        bst = xgb.train(xgb_params, dtrn, num_round, watch_list,
                        early_stopping_rounds=early_stop, verbose_eval=True)

        # eval _ trn
        score = log_loss(y_trn, bst.predict(dtrn))
        trn_scores.append(score)

        # eval _ vld
        score = log_loss(y_vld, bst.predict(dvld))
        vld_scores.append(score)

        # best iters
        best_iters.append(bst.best_iteration)

    LOG.info('# TRN logloss: {}'.format(np.mean(trn_scores)))
    LOG.info('# VLD logloss: {}'.format(np.mean(vld_scores)))
    LOG.info('# Best Iters : {}'.format(np.mean(best_iters)))

    ##################################################################################################################
    # Model Fit
    ##################################################################################################################
    LOG.info('=' * 50)
    LOG.info('# Refit and predict on test data..')
    dtrn = xgb.DMatrix(trn, label=trg)
    num_round = int(np.mean(best_iters) / 0.9)
    bst = xgb.train(xgb_params, dtrn, num_round, verbose_eval=False)

    dtst = xgb.DMatrix(tst)
    preds = bst.predict(dtst)
    preds = np.fliplr(np.argsort(preds, axis=1))

    ##################################################################################################################
    # Submission
    ##################################################################################################################
    LOG.info('=' * 50)
    LOG.info('# Generating a submission..')
    submit_cols = [
        target_cols[i] for i, col in enumerate(target_cols) if i in rem_targets
    ]

    final_preds = []
    for pred in preds:
        top_products = []
        for i, product in enumerate(pred):
            top_products.append(submit_cols[product])
            if i == 6:
                break
        final_preds.append(' '.join(top_products))

    t_index = pd.read_csv('../root_input/test_ver2.csv', usecols=['ncodpers'])
    test_id = t_index['ncodpers']
    out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})

    file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
    path = './output'
    if not os.path.exists(path):
        os.makedirs(path)
    out_df.to_csv(os.path.join(path, file_name), index=False)

    LOG.info('# Clean files')
    cmd = 'rm -rf ./input'
    os.system(cmd)

    LOG.info('=' * 50)
    LOG.info('# Finished!')
    LOG.info('=' * 50)
# Author: xiaoyi | 小一
# email: [email protected]
# Date: 2020/3/27 16:09
# Description:
import pandas as pd
import numpy as np

# show all columns
from explore_data import explore_area
from preprocess_data import preprocess_data
from read_data import read_data
from view_data import view_data

pd.set_option('display.max_columns', None)
# show all rows
# pd.set_option('display.max_rows', None)

if __name__ == '__main__':
    # To avoid hitting the database repeatedly, the data can be saved to a local file
    df_data = read_data()
    """Data preprocessing"""
    df_data = preprocess_data(df_data)
    """Visual analysis"""
    df_data = view_data(df_data)
    """Heat map exploration"""
    explore_area(df_data)
from sklearn.model_selection import RandomizedSearchCV

# load the data
df_train = pd.read_csv('data/verkehrsunfaelle_train.csv', engine='python', index_col=0)
df_test = pd.read_csv('data/verkehrsunfaelle_test.csv', engine='python', index_col=0)

# get features and the target variable
accidents = df_train.drop('Unfallschwere', axis=1)
accidents_labels = df_train['Unfallschwere'].copy()

# preprocess the training data
model_sel, X_train, X_test, y_train, y_test = preprocess_data(accidents, accidents_labels)

# preprocess the test set
prediction_data = preprocess_data_to_predict(df_test, model_sel)

# initialize the DeepNeuralNetwork
dnn = DeepNeuralNetClassifier(show_progress=None, random_state=42)

# set hyper parameters for RandomizedSearchCV
parameter_distributions = {
    'n_hidden_layers': [3, 4, 5],
    'n_neurons': [40, 50, 100],
    'batch_size': [64, 128],
    'learning_rate': [0.01, 0.005],
    'activation': [tf.nn.elu, tf.nn.relu],
    'max_checks_without_progress': [20, 30],
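# The parameter dictionary above is cut off. Once it is closed, a typical way to
# finish wiring it into RandomizedSearchCV is sketched below; n_iter, cv and
# scoring are illustrative assumptions, not the original author's choices.
random_search = RandomizedSearchCV(
    estimator=dnn,
    param_distributions=parameter_distributions,
    n_iter=20,            # number of sampled parameter combinations
    cv=3,                 # 3-fold cross-validation
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_train, y_train)
print(random_search.best_params_)
best_dnn = random_search.best_estimator_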