def model_train_predict_test(self, input_file_regx="^(\d+)\.csv", override=False):
    """
    :param input_file_regx: regular expression used to select training set files
    :param override=False: retrain and rerun the model prediction even if the expected output file already exists
    :return: model file, model weights files, prediction file, discrepancy statistic bar plot file
    """
    # get training sets for lstm training
    print "Scanning files within selected id range ..."
    ids, files = get_ids_and_files_in_dir(inputdir=self.training_set_dir,
                                          range=self.training_set_id_range,
                                          input_file_regx=input_file_regx)
    print "Scanning done! Selected enterprise ids are {}".format(ids)
    if not files:
        raise ValueError(
            "No files selected in current id range. Please check the input training set directory, "
            "input enterprise id range or file format which should be '[0-9]+.csv'")
    # get train, test, validation data
    for id_index, id_file in enumerate(files):
        # store prediction result to prediction directory
        enter_file = self.training_set_dir + "/" + id_file
        print "Processing dataset - enterprise_id is: {}".format(ids[id_index])
        print "Reading from file {}".format(enter_file)
        df = pd.read_csv(enter_file)
        df.index = range(len(df.index))
        # retrieve training X and Y columns. First column is customer_id
        select_col = ['customer_id']
        select_col = np.append(select_col,
                               ['X' + str(i) for i in range(1, 1 + self.training_set_length)])
        select_col = np.append(select_col, ['Y', 'enterprise_id'])
        df_selected = df.ix[:, select_col]
        # remove outlier records
        df_selected = percentile_remove_outlier(df_selected, filter_start=1,
                                                filter_end=2 + self.training_set_length)
        # scale the training columns
        print "Scaling..."
        if self.scaler == 'mm':
            df_scale, minVal, maxVal = MinMaxScaler(df_selected, start_col_index=1,
                                                    end_col_index=self.training_set_length + 1)
        elif self.scaler == 'norm':
            df_scale, meanVal, stdVal = NormalDistributionScaler(df_selected, start_col_index=1,
                                                                 end_col_index=self.training_set_length + 1)
        else:
            raise ValueError("Argument scaler must be 'mm' or 'norm'!")
        # bin the date Y column into discrete groups
        df_bin, bin_boundary = binning_date_y(df_scale, y_col=1 + self.training_set_length, n_group=5)
        print "Bin boundary is {}".format(bin_boundary)
        # get train and test dataset
        print "Randomly selecting training set and test set..."
        all_data_x = np.asarray(df_bin.ix[:, 1:1 + self.training_set_length]).reshape(
            (len(df_bin.index), 1, self.training_set_length))
        all_data_y = np.asarray(df_bin.ix[:, 1 + self.training_set_length])
        # convert y label to one-hot dummy label
        y_dummy_label = np.asarray(pd.get_dummies(all_data_y))
        # split into train, test and validation data
        sub_train, val_train, sub_test, val_test = train_test_split(all_data_x, y_dummy_label,
                                                                    test_size=self.test_size)
        train_x, test_x, train_y, test_y = train_test_split(sub_train, sub_test,
                                                            test_size=self.test_size)
        # create and fit the NN model
        model_save_path = self.model_save_dir + "/" + self.model_file_prefix + "-" + str(ids[id_index]) + ".h5"
        # train only if the model file does not exist or override is requested
        if not os.path.exists(model_save_path) or override:
            self.NN_model_train(train_x, train_y, test_x, test_y, model_save_path=model_save_path)
        # generate prediction for the validation set
        print "Predicting the output of validation set..."
        val_predict_class, val_predict_prob = self.NN_prediction(val_train, model_save_path=model_save_path)
        # statistics of the discrepancy between predicted value and real value
        total_sample_count = len(val_predict_class)
        val_test_label = np.asarray([list(x).index(1) for x in val_test])
        match_count = (np.asarray(val_predict_class) == np.asarray(val_test_label.ravel())).sum()
        print "Precision using validation dataset is {}".format(float(match_count) / total_sample_count)
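# ----------------------------------------------------------------------------
# Illustrative sketch (kept as comments so the module stays importable): how the
# one-hot encoding and the validation precision computed above behave on toy
# data. The arrays below are made-up values for demonstration, not project data.
#
#   import numpy as np
#   import pandas as pd
#
#   all_data_y = np.array([0, 2, 1, 2])                 # binned Y labels
#   y_dummy = np.asarray(pd.get_dummies(all_data_y))
#   # -> one-hot rows: [1,0,0], [0,0,1], [0,1,0], [0,0,1] (dtype depends on pandas version)
#
#   val_test = y_dummy                                  # pretend one-hot validation labels
#   val_predict_class = np.array([0, 2, 2, 2])          # pretend predicted class indices
#   val_test_label = np.asarray([list(x).index(1) for x in val_test])
#   precision = float((val_predict_class == val_test_label).sum()) / len(val_predict_class)
#   # precision -> 0.75
# ----------------------------------------------------------------------------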
def merge_similar_dataset(self, override=True):
    """
    Merge data sets with similar interval distributions to be used for neural network training.
    :param ks_pv=0.1: KS-test p-value threshold used to assess the distribution similarity of 2 data sets.
    :param qq_slope_range=(0.85, 1.15): allowed range for the slope of the fitted Q-Q regression line used
        to assess the similarity of 2 data sets. If ks_pv surpasses the threshold and qq_slope is within
        the defined range, merge the 2 data sets.
    :param override=True: override existing files
    :return: files containing clustered dfs and summary clustering info
    """
    # import interval dfs within the enterprise id range
    ids, train_files = get_ids_and_files_in_dir(self.input_dir, self.enter_id_range,
                                                input_file_regx="^(\d+)\.csv")
    _, itv_files = get_ids_and_files_in_dir(self.input_dir, self.enter_id_range,
                                            input_file_regx="(\d+)\.intervals.csv")
    # search the output directory for existing cluster files. If none exist, copy the first file
    # in the file list to the output directory as the initial cluster 0 file
    interval_dir = self.output_dir + "/interval"
    train_dir = self.output_dir + "/train"
    if not os.path.exists(interval_dir):
        os.makedirs(interval_dir)
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    # record the enterprises each cluster consists of
    cluster = {}
    if override:
        # remove all content in the output directories
        print "Removing old files..."
        for x in os.listdir(interval_dir):
            os.remove(interval_dir + "/" + x)
        for y in os.listdir(train_dir):
            os.remove(train_dir + "/" + y)
    if not os.listdir(interval_dir):
        id = ids.pop(0)
        cluster['0'] = np.array([int(id)])
        itv_src_file = self.input_dir + "/" + itv_files.pop(0)
        itv_dst_file = interval_dir + "/" + self.outfile_prefix + "-0.intervals.csv"
        train_src_file = self.input_dir + "/" + train_files.pop(0)
        train_dst_file = train_dir + "/" + self.outfile_prefix + "-0.csv"
        copyfile(itv_src_file, itv_dst_file)
        copyfile(train_src_file, train_dst_file)
    # read in sample interval files
    for samp_index, samp_file in enumerate(itv_files):
        samp_file = self.input_dir + "/" + samp_file
        df_samp = pd.read_csv(samp_file, header=None)
        # remove outliers
        print "Removing outliers in sample intervals..."
        df_samp_fil = percentile_remove_outlier(df_samp, 1, 1)
        df_samp_itv = df_samp_fil.ix[:, 1]
        df_samp_itv.columns = np.array(ids[samp_index])
        samp_id = ids[samp_index]
        # record the Q-Q error of the sample vs each candidate cluster.
        # Merge the sample training data into the most similar cluster (Q-Q plot slope closest to 1).
        # If the sample is distinct from all clusters, initialize it as a new cluster
        slope_to_1 = {}
        # read in cluster files
        cluster_interval_files = os.listdir(interval_dir)
        for ref_index, ref_file in enumerate(cluster_interval_files):
            # get cluster id from the file name ("<outfile_prefix>-<id>.intervals.csv")
            id_match = re.match(r"{}-(\d+)\.intervals\.csv".format(self.outfile_prefix), ref_file)
            cluster_id = id_match.group(1)
            ref_file = interval_dir + "/" + ref_file
            df_clu = pd.read_csv(ref_file, header=None)
            print "Removing outliers in cluster intervals..."
            df_clu_fil = percentile_remove_outlier(df_clu, 1, 1)
            df_clu_itv = df_clu_fil.ix[:, 1]
            df_clu_itv.columns = np.array(cluster_id)
            # get slope from the Q-Q plot
            qq_slope, qq_intercept, qq_error = self.qq_plot(df_samp_itv, df_clu_itv)
            ks_sta, p_value = ks_2samp(np.asarray(df_samp_itv), np.asarray(df_clu_itv))
            print "KS test results: ks-statistic: {}; p_value: {}".format(ks_sta, p_value)
            # check whether the current sample is similar to the current cluster based on the KS
            # statistic and the Q-Q slope. If so, record the discrepancy error from the 45-degree
            # line: sum(y_real - y_predicted)^2 / 2N.
            if self.qq_slope_range[0] <= qq_slope <= self.qq_slope_range[1]:
                # if p_value > ks_pv:
                if np.abs(qq_intercept) <= self.qq_intercept:
                    slope_to_1[cluster_id] = qq_error
                else:
                    print "Purchase interval days distribution of enterprise {} is not similar enough " \
                          "with cluster {} (intercept)".format(samp_id, cluster_id)
            else:
                print "Purchase interval days distribution of enterprise {} is not similar enough " \
                      "with cluster {} (slope)".format(samp_id, cluster_id)
        # If no similar cluster exists, initialize a new one
        if not slope_to_1:
            new_cluster_id = len(cluster_interval_files)
            print "Data distribution of sample {} is apparently distinct from existing clusters. " \
                  "Assign it as a new cluster {}".format(samp_id, new_cluster_id)
            interval_src_file = "{}/{}.intervals.csv".format(self.input_dir, samp_id)
            interval_dst_file = "{}/{}-{}.intervals.csv".format(interval_dir, self.outfile_prefix, new_cluster_id)
            train_src_file = "{}/{}.csv".format(self.input_dir, samp_id)
            train_dst_file = "{}/{}-{}.csv".format(train_dir, self.outfile_prefix, new_cluster_id)
            copyfile(interval_src_file, interval_dst_file)
            copyfile(train_src_file, train_dst_file)
            # store cluster info
            cluster[str(new_cluster_id)] = np.array([int(samp_id)])
        else:
            # get the most similar cluster id: merge the current enterprise into the cluster with minimum Q-Q error
            cluster_id = min(slope_to_1, key=slope_to_1.get)
            print "Data distribution of sample {} is most similar to cluster {}. " \
                  "Merge sample data with cluster data.".format(samp_id, cluster_id)
            interval_dst_file = "{}/{}-{}.intervals.csv".format(interval_dir, self.outfile_prefix, cluster_id)
            train_dst_file = "{}/{}-{}.csv".format(train_dir, self.outfile_prefix, cluster_id)
            # merge interval files; write without a header so the file stays readable with header=None
            df_clu_int = pd.read_csv(interval_dst_file, header=None)
            df_clu_int = pd.concat([df_clu_int, df_samp], axis=0)
            df_clu_int.to_csv(interval_dst_file, index=False, header=False)
            # merge train files
            df_train_samp = pd.read_csv(self.input_dir + "/" + train_files[samp_index])
            df_clu_train = pd.read_csv(train_dst_file)
            df_clu_train = pd.concat([df_clu_train, df_train_samp], axis=0)
            df_clu_train.to_csv(train_dst_file, index=False)
            # record enterprise ids in each cluster
            if cluster_id in cluster.keys():
                cluster[cluster_id] = np.append(cluster[cluster_id], samp_id)
            else:
                cluster[cluster_id] = np.array([samp_id])
    # output cluster info
    print "Cluster Info: {}".format(cluster)
    df_cluster = pd.DataFrame.from_dict(cluster, orient='index')
    df_cluster = pd.DataFrame.sort_index(df_cluster)
    df_cluster = df_cluster.transpose()
    cluster_info_file = self.output_dir + "/cluster-consists-info.csv"
    df_cluster.to_csv(cluster_info_file)
    print "Output each cluster's member enterprise ids to file: {}.".format(cluster_info_file)
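# ----------------------------------------------------------------------------
# Illustrative sketch (kept as comments so the module stays importable): the
# similarity check used above combines a two-sample KS test with the slope and
# intercept of a Q-Q regression line. self.qq_plot is assumed to fit a line
# through the paired quantiles of the two samples; the stand-alone version below
# uses numpy.polyfit on matched percentiles and is an assumption for
# illustration, not this project's implementation. The threshold 2.0 stands in
# for self.qq_intercept and the samples are made up.
#
#   import numpy as np
#   from scipy.stats import ks_2samp
#
#   a = np.random.exponential(scale=5.0, size=500)   # made-up purchase intervals, enterprise A
#   b = np.random.exponential(scale=5.5, size=500)   # made-up purchase intervals, cluster B
#
#   q = np.linspace(1, 99, 99)
#   qa, qb = np.percentile(a, q), np.percentile(b, q)
#   slope, intercept = np.polyfit(qa, qb, 1)          # Q-Q regression line
#   ks_stat, p_value = ks_2samp(a, b)
#
#   similar = (0.85 <= slope <= 1.15) and abs(intercept) <= 2.0
#   # if similar, the sample would be merged into the cluster with the smallest Q-Q error
# ----------------------------------------------------------------------------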