Example No. 1
    def model_train_predict_test(self,
                                 input_file_regx=r"^(\d+)\.csv",
                                 override=False):
        """
        :param override=False: if True, rerun model training and prediction even if the expected output files already exist
        :return: model file, model weights file, prediction file, discrepancy statistics bar plot file
        """
        # get training sets for lstm training
        print "Scanning files within select id range ..."
        ids, files = get_ids_and_files_in_dir(inputdir=self.training_set_dir,
                                              range=self.training_set_id_range,
                                              input_file_regx=input_file_regx)
        print "Scanning done! Selected enterprise ids are {}".format(ids)
        if not files:
            raise ValueError(
                "No files selected in current id range. Please check the input training set directory, "
                "input enterprise id range or file format which should be '[0-9]+.csv'"
            )

        # get train, test, validation data
        for id_index, id_file in enumerate(files):
            # store prediction result to prediction directory
            enter_file = self.training_set_dir + "/" + id_file
            print "Processing dataset - enterprise_id is: {}".format(
                ids[id_index])
            print "Reading from file {}".format(enter_file)
            df = pd.read_csv(enter_file)
            df.index = range(len(df.index))
            # retrieve training X and Y columns. First column is customer_id
            select_col = ['customer_id']
            select_col = np.append(
                select_col,
                ['X' + str(i) for i in range(1, 1 + self.training_set_length)])
            select_col = np.append(select_col, ['Y', 'enterprise_id'])
            df_selected = df.loc[:, select_col]
            # remove outlier records
            df_selected = percentile_remove_outlier(df_selected,
                                                    filter_start=1,
                                                    filter_end=2 +
                                                    self.training_set_length)
            # scale the train columns
            print "Scaling..."
            if self.scaler == 'mm':
                df_scale, minVal, maxVal = MinMaxScaler(
                    df_selected,
                    start_col_index=1,
                    end_col_index=self.training_set_length + 1)
            elif self.scaler == 'norm':
                df_scale, meanVal, stdVal = NormalDistributionScaler(
                    df_selected,
                    start_col_index=1,
                    end_col_index=self.training_set_length + 1)
            else:
                raise ValueError("Argument scaler must be mm or norm!")
            # bin date y
            df_bin, bin_boundary = binning_date_y(df_scale,
                                                  y_col=1 +
                                                  self.training_set_length,
                                                  n_group=5)
            print "Bin boundary is {}".format(bin_boundary)
            # get train and test dataset
            print "Randomly selecting training set and test set..."
            all_data_x = np.asarray(
                df_bin.iloc[:, 1:1 + self.training_set_length]).reshape(
                    (len(df_bin.index), 1, self.training_set_length))
            all_data_y = np.asarray(df_bin.iloc[:, 1 + self.training_set_length])
            # convert y label to one-hot dummy label
            y_dummy_label = np.asarray(pd.get_dummies(all_data_y))
            # format train, test, validation data: the first split holds out the
            # validation set (val_train / val_test), the second splits the rest into train and test
            sub_train, val_train, sub_test, val_test = train_test_split(
                all_data_x, y_dummy_label, test_size=self.test_size)
            train_x, test_x, train_y, test_y = train_test_split(
                sub_train, sub_test, test_size=self.test_size)
            # create and fit the NN model
            model_save_path = self.model_save_dir + "/" + self.model_file_prefix + "-" + str(
                ids[id_index]) + ".h5"
            # check if model file exists
            if not os.path.exists(model_save_path) or override:
                self.NN_model_train(train_x,
                                    train_y,
                                    test_x,
                                    test_y,
                                    model_save_path=model_save_path)
            # generate predictions for the validation set
            print "Predicting the output of the validation set..."
            val_predict_class, val_predict_prob = self.NN_prediction(
                val_train, model_save_path=model_save_path)
            # statistics of the discrepancy between predicted and actual labels
            total_sample_count = len(val_predict_class)
            val_test_label = np.asarray([list(x).index(1) for x in val_test])
            match_count = (np.asarray(val_predict_class) == np.asarray(
                val_test_label.ravel())).sum()
            print "Precision using validation dataset is {}".format(
                float(match_count) / total_sample_count)
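The method above builds its datasets with two chained train_test_split calls: the first carves off a validation set, the second divides the remaining pool into train and test, with labels one-hot encoded via pd.get_dummies. Below is a minimal, self-contained sketch of that split pattern; the array shapes, number of label bins, and test_size values are illustrative assumptions, not values taken from the class.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# toy data standing in for the reshaped training matrix and binned labels (assumed shapes)
n_samples, seq_len = 200, 10
all_data_x = np.random.rand(n_samples, 1, seq_len)       # (samples, 1, timesteps), as reshaped above
all_data_y = np.random.randint(0, 5, size=n_samples)     # 5 label bins, as produced by binning_date_y
y_dummy_label = np.asarray(pd.get_dummies(all_data_y))   # one-hot dummy labels

# first split: hold out a validation set (val_x / val_y play the role of val_train / val_test above)
sub_x, val_x, sub_y, val_y = train_test_split(all_data_x, y_dummy_label, test_size=0.2)
# second split: divide the remaining pool into train and test sets
train_x, test_x, train_y, test_y = train_test_split(sub_x, sub_y, test_size=0.2)
print((train_x.shape, test_x.shape, val_x.shape))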
Example No. 2
 def merge_similar_dataset(self, override=True):
     """
     Merge data set within similar range used for neural network training
     :param ks_pv=0.1: use KS statistics p_value to assess the distribution similarity of 2 data set.
     :param qq_slope_range=(0.85, 1.15): use slope of fitted regression line to assess the similarity of 2 data set.
             If ks_pv surpass the threshold and qq_slope is within defined range, merge the 2 data set
     :param override=True: override exists files
     :return: files containing clustered dfs and summary clustering info
     """
     # import intervals df within enterprise id range
     ids, train_files = get_ids_and_files_in_dir(self.input_dir, self.enter_id_range, input_file_regx=r"^(\d+)\.csv")
     _, itv_files = get_ids_and_files_in_dir(self.input_dir, self.enter_id_range, input_file_regx=r"(\d+)\.intervals\.csv")
     # check whether cluster files already exist in the output directory. If not, copy the first file in the list there as the initial cluster 0 file
     interval_dir = self.output_dir + "/interval"
     train_dir = self.output_dir + "/train"
     if not os.path.exists(interval_dir):
         os.makedirs(interval_dir)
     if not os.path.exists(train_dir):
         os.makedirs(train_dir)
     # record which enterprise ids make up each cluster
     cluster = {}
     if override:
         # remove all content in specified directory
         print "Remove old files..."
         for x in os.listdir(interval_dir):
             os.remove(interval_dir + "/" + x)
         for y in os.listdir(train_dir):
             os.remove(train_dir + "/" + y)
     if not os.listdir(interval_dir):
         first_id = ids.pop(0)
         cluster['0'] = np.array([int(first_id)])
         itv_src_file = self.input_dir + "/" + itv_files.pop(0)
         itv_dst_file = interval_dir + "/" + self.outfile_prefix + "-0.intervals.csv"
         train_src_file = self.input_dir + "/" + train_files.pop(0)
         train_dst_file = train_dir + "/" + self.outfile_prefix + "-0.csv"
         copyfile(itv_src_file, itv_dst_file)
         copyfile(train_src_file, train_dst_file)
     # read in sample interval file
     for samp_index, samp_file in enumerate(itv_files):
         samp_file = self.input_dir + "/" + samp_file
         df_samp = pd.read_csv(samp_file, header=None)
         # remove outliers
         print "Removing outliers in sample intervals..."
         df_samp_fil = percentile_remove_outlier(df_samp, 1, 1)
         df_samp_itv = df_samp_fil.iloc[:, 1]
         df_samp_itv.name = ids[samp_index]
         samp_id = ids[samp_index]
         # record slope of sample vs cluster Q-Q plot.
         # Merge sample training data to most similar cluster (slope in qq plot closest to 1).
         # If sample is distinct from all clusters, initialize it as a new cluster
         slope_to_1 = {}
         # read in cluster file
         cluster_interval_files = os.listdir(interval_dir)
         for ref_index, ref_file in enumerate(cluster_interval_files):
             # get cluster id
             id_match = re.match(r"{}-(\d+)\.intervals\.csv".format(self.outfile_prefix), ref_file)
             cluster_id = id_match.group(1)
             ref_file = interval_dir + "/" + ref_file
             df_clu = pd.read_csv(ref_file, header=None)
             print "Removing outliers in cluster intervals..."
             df_clu_fil = percentile_remove_outlier(df_clu, 1, 1)
             df_clu_itv = df_clu_fil.iloc[:, 1]
             df_clu_itv.name = cluster_id
             # get slope from qq plot
             qq_slope, qq_intercept, qq_error = self.qq_plot(df_samp_itv, df_clu_itv)
             ks_sta, p_value = ks_2samp(np.asarray(df_samp_itv), np.asarray(df_clu_itv))
             print "KS test results: ks-statistic: {}; p_value: {}".format(ks_sta, p_value)
             # check whether the current sample is similar to the current cluster based on the KS statistic and Q-Q slope.
             # If so, record the discrepancy error from the 45-degree line: sum((y_real - y_predicted)^2) / 2N.
             if self.qq_slope_range[0] <= qq_slope <= self.qq_slope_range[1]:
                 # if p_value > ks_pv:
                 if np.abs(qq_intercept) <= self.qq_intercept:
                     slope_to_1[cluster_id] = qq_error
                 else:
                     print "Purchase interval days distribution of enterprise {} is not similar enough with cluster {} (inconsistency)".format(samp_index, ref_index)
             else:
                 print "Purchase interval days distribution of enterprise {} is not similar enough with cluster {} (slope degree)".format(
                     samp_index, ref_index)
         # If no similar cluster exists, start a new one
         if not slope_to_1:
             new_cluster_id = len(cluster_interval_files)
             print "Data distribution of sample {} is apparently distinct from existed clusters. " \
                   "Assign it as a new cluster {}".format(samp_id, new_cluster_id)
             interval_src_file = "{}/{}.intervals.csv".format(self.input_dir, samp_id)
             interval_dst_file = "{}/{}-{}.intervals.csv".format(interval_dir, self.outfile_prefix, new_cluster_id)
             train_src_file = "{}/{}.csv".format(self.input_dir, samp_id)
             train_dst_file = "{}/{}-{}.csv".format(train_dir, self.outfile_prefix, new_cluster_id)
             copyfile(interval_src_file, interval_dst_file)
             copyfile(train_src_file, train_dst_file)
             # store cluster info
             cluster[str(new_cluster_id)] = np.array([int(samp_id)])
         else:
             # get the most similar cluster id. Merge current enterprise to cluster with minimum qq-error
             cluster_id = min(slope_to_1, key=slope_to_1.get)
             print "Data distribution of sample {} is most similar to cluster {}. " \
                   "Merge sample data with cluster data.".format(samp_id, cluster_id)
             interval_dst_file = "{}/{}-{}.intervals.csv".format(interval_dir, self.outfile_prefix, cluster_id)
             train_dst_file = "{}/{}-{}.csv".format(train_dir, self.outfile_prefix, cluster_id)
             # merge interval files (write without a header so later header=None reads stay consistent)
             df_clu_int = pd.read_csv(interval_dst_file, header=None)
             df_clu_int = pd.concat([df_clu_int, df_samp], axis=0)
             df_clu_int.to_csv(interval_dst_file, index=False, header=False)
             # merge train files
             df_train_samp = pd.read_csv(self.input_dir + "/" + train_files[samp_index])
             df_clu_train = pd.read_csv(train_dst_file)
             df_clu_train = pd.concat([df_clu_train, df_train_samp], axis=0)
             df_clu_train.to_csv(train_dst_file, index=False)
             # record the enterprise ids belonging to each cluster
             if cluster_id in cluster:
                 cluster[cluster_id] = np.append(cluster[cluster_id], samp_id)
             else:
                 cluster[cluster_id] = np.array([samp_id])
     # output cluster info
     print "Cluster Info: {}".format(cluster)
     df_cluster = pd.DataFrame.from_dict(cluster, orient='index')
     df_cluster = df_cluster.sort_index()
     df_cluster = df_cluster.transpose()
     cluster_info_file = self.output_dir + "/cluster-consists-info.csv"
     df_cluster.to_csv(cluster_info_file)
     print "Output each cluster consists (enterprise ids) to file: {}.".format(cluster_info_file)