def _select_enterprises(self):
    """Return the ids of enterprises that have enough frequent customers.

    The enterprise id list and the transaction records (starting at
    ``self.init_date``) are read through ``read_df_from_mysql_db``. For each
    enterprise, purchases are collapsed to at most one per customer per
    calendar day; a customer counts as frequent when the number of such
    purchase days is at least ``self.min_purchase_count``. An enterprise is
    kept when its number of frequent customers is at least
    ``self.customer_threshold``.

    :return: list of the enterprise ids that passed the filter, in the order
        they appear in the enterprise id column
    :raises ValueError: if the transaction data of an enterprise lacks the
        ``customer_id`` or ``create_time`` column
    """
    print("Scanning all enterprises transaction data to filter enterprises whose number of frequent customer reach the minimum threshold ...")

    # get enterprise_id list in the enterprise db
    print("Retrieving enterprise id list from enterprise table ...")
    # NOTE(review): only the first item produced by read_df_from_mysql_db is
    # consumed here; presumably the reader yields the whole result at once --
    # confirm against the helper.
    enterprises_id_df = next(read_df_from_mysql_db(localhost=self.localhost,
                                                   username=self.username,
                                                   password=self.password,
                                                   dbname=self.dbname,
                                                   tbname=self.enter_tbname,
                                                   enter_field=self.enter_field,
                                                   enterprise_id=self.enter_list,
                                                   fields="enterprise_id"))
    enterprises_trans_df = next(read_df_from_mysql_db(localhost=self.localhost,
                                                      username=self.username,
                                                      password=self.password,
                                                      dbname=self.dbname,
                                                      tbname=self.trans_tbname,
                                                      enter_field=self.enter_field,
                                                      enterprise_id=self.enter_list,
                                                      fields=["customer_id", "enterprise_id", "create_time"],
                                                      time_field="create_time",
                                                      start_time=self.init_date.strftime("%Y-%m-%d")))

    required_columns = ("customer_id", "create_time")

    # filter enterprises
    filter_enters = []
    for enterprise_id in enterprises_id_df.enterprise_id:
        print("Analyzing current enterprise: {}".format(enterprise_id))
        enter_df = enterprises_trans_df[enterprises_trans_df['enterprise_id'] == enterprise_id]

        # next loop if df is empty
        if len(enter_df.index) == 0:
            continue

        # Validate BEFORE touching the columns, with an explicit check rather
        # than an assert (asserts are stripped under -O). Kept inside the loop
        # so that inputs without any transactions still return an empty list
        # instead of raising.
        if any(column not in enter_df.columns for column in required_columns):
            raise ValueError("Input df must have customer_id header and create_time header!")

        # remove duplicates of a customer in same day: truncate the timestamp
        # to midnight, then drop repeated (customer, day) pairs. ``assign``
        # builds a new frame, so the filtered slice is never written to.
        purchase_days = enter_df.create_time.apply(
            lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0))
        enter_df = enter_df.assign(create_time=purchase_days)
        enter_df = enter_df.drop_duplicates(['customer_id', 'create_time'])

        # number of customers with enough distinct purchase days
        cus_count = (enter_df.customer_id.value_counts() >= self.min_purchase_count).sum()
        if cus_count >= self.customer_threshold:
            filter_enters.append(enterprise_id)
            print("enterprise {} satisfied: {} customers.".format(enterprise_id, cus_count))

    print("Analyzing enterprise done!")
    return filter_enters
def trainingset_generation(self, enterprise_id_list_file=None, fields=None, outdir=".", override=False):
    """Generate one training-set CSV (and one intervals CSV) per filtered enterprise.

    Steps performed:

    1. Make sure ``outdir`` exists.
    2. Obtain the filtered enterprise ids: when the id list file is missing
       (or ``override`` is set) ``self._select_enterprises()`` is run and its
       result is saved to that file; the ids are then always read back from
       the file.
    3. Read the transaction records starting at ``self.init_date``.
    4. For each enterprise, compute purchase time intervals, write the
       positive intervals to ``<outdir>/<enterprise>.intervals.csv``, then
       build sliding-window samples (columns ``X1..Xn``, ``Y``,
       ``customer_id``, ``enterprise_id``) per customer and write them to
       ``<outdir>/<enterprise>.csv``.

    :param enterprise_id_list_file: file listing the enterprises whose data
        meets the minimum requirement. If not provided,
        ``<outdir>/filtered_enterprise_id.txt`` is used
    :param fields: column headers to retrieve from the transaction table;
        defaults to ``["customer_id", "enterprise_id", "price", "create_time"]``
    :param outdir: output directory for the generated files
    :param override: re-generate existing files
    :return: None; the results are written to files in ``outdir``

    Note:
        1. Total transaction period should be larger than train_input_length + 1 (test_set_times)
        2. if init date is not current date, it should follow time format: yyyy-mm-dd
    """
    # Build the default here instead of in the signature so that no list
    # object is shared between calls.
    if fields is None:
        fields = ["customer_id", "enterprise_id", "price", "create_time"]

    print("Get training dataset of each enterprise...")

    # create output dir if not exists
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # get enterprise id list
    if not enterprise_id_list_file:
        enterprise_id_list_file = os.path.join(outdir, "filtered_enterprise_id.txt")

    if not os.path.exists(enterprise_id_list_file) or override:
        filter_enterprises = self._select_enterprises()
        # save filtered enterprise ids to file
        list2file(filter_enterprises, enterprise_id_list_file)

    # Always read the ids back from the file so that both code paths hand the
    # same representation to the loop below.
    filter_enterprises = file2list(enterprise_id_list_file)

    # get transaction df
    # NOTE(review): unlike _select_enterprises, no time_field is passed along
    # with start_time here -- confirm read_df_from_mysql_db has a suitable
    # default for it.
    trans_df = next(read_df_from_mysql_db(localhost=self.localhost,
                                          username=self.username,
                                          password=self.password,
                                          dbname=self.dbname,
                                          tbname=self.trans_tbname,
                                          enter_field=self.enter_field,
                                          enterprise_id=self.enter_list,
                                          fields=fields,
                                          start_time=self.init_date.strftime("%Y-%m-%d")))

    # Input column names are the same for every customer; build them once.
    x_cols = ['X' + str(x) for x in range(1, 1 + self.train_input_length)]

    for enterprise in filter_enterprises:
        outfile = os.path.join(outdir, str(enterprise) + ".csv")
        interval_file = os.path.join(outdir, str(enterprise) + ".intervals.csv")

        # override the existing file or not
        # NOTE(review): an existing intervals file skips the enterprise
        # entirely, even when its training-set file is missing -- confirm this
        # is the intended resume behaviour.
        if os.path.exists(interval_file) and not override:
            continue

        print("Retrieving transaction data of {} from transaction table".format(enterprise))
        enter_df = trans_df[trans_df['enterprise_id'] == int(enterprise)]

        # df with interval
        df_interval = self._calculate_time_interval(enter_df)

        # remove lines with time interval is 0 (when encounter new customers)
        # (.loc replaces the .ix indexer, which no longer exists in pandas >= 1.0)
        df_interval = df_interval.loc[df_interval.time_interval > 0, :]

        # output intervals data to file for later distribution assessment and data merging
        interval_output = df_interval.time_interval
        interval_output.to_csv(interval_file)

        if os.path.exists(outfile) and not override:
            continue

        print("Filtering customers whose purchase times meet the minimum threshold: {}".format(self.min_purchase_count))
        df_interval = self.check_transaction_data(df_interval, init_date=self.init_date)

        # get all unique customer_ids
        all_cus_ids = df_interval.customer_id.unique()

        print("Formating the dataset...")
        # Collect the per-customer frames and concatenate once at the end;
        # growing a DataFrame with concat inside the loop copies all previous
        # rows on every iteration.
        customer_frames = []
        for current_customer in all_cus_ids:
            dataset = np.asarray(df_interval.time_interval[df_interval.customer_id == current_customer])
            dataX, dataY = self._create_interval_dataset(dataset, look_back=self.train_input_length)

            dfX = pd.DataFrame(dataX, columns=x_cols)
            dfY = pd.DataFrame(dataY, columns=['Y'])
            dfY['customer_id'] = current_customer
            dfY['enterprise_id'] = enterprise
            customer_frames.append(pd.concat((dfX, dfY), axis=1))

        if customer_frames:
            # ignore_index gives the output a fresh 0..n-1 index
            df_cur_enter = pd.concat(customer_frames, axis=0, ignore_index=True)
        else:
            # pd.concat rejects an empty list; keep the original empty output
            df_cur_enter = pd.DataFrame()

        # output training dataset of current enterprise to output directory
        print("Output formated training dataset to file: {}".format(outfile))
        df_cur_enter.to_csv(outfile)

    print("End generation!")