def _load_p2adn(self):
    """Populate self.p2adn with the products table indexed by product_id.

    No-op if already loaded. Ensures the DB tables are available first.
    """
    if self.p2adn is not None:
        return
    self._load_db()
    products = self.pdDB.data['products']
    # set_index preserves length, so len(products) == len(self.p2adn)
    self.p2adn = products.set_index('product_id')
    print_mem_time("Loaded p2adn %d" % len(products))
def __init__(self, flags, tables=None, prob_dtype=False):
    """
    Input:
        tables: a list of namedtuples, which have attributes:
            name, fname, dtype
            name and fname are strings
            dtype is a {} column name -> data type e.g. 'order_id': numpy.int32
            set tables to None/[] if only some member functions are needed
            without loading data
        prob_dtype: if True, will detect optimal dtype automatically
            with additional time
    build a self.data {} fname -> pd data frame
    """
    # avoid the mutable-default-argument pitfall (shared [] across calls)
    if tables is None:
        tables = []
    print()
    self.flags = flags
    path = flags.data_path
    data = {}
    for table in tables:
        name = table.name
        fname = table.fname
        dtype = table.dtype
        # cache each csv as a pickle named after the table for fast reload
        pname = "%s/%s.pkl" % (path, name.split('/')[-1].split('.')[0])
        if os.path.exists(pname):
            data[name] = pd.read_pickle(pname)
        else:
            if len(dtype) == 0 and prob_dtype:
                dtype = self._get_dtype(fname)
            data[name] = pd.read_csv(fname, dtype=dtype)
            data[name].to_pickle(pname)
        print_mem_time("Loaded {} {}".format(
            fname.split('/')[-1], data[name].shape))
    self.data = data  # no copy, pass the reference
    print()
def _print_and_save(self, acc, vacc, tr_acc, va_acc, counter, last, epoch):
    """Fold the latest batch metrics into running averages, log every 10
    batches, checkpoint periodically, and return the updated state tuple
    (tr_acc, va_acc, counter, last)."""
    tr_acc = self._get_acc_loss(tr_acc, acc, ratio=0.99)
    batch_size = self.flags.batch_size
    metric = self.flags.metric
    counter += 1
    if counter == 1:
        print("\nFirst Train %s %.3f" % (metric, tr_acc))
    if counter % 10 == 0:
        samples = counter * batch_size
        if vacc == 0:
            # no validation metric available yet
            line = "Epoch %d Samples %d Train %s %.4f" % (
                epoch, samples, metric, tr_acc)
        else:
            va_acc = self._get_acc_loss(va_acc, vacc)
            line = "Epoch %d Samples %d Train %s %.4f Valid %s %.4f" % (
                epoch, samples, metric, tr_acc, metric, va_acc)
        print_mem_time(line)
    if self.flags.epochs is None:
        # epoch-less mode: checkpoint every 100 batches, tagging by counter
        if counter % 100 == 0:
            self.epoch = counter
            self._save()
    elif epoch > last:
        # checkpoint once per completed epoch
        last = epoch
        self.epoch = epoch
        self._save()
    return tr_acc, va_acc, counter, last
def predict_from_placeholder(self, activation=None):
    """Run inference over the test batch generator and hand each batch of
    predictions to self.write_pred; optionally apply `activation` to the
    logits first."""
    self._build()
    self._get_summary()
    if activation is not None:
        self.logit = self._activate(self.logit, activation)
    # truncate any previous prediction file
    open(self.flags.pred_path, 'w').close()
    count = 0
    with tf.Session() as sess:
        self.sess = sess
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        write_summary = bool(self.flags.log_path) and self.flags.visualize is not None
        if write_summary:
            summary_writer = tf.summary.FileWriter(self.flags.log_path,
                                                   sess.graph)
        for x, _, epoch in self._batch_gen_test():
            feed = {self.inputs: x, self.is_training: 0}
            if write_summary:
                summary, pred = sess.run([self.summ_op, self.logit],
                                         feed_dict=feed)
                summary_writer.add_summary(summary, count)
            else:
                pred = sess.run(self.logit, feed_dict=feed)
            count += 1
            if count % self.flags.verbosity == 0:
                print_mem_time("Epoch %d Batch %d " % (epoch, count))
            self.write_pred(pred)
def _load_u2o(self):
    """Load (and memoize in self.u2o) the user_id -> [order_id, ...] mapping.

    Built once from the orders table and cached as a pickle under
    flags.data_path; later calls read the pickle or return immediately.
    """
    # Must test `is not None`, not truthiness: once loaded, self.u2o is a
    # pandas Series, whose __bool__ raises "truth value ... is ambiguous",
    # so the original `if self.u2o:` crashed on any second call.
    if self.u2o is not None:
        return
    path = self.flags.data_path
    p = "%s/u2o.pkl" % path
    if not os.path.exists(p):
        self._load_db()
        u2o = self.pdDB.data['orders'].groupby(
            'user_id')['order_id'].apply(list)
        u2o.to_pickle(p)
    else:
        u2o = pd.read_pickle(p)
    self.u2o = u2o
    print_mem_time("Loaded u2o %d" % len(u2o))
def _write_user_tfrecord(self):
    """Serialize every user's order history into users.tfrecords, one User
    proto per user; also dump TEST_UID's proto to testuser.pb for debugging.

    Skips entirely if the output file already exists.
    """
    outpath = "%s/users.tfrecords" % self.flags.record_path
    if os.path.exists(outpath):
        print("%s exists." % outpath)
        return
    self._load_u2o()  # get u2o, o2p and p2adn
    self._load_o2p()
    self._load_db(files=["orders"])
    # drop=0 keeps order_id both as the index (for .loc) and as a column
    orders = self.pdDB.data["orders"].set_index('order_id', drop=0)
    writer = tf.python_io.TFRecordWriter(outpath)
    i = 0
    for uid, oids in self.u2o.iteritems():
        user = User()
        user.uid = uid
        # chronological order of this user's orders
        ordered_orders = orders.loc[oids].sort_values('order_number')
        for oid, orow in ordered_orders.iterrows():
            test = orow.eval_set == 'test'
            if test:
                user.test = True
                order = user.testorder
            else:
                order = user.orders.add()
            order.orderid = oid
            order.nth = orow.order_number
            order.dow = orow.order_dow
            order.hour = orow.order_hour_of_day
            days = orow.days_since_prior_order
            # first order has NaN days_since_prior_order; leave field unset
            if not pd.isnull(days):
                order.days_since_prior = int(days)
            # If this is a test order, products stays empty: we don't know
            # which products are in it.
            if not test:
                order.products.extend(self.o2p.loc[oid])
        writer.write(user.SerializeToString())
        if uid == TEST_UID:
            print("Writing uid {} to testuser.pb".format(uid))
            with open('%s/testuser.pb' % self.flags.record_path, 'wb') as f:
                f.write(user.SerializeToString())
        i += 1
        if i % LOG_EVERY == 0:
            print_mem_time("{} users written".format(i))
    writer.close()
def _load_o2p(self):
    """Load (and memoize in self.o2p) the order_id -> [product_id, ...] map,
    with products listed in add-to-cart order.

    Built once from op_prior + op_train and cached as a pickle under
    flags.data_path.
    """
    # `is not None`, not truthiness: self.o2p becomes a pandas Series after
    # the first load, and Series.__bool__ raises ValueError.
    if self.o2p is not None:
        return
    path = self.flags.data_path
    p = "%s/o2p.pkl" % path
    if not os.path.exists(p):
        self._load_db()
        ops = self.pdDB.data['op_prior']
        ops = ops.append(self.pdDB.data['op_train'])
        o2p = ops.sort_values(['order_id', 'add_to_cart_order'])\
            .groupby('order_id')['product_id'].apply(list)
        o2p.to_pickle(p)
    else:
        o2p = pd.read_pickle(p)
    self.o2p = o2p
    print_mem_time("Loaded o2p %d" % len(o2p))
def train_from_placeholder(self, va=False):
    """Feed-dict training loop: build the graph, then run optimizer steps,
    logging a running average loss every 100 batches; if `va`, also run the
    full validation generator every 100 batches. Saves once per new epoch
    and once at the end."""
    labels = tf.placeholder(tf.float32, shape=(None, None))
    self._build()
    self._get_loss(labels)
    self._get_opt()
    self._get_summary()
    with tf.Session() as sess:
        self.sess = sess
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        self._restore()
        write_summary = bool(self.flags.log_path) and self.flags.visualize is not None
        if write_summary:
            summary_writer = tf.summary.FileWriter(self.flags.log_path,
                                                   sess.graph)
        count = 0
        ave_loss = 0
        self.epoch = 0
        for x, y, epoch in self._batch_gen():
            feed = {self.inputs: x, labels: y}
            if write_summary:
                summary, _, loss = sess.run(
                    [self.summ_op, self.opt_op, self.loss], feed_dict=feed)
                summary_writer.add_summary(summary, count)
            else:
                _, loss = sess.run([self.opt_op, self.loss], feed_dict=feed)
            if count == 0:
                print("First loss", loss)
            count += 1
            ave_loss = self._update_ave_loss(ave_loss, loss)
            if count % 100 == 0:
                print_mem_time("Epoch %d Batch %d ave loss %.3f"
                               % (epoch, count, ave_loss))
            if va and count % 100 == 0:
                losses = [sess.run(self.loss,
                                   feed_dict={self.inputs: vx, labels: vy})
                          for vx, vy, _ in self._batch_gen_va()]
                print("Ave validation loss {}".format(np.mean(losses)))
            if epoch > self.epoch:
                self._save()
                self.epoch = epoch
        self._save()
def _write_train_tfrecord(self, max_prods):
    """Write GZIP-compressed SequenceExamples for every training user to
    train.tfrecords, reading users from users.tfrecords.

    Skips entirely if the output file already exists.
    """
    outpath = "%s/train.tfrecords" % self.flags.record_path
    if os.path.exists(outpath):
        print("%s exists." % outpath)
        return
    path = "%s/users.tfrecords" % self.flags.record_path
    ctype = getattr(tf.python_io.TFRecordCompressionType, "GZIP")
    writer_options = tf.python_io.TFRecordOptions(compression_type=ctype)
    writer = tf.python_io.TFRecordWriter(outpath, options=writer_options)
    ces = 0
    try:
        for cu, user in enumerate(
                self._iterate_wrapped_users(path, mode="train")):
            # Count examples directly: the original `ces += ce` used the
            # last 0-based enumerate index (undercounting by one per user)
            # and raised NameError when a user yielded no examples.
            for example in self._get_user_sequence_examples(
                    user, max_prods=max_prods):
                writer.write(example.SerializeToString())
                ces += 1
            if cu > 0 and cu % 100 == 0:
                print_mem_time("%d users %d samples" % (cu, ces))
    finally:
        # the writer was previously never closed, risking a truncated file
        writer.close()
def train_from_placeholder(self, va=False):
    """Feed-dict training loop driven by self._run_train; logs average loss
    every `verbosity` batches, optionally validates (on the same cadence or
    at each new epoch), saves every `save_epochs` epochs and once at the
    end."""
    if self.labels is None:
        self.labels = tf.placeholder(tf.float32, shape=(None, None))
    self._build()
    self._get_loss(self.labels)
    self._get_opt()
    self._get_summary()
    verbosity = self.flags.verbosity
    with tf.Session() as sess:
        self.sess = sess
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        self._restore()
        if self.flags.log_path and self.flags.visualize is not None:
            self.summary_writer = tf.summary.FileWriter(self.flags.log_path,
                                                        sess.graph)
        count = 0
        ave_loss = 0
        self.epoch = 0
        for x, y, epoch in self._batch_gen():
            loss = self._run_train(x, y)
            if count == 0:
                print("First loss", loss)
            count += 1
            ave_loss = self._update_ave_loss(ave_loss, loss)
            if count % verbosity == 0:
                print_mem_time("Epoch %d Batch %d ave loss %.3f"
                               % (epoch, count, ave_loss))
            entered_new_epoch = epoch > self.epoch
            if va and (count % verbosity == 0 or entered_new_epoch):
                self.eval_va()
            if entered_new_epoch:
                self.epoch = epoch
                if epoch % self.flags.save_epochs == 0:
                    self._save()
        self.epoch = self.flags.epochs
        self._save()
def predictPL(self):
    """Predict from an image placeholder, saving each batch's predictions
    and file names as "<pred_path>/<counter>.npy"; logs roughly every 10
    batches."""
    B = self.flags.batch_size
    W, H, C = self.flags.width, self.flags.height, self.flags.color
    inputs = tf.placeholder(dtype=tf.float32, shape=[None, H, W, C])
    self._build(inputs, resize=False)
    counter = 0
    with tf.Session() as sess:
        self.sess = sess
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        for imgs, imgnames in self.DATA.test_generator():
            pred = sess.run(self.logit, feed_dict={inputs: imgs})
            np.save("%s/%d.npy" % (self.flags.pred_path, counter), {
                "pred": pred,
                "name": imgnames
            })
            counter += len(imgs)
            # `//` not `/`: under Python 3 true division `counter / B` is a
            # float, so after any partial batch the test could never be 0
            # again and logging silently stopped (Python-2 leftover).
            if counter // B % 10 == 0:
                print_mem_time("%d images predicted" % counter)
def _build(self, flags, files):
    """Load the requested raw tables into self.data (csv on first run,
    pickle cache afterwards); if the training_variants table is present,
    also build one-hot labels into self.y.

    files: "all", or a container of table names to load.
    """
    fnames, names = self.fnames, self.names
    path = self.path
    Table = namedtuple('Table', 'name fname dtype')
    tables = [Table(n, "%s/%s" % (path, f), {})
              for n, f in zip(names, fnames)
              if files == "all" or n in files]
    print()
    self.flags = flags
    path = flags.data_path
    data = {}
    for table in tables:
        name, fname = table.name, table.fname
        # cache key includes self.name to avoid collisions across datasets
        pname = "%s/%s_%s.pkl" % (path, self.name,
                                  name.split('/')[-1].split('.')[0])
        if os.path.exists(pname):
            data[name] = pd.read_pickle(pname)
        else:
            if '_text' in name:
                # text files are "||"-delimited with a header row to skip
                data[name] = pd.read_csv(fname,
                                         header=None,
                                         sep="\|\|",
                                         skiprows=1,
                                         names=['ID', 'Text'])
            else:
                data[name] = pd.read_csv(fname)
            data[name].to_pickle(pname)
        print_mem_time("Loaded {} {}".format(
            fname.split('/')[-1], data[name].shape))
    self.data = data  # no copy, pass the reference
    if "training_variants" in self.data:
        # classes are 1-based in the csv; shift to 0-based before encoding
        y = self.data["training_variants"]['Class'] - 1
        from utils.np_utils.encoder import onehot_encode
        self.y = onehot_encode(y, self.flags.classes)
    print()
def predict_from_placeholder(self):
    """Sequentially run inference over all batches and append each batch of
    predictions as csv rows to flags.pred_path."""
    self._build()
    self._get_summary()
    # start from an empty prediction file
    open(self.flags.pred_path, 'w').close()
    count = 0
    with tf.Session() as sess:
        self.sess = sess
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        write_summary = bool(self.flags.log_path) and self.flags.visualize is not None
        if write_summary:
            summary_writer = tf.summary.FileWriter(self.flags.log_path,
                                                   sess.graph)
        for x, _, epoch in self._batch_gen(sequential=True):
            if write_summary:
                summary, pred = sess.run([self.summ_op, self.logit],
                                         feed_dict={self.inputs: x})
                summary_writer.add_summary(summary, count)
            else:
                pred = sess.run(self.logit, feed_dict={self.inputs: x})
            count += 1
            if count % 100 == 0:
                print_mem_time("Epoch %d Batch %d " % (epoch, count))
            with open(self.flags.pred_path, 'a') as f:
                pd.DataFrame(pred).to_csv(f,
                                          header=False,
                                          index=False,
                                          float_format='%.5f')