Exemplo n.º 1
0
 def _load_p2adn(self):
     """Lazily cache the products table indexed by product_id.

     No-op when ``self.p2adn`` is already populated; otherwise loads
     the database and stores the 'products' frame re-indexed by
     product_id on ``self.p2adn``.
     """
     if self.p2adn is None:
         self._load_db()
         products = self.pdDB.data['products']
         self.p2adn = products.set_index('product_id')
         print_mem_time("Loaded p2adn %d" % len(products))
Exemplo n.º 2
0
 def __init__(self, flags, tables=(), prob_dtype=False):
     """Build ``self.data``, a dict mapping table name -> pandas DataFrame.

     Input:
         flags: config object; only ``flags.data_path`` is read here.
         tables: iterable of namedtuples with attributes
             name, fname (strings) and dtype (a dict: column name ->
             numpy data type, e.g. 'order_id': numpy.int32).
             Pass an empty iterable if only some member functions
             are needed without loading data.
         prob_dtype: if True and a table declares no dtype, detect an
             optimal dtype automatically (costs additional time).

     Each table is cached as a pickle under ``flags.data_path``; on
     later runs the pickle is preferred over re-parsing the CSV.
     """
     # NOTE: default changed from the mutable `tables=[]` to an empty
     # tuple — same behavior for callers, no shared-default pitfall.
     print()
     self.flags = flags
     path = flags.data_path
     data = {}
     for table in tables:
         name = table.name
         fname = table.fname
         dtype = table.dtype
         # Pickle cache keyed by the table's base name (dir and
         # extension stripped).
         pname = "%s/%s.pkl" % (path, name.split('/')[-1].split('.')[0])
         if os.path.exists(pname):
             data[name] = pd.read_pickle(pname)
         else:
             if len(dtype) == 0 and prob_dtype:
                 dtype = self._get_dtype(fname)
             data[name] = pd.read_csv(fname, dtype=dtype)
             data[name].to_pickle(pname)
         print_mem_time("Loaded {} {}".format(
             fname.split('/')[-1], data[name].shape))
     self.data = data  # no copy, pass the reference
     print()
Exemplo n.º 3
0
    def _print_and_save(self, acc, vacc, tr_acc, va_acc, counter, last, epoch):
        """Update smoothed metrics, log every 10 batches, checkpoint.

        acc/vacc are the current batch train/valid metric values;
        tr_acc/va_acc are the running smoothed values.  Returns the
        updated (tr_acc, va_acc, counter, last) loop state.
        """
        tr_acc = self._get_acc_loss(tr_acc, acc, ratio=0.99)
        batch_size = self.flags.batch_size
        counter += 1
        if counter == 1:
            print("\nFirst Train %s %.3f" % (self.flags.metric, tr_acc))
        if counter % 10 == 0:
            # Build the log line; append validation metrics only when a
            # validation value was supplied (vacc == 0 means "none").
            line = "Epoch %d Samples %d Train %s %.4f" % (
                epoch, counter * batch_size, self.flags.metric, tr_acc)
            if vacc != 0:
                va_acc = self._get_acc_loss(va_acc, vacc)
                line += " Valid %s %.4f" % (self.flags.metric, va_acc)
            print_mem_time(line)
        # Checkpoint policy: every 100 batches when no epoch budget is
        # configured, otherwise once at each epoch boundary.
        if self.flags.epochs is None:
            if counter % 100 == 0:
                self.epoch = counter
                self._save()
        elif epoch > last:
            last = epoch
            self.epoch = epoch
            self._save()
        return tr_acc, va_acc, counter, last
Exemplo n.º 4
0
 def predict_from_placeholder(self, activation=None):
     """Run inference over the test batch generator and persist predictions.

     activation: optional name/spec forwarded to self._activate to
         post-process self.logit (e.g. apply a final non-linearity);
         left untouched when None.

     Side effects: truncates flags.pred_path, then self.write_pred is
     called once per batch to append predictions; optionally writes TF
     summaries to flags.log_path.
     """
     self._build()
     self._get_summary()
     if activation is not None:
         self.logit = self._activate(self.logit, activation)
     # Truncate the prediction file so later appends start clean.
     with open(self.flags.pred_path, 'w') as f:
         pass
     count = 0
     with tf.Session() as sess:
         self.sess = sess
         sess.run(tf.global_variables_initializer())
         sess.run(tf.local_variables_initializer())
         # Summaries are only wired up when both logging and
         # visualization are configured.
         if self.flags.log_path and self.flags.visualize is not None:
             summary_writer = tf.summary.FileWriter(self.flags.log_path,
                                                    sess.graph)
         for batch in self._batch_gen_test():
             x, _, epoch = batch
             if self.flags.log_path and self.flags.visualize is not None:
                 summary, pred = sess.run([self.summ_op, self.logit],
                                          feed_dict={
                                              self.inputs: x,
                                              # inference mode: disable
                                              # training-only behavior
                                              self.is_training: 0
                                          })
                 summary_writer.add_summary(summary, count)
             else:
                 pred = sess.run(self.logit,
                                 feed_dict={
                                     self.inputs: x,
                                     self.is_training: 0
                                 })
             count += 1
             if count % self.flags.verbosity == 0:
                 print_mem_time("Epoch %d Batch %d " % (epoch, count))
             self.write_pred(pred)
Exemplo n.º 5
0
 def _load_u2o(self):
     """Lazily load the user_id -> [order_id, ...] map into self.u2o.

     Built from the orders table on first use and cached as a pickle
     under flags.data_path; later calls read the pickle.
     """
     # NOTE: was `if self.u2o:` — truth-testing a non-empty pandas
     # Series raises ValueError, so a second call would crash.  Test
     # identity with None instead (consistent with _load_p2adn).
     if self.u2o is not None:
         return
     path = self.flags.data_path
     p = "%s/u2o.pkl" % path
     if not os.path.exists(p):
         self._load_db()
         u2o = self.pdDB.data['orders'].groupby(
             'user_id')['order_id'].apply(list)
         u2o.to_pickle(p)
     else:
         u2o = pd.read_pickle(p)
     self.u2o = u2o
     print_mem_time("Loaded u2o %d" % len(u2o))
Exemplo n.º 6
0
 def _write_user_tfrecord(self):
     """Serialize every user's ordered order history to users.tfrecords.

     Skips all work when the output file already exists.  Requires the
     u2o (user -> orders) and o2p (order -> products) maps plus the
     orders table; one User protobuf is written per user.
     """
     outpath = "%s/users.tfrecords" % self.flags.record_path
     if os.path.exists(outpath):
         print("%s exists." % outpath)
         return
     self._load_u2o()  # get u2o, o2p and p2adn
     self._load_o2p()
     self._load_db(files=["orders"])
     # Keep order_id both as the index (for .loc lookups) and as a
     # column (drop=0).
     orders = self.pdDB.data["orders"].set_index('order_id', drop=0)
     writer = tf.python_io.TFRecordWriter(outpath)
     i = 0
     # Series.items() replaces iteritems(), which was removed in
     # pandas 2.0; both iterate (index, value) pairs.
     for uid, oids in self.u2o.items():
         user = User()
         user.uid = uid
         ordered_orders = orders.loc[oids].sort_values('order_number')
         for oid, orow in ordered_orders.iterrows():
             test = orow.eval_set == 'test'
             if test:
                 user.test = True
                 order = user.testorder
             else:
                 # Append a fresh order message to the user's repeated
                 # orders field.
                 order = user.orders.add()
             order.orderid = oid
             order.nth = orow.order_number
             order.dow = orow.order_dow
             order.hour = orow.order_hour_of_day
             days = orow.days_since_prior_order
             if not pd.isnull(days):
                 order.days_since_prior = int(days)
             # If this is a test order, products stays empty: we don't
             # know which products are in this order.
             if not test:
                 order.products.extend(self.o2p.loc[oid])
         writer.write(user.SerializeToString())
         if uid == TEST_UID:
             print("Writing uid {} to testuser.pb".format(uid))
             with open('%s/testuser.pb' % self.flags.record_path,
                       'wb') as f:
                 f.write(user.SerializeToString())
         i += 1
         if i % LOG_EVERY == 0:
             print_mem_time("{} users written".format(i))
     writer.close()
Exemplo n.º 7
0
 def _load_o2p(self):
     """Lazily load the order_id -> [product_id, ...] map into self.o2p.

     Built from the prior + train order-product tables, ordered by
     add_to_cart_order, and cached as a pickle under flags.data_path.
     """
     # NOTE: was `if self.o2p:` — truth-testing a non-empty pandas
     # Series raises ValueError, so a second call would crash.  Test
     # identity with None instead (consistent with _load_p2adn).
     if self.o2p is not None:
         return
     path = self.flags.data_path
     p = "%s/o2p.pkl" % path
     if not os.path.exists(p):
         self._load_db()
         # DataFrame.append was removed in pandas 2.0; concat is the
         # drop-in equivalent.
         ops = pd.concat([self.pdDB.data['op_prior'],
                          self.pdDB.data['op_train']])
         o2p = ops.sort_values(['order_id', 'add_to_cart_order'])\
             .groupby('order_id')['product_id'].apply(list)
         o2p.to_pickle(p)
     else:
         o2p = pd.read_pickle(p)
     self.o2p = o2p
     print_mem_time("Loaded o2p %d" % len(o2p))
Exemplo n.º 8
0
    def train_from_placeholder(self, va=False):
        """Train via a feed_dict loop over self._batch_gen().

        va: when True, run a full pass over the validation generator
            every 100 batches and print the mean validation loss.

        Side effects: restores any existing checkpoint, saves at each
        epoch boundary and once more after the loop, and optionally
        writes TF summaries to flags.log_path.
        """
        # Labels placeholder; shape (batch, classes) both unknown.
        labels = tf.placeholder(tf.float32, shape=(None,None))
        self._build()
        self._get_loss(labels)
        self._get_opt()
        self._get_summary()

        with tf.Session() as sess:
            self.sess = sess
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            self._restore()
            if self.flags.log_path and self.flags.visualize is not None:
                summary_writer = tf.summary.FileWriter(self.flags.log_path, sess.graph)
            count = 0
            ave_loss = 0
            self.epoch = 0
            for batch in self._batch_gen():
                x,y,epoch = batch
                if self.flags.log_path and self.flags.visualize is not None:
                    summary,_,loss = sess.run([self.summ_op,self.opt_op,self.loss],feed_dict={self.inputs:x,labels:y})
                    summary_writer.add_summary(summary, count)
                else:
                    _,loss = sess.run([self.opt_op,self.loss],feed_dict={self.inputs:x,labels:y})
                if count==0:
                    print("First loss",loss)
                count+=1
                # Exponentially smoothed loss for readable logging.
                ave_loss = self._update_ave_loss(ave_loss,loss)
                if count%100 == 0:
                    print_mem_time("Epoch %d Batch %d ave loss %.3f"%(epoch,count,ave_loss))
                if va and count%100 == 0:
                    losses = []
                    for x,y,_  in self._batch_gen_va():
                        loss = sess.run(self.loss,feed_dict={self.inputs:x,labels:y})
                        losses.append(loss)
                    print("Ave validation loss {}".format(np.mean(losses)))
                # Checkpoint once per epoch boundary.
                if epoch>self.epoch:
                    self._save()
                    self.epoch = epoch
            self._save()
Exemplo n.º 9
0
 def _write_train_tfrecord(self, max_prods):
     """Write per-user sequence examples to a gzipped train.tfrecords.

     max_prods: cap on products per example, forwarded to
         self._get_user_sequence_examples.

     Skips all work when the output file already exists.  Reads users
     from users.tfrecords and logs progress every 100 users.
     """
     outpath = "%s/train.tfrecords" % self.flags.record_path
     if os.path.exists(outpath):
         print("%s exists." % outpath)
         return
     path = "%s/users.tfrecords" % self.flags.record_path
     ctype = getattr(tf.python_io.TFRecordCompressionType, "GZIP")
     writer_options = tf.python_io.TFRecordOptions(compression_type=ctype)
     writer = tf.python_io.TFRecordWriter(outpath, options=writer_options)
     ces = 0
     try:
         for cu, user in enumerate(
                 self._iterate_wrapped_users(path, mode="train")):
             # Count examples explicitly: the old `ces += ce` added the
             # last enumerate index (off by one) and raised
             # UnboundLocalError for users yielding zero examples.
             n_examples = 0
             for example in self._get_user_sequence_examples(
                     user, max_prods=max_prods):
                 writer.write(example.SerializeToString())
                 n_examples += 1
             ces += n_examples
             if cu > 0 and cu % 100 == 0:
                 print_mem_time("%d users %d samples" % (cu, ces))
     finally:
         # The writer was previously leaked; close it so the record
         # file is flushed (matches _write_user_tfrecord).
         writer.close()
Exemplo n.º 10
0
    def train_from_placeholder(self, va=False):
        """Train via a feed_dict loop over self._batch_gen().

        va: when True, evaluate on the validation set every
            flags.verbosity batches and at each epoch boundary.

        Side effects: restores any existing checkpoint, saves every
        flags.save_epochs epochs and once more after the loop, and
        optionally writes TF summaries to flags.log_path.
        """
        # Reuse an externally supplied labels placeholder if present.
        if self.labels is None:
            self.labels = tf.placeholder(tf.float32, shape=(None, None))
        self._build()
        self._get_loss(self.labels)
        self._get_opt()
        self._get_summary()
        ve = self.flags.verbosity

        with tf.Session() as sess:
            self.sess = sess
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            self._restore()
            if self.flags.log_path and self.flags.visualize is not None:
                self.summary_writer = tf.summary.FileWriter(
                    self.flags.log_path, sess.graph)
            count = 0
            ave_loss = 0
            self.epoch = 0
            for batch in self._batch_gen():
                x, y, epoch = batch
                loss = self._run_train(x, y)
                if count == 0:
                    print("First loss", loss)
                count += 1
                # Exponentially smoothed loss for readable logging.
                ave_loss = self._update_ave_loss(ave_loss, loss)
                if count % (ve) == 0:
                    print_mem_time("Epoch %d Batch %d ave loss %.3f" %
                                   (epoch, count, ave_loss))
                if va and (count % ve == 0 or epoch > self.epoch):
                    self.eval_va()
                if epoch > self.epoch:
                    self.epoch = epoch
                    # Checkpoint only on configured epoch multiples.
                    if epoch % self.flags.save_epochs == 0:
                        self._save()
            # Final save tagged with the configured epoch budget.
            self.epoch = self.flags.epochs
            self._save()
Exemplo n.º 11
0
    def predictPL(self):
        """Predict over the test generator, saving one .npy per batch.

        Each saved dict holds 'pred' (logits) and 'name' (the image
        names of the batch) under "<pred_path>/<counter>.npy", where
        counter is the number of images seen before the batch.
        """
        B = self.flags.batch_size
        W, H, C = self.flags.width, self.flags.height, self.flags.color
        inputs = tf.placeholder(dtype=tf.float32, shape=[None, H, W, C])

        self._build(inputs, resize=False)
        counter = 0
        with tf.Session() as sess:
            self.sess = sess
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            for imgs, imgnames in self.DATA.test_generator():
                pred = sess.run(self.logit, feed_dict={inputs: imgs})
                np.save("%s/%d.npy" % (self.flags.pred_path, counter), {
                    "pred": pred,
                    "name": imgnames
                })
                counter += len(imgs)
                # Log every ~10 batches.  Was `counter / B % 10 == 0`:
                # true division on Python 3 yields floats, so the check
                # effectively never fired once batches were uneven.
                if counter // B % 10 == 0:
                    print_mem_time("%d images predicted" % counter)
Exemplo n.º 12
0
    def _build(self, flags, files):
        """Load the requested tables into ``self.data`` with pickle caching.

        flags: config object; flags.data_path hosts the pickle cache
            and flags.classes sizes the one-hot label encoding.
        files: "all" or a collection of table names to load.

        Tables whose name contains '_text' are parsed with the regex
        separator '||'; others as plain CSV.  When the
        "training_variants" table is loaded, also builds one-hot
        labels in ``self.y`` from its 'Class' column.
        """
        fnames, names = self.fnames, self.names
        path = self.path
        Table = namedtuple('Table', 'name fname dtype')
        tables = [
            Table(i, "%s/%s" % (path, j), {}) for i, j in zip(names, fnames)
            if files == "all" or i in files
        ]

        print()
        self.flags = flags
        path = flags.data_path
        data = {}
        for table in tables:
            name, fname, dtype = table.name, table.fname, table.dtype
            pname = "%s/%s_%s.pkl" % (path, self.name,
                                      name.split('/')[-1].split('.')[0])
            if os.path.exists(pname):
                data[name] = pd.read_pickle(pname)
            else:
                if '_text' in name:
                    # Raw string: "\|" is an invalid escape in a plain
                    # literal (a SyntaxWarning/error on new Pythons);
                    # the separator is the regex for a literal '||'.
                    data[name] = pd.read_csv(fname,
                                             header=None,
                                             sep=r"\|\|",
                                             skiprows=1,
                                             names=['ID', 'Text'])
                else:
                    data[name] = pd.read_csv(fname)
                data[name].to_pickle(pname)
            print_mem_time("Loaded {} {}".format(
                fname.split('/')[-1], data[name].shape))
        self.data = data  # no copy, pass the reference
        if "training_variants" in self.data:
            # Classes are 1-based in the CSV; shift to 0-based labels.
            y = self.data["training_variants"]['Class'] - 1
            from utils.np_utils.encoder import onehot_encode
            self.y = onehot_encode(y, self.flags.classes)
        print()
Exemplo n.º 13
0
 def predict_from_placeholder(self):
     """Run inference over sequential batches, appending CSV predictions.

     Side effects: truncates flags.pred_path, then appends one CSV
     chunk per batch (no header/index, 5-decimal floats); optionally
     writes TF summaries to flags.log_path.
     """
     self._build()
     self._get_summary()
     # Truncate the prediction file so the per-batch appends below
     # start from an empty file.
     with open(self.flags.pred_path,'w') as f:
         pass
     count = 0
     with tf.Session() as sess:
         self.sess = sess
         sess.run(tf.global_variables_initializer())
         sess.run(tf.local_variables_initializer())
         if self.flags.log_path and self.flags.visualize is not None:
             summary_writer = tf.summary.FileWriter(self.flags.log_path, sess.graph)
         # sequential=True: batches must arrive in order since rows are
         # appended to a single output file.
         for batch in self._batch_gen(sequential=True):
             x,_,epoch = batch
             if self.flags.log_path and self.flags.visualize is not None:
                 summary,pred = sess.run([self.summ_op,self.logit],feed_dict={self.inputs:x})
                 summary_writer.add_summary(summary, count)
             else:
                 pred = sess.run(self.logit,feed_dict={self.inputs:x})
             count+=1
             if count%100 == 0:
                 print_mem_time("Epoch %d Batch %d "%(epoch,count))
             with open(self.flags.pred_path,'a') as f:
                 pd.DataFrame(pred).to_csv(f, header=False,index=False, float_format='%.5f')