Example No. 1
from datetime import datetime
from os import listdir

import pandas as pd

import submiter  # project-local upload helper used throughout these examples


def submit_partial_merge(base, folder, all_blended=False):
  root_path = '/home/workspace/checkins'
  folder = "%s/submit/%s" % (root_path, folder)
  stamp = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
  output = "%s/submit/treva_overwrite_%s_all_blended_%s.csv" % (root_path, stamp, all_blended)

  if all_blended:
    tfiles = [f for f in listdir(folder) if 'blend' in f]
  else:
    tfiles = [f for f in listdir(folder) if 'blend' not in f]

  # # remove old batch
  # print("tfiles before removing old batch: %i" % len(tfiles))
  # old_partials = [f for f in listdir(root_path + "/submit/treva_merge")]
  # tfiles = [f for f in tfiles if f not in old_partials]
  # print("tfiles after removing old batch: %i" % len(tfiles))

  # concat and merge
  df_treva = [pd.read_csv("%s/%s" % (folder, f)) for f in tfiles]
  df_treva = pd.concat(df_treva).sort_values(by='row_id')
  df_base = pd.read_csv("%s/data/submits/%s" % (root_path, base))

  df_base = df_base[~df_base.row_id.isin(df_treva.row_id.values)]
  df_overwrite = pd.concat([df_base, df_treva]).sort_values(by='row_id')
  df_overwrite[['row_id', 'place_id']].sort_values(by='row_id').to_csv(output, index=False)
  print("ensure dim:", len(df_treva), len(set(df_treva.row_id.values)), len(set(df_overwrite.row_id.values)))
  print("overwrite output written in %s @ %s" % (output, datetime.now()))
  submiter.submiter().submit(entry=output, message="treva submit_partial_merge with %s and all_blended=%s" % (base, all_blended))
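
The overwrite-merge above is just "drop the re-predicted rows from the base file, then append the partial results". A minimal sketch of the same pattern on toy frames (all values made up):

import pandas as pd

# base submission covers rows 0-3; a partial run re-predicted rows 1 and 2
df_base = pd.DataFrame({'row_id': [0, 1, 2, 3], 'place_id': ['a', 'b', 'c', 'd']})
df_part = pd.DataFrame({'row_id': [1, 2], 'place_id': ['B', 'C']})

df_out = pd.concat([df_base[~df_base.row_id.isin(df_part.row_id)], df_part])
print(df_out.sort_values(by='row_id'))  # rows 1 and 2 now read B and C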
Example No. 2
import time
from datetime import datetime

import pandas as pd

import submiter  # project-local; `process_all` is also defined in this project


def main(stamp, x_step, y_step, do_submit=False, base=None):
    start_time = time.time()
    timestamp = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
    path = '/home/workspace/checkins/submit/%s' % stamp

    score_all, df_all = process_all(path=path,
                                    size=10.0,
                                    x_step=x_step,
                                    y_step=y_step)
    score_avg = sum([v * l
                     for v, l in score_all]) / sum([l for v, l in score_all])
    print("score_avg: ", score_avg)

    sub_fname = '/home/workspace/checkins/submit/%s_%s_cv%.4f.csv' % (
        stamp, timestamp, score_avg)
    df_all['place_id'] = [
        " ".join([str(k) for k in l])
        for l in df_all[[0, 1, 2]].values.tolist()
    ]

    if base:
        df_base = pd.read_csv("./data/submits/%s" % (base))
        df_base = df_base[~df_base.row_id.isin(df_all.row_id.values)]
        df_all = pd.concat([df_base, df_all]).sort_values(by='row_id')

    df_all[['row_id', 'place_id']].to_csv(sub_fname, index=False)
    print("submit file written in %s" % sub_fname)
    if do_submit:
        submiter.submiter().submit(entry=sub_fname,
                                   message="%s_%s_cv%s" %
                                   (stamp, timestamp, score_avg))
    print("[Finish!] Elapsed %.1f secs" % (time.time() - start_time))
Example No. 3
  # requires: time, pickle, and the project-local submiter module
  def train_alg(self, alg, keep_model=False, submit=False, upload=False, mdl_config=None):
    mdl_config = mdl_config or {}  # avoid sharing one mutable default dict across calls
    # get data
    start_time = time.time()
    norm = self.params.get('norm')
    df_train, df_valid, df_test = self.pas.get_data()
      
    # train & test
    print("[train_alg]: alg=%s, mdl_config=%s" % (alg, mdl_config))
    self.tra.train(df_train, alg=alg, mdl_config=mdl_config, norm=norm)
    train_score, valid_score = 0, 0
    if self.params['size'] <= 0.5:  # eva.train only when dev.
      _, train_score = self.eva.evaluate(df_train, title='Eva.Train', norm=norm)
    if len(df_valid) > 0:
      valids_total, valid_score = self.eva.evaluate(df_valid, title='Eva.Test', norm=norm)
      pickle.dump([valids_total, df_valid], open("%s/valid/valid_%s.pkl" % (self.params['root'], self.params['stamp']), 'wb'))
      # self.eva.gen_submit_file(valids_total, valid_score, title='valid')
    
    if alg in ['skrf', 'skrfp', 'sket', 'sketp']: 
      print("[skrf feature_importance]", self.get_feature_importance())

    # save & clear
    if not keep_model:
      self.eva.clear_meta_files()
    if submit:
      preds_total, _ = self.eva.evaluate(df_test, title='Submit', norm=norm)
      sfile = self.eva.gen_submit_file(preds_total, valid_score)
      if upload: submiter.submiter().submit(entry=sfile, message=self.params)
    print("[Finished!] Elapsed time overall for %.2f secs" % (time.time() - start_time))
    return valid_score
Example No. 4
import pickle
from datetime import datetime

import pandas as pd

import submiter  # project-local; `blendor` is this project's blending helper


def blending_flow(va_paths, te_paths, top_w=2, submit=False):
    va_preds = [pickle.load(open(path, 'rb')) for path in va_paths]
    te_preds = [pickle.load(open(path, 'rb')) for path in te_paths]

    scores = [v['score'][1] for v in va_preds]
    best_mdl = scores.index(max(scores))
    mdl_weights = [(top_w if mi == best_mdl else 1)
                   for mi in range(len(va_preds))]
    print("scores=%s, mdl_weights=%s" % (scores, mdl_weights))

    # blending
    _ = blendor(va_preds, mdl_weights, ytest=va_preds[0]['ytest'])
    blended_submits = blendor(te_preds, mdl_weights, ytest=None)

    # output
    output = "./submit/knn2_blended_%s.csv" % (stamp)
    df = pd.DataFrame(blended_submits)
    df['row_id'] = df.index
    df['place_id'] = df[[0, 1, 2]].astype(str).apply(lambda x: ' '.join(x),
                                                     axis=1)
    df.drop([0, 1, 2], axis=1, inplace=True)
    df[['row_id', 'place_id']].sort_values(by='row_id').to_csv(output,
                                                               index=False)
    print("submit file written in %s @ %s" % (output, datetime.now()))
    if submit:
        submiter.submiter().submit(entry=output, message="knn2")
Example No. 5
 # requires: datetime and the project-local submiter module
 def run(self, cmd=None):
     #---------------------------------------------
     if cmd == 'gs_top_w':
         for top_w in [1.5, 1.7, 1.9]:
             self.top_w = {0: top_w}
             stamp = "gs_top_w%s_%s" % (
                 top_w, str(datetime.now().strftime("%Y%m%d_%H%M%S")))
             self.launch(stamp=stamp)
             print("[RUN] done gs_top_w=%i" % (top_w))
     #---------------------------------------------
     elif cmd == 'gs_rank_w':
         rank_ws = [
             [1, 0.8, 0.6],
             [1, 0.8, 0.4],
             [1, 0.6, 0.4],
             [1, 0.6, 0.1],
             [1, 0.4, 0.2],
         ]
         for rank_w in rank_ws:
             self.rank_w = rank_w
             stamp = "gs_rank_ws_%s_%s" % ("_".join([
                 str(w) for w in rank_w
             ]), str(datetime.now().strftime("%Y%m%d_%H%M%S")))
             self.launch(stamp=stamp)
             print("[RUN] done gs_rank_ws=%s" % (rank_w))
     #---------------------------------------------
     elif cmd == 'gs_top_n':
         self.init_models()
         all_mdl_names = [(k, 1) for k, v in self.mdl_names]
         for n in [8, 15, 30]:
             print("[gs_top_n] n=%i" % n)
             self.mdl_names = all_mdl_names[:n]
             self.launch()
             submiter.submiter().submit(entry=self.output_fname,
                                        message=self.mdl_names)
         return
     #---------------------------------------------
     elif cmd == 'debug':
         self.do_corr_rows = 100000
         self.do_blend_rows = 100000
         self.launch()
     elif cmd == 'average':
         self.init_models()
         self.mdl_names = [(k, 1) for k, v in self.mdl_names]
         self.launch()
     elif cmd == 'average_but_top':
         self.init_models()
         self.mdl_names = [((k, 1) if idx > 0 else (k, 2))
                           for idx, (k, v) in enumerate(self.mdl_names)]
         self.launch()
     else:
         self.launch()
     # auto-submit
     if self.do_upload:
         submiter.submiter().submit(entry=self.output_fname,
                                    message=self.mdl_names)
Example No. 6
import numpy as np

import submiter


def generate_submission(preds, sfile, msg, submit=None):
    print('Writing submission file')
    # sort rows by row_id (structured field 'f0'); np.int64 matches the 'i8' fields
    # (np.int in the original was deprecated and removed in NumPy 1.24)
    preds = np.sort(preds.view('i8,i8,i8,i8'), order=['f0'], axis=0).view(np.int64)
    with open(sfile, "w") as out:
        out.write("row_id,place_id\n")
        rows = ['']*preds.shape[0]
        for num in range(preds.shape[0]):
            rows[num]='%d,%d %d %d\n' % (preds[num,0],preds[num,1],preds[num,2],preds[num,3])
        out.writelines(rows)
    if submit:
        submiter.submiter().submit(entry=sfile, message=msg)
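
A minimal, self-contained call with dummy predictions (the upload step left off):

import numpy as np

preds = np.array([[2, 11, 12, 13],
                  [0, 21, 22, 23],
                  [1, 31, 32, 33]], dtype=np.int64)  # [row_id, top-3 place_ids]
generate_submission(preds, 'toy_submission.csv', msg='toy', submit=None)
# toy_submission.csv now contains "row_id,place_id" lines sorted by row_id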
Example No. 7
 # requires: time and the project-local submiter module
 def evaluate_model(self, evaluate=False, submit=False, upload=False):
   print("[Evaluate_model] with params=%s" % (self.params))
   start_time = time.time()
   norm = self.params.get('norm')
   df_train, df_valid, df_test = self.pas.get_data()
   valid_score = 0.0
   if evaluate:
     _, valid_score = self.eva.evaluate(df_valid, title='Test', norm=norm)
   if submit:
     preds_total, _ = self.eva.evaluate(df_test, title='Submit', norm=norm)
     sfile = self.eva.gen_submit_file(preds_total, valid_score)
     if upload: submiter.submiter().submit(entry=sfile, message=self.params)
   print("[Finished!] evaluate_model for %.2f secs" % (time.time() - start_time))
Example No. 8
from datetime import datetime

import numpy as np
import pandas as pd

import submiter  # project-local; `alg`, `stamp`, and `calcgridwisemap3` live at module level in the source


def conclude(cross_validation, test, ytest, max3index, max3placeids, indices):
    if cross_validation == 1:
        indices = ([1 / 1.0, 1 / 2.0, 1 / 3.0] *
                   (ytest[:, None] == indices[:, 0:3])
                   ).sum() / indices[np.nonzero(indices[:, 0])].shape[0]
        map3 = ([1 / 1.0, 1 / 2.0, 1 / 3.0] *
                (ytest[:, None] == max3placeids[:, 0:3])
                ).sum() / max3placeids[np.nonzero(max3placeids[:, 0])].shape[0]
        ## calculation assumes unique values
        print('indices: %.5f, map@3: %.5f' % (indices, map3))

        ## calculate map3 for each grid
        max3placeids1 = pd.DataFrame({
            'row_id': test.index.values,
            'grid_cell': test['grid_cell'],
            'ytest': ytest.values,
            'id1': max3placeids[:, 0],
            'id2': max3placeids[:, 1],
            'id3': max3placeids[:, 2]
        })
        gridwisemap3 = max3placeids1.groupby('grid_cell').apply(
            calcgridwisemap3)
        print("[Finish!] @ %s" % (datetime.now()))
        return indices, map3
    else:
        print('writing submission file...')
        max3placeids = pd.DataFrame({
            'row_id': test.index.values,
            'id1': max3placeids[:, 0],
            'id2': max3placeids[:, 1],
            'id3': max3placeids[:, 2]
        })
        max3placeids['place_id'] = max3placeids.id1.astype(str).str.cat(
            [max3placeids.id2.astype(str),
             max3placeids.id3.astype(str)],
            sep=' ')

        sfile = './submit/%s_%s.csv' % (alg, stamp)
        max3placeids[['row_id', 'place_id']].to_csv(sfile,
                                                    header=True,
                                                    index=False)
        print("[Finish!] @ %s" % datetime.now())
        if False:  # upload intentionally disabled in the source; flip to True to submit
            submiter.submiter().submit(entry=sfile, message="knn2")
        return None, None
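
The `map3` expression above is the MAP@3 metric: a correct place_id at rank r contributes 1/r. A worked check with made-up labels (the source normalizes by rows whose first prediction is nonzero; plain len(ytest) is used here for simplicity):

import numpy as np

ytest = np.array([10, 20, 30])     # true place_ids
top3 = np.array([[10, 99, 98],     # hit at rank 1 -> 1.0
                 [97, 20, 96],     # hit at rank 2 -> 0.5
                 [95, 94, 93]])    # miss          -> 0.0
map3 = ([1 / 1.0, 1 / 2.0, 1 / 3.0] * (ytest[:, None] == top3)).sum() / len(ytest)
print(map3)  # 0.5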
Example No. 9
  # requires: time, operator, random.random, numpy as np, pandas as pd,
  # plus the project-local submiter, treva, and tuner modules
  def run(self):
    run_cmd = self.params['alg']
    alg = run_cmd.split('_')[0]
    print("[RUN_CMD] %s" % run_cmd)
    start_time = time.time()
    #==========================================
    #   Shared config
    #==========================================
    if 'knn' in run_cmd:
      self.params['norm'] = {
        'x': 500, 'y':1000, 
        'hour':4, 'logacc':1, 'weekday':3, 
        'qday':1, 'month':2, 'year':10, 'day':1./22,
      }
      # self.params['norm'] = {'x': 700, 'y':1100, 'hour':4, 'qday':1, 'logacc':1, 'weekday':3, 'day':1./22., 'month':2, 'year':10}
      self.params['x_cols'] = [x for x in self.params['x_cols'] if x in self.params['norm'].keys()]
      self.params['x_step'] = 0.25
      self.params['y_step'] = 0.25

    if 'try_inter' in run_cmd:
      self.params['x_inter'] = 2
      self.params['y_inter'] = 2
      self.params['mdl_weights'] = (0.4, 1, 0.4)

    if 'try_large_grid' in run_cmd:
      self.params['x_step'] = 0.4
      self.params['y_step'] = 0.4

    #==========================================
    #   Choose-one config
    #==========================================
    if run_cmd == 'all':
      for a in ['skrf', 'skrfp', 'sket', 'sketp', 'knn', 'xgb', 'skgbc']:
        self.init_team()
        self.train_alg(a)
    #------------------------------------------
    elif 'skrf_reverse_valid_split_time' in run_cmd:
      self.params['train_test_split_time'] = 100000
      self.params['place_min_last_checkin'] = None
      self.init_team()
      self.train_alg(alg)
    #------------------------------------------
    elif '_grid_step' in run_cmd:
      # for x_step in [0.04, 0.05, 0.08, 0.1, 0.2]:
      #   for y_step in [0.04, 0.05, 0.08, 0.1, 0.2]:
      for x_step in [0.1, 0.2, 0.5, 1]:
        for y_step in [0.1, 0.2, 0.5, 1]:
          print("=====[%s for step=(%.2f, %.2f)]=====" % (run_cmd, x_step, y_step))
          self.params['x_step'] = x_step
          self.params['y_step'] = y_step
          self.init_team()
          self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'knn_grid_weights':
      # self.params['norm'] = {'x': 500, 'y':1000, 'hour':4, 'qday':1, 'logacc':1, 'weekday':3, 'day':1./22., 'month':2, 'year':10}
      for logacc in np.arange(1, 10, 2):
        for qday in np.arange(1, 10, 2):   
          print("[knn_grid_weights] logacc=%i, qday=%i" % (logacc, qday))
          self.params['norm']['logacc'] = logacc
          self.params['norm']['qday'] = qday
          self.init_team()
          self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_recursive_feature_elimination':
      fixed_feats = {'logacc', 'qday', 'x', 'y', 'hour', 'weekday', 'year', 'month'}
      feats = set(self.all_feats)
      print("[RFE] checking x_cols for %s" % (feats - fixed_feats))
      while True:
        scores = {}
        self.params['x_cols'] = list(feats)
        self.init_team()
        scores['all'] = self.train_alg(alg)
        print("[RFE] baseline = %.4f" % scores['all'])
        for af in (feats - fixed_feats):
          self.params['x_cols'] = [a for a in feats if a != af]
          self.init_team()
          print("[RFE] x_cols remove [%s], using %s" % (af, self.params['x_cols']))
          scores[af] = self.train_alg(alg)
        rm_feat, rm_score = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[0]
        if rm_score > scores['all'] - 0.01: 
          print("[RFE] base_score = %.4f, remove %s to achieve %.4f" % (scores['all'], rm_feat, rm_score))
          feats -= set([rm_feat])
        else:
          print("[RFE] finished since no feature shall be removed!")
          break
    #------------------------------------------
    elif 'skrf_mdl_weights' in run_cmd:
      for sw in np.arange(0, 1.2, 0.1):
        self.params['mdl_weights'] = (sw, 0, 1.0, 0, sw)
        self.init_team()
        self.train_alg(alg)
    #------------------------------------------
    elif 'skrf_preprocessing' in run_cmd:
      for en in [0, 1]:
        self.params['en_preprocessing'] = en
        self.init_team()
        self.train_alg(alg)
    #------------------------------------------
    elif 'skrf_max_cands' in run_cmd:
      for proc in ['W', 'H']:
        for cands in np.arange(10, 50, 10):
          self.params['en_preprocessing'] = proc
          self.params['max_cands'] = cands
          self.init_team()
          self.train_alg(alg)
    #------------------------------------------
    elif 'skrf_remove_distance_outlier' in run_cmd:
      for std in np.arange(1, 3, 0.5):
        self.params['remove_distance_outlier'] = std
        self.init_team()
        self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_feats_sel':
      all_feats = self.all_feats
      # baseline
      self.params['x_cols'] = all_feats
      self.init_team()
      self.train_alg(alg)
      # drop 1 feature
      for af in all_feats:
        self.params['x_cols'] = [a for a in all_feats if a != af]
        self.init_team()
        self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_gs_time':
      for mfc in [None, 200000, 250000, 300000]:
        for tmt in [None]: #, 400000, 500000, 600000, 700000]:
          self.params['place_max_first_checkin'] = mfc
          self.params['train_max_time'] = tmt
          self.init_team()
          self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_gs_loc_th':
      # for th_y in np.arange(1.5, 2.5, 0.1):
      #   for th_x in np.arange(0.6, 2, 0.2):
      for th_y in np.arange(1.7, 2.5, 0.2):
        for th_x in np.arange(2.3, 3.5, 0.2):
          print("[SKRF_GS_LOC_TH]: th_x=%s, th_y=%s" % (th_x, th_y))
          self.params['loc_th_x'] = th_x
          self.params['loc_th_y'] = th_y
          self.init_team()
          self.evaluate_model(evaluate=True, submit=False)
    #------------------------------------------
    elif run_cmd == 'skrf_place_min_checkin':
      for mc in np.arange(0, 5, 1):
        self.params['place_min_checkin'] = mc
        self.init_team()
        self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_gs_time_th_wd':
      for pth in np.arange(0, 0.005, 0.001):
        self.params['time_th_wd'] = pth
        self.init_team()
        self.evaluate_model(evaluate=True, submit=False)
    #------------------------------------------
    elif run_cmd == 'skrf_gs_time_th_hr':
      for pth in np.arange(0.005, 0.02, 0.002):
        self.params['time_th_hr'] = pth
        self.init_team()
        self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_gs_popu_th':
      for pth in np.arange(0, 0.005, 0.001):
        self.params['popu_th'] = pth
        self.init_team()
        self.evaluate_model(evaluate=True, submit=False)
    #------------------------------------------
    elif run_cmd in ['skrf_gs_params', 'skrfp_gs_params']:
      self.init_team()
      for max_feats in [0.3, 0.35, 0.4]:
        for n_estimators in [500]:
          for max_depth in [15]:
            self.train_alg(alg, mdl_config={'n_estimators': n_estimators, 'max_depth': max_depth, 'max_features': max_feats})
    elif run_cmd in ['sket_gs_params', 'sketp_gs_params']:
      self.init_team()
      for n_estimators in [800, 1200, 1500]:
        for max_depth in [13, 15, 18]:
          for max_feats in ['auto', 0.4, 0.5, 0.6]:
            self.train_alg(alg, mdl_config={'n_estimators': n_estimators, 'max_depth': max_depth, 'max_features': max_feats})
    elif run_cmd == 'xgb_gs_params':
      self.init_team()
      for n_estimators in [30, 35, 40]:
        for max_depth in [3, 4, 5]:
          for learning_rate in [0.1]:
            # `params=` in the source looks like a typo for `mdl_config=` (train_alg has no `params` kwarg)
            self.train_alg(alg, mdl_config={'n_estimators': n_estimators, 'max_depth': max_depth, 'learning_rate': learning_rate})
    #------------------------------------------
    elif run_cmd == 'skrf_place_min_last_checkin':
      for mlc in [550000, 650000]:
        self.params['place_min_last_checkin'] = mlc
        self.params['stamp'] = "%s_%s_%i" % (self.params['alg'], self.timestamp, mlc/1e4)
        self.init_team()
        self.train_alg(alg, submit=True)
    #------------------------------------------
    elif run_cmd == 'skrf_train_min_time':
      for mlc in [0, 50000, 100000, 150000, 200000]:
        self.params['train_min_time'] = mlc
        self.params['stamp'] = "%s_%s_%i" % (self.params['alg'], self.timestamp, mlc/1e4)
        self.init_team()
        self.train_alg(alg, submit=True)
    #------------------------------------------
    elif 'submit_rf_family' in run_cmd:
      self.params['train_test_split_time'] = 1e10   # use all samples for training
      self.init_team()
      for a in ['skrf', 'skrfp']:
        self.train_alg(a, keep_model=True, submit=True, upload=True)
      self.train_alg('knn', submit=True, upload=True)
    elif 'submit_et_family' in run_cmd:
      self.params['train_test_split_time'] = 1e10   # use all samples for training
      self.init_team()
      for a in ['sket', 'sketp']:
        self.train_alg(a, submit=True, upload=True)
    elif 'submit_full' in run_cmd:
      self.params['train_test_split_time'] = 1e10   # use all samples for training
      self.init_team()
      self.train_alg(alg, submit=True, upload=True)
    elif '_submit' in run_cmd:
      self.init_team()
      self.train_alg(alg, keep_model=True, submit=True)
    elif 'eva_exist' in run_cmd:
      self.init_team()
      self.evaluate_model(evaluate=True, submit=False)
    elif 'smt_exist' in run_cmd:
      self.params['train_test_split_time'] = 1e10
      self.init_team()
      self.evaluate_model(evaluate=False, submit=True, upload=True)
    #------------------------------------------
    elif 'fast' in run_cmd: # fast flow debug
      self.init_team()
      self.train_alg(alg, mdl_config={'n_estimators': 5})
    #------------------------------------------
    elif run_cmd == 'treva_cv':
      self.init_team()
      df_train, df_valid, df_test = self.pas.get_data()
      tva = treva.trainer(self.params)
      tva.train(df_train, df_valid, df_test)
    #------------------------------------------
    elif 'treva' in run_cmd:
      if 'elite' in run_cmd:
        self.params['train_test_split_time'] = 1e10
      else:
        self.params['train_test_split_time'] = 700000
      self.init_team()
      df_train, df_valid, df_test = self.pas.get_data()
      tva = treva.trainer(self.params)
      sfile = tva.train(df_train, df_valid, df_test)
      submiter.submiter().submit(entry=sfile, message=self.params)
    elif run_cmd == 'tuner':
      self.init_team()
      df_train, df_valid, _ = self.pas.get_data()
      grids = []
      for i in range(10):
        xb, yb = int(125*random())*0.08, int(125*random())*0.08
        grids += [(xb, xb+0.08, yb, yb+0.08)]
      print(grids)
      df_all = pd.concat([df_train, df_valid])
      all_scores = tuner.tuner(df_all, grids)
    else: # single model
      self.init_team()
      self.train_alg(alg)
    #------------------------------------------
    print("[Finished!] Elapsed time overall for %.2f secs" % (time.time() - start_time))