def validate(model, infile_base, passes, bits):
  '''
  Trains a model on data files 0-3 and evaluates it on data file 4.

  On every pass the files '<infile_base>.0' ... '<infile_base>.3' are
  loaded in order with util.load_sparse and handed to model.partial_fit.
  Afterwards '<infile_base>.4' is loaded, scored with
  model.decision_function, and the resulting ROC AUC is printed.

  Args:
    model - the model to validate with; it must provide partial_fit and
            decision_function
    infile_base - bare input data name without path or extension
    passes - number of passes over the data in training
    bits - the feature space should be of dimension 2**bits

  Output:
    prints the AUC score on file 4
  '''
  # The feature dimension does not change between files, so compute it once.
  n_features = 2 ** bits

  for k in range(passes):
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('Pass %d' % k)
    for file_num in range(4):
      train_set_name = '%s.%d' % (infile_base, file_num)
      print('loading training file: ' + train_set_name)
      x, y = util.load_sparse(train_set_name, n_features=n_features,
                              verbose=False)
      model.partial_fit(x, y, classes=[0., 1.])

  val_set_name = infile_base + '.4'
  print('loading validation set...')
  x, y = util.load_sparse(val_set_name, n_features=n_features, verbose=False)
  dv = model.decision_function(x)
  score = roc_auc_score(y, dv)
  print('AUC: %.4f' % score)
示例#2
0
def validate(model, infile_base, passes, bits):
    '''
    Trains a model on data files 0-3 and evaluates it on data file 4.

    On every pass the files '<infile_base>.0' ... '<infile_base>.3' are
    loaded in order with util.load_sparse and handed to model.partial_fit.
    Afterwards '<infile_base>.4' is loaded, scored with
    model.decision_function, and the resulting ROC AUC is printed.

    Args:
      model - the model to validate with; it must provide partial_fit and
              decision_function
      infile_base - bare input data name without path or extension
      passes - number of passes over the data in training
      bits - the feature space should be of dimension 2**bits

    Output:
      prints the AUC score on file 4
    '''
    # The feature dimension does not change between files, so compute it once.
    n_features = 2 ** bits

    for k in range(passes):
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print('Pass %d' % k)
        for file_num in range(4):
            train_set_name = '%s.%d' % (infile_base, file_num)
            print('loading training file: ' + train_set_name)
            x, y = util.load_sparse(train_set_name,
                                    n_features=n_features,
                                    verbose=False)
            model.partial_fit(x, y, classes=[0., 1.])

    val_set_name = infile_base + '.4'
    print('loading validation set...')
    x, y = util.load_sparse(val_set_name, n_features=n_features, verbose=False)
    dv = model.decision_function(x)
    score = roc_auc_score(y, dv)
    print('AUC: %.4f' % score)
示例#3
0
def avg_validate(n_models, base_model, infile_base, passes, bits):
    '''
    Trains a batch of models on data files 0-3, each model seeing the files
    in its own random order, and prints the AUC of their averaged prediction
    on data file 4.

    Every model is a fresh instance of base_model's class configured with
    base_model's parameters. A model keeps the same file order on every pass.

    Args:
      n_models - the number of models to produce (at least 1)
      base_model - a model whose class and parameters are copied to produce
                   the models; it must provide get_params/set_params, and
                   its class must provide partial_fit and decision_function
      infile_base - bare input data name without path or extension
      passes - number of passes over the data in training
      bits - the feature space should be of dimension 2**bits

    Output:
      prints the AUC score on file 4

    Raises:
      ValueError - if n_models is less than 1
    '''
    # Averaging over zero models is meaningless; fail before loading any data.
    if n_models < 1:
        raise ValueError('n_models must be at least 1, got %r' % (n_models,))

    # The feature dimension does not change between files, so compute it once.
    n_features = 2 ** bits

    models = []
    orders = []
    # A real list is required: random.shuffle works in place, and on
    # Python 3 range() is an immutable sequence.
    file_nums = list(range(4))
    for _ in range(n_models):
        model_k = base_model.__class__()
        model_k.set_params(**base_model.get_params())
        models.append(model_k)
        random.shuffle(file_nums)
        # Store a copy; file_nums is shuffled again for the next model.
        orders.append(file_nums[:])
    # Materialize the pairs: they are iterated once per pass, and a
    # Python 3 zip object would be exhausted after the first pass.
    model_orders = list(zip(models, orders))

    for k in range(passes):
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print('Pass %d' % k)
        for (m, order) in model_orders:
            for file_num in order:
                train_set_name = '%s.%d' % (infile_base, file_num)
                print('loading training file: ' + train_set_name)
                x, y = util.load_sparse(train_set_name,
                                        n_features=n_features,
                                        verbose=False)
                m.partial_fit(x, y, classes=[0., 1.])

    val_set_name = infile_base + '.4'
    print('loading validation set...')
    x, y = util.load_sparse(val_set_name, n_features=n_features, verbose=False)
    # One column of decision values per model; the row mean is the ensemble.
    dvs = np.zeros((len(y), n_models))
    for (k, m) in enumerate(models):
        dvs[:, k] = m.decision_function(x)
    dv = dvs.mean(axis=1)
    score = roc_auc_score(y, dv)
    print('AUC: %.4f' % score)
示例#4
0
def avg_run_all(n_models, base_model, infile_base, passes, bits, submit_id):
    '''
    Trains a batch of models on data files 0-4, each model seeing the files
    in its own random order, and writes a submission based on the models'
    averaged predictions on data file 5.

    Every model is a fresh instance of base_model's class configured with
    base_model's parameters. A model keeps the same file order on every pass.

    Args:
      n_models - the number of models to produce (at least 1)
      base_model - a model whose class and parameters are copied to produce
                   the models; it must provide get_params/set_params, and
                   its class must provide partial_fit and decision_function
      infile_base - bare input data name without path or extension
      passes - number of passes over the data in training
      bits - the feature space should be of dimension 2**bits
      submit_id - identifier passed to util.write_submission

    Writes:
      Whatever util.write_submission produces for the averaged decision
      values; the original author documents this as
      paths.SUBMIT/submission_<submit_id>.csv (not verifiable from here).

    Raises:
      ValueError - if n_models is less than 1
    '''
    # Averaging over zero models is meaningless; fail before loading any data
    # or writing a submission.
    if n_models < 1:
        raise ValueError('n_models must be at least 1, got %r' % (n_models,))

    # The feature dimension does not change between files, so compute it once.
    n_features = 2 ** bits

    models = []
    orders = []
    # A real list is required: random.shuffle works in place, and on
    # Python 3 range() is an immutable sequence.
    file_nums = list(range(5))
    for _ in range(n_models):
        model_k = base_model.__class__()
        model_k.set_params(**base_model.get_params())
        models.append(model_k)
        random.shuffle(file_nums)
        # Store a copy; file_nums is shuffled again for the next model.
        orders.append(file_nums[:])
    # Materialize the pairs: they are iterated once per pass, and a
    # Python 3 zip object would be exhausted after the first pass.
    model_orders = list(zip(models, orders))

    for k in range(passes):
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print('Pass %d' % k)
        for (m, order) in model_orders:
            for file_num in order:
                train_set_name = '%s.%d' % (infile_base, file_num)
                print('loading training file: ' + train_set_name)
                x, y = util.load_sparse(train_set_name,
                                        n_features=n_features,
                                        verbose=False)
                m.partial_fit(x, y, classes=[0., 1.])

    test_set_name = infile_base + '.5'
    print('loading test set...')
    # y is only used for its length, which sizes the score matrix.
    x, y = util.load_sparse(test_set_name, n_features=n_features, verbose=False)
    # One column of decision values per model; the row mean is the ensemble.
    dvs = np.zeros((len(y), n_models))
    for (k, m) in enumerate(models):
        dvs[:, k] = m.decision_function(x)
    dv = dvs.mean(axis=1)
    util.write_submission(dv, submit_id)
def avg_validate(n_models, base_model, infile_base, passes, bits):
  '''
  Trains a batch of models on data files 0-3, each model seeing the files
  in its own random order, and prints the AUC of their averaged prediction
  on data file 4.

  Every model is a fresh instance of base_model's class configured with
  base_model's parameters. A model keeps the same file order on every pass.

  Args:
    n_models - the number of models to produce (at least 1)
    base_model - a model whose class and parameters are copied to produce
                 the models; it must provide get_params/set_params, and
                 its class must provide partial_fit and decision_function
    infile_base - bare input data name without path or extension
    passes - number of passes over the data in training
    bits - the feature space should be of dimension 2**bits

  Output:
    prints the AUC score on file 4

  Raises:
    ValueError - if n_models is less than 1
  '''
  # Averaging over zero models is meaningless; fail before loading any data.
  if n_models < 1:
    raise ValueError('n_models must be at least 1, got %r' % (n_models,))

  # The feature dimension does not change between files, so compute it once.
  n_features = 2 ** bits

  models = []
  orders = []
  # A real list is required: random.shuffle works in place, and on
  # Python 3 range() is an immutable sequence.
  file_nums = list(range(4))
  for _ in range(n_models):
    model_k = base_model.__class__()
    model_k.set_params(**base_model.get_params())
    models.append(model_k)
    random.shuffle(file_nums)
    # Store a copy; file_nums is shuffled again for the next model.
    orders.append(file_nums[:])
  # Materialize the pairs: they are iterated once per pass, and a
  # Python 3 zip object would be exhausted after the first pass.
  model_orders = list(zip(models, orders))

  for k in range(passes):
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('Pass %d' % k)
    for (m, order) in model_orders:
      for file_num in order:
        train_set_name = '%s.%d' % (infile_base, file_num)
        print('loading training file: ' + train_set_name)
        x, y = util.load_sparse(train_set_name, n_features=n_features,
                                verbose=False)
        m.partial_fit(x, y, classes=[0., 1.])

  val_set_name = infile_base + '.4'
  print('loading validation set...')
  x, y = util.load_sparse(val_set_name, n_features=n_features, verbose=False)
  # One column of decision values per model; the row mean is the ensemble.
  dvs = np.zeros((len(y), n_models))
  for (k, m) in enumerate(models):
    dvs[:, k] = m.decision_function(x)
  dv = dvs.mean(axis=1)
  score = roc_auc_score(y, dv)
  print('AUC: %.4f' % score)
def avg_run_all(n_models, base_model, infile_base, passes, bits, submit_id):
  '''
  Trains a batch of models on data files 0-4, each model seeing the files
  in its own random order, and writes a submission based on the models'
  averaged predictions on data file 5.

  Every model is a fresh instance of base_model's class configured with
  base_model's parameters. A model keeps the same file order on every pass.

  Args:
    n_models - the number of models to produce (at least 1)
    base_model - a model whose class and parameters are copied to produce
                 the models; it must provide get_params/set_params, and
                 its class must provide partial_fit and decision_function
    infile_base - bare input data name without path or extension
    passes - number of passes over the data in training
    bits - the feature space should be of dimension 2**bits
    submit_id - identifier passed to util.write_submission

  Writes:
    Whatever util.write_submission produces for the averaged decision
    values; the original author documents this as
    paths.SUBMIT/submission_<submit_id>.csv (not verifiable from here).

  Raises:
    ValueError - if n_models is less than 1
  '''
  # Averaging over zero models is meaningless; fail before loading any data
  # or writing a submission.
  if n_models < 1:
    raise ValueError('n_models must be at least 1, got %r' % (n_models,))

  # The feature dimension does not change between files, so compute it once.
  n_features = 2 ** bits

  models = []
  orders = []
  # A real list is required: random.shuffle works in place, and on
  # Python 3 range() is an immutable sequence.
  file_nums = list(range(5))
  for _ in range(n_models):
    model_k = base_model.__class__()
    model_k.set_params(**base_model.get_params())
    models.append(model_k)
    random.shuffle(file_nums)
    # Store a copy; file_nums is shuffled again for the next model.
    orders.append(file_nums[:])
  # Materialize the pairs: they are iterated once per pass, and a
  # Python 3 zip object would be exhausted after the first pass.
  model_orders = list(zip(models, orders))

  for k in range(passes):
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('Pass %d' % k)
    for (m, order) in model_orders:
      for file_num in order:
        train_set_name = '%s.%d' % (infile_base, file_num)
        print('loading training file: ' + train_set_name)
        x, y = util.load_sparse(train_set_name, n_features=n_features,
                                verbose=False)
        m.partial_fit(x, y, classes=[0., 1.])

  test_set_name = infile_base + '.5'
  print('loading test set...')
  # y is only used for its length, which sizes the score matrix.
  x, y = util.load_sparse(test_set_name, n_features=n_features, verbose=False)
  # One column of decision values per model; the row mean is the ensemble.
  dvs = np.zeros((len(y), n_models))
  for (k, m) in enumerate(models):
    dvs[:, k] = m.decision_function(x)
  dv = dvs.mean(axis=1)
  util.write_submission(dv, submit_id)
def test(model, infile_base, bits):
  '''
  Predicts on data file 5 with the provided model.

  Loads '<infile_base>.5' with util.load_sparse, discards the second
  element of the loaded pair, and returns the model's decision values
  for the first.

  Args:
    model - the model to predict with; it must provide decision_function
    infile_base - bare input data name without path or extension
    bits - the feature space should be of dimension 2**bits

  Returns:
    the result of model.decision_function on file 5
  '''
  test_set_name = infile_base + '.5'
  # Single-argument print(...) prints the same text on Python 2 and 3.
  print('loading test set...')
  x, _ = util.load_sparse(test_set_name, n_features=2 ** bits, verbose=False)
  return model.decision_function(x)
示例#8
0
def test(model, infile_base, bits):
    '''
    Predicts on data file 5 with the provided model.

    Loads '<infile_base>.5' with util.load_sparse, discards the second
    element of the loaded pair, and returns the model's decision values
    for the first.

    Args:
      model - the model to predict with; it must provide decision_function
      infile_base - bare input data name without path or extension
      bits - the feature space should be of dimension 2**bits

    Returns:
      the result of model.decision_function on file 5
    '''
    test_set_name = infile_base + '.5'
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('loading test set...')
    x, _ = util.load_sparse(test_set_name, n_features=2 ** bits, verbose=False)
    return model.decision_function(x)
def train(model, infile_base, passes, bits):
  '''
  Trains a model on data files 0-4.

  On every pass the files '<infile_base>.0' ... '<infile_base>.4' are
  loaded in order with util.load_sparse and handed to model.partial_fit.

  Args:
    model - the model to train; it must provide partial_fit
    infile_base - bare input data name without path or extension
    passes - number of passes over the data in training
    bits - the feature space should be of dimension 2**bits

  Side-effects:
    model is trained in place; nothing is returned
  '''
  # The feature dimension does not change between files, so compute it once.
  n_features = 2 ** bits

  for k in range(passes):
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('Pass %d' % k)
    for file_num in range(5):
      train_set_name = '%s.%d' % (infile_base, file_num)
      print('loading training file: ' + train_set_name)
      x, y = util.load_sparse(train_set_name, n_features=n_features,
                              verbose=False)
      model.partial_fit(x, y, classes=[0., 1.])
示例#10
0
def train(model, infile_base, passes, bits):
    '''
    Trains a model on data files 0-4.

    On every pass the files '<infile_base>.0' ... '<infile_base>.4' are
    loaded in order with util.load_sparse and handed to model.partial_fit.

    Args:
      model - the model to train; it must provide partial_fit
      infile_base - bare input data name without path or extension
      passes - number of passes over the data in training
      bits - the feature space should be of dimension 2**bits

    Side-effects:
      model is trained in place; nothing is returned
    '''
    # The feature dimension does not change between files, so compute it once.
    n_features = 2 ** bits

    for k in range(passes):
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print('Pass %d' % k)
        for file_num in range(5):
            train_set_name = '%s.%d' % (infile_base, file_num)
            print('loading training file: ' + train_set_name)
            x, y = util.load_sparse(train_set_name,
                                    n_features=n_features,
                                    verbose=False)
            model.partial_fit(x, y, classes=[0., 1.])