def main():
    test = data_io.read_test()
    ## deal with the NAs, and add features
    train.feature_eng(test)

    ## predict the booking_bool
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(True)
    print("Time used,")
    print(datetime.now() - tstart)
    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    b_fnames = train.get_features(test, True)
    b_test_f = test[b_fnames].values
    b_prob = classifier.predict_proba(b_test_f)[:, 1]
    b_prob = -1.0 * b_prob  # keep as a NumPy array so the weighted sum below stays element-wise
    print("Time used,")
    print(datetime.now() - tstart)

    ## predict the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(False)
    print("Time used,")
    print(datetime.now() - tstart)
    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    c_fnames = train.get_features(test, False)
    c_test_f = test[c_fnames].values
    c_prob = classifier.predict_proba(c_test_f)[:, 1]
    c_prob = -1.0 * c_prob  # keep as a NumPy array (list() would turn 4*b_prob + c_prob into concatenation)
    print("Time used,")
    print(datetime.now() - tstart)

    ## Making Recommendations
    recommendations = zip(test["srch_id"], test["prop_id"],
                          4 * b_prob + c_prob)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_io.write_submission(recommendations)
    print("Time used,")
    print(datetime.now() - tstart)
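The probabilities are negated so that an ascending sort ranks the most likely bookings first. data_io.write_submission itself is not shown; the following is only a rough sketch of that step, assuming each recommendation is a (srch_id, prop_id, score) tuple where lower scores should come first (the file name and header row are guesses, not the repository's actual format).

import csv
from operator import itemgetter

def write_submission(recommendations, path="submission.csv"):
    # Hypothetical writer: within each search, the lowest (most negative)
    # score, i.e. the highest predicted probability, is listed first.
    rows = sorted(recommendations, key=itemgetter(0, 2))
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["SearchId", "PropertyId"])
        writer.writerows((srch_id, prop_id) for srch_id, prop_id, _ in rows)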
Example #3
def main():
    ## load test data set and do feature engineering
    test = data_import.load_test()
    train.feature_eng(test)

    ## load classifier for the booking_bool
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_import.load_model(True)
    print("Time used:" + str(datetime.now() - tstart) + "\n")

    ## predict the booking_bool
    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    book_feature_names = train.get_features(test)
    book_X = test[book_feature_names].values
    book_Y_pred = classifier.predict_proba(book_X)[:, 1]
    book_Y_pred = -1.0 * book_Y_pred  # keep as a NumPy array so the weighted score below stays element-wise
    print("Time used:" + str(datetime.now() - tstart) + "\n")

    ## load classifier for the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_import.load_model(False)
    print("Time used:" + str(datetime.now() - tstart) + "\n")

    ## predict the click_bool
    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    click_feature_names = train.get_features(test)
    click_X = test[click_feature_names].values
    click_Y_pred = classifier.predict_proba(click_X)[:, 1]
    click_Y_pred = -1.0 * click_Y_pred  # keep as a NumPy array so the weighted score below stays element-wise
    print("Time used:" + str(datetime.now() - tstart) + "\n")

    ## Build results; the 3rd column is a combined score (booking likelihood weighted 4x relative to click)
    results = zip(test["srch_id"], test["prop_id"],
                  4 * book_Y_pred + click_Y_pred)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_import.write_submission(results)
    print("Time used:" + str(datetime.now() - tstart) + "\n")
Example #4
def initialize_model():
    if not os.path.exists(cfg.MODEL_BIN):
        car_features, notcar_features = get_features()
        svc, X_scaler = train_model(car_features, notcar_features)
        model = {'svc': svc, 'X_scaler': X_scaler}

        # Save the model on disk
        with open(cfg.MODEL_BIN, 'wb') as f:
            pickle.dump(model, f)
    else:
        svc, X_scaler = load_model()

    return svc, X_scaler
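A brief usage sketch for initialize_model; the input array below is a placeholder and assumes a recent scikit-learn (for n_features_in_) with a fitted scaler:

import numpy as np

# Hypothetical usage: load (or train) the classifier and scaler, then score
# one placeholder feature vector of the width the scaler was fitted on.
svc, X_scaler = initialize_model()
X = np.zeros((1, X_scaler.n_features_in_))
print(svc.predict(X_scaler.transform(X)))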
Example #5
def main(args):
    pairs = []
    features, labels = [], []
    dist_predictions = []

    val = {'True': 1, 'False': 0}
    sys.stdout.write('> Computing features for test data ...')
    with open(args['flashprofile_output'], 'r') as f:
        val = {'True': 1, 'False': 0}
        data = f.read().split('\n')[:-1]
        dist_predictions.append(
            ('FlashProfile',
             np.fromiter((float(s.split(' :: ')[0].split(' @ ')[1])
                          for s in data[::3]), float)))
        labels.extend(val[s.split('|')[0].strip()] for s in data[::3])
        strings = iter(s[11:-1] for (i, s) in enumerate(data) if i % 3 > 0)
        for s1, s2 in zip(strings, strings):
            features.append(get_features(s1, s2))
            pairs.append((s1, s2))
    print('\r> Feature vector computation DONE (on %d points)\n' % len(pairs))

    dist_predictions.append(
        ('JaroWinkler', [jelly.jaro_winkler(*p) for p in pairs]))

    for pair in args['sim-dis-combination']:
        num_sim_pairs, num_dis_pairs = pair.split(',')
        num_sim_pairs, num_dis_pairs = int(num_sim_pairs), int(num_dis_pairs)
        model = joblib.load(
            os.path.join(
                args['root_dir'], 'logs',
                'RandomForest.%d.%d.pkl' % (num_sim_pairs, num_dis_pairs)))
        dist_predictions.append(('RF.%d.%d' % (num_sim_pairs, num_dis_pairs),
                                 model.predict(features)))

    for (dfile, predictions) in dist_predictions:
        with open(
                os.path.join(args['root_dir'], 'logs',
                             'Similarity.%sPR.log' % dfile), 'w') as f:
            f.write('precision\trecall\n')
            precision, recall, _ = precision_recall_curve(labels, predictions)
            for pr in zip(precision, recall):
                f.write('%f\t%f\n' % pr)
        vauc = auc(recall, precision, reorder=True)
        print('AUC(%s) = %f' % (dfile, vauc))
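get_features(s1, s2) is defined elsewhere in the repository; purely as a toy stand-in for experimenting with the pipeline above (this is not the real feature set), pairwise string features could look like:

import os

def get_features(s1, s2):
    # Toy stand-in, NOT the repository's feature set: length difference,
    # common-prefix length, and Jaccard overlap of the character sets.
    prefix_len = len(os.path.commonprefix([s1, s2]))
    set1, set2 = set(s1), set(s2)
    jaccard = len(set1 & set2) / max(1, len(set1 | set2))
    return [abs(len(s1) - len(s2)), prefix_len, jaccard]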
                      type='string',
                      dest='config',
                      default='train_config_threelayer.yml',
                      help='configuration file')
    (options, args) = parser.parse_args()

    yamlConfig = parse_config(options.config)

    if os.path.isdir(options.outputDir):
        #raise Exception('output directory must not exists yet')
        raw_input(
            "Warning: output directory exists. Press Enter to continue...")
    else:
        os.mkdir(options.outputDir)

    X_train_val, X_test, y_train_val, y_test, labels = get_features(
        options, yamlConfig)

    model_constraint = getattr(models, yamlConfig['KerasModelRetrain'])

    # Instantiate new model with added custom constraints
    if 'L1RegR' in yamlConfig:
        keras_model = model_constraint(Input(shape=X_train_val.shape[1:]),
                                       y_train_val.shape[1],
                                       l1Reg=yamlConfig['L1Reg'],
                                       l1RegR=yamlConfig['L1RegR'],
                                       h5fName=options.dropWeights)
    else:
        keras_model = model_constraint(Input(shape=X_train_val.shape[1:]),
                                       y_train_val.shape[1],
                                       l1Reg=yamlConfig['L1Reg'],
                                       h5fName=options.dropWeights)
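The excerpt ends once the model is constructed; a generic continuation might look like the following (optimizer, loss, and training settings are placeholders, not the script's actual configuration):

keras_model.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
history = keras_model.fit(X_train_val, y_train_val,
                          batch_size=1024,
                          epochs=100,
                          validation_split=0.25)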
Example #7
    # Declare what we will be optimizing, and how:
    "spec": {
        "metric": "ROC",
        "objective": "maximize",
    },
}

parameters = open("parameters.yml")
yamlparameters = yaml.load(parameters, Loader=yaml.FullLoader)
opt = Optimizer(config,
                api_key=yamlparameters["comet_api_key"],
                project_name="NNqhmv6",
                auto_metric_logging=True)

X_train, X_test, y_train, y_test = get_features(yamlparameters["DataDir"])

for experiment in opt.get_experiments():
    keras_model = models.qdense_model(
        Input(shape=X_train.shape[1:]),
        l1Reg=experiment.get_parameter("Regularization"),
        bits=14,
        ints=2)
    #keras_model = models.dense_model(Input(shape=X_train.shape[1:]), l1Reg=experiment.get_parameter("Regularization"))
    startlearningrate = experiment.get_parameter("learning_rate")
    adam = Adam(lr=startlearningrate,
                beta_1=experiment.get_parameter("learning_beta1"),
                beta_2=experiment.get_parameter("learning_beta2"),
                amsgrad=experiment.get_parameter("Adagrad"))
    keras_model.compile(optimizer=adam,
                        loss='binary_crossentropy',
Example #8
def add_embeddings(CLASSES, model_name, fold_index, checkPoint_start,
                   features_file):
    # Get the model
    device = torch.device('cuda')
    model = model_whale(num_classes=CLASSES * 2,
                        inchannels=4,
                        model_name=model_name).to(device)

    # Find result dir
    resultDir = './result/{}_{}'.format(model_name, fold_index)
    checkPoint = os.path.join(resultDir, 'checkpoint')

    # Load the pretrained weights
    if not checkPoint_start == 0:
        ckp = torch.load(
            os.path.join(checkPoint,
                         '%08d_optimizer.pth' % (checkPoint_start)))
        model.load_state_dict(
            torch.load(
                os.path.join(checkPoint,
                             '%08d_model.pth' % (checkPoint_start))))

    # Load image data
    to_add = pd.read_csv('./input/embed_split_{}_add.csv'.format(fold_index))
    # Only do if necessary
    if 0:
        to_add = pd.read_csv('./input/embed_split_{}.csv'.format(fold_index))
        # Split up the embedding images and save to different files
        data_test = to_add[1::2]
        outfile = "./input/embed_split_{}_test.csv".format(fold_index)
        data_test.to_csv(outfile, index=None)

        to_add = to_add[::2]
        outfile = "./input/embed_split_{}_add.csv".format(fold_index)
        to_add.to_csv(outfile, index=None)
    names_embed = to_add['Image'].tolist()
    labels_embed = to_add['Id'].tolist()
    batch_size = 16
    mode = 'embed'
    print("\nNumber of images to add:", len(names_embed))

    # Setup dataloader
    dst_embed = WhaleTestDataset(names_embed,
                                 labels_embed,
                                 mode=mode,
                                 transform=transform)
    dataloader_embed = DataLoader(dst_embed,
                                  shuffle=False,
                                  drop_last=False,
                                  batch_size=batch_size,
                                  num_workers=8,
                                  collate_fn=embed_collate)

    # Load the embeddings
    infile = "train_features{}.csv".format(features_file)
    embeddings = torch.Tensor(pd.read_csv(infile).to_numpy()).float()
    infile2 = "train_ids{}.csv".format(features_file)
    ids = torch.Tensor(pd.read_csv(infile2).to_numpy()).long()

    # Get the features to add
    new_ids, feats = get_features(dataloader_embed, model, CLASSES * 2)

    # Concatenate and save the features
    added_feats = torch.cat([embeddings, torch.Tensor(feats).float()], 0)
    added_ids = torch.cat([ids.view(-1), torch.Tensor(new_ids).long()], 0)
    outfile = "train_ids{}_added.csv".format(features_file)
    outfile2 = "train_features{}_added.csv".format(features_file)
    df1 = pd.DataFrame(added_ids.numpy())
    df2 = pd.DataFrame(added_feats.numpy())
    # Keep track of id, vector and some info about where this
    # was gotten (model, fold, iteration?)
    df1.to_csv(outfile, index=None)
    df2.to_csv(outfile2, index=None)

    print("Files {} and {} created with added ids and features.".format(
        outfile, outfile2))
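A possible invocation of add_embeddings; every argument value below is a placeholder, since the real class count, backbone, fold, and checkpoint depend on how the model was trained:

# Hypothetical call: append the fold-0 embedding images to the stored
# feature bank using the checkpoint saved at iteration 50000.
add_embeddings(CLASSES=100,
               model_name='resnet101',
               fold_index=0,
               checkPoint_start=50000,
               features_file='_fold0')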