def main():
    """Train and save one booking model and one click model.

    Expects the training-sample size as the first command-line argument.
    For each response ("booking_bool", then "click_bool") it selects
    features, fits ``model.model()``, and saves it via ``data_io``.
    """
    sample_size = int(sys.argv[1])
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)
    ## originally sample size = 100000
    train_sample = train[:sample_size]

    ## Train the booking model first, then the click model
    for i in range(0, 2):
        if i == 0:
            model_name = "Booking"
            response_name = "booking_bool"
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            isBook = False
        print("Training the " + model_name + " Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using " + str(len(feature_names)) + " features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)
        # Report elapsed training time (was a Python 2 print statement).
        print("Time used,")
        print(datetime.now() - tstart)
        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print(datetime.now() - tstart)
def main():
    """Fit a LambdaMART model to booking_bool on the first 100k training
    rows (NaNs filled with -1) and persist it via data_io."""
    print("Reading training data")
    train = data_io.read_train()

    train.fillna(-1, inplace=True)

    train_sample = train[:100000]

    # Every column except the responses and leakage-prone fields.
    excluded = ("click_bool", "booking_bool", "gross_bookings_usd",
                "date_time", "position")
    feature_names = list(train_sample.columns)
    for col in excluded:
        feature_names.remove(col)

    features = train_sample[feature_names].values
    target = train_sample["booking_bool"].values

    print("Training the Classifier")
    classifier = LambdaMART(n_estimators=50,
                            verbose=2,
                            min_samples_split=10,
                            random_state=1)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)
Exemplo n.º 3
0
def main():
    """Train a RandomForestClassifier on booking_bool over the first
    100k training rows (NaNs filled with 0) and save the model."""
    print("Reading training data")
    train = data_io.read_train()
    train.fillna(0, inplace=True)

    train_sample = train[:100000].fillna(value=0)

    # Every column except the responses and leakage-prone fields.
    excluded = ("click_bool", "booking_bool", "gross_bookings_usd",
                "date_time", "position")
    feature_names = list(train_sample.columns)
    for col in excluded:
        feature_names.remove(col)

    features = train_sample[feature_names].values
    target = train_sample["booking_bool"].values

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)
Exemplo n.º 4
0
def main():
    """Train and save one booking model and one click model.

    Expects the training-sample size as the first command-line argument.
    For each response ("booking_bool", then "click_bool") it selects
    features, fits ``model.model()``, and saves it via ``data_io``.
    """
    sample_size = int(sys.argv[1])
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)
    ## originally sample size = 100000
    train_sample = train[:sample_size]

    ## Train the booking model first, then the click model
    for i in range(0, 2):
        if i == 0:
            model_name = "Booking"
            response_name = "booking_bool"
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            isBook = False
        print("Training the " + model_name + " Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using " + str(len(feature_names)) + " features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)
        # Report elapsed training time (was a Python 2 print statement).
        print("Time used,")
        print(datetime.now() - tstart)
        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print(datetime.now() - tstart)
Exemplo n.º 5
0
def main():
    """Preprocess the full training set, save the processed arrays, then
    train and save a RandomForestRegressor on them."""
    print("Reading training data ...")
    train = data_io.read_train()
    train.fillna(0, inplace=True)

    train_sample = train.fillna(value=0)

    features = ut.preprocess(train_sample)
    target = ut.construct_target(train_sample)
    # Persist the processed data so model performance can be re-evaluated
    # later without redoing the preprocessing step.
    print("Saving processed training data ...")
    data_io.save_processed_data([features, target])

    print("Training the Regressor ...")
    forest = RandomForestRegressor(n_estimators=10,
                                   verbose=2,
                                   n_jobs=-1,
                                   max_features="sqrt",
                                   min_samples_split=10,
                                   random_state=1)
    forest.fit(features, target)

    print("Saving the Regressor ...")
    data_io.save_model(forest)
Exemplo n.º 6
0
    def __init__(self):
        """Load train/test/destination data and attach a 3-component PCA
        projection of the destination latent features."""
        self.train = data_io.read_train()
        self.test = data_io.read_test()
        self.destin = data_io.read_desin()

        # Reduce the 149 latent destination columns (d1..d149) to 3 components.
        reducer = PCA(n_components=3)
        latent_cols = ["d{0}".format(i + 1) for i in range(149)]
        projected = reducer.fit_transform(self.destin[latent_cols])
        self.dest_pca = pd.DataFrame(projected)
        # Keep the join key alongside the projected components.
        self.dest_pca["srch_destination_id"] = self.destin["srch_destination_id"]
Exemplo n.º 7
0
def main():
    """Build author/paper features and train a RandomForestClassifier.

    Deleted paper assignments are labeled 1, confirmed ones 0; rows whose
    features cannot be computed are reported and skipped.  The fitted
    model is saved via ``data_io``.
    """
    print("Reading in the training data")
    train = data_io.read_train()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Extracting features")
    features = []
    target = []
    for author_id, row in train.iterrows():
        # Deleted papers form the positive class (label 1).
        for paper_id in row["DeletedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" %
                      (author_id, paper_id))
            else:
                target.append(1)
                features.append(s)
        # Confirmed papers form the negative class (label 0).
        for paper_id in row["ConfirmedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" %
                      (author_id, paper_id))
            else:
                target.append(0)
                features.append(s)

    print("Target Length: %d" % len(target))
    print("Feature Length: %d" % len(features))

    feature_matrix = pd.DataFrame(features)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    try:
        classifier.fit(feature_matrix, target)
    except Exception:
        # Drop into the debugger on fit failure.  A bare "except:" would
        # also trap KeyboardInterrupt/SystemExit, so catch Exception only.
        import pdb
        pdb.set_trace()

    print("Saving the classifier")
    data_io.save_model(classifier)
Exemplo n.º 8
0
def main():
    """Train booking and click models on class-balanced resamples.

    For each response the positive rows are kept and an equal number of
    negative rows is drawn at random; one classifier per response is then
    trained and saved via ``data_io``.
    """
    # Local import: DataFrame.append was removed in pandas 2.0, so the
    # balanced sets are assembled with pd.concat instead.
    import pandas as pd

    #sample_size = int(sys.argv[1])
    sample_size = int(1000000)
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)
    # originally sample size = 100000
    train_set = train  #[:sample_size]
    book_trainset = train_set[train_set['booking_bool'] == 1]
    book_rows = book_trainset.index.tolist()
    bsize = len(book_trainset.index)
    click_trainset = train_set[train_set['click_bool'] == 1]
    click_rows = click_trainset.index.tolist()
    csize = len(click_trainset.index)
    print('bsize ' + str(bsize))
    print('csize ' + str(csize))
    # Balance each training set with an equal-size random negative sample.
    book_neg = train_set.iloc[random.sample(
        list(train_set.drop(book_rows).index), bsize)]
    book_trainset = pd.concat([book_trainset, book_neg])
    click_neg = train_set.iloc[random.sample(
        list(train_set.drop(click_rows).index), csize)]
    click_trainset = pd.concat([click_trainset, click_neg])
    # Train the booking model first, then the click model
    for i in range(0, 2):
        if i == 0:
            model_name = "Booking"
            response_name = "booking_bool"
            train_sample = book_trainset
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            train_sample = click_trainset
            isBook = False
        print("Training the " + model_name + " Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using " + str(len(feature_names)) + " features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)
        # print the time interval
        print("Time used,")
        print(datetime.now() - tstart)
        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print(datetime.now() - tstart)
Exemplo n.º 9
0
def do_train_samples_processing():
    """Read the full training set, fill missing values with 0, and run
    the sample-processing pipeline on it."""
    ## step1 read training data
    print("reading training data...")
    # train_samples = data_io.read_train(nrows= 100000)
    train_samples = data_io.read_train()
    print("Done")

    ## step2 Data preprocessing
    print("Processing training data...")
    # replace NaN with 0 via DataFrame.fillna
    train_samples = train_samples.fillna(value=0)
    # processing training samples
    process_train_samples(train_samples)

    print("Processing training data done")
def main():
    """Train booking and click models on class-balanced resamples.

    Expects the sample size as the first command-line argument.  For each
    response the positive rows are kept and an equal number of negative
    rows is drawn at random; one classifier per response is then trained
    and saved via ``data_io``.
    """
    # Local import: DataFrame.append and the .ix indexer were removed
    # from modern pandas; use pd.concat and .loc instead.
    import pandas as pd

    sample_size = int(sys.argv[1])
    ## sample_size = int(1000)
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)
    ## originally sample size = 100000
    train_set = train[:sample_size]
    book_trainset = train_set[train_set['booking_bool'] == 1]
    book_rows = book_trainset.index.tolist()
    bsize = len(book_trainset.index)
    click_trainset = train_set[train_set['click_bool'] == 1]
    click_rows = click_trainset.index.tolist()
    csize = len(click_trainset.index)
    print('bsize ' + str(bsize))
    print('csize ' + str(csize))
    # Balance each training set with an equal-size random negative sample.
    book_neg = train_set.loc[random.sample(
        list(train_set.drop(book_rows).index), bsize)]
    book_trainset = pd.concat([book_trainset, book_neg])
    click_neg = train_set.loc[random.sample(
        list(train_set.drop(click_rows).index), csize)]
    click_trainset = pd.concat([click_trainset, click_neg])
    ## Train the booking model first, then the click model
    for i in range(0, 2):
        if i == 0:
            model_name = "Booking"
            response_name = "booking_bool"
            train_sample = book_trainset
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            train_sample = click_trainset
            isBook = False
        print("Training the " + model_name + " Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using " + str(len(feature_names)) + " features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)
        # Report elapsed training time (was a Python 2 print statement).
        print("Time used,")
        print(datetime.now() - tstart)
        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print(datetime.now() - tstart)
def main():
    """Build author/paper features and train a RandomForestClassifier.

    Deleted paper assignments are labeled 1, confirmed ones 0; rows whose
    features cannot be computed are reported and skipped.  The fitted
    model is saved via ``data_io``.
    """
    print("Reading in the training data")
    train = data_io.read_train()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Extracting features")
    features = []
    target = []
    for author_id, row in train.iterrows():
        # Deleted papers form the positive class (label 1).
        for paper_id in row["DeletedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
            else:
                target.append(1)
                features.append(s)
        # Confirmed papers form the negative class (label 0).
        for paper_id in row["ConfirmedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
            else:
                target.append(0)
                features.append(s)

    print("Target Length: %d" % len(target))
    print("Feature Length: %d" % len(features))

    feature_matrix = pd.DataFrame(features)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50, 
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    try:
        classifier.fit(feature_matrix, target)
    except Exception:
        # Drop into the debugger on fit failure.  A bare "except:" would
        # also trap KeyboardInterrupt/SystemExit, so catch Exception only.
        import pdb;pdb.set_trace()

    print("Saving the classifier")
    data_io.save_model(classifier)
Exemplo n.º 12
0
def do_train_samples_processing():
    """Read the full training set, fill missing values with 0, and run
    the sample-processing pipeline on it."""
    print("Reading training data...")
    train_samples = data_io.read_train()
    print("Processing training data...")
    train_samples = train_samples.fillna(value=0)
    process_train_samples(train_samples)
Exemplo n.º 13
0
          srch_length_of_stay_features.SrchLengthOfStayFeatures(self.X),
          srch_booking_window_features.SrchBookingWindowFeatures(self.X),
          ]

      return map(self.transformer, feature_list)
  


if __name__ == "__main__":
    # Command-line entry point: build derived features for either the
    # train or the test set and save them alongside the raw columns.
    parser = argparse.ArgumentParser(description="Generate features using train/test data")
    # "--test" switches the pipeline to the test set (help-text typo
    # "Weather" fixed to "Whether").
    parser.add_argument("--test", action="store_true", default=False,
                        help="Whether to use test data", required=False)
    result = parser.parse_args()

    if result.test:
        print("Reading test data")
        data = data_io.read_test()
    else:
        print("Reading training data")
        data = data_io.read_train()

    fm = FeatureExtractor(data)
    derived_features = fm.feature_extractor()
    data.fillna(0, inplace=True)
    # Append the derived feature columns to the raw data.
    data = pandas.concat([data] + derived_features, axis=1)

    if result.test:
        data_io.save_test_features(data)
    else:
        data_io.save_train_features(data)
      
def main():
    """Train a GradientBoostingClassifier predicting click_bool on the
    full training set (NaNs filled with -2) and save it under the
    'click' model name."""
    print("Reading training data")
    train = data_io.read_train()

    train.fillna(-2, inplace=True)

    train_sample = train

    # Drop the responses, leakage-prone fields, and a handful of
    # engineered features that were excluded from this model.
    excluded = (
        "click_bool",
        "booking_bool",
        "gross_bookings_usd",
        "date_time",
        "position",
        "star_diff",
        "price_night",
        "loc_desire",
        "no_kids",
        "couple",
        "price_down",
        "same_country",
    )
    feature_names = list(train_sample.columns)
    for col in excluded:
        feature_names.remove(col)

    features = train_sample[feature_names].values
    target = train_sample["click_bool"].values

    print("Training the Classifier")
    classifier = GradientBoostingClassifier(n_estimators=80,
                                            verbose=2,
                                            min_samples_split=10,
                                            random_state=1)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier, 'click')
Exemplo n.º 15
0
def main():
    """End-to-end pipeline: train a booking_bool classifier, save it,
    then score the test set and write a ranked submission file."""
    class bcolors:
        # ANSI escape sequences for colored console output.
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    # Python 2 print statements converted to print() calls.  Each colored
    # message now ends with ENDC so the color is actually reset (the
    # original re-emitted the opening code, leaving the color stuck on).
    print(bcolors.HEADER + "Start Training" + bcolors.ENDC)
    print(bcolors.OKBLUE + "Reading and making Trainingset" + bcolors.ENDC)

    train = data_io.read_train()
    train.fillna(0, inplace=True)

    train_sample = train[:1250000].fillna(value=0)       # change the samplesize over here

    # list of features that can be removed if you want
    feature_names = list(train_sample.columns)
    feature_names.remove("click_bool")
    feature_names.remove("booking_bool")
    feature_names.remove("gross_bookings_usd")
    feature_names.remove("date_time")
    feature_names.remove("position")

    features = train_sample[feature_names].values
    target = train_sample["booking_bool"].values

    print(bcolors.OKGREEN + "Training Dataset" + bcolors.ENDC)

    # Alternative estimators: http://scikit-learn.org/stable/modules/ensemble.html

    # random forest
    classifier = RandomForestClassifier(n_estimators=3200,  verbose=2,n_jobs=-1,min_samples_split=10,random_state=1)

    # extra Trees (better then random forest) (best till now!)
    #classifier = ExtraTreesClassifier(n_estimators=300,  verbose=2, n_jobs=-1, min_samples_split=10,random_state=1)

    # Adaboost
    #classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600,learning_rate=1)

    # Knearest neighbour with bagging
    #classifier = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

    # Gradient Boosting  BEST SOLUTION (i suppose,will try tomorrow)
    # classifier = GradientBoostingClassifier(loss='deviance', learning_rate=0.1,  n_estimators=100,  subsample=1.0, min_samples_split=2, min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, verbose=0)

    classifier.fit(features, target)

    print(bcolors.OKBLUE + "Saving Classifier" + bcolors.ENDC)
    data_io.save_model(classifier)

    print(bcolors.OKGREEN + "Start Making Predictions On Testset" + bcolors.ENDC)

    print(bcolors.OKBLUE + "Reading Testset" + bcolors.ENDC)

    test = data_io.read_test()
    test.fillna(0, inplace=True)

    feature_names = list(test.columns)
    feature_names.remove("date_time")

    features = test[feature_names].values

    classifier = data_io.load_model()

    print(bcolors.OKGREEN + "Make Predictions" + bcolors.ENDC)
    predictions = classifier.predict_proba(features)[:,1]

    print(bcolors.OKBLUE + "Calculate NDcg" + bcolors.ENDC)
    # Negate so that ascending sort order ranks the best candidates first.
    predictions = list(-1.0*predictions)

    print(bcolors.OKBLUE + "Sort Predictions" + bcolors.ENDC)
    recommendations = zip(test["srch_id"], test["prop_id"], predictions)

    print(bcolors.OKGREEN + "Writing Predictions To Outputfile" + bcolors.ENDC)

    data_io.write_submission(recommendations)

    print("")
    print(bcolors.ENDC + "Thats all folks,goodbye!" + bcolors.ENDC)