def main():
    sample_size = int(sys.argv[1])
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)
    # originally sample_size = 100000
    train_sample = train[:sample_size]
    # Train the booking model first, then the click model
    for i in range(0, 2):
        if i == 0:
            model_name = "Booking"
            response_name = "booking_bool"
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            isBook = False
        print("Training the " + model_name + " Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using " + str(len(feature_names)) + " features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)
        # report the elapsed training time
        print("Time used,")
        print(datetime.now() - tstart)
        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print(datetime.now() - tstart)
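# How the two models trained above are combined at prediction time is not shown
# here. A minimal sketch of one common heuristic for this competition, weighting
# bookings above clicks; the 5:1 blend is an assumption, not taken from this
# code base, and get_features is the repo helper used above:
def rank_score(test, book_model, click_model):
    # blend the two predicted probabilities into a single ranking score
    p_book = book_model.predict_proba(test[get_features(test, True)].values)[:, 1]
    p_click = click_model.predict_proba(test[get_features(test, False)].values)[:, 1]
    return 5.0 * p_book + 1.0 * p_click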
def main(): print("Reading training data") train = data_io.read_train() train.fillna(-1, inplace=True) #train_sample = train.fillna(value=-2) #train_sample = train[:2500000].fillna(value=0) train_sample = train[:100000] #train_sample = train.fillna(value=0) feature_names = list(train_sample.columns) feature_names.remove("click_bool") feature_names.remove("booking_bool") feature_names.remove("gross_bookings_usd") feature_names.remove("date_time") feature_names.remove("position") features = train_sample[feature_names].values #train_sample["position"] *= -1.0 #target = train_sample["position"].values #target = train_sample["booking_bool"].values target = train_sample["booking_bool"].values print("Training the Classifier") classifier = LambdaMART(n_estimators=50, verbose=2, min_samples_split=10, random_state=1) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier)
def main(): print("Reading training data") train = data_io.read_train() train.fillna(0, inplace=True) train_sample = train[:100000].fillna(value=0) feature_names = list(train_sample.columns) feature_names.remove("click_bool") feature_names.remove("booking_bool") feature_names.remove("gross_bookings_usd") feature_names.remove("date_time") feature_names.remove("position") features = train_sample[feature_names].values target = train_sample["booking_bool"].values print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier)
def main(): print("Reading training data ...") train = data_io.read_train() train.fillna(0, inplace=True) train_sample = train.fillna(value=0) features = ut.preprocess(train_sample) target = ut.construct_target(train_sample) # target = train_sample["booking_bool"].values # save the processed data, which may be useful # to test the performance of our model print("Saving processed training data ...") data_io.save_processed_data([features, target]) print("Training the Regressor ...") regressor = RandomForestRegressor(n_estimators=10, #RandomForestClassifier verbose=2, n_jobs=-1, max_features = "sqrt", min_samples_split=10, random_state=1) regressor.fit(features, target) print("Saving the Regressor ...") data_io.save_model(regressor)
def __init__(self):
    self.train = data_io.read_train()
    self.test = data_io.read_test()
    self.destin = data_io.read_desin()
    # PCA on the 149 destination-description columns (d1..d149)
    pca = PCA(n_components=3)
    self.dest_pca = pca.fit_transform(
        self.destin[["d{0}".format(i + 1) for i in range(149)]])
    self.dest_pca = pd.DataFrame(self.dest_pca)
    self.dest_pca["srch_destination_id"] = self.destin["srch_destination_id"]
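# The three PCA components above are keyed by srch_destination_id but never
# joined back in this snippet. A minimal sketch of attaching them to the
# training frame; attach_dest_pca is a hypothetical helper, and the component
# columns are assumed to still carry pandas' default integer names 0, 1, 2:
import pandas as pd

def attach_dest_pca(train, dest_pca):
    # rename the integer component columns, then left-join on the destination id
    dest_pca = dest_pca.rename(columns={0: "dest_pca_0", 1: "dest_pca_1",
                                        2: "dest_pca_2"})
    return train.merge(dest_pca, on="srch_destination_id", how="left")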
def main(): print("Reading in the training data") train = data_io.read_train() print("Reading in the meta data") paper_author, paper_author_indexed = f.get_paper_author() print("Computing Relational Information") computed_features = f.get_all_computed_features(paper_author) print("Extracting features") features = [] target = [] for author_id, row in train.iterrows(): for paper_id in row["DeletedPaperIds"]: s = f.get_features(paper_id, author_id, paper_author_indexed, computed_features) if s is None: print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id)) else: target.append(1) features.append(s) for paper_id in row["ConfirmedPaperIds"]: s = f.get_features(paper_id, author_id, paper_author_indexed, computed_features) if s is None: print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id)) else: target.append(0) features.append(s) print("Target Length: %d" % len(target)) print("Feature Length: %d" % len(features)) feature_matrix = pd.DataFrame(features) print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1) try: classifier.fit(feature_matrix, target) except: import pdb pdb.set_trace() print("Saving the classifier") data_io.save_model(classifier)
def main():
    # sample_size = int(sys.argv[1])
    sample_size = int(1000000)
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)
    # originally sample_size = 100000
    train_set = train  # [:sample_size]
    # take every positive (booked / clicked) row ...
    book_trainset = train_set[train_set['booking_bool'] == 1]
    book_rows = book_trainset.index.tolist()
    bsize = len(book_trainset.index)
    click_trainset = train_set[train_set['click_bool'] == 1]
    click_rows = click_trainset.index.tolist()
    csize = len(click_trainset.index)
    print('bsize ' + str(bsize))
    print('csize ' + str(csize))
    # ... then pad each with an equal number of random negatives
    # (.loc, since the sampled values are index labels, not positions)
    book_trainset = book_trainset.append(train_set.loc[random.sample(
        list(train_set.drop(book_rows).index), bsize)])
    click_trainset = click_trainset.append(train_set.loc[random.sample(
        list(train_set.drop(click_rows).index), csize)])
    # Train the booking model first, then the click model
    for i in range(0, 2):
        if i == 0:
            model_name = "Booking"
            response_name = "booking_bool"
            train_sample = book_trainset
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            train_sample = click_trainset
            isBook = False
        print("Training the " + model_name + " Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using " + str(len(feature_names)) + " features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)
        # report the elapsed training time
        print("Time used,")
        print(datetime.now() - tstart)
        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print(datetime.now() - tstart)
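# DataFrame.append and label-based random.sample work here, but on recent pandas
# (append was removed in 2.x) the same 1:1 downsampling is simpler with
# DataFrame.sample and pd.concat. A minimal sketch; balance_1to1 is a
# hypothetical helper, not part of this code base:
import pandas as pd

def balance_1to1(df, label_col, random_state=1):
    # keep every positive row, then draw an equal number of random negatives
    pos = df[df[label_col] == 1]
    neg = df[df[label_col] == 0].sample(n=len(pos), random_state=random_state)
    return pd.concat([pos, neg])

# book_trainset = balance_1to1(train_set, "booking_bool")
# click_trainset = balance_1to1(train_set, "click_bool")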
def do_train_samples_processing():
    # step 1: read the training data
    print("Reading training data...")
    # train_samples = data_io.read_train(nrows=100000)
    train_samples = data_io.read_train()
    print("Done")
    # step 2: preprocess the data
    print("Processing training data...")
    # replace NaN with 0 using the fillna function
    train_samples = train_samples.fillna(value=0)
    # process the training samples
    process_train_samples(train_samples)
    print("Processing training data done")
def main():
    sample_size = int(sys.argv[1])
    # sample_size = int(1000)
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)
    # originally sample_size = 100000
    train_set = train[:sample_size]
    book_trainset = train_set[train_set['booking_bool'] == 1]
    book_rows = book_trainset.index.tolist()
    bsize = len(book_trainset.index)
    click_trainset = train_set[train_set['click_bool'] == 1]
    click_rows = click_trainset.index.tolist()
    csize = len(click_trainset.index)
    print('bsize ' + str(bsize))
    print('csize ' + str(csize))
    # pad each positive set with an equal number of random negatives
    # (.loc replaces the long-deprecated .ix indexer)
    book_trainset = book_trainset.append(train_set.loc[random.sample(
        list(train_set.drop(book_rows).index), bsize)])
    click_trainset = click_trainset.append(train_set.loc[random.sample(
        list(train_set.drop(click_rows).index), csize)])
    # Train the booking model first, then the click model
    for i in range(0, 2):
        if i == 0:
            model_name = "Booking"
            response_name = "booking_bool"
            train_sample = book_trainset
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            train_sample = click_trainset
            isBook = False
        print("Training the " + model_name + " Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using " + str(len(feature_names)) + " features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)
        # report the elapsed training time
        print("Time used,")
        print(datetime.now() - tstart)
        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print(datetime.now() - tstart)
def main(): print("Reading in the training data") train = data_io.read_train() print("Reading in the meta data") paper_author, paper_author_indexed = f.get_paper_author() print("Computing Relational Information") computed_features = f.get_all_computed_features(paper_author) print("Extracting features") features = [] target = [] for author_id, row in train.iterrows(): for paper_id in row["DeletedPaperIds"]: s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features) if s is None: print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id)) else: target.append(1) features.append(s) for paper_id in row["ConfirmedPaperIds"]: s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features) if s is None: print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id)) else: target.append(0) features.append(s) print("Target Length: %d" % len(target)) print("Feature Length: %d" % len(features)) feature_matrix = pd.DataFrame(features) print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1) try: classifier.fit(feature_matrix, target) except: import pdb;pdb.set_trace() print("Saving the classifier") data_io.save_model(classifier)
def do_train_samples_processing(): print "Reading training data..." train_samples = data_io.read_train() print "Processing training data..." train_samples = train_samples.fillna(value=0) process_train_samples(train_samples)
            srch_length_of_stay_features.SrchLengthOfStayFeatures(self.X),
            srch_booking_window_features.SrchBookingWindowFeatures(self.X),
        ]
        # list(...) keeps this working on Python 3, where map returns an iterator
        return list(map(self.transformer, feature_list))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate features using train/test data")
    parser.add_argument("--test", action="store_true", default=False,
                        help="Whether to use test data", required=False)
    result = parser.parse_args()
    if result.test:
        print("Reading test data")
        data = data_io.read_test()
    else:
        print("Reading training data")
        data = data_io.read_train()
    fm = FeatureExtractor(data)
    derived_features = fm.feature_extractor()
    data.fillna(0, inplace=True)
    data = pandas.concat([data] + derived_features, axis=1)
    if result.test:
        data_io.save_test_features(data)
    else:
        data_io.save_train_features(data)
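# Example invocations, assuming the script above is saved as extract_features.py:
#   python extract_features.py          # derive features from the training data
#   python extract_features.py --test   # derive features from the test data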
def main(): print("Reading training data") train = data_io.read_train() train.fillna(-2, inplace=True) #train_sample = train.fillna(value=-2) train_sample = train #train_sample = train[:100000] #train_sample = train.fillna(value=0) #feature_names = [ #'srch_id', #'price_usd', #'price_person', #'price_usd', #'prop_location_score2', #'prop_log_historical_price', #'srch_children_count', #'srch_query_affinity_score', #'prop_starrating', #'visitor_hist_starrating', #'promotion_flag', #'prop_review_score', #'srch_destination_id', #'prop_id', #'visitor_hist_adr_usd', #'prop_brand_bool', #] feature_names = list(train_sample.columns) feature_names.remove("click_bool") feature_names.remove("booking_bool") feature_names.remove("gross_bookings_usd") feature_names.remove("date_time") feature_names.remove("position") #feature_names.remove('price_diff') #feature_names.remove('price_person') feature_names.remove('star_diff') #feature_names.remove('pay_diff') feature_names.remove('price_night') feature_names.remove('loc_desire') feature_names.remove('no_kids') feature_names.remove('couple') feature_names.remove('price_down') feature_names.remove('same_country') #feature_names.remove('prop_location_score1') features = train_sample[feature_names].values #train_sample["position"] *= -1.0 #target = train_sample["position"].values #target = train_sample["booking_bool"].values target = train_sample["click_bool"].values print("Training the Classifier") classifier = GradientBoostingClassifier(n_estimators=80, verbose=2, min_samples_split=10, random_state=1) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier, 'click')
def main():
    class bcolors:
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    # each message must end with ENDC to reset the terminal colour
    print(bcolors.HEADER + "Start Training" + bcolors.ENDC)
    print(bcolors.OKBLUE + "Reading and making Trainingset" + bcolors.ENDC)
    train = data_io.read_train()
    train.fillna(0, inplace=True)
    train_sample = train[:1250000]  # change the sample size over here
    # list of features that can be removed if you want
    feature_names = list(train_sample.columns)
    feature_names.remove("click_bool")
    feature_names.remove("booking_bool")
    feature_names.remove("gross_bookings_usd")
    feature_names.remove("date_time")
    feature_names.remove("position")
    features = train_sample[feature_names].values
    target = train_sample["booking_bool"].values
    print(bcolors.OKGREEN + "Training Dataset" + bcolors.ENDC)
    # the alternatives below are described at
    # http://scikit-learn.org/stable/modules/ensemble.html
    # random forest
    classifier = RandomForestClassifier(n_estimators=3200, verbose=2,
                                        n_jobs=-1, min_samples_split=10,
                                        random_state=1)
    # extra trees (better than random forest; best so far)
    # classifier = ExtraTreesClassifier(n_estimators=300, verbose=2, n_jobs=-1,
    #                                   min_samples_split=10, random_state=1)
    # AdaBoost
    # classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
    #                                 n_estimators=600, learning_rate=1)
    # k-nearest neighbours with bagging
    # classifier = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
    #                                max_features=0.5)
    # gradient boosting (expected to be the best; still to be tried)
    # classifier = GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
    #     n_estimators=100, subsample=1.0, min_samples_split=2,
    #     min_samples_leaf=1, max_depth=3, init=None, random_state=None,
    #     max_features=None, verbose=0)
    classifier.fit(features, target)
    print(bcolors.OKBLUE + "Saving Classifier" + bcolors.ENDC)
    data_io.save_model(classifier)

    print(bcolors.OKGREEN + "Start Making Predictions On Testset" + bcolors.ENDC)
    print(bcolors.OKBLUE + "Reading Testset" + bcolors.ENDC)
    test = data_io.read_test()
    test.fillna(0, inplace=True)
    feature_names = list(test.columns)
    feature_names.remove("date_time")
    features = test[feature_names].values
    classifier = data_io.load_model()
    print(bcolors.OKGREEN + "Make Predictions" + bcolors.ENDC)
    predictions = classifier.predict_proba(features)[:, 1]
    print(bcolors.OKBLUE + "Calculate NDcg" + bcolors.ENDC)
    # negate so that an ascending sort ranks the most likely bookings first
    predictions = list(-1.0 * predictions)
    print(bcolors.OKBLUE + "Sort Predictions" + bcolors.ENDC)
    recommendations = zip(test["srch_id"], test["prop_id"], predictions)
    print(bcolors.OKGREEN + "Writing Predictions To Outputfile" + bcolors.ENDC)
    data_io.write_submission(recommendations)
    print("")
    print(bcolors.ENDC + "That's all folks, goodbye!" + bcolors.ENDC)
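# The script above prints "Calculate NDcg" but never actually computes it; the
# predictions are only negated for sorting. A minimal NDCG@k sketch for a single
# search, assuming the usual graded relevances for this competition (5 = booked,
# 1 = clicked, 0 = neither); ndcg_at_k is a hypothetical helper, not part of
# this code base:
import numpy as np

def ndcg_at_k(relevances, k=38):
    # relevances: grades listed in the order the model ranked the results
    rel = np.asarray(relevances, dtype=float)
    top = rel[:k]                    # grades of the top-k as ranked
    ideal = np.sort(rel)[::-1][:k]   # grades of the best possible top-k
    if ideal.sum() == 0:
        return 0.0
    discounts = np.log2(np.arange(2, top.size + 2))
    dcg = ((2 ** top - 1) / discounts).sum()
    idcg = ((2 ** ideal - 1) / discounts).sum()
    return dcg / idcg

# ndcg_at_k([0, 5, 1, 0])  # -> ~0.63: the booked hotel was ranked second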