def main():
    testset = pd.read_csv(path + "test_x.csv", index_col=0)
    ## deal with the NAs, and add features
    #train.feature_eng(test)

    ## predict
    print "Loading the predict_model classifier.."
    tstart = datetime.now()
    classifier = data_io.load_model("predict_model")
    print "Time used", datetime.now() - tstart

    print "Making predictions on the predict_model"
    tstart = datetime.now()
    fnames = ['year', 'month', 'trade_no', 'sigungu_no', 'price', 'monthly_expense']
    test_f = testset[fnames].values
    predic_proba = classifier.predict_proba(test_f)[:, 1]
    print "Time used", datetime.now() - tstart

    ## Making prediction
    prediction = zip(testset['year'], testset['month'], testset['trade_no'],
                     testset['sigungu_no'], testset['price'],
                     testset['monthly_expense'], predic_proba)

    print "Writing predictions to file.."
    tstart = datetime.now()
    data_io.write_submission(prediction)
    print "Time used,", datetime.now() - tstart
def main():
    print("Reading in the training data")
    data = data_io.get_train_df()

    print("Extracting features")
    feature_extractor = Vectorizer(MAX_FEATURES)
    category_vectorizer = DictVectorizer()
    #category_title = pd.get_dummies(train['Title'])
    #print(category_vectorizer.shape, X.shape)
    X = form_input(data, feature_extractor, category_vectorizer)
    #location = pd.get_dummies(train['LocationNormalized'])
    #X = hstack((X, location))
    #contract_time = pd.get_dummies(train['ContractTime'])
    #X = hstack((X, contract_time))
    #print(X)
    y = data["SalaryNormalized"]

    print("Training model")
    linreg.train(X, y)

    print("Making predictions")
    predictions = linreg.predict(X)
    mae_train = metrics.MAE(predictions, data["SalaryNormalized"])
    print('MAE train=%s' % mae_train)

    print("Validating...")
    data = data_io.get_valid_df()
    X = form_input(data, feature_extractor, category_vectorizer, train=False)
    predictions = linreg.predict(X)
    data_io.write_submission(predictions)
def main():
    print "sklearn version", pkg_resources.get_distribution("scikit-learn").version
    print "numpy version", pkg_resources.get_distribution("numpy").version
    print "pandas version", pkg_resources.get_distribution("pandas").version

    print("Loading the classifier")
    clf = data_io.load_model()

    X = data_io.load_matlab_valid_features()
    if X is None:
        # bail out early if no feature file was found
        print("No feature file found!")
        exit(1)
    X = delete_unused_columns(X)
    X = X.fillna(0)

    print_importances(X, clf, 0.0)
    print("Predictions outcomes with shape: " + str(X.shape))
    print clf

    predictions = clf.predict(X)
    #predictions = clf.predict_pruned(X, 3000)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    print("Getting features for valid papers from the database")
    if os.path.exists("features_valid.obj"):
        with open("features_valid.obj", 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
def recommendationNewPOI(self, submission_path):
    index_extent = (-90, -180, 90, 180)
    ndimx = int((index_extent[3] - index_extent[1]) / settings["GRID_LNG"])
    ndimy = int((index_extent[2] - index_extent[0]) / settings["GRID_LAT"])
    recommendation_result = {}

    user_visited = defaultdict(list)
    for entry in csv.reader(open(self.trdata_path)):
        uid, pid1, pid2 = int(entry[0]), int(entry[1]), int(entry[4])
        user_visited[uid].append(pid1)
        user_visited[uid].append(pid2)

    for i, entry in enumerate(csv.reader(open(self.tedata_path))):
        uid, pid1 = int(entry[0]), int(entry[1])
        near_grids = getNearGridsForPOI(self.pois_latlng[pid1], ndimx, ndimy, True)
        candidate_pois = []
        for grididx in near_grids:
            candidate_pois += self.grids_pois[grididx[0]][grididx[1]]
        pois_score = []
        # score only POIs the user has not visited yet
        for c_pid in set(candidate_pois) - set(user_visited[uid]):
            if self.bias_tag == True:
                score = np.dot(self.user_factor[self.user_ids[uid]]
                               + self.query_factor[self.poi_ids[pid1]],
                               self.poi_factor[self.poi_ids[c_pid]]) \
                        + self.poi_bias[self.poi_ids[c_pid]]
            else:
                score = np.dot(self.user_factor[self.user_ids[uid]]
                               + self.query_factor[self.poi_ids[pid1]],
                               self.poi_factor[self.poi_ids[c_pid]])
            pois_score.append([c_pid, score])
        result = sorted(pois_score, key=lambda x: x[1], reverse=True)[:settings["MAX_TOPK"]]
        recommendation_result[i] = [pair[0] for pair in result]
        sys.stdout.write("\rFINISHED RECOMMENDATION TRIPLE NUM: %d. " % (i + 1))
        sys.stdout.flush()
    write_submission(recommendation_result, submission_path)
def recommendation(self, submission_path):
    index_extent = (-90, -180, 90, 180)
    ndimx = int((index_extent[3] - index_extent[1]) / settings["GRID_LNG"])
    ndimy = int((index_extent[2] - index_extent[0]) / settings["GRID_LAT"])
    recommendation_result = {}
    cache_user_poi_score = defaultdict(dict)

    for i, entry in enumerate(csv.reader(open(self.tedata_path))):
        uid, pid1 = int(entry[0]), int(entry[1])
        near_grids = getNearGridsForPOI(self.pois_latlng[pid1], ndimx, ndimy, True)
        candidate_pois = []
        for grididx in near_grids:
            candidate_pois += self.grids_pois[grididx[0]][grididx[1]]
        pois_score = []
        for c_pid in candidate_pois:
            if uid in cache_user_poi_score and c_pid in cache_user_poi_score[uid]:
                # reuse the cached score so cached POIs still enter the ranking
                pois_score.append([c_pid, cache_user_poi_score[uid][c_pid]])
            else:
                if self.bias_tag == True:
                    score = np.dot(self.user_factor[self.user_ids[uid]],
                                   self.poi_factor[self.poi_ids[c_pid]]) \
                            + self.poi_bias[self.poi_ids[c_pid]]
                else:
                    score = np.dot(self.user_factor[self.user_ids[uid]],
                                   self.poi_factor[self.poi_ids[c_pid]])
                pois_score.append([c_pid, score])
                cache_user_poi_score[uid][c_pid] = score
        result = sorted(pois_score, key=lambda x: x[1], reverse=True)[:settings["MAX_TOPK"]]
        recommendation_result[i] = [pair[0] for pair in result]
        sys.stdout.write("\rFINISHED PAIR NUM: %d. " % (i + 1))
        sys.stdout.flush()
    write_submission(recommendation_result, submission_path)
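# The scores above are plain matrix-factorization dot products with an optional
# POI bias term. A toy numeric illustration with made-up factors (not the
# trained model):
import numpy as np

user_vec = np.array([0.5, 1.0])   # latent factors of one user
poi_vec = np.array([0.2, 0.4])    # latent factors of one candidate POI
poi_bias = 0.3

score_without_bias = np.dot(user_vec, poi_vec)           # 0.5*0.2 + 1.0*0.4 = 0.5
score_with_bias = np.dot(user_vec, poi_vec) + poi_bias   # 0.5 + 0.3 = 0.8
print(score_without_bias, score_with_bias)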
def recommendation(self, submission_path):
    index_extent = (-90, -180, 90, 180)
    ndimx = int((index_extent[3] - index_extent[1]) / settings["GRID_LNG"])
    ndimy = int((index_extent[2] - index_extent[0]) / settings["GRID_LAT"])
    recommendation_result = {}

    for i, entry in enumerate(csv.reader(open(self.tedata_path))):
        uid, pid1 = int(entry[0]), int(entry[1])
        near_grids = getNearGridsForPOI(self.pois_latlng[pid1], ndimx, ndimy, True)
        pois_score = []
        for grididx in near_grids:
            for candidate_poi in self.grids_pois[grididx[0]][grididx[1]]:
                if candidate_poi in self.per_pois_pop[uid]:
                    pois_score.append([candidate_poi, self.per_pois_pop[uid][candidate_poi]])
                else:
                    pois_score.append([candidate_poi, 0])
        result = sorted(pois_score, key=lambda x: x[1], reverse=True)[:settings["MAX_TOPK"]]
        recommendation_result[i] = [pair[0] for pair in result]
        print i
    write_submission(recommendation_result, submission_path)
def main():
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed, computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)
        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]
        # rank only the papers that actually received a prediction
        paper_ids_sorted = sorted(zip(preds, paper_ids), reverse=True)
        print(paper_ids_sorted)
        predictions.append([x[1] for x in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    print "Getting features for valid papers from the database"
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print "Loading the classifier"
    classifier = data_io.load_model()

    print "Making predictions"
    predictions = classifier.predict_proba(features)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print "Writing predictions to file"
    data_io.write_submission(paper_predictions)
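# The per-author grouping and ranking above recurs throughout these scripts.
# A minimal self-contained sketch with toy ids, showing what write_submission
# receives: a dict mapping each author to papers ranked by descending score.
from collections import defaultdict

toy_author_paper_ids = [(1, 10), (1, 11), (2, 20)]
toy_predictions = [0.2, 0.9, 0.5]

grouped = defaultdict(list)
for (a_id, p_id), pred in zip(toy_author_paper_ids, toy_predictions):
    grouped[a_id].append((pred, p_id))

ranked = {a_id: [p_id for _, p_id in sorted(pairs, reverse=True)]
          for a_id, pairs in grouped.items()}
print(ranked)  # {1: [11, 10], 2: [20]}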
def runWithoutWndchrm(self):
    print "Loading the classifier"
    classifier = data_io.load_model()
    imageCollections = data_io.get_valid_df()
    featureGetter = FeatureGetter()

    print "Getting the features"
    fileName = data_io.get_savez_name_test()
    if not self.load:
        # last features calculated from candidates
        (namesObservations, coordinates, valid) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
    else:
        (namesObservations, coordinates, valid) = Utils.loadFeatures(fileName)

    print "Making predictions"
    #valid = normalize(valid, axis=0)
    predictions = classifier.predict(valid)
    predictions = predictions.reshape(len(predictions), 1)

    print "Writing predictions to file"
    data_io.write_submission(namesObservations, coordinates, predictions)
    data_io.write_submission_nice(namesObservations, coordinates, predictions)

    print "Calculating final results"
    return Predictor.finalResults(namesObservations, predictions, coordinates)
def main():
    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model()
    print classifier.feature_importances_

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
def main():
    print("Reading test data")
    test_chunks = data_io.read_test_features()
    test = pandas.concat([chunk for chunk in test_chunks], ignore_index=True)
    feature_names = list(test.columns)
    #feature_names.remove("date_time")
    features = test[feature_names].values

    print("Loading the classifier")
    classifiers = data_io.load_model()

    print("Making predictions")
    #orig_predictions = classifier.predict_proba(features)
    #multiplier = 2 ** classifier.classes_
    #predictions = orig_predictions * multiplier
    #predictions = predictions.sum(axis=1)
    predictions = class_probabilities(features, classifiers)
    print predictions
    predictions = list(-1.0 * predictions)

    recommendations = zip(test["srch_id"], test["prop_id"], predictions)

    print("Writing predictions to file")
    data_io.write_submission(recommendations)
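# class_probabilities is not defined in this section. Judging from the
# commented-out lines above, each classifier's class probabilities are collapsed
# into an expected relevance with a 2 ** class weighting; averaging over the
# ensemble is an assumption. A hypothetical sketch, not the original helper:
import numpy as np

def class_probabilities(features, classifiers):
    # assumes sklearn-style classifiers exposing predict_proba and ordinal classes_
    scores = np.zeros(len(features))
    for clf in classifiers:
        proba = clf.predict_proba(features)               # (n_samples, n_classes)
        scores += (proba * (2.0 ** clf.classes_)).sum(axis=1)
    return scores / len(classifiers)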
def main():
    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]
    featuresfloat = []
    for tup in features:
        a, b, c, d, e = tup
        featuresfloat.append((float(a), float(b), float(c), float(d), float(e)))
    print("Total number of samples: ", len(featuresfloat))

    print("Loading the logistic regression model")
    logistic = data_io.load_model()

    print("Making predictions")
    predictions = logistic.predict_proba(featuresfloat)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-tv', type=float, action='store', dest='threshold_val',
                        help='specify how to generate recommendation result.')
    parser.add_argument('-t', type=int, action='store', dest='target',
                        help='for validation or test dataset')
    if len(sys.argv) != 5:
        print 'Command e.g.: python predict.py -tv 0.8 -t 0(1)'
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        file_name = settings["MTLR_TEST_FILE"]
        gbt_feature_file = settings["MTLR_FEATURE_FILE"]
    elif para.target == 1:
        file_name = settings["MTLR_TEST_FILE_FOR_SUBMIT"]
        gbt_feature_file = settings["MTLR_FEATURE_FILE_FOR_SUBMIT"]

    writer = csv.writer(open(gbt_feature_file, "w"), lineterminator="\n")
    classifier = data_io.load_model(settings["MTLR_MODEL_FILE"])
    #print classifier.coef_
    #raw_input()

    user_recommend_result = defaultdict(list)
    finished_num = 0
    features = []
    user_product_ids = []
    cache_uid = -1
    for i, entry in enumerate(csv.reader(open(file_name))):
        feature = map(float, entry[2:])
        uid, pid = map(int, entry[:2])
        if i == 0:
            cache_uid = uid
        if uid != cache_uid:
            predictions = classifier.predict_proba(user_product_ids, features)
            #predictions = classifier.predict(features)
            for (t_uid, t_pid), pred in zip(user_product_ids, predictions):
                writer.writerow([t_uid, t_pid, pred])
                if pred > para.threshold_val:
                    user_recommend_result[t_uid].append(t_pid)
            features = [feature]
            user_product_ids = [[uid, pid]]
            cache_uid = uid
            finished_num += 1
            #print("FINISHED UID NUM: %d. " % (finished_num))
            #sys.stderr.write("\rFINISHED UID NUM: %d. " % (finished_num))
            #sys.stderr.flush()
        else:
            features.append(feature)
            user_product_ids.append([uid, pid])

    # score the final user's batch left over after the loop
    if features:
        predictions = classifier.predict_proba(user_product_ids, features)
        for (t_uid, t_pid), pred in zip(user_product_ids, predictions):
            writer.writerow([t_uid, t_pid, pred])
            if pred > para.threshold_val:
                user_recommend_result[t_uid].append(t_pid)

    data_io.write_submission(user_recommend_result)
def main():
    submission_path = data_io.get_paths()["submission_path"]
    reader = csv.reader(open(submission_path))
    reader.next()  # skip the header
    recommendations = [(int(row[0]), int(row[1]), -i) for i, row in enumerate(reader)]
    out_path = submission_path[:-4] + "Reversed.csv"
    data_io.write_submission(recommendations, submission_path=out_path)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-Csampling', type=str, action='store', dest='sample_method',
                        help='specify which sampling method.\n'
                             'Currently including three sampling method:\t1.uniform\n\t'
                             '2.adaptive pairwise sampling')
    parser.add_argument('-Cbehavior', type=str, action='store', dest='behavior_num',
                        help='specify whether to utilize various behaviours of users')
    parser.add_argument('-Init', type=str, action='store', dest='init_choice',
                        help='specify which method to initialize model parameters')
    parser.add_argument('-Retrain', type=str, action='store', dest='retrain_choice',
                        help='specify which method to initialize model parameters')
    parser.add_argument('-topk', type=int, action='store', dest='topk',
                        help='specify how many products to be recommended')
    if len(sys.argv) != 11:
        print 'Command e.g.: python train.py -Retrain True -Init zero(gaussian) ' \
            + '-Csampling uniform(adaptive) -Cbehavior triple(tuple) -topk 4'
        sys.exit(1)

    para = parser.parse_args()
    #genTrainFile(para.behavior_num)
    #genTrainFile1(para.behavior_num)
    genTrainFile2()
    #bpr = BPR()
    #bpr1 = BPR()
    bpr2 = BPR()
    if para.retrain_choice == "True":
        bpr2.model_init(settings["BPR_TRAIN_FILE"], para.init_choice)
        bpr2.train()
        recommend_result = bpr2.genRecommendResult(True, para.topk,
                                                   settings["BPR_TRAIN_FILE"], para.init_choice)
        write_submission(recommend_result)
    else:
        recommend_result = bpr2.genRecommendResult(False, para.topk,
                                                   settings["BPR_TRAIN_FILE"], para.init_choice)
        write_submission(recommend_result)
def main():
    print("Loading the model")
    model = data_io.load_model()

    print("Making predictions")
    valid = data_io.get_valid_df()
    predictions = model * np.ones(len(valid))

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    print("Reading test data")
    test = data_io.read_test()
    ordinals = np.arange(len(test))
    recommendations = zip(test["srch_id"], test["prop_id"], ordinals)

    print("Writing predictions to file")
    data_io.write_submission(recommendations, "testOrderBenchmark.csv")
def main():
    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    test = data_io.get_test()
    predictions = classifier.predict(test)

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    print("Reading the test data")
    test = data_io.read_test()

    print("Making predictions")
    np.random.seed(12341234)
    predictions = test.apply(shuffle, axis=1)

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    valid = data_io.get_valid_df()
    predictions = classifier.predict(valid)
    predictions = np.rint(predictions)  # round predictions to the nearest integer

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    valid = data_io.get_valid_df()
    predictions = classifier.predict(valid)
    predictions = predictions.reshape(len(predictions), 1)

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def predict(self, submission_path):
    prediction_result = []
    for line in open(self.tedata_path):
        uid, pid = line.strip("\r\t\n").split(" ")[:2]
        if self.m_choice == 0:
            prediction_result.append([uid, pid, self.ave_rating[uid]])
        elif self.m_choice == 1:
            prediction_result.append([uid, pid, self.ave_rating[pid]])
        else:
            print 'Invalid choice of average rating method!'
            sys.exit(1)
    write_submission(prediction_result, submission_path)
def main():
    valid = data_io.get_valid_df()
    P = {}
    for key in valid:
        print("Loading the classifier for %s" % key)
        classifier = data_io.load_model(key)

        print("Making predictions")
        P[key] = classifier.predict(valid[key])
        P[key] = P[key].reshape(len(P[key]), 1)

    print("Writing predictions to file")
    data_io.write_submission(P)
def main():
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    conn = data_io.get_db_conn()
    feature_name = open("feature_list.txt").read().split()
    # if size < len(feature_name):
    #     to be done!

    for table_name in ["ValidPaper"]:
        if rank > 0:
            # getting features by parallel computing
            print "getting features at node " + str(rank)
            feature = data_io_parallel.get_features_db_parallel(conn, rank, table_name, feature_name[rank - 1])
        else:
            feature = data_io_parallel.get_trained_validation_data(conn, table_name)
        # sending features to rank 0
        print "sending features to node " + str(rank)
        features = comm.gather(feature, root=0)
        #print features
        if rank == 0:
            temp = []
            for f in features:
                temp.extend(f)
            print "Successfully got the features from " + table_name
            data = map(list, np.array(temp).T)

    if rank == 0:
        author_paper_ids = [x[:2] for x in data]
        features = [x[2:] for x in data]

        print("Loading the classifier")
        classifier = data_io.load_model()
        print classifier.feature_importances_

        print("Making predictions")
        predictions = classifier.predict_proba(features)[:, 1]
        predictions = list(predictions)

        author_predictions = defaultdict(list)
        paper_predictions = {}

        for (a_id, p_id), pred in zip(author_paper_ids, predictions):
            author_predictions[a_id].append((pred, p_id))

        for author_id in sorted(author_predictions):
            paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

        print("Writing predictions to file")
        data_io.write_submission(paper_predictions)
        print "Prediction completed, exit..."
        comm.Abort()
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def run(self):
    valid = self.getValidationDataset()
    if f.preprocessedFeatures != []:
        intermediate = data_io.read_intermediate_valid()
        for i in f.preprocessedFeatures:
            valid[i] = intermediate[i]

    print "Loading the classifier"
    classifier = data_io.load_model()

    print "Making predictions"
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print "Writing predictions to file"
    data_io.write_submission(predictions)
def main():
    cf = ClassifierFactory()
    filename = None
    modelnames = ["basic_python_benchmark"]
    numRows = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)
    for o, a in opts:
        if o == "-f":
            filename = a
        elif o == "-n":
            numRows = int(a)
        elif o == "-m":
            if a == "all":
                modelnames = []
                for clf_key in cf.get_all_keys():
                    modelnames.append(clf_key)
            elif cf.is_valid_key(a):
                modelnames = [a]
        elif o == "-h":
            print 'options:'
            print "\t -m [classifier key | all]"
            print "\t -f [filename]"
            sys.exit(0)
        else:
            print "try help: python predict.py -h"
            sys.exit(1)

    print "Reading the test pairs"
    test = data_io.read_test_pairs(numRows)
    testInfo = data_io.read_test_info(numRows)
    test['A type'] = testInfo['A type']
    test['B type'] = testInfo['B type']

    for modelname in modelnames:
        print "Loading the classifier:", cf.get_classifier_name(modelname)
        classifier = data_io.load_model(modelname)

        print "Making predictions"
        predictions = classifier.predict(test)
        predictions = predictions.flatten()

        filename = modelname + '.csv'
        data_io.write_submission(predictions, filename)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', type=int, action='store', dest='target',
                        help='for validation or test dataset')
    parser.add_argument('-tv', type=float, action='store', dest='threshold_val',
                        help='specify threshold value.')
    parser.add_argument('-s1', type=float, action='store', dest='click_score',
                        help='specify the score of user click behavior.')
    parser.add_argument('-s2', type=float, action='store', dest='collect_score',
                        help='specify the score of user collect behavior.')
    parser.add_argument('-s3', type=float, action='store', dest='buy_score',
                        help='specify the score of user buy behavior.')
    parser.add_argument('-ta', type=float, action='store', dest='time_alpha',
                        help='specify the decay parameter for training')
    parser.add_argument('-tb', type=float, action='store', dest='time_beta',
                        help='specify the decay parameter for prediction')
    parser.add_argument('-month', type=int, action='store', dest='month',
                        help='specify the month when the recommendation being generated.')
    parser.add_argument('-day', type=int, action='store', dest='day',
                        help='specify the day when the recommendation being generated.')
    if len(sys.argv) != 19:
        print 'Command e.g.: python itemcf.py -t (1) -tv 1.5 -s1 1 -s2 2 -s3 4 -ta 0.5 -tb 0.5 -month 7 -day 15'
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        data = [entry for entry in csv.reader(open(settings["TRAIN_DATA_FILE"]))]
    elif para.target == 1:
        data = [entry for entry in csv.reader(open(settings["TAR_DATA_FILE"]))]
    else:
        print 'Invalid train data target choice...'
        sys.exit(1)

    data = [map(int, entry) for entry in data[1:]]
    user_behavior = getUserBehavior(data)
    user_inverted_index = createdInvertedIndex(data)
    #user_inverted_index = createdInvertedIndex1(data)
    #sim_items = itemSimilarity(user_inverted_index, para.click_score,
    #        para.collect_score, para.buy_score, para.time_alpha)
    sim_items = itemSimilarity1(user_inverted_index, para.click_score,
                                para.collect_score, para.buy_score, para.time_alpha)
    #recommend_result = genRecommendResult(sim_items, user_behavior, para.click_score,
    #        para.collect_score, para.buy_score, para.time_alpha, para.threshold_val,
    #        para.month, para.day)
    recommend_result = genRecommendResult1(sim_items, user_behavior, para.click_score,
                                           para.collect_score, para.buy_score, para.time_beta,
                                           para.threshold_val, para.month, para.day, para.target)
    write_submission(recommend_result)
def recommendation(self, submission_path):
    index_extent = (-90, -180, 90, 180)
    ndimx = int((index_extent[3] - index_extent[1]) / settings["GRID_LNG"])
    ndimy = int((index_extent[2] - index_extent[0]) / settings["GRID_LAT"])
    recommendation_result = {}

    for i, entry in enumerate(csv.reader(open(self.tedata_path))):
        uid, pid1 = int(entry[0]), int(entry[1])
        near_grids = getNearGridsForPOI(self.pois_latlng[pid1], ndimx, ndimy, True)
        candidate_pois = []
        for grididx in near_grids:
            candidate_pois += self.grids_pois[grididx[0]][grididx[1]]
        pois_score = [[poi, self.pois_popularity[poi]] for poi in candidate_pois]
        result = sorted(pois_score, key=lambda x: x[1], reverse=True)[:settings["MAX_TOPK"]]
        recommendation_result[i] = [pair[0] for pair in result]
        print i
    write_submission(recommendation_result, submission_path)
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()
    valid_info = data_io.read_valid_info()
    valid = pd.concat([valid, valid_info], axis=1)
    valid = train.get_types(valid)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions, fn)
def main():
    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Loading the test data and cleaning it..")
    test = data_io.get_test_df()
    test = FeatureConverter().clean_data(test)
    passengerIds = test['Id']
    test.drop(['Id'], axis=1, inplace=True)
    test = test.values

    print("Making predictions")
    predictions = classifier.predict(test).astype(int)
    #predictions = predictions.reshape(len(predictions), 1)

    print("Writing predictions to file")
    data_io.write_submission(predictions, passengerIds, ['Id', 'Cover_Type'])
def main():
    print("Getting features for valid papers from the database")
    if os.path.exists("features_valid.obj"):
        with open("features_valid.obj", 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]
    predictInts = []
    for tup in features:
        a, b, c, d, e = tup
        predictInts.append((int(a), int(b), int(c), int(d), int(e)))

    print("Loading the classifier")
    mlp = data_io.load_model(prefix="mlp_")

    print("Making predictions")
    predictions = []
    for x in predictInts:
        # propagate the inputs forward to compute the outputs
        outp = list(x)  # output of the input layer, used as input to the next layer
        for layer in mlp.layers[1:]:  # all layers starting from the second layer
            for i in range(layer.nNeurons):
                layer.net[i] = weightedSum(outp, layer.W[1:, i]) + layer.W[0, i]
                # pass the weighted sum through this layer's transfer function
                layer.out[i] = g(layer.net[i], layer.transferF)
            outp = layer.out
        predictions.append(mlp.layers[-1].out[0])

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="mlp_")
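# The nested loops above are a standard feed-forward pass: net = bias + weighted
# sum of the previous layer's outputs, then the layer's transfer function.
# A vectorized numpy sketch with made-up layer shapes, for illustration only;
# it assumes weights are stored as in layer.W, with the bias in row 0.
import numpy as np

def forward(x, layers):
    out = np.asarray(x, dtype=float)
    for W, transfer in layers:          # each layer: (weights with bias row, activation)
        net = W[0] + out.dot(W[1:])     # bias + weighted sums for all neurons at once
        out = transfer(net)
    return out

rng = np.random.RandomState(0)
toy_layers = [(rng.randn(3, 3), np.tanh),   # 2 inputs -> 3 hidden neurons
              (rng.randn(4, 1), np.tanh)]   # 3 hidden neurons -> 1 output
print(forward([0.5, -1.0], toy_layers))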
def main():
    print("Getting features for valid papers from the database")
    if os.path.exists("features_valid.obj"):
        with open("features_valid.obj", 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    # code for including the keyword-match feature
    print "adding additional features..."
    import additional_features as af
    all_features = af.get_additional_features()
    _, _, kw_features = all_features
    for i in range(len(features)):
        features[i] += tuple(kw_features[i][2:])

    featuresnp = np.array(features, dtype='int32')
    # featuresnp -= np.mean(featuresnp, axis=0)
    # featuresnp /= np.std(featuresnp, axis=0)

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = classifier.predict_proba(featuresnp)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
def predict_write(data, predict_type):
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    features = np.array(features)  # xgboost expects an array, not a list of tuples
    predictions = classifier.predict_proba(features)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    if predict_type == "valid":
        targetset = pd.read_csv('dataRev2/Valid.csv')
    else:
        targetset = pd.read_csv('dataRev2/Test.csv')
    parsed_counter = parse_targetset_maintain_duplicate(targetset)

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        new_result = []
        for x in paper_ids_sorted:
            pid = x[1]
            for i in range(parsed_counter[author_id, pid]):
                new_result.append(pid)
        paper_predictions[author_id] = new_result
        #paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]
        paper_predictions[author_id] = processDuplicates(paper_predictions[author_id])

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, predict_type)
def main():
    test = data_io.read_test()
    ## deal with the NAs, and add features
    train.feature_eng(test)

    ## predict the booking_bool
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(True)
    print("Time used,")
    print(datetime.now() - tstart)

    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    b_fnames = train.get_features(test, True)
    b_test_f = test[b_fnames].values
    b_prob = classifier.predict_proba(b_test_f)[:, 1]
    b_prob = -1.0 * b_prob  # keep as a numpy array so the combination below is elementwise
    print("Time used,")
    print(datetime.now() - tstart)

    ## predict the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(False)
    print("Time used,")
    print(datetime.now() - tstart)

    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    c_fnames = train.get_features(test, False)
    c_test_f = test[c_fnames].values
    c_prob = classifier.predict_proba(c_test_f)[:, 1]
    c_prob = -1.0 * c_prob
    print("Time used,")
    print(datetime.now() - tstart)

    ## Making Recommendations
    recommendations = zip(test["srch_id"], test["prop_id"], 4 * b_prob + c_prob)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_io.write_submission(recommendations)
    print("Time used,")
    print(datetime.now() - tstart)
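# The weighted combination above only works on numpy arrays; with Python lists,
# 4 * b_prob + c_prob repeats and concatenates instead of adding elementwise,
# so b_prob and c_prob must stay numpy arrays. A two-line illustration:
import numpy as np

print(4 * [0.1, 0.2] + [0.3, 0.4])                       # list repetition + concatenation
print(4 * np.array([0.1, 0.2]) + np.array([0.3, 0.4]))   # elementwise: [0.7 1.2]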
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-topk', type=int, action='store', dest='topk',
                        help='specify the number of products to be recommended to '
                             'users, 0 stands for using user personal average.')
    if len(sys.argv) != 3:
        print 'Command e.g.: python runPopularity -topk 5'
        sys.exit(1)

    para = parser.parse_args()
    products = genPopularList(settings["TRAIN_DATA_FILE"])
    user_average_buy = getAverageUserBuy(para.topk)
    recommend_result = genRecommendResult(products, user_average_buy)
    write_submission(recommend_result)
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()
    features = fe.feature_extractor()

    print("Transforming features")
    trans_valid = features.fit_transform(valid)
    trans_valid = np.nan_to_num(trans_valid)

    print("Saving Valid Features")
    data_io.save_valid_features(trans_valid)

    print("Loading the classifier")
    #(both_classifier, A_classifier, B_classifier, none_classifier) = data_io.load_model()
    classifier = data_io.load_model()

    print("Making predictions")
    valid_info = data_io.read_valid_info()
    predictions = list()
    curr_pred = None
    """
    for i in range(len(trans_valid)):
        if valid_info["A type"][i] == "Numerical" and valid_info["B type"][i] == "Numerical":
            curr_pred = both_classifier.predict_proba(trans_valid[i, :])
        elif valid_info["A type"][i] == "Numerical" and valid_info["B type"][i] != "Numerical":
            curr_pred = A_classifier.predict_proba(trans_valid[i, :])
        elif valid_info["A type"][i] != "Numerical" and valid_info["B type"][i] == "Numerical":
            curr_pred = B_classifier.predict_proba(trans_valid[i, :])
        else:
            curr_pred = none_classifier.predict_proba(trans_valid[i, :])
        predictions.append(curr_pred[0][2] - curr_pred[0][0])
    """
    orig_predictions = classifier.predict_proba(trans_valid)
    predictions = orig_predictions[:, 2] - orig_predictions[:, 0]
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)