def parse_db(self): conn = data_io.get_db_conn() cursor = conn.cursor() # Create authors print "Parsing Authors..." cursor.execute("SELECT * from Author;") for res in cursor: self.authors[res[0]] = author.Author(res[0], res[1], res[2]) print "Done" # Create Papers print "Parsing Papers..." cursor.execute("SELECT * from Paper;") for res in cursor: self.papers[res[0]] = paper.Paper(res[0], res[1], res[2], res[3], res[4], res[5]) print "Done" # First Update all journal/conference/coauthor information print "Parsing PaperAuthors..." cursor.execute("SELECT * from PaperAuthor;") for res in cursor: paper_id = res[0] author_id = res[1] curr_author = None curr_paper = None if paper_id in self.papers.keys(): curr_paper = self.papers[paper_id] if author_id in self.authors.keys(): curr_author = self.authors[author_id] self.update_paperauthor(curr_paper, curr_author, author_id) print "Done"
def main(): comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() conn = data_io.get_db_conn() feature_name = open("feature_list.txt").read().split() # if size < len(feature_name): # to be done! for table_name in ["ValidPaper"]: if rank > 0: # getting features by parallel computing print "getting features at node " + str(rank) feature = data_io_parallel.get_features_db_parallel(conn, rank, table_name, feature_name[rank - 1]) else: feature = data_io_parallel.get_trained_validation_data(conn, table_name) # sending features to rank 0 print "sending features to node " + str(rank) features = comm.gather(feature, root = 0) #print features if rank == 0: temp = [] for f in features: temp.extend(f) print "Successfully got the features from " + table_name data = map(list, np.array(temp).T) if rank == 0: author_paper_ids = [x[:2] for x in data] features = [x[2:] for x in data] print("Loading the classifier") classifier = data_io.load_model() print classifier.feature_importances_ print("Making predictions") predictions = classifier.predict_proba(features)[:,1] predictions = list(predictions) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[a_id].append((pred, p_id)) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] print("Writing predictions to file") data_io.write_submission(paper_predictions) print "Prediction completed, exit..." comm.Abort()
def main(): comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() conn = data_io.get_db_conn() feature_name = open("feature_list.txt").read().split() # if size < len(feature_name): # to be done! for table_name in ["TrainDeleted", "TrainConfirmed"]: if rank > 0: # getting features by parallel computing print "getting features at node " + str(rank) feature = data_io_parallel.get_features_db_parallel(conn, rank, table_name, feature_name[rank - 1]) else: feature = data_io_parallel.get_trained_validation_data(conn, table_name) # sending features to rank 0 print "sending features to node " + str(rank) features = comm.gather(feature, root = 0) #print features if rank == 0: temp = [] for f in features: temp.extend(f) print "Successfully got the features from " + table_name if table_name == "TrainDeleted": features_deleted = map(list, np.array(temp).T) else: features_conf = map(list, np.array(temp).T) if rank == 0: features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier) print "Training completed, exit..." comm.Abort()
def main():
    """Ensure the AP_features table exists, bulk-loading it from
    sampleTrain.txt on first run, and return its first three rows as a
    sanity check."""
    conn = data_io.get_db_conn()
    cursor = conn.cursor()

    if not data_io.table_view_existence_db('AP_features', conn):
        # First run: create the table and bulk-load the precomputed
        # author-paper features from the benchmark sample file.
        cursor.execute("""
        CREATE TABLE AP_features (
        Result int,
        authorid bigint,
        paperid bigint,
        AP float,
        AP_PP float,
        AP_PJ_JP float,
        AP_PC_CP float,
        AP_PJ_JJ_JP float,
        AP_PC_CC_CP float)
        """)
        conn.commit()

        base_dir = '/home/yingzhen/Projects/KDDCUP2013/benchmark/PythonBenchmark/'
        copy_sql = """
        COPY AP_features
        FROM '##path##sampleTrain.txt'
        DELIMITER ' '
        """.replace('##path##', base_dir)
        cursor.execute(copy_sql)
        conn.commit()

    # Peek at the first few rows to confirm the load worked.
    cursor.execute("""
    SELECT * FROM AP_features
    LIMIT 3
    """)
    return cursor.fetchall()
def main():
    """Create and populate the AP_features table if it is missing,
    then return the first three rows for inspection."""
    conn = data_io.get_db_conn()
    cur = conn.cursor()

    table_missing = not data_io.table_view_existence_db('AP_features', conn)
    if table_missing:
        # Table schema: label column plus the author-paper (AP) graph
        # walk features used by the benchmark.
        create_stmt = """
        CREATE TABLE AP_features (
        Result int,
        authorid bigint,
        paperid bigint,
        AP float,
        AP_PP float,
        AP_PJ_JP float,
        AP_PC_CP float,
        AP_PJ_JJ_JP float,
        AP_PC_CC_CP float)
        """
        cur.execute(create_stmt)
        conn.commit()

        # Bulk-load the feature rows; '##path##' is a placeholder for
        # the benchmark directory, substituted before execution.
        load_stmt = """
        COPY AP_features
        FROM '##path##sampleTrain.txt'
        DELIMITER ' '
        """
        load_stmt = load_stmt.replace(
            '##path##',
            '/home/yingzhen/Projects/KDDCUP2013/benchmark/PythonBenchmark/')
        cur.execute(load_stmt)
        conn.commit()

    sample_stmt = """
    SELECT * FROM AP_features
    LIMIT 3
    """
    cur.execute(sample_stmt)
    res = cur.fetchall()
    return res