def main():
    """Script entry point: validate a semantic model on a precomputed
    train/valid split, then retrain on the full training data over
    ``n_runs`` runs, writing a cumulative-averaged submission per run.
    """
    model_type = sys.argv[1] if len(sys.argv) > 1 else None
    for directory in ("../logs", "../output"):
        os_utils._makedirs(directory)
    logger = log_utils._get_logger("../logs", "tf-%s.log" % time_utils._timestamp())

    # --- load data and precomputed handcrafted features ---
    Q = load_question(params)
    dfTrain = load_train()
    dfTest = load_test()
    train_features = np.load(config.TRAIN_FEATURES_FILE)
    test_features = np.load(config.TEST_FEATURES_FILE)
    params["num_features"] = train_features.shape[1]

    # --- precomputed train/valid row indices ---
    with open(config.SPLIT_FILE, "rb") as f:
        train_idx, valid_idx = pkl.load(f)

    # --- validation pass on the stored split ---
    X_train = get_model_data(dfTrain.loc[train_idx], train_features[train_idx], params)
    X_valid = get_model_data(dfTrain.loc[valid_idx], train_features[valid_idx], params)
    model = get_model(model_type)(params, logger, init_embedding_matrix=init_embedding_matrix)
    model.fit(X_train, Q, validation_data=X_valid, shuffle=True)

    # --- retrain on everything and produce submissions ---
    X_train = get_model_data(dfTrain, train_features, params)
    X_test = get_model_data(dfTest, test_features, params)
    n_runs = params["n_runs"]
    y_proba = np.zeros((dfTest.shape[0], n_runs), dtype=np.float32)
    for run in range(n_runs):
        params["random_seed"] = run
        params["model_name"] = "semantic_model_%s"%str(run+1)
        model = get_model(model_type)(params, logger, init_embedding_matrix=init_embedding_matrix)
        model.fit(X_train, Q, validation_data=None, shuffle=True)
        y_proba[:,run] = model.predict_proba(X_test, Q).flatten()
        # cumulative average over the runs finished so far; one file per run
        dfTest["y_pre"] = y_proba[:,:(run+1)].mean(axis=1)
        dfTest[["y_pre"]].to_csv(config.SINGLE_SUB_FILE_PATTERN%(model_type, str(run+1)), header=True, index=False)
def main():
    """Script entry point (no-features variant): hold out 30% of the
    shuffled training data for validation, then retrain on all of it
    over ``n_runs`` runs, writing a cumulative-averaged submission per run.
    """
    model_type = sys.argv[1] if len(sys.argv) > 1 else None
    for directory in ("../logs", "../output"):
        os_utils._makedirs(directory)
    logger = log_utils._get_logger("../logs", "tf-%s.log" % time_utils._timestamp())

    Q = load_question(params)
    dfTrain = load_train()
    dfTest = load_test()
    X_test = get_model_data(dfTest, params)

    # shuffle before the hold-out split
    # NOTE(review): no random_state, so the shuffle (and split) is not
    # reproducible between invocations — confirm this is intended.
    dfTrain = dfTrain.sample(frac=1.0)

    # 70/30 train/valid hold-out by position
    train_num = int(dfTrain.shape[0] * 0.7)
    X_train = get_model_data(dfTrain[:train_num], params)
    X_valid = get_model_data(dfTrain[train_num:], params)
    model = get_model(model_type)(params, logger, init_embedding_matrix=init_embedding_matrix)
    model.fit(X_train, Q, validation_data=X_valid, shuffle=True)

    # retrain on the full training set and produce submissions
    X_train = get_model_data(dfTrain, params)
    n_runs = params["n_runs"]
    y_proba = np.zeros((dfTest.shape[0], n_runs), dtype=np.float32)
    for run in range(n_runs):
        params["random_seed"] = run
        params["model_name"] = "semantic_model_%s" % str(run + 1)
        model = get_model(model_type)(params, logger, init_embedding_matrix=init_embedding_matrix)
        model.fit(X_train, Q, validation_data=None, shuffle=True)
        y_proba[:, run] = model.predict_proba(X_test, Q).flatten()
        # cumulative average over the runs finished so far; one file per run
        dfTest["y_pre"] = y_proba[:, :(run + 1)].mean(axis=1)
        dfTest[["y_pre"]].to_csv(config.SUB_FILE_PATTERN % str(run + 1), header=True, index=False)
def get_train_valid_test_data(augmentation=False, dev_end=210000, valid_end=220000):
    """Build model inputs for a dev/valid/internal-test split of the training data.

    The first ``dev_end`` rows are the dev span, rows ``[dev_end, valid_end)``
    the valid span, and the remainder an internal test span. Slicing is done
    positionally with ``iloc`` so the three spans are disjoint — the previous
    label-based ``.loc`` slices were inclusive of their end label, so rows
    210000 and 220000 each leaked into two splits.

    Args:
        augmentation: when True, read the augmented dev set from
            ``dev_aug.csv`` and downsample it, and (side effect) switch the
            shared ``params`` dict to the no-features / 50k-decay-steps
            configuration.
        dev_end: positional end of the dev span (exclusive).
        valid_end: positional end of the valid span (exclusive).

    Returns:
        Tuple ``(X_dev, X_valid, Q, X_itest)``: model-ready dev/valid/itest
        data plus the question store ``Q``.
    """
    Q = load_question(params)
    dfTrain = load_train()

    if augmentation:
        dfDev = pd.read_csv(config.DATA_DIR + "/" + "dev_aug.csv")
        dfDev = downsample(dfDev)
        # NOTE: mutates the shared `params` dict; later callers observe this.
        params["use_features"] = False
        params["augmentation_decay_steps"] = 50000
        params["decay_steps"] = 50000
        X_dev = get_model_data(dfDev, None, params)
    else:
        # fix: iloc's end is exclusive, unlike loc's label-inclusive slicing,
        # so the dev span no longer shares its boundary row with valid.
        X_dev = get_model_data(dfTrain.iloc[:dev_end], None, params)
    X_valid = get_model_data(dfTrain.iloc[dev_end:valid_end], None, params)
    X_itest = get_model_data(dfTrain.iloc[valid_end:], None, params)
    return X_dev, X_valid, Q, X_itest