import random
import sys

import numpy
from operator import itemgetter

import features  # project-local module providing add_features

def get_training_set(kBestSents, tgtSents, srcSents, T, C, numProc):
    # Get the difference in BLEU values for pairs of hypotheses.
    # numProc is accepted but unused in this version.
    X = []
    Y = []
    for refNum, ref in tgtSents.iteritems():
        mnPairDict = {}
        selected = 0
        tries = 0
        # Sample up to T usable hypothesis pairs per reference,
        # giving up after 5*T attempts.
        while selected != T:
            tries += 1
            if tries > 5*T:
                #sys.stderr.write("Skipped ")
                break
            i = random.randint(0, len(kBestSents[refNum])-1)
            j = random.randint(0, len(kBestSents[refNum])-1)
            if i == j:
                continue
            hyp1, feat1, bleuScore1 = kBestSents[refNum][i]
            hyp2, feat2, bleuScore2 = kBestSents[refNum][j]
            delBleu = bleuScore1 - bleuScore2
            # Skip pairs whose BLEU scores are too close to carry signal.
            if abs(delBleu) < 0.05:
                continue
            mnPairDict[(refNum, i, j)] = delBleu
            selected += 1
        if refNum % 100 == 0:
            sys.stderr.write(str(refNum) + ' ')
        # Keep only the C pairs with the largest absolute BLEU difference.
        selectedPairs = {key: abs(delBleu) for key, delBleu in mnPairDict.iteritems()}
        for pair, delBleu in sorted(selectedPairs.items(), key=itemgetter(1), reverse=True)[:C]:
            # Add new features.
            (refNum, i, j) = pair
            hyp1, feat1, bleuScore1 = kBestSents[refNum][i]
            hyp2, feat2, bleuScore2 = kBestSents[refNum][j]
            newFeat1 = features.add_features(srcSents[refNum], hyp1) + feat1
            newFeat2 = features.add_features(srcSents[refNum], hyp2) + feat2
            featDiff = numpy.array(newFeat1) - numpy.array(newFeat2)
            # Sign of the BLEU difference serves as the pairwise label.
            signDelBleu = abs(bleuScore1 - bleuScore2) / (bleuScore1 - bleuScore2)
            X += [featDiff, -1.*featDiff]
            Y += [signDelBleu, -1.*signDelBleu]
        del selectedPairs, mnPairDict
    return (X, Y)
def preprocess(**config):
    pred_var = config.get('pred_var', 'Torvet PM10')
    stations = config.get('stations', ['Torvet'])
    test_size = config.get('test_size', 0.3)
    val_size = config.get('val_size', 0.1)
    shuffle = config.get('shuffle', True)
    window = config.get('window', 6)
    if os.path.exists(cache_path):
        df = pd.read_csv(cache_path)
        df = df.set_index(pd.to_datetime(df['timestamp'])).drop(columns=['timestamp']).sort_index()
    else:
        df = pd.read_csv(data_path, index_col=[0], header=[0, 1])
        df.index.name = 'timestamp'
        df.index = pd.to_datetime(df.index)
        df = df[[*stations, 'weather']]
        df.columns = [' '.join(col).strip() for col in df.columns.values]
        df = handle_missing(df, strategy='mean')
        df = add_features(df, labels=['Torvet PM10', 'Torvet PM2.5'])
        df.to_csv(cache_path)
    y = get_targets(df, pred_var, window)
    X = df
    data_dict = split_data(X, y, val_size=val_size, test_size=test_size, shuffle=shuffle)
    return data_dict
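# The get_targets helper above is not shown in this snippet. A minimal sketch
# of a plausible implementation, assuming the usual forecasting convention
# that the target at time t is the value of pred_var `window` steps ahead
# (an assumption, not the repository's actual helper):
def get_targets(df, pred_var, window):
    # Shift the prediction variable backwards so each row is paired with
    # its value `window` steps into the future; trailing rows become NaN
    # and would need to be dropped before splitting.
    return df[pred_var].shift(-window)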
def ds_from_df(data_df, sessions_df, is_test):
    print('ds_from_df <<')
    data_df = add_features(data_df)
    data_df = add_sessions_features(data_df, sessions_df)
    if not is_test:
        data_df = data_df.drop(['country_destination'], axis=1)
    print('ds_from_df >>')
    return DataSet.create_from_df(data_df)
import random
import sys

import numpy
from operator import itemgetter

import bleu      # project-local module providing bleu_pair
import features  # project-local module providing add_features

def get_training_set(kBestSents, tgtSents, srcSents, T, C):
    # Get the difference in BLEU values for pairs of hypotheses.
    mnPairDict = {}
    selected = 0
    # Sample T usable hypothesis pairs across randomly chosen references.
    while selected != T:
        refNum = random.randint(0, len(tgtSents)-1)
        ref = tgtSents[refNum]
        i = random.randint(0, len(kBestSents[refNum])-1)
        j = random.randint(0, len(kBestSents[refNum])-1)
        if i == j:
            continue
        hyp1, feat1 = kBestSents[refNum][i]
        hyp2, feat2 = kBestSents[refNum][j]
        delBleu = bleu.bleu_pair(hyp1, ref) - bleu.bleu_pair(hyp2, ref)
        # Skip pairs whose BLEU scores are too close to carry signal.
        if abs(delBleu) < 0.05:
            continue
        mnPairDict[(refNum, i, j)] = delBleu
        selected += 1
        if selected % 1000 == 0:
            sys.stderr.write(str(selected) + ' ')
    X = []
    Y = []
    # Keep only the C pairs with the largest absolute BLEU difference.
    selectedPairs = {key: abs(delBleu) for key, delBleu in mnPairDict.iteritems()}
    selected = 0
    for pair, delBleu in sorted(selectedPairs.items(), key=itemgetter(1), reverse=True):
        # Add new features.
        (refNum, i, j) = pair
        hyp1, feat1 = kBestSents[refNum][i]
        hyp2, feat2 = kBestSents[refNum][j]
        newFeat1 = features.add_features(srcSents[refNum], hyp1) + feat1
        newFeat2 = features.add_features(srcSents[refNum], hyp2) + feat2
        featDiff = numpy.array(newFeat1) - numpy.array(newFeat2)
        X += [featDiff, -1.*featDiff]
        # Sign of the BLEU difference serves as the pairwise label.
        Y += [abs(mnPairDict[pair])/mnPairDict[pair], -1.*abs(mnPairDict[pair])/mnPairDict[pair]]
        selected += 1
        if selected == C:
            break
    del selectedPairs, mnPairDict
    return (X, Y)
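# Neither variant of get_training_set shows how the pairwise set is consumed.
# A minimal sketch of one plausible use, assuming scikit-learn (an assumption;
# the original tuning code is not shown): Y holds sign labels (+1/-1) over the
# feature-difference vectors in X, so fitting a linear classifier yields
# PRO-style ranking weights, one per MT feature.
import numpy
from sklearn.linear_model import LogisticRegression

X, Y = get_training_set(kBestSents, tgtSents, srcSents, T=5000, C=100)
ranker = LogisticRegression(fit_intercept=False)  # differences are antisymmetric, so no bias term
ranker.fit(numpy.array(X), numpy.array(Y))
weights = ranker.coef_[0]  # linear weights for rescoring k-best lists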
def main():
    parser = argparse.ArgumentParser(description='Predict the testing set')
    parser.add_argument('--model_type', default='RandomForest')
    parser.add_argument('--test', action='store_true')
    args = parser.parse_args()
    if args.test:
        suffix = 'test'
    else:
        suffix = time.strftime("%d_%m_%Y")
    model = get_model(args.model_type, args.test)
    print "Loaded Model: %s" % model
    print "Loading Training Data"
    training = load_training()
    if not args.test:
        print "Adding new features"
        training = add_features(training)
    print "Training Model"
    classifier = train(training, model)
    print "Saving Classifier"
    output_dir = 'models/classifier_%s' % suffix
    try:
        os.mkdir(output_dir)
    except OSError:
        pass  # directory already exists
    joblib.dump(classifier, '%s/%s.pkl' % (output_dir, classifier.__class__.__name__))
    print "Loading testing set"
    testing = load_testing()
    if not args.test:
        print "Adding new features to testing set"
        testing = add_features(testing)
    print "Making predictions on testing set"
    predictions = predict(classifier, testing)
    output_predictions(predictions, threshold=0.7, filename='prediction_%s.csv' % suffix)
def process_csv_file(file, tidy_file=True, add_features=False):
    extension = re.search(r'(?<=\.).+$', str(file))[0]
    # Read the data in and run it through the cleaner
    table_data = pd.read_csv(file, header=None, names=name_files[str(extension)])
    if tidy_file:
        table_data = tidy.tidy_it_up(table_data, extension)
    if add_features:
        table_data = features.add_features(table_data, extension)
    # Strip out unused columns
    for column in columns_to_delete[str(extension)]:
        del table_data[column]
    return table_data, extension
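# Example call (the file name is hypothetical; the directory layout is taken
# from the batch script below):
table, ext = process_csv_file('data/raw_files/sample.DRF', tidy_file=True, add_features=True)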
def main():
    parser = argparse.ArgumentParser(description='Cross-validate the model')
    parser.add_argument('--model_type', default='RandomForest')
    parser.add_argument('--test', action='store_true')
    args = parser.parse_args()
    model = get_model(args.model_type, args.test)
    print "Loaded Model: %s" % model
    print "Loading Training Data"
    training = load_training()
    print "Adding new features"
    training = add_features(training)
    print "Running Cross Validation"
    cross_validate(training, model)
def main(path='data/raw_files', file_to_process=None):
    valid_extensions = ['1', '2', '3', '4', '5', '6', 'DRF']
    preprocessor = TidyFile()
    if file_to_process:
        file_paths = [pathlib.Path(path, file_to_process)]
    else:
        file_paths = [pathlib.Path(path, file) for file in os.listdir(path)]
        file_paths = [file for file in file_paths if file.is_file()]
    bar = Bar('Processing files:', max=len(file_paths),
              suffix='%(percent).3f%% - %(index)d/%(max)d - %(eta)s secs.')
    for file in file_paths:
        bar.next()
        logging.info(f'Processing {file.name}')
        # Only process the file if it's a valid file type
        if file.suffix[1:] not in valid_extensions:
            logging.info(f'File {file.name} not a valid file type--skipping')
            continue
        file_already_processed = (
            file.parent.parent / 'preprocessed_files' / (file.stem + '_processed' + file.suffix)
        ).exists()
        if file_already_processed:
            continue
        extension = re.search(r'(?<=\.).+$', str(file))[0]
        table_data = pd.read_csv(file, header=None, names=name_files[str(extension)])
        table_data = preprocessor.clean_table(table_data, extension, verbose=False)
        table_data = features.add_features(table_data, extension, verbose=False)
        table_data.fillna('NULL', inplace=True)
        output_file = file.stem + '_processed' + file.suffix
        table_data.to_pickle('data/preprocessed_files/' + output_file)
    bar.finish()
# https://github.com/gramolin/flavours-of-physics

import pandas, xgboost, features, parameters

# Read the training dataset:
train = pandas.read_csv('data/training.csv', index_col='id')
train = train[train['min_ANNmuon'] > 0.4]

# Add extra features:
train = features.add_features(train)

# Train the first (geometric) XGBoost classifier:
bst1 = xgboost.train(parameters.params,
                     xgboost.DMatrix(train[features.list1], train['signal']),
                     parameters.num_trees1)
bst1.save_model('bst1.model')

# Train the second (kinematic) XGBoost classifier:
bst2 = xgboost.train(parameters.params,
                     xgboost.DMatrix(train[features.list2], train['signal']),
                     parameters.num_trees2)
bst2.save_model('bst2.model')
def test_add_features1(): assert (add_features(test_case41) != check41).sum().sum() == 0
# https://github.com/gramolin/flavours-of-physics

import csv
import pandas as pd
import xgboost as xgb
import features
import parameters

# Read the training set:
train = pd.read_csv('data/training.csv', index_col='id')
train = train[train['min_ANNmuon'] > 0.4]

# Add extra features:
train = features.add_features(train)

# Train the first XGBoost booster:
bst1 = xgb.train(parameters.params1,
                 xgb.DMatrix(train[features.list1], train['signal']),
                 parameters.num_trees1)
bst1.save_model('bst1.model')

# Train the second XGBoost booster:
bst2 = xgb.train(parameters.params2,
                 xgb.DMatrix(train[features.list2], train['signal']),
                 parameters.num_trees2)
bst2.save_model('bst2.model')
import csv
import pandas as pd
import xgboost as xgb
import features
import parameters

# Load the first booster:
bst1 = xgb.Booster()
bst1.load_model("bst1.model")

# Load the second booster:
bst2 = xgb.Booster()
bst2.load_model("bst2.model")

# Create a submission file:
with open('submission.csv', 'w') as csvfile:
    csv.writer(csvfile, delimiter=',').writerow(['id', 'prediction'])

# Prediction and output:
for chunk in pd.read_csv("data/test.csv", index_col='id', chunksize=100000):
    # Add extra features:
    chunk = features.add_features(chunk)

    # Predict probabilities:
    probs1 = bst1.predict(xgb.DMatrix(chunk[features.list1]))  # Booster 1
    probs2 = bst2.predict(xgb.DMatrix(chunk[features.list2]))  # Booster 2

    # Weighted average of the predictions:
    result = pd.DataFrame({'id': chunk.index})
    result['prediction'] = 0.5*(parameters.w1*probs1 + (1 - parameters.w1)*probs2)

    # Write to the submission file:
    result.to_csv('submission.csv', index=False, header=False, sep=',', mode='a')
if not os.path.exists(log_folder):
    os.mkdir(log_folder)

logger.info('script mode: {}'.format(args.mode))
logger.info('num_models: {}'.format(num_models))
logger.info('n_epochs: {}'.format(epochs))
logger.info('max_len: {}'.format(max_len))

seed_everything()

logger.info("Loading train and test data ...")
start_time = time()
train = pd.read_csv('datas/train_sample.csv')
test = pd.read_csv('datas/test_sample.csv')
train.dropna(subset=['comment_text'], inplace=True)

train = add_features(train)
test = add_features(test)

x_train = eval(config['preprocessing'])(train['comment_text'])
x_test = eval(config['preprocessing'])(test['comment_text'])

features = train[['caps_vs_length', 'words_vs_unique']].fillna(0)
test_features = test[['caps_vs_length', 'words_vs_unique']].fillna(0)
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
train_features = ss.transform(features)
test_features = ss.transform(test_features)

y_aux_train = train[[
    'target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat'
]].values
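# The add_features used above is not shown. A plausible sketch, inferred from
# the two column names consumed above (hypothetical, not the script's actual
# helper): both are simple ratio features over the raw comment text.
def add_features(df):
    text = df['comment_text'].astype(str)
    # Share of upper-case characters in each comment.
    df['caps_vs_length'] = text.apply(lambda s: sum(c.isupper() for c in s) / max(len(s), 1))
    # Share of unique words among all words in each comment.
    df['words_vs_unique'] = text.apply(lambda s: len(set(s.split())) / max(len(s.split()), 1))
    return df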