Exemplo n.º 1
0
Arquivo: pro.py Projeto: Sandy4321/dwl
def get_training_set(kBestSents, tgtSents, srcSents, T, C, numProc):
    """Build a pairwise-ranking training set from k-best hypothesis lists.

    For every key of ``tgtSents``, up to ``T`` random pairs of hypotheses are
    drawn from ``kBestSents[refNum]``. A pair is rejected when both indices
    are equal or when the BLEU scores differ by less than 0.05, and sampling
    for a reference gives up after ``5*T`` draws. Of the accepted pairs, the
    ``C`` with the largest absolute BLEU difference are turned into examples.

    Args:
        kBestSents: indexable by refNum; each entry is a sequence of
            ``(hypothesis, features, bleuScore)`` triples.
        tgtSents: mapping whose keys are the reference numbers to process
            (the reference text itself is not used here). Keys are used with
            ``% 100`` for progress output, so they are expected to be ints.
        srcSents: indexable by refNum; passed to ``features.add_features``.
        T: number of pairs to sample per reference.
        C: number of sampled pairs to keep per reference.
        numProc: unused; kept so existing callers keep working.

    Returns:
        ``(X, Y)`` where ``X`` is a list of feature-difference arrays and
        ``Y`` the matching list of +1/-1 labels. Every kept pair contributes
        two mirrored examples (difference and negated difference).
    """
    X = []
    Y = []
    # Iterating the mapping directly works on both Python 2 and 3
    # (``iteritems`` exists only on Python 2).
    for refNum in tgtSents:
        candidates = kBestSents[refNum]
        numCandidates = len(candidates)

        mnPairDict = {}
        selected = 0
        tries = 0
        # An empty k-best list has nothing to sample from; without this guard
        # random.randint(0, -1) would raise ValueError.
        while numCandidates > 0 and selected != T:
            tries += 1
            if tries > 5*T:
                break

            i = random.randint(0, numCandidates-1)
            j = random.randint(0, numCandidates-1)
            if i == j:
                continue

            _, _, bleuScore1 = candidates[i]
            _, _, bleuScore2 = candidates[j]
            delBleu = bleuScore1 - bleuScore2

            # Pairs that are too close in BLEU carry no useful ranking signal.
            if abs(delBleu) < 0.05:
                continue

            mnPairDict[(refNum, i, j)] = delBleu
            selected += 1

        if refNum % 100 == 0:
            sys.stderr.write(str(refNum)+' ')

        # Keep the C pairs with the largest absolute BLEU gap.
        ranked = sorted(mnPairDict.items(),
                        key=lambda item: abs(item[1]),
                        reverse=True)
        for (_, i, j), delBleu in ranked[:C]:
            hyp1, feat1, _ = candidates[i]
            hyp2, feat2, _ = candidates[j]

            # Prepend the extra features computed from the source sentence.
            newFeat1 = features.add_features(srcSents[refNum], hyp1) + feat1
            newFeat2 = features.add_features(srcSents[refNum], hyp2) + feat2
            featDiff = numpy.array(newFeat1)-numpy.array(newFeat2)
            # delBleu is never 0 here (|delBleu| >= 0.05), so this is +/-1.
            signDelBleu = abs(delBleu)/delBleu

            X += [featDiff, -1.*featDiff]
            Y += [signDelBleu, -1.*signDelBleu]

    # No trailing ``del`` of loop locals: they are unbound when tgtSents is
    # empty, which used to raise UnboundLocalError.
    return (X, Y)
Exemplo n.º 2
0
def preprocess(**config):
    """Load the feature table (from the cache when present) and split it.

    Recognised keyword options and their defaults:
        pred_var ('Torvet PM10'): forwarded to ``get_targets``.
        stations (['Torvet']): top-level column groups kept next to 'weather'
            when the table is built from the raw file.
        test_size (0.3), val_size (0.1), shuffle (True): forwarded to
            ``split_data``.
        window (6): forwarded to ``get_targets``.

    Returns:
        Whatever ``split_data`` returns for the table and its targets.
    """
    settings = {
        'pred_var': 'Torvet PM10',
        'stations': ['Torvet'],
        'test_size': 0.3,
        'val_size': 0.1,
        'shuffle': True,
        'window': 6,
    }
    settings.update(config)

    if not os.path.exists(cache_path):
        # No cache yet: build the table from the raw two-level-header CSV.
        table = pd.read_csv(data_path, index_col=[0], header=[0, 1])
        table.index.name = 'timestamp'
        table.index = pd.to_datetime(table.index)

        wanted_groups = [*settings['stations'], 'weather']
        table = table[wanted_groups]
        # Flatten the two header levels into single space-joined names.
        table.columns = [' '.join(col).strip() for col in table.columns.values]

        table = handle_missing(table, strategy='mean')
        table = add_features(table, labels=['Torvet PM10', 'Torvet PM2.5'])

        table.to_csv(cache_path)
    else:
        # Cached table: restore the timestamp column as a sorted index.
        cached = pd.read_csv(cache_path)
        timestamps = pd.to_datetime(cached['timestamp'])
        table = cached.set_index(timestamps)
        table = table.drop(columns=['timestamp'])
        table = table.sort_index()

    targets = get_targets(table, settings['pred_var'], settings['window'])
    return split_data(table,
                      targets,
                      val_size=settings['val_size'],
                      test_size=settings['test_size'],
                      shuffle=settings['shuffle'])
Exemplo n.º 3
0
def ds_from_df(data_df, sessions_df, is_test):
    """Turn a raw data frame into a ``DataSet``.

    The frame is passed through ``add_features`` and then
    ``add_sessions_features`` (together with ``sessions_df``). When
    ``is_test`` is false the 'country_destination' column is dropped before
    the ``DataSet`` is created. Entry and exit markers are printed.
    """
    print('ds_from_df <<')
    enriched = add_sessions_features(add_features(data_df), sessions_df)
    if not is_test:
        enriched = enriched.drop(['country_destination'], axis=1)
    print('ds_from_df >>')
    dataset = DataSet.create_from_df(enriched)
    return dataset
Exemplo n.º 4
0
def get_training_set(kBestSents, tgtSents, srcSents, T, C, maxTries=None):
    """Build a pairwise-ranking training set from randomly sampled pairs.

    Random (reference, hypothesis i, hypothesis j) triples are drawn until
    ``T`` of them have been accepted or ``maxTries`` draws were made. A draw
    is rejected when ``i == j`` or when the two hypotheses' BLEU scores
    against the reference differ by less than 0.05. The accepted pairs are
    then visited in order of decreasing absolute BLEU difference, stopping
    once ``C`` pairs have been used.

    Args:
        kBestSents: indexable by reference number; each entry is a sequence
            of ``(hypothesis, features)`` pairs.
        tgtSents: reference sentences, indexable by ``0 .. len(tgtSents)-1``.
        srcSents: indexable by reference number; passed to
            ``features.add_features``.
        T: number of pairs to sample.
        C: number of sampled pairs to turn into examples.
        maxTries: upper bound on the number of random draws. Defaults to
            ``5*T``. Without a bound the sampling loop never terminates when
            too few pairs pass the BLEU threshold.

    Returns:
        ``(X, Y)`` where ``X`` is a list of feature-difference arrays and
        ``Y`` the matching list of +1/-1 labels. Every used pair contributes
        two mirrored examples. Fewer than ``T`` pairs may have been sampled
        if ``maxTries`` was reached.
    """
    if maxTries is None:
        maxTries = 5*T

    # Sample pairs and record their signed BLEU difference.
    mnPairDict = {}
    selected = 0
    tries = 0
    numRefs = len(tgtSents)
    # numRefs == 0 would make random.randint(0, -1) raise ValueError.
    while numRefs > 0 and selected < T and tries < maxTries:
        tries += 1
        refNum = random.randint(0, numRefs-1)
        hyps = kBestSents[refNum]
        if not hyps:
            # Nothing to sample for this reference.
            continue

        i = random.randint(0, len(hyps)-1)
        j = random.randint(0, len(hyps)-1)
        if i == j:
            continue

        ref = tgtSents[refNum]
        hyp1, feat1 = hyps[i]
        hyp2, feat2 = hyps[j]
        delBleu = bleu.bleu_pair(hyp1, ref) - bleu.bleu_pair(hyp2, ref)
        if abs(delBleu) < 0.05:
            continue

        mnPairDict[(refNum, i, j)] = delBleu
        selected += 1
        if selected % 1000 == 0:
            sys.stderr.write(str(selected)+' ')

    X = []
    Y = []
    # ``items`` works on both Python 2 and 3 (``iteritems`` is Python 2 only).
    ranked = sorted(mnPairDict.items(),
                    key=lambda item: abs(item[1]),
                    reverse=True)
    for used, ((refNum, i, j), delBleu) in enumerate(ranked, 1):
        hyp1, feat1 = kBestSents[refNum][i]
        hyp2, feat2 = kBestSents[refNum][j]

        # Prepend the extra features computed from the source sentence.
        newFeat1 = features.add_features(srcSents[refNum], hyp1) + feat1
        newFeat2 = features.add_features(srcSents[refNum], hyp2) + feat2
        featDiff = numpy.array(newFeat1)-numpy.array(newFeat2)
        # delBleu is never 0 here (|delBleu| >= 0.05), so this is +/-1.
        signDelBleu = abs(delBleu)/delBleu

        X += [featDiff, -1.*featDiff]
        Y += [signDelBleu, -1.*signDelBleu]

        if used == C:
            break

    return (X, Y)
Exemplo n.º 5
0
def main():
    """Train a classifier, save it, and write predictions for the testing set.

    Command-line options:
        --model_type: passed to ``get_model`` (default 'RandomForest').
        --test: test mode; output names use the suffix 'test' instead of the
            current date (``%d_%m_%Y``) and the ``add_features`` step is
            skipped for both the training and the testing data.

    The trained classifier is dumped with joblib to
    ``models/classifier_<suffix>/<ClassName>.pkl`` and predictions are
    written through ``output_predictions`` to ``prediction_<suffix>.csv``
    with a threshold of 0.7.
    """
    parser = argparse.ArgumentParser(description='Predict the testing set')
    parser.add_argument('--model_type', default='RandomForest')
    parser.add_argument('--test', action='store_true')
    args = parser.parse_args()

    suffix = 'test' if args.test else time.strftime("%d_%m_%Y")

    model = get_model(args.model_type, args.test)
    print("Loaded Model: %s" % model)

    print("Loading Training Data")
    training = load_training()

    if not args.test:
        print("Adding new features")
        training = add_features(training)

    print("Training Model")
    classifier = train(training, model)

    print("Saving Classifier")
    output_dir = 'models/classifier_%s' % suffix
    # Create the directory (and the missing 'models' parent). Only an
    # already-existing directory is tolerated; any other failure is raised
    # here instead of surfacing later as a confusing joblib.dump error.
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise
    joblib.dump(classifier, '%s/%s.pkl' % (output_dir, classifier.__class__.__name__))

    print("Loading testing set")
    testing = load_testing()

    if not args.test:
        print("Adding new features to testing set")
        testing = add_features(testing)

    print("Making predictions on testing set")
    predictions = predict(classifier, testing)
    output_predictions(predictions, threshold=0.7,
                       filename='prediction_%s.csv' % suffix)
Exemplo n.º 6
0
def process_csv_file(file, tidy_file=True, add_features=False):
    """Read one headerless CSV file and return it with its extension.

    The file's extension (the text after the last dot of the file name)
    selects the column names from ``name_files`` and the columns to remove
    from ``columns_to_delete``.

    Args:
        file: path of the file to read (anything ``str()`` turns into a path).
        tidy_file: when true, run the table through ``tidy.tidy_it_up``.
        add_features: when true, run the table through
            ``features.add_features``.

    Returns:
        ``(table_data, extension)``.

    Raises:
        ValueError: if the file name has no extension.
    """
    # Match only the final suffix of the last path component. The previous
    # pattern captured everything after the FIRST dot of the whole path, so
    # a dotted directory ("data/v1.0/x.DRF") or a leading "./" produced a
    # bogus extension.
    match = re.search(r'(?<=\.)[^./\\]+$', str(file))
    if match is None:
        raise ValueError('Cannot determine the extension of {!r}'.format(str(file)))
    extension = match.group(0)

    #   Read the data in and run it through the cleaner
    table_data = pd.read_csv(file,
                             header=None,
                             names=name_files[extension])
    if tidy_file:
        table_data = tidy.tidy_it_up(table_data, extension)
    if add_features:
        table_data = features.add_features(table_data, extension)
    #   Strip out unused columns
    for column in columns_to_delete[extension]:
        del table_data[column]
    return table_data, extension
Exemplo n.º 7
0
def main():
    """Cross-validate the selected model on the feature-augmented training data.

    Command-line options:
        --model_type: passed to ``get_model`` (default 'RandomForest').
        --test: boolean flag, also passed to ``get_model``.
    """
    arg_parser = argparse.ArgumentParser(description='Predict the testing set')
    arg_parser.add_argument('--model_type', default='RandomForest')
    arg_parser.add_argument('--test', action='store_true')
    options = arg_parser.parse_args()

    estimator = get_model(options.model_type, options.test)
    print("Loaded Model: %s" % estimator)

    print("Loading Training Data")
    raw_training = load_training()

    print("Adding new features")
    training_set = add_features(raw_training)

    print("Running Cross Validaton")
    cross_validate(training_set, estimator)
Exemplo n.º 8
0
def main(path='data/raw_files', file_to_process=None):
    """Clean and featurize raw data files and pickle each resulting table.

    Args:
        path: directory holding the raw files.
        file_to_process: name of a single file inside ``path`` to process;
            when falsy, every regular file in ``path`` is considered.

    Files whose suffix is not one of the valid extensions are skipped, as
    are files for which ``<path>/../preprocessed_files/<stem>_processed<suffix>``
    already exists. Each remaining file is read as a headerless CSV with the
    column names from ``name_files``, passed through the ``TidyFile`` cleaner
    and ``features.add_features``, has missing values replaced by the string
    'NULL', and is pickled under 'data/preprocessed_files/'.
    """
    valid_extensions = {'1', '2', '3', '4', '5', '6', 'DRF'}
    preprocessor = TidyFile()

    if file_to_process:
        file_paths = [pathlib.Path(path, file_to_process)]
    else:
        candidates = (pathlib.Path(path, name) for name in os.listdir(path))
        file_paths = [file for file in candidates if file.is_file()]

    bar = Bar('Processing files:',
              max=len(file_paths),
              suffix='%(percent).3f%% - %(index)d/%(max)d - %(eta)s secs.')

    for file in file_paths:
        bar.next()
        logging.info(f'Processing {file.name}')

        # Only process the file if it's a valid file type. The validated
        # suffix is reused as the extension below; the former regex took
        # everything after the FIRST dot of the whole path and broke on
        # dotted directory names.
        extension = file.suffix[1:]
        if extension not in valid_extensions:
            logging.info(f'File {file.name} not a valid file type--skipping')
            continue

        output_file = file.stem + '_processed' + file.suffix
        file_already_processed = (
            file.parent.parent / 'preprocessed_files' / output_file).exists()
        if file_already_processed:
            continue

        table_data = pd.read_csv(file,
                                 header=None,
                                 names=name_files[extension])
        table_data = preprocessor.clean_table(table_data,
                                              extension,
                                              verbose=False)
        table_data = features.add_features(table_data,
                                           extension,
                                           verbose=False)

        table_data.fillna('NULL', inplace=True)
        # NOTE(review): the existence check above looks next to ``path``
        # while the output goes to a cwd-relative directory; the two only
        # coincide for the default ``path`` -- confirm this is intended.
        table_data.to_pickle('data/preprocessed_files/' + output_file)
    bar.finish()
Exemplo n.º 9
0
# https://github.com/gramolin/flavours-of-physics

import pandas, xgboost, features, parameters

# Load the training data and keep only rows whose min_ANNmuon exceeds 0.4:
train = pandas.read_csv('data/training.csv', index_col='id')
train = train[train['min_ANNmuon'] > 0.4]

# Extend the table with the extra features:
train = features.add_features(train)

# Both classifiers are trained against the same 'signal' column.
signal = train['signal']

# First (geometric) XGBoost classifier, built on features.list1:
geometric_data = xgboost.DMatrix(train[features.list1], signal)
bst1 = xgboost.train(parameters.params, geometric_data, parameters.num_trees1)
bst1.save_model('bst1.model')

# Second (kinematic) XGBoost classifier, built on features.list2:
kinematic_data = xgboost.DMatrix(train[features.list2], signal)
bst2 = xgboost.train(parameters.params, kinematic_data, parameters.num_trees2)
bst2.save_model('bst2.model')
def test_add_features1():
    """``add_features(test_case41)`` must match ``check41`` in every cell."""
    mismatches = add_features(test_case41) != check41
    assert mismatches.sum().sum() == 0
Exemplo n.º 11
0
# https://github.com/gramolin/flavours-of-physics

import csv
import pandas as pd
import xgboost as xgb

import features
import parameters

# Load the training set, keeping only rows whose min_ANNmuon exceeds 0.4:
train = pd.read_csv('data/training.csv', index_col='id')
train = train[train['min_ANNmuon'] > 0.4]

# Extend the table with the extra features:
train = features.add_features(train)

# Both boosters are trained against the same 'signal' column.
target = train['signal']

# First booster: features.list1 columns, params1, num_trees1 rounds.
first_matrix = xgb.DMatrix(train[features.list1], target)
bst1 = xgb.train(parameters.params1, first_matrix, parameters.num_trees1)
bst1.save_model('bst1.model')

# Second booster: features.list2 columns, params2, num_trees2 rounds.
second_matrix = xgb.DMatrix(train[features.list2], target)
bst2 = xgb.train(parameters.params2, second_matrix, parameters.num_trees2)
bst2.save_model('bst2.model')
Exemplo n.º 12
0
import features
import parameters

# Restore both trained boosters from their saved model files:
bst1 = xgb.Booster()
bst1.load_model("bst1.model")
bst2 = xgb.Booster()
bst2.load_model("bst2.model")

# Start the submission file with its header row:
with open('submission.csv', 'w') as csvfile:
    header_writer = csv.writer(csvfile, delimiter=',')
    header_writer.writerow(['id', 'prediction'])

# Score the test set chunk by chunk, appending each batch to the submission:
test_chunks = pd.read_csv("data/test.csv", index_col='id', chunksize=100000)
for chunk in test_chunks:
    # Extend the chunk with the extra features:
    chunk = features.add_features(chunk)

    # Probabilities from each booster on its own feature list:
    probs1 = bst1.predict(xgb.DMatrix(chunk[features.list1]))
    probs2 = bst2.predict(xgb.DMatrix(chunk[features.list2]))

    # Half of the w1-weighted combination of the two predictions:
    blended = parameters.w1*probs1 + (1 - parameters.w1)*probs2
    result = pd.DataFrame({'id': chunk.index})
    result['prediction'] = 0.5*blended

    # Append without header or index so the file keeps a single header row:
    result.to_csv('submission.csv', index=False, header=False, sep=',', mode='a')
Exemplo n.º 13
0
# Restore both trained boosters from their saved model files:
bst1 = xgb.Booster()
bst1.load_model("bst1.model")
bst2 = xgb.Booster()
bst2.load_model("bst2.model")

# Start the submission file with its header row:
with open('submission.csv', 'w') as csvfile:
    header_writer = csv.writer(csvfile, delimiter=',')
    header_writer.writerow(['id', 'prediction'])

# Prediction and output:
for chunk in pd.read_csv("data/test.csv", index_col='id', chunksize=100000):
    # Add extra features:
    chunk = features.add_features(chunk)

    # Predict probabilities:
    probs1 = bst1.predict(xgb.DMatrix(chunk[features.list1]))  # Booster 1
    probs2 = bst2.predict(xgb.DMatrix(chunk[features.list2]))  # Booster 2

    # Weighted average of the predictions:
    result = pd.DataFrame({'id': chunk.index})
    result['prediction'] = 0.5 * (parameters.w1 * probs1 +
                                  (1 - parameters.w1) * probs2)

    # Write to the submission file:
    result.to_csv('submission.csv',
                  index=False,
                  header=False,
                  sep=',',
Exemplo n.º 14
0
# Make sure the log folder exists.
if not os.path.exists(log_folder):
    os.mkdir(log_folder)

# Record the run configuration.
logger.info('script mode: {}'.format(args.mode))
logger.info('num_models: {}'.format(num_models))
logger.info('n_epochs: {}'.format(epochs))
logger.info('max_len: {}'.format(max_len))

seed_everything()

logger.info("Loading train and test datas ...")
start_time = time()
train = pd.read_csv('datas/train_sample.csv')
test = pd.read_csv('datas/test_sample.csv')
# Training rows without a comment text are discarded.
train = train.dropna(subset=['comment_text'])
train, test = add_features(train), add_features(test)

# NOTE(review): config['preprocessing'] is evaluated with eval(); this is
# only safe while the config comes from a trusted source -- confirm.
x_train = eval(config['preprocessing'])(train['comment_text'])
x_test = eval(config['preprocessing'])(test['comment_text'])

# The two ratio columns are standardised with a scaler fitted on the train
# and test rows stacked together.
ratio_columns = ['caps_vs_length', 'words_vs_unique']
features = train[ratio_columns].fillna(0)
test_features = test[ratio_columns].fillna(0)
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
train_features = ss.transform(features)
test_features = ss.transform(test_features)

# Main target plus the auxiliary target columns, as a plain array.
aux_columns = [
    'target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult',
    'threat'
]
y_aux_train = train[aux_columns].values