Example #1
# Assumes module-level img_width, img_height and modelname, plus imports of
# LabelBinarizer (sklearn.preprocessing), load_model (presumably keras.models)
# and the project's utils module.
def main():
    # Get label encoder
    lb = LabelBinarizer()
    lbenc = lb.fit(utils.get_classes())

    # Get train data
    X_train, y_train, train_filenames = utils.get_train(
        '../input/train', list(lbenc.classes_), img_width, img_height)

    # Create and train model
    model = train(X_train, y_train, epochs=100, batch_size=32)

    print("+++++++++++++++++++++++++++++++++++++++++++")

    # Load model ...
    #model = load_model('../models/'+ 'model2_f0.86/'+ 'model2-64-0.341.h5')

    # Get test data
    X_test, X_test_id = utils.get_test('../input/test', img_width, img_height)
    # Predict on test data
    preds = model.predict(X_test, verbose=1)

    # Create submission
    utils.create_submission(lbenc.inverse_transform(preds),
                            X_test_id,
                            output_path="../submissions/",
                            filename=modelname,
                            isSubmission=True)
    utils.to_csv_ens(lbenc.inverse_transform(preds),
                     preds,
                     X_test_id,
                     utils.get_classes(),
                     output_path="../submissions/",
                     filename=modelname)
    print('Finished.')


def eval():
    # Get classes
    lb = LabelBinarizer()
    lbenc = lb.fit(utils.get_classes())

    # Load model
    model = load_model('../models/' + 'model.h5')

    # Get test data
    X_test, X_test_id = utils.get_test('../input/test', img_width, img_height)

    # Predict on test data
    preds = model.predict(X_test, verbose=1)

    # Create ensembling file
    df_csv = utils.to_csv_ens(lbenc.inverse_transform(preds),
                              preds,
                              X_test_id,
                              utils.get_classes(),
                              output_path="../submissions/",
                              filename=modelname)
    # Create submission file
    subm = utils.create_submission(lbenc.inverse_transform(preds),
                                   X_test_id,
                                   output_path="../submissions/",
                                   filename=modelname,
                                   isSubmission=True)
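Both functions above rely on a project-local utils.get_test that loads and resizes the test images and returns them together with their ids. The real implementation is not shown on this page; a minimal sketch (hypothetical, using Pillow and NumPy) consistent with the call sites might be:

import os
import numpy as np
from PIL import Image

def get_test(test_dir, img_width, img_height):
    # Load every image in test_dir, resize it, and keep its filename as the id.
    X, ids = [], []
    for fn in sorted(os.listdir(test_dir)):
        img = Image.open(os.path.join(test_dir, fn)).convert('RGB')
        X.append(np.asarray(img.resize((img_width, img_height)), dtype=np.float32) / 255.0)
        ids.append(fn)
    return np.stack(X), ids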
Example #2
def read_vals(objective, graph_files, tool, in_dir):
    tool_files = ["/".join(path) for path in graph_files if utils.get_tool(utils.get_fn(path)) == tool]
    # Extract times
    name_to_n = utils.key_functions[objective]
    time_pairs = [(name_to_n(utils.get_test(utils.get_fn(path.split("/")))),
                   utils.read_times(in_dir + "/" + path))
                  for path in tool_files]

    # Sort values
    times_sorted = sorted(time_pairs, key=lambda pair: pair[0])
    n_vals = list(map(lambda pair: pair[0], times_sorted))
    t_objective_vals = list(map(lambda pair: pair[1][0], times_sorted))
    t_jacobian_vals = list(map(lambda pair: pair[1][1], times_sorted))

    return (n_vals, t_objective_vals, t_jacobian_vals)
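utils.read_times is not shown here, but the pair[1][0] / pair[1][1] indexing above implies it returns an (objective_time, jacobian_time) pair. A hypothetical stub consistent with that usage:

def read_times(path):
    # Hypothetical format: first line holds two whitespace-separated times.
    with open(path, "r", encoding="ascii") as f:
        t_objective, t_jacobian = map(float, f.readline().split())
    return (t_objective, t_jacobian)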
Example #4
    def get_test_instance(self):
        # Use the cached pickles when both exist; otherwise rebuild from get_test().
        if os.path.exists('../DataSets/mydata/test_txtid.pkl') and os.path.exists('../DataSets/mydata/test_instances.pkl'):
            with open('../DataSets/mydata/test_txtid.pkl', 'rb') as test_txtid_pkl:
                test_txtid = pickle.load(test_txtid_pkl)
            with open('../DataSets/mydata/test_instances.pkl', 'rb') as test_instances_pkl:
                test_instances = pickle.load(test_instances_pkl)
        else:
            test_txtid, test_instances = get_test()

        test_paper_list = self.load_data(test_instances)
        test_tag_list, test_id_list, test_p1_list, test_p2_list, test_y_list = self.test_padding(
            test_paper_list, self.word2id_dic, self.max_sequence_length, self.position_max)
        self.testb_dataset = (test_txtid, test_tag_list, test_id_list, test_p1_list, test_p2_list, test_y_list)
        return self.testb_dataset
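Note that the else branch recomputes the instances but never persists them, so the cache above is only hit if other code writes the pickles. A minimal write step (a sketch, assuming the same paths and that get_test returns the pair) could be:

import pickle

test_txtid, test_instances = get_test()
with open('../DataSets/mydata/test_txtid.pkl', 'wb') as f:
    pickle.dump(test_txtid, f)
with open('../DataSets/mydata/test_instances.pkl', 'wb') as f:
    pickle.dump(test_instances, f)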
Example #5
def read_vals(objective, graph_files, tool, in_dir):
    '''Extracts data for files of the specified tool.'''
    def get_violations(file_name):
        '''Extracts jacobian calculation correctness.'''

        folder, fn = os.path.split(file_name)
        correctness_file_name = os.path.join(
            in_dir, folder, fn.replace(TIMES_SUBSTRING, CORRECTNESS_SUBSTRING))

        if not os.path.isfile(correctness_file_name):
            print(f"WARNING: correctness file {correctness_file_name} "
                  "doesn't exist\n")
            return False

        try:
            with open(correctness_file_name, "r", encoding="ascii") as cf:
                correctness_data = json.load(cf)
                return correctness_data["ViolationsHappened"]
        except Exception as e:
            print(f"WARNING: correctness file {correctness_file_name} parsing "
                  f"failed.\nError message:{e.args}\n")
            return False

    tool_files = [
        os.path.join(*path) for path in graph_files
        if utils.get_tool_from_path(path) == tool
    ]

    if has_manual(tool):
        violation_info = [False for file in tool_files]
    else:
        violation_info = [get_violations(file) for file in tool_files]

    # Extract times
    name_to_n = utils.key_functions[objective]
    info = [(name_to_n(utils.get_test(utils.get_fn(path.split("/")))),
             utils.read_times(os.path.join(in_dir, path)), violation)
            for (path, violation) in zip(tool_files, violation_info)]

    # Sort values
    info_sorted = sorted(info, key=lambda t: t[0])
    n_vals = list(map(lambda t: t[0], info_sorted))
    t_objective_vals = list(map(lambda t: t[1][0], info_sorted))
    t_jacobian_vals = list(map(lambda t: t[1][1], info_sorted))
    violation_vals = list(map(lambda t: t[2], info_sorted))

    return (n_vals, t_objective_vals, t_jacobian_vals, violation_vals)
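get_violations above only requires the correctness file to contain a JSON object with a boolean ViolationsHappened field. A hypothetical minimal file, written from Python (the file name here is made up; real names are derived by swapping TIMES_SUBSTRING for CORRECTNESS_SUBSTRING):

import json

with open("example_correctness.json", "w", encoding="ascii") as cf:
    json.dump({"ViolationsHappened": False}, cf)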
Example #6
File: misc.py  Project: kuruonur1/cwi
def get_vocab(dset):
    return set(w for sent in dset for w in sent['ws'])

def get_contexts(sent, c):
    ws = (['<s>']*c) + sent['ws'] + (['</s>']*c)

    contexts = []
    for i, w in enumerate(sent['ws']):
        wi = i + c
        if sent['ii'][i]:
            contexts.append(' '.join(ws[wi-c:wi] + ['___'] + ws[wi+1:wi+c+1]))
    return contexts
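For illustration, a quick check of get_contexts on a hand-made sentence dict (hypothetical data, window c=2); only words flagged in 'ii' receive a context:

sent = {'ws': ['the', 'ubiquitous', 'cat'], 'ii': [0, 1, 0]}
print(get_contexts(sent, 2))  # ['<s> the ___ cat </s>']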

if __name__ == '__main__':
    trn = get_dset()
    tst = get_test()
    print(list(map(len, map(get_tagged_vocab, [trn, tst]))))
    print('tagged vocab size trn {} tst {}'.format(*map(len, map(get_tagged_vocab, [trn, tst]))))
    print('all vocab size trn {} tst {}'.format(*map(len, map(get_vocab, [trn, tst]))))

    vtrn, vtst = map(get_tagged_vocab, [trn, tst])
    print('tagged vtst diff: {:.2f}'.format(len(vtst.difference(vtrn)) / len(vtst)))

    vtrn, vtst = map(get_vocab, [trn, tst])
    print('all vtst diff: {:.2f}'.format(len(vtst.difference(vtrn)) / len(vtst)))

    # Frequent prefixes/suffixes (length 3-4) of words labeled complex (lbl == 1).
    precnt = Counter(w[:j] for sent in trn for w, lbl in zip(sent['ws'], sent['ls']) for j in range(3, 5) if lbl == 1 and len(w) > j)
    sufcnt = Counter(w[-j:] for sent in trn for w, lbl in zip(sent['ws'], sent['ls']) for j in range(3, 5) if lbl == 1 and len(w) > j)
    print('most common prefixes:', precnt.most_common(100))
    print('most common suffixes:', sufcnt.most_common(100))
Example #7
def submission(model,
               sampling_method,
               data_dir,
               results_dir,
               device='cpu',
               verbose=True):
    if verbose:
        print("Using device: {}".format(device))
        print("Reading train data in...")
    if model == 'lgbm':
        X_train, Y_train, feature_labels = get_train(data_dir, one_hot=False)
    else:
        X_train, Y_train, feature_labels = get_train(data_dir)

    X_test = get_test(data_dir)
    train_ids, test_ids = get_ids(data_dir)
    country_names = get_country_names(data_dir)

    if verbose:
        print("Successfully loaded data")

    lgbm_params = {
        'task': 'train',
        'objective': 'multiclass',
        'num_class': 12,
        'num_leaves': 31,
        'learning_rate': 0.3,
        'lambda_l2': 1.0,
        'feature_fraction': 0.9,
        'min_child_weight': 1.0,
        'device': device,
        'gpu_device_id': 0,
        'gpu_platform_id': 0,
        'max_bin': 63,
        'verbose': 0
    }

    if device == 'cpu':
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "hist",
            "colsample_bytree": 0.9,
            "n_jobs": 2,
            "silent": 1
        }
    else:
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "gpu_hist",
            "colsample_bytree": 0.9,
            "gpu_id": 0,
            "max_bin": 16,
            "silent": 1
        }
    if verbose:
        print("{} sampling process started...".format(sampling_method))
    curr_time = time.time()

    if sampling_method == "adasyn":
        X_train_resampled, Y_train_resampled = ADASYN().fit_sample(
            X_train, Y_train)
    elif sampling_method == "smote":
        X_train_resampled, Y_train_resampled = SMOTE().fit_sample(
            X_train, Y_train)
    elif sampling_method == "random":
        X_train_resampled, Y_train_resampled = RandomOverSampler().fit_sample(
            X_train, Y_train)
    elif sampling_method == "smoteenn":
        X_train_resampled, Y_train_resampled = SMOTEENN().fit_sample(
            X_train, Y_train)
    else:
        X_train_resampled, Y_train_resampled = X_train, Y_train

    if verbose:
        print("Oversampling completed")
        print("Time Taken: {:.2f}".format(time.time() - curr_time))
        print("Size of Oversampled data: {}".format(X_train_resampled.shape))
        print("{} selected for classification".format(model))

    curr_time = time.time()
    if model == 'lgbm':
        categorical_feature = [
            'age_bucket', 'gender', 'signup_method', 'signup_flow', 'language',
            'affiliate_channel', 'affiliate_provider',
            'first_affiliate_tracked', 'signup_app', 'first_device_type',
            'first_browser'
        ]
        lgb_train = lgb.Dataset(data=X_train_resampled,
                                label=Y_train_resampled,
                                feature_name=feature_labels,
                                categorical_feature=categorical_feature)
        clf = lgb.train(lgbm_params, lgb_train, num_boost_round=30)
        print("Time taken: {:.2f}".format(time.time() - curr_time))
        Y_probs = clf.predict(X_test)
        # Top-5 classes per row: sort over all 12 class probabilities, then take 5.
        order = np.argsort(-Y_probs, axis=1)[:, :5]
    else:
        X_train_xgb = xgb.DMatrix(X_train_resampled,
                                  Y_train_resampled,
                                  feature_names=feature_labels)
        X_test_xgb = xgb.DMatrix(X_test, feature_names=feature_labels)
        clf = xgb.train(xgb_params, X_train_xgb, 30)
        print("Time taken: {:.2f}".format(time.time() - curr_time))
        Y_probs = clf.predict(X_test_xgb)
        order = np.argsort(-Y_probs, axis=1)[:, :5]

    print("Generating submission csv...")
    with open(os.path.join(results_dir, 'submission_{}.csv'.format(model)),
              'w') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['id', 'country'])
        for i in range(len(test_ids)):
            for k in range(5):
                writer.writerow([test_ids[i], country_names[order[i, k]]])
    print("Finished.")
Example #8
File: cwi.py  Project: kuruonur1/cwi
    gold_labels = []
    pred_labels = []
    for sent, pred in zip(dset, preds):
        gold_labels.extend([sent['ls'][ii] for ii, interested in enumerate(sent['ii']) if interested])
        # pred_labels.extend([pred[ii] for ii, interested in enumerate(sent['ii']) if interested])
        pred_labels.extend(pred)
    logging.debug(tabulate(confusion_matrix(np.array(gold_labels), np.array(pred_labels)), headers=[0,1]))
    p, r, f = evaluate_system.evaluateIdentifier(gold_labels, pred_labels)
    return p,r,f

if __name__ == '__main__':
    parser = get_arg_parser()
    args = vars(parser.parse_args())

    setup_logger(args)

    logging.debug(tabulate([OrderedDict((k, v) for k, v in sorted(args.items()))], headers='keys'))

    if args['testf']:
        trn = utils.get_dset(args['data'])
        tst = utils.get_test()
        ytrn, ytst = fit_predict(trn, tst, args, Emb(trn+tst))
        with open(args['testf'], 'w') as out:
            out.write('\n'.join([str(y) for y in ytst]))
    else:
        dset = utils.get_dset(args['data'])
        if args['sample'] > 0:
            random.seed(0)
            dset = random.sample(dset, args['sample'])
        xvalidate(dset, args, Emb(dset))

Example #9
# create initializations for the three different test problems
# for every problem: create 100*5 random X values, distributed on the given slices
# for every problem: create corresponding responses
# save the data to be used in the torch and GPy implementation
from utils import get_test
import numpy as np
import torch
import pandas as pd

torch.manual_seed(42)
np.random.seed(42)

problems = ['branin', 'eggholder', 'camel']
for problem in problems:
    testfunction, slices, scalers, hyperparameters = get_test(problem)

    # create 500 points on [0, 1]
    X1 = np.random.rand(500, 1)
    X2 = np.random.choice(slices.numpy(),
                          (500, 1))  # slices are already scaled

    X = scalers[0].inverse_transform(
        torch.tensor(np.concatenate((X1, X2),
                                    axis=1)))  # scale back for evaluation

    y = testfunction(torch.tensor(X))

    # write to csv
    df = pd.DataFrame(np.concatenate((X, y.reshape(-1, 1)), axis=1),
                      columns=['X1', 'X2', 'y'])
    df.to_csv('initialization_' + problem + '.csv', index=False)
Example #10
def min_max_optimization():
    np.random.seed(42)
    torch.manual_seed(32)
    iterations_list = [20, 20, 100]
    for problem_idx, problem in enumerate(['branin', 'camel', 'eggholder']):

        testfunction, slices, scalers, hyperparameters = get_test(problem)

        n_init = 5
        jj = 0

        # load data
        df_read = pd.read_csv('initialization_' + problem + '.csv')
        print("read data")
        for initialization in range(int(df_read.shape[0] / n_init)):
            X = np.array(df_read.iloc[(initialization *
                                       n_init):(initialization * n_init +
                                                n_init), 0:2])
            z = np.array(df_read.iloc[(initialization *
                                       n_init):(initialization * n_init +
                                                n_init), 2]).reshape(-1, 1)

            # scale
            X_scaled = torch.tensor(scalers[0].transform(X),
                                    dtype=torch.float32)
            z_scaled = torch.tensor(scalers[1].transform(z),
                                    dtype=torch.float32)

            # run the optimization
            iterations = iterations_list[problem_idx]
            model = build_models(X_scaled, z_scaled, hyperparameters)
            model = model.eval()

            results = torch.zeros((1, 7))
            print("started optimization")
            for i in range(iterations):
                new_candidate, min_max_location, current_min_max = thompson_sampling_acquisition(
                    model, slices)
                current_min_max_unscaled = torch.tensor(
                    scalers[1].inverse_transform(
                        current_min_max.detach().numpy().reshape(1, 1)))
                min_max_location_unscaled = torch.tensor(
                    scalers[0].inverse_transform(
                        min_max_location.detach().numpy()))
                new_candidate_unscaled = torch.tensor(
                    scalers[0].inverse_transform(
                        new_candidate.detach().numpy()))

                new_function_value = testfunction(
                    new_candidate_unscaled.reshape(1, -1))

                # update the model
                model = model.condition_on_observations(
                    new_candidate,
                    torch.tensor(scalers[1].transform(
                        new_function_value.numpy().reshape(-1, 1))))

                print('new candidate:', new_candidate_unscaled)
                print('min max location:', min_max_location_unscaled)
                print('current min max:', current_min_max_unscaled)

                print('iteration ', i)
                results[0, 0] = i
                results[0, 1:3] = new_candidate_unscaled
                results[0, 3:5] = min_max_location_unscaled
                results[0, 5] = current_min_max_unscaled
                results[0, 6] = initialization

                df = pd.DataFrame(results.detach().numpy(),
                                  columns=[
                                      'i', 'x_cand0', 'x_cand1', 'min_max0',
                                      'min_max1', 'min_max_val', 'init'
                                  ])
                df['problem'] = problem
                if jj == 0:
                    df.to_csv(problem + '_results_thompson.csv', index=False)
                else:
                    df.to_csv(problem + '_results_thompson.csv',
                              mode='a',
                              header=False,
                              index=False)
                jj += 1

            print('finished the optimization')
            best_valid_mae = model.mae_valid
            best_epoch_trial = model.early_stop_epoch

        print('trial valid rmse of best epoch:', model.best_rmse)
        print('trial valid r2 of best epoch:', model.r2_valid)
        print('trial valid mae of best epoch:', model.mae_valid)

    with open('params_dict.txt', 'w', encoding="utf8") as outfile:
        json.dump(trials_dict, outfile)

    return best_valid_rmse, best_valid_r_2, best_valid_mae, best_epoch_trial


if __name__ == '__main__':
    train, validation = get_data(True, 1, 1)
    test = get_test()
    # SGD
    best_valid_rmse_sgd, best_valid_r_2_sgd, best_valid_mae_sgd, best_epoch = \
        hyper_param_tuning('SGD', SGD_HYPER_PARAMS)

    print('best SGD model rmse:', best_valid_rmse_sgd)
    print('best SGD model r2:', best_valid_r_2_sgd)
    print('best SGD model mae:', best_valid_mae_sgd)

    final_model = SGD()
    final_model.fit_early_stop(train, validation, best_epoch)

    test['pred'] = test.apply(lambda row:
                              final_model.predict(row[USER_COL],
                                                  row[ITEM_COL]), axis=1)
    test[[USER_COL, ITEM_COL]] = test[[USER_COL, ITEM_COL]].apply(lambda col: col + 1)
    print("Ensemble : ", mean_square_error(target_val, mean_pred_data))
"""
Create a file of the ensemble prediction
 in the corresponding format for a submission on kaggle
"""

all_preds_han = []
for i in range(len(target_idx)):
    idx = target_idx[i]
    nb_model = 1
    list_pred = []
    # each model first predicts on the test documents
    for j in range(len(type_loss)):  # j avoids shadowing the outer index i

        name_doc = list_name_documents[j]
        docs_val = get_test(name_doc=name_doc, idx_target=idx, config=config)

        if type_loss[j] == "mse":
            custom_loss = "mean_squared_error"
        elif type_loss[j] == "higher":
            custom_loss = mse_asymetric_higher
        elif type_loss[j] == "lower":
            custom_loss = mse_asymetric_lower

        if type_model[j] == 1:
            model = get_model_1(docs_train=docs_val,
                                config=config,
                                name_embeddings=list_embeddings[j],
                                custom_loss=custom_loss)
        elif type_model[j] == 2:
            model = get_model_2(docs_train=docs_val,