Пример #1
0
def get_data_for_learner():
    """Build the training DataLoader plus validation and test tensors.

    Returns:
        train_dl: shuffled DataLoader over the training set.
        x_valid: validation features (float tensor).
        y_valid: validation targets (long tensor).
        test: test features (float tensor).
    """
    raw_train, raw_test = u.get_data()
    raw_train = u.preprocess_data(raw_train, True)
    raw_test = u.preprocess_data(raw_test, False)
    X_tr, X_va, y_tr, y_va = u.train_validation_split(raw_train, VALID_PERCENTAGE)

    # Convert to tensors with the dtypes the model expects.
    x_train = torch.tensor(X_tr).float()
    y_train = torch.tensor(y_tr).long()
    x_valid = torch.tensor(X_va).float()
    y_valid = torch.tensor(y_va).long()
    test = torch.tensor(raw_test.to_numpy()).float()

    train_dl = DataLoader(TensorDataset(x_train, y_train),
                          batch_size=bs, shuffle=True)
    return train_dl, x_valid, y_valid, test
 def get_rank(self, arXiv_df, field_df):
     """Rank all papers by similarity to the field description.

     Returns a copy of *arXiv_df* with a 'score' column, sorted by
     descending score.
     """
     paper_matrix = self._model.transform(
         preprocess_data(arXiv_df[['title', 'abstract']])).toarray()
     field_matrix = self._model.transform(
         preprocess_data(field_df)).toarray()
     ranked = arXiv_df.copy()
     ranked['score'] = self.get_similarity(paper_matrix, field_matrix)
     ranked.sort_values(by='score', inplace=True, ascending=False)
     return ranked
Пример #3
0
def main():
    """Train a BERT-based toxic-comment model and write a submission.

    Loads train/test data, tokenizes with BERT, trains via
    utils.run_model_pytorch, prints the bias-adjusted validation metric,
    and saves predictions under a timestamped result directory.
    """
    # Kept from the pre-BERT pipeline; currently unused but documented
    # as the expected column / sizes.
    COMMENT_TEXT_COL = 'comment_text'
    EMB_MAX_FEAT = 300
    MAX_FEATURES = 100000
    MAX_LEN = 220
    BATCH_SIZE = 256
    NUM_EPOCHS = 1
    LSTM_UNITS = 64

    if args.debug:
        print('running in debug mode')
    timestamp = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S')
    prefix = 'debug-' if args.debug else ''
    result_dir = os.path.join(utils.RESULT_DIR, prefix + timestamp)
    os.mkdir(result_dir)
    print(f'created: {result_dir}')

    train_data = ToxicDataset(mode='train', debug=args.debug)
    test_data = ToxicDataset(mode='test')
    train, test = train_data.data, test_data.data
    train = utils.preprocess_data(train, mode='train')
    test = utils.preprocess_data(test)
    tokenizer = BertTokenizer.from_pretrained(utils.BERT_MODEL_PATH,
                                              do_lower_case=True)
    X_train, X_test, y_train = utils.run_bert_tokenizer(tokenizer, train, test,
                                                        seq_len=MAX_LEN)
    # BERT supplies its own embeddings, so no word index / embedding matrix.
    word_index = None
    embedding_matrix = None
    sub_preds, oof_df = utils.run_model_pytorch(
        result_dir, X_train, X_test, y_train, embedding_matrix,
        word_index, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
        max_len=MAX_LEN, lstm_units=LSTM_UNITS, oof_df=train)
    bias_metrics_df = utils.compute_bias_metrics_for_model(
        dataset=oof_df,
        subgroups=utils.IDENTITY_COLS,
        model=utils.PREDICT_COL,
        label_col=utils.TOXICITY_COLUMN)
    # Fixed typo: local was previously named `validation_final_socre`.
    validation_final_score = utils.get_final_metric(
        bias_metrics_df,
        utils.calculate_overall_auc(oof_df, utils.TOXICITY_COLUMN))
    print(f'validation final score: {validation_final_score}')
    utils.submit(result_dir, sub_preds)
    print('finish!!!')
Пример #4
0
def render_graph(graph_type, dict_data, bind_css=None, css_file_names=None):
    """Render *dict_data* as an HTML graph of the given type.

    Args:
        graph_type: key identifying which graph renderer to use.
        dict_data: graph data; pre-processed in place by utils.
        bind_css: optional CSS binding; defaults to the type's default.
        css_file_names: optional extra CSS file names for the renderer.

    Returns:
        IPython ``display.HTML`` object with the rendered graph.
    """
    # Pre-process data (mutates dict_data in place).
    utils.preprocess_data(graph_type, dict_data)

    # Hoisted: the original evaluated get_default_css_binding twice.
    if bind_css is None:
        default_css = utils.get_default_css_binding(graph_type)
        if default_css is not None:
            bind_css = default_css

    return display.HTML(
        _render_graph(graph_type, dict_data, bind_css, css_file_names))
Пример #5
0
def main(train=True):
    """Train models via cross-validation, or load saved ones, then test.

    Args:
        train: when True run train_cross_val; otherwise load pre-trained
            models from the Kaggle model directory.

    Returns:
        The trained (or loaded) models.
    """
    params = {
        'batch_size': 4986,
        'dim_1': 248,
        'dim_2': 487,
        'dim_3': 269,
        'dim_4': 218,
        'dim_5': 113,
        'activation': nn.ReLU,
        'dropout': 0.01563457578202565,
        'lr': 0.00026372556533974916,
        'label_smoothing': 0.06834918091900156,
        'weight_decay': 0.005270589494631074,
        'amsgrad': False
    }
    if train:
        models, features = train_cross_val(params)
    else:
        raw = load_data(root_dir='./data/', mode='train')
        raw, target_, features, date = preprocess_data(raw, nn=True)
        model_path = '/kaggle/input/model-files'
        # presumably warms the cached feature means used at inference — TODO confirm
        f_mean = calc_data_mean(raw, 'cache')
        models = load_model(model_path, raw.shape[-1], 1, params, False)
    test_model(models, features)
    return models
Пример #6
0
def train_autoencoder():
    """Train the AutoEncoder on the full training set for up to 500 epochs.

    Checkpoints and early-stops on training loss ('t_loss'); only the
    train dataloader is fitted below.
    """
    data = utils.load_data(root_dir='./data/', mode='train')
    data, target, features, date = utils.preprocess_data(data, nn=True)
    dataset = utils.FinData(data=data, target=target, date=date)
    # Hyper-parameters, presumably from an earlier HPO run — TODO confirm.
    p = {'batch_size': 4597,
         'dim_1': 231,
         'dim_2': 851,
         'dim_3': 777,
         'dim_4': 192,
         'hidden': 50,
         'dropout': 0.017122456592972537,
         'lr': 0.0013131268366473552,
         'activation': nn.GELU,
         'label_smoothing': 0.09401544509474698,
         'weight_decay': 0.005078413740277699,
         'amsgrad': True}
    # Idiom: list(range(n)) instead of `[i for i in range(n)]`.
    train_idx = list(range(len(data)))
    # NOTE(review): the 'val' indices overlap the train indices — confirm
    # intentional (trainer.fit below only uses the train loader anyway).
    val_idx = list(range(10000))
    dataloaders = utils.create_dataloaders(dataset,
                                           indexes={
                                               'train': train_idx, 'val': val_idx},
                                           batch_size=p['batch_size'])

    checkpoint_callback = ModelCheckpoint(
        dirpath='logs', monitor='t_loss', mode='min', save_top_k=1, period=10)
    input_size = data.shape[-1]
    output_size = 1
    model = AutoEncoder(input_size=input_size,
                        output_size=output_size, params=p)
    es = EarlyStopping(monitor='t_loss', patience=10,
                       min_delta=0.0005, mode='min')
    trainer = pl.Trainer(max_epochs=500, gpus=1, callbacks=[checkpoint_callback, es],
                         precision=16)
    trainer.fit(model, train_dataloader=dataloaders['train'])
Пример #7
0
def main():
    """Run Optuna HPO for XGBoost and LightGBM, logging to Neptune.

    Each study runs 100 maximize-direction trials and is dumped to
    HPO/<prefix>_hpo_<date>.pkl.
    """
    api_token = read_api_token()
    neptune.init(api_token=api_token,
                 project_qualified_name='jamesmccarthy65/JSMP')
    data = load_data('data/', mode='train', overide='filtered_train.csv')
    data, target, features, date = preprocess_data(data)
    data_dict = {
        'data': data,
        'target': target,
        'features': features,
        'date': date
    }

    def _run_hpo(exp_name, objective, out_prefix):
        # One Neptune-logged Optuna study; duplicated code factored out.
        exp = neptune.create_experiment(exp_name)
        neptune_callback = opt_utils.NeptuneCallback(experiment=exp)
        study = optuna.create_study(direction='maximize')
        study.optimize(lambda trial: objective(trial, data_dict),
                       n_trials=100,
                       callbacks=[neptune_callback])
        joblib.dump(study,
                    f'HPO/{out_prefix}_hpo_{str(datetime.datetime.now().date())}.pkl')

    print('creating XGBoost Trials')
    _run_hpo('XGBoost_HPO', optimize, 'xgb')
    print('Creating LightGBM Trials')
    _run_hpo('LGBM_HPO', loptimize, 'lgb')
Пример #8
0
    def __init__(self, args, rank):
        """Set up the model, data loaders, optimizer and loss for one worker.

        Args:
            args: parsed CLI args (model_name, image_path, batch_size,
                num_workers, ...).
            rank: this worker's rank, forwarded to the data pipeline.
        """
        self.rank = rank
        self.epoch_loss = 0
        self.epoch_acc = 0

        self.args = args

        self.model, input_size, self.quant_model = initialize_model(
            args.model_name, get_num_classes(args.image_path))

        self.dataloaders_dict = preprocess_data(args.image_path,
                                                args.batch_size, input_size,
                                                args.num_workers, rank)

        self.train_iterator = iter(self.dataloaders_dict['train'])

        # Collect only trainable parameters for the optimizer
        # (idiom fix: was `param.requires_grad == True`).
        print("Params to learn:")
        params_to_update = []
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)

        self.optimizer = optim.Adam(params_to_update, lr=0.001)

        self.criterion = nn.CrossEntropyLoss()
Пример #9
0
def predict_api():
    """Flask endpoint: extract entities from an uploaded resume (POST only)."""
    if request.method != 'POST':
        return None  # non-POST requests get no body, as before
    resume_bytes = io.BytesIO(request.files.get('resume').read())
    resume_text = preprocess_data(resume_bytes)
    entities = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text,
                       MAX_LEN)
    return jsonify({'entities': entities})
Пример #10
0
def load_for_jupyter():
    """Load raw graph data plus its preprocessed vertex/edge sets.

    Returns:
        (v_data, e_data, v_sets, e_sets, core_targets, ext_targets,
        core_testing) — everything needed for notebook exploration.
    """
    loaded = utils.load_data(from_jup=True)
    v_data, e_data, core_targets, ext_targets, core_testing = loaded
    v_sets, e_sets = utils.preprocess_data(v_data, e_data, core_targets,
                                           ext_targets, core_testing)
    return (v_data, e_data, v_sets, e_sets, core_targets, ext_targets,
            core_testing)
Пример #11
0
def test_model(obs_file=OBS_FILE, length_file=LENGTH_FILE):
    """Validate a saved interval model on the first observation column."""
    observations, lengths = utils.read_files(obs_file, length_file)
    observations = utils.preprocess_data(observations)
    model = joblib.load("../interval_model/model_mbs.pkl")
    model._print_info()
    # Only column 0 is validated, reshaped into a single-feature matrix.
    validate_model(model, observations[:, 0].reshape(-1, 1), lengths)
Пример #12
0
def get_markov_chain_for_each(model, obs, lengths, patients, col_index):
    """Accumulate per-patient HMM state proportions for one column.

    Args:
        model: fitted HMM whose ``predict`` labels each observation.
        obs: observation matrix; thresholded by column before prediction.
        lengths: array of (patient_id, sequence_length) rows.
        patients: dict patient_id -> Patient, updated in place.
        col_index: which observation column is being processed.
    """
    print("obs.shape: ", obs.shape)
    obs = utils.preprocess_data(obs, THREADHOLD[col_index])
    start_index = 0
    for i in range(0, len(lengths)):
        patient_id = lengths[i, 0]
        end_index = start_index + lengths[i, 1]
        states = model.predict(obs[start_index:end_index, :])
        state_frequency = np.bincount(states)
        # Pad so every patient vector has exactly N_COMPONENTS entries.
        state_frequency = np.pad(state_frequency,
                                 (0, N_COMPONENTS - state_frequency.shape[0]),
                                 'constant',
                                 constant_values=0)
        if USE_PROPORTION:
            state_proportion = state_frequency / np.sum(state_frequency)
        else:
            state_proportion = state_frequency
        # Deduplicated: create the record if missing, then add exactly once
        # (the original repeated add_state_proportion in both branches).
        if patient_id not in patients:
            patients[patient_id] = Patient.Patient(patient_id)
        patients[patient_id].add_state_proportion(state_proportion,
                                                  col_index)
        start_index = end_index
Пример #13
0
def entity_train(logger, tokenizer, model, to_be_trained_entities,
                 yanbao_texts):
    """Fine-tune *model* for entity extraction on yanbao texts.

    Shuffles the texts, splits them 90/10 into train/dev, trains for
    TOTAL_EPOCH_NUMS epochs and saves the checkpoint with the best dev F1.
    """
    entities_json = to_be_trained_entities
    train_proportion = 0.9

    # NOTE: shuffles the caller's list in place (as the original did).
    random.shuffle(yanbao_texts)
    # len() is already an int, so the original's int(len(...)) was redundant;
    # the split point is computed once instead of twice.
    split_at = int(len(yanbao_texts) * train_proportion)
    yanbao_texts_train = yanbao_texts[:split_at]
    yanbao_texts_dev = yanbao_texts[split_at:]

    train_preprocessed_datas = preprocess_data(entities_json,
                                               yanbao_texts_train, tokenizer)
    train_dataloader = build_dataloader(train_preprocessed_datas,
                                        tokenizer,
                                        batch_size=BATCH_SIZE)

    dev_preprocessed_datas = preprocess_data(entities_json, yanbao_texts_dev,
                                             tokenizer)
    dev_dataloader = build_dataloader(dev_preprocessed_datas,
                                      tokenizer,
                                      batch_size=BATCH_SIZE)

    best_evaluate_score = 0
    for epoch in range(TOTAL_EPOCH_NUMS):
        epoch_start_time = time.time()
        train(model,
              train_dataloader,
              logger=logger,
              epoch_id=epoch,
              device=DEVICE)
        evaluate_score = evaluate(model,
                                  dev_dataloader,
                                  logger=logger,
                                  tokenizer=tokenizer,
                                  device=DEVICE)
        f1 = evaluate_score['f']
        p = evaluate_score['p']
        r = evaluate_score['r']
        duration = time.time() - epoch_start_time
        print('f1:', f1, 'p:', p, 'r:', r, 'time:', duration)
        # Keep only the best checkpoint by dev F1.
        if f1 > best_evaluate_score:
            best_evaluate_score = f1
            save_model_path = os.path.join(SAVE_MODEL_DIR, 'best_en_model.pth')
            logger.info('saving model to {}'.format(save_model_path))
            model.save(save_model_path, epoch)
Пример #14
0
def train_cross_val(p):
    """Train one Classifier per purged time-series CV fold.

    Args:
        p: hyper-parameter dict (batch_size, dim_*, activation, lr, ...).

    Returns:
        (models, features): trained models (one per fold) and the feature
        list produced by preprocess_data.
    """
    data_ = load_data(root_dir='./data/', mode='train')
    data_, target_, features, date = preprocess_data(data_, nn=True)

    # Purged split with a 5-group gap to limit temporal leakage between folds.
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)

    input_size = data_.shape[-1]
    output_size = 1
    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    models = []
    for i, (train_idx, val_idx) in enumerate(gts.split(data_, groups=date)):
        # Deep-copy just this fold's rows so the fold-local mean imputation
        # below cannot mutate the shared arrays.
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_[idx])
        target = copy.deepcopy(target_[idx])
        checkpoint_callback = pl.callbacks.ModelCheckpoint(os.path.join(
            'models/', "fold_{}".format(i)),
                                                           monitor="val_auc",
                                                           mode='max',
                                                           save_top_k=1,
                                                           period=10)
        model = Classifier(input_size=input_size,
                           output_size=output_size,
                           params=p)
        if p['activation'] == nn.ReLU:
            model.apply(lambda m: init_weights(m, 'relu'))
        elif p['activation'] == nn.LeakyReLU:
            model.apply(lambda m: init_weights(m, 'leaky_relu'))
        # Re-base indices to positions within the concatenated fold array
        # (train rows first, then validation). NOTE(review): assumes
        # train_idx is a contiguous 0..max range — confirm against the
        # splitter's output.
        train_idx = [i for i in range(0, max(train_idx) + 1)]
        val_idx = [i for i in range(len(train_idx), len(idx))]
        data[train_idx] = calc_data_mean(data[train_idx],
                                         './cache',
                                         train=True,
                                         mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx],
                                       './cache',
                                       train=False,
                                       mode='mean')
        dataset = FinData(data=data, target=target, date=date)
        dataloaders = create_dataloaders(dataset,
                                         indexes={
                                             'train': train_idx,
                                             'val': val_idx
                                         },
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_auc',
                           patience=10,
                           min_delta=0.0005,
                           mode='max')
        trainer = pl.Trainer(logger=tb_logger,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[checkpoint_callback, es],
                             precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        # Persist both the checkpoint (above) and a raw state_dict per fold.
        torch.save(model.state_dict(), f'models/fold_{i}_state_dict.pth')
        models.append(model)
    return models, features
    def get_top_k(self, arXiv_df, field_df, k, addition_df=None):
        """Return the *k* papers most similar to the field description.

        Args:
            arXiv_df: papers frame with 'title' and 'abstract' columns.
            field_df: field-description documents.
            k: number of papers to return.
            addition_df: optional extra documents folded into the score.

        Returns:
            The top-k rows of *arXiv_df*, most similar first.
        """
        field_matrix = self._model.transform(
            preprocess_data(field_df)).toarray()
        paper_matrix = self._model.transform(
            preprocess_data(arXiv_df[['title', 'abstract']])).toarray()
        # Idiom fix: `is not None` instead of `not ... is None`.
        if addition_df is not None:
            addition_matrix = self._model.transform(
                preprocess_data(addition_df)).toarray()

            result = np.argsort(
                self.get_similarity(paper_matrix, field_matrix,
                                    addition_matrix))[::-1]
        else:
            result = np.argsort(self.get_similarity(paper_matrix,
                                                    field_matrix))[::-1]
        return arXiv_df.loc[result[:k]]
Пример #16
0
def get_train_data():
    """Build batch managers for normal and transfer NER datasets.

    Returns:
        (train_manager, test_manager, transfer_train_manager,
        transfer_test_manager, id2char, id2tag, transfer_id2tag)
    """
    normal_train, normal_test = get_sentence(args.train_data, args.test_data)
    transfer_train, transfer_test = get_sentence(args.transfer_train_data, args.transfer_test_data)
    char2id, id2char, tag2id, id2tag, transfer_tag2id, transfer_id2tag = get_transform(normal_train + transfer_train,
                                                                                       args.map_path,
                                                                                       args.tag2label_path,
                                                                                       args.transfer_tag2label_path)

    def _make_manager(sentences, tag_map):
        # Encode sentences with the shared char map and wrap in a batcher
        # (deduplicates the preprocess/BatchManager pattern used 4x below).
        return BatchManager(preprocess_data(sentences, char2id, tag_map),
                            args.batch_size)

    train_manager = _make_manager(normal_train, tag2id)
    test_manager = _make_manager(normal_test, tag2id)
    transfer_train_manager = _make_manager(transfer_train, transfer_tag2id)
    transfer_test_manager = _make_manager(transfer_test, transfer_tag2id)

    return train_manager, test_manager, transfer_train_manager, transfer_test_manager, id2char, id2tag, transfer_id2tag
Пример #17
0
def transform_and_save(source_path, target_path):
    """Read a CSV, preprocess it, and persist the result as a .npy file.

    Args:
        source_path: path to the input CSV.
        target_path: pathlib.Path-like destination (``.absolute()`` is used).
    """
    raw = pd.read_csv(source_path)
    processed = preprocess_data(raw)
    np.save(str(target_path.absolute()), processed)
 def get_top_k_with_kw(self, arXiv_df, field_df, k, addition_df=None):
     """Return the top-*k* papers, boosting scores by matched keywords.

     When *addition_df* is absent, each paper's similarity score gets a
     bonus of ``0.01 * 2**n`` where n is the number of entries in its
     'contain_keywords' column ('kw1;kw2;...;' — trailing ';' dropped).
     """
     field_matrix = self._model.transform(
         preprocess_data(field_df)).toarray()
     paper_matrix = self._model.transform(
         preprocess_data(arXiv_df[['title', 'abstract']])).toarray()
     # Idiom fix: `is not None` instead of `not ... is None`.
     if addition_df is not None:
         addition_matrix = self._model.transform(
             preprocess_data(addition_df)).toarray()
         result = np.argsort(
             self.get_similarity(paper_matrix, field_matrix,
                                 addition_matrix))[::-1]
     else:
         score = self.get_similarity(paper_matrix, field_matrix)
         # item[:-1] drops the empty element after the trailing ';'.
         addition_score = arXiv_df['contain_keywords'].str.strip(
         ).str.split(';').apply(
             lambda item: 0.01 * 2**len(item[:-1])).values
         score += addition_score
         result = np.argsort(score)[::-1]
     return arXiv_df.loc[result[:k]]
Пример #19
0
    def __getitem__(self, idx):
        """Return ``(series, labels)`` tensors for the case at *idx*."""
        case_path = self.case_paths[idx]
        series = preprocess_data(case_path, self.transform)

        # The case id is the file name without its extension.
        case_id = int(os.path.splitext(os.path.basename(case_path))[0])
        case_row = self.labels_df[self.labels_df.case == case_id]
        labels = torch.tensor(case_row.values[0, 1:].astype(np.float32))

        return series, labels
Пример #20
0
 def get_search_results(cls, s):
     """Return the 10 slugs whose vectors are closest to query *s*.

     Closeness is cosine distance between the inferred query vector and
     the stored model vectors.
     """
     slugs, model, bigrams = cls.get_artifacts()
     query_tokens = list(bigrams[preprocess_data(s, cls.regex, True, True)])
     query_vector = model.infer([(query_tokens, 0)])
     distances = cdist(query_vector, model.sv.vectors,
                       metric='cosine').squeeze()
     ranked = sorted(enumerate(distances), key=lambda pair: pair[1])[:10]
     return [slugs[index] for index, _ in ranked]
Пример #21
0
def init_dm():
    """Load MNIST, preprocess, carve off an eval split, build a manager.

    Returns:
        (DatasetsManager, y_train): the manager holds the train/eval/test
        splits; y_train is returned separately.
    """
    (X_TRAIN, Y_TRAIN), (X_TEST, Y_TEST) = load_mnist10()
    # !TODO just for tests — cap the training set at 12k samples.
    X_TRAIN, Y_TRAIN = X_TRAIN[:12000], Y_TRAIN[:12000]
    x_train = preprocess_data(X_TRAIN, data_type='x')
    y_train = preprocess_data(Y_TRAIN, data_type='y')
    (x_eval, y_eval), (x_train, y_train) = split(x_train, y_train, N_EVAL)
    x_test = preprocess_data(X_TEST, data_type='x')
    y_test = preprocess_data(Y_TEST, data_type='y')
    shapes = (x_train.shape, x_eval.shape, y_eval.shape, x_test.shape,
              y_test.shape)
    print("""
        Shapes:
        x_train: {}
        x_eval: {}
        y_eval: {}
        x_test: {}
        y_test: {}
    """.format(*shapes))
    return DatasetsManager(x_train, x_eval, y_eval, x_test, y_test), y_train
Пример #22
0
def run_experiment(data_train: pd.DataFrame, data_test: pd.DataFrame,
                   config: Dict) -> pd.DataFrame:
    """Run the classification experiment over increasing train sizes.

    For each sample size N (1000-step grid up to MAX_N) a random train
    sample is drawn, rare classes are filtered, one grid-search iteration
    runs, and per-N metrics are collected.

    Returns:
        DataFrame with one row of results per N.
    """
    data_train = preprocess_data(data_train)
    data_test = preprocess_data(data_test)

    for frame in (data_train, data_test):
        frame[TEXT_COLUMN] = join_text_columns(frame, config["text_columns"])

    # Sizes 1000, 2000, ... plus MAX_N itself if the grid missed it.
    sizes = np.arange(1000, MAX_N, 1000)
    if sizes[-1] != MAX_N:
        sizes = np.append(sizes, MAX_N)

    grid_search = get_grid_search(config)
    threshold = config.get("representation_score_threshold",
                           DEFAULT_REPRESENTATION_THRESHOLD)

    rows = []
    for n in sizes:
        logger.info(f"running for N = {n}")
        started = time.time()

        train_sample = data_train.sample(n)
        train_sample, test_sample = select_popular_classes(
            train_sample, data_test, LABEL_COLUMN,
            config["min_samples_per_class"])

        row = run_experiment_iteration(grid_search, train_sample,
                                       test_sample)
        row["min_samples_per_class"] = config["min_samples_per_class"]
        row["representation_score"] = representation_score(
            train_sample[LABEL_COLUMN], threshold)
        row["representation_score_threshold"] = threshold

        logger.info(f"completed in {int(time.time() - started)} seconds")
        rows.append(row)

    return pd.DataFrame(rows)
Пример #23
0
    def start(self):
        """Run k-fold cross-validation on every configured dataset.

        Binarizes labels per dataset (one-vs-rest) and prints the final
        win tally.
        """
        for ds in self.__datasets:
            print(ds)
            self.current_dataset_name = utils.get_filename(ds)
            X, y = utils.preprocess_data(ds)
            self.classes_names = utils.get_classes_names(y)
            self.num_of_classes = len(self.classes_names)
            y = label_binarize(y, classes=self.classes_names)
            self.k_folds_cross_validation(X, y)

        # '\\' fixes the invalid '\ ' escape sequence (W605); the printed
        # output ("<wins> \ <total>") is unchanged.
        print(f'model won in {self.model_wins} \\ {self.counter}')
Пример #24
0
def sparsify(learn_cfs,
             test_cfs,
             descriptizers,
             max_iter=50,
             startPoints=5,
             stepPoints=1):
    """Greedy GAP sparsification: grow the learning set by highest variance.

    Starting from ``startPoints`` random configurations, repeatedly adds
    the ``stepPoints`` configurations with the largest GAP predictive
    variance, recording database size and mse at each iteration.

    Returns:
        dict: iteration -> {'size_db', 'mse', 'n_joined_cfs',
        'mean_joined_variance'}.
    """
    sink = lambda s: ''  # swallow GAP_predict logging

    l = utils.random_cfs(learn_cfs, startPoints)
    l_desc, l_lables, t_desc, t_lables, descriptors_scaler, labels_scaler = utils.preprocess_data(
        learn_cfs, test_cfs, descriptizers, log=sink)

    iteration = 0
    spars_info = {}

    cur_gap_instance = utils.GAP_predict(l, test_cfs, descriptizers,
                                         log=sink)[1]

    while iteration < max_iter and len(l) <= len(l_desc):

        # Rank all configurations by current GAP variance, descending.
        var = [(cf, cur_gap_instance.compute_variance(desc))
               for cf, desc in zip(learn_cfs, l_desc)]
        var = sorted(var, key=itemgetter(1), reverse=True)

        joined_cfs, joined_vars = zip(*var[0:stepPoints])
        mean_joined_variance = np.mean(joined_vars)

        l += joined_cfs

        s, cur_gap_instance = utils.GAP_predict(l,
                                                test_cfs,
                                                descriptizers,
                                                log=sink)
        mse = s['diff_mse']

        # Parenthesized print is valid under both Python 2 and 3
        # (the original used a Python 2 print statement).
        print('Join %d cfs with mean variance = %e, mse = %e ' % (
            len(joined_cfs), mean_joined_variance, mse))

        # Note:
        #   size_db = old + n_joined
        #   mse = mse(size_db)
        #   mean_joined_variance computed on n_joined cfs
        #
        spars_info[iteration] = {
            'size_db': len(l),
            'mse': mse,
            'n_joined_cfs': stepPoints,
            'mean_joined_variance': mean_joined_variance
        }

        iteration += 1

    return spars_info
Пример #25
0
def main(argv=None):
    """Log the start time, load training data, and launch training."""
    log_path = (log_dir() + "/" + str(FLAGS.graph_id) + "_start_time_" +
                FLAGS.train_worker + ".txt")
    # 'with' guarantees the log file is closed even if loading raises;
    # also avoids shadowing the (Python 2) builtin name `file`.
    with open(log_path, "w") as log_file:
        current_dt = datetime.datetime.now()
        print(str(current_dt))
        log_file.write(str(current_dt))

        print("Loading training data..")
        processed_data = preprocess_data(FLAGS.train_prefix,
                                         FLAGS.train_attr_prefix,
                                         FLAGS.train_worker,
                                         FLAGS.isLabel, FLAGS.isFeatures)
        # Last element is the local graph; the rest is the training data.
        train_data = processed_data[0:-1]
        G_local = processed_data[-1]
        print("Done loading training data..")
        log_file.write(str("Done loading training data.."))
    train(train_data, G_local)
Пример #26
0
def mnist_classification():
    """Train or evaluate a LeNet on MNIST.

    Trains when no 'saved_model' directory exists in the cwd; otherwise
    loads the saved model and reports test accuracy (~98%) plus a
    confusion matrix.
    """
    training_phase = "saved_model" not in os.listdir()
    load_dataset_mnist()
    mndata = MNIST('data_mnist')

    lenet = Lenet(20, 64, tf.train.AdamOptimizer(),
                  tf.losses.softmax_cross_entropy)

    if training_phase:
        images, labels = mndata.load_training()
        images, labels = preprocess_data(images, labels, True)
        lenet.train(images, labels)
    else:
        images_test, labels_test = mndata.load_testing()
        images_test, labels_test = preprocess_data(images_test, labels_test,
                                                   True, True)
        lenet.load_model()
        pred = lenet.predict(images_test)
        print("Accuracy:",
              len(labels_test[pred == labels_test]) / len(labels_test))  # 98%

        # Fixed import: `sklearn.metrics.classification` was a private
        # module removed in scikit-learn 0.24; the public location is
        # sklearn.metrics.
        from sklearn.metrics import confusion_matrix
        print("Confusion matrix: ")
        print(confusion_matrix(labels_test, pred))
Пример #27
0
class Grid:
    """Static model of a radial distribution grid with 5 PV buses.

    All attributes are computed once at class-definition time from the
    MATPOWER-style case returned by ``mpc`` and the .mat load/solar files
    in the working directory.
    """

    nm = 41              # number of branches / non-slack buses
    no_pv = 5            # number of PV buses
    total_iteration = 100
    # load mpc
    pf = 0.8             # power factor passed to mpc()
    alpha = 0.8
    beta = 0.2
    bus, branch = mpc(pf, beta)
    from_to = branch[:, 0:2]
    # Column 11/12 hold the PV buses' reactive limits — TODO confirm
    # against the mpc() case format.
    pv_bus = np.array([bus[1, 11], bus[14, 11], bus[15, 11], bus[17, 11], bus[18, 11]])
    pv_set = np.array([1, 14, 15, 17, 18])
    qg_min, qg_max = np.float32(bus[pv_set, 12]), np.float32(bus[pv_set, 11])

    # Branch resistance/reactance vectors and the signed incidence matrix.
    r = np.zeros((nm, 1))
    x = np.zeros((nm, 1))
    A_tilde = np.zeros((nm, nm+1))

    for i in range(nm):
        A_tilde[i, i+1] = -1
        for k in range(nm):
            if branch[k, 1] == i + 1:
                A_tilde[i, int(from_to[k, 0])] = 1
                r[i] = branch[k, 2]
                x[i] = branch[k, 3]

    # Split off the slack-bus column; invert the reduced incidence matrix.
    a0 = A_tilde[:, 0]
    A = A_tilde[:, 1:]
    A_inv = np.linalg.inv(A)
    R = np.diagflat(r)
    X = np.diagflat(x)
    v0 = np.ones(1)

    # load data
    n_load = sio.loadmat("bus_47_load_data.mat")
    n_solar = sio.loadmat("bus_47_solar_data.mat")
    load_data = n_load['bus47loaddata']
    solar_data = n_solar['bus47solardata']

    # Net active injection p = generation - consumption; the dataset rows
    # stack p over reactive consumption qc, one sample per column -> .T.
    pc, pg, qc = preprocess_data(load_data, solar_data, bus, alpha)
    p = pg - pc
    data_set_temp = np.vstack((p, qc))
    data_set = data_set_temp.T
Пример #28
0
def train_and_test_one_model(obs, lengths, col_index):
    """Leave-one-group-out train/validate for one observation column.

    Thresholds the observations for *col_index*, then (for each group i)
    splits out group i as the test fold, trains a model on the rest,
    saves it, and validates on the held-out fold.
    """
    obs = utils.preprocess_data(obs, THREADHOLD[col_index])
    lamdas_ = init_lamdas(obs, SUGGEST[col_index])
    for i in range(0, len(lengths)):
        start_index = 0
        print("the ", i, "th round.")
        # Calculate the mask: offset of fold i is the sum of the lengths
        # of all preceding folds.
        for j in range(0, i):
            start_index += lengths[j].sum()
        end_index = start_index + lengths[i].sum()
        test_data, train_data = utils.split_data(obs, start_index, end_index)
        test_length, train_length = utils.split_length(lengths, i)
        print(lamdas_)
        model = train_model(train_data, train_length, lamdas_)
        model_name = MODEL_NAME + TYPE[col_index] + ".pkl"
        save_model(model, model_name)
        validate_model(model, test_data, test_length)
        break  # NOTE(review): only the first fold ever runs — confirm intentional
    print("end")
Пример #29
0
def classify_documents():
    """Classify the loaded documents into topics and print each assignment."""
    # Classifier plus the fitted text-feature transformer.
    model, features_model = load_models()

    documents = preprocess_data(load_documents())

    # Text features come from the 'text_4' column of the preprocessed data.
    features = features_model.transform(documents['text_4']).toarray()

    documents['scores'] = model.predict(features)

    print('\nDocument classification:\n')
    for filename, topic in zip(documents['filename'], documents['scores']):
        print(' - Document {} belongs to category {}'.format(filename, topic))
Пример #30
0
def run_training():
    """Train several classifiers on TF-IDF features and evaluate them.

    The split is encoded in the frame index: 'TRAIN'/'VALIDATION' rows
    form the training set, 'TEST' rows the test set.
    """
    df = read_data_as_df(DATA_PATH)

    feature_df = get_feature_df(df)
    tfidf_df = get_tfidf(feature_df)

    X, y = preprocess_data(tfidf_df)

    X_test = X.loc[X.index == 'TEST']
    y_test = y.loc[y.index == 'TEST'].values
    x_train_mask = (X.index == 'TRAIN') | (X.index == 'VALIDATION')
    y_train_mask = (y.index == 'TRAIN') | (y.index == 'VALIDATION')
    X_train = X.loc[x_train_mask]
    y_train = y.loc[y_train_mask].values

    LOG.info(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")
    LOG.info(
        f"Training set positive examples: {y_train.sum()}, Testing set positive examples: {y_test.sum()}"
    )

    clf_d = get_trained_models(["RF", "SGD", "LR", "SVM"], X_train, y_train)
    evaluate_models(clf_d, X_train, X_test, y_train, y_test)
Пример #31
0
def pseudo_random_sparsify(learn_cfs, test_cfs, descriptizers,
                           max_iter=150, prob=0.1, limit=250):
    """Naive sparsification with the Hilbert-Schmidt independence criterion.

    Iteratively shrinks a boolean selection mask over the pool of learning
    configurations: each round draws `max_iter` random sub-masks of the
    current selection and keeps the one maximizing HSCI, until the selection
    holds at most `limit` elements.

    Args:
        learn_cfs: pool of learning configurations (the mask size below is
            hard-coded to 2000 -- assumes the pool has 2000 items; confirm).
        test_cfs: held-out configurations used to score each selection.
        descriptizers: descriptor builders forwarded to the utils helpers.
        max_iter: number of random candidate masks drawn per round.
        prob: per-element keep probability of a candidate mask.
        limit: stop once the selection is no larger than this.

    Returns:
        dict mapping round index -> {'size_db': selection size, 'mse': mse}.
    """

    spars_info = {}
    
    #print descs.shape[0], lbl.shape[0]
    var = float('inf')
    # Start from the full pool; masks are boolean vectors of length 2000.
    selection = np.ones(2000, dtype = bool)
    subselection = np.ones(2000, dtype = bool)
    # Gaussian (RBF) kernel with unit bandwidth.
    kernel = lambda xi, xj : np.exp(-np.dot(xj-xi, xj-xi))
    iterations = 0
    desc, lbl, t_desc, t_lables, descriptors_scaler, labels_scaler = utils.preprocess_data(learn_cfs, test_cfs, descriptizers)

    while(selection.sum() > limit):
        hs = 0.0
        for i in range(max_iter):
            # Random Bernoulli sub-mask restricted to the current selection.
            b = np.logical_and(np.random.binomial(1, prob, 2000), selection)
            var = HSCI(desc[b,:], lbl[b], kernel, kernel)
            print var
            # Track the candidate mask with the highest HSCI score.
            if hs < var :
                hs = var
                subselection = b
        selection = np.logical_and(selection, subselection)
        lcfs = list( learn_cfs[i] for i in selection.nonzero()[0] )
        # Score the shrunken selection on the held-out configurations.
        mse = utils.GAP_predict(lcfs, test_cfs, descriptizers, 
                                log=utils.empty_printer)[0]['diff_mse']
        print 'Iterations %d : Taille de la selection : %d ; HSCI = %e ; mse = %e' % (iterations, selection.sum(), hs, mse)

        spars_info[iterations] = {
            'size_db' : selection.sum(),
            'mse' : mse
        }
        iterations += 1

    return spars_info
Пример #32
0
                    dropout2_p=0.5,
                    hidden3_num_units=200,
                    dropout3_p=0.2,
                    output_num_units=1,
                    output_nonlinearity=None,
                    update=nesterov_momentum,
                    update_learning_rate=0.05,
                    update_momentum=0.9,
                    eval_size=0.1,
                    verbose=1,
                    regression=True,
                    max_epochs=35)
    return net0

# Train a small neural network on (a 1000-row sample of) the hazard dataset
# and report its R2 score.  Python 2 script; DataFrame.ix is the deprecated
# pandas indexer.
train = read_csv('data/1.5/train.csv')
data = train.ix[:, train.columns != 'Hazard'][:1000] # Drop the Hazard (target) column
X = data.ix[:, data.columns != 'Id'][:1000] # Drop the Id column
y = train['Hazard'][:1000]

new_X = preprocess_data(X)

X_train, X_test, y_train, y_test = train_test_split(new_X, y)

# NOTE(review): the 32 presumably sets the input dimension -- confirm against
# NeuralNetConstructor's definition.
net0 = NeuralNetConstructor(32)
net0.train(X_train, y_train)

predicted = net0.predict(X_test)

print r2_score(y_test, predicted)

# R2 > 0
Пример #33
0
def classify_with_network2(
        # alignment files
        group_1, group_2, group_3,
        # which data to use
        strand, motif_start_positions, preprocess, events_per_pos, feature_set, title,
        # training params
        learning_algorithm, train_test_split, iterations, epochs, max_samples, batch_size,
        # model params
        learning_rate, L1_reg, L2_reg, hidden_dim, model_type, model_dir=None, extra_args=None,
        # output params
        out_path="./"):
    """Two-way (group 1 vs. group 2) classification with a neural network.

    For each of `iterations` rounds: collects event vectors for both groups,
    levels class sizes so the model gets equal exposure, preprocesses the
    train/cross-train/test sets, trains with mini-batch SGD (optionally with
    learning-rate annealing), and appends the round's test accuracy to
    ``<out_path><title>.tsv``.  `group_3` is accepted only for signature
    compatibility with the three-way variant and is not used here.

    Returns the network trained in the last iteration.
    """
    # FIX(review): removed a block of dead code that previously followed the
    # final `return net` (it belonged to a different, unrelated script).
    print("2 way classification")
    assert(len(motif_start_positions) >= 2)
    # NOTE(review): 'wa' is not a standard open() mode (Python 2 treats it as
    # 'w'); left unchanged to preserve behavior -- confirm intent.
    out_file = open(out_path + title + ".tsv", 'wa')
    if model_dir is not None:
        print("looking for model in {}".format(model_dir))
        model_file = find_model_path(model_dir, title)
    else:
        model_file = None

    # bin to hold accuracies for each iteration
    scores = []

    collect_data_vectors_args = {
        "events_per_pos": events_per_pos,
        "portion": train_test_split,
        "strand": strand,
        "max_samples": max_samples,
        "feature_set": feature_set,
        "kmer_length": 6
    }

    for i in xrange(iterations):
        list_of_datasets = []  # [((g1, g1l), (xg1, xg1l), (tg1, tg1l)), ... ]
        add_to_list = list_of_datasets.append
        for n, group in enumerate((group_1, group_2)):
            train_set, xtrain_set, test_set = collect_data_vectors2(label=n,
                                                                    files=group,
                                                                    motif_starts=motif_start_positions[n],
                                                                    dataset_title=title + "_group{}".format(n),
                                                                    **collect_data_vectors_args)
            add_to_list((train_set, xtrain_set, test_set))
        # unpack list: list_of_datasets[group][train/xtrain/test][vectors/labels]
        g1_train, g1_tr_labels = list_of_datasets[0][0][0], list_of_datasets[0][0][1]
        g1_xtr, g1_xtr_targets = list_of_datasets[0][1][0], list_of_datasets[0][1][1]
        g1_test, g1_test_targets = list_of_datasets[0][2][0], list_of_datasets[0][2][1]

        g2_train, g2_tr_labels = list_of_datasets[1][0][0], list_of_datasets[1][0][1]
        g2_xtr, g2_xtr_targets = list_of_datasets[1][1][0], list_of_datasets[1][1][1]
        g2_test, g2_test_targets = list_of_datasets[1][2][0], list_of_datasets[1][2][1]

        nb_g1_train, nb_g1_xtr, nb_g1_test = len(g1_train), len(g1_xtr), len(g1_test)
        nb_g2_train, nb_g2_xtr, nb_g2_test = len(g2_train), len(g2_xtr), len(g2_test)
        assert(nb_g1_train > 0 and nb_g2_train > 0), "got {0} group 1 training and " \
                                                     "{1} group 2 training vectors".format(nb_g1_train, nb_g2_train)

        # level training and cross-training events so that the model gets equal exposure
        tr_level = np.min([nb_g1_train, nb_g2_train])
        xtr_level = np.min([nb_g1_xtr, nb_g2_xtr])
        test_level = np.min([nb_g1_test, nb_g2_test])
        print("{motif}: got {g1} group 1 and {g2} group 2 training vectors, leveled to {level}"
              .format(motif=title, g1=nb_g1_train, g2=nb_g2_train, level=tr_level))
        print("{motif}: got {g1} group 1 and {g2} group 2 cross-training vectors, leveled to {level}"
              .format(motif=title, g1=nb_g1_xtr, g2=nb_g2_xtr, level=xtr_level))
        print("{motif}: got {g1} group 1 and {g2} group 2 test vectors, leveled to {level}"
              .format(motif=title, g1=nb_g1_test, g2=nb_g2_test, level=test_level))

        # stack both groups into single leveled arrays/label vectors
        training_data = stack_and_level_datasets2(g1_train, g2_train, tr_level)
        training_labels = append_and_level_labels2(g1_tr_labels, g2_tr_labels, tr_level)

        xtrain_data = stack_and_level_datasets2(g1_xtr, g2_xtr, xtr_level)
        xtrain_targets = append_and_level_labels2(g1_xtr_targets, g2_xtr_targets, xtr_level)

        test_data = stack_and_level_datasets2(g1_test, g2_test, test_level)
        test_targets = append_and_level_labels2(g1_test_targets, g2_test_targets, test_level)

        prc_train, prc_xtrain, prc_test = preprocess_data(training_vectors=training_data,
                                                          xtrain_vectors=xtrain_data,
                                                          test_vectors=test_data,
                                                          preprocess=preprocess)

        # evaluate

        X, y = shuffle_and_maintain_labels(prc_train, training_labels)

        trained_model_dir = "{0}{1}_Models/".format(out_path, title)

        training_routine_args = {
            "motif": title,
            "train_data": X,
            "labels": y,
            "xTrain_data": prc_xtrain,
            "xTrain_targets": xtrain_targets,
            "learning_rate": learning_rate,
            "L1_reg": L1_reg,
            "L2_reg": L2_reg,
            "epochs": epochs,
            "batch_size": batch_size,
            "hidden_dim": hidden_dim,
            "model_type": model_type,
            "model_file": model_file,
            "trained_model_dir": trained_model_dir,
            "extra_args": extra_args
        }

        if learning_algorithm == "annealing":
            net, summary = mini_batch_sgd_with_annealing(**training_routine_args)
        else:
            net, summary = mini_batch_sgd(**training_routine_args)

        # `errors` comes back as per-sample error indicators; convert to accuracy.
        errors, probs = predict(prc_test, test_targets, training_routine_args['batch_size'], net,
                                model_file=summary['best_model'])
        errors = 1 - np.mean(errors)
        print("{0}: {1} test accuracy.".format(title, (errors * 100)))
        out_file.write("{}\n".format(errors))
        scores.append(errors)

        with open("{}test_probs.pkl".format(trained_model_dir), 'w') as probs_file:
            cPickle.dump(probs, probs_file)

    print(">{motif}\t{accuracy}".format(motif=title, accuracy=np.mean(scores), end="\n"), file=out_file)
    return net
Пример #35
0
            labels = ut.load(labelsname)
        else:
            X = np.concatenate((X, ut.load(objname)))
            labels = np.concatenate((labels, ut.load(labelsname)))
    
else:
    print("Loading data...")
    X, labels = ut.load_data('data/train.csv', train=True, selected=sort_idx)
    
    
# Report the raw training-data dimensions.
dims = X.shape
print(dims, 'dims')
    
########################################################################
# Scale the features and encode the labels; the fitted scaler is reused
# below on the test split (y is 2-D: nb_classes is read from y.shape[1]).
print("Preprocessing data")
X, scaler = ut.preprocess_data(X)
print("Preprocessing labels")
y, encoder = ut.preprocess_labels(labels)

# Load the test set with the same column selection and apply the scaler
# that was fitted on the training data.
X_test, ids = ut.load_data('data/test.csv', train=False, selected=sort_idx)
X_test, _ = ut.preprocess_data(X_test, scaler)

nb_classes = y.shape[1]
print(nb_classes, 'classes')

dims = X.shape[1]
print(dims, 'dims')

## check if model exists and resume otherwise rebuild
if os.path.isfile("./tmp/keras-nn"):
    print ("Loading existing neural network...")
Пример #36
0
def sparsifyFOHSIC(learn_cfs, test_cfs, descriptizers, limit=250, max_iter=[2400, 1200, 600, 300, 200, 100, 100, 100, 100, 80, 80, 60, 50, 40, 40]):
    """Forward sparsification with the Hilbert-Schmidt independence criterion.

    Grows a boolean selection mask from empty: each round draws random
    candidate masks that extend the current selection and keeps the one
    maximizing HSCI, until the selection holds at least `limit` elements.
    The per-round number of candidate draws comes from `max_iter`
    (30 once the schedule is exhausted).

    NOTE(review): `max_iter` is a mutable default argument; it is only read
    here, never mutated, so the usual shared-state pitfall does not bite,
    but a tuple would be safer.

    Returns:
        dict mapping round index -> {'size_db': selection size, 'mse': mse}.
    """

    spars_info = {}
    
    #print descs.shape[0], lbl.shape[0]
    var = float('inf')
    #selection = np.random.binomial(1, 0.005, 2000)
    # Start from an empty selection; masks are boolean vectors of length
    # 2000 (assumes the learn_cfs pool holds 2000 items -- confirm).
    selection = np.zeros(2000, dtype = bool)
    subselection = np.zeros(2000, dtype = bool)
    # Gaussian (RBF) kernel with unit bandwidth.
    kernel = lambda xi, xj : np.exp(-np.dot(xj-xi, xj-xi))
    iterations = 0
    desc, lbl, t_desc, t_lables, descriptors_scaler, labels_scaler = utils.preprocess_data(learn_cfs, test_cfs, descriptizers)
    
    """
    K = np.array([[kernel(desc[i,:], desc[j,:]) if i != j else 0. for i in range(2000)] for j in range(2000)])
    L = np.array([[kernel(lbl[i,:], lbl[j,:]) if i != j else 0. for i in range(2000)] for j in range(2000)])
    KL = K*L
    Cub = np.array([[[K[i,k]*L[k,j] for i in range(2000)] for j in range(2000)] for k in range(2000)])
    

    lcfs = list( learn_cfs[i] for i in selection.nonzero()[0] )
    mse = utils.GAP_predict(lcfs, test_cfs, descriptizers, 
                            log=utils.empty_printer)[0]['diff_mse']
    #print 'Iterations %d : Taille de la selection : %d ; HSCI = %e ; mse = %e' % (iterations, selection.sum(), hs, mse)

    spars_info[iterations] = {
        'size_db' : selection.sum(),
        'mse' : mse
    }
    iterations += 1
    """

    while(selection.sum() < limit):
        hs = 0.0
        #for i in range(2000):
        # Number of random candidates this round: taken from the schedule,
        # falling back to 30 once the schedule runs out.
        if iterations >= len(max_iter):
            repet = 30
        else:
            repet = max_iter[iterations]
        for i in range(repet):
            # Candidate = current selection plus a sparse random extension.
            b = np.logical_or(np.random.binomial(1, 0.01, 2000), selection)
            #b[i] = True
            var = HSCI(desc[b,:], lbl[b], kernel, kernel)
            print var
            #m = selection.sum()
            #print 'HSCI 2 :'
            #var2 = (KL[b,:][:,b].sum() + K[b,:][:,b].sum()*L[b,:][:,b].sum()/(m-1)/(m-2) - 2*Cub[b,:,:][:,b,:][:,:,b].sum()/(m-2))/(m-3)/m
            #print var2
            # Keep the candidate mask with the highest HSCI score.
            if hs < var :
                hs = var
                subselection = b
        selection = subselection
        lcfs = list( learn_cfs[i] for i in selection.nonzero()[0] )
        # Score the grown selection on the held-out configurations.
        mse = utils.GAP_predict(lcfs, test_cfs, descriptizers, 
                                log=utils.empty_printer)[0]['diff_mse']
        print 'Iterations %d : Taille de la selection : %d ; HSIC = %e ; mse = %e' % (iterations, selection.sum(), hs, mse)

        spars_info[iterations] = {
            'size_db' : selection.sum(),
            'mse' : mse
        }
        iterations += 1

    return spars_info
Пример #37
0
def classify_with_network3(
        # alignment files
        group_1, group_2, group_3,  # these arguments should be strings that are used as the file suffix
        # which data to use
        strand, motif_start_positions, preprocess, events_per_pos, feature_set, title,
        # training params
        learning_algorithm, train_test_split, iterations, epochs, max_samples, batch_size,
        # model params
        learning_rate, L1_reg, L2_reg, hidden_dim, model_type, model_dir=None, extra_args=None,
        # output params
        out_path="./"):
    """Three-way (C / mC / hmC) classification with a neural network.

    For each of `iterations` rounds: collects event vectors for the three
    groups, levels class sizes so the model gets equal exposure,
    preprocesses the train/cross-train/test sets, trains with mini-batch
    SGD (optionally with learning-rate annealing), and appends the round's
    test accuracy to ``<out_path><title>.tsv``.  Per-round models and test
    probabilities are written under ``<out_path>/<title>_Models/<i>/``.

    Returns the network trained in the last iteration.
    """
    # checks and file IO
    assert(len(motif_start_positions) >= 3)
    # NOTE(review): 'wa' is not a standard open() mode (Python 2 treats it
    # as 'w'); left unchanged to preserve behavior -- confirm intent.
    out_file = open(out_path + title + ".tsv", 'wa')
    if model_dir is not None:
        print("looking for model in {}".format(os.path.abspath(model_dir)))
        model_file = find_model_path(os.path.abspath(model_dir), title)
    else:
        model_file = None
    # bin to hold accuracies for each iteration
    scores = []

    collect_data_vectors_args = {
        "events_per_pos": events_per_pos,
        "portion": train_test_split,
        "strand": strand,
        "max_samples": max_samples,
        "feature_set": feature_set,
        "kmer_length": 6
    }

    for i in xrange(iterations):
        list_of_datasets = []  # [((g1, g1l), (xg1, xg1l), (tg1, tg1l)), ... ]
        add_to_list = list_of_datasets.append
        for n, group in enumerate((group_1, group_2, group_3)):
            train_set, xtrain_set, test_set = collect_data_vectors2(label=n,
                                                                    files=group,
                                                                    motif_starts=motif_start_positions[n],
                                                                    dataset_title=title + "_group{}".format(n),
                                                                    **collect_data_vectors_args)
            add_to_list((train_set, xtrain_set, test_set))

        # unpack to make things easier, list_of_datasets[group][set_idx][vector/labels]
        c_train, c_tr_labels = list_of_datasets[0][0][0], list_of_datasets[0][0][1]
        c_xtr, c_xtr_targets = list_of_datasets[0][1][0], list_of_datasets[0][1][1]
        c_test, c_test_targets = list_of_datasets[0][2][0], list_of_datasets[0][2][1]

        mc_train, mc_tr_labels = list_of_datasets[1][0][0], list_of_datasets[1][0][1]
        mc_xtr, mc_xtr_targets = list_of_datasets[1][1][0], list_of_datasets[1][1][1]
        mc_test, mc_test_targets = list_of_datasets[1][2][0], list_of_datasets[1][2][1]

        hmc_train, hmc_tr_labels = list_of_datasets[2][0][0], list_of_datasets[2][0][1]
        hmc_xtr, hmc_xtr_targets = list_of_datasets[2][1][0], list_of_datasets[2][1][1]
        hmc_test, hmc_test_targets = list_of_datasets[2][2][0], list_of_datasets[2][2][1]

        nb_c_train, nb_c_xtr, nb_c_test = len(c_train), len(c_xtr), len(c_test)
        nb_mc_train, nb_mc_xtr, nb_mc_test = len(mc_train), len(mc_xtr), len(mc_test)
        nb_hmc_train, nb_hmc_xtr, nb_hmc_test = len(hmc_train), len(hmc_xtr), len(hmc_test)

        assert(nb_c_train > 0 and nb_mc_train > 0 and nb_hmc_train > 0), "got zero training vectors"

        # level training events so that the model gets equal exposure
        tr_level = np.min([nb_c_train, nb_mc_train, nb_hmc_train])
        xtr_level = np.min([nb_c_xtr, nb_mc_xtr, nb_hmc_xtr])
        test_level = np.min([nb_c_test, nb_mc_test, nb_hmc_test])

        # log how many vectors we got
        print("{motif}: got {C} C, {mC} mC, and {hmC} hmC, training vectors, leveled to {level}"
              .format(motif=title, C=nb_c_train, mC=nb_mc_train, hmC=nb_hmc_train, level=tr_level), file=sys.stderr)
        print("{motif}: got {xC} C, {xmC} mC, and {xhmC} hmC, cross-training vectors, leveled to {xlevel}"
              .format(motif=title, xC=nb_c_xtr, xmC=nb_mc_xtr, xhmC=nb_hmc_xtr, xlevel=xtr_level), file=sys.stderr)
        print("{motif}: got {xC} C, {xmC} mC, and {xhmC} hmC, test vectors, leveled to {tstLevel}"
              .format(motif=title, xC=len(c_test), xmC=len(mc_test), xhmC=len(hmc_test),
                      tstLevel=test_level), file=sys.stderr)

        # stack the data into one object
        # training data
        training_data = stack_and_level_datasets3(c_train, mc_train, hmc_train, tr_level)
        training_labels = append_and_level_labels3(c_tr_labels, mc_tr_labels, hmc_tr_labels, tr_level)

        # cross training
        xtrain_data = stack_and_level_datasets3(c_xtr, mc_xtr, hmc_xtr, xtr_level)
        xtrain_targets = append_and_level_labels3(c_xtr_targets, mc_xtr_targets, hmc_xtr_targets, xtr_level)

        # test
        test_data = stack_and_level_datasets3(c_test, mc_test, hmc_test, test_level)
        test_targets = append_and_level_labels3(c_test_targets, mc_test_targets, hmc_test_targets, test_level)

        prc_train, prc_xtrain, prc_test = preprocess_data(training_vectors=training_data,
                                                          xtrain_vectors=xtrain_data,
                                                          test_vectors=test_data,
                                                          preprocess=preprocess)

        #if evaluate is True:
        #    all_test_data = np.vstack((xtrain_data, test_data))
        #    all_test_targets = np.append(xtrain_targets, test_targets)
        #    errors, probs = evaluate_network(all_test_data, all_test_targets, model_dir, model_type, batch_size, extra_args)
        #    return

        # shuffle data
        X, y = shuffle_and_maintain_labels(prc_train, training_labels)

        working_directory_path = "{outpath}/{title}_Models/".format(outpath=out_path, title=title)
        if not os.path.exists(working_directory_path):
            os.makedirs(working_directory_path)
        trained_model_dir = "{workingdirpath}{iteration}/".format(workingdirpath=working_directory_path,
                                                                  iteration=i)

        training_routine_args = {
            "motif": title,
            "train_data": X,
            "labels": y,
            "xTrain_data": prc_xtrain,
            "xTrain_targets": xtrain_targets,
            "learning_rate": learning_rate,
            "L1_reg": L1_reg,
            "L2_reg": L2_reg,
            "epochs": epochs,
            "batch_size": batch_size,
            "hidden_dim": hidden_dim,
            "model_type": model_type,
            "model_file": model_file,
            "trained_model_dir": trained_model_dir,
            "extra_args": extra_args
        }

        if learning_algorithm == "annealing":
            net, summary = mini_batch_sgd_with_annealing(**training_routine_args)
        else:
            net, summary = mini_batch_sgd(**training_routine_args)

        # `errors` comes back as per-sample error indicators; convert to accuracy.
        errors, probs = predict(prc_test, test_targets, training_routine_args['batch_size'], net,
                                model_file=summary['best_model'])
        errors = 1 - np.mean(errors)
        probs = zip(probs, test_targets)

        print("{0}:{1}:{2} test accuracy.".format(title, i, (errors * 100)))
        out_file.write("{}\n".format(errors))
        scores.append(errors)

        with open("{}test_probs.pkl".format(trained_model_dir), 'w') as probs_file:
            cPickle.dump(probs, probs_file)

    print(">{motif}\t{accuracy}".format(motif=title, accuracy=np.mean(scores), end="\n"), file=out_file)

    return net
Пример #38
0
from sklearn.metrics import adjusted_rand_score, r2_score, mean_squared_error
from sklearn import svm
from pandas import read_csv
from prettytable import PrettyTable
from utils import preprocess_data


# Load the training data (Python 2 script; DataFrame.ix is the deprecated
# pandas indexer; a 1000-row sample is used throughout).
train = read_csv('data/1.5/train.csv')
data = train.ix[:, train.columns != 'Hazard'][:1000] # Drop the Hazard (target) column
X = data.ix[:, data.columns != 'Id'][:1000] # Drop the Id column
y = train['Hazard'][:1000]

X_train, X_test, y_train, y_test = train_test_split(X, y)

df_train = preprocess_data(X_train)

# Fit a linear support-vector regressor on the preprocessed features.
svr = svm.SVR(kernel='linear')
svr.fit(df_train.values, y_train)

df_test = preprocess_data(X_test)

predicted_values = svr.predict(df_test.values)

# Display predictions in a table.
pt = PrettyTable()
pt.add_column("Predicted hazard", predicted_values)
print pt

# Regression score on the held-out split.
print "R2 Score"
print r2_score(y_test, predicted_values)
Пример #39
0

def _usage(argv):
    """Print usage information for this script and terminate.

    Args:
        argv: the process argument vector; argv[0] is the program name.

    Raises:
        SystemExit: always, with exit status 1.
    """
    print("Usage: python %s <action>" % argv[0])
    print("\tWhere action is one of: %s" % repr(_ACTIONS))
    # Use sys.exit rather than the site-provided exit() helper, which is
    # intended for the interactive interpreter and may be absent (python -S).
    sys.exit(1)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        _usage(sys.argv)
    action = sys.argv[1]
    if action not in _ACTIONS:
        _usage(sys.argv)
    if action == 'preprocess':
        preprocess_data(_DATA_FILE, _ENCODING)
    if action == 'preparetfidf':
        create_tfidf()
    if action == 'preparelsa':
        calculate_lsi()
    if action == 'preparelda':
        calculate_lda()
    if action == 'notes':
        print('Reading notes...')
        with open('data/notes.dat', 'rb') as f:
            data = pickle.loads(f.read())
        while True:
            try:
                index = int(input('Enter note number (ctrl+d to end program): '))
                print(data[index])
                print()