Example #1
def _load_features(dataset, data_path, block_size=128):
    """Load the features and the associated metadata for a dataset.

    The metadata is read from a CSV file and returned as a DataFrame.
    Each DataFrame entry corresponds to an instance in the dataset.

    Args:
        dataset (Dataset): Information about the dataset.
        data_path (str): Path to directory containing feature vectors.
        block_size (int): Size of each block of feature vectors.

    Returns:
        tuple: Tuple containing the array of feature vectors and the
        metadata of the dataset.
    """
    import features
    import utils

    # Load feature vectors from disk
    features_path = os.path.join(data_path, dataset.name + '.h5')
    x, n_blocks = utils.timeit(
        lambda: features.load_features(features_path, block_size, block_size //
                                       4),
        f'Loaded features of {dataset.name} dataset')
    # Reshape feature vectors: NxTxF -> NxTxFx1
    x = np.expand_dims(x, axis=-1)

    # Load metadata and duplicate entries based on number of blocks
    df = pd.read_csv(dataset.metadata_path, index_col=0)
    df = df.loc[np.repeat(df.index, n_blocks)]

    return x, df
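The last two lines of this example are easy to misread: `df.loc[np.repeat(df.index, n_blocks)]` repeats each metadata row once per feature block so the DataFrame lines up row-for-row with `x`. A minimal, self-contained sketch of that index-repeat trick, with made-up file names and block counts:

import numpy as np
import pandas as pd

# Toy metadata: one row per clip, indexed by file name.
df = pd.DataFrame({'label': ['dog', 'siren']}, index=['a.wav', 'b.wav'])

# Pretend load_features() split 'a.wav' into 3 blocks and 'b.wav' into 2.
n_blocks = [3, 2]

# Each index label is repeated by its block count, and .loc re-selects the
# corresponding row once per repetition, giving one metadata entry per block.
df_expanded = df.loc[np.repeat(df.index, n_blocks)]
print(df_expanded)   # 'a.wav' appears three times, 'b.wav' twice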
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('featurefile')
    parser.add_argument('modelfile',
                        nargs='?',
                        default='generated/model.pickle')
    parser.add_argument('outfile', nargs='?')
    args = parser.parse_args()
    if args.outfile is None:
        args.outfile = args.featurefile.replace('.feat', '') + '.prob'

    print_err("Loading saved classifier")
    clf, feat_indices, feat_ind_remaining, affil_median = pickle.load(
        open(args.modelfile, 'rb'))

    ids, X = feat.load_features(args.featurefile)
    X = X[:, feat_ind_remaining]
    # 	affil_ind = feat_indices.index('affil_sharedidf')
    # 	X[np.isnan(X[:, affil_ind]), affil_ind] = affil_median
    X[np.isnan(X)] = 0.

    print_err("Making predictions")
    predictions = clf.predict_proba(X)[:, 1]
    #	predictions = clf.predict(X)
    predictions = list(predictions)

    print_err("Writing predictions")
    writer = csv.writer(open(args.outfile, 'wb'))
    for i, ((id1, id2), prob) in enumerate(zip(ids, predictions)):
        writer.writerow([id1, id2, '{:g}'.format(prob)])
        if (i + 1) % 10000 == 0:
            print_err(i + 1, 'rows done')
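`print_err` is used in this and several later examples but never defined; presumably it is a small helper that writes progress messages to stderr so they do not mix with the predictions written to the output file or stdout. A minimal sketch under that assumption:

from __future__ import print_function
import sys

def print_err(*args, **kwargs):
    # Assumed helper: behaves like print(), but writes to stderr so progress
    # messages stay out of the prediction output.
    print(*args, file=sys.stderr, **kwargs)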
Example #4
def _load_dataset(dataset):
    """Load input data and the associated metadata for a dataset.

    Args:
        dataset: Structure encapsulating dataset information.

    Returns:
        tuple: Tuple containing:

            x (np.ndarray): The input data of the dataset.
            df (pd.DataFrame): The metadata of the dataset.
    """
    import features

    # Load feature vectors and reshape to 4D tensor
    features_path = os.path.join(cfg.extraction_path, dataset.name + '.h5')
    x, n_chunks = utils.timeit(lambda: features.load_features(features_path),
                               'Loaded features of %s dataset' % dataset.name)
    x = np.expand_dims(x, -1)
    assert x.ndim == 4

    # Load metadata and duplicate entries based on number of chunks
    df = io.read_metadata(dataset.metadata_path)

    return x, df
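`utils.timeit` appears in Examples #1, #4 and #7 but is not shown; it evidently runs a zero-argument callable, logs the given message with the elapsed time, and returns the callable's result. A plausible sketch:

import time

def timeit(callback, message):
    # Assumed helper: time the call, report it, and pass the result through.
    start = time.time()
    result = callback()
    print('%s (%.2f s)' % (message, time.time() - start))
    return result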
Example #5
def locate_cup(scene, prefix, already_found=[]):
    print prefix

    # Load saved features
    try:
        filename = "features/{0}.txt".format(prefix)
        features = load_features(filename)
    except IOError:
        return already_found

    # Get scene features, but hide features that have already been found
    scene = mask_image(scene, already_found)
    img2 = scene["img"]
    kp2, des2 = scene["kp"], scene["des"]
    if not len(kp2):
        return already_found

    # Find matches and a decent homography
    matched = find_matches(features, des2)
    found, dst, matches = find_homography(matched, features, kp2)
    if not found:
        return already_found

    # ... unless it's already been found
    for polygon in already_found:
        if (dst == polygon).all():
            return already_found

    return locate_cup(scene, prefix, already_found + [dst])
Example #6
def test(**kwargs):
    opt.parse(kwargs, show_config=True)
    if opt.hdf5:
        from datasets import Train_Dataset_HDF5 as Train_Dataset
        from datasets import Test_Dataset_HDF5 as Test_Dataset
    else:
        from datasets import Train_Dataset_IMAGE as Train_Dataset
        from datasets import Test_Dataset_IMAGE as Test_Dataset

    reiddataset_downloader(opt.data_dir, opt.dataset_name, opt.hdf5)

    num_classes = Train_Dataset(train_val='train',
                                data_dir=opt.data_dir,
                                dataset_name=opt.dataset_name).num_ids

    test_dataloaders = {
        x: DataLoader(Test_Dataset(query_gallery=x,
                                   data_dir=opt.data_dir,
                                   dataset_name=opt.dataset_name),
                      batch_size=opt.batch_size,
                      shuffle=False,
                      num_workers=opt.num_workers)
        for x in ['query', 'gallery']
    }

    model = getattr(models, opt.model)(num_classes)
    model.load(opt.load_epoch_label)
    # Remove the final fc layer and classifier layer
    model.model.fc = nn.Sequential()
    model.classifier = nn.Sequential()
    # Change to test mode
    model = model.eval()
    model = model.cuda()

    if opt.load_features:
        all_features = load_features()
    else:
        all_features = extract_features(model, test_dataloaders, opt.flip)
        save_features(all_features)

    query_feature = all_features['query'][0]
    gallery_feature = all_features['gallery'][0]

    print('-' * 30)
    rank = ranking(query_feature, gallery_feature)

    print('-' * 30)
    query_label = all_features['query'][1]
    query_cam = all_features['query'][2]
    query_name = all_features['query'][3]
    gallery_label = all_features['gallery'][1]
    gallery_cam = all_features['gallery'][2]
    gallery_name = all_features['gallery'][3]

    result, CMC, mAP = evaluate(rank, query_label, query_cam, gallery_label,
                                gallery_cam)
    save_result(result, query_name, gallery_name, CMC, mAP)
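`save_features` and `load_features` are not shown in this example; judging by how `test()` indexes the result, `all_features` maps 'query'/'gallery' to a tuple of (features, labels, cams, names). A minimal pickle-based cache sketch under that assumption (the path is hypothetical):

import pickle

FEATURE_CACHE = 'generated/test_features.pickle'  # hypothetical location

def save_features(all_features, path=FEATURE_CACHE):
    # all_features: {'query': (features, labels, cams, names), 'gallery': (...)}
    with open(path, 'wb') as f:
        pickle.dump(all_features, f)

def load_features(path=FEATURE_CACHE):
    with open(path, 'rb') as f:
        return pickle.load(f)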
Example #7
File: main.py Project: tqbl/gccaps
def _load_data(dataset, is_training=False):
    """Load input data, target values and file names for a dataset.

    The input data is assumed to be a dataset of feature vectors. These
    feature vectors are standardized using a scaler that is either
    loaded from disk (if it exists) or computed on-the-fly. The latter
    is only possible if the input data is training data, which is
    indicated by the `is_training` parameter.

    Target values and file names are read from the metadata file.

    Args:
        dataset: Structure encapsulating dataset information.
        is_training (bool): Whether the input data is training data.

    Returns:
        x (np.ndarray): The input data.
        y (np.ndarray): The target values.
        names (list): The associated file names.
    """
    import data_augmentation as aug
    import features

    features_path = os.path.join(cfg.extraction_path, dataset.name + '.h5')
    x = utils.timeit(lambda: features.load_features(features_path),
                     'Loaded features of %s dataset' % dataset.name)

    # Clip dynamic range to 90 dB
    x = np.maximum(x, x.max() - 90.0)

    # Load scaler from file if cached, or else compute it.
    scaler_path = cfg.scaler_path
    if os.path.exists(scaler_path) or not is_training:
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
    else:
        scaler = utils.timeit(lambda: utils.compute_scaler(x),
                              'Computed standard scaler')
        with open(scaler_path, 'wb') as f:
            pickle.dump(scaler, f)

    x = utils.timeit(lambda: utils.standardize(x, scaler),
                     'Standardized %s features' % dataset.name)

    names, y = utils.timeit(lambda: utils.read_metadata(dataset.metadata_path),
                            'Loaded %s metadata' % dataset.name)
    if dataset == cfg.training_set and cfg.enable_augmentation:
        names, y = aug.expand_metadata((names, y))

    return x, y, names
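`utils.compute_scaler` and `utils.standardize` are referenced here but not shown. A rough sketch of what they plausibly do, assuming `x` is an (N, T, F) array of feature vectors standardized per feature dimension with scikit-learn's StandardScaler:

from sklearn.preprocessing import StandardScaler

def compute_scaler(x):
    # Fit a per-feature scaler over all frames of all examples.
    scaler = StandardScaler()
    scaler.fit(x.reshape(-1, x.shape[-1]))
    return scaler

def standardize(x, scaler):
    # Scale frame-by-frame and restore the original (N, T, F) shape.
    shape = x.shape
    return scaler.transform(x.reshape(-1, shape[-1])).reshape(shape)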
Example #8
def predict_parc():

    # Load the features.
    features, attribution_ids = f.load_features(PARC_FEATURES_PATH)

    # Load the model.
    model = m.load_model('svr')

    # Make predictions.  Convert percentage to decimal.
    predictions = model.predict(features) / 100.
    results = list(zip(attribution_ids, predictions))

    # sort on predicted value.
    results.sort(key=lambda x: x[1])

    predictions_path = os.path.join(DATA_DIR, 'parc-verifiability',
                                    'predictions.tsv')
    open(predictions_path, 'w').write('\n'.join(
        ['%s\t%f' % (attr_id, score) for attr_id, score in results]))
    return results
Example #9
def get_data_ddi(path: str,
                 skip_invalid_smiles: bool = True,
                 args: Namespace = None,
                 features_path: List[str] = None,
                 max_data_size: int = None,
                 use_compound_names: bool = None,
                 logger: Logger = None):
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        features_path = features_path if features_path is not None else args.features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
    else:
        use_compound_names = False

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    skip_smiles = set()

    df = pd.read_csv(path, index_col=False)
    dictionaries = []
    for line_no, line in df.iterrows():
        dictionary = dict(line)
        dictionaries.append(dictionary)
    data = DDIDataset([
        DDIDatapoint(
            dictionary=dictionary,
            args=args,
            features_1=features_data[i] if features_data is not None else None,
            features_2=features_data[i] if features_data is not None else None,
        ) for i, dictionary in tqdm(enumerate(dictionaries), total=len(dictionaries))
    ])


    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        # data = filter_invalid_smiles(data)
        data = filter_invalid_smiles_ddi(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    if data.data[0].features_1 is not None and data.data[0].features_2 is not None:
        features_dim_1 = len(data.data[0].features_1)
        features_dim_2 = len(data.data[0].features_2)
        assert features_dim_1 == features_dim_2
        args.features_dim = features_dim_1

    return data
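The `features_path` handling above (and in Example #17) assumes each file holds a (num_data, num_features) matrix, and the matrices are stacked column-wise so every datapoint gets one concatenated feature vector. A simplified stand-in for `load_features`, assuming plain .npy/.npz files with the array stored under a 'features' key:

import numpy as np

def load_features_simple(path):
    # Simplified stand-in: real projects also handle pickled or sparse formats.
    if path.endswith('.npz'):
        return np.load(path)['features']
    return np.load(path)

# Stacking several feature sources column-wise, as get_data_ddi does:
# features_data = np.concatenate([load_features_simple(p) for p in features_path], axis=1)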
Example #11
File: learn.py Project: gzuidhof/cad
def dice(prediction, y):
    dices = [dice_score_img(p, t) for p, t in zip(prediction, y)]
    mean = np.mean(dices)
    std = np.std(dices)

    return mean, std, dices

def dice_score_img(p, y):
    return np.sum(p[y == 1]) * 2.0 / (np.sum(p) + np.sum(y))


def features_to_images(features, dim=0):
    images = util.chunks(features, 384 * 512)
    for im in images:
        end_image = im[:, dim].reshape((512, 384))
        print np.mean(end_image)

if __name__ == "__main__":
    print "\nLoading X"
    X_train, X_test = features.load_features("balanced")

    print "Loading Y"
    y_train, y_test = features.load_y("balanced")

    #train(X_train, X_test, y_train, y_test,LogisticRegression(), predict_black=True,name="logreg")
    #train(X_train, X_test, y_train, y_test,AdaBoostClassifier(n_estimators=200,random_state=42), predict_black=True,name="adaboost200")
    #train(X_train, X_test, y_train, y_test,RandomForestClassifier(n_estimators=250,n_jobs=-1,random_state=42), use_probability=True, predict_black=True,name="rf200")
    #train(X_train, X_test, y_train, y_test,SVC(verbose=2,max_iter=10000), use_probability=False,name="svmrbf")
    train(X_train,
          X_test,
          y_train,
          y_test,
          SVC(kernel="linear", verbose=2, max_iter=10000),
          use_probability=False,
          name="svmlinear")
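`util.chunks(features, 384 * 512)` splits the flat per-pixel feature matrix back into per-image pieces (each image here is 512x384 pixels). A minimal sketch, assuming it simply yields consecutive slices of the given length:

def chunks(array, chunk_size):
    # Assumed helper: yield consecutive chunk_size-row slices, one per image.
    for start in range(0, len(array), chunk_size):
        yield array[start:start + chunk_size]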
Example #12
File: rebalance.py Project: gzuidhof/cad
    std = np.std(X, axis=0)

    for d in tqdm(range(len(X_train))):
        X_train[d] = (X_train[d] - mean) / std
    for d in tqdm(range(len(X_test))):
        X_test[d] = (X_test[d] - mean) / std

    #Make once fully black vectors fully black again
    X_test = X_test * mask

    return X_train, X_test


if __name__ == "__main__":
    print "\nLoading X"
    X_train, X_test = features.load_features()
    print "Loading Y"
    y_train, y_test = features.load_y()

    print "Removing fully black features"
    X_train, y_train = remove_completely_black(X_train, y_train)

    print "Normalizing features"
    X_train, X_test = normalize_features(X_train, X_test)

    print "Balancing classes"
    X_train, y_train = balance_classes(X_train, y_train)

    print "Writing to file"
    features.write_features((X_train, X_test), "balanced")
    features.write_y((y_train, y_test), "balanced")
Example #13
from classifier import *
from features import load_features, save_prediction

""" Parameters """
classifier = 'LSTM'
method = 'we'
database = 'twitter'
language = 'en'
query = 'Trump'
extended = True

""" Load the data """
# x_train, x_test, y_train, y_test = load_features(database, language, method)
x_test = load_features(database, language, method, query, extended)
# x_test = np.concatenate((x_train, x_test), axis=0)

""" Load a classifier """
# load_file = 'data/model/untrained_' + classifier
load_file = 'data/model/best_trained_' + classifier + '_' + method + '_' + language
model = load_classifier(classifier, load_file)

'''
""" Train the classifier """
epochs = 5
batch_size = 16
validation_data = (x_test, y_test)
save_file = 'data/model/trained_' + classifier + '_' + method + '_' + language
model = train_classifier(classifier, model, x_train, y_train, epochs, batch_size, validation_data,
                         save_file=save_file+'.h5')
model = load_classifier(classifier, save_file)
'''
Example #14
# Parameters
classifier = 'LSTM'
method = 'we'
language = 'fr'
epochs = 5
batch_size = 32
duration = 3600 * 6

# Initialization
t0 = t.time()
best_model, best_mse = None, 0
save_fname = 'data/model/test_3_random_trained_' + classifier + '_' + method + '_' + language

# Load the data
x_train, x_test, y_train, y_test = load_features(language, method)
validation_data = (x_test, y_test)

# Loop for the specified duration
cmpt = 1
while t.time() - t0 < duration:

    print("Trial number {}:".format(cmpt))

    # Create a random classifier
    new_model, new_info = create_random_classifier(classifier)

    # Train the classifier
    print("Training...")
    new_model, history = train_classifier(classifier,
                                          new_model,
Example #15
def main():
    print("Connecting to services...")
    couchdb = CouchDB(user=os.environ["COUCHDB_USERNAME"],
                      auth_token=os.environ["COUCHDB_PASSWORD"],
                      url="http://%s:5984/" % os.environ["COUCHDB_HOST"],
                      connect=False,
                      auto_renew=True)
    redis = Redis(os.environ["REDIS_HOST"], 6379, 0)

    # Load features
    print("Loading features...")
    start_time = time()
    load_features(os.environ["MAP_PATH"], couchdb)
    print("Loading Time: %.2fs" % (time() - start_time))

    # Load classifier
    print("Loading classifier...")
    start_time = time()
    model, tokenizer = load_model(os.environ["MODEL_PATH"])
    print("Loading Time: %.2fs" % (time() - start_time))

    while True:
        try:
            # Select a location
            backfill = False
            couchdb.connect()

            # Use most out of date location
            result = couchdb["features"].get_query_result(selector={
                "newest": {
                    "$or": [{
                        "$exists": False
                    }, {
                        "$lt":
                        int((datetime.utcnow() -
                             timedelta(days=1)).timestamp())
                    }]
                },
                "status": {
                    "$or": [{
                        "$exists": False
                    }, {
                        "$ne": "in_use"
                    }]
                }
            },
                                                          sort=[{
                                                              "newest": "asc"
                                                          }],
                                                          limit=1).all()
            if len(result) == 0:
                # Backfill historical data if all are in date
                backfill = True
                result = couchdb["features"].get_query_result(selector={
                    "oldest": {
                        "$or": [{
                            "$exists": False
                        }, None]
                    },
                    "status": {
                        "$or": [{
                            "$exists": False
                        }, {
                            "$ne": "in_use"
                        }]
                    }
                },
                                                              limit=1).all()
                if len(result) == 0:
                    result = couchdb["features"].get_query_result(
                        selector={
                            "oldest": {
                                "$gt": datetime(2006, 4, 1).timestamp()
                            },
                            "status": {
                                "$or": [{
                                    "$exists": False
                                }, {
                                    "$ne": "in_use"
                                }]
                            }
                        },
                        sort=[{
                            "oldest": "desc"
                        }],
                        limit=1).all()

            if len(result) == 0:
                print("No jobs...")
                couchdb.disconnect()
                sleep(3600)
                continue

            # Mark location as "in use"
            doc = couchdb["features"][result[0]["_id"]]
            doc["status"] = "in_use"
            doc.save()

            try:
                # Process tweets at that location
                feature = result[0]
                print("Calling %sfor feature: %s..." %
                      ("backfill " if backfill else "", feature["_id"]))
                call_for_feature(feature, model, tokenizer, couchdb, redis,
                                 backfill)
                print()
            finally:
                # Mark location as "available"
                couchdb.connect()
                doc = couchdb["features"][result[0]["_id"]]
                doc["status"] = "available"
                doc.save()

        except Exception as e:
            print(e)
            sleep(random() * 0.3 + 0.1)
        finally:
            couchdb.disconnect()
Example #16
def train_model(
    features_files,
    feature_columns,
    classifier,
    model_args,
    outlier_sigma=None,
    scale_features=True,
    submission_file=None,
    save_settings=False,
    plot=False,
    normalize_probs=None,
    n_cv=10,
    f_cv=0.3,
    verbose=False,
):
    """
    Fit a classification model (classifier, using arguments in model_args)
    to the features in columns feature_columns in the file(s) in
    features_files. Use CV with n_cv random training-CV sample splittings,
    each containing a fraction f_cv in the CV subsample, to estimate AUC
    for the fit.
    """
    settings = locals()
    hour_column = 0
    type_column = 1

    # read in feature matrix from file(s)
    X = features.load_features(features_files)
    # remove outliers
    if outlier_sigma is not None:
        X, retained_indices = features.remove_outliers(X, n_sigma=outlier_sigma)
    # scale features
    if scale_features:
        X = features.scale_features(X)

    # set up model
    model = classifier(**model_args)

    # set up plot
    if plot:
        fig = plt.figure(figsize=(8, 4))
        fig.set_tight_layout(True)
        ax0 = plt.subplot(121)
        ax1 = plt.subplot(122)
        # initialize plot arrays
        n_learn = np.zeros(10)
        learn_cv_avg = np.zeros(len(n_learn))
        learn_train_avg = np.zeros(len(n_learn))
        fp_rate_avg = np.linspace(0, 1, num=100)
        tp_rate_avg = np.zeros(len(fp_rate_avg))

    # loop over training-CV sample splittings
    auc_values = []
    for i_cv in range(n_cv):
        cv_indices = cv.cv_split_by_hour(X, n_pre_hrs=f_cv)
        if verbose:
            print "\nCV iteration", i_cv + 1
            print len(cv_indices["train"]), "training instances"
            print len(cv_indices["cv"]), "CV instances"
        # get feature matrices and class arrays for training and CV samples
        train_features_all, cv_features_all = [X[cv_indices[k], :] for k in ["train", "cv"]]
        train_features, cv_features = [y[:, np.array(feature_columns)] for y in [train_features_all, cv_features_all]]
        train_class = train_features_all[:, type_column]
        cv_class = cv_features_all[:, type_column]

        # compute learning curve
        if plot:
            learn_mask, n_train, learn_train, learn_cv = learning_curve(
                model,
                (train_features, train_class),
                (cv_features, cv_class),
                n=len(n_learn),
                normalize_probs=normalize_probs,
            )
            if len(learn_mask) > 0:
                n_learn[learn_mask] += 1
                learn_train_avg[learn_mask] += learn_train
                learn_cv_avg[learn_mask] += learn_cv
                ax0.plot(n_train, learn_train, linestyle="-", color=(1, 0.6, 0.6))
                ax0.plot(n_train, learn_cv, linestyle="-", color=(0.7, 0.7, 0.7))

        # predict probabilities
        train_prob, cv_prob = predict_probs(model, train_class, train_features, cv_features, normalize_probs)
        check_for_nan(train_prob)
        check_for_nan(cv_prob)
        if verbose:
            try:
                model_coef = model.coef_
                print "Feature coefficients:", model_coef
            except:
                pass

        # compute AUC
        auc = roc_auc_score(cv_class, cv_prob)
        auc_values.append(auc)
        if verbose:
            print "training AUC =", roc_auc_score(train_class, train_prob)
            print "CV AUC =", auc

        # plot ROC curve
        if plot:
            fp_rate, tp_rate, thresholds = roc_curve(cv_class, cv_prob)
            tp_rate_avg += np.interp(fp_rate_avg, fp_rate, tp_rate)
            ax1.plot(fp_rate, tp_rate, linestyle="-", color=(0.7, 0.7, 0.7))

    # compute mean and std. dev. of AUC over CV iterations
    auc_mean = np.mean(auc_values)
    auc_std = np.std(auc_values)
    if verbose:
        print "\nAverage AUC:", auc_mean, "+/-", auc_std

    # update submission CSV file
    if submission_file is not None:
        train_features_all = X[(X[:, type_column] == 0) | (X[:, type_column] == 1), :]
        train_features = train_features_all[:, np.array(feature_columns)]
        train_class = train_features_all[:, type_column]
        test_features_all = X[X[:, type_column] == -1, :]
        test_features = test_features_all[:, np.array(feature_columns)]
        train_prob, test_prob = predict_probs(model, train_class, train_features, test_features, normalize_probs)
        check_for_nan(train_prob, message="Replacing NaN probabilities with 0.")
        check_for_nan(test_prob, message="Replacing NaN probabilities with 0.")
        for i, ff in enumerate(features_files):
            data_list_file = ".".join(ff.split(".")[:-1]) + "_data_files.txt"
            with open(data_list_file, "r") as df:
                if i == 0:
                    data_files = np.array(df.readlines())
                else:
                    data_files = np.concatenate((data_files, df.readlines()), axis=0)
        if outlier_sigma is not None:
            data_files = data_files[retained_indices]
        test_files = []
        for f in data_files:
            if "test" in f:
                test_files.append(f.strip())
        submission.update_submission(dict(zip(test_files, test_prob)), submission_file)

    # save settings
    if save_settings:
        if submission_file is not None:
            settings_file = ".".join(submission_file.split(".")[:-1]) + "_settings.txt"
            open_mode = "a"
        else:
            settings_file = "train_model_settings.txt"
            open_mode = "w"
        with open(settings_file, open_mode) as sf:
            for s in [
                "features_files",
                "feature_columns",
                "classifier",
                "model_args",
                "outlier_sigma",
                "scale_features",
                "submission_file",
                "normalize_probs",
            ]:
                if s in settings:
                    sf.write(s + ": " + str(settings[s]) + "\n")
            sf.write("AUC = {0:.2f}+/-{1:.2f}\n\n".format(auc_mean, auc_std))

    # plot average learning curves and ROC curve
    if plot:
        n_train_array = len(cv_indices["train"]) / float(len(n_learn)) * np.array(range(1, len(n_learn) + 1))
        ax0.plot(n_train_array, learn_train_avg / (n_learn + 1.0e-3), "r-", linewidth=3)
        ax0.plot(n_train_array, learn_cv_avg / (n_learn + 1.0e-3), "k-", linewidth=3)
        tp_rate_avg /= float(n_cv)
        ax1.plot(fp_rate_avg, tp_rate_avg, "k-", linewidth=3)
        # display plot
        ax0.set_ylim((0.5, 1))
        ax0.set_xlabel("number of training instances")
        ax0.set_ylabel("AUC")
        ax1.plot(np.linspace(0, 1), np.linspace(0, 1), "k:", linewidth=2)
        ax1.set_xlabel("false positive rate")
        ax1.set_ylabel("true positive rate")
        plt.show(block=False)

    return (model, auc_mean, auc_std)
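`predict_probs`, `check_for_nan`, `learning_curve` and the `cv` helpers are defined elsewhere in that project. A rough sketch of what `predict_probs` plausibly does (fit on the training split, return class-1 probabilities for both splits), with a made-up 'minmax' mode to illustrate the `normalize_probs` hook:

def predict_probs(model, train_class, train_features, test_features,
                  normalize_probs=None):
    # Assumed helper: fit the classifier and return probabilities of class 1
    # for the training features and for the held-out (CV or test) features.
    model.fit(train_features, train_class)
    train_prob = model.predict_proba(train_features)[:, 1]
    test_prob = model.predict_proba(test_features)[:, 1]
    if normalize_probs == 'minmax':  # hypothetical normalization mode
        for p in (train_prob, test_prob):
            p -= p.min()
            p /= max(p.max(), 1e-12)
    return train_prob, test_prob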
Example #17
def get_data(path: str,
             skip_invalid_smiles: bool = True,
             args: Namespace = None,
             features_path: List[str] = None,
             max_data_size: int = None,
             use_compound_names: bool = None,
             logger: Logger = None) -> MoleculeDataset:
    """
    Gets smiles string and target values (and optionally compound names if provided) from a CSV file.

    :param path: Path to a CSV file.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments.
    :param features_path: A list of paths to files containing features. If provided, it is used
    in place of args.features_path.
    :param max_data_size: The maximum number of data points to load.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
    :param logger: Logger.
    :return: A MoleculeDataset containing smiles strings and target values along
    with other info such as additional features and compound names when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        features_path = features_path if features_path is not None else args.features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
    else:
        use_compound_names = False

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    skip_smiles = set()

    # Load data
    with open(path) as f:
        reader = csv.reader(f)
        next(reader)  # skip header

        lines = []
        for line in reader:
            smiles = line[0]

            if smiles in skip_smiles:
                continue

            lines.append(line)

            if len(lines) >= max_data_size:
                break

        data = MoleculeDataset([
            MoleculeDatapoint(
                line=line,
                args=args,
                features=features_data[i] if features_data is not None else None,
                use_compound_names=use_compound_names
            ) for i, line in tqdm(enumerate(lines), total=len(lines))
        ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    if data.data[0].features is not None:
        args.features_dim = len(data.data[0].features)

    return data
Example #18
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('trainfile', nargs='?', default='data/train.csv')
	parser.add_argument('featfile', nargs='?', default='generated/train.feat')
	parser.add_argument('outfile', nargs='?', default='generated/model.pickle')
	parser.add_argument('--clf', default='rf')
	parser.add_argument('--removefeat', nargs='+', default=[])
	parser.add_argument('--cv', action='store_true')
	parser.add_argument('--folds', default=3)
	parser.add_argument('--gridsearch', action='store_true')
	parser.add_argument('--usegrid', action='store_true')
	args = parser.parse_args()

	if args.removefeat:
		feat_to_remove = args.removefeat
	else:
		feat_to_remove = [
# 			'conferences',
# 			'journals',
# 			'affiliations',
# 			'jaro_distance'
		]
		
	n_jobs = min(multiprocessing.cpu_count(), 8)
	
	params = {
		# Random Forest
		'rf': {
			'max_features': 3,
			'n_estimators': 300,
			'min_samples_split': 1,
			'min_samples_leaf': 1,
		},
		# GBM
		'gbm': {
			'n_estimators': 20000,
			'learning_rate': 1e-03,
			'max_depth': 3,
		}
	}

	params_grid = {
		'rf': {
			'min_samples_split': [1, 2],
			'min_samples_leaf': [1, 2],
			'n_estimators': [130, 200, 250, 300, 500, 750, 1000, 1250], # [130, 400, 1000]
			'max_features': [3, 4, 5, 6, 7, 8, 9] # [4, 6, 9]
		},
		'gbm': {
	# 		'n_estimators': [500, 200],
	# 		'learning_rate': [1e-04],
	# 		'max_depth': [7]
			'n_estimators': [15000, 20000] + [17500],
			'learning_rate': [1e-04, 1e-03, 1e-02] + [5e-03],
			'max_depth': [7, 16] + [3, 5, 6, 8, 12, 14, 18]
		}
	}

	params_fixed = {
		'rf': {
			'random_state': 100,
			'n_jobs': n_jobs, # -1 = no. of cores on machine
			'oob_score': True,
			'verbose': 0,
			'compute_importances': True
		},
		'gbm': {
			'min_samples_split': 1,
			'min_samples_leaf': 2,
			'subsample': 0.5,
			'verbose': 0
		}
	}

	for k, v in params_fixed.iteritems():
		params[k].update(v)

	if args.usegrid or args.gridsearch:
		print params_grid[args.clf]
	else:
		print params[args.clf]
	
	X_ids, X = feat.load_features(args.featfile)
	idmap = {id: i for i, id in enumerate(X_ids)}
	feat_indices = feat.FeaturesGenerator.fields
	feat_ind_remaining = [i for i, faid in enumerate(feat_indices) if faid not in feat_to_remove]
	feat_indices = [v for v in feat_indices if v not in feat_to_remove]
	X = X[:, feat_ind_remaining]

	print feat_indices

	print_err("Loading training dataset labels")
	Y, Y_ids = loadTrainingLabels(args.trainfile, set(X_ids))
	training_indices = [idmap[id] for id in Y_ids]
	X = X[training_indices]

	# Filling in missing values
# 	affil_ind = feat_indices.index('affil_sharedidf')
#  	affil_median = sp.stats.nanmedian(X[:, affil_ind])
	affil_median = 0
# 	X[np.isnan(X[:, affil_ind]), affil_ind] = affil_median
# 	X[np.isnan(X[:, affil_ind]), affil_ind] = 0.
	X[np.isnan(X)] = 4.

	if args.clf == 'rf':
		clf = RandomForestClassifier()
	elif args.clf == 'gbm':
		clf = GradientBoostingClassifier()
	clf.set_params(**params[args.clf])
	
	if args.usegrid or args.gridsearch:
		print_err("Running grid search for best parameters")
		kwargs = {
			'n_jobs': n_jobs
		}
		if args.clf == 'rf':
			clf.set_params(n_jobs=1)
		elif args.clf == 'gbm':
			kwargs['loss_func'] = zero_one_loss
		clf_grid = grid(clf, params_grid[args.clf], X, Y, folds=args.folds, **kwargs)

		pprint(clf_grid.grid_scores_)
		print(clf_grid.best_score_)
		print(clf_grid.best_params_)
		if args.usegrid:
			clf = clf_grid.best_estimator_
	elif args.cv:
		print_err("Running cross-validation")
		m_cv(clf, X, Y, args.folds)

	if not args.cv and (not args.gridsearch or args.usegrid):
		print_err("Fitting data for training")
		clf.fit(X, Y)
		# for GBM
		if hasattr(clf, 'train_score_'):
	  		print_err("Train Score:", clf.train_score_[-1])
		print_err("OOB Score (CV-estimate):", clf.oob_score_)

		print_err("Saving trained model")
		pickle.dump((clf, feat_indices, feat_ind_remaining, affil_median), open(args.outfile, 'wb'), pickle.HIGHEST_PROTOCOL)
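`grid`, `m_cv` and `loadTrainingLabels` are project helpers that are not shown. `grid` is presumably a thin wrapper around scikit-learn's grid search (the era of this code, with `compute_importances` and `grid_scores_`, predates `model_selection`); a sketch under that assumption:

from sklearn.grid_search import GridSearchCV  # sklearn.model_selection.GridSearchCV in newer releases

def grid(clf, param_grid, X, Y, folds=3, **kwargs):
    # Assumed helper: exhaustive grid search with k-fold CV; returns the fitted
    # GridSearchCV object so the caller can read grid_scores_, best_score_,
    # best_params_ and best_estimator_ as main() does above.
    gs = GridSearchCV(clf, param_grid, cv=folds, **kwargs)
    gs.fit(X, Y)
    return gs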
Example #20
def classify():
    print('-' * 30)
    print('TRAINING TYPE: {0}'.format(TRAINING_TYPE))
    print('-' * 30)
    # Load data and masks
    features = load_features()
    features_info = load_features_info()
    masks = load_masks()
    num_of_images = len(features_info)
    num_of_training_pixels = 0
    num_of_validation_pixels = 0

    assert (num_of_images > NUM_OF_VALIDATION_IMAGES)
    for i in range(num_of_images - NUM_OF_VALIDATION_IMAGES):
        num_of_training_pixels += features_info[i]['num_of_pixels']
    for i in range(num_of_images - NUM_OF_VALIDATION_IMAGES, num_of_images):
        num_of_validation_pixels += features_info[i]['num_of_pixels']
    print('Training data: {0} \nValidation data: {1} '.format(
        num_of_training_pixels, num_of_validation_pixels))

    # Standardize data
    x_train = features[:num_of_training_pixels]
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_train = preprocessing.normalize(x_train)
    x_validation = scaler.transform(features[num_of_training_pixels:])
    x_validation = preprocessing.normalize(x_validation)
    y_train = masks[:num_of_training_pixels]

    clf = choose_training_type()
    if TRAINING_TYPE != 'FROM_SAVED_CLASSIFIER':
        print('-' * 30)
        print('Training started...')
        start_time = time.time()
        clf.fit(x_train, y_train)
        print('-' * 30)
        print('Training ended: {:.2f} s'.format(time.time() - start_time))
        print('-' * 30)
        if TRAINING_TYPE == 'GRID_SEARCH':
            print("GRID SEARCH RESULTS\n")
            print('Best parameters: {}\n'.format(clf.best_params_))
            means = clf.cv_results_['mean_test_score']
            for mean, params in zip(means, clf.cv_results_['params']):
                print('Mean score: {:0.3f} Parameters: {}'.format(
                    mean, params))
            print('-' * 30)

            scores = clf.cv_results_['mean_test_score'].reshape(
                len(GRID_SEARCH_PARAMETERS['base_estimator__C']),
                len(GRID_SEARCH_PARAMETERS['base_estimator__gamma']))

            mpl_style(dark=True)
            # plt.figure(figsize=(10, 10))
            for ind, i in enumerate(
                    GRID_SEARCH_PARAMETERS['base_estimator__C']):
                plt.plot(GRID_SEARCH_PARAMETERS['base_estimator__gamma'],
                         scores[ind],
                         label='C parameter: ' + str(i))
            plt.title('GRID SEARCH RESULTS')
            plt.xlabel('Gamma parameter')
            plt.ylabel('Mean score')
            plt.grid('on')
            plt.legend()
            plt.savefig('grid_search_results_figure.png',
                        bbox_inches='tight',
                        dpi=200)

        print('Saving model...')
        start_time = time.time()
        dump(clf, os.path.join('saved_models/', 'SVM_classifier.joblib'))
        print('Saving  ended: {:.2f} s'.format(time.time() - start_time))
        print('-' * 30)

    print('Predicting started...')
    print('-' * 30)
    start_time = time.time()
    predicted_masks = clf.predict(x_validation)
    print('Predicting ended: {:.2f} s'.format(time.time() - start_time))
    print('-' * 30)
    previous_mask_pixels = 0
    current_num_of_pixels = 0
    masks_predicted = 0
    # Saving predicted and truth masks as pairs
    # There is a need for converting predicted vector to 2D masks
    for i in range(num_of_images - NUM_OF_VALIDATION_IMAGES, num_of_images):
        current_num_of_pixels = features_info[i]['num_of_pixels']
        predicted_mask = np.asarray(
            predicted_masks)[previous_mask_pixels:previous_mask_pixels +
                             current_num_of_pixels]
        predicted_mask = predicted_mask.reshape(features_info[i]['height'],
                                                features_info[i]['width'])
        predicted_mask = ndimage.binary_opening(predicted_mask)
        predicted_mask = ndimage.binary_closing(predicted_mask)
        masks_predicted += 1
        plt.imsave(os.path.join(
            'predictions/', 'predicted_mask_' + str(masks_predicted) + '.png'),
                   predicted_mask,
                   cmap='gray')

        offset = num_of_training_pixels + previous_mask_pixels
        truth_mask = masks[offset:offset + current_num_of_pixels].reshape(
            features_info[i]['height'], features_info[i]['width'])
        plt.imsave(os.path.join('truth/',
                                'truth_mask_' + str(masks_predicted) + '.png'),
                   truth_mask,
                   cmap='gray')
        previous_mask_pixels += current_num_of_pixels  # accumulate so the next slice starts after this mask
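`choose_training_type`, `load_features_info`, `load_masks` and the constants (`TRAINING_TYPE`, `GRID_SEARCH_PARAMETERS`, `NUM_OF_VALIDATION_IMAGES`) live elsewhere in that project. Given the `base_estimator__C`/`base_estimator__gamma` grid keys, the classifier is probably a bagged SVC; a sketch of the dispatch, with all names and settings being assumptions:

from joblib import load
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def choose_training_type():
    # Hypothetical dispatch consistent with how classify() uses the result.
    if TRAINING_TYPE == 'FROM_SAVED_CLASSIFIER':
        return load('saved_models/SVM_classifier.joblib')
    # 'base_estimator__*' grid keys imply a bagged SVC
    # (the parameter is renamed 'estimator' in scikit-learn >= 1.2).
    base = BaggingClassifier(base_estimator=SVC(), n_jobs=-1)
    if TRAINING_TYPE == 'GRID_SEARCH':
        return GridSearchCV(base, GRID_SEARCH_PARAMETERS, cv=3)
    return base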