Example No. 1
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    features.compute_features(train_file,feature_file1)

    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"][:140323])

    print("Reading test file and making predictions")
    features.compute_features(test_file,feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = rf.predict_proba(test_fea)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)

    cu.write_submission(submission_file, probs)
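The prior-update step relies on the competition helper cu.cap_and_update_priors. The sketch below is only an illustration of what such a capped prior reweighting could look like (an assumption, not the helper's actual implementation): clip each probability away from 0 and 1, rescale every class column by new_prior / old_prior, and renormalize the rows.

import numpy as np

def cap_and_update_priors_sketch(old_priors, probs, new_priors, cap=0.001):
    # Illustrative sketch only, not the cu helper's source.
    probs = np.clip(np.asarray(probs, dtype=float), cap, 1.0 - cap)
    ratio = np.asarray(new_priors, dtype=float) / np.asarray(old_priors, dtype=float)
    updated = probs * ratio                               # reweight each class column
    return updated / updated.sum(axis=1, keepdims=True)   # rows sum to 1 again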
def classifier_predict(listname, modelname, outdir=None, n_jobs=None):
    if outdir is None:
        outdir = tempfile.mkdtemp(dir=os.curdir, prefix='out')
    else:
        if not os.path.exists(outdir):
            tsh.makedirs(outdir)
    inputname = os.path.splitext(os.path.basename(listname))[0]
    if listname.endswith('.gz'):
        inputname = os.path.splitext(inputname)[0]
    meta, data = read_listfile(listname)
    classifier = read_classifierfile(modelname)
    feature_method = classifier['features']['meta']['feature_method']
    feature_args = meta.copy()
    # Training input_name would shadow the current one.
    del classifier['features']['meta']['input_name']
    featurename = os.path.join(outdir, inputname + '-feats.csv.gz')
    if os.path.exists(featurename):
        _, features = read_listfile(featurename)
    else:
        feature_args.update(classifier['features']['meta'])
        args, features = compute_features(feature_method, feature_args, data,
                input_name=inputname, n_jobs=n_jobs, output_dir=outdir)
        assert (data['id'] == features['id']).all()
        clean_args(args)
        write_listfile(featurename, features, input_name=inputname, **args)
    labels_name = classifier['meta']['truth'] + '_labels'
    labels = classifier['meta'][labels_name]
    pred = predict(classifier['classifier'], sorted(labels.keys()), features,
            output_dir=outdir)
    write_listfile(os.path.join(outdir, inputname + '-predictions.csv.gz'), pred,
            classifier_name=modelname, truth=classifier['meta']['truth'],
            labels_name=labels, input_name=inputname)
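For context, a call to this helper might look like the line below; the file names are hypothetical placeholders, and the list and model formats are whatever read_listfile and read_classifierfile expect in the surrounding project.

# Hypothetical usage with placeholder file names.
classifier_predict('samples.csv.gz', 'model.dat', outdir='predictions', n_jobs=4)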
Example No. 3
def extract_features(filename):
    return joblib.load('scalers/scaler.pkl').transform(
        compute_features(filename).values.reshape(1, -1))
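Example No. 3 assumes a scaler was fitted and serialized to scalers/scaler.pkl beforehand. A minimal sketch of producing that file, assuming compute_features returns one pandas Series of numeric features per input file, could be:

import os
import joblib
import numpy as np
from sklearn.preprocessing import StandardScaler

def fit_and_save_scaler(filenames, out_path='scalers/scaler.pkl'):
    # Stack one feature vector per file, fit the scaler, and persist it.
    X = np.vstack([compute_features(f).values for f in filenames])
    scaler = StandardScaler().fit(X)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    joblib.dump(scaler, out_path)
    return scaler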
def train_classifier( train_file='train/train.csv', recompute_feats=False ): 
	'''
	Reads Stack Overflow data from a .csv file,
	generates features, and trains a classifier.
	'''
	
	# custom variables
	DATA_DIR = "../data/"
	SUBMISSION_DIR = "../data/submission/"
	# train_file = 'train/train-sample.csv'
	label_file = 'train/train-labels.csv'
	feature_file = 'train/train-feats.csv'
			
	# display progress logs on stdout
	logging.basicConfig( level=logging.INFO,
						 format='%(asctime)s %(levelname)s %(message)s' )
	log = logging.getLogger(__name__)
	
	if recompute_feats:
		features.compute_features( train_file, feature_file, label_file )
	
	log.info( 'π: load features from file' )
	X = pd.io.parsers.read_csv( os.path.join( DATA_DIR, feature_file ), header=None )
	X = X.as_matrix()

	log.info( "π: encode labels" )
	labels = pd.io.parsers.read_csv( os.path.join( DATA_DIR, label_file ), header=None )['X0']
	lbl_map = { 'not a real question': 0, 'not constructive': 1, 'off topic': 2,
				'open': 3, 'too localized': 4 } # cf. required submission format
	labels = labels.map( lbl_map )
	y = labels.values
	
	log.info( 'π: select features' )
	fselect = SelectPercentile( score_func=chi2, percentile=42 ) # !?
	# X = fselect.fit_transform( X, y )
	
	log.info( 'π: define classifiers' )
	priors = cu.get_priors( os.path.join( DATA_DIR, 'train/train.csv' ) )
	clf_lda = LDA( priors=priors )
	clf_rfc = RandomForestClassifier( n_estimators=50, verbose=2, n_jobs=-1, random_state=0, 
				compute_importances=True, max_features=None ) #, criterion='entropy' )
	clf_gbc = GradientBoostingClassifier()

	log.info( 'π: fit Random Forest' )
	clf_rfc.fit( X, y )

	log.info( "π: compute feature ranking for RFC" )
	importances = clf_rfc.feature_importances_
	std = np.std([ tree.feature_importances_ for tree in clf_rfc.estimators_ ], axis=0 )
	indices = np.argsort( importances )[::-1]
	for f in xrange( 13 ): # the top thirteen features
		print "%d. feature %d (%f)" % (f + 1, indices[f], importances[ indices[f] ])

	log.info( "π: standardize and normalize features" )
	standardizer = StandardScaler( copy=False ).fit( X, y )
	standardizer.transform( X, y )	# in-place
	normalizer = Normalizer( copy=False, norm='l2' ).fit( X, y ) # 'l1'
	normalizer.transform( X, y )	# in-place
	
	log.info( 'π: fit Linear Discriminant Analysis' )
	clf_lda.fit( X, y )
	# X = cld_lda.transform( X, y )
	log.info( 'π: fit Gradient Boosting' )
	clf_gbc.fit( X, y )
	
	log.info( 'π: save classifiers' )
	np.savez( SUBMISSION_DIR+'cfy.npz', X=X, y=y, fselect=fselect, 
				standardizer=standardizer, normalizer=normalizer )
	joblib.dump( clf_lda, SUBMISSION_DIR + 'clf_lda.pkl', compress=9 )
	joblib.dump( clf_rfc, SUBMISSION_DIR + 'clf_rfc.pkl', compress=9 )
	joblib.dump( clf_gbc, SUBMISSION_DIR + 'clf_gbc.pkl', compress=9 )
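The feature-ranking block above follows the usual scikit-learn pattern. A self-contained version of the same idea, run on synthetic data so it executes on its own, could be:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
rf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

importances = rf.feature_importances_
order = np.argsort(importances)[::-1]              # most important first
for rank, idx in enumerate(order[:13], start=1):
    print("%d. feature %d (%f)" % (rank, idx, importances[idx]))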
Example No. 5
def doit(n,qmod,qcon,beta,T,outfile):
    G = nx.Graph()

    # Nodes that have gone through a DMC iteration and have not been burned.
    G.add_edge(1,2)
    InPlayPool = set([1,2])

    # Nodes that are initially isolated or have been burned.
    IsolatedPool = set()
    for i in xrange(3,n+1):
        G.add_node(i)
        IsolatedPool.add(i)

    # Header.
    print "#Iter\tNodes\tInPlay\tIso\tEdges\tNmEval\tComps\tMIS\tDensity\tCC\tTris\tFracDeg1\tFracDeg0\tNGcc\tMGcc"

    for iter in xrange(1,T+1):

        assert G.order() == len(InPlayPool) + len(IsolatedPool) == n # Always n nodes.

        # If there are no isolated nodes, skip the DMC step this iteration;
        # the burning step below may isolate nodes so the next iteration can proceed.
        if len(IsolatedPool) != 0:

            # 0. If all nodes are isolated, start over with the dumbbell.
            if len(InPlayPool) == 0:
                u = IsolatedPool.pop()
                v = IsolatedPool.pop()
                G.add_edge(u,v)
                InPlayPool.add(u)
                InPlayPool.add(v)

            # 1. Select random node to add.
            v = IsolatedPool.pop()

            # 2. Select node to copy from.
            u = InPlayPool.pop()

            InPlayPool.add(v)
            InPlayPool.add(u) # Hack because sets can't return+retain random element.

            # 3. Run DMC iteration.
            Delete = set()
            for neighbor in G.neighbors(u):
                assert neighbor != u
                if random.random() < qmod: # modify the edge.
                    if random.random() < 0.5: # delete u->neighbor.
                        Delete.add(neighbor)
                        G.add_edge(v,neighbor)
                    #else: # delete v->neighbor -- already done.
                else: # don't modify the edge.
                    G.add_edge(v,neighbor)
                    assert v != neighbor

            for neighbor in Delete: G.remove_edge(u,neighbor)

            if random.random() < qcon:
                assert u != v
                assert not G.has_edge(u,v)
                G.add_edge(u,v)

        # 4. Burn and remove infected nodes.
        b = random.choice(G.nodes()) # random infected node.
        Infected = F.compute_infected_set_sir(G,b,beta)
        for b in Infected:
            if b in IsolatedPool: # Burnt node was already isolated.
                assert len(Infected) == 1 # no one else should be infected.
                continue
            else:
                InPlayPool.remove(b)
                IsolatedPool.add(b)

                # Remove and add-back as isolated.
                G.remove_node(b)
                G.add_node(b)

        # In case a burnt node / dmc iter causes an in-play node to become isolated.
        for u in nx.isolates(G):
            if u in InPlayPool:
                assert u not in IsolatedPool
                IsolatedPool.add(u)
                InPlayPool.remove(u)

        # 6. Compute graph statistics after burning.
        (nmeval,comps,mis,density,cc,tris,fracdeg1,fracdeg0,ngcc,mgcc) = F.compute_features(G)
        m = G.size()
        
        # 8. Output results.
        print "%i\t%i\t%i\t%i\t%i\t%.5f\t%i\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%i\t%i" %(iter,G.order(),len(InPlayPool),len(IsolatedPool),m,nmeval,comps,mis,density,cc,tris,fracdeg1,fracdeg0,ngcc,mgcc)

    # 9. Last iteration: print component sizes for final graph.
    comps = ""
    for Gc in nx.connected_component_subgraphs(G):
        comps += "%i\t" %(Gc.order())
    print "# Components\t%s" %(comps.strip())

    # Print the final network as well.
    out = open(outfile,"w")
    #TODO: write header optionally
#    out.write("#nodes=%i\n#edges=%i\n#qmod=%.1f\n#qcon=%.1f\n#beta=%.2f\n" %(n,G.size(),qmod,qcon,beta))
#    out.write("#comps=%i\n#isolates=%i\n" %(nx.number_connected_components(G),len(nx.isolates(G))))
    for u,v in G.edges_iter(): out.write("%s\t%s\n" %(u,v))
    for u in nx.isolates(G): out.write("%s\t%s\n" %(u,u))       
    out.close()
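The burning step depends on the project helper F.compute_infected_set_sir. One common reading of such a helper is edge-percolation SIR spread, where each edge out of an infected node transmits independently with probability beta; the sketch below is only that assumed reading, not the helper's source.

import random

def compute_infected_set_sir_sketch(G, seed, beta):
    # Assumed SIR-as-percolation sketch: breadth-first spread from `seed`,
    # each contact transmitting independently with probability beta.
    infected = {seed}
    frontier = [seed]
    while frontier:
        newly_infected = []
        for u in frontier:
            for v in G.neighbors(u):
                if v not in infected and random.random() < beta:
                    infected.add(v)
                    newly_infected.append(v)
        frontier = newly_infected
    return infected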
Example No. 6
def process(imdb, args, validation=False):

    if validation:
        # test on the validation set
        features = compute_features(imdb, args, useValSet=False)
        print('Experiment setup: training set: train, test set: val')
        clf = train_classifier(
            features[
                imdb.train_indices, :],  # get rows corresponding to training
            imdb.class_ids[imdb.train_indices],
            args)
        val_preds, val_scores = make_predictions(clf,
                                                 features[imdb.val_indices, :])
        return get_confusion(imdb.class_ids[imdb.val_indices], val_preds)
        #show_confusion(imdb.class_ids[imdb.val_indices], val_preds)

    else:
        features = compute_features(imdb, args, useValSet=True)
        # ensure that indices haven't been accidentally modified:
        assert imdb.train_indices[0] == 0 and imdb.train_indices[-1] == 297
        assert imdb.val_indices[0] == 1 and imdb.val_indices[-1] == 298
        assert imdb.test_indices[0] == 2 and imdb.test_indices[-1] == 299
        print('Experiment setup: training set: train+val, test set: test')
        clf = train_classifier(
            features[np.hstack((imdb.train_indices, imdb.val_indices)), :],
            imdb.class_ids[np.hstack(
                (imdb.train_indices, imdb.val_indices))], args)
        test_preds, test_scores = make_predictions(
            clf, features[imdb.test_indices, :])
        show_confusion(imdb.class_ids[imdb.test_indices], test_preds)

        # confusion matrix of images: (store their indices in imdb.test_indices)
        # find first cat and first dog:
        cat, dog = -1, -1
        for i in range(len(
                imdb.test_indices)):  # location in imdb.test_indices
            if cat == -1 and imdb.class_ids[imdb.test_indices[i]] == 0:
                cat = i
            if dog == -1 and imdb.class_ids[imdb.test_indices[i]] == 1:
                dog = i
        top = np.array([[cat, cat], [dog, dog]])
        for i in range(len(
                imdb.test_indices)):  # location in imdb.test_indices
            # cat: 0, dog: 1 (labels)
            ans = imdb.class_ids[imdb.test_indices[i]]
            pred = test_preds[i]
            score = test_scores[i]
            if ans == 0 and pred == 0:  # look for most cat-like cat
                if score > test_scores[top[0, 0]]:
                    top[0, 0] = i
            if ans == 0 and pred == 1:  # look for most dog-like cat
                if score > test_scores[top[0, 1]]:
                    top[0, 1] = i
            if ans == 1 and pred == 1:  # look for most dog-like dog
                if score > test_scores[top[1, 1]]:
                    top[1, 1] = i
            if ans == 1 and pred == 0:  # look for most cat-like dog
                if score > test_scores[top[1, 0]]:
                    top[1, 0] = i
        # show the top images side by side
        fig, axarr = plt.subplots(2, 2, figsize=(5, 5))
        for i in range(0, 2):
            for j in range(0, 2):
                img = cv2.imread(imdb.image_dir + "/" +
                                 imdb.image_names[imdb.test_indices[top[i,
                                                                        j]]])
                axarr[i, j].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        fig.savefig("confusion.jpg", dpi=fig.dpi)
        plt.show()
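get_confusion and show_confusion are helpers defined elsewhere in the assignment code. A minimal confusion-matrix computation in the same spirit (an assumption shown for context, not the assignment's implementation) could be:

import numpy as np

def get_confusion_sketch(true_labels, predictions, num_classes=None):
    # confusion[i, j] counts samples whose true class is i and predicted class is j.
    true_labels = np.asarray(true_labels)
    predictions = np.asarray(predictions)
    if num_classes is None:
        num_classes = int(max(true_labels.max(), predictions.max())) + 1
    confusion = np.zeros((num_classes, num_classes), dtype=int)
    for t, p in zip(true_labels, predictions):
        confusion[t, p] += 1
    return confusion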
Example No. 7
def main():
    im_a = misc.imread(IN_PATH_A)
    im_a_p = misc.imread(IN_PATH_A_P)
    im_b = misc.imread(IN_PATH_B)

    # image single to double (float in 0 to 1 scale)
    if np.max(im_a) > 1.0:
        im_a = im_a / 255.
    if np.max(im_b) > 1.0:
        im_b = im_b / 255.
    if np.max(im_a_p) > 1.0:
        im_a_p = im_a_p / 255.
    """Remap luminance for color artistic images"""
    if LUM_REMAP:
        im_a, im_a_p = remap_luminance(im_a, im_a_p, im_b)

    pyramid_a = compute_gaussian_pyramid(im_a, max_level)
    pyramid_a_p = compute_gaussian_pyramid(im_a_p, max_level)
    pyramid_b = compute_gaussian_pyramid(im_b, max_level)
    pyramid_b_p = pyramid_b
    # im_b_yiq = rgb2yiq(im_b)
    # pyramid_color = compute_gaussian_pyramid(im_b_yiq, max_level)

    # Compute features of B
    features_b = concat_features(pyramid_b)

    # Build structure for ANN
    flann, flann_params, As, As_size = ann_index(pyramid_a, pyramid_a_p,
                                                 max_level + 1)

    ##################################################################
    # Algorithms

    for level in range(1, len(pyramid_a)):
        print('Computing level %d of %d' % (level, len(pyramid_a) - 1))

        imh, imw = pyramid_b[level].shape[:2]
        im_out = np.nan * np.ones((imh, imw, 3))

        s = []

        for row in range(imh):
            for col in range(imw):
                px = np.array([row, col])

                # Build the combined B / B' feature vector at this pixel
                feature_b_p = compute_features(pyramid_b_p)
                small_padded = np.pad(feature_b_p[level - 1], (n_sm // 2),
                                      'reflect')
                big_padded = np.pad(feature_b_p[level], (n_lg // 2), 'reflect')
                BBp_feature = np.hstack([
                    features_b[level][to_1d(px, imw), :],
                    extract_pixel_feature(small_padded, big_padded, px)
                ])

                assert (BBp_feature.shape == (As_size[level][1], ))
                # Find Approx Nearest Neighbor
                p_app_ix = best_approximate_match(flann[level],
                                                  flann_params[level],
                                                  BBp_feature)

                Ap_imh, Ap_imw = pyramid_a_p[level].shape[:2]
                p_app = to_2d(p_app_ix, Ap_imw)

                if (len(s) < 1):
                    p = p_app

                else:
                    #Coherence match
                    p_coh = best_coherence_match(As[level], (Ap_imh, Ap_imw),
                                                 BBp_feature, s, px, imw, n_lg)

                    if np.allclose(p_coh, np.array([-1, -1])):
                        p = p_app

                    else:
                        AAp_feature_app = As[level][p_app]
                        AAp_feature_coh = As[level][p_coh]
                        d_app = norm(AAp_feature_app - BBp_feature)**2
                        d_coh = norm(AAp_feature_coh - BBp_feature)**2

                        if d_coh < d_app * (1 + (2**(level - 5) * 1)):
                            p = p_coh
                        else:
                            p = p_app

                s.append(p)
                pyramid_b_p[level][row, col] = pyramid_a_p[level][tuple(p)]

        # Save color output images
        # pyramid_b_p_yiq = rgb2yiq(pyramid_b_p[level])
        # im_out_yiq = np.dstack([pyramid_b_p_yiq[:, :, 0], pyramid_color[level][:, :, 1:]])
        color_im_out = pyramid_b_p[level]
        color_im_out = np.clip(color_im_out, 0, 1)

        misc.imsave('output/level_%d_color.jpg' % level, color_im_out)
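compute_gaussian_pyramid comes from the image-analogies project itself. A simple stand-in is sketched below; the coarsest-first ordering is an assumption that matches how the loop above treats level - 1 as the coarser neighbor of level.

import numpy as np
from scipy import ndimage

def compute_gaussian_pyramid_sketch(img, max_level, sigma=1.0):
    # Repeatedly blur and downsample by 2, keeping the coarsest level first.
    levels = [np.asarray(img, dtype=float)]
    for _ in range(max_level - 1):
        current = levels[0]
        # Blur spatial axes only; leave the color channel (if any) untouched.
        blur_sigma = (sigma, sigma, 0) if current.ndim == 3 else sigma
        blurred = ndimage.gaussian_filter(current, sigma=blur_sigma)
        levels.insert(0, blurred[::2, ::2])
    return levels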
Example No. 8
def main(args):
    parser = argparse.ArgumentParser(
        description='Train and evaluate a model on the Cats vs. Dogs dataset')

    parser.add_argument('-d',
                        '--dataset-dir',
                        required=True,
                        type=str,
                        help='Path to the dataset')
    parser.add_argument('-f',
                        '--feature',
                        required=True,
                        choices=FEATURES,
                        help='Select which feature representation to use. '
                        'Choices are {' + ', '.join(FEATURES) + '}')
    parser.add_argument('-c',
                        '--classifier',
                        required=True,
                        choices=CLASSIFIERS,
                        help='Select which classifier to use. '
                        'Choices are {' + ', '.join(CLASSIFIERS) + '}')
    parser.add_argument('-k',
                        '--knn-k',
                        default=3,
                        type=int,
                        help='Number of neighbors for kNN classifier')
    parser.add_argument('-l',
                        '--svm-lambda',
                        default=1.0,
                        type=float,
                        help='Lambda parameter for SVM')
    parser.add_argument('--tinyimage-patchdim', default=16, type=int)
    parser.add_argument('--patches-dictionarysize', default=128, type=int)
    parser.add_argument('--patches-radius', default=8, type=float)
    parser.add_argument('--patches-stride', default=12, type=int)
    parser.add_argument('--sift-dictionarysize', default=128, type=int)
    parser.add_argument('--sift-binsize',
                        default=8,
                        type=int,
                        help='Size of the bin in terms of number of pixels in '
                        'the image. Recall that SIFT has 4x4=16 bins.')
    parser.add_argument('--sift-stride',
                        default=12,
                        type=int,
                        help='Spacing between successive x (and y) coordinates '
                        'for sampling dense features.')

    args = parser.parse_args(args)

    imdb = read_dataset(args.dataset_dir)

    features = compute_features(imdb, args)

    if args.feature != 'tinyimage':
        features = normalize_features(features)

    print('Experiment setup: training set: train, test set: val')
    clf = train_classifier(features[imdb.train_indices, :],
                           imdb.class_ids[imdb.train_indices], args)
    val_preds, val_scores = make_predictions(clf,
                                             features[imdb.val_indices, :])
    show_confusion(imdb.class_ids[imdb.val_indices], val_preds)

    print('Experiment setup: training set: train+val, test set: test')
    clf = train_classifier(
        features[np.hstack((imdb.train_indices, imdb.val_indices)), :],
        imdb.class_ids[np.hstack(
            (imdb.train_indices, imdb.val_indices))], args)
    test_preds, test_scores = make_predictions(clf,
                                               features[imdb.test_indices, :])
    show_confusion(imdb.class_ids[imdb.test_indices], test_preds)
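normalize_features is defined elsewhere in the assignment; a plausible stand-in (an assumption, shown only for context) is per-sample L2 normalization:

from sklearn.preprocessing import normalize

def normalize_features_sketch(features):
    # Scale each row (one image's descriptor) to unit L2 norm.
    return normalize(features, norm='l2', axis=1)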
def predict_class(test_file="test/public_leaderboard.csv", recompute_feats=False):
    """
	Module that predicts class probabilities for test data 
	from a .csv file or precomputed feature vectors.
	"""

    # custom variables
    DATA_DIR = "../data/"
    SUBMISSION_DIR = "../data/submission/"
    train_file_all = "train/train.csv"
    train_labels = "train/train-labels.csv"  # needed for the old-priors computation below
    test_file = "test/private_leaderboard.csv"  # note: overrides the test_file argument
    feature_file = "test/private_leaderboard-feats.csv"
    output_file = "predictions.csv"

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    log = logging.getLogger(__name__)

    if recompute_feats:
        # features.compute_features( 'test/test.csv', 'test/test-feats.csv' )
        features.compute_features(test_file, feature_file)

    log.info("π: load features from file")
    X_test = pd.io.parsers.read_csv(os.path.join(DATA_DIR, feature_file), header=None)
    X_test = X_test.as_matrix()

    log.info("π: load classifier")
    npz_file = np.load(SUBMISSION_DIR + "cfy.npz")
    clf_lda = joblib.load(SUBMISSION_DIR + "clf_lda.pkl")
    clf_rfc = joblib.load(SUBMISSION_DIR + "clf_rfc.pkl")
    clf_gbc = joblib.load(SUBMISSION_DIR + "clf_gbc.pkl")

    log.info("π: load standardizer, normalizer")
    standardizer = npz_file["standardizer"].item()
    normalizer = npz_file["normalizer"].item()

    # log.info( 'π: perform feature selection' )
    # fselect = npz_file[ 'fselect' ].item()
    # X_test = fselect.transform( X_test )

    log.info("π: Random Forest predictions")
    y_rfc = clf_rfc.predict_proba(X_test)

    log.info("π: standardize and normalize test features")
    standardizer.transform(X_test)  # in-place
    normalizer.transform(X_test)  # in-place

    log.info("π: LDA and GBC class membership predictions")
    # X_test = clf_lda.transform( X_test )
    y_lda = clf_lda.predict_proba(X_test)
    y_gbc = clf_gbc.predict_proba(X_test)

    y_pred = (y_rfc + y_gbc) / 2.0

    log.info("π: calculate priors and update posteriors")
    new_priors = cu.get_priors(train_file_all)
    closed_reasons = pd.io.parsers.read_csv(os.path.join(DATA_DIR, train_labels), header=None)["X0"]
    closed_reason_counts = Counter(closed_reasons)
    reasons = sorted(closed_reason_counts.keys())
    total = len(closed_reasons)
    old_priors = [closed_reason_counts[reason] / total for reason in reasons]
    y_pred = cu.cap_and_update_priors(old_priors, y_pred, new_priors, 0.001)

    y_pred = (2 * y_pred + y_lda) / 3.0
    log.info("π: write predictions to file")
    with open(os.path.join(SUBMISSION_DIR, output_file), "w") as out_f:
        writer = csv.writer(out_f, lineterminator="\n")
        writer.writerows(y_pred)
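The old-priors computation above just turns label counts into class frequencies in a fixed (sorted) class order. A toy illustration with hypothetical labels, not project data:

from collections import Counter

closed_reasons = ["off topic", "open", "open", "not constructive", "open"]
counts = Counter(closed_reasons)
reasons = sorted(counts)                      # fixed class order
total = float(len(closed_reasons))
old_priors = [counts[r] / total for r in reasons]
print(reasons)      # ['not constructive', 'off topic', 'open']
print(old_priors)   # [0.2, 0.2, 0.6]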